@juspay/neurolink 9.36.1 → 9.37.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (202)
  1. package/CHANGELOG.md +6 -0
  2. package/dist/auth/errors.d.ts +1 -1
  3. package/dist/auth/middleware/AuthMiddleware.d.ts +1 -1
  4. package/dist/auth/providers/BaseAuthProvider.d.ts +1 -1
  5. package/dist/browser/neurolink.min.js +921 -423
  6. package/dist/cli/commands/evaluate.d.ts +48 -0
  7. package/dist/cli/commands/evaluate.js +955 -0
  8. package/dist/cli/parser.js +4 -1
  9. package/dist/evaluation/BatchEvaluator.d.ts +163 -0
  10. package/dist/evaluation/BatchEvaluator.js +267 -0
  11. package/dist/evaluation/EvaluationAggregator.d.ts +272 -0
  12. package/dist/evaluation/EvaluationAggregator.js +377 -0
  13. package/dist/evaluation/EvaluatorFactory.d.ts +113 -0
  14. package/dist/evaluation/EvaluatorFactory.js +280 -0
  15. package/dist/evaluation/EvaluatorRegistry.d.ts +160 -0
  16. package/dist/evaluation/EvaluatorRegistry.js +184 -0
  17. package/dist/evaluation/errors/EvaluationError.d.ts +189 -0
  18. package/dist/evaluation/errors/EvaluationError.js +206 -0
  19. package/dist/evaluation/errors/index.d.ts +4 -0
  20. package/dist/evaluation/errors/index.js +4 -0
  21. package/dist/evaluation/hooks/index.d.ts +6 -0
  22. package/dist/evaluation/hooks/index.js +6 -0
  23. package/dist/evaluation/hooks/langfuseAdapter.d.ts +99 -0
  24. package/dist/evaluation/hooks/langfuseAdapter.js +172 -0
  25. package/dist/evaluation/hooks/observabilityHooks.d.ts +129 -0
  26. package/dist/evaluation/hooks/observabilityHooks.js +181 -0
  27. package/dist/evaluation/index.d.ts +11 -2
  28. package/dist/evaluation/index.js +15 -0
  29. package/dist/evaluation/pipeline/evaluationPipeline.d.ts +114 -0
  30. package/dist/evaluation/pipeline/evaluationPipeline.js +381 -0
  31. package/dist/evaluation/pipeline/index.d.ts +8 -0
  32. package/dist/evaluation/pipeline/index.js +8 -0
  33. package/dist/evaluation/pipeline/pipelineBuilder.d.ts +126 -0
  34. package/dist/evaluation/pipeline/pipelineBuilder.js +260 -0
  35. package/dist/evaluation/pipeline/presets.d.ts +66 -0
  36. package/dist/evaluation/pipeline/presets.js +224 -0
  37. package/dist/evaluation/pipeline/strategies/batchStrategy.d.ts +99 -0
  38. package/dist/evaluation/pipeline/strategies/batchStrategy.js +238 -0
  39. package/dist/evaluation/pipeline/strategies/index.d.ts +6 -0
  40. package/dist/evaluation/pipeline/strategies/index.js +6 -0
  41. package/dist/evaluation/pipeline/strategies/samplingStrategy.d.ts +76 -0
  42. package/dist/evaluation/pipeline/strategies/samplingStrategy.js +238 -0
  43. package/dist/evaluation/reporting/index.d.ts +6 -0
  44. package/dist/evaluation/reporting/index.js +6 -0
  45. package/dist/evaluation/reporting/metricsCollector.d.ts +147 -0
  46. package/dist/evaluation/reporting/metricsCollector.js +285 -0
  47. package/dist/evaluation/reporting/reportGenerator.d.ts +90 -0
  48. package/dist/evaluation/reporting/reportGenerator.js +374 -0
  49. package/dist/evaluation/scorers/baseScorer.d.ts +83 -0
  50. package/dist/evaluation/scorers/baseScorer.js +232 -0
  51. package/dist/evaluation/scorers/customScorerUtils.d.ts +95 -0
  52. package/dist/evaluation/scorers/customScorerUtils.js +381 -0
  53. package/dist/evaluation/scorers/index.d.ts +10 -0
  54. package/dist/evaluation/scorers/index.js +16 -0
  55. package/dist/evaluation/scorers/llm/answerRelevancyScorer.d.ts +12 -0
  56. package/dist/evaluation/scorers/llm/answerRelevancyScorer.js +99 -0
  57. package/dist/evaluation/scorers/llm/baseLLMScorer.d.ts +71 -0
  58. package/dist/evaluation/scorers/llm/baseLLMScorer.js +281 -0
  59. package/dist/evaluation/scorers/llm/biasDetectionScorer.d.ts +12 -0
  60. package/dist/evaluation/scorers/llm/biasDetectionScorer.js +127 -0
  61. package/dist/evaluation/scorers/llm/contextPrecisionScorer.d.ts +12 -0
  62. package/dist/evaluation/scorers/llm/contextPrecisionScorer.js +92 -0
  63. package/dist/evaluation/scorers/llm/contextRelevancyScorer.d.ts +12 -0
  64. package/dist/evaluation/scorers/llm/contextRelevancyScorer.js +107 -0
  65. package/dist/evaluation/scorers/llm/faithfulnessScorer.d.ts +12 -0
  66. package/dist/evaluation/scorers/llm/faithfulnessScorer.js +121 -0
  67. package/dist/evaluation/scorers/llm/hallucinationScorer.d.ts +12 -0
  68. package/dist/evaluation/scorers/llm/hallucinationScorer.js +140 -0
  69. package/dist/evaluation/scorers/llm/index.d.ts +15 -0
  70. package/dist/evaluation/scorers/llm/index.js +16 -0
  71. package/dist/evaluation/scorers/llm/promptAlignmentScorer.d.ts +12 -0
  72. package/dist/evaluation/scorers/llm/promptAlignmentScorer.js +106 -0
  73. package/dist/evaluation/scorers/llm/summarizationScorer.d.ts +12 -0
  74. package/dist/evaluation/scorers/llm/summarizationScorer.js +114 -0
  75. package/dist/evaluation/scorers/llm/toneConsistencyScorer.d.ts +12 -0
  76. package/dist/evaluation/scorers/llm/toneConsistencyScorer.js +106 -0
  77. package/dist/evaluation/scorers/llm/toxicityScorer.d.ts +12 -0
  78. package/dist/evaluation/scorers/llm/toxicityScorer.js +121 -0
  79. package/dist/evaluation/scorers/rule/baseRuleScorer.d.ts +77 -0
  80. package/dist/evaluation/scorers/rule/baseRuleScorer.js +233 -0
  81. package/dist/evaluation/scorers/rule/contentSimilarityScorer.d.ts +108 -0
  82. package/dist/evaluation/scorers/rule/contentSimilarityScorer.js +350 -0
  83. package/dist/evaluation/scorers/rule/formatScorer.d.ts +147 -0
  84. package/dist/evaluation/scorers/rule/formatScorer.js +470 -0
  85. package/dist/evaluation/scorers/rule/index.d.ts +9 -0
  86. package/dist/evaluation/scorers/rule/index.js +10 -0
  87. package/dist/evaluation/scorers/rule/keywordCoverageScorer.d.ts +83 -0
  88. package/dist/evaluation/scorers/rule/keywordCoverageScorer.js +347 -0
  89. package/dist/evaluation/scorers/rule/lengthScorer.d.ts +105 -0
  90. package/dist/evaluation/scorers/rule/lengthScorer.js +351 -0
  91. package/dist/evaluation/scorers/scorerBuilder.d.ts +161 -0
  92. package/dist/evaluation/scorers/scorerBuilder.js +420 -0
  93. package/dist/evaluation/scorers/scorerRegistry.d.ts +62 -0
  94. package/dist/evaluation/scorers/scorerRegistry.js +467 -0
  95. package/dist/index.d.ts +37 -25
  96. package/dist/index.js +65 -26
  97. package/dist/lib/auth/providers/BaseAuthProvider.d.ts +1 -1
  98. package/dist/lib/evaluation/BatchEvaluator.d.ts +163 -0
  99. package/dist/lib/evaluation/BatchEvaluator.js +268 -0
  100. package/dist/lib/evaluation/EvaluationAggregator.d.ts +272 -0
  101. package/dist/lib/evaluation/EvaluationAggregator.js +378 -0
  102. package/dist/lib/evaluation/EvaluatorFactory.d.ts +113 -0
  103. package/dist/lib/evaluation/EvaluatorFactory.js +281 -0
  104. package/dist/lib/evaluation/EvaluatorRegistry.d.ts +160 -0
  105. package/dist/lib/evaluation/EvaluatorRegistry.js +185 -0
  106. package/dist/lib/evaluation/errors/EvaluationError.d.ts +189 -0
  107. package/dist/lib/evaluation/errors/EvaluationError.js +207 -0
  108. package/dist/lib/evaluation/errors/index.d.ts +4 -0
  109. package/dist/lib/evaluation/errors/index.js +5 -0
  110. package/dist/lib/evaluation/hooks/index.d.ts +6 -0
  111. package/dist/lib/evaluation/hooks/index.js +7 -0
  112. package/dist/lib/evaluation/hooks/langfuseAdapter.d.ts +99 -0
  113. package/dist/lib/evaluation/hooks/langfuseAdapter.js +173 -0
  114. package/dist/lib/evaluation/hooks/observabilityHooks.d.ts +129 -0
  115. package/dist/lib/evaluation/hooks/observabilityHooks.js +182 -0
  116. package/dist/lib/evaluation/index.d.ts +11 -2
  117. package/dist/lib/evaluation/index.js +15 -0
  118. package/dist/lib/evaluation/pipeline/evaluationPipeline.d.ts +114 -0
  119. package/dist/lib/evaluation/pipeline/evaluationPipeline.js +382 -0
  120. package/dist/lib/evaluation/pipeline/index.d.ts +8 -0
  121. package/dist/lib/evaluation/pipeline/index.js +9 -0
  122. package/dist/lib/evaluation/pipeline/pipelineBuilder.d.ts +126 -0
  123. package/dist/lib/evaluation/pipeline/pipelineBuilder.js +261 -0
  124. package/dist/lib/evaluation/pipeline/presets.d.ts +66 -0
  125. package/dist/lib/evaluation/pipeline/presets.js +225 -0
  126. package/dist/lib/evaluation/pipeline/strategies/batchStrategy.d.ts +99 -0
  127. package/dist/lib/evaluation/pipeline/strategies/batchStrategy.js +239 -0
  128. package/dist/lib/evaluation/pipeline/strategies/index.d.ts +6 -0
  129. package/dist/lib/evaluation/pipeline/strategies/index.js +7 -0
  130. package/dist/lib/evaluation/pipeline/strategies/samplingStrategy.d.ts +76 -0
  131. package/dist/lib/evaluation/pipeline/strategies/samplingStrategy.js +239 -0
  132. package/dist/lib/evaluation/reporting/index.d.ts +6 -0
  133. package/dist/lib/evaluation/reporting/index.js +7 -0
  134. package/dist/lib/evaluation/reporting/metricsCollector.d.ts +147 -0
  135. package/dist/lib/evaluation/reporting/metricsCollector.js +286 -0
  136. package/dist/lib/evaluation/reporting/reportGenerator.d.ts +90 -0
  137. package/dist/lib/evaluation/reporting/reportGenerator.js +375 -0
  138. package/dist/lib/evaluation/scorers/baseScorer.d.ts +83 -0
  139. package/dist/lib/evaluation/scorers/baseScorer.js +233 -0
  140. package/dist/lib/evaluation/scorers/customScorerUtils.d.ts +95 -0
  141. package/dist/lib/evaluation/scorers/customScorerUtils.js +382 -0
  142. package/dist/lib/evaluation/scorers/index.d.ts +10 -0
  143. package/dist/lib/evaluation/scorers/index.js +17 -0
  144. package/dist/lib/evaluation/scorers/llm/answerRelevancyScorer.d.ts +12 -0
  145. package/dist/lib/evaluation/scorers/llm/answerRelevancyScorer.js +100 -0
  146. package/dist/lib/evaluation/scorers/llm/baseLLMScorer.d.ts +71 -0
  147. package/dist/lib/evaluation/scorers/llm/baseLLMScorer.js +282 -0
  148. package/dist/lib/evaluation/scorers/llm/biasDetectionScorer.d.ts +12 -0
  149. package/dist/lib/evaluation/scorers/llm/biasDetectionScorer.js +128 -0
  150. package/dist/lib/evaluation/scorers/llm/contextPrecisionScorer.d.ts +12 -0
  151. package/dist/lib/evaluation/scorers/llm/contextPrecisionScorer.js +93 -0
  152. package/dist/lib/evaluation/scorers/llm/contextRelevancyScorer.d.ts +12 -0
  153. package/dist/lib/evaluation/scorers/llm/contextRelevancyScorer.js +108 -0
  154. package/dist/lib/evaluation/scorers/llm/faithfulnessScorer.d.ts +12 -0
  155. package/dist/lib/evaluation/scorers/llm/faithfulnessScorer.js +122 -0
  156. package/dist/lib/evaluation/scorers/llm/hallucinationScorer.d.ts +12 -0
  157. package/dist/lib/evaluation/scorers/llm/hallucinationScorer.js +141 -0
  158. package/dist/lib/evaluation/scorers/llm/index.d.ts +15 -0
  159. package/dist/lib/evaluation/scorers/llm/index.js +17 -0
  160. package/dist/lib/evaluation/scorers/llm/promptAlignmentScorer.d.ts +12 -0
  161. package/dist/lib/evaluation/scorers/llm/promptAlignmentScorer.js +107 -0
  162. package/dist/lib/evaluation/scorers/llm/summarizationScorer.d.ts +12 -0
  163. package/dist/lib/evaluation/scorers/llm/summarizationScorer.js +115 -0
  164. package/dist/lib/evaluation/scorers/llm/toneConsistencyScorer.d.ts +12 -0
  165. package/dist/lib/evaluation/scorers/llm/toneConsistencyScorer.js +107 -0
  166. package/dist/lib/evaluation/scorers/llm/toxicityScorer.d.ts +12 -0
  167. package/dist/lib/evaluation/scorers/llm/toxicityScorer.js +122 -0
  168. package/dist/lib/evaluation/scorers/rule/baseRuleScorer.d.ts +77 -0
  169. package/dist/lib/evaluation/scorers/rule/baseRuleScorer.js +234 -0
  170. package/dist/lib/evaluation/scorers/rule/contentSimilarityScorer.d.ts +108 -0
  171. package/dist/lib/evaluation/scorers/rule/contentSimilarityScorer.js +351 -0
  172. package/dist/lib/evaluation/scorers/rule/formatScorer.d.ts +147 -0
  173. package/dist/lib/evaluation/scorers/rule/formatScorer.js +471 -0
  174. package/dist/lib/evaluation/scorers/rule/index.d.ts +9 -0
  175. package/dist/lib/evaluation/scorers/rule/index.js +11 -0
  176. package/dist/lib/evaluation/scorers/rule/keywordCoverageScorer.d.ts +83 -0
  177. package/dist/lib/evaluation/scorers/rule/keywordCoverageScorer.js +348 -0
  178. package/dist/lib/evaluation/scorers/rule/lengthScorer.d.ts +105 -0
  179. package/dist/lib/evaluation/scorers/rule/lengthScorer.js +352 -0
  180. package/dist/lib/evaluation/scorers/scorerBuilder.d.ts +161 -0
  181. package/dist/lib/evaluation/scorers/scorerBuilder.js +421 -0
  182. package/dist/lib/evaluation/scorers/scorerRegistry.d.ts +62 -0
  183. package/dist/lib/evaluation/scorers/scorerRegistry.js +468 -0
  184. package/dist/lib/index.d.ts +37 -25
  185. package/dist/lib/index.js +65 -26
  186. package/dist/lib/neurolink.d.ts +204 -0
  187. package/dist/lib/neurolink.js +296 -0
  188. package/dist/lib/types/index.d.ts +3 -1
  189. package/dist/lib/types/index.js +3 -2
  190. package/dist/lib/types/scorerTypes.d.ts +423 -0
  191. package/dist/lib/types/scorerTypes.js +6 -0
  192. package/dist/lib/utils/errorHandling.d.ts +20 -0
  193. package/dist/lib/utils/errorHandling.js +60 -0
  194. package/dist/neurolink.d.ts +204 -0
  195. package/dist/neurolink.js +296 -0
  196. package/dist/types/index.d.ts +3 -1
  197. package/dist/types/index.js +3 -2
  198. package/dist/types/scorerTypes.d.ts +423 -0
  199. package/dist/types/scorerTypes.js +5 -0
  200. package/dist/utils/errorHandling.d.ts +20 -0
  201. package/dist/utils/errorHandling.js +60 -0
  202. package/package.json +1 -1
@@ -0,0 +1,955 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * NeuroLink CLI Evaluate Command
4
+ *
5
+ * Evaluate AI responses using configured scorers and pipelines.
6
+ * Supports subcommands: run, score, report, presets, scorers (list-scorers)
7
+ */
8
+ import chalk from "chalk";
9
+ import ora from "ora";
10
+ import fs from "node:fs";
11
+ import { EvaluationPipeline, getPreset, getPresetNames, PipelinePresets, } from "../../lib/evaluation/pipeline/index.js";
12
+ import { ScorerRegistry } from "../../lib/evaluation/scorers/index.js";
13
+ import { ReportGenerator, } from "../../lib/evaluation/reporting/reportGenerator.js";
14
+ import { logger } from "../../lib/utils/logger.js";
15
/**
 * Render one scorer result as a terminal-ready summary string.
 * Produces a single pass/fail line; when `verbose` is set, appends the
 * scorer's reasoning and compute time on additional lines.
 */
function formatScoreResult(result, verbose) {
    const statusLabel = result.passed ? chalk.green("PASS") : chalk.red("FAIL");
    const colorize = result.passed ? chalk.green : chalk.red;
    const lines = [
        ` ${statusLabel} ${chalk.cyan(result.scorerName)}: ${colorize(result.score.toFixed(2))}`,
    ];
    if (verbose) {
        lines.push(` ${chalk.gray(result.reasoning)}`);
        lines.push(` ${chalk.gray(`(${result.computeTime}ms)`)}`);
    }
    return lines.join("\n");
}
28
/**
 * Check whether `name` is a valid pipeline preset key.
 * Reflect.has mirrors the `in` operator exactly (own and inherited keys).
 */
function isValidPreset(name) {
    return Reflect.has(PipelinePresets, name);
}
34
/**
 * Build the scorer input object from parsed CLI arguments.
 *
 * Context handling: `argv.context` may be an array of strings (used as-is),
 * or a single string. A string that names an existing file is read and
 * JSON-parsed — a JSON array becomes the context list, any other JSON value
 * falls back to the raw file text wrapped in an array; a non-file string
 * (or unreadable/unparseable file) is wrapped in an array itself.
 * Query and response both fall back to `argv.input`, then to "".
 */
function createScorerInput(argv) {
    const resolveContext = (raw) => {
        if (!raw) {
            return undefined;
        }
        if (typeof raw !== "string") {
            // Already an array of context documents.
            return raw;
        }
        if (!fs.existsSync(raw)) {
            return [raw];
        }
        try {
            const fileText = fs.readFileSync(raw, "utf-8");
            const decoded = JSON.parse(fileText);
            return Array.isArray(decoded) ? decoded : [fileText];
        }
        catch {
            // Unreadable or invalid JSON: treat the argument as literal context.
            return [raw];
        }
    };
    return {
        query: argv.query ?? argv.input ?? "",
        response: argv.output ?? argv.input ?? "",
        context: resolveContext(argv.context),
        groundTruth: argv.groundTruth,
    };
}
68
/**
 * List-scorers subcommand - List all available scorers
 *
 * Registers the built-in scorers, applies the optional --category and
 * --type filters, then prints either raw JSON (--json) or a grouped,
 * colorized listing. --detailed adds each scorer's required/optional
 * input fields.
 */
const listScorersCommand = {
    command: "list-scorers",
    describe: "List all available scorers",
    builder: (yargs) => yargs
        .option("category", {
        type: "string",
        describe: "Filter by category (accuracy, relevancy, safety, quality, faithfulness)",
    })
        .option("type", {
        type: "string",
        describe: "Filter by type (llm, rule)",
        choices: ["llm", "rule"],
    })
        .option("detailed", {
        type: "boolean",
        describe: "Show detailed scorer information",
        default: false,
    })
        .option("json", {
        type: "boolean",
        describe: "Output as JSON",
        default: false,
    })
        .example("$0 evaluate list-scorers", "List all scorers")
        .example("$0 evaluate list-scorers --category safety", "List safety scorers")
        .example("$0 evaluate list-scorers --type rule --detailed", "List rule-based scorers with details"),
    handler: async (argv) => {
        const { category, type, json, detailed } = argv;
        // Registry starts empty; built-ins must be registered before list().
        await ScorerRegistry.registerBuiltInScorers();
        let scorerList = ScorerRegistry.list();
        // Apply filters
        if (category) {
            scorerList = scorerList.filter((s) => s.category === category);
        }
        if (type) {
            scorerList = scorerList.filter((s) => s.type === type);
        }
        if (json) {
            // Machine-readable output: emit the filtered metadata verbatim.
            logger.always(JSON.stringify(scorerList, null, 2));
        }
        else {
            logger.always("");
            logger.always(chalk.bold("Available Scorers:"));
            logger.always(chalk.gray("-".repeat(60)));
            // Group by category
            const byCategory = new Map();
            for (const s of scorerList) {
                const cat = s.category;
                if (!byCategory.has(cat)) {
                    byCategory.set(cat, []);
                }
                const categoryList = byCategory.get(cat);
                if (categoryList) {
                    categoryList.push(s);
                }
            }
            for (const [cat, scorers] of byCategory) {
                logger.always("");
                logger.always(chalk.bold.underline(cat.toUpperCase()));
                for (const metadata of scorers) {
                    // "AI" marks LLM-backed scorers, "Rule" marks rule-based ones.
                    const typeIcon = metadata.type === "llm" ? "AI" : "Rule";
                    logger.always("");
                    logger.always(` ${chalk.cyan(metadata.id)} [${typeIcon}]`);
                    logger.always(` ${chalk.gray(metadata.description)}`);
                    if (detailed) {
                        logger.always(` Required: ${metadata.requiredInputs.join(", ") || "none"}`);
                        if (metadata.optionalInputs.length > 0) {
                            logger.always(` Optional: ${metadata.optionalInputs.join(", ")}`);
                        }
                    }
                }
            }
            logger.always("");
            logger.always(chalk.gray(`Total: ${scorerList.length} scorers`));
        }
    },
};
148
/**
 * Run-pipeline subcommand - Run evaluation using a predefined pipeline preset
 *
 * Validates the requested preset, optionally overrides its pass threshold,
 * builds the scorer input from CLI flags, executes the pipeline, and prints
 * the result in json, table, or text format. Exits with code 1 on an
 * unknown preset or any pipeline error.
 */
const runPipelineCommand = {
    command: "run-pipeline",
    describe: "Run evaluation using a predefined pipeline preset",
    builder: (yargs) => yargs
        .option("preset", {
        type: "string",
        describe: `Pipeline preset to use (${getPresetNames().join(", ")})`,
        alias: "p",
        demandOption: true,
    })
        .option("input", {
        type: "string",
        describe: "AI response text to evaluate",
        alias: "i",
        demandOption: true,
    })
        .option("query", {
        type: "string",
        describe: "Original user query",
        alias: "q",
    })
        .option("context", {
        type: "string",
        describe: "Path to context file (JSON format) or context string",
        alias: "c",
    })
        .option("threshold", {
        type: "number",
        describe: "Custom pass threshold (0-1)",
        alias: "t",
    })
        .option("format", {
        type: "string",
        describe: "Output format",
        choices: ["text", "json", "table"],
        default: "text",
    })
        .option("json", {
        type: "boolean",
        describe: "Output results as JSON (shorthand for --format json)",
        default: false,
    })
        .option("verbose", {
        type: "boolean",
        describe: "Show detailed reasoning and timing",
        alias: "v",
        default: false,
    })
        .example('$0 evaluate run-pipeline --preset quality --input "The capital of France is Paris."', "Run quality evaluation")
        .example('$0 evaluate run-pipeline --preset rag --input "Response" --query "Question" --context ./context.json', "Run RAG evaluation with context file"),
    handler: async (argv) => {
        const { preset, input, query, context, threshold, json, verbose, format } = argv;
        // --json wins over --format so piped output stays machine-readable.
        const outputFormat = json ? "json" : format;
        // No spinner in JSON mode: keep stdout free of terminal control codes.
        const spinner = outputFormat === "json"
            ? null
            : ora(`Running ${preset} evaluation pipeline...`).start();
        try {
            if (!isValidPreset(preset)) {
                spinner?.fail(`Unknown pipeline preset: ${preset}`);
                logger.always(chalk.gray(`Available presets: ${getPresetNames().join(", ")}`));
                process.exit(1);
            }
            const presetConfig = getPreset(preset);
            // Apply custom threshold if provided
            // NOTE(review): this mutates the object returned by getPreset — if
            // getPreset returns a shared instance rather than a fresh copy, the
            // override would persist across calls; confirm getPreset's behavior.
            if (threshold !== undefined) {
                presetConfig.passThreshold = threshold;
            }
            const evaluationPipeline = new EvaluationPipeline(presetConfig);
            // Map CLI naming onto scorer-input naming: the user's --query is the
            // scorer "input" (question) and --input is the "output" (response).
            const scorerInput = createScorerInput({
                input: query,
                output: input,
                context,
            });
            await evaluationPipeline.initialize();
            const result = await evaluationPipeline.execute(scorerInput);
            spinner?.stop();
            if (outputFormat === "json") {
                logger.always(JSON.stringify(result, null, 2));
            }
            else if (outputFormat === "table") {
                logger.always("");
                logger.always(chalk.bold(`Pipeline: ${preset}`));
                logger.always(chalk.gray("-".repeat(50)));
                // Table header
                logger.always(`${chalk.bold("Scorer".padEnd(25))} ${chalk.bold("Score".padEnd(10))} ${chalk.bold("Status")}`);
                logger.always(chalk.gray("-".repeat(50)));
                for (const score of result.scores) {
                    const status = score.passed ? chalk.green("PASS") : chalk.red("FAIL");
                    const scoreColor = score.passed ? chalk.green : chalk.red;
                    logger.always(`${score.scorerName.padEnd(25)} ${scoreColor(score.score.toFixed(2).padEnd(10))} ${status}`);
                }
                logger.always(chalk.gray("-".repeat(50)));
                const overallColor = result.passed ? chalk.green : chalk.red;
                logger.always(`${"Overall".padEnd(25)} ${overallColor(result.overallScore.toFixed(2).padEnd(10))} ${result.passed ? chalk.green("PASS") : chalk.red("FAIL")}`);
            }
            else {
                // Text format
                logger.always("");
                logger.always(chalk.bold(`Pipeline: ${preset} Evaluation Results`));
                logger.always(chalk.gray("-".repeat(50)));
                const overallColor = result.passed ? chalk.green : chalk.red;
                const overallIcon = result.passed ? "PASS" : "FAIL";
                logger.always(`${overallColor(overallIcon)} Overall Score: ${overallColor(result.overallScore.toFixed(2))} (${result.aggregationMethod})`);
                logger.always("");
                logger.always(chalk.bold("Individual Scores:"));
                for (const score of result.scores) {
                    logger.always(formatScoreResult(score, verbose ?? false));
                }
                if (result.errors.length > 0) {
                    logger.always("");
                    logger.always(chalk.yellow("Errors:"));
                    for (const error of result.errors) {
                        logger.always(` ${chalk.yellow("!")} ${error.scorerId}: ${error.error}`);
                    }
                }
                logger.always("");
                logger.always(chalk.gray(`Total time: ${result.totalComputeTime}ms`));
            }
        }
        catch (error) {
            spinner?.fail("Pipeline evaluation failed");
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.error(chalk.red(`Error: ${errorMessage}`));
            process.exit(1);
        }
    },
};
278
/**
 * Run subcommand - Execute evaluation pipeline (legacy support)
 *
 * Pipeline selection precedence: --pipeline preset, else a custom pipeline
 * built from --scorer flags, else the "quality" preset. Requires both
 * --input and --output; prints JSON or formatted text and exits 1 on
 * validation failure or pipeline error.
 */
const runCommand = {
    command: "run",
    describe: "Run evaluation pipeline on a response",
    builder: (yargs) => yargs
        .option("input", {
        type: "string",
        describe: "Input query/question that was asked",
        alias: "i",
    })
        .option("output", {
        type: "string",
        describe: "Output/answer to evaluate",
        alias: "o",
    })
        .option("context", {
        type: "array",
        string: true,
        describe: "Context documents for RAG evaluation (can be used multiple times)",
        alias: "c",
    })
        .option("ground-truth", {
        type: "string",
        describe: "Expected/correct answer for accuracy evaluation",
        alias: "g",
    })
        .option("pipeline", {
        type: "string",
        describe: `Pipeline preset to use (${getPresetNames().join(", ")})`,
        alias: "p",
    })
        .option("scorer", {
        type: "array",
        string: true,
        describe: "Specific scorers to use (can be used multiple times)",
        alias: "s",
    })
        .option("json", {
        type: "boolean",
        describe: "Output results as JSON",
        default: false,
    })
        .option("verbose", {
        type: "boolean",
        describe: "Show detailed reasoning and timing",
        alias: "v",
        default: false,
    })
        .example('$0 evaluate run -i "What is the capital of France?" -o "Paris" -p quality', "Evaluate a response using the quality pipeline"),
    handler: async (argv) => {
        const { input, output, context, groundTruth, pipeline, scorer, json, verbose, } = argv;
        if (!input || !output) {
            logger.error(chalk.red("Error: Both --input and --output are required"));
            logger.always(chalk.gray("Use --help for usage information"));
            process.exit(1);
        }
        // No spinner in JSON mode: keep stdout free of terminal control codes.
        const spinner = json ? null : ora("Initializing evaluation...").start();
        try {
            const scorerInput = createScorerInput({
                input,
                output,
                context,
                groundTruth,
            });
            let evaluationPipeline;
            if (pipeline) {
                if (!isValidPreset(pipeline)) {
                    spinner?.fail(`Unknown pipeline preset: ${pipeline}`);
                    logger.always(chalk.gray(`Available presets: ${getPresetNames().join(", ")}`));
                    process.exit(1);
                }
                const presetConfig = getPreset(pipeline);
                evaluationPipeline = new EvaluationPipeline(presetConfig);
            }
            else if (scorer && scorer.length > 0) {
                // Ad-hoc pipeline built from the --scorer flags; scorers run in parallel.
                const pipelineConfig = {
                    name: "CLI Custom Pipeline",
                    description: "Custom pipeline from CLI scorer arguments",
                    scorers: scorer.map((s) => ({ id: s })),
                    executionMode: "parallel",
                };
                evaluationPipeline = new EvaluationPipeline(pipelineConfig);
            }
            else {
                // Neither --pipeline nor --scorer given: default to the quality preset.
                const defaultPreset = getPreset("quality");
                evaluationPipeline = new EvaluationPipeline(defaultPreset);
            }
            if (spinner) {
                spinner.text = "Running evaluation...";
            }
            await evaluationPipeline.initialize();
            const result = await evaluationPipeline.execute(scorerInput);
            spinner?.stop();
            if (json) {
                logger.always(JSON.stringify(result, null, 2));
            }
            else {
                logger.always("");
                logger.always(chalk.bold("Evaluation Results"));
                logger.always(chalk.gray("-".repeat(50)));
                const overallColor = result.passed ? chalk.green : chalk.red;
                const overallIcon = result.passed ? "PASS" : "FAIL";
                logger.always(`${overallColor(overallIcon)} Overall Score: ${overallColor(result.overallScore.toFixed(2))} (${result.aggregationMethod})`);
                logger.always("");
                logger.always(chalk.bold("Individual Scores:"));
                for (const score of result.scores) {
                    logger.always(formatScoreResult(score, verbose ?? false));
                }
                if (result.errors.length > 0) {
                    logger.always("");
                    logger.always(chalk.yellow("Errors:"));
                    for (const error of result.errors) {
                        logger.always(` ${chalk.yellow("!")} ${error.scorerId}: ${error.error}`);
                    }
                }
                // Skipped scorers are only surfaced in verbose mode.
                if (result.skippedScorers.length > 0 && verbose) {
                    logger.always("");
                    logger.always(chalk.gray(`Skipped: ${result.skippedScorers.join(", ")}`));
                }
                logger.always("");
                logger.always(chalk.gray(`Total time: ${result.totalComputeTime}ms`));
            }
        }
        catch (error) {
            spinner?.fail("Evaluation failed");
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.error(chalk.red(`Error: ${errorMessage}`));
            process.exit(1);
        }
    },
};
411
/**
 * Score subcommand - Score a single response with a specific scorer
 *
 * Resolves the named scorer from the registry, validates the scorer input
 * against the scorer's own requirements, runs it once, and prints the
 * result as JSON or formatted text. Exits 1 when --output is missing, the
 * scorer is unknown, validation fails, or scoring throws.
 */
const scoreCommand = {
    command: "score <scorer>",
    describe: "Score a response using a single scorer",
    builder: (yargs) => yargs
        .positional("scorer", {
        type: "string",
        describe: "Scorer ID to use (e.g., hallucination, toxicity)",
        demandOption: true,
    })
        .option("input", {
        type: "string",
        describe: "Input query/question that was asked",
        alias: "i",
    })
        .option("output", {
        type: "string",
        describe: "Output/answer to evaluate",
        alias: "o",
    })
        .option("context", {
        type: "array",
        string: true,
        describe: "Context documents for evaluation",
        alias: "c",
    })
        .option("ground-truth", {
        type: "string",
        describe: "Expected answer for comparison",
        alias: "g",
    })
        .option("json", {
        type: "boolean",
        describe: "Output results as JSON",
        default: false,
    })
        .option("verbose", {
        type: "boolean",
        describe: "Show detailed output",
        alias: "v",
        default: false,
    })
        .example('$0 evaluate score toxicity -o "This is a test response"', "Score a response for toxicity")
        .example('$0 evaluate score hallucination -i "What is 2+2?" -o "2+2 equals 4" --json', "Score for hallucinations and output JSON"),
    handler: async (argv) => {
        const { scorer, input, output, context, groundTruth, json, verbose } = argv;
        if (!output) {
            logger.error(chalk.red("Error: --output is required"));
            logger.always(chalk.gray("Use --help for usage information"));
            process.exit(1);
        }
        // No spinner in JSON mode: keep stdout free of terminal control codes.
        const spinnerInstance = json
            ? null
            : ora(`Loading scorer: ${scorer}...`).start();
        try {
            await ScorerRegistry.registerBuiltInScorers();
            const scorerInstance = await ScorerRegistry.getScorer(scorer);
            if (!scorerInstance) {
                spinnerInstance?.fail(`Scorer not found: ${scorer}`);
                const available = ScorerRegistry.list().map((s) => s.id);
                logger.always(chalk.gray(`Available scorers: ${available.join(", ")}`));
                process.exit(1);
            }
            if (spinnerInstance) {
                spinnerInstance.text = "Running scorer...";
            }
            const scorerInput = createScorerInput({
                input: input ?? "",
                output,
                context,
                groundTruth,
            });
            // Let the scorer reject inputs it cannot work with before running it.
            const validation = scorerInstance.validateInput(scorerInput);
            if (!validation.valid) {
                spinnerInstance?.fail("Input validation failed");
                for (const err of validation.errors) {
                    logger.always(chalk.red(` - ${err}`));
                }
                process.exit(1);
            }
            const result = await scorerInstance.score(scorerInput);
            spinnerInstance?.stop();
            if (json) {
                logger.always(JSON.stringify(result, null, 2));
            }
            else {
                logger.always("");
                // NOTE(review): the "/10" suffix implies a 0-10 score scale, but
                // --threshold elsewhere in this file is documented as 0-1 and scores
                // are printed with toFixed(2) — confirm the scorer's score range.
                logger.always(chalk.bold(`${result.scorerName} Score: ${result.score.toFixed(2)}/10`));
                logger.always(result.passed
                    ? chalk.green(" Status: PASSED")
                    : chalk.red(" Status: FAILED"));
                logger.always(` Threshold: ${result.threshold}`);
                logger.always(` Time: ${result.computeTime}ms`);
                // Reasoning is always shown on failure, even without --verbose.
                if (verbose || !result.passed) {
                    logger.always("");
                    logger.always(chalk.gray("Reasoning:"));
                    logger.always(chalk.gray(` ${result.reasoning}`));
                }
                if (result.confidence !== undefined) {
                    logger.always("");
                    logger.always(chalk.gray(`Confidence: ${(result.confidence * 100).toFixed(1)}%`));
                }
                if (verbose && result.metadata) {
                    logger.always("");
                    logger.always(chalk.gray("Metadata:"));
                    logger.always(chalk.gray(JSON.stringify(result.metadata, null, 2)));
                }
            }
        }
        catch (error) {
            spinnerInstance?.fail("Scoring failed");
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.error(chalk.red(`Error: ${errorMessage}`));
            process.exit(1);
        }
    },
};
530
/**
 * Report subcommand - Generate an evaluation report.
 *
 * Runs a pipeline (preset, explicit scorers, or the "quality" default)
 * over the given input/output pair and renders the result in one of
 * text, json, markdown, or html — printed to stdout or saved to a file.
 */
const reportCommand = {
    command: "report",
    describe: "Generate an evaluation report",
    builder: (y) => y
        .option("input", {
        type: "string",
        describe: "Input query/question that was asked",
        alias: "i",
    })
        .option("output", {
        type: "string",
        describe: "Output/answer to evaluate",
        alias: "o",
    })
        .option("context", {
        type: "array",
        string: true,
        describe: "Context documents for evaluation",
        alias: "c",
    })
        .option("ground-truth", {
        type: "string",
        describe: "Expected answer for comparison",
        alias: "g",
    })
        .option("pipeline", {
        type: "string",
        describe: `Pipeline preset to use (${getPresetNames().join(", ")})`,
        alias: "p",
    })
        .option("scorer", {
        type: "array",
        string: true,
        describe: "Specific scorers to use",
        alias: "s",
    })
        .option("format", {
        type: "string",
        describe: "Report format (text, json, markdown, html)",
        choices: ["text", "json", "markdown", "html"],
        default: "text",
    })
        .option("output-file", {
        type: "string",
        describe: "Save report to file",
        alias: "f",
    })
        .option("verbose", {
        type: "boolean",
        describe: "Include detailed information in report",
        alias: "v",
        default: true,
    })
        .example('$0 evaluate report -i "Question" -o "Answer" -p quality --format markdown', "Generate markdown report")
        .example('$0 evaluate report -i "Question" -o "Answer" -p rag --format html -f report.html', "Generate HTML report and save to file"),
    handler: async (argv) => {
        const { input, output, context, groundTruth, pipeline, scorer, format, outputFile, verbose } = argv;
        // Both sides of the evaluation are mandatory.
        if (!input || !output) {
            logger.error(chalk.red("Error: Both --input and --output are required"));
            logger.always(chalk.gray("Use --help for usage information"));
            process.exit(1);
        }
        const spinner = ora("Running evaluation...").start();
        try {
            const scorerInput = createScorerInput({
                input,
                output,
                context,
                groundTruth,
            });
            // Pipeline resolution order: named preset > explicit scorers > "quality".
            let pipelineConfig;
            if (pipeline && isValidPreset(pipeline)) {
                pipelineConfig = getPreset(pipeline);
            }
            else if (scorer && scorer.length > 0) {
                pipelineConfig = {
                    name: "CLI Custom Pipeline",
                    scorers: scorer.map((id) => ({ id })),
                    executionMode: "parallel",
                };
            }
            else {
                pipelineConfig = getPreset("quality");
            }
            const evaluationPipeline = new EvaluationPipeline(pipelineConfig);
            await evaluationPipeline.initialize();
            const result = await evaluationPipeline.execute(scorerInput);
            spinner.text = "Generating report...";
            const reportData = {
                title: `Evaluation Report - ${pipeline ?? "Custom Pipeline"}`,
                timestamp: Date.now(),
                result,
                customSections: [
                    {
                        title: "Input",
                        content: { query: input, responseLength: output.length },
                    },
                ],
            };
            // Guard against an unexpected format value sneaking past yargs.
            const validFormats = ["text", "json", "markdown", "html"];
            const reportFormat = validFormats.includes(format) ? format : "text";
            const generator = new ReportGenerator({
                format: reportFormat,
                includeReasoning: verbose ?? true,
                includeMetadata: verbose ?? true,
                includeTiming: true,
            });
            const report = generator.generate(reportData);
            spinner.stop();
            if (outputFile) {
                const fsPromises = await import("node:fs/promises");
                await fsPromises.writeFile(outputFile, report.content, "utf-8");
                logger.always(chalk.green(`Report saved to: ${outputFile}`));
            }
            else {
                logger.always(report.content);
            }
        }
        catch (error) {
            spinner.fail("Report generation failed");
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.error(chalk.red(`Error: ${errorMessage}`));
            process.exit(1);
        }
    },
};
661
/**
 * Presets subcommand - List available pipeline presets or show the
 * full configuration of a single named preset (optionally as JSON).
 */
const presetsCommand = {
    command: "presets [preset]",
    describe: "List available pipeline presets or show details of a specific preset",
    builder: (y) => y
        .positional("preset", {
        type: "string",
        describe: "Specific preset to show details for",
    })
        .option("json", {
        type: "boolean",
        describe: "Output as JSON",
        default: false,
    })
        .example("$0 evaluate presets", "List all available presets")
        .example("$0 evaluate presets rag", "Show details of the RAG preset"),
    handler: async (argv) => {
        const { preset, json } = argv;
        if (!preset) {
            // No preset named: list everything that is available.
            const names = getPresetNames();
            if (json) {
                const presetData = Object.fromEntries(names.filter(isValidPreset).map((p) => [p, getPreset(p)]));
                logger.always(JSON.stringify(presetData, null, 2));
                return;
            }
            logger.always("");
            logger.always(chalk.bold("Available Pipeline Presets:"));
            logger.always(chalk.gray("-".repeat(50)));
            for (const name of names) {
                if (!isValidPreset(name)) {
                    continue;
                }
                const config = getPreset(name);
                logger.always("");
                logger.always(`  ${chalk.cyan(name)}`);
                if (config.description) {
                    logger.always(`    ${chalk.gray(config.description)}`);
                }
                logger.always(`    Scorers: ${config.scorers.map((s) => s.id).join(", ")}`);
            }
            logger.always("");
            logger.always(chalk.gray('Use "neurolink evaluate presets <name>" for more details'));
            return;
        }
        // A preset was named: validate it and show its details.
        if (!isValidPreset(preset)) {
            logger.error(chalk.red(`Unknown preset: ${preset}`));
            logger.always(chalk.gray(`Available presets: ${getPresetNames().join(", ")}`));
            process.exit(1);
        }
        const config = getPreset(preset);
        if (json) {
            logger.always(JSON.stringify(config, null, 2));
            return;
        }
        logger.always("");
        logger.always(chalk.bold(`Preset: ${chalk.cyan(preset)}`));
        logger.always(chalk.gray("-".repeat(50)));
        if (config.description) {
            logger.always(`Description: ${config.description}`);
        }
        logger.always(`Pass Threshold: ${config.passThreshold ?? 0.7}`);
        logger.always(`Execution Mode: ${config.executionMode ?? "parallel"}`);
        logger.always("");
        logger.always(chalk.bold("Scorers:"));
        for (const s of config.scorers) {
            const weight = s.config?.weight ?? 1.0;
            const threshold = s.config?.threshold ?? "default";
            logger.always(`  - ${chalk.cyan(s.id)} (weight: ${weight}, threshold: ${threshold})`);
        }
        if (config.requiredScorers && config.requiredScorers.length > 0) {
            logger.always("");
            logger.always(chalk.bold("Required Scorers: ") +
                config.requiredScorers.join(", "));
        }
        if (config.aggregation) {
            logger.always("");
            logger.always(chalk.bold("Aggregation: ") + config.aggregation.method);
        }
    },
};
747
/**
 * Main evaluate command with subcommands.
 *
 * When invoked without a subcommand, evaluates --input directly:
 * builds a pipeline from --scorers (or falls back to the "quality"
 * preset, optionally overriding its pass threshold), executes it, and
 * prints the results in the requested format (text, table, or json).
 */
export const evaluateCommand = {
    command: "evaluate [subcommand]",
    describe: "Evaluate AI responses using RAGAS-style scorers and pipelines",
    builder: (yargs) => yargs
        .command(listScorersCommand)
        .command(runPipelineCommand)
        .command(runCommand)
        .command(scoreCommand)
        .command(reportCommand)
        .command(presetsCommand)
        .option("input", {
        type: "string",
        describe: "AI response text to evaluate",
        alias: "i",
    })
        .option("query", {
        type: "string",
        describe: "Original user query",
        alias: "q",
    })
        .option("scorers", {
        type: "array",
        string: true,
        describe: "List of scorers to use for evaluation",
        alias: "s",
    })
        .option("context", {
        type: "string",
        describe: "Path to context file (JSON format)",
        alias: "c",
    })
        .option("threshold", {
        type: "number",
        describe: "Minimum score threshold for passing (0-1)",
        alias: "t",
    })
        .option("format", {
        type: "string",
        describe: "Output format",
        choices: ["text", "json", "table"],
        default: "text",
    })
        .option("json", {
        type: "boolean",
        describe: "Output results as JSON (shorthand for --format json)",
        default: false,
    })
        .option("verbose", {
        type: "boolean",
        describe: "Show detailed reasoning and timing",
        alias: "v",
        default: false,
    })
        .example('$0 evaluate --input "Response text" --query "User question" --scorers hallucination toxicity', "Evaluate with specific scorers")
        .example('$0 evaluate --input "Response" --query "Query" --context ./context.json --format json', "Evaluate with context file and JSON output")
        .example("$0 evaluate list-scorers", "List all available scorers")
        .example('$0 evaluate run-pipeline --preset quality --input "Response"', "Run quality pipeline evaluation"),
    handler: async (argv) => {
        const { input, query, scorers, context, threshold, json, verbose, format } = argv;
        // If no input provided and no subcommand executed, show help
        if (!input) {
            return;
        }
        const outputFormat = json ? "json" : format;
        // No spinner in JSON mode so stdout stays machine-parseable.
        const spinner = outputFormat === "json" ? null : ora("Running evaluation...").start();
        try {
            // Load context documents. --context may be a path to a JSON file
            // (array or single document) or a literal context string.
            let contextArray;
            if (context) {
                if (fs.existsSync(context)) {
                    try {
                        const content = fs.readFileSync(context, "utf-8");
                        try {
                            const parsed = JSON.parse(content);
                            contextArray = Array.isArray(parsed) ? parsed : [content];
                        }
                        catch {
                            // File exists but is not valid JSON: use its raw content.
                            // (Fix: previously this fell back to the file *path*,
                            // silently discarding the file's actual content.)
                            contextArray = [content];
                        }
                    }
                    catch {
                        // File could not be read: treat the argument as literal context.
                        contextArray = [context];
                    }
                }
                else {
                    // Not a file on disk: treat the argument as literal context.
                    contextArray = [context];
                }
            }
            const scorerInput = {
                query: query ?? "",
                response: input,
                context: contextArray,
            };
            // Pipeline selection: explicit --scorers wins; otherwise the
            // "quality" preset with an optional --threshold override.
            let evaluationPipeline;
            if (scorers && scorers.length > 0) {
                const pipelineConfig = {
                    name: "CLI Custom Pipeline",
                    description: "Custom pipeline from CLI scorer arguments",
                    scorers: scorers.map((s) => ({ id: s })),
                    executionMode: "parallel",
                    passThreshold: threshold ?? 0.7,
                };
                evaluationPipeline = new EvaluationPipeline(pipelineConfig);
            }
            else {
                // Shallow-copy before mutating so a preset object shared by
                // getPreset() is never modified in place for later callers.
                const defaultPreset = { ...getPreset("quality") };
                if (threshold !== undefined) {
                    defaultPreset.passThreshold = threshold;
                }
                evaluationPipeline = new EvaluationPipeline(defaultPreset);
            }
            await evaluationPipeline.initialize();
            const result = await evaluationPipeline.execute(scorerInput);
            spinner?.stop();
            if (outputFormat === "json") {
                logger.always(JSON.stringify(result, null, 2));
            }
            else if (outputFormat === "table") {
                // Fixed-width table: scorer name, score, pass/fail status.
                logger.always("");
                logger.always(chalk.bold("Evaluation Results"));
                logger.always(chalk.gray("-".repeat(50)));
                logger.always(`${chalk.bold("Scorer".padEnd(25))} ${chalk.bold("Score".padEnd(10))} ${chalk.bold("Status")}`);
                logger.always(chalk.gray("-".repeat(50)));
                for (const score of result.scores) {
                    const status = score.passed ? chalk.green("PASS") : chalk.red("FAIL");
                    const scoreColor = score.passed ? chalk.green : chalk.red;
                    logger.always(`${score.scorerName.padEnd(25)} ${scoreColor(score.score.toFixed(2).padEnd(10))} ${status}`);
                }
                logger.always(chalk.gray("-".repeat(50)));
                const overallColor = result.passed ? chalk.green : chalk.red;
                logger.always(`${"Overall".padEnd(25)} ${overallColor(result.overallScore.toFixed(2).padEnd(10))} ${result.passed ? chalk.green("PASS") : chalk.red("FAIL")}`);
                logger.always("");
                logger.always(chalk.gray(`Total time: ${result.totalComputeTime}ms`));
            }
            else {
                // Default human-readable text output.
                logger.always("");
                logger.always(chalk.bold("Evaluation Results"));
                logger.always(chalk.gray("-".repeat(50)));
                const overallColor = result.passed ? chalk.green : chalk.red;
                const overallIcon = result.passed ? "PASS" : "FAIL";
                logger.always(`${overallColor(overallIcon)} Overall Score: ${overallColor(result.overallScore.toFixed(2))} (${result.aggregationMethod})`);
                logger.always("");
                logger.always(chalk.bold("Individual Scores:"));
                for (const score of result.scores) {
                    logger.always(formatScoreResult(score, verbose ?? false));
                }
                if (result.errors.length > 0) {
                    logger.always("");
                    logger.always(chalk.yellow("Errors:"));
                    for (const error of result.errors) {
                        logger.always(`  ${chalk.yellow("!")} ${error.scorerId}: ${error.error}`);
                    }
                }
                logger.always("");
                logger.always(chalk.gray(`Total time: ${result.totalComputeTime}ms`));
            }
        }
        catch (error) {
            spinner?.fail("Evaluation failed");
            const errorMessage = error instanceof Error ? error.message : String(error);
            logger.error(chalk.red(`Error: ${errorMessage}`));
            process.exit(1);
        }
    },
};
910
/**
 * Factory wrapper exposing the evaluate command (plus small listing
 * utilities) for use by CLICommandFactory.
 */
export class EvaluateCommandFactory {
    /**
     * Return the yargs command module for `evaluate`.
     */
    static createEvaluateCommand() {
        return evaluateCommand;
    }
    /**
     * Print every registered scorer with its description and type info.
     */
    static async listScorers() {
        await ScorerRegistry.registerBuiltInScorers();
        logger.always(chalk.bold("Available Scorers:"));
        logger.always("");
        for (const { id, description, type, category } of ScorerRegistry.list()) {
            logger.always(`  ${chalk.cyan(id)}`);
            logger.always(`    ${chalk.gray(description)}`);
            logger.always(`    Type: ${type}, Category: ${category}`);
            logger.always("");
        }
    }
    /**
     * Print every valid pipeline preset with its description and scorers.
     */
    static listPipelines() {
        logger.always(chalk.bold("Available Pipeline Presets:"));
        logger.always("");
        for (const name of getPresetNames()) {
            if (!isValidPreset(name)) {
                continue;
            }
            const config = getPreset(name);
            logger.always(`  ${chalk.cyan(name)}`);
            if (config.description) {
                logger.always(`    ${chalk.gray(config.description)}`);
            }
            logger.always(`    Scorers: ${config.scorers.map((s) => s.id).join(", ")}`);
            logger.always("");
        }
    }
}
955
+ //# sourceMappingURL=evaluate.js.map