@juspay/neurolink 9.3.0 → 9.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +12 -0
- package/README.md +8 -8
- package/dist/cli/commands/config.d.ts +3 -3
- package/dist/cli/index.js +1 -0
- package/dist/index.d.ts +35 -0
- package/dist/index.js +17 -0
- package/dist/lib/agent/directTools.d.ts +5 -5
- package/dist/lib/index.d.ts +35 -0
- package/dist/lib/index.js +17 -0
- package/dist/lib/neurolink.d.ts +12 -1
- package/dist/lib/neurolink.js +265 -4
- package/dist/lib/server/utils/validation.d.ts +8 -8
- package/dist/lib/types/generateTypes.d.ts +28 -0
- package/dist/lib/types/index.d.ts +6 -0
- package/dist/lib/types/index.js +12 -0
- package/dist/lib/types/modelTypes.d.ts +2 -2
- package/dist/lib/types/streamTypes.d.ts +35 -0
- package/dist/lib/types/workflowTypes.d.ts +558 -0
- package/dist/lib/types/workflowTypes.js +32 -0
- package/dist/lib/workflow/LAYER-EXAMPLES.d.ts +13 -0
- package/dist/lib/workflow/LAYER-EXAMPLES.js +312 -0
- package/dist/lib/workflow/PROMPT-EXAMPLES.d.ts +117 -0
- package/dist/lib/workflow/PROMPT-EXAMPLES.js +246 -0
- package/dist/lib/workflow/config.d.ts +1569 -0
- package/dist/lib/workflow/config.js +399 -0
- package/dist/lib/workflow/core/ensembleExecutor.d.ts +56 -0
- package/dist/lib/workflow/core/ensembleExecutor.js +398 -0
- package/dist/lib/workflow/core/judgeScorer.d.ts +26 -0
- package/dist/lib/workflow/core/judgeScorer.js +527 -0
- package/dist/lib/workflow/core/responseConditioner.d.ts +22 -0
- package/dist/lib/workflow/core/responseConditioner.js +226 -0
- package/dist/lib/workflow/core/types/conditionerTypes.d.ts +7 -0
- package/dist/lib/workflow/core/types/conditionerTypes.js +8 -0
- package/dist/lib/workflow/core/types/ensembleTypes.d.ts +7 -0
- package/dist/lib/workflow/core/types/ensembleTypes.js +8 -0
- package/dist/lib/workflow/core/types/index.d.ts +7 -0
- package/dist/lib/workflow/core/types/index.js +8 -0
- package/dist/lib/workflow/core/types/judgeTypes.d.ts +7 -0
- package/dist/lib/workflow/core/types/judgeTypes.js +8 -0
- package/dist/lib/workflow/core/types/layerTypes.d.ts +7 -0
- package/dist/lib/workflow/core/types/layerTypes.js +8 -0
- package/dist/lib/workflow/core/types/registryTypes.d.ts +7 -0
- package/dist/lib/workflow/core/types/registryTypes.js +8 -0
- package/dist/lib/workflow/core/workflowRegistry.d.ts +73 -0
- package/dist/lib/workflow/core/workflowRegistry.js +305 -0
- package/dist/lib/workflow/core/workflowRunner.d.ts +115 -0
- package/dist/lib/workflow/core/workflowRunner.js +554 -0
- package/dist/lib/workflow/index.d.ts +36 -0
- package/dist/lib/workflow/index.js +51 -0
- package/dist/lib/workflow/types.d.ts +19 -0
- package/dist/lib/workflow/types.js +10 -0
- package/dist/lib/workflow/utils/types/index.d.ts +7 -0
- package/dist/lib/workflow/utils/types/index.js +8 -0
- package/dist/lib/workflow/utils/types/metricsTypes.d.ts +7 -0
- package/dist/lib/workflow/utils/types/metricsTypes.js +8 -0
- package/dist/lib/workflow/utils/types/validationTypes.d.ts +7 -0
- package/dist/lib/workflow/utils/types/validationTypes.js +8 -0
- package/dist/lib/workflow/utils/workflowMetrics.d.ts +76 -0
- package/dist/lib/workflow/utils/workflowMetrics.js +312 -0
- package/dist/lib/workflow/utils/workflowValidation.d.ts +29 -0
- package/dist/lib/workflow/utils/workflowValidation.js +421 -0
- package/dist/lib/workflow/workflows/adaptiveWorkflow.d.ts +72 -0
- package/dist/lib/workflow/workflows/adaptiveWorkflow.js +367 -0
- package/dist/lib/workflow/workflows/consensusWorkflow.d.ts +69 -0
- package/dist/lib/workflow/workflows/consensusWorkflow.js +193 -0
- package/dist/lib/workflow/workflows/fallbackWorkflow.d.ts +49 -0
- package/dist/lib/workflow/workflows/fallbackWorkflow.js +226 -0
- package/dist/lib/workflow/workflows/multiJudgeWorkflow.d.ts +70 -0
- package/dist/lib/workflow/workflows/multiJudgeWorkflow.js +352 -0
- package/dist/neurolink.d.ts +12 -1
- package/dist/neurolink.js +265 -4
- package/dist/types/generateTypes.d.ts +28 -0
- package/dist/types/index.d.ts +6 -0
- package/dist/types/index.js +12 -0
- package/dist/types/streamTypes.d.ts +35 -0
- package/dist/types/workflowTypes.d.ts +558 -0
- package/dist/types/workflowTypes.js +31 -0
- package/dist/workflow/LAYER-EXAMPLES.d.ts +13 -0
- package/dist/workflow/LAYER-EXAMPLES.js +311 -0
- package/dist/workflow/PROMPT-EXAMPLES.d.ts +117 -0
- package/dist/workflow/PROMPT-EXAMPLES.js +245 -0
- package/dist/workflow/config.d.ts +1569 -0
- package/dist/workflow/config.js +398 -0
- package/dist/workflow/core/ensembleExecutor.d.ts +56 -0
- package/dist/workflow/core/ensembleExecutor.js +397 -0
- package/dist/workflow/core/judgeScorer.d.ts +26 -0
- package/dist/workflow/core/judgeScorer.js +526 -0
- package/dist/workflow/core/responseConditioner.d.ts +22 -0
- package/dist/workflow/core/responseConditioner.js +225 -0
- package/dist/workflow/core/types/conditionerTypes.d.ts +7 -0
- package/dist/workflow/core/types/conditionerTypes.js +7 -0
- package/dist/workflow/core/types/ensembleTypes.d.ts +7 -0
- package/dist/workflow/core/types/ensembleTypes.js +7 -0
- package/dist/workflow/core/types/index.d.ts +7 -0
- package/dist/workflow/core/types/index.js +7 -0
- package/dist/workflow/core/types/judgeTypes.d.ts +7 -0
- package/dist/workflow/core/types/judgeTypes.js +7 -0
- package/dist/workflow/core/types/layerTypes.d.ts +7 -0
- package/dist/workflow/core/types/layerTypes.js +7 -0
- package/dist/workflow/core/types/registryTypes.d.ts +7 -0
- package/dist/workflow/core/types/registryTypes.js +7 -0
- package/dist/workflow/core/workflowRegistry.d.ts +73 -0
- package/dist/workflow/core/workflowRegistry.js +304 -0
- package/dist/workflow/core/workflowRunner.d.ts +115 -0
- package/dist/workflow/core/workflowRunner.js +553 -0
- package/dist/workflow/index.d.ts +36 -0
- package/dist/workflow/index.js +50 -0
- package/dist/workflow/types.d.ts +19 -0
- package/dist/workflow/types.js +9 -0
- package/dist/workflow/utils/types/index.d.ts +7 -0
- package/dist/workflow/utils/types/index.js +7 -0
- package/dist/workflow/utils/types/metricsTypes.d.ts +7 -0
- package/dist/workflow/utils/types/metricsTypes.js +7 -0
- package/dist/workflow/utils/types/validationTypes.d.ts +7 -0
- package/dist/workflow/utils/types/validationTypes.js +7 -0
- package/dist/workflow/utils/workflowMetrics.d.ts +76 -0
- package/dist/workflow/utils/workflowMetrics.js +311 -0
- package/dist/workflow/utils/workflowValidation.d.ts +29 -0
- package/dist/workflow/utils/workflowValidation.js +420 -0
- package/dist/workflow/workflows/adaptiveWorkflow.d.ts +72 -0
- package/dist/workflow/workflows/adaptiveWorkflow.js +366 -0
- package/dist/workflow/workflows/consensusWorkflow.d.ts +69 -0
- package/dist/workflow/workflows/consensusWorkflow.js +192 -0
- package/dist/workflow/workflows/fallbackWorkflow.d.ts +49 -0
- package/dist/workflow/workflows/fallbackWorkflow.js +225 -0
- package/dist/workflow/workflows/multiJudgeWorkflow.d.ts +70 -0
- package/dist/workflow/workflows/multiJudgeWorkflow.js +351 -0
- package/package.json +3 -2
|
@@ -0,0 +1,527 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* workflow/core/judgeScorer.ts
|
|
3
|
+
* Judge-based scoring system for ensemble response evaluation
|
|
4
|
+
*/
|
|
5
|
+
import { AIProviderFactory } from "../../core/factory.js";
|
|
6
|
+
import { logger } from "../../utils/logger.js";
|
|
7
|
+
import { MAX_REASONING_LENGTH } from "../config.js";
|
|
8
|
+
import { WorkflowError } from "../types.js";
|
|
9
|
+
const functionTag = "JudgeScorer";
|
|
10
|
+
// ============================================================================
|
|
11
|
+
// SCORING FUNCTIONS
|
|
12
|
+
// ============================================================================
|
|
13
|
+
/**
 * Execute judge scoring on ensemble responses.
 *
 * Only responses with `status === "success"` and non-blank string content are
 * handed to the judges. Exactly one configured judge is run directly; any
 * other judge count goes through multi-judge voting. Failures are never
 * thrown to the caller: they are logged and reported through the `error`
 * field together with neutral placeholder scores.
 *
 * @param options - Scoring options: `judges`, `responses`, `originalPrompt`,
 *   `systemPrompt`, `timeout` and `workflowDefaults` (its `judgePrompt` is
 *   forwarded as the workflow-level default judge prompt)
 * @returns Object with `scores`, `judgeTime` (elapsed milliseconds) and, on
 *   failure, `error` (a WorkflowError)
 */
export async function scoreEnsemble(options) {
    const startTime = Date.now();
    const { judges, responses, originalPrompt, systemPrompt, timeout, workflowDefaults, } = options;
    logger.info(`[${functionTag}] Starting judge scoring`, {
        judgeCount: judges.length,
        responseCount: responses.length,
    });
    try {
        // Filter successful responses for evaluation. The string check keeps a
        // response with missing content from aborting the whole scoring run.
        const successfulResponses = responses.filter((r) => r.status === "success" &&
            typeof r.content === "string" &&
            r.content.trim() !== "");
        if (successfulResponses.length === 0) {
            throw new WorkflowError("No successful responses to evaluate", {
                code: "NO_RESPONSES_TO_EVALUATE",
                workflowId: "judge",
                phase: "judge",
                retryable: false,
            });
        }
        if (judges.length === 1) {
            // Single judge scoring
            const judgeResult = await executeSingleJudge(judges[0], successfulResponses, originalPrompt, systemPrompt, timeout, workflowDefaults?.judgePrompt);
            return {
                scores: judgeResult,
                judgeTime: Date.now() - startTime,
            };
        }
        // Multi-judge voting
        const multiJudgeResult = await executeMultiJudge(judges, successfulResponses, originalPrompt, systemPrompt, timeout, workflowDefaults?.judgePrompt);
        return {
            scores: multiJudgeResult,
            judgeTime: Date.now() - startTime,
        };
    }
    catch (error) {
        // Anything can be thrown, so do not assume an Error instance.
        const message = error instanceof Error ? error.message : String(error);
        logger.error(`[${functionTag}] Judge scoring failed`, {
            error: message,
        });
        const workflowError = error instanceof WorkflowError
            ? error
            : new WorkflowError(message, {
                code: "JUDGE_SCORING_ERROR",
                workflowId: "judge",
                phase: "judge",
                retryable: true,
            });
        return {
            // judges[0] is undefined when no judges were configured; fall back to
            // an empty config so building the placeholder cannot throw and mask
            // the error being reported.
            scores: createEmptyScores(judges[0] ?? {}, responses),
            judgeTime: Date.now() - startTime,
            error: workflowError,
        };
    }
}
|
|
73
|
+
/**
 * Execute single judge evaluation.
 *
 * Builds the judge prompt, creates the judge's provider through
 * AIProviderFactory, runs one generation and parses the returned text into
 * scores. Errors from provider creation or generation propagate to the caller.
 *
 * @param judge - Judge configuration
 * @param responses - Successful ensemble responses
 * @param originalPrompt - Original user prompt
 * @param systemPrompt - Optional system prompt; when set it takes precedence
 *   over `judge.systemPrompt`
 * @param timeout - Judge timeout in milliseconds; falls back to
 *   `judge.timeout`, then 10000
 * @param workflowDefaultJudgePrompt - Workflow-level judge prompt used when
 *   the judge has no `customPrompt`
 * @returns Judge scores with evaluation
 */
async function executeSingleJudge(judge, responses, originalPrompt, systemPrompt, timeout, workflowDefaultJudgePrompt) {
    const startTime = Date.now();
    logger.debug(`[${functionTag}] Executing single judge`, {
        provider: judge.provider,
        model: judge.model,
    });
    // Resolve judge prompt with hierarchical fallback:
    // 1. Judge-specific customPrompt (highest priority)
    // 2. Workflow-level default judge prompt
    // 3. Built-in default template
    const resolvedJudgePrompt = judge.customPrompt || workflowDefaultJudgePrompt;
    // Create judge prompt (will use resolvedJudgePrompt if provided, otherwise default template)
    const judgePrompt = createJudgePrompt(judge, responses, originalPrompt, resolvedJudgePrompt);
    // Execute judge
    const provider = await AIProviderFactory.createProvider(judge.provider, judge.model);
    const result = await provider.generate({
        prompt: judgePrompt,
        systemPrompt: systemPrompt || judge.systemPrompt,
        // `??` rather than `||`: an explicit temperature of 0 is a valid
        // setting and must not be replaced by the 0.1 default.
        temperature: judge.temperature ?? 0.1,
        maxTokens: judge.maxTokens || 2000,
        timeout: timeout || judge.timeout || 10000,
    });
    // Parse judge response
    const parsed = parseJudgeResponse(result?.content || "", responses, judge);
    // Build JudgeScores
    const judgeScores = {
        judgeProvider: judge.provider,
        judgeModel: judge.model,
        scores: parsed.scores,
        ranking: parsed.ranking,
        bestResponse: parsed.bestResponse,
        criteria: judge.criteria,
        reasoning: parsed.reasoning,
        synthesizedResponse: parsed.synthesizedResponse, // Include synthesized response if present
        confidenceInJudgment: parsed.confidenceInJudgment,
        judgeTime: Date.now() - startTime,
        timestamp: new Date().toISOString(),
    };
    logger.debug(`[${functionTag}] Single judge completed`, {
        bestResponse: judgeScores.bestResponse,
        hasSynthesizedResponse: !!parsed.synthesizedResponse,
        judgeTime: judgeScores.judgeTime,
    });
    return judgeScores;
}
|
|
127
|
+
/**
 * Execute multi-judge voting.
 *
 * Runs every judge in parallel. A judge that fails is logged and replaced by
 * neutral placeholder scores, so one failing judge does not abort the others.
 * The per-judge results are then averaged into a single result.
 *
 * @param judges - Array of judge configurations
 * @param responses - Successful ensemble responses
 * @param originalPrompt - Original user prompt
 * @param systemPrompt - Optional system prompt
 * @param timeout - Judge timeout in milliseconds
 * @param workflowDefaultJudgePrompt - Workflow-level judge prompt forwarded to
 *   each judge
 * @returns Multi-judge scores with aggregated results
 */
async function executeMultiJudge(judges, responses, originalPrompt, systemPrompt, timeout, workflowDefaultJudgePrompt) {
    const startTime = Date.now();
    logger.debug(`[${functionTag}] Executing multi-judge voting`, {
        judgeCount: judges.length,
    });
    // Execute all judges in parallel
    const judgePromises = judges.map((judge) => executeSingleJudge(judge, responses, originalPrompt, systemPrompt, timeout, workflowDefaultJudgePrompt).catch((error) => {
        logger.warn(`[${functionTag}] Judge failed`, {
            provider: judge.provider,
            model: judge.model,
            // A rejection is not guaranteed to be an Error; reading `.message`
            // from null/undefined here would reject Promise.all for all judges.
            error: error instanceof Error ? error.message : String(error),
        });
        return createEmptyJudgeScores(judge, responses);
    }));
    const judgeResults = await Promise.all(judgePromises);
    // Aggregate scores using average (can be configurable in future)
    const aggregated = aggregateJudgeScores(judgeResults, "average");
    const multiJudgeScores = {
        judges: judgeResults,
        averageScores: aggregated.averageScores,
        aggregatedRanking: aggregated.ranking,
        consensusLevel: calculateConsensusLevel(judgeResults),
        bestResponse: aggregated.bestResponse,
        confidence: aggregated.confidence,
        votingStrategy: "average",
        // Expose unified interface fields
        judgeProvider: judgeResults[0]?.judgeProvider,
        judgeModel: `multi-judge-${judges.length}`,
        scores: aggregated.averageScores,
        ranking: aggregated.ranking,
        reasoning: aggregated.reasoning,
        confidenceInJudgment: aggregated.confidence,
        criteria: judges[0]?.criteria || [],
        judgeTime: Date.now() - startTime,
        timestamp: new Date().toISOString(),
    };
    logger.debug(`[${functionTag}] Multi-judge completed`, {
        bestResponse: multiJudgeScores.bestResponse,
        consensusLevel: multiJudgeScores.consensusLevel,
        judgeTime: multiJudgeScores.judgeTime,
    });
    return multiJudgeScores;
}
|
|
179
|
+
// ============================================================================
|
|
180
|
+
// HELPER FUNCTIONS
|
|
181
|
+
// ============================================================================
|
|
182
|
+
/**
 * Create judge evaluation prompt.
 *
 * With a custom prompt, that prompt is returned unchanged. Otherwise a
 * built-in template is filled with the user question, one
 * `<response id="response-N">` block per response and the numbered criteria.
 * When `judge.synthesizeImprovedResponse` is set, the template additionally
 * asks the judge for a synthesized answer.
 *
 * @param judge - Judge configuration (`blindEvaluation`, `criteria` and
 *   `synthesizeImprovedResponse` are read here)
 * @param responses - Ensemble responses to evaluate
 * @param originalPrompt - Original user prompt
 * @param customPrompt - Custom evaluation prompt (overrides default)
 * @returns Formatted judge prompt
 */
function createJudgePrompt(judge, responses, originalPrompt, customPrompt) {
    // If custom prompt provided, use it.
    // NOTE(review): the custom prompt is returned verbatim, without the
    // responses or the user question appended - confirm that is intended.
    if (customPrompt) {
        logger.debug(`[${functionTag}] Using custom judge prompt`);
        return customPrompt;
    }
    // Build response blocks. With blind evaluation the provider/model is hidden
    // behind a positional label.
    // NOTE(review): leading whitespace inside the template literals below could
    // not be recovered from the reviewed copy - verify against the TS source.
    const responseBlocks = responses
        .map((r, index) => {
        const identifier = `response-${index}`;
        const modelInfo = judge.blindEvaluation
            ? `Response ${index + 1}`
            : `${r.provider}/${r.model}`;
        return `
<response id="${identifier}">
<model>${modelInfo}</model>
<content>
${r.content}
</content>
</response>`;
    })
        .join("\n");
    // Criteria may be omitted from a judge config; treat that as "no criteria"
    // instead of throwing on `.map` of undefined.
    const criteria = Array.isArray(judge.criteria) ? judge.criteria : [];
    const criteriaList = criteria.map((c, i) => `${i + 1}. ${c}`).join("\n");
    // If synthesis is enabled, judge creates improved response
    if (judge.synthesizeImprovedResponse) {
        return `You are an expert AI evaluator and synthesizer. Your task is to:
1. Evaluate all responses
2. Synthesize an IMPROVED final response that combines their strengths

USER QUESTION:
${originalPrompt}

RESPONSES TO EVALUATE:
${responseBlocks}

EVALUATION CRITERIA:
${criteriaList}

INSTRUCTIONS:
1. Score each response on a scale of 0-100 (0 = poor, 100 = excellent)
2. Consider all evaluation criteria listed above
3. Provide a ranking of responses from best to worst
4. Identify the single best response
5. Provide brief reasoning for your evaluation (max 200 characters)
6. **SYNTHESIZE an improved response** that:
- Combines the best elements from all responses
- Addresses any weaknesses identified in the evaluation
- Maintains accuracy and technical correctness
- Is more complete and higher quality than any single response
- Directly answers the user's question (no meta-commentary)
7. Rate your confidence in this judgment (0.0 to 1.0)

Respond in JSON format:
{
"scores": {
"response-0": 85,
"response-1": 92
},
"ranking": ["response-1", "response-0"],
"bestResponse": "response-1",
"reasoning": "Brief explanation of evaluation",
"synthesizedResponse": "Your improved, synthesized response here",
"confidenceInJudgment": 0.9
}`;
    }
    // Standard evaluation (no synthesis)
    return `You are an expert AI evaluator. Evaluate the following responses to the user's question.

USER QUESTION:
${originalPrompt}

RESPONSES TO EVALUATE:
${responseBlocks}

EVALUATION CRITERIA:
${criteriaList}

INSTRUCTIONS:
1. Score each response on a scale of 0-100 (0 = poor, 100 = excellent)
2. Consider all evaluation criteria listed above
3. Provide a ranking of responses from best to worst
4. Identify the single best response
5. Provide brief reasoning for your evaluation (max 200 characters)
6. Rate your confidence in this judgment (0.0 to 1.0)

Respond in JSON format:
{
"scores": {
"response-0": 85,
"response-1": 92
},
"ranking": ["response-1", "response-0"],
"bestResponse": "response-1",
"reasoning": "Brief explanation of evaluation",
"confidenceInJudgment": 0.9
}`;
}
|
|
289
|
+
/**
 * Parse judge response to extract scores.
 *
 * Takes the span from the first `{` to the last `}` in the judge's text and
 * parses it as JSON. Scores are clamped to 0-100, and every response without
 * a usable score gets the neutral value 50. Ranking and best response are
 * taken from the judge only when they refer to scored ids; otherwise they are
 * derived from the scores. If no JSON is found or parsing throws, neutral
 * fallback scores are returned instead of an error.
 *
 * @param content - Raw judge response content
 * @param responses - Original ensemble responses
 * @param _judge - Judge configuration (unused)
 * @returns Parsed judge response with scores
 */
function parseJudgeResponse(content, responses, _judge) {
    try {
        // Try to extract JSON from response
        const jsonMatch = content.match(/\{[\s\S]*\}/);
        if (!jsonMatch) {
            logger.warn(`[${functionTag}] No JSON found in judge response`);
            return createFallbackScores(responses);
        }
        const parsed = JSON.parse(jsonMatch[0]);
        // Validate and normalize scores to 0-100 range
        const rawScores = parsed.scores || {};
        const scores = {};
        Object.keys(rawScores).forEach((key) => {
            const score = Number(rawScores[key]);
            // A non-numeric value (e.g. "high") becomes NaN; skip it so it
            // cannot poison averages and sorting downstream.
            if (!Number.isNaN(score)) {
                scores[key] = Math.max(0, Math.min(100, score));
            }
        });
        // Ensure all responses have scores
        responses.forEach((_, index) => {
            const key = `response-${index}`;
            if (!(key in scores)) {
                scores[key] = 50; // Default neutral score
            }
        });
        // The judge's ranking/bestResponse are model output: accept only ids
        // that actually received a score.
        const isScoredId = (id) => typeof id === "string" &&
            Object.prototype.hasOwnProperty.call(scores, id);
        const judgeRanking = Array.isArray(parsed.ranking)
            ? parsed.ranking.filter(isScoredId)
            : [];
        // Truncation expects a string; anything else uses the default text.
        const reasoning = typeof parsed.reasoning === "string" && parsed.reasoning !== ""
            ? parsed.reasoning
            : "No reasoning provided";
        return {
            scores,
            ranking: judgeRanking.length > 0
                ? judgeRanking
                : generateRankingFromScores(scores),
            bestResponse: isScoredId(parsed.bestResponse)
                ? parsed.bestResponse
                : findBestResponse(scores),
            reasoning: truncateReasoning(reasoning),
            synthesizedResponse: parsed.synthesizedResponse, // Extract synthesized response if present
            confidenceInJudgment: normalizeConfidence(parsed.confidenceInJudgment),
        };
    }
    catch (error) {
        logger.warn(`[${functionTag}] Failed to parse judge response`, {
            error: error instanceof Error ? error.message : String(error),
        });
        return createFallbackScores(responses);
    }
}
|
|
334
|
+
/**
 * Build the neutral result used when a judge reply cannot be parsed.
 *
 * Every response gets the id `response-<index>` and the score 50; the ranking
 * lists the ids in input order and the first id is reported as the best.
 *
 * @param responses - Ensemble responses
 * @returns Default scores with equal values
 */
function createFallbackScores(responses) {
    // Ids in input order.
    const ranking = responses.reduce((ids, _response, position) => {
        ids.push(`response-${position}`);
        return ids;
    }, []);
    // Neutral score for every id.
    const scores = Object.fromEntries(ranking.map((id) => [id, 50]));
    const [bestResponse] = ranking;
    return {
        scores,
        ranking,
        bestResponse,
        reasoning: "Unable to parse judge evaluation",
        confidenceInJudgment: 0.5,
    };
}
|
|
355
|
+
/**
 * Generate ranking from scores.
 *
 * Ids with equal scores keep their insertion order. A score that is not a
 * number (NaN after conversion) is ranked last instead of producing an
 * inconsistent comparator result.
 *
 * @param scores - Score record (response id -> score)
 * @returns Array of response IDs sorted by score descending
 */
function generateRankingFromScores(scores) {
    const sortValue = (id) => {
        const value = Number(scores[id]);
        return Number.isNaN(value) ? -Infinity : value;
    };
    return Object.keys(scores).sort((a, b) => {
        const left = sortValue(a);
        const right = sortValue(b);
        // Explicit comparison: subtracting two -Infinity values would give NaN.
        if (left === right) {
            return 0;
        }
        return right > left ? 1 : -1;
    });
}
|
|
363
|
+
/**
 * Find best response from scores.
 *
 * The first id wins a tie. If no score compares greater than -Infinity (for
 * example all values are NaN) the first id is returned; an empty record
 * yields undefined.
 *
 * @param scores - Score record (response id -> score)
 * @returns Response ID with highest score
 */
function findBestResponse(scores) {
    const ids = Object.keys(scores);
    let bestId = ids[0];
    // Start below every possible score so negative scores are compared
    // correctly as well.
    let bestScore = -Infinity;
    for (const id of ids) {
        if (scores[id] > bestScore) {
            bestScore = scores[id];
            bestId = id;
        }
    }
    return bestId;
}
|
|
379
|
+
/**
 * Truncate reasoning to at most MAX_REASONING_LENGTH characters.
 *
 * Longer text is cut and terminated with "..." so the result, including the
 * ellipsis, is MAX_REASONING_LENGTH characters long. Non-string input is
 * converted to a string first (null/undefined become the empty string)
 * rather than throwing.
 *
 * @param reasoning - Reasoning text
 * @returns Truncated reasoning
 */
function truncateReasoning(reasoning) {
    const text = typeof reasoning === "string" ? reasoning : String(reasoning ?? "");
    if (text.length <= MAX_REASONING_LENGTH) {
        return text;
    }
    return text.substring(0, MAX_REASONING_LENGTH - 3) + "...";
}
|
|
390
|
+
/**
 * Normalize confidence to 0-1 range.
 *
 * Anything that is not a usable number (non-number types and NaN) is replaced
 * by the neutral value 0.5; numbers are clamped to [0, 1].
 *
 * @param confidence - Confidence value
 * @returns Normalized confidence between 0 and 1
 */
function normalizeConfidence(confidence) {
    // NaN has typeof "number", so it needs its own check; clamping NaN would
    // return NaN.
    if (typeof confidence !== "number" || Number.isNaN(confidence)) {
        return 0.5;
    }
    return Math.max(0, Math.min(1, confidence));
}
|
|
401
|
+
/**
 * Aggregate multiple judge scores.
 *
 * For every response id seen in any judge's scores, the numeric scores are
 * averaged (50 when no judge supplied a usable number). The ranking is derived
 * from those averages and its first entry is the best response. Confidence is
 * the mean of the judges' confidences, where a missing or non-numeric
 * confidence counts as 0.5 and negative values are ignored.
 *
 * @param judgeResults - Array of judge score results
 * @param _strategy - Aggregation strategy (currently only 'average')
 * @returns Aggregated scores and ranking
 */
function aggregateJudgeScores(judgeResults, _strategy) {
    const isUsableNumber = (value) => typeof value === "number" && !Number.isNaN(value);
    // Collect all response IDs
    const responseIds = new Set();
    judgeResults.forEach((result) => {
        Object.keys(result.scores ?? {}).forEach((id) => responseIds.add(id));
    });
    // Calculate average scores
    const averageScores = {};
    responseIds.forEach((id) => {
        const scores = judgeResults
            .map((result) => result.scores?.[id])
            .filter(isUsableNumber);
        if (scores.length > 0) {
            averageScores[id] =
                scores.reduce((sum, score) => sum + score, 0) / scores.length;
        }
        else {
            averageScores[id] = 50; // Default
        }
    });
    // Generate ranking from average scores
    const ranking = generateRankingFromScores(averageScores);
    const bestResponse = ranking[0];
    // Calculate aggregate confidence. Only a missing/invalid value falls back
    // to 0.5; a judge reporting 0 counts as 0.
    const confidences = judgeResults
        .map((r) => isUsableNumber(r.confidenceInJudgment) ? r.confidenceInJudgment : 0.5)
        .filter((c) => c >= 0);
    const confidence = confidences.length > 0
        ? confidences.reduce((sum, c) => sum + c, 0) / confidences.length
        : 0.5;
    // Aggregate reasoning
    const reasoning = `Aggregated from ${judgeResults.length} judges`;
    return {
        averageScores,
        ranking,
        bestResponse,
        confidence,
        reasoning,
    };
}
|
|
447
|
+
/**
 * Calculate consensus level between judges.
 *
 * The level is the share of judges that picked the most frequently chosen
 * best response. Judges without a `bestResponse` cast no vote but still count
 * in the denominator. Fewer than two judges is treated as full consensus, and
 * no votes at all yields 0.
 *
 * @param judgeResults - Array of judge score results
 * @returns Consensus level between 0 and 1
 */
function calculateConsensusLevel(judgeResults) {
    if (judgeResults.length < 2) {
        return 1.0; // Perfect consensus with single judge
    }
    // Calculate agreement on best response
    const voteCounts = new Map();
    for (const { bestResponse } of judgeResults) {
        if (bestResponse) {
            voteCounts.set(bestResponse, (voteCounts.get(bestResponse) || 0) + 1);
        }
    }
    // Math.max() with no arguments is -Infinity, so handle "nobody voted"
    // explicitly.
    if (voteCounts.size === 0) {
        return 0;
    }
    const maxCount = Math.max(...voteCounts.values());
    return maxCount / judgeResults.length;
}
|
|
467
|
+
/**
 * Create empty judge scores for error cases.
 *
 * Every response gets the neutral score 50 under the id `response-<index>`.
 * No ranking, best response, reasoning or confidence is set. A missing judge
 * configuration is tolerated so this helper cannot throw while an error is
 * being reported.
 *
 * @param judge - Judge configuration (may be undefined)
 * @param responses - Ensemble responses
 * @returns Empty judge scores
 */
function createEmptyJudgeScores(judge, responses) {
    const scores = {};
    responses.forEach((_, index) => {
        scores[`response-${index}`] = 50;
    });
    return {
        judgeProvider: judge?.provider,
        judgeModel: judge?.model,
        scores,
        criteria: judge?.criteria,
        judgeTime: 0,
        timestamp: new Date().toISOString(),
    };
}
|
|
487
|
+
/**
 * Build placeholder scores for a failed scoring run.
 *
 * Delegates to createEmptyJudgeScores with the same arguments.
 *
 * @param judge - Judge configuration
 * @param responses - Ensemble responses
 * @returns Empty judge scores
 */
function createEmptyScores(judge, responses) {
    const placeholderScores = createEmptyJudgeScores(judge, responses);
    return placeholderScores;
}
|
|
496
|
+
/**
 * Get best response from judge scores.
 *
 * `scores.bestResponse` is expected to look like `response-<index>`; the
 * index selects the entry of `responses`. A numeric id is accepted as the
 * index itself. A missing id, or one that does not resolve to an existing
 * response, yields undefined instead of throwing.
 *
 * @param scores - Judge scores or multi-judge scores
 * @param responses - Original ensemble responses
 * @returns Best ensemble response, or undefined when it cannot be resolved
 */
export function getBestResponse(scores, responses) {
    const bestId = scores.bestResponse;
    if (bestId === undefined || bestId === null || bestId === "") {
        return undefined;
    }
    // The id comes from model output and may not be a string, so convert it
    // before stripping the prefix.
    const index = Number.parseInt(String(bestId).replace("response-", ""), 10);
    return Number.isNaN(index) ? undefined : responses[index];
}
|
|
510
|
+
/**
 * Get ranked responses.
 *
 * Maps each `response-<index>` id in `scores.ranking` to the matching entry
 * of `responses`, in ranking order. Ids that do not resolve to a response are
 * skipped and a response is returned at most once even if the ranking repeats
 * it. When the ranking is missing, empty or not an array, `responses` is
 * returned unchanged.
 *
 * @param scores - Judge scores or multi-judge scores
 * @param responses - Original ensemble responses
 * @returns Responses sorted by ranking
 */
export function getRankedResponses(scores, responses) {
    const ranking = scores.ranking;
    if (!Array.isArray(ranking) || ranking.length === 0) {
        return responses;
    }
    const seenIndexes = new Set();
    const ranked = [];
    for (const id of ranking) {
        // Ranking entries come from model output and may not be strings.
        const index = Number.parseInt(String(id).replace("response-", ""), 10);
        if (seenIndexes.has(index)) {
            continue;
        }
        seenIndexes.add(index);
        const response = responses[index];
        if (response !== undefined) {
            ranked.push(response);
        }
    }
    return ranked;
}
|
|
527
|
+
//# sourceMappingURL=judgeScorer.js.map
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* workflow/core/responseConditioner.ts
|
|
3
|
+
* Response conditioning and synthesis
|
|
4
|
+
*
|
|
5
|
+
* Uses judge feedback and ensemble responses to synthesize an improved final response.
|
|
6
|
+
* Combines strengths from multiple responses based on evaluation insights.
|
|
7
|
+
*/
|
|
8
|
+
import type { ConditioningConfig } from "../types.js";
|
|
9
|
+
import type { ConditionOptions, ConditionResult } from "./types/index.js";
|
|
10
|
+
/**
|
|
11
|
+
* Condition response by synthesizing improved version using judge feedback
|
|
12
|
+
*
|
|
13
|
+
* @param options - Conditioning options including all responses and judge feedback
|
|
14
|
+
* @returns Conditioned result with synthesized improved content
|
|
15
|
+
*/
|
|
16
|
+
// Asynchronous: the ConditionResult is delivered through the returned Promise, so callers must await it.
export declare function conditionResponse(options: ConditionOptions): Promise<ConditionResult>;
|
|
17
|
+
/**
|
|
18
|
+
* Check if conditioning is enabled
|
|
19
|
+
* @param config - Conditioning configuration
|
|
20
|
+
* @returns True if conditioning should be applied
|
|
21
|
+
*/
|
|
22
|
+
// `config` may be omitted; this declaration does not show which value is returned in that case.
export declare function isConditioningEnabled(config?: ConditioningConfig): boolean;
|