kimi-vercel-ai-sdk-provider 0.3.0 → 0.5.0
This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- package/README.md +567 -17
- package/dist/index.d.mts +1750 -3
- package/dist/index.d.ts +1750 -3
- package/dist/index.js +2317 -161
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +2292 -160
- package/dist/index.mjs.map +1 -1
- package/package.json +1 -1
- package/src/__tests__/auto-detect.test.ts +140 -0
- package/src/__tests__/code-validation.test.ts +267 -0
- package/src/__tests__/ensemble.test.ts +242 -0
- package/src/__tests__/file-cache.test.ts +310 -0
- package/src/__tests__/model-config.test.ts +120 -0
- package/src/__tests__/multi-agent.test.ts +201 -0
- package/src/__tests__/project-tools.test.ts +181 -0
- package/src/__tests__/reasoning-utils.test.ts +164 -0
- package/src/__tests__/tools.test.ts +76 -8
- package/src/chat/kimi-chat-language-model.ts +21 -2
- package/src/chat/kimi-chat-settings.ts +15 -1
- package/src/code-validation/detector.ts +319 -0
- package/src/code-validation/index.ts +31 -0
- package/src/code-validation/types.ts +291 -0
- package/src/code-validation/validator.ts +547 -0
- package/src/core/errors.ts +91 -0
- package/src/core/index.ts +15 -3
- package/src/core/types.ts +57 -2
- package/src/core/utils.ts +138 -0
- package/src/ensemble/index.ts +17 -0
- package/src/ensemble/multi-sampler.ts +433 -0
- package/src/ensemble/types.ts +279 -0
- package/src/files/attachment-processor.ts +51 -4
- package/src/files/file-cache.ts +260 -0
- package/src/files/index.ts +16 -1
- package/src/index.ts +102 -3
- package/src/kimi-provider.ts +354 -1
- package/src/multi-agent/index.ts +21 -0
- package/src/multi-agent/types.ts +312 -0
- package/src/multi-agent/workflows.ts +539 -0
- package/src/project-tools/index.ts +16 -0
- package/src/project-tools/scaffolder.ts +494 -0
- package/src/project-tools/types.ts +244 -0
- package/src/tools/auto-detect.ts +276 -0
- package/src/tools/index.ts +6 -2
- package/src/tools/prepare-tools.ts +179 -4
package/src/core/types.ts
CHANGED
````diff
@@ -70,18 +70,68 @@ export interface KimiModelCapabilities {
    * Whether the model supports structured outputs.
    */
   structuredOutputs?: boolean;
+
+  /**
+   * Default temperature for the model.
+   * Thinking models require temperature=1.0 for optimal reasoning.
+   */
+  defaultTemperature?: number;
+
+  /**
+   * Whether temperature is locked (cannot be changed).
+   * Thinking models have this set to true.
+   */
+  temperatureLocked?: boolean;
+
+  /**
+   * Default max output tokens for the model.
+   * Thinking models need higher limits to avoid truncated reasoning.
+   */
+  defaultMaxOutputTokens?: number;
 }
 
+/**
+ * Default temperature for thinking models.
+ * Kimi thinking models require temperature=1.0 for optimal reasoning quality.
+ */
+export const THINKING_MODEL_TEMPERATURE = 1.0;
+
+/**
+ * Default max output tokens for thinking models.
+ * Higher limit ensures reasoning traces aren't truncated.
+ */
+export const THINKING_MODEL_DEFAULT_MAX_TOKENS = 32768;
+
+/**
+ * Default max output tokens for standard models.
+ */
+export const STANDARD_MODEL_DEFAULT_MAX_TOKENS = 4096;
+
 /**
  * Infer model capabilities from the model ID.
  *
  * @param modelId - The model identifier
  * @returns Inferred capabilities based on model name patterns
  *
+ * @remarks
+ * This function automatically detects model capabilities and sets
+ * appropriate defaults:
+ * - Thinking models (`-thinking` suffix) get temperature=1.0 locked
+ * - Thinking models get 32k default max_tokens to avoid truncation
+ * - K2.5 models get video input support
+ *
  * @example
  * ```ts
  * const caps = inferModelCapabilities('kimi-k2.5-thinking');
- * // {
+ * // {
+ * //   thinking: true,
+ * //   alwaysThinking: true,
+ * //   videoInput: true,
+ * //   temperatureLocked: true,
+ * //   defaultTemperature: 1.0,
+ * //   defaultMaxOutputTokens: 32768,
+ * //   ...
+ * // }
 * ```
  */
 export function inferModelCapabilities(modelId: string): KimiModelCapabilities {
@@ -96,7 +146,12 @@ export function inferModelCapabilities(modelId: string): KimiModelCapabilities {
     maxContextSize: 256_000, // 256k context window
     toolCalling: true,
     jsonMode: true,
-    structuredOutputs: true
+    structuredOutputs: true,
+    // Thinking models require temperature=1.0 for optimal reasoning
+    defaultTemperature: isThinkingModel ? THINKING_MODEL_TEMPERATURE : undefined,
+    temperatureLocked: isThinkingModel,
+    // Thinking models need higher token limits to avoid truncated reasoning
+    defaultMaxOutputTokens: isThinkingModel ? THINKING_MODEL_DEFAULT_MAX_TOKENS : STANDARD_MODEL_DEFAULT_MAX_TOKENS
   };
 }
 
````
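The hunk shows where the defaults are computed but not where they are applied; presumably the chat model consumes them when building requests. A minimal sketch of the intended effect, assuming `inferModelCapabilities` is re-exported from the package root (the `applyCapabilityDefaults` helper is hypothetical, not a package export):

```ts
import { inferModelCapabilities } from 'kimi-vercel-ai-sdk-provider';

// Hypothetical helper: fold the inferred capability defaults into request params.
function applyCapabilityDefaults(
  modelId: string,
  requested: { temperature?: number; maxOutputTokens?: number }
) {
  const caps = inferModelCapabilities(modelId);
  return {
    // A locked temperature wins over whatever the caller asked for.
    temperature: caps.temperatureLocked
      ? caps.defaultTemperature
      : (requested.temperature ?? caps.defaultTemperature),
    // Fall back to the per-model default token limit.
    maxOutputTokens: requested.maxOutputTokens ?? caps.defaultMaxOutputTokens
  };
}

// For 'kimi-k2.5-thinking' this yields { temperature: 1, maxOutputTokens: 32768 }
// even if the caller passed temperature: 0.2, matching the locked defaults above.
applyCapabilityDefaults('kimi-k2.5-thinking', { temperature: 0.2 });
```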
package/src/core/utils.ts
CHANGED
````diff
@@ -208,3 +208,141 @@ export function extractMessageContent(message: {
 
   return { text, reasoning };
 }
+
+// ============================================================================
+// Multi-turn Reasoning Utilities
+// ============================================================================
+
+/**
+ * Information about reasoning content in a conversation.
+ */
+export interface ReasoningAnalysis {
+  /** Total number of messages with reasoning content */
+  messagesWithReasoning: number;
+  /** Total reasoning tokens (estimated by character count / 4) */
+  estimatedReasoningTokens: number;
+  /** Whether reasoning is properly preserved in the conversation */
+  isPreserved: boolean;
+  /** Messages that are missing expected reasoning content */
+  missingReasoningIndices: number[];
+}
+
+/**
+ * Analyze reasoning content preservation in a conversation.
+ *
+ * This utility helps verify that reasoning content is being properly
+ * preserved across multi-turn conversations with thinking models.
+ * Kimi requires reasoning content to be maintained in the message
+ * history for logical continuity in agentic/tool-calling scenarios.
+ *
+ * @param messages - Array of messages to analyze
+ * @returns Analysis of reasoning preservation
+ *
+ * @example
+ * ```ts
+ * const analysis = analyzeReasoningPreservation(messages);
+ * if (!analysis.isPreserved) {
+ *   console.warn('Reasoning content missing from messages:', analysis.missingReasoningIndices);
+ * }
+ * ```
+ */
+export function analyzeReasoningPreservation(
+  messages: Array<{
+    role: string;
+    content?: unknown;
+    reasoning_content?: string | null;
+    reasoning?: string | null;
+  }>
+): ReasoningAnalysis {
+  let messagesWithReasoning = 0;
+  let totalReasoningChars = 0;
+  const missingReasoningIndices: number[] = [];
+
+  // Track whether we've seen a tool call that should have reasoning preserved
+  let expectReasoningAfterToolCall = false;
+
+  for (let i = 0; i < messages.length; i++) {
+    const message = messages[i];
+
+    if (message.role === 'assistant') {
+      const { reasoning } = extractMessageContent(message);
+
+      if (reasoning.length > 0) {
+        messagesWithReasoning++;
+        totalReasoningChars += reasoning.length;
+        expectReasoningAfterToolCall = false;
+      } else if (expectReasoningAfterToolCall) {
+        // This assistant message should have reasoning from the previous turn
+        missingReasoningIndices.push(i);
+      }
+
+      // Check if this message has tool calls
+      if ('tool_calls' in message && Array.isArray(message.tool_calls) && message.tool_calls.length > 0) {
+        expectReasoningAfterToolCall = true;
+      }
+    } else if (message.role === 'tool') {
+      // After a tool response, we expect the next assistant message to potentially have reasoning
+      expectReasoningAfterToolCall = true;
+    }
+  }
+
+  return {
+    messagesWithReasoning,
+    estimatedReasoningTokens: Math.ceil(totalReasoningChars / 4),
+    isPreserved: missingReasoningIndices.length === 0,
+    missingReasoningIndices
+  };
+}
+
+/**
+ * Check if a conversation is suitable for thinking models.
+ *
+ * Thinking models work best with:
+ * - Complex reasoning tasks
+ * - Multi-step problem solving
+ * - Tasks requiring chain-of-thought
+ *
+ * This helper provides guidance on whether a thinking model would benefit
+ * the conversation.
+ *
+ * @param messageCount - Number of messages in the conversation
+ * @param hasToolCalls - Whether the conversation includes tool calls
+ * @param estimatedComplexity - Estimated task complexity (0-1)
+ * @returns Recommendation on using thinking models
+ */
+export function recommendThinkingModel(
+  messageCount: number,
+  hasToolCalls: boolean,
+  estimatedComplexity: number
+): { recommended: boolean; reason: string } {
+  // Thinking models are recommended for:
+  // 1. Complex tasks (complexity > 0.5)
+  // 2. Agentic scenarios with tool calls
+  // 3. Multi-turn conversations where reasoning continuity matters
+
+  if (estimatedComplexity > 0.7) {
+    return {
+      recommended: true,
+      reason: 'High complexity task benefits from extended reasoning'
+    };
+  }
+
+  if (hasToolCalls && messageCount > 2) {
+    return {
+      recommended: true,
+      reason: 'Multi-turn tool usage benefits from reasoning preservation'
+    };
+  }
+
+  if (estimatedComplexity > 0.5) {
+    return {
+      recommended: true,
+      reason: 'Moderate complexity may benefit from reasoning'
+    };
+  }
+
+  return {
+    recommended: false,
+    reason: 'Standard model sufficient for this task'
+  };
+}
````
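A short, illustrative pre-flight check built from the two new helpers (the message array and import path are made up for the example; `extractMessageContent` is assumed to read `reasoning_content`, as the JSDoc example above suggests):

```ts
import { analyzeReasoningPreservation, recommendThinkingModel } from './core/utils';

const messages = [
  { role: 'user', content: 'Refactor the module and run the tests.' },
  {
    role: 'assistant',
    content: 'Running the tests first.',
    reasoning_content: 'Plan: run tests, then refactor.',
    tool_calls: [{ id: 'call_1' }] // illustrative shape
  },
  { role: 'tool', content: '{"passed":true}' },
  { role: 'assistant', content: 'All tests pass.' } // reasoning stripped by the caller
];

// The last assistant turn follows a tool response but carries no reasoning,
// so it is flagged: { isPreserved: false, missingReasoningIndices: [3], ... }
const analysis = analyzeReasoningPreservation(messages);

// hasToolCalls && messageCount > 2 triggers the second rule:
// { recommended: true, reason: 'Multi-turn tool usage benefits from reasoning preservation' }
const advice = recommendThinkingModel(messages.length, true, 0.4);

console.log(analysis, advice);
```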
package/src/ensemble/index.ts
ADDED
````diff
@@ -0,0 +1,17 @@
+/**
+ * Ensemble module exports.
+ * @module
+ */
+
+export type { GenerateFunction, MultiSamplerOptions } from './multi-sampler';
+export type {
+  EnsembleConfig,
+  EnsembleMetadata,
+  EnsembleResponse,
+  EnsembleResult,
+  EnsembleState,
+  ScoringHeuristic,
+  ScoringOptions,
+  SelectionStrategy
+} from './types';
+export { MultiSampler, createSingletonEnsembleResult } from './multi-sampler';
````
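Consumers would then import through the barrel rather than the individual modules; for example (paths relative to `src/`):

```ts
import { MultiSampler, createSingletonEnsembleResult } from './ensemble';
import type { EnsembleConfig, EnsembleResult } from './ensemble';
```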
package/src/ensemble/multi-sampler.ts
ADDED
````diff
@@ -0,0 +1,433 @@
+/**
+ * Multi-sampler implementation for ensemble generation.
+ * @module
+ */
+
+import type {
+  EnsembleConfig,
+  EnsembleMetadata,
+  EnsembleResponse,
+  EnsembleResult,
+  LanguageModelUsage,
+  ScoringHeuristic,
+  SelectionStrategy
+} from './types';
+
+// ============================================================================
+// Types
+// ============================================================================
+
+/**
+ * A function that generates a single response.
+ */
+export type GenerateFunction = (options: { temperature: number; sampleIndex: number }) => Promise<{
+  text: string;
+  reasoning?: string;
+  toolCalls?: unknown[];
+  toolResults?: unknown[];
+  usage?: LanguageModelUsage;
+  finishReason: string;
+}>;
+
+/**
+ * Options for creating a multi-sampler.
+ */
+export interface MultiSamplerOptions {
+  /**
+   * The model ID being used.
+   */
+  modelId: string;
+
+  /**
+   * Base temperature for generation.
+   */
+  baseTemperature?: number;
+}
+
+// ============================================================================
+// MultiSampler Class
+// ============================================================================
+
+/**
+ * Multi-sampler for generating multiple responses and selecting the best one.
+ *
+ * @example
+ * ```ts
+ * const sampler = new MultiSampler({ modelId: 'kimi-k2.5' });
+ * const result = await sampler.generate(generateFn, {
+ *   n: 3,
+ *   selectionStrategy: 'best',
+ *   scoringHeuristic: 'code',
+ * });
+ * ```
+ */
+export class MultiSampler {
+  private modelId: string;
+  private baseTemperature: number;
+
+  constructor(options: MultiSamplerOptions) {
+    this.modelId = options.modelId;
+    this.baseTemperature = options.baseTemperature ?? 0.7;
+  }
+
+  /**
+   * Generate multiple samples and select the best one.
+   *
+   * @param generateFn - Function to generate a single response
+   * @param config - Ensemble configuration
+   * @returns The ensemble result
+   */
+  async generate(generateFn: GenerateFunction, config: EnsembleConfig): Promise<EnsembleResult> {
+    const startTime = Date.now();
+    const {
+      n,
+      selectionStrategy = 'best',
+      temperatureVariance = 0.1,
+      scoringHeuristic = 'confidence',
+      customScorer,
+      timeoutMs = 60000,
+      allowPartialFailure = true,
+      minSuccessfulSamples = 1
+    } = config;
+
+    // Validate configuration
+    if (n < 1 || n > 10) {
+      throw new Error('Ensemble n must be between 1 and 10');
+    }
+
+    // Generate samples in parallel
+    const promises = Array.from({ length: n }, async (_, i) => {
+      const temperature = Math.min(this.baseTemperature + i * temperatureVariance, 2.0);
+      const sampleStart = Date.now();
+
+      try {
+        const result = await generateFn({ temperature, sampleIndex: i });
+        return {
+          text: result.text,
+          reasoning: result.reasoning,
+          toolCalls: result.toolCalls,
+          toolResults: result.toolResults,
+          usage: result.usage,
+          sampleIndex: i,
+          temperature,
+          finishReason: result.finishReason,
+          success: true,
+          durationMs: Date.now() - sampleStart
+        } as EnsembleResponse;
+      } catch (error) {
+        return {
+          text: '',
+          sampleIndex: i,
+          temperature,
+          finishReason: 'error',
+          success: false,
+          error: error instanceof Error ? error.message : 'Unknown error',
+          durationMs: Date.now() - sampleStart
+        } as EnsembleResponse;
+      }
+    });
+
+    // Wait for all samples with timeout
+    let responses: EnsembleResponse[];
+    try {
+      responses = await Promise.race([
+        Promise.all(promises),
+        new Promise<never>((_, reject) =>
+          setTimeout(() => reject(new Error('Ensemble generation timed out')), timeoutMs)
+        )
+      ]);
+    } catch (_error) {
+      // On timeout, wait a bit more to collect partial results
+      const partialResponses = await Promise.all(
+        promises.map((p) =>
+          p.catch(
+            () =>
+              ({
+                text: '',
+                sampleIndex: -1,
+                temperature: 0,
+                finishReason: 'timeout',
+                success: false,
+                error: 'Timed out'
+              }) as EnsembleResponse
+          )
+        )
+      );
+      responses = partialResponses.filter((r) => r.sampleIndex >= 0);
+    }
+
+    // Filter successful responses
+    const successfulResponses = responses.filter((r) => r.success);
+
+    if (successfulResponses.length < minSuccessfulSamples && !allowPartialFailure) {
+      throw new Error(
+        `Only ${successfulResponses.length} samples succeeded, minimum required is ${minSuccessfulSamples}`
+      );
+    }
+
+    if (successfulResponses.length === 0) {
+      throw new Error('All ensemble samples failed');
+    }
+
+    // Apply selection strategy
+    const result = this.selectBest(successfulResponses, responses, {
+      selectionStrategy,
+      scoringHeuristic,
+      customScorer,
+      modelId: this.modelId,
+      startTime
+    });
+
+    return result;
+  }
+
+  /**
+   * Select the best response based on the strategy.
+   */
+  private selectBest(
+    successfulResponses: EnsembleResponse[],
+    allResponses: EnsembleResponse[],
+    options: {
+      selectionStrategy: SelectionStrategy;
+      scoringHeuristic: ScoringHeuristic;
+      customScorer?: (response: EnsembleResponse) => number;
+      modelId: string;
+      startTime: number;
+    }
+  ): EnsembleResult {
+    const { selectionStrategy, scoringHeuristic, customScorer, modelId, startTime } = options;
+
+    // Score all successful responses
+    const scored = successfulResponses.map((r) => {
+      return {
+        ...r,
+        score: this.calculateScore(r, scoringHeuristic, customScorer)
+      };
+    });
+
+    let winner: EnsembleResponse;
+    let alternatives: EnsembleResponse[] | undefined;
+
+    switch (selectionStrategy) {
+      case 'first':
+        winner = scored[0];
+        break;
+
+      case 'vote':
+        winner = this.majorityVote(scored);
+        break;
+
+      case 'best':
+        scored.sort((a, b) => (b.score ?? 0) - (a.score ?? 0));
+        winner = scored[0];
+        break;
+
+      case 'all':
+        winner = scored[0];
+        alternatives = scored;
+        break;
+
+      default:
+        throw new Error(`Unknown selection strategy: ${selectionStrategy}`);
+    }
+
+    const metadata: EnsembleMetadata = {
+      nRequested: allResponses.length,
+      nCompleted: successfulResponses.length,
+      nFailed: allResponses.filter((r) => !r.success).length,
+      selectionStrategy,
+      winningIndex: winner.sampleIndex,
+      scores: scored.map((r) => r.score ?? 0),
+      durationMs: Date.now() - startTime,
+      modelId,
+      totalUsage: this.aggregateUsage(successfulResponses)
+    };
+
+    return {
+      text: winner.text,
+      reasoning: winner.reasoning,
+      toolCalls: winner.toolCalls as EnsembleResult['toolCalls'],
+      toolResults: winner.toolResults as EnsembleResult['toolResults'],
+      usage: winner.usage ?? { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
+      alternatives,
+      metadata
+    };
+  }
+
+  /**
+   * Calculate score for a response based on the heuristic.
+   */
+  private calculateScore(
+    response: EnsembleResponse,
+    heuristic: ScoringHeuristic,
+    customScorer?: (response: EnsembleResponse) => number
+  ): number {
+    switch (heuristic) {
+      case 'length':
+        // Prefer concise answers (inverse length, normalized)
+        return 1000 / (response.text.length + 1);
+
+      case 'confidence':
+        // Higher completion tokens often indicates more complete reasoning
+        return response.usage?.completionTokens ?? 0;
+
+      case 'code':
+        return this.scoreCodeQuality(response.text);
+
+      case 'custom':
+        if (!customScorer) {
+          throw new Error('Custom scorer function required for custom heuristic');
+        }
+        return customScorer(response);
+
+      default:
+        return 0;
+    }
+  }
+
+  /**
+   * Score code quality based on heuristics.
+   */
+  private scoreCodeQuality(text: string): number {
+    let score = 100;
+
+    // Deduct for common error patterns
+    const errorPatterns = [
+      { pattern: /SyntaxError/gi, penalty: 25 },
+      { pattern: /ReferenceError/gi, penalty: 20 },
+      { pattern: /TypeError/gi, penalty: 20 },
+      { pattern: /undefined is not/gi, penalty: 15 },
+      { pattern: /cannot read property/gi, penalty: 15 },
+      { pattern: /is not defined/gi, penalty: 15 },
+      { pattern: /unexpected token/gi, penalty: 20 },
+      { pattern: /null is not/gi, penalty: 15 }
+    ];
+
+    for (const { pattern, penalty } of errorPatterns) {
+      const matches = text.match(pattern);
+      if (matches) {
+        score -= penalty * matches.length;
+      }
+    }
+
+    // Bonus for proper code blocks
+    if (text.includes('```')) {
+      score += 10;
+    }
+
+    // Bonus for comments/documentation
+    if (/\/\/.*|\/\*[\s\S]*?\*\/|#.*/.test(text)) {
+      score += 5;
+    }
+
+    // Bonus for test mentions
+    if (/\b(test|spec|assert|expect|describe|it)\b/i.test(text)) {
+      score += 5;
+    }
+
+    // Bonus for type annotations (TypeScript)
+    if (/:\s*(string|number|boolean|void|any|unknown|never)\b/.test(text)) {
+      score += 5;
+    }
+
+    // Penalty for TODO/FIXME left in code
+    if (/\b(TODO|FIXME|XXX|HACK)\b/i.test(text)) {
+      score -= 5;
+    }
+
+    return Math.max(0, score);
+  }
+
+  /**
+   * Select the most common response (majority voting).
+   */
+  private majorityVote(responses: EnsembleResponse[]): EnsembleResponse {
+    // Simple text similarity voting based on normalized text
+    const normalized = responses.map((r) => {
+      return {
+        response: r,
+        key: r.text.toLowerCase().replace(/\s+/g, ' ').trim().slice(0, 500)
+      };
+    });
+
+    const votes = new Map<string, { count: number; response: EnsembleResponse }>();
+
+    for (const { response, key } of normalized) {
+      const existing = votes.get(key);
+      if (existing) {
+        existing.count++;
+      } else {
+        votes.set(key, { count: 1, response });
+      }
+    }
+
+    // Find the response with the most votes
+    let maxVotes = 0;
+    let winner = responses[0];
+
+    for (const { count, response } of votes.values()) {
+      if (count > maxVotes) {
+        maxVotes = count;
+        winner = response;
+      }
+    }
+
+    return winner;
+  }
+
+  /**
+   * Aggregate usage across all responses.
+   */
+  private aggregateUsage(responses: EnsembleResponse[]): LanguageModelUsage {
+    return responses.reduce(
+      (acc, r) => {
+        return {
+          promptTokens: acc.promptTokens + (r.usage?.promptTokens ?? 0),
+          completionTokens: acc.completionTokens + (r.usage?.completionTokens ?? 0),
+          totalTokens: acc.totalTokens + (r.usage?.totalTokens ?? 0)
+        };
+      },
+      { promptTokens: 0, completionTokens: 0, totalTokens: 0 }
+    );
+  }
+}
+
+// ============================================================================
+// Utility Functions
+// ============================================================================
+
+/**
+ * Create a simple ensemble result from a single response.
+ * Useful for when ensemble is disabled but consistent return types are needed.
+ */
+export function createSingletonEnsembleResult(
+  response: {
+    text: string;
+    reasoning?: string;
+    toolCalls?: unknown[];
+    toolResults?: unknown[];
+    usage?: LanguageModelUsage;
+    finishReason: string;
+  },
+  modelId: string,
+  durationMs: number
+): EnsembleResult {
+  return {
+    text: response.text,
+    reasoning: response.reasoning,
+    toolCalls: response.toolCalls as EnsembleResult['toolCalls'],
+    toolResults: response.toolResults as EnsembleResult['toolResults'],
+    usage: response.usage ?? { promptTokens: 0, completionTokens: 0, totalTokens: 0 },
+    metadata: {
+      nRequested: 1,
+      nCompleted: 1,
+      nFailed: 0,
+      selectionStrategy: 'first',
+      winningIndex: 0,
+      scores: [100],
+      durationMs,
+      modelId,
+      totalUsage: response.usage ?? { promptTokens: 0, completionTokens: 0, totalTokens: 0 }
+    }
+  };
+}
````
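The sampler never calls the model itself; the caller supplies a `GenerateFunction`. A minimal wiring sketch, assuming an AI SDK v4-style `generateText` whose `usage` object matches this package's `LanguageModelUsage`, and a `createKimi` provider factory (the factory name is an assumption, not confirmed by this diff):

```ts
import { generateText } from 'ai';
import { createKimi } from 'kimi-vercel-ai-sdk-provider'; // factory name assumed
import { MultiSampler } from './ensemble';

const kimi = createKimi({ apiKey: process.env.KIMI_API_KEY });
const sampler = new MultiSampler({ modelId: 'kimi-k2.5', baseTemperature: 0.7 });

const result = await sampler.generate(
  async ({ temperature }) => {
    // Sample i runs at baseTemperature + i * temperatureVariance, capped at 2.0.
    const { text, finishReason, usage } = await generateText({
      model: kimi('kimi-k2.5'),
      prompt: 'Write a binary search in TypeScript with tests.',
      temperature
    });
    return { text, finishReason, usage };
  },
  { n: 3, selectionStrategy: 'best', scoringHeuristic: 'code' }
);

// metadata reports which sample won and the per-sample code-quality scores.
console.log(result.metadata.winningIndex, result.metadata.scores);
```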