@thispointon/kondi-chat 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +556 -0
- package/bin/kondi-chat +56 -0
- package/bin/kondi-chat.js +72 -0
- package/package.json +55 -0
- package/scripts/demo.tape +49 -0
- package/scripts/postinstall.cjs +103 -0
- package/src/audit/analytics.ts +261 -0
- package/src/audit/ledger.ts +253 -0
- package/src/audit/telemetry.ts +165 -0
- package/src/cli/backend.ts +675 -0
- package/src/cli/commands.ts +419 -0
- package/src/cli/help.ts +182 -0
- package/src/cli/submit-helpers.ts +159 -0
- package/src/cli/submit.ts +539 -0
- package/src/cli/wizard.ts +121 -0
- package/src/context/bootstrap.ts +138 -0
- package/src/context/budget.ts +100 -0
- package/src/context/manager.ts +666 -0
- package/src/context/memory.ts +160 -0
- package/src/context/preflight.ts +176 -0
- package/src/context/project-brain.ts +101 -0
- package/src/context/receipts.ts +108 -0
- package/src/context/skills.ts +154 -0
- package/src/context/symbol-index.ts +240 -0
- package/src/council/profiles.ts +137 -0
- package/src/council/tool.ts +138 -0
- package/src/council-engine/cli/council-artifacts.ts +230 -0
- package/src/council-engine/cli/council-config.ts +178 -0
- package/src/council-engine/cli/council-session-export.ts +116 -0
- package/src/council-engine/cli/kondi.ts +98 -0
- package/src/council-engine/cli/llm-caller.ts +229 -0
- package/src/council-engine/cli/localStorage-shim.ts +119 -0
- package/src/council-engine/cli/node-platform.ts +68 -0
- package/src/council-engine/cli/run-council.ts +481 -0
- package/src/council-engine/cli/run-pipeline.ts +772 -0
- package/src/council-engine/cli/session-export.ts +153 -0
- package/src/council-engine/configs/councils/analysis.json +101 -0
- package/src/council-engine/configs/councils/code-planning.json +86 -0
- package/src/council-engine/configs/councils/coding.json +89 -0
- package/src/council-engine/configs/councils/debate.json +97 -0
- package/src/council-engine/configs/councils/solo-claude.json +34 -0
- package/src/council-engine/configs/councils/solo-gpt.json +34 -0
- package/src/council-engine/council/coding-orchestrator.ts +1205 -0
- package/src/council-engine/council/context-bootstrap.ts +147 -0
- package/src/council-engine/council/context-inspection.ts +42 -0
- package/src/council-engine/council/context-store.ts +763 -0
- package/src/council-engine/council/deliberation-orchestrator.ts +2762 -0
- package/src/council-engine/council/factory.ts +164 -0
- package/src/council-engine/council/index.ts +201 -0
- package/src/council-engine/council/ledger-store.ts +438 -0
- package/src/council-engine/council/prompts.ts +1689 -0
- package/src/council-engine/council/storage-cleanup.ts +164 -0
- package/src/council-engine/council/store.ts +1110 -0
- package/src/council-engine/council/synthesis.ts +291 -0
- package/src/council-engine/council/types.ts +845 -0
- package/src/council-engine/council/validation.ts +613 -0
- package/src/council-engine/pipeline/build-detect.ts +73 -0
- package/src/council-engine/pipeline/executor.ts +1048 -0
- package/src/council-engine/pipeline/index.ts +9 -0
- package/src/council-engine/pipeline/install-detect.ts +84 -0
- package/src/council-engine/pipeline/memory-store.ts +182 -0
- package/src/council-engine/pipeline/output-parsers.ts +146 -0
- package/src/council-engine/pipeline/run-output.ts +149 -0
- package/src/council-engine/pipeline/session-import.ts +177 -0
- package/src/council-engine/pipeline/store.ts +753 -0
- package/src/council-engine/pipeline/test-detect.ts +82 -0
- package/src/council-engine/pipeline/types.ts +401 -0
- package/src/council-engine/services/deliberationSummary.ts +114 -0
- package/src/council-engine/tsconfig.json +16 -0
- package/src/council-engine/types/mcp.ts +122 -0
- package/src/council-engine/utils/filterTools.ts +73 -0
- package/src/engine/apply.ts +238 -0
- package/src/engine/checkpoints.ts +237 -0
- package/src/engine/consultants.ts +347 -0
- package/src/engine/diff.ts +171 -0
- package/src/engine/errors.ts +102 -0
- package/src/engine/git-tools.ts +246 -0
- package/src/engine/hooks.ts +181 -0
- package/src/engine/loop-guard.ts +155 -0
- package/src/engine/permissions.ts +293 -0
- package/src/engine/pipeline.ts +376 -0
- package/src/engine/sub-agents.ts +133 -0
- package/src/engine/task-card.ts +185 -0
- package/src/engine/task-router.ts +256 -0
- package/src/engine/task-store.ts +86 -0
- package/src/engine/tools.ts +783 -0
- package/src/engine/verify.ts +111 -0
- package/src/mcp/client.ts +225 -0
- package/src/mcp/config.ts +120 -0
- package/src/mcp/tool-manager.ts +192 -0
- package/src/mcp/types.ts +61 -0
- package/src/providers/llm-caller.ts +943 -0
- package/src/providers/rate-limiter.ts +238 -0
- package/src/router/NOTES.md +28 -0
- package/src/router/collector.ts +474 -0
- package/src/router/embeddings.ts +286 -0
- package/src/router/index.ts +299 -0
- package/src/router/intent-router.ts +225 -0
- package/src/router/nn-router.ts +205 -0
- package/src/router/profiles.ts +309 -0
- package/src/router/registry.ts +565 -0
- package/src/router/rules.ts +274 -0
- package/src/router/train.py +408 -0
- package/src/session/store.ts +211 -0
- package/src/test-utils/mock-llm.ts +39 -0
- package/src/types.ts +322 -0
- package/src/web/manager.ts +311 -0
|
@@ -0,0 +1,943 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-provider LLM caller — direct HTTP, no SDKs.
|
|
3
|
+
*
|
|
4
|
+
* Supports:
|
|
5
|
+
* - Simple single-turn (systemPrompt + userMessage)
|
|
6
|
+
* - Multi-turn with tool use (messages array + tools)
|
|
7
|
+
* - Anthropic, OpenAI-compatible, and Gemini providers
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import type {
|
|
11
|
+
ProviderId, LLMRequest, LLMResponse,
|
|
12
|
+
ToolDefinition, ToolCall, LLMMessage,
|
|
13
|
+
} from '../types.ts';
|
|
14
|
+
import { getRateLimiter, RateLimitOverflowError } from './rate-limiter.ts';
|
|
15
|
+
import { estimateTokens } from '../context/budget.ts';
|
|
16
|
+
import { LlmCallError, type ErrorSeverity } from '../engine/errors.ts';
|
|
17
|
+
|
|
18
|
+
/**
 * Map an HTTP status code onto an error severity.
 *
 * Throttling (429) and any 5xx response are reported as 'recoverable',
 * i.e. worth retrying. Every other status is reported as 'fatal'
 * (wrong key, bad request, unknown model, …).
 *
 * @param status HTTP status code of the failed response.
 * @returns 'recoverable' for 429 and 500–599, otherwise 'fatal'.
 */
function severityForStatus(status: number): ErrorSeverity {
  const throttled = status === 429;
  const serverSide = status >= 500 && status < 600;
  return throttled || serverSide ? 'recoverable' : 'fatal';
}
|
|
28
|
+
|
|
29
|
+
// ---------------------------------------------------------------------------
|
|
30
|
+
// SSE stream parser
|
|
31
|
+
// ---------------------------------------------------------------------------
|
|
32
|
+
|
|
33
|
+
/**
 * Incrementally parse a Server-Sent Events response body.
 *
 * Each yielded item carries the most recent `event:` name seen for that
 * event (or `undefined`) and the event's `data:` payload. Multiple `data:`
 * lines of one event are joined with '\n'. The payload is JSON-parsed when
 * possible and otherwise yielded as the raw joined string. `data: [DONE]`
 * lines are dropped; comment, `id:` and `retry:` lines are ignored.
 *
 * The parser is deliberately lenient at end of stream: an event that was
 * not terminated by a blank line is still emitted, including a final line
 * that has no trailing newline.
 *
 * @param resp Response whose body is an SSE stream. Yields nothing when the
 *             response has no body.
 */
async function* parseSSE(resp: Response): AsyncGenerator<{ type?: string; data?: any }> {
  const reader = resp.body?.getReader();
  if (!reader) return;

  const decoder = new TextDecoder();
  let buffer = '';
  let eventType: string | undefined;
  let dataLines: string[] = [];

  // Build the pending event (if any data was collected) and reset per-event state.
  const takeEvent = (): { type?: string; data?: any } | undefined => {
    const pendingLines = dataLines;
    const type = eventType;
    eventType = undefined;
    dataLines = [];
    if (pendingLines.length === 0) return undefined;

    const joined = pendingLines.join('\n');
    let parsed: any;
    try {
      parsed = JSON.parse(joined);
    } catch {
      // Not JSON — hand the raw text to the caller.
      parsed = joined;
    }
    return { type, data: parsed };
  };

  // Feed one physical line. Returns an event only when the line is the
  // blank separator that terminates one.
  const feedLine = (line: string): { type?: string; data?: any } | undefined => {
    const trimmed = line.trim();
    if (trimmed === '') return takeEvent();

    if (trimmed.startsWith('event:') || trimmed.startsWith('event :')) {
      eventType = trimmed.replace(/^event\s*:\s*/, '');
    } else if (trimmed.startsWith('data:') || trimmed.startsWith('data :')) {
      const raw = trimmed.replace(/^data\s*:\s*/, '');
      if (raw !== '[DONE]') dataLines.push(raw);
    }
    // Ignore other lines (comments starting with :, id:, retry:, etc.)
    return undefined;
  };

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split('\n');
      // The last element is an incomplete line (or '') — keep it for the next chunk.
      buffer = lines.pop() || '';

      for (const line of lines) {
        const event = feedLine(line);
        if (event) yield event;
      }
    }

    // Flush bytes still held by the decoder, then consume a final line that
    // arrived without a trailing newline so it is not silently lost.
    buffer += decoder.decode();
    if (buffer.trim() !== '') feedLine(buffer);

    // Flush any remaining event at end of stream.
    const last = takeEvent();
    if (last) yield last;
  } finally {
    reader.releaseLock();
  }
}
|
|
93
|
+
|
|
94
|
+
// ---------------------------------------------------------------------------
|
|
95
|
+
// DeepSeek JSON sanitizer
|
|
96
|
+
// ---------------------------------------------------------------------------
|
|
97
|
+
|
|
98
|
+
/**
 * Post-process an already-serialized JSON request body before it is sent
 * to DeepSeek.
 *
 * Background (from the original author's observation): DeepSeek's body
 * parser appears to parse string values a second time and treats `\x` as a
 * hex escape, failing on content such as ANSI codes (`\x1b[31m`) embedded
 * in the conversation. JSON.stringify has already turned a literal
 * backslash into two backslashes, so the serialized text contains two
 * backslashes followed by `x`.
 *
 * Two rewrites are applied, in this order:
 *  1. Every "two backslashes + x" sequence gets its backslashes doubled
 *     (four backslashes + x).
 *  2. Raw control characters U+0000–U+0008, U+000B, U+000C and
 *     U+000E–U+001F are removed. Tab, LF and CR are kept. JSON.stringify
 *     should never emit these raw, so this is a safety net only.
 *
 * @param json Serialized JSON text.
 * @returns The rewritten JSON text.
 */
function sanitizeJsonForDeepSeek(json: string): string {
  const withDoubledHexEscapes = json.replace(/\\\\x/g, '\\\\\\\\x');
  const withoutControlChars = withDoubledHexEscapes.replace(/[\x00-\x08\x0b\x0c\x0e-\x1f]/g, '');
  return withoutControlChars;
}
|
|
121
|
+
|
|
122
|
+
// ---------------------------------------------------------------------------
|
|
123
|
+
// Default models per provider
|
|
124
|
+
// ---------------------------------------------------------------------------
|
|
125
|
+
|
|
126
|
+
/** Default model identifier for each supported provider. */
export const DEFAULT_MODELS: Record<ProviderId, string> = {
  anthropic: 'claude-sonnet-4-5-20250929',
  openai: 'gpt-4o',
  deepseek: 'deepseek-chat',
  // Gemini ids carry the `models/` resource prefix.
  google: 'models/gemini-2.5-flash',
  xai: 'grok-3',
  zai: 'glm-4.6',
  ollama: 'llama3.1',
  'nvidia-router': 'auto',
};
|
|
136
|
+
|
|
137
|
+
// ---------------------------------------------------------------------------
|
|
138
|
+
// API key resolution
|
|
139
|
+
// ---------------------------------------------------------------------------
|
|
140
|
+
|
|
141
|
+
/**
 * Look up the API key for a provider in the process environment.
 *
 * @param provider Provider to resolve a key for.
 * @returns The value of the provider's environment variable, or `undefined`
 *          when the variable is unset or the provider has no key variable
 *          in the table below (e.g. 'ollama').
 */
function getApiKey(provider: ProviderId): string | undefined {
  // A Map (rather than a plain object) so that only the listed ids resolve.
  const envVarByProvider = new Map<string, string>([
    ['anthropic', 'ANTHROPIC_API_KEY'],
    ['openai', 'OPENAI_API_KEY'],
    ['deepseek', 'DEEPSEEK_API_KEY'],
    ['xai', 'XAI_API_KEY'],
    ['zai', 'ZAI_API_KEY'],
    ['google', 'GOOGLE_API_KEY'],
    ['nvidia-router', 'NVIDIA_API_KEY'],
  ]);

  const envVar = envVarByProvider.get(provider);
  return envVar === undefined ? undefined : process.env[envVar];
}
|
|
153
|
+
|
|
154
|
+
// ---------------------------------------------------------------------------
|
|
155
|
+
// Anthropic
|
|
156
|
+
// ---------------------------------------------------------------------------
|
|
157
|
+
|
|
158
|
+
/**
 * Send one request to the Anthropic Messages API and normalize the reply
 * into an LLMResponse.
 *
 * - `req.cacheablePrefix`, when set, is sent as a separate system block
 *   marked `cache_control: ephemeral`, followed by `req.systemPrompt`.
 * - `req.messages` (multi-turn) takes precedence over `req.userMessage`.
 * - Streaming is used only when BOTH `req.stream` and `req.onToken` are
 *   set. Without a callback a plain JSON response is requested, so the
 *   response body always matches the parser that reads it.
 *
 * @param apiKey Anthropic API key (sent as `x-api-key`).
 * @param model  Model id to request.
 * @param req    Provider-independent request.
 * @returns Text content, token usage, latency, and optional tool calls /
 *          reasoning content.
 * @throws LlmCallError when the HTTP response is not OK; severity is
 *         derived from the status code.
 */
async function callAnthropic(
  apiKey: string,
  model: string,
  req: LLMRequest,
): Promise<LLMResponse> {
  const start = Date.now();

  // Streaming needs somewhere to deliver tokens; otherwise fall back to JSON.
  const onToken = req.stream && req.onToken ? req.onToken : undefined;

  const systemContent: Array<{ type: string; text: string; cache_control?: { type: string } }> = [];
  if (req.cacheablePrefix) {
    systemContent.push({
      type: 'text',
      text: req.cacheablePrefix,
      cache_control: { type: 'ephemeral' },
    });
  }
  systemContent.push({ type: 'text', text: req.systemPrompt });

  const messages: any[] = req.messages
    ? anthropicMessages(req.messages)
    : [{ role: 'user', content: req.userMessage || '' }];

  const tools = req.tools?.map(t => ({
    name: t.name,
    description: t.description,
    input_schema: t.parameters,
  }));

  const body: any = {
    model,
    max_tokens: req.maxOutputTokens ?? 8192,
    system: systemContent,
    messages,
    ...(req.temperature !== undefined ? { temperature: req.temperature } : {}),
    ...(tools ? { tools } : {}),
    ...(onToken ? { stream: true } : {}),
  };

  const resp = await fetch('https://api.anthropic.com/v1/messages', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'anthropic-version': '2023-06-01',
      'anthropic-beta': 'prompt-caching-2024-07-31',
      'x-api-key': apiKey,
    },
    body: JSON.stringify(body),
  });

  if (!resp.ok) {
    const text = await resp.text();
    throw new LlmCallError(
      `Anthropic API ${resp.status}: ${text.substring(0, 500)}`,
      { provider: 'anthropic', model, status: resp.status, severity: severityForStatus(resp.status) },
    );
  }

  // Streaming path
  if (onToken) {
    let content = '';
    let reasoningContent = '';
    const toolCalls: ToolCall[] = [];
    let inputTokens = 0;
    let outputTokens = 0;
    let cached = false;
    let cachedInputTokens = 0;

    // State of the tool_use block currently being streamed (empty id = none).
    let currentToolId = '';
    let currentToolName = '';
    let currentToolJson = '';

    for await (const event of parseSSE(resp)) {
      try {
        if (event.type === 'message_start') {
          const usage = event.data?.message?.usage;
          if (usage) {
            inputTokens = usage.input_tokens || 0;
            cachedInputTokens = usage.cache_read_input_tokens || 0;
            cached = cachedInputTokens > 0;
          }
        } else if (event.type === 'content_block_start') {
          const block = event.data?.content_block;
          if (block?.type === 'tool_use') {
            currentToolId = block.id;
            currentToolName = block.name;
            currentToolJson = '';
          } else if (block?.type === 'thinking') {
            if (block.thinking) reasoningContent += block.thinking;
          }
        } else if (event.type === 'content_block_delta') {
          const delta = event.data?.delta;
          if (delta?.type === 'text_delta' && delta.text) {
            content += delta.text;
            onToken(delta.text);
          } else if (delta?.type === 'input_json_delta' && delta.partial_json) {
            // Tool arguments arrive as JSON fragments; parsed at block stop.
            currentToolJson += delta.partial_json;
          } else if (delta?.type === 'thinking_delta' && delta.thinking) {
            reasoningContent += delta.thinking;
          }
        } else if (event.type === 'content_block_stop') {
          if (currentToolId) {
            let args: any = {};
            try {
              args = currentToolJson ? JSON.parse(currentToolJson) : {};
            } catch {
              // Unparseable arguments: keep the call, with empty arguments.
              args = {};
            }
            toolCalls.push({ id: currentToolId, name: currentToolName, arguments: args });
            currentToolId = '';
            currentToolName = '';
            currentToolJson = '';
          }
        } else if (event.type === 'message_delta') {
          const usage = event.data?.usage;
          if (usage) outputTokens = usage.output_tokens || 0;
        }
      } catch { /* skip malformed SSE event */ }
    }

    return {
      content, model, provider: 'anthropic',
      inputTokens, outputTokens,
      latencyMs: Date.now() - start, cached,
      ...(cachedInputTokens > 0 ? { cachedInputTokens } : {}),
      ...(toolCalls.length > 0 ? { toolCalls } : {}),
      ...(reasoningContent ? { reasoningContent } : {}),
    };
  }

  // Non-streaming path
  const data: any = await resp.json();
  const usage = data.usage || {};

  let content = '';
  let reasoningContent = '';
  const toolCalls: ToolCall[] = [];

  for (const block of data.content || []) {
    if (block.type === 'text') {
      content += block.text;
    } else if (block.type === 'thinking') {
      reasoningContent += block.thinking || '';
    } else if (block.type === 'tool_use') {
      toolCalls.push({
        id: block.id,
        name: block.name,
        arguments: block.input || {},
      });
    }
  }

  const cacheReadTokens = usage.cache_read_input_tokens || 0;

  return {
    content, model, provider: 'anthropic',
    inputTokens: usage.input_tokens || 0,
    outputTokens: usage.output_tokens || 0,
    latencyMs: Date.now() - start,
    cached: cacheReadTokens > 0,
    ...(cacheReadTokens > 0 ? { cachedInputTokens: usage.cache_read_input_tokens } : {}),
    ...(toolCalls.length > 0 ? { toolCalls } : {}),
    ...(reasoningContent ? { reasoningContent } : {}),
  };
}
|
|
330
|
+
|
|
331
|
+
/**
 * Translate provider-independent messages into Anthropic's message format.
 *
 * - 'user'      → `{ role: 'user', content: <string> }` ('' when absent).
 * - 'assistant' → `{ role: 'assistant', content: [...] }`: an optional text
 *                 block followed by one `tool_use` block per tool call.
 * - 'tool'      → `{ role: 'user', content: [...] }`: one `tool_result`
 *                 block per result, since Anthropic carries tool results
 *                 in user messages.
 * Messages with any other role are omitted.
 */
function anthropicMessages(messages: LLMMessage[]): any[] {
  const converted: any[] = [];

  for (const message of messages) {
    switch (message.role) {
      case 'user': {
        converted.push({ role: 'user', content: message.content || '' });
        break;
      }
      case 'assistant': {
        const blocks: any[] = message.content
          ? [{ type: 'text', text: message.content }]
          : [];
        for (const call of message.toolCalls ?? []) {
          blocks.push({
            type: 'tool_use',
            id: call.id,
            name: call.name,
            input: call.arguments,
          });
        }
        converted.push({ role: 'assistant', content: blocks });
        break;
      }
      case 'tool': {
        const blocks: any[] = [];
        for (const outcome of message.toolResults ?? []) {
          blocks.push({
            type: 'tool_result',
            tool_use_id: outcome.toolCallId,
            content: outcome.content,
            ...(outcome.isError ? { is_error: true } : {}),
          });
        }
        converted.push({ role: 'user', content: blocks });
        break;
      }
      default:
        break;
    }
  }

  return converted;
}
|
|
374
|
+
|
|
375
|
+
// ---------------------------------------------------------------------------
|
|
376
|
+
// OpenAI-compatible (OpenAI, DeepSeek, xAI, NVIDIA router, Ollama)
|
|
377
|
+
// ---------------------------------------------------------------------------
|
|
378
|
+
|
|
379
|
+
/**
 * Send one request to an OpenAI-compatible `/chat/completions` endpoint and
 * normalize the reply into an LLMResponse.
 *
 * - `req.cacheablePrefix` is prepended to the system prompt.
 * - `req.messages` (multi-turn) takes precedence over `req.userMessage`.
 * - The output cap is sent as `max_completion_tokens` for 'openai' and as
 *   `max_tokens` for every other provider.
 * - For 'deepseek' the serialized body is passed through
 *   `sanitizeJsonForDeepSeek` before sending.
 * - Streaming is used only when BOTH `req.stream` and `req.onToken` are
 *   set, so the response body always matches the parser that reads it.
 * - Tool-call arguments that are empty or not valid JSON become `{}` on
 *   both the streaming and the non-streaming path.
 *
 * @param baseUrl  API base URL; `/chat/completions` is appended.
 * @param apiKey   Bearer token.
 * @param model    Model id to request.
 * @param provider Provider id, used for request quirks and error reporting.
 * @param req      Provider-independent request.
 * @returns Text content, the model reported by the server (falling back to
 *          `model`), token usage, latency, and optional tool calls /
 *          reasoning content.
 * @throws LlmCallError when the HTTP response is not OK; severity is
 *         derived from the status code.
 */
async function callOpenAICompatible(
  baseUrl: string,
  apiKey: string,
  model: string,
  provider: ProviderId,
  req: LLMRequest,
): Promise<LLMResponse> {
  const start = Date.now();

  // Streaming needs somewhere to deliver tokens; otherwise fall back to JSON.
  const onToken = req.stream && req.onToken ? req.onToken : undefined;

  // Decode tool-call arguments: JSON text is parsed, already-decoded values
  // pass through, and empty / malformed / missing input becomes {}.
  const parseToolArguments = (raw: unknown): any => {
    if (typeof raw !== 'string') return raw ?? {};
    if (raw === '') return {};
    try {
      return JSON.parse(raw);
    } catch {
      return {};
    }
  };

  const systemContent = req.cacheablePrefix
    ? `${req.cacheablePrefix}\n\n${req.systemPrompt}`
    : req.systemPrompt;

  // Messages — multi-turn or single-turn
  const messages: any[] = req.messages
    ? [
        { role: 'system', content: systemContent },
        ...openaiMessages(req.messages),
      ]
    : [
        { role: 'system', content: systemContent },
        { role: 'user', content: req.userMessage || '' },
      ];

  // Tools
  const tools = req.tools?.map(t => ({
    type: 'function' as const,
    function: {
      name: t.name,
      description: t.description,
      parameters: t.parameters,
    },
  }));

  const max = req.maxOutputTokens ?? 8192;
  const body: any = {
    model,
    messages,
    ...(req.temperature !== undefined ? { temperature: req.temperature } : {}),
    ...(tools ? { tools } : {}),
    ...(onToken ? { stream: true } : {}),
  };

  if (provider === 'openai') {
    body.max_completion_tokens = max; // new OpenAI param
  } else {
    body.max_tokens = max; // legacy / compatible providers
  }

  // DeepSeek's JSON parser is stricter than most. The sanitizer doubles the
  // backslashes in front of `x` and strips raw control characters from the
  // serialized body.
  let bodyJson = JSON.stringify(body);
  if (provider === 'deepseek') {
    bodyJson = sanitizeJsonForDeepSeek(bodyJson);
  }

  const resp = await fetch(`${baseUrl}/chat/completions`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`,
    },
    body: bodyJson,
  });

  if (!resp.ok) {
    const text = await resp.text();
    throw new LlmCallError(
      `${provider} API ${resp.status}: ${text.substring(0, 500)}`,
      { provider, model, status: resp.status, severity: severityForStatus(resp.status) },
    );
  }

  // Streaming path
  if (onToken) {
    let content = '';
    let reasoningContent = '';
    const toolCalls: ToolCall[] = [];
    const toolJsonBuffers: Map<number, { id: string; name: string; json: string }> = new Map();
    let actualModel = model;
    let inputTokens = 0;
    let outputTokens = 0;
    let cachedInputTokens = 0;

    for await (const event of parseSSE(resp)) {
      if (!event.data || event.data === '[DONE]') continue;

      let chunk: any;
      try {
        if (typeof event.data === 'string') {
          const raw = event.data.trim();
          const clean = raw.startsWith('data:') ? raw.slice(5).trim() : raw;
          chunk = JSON.parse(clean);
        } else {
          chunk = event.data;
        }
      } catch {
        continue; // Skip unparseable chunks
      }
      if (chunk.model) actualModel = chunk.model;

      // Usage (some providers send this in the final chunk). Read it before
      // the delta check: such a chunk may carry no choices[0].delta at all.
      if (chunk.usage) {
        inputTokens = chunk.usage.prompt_tokens || 0;
        outputTokens = chunk.usage.completion_tokens || 0;
        cachedInputTokens = chunk.usage.prompt_tokens_details?.cached_tokens || 0;
      }

      const delta = chunk.choices?.[0]?.delta;
      if (!delta) continue;

      // Text content
      if (delta.content) {
        content += delta.content;
        onToken(delta.content);
      }

      // Reasoning content (z.ai GLM-5.x, OpenAI o-series, DeepSeek-R1, Ollama Qwopus).
      // Ollama uses `delta.reasoning`; others use `delta.reasoning_content`.
      const reasoning = delta.reasoning_content || delta.reasoning;
      if (reasoning) {
        reasoningContent += reasoning;
      }

      // Tool calls (streamed incrementally, keyed by their index)
      if (delta.tool_calls) {
        for (const tc of delta.tool_calls) {
          const idx = tc.index ?? 0;
          let buf = toolJsonBuffers.get(idx);
          // Start a new buffer only for a new id. A fragment that repeats
          // the current id must not discard arguments collected so far.
          if (tc.id && (!buf || buf.id !== tc.id)) {
            buf = { id: tc.id, name: '', json: '' };
            toolJsonBuffers.set(idx, buf);
          }
          if (!buf) continue; // fragment before any id for this slot — ignored
          if (tc.function?.name && !buf.name) buf.name = tc.function.name;
          if (tc.function?.arguments) buf.json += tc.function.arguments;
        }
      }
    }

    // Finalize tool calls
    for (const buf of toolJsonBuffers.values()) {
      toolCalls.push({ id: buf.id, name: buf.name, arguments: parseToolArguments(buf.json) });
    }

    return {
      content, model: actualModel, provider,
      inputTokens, outputTokens,
      latencyMs: Date.now() - start,
      ...(cachedInputTokens > 0 ? { cachedInputTokens, cached: true } : {}),
      ...(toolCalls.length > 0 ? { toolCalls } : {}),
      ...(reasoningContent ? { reasoningContent } : {}),
    };
  }

  // Non-streaming path
  const data: any = await resp.json();
  const choice = data.choices?.[0]?.message || {};
  const usage = data.usage || {};
  const cachedInputTokensNs = usage.prompt_tokens_details?.cached_tokens || 0;
  const actualModel = data.model || model;

  const toolCalls: ToolCall[] = [];
  if (choice.tool_calls) {
    for (const tc of choice.tool_calls) {
      toolCalls.push({
        id: tc.id,
        name: tc.function.name,
        arguments: parseToolArguments(tc.function.arguments),
      });
    }
  }

  const reasoning = choice.reasoning_content || choice.reasoning;

  return {
    content: choice.content || '',
    model: actualModel,
    provider,
    inputTokens: usage.prompt_tokens || 0,
    outputTokens: usage.completion_tokens || 0,
    latencyMs: Date.now() - start,
    ...(cachedInputTokensNs > 0 ? { cachedInputTokens: cachedInputTokensNs, cached: true } : {}),
    ...(toolCalls.length > 0 ? { toolCalls } : {}),
    ...(reasoning ? { reasoningContent: reasoning } : {}),
  };
}
|
|
574
|
+
|
|
575
|
+
/**
 * Translate provider-independent messages into OpenAI chat messages.
 *
 * - 'user'      → `{ role: 'user', content }` ('' when absent).
 * - 'assistant' → `{ role: 'assistant' }` plus, when present, `content`,
 *                 `reasoning_content` (DeepSeek requires it to be passed
 *                 back in multi-turn) and `tool_calls` with the arguments
 *                 re-serialized as JSON text.
 * - 'tool'      → one `{ role: 'tool', tool_call_id, content }` message per
 *                 tool result.
 * Messages with any other role are omitted.
 */
function openaiMessages(messages: LLMMessage[]): any[] {
  return messages.flatMap((message): any[] => {
    if (message.role === 'user') {
      return [{ role: 'user', content: message.content || '' }];
    }

    if (message.role === 'assistant') {
      const toolCalls = message.toolCalls
        ? message.toolCalls.map(call => ({
            id: call.id,
            type: 'function',
            function: {
              name: call.name,
              arguments: JSON.stringify(call.arguments),
            },
          }))
        : undefined;

      return [{
        role: 'assistant',
        ...(message.content ? { content: message.content } : {}),
        ...(message.reasoningContent ? { reasoning_content: message.reasoningContent } : {}),
        ...(toolCalls ? { tool_calls: toolCalls } : {}),
      }];
    }

    if (message.role === 'tool') {
      // OpenAI: each tool result is a separate message with role: 'tool'
      return (message.toolResults ?? []).map(outcome => ({
        role: 'tool',
        tool_call_id: outcome.toolCallId,
        content: outcome.content,
      }));
    }

    return [];
  });
}
|
|
614
|
+
|
|
615
|
+
// ---------------------------------------------------------------------------
|
|
616
|
+
// Gemini
|
|
617
|
+
// ---------------------------------------------------------------------------
|
|
618
|
+
|
|
619
|
+
/**
 * Send one `generateContent` request to the Gemini API and normalize the
 * reply into an LLMResponse.
 *
 * - `req.cacheablePrefix` is prepended to the system prompt and sent as
 *   `system_instruction`.
 * - `req.messages` (multi-turn) takes precedence over `req.userMessage`.
 * - This function always makes a non-streaming request.
 * - Gemini function calls are given a locally generated `gemini-…` id.
 * - The API key travels in the `x-goog-api-key` header rather than the URL
 *   query string, so it does not show up wherever request URLs are logged.
 *
 * @param apiKey Google API key.
 * @param model  Model resource path as it appears in the URL (e.g.
 *               `models/gemini-2.5-flash`).
 * @param req    Provider-independent request.
 * @returns Text content, token usage, latency, and optional tool calls.
 * @throws LlmCallError when the HTTP response is not OK; severity is
 *         derived from the status code.
 */
async function callGemini(
  apiKey: string,
  model: string,
  req: LLMRequest,
): Promise<LLMResponse> {
  const start = Date.now();

  const systemText = req.cacheablePrefix
    ? `${req.cacheablePrefix}\n\n${req.systemPrompt}`
    : req.systemPrompt;

  // Gemini tool use: function_declarations
  const tools = req.tools ? [{
    function_declarations: req.tools.map(t => ({
      name: t.name,
      description: t.description,
      parameters: t.parameters,
    })),
  }] : undefined;

  // Messages — multi-turn or single-turn
  const contents: any[] = req.messages
    ? geminiMessages(req.messages)
    : [{ role: 'user', parts: [{ text: req.userMessage || '' }] }];

  const body: any = {
    system_instruction: { parts: [{ text: systemText }] },
    contents,
    generationConfig: {
      maxOutputTokens: req.maxOutputTokens ?? 8192,
      ...(req.temperature !== undefined ? { temperature: req.temperature } : {}),
    },
    ...(tools ? { tools } : {}),
  };

  const resp = await fetch(
    `https://generativelanguage.googleapis.com/v1beta/${model}:generateContent`,
    {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        // Credential goes in a header, never in the URL.
        'x-goog-api-key': apiKey,
      },
      body: JSON.stringify(body),
    },
  );

  if (!resp.ok) {
    const text = await resp.text();
    throw new LlmCallError(
      `Gemini API ${resp.status}: ${text.substring(0, 500)}`,
      { provider: 'google', model, status: resp.status, severity: severityForStatus(resp.status) },
    );
  }

  const data: any = await resp.json();
  const usage = data.usageMetadata || {};

  let content = '';
  const toolCalls: ToolCall[] = [];

  // Only the first candidate is read.
  const parts = data.candidates?.[0]?.content?.parts || [];
  for (const part of parts) {
    if (part.text) {
      content += part.text;
    } else if (part.functionCall) {
      toolCalls.push({
        // A call id is synthesized here so each tool call still has one.
        id: `gemini-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
        name: part.functionCall.name,
        arguments: part.functionCall.args || {},
      });
    }
  }

  return {
    content,
    model,
    provider: 'google',
    inputTokens: usage.promptTokenCount || 0,
    outputTokens: usage.candidatesTokenCount || 0,
    latencyMs: Date.now() - start,
    ...(toolCalls.length > 0 ? { toolCalls } : {}),
  };
}
|
|
703
|
+
|
|
704
|
+
/**
 * Convert abstract LLMMessage[] to Gemini contents format.
 *
 * - `user` messages become a `user` entry with a single text part.
 * - `assistant` messages become a `model` entry holding the optional text
 *   part followed by one `functionCall` part per tool call.
 * - `tool` messages become a `function` entry with one `functionResponse`
 *   part per tool result.
 *
 * Gemini identifies a function response by the function's *name*, while
 * tool results here only carry the `toolCallId`. The id-to-name mapping is
 * therefore collected from the assistant tool calls seen earlier in the same
 * message list; when no matching call is found the id itself is sent, which
 * is what this function always did before.
 *
 * @param messages Conversation history in provider-neutral form.
 * @returns The array to send as `contents`; messages with any other role are dropped.
 */
function geminiMessages(messages: LLMMessage[]): any[] {
  const result: any[] = [];
  // toolCallId -> function name, filled in as assistant turns are visited.
  const callNames = new Map<string, string>();

  for (const msg of messages) {
    if (msg.role === 'user') {
      result.push({ role: 'user', parts: [{ text: msg.content || '' }] });
    } else if (msg.role === 'assistant') {
      const parts: any[] = [];
      if (msg.content) parts.push({ text: msg.content });
      if (msg.toolCalls) {
        for (const tc of msg.toolCalls) {
          if (tc.id) callNames.set(tc.id, tc.name);
          parts.push({
            functionCall: { name: tc.name, args: tc.arguments },
          });
        }
      }
      result.push({ role: 'model', parts });
    } else if (msg.role === 'tool') {
      // Gemini: functionResponse parts
      const parts: any[] = [];
      if (msg.toolResults) {
        for (const tr of msg.toolResults) {
          parts.push({
            functionResponse: {
              // Gemini uses name, not id — resolve it from the originating call.
              name: callNames.get(tr.toolCallId) ?? tr.toolCallId,
              response: { content: tr.content },
            },
          });
        }
      }
      result.push({ role: 'function', parts });
    }
  }

  return result;
}
|
|
741
|
+
|
|
742
|
+
// ---------------------------------------------------------------------------
|
|
743
|
+
// Unified router
|
|
744
|
+
// ---------------------------------------------------------------------------
|
|
745
|
+
|
|
746
|
+
/** Number of extra attempts made on the requested model after the first one fails. */
const MAX_RETRIES = 2;

/** HTTP statuses that are worth another attempt on the same model. */
const RETRYABLE_STATUS_CODES: Set<number> = new Set<number>([
  429,
  500,
  502,
  503,
  504,
  529,
]);

// Spec 13 — per-call timeout and per-turn wall-clock cap.
//
// The per-call limit is deliberately generous: reasoning models (GLM-5.x,
// o-series, R1) routinely spend 60–180s on hidden chain-of-thought before the
// first visible byte arrives. Non-reasoning models still answer in seconds,
// so a high ceiling only removes false-positive timeouts; it does not slow
// anything down.
const LLM_TIMEOUT_MS = 4 * 60 * 1000;
const TURN_WALL_CLOCK_MS = 10 * 60 * 1000;
|
|
756
|
+
|
|
757
|
+
/**
 * Settle with the outcome of `p`, unless `ms` milliseconds pass first.
 *
 * @param p     The work being awaited. It is not cancelled on timeout; only
 *              the returned promise stops waiting for it.
 * @param ms    Time allowed, in milliseconds.
 * @param label Prefix for the timeout error message (`<label> timeout after <ms>ms`).
 * @returns A promise that mirrors `p`, or rejects with an `Error` on timeout.
 */
function withTimeout<T>(p: Promise<T>, ms: number, label: string): Promise<T> {
  let pending: ReturnType<typeof setTimeout> | undefined;

  const expiry = new Promise<never>((_resolve, reject) => {
    pending = setTimeout(() => {
      reject(new Error(`${label} timeout after ${ms}ms`));
    }, ms);
  });

  // Whichever side settles first wins; the timer is always released afterwards.
  return Promise.race([p, expiry]).finally(() => {
    clearTimeout(pending);
  });
}
|
|
764
|
+
|
|
765
|
+
/**
 * Pull a retry-after hint out of an error message.
 *
 * Looks for "retry after", "retry-after" or "retryafter" (case-insensitive)
 * and takes the first run of digits that follows it as a number of seconds.
 *
 * @param msg Error message text to scan.
 * @returns The hinted delay in milliseconds, or `null` when no hint is found.
 */
function parseRetryAfter(msg: string): number | null {
  const hint = /retry[- ]?after[^0-9]*(\d+)/i.exec(msg);
  return hint === null ? null : parseInt(hint[1], 10) * 1000;
}
|
|
770
|
+
|
|
771
|
+
/**
 * Substitute models, keyed by the model that was originally requested.
 *
 * `callLLM` walks a chain in order once its attempts on the requested model
 * have ended in failure (for example when the model is overloaded with a 529
 * or rate-limited with a 429). Entries whose provider has no API key are
 * skipped there. Models without an entry have no fallback.
 */
const FALLBACK_CHAINS: Record<string, { provider: ProviderId; model: string }[]> = (() => {
  // Small local factory so each chain reads as a list of provider/model pairs.
  const via = (provider: ProviderId, model: string): { provider: ProviderId; model: string } => ({
    provider,
    model,
  });

  return {
    'claude-opus-4-20250514': [
      via('anthropic', 'claude-sonnet-4-5-20250929'),
      via('openai', 'gpt-5.4'),
    ],
    'claude-sonnet-4-5-20250929': [
      via('openai', 'gpt-5.4'),
      via('anthropic', 'claude-haiku-4-5-20251001'),
    ],
    'gpt-5.4': [
      via('anthropic', 'claude-sonnet-4-5-20250929'),
      via('openai', 'gpt-5.4-mini'),
    ],
  };
})();
|
|
789
|
+
|
|
790
|
+
/**
 * Unified entry point: run one LLM request against the requested provider,
 * retrying transient failures and then falling back to substitute models.
 *
 * Flow:
 * 1. Resolve provider (default `anthropic`) and model, and require an API key
 *    for every provider except `ollama`.
 * 2. Try the requested model up to `MAX_RETRIES + 1` times. Each attempt first
 *    reserves capacity with the rate limiter (when one is configured) and is
 *    bounded by `LLM_TIMEOUT_MS`. A failure is retried only when its status is
 *    in `RETRYABLE_STATUS_CODES` or it was a timeout, and only while the turn
 *    deadline has not passed. The wait is the retry-after hint from the error
 *    message if present, otherwise exponential backoff (1s, 2s, … capped at
 *    8s), and never extends beyond the turn deadline.
 * 3. After the primary attempts end in failure, walk `FALLBACK_CHAINS[model]`
 *    in order, skipping providers without a key. The first success is returned
 *    with `wasFallback = true` and `requestedModel` set to the original model.
 *
 * NOTE(review): fallback calls are not passed through the rate limiter and do
 * not check the turn deadline — confirm that is intended.
 *
 * @param req Provider-neutral request.
 * @returns The response of the first call that succeeds.
 * @throws LlmCallError (severity `fatal`) when no API key is configured for
 *         the requested provider.
 * @throws The last error raised by the requested model when every retry and
 *         fallback has failed.
 */
export async function callLLM(req: LLMRequest): Promise<LLMResponse> {
  const provider = req.provider || 'anthropic';
  const model = req.model || DEFAULT_MODELS[provider] || 'claude-sonnet-4-5-20250929';

  const apiKey = getApiKey(provider);
  if (!apiKey && provider !== 'ollama') {
    const envVars: Record<string, string> = {
      'anthropic': 'ANTHROPIC_API_KEY',
      'openai': 'OPENAI_API_KEY',
      'deepseek': 'DEEPSEEK_API_KEY',
      'xai': 'XAI_API_KEY',
      'zai': 'ZAI_API_KEY',
      'google': 'GOOGLE_API_KEY',
      'nvidia-router': 'NVIDIA_API_KEY',
    };
    throw new LlmCallError(
      `No API key for "${provider}". Set ${envVars[provider] || 'API_KEY'} in environment or .env file.`,
      { provider, model, severity: 'fatal' },
    );
  }

  // Try the requested model first
  let lastError: Error | null = null;

  const limiter = getRateLimiter();
  // Rough request size for the limiter: prompt text plus the output budget.
  const estTokens = estimateTokens(
    (req.systemPrompt || '') +
    (req.userMessage || '') +
    (req.messages?.map(m => m.content || '').join('\n') || ''),
  ) + (req.maxOutputTokens || 2048);

  const turnDeadline = Date.now() + TURN_WALL_CLOCK_MS;
  for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
    try {
      if (limiter) {
        try { await limiter.acquire(provider, estTokens); }
        catch (e) {
          if (e instanceof RateLimitOverflowError) {
            // Surface as retryable error so the existing fallback kicks in.
            throw new LlmCallError(
              `API 503: ${e.message}`,
              { provider, model, status: 503, severity: 'recoverable', cause: e },
            );
          }
          throw e;
        }
      }
      const response = await withTimeout(callProvider(provider, apiKey, model, req), LLM_TIMEOUT_MS, `${provider}/${model}`);
      if (limiter) {
        limiter.recordResponse(provider, response.inputTokens, response.outputTokens, estTokens, response.responseHeaders);
      }
      return response;
    } catch (error) {
      // Anything can be thrown; normalise to an Error so the `.message`
      // inspection below cannot itself throw and mask the real failure.
      const failure = error instanceof Error ? error : new Error(String(error));
      lastError = failure;
      // Prefer the typed status from LlmCallError; fall back to parsing
      // the message for legacy errors (anything thrown as bare Error).
      let statusCode = 0;
      if (failure instanceof LlmCallError && typeof failure.status === 'number') {
        statusCode = failure.status;
      } else {
        const statusMatch = failure.message.match(/API (\d+):/);
        if (statusMatch) statusCode = parseInt(statusMatch[1], 10);
      }
      const retryAfter = parseRetryAfter(failure.message);
      const isTimeout = /timeout after/.test(failure.message);
      const retryable = RETRYABLE_STATUS_CODES.has(statusCode) || isTimeout;
      if (limiter && (statusCode === 429 || statusCode === 503)) {
        limiter.recordThrottle(provider, retryAfter ?? 5000);
      }

      if (attempt < MAX_RETRIES && retryable && Date.now() < turnDeadline) {
        const baseDelay = retryAfter ?? Math.min(1000 * Math.pow(2, attempt), 8_000);
        // Never sleep past the turn deadline.
        const delay = Math.min(baseDelay, Math.max(0, turnDeadline - Date.now()));
        // Do NOT write to stderr here — the Rust TUI renders to stderr and
        // any stray bytes corrupt the frame. Details land in backend.log.
        await new Promise(resolve => setTimeout(resolve, delay));
        continue;
      }

      // Not retryable — break to fallback
      break;
    }
  }

  // Try fallback models
  const fallbacks = FALLBACK_CHAINS[model] || [];
  for (const fb of fallbacks) {
    const fbKey = getApiKey(fb.provider);
    if (!fbKey && fb.provider !== 'ollama') continue;

    try {
      const fbReq = { ...req, provider: fb.provider, model: fb.model };
      // Wrap fallback in the same timeout as the primary call — otherwise
      // a hung fallback provider can block the turn forever.
      const fbResponse = await withTimeout(
        callProvider(fb.provider, fbKey, fb.model, fbReq),
        LLM_TIMEOUT_MS,
        `${fb.provider}/${fb.model}`,
      );
      fbResponse.wasFallback = true;
      fbResponse.requestedModel = model;
      return fbResponse;
    } catch {
      // Fallback also failed — try next. The error reported to the caller
      // stays the one from the requested model.
      continue;
    }
  }

  // All retries and fallbacks exhausted
  throw lastError ?? new Error('All retry attempts and fallbacks exhausted');
}
|
|
901
|
+
|
|
902
|
+
/**
 * Dispatch a request to the client function for `provider`.
 *
 * Anthropic and Google have their own client functions. Every other
 * supported provider goes through `callOpenAICompatible`, differing only in
 * base URL and, for Ollama, in the fixed placeholder key.
 *
 * @param provider Target provider id.
 * @param apiKey   Key for that provider; ignored for `ollama`.
 * @param model    Model identifier passed through to the client function.
 * @param req      Provider-neutral request.
 * @returns The pending provider call.
 * @throws LlmCallError (severity `fatal`), synchronously, for an unknown provider.
 */
function callProvider(
  provider: ProviderId,
  apiKey: string | undefined,
  model: string,
  req: LLMRequest,
): Promise<LLMResponse> {
  // Providers with their own client function.
  if (provider === 'anthropic') {
    return callAnthropic(apiKey!, model, req);
  }
  if (provider === 'google') {
    return callGemini(apiKey!, model, req);
  }

  // The remaining providers share one client; work out where to send the
  // request and which key to present.
  let baseUrl: string;
  let key = apiKey!;

  if (provider === 'openai') {
    baseUrl = 'https://api.openai.com/v1';
  } else if (provider === 'deepseek') {
    baseUrl = 'https://api.deepseek.com/v1';
  } else if (provider === 'xai') {
    baseUrl = 'https://api.x.ai/v1';
  } else if (provider === 'zai') {
    // Coding Plan endpoint — required for GLM Coding Plan subscriptions.
    // The general-purpose /api/paas/v4 endpoint returns 1113 for coding-plan keys.
    baseUrl = 'https://api.z.ai/api/coding/paas/v4';
  } else if (provider === 'nvidia-router') {
    // Read at call time so the environment variable can override the local default.
    baseUrl = process.env.NVIDIA_ROUTER_URL || 'http://localhost:8001/v1';
  } else if (provider === 'ollama') {
    baseUrl = 'http://localhost:11434/v1';
    key = 'ollama';
  } else {
    throw new LlmCallError(
      `Unknown provider "${provider}". Supported: anthropic, openai, deepseek, xai, zai, google, ollama, nvidia-router`,
      { provider, model, severity: 'fatal' },
    );
  }

  return callOpenAICompatible(baseUrl, key, model, provider, req);
}
|