@thispointon/kondi-chat 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +556 -0
  3. package/bin/kondi-chat +56 -0
  4. package/bin/kondi-chat.js +72 -0
  5. package/package.json +55 -0
  6. package/scripts/demo.tape +49 -0
  7. package/scripts/postinstall.cjs +103 -0
  8. package/src/audit/analytics.ts +261 -0
  9. package/src/audit/ledger.ts +253 -0
  10. package/src/audit/telemetry.ts +165 -0
  11. package/src/cli/backend.ts +675 -0
  12. package/src/cli/commands.ts +419 -0
  13. package/src/cli/help.ts +182 -0
  14. package/src/cli/submit-helpers.ts +159 -0
  15. package/src/cli/submit.ts +539 -0
  16. package/src/cli/wizard.ts +121 -0
  17. package/src/context/bootstrap.ts +138 -0
  18. package/src/context/budget.ts +100 -0
  19. package/src/context/manager.ts +666 -0
  20. package/src/context/memory.ts +160 -0
  21. package/src/context/preflight.ts +176 -0
  22. package/src/context/project-brain.ts +101 -0
  23. package/src/context/receipts.ts +108 -0
  24. package/src/context/skills.ts +154 -0
  25. package/src/context/symbol-index.ts +240 -0
  26. package/src/council/profiles.ts +137 -0
  27. package/src/council/tool.ts +138 -0
  28. package/src/council-engine/cli/council-artifacts.ts +230 -0
  29. package/src/council-engine/cli/council-config.ts +178 -0
  30. package/src/council-engine/cli/council-session-export.ts +116 -0
  31. package/src/council-engine/cli/kondi.ts +98 -0
  32. package/src/council-engine/cli/llm-caller.ts +229 -0
  33. package/src/council-engine/cli/localStorage-shim.ts +119 -0
  34. package/src/council-engine/cli/node-platform.ts +68 -0
  35. package/src/council-engine/cli/run-council.ts +481 -0
  36. package/src/council-engine/cli/run-pipeline.ts +772 -0
  37. package/src/council-engine/cli/session-export.ts +153 -0
  38. package/src/council-engine/configs/councils/analysis.json +101 -0
  39. package/src/council-engine/configs/councils/code-planning.json +86 -0
  40. package/src/council-engine/configs/councils/coding.json +89 -0
  41. package/src/council-engine/configs/councils/debate.json +97 -0
  42. package/src/council-engine/configs/councils/solo-claude.json +34 -0
  43. package/src/council-engine/configs/councils/solo-gpt.json +34 -0
  44. package/src/council-engine/council/coding-orchestrator.ts +1205 -0
  45. package/src/council-engine/council/context-bootstrap.ts +147 -0
  46. package/src/council-engine/council/context-inspection.ts +42 -0
  47. package/src/council-engine/council/context-store.ts +763 -0
  48. package/src/council-engine/council/deliberation-orchestrator.ts +2762 -0
  49. package/src/council-engine/council/factory.ts +164 -0
  50. package/src/council-engine/council/index.ts +201 -0
  51. package/src/council-engine/council/ledger-store.ts +438 -0
  52. package/src/council-engine/council/prompts.ts +1689 -0
  53. package/src/council-engine/council/storage-cleanup.ts +164 -0
  54. package/src/council-engine/council/store.ts +1110 -0
  55. package/src/council-engine/council/synthesis.ts +291 -0
  56. package/src/council-engine/council/types.ts +845 -0
  57. package/src/council-engine/council/validation.ts +613 -0
  58. package/src/council-engine/pipeline/build-detect.ts +73 -0
  59. package/src/council-engine/pipeline/executor.ts +1048 -0
  60. package/src/council-engine/pipeline/index.ts +9 -0
  61. package/src/council-engine/pipeline/install-detect.ts +84 -0
  62. package/src/council-engine/pipeline/memory-store.ts +182 -0
  63. package/src/council-engine/pipeline/output-parsers.ts +146 -0
  64. package/src/council-engine/pipeline/run-output.ts +149 -0
  65. package/src/council-engine/pipeline/session-import.ts +177 -0
  66. package/src/council-engine/pipeline/store.ts +753 -0
  67. package/src/council-engine/pipeline/test-detect.ts +82 -0
  68. package/src/council-engine/pipeline/types.ts +401 -0
  69. package/src/council-engine/services/deliberationSummary.ts +114 -0
  70. package/src/council-engine/tsconfig.json +16 -0
  71. package/src/council-engine/types/mcp.ts +122 -0
  72. package/src/council-engine/utils/filterTools.ts +73 -0
  73. package/src/engine/apply.ts +238 -0
  74. package/src/engine/checkpoints.ts +237 -0
  75. package/src/engine/consultants.ts +347 -0
  76. package/src/engine/diff.ts +171 -0
  77. package/src/engine/errors.ts +102 -0
  78. package/src/engine/git-tools.ts +246 -0
  79. package/src/engine/hooks.ts +181 -0
  80. package/src/engine/loop-guard.ts +155 -0
  81. package/src/engine/permissions.ts +293 -0
  82. package/src/engine/pipeline.ts +376 -0
  83. package/src/engine/sub-agents.ts +133 -0
  84. package/src/engine/task-card.ts +185 -0
  85. package/src/engine/task-router.ts +256 -0
  86. package/src/engine/task-store.ts +86 -0
  87. package/src/engine/tools.ts +783 -0
  88. package/src/engine/verify.ts +111 -0
  89. package/src/mcp/client.ts +225 -0
  90. package/src/mcp/config.ts +120 -0
  91. package/src/mcp/tool-manager.ts +192 -0
  92. package/src/mcp/types.ts +61 -0
  93. package/src/providers/llm-caller.ts +943 -0
  94. package/src/providers/rate-limiter.ts +238 -0
  95. package/src/router/NOTES.md +28 -0
  96. package/src/router/collector.ts +474 -0
  97. package/src/router/embeddings.ts +286 -0
  98. package/src/router/index.ts +299 -0
  99. package/src/router/intent-router.ts +225 -0
  100. package/src/router/nn-router.ts +205 -0
  101. package/src/router/profiles.ts +309 -0
  102. package/src/router/registry.ts +565 -0
  103. package/src/router/rules.ts +274 -0
  104. package/src/router/train.py +408 -0
  105. package/src/session/store.ts +211 -0
  106. package/src/test-utils/mock-llm.ts +39 -0
  107. package/src/types.ts +322 -0
  108. package/src/web/manager.ts +311 -0
@@ -0,0 +1,943 @@
1
+ /**
2
+ * Multi-provider LLM caller — direct HTTP, no SDKs.
3
+ *
4
+ * Supports:
5
+ * - Simple single-turn (systemPrompt + userMessage)
6
+ * - Multi-turn with tool use (messages array + tools)
7
+ * - Anthropic, OpenAI-compatible, and Gemini providers
8
+ */
9
+
10
+ import type {
11
+ ProviderId, LLMRequest, LLMResponse,
12
+ ToolDefinition, ToolCall, LLMMessage,
13
+ } from '../types.ts';
14
+ import { getRateLimiter, RateLimitOverflowError } from './rate-limiter.ts';
15
+ import { estimateTokens } from '../context/budget.ts';
16
+ import { LlmCallError, type ErrorSeverity } from '../engine/errors.ts';
17
+
18
/**
 * Map an HTTP status code onto an error severity.
 *
 * Status 429 and every status in the 500–599 range are reported as
 * `'recoverable'`; any other status is reported as `'fatal'`.
 *
 * @param status HTTP status code of the failed response.
 * @returns `'recoverable'` or `'fatal'`.
 */
function severityForStatus(status: number): ErrorSeverity {
  const isThrottled = status === 429;
  const isServerError = status >= 500 && status < 600;
  return isThrottled || isServerError ? 'recoverable' : 'fatal';
}
28
+
29
+ // ---------------------------------------------------------------------------
30
+ // SSE stream parser
31
+ // ---------------------------------------------------------------------------
32
+
33
/**
 * Incrementally parse a Server-Sent Events response body.
 *
 * Lines are split on `\n` and trimmed. `event:` lines set the type of the
 * event being built, `data:` lines are collected (the `[DONE]` sentinel is
 * skipped), and a blank line ends the event. Every event that collected at
 * least one data line is yielded as `{ type, data }`, where `data` is the
 * joined data lines parsed as JSON, or the raw joined text when that is not
 * valid JSON. All other lines (comments, `id:`, `retry:`, …) are ignored.
 *
 * At end of stream the decoder is flushed and a final line that has no
 * trailing newline is still processed, so a last `data:` line is not lost
 * when the server closes the connection without a terminating newline.
 *
 * @param resp Response whose body is an SSE stream. Nothing is yielded when
 *             the response has no body.
 */
async function* parseSSE(resp: Response): AsyncGenerator<{ type?: string; data?: any }> {
  const reader = resp.body?.getReader();
  if (!reader) return;

  const decoder = new TextDecoder();
  let buffer = '';
  let eventType: string | undefined;
  let dataLines: string[] = [];

  // Build the event for the data lines collected so far.
  const buildEvent = (): { type?: string; data?: any } => {
    const joined = dataLines.join('\n');
    let parsed: any;
    try {
      parsed = JSON.parse(joined);
    } catch {
      parsed = joined; // not JSON: hand the raw text to the caller
    }
    return { type: eventType, data: parsed };
  };

  // Feed one line into the parser state; returns an event when the line
  // terminates one that carried data.
  const consumeLine = (line: string): { type?: string; data?: any } | undefined => {
    const trimmed = line.trim();
    if (trimmed === '') {
      // Blank line = end of SSE event
      const event = dataLines.length > 0 ? buildEvent() : undefined;
      eventType = undefined;
      dataLines = [];
      return event;
    }
    if (trimmed.startsWith('event:') || trimmed.startsWith('event :')) {
      eventType = trimmed.replace(/^event\s*:\s*/, '');
    } else if (trimmed.startsWith('data:') || trimmed.startsWith('data :')) {
      const raw = trimmed.replace(/^data\s*:\s*/, '');
      if (raw !== '[DONE]') dataLines.push(raw);
    }
    return undefined;
  };

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split('\n');
      // The last element is an incomplete line; keep it for the next chunk.
      buffer = lines.pop() ?? '';

      for (const line of lines) {
        const event = consumeLine(line);
        if (event) yield event;
      }
    }

    // End of stream: flush bytes still held by the decoder, then process the
    // final line even though it was not newline-terminated.
    buffer += decoder.decode();
    if (buffer !== '') {
      const event = consumeLine(buffer);
      buffer = '';
      if (event) yield event;
    }

    // Emit an event that was never closed by a blank line.
    if (dataLines.length > 0) {
      yield buildEvent();
    }
  } finally {
    reader.releaseLock();
  }
}
93
+
94
+ // ---------------------------------------------------------------------------
95
+ // DeepSeek JSON sanitizer
96
+ // ---------------------------------------------------------------------------
97
+
98
/**
 * Rewrite an already-serialized JSON request body before it is sent to
 * DeepSeek.
 *
 * Two textual substitutions are applied to the JSON string:
 *
 * 1. Every `\\x` (an escaped backslash followed by `x`) becomes `\\\\x`,
 *    i.e. the backslash is escaped one more time.
 * 2. Raw control characters below 0x20, other than tab, line feed and
 *    carriage return, are removed.
 *
 * Author's rationale (not verifiable from this file): DeepSeek's endpoint
 * appears to parse string values a second time and to treat `\x` as a hex
 * escape, so content containing sequences such as `\x1b[31m` (ANSI codes)
 * is rejected unless the backslash is double-escaped. Step 2 is a
 * belt-and-suspenders measure; `JSON.stringify` output should not contain
 * raw control characters.
 *
 * @param json Serialized JSON text.
 * @returns The rewritten JSON text.
 */
function sanitizeJsonForDeepSeek(json: string): string {
  // \\x (escaped backslash + x) → \\\\x (double-escaped)
  const hexEscaped = json.replace(/\\\\x/g, '\\\\\\\\x');
  // Strip bare control characters, keeping \t (0x09), \n (0x0a) and \r (0x0d).
  const withoutControlChars = hexEscaped.replace(/[\x00-\x08\x0b\x0c\x0e-\x1f]/g, '');
  return withoutControlChars;
}
121
+
122
+ // ---------------------------------------------------------------------------
123
+ // Default models per provider
124
+ // ---------------------------------------------------------------------------
125
+
126
/**
 * Model id to use for each provider when a request does not name a model.
 * Keys are provider ids; values are passed to the provider unchanged.
 */
export const DEFAULT_MODELS: Record<ProviderId, string> = {
  anthropic: 'claude-sonnet-4-5-20250929',
  openai: 'gpt-4o',
  deepseek: 'deepseek-chat',
  google: 'models/gemini-2.5-flash',
  xai: 'grok-3',
  zai: 'glm-4.6',
  ollama: 'llama3.1',
  'nvidia-router': 'auto',
};
136
+
137
+ // ---------------------------------------------------------------------------
138
+ // API key resolution
139
+ // ---------------------------------------------------------------------------
140
+
141
/** Name of the environment variable that holds each provider's API key. */
const API_KEY_ENV_VAR_BY_PROVIDER = new Map<string, string>([
  ['anthropic', 'ANTHROPIC_API_KEY'],
  ['openai', 'OPENAI_API_KEY'],
  ['deepseek', 'DEEPSEEK_API_KEY'],
  ['xai', 'XAI_API_KEY'],
  ['zai', 'ZAI_API_KEY'],
  ['google', 'GOOGLE_API_KEY'],
  ['nvidia-router', 'NVIDIA_API_KEY'],
]);

/**
 * Look up the API key for a provider in the process environment.
 *
 * The environment is read on every call, so changes to `process.env` are
 * picked up immediately.
 *
 * @param provider Provider id.
 * @returns The key, or `undefined` when the variable is unset or the
 *          provider has no associated variable (e.g. `ollama`).
 */
function getApiKey(provider: ProviderId): string | undefined {
  const envVar = API_KEY_ENV_VAR_BY_PROVIDER.get(provider);
  return envVar === undefined ? undefined : process.env[envVar];
}
153
+
154
+ // ---------------------------------------------------------------------------
155
+ // Anthropic
156
+ // ---------------------------------------------------------------------------
157
+
158
/**
 * Call the Anthropic Messages API over plain HTTP and normalise the reply.
 *
 * The system prompt is sent as content blocks; when `req.cacheablePrefix`
 * is set it becomes the first block and is tagged
 * `cache_control: { type: 'ephemeral' }`. Conversation turns come from
 * `req.messages` (converted by `anthropicMessages`) or, failing that, from
 * the single `req.userMessage`.
 *
 * Streaming is requested only when the caller set `req.stream` AND supplied
 * `req.onToken`. Asking the API for a stream without a token callback would
 * send an SSE body into the JSON parser of the non-streaming path, so in
 * that case a regular JSON response is requested instead.
 *
 * @param apiKey Value sent in the `x-api-key` header.
 * @param model  Anthropic model id; echoed back in the result.
 * @param req    Provider-agnostic request.
 * @returns Text content, token usage, latency, cache flag and, when
 *          present, tool calls and thinking text.
 * @throws LlmCallError when the HTTP response status is not OK.
 */
async function callAnthropic(
  apiKey: string,
  model: string,
  req: LLMRequest,
): Promise<LLMResponse> {
  const start = Date.now();

  // Keep the request flag and the response-handling path in sync.
  const streaming = Boolean(req.stream && req.onToken);

  const systemContent: Array<{ type: string; text: string; cache_control?: { type: string } }> = [];
  if (req.cacheablePrefix) {
    systemContent.push({
      type: 'text',
      text: req.cacheablePrefix,
      cache_control: { type: 'ephemeral' },
    });
  }
  systemContent.push({ type: 'text', text: req.systemPrompt });

  const messages: any[] = req.messages
    ? anthropicMessages(req.messages)
    : [{ role: 'user', content: req.userMessage || '' }];

  const tools = req.tools?.map(t => ({
    name: t.name,
    description: t.description,
    input_schema: t.parameters,
  }));

  const body: any = {
    model,
    max_tokens: req.maxOutputTokens ?? 8192,
    system: systemContent,
    messages,
    ...(req.temperature !== undefined ? { temperature: req.temperature } : {}),
    ...(tools ? { tools } : {}),
    ...(streaming ? { stream: true } : {}),
  };

  const resp = await fetch('https://api.anthropic.com/v1/messages', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'anthropic-version': '2023-06-01',
      'anthropic-beta': 'prompt-caching-2024-07-31',
      'x-api-key': apiKey,
    },
    body: JSON.stringify(body),
  });

  if (!resp.ok) {
    const text = await resp.text();
    throw new LlmCallError(
      `Anthropic API ${resp.status}: ${text.substring(0, 500)}`,
      { provider: 'anthropic', model, status: resp.status, severity: severityForStatus(resp.status) },
    );
  }

  // Streaming path
  if (streaming) {
    let content = '';
    let reasoningContent = '';
    const toolCalls: ToolCall[] = [];
    let inputTokens = 0;
    let outputTokens = 0;
    let cached = false;
    let cachedInputTokens = 0;

    // State of the tool_use block currently being assembled. Its JSON input
    // arrives as a series of partial strings and is parsed at block stop.
    let currentToolId = '';
    let currentToolName = '';
    let currentToolJson = '';
    let inThinkingBlock = false;

    for await (const event of parseSSE(resp)) {
      try {
        if (event.type === 'message_start') {
          // Input-side usage is reported once, at the start of the message.
          const usage = event.data?.message?.usage;
          if (usage) {
            inputTokens = usage.input_tokens || 0;
            cachedInputTokens = usage.cache_read_input_tokens || 0;
            cached = cachedInputTokens > 0;
          }
        } else if (event.type === 'content_block_start') {
          const block = event.data?.content_block;
          if (block?.type === 'tool_use') {
            currentToolId = block.id;
            currentToolName = block.name;
            currentToolJson = '';
          } else if (block?.type === 'thinking') {
            inThinkingBlock = true;
            if (block.thinking) reasoningContent += block.thinking;
          }
        } else if (event.type === 'content_block_delta') {
          const delta = event.data?.delta;
          if (delta?.type === 'text_delta' && delta.text) {
            content += delta.text;
            req.onToken?.(delta.text);
          } else if (delta?.type === 'input_json_delta' && delta.partial_json) {
            currentToolJson += delta.partial_json;
          } else if (delta?.type === 'thinking_delta' && delta.thinking) {
            reasoningContent += delta.thinking;
          }
        } else if (event.type === 'content_block_stop') {
          if (inThinkingBlock) {
            inThinkingBlock = false;
          }
          if (currentToolId) {
            try {
              toolCalls.push({
                id: currentToolId,
                name: currentToolName,
                arguments: currentToolJson ? JSON.parse(currentToolJson) : {},
              });
            } catch {
              // Unparseable tool input: keep the call, with empty arguments.
              toolCalls.push({ id: currentToolId, name: currentToolName, arguments: {} });
            }
            currentToolId = '';
            currentToolName = '';
            currentToolJson = '';
          }
        } else if (event.type === 'message_delta') {
          // Output token count is reported in message_delta events.
          const usage = event.data?.usage;
          if (usage) outputTokens = usage.output_tokens || 0;
        }
      } catch { /* skip malformed SSE event */ }
    }

    return {
      content, model, provider: 'anthropic',
      inputTokens, outputTokens,
      latencyMs: Date.now() - start, cached,
      ...(cachedInputTokens > 0 ? { cachedInputTokens } : {}),
      ...(toolCalls.length > 0 ? { toolCalls } : {}),
      ...(reasoningContent ? { reasoningContent } : {}),
    };
  }

  // Non-streaming path
  const data: any = await resp.json();
  const usage = data.usage || {};

  let content = '';
  let reasoningContent = '';
  const toolCalls: ToolCall[] = [];

  for (const block of data.content || []) {
    if (block.type === 'text') {
      content += block.text;
    } else if (block.type === 'thinking') {
      reasoningContent += block.thinking || '';
    } else if (block.type === 'tool_use') {
      toolCalls.push({
        id: block.id,
        name: block.name,
        arguments: block.input || {},
      });
    }
  }

  const cacheReadTokens = usage.cache_read_input_tokens || 0;
  return {
    content, model, provider: 'anthropic',
    inputTokens: usage.input_tokens || 0,
    outputTokens: usage.output_tokens || 0,
    latencyMs: Date.now() - start,
    cached: cacheReadTokens > 0,
    ...(cacheReadTokens > 0 ? { cachedInputTokens: usage.cache_read_input_tokens } : {}),
    ...(toolCalls.length > 0 ? { toolCalls } : {}),
    ...(reasoningContent ? { reasoningContent } : {}),
  };
}
330
+
331
/**
 * Convert provider-agnostic `LLMMessage[]` into Anthropic `messages` entries.
 *
 * - `user` turns become `{ role: 'user', content: <string> }`.
 * - `assistant` turns become a content-block array: an optional `text`
 *   block followed by one `tool_use` block per tool call.
 * - `tool` turns become a `user` message made of `tool_result` blocks, which
 *   is how Anthropic expects tool results to be sent.
 *
 * Assistant and tool turns that would produce an empty block array (no
 * text, no tool calls, no tool results) are skipped rather than emitted with
 * `content: []`.
 *
 * @param messages Conversation history in the abstract format.
 * @returns Messages ready for the `messages` field of the request body.
 */
function anthropicMessages(messages: LLMMessage[]): any[] {
  const result: any[] = [];

  for (const msg of messages) {
    if (msg.role === 'user') {
      result.push({ role: 'user', content: msg.content || '' });
    } else if (msg.role === 'assistant') {
      // Assistant message may have text + tool_use blocks
      const blocks: any[] = [];
      if (msg.content) {
        blocks.push({ type: 'text', text: msg.content });
      }
      for (const tc of msg.toolCalls ?? []) {
        blocks.push({
          type: 'tool_use',
          id: tc.id,
          name: tc.name,
          input: tc.arguments,
        });
      }
      // Nothing to send for this turn: skip it instead of emitting [].
      if (blocks.length > 0) {
        result.push({ role: 'assistant', content: blocks });
      }
    } else if (msg.role === 'tool') {
      // Anthropic: tool results are sent as user messages with tool_result content blocks
      const blocks: any[] = [];
      for (const tr of msg.toolResults ?? []) {
        blocks.push({
          type: 'tool_result',
          tool_use_id: tr.toolCallId,
          content: tr.content,
          ...(tr.isError ? { is_error: true } : {}),
        });
      }
      if (blocks.length > 0) {
        result.push({ role: 'user', content: blocks });
      }
    }
  }

  return result;
}
374
+
375
+ // ---------------------------------------------------------------------------
376
+ // OpenAI-compatible (OpenAI, DeepSeek, xAI, NVIDIA router, Ollama)
377
+ // ---------------------------------------------------------------------------
378
+
379
/**
 * Parse the `arguments` payload of an OpenAI-style tool call.
 * Strings are JSON-parsed; empty or malformed strings yield `{}` instead of
 * throwing. Non-string values are passed through (`{}` when absent).
 */
function parseOpenAIToolArguments(raw: unknown): Record<string, unknown> {
  if (typeof raw !== 'string') return (raw ?? {}) as Record<string, unknown>;
  if (raw.trim() === '') return {};
  try {
    return JSON.parse(raw);
  } catch {
    return {};
  }
}

/**
 * Call a Chat Completions endpoint (`POST {baseUrl}/chat/completions`) and
 * normalise the reply.
 *
 * The system message is `req.cacheablePrefix` and `req.systemPrompt` joined
 * by a blank line (or just the system prompt). Conversation turns come from
 * `req.messages` (converted by `openaiMessages`) or from `req.userMessage`.
 * The output-token cap is sent as `max_completion_tokens` for provider
 * `'openai'` and as `max_tokens` for every other provider. For provider
 * `'deepseek'` the serialized body is passed through
 * `sanitizeJsonForDeepSeek` before sending.
 *
 * Streaming is requested only when `req.stream` is set AND `req.onToken` is
 * supplied, so the request flag and the response-handling path always agree.
 *
 * @param baseUrl  Endpoint base, without the `/chat/completions` suffix.
 * @param apiKey   Sent as `Authorization: Bearer <apiKey>`.
 * @param model    Requested model id.
 * @param provider Provider id; selects the token-cap parameter name and the
 *                 DeepSeek sanitizer, and is echoed back in the result.
 * @param req      Provider-agnostic request.
 * @returns Text content, the model reported by the server (or the requested
 *          one), token usage, latency and, when present, tool calls,
 *          reasoning text and cached-token information.
 * @throws LlmCallError when the HTTP response status is not OK.
 */
async function callOpenAICompatible(
  baseUrl: string,
  apiKey: string,
  model: string,
  provider: ProviderId,
  req: LLMRequest,
): Promise<LLMResponse> {
  const start = Date.now();

  // Keep the request flag and the response-handling path in sync.
  const streaming = Boolean(req.stream && req.onToken);

  const systemContent = req.cacheablePrefix
    ? `${req.cacheablePrefix}\n\n${req.systemPrompt}`
    : req.systemPrompt;

  // Messages — multi-turn or single-turn
  const messages: any[] = req.messages
    ? [{ role: 'system', content: systemContent }, ...openaiMessages(req.messages)]
    : [
        { role: 'system', content: systemContent },
        { role: 'user', content: req.userMessage || '' },
      ];

  // Tools
  const tools = req.tools?.map(t => ({
    type: 'function' as const,
    function: {
      name: t.name,
      description: t.description,
      parameters: t.parameters,
    },
  }));

  const max = req.maxOutputTokens ?? 8192;
  const body: any = {
    model,
    messages,
    ...(req.temperature !== undefined ? { temperature: req.temperature } : {}),
    ...(tools ? { tools } : {}),
    ...(streaming ? { stream: true } : {}),
  };

  if (provider === 'openai') {
    body.max_completion_tokens = max; // new OpenAI param
  } else {
    body.max_tokens = max; // legacy / compatible providers
  }

  // DeepSeek gets an extra pass over the serialized body: `\\x` sequences
  // are double-escaped and raw control characters are removed (see
  // sanitizeJsonForDeepSeek for the rationale).
  let bodyJson = JSON.stringify(body);
  if (provider === 'deepseek') {
    bodyJson = sanitizeJsonForDeepSeek(bodyJson);
  }

  const resp = await fetch(`${baseUrl}/chat/completions`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`,
    },
    body: bodyJson,
  });

  if (!resp.ok) {
    const text = await resp.text();
    throw new LlmCallError(
      `${provider} API ${resp.status}: ${text.substring(0, 500)}`,
      { provider, model, status: resp.status, severity: severityForStatus(resp.status) },
    );
  }

  // Streaming path
  if (streaming) {
    let content = '';
    let reasoningContent = '';
    const toolCalls: ToolCall[] = [];
    // One buffer per tool-call index; argument JSON arrives in fragments.
    const toolJsonBuffers: Map<number, { id: string; name: string; json: string }> = new Map();
    let actualModel = model;
    let inputTokens = 0;
    let outputTokens = 0;
    let cachedInputTokens = 0;

    for await (const event of parseSSE(resp)) {
      if (!event.data || event.data === '[DONE]') continue;

      let chunk: any;
      try {
        if (typeof event.data === 'string') {
          const raw = event.data.trim();
          const clean = raw.startsWith('data:') ? raw.slice(5).trim() : raw;
          chunk = JSON.parse(clean);
        } else {
          chunk = event.data;
        }
      } catch {
        continue; // Skip unparseable chunks
      }
      if (chunk.model) actualModel = chunk.model;

      // Usage is read before looking at `choices`: a usage-only chunk may
      // arrive with an empty `choices` array and must not be skipped.
      if (chunk.usage) {
        inputTokens = chunk.usage.prompt_tokens || 0;
        outputTokens = chunk.usage.completion_tokens || 0;
        cachedInputTokens = chunk.usage.prompt_tokens_details?.cached_tokens || 0;
      }

      const delta = chunk.choices?.[0]?.delta;
      if (!delta) continue;

      // Text content
      if (delta.content) {
        content += delta.content;
        req.onToken?.(delta.content);
      }

      // Reasoning text is read from `delta.reasoning_content`, falling back
      // to `delta.reasoning`.
      const reasoning = delta.reasoning_content || delta.reasoning;
      if (reasoning) {
        reasoningContent += reasoning;
      }

      // Tool calls (streamed incrementally)
      if (delta.tool_calls) {
        for (const tc of delta.tool_calls) {
          const idx = tc.index ?? 0;
          let buf = toolJsonBuffers.get(idx);
          if (tc.id && (!buf || buf.id !== tc.id)) {
            // First fragment of a call. A repeated id on later fragments
            // must not reset the arguments accumulated so far.
            buf = { id: tc.id, name: tc.function?.name || '', json: '' };
            toolJsonBuffers.set(idx, buf);
          } else if (buf && !buf.name && tc.function?.name) {
            buf.name = tc.function.name;
          }
          if (buf && tc.function?.arguments) {
            buf.json += tc.function.arguments;
          }
        }
      }
    }

    // Finalize tool calls
    for (const buf of toolJsonBuffers.values()) {
      toolCalls.push({ id: buf.id, name: buf.name, arguments: parseOpenAIToolArguments(buf.json) });
    }

    return {
      content, model: actualModel, provider,
      inputTokens, outputTokens,
      latencyMs: Date.now() - start,
      ...(cachedInputTokens > 0 ? { cachedInputTokens, cached: true } : {}),
      ...(toolCalls.length > 0 ? { toolCalls } : {}),
      ...(reasoningContent ? { reasoningContent } : {}),
    };
  }

  // Non-streaming path
  const data: any = await resp.json();
  const choice = data.choices?.[0]?.message || {};
  const usage = data.usage || {};
  const cachedInputTokensNs = usage.prompt_tokens_details?.cached_tokens || 0;
  const actualModel = data.model || model;

  const toolCalls: ToolCall[] = [];
  for (const tc of choice.tool_calls ?? []) {
    toolCalls.push({
      id: tc.id,
      name: tc.function.name,
      arguments: parseOpenAIToolArguments(tc.function.arguments),
    });
  }

  const reasoning = choice.reasoning_content || choice.reasoning;
  return {
    content: choice.content || '',
    model: actualModel,
    provider,
    inputTokens: usage.prompt_tokens || 0,
    outputTokens: usage.completion_tokens || 0,
    latencyMs: Date.now() - start,
    ...(cachedInputTokensNs > 0 ? { cachedInputTokens: cachedInputTokensNs, cached: true } : {}),
    ...(toolCalls.length > 0 ? { toolCalls } : {}),
    ...(reasoning ? { reasoningContent: reasoning } : {}),
  };
}
574
+
575
/**
 * Convert provider-agnostic `LLMMessage[]` into Chat Completions messages.
 *
 * - `user` turns become `{ role: 'user', content }`.
 * - `assistant` turns carry `content` (when non-empty), `reasoning_content`
 *   (when the turn has reasoning text) and `tool_calls` with arguments
 *   serialized as JSON strings.
 * - `tool` turns are expanded into one `role: 'tool'` message per result.
 *
 * An empty `toolCalls` array is treated like an absent one, so no
 * `tool_calls: []` is emitted. An assistant turn with neither text nor tool
 * calls is sent with `content: ''` so the message always has a `content` or
 * a `tool_calls` field.
 *
 * @param messages Conversation history in the abstract format.
 * @returns Messages to append after the system message.
 */
function openaiMessages(messages: LLMMessage[]): any[] {
  const result: any[] = [];

  for (const msg of messages) {
    if (msg.role === 'user') {
      result.push({ role: 'user', content: msg.content || '' });
    } else if (msg.role === 'assistant') {
      const entry: any = { role: 'assistant' };
      const hasToolCalls = Array.isArray(msg.toolCalls) && msg.toolCalls.length > 0;

      if (msg.content) {
        entry.content = msg.content;
      } else if (!hasToolCalls) {
        // No text and no tool calls: still provide a content field.
        entry.content = '';
      }
      // DeepSeek requires reasoning_content to be passed back in multi-turn.
      if (msg.reasoningContent) entry.reasoning_content = msg.reasoningContent;
      if (hasToolCalls) {
        const calls: any[] = [];
        for (const tc of msg.toolCalls ?? []) {
          calls.push({
            id: tc.id,
            type: 'function',
            function: {
              name: tc.name,
              arguments: JSON.stringify(tc.arguments),
            },
          });
        }
        entry.tool_calls = calls;
      }
      result.push(entry);
    } else if (msg.role === 'tool') {
      // OpenAI: each tool result is a separate message with role: 'tool'
      for (const tr of msg.toolResults ?? []) {
        result.push({
          role: 'tool',
          tool_call_id: tr.toolCallId,
          content: tr.content,
        });
      }
    }
  }

  return result;
}
614
+
615
+ // ---------------------------------------------------------------------------
616
+ // Gemini
617
+ // ---------------------------------------------------------------------------
618
+
619
/**
 * Call the Gemini `generateContent` REST endpoint and normalise the reply.
 *
 * The system instruction is `req.cacheablePrefix` and `req.systemPrompt`
 * joined by a blank line (or just the system prompt). Conversation turns
 * come from `req.messages` (converted by `geminiMessages`) or from
 * `req.userMessage`. Tools are sent as a single `function_declarations`
 * group. This function always performs a non-streaming request.
 *
 * The API key is sent in the `x-goog-api-key` header rather than in the URL
 * query string, so it does not end up in request URLs that get logged.
 * A model id without a resource prefix (no `/`) is sent as `models/<id>`;
 * ids that already contain a `/` (e.g. `models/gemini-2.5-flash`) are used
 * as given.
 *
 * Gemini function calls carry no id, so each returned tool call gets a
 * locally generated `gemini-<timestamp>-<random>` id.
 *
 * @param apiKey Gemini API key.
 * @param model  Model id, with or without the `models/` prefix; echoed back
 *               unchanged in the result.
 * @param req    Provider-agnostic request.
 * @returns Text content, token usage, latency and any tool calls.
 * @throws LlmCallError when the HTTP response status is not OK.
 */
async function callGemini(
  apiKey: string,
  model: string,
  req: LLMRequest,
): Promise<LLMResponse> {
  const start = Date.now();

  const systemText = req.cacheablePrefix
    ? `${req.cacheablePrefix}\n\n${req.systemPrompt}`
    : req.systemPrompt;

  // Gemini tool use: function_declarations
  const tools = req.tools ? [{
    function_declarations: req.tools.map(t => ({
      name: t.name,
      description: t.description,
      parameters: t.parameters,
    })),
  }] : undefined;

  // Messages — multi-turn or single-turn
  const contents: any[] = req.messages
    ? geminiMessages(req.messages)
    : [{ role: 'user', parts: [{ text: req.userMessage || '' }] }];

  const body: any = {
    system_instruction: { parts: [{ text: systemText }] },
    contents,
    generationConfig: {
      maxOutputTokens: req.maxOutputTokens ?? 8192,
      ...(req.temperature !== undefined ? { temperature: req.temperature } : {}),
    },
    ...(tools ? { tools } : {}),
  };

  // The URL path needs the full resource name ("models/<id>").
  const modelPath = model.includes('/') ? model : `models/${model}`;

  const resp = await fetch(
    `https://generativelanguage.googleapis.com/v1beta/${modelPath}:generateContent`,
    {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        'x-goog-api-key': apiKey,
      },
      body: JSON.stringify(body),
    },
  );

  if (!resp.ok) {
    const text = await resp.text();
    throw new LlmCallError(
      `Gemini API ${resp.status}: ${text.substring(0, 500)}`,
      { provider: 'google', model, status: resp.status, severity: severityForStatus(resp.status) },
    );
  }

  const data: any = await resp.json();
  const usage = data.usageMetadata || {};

  let content = '';
  const toolCalls: ToolCall[] = [];

  // Only the first candidate is used.
  const parts = data.candidates?.[0]?.content?.parts || [];
  for (const part of parts) {
    if (part.text) {
      content += part.text;
    } else if (part.functionCall) {
      toolCalls.push({
        id: `gemini-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`,
        name: part.functionCall.name,
        arguments: part.functionCall.args || {},
      });
    }
  }

  return {
    content,
    model,
    provider: 'google',
    inputTokens: usage.promptTokenCount || 0,
    outputTokens: usage.candidatesTokenCount || 0,
    latencyMs: Date.now() - start,
    ...(toolCalls.length > 0 ? { toolCalls } : {}),
  };
}
703
+
704
/**
 * Convert provider-agnostic `LLMMessage[]` into Gemini `contents` entries.
 *
 * - `user` turns become `{ role: 'user', parts: [{ text }] }`.
 * - `assistant` turns become `role: 'model'` with an optional text part and
 *   one `functionCall` part per tool call.
 * - `tool` turns become `role: 'function'` with one `functionResponse` part
 *   per result.
 *
 * Gemini matches a `functionResponse` to its call by function NAME, while
 * tool results in the abstract format only carry the tool-call id. The name
 * is therefore looked up from the tool calls of the assistant turns seen
 * earlier in `messages`; when the id is unknown the id itself is used as
 * the name.
 *
 * @param messages Conversation history in the abstract format.
 * @returns Entries for the `contents` field of the request body.
 */
function geminiMessages(messages: LLMMessage[]): any[] {
  const result: any[] = [];
  // tool-call id → function name, filled in as assistant turns are visited.
  const toolNameById = new Map<string, string>();

  for (const msg of messages) {
    if (msg.role === 'user') {
      result.push({ role: 'user', parts: [{ text: msg.content || '' }] });
    } else if (msg.role === 'assistant') {
      const parts: any[] = [];
      if (msg.content) parts.push({ text: msg.content });
      for (const tc of msg.toolCalls ?? []) {
        toolNameById.set(tc.id, tc.name);
        parts.push({
          functionCall: { name: tc.name, args: tc.arguments },
        });
      }
      result.push({ role: 'model', parts });
    } else if (msg.role === 'tool') {
      // Gemini: functionResponse parts
      const parts: any[] = [];
      for (const tr of msg.toolResults ?? []) {
        parts.push({
          functionResponse: {
            // Gemini uses the function name, not the call id.
            name: toolNameById.get(tr.toolCallId) ?? tr.toolCallId,
            response: { content: tr.content },
          },
        });
      }
      result.push({ role: 'function', parts });
    }
  }

  return result;
}
741
+
742
+ // ---------------------------------------------------------------------------
743
+ // Unified router
744
+ // ---------------------------------------------------------------------------
745
+
746
/** Number of retries after the first attempt (so up to `MAX_RETRIES + 1` attempts). */
const MAX_RETRIES = 2;

/** HTTP status codes treated as retryable: throttling, server errors, overload. */
const RETRYABLE_STATUS_CODES = new Set<number>([429, 500, 502, 503, 504, 529]);

/**
 * Per-call LLM timeout in milliseconds (4 minutes).
 *
 * Author's rationale: reasoning models (GLM-5.x, o-series, R1) routinely
 * spend 60–180s on hidden chain-of-thought before producing a visible byte,
 * so the cap has to accommodate them. Non-reasoning models still return in
 * seconds; a higher ceiling only removes false-positive timeouts.
 *
 * Spec 13 — per-call timeout and per-turn wall-clock cap.
 */
const LLM_TIMEOUT_MS = 4 * 60 * 1000;

/** Wall-clock budget for one whole turn, in milliseconds (10 minutes). */
const TURN_WALL_CLOCK_MS = 10 * 60 * 1000;
756
+
757
/**
 * Settle with the outcome of `p`, unless `ms` milliseconds pass first.
 *
 * The timer is cleared as soon as either side settles. On timeout the
 * returned promise rejects, but `p` itself is not cancelled.
 *
 * @param p     Promise to wait for.
 * @param ms    Time limit in milliseconds.
 * @param label Prefix for the timeout error message.
 * @returns A promise with the value or rejection of `p`, or a rejection
 *          with `Error("<label> timeout after <ms>ms")`.
 */
function withTimeout<T>(p: Promise<T>, ms: number, label: string): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const expiry = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => reject(new Error(`${label} timeout after ${ms}ms`)), ms);
  });
  return Promise.race([p, expiry]).finally(() => clearTimeout(timer));
}
764
+
765
/**
 * Extract a retry delay from an error message.
 *
 * Looks, case-insensitively, for "retry after" / "retry-after" /
 * "retryafter" and takes the first run of digits that follows it. The
 * number is interpreted as seconds.
 *
 * @param msg Error message text.
 * @returns The delay in milliseconds, or `null` when no match is found.
 */
function parseRetryAfter(msg: string): number | null {
  const seconds = /retry[- ]?after[^0-9]*(\d+)/i.exec(msg)?.[1];
  return seconds === undefined ? null : parseInt(seconds, 10) * 1000;
}
770
+
771
/** One alternative to try: the provider to call and the model id to request. */
type FallbackTarget = { provider: ProviderId; model: string };

/**
 * Fallback chains, keyed by the originally requested model id. Each value
 * lists alternatives in the order they should be tried.
 *
 * Per the original note, a chain is consulted when a model is overloaded
 * (529) or rate-limited (429), before giving up; the code that walks the
 * chains lies outside this section.
 */
const FALLBACK_CHAINS: Record<string, FallbackTarget[]> = {
  'claude-opus-4-20250514': [
    { provider: 'anthropic', model: 'claude-sonnet-4-5-20250929' },
    { provider: 'openai', model: 'gpt-5.4' },
  ],
  'claude-sonnet-4-5-20250929': [
    { provider: 'openai', model: 'gpt-5.4' },
    { provider: 'anthropic', model: 'claude-haiku-4-5-20251001' },
  ],
  'gpt-5.4': [
    { provider: 'anthropic', model: 'claude-sonnet-4-5-20250929' },
    { provider: 'openai', model: 'gpt-5.4-mini' },
  ],
};
789
+
790
/**
 * Unified entry point for a single LLM request.
 *
 * Flow:
 *  1. Resolve provider/model defaults and verify an API key exists
 *     (ollama is the only provider allowed to run without one).
 *  2. Call the requested model, retrying up to `MAX_RETRIES` extra times on
 *     retryable HTTP statuses or on a per-call timeout, with exponential
 *     backoff (or the server-provided retry-after hint), bounded by the turn
 *     deadline. Each attempt first reserves capacity from the rate limiter,
 *     when one is configured.
 *  3. If the primary model still fails — for any reason, retryable or not —
 *     walk `FALLBACK_CHAINS[model]` in order, skipping providers that have no
 *     API key. Fallback calls are time-limited but do not go through the rate
 *     limiter, and their errors are discarded.
 *
 * @param req The request; `provider` defaults to 'anthropic' and `model` to
 *            the provider's default model.
 * @returns The provider response. When a fallback answered, `wasFallback` is
 *          true and `requestedModel` holds the model originally asked for.
 * @throws LlmCallError with severity 'fatal' when no API key is configured.
 * @throws The last error seen from the primary model when every retry and
 *         every fallback has failed.
 */
export async function callLLM(req: LLMRequest): Promise<LLMResponse> {
  const provider = req.provider || 'anthropic';
  const model = req.model || DEFAULT_MODELS[provider] || 'claude-sonnet-4-5-20250929';

  const apiKey = getApiKey(provider);
  if (!apiKey && provider !== 'ollama') {
    const envVars: Record<string, string> = {
      'anthropic': 'ANTHROPIC_API_KEY',
      'openai': 'OPENAI_API_KEY',
      'deepseek': 'DEEPSEEK_API_KEY',
      'xai': 'XAI_API_KEY',
      'zai': 'ZAI_API_KEY',
      'google': 'GOOGLE_API_KEY',
      'nvidia-router': 'NVIDIA_API_KEY',
    };
    throw new LlmCallError(
      `No API key for "${provider}". Set ${envVars[provider] || 'API_KEY'} in environment or .env file.`,
      { provider, model, severity: 'fatal' },
    );
  }

  // Try the requested model first
  let lastError: Error | null = null;

  const limiter = getRateLimiter();
  // Rough token budget for the limiter: all prompt text plus the output allowance.
  const estTokens = estimateTokens(
    (req.systemPrompt || '') +
    (req.userMessage || '') +
    (req.messages?.map(m => m.content || '').join('\n') || ''),
  ) + (req.maxOutputTokens || 2048);

  const turnDeadline = Date.now() + TURN_WALL_CLOCK_MS;
  for (let attempt = 0; attempt <= MAX_RETRIES; attempt++) {
    try {
      if (limiter) {
        try {
          await limiter.acquire(provider, estTokens);
        } catch (e) {
          if (e instanceof RateLimitOverflowError) {
            // Surface as retryable error so the existing fallback kicks in.
            throw new LlmCallError(
              `API 503: ${e.message}`,
              { provider, model, status: 503, severity: 'recoverable', cause: e },
            );
          }
          throw e;
        }
      }
      const response = await withTimeout(
        callProvider(provider, apiKey, model, req),
        LLM_TIMEOUT_MS,
        `${provider}/${model}`,
      );
      if (limiter) {
        limiter.recordResponse(
          provider,
          response.inputTokens,
          response.outputTokens,
          estTokens,
          response.responseHeaders,
        );
      }
      return response;
    } catch (error) {
      // Anything can be thrown in JS. Normalise to an Error so that reading
      // `.message` below cannot itself throw and mask the real failure.
      const failure = error instanceof Error ? error : new Error(String(error));
      lastError = failure;

      // Prefer the typed status from LlmCallError; fall back to parsing
      // the message for legacy errors (anything thrown as bare Error).
      let statusCode = 0;
      if (failure instanceof LlmCallError && typeof failure.status === 'number') {
        statusCode = failure.status;
      } else {
        const statusMatch = failure.message.match(/API (\d+):/);
        if (statusMatch) statusCode = Number.parseInt(statusMatch[1], 10);
      }
      const retryAfter = parseRetryAfter(failure.message);
      const isTimeout = /timeout after/.test(failure.message);
      const retryable = RETRYABLE_STATUS_CODES.has(statusCode) || isTimeout;
      if (limiter && (statusCode === 429 || statusCode === 503)) {
        // Tell the limiter to back off; default to 5s when no hint was given.
        limiter.recordThrottle(provider, retryAfter ?? 5000);
      }

      if (attempt < MAX_RETRIES && retryable && Date.now() < turnDeadline) {
        // Exponential backoff (1s, 2s, ... capped at 8s) unless the server
        // told us how long to wait; never sleep past the turn deadline.
        const baseDelay = retryAfter ?? Math.min(1000 * Math.pow(2, attempt), 8_000);
        const delay = Math.min(baseDelay, Math.max(0, turnDeadline - Date.now()));
        // Do NOT write to stderr here — the Rust TUI renders to stderr and
        // any stray bytes corrupt the frame. Details land in backend.log.
        await new Promise(resolve => setTimeout(resolve, delay));
        continue;
      }

      // Not retryable — break to fallback
      break;
    }
  }

  // Try fallback models
  const fallbacks = FALLBACK_CHAINS[model] || [];
  for (const fb of fallbacks) {
    const fbKey = getApiKey(fb.provider);
    if (!fbKey && fb.provider !== 'ollama') continue;

    try {
      const fbReq = { ...req, provider: fb.provider, model: fb.model };
      // Wrap fallback in the same timeout as the primary call — otherwise
      // a hung fallback provider can block the turn forever.
      const fbResponse = await withTimeout(
        callProvider(fb.provider, fbKey, fb.model, fbReq),
        LLM_TIMEOUT_MS,
        `${fb.provider}/${fb.model}`,
      );
      fbResponse.wasFallback = true;
      fbResponse.requestedModel = model;
      return fbResponse;
    } catch {
      // Fallback also failed — try next
      continue;
    }
  }

  // All retries and fallbacks exhausted
  throw lastError ?? new Error('All retry attempts and fallbacks exhausted');
}
901
+
902
/**
 * Dispatches a request to the client implementation for `provider`.
 *
 * Anthropic and Google have dedicated clients; every other supported provider
 * is reached through the OpenAI-compatible client with a provider-specific
 * base URL.
 *
 * @param provider Which backend to call.
 * @param apiKey   Credential for the backend; ignored for ollama, which is
 *                 always called with the placeholder key 'ollama'.
 * @param model    Model id passed through to the backend.
 * @param req      The request passed through to the backend.
 * @returns The pending provider response.
 * @throws LlmCallError with severity 'fatal' (synchronously) for an
 *         unrecognised provider.
 */
function callProvider(
  provider: ProviderId,
  apiKey: string | undefined,
  model: string,
  req: LLMRequest,
): Promise<LLMResponse> {
  // Providers with their own wire format.
  if (provider === 'anthropic') {
    return callAnthropic(apiKey!, model, req);
  }
  if (provider === 'google') {
    return callGemini(apiKey!, model, req);
  }

  // Everything else speaks the OpenAI-compatible protocol; only the
  // endpoint (and, for ollama, the key) differs.
  let baseUrl: string;
  let key = apiKey!;
  switch (provider) {
    case 'openai':
      baseUrl = 'https://api.openai.com/v1';
      break;

    case 'deepseek':
      baseUrl = 'https://api.deepseek.com/v1';
      break;

    case 'xai':
      baseUrl = 'https://api.x.ai/v1';
      break;

    case 'zai':
      // Coding Plan endpoint — required for GLM Coding Plan subscriptions.
      // The general-purpose /api/paas/v4 endpoint returns 1113 for coding-plan keys.
      baseUrl = 'https://api.z.ai/api/coding/paas/v4';
      break;

    case 'nvidia-router':
      // Endpoint is overridable via environment; defaults to a local router.
      baseUrl = process.env.NVIDIA_ROUTER_URL || 'http://localhost:8001/v1';
      break;

    case 'ollama':
      baseUrl = 'http://localhost:11434/v1';
      key = 'ollama';
      break;

    default:
      throw new LlmCallError(
        `Unknown provider "${provider}". Supported: anthropic, openai, deepseek, xai, zai, google, ollama, nvidia-router`,
        { provider, model, severity: 'fatal' },
      );
  }

  return callOpenAICompatible(baseUrl, key, model, provider, req);
}