@smilintux/skcapstone 0.2.5 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +61 -0
  2. package/docs/CUSTOM_AGENT.md +172 -0
  3. package/openclaw-plugin/src/index.ts +75 -4
  4. package/package.json +1 -1
  5. package/pyproject.toml +2 -1
  6. package/scripts/install.ps1 +2 -1
  7. package/scripts/install.sh +2 -1
  8. package/scripts/nvidia-proxy.mjs +663 -0
  9. package/src/skcapstone/__init__.py +70 -1
  10. package/src/skcapstone/agent_card.py +4 -1
  11. package/src/skcapstone/blueprint_registry.py +78 -0
  12. package/src/skcapstone/cli/_common.py +5 -5
  13. package/src/skcapstone/cli/card.py +36 -5
  14. package/src/skcapstone/cli/config_cmd.py +53 -1
  15. package/src/skcapstone/cli/memory.py +172 -1
  16. package/src/skcapstone/cli/peer.py +3 -1
  17. package/src/skcapstone/cli/peers_dir.py +3 -1
  18. package/src/skcapstone/cli/preflight_cmd.py +4 -0
  19. package/src/skcapstone/cli/skills_cmd.py +120 -24
  20. package/src/skcapstone/cli/soul.py +47 -24
  21. package/src/skcapstone/cli/status.py +17 -11
  22. package/src/skcapstone/cli/usage_cmd.py +21 -2
  23. package/src/skcapstone/consciousness_config.py +27 -0
  24. package/src/skcapstone/daemon.py +28 -9
  25. package/src/skcapstone/defaults/lumina/config/skgraph.yaml +12 -0
  26. package/src/skcapstone/defaults/lumina/config/skvector.yaml +9 -0
  27. package/src/skcapstone/defaults/lumina/manifest.json +18 -0
  28. package/src/skcapstone/defaults/lumina/soul/active.json +1 -1
  29. package/src/skcapstone/defaults/lumina/soul/base.json +12 -2
  30. package/src/skcapstone/defaults/lumina/wallet/joules.json +7 -0
  31. package/src/skcapstone/doctor.py +115 -0
  32. package/src/skcapstone/dreaming.py +761 -0
  33. package/src/skcapstone/mcp_tools/notification_tools.py +12 -11
  34. package/src/skcapstone/notifications.py +40 -27
  35. package/src/skcapstone/onboard.py +46 -0
  36. package/src/skcapstone/pillars/sync.py +11 -4
  37. package/src/skcapstone/register.py +8 -0
  38. package/src/skcapstone/scheduled_tasks.py +45 -0
  39. package/src/skcapstone/soul.py +19 -0
  40. package/systemd/skcapstone.service +5 -6
@@ -0,0 +1,663 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * NVIDIA NIM API Proxy
4
+ *
5
+ * Sits between OpenClaw and the NVIDIA NIM API. Handles the fact that
6
+ * NVIDIA NIM rejects responses with multiple tool calls (400 error)
7
+ * even when parallel_tool_calls: false is set.
8
+ *
9
+ * Strategy:
10
+ * 1. Inject parallel_tool_calls: false + system instruction
11
+ * 2. On 400 "single tool-calls": reduce to max 6 tools + force tool_choice
12
+ * 3. On second 400: send with just 1 tool (the most likely one) via tool_choice
13
+ * 4. Final fallback: strip tools, get text-only response
14
+ *
15
+ * Usage:
16
+ * node nvidia-proxy.mjs [--port 18780] [--target https://integrate.api.nvidia.com/v1]
17
+ *
18
+ * Then point OpenClaw's nvidia provider baseUrl to http://127.0.0.1:18780/v1
19
+ */
20
+
21
+ import http from "node:http";
22
+ import https from "node:https";
23
+ import { URL } from "node:url";
24
+
25
+ const DEFAULT_PORT = parseInt(process.env.NVIDIA_PROXY_PORT || "18780", 10);
26
+ const DEFAULT_TARGET = process.env.NVIDIA_PROXY_TARGET || "https://integrate.api.nvidia.com/v1";
27
+ const MAX_RETRIES = 4;
28
+ const MAX_429_RETRIES = 3;
29
+ const RATE_LIMIT_DELAY_MS = 2000;
30
+ const MAX_SYSTEM_BYTES = 25000;
31
+
32
+ const args = process.argv.slice(2);
33
+ let port = DEFAULT_PORT;
34
+ let targetBase = DEFAULT_TARGET;
35
+
36
+ for (let i = 0; i < args.length; i++) {
37
+ if (args[i] === "--port" && args[i + 1]) port = parseInt(args[++i], 10);
38
+ if (args[i] === "--target" && args[i + 1]) targetBase = args[++i];
39
+ }
40
+
41
+ const targetUrl = new URL(targetBase.replace(/\/v1\/?$/, ""));
42
+
43
+ /** Send a request to NVIDIA and return { status, headers, body } */
44
+ function sendUpstream(reqUrl, method, headers, body) {
45
+ return new Promise((resolve) => {
46
+ const upstream = new URL(reqUrl, targetUrl);
47
+ const proxyHeaders = { ...headers };
48
+ proxyHeaders.host = upstream.host;
49
+ proxyHeaders["content-length"] = body.length;
50
+ delete proxyHeaders.connection;
51
+ delete proxyHeaders["keep-alive"];
52
+
53
+ const transport = upstream.protocol === "https:" ? https : http;
54
+ const upstreamReq = transport.request(
55
+ {
56
+ hostname: upstream.hostname,
57
+ port: upstream.port || (upstream.protocol === "https:" ? 443 : 80),
58
+ path: upstream.pathname + upstream.search,
59
+ method,
60
+ headers: proxyHeaders,
61
+ },
62
+ (upstreamRes) => {
63
+ const chunks = [];
64
+ upstreamRes.on("data", (c) => chunks.push(c));
65
+ upstreamRes.on("end", () => {
66
+ resolve({
67
+ status: upstreamRes.statusCode,
68
+ headers: upstreamRes.headers,
69
+ body: Buffer.concat(chunks),
70
+ });
71
+ });
72
+ },
73
+ );
74
+ upstreamReq.on("error", (err) => {
75
+ resolve({ status: 502, headers: {}, body: Buffer.from(JSON.stringify({ error: { message: err.message } })) });
76
+ });
77
+ upstreamReq.write(body);
78
+ upstreamReq.end();
79
+ });
80
+ }
81
+
82
/**
 * (Documents sendOk, which is defined below, after sanitizeContent.)
 * Send a 200 response, converting to SSE if the original request was streaming.
 * @param {http.ServerResponse} clientRes
 * @param {object} resBody - parsed JSON response body
 * @param {object} headers - upstream response headers
 * @param {boolean} asSSE - whether to wrap as SSE
 */
89
+ /**
90
+ * Sanitize model text content — strip leaked tool call markup from Kimi K2.5.
91
+ * When tools are stripped, Kimi embeds raw tool syntax in text output.
92
+ */
93
+ function sanitizeContent(text) {
94
+ if (!text) return text;
95
+ // Strip Kimi's leaked tool call markup blocks
96
+ let cleaned = text.replace(/<\|tool_calls_section_begin\|>[\s\S]*?<\|tool_calls_section_end\|>/g, "");
97
+ // Strip individual tool call fragments that might not have the section wrapper
98
+ cleaned = cleaned.replace(/<\|tool_call_begin\|>[\s\S]*?<\|tool_call_end\|>/g, "");
99
+ cleaned = cleaned.replace(/<\|tool_call_argument_begin\|>[\s\S]*?(<\|tool_call_end\|>|$)/g, "");
100
+ // Clean up leftover whitespace from removed blocks
101
+ cleaned = cleaned.replace(/\n{3,}/g, "\n\n").trim();
102
+ if (cleaned !== text) {
103
+ console.log(`[nvidia-proxy] SANITIZED: stripped leaked tool call markup (${text.length} → ${cleaned.length} chars)`);
104
+ }
105
+ return cleaned;
106
+ }
107
+
108
+ function sendOk(clientRes, resBody, headers, asSSE) {
109
+ // Sanitize text content before sending
110
+ const choice = resBody.choices?.[0];
111
+ if (choice?.message?.content) {
112
+ choice.message.content = sanitizeContent(choice.message.content);
113
+ }
114
+ if (asSSE) {
115
+ if (!clientRes.headersSent) {
116
+ const sseHeaders = { ...headers };
117
+ sseHeaders["content-type"] = "text/event-stream; charset=utf-8";
118
+ delete sseHeaders["content-length"];
119
+ delete sseHeaders["transfer-encoding"];
120
+ sseHeaders["cache-control"] = "no-cache";
121
+ clientRes.writeHead(200, sseHeaders);
122
+ }
123
+
124
+ const base = { id: resBody.id, object: "chat.completion.chunk", created: resBody.created, model: resBody.model };
125
+ const choice = resBody.choices?.[0];
126
+
127
+ if (!choice) {
128
+ clientRes.write("data: [DONE]\n\n");
129
+ clientRes.end();
130
+ return;
131
+ }
132
+
133
+ const msg = choice.message || {};
134
+
135
+ // 1. Role chunk
136
+ clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: { role: msg.role || "assistant" }, finish_reason: null }] })}\n\n`);
137
+
138
+ // 2. Content chunks (split into smaller pieces for proper streaming behavior)
139
+ const content = msg.content || "";
140
+ if (content) {
141
+ const chunkSize = 100;
142
+ for (let i = 0; i < content.length; i += chunkSize) {
143
+ clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: { content: content.slice(i, i + chunkSize) }, finish_reason: null }] })}\n\n`);
144
+ }
145
+ }
146
+
147
+ // 3. Tool calls (if any) — send as a single delta
148
+ if (msg.tool_calls && msg.tool_calls.length > 0) {
149
+ clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: { tool_calls: msg.tool_calls }, finish_reason: null }] })}\n\n`);
150
+ }
151
+
152
+ // 4. Usage chunk (if present)
153
+ if (resBody.usage) {
154
+ clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: {}, finish_reason: choice.finish_reason || "stop" }], usage: resBody.usage })}\n\n`);
155
+ } else {
156
+ clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: {}, finish_reason: choice.finish_reason || "stop" }] })}\n\n`);
157
+ }
158
+
159
+ clientRes.write("data: [DONE]\n\n");
160
+ clientRes.end();
161
+ } else {
162
+ const body = Buffer.from(JSON.stringify(resBody), "utf-8");
163
+ const outHeaders = { ...headers };
164
+ outHeaders["content-length"] = body.length;
165
+ clientRes.writeHead(200, outHeaders);
166
+ clientRes.end(body);
167
+ }
168
+ }
169
+
170
+ const SINGLE_TOOL_INSTRUCTION =
171
+ "You MUST call exactly ONE tool per response. Never call multiple tools at once.";
172
+
173
+ const MAX_BODY_BYTES = 60000;
174
+
175
+ /**
176
+ * Trim conversation history to keep body size under MAX_BODY_BYTES.
177
+ * Preserves: system messages, first 2 user/assistant messages (identity/rehydration),
178
+ * and the most recent messages. Drops middle messages first.
179
+ * Tool result messages with large content get their content truncated first.
180
+ */
181
+ function trimConversationHistory(parsed) {
182
+ if (!Array.isArray(parsed.messages) || parsed.messages.length < 6) return;
183
+
184
+ // First pass: truncate large tool results (keep first 500 chars)
185
+ for (const m of parsed.messages) {
186
+ if (m.role === "tool" || m.role === "toolResult") {
187
+ if (typeof m.content === "string" && m.content.length > 500) {
188
+ m.content = m.content.slice(0, 500) + "\n...[truncated]";
189
+ } else if (Array.isArray(m.content)) {
190
+ for (const c of m.content) {
191
+ if (c.type === "text" && typeof c.text === "string" && c.text.length > 500) {
192
+ c.text = c.text.slice(0, 500) + "\n...[truncated]";
193
+ }
194
+ }
195
+ }
196
+ }
197
+ }
198
+
199
+ // Check if we're still over budget
200
+ let bodySize = Buffer.byteLength(JSON.stringify(parsed), "utf-8");
201
+ if (bodySize <= MAX_BODY_BYTES) return;
202
+
203
+ // Second pass: drop middle messages, then progressively shrink tail until under budget
204
+ const msgs = parsed.messages;
205
+ const system = msgs.filter(m => m.role === "system");
206
+ const nonSystem = msgs.filter(m => m.role !== "system");
207
+
208
+ if (nonSystem.length <= 4) return; // not enough to trim
209
+
210
+ const keepStart = 2;
211
+ let keepEnd = Math.min(6, nonSystem.length - keepStart);
212
+
213
+ // Loop: keep reducing tail until under budget
214
+ while (keepEnd >= 2) {
215
+ const dropped = nonSystem.length - keepStart - keepEnd;
216
+ const trimmed = [
217
+ ...system,
218
+ ...nonSystem.slice(0, keepStart),
219
+ ...(dropped > 0 ? [{ role: "system", content: `[${dropped} earlier messages trimmed to save context]` }] : []),
220
+ ...nonSystem.slice(-keepEnd),
221
+ ];
222
+ const candidateSize = Buffer.byteLength(JSON.stringify({ ...parsed, messages: trimmed }), "utf-8");
223
+ if (candidateSize <= MAX_BODY_BYTES) {
224
+ parsed.messages = trimmed;
225
+ console.log(`[nvidia-proxy] trimmed history: dropped ${dropped} middle messages, keepEnd=${keepEnd}, bodyLen now ~${candidateSize}`);
226
+ return;
227
+ }
228
+ keepEnd--;
229
+ }
230
+
231
+ // Last resort: system + first user message (the original request) + last 2 non-system
232
+ // Always keep the first user message so the model remembers what was asked
233
+ const firstUser = nonSystem.find(m => m.role === "user");
234
+ const lastTwo = nonSystem.slice(-2);
235
+ const minimal = [
236
+ ...system,
237
+ ...(firstUser && !lastTwo.includes(firstUser) ? [firstUser, { role: "system", content: "[middle messages trimmed — focus on answering the user's request above]" }] : []),
238
+ ...lastTwo,
239
+ ];
240
+ parsed.messages = minimal;
241
+ bodySize = Buffer.byteLength(JSON.stringify(parsed), "utf-8");
242
+ console.log(`[nvidia-proxy] trimmed history: AGGRESSIVE — kept system + first user + last 2, bodyLen now ~${bodySize}`);
243
+ }
244
+
245
+ /**
246
+ * Trim system messages to keep total system content under MAX_SYSTEM_BYTES.
247
+ * Finds the largest system messages and truncates them, keeping head + tail
248
+ * with a trimming notice in the middle.
249
+ */
250
+ function trimSystemMessages(parsed) {
251
+ if (!Array.isArray(parsed.messages)) return;
252
+
253
+ const systemMsgs = parsed.messages.filter(m => m.role === "system" && typeof m.content === "string");
254
+ if (systemMsgs.length === 0) return;
255
+
256
+ const before = systemMsgs.reduce((sum, m) => sum + Buffer.byteLength(m.content, "utf-8"), 0);
257
+ if (before <= MAX_SYSTEM_BYTES) return;
258
+
259
+ let trimmedCount = 0;
260
+
261
+ // Sort by size descending to trim largest first
262
+ const sorted = [...systemMsgs].sort((a, b) => b.content.length - a.content.length);
263
+
264
+ for (const msg of sorted) {
265
+ // Re-measure current total
266
+ const currentTotal = parsed.messages
267
+ .filter(m => m.role === "system" && typeof m.content === "string")
268
+ .reduce((sum, m) => sum + Buffer.byteLength(m.content, "utf-8"), 0);
269
+ if (currentTotal <= MAX_SYSTEM_BYTES) break;
270
+
271
+ // Skip messages already under 4000 chars
272
+ if (msg.content.length <= 4000) break;
273
+
274
+ const head = msg.content.slice(0, 3000);
275
+ const tail = msg.content.slice(-1000);
276
+ msg.content = head + "\n\n[...content trimmed to save context — use skmemory_ritual tool for full identity...]\n\n" + tail;
277
+ trimmedCount++;
278
+ }
279
+
280
+ if (trimmedCount > 0) {
281
+ const after = parsed.messages
282
+ .filter(m => m.role === "system" && typeof m.content === "string")
283
+ .reduce((sum, m) => sum + Buffer.byteLength(m.content, "utf-8"), 0);
284
+ console.log(`[nvidia-proxy] trimmed system prompt: ${before} → ${after} bytes (${trimmedCount} messages trimmed)`);
285
+ }
286
+ }
287
+
288
+ /**
289
+ * Strip tool_calls from conversation history to prevent the model from
290
+ * learning the pattern of calling multiple tools. Converts assistant
291
+ * tool_call messages to plain text and removes tool result messages.
292
+ */
293
+ function stripToolCallHistory(messages) {
294
+ if (!Array.isArray(messages)) return;
295
+ // Remove tool result messages
296
+ for (let i = messages.length - 1; i >= 0; i--) {
297
+ const m = messages[i];
298
+ if (m.role === "tool" || m.role === "toolResult") {
299
+ messages.splice(i, 1);
300
+ } else if (m.role === "assistant" && m.tool_calls) {
301
+ // Convert tool_call messages to plain text summaries
302
+ const toolNames = m.tool_calls.map((tc) => tc.function?.name).join(", ");
303
+ m.content = m.content || `[Used: ${toolNames}]`;
304
+ delete m.tool_calls;
305
+ }
306
+ }
307
+ }
308
+
309
+ /** Priority tools — kept when reducing tool count (order matters) */
310
+ const PRIORITY_TOOLS = [
311
+ // Core agent tools
312
+ "exec", "read", "write", "edit",
313
+ // Communication (critical for Telegram)
314
+ "message",
315
+ // Memory tools (most frequently needed)
316
+ "skmemory_health", "skmemory_search", "skmemory_snapshot",
317
+ "skmemory_ritual", "skmemory_context", "skmemory_list",
318
+ // Web tools
319
+ "web_search", "web_fetch",
320
+ // Communication (other channels)
321
+ "skchat_send", "skcomm_send",
322
+ // SKCapstone
323
+ "skcapstone_status", "skcapstone_whoami", "skcapstone_mood",
324
+ // Cloud 9
325
+ "cloud9_oof", "cloud9_rehydrate",
326
+ // Memory (infrequent)
327
+ "skmemory_export", "skmemory_import_seeds",
328
+ ];
329
+
330
+ /**
331
+ * Reduce the tools array to at most `max` tools, preferring tools
332
+ * mentioned in recent messages and priority tools.
333
+ * Avoids generic tools (exec, read) when specific tools are available.
334
+ */
335
+ function reduceTools(tools, messages, max) {
336
+ if (tools.length <= max) return tools;
337
+
338
+ // Score each tool — higher = more likely to be kept
339
+ const scores = new Map();
340
+ for (const t of tools) {
341
+ const name = t.function?.name || "";
342
+ let score = 0;
343
+
344
+ // Boost tools mentioned in the user's last message (strongest signal)
345
+ const lastUserMsg = [...(messages || [])].reverse().find(m => m.role === "user");
346
+ if (lastUserMsg) {
347
+ const text = typeof lastUserMsg.content === "string" ? lastUserMsg.content : JSON.stringify(lastUserMsg.content || "");
348
+ if (text.includes(name)) score += 200;
349
+ // Also match partial names (e.g., "health" matches "skmemory_health")
350
+ const parts = name.split("_");
351
+ for (const part of parts) {
352
+ if (part.length > 3 && text.toLowerCase().includes(part.toLowerCase())) score += 100;
353
+ }
354
+ }
355
+
356
+ // Priority list bonus
357
+ const prioIdx = PRIORITY_TOOLS.indexOf(name);
358
+ if (prioIdx >= 0) score += 50 - prioIdx;
359
+
360
+ // Boost tools in recent assistant tool_calls
361
+ const recentMsgs = (messages || []).slice(-6);
362
+ for (const m of recentMsgs) {
363
+ if (m.tool_calls) {
364
+ for (const tc of m.tool_calls) {
365
+ if (tc.function?.name === name) score += 80;
366
+ }
367
+ }
368
+ }
369
+
370
+ // Penalize process tool (exec is critical for agent operation)
371
+ if (name === "process") score -= 30;
372
+
373
+ scores.set(name, { tool: t, score });
374
+ }
375
+
376
+ const sorted = [...scores.values()].sort((a, b) => b.score - a.score);
377
+ return sorted.slice(0, max).map((s) => s.tool);
378
+ }
379
+
380
+ async function proxyRequest(clientReq, clientRes) {
381
+ const chunks = [];
382
+ for await (const chunk of clientReq) chunks.push(chunk);
383
+ let body = Buffer.concat(chunks);
384
+ const contentType = clientReq.headers["content-type"] || "";
385
+
386
+ const isChatCompletion =
387
+ contentType.includes("application/json") &&
388
+ clientReq.url.includes("/chat/completions");
389
+
390
+ let parsed = null;
391
+ if (isChatCompletion) {
392
+ try {
393
+ parsed = JSON.parse(body.toString("utf-8"));
394
+ } catch {
395
+ // pass through
396
+ }
397
+ }
398
+
399
+ // For non-tool requests or non-chat-completions, just proxy through
400
+ if (!parsed || !parsed.tools || !Array.isArray(parsed.tools) || parsed.tools.length === 0) {
401
+ const res = await sendUpstream(clientReq.url, clientReq.method, clientReq.headers, body);
402
+ clientRes.writeHead(res.status, res.headers);
403
+ clientRes.end(res.body);
404
+ return;
405
+ }
406
+
407
+ // Save original tools for reference
408
+ const allTools = [...parsed.tools];
409
+
410
+ // Tool request — proactively limit tools to reduce parallel call tendency
411
+ parsed.parallel_tool_calls = false;
412
+ // Force non-streaming for tool requests — proxy buffers full response anyway,
413
+ // and streaming (SSE) prevents us from inspecting/fixing tool calls
414
+ const wasStreaming = parsed.stream;
415
+ parsed.stream = false;
416
+ delete parsed.stream_options;
417
+ // With 94 tools the model almost always tries parallel calls.
418
+ // Reduce to max 12 most relevant tools on first attempt.
419
+ if (allTools.length > 12) {
420
+ parsed.tools = reduceTools(allTools, parsed.messages, 12);
421
+ const names = parsed.tools.map(t => t.function?.name).join(",");
422
+ console.log(`[nvidia-proxy] proactive reduction: ${allTools.length}→${parsed.tools.length} tools [${names}]`);
423
+ }
424
+
425
+ // Add system instruction to force single tool call
426
+ if (Array.isArray(parsed.messages)) {
427
+ const hasInstruction = parsed.messages.some(
428
+ (m) => m.role === "system" && typeof m.content === "string" && m.content.includes("ONE tool at a time"),
429
+ );
430
+ if (!hasInstruction) {
431
+ parsed.messages.unshift({
432
+ role: "system",
433
+ content: SINGLE_TOOL_INSTRUCTION,
434
+ });
435
+ }
436
+ }
437
+
438
+ // Trim conversation history FIRST so tool limiter counts only surviving messages
439
+ trimConversationHistory(parsed);
440
+ trimSystemMessages(parsed);
441
+
442
+ // After trimming, check if too many tool calls remain — force text response
443
+ if (Array.isArray(parsed.messages) && parsed.tools?.length > 0) {
444
+ const toolResultCount = parsed.messages.filter(m => m.role === "tool" || m.role === "toolResult").length;
445
+ if (toolResultCount >= 8) {
446
+ console.log(`[nvidia-proxy] TOOL LIMIT: ${toolResultCount} tool results after trimming — stripping tools, forcing text response`);
447
+ parsed.tools = [];
448
+ delete parsed.tool_choice;
449
+ parsed.messages.push({
450
+ role: "system",
451
+ content: "You have gathered enough information from tool calls. NOW respond to the user with a comprehensive text answer. Do NOT try to call more tools. Do NOT output any tool call markup. Synthesize what you learned and reply directly.",
452
+ });
453
+ }
454
+ }
455
+
456
+ const model = parsed.model || "unknown";
457
+
458
+ // If client wanted streaming, start SSE headers early so we can send keep-alive
459
+ // comments while waiting for NVIDIA. This keeps the gateway's typing indicator alive.
460
+ let sseStarted = false;
461
+ let keepAliveTimer = null;
462
+ function startSSEKeepAlive() {
463
+ if (!wasStreaming || sseStarted) return;
464
+ sseStarted = true;
465
+ clientRes.writeHead(200, {
466
+ "content-type": "text/event-stream; charset=utf-8",
467
+ "cache-control": "no-cache",
468
+ "connection": "keep-alive",
469
+ });
470
+ keepAliveTimer = setInterval(() => {
471
+ try { clientRes.write(": keep-alive\n\n"); } catch {}
472
+ }, 5000);
473
+ }
474
+ function stopKeepAlive() {
475
+ if (keepAliveTimer) { clearInterval(keepAliveTimer); keepAliveTimer = null; }
476
+ }
477
+
478
+ for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
479
+ const currentToolCount = parsed.tools ? parsed.tools.length : 0;
480
+ const reqBody = Buffer.from(JSON.stringify(parsed), "utf-8");
481
+ console.log(
482
+ `[nvidia-proxy] ${new Date().toISOString()} attempt=${attempt} model=${model} tools=${currentToolCount} bodyLen=${reqBody.length}`,
483
+ );
484
+
485
+ // Start keep-alive comments while NVIDIA processes
486
+ if (wasStreaming) startSSEKeepAlive();
487
+
488
+ let res;
489
+ // Handle 429 rate limiting with internal retries + backoff
490
+ for (let r429 = 0; r429 <= MAX_429_RETRIES; r429++) {
491
+ res = await sendUpstream(clientReq.url, clientReq.method, clientReq.headers, reqBody);
492
+ if (res.status !== 429 || r429 === MAX_429_RETRIES) break;
493
+ const delay = RATE_LIMIT_DELAY_MS * (r429 + 1);
494
+ console.log(`[nvidia-proxy] 429 rate limited, waiting ${delay}ms (retry ${r429 + 1}/${MAX_429_RETRIES})...`);
495
+ await new Promise(r => setTimeout(r, delay));
496
+ }
497
+
498
+ if (res.status === 400) {
499
+ const errText = res.body.toString("utf-8");
500
+ if (errText.includes("single tool-calls") && attempt < MAX_RETRIES) {
501
+ console.log(`[nvidia-proxy] 400 parallel tool-calls rejected, retrying (${attempt}/${MAX_RETRIES})...`);
502
+
503
+ if (attempt === 1) {
504
+ // Attempt 2: reduce to 6 tools + strip tool_calls from history
505
+ // The massive conversation history with tool_calls trains the model to call multiple
506
+ parsed.tools = reduceTools(allTools, parsed.messages, 6);
507
+ stripToolCallHistory(parsed.messages);
508
+ const toolNames = parsed.tools.map(t => t.function?.name).join(",");
509
+ console.log(`[nvidia-proxy] retry: ${parsed.tools.length} tools [${toolNames}], stripped history`);
510
+ } else if (attempt === 2) {
511
+ // Attempt 3: single tool, forced choice
512
+ parsed.tools = reduceTools(allTools, parsed.messages, 1);
513
+ const topTool = parsed.tools[0]?.function?.name;
514
+ if (topTool) {
515
+ parsed.tool_choice = { type: "function", function: { name: topTool } };
516
+ }
517
+ console.log(`[nvidia-proxy] retry: 1 tool, forced=${topTool}`);
518
+ } else {
519
+ // Attempt 4 (final): strip all tools, text-only
520
+ delete parsed.tools;
521
+ delete parsed.tool_choice;
522
+ delete parsed.parallel_tool_calls;
523
+ stripToolCallHistory(parsed.messages);
524
+ console.log(`[nvidia-proxy] final retry: stripped all tools, text-only`);
525
+ }
526
+ continue;
527
+ }
528
+ }
529
+
530
+ // Log tool calls in successful responses
531
+ if (res.status === 200) {
532
+ try {
533
+ const bodyStr = res.body.toString("utf-8");
534
+ const peek = JSON.parse(bodyStr);
535
+ const tc = peek.choices?.[0]?.message?.tool_calls;
536
+ if (tc && tc.length > 0) {
537
+ const names = tc.map(c => c.function?.name).join(", ");
538
+ console.log(`[nvidia-proxy] model called: [${names}] (${tc.length} calls)`);
539
+ } else {
540
+ const content = peek.choices?.[0]?.message?.content;
541
+ console.log(`[nvidia-proxy] model response: text (${content ? content.length : 0} chars)`);
542
+ }
543
+ } catch {
544
+ // SSE streaming responses can't be parsed as JSON — this is expected
545
+ }
546
+ }
547
+
548
+ // Fix ghost tool calls: finish_reason says "tool_calls" but no actual tool_calls present
549
+ if (res.status === 200 && parsed.tools) {
550
+ try {
551
+ const resBody = JSON.parse(res.body.toString("utf-8"));
552
+ const choice = resBody.choices?.[0];
553
+ if (choice && (choice.finish_reason === "tool_calls" || choice.finish_reason === "function_call") && !choice.message?.tool_calls?.length) {
554
+ console.warn(`[nvidia-proxy] GHOST TOOL CALL: finish_reason=${choice.finish_reason} but no tool_calls — fixing to stop`);
555
+ choice.finish_reason = "stop";
556
+ stopKeepAlive();
557
+ sendOk(clientRes, resBody, res.headers, wasStreaming);
558
+ return;
559
+ }
560
+ } catch {
561
+ // Not JSON — pass through
562
+ }
563
+ }
564
+
565
+ // Check for hallucinated/invalid tool names (e.g., Kimi K2.5 "callauto" bug)
566
+ if (res.status === 200 && parsed.tools) {
567
+ try {
568
+ const resBody = JSON.parse(res.body.toString("utf-8"));
569
+ const choice = resBody.choices?.[0];
570
+ if (choice?.message?.tool_calls) {
571
+ // Compare against ALL original tools, not just the reduced set
572
+ const allToolNames = new Set(allTools.map(t => t.function?.name));
573
+ const invalidCalls = choice.message.tool_calls.filter(
574
+ tc => !tc.function?.name || !allToolNames.has(tc.function.name)
575
+ );
576
+ if (invalidCalls.length > 0) {
577
+ const badNames = invalidCalls.map(tc => tc.function?.name || "(empty)").join(", ");
578
+ console.warn(`[nvidia-proxy] CALLAUTO DETECTED: invalid tool names [${badNames}] — stripping tool_calls, returning text-only`);
579
+ // Strip invalid tool calls, keep only content
580
+ choice.message.tool_calls = choice.message.tool_calls.filter(
581
+ tc => tc.function?.name && allToolNames.has(tc.function.name)
582
+ );
583
+ if (choice.message.tool_calls.length === 0) {
584
+ delete choice.message.tool_calls;
585
+ choice.finish_reason = "stop";
586
+ }
587
+ stopKeepAlive();
588
+ sendOk(clientRes, resBody, res.headers, wasStreaming);
589
+ return;
590
+ }
591
+ }
592
+ } catch {
593
+ // Not JSON — pass through
594
+ }
595
+ }
596
+
597
+ // Check for successful response with multiple tool calls — trim to just the first one
598
+ if (res.status === 200 && parsed.tools) {
599
+ try {
600
+ const resBody = JSON.parse(res.body.toString("utf-8"));
601
+ const choice = resBody.choices?.[0];
602
+ if (choice?.message?.tool_calls && choice.message.tool_calls.length > 1) {
603
+ console.log(
604
+ `[nvidia-proxy] trimming ${choice.message.tool_calls.length} tool_calls to 1 (${choice.message.tool_calls[0].function?.name})`,
605
+ );
606
+ choice.message.tool_calls = [choice.message.tool_calls[0]];
607
+ stopKeepAlive();
608
+ sendOk(clientRes, resBody, res.headers, wasStreaming);
609
+ return;
610
+ }
611
+ } catch {
612
+ // Not JSON or parse error — pass through as-is
613
+ }
614
+ }
615
+
616
+ // Success or non-retryable error
617
+ stopKeepAlive();
618
+ if (res.status >= 400) {
619
+ console.error(`[nvidia-proxy] ${res.status} ERROR: ${res.body.toString("utf-8").slice(0, 300)}`);
620
+ if (!clientRes.headersSent) {
621
+ clientRes.writeHead(res.status, res.headers);
622
+ }
623
+ clientRes.end(res.body);
624
+ return;
625
+ }
626
+
627
+ console.log(`[nvidia-proxy] ${res.status} OK (attempt ${attempt})`);
628
+ if (wasStreaming && res.status === 200) {
629
+ try {
630
+ const resBody = JSON.parse(res.body.toString("utf-8"));
631
+ sendOk(clientRes, resBody, res.headers, true);
632
+ } catch {
633
+ // Can't parse — send raw
634
+ if (!clientRes.headersSent) {
635
+ clientRes.writeHead(res.status, res.headers);
636
+ }
637
+ clientRes.end(res.body);
638
+ }
639
+ } else {
640
+ if (!clientRes.headersSent) {
641
+ clientRes.writeHead(res.status, res.headers);
642
+ }
643
+ clientRes.end(res.body);
644
+ }
645
+ return;
646
+ }
647
+ }
648
+
649
+ const server = http.createServer(proxyRequest);
650
+
651
+ server.listen(port, "127.0.0.1", () => {
652
+ console.log(`[nvidia-proxy] listening on http://127.0.0.1:${port}`);
653
+ console.log(`[nvidia-proxy] proxying to ${targetUrl.origin}`);
654
+ console.log(`[nvidia-proxy] retry strategy: 12 tools→6 tools→1 tool (forced)→text-only (max ${MAX_RETRIES} attempts)`);
655
+ console.log(`[nvidia-proxy] also trims multi-tool responses to single tool call`);
656
+ });
657
+
658
+ for (const sig of ["SIGINT", "SIGTERM"]) {
659
+ process.on(sig, () => {
660
+ console.log(`[nvidia-proxy] ${sig} received, shutting down`);
661
+ server.close(() => process.exit(0));
662
+ });
663
+ }