@smilintux/skcapstone 0.2.6 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/README.md +61 -0
  2. package/docs/CUSTOM_AGENT.md +184 -0
  3. package/docs/GETTING_STARTED.md +3 -0
  4. package/openclaw-plugin/src/index.ts +75 -4
  5. package/package.json +1 -1
  6. package/pyproject.toml +1 -1
  7. package/scripts/archive-sessions.sh +72 -0
  8. package/scripts/install.ps1 +2 -1
  9. package/scripts/install.sh +2 -1
  10. package/scripts/nvidia-proxy.mjs +727 -0
  11. package/scripts/telegram-catchup-all.sh +136 -0
  12. package/src/skcapstone/__init__.py +70 -1
  13. package/src/skcapstone/agent_card.py +4 -1
  14. package/src/skcapstone/blueprint_registry.py +78 -0
  15. package/src/skcapstone/blueprints/builtins/itil-operations.yaml +40 -0
  16. package/src/skcapstone/cli/__init__.py +2 -0
  17. package/src/skcapstone/cli/_common.py +5 -5
  18. package/src/skcapstone/cli/card.py +36 -5
  19. package/src/skcapstone/cli/config_cmd.py +53 -1
  20. package/src/skcapstone/cli/itil.py +434 -0
  21. package/src/skcapstone/cli/peer.py +3 -1
  22. package/src/skcapstone/cli/peers_dir.py +3 -1
  23. package/src/skcapstone/cli/preflight_cmd.py +4 -0
  24. package/src/skcapstone/cli/skills_cmd.py +120 -24
  25. package/src/skcapstone/cli/soul.py +47 -24
  26. package/src/skcapstone/cli/status.py +17 -11
  27. package/src/skcapstone/cli/usage_cmd.py +7 -2
  28. package/src/skcapstone/consciousness_config.py +27 -0
  29. package/src/skcapstone/coordination.py +1 -0
  30. package/src/skcapstone/daemon.py +28 -9
  31. package/src/skcapstone/defaults/lumina/manifest.json +1 -1
  32. package/src/skcapstone/doctor.py +115 -0
  33. package/src/skcapstone/dreaming.py +761 -0
  34. package/src/skcapstone/itil.py +1104 -0
  35. package/src/skcapstone/mcp_server.py +258 -0
  36. package/src/skcapstone/mcp_tools/__init__.py +2 -0
  37. package/src/skcapstone/mcp_tools/gtd_tools.py +1 -1
  38. package/src/skcapstone/mcp_tools/itil_tools.py +657 -0
  39. package/src/skcapstone/mcp_tools/notification_tools.py +12 -11
  40. package/src/skcapstone/notifications.py +40 -27
  41. package/src/skcapstone/onboard.py +46 -0
  42. package/src/skcapstone/pillars/sync.py +11 -4
  43. package/src/skcapstone/register.py +8 -0
  44. package/src/skcapstone/scheduled_tasks.py +107 -0
  45. package/src/skcapstone/service_health.py +81 -2
  46. package/src/skcapstone/soul.py +19 -0
  47. package/systemd/skcapstone.service +5 -6
@@ -0,0 +1,727 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * NVIDIA NIM API Proxy
4
+ *
5
+ * Sits between OpenClaw and the NVIDIA NIM API. Handles the fact that
6
+ * NVIDIA NIM rejects responses with multiple tool calls (400 error)
7
+ * even when parallel_tool_calls: false is set.
8
+ *
9
+ * Strategy:
10
+ * 1. Inject parallel_tool_calls: false + system instruction
11
+ * 2. On 400 "single tool-calls": reduce to max 6 tools + force tool_choice
12
+ * 3. On second 400: send with just 1 tool (the most likely one) via tool_choice
13
+ * 4. Final fallback: strip tools, get text-only response
14
+ *
15
+ * Usage:
16
+ * node nvidia-proxy.mjs [--port 18780] [--target https://integrate.api.nvidia.com/v1]
17
+ *
18
+ * Then point OpenClaw's nvidia provider baseUrl to http://127.0.0.1:18780/v1
19
+ */
20
+
21
+ import http from "node:http";
22
+ import https from "node:https";
23
+ import { URL } from "node:url";
24
+
25
// Listen port and upstream base URL; overridable via environment or the
// --port / --target CLI flags parsed below.
const DEFAULT_PORT = parseInt(process.env.NVIDIA_PROXY_PORT || "18780", 10);
const DEFAULT_TARGET = process.env.NVIDIA_PROXY_TARGET || "https://integrate.api.nvidia.com/v1";
// Max attempts of the 400 "single tool-calls" de-escalation ladder in proxyRequest.
const MAX_RETRIES = 4;
// Internal retries (with linear backoff) when the upstream answers 429.
const MAX_429_RETRIES = 3;
const RATE_LIMIT_DELAY_MS = 2000;
// Byte budget for total system-message content (see trimSystemMessages).
const MAX_SYSTEM_BYTES = 25000;
const toolCallCounters = new Map(); // Per-model tool call counters
32
+
33
// CLI parsing: node nvidia-proxy.mjs [--port N] [--target URL]
const args = process.argv.slice(2);
let port = DEFAULT_PORT;
let targetBase = DEFAULT_TARGET;

for (let i = 0; i < args.length; i++) {
  if (args[i] === "--port" && args[i + 1]) port = parseInt(args[++i], 10);
  if (args[i] === "--target" && args[i + 1]) targetBase = args[++i];
}

// Strip a trailing /v1 so incoming request paths (which already contain
// /v1/...) resolve against the host root instead of doubling the prefix.
const targetUrl = new URL(targetBase.replace(/\/v1\/?$/, ""));
43
+
44
/**
 * Forward one HTTP request to the NVIDIA upstream and buffer the whole response.
 *
 * @param {string} reqUrl - path (+ query) taken from the client request
 * @param {string} method - HTTP method to use upstream
 * @param {object} headers - client request headers (copied, then adjusted)
 * @param {Buffer|string} body - request body to send upstream
 * @returns {Promise<{status: number, headers: object, body: Buffer}>}
 *   Never rejects: transport errors resolve as a synthetic 502 body.
 */
function sendUpstream(reqUrl, method, headers, body) {
  return new Promise((resolve) => {
    const upstream = new URL(reqUrl, targetUrl);
    const proxyHeaders = { ...headers };
    proxyHeaders.host = upstream.host;
    proxyHeaders["content-length"] = body.length;
    // The body may have been rewritten by the proxy, so we always send an
    // explicit content-length; a leftover client transfer-encoding header
    // would conflict with it.
    delete proxyHeaders["transfer-encoding"];
    // This proxy JSON-parses and rewrites upstream bodies, and Node's http
    // client does not auto-decompress — so never advertise compression
    // support that the client request may have carried.
    proxyHeaders["accept-encoding"] = "identity";
    delete proxyHeaders.connection;
    delete proxyHeaders["keep-alive"];

    const transport = upstream.protocol === "https:" ? https : http;
    const upstreamReq = transport.request(
      {
        hostname: upstream.hostname,
        port: upstream.port || (upstream.protocol === "https:" ? 443 : 80),
        path: upstream.pathname + upstream.search,
        method,
        headers: proxyHeaders,
      },
      (upstreamRes) => {
        // Buffer the full response so callers can inspect/retry it.
        const chunks = [];
        upstreamRes.on("data", (c) => chunks.push(c));
        upstreamRes.on("end", () => {
          resolve({
            status: upstreamRes.statusCode,
            headers: upstreamRes.headers,
            body: Buffer.concat(chunks),
          });
        });
      },
    );
    upstreamReq.on("error", (err) => {
      // Network failure — surface as a 502 so the retry loop can react.
      resolve({ status: 502, headers: {}, body: Buffer.from(JSON.stringify({ error: { message: err.message } })) });
    });
    upstreamReq.write(body);
    upstreamReq.end();
  });
}
82
+
83
+ /**
84
+ * Send a 200 response, converting to SSE if the original request was streaming.
85
+ * @param {http.ServerResponse} clientRes
86
+ * @param {object} resBody - parsed JSON response body
87
+ * @param {object} headers - upstream response headers
88
+ * @param {boolean} asSSE - whether to wrap as SSE
89
+ */
90
/**
 * Sanitize model text content — strip leaked tool call markup from Kimi K2.5.
 * When tools are stripped, Kimi embeds raw tool syntax in text output.
 * Returns falsy input unchanged; if stripping removes everything, a short
 * fallback sentence is substituted so the gateway still delivers a message.
 */
function sanitizeContent(text) {
  if (!text) return text;
  // Leaked markup shapes, removed in order: whole sections, bare call
  // fragments, then unterminated argument blocks (which may run to EOF).
  const leakedMarkup = [
    /<\|tool_calls_section_begin\|>[\s\S]*?<\|tool_calls_section_end\|>/g,
    /<\|tool_call_begin\|>[\s\S]*?<\|tool_call_end\|>/g,
    /<\|tool_call_argument_begin\|>[\s\S]*?(<\|tool_call_end\|>|$)/g,
  ];
  let cleaned = text;
  for (const pattern of leakedMarkup) {
    cleaned = cleaned.replace(pattern, "");
  }
  // Collapse the blank runs left behind by removed blocks.
  cleaned = cleaned.replace(/\n{3,}/g, "\n\n").trim();
  if (cleaned !== text) {
    console.log(`[nvidia-proxy] SANITIZED: stripped leaked tool call markup (${text.length} → ${cleaned.length} chars)`);
  }
  // A reply that was nothing but markup would otherwise go out empty.
  if (!cleaned && text.length > 0) {
    cleaned = "I'm here but had a brief processing hiccup. Could you repeat your last message? 💜";
    console.log(`[nvidia-proxy] SANITIZED: injected fallback (original was 100% markup)`);
  }
  return cleaned;
}
113
+
114
/**
 * Send a buffered upstream completion back to the client.
 *
 * Sanitizes/fixes the first choice (leaked markup, reasoning-only replies,
 * empty text), then either re-encodes the completion as an OpenAI-style SSE
 * stream (when the client originally asked for streaming) or sends it as a
 * plain JSON body.
 *
 * @param {http.ServerResponse} clientRes
 * @param {object} resBody - parsed JSON response body (mutated in place)
 * @param {object} headers - upstream response headers
 * @param {boolean} asSSE - whether to wrap as SSE
 */
function sendOk(clientRes, resBody, headers, asSSE) {
  // Sanitize text content before sending.
  const choice = resBody.choices?.[0];
  if (choice?.message?.content) {
    choice.message.content = sanitizeContent(choice.message.content);
  }
  // Kimi K2.5 sometimes puts its response in "reasoning" instead of "content".
  // Only promote if reasoning is substantial (>150 chars) — short reasoning like
  // "Let me call the tool" is just inner monologue that shouldn't be user-facing.
  if (choice?.message && !choice.message.content && choice.message.reasoning) {
    const cleaned = sanitizeContent(choice.message.reasoning.trim());
    if (cleaned.length > 150) {
      choice.message.content = cleaned;
      console.log(`[nvidia-proxy] promoted reasoning→content (${cleaned.length} chars)`);
    } else {
      console.log(`[nvidia-proxy] suppressed short reasoning (${cleaned.length} chars): ${cleaned.slice(0, 80)}...`);
    }
    delete choice.message.reasoning;
  }
  // If model returned empty text (no tool calls), inject fallback so gateway delivers something.
  if (choice?.message && !choice.message.tool_calls?.length && choice.finish_reason !== "tool_calls") {
    if (!choice.message.content || choice.message.content.trim().length === 0) {
      choice.message.content = "I had a brief processing hiccup — could you say that again? 💜";
      console.log(`[nvidia-proxy] injected fallback for empty text response`);
    }
  }
  if (asSSE) {
    if (!clientRes.headersSent) {
      const sseHeaders = { ...headers };
      sseHeaders["content-type"] = "text/event-stream; charset=utf-8";
      delete sseHeaders["content-length"];
      delete sseHeaders["transfer-encoding"];
      // The body is re-serialized below, so an upstream compression header
      // would no longer describe what we actually send.
      delete sseHeaders["content-encoding"];
      sseHeaders["cache-control"] = "no-cache";
      clientRes.writeHead(200, sseHeaders);
    }

    const base = { id: resBody.id, object: "chat.completion.chunk", created: resBody.created, model: resBody.model };

    if (!choice) {
      clientRes.write("data: [DONE]\n\n");
      clientRes.end();
      return;
    }

    const msg = choice.message || {};

    // 1. Role chunk
    clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: { role: msg.role || "assistant" }, finish_reason: null }] })}\n\n`);

    // 2. Content chunks (split into smaller pieces for proper streaming behavior)
    const content = msg.content || "";
    if (content) {
      const chunkSize = 100;
      for (let i = 0; i < content.length; i += chunkSize) {
        clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: { content: content.slice(i, i + chunkSize) }, finish_reason: null }] })}\n\n`);
      }
    }

    // 3. Tool calls (if any) — send as a single delta
    if (msg.tool_calls && msg.tool_calls.length > 0) {
      clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: { tool_calls: msg.tool_calls }, finish_reason: null }] })}\n\n`);
    }

    // 4. Final chunk carrying finish_reason (and usage, when upstream reported it)
    if (resBody.usage) {
      clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: {}, finish_reason: choice.finish_reason || "stop" }], usage: resBody.usage })}\n\n`);
    } else {
      clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: {}, finish_reason: choice.finish_reason || "stop" }] })}\n\n`);
    }

    clientRes.write("data: [DONE]\n\n");
    clientRes.end();
  } else {
    const body = Buffer.from(JSON.stringify(resBody), "utf-8");
    const outHeaders = { ...headers };
    outHeaders["content-length"] = body.length;
    // The re-serialized body is neither chunked nor compressed; forwarding
    // upstream transfer-encoding/content-encoding alongside an explicit
    // content-length would yield an invalid (or undecodable) response.
    delete outHeaders["transfer-encoding"];
    delete outHeaders["content-encoding"];
    clientRes.writeHead(200, outHeaders);
    clientRes.end(body);
  }
}
195
+
196
+ const SINGLE_TOOL_INSTRUCTION =
197
+ "You MUST call exactly ONE tool per response. Never call multiple tools at once.";
198
+
199
+ const MAX_BODY_BYTES = 60000;
200
+
201
/**
 * Trim conversation history to keep body size under MAX_BODY_BYTES.
 * Preserves: system messages, first 2 user/assistant messages (identity/rehydration),
 * and the most recent messages. Drops middle messages first.
 * Tool result messages with large content get their content truncated first.
 *
 * Mutates `parsed.messages` in place; conversations shorter than 6 messages
 * are never touched. The very last fallback accepts a minimal shape even if
 * it is still over budget.
 *
 * @param {object} parsed - parsed chat-completion request body
 */
function trimConversationHistory(parsed) {
  if (!Array.isArray(parsed.messages) || parsed.messages.length < 6) return;

  // Debug: log message roles
  const roleSummary = parsed.messages.map(m => m.role).join(",");
  console.log(`[nvidia-proxy] conversation roles (${parsed.messages.length} msgs): ${roleSummary}`);

  // First pass: truncate large tool results (keep the first 1500 chars)
  for (const m of parsed.messages) {
    if (m.role === "tool" || m.role === "toolResult") {
      if (typeof m.content === "string" && m.content.length > 1500) {
        m.content = m.content.slice(0, 1500) + "\n...[truncated]";
      } else if (Array.isArray(m.content)) {
        // Structured content: truncate each oversized text part.
        for (const c of m.content) {
          if (c.type === "text" && typeof c.text === "string" && c.text.length > 1500) {
            c.text = c.text.slice(0, 1500) + "\n...[truncated]";
          }
        }
      }
    }
  }

  // Check if we're still over budget
  let bodySize = Buffer.byteLength(JSON.stringify(parsed), "utf-8");
  if (bodySize <= MAX_BODY_BYTES) return;

  // Second pass: drop middle messages, then progressively shrink tail until under budget
  const msgs = parsed.messages;
  const system = msgs.filter(m => m.role === "system");
  const nonSystem = msgs.filter(m => m.role !== "system");

  if (nonSystem.length <= 4) return; // not enough to trim

  const keepStart = 2;
  let keepEnd = Math.min(6, nonSystem.length - keepStart);

  // Loop: keep reducing tail until under budget
  while (keepEnd >= 2) {
    const dropped = nonSystem.length - keepStart - keepEnd;
    const trimmed = [
      ...system,
      ...nonSystem.slice(0, keepStart),
      ...(dropped > 0 ? [{ role: "system", content: `[${dropped} earlier messages trimmed to save context]` }] : []),
      ...nonSystem.slice(-keepEnd),
    ];
    const candidateSize = Buffer.byteLength(JSON.stringify({ ...parsed, messages: trimmed }), "utf-8");
    if (candidateSize <= MAX_BODY_BYTES) {
      parsed.messages = trimmed;
      console.log(`[nvidia-proxy] trimmed history: dropped ${dropped} middle messages, keepEnd=${keepEnd}, bodyLen now ~${candidateSize}`);
      return;
    }
    keepEnd--;
  }

  // Last resort: system + first user message + last N non-system
  // Keep enough tail to include tool result pairs (assistant tool_call + tool result)
  const firstUser = nonSystem.find(m => m.role === "user");
  // Try last 4 first (covers tool_call + result + next tool_call + result)
  // Then fall back to last 2 if still too big
  for (const tailSize of [4, 2]) {
    const lastN = nonSystem.slice(-tailSize);
    const minimal = [
      ...system,
      ...(firstUser && !lastN.includes(firstUser) ? [firstUser, { role: "system", content: "[earlier messages trimmed — answer the user's request using tool results below]" }] : []),
      ...lastN,
    ];
    const candidateSize = Buffer.byteLength(JSON.stringify({ ...parsed, messages: minimal }), "utf-8");
    if (candidateSize <= MAX_BODY_BYTES) {
      parsed.messages = minimal;
      console.log(`[nvidia-proxy] trimmed history: AGGRESSIVE — kept system + first user + last ${tailSize}, bodyLen now ~${candidateSize}`);
      return;
    }
  }
  // Absolute last resort: accept the last-2 shape even if it is still over budget.
  const lastTwo = nonSystem.slice(-2);
  const minimal = [
    ...system,
    ...(firstUser && !lastTwo.includes(firstUser) ? [firstUser, { role: "system", content: "[earlier messages trimmed — answer the user's request using tool results below]" }] : []),
    ...lastTwo,
  ];
  parsed.messages = minimal;
  bodySize = Buffer.byteLength(JSON.stringify(parsed), "utf-8");
  console.log(`[nvidia-proxy] trimmed history: AGGRESSIVE — kept system + first user + last 2, bodyLen now ~${bodySize}`);
}
291
+
292
/**
 * Trim system messages to keep total system content under MAX_SYSTEM_BYTES.
 * Finds the largest system messages and truncates them, keeping head + tail
 * with a trimming notice in the middle. Mutates messages in place.
 */
function trimSystemMessages(parsed) {
  if (!Array.isArray(parsed.messages)) return;

  // Total UTF-8 bytes across all string-content system messages.
  const systemBytes = () =>
    parsed.messages
      .filter((m) => m.role === "system" && typeof m.content === "string")
      .reduce((total, m) => total + Buffer.byteLength(m.content, "utf-8"), 0);

  const systemMsgs = parsed.messages.filter((m) => m.role === "system" && typeof m.content === "string");
  if (systemMsgs.length === 0) return;

  const before = systemBytes();
  if (before <= MAX_SYSTEM_BYTES) return;

  // Largest messages first so as few as possible need trimming.
  const bySizeDesc = [...systemMsgs].sort((a, b) => b.content.length - a.content.length);

  let trimmedCount = 0;
  for (const msg of bySizeDesc) {
    // Re-measure: earlier trims may already have brought us under budget.
    if (systemBytes() <= MAX_SYSTEM_BYTES) break;
    // Descending order means once we hit a message at/below 4000 chars,
    // nothing later is worth trimming either.
    if (msg.content.length <= 4000) break;
    const head = msg.content.slice(0, 3000);
    const tail = msg.content.slice(-1000);
    msg.content = head + "\n\n[...content trimmed to save context — use skmemory_ritual tool for full identity...]\n\n" + tail;
    trimmedCount++;
  }

  if (trimmedCount > 0) {
    console.log(`[nvidia-proxy] trimmed system prompt: ${before} → ${systemBytes()} bytes (${trimmedCount} messages trimmed)`);
  }
}
334
+
335
/**
 * Strip tool_calls from conversation history to prevent the model from
 * learning the pattern of calling multiple tools. Converts assistant
 * tool_call messages to plain text and removes tool result messages.
 * Mutates the array in place.
 */
function stripToolCallHistory(messages) {
  if (!Array.isArray(messages)) return;
  // Walk backwards so splicing never skips an element.
  let i = messages.length;
  while (i--) {
    const msg = messages[i];
    if (msg.role === "tool" || msg.role === "toolResult") {
      // Tool results are dropped entirely.
      messages.splice(i, 1);
      continue;
    }
    if (msg.role === "assistant" && msg.tool_calls) {
      // Replace the structured calls with a short textual summary, unless
      // the message already carries its own text.
      const usedTools = msg.tool_calls.map((call) => call.function?.name).join(", ");
      if (!msg.content) msg.content = `[Used: ${usedTools}]`;
      delete msg.tool_calls;
    }
  }
}
355
+
356
/**
 * Priority tools — kept when reducing tool count (order matters: earlier
 * entries receive a larger bonus in reduceTools' scoring).
 */
const PRIORITY_TOOLS = [
  // Core agent tools
  "exec", "read", "write", "edit",
  // Communication (critical for Telegram)
  "message",
  // Memory tools (most frequently needed)
  "skmemory_health", "skmemory_search", "skmemory_snapshot",
  "skmemory_ritual", "skmemory_context", "skmemory_list",
  // Web tools
  "web_search", "web_fetch",
  // Communication (other channels)
  "skchat_send", "skcomm_send",
  // SKCapstone
  "skcapstone_status", "skcapstone_whoami", "skcapstone_mood",
  // Cloud 9
  "cloud9_oof", "cloud9_rehydrate",
  // Memory (infrequent)
  "skmemory_export", "skmemory_import_seeds",
];
376
+
377
/**
 * Reduce the tools array to at most `max` tools, preferring tools
 * mentioned in recent messages and priority tools.
 *
 * @param {Array<object>} tools - OpenAI-style tool definitions
 * @param {Array<object>} messages - conversation messages used for scoring
 * @param {number} max - maximum number of tools to keep
 * @returns {Array<object>} the input unchanged when already <= max, otherwise
 *   the `max` highest-scoring tools in descending score order
 */
function reduceTools(tools, messages, max) {
  if (tools.length <= max) return tools;

  // Hoisted out of the scoring loop: these depend only on `messages`, and
  // recomputing them per tool made scoring O(tools × messages).
  const lastUserMsg = [...(messages || [])].reverse().find(m => m.role === "user");
  const lastUserText = lastUserMsg
    ? (typeof lastUserMsg.content === "string" ? lastUserMsg.content : JSON.stringify(lastUserMsg.content || ""))
    : "";
  const lastUserTextLower = lastUserText.toLowerCase();
  const recentMsgs = (messages || []).slice(-6);

  // Score each tool — higher = more likely to be kept
  const scores = new Map();
  for (const t of tools) {
    const name = t.function?.name || "";
    let score = 0;

    // Boost tools mentioned in the user's last message (strongest signal)
    if (lastUserMsg) {
      if (lastUserText.includes(name)) score += 200;
      // Also match partial names (e.g., "health" matches "skmemory_health")
      const parts = name.split("_");
      for (const part of parts) {
        if (part.length > 3 && lastUserTextLower.includes(part.toLowerCase())) score += 100;
      }
    }

    // Priority list bonus — earlier entries get a bigger bonus
    const prioIdx = PRIORITY_TOOLS.indexOf(name);
    if (prioIdx >= 0) score += 50 - prioIdx;

    // Boost tools the model itself called in recent turns
    for (const m of recentMsgs) {
      if (m.tool_calls) {
        for (const tc of m.tool_calls) {
          if (tc.function?.name === name) score += 80;
        }
      }
    }

    // Penalize the "process" tool specifically (exec covers the critical cases)
    if (name === "process") score -= 30;

    // NOTE: keyed by name, so duplicate tool names collapse to the last one.
    scores.set(name, { tool: t, score });
  }

  const ranked = [...scores.values()].sort((a, b) => b.score - a.score);
  return ranked.slice(0, max).map((s) => s.tool);
}
426
+
427
/**
 * Handle one client request.
 *
 * Non-chat-completion requests (and chat requests without tools) are proxied
 * through unchanged. Tool-bearing chat requests get the full treatment:
 * parallel_tool_calls disabled, streaming forced off (the response is
 * re-encoded as SSE later if the client asked for streaming), tool set
 * proactively reduced, oversized prompts trimmed, and a retry ladder for
 * NVIDIA's 400 "single tool-calls" rejection.
 *
 * @param {http.IncomingMessage} clientReq
 * @param {http.ServerResponse} clientRes
 */
async function proxyRequest(clientReq, clientRes) {
  // Buffer the full client request body before deciding how to handle it.
  const chunks = [];
  for await (const chunk of clientReq) chunks.push(chunk);
  let body = Buffer.concat(chunks);
  const contentType = clientReq.headers["content-type"] || "";

  const isChatCompletion =
    contentType.includes("application/json") &&
    clientReq.url.includes("/chat/completions");

  let parsed = null;
  if (isChatCompletion) {
    try {
      parsed = JSON.parse(body.toString("utf-8"));
    } catch {
      // pass through
    }
  }

  // For non-tool requests or non-chat-completions, just proxy through
  if (!parsed || !parsed.tools || !Array.isArray(parsed.tools) || parsed.tools.length === 0) {
    const res = await sendUpstream(clientReq.url, clientReq.method, clientReq.headers, body);
    clientRes.writeHead(res.status, res.headers);
    clientRes.end(res.body);
    return;
  }

  // Save original tools for reference
  const allTools = [...parsed.tools];

  // Tool request — proactively limit tools to reduce parallel call tendency
  parsed.parallel_tool_calls = false;
  // Force non-streaming for tool requests — proxy buffers full response anyway,
  // and streaming (SSE) prevents us from inspecting/fixing tool calls
  const wasStreaming = parsed.stream;
  parsed.stream = false;
  delete parsed.stream_options;
  // With 94 tools the model almost always tries parallel calls.
  // Reduce to max 12 most relevant tools on first attempt.
  if (allTools.length > 12) {
    parsed.tools = reduceTools(allTools, parsed.messages, 12);
    const names = parsed.tools.map(t => t.function?.name).join(",");
    console.log(`[nvidia-proxy] proactive reduction: ${allTools.length}→${parsed.tools.length} tools [${names}]`);
  }

  // Add system instruction to force single tool call
  // NOTE(review): the duplicate check looks for "ONE tool at a time" but the
  // injected SINGLE_TOOL_INSTRUCTION says "ONE tool per response", so the
  // instruction may be prepended on every round — confirm intended.
  if (Array.isArray(parsed.messages)) {
    const hasInstruction = parsed.messages.some(
      (m) => m.role === "system" && typeof m.content === "string" && m.content.includes("ONE tool at a time"),
    );
    if (!hasInstruction) {
      parsed.messages.unshift({
        role: "system",
        content: SINGLE_TOOL_INSTRUCTION,
      });
    }
  }

  // Trim system messages FIRST to free up budget for conversation history
  trimSystemMessages(parsed);
  trimConversationHistory(parsed);

  // Track tool call rounds per-model to avoid cross-session interference.
  // A trailing tool result means the model just finished a tool round; a
  // trailing user message starts a fresh count.
  if (Array.isArray(parsed.messages) && parsed.tools?.length > 0) {
    const modelKey = parsed.model || "unknown";
    const nonSystemMsgs = parsed.messages.filter(m => m.role !== "system");
    const lastNonSystem = nonSystemMsgs[nonSystemMsgs.length - 1];
    const hasToolResult = lastNonSystem?.role === "tool" || lastNonSystem?.role === "toolResult";

    let counter = toolCallCounters.get(modelKey) || 0;
    if (hasToolResult) {
      counter++;
    } else if (lastNonSystem?.role === "user") {
      counter = 0;
    }
    toolCallCounters.set(modelKey, counter);

    // After 6 consecutive tool rounds, force a plain-text answer.
    if (counter >= 6) {
      console.log(`[nvidia-proxy] TOOL LIMIT: ${counter} consecutive tool rounds (${modelKey}) — stripping tools, forcing text response`);
      parsed.tools = [];
      delete parsed.tool_choice;
      parsed.messages.push({
        role: "system",
        content: "STOP calling tools. You have made 6+ tool calls already. NOW respond to the user with a comprehensive text answer based on what you've gathered. Do NOT call any more tools. Do NOT output any special tokens or markup like <|tool_call_begin|> or <|tool_calls_section_begin|>. Write plain text only. Start your response with a greeting or summary — no XML, no special tokens, just normal language.",
      });
      toolCallCounters.set(modelKey, 0);
    }
  }

  const model = parsed.model || "unknown";

  // If client wanted streaming, start SSE headers early so we can send keep-alive
  // comments while waiting for NVIDIA. This keeps the gateway's typing indicator alive.
  let sseStarted = false;
  let keepAliveTimer = null;
  function startSSEKeepAlive() {
    if (!wasStreaming || sseStarted) return;
    sseStarted = true;
    clientRes.writeHead(200, {
      "content-type": "text/event-stream; charset=utf-8",
      "cache-control": "no-cache",
      "connection": "keep-alive",
    });
    // SSE comment lines (leading ":") are ignored by clients but keep the
    // connection visibly alive.
    keepAliveTimer = setInterval(() => {
      try { clientRes.write(": keep-alive\n\n"); } catch {}
    }, 5000);
  }
  function stopKeepAlive() {
    if (keepAliveTimer) { clearInterval(keepAliveTimer); keepAliveTimer = null; }
  }

  // Retry ladder: attempt 1 = up to 12 tools; 2 = 6 tools + stripped history;
  // 3 = 1 forced tool; 4 = no tools (text-only).
  for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
    const currentToolCount = parsed.tools ? parsed.tools.length : 0;
    const reqBody = Buffer.from(JSON.stringify(parsed), "utf-8");
    console.log(
      `[nvidia-proxy] ${new Date().toISOString()} attempt=${attempt} model=${model} tools=${currentToolCount} bodyLen=${reqBody.length}`,
    );

    // Start keep-alive comments while NVIDIA processes
    if (wasStreaming) startSSEKeepAlive();

    let res;
    // Handle 429 rate limiting with internal retries + backoff
    for (let r429 = 0; r429 <= MAX_429_RETRIES; r429++) {
      res = await sendUpstream(clientReq.url, clientReq.method, clientReq.headers, reqBody);
      if (res.status !== 429 || r429 === MAX_429_RETRIES) break;
      const delay = RATE_LIMIT_DELAY_MS * (r429 + 1);
      console.log(`[nvidia-proxy] 429 rate limited, waiting ${delay}ms (retry ${r429 + 1}/${MAX_429_RETRIES})...`);
      await new Promise(r => setTimeout(r, delay));
    }

    if (res.status === 400) {
      const errText = res.body.toString("utf-8");
      if (errText.includes("single tool-calls") && attempt < MAX_RETRIES) {
        console.log(`[nvidia-proxy] 400 parallel tool-calls rejected, retrying (${attempt}/${MAX_RETRIES})...`);

        if (attempt === 1) {
          // Attempt 2: reduce to 6 tools + strip tool_calls from history
          // The massive conversation history with tool_calls trains the model to call multiple
          parsed.tools = reduceTools(allTools, parsed.messages, 6);
          stripToolCallHistory(parsed.messages);
          const toolNames = parsed.tools.map(t => t.function?.name).join(",");
          console.log(`[nvidia-proxy] retry: ${parsed.tools.length} tools [${toolNames}], stripped history`);
        } else if (attempt === 2) {
          // Attempt 3: single tool, forced choice
          parsed.tools = reduceTools(allTools, parsed.messages, 1);
          const topTool = parsed.tools[0]?.function?.name;
          if (topTool) {
            parsed.tool_choice = { type: "function", function: { name: topTool } };
          }
          console.log(`[nvidia-proxy] retry: 1 tool, forced=${topTool}`);
        } else {
          // Attempt 4 (final): strip all tools, text-only
          delete parsed.tools;
          delete parsed.tool_choice;
          delete parsed.parallel_tool_calls;
          stripToolCallHistory(parsed.messages);
          console.log(`[nvidia-proxy] final retry: stripped all tools, text-only`);
        }
        continue;
      }
    }

    // Log tool calls in successful responses
    if (res.status === 200) {
      try {
        const bodyStr = res.body.toString("utf-8");
        const peek = JSON.parse(bodyStr);
        const tc = peek.choices?.[0]?.message?.tool_calls;
        if (tc && tc.length > 0) {
          const names = tc.map(c => c.function?.name).join(", ");
          console.log(`[nvidia-proxy] model called: [${names}] (${tc.length} calls)`);
        } else {
          const content = peek.choices?.[0]?.message?.content;
          const fr = peek.choices?.[0]?.finish_reason;
          console.log(`[nvidia-proxy] model response: text (${content ? content.length : 0} chars) finish_reason=${fr}`);
          if (!content || content.length === 0) {
            console.log(`[nvidia-proxy] EMPTY RESPONSE DEBUG: ${JSON.stringify(peek.choices?.[0]).slice(0, 500)}`);
          }
        }
      } catch {
        // SSE streaming responses can't be parsed as JSON — this is expected
      }
    }

    // Fix ghost tool calls: finish_reason says "tool_calls" but no actual tool_calls present
    if (res.status === 200 && parsed.tools) {
      try {
        const resBody = JSON.parse(res.body.toString("utf-8"));
        const choice = resBody.choices?.[0];
        if (choice && (choice.finish_reason === "tool_calls" || choice.finish_reason === "function_call") && !choice.message?.tool_calls?.length) {
          console.warn(`[nvidia-proxy] GHOST TOOL CALL: finish_reason=${choice.finish_reason} but no tool_calls — fixing to stop`);
          choice.finish_reason = "stop";
          stopKeepAlive();
          sendOk(clientRes, resBody, res.headers, wasStreaming);
          return;
        }
      } catch {
        // Not JSON — pass through
      }
    }

    // Check for hallucinated/invalid tool names (e.g., Kimi K2.5 "callauto" bug)
    if (res.status === 200 && parsed.tools) {
      try {
        const resBody = JSON.parse(res.body.toString("utf-8"));
        const choice = resBody.choices?.[0];
        if (choice?.message?.tool_calls) {
          // Compare against ALL original tools, not just the reduced set
          const allToolNames = new Set(allTools.map(t => t.function?.name));
          const invalidCalls = choice.message.tool_calls.filter(
            tc => !tc.function?.name || !allToolNames.has(tc.function.name)
          );
          if (invalidCalls.length > 0) {
            const badNames = invalidCalls.map(tc => tc.function?.name || "(empty)").join(", ");
            console.warn(`[nvidia-proxy] CALLAUTO DETECTED: invalid tool names [${badNames}] — stripping tool_calls, returning text-only`);
            // Strip invalid tool calls, keep only content
            choice.message.tool_calls = choice.message.tool_calls.filter(
              tc => tc.function?.name && allToolNames.has(tc.function.name)
            );
            if (choice.message.tool_calls.length === 0) {
              delete choice.message.tool_calls;
              choice.finish_reason = "stop";
            }
            stopKeepAlive();
            sendOk(clientRes, resBody, res.headers, wasStreaming);
            return;
          }
        }
      } catch {
        // Not JSON — pass through
      }
    }

    // Check for successful response with multiple tool calls — trim to just the first one
    if (res.status === 200 && parsed.tools) {
      try {
        const resBody = JSON.parse(res.body.toString("utf-8"));
        const choice = resBody.choices?.[0];
        if (choice?.message?.tool_calls && choice.message.tool_calls.length > 1) {
          console.log(
            `[nvidia-proxy] trimming ${choice.message.tool_calls.length} tool_calls to 1 (${choice.message.tool_calls[0].function?.name})`,
          );
          choice.message.tool_calls = [choice.message.tool_calls[0]];
          stopKeepAlive();
          sendOk(clientRes, resBody, res.headers, wasStreaming);
          return;
        }
      } catch {
        // Not JSON or parse error — pass through as-is
      }
    }

    // Success or non-retryable error
    stopKeepAlive();
    if (res.status >= 400) {
      console.error(`[nvidia-proxy] ${res.status} ERROR: ${res.body.toString("utf-8").slice(0, 300)}`);
      if (!clientRes.headersSent) {
        clientRes.writeHead(res.status, res.headers);
      }
      clientRes.end(res.body);
      return;
    }

    console.log(`[nvidia-proxy] ${res.status} OK (attempt ${attempt})`);
    if (wasStreaming && res.status === 200) {
      // Client asked for streaming: re-encode the buffered JSON as SSE.
      try {
        const resBody = JSON.parse(res.body.toString("utf-8"));
        sendOk(clientRes, resBody, res.headers, true);
      } catch {
        // Can't parse — send raw
        if (!clientRes.headersSent) {
          clientRes.writeHead(res.status, res.headers);
        }
        clientRes.end(res.body);
      }
    } else {
      if (!clientRes.headersSent) {
        clientRes.writeHead(res.status, res.headers);
      }
      clientRes.end(res.body);
    }
    return;
  }
}
712
+
713
const server = http.createServer(proxyRequest);

// Bind to loopback only — the proxy forwards the caller's auth headers
// verbatim and is not meant to be reachable from other hosts.
server.listen(port, "127.0.0.1", () => {
  console.log(`[nvidia-proxy] listening on http://127.0.0.1:${port}`);
  console.log(`[nvidia-proxy] proxying to ${targetUrl.origin}`);
  console.log(`[nvidia-proxy] retry strategy: 12 tools→6 tools→1 tool (forced)→text-only (max ${MAX_RETRIES} attempts)`);
  console.log(`[nvidia-proxy] also trims multi-tool responses to single tool call`);
});

// Graceful shutdown: stop accepting connections, then exit once closed.
for (const sig of ["SIGINT", "SIGTERM"]) {
  process.on(sig, () => {
    console.log(`[nvidia-proxy] ${sig} received, shutting down`);
    server.close(() => process.exit(0));
  });
}