@smilintux/skcapstone 0.4.6 → 0.4.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/.github/workflows/publish.yml +8 -1
  2. package/docs/CUSTOM_AGENT.md +184 -0
  3. package/docs/GETTING_STARTED.md +3 -0
  4. package/launchd/com.skcapstone.daemon.plist +52 -0
  5. package/launchd/com.skcapstone.memory-compress.plist +45 -0
  6. package/launchd/com.skcapstone.skcomm-heartbeat.plist +33 -0
  7. package/launchd/com.skcapstone.skcomm-queue-drain.plist +34 -0
  8. package/launchd/install-launchd.sh +156 -0
  9. package/package.json +1 -1
  10. package/pyproject.toml +1 -1
  11. package/scripts/archive-sessions.sh +88 -0
  12. package/scripts/install.sh +39 -8
  13. package/scripts/notion-api.py +259 -0
  14. package/scripts/nvidia-proxy.mjs +856 -0
  15. package/scripts/proxy-monitor.sh +89 -0
  16. package/scripts/skgateway.mjs +856 -0
  17. package/scripts/telegram-catchup-all.sh +136 -0
  18. package/src/skcapstone/__init__.py +1 -1
  19. package/src/skcapstone/blueprints/builtins/itil-operations.yaml +40 -0
  20. package/src/skcapstone/cli/__init__.py +2 -0
  21. package/src/skcapstone/cli/daemon.py +116 -41
  22. package/src/skcapstone/cli/itil.py +434 -0
  23. package/src/skcapstone/consciousness_config.py +27 -0
  24. package/src/skcapstone/coordination.py +1 -0
  25. package/src/skcapstone/daemon.py +19 -11
  26. package/src/skcapstone/dreaming.py +761 -0
  27. package/src/skcapstone/fuse_mount.py +21 -13
  28. package/src/skcapstone/heartbeat.py +33 -29
  29. package/src/skcapstone/itil.py +1104 -0
  30. package/src/skcapstone/launchd.py +426 -0
  31. package/src/skcapstone/mcp_server.py +258 -0
  32. package/src/skcapstone/mcp_tools/__init__.py +2 -0
  33. package/src/skcapstone/mcp_tools/gtd_tools.py +1 -1
  34. package/src/skcapstone/mcp_tools/itil_tools.py +657 -0
  35. package/src/skcapstone/onboard.py +130 -10
  36. package/src/skcapstone/scheduled_tasks.py +107 -0
  37. package/src/skcapstone/service_health.py +81 -2
  38. package/src/skcapstone/systemd.py +17 -0
@@ -0,0 +1,856 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * NVIDIA NIM API Proxy
4
+ *
5
+ * Sits between OpenClaw and the NVIDIA NIM API. Handles the fact that
6
+ * NVIDIA NIM rejects responses with multiple tool calls (400 error)
7
+ * even when parallel_tool_calls: false is set.
8
+ *
9
+ * Strategy:
10
+ * 1. Inject parallel_tool_calls: false + system instruction
11
+ * 2. On 400 "single tool-calls": reduce to max 6 tools + force tool_choice
12
+ * 3. On second 400: send with just 1 tool (the most likely one) via tool_choice
13
+ * 4. Final fallback: strip tools, get text-only response
14
+ *
15
+ * Usage:
16
+ * node nvidia-proxy.mjs [--port 18780] [--target https://integrate.api.nvidia.com/v1]
17
+ *
18
+ * Then point OpenClaw's nvidia provider baseUrl to http://127.0.0.1:18780/v1
19
+ */
20
+
21
+ import http from "node:http";
22
+ import https from "node:https";
23
+ import { URL } from "node:url";
24
+
25
+ const DEFAULT_PORT = parseInt(process.env.NVIDIA_PROXY_PORT || "18780", 10);
26
+ const DEFAULT_TARGET = process.env.NVIDIA_PROXY_TARGET || "https://integrate.api.nvidia.com/v1";
27
+ const MAX_RETRIES = 4;
28
+ const MAX_429_RETRIES = 3;
29
+ const RATE_LIMIT_DELAY_MS = 2000;
30
+ const MAX_SYSTEM_BYTES = 40000;
31
+ const toolCallCounters = new Map(); // Per-model tool call counters
32
+
33
+ const args = process.argv.slice(2);
34
+ let port = DEFAULT_PORT;
35
+ let targetBase = DEFAULT_TARGET;
36
+
37
+ for (let i = 0; i < args.length; i++) {
38
+ if (args[i] === "--port" && args[i + 1]) port = parseInt(args[++i], 10);
39
+ if (args[i] === "--target" && args[i + 1]) targetBase = args[++i];
40
+ }
41
+
42
+ const targetUrl = new URL(targetBase.replace(/\/v1\/?$/, ""));
43
+
44
+ /** Send a request to NVIDIA and return { status, headers, body } */
45
+ function sendUpstream(reqUrl, method, headers, body) {
46
+ return new Promise((resolve) => {
47
+ const upstream = new URL(reqUrl, targetUrl);
48
+ const proxyHeaders = { ...headers };
49
+ proxyHeaders.host = upstream.host;
50
+ proxyHeaders["content-length"] = body.length;
51
+ delete proxyHeaders.connection;
52
+ delete proxyHeaders["keep-alive"];
53
+
54
+ const transport = upstream.protocol === "https:" ? https : http;
55
+ const upstreamReq = transport.request(
56
+ {
57
+ hostname: upstream.hostname,
58
+ port: upstream.port || (upstream.protocol === "https:" ? 443 : 80),
59
+ path: upstream.pathname + upstream.search,
60
+ method,
61
+ headers: proxyHeaders,
62
+ },
63
+ (upstreamRes) => {
64
+ const chunks = [];
65
+ upstreamRes.on("data", (c) => chunks.push(c));
66
+ upstreamRes.on("end", () => {
67
+ resolve({
68
+ status: upstreamRes.statusCode,
69
+ headers: upstreamRes.headers,
70
+ body: Buffer.concat(chunks),
71
+ });
72
+ });
73
+ },
74
+ );
75
+ upstreamReq.on("error", (err) => {
76
+ resolve({ status: 502, headers: {}, body: Buffer.from(JSON.stringify({ error: { message: err.message } })) });
77
+ });
78
+ upstreamReq.write(body);
79
+ upstreamReq.end();
80
+ });
81
+ }
82
+
83
+ /**
84
+ * Send a 200 response, converting to SSE if the original request was streaming.
85
+ * @param {http.ServerResponse} clientRes
86
+ * @param {object} resBody - parsed JSON response body
87
+ * @param {object} headers - upstream response headers
88
+ * @param {boolean} asSSE - whether to wrap as SSE
89
+ */
90
+ /**
91
+ * Sanitize model text content — strip leaked tool call markup from Kimi K2.5.
92
+ * When tools are stripped, Kimi embeds raw tool syntax in text output.
93
+ */
94
+ function sanitizeContent(text) {
95
+ if (!text) return text;
96
+ // Strip Kimi's leaked tool call markup blocks
97
+ let cleaned = text.replace(/<\|tool_calls_section_begin\|>[\s\S]*?<\|tool_calls_section_end\|>/g, "");
98
+ // Strip individual tool call fragments that might not have the section wrapper
99
+ cleaned = cleaned.replace(/<\|tool_call_begin\|>[\s\S]*?<\|tool_call_end\|>/g, "");
100
+ cleaned = cleaned.replace(/<\|tool_call_argument_begin\|>[\s\S]*?(<\|tool_call_end\|>|$)/g, "");
101
+
102
+ // Strip leaked chain-of-thought / planning text.
103
+ // Kimi sometimes outputs its reasoning as user-visible text, e.g.:
104
+ // "The user wants me to... I should first... Let me call the ritual tool first."
105
+ // Detect: starts with "The user wants me to" or "I need to" or "I should" followed
106
+ // by planning language and ending before any real content.
107
+ const thinkingPattern = /^(The user wants me to|I need to|I should|Let me first|First,? I'?ll|I'?ll start by|My plan is to)[^\n]*\n?(\n?(I should|I need to|Let me|I'?ll|Then I|First|Next)[^\n]*\n?)*/i;
108
+ const thinkingMatch = cleaned.match(thinkingPattern);
109
+ if (thinkingMatch) {
110
+ const thinkingText = thinkingMatch[0];
111
+ const remainder = cleaned.slice(thinkingText.length).trim();
112
+ // Only strip if the thinking block is the ENTIRE response or is followed by real content
113
+ if (!remainder) {
114
+ // Entire response is just planning — suppress it, let the tool call go through
115
+ console.log(`[nvidia-proxy] SANITIZED: stripped leaked thinking (${thinkingText.length} chars)`);
116
+ cleaned = "";
117
+ } else if (remainder.length > 50) {
118
+ // Has real content after the thinking preamble — keep only the real part
119
+ console.log(`[nvidia-proxy] SANITIZED: stripped thinking preamble (${thinkingText.length} chars), kept ${remainder.length} chars`);
120
+ cleaned = remainder;
121
+ }
122
+ }
123
+
124
+ // Clean up leftover whitespace from removed blocks
125
+ cleaned = cleaned.replace(/\n{3,}/g, "\n\n").trim();
126
+ if (cleaned !== text) {
127
+ console.log(`[nvidia-proxy] SANITIZED: stripped leaked tool call markup (${text.length} → ${cleaned.length} chars)`);
128
+ }
129
+ // Don't inject fallback here — let sendOk() handle it, since it knows
130
+ // whether tool_calls exist alongside the empty content. Injecting here
131
+ // causes false "hiccup" messages when the model made a valid tool call
132
+ // but its text content was all leaked markup/thinking.
133
+ return cleaned;
134
+ }
135
+
136
+ function sendOk(clientRes, resBody, headers, asSSE) {
137
+ // Sanitize text content before sending
138
+ const choice = resBody.choices?.[0];
139
+ if (choice?.message?.content) {
140
+ choice.message.content = sanitizeContent(choice.message.content);
141
+ }
142
+ // Kimi K2.5 sometimes puts its response in "reasoning" instead of "content"
143
+ // Only promote if reasoning is substantial (>200 chars) — short reasoning like
144
+ // "Let me call the tool" is just inner monologue that shouldn't be user-facing
145
+ if (choice?.message && !choice.message.content && choice.message.reasoning) {
146
+ const cleaned = sanitizeContent(choice.message.reasoning.trim());
147
+ if (cleaned.length > 150) {
148
+ choice.message.content = cleaned;
149
+ console.log(`[nvidia-proxy] promoted reasoning→content (${cleaned.length} chars)`);
150
+ } else {
151
+ console.log(`[nvidia-proxy] suppressed short reasoning (${cleaned.length} chars): ${cleaned.slice(0, 80)}...`);
152
+ }
153
+ delete choice.message.reasoning;
154
+ }
155
+ // If model returned empty text (no tool calls), inject fallback so gateway delivers something
156
+ if (choice?.message && !choice.message.tool_calls?.length && choice.finish_reason !== "tool_calls") {
157
+ if (!choice.message.content || choice.message.content.trim().length === 0) {
158
+ choice.message.content = "I had a brief processing hiccup — could you say that again? 💜";
159
+ console.log(`[nvidia-proxy] injected fallback for empty text response`);
160
+ }
161
+ }
162
+ if (asSSE) {
163
+ if (!clientRes.headersSent) {
164
+ const sseHeaders = { ...headers };
165
+ sseHeaders["content-type"] = "text/event-stream; charset=utf-8";
166
+ delete sseHeaders["content-length"];
167
+ delete sseHeaders["transfer-encoding"];
168
+ sseHeaders["cache-control"] = "no-cache";
169
+ clientRes.writeHead(200, sseHeaders);
170
+ }
171
+
172
+ const base = { id: resBody.id, object: "chat.completion.chunk", created: resBody.created, model: resBody.model };
173
+ const choice = resBody.choices?.[0];
174
+
175
+ if (!choice) {
176
+ clientRes.write("data: [DONE]\n\n");
177
+ clientRes.end();
178
+ return;
179
+ }
180
+
181
+ const msg = choice.message || {};
182
+
183
+ // 1. Role chunk
184
+ clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: { role: msg.role || "assistant" }, finish_reason: null }] })}\n\n`);
185
+
186
+ // 2. Content chunks (split into smaller pieces for proper streaming behavior)
187
+ const content = msg.content || "";
188
+ if (content) {
189
+ const chunkSize = 100;
190
+ for (let i = 0; i < content.length; i += chunkSize) {
191
+ clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: { content: content.slice(i, i + chunkSize) }, finish_reason: null }] })}\n\n`);
192
+ }
193
+ }
194
+
195
+ // 3. Tool calls (if any) — send as a single delta
196
+ if (msg.tool_calls && msg.tool_calls.length > 0) {
197
+ clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: { tool_calls: msg.tool_calls }, finish_reason: null }] })}\n\n`);
198
+ }
199
+
200
+ // 4. Usage chunk (if present)
201
+ if (resBody.usage) {
202
+ clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: {}, finish_reason: choice.finish_reason || "stop" }], usage: resBody.usage })}\n\n`);
203
+ } else {
204
+ clientRes.write(`data: ${JSON.stringify({ ...base, choices: [{ index: 0, delta: {}, finish_reason: choice.finish_reason || "stop" }] })}\n\n`);
205
+ }
206
+
207
+ clientRes.write("data: [DONE]\n\n");
208
+ clientRes.end();
209
+ } else {
210
+ const body = Buffer.from(JSON.stringify(resBody), "utf-8");
211
+ const outHeaders = { ...headers };
212
+ outHeaders["content-length"] = body.length;
213
+ clientRes.writeHead(200, outHeaders);
214
+ clientRes.end(body);
215
+ }
216
+ }
217
+
218
+ const SINGLE_TOOL_INSTRUCTION =
219
+ "You MUST call exactly ONE tool per response. Never call multiple tools at once.";
220
+
221
+ const MAX_BODY_BYTES = 120000;
222
+
223
+ /**
224
+ * Trim conversation history to keep body size under MAX_BODY_BYTES.
225
+ * Preserves: system messages, first 2 user/assistant messages (identity/rehydration),
226
+ * and the most recent messages. Drops middle messages first.
227
+ * Tool result messages with large content get their content truncated first.
228
+ */
229
+ function trimConversationHistory(parsed) {
230
+ if (!Array.isArray(parsed.messages) || parsed.messages.length < 6) return;
231
+
232
+ // Debug: log message roles
233
+ const roleSummary = parsed.messages.map(m => m.role).join(",");
234
+ console.log(`[nvidia-proxy] conversation roles (${parsed.messages.length} msgs): ${roleSummary}`);
235
+
236
+ // First pass: truncate large tool results (keep first 500 chars)
237
+ for (const m of parsed.messages) {
238
+ if (m.role === "tool" || m.role === "toolResult") {
239
+ if (typeof m.content === "string" && m.content.length > 1500) {
240
+ m.content = m.content.slice(0, 1500) + "\n...[truncated]";
241
+ } else if (Array.isArray(m.content)) {
242
+ for (const c of m.content) {
243
+ if (c.type === "text" && typeof c.text === "string" && c.text.length > 1500) {
244
+ c.text = c.text.slice(0, 1500) + "\n...[truncated]";
245
+ }
246
+ }
247
+ }
248
+ }
249
+ }
250
+
251
+ // Check if we're still over budget
252
+ let bodySize = Buffer.byteLength(JSON.stringify(parsed), "utf-8");
253
+ if (bodySize <= MAX_BODY_BYTES) return;
254
+
255
+ // Second pass: drop middle messages, then progressively shrink tail until under budget
256
+ const msgs = parsed.messages;
257
+ const system = msgs.filter(m => m.role === "system");
258
+ const nonSystem = msgs.filter(m => m.role !== "system");
259
+
260
+ if (nonSystem.length <= 4) return; // not enough to trim
261
+
262
+ const keepStart = 2;
263
+ let keepEnd = Math.min(12, nonSystem.length - keepStart);
264
+
265
+ // Loop: keep reducing tail until under budget
266
+ while (keepEnd >= 2) {
267
+ const dropped = nonSystem.length - keepStart - keepEnd;
268
+ const trimmed = [
269
+ ...system,
270
+ ...nonSystem.slice(0, keepStart),
271
+ ...(dropped > 0 ? [{ role: "system", content: `[${dropped} earlier messages trimmed to save context]` }] : []),
272
+ ...nonSystem.slice(-keepEnd),
273
+ ];
274
+ const candidateSize = Buffer.byteLength(JSON.stringify({ ...parsed, messages: trimmed }), "utf-8");
275
+ if (candidateSize <= MAX_BODY_BYTES) {
276
+ parsed.messages = trimmed;
277
+ console.log(`[nvidia-proxy] trimmed history: dropped ${dropped} middle messages, keepEnd=${keepEnd}, bodyLen now ~${candidateSize}`);
278
+ return;
279
+ }
280
+ keepEnd--;
281
+ }
282
+
283
+ // Last resort: system + first user message + last N non-system
284
+ // Keep enough tail to include tool result pairs (assistant tool_call + tool result)
285
+ const firstUser = nonSystem.find(m => m.role === "user");
286
+ // Try last 4 first (covers tool_call + result + next tool_call + result)
287
+ // Then fall back to last 2 if still too big
288
+ for (const tailSize of [4, 2]) {
289
+ const lastN = nonSystem.slice(-tailSize);
290
+ const minimal = [
291
+ ...system,
292
+ ...(firstUser && !lastN.includes(firstUser) ? [firstUser, { role: "system", content: "[earlier messages trimmed — answer the user's request using tool results below]" }] : []),
293
+ ...lastN,
294
+ ];
295
+ const candidateSize = Buffer.byteLength(JSON.stringify({ ...parsed, messages: minimal }), "utf-8");
296
+ if (candidateSize <= MAX_BODY_BYTES) {
297
+ parsed.messages = minimal;
298
+ console.log(`[nvidia-proxy] trimmed history: AGGRESSIVE — kept system + first user + last ${tailSize}, bodyLen now ~${candidateSize}`);
299
+ return;
300
+ }
301
+ }
302
+ // Absolute last resort
303
+ const lastTwo = nonSystem.slice(-2);
304
+ const minimal = [
305
+ ...system,
306
+ ...(firstUser && !lastTwo.includes(firstUser) ? [firstUser, { role: "system", content: "[earlier messages trimmed — answer the user's request using tool results below]" }] : []),
307
+ ...lastTwo,
308
+ ];
309
+ parsed.messages = minimal;
310
+ bodySize = Buffer.byteLength(JSON.stringify(parsed), "utf-8");
311
+ console.log(`[nvidia-proxy] trimmed history: AGGRESSIVE — kept system + first user + last 2, bodyLen now ~${bodySize}`);
312
+ }
313
+
314
+ /**
315
+ * Trim system messages to keep total system content under MAX_SYSTEM_BYTES.
316
+ * Finds the largest system messages and truncates them, keeping head + tail
317
+ * with a trimming notice in the middle.
318
+ */
319
+ function trimSystemMessages(parsed) {
320
+ if (!Array.isArray(parsed.messages)) return;
321
+
322
+ const systemMsgs = parsed.messages.filter(m => m.role === "system" && typeof m.content === "string");
323
+ if (systemMsgs.length === 0) return;
324
+
325
+ const before = systemMsgs.reduce((sum, m) => sum + Buffer.byteLength(m.content, "utf-8"), 0);
326
+ if (before <= MAX_SYSTEM_BYTES) return;
327
+
328
+ let trimmedCount = 0;
329
+
330
+ // Sort by size descending to trim largest first
331
+ const sorted = [...systemMsgs].sort((a, b) => b.content.length - a.content.length);
332
+
333
+ for (const msg of sorted) {
334
+ // Re-measure current total
335
+ const currentTotal = parsed.messages
336
+ .filter(m => m.role === "system" && typeof m.content === "string")
337
+ .reduce((sum, m) => sum + Buffer.byteLength(m.content, "utf-8"), 0);
338
+ if (currentTotal <= MAX_SYSTEM_BYTES) break;
339
+
340
+ // Skip messages already under 4000 chars
341
+ if (msg.content.length <= 4000) break;
342
+
343
+ const head = msg.content.slice(0, 3000);
344
+ const tail = msg.content.slice(-1000);
345
+ msg.content = head + "\n\n[...content trimmed to save context — use skmemory_ritual tool for full identity...]\n\n" + tail;
346
+ trimmedCount++;
347
+ }
348
+
349
+ if (trimmedCount > 0) {
350
+ const after = parsed.messages
351
+ .filter(m => m.role === "system" && typeof m.content === "string")
352
+ .reduce((sum, m) => sum + Buffer.byteLength(m.content, "utf-8"), 0);
353
+ console.log(`[nvidia-proxy] trimmed system prompt: ${before} → ${after} bytes (${trimmedCount} messages trimmed)`);
354
+ }
355
+ }
356
+
357
+ /**
358
+ * Strip tool_calls from conversation history to prevent the model from
359
+ * learning the pattern of calling multiple tools. Converts assistant
360
+ * tool_call messages to plain text and removes tool result messages.
361
+ */
362
+ function stripToolCallHistory(messages) {
363
+ if (!Array.isArray(messages)) return;
364
+ // Remove tool result messages
365
+ for (let i = messages.length - 1; i >= 0; i--) {
366
+ const m = messages[i];
367
+ if (m.role === "tool" || m.role === "toolResult") {
368
+ messages.splice(i, 1);
369
+ } else if (m.role === "assistant" && m.tool_calls) {
370
+ // Convert tool_call messages to plain text summaries
371
+ const toolNames = m.tool_calls.map((tc) => tc.function?.name).join(", ");
372
+ m.content = m.content || `[Used: ${toolNames}]`;
373
+ delete m.tool_calls;
374
+ }
375
+ }
376
+ }
377
+
378
+ /** Tools that ALWAYS survive reduction — guaranteed slots, never cut */
379
+ const GUARANTEED_TOOLS = [
380
+ "exec", "read", "write", "edit", "message",
381
+ ];
382
+
383
+ /**
384
+ * Semantic keyword → tool group mapping.
385
+ * When keywords appear in the user's last message, the associated tools
386
+ * get a +300 boost (stronger than any other signal) so they make the cut.
387
+ */
388
+ const TOOL_GROUPS = {
389
+ // Emotions & Cloud 9
390
+ "emotion|oof|feb|feeling|love|cloud9|cloud 9|rehydrat|warmth|heart": [
391
+ "cloud9_generate", "cloud9_rehydrate", "cloud9_list", "cloud9_validate",
392
+ "cloud9_oof", "cloud9_love", "cloud9_seed_plant", "cloud9_seed_germinate",
393
+ ],
394
+ // GTD & Coordination
395
+ "gtd|inbox|task|todo|coordination|coord|board|claim|assign": [
396
+ "skcapstone_coord_status", "skcapstone_coord_claim", "skcapstone_coord_complete",
397
+ "skcapstone_coord_create", "skcapstone_summary",
398
+ ],
399
+ // Git & Code
400
+ "git|repo|commit|pull request|pr|issue|branch|merge|forgejo": [
401
+ "skgit_repos", "skgit_issues", "skgit_create_issue", "skgit_pulls", "skgit_status",
402
+ ],
403
+ // Chat & Communication
404
+ "chat|inbox|dm|group chat|peer|send message|who.s online|thread": [
405
+ "skchat_send", "skchat_inbox", "skchat_history", "skchat_search",
406
+ "skchat_who", "skchat_group_send", "skchat_group_list", "skchat_send_file",
407
+ "skchat_status", "skcomm_send", "skcomm_status",
408
+ ],
409
+ // Security
410
+ "security|scan|secret|vulnerab|audit|injection|phishing|threat": [
411
+ "sksecurity_scan", "sksecurity_screen", "sksecurity_secrets",
412
+ "sksecurity_events", "sksecurity_status", "sksecurity_audit",
413
+ ],
414
+ // Identity & Auth
415
+ "identity|did|auth|pma|capauth|verify|mesh|peer": [
416
+ "capauth_profile", "capauth_verify", "capauth_pma_status",
417
+ "capauth_mesh_peers", "capauth_mesh_status",
418
+ ],
419
+ // Soul & Agent management
420
+ "soul|persona|swap|agent|switch soul|who am i|whoami": [
421
+ "skcapstone_soul_list", "skcapstone_soul_swap", "skcapstone_soul_status",
422
+ "skcapstone_soul_show", "skcapstone_agent_list", "skcapstone_agent_status",
423
+ "skcapstone_whoami",
424
+ ],
425
+ // Web & Research
426
+ "search|web|browse|fetch|url|google|look up|find online": [
427
+ "web_search", "web_fetch",
428
+ ],
429
+ // Status & Health
430
+ "status|health|doctor|diagnos": [
431
+ "skcapstone_status", "skcapstone_doctor", "skmemory_health",
432
+ "skchat_daemon_status", "skcomm_status",
433
+ ],
434
+ // Projects & Notion (Lumina delegates to project-ops via sessions_spawn)
435
+ "notion|project|brother john|swapseat|swap seat|chiro|davidrich|board|kanban|milestone": [
436
+ "notion_read", "notion_append", "notion_add_todo", "sessions_spawn", "subagents", "exec", "read",
437
+ ],
438
+ };
439
+
440
+ /** Pre-compile keyword regexes for TOOL_GROUPS */
441
+ const TOOL_GROUP_ENTRIES = Object.entries(TOOL_GROUPS).map(([keywords, tools]) => ({
442
+ regex: new RegExp(keywords, "i"),
443
+ tools,
444
+ }));
445
+
446
+ /** Priority tools — kept when reducing tool count (order matters) */
447
+ const PRIORITY_TOOLS = [
448
+ // Core agent tools (also guaranteed above)
449
+ "exec", "read", "write", "edit",
450
+ // Communication (critical for Telegram)
451
+ "message",
452
+ // Memory tools (most frequently needed)
453
+ "skmemory_health", "skmemory_search", "skmemory_snapshot",
454
+ "skmemory_ritual", "skmemory_context", "skmemory_list",
455
+ // Web tools
456
+ "web_search", "web_fetch",
457
+ // Communication (other channels)
458
+ "skchat_send", "skcomm_send",
459
+ // SKCapstone
460
+ "skcapstone_status", "skcapstone_whoami", "skcapstone_mood",
461
+ // Cloud 9
462
+ "cloud9_oof", "cloud9_rehydrate",
463
+ // Memory (infrequent)
464
+ "skmemory_export", "skmemory_import_seeds",
465
+ ];
466
+
467
+ /**
468
+ * Reduce the tools array to at most `max` tools, preferring tools
469
+ * mentioned in recent messages and priority tools.
470
+ * GUARANTEED_TOOLS always survive — remaining slots filled by score.
471
+ */
472
+ function reduceTools(tools, messages, max) {
473
+ if (tools.length <= max) return tools;
474
+
475
+ // Separate guaranteed tools from the rest
476
+ const guaranteed = [];
477
+ const rest = [];
478
+ for (const t of tools) {
479
+ const name = t.function?.name || "";
480
+ if (GUARANTEED_TOOLS.includes(name)) {
481
+ guaranteed.push(t);
482
+ } else {
483
+ rest.push(t);
484
+ }
485
+ }
486
+
487
+ // If guaranteed tools already fill the budget, return just those
488
+ if (guaranteed.length >= max) return guaranteed.slice(0, max);
489
+
490
+ // Score remaining tools — higher = more likely to be kept
491
+ const remainingSlots = max - guaranteed.length;
492
+ const scores = new Map();
493
+
494
+ // Extract user's last message text once for all scoring
495
+ const lastUserMsg = [...(messages || [])].reverse().find(m => m.role === "user");
496
+ const userText = lastUserMsg
497
+ ? (typeof lastUserMsg.content === "string" ? lastUserMsg.content : JSON.stringify(lastUserMsg.content || ""))
498
+ : "";
499
+
500
+ // Determine which tool groups are activated by the user's message
501
+ const activatedTools = new Set();
502
+ if (userText) {
503
+ for (const { regex, tools: groupTools } of TOOL_GROUP_ENTRIES) {
504
+ if (regex.test(userText)) {
505
+ for (const t of groupTools) activatedTools.add(t);
506
+ }
507
+ }
508
+ if (activatedTools.size > 0) {
509
+ console.log(`[nvidia-proxy] keyword-activated tools: [${[...activatedTools].join(",")}]`);
510
+ }
511
+ }
512
+
513
+ for (const t of rest) {
514
+ const name = t.function?.name || "";
515
+ let score = 0;
516
+
517
+ // STRONGEST: Semantic keyword group match (+300)
518
+ if (activatedTools.has(name)) score += 300;
519
+
520
+ // Boost tools mentioned in the user's last message
521
+ if (userText) {
522
+ if (userText.includes(name)) score += 200;
523
+ // Also match partial names (e.g., "health" matches "skmemory_health")
524
+ const parts = name.split("_");
525
+ for (const part of parts) {
526
+ if (part.length > 3 && userText.toLowerCase().includes(part.toLowerCase())) score += 100;
527
+ }
528
+ }
529
+
530
+ // Priority list bonus
531
+ const prioIdx = PRIORITY_TOOLS.indexOf(name);
532
+ if (prioIdx >= 0) score += 50 - prioIdx;
533
+
534
+ // Boost tools in recent assistant tool_calls
535
+ const recentMsgs = (messages || []).slice(-6);
536
+ for (const m of recentMsgs) {
537
+ if (m.tool_calls) {
538
+ for (const tc of m.tool_calls) {
539
+ if (tc.function?.name === name) score += 80;
540
+ }
541
+ }
542
+ }
543
+
544
+ // Penalize process tool (exec is critical for agent operation)
545
+ if (name === "process") score -= 30;
546
+
547
+ scores.set(name, { tool: t, score });
548
+ }
549
+
550
+ const sorted = [...scores.values()].sort((a, b) => b.score - a.score);
551
+ const topRest = sorted.slice(0, remainingSlots).map((s) => s.tool);
552
+ return [...guaranteed, ...topRest];
553
+ }
554
+
555
+ async function proxyRequest(clientReq, clientRes) {
556
+ const chunks = [];
557
+ for await (const chunk of clientReq) chunks.push(chunk);
558
+ let body = Buffer.concat(chunks);
559
+ const contentType = clientReq.headers["content-type"] || "";
560
+
561
+ const isChatCompletion =
562
+ contentType.includes("application/json") &&
563
+ clientReq.url.includes("/chat/completions");
564
+
565
+ let parsed = null;
566
+ if (isChatCompletion) {
567
+ try {
568
+ parsed = JSON.parse(body.toString("utf-8"));
569
+ } catch {
570
+ // pass through
571
+ }
572
+ }
573
+
574
+ // For non-tool requests or non-chat-completions, just proxy through
575
+ if (!parsed || !parsed.tools || !Array.isArray(parsed.tools) || parsed.tools.length === 0) {
576
+ const res = await sendUpstream(clientReq.url, clientReq.method, clientReq.headers, body);
577
+ clientRes.writeHead(res.status, res.headers);
578
+ clientRes.end(res.body);
579
+ return;
580
+ }
581
+
582
+ // Save original tools for reference
583
+ const allTools = [...parsed.tools];
584
+
585
+ // Tool request — proactively limit tools to reduce parallel call tendency
586
+ parsed.parallel_tool_calls = false;
587
+ // Force non-streaming for tool requests — proxy buffers full response anyway,
588
+ // and streaming (SSE) prevents us from inspecting/fixing tool calls
589
+ const wasStreaming = parsed.stream;
590
+ parsed.stream = false;
591
+ delete parsed.stream_options;
592
+ // With 94 tools the model almost always tries parallel calls.
593
+ // Reduce to max 16 most relevant tools on first attempt.
594
+ // 5 guaranteed (exec,read,write,edit,message) + 11 scored slots.
595
+ if (allTools.length > 16) {
596
+ parsed.tools = reduceTools(allTools, parsed.messages, 16);
597
+ const names = parsed.tools.map(t => t.function?.name).join(",");
598
+ console.log(`[nvidia-proxy] proactive reduction: ${allTools.length}→${parsed.tools.length} tools [${names}]`);
599
+ }
600
+
601
+ // Add system instruction to force single tool call
602
+ if (Array.isArray(parsed.messages)) {
603
+ const hasInstruction = parsed.messages.some(
604
+ (m) => m.role === "system" && typeof m.content === "string" && m.content.includes("ONE tool at a time"),
605
+ );
606
+ if (!hasInstruction) {
607
+ parsed.messages.unshift({
608
+ role: "system",
609
+ content: SINGLE_TOOL_INSTRUCTION,
610
+ });
611
+ }
612
+ }
613
+
614
+ // Trim system messages FIRST to free up budget for conversation history
615
+ trimSystemMessages(parsed);
616
+ trimConversationHistory(parsed);
617
+
618
+ // Track tool call rounds per-model to avoid cross-session interference.
619
+ if (Array.isArray(parsed.messages) && parsed.tools?.length > 0) {
620
+ const modelKey = parsed.model || "unknown";
621
+ const nonSystemMsgs = parsed.messages.filter(m => m.role !== "system");
622
+ const lastNonSystem = nonSystemMsgs[nonSystemMsgs.length - 1];
623
+ const hasToolResult = lastNonSystem?.role === "tool" || lastNonSystem?.role === "toolResult";
624
+
625
+ let counter = toolCallCounters.get(modelKey) || 0;
626
+ if (hasToolResult) {
627
+ counter++;
628
+ } else if (lastNonSystem?.role === "user") {
629
+ counter = 0;
630
+ }
631
+ toolCallCounters.set(modelKey, counter);
632
+
633
+ if (counter >= 10) {
634
+ console.log(`[nvidia-proxy] TOOL LIMIT: ${counter} consecutive tool rounds (${modelKey}) — stripping tools, forcing text response`);
635
+ parsed.tools = [];
636
+ delete parsed.tool_choice;
637
+ parsed.messages.push({
638
+ role: "system",
639
+ content: "STOP calling tools. You have made 10+ tool calls already. NOW respond to the user with a comprehensive text answer based on what you've gathered. Do NOT call any more tools. Do NOT output any special tokens or markup like <|tool_call_begin|> or <|tool_calls_section_begin|>. Write plain text only. Start your response with a greeting or summary — no XML, no special tokens, just normal language.",
640
+ });
641
+ toolCallCounters.set(modelKey, 0);
642
+ }
643
+ }
644
+
645
+ const model = parsed.model || "unknown";
646
+
647
+ // If client wanted streaming, start SSE headers early so we can send keep-alive
648
+ // comments while waiting for NVIDIA. This keeps the gateway's typing indicator alive.
649
+ let sseStarted = false;
650
+ let keepAliveTimer = null;
651
+ function startSSEKeepAlive() {
652
+ if (!wasStreaming || sseStarted) return;
653
+ sseStarted = true;
654
+ clientRes.writeHead(200, {
655
+ "content-type": "text/event-stream; charset=utf-8",
656
+ "cache-control": "no-cache",
657
+ "connection": "keep-alive",
658
+ });
659
+ keepAliveTimer = setInterval(() => {
660
+ try { clientRes.write(": keep-alive\n\n"); } catch {}
661
+ }, 5000);
662
+ }
663
+ function stopKeepAlive() {
664
+ if (keepAliveTimer) { clearInterval(keepAliveTimer); keepAliveTimer = null; }
665
+ }
666
+
667
// Upstream request/retry loop. Each attempt sends the (possibly rewritten)
// request to NVIDIA. A 400 "single tool-calls" rejection progressively
// degrades the tool setup (8 tools -> 1 forced tool -> text-only) and retries;
// every other path writes a response to the client and returns.
for (let attempt = 1; attempt <= MAX_RETRIES; attempt++) {
  const currentToolCount = parsed.tools ? parsed.tools.length : 0;
  const reqBody = Buffer.from(JSON.stringify(parsed), "utf-8");
  console.log(
    `[nvidia-proxy] ${new Date().toISOString()} attempt=${attempt} model=${model} tools=${currentToolCount} bodyLen=${reqBody.length}`,
  );

  // Start keep-alive comments while NVIDIA processes
  if (wasStreaming) startSSEKeepAlive();

  let res;
  // Handle 429 rate limiting with internal retries + backoff
  // (linear backoff: the wait grows by RATE_LIMIT_DELAY_MS each retry).
  for (let r429 = 0; r429 <= MAX_429_RETRIES; r429++) {
    res = await sendUpstream(clientReq.url, clientReq.method, clientReq.headers, reqBody);
    if (res.status !== 429 || r429 === MAX_429_RETRIES) break;
    const delay = RATE_LIMIT_DELAY_MS * (r429 + 1);
    console.log(`[nvidia-proxy] 429 rate limited, waiting ${delay}ms (retry ${r429 + 1}/${MAX_429_RETRIES})...`);
    await new Promise(r => setTimeout(r, delay));
  }

  // A 400 whose body mentions "single tool-calls" means the endpoint rejected
  // parallel tool calls — rewrite the request and try again.
  if (res.status === 400) {
    const errText = res.body.toString("utf-8");
    if (errText.includes("single tool-calls") && attempt < MAX_RETRIES) {
      console.log(`[nvidia-proxy] 400 parallel tool-calls rejected, retrying (${attempt}/${MAX_RETRIES})...`);

      if (attempt === 1) {
        // Attempt 2: reduce to 8 tools + strip tool_calls from history.
        // The massive conversation history with tool_calls trains the model to call multiple
        // tools in parallel, which this endpoint rejects.
        parsed.tools = reduceTools(allTools, parsed.messages, 8);
        stripToolCallHistory(parsed.messages);
        const toolNames = parsed.tools.map(t => t.function?.name).join(",");
        console.log(`[nvidia-proxy] retry: ${parsed.tools.length} tools [${toolNames}], stripped history`);
      } else if (attempt === 2) {
        // Attempt 3: single tool, forced choice
        parsed.tools = reduceTools(allTools, parsed.messages, 1);
        const topTool = parsed.tools[0]?.function?.name;
        if (topTool) {
          parsed.tool_choice = { type: "function", function: { name: topTool } };
        }
        console.log(`[nvidia-proxy] retry: 1 tool, forced=${topTool}`);
      } else {
        // Attempt 4 (final): strip all tools, text-only
        delete parsed.tools;
        delete parsed.tool_choice;
        delete parsed.parallel_tool_calls;
        stripToolCallHistory(parsed.messages);
        console.log(`[nvidia-proxy] final retry: stripped all tools, text-only`);
      }
      continue;
    }
  }

  // Log tool calls in successful responses (diagnostic only — the parsed
  // "peek" copy is discarded and the response is not modified here).
  if (res.status === 200) {
    try {
      const bodyStr = res.body.toString("utf-8");
      const peek = JSON.parse(bodyStr);
      const tc = peek.choices?.[0]?.message?.tool_calls;
      if (tc && tc.length > 0) {
        const names = tc.map(c => c.function?.name).join(", ");
        console.log(`[nvidia-proxy] model called: [${names}] (${tc.length} calls)`);
      } else {
        const content = peek.choices?.[0]?.message?.content;
        const fr = peek.choices?.[0]?.finish_reason;
        console.log(`[nvidia-proxy] model response: text (${content ? content.length : 0} chars) finish_reason=${fr}`);
        if (!content || content.length === 0) {
          console.log(`[nvidia-proxy] EMPTY RESPONSE DEBUG: ${JSON.stringify(peek.choices?.[0]).slice(0, 500)}`);
        }
      }
    } catch {
      // SSE streaming responses can't be parsed as JSON — this is expected
    }
  }

  // Fix ghost tool calls: finish_reason says "tool_calls" but no actual tool_calls present
  if (res.status === 200 && parsed.tools) {
    try {
      const resBody = JSON.parse(res.body.toString("utf-8"));
      const choice = resBody.choices?.[0];
      if (choice && (choice.finish_reason === "tool_calls" || choice.finish_reason === "function_call") && !choice.message?.tool_calls?.length) {
        console.warn(`[nvidia-proxy] GHOST TOOL CALL: finish_reason=${choice.finish_reason} but no tool_calls — fixing to stop`);
        choice.finish_reason = "stop";
        stopKeepAlive();
        sendOk(clientRes, resBody, res.headers, wasStreaming);
        return;
      }
    } catch {
      // Not JSON — pass through
    }
  }

  // Check for hallucinated/invalid tool names (e.g., Kimi K2.5 "callauto" bug)
  if (res.status === 200 && parsed.tools) {
    try {
      const resBody = JSON.parse(res.body.toString("utf-8"));
      const choice = resBody.choices?.[0];
      if (choice?.message?.tool_calls) {
        // Compare against ALL original tools, not just the reduced set
        const allToolNames = new Set(allTools.map(t => t.function?.name));
        const invalidCalls = choice.message.tool_calls.filter(
          tc => !tc.function?.name || !allToolNames.has(tc.function.name)
        );
        if (invalidCalls.length > 0) {
          const badNames = invalidCalls.map(tc => tc.function?.name || "(empty)").join(", ");
          console.warn(`[nvidia-proxy] CALLAUTO DETECTED: invalid tool names [${badNames}] — stripping tool_calls, returning text-only`);
          // Strip invalid tool calls, keep only content
          choice.message.tool_calls = choice.message.tool_calls.filter(
            tc => tc.function?.name && allToolNames.has(tc.function.name)
          );
          if (choice.message.tool_calls.length === 0) {
            delete choice.message.tool_calls;
            choice.finish_reason = "stop";
          }
          stopKeepAlive();
          sendOk(clientRes, resBody, res.headers, wasStreaming);
          return;
        }
      }
    } catch {
      // Not JSON — pass through
    }
  }

  // Check for successful response with multiple tool calls — trim to just the first one
  // (the upstream accepts only single tool-calls; see the 400 handling above).
  if (res.status === 200 && parsed.tools) {
    try {
      const resBody = JSON.parse(res.body.toString("utf-8"));
      const choice = resBody.choices?.[0];
      if (choice?.message?.tool_calls && choice.message.tool_calls.length > 1) {
        console.log(
          `[nvidia-proxy] trimming ${choice.message.tool_calls.length} tool_calls to 1 (${choice.message.tool_calls[0].function?.name})`,
        );
        choice.message.tool_calls = [choice.message.tool_calls[0]];
        stopKeepAlive();
        sendOk(clientRes, resBody, res.headers, wasStreaming);
        return;
      }
    } catch {
      // Not JSON or parse error — pass through as-is
    }
  }

  // Success or non-retryable error
  stopKeepAlive();
  if (res.status >= 400) {
    console.error(`[nvidia-proxy] ${res.status} ERROR: ${res.body.toString("utf-8").slice(0, 300)}`);
    // headersSent guard: if the SSE keep-alive already wrote a 200 header we
    // can only append the error body to the already-open stream.
    if (!clientRes.headersSent) {
      clientRes.writeHead(res.status, res.headers);
    }
    clientRes.end(res.body);
    return;
  }

  console.log(`[nvidia-proxy] ${res.status} OK (attempt ${attempt})`);
  if (wasStreaming && res.status === 200) {
    try {
      const resBody = JSON.parse(res.body.toString("utf-8"));
      sendOk(clientRes, resBody, res.headers, true);
    } catch {
      // Can't parse — send raw
      if (!clientRes.headersSent) {
        clientRes.writeHead(res.status, res.headers);
      }
      clientRes.end(res.body);
    }
  } else {
    if (!clientRes.headersSent) {
      clientRes.writeHead(res.status, res.headers);
    }
    clientRes.end(res.body);
  }
  return;
}
840
+ }
841
+
842
const server = http.createServer(proxyRequest);

server.listen(port, "127.0.0.1", () => {
  console.log(`[nvidia-proxy] listening on http://127.0.0.1:${port}`);
  console.log(`[nvidia-proxy] proxying to ${targetUrl.origin}`);
  console.log(`[nvidia-proxy] retry strategy: 16 tools (5 guaranteed)→8 tools→1 tool (forced)→text-only (max ${MAX_RETRIES} attempts)`);
  console.log(`[nvidia-proxy] also trims multi-tool responses to single tool call`);
});

// Graceful shutdown. server.close() only invokes its callback once every
// connection has ended — but this proxy holds long-lived SSE responses with a
// 5s keep-alive heartbeat, so without extra help the process would never exit
// on SIGINT/SIGTERM while a stream is open.
for (const sig of ["SIGINT", "SIGTERM"]) {
  process.on(sig, () => {
    console.log(`[nvidia-proxy] ${sig} received, shutting down`);
    server.close(() => process.exit(0));
    // Tear down in-flight connections (incl. open SSE streams) so close() can
    // complete. Optional-chained: closeAllConnections requires Node >= 18.2.
    server.closeAllConnections?.();
    // Safety net: force exit shortly even if something refuses to die.
    // unref() keeps this timer from itself holding the event loop open.
    setTimeout(() => process.exit(0), 5000).unref();
  });
}