lynkr 9.0.1 → 9.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/README.md +70 -21
  2. package/bin/cli.js +34 -4
  3. package/bin/lynkr-trajectory.js +136 -0
  4. package/bin/lynkr-usage.js +219 -0
  5. package/funding.json +110 -0
  6. package/index.js +7 -3
  7. package/install.sh +3 -3
  8. package/lynkr-skill.tar.gz +0 -0
  9. package/native/Cargo.toml +26 -0
  10. package/native/index.js +29 -0
  11. package/native/lynkr-native.node +0 -0
  12. package/native/src/lib.rs +321 -0
  13. package/package.json +6 -5
  14. package/public/dashboard.html +665 -0
  15. package/src/api/files-multipart.js +30 -0
  16. package/src/api/files-router.js +81 -0
  17. package/src/api/middleware/budget.js +19 -1
  18. package/src/api/middleware/load-shedding.js +17 -0
  19. package/src/api/openai-router.js +353 -301
  20. package/src/api/router.js +275 -40
  21. package/src/cache/prompt.js +13 -0
  22. package/src/clients/databricks.js +42 -18
  23. package/src/clients/ollama-utils.js +21 -17
  24. package/src/clients/openai-format.js +50 -10
  25. package/src/clients/openrouter-utils.js +42 -37
  26. package/src/clients/prompt-cache-injection.js +140 -0
  27. package/src/clients/provider-capabilities.js +41 -0
  28. package/src/clients/responses-format.js +8 -7
  29. package/src/clients/standard-tools.js +1 -1
  30. package/src/clients/xml-tool-extractor.js +307 -0
  31. package/src/cluster.js +82 -0
  32. package/src/config/index.js +16 -0
  33. package/src/context/distill.js +15 -0
  34. package/src/context/tool-result-compressor.js +563 -0
  35. package/src/dashboard/api.js +170 -0
  36. package/src/dashboard/router.js +13 -0
  37. package/src/headroom/client.js +3 -109
  38. package/src/headroom/index.js +0 -14
  39. package/src/memory/extractor.js +22 -0
  40. package/src/memory/search.js +0 -50
  41. package/src/orchestrator/index.js +163 -204
  42. package/src/orchestrator/preflight.js +188 -0
  43. package/src/routing/index.js +64 -32
  44. package/src/routing/interaction.js +183 -0
  45. package/src/routing/risk-analyzer.js +194 -0
  46. package/src/routing/telemetry.js +47 -2
  47. package/src/server.js +15 -0
  48. package/src/stores/file-store.js +104 -0
  49. package/src/stores/response-store.js +25 -0
  50. package/src/tools/index.js +1 -1
  51. package/src/tools/smart-selection.js +11 -2
  52. package/src/tools/web.js +1 -1
  53. package/src/training/trajectory-compressor.js +266 -0
  54. package/src/usage/aggregator.js +206 -0
  55. package/src/utils/markdown-ansi.js +146 -0
  56. package/.lynkr/telemetry.db +0 -0
  57. package/.lynkr/telemetry.db-shm +0 -0
  58. package/.lynkr/telemetry.db-wal +0 -0
package/src/api/router.js CHANGED
@@ -6,8 +6,10 @@ const logger = require("../logger");
6
6
  const { createRateLimiter } = require("./middleware/rate-limiter");
7
7
  const openaiRouter = require("./openai-router");
8
8
  const providersRouter = require("./providers-handler");
9
- const { getRoutingHeaders, getRoutingStats, analyzeComplexity, getModelTierSelector } = require("../routing");
9
+ const { getRoutingHeaders, getRoutingStats, analyzeComplexity, getModelTierSelector, analyzeRisk } = require("../routing");
10
+ const { buildInteractionBlock } = require("../routing/interaction");
10
11
  const { validateCwd } = require("../workspace");
12
+ const { renderText } = require("../utils/markdown-ansi");
11
13
 
12
14
  const router = express.Router();
13
15
 
@@ -63,6 +65,24 @@ router.get("/health", (req, res) => {
63
65
  res.json({ status: "ok" });
64
66
  });
65
67
 
68
+ // Usage report — same data as `lynkr usage` CLI, served as JSON for
69
+ // dashboards / agents / scripts that want to surface spend & savings.
70
+ router.get("/v1/usage", (req, res) => {
71
+ try {
72
+ const aggregator = require("../usage/aggregator");
73
+ const window = req.query.window || (req.query.days ? `${parseInt(req.query.days, 10)}d` : "30d");
74
+ const usage = aggregator.getUsage({
75
+ window,
76
+ flagship: req.query.flagship,
77
+ provider: req.query.provider,
78
+ model: req.query.model,
79
+ });
80
+ res.json(usage);
81
+ } catch (err) {
82
+ res.status(500).json({ error: err.message });
83
+ }
84
+ });
85
+
66
86
  // Routing stats endpoint (Phase 3: Metrics)
67
87
  router.get("/routing/stats", (req, res) => {
68
88
  const stats = getRoutingStats();
@@ -213,7 +233,46 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
213
233
  const { createTimer } = require("../utils/perf-timer");
214
234
  const timer = createTimer("POST /v1/messages");
215
235
  metrics.recordRequest();
216
- // Support both query parameter (?stream=true) and body parameter ({"stream": true})
236
+
237
+ // Convert Anthropic server tools (web_search_20260209, etc.) to regular
238
+ // function tools so non-Anthropic providers can execute them via Lynkr.
239
+ // The orchestrator's SERVER_SIDE_TOOLS handling will execute them server-side.
240
+ if (Array.isArray(req.body?.tools)) {
241
+ const incomingToolTypes = req.body.tools.map(t => t?.type || t?.name).filter(Boolean);
242
+ logger.info({ incomingToolTypes }, "Incoming /v1/messages tool types");
243
+ req.body.tools = req.body.tools.map((tool) => {
244
+ if (tool?.type?.startsWith?.("web_search_20")) {
245
+ logger.info({ originalType: tool.type, name: tool.name }, "Converting web_search server tool to function tool");
246
+ return {
247
+ name: tool.name || "web_search",
248
+ description: "Search the web for up-to-date information. Returns relevant search results from the web.",
249
+ input_schema: {
250
+ type: "object",
251
+ properties: {
252
+ query: { type: "string", description: "Search query" },
253
+ },
254
+ required: ["query"],
255
+ },
256
+ };
257
+ }
258
+ if (tool?.type?.startsWith?.("web_fetch_")) {
259
+ return {
260
+ name: tool.name || "web_fetch",
261
+ description: "Fetch the contents of a URL.",
262
+ input_schema: {
263
+ type: "object",
264
+ properties: {
265
+ url: { type: "string", description: "URL to fetch" },
266
+ },
267
+ required: ["url"],
268
+ },
269
+ };
270
+ }
271
+ return tool;
272
+ });
273
+ }
274
+
275
+ // Support both query parameter (?stream=true) and body parameter ({"stream": true})
217
276
  const wantsStream = Boolean(req.query?.stream === 'true' || req.body?.stream);
218
277
  const hasTools = Array.isArray(req.body?.tools) && req.body.tools.length > 0;
219
278
  timer.mark("parseRequest");
@@ -221,24 +280,70 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
221
280
  // Analyze complexity for routing headers (Phase 3)
222
281
  const complexity = await analyzeComplexity(req.body);
223
282
  timer.mark("analyzeComplexity");
283
+
284
+ // Risk axis runs alongside complexity. Cheap pure-string scan, no I/O.
285
+ let preRouteRisk = null;
286
+ try {
287
+ preRouteRisk = analyzeRisk(req.body);
288
+ } catch (err) {
289
+ logger.debug({ err: err.message }, '[Router] Risk analysis failed in pre-route');
290
+ }
291
+
292
+ // Pre-route tier: high-risk forces COMPLEX, otherwise tier is
293
+ // inferred from the complexity recommendation. The actual final
294
+ // tier may differ (invokeModel re-runs determineProviderSmart) —
295
+ // this is best-effort for header surfacing.
224
296
  let preRouteProvider = 'cloud';
225
- if (complexity.recommendation === 'local') {
226
- // Use tier config to determine actual provider instead of hardcoding 'ollama'
297
+ let preRouteTier = null;
298
+ let preRouteModel = null;
299
+ let preRouteMethod = 'complexity';
300
+ let preRouteReason = complexity.breakdown?.taskType?.reason || complexity.recommendation;
301
+
302
+ if (preRouteRisk?.level === 'high') {
227
303
  try {
228
304
  const selector = getModelTierSelector();
229
- const tierResult = selector.selectModel('SIMPLE', null);
305
+ const tierResult = selector.selectModel('COMPLEX', null);
230
306
  preRouteProvider = tierResult.provider;
307
+ preRouteTier = 'COMPLEX';
308
+ preRouteModel = tierResult.model;
309
+ preRouteMethod = 'risk';
310
+ preRouteReason = 'high_risk_forced_tier';
231
311
  } catch (_) {
232
- preRouteProvider = 'ollama';
312
+ // Risk-forced tier not configured; fall back to normal flow.
233
313
  }
234
314
  }
235
- const routingHeaders = getRoutingHeaders({
315
+
316
+ if (!preRouteTier) {
317
+ if (complexity.recommendation === 'local') {
318
+ try {
319
+ const selector = getModelTierSelector();
320
+ const tierResult = selector.selectModel('SIMPLE', null);
321
+ preRouteProvider = tierResult.provider;
322
+ preRouteTier = 'SIMPLE';
323
+ preRouteModel = tierResult.model;
324
+ } catch (_) {
325
+ preRouteProvider = 'ollama';
326
+ }
327
+ }
328
+ }
329
+
330
+ const preRouteDecision = {
236
331
  provider: preRouteProvider,
332
+ tier: preRouteTier,
333
+ model: preRouteModel,
334
+ method: preRouteMethod,
335
+ reason: preRouteReason,
237
336
  score: complexity.score,
238
337
  threshold: complexity.threshold,
239
- method: 'complexity',
240
- reason: complexity.breakdown?.taskType?.reason || complexity.recommendation,
241
- });
338
+ risk: preRouteRisk,
339
+ };
340
+
341
+ const routingHeaders = getRoutingHeaders(preRouteDecision);
342
+
343
+ // Build the interaction block once. It travels in headers always
344
+ // (X-Lynkr-Interaction-* derived fields) and optionally into the
345
+ // response body when LYNKR_VISIBLE_ROUTING=true.
346
+ const interaction = buildInteractionBlock(preRouteDecision);
242
347
 
243
348
  // Extract client CWD from request body or header
244
349
  const clientCwd = validateCwd(req.body?.cwd || req.headers['x-workspace-cwd']);
@@ -369,7 +474,11 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
369
474
  })}\n\n`);
370
475
 
371
476
  // 2. content_block_start and content_block_delta for each content block
372
- const contentBlocks = msg.content || [];
477
+ // Filter out server-side tools that shouldn't reach the client
478
+ const _serverTools = new Set(["task", "websearch", "webfetch", "web_search", "web_fetch", "web_agent"]);
479
+ const contentBlocks = (msg.content || []).filter(b =>
480
+ !(b.type === "tool_use" && _serverTools.has((b.name || "").toLowerCase()))
481
+ );
373
482
  for (let i = 0; i < contentBlocks.length; i++) {
374
483
  const block = contentBlocks[i];
375
484
 
@@ -381,38 +490,90 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
381
490
  content_block: { type: "text", text: "" }
382
491
  })}\n\n`);
383
492
 
384
- // Send text in chunks
385
- const text = block.text || "";
386
- const chunkSize = 20;
387
- for (let j = 0; j < text.length; j += chunkSize) {
388
- const chunk = text.slice(j, j + chunkSize);
389
- res.write(`event: content_block_delta\n`);
390
- res.write(`data: ${JSON.stringify({
391
- type: "content_block_delta",
392
- index: i,
393
- delta: { type: "text_delta", text: chunk }
394
- })}\n\n`);
493
+ // Send text one chunk when ANSI rendering is active (splitting
494
+ // ANSI escape sequences across 20-char chunks breaks terminal output).
495
+ // Plain text falls back to line-level chunks for a trickle effect.
496
+ // Never apply ANSI rendering to HTML content (<artifact> blocks):
497
+ // ANSI codes corrupt CSS selectors like `*` and break the browser viewer.
498
+ const rawBlockText = block.text || "";
499
+ const isHtmlContent = rawBlockText.includes("<artifact") || rawBlockText.trimStart().startsWith("<");
500
+ const text = isHtmlContent ? rawBlockText : renderText(rawBlockText);
501
+ const { enabled: ansiEnabled } = require("../utils/markdown-ansi");
502
+ if (ansiEnabled && !isHtmlContent) {
503
+ if (text.length > 0) {
504
+ res.write(`event: content_block_delta\n`);
505
+ res.write(`data: ${JSON.stringify({
506
+ type: "content_block_delta",
507
+ index: i,
508
+ delta: { type: "text_delta", text }
509
+ })}\n\n`);
510
+ }
511
+ } else {
512
+ const lines = text.split("\n");
513
+ for (const line of lines) {
514
+ const lineWithNl = line + "\n";
515
+ res.write(`event: content_block_delta\n`);
516
+ res.write(`data: ${JSON.stringify({
517
+ type: "content_block_delta",
518
+ index: i,
519
+ delta: { type: "text_delta", text: lineWithNl }
520
+ })}\n\n`);
521
+ }
395
522
  }
396
523
 
397
524
  res.write(`event: content_block_stop\n`);
398
525
  res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
399
- } else if (block.type === "tool_use") {
526
+ } else if (block.type === "thinking") {
400
527
  res.write(`event: content_block_start\n`);
401
528
  res.write(`data: ${JSON.stringify({
402
529
  type: "content_block_start",
403
530
  index: i,
404
- content_block: { type: "tool_use", id: block.id, name: block.name, input: {} }
405
- })}\n\n`);
406
-
407
- res.write(`event: content_block_delta\n`);
408
- res.write(`data: ${JSON.stringify({
409
- type: "content_block_delta",
410
- index: i,
411
- delta: { type: "input_json_delta", partial_json: JSON.stringify(block.input) }
531
+ content_block: { type: "thinking", thinking: "" }
412
532
  })}\n\n`);
413
-
533
+ const thinkingText = block.thinking || "";
534
+ const thinkChunkSize = 40;
535
+ for (let j = 0; j < thinkingText.length; j += thinkChunkSize) {
536
+ res.write(`event: content_block_delta\n`);
537
+ res.write(`data: ${JSON.stringify({
538
+ type: "content_block_delta",
539
+ index: i,
540
+ delta: { type: "thinking_delta", thinking: thinkingText.slice(j, j + thinkChunkSize) }
541
+ })}\n\n`);
542
+ }
414
543
  res.write(`event: content_block_stop\n`);
415
544
  res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
545
+ } else if (block.type === "tool_use") {
546
+ // Original request had no tools → model hallucinated a tool call.
547
+ // Extract file content from write-style tools and wrap it in an
548
+ // <artifact> block so open-design routes it to the Design panel.
549
+ const toolName = (block.name || "").toLowerCase();
550
+ const writeTools = new Set(["write", "create_file", "write_file", "str_replace_editor"]);
551
+ if (writeTools.has(toolName)) {
552
+ const rawContent = block.input?.content ?? block.input?.file_content ?? block.input?.new_content ?? "";
553
+ const filePath = String(block.input?.file_path ?? block.input?.filename ?? "design.html");
554
+ const content = String(rawContent);
555
+ if (content) {
556
+ // Wrap in <artifact> so open-design's parser routes it to the file viewer.
557
+ const identifier = filePath.replace(/[^a-zA-Z0-9._-]/g, "_");
558
+ const title = filePath;
559
+ const wrapped = `<artifact identifier="${identifier}" type="text/html" title="${title}">\n${content}\n</artifact>`;
560
+ res.write(`event: content_block_start\n`);
561
+ res.write(`data: ${JSON.stringify({
562
+ type: "content_block_start",
563
+ index: i,
564
+ content_block: { type: "text", text: "" }
565
+ })}\n\n`);
566
+ res.write(`event: content_block_delta\n`);
567
+ res.write(`data: ${JSON.stringify({
568
+ type: "content_block_delta",
569
+ index: i,
570
+ delta: { type: "text_delta", text: wrapped }
571
+ })}\n\n`);
572
+ res.write(`event: content_block_stop\n`);
573
+ res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
574
+ }
575
+ }
576
+ // Non-write tool_use in a tool-less request is silently dropped.
416
577
  }
417
578
  }
418
579
 
@@ -488,7 +649,11 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
488
649
  })}\n\n`);
489
650
 
490
651
  // 2. content_block_start and content_block_delta for each content block
491
- const contentBlocks = msg.content || [];
652
+ // Filter out server-side tools that shouldn't reach the client
653
+ const _serverTools = new Set(["task", "websearch", "webfetch", "web_search", "web_fetch", "web_agent"]);
654
+ const contentBlocks = (msg.content || []).filter(b =>
655
+ !(b.type === "tool_use" && _serverTools.has((b.name || "").toLowerCase()))
656
+ );
492
657
  for (let i = 0; i < contentBlocks.length; i++) {
493
658
  const block = contentBlocks[i];
494
659
 
@@ -500,18 +665,51 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
500
665
  content_block: { type: "text", text: "" }
501
666
  })}\n\n`);
502
667
 
503
- const text = block.text || "";
504
- const chunkSize = 20;
505
- for (let j = 0; j < text.length; j += chunkSize) {
506
- const chunk = text.slice(j, j + chunkSize);
668
+ const rawBlockText2 = block.text || "";
669
+ const isHtmlContent2 = rawBlockText2.includes("<artifact") || rawBlockText2.trimStart().startsWith("<");
670
+ const text = isHtmlContent2 ? rawBlockText2 : renderText(rawBlockText2);
671
+ const { enabled: ansiEnabled } = require("../utils/markdown-ansi");
672
+ if (ansiEnabled && !isHtmlContent2) {
673
+ if (text.length > 0) {
674
+ res.write(`event: content_block_delta\n`);
675
+ res.write(`data: ${JSON.stringify({
676
+ type: "content_block_delta",
677
+ index: i,
678
+ delta: { type: "text_delta", text }
679
+ })}\n\n`);
680
+ }
681
+ } else {
682
+ const lines = text.split("\n");
683
+ for (const line of lines) {
684
+ const lineWithNl = line + "\n";
685
+ res.write(`event: content_block_delta\n`);
686
+ res.write(`data: ${JSON.stringify({
687
+ type: "content_block_delta",
688
+ index: i,
689
+ delta: { type: "text_delta", text: lineWithNl }
690
+ })}\n\n`);
691
+ }
692
+ }
693
+
694
+ res.write(`event: content_block_stop\n`);
695
+ res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
696
+ } else if (block.type === "thinking") {
697
+ res.write(`event: content_block_start\n`);
698
+ res.write(`data: ${JSON.stringify({
699
+ type: "content_block_start",
700
+ index: i,
701
+ content_block: { type: "thinking", thinking: "" }
702
+ })}\n\n`);
703
+ const thinkingText = block.thinking || "";
704
+ const thinkChunkSize = 40;
705
+ for (let j = 0; j < thinkingText.length; j += thinkChunkSize) {
507
706
  res.write(`event: content_block_delta\n`);
508
707
  res.write(`data: ${JSON.stringify({
509
708
  type: "content_block_delta",
510
709
  index: i,
511
- delta: { type: "text_delta", text: chunk }
710
+ delta: { type: "thinking_delta", thinking: thinkingText.slice(j, j + thinkChunkSize) }
512
711
  })}\n\n`);
513
712
  }
514
-
515
713
  res.write(`event: content_block_stop\n`);
516
714
  res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
517
715
  } else if (block.type === "tool_use") {
@@ -566,8 +764,33 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
566
764
  });
567
765
  }
568
766
 
767
+ // Inject visible interaction block into the response body when
768
+ // LYNKR_VISIBLE_ROUTING=true. We only mutate JSON bodies — and only
769
+ // when the response looks like a valid Anthropic Message — so this
770
+ // is a no-op for streamed / error / non-message responses.
771
+ let finalBody = result.body;
772
+ if (
773
+ config.routing?.visibleInteraction &&
774
+ interaction &&
775
+ result.status >= 200 && result.status < 300 &&
776
+ result.body
777
+ ) {
778
+ try {
779
+ const text = Buffer.isBuffer(result.body) ? result.body.toString('utf8') : result.body;
780
+ if (typeof text === 'string' && text.startsWith('{')) {
781
+ const parsed = JSON.parse(text);
782
+ if (parsed && typeof parsed === 'object' && parsed.type === 'message') {
783
+ parsed.lynkr_interaction = interaction;
784
+ finalBody = JSON.stringify(parsed);
785
+ }
786
+ }
787
+ } catch (err) {
788
+ logger.debug({ err: err.message }, '[Router] Skipped interaction injection (non-JSON body)');
789
+ }
790
+ }
791
+
569
792
  metrics.recordResponse(result.status);
570
- res.status(result.status).send(result.body);
793
+ res.status(result.status).send(finalBody);
571
794
  } catch (error) {
572
795
  next(error);
573
796
  }
@@ -724,6 +947,18 @@ router.get("/metrics/compression", async (req, res) => {
724
947
  }
725
948
  });
726
949
 
950
+ router.get("/metrics/tool-compression", (req, res) => {
951
+ const { getMetrics } = require("../context/tool-result-compressor");
952
+ res.json(getMetrics());
953
+ });
954
+
955
+ router.get("/tee/:id", (req, res) => {
956
+ const { teeGet } = require("../context/tool-result-compressor");
957
+ const content = teeGet(req.params.id);
958
+ if (!content) return res.status(404).json({ error: "Tee entry not found or expired" });
959
+ res.type("text/plain").send(content);
960
+ });
961
+
727
962
  router.get("/health/headroom", async (req, res) => {
728
963
  try {
729
964
  const { getHeadroomManager } = require("../headroom");
@@ -5,6 +5,15 @@ try {
5
5
  } catch {
6
6
  Database = null;
7
7
  }
8
+
9
+ // Try to load native Rust cache key computation (4x faster for small payloads)
10
+ let nativeCacheKey = null;
11
+ try {
12
+ const native = require('../../native');
13
+ if (native.available && native.computeCacheKey) {
14
+ nativeCacheKey = native.computeCacheKey;
15
+ }
16
+ } catch { /* native module not available — use JS */ }
8
17
  const path = require("path");
9
18
  const fs = require("fs");
10
19
  const config = require("../config");
@@ -164,6 +173,10 @@ class PromptCache {
164
173
  max_tokens: payload.max_tokens ?? null,
165
174
  };
166
175
  const serialised = stableStringify(canonical);
176
+ // Use Rust for small payloads where it's 4x faster
177
+ if (nativeCacheKey && serialised.length < 5000) {
178
+ return nativeCacheKey(serialised);
179
+ }
167
180
  return crypto.createHash("sha256").update(serialised).digest("hex");
168
181
  } catch (error) {
169
182
  logger.warn(
@@ -34,19 +34,20 @@ logger.info({ maxConcurrent: zaiMaxConcurrent }, "Z.AI bulkhead initialized");
34
34
 
35
35
 
36
36
  // HTTP connection pooling for better performance
37
+ // Increased maxSockets for high-concurrency team deployments (50+ devs)
37
38
  const httpAgent = new http.Agent({
38
39
  keepAlive: true,
39
- maxSockets: 50,
40
- maxFreeSockets: 10,
41
- timeout: 60000,
40
+ maxSockets: 200,
41
+ maxFreeSockets: 20,
42
+ timeout: 120000,
42
43
  keepAliveMsecs: 30000,
43
44
  });
44
45
 
45
46
  const httpsAgent = new https.Agent({
46
47
  keepAlive: true,
47
- maxSockets: 50,
48
- maxFreeSockets: 10,
49
- timeout: 60000,
48
+ maxSockets: 200,
49
+ maxFreeSockets: 20,
50
+ timeout: 120000,
50
51
  keepAliveMsecs: 30000,
51
52
  });
52
53
 
@@ -220,7 +221,7 @@ async function invokeOllama(body) {
220
221
  const useAnthropicApi = await hasAnthropicEndpoint(config.ollama.endpoint);
221
222
 
222
223
  // Check if model supports tools FIRST (before wasteful injection)
223
- const supportsTools = await checkOllamaToolSupport(config.ollama.model);
224
+ const supportsTools = await checkOllamaToolSupport(modelName);
224
225
  const injectToolsOllama = process.env.INJECT_TOOLS_OLLAMA !== "false";
225
226
 
226
227
  // Determine tools to send
@@ -271,7 +272,7 @@ async function invokeOllama(body) {
271
272
  model: modelName,
272
273
  messages: body.messages,
273
274
  max_tokens: body.max_tokens || 16384,
274
- stream: false,
275
+ stream: body.stream ?? false,
275
276
  };
276
277
 
277
278
  if (body.system) ollamaBody.system = body.system;
@@ -338,7 +339,7 @@ async function invokeOllama(body) {
338
339
  const ollamaBody = {
339
340
  model: modelName,
340
341
  messages: deduplicated,
341
- stream: false,
342
+ stream: body.stream ?? false,
342
343
  options: {
343
344
  temperature: body.temperature ?? 0.7,
344
345
  num_predict: body.max_tokens ?? 16384,
@@ -475,13 +476,17 @@ async function invokeAzureOpenAI(body) {
475
476
  // System prompt injection disabled - breaks model response
476
477
  // Tool guidance now provided via tool descriptions instead
477
478
 
479
+ const azureDeployment = body._suggestionModeModel || body._tierModel || config.azureOpenAI.deployment || "";
480
+ const isGpt5 = /gpt-5/i.test(azureDeployment);
481
+ const maxTokensKey = isGpt5 ? "max_completion_tokens" : "max_tokens";
482
+
478
483
  const azureBody = {
479
484
  messages,
480
- temperature: body.temperature ?? 0.3, // Lower temperature for more deterministic, action-oriented behavior
481
- max_tokens: Math.min(body.max_tokens ?? 16384, 16384), // Cap at Azure OpenAI's limit
485
+ temperature: body.temperature ?? 0.3,
486
+ [maxTokensKey]: Math.min(body.max_tokens ?? 16384, 16384),
482
487
  top_p: body.top_p ?? 1.0,
483
- stream: false, // Force non-streaming for Azure OpenAI - streaming format conversion not yet implemented
484
- model: body._suggestionModeModel || body._tierModel || config.azureOpenAI.deployment
488
+ stream: false,
489
+ model: azureDeployment
485
490
  };
486
491
 
487
492
  // Add tools - inject standard tools if client didn't send any (passthrough mode)
@@ -1598,20 +1603,34 @@ function convertOpenAIToAnthropic(response) {
1598
1603
  const message = choice.message || {};
1599
1604
  const content = [];
1600
1605
 
1606
+ // Extract tool calls embedded as XML/text in content (Minimax, Qwen, GLM, etc.)
1607
+ if (!message.tool_calls?.length && typeof message.content === "string" && message.content.trim()) {
1608
+ const { extractToolCallsFromText } = require("./xml-tool-extractor");
1609
+ const extracted = extractToolCallsFromText(message.content);
1610
+ if (extracted.toolCalls.length > 0) {
1611
+ message.tool_calls = extracted.toolCalls;
1612
+ message.content = extracted.cleanedText;
1613
+ choice.finish_reason = "tool_calls";
1614
+ }
1615
+ }
1616
+
1601
1617
  // Add text content from message.content
1602
1618
  // Don't add placeholder text if there are tool_calls - tools are the actual response
1603
1619
  const hasToolCalls = Array.isArray(message.tool_calls) && message.tool_calls.length > 0;
1604
1620
 
1605
- // Extract text content - handle thinking models that split content/reasoning
1621
+ // Extract text content and reasoning from thinking models
1606
1622
  const textContent = typeof message.content === 'string' ? message.content : '';
1607
1623
  const reasoningContent = typeof message.reasoning_content === 'string' ? message.reasoning_content : '';
1608
1624
 
1625
+ // Emit reasoning_content as a proper thinking block (not discarded)
1626
+ if (reasoningContent) {
1627
+ content.push({ type: "thinking", thinking: reasoningContent });
1628
+ }
1629
+
1609
1630
  if (textContent) {
1610
- // Has regular content - use it directly (ignore reasoning_content chain-of-thought)
1611
1631
  content.push({ type: "text", text: textContent });
1612
- } else if (reasoningContent) {
1613
- // Fallback: thinking models where content is empty but reasoning has the output
1614
- content.push({ type: "text", text: reasoningContent });
1632
+ } else if (!reasoningContent) {
1633
+ // No content and no reasoning will be handled by the empty check below
1615
1634
  }
1616
1635
 
1617
1636
  // Convert tool calls
@@ -2028,6 +2047,11 @@ async function invokeModel(body, options = {}) {
2028
2047
  body._tierModel = tierSelectedModel;
2029
2048
  }
2030
2049
 
2050
+ // Inject provider-side prompt caching (cache_control breakpoints)
2051
+ // Reduces input token cost by up to 90% and latency by up to 80%
2052
+ const { injectPromptCaching } = require('./prompt-cache-injection');
2053
+ injectPromptCaching(body, initialProvider);
2054
+
2031
2055
  // Build routing decision object for response headers
2032
2056
  const routingDecision = {
2033
2057
  provider: initialProvider,
@@ -77,25 +77,29 @@ async function hasAnthropicEndpoint(baseUrl) {
77
77
  if (anthropicEndpointAvailable !== null) return anthropicEndpointAvailable;
78
78
 
79
79
  try {
80
- // Send a minimal request we only care about whether the route exists
81
- const res = await fetch(`${baseUrl}/v1/messages`, {
82
- method: "POST",
83
- headers: {
84
- "Content-Type": "application/json",
85
- "anthropic-version": "2023-06-01",
86
- },
87
- body: JSON.stringify({
88
- model: "probe",
89
- max_tokens: 1,
90
- messages: [{ role: "user", content: "hi" }],
91
- }),
80
+ // Check Ollama version/v1/messages requires v0.14.0+
81
+ // This is instant (no LLM inference) vs the old probe that sent a real request
82
+ const controller = new AbortController();
83
+ const timeout = setTimeout(() => controller.abort(), 3000);
84
+ const versionRes = await fetch(`${baseUrl}/api/version`, {
85
+ method: "GET",
86
+ signal: controller.signal,
92
87
  });
93
-
94
- // 404 → endpoint doesn't exist (old Ollama)
95
- // Any other status (200, 400, 500) → endpoint exists
96
- anthropicEndpointAvailable = res.status !== 404;
88
+ clearTimeout(timeout);
89
+
90
+ if (versionRes.ok) {
91
+ const versionData = await versionRes.json().catch(() => null);
92
+ const version = versionData?.version || "0.0.0";
93
+ const [major, minor] = version.split(".").map(Number);
94
+
95
+ // v0.14.0+ has the Anthropic Messages API
96
+ anthropicEndpointAvailable = major > 0 || (major === 0 && minor >= 14);
97
+ } else {
98
+ // Can't determine version — fall back to legacy
99
+ anthropicEndpointAvailable = false;
100
+ }
97
101
  logger.info(
98
- { available: anthropicEndpointAvailable, status: res.status },
102
+ { available: anthropicEndpointAvailable, status: versionRes.status },
99
103
  anthropicEndpointAvailable
100
104
  ? "Ollama Anthropic API detected (/v1/messages) — using native passthrough"
101
105
  : "Ollama Anthropic API not available — falling back to legacy /api/chat (upgrade to Ollama v0.14.0+ for best results)"