lynkr 9.0.1 → 9.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +70 -21
- package/bin/cli.js +34 -4
- package/bin/lynkr-trajectory.js +136 -0
- package/bin/lynkr-usage.js +219 -0
- package/funding.json +110 -0
- package/index.js +7 -3
- package/install.sh +3 -3
- package/lynkr-skill.tar.gz +0 -0
- package/native/Cargo.toml +26 -0
- package/native/index.js +29 -0
- package/native/lynkr-native.node +0 -0
- package/native/src/lib.rs +321 -0
- package/package.json +6 -5
- package/public/dashboard.html +665 -0
- package/src/api/files-multipart.js +30 -0
- package/src/api/files-router.js +81 -0
- package/src/api/middleware/budget.js +19 -1
- package/src/api/middleware/load-shedding.js +17 -0
- package/src/api/openai-router.js +353 -301
- package/src/api/router.js +275 -40
- package/src/cache/prompt.js +13 -0
- package/src/clients/databricks.js +42 -18
- package/src/clients/ollama-utils.js +21 -17
- package/src/clients/openai-format.js +50 -10
- package/src/clients/openrouter-utils.js +42 -37
- package/src/clients/prompt-cache-injection.js +140 -0
- package/src/clients/provider-capabilities.js +41 -0
- package/src/clients/responses-format.js +8 -7
- package/src/clients/standard-tools.js +1 -1
- package/src/clients/xml-tool-extractor.js +307 -0
- package/src/cluster.js +82 -0
- package/src/config/index.js +16 -0
- package/src/context/distill.js +15 -0
- package/src/context/tool-result-compressor.js +563 -0
- package/src/dashboard/api.js +170 -0
- package/src/dashboard/router.js +13 -0
- package/src/headroom/client.js +3 -109
- package/src/headroom/index.js +0 -14
- package/src/memory/extractor.js +22 -0
- package/src/memory/search.js +0 -50
- package/src/orchestrator/index.js +163 -204
- package/src/orchestrator/preflight.js +188 -0
- package/src/routing/index.js +64 -32
- package/src/routing/interaction.js +183 -0
- package/src/routing/risk-analyzer.js +194 -0
- package/src/routing/telemetry.js +47 -2
- package/src/server.js +15 -0
- package/src/stores/file-store.js +104 -0
- package/src/stores/response-store.js +25 -0
- package/src/tools/index.js +1 -1
- package/src/tools/smart-selection.js +11 -2
- package/src/tools/web.js +1 -1
- package/src/training/trajectory-compressor.js +266 -0
- package/src/usage/aggregator.js +206 -0
- package/src/utils/markdown-ansi.js +146 -0
- package/.lynkr/telemetry.db +0 -0
- package/.lynkr/telemetry.db-shm +0 -0
- package/.lynkr/telemetry.db-wal +0 -0
package/src/api/router.js
CHANGED
|
@@ -6,8 +6,10 @@ const logger = require("../logger");
|
|
|
6
6
|
const { createRateLimiter } = require("./middleware/rate-limiter");
|
|
7
7
|
const openaiRouter = require("./openai-router");
|
|
8
8
|
const providersRouter = require("./providers-handler");
|
|
9
|
-
const { getRoutingHeaders, getRoutingStats, analyzeComplexity, getModelTierSelector } = require("../routing");
|
|
9
|
+
const { getRoutingHeaders, getRoutingStats, analyzeComplexity, getModelTierSelector, analyzeRisk } = require("../routing");
|
|
10
|
+
const { buildInteractionBlock } = require("../routing/interaction");
|
|
10
11
|
const { validateCwd } = require("../workspace");
|
|
12
|
+
const { renderText } = require("../utils/markdown-ansi");
|
|
11
13
|
|
|
12
14
|
const router = express.Router();
|
|
13
15
|
|
|
@@ -63,6 +65,24 @@ router.get("/health", (req, res) => {
|
|
|
63
65
|
res.json({ status: "ok" });
|
|
64
66
|
});
|
|
65
67
|
|
|
68
|
+
// Usage report — same data as `lynkr usage` CLI, served as JSON for
|
|
69
|
+
// dashboards / agents / scripts that want to surface spend & savings.
|
|
70
|
+
router.get("/v1/usage", (req, res) => {
|
|
71
|
+
try {
|
|
72
|
+
const aggregator = require("../usage/aggregator");
|
|
73
|
+
const window = req.query.window || (req.query.days ? `${parseInt(req.query.days, 10)}d` : "30d");
|
|
74
|
+
const usage = aggregator.getUsage({
|
|
75
|
+
window,
|
|
76
|
+
flagship: req.query.flagship,
|
|
77
|
+
provider: req.query.provider,
|
|
78
|
+
model: req.query.model,
|
|
79
|
+
});
|
|
80
|
+
res.json(usage);
|
|
81
|
+
} catch (err) {
|
|
82
|
+
res.status(500).json({ error: err.message });
|
|
83
|
+
}
|
|
84
|
+
});
|
|
85
|
+
|
|
66
86
|
// Routing stats endpoint (Phase 3: Metrics)
|
|
67
87
|
router.get("/routing/stats", (req, res) => {
|
|
68
88
|
const stats = getRoutingStats();
|
|
@@ -213,7 +233,46 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
|
|
|
213
233
|
const { createTimer } = require("../utils/perf-timer");
|
|
214
234
|
const timer = createTimer("POST /v1/messages");
|
|
215
235
|
metrics.recordRequest();
|
|
216
|
-
|
|
236
|
+
|
|
237
|
+
// Convert Anthropic server tools (web_search_20260209, etc.) to regular
|
|
238
|
+
// function tools so non-Anthropic providers can execute them via Lynkr.
|
|
239
|
+
// The orchestrator's SERVER_SIDE_TOOLS handling will execute them server-side.
|
|
240
|
+
if (Array.isArray(req.body?.tools)) {
|
|
241
|
+
const incomingToolTypes = req.body.tools.map(t => t?.type || t?.name).filter(Boolean);
|
|
242
|
+
logger.info({ incomingToolTypes }, "Incoming /v1/messages tool types");
|
|
243
|
+
req.body.tools = req.body.tools.map((tool) => {
|
|
244
|
+
if (tool?.type?.startsWith?.("web_search_20")) {
|
|
245
|
+
logger.info({ originalType: tool.type, name: tool.name }, "Converting web_search server tool to function tool");
|
|
246
|
+
return {
|
|
247
|
+
name: tool.name || "web_search",
|
|
248
|
+
description: "Search the web for up-to-date information. Returns relevant search results from the web.",
|
|
249
|
+
input_schema: {
|
|
250
|
+
type: "object",
|
|
251
|
+
properties: {
|
|
252
|
+
query: { type: "string", description: "Search query" },
|
|
253
|
+
},
|
|
254
|
+
required: ["query"],
|
|
255
|
+
},
|
|
256
|
+
};
|
|
257
|
+
}
|
|
258
|
+
if (tool?.type?.startsWith?.("web_fetch_")) {
|
|
259
|
+
return {
|
|
260
|
+
name: tool.name || "web_fetch",
|
|
261
|
+
description: "Fetch the contents of a URL.",
|
|
262
|
+
input_schema: {
|
|
263
|
+
type: "object",
|
|
264
|
+
properties: {
|
|
265
|
+
url: { type: "string", description: "URL to fetch" },
|
|
266
|
+
},
|
|
267
|
+
required: ["url"],
|
|
268
|
+
},
|
|
269
|
+
};
|
|
270
|
+
}
|
|
271
|
+
return tool;
|
|
272
|
+
});
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
// Support both query parameter (?stream=true) and body parameter ({"stream": true})
|
|
217
276
|
const wantsStream = Boolean(req.query?.stream === 'true' || req.body?.stream);
|
|
218
277
|
const hasTools = Array.isArray(req.body?.tools) && req.body.tools.length > 0;
|
|
219
278
|
timer.mark("parseRequest");
|
|
@@ -221,24 +280,70 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
|
|
|
221
280
|
// Analyze complexity for routing headers (Phase 3)
|
|
222
281
|
const complexity = await analyzeComplexity(req.body);
|
|
223
282
|
timer.mark("analyzeComplexity");
|
|
283
|
+
|
|
284
|
+
// Risk axis runs alongside complexity. Cheap pure-string scan, no I/O.
|
|
285
|
+
let preRouteRisk = null;
|
|
286
|
+
try {
|
|
287
|
+
preRouteRisk = analyzeRisk(req.body);
|
|
288
|
+
} catch (err) {
|
|
289
|
+
logger.debug({ err: err.message }, '[Router] Risk analysis failed in pre-route');
|
|
290
|
+
}
|
|
291
|
+
|
|
292
|
+
// Pre-route tier: high-risk forces COMPLEX, otherwise tier is
|
|
293
|
+
// inferred from the complexity recommendation. The actual final
|
|
294
|
+
// tier may differ (invokeModel re-runs determineProviderSmart) —
|
|
295
|
+
// this is best-effort for header surfacing.
|
|
224
296
|
let preRouteProvider = 'cloud';
|
|
225
|
-
|
|
226
|
-
|
|
297
|
+
let preRouteTier = null;
|
|
298
|
+
let preRouteModel = null;
|
|
299
|
+
let preRouteMethod = 'complexity';
|
|
300
|
+
let preRouteReason = complexity.breakdown?.taskType?.reason || complexity.recommendation;
|
|
301
|
+
|
|
302
|
+
if (preRouteRisk?.level === 'high') {
|
|
227
303
|
try {
|
|
228
304
|
const selector = getModelTierSelector();
|
|
229
|
-
const tierResult = selector.selectModel('
|
|
305
|
+
const tierResult = selector.selectModel('COMPLEX', null);
|
|
230
306
|
preRouteProvider = tierResult.provider;
|
|
307
|
+
preRouteTier = 'COMPLEX';
|
|
308
|
+
preRouteModel = tierResult.model;
|
|
309
|
+
preRouteMethod = 'risk';
|
|
310
|
+
preRouteReason = 'high_risk_forced_tier';
|
|
231
311
|
} catch (_) {
|
|
232
|
-
|
|
312
|
+
// Risk-forced tier not configured; fall back to normal flow.
|
|
233
313
|
}
|
|
234
314
|
}
|
|
235
|
-
|
|
315
|
+
|
|
316
|
+
if (!preRouteTier) {
|
|
317
|
+
if (complexity.recommendation === 'local') {
|
|
318
|
+
try {
|
|
319
|
+
const selector = getModelTierSelector();
|
|
320
|
+
const tierResult = selector.selectModel('SIMPLE', null);
|
|
321
|
+
preRouteProvider = tierResult.provider;
|
|
322
|
+
preRouteTier = 'SIMPLE';
|
|
323
|
+
preRouteModel = tierResult.model;
|
|
324
|
+
} catch (_) {
|
|
325
|
+
preRouteProvider = 'ollama';
|
|
326
|
+
}
|
|
327
|
+
}
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
const preRouteDecision = {
|
|
236
331
|
provider: preRouteProvider,
|
|
332
|
+
tier: preRouteTier,
|
|
333
|
+
model: preRouteModel,
|
|
334
|
+
method: preRouteMethod,
|
|
335
|
+
reason: preRouteReason,
|
|
237
336
|
score: complexity.score,
|
|
238
337
|
threshold: complexity.threshold,
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
338
|
+
risk: preRouteRisk,
|
|
339
|
+
};
|
|
340
|
+
|
|
341
|
+
const routingHeaders = getRoutingHeaders(preRouteDecision);
|
|
342
|
+
|
|
343
|
+
// Build the interaction block once. It travels in headers always
|
|
344
|
+
// (X-Lynkr-Interaction-* derived fields) and optionally into the
|
|
345
|
+
// response body when LYNKR_VISIBLE_ROUTING=true.
|
|
346
|
+
const interaction = buildInteractionBlock(preRouteDecision);
|
|
242
347
|
|
|
243
348
|
// Extract client CWD from request body or header
|
|
244
349
|
const clientCwd = validateCwd(req.body?.cwd || req.headers['x-workspace-cwd']);
|
|
@@ -369,7 +474,11 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
|
|
|
369
474
|
})}\n\n`);
|
|
370
475
|
|
|
371
476
|
// 2. content_block_start and content_block_delta for each content block
|
|
372
|
-
|
|
477
|
+
// Filter out server-side tools that shouldn't reach the client
|
|
478
|
+
const _serverTools = new Set(["task", "websearch", "webfetch", "web_search", "web_fetch", "web_agent"]);
|
|
479
|
+
const contentBlocks = (msg.content || []).filter(b =>
|
|
480
|
+
!(b.type === "tool_use" && _serverTools.has((b.name || "").toLowerCase()))
|
|
481
|
+
);
|
|
373
482
|
for (let i = 0; i < contentBlocks.length; i++) {
|
|
374
483
|
const block = contentBlocks[i];
|
|
375
484
|
|
|
@@ -381,38 +490,90 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
|
|
|
381
490
|
content_block: { type: "text", text: "" }
|
|
382
491
|
})}\n\n`);
|
|
383
492
|
|
|
384
|
-
// Send text
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
493
|
+
// Send text — one chunk when ANSI rendering is active (splitting
|
|
494
|
+
// ANSI escape sequences across 20-char chunks breaks terminal output).
|
|
495
|
+
// Plain text falls back to line-level chunks for a trickle effect.
|
|
496
|
+
// Never apply ANSI rendering to HTML content (<artifact> blocks):
|
|
497
|
+
// ANSI codes corrupt CSS selectors like `*` and break the browser viewer.
|
|
498
|
+
const rawBlockText = block.text || "";
|
|
499
|
+
const isHtmlContent = rawBlockText.includes("<artifact") || rawBlockText.trimStart().startsWith("<");
|
|
500
|
+
const text = isHtmlContent ? rawBlockText : renderText(rawBlockText);
|
|
501
|
+
const { enabled: ansiEnabled } = require("../utils/markdown-ansi");
|
|
502
|
+
if (ansiEnabled && !isHtmlContent) {
|
|
503
|
+
if (text.length > 0) {
|
|
504
|
+
res.write(`event: content_block_delta\n`);
|
|
505
|
+
res.write(`data: ${JSON.stringify({
|
|
506
|
+
type: "content_block_delta",
|
|
507
|
+
index: i,
|
|
508
|
+
delta: { type: "text_delta", text }
|
|
509
|
+
})}\n\n`);
|
|
510
|
+
}
|
|
511
|
+
} else {
|
|
512
|
+
const lines = text.split("\n");
|
|
513
|
+
for (const line of lines) {
|
|
514
|
+
const lineWithNl = line + "\n";
|
|
515
|
+
res.write(`event: content_block_delta\n`);
|
|
516
|
+
res.write(`data: ${JSON.stringify({
|
|
517
|
+
type: "content_block_delta",
|
|
518
|
+
index: i,
|
|
519
|
+
delta: { type: "text_delta", text: lineWithNl }
|
|
520
|
+
})}\n\n`);
|
|
521
|
+
}
|
|
395
522
|
}
|
|
396
523
|
|
|
397
524
|
res.write(`event: content_block_stop\n`);
|
|
398
525
|
res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
|
|
399
|
-
} else if (block.type === "
|
|
526
|
+
} else if (block.type === "thinking") {
|
|
400
527
|
res.write(`event: content_block_start\n`);
|
|
401
528
|
res.write(`data: ${JSON.stringify({
|
|
402
529
|
type: "content_block_start",
|
|
403
530
|
index: i,
|
|
404
|
-
content_block: { type: "
|
|
405
|
-
})}\n\n`);
|
|
406
|
-
|
|
407
|
-
res.write(`event: content_block_delta\n`);
|
|
408
|
-
res.write(`data: ${JSON.stringify({
|
|
409
|
-
type: "content_block_delta",
|
|
410
|
-
index: i,
|
|
411
|
-
delta: { type: "input_json_delta", partial_json: JSON.stringify(block.input) }
|
|
531
|
+
content_block: { type: "thinking", thinking: "" }
|
|
412
532
|
})}\n\n`);
|
|
413
|
-
|
|
533
|
+
const thinkingText = block.thinking || "";
|
|
534
|
+
const thinkChunkSize = 40;
|
|
535
|
+
for (let j = 0; j < thinkingText.length; j += thinkChunkSize) {
|
|
536
|
+
res.write(`event: content_block_delta\n`);
|
|
537
|
+
res.write(`data: ${JSON.stringify({
|
|
538
|
+
type: "content_block_delta",
|
|
539
|
+
index: i,
|
|
540
|
+
delta: { type: "thinking_delta", thinking: thinkingText.slice(j, j + thinkChunkSize) }
|
|
541
|
+
})}\n\n`);
|
|
542
|
+
}
|
|
414
543
|
res.write(`event: content_block_stop\n`);
|
|
415
544
|
res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
|
|
545
|
+
} else if (block.type === "tool_use") {
|
|
546
|
+
// Original request had no tools → model hallucinated a tool call.
|
|
547
|
+
// Extract file content from write-style tools and wrap it in an
|
|
548
|
+
// <artifact> block so open-design routes it to the Design panel.
|
|
549
|
+
const toolName = (block.name || "").toLowerCase();
|
|
550
|
+
const writeTools = new Set(["write", "create_file", "write_file", "str_replace_editor"]);
|
|
551
|
+
if (writeTools.has(toolName)) {
|
|
552
|
+
const rawContent = block.input?.content ?? block.input?.file_content ?? block.input?.new_content ?? "";
|
|
553
|
+
const filePath = String(block.input?.file_path ?? block.input?.filename ?? "design.html");
|
|
554
|
+
const content = String(rawContent);
|
|
555
|
+
if (content) {
|
|
556
|
+
// Wrap in <artifact> so open-design's parser routes it to the file viewer.
|
|
557
|
+
const identifier = filePath.replace(/[^a-zA-Z0-9._-]/g, "_");
|
|
558
|
+
const title = filePath;
|
|
559
|
+
const wrapped = `<artifact identifier="${identifier}" type="text/html" title="${title}">\n${content}\n</artifact>`;
|
|
560
|
+
res.write(`event: content_block_start\n`);
|
|
561
|
+
res.write(`data: ${JSON.stringify({
|
|
562
|
+
type: "content_block_start",
|
|
563
|
+
index: i,
|
|
564
|
+
content_block: { type: "text", text: "" }
|
|
565
|
+
})}\n\n`);
|
|
566
|
+
res.write(`event: content_block_delta\n`);
|
|
567
|
+
res.write(`data: ${JSON.stringify({
|
|
568
|
+
type: "content_block_delta",
|
|
569
|
+
index: i,
|
|
570
|
+
delta: { type: "text_delta", text: wrapped }
|
|
571
|
+
})}\n\n`);
|
|
572
|
+
res.write(`event: content_block_stop\n`);
|
|
573
|
+
res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
// Non-write tool_use in a tool-less request is silently dropped.
|
|
416
577
|
}
|
|
417
578
|
}
|
|
418
579
|
|
|
@@ -488,7 +649,11 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
|
|
|
488
649
|
})}\n\n`);
|
|
489
650
|
|
|
490
651
|
// 2. content_block_start and content_block_delta for each content block
|
|
491
|
-
|
|
652
|
+
// Filter out server-side tools that shouldn't reach the client
|
|
653
|
+
const _serverTools = new Set(["task", "websearch", "webfetch", "web_search", "web_fetch", "web_agent"]);
|
|
654
|
+
const contentBlocks = (msg.content || []).filter(b =>
|
|
655
|
+
!(b.type === "tool_use" && _serverTools.has((b.name || "").toLowerCase()))
|
|
656
|
+
);
|
|
492
657
|
for (let i = 0; i < contentBlocks.length; i++) {
|
|
493
658
|
const block = contentBlocks[i];
|
|
494
659
|
|
|
@@ -500,18 +665,51 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
|
|
|
500
665
|
content_block: { type: "text", text: "" }
|
|
501
666
|
})}\n\n`);
|
|
502
667
|
|
|
503
|
-
const
|
|
504
|
-
const
|
|
505
|
-
|
|
506
|
-
|
|
668
|
+
const rawBlockText2 = block.text || "";
|
|
669
|
+
const isHtmlContent2 = rawBlockText2.includes("<artifact") || rawBlockText2.trimStart().startsWith("<");
|
|
670
|
+
const text = isHtmlContent2 ? rawBlockText2 : renderText(rawBlockText2);
|
|
671
|
+
const { enabled: ansiEnabled } = require("../utils/markdown-ansi");
|
|
672
|
+
if (ansiEnabled && !isHtmlContent2) {
|
|
673
|
+
if (text.length > 0) {
|
|
674
|
+
res.write(`event: content_block_delta\n`);
|
|
675
|
+
res.write(`data: ${JSON.stringify({
|
|
676
|
+
type: "content_block_delta",
|
|
677
|
+
index: i,
|
|
678
|
+
delta: { type: "text_delta", text }
|
|
679
|
+
})}\n\n`);
|
|
680
|
+
}
|
|
681
|
+
} else {
|
|
682
|
+
const lines = text.split("\n");
|
|
683
|
+
for (const line of lines) {
|
|
684
|
+
const lineWithNl = line + "\n";
|
|
685
|
+
res.write(`event: content_block_delta\n`);
|
|
686
|
+
res.write(`data: ${JSON.stringify({
|
|
687
|
+
type: "content_block_delta",
|
|
688
|
+
index: i,
|
|
689
|
+
delta: { type: "text_delta", text: lineWithNl }
|
|
690
|
+
})}\n\n`);
|
|
691
|
+
}
|
|
692
|
+
}
|
|
693
|
+
|
|
694
|
+
res.write(`event: content_block_stop\n`);
|
|
695
|
+
res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
|
|
696
|
+
} else if (block.type === "thinking") {
|
|
697
|
+
res.write(`event: content_block_start\n`);
|
|
698
|
+
res.write(`data: ${JSON.stringify({
|
|
699
|
+
type: "content_block_start",
|
|
700
|
+
index: i,
|
|
701
|
+
content_block: { type: "thinking", thinking: "" }
|
|
702
|
+
})}\n\n`);
|
|
703
|
+
const thinkingText = block.thinking || "";
|
|
704
|
+
const thinkChunkSize = 40;
|
|
705
|
+
for (let j = 0; j < thinkingText.length; j += thinkChunkSize) {
|
|
507
706
|
res.write(`event: content_block_delta\n`);
|
|
508
707
|
res.write(`data: ${JSON.stringify({
|
|
509
708
|
type: "content_block_delta",
|
|
510
709
|
index: i,
|
|
511
|
-
delta: { type: "
|
|
710
|
+
delta: { type: "thinking_delta", thinking: thinkingText.slice(j, j + thinkChunkSize) }
|
|
512
711
|
})}\n\n`);
|
|
513
712
|
}
|
|
514
|
-
|
|
515
713
|
res.write(`event: content_block_stop\n`);
|
|
516
714
|
res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
|
|
517
715
|
} else if (block.type === "tool_use") {
|
|
@@ -566,8 +764,33 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
|
|
|
566
764
|
});
|
|
567
765
|
}
|
|
568
766
|
|
|
767
|
+
// Inject visible interaction block into the response body when
|
|
768
|
+
// LYNKR_VISIBLE_ROUTING=true. We only mutate JSON bodies — and only
|
|
769
|
+
// when the response looks like a valid Anthropic Message — so this
|
|
770
|
+
// is a no-op for streamed / error / non-message responses.
|
|
771
|
+
let finalBody = result.body;
|
|
772
|
+
if (
|
|
773
|
+
config.routing?.visibleInteraction &&
|
|
774
|
+
interaction &&
|
|
775
|
+
result.status >= 200 && result.status < 300 &&
|
|
776
|
+
result.body
|
|
777
|
+
) {
|
|
778
|
+
try {
|
|
779
|
+
const text = Buffer.isBuffer(result.body) ? result.body.toString('utf8') : result.body;
|
|
780
|
+
if (typeof text === 'string' && text.startsWith('{')) {
|
|
781
|
+
const parsed = JSON.parse(text);
|
|
782
|
+
if (parsed && typeof parsed === 'object' && parsed.type === 'message') {
|
|
783
|
+
parsed.lynkr_interaction = interaction;
|
|
784
|
+
finalBody = JSON.stringify(parsed);
|
|
785
|
+
}
|
|
786
|
+
}
|
|
787
|
+
} catch (err) {
|
|
788
|
+
logger.debug({ err: err.message }, '[Router] Skipped interaction injection (non-JSON body)');
|
|
789
|
+
}
|
|
790
|
+
}
|
|
791
|
+
|
|
569
792
|
metrics.recordResponse(result.status);
|
|
570
|
-
res.status(result.status).send(
|
|
793
|
+
res.status(result.status).send(finalBody);
|
|
571
794
|
} catch (error) {
|
|
572
795
|
next(error);
|
|
573
796
|
}
|
|
@@ -724,6 +947,18 @@ router.get("/metrics/compression", async (req, res) => {
|
|
|
724
947
|
}
|
|
725
948
|
});
|
|
726
949
|
|
|
950
|
+
router.get("/metrics/tool-compression", (req, res) => {
|
|
951
|
+
const { getMetrics } = require("../context/tool-result-compressor");
|
|
952
|
+
res.json(getMetrics());
|
|
953
|
+
});
|
|
954
|
+
|
|
955
|
+
router.get("/tee/:id", (req, res) => {
|
|
956
|
+
const { teeGet } = require("../context/tool-result-compressor");
|
|
957
|
+
const content = teeGet(req.params.id);
|
|
958
|
+
if (!content) return res.status(404).json({ error: "Tee entry not found or expired" });
|
|
959
|
+
res.type("text/plain").send(content);
|
|
960
|
+
});
|
|
961
|
+
|
|
727
962
|
router.get("/health/headroom", async (req, res) => {
|
|
728
963
|
try {
|
|
729
964
|
const { getHeadroomManager } = require("../headroom");
|
package/src/cache/prompt.js
CHANGED
|
@@ -5,6 +5,15 @@ try {
|
|
|
5
5
|
} catch {
|
|
6
6
|
Database = null;
|
|
7
7
|
}
|
|
8
|
+
|
|
9
|
+
// Try to load native Rust cache key computation (4x faster for small payloads)
|
|
10
|
+
let nativeCacheKey = null;
|
|
11
|
+
try {
|
|
12
|
+
const native = require('../../native');
|
|
13
|
+
if (native.available && native.computeCacheKey) {
|
|
14
|
+
nativeCacheKey = native.computeCacheKey;
|
|
15
|
+
}
|
|
16
|
+
} catch { /* native module not available — use JS */ }
|
|
8
17
|
const path = require("path");
|
|
9
18
|
const fs = require("fs");
|
|
10
19
|
const config = require("../config");
|
|
@@ -164,6 +173,10 @@ class PromptCache {
|
|
|
164
173
|
max_tokens: payload.max_tokens ?? null,
|
|
165
174
|
};
|
|
166
175
|
const serialised = stableStringify(canonical);
|
|
176
|
+
// Use Rust for small payloads where it's 4x faster
|
|
177
|
+
if (nativeCacheKey && serialised.length < 5000) {
|
|
178
|
+
return nativeCacheKey(serialised);
|
|
179
|
+
}
|
|
167
180
|
return crypto.createHash("sha256").update(serialised).digest("hex");
|
|
168
181
|
} catch (error) {
|
|
169
182
|
logger.warn(
|
|
@@ -34,19 +34,20 @@ logger.info({ maxConcurrent: zaiMaxConcurrent }, "Z.AI bulkhead initialized");
|
|
|
34
34
|
|
|
35
35
|
|
|
36
36
|
// HTTP connection pooling for better performance
|
|
37
|
+
// Increased maxSockets for high-concurrency team deployments (50+ devs)
|
|
37
38
|
const httpAgent = new http.Agent({
|
|
38
39
|
keepAlive: true,
|
|
39
|
-
maxSockets:
|
|
40
|
-
maxFreeSockets:
|
|
41
|
-
timeout:
|
|
40
|
+
maxSockets: 200,
|
|
41
|
+
maxFreeSockets: 20,
|
|
42
|
+
timeout: 120000,
|
|
42
43
|
keepAliveMsecs: 30000,
|
|
43
44
|
});
|
|
44
45
|
|
|
45
46
|
const httpsAgent = new https.Agent({
|
|
46
47
|
keepAlive: true,
|
|
47
|
-
maxSockets:
|
|
48
|
-
maxFreeSockets:
|
|
49
|
-
timeout:
|
|
48
|
+
maxSockets: 200,
|
|
49
|
+
maxFreeSockets: 20,
|
|
50
|
+
timeout: 120000,
|
|
50
51
|
keepAliveMsecs: 30000,
|
|
51
52
|
});
|
|
52
53
|
|
|
@@ -220,7 +221,7 @@ async function invokeOllama(body) {
|
|
|
220
221
|
const useAnthropicApi = await hasAnthropicEndpoint(config.ollama.endpoint);
|
|
221
222
|
|
|
222
223
|
// Check if model supports tools FIRST (before wasteful injection)
|
|
223
|
-
const supportsTools = await checkOllamaToolSupport(
|
|
224
|
+
const supportsTools = await checkOllamaToolSupport(modelName);
|
|
224
225
|
const injectToolsOllama = process.env.INJECT_TOOLS_OLLAMA !== "false";
|
|
225
226
|
|
|
226
227
|
// Determine tools to send
|
|
@@ -271,7 +272,7 @@ async function invokeOllama(body) {
|
|
|
271
272
|
model: modelName,
|
|
272
273
|
messages: body.messages,
|
|
273
274
|
max_tokens: body.max_tokens || 16384,
|
|
274
|
-
stream: false,
|
|
275
|
+
stream: body.stream ?? false,
|
|
275
276
|
};
|
|
276
277
|
|
|
277
278
|
if (body.system) ollamaBody.system = body.system;
|
|
@@ -338,7 +339,7 @@ async function invokeOllama(body) {
|
|
|
338
339
|
const ollamaBody = {
|
|
339
340
|
model: modelName,
|
|
340
341
|
messages: deduplicated,
|
|
341
|
-
stream: false,
|
|
342
|
+
stream: body.stream ?? false,
|
|
342
343
|
options: {
|
|
343
344
|
temperature: body.temperature ?? 0.7,
|
|
344
345
|
num_predict: body.max_tokens ?? 16384,
|
|
@@ -475,13 +476,17 @@ async function invokeAzureOpenAI(body) {
|
|
|
475
476
|
// System prompt injection disabled - breaks model response
|
|
476
477
|
// Tool guidance now provided via tool descriptions instead
|
|
477
478
|
|
|
479
|
+
const azureDeployment = body._suggestionModeModel || body._tierModel || config.azureOpenAI.deployment || "";
|
|
480
|
+
const isGpt5 = /gpt-5/i.test(azureDeployment);
|
|
481
|
+
const maxTokensKey = isGpt5 ? "max_completion_tokens" : "max_tokens";
|
|
482
|
+
|
|
478
483
|
const azureBody = {
|
|
479
484
|
messages,
|
|
480
|
-
temperature: body.temperature ?? 0.3,
|
|
481
|
-
|
|
485
|
+
temperature: body.temperature ?? 0.3,
|
|
486
|
+
[maxTokensKey]: Math.min(body.max_tokens ?? 16384, 16384),
|
|
482
487
|
top_p: body.top_p ?? 1.0,
|
|
483
|
-
stream: false,
|
|
484
|
-
model:
|
|
488
|
+
stream: false,
|
|
489
|
+
model: azureDeployment
|
|
485
490
|
};
|
|
486
491
|
|
|
487
492
|
// Add tools - inject standard tools if client didn't send any (passthrough mode)
|
|
@@ -1598,20 +1603,34 @@ function convertOpenAIToAnthropic(response) {
|
|
|
1598
1603
|
const message = choice.message || {};
|
|
1599
1604
|
const content = [];
|
|
1600
1605
|
|
|
1606
|
+
// Extract tool calls embedded as XML/text in content (Minimax, Qwen, GLM, etc.)
|
|
1607
|
+
if (!message.tool_calls?.length && typeof message.content === "string" && message.content.trim()) {
|
|
1608
|
+
const { extractToolCallsFromText } = require("./xml-tool-extractor");
|
|
1609
|
+
const extracted = extractToolCallsFromText(message.content);
|
|
1610
|
+
if (extracted.toolCalls.length > 0) {
|
|
1611
|
+
message.tool_calls = extracted.toolCalls;
|
|
1612
|
+
message.content = extracted.cleanedText;
|
|
1613
|
+
choice.finish_reason = "tool_calls";
|
|
1614
|
+
}
|
|
1615
|
+
}
|
|
1616
|
+
|
|
1601
1617
|
// Add text content from message.content
|
|
1602
1618
|
// Don't add placeholder text if there are tool_calls - tools are the actual response
|
|
1603
1619
|
const hasToolCalls = Array.isArray(message.tool_calls) && message.tool_calls.length > 0;
|
|
1604
1620
|
|
|
1605
|
-
// Extract text content
|
|
1621
|
+
// Extract text content and reasoning from thinking models
|
|
1606
1622
|
const textContent = typeof message.content === 'string' ? message.content : '';
|
|
1607
1623
|
const reasoningContent = typeof message.reasoning_content === 'string' ? message.reasoning_content : '';
|
|
1608
1624
|
|
|
1625
|
+
// Emit reasoning_content as a proper thinking block (not discarded)
|
|
1626
|
+
if (reasoningContent) {
|
|
1627
|
+
content.push({ type: "thinking", thinking: reasoningContent });
|
|
1628
|
+
}
|
|
1629
|
+
|
|
1609
1630
|
if (textContent) {
|
|
1610
|
-
// Has regular content - use it directly (ignore reasoning_content chain-of-thought)
|
|
1611
1631
|
content.push({ type: "text", text: textContent });
|
|
1612
|
-
} else if (reasoningContent) {
|
|
1613
|
-
//
|
|
1614
|
-
content.push({ type: "text", text: reasoningContent });
|
|
1632
|
+
} else if (!reasoningContent) {
|
|
1633
|
+
// No content and no reasoning — will be handled by the empty check below
|
|
1615
1634
|
}
|
|
1616
1635
|
|
|
1617
1636
|
// Convert tool calls
|
|
@@ -2028,6 +2047,11 @@ async function invokeModel(body, options = {}) {
|
|
|
2028
2047
|
body._tierModel = tierSelectedModel;
|
|
2029
2048
|
}
|
|
2030
2049
|
|
|
2050
|
+
// Inject provider-side prompt caching (cache_control breakpoints)
|
|
2051
|
+
// Reduces input token cost by up to 90% and latency by up to 80%
|
|
2052
|
+
const { injectPromptCaching } = require('./prompt-cache-injection');
|
|
2053
|
+
injectPromptCaching(body, initialProvider);
|
|
2054
|
+
|
|
2031
2055
|
// Build routing decision object for response headers
|
|
2032
2056
|
const routingDecision = {
|
|
2033
2057
|
provider: initialProvider,
|
|
@@ -77,25 +77,29 @@ async function hasAnthropicEndpoint(baseUrl) {
|
|
|
77
77
|
if (anthropicEndpointAvailable !== null) return anthropicEndpointAvailable;
|
|
78
78
|
|
|
79
79
|
try {
|
|
80
|
-
//
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
body: JSON.stringify({
|
|
88
|
-
model: "probe",
|
|
89
|
-
max_tokens: 1,
|
|
90
|
-
messages: [{ role: "user", content: "hi" }],
|
|
91
|
-
}),
|
|
80
|
+
// Check Ollama version — /v1/messages requires v0.14.0+
|
|
81
|
+
// This is instant (no LLM inference) vs the old probe that sent a real request
|
|
82
|
+
const controller = new AbortController();
|
|
83
|
+
const timeout = setTimeout(() => controller.abort(), 3000);
|
|
84
|
+
const versionRes = await fetch(`${baseUrl}/api/version`, {
|
|
85
|
+
method: "GET",
|
|
86
|
+
signal: controller.signal,
|
|
92
87
|
});
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
88
|
+
clearTimeout(timeout);
|
|
89
|
+
|
|
90
|
+
if (versionRes.ok) {
|
|
91
|
+
const versionData = await versionRes.json().catch(() => null);
|
|
92
|
+
const version = versionData?.version || "0.0.0";
|
|
93
|
+
const [major, minor] = version.split(".").map(Number);
|
|
94
|
+
|
|
95
|
+
// v0.14.0+ has the Anthropic Messages API
|
|
96
|
+
anthropicEndpointAvailable = major > 0 || (major === 0 && minor >= 14);
|
|
97
|
+
} else {
|
|
98
|
+
// Can't determine version — fall back to legacy
|
|
99
|
+
anthropicEndpointAvailable = false;
|
|
100
|
+
}
|
|
97
101
|
logger.info(
|
|
98
|
-
{ available: anthropicEndpointAvailable, status:
|
|
102
|
+
{ available: anthropicEndpointAvailable, status: versionRes.status },
|
|
99
103
|
anthropicEndpointAvailable
|
|
100
104
|
? "Ollama Anthropic API detected (/v1/messages) — using native passthrough"
|
|
101
105
|
: "Ollama Anthropic API not available — falling back to legacy /api/chat (upgrade to Ollama v0.14.0+ for best results)"
|