lynkr 8.0.1 → 9.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +238 -315
- package/bin/cli.js +16 -3
- package/index.js +7 -3
- package/install.sh +3 -3
- package/lynkr-skill.tar.gz +0 -0
- package/native/Cargo.toml +26 -0
- package/native/index.js +29 -0
- package/native/lynkr-native.node +0 -0
- package/native/src/lib.rs +321 -0
- package/package.json +8 -6
- package/src/api/files-multipart.js +30 -0
- package/src/api/files-router.js +81 -0
- package/src/api/openai-router.js +379 -308
- package/src/api/providers-handler.js +171 -3
- package/src/api/router.js +109 -5
- package/src/cache/prompt.js +13 -0
- package/src/clients/circuit-breaker.js +10 -247
- package/src/clients/codex-process.js +342 -0
- package/src/clients/codex-utils.js +143 -0
- package/src/clients/databricks.js +243 -76
- package/src/clients/ollama-utils.js +21 -17
- package/src/clients/openai-format.js +20 -6
- package/src/clients/openrouter-utils.js +42 -37
- package/src/clients/prompt-cache-injection.js +140 -0
- package/src/clients/provider-capabilities.js +41 -0
- package/src/clients/resilience.js +540 -0
- package/src/clients/responses-format.js +8 -7
- package/src/clients/retry.js +22 -167
- package/src/clients/standard-tools.js +1 -1
- package/src/clients/xml-tool-extractor.js +307 -0
- package/src/cluster.js +82 -0
- package/src/config/index.js +66 -0
- package/src/context/compression.js +42 -9
- package/src/context/distill.js +507 -0
- package/src/context/tool-result-compressor.js +563 -0
- package/src/memory/extractor.js +22 -0
- package/src/orchestrator/index.js +147 -205
- package/src/routing/complexity-analyzer.js +258 -5
- package/src/routing/index.js +15 -34
- package/src/routing/latency-tracker.js +148 -0
- package/src/routing/model-tiers.js +2 -0
- package/src/routing/quality-scorer.js +113 -0
- package/src/routing/telemetry.js +502 -0
- package/src/server.js +23 -0
- package/src/stores/file-store.js +69 -0
- package/src/stores/response-store.js +25 -0
- package/src/tools/code-graph.js +538 -0
- package/src/tools/code-mode.js +304 -0
- package/src/tools/index.js +1 -1
- package/src/tools/lazy-loader.js +11 -0
- package/src/tools/mcp-remote.js +7 -0
- package/src/tools/smart-selection.js +11 -0
- package/src/tools/web.js +1 -1
- package/src/utils/payload.js +206 -0
- package/src/utils/perf-timer.js +80 -0
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
const crypto = require("crypto");
|
|
1
2
|
const config = require("../config");
|
|
2
3
|
const http = require("http");
|
|
3
4
|
const https = require("https");
|
|
@@ -5,6 +6,7 @@ const { withRetry } = require("./retry");
|
|
|
5
6
|
const { getCircuitBreakerRegistry } = require("./circuit-breaker");
|
|
6
7
|
const { getMetricsCollector } = require("../observability/metrics");
|
|
7
8
|
const { getHealthTracker } = require("../observability/health-tracker");
|
|
9
|
+
const { createBulkhead } = require("./resilience");
|
|
8
10
|
const logger = require("../logger");
|
|
9
11
|
const { STANDARD_TOOLS, STANDARD_TOOL_NAMES } = require("./standard-tools");
|
|
10
12
|
const { convertAnthropicToolsToOpenRouter } = require("./openrouter-utils");
|
|
@@ -12,6 +14,9 @@ const {
|
|
|
12
14
|
detectModelFamily
|
|
13
15
|
} = require("./bedrock-utils");
|
|
14
16
|
const { getGPTSystemPromptAddendum } = require("./gpt-utils");
|
|
17
|
+
const telemetry = require("../routing/telemetry");
|
|
18
|
+
const { scoreResponseQuality } = require("../routing/quality-scorer");
|
|
19
|
+
const { getLatencyTracker } = require("../routing/latency-tracker");
|
|
15
20
|
|
|
16
21
|
|
|
17
22
|
|
|
@@ -20,70 +25,29 @@ if (typeof fetch !== "function") {
|
|
|
20
25
|
throw new Error("Node 18+ is required for the built-in fetch API.");
|
|
21
26
|
}
|
|
22
27
|
|
|
23
|
-
|
|
24
|
-
* Simple Semaphore for limiting concurrent requests
|
|
25
|
-
* Used to prevent Z.AI rate limiting from parallel Claude Code CLI calls
|
|
26
|
-
*/
|
|
27
|
-
class Semaphore {
|
|
28
|
-
constructor(maxConcurrent = 2) {
|
|
29
|
-
this.maxConcurrent = maxConcurrent;
|
|
30
|
-
this.current = 0;
|
|
31
|
-
this.queue = [];
|
|
32
|
-
}
|
|
33
|
-
|
|
34
|
-
async acquire() {
|
|
35
|
-
if (this.current < this.maxConcurrent) {
|
|
36
|
-
this.current++;
|
|
37
|
-
return;
|
|
38
|
-
}
|
|
39
|
-
|
|
40
|
-
// Wait in queue
|
|
41
|
-
return new Promise((resolve) => {
|
|
42
|
-
this.queue.push(resolve);
|
|
43
|
-
});
|
|
44
|
-
}
|
|
45
|
-
|
|
46
|
-
release() {
|
|
47
|
-
this.current--;
|
|
48
|
-
if (this.queue.length > 0 && this.current < this.maxConcurrent) {
|
|
49
|
-
this.current++;
|
|
50
|
-
const next = this.queue.shift();
|
|
51
|
-
next();
|
|
52
|
-
}
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
async run(fn) {
|
|
56
|
-
await this.acquire();
|
|
57
|
-
try {
|
|
58
|
-
return await fn();
|
|
59
|
-
} finally {
|
|
60
|
-
this.release();
|
|
61
|
-
}
|
|
62
|
-
}
|
|
63
|
-
}
|
|
64
|
-
|
|
65
|
-
// Z.AI request semaphore - limit concurrent requests to avoid rate limiting
|
|
28
|
+
// Z.AI request bulkhead - limit concurrent requests to avoid rate limiting
|
|
66
29
|
// Configurable via ZAI_MAX_CONCURRENT env var (default: 2)
|
|
67
30
|
const zaiMaxConcurrent = parseInt(process.env.ZAI_MAX_CONCURRENT || '2', 10);
|
|
68
|
-
const zaiSemaphore =
|
|
69
|
-
logger.info({ maxConcurrent: zaiMaxConcurrent }, "Z.AI
|
|
31
|
+
const zaiSemaphore = createBulkhead({ maxConcurrent: zaiMaxConcurrent, maxQueue: 50 });
|
|
32
|
+
logger.info({ maxConcurrent: zaiMaxConcurrent }, "Z.AI bulkhead initialized");
|
|
70
33
|
|
|
71
34
|
|
|
72
35
|
|
|
73
36
|
// HTTP connection pooling for better performance
|
|
37
|
+
// Increased maxSockets for high-concurrency team deployments (50+ devs)
|
|
74
38
|
const httpAgent = new http.Agent({
|
|
75
39
|
keepAlive: true,
|
|
76
|
-
maxSockets:
|
|
77
|
-
maxFreeSockets:
|
|
78
|
-
timeout:
|
|
40
|
+
maxSockets: 200,
|
|
41
|
+
maxFreeSockets: 20,
|
|
42
|
+
timeout: 120000,
|
|
79
43
|
keepAliveMsecs: 30000,
|
|
80
44
|
});
|
|
81
45
|
|
|
82
46
|
const httpsAgent = new https.Agent({
|
|
83
47
|
keepAlive: true,
|
|
84
|
-
maxSockets:
|
|
85
|
-
maxFreeSockets:
|
|
86
|
-
timeout:
|
|
48
|
+
maxSockets: 200,
|
|
49
|
+
maxFreeSockets: 20,
|
|
50
|
+
timeout: 120000,
|
|
87
51
|
keepAliveMsecs: 30000,
|
|
88
52
|
});
|
|
89
53
|
|
|
@@ -307,8 +271,8 @@ async function invokeOllama(body) {
|
|
|
307
271
|
const ollamaBody = {
|
|
308
272
|
model: modelName,
|
|
309
273
|
messages: body.messages,
|
|
310
|
-
max_tokens: body.max_tokens ||
|
|
311
|
-
stream: false,
|
|
274
|
+
max_tokens: body.max_tokens || 16384,
|
|
275
|
+
stream: body.stream ?? false,
|
|
312
276
|
};
|
|
313
277
|
|
|
314
278
|
if (body.system) ollamaBody.system = body.system;
|
|
@@ -375,7 +339,7 @@ async function invokeOllama(body) {
|
|
|
375
339
|
const ollamaBody = {
|
|
376
340
|
model: modelName,
|
|
377
341
|
messages: deduplicated,
|
|
378
|
-
stream: false,
|
|
342
|
+
stream: body.stream ?? false,
|
|
379
343
|
options: {
|
|
380
344
|
temperature: body.temperature ?? 0.7,
|
|
381
345
|
num_predict: body.max_tokens ?? 16384,
|
|
@@ -432,7 +396,7 @@ async function invokeOpenRouter(body) {
|
|
|
432
396
|
model: body._suggestionModeModel || body._tierModel || config.openrouter.model,
|
|
433
397
|
messages,
|
|
434
398
|
temperature: body.temperature ?? 0.7,
|
|
435
|
-
max_tokens: body.max_tokens ??
|
|
399
|
+
max_tokens: body.max_tokens ?? 16384,
|
|
436
400
|
top_p: body.top_p ?? 1.0,
|
|
437
401
|
stream: body.stream ?? false
|
|
438
402
|
};
|
|
@@ -515,7 +479,7 @@ async function invokeAzureOpenAI(body) {
|
|
|
515
479
|
const azureBody = {
|
|
516
480
|
messages,
|
|
517
481
|
temperature: body.temperature ?? 0.3, // Lower temperature for more deterministic, action-oriented behavior
|
|
518
|
-
max_tokens: Math.min(body.max_tokens ??
|
|
482
|
+
max_tokens: Math.min(body.max_tokens ?? 16384, 16384), // Cap at Azure OpenAI's limit
|
|
519
483
|
top_p: body.top_p ?? 1.0,
|
|
520
484
|
stream: false, // Force non-streaming for Azure OpenAI - streaming format conversion not yet implemented
|
|
521
485
|
model: body._suggestionModeModel || body._tierModel || config.azureOpenAI.deployment
|
|
@@ -911,7 +875,7 @@ async function invokeOpenAI(body) {
|
|
|
911
875
|
model: body._suggestionModeModel || body._tierModel || config.openai.model || "gpt-4o",
|
|
912
876
|
messages,
|
|
913
877
|
temperature: body.temperature ?? 0.7,
|
|
914
|
-
max_tokens: body.max_tokens ??
|
|
878
|
+
max_tokens: body.max_tokens ?? 16384,
|
|
915
879
|
top_p: body.top_p ?? 1.0,
|
|
916
880
|
stream: body.stream ?? false
|
|
917
881
|
};
|
|
@@ -1012,7 +976,7 @@ async function invokeLlamaCpp(body) {
|
|
|
1012
976
|
const llamacppBody = {
|
|
1013
977
|
messages: deduplicated,
|
|
1014
978
|
temperature: body.temperature ?? 0.7,
|
|
1015
|
-
max_tokens: body.max_tokens ??
|
|
979
|
+
max_tokens: body.max_tokens ?? 16384,
|
|
1016
980
|
top_p: body.top_p ?? 1.0,
|
|
1017
981
|
stream: body.stream ?? false
|
|
1018
982
|
};
|
|
@@ -1096,7 +1060,7 @@ async function invokeLMStudio(body) {
|
|
|
1096
1060
|
const lmstudioBody = {
|
|
1097
1061
|
messages,
|
|
1098
1062
|
temperature: body.temperature ?? 0.7,
|
|
1099
|
-
max_tokens: body.max_tokens ??
|
|
1063
|
+
max_tokens: body.max_tokens ?? 16384,
|
|
1100
1064
|
top_p: body.top_p ?? 1.0,
|
|
1101
1065
|
stream: body.stream ?? false
|
|
1102
1066
|
};
|
|
@@ -1411,7 +1375,7 @@ async function invokeZai(body) {
|
|
|
1411
1375
|
zaiBody = {
|
|
1412
1376
|
model: mappedModel,
|
|
1413
1377
|
messages,
|
|
1414
|
-
max_tokens: body.max_tokens ||
|
|
1378
|
+
max_tokens: body.max_tokens || 16384,
|
|
1415
1379
|
temperature: body.temperature ?? 0.7,
|
|
1416
1380
|
stream: body.stream,
|
|
1417
1381
|
};
|
|
@@ -1473,12 +1437,9 @@ async function invokeZai(body) {
|
|
|
1473
1437
|
zaiBody: JSON.stringify(zaiBody).substring(0, 1000),
|
|
1474
1438
|
}, "Z.AI request body (truncated)");
|
|
1475
1439
|
|
|
1476
|
-
// Use
|
|
1477
|
-
return zaiSemaphore.
|
|
1478
|
-
logger.debug(
|
|
1479
|
-
queueLength: zaiSemaphore.queue.length,
|
|
1480
|
-
currentConcurrent: zaiSemaphore.current,
|
|
1481
|
-
}, "Z.AI semaphore status");
|
|
1440
|
+
// Use bulkhead to limit concurrent Z.AI requests (prevents rate limiting)
|
|
1441
|
+
return zaiSemaphore.execute(async () => {
|
|
1442
|
+
logger.debug("Z.AI bulkhead executing request");
|
|
1482
1443
|
|
|
1483
1444
|
const response = await performJsonRequest(endpoint, { headers, body: zaiBody }, "Z.AI");
|
|
1484
1445
|
|
|
@@ -1560,7 +1521,7 @@ async function invokeMoonshot(body) {
|
|
|
1560
1521
|
const moonshotBody = {
|
|
1561
1522
|
model: mappedModel,
|
|
1562
1523
|
messages,
|
|
1563
|
-
max_tokens: body.max_tokens ||
|
|
1524
|
+
max_tokens: body.max_tokens || 16384,
|
|
1564
1525
|
temperature: body.temperature ?? 0.7,
|
|
1565
1526
|
top_p: body.top_p ?? 1.0,
|
|
1566
1527
|
stream: false, // Force non-streaming - OpenAI SSE to Anthropic SSE conversion not implemented
|
|
@@ -1638,20 +1599,34 @@ function convertOpenAIToAnthropic(response) {
|
|
|
1638
1599
|
const message = choice.message || {};
|
|
1639
1600
|
const content = [];
|
|
1640
1601
|
|
|
1602
|
+
// Extract tool calls embedded as XML/text in content (Minimax, Qwen, GLM, etc.)
|
|
1603
|
+
if (!message.tool_calls?.length && typeof message.content === "string" && message.content.trim()) {
|
|
1604
|
+
const { extractToolCallsFromText } = require("./xml-tool-extractor");
|
|
1605
|
+
const extracted = extractToolCallsFromText(message.content);
|
|
1606
|
+
if (extracted.toolCalls.length > 0) {
|
|
1607
|
+
message.tool_calls = extracted.toolCalls;
|
|
1608
|
+
message.content = extracted.cleanedText;
|
|
1609
|
+
choice.finish_reason = "tool_calls";
|
|
1610
|
+
}
|
|
1611
|
+
}
|
|
1612
|
+
|
|
1641
1613
|
// Add text content from message.content
|
|
1642
1614
|
// Don't add placeholder text if there are tool_calls - tools are the actual response
|
|
1643
1615
|
const hasToolCalls = Array.isArray(message.tool_calls) && message.tool_calls.length > 0;
|
|
1644
1616
|
|
|
1645
|
-
// Extract text content
|
|
1617
|
+
// Extract text content and reasoning from thinking models
|
|
1646
1618
|
const textContent = typeof message.content === 'string' ? message.content : '';
|
|
1647
1619
|
const reasoningContent = typeof message.reasoning_content === 'string' ? message.reasoning_content : '';
|
|
1648
1620
|
|
|
1621
|
+
// Emit reasoning_content as a proper thinking block (not discarded)
|
|
1622
|
+
if (reasoningContent) {
|
|
1623
|
+
content.push({ type: "thinking", thinking: reasoningContent });
|
|
1624
|
+
}
|
|
1625
|
+
|
|
1649
1626
|
if (textContent) {
|
|
1650
|
-
// Has regular content - use it directly (ignore reasoning_content chain-of-thought)
|
|
1651
1627
|
content.push({ type: "text", text: textContent });
|
|
1652
|
-
} else if (reasoningContent) {
|
|
1653
|
-
//
|
|
1654
|
-
content.push({ type: "text", text: reasoningContent });
|
|
1628
|
+
} else if (!reasoningContent) {
|
|
1629
|
+
// No content and no reasoning — will be handled by the empty check below
|
|
1655
1630
|
}
|
|
1656
1631
|
|
|
1657
1632
|
// Convert tool calls
|
|
@@ -1791,7 +1766,7 @@ async function invokeVertex(body) {
|
|
|
1791
1766
|
contents,
|
|
1792
1767
|
generationConfig: {
|
|
1793
1768
|
temperature: body.temperature ?? 0.7,
|
|
1794
|
-
maxOutputTokens: body.max_tokens ||
|
|
1769
|
+
maxOutputTokens: body.max_tokens || 16384,
|
|
1795
1770
|
topP: body.top_p ?? 1.0,
|
|
1796
1771
|
}
|
|
1797
1772
|
};
|
|
@@ -2000,6 +1975,54 @@ function convertGeminiToAnthropic(response, requestedModel) {
|
|
|
2000
1975
|
};
|
|
2001
1976
|
}
|
|
2002
1977
|
|
|
1978
|
+
async function invokeCodex(body) {
|
|
1979
|
+
const { getCodexProcess } = require("./codex-process");
|
|
1980
|
+
const { convertAnthropicToCodexPrompt, convertCodexResponseToAnthropic } = require("./codex-utils");
|
|
1981
|
+
|
|
1982
|
+
const codex = getCodexProcess();
|
|
1983
|
+
await codex.ensureRunning();
|
|
1984
|
+
|
|
1985
|
+
const model = body._tierModel || config.codex?.model || "gpt-5.3-codex";
|
|
1986
|
+
const { prompt, systemContext } = convertAnthropicToCodexPrompt(body);
|
|
1987
|
+
|
|
1988
|
+
if (!prompt) {
|
|
1989
|
+
throw new Error("Codex: no prompt content to send");
|
|
1990
|
+
}
|
|
1991
|
+
|
|
1992
|
+
// Start a new thread
|
|
1993
|
+
const threadParams = { model };
|
|
1994
|
+
if (systemContext) {
|
|
1995
|
+
threadParams.instructions = systemContext;
|
|
1996
|
+
}
|
|
1997
|
+
const threadResult = await codex.sendRequest("thread/start", threadParams);
|
|
1998
|
+
const threadId = threadResult?.threadId || threadResult?.id;
|
|
1999
|
+
|
|
2000
|
+
if (!threadId) {
|
|
2001
|
+
throw new Error("Codex: thread/start did not return a threadId");
|
|
2002
|
+
}
|
|
2003
|
+
|
|
2004
|
+
logger.debug({ threadId, model, promptLength: prompt.length }, "[Codex] Thread started");
|
|
2005
|
+
|
|
2006
|
+
// Send the turn and collect response
|
|
2007
|
+
const turnResult = await codex.sendTurn(threadId, prompt, model);
|
|
2008
|
+
|
|
2009
|
+
logger.debug({
|
|
2010
|
+
threadId,
|
|
2011
|
+
responseLength: turnResult.text?.length || 0,
|
|
2012
|
+
}, "[Codex] Turn completed");
|
|
2013
|
+
|
|
2014
|
+
// Convert to Anthropic format
|
|
2015
|
+
const anthropicJson = convertCodexResponseToAnthropic(turnResult, model);
|
|
2016
|
+
|
|
2017
|
+
return {
|
|
2018
|
+
ok: true,
|
|
2019
|
+
status: 200,
|
|
2020
|
+
json: anthropicJson,
|
|
2021
|
+
text: JSON.stringify(anthropicJson),
|
|
2022
|
+
contentType: "application/json",
|
|
2023
|
+
};
|
|
2024
|
+
}
|
|
2025
|
+
|
|
2003
2026
|
async function invokeModel(body, options = {}) {
|
|
2004
2027
|
const { determineProviderSmart, isFallbackEnabled, getFallbackProvider } = require("./routing");
|
|
2005
2028
|
const metricsCollector = getMetricsCollector();
|
|
@@ -2007,9 +2030,11 @@ async function invokeModel(body, options = {}) {
|
|
|
2007
2030
|
const healthTracker = getHealthTracker();
|
|
2008
2031
|
|
|
2009
2032
|
// Determine provider via async tier routing
|
|
2033
|
+
// Thread workspace for code-graph integration (from X-Lynkr-Workspace header or body._workspace)
|
|
2034
|
+
const workspace = body._workspace || options.workspace || null;
|
|
2010
2035
|
const routingResult = options.forceProvider
|
|
2011
2036
|
? { provider: options.forceProvider, model: null, method: 'forced' }
|
|
2012
|
-
: await determineProviderSmart(body);
|
|
2037
|
+
: await determineProviderSmart(body, { workspace });
|
|
2013
2038
|
const initialProvider = routingResult.provider;
|
|
2014
2039
|
const tierSelectedModel = routingResult.model;
|
|
2015
2040
|
|
|
@@ -2018,6 +2043,11 @@ async function invokeModel(body, options = {}) {
|
|
|
2018
2043
|
body._tierModel = tierSelectedModel;
|
|
2019
2044
|
}
|
|
2020
2045
|
|
|
2046
|
+
// Inject provider-side prompt caching (cache_control breakpoints)
|
|
2047
|
+
// Reduces input token cost by up to 90% and latency by up to 80%
|
|
2048
|
+
const { injectPromptCaching } = require('./prompt-cache-injection');
|
|
2049
|
+
injectPromptCaching(body, initialProvider);
|
|
2050
|
+
|
|
2021
2051
|
// Build routing decision object for response headers
|
|
2022
2052
|
const routingDecision = {
|
|
2023
2053
|
provider: initialProvider,
|
|
@@ -2081,6 +2111,8 @@ async function invokeModel(body, options = {}) {
|
|
|
2081
2111
|
return await invokeVertex(body);
|
|
2082
2112
|
} else if (initialProvider === "moonshot") {
|
|
2083
2113
|
return await invokeMoonshot(body);
|
|
2114
|
+
} else if (initialProvider === "codex") {
|
|
2115
|
+
return await invokeCodex(body);
|
|
2084
2116
|
}
|
|
2085
2117
|
return await invokeDatabricks(body);
|
|
2086
2118
|
});
|
|
@@ -2091,10 +2123,13 @@ async function invokeModel(body, options = {}) {
|
|
|
2091
2123
|
metricsCollector.recordDatabricksRequest(true, retries);
|
|
2092
2124
|
healthTracker.recordSuccess(initialProvider, latency);
|
|
2093
2125
|
|
|
2126
|
+
// Record latency for routing intelligence
|
|
2127
|
+
getLatencyTracker().record(initialProvider, latency);
|
|
2128
|
+
|
|
2094
2129
|
// Record tokens and cost savings
|
|
2130
|
+
const outputTokens = result.json?.usage?.output_tokens || result.json?.usage?.completion_tokens || 0;
|
|
2131
|
+
const inputTokens = result.json?.usage?.input_tokens || result.json?.usage?.prompt_tokens || 0;
|
|
2095
2132
|
if (result.json?.usage) {
|
|
2096
|
-
const inputTokens = result.json.usage.input_tokens || result.json.usage.prompt_tokens || 0;
|
|
2097
|
-
const outputTokens = result.json.usage.output_tokens || result.json.usage.completion_tokens || 0;
|
|
2098
2133
|
metricsCollector.recordTokens(inputTokens, outputTokens);
|
|
2099
2134
|
|
|
2100
2135
|
// Estimate cost savings if Ollama was used
|
|
@@ -2104,6 +2139,53 @@ async function invokeModel(body, options = {}) {
|
|
|
2104
2139
|
}
|
|
2105
2140
|
}
|
|
2106
2141
|
|
|
2142
|
+
// Count tool calls in response
|
|
2143
|
+
const toolCallsMade = result.json?.content?.filter?.(
|
|
2144
|
+
(b) => b.type === "tool_use"
|
|
2145
|
+
)?.length || 0;
|
|
2146
|
+
|
|
2147
|
+
// Compute quality score
|
|
2148
|
+
const qualityScore = scoreResponseQuality(
|
|
2149
|
+
{ tier: routingDecision.tier, hasTools: Array.isArray(body?.tools) && body.tools.length > 0 },
|
|
2150
|
+
null,
|
|
2151
|
+
{
|
|
2152
|
+
status_code: 200,
|
|
2153
|
+
output_tokens: outputTokens,
|
|
2154
|
+
tool_calls_made: toolCallsMade,
|
|
2155
|
+
was_fallback: false,
|
|
2156
|
+
retry_count: retries,
|
|
2157
|
+
error_type: null,
|
|
2158
|
+
latency_ms: latency,
|
|
2159
|
+
}
|
|
2160
|
+
);
|
|
2161
|
+
|
|
2162
|
+
// Record routing telemetry (non-blocking)
|
|
2163
|
+
telemetry.record({
|
|
2164
|
+
request_id: crypto.randomUUID(),
|
|
2165
|
+
session_id: body._sessionId || null,
|
|
2166
|
+
timestamp: Date.now(),
|
|
2167
|
+
complexity_score: routingResult.score ?? null,
|
|
2168
|
+
tier: routingDecision.tier,
|
|
2169
|
+
agentic_type: routingResult.agenticResult?.agentType || null,
|
|
2170
|
+
tool_count: Array.isArray(body?.tools) ? body.tools.length : 0,
|
|
2171
|
+
input_tokens: inputTokens || null,
|
|
2172
|
+
message_count: Array.isArray(body?.messages) ? body.messages.length : 0,
|
|
2173
|
+
request_type: routingResult.analysis?.requestType || null,
|
|
2174
|
+
provider: initialProvider,
|
|
2175
|
+
model: routingDecision.model,
|
|
2176
|
+
routing_method: routingDecision.method,
|
|
2177
|
+
was_fallback: false,
|
|
2178
|
+
output_tokens: outputTokens || null,
|
|
2179
|
+
latency_ms: latency,
|
|
2180
|
+
status_code: 200,
|
|
2181
|
+
error_type: null,
|
|
2182
|
+
tool_calls_made: toolCallsMade,
|
|
2183
|
+
retry_count: retries,
|
|
2184
|
+
circuit_breaker_state: breaker.state,
|
|
2185
|
+
quality_score: qualityScore,
|
|
2186
|
+
tokens_per_second: outputTokens && latency > 0 ? outputTokens / (latency / 1000) : null,
|
|
2187
|
+
});
|
|
2188
|
+
|
|
2107
2189
|
// Return result with provider info and routing decision for headers
|
|
2108
2190
|
return {
|
|
2109
2191
|
...result,
|
|
@@ -2113,8 +2195,10 @@ async function invokeModel(body, options = {}) {
|
|
|
2113
2195
|
|
|
2114
2196
|
} catch (err) {
|
|
2115
2197
|
// Record failure
|
|
2198
|
+
const failLatency = Date.now() - startTime;
|
|
2116
2199
|
metricsCollector.recordProviderFailure(initialProvider);
|
|
2117
2200
|
healthTracker.recordFailure(initialProvider, err, err.status);
|
|
2201
|
+
getLatencyTracker().record(initialProvider, failLatency);
|
|
2118
2202
|
|
|
2119
2203
|
// Check if we should fallback (any provider can fall back, not just ollama)
|
|
2120
2204
|
const shouldFallback =
|
|
@@ -2124,6 +2208,33 @@ async function invokeModel(body, options = {}) {
|
|
|
2124
2208
|
|
|
2125
2209
|
if (!shouldFallback) {
|
|
2126
2210
|
metricsCollector.recordDatabricksRequest(false, retries);
|
|
2211
|
+
|
|
2212
|
+
// Record failed telemetry
|
|
2213
|
+
telemetry.record({
|
|
2214
|
+
request_id: crypto.randomUUID(),
|
|
2215
|
+
session_id: body._sessionId || null,
|
|
2216
|
+
timestamp: Date.now(),
|
|
2217
|
+
complexity_score: routingResult.score ?? null,
|
|
2218
|
+
tier: routingDecision.tier,
|
|
2219
|
+
agentic_type: routingResult.agenticResult?.agentType || null,
|
|
2220
|
+
tool_count: Array.isArray(body?.tools) ? body.tools.length : 0,
|
|
2221
|
+
input_tokens: null,
|
|
2222
|
+
message_count: Array.isArray(body?.messages) ? body.messages.length : 0,
|
|
2223
|
+
request_type: routingResult.analysis?.requestType || null,
|
|
2224
|
+
provider: initialProvider,
|
|
2225
|
+
model: routingDecision.model,
|
|
2226
|
+
routing_method: routingDecision.method,
|
|
2227
|
+
was_fallback: false,
|
|
2228
|
+
latency_ms: failLatency,
|
|
2229
|
+
status_code: err.status || null,
|
|
2230
|
+
error_type: err.code || err.name || "unknown",
|
|
2231
|
+
quality_score: scoreResponseQuality(
|
|
2232
|
+
{ tier: routingDecision.tier, hasTools: Array.isArray(body?.tools) && body.tools.length > 0 },
|
|
2233
|
+
null,
|
|
2234
|
+
{ error_type: err.code || err.name, was_fallback: false, retry_count: retries, latency_ms: failLatency }
|
|
2235
|
+
),
|
|
2236
|
+
});
|
|
2237
|
+
|
|
2127
2238
|
throw err;
|
|
2128
2239
|
}
|
|
2129
2240
|
|
|
@@ -2197,6 +2308,45 @@ async function invokeModel(body, options = {}) {
|
|
|
2197
2308
|
totalLatency: Date.now() - startTime,
|
|
2198
2309
|
}, "Fallback to cloud provider succeeded");
|
|
2199
2310
|
|
|
2311
|
+
// Record latency for fallback provider
|
|
2312
|
+
getLatencyTracker().record(fallbackProvider, fallbackLatency);
|
|
2313
|
+
|
|
2314
|
+
// Capture fallback telemetry
|
|
2315
|
+
const fbOutputTokens = fallbackResult.json?.usage?.output_tokens || fallbackResult.json?.usage?.completion_tokens || 0;
|
|
2316
|
+
const fbInputTokens = fallbackResult.json?.usage?.input_tokens || fallbackResult.json?.usage?.prompt_tokens || 0;
|
|
2317
|
+
const fbToolCalls = fallbackResult.json?.content?.filter?.(
|
|
2318
|
+
(b) => b.type === "tool_use"
|
|
2319
|
+
)?.length || 0;
|
|
2320
|
+
|
|
2321
|
+
telemetry.record({
|
|
2322
|
+
request_id: crypto.randomUUID(),
|
|
2323
|
+
session_id: body._sessionId || null,
|
|
2324
|
+
timestamp: Date.now(),
|
|
2325
|
+
complexity_score: routingResult.score ?? null,
|
|
2326
|
+
tier: routingDecision.tier,
|
|
2327
|
+
agentic_type: routingResult.agenticResult?.agentType || null,
|
|
2328
|
+
tool_count: Array.isArray(body?.tools) ? body.tools.length : 0,
|
|
2329
|
+
input_tokens: fbInputTokens || null,
|
|
2330
|
+
message_count: Array.isArray(body?.messages) ? body.messages.length : 0,
|
|
2331
|
+
request_type: routingResult.analysis?.requestType || null,
|
|
2332
|
+
provider: fallbackProvider,
|
|
2333
|
+
model: routingDecision.model,
|
|
2334
|
+
routing_method: "fallback",
|
|
2335
|
+
was_fallback: true,
|
|
2336
|
+
output_tokens: fbOutputTokens || null,
|
|
2337
|
+
latency_ms: Date.now() - startTime,
|
|
2338
|
+
status_code: 200,
|
|
2339
|
+
error_type: null,
|
|
2340
|
+
tool_calls_made: fbToolCalls,
|
|
2341
|
+
retry_count: 0,
|
|
2342
|
+
quality_score: scoreResponseQuality(
|
|
2343
|
+
{ tier: routingDecision.tier, hasTools: Array.isArray(body?.tools) && body.tools.length > 0 },
|
|
2344
|
+
null,
|
|
2345
|
+
{ status_code: 200, output_tokens: fbOutputTokens, tool_calls_made: fbToolCalls, was_fallback: true, retry_count: 0, latency_ms: Date.now() - startTime }
|
|
2346
|
+
),
|
|
2347
|
+
tokens_per_second: fbOutputTokens && fallbackLatency > 0 ? fbOutputTokens / (fallbackLatency / 1000) : null,
|
|
2348
|
+
});
|
|
2349
|
+
|
|
2200
2350
|
// Return result with actual provider used (fallback provider) and routing decision
|
|
2201
2351
|
return {
|
|
2202
2352
|
...fallbackResult,
|
|
@@ -2215,6 +2365,23 @@ async function invokeModel(body, options = {}) {
|
|
|
2215
2365
|
metricsCollector.recordDatabricksRequest(false, retries);
|
|
2216
2366
|
healthTracker.recordFailure(fallbackProvider, fallbackErr, fallbackErr.status);
|
|
2217
2367
|
|
|
2368
|
+
// Record double-failure telemetry
|
|
2369
|
+
telemetry.record({
|
|
2370
|
+
request_id: crypto.randomUUID(),
|
|
2371
|
+
session_id: body._sessionId || null,
|
|
2372
|
+
timestamp: Date.now(),
|
|
2373
|
+
complexity_score: routingResult.score ?? null,
|
|
2374
|
+
tier: routingDecision.tier,
|
|
2375
|
+
provider: fallbackProvider,
|
|
2376
|
+
model: routingDecision.model,
|
|
2377
|
+
routing_method: "fallback",
|
|
2378
|
+
was_fallback: true,
|
|
2379
|
+
latency_ms: Date.now() - startTime,
|
|
2380
|
+
status_code: fallbackErr.status || null,
|
|
2381
|
+
error_type: fallbackErr.code || fallbackErr.name || "double_failure",
|
|
2382
|
+
quality_score: 0,
|
|
2383
|
+
});
|
|
2384
|
+
|
|
2218
2385
|
logger.error({
|
|
2219
2386
|
originalProvider: initialProvider,
|
|
2220
2387
|
fallbackProvider,
|
|
@@ -77,25 +77,29 @@ async function hasAnthropicEndpoint(baseUrl) {
|
|
|
77
77
|
if (anthropicEndpointAvailable !== null) return anthropicEndpointAvailable;
|
|
78
78
|
|
|
79
79
|
try {
|
|
80
|
-
//
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
body: JSON.stringify({
|
|
88
|
-
model: "probe",
|
|
89
|
-
max_tokens: 1,
|
|
90
|
-
messages: [{ role: "user", content: "hi" }],
|
|
91
|
-
}),
|
|
80
|
+
// Check Ollama version — /v1/messages requires v0.14.0+
|
|
81
|
+
// This is instant (no LLM inference) vs the old probe that sent a real request
|
|
82
|
+
const controller = new AbortController();
|
|
83
|
+
const timeout = setTimeout(() => controller.abort(), 3000);
|
|
84
|
+
const versionRes = await fetch(`${baseUrl}/api/version`, {
|
|
85
|
+
method: "GET",
|
|
86
|
+
signal: controller.signal,
|
|
92
87
|
});
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
88
|
+
clearTimeout(timeout);
|
|
89
|
+
|
|
90
|
+
if (versionRes.ok) {
|
|
91
|
+
const versionData = await versionRes.json().catch(() => null);
|
|
92
|
+
const version = versionData?.version || "0.0.0";
|
|
93
|
+
const [major, minor] = version.split(".").map(Number);
|
|
94
|
+
|
|
95
|
+
// v0.14.0+ has the Anthropic Messages API
|
|
96
|
+
anthropicEndpointAvailable = major > 0 || (major === 0 && minor >= 14);
|
|
97
|
+
} else {
|
|
98
|
+
// Can't determine version — fall back to legacy
|
|
99
|
+
anthropicEndpointAvailable = false;
|
|
100
|
+
}
|
|
97
101
|
logger.info(
|
|
98
|
-
{ available: anthropicEndpointAvailable, status:
|
|
102
|
+
{ available: anthropicEndpointAvailable, status: versionRes.status },
|
|
99
103
|
anthropicEndpointAvailable
|
|
100
104
|
? "Ollama Anthropic API detected (/v1/messages) — using native passthrough"
|
|
101
105
|
: "Ollama Anthropic API not available — falling back to legacy /api/chat (upgrade to Ollama v0.14.0+ for best results)"
|
|
@@ -60,13 +60,16 @@ function convertOpenAIToAnthropic(openaiRequest) {
|
|
|
60
60
|
if (part.type === "text") {
|
|
61
61
|
return { type: "text", text: part.text };
|
|
62
62
|
} else if (part.type === "image_url") {
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
63
|
+
const url = part.image_url?.url || "";
|
|
64
|
+
if (url.startsWith("data:")) {
|
|
65
|
+
const match = url.match(/^data:(image\/[^;]+);base64,(.+)$/);
|
|
66
|
+
if (match) {
|
|
67
|
+
return { type: "image", source: { type: "base64", media_type: match[1], data: match[2] } };
|
|
68
68
|
}
|
|
69
|
-
}
|
|
69
|
+
}
|
|
70
|
+
return { type: "image", source: { type: "url", url } };
|
|
71
|
+
} else if (part.type === "document" || part.type === "image") {
|
|
72
|
+
return part;
|
|
70
73
|
}
|
|
71
74
|
return part;
|
|
72
75
|
});
|
|
@@ -208,10 +211,16 @@ function convertAnthropicToOpenAI(anthropicResponse, model = "claude-3-5-sonnet-
|
|
|
208
211
|
// Convert content blocks to OpenAI format
|
|
209
212
|
let messageContent = "";
|
|
210
213
|
const toolCalls = [];
|
|
214
|
+
let citations = [];
|
|
211
215
|
|
|
212
216
|
for (const block of content) {
|
|
213
217
|
if (block.type === "text") {
|
|
214
218
|
messageContent += block.text;
|
|
219
|
+
if (Array.isArray(block.citations)) {
|
|
220
|
+
citations.push(...block.citations);
|
|
221
|
+
}
|
|
222
|
+
} else if (block.type === "thinking") {
|
|
223
|
+
// Skip thinking blocks in OpenAI format (they don't have an equivalent)
|
|
215
224
|
} else if (block.type === "tool_use") {
|
|
216
225
|
toolCalls.push({
|
|
217
226
|
id: block.id,
|
|
@@ -249,6 +258,11 @@ function convertAnthropicToOpenAI(anthropicResponse, model = "claude-3-5-sonnet-
|
|
|
249
258
|
}
|
|
250
259
|
};
|
|
251
260
|
|
|
261
|
+
// Add citations if present
|
|
262
|
+
if (citations.length > 0) {
|
|
263
|
+
openaiResponse.citations = citations;
|
|
264
|
+
}
|
|
265
|
+
|
|
252
266
|
// Add tool_calls if present
|
|
253
267
|
if (toolCalls.length > 0) {
|
|
254
268
|
openaiResponse.choices[0].message.tool_calls = toolCalls;
|