lynkr 8.0.1 → 9.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. package/README.md +238 -315
  2. package/bin/cli.js +16 -3
  3. package/index.js +7 -3
  4. package/install.sh +3 -3
  5. package/lynkr-skill.tar.gz +0 -0
  6. package/native/Cargo.toml +26 -0
  7. package/native/index.js +29 -0
  8. package/native/lynkr-native.node +0 -0
  9. package/native/src/lib.rs +321 -0
  10. package/package.json +8 -6
  11. package/src/api/files-multipart.js +30 -0
  12. package/src/api/files-router.js +81 -0
  13. package/src/api/openai-router.js +379 -308
  14. package/src/api/providers-handler.js +171 -3
  15. package/src/api/router.js +109 -5
  16. package/src/cache/prompt.js +13 -0
  17. package/src/clients/circuit-breaker.js +10 -247
  18. package/src/clients/codex-process.js +342 -0
  19. package/src/clients/codex-utils.js +143 -0
  20. package/src/clients/databricks.js +243 -76
  21. package/src/clients/ollama-utils.js +21 -17
  22. package/src/clients/openai-format.js +20 -6
  23. package/src/clients/openrouter-utils.js +42 -37
  24. package/src/clients/prompt-cache-injection.js +140 -0
  25. package/src/clients/provider-capabilities.js +41 -0
  26. package/src/clients/resilience.js +540 -0
  27. package/src/clients/responses-format.js +8 -7
  28. package/src/clients/retry.js +22 -167
  29. package/src/clients/standard-tools.js +1 -1
  30. package/src/clients/xml-tool-extractor.js +307 -0
  31. package/src/cluster.js +82 -0
  32. package/src/config/index.js +66 -0
  33. package/src/context/compression.js +42 -9
  34. package/src/context/distill.js +507 -0
  35. package/src/context/tool-result-compressor.js +563 -0
  36. package/src/memory/extractor.js +22 -0
  37. package/src/orchestrator/index.js +147 -205
  38. package/src/routing/complexity-analyzer.js +258 -5
  39. package/src/routing/index.js +15 -34
  40. package/src/routing/latency-tracker.js +148 -0
  41. package/src/routing/model-tiers.js +2 -0
  42. package/src/routing/quality-scorer.js +113 -0
  43. package/src/routing/telemetry.js +502 -0
  44. package/src/server.js +23 -0
  45. package/src/stores/file-store.js +69 -0
  46. package/src/stores/response-store.js +25 -0
  47. package/src/tools/code-graph.js +538 -0
  48. package/src/tools/code-mode.js +304 -0
  49. package/src/tools/index.js +1 -1
  50. package/src/tools/lazy-loader.js +11 -0
  51. package/src/tools/mcp-remote.js +7 -0
  52. package/src/tools/smart-selection.js +11 -0
  53. package/src/tools/web.js +1 -1
  54. package/src/utils/payload.js +206 -0
  55. package/src/utils/perf-timer.js +80 -0
@@ -1,3 +1,4 @@
1
+ const crypto = require("crypto");
1
2
  const config = require("../config");
2
3
  const http = require("http");
3
4
  const https = require("https");
@@ -5,6 +6,7 @@ const { withRetry } = require("./retry");
5
6
  const { getCircuitBreakerRegistry } = require("./circuit-breaker");
6
7
  const { getMetricsCollector } = require("../observability/metrics");
7
8
  const { getHealthTracker } = require("../observability/health-tracker");
9
+ const { createBulkhead } = require("./resilience");
8
10
  const logger = require("../logger");
9
11
  const { STANDARD_TOOLS, STANDARD_TOOL_NAMES } = require("./standard-tools");
10
12
  const { convertAnthropicToolsToOpenRouter } = require("./openrouter-utils");
@@ -12,6 +14,9 @@ const {
12
14
  detectModelFamily
13
15
  } = require("./bedrock-utils");
14
16
  const { getGPTSystemPromptAddendum } = require("./gpt-utils");
17
+ const telemetry = require("../routing/telemetry");
18
+ const { scoreResponseQuality } = require("../routing/quality-scorer");
19
+ const { getLatencyTracker } = require("../routing/latency-tracker");
15
20
 
16
21
 
17
22
 
@@ -20,70 +25,29 @@ if (typeof fetch !== "function") {
20
25
  throw new Error("Node 18+ is required for the built-in fetch API.");
21
26
  }
22
27
 
23
- /**
24
- * Simple Semaphore for limiting concurrent requests
25
- * Used to prevent Z.AI rate limiting from parallel Claude Code CLI calls
26
- */
27
- class Semaphore {
28
- constructor(maxConcurrent = 2) {
29
- this.maxConcurrent = maxConcurrent;
30
- this.current = 0;
31
- this.queue = [];
32
- }
33
-
34
- async acquire() {
35
- if (this.current < this.maxConcurrent) {
36
- this.current++;
37
- return;
38
- }
39
-
40
- // Wait in queue
41
- return new Promise((resolve) => {
42
- this.queue.push(resolve);
43
- });
44
- }
45
-
46
- release() {
47
- this.current--;
48
- if (this.queue.length > 0 && this.current < this.maxConcurrent) {
49
- this.current++;
50
- const next = this.queue.shift();
51
- next();
52
- }
53
- }
54
-
55
- async run(fn) {
56
- await this.acquire();
57
- try {
58
- return await fn();
59
- } finally {
60
- this.release();
61
- }
62
- }
63
- }
64
-
65
- // Z.AI request semaphore - limit concurrent requests to avoid rate limiting
28
+ // Z.AI request bulkhead - limit concurrent requests to avoid rate limiting
66
29
  // Configurable via ZAI_MAX_CONCURRENT env var (default: 2)
67
30
  const zaiMaxConcurrent = parseInt(process.env.ZAI_MAX_CONCURRENT || '2', 10);
68
- const zaiSemaphore = new Semaphore(zaiMaxConcurrent);
69
- logger.info({ maxConcurrent: zaiMaxConcurrent }, "Z.AI semaphore initialized");
31
+ const zaiSemaphore = createBulkhead({ maxConcurrent: zaiMaxConcurrent, maxQueue: 50 });
32
+ logger.info({ maxConcurrent: zaiMaxConcurrent }, "Z.AI bulkhead initialized");
70
33
 
71
34
 
72
35
 
73
36
  // HTTP connection pooling for better performance
37
+ // Increased maxSockets for high-concurrency team deployments (50+ devs)
74
38
  const httpAgent = new http.Agent({
75
39
  keepAlive: true,
76
- maxSockets: 50,
77
- maxFreeSockets: 10,
78
- timeout: 60000,
40
+ maxSockets: 200,
41
+ maxFreeSockets: 20,
42
+ timeout: 120000,
79
43
  keepAliveMsecs: 30000,
80
44
  });
81
45
 
82
46
  const httpsAgent = new https.Agent({
83
47
  keepAlive: true,
84
- maxSockets: 50,
85
- maxFreeSockets: 10,
86
- timeout: 60000,
48
+ maxSockets: 200,
49
+ maxFreeSockets: 20,
50
+ timeout: 120000,
87
51
  keepAliveMsecs: 30000,
88
52
  });
89
53
 
@@ -307,8 +271,8 @@ async function invokeOllama(body) {
307
271
  const ollamaBody = {
308
272
  model: modelName,
309
273
  messages: body.messages,
310
- max_tokens: body.max_tokens || 4096,
311
- stream: false,
274
+ max_tokens: body.max_tokens || 16384,
275
+ stream: body.stream ?? false,
312
276
  };
313
277
 
314
278
  if (body.system) ollamaBody.system = body.system;
@@ -375,7 +339,7 @@ async function invokeOllama(body) {
375
339
  const ollamaBody = {
376
340
  model: modelName,
377
341
  messages: deduplicated,
378
- stream: false,
342
+ stream: body.stream ?? false,
379
343
  options: {
380
344
  temperature: body.temperature ?? 0.7,
381
345
  num_predict: body.max_tokens ?? 16384,
@@ -432,7 +396,7 @@ async function invokeOpenRouter(body) {
432
396
  model: body._suggestionModeModel || body._tierModel || config.openrouter.model,
433
397
  messages,
434
398
  temperature: body.temperature ?? 0.7,
435
- max_tokens: body.max_tokens ?? 4096,
399
+ max_tokens: body.max_tokens ?? 16384,
436
400
  top_p: body.top_p ?? 1.0,
437
401
  stream: body.stream ?? false
438
402
  };
@@ -515,7 +479,7 @@ async function invokeAzureOpenAI(body) {
515
479
  const azureBody = {
516
480
  messages,
517
481
  temperature: body.temperature ?? 0.3, // Lower temperature for more deterministic, action-oriented behavior
518
- max_tokens: Math.min(body.max_tokens ?? 4096, 16384), // Cap at Azure OpenAI's limit
482
+ max_tokens: Math.min(body.max_tokens ?? 16384, 16384), // Cap at Azure OpenAI's limit
519
483
  top_p: body.top_p ?? 1.0,
520
484
  stream: false, // Force non-streaming for Azure OpenAI - streaming format conversion not yet implemented
521
485
  model: body._suggestionModeModel || body._tierModel || config.azureOpenAI.deployment
@@ -911,7 +875,7 @@ async function invokeOpenAI(body) {
911
875
  model: body._suggestionModeModel || body._tierModel || config.openai.model || "gpt-4o",
912
876
  messages,
913
877
  temperature: body.temperature ?? 0.7,
914
- max_tokens: body.max_tokens ?? 4096,
878
+ max_tokens: body.max_tokens ?? 16384,
915
879
  top_p: body.top_p ?? 1.0,
916
880
  stream: body.stream ?? false
917
881
  };
@@ -1012,7 +976,7 @@ async function invokeLlamaCpp(body) {
1012
976
  const llamacppBody = {
1013
977
  messages: deduplicated,
1014
978
  temperature: body.temperature ?? 0.7,
1015
- max_tokens: body.max_tokens ?? 4096,
979
+ max_tokens: body.max_tokens ?? 16384,
1016
980
  top_p: body.top_p ?? 1.0,
1017
981
  stream: body.stream ?? false
1018
982
  };
@@ -1096,7 +1060,7 @@ async function invokeLMStudio(body) {
1096
1060
  const lmstudioBody = {
1097
1061
  messages,
1098
1062
  temperature: body.temperature ?? 0.7,
1099
- max_tokens: body.max_tokens ?? 4096,
1063
+ max_tokens: body.max_tokens ?? 16384,
1100
1064
  top_p: body.top_p ?? 1.0,
1101
1065
  stream: body.stream ?? false
1102
1066
  };
@@ -1411,7 +1375,7 @@ async function invokeZai(body) {
1411
1375
  zaiBody = {
1412
1376
  model: mappedModel,
1413
1377
  messages,
1414
- max_tokens: body.max_tokens || 4096,
1378
+ max_tokens: body.max_tokens || 16384,
1415
1379
  temperature: body.temperature ?? 0.7,
1416
1380
  stream: body.stream,
1417
1381
  };
@@ -1473,12 +1437,9 @@ async function invokeZai(body) {
1473
1437
  zaiBody: JSON.stringify(zaiBody).substring(0, 1000),
1474
1438
  }, "Z.AI request body (truncated)");
1475
1439
 
1476
- // Use semaphore to limit concurrent Z.AI requests (prevents rate limiting)
1477
- return zaiSemaphore.run(async () => {
1478
- logger.debug({
1479
- queueLength: zaiSemaphore.queue.length,
1480
- currentConcurrent: zaiSemaphore.current,
1481
- }, "Z.AI semaphore status");
1440
+ // Use bulkhead to limit concurrent Z.AI requests (prevents rate limiting)
1441
+ return zaiSemaphore.execute(async () => {
1442
+ logger.debug("Z.AI bulkhead executing request");
1482
1443
 
1483
1444
  const response = await performJsonRequest(endpoint, { headers, body: zaiBody }, "Z.AI");
1484
1445
 
@@ -1560,7 +1521,7 @@ async function invokeMoonshot(body) {
1560
1521
  const moonshotBody = {
1561
1522
  model: mappedModel,
1562
1523
  messages,
1563
- max_tokens: body.max_tokens || 4096,
1524
+ max_tokens: body.max_tokens || 16384,
1564
1525
  temperature: body.temperature ?? 0.7,
1565
1526
  top_p: body.top_p ?? 1.0,
1566
1527
  stream: false, // Force non-streaming - OpenAI SSE to Anthropic SSE conversion not implemented
@@ -1638,20 +1599,34 @@ function convertOpenAIToAnthropic(response) {
1638
1599
  const message = choice.message || {};
1639
1600
  const content = [];
1640
1601
 
1602
+ // Extract tool calls embedded as XML/text in content (Minimax, Qwen, GLM, etc.)
1603
+ if (!message.tool_calls?.length && typeof message.content === "string" && message.content.trim()) {
1604
+ const { extractToolCallsFromText } = require("./xml-tool-extractor");
1605
+ const extracted = extractToolCallsFromText(message.content);
1606
+ if (extracted.toolCalls.length > 0) {
1607
+ message.tool_calls = extracted.toolCalls;
1608
+ message.content = extracted.cleanedText;
1609
+ choice.finish_reason = "tool_calls";
1610
+ }
1611
+ }
1612
+
1641
1613
  // Add text content from message.content
1642
1614
  // Don't add placeholder text if there are tool_calls - tools are the actual response
1643
1615
  const hasToolCalls = Array.isArray(message.tool_calls) && message.tool_calls.length > 0;
1644
1616
 
1645
- // Extract text content - handle thinking models that split content/reasoning
1617
+ // Extract text content and reasoning from thinking models
1646
1618
  const textContent = typeof message.content === 'string' ? message.content : '';
1647
1619
  const reasoningContent = typeof message.reasoning_content === 'string' ? message.reasoning_content : '';
1648
1620
 
1621
+ // Emit reasoning_content as a proper thinking block (not discarded)
1622
+ if (reasoningContent) {
1623
+ content.push({ type: "thinking", thinking: reasoningContent });
1624
+ }
1625
+
1649
1626
  if (textContent) {
1650
- // Has regular content - use it directly (ignore reasoning_content chain-of-thought)
1651
1627
  content.push({ type: "text", text: textContent });
1652
- } else if (reasoningContent) {
1653
- // Fallback: thinking models where content is empty but reasoning has the output
1654
- content.push({ type: "text", text: reasoningContent });
1628
+ } else if (!reasoningContent) {
1629
+ // No content and no reasoning — handled by the empty check below
1655
1630
  }
1656
1631
 
1657
1632
  // Convert tool calls
@@ -1791,7 +1766,7 @@ async function invokeVertex(body) {
1791
1766
  contents,
1792
1767
  generationConfig: {
1793
1768
  temperature: body.temperature ?? 0.7,
1794
- maxOutputTokens: body.max_tokens || 4096,
1769
+ maxOutputTokens: body.max_tokens || 16384,
1795
1770
  topP: body.top_p ?? 1.0,
1796
1771
  }
1797
1772
  };
@@ -2000,6 +1975,54 @@ function convertGeminiToAnthropic(response, requestedModel) {
2000
1975
  };
2001
1976
  }
2002
1977
 
1978
+ async function invokeCodex(body) {
1979
+ const { getCodexProcess } = require("./codex-process");
1980
+ const { convertAnthropicToCodexPrompt, convertCodexResponseToAnthropic } = require("./codex-utils");
1981
+
1982
+ const codex = getCodexProcess();
1983
+ await codex.ensureRunning();
1984
+
1985
+ const model = body._tierModel || config.codex?.model || "gpt-5.3-codex";
1986
+ const { prompt, systemContext } = convertAnthropicToCodexPrompt(body);
1987
+
1988
+ if (!prompt) {
1989
+ throw new Error("Codex: no prompt content to send");
1990
+ }
1991
+
1992
+ // Start a new thread
1993
+ const threadParams = { model };
1994
+ if (systemContext) {
1995
+ threadParams.instructions = systemContext;
1996
+ }
1997
+ const threadResult = await codex.sendRequest("thread/start", threadParams);
1998
+ const threadId = threadResult?.threadId || threadResult?.id;
1999
+
2000
+ if (!threadId) {
2001
+ throw new Error("Codex: thread/start did not return a threadId");
2002
+ }
2003
+
2004
+ logger.debug({ threadId, model, promptLength: prompt.length }, "[Codex] Thread started");
2005
+
2006
+ // Send the turn and collect response
2007
+ const turnResult = await codex.sendTurn(threadId, prompt, model);
2008
+
2009
+ logger.debug({
2010
+ threadId,
2011
+ responseLength: turnResult.text?.length || 0,
2012
+ }, "[Codex] Turn completed");
2013
+
2014
+ // Convert to Anthropic format
2015
+ const anthropicJson = convertCodexResponseToAnthropic(turnResult, model);
2016
+
2017
+ return {
2018
+ ok: true,
2019
+ status: 200,
2020
+ json: anthropicJson,
2021
+ text: JSON.stringify(anthropicJson),
2022
+ contentType: "application/json",
2023
+ };
2024
+ }
2025
+
2003
2026
  async function invokeModel(body, options = {}) {
2004
2027
  const { determineProviderSmart, isFallbackEnabled, getFallbackProvider } = require("./routing");
2005
2028
  const metricsCollector = getMetricsCollector();
@@ -2007,9 +2030,11 @@ async function invokeModel(body, options = {}) {
2007
2030
  const healthTracker = getHealthTracker();
2008
2031
 
2009
2032
  // Determine provider via async tier routing
2033
+ // Thread workspace for code-graph integration (from X-Lynkr-Workspace header or body._workspace)
2034
+ const workspace = body._workspace || options.workspace || null;
2010
2035
  const routingResult = options.forceProvider
2011
2036
  ? { provider: options.forceProvider, model: null, method: 'forced' }
2012
- : await determineProviderSmart(body);
2037
+ : await determineProviderSmart(body, { workspace });
2013
2038
  const initialProvider = routingResult.provider;
2014
2039
  const tierSelectedModel = routingResult.model;
2015
2040
 
@@ -2018,6 +2043,11 @@ async function invokeModel(body, options = {}) {
2018
2043
  body._tierModel = tierSelectedModel;
2019
2044
  }
2020
2045
 
2046
+ // Inject provider-side prompt caching (cache_control breakpoints)
2047
+ // Reduces input token cost by up to 90% and latency by up to 80%
2048
+ const { injectPromptCaching } = require('./prompt-cache-injection');
2049
+ injectPromptCaching(body, initialProvider);
2050
+
2021
2051
  // Build routing decision object for response headers
2022
2052
  const routingDecision = {
2023
2053
  provider: initialProvider,
@@ -2081,6 +2111,8 @@ async function invokeModel(body, options = {}) {
2081
2111
  return await invokeVertex(body);
2082
2112
  } else if (initialProvider === "moonshot") {
2083
2113
  return await invokeMoonshot(body);
2114
+ } else if (initialProvider === "codex") {
2115
+ return await invokeCodex(body);
2084
2116
  }
2085
2117
  return await invokeDatabricks(body);
2086
2118
  });
@@ -2091,10 +2123,13 @@ async function invokeModel(body, options = {}) {
2091
2123
  metricsCollector.recordDatabricksRequest(true, retries);
2092
2124
  healthTracker.recordSuccess(initialProvider, latency);
2093
2125
 
2126
+ // Record latency for routing intelligence
2127
+ getLatencyTracker().record(initialProvider, latency);
2128
+
2094
2129
  // Record tokens and cost savings
2130
+ const outputTokens = result.json?.usage?.output_tokens || result.json?.usage?.completion_tokens || 0;
2131
+ const inputTokens = result.json?.usage?.input_tokens || result.json?.usage?.prompt_tokens || 0;
2095
2132
  if (result.json?.usage) {
2096
- const inputTokens = result.json.usage.input_tokens || result.json.usage.prompt_tokens || 0;
2097
- const outputTokens = result.json.usage.output_tokens || result.json.usage.completion_tokens || 0;
2098
2133
  metricsCollector.recordTokens(inputTokens, outputTokens);
2099
2134
 
2100
2135
  // Estimate cost savings if Ollama was used
@@ -2104,6 +2139,53 @@ async function invokeModel(body, options = {}) {
2104
2139
  }
2105
2140
  }
2106
2141
 
2142
+ // Count tool calls in response
2143
+ const toolCallsMade = result.json?.content?.filter?.(
2144
+ (b) => b.type === "tool_use"
2145
+ )?.length || 0;
2146
+
2147
+ // Compute quality score
2148
+ const qualityScore = scoreResponseQuality(
2149
+ { tier: routingDecision.tier, hasTools: Array.isArray(body?.tools) && body.tools.length > 0 },
2150
+ null,
2151
+ {
2152
+ status_code: 200,
2153
+ output_tokens: outputTokens,
2154
+ tool_calls_made: toolCallsMade,
2155
+ was_fallback: false,
2156
+ retry_count: retries,
2157
+ error_type: null,
2158
+ latency_ms: latency,
2159
+ }
2160
+ );
2161
+
2162
+ // Record routing telemetry (non-blocking)
2163
+ telemetry.record({
2164
+ request_id: crypto.randomUUID(),
2165
+ session_id: body._sessionId || null,
2166
+ timestamp: Date.now(),
2167
+ complexity_score: routingResult.score ?? null,
2168
+ tier: routingDecision.tier,
2169
+ agentic_type: routingResult.agenticResult?.agentType || null,
2170
+ tool_count: Array.isArray(body?.tools) ? body.tools.length : 0,
2171
+ input_tokens: inputTokens || null,
2172
+ message_count: Array.isArray(body?.messages) ? body.messages.length : 0,
2173
+ request_type: routingResult.analysis?.requestType || null,
2174
+ provider: initialProvider,
2175
+ model: routingDecision.model,
2176
+ routing_method: routingDecision.method,
2177
+ was_fallback: false,
2178
+ output_tokens: outputTokens || null,
2179
+ latency_ms: latency,
2180
+ status_code: 200,
2181
+ error_type: null,
2182
+ tool_calls_made: toolCallsMade,
2183
+ retry_count: retries,
2184
+ circuit_breaker_state: breaker.state,
2185
+ quality_score: qualityScore,
2186
+ tokens_per_second: outputTokens && latency > 0 ? outputTokens / (latency / 1000) : null,
2187
+ });
2188
+
2107
2189
  // Return result with provider info and routing decision for headers
2108
2190
  return {
2109
2191
  ...result,
@@ -2113,8 +2195,10 @@ async function invokeModel(body, options = {}) {
2113
2195
 
2114
2196
  } catch (err) {
2115
2197
  // Record failure
2198
+ const failLatency = Date.now() - startTime;
2116
2199
  metricsCollector.recordProviderFailure(initialProvider);
2117
2200
  healthTracker.recordFailure(initialProvider, err, err.status);
2201
+ getLatencyTracker().record(initialProvider, failLatency);
2118
2202
 
2119
2203
  // Check if we should fallback (any provider can fall back, not just ollama)
2120
2204
  const shouldFallback =
@@ -2124,6 +2208,33 @@ async function invokeModel(body, options = {}) {
2124
2208
 
2125
2209
  if (!shouldFallback) {
2126
2210
  metricsCollector.recordDatabricksRequest(false, retries);
2211
+
2212
+ // Record failed telemetry
2213
+ telemetry.record({
2214
+ request_id: crypto.randomUUID(),
2215
+ session_id: body._sessionId || null,
2216
+ timestamp: Date.now(),
2217
+ complexity_score: routingResult.score ?? null,
2218
+ tier: routingDecision.tier,
2219
+ agentic_type: routingResult.agenticResult?.agentType || null,
2220
+ tool_count: Array.isArray(body?.tools) ? body.tools.length : 0,
2221
+ input_tokens: null,
2222
+ message_count: Array.isArray(body?.messages) ? body.messages.length : 0,
2223
+ request_type: routingResult.analysis?.requestType || null,
2224
+ provider: initialProvider,
2225
+ model: routingDecision.model,
2226
+ routing_method: routingDecision.method,
2227
+ was_fallback: false,
2228
+ latency_ms: failLatency,
2229
+ status_code: err.status || null,
2230
+ error_type: err.code || err.name || "unknown",
2231
+ quality_score: scoreResponseQuality(
2232
+ { tier: routingDecision.tier, hasTools: Array.isArray(body?.tools) && body.tools.length > 0 },
2233
+ null,
2234
+ { error_type: err.code || err.name, was_fallback: false, retry_count: retries, latency_ms: failLatency }
2235
+ ),
2236
+ });
2237
+
2127
2238
  throw err;
2128
2239
  }
2129
2240
 
@@ -2197,6 +2308,45 @@ async function invokeModel(body, options = {}) {
2197
2308
  totalLatency: Date.now() - startTime,
2198
2309
  }, "Fallback to cloud provider succeeded");
2199
2310
 
2311
+ // Record latency for fallback provider
2312
+ getLatencyTracker().record(fallbackProvider, fallbackLatency);
2313
+
2314
+ // Capture fallback telemetry
2315
+ const fbOutputTokens = fallbackResult.json?.usage?.output_tokens || fallbackResult.json?.usage?.completion_tokens || 0;
2316
+ const fbInputTokens = fallbackResult.json?.usage?.input_tokens || fallbackResult.json?.usage?.prompt_tokens || 0;
2317
+ const fbToolCalls = fallbackResult.json?.content?.filter?.(
2318
+ (b) => b.type === "tool_use"
2319
+ )?.length || 0;
2320
+
2321
+ telemetry.record({
2322
+ request_id: crypto.randomUUID(),
2323
+ session_id: body._sessionId || null,
2324
+ timestamp: Date.now(),
2325
+ complexity_score: routingResult.score ?? null,
2326
+ tier: routingDecision.tier,
2327
+ agentic_type: routingResult.agenticResult?.agentType || null,
2328
+ tool_count: Array.isArray(body?.tools) ? body.tools.length : 0,
2329
+ input_tokens: fbInputTokens || null,
2330
+ message_count: Array.isArray(body?.messages) ? body.messages.length : 0,
2331
+ request_type: routingResult.analysis?.requestType || null,
2332
+ provider: fallbackProvider,
2333
+ model: routingDecision.model,
2334
+ routing_method: "fallback",
2335
+ was_fallback: true,
2336
+ output_tokens: fbOutputTokens || null,
2337
+ latency_ms: Date.now() - startTime,
2338
+ status_code: 200,
2339
+ error_type: null,
2340
+ tool_calls_made: fbToolCalls,
2341
+ retry_count: 0,
2342
+ quality_score: scoreResponseQuality(
2343
+ { tier: routingDecision.tier, hasTools: Array.isArray(body?.tools) && body.tools.length > 0 },
2344
+ null,
2345
+ { status_code: 200, output_tokens: fbOutputTokens, tool_calls_made: fbToolCalls, was_fallback: true, retry_count: 0, latency_ms: Date.now() - startTime }
2346
+ ),
2347
+ tokens_per_second: fbOutputTokens && fallbackLatency > 0 ? fbOutputTokens / (fallbackLatency / 1000) : null,
2348
+ });
2349
+
2200
2350
  // Return result with actual provider used (fallback provider) and routing decision
2201
2351
  return {
2202
2352
  ...fallbackResult,
@@ -2215,6 +2365,23 @@ async function invokeModel(body, options = {}) {
2215
2365
  metricsCollector.recordDatabricksRequest(false, retries);
2216
2366
  healthTracker.recordFailure(fallbackProvider, fallbackErr, fallbackErr.status);
2217
2367
 
2368
+ // Record double-failure telemetry
2369
+ telemetry.record({
2370
+ request_id: crypto.randomUUID(),
2371
+ session_id: body._sessionId || null,
2372
+ timestamp: Date.now(),
2373
+ complexity_score: routingResult.score ?? null,
2374
+ tier: routingDecision.tier,
2375
+ provider: fallbackProvider,
2376
+ model: routingDecision.model,
2377
+ routing_method: "fallback",
2378
+ was_fallback: true,
2379
+ latency_ms: Date.now() - startTime,
2380
+ status_code: fallbackErr.status || null,
2381
+ error_type: fallbackErr.code || fallbackErr.name || "double_failure",
2382
+ quality_score: 0,
2383
+ });
2384
+
2218
2385
  logger.error({
2219
2386
  originalProvider: initialProvider,
2220
2387
  fallbackProvider,
@@ -77,25 +77,29 @@ async function hasAnthropicEndpoint(baseUrl) {
77
77
  if (anthropicEndpointAvailable !== null) return anthropicEndpointAvailable;
78
78
 
79
79
  try {
80
- // Send a minimal request — we only care about whether the route exists
81
- const res = await fetch(`${baseUrl}/v1/messages`, {
82
- method: "POST",
83
- headers: {
84
- "Content-Type": "application/json",
85
- "anthropic-version": "2023-06-01",
86
- },
87
- body: JSON.stringify({
88
- model: "probe",
89
- max_tokens: 1,
90
- messages: [{ role: "user", content: "hi" }],
91
- }),
80
+ // Check Ollama version — /v1/messages requires v0.14.0+
81
+ // This is instant (no LLM inference) vs the old probe that sent a real request
82
+ const controller = new AbortController();
83
+ const timeout = setTimeout(() => controller.abort(), 3000);
84
+ const versionRes = await fetch(`${baseUrl}/api/version`, {
85
+ method: "GET",
86
+ signal: controller.signal,
92
87
  });
93
-
94
- // 404 → endpoint doesn't exist (old Ollama)
95
- // Any other status (200, 400, 500) → endpoint exists
96
- anthropicEndpointAvailable = res.status !== 404;
88
+ clearTimeout(timeout);
89
+
90
+ if (versionRes.ok) {
91
+ const versionData = await versionRes.json().catch(() => null);
92
+ const version = versionData?.version || "0.0.0";
93
+ const [major, minor] = version.split(".").map(Number);
94
+
95
+ // v0.14.0+ has the Anthropic Messages API
96
+ anthropicEndpointAvailable = major > 0 || (major === 0 && minor >= 14);
97
+ } else {
98
+ // Can't determine version — fall back to legacy
99
+ anthropicEndpointAvailable = false;
100
+ }
97
101
  logger.info(
98
- { available: anthropicEndpointAvailable, status: res.status },
102
+ { available: anthropicEndpointAvailable, status: versionRes.status },
99
103
  anthropicEndpointAvailable
100
104
  ? "Ollama Anthropic API detected (/v1/messages) — using native passthrough"
101
105
  : "Ollama Anthropic API not available — falling back to legacy /api/chat (upgrade to Ollama v0.14.0+ for best results)"
@@ -60,13 +60,16 @@ function convertOpenAIToAnthropic(openaiRequest) {
60
60
  if (part.type === "text") {
61
61
  return { type: "text", text: part.text };
62
62
  } else if (part.type === "image_url") {
63
- return {
64
- type: "image",
65
- source: {
66
- type: "url",
67
- url: part.image_url.url
63
+ const url = part.image_url?.url || "";
64
+ if (url.startsWith("data:")) {
65
+ const match = url.match(/^data:(image\/[^;]+);base64,(.+)$/);
66
+ if (match) {
67
+ return { type: "image", source: { type: "base64", media_type: match[1], data: match[2] } };
68
68
  }
69
- };
69
+ }
70
+ return { type: "image", source: { type: "url", url } };
71
+ } else if (part.type === "document" || part.type === "image") {
72
+ return part;
70
73
  }
71
74
  return part;
72
75
  });
@@ -208,10 +211,16 @@ function convertAnthropicToOpenAI(anthropicResponse, model = "claude-3-5-sonnet-
208
211
  // Convert content blocks to OpenAI format
209
212
  let messageContent = "";
210
213
  const toolCalls = [];
214
+ let citations = [];
211
215
 
212
216
  for (const block of content) {
213
217
  if (block.type === "text") {
214
218
  messageContent += block.text;
219
+ if (Array.isArray(block.citations)) {
220
+ citations.push(...block.citations);
221
+ }
222
+ } else if (block.type === "thinking") {
223
+ // Skip thinking blocks in OpenAI format (they don't have an equivalent)
215
224
  } else if (block.type === "tool_use") {
216
225
  toolCalls.push({
217
226
  id: block.id,
@@ -249,6 +258,11 @@ function convertAnthropicToOpenAI(anthropicResponse, model = "claude-3-5-sonnet-
249
258
  }
250
259
  };
251
260
 
261
+ // Add citations if present
262
+ if (citations.length > 0) {
263
+ openaiResponse.citations = citations;
264
+ }
265
+
252
266
  // Add tool_calls if present
253
267
  if (toolCalls.length > 0) {
254
268
  openaiResponse.choices[0].message.tool_calls = toolCalls;