lynkr 8.0.1 → 9.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,3 +1,4 @@
+const crypto = require("crypto");
 const config = require("../config");
 const http = require("http");
 const https = require("https");
@@ -5,6 +6,7 @@ const { withRetry } = require("./retry");
 const { getCircuitBreakerRegistry } = require("./circuit-breaker");
 const { getMetricsCollector } = require("../observability/metrics");
 const { getHealthTracker } = require("../observability/health-tracker");
+const { createBulkhead } = require("./resilience");
 const logger = require("../logger");
 const { STANDARD_TOOLS, STANDARD_TOOL_NAMES } = require("./standard-tools");
 const { convertAnthropicToolsToOpenRouter } = require("./openrouter-utils");
@@ -12,6 +14,9 @@ const {
   detectModelFamily
 } = require("./bedrock-utils");
 const { getGPTSystemPromptAddendum } = require("./gpt-utils");
+const telemetry = require("../routing/telemetry");
+const { scoreResponseQuality } = require("../routing/quality-scorer");
+const { getLatencyTracker } = require("../routing/latency-tracker");
 
 
 
@@ -20,53 +25,11 @@ if (typeof fetch !== "function") {
   throw new Error("Node 18+ is required for the built-in fetch API.");
 }
 
-/**
- * Simple Semaphore for limiting concurrent requests
- * Used to prevent Z.AI rate limiting from parallel Claude Code CLI calls
- */
-class Semaphore {
-  constructor(maxConcurrent = 2) {
-    this.maxConcurrent = maxConcurrent;
-    this.current = 0;
-    this.queue = [];
-  }
-
-  async acquire() {
-    if (this.current < this.maxConcurrent) {
-      this.current++;
-      return;
-    }
-
-    // Wait in queue
-    return new Promise((resolve) => {
-      this.queue.push(resolve);
-    });
-  }
-
-  release() {
-    this.current--;
-    if (this.queue.length > 0 && this.current < this.maxConcurrent) {
-      this.current++;
-      const next = this.queue.shift();
-      next();
-    }
-  }
-
-  async run(fn) {
-    await this.acquire();
-    try {
-      return await fn();
-    } finally {
-      this.release();
-    }
-  }
-}
-
-// Z.AI request semaphore - limit concurrent requests to avoid rate limiting
+// Z.AI request bulkhead - limit concurrent requests to avoid rate limiting
 // Configurable via ZAI_MAX_CONCURRENT env var (default: 2)
 const zaiMaxConcurrent = parseInt(process.env.ZAI_MAX_CONCURRENT || '2', 10);
-const zaiSemaphore = new Semaphore(zaiMaxConcurrent);
-logger.info({ maxConcurrent: zaiMaxConcurrent }, "Z.AI semaphore initialized");
+const zaiSemaphore = createBulkhead({ maxConcurrent: zaiMaxConcurrent, maxQueue: 50 });
+logger.info({ maxConcurrent: zaiMaxConcurrent }, "Z.AI bulkhead initialized");
 
 
 
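The `createBulkhead` helper comes from `./resilience`, which is not included in this diff. Judging only from the call sites above (`createBulkhead({ maxConcurrent, maxQueue })` and, further down, `zaiSemaphore.execute(fn)`), it plausibly behaves like the removed Semaphore plus a bounded wait queue. A minimal sketch under that assumption:

// Hypothetical sketch of ./resilience's createBulkhead; the real module is not in this diff.
function createBulkhead({ maxConcurrent = 2, maxQueue = 50 } = {}) {
  let active = 0;
  const queue = [];

  const drain = () => {
    if (active < maxConcurrent && queue.length > 0) {
      active++;
      queue.shift()(); // hand the freed slot to the oldest waiter
    }
  };

  return {
    async execute(fn) {
      if (active < maxConcurrent) {
        active++;
      } else {
        if (queue.length >= maxQueue) {
          throw new Error("bulkhead queue full"); // shed load instead of queueing unboundedly
        }
        await new Promise((resolve) => queue.push(resolve));
      }
      try {
        return await fn();
      } finally {
        active--;
        drain();
      }
    },
  };
}

The practical difference from the deleted Semaphore is the maxQueue bound: a burst of parallel Claude Code CLI calls now fails fast once 50 waiters accumulate, rather than queueing forever.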
@@ -307,7 +270,7 @@ async function invokeOllama(body) {
   const ollamaBody = {
     model: modelName,
     messages: body.messages,
-    max_tokens: body.max_tokens || 4096,
+    max_tokens: body.max_tokens || 16384,
     stream: false,
   };
 
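Note that the raised 16384 default is applied with two different idioms in the hunks that follow: `body.max_tokens || 16384` (Ollama, Z.AI, Moonshot, Vertex) also overrides an explicit `max_tokens: 0`, while `body.max_tokens ?? 16384` (OpenRouter, Azure OpenAI, OpenAI, llama.cpp, LM Studio) falls back only on null or undefined:

// The two default idioms side by side.
const viaOr = 0 || 16384;      // 16384 -- 0 is falsy, so || substitutes the default
const viaNullish = 0 ?? 16384; // 0     -- ?? substitutes only for null/undefined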
@@ -432,7 +395,7 @@ async function invokeOpenRouter(body) {
     model: body._suggestionModeModel || body._tierModel || config.openrouter.model,
     messages,
     temperature: body.temperature ?? 0.7,
-    max_tokens: body.max_tokens ?? 4096,
+    max_tokens: body.max_tokens ?? 16384,
     top_p: body.top_p ?? 1.0,
     stream: body.stream ?? false
   };
@@ -515,7 +478,7 @@ async function invokeAzureOpenAI(body) {
   const azureBody = {
     messages,
     temperature: body.temperature ?? 0.3, // Lower temperature for more deterministic, action-oriented behavior
-    max_tokens: Math.min(body.max_tokens ?? 4096, 16384), // Cap at Azure OpenAI's limit
+    max_tokens: Math.min(body.max_tokens ?? 16384, 16384), // Cap at Azure OpenAI's limit
     top_p: body.top_p ?? 1.0,
     stream: false, // Force non-streaming for Azure OpenAI - streaming format conversion not yet implemented
     model: body._suggestionModeModel || body._tierModel || config.azureOpenAI.deployment
@@ -911,7 +874,7 @@ async function invokeOpenAI(body) {
     model: body._suggestionModeModel || body._tierModel || config.openai.model || "gpt-4o",
     messages,
     temperature: body.temperature ?? 0.7,
-    max_tokens: body.max_tokens ?? 4096,
+    max_tokens: body.max_tokens ?? 16384,
     top_p: body.top_p ?? 1.0,
     stream: body.stream ?? false
   };
@@ -1012,7 +975,7 @@ async function invokeLlamaCpp(body) {
   const llamacppBody = {
     messages: deduplicated,
     temperature: body.temperature ?? 0.7,
-    max_tokens: body.max_tokens ?? 4096,
+    max_tokens: body.max_tokens ?? 16384,
     top_p: body.top_p ?? 1.0,
     stream: body.stream ?? false
   };
@@ -1096,7 +1059,7 @@ async function invokeLMStudio(body) {
   const lmstudioBody = {
     messages,
     temperature: body.temperature ?? 0.7,
-    max_tokens: body.max_tokens ?? 4096,
+    max_tokens: body.max_tokens ?? 16384,
     top_p: body.top_p ?? 1.0,
     stream: body.stream ?? false
   };
@@ -1411,7 +1374,7 @@ async function invokeZai(body) {
   zaiBody = {
     model: mappedModel,
     messages,
-    max_tokens: body.max_tokens || 4096,
+    max_tokens: body.max_tokens || 16384,
     temperature: body.temperature ?? 0.7,
     stream: body.stream,
   };
@@ -1473,12 +1436,9 @@ async function invokeZai(body) {
     zaiBody: JSON.stringify(zaiBody).substring(0, 1000),
   }, "Z.AI request body (truncated)");
 
-  // Use semaphore to limit concurrent Z.AI requests (prevents rate limiting)
-  return zaiSemaphore.run(async () => {
-    logger.debug({
-      queueLength: zaiSemaphore.queue.length,
-      currentConcurrent: zaiSemaphore.current,
-    }, "Z.AI semaphore status");
+  // Use bulkhead to limit concurrent Z.AI requests (prevents rate limiting)
+  return zaiSemaphore.execute(async () => {
+    logger.debug("Z.AI bulkhead executing request");
 
     const response = await performJsonRequest(endpoint, { headers, body: zaiBody }, "Z.AI");
 
@@ -1560,7 +1520,7 @@ async function invokeMoonshot(body) {
   const moonshotBody = {
     model: mappedModel,
     messages,
-    max_tokens: body.max_tokens || 4096,
+    max_tokens: body.max_tokens || 16384,
     temperature: body.temperature ?? 0.7,
     top_p: body.top_p ?? 1.0,
     stream: false, // Force non-streaming - OpenAI SSE to Anthropic SSE conversion not implemented
@@ -1791,7 +1751,7 @@ async function invokeVertex(body) {
     contents,
     generationConfig: {
       temperature: body.temperature ?? 0.7,
-      maxOutputTokens: body.max_tokens || 4096,
+      maxOutputTokens: body.max_tokens || 16384,
       topP: body.top_p ?? 1.0,
     }
   };
@@ -2000,6 +1960,54 @@ function convertGeminiToAnthropic(response, requestedModel) {
   };
 }
 
+async function invokeCodex(body) {
+  const { getCodexProcess } = require("./codex-process");
+  const { convertAnthropicToCodexPrompt, convertCodexResponseToAnthropic } = require("./codex-utils");
+
+  const codex = getCodexProcess();
+  await codex.ensureRunning();
+
+  const model = body._tierModel || config.codex?.model || "gpt-5.3-codex";
+  const { prompt, systemContext } = convertAnthropicToCodexPrompt(body);
+
+  if (!prompt) {
+    throw new Error("Codex: no prompt content to send");
+  }
+
+  // Start a new thread
+  const threadParams = { model };
+  if (systemContext) {
+    threadParams.instructions = systemContext;
+  }
+  const threadResult = await codex.sendRequest("thread/start", threadParams);
+  const threadId = threadResult?.threadId || threadResult?.id;
+
+  if (!threadId) {
+    throw new Error("Codex: thread/start did not return a threadId");
+  }
+
+  logger.debug({ threadId, model, promptLength: prompt.length }, "[Codex] Thread started");
+
+  // Send the turn and collect response
+  const turnResult = await codex.sendTurn(threadId, prompt, model);
+
+  logger.debug({
+    threadId,
+    responseLength: turnResult.text?.length || 0,
+  }, "[Codex] Turn completed");
+
+  // Convert to Anthropic format
+  const anthropicJson = convertCodexResponseToAnthropic(turnResult, model);
+
+  return {
+    ok: true,
+    status: 200,
+    json: anthropicJson,
+    text: JSON.stringify(anthropicJson),
+    contentType: "application/json",
+  };
+}
+
 async function invokeModel(body, options = {}) {
   const { determineProviderSmart, isFallbackEnabled, getFallbackProvider } = require("./routing");
   const metricsCollector = getMetricsCollector();
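`convertCodexResponseToAnthropic` (in `./codex-utils`, outside this diff) must produce an Anthropic Messages payload, since its result is returned as `json` the same way the other providers return converted responses. A hypothetical sketch of the minimal mapping, assuming `turnResult` carries only the `text` field that the debug log above reads; the token field names are invented placeholders:

// Hypothetical converter sketch; the real ./codex-utils implementation is not in this diff.
function convertCodexResponseToAnthropic(turnResult, model) {
  return {
    id: `msg_${crypto.randomUUID()}`,
    type: "message",
    role: "assistant",
    model,
    content: [{ type: "text", text: turnResult.text || "" }],
    stop_reason: "end_turn",
    usage: {
      input_tokens: turnResult.inputTokens ?? 0,   // assumed field name
      output_tokens: turnResult.outputTokens ?? 0, // assumed field name
    },
  };
}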
@@ -2007,9 +2015,11 @@ async function invokeModel(body, options = {}) {
   const healthTracker = getHealthTracker();
 
   // Determine provider via async tier routing
+  // Thread workspace for code-graph integration (from X-Lynkr-Workspace header or body._workspace)
+  const workspace = body._workspace || options.workspace || null;
   const routingResult = options.forceProvider
     ? { provider: options.forceProvider, model: null, method: 'forced' }
-    : await determineProviderSmart(body);
+    : await determineProviderSmart(body, { workspace });
   const initialProvider = routingResult.provider;
   const tierSelectedModel = routingResult.model;
 
@@ -2081,6 +2091,8 @@ async function invokeModel(body, options = {}) {
       return await invokeVertex(body);
     } else if (initialProvider === "moonshot") {
       return await invokeMoonshot(body);
+    } else if (initialProvider === "codex") {
+      return await invokeCodex(body);
     }
     return await invokeDatabricks(body);
   });
@@ -2091,10 +2103,13 @@ async function invokeModel(body, options = {}) {
     metricsCollector.recordDatabricksRequest(true, retries);
     healthTracker.recordSuccess(initialProvider, latency);
 
+    // Record latency for routing intelligence
+    getLatencyTracker().record(initialProvider, latency);
+
     // Record tokens and cost savings
+    const outputTokens = result.json?.usage?.output_tokens || result.json?.usage?.completion_tokens || 0;
+    const inputTokens = result.json?.usage?.input_tokens || result.json?.usage?.prompt_tokens || 0;
     if (result.json?.usage) {
-      const inputTokens = result.json.usage.input_tokens || result.json.usage.prompt_tokens || 0;
-      const outputTokens = result.json.usage.output_tokens || result.json.usage.completion_tokens || 0;
       metricsCollector.recordTokens(inputTokens, outputTokens);
 
       // Estimate cost savings if Ollama was used
@@ -2104,6 +2119,53 @@ async function invokeModel(body, options = {}) {
       }
     }
 
+    // Count tool calls in response
+    const toolCallsMade = result.json?.content?.filter?.(
+      (b) => b.type === "tool_use"
+    )?.length || 0;
+
+    // Compute quality score
+    const qualityScore = scoreResponseQuality(
+      { tier: routingDecision.tier, hasTools: Array.isArray(body?.tools) && body.tools.length > 0 },
+      null,
+      {
+        status_code: 200,
+        output_tokens: outputTokens,
+        tool_calls_made: toolCallsMade,
+        was_fallback: false,
+        retry_count: retries,
+        error_type: null,
+        latency_ms: latency,
+      }
+    );
+
+    // Record routing telemetry (non-blocking)
+    telemetry.record({
+      request_id: crypto.randomUUID(),
+      session_id: body._sessionId || null,
+      timestamp: Date.now(),
+      complexity_score: routingResult.score ?? null,
+      tier: routingDecision.tier,
+      agentic_type: routingResult.agenticResult?.agentType || null,
+      tool_count: Array.isArray(body?.tools) ? body.tools.length : 0,
+      input_tokens: inputTokens || null,
+      message_count: Array.isArray(body?.messages) ? body.messages.length : 0,
+      request_type: routingResult.analysis?.requestType || null,
+      provider: initialProvider,
+      model: routingDecision.model,
+      routing_method: routingDecision.method,
+      was_fallback: false,
+      output_tokens: outputTokens || null,
+      latency_ms: latency,
+      status_code: 200,
+      error_type: null,
+      tool_calls_made: toolCallsMade,
+      retry_count: retries,
+      circuit_breaker_state: breaker.state,
+      quality_score: qualityScore,
+      tokens_per_second: outputTokens && latency > 0 ? outputTokens / (latency / 1000) : null,
+    });
+
     // Return result with provider info and routing decision for headers
     return {
       ...result,
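The comment labels `telemetry.record` non-blocking, and every field passed to it is computed synchronously before the call. The `../routing/telemetry` module itself is not in this diff; one conventional way to get that behavior is a buffered recorder that only appends on the hot path and flushes on a timer, sketched here with an assumed `persistBatch` sink:

// Hypothetical non-blocking recorder; the real ../routing/telemetry module is not in this diff.
const buffer = [];

function record(event) {
  buffer.push(event); // synchronous append only; no I/O on the request path
}

async function persistBatch(batch) {
  process.stderr.write(JSON.stringify(batch) + "\n"); // stand-in for a real disk/metrics sink
}

setInterval(() => {
  if (buffer.length === 0) return;
  const batch = buffer.splice(0, buffer.length);
  persistBatch(batch).catch(() => {}); // drop write errors rather than surface them to callers
}, 5000).unref(); // unref() so the flush timer never keeps the process alive

module.exports = { record };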
@@ -2113,8 +2175,10 @@ async function invokeModel(body, options = {}) {
 
   } catch (err) {
     // Record failure
+    const failLatency = Date.now() - startTime;
     metricsCollector.recordProviderFailure(initialProvider);
     healthTracker.recordFailure(initialProvider, err, err.status);
+    getLatencyTracker().record(initialProvider, failLatency);
 
     // Check if we should fallback (any provider can fall back, not just ollama)
     const shouldFallback =
@@ -2124,6 +2188,33 @@ async function invokeModel(body, options = {}) {
 
     if (!shouldFallback) {
       metricsCollector.recordDatabricksRequest(false, retries);
+
+      // Record failed telemetry
+      telemetry.record({
+        request_id: crypto.randomUUID(),
+        session_id: body._sessionId || null,
+        timestamp: Date.now(),
+        complexity_score: routingResult.score ?? null,
+        tier: routingDecision.tier,
+        agentic_type: routingResult.agenticResult?.agentType || null,
+        tool_count: Array.isArray(body?.tools) ? body.tools.length : 0,
+        input_tokens: null,
+        message_count: Array.isArray(body?.messages) ? body.messages.length : 0,
+        request_type: routingResult.analysis?.requestType || null,
+        provider: initialProvider,
+        model: routingDecision.model,
+        routing_method: routingDecision.method,
+        was_fallback: false,
+        latency_ms: failLatency,
+        status_code: err.status || null,
+        error_type: err.code || err.name || "unknown",
+        quality_score: scoreResponseQuality(
+          { tier: routingDecision.tier, hasTools: Array.isArray(body?.tools) && body.tools.length > 0 },
+          null,
+          { error_type: err.code || err.name, was_fallback: false, retry_count: retries, latency_ms: failLatency }
+        ),
+      });
+
       throw err;
     }
 
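`scoreResponseQuality(requestMeta, _, outcome)` is invoked with the same three-argument shape on the success, terminal-failure, and fallback paths, and the double-failure path below hardcodes `quality_score: 0`. The scorer itself (`../routing/quality-scorer`) is not in this diff; a hypothetical heuristic consistent with those call sites:

// Hypothetical heuristic; the real ../routing/quality-scorer is not in this diff.
function scoreResponseQuality(requestMeta, _unused, outcome) {
  if (outcome.error_type) return 0;                // errored requests carry no quality signal
  let score = 1.0;
  if (outcome.was_fallback) score -= 0.2;          // served by a degraded route
  score -= 0.1 * (outcome.retry_count || 0);       // retries suggest provider instability
  if (!outcome.output_tokens) score -= 0.3;        // empty completion
  if (requestMeta.hasTools && !outcome.tool_calls_made) score -= 0.2; // tools offered but never used
  return Math.max(0, Math.min(1, score));
}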
@@ -2197,6 +2288,45 @@ async function invokeModel(body, options = {}) {
       totalLatency: Date.now() - startTime,
     }, "Fallback to cloud provider succeeded");
 
+    // Record latency for fallback provider
+    getLatencyTracker().record(fallbackProvider, fallbackLatency);
+
+    // Capture fallback telemetry
+    const fbOutputTokens = fallbackResult.json?.usage?.output_tokens || fallbackResult.json?.usage?.completion_tokens || 0;
+    const fbInputTokens = fallbackResult.json?.usage?.input_tokens || fallbackResult.json?.usage?.prompt_tokens || 0;
+    const fbToolCalls = fallbackResult.json?.content?.filter?.(
+      (b) => b.type === "tool_use"
+    )?.length || 0;
+
+    telemetry.record({
+      request_id: crypto.randomUUID(),
+      session_id: body._sessionId || null,
+      timestamp: Date.now(),
+      complexity_score: routingResult.score ?? null,
+      tier: routingDecision.tier,
+      agentic_type: routingResult.agenticResult?.agentType || null,
+      tool_count: Array.isArray(body?.tools) ? body.tools.length : 0,
+      input_tokens: fbInputTokens || null,
+      message_count: Array.isArray(body?.messages) ? body.messages.length : 0,
+      request_type: routingResult.analysis?.requestType || null,
+      provider: fallbackProvider,
+      model: routingDecision.model,
+      routing_method: "fallback",
+      was_fallback: true,
+      output_tokens: fbOutputTokens || null,
+      latency_ms: Date.now() - startTime,
+      status_code: 200,
+      error_type: null,
+      tool_calls_made: fbToolCalls,
+      retry_count: 0,
+      quality_score: scoreResponseQuality(
+        { tier: routingDecision.tier, hasTools: Array.isArray(body?.tools) && body.tools.length > 0 },
+        null,
+        { status_code: 200, output_tokens: fbOutputTokens, tool_calls_made: fbToolCalls, was_fallback: true, retry_count: 0, latency_ms: Date.now() - startTime }
+      ),
+      tokens_per_second: fbOutputTokens && fallbackLatency > 0 ? fbOutputTokens / (fallbackLatency / 1000) : null,
+    });
+
     // Return result with actual provider used (fallback provider) and routing decision
     return {
       ...fallbackResult,
@@ -2215,6 +2345,23 @@ async function invokeModel(body, options = {}) {
       metricsCollector.recordDatabricksRequest(false, retries);
       healthTracker.recordFailure(fallbackProvider, fallbackErr, fallbackErr.status);
 
+      // Record double-failure telemetry
+      telemetry.record({
+        request_id: crypto.randomUUID(),
+        session_id: body._sessionId || null,
+        timestamp: Date.now(),
+        complexity_score: routingResult.score ?? null,
+        tier: routingDecision.tier,
+        provider: fallbackProvider,
+        model: routingDecision.model,
+        routing_method: "fallback",
+        was_fallback: true,
+        latency_ms: Date.now() - startTime,
+        status_code: fallbackErr.status || null,
+        error_type: fallbackErr.code || fallbackErr.name || "double_failure",
+        quality_score: 0,
+      });
+
       logger.error({
         originalProvider: initialProvider,
         fallbackProvider,