npm - lynkr - Versions diffs - 8.0.1 → 9.0.2 - Mend

lynkr 8.0.1 → 9.0.2

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (55) hide show

package/README.md +238 -315
package/bin/cli.js +16 -3
package/index.js +7 -3
package/install.sh +3 -3
package/lynkr-skill.tar.gz +0 -0
package/native/Cargo.toml +26 -0
package/native/index.js +29 -0
package/native/lynkr-native.node +0 -0
package/native/src/lib.rs +321 -0
package/package.json +8 -6
package/src/api/files-multipart.js +30 -0
package/src/api/files-router.js +81 -0
package/src/api/openai-router.js +379 -308
package/src/api/providers-handler.js +171 -3
package/src/api/router.js +109 -5
package/src/cache/prompt.js +13 -0
package/src/clients/circuit-breaker.js +10 -247
package/src/clients/codex-process.js +342 -0
package/src/clients/codex-utils.js +143 -0
package/src/clients/databricks.js +243 -76
package/src/clients/ollama-utils.js +21 -17
package/src/clients/openai-format.js +20 -6
package/src/clients/openrouter-utils.js +42 -37
package/src/clients/prompt-cache-injection.js +140 -0
package/src/clients/provider-capabilities.js +41 -0
package/src/clients/resilience.js +540 -0
package/src/clients/responses-format.js +8 -7
package/src/clients/retry.js +22 -167
package/src/clients/standard-tools.js +1 -1
package/src/clients/xml-tool-extractor.js +307 -0
package/src/cluster.js +82 -0
package/src/config/index.js +66 -0
package/src/context/compression.js +42 -9
package/src/context/distill.js +507 -0
package/src/context/tool-result-compressor.js +563 -0
package/src/memory/extractor.js +22 -0
package/src/orchestrator/index.js +147 -205
package/src/routing/complexity-analyzer.js +258 -5
package/src/routing/index.js +15 -34
package/src/routing/latency-tracker.js +148 -0
package/src/routing/model-tiers.js +2 -0
package/src/routing/quality-scorer.js +113 -0
package/src/routing/telemetry.js +502 -0
package/src/server.js +23 -0
package/src/stores/file-store.js +69 -0
package/src/stores/response-store.js +25 -0
package/src/tools/code-graph.js +538 -0
package/src/tools/code-mode.js +304 -0
package/src/tools/index.js +1 -1
package/src/tools/lazy-loader.js +11 -0
package/src/tools/mcp-remote.js +7 -0
package/src/tools/smart-selection.js +11 -0
package/src/tools/web.js +1 -1
package/src/utils/payload.js +206 -0
package/src/utils/perf-timer.js +80 -0

package/src/clients/databricks.js CHANGED Viewed

@@ -1,3 +1,4 @@
+const crypto = require("crypto");
 const config = require("../config");
 const http = require("http");
 const https = require("https");
@@ -5,6 +6,7 @@ const { withRetry } = require("./retry");
 const { getCircuitBreakerRegistry } = require("./circuit-breaker");
 const { getMetricsCollector } = require("../observability/metrics");
 const { getHealthTracker } = require("../observability/health-tracker");
+const { createBulkhead } = require("./resilience");
 const logger = require("../logger");
 const { STANDARD_TOOLS, STANDARD_TOOL_NAMES } = require("./standard-tools");
 const { convertAnthropicToolsToOpenRouter } = require("./openrouter-utils");
@@ -12,6 +14,9 @@ const {
   detectModelFamily
 } = require("./bedrock-utils");
 const { getGPTSystemPromptAddendum } = require("./gpt-utils");
+const telemetry = require("../routing/telemetry");
+const { scoreResponseQuality } = require("../routing/quality-scorer");
+const { getLatencyTracker } = require("../routing/latency-tracker");
@@ -20,70 +25,29 @@ if (typeof fetch !== "function") {
   throw new Error("Node 18+ is required for the built-in fetch API.");
 }
-/**
- * Simple Semaphore for limiting concurrent requests
- * Used to prevent Z.AI rate limiting from parallel Claude Code CLI calls
- */
-class Semaphore {
-  constructor(maxConcurrent = 2) {
-    this.maxConcurrent = maxConcurrent;
-    this.current = 0;
-    this.queue = [];
-  }
-  async acquire() {
-    if (this.current < this.maxConcurrent) {
-      this.current++;
-      return;
-    }
-    // Wait in queue
-    return new Promise((resolve) => {
-      this.queue.push(resolve);
-    });
-  }
-  release() {
-    this.current--;
-    if (this.queue.length > 0 && this.current < this.maxConcurrent) {
-      this.current++;
-      const next = this.queue.shift();
-      next();
-    }
-  }
-  async run(fn) {
-    await this.acquire();
-    try {
-      return await fn();
-    } finally {
-      this.release();
-    }
-  }
-}
-// Z.AI request semaphore - limit concurrent requests to avoid rate limiting
+// Z.AI request bulkhead - limit concurrent requests to avoid rate limiting
 // Configurable via ZAI_MAX_CONCURRENT env var (default: 2)
 const zaiMaxConcurrent = parseInt(process.env.ZAI_MAX_CONCURRENT || '2', 10);
-const zaiSemaphore = new Semaphore(zaiMaxConcurrent);
-logger.info({ maxConcurrent: zaiMaxConcurrent }, "Z.AI semaphore initialized");
+const zaiSemaphore = createBulkhead({ maxConcurrent: zaiMaxConcurrent, maxQueue: 50 });
+logger.info({ maxConcurrent: zaiMaxConcurrent }, "Z.AI bulkhead initialized");
 // HTTP connection pooling for better performance
+// Increased maxSockets for high-concurrency team deployments (50+ devs)
 const httpAgent = new http.Agent({
   keepAlive: true,
-  maxSockets: 50,
-  maxFreeSockets: 10,
-  timeout: 60000,
+  maxSockets: 200,
+  maxFreeSockets: 20,
+  timeout: 120000,
   keepAliveMsecs: 30000,
 });
 const httpsAgent = new https.Agent({
   keepAlive: true,
-  maxSockets: 50,
-  maxFreeSockets: 10,
-  timeout: 60000,
+  maxSockets: 200,
+  maxFreeSockets: 20,
+  timeout: 120000,
   keepAliveMsecs: 30000,
 });
@@ -307,8 +271,8 @@ async function invokeOllama(body) {
     const ollamaBody = {
       model: modelName,
       messages: body.messages,
-      max_tokens: body.max_tokens || 4096,
-      stream: false,
+      max_tokens: body.max_tokens || 16384,
+      stream: body.stream ?? false,
     };
     if (body.system) ollamaBody.system = body.system;
@@ -375,7 +339,7 @@ async function invokeOllama(body) {
   const ollamaBody = {
     model: modelName,
     messages: deduplicated,
-    stream: false,
+    stream: body.stream ?? false,
     options: {
       temperature: body.temperature ?? 0.7,
       num_predict: body.max_tokens ?? 16384,
@@ -432,7 +396,7 @@ async function invokeOpenRouter(body) {
     model: body._suggestionModeModel || body._tierModel || config.openrouter.model,
     messages,
     temperature: body.temperature ?? 0.7,
-    max_tokens: body.max_tokens ?? 4096,
+    max_tokens: body.max_tokens ?? 16384,
     top_p: body.top_p ?? 1.0,
     stream: body.stream ?? false
   };
@@ -515,7 +479,7 @@ async function invokeAzureOpenAI(body) {
   const azureBody = {
     messages,
     temperature: body.temperature ?? 0.3,  // Lower temperature for more deterministic, action-oriented behavior
-    max_tokens: Math.min(body.max_tokens ?? 4096, 16384),  // Cap at Azure OpenAI's limit
+    max_tokens: Math.min(body.max_tokens ?? 16384, 16384),  // Cap at Azure OpenAI's limit
     top_p: body.top_p ?? 1.0,
     stream: false,  // Force non-streaming for Azure OpenAI - streaming format conversion not yet implemented
     model: body._suggestionModeModel || body._tierModel || config.azureOpenAI.deployment
@@ -911,7 +875,7 @@ async function invokeOpenAI(body) {
     model: body._suggestionModeModel || body._tierModel || config.openai.model || "gpt-4o",
     messages,
     temperature: body.temperature ?? 0.7,
-    max_tokens: body.max_tokens ?? 4096,
+    max_tokens: body.max_tokens ?? 16384,
     top_p: body.top_p ?? 1.0,
     stream: body.stream ?? false
   };
@@ -1012,7 +976,7 @@ async function invokeLlamaCpp(body) {
   const llamacppBody = {
     messages: deduplicated,
     temperature: body.temperature ?? 0.7,
-    max_tokens: body.max_tokens ?? 4096,
+    max_tokens: body.max_tokens ?? 16384,
     top_p: body.top_p ?? 1.0,
     stream: body.stream ?? false
   };
@@ -1096,7 +1060,7 @@ async function invokeLMStudio(body) {
   const lmstudioBody = {
     messages,
     temperature: body.temperature ?? 0.7,
-    max_tokens: body.max_tokens ?? 4096,
+    max_tokens: body.max_tokens ?? 16384,
     top_p: body.top_p ?? 1.0,
     stream: body.stream ?? false
   };
@@ -1411,7 +1375,7 @@ async function invokeZai(body) {
     zaiBody = {
       model: mappedModel,
       messages,
-      max_tokens: body.max_tokens || 4096,
+      max_tokens: body.max_tokens || 16384,
       temperature: body.temperature ?? 0.7,
       stream: body.stream,
     };
@@ -1473,12 +1437,9 @@ async function invokeZai(body) {
     zaiBody: JSON.stringify(zaiBody).substring(0, 1000),
   }, "Z.AI request body (truncated)");
-  // Use semaphore to limit concurrent Z.AI requests (prevents rate limiting)
-  return zaiSemaphore.run(async () => {
-    logger.debug({
-      queueLength: zaiSemaphore.queue.length,
-      currentConcurrent: zaiSemaphore.current,
-    }, "Z.AI semaphore status");
+  // Use bulkhead to limit concurrent Z.AI requests (prevents rate limiting)
+  return zaiSemaphore.execute(async () => {
+    logger.debug("Z.AI bulkhead executing request");
     const response = await performJsonRequest(endpoint, { headers, body: zaiBody }, "Z.AI");
@@ -1560,7 +1521,7 @@ async function invokeMoonshot(body) {
   const moonshotBody = {
     model: mappedModel,
     messages,
-    max_tokens: body.max_tokens || 4096,
+    max_tokens: body.max_tokens || 16384,
     temperature: body.temperature ?? 0.7,
     top_p: body.top_p ?? 1.0,
     stream: false,  // Force non-streaming - OpenAI SSE to Anthropic SSE conversion not implemented
@@ -1638,20 +1599,34 @@ function convertOpenAIToAnthropic(response) {
   const message = choice.message || {};
   const content = [];
+  // Extract tool calls embedded as XML/text in content (Minimax, Qwen, GLM, etc.)
+  if (!message.tool_calls?.length && typeof message.content === "string" && message.content.trim()) {
+    const { extractToolCallsFromText } = require("./xml-tool-extractor");
+    const extracted = extractToolCallsFromText(message.content);
+    if (extracted.toolCalls.length > 0) {
+      message.tool_calls = extracted.toolCalls;
+      message.content = extracted.cleanedText;
+      choice.finish_reason = "tool_calls";
+    }
+  }
   // Add text content from message.content
   // Don't add placeholder text if there are tool_calls - tools are the actual response
   const hasToolCalls = Array.isArray(message.tool_calls) && message.tool_calls.length > 0;
-  // Extract text content - handle thinking models that split content/reasoning
+  // Extract text content and reasoning from thinking models
   const textContent = typeof message.content === 'string' ? message.content : '';
   const reasoningContent = typeof message.reasoning_content === 'string' ? message.reasoning_content : '';
+  // Emit reasoning_content as a proper thinking block (not discarded)
+  if (reasoningContent) {
+    content.push({ type: "thinking", thinking: reasoningContent });
+  }
   if (textContent) {
-    // Has regular content - use it directly (ignore reasoning_content chain-of-thought)
     content.push({ type: "text", text: textContent });
-  } else if (reasoningContent) {
-    // Fallback: thinking models where content is empty but reasoning has the output
-    content.push({ type: "text", text: reasoningContent });
+  } else if (!reasoningContent) {
+    // No content and no reasoning — will be handled by the empty check below
   }
   // Convert tool calls
@@ -1791,7 +1766,7 @@ async function invokeVertex(body) {
     contents,
     generationConfig: {
       temperature: body.temperature ?? 0.7,
-      maxOutputTokens: body.max_tokens || 4096,
+      maxOutputTokens: body.max_tokens || 16384,
       topP: body.top_p ?? 1.0,
     }
   };
@@ -2000,6 +1975,54 @@ function convertGeminiToAnthropic(response, requestedModel) {
   };
 }
+async function invokeCodex(body) {
+  const { getCodexProcess } = require("./codex-process");
+  const { convertAnthropicToCodexPrompt, convertCodexResponseToAnthropic } = require("./codex-utils");
+  const codex = getCodexProcess();
+  await codex.ensureRunning();
+  const model = body._tierModel || config.codex?.model || "gpt-5.3-codex";
+  const { prompt, systemContext } = convertAnthropicToCodexPrompt(body);
+  if (!prompt) {
+    throw new Error("Codex: no prompt content to send");
+  }
+  // Start a new thread
+  const threadParams = { model };
+  if (systemContext) {
+    threadParams.instructions = systemContext;
+  }
+  const threadResult = await codex.sendRequest("thread/start", threadParams);
+  const threadId = threadResult?.threadId || threadResult?.id;
+  if (!threadId) {
+    throw new Error("Codex: thread/start did not return a threadId");
+  }
+  logger.debug({ threadId, model, promptLength: prompt.length }, "[Codex] Thread started");
+  // Send the turn and collect response
+  const turnResult = await codex.sendTurn(threadId, prompt, model);
+  logger.debug({
+    threadId,
+    responseLength: turnResult.text?.length || 0,
+  }, "[Codex] Turn completed");
+  // Convert to Anthropic format
+  const anthropicJson = convertCodexResponseToAnthropic(turnResult, model);
+  return {
+    ok: true,
+    status: 200,
+    json: anthropicJson,
+    text: JSON.stringify(anthropicJson),
+    contentType: "application/json",
+  };
+}
 async function invokeModel(body, options = {}) {
   const { determineProviderSmart, isFallbackEnabled, getFallbackProvider } = require("./routing");
   const metricsCollector = getMetricsCollector();
@@ -2007,9 +2030,11 @@ async function invokeModel(body, options = {}) {
   const healthTracker = getHealthTracker();
   // Determine provider via async tier routing
+  // Thread workspace for code-graph integration (from X-Lynkr-Workspace header or body._workspace)
+  const workspace = body._workspace || options.workspace || null;
   const routingResult = options.forceProvider
     ? { provider: options.forceProvider, model: null, method: 'forced' }
-    : await determineProviderSmart(body);
+    : await determineProviderSmart(body, { workspace });
   const initialProvider = routingResult.provider;
   const tierSelectedModel = routingResult.model;
@@ -2018,6 +2043,11 @@ async function invokeModel(body, options = {}) {
     body._tierModel = tierSelectedModel;
   }
+  // Inject provider-side prompt caching (cache_control breakpoints)
+  // Reduces input token cost by up to 90% and latency by up to 80%
+  const { injectPromptCaching } = require('./prompt-cache-injection');
+  injectPromptCaching(body, initialProvider);
   // Build routing decision object for response headers
   const routingDecision = {
     provider: initialProvider,
@@ -2081,6 +2111,8 @@ async function invokeModel(body, options = {}) {
         return await invokeVertex(body);
       } else if (initialProvider === "moonshot") {
         return await invokeMoonshot(body);
+      } else if (initialProvider === "codex") {
+        return await invokeCodex(body);
       }
       return await invokeDatabricks(body);
     });
@@ -2091,10 +2123,13 @@ async function invokeModel(body, options = {}) {
     metricsCollector.recordDatabricksRequest(true, retries);
     healthTracker.recordSuccess(initialProvider, latency);
+    // Record latency for routing intelligence
+    getLatencyTracker().record(initialProvider, latency);
     // Record tokens and cost savings
+    const outputTokens = result.json?.usage?.output_tokens || result.json?.usage?.completion_tokens || 0;
+    const inputTokens = result.json?.usage?.input_tokens || result.json?.usage?.prompt_tokens || 0;
     if (result.json?.usage) {
-      const inputTokens = result.json.usage.input_tokens || result.json.usage.prompt_tokens || 0;
-      const outputTokens = result.json.usage.output_tokens || result.json.usage.completion_tokens || 0;
       metricsCollector.recordTokens(inputTokens, outputTokens);
       // Estimate cost savings if Ollama was used
@@ -2104,6 +2139,53 @@ async function invokeModel(body, options = {}) {
       }
     }
+    // Count tool calls in response
+    const toolCallsMade = result.json?.content?.filter?.(
+      (b) => b.type === "tool_use"
+    )?.length || 0;
+    // Compute quality score
+    const qualityScore = scoreResponseQuality(
+      { tier: routingDecision.tier, hasTools: Array.isArray(body?.tools) && body.tools.length > 0 },
+      null,
+      {
+        status_code: 200,
+        output_tokens: outputTokens,
+        tool_calls_made: toolCallsMade,
+        was_fallback: false,
+        retry_count: retries,
+        error_type: null,
+        latency_ms: latency,
+      }
+    );
+    // Record routing telemetry (non-blocking)
+    telemetry.record({
+      request_id: crypto.randomUUID(),
+      session_id: body._sessionId || null,
+      timestamp: Date.now(),
+      complexity_score: routingResult.score ?? null,
+      tier: routingDecision.tier,
+      agentic_type: routingResult.agenticResult?.agentType || null,
+      tool_count: Array.isArray(body?.tools) ? body.tools.length : 0,
+      input_tokens: inputTokens || null,
+      message_count: Array.isArray(body?.messages) ? body.messages.length : 0,
+      request_type: routingResult.analysis?.requestType || null,
+      provider: initialProvider,
+      model: routingDecision.model,
+      routing_method: routingDecision.method,
+      was_fallback: false,
+      output_tokens: outputTokens || null,
+      latency_ms: latency,
+      status_code: 200,
+      error_type: null,
+      tool_calls_made: toolCallsMade,
+      retry_count: retries,
+      circuit_breaker_state: breaker.state,
+      quality_score: qualityScore,
+      tokens_per_second: outputTokens && latency > 0 ? outputTokens / (latency / 1000) : null,
+    });
     // Return result with provider info and routing decision for headers
     return {
       ...result,
@@ -2113,8 +2195,10 @@ async function invokeModel(body, options = {}) {
   } catch (err) {
     // Record failure
+    const failLatency = Date.now() - startTime;
     metricsCollector.recordProviderFailure(initialProvider);
     healthTracker.recordFailure(initialProvider, err, err.status);
+    getLatencyTracker().record(initialProvider, failLatency);
     // Check if we should fallback (any provider can fall back, not just ollama)
     const shouldFallback =
@@ -2124,6 +2208,33 @@ async function invokeModel(body, options = {}) {
     if (!shouldFallback) {
       metricsCollector.recordDatabricksRequest(false, retries);
+      // Record failed telemetry
+      telemetry.record({
+        request_id: crypto.randomUUID(),
+        session_id: body._sessionId || null,
+        timestamp: Date.now(),
+        complexity_score: routingResult.score ?? null,
+        tier: routingDecision.tier,
+        agentic_type: routingResult.agenticResult?.agentType || null,
+        tool_count: Array.isArray(body?.tools) ? body.tools.length : 0,
+        input_tokens: null,
+        message_count: Array.isArray(body?.messages) ? body.messages.length : 0,
+        request_type: routingResult.analysis?.requestType || null,
+        provider: initialProvider,
+        model: routingDecision.model,
+        routing_method: routingDecision.method,
+        was_fallback: false,
+        latency_ms: failLatency,
+        status_code: err.status || null,
+        error_type: err.code || err.name || "unknown",
+        quality_score: scoreResponseQuality(
+          { tier: routingDecision.tier, hasTools: Array.isArray(body?.tools) && body.tools.length > 0 },
+          null,
+          { error_type: err.code || err.name, was_fallback: false, retry_count: retries, latency_ms: failLatency }
+        ),
+      });
       throw err;
     }
@@ -2197,6 +2308,45 @@ async function invokeModel(body, options = {}) {
         totalLatency: Date.now() - startTime,
       }, "Fallback to cloud provider succeeded");
+      // Record latency for fallback provider
+      getLatencyTracker().record(fallbackProvider, fallbackLatency);
+      // Capture fallback telemetry
+      const fbOutputTokens = fallbackResult.json?.usage?.output_tokens || fallbackResult.json?.usage?.completion_tokens || 0;
+      const fbInputTokens = fallbackResult.json?.usage?.input_tokens || fallbackResult.json?.usage?.prompt_tokens || 0;
+      const fbToolCalls = fallbackResult.json?.content?.filter?.(
+        (b) => b.type === "tool_use"
+      )?.length || 0;
+      telemetry.record({
+        request_id: crypto.randomUUID(),
+        session_id: body._sessionId || null,
+        timestamp: Date.now(),
+        complexity_score: routingResult.score ?? null,
+        tier: routingDecision.tier,
+        agentic_type: routingResult.agenticResult?.agentType || null,
+        tool_count: Array.isArray(body?.tools) ? body.tools.length : 0,
+        input_tokens: fbInputTokens || null,
+        message_count: Array.isArray(body?.messages) ? body.messages.length : 0,
+        request_type: routingResult.analysis?.requestType || null,
+        provider: fallbackProvider,
+        model: routingDecision.model,
+        routing_method: "fallback",
+        was_fallback: true,
+        output_tokens: fbOutputTokens || null,
+        latency_ms: Date.now() - startTime,
+        status_code: 200,
+        error_type: null,
+        tool_calls_made: fbToolCalls,
+        retry_count: 0,
+        quality_score: scoreResponseQuality(
+          { tier: routingDecision.tier, hasTools: Array.isArray(body?.tools) && body.tools.length > 0 },
+          null,
+          { status_code: 200, output_tokens: fbOutputTokens, tool_calls_made: fbToolCalls, was_fallback: true, retry_count: 0, latency_ms: Date.now() - startTime }
+        ),
+        tokens_per_second: fbOutputTokens && fallbackLatency > 0 ? fbOutputTokens / (fallbackLatency / 1000) : null,
+      });
       // Return result with actual provider used (fallback provider) and routing decision
       return {
         ...fallbackResult,
@@ -2215,6 +2365,23 @@ async function invokeModel(body, options = {}) {
       metricsCollector.recordDatabricksRequest(false, retries);
       healthTracker.recordFailure(fallbackProvider, fallbackErr, fallbackErr.status);
+      // Record double-failure telemetry
+      telemetry.record({
+        request_id: crypto.randomUUID(),
+        session_id: body._sessionId || null,
+        timestamp: Date.now(),
+        complexity_score: routingResult.score ?? null,
+        tier: routingDecision.tier,
+        provider: fallbackProvider,
+        model: routingDecision.model,
+        routing_method: "fallback",
+        was_fallback: true,
+        latency_ms: Date.now() - startTime,
+        status_code: fallbackErr.status || null,
+        error_type: fallbackErr.code || fallbackErr.name || "double_failure",
+        quality_score: 0,
+      });
       logger.error({
         originalProvider: initialProvider,
         fallbackProvider,

package/src/clients/ollama-utils.js CHANGED Viewed

@@ -77,25 +77,29 @@ async function hasAnthropicEndpoint(baseUrl) {
   if (anthropicEndpointAvailable !== null) return anthropicEndpointAvailable;
   try {
-    // Send a minimal request — we only care about whether the route exists
-    const res = await fetch(`${baseUrl}/v1/messages`, {
-      method: "POST",
-      headers: {
-        "Content-Type": "application/json",
-        "anthropic-version": "2023-06-01",
-      },
-      body: JSON.stringify({
-        model: "probe",
-        max_tokens: 1,
-        messages: [{ role: "user", content: "hi" }],
-      }),
+    // Check Ollama version — /v1/messages requires v0.14.0+
+    // This is instant (no LLM inference) vs the old probe that sent a real request
+    const controller = new AbortController();
+    const timeout = setTimeout(() => controller.abort(), 3000);
+    const versionRes = await fetch(`${baseUrl}/api/version`, {
+      method: "GET",
+      signal: controller.signal,
     });
-    // 404 → endpoint doesn't exist (old Ollama)
-    // Any other status (200, 400, 500) → endpoint exists
-    anthropicEndpointAvailable = res.status !== 404;
+    clearTimeout(timeout);
+    if (versionRes.ok) {
+      const versionData = await versionRes.json().catch(() => null);
+      const version = versionData?.version || "0.0.0";
+      const [major, minor] = version.split(".").map(Number);
+      // v0.14.0+ has the Anthropic Messages API
+      anthropicEndpointAvailable = major > 0 || (major === 0 && minor >= 14);
+    } else {
+      // Can't determine version — fall back to legacy
+      anthropicEndpointAvailable = false;
+    }
     logger.info(
-      { available: anthropicEndpointAvailable, status: res.status },
+      { available: anthropicEndpointAvailable, status: versionRes.status },
       anthropicEndpointAvailable
         ? "Ollama Anthropic API detected (/v1/messages) — using native passthrough"
         : "Ollama Anthropic API not available — falling back to legacy /api/chat (upgrade to Ollama v0.14.0+ for best results)"

package/src/clients/openai-format.js CHANGED Viewed

@@ -60,13 +60,16 @@ function convertOpenAIToAnthropic(openaiRequest) {
           if (part.type === "text") {
             return { type: "text", text: part.text };
           } else if (part.type === "image_url") {
-            return {
-              type: "image",
-              source: {
-                type: "url",
-                url: part.image_url.url
+            const url = part.image_url?.url || "";
+            if (url.startsWith("data:")) {
+              const match = url.match(/^data:(image\/[^;]+);base64,(.+)$/);
+              if (match) {
+                return { type: "image", source: { type: "base64", media_type: match[1], data: match[2] } };
               }
-            };
+            }
+            return { type: "image", source: { type: "url", url } };
+          } else if (part.type === "document" || part.type === "image") {
+            return part;
           }
           return part;
         });
@@ -208,10 +211,16 @@ function convertAnthropicToOpenAI(anthropicResponse, model = "claude-3-5-sonnet-
   // Convert content blocks to OpenAI format
   let messageContent = "";
   const toolCalls = [];
+  let citations = [];
   for (const block of content) {
     if (block.type === "text") {
       messageContent += block.text;
+      if (Array.isArray(block.citations)) {
+        citations.push(...block.citations);
+      }
+    } else if (block.type === "thinking") {
+      // Skip thinking blocks in OpenAI format (they don't have an equivalent)
     } else if (block.type === "tool_use") {
       toolCalls.push({
         id: block.id,
@@ -249,6 +258,11 @@ function convertAnthropicToOpenAI(anthropicResponse, model = "claude-3-5-sonnet-
     }
   };
+  // Add citations if present
+  if (citations.length > 0) {
+    openaiResponse.citations = citations;
+  }
   // Add tool_calls if present
   if (toolCalls.length > 0) {
     openaiResponse.choices[0].message.tool_calls = toolCalls;