npm - lynkr - Versions diffs - 9.0.1 → 9.1.2 - Mend

lynkr 9.0.1 → 9.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (58) hide show

package/README.md +70 -21
package/bin/cli.js +34 -4
package/bin/lynkr-trajectory.js +136 -0
package/bin/lynkr-usage.js +219 -0
package/funding.json +110 -0
package/index.js +7 -3
package/install.sh +3 -3
package/lynkr-skill.tar.gz +0 -0
package/native/Cargo.toml +26 -0
package/native/index.js +29 -0
package/native/lynkr-native.node +0 -0
package/native/src/lib.rs +321 -0
package/package.json +6 -5
package/public/dashboard.html +665 -0
package/src/api/files-multipart.js +30 -0
package/src/api/files-router.js +81 -0
package/src/api/middleware/budget.js +19 -1
package/src/api/middleware/load-shedding.js +17 -0
package/src/api/openai-router.js +353 -301
package/src/api/router.js +275 -40
package/src/cache/prompt.js +13 -0
package/src/clients/databricks.js +42 -18
package/src/clients/ollama-utils.js +21 -17
package/src/clients/openai-format.js +50 -10
package/src/clients/openrouter-utils.js +42 -37
package/src/clients/prompt-cache-injection.js +140 -0
package/src/clients/provider-capabilities.js +41 -0
package/src/clients/responses-format.js +8 -7
package/src/clients/standard-tools.js +1 -1
package/src/clients/xml-tool-extractor.js +307 -0
package/src/cluster.js +82 -0
package/src/config/index.js +16 -0
package/src/context/distill.js +15 -0
package/src/context/tool-result-compressor.js +563 -0
package/src/dashboard/api.js +170 -0
package/src/dashboard/router.js +13 -0
package/src/headroom/client.js +3 -109
package/src/headroom/index.js +0 -14
package/src/memory/extractor.js +22 -0
package/src/memory/search.js +0 -50
package/src/orchestrator/index.js +163 -204
package/src/orchestrator/preflight.js +188 -0
package/src/routing/index.js +64 -32
package/src/routing/interaction.js +183 -0
package/src/routing/risk-analyzer.js +194 -0
package/src/routing/telemetry.js +47 -2
package/src/server.js +15 -0
package/src/stores/file-store.js +104 -0
package/src/stores/response-store.js +25 -0
package/src/tools/index.js +1 -1
package/src/tools/smart-selection.js +11 -2
package/src/tools/web.js +1 -1
package/src/training/trajectory-compressor.js +266 -0
package/src/usage/aggregator.js +206 -0
package/src/utils/markdown-ansi.js +146 -0
package/.lynkr/telemetry.db +0 -0
package/.lynkr/telemetry.db-shm +0 -0
package/.lynkr/telemetry.db-wal +0 -0

package/src/api/router.js CHANGED Viewed

@@ -6,8 +6,10 @@ const logger = require("../logger");
 const { createRateLimiter } = require("./middleware/rate-limiter");
 const openaiRouter = require("./openai-router");
 const providersRouter = require("./providers-handler");
-const { getRoutingHeaders, getRoutingStats, analyzeComplexity, getModelTierSelector } = require("../routing");
+const { getRoutingHeaders, getRoutingStats, analyzeComplexity, getModelTierSelector, analyzeRisk } = require("../routing");
+const { buildInteractionBlock } = require("../routing/interaction");
 const { validateCwd } = require("../workspace");
+const { renderText } = require("../utils/markdown-ansi");
 const router = express.Router();
@@ -63,6 +65,24 @@ router.get("/health", (req, res) => {
   res.json({ status: "ok" });
 });
+// Usage report — same data as the `lynkr usage` CLI, served as JSON for
+// dashboards / agents / scripts that want to surface spend & savings.
+//
+// Query parameters (all optional):
+//   window                    - forwarded as-is; takes precedence over `days`
+//   days                      - integer N, converted to the window string "<N>d"
+//   flagship, provider, model - forwarded unchanged to aggregator.getUsage()
+// The window defaults to "30d" when neither `window` nor a parseable `days`
+// is supplied. Responds 500 with { error } if the aggregator throws.
+router.get("/v1/usage", (req, res) => {
+  try {
+    // Required lazily inside the handler rather than at file load.
+    const aggregator = require("../usage/aggregator");
+    // A non-numeric ?days= (e.g. "abc") parses to NaN; treat it like a missing
+    // value instead of handing the aggregator the bogus window "NaNd".
+    const days = Number.parseInt(req.query.days, 10);
+    const window = req.query.window || (Number.isNaN(days) ? "30d" : `${days}d`);
+    const usage = aggregator.getUsage({
+      window,
+      flagship: req.query.flagship,
+      provider: req.query.provider,
+      model: req.query.model,
+    });
+    res.json(usage);
+  } catch (err) {
+    res.status(500).json({ error: err.message });
+  }
+});
 // Routing stats endpoint (Phase 3: Metrics)
 router.get("/routing/stats", (req, res) => {
   const stats = getRoutingStats();
@@ -213,7 +233,46 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
     const { createTimer } = require("../utils/perf-timer");
     const timer = createTimer("POST /v1/messages");
     metrics.recordRequest();
-    // Support both query parameter (?stream=true) and body parameter ({"stream": true})
+    // Convert Anthropic server tools (web_search_20260209, etc.) to regular
+    // function tools so non-Anthropic providers can execute them via Lynkr.
+    // The orchestrator's SERVER_SIDE_TOOLS handling will execute them server-side.
+    if (Array.isArray(req.body?.tools)) {
+      const incomingToolTypes = req.body.tools.map(t => t?.type || t?.name).filter(Boolean);
+      logger.info({ incomingToolTypes }, "Incoming /v1/messages tool types");
+      req.body.tools = req.body.tools.map((tool) => {
+        if (tool?.type?.startsWith?.("web_search_20")) {
+          logger.info({ originalType: tool.type, name: tool.name }, "Converting web_search server tool to function tool");
+          return {
+            name: tool.name || "web_search",
+            description: "Search the web for up-to-date information. Returns relevant search results from the web.",
+            input_schema: {
+              type: "object",
+              properties: {
+                query: { type: "string", description: "Search query" },
+              },
+              required: ["query"],
+            },
+          };
+        }
+        if (tool?.type?.startsWith?.("web_fetch_")) {
+          return {
+            name: tool.name || "web_fetch",
+            description: "Fetch the contents of a URL.",
+            input_schema: {
+              type: "object",
+              properties: {
+                url: { type: "string", description: "URL to fetch" },
+              },
+              required: ["url"],
+            },
+          };
+        }
+        return tool;
+      });
+    }
+// Support both query parameter (?stream=true) and body parameter ({"stream": true})
     const wantsStream = Boolean(req.query?.stream === 'true' || req.body?.stream);
     const hasTools = Array.isArray(req.body?.tools) && req.body.tools.length > 0;
     timer.mark("parseRequest");
@@ -221,24 +280,70 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
     // Analyze complexity for routing headers (Phase 3)
     const complexity = await analyzeComplexity(req.body);
     timer.mark("analyzeComplexity");
+    // Risk axis runs alongside complexity. Cheap pure-string scan, no I/O.
+    let preRouteRisk = null;
+    try {
+      preRouteRisk = analyzeRisk(req.body);
+    } catch (err) {
+      logger.debug({ err: err.message }, '[Router] Risk analysis failed in pre-route');
+    }
+    // Pre-route tier: high-risk forces COMPLEX, otherwise tier is
+    // inferred from the complexity recommendation. The actual final
+    // tier may differ (invokeModel re-runs determineProviderSmart) —
+    // this is best-effort for header surfacing.
     let preRouteProvider = 'cloud';
-    if (complexity.recommendation === 'local') {
-      // Use tier config to determine actual provider instead of hardcoding 'ollama'
+    let preRouteTier = null;
+    let preRouteModel = null;
+    let preRouteMethod = 'complexity';
+    let preRouteReason = complexity.breakdown?.taskType?.reason || complexity.recommendation;
+    if (preRouteRisk?.level === 'high') {
       try {
         const selector = getModelTierSelector();
-        const tierResult = selector.selectModel('SIMPLE', null);
+        const tierResult = selector.selectModel('COMPLEX', null);
         preRouteProvider = tierResult.provider;
+        preRouteTier = 'COMPLEX';
+        preRouteModel = tierResult.model;
+        preRouteMethod = 'risk';
+        preRouteReason = 'high_risk_forced_tier';
       } catch (_) {
-        preRouteProvider = 'ollama';
+        // Risk-forced tier not configured; fall back to normal flow.
       }
     }
-    const routingHeaders = getRoutingHeaders({
+    if (!preRouteTier) {
+      if (complexity.recommendation === 'local') {
+        try {
+          const selector = getModelTierSelector();
+          const tierResult = selector.selectModel('SIMPLE', null);
+          preRouteProvider = tierResult.provider;
+          preRouteTier = 'SIMPLE';
+          preRouteModel = tierResult.model;
+        } catch (_) {
+          preRouteProvider = 'ollama';
+        }
+      }
+    }
+    const preRouteDecision = {
       provider: preRouteProvider,
+      tier: preRouteTier,
+      model: preRouteModel,
+      method: preRouteMethod,
+      reason: preRouteReason,
       score: complexity.score,
       threshold: complexity.threshold,
-      method: 'complexity',
-      reason: complexity.breakdown?.taskType?.reason || complexity.recommendation,
-    });
+      risk: preRouteRisk,
+    };
+    const routingHeaders = getRoutingHeaders(preRouteDecision);
+    // Build the interaction block once. It travels in headers always
+    // (X-Lynkr-Interaction-* derived fields) and optionally into the
+    // response body when LYNKR_VISIBLE_ROUTING=true.
+    const interaction = buildInteractionBlock(preRouteDecision);
     // Extract client CWD from request body or header
     const clientCwd = validateCwd(req.body?.cwd || req.headers['x-workspace-cwd']);
@@ -369,7 +474,11 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
       })}\n\n`);
       // 2. content_block_start and content_block_delta for each content block
-      const contentBlocks = msg.content || [];
+      // Filter out server-side tools that shouldn't reach the client
+      const _serverTools = new Set(["task", "websearch", "webfetch", "web_search", "web_fetch", "web_agent"]);
+      const contentBlocks = (msg.content || []).filter(b =>
+        !(b.type === "tool_use" && _serverTools.has((b.name || "").toLowerCase()))
+      );
       for (let i = 0; i < contentBlocks.length; i++) {
         const block = contentBlocks[i];
@@ -381,38 +490,90 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
             content_block: { type: "text", text: "" }
           })}\n\n`);
-          // Send text in chunks
-          const text = block.text || "";
-          const chunkSize = 20;
-          for (let j = 0; j < text.length; j += chunkSize) {
-            const chunk = text.slice(j, j + chunkSize);
-            res.write(`event: content_block_delta\n`);
-            res.write(`data: ${JSON.stringify({
-              type: "content_block_delta",
-              index: i,
-              delta: { type: "text_delta", text: chunk }
-            })}\n\n`);
+          // Send text — one chunk when ANSI rendering is active (splitting
+          // ANSI escape sequences across 20-char chunks breaks terminal output).
+          // Plain text falls back to line-level chunks for a trickle effect.
+          // Never apply ANSI rendering to HTML content (<artifact> blocks):
+          // ANSI codes corrupt CSS selectors like `*` and break the browser viewer.
+          const rawBlockText = block.text || "";
+          const isHtmlContent = rawBlockText.includes("<artifact") || rawBlockText.trimStart().startsWith("<");
+          const text = isHtmlContent ? rawBlockText : renderText(rawBlockText);
+          const { enabled: ansiEnabled } = require("../utils/markdown-ansi");
+          if (ansiEnabled && !isHtmlContent) {
+            if (text.length > 0) {
+              res.write(`event: content_block_delta\n`);
+              res.write(`data: ${JSON.stringify({
+                type: "content_block_delta",
+                index: i,
+                delta: { type: "text_delta", text }
+              })}\n\n`);
+            }
+          } else {
+            const lines = text.split("\n");
+            for (const line of lines) {
+              const lineWithNl = line + "\n";
+              res.write(`event: content_block_delta\n`);
+              res.write(`data: ${JSON.stringify({
+                type: "content_block_delta",
+                index: i,
+                delta: { type: "text_delta", text: lineWithNl }
+              })}\n\n`);
+            }
           }
           res.write(`event: content_block_stop\n`);
           res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
-        } else if (block.type === "tool_use") {
+        } else if (block.type === "thinking") {
           res.write(`event: content_block_start\n`);
           res.write(`data: ${JSON.stringify({
             type: "content_block_start",
             index: i,
-            content_block: { type: "tool_use", id: block.id, name: block.name, input: {} }
-          })}\n\n`);
-          res.write(`event: content_block_delta\n`);
-          res.write(`data: ${JSON.stringify({
-            type: "content_block_delta",
-            index: i,
-            delta: { type: "input_json_delta", partial_json: JSON.stringify(block.input) }
+            content_block: { type: "thinking", thinking: "" }
           })}\n\n`);
+          const thinkingText = block.thinking || "";
+          const thinkChunkSize = 40;
+          for (let j = 0; j < thinkingText.length; j += thinkChunkSize) {
+            res.write(`event: content_block_delta\n`);
+            res.write(`data: ${JSON.stringify({
+              type: "content_block_delta",
+              index: i,
+              delta: { type: "thinking_delta", thinking: thinkingText.slice(j, j + thinkChunkSize) }
+            })}\n\n`);
+          }
           res.write(`event: content_block_stop\n`);
           res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
+        } else if (block.type === "tool_use") {
+          // Original request had no tools → model hallucinated a tool call.
+          // Extract file content from write-style tools and wrap it in an
+          // <artifact> block so open-design routes it to the Design panel.
+          const toolName = (block.name || "").toLowerCase();
+          const writeTools = new Set(["write", "create_file", "write_file", "str_replace_editor"]);
+          if (writeTools.has(toolName)) {
+            const rawContent = block.input?.content ?? block.input?.file_content ?? block.input?.new_content ?? "";
+            const filePath = String(block.input?.file_path ?? block.input?.filename ?? "design.html");
+            const content = String(rawContent);
+            if (content) {
+              // Wrap in <artifact> so open-design's parser routes it to the file viewer.
+              const identifier = filePath.replace(/[^a-zA-Z0-9._-]/g, "_");
+              const title = filePath;
+              const wrapped = `<artifact identifier="${identifier}" type="text/html" title="${title}">\n${content}\n</artifact>`;
+              res.write(`event: content_block_start\n`);
+              res.write(`data: ${JSON.stringify({
+                type: "content_block_start",
+                index: i,
+                content_block: { type: "text", text: "" }
+              })}\n\n`);
+              res.write(`event: content_block_delta\n`);
+              res.write(`data: ${JSON.stringify({
+                type: "content_block_delta",
+                index: i,
+                delta: { type: "text_delta", text: wrapped }
+              })}\n\n`);
+              res.write(`event: content_block_stop\n`);
+              res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
+            }
+          }
+          // Non-write tool_use in a tool-less request is silently dropped.
         }
       }
@@ -488,7 +649,11 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
       })}\n\n`);
       // 2. content_block_start and content_block_delta for each content block
-      const contentBlocks = msg.content || [];
+      // Filter out server-side tools that shouldn't reach the client
+      const _serverTools = new Set(["task", "websearch", "webfetch", "web_search", "web_fetch", "web_agent"]);
+      const contentBlocks = (msg.content || []).filter(b =>
+        !(b.type === "tool_use" && _serverTools.has((b.name || "").toLowerCase()))
+      );
       for (let i = 0; i < contentBlocks.length; i++) {
         const block = contentBlocks[i];
@@ -500,18 +665,51 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
             content_block: { type: "text", text: "" }
           })}\n\n`);
-          const text = block.text || "";
-          const chunkSize = 20;
-          for (let j = 0; j < text.length; j += chunkSize) {
-            const chunk = text.slice(j, j + chunkSize);
+          const rawBlockText2 = block.text || "";
+          const isHtmlContent2 = rawBlockText2.includes("<artifact") || rawBlockText2.trimStart().startsWith("<");
+          const text = isHtmlContent2 ? rawBlockText2 : renderText(rawBlockText2);
+          const { enabled: ansiEnabled } = require("../utils/markdown-ansi");
+          if (ansiEnabled && !isHtmlContent2) {
+            if (text.length > 0) {
+              res.write(`event: content_block_delta\n`);
+              res.write(`data: ${JSON.stringify({
+                type: "content_block_delta",
+                index: i,
+                delta: { type: "text_delta", text }
+              })}\n\n`);
+            }
+          } else {
+            const lines = text.split("\n");
+            for (const line of lines) {
+              const lineWithNl = line + "\n";
+              res.write(`event: content_block_delta\n`);
+              res.write(`data: ${JSON.stringify({
+                type: "content_block_delta",
+                index: i,
+                delta: { type: "text_delta", text: lineWithNl }
+              })}\n\n`);
+            }
+          }
+          res.write(`event: content_block_stop\n`);
+          res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
+        } else if (block.type === "thinking") {
+          res.write(`event: content_block_start\n`);
+          res.write(`data: ${JSON.stringify({
+            type: "content_block_start",
+            index: i,
+            content_block: { type: "thinking", thinking: "" }
+          })}\n\n`);
+          const thinkingText = block.thinking || "";
+          const thinkChunkSize = 40;
+          for (let j = 0; j < thinkingText.length; j += thinkChunkSize) {
             res.write(`event: content_block_delta\n`);
             res.write(`data: ${JSON.stringify({
               type: "content_block_delta",
               index: i,
-              delta: { type: "text_delta", text: chunk }
+              delta: { type: "thinking_delta", thinking: thinkingText.slice(j, j + thinkChunkSize) }
             })}\n\n`);
           }
           res.write(`event: content_block_stop\n`);
           res.write(`data: ${JSON.stringify({ type: "content_block_stop", index: i })}\n\n`);
         } else if (block.type === "tool_use") {
@@ -566,8 +764,33 @@ router.post("/v1/messages", rateLimiter, async (req, res, next) => {
       });
     }
+    // Inject visible interaction block into the response body when
+    // LYNKR_VISIBLE_ROUTING=true. We only mutate JSON bodies — and only
+    // when the response looks like a valid Anthropic Message — so this
+    // is a no-op for streamed / error / non-message responses.
+    let finalBody = result.body;
+    if (
+      config.routing?.visibleInteraction &&
+      interaction &&
+      result.status >= 200 && result.status < 300 &&
+      result.body
+    ) {
+      try {
+        const text = Buffer.isBuffer(result.body) ? result.body.toString('utf8') : result.body;
+        if (typeof text === 'string' && text.startsWith('{')) {
+          const parsed = JSON.parse(text);
+          if (parsed && typeof parsed === 'object' && parsed.type === 'message') {
+            parsed.lynkr_interaction = interaction;
+            finalBody = JSON.stringify(parsed);
+          }
+        }
+      } catch (err) {
+        logger.debug({ err: err.message }, '[Router] Skipped interaction injection (non-JSON body)');
+      }
+    }
     metrics.recordResponse(result.status);
-    res.status(result.status).send(result.body);
+    res.status(result.status).send(finalBody);
   } catch (error) {
     next(error);
   }
@@ -724,6 +947,18 @@ router.get("/metrics/compression", async (req, res) => {
   }
 });
+// Serves whatever getMetrics() from the tool-result compressor returns, as JSON.
+// The compressor module is required lazily inside the handler rather than at file load.
+router.get("/metrics/tool-compression", (req, res) => {
+  const { getMetrics } = require("../context/tool-result-compressor");
+  res.json(getMetrics());
+});
+// Returns the tee entry that teeGet() yields for :id, sent as text/plain.
+// Any falsy result from teeGet() is answered with a 404 JSON error
+// (presumably "unknown or expired id" — confirm against tool-result-compressor).
+// NOTE(review): an entry whose content is the empty string would also take the 404 branch.
+router.get("/tee/:id", (req, res) => {
+  const { teeGet } = require("../context/tool-result-compressor");
+  const content = teeGet(req.params.id);
+  if (!content) return res.status(404).json({ error: "Tee entry not found or expired" });
+  res.type("text/plain").send(content);
+});
 router.get("/health/headroom", async (req, res) => {
   try {
     const { getHeadroomManager } = require("../headroom");

package/src/cache/prompt.js CHANGED Viewed

@@ -5,6 +5,15 @@ try {
 } catch {
   Database = null;
 }
+// Try to load native Rust cache key computation (4x faster for small payloads)
+// nativeCacheKey stays null when the native module cannot be required, does not
+// report itself as available, or does not export computeCacheKey; code that
+// checks nativeCacheKey then keeps using the JS hashing path.
+let nativeCacheKey = null;
+try {
+  const native = require('../../native');
+  // Only adopt the binding when the module reports itself usable AND exposes the function.
+  if (native.available && native.computeCacheKey) {
+    nativeCacheKey = native.computeCacheKey;
+  }
+} catch { /* native module not available — use JS */ }
 const path = require("path");
 const fs = require("fs");
 const config = require("../config");
@@ -164,6 +173,10 @@ class PromptCache {
         max_tokens: payload.max_tokens ?? null,
       };
       const serialised = stableStringify(canonical);
+      // Use Rust for small payloads where it's 4x faster
+      if (nativeCacheKey && serialised.length < 5000) {
+        return nativeCacheKey(serialised);
+      }
       return crypto.createHash("sha256").update(serialised).digest("hex");
     } catch (error) {
       logger.warn(

package/src/clients/databricks.js CHANGED Viewed

@@ -34,19 +34,20 @@ logger.info({ maxConcurrent: zaiMaxConcurrent }, "Z.AI bulkhead initialized");
 // HTTP connection pooling for better performance
+// Increased maxSockets for high-concurrency team deployments (50+ devs)
 const httpAgent = new http.Agent({
   keepAlive: true,
-  maxSockets: 50,
-  maxFreeSockets: 10,
-  timeout: 60000,
+  maxSockets: 200,
+  maxFreeSockets: 20,
+  timeout: 120000,
   keepAliveMsecs: 30000,
 });
 const httpsAgent = new https.Agent({
   keepAlive: true,
-  maxSockets: 50,
-  maxFreeSockets: 10,
-  timeout: 60000,
+  maxSockets: 200,
+  maxFreeSockets: 20,
+  timeout: 120000,
   keepAliveMsecs: 30000,
 });
@@ -220,7 +221,7 @@ async function invokeOllama(body) {
   const useAnthropicApi = await hasAnthropicEndpoint(config.ollama.endpoint);
   // Check if model supports tools FIRST (before wasteful injection)
-  const supportsTools = await checkOllamaToolSupport(config.ollama.model);
+  const supportsTools = await checkOllamaToolSupport(modelName);
   const injectToolsOllama = process.env.INJECT_TOOLS_OLLAMA !== "false";
   // Determine tools to send
@@ -271,7 +272,7 @@ async function invokeOllama(body) {
       model: modelName,
       messages: body.messages,
       max_tokens: body.max_tokens || 16384,
-      stream: false,
+      stream: body.stream ?? false,
     };
     if (body.system) ollamaBody.system = body.system;
@@ -338,7 +339,7 @@ async function invokeOllama(body) {
   const ollamaBody = {
     model: modelName,
     messages: deduplicated,
-    stream: false,
+    stream: body.stream ?? false,
     options: {
       temperature: body.temperature ?? 0.7,
       num_predict: body.max_tokens ?? 16384,
@@ -475,13 +476,17 @@ async function invokeAzureOpenAI(body) {
   // System prompt injection disabled - breaks model response
   // Tool guidance now provided via tool descriptions instead
+  const azureDeployment = body._suggestionModeModel || body._tierModel || config.azureOpenAI.deployment || "";
+  const isGpt5 = /gpt-5/i.test(azureDeployment);
+  const maxTokensKey = isGpt5 ? "max_completion_tokens" : "max_tokens";
   const azureBody = {
     messages,
-    temperature: body.temperature ?? 0.3,  // Lower temperature for more deterministic, action-oriented behavior
-    max_tokens: Math.min(body.max_tokens ?? 16384, 16384),  // Cap at Azure OpenAI's limit
+    temperature: body.temperature ?? 0.3,
+    [maxTokensKey]: Math.min(body.max_tokens ?? 16384, 16384),
     top_p: body.top_p ?? 1.0,
-    stream: false,  // Force non-streaming for Azure OpenAI - streaming format conversion not yet implemented
-    model: body._suggestionModeModel || body._tierModel || config.azureOpenAI.deployment
+    stream: false,
+    model: azureDeployment
   };
   // Add tools - inject standard tools if client didn't send any (passthrough mode)
@@ -1598,20 +1603,34 @@ function convertOpenAIToAnthropic(response) {
   const message = choice.message || {};
   const content = [];
+  // Extract tool calls embedded as XML/text in content (Minimax, Qwen, GLM, etc.)
+  if (!message.tool_calls?.length && typeof message.content === "string" && message.content.trim()) {
+    const { extractToolCallsFromText } = require("./xml-tool-extractor");
+    const extracted = extractToolCallsFromText(message.content);
+    if (extracted.toolCalls.length > 0) {
+      message.tool_calls = extracted.toolCalls;
+      message.content = extracted.cleanedText;
+      choice.finish_reason = "tool_calls";
+    }
+  }
   // Add text content from message.content
   // Don't add placeholder text if there are tool_calls - tools are the actual response
   const hasToolCalls = Array.isArray(message.tool_calls) && message.tool_calls.length > 0;
-  // Extract text content - handle thinking models that split content/reasoning
+  // Extract text content and reasoning from thinking models
   const textContent = typeof message.content === 'string' ? message.content : '';
   const reasoningContent = typeof message.reasoning_content === 'string' ? message.reasoning_content : '';
+  // Emit reasoning_content as a proper thinking block (not discarded)
+  if (reasoningContent) {
+    content.push({ type: "thinking", thinking: reasoningContent });
+  }
   if (textContent) {
-    // Has regular content - use it directly (ignore reasoning_content chain-of-thought)
     content.push({ type: "text", text: textContent });
-  } else if (reasoningContent) {
-    // Fallback: thinking models where content is empty but reasoning has the output
-    content.push({ type: "text", text: reasoningContent });
+  } else if (!reasoningContent) {
+    // No content and no reasoning — will be handled by the empty check below
   }
   // Convert tool calls
@@ -2028,6 +2047,11 @@ async function invokeModel(body, options = {}) {
     body._tierModel = tierSelectedModel;
   }
+  // Inject provider-side prompt caching (cache_control breakpoints)
+  // Reduces input token cost by up to 90% and latency by up to 80%
+  const { injectPromptCaching } = require('./prompt-cache-injection');
+  injectPromptCaching(body, initialProvider);
   // Build routing decision object for response headers
   const routingDecision = {
     provider: initialProvider,

package/src/clients/ollama-utils.js CHANGED Viewed

@@ -77,25 +77,29 @@ async function hasAnthropicEndpoint(baseUrl) {
   if (anthropicEndpointAvailable !== null) return anthropicEndpointAvailable;
   try {
-    // Send a minimal request — we only care about whether the route exists
-    const res = await fetch(`${baseUrl}/v1/messages`, {
-      method: "POST",
-      headers: {
-        "Content-Type": "application/json",
-        "anthropic-version": "2023-06-01",
-      },
-      body: JSON.stringify({
-        model: "probe",
-        max_tokens: 1,
-        messages: [{ role: "user", content: "hi" }],
-      }),
+    // Check Ollama version — /v1/messages requires v0.14.0+
+    // This is instant (no LLM inference) vs the old probe that sent a real request
+    const controller = new AbortController();
+    const timeout = setTimeout(() => controller.abort(), 3000);
+    const versionRes = await fetch(`${baseUrl}/api/version`, {
+      method: "GET",
+      signal: controller.signal,
     });
-    // 404 → endpoint doesn't exist (old Ollama)
-    // Any other status (200, 400, 500) → endpoint exists
-    anthropicEndpointAvailable = res.status !== 404;
+    clearTimeout(timeout);
+    if (versionRes.ok) {
+      const versionData = await versionRes.json().catch(() => null);
+      const version = versionData?.version || "0.0.0";
+      const [major, minor] = version.split(".").map(Number);
+      // v0.14.0+ has the Anthropic Messages API
+      anthropicEndpointAvailable = major > 0 || (major === 0 && minor >= 14);
+    } else {
+      // Can't determine version — fall back to legacy
+      anthropicEndpointAvailable = false;
+    }
     logger.info(
-      { available: anthropicEndpointAvailable, status: res.status },
+      { available: anthropicEndpointAvailable, status: versionRes.status },
       anthropicEndpointAvailable
         ? "Ollama Anthropic API detected (/v1/messages) — using native passthrough"
         : "Ollama Anthropic API not available — falling back to legacy /api/chat (upgrade to Ollama v0.14.0+ for best results)"