wolverine-ai 4.0.3 → 4.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/brain/brain.js +4 -0
- package/src/brain/embedder.js +2 -2
- package/src/core/ai-client.js +11 -6
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "wolverine-ai",
-  "version": "4.0.3",
+  "version": "4.0.5",
   "description": "Self-healing Node.js server framework powered by AI. Catches crashes, diagnoses errors, generates fixes, verifies, and restarts — automatically.",
   "main": "src/index.js",
   "bin": {
package/src/brain/brain.js
CHANGED
@@ -133,6 +133,10 @@ const SEED_DOCS = [
     text: "Configuration: hybrid-always architecture — no provider selection. Users pick the best model for each of 8 task roles directly in settings.json 'models' section. Mix and match: wolverine for audit, claude for reasoning, gpt for coding. Provider auto-detected from model name. Embedding is separate ('embedding' key) — always wolverine-embedding-1 billed through credits (proxies to text-embedding-3-small at 2x markup). Secrets in .env.local. Config priority: env vars > settings.json > defaults.",
     metadata: { topic: "configuration" },
   },
+  {
+    text: "AI client prompt caching: all 3 providers cache automatically. Anthropic: system prompt marked cache_control:ephemeral, 90% cheaper on repeat calls within 5 min TTL. OpenAI: automatic prefix caching for >=1024 token prefixes, 50% cheaper on cached input, tracked via usage.prompt_tokens_details.cached_tokens. Wolverine/llama.cpp: cache_prompt:true in request body reuses KV cache for identical prefixes between requests, near-zero TTFT on second+ call in a heal pipeline. Cache savings tracked in analytics: cacheCreation (tokens written to cache) and cacheRead (tokens served from cache).",
+    metadata: { topic: "prompt-caching" },
+  },
   {
     text: "Platform telemetry: lightweight background process, zero-config. Default platform: api.wolverinenode.xyz. Auto-registers on first run (retries every 60s until platform responds), saves key to .wolverine/platform-key. Heartbeat payload matches PLATFORM.md spec: instanceId, server (name/port/uptime/status/pid), process (memoryMB/cpuPercent), routes, repairs, usage (tokens/cost/calls/byCategory), brain, backups. Offline-resilient: queues up to 1440 heartbeats locally, drains on reconnect. No chalk dependency, cached version/key in memory, minimal IO. Opt out: WOLVERINE_TELEMETRY=false. Override URL: WOLVERINE_PLATFORM_URL.",
     metadata: { topic: "platform-telemetry" },
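
Note: the new prompt-caching seed doc describes three distinct mechanisms. As a rough standalone sketch (not the package's code; the prompt values and model ids below are placeholders), the per-provider request shapes look like this:

// Anthropic: mark the long, stable system prompt as cacheable (ephemeral, ~5 min TTL).
const systemPrompt = "You are the healing agent ...";   // long, repeated prefix
const userPrompt = "Diagnose this crash: ...";

const anthropicBody = {
  model: "claude-sonnet-4-5",                            // placeholder model id
  max_tokens: 1024,
  system: [{ type: "text", text: systemPrompt, cache_control: { type: "ephemeral" } }],
  messages: [{ role: "user", content: userPrompt }],
};

// OpenAI: nothing to set; prefixes of >=1024 tokens are cached automatically and
// hits are reported in usage.prompt_tokens_details.cached_tokens.
const openaiBody = {
  model: "gpt-4o-mini",                                  // placeholder model id
  messages: [
    { role: "system", content: systemPrompt },
    { role: "user", content: userPrompt },
  ],
};

// Wolverine / llama.cpp: opt in per request so the server reuses its KV cache
// for identical prefixes (near-zero time-to-first-token on repeat calls).
const wolverineBody = { ...openaiBody, cache_prompt: true };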
package/src/brain/embedder.js
CHANGED
@@ -51,7 +51,7 @@ async function embed(text) {
     response = await client.embeddings.create({ model, input: text });
   } catch (err) {
     // If wolverine proxy is down (startup, crash loop), fall back to OpenAI direct
-    if (provider === "wolverine" && /ECONNREFUSED|ECONNRESET|ETIMEDOUT|fetch failed/i.test(err.message || "")) {
+    if (provider === "wolverine" && /ECONNREFUSED|ECONNRESET|ETIMEDOUT|fetch failed|Connection error/i.test(err.message || "")) {
       const directClient = getClient("openai");
       response = await directClient.embeddings.create({ model: "text-embedding-3-small", input: text });
     } else {
@@ -98,7 +98,7 @@ async function embedBatch(texts) {
   try {
     response = await client.embeddings.create({ model, input: uncached });
   } catch (err) {
-    if (provider === "wolverine" && /ECONNREFUSED|ECONNRESET|ETIMEDOUT|fetch failed/i.test(err.message || "")) {
+    if (provider === "wolverine" && /ECONNREFUSED|ECONNRESET|ETIMEDOUT|fetch failed|Connection error/i.test(err.message || "")) {
       const directClient = getClient("openai");
       response = await directClient.embeddings.create({ model: "text-embedding-3-small", input: uncached });
     } else {
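
Note: the only change in embedder.js is the widened connection-error regex. "Connection error" appears to be the message the OpenAI-compatible SDK raises (APIConnectionError) when the wolverine proxy is unreachable at all, which the raw socket patterns (ECONNREFUSED etc.) did not match, so the fallback to direct OpenAI embeddings never fired. A standalone sketch of the same check, with a hypothetical helper name not taken from the package:

// Hypothetical helper mirroring the widened check: SDK-level "Connection error."
// is treated like a raw socket failure, so the OpenAI-direct fallback still fires.
const CONN_ERR = /ECONNREFUSED|ECONNRESET|ETIMEDOUT|fetch failed|Connection error/i;

function isProxyDown(err) {
  return CONN_ERR.test(err?.message || "");
}

// usage: if (provider === "wolverine" && isProxyDown(err)) { /* fall back */ }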
package/src/core/ai-client.js
CHANGED
@@ -21,8 +21,9 @@ function _extractTokens(usage) {
     output: usage.completion_tokens || usage.output_tokens || 0,
     // Anthropic cache fields
     cacheCreation: usage.cache_creation_input_tokens || usage.cache_write_tokens || 0,
-    // OpenAI
-    cacheRead: usage.cache_read_input_tokens || usage.cache_read_tokens
+    // OpenAI prompt_tokens_details.cached_tokens + Anthropic cache_read_input_tokens
+    cacheRead: usage.cache_read_input_tokens || usage.cache_read_tokens
+      || usage.prompt_tokens_details?.cached_tokens || 0,
   };
 }
 
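Note: cacheRead previously only covered the Anthropic/wolverine field names, so OpenAI cache hits (reported under usage.prompt_tokens_details.cached_tokens) showed up as zero in analytics. A standalone sketch of the widened normalization (the input mapping on the first line is assumed; only the other fields appear in this diff):

function extractTokens(usage = {}) {
  return {
    input: usage.prompt_tokens || usage.input_tokens || 0,   // assumed, not shown in the diff
    output: usage.completion_tokens || usage.output_tokens || 0,
    cacheCreation: usage.cache_creation_input_tokens || usage.cache_write_tokens || 0,
    cacheRead: usage.cache_read_input_tokens || usage.cache_read_tokens
      || usage.prompt_tokens_details?.cached_tokens || 0,
  };
}

// e.g. an OpenAI response with a cached prefix:
// extractTokens({ prompt_tokens: 1800, completion_tokens: 220,
//                 prompt_tokens_details: { cached_tokens: 1536 } })
//   -> { input: 1800, output: 220, cacheCreation: 0, cacheRead: 1536 }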
@@ -232,7 +233,7 @@ async function aiCall({ model, systemPrompt, userPrompt, maxTokens = 2048, tools
       result = await _chatCall(_getWolverineClient(), { model, systemPrompt, userPrompt, maxTokens, tools, toolChoice });
     } catch (proxyErr) {
       // If billing proxy is down (server crashing), fall back to direct GPU
-      const isConnErr = /ECONNREFUSED|ECONNRESET|ETIMEDOUT|fetch failed/i.test(proxyErr.message || "");
+      const isConnErr = /ECONNREFUSED|ECONNRESET|ETIMEDOUT|fetch failed|Connection error/i.test(proxyErr.message || "");
       const directClient = _getWolverineDirectClient();
       if (isConnErr && directClient) {
         console.log(chalk.yellow(" ⚠️ Billing proxy down — using direct GPU (unbilled)"));
@@ -269,7 +270,7 @@ async function aiCallWithHistory({ model, messages, tools, maxTokens = 4096, cat
     try {
       result = await _chatCallWithHistory(_getWolverineClient(), { model, messages, tools, maxTokens });
     } catch (proxyErr) {
-      const isConnErr = /ECONNREFUSED|ECONNRESET|ETIMEDOUT|fetch failed/i.test(proxyErr.message || "");
+      const isConnErr = /ECONNREFUSED|ECONNRESET|ETIMEDOUT|fetch failed|Connection error/i.test(proxyErr.message || "");
       const directClient = _getWolverineDirectClient();
       if (isConnErr && directClient) {
         console.log(chalk.yellow(" ⚠️ Billing proxy down — using direct GPU (unbilled)"));
@@ -520,19 +521,20 @@ async function _chatCall(openai, { model, systemPrompt, userPrompt, maxTokens, t
   if (systemPrompt) messages.push({ role: "system", content: systemPrompt });
   messages.push({ role: "user", content: userPrompt });
 
-  // No temperature for o-series and gpt-5+ (forbidden, causes error)
   const noTemp = /^(o[1-9]|gpt-5)/.test(model);
+  const isWolverine = detectProvider(model) === "wolverine";
   const params = {
     model, messages,
     ...(!noTemp ? { temperature: 0 } : {}),
     ...tokenParam(model, maxTokens),
     ..._reasoningParams(model),
+    // Prompt caching: llama.cpp reuses KV cache for identical prefixes
+    ...(isWolverine ? { cache_prompt: true } : {}),
   };
 
   if (tools && tools.length > 0) {
     params.tools = tools;
     params.tool_choice = toolChoice || "auto";
-    // Disable parallel calls for reliability — sequential is more predictable for healing
     params.parallel_tool_calls = false;
   }
 
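Note: cache_prompt only means something to the llama.cpp-backed wolverine server; OpenAI- and Anthropic-bound requests never include it. A sketch of how the spreads resolve (detectProvider, tokenParam and _reasoningParams are stubbed here with assumed shapes, and the model ids are placeholders):

// Stubs with assumed shapes, for illustration only.
const detectProvider = (m) => (m.startsWith("wolverine") ? "wolverine" : "openai");
const tokenParam = (m, max) => ({ max_tokens: max });
const _reasoningParams = () => ({});

function buildParams(model, messages, maxTokens) {
  const noTemp = /^(o[1-9]|gpt-5)/.test(model);
  const isWolverine = detectProvider(model) === "wolverine";
  return {
    model, messages,
    ...(!noTemp ? { temperature: 0 } : {}),
    ...tokenParam(model, maxTokens),
    ..._reasoningParams(model),
    // Only wolverine requests carry the llama.cpp KV-cache flag.
    ...(isWolverine ? { cache_prompt: true } : {}),
  };
}

// buildParams("wolverine-coder", [], 2048)
//   -> { model: "wolverine-coder", messages: [], temperature: 0, max_tokens: 2048, cache_prompt: true }
// buildParams("o3-mini", [], 2048)
//   -> { model: "o3-mini", messages: [], max_tokens: 2048 }   // no temperature, no cache_prompt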
@@ -589,11 +591,14 @@ async function _responsesCallWithHistory(openai, { model, messages, tools, maxTo
 
 async function _chatCallWithHistory(openai, { model, messages, tools, maxTokens }) {
   const noTemp = /^(o[1-9]|gpt-5)/.test(model);
+  const isWolverine = detectProvider(model) === "wolverine";
   const params = {
     model, messages,
     ...(!noTemp ? { temperature: 0 } : {}),
     ...tokenParam(model, maxTokens),
     ..._reasoningParams(model),
+    // Prompt caching: llama.cpp KV cache reuse for multi-turn agent conversations
+    ...(isWolverine ? { cache_prompt: true } : {}),
   };
   if (tools && tools.length > 0) {
     params.tools = tools;