wolverine-ai 4.0.4 → 4.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "wolverine-ai",
-  "version": "4.0.4",
+  "version": "4.0.5",
   "description": "Self-healing Node.js server framework powered by AI. Catches crashes, diagnoses errors, generates fixes, verifies, and restarts — automatically.",
   "main": "src/index.js",
   "bin": {
@@ -133,6 +133,10 @@ const SEED_DOCS = [
     text: "Configuration: hybrid-always architecture — no provider selection. Users pick the best model for each of 8 task roles directly in settings.json 'models' section. Mix and match: wolverine for audit, claude for reasoning, gpt for coding. Provider auto-detected from model name. Embedding is separate ('embedding' key) — always wolverine-embedding-1 billed through credits (proxies to text-embedding-3-small at 2x markup). Secrets in .env.local. Config priority: env vars > settings.json > defaults.",
     metadata: { topic: "configuration" },
   },
+  {
+    text: "AI client prompt caching: all 3 providers cache automatically. Anthropic: system prompt marked cache_control:ephemeral, 90% cheaper on repeat calls within 5 min TTL. OpenAI: automatic prefix caching for >=1024 token prefixes, 50% cheaper on cached input, tracked via usage.prompt_tokens_details.cached_tokens. Wolverine/llama.cpp: cache_prompt:true in request body reuses KV cache for identical prefixes between requests, near-zero TTFT on second+ call in a heal pipeline. Cache savings tracked in analytics: cacheCreation (tokens written to cache) and cacheRead (tokens served from cache).",
+    metadata: { topic: "prompt-caching" },
+  },
   {
     text: "Platform telemetry: lightweight background process, zero-config. Default platform: api.wolverinenode.xyz. Auto-registers on first run (retries every 60s until platform responds), saves key to .wolverine/platform-key. Heartbeat payload matches PLATFORM.md spec: instanceId, server (name/port/uptime/status/pid), process (memoryMB/cpuPercent), routes, repairs, usage (tokens/cost/calls/byCategory), brain, backups. Offline-resilient: queues up to 1440 heartbeats locally, drains on reconnect. No chalk dependency, cached version/key in memory, minimal IO. Opt out: WOLVERINE_TELEMETRY=false. Override URL: WOLVERINE_PLATFORM_URL.",
     metadata: { topic: "platform-telemetry" },
@@ -21,8 +21,9 @@ function _extractTokens(usage) {
     output: usage.completion_tokens || usage.output_tokens || 0,
     // Anthropic cache fields
     cacheCreation: usage.cache_creation_input_tokens || usage.cache_write_tokens || 0,
-    // OpenAI uses cache_read_tokens, Anthropic uses cache_read_input_tokens
-    cacheRead: usage.cache_read_input_tokens || usage.cache_read_tokens || 0,
+    // OpenAI prompt_tokens_details.cached_tokens + Anthropic cache_read_input_tokens
+    cacheRead: usage.cache_read_input_tokens || usage.cache_read_tokens
+      || usage.prompt_tokens_details?.cached_tokens || 0,
   };
 }
 
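The practical effect of the widened cacheRead fallback: OpenAI reports cache hits only under prompt_tokens_details, so before this change a cached OpenAI response counted as zero. A quick check, assuming _extractTokens returns the object literal shown in the hunk:

// OpenAI usage payload with a 1024-token cached prefix.
const usage = {
  prompt_tokens: 2048,
  completion_tokens: 120,
  prompt_tokens_details: { cached_tokens: 1024 },
};
_extractTokens(usage);
// 4.0.4: { ..., cacheRead: 0 }    (field not consulted)
// 4.0.5: { ..., cacheRead: 1024 }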
@@ -520,19 +521,20 @@ async function _chatCall(openai, { model, systemPrompt, userPrompt, maxTokens, t
   if (systemPrompt) messages.push({ role: "system", content: systemPrompt });
   messages.push({ role: "user", content: userPrompt });
 
-  // No temperature for o-series and gpt-5+ (forbidden, causes error)
   const noTemp = /^(o[1-9]|gpt-5)/.test(model);
+  const isWolverine = detectProvider(model) === "wolverine";
   const params = {
     model, messages,
     ...(!noTemp ? { temperature: 0 } : {}),
     ...tokenParam(model, maxTokens),
     ..._reasoningParams(model),
+    // Prompt caching: llama.cpp reuses KV cache for identical prefixes
+    ...(isWolverine ? { cache_prompt: true } : {}),
   };
 
   if (tools && tools.length > 0) {
     params.tools = tools;
     params.tool_choice = toolChoice || "auto";
-    // Disable parallel calls for reliability — sequential is more predictable for healing
     params.parallel_tool_calls = false;
   }
 
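With the new branch, any model detected as wolverine gets cache_prompt: true in the request body (the hunk itself shows the extra field riding through the OpenAI-style client). A sketch of why this pays off in a heal pipeline, assuming "wolverine-coder" is a hypothetical model ID that detectProvider maps to the wolverine provider:

// Two consecutive calls share the system prompt, so the second request's
// prefix is identical and llama.cpp answers from its KV cache (near-zero TTFT).
const systemPrompt = "You are the diagnosis step of the heal pipeline."; // placeholder
await _chatCall(openai, { model: "wolverine-coder", systemPrompt,
  userPrompt: "Diagnose: TypeError in routes/users.js", maxTokens: 1024 });
await _chatCall(openai, { model: "wolverine-coder", systemPrompt,
  userPrompt: "Generate a fix for that diagnosis.", maxTokens: 1024 });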
@@ -589,11 +591,14 @@ async function _responsesCallWithHistory(openai, { model, messages, tools, maxTo
 
 async function _chatCallWithHistory(openai, { model, messages, tools, maxTokens }) {
   const noTemp = /^(o[1-9]|gpt-5)/.test(model);
+  const isWolverine = detectProvider(model) === "wolverine";
   const params = {
     model, messages,
     ...(!noTemp ? { temperature: 0 } : {}),
     ...tokenParam(model, maxTokens),
     ..._reasoningParams(model),
+    // Prompt caching: llama.cpp KV cache reuse for multi-turn agent conversations
+    ...(isWolverine ? { cache_prompt: true } : {}),
   };
   if (tools && tools.length > 0) {
     params.tools = tools;
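
The history variant benefits even more: an agent loop only appends to messages, so every request shares its entire prefix with the one before it. A sketch under the same assumptions (hypothetical model ID; the reply shape is simplified to a content string):

// The array grows at the tail only, so each call's prefix matches the
// previous request and is served from llama.cpp's KV cache.
const messages = [{ role: "system", content: "Heal the server." }]; // placeholder
for (const step of ["diagnose", "generate fix", "verify"]) {
  messages.push({ role: "user", content: step });
  const reply = await _chatCallWithHistory(openai, {
    model: "wolverine-coder", messages, maxTokens: 1024,
  });
  messages.push({ role: "assistant", content: reply?.content ?? "" }); // shape assumed
}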