npm - @agentforge-io/core - Versions diffs - 4.0.2 → 4.0.4 - Mend

@agentforge-io/core 4.0.2 → 4.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/dist/providers/anthropic-provider.js +69 -9
package/dist/services/agent-runner.service.js +6 -1
package/package.json +2 -2

package/dist/providers/anthropic-provider.js CHANGED Viewed

@@ -63,14 +63,24 @@ class AnthropicProvider {
     async *stream(params) {
         const includeTemperature = typeof params.temperature === 'number' &&
             !modelRejectsTemperature(params.model);
-        // Prompt caching: mark the system prompt as `ephemeral` so Anthropic
-        // reuses it across turns. Conversations longer than one turn pay ~10%
-        // of the input-token cost on turns 2+, which is the dominant spend
-        // for our system assistants (each turn re-pays the same ~700k-token
-        // system + tools otherwise). Same `ephemeral` marker on the LAST
-        // tool so tool schemas also cache — combined with the system
-        // breakpoint that uses 2 of the 4 allowed cache slots, leaving 2
-        // for the model to mark hot message ranges itself.
+        // Prompt caching. Anthropic caches the longest matching prefix of the
+        // request (system → tools → messages) and re-reads it at ~0.1x the base
+        // input cost on subsequent turns inside the cache window. We place three
+        // of the four allowed `ephemeral` breakpoints, in prefix order:
+        //
+        //   1. system prompt   — large, identical on every turn.
+        //   2. last tool schema — caches the whole tool block (the API caches up
+        //      to the marked breakpoint, so one marker on the final tool covers
+        //      all of them).
+        //   3. last message    — see `withConversationCacheBreakpoint`. This is
+        //      the one that actually attacks the runaway spend: without it the
+        //      entire growing conversation/tool-loop history is re-billed at full
+        //      price on EVERY turn. With it, the prior history is served from
+        //      cache and only the newest turn is processed fresh.
+        //
+        // That leaves one breakpoint spare. We deliberately don't auto-place it;
+        // a fourth marker only helps with very specific rolling-window patterns
+        // and risks evicting the more valuable system/tools entries.
         //
         // Cost reference (Claude Sonnet 4.6):
         //   - write: $3.75 / MTok  (1.25x base)
@@ -78,6 +88,10 @@ class AnthropicProvider {
         // First turn pays the write; every subsequent turn in the same
         // 5-minute window reads from cache. Net effect on a 4-turn test
         // conversation was $2.53 → $0.31.
+        //
+        // Below the per-model minimum cacheable length (1024 tokens for
+        // Opus/Sonnet, 2048 for Haiku) a breakpoint is silently ignored by the
+        // API — no error, no extra cost — so marking is always safe.
         const cachedSystem = params.systemPrompt
             ? [
                 {
@@ -98,7 +112,7 @@ class AnthropicProvider {
             max_tokens: params.maxTokens,
             ...(includeTemperature ? { temperature: params.temperature } : {}),
             ...(cachedSystem ? { system: cachedSystem } : {}),
-            messages: toAnthropicMessages(params.messages),
+            messages: withConversationCacheBreakpoint(toAnthropicMessages(params.messages)),
             ...(cachedTools.length > 0
                 ? { tools: cachedTools }
                 : {}),
@@ -199,6 +213,52 @@ function toAnthropicMessages(messages) {
         };
     });
 }
+/**
+ * Incremental conversation caching.
+ *
+ * Anthropic caches the longest matching prefix of the request and re-reads it
+ * cheaply on later turns, but it only *writes* a cache entry up to a
+ * `cache_control` breakpoint the caller sets. By stamping the breakpoint on
+ * the LAST content block of the LAST message we write a cache covering the
+ * whole conversation-so-far. On the next turn that prefix is read from cache
+ * (~0.1x input cost) and only the newly appended turn is billed fresh.
+ *
+ * Why this matters: the tool loop and multi-turn chats append to `messages`
+ * every iteration. Without a breakpoint here the entire history is re-sent and
+ * re-billed at full price on every single turn — the dominant source of the
+ * runaway token spend. The breakpoint "moves" with the conversation each turn,
+ * which is exactly the incremental pattern Anthropic recommends: the previous
+ * turn's write becomes this turn's read.
+ *
+ * `cache_control` lives on a content block, so a message whose content is a
+ * bare string is first normalized into a one-element text block. A message
+ * with no blocks at all is left untouched (nothing to mark). The breakpoint is
+ * a no-op below the per-model minimum cacheable length, so this is always safe
+ * to apply.
+ *
+ * The input is never mutated: the message array, the last message, its block
+ * list and the final block are each shallow-copied before being changed.
+ * Every other message and block is shared by reference with the input.
+ *
+ * NOTE(review): the non-string branch spreads `last.content`, so a last
+ * message whose `content` is `null`/`undefined` (or otherwise not iterable)
+ * would throw a TypeError here — presumably `toAnthropicMessages` always
+ * yields a string or an array; confirm against that function.
+ *
+ * @param {Array<{content: (string|Array<Object>)}>} messages
+ *   Messages in request order; only the last entry is inspected.
+ * @returns {Array<Object>} The same `messages` reference when the array is
+ *   empty or the last message has zero content blocks; otherwise a new array
+ *   whose last message carries `cache_control: { type: 'ephemeral' }` on its
+ *   final content block.
+ */
+function withConversationCacheBreakpoint(messages) {
+    if (messages.length === 0)
+        return messages;
+    const lastIdx = messages.length - 1;
+    const last = messages[lastIdx];
+    const blocks = typeof last.content === 'string'
+        ? [{ type: 'text', text: last.content }]
+        : [...last.content];
+    if (blocks.length === 0)
+        return messages;
+    const lastBlockIdx = blocks.length - 1;
+    // Stamp the breakpoint on a shallow copy of the final block so the input
+    // block object is left unmodified. (Per the original note, the SDK types
+    // model `cache_control` only on `beta` blocks; the casts that note refers
+    // to are not present in this compiled output — presumably TS-only.)
+    blocks[lastBlockIdx] = {
+        ...blocks[lastBlockIdx],
+        cache_control: { type: 'ephemeral' },
+    };
+    const out = [...messages];
+    out[lastIdx] = { ...last, content: blocks };
+    return out;
+}
 /**
  * Inverse of `toAnthropicMessages` for the final-message envelope. The
  * runner uses this to rebuild `currentMessages` for the next iteration of

package/dist/services/agent-runner.service.js CHANGED Viewed

@@ -353,7 +353,12 @@ class AgentRunnerService {
                 'account-specific facts, external APIs). Do not refuse to use a ' +
                 'tool because of style or persona instructions further below — ' +
                 'those control your voice, not your capabilities. When a tool is ' +
-                'clearly relevant, call it before composing the final answer.');
+                'clearly relevant, call it before composing the final answer. ' +
+                'A tool call is NEVER the end of your turn: after the tool ' +
+                'results come back, you MUST write a final text response to the ' +
+                'user (the answer, summary, or report). Ending a turn with only ' +
+                'tool calls and no text leaves the user staring at an empty ' +
+                'reply — never do that.');
         }
         parts.push(agent.systemPrompt);
         if (overrides?.systemPromptSuffix)

package/package.json CHANGED Viewed

@@ -1,7 +1,7 @@
 {
   "name": "@agentforge-io/core",
-  "version": "4.0.2",
-  "description": "Framework-free AI runtime SDK. Owns: agent loop (Anthropic), conversations, tools, streaming, agent-job queue, SdkHooks. Identity, billing, infra (email/uploads/secrets) live in the host's modules — not here.",
+  "version": "4.0.4",
+  "description": "Framework-free AI runtime SDK. Owns: agent loop (Anthropic), conversations, tools, streaming, agent-job queue, SdkHooks. Identity, billing, infra (email/uploads/secrets) live in the host's modules \u2014 not here.",
   "license": "MIT",
   "type": "commonjs",
   "main": "dist/index.js",