npm - @blockrun/franklin - Versions diffs - 3.24.2 → 3.24.4 - Mend

@blockrun/franklin 3.24.2 → 3.24.4

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (3)

package/dist/agent/llm.d.ts CHANGED Viewed

@@ -89,20 +89,6 @@ export declare function sanitizeTableUnicode(s: string): string;
  * Peel those layers so the UI doesn't show raw nested JSON.
  */
 export declare function extractApiErrorMessage(errorBody: string): string;
-/**
- * Apply Anthropic prompt caching using the `system_and_3` strategy.
- * Pattern from nousresearch/hermes-agent `agent/prompt_caching.py`.
- *
- * Places 4 cache_control breakpoints (Anthropic's max):
- *   1. System prompt (stable across all turns)
- *   2-4. Last 3 non-system messages (rolling window)
- *
- * Also caches the last tool definition (tools are stable across turns).
- *
- * This keeps the cache warm: each new turn extends the cached prefix rather
- * than invalidating it. Multi-turn conversations see ~75% input token savings
- * on Anthropic models.
- */
 /**
  * True if the given Anthropic model accepts the `thinking: { type: 'enabled' }`
  * API flag (so-called *extended thinking*). Models using *adaptive thinking*

package/dist/agent/llm.js CHANGED Viewed

@@ -173,20 +173,6 @@ export function extractApiErrorMessage(errorBody) {
     return extracted.replace(/\s+/g, ' ').trim();
 }
 // ─── Anthropic Prompt Caching ─────────────────────────────────────────────
-/**
- * Apply Anthropic prompt caching using the `system_and_3` strategy.
- * Pattern from nousresearch/hermes-agent `agent/prompt_caching.py`.
- *
- * Places 4 cache_control breakpoints (Anthropic's max):
- *   1. System prompt (stable across all turns)
- *   2-4. Last 3 non-system messages (rolling window)
- *
- * Also caches the last tool definition (tools are stable across turns).
- *
- * This keeps the cache warm: each new turn extends the cached prefix rather
- * than invalidating it. Multi-turn conversations see ~75% input token savings
- * on Anthropic models.
- */
 /**
  * True if the given Anthropic model accepts the `thinking: { type: 'enabled' }`
  * API flag (so-called *extended thinking*). Models using *adaptive thinking*
@@ -255,30 +241,59 @@ export function isRoleplayedJsonToolCallText(text) {
         return false;
     }
 }
+/**
+ * Apply Anthropic prompt caching, budgeted to Anthropic's hard limit of 4
+ * `cache_control` breakpoints counted across system + tools + messages COMBINED.
+ * Adapted from the `system_and_3` pattern (nousresearch/hermes-agent
+ * `agent/prompt_caching.py`), with the budget made explicit so the tool
+ * breakpoint can't push the total to 5 (see issue #73).
+ *
+ * Breakpoints are spent in priority order:
+ *   1. System prompt   — 1, if present (stable across all turns)
+ *   2. Last tool def    — 1, if any tools (stable across all turns)
+ *   3. Last N messages  — the remaining budget, capped at a rolling window of 3
+ *
+ * So a session with a system prompt + tools + ≥3 messages spends 1 + 1 + 2 = 4,
+ * not 5. This keeps the cache warm: each new turn extends the cached prefix
+ * rather than invalidating it. Multi-turn conversations see ~75% input token
+ * savings on Anthropic models.
+ */
 function applyAnthropicPromptCaching(payload, request) {
     const out = { ...payload };
     const cacheMarker = { type: 'ephemeral' };
+    // Anthropic allows a MAXIMUM of 4 blocks carrying cache_control, counted
+    // across system + tools + messages COMBINED. Exceeding it is a hard 400:
+    //   "A maximum of 4 blocks with cache_control may be provided. Found 5."
+    // Spend the stable breakpoints (system, last tool) first, then give the
+    // rolling message window only whatever budget is left. See issue #73.
+    const MAX_BREAKPOINTS = 4;
+    let used = 0;
     // 1. System prompt → wrap as array with cache_control on the text block
     if (typeof request.system === 'string' && request.system.length > 0) {
         out['system'] = [
             { type: 'text', text: request.system, cache_control: cacheMarker },
         ];
+        used++;
     }
     // 2. Tools → cache_control on the last tool (stable across turns)
     if (request.tools && request.tools.length > 0) {
         const toolsCopy = request.tools.map(t => ({ ...t }));
         toolsCopy[toolsCopy.length - 1]['cache_control'] = cacheMarker;
         out['tools'] = toolsCopy;
+        used++;
     }
-    // 3. Messages → rolling cache_control on last 3 messages (user/assistant).
+    // 3. Messages → rolling cache_control on the last N messages (user/assistant).
     // System is a separate field in ModelRequest, so all messages here are non-system.
-    // Strategy: mark the last 3 messages so the cached prefix extends as the
+    // Strategy: mark the last messages so the cached prefix extends as the
     // conversation grows. Older cached prefixes expire after 5 min but newer
-    // ones keep the cache warm.
+    // ones keep the cache warm. The window is capped at 3 but never allowed to
+    // push the combined total past MAX_BREAKPOINTS — once system + tools are
+    // spent it shrinks (typically 3→2) so the request stays within Anthropic's limit.
     if (request.messages && request.messages.length > 0) {
         const messagesCopy = request.messages.map(m => ({ ...m }));
-        // Mark last 3 messages (or fewer if history is shorter)
-        const start = Math.max(0, messagesCopy.length - 3);
+        // Mark the last `windowSize` messages (or fewer if history is shorter).
+        const windowSize = Math.min(3, Math.max(0, MAX_BREAKPOINTS - used));
+        const start = Math.max(0, messagesCopy.length - windowSize);
         for (let idx = start; idx < messagesCopy.length; idx++) {
             const msg = messagesCopy[idx];
             if (typeof msg.content === 'string') {
@@ -452,14 +467,18 @@ export class ModelClient {
                 // Extended thinking requires temperature=1 on Anthropic API
                 requestPayload['temperature'] = 1;
             }
-            // ─ Anthropic prompt caching: `system_and_3` strategy ─────────────────
-            // 4 cache_control breakpoints (Anthropic max):
+            // ─ Anthropic prompt caching: budgeted breakpoints ───────────────────
+            // Anthropic permits at most 4 cache_control breakpoints, counted across
+            // system + tools + messages combined. We spend them in priority order:
             //   1. System prompt (stable across turns)
-            //   2-4. Last 3 non-system messages (rolling window)
+            //   2. Last tool definition (stable across turns)
+            //   3+. Rolling window over the last non-system messages — given only
+            //       the remaining budget (so system + tool + window ≤ 4).
             //
             // This keeps the cache warm across turns: each new turn extends the
             // cache instead of invalidating it. ~75% input token savings on
-            // multi-turn conversations. Pattern adopted from nousresearch/hermes-agent.
+            // multi-turn conversations. The budget cap fixes a hard 400 once a
+            // session reached ≥3 messages (system + tool + 3 = 5). See issue #73.
             requestPayload = applyAnthropicPromptCaching(requestPayload, request);
         }
         // ── GPT-5 / Codex: use "developer" role for system prompt ──────────────

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@blockrun/franklin",
-  "version": "3.24.2",
+  "version": "3.24.4",
   "description": "Franklin Agent — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.",
   "type": "module",
   "exports": {