@blockrun/franklin 3.24.2 → 3.24.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -89,20 +89,6 @@ export declare function sanitizeTableUnicode(s: string): string;
89
89
  * Peel those layers so the UI doesn't show raw nested JSON.
90
90
  */
91
91
  export declare function extractApiErrorMessage(errorBody: string): string;
92
- /**
93
- * Apply Anthropic prompt caching using the `system_and_3` strategy.
94
- * Pattern from nousresearch/hermes-agent `agent/prompt_caching.py`.
95
- *
96
- * Places 4 cache_control breakpoints (Anthropic's max):
97
- * 1. System prompt (stable across all turns)
98
- * 2-4. Last 3 non-system messages (rolling window)
99
- *
100
- * Also caches the last tool definition (tools are stable across turns).
101
- *
102
- * This keeps the cache warm: each new turn extends the cached prefix rather
103
- * than invalidating it. Multi-turn conversations see ~75% input token savings
104
- * on Anthropic models.
105
- */
106
92
  /**
107
93
  * True if the given Anthropic model accepts the `thinking: { type: 'enabled' }`
108
94
  * API flag (so-called *extended thinking*). Models using *adaptive thinking*
package/dist/agent/llm.js CHANGED
@@ -173,20 +173,6 @@ export function extractApiErrorMessage(errorBody) {
173
173
  return extracted.replace(/\s+/g, ' ').trim();
174
174
  }
175
175
  // ─── Anthropic Prompt Caching ─────────────────────────────────────────────
176
- /**
177
- * Apply Anthropic prompt caching using the `system_and_3` strategy.
178
- * Pattern from nousresearch/hermes-agent `agent/prompt_caching.py`.
179
- *
180
- * Places 4 cache_control breakpoints (Anthropic's max):
181
- * 1. System prompt (stable across all turns)
182
- * 2-4. Last 3 non-system messages (rolling window)
183
- *
184
- * Also caches the last tool definition (tools are stable across turns).
185
- *
186
- * This keeps the cache warm: each new turn extends the cached prefix rather
187
- * than invalidating it. Multi-turn conversations see ~75% input token savings
188
- * on Anthropic models.
189
- */
190
176
  /**
191
177
  * True if the given Anthropic model accepts the `thinking: { type: 'enabled' }`
192
178
  * API flag (so-called *extended thinking*). Models using *adaptive thinking*
@@ -255,30 +241,59 @@ export function isRoleplayedJsonToolCallText(text) {
255
241
  return false;
256
242
  }
257
243
  }
244
+ /**
245
+ * Apply Anthropic prompt caching, budgeted to Anthropic's hard limit of 4
246
+ * `cache_control` breakpoints counted across system + tools + messages COMBINED.
247
+ * Adapted from the `system_and_3` pattern (nousresearch/hermes-agent
248
+ * `agent/prompt_caching.py`), with the budget made explicit so the tool
249
+ * breakpoint can't push the total to 5 (see issue #73).
250
+ *
251
+ * Breakpoints are spent in priority order:
252
+ * 1. System prompt — 1, if present (stable across all turns)
253
+ * 2. Last tool def — 1, if any tools (stable across all turns)
254
+ * 3. Last N messages — the remaining budget, capped at a rolling window of 3
255
+ *
256
+ * So a session with a system prompt + tools + ≥3 messages spends 1 + 1 + 2 = 4,
257
+ * not 5. This keeps the cache warm: each new turn extends the cached prefix
258
+ * rather than invalidating it. Multi-turn conversations see ~75% input token
259
+ * savings on Anthropic models.
260
+ */
258
261
  function applyAnthropicPromptCaching(payload, request) {
259
262
  const out = { ...payload };
260
263
  const cacheMarker = { type: 'ephemeral' };
264
+ // Anthropic allows a MAXIMUM of 4 blocks carrying cache_control, counted
265
+ // across system + tools + messages COMBINED. Exceeding it is a hard 400:
266
+ // "A maximum of 4 blocks with cache_control may be provided. Found 5."
267
+ // Spend the stable breakpoints (system, last tool) first, then give the
268
+ // rolling message window only whatever budget is left. See issue #73.
269
+ const MAX_BREAKPOINTS = 4;
270
+ let used = 0;
261
271
  // 1. System prompt → wrap as array with cache_control on the text block
262
272
  if (typeof request.system === 'string' && request.system.length > 0) {
263
273
  out['system'] = [
264
274
  { type: 'text', text: request.system, cache_control: cacheMarker },
265
275
  ];
276
+ used++;
266
277
  }
267
278
  // 2. Tools → cache_control on the last tool (stable across turns)
268
279
  if (request.tools && request.tools.length > 0) {
269
280
  const toolsCopy = request.tools.map(t => ({ ...t }));
270
281
  toolsCopy[toolsCopy.length - 1]['cache_control'] = cacheMarker;
271
282
  out['tools'] = toolsCopy;
283
+ used++;
272
284
  }
273
- // 3. Messages → rolling cache_control on last 3 messages (user/assistant).
285
+ // 3. Messages → rolling cache_control on the last N messages (user/assistant).
274
286
  // System is a separate field in ModelRequest, so all messages here are non-system.
275
- // Strategy: mark the last 3 messages so the cached prefix extends as the
287
+ // Strategy: mark the last messages so the cached prefix extends as the
276
288
  // conversation grows. Older cached prefixes expire after 5 min but newer
277
- // ones keep the cache warm.
289
+ // ones keep the cache warm. The window is capped at 3 but never allowed to
290
+ // push the combined total past MAX_BREAKPOINTS — once system + tools are
291
+ // spent it shrinks (typically 3→2) so the request stays within Anthropic's limit.
278
292
  if (request.messages && request.messages.length > 0) {
279
293
  const messagesCopy = request.messages.map(m => ({ ...m }));
280
- // Mark last 3 messages (or fewer if history is shorter)
281
- const start = Math.max(0, messagesCopy.length - 3);
294
+ // Mark the last `windowSize` messages (or fewer if history is shorter).
295
+ const windowSize = Math.min(3, Math.max(0, MAX_BREAKPOINTS - used));
296
+ const start = Math.max(0, messagesCopy.length - windowSize);
282
297
  for (let idx = start; idx < messagesCopy.length; idx++) {
283
298
  const msg = messagesCopy[idx];
284
299
  if (typeof msg.content === 'string') {
@@ -452,14 +467,18 @@ export class ModelClient {
452
467
  // Extended thinking requires temperature=1 on Anthropic API
453
468
  requestPayload['temperature'] = 1;
454
469
  }
455
- // ─ Anthropic prompt caching: `system_and_3` strategy ─────────────────
456
- // 4 cache_control breakpoints (Anthropic max):
470
+ // ─ Anthropic prompt caching: budgeted breakpoints ───────────────────
471
+ // Anthropic permits at most 4 cache_control breakpoints, counted across
472
+ // system + tools + messages combined. We spend them in priority order:
457
473
  // 1. System prompt (stable across turns)
458
- // 2-4. Last 3 non-system messages (rolling window)
474
+ // 2. Last tool definition (stable across turns)
475
+ // 3+. Rolling window over the last non-system messages — given only
476
+ // the remaining budget (so system + tool + window ≤ 4).
459
477
  //
460
478
  // This keeps the cache warm across turns: each new turn extends the
461
479
  // cache instead of invalidating it. ~75% input token savings on
462
- // multi-turn conversations. Pattern adopted from nousresearch/hermes-agent.
480
+ // multi-turn conversations. The budget cap fixes a hard 400 once a
481
+ // session reached ≥3 messages (system + tool + 3 = 5). See issue #73.
463
482
  requestPayload = applyAnthropicPromptCaching(requestPayload, request);
464
483
  }
465
484
  // ── GPT-5 / Codex: use "developer" role for system prompt ──────────────
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blockrun/franklin",
3
- "version": "3.24.2",
3
+ "version": "3.24.4",
4
4
  "description": "Franklin Agent — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.",
5
5
  "type": "module",
6
6
  "exports": {