@blockrun/franklin 3.24.2 → 3.24.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/llm.d.ts +0 -14
- package/dist/agent/llm.js +42 -23
- package/package.json +1 -1
package/dist/agent/llm.d.ts
CHANGED
|
@@ -89,20 +89,6 @@ export declare function sanitizeTableUnicode(s: string): string;
|
|
|
89
89
|
* Peel those layers so the UI doesn't show raw nested JSON.
|
|
90
90
|
*/
|
|
91
91
|
export declare function extractApiErrorMessage(errorBody: string): string;
|
|
92
|
-
/**
|
|
93
|
-
* Apply Anthropic prompt caching using the `system_and_3` strategy.
|
|
94
|
-
* Pattern from nousresearch/hermes-agent `agent/prompt_caching.py`.
|
|
95
|
-
*
|
|
96
|
-
* Places 4 cache_control breakpoints (Anthropic's max):
|
|
97
|
-
* 1. System prompt (stable across all turns)
|
|
98
|
-
* 2-4. Last 3 non-system messages (rolling window)
|
|
99
|
-
*
|
|
100
|
-
* Also caches the last tool definition (tools are stable across turns).
|
|
101
|
-
*
|
|
102
|
-
* This keeps the cache warm: each new turn extends the cached prefix rather
|
|
103
|
-
* than invalidating it. Multi-turn conversations see ~75% input token savings
|
|
104
|
-
* on Anthropic models.
|
|
105
|
-
*/
|
|
106
92
|
/**
|
|
107
93
|
* True if the given Anthropic model accepts the `thinking: { type: 'enabled' }`
|
|
108
94
|
* API flag (so-called *extended thinking*). Models using *adaptive thinking*
|
package/dist/agent/llm.js
CHANGED
|
@@ -173,20 +173,6 @@ export function extractApiErrorMessage(errorBody) {
|
|
|
173
173
|
return extracted.replace(/\s+/g, ' ').trim();
|
|
174
174
|
}
|
|
175
175
|
// ─── Anthropic Prompt Caching ─────────────────────────────────────────────
|
|
176
|
-
/**
|
|
177
|
-
* Apply Anthropic prompt caching using the `system_and_3` strategy.
|
|
178
|
-
* Pattern from nousresearch/hermes-agent `agent/prompt_caching.py`.
|
|
179
|
-
*
|
|
180
|
-
* Places 4 cache_control breakpoints (Anthropic's max):
|
|
181
|
-
* 1. System prompt (stable across all turns)
|
|
182
|
-
* 2-4. Last 3 non-system messages (rolling window)
|
|
183
|
-
*
|
|
184
|
-
* Also caches the last tool definition (tools are stable across turns).
|
|
185
|
-
*
|
|
186
|
-
* This keeps the cache warm: each new turn extends the cached prefix rather
|
|
187
|
-
* than invalidating it. Multi-turn conversations see ~75% input token savings
|
|
188
|
-
* on Anthropic models.
|
|
189
|
-
*/
|
|
190
176
|
/**
|
|
191
177
|
* True if the given Anthropic model accepts the `thinking: { type: 'enabled' }`
|
|
192
178
|
* API flag (so-called *extended thinking*). Models using *adaptive thinking*
|
|
@@ -255,30 +241,59 @@ export function isRoleplayedJsonToolCallText(text) {
|
|
|
255
241
|
return false;
|
|
256
242
|
}
|
|
257
243
|
}
|
|
244
|
+
/**
|
|
245
|
+
* Apply Anthropic prompt caching, budgeted to Anthropic's hard limit of 4
|
|
246
|
+
* `cache_control` breakpoints counted across system + tools + messages COMBINED.
|
|
247
|
+
* Adapted from the `system_and_3` pattern (nousresearch/hermes-agent
|
|
248
|
+
* `agent/prompt_caching.py`), with the budget made explicit so the tool
|
|
249
|
+
* breakpoint can't push the total to 5 (see issue #73).
|
|
250
|
+
*
|
|
251
|
+
* Breakpoints are spent in priority order:
|
|
252
|
+
* 1. System prompt — 1, if present (stable across all turns)
|
|
253
|
+
* 2. Last tool def — 1, if any tools (stable across all turns)
|
|
254
|
+
* 3. Last N messages — the remaining budget, capped at a rolling window of 3
|
|
255
|
+
*
|
|
256
|
+
* So a session with a system prompt + tools + ≥3 messages spends 1 + 1 + 2 = 4,
|
|
257
|
+
* not 5. This keeps the cache warm: each new turn extends the cached prefix
|
|
258
|
+
* rather than invalidating it. Multi-turn conversations see ~75% input token
|
|
259
|
+
* savings on Anthropic models.
|
|
260
|
+
*/
|
|
258
261
|
function applyAnthropicPromptCaching(payload, request) {
|
|
259
262
|
const out = { ...payload };
|
|
260
263
|
const cacheMarker = { type: 'ephemeral' };
|
|
264
|
+
// Anthropic allows a MAXIMUM of 4 blocks carrying cache_control, counted
|
|
265
|
+
// across system + tools + messages COMBINED. Exceeding it is a hard 400:
|
|
266
|
+
// "A maximum of 4 blocks with cache_control may be provided. Found 5."
|
|
267
|
+
// Spend the stable breakpoints (system, last tool) first, then give the
|
|
268
|
+
// rolling message window only whatever budget is left. See issue #73.
|
|
269
|
+
const MAX_BREAKPOINTS = 4;
|
|
270
|
+
let used = 0;
|
|
261
271
|
// 1. System prompt → wrap as array with cache_control on the text block
|
|
262
272
|
if (typeof request.system === 'string' && request.system.length > 0) {
|
|
263
273
|
out['system'] = [
|
|
264
274
|
{ type: 'text', text: request.system, cache_control: cacheMarker },
|
|
265
275
|
];
|
|
276
|
+
used++;
|
|
266
277
|
}
|
|
267
278
|
// 2. Tools → cache_control on the last tool (stable across turns)
|
|
268
279
|
if (request.tools && request.tools.length > 0) {
|
|
269
280
|
const toolsCopy = request.tools.map(t => ({ ...t }));
|
|
270
281
|
toolsCopy[toolsCopy.length - 1]['cache_control'] = cacheMarker;
|
|
271
282
|
out['tools'] = toolsCopy;
|
|
283
|
+
used++;
|
|
272
284
|
}
|
|
273
|
-
// 3. Messages → rolling cache_control on last 3 messages (user/assistant).
|
|
285
|
+
// 3. Messages → rolling cache_control on the last N messages (user/assistant).
|
|
274
286
|
// System is a separate field in ModelRequest, so all messages here are non-system.
|
|
275
|
-
// Strategy: mark the last 3 messages so the cached prefix extends as the
|
|
287
|
+
// Strategy: mark the last messages so the cached prefix extends as the
|
|
276
288
|
// conversation grows. Older cached prefixes expire after 5 min but newer
|
|
277
|
-
// ones keep the cache warm.
|
|
289
|
+
// ones keep the cache warm. The window is capped at 3 but never allowed to
|
|
290
|
+
// push the combined total past MAX_BREAKPOINTS — once system + tools are
|
|
291
|
+
// spent it shrinks (typically 3→2) so the request stays within Anthropic's limit.
|
|
278
292
|
if (request.messages && request.messages.length > 0) {
|
|
279
293
|
const messagesCopy = request.messages.map(m => ({ ...m }));
|
|
280
|
-
// Mark last 3 messages (or fewer if history is shorter).
|
|
281
|
-
const
|
|
294
|
+
// Mark the last `windowSize` messages (or fewer if history is shorter).
|
|
295
|
+
const windowSize = Math.min(3, Math.max(0, MAX_BREAKPOINTS - used));
|
|
296
|
+
const start = Math.max(0, messagesCopy.length - windowSize);
|
|
282
297
|
for (let idx = start; idx < messagesCopy.length; idx++) {
|
|
283
298
|
const msg = messagesCopy[idx];
|
|
284
299
|
if (typeof msg.content === 'string') {
|
|
@@ -452,14 +467,18 @@ export class ModelClient {
|
|
|
452
467
|
// Extended thinking requires temperature=1 on Anthropic API
|
|
453
468
|
requestPayload['temperature'] = 1;
|
|
454
469
|
}
|
|
455
|
-
// ─ Anthropic prompt caching: system_and_3 strategy ───────────────────
|
|
456
|
-
// 4 cache_control breakpoints (Anthropic's max):
|
|
470
|
+
// ─ Anthropic prompt caching: budgeted breakpoints ───────────────────
|
|
471
|
+
// Anthropic permits at most 4 cache_control breakpoints, counted across
|
|
472
|
+
// system + tools + messages combined. We spend them in priority order:
|
|
457
473
|
// 1. System prompt (stable across turns)
|
|
458
|
-
// 2-4. Last 3 non-system messages (rolling window)
|
|
474
|
+
// 2. Last tool definition (stable across turns)
|
|
475
|
+
// 3+. Rolling window over the last non-system messages — given only
|
|
476
|
+
// the remaining budget (so system + tool + window ≤ 4).
|
|
459
477
|
//
|
|
460
478
|
// This keeps the cache warm across turns: each new turn extends the
|
|
461
479
|
// cache instead of invalidating it. ~75% input token savings on
|
|
462
|
-
// multi-turn conversations.
|
|
480
|
+
// multi-turn conversations. The budget cap fixes a hard 400 once a
|
|
481
|
+
// session reached ≥3 messages (system + tool + 3 = 5). See issue #73.
|
|
463
482
|
requestPayload = applyAnthropicPromptCaching(requestPayload, request);
|
|
464
483
|
}
|
|
465
484
|
// ── GPT-5 / Codex: use "developer" role for system prompt ──────────────
|
package/package.json
CHANGED