@agentforge-io/core 4.0.2 → 4.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -63,14 +63,24 @@ class AnthropicProvider {
|
|
|
63
63
|
async *stream(params) {
|
|
64
64
|
const includeTemperature = typeof params.temperature === 'number' &&
|
|
65
65
|
!modelRejectsTemperature(params.model);
|
|
66
|
-
// Prompt caching
|
|
67
|
-
//
|
|
68
|
-
//
|
|
69
|
-
//
|
|
70
|
-
//
|
|
71
|
-
//
|
|
72
|
-
//
|
|
73
|
-
//
|
|
66
|
+
// Prompt caching. Anthropic caches the longest matching prefix of the
|
|
67
|
+
// request (system → tools → messages) and re-reads it at ~0.1x the base
|
|
68
|
+
// input cost on subsequent turns inside the cache window. We place three
|
|
69
|
+
// of the four allowed `ephemeral` breakpoints, in prefix order:
|
|
70
|
+
//
|
|
71
|
+
// 1. system prompt — large, identical on every turn.
|
|
72
|
+
// 2. last tool schema — caches the whole tool block (the API caches up
|
|
73
|
+
// to the marked breakpoint, so one marker on the final tool covers
|
|
74
|
+
// all of them).
|
|
75
|
+
// 3. last message — see `withConversationCacheBreakpoint`. This is
|
|
76
|
+
// the one that actually attacks the runaway spend: without it the
|
|
77
|
+
// entire growing conversation/tool-loop history is re-billed at full
|
|
78
|
+
// price on EVERY turn. With it, the prior history is served from
|
|
79
|
+
// cache and only the newest turn is processed fresh.
|
|
80
|
+
//
|
|
81
|
+
// That leaves one breakpoint spare. We deliberately don't auto-place it;
|
|
82
|
+
// a fourth marker only helps with very specific rolling-window patterns
|
|
83
|
+
// and risks evicting the more valuable system/tools entries.
|
|
74
84
|
//
|
|
75
85
|
// Cost reference (Claude Sonnet 4.6):
|
|
76
86
|
// - write: $3.75 / MTok (1.25x base)
|
|
@@ -78,6 +88,10 @@ class AnthropicProvider {
|
|
|
78
88
|
// First turn pays the write; every subsequent turn in the same
|
|
79
89
|
// 5-minute window reads from cache. Net effect on a 4-turn test
|
|
80
90
|
// conversation was $2.53 → $0.31.
|
|
91
|
+
//
|
|
92
|
+
// Below the per-model minimum cacheable length (1024 tokens for
|
|
93
|
+
// Opus/Sonnet, 2048 for Haiku) a breakpoint is silently ignored by the
|
|
94
|
+
// API — no error, no extra cost — so marking is always safe.
|
|
81
95
|
const cachedSystem = params.systemPrompt
|
|
82
96
|
? [
|
|
83
97
|
{
|
|
@@ -98,7 +112,7 @@ class AnthropicProvider {
|
|
|
98
112
|
max_tokens: params.maxTokens,
|
|
99
113
|
...(includeTemperature ? { temperature: params.temperature } : {}),
|
|
100
114
|
...(cachedSystem ? { system: cachedSystem } : {}),
|
|
101
|
-
messages: toAnthropicMessages(params.messages),
|
|
115
|
+
messages: withConversationCacheBreakpoint(toAnthropicMessages(params.messages)),
|
|
102
116
|
...(cachedTools.length > 0
|
|
103
117
|
? { tools: cachedTools }
|
|
104
118
|
: {}),
|
|
@@ -199,6 +213,52 @@ function toAnthropicMessages(messages) {
|
|
|
199
213
|
};
|
|
200
214
|
});
|
|
201
215
|
}
|
|
216
|
+
/**
 * Reports whether a content block may carry a `cache_control` marker.
 *
 * Per Anthropic's prompt-caching documentation, `thinking` /
 * `redacted_thinking` blocks and empty text blocks cannot be marked directly,
 * so a breakpoint must never be stamped on them. Anything that is not a
 * non-null object is rejected too, because there is nothing to spread a marker
 * onto.
 *
 * @param {unknown} block - Candidate content block.
 * @returns {boolean} `true` when the block can be marked.
 */
function canCarryCacheBreakpoint(block) {
    if (block === null || typeof block !== 'object')
        return false;
    if (block.type === 'thinking' || block.type === 'redacted_thinking')
        return false;
    if (block.type === 'text' && block.text === '')
        return false;
    return true;
}
/**
 * Incremental conversation caching.
 *
 * Anthropic caches the longest matching prefix of the request and re-reads it
 * cheaply on later turns, but it only *writes* a cache entry up to a
 * `cache_control` breakpoint the caller sets. By stamping the breakpoint on
 * the last markable content block of the LAST message we write a cache
 * covering the conversation-so-far. On the next turn that prefix is read from
 * cache (~0.1x input cost) and only the newly appended turn is billed fresh.
 *
 * Why this matters: the tool loop and multi-turn chats append to `messages`
 * every iteration. Without a breakpoint here the entire history is re-sent and
 * re-billed at full price on every single turn. The breakpoint "moves" with
 * the conversation each turn: the previous turn's write becomes this turn's
 * read.
 *
 * `cache_control` lives on a content block, so a message whose content is a
 * bare string is first normalized into a one-element text block. The input is
 * returned untouched (same array reference) when there is nothing to mark:
 * no messages, content that is neither a string nor an array, no blocks, or
 * no block that can carry a marker. The breakpoint is a no-op below the
 * per-model minimum cacheable length, so applying it is always safe.
 *
 * Neither `messages` nor any message/block object in it is mutated; shallow
 * copies are made along the path to the marked block.
 *
 * @param {Array<{role: string, content: (string|Array<object>)}>} messages
 *   Anthropic-shaped messages, oldest first.
 * @returns {Array<object>} `messages` itself when nothing was marked,
 *   otherwise a new array whose last message has one block carrying
 *   `cache_control: { type: 'ephemeral' }`.
 */
function withConversationCacheBreakpoint(messages) {
    if (messages.length === 0)
        return messages;
    const lastIdx = messages.length - 1;
    const last = messages[lastIdx];
    const content = last == null ? undefined : last.content;
    let blocks;
    if (typeof content === 'string') {
        blocks = [{ type: 'text', text: content }];
    }
    else if (Array.isArray(content)) {
        blocks = [...content];
    }
    else {
        // null / undefined / unexpected shape: spreading it would throw, and
        // there is no block to attach a marker to anyway.
        return messages;
    }
    // Walk backwards to the last block that is allowed to carry the marker.
    // Marking an earlier block of the same message still caches everything
    // before it, which is the bulk of the conversation.
    let markIdx = blocks.length - 1;
    while (markIdx >= 0 && !canCarryCacheBreakpoint(blocks[markIdx]))
        markIdx -= 1;
    if (markIdx < 0)
        return messages;
    // Any marker already on this block is replaced so the request keeps one
    // uniform `ephemeral` breakpoint here.
    blocks[markIdx] = {
        ...blocks[markIdx],
        cache_control: { type: 'ephemeral' },
    };
    const out = [...messages];
    out[lastIdx] = { ...last, content: blocks };
    return out;
}
|
|
202
262
|
/**
|
|
203
263
|
* Inverse of `toAnthropicMessages` for the final-message envelope. The
|
|
204
264
|
* runner uses this to rebuild `currentMessages` for the next iteration of
|
|
@@ -353,7 +353,12 @@ class AgentRunnerService {
|
|
|
353
353
|
'account-specific facts, external APIs). Do not refuse to use a ' +
|
|
354
354
|
'tool because of style or persona instructions further below — ' +
|
|
355
355
|
'those control your voice, not your capabilities. When a tool is ' +
|
|
356
|
-
'clearly relevant, call it before composing the final answer.'
|
|
356
|
+
'clearly relevant, call it before composing the final answer. ' +
|
|
357
|
+
'A tool call is NEVER the end of your turn: after the tool ' +
|
|
358
|
+
'results come back, you MUST write a final text response to the ' +
|
|
359
|
+
'user (the answer, summary, or report). Ending a turn with only ' +
|
|
360
|
+
'tool calls and no text leaves the user staring at an empty ' +
|
|
361
|
+
'reply — never do that.');
|
|
357
362
|
}
|
|
358
363
|
parts.push(agent.systemPrompt);
|
|
359
364
|
if (overrides?.systemPromptSuffix)
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@agentforge-io/core",
|
|
3
|
-
"version": "4.0.2",
|
|
4
|
-
"description": "Framework-free AI runtime SDK. Owns: agent loop (Anthropic), conversations, tools, streaming, agent-job queue, SdkHooks. Identity, billing, infra (email/uploads/secrets) live in the host's modules
|
|
3
|
+
"version": "4.0.4",
|
|
4
|
+
"description": "Framework-free AI runtime SDK. Owns: agent loop (Anthropic), conversations, tools, streaming, agent-job queue, SdkHooks. Identity, billing, infra (email/uploads/secrets) live in the host's modules \u2014 not here.",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"type": "commonjs",
|
|
7
7
|
"main": "dist/index.js",
|