@agentforge-io/core 4.0.2 → 4.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -63,14 +63,24 @@ class AnthropicProvider {
63
63
  async *stream(params) {
64
64
  const includeTemperature = typeof params.temperature === 'number' &&
65
65
  !modelRejectsTemperature(params.model);
66
- // Prompt caching: mark the system prompt as `ephemeral` so Anthropic
67
- // reuses it across turns. Conversations longer than one turn pay ~10%
68
- // of the input-token cost on turns 2+, which is the dominant spend
69
- // for our system assistants (each turn re-pays the same ~700k-token
70
- // system + tools otherwise). Same `ephemeral` marker on the LAST
71
- // tool so tool schemas also cache — combined with the system
72
- // breakpoint that uses 2 of the 4 allowed cache slots, leaving 2
73
- // for the model to mark hot message ranges itself.
66
+ // Prompt caching. Anthropic caches the longest matching prefix of the
67
+ // request (tools, then system, then messages) and re-reads it at ~0.1x the base
68
+ // input cost on subsequent turns inside the cache window. We place three
69
+ // of the four allowed `ephemeral` breakpoints, listed by what they cover:
70
+ //
71
+ // 1. system prompt — large, identical on every turn.
72
+ // 2. last tool schema — caches the whole tool block (the API caches up
73
+ // to the marked breakpoint, so one marker on the final tool covers
74
+ // all of them).
75
+ // 3. last message — see `withConversationCacheBreakpoint`. This is
76
+ // the one that actually attacks the runaway spend: without it the
77
+ // entire growing conversation/tool-loop history is re-billed at full
78
+ // price on EVERY turn. With it, the prior history is served from
79
+ // cache and only the newest turn is processed fresh.
80
+ //
81
+ // That leaves one breakpoint spare. We deliberately don't auto-place it;
82
+ // a fourth marker only helps with very specific rolling-window patterns
83
+ // and risks evicting the more valuable system/tools entries.
74
84
  //
75
85
  // Cost reference (Claude Sonnet 4.6):
76
86
  // - write: $3.75 / MTok (1.25x base)
@@ -78,6 +88,10 @@ class AnthropicProvider {
78
88
  // First turn pays the write; every subsequent turn in the same
79
89
  // 5-minute window reads from cache. Net effect on a 4-turn test
80
90
  // conversation was $2.53 → $0.31.
91
+ //
92
+ // Below the per-model minimum cacheable length (1024 tokens for
93
+ // Opus/Sonnet, 2048 for Haiku) a breakpoint is silently ignored by the
94
+ // API — no error, no extra cost — so marking is always safe.
81
95
  const cachedSystem = params.systemPrompt
82
96
  ? [
83
97
  {
@@ -98,7 +112,7 @@ class AnthropicProvider {
98
112
  max_tokens: params.maxTokens,
99
113
  ...(includeTemperature ? { temperature: params.temperature } : {}),
100
114
  ...(cachedSystem ? { system: cachedSystem } : {}),
101
- messages: toAnthropicMessages(params.messages),
115
+ messages: withConversationCacheBreakpoint(toAnthropicMessages(params.messages)),
102
116
  ...(cachedTools.length > 0
103
117
  ? { tools: cachedTools }
104
118
  : {}),
@@ -199,6 +213,52 @@ function toAnthropicMessages(messages) {
199
213
  };
200
214
  });
201
215
  }
216
+ /**
217
+ * Incremental conversation caching.
218
+ *
219
+ * Anthropic caches the longest matching prefix of the request and re-reads it
220
+ * cheaply on later turns, but it only *writes* a cache entry up to a
221
+ * `cache_control` breakpoint the caller sets. By stamping the breakpoint on
222
+ * the LAST content block of the LAST message we write a cache covering the
223
+ * whole conversation-so-far. On the next turn that prefix is read from cache
224
+ * (~0.1x input cost) and only the newly appended turn is billed fresh.
225
+ *
226
+ * Why this matters: the tool loop and multi-turn chats append to `messages`
227
+ * every iteration. Without a breakpoint here the entire history is re-sent and
228
+ * re-billed at full price on every single turn — the dominant source of the
229
+ * runaway token spend. The breakpoint "moves" with the conversation each turn,
230
+ * which is exactly the incremental pattern Anthropic recommends: the previous
231
+ * turn's write becomes this turn's read.
232
+ *
233
+ * `cache_control` lives on a content block, so a message whose content is a
234
+ * bare string is first normalized into a one-element text block. A message
235
+ * with no blocks at all is left untouched (nothing to mark). The breakpoint is
236
+ * a no-op below the per-model minimum cacheable length, so this is always safe
237
+ * to apply.
+ *
+ * @param {Array<{content: string | Array<object>}>} messages Request messages;
+ *   the only call visible here passes the result of `toAnthropicMessages`.
+ *   The last message's `content` must be a string or an iterable of blocks —
+ *   any other value (e.g. `undefined`) makes the spread below throw.
+ * @returns {Array<object>} The same `messages` reference when the list is
+ *   empty or the last message has zero blocks. Otherwise a shallow copy whose
+ *   last entry is a new message object with a new `content` array, the final
+ *   block of which carries `cache_control: { type: 'ephemeral' }` (replacing
+ *   any `cache_control` already present on that block). The input array, the
+ *   last message, and its blocks are not mutated.
238
+ */
239
+ function withConversationCacheBreakpoint(messages) {
240
+ if (messages.length === 0)
241
+ return messages;
242
+ const lastIdx = messages.length - 1;
243
+ const last = messages[lastIdx];
244
+ const blocks = typeof last.content === 'string'
245
+ ? [{ type: 'text', text: last.content }]
246
+ : [...last.content];
247
+ if (blocks.length === 0)
248
+ return messages;
249
+ const lastBlockIdx = blocks.length - 1;
250
+ // `cache_control` is accepted on the wire (prompt caching is GA) but this
251
+ // SDK version only models it on the `beta` block types, not the stable
252
+ // union — hence the `unknown` hop. Same reason the tool breakpoints above
253
+ // are cast to `Anthropic.Tool[]`. NOTE(review): no cast appears in this compiled output — presumably it exists only in the TypeScript source; confirm.
254
+ blocks[lastBlockIdx] = {
255
+ ...blocks[lastBlockIdx],
256
+ cache_control: { type: 'ephemeral' },
257
+ };
258
+ const out = [...messages];
259
+ out[lastIdx] = { ...last, content: blocks };
260
+ return out;
261
+ }
202
262
  /**
203
263
  * Inverse of `toAnthropicMessages` for the final-message envelope. The
204
264
  * runner uses this to rebuild `currentMessages` for the next iteration of
@@ -353,7 +353,12 @@ class AgentRunnerService {
353
353
  'account-specific facts, external APIs). Do not refuse to use a ' +
354
354
  'tool because of style or persona instructions further below — ' +
355
355
  'those control your voice, not your capabilities. When a tool is ' +
356
- 'clearly relevant, call it before composing the final answer.');
356
+ 'clearly relevant, call it before composing the final answer. ' +
357
+ 'A tool call is NEVER the end of your turn: after the tool ' +
358
+ 'results come back, you MUST write a final text response to the ' +
359
+ 'user (the answer, summary, or report). Ending a turn with only ' +
360
+ 'tool calls and no text leaves the user staring at an empty ' +
361
+ 'reply — never do that.');
357
362
  }
358
363
  parts.push(agent.systemPrompt);
359
364
  if (overrides?.systemPromptSuffix)
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "@agentforge-io/core",
3
- "version": "4.0.2",
4
- "description": "Framework-free AI runtime SDK. Owns: agent loop (Anthropic), conversations, tools, streaming, agent-job queue, SdkHooks. Identity, billing, infra (email/uploads/secrets) live in the host's modules not here.",
3
+ "version": "4.0.4",
4
+ "description": "Framework-free AI runtime SDK. Owns: agent loop (Anthropic), conversations, tools, streaming, agent-job queue, SdkHooks. Identity, billing, infra (email/uploads/secrets) live in the host's modules \u2014 not here.",
5
5
  "license": "MIT",
6
6
  "type": "commonjs",
7
7
  "main": "dist/index.js",