@blockrun/franklin 3.15.93 → 3.15.95

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -46,6 +46,19 @@ export interface StreamChunk {
46
46
  export interface CompletionUsage {
47
47
  inputTokens: number;
48
48
  outputTokens: number;
49
+ /**
50
+ * Anthropic prompt-cache fields. `input_tokens` only counts the base
51
+ * (uncached) portion; the cache-creation and cache-read counts are
52
+ * separate and billed at different rates (1.25× / 0.1× of base input,
53
+ * respectively). Pre-fix, Franklin only read `input_tokens` and
54
+ * silently undercounted every vision / cache-using call's total
55
+ * token spend — verified 2026-05-11 from an Opus 4.7 turn billed
56
+ * $0.567 with audit logging `inputTokens: 3653` (implies ~113K real
57
+ * billed input tokens). Surface all three so audits, stats, and any
58
+ * future estimation paths see the full picture.
59
+ */
60
+ cacheCreationInputTokens?: number;
61
+ cacheReadInputTokens?: number;
49
62
  }
50
63
  export interface LLMClientOptions {
51
64
  apiUrl: string;
package/dist/agent/llm.js CHANGED
@@ -864,6 +864,15 @@ export class ModelClient {
864
864
  const msgUsage = chunk.payload['usage'];
865
865
  if (msgUsage) {
866
866
  usage.outputTokens = msgUsage['output_tokens'] ?? usage.outputTokens;
867
+ // Cache (and tool-call) breakdowns can arrive in message_delta
868
+ // too; only the cache counts are merged here, and only when present,
869
+ // so an absent field never clobbers a value set by message_start.
870
+ if (msgUsage['cache_creation_input_tokens'] !== undefined) {
871
+ usage.cacheCreationInputTokens = msgUsage['cache_creation_input_tokens'];
872
+ }
873
+ if (msgUsage['cache_read_input_tokens'] !== undefined) {
874
+ usage.cacheReadInputTokens = msgUsage['cache_read_input_tokens'];
875
+ }
867
876
  }
868
877
  const delta = chunk.payload['delta'];
869
878
  if (delta?.['stop_reason']) {
@@ -877,6 +886,18 @@ export class ModelClient {
877
886
  if (msgUsage) {
878
887
  usage.inputTokens = msgUsage['input_tokens'] ?? 0;
879
888
  usage.outputTokens = msgUsage['output_tokens'] ?? 0;
889
+ // Vision and prompt-cache calls return up to two extra
890
+ // billed-token counts that input_tokens does NOT include:
891
+ // cache_creation_input_tokens (1.25× base price) and
892
+ // cache_read_input_tokens (0.1× base price). Without these,
893
+ // any audit/stats over a vision-heavy session looks wildly
894
+ // inconsistent with the wallet charge.
895
+ if (msgUsage['cache_creation_input_tokens'] !== undefined) {
896
+ usage.cacheCreationInputTokens = msgUsage['cache_creation_input_tokens'];
897
+ }
898
+ if (msgUsage['cache_read_input_tokens'] !== undefined) {
899
+ usage.cacheReadInputTokens = msgUsage['cache_read_input_tokens'];
900
+ }
880
901
  }
881
902
  break;
882
903
  }
@@ -1011,10 +1011,26 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
1011
1011
  // where input-replay tax has clearly started biting; the
1012
1012
  // fire-once-per-turn flag still bounds the worst case at one
1013
1013
  // extra summary call (~$0.005).
1014
+ //
1015
+ // 2026-05-11: added a high-cost early-exit. The original
1016
+ // (>15 calls AND >$0.03) gate works well for cheap models
1017
+ // where 15 calls clear the $0.03 floor trivially. For Opus-
1018
+ // class models, cost climbs much faster than call count —
1019
+ // verified in production from a real session:
1020
+ // `Research-bloat compacted at 16 calls / $9.4552: ~3129
1021
+ // tokens`. By the time the 16-call gate fired, $9.45 was
1022
+ // already spent on input-replay. With an early-exit at
1023
+ // $1.00 turn-cost, the compact would have fired around
1024
+ // call 4-5, saving ~$8 on that turn. The cost cap is
1025
+ // intentionally conservative — even extended-thinking Opus
1026
+ // shouldn't legitimately need >$1 of context-replay before
1027
+ // compacting (the compact itself runs on a cheaper model
1028
+ // and costs <$0.05).
1029
+ const TURN_COST_CAP_FOR_EARLY_COMPACT = 1.00;
1014
1030
  if (!bloatCompactedThisTurn &&
1015
1031
  compactFailures < 3 &&
1016
- turnToolCalls > 15 &&
1017
- turnCostUsd > 0.03) {
1032
+ ((turnToolCalls > 15 && turnCostUsd > 0.03) ||
1033
+ turnCostUsd > TURN_COST_CAP_FOR_EARLY_COMPACT)) {
1018
1034
  try {
1019
1035
  const beforeTokens = estimateHistoryTokens(history);
1020
1036
  const { history: compacted, compacted: didCompact } = await forceCompact(history, config.model, client, config.debug);
@@ -1640,6 +1656,8 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
1640
1656
  model: resolvedModel,
1641
1657
  inputTokens,
1642
1658
  outputTokens: usage.outputTokens,
1659
+ cacheCreationInputTokens: usage.cacheCreationInputTokens,
1660
+ cacheReadInputTokens: usage.cacheReadInputTokens,
1643
1661
  costUsd: callCost,
1644
1662
  // Any failed model this turn means the model that finally
1645
1663
  // succeeded was a fallback. Without this, audit log read 0%
@@ -14,6 +14,18 @@ export interface AuditEntry {
14
14
  model: string;
15
15
  inputTokens: number;
16
16
  outputTokens: number;
17
+ /**
18
+ * Anthropic prompt-cache fields, captured when the model reports them
19
+ * in `usage.cache_creation_input_tokens` / `usage.cache_read_input_tokens`.
20
+ * `inputTokens` above is the *uncached* portion; the cache fields are
21
+ * additional billed input the gateway charges for separately. Without
22
+ * these, vision and cache-heavy sessions show a wildly inconsistent
23
+ * cost-per-token ratio in audit dashboards — verified 2026-05-11 from
24
+ * an Opus 4.7 call with inputTokens=3653 but costUsd=$0.567 (implies
25
+ * ~113K real billed tokens once cache_creation is counted).
26
+ */
27
+ cacheCreationInputTokens?: number;
28
+ cacheReadInputTokens?: number;
17
29
  costUsd: number;
18
30
  latencyMs?: number;
19
31
  fallback?: boolean;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@blockrun/franklin",
3
- "version": "3.15.93",
3
+ "version": "3.15.95",
4
4
  "description": "Franklin — The AI agent with a wallet. Spends USDC autonomously to get real work done. Pay per action, no subscriptions.",
5
5
  "type": "module",
6
6
  "exports": {