@blockrun/franklin 3.15.93 → 3.15.95
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent/llm.d.ts +13 -0
- package/dist/agent/llm.js +21 -0
- package/dist/agent/loop.js +20 -2
- package/dist/stats/audit.d.ts +12 -0
- package/package.json +1 -1
package/dist/agent/llm.d.ts
CHANGED
|
@@ -46,6 +46,19 @@ export interface StreamChunk {
|
|
|
46
46
|
export interface CompletionUsage {
|
|
47
47
|
inputTokens: number;
|
|
48
48
|
outputTokens: number;
|
|
49
|
+
/**
|
|
50
|
+
* Anthropic prompt-cache fields. `input_tokens` only counts the base
|
|
51
|
+
* (uncached) portion; the cache-creation and cache-read counts are
|
|
52
|
+
* separate and billed at different rates (1.25× / 0.1× of base input,
|
|
53
|
+
* respectively). Pre-fix, Franklin only read `input_tokens` and
|
|
54
|
+
* silently undercounted every vision / cache-using call's total
|
|
55
|
+
* token spend — verified 2026-05-11 from an Opus 4.7 turn billed
|
|
56
|
+
* $0.567 with audit logging `inputTokens: 3653` (implies ~113K real
|
|
57
|
+
* billed input tokens). Surface all three so audits, stats, and any
|
|
58
|
+
* future estimation paths see the full picture.
|
|
59
|
+
*/
|
|
60
|
+
cacheCreationInputTokens?: number;
|
|
61
|
+
cacheReadInputTokens?: number;
|
|
49
62
|
}
|
|
50
63
|
export interface LLMClientOptions {
|
|
51
64
|
apiUrl: string;
|
package/dist/agent/llm.js
CHANGED
|
@@ -864,6 +864,15 @@ export class ModelClient {
|
|
|
864
864
|
const msgUsage = chunk.payload['usage'];
|
|
865
865
|
if (msgUsage) {
|
|
866
866
|
usage.outputTokens = msgUsage['output_tokens'] ?? usage.outputTokens;
|
|
867
|
+
// Cache and tool-call breakdowns can arrive in message_delta
|
|
868
|
+
// too; merge whatever's present without clobbering values set
|
|
869
|
+
// by message_start.
|
|
870
|
+
if (msgUsage['cache_creation_input_tokens'] !== undefined) {
|
|
871
|
+
usage.cacheCreationInputTokens = msgUsage['cache_creation_input_tokens'];
|
|
872
|
+
}
|
|
873
|
+
if (msgUsage['cache_read_input_tokens'] !== undefined) {
|
|
874
|
+
usage.cacheReadInputTokens = msgUsage['cache_read_input_tokens'];
|
|
875
|
+
}
|
|
867
876
|
}
|
|
868
877
|
const delta = chunk.payload['delta'];
|
|
869
878
|
if (delta?.['stop_reason']) {
|
|
@@ -877,6 +886,18 @@ export class ModelClient {
|
|
|
877
886
|
if (msgUsage) {
|
|
878
887
|
usage.inputTokens = msgUsage['input_tokens'] ?? 0;
|
|
879
888
|
usage.outputTokens = msgUsage['output_tokens'] ?? 0;
|
|
889
|
+
// Vision and prompt-cache calls return up to two extra
|
|
890
|
+
// billed-tokens counts that input_tokens does NOT include:
|
|
891
|
+
// cache_creation_input_tokens (1.25× base price) and
|
|
892
|
+
// cache_read_input_tokens (0.1× base price). Without these,
|
|
893
|
+
// any audit/stats over a vision-heavy session looks wildly
|
|
894
|
+
// inconsistent with the wallet charge.
|
|
895
|
+
if (msgUsage['cache_creation_input_tokens'] !== undefined) {
|
|
896
|
+
usage.cacheCreationInputTokens = msgUsage['cache_creation_input_tokens'];
|
|
897
|
+
}
|
|
898
|
+
if (msgUsage['cache_read_input_tokens'] !== undefined) {
|
|
899
|
+
usage.cacheReadInputTokens = msgUsage['cache_read_input_tokens'];
|
|
900
|
+
}
|
|
880
901
|
}
|
|
881
902
|
break;
|
|
882
903
|
}
|
package/dist/agent/loop.js
CHANGED
|
@@ -1011,10 +1011,26 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
|
|
|
1011
1011
|
// where input-replay tax has clearly started biting; the
|
|
1012
1012
|
// fire-once-per-turn flag still bounds the worst case at one
|
|
1013
1013
|
// extra summary call (~$0.005).
|
|
1014
|
+
//
|
|
1015
|
+
// 2026-05-11: added a high-cost early-exit. The original
|
|
1016
|
+
// (>15 calls AND >$0.03) gate works well for cheap models
|
|
1017
|
+
// where 15 calls clears the $0.03 floor trivially. For Opus-
|
|
1018
|
+
// class models, cost climbs much faster than call count —
|
|
1019
|
+
// verified in production from a real session:
|
|
1020
|
+
// `Research-bloat compacted at 16 calls / $9.4552: ~3129
|
|
1021
|
+
// tokens`. By the time the 16-call gate fired, $9.45 was
|
|
1022
|
+
// already spent on input-replay. With an early-exit at
|
|
1023
|
+
// $1.00 turn-cost, the compact would have fired around
|
|
1024
|
+
// call 4-5, saving ~$8 on that turn. The cost cap is
|
|
1025
|
+
// intentionally conservative — even extended-thinking Opus
|
|
1026
|
+
// shouldn't legitimately need >$1 of context-replay before
|
|
1027
|
+
// compacting (the compact itself runs on a cheaper model
|
|
1028
|
+
// and costs <$0.05).
|
|
1029
|
+
const TURN_COST_CAP_FOR_EARLY_COMPACT = 1.00;
|
|
1014
1030
|
if (!bloatCompactedThisTurn &&
|
|
1015
1031
|
compactFailures < 3 &&
|
|
1016
|
-
turnToolCalls > 15 &&
|
|
1017
|
-
turnCostUsd > 0.03) {
|
|
1032
|
+
((turnToolCalls > 15 && turnCostUsd > 0.03) ||
|
|
1033
|
+
turnCostUsd > TURN_COST_CAP_FOR_EARLY_COMPACT)) {
|
|
1018
1034
|
try {
|
|
1019
1035
|
const beforeTokens = estimateHistoryTokens(history);
|
|
1020
1036
|
const { history: compacted, compacted: didCompact } = await forceCompact(history, config.model, client, config.debug);
|
|
@@ -1640,6 +1656,8 @@ export async function interactiveSession(config, getUserInput, onEvent, onAbortR
|
|
|
1640
1656
|
model: resolvedModel,
|
|
1641
1657
|
inputTokens,
|
|
1642
1658
|
outputTokens: usage.outputTokens,
|
|
1659
|
+
cacheCreationInputTokens: usage.cacheCreationInputTokens,
|
|
1660
|
+
cacheReadInputTokens: usage.cacheReadInputTokens,
|
|
1643
1661
|
costUsd: callCost,
|
|
1644
1662
|
// Any failed model this turn means the model that finally
|
|
1645
1663
|
// succeeded was a fallback. Without this, audit log read 0%
|
package/dist/stats/audit.d.ts
CHANGED
|
@@ -14,6 +14,18 @@ export interface AuditEntry {
|
|
|
14
14
|
model: string;
|
|
15
15
|
inputTokens: number;
|
|
16
16
|
outputTokens: number;
|
|
17
|
+
/**
|
|
18
|
+
* Anthropic prompt-cache fields, captured when the model reports them
|
|
19
|
+
* in `usage.cache_creation_input_tokens` / `usage.cache_read_input_tokens`.
|
|
20
|
+
* `inputTokens` above is the *uncached* portion; the cache fields are
|
|
21
|
+
* additional billed input the gateway charges for separately. Without
|
|
22
|
+
* these, vision and cache-heavy sessions show a wildly inconsistent
|
|
23
|
+
* cost-per-token ratio in audit dashboards — verified 2026-05-11 from
|
|
24
|
+
* an Opus 4.7 call with inputTokens=3653 but costUsd=$0.567 (implies
|
|
25
|
+
* ~113K real billed tokens once cache_creation is counted).
|
|
26
|
+
*/
|
|
27
|
+
cacheCreationInputTokens?: number;
|
|
28
|
+
cacheReadInputTokens?: number;
|
|
17
29
|
costUsd: number;
|
|
18
30
|
latencyMs?: number;
|
|
19
31
|
fallback?: boolean;
|
package/package.json
CHANGED