@askalf/dario 4.8.40 → 4.8.41

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -323,6 +323,23 @@ export declare function resolveEffort(flag: EffortValue | undefined, clientBody:
323
323
  * — never a broken request.
324
324
  */
325
325
  export declare function supportsAdaptiveThinking(modelId: string): boolean;
326
+ /**
327
+ * Place CC-style prompt-cache breakpoints on the tools array and the
328
+ * conversation. The system prompt is already cached at build time (2 system
329
+ * breakpoints); this adds the last tool + a single rolling breakpoint on the
330
+ * last message — total 4, the Anthropic max, mirroring Claude Code.
331
+ *
332
+ * Why: dario previously cached ONLY the system prompt and stripped every
333
+ * message breakpoint, so the tools schema (10-20KB) and the entire growing
334
+ * conversation re-billed as FRESH input every turn. Fleet cache-read ran ~1.9%
335
+ * vs CC's ~70-90%, draining the Max 5h/7d token window 10-50x faster — which is
336
+ * exactly why long agentic sessions hit a wall through dario that real CC
337
+ * sails through. CC genuinely caches tools + conversation, so NOT caching them
338
+ * was itself a wire divergence from CC. Exported for unit testing.
339
+ */
340
+ export declare function applyCcPromptCaching(ccRequest: Record<string, unknown>, cacheControl: {
341
+ type: 'ephemeral';
342
+ }): void;
326
343
  export declare function buildCCRequest(clientBody: Record<string, unknown>, billingTag: string, cacheControl: {
327
344
  type: 'ephemeral';
328
345
  }, identity: {
@@ -1053,6 +1053,49 @@ export function supportsAdaptiveThinking(modelId) {
1053
1053
  return true;
1054
1054
  return false;
1055
1055
  }
1056
+ /**
1057
+ * Place CC-style prompt-cache breakpoints on the tools array and the
1058
+ * conversation. The system prompt is already cached at build time (2 system
1059
+ * breakpoints); this adds the last tool + a single rolling breakpoint on the
1060
+ * last message — total 4, the Anthropic max, mirroring Claude Code.
1061
+ *
1062
+ * Why: dario previously cached ONLY the system prompt and stripped every
1063
+ * message breakpoint, so the tools schema (10-20KB) and the entire growing
1064
+ * conversation re-billed as FRESH input every turn. Fleet cache-read ran ~1.9%
1065
+ * vs CC's ~70-90%, draining the Max 5h/7d token window 10-50x faster — which is
1066
+ * exactly why long agentic sessions hit a wall through dario that real CC
1067
+ * sails through. CC genuinely caches tools + conversation, so NOT caching them
1068
+ * was itself a wire divergence from CC. Exported for unit testing.
1069
+ */
1070
+ export function applyCcPromptCaching(ccRequest, cacheControl) {
1071
+ // Tools — clone (CC_TOOL_DEFINITIONS is a shared module constant), strip any
1072
+ // stray breakpoints, cache the LAST tool (caches the whole tools prefix).
1073
+ const tools = ccRequest.tools;
1074
+ if (Array.isArray(tools) && tools.length > 0) {
1075
+ const cloned = tools.map((t) => {
1076
+ const copy = { ...t };
1077
+ delete copy.cache_control;
1078
+ return copy;
1079
+ });
1080
+ cloned[cloned.length - 1] = { ...cloned[cloned.length - 1], cache_control: cacheControl };
1081
+ ccRequest.tools = cloned;
1082
+ }
1083
+ // Conversation — cache up to and including the last message so the NEXT turn
1084
+ // reads the whole prefix from cache. Client breakpoints were already stripped
1085
+ // upstream; this is the single rolling breakpoint CC uses.
1086
+ const msgs = ccRequest.messages;
1087
+ if (Array.isArray(msgs) && msgs.length > 0) {
1088
+ const last = msgs[msgs.length - 1];
1089
+ // Only block-array content gets a breakpoint. String content (some SDK
1090
+ // clients) is left untouched — wrapping it would change the wire shape, and
1091
+ // a bare string user turn is tiny anyway, so system+tools caching is the
1092
+ // win. Real CC / agentic sessions use block arrays, which DO get cached.
1093
+ if (Array.isArray(last.content) && last.content.length > 0) {
1094
+ const blocks = last.content;
1095
+ blocks[blocks.length - 1] = { ...blocks[blocks.length - 1], cache_control: cacheControl };
1096
+ }
1097
+ }
1098
+ }
1056
1099
  export function buildCCRequest(clientBody, billingTag, cacheControl, identity, opts = {}) {
1057
1100
  const model = clientBody.model || 'claude-sonnet-4-6';
1058
1101
  const isHaiku = model.toLowerCase().includes('haiku');
package/dist/proxy.js CHANGED
@@ -7,7 +7,7 @@ import { homedir } from 'node:os';
7
7
  import { setDefaultResultOrder } from 'node:dns';
8
8
  import { arch, platform } from 'node:process';
9
9
  import { getAccessToken, getStatus } from './oauth.js';
10
- import { buildCCRequest, parseEffortSuffix, reverseMapResponse, createStreamingReverseMapper, orderHeadersForOutbound, CC_TEMPLATE } from './cc-template.js';
10
+ import { buildCCRequest, applyCcPromptCaching, parseEffortSuffix, reverseMapResponse, createStreamingReverseMapper, orderHeadersForOutbound, CC_TEMPLATE } from './cc-template.js';
11
11
  import { describeTemplate, detectDrift, checkCCCompat } from './live-fingerprint.js';
12
12
  import { AccountPool, computeStickyKey, parseRateLimits, modelFamily, isInAuthCooldown, authCooldownMs } from './pool.js';
13
13
  import { Analytics, billingBucketFromClaim } from './analytics.js';
@@ -1467,6 +1467,15 @@ export async function startProxy(opts = {}) {
1467
1467
  skipFields,
1468
1468
  honorClientThinking: opts.honorClientThinking ?? false,
1469
1469
  });
1470
+ // Prompt-cache the tools + conversation prefix (the system prompt
1471
+ // is already cached in ccBody's system blocks). Mirrors CC's cache
1472
+ // breakpoints so a long session doesn't re-bill them as fresh input
1473
+ // every turn and burn the Max 5h/7d window — the cause of the
1474
+ // "sessions wall in minutes through dario but not CC" report.
1475
+ // Opt-out: DARIO_SKIP_FIELDS=prompt_cache.
1476
+ if (!skipFields?.has('prompt_cache')) {
1477
+ applyCcPromptCaching(ccBody, CACHE_EPHEMERAL);
1478
+ }
1470
1479
  detectedClientForLog = detectedClient;
1471
1480
  preserveToolsEffective = Boolean(opts.preserveTools)
1472
1481
  || (Boolean(detectedClient) && !opts.hybridTools && !opts.mergeTools);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@askalf/dario",
3
- "version": "4.8.40",
3
+ "version": "4.8.41",
4
4
  "description": "Use your Claude Pro/Max subscription in any tool — Cursor, Cline, Aider, the Agent SDK, your scripts — at subscription pricing, not per-token API bills. One local Anthropic + OpenAI-compatible endpoint.",
5
5
  "type": "module",
6
6
  "bin": {