@warmdrift/kgauto-compiler 2.0.0-alpha.3 → 2.0.0-alpha.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,4 +1,4 @@
1
- # @warmdrift/kgauto-compiler — v2.0.0-alpha.3
1
+ # @warmdrift/kgauto-compiler — v2.0.0-alpha.5
2
2
 
3
3
  > Prompt compiler + central learning brain for multi-model AI apps.
4
4
  > **Swap models without rewriting prompts.**
@@ -18,8 +18,8 @@ mutations.
18
18
  - **Package:** alpha — coexists with v1 (`@warmdrift/kgauto@1.2.0`) under
19
19
  the temporary name `@warmdrift/kgauto-compiler`. Renames to v2 final once
20
20
  v1 is fully retired from production.
21
- - **Tests:** 132/132 passing
22
- - **Build:** clean (43KB ESM, 60KB CJS)
21
+ - **Tests:** 180/180 passing
22
+ - **Build:** clean (47KB ESM, 64KB CJS)
23
23
  - **Brain:** schema ready (see `brain/migrations/001_initial_schema.sql`);
24
24
  awaiting dedicated Supabase provisioning.
25
25
  - **Mutation engine:** v2.1 (after enough outcome data accumulates).
package/dist/index.d.mts CHANGED
@@ -1,5 +1,5 @@
1
- import { M as ModelProfile, C as CompilePolicy, N as NormalizedResponse, A as ApiKeys, P as ProviderOverrides, a as CompiledRequest, b as PromptIR, c as CallOptions, d as CallResult, R as RecordInput, O as OracleScore, e as CompileResult } from './profiles-BiyrF36f.mjs';
2
- export { f as ALIASES, g as CacheStrategy, h as CallAttempt, i as CallError, j as CliffRule, k as Constraints, I as IntentDeclaration, L as LoweringSpec, l as Message, m as MutationApplied, n as NormalizedTokens, o as PromptSection, p as Provider, q as RecoveryRule, S as StructuredOutputCapability, r as SystemPromptMode, T as ToolCall, s as ToolDefinition, t as allProfiles, u as getProfile, v as profilesByProvider, w as tryGetProfile } from './profiles-BiyrF36f.mjs';
1
+ import { M as ModelProfile, C as CompilePolicy, N as NormalizedResponse, A as ApiKeys, P as ProviderOverrides, a as CompiledRequest, b as PromptIR, c as CallOptions, d as CallResult, R as RecordInput, O as OracleScore, e as CompileResult } from './profiles-DHdCRBVH.mjs';
2
+ export { f as ALIASES, g as CacheStrategy, h as CallAttempt, i as CallError, j as CliffRule, k as Constraints, I as IntentDeclaration, L as LoweringSpec, l as Message, m as MutationApplied, n as NormalizedTokens, o as PromptSection, p as Provider, q as RecoveryRule, S as StructuredOutputCapability, r as SystemPromptMode, T as ToolCall, s as ToolDefinition, t as allProfiles, u as getProfile, v as profilesByProvider, w as tryGetProfile } from './profiles-DHdCRBVH.mjs';
3
3
  export { ALL_ARCHETYPES, ContextBucket, DIALECT_VERSION, HistoryDepth, INTENT_ARCHETYPES, IntentArchetypeName, OutputMode, ShapeSignature, ToolCountBucket, bucketContext, bucketHistory, bucketToolCount, hashShape, isArchetype, learningKey } from './dialect.mjs';
4
4
 
5
5
  /**
package/dist/index.d.ts CHANGED
@@ -1,5 +1,5 @@
1
- import { M as ModelProfile, C as CompilePolicy, N as NormalizedResponse, A as ApiKeys, P as ProviderOverrides, a as CompiledRequest, b as PromptIR, c as CallOptions, d as CallResult, R as RecordInput, O as OracleScore, e as CompileResult } from './profiles-C5lVqF8_.js';
2
- export { f as ALIASES, g as CacheStrategy, h as CallAttempt, i as CallError, j as CliffRule, k as Constraints, I as IntentDeclaration, L as LoweringSpec, l as Message, m as MutationApplied, n as NormalizedTokens, o as PromptSection, p as Provider, q as RecoveryRule, S as StructuredOutputCapability, r as SystemPromptMode, T as ToolCall, s as ToolDefinition, t as allProfiles, u as getProfile, v as profilesByProvider, w as tryGetProfile } from './profiles-C5lVqF8_.js';
1
+ import { M as ModelProfile, C as CompilePolicy, N as NormalizedResponse, A as ApiKeys, P as ProviderOverrides, a as CompiledRequest, b as PromptIR, c as CallOptions, d as CallResult, R as RecordInput, O as OracleScore, e as CompileResult } from './profiles-MGq5Tnjv.js';
2
+ export { f as ALIASES, g as CacheStrategy, h as CallAttempt, i as CallError, j as CliffRule, k as Constraints, I as IntentDeclaration, L as LoweringSpec, l as Message, m as MutationApplied, n as NormalizedTokens, o as PromptSection, p as Provider, q as RecoveryRule, S as StructuredOutputCapability, r as SystemPromptMode, T as ToolCall, s as ToolDefinition, t as allProfiles, u as getProfile, v as profilesByProvider, w as tryGetProfile } from './profiles-MGq5Tnjv.js';
3
3
  export { ALL_ARCHETYPES, ContextBucket, DIALECT_VERSION, HistoryDepth, INTENT_ARCHETYPES, IntentArchetypeName, OutputMode, ShapeSignature, ToolCountBucket, bucketContext, bucketHistory, bucketToolCount, hashShape, isArchetype, learningKey } from './dialect.js';
4
4
 
5
5
  /**
package/dist/index.js CHANGED
@@ -489,10 +489,15 @@ function lower(ir, profile, hints = {}) {
489
489
  }
490
490
  function lowerAnthropic(ir, profile, hints) {
491
491
  const systemBlocks = buildAnthropicSystemBlocks(ir.sections, profile);
492
- const messages = buildAnthropicMessages(ir.history ?? [], ir.currentTurn);
492
+ const history = ir.history ?? [];
493
+ const policy = ir.historyCachePolicy;
494
+ const markIndex = resolveHistoryMarkIndex(history.length, policy);
495
+ const messages = buildAnthropicMessages(history, ir.currentTurn, markIndex);
493
496
  const tools = ir.tools ? toAnthropicTools(ir.tools) : void 0;
494
497
  const cacheableTokens = computeCacheableTokens(systemBlocks);
495
- const cacheSavings = cacheableTokens / 1e6 * profile.costInputPer1m * (1 - (profile.lowering.cache.discount ?? 0.1));
498
+ const historyCacheableTokens = markIndex >= 0 ? sumHistoryTokens(history, markIndex) : 0;
499
+ const totalCacheableTokens = cacheableTokens + historyCacheableTokens;
500
+ const cacheSavings = totalCacheableTokens / 1e6 * profile.costInputPer1m * (1 - (profile.lowering.cache.discount ?? 0.1));
496
501
  return {
497
502
  request: {
498
503
  provider: "anthropic",
@@ -504,6 +509,7 @@ function lowerAnthropic(ir, profile, hints) {
504
509
  },
505
510
  diagnostics: {
506
511
  cacheableTokens,
512
+ historyCacheableTokens,
507
513
  estimatedCacheSavingsUsd: cacheSavings
508
514
  }
509
515
  };
@@ -536,17 +542,64 @@ function buildAnthropicSystemBlocks(sections, profile) {
536
542
  }
537
543
  return blocks;
538
544
  }
539
- function buildAnthropicMessages(history, currentTurn) {
545
+ function buildAnthropicMessages(history, currentTurn, markIndex) {
540
546
  const out = [];
541
- for (const m of history) {
547
+ for (let i = 0; i < history.length; i++) {
548
+ const m = history[i];
542
549
  if (m.role === "system") continue;
543
- out.push({ role: m.role, content: m.parts ?? m.content });
550
+ const shouldMark = i === markIndex;
551
+ out.push({
552
+ role: m.role,
553
+ content: shouldMark ? attachAnthropicCacheControl(m) : m.parts ?? m.content
554
+ });
544
555
  }
545
556
  if (currentTurn && currentTurn.role !== "system") {
546
557
  out.push({ role: currentTurn.role, content: currentTurn.parts ?? currentTurn.content });
547
558
  }
548
559
  return out;
549
560
  }
561
+ function attachAnthropicCacheControl(m) {
562
+ if (Array.isArray(m.parts) && m.parts.length > 0) {
563
+ const blocks = m.parts;
564
+ const last = blocks[blocks.length - 1];
565
+ const withMarker = {
566
+ ...last,
567
+ cache_control: { type: "ephemeral" }
568
+ };
569
+ return [...blocks.slice(0, -1), withMarker];
570
+ }
571
+ return [
572
+ {
573
+ type: "text",
574
+ text: m.content,
575
+ cache_control: { type: "ephemeral" }
576
+ }
577
+ ];
578
+ }
579
+ function resolveHistoryMarkIndex(historyLen, policy) {
580
+ if (!policy || policy.strategy === "none") return -1;
581
+ if (historyLen === 0) return -1;
582
+ if (policy.strategy === "all-but-latest") {
583
+ return historyLen - 1;
584
+ }
585
+ const idx = historyLen - 1 - policy.suffix;
586
+ return idx >= 0 ? idx : -1;
587
+ }
588
+ function sumHistoryTokens(history, throughIndex) {
589
+ let total = 0;
590
+ for (let i = 0; i <= throughIndex && i < history.length; i++) {
591
+ const m = history[i];
592
+ if (m.role === "system") continue;
593
+ if (Array.isArray(m.parts)) {
594
+ for (const p of m.parts) {
595
+ if (typeof p.text === "string") total += countTokens(p.text);
596
+ }
597
+ } else if (typeof m.content === "string") {
598
+ total += countTokens(m.content);
599
+ }
600
+ }
601
+ return total;
602
+ }
550
603
  function toAnthropicTools(tools) {
551
604
  return tools.map((t) => ({
552
605
  name: t.name,
@@ -581,6 +634,9 @@ function lowerGoogle(ir, profile, hints) {
581
634
  const minTokens = profile.lowering.cache.minTokens ?? 4096;
582
635
  const meetsMin = cacheableTokens >= minTokens;
583
636
  const cacheSavings = meetsMin ? cacheableTokens / 1e6 * profile.costInputPer1m * (1 - (profile.lowering.cache.discount ?? 0.25)) : 0;
637
+ const history = ir.history ?? [];
638
+ const histMarkIndex = resolveHistoryMarkIndex(history.length, ir.historyCachePolicy);
639
+ const historyCacheableTokens = histMarkIndex >= 0 ? sumHistoryTokens(history, histMarkIndex) : 0;
584
640
  return {
585
641
  request: {
586
642
  provider: "google",
@@ -592,6 +648,7 @@ function lowerGoogle(ir, profile, hints) {
592
648
  },
593
649
  diagnostics: {
594
650
  cacheableTokens: meetsMin ? cacheableTokens : 0,
651
+ historyCacheableTokens,
595
652
  estimatedCacheSavingsUsd: cacheSavings
596
653
  }
597
654
  };
@@ -639,6 +696,9 @@ function lowerOpenAI(ir, profile, hints) {
639
696
  content: ir.currentTurn.parts ?? ir.currentTurn.content
640
697
  });
641
698
  }
699
+ const history = ir.history ?? [];
700
+ const histMarkIndex = resolveHistoryMarkIndex(history.length, ir.historyCachePolicy);
701
+ const historyCacheableTokens = histMarkIndex >= 0 ? sumHistoryTokens(history, histMarkIndex) : 0;
642
702
  return {
643
703
  request: {
644
704
  provider: "openai",
@@ -648,7 +708,11 @@ function lowerOpenAI(ir, profile, hints) {
648
708
  response_format: ir.constraints?.structuredOutput ? { type: "json_object" } : void 0,
649
709
  reasoning_effort: hints.forceTerseOutput ? "low" : void 0
650
710
  },
651
- diagnostics: { cacheableTokens: 0, estimatedCacheSavingsUsd: 0 }
711
+ diagnostics: {
712
+ cacheableTokens: 0,
713
+ historyCacheableTokens,
714
+ estimatedCacheSavingsUsd: 0
715
+ }
652
716
  };
653
717
  }
654
718
  function toOpenAITools(tools) {
@@ -675,6 +739,9 @@ function lowerDeepSeek(ir, profile) {
675
739
  content: ir.currentTurn.parts ?? ir.currentTurn.content
676
740
  });
677
741
  }
742
+ const history = ir.history ?? [];
743
+ const histMarkIndex = resolveHistoryMarkIndex(history.length, ir.historyCachePolicy);
744
+ const historyCacheableTokens = histMarkIndex >= 0 ? sumHistoryTokens(history, histMarkIndex) : 0;
678
745
  return {
679
746
  request: {
680
747
  provider: "deepseek",
@@ -689,7 +756,11 @@ function lowerDeepSeek(ir, profile) {
689
756
  }
690
757
  })) : void 0
691
758
  },
692
- diagnostics: { cacheableTokens: 0, estimatedCacheSavingsUsd: 0 }
759
+ diagnostics: {
760
+ cacheableTokens: 0,
761
+ historyCacheableTokens,
762
+ estimatedCacheSavingsUsd: 0
763
+ }
693
764
  };
694
765
  }
695
766
  function sortSections(sections) {
@@ -1181,7 +1252,8 @@ function compile(ir, opts = {}) {
1181
1252
  historyKept: workingIR.history?.length ?? 0,
1182
1253
  historyDropped: (ir.history?.length ?? 0) - (workingIR.history?.length ?? 0),
1183
1254
  cacheableTokens: lowered.diagnostics.cacheableTokens,
1184
- estimatedCacheSavingsUsd: lowered.diagnostics.estimatedCacheSavingsUsd
1255
+ estimatedCacheSavingsUsd: lowered.diagnostics.estimatedCacheSavingsUsd,
1256
+ historyCacheableTokens: lowered.diagnostics.historyCacheableTokens
1185
1257
  }
1186
1258
  };
1187
1259
  }
@@ -1266,7 +1338,8 @@ function registerCompile(appId, archetype, ir, result) {
1266
1338
  learningKey: learningKey(archetype, result.target, shape),
1267
1339
  estimatedTokensIn: tokens,
1268
1340
  mutationsApplied: result.mutationsApplied.map((m) => m.id),
1269
- startedAt: Date.now()
1341
+ startedAt: Date.now(),
1342
+ historyCacheableTokens: result.diagnostics.historyCacheableTokens
1270
1343
  });
1271
1344
  }
1272
1345
  async function record(input) {
@@ -1309,6 +1382,9 @@ function buildPayload(input, reg) {
1309
1382
  const compileTarget = reg?.model;
1310
1383
  const actual = input.actualModel ?? compileTarget;
1311
1384
  const requested = input.actualModel && compileTarget && input.actualModel !== compileTarget ? compileTarget : void 0;
1385
+ const mutationsApplied = input.mutationsApplied ?? reg?.mutationsApplied ?? [];
1386
+ const costModel = actual;
1387
+ const costUsdActual = costModel ? computeCostUsd(costModel, input.tokensIn, input.tokensOut) : void 0;
1312
1388
  return {
1313
1389
  handle: input.handle,
1314
1390
  app_id: reg?.appId,
@@ -1318,7 +1394,7 @@ function buildPayload(input, reg) {
1318
1394
  provider: reg?.provider,
1319
1395
  shape_key: reg?.shapeKey,
1320
1396
  learning_key: reg?.learningKey,
1321
- mutations_applied: reg?.mutationsApplied ?? [],
1397
+ mutations_applied: mutationsApplied,
1322
1398
  tokens_in: input.tokensIn,
1323
1399
  tokens_out: input.tokensOut,
1324
1400
  estimated_tokens_in: reg?.estimatedTokensIn,
@@ -1332,9 +1408,22 @@ function buildPayload(input, reg) {
1332
1408
  oracle_rationale: input.oracleScore?.rationale,
1333
1409
  prompt_preview: input.promptPreview,
1334
1410
  response_preview: input.responsePreview,
1335
- dialect_version: "v1"
1411
+ dialect_version: "v1",
1412
+ cache_read_input_tokens: input.cacheReadInputTokens,
1413
+ cache_creation_input_tokens: input.cacheCreationInputTokens,
1414
+ cost_usd_actual: costUsdActual,
1415
+ ttft_ms: input.ttftMs,
1416
+ history_cacheable_tokens: reg?.historyCacheableTokens
1336
1417
  };
1337
1418
  }
1419
+ function computeCostUsd(modelId, tokensIn, tokensOut) {
1420
+ if (tokensIn === 0 && tokensOut === 0) return void 0;
1421
+ const profile = tryGetProfile(modelId);
1422
+ if (!profile) return void 0;
1423
+ const inUsd = tokensIn / 1e6 * profile.costInputPer1m;
1424
+ const outUsd = tokensOut / 1e6 * profile.costOutputPer1m;
1425
+ return Math.round((inUsd + outUsd) * 1e6) / 1e6;
1426
+ }
1338
1427
 
1339
1428
  // src/ir.ts
1340
1429
  var CallError = class extends Error {
@@ -1607,7 +1696,7 @@ async function call(ir, opts = {}) {
1607
1696
  attempts.push({ model: targetModel, status: "success" });
1608
1697
  const latencyMs2 = Date.now() - start;
1609
1698
  const responseWithStructured = withStructuredOutput(exec.response, ir);
1610
- void record({
1699
+ await record({
1611
1700
  handle: initial.handle,
1612
1701
  tokensIn: responseWithStructured.tokens.input,
1613
1702
  tokensOut: responseWithStructured.tokens.output,
@@ -1616,7 +1705,11 @@ async function call(ir, opts = {}) {
1616
1705
  emptyResponse: responseWithStructured.tokens.output === 0,
1617
1706
  toolsCalled: responseWithStructured.toolCalls.map((tc) => tc.name),
1618
1707
  actualModel: targetModel !== initial.target ? targetModel : void 0,
1619
- responsePreview: responseWithStructured.text.slice(0, 200)
1708
+ mutationsApplied: targetModel !== initial.target ? activeCompile.mutationsApplied.map((m) => m.id) : void 0,
1709
+ promptPreview: extractPromptPreview(ir),
1710
+ responsePreview: responseWithStructured.text.slice(0, 200),
1711
+ cacheReadInputTokens: responseWithStructured.tokens.cached,
1712
+ cacheCreationInputTokens: responseWithStructured.tokens.cacheCreated
1620
1713
  });
1621
1714
  return {
1622
1715
  handle: initial.handle,
@@ -1641,13 +1734,14 @@ async function call(ir, opts = {}) {
1641
1734
  }
1642
1735
  }
1643
1736
  const latencyMs = Date.now() - start;
1644
- void record({
1737
+ await record({
1645
1738
  handle: initial.handle,
1646
1739
  tokensIn: 0,
1647
1740
  tokensOut: 0,
1648
1741
  latencyMs,
1649
1742
  success: false,
1650
- errorType: lastErr?.errorCode
1743
+ errorType: lastErr?.errorCode,
1744
+ promptPreview: extractPromptPreview(ir)
1651
1745
  });
1652
1746
  throw new CallError(
1653
1747
  `call(): all attempts failed${lastErr ? ` \u2014 ${lastErr.errorCode}: ${lastErr.message}` : ""}`,
@@ -1665,6 +1759,13 @@ function compileAndRegister(ir, opts) {
1665
1759
  registerCompile(ir.appId, ir.intent.archetype, ir, result);
1666
1760
  return result;
1667
1761
  }
1762
+ function extractPromptPreview(ir) {
1763
+ const turn = ir.currentTurn?.content;
1764
+ if (turn) return turn.slice(0, 200);
1765
+ const lastHist = ir.history?.[ir.history.length - 1]?.content;
1766
+ if (lastHist) return lastHist.slice(0, 200);
1767
+ return void 0;
1768
+ }
1668
1769
  function withStructuredOutput(response, ir) {
1669
1770
  if (!ir.constraints?.structuredOutput) return response;
1670
1771
  if (!response.text) return response;
package/dist/index.mjs CHANGED
@@ -374,10 +374,15 @@ function lower(ir, profile, hints = {}) {
374
374
  }
375
375
  function lowerAnthropic(ir, profile, hints) {
376
376
  const systemBlocks = buildAnthropicSystemBlocks(ir.sections, profile);
377
- const messages = buildAnthropicMessages(ir.history ?? [], ir.currentTurn);
377
+ const history = ir.history ?? [];
378
+ const policy = ir.historyCachePolicy;
379
+ const markIndex = resolveHistoryMarkIndex(history.length, policy);
380
+ const messages = buildAnthropicMessages(history, ir.currentTurn, markIndex);
378
381
  const tools = ir.tools ? toAnthropicTools(ir.tools) : void 0;
379
382
  const cacheableTokens = computeCacheableTokens(systemBlocks);
380
- const cacheSavings = cacheableTokens / 1e6 * profile.costInputPer1m * (1 - (profile.lowering.cache.discount ?? 0.1));
383
+ const historyCacheableTokens = markIndex >= 0 ? sumHistoryTokens(history, markIndex) : 0;
384
+ const totalCacheableTokens = cacheableTokens + historyCacheableTokens;
385
+ const cacheSavings = totalCacheableTokens / 1e6 * profile.costInputPer1m * (1 - (profile.lowering.cache.discount ?? 0.1));
381
386
  return {
382
387
  request: {
383
388
  provider: "anthropic",
@@ -389,6 +394,7 @@ function lowerAnthropic(ir, profile, hints) {
389
394
  },
390
395
  diagnostics: {
391
396
  cacheableTokens,
397
+ historyCacheableTokens,
392
398
  estimatedCacheSavingsUsd: cacheSavings
393
399
  }
394
400
  };
@@ -421,17 +427,64 @@ function buildAnthropicSystemBlocks(sections, profile) {
421
427
  }
422
428
  return blocks;
423
429
  }
424
- function buildAnthropicMessages(history, currentTurn) {
430
+ function buildAnthropicMessages(history, currentTurn, markIndex) {
425
431
  const out = [];
426
- for (const m of history) {
432
+ for (let i = 0; i < history.length; i++) {
433
+ const m = history[i];
427
434
  if (m.role === "system") continue;
428
- out.push({ role: m.role, content: m.parts ?? m.content });
435
+ const shouldMark = i === markIndex;
436
+ out.push({
437
+ role: m.role,
438
+ content: shouldMark ? attachAnthropicCacheControl(m) : m.parts ?? m.content
439
+ });
429
440
  }
430
441
  if (currentTurn && currentTurn.role !== "system") {
431
442
  out.push({ role: currentTurn.role, content: currentTurn.parts ?? currentTurn.content });
432
443
  }
433
444
  return out;
434
445
  }
446
+ function attachAnthropicCacheControl(m) {
447
+ if (Array.isArray(m.parts) && m.parts.length > 0) {
448
+ const blocks = m.parts;
449
+ const last = blocks[blocks.length - 1];
450
+ const withMarker = {
451
+ ...last,
452
+ cache_control: { type: "ephemeral" }
453
+ };
454
+ return [...blocks.slice(0, -1), withMarker];
455
+ }
456
+ return [
457
+ {
458
+ type: "text",
459
+ text: m.content,
460
+ cache_control: { type: "ephemeral" }
461
+ }
462
+ ];
463
+ }
464
+ function resolveHistoryMarkIndex(historyLen, policy) {
465
+ if (!policy || policy.strategy === "none") return -1;
466
+ if (historyLen === 0) return -1;
467
+ if (policy.strategy === "all-but-latest") {
468
+ return historyLen - 1;
469
+ }
470
+ const idx = historyLen - 1 - policy.suffix;
471
+ return idx >= 0 ? idx : -1;
472
+ }
473
+ function sumHistoryTokens(history, throughIndex) {
474
+ let total = 0;
475
+ for (let i = 0; i <= throughIndex && i < history.length; i++) {
476
+ const m = history[i];
477
+ if (m.role === "system") continue;
478
+ if (Array.isArray(m.parts)) {
479
+ for (const p of m.parts) {
480
+ if (typeof p.text === "string") total += countTokens(p.text);
481
+ }
482
+ } else if (typeof m.content === "string") {
483
+ total += countTokens(m.content);
484
+ }
485
+ }
486
+ return total;
487
+ }
435
488
  function toAnthropicTools(tools) {
436
489
  return tools.map((t) => ({
437
490
  name: t.name,
@@ -466,6 +519,9 @@ function lowerGoogle(ir, profile, hints) {
466
519
  const minTokens = profile.lowering.cache.minTokens ?? 4096;
467
520
  const meetsMin = cacheableTokens >= minTokens;
468
521
  const cacheSavings = meetsMin ? cacheableTokens / 1e6 * profile.costInputPer1m * (1 - (profile.lowering.cache.discount ?? 0.25)) : 0;
522
+ const history = ir.history ?? [];
523
+ const histMarkIndex = resolveHistoryMarkIndex(history.length, ir.historyCachePolicy);
524
+ const historyCacheableTokens = histMarkIndex >= 0 ? sumHistoryTokens(history, histMarkIndex) : 0;
469
525
  return {
470
526
  request: {
471
527
  provider: "google",
@@ -477,6 +533,7 @@ function lowerGoogle(ir, profile, hints) {
477
533
  },
478
534
  diagnostics: {
479
535
  cacheableTokens: meetsMin ? cacheableTokens : 0,
536
+ historyCacheableTokens,
480
537
  estimatedCacheSavingsUsd: cacheSavings
481
538
  }
482
539
  };
@@ -524,6 +581,9 @@ function lowerOpenAI(ir, profile, hints) {
524
581
  content: ir.currentTurn.parts ?? ir.currentTurn.content
525
582
  });
526
583
  }
584
+ const history = ir.history ?? [];
585
+ const histMarkIndex = resolveHistoryMarkIndex(history.length, ir.historyCachePolicy);
586
+ const historyCacheableTokens = histMarkIndex >= 0 ? sumHistoryTokens(history, histMarkIndex) : 0;
527
587
  return {
528
588
  request: {
529
589
  provider: "openai",
@@ -533,7 +593,11 @@ function lowerOpenAI(ir, profile, hints) {
533
593
  response_format: ir.constraints?.structuredOutput ? { type: "json_object" } : void 0,
534
594
  reasoning_effort: hints.forceTerseOutput ? "low" : void 0
535
595
  },
536
- diagnostics: { cacheableTokens: 0, estimatedCacheSavingsUsd: 0 }
596
+ diagnostics: {
597
+ cacheableTokens: 0,
598
+ historyCacheableTokens,
599
+ estimatedCacheSavingsUsd: 0
600
+ }
537
601
  };
538
602
  }
539
603
  function toOpenAITools(tools) {
@@ -560,6 +624,9 @@ function lowerDeepSeek(ir, profile) {
560
624
  content: ir.currentTurn.parts ?? ir.currentTurn.content
561
625
  });
562
626
  }
627
+ const history = ir.history ?? [];
628
+ const histMarkIndex = resolveHistoryMarkIndex(history.length, ir.historyCachePolicy);
629
+ const historyCacheableTokens = histMarkIndex >= 0 ? sumHistoryTokens(history, histMarkIndex) : 0;
563
630
  return {
564
631
  request: {
565
632
  provider: "deepseek",
@@ -574,7 +641,11 @@ function lowerDeepSeek(ir, profile) {
574
641
  }
575
642
  })) : void 0
576
643
  },
577
- diagnostics: { cacheableTokens: 0, estimatedCacheSavingsUsd: 0 }
644
+ diagnostics: {
645
+ cacheableTokens: 0,
646
+ historyCacheableTokens,
647
+ estimatedCacheSavingsUsd: 0
648
+ }
578
649
  };
579
650
  }
580
651
  function sortSections(sections) {
@@ -664,7 +735,8 @@ function compile(ir, opts = {}) {
664
735
  historyKept: workingIR.history?.length ?? 0,
665
736
  historyDropped: (ir.history?.length ?? 0) - (workingIR.history?.length ?? 0),
666
737
  cacheableTokens: lowered.diagnostics.cacheableTokens,
667
- estimatedCacheSavingsUsd: lowered.diagnostics.estimatedCacheSavingsUsd
738
+ estimatedCacheSavingsUsd: lowered.diagnostics.estimatedCacheSavingsUsd,
739
+ historyCacheableTokens: lowered.diagnostics.historyCacheableTokens
668
740
  }
669
741
  };
670
742
  }
@@ -749,7 +821,8 @@ function registerCompile(appId, archetype, ir, result) {
749
821
  learningKey: learningKey(archetype, result.target, shape),
750
822
  estimatedTokensIn: tokens,
751
823
  mutationsApplied: result.mutationsApplied.map((m) => m.id),
752
- startedAt: Date.now()
824
+ startedAt: Date.now(),
825
+ historyCacheableTokens: result.diagnostics.historyCacheableTokens
753
826
  });
754
827
  }
755
828
  async function record(input) {
@@ -792,6 +865,9 @@ function buildPayload(input, reg) {
792
865
  const compileTarget = reg?.model;
793
866
  const actual = input.actualModel ?? compileTarget;
794
867
  const requested = input.actualModel && compileTarget && input.actualModel !== compileTarget ? compileTarget : void 0;
868
+ const mutationsApplied = input.mutationsApplied ?? reg?.mutationsApplied ?? [];
869
+ const costModel = actual;
870
+ const costUsdActual = costModel ? computeCostUsd(costModel, input.tokensIn, input.tokensOut) : void 0;
795
871
  return {
796
872
  handle: input.handle,
797
873
  app_id: reg?.appId,
@@ -801,7 +877,7 @@ function buildPayload(input, reg) {
801
877
  provider: reg?.provider,
802
878
  shape_key: reg?.shapeKey,
803
879
  learning_key: reg?.learningKey,
804
- mutations_applied: reg?.mutationsApplied ?? [],
880
+ mutations_applied: mutationsApplied,
805
881
  tokens_in: input.tokensIn,
806
882
  tokens_out: input.tokensOut,
807
883
  estimated_tokens_in: reg?.estimatedTokensIn,
@@ -815,9 +891,22 @@ function buildPayload(input, reg) {
815
891
  oracle_rationale: input.oracleScore?.rationale,
816
892
  prompt_preview: input.promptPreview,
817
893
  response_preview: input.responsePreview,
818
- dialect_version: "v1"
894
+ dialect_version: "v1",
895
+ cache_read_input_tokens: input.cacheReadInputTokens,
896
+ cache_creation_input_tokens: input.cacheCreationInputTokens,
897
+ cost_usd_actual: costUsdActual,
898
+ ttft_ms: input.ttftMs,
899
+ history_cacheable_tokens: reg?.historyCacheableTokens
819
900
  };
820
901
  }
902
+ function computeCostUsd(modelId, tokensIn, tokensOut) {
903
+ if (tokensIn === 0 && tokensOut === 0) return void 0;
904
+ const profile = tryGetProfile(modelId);
905
+ if (!profile) return void 0;
906
+ const inUsd = tokensIn / 1e6 * profile.costInputPer1m;
907
+ const outUsd = tokensOut / 1e6 * profile.costOutputPer1m;
908
+ return Math.round((inUsd + outUsd) * 1e6) / 1e6;
909
+ }
821
910
 
822
911
  // src/ir.ts
823
912
  var CallError = class extends Error {
@@ -1090,7 +1179,7 @@ async function call(ir, opts = {}) {
1090
1179
  attempts.push({ model: targetModel, status: "success" });
1091
1180
  const latencyMs2 = Date.now() - start;
1092
1181
  const responseWithStructured = withStructuredOutput(exec.response, ir);
1093
- void record({
1182
+ await record({
1094
1183
  handle: initial.handle,
1095
1184
  tokensIn: responseWithStructured.tokens.input,
1096
1185
  tokensOut: responseWithStructured.tokens.output,
@@ -1099,7 +1188,11 @@ async function call(ir, opts = {}) {
1099
1188
  emptyResponse: responseWithStructured.tokens.output === 0,
1100
1189
  toolsCalled: responseWithStructured.toolCalls.map((tc) => tc.name),
1101
1190
  actualModel: targetModel !== initial.target ? targetModel : void 0,
1102
- responsePreview: responseWithStructured.text.slice(0, 200)
1191
+ mutationsApplied: targetModel !== initial.target ? activeCompile.mutationsApplied.map((m) => m.id) : void 0,
1192
+ promptPreview: extractPromptPreview(ir),
1193
+ responsePreview: responseWithStructured.text.slice(0, 200),
1194
+ cacheReadInputTokens: responseWithStructured.tokens.cached,
1195
+ cacheCreationInputTokens: responseWithStructured.tokens.cacheCreated
1103
1196
  });
1104
1197
  return {
1105
1198
  handle: initial.handle,
@@ -1124,13 +1217,14 @@ async function call(ir, opts = {}) {
1124
1217
  }
1125
1218
  }
1126
1219
  const latencyMs = Date.now() - start;
1127
- void record({
1220
+ await record({
1128
1221
  handle: initial.handle,
1129
1222
  tokensIn: 0,
1130
1223
  tokensOut: 0,
1131
1224
  latencyMs,
1132
1225
  success: false,
1133
- errorType: lastErr?.errorCode
1226
+ errorType: lastErr?.errorCode,
1227
+ promptPreview: extractPromptPreview(ir)
1134
1228
  });
1135
1229
  throw new CallError(
1136
1230
  `call(): all attempts failed${lastErr ? ` \u2014 ${lastErr.errorCode}: ${lastErr.message}` : ""}`,
@@ -1148,6 +1242,13 @@ function compileAndRegister(ir, opts) {
1148
1242
  registerCompile(ir.appId, ir.intent.archetype, ir, result);
1149
1243
  return result;
1150
1244
  }
1245
+ function extractPromptPreview(ir) {
1246
+ const turn = ir.currentTurn?.content;
1247
+ if (turn) return turn.slice(0, 200);
1248
+ const lastHist = ir.history?.[ir.history.length - 1]?.content;
1249
+ if (lastHist) return lastHist.slice(0, 200);
1250
+ return void 0;
1251
+ }
1151
1252
  function withStructuredOutput(response, ir) {
1152
1253
  if (!ir.constraints?.structuredOutput) return response;
1153
1254
  if (!response.text) return response;
@@ -91,6 +91,40 @@ interface Constraints {
91
91
  /** Override target model selection — if set, compiler uses this instead of routing. */
92
92
  forceModel?: string;
93
93
  }
94
+ /**
95
+ * Cache marker policy for the messages array (history + currentTurn).
96
+ *
97
+ * Anthropic positional caching: a `cache_control` marker on a content block
98
+ * tells the API "remember the prefix up through this block." On a subsequent
99
+ * request whose first N tokens match, those N tokens are billed at the cached rate
100
+ * (10% of the input price). Without a marker, every call re-pays for the
101
+ * entire history.
102
+ *
103
+ * - `'none'` (default when omitted): no history cache marker. System-level
104
+ * cache markers from `PromptSection.cacheable=true` still apply.
105
+ * - `'all-but-latest'`: marks the message immediately preceding `currentTurn`
106
+ * (the last history entry). On the next call, that entire history prefix
107
+ * is cacheable. Good fit for chat/agent loops where every prior turn is
108
+ * stable.
109
+ * - `'fixed-suffix'`: marks the message `suffix` positions from the end of
110
+ * `history`. Use when the last few turns are volatile (e.g., scratchpad,
111
+ * draft revisions) but the earlier prefix is stable.
112
+ *
113
+ * For non-Anthropic providers, no wire-format marker is emitted (Gemini /
114
+ * OpenAI / DeepSeek implicit caching takes effect automatically when a
115
+ * stable prefix is reused). The compiler still computes
116
+ * `diagnostics.historyCacheableTokens` for telemetry on every provider.
117
+ *
118
+ * alpha.5.
119
+ */
120
+ type HistoryCachePolicy = {
121
+ strategy: 'none';
122
+ } | {
123
+ strategy: 'all-but-latest';
124
+ } | {
125
+ strategy: 'fixed-suffix';
126
+ suffix: number;
127
+ };
94
128
  /**
95
129
  * Consumer-declared policy for model selection. Lives outside the IR
96
130
  * (passed via CompileOptions) because it's a SESSION/APP-level constraint,
@@ -146,6 +180,12 @@ interface PromptIR {
146
180
  models: string[];
147
181
  /** Compile constraints. */
148
182
  constraints?: Constraints;
183
+ /**
184
+ * Cache marker placement policy for the messages array. Default = no
185
+ * history cache markers. See `HistoryCachePolicy` for semantics.
186
+ * alpha.5.
187
+ */
188
+ historyCachePolicy?: HistoryCachePolicy;
149
189
  }
150
190
  type Provider = 'anthropic' | 'google' | 'openai' | 'deepseek' | 'mistral' | 'xai';
151
191
  /**
@@ -240,6 +280,16 @@ interface CompileResult {
240
280
  historyDropped: number;
241
281
  cacheableTokens: number;
242
282
  estimatedCacheSavingsUsd: number;
283
+ /**
284
+ * Tokens in `history` (non-system messages up to and including the marked one) that
285
+ * fall within the cacheable prefix per `historyCachePolicy`. Always
286
+ * computed; only Anthropic actually emits a wire-format marker. For
287
+ * Gemini / OpenAI / DeepSeek, this represents the theoretical cacheable
288
+ * prefix that implicit caching may pick up — useful telemetry for the
289
+ * brain to learn which (app, model, archetype) tuples benefit most
290
+ * from history caching. alpha.5.
291
+ */
292
+ historyCacheableTokens: number;
243
293
  };
244
294
  }
245
295
  /**
@@ -386,6 +436,38 @@ interface RecordInput {
386
436
  * the originally-requested model.
387
437
  */
388
438
  actualModel?: string;
439
+ /**
440
+ * Override `mutations_applied` for this outcome. Set by `call()` when
441
+ * fallback fires — the served compile's mutations (which actually shaped
442
+ * the request that went on the wire) replace the initial compile's
443
+ * mutations (registered against the handle). Without this override, fallback
444
+ * traffic is attributed to the initial compile's mutations and the brain's
445
+ * mutation effectiveness stats become misleading.
446
+ *
447
+ * alpha.4: extends s11 truth-in-logging to mutations.
448
+ */
449
+ mutationsApplied?: string[];
450
+ /**
451
+ * Cache read input tokens, when supported by the provider.
452
+ * - Anthropic: `usage.cache_read_input_tokens`
453
+ * - Google (implicit caching): `usageMetadata.cachedContentTokenCount`
454
+ * - OpenAI: `usage.prompt_tokens_details.cached_tokens`
455
+ *
456
+ * Powers the cost-and-efficiency-watcher (interfaces/kgauto.md, alpha.4):
457
+ * `tokens_in - cache_read_input_tokens` is the un-cached new context per call.
458
+ */
459
+ cacheReadInputTokens?: number;
460
+ /**
461
+ * Cache creation input tokens (Anthropic-specific).
462
+ * `usage.cache_creation_input_tokens`. Reported by the first call, which pays
463
+ * the 25% upcharge to write a cache marker; later calls hit `cacheReadInputTokens`.
464
+ */
465
+ cacheCreationInputTokens?: number;
466
+ /**
467
+ * Time to first token (ms). Optional; populated when the provider/SDK
468
+ * surfaces it. Distinct from `latencyMs` (end-to-end wall clock).
469
+ */
470
+ ttftMs?: number;
389
471
  }
390
472
 
391
473
  /**
@@ -91,6 +91,40 @@ interface Constraints {
91
91
  /** Override target model selection — if set, compiler uses this instead of routing. */
92
92
  forceModel?: string;
93
93
  }
94
+ /**
95
+ * Cache marker policy for the messages array (history + currentTurn).
96
+ *
97
+ * Anthropic positional caching: a `cache_control` marker on a content block
98
+ * tells the API "remember the prefix up through this block." On a subsequent
99
+ * request whose first N tokens match, those N are billed at the cached rate
100
+ * (10% of the input price). Without a marker, every call re-pays for the
101
+ * entire history.
102
+ *
103
+ * - `'none'` (default when omitted): no history cache marker. System-level
104
+ * cache markers from `PromptSection.cacheable=true` still apply.
105
+ * - `'all-but-latest'`: marks the message immediately preceding `currentTurn`
106
+ * (the last history entry). On the next call, that entire history prefix
107
+ * is cacheable. Good fit for chat/agent loops where every prior turn is
108
+ * stable.
109
+ * - `'fixed-suffix'`: marks the message `suffix` positions from the end of
110
+ * `history`. Use when the last few turns are volatile (e.g., scratchpad,
111
+ * draft revisions) but the earlier prefix is stable.
112
+ *
113
+ * For non-Anthropic providers, no wire-format marker is emitted (Gemini /
114
+ * OpenAI / DeepSeek implicit caching takes effect automatically when a
115
+ * stable prefix is reused). The compiler still computes
116
+ * `diagnostics.historyCacheableTokens` for telemetry on every provider.
117
+ *
118
+ * alpha.5.
119
+ */
120
+ type HistoryCachePolicy = {
121
+ strategy: 'none';
122
+ } | {
123
+ strategy: 'all-but-latest';
124
+ } | {
125
+ strategy: 'fixed-suffix';
126
+ suffix: number;
127
+ };
94
128
  /**
95
129
  * Consumer-declared policy for model selection. Lives outside the IR
96
130
  * (passed via CompileOptions) because it's a SESSION/APP-level constraint,
@@ -146,6 +180,12 @@ interface PromptIR {
146
180
  models: string[];
147
181
  /** Compile constraints. */
148
182
  constraints?: Constraints;
183
+ /**
184
+ * Cache marker placement policy for the messages array. Default = no
185
+ * history cache markers. See `HistoryCachePolicy` for semantics.
186
+ * alpha.5.
187
+ */
188
+ historyCachePolicy?: HistoryCachePolicy;
149
189
  }
150
190
  type Provider = 'anthropic' | 'google' | 'openai' | 'deepseek' | 'mistral' | 'xai';
151
191
  /**
@@ -240,6 +280,16 @@ interface CompileResult {
240
280
  historyDropped: number;
241
281
  cacheableTokens: number;
242
282
  estimatedCacheSavingsUsd: number;
283
+ /**
284
+ * Tokens in `history` (and `currentTurn` when before the marker) that
285
+ * fall within the cacheable prefix per `historyCachePolicy`. Always
286
+ * computed; only Anthropic actually emits a wire-format marker. For
287
+ * Gemini / OpenAI / DeepSeek, this represents the theoretical cacheable
288
+ * prefix that implicit caching may pick up — useful telemetry for the
289
+ * brain to learn which (app, model, archetype) tuples benefit most
290
+ * from history caching. alpha.5.
291
+ */
292
+ historyCacheableTokens: number;
243
293
  };
244
294
  }
245
295
  /**
@@ -386,6 +436,38 @@ interface RecordInput {
386
436
  * the originally-requested model.
387
437
  */
388
438
  actualModel?: string;
439
+ /**
440
+ * Override `mutations_applied` for this outcome. Set by `call()` when
441
+ * fallback fires — the served compile's mutations (which actually shaped
442
+ * the request that went on the wire) replace the initial compile's
443
+ * mutations (registered against the handle). Without this override, fallback
444
+ * traffic is attributed to the initial compile's mutations and the brain's
445
+ * mutation effectiveness stats become misleading.
446
+ *
447
+ * alpha.4: extends s11 truth-in-logging to mutations.
448
+ */
449
+ mutationsApplied?: string[];
450
+ /**
451
+ * Cache read input tokens, when supported by the provider.
452
+ * - Anthropic: `usage.cache_read_input_tokens`
453
+ * - Google (implicit caching): `usageMetadata.cachedContentTokenCount`
454
+ * - OpenAI: `usage.prompt_tokens_details.cached_tokens`
455
+ *
456
+ * Powers the cost-and-efficiency-watcher (interfaces/kgauto.md, alpha.4):
457
+ * `tokens_in - cache_read_input_tokens` is the un-cached new context per call.
458
+ */
459
+ cacheReadInputTokens?: number;
460
+ /**
461
+ * Cache creation input tokens (Anthropic-specific).
462
+ * `usage.cache_creation_input_tokens`. Reported by the first call, which pays
463
+ * the 25% upcharge to write a cache marker; later calls hit `cacheReadInputTokens`.
464
+ */
465
+ cacheCreationInputTokens?: number;
466
+ /**
467
+ * Time to first token (ms). Optional; populated when the provider/SDK
468
+ * surfaces it. Distinct from `latencyMs` (end-to-end wall clock).
469
+ */
470
+ ttftMs?: number;
389
471
  }
390
472
 
391
473
  /**
@@ -1,2 +1,2 @@
1
- export { f as ALIASES, g as CacheStrategy, j as CliffRule, L as LoweringSpec, M as ModelProfile, q as RecoveryRule, S as StructuredOutputCapability, r as SystemPromptMode, t as allProfiles, u as getProfile, v as profilesByProvider, w as tryGetProfile } from './profiles-BiyrF36f.mjs';
1
+ export { f as ALIASES, g as CacheStrategy, j as CliffRule, L as LoweringSpec, M as ModelProfile, q as RecoveryRule, S as StructuredOutputCapability, r as SystemPromptMode, t as allProfiles, u as getProfile, v as profilesByProvider, w as tryGetProfile } from './profiles-DHdCRBVH.mjs';
2
2
  import './dialect.mjs';
@@ -1,2 +1,2 @@
1
- export { f as ALIASES, g as CacheStrategy, j as CliffRule, L as LoweringSpec, M as ModelProfile, q as RecoveryRule, S as StructuredOutputCapability, r as SystemPromptMode, t as allProfiles, u as getProfile, v as profilesByProvider, w as tryGetProfile } from './profiles-C5lVqF8_.js';
1
+ export { f as ALIASES, g as CacheStrategy, j as CliffRule, L as LoweringSpec, M as ModelProfile, q as RecoveryRule, S as StructuredOutputCapability, r as SystemPromptMode, t as allProfiles, u as getProfile, v as profilesByProvider, w as tryGetProfile } from './profiles-MGq5Tnjv.js';
2
2
  import './dialect.js';
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@warmdrift/kgauto-compiler",
3
- "version": "2.0.0-alpha.3",
3
+ "version": "2.0.0-alpha.5",
4
4
  "description": "Prompt compiler + central learning brain for multi-model AI apps. Swap models without rewriting prompts.",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",