@warmdrift/kgauto-compiler 2.0.0-alpha.14 → 2.0.0-alpha.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -269,33 +269,69 @@ function passCompressHistory(ir, opts = {}) {
269
269
  const summarizeAboveTokens = opts.summarizeAboveTokens;
270
270
  const historyTokensTotal = totalHistoryTokens(history);
271
271
  const countThresholdHit = history.length > summarizeOlderThan;
272
- const tokenThresholdHit = summarizeAboveTokens !== void 0 && historyTokensTotal > summarizeAboveTokens && history.length > keepRecent;
272
+ const tokenThresholdHit = summarizeAboveTokens !== void 0 && historyTokensTotal > summarizeAboveTokens;
273
273
  if (!countThresholdHit && !tokenThresholdHit) {
274
274
  return { value: ir, mutations: [], historyTokensTotal };
275
275
  }
276
- const cutIndex = history.length - keepRecent;
277
- const old = history.slice(0, cutIndex);
278
- const recent = history.slice(cutIndex);
279
- const userTurns = old.filter((m) => m.role === "user");
280
- const firstUserLine = userTurns[0]?.content.split("\n")[0]?.slice(0, 200) ?? "";
281
- const oldTokens = totalHistoryTokens(old);
282
- const trigger = tokenThresholdHit && !countThresholdHit ? "tokens" : "count";
283
- const summary = {
284
- role: "system",
285
- content: `[Earlier conversation: ${old.length} turns omitted (~${oldTokens} tokens). First user message: "${firstUserLine}"]`
286
- };
287
- return {
288
- value: { ...ir, history: [summary, ...recent] },
289
- mutations: [
290
- {
291
- id: `compress-history-${old.length}`,
292
- source: "static_pass",
293
- passName: "compress_history",
294
- description: trigger === "tokens" ? `Compressed ${old.length} old turns (~${oldTokens} tokens) into 1 summary \u2014 token threshold ${summarizeAboveTokens} exceeded (kept ${keepRecent} recent)` : `Compressed ${old.length} old turns into 1 summary (kept ${keepRecent} recent)`
276
+ if (history.length > keepRecent) {
277
+ const cutIndex = history.length - keepRecent;
278
+ const old = history.slice(0, cutIndex);
279
+ const recent = history.slice(cutIndex);
280
+ const userTurns = old.filter((m) => m.role === "user");
281
+ const firstUserLine = userTurns[0]?.content.split("\n")[0]?.slice(0, 200) ?? "";
282
+ const oldTokens = totalHistoryTokens(old);
283
+ const trigger = tokenThresholdHit && !countThresholdHit ? "tokens" : "count";
284
+ const summary = {
285
+ role: "system",
286
+ content: `[Earlier conversation: ${old.length} turns omitted (~${oldTokens} tokens). First user message: "${firstUserLine}"]`
287
+ };
288
+ return {
289
+ value: { ...ir, history: [summary, ...recent] },
290
+ mutations: [
291
+ {
292
+ id: `compress-history-${old.length}`,
293
+ source: "static_pass",
294
+ passName: "compress_history",
295
+ description: trigger === "tokens" ? `Compressed ${old.length} old turns (~${oldTokens} tokens) into 1 summary \u2014 token threshold ${summarizeAboveTokens} exceeded (kept ${keepRecent} recent)` : `Compressed ${old.length} old turns into 1 summary (kept ${keepRecent} recent)`
296
+ }
297
+ ],
298
+ historyTokensTotal
299
+ };
300
+ }
301
+ if (tokenThresholdHit) {
302
+ let fattestIdx = -1;
303
+ let fattestTokens = 0;
304
+ for (let i = 0; i < history.length; i++) {
305
+ const m = history[i];
306
+ if (!m || typeof m.content !== "string") continue;
307
+ const t = countTokens(m.content);
308
+ if (t > fattestTokens) {
309
+ fattestTokens = t;
310
+ fattestIdx = i;
295
311
  }
296
- ],
297
- historyTokensTotal
298
- };
312
+ }
313
+ const FAT_DOMINANCE_FLOOR = 0.3;
314
+ const fattest = fattestIdx >= 0 ? history[fattestIdx] : void 0;
315
+ if (fattest && historyTokensTotal > 0 && fattestTokens / historyTokensTotal >= FAT_DOMINANCE_FLOOR) {
316
+ const firstLine = fattest.content.split("\n")[0]?.slice(0, 200) ?? "";
317
+ const newContent = `[Earlier ${fattest.role} message content omitted: ~${fattestTokens} tokens. Preview: "${firstLine}"]`;
318
+ const newHistory = history.slice();
319
+ newHistory[fattestIdx] = { ...fattest, content: newContent };
320
+ return {
321
+ value: { ...ir, history: newHistory },
322
+ mutations: [
323
+ {
324
+ id: `compress-fat-message-${fattestIdx}`,
325
+ source: "static_pass",
326
+ passName: "compress_history",
327
+ description: `Replaced fat ${fattest.role} message #${fattestIdx} content (~${fattestTokens} of ${historyTokensTotal} tokens, ${Math.round(fattestTokens / historyTokensTotal * 100)}% of history) with summary stub \u2014 token threshold ${summarizeAboveTokens} exceeded (history.length ${history.length} <= keepRecent ${keepRecent}, slice not possible)`
328
+ }
329
+ ],
330
+ historyTokensTotal
331
+ };
332
+ }
333
+ }
334
+ return { value: ir, mutations: [], historyTokensTotal };
299
335
  }
300
336
  function passApplyCliffs(ir, profile, estimatedInputTokens) {
301
337
  const mutations = [];
@@ -1438,140 +1474,554 @@ var PROFILES_RAW = [
1438
1474
  hunt: 4
1439
1475
  // sequential tools — same as V4-Flash
1440
1476
  }
1441
- }
1442
- ];
1443
- var ALIASES = {
1444
- // DeepSeek's own model routing both names served by V4-Flash.
1445
- "deepseek-chat": "deepseek-v4-flash",
1446
- "deepseek-reasoner": "deepseek-v4-flash",
1447
- // Legacy kgauto typo actual API alias is dash-form (alpha.1 had dot).
1448
- "claude-haiku-4.5": "claude-haiku-4-5"
1449
- };
1450
- var brainHook = {};
1451
- function _setProfileBrainHook(hook) {
1452
- brainHook = hook;
1453
- }
1454
- function canonicalId(id) {
1455
- return brainHook.resolveAlias?.(id) ?? ALIASES[id] ?? id;
1456
- }
1457
- var PROFILE_INDEX = new Map(
1458
- PROFILES_RAW.map((p) => [p.id, p])
1459
- );
1460
- function getProfile(id) {
1461
- const canonical = canonicalId(id);
1462
- const fromBrain = brainHook.getProfile?.(canonical);
1463
- if (fromBrain) return fromBrain;
1464
- const p = PROFILE_INDEX.get(canonical);
1465
- if (!p) {
1466
- const known = [...PROFILE_INDEX.keys(), ...Object.keys(ALIASES)].join(", ");
1467
- throw new Error(`Unknown model id: "${id}". Known: ${known}`);
1468
- }
1469
- return p;
1470
- }
1471
- function tryGetProfile(id) {
1472
- const canonical = canonicalId(id);
1473
- return brainHook.getProfile?.(canonical) ?? PROFILE_INDEX.get(canonical);
1474
- }
1475
- function allProfiles() {
1476
- return PROFILES_RAW;
1477
- }
1478
- function allProfilesRaw() {
1479
- return PROFILES_RAW;
1480
- }
1481
- function profilesByProvider(provider) {
1482
- return PROFILES_RAW.filter((p) => p.provider === provider);
1483
- }
1484
-
1485
- // src/advisor.ts
1486
- function runAdvisor(ir, result, profile, policy) {
1487
- const out = [];
1488
- out.push(...detectCachingOff(ir, profile));
1489
- out.push(...detectSingleChunkSystem(ir, profile));
1490
- out.push(...detectToolBloat(ir, result));
1491
- out.push(...detectHistoryUncached(ir, profile));
1492
- out.push(...detectSingleModelArray(ir, policy));
1493
- return out;
1494
- }
1495
- function detectCachingOff(ir, profile) {
1496
- if (profile.provider !== "anthropic") return [];
1497
- const totalChars = ir.sections.reduce((s, sec) => s + sec.text.length, 0);
1498
- if (totalChars < 2e3) return [];
1499
- const anyCacheable = ir.sections.some((s) => s.cacheable === true);
1500
- if (anyCacheable) return [];
1501
- return [
1502
- {
1503
- level: "warn",
1504
- code: "caching-off-on-claude",
1505
- message: `System prompt is ${totalChars} chars on Anthropic but no PromptSection has cacheable=true. Anthropic prompt caching cuts cached-prefix input cost by ~90% on subsequent calls; without it, every turn re-pays full price for the static system context.`,
1506
- suggestion: "Mark stable system sections (role, persona, tool policy) with `cacheable: true`. The lowering pass concatenates cacheable sections into a single cache-controlled block before the dynamic ones.",
1507
- docsUrl: "https://github.com/stue/command-center/blob/main/interfaces/kgauto.md#best-practice-advisories"
1508
- }
1509
- ];
1510
- }
1511
- function detectSingleChunkSystem(ir, profile) {
1512
- if (profile.provider !== "anthropic") return [];
1513
- if (ir.sections.length !== 1) return [];
1514
- const only = ir.sections[0];
1515
- if (!only || only.text.length <= 1e3) return [];
1516
- return [
1517
- {
1518
- level: "info",
1519
- code: "single-chunk-system",
1520
- message: `System prompt is a single ${only.text.length}-char chunk. Splitting into NamedChunks (static role/persona vs dynamic context) gives the lowering pass a finer cache-marker boundary \u2014 only the static portion needs to be byte-stable for the cache to hit.`,
1521
- suggestion: "Refactor the system builder to return an array of `PromptSection` shaped { id, text, cacheable?: boolean }. Static chunks (role, persona, tool policy) get `cacheable: true`; dynamic ones (current context, today's date) don't.",
1522
- docsUrl: "https://github.com/stue/command-center/blob/main/interfaces/kgauto.md#best-practice-advisories"
1477
+ },
1478
+ // ── OpenAI ──
1479
+ // alpha.16 (2026-05-17): close the half-supported provider gap. env.ts
1480
+ // already registered OPENAI_API_KEY + executeOpenAI + normalizeOpenAILike
1481
+ // + lowerOpenAI all existed; profile entries were missing, so the
1482
+ // alpha.10 auto-filter would mark openai-keyed models reachable but
1483
+ // there were no profiles to filter IN. Half-supported is now fully
1484
+ // supported. PB request `openai-provider-profiles` (2026-05-16).
1485
+ //
1486
+ // Profile data verified against developers.openai.com/api/docs/pricing
1487
+ // + per-model pages 2026-05-17. L-049/L-081 step-zero: no AI-trained
1488
+ // numbers — fetched live from OpenAI's docs. As of 2026-05, OpenAI's
1489
+ // current flagship is gpt-5.5 (2025-12 cutoff); gpt-5.4-{base,mini,nano}
1490
+ // are the workhorse family. gpt-4.1 + gpt-4o are legacy.
1491
+ //
1492
+ // Both 5.5 and 5.4 carry a 272K input-token pricing cliff (2x input,
1493
+ // 1.5x output beyond that). Modeled as a `downgrade_quality_warning`
1494
+ // cliff because it ranks the model down at large-context shapes — the
1495
+ // semantics of "this model is now 2x more expensive" map onto the
1496
+ // existing penalty mechanism. Cost-watcher will catch high-context
1497
+ // spikes empirically; the cliff prevents naive routing into the doubled
1498
+ // pricing zone.
1499
+ {
1500
+ id: "gpt-5.5",
1501
+ verifiedAgainstDocs: "2026-05-17",
1502
+ provider: "openai",
1503
+ status: "current",
1504
+ maxContextTokens: 105e4,
1505
+ maxOutputTokens: 128e3,
1506
+ maxTools: 64,
1507
+ parallelToolCalls: true,
1508
+ structuredOutput: "native",
1509
+ systemPromptMode: "inline",
1510
+ streaming: true,
1511
+ cliffs: [
1512
+ {
1513
+ metric: "input_tokens",
1514
+ threshold: 272e3,
1515
+ action: "downgrade_quality_warning",
1516
+ reason: "OpenAI pricing tier shift: >272K input tokens billed at 2x input + 1.5x output rates"
1517
+ }
1518
+ ],
1519
+ costInputPer1m: 5,
1520
+ costOutputPer1m: 30,
1521
+ lowering: {
1522
+ system: { mode: "inline" },
1523
+ // OpenAI caching is implicit (auto-applied to repeated prefixes
1524
+ // ≥1024 tokens for prompt_tokens_details.cached_tokens). No
1525
+ // wire-format marker. Discount: 10x for cached input ($0.50/$5.00).
1526
+ cache: { strategy: "unsupported", minTokens: 1024, discount: 0.1 },
1527
+ tools: { format: "openai" }
1528
+ },
1529
+ recovery: [
1530
+ { signal: "rate_limit", action: "escalate", reason: "429 \u2014 escalate to fallback chain" },
1531
+ { signal: "model_not_found", action: "escalate", reason: "Model deprecated/renamed \u2014 escalate (L-061)" }
1532
+ ],
1533
+ strengths: ["reasoning", "agentic_coding", "long_context", "structured_output", "reliable_tool_use", "reasoning_effort_knob"],
1534
+ weaknesses: ["cost", "pricing_cliff_at_272k"],
1535
+ notes: "OpenAI frontier (2026-05). 1M context (1.05M total), 128K max output, 2025-12 cutoff. Reasoning effort knob (none/low/medium/high/xhigh). Pricing cliff at 272K input.",
1536
+ // Frontier-tier perf hypothesis. Anchored to Opus 4.7 row (similar
1537
+ // price/positioning). Brain evidence will refine; no telemetry yet.
1538
+ archetypePerf: {
1539
+ critique: 9,
1540
+ plan: 9,
1541
+ generate: 9,
1542
+ ask: 9,
1543
+ extract: 9,
1544
+ transform: 9,
1545
+ hunt: 8,
1546
+ // parallel tool support good but cliff at 272K hurts deep multi-step
1547
+ summarize: 7,
1548
+ // overkill for tolerant archetype
1549
+ classify: 7
1550
+ // overkill; cheaper models cover this
1523
1551
  }
1524
- ];
1525
- }
1526
- function detectToolBloat(ir, result) {
1527
- const SHORT_OUTPUT = /* @__PURE__ */ new Set([
1528
- "classify",
1529
- "extract",
1530
- "summarize",
1531
- "transform",
1532
- "critique"
1533
- ]);
1534
- if (!ir.tools || ir.tools.length === 0) return [];
1535
- const toolsKept = result.diagnostics.toolsKept;
1536
- if (toolsKept <= 10) return [];
1537
- if (!SHORT_OUTPUT.has(ir.intent.archetype)) return [];
1538
- return [
1539
- {
1540
- level: "warn",
1541
- code: "tool-bloat",
1542
- message: `${toolsKept} tools kept after the relevance pass for archetype="${ir.intent.archetype}" (consumer declared ${ir.tools.length}). This archetype is short-output and rarely needs more than 3 tools; each tool definition eats ~350 tokens of context budget.`,
1543
- suggestion: "Tighten `relevanceByIntent: { [archetype]: 0..1 }` per ToolDefinition. Tools below `toolRelevanceThreshold` (default 0.2) get dropped. Without `relevanceByIntent`, every tool defaults to neutral (0.5) and stays.",
1544
- docsUrl: "https://github.com/stue/kgauto/blob/main/v2/README.md#tools"
1552
+ },
1553
+ {
1554
+ id: "gpt-5.4",
1555
+ verifiedAgainstDocs: "2026-05-17",
1556
+ provider: "openai",
1557
+ status: "current",
1558
+ maxContextTokens: 105e4,
1559
+ maxOutputTokens: 128e3,
1560
+ maxTools: 64,
1561
+ parallelToolCalls: true,
1562
+ structuredOutput: "native",
1563
+ systemPromptMode: "inline",
1564
+ streaming: true,
1565
+ cliffs: [
1566
+ {
1567
+ metric: "input_tokens",
1568
+ threshold: 272e3,
1569
+ action: "downgrade_quality_warning",
1570
+ reason: "OpenAI pricing tier shift: >272K input tokens billed at 2x input + 1.5x output rates"
1571
+ }
1572
+ ],
1573
+ costInputPer1m: 2.5,
1574
+ costOutputPer1m: 15,
1575
+ lowering: {
1576
+ system: { mode: "inline" },
1577
+ cache: { strategy: "unsupported", minTokens: 1024, discount: 0.1 },
1578
+ tools: { format: "openai" }
1579
+ },
1580
+ recovery: [
1581
+ { signal: "rate_limit", action: "escalate", reason: "429 \u2014 escalate to fallback chain" },
1582
+ { signal: "model_not_found", action: "escalate", reason: "Model deprecated/renamed \u2014 escalate (L-061)" }
1583
+ ],
1584
+ strengths: ["reasoning", "long_context", "structured_output", "reliable_tool_use"],
1585
+ weaknesses: ["pricing_cliff_at_272k"],
1586
+ notes: "OpenAI workhorse (2026-05). 1M context (1.05M total), 128K max output, 2025-08 cutoff. Pricing cliff at 272K input. Pairs cleanly with Sonnet 4.6 on cost ($2.50/$15.00 vs $3.00/$15.00).",
1587
+ // Anchored to Sonnet 4.6 row (similar price/positioning). Slight
1588
+ // anthropic-side edge on agentic coding per master plan vibe.
1589
+ archetypePerf: {
1590
+ critique: 8,
1591
+ plan: 8,
1592
+ generate: 8,
1593
+ ask: 8,
1594
+ extract: 8,
1595
+ transform: 8,
1596
+ hunt: 7,
1597
+ summarize: 7,
1598
+ classify: 7
1545
1599
  }
1546
- ];
1547
- }
1548
- function detectHistoryUncached(ir, profile) {
1549
- if (profile.provider !== "anthropic") return [];
1550
- if (!ir.history || ir.history.length < 2) return [];
1551
- if (ir.historyCachePolicy && ir.historyCachePolicy.strategy !== "none") {
1552
- return [];
1553
- }
1554
- return [
1555
- {
1556
- level: "warn",
1557
- code: "history-uncached-on-claude",
1558
- message: `${ir.history.length} history messages on Anthropic with no historyCachePolicy. Every turn re-pays for the full conversation context; with caching, subsequent turns hit the cache at ~10% the input cost.`,
1559
- suggestion: "Set `historyCachePolicy: { strategy: 'all-but-latest' }` on this IR. The lowering pass marks the message immediately preceding currentTurn with cache_control; subsequent turns whose history prefix matches byte-for-byte hit the cache.",
1560
- docsUrl: "https://github.com/stue/command-center/blob/main/interfaces/kgauto.md#best-practice-advisories"
1600
+ },
1601
+ {
1602
+ id: "gpt-5.4-mini",
1603
+ verifiedAgainstDocs: "2026-05-17",
1604
+ provider: "openai",
1605
+ status: "current",
1606
+ maxContextTokens: 4e5,
1607
+ maxOutputTokens: 128e3,
1608
+ maxTools: 64,
1609
+ parallelToolCalls: true,
1610
+ structuredOutput: "native",
1611
+ systemPromptMode: "inline",
1612
+ streaming: true,
1613
+ cliffs: [],
1614
+ costInputPer1m: 0.75,
1615
+ costOutputPer1m: 4.5,
1616
+ lowering: {
1617
+ system: { mode: "inline" },
1618
+ cache: { strategy: "unsupported", minTokens: 1024, discount: 0.1 },
1619
+ tools: { format: "openai" }
1620
+ },
1621
+ recovery: [
1622
+ { signal: "rate_limit", action: "escalate", reason: "429 \u2014 escalate to fallback chain" },
1623
+ { signal: "model_not_found", action: "escalate", reason: "Model deprecated/renamed \u2014 escalate (L-061)" }
1624
+ ],
1625
+ strengths: ["cost", "speed", "agentic_coding", "structured_output", "reliable_tool_use"],
1626
+ weaknesses: ["reasoning_depth"],
1627
+ notes: "OpenAI mini-tier (2026-05). 400K context, 128K max output, 2025-08 cutoff. OpenAI describes as 'strongest mini model for coding, computer use, subagents.' Cache discount 10x ($0.075 input).",
1628
+ // Mini-tier hypothesis. Anchored to Haiku 4.5 + Flash row pricing.
1629
+ // Cost is slightly higher than Haiku ($0.75 vs $0.50 input) but
1630
+ // OpenAI claims strong coding/subagent perf.
1631
+ archetypePerf: {
1632
+ ask: 7,
1633
+ generate: 7,
1634
+ extract: 7,
1635
+ transform: 7,
1636
+ classify: 7,
1637
+ summarize: 7,
1638
+ hunt: 7,
1639
+ plan: 6,
1640
+ critique: 5
1641
+ // reasoning depth gap — frontier models handle this
1561
1642
  }
1562
- ];
1563
- }
1564
- function detectSingleModelArray(ir, policy) {
1565
- if (ir.models.length !== 1) return [];
1566
- if (policy?.posture === "locked") return [];
1567
- const only = ir.models[0];
1568
- return [
1569
- {
1570
- level: "warn",
1571
- code: "single-model-array",
1572
- message: `\`ir.models\` has length 1 (only "${only}") and posture is not 'locked'. A single-model chain has no safety net \u2014 the first 429 / 5xx / cliff hits the user as a failure. Master plan \xA71.2 closes the reliability gap with a 2-step minimum.`,
1573
- suggestion: "Use `getDefaultFallbackChain({ archetype: ir.intent.archetype, primary: '" + only + "', posture: 'preferred' })` for a user-anchored chain, or `getDefaultFallbackChain({ archetype, posture: 'open' })` for library-picked. If single-model is intentional (compliance/brand promise), set `policy.posture = 'locked'` to silence this rule.",
1574
- docsUrl: "https://github.com/stue/command-center/blob/main/interfaces/kgauto.md#single-model-array"
1643
+ },
1644
+ {
1645
+ id: "gpt-5.4-nano",
1646
+ verifiedAgainstDocs: "2026-05-17",
1647
+ provider: "openai",
1648
+ status: "current",
1649
+ maxContextTokens: 4e5,
1650
+ maxOutputTokens: 128e3,
1651
+ maxTools: 64,
1652
+ parallelToolCalls: true,
1653
+ structuredOutput: "native",
1654
+ systemPromptMode: "inline",
1655
+ streaming: true,
1656
+ cliffs: [],
1657
+ costInputPer1m: 0.2,
1658
+ costOutputPer1m: 1.25,
1659
+ lowering: {
1660
+ system: { mode: "inline" },
1661
+ cache: { strategy: "unsupported", minTokens: 1024, discount: 0.1 },
1662
+ tools: { format: "openai" }
1663
+ },
1664
+ recovery: [
1665
+ { signal: "rate_limit", action: "escalate", reason: "429 \u2014 escalate to fallback chain" },
1666
+ { signal: "model_not_found", action: "escalate", reason: "Model deprecated/renamed \u2014 escalate (L-061)" }
1667
+ ],
1668
+ strengths: ["cost", "speed", "volume", "structured_output"],
1669
+ weaknesses: ["reasoning_depth", "no_computer_use"],
1670
+ notes: "OpenAI nano-tier (2026-05). 400K context, 128K max output, 2025-08 cutoff. 'Cheapest GPT-5.4-class for simple high-volume tasks.' No fine-tuning, no computer-use tools. Cache discount 10x.",
1671
+ // Nano-tier. Anchored to Flash-Lite row ($0.10/$0.40 vs nano's
1672
+ // $0.20/$1.25). Slightly more expensive than Flash-Lite but with
1673
+ // OpenAI brand reliability. Good fit for classify/summarize floor.
1674
+ archetypePerf: {
1675
+ classify: 7,
1676
+ summarize: 6,
1677
+ ask: 6,
1678
+ transform: 6,
1679
+ extract: 6,
1680
+ generate: 5,
1681
+ hunt: 5,
1682
+ plan: 4,
1683
+ critique: 3
1684
+ // not for reasoning archetypes
1685
+ }
1686
+ },
1687
+ // ── Auto-onboarded (UNVERIFIED) ──
1688
+ // Cloned by scripts/auto-onboard-models.mjs from a same-family template.
1689
+ // Each entry's pricing/context/cliffs/lowering reflects the template, NOT
1690
+ // provider docs. Verify before promoting status to 'current' (L-049/L-081).
1691
+ {
1692
+ id: "gemini-3-flash-preview",
1693
+ verifiedAgainstDocs: "UNVERIFIED-AUTO-ONBOARD",
1694
+ provider: "google",
1695
+ status: "preview",
1696
+ maxContextTokens: 1048576,
1697
+ maxOutputTokens: 65535,
1698
+ maxTools: 128,
1699
+ parallelToolCalls: true,
1700
+ structuredOutput: "native",
1701
+ systemPromptMode: "separate",
1702
+ streaming: true,
1703
+ cliffs: [
1704
+ {
1705
+ metric: "input_tokens",
1706
+ threshold: 8e3,
1707
+ action: "downgrade_quality_warning",
1708
+ reason: "Quality degrades significantly above ~8K context tokens"
1709
+ },
1710
+ {
1711
+ metric: "tool_count",
1712
+ threshold: 20,
1713
+ action: "drop_to_top_relevant",
1714
+ reason: "Tool reliability drops above ~20 tools (despite 128 hard limit)"
1715
+ },
1716
+ {
1717
+ metric: "thinking_with_short_output",
1718
+ threshold: 1,
1719
+ action: "force_thinking_budget_zero",
1720
+ reason: "Thinking tokens consume maxOutputTokens \u2014 empty response if drained"
1721
+ },
1722
+ {
1723
+ // s11 trust artifact (2026-05-02): brain showed 5/5 empty rate on
1724
+ // tt-intelligence/summarize/gemini-2.5-flash with tools offered.
1725
+ // v1's disable_thinking_for_short_output already fired and didn't
1726
+ // help — disabling thinking is necessary but not sufficient. Tools
1727
+ // present + summarize intent confuses Flash into a no-output state
1728
+ // (likely tool-decision purgatory). Strip tools entirely for this
1729
+ // archetype on this model.
1730
+ metric: "tool_count",
1731
+ threshold: 1,
1732
+ whenIntent: "summarize",
1733
+ action: "strip_tools",
1734
+ reason: "Gemini Flash returns empty when summarize intent has tools offered (5/5 empty rate observed in v1 prod 2026-04-19, replayed into v2 brain 2026-04-29)"
1735
+ }
1736
+ ],
1737
+ costInputPer1m: 0.3,
1738
+ costOutputPer1m: 2.5,
1739
+ lowering: {
1740
+ ...GOOGLE_LOWERING_BASE,
1741
+ thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
1742
+ },
1743
+ recovery: [
1744
+ {
1745
+ signal: "empty_response_after_tool",
1746
+ action: "retry_with_params",
1747
+ retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
1748
+ maxRetries: 1,
1749
+ reason: "Known: empty after tool result \u2014 retry with thinking off"
1750
+ },
1751
+ {
1752
+ signal: "empty_response",
1753
+ action: "retry_with_params",
1754
+ retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
1755
+ maxRetries: 1,
1756
+ reason: "Empty response \u2014 try with thinking off"
1757
+ },
1758
+ {
1759
+ signal: "malformed_function_call",
1760
+ action: "escalate",
1761
+ reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target"
1762
+ }
1763
+ ],
1764
+ strengths: ["speed", "volume", "classification", "1m_context", "cost"],
1765
+ weaknesses: ["complex_schemas", "large_tool_sets", "high_context_quality"],
1766
+ notes: "AUTO-ONBOARDED 2026-05-16 from `gemini-2.5-flash`. Pricing, context, cliffs are template-cloned and UNVERIFIED \u2014 confirm against provider docs before promoting status to 'current'.",
1767
+ // Master plan §6.2 anchor. Tier 0 for hunt (parallel tool throughput
1768
+ // 15-75 calls/step beats Sonnet — L-040), summarize, classify.
1769
+ archetypePerf: {
1770
+ hunt: 9,
1771
+ // L-040: parallel tool throughput 15-75/step
1772
+ classify: 7,
1773
+ // brain-validated, 218 rows
1774
+ summarize: 7,
1775
+ // brain-validated; cliff strips tools when present
1776
+ transform: 7,
1777
+ ask: 7,
1778
+ generate: 6,
1779
+ plan: 5,
1780
+ extract: 6,
1781
+ // alpha.8 MAX_TOKENS history on structured output
1782
+ critique: 4
1783
+ // reasoning shallower than Sonnet/Opus
1784
+ }
1785
+ },
1786
+ {
1787
+ // ── Gemini 2.5 Flash-Lite ──
1788
+ // Onboarded 2026-05-13 (s22) after the model-release watcher surfaced
1789
+ // it as a UNREGISTERED + NEW candidate. Released by Google July 2025,
1790
+ // stable. Positioned BELOW Flash on the cost/perf frontier:
1791
+ // input $0.10/M (Flash $0.30/M) — 3× cheaper
1792
+ // output $0.40/M (Flash $2.50/M) — 6× cheaper
1793
+ // cache $0.01/M — 1/10 of input (vs Flash 0.25 discount)
1794
+ // Cliffs are HYPOTHESIZED from Flash's known failure modes — Flash-Lite
1795
+ // is a smaller sibling, so we inherit Flash's cliffs at equal-or-tighter
1796
+ // thresholds. The brain will validate/relax these as evidence accumulates
1797
+ // per (archetype, model) tuple. Currently ZERO brain rows for this model.
1798
+ id: "gemini-3.1-flash-lite",
1799
+ verifiedAgainstDocs: "UNVERIFIED-AUTO-ONBOARD",
1800
+ provider: "google",
1801
+ status: "preview",
1802
+ maxContextTokens: 1048576,
1803
+ maxOutputTokens: 65536,
1804
+ maxTools: 128,
1805
+ parallelToolCalls: true,
1806
+ structuredOutput: "native",
1807
+ systemPromptMode: "separate",
1808
+ streaming: true,
1809
+ cliffs: [
1810
+ {
1811
+ metric: "input_tokens",
1812
+ threshold: 8e3,
1813
+ action: "downgrade_quality_warning",
1814
+ reason: "Inherited from Flash: quality degrades above ~8K. Smaller model \u2014 likely degrades faster. Re-tune from brain after n\u226520."
1815
+ },
1816
+ {
1817
+ metric: "tool_count",
1818
+ threshold: 10,
1819
+ action: "drop_to_top_relevant",
1820
+ reason: "Conservative: Flash drops at 20, Flash-Lite is smaller \u2014 assume tighter ceiling until brain proves otherwise."
1821
+ },
1822
+ {
1823
+ metric: "thinking_with_short_output",
1824
+ threshold: 1,
1825
+ action: "force_thinking_budget_zero",
1826
+ reason: "Thinking enabled per Google API (thinking: true). Same drain risk as Flash \u2014 thinking tokens consume maxOutputTokens."
1827
+ },
1828
+ {
1829
+ // Strong prior: Flash hit 5/5 empty rate on summarize+tools (s11
1830
+ // trust artifact, kgauto commit 3872832). Flash-Lite shares the
1831
+ // same architectural family — almost certainly inherits this cliff.
1832
+ // Ship the guard preemptively; brain telemetry confirms or relaxes.
1833
+ metric: "tool_count",
1834
+ threshold: 1,
1835
+ whenIntent: "summarize",
1836
+ action: "strip_tools",
1837
+ reason: "Inherited from Flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on Flash-Lite specifically."
1838
+ }
1839
+ ],
1840
+ costInputPer1m: 0.1,
1841
+ costOutputPer1m: 0.4,
1842
+ lowering: {
1843
+ ...GOOGLE_LOWERING_BASE,
1844
+ // Cache discount 10× (vs Flash 4×) — Google's spec is $0.01/M cache vs
1845
+ // $0.10/M input. Material for repeat-prompt workloads (classify shape).
1846
+ cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
1847
+ thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
1848
+ },
1849
+ recovery: [
1850
+ {
1851
+ signal: "empty_response_after_tool",
1852
+ action: "retry_with_params",
1853
+ retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
1854
+ maxRetries: 1,
1855
+ reason: "Known on Flash family: empty after tool result \u2014 retry with thinking off."
1856
+ },
1857
+ {
1858
+ signal: "empty_response",
1859
+ action: "retry_with_params",
1860
+ retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
1861
+ maxRetries: 1,
1862
+ reason: "Empty response \u2014 try with thinking off."
1863
+ },
1864
+ {
1865
+ signal: "malformed_function_call",
1866
+ action: "escalate",
1867
+ reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target."
1868
+ }
1869
+ ],
1870
+ strengths: ["lowest_cost", "speed", "volume", "classification", "summarize", "1m_context", "cache_friendly"],
1871
+ weaknesses: ["complex_reasoning", "large_tool_sets", "complex_schemas", "structured_output_unproven", "long_context_quality"],
1872
+ notes: "AUTO-ONBOARDED 2026-05-16 from `gemini-2.5-flash-lite`. Pricing, context, cliffs are template-cloned and UNVERIFIED \u2014 confirm against provider docs before promoting status to 'current'.",
1873
+ // Tier 3 emergency floor for summarize/classify chains. ZERO brain
1874
+ // rows — all values are starter hypotheses anchored to "smaller
1875
+ // sibling of Flash, at-or-below Flash perf on every archetype." The
1876
+ // first 50 brain rows per archetype will validate or relax these.
1877
+ archetypePerf: {
1878
+ classify: 6,
1879
+ // starter hypothesis — verify (Flash is 7, lite likely ≤)
1880
+ summarize: 6,
1881
+ // starter hypothesis — verify; cliff strips tools
1882
+ transform: 6,
1883
+ // starter hypothesis — verify
1884
+ ask: 5,
1885
+ hunt: 5,
1886
+ generate: 4,
1887
+ extract: 4,
1888
+ plan: 3,
1889
+ critique: 3
1890
+ }
1891
+ }
1892
+ ];
1893
+ var ALIASES = {
1894
+ // DeepSeek's own model routing — both names served by V4-Flash.
1895
+ "deepseek-chat": "deepseek-v4-flash",
1896
+ "deepseek-reasoner": "deepseek-v4-flash",
1897
+ // Legacy kgauto typo — actual API alias is dash-form (alpha.1 had dot).
1898
+ "claude-haiku-4.5": "claude-haiku-4-5"
1899
+ };
1900
+ var brainHook = {};
1901
+ function _setProfileBrainHook(hook) {
1902
+ brainHook = hook;
1903
+ }
1904
+ function canonicalId(id) {
1905
+ return brainHook.resolveAlias?.(id) ?? ALIASES[id] ?? id;
1906
+ }
1907
+ var PROFILE_INDEX = new Map(
1908
+ PROFILES_RAW.map((p) => [p.id, p])
1909
+ );
1910
+ function getProfile(id) {
1911
+ const canonical = canonicalId(id);
1912
+ const fromBrain = brainHook.getProfile?.(canonical);
1913
+ if (fromBrain) return fromBrain;
1914
+ const p = PROFILE_INDEX.get(canonical);
1915
+ if (!p) {
1916
+ const known = [...PROFILE_INDEX.keys(), ...Object.keys(ALIASES)].join(", ");
1917
+ throw new Error(`Unknown model id: "${id}". Known: ${known}`);
1918
+ }
1919
+ return p;
1920
+ }
1921
+ function tryGetProfile(id) {
1922
+ const canonical = canonicalId(id);
1923
+ return brainHook.getProfile?.(canonical) ?? PROFILE_INDEX.get(canonical);
1924
+ }
1925
+ function allProfiles() {
1926
+ return PROFILES_RAW;
1927
+ }
1928
+ function allProfilesRaw() {
1929
+ return PROFILES_RAW;
1930
+ }
1931
+ function profilesByProvider(provider) {
1932
+ return PROFILES_RAW.filter((p) => p.provider === provider);
1933
+ }
1934
+
1935
+ // src/advisor.ts
1936
+ function runAdvisor(ir, result, profile, policy) {
1937
+ const out = [];
1938
+ out.push(...detectCachingOff(ir, profile));
1939
+ out.push(...detectSingleChunkSystem(ir, profile));
1940
+ out.push(...detectToolBloat(ir, result));
1941
+ out.push(...detectHistoryUncached(ir, profile));
1942
+ out.push(...detectSingleModelArray(ir, policy));
1943
+ return out;
1944
+ }
1945
+ function detectCachingOff(ir, profile) {
1946
+ if (profile.provider !== "anthropic") return [];
1947
+ const totalChars = ir.sections.reduce((s, sec) => s + sec.text.length, 0);
1948
+ if (totalChars < 2e3) return [];
1949
+ const anyCacheable = ir.sections.some((s) => s.cacheable === true);
1950
+ if (anyCacheable) return [];
1951
+ return [
1952
+ {
1953
+ level: "warn",
1954
+ code: "caching-off-on-claude",
1955
+ message: `System prompt is ${totalChars} chars on Anthropic but no PromptSection has cacheable=true. Anthropic prompt caching cuts cached-prefix input cost by ~90% on subsequent calls; without it, every turn re-pays full price for the static system context.`,
1956
+ suggestion: "Mark stable system sections (role, persona, tool policy) with `cacheable: true`. The lowering pass concatenates cacheable sections into a single cache-controlled block before the dynamic ones.",
1957
+ docsUrl: "https://github.com/stue/command-center/blob/main/interfaces/kgauto.md#best-practice-advisories"
1958
+ }
1959
+ ];
1960
+ }
1961
+ function detectSingleChunkSystem(ir, profile) {
1962
+ if (profile.provider !== "anthropic") return [];
1963
+ if (ir.sections.length !== 1) return [];
1964
+ const only = ir.sections[0];
1965
+ if (!only || only.text.length <= 1e3) return [];
1966
+ return [
1967
+ {
1968
+ level: "info",
1969
+ code: "single-chunk-system",
1970
+ message: `System prompt is a single ${only.text.length}-char chunk. Splitting into NamedChunks (static role/persona vs dynamic context) gives the lowering pass a finer cache-marker boundary \u2014 only the static portion needs to be byte-stable for the cache to hit.`,
1971
+ suggestion: "Refactor the system builder to return an array of `PromptSection` shaped { id, text, cacheable?: boolean }. Static chunks (role, persona, tool policy) get `cacheable: true`; dynamic ones (current context, today's date) don't.",
1972
+ docsUrl: "https://github.com/stue/command-center/blob/main/interfaces/kgauto.md#best-practice-advisories"
1973
+ }
1974
+ ];
1975
+ }
1976
+ function detectToolBloat(ir, result) {
1977
+ const SHORT_OUTPUT = /* @__PURE__ */ new Set([
1978
+ "classify",
1979
+ "extract",
1980
+ "summarize",
1981
+ "transform",
1982
+ "critique"
1983
+ ]);
1984
+ if (!ir.tools || ir.tools.length === 0) return [];
1985
+ const toolsKept = result.diagnostics.toolsKept;
1986
+ if (toolsKept <= 10) return [];
1987
+ if (!SHORT_OUTPUT.has(ir.intent.archetype)) return [];
1988
+ return [
1989
+ {
1990
+ level: "warn",
1991
+ code: "tool-bloat",
1992
+ message: `${toolsKept} tools kept after the relevance pass for archetype="${ir.intent.archetype}" (consumer declared ${ir.tools.length}). This archetype is short-output and rarely needs more than 3 tools; each tool definition eats ~350 tokens of context budget.`,
1993
+ suggestion: "Tighten `relevanceByIntent: { [archetype]: 0..1 }` per ToolDefinition. Tools below `toolRelevanceThreshold` (default 0.2) get dropped. Without `relevanceByIntent`, every tool defaults to neutral (0.5) and stays.",
1994
+ docsUrl: "https://github.com/stue/kgauto/blob/main/v2/README.md#tools"
1995
+ }
1996
+ ];
1997
+ }
1998
+ function detectHistoryUncached(ir, profile) {
1999
+ if (profile.provider !== "anthropic") return [];
2000
+ if (!ir.history || ir.history.length < 2) return [];
2001
+ if (ir.historyCachePolicy && ir.historyCachePolicy.strategy !== "none") {
2002
+ return [];
2003
+ }
2004
+ return [
2005
+ {
2006
+ level: "warn",
2007
+ code: "history-uncached-on-claude",
2008
+ message: `${ir.history.length} history messages on Anthropic with no historyCachePolicy. Every turn re-pays for the full conversation context; with caching, subsequent turns hit the cache at ~10% the input cost.`,
2009
+ suggestion: "Set `historyCachePolicy: { strategy: 'all-but-latest' }` on this IR. The lowering pass marks the message immediately preceding currentTurn with cache_control; subsequent turns whose history prefix matches byte-for-byte hit the cache.",
2010
+ docsUrl: "https://github.com/stue/command-center/blob/main/interfaces/kgauto.md#best-practice-advisories"
2011
+ }
2012
+ ];
2013
+ }
2014
+ function detectSingleModelArray(ir, policy) {
2015
+ if (ir.models.length !== 1) return [];
2016
+ if (policy?.posture === "locked") return [];
2017
+ const only = ir.models[0];
2018
+ return [
2019
+ {
2020
+ level: "warn",
2021
+ code: "single-model-array",
2022
+ message: `\`ir.models\` has length 1 (only "${only}") and posture is not 'locked'. A single-model chain has no safety net \u2014 the first 429 / 5xx / cliff hits the user as a failure. Master plan \xA71.2 closes the reliability gap with a 2-step minimum.`,
2023
+ suggestion: "Use `getDefaultFallbackChain({ archetype: ir.intent.archetype, primary: '" + only + "', posture: 'preferred' })` for a user-anchored chain, or `getDefaultFallbackChain({ archetype, posture: 'open' })` for library-picked. If single-model is intentional (compliance/brand promise), set `policy.posture = 'locked'` to silence this rule.",
2024
+ docsUrl: "https://github.com/stue/command-center/blob/main/interfaces/kgauto.md#single-model-array"
1575
2025
  }
1576
2026
  ];
1577
2027
  }
@@ -2245,47 +2695,234 @@ function classifyHttpError(status, body) {
2245
2695
  if (status === 429) {
2246
2696
  return { ok: false, status, errorType: "retryable", errorCode: "rate_limit", message, raw: body };
2247
2697
  }
2248
- if (status === 408) {
2249
- return { ok: false, status, errorType: "retryable", errorCode: "timeout", message, raw: body };
2698
+ if (status === 408) {
2699
+ return { ok: false, status, errorType: "retryable", errorCode: "timeout", message, raw: body };
2700
+ }
2701
+ if (status >= 500) {
2702
+ return { ok: false, status, errorType: "retryable", errorCode: "server_error", message, raw: body };
2703
+ }
2704
+ if (status === 404) {
2705
+ return { ok: false, status, errorType: "retryable", errorCode: "model_not_found", message, raw: body };
2706
+ }
2707
+ if (status === 401 || status === 403) {
2708
+ return { ok: false, status, errorType: "terminal", errorCode: "auth", message, raw: body };
2709
+ }
2710
+ if (status === 400) {
2711
+ return { ok: false, status, errorType: "terminal", errorCode: "invalid_request", message, raw: body };
2712
+ }
2713
+ return { ok: false, status, errorType: "terminal", errorCode: "unknown", message, raw: body };
2714
+ }
2715
+ function extractErrorMessage(body) {
2716
+ if (!body || typeof body !== "object") return void 0;
2717
+ const b = body;
2718
+ if (b.error && typeof b.error === "object") {
2719
+ const e = b.error;
2720
+ if (typeof e.message === "string") return e.message;
2721
+ }
2722
+ if (typeof b.message === "string") return b.message;
2723
+ return void 0;
2724
+ }
2725
+ function terminalError(status, code, message) {
2726
+ return { ok: false, status, errorType: "terminal", errorCode: code, message, raw: null };
2727
+ }
2728
+ function retryableError(status, code, message, raw) {
2729
+ return { ok: false, status, errorType: "retryable", errorCode: code, message, raw };
2730
+ }
2731
+ function tryParseJson(s) {
2732
+ if (typeof s !== "string" || s.length === 0) return void 0;
2733
+ try {
2734
+ const parsed = JSON.parse(s);
2735
+ return typeof parsed === "object" && parsed !== null ? parsed : void 0;
2736
+ } catch {
2737
+ return void 0;
2738
+ }
2739
+ }
2740
+
2741
+ // src/chains-brain.ts
2742
+ function isChainsRow(x) {
2743
+ if (!x || typeof x !== "object") return false;
2744
+ const r = x;
2745
+ return typeof r.archetype === "string" && typeof r.tier === "number" && typeof r.model_id === "string";
2746
+ }
2747
+ function mapRowsToChains(rows) {
2748
+ const grouped = /* @__PURE__ */ new Map();
2749
+ for (const row of rows) {
2750
+ if (!isChainsRow(row)) continue;
2751
+ const list = grouped.get(row.archetype) ?? [];
2752
+ list.push(row);
2753
+ grouped.set(row.archetype, list);
2754
+ }
2755
+ const out = {};
2756
+ for (const [archetype, group] of grouped.entries()) {
2757
+ group.sort((a, b) => a.tier - b.tier);
2758
+ out[archetype] = group.map((r) => r.model_id);
2759
+ }
2760
+ const bundled = getAllStarterChains();
2761
+ for (const archetype of Object.keys(bundled)) {
2762
+ if (!out[archetype]) out[archetype] = bundled[archetype];
2763
+ }
2764
+ return out;
2765
+ }
2766
+ var loadChainsFromBrain = createBrainQueryCache({
2767
+ table: "kgauto_chains",
2768
+ mapRows: mapRowsToChains,
2769
+ bundledFallback: getAllStarterChains
2770
+ });
2771
+
2772
+ // src/fallback.ts
2773
+ var STARTER_CHAINS = {
2774
+ // Reasoning floor — never degrade. Walk UP on 429 to Opus → cross-provider.
2775
+ // alpha.16: gpt-5.5 appended as third-provider critique floor (frontier-tier,
2776
+ // archetypePerf=9). Cross-provider-tail invariant has somewhere to land when
2777
+ // both Anthropic + Google are unreachable (consumer adds only OpenAI key).
2778
+ critique: [
2779
+ "claude-opus-4-7",
2780
+ "claude-sonnet-4-6",
2781
+ "gemini-2.5-pro",
2782
+ "gpt-5.5"
2783
+ ],
2784
+ // Reasoning matters — Sonnet primary; walk UP to Opus on 429 (rare exception
2785
+ // to "always cheaper"); cross-provider via Pro; DeepSeek Pro as tier 3 floor.
2786
+ plan: [
2787
+ "claude-sonnet-4-6",
2788
+ "claude-opus-4-7",
2789
+ "gemini-2.5-pro",
2790
+ "deepseek-v4-pro"
2791
+ ],
2792
+ // Quality + cost match. Walk Sonnet → Haiku same-provider, Pro cross,
2793
+ // gpt-5.4-mini as third-provider tail (alpha.16 — closes the mono-Anthropic
2794
+ // gap when consumer has only ANTHROPIC + OPENAI keys; archetypePerf=7).
2795
+ generate: [
2796
+ "claude-sonnet-4-6",
2797
+ "claude-haiku-4-5",
2798
+ "gemini-2.5-pro",
2799
+ "gpt-5.4-mini"
2800
+ ],
2801
+ ask: [
2802
+ "claude-sonnet-4-6",
2803
+ "claude-haiku-4-5",
2804
+ "gemini-2.5-pro",
2805
+ "gpt-5.4-mini"
2806
+ ],
2807
+ // Structured-output archetype — Flash skipped (alpha.8 MAX_TOKENS cliff),
2808
+ // DeepSeek skipped (no brain evidence). Floor at Haiku. alpha.16: gpt-5.4
2809
+ // appended as third-provider extract floor (archetypePerf=8, native
2810
+ // structured-output support).
2811
+ extract: [
2812
+ "claude-sonnet-4-6",
2813
+ "claude-haiku-4-5",
2814
+ "gemini-2.5-pro",
2815
+ "gpt-5.4"
2816
+ ],
2817
+ // Forgiving archetype — Sonnet primary but Flash safely floors it.
2818
+ transform: [
2819
+ "claude-sonnet-4-6",
2820
+ "claude-haiku-4-5",
2821
+ "gemini-2.5-pro",
2822
+ "gemini-2.5-flash"
2823
+ ],
2824
+ // Parallel-tool throughput champion (Flash, L-040). Tier 1 cross-provider
2825
+ // Pro; tier 2 Sonnet (quality safety net for blocked-Flash case); tier 3
2826
+ // Haiku (reduced tool budget — cliff at 16 fires).
2827
+ hunt: [
2828
+ "gemini-2.5-flash",
2829
+ "gemini-2.5-pro",
2830
+ "claude-sonnet-4-6",
2831
+ "claude-haiku-4-5"
2832
+ ],
2833
+ // Cost-sensitive + tolerant. DeepSeek brain-evidence tier 1; Haiku tier 2
2834
+ // for quality safety; Flash-Lite emergency floor (onboarded s22).
2835
+ summarize: [
2836
+ "gemini-2.5-flash",
2837
+ "deepseek-v4-flash",
2838
+ "claude-haiku-4-5",
2839
+ "gemini-2.5-flash-lite"
2840
+ ],
2841
+ // Brain-validated DeepSeek tier 1 (169 rows, 0% empty); Haiku tier 2;
2842
+ // Flash-Lite floor for repeat-prompt workloads (cache-discount 10×).
2843
+ classify: [
2844
+ "gemini-2.5-flash",
2845
+ "deepseek-v4-flash",
2846
+ "claude-haiku-4-5",
2847
+ "gemini-2.5-flash-lite"
2848
+ ]
2849
+ };
2850
+ function getDefaultFallbackChain(opts) {
2851
+ const { archetype, primary, maxDepth = 3, policy, reachability } = opts;
2852
+ if (maxDepth < 1) {
2853
+ throw new Error(
2854
+ `getDefaultFallbackChain: maxDepth must be >= 1, got ${maxDepth}`
2855
+ );
2856
+ }
2857
+ const allChains = loadChainsFromBrain();
2858
+ const starter = allChains[archetype];
2859
+ if (!starter) {
2860
+ throw new Error(
2861
+ `getDefaultFallbackChain: unknown archetype "${archetype}". Known: ${Object.keys(allChains).join(", ")}`
2862
+ );
2250
2863
  }
2251
- if (status >= 500) {
2252
- return { ok: false, status, errorType: "retryable", errorCode: "server_error", message, raw: body };
2864
+ let chain;
2865
+ if (primary) {
2866
+ chain = [primary, ...starter.filter((id) => id !== primary)];
2867
+ } else {
2868
+ chain = [...starter];
2253
2869
  }
2254
- if (status === 404) {
2255
- return { ok: false, status, errorType: "retryable", errorCode: "model_not_found", message, raw: body };
2870
+ if (policy?.blockedModels && policy.blockedModels.length > 0) {
2871
+ const blocked = new Set(policy.blockedModels);
2872
+ chain = chain.filter((id) => !blocked.has(id));
2256
2873
  }
2257
- if (status === 401 || status === 403) {
2258
- return { ok: false, status, errorType: "terminal", errorCode: "auth", message, raw: body };
2874
+ const seen = /* @__PURE__ */ new Set();
2875
+ const deduped = [];
2876
+ for (const id of chain) {
2877
+ if (!seen.has(id)) {
2878
+ seen.add(id);
2879
+ deduped.push(id);
2880
+ }
2259
2881
  }
2260
- if (status === 400) {
2261
- return { ok: false, status, errorType: "terminal", errorCode: "invalid_request", message, raw: body };
2882
+ let filtered = deduped;
2883
+ if (reachability) {
2884
+ filtered = deduped.filter((id) => isModelReachable(id, reachability));
2262
2885
  }
2263
- return { ok: false, status, errorType: "terminal", errorCode: "unknown", message, raw: body };
2886
+ return filtered.slice(0, maxDepth);
2264
2887
  }
2265
- function extractErrorMessage(body) {
2266
- if (!body || typeof body !== "object") return void 0;
2267
- const b = body;
2268
- if (b.error && typeof b.error === "object") {
2269
- const e = b.error;
2270
- if (typeof e.message === "string") return e.message;
2888
+ function getStarterChain(archetype) {
2889
+ const chain = STARTER_CHAINS[archetype];
2890
+ if (!chain) {
2891
+ throw new Error(
2892
+ `getStarterChain: unknown archetype "${archetype}"`
2893
+ );
2271
2894
  }
2272
- if (typeof b.message === "string") return b.message;
2273
- return void 0;
2274
- }
2275
- function terminalError(status, code, message) {
2276
- return { ok: false, status, errorType: "terminal", errorCode: code, message, raw: null };
2895
+ return [...chain];
2277
2896
  }
2278
- function retryableError(status, code, message, raw) {
2279
- return { ok: false, status, errorType: "retryable", errorCode: code, message, raw };
2897
+ function getAllStarterChains() {
2898
+ const out = {};
2899
+ for (const [archetype, chain] of Object.entries(STARTER_CHAINS)) {
2900
+ out[archetype] = [...chain];
2901
+ }
2902
+ return out;
2280
2903
  }
2281
- function tryParseJson(s) {
2282
- if (typeof s !== "string" || s.length === 0) return void 0;
2283
- try {
2284
- const parsed = JSON.parse(s);
2285
- return typeof parsed === "object" && parsed !== null ? parsed : void 0;
2286
- } catch {
2287
- return void 0;
2904
+ function ensureCrossProviderTail(opts) {
2905
+ const { chain, archetype, apiKeys, envSource } = opts;
2906
+ if (chain.length < 1) return { chain };
2907
+ const providers = /* @__PURE__ */ new Set();
2908
+ for (const t of chain) {
2909
+ const p = tryGetProfile(t);
2910
+ if (p) providers.add(p.provider);
2911
+ }
2912
+ if (providers.size >= 2) return { chain };
2913
+ const existingProvider = providers.values().next().value;
2914
+ if (!existingProvider) return { chain };
2915
+ const allChains = loadChainsFromBrain();
2916
+ const fullChain = allChains[archetype];
2917
+ if (!fullChain) return { chain };
2918
+ for (const candidate of fullChain) {
2919
+ if (chain.includes(candidate)) continue;
2920
+ const cp = tryGetProfile(candidate);
2921
+ if (!cp || cp.provider === existingProvider) continue;
2922
+ if (!isModelReachable(candidate, { apiKeys, envSource })) continue;
2923
+ return { chain: [...chain, candidate], appended: candidate };
2288
2924
  }
2925
+ return { chain };
2289
2926
  }
2290
2927
 
2291
2928
  // src/call.ts
@@ -2333,6 +2970,58 @@ async function call(ir, opts = {}) {
2333
2970
  "no_reachable_models"
2334
2971
  );
2335
2972
  }
2973
+ const archetypeName = ir.intent?.archetype;
2974
+ if (archetypeName) {
2975
+ const ensured = ensureCrossProviderTail({
2976
+ chain: targetsToTry,
2977
+ archetype: archetypeName,
2978
+ apiKeys: opts.apiKeys
2979
+ });
2980
+ if (ensured.appended) {
2981
+ targetsToTry = ensured.chain;
2982
+ }
2983
+ }
2984
+ }
2985
+ let policyBlockedFiltered;
2986
+ if (opts.policy?.blockedModels && opts.policy.blockedModels.length > 0) {
2987
+ const blocked = new Set(opts.policy.blockedModels);
2988
+ const filtered = [];
2989
+ const dropped = [];
2990
+ for (const t of targetsToTry) {
2991
+ if (blocked.has(t)) {
2992
+ dropped.push(t);
2993
+ } else {
2994
+ filtered.push(t);
2995
+ }
2996
+ }
2997
+ if (dropped.length > 0) {
2998
+ policyBlockedFiltered = dropped;
2999
+ targetsToTry = filtered;
3000
+ }
3001
+ if (targetsToTry.length === 0) {
3002
+ const latencyMs2 = Date.now() - start;
3003
+ await record({
3004
+ handle: initial.handle,
3005
+ tokensIn: 0,
3006
+ tokensOut: 0,
3007
+ latencyMs: latencyMs2,
3008
+ success: false,
3009
+ errorType: "all_blocked_by_policy",
3010
+ promptPreview: extractPromptPreview(ir)
3011
+ });
3012
+ const blockedAttempts = dropped.map((m) => ({
3013
+ model: m,
3014
+ status: "terminal",
3015
+ errorCode: "blocked_by_policy",
3016
+ message: `Skipped \u2014 model ${m} is in CompilePolicy.blockedModels`
3017
+ }));
3018
+ throw new CallError(
3019
+ `call(): all chain targets blocked by CompilePolicy.blockedModels: [${dropped.join(", ")}]`,
3020
+ blockedAttempts,
3021
+ void 0,
3022
+ "all_blocked_by_policy"
3023
+ );
3024
+ }
2336
3025
  }
2337
3026
  let activeCompile = initial;
2338
3027
  let lastErr;
@@ -2406,7 +3095,8 @@ async function call(ir, opts = {}) {
2406
3095
  servedBy: targetModel,
2407
3096
  fellOverFrom: fellOver ? initial.target : void 0,
2408
3097
  fallbackReason: fellOver ? normalizeFallbackReason(attempts) : void 0,
2409
- unreachableFiltered
3098
+ unreachableFiltered,
3099
+ policyBlockedFiltered
2410
3100
  };
2411
3101
  }
2412
3102
  attempts.push({
@@ -2435,8 +3125,9 @@ async function call(ir, opts = {}) {
2435
3125
  promptPreview: extractPromptPreview(ir)
2436
3126
  });
2437
3127
  const filteredNote = unreachableFiltered && unreachableFiltered.length > 0 ? ` (also auto-filtered: [${unreachableFiltered.join(", ")}] \u2014 no API key)` : "";
3128
+ const blockedNote = policyBlockedFiltered && policyBlockedFiltered.length > 0 ? ` (also policy-blocked: [${policyBlockedFiltered.join(", ")}])` : "";
2438
3129
  throw new CallError(
2439
- `call(): all attempts failed${lastErr ? ` \u2014 ${lastErr.errorCode}: ${lastErr.message}` : ""}${filteredNote}`,
3130
+ `call(): all attempts failed${lastErr ? ` \u2014 ${lastErr.errorCode}: ${lastErr.message}` : ""}${filteredNote}${blockedNote}`,
2440
3131
  attempts,
2441
3132
  lastErr?.status,
2442
3133
  lastErr?.errorCode
@@ -2591,162 +3282,6 @@ function clamp(n) {
2591
3282
  return Math.max(0, Math.min(1, n));
2592
3283
  }
2593
3284
 
2594
- // src/chains-brain.ts
2595
- function isChainsRow(x) {
2596
- if (!x || typeof x !== "object") return false;
2597
- const r = x;
2598
- return typeof r.archetype === "string" && typeof r.tier === "number" && typeof r.model_id === "string";
2599
- }
2600
- function mapRowsToChains(rows) {
2601
- const grouped = /* @__PURE__ */ new Map();
2602
- for (const row of rows) {
2603
- if (!isChainsRow(row)) continue;
2604
- const list = grouped.get(row.archetype) ?? [];
2605
- list.push(row);
2606
- grouped.set(row.archetype, list);
2607
- }
2608
- const out = {};
2609
- for (const [archetype, group] of grouped.entries()) {
2610
- group.sort((a, b) => a.tier - b.tier);
2611
- out[archetype] = group.map((r) => r.model_id);
2612
- }
2613
- const bundled = getAllStarterChains();
2614
- for (const archetype of Object.keys(bundled)) {
2615
- if (!out[archetype]) out[archetype] = bundled[archetype];
2616
- }
2617
- return out;
2618
- }
2619
- var loadChainsFromBrain = createBrainQueryCache({
2620
- table: "kgauto_chains",
2621
- mapRows: mapRowsToChains,
2622
- bundledFallback: getAllStarterChains
2623
- });
2624
-
2625
- // src/fallback.ts
2626
- var STARTER_CHAINS = {
2627
- // Reasoning floor — never degrade. Walk UP on 429 to Opus → cross-provider.
2628
- critique: [
2629
- "claude-opus-4-7",
2630
- "claude-sonnet-4-6",
2631
- "gemini-2.5-pro"
2632
- ],
2633
- // Reasoning matters — Sonnet primary; walk UP to Opus on 429 (rare exception
2634
- // to "always cheaper"); cross-provider via Pro; DeepSeek Pro as tier 3 floor.
2635
- plan: [
2636
- "claude-sonnet-4-6",
2637
- "claude-opus-4-7",
2638
- "gemini-2.5-pro",
2639
- "deepseek-v4-pro"
2640
- ],
2641
- // Quality + cost match. Walk Sonnet → Haiku same-provider, Pro cross,
2642
- // Flash floor for the open-posture chain.
2643
- generate: [
2644
- "claude-sonnet-4-6",
2645
- "claude-haiku-4-5",
2646
- "gemini-2.5-pro",
2647
- "gemini-2.5-flash"
2648
- ],
2649
- ask: [
2650
- "claude-sonnet-4-6",
2651
- "claude-haiku-4-5",
2652
- "gemini-2.5-pro",
2653
- "gemini-2.5-flash"
2654
- ],
2655
- // Structured-output archetype — Flash skipped (alpha.8 MAX_TOKENS cliff),
2656
- // DeepSeek skipped (no brain evidence). Floor at Haiku.
2657
- extract: [
2658
- "claude-sonnet-4-6",
2659
- "claude-haiku-4-5",
2660
- "gemini-2.5-pro"
2661
- ],
2662
- // Forgiving archetype — Sonnet primary but Flash safely floors it.
2663
- transform: [
2664
- "claude-sonnet-4-6",
2665
- "claude-haiku-4-5",
2666
- "gemini-2.5-pro",
2667
- "gemini-2.5-flash"
2668
- ],
2669
- // Parallel-tool throughput champion (Flash, L-040). Tier 1 cross-provider
2670
- // Pro; tier 2 Sonnet (quality safety net for blocked-Flash case); tier 3
2671
- // Haiku (reduced tool budget — cliff at 16 fires).
2672
- hunt: [
2673
- "gemini-2.5-flash",
2674
- "gemini-2.5-pro",
2675
- "claude-sonnet-4-6",
2676
- "claude-haiku-4-5"
2677
- ],
2678
- // Cost-sensitive + tolerant. DeepSeek brain-evidence tier 1; Haiku tier 2
2679
- // for quality safety; Flash-Lite emergency floor (onboarded s22).
2680
- summarize: [
2681
- "gemini-2.5-flash",
2682
- "deepseek-v4-flash",
2683
- "claude-haiku-4-5",
2684
- "gemini-2.5-flash-lite"
2685
- ],
2686
- // Brain-validated DeepSeek tier 1 (169 rows, 0% empty); Haiku tier 2;
2687
- // Flash-Lite floor for repeat-prompt workloads (cache-discount 10×).
2688
- classify: [
2689
- "gemini-2.5-flash",
2690
- "deepseek-v4-flash",
2691
- "claude-haiku-4-5",
2692
- "gemini-2.5-flash-lite"
2693
- ]
2694
- };
2695
- function getDefaultFallbackChain(opts) {
2696
- const { archetype, primary, maxDepth = 3, policy, reachability } = opts;
2697
- if (maxDepth < 1) {
2698
- throw new Error(
2699
- `getDefaultFallbackChain: maxDepth must be >= 1, got ${maxDepth}`
2700
- );
2701
- }
2702
- const allChains = loadChainsFromBrain();
2703
- const starter = allChains[archetype];
2704
- if (!starter) {
2705
- throw new Error(
2706
- `getDefaultFallbackChain: unknown archetype "${archetype}". Known: ${Object.keys(allChains).join(", ")}`
2707
- );
2708
- }
2709
- let chain;
2710
- if (primary) {
2711
- chain = [primary, ...starter.filter((id) => id !== primary)];
2712
- } else {
2713
- chain = [...starter];
2714
- }
2715
- if (policy?.blockedModels && policy.blockedModels.length > 0) {
2716
- const blocked = new Set(policy.blockedModels);
2717
- chain = chain.filter((id) => !blocked.has(id));
2718
- }
2719
- const seen = /* @__PURE__ */ new Set();
2720
- const deduped = [];
2721
- for (const id of chain) {
2722
- if (!seen.has(id)) {
2723
- seen.add(id);
2724
- deduped.push(id);
2725
- }
2726
- }
2727
- let filtered = deduped;
2728
- if (reachability) {
2729
- filtered = deduped.filter((id) => isModelReachable(id, reachability));
2730
- }
2731
- return filtered.slice(0, maxDepth);
2732
- }
2733
- function getStarterChain(archetype) {
2734
- const chain = STARTER_CHAINS[archetype];
2735
- if (!chain) {
2736
- throw new Error(
2737
- `getStarterChain: unknown archetype "${archetype}"`
2738
- );
2739
- }
2740
- return [...chain];
2741
- }
2742
- function getAllStarterChains() {
2743
- const out = {};
2744
- for (const [archetype, chain] of Object.entries(STARTER_CHAINS)) {
2745
- out[archetype] = [...chain];
2746
- }
2747
- return out;
2748
- }
2749
-
2750
3285
  // src/archetype-perf-brain.ts
2751
3286
  function isPerfRow(x) {
2752
3287
  if (!x || typeof x !== "object") return false;