@warmdrift/kgauto-compiler 2.0.0-alpha.26 → 2.0.0-alpha.27

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -829,12 +829,23 @@ var PROFILES_RAW = [
829
829
  // Each entry's pricing/context/cliffs/lowering reflects the template, NOT
830
830
  // provider docs. Verify before promoting status to 'current' (L-049/L-081).
831
831
  {
832
+ // s37 (2026-05-21): UNVERIFIED-AUTO-ONBOARD → verified against
833
+ // ai.google.dev/gemini-api/docs/models/gemini-3-flash-preview +
834
+ // ai.google.dev/gemini-api/docs/pricing. L-081 catches:
835
+ // maxOutputTokens 65_535 → 65_536 (off-by-one)
836
+ // costInputPer1m 0.30 → 0.50 (template-cloned from 2.5-flash; actual is 1.67× more expensive)
837
+ // costOutputPer1m 2.50 → 3.00 (template-cloned; actual 1.2× more expensive)
838
+ // cache discount default 0.25 → 0.10 (10× discount, $0.05/$0.50 per docs)
839
+ // Cliffs inherited from 2.5-flash conservatively. The 8K-context-quality
840
+ // cliff was a 2.5-Flash observation — Google positions Gemini 3 as
841
+ // sustained-frontier-on-long-context; brain evidence will validate/relax.
842
+ // Kept as guard for now.
832
843
  id: "gemini-3-flash-preview",
833
- verifiedAgainstDocs: "UNVERIFIED-AUTO-ONBOARD",
844
+ verifiedAgainstDocs: "2026-05-21",
834
845
  provider: "google",
835
846
  status: "preview",
836
847
  maxContextTokens: 1048576,
837
- maxOutputTokens: 65535,
848
+ maxOutputTokens: 65536,
838
849
  maxTools: 128,
839
850
  parallelToolCalls: true,
840
851
  structuredOutput: "native",
@@ -845,13 +856,13 @@ var PROFILES_RAW = [
845
856
  metric: "input_tokens",
846
857
  threshold: 8e3,
847
858
  action: "downgrade_quality_warning",
848
- reason: "Quality degrades significantly above ~8K context tokens"
859
+ reason: "Inherited from 2.5-flash guard; brain evidence on Gemini 3 long-context quality will validate/relax"
849
860
  },
850
861
  {
851
862
  metric: "tool_count",
852
863
  threshold: 20,
853
864
  action: "drop_to_top_relevant",
854
- reason: "Tool reliability drops above ~20 tools (despite 128 hard limit)"
865
+ reason: "Tool reliability drops above ~20 tools (despite 128 hard limit) \u2014 inherited from Flash family"
855
866
  },
856
867
  {
857
868
  metric: "thinking_with_short_output",
@@ -860,24 +871,22 @@ var PROFILES_RAW = [
860
871
  reason: "Thinking tokens consume maxOutputTokens \u2014 empty response if drained"
861
872
  },
862
873
  {
863
- // s11 trust artifact (2026-05-02): brain showed 5/5 empty rate on
864
- // tt-intelligence/summarize/gemini-2.5-flash with tools offered.
865
- // v1's disable_thinking_for_short_output already fired and didn't
866
- // help — disabling thinking is necessary but not sufficient. Tools
867
- // present + summarize intent confuses Flash into a no-output state
868
- // (likely tool-decision purgatory). Strip tools entirely for this
869
- // archetype on this model.
874
+ // Inherited from gemini-2.5-flash s11 trust artifact. Family-likely
875
+ // failure mode for Flash architecture. Keep preemptively until brain
876
+ // evidence on Gemini 3 specifically.
870
877
  metric: "tool_count",
871
878
  threshold: 1,
872
879
  whenIntent: "summarize",
873
880
  action: "strip_tools",
874
- reason: "Gemini Flash returns empty when summarize intent has tools offered (5/5 empty rate observed in v1 prod 2026-04-19, replayed into v2 brain 2026-04-29)"
881
+ reason: "Inherited from 2.5-flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on 3-flash-preview specifically."
875
882
  }
876
883
  ],
877
- costInputPer1m: 0.3,
878
- costOutputPer1m: 2.5,
884
+ costInputPer1m: 0.5,
885
+ costOutputPer1m: 3,
879
886
  lowering: {
880
887
  ...GOOGLE_LOWERING_BASE,
888
+ // 10× cache discount per Google pricing: $0.05/M cached vs $0.50/M input.
889
+ cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
881
890
  thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
882
891
  },
883
892
  recovery: [
@@ -903,40 +912,45 @@ var PROFILES_RAW = [
903
912
  ],
904
913
  strengths: ["speed", "volume", "classification", "1m_context", "cost"],
905
914
  weaknesses: ["complex_schemas", "large_tool_sets", "high_context_quality"],
906
- notes: "AUTO-ONBOARDED 2026-05-16 from `gemini-2.5-flash`. Pricing, context, cliffs are template-cloned and UNVERIFIED \u2014 confirm against provider docs before promoting status to 'current'.",
907
- // Master plan §6.2 anchor. Tier 0 for hunt (parallel tool throughput
908
- // 15-75 calls/step beats Sonnet L-040), summarize, classify.
915
+ notes: "Verified s37 (2026-05-21) against Google docs. Step-change positioning vs 2.5-flash on agentic loops per Google's release notes (Dec 2025). Pricing 1.67\xD7/1.2\xD7 higher than 2.5-flash; cache discount 10\xD7 (vs 4\xD7 for 2.5). Status=preview until brain evidence accumulates.",
916
+ // Anchored to 2.5-flash archetypePerf as starter, with judgment adjustments
917
+ // for Google's "step-change on agentic" positioning. Brain evidence (zero
918
+ // rows today) will replace these starter values.
909
919
  archetypePerf: {
910
920
  hunt: 9,
911
- // L-040: parallel tool throughput 15-75/step
921
+ // Inherits 2.5-flash L-040 parallel-tool tier; Google positions 3 as agentic-loop upgrade
912
922
  classify: 7,
913
- // brain-validated, 218 rows
923
+ // Inherits 2.5-flash brain-validated tier (218 rows on 2.5)
914
924
  summarize: 7,
915
- // brain-validated; cliff strips tools when present
925
+ // Inherits 2.5-flash; cliff strips tools when present
916
926
  transform: 7,
917
- ask: 7,
918
- generate: 6,
919
- plan: 5,
927
+ ask: 8,
928
+ // +1 vs 2.5-flash — sustained-frontier positioning
929
+ generate: 7,
930
+ // +1 vs 2.5-flash — agentic coding upgrade per Google
931
+ plan: 6,
932
+ // +1 vs 2.5-flash — complex iterations per positioning
920
933
  extract: 6,
921
- // alpha.8 MAX_TOKENS history on structured output
922
- critique: 4
923
- // reasoning shallower than Sonnet/Opus
934
+ critique: 5
935
+ // +1 vs 2.5-flash — but still below Sonnet/Opus reasoning floor
924
936
  }
925
937
  },
926
938
  {
927
- // ── Gemini 2.5 Flash-Lite ──
928
- // Onboarded 2026-05-13 (s22) after the model-release watcher surfaced
929
- // it as a UNREGISTERED + NEW candidate. Released by Google July 2025,
930
- // stable. Positioned BELOW Flash on the cost/perf frontier:
931
- // input $0.10/M (Flash $0.30/M) — 3× cheaper
932
- // output $0.40/M (Flash $2.50/M) — 6× cheaper
933
- // cache $0.01/M — 1/10 of input (vs Flash 0.25 discount)
934
- // Cliffs are HYPOTHESIZED from Flash's known failure modes — Flash-Lite
935
- // is a smaller sibling, so we inherit Flash's cliffs at equal-or-tighter
936
- // thresholds. The brain will validate/relax these as evidence accumulates
937
- // per (archetype, model) tuple. Currently ZERO brain rows for this model.
939
+ // ── Gemini 3.1 Flash-Lite ──
940
+ // Onboarded 2026-05-16 by auto-onboarder; s37 (2026-05-21) verified
941
+ // against ai.google.dev/gemini-api/docs/pricing.
942
+ //
943
+ // L-081 CATCHES (template clone from 2.5-flash-lite was 2.5-3.75× too cheap):
944
+ // costInputPer1m 0.10 → 0.25 (template clone undervalued by 2.5×)
945
+ // costOutputPer1m 0.40 → 1.50 (template clone undervalued by 3.75×)
946
+ //
947
+ // Real 3.1-flash-lite is NOT a cost-equivalent successor to 2.5-flash-lite —
948
+ // it sits between 2.5-flash-lite ($0.10/$0.40) and 2.5-flash ($0.30/$2.50).
949
+ // Cache discount 10× verified ($0.025/M cached vs $0.25/M input).
950
+ //
951
+ // Cliffs are HYPOTHESIZED from 2.5-flash family; brain evidence pending.
938
952
  id: "gemini-3.1-flash-lite",
939
- verifiedAgainstDocs: "UNVERIFIED-AUTO-ONBOARD",
953
+ verifiedAgainstDocs: "2026-05-21",
940
954
  provider: "google",
941
955
  status: "preview",
942
956
  maxContextTokens: 1048576,
@@ -977,12 +991,12 @@ var PROFILES_RAW = [
977
991
  reason: "Inherited from Flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on Flash-Lite specifically."
978
992
  }
979
993
  ],
980
- costInputPer1m: 0.1,
981
- costOutputPer1m: 0.4,
994
+ costInputPer1m: 0.25,
995
+ costOutputPer1m: 1.5,
982
996
  lowering: {
983
997
  ...GOOGLE_LOWERING_BASE,
984
- // Cache discount 10× (vs Flash 4×) — Google's spec is $0.01/M cache vs
985
- // $0.10/M input. Material for repeat-prompt workloads (classify shape).
998
+ // Cache discount 10× (vs Flash 4×) — Google docs s37: $0.025/M cached vs
999
+ // $0.25/M input. Material for repeat-prompt workloads (classify shape).
986
1000
  cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
987
1001
  thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
988
1002
  },
@@ -1007,13 +1021,13 @@ var PROFILES_RAW = [
1007
1021
  reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target."
1008
1022
  }
1009
1023
  ],
1010
- strengths: ["lowest_cost", "speed", "volume", "classification", "summarize", "1m_context", "cache_friendly"],
1024
+ strengths: ["low_cost", "speed", "volume", "classification", "summarize", "1m_context", "cache_friendly"],
1011
1025
  weaknesses: ["complex_reasoning", "large_tool_sets", "complex_schemas", "structured_output_unproven", "long_context_quality"],
1012
- notes: "AUTO-ONBOARDED 2026-05-16 from `gemini-2.5-flash-lite`. Pricing, context, cliffs are template-cloned and UNVERIFIED \u2014 confirm against provider docs before promoting status to 'current'.",
1013
- // Tier 3 emergency floor for summarize/classify chains. ZERO brain
1014
- // rows — all values are starter hypotheses anchored to "smaller
1015
- // sibling of Flash, at-or-below Flash perf on every archetype." The
1016
- // first 50 brain rows per archetype will validate or relax these.
1026
+ notes: "Verified s37 (2026-05-21) against Google docs. Sits between 2.5-flash-lite (cheaper) and 2.5-flash (more expensive) on cost frontier; 2.5\xD7 more expensive than initial template-clone. Cliffs hypothesized from Flash family \u2014 brain evidence pending.",
1027
+ // Tier 2-3 floor for summarize/classify chains at the new (verified) price
1028
+ // point. ZERO brain rows — values are starter hypotheses anchored to
1029
+ // "smaller sibling of Flash at higher cost than 2.5-flash-lite." The first
1030
+ // 50 brain rows per archetype will validate or relax these.
1017
1031
  archetypePerf: {
1018
1032
  classify: 6,
1019
1033
  // starter hypothesis — verify (Flash is 7, lite likely ≤)
@@ -1028,6 +1042,125 @@ var PROFILES_RAW = [
1028
1042
  plan: 3,
1029
1043
  critique: 3
1030
1044
  }
1045
+ },
1046
+ // ─────────────────────────────────────────────────────────────────────────
1047
+ // Gemini 3.5 Flash — hand-onboarded s37 (2026-05-21)
1048
+ //
1049
+ // Google positioning ("Most intelligent for sustained frontier performance
1050
+ // on agentic and coding tasks" / "particularly effective for rapid agentic
1051
+ // loops involving complex coding cycles and iterations") suggests this is
1052
+ // the Flash-family upgrade specifically aimed at hunt-shape workloads.
1053
+ // Pricing 5× input / 3.6× output vs 2.5-flash — material cost premium.
1054
+ // archetypePerf adjusted vs 2.5-flash: +1 on ask/extract/critique, +2 on generate/plan
1055
+ // (sustained-frontier positioning); hunt held at 9 inherited from L-040
1056
+ // family parallel-tool tier; brain evidence will validate within 50 rows.
1057
+ //
1058
+ // Cliffs inherited conservatively from 2.5-flash. Google's "sustained
1059
+ // frontier on long-context" positioning suggests the 8K cliff may not
1060
+ // apply to 3.5 — keep as guard until brain evidence shows otherwise.
1061
+ //
1062
+ // Specs verified against:
1063
+ // ai.google.dev/gemini-api/docs/models/gemini-3.5-flash
1064
+ // ai.google.dev/gemini-api/docs/pricing (Standard tier)
1065
+ // ─────────────────────────────────────────────────────────────────────────
1066
+ {
1067
+ id: "gemini-3.5-flash",
1068
+ verifiedAgainstDocs: "2026-05-21",
1069
+ provider: "google",
1070
+ status: "current",
1071
+ maxContextTokens: 1048576,
1072
+ maxOutputTokens: 65536,
1073
+ maxTools: 128,
1074
+ parallelToolCalls: true,
1075
+ structuredOutput: "native",
1076
+ systemPromptMode: "separate",
1077
+ streaming: true,
1078
+ cliffs: [
1079
+ {
1080
+ metric: "input_tokens",
1081
+ threshold: 8e3,
1082
+ action: "downgrade_quality_warning",
1083
+ reason: "Inherited from 2.5-flash guard; Google positions 3.5 as sustained-frontier-on-long-context but brain evidence pending"
1084
+ },
1085
+ {
1086
+ metric: "tool_count",
1087
+ threshold: 20,
1088
+ action: "drop_to_top_relevant",
1089
+ reason: "Inherited from Flash family: tool reliability drops above ~20 (despite 128 hard limit). Validate per (archetype, model) after n\u226520."
1090
+ },
1091
+ {
1092
+ metric: "thinking_with_short_output",
1093
+ threshold: 1,
1094
+ action: "force_thinking_budget_zero",
1095
+ reason: "Thinking mode supported per Google docs; same drain risk as 2.5-flash \u2014 thinking tokens consume maxOutputTokens"
1096
+ },
1097
+ {
1098
+ // Inherited from 2.5-flash s11 trust artifact (5/5 empty rate on
1099
+ // tt-intelligence/summarize/gemini-2.5-flash with tools offered).
1100
+ // Family-likely failure mode for Flash architecture across versions.
1101
+ // Keep preemptively until brain evidence on 3.5-flash specifically.
1102
+ metric: "tool_count",
1103
+ threshold: 1,
1104
+ whenIntent: "summarize",
1105
+ action: "strip_tools",
1106
+ reason: "Inherited from 2.5-flash s11 cliff (kgauto commit 3872832): summarize+tools \u2192 empty response. Preemptive guard until brain evidence on 3.5-flash specifically."
1107
+ }
1108
+ ],
1109
+ costInputPer1m: 1.5,
1110
+ costOutputPer1m: 9,
1111
+ lowering: {
1112
+ ...GOOGLE_LOWERING_BASE,
1113
+ // 10× cache discount per Google pricing: $0.15/M cached vs $1.50/M input.
1114
+ cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
1115
+ thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
1116
+ },
1117
+ recovery: [
1118
+ {
1119
+ signal: "empty_response_after_tool",
1120
+ action: "retry_with_params",
1121
+ retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
1122
+ maxRetries: 1,
1123
+ reason: "Inherited Flash-family pattern: empty after tool result \u2014 retry with thinking off"
1124
+ },
1125
+ {
1126
+ signal: "empty_response",
1127
+ action: "retry_with_params",
1128
+ retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
1129
+ maxRetries: 1,
1130
+ reason: "Empty response \u2014 try with thinking off"
1131
+ },
1132
+ {
1133
+ signal: "malformed_function_call",
1134
+ action: "escalate",
1135
+ reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target"
1136
+ }
1137
+ ],
1138
+ strengths: ["agentic_loops", "coding", "1m_context", "parallel_tools", "thinking_mode", "sustained_frontier"],
1139
+ weaknesses: ["cost_vs_2_5_flash", "no_brain_evidence_yet"],
1140
+ notes: "Hand-onboarded s37 (2026-05-21) verified against Google docs. Stable status; positioned as Flash-family upgrade for agentic loops and coding. 5\xD7/3.6\xD7 more expensive than 2.5-flash but Google claims step-change on sustained frontier work. archetypePerf adjustments are judgment-grounded starter hypotheses \u2014 brain evidence will validate within ~50 rows per archetype.",
1141
+ // Starter hypothesis: anchored to 2.5-flash archetypePerf with +1
1142
+ // adjustments where Google's positioning explicitly supports
1143
+ // (agentic/coding/sustained). Hunt held at 9 inherited from L-040 family
1144
+ // parallel-tool tier. Brain evidence will replace.
1145
+ archetypePerf: {
1146
+ hunt: 9,
1147
+ // Inherited from 2.5-flash L-040 parallel-tool tier; Google positions 3.5 as agentic-loop champion
1148
+ classify: 7,
1149
+ // Inherited from 2.5-flash brain-validated tier (218 rows on 2.5)
1150
+ summarize: 7,
1151
+ // Inherited from 2.5-flash; cliff strips tools when present
1152
+ transform: 7,
1153
+ ask: 8,
1154
+ // +1 vs 2.5-flash — sustained-frontier positioning
1155
+ generate: 8,
1156
+ // +2 vs 2.5-flash (6→8) — Google: "complex coding cycles and iterations"
1157
+ plan: 7,
1158
+ // +2 vs 2.5-flash (5→7) — "complex iterations" positioning
1159
+ extract: 7,
1160
+ // +1 vs 2.5-flash — sustained-frontier on structured tasks
1161
+ critique: 5
1162
+ // +1 vs 2.5-flash — but still below Sonnet/Opus reasoning floor
1163
+ }
1031
1164
  }
1032
1165
  ];
1033
1166
  var ALIASES = {
package/dist/index.js CHANGED
@@ -1703,12 +1703,23 @@ var PROFILES_RAW = [
1703
1703
  // Each entry's pricing/context/cliffs/lowering reflects the template, NOT
1704
1704
  // provider docs. Verify before promoting status to 'current' (L-049/L-081).
1705
1705
  {
1706
+ // s37 (2026-05-21): UNVERIFIED-AUTO-ONBOARD → verified against
1707
+ // ai.google.dev/gemini-api/docs/models/gemini-3-flash-preview +
1708
+ // ai.google.dev/gemini-api/docs/pricing. L-081 catches:
1709
+ // maxOutputTokens 65_535 → 65_536 (off-by-one)
1710
+ // costInputPer1m 0.30 → 0.50 (template-cloned from 2.5-flash; actual is 1.67× more expensive)
1711
+ // costOutputPer1m 2.50 → 3.00 (template-cloned; actual 1.2× more expensive)
1712
+ // cache discount default 0.25 → 0.10 (10× discount, $0.05/$0.50 per docs)
1713
+ // Cliffs inherited from 2.5-flash conservatively. The 8K-context-quality
1714
+ // cliff was a 2.5-Flash observation — Google positions Gemini 3 as
1715
+ // sustained-frontier-on-long-context; brain evidence will validate/relax.
1716
+ // Kept as guard for now.
1706
1717
  id: "gemini-3-flash-preview",
1707
- verifiedAgainstDocs: "UNVERIFIED-AUTO-ONBOARD",
1718
+ verifiedAgainstDocs: "2026-05-21",
1708
1719
  provider: "google",
1709
1720
  status: "preview",
1710
1721
  maxContextTokens: 1048576,
1711
- maxOutputTokens: 65535,
1722
+ maxOutputTokens: 65536,
1712
1723
  maxTools: 128,
1713
1724
  parallelToolCalls: true,
1714
1725
  structuredOutput: "native",
@@ -1719,13 +1730,13 @@ var PROFILES_RAW = [
1719
1730
  metric: "input_tokens",
1720
1731
  threshold: 8e3,
1721
1732
  action: "downgrade_quality_warning",
1722
- reason: "Quality degrades significantly above ~8K context tokens"
1733
+ reason: "Inherited from 2.5-flash guard; brain evidence on Gemini 3 long-context quality will validate/relax"
1723
1734
  },
1724
1735
  {
1725
1736
  metric: "tool_count",
1726
1737
  threshold: 20,
1727
1738
  action: "drop_to_top_relevant",
1728
- reason: "Tool reliability drops above ~20 tools (despite 128 hard limit)"
1739
+ reason: "Tool reliability drops above ~20 tools (despite 128 hard limit) \u2014 inherited from Flash family"
1729
1740
  },
1730
1741
  {
1731
1742
  metric: "thinking_with_short_output",
@@ -1734,24 +1745,22 @@ var PROFILES_RAW = [
1734
1745
  reason: "Thinking tokens consume maxOutputTokens \u2014 empty response if drained"
1735
1746
  },
1736
1747
  {
1737
- // s11 trust artifact (2026-05-02): brain showed 5/5 empty rate on
1738
- // tt-intelligence/summarize/gemini-2.5-flash with tools offered.
1739
- // v1's disable_thinking_for_short_output already fired and didn't
1740
- // help — disabling thinking is necessary but not sufficient. Tools
1741
- // present + summarize intent confuses Flash into a no-output state
1742
- // (likely tool-decision purgatory). Strip tools entirely for this
1743
- // archetype on this model.
1748
+ // Inherited from gemini-2.5-flash s11 trust artifact. Family-likely
1749
+ // failure mode for Flash architecture. Keep preemptively until brain
1750
+ // evidence on Gemini 3 specifically.
1744
1751
  metric: "tool_count",
1745
1752
  threshold: 1,
1746
1753
  whenIntent: "summarize",
1747
1754
  action: "strip_tools",
1748
- reason: "Gemini Flash returns empty when summarize intent has tools offered (5/5 empty rate observed in v1 prod 2026-04-19, replayed into v2 brain 2026-04-29)"
1755
+ reason: "Inherited from 2.5-flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on 3-flash-preview specifically."
1749
1756
  }
1750
1757
  ],
1751
- costInputPer1m: 0.3,
1752
- costOutputPer1m: 2.5,
1758
+ costInputPer1m: 0.5,
1759
+ costOutputPer1m: 3,
1753
1760
  lowering: {
1754
1761
  ...GOOGLE_LOWERING_BASE,
1762
+ // 10× cache discount per Google pricing: $0.05/M cached vs $0.50/M input.
1763
+ cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
1755
1764
  thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
1756
1765
  },
1757
1766
  recovery: [
@@ -1777,40 +1786,45 @@ var PROFILES_RAW = [
1777
1786
  ],
1778
1787
  strengths: ["speed", "volume", "classification", "1m_context", "cost"],
1779
1788
  weaknesses: ["complex_schemas", "large_tool_sets", "high_context_quality"],
1780
- notes: "AUTO-ONBOARDED 2026-05-16 from `gemini-2.5-flash`. Pricing, context, cliffs are template-cloned and UNVERIFIED \u2014 confirm against provider docs before promoting status to 'current'.",
1781
- // Master plan §6.2 anchor. Tier 0 for hunt (parallel tool throughput
1782
- // 15-75 calls/step beats Sonnet L-040), summarize, classify.
1789
+ notes: "Verified s37 (2026-05-21) against Google docs. Step-change positioning vs 2.5-flash on agentic loops per Google's release notes (Dec 2025). Pricing 1.67\xD7/1.2\xD7 higher than 2.5-flash; cache discount 10\xD7 (vs 4\xD7 for 2.5). Status=preview until brain evidence accumulates.",
1790
+ // Anchored to 2.5-flash archetypePerf as starter, with judgment adjustments
1791
+ // for Google's "step-change on agentic" positioning. Brain evidence (zero
1792
+ // rows today) will replace these starter values.
1783
1793
  archetypePerf: {
1784
1794
  hunt: 9,
1785
- // L-040: parallel tool throughput 15-75/step
1795
+ // Inherits 2.5-flash L-040 parallel-tool tier; Google positions 3 as agentic-loop upgrade
1786
1796
  classify: 7,
1787
- // brain-validated, 218 rows
1797
+ // Inherits 2.5-flash brain-validated tier (218 rows on 2.5)
1788
1798
  summarize: 7,
1789
- // brain-validated; cliff strips tools when present
1799
+ // Inherits 2.5-flash; cliff strips tools when present
1790
1800
  transform: 7,
1791
- ask: 7,
1792
- generate: 6,
1793
- plan: 5,
1801
+ ask: 8,
1802
+ // +1 vs 2.5-flash — sustained-frontier positioning
1803
+ generate: 7,
1804
+ // +1 vs 2.5-flash — agentic coding upgrade per Google
1805
+ plan: 6,
1806
+ // +1 vs 2.5-flash — complex iterations per positioning
1794
1807
  extract: 6,
1795
- // alpha.8 MAX_TOKENS history on structured output
1796
- critique: 4
1797
- // reasoning shallower than Sonnet/Opus
1808
+ critique: 5
1809
+ // +1 vs 2.5-flash — but still below Sonnet/Opus reasoning floor
1798
1810
  }
1799
1811
  },
1800
1812
  {
1801
- // ── Gemini 2.5 Flash-Lite ──
1802
- // Onboarded 2026-05-13 (s22) after the model-release watcher surfaced
1803
- // it as a UNREGISTERED + NEW candidate. Released by Google July 2025,
1804
- // stable. Positioned BELOW Flash on the cost/perf frontier:
1805
- // input $0.10/M (Flash $0.30/M) — 3× cheaper
1806
- // output $0.40/M (Flash $2.50/M) — 6× cheaper
1807
- // cache $0.01/M — 1/10 of input (vs Flash 0.25 discount)
1808
- // Cliffs are HYPOTHESIZED from Flash's known failure modes — Flash-Lite
1809
- // is a smaller sibling, so we inherit Flash's cliffs at equal-or-tighter
1810
- // thresholds. The brain will validate/relax these as evidence accumulates
1811
- // per (archetype, model) tuple. Currently ZERO brain rows for this model.
1813
+ // ── Gemini 3.1 Flash-Lite ──
1814
+ // Onboarded 2026-05-16 by auto-onboarder; s37 (2026-05-21) verified
1815
+ // against ai.google.dev/gemini-api/docs/pricing.
1816
+ //
1817
+ // L-081 CATCHES (template clone from 2.5-flash-lite was 2.5-3.75× too cheap):
1818
+ // costInputPer1m 0.10 → 0.25 (template clone undervalued by 2.5×)
1819
+ // costOutputPer1m 0.40 → 1.50 (template clone undervalued by 3.75×)
1820
+ //
1821
+ // Real 3.1-flash-lite is NOT a cost-equivalent successor to 2.5-flash-lite —
1822
+ // it sits between 2.5-flash-lite ($0.10/$0.40) and 2.5-flash ($0.30/$2.50).
1823
+ // Cache discount 10× verified ($0.025/M cached vs $0.25/M input).
1824
+ //
1825
+ // Cliffs are HYPOTHESIZED from 2.5-flash family; brain evidence pending.
1812
1826
  id: "gemini-3.1-flash-lite",
1813
- verifiedAgainstDocs: "UNVERIFIED-AUTO-ONBOARD",
1827
+ verifiedAgainstDocs: "2026-05-21",
1814
1828
  provider: "google",
1815
1829
  status: "preview",
1816
1830
  maxContextTokens: 1048576,
@@ -1851,12 +1865,12 @@ var PROFILES_RAW = [
1851
1865
  reason: "Inherited from Flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on Flash-Lite specifically."
1852
1866
  }
1853
1867
  ],
1854
- costInputPer1m: 0.1,
1855
- costOutputPer1m: 0.4,
1868
+ costInputPer1m: 0.25,
1869
+ costOutputPer1m: 1.5,
1856
1870
  lowering: {
1857
1871
  ...GOOGLE_LOWERING_BASE,
1858
- // Cache discount 10× (vs Flash 4×) — Google's spec is $0.01/M cache vs
1859
- // $0.10/M input. Material for repeat-prompt workloads (classify shape).
1872
+ // Cache discount 10× (vs Flash 4×) — Google docs s37: $0.025/M cached vs
1873
+ // $0.25/M input. Material for repeat-prompt workloads (classify shape).
1860
1874
  cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
1861
1875
  thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
1862
1876
  },
@@ -1881,13 +1895,13 @@ var PROFILES_RAW = [
1881
1895
  reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target."
1882
1896
  }
1883
1897
  ],
1884
- strengths: ["lowest_cost", "speed", "volume", "classification", "summarize", "1m_context", "cache_friendly"],
1898
+ strengths: ["low_cost", "speed", "volume", "classification", "summarize", "1m_context", "cache_friendly"],
1885
1899
  weaknesses: ["complex_reasoning", "large_tool_sets", "complex_schemas", "structured_output_unproven", "long_context_quality"],
1886
- notes: "AUTO-ONBOARDED 2026-05-16 from `gemini-2.5-flash-lite`. Pricing, context, cliffs are template-cloned and UNVERIFIED \u2014 confirm against provider docs before promoting status to 'current'.",
1887
- // Tier 3 emergency floor for summarize/classify chains. ZERO brain
1888
- // rows — all values are starter hypotheses anchored to "smaller
1889
- // sibling of Flash, at-or-below Flash perf on every archetype." The
1890
- // first 50 brain rows per archetype will validate or relax these.
1900
+ notes: "Verified s37 (2026-05-21) against Google docs. Sits between 2.5-flash-lite (cheaper) and 2.5-flash (more expensive) on cost frontier; 2.5\xD7 more expensive than initial template-clone. Cliffs hypothesized from Flash family \u2014 brain evidence pending.",
1901
+ // Tier 2-3 floor for summarize/classify chains at the new (verified) price
1902
+ // point. ZERO brain rows — values are starter hypotheses anchored to
1903
+ // "smaller sibling of Flash at higher cost than 2.5-flash-lite." The first
1904
+ // 50 brain rows per archetype will validate or relax these.
1891
1905
  archetypePerf: {
1892
1906
  classify: 6,
1893
1907
  // starter hypothesis — verify (Flash is 7, lite likely ≤)
@@ -1902,6 +1916,125 @@ var PROFILES_RAW = [
1902
1916
  plan: 3,
1903
1917
  critique: 3
1904
1918
  }
1919
+ },
1920
+ // ─────────────────────────────────────────────────────────────────────────
1921
+ // Gemini 3.5 Flash — hand-onboarded s37 (2026-05-21)
1922
+ //
1923
+ // Google positioning ("Most intelligent for sustained frontier performance
1924
+ // on agentic and coding tasks" / "particularly effective for rapid agentic
1925
+ // loops involving complex coding cycles and iterations") suggests this is
1926
+ // the Flash-family upgrade specifically aimed at hunt-shape workloads.
1927
+ // Pricing 5× input / 3.6× output vs 2.5-flash — material cost premium.
1928
+ // archetypePerf adjusted vs 2.5-flash: +1 on ask/extract/critique, +2 on generate/plan
1929
+ // (sustained-frontier positioning); hunt held at 9 inherited from L-040
1930
+ // family parallel-tool tier; brain evidence will validate within 50 rows.
1931
+ //
1932
+ // Cliffs inherited conservatively from 2.5-flash. Google's "sustained
1933
+ // frontier on long-context" positioning suggests the 8K cliff may not
1934
+ // apply to 3.5 — keep as guard until brain evidence shows otherwise.
1935
+ //
1936
+ // Specs verified against:
1937
+ // ai.google.dev/gemini-api/docs/models/gemini-3.5-flash
1938
+ // ai.google.dev/gemini-api/docs/pricing (Standard tier)
1939
+ // ─────────────────────────────────────────────────────────────────────────
1940
+ {
1941
+ id: "gemini-3.5-flash",
1942
+ verifiedAgainstDocs: "2026-05-21",
1943
+ provider: "google",
1944
+ status: "current",
1945
+ maxContextTokens: 1048576,
1946
+ maxOutputTokens: 65536,
1947
+ maxTools: 128,
1948
+ parallelToolCalls: true,
1949
+ structuredOutput: "native",
1950
+ systemPromptMode: "separate",
1951
+ streaming: true,
1952
+ cliffs: [
1953
+ {
1954
+ metric: "input_tokens",
1955
+ threshold: 8e3,
1956
+ action: "downgrade_quality_warning",
1957
+ reason: "Inherited from 2.5-flash guard; Google positions 3.5 as sustained-frontier-on-long-context but brain evidence pending"
1958
+ },
1959
+ {
1960
+ metric: "tool_count",
1961
+ threshold: 20,
1962
+ action: "drop_to_top_relevant",
1963
+ reason: "Inherited from Flash family: tool reliability drops above ~20 (despite 128 hard limit). Validate per (archetype, model) after n\u226520."
1964
+ },
1965
+ {
1966
+ metric: "thinking_with_short_output",
1967
+ threshold: 1,
1968
+ action: "force_thinking_budget_zero",
1969
+ reason: "Thinking mode supported per Google docs; same drain risk as 2.5-flash \u2014 thinking tokens consume maxOutputTokens"
1970
+ },
1971
+ {
1972
+ // Inherited from 2.5-flash s11 trust artifact (5/5 empty rate on
1973
+ // tt-intelligence/summarize/gemini-2.5-flash with tools offered).
1974
+ // Family-likely failure mode for Flash architecture across versions.
1975
+ // Keep preemptively until brain evidence on 3.5-flash specifically.
1976
+ metric: "tool_count",
1977
+ threshold: 1,
1978
+ whenIntent: "summarize",
1979
+ action: "strip_tools",
1980
+ reason: "Inherited from 2.5-flash s11 cliff (kgauto commit 3872832): summarize+tools \u2192 empty response. Preemptive guard until brain evidence on 3.5-flash specifically."
1981
+ }
1982
+ ],
1983
+ costInputPer1m: 1.5,
1984
+ costOutputPer1m: 9,
1985
+ lowering: {
1986
+ ...GOOGLE_LOWERING_BASE,
1987
+ // 10× cache discount per Google pricing: $0.15/M cached vs $1.50/M input.
1988
+ cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
1989
+ thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
1990
+ },
1991
+ recovery: [
1992
+ {
1993
+ signal: "empty_response_after_tool",
1994
+ action: "retry_with_params",
1995
+ retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
1996
+ maxRetries: 1,
1997
+ reason: "Inherited Flash-family pattern: empty after tool result \u2014 retry with thinking off"
1998
+ },
1999
+ {
2000
+ signal: "empty_response",
2001
+ action: "retry_with_params",
2002
+ retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
2003
+ maxRetries: 1,
2004
+ reason: "Empty response \u2014 try with thinking off"
2005
+ },
2006
+ {
2007
+ signal: "malformed_function_call",
2008
+ action: "escalate",
2009
+ reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target"
2010
+ }
2011
+ ],
2012
+ strengths: ["agentic_loops", "coding", "1m_context", "parallel_tools", "thinking_mode", "sustained_frontier"],
2013
+ weaknesses: ["cost_vs_2_5_flash", "no_brain_evidence_yet"],
2014
+ notes: "Hand-onboarded s37 (2026-05-21) verified against Google docs. Stable status; positioned as Flash-family upgrade for agentic loops and coding. 5\xD7/3.6\xD7 more expensive than 2.5-flash but Google claims step-change on sustained frontier work. archetypePerf adjustments are judgment-grounded starter hypotheses \u2014 brain evidence will validate within ~50 rows per archetype.",
2015
+ // Starter hypothesis: anchored to 2.5-flash archetypePerf with +1
2016
+ // adjustments where Google's positioning explicitly supports
2017
+ // (agentic/coding/sustained). Hunt held at 9 inherited from L-040 family
2018
+ // parallel-tool tier. Brain evidence will replace.
2019
+ archetypePerf: {
2020
+ hunt: 9,
2021
+ // Inherited from 2.5-flash L-040 parallel-tool tier; Google positions 3.5 as agentic-loop champion
2022
+ classify: 7,
2023
+ // Inherited from 2.5-flash brain-validated tier (218 rows on 2.5)
2024
+ summarize: 7,
2025
+ // Inherited from 2.5-flash; cliff strips tools when present
2026
+ transform: 7,
2027
+ ask: 8,
2028
+ // +1 vs 2.5-flash — sustained-frontier positioning
2029
+ generate: 8,
2030
+ // +2 vs 2.5-flash (6→8) — Google: "complex coding cycles and iterations"
2031
+ plan: 7,
2032
+ // +2 vs 2.5-flash (5→7) — "complex iterations" positioning
2033
+ extract: 7,
2034
+ // +1 vs 2.5-flash — sustained-frontier on structured tasks
2035
+ critique: 5
2036
+ // +1 vs 2.5-flash — but still below Sonnet/Opus reasoning floor
2037
+ }
1905
2038
  }
1906
2039
  ];
1907
2040
  var ALIASES = {
package/dist/index.mjs CHANGED
@@ -17,7 +17,7 @@ import {
17
17
  getProfile,
18
18
  profilesByProvider,
19
19
  tryGetProfile
20
- } from "./chunk-7MTHFSNY.mjs";
20
+ } from "./chunk-JQGRWJZO.mjs";
21
21
  import {
22
22
  emitAdvisoryFired,
23
23
  emitCompileDone,
package/dist/profiles.js CHANGED
@@ -859,12 +859,23 @@ var PROFILES_RAW = [
859
859
  // Each entry's pricing/context/cliffs/lowering reflects the template, NOT
860
860
  // provider docs. Verify before promoting status to 'current' (L-049/L-081).
861
861
  {
862
+ // s37 (2026-05-21): UNVERIFIED-AUTO-ONBOARD → verified against
863
+ // ai.google.dev/gemini-api/docs/models/gemini-3-flash-preview +
864
+ // ai.google.dev/gemini-api/docs/pricing. L-081 catches:
865
+ // maxOutputTokens 65_535 → 65_536 (off-by-one)
866
+ // costInputPer1m 0.30 → 0.50 (template-cloned from 2.5-flash; actual is 1.67× more expensive)
867
+ // costOutputPer1m 2.50 → 3.00 (template-cloned; actual 1.2× more expensive)
868
+ // cache discount default 0.25 → 0.10 (10× discount, $0.05/$0.50 per docs)
869
+ // Cliffs inherited from 2.5-flash conservatively. The 8K-context-quality
870
+ // cliff was a 2.5-Flash observation — Google positions Gemini 3 as
871
+ // sustained-frontier-on-long-context; brain evidence will validate/relax.
872
+ // Kept as guard for now.
862
873
  id: "gemini-3-flash-preview",
863
- verifiedAgainstDocs: "UNVERIFIED-AUTO-ONBOARD",
874
+ verifiedAgainstDocs: "2026-05-21",
864
875
  provider: "google",
865
876
  status: "preview",
866
877
  maxContextTokens: 1048576,
867
- maxOutputTokens: 65535,
878
+ maxOutputTokens: 65536,
868
879
  maxTools: 128,
869
880
  parallelToolCalls: true,
870
881
  structuredOutput: "native",
@@ -875,13 +886,13 @@ var PROFILES_RAW = [
875
886
  metric: "input_tokens",
876
887
  threshold: 8e3,
877
888
  action: "downgrade_quality_warning",
878
- reason: "Quality degrades significantly above ~8K context tokens"
889
+ reason: "Inherited from 2.5-flash guard; brain evidence on Gemini 3 long-context quality will validate/relax"
879
890
  },
880
891
  {
881
892
  metric: "tool_count",
882
893
  threshold: 20,
883
894
  action: "drop_to_top_relevant",
884
- reason: "Tool reliability drops above ~20 tools (despite 128 hard limit)"
895
+ reason: "Tool reliability drops above ~20 tools (despite 128 hard limit) \u2014 inherited from Flash family"
885
896
  },
886
897
  {
887
898
  metric: "thinking_with_short_output",
@@ -890,24 +901,22 @@ var PROFILES_RAW = [
890
901
  reason: "Thinking tokens consume maxOutputTokens \u2014 empty response if drained"
891
902
  },
892
903
  {
893
- // s11 trust artifact (2026-05-02): brain showed 5/5 empty rate on
894
- // tt-intelligence/summarize/gemini-2.5-flash with tools offered.
895
- // v1's disable_thinking_for_short_output already fired and didn't
896
- // help — disabling thinking is necessary but not sufficient. Tools
897
- // present + summarize intent confuses Flash into a no-output state
898
- // (likely tool-decision purgatory). Strip tools entirely for this
899
- // archetype on this model.
904
+ // Inherited from gemini-2.5-flash s11 trust artifact. Family-likely
905
+ // failure mode for Flash architecture. Keep preemptively until brain
906
+ // evidence on Gemini 3 specifically.
900
907
  metric: "tool_count",
901
908
  threshold: 1,
902
909
  whenIntent: "summarize",
903
910
  action: "strip_tools",
904
- reason: "Gemini Flash returns empty when summarize intent has tools offered (5/5 empty rate observed in v1 prod 2026-04-19, replayed into v2 brain 2026-04-29)"
911
+ reason: "Inherited from 2.5-flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on 3-flash-preview specifically."
905
912
  }
906
913
  ],
907
- costInputPer1m: 0.3,
908
- costOutputPer1m: 2.5,
914
+ costInputPer1m: 0.5,
915
+ costOutputPer1m: 3,
909
916
  lowering: {
910
917
  ...GOOGLE_LOWERING_BASE,
918
+ // 10× cache discount per Google pricing: $0.05/M cached vs $0.50/M input.
919
+ cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
911
920
  thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
912
921
  },
913
922
  recovery: [
@@ -933,40 +942,45 @@ var PROFILES_RAW = [
933
942
  ],
934
943
  strengths: ["speed", "volume", "classification", "1m_context", "cost"],
935
944
  weaknesses: ["complex_schemas", "large_tool_sets", "high_context_quality"],
936
- notes: "AUTO-ONBOARDED 2026-05-16 from `gemini-2.5-flash`. Pricing, context, cliffs are template-cloned and UNVERIFIED \u2014 confirm against provider docs before promoting status to 'current'.",
937
- // Master plan §6.2 anchor. Tier 0 for hunt (parallel tool throughput
938
- // 15-75 calls/step beats Sonnet L-040), summarize, classify.
945
+ notes: "Verified s37 (2026-05-21) against Google docs. Step-change positioning vs 2.5-flash on agentic loops per Google's release notes (Dec 2025). Pricing 1.67\xD7/1.2\xD7 higher than 2.5-flash; cache discount 10\xD7 (vs 4\xD7 for 2.5). Status=preview until brain evidence accumulates.",
946
+ // Anchored to 2.5-flash archetypePerf as starter, with judgment adjustments
947
+ // for Google's "step-change on agentic" positioning. Brain evidence (zero
948
+ // rows today) will replace these starter values.
939
949
  archetypePerf: {
940
950
  hunt: 9,
941
- // L-040: parallel tool throughput 15-75/step
951
+ // Inherits 2.5-flash L-040 parallel-tool tier; Google positions 3 as agentic-loop upgrade
942
952
  classify: 7,
943
- // brain-validated, 218 rows
953
+ // Inherits 2.5-flash brain-validated tier (218 rows on 2.5)
944
954
  summarize: 7,
945
- // brain-validated; cliff strips tools when present
955
+ // Inherits 2.5-flash; cliff strips tools when present
946
956
  transform: 7,
947
- ask: 7,
948
- generate: 6,
949
- plan: 5,
957
+ ask: 8,
958
+ // +1 vs 2.5-flash — sustained-frontier positioning
959
+ generate: 7,
960
+ // +1 vs 2.5-flash — agentic coding upgrade per Google
961
+ plan: 6,
962
+ // +1 vs 2.5-flash — complex iterations per positioning
950
963
  extract: 6,
951
- // alpha.8 MAX_TOKENS history on structured output
952
- critique: 4
953
- // reasoning shallower than Sonnet/Opus
964
+ critique: 5
965
+ // +1 vs 2.5-flash — but still below Sonnet/Opus reasoning floor
954
966
  }
955
967
  },
956
968
  {
957
- // ── Gemini 2.5 Flash-Lite ──
958
- // Onboarded 2026-05-13 (s22) after the model-release watcher surfaced
959
- // it as a UNREGISTERED + NEW candidate. Released by Google July 2025,
960
- // stable. Positioned BELOW Flash on the cost/perf frontier:
961
- // input $0.10/M (Flash $0.30/M) — 3× cheaper
962
- // output $0.40/M (Flash $2.50/M) — 6× cheaper
963
- // cache $0.01/M — 1/10 of input (vs Flash 0.25 discount)
964
- // Cliffs are HYPOTHESIZED from Flash's known failure modes — Flash-Lite
965
- // is a smaller sibling, so we inherit Flash's cliffs at equal-or-tighter
966
- // thresholds. The brain will validate/relax these as evidence accumulates
967
- // per (archetype, model) tuple. Currently ZERO brain rows for this model.
969
+ // ── Gemini 3.1 Flash-Lite ──
970
+ // Onboarded 2026-05-16 by auto-onboarder; s37 (2026-05-21) verified
971
+ // against ai.google.dev/gemini-api/docs/pricing.
972
+ //
973
+ // L-081 CATCHES (template clone from 2.5-flash-lite was 2.5-3.75× too cheap):
974
+ // costInputPer1m 0.10 → 0.25 (template clone undervalued by 2.5×)
975
+ // costOutputPer1m 0.40 → 1.50 (template clone undervalued by 3.75×)
976
+ //
977
+ // Real 3.1-flash-lite is NOT a cost-equivalent successor to 2.5-flash-lite —
978
+ // it sits between 2.5-flash-lite ($0.10/$0.40) and 2.5-flash ($0.30/$2.50).
979
+ // Cache discount 10× verified ($0.025/M cached vs $0.25/M input).
980
+ //
981
+ // Cliffs are HYPOTHESIZED from 2.5-flash family; brain evidence pending.
968
982
  id: "gemini-3.1-flash-lite",
969
- verifiedAgainstDocs: "UNVERIFIED-AUTO-ONBOARD",
983
+ verifiedAgainstDocs: "2026-05-21",
970
984
  provider: "google",
971
985
  status: "preview",
972
986
  maxContextTokens: 1048576,
@@ -1007,12 +1021,12 @@ var PROFILES_RAW = [
1007
1021
  reason: "Inherited from Flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on Flash-Lite specifically."
1008
1022
  }
1009
1023
  ],
1010
- costInputPer1m: 0.1,
1011
- costOutputPer1m: 0.4,
1024
+ costInputPer1m: 0.25,
1025
+ costOutputPer1m: 1.5,
1012
1026
  lowering: {
1013
1027
  ...GOOGLE_LOWERING_BASE,
1014
- // Cache discount 10× (vs Flash 4×) — Google's spec is $0.01/M cache vs
1015
- // $0.10/M input. Material for repeat-prompt workloads (classify shape).
1028
+ // Cache discount 10× (vs Flash 4×) — Google docs s37: $0.025/M cached vs
1029
+ // $0.25/M input. Material for repeat-prompt workloads (classify shape).
1016
1030
  cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
1017
1031
  thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
1018
1032
  },
@@ -1037,13 +1051,13 @@ var PROFILES_RAW = [
1037
1051
  reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target."
1038
1052
  }
1039
1053
  ],
1040
- strengths: ["lowest_cost", "speed", "volume", "classification", "summarize", "1m_context", "cache_friendly"],
1054
+ strengths: ["low_cost", "speed", "volume", "classification", "summarize", "1m_context", "cache_friendly"],
1041
1055
  weaknesses: ["complex_reasoning", "large_tool_sets", "complex_schemas", "structured_output_unproven", "long_context_quality"],
1042
- notes: "AUTO-ONBOARDED 2026-05-16 from `gemini-2.5-flash-lite`. Pricing, context, cliffs are template-cloned and UNVERIFIED \u2014 confirm against provider docs before promoting status to 'current'.",
1043
- // Tier 3 emergency floor for summarize/classify chains. ZERO brain
1044
- // rows — all values are starter hypotheses anchored to "smaller
1045
- // sibling of Flash, at-or-below Flash perf on every archetype." The
1046
- // first 50 brain rows per archetype will validate or relax these.
1056
+ notes: "Verified s37 (2026-05-21) against Google docs. Sits between 2.5-flash-lite (cheaper) and 2.5-flash (more expensive) on cost frontier; 2.5\xD7 more expensive than initial template-clone. Cliffs hypothesized from Flash family \u2014 brain evidence pending.",
1057
+ // Tier 2-3 floor for summarize/classify chains at the new (verified) price
1058
+ // point. ZERO brain rows — values are starter hypotheses anchored to
1059
+ // "smaller sibling of Flash at higher cost than 2.5-flash-lite." The first
1060
+ // 50 brain rows per archetype will validate or relax these.
1047
1061
  archetypePerf: {
1048
1062
  classify: 6,
1049
1063
  // starter hypothesis — verify (Flash is 7, lite likely ≤)
@@ -1058,6 +1072,125 @@ var PROFILES_RAW = [
1058
1072
  plan: 3,
1059
1073
  critique: 3
1060
1074
  }
1075
+ },
1076
+ // ─────────────────────────────────────────────────────────────────────────
1077
+ // Gemini 3.5 Flash — hand-onboarded s37 (2026-05-21)
1078
+ //
1079
+ // Google positioning ("Most intelligent for sustained frontier performance
1080
+ // on agentic and coding tasks" / "particularly effective for rapid agentic
1081
+ // loops involving complex coding cycles and iterations") suggests this is
1082
+ // the Flash-family upgrade specifically aimed at hunt-shape workloads.
1083
+ // Pricing 5× input / 3.6× output vs 2.5-flash — material cost premium.
1084
+ // archetypePerf adjusted vs 2.5-flash: +1 on ask/extract/critique, +2 on generate/plan
1085
+ // (sustained-frontier positioning); hunt held at 9 inherited from L-040
1086
+ // family parallel-tool tier; brain evidence will validate within 50 rows.
1087
+ //
1088
+ // Cliffs inherited conservatively from 2.5-flash. Google's "sustained
1089
+ // frontier on long-context" positioning suggests the 8K cliff may not
1090
+ // apply to 3.5 — keep as guard until brain evidence shows otherwise.
1091
+ //
1092
+ // Specs verified against:
1093
+ // ai.google.dev/gemini-api/docs/models/gemini-3.5-flash
1094
+ // ai.google.dev/gemini-api/docs/pricing (Standard tier)
1095
+ // ─────────────────────────────────────────────────────────────────────────
1096
+ {
1097
+ id: "gemini-3.5-flash",
1098
+ verifiedAgainstDocs: "2026-05-21",
1099
+ provider: "google",
1100
+ status: "current",
1101
+ maxContextTokens: 1048576,
1102
+ maxOutputTokens: 65536,
1103
+ maxTools: 128,
1104
+ parallelToolCalls: true,
1105
+ structuredOutput: "native",
1106
+ systemPromptMode: "separate",
1107
+ streaming: true,
1108
+ cliffs: [
1109
+ {
1110
+ metric: "input_tokens",
1111
+ threshold: 8e3,
1112
+ action: "downgrade_quality_warning",
1113
+ reason: "Inherited from 2.5-flash guard; Google positions 3.5 as sustained-frontier-on-long-context but brain evidence pending"
1114
+ },
1115
+ {
1116
+ metric: "tool_count",
1117
+ threshold: 20,
1118
+ action: "drop_to_top_relevant",
1119
+ reason: "Inherited from Flash family: tool reliability drops above ~20 (despite 128 hard limit). Validate per (archetype, model) after n\u226520."
1120
+ },
1121
+ {
1122
+ metric: "thinking_with_short_output",
1123
+ threshold: 1,
1124
+ action: "force_thinking_budget_zero",
1125
+ reason: "Thinking mode supported per Google docs; same drain risk as 2.5-flash \u2014 thinking tokens consume maxOutputTokens"
1126
+ },
1127
+ {
1128
+ // Inherited from 2.5-flash s11 trust artifact (5/5 empty rate on
1129
+ // tt-intelligence/summarize/gemini-2.5-flash with tools offered).
1130
+ // Family-likely failure mode for Flash architecture across versions.
1131
+ // Keep preemptively until brain evidence on 3.5-flash specifically.
1132
+ metric: "tool_count",
1133
+ threshold: 1,
1134
+ whenIntent: "summarize",
1135
+ action: "strip_tools",
1136
+ reason: "Inherited from 2.5-flash s11 cliff (kgauto commit 3872832): summarize+tools \u2192 empty response. Preemptive guard until brain evidence on 3.5-flash specifically."
1137
+ }
1138
+ ],
1139
+ costInputPer1m: 1.5,
1140
+ costOutputPer1m: 9,
1141
+ lowering: {
1142
+ ...GOOGLE_LOWERING_BASE,
1143
+ // 10× cache discount per Google pricing: $0.15/M cached vs $1.50/M input.
1144
+ cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
1145
+ thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
1146
+ },
1147
+ recovery: [
1148
+ {
1149
+ signal: "empty_response_after_tool",
1150
+ action: "retry_with_params",
1151
+ retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
1152
+ maxRetries: 1,
1153
+ reason: "Inherited Flash-family pattern: empty after tool result \u2014 retry with thinking off"
1154
+ },
1155
+ {
1156
+ signal: "empty_response",
1157
+ action: "retry_with_params",
1158
+ retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
1159
+ maxRetries: 1,
1160
+ reason: "Empty response \u2014 try with thinking off"
1161
+ },
1162
+ {
1163
+ signal: "malformed_function_call",
1164
+ action: "escalate",
1165
+ reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target"
1166
+ }
1167
+ ],
1168
+ strengths: ["agentic_loops", "coding", "1m_context", "parallel_tools", "thinking_mode", "sustained_frontier"],
1169
+ weaknesses: ["cost_vs_2_5_flash", "no_brain_evidence_yet"],
1170
+ notes: "Hand-onboarded s37 (2026-05-21) verified against Google docs. Stable status; positioned as Flash-family upgrade for agentic loops and coding. 5\xD7/3.6\xD7 more expensive than 2.5-flash but Google claims step-change on sustained frontier work. archetypePerf adjustments are judgment-grounded starter hypotheses \u2014 brain evidence will validate within ~50 rows per archetype.",
1171
+ // Starter hypothesis: anchored to 2.5-flash archetypePerf with +1
1172
+ // adjustments where Google's positioning explicitly supports
1173
+ // (agentic/coding/sustained). Hunt held at 9 inherited from L-040 family
1174
+ // parallel-tool tier. Brain evidence will replace.
1175
+ archetypePerf: {
1176
+ hunt: 9,
1177
+ // Inherited from 2.5-flash L-040 parallel-tool tier; Google positions 3.5 as agentic-loop champion
1178
+ classify: 7,
1179
+ // Inherited from 2.5-flash brain-validated tier (218 rows on 2.5)
1180
+ summarize: 7,
1181
+ // Inherited from 2.5-flash; cliff strips tools when present
1182
+ transform: 7,
1183
+ ask: 8,
1184
+ // +1 vs 2.5-flash — sustained-frontier positioning
1185
+ generate: 8,
1186
+ // +2 vs 2.5-flash (6→8) — Google: "complex coding cycles and iterations"
1187
+ plan: 7,
1188
+ // +2 vs 2.5-flash (5→7) — "complex iterations" positioning
1189
+ extract: 7,
1190
+ // +1 vs 2.5-flash — sustained-frontier on structured tasks
1191
+ critique: 5
1192
+ // +1 vs 2.5-flash — but still below Sonnet/Opus reasoning floor
1193
+ }
1061
1194
  }
1062
1195
  ];
1063
1196
  var ALIASES = {
package/dist/profiles.mjs CHANGED
@@ -6,7 +6,7 @@ import {
6
6
  getProfile,
7
7
  profilesByProvider,
8
8
  tryGetProfile
9
- } from "./chunk-7MTHFSNY.mjs";
9
+ } from "./chunk-JQGRWJZO.mjs";
10
10
  export {
11
11
  ALIASES,
12
12
  _setProfileBrainHook,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@warmdrift/kgauto-compiler",
3
- "version": "2.0.0-alpha.26",
3
+ "version": "2.0.0-alpha.27",
4
4
  "description": "Prompt compiler + central learning brain for multi-model AI apps. Swap models without rewriting prompts.",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",