@warmdrift/kgauto-compiler 2.0.0-alpha.26 → 2.0.0-alpha.27
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-7MTHFSNY.mjs → chunk-JQGRWJZO.mjs} +181 -48
- package/dist/index.js +181 -48
- package/dist/index.mjs +1 -1
- package/dist/profiles.js +181 -48
- package/dist/profiles.mjs +1 -1
- package/package.json +1 -1
|
@@ -829,12 +829,23 @@ var PROFILES_RAW = [
|
|
|
829
829
|
// Each entry's pricing/context/cliffs/lowering reflects the template, NOT
|
|
830
830
|
// provider docs. Verify before promoting status to 'current' (L-049/L-081).
|
|
831
831
|
{
|
|
832
|
+
// s37 (2026-05-21): UNVERIFIED-AUTO-ONBOARD → verified against
|
|
833
|
+
// ai.google.dev/gemini-api/docs/models/gemini-3-flash-preview +
|
|
834
|
+
// ai.google.dev/gemini-api/docs/pricing. L-081 catches:
|
|
835
|
+
// maxOutputTokens 65_535 → 65_536 (off-by-one)
|
|
836
|
+
// costInputPer1m 0.30 → 0.50 (template-cloned from 2.5-flash; actual is 1.67× more expensive)
|
|
837
|
+
// costOutputPer1m 2.50 → 3.00 (template-cloned; actual 1.2× more expensive)
|
|
838
|
+
// cache discount default 0.25 → 0.10 (10× discount, $0.05/$0.50 per docs)
|
|
839
|
+
// Cliffs inherited from 2.5-flash conservatively. The 8K-context-quality
|
|
840
|
+
// cliff was a 2.5-Flash observation — Google positions Gemini 3 as
|
|
841
|
+
// sustained-frontier-on-long-context; brain evidence will validate/relax.
|
|
842
|
+
// Kept as guard for now.
|
|
832
843
|
id: "gemini-3-flash-preview",
|
|
833
|
-
verifiedAgainstDocs: "
|
|
844
|
+
verifiedAgainstDocs: "2026-05-21",
|
|
834
845
|
provider: "google",
|
|
835
846
|
status: "preview",
|
|
836
847
|
maxContextTokens: 1048576,
|
|
837
|
-
maxOutputTokens:
|
|
848
|
+
maxOutputTokens: 65536,
|
|
838
849
|
maxTools: 128,
|
|
839
850
|
parallelToolCalls: true,
|
|
840
851
|
structuredOutput: "native",
|
|
@@ -845,13 +856,13 @@ var PROFILES_RAW = [
|
|
|
845
856
|
metric: "input_tokens",
|
|
846
857
|
threshold: 8e3,
|
|
847
858
|
action: "downgrade_quality_warning",
|
|
848
|
-
reason: "
|
|
859
|
+
reason: "Inherited from 2.5-flash guard; brain evidence on Gemini 3 long-context quality will validate/relax"
|
|
849
860
|
},
|
|
850
861
|
{
|
|
851
862
|
metric: "tool_count",
|
|
852
863
|
threshold: 20,
|
|
853
864
|
action: "drop_to_top_relevant",
|
|
854
|
-
reason: "Tool reliability drops above ~20 tools (despite 128 hard limit)"
|
|
865
|
+
reason: "Tool reliability drops above ~20 tools (despite 128 hard limit) \u2014 inherited from Flash family"
|
|
855
866
|
},
|
|
856
867
|
{
|
|
857
868
|
metric: "thinking_with_short_output",
|
|
@@ -860,24 +871,22 @@ var PROFILES_RAW = [
|
|
|
860
871
|
reason: "Thinking tokens consume maxOutputTokens \u2014 empty response if drained"
|
|
861
872
|
},
|
|
862
873
|
{
|
|
863
|
-
//
|
|
864
|
-
//
|
|
865
|
-
//
|
|
866
|
-
// help — disabling thinking is necessary but not sufficient. Tools
|
|
867
|
-
// present + summarize intent confuses Flash into a no-output state
|
|
868
|
-
// (likely tool-decision purgatory). Strip tools entirely for this
|
|
869
|
-
// archetype on this model.
|
|
874
|
+
// Inherited from gemini-2.5-flash s11 trust artifact. Family-likely
|
|
875
|
+
// failure mode for Flash architecture. Keep preemptively until brain
|
|
876
|
+
// evidence on Gemini 3 specifically.
|
|
870
877
|
metric: "tool_count",
|
|
871
878
|
threshold: 1,
|
|
872
879
|
whenIntent: "summarize",
|
|
873
880
|
action: "strip_tools",
|
|
874
|
-
reason: "
|
|
881
|
+
reason: "Inherited from 2.5-flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on 3-flash-preview specifically."
|
|
875
882
|
}
|
|
876
883
|
],
|
|
877
|
-
costInputPer1m: 0.
|
|
878
|
-
costOutputPer1m:
|
|
884
|
+
costInputPer1m: 0.5,
|
|
885
|
+
costOutputPer1m: 3,
|
|
879
886
|
lowering: {
|
|
880
887
|
...GOOGLE_LOWERING_BASE,
|
|
888
|
+
// 10× cache discount per Google pricing: $0.05/M cached vs $0.50/M input.
|
|
889
|
+
cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
|
|
881
890
|
thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
|
|
882
891
|
},
|
|
883
892
|
recovery: [
|
|
@@ -903,40 +912,45 @@ var PROFILES_RAW = [
|
|
|
903
912
|
],
|
|
904
913
|
strengths: ["speed", "volume", "classification", "1m_context", "cost"],
|
|
905
914
|
weaknesses: ["complex_schemas", "large_tool_sets", "high_context_quality"],
|
|
906
|
-
notes: "
|
|
907
|
-
//
|
|
908
|
-
//
|
|
915
|
+
notes: "Verified s37 (2026-05-21) against Google docs. Step-change positioning vs 2.5-flash on agentic loops per Google's release notes (Dec 2025). Pricing 1.67\xD7/1.2\xD7 higher than 2.5-flash; cache discount 10\xD7 (vs 4\xD7 for 2.5). Status=preview until brain evidence accumulates.",
|
|
916
|
+
// Anchored to 2.5-flash archetypePerf as starter, with judgment adjustments
|
|
917
|
+
// for Google's "step-change on agentic" positioning. Brain evidence (zero
|
|
918
|
+
// rows today) will replace these starter values.
|
|
909
919
|
archetypePerf: {
|
|
910
920
|
hunt: 9,
|
|
911
|
-
// L-040
|
|
921
|
+
// Inherits 2.5-flash L-040 parallel-tool tier; Google positions 3 as agentic-loop upgrade
|
|
912
922
|
classify: 7,
|
|
913
|
-
// brain-validated
|
|
923
|
+
// Inherits 2.5-flash brain-validated tier (218 rows on 2.5)
|
|
914
924
|
summarize: 7,
|
|
915
|
-
//
|
|
925
|
+
// Inherits 2.5-flash; cliff strips tools when present
|
|
916
926
|
transform: 7,
|
|
917
|
-
ask:
|
|
918
|
-
|
|
919
|
-
|
|
927
|
+
ask: 8,
|
|
928
|
+
// +1 vs 2.5-flash — sustained-frontier positioning
|
|
929
|
+
generate: 7,
|
|
930
|
+
// +1 vs 2.5-flash — agentic coding upgrade per Google
|
|
931
|
+
plan: 6,
|
|
932
|
+
// +1 vs 2.5-flash — complex iterations per positioning
|
|
920
933
|
extract: 6,
|
|
921
|
-
|
|
922
|
-
|
|
923
|
-
// reasoning shallower than Sonnet/Opus
|
|
934
|
+
critique: 5
|
|
935
|
+
// +1 vs 2.5-flash — but still below Sonnet/Opus reasoning floor
|
|
924
936
|
}
|
|
925
937
|
},
|
|
926
938
|
{
|
|
927
|
-
// ── Gemini
|
|
928
|
-
// Onboarded 2026-05-
|
|
929
|
-
//
|
|
930
|
-
//
|
|
931
|
-
//
|
|
932
|
-
//
|
|
933
|
-
//
|
|
934
|
-
//
|
|
935
|
-
//
|
|
936
|
-
//
|
|
937
|
-
//
|
|
939
|
+
// ── Gemini 3.1 Flash-Lite ──
|
|
940
|
+
// Onboarded 2026-05-16 by auto-onboarder; s37 (2026-05-21) verified
|
|
941
|
+
// against ai.google.dev/gemini-api/docs/pricing.
|
|
942
|
+
//
|
|
943
|
+
// L-081 CATCHES (template clone from 2.5-flash-lite was 2.5-3.75× too cheap):
|
|
944
|
+
// costInputPer1m 0.10 → 0.25 (template clone undervalued by 2.5×)
|
|
945
|
+
// costOutputPer1m 0.40 → 1.50 (template clone undervalued by 3.75×)
|
|
946
|
+
//
|
|
947
|
+
// Real 3.1-flash-lite is NOT a cost-equivalent successor to 2.5-flash-lite —
|
|
948
|
+
// it sits between 2.5-flash-lite ($0.10/$0.40) and 2.5-flash ($0.30/$2.50).
|
|
949
|
+
// Cache discount 10× verified ($0.025/M cached vs $0.25/M input).
|
|
950
|
+
//
|
|
951
|
+
// Cliffs are HYPOTHESIZED from 2.5-flash family; brain evidence pending.
|
|
938
952
|
id: "gemini-3.1-flash-lite",
|
|
939
|
-
verifiedAgainstDocs: "
|
|
953
|
+
verifiedAgainstDocs: "2026-05-21",
|
|
940
954
|
provider: "google",
|
|
941
955
|
status: "preview",
|
|
942
956
|
maxContextTokens: 1048576,
|
|
@@ -977,12 +991,12 @@ var PROFILES_RAW = [
|
|
|
977
991
|
reason: "Inherited from Flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on Flash-Lite specifically."
|
|
978
992
|
}
|
|
979
993
|
],
|
|
980
|
-
costInputPer1m: 0.
|
|
981
|
-
costOutputPer1m:
|
|
994
|
+
costInputPer1m: 0.25,
|
|
995
|
+
costOutputPer1m: 1.5,
|
|
982
996
|
lowering: {
|
|
983
997
|
...GOOGLE_LOWERING_BASE,
|
|
984
|
-
// Cache discount 10× (vs Flash 4×) — Google
|
|
985
|
-
// $0.
|
|
998
|
+
// Cache discount 10× (vs Flash 4×) — Google docs s37: $0.025/M cached vs
|
|
999
|
+
// $0.25/M input. Material for repeat-prompt workloads (classify shape).
|
|
986
1000
|
cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
|
|
987
1001
|
thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
|
|
988
1002
|
},
|
|
@@ -1007,13 +1021,13 @@ var PROFILES_RAW = [
|
|
|
1007
1021
|
reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target."
|
|
1008
1022
|
}
|
|
1009
1023
|
],
|
|
1010
|
-
strengths: ["
|
|
1024
|
+
strengths: ["low_cost", "speed", "volume", "classification", "summarize", "1m_context", "cache_friendly"],
|
|
1011
1025
|
weaknesses: ["complex_reasoning", "large_tool_sets", "complex_schemas", "structured_output_unproven", "long_context_quality"],
|
|
1012
|
-
notes: "
|
|
1013
|
-
// Tier 3
|
|
1014
|
-
// rows —
|
|
1015
|
-
// sibling of Flash
|
|
1016
|
-
//
|
|
1026
|
+
notes: "Verified s37 (2026-05-21) against Google docs. Sits between 2.5-flash-lite (cheaper) and 2.5-flash (more expensive) on cost frontier; 2.5\xD7 more expensive than initial template-clone. Cliffs hypothesized from Flash family \u2014 brain evidence pending.",
|
|
1027
|
+
// Tier 2-3 floor for summarize/classify chains at the new (verified) price
|
|
1028
|
+
// point. ZERO brain rows — values are starter hypotheses anchored to
|
|
1029
|
+
// "smaller sibling of Flash at higher cost than 2.5-flash-lite." The first
|
|
1030
|
+
// 50 brain rows per archetype will validate or relax these.
|
|
1017
1031
|
archetypePerf: {
|
|
1018
1032
|
classify: 6,
|
|
1019
1033
|
// starter hypothesis — verify (Flash is 7, lite likely ≤)
|
|
@@ -1028,6 +1042,125 @@ var PROFILES_RAW = [
|
|
|
1028
1042
|
plan: 3,
|
|
1029
1043
|
critique: 3
|
|
1030
1044
|
}
|
|
1045
|
+
},
|
|
1046
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
1047
|
+
// Gemini 3.5 Flash — hand-onboarded s37 (2026-05-21)
|
|
1048
|
+
//
|
|
1049
|
+
// Google positioning ("Most intelligent for sustained frontier performance
|
|
1050
|
+
// on agentic and coding tasks" / "particularly effective for rapid agentic
|
|
1051
|
+
// loops involving complex coding cycles and iterations") suggests this is
|
|
1052
|
+
// the Flash-family upgrade specifically aimed at hunt-shape workloads.
|
|
1053
|
+
// Pricing 5× input / 3.6× output vs 2.5-flash — material cost premium.
|
|
1054
|
+
// archetypePerf adjusted vs 2.5-flash: +1 ask/extract/critique, +2 generate/plan
|
|
1055
|
+
// (sustained-frontier positioning); hunt held at 9 inherited from L-040
|
|
1056
|
+
// family parallel-tool tier; brain evidence will validate within 50 rows.
|
|
1057
|
+
//
|
|
1058
|
+
// Cliffs inherited conservatively from 2.5-flash. Google's "sustained
|
|
1059
|
+
// frontier on long-context" positioning suggests the 8K cliff may not
|
|
1060
|
+
// apply to 3.5 — keep as guard until brain evidence shows otherwise.
|
|
1061
|
+
//
|
|
1062
|
+
// Specs verified against:
|
|
1063
|
+
// ai.google.dev/gemini-api/docs/models/gemini-3.5-flash
|
|
1064
|
+
// ai.google.dev/gemini-api/docs/pricing (Standard tier)
|
|
1065
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
1066
|
+
{
|
|
1067
|
+
id: "gemini-3.5-flash",
|
|
1068
|
+
verifiedAgainstDocs: "2026-05-21",
|
|
1069
|
+
provider: "google",
|
|
1070
|
+
status: "current",
|
|
1071
|
+
maxContextTokens: 1048576,
|
|
1072
|
+
maxOutputTokens: 65536,
|
|
1073
|
+
maxTools: 128,
|
|
1074
|
+
parallelToolCalls: true,
|
|
1075
|
+
structuredOutput: "native",
|
|
1076
|
+
systemPromptMode: "separate",
|
|
1077
|
+
streaming: true,
|
|
1078
|
+
cliffs: [
|
|
1079
|
+
{
|
|
1080
|
+
metric: "input_tokens",
|
|
1081
|
+
threshold: 8e3,
|
|
1082
|
+
action: "downgrade_quality_warning",
|
|
1083
|
+
reason: "Inherited from 2.5-flash guard; Google positions 3.5 as sustained-frontier-on-long-context but brain evidence pending"
|
|
1084
|
+
},
|
|
1085
|
+
{
|
|
1086
|
+
metric: "tool_count",
|
|
1087
|
+
threshold: 20,
|
|
1088
|
+
action: "drop_to_top_relevant",
|
|
1089
|
+
reason: "Inherited from Flash family: tool reliability drops above ~20 (despite 128 hard limit). Validate per (archetype, model) after n\u226520."
|
|
1090
|
+
},
|
|
1091
|
+
{
|
|
1092
|
+
metric: "thinking_with_short_output",
|
|
1093
|
+
threshold: 1,
|
|
1094
|
+
action: "force_thinking_budget_zero",
|
|
1095
|
+
reason: "Thinking mode supported per Google docs; same drain risk as 2.5-flash \u2014 thinking tokens consume maxOutputTokens"
|
|
1096
|
+
},
|
|
1097
|
+
{
|
|
1098
|
+
// Inherited from 2.5-flash s11 trust artifact (5/5 empty rate on
|
|
1099
|
+
// tt-intelligence/summarize/gemini-2.5-flash with tools offered).
|
|
1100
|
+
// Family-likely failure mode for Flash architecture across versions.
|
|
1101
|
+
// Keep preemptively until brain evidence on 3.5-flash specifically.
|
|
1102
|
+
metric: "tool_count",
|
|
1103
|
+
threshold: 1,
|
|
1104
|
+
whenIntent: "summarize",
|
|
1105
|
+
action: "strip_tools",
|
|
1106
|
+
reason: "Inherited from 2.5-flash s11 cliff (kgauto commit 3872832): summarize+tools \u2192 empty response. Preemptive guard until brain evidence on 3.5-flash specifically."
|
|
1107
|
+
}
|
|
1108
|
+
],
|
|
1109
|
+
costInputPer1m: 1.5,
|
|
1110
|
+
costOutputPer1m: 9,
|
|
1111
|
+
lowering: {
|
|
1112
|
+
...GOOGLE_LOWERING_BASE,
|
|
1113
|
+
// 10× cache discount per Google pricing: $0.15/M cached vs $1.50/M input.
|
|
1114
|
+
cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
|
|
1115
|
+
thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
|
|
1116
|
+
},
|
|
1117
|
+
recovery: [
|
|
1118
|
+
{
|
|
1119
|
+
signal: "empty_response_after_tool",
|
|
1120
|
+
action: "retry_with_params",
|
|
1121
|
+
retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
|
|
1122
|
+
maxRetries: 1,
|
|
1123
|
+
reason: "Inherited Flash-family pattern: empty after tool result \u2014 retry with thinking off"
|
|
1124
|
+
},
|
|
1125
|
+
{
|
|
1126
|
+
signal: "empty_response",
|
|
1127
|
+
action: "retry_with_params",
|
|
1128
|
+
retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
|
|
1129
|
+
maxRetries: 1,
|
|
1130
|
+
reason: "Empty response \u2014 try with thinking off"
|
|
1131
|
+
},
|
|
1132
|
+
{
|
|
1133
|
+
signal: "malformed_function_call",
|
|
1134
|
+
action: "escalate",
|
|
1135
|
+
reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target"
|
|
1136
|
+
}
|
|
1137
|
+
],
|
|
1138
|
+
strengths: ["agentic_loops", "coding", "1m_context", "parallel_tools", "thinking_mode", "sustained_frontier"],
|
|
1139
|
+
weaknesses: ["cost_vs_2_5_flash", "no_brain_evidence_yet"],
|
|
1140
|
+
notes: "Hand-onboarded s37 (2026-05-21) verified against Google docs. Stable status; positioned as Flash-family upgrade for agentic loops and coding. 5\xD7/3.6\xD7 more expensive than 2.5-flash but Google claims step-change on sustained frontier work. archetypePerf adjustments are judgment-grounded starter hypotheses \u2014 brain evidence will validate within ~50 rows per archetype.",
|
|
1141
|
+
// Starter hypothesis: anchored to 2.5-flash archetypePerf with +1
|
|
1142
|
+
// adjustments where Google's positioning explicitly supports
|
|
1143
|
+
// (agentic/coding/sustained). Hunt held at 9 inherited from L-040 family
|
|
1144
|
+
// parallel-tool tier. Brain evidence will replace.
|
|
1145
|
+
archetypePerf: {
|
|
1146
|
+
hunt: 9,
|
|
1147
|
+
// Inherited from 2.5-flash L-040 parallel-tool tier; Google positions 3.5 as agentic-loop champion
|
|
1148
|
+
classify: 7,
|
|
1149
|
+
// Inherited from 2.5-flash brain-validated tier (218 rows on 2.5)
|
|
1150
|
+
summarize: 7,
|
|
1151
|
+
// Inherited from 2.5-flash; cliff strips tools when present
|
|
1152
|
+
transform: 7,
|
|
1153
|
+
ask: 8,
|
|
1154
|
+
// +1 vs 2.5-flash — sustained-frontier positioning
|
|
1155
|
+
generate: 8,
|
|
1156
|
+
// +2 vs 2.5-flash (6→8) — Google: "complex coding cycles and iterations"
|
|
1157
|
+
plan: 7,
|
|
1158
|
+
// +2 vs 2.5-flash (5→7) — "complex iterations" positioning
|
|
1159
|
+
extract: 7,
|
|
1160
|
+
// +1 vs 2.5-flash — sustained-frontier on structured tasks
|
|
1161
|
+
critique: 5
|
|
1162
|
+
// +1 vs 2.5-flash — but still below Sonnet/Opus reasoning floor
|
|
1163
|
+
}
|
|
1031
1164
|
}
|
|
1032
1165
|
];
|
|
1033
1166
|
var ALIASES = {
|
package/dist/index.js
CHANGED
|
@@ -1703,12 +1703,23 @@ var PROFILES_RAW = [
|
|
|
1703
1703
|
// Each entry's pricing/context/cliffs/lowering reflects the template, NOT
|
|
1704
1704
|
// provider docs. Verify before promoting status to 'current' (L-049/L-081).
|
|
1705
1705
|
{
|
|
1706
|
+
// s37 (2026-05-21): UNVERIFIED-AUTO-ONBOARD → verified against
|
|
1707
|
+
// ai.google.dev/gemini-api/docs/models/gemini-3-flash-preview +
|
|
1708
|
+
// ai.google.dev/gemini-api/docs/pricing. L-081 catches:
|
|
1709
|
+
// maxOutputTokens 65_535 → 65_536 (off-by-one)
|
|
1710
|
+
// costInputPer1m 0.30 → 0.50 (template-cloned from 2.5-flash; actual is 1.67× more expensive)
|
|
1711
|
+
// costOutputPer1m 2.50 → 3.00 (template-cloned; actual 1.2× more expensive)
|
|
1712
|
+
// cache discount default 0.25 → 0.10 (10× discount, $0.05/$0.50 per docs)
|
|
1713
|
+
// Cliffs inherited from 2.5-flash conservatively. The 8K-context-quality
|
|
1714
|
+
// cliff was a 2.5-Flash observation — Google positions Gemini 3 as
|
|
1715
|
+
// sustained-frontier-on-long-context; brain evidence will validate/relax.
|
|
1716
|
+
// Kept as guard for now.
|
|
1706
1717
|
id: "gemini-3-flash-preview",
|
|
1707
|
-
verifiedAgainstDocs: "
|
|
1718
|
+
verifiedAgainstDocs: "2026-05-21",
|
|
1708
1719
|
provider: "google",
|
|
1709
1720
|
status: "preview",
|
|
1710
1721
|
maxContextTokens: 1048576,
|
|
1711
|
-
maxOutputTokens:
|
|
1722
|
+
maxOutputTokens: 65536,
|
|
1712
1723
|
maxTools: 128,
|
|
1713
1724
|
parallelToolCalls: true,
|
|
1714
1725
|
structuredOutput: "native",
|
|
@@ -1719,13 +1730,13 @@ var PROFILES_RAW = [
|
|
|
1719
1730
|
metric: "input_tokens",
|
|
1720
1731
|
threshold: 8e3,
|
|
1721
1732
|
action: "downgrade_quality_warning",
|
|
1722
|
-
reason: "
|
|
1733
|
+
reason: "Inherited from 2.5-flash guard; brain evidence on Gemini 3 long-context quality will validate/relax"
|
|
1723
1734
|
},
|
|
1724
1735
|
{
|
|
1725
1736
|
metric: "tool_count",
|
|
1726
1737
|
threshold: 20,
|
|
1727
1738
|
action: "drop_to_top_relevant",
|
|
1728
|
-
reason: "Tool reliability drops above ~20 tools (despite 128 hard limit)"
|
|
1739
|
+
reason: "Tool reliability drops above ~20 tools (despite 128 hard limit) \u2014 inherited from Flash family"
|
|
1729
1740
|
},
|
|
1730
1741
|
{
|
|
1731
1742
|
metric: "thinking_with_short_output",
|
|
@@ -1734,24 +1745,22 @@ var PROFILES_RAW = [
|
|
|
1734
1745
|
reason: "Thinking tokens consume maxOutputTokens \u2014 empty response if drained"
|
|
1735
1746
|
},
|
|
1736
1747
|
{
|
|
1737
|
-
//
|
|
1738
|
-
//
|
|
1739
|
-
//
|
|
1740
|
-
// help — disabling thinking is necessary but not sufficient. Tools
|
|
1741
|
-
// present + summarize intent confuses Flash into a no-output state
|
|
1742
|
-
// (likely tool-decision purgatory). Strip tools entirely for this
|
|
1743
|
-
// archetype on this model.
|
|
1748
|
+
// Inherited from gemini-2.5-flash s11 trust artifact. Family-likely
|
|
1749
|
+
// failure mode for Flash architecture. Keep preemptively until brain
|
|
1750
|
+
// evidence on Gemini 3 specifically.
|
|
1744
1751
|
metric: "tool_count",
|
|
1745
1752
|
threshold: 1,
|
|
1746
1753
|
whenIntent: "summarize",
|
|
1747
1754
|
action: "strip_tools",
|
|
1748
|
-
reason: "
|
|
1755
|
+
reason: "Inherited from 2.5-flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on 3-flash-preview specifically."
|
|
1749
1756
|
}
|
|
1750
1757
|
],
|
|
1751
|
-
costInputPer1m: 0.
|
|
1752
|
-
costOutputPer1m:
|
|
1758
|
+
costInputPer1m: 0.5,
|
|
1759
|
+
costOutputPer1m: 3,
|
|
1753
1760
|
lowering: {
|
|
1754
1761
|
...GOOGLE_LOWERING_BASE,
|
|
1762
|
+
// 10× cache discount per Google pricing: $0.05/M cached vs $0.50/M input.
|
|
1763
|
+
cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
|
|
1755
1764
|
thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
|
|
1756
1765
|
},
|
|
1757
1766
|
recovery: [
|
|
@@ -1777,40 +1786,45 @@ var PROFILES_RAW = [
|
|
|
1777
1786
|
],
|
|
1778
1787
|
strengths: ["speed", "volume", "classification", "1m_context", "cost"],
|
|
1779
1788
|
weaknesses: ["complex_schemas", "large_tool_sets", "high_context_quality"],
|
|
1780
|
-
notes: "
|
|
1781
|
-
//
|
|
1782
|
-
//
|
|
1789
|
+
notes: "Verified s37 (2026-05-21) against Google docs. Step-change positioning vs 2.5-flash on agentic loops per Google's release notes (Dec 2025). Pricing 1.67\xD7/1.2\xD7 higher than 2.5-flash; cache discount 10\xD7 (vs 4\xD7 for 2.5). Status=preview until brain evidence accumulates.",
|
|
1790
|
+
// Anchored to 2.5-flash archetypePerf as starter, with judgment adjustments
|
|
1791
|
+
// for Google's "step-change on agentic" positioning. Brain evidence (zero
|
|
1792
|
+
// rows today) will replace these starter values.
|
|
1783
1793
|
archetypePerf: {
|
|
1784
1794
|
hunt: 9,
|
|
1785
|
-
// L-040
|
|
1795
|
+
// Inherits 2.5-flash L-040 parallel-tool tier; Google positions 3 as agentic-loop upgrade
|
|
1786
1796
|
classify: 7,
|
|
1787
|
-
// brain-validated
|
|
1797
|
+
// Inherits 2.5-flash brain-validated tier (218 rows on 2.5)
|
|
1788
1798
|
summarize: 7,
|
|
1789
|
-
//
|
|
1799
|
+
// Inherits 2.5-flash; cliff strips tools when present
|
|
1790
1800
|
transform: 7,
|
|
1791
|
-
ask:
|
|
1792
|
-
|
|
1793
|
-
|
|
1801
|
+
ask: 8,
|
|
1802
|
+
// +1 vs 2.5-flash — sustained-frontier positioning
|
|
1803
|
+
generate: 7,
|
|
1804
|
+
// +1 vs 2.5-flash — agentic coding upgrade per Google
|
|
1805
|
+
plan: 6,
|
|
1806
|
+
// +1 vs 2.5-flash — complex iterations per positioning
|
|
1794
1807
|
extract: 6,
|
|
1795
|
-
|
|
1796
|
-
|
|
1797
|
-
// reasoning shallower than Sonnet/Opus
|
|
1808
|
+
critique: 5
|
|
1809
|
+
// +1 vs 2.5-flash — but still below Sonnet/Opus reasoning floor
|
|
1798
1810
|
}
|
|
1799
1811
|
},
|
|
1800
1812
|
{
|
|
1801
|
-
// ── Gemini
|
|
1802
|
-
// Onboarded 2026-05-
|
|
1803
|
-
//
|
|
1804
|
-
//
|
|
1805
|
-
//
|
|
1806
|
-
//
|
|
1807
|
-
//
|
|
1808
|
-
//
|
|
1809
|
-
//
|
|
1810
|
-
//
|
|
1811
|
-
//
|
|
1813
|
+
// ── Gemini 3.1 Flash-Lite ──
|
|
1814
|
+
// Onboarded 2026-05-16 by auto-onboarder; s37 (2026-05-21) verified
|
|
1815
|
+
// against ai.google.dev/gemini-api/docs/pricing.
|
|
1816
|
+
//
|
|
1817
|
+
// L-081 CATCHES (template clone from 2.5-flash-lite was 2.5-3.75× too cheap):
|
|
1818
|
+
// costInputPer1m 0.10 → 0.25 (template clone undervalued by 2.5×)
|
|
1819
|
+
// costOutputPer1m 0.40 → 1.50 (template clone undervalued by 3.75×)
|
|
1820
|
+
//
|
|
1821
|
+
// Real 3.1-flash-lite is NOT a cost-equivalent successor to 2.5-flash-lite —
|
|
1822
|
+
// it sits between 2.5-flash-lite ($0.10/$0.40) and 2.5-flash ($0.30/$2.50).
|
|
1823
|
+
// Cache discount 10× verified ($0.025/M cached vs $0.25/M input).
|
|
1824
|
+
//
|
|
1825
|
+
// Cliffs are HYPOTHESIZED from 2.5-flash family; brain evidence pending.
|
|
1812
1826
|
id: "gemini-3.1-flash-lite",
|
|
1813
|
-
verifiedAgainstDocs: "
|
|
1827
|
+
verifiedAgainstDocs: "2026-05-21",
|
|
1814
1828
|
provider: "google",
|
|
1815
1829
|
status: "preview",
|
|
1816
1830
|
maxContextTokens: 1048576,
|
|
@@ -1851,12 +1865,12 @@ var PROFILES_RAW = [
|
|
|
1851
1865
|
reason: "Inherited from Flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on Flash-Lite specifically."
|
|
1852
1866
|
}
|
|
1853
1867
|
],
|
|
1854
|
-
costInputPer1m: 0.
|
|
1855
|
-
costOutputPer1m:
|
|
1868
|
+
costInputPer1m: 0.25,
|
|
1869
|
+
costOutputPer1m: 1.5,
|
|
1856
1870
|
lowering: {
|
|
1857
1871
|
...GOOGLE_LOWERING_BASE,
|
|
1858
|
-
// Cache discount 10× (vs Flash 4×) — Google
|
|
1859
|
-
// $0.
|
|
1872
|
+
// Cache discount 10× (vs Flash 4×) — Google docs s37: $0.025/M cached vs
|
|
1873
|
+
// $0.25/M input. Material for repeat-prompt workloads (classify shape).
|
|
1860
1874
|
cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
|
|
1861
1875
|
thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
|
|
1862
1876
|
},
|
|
@@ -1881,13 +1895,13 @@ var PROFILES_RAW = [
|
|
|
1881
1895
|
reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target."
|
|
1882
1896
|
}
|
|
1883
1897
|
],
|
|
1884
|
-
strengths: ["
|
|
1898
|
+
strengths: ["low_cost", "speed", "volume", "classification", "summarize", "1m_context", "cache_friendly"],
|
|
1885
1899
|
weaknesses: ["complex_reasoning", "large_tool_sets", "complex_schemas", "structured_output_unproven", "long_context_quality"],
|
|
1886
|
-
notes: "
|
|
1887
|
-
// Tier 3
|
|
1888
|
-
// rows —
|
|
1889
|
-
// sibling of Flash
|
|
1890
|
-
//
|
|
1900
|
+
notes: "Verified s37 (2026-05-21) against Google docs. Sits between 2.5-flash-lite (cheaper) and 2.5-flash (more expensive) on cost frontier; 2.5\xD7 more expensive than initial template-clone. Cliffs hypothesized from Flash family \u2014 brain evidence pending.",
|
|
1901
|
+
// Tier 2-3 floor for summarize/classify chains at the new (verified) price
|
|
1902
|
+
// point. ZERO brain rows — values are starter hypotheses anchored to
|
|
1903
|
+
// "smaller sibling of Flash at higher cost than 2.5-flash-lite." The first
|
|
1904
|
+
// 50 brain rows per archetype will validate or relax these.
|
|
1891
1905
|
archetypePerf: {
|
|
1892
1906
|
classify: 6,
|
|
1893
1907
|
// starter hypothesis — verify (Flash is 7, lite likely ≤)
|
|
@@ -1902,6 +1916,125 @@ var PROFILES_RAW = [
|
|
|
1902
1916
|
plan: 3,
|
|
1903
1917
|
critique: 3
|
|
1904
1918
|
}
|
|
1919
|
+
},
|
|
1920
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
1921
|
+
// Gemini 3.5 Flash — hand-onboarded s37 (2026-05-21)
|
|
1922
|
+
//
|
|
1923
|
+
// Google positioning ("Most intelligent for sustained frontier performance
|
|
1924
|
+
// on agentic and coding tasks" / "particularly effective for rapid agentic
|
|
1925
|
+
// loops involving complex coding cycles and iterations") suggests this is
|
|
1926
|
+
// the Flash-family upgrade specifically aimed at hunt-shape workloads.
|
|
1927
|
+
// Pricing 5× input / 3.6× output vs 2.5-flash — material cost premium.
|
|
1928
|
+
// archetypePerf adjusted vs 2.5-flash: +1 ask/extract/critique, +2 generate/plan
|
|
1929
|
+
// (sustained-frontier positioning); hunt held at 9 inherited from L-040
|
|
1930
|
+
// family parallel-tool tier; brain evidence will validate within 50 rows.
|
|
1931
|
+
//
|
|
1932
|
+
// Cliffs inherited conservatively from 2.5-flash. Google's "sustained
|
|
1933
|
+
// frontier on long-context" positioning suggests the 8K cliff may not
|
|
1934
|
+
// apply to 3.5 — keep as guard until brain evidence shows otherwise.
|
|
1935
|
+
//
|
|
1936
|
+
// Specs verified against:
|
|
1937
|
+
// ai.google.dev/gemini-api/docs/models/gemini-3.5-flash
|
|
1938
|
+
// ai.google.dev/gemini-api/docs/pricing (Standard tier)
|
|
1939
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
1940
|
+
{
|
|
1941
|
+
id: "gemini-3.5-flash",
|
|
1942
|
+
verifiedAgainstDocs: "2026-05-21",
|
|
1943
|
+
provider: "google",
|
|
1944
|
+
status: "current",
|
|
1945
|
+
maxContextTokens: 1048576,
|
|
1946
|
+
maxOutputTokens: 65536,
|
|
1947
|
+
maxTools: 128,
|
|
1948
|
+
parallelToolCalls: true,
|
|
1949
|
+
structuredOutput: "native",
|
|
1950
|
+
systemPromptMode: "separate",
|
|
1951
|
+
streaming: true,
|
|
1952
|
+
cliffs: [
|
|
1953
|
+
{
|
|
1954
|
+
metric: "input_tokens",
|
|
1955
|
+
threshold: 8e3,
|
|
1956
|
+
action: "downgrade_quality_warning",
|
|
1957
|
+
reason: "Inherited from 2.5-flash guard; Google positions 3.5 as sustained-frontier-on-long-context but brain evidence pending"
|
|
1958
|
+
},
|
|
1959
|
+
{
|
|
1960
|
+
metric: "tool_count",
|
|
1961
|
+
threshold: 20,
|
|
1962
|
+
action: "drop_to_top_relevant",
|
|
1963
|
+
reason: "Inherited from Flash family: tool reliability drops above ~20 (despite 128 hard limit). Validate per (archetype, model) after n\u226520."
|
|
1964
|
+
},
|
|
1965
|
+
{
|
|
1966
|
+
metric: "thinking_with_short_output",
|
|
1967
|
+
threshold: 1,
|
|
1968
|
+
action: "force_thinking_budget_zero",
|
|
1969
|
+
reason: "Thinking mode supported per Google docs; same drain risk as 2.5-flash \u2014 thinking tokens consume maxOutputTokens"
|
|
1970
|
+
},
|
|
1971
|
+
{
|
|
1972
|
+
// Inherited from 2.5-flash s11 trust artifact (5/5 empty rate on
|
|
1973
|
+
// tt-intelligence/summarize/gemini-2.5-flash with tools offered).
|
|
1974
|
+
// Family-likely failure mode for Flash architecture across versions.
|
|
1975
|
+
// Keep preemptively until brain evidence on 3.5-flash specifically.
|
|
1976
|
+
metric: "tool_count",
|
|
1977
|
+
threshold: 1,
|
|
1978
|
+
whenIntent: "summarize",
|
|
1979
|
+
action: "strip_tools",
|
|
1980
|
+
reason: "Inherited from 2.5-flash s11 cliff (kgauto commit 3872832): summarize+tools \u2192 empty response. Preemptive guard until brain evidence on 3.5-flash specifically."
|
|
1981
|
+
}
|
|
1982
|
+
],
|
|
1983
|
+
costInputPer1m: 1.5,
|
|
1984
|
+
costOutputPer1m: 9,
|
|
1985
|
+
lowering: {
|
|
1986
|
+
...GOOGLE_LOWERING_BASE,
|
|
1987
|
+
// 10× cache discount per Google pricing: $0.15/M cached vs $1.50/M input.
|
|
1988
|
+
cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
|
|
1989
|
+
thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
|
|
1990
|
+
},
|
|
1991
|
+
recovery: [
|
|
1992
|
+
{
|
|
1993
|
+
signal: "empty_response_after_tool",
|
|
1994
|
+
action: "retry_with_params",
|
|
1995
|
+
retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
|
|
1996
|
+
maxRetries: 1,
|
|
1997
|
+
reason: "Inherited Flash-family pattern: empty after tool result \u2014 retry with thinking off"
|
|
1998
|
+
},
|
|
1999
|
+
{
|
|
2000
|
+
signal: "empty_response",
|
|
2001
|
+
action: "retry_with_params",
|
|
2002
|
+
retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
|
|
2003
|
+
maxRetries: 1,
|
|
2004
|
+
reason: "Empty response \u2014 try with thinking off"
|
|
2005
|
+
},
|
|
2006
|
+
{
|
|
2007
|
+
signal: "malformed_function_call",
|
|
2008
|
+
action: "escalate",
|
|
2009
|
+
reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target"
|
|
2010
|
+
}
|
|
2011
|
+
],
|
|
2012
|
+
strengths: ["agentic_loops", "coding", "1m_context", "parallel_tools", "thinking_mode", "sustained_frontier"],
|
|
2013
|
+
weaknesses: ["cost_vs_2_5_flash", "no_brain_evidence_yet"],
|
|
2014
|
+
notes: "Hand-onboarded s37 (2026-05-21) verified against Google docs. Stable status; positioned as Flash-family upgrade for agentic loops and coding. 5\xD7/3.6\xD7 more expensive than 2.5-flash but Google claims step-change on sustained frontier work. archetypePerf adjustments are judgment-grounded starter hypotheses \u2014 brain evidence will validate within ~50 rows per archetype.",
|
|
2015
|
+
// Starter hypothesis: anchored to 2.5-flash archetypePerf with +1
|
|
2016
|
+
// adjustments where Google's positioning explicitly supports
|
|
2017
|
+
// (agentic/coding/sustained). Hunt held at 9 inherited from L-040 family
|
|
2018
|
+
// parallel-tool tier. Brain evidence will replace.
|
|
2019
|
+
archetypePerf: {
|
|
2020
|
+
hunt: 9,
|
|
2021
|
+
// Inherited from 2.5-flash L-040 parallel-tool tier; Google positions 3.5 as agentic-loop champion
|
|
2022
|
+
classify: 7,
|
|
2023
|
+
// Inherited from 2.5-flash brain-validated tier (218 rows on 2.5)
|
|
2024
|
+
summarize: 7,
|
|
2025
|
+
// Inherited from 2.5-flash; cliff strips tools when present
|
|
2026
|
+
transform: 7,
|
|
2027
|
+
ask: 8,
|
|
2028
|
+
// +1 vs 2.5-flash — sustained-frontier positioning
|
|
2029
|
+
generate: 8,
|
|
2030
|
+
// +2 vs 2.5-flash (6→8) — Google: "complex coding cycles and iterations"
|
|
2031
|
+
plan: 7,
|
|
2032
|
+
// +2 vs 2.5-flash (5→7) — "complex iterations" positioning
|
|
2033
|
+
extract: 7,
|
|
2034
|
+
// +1 vs 2.5-flash — sustained-frontier on structured tasks
|
|
2035
|
+
critique: 5
|
|
2036
|
+
// +1 vs 2.5-flash — but still below Sonnet/Opus reasoning floor
|
|
2037
|
+
}
|
|
1905
2038
|
}
|
|
1906
2039
|
];
|
|
1907
2040
|
var ALIASES = {
|
package/dist/index.mjs
CHANGED
package/dist/profiles.js
CHANGED
|
@@ -859,12 +859,23 @@ var PROFILES_RAW = [
|
|
|
859
859
|
// Each entry's pricing/context/cliffs/lowering reflects the template, NOT
|
|
860
860
|
// provider docs. Verify before promoting status to 'current' (L-049/L-081).
|
|
861
861
|
{
|
|
862
|
+
// s37 (2026-05-21): UNVERIFIED-AUTO-ONBOARD → verified against
|
|
863
|
+
// ai.google.dev/gemini-api/docs/models/gemini-3-flash-preview +
|
|
864
|
+
// ai.google.dev/gemini-api/docs/pricing. L-081 catches:
|
|
865
|
+
// maxOutputTokens 65_535 → 65_536 (off-by-one)
|
|
866
|
+
// costInputPer1m 0.30 → 0.50 (template-cloned from 2.5-flash; actual is 1.67× more expensive)
|
|
867
|
+
// costOutputPer1m 2.50 → 3.00 (template-cloned; actual 1.2× more expensive)
|
|
868
|
+
// cache discount default 0.25 → 0.10 (10× discount, $0.05/$0.50 per docs)
|
|
869
|
+
// Cliffs inherited from 2.5-flash conservatively. The 8K-context-quality
|
|
870
|
+
// cliff was a 2.5-Flash observation — Google positions Gemini 3 as
|
|
871
|
+
// sustained-frontier-on-long-context; brain evidence will validate/relax.
|
|
872
|
+
// Kept as guard for now.
|
|
862
873
|
id: "gemini-3-flash-preview",
|
|
863
|
-
verifiedAgainstDocs: "
|
|
874
|
+
verifiedAgainstDocs: "2026-05-21",
|
|
864
875
|
provider: "google",
|
|
865
876
|
status: "preview",
|
|
866
877
|
maxContextTokens: 1048576,
|
|
867
|
-
maxOutputTokens:
|
|
878
|
+
maxOutputTokens: 65536,
|
|
868
879
|
maxTools: 128,
|
|
869
880
|
parallelToolCalls: true,
|
|
870
881
|
structuredOutput: "native",
|
|
@@ -875,13 +886,13 @@ var PROFILES_RAW = [
|
|
|
875
886
|
metric: "input_tokens",
|
|
876
887
|
threshold: 8e3,
|
|
877
888
|
action: "downgrade_quality_warning",
|
|
878
|
-
reason: "
|
|
889
|
+
reason: "Inherited from 2.5-flash guard; brain evidence on Gemini 3 long-context quality will validate/relax"
|
|
879
890
|
},
|
|
880
891
|
{
|
|
881
892
|
metric: "tool_count",
|
|
882
893
|
threshold: 20,
|
|
883
894
|
action: "drop_to_top_relevant",
|
|
884
|
-
reason: "Tool reliability drops above ~20 tools (despite 128 hard limit)"
|
|
895
|
+
reason: "Tool reliability drops above ~20 tools (despite 128 hard limit) \u2014 inherited from Flash family"
|
|
885
896
|
},
|
|
886
897
|
{
|
|
887
898
|
metric: "thinking_with_short_output",
|
|
@@ -890,24 +901,22 @@ var PROFILES_RAW = [
|
|
|
890
901
|
reason: "Thinking tokens consume maxOutputTokens \u2014 empty response if drained"
|
|
891
902
|
},
|
|
892
903
|
{
|
|
893
|
-
//
|
|
894
|
-
//
|
|
895
|
-
//
|
|
896
|
-
// help — disabling thinking is necessary but not sufficient. Tools
|
|
897
|
-
// present + summarize intent confuses Flash into a no-output state
|
|
898
|
-
// (likely tool-decision purgatory). Strip tools entirely for this
|
|
899
|
-
// archetype on this model.
|
|
904
|
+
// Inherited from gemini-2.5-flash s11 trust artifact. Family-likely
|
|
905
|
+
// failure mode for Flash architecture. Keep preemptively until brain
|
|
906
|
+
// evidence on Gemini 3 specifically.
|
|
900
907
|
metric: "tool_count",
|
|
901
908
|
threshold: 1,
|
|
902
909
|
whenIntent: "summarize",
|
|
903
910
|
action: "strip_tools",
|
|
904
|
-
reason: "
|
|
911
|
+
reason: "Inherited from 2.5-flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on 3-flash-preview specifically."
|
|
905
912
|
}
|
|
906
913
|
],
|
|
907
|
-
costInputPer1m: 0.
|
|
908
|
-
costOutputPer1m:
|
|
914
|
+
costInputPer1m: 0.5,
|
|
915
|
+
costOutputPer1m: 3,
|
|
909
916
|
lowering: {
|
|
910
917
|
...GOOGLE_LOWERING_BASE,
|
|
918
|
+
// 10× cache discount per Google pricing: $0.05/M cached vs $0.50/M input.
|
|
919
|
+
cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
|
|
911
920
|
thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
|
|
912
921
|
},
|
|
913
922
|
recovery: [
|
|
@@ -933,40 +942,45 @@ var PROFILES_RAW = [
|
|
|
933
942
|
],
|
|
934
943
|
strengths: ["speed", "volume", "classification", "1m_context", "cost"],
|
|
935
944
|
weaknesses: ["complex_schemas", "large_tool_sets", "high_context_quality"],
|
|
936
|
-
notes: "
|
|
937
|
-
//
|
|
938
|
-
//
|
|
945
|
+
notes: "Verified s37 (2026-05-21) against Google docs. Step-change positioning vs 2.5-flash on agentic loops per Google's release notes (Dec 2025). Pricing 1.67\xD7/1.2\xD7 higher than 2.5-flash; cache discount 10\xD7 (vs 4\xD7 for 2.5). Status=preview until brain evidence accumulates.",
|
|
946
|
+
// Anchored to 2.5-flash archetypePerf as starter, with judgment adjustments
|
|
947
|
+
// for Google's "step-change on agentic" positioning. Brain evidence (zero
|
|
948
|
+
// rows today) will replace these starter values.
|
|
939
949
|
archetypePerf: {
|
|
940
950
|
hunt: 9,
|
|
941
|
-
// L-040
|
|
951
|
+
// Inherits 2.5-flash L-040 parallel-tool tier; Google positions 3 as agentic-loop upgrade
|
|
942
952
|
classify: 7,
|
|
943
|
-
// brain-validated
|
|
953
|
+
// Inherits 2.5-flash brain-validated tier (218 rows on 2.5)
|
|
944
954
|
summarize: 7,
|
|
945
|
-
//
|
|
955
|
+
// Inherits 2.5-flash; cliff strips tools when present
|
|
946
956
|
transform: 7,
|
|
947
|
-
ask:
|
|
948
|
-
|
|
949
|
-
|
|
957
|
+
ask: 8,
|
|
958
|
+
// +1 vs 2.5-flash — sustained-frontier positioning
|
|
959
|
+
generate: 7,
|
|
960
|
+
// +1 vs 2.5-flash — agentic coding upgrade per Google
|
|
961
|
+
plan: 6,
|
|
962
|
+
// +1 vs 2.5-flash — complex iterations per positioning
|
|
950
963
|
extract: 6,
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
// reasoning shallower than Sonnet/Opus
|
|
964
|
+
critique: 5
|
|
965
|
+
// +1 vs 2.5-flash — but still below Sonnet/Opus reasoning floor
|
|
954
966
|
}
|
|
955
967
|
},
|
|
956
968
|
{
|
|
957
|
-
// ── Gemini
|
|
958
|
-
// Onboarded 2026-05-
|
|
959
|
-
//
|
|
960
|
-
//
|
|
961
|
-
//
|
|
962
|
-
//
|
|
963
|
-
//
|
|
964
|
-
//
|
|
965
|
-
//
|
|
966
|
-
//
|
|
967
|
-
//
|
|
969
|
+
// ── Gemini 3.1 Flash-Lite ──
|
|
970
|
+
// Onboarded 2026-05-16 by auto-onboarder; s37 (2026-05-21) verified
|
|
971
|
+
// against ai.google.dev/gemini-api/docs/pricing.
|
|
972
|
+
//
|
|
973
|
+
// L-081 CATCHES (template clone from 2.5-flash-lite was 2.5-3.75× too cheap):
|
|
974
|
+
// costInputPer1m 0.10 → 0.25 (template clone undervalued by 2.5×)
|
|
975
|
+
// costOutputPer1m 0.40 → 1.50 (template clone undervalued by 3.75×)
|
|
976
|
+
//
|
|
977
|
+
// Real 3.1-flash-lite is NOT a cost-equivalent successor to 2.5-flash-lite —
|
|
978
|
+
// it sits between 2.5-flash-lite ($0.10/$0.40) and 2.5-flash ($0.30/$2.50).
|
|
979
|
+
// Cache discount 10× verified ($0.025/M cached vs $0.25/M input).
|
|
980
|
+
//
|
|
981
|
+
// Cliffs are HYPOTHESIZED from 2.5-flash family; brain evidence pending.
|
|
968
982
|
id: "gemini-3.1-flash-lite",
|
|
969
|
-
verifiedAgainstDocs: "
|
|
983
|
+
verifiedAgainstDocs: "2026-05-21",
|
|
970
984
|
provider: "google",
|
|
971
985
|
status: "preview",
|
|
972
986
|
maxContextTokens: 1048576,
|
|
@@ -1007,12 +1021,12 @@ var PROFILES_RAW = [
|
|
|
1007
1021
|
reason: "Inherited from Flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on Flash-Lite specifically."
|
|
1008
1022
|
}
|
|
1009
1023
|
],
|
|
1010
|
-
costInputPer1m: 0.
|
|
1011
|
-
costOutputPer1m:
|
|
1024
|
+
costInputPer1m: 0.25,
|
|
1025
|
+
costOutputPer1m: 1.5,
|
|
1012
1026
|
lowering: {
|
|
1013
1027
|
...GOOGLE_LOWERING_BASE,
|
|
1014
|
-
// Cache discount 10× (vs Flash 4×) — Google
|
|
1015
|
-
// $0.
|
|
1028
|
+
// Cache discount 10× (vs Flash 4×) — Google docs s37: $0.025/M cached vs
|
|
1029
|
+
// $0.25/M input. Material for repeat-prompt workloads (classify shape).
|
|
1016
1030
|
cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
|
|
1017
1031
|
thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
|
|
1018
1032
|
},
|
|
@@ -1037,13 +1051,13 @@ var PROFILES_RAW = [
|
|
|
1037
1051
|
reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target."
|
|
1038
1052
|
}
|
|
1039
1053
|
],
|
|
1040
|
-
strengths: ["
|
|
1054
|
+
strengths: ["low_cost", "speed", "volume", "classification", "summarize", "1m_context", "cache_friendly"],
|
|
1041
1055
|
weaknesses: ["complex_reasoning", "large_tool_sets", "complex_schemas", "structured_output_unproven", "long_context_quality"],
|
|
1042
|
-
notes: "
|
|
1043
|
-
// Tier 3
|
|
1044
|
-
// rows —
|
|
1045
|
-
// sibling of Flash
|
|
1046
|
-
//
|
|
1056
|
+
notes: "Verified s37 (2026-05-21) against Google docs. Sits between 2.5-flash-lite (cheaper) and 2.5-flash (more expensive) on cost frontier; 2.5\xD7 more expensive than initial template-clone. Cliffs hypothesized from Flash family \u2014 brain evidence pending.",
|
|
1057
|
+
// Tier 2-3 floor for summarize/classify chains at the new (verified) price
|
|
1058
|
+
// point. ZERO brain rows — values are starter hypotheses anchored to
|
|
1059
|
+
// "smaller sibling of Flash at higher cost than 2.5-flash-lite." The first
|
|
1060
|
+
// 50 brain rows per archetype will validate or relax these.
|
|
1047
1061
|
archetypePerf: {
|
|
1048
1062
|
classify: 6,
|
|
1049
1063
|
// starter hypothesis — verify (Flash is 7, lite likely ≤)
|
|
@@ -1058,6 +1072,125 @@ var PROFILES_RAW = [
|
|
|
1058
1072
|
plan: 3,
|
|
1059
1073
|
critique: 3
|
|
1060
1074
|
}
|
|
1075
|
+
},
|
|
1076
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
1077
|
+
// Gemini 3.5 Flash — hand-onboarded s37 (2026-05-21)
|
|
1078
|
+
//
|
|
1079
|
+
// Google positioning ("Most intelligent for sustained frontier performance
|
|
1080
|
+
// on agentic and coding tasks" / "particularly effective for rapid agentic
|
|
1081
|
+
// loops involving complex coding cycles and iterations") suggests this is
|
|
1082
|
+
// the Flash-family upgrade specifically aimed at hunt-shape workloads.
|
|
1083
|
+
// Pricing 5× input / 3.6× output vs 2.5-flash — material cost premium.
|
|
1084
|
+
// archetypePerf adjusted vs 2.5-flash: +1 on ask/extract/critique, +2 on generate/plan
|
|
1085
|
+
// (sustained-frontier positioning); hunt held at 9 inherited from L-040
|
|
1086
|
+
// family parallel-tool tier; brain evidence will validate within 50 rows.
|
|
1087
|
+
//
|
|
1088
|
+
// Cliffs inherited conservatively from 2.5-flash. Google's "sustained
|
|
1089
|
+
// frontier on long-context" positioning suggests the 8K cliff may not
|
|
1090
|
+
// apply to 3.5 — keep as guard until brain evidence shows otherwise.
|
|
1091
|
+
//
|
|
1092
|
+
// Specs verified against:
|
|
1093
|
+
// ai.google.dev/gemini-api/docs/models/gemini-3.5-flash
|
|
1094
|
+
// ai.google.dev/gemini-api/docs/pricing (Standard tier)
|
|
1095
|
+
// ─────────────────────────────────────────────────────────────────────────
|
|
1096
|
+
{
|
|
1097
|
+
id: "gemini-3.5-flash",
|
|
1098
|
+
verifiedAgainstDocs: "2026-05-21",
|
|
1099
|
+
provider: "google",
|
|
1100
|
+
status: "current",
|
|
1101
|
+
maxContextTokens: 1048576,
|
|
1102
|
+
maxOutputTokens: 65536,
|
|
1103
|
+
maxTools: 128,
|
|
1104
|
+
parallelToolCalls: true,
|
|
1105
|
+
structuredOutput: "native",
|
|
1106
|
+
systemPromptMode: "separate",
|
|
1107
|
+
streaming: true,
|
|
1108
|
+
cliffs: [
|
|
1109
|
+
{
|
|
1110
|
+
metric: "input_tokens",
|
|
1111
|
+
threshold: 8e3,
|
|
1112
|
+
action: "downgrade_quality_warning",
|
|
1113
|
+
reason: "Inherited from 2.5-flash guard; Google positions 3.5 as sustained-frontier-on-long-context but brain evidence pending"
|
|
1114
|
+
},
|
|
1115
|
+
{
|
|
1116
|
+
metric: "tool_count",
|
|
1117
|
+
threshold: 20,
|
|
1118
|
+
action: "drop_to_top_relevant",
|
|
1119
|
+
reason: "Inherited from Flash family: tool reliability drops above ~20 (despite 128 hard limit). Validate per (archetype, model) after n\u226520."
|
|
1120
|
+
},
|
|
1121
|
+
{
|
|
1122
|
+
metric: "thinking_with_short_output",
|
|
1123
|
+
threshold: 1,
|
|
1124
|
+
action: "force_thinking_budget_zero",
|
|
1125
|
+
reason: "Thinking mode supported per Google docs; same drain risk as 2.5-flash \u2014 thinking tokens consume maxOutputTokens"
|
|
1126
|
+
},
|
|
1127
|
+
{
|
|
1128
|
+
// Inherited from 2.5-flash s11 trust artifact (5/5 empty rate on
|
|
1129
|
+
// tt-intelligence/summarize/gemini-2.5-flash with tools offered).
|
|
1130
|
+
// Family-likely failure mode for Flash architecture across versions.
|
|
1131
|
+
// Keep preemptively until brain evidence on 3.5-flash specifically.
|
|
1132
|
+
metric: "tool_count",
|
|
1133
|
+
threshold: 1,
|
|
1134
|
+
whenIntent: "summarize",
|
|
1135
|
+
action: "strip_tools",
|
|
1136
|
+
reason: "Inherited from 2.5-flash s11 cliff (kgauto commit 3872832): summarize+tools \u2192 empty response. Preemptive guard until brain evidence on 3.5-flash specifically."
|
|
1137
|
+
}
|
|
1138
|
+
],
|
|
1139
|
+
costInputPer1m: 1.5,
|
|
1140
|
+
costOutputPer1m: 9,
|
|
1141
|
+
lowering: {
|
|
1142
|
+
...GOOGLE_LOWERING_BASE,
|
|
1143
|
+
// 10× cache discount per Google pricing: $0.15/M cached vs $1.50/M input.
|
|
1144
|
+
cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
|
|
1145
|
+
thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
|
|
1146
|
+
},
|
|
1147
|
+
recovery: [
|
|
1148
|
+
{
|
|
1149
|
+
signal: "empty_response_after_tool",
|
|
1150
|
+
action: "retry_with_params",
|
|
1151
|
+
retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
|
|
1152
|
+
maxRetries: 1,
|
|
1153
|
+
reason: "Inherited Flash-family pattern: empty after tool result \u2014 retry with thinking off"
|
|
1154
|
+
},
|
|
1155
|
+
{
|
|
1156
|
+
signal: "empty_response",
|
|
1157
|
+
action: "retry_with_params",
|
|
1158
|
+
retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
|
|
1159
|
+
maxRetries: 1,
|
|
1160
|
+
reason: "Empty response \u2014 try with thinking off"
|
|
1161
|
+
},
|
|
1162
|
+
{
|
|
1163
|
+
signal: "malformed_function_call",
|
|
1164
|
+
action: "escalate",
|
|
1165
|
+
reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target"
|
|
1166
|
+
}
|
|
1167
|
+
],
|
|
1168
|
+
strengths: ["agentic_loops", "coding", "1m_context", "parallel_tools", "thinking_mode", "sustained_frontier"],
|
|
1169
|
+
weaknesses: ["cost_vs_2_5_flash", "no_brain_evidence_yet"],
|
|
1170
|
+
notes: "Hand-onboarded s37 (2026-05-21) verified against Google docs. Stable status; positioned as Flash-family upgrade for agentic loops and coding. 5\xD7/3.6\xD7 more expensive than 2.5-flash but Google claims step-change on sustained frontier work. archetypePerf adjustments are judgment-grounded starter hypotheses \u2014 brain evidence will validate within ~50 rows per archetype.",
|
|
1171
|
+
// Starter hypothesis: anchored to 2.5-flash archetypePerf with +1/+2
|
|
1172
|
+
// adjustments where Google's positioning explicitly supports
|
|
1173
|
+
// (agentic/coding/sustained). Hunt held at 9 inherited from L-040 family
|
|
1174
|
+
// parallel-tool tier. Brain evidence will replace.
|
|
1175
|
+
archetypePerf: {
|
|
1176
|
+
hunt: 9,
|
|
1177
|
+
// Inherited from 2.5-flash L-040 parallel-tool tier; Google positions 3.5 as agentic-loop champion
|
|
1178
|
+
classify: 7,
|
|
1179
|
+
// Inherited from 2.5-flash brain-validated tier (218 rows on 2.5)
|
|
1180
|
+
summarize: 7,
|
|
1181
|
+
// Inherited from 2.5-flash; cliff strips tools when present
|
|
1182
|
+
transform: 7,
|
|
1183
|
+
ask: 8,
|
|
1184
|
+
// +1 vs 2.5-flash — sustained-frontier positioning
|
|
1185
|
+
generate: 8,
|
|
1186
|
+
// +2 vs 2.5-flash (6→8) — Google: "complex coding cycles and iterations"
|
|
1187
|
+
plan: 7,
|
|
1188
|
+
// +2 vs 2.5-flash (5→7) — "complex iterations" positioning
|
|
1189
|
+
extract: 7,
|
|
1190
|
+
// +1 vs 2.5-flash — sustained-frontier on structured tasks
|
|
1191
|
+
critique: 5
|
|
1192
|
+
// +1 vs 2.5-flash — but still below Sonnet/Opus reasoning floor
|
|
1193
|
+
}
|
|
1061
1194
|
}
|
|
1062
1195
|
];
|
|
1063
1196
|
var ALIASES = {
|
package/dist/profiles.mjs
CHANGED
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@warmdrift/kgauto-compiler",
|
|
3
|
-
"version": "2.0.0-alpha.
|
|
3
|
+
"version": "2.0.0-alpha.27",
|
|
4
4
|
"description": "Prompt compiler + central learning brain for multi-model AI apps. Swap models without rewriting prompts.",
|
|
5
5
|
"main": "./dist/index.js",
|
|
6
6
|
"module": "./dist/index.mjs",
|