@warmdrift/kgauto-compiler 2.0.0-alpha.7 → 2.0.0-alpha.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/profiles.js CHANGED
@@ -79,7 +79,24 @@ var PROFILES_RAW = [
79
79
  ],
80
80
  strengths: ["reasoning", "agentic_coding", "long_context", "reliable_tool_use", "structured_output"],
81
81
  weaknesses: ["cost", "latency"],
82
- notes: "Frontier (2026-05). Step-change improvement over 4.6 in agentic coding. Adaptive thinking only \u2014 no extended-thinking toggle. 1M context, 128k max output."
82
+ notes: "Frontier (2026-05). Step-change improvement over 4.6 in agentic coding. Adaptive thinking only \u2014 no extended-thinking toggle. 1M context, 128k max output.",
83
+ // Frontier perf. Drops on archetypes where parallel-tool throughput
84
+ // (hunt) or low-budget cost-sensitivity (classify/summarize) matters
85
+ // more than reasoning depth.
86
+ archetypePerf: {
87
+ critique: 10,
88
+ plan: 10,
89
+ generate: 9,
90
+ ask: 9,
91
+ extract: 9,
92
+ transform: 9,
93
+ hunt: 8,
94
+ // strong but Flash dominates parallel tool throughput
95
+ summarize: 8,
96
+ // overkill for tolerant archetype; cost-out of frontier
97
+ classify: 8
98
+ // overkill; brain-validated cheaper models cover this
99
+ }
83
100
  },
84
101
  {
85
102
  id: "claude-opus-4-6",
@@ -111,7 +128,20 @@ var PROFILES_RAW = [
111
128
  ],
112
129
  strengths: ["reasoning", "long_context", "reliable_tool_use", "structured_output", "extended_thinking"],
113
130
  weaknesses: ["cost", "latency"],
114
- notes: "Predecessor to 4.7. Still current in Anthropic legacy table. Same pricing as 4.7 \u2014 choose 4.7 unless you need extended-thinking budget control (4.7 is adaptive-only)."
131
+ notes: "Predecessor to 4.7. Still current in Anthropic legacy table. Same pricing as 4.7 \u2014 choose 4.7 unless you need extended-thinking budget control (4.7 is adaptive-only).",
132
+ // One notch below 4.7 on critique, plan and hunt; level with it elsewhere.
133
+ // Extended-thinking edge does not flip any archetype ranking. Legacy: chains should prefer 4.7.
134
+ archetypePerf: {
135
+ critique: 9,
136
+ plan: 9,
137
+ generate: 9,
138
+ ask: 9,
139
+ extract: 9,
140
+ transform: 9,
141
+ hunt: 7,
142
+ summarize: 8,
143
+ classify: 8
144
+ }
115
145
  },
116
146
  {
117
147
  id: "claude-sonnet-4-6",
@@ -135,7 +165,23 @@ var PROFILES_RAW = [
135
165
  ],
136
166
  strengths: ["quality", "tool_use", "long_context", "cache_friendly", "extended_thinking"],
137
167
  weaknesses: [],
138
- notes: "Workhorse. Best price/quality for most multi-turn agentic work. 1M context, 64k max output."
168
+ notes: "Workhorse. Best price/quality for most multi-turn agentic work. 1M context, 64k max output.",
169
+ // Master plan §6.2 anchor. Tier 0 for plan/generate/ask/extract/transform
170
+ // in starter chains; tier 1 cross-provider for hunt/summarize/classify.
171
+ archetypePerf: {
172
+ ask: 9,
173
+ generate: 9,
174
+ plan: 9,
175
+ critique: 9,
176
+ extract: 9,
177
+ transform: 9,
178
+ hunt: 7,
179
+ // strong but Flash beats on parallel tool throughput
180
+ summarize: 8,
181
+ // overkill for tolerant archetype
182
+ classify: 8
183
+ // overkill
184
+ }
139
185
  },
140
186
  {
141
187
  id: "claude-haiku-4-5",
@@ -165,7 +211,23 @@ var PROFILES_RAW = [
165
211
  ],
166
212
  strengths: ["speed", "cost", "classification", "cache_friendly", "extended_thinking"],
167
213
  weaknesses: ["complex_reasoning", "large_tool_sets"],
168
- notes: "Cheapest Anthropic. Great for classify, summarize, ask shapes. 200k context, 64k max output. API alias `claude-haiku-4-5` resolves to dated snapshot `claude-haiku-4-5-20251001`."
214
+ notes: "Cheapest Anthropic. Great for classify, summarize, ask shapes. 200k context, 64k max output. API alias `claude-haiku-4-5` resolves to dated snapshot `claude-haiku-4-5-20251001`.",
215
+ // Tier 1 cross-provider anchor for short-output chains (classify/
216
+ // summarize/extract/transform). Falls off on plan/critique where
217
+ // reasoning depth matters; competes with Pro on cost+latency.
218
+ archetypePerf: {
219
+ classify: 8,
220
+ summarize: 8,
221
+ ask: 7,
222
+ transform: 7,
223
+ extract: 7,
224
+ hunt: 6,
225
+ // tool reliability drops at 16 — cliff guard fires
226
+ generate: 6,
227
+ plan: 5,
228
+ critique: 4
229
+ // reasoning depth gap vs Sonnet/Opus
230
+ }
169
231
  },
170
232
  // ── Google ──
171
233
  {
@@ -243,7 +305,131 @@ var PROFILES_RAW = [
243
305
  ],
244
306
  strengths: ["speed", "volume", "classification", "1m_context", "cost"],
245
307
  weaknesses: ["complex_schemas", "large_tool_sets", "high_context_quality"],
246
- notes: "Fast and cheap with 1M context. Quality cliffs at 8K context and 20 tools \u2014 guard with cliffs."
308
+ notes: "Fast and cheap with 1M context. Quality cliffs at 8K context and 20 tools \u2014 guard with cliffs.",
309
+ // Master plan §6.2 anchor. Tier 0 for hunt (parallel tool throughput
310
+ // 15-75 calls/step beats Sonnet — L-040), summarize, classify.
311
+ archetypePerf: {
312
+ hunt: 9,
313
+ // L-040: parallel tool throughput 15-75/step
314
+ classify: 7,
315
+ // brain-validated, 218 rows
316
+ summarize: 7,
317
+ // brain-validated; cliff strips tools when present
318
+ transform: 7,
319
+ ask: 7,
320
+ generate: 6,
321
+ plan: 5,
322
+ extract: 6,
323
+ // alpha.8 MAX_TOKENS history on structured output
324
+ critique: 4
325
+ // reasoning shallower than Sonnet/Opus
326
+ }
327
+ },
328
+ {
329
+ // ── Gemini 2.5 Flash-Lite ──
330
+ // Onboarded 2026-05-13 (s22) after the model-release watcher surfaced
331
+ // it as a UNREGISTERED + NEW candidate. Released by Google July 2025,
332
+ // stable. Positioned BELOW Flash on the cost/perf frontier:
333
+ // input $0.10/M (Flash $0.30/M) — 3× cheaper
334
+ // output $0.40/M (Flash $2.50/M) — 6× cheaper
335
+ // cache $0.01/M — 1/10 of input (vs Flash 0.25 discount)
336
+ // Cliffs are HYPOTHESIZED from Flash's known failure modes — Flash-Lite
337
+ // is a smaller sibling, so we inherit Flash's cliffs at equal-or-tighter
338
+ // thresholds. The brain will validate/relax these as evidence accumulates
339
+ // per (archetype, model) tuple. Currently ZERO brain rows for this model.
340
+ id: "gemini-2.5-flash-lite",
341
+ verifiedAgainstDocs: "2026-05-13",
342
+ provider: "google",
343
+ status: "current",
344
+ maxContextTokens: 1048576,
345
+ maxOutputTokens: 65536,
346
+ maxTools: 128,
347
+ parallelToolCalls: true,
348
+ structuredOutput: "native",
349
+ systemPromptMode: "separate",
350
+ streaming: true,
351
+ cliffs: [
352
+ {
353
+ metric: "input_tokens",
354
+ threshold: 8e3,
355
+ action: "downgrade_quality_warning",
356
+ reason: "Inherited from Flash: quality degrades above ~8K. Smaller model \u2014 likely degrades faster. Re-tune from brain after n\u226520."
357
+ },
358
+ {
359
+ metric: "tool_count",
360
+ threshold: 10,
361
+ action: "drop_to_top_relevant",
362
+ reason: "Conservative: Flash drops at 20, Flash-Lite is smaller \u2014 assume tighter ceiling until brain proves otherwise."
363
+ },
364
+ {
365
+ metric: "thinking_with_short_output",
366
+ threshold: 1,
367
+ action: "force_thinking_budget_zero",
368
+ reason: "Thinking enabled per Google API (thinking: true). Same drain risk as Flash \u2014 thinking tokens consume maxOutputTokens."
369
+ },
370
+ {
371
+ // Strong prior: Flash hit 5/5 empty rate on summarize+tools (s11
372
+ // trust artifact, kgauto commit 3872832). Flash-Lite shares the
373
+ // same architectural family — almost certainly inherits this cliff.
374
+ // Ship the guard preemptively; brain telemetry confirms or relaxes.
375
+ metric: "tool_count",
376
+ threshold: 1,
377
+ whenIntent: "summarize",
378
+ action: "strip_tools",
379
+ reason: "Inherited from Flash s11 cliff: summarize+tools \u2192 empty response. Preemptive guard until brain evidence on Flash-Lite specifically."
380
+ }
381
+ ],
382
+ costInputPer1m: 0.1,
383
+ costOutputPer1m: 0.4,
384
+ lowering: {
385
+ ...GOOGLE_LOWERING_BASE,
386
+ // Cache discount 10× (vs Flash 4×) — Google's spec is $0.01/M cache vs
387
+ // $0.10/M input. Material for repeat-prompt workloads (classify shape).
388
+ cache: { ...GOOGLE_LOWERING_BASE.cache, discount: 0.1 },
389
+ thinking: { field: "generationConfig.thinkingConfig.thinkingBudget", default: "auto" }
390
+ },
391
+ recovery: [
392
+ {
393
+ signal: "empty_response_after_tool",
394
+ action: "retry_with_params",
395
+ retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
396
+ maxRetries: 1,
397
+ reason: "Known on Flash family: empty after tool result \u2014 retry with thinking off."
398
+ },
399
+ {
400
+ signal: "empty_response",
401
+ action: "retry_with_params",
402
+ retryParams: { "generationConfig.thinkingConfig.thinkingBudget": 0 },
403
+ maxRetries: 1,
404
+ reason: "Empty response \u2014 try with thinking off."
405
+ },
406
+ {
407
+ signal: "malformed_function_call",
408
+ action: "escalate",
409
+ reason: "MALFORMED_FUNCTION_CALL maps to stop \u2014 escalate to next target."
410
+ }
411
+ ],
412
+ strengths: ["lowest_cost", "speed", "volume", "classification", "summarize", "1m_context", "cache_friendly"],
413
+ weaknesses: ["complex_reasoning", "large_tool_sets", "complex_schemas", "structured_output_unproven", "long_context_quality"],
414
+ notes: "Bottom-frontier anchor on cost: $0.10/$0.40 per 1M tokens, 1M context, 65K max output. Released July 2025 (stable). Positioned for classify / summarize / transform archetypes where quality bar is forgiving. Cliffs inherited from Flash at equal-or-tighter thresholds \u2014 re-tune per (archetype) once brain has n\u226520 rows. Alpha.8 contract layer handles MAX_TOKENS-on-structured-output via fallback chain, so structuredOutput=native is safe to declare even though Flash had alpha.8 history. Cache discount in spec: $0.01/M = 1/10 of input (richer than Flash 25%) \u2014 meaningful for repeat-prompt workloads.",
415
+ // Tier 3 emergency floor for summarize/classify chains. ZERO brain
416
+ // rows — all values are starter hypotheses anchored to "smaller
417
+ // sibling of Flash, at-or-below Flash perf on every archetype." The
418
+ // first 50 brain rows per archetype will validate or relax these.
419
+ archetypePerf: {
420
+ classify: 6,
421
+ // starter hypothesis — verify (Flash is 7, lite likely ≤)
422
+ summarize: 6,
423
+ // starter hypothesis — verify; cliff strips tools
424
+ transform: 6,
425
+ // starter hypothesis — verify
426
+ ask: 5,
427
+ hunt: 5,
428
+ generate: 4,
429
+ extract: 4,
430
+ plan: 3,
431
+ critique: 3
432
+ }
247
433
  },
248
434
  {
249
435
  id: "gemini-2.5-pro",
@@ -279,7 +465,21 @@ var PROFILES_RAW = [
279
465
  }
280
466
  ],
281
467
  strengths: ["reasoning", "1m_context", "structured_output", "tool_use"],
282
- weaknesses: ["pricing_above_200k"]
468
+ weaknesses: ["pricing_above_200k"],
469
+ // Master plan §3.3 anchor: tier-2 cross-provider in almost every chain.
470
+ // Sits on the frontier at perf-9 — close to Sonnet but cheaper input.
471
+ archetypePerf: {
472
+ critique: 9,
473
+ plan: 9,
474
+ ask: 8,
475
+ generate: 8,
476
+ extract: 8,
477
+ transform: 8,
478
+ hunt: 8,
479
+ // tier 1 cross-provider for hunt chain
480
+ summarize: 7,
481
+ classify: 7
482
+ }
283
483
  },
284
484
  {
285
485
  id: "gemini-3.1-pro-preview",
@@ -317,7 +517,23 @@ var PROFILES_RAW = [
317
517
  ],
318
518
  strengths: ["reasoning", "1m_context", "agentic_coding", "structured_output", "tool_use"],
319
519
  weaknesses: ["cost", "preview_status", "pricing_above_200k"],
320
- notes: "Frontier Gemini (preview, 2026-Q2). Step-change agentic coding per Google. Cache discount 10\xD7 (vs 4\xD7 for 2.5 Pro). Use status=preview to flag rollback path until GA."
520
+ notes: "Frontier Gemini (preview, 2026-Q2). Step-change agentic coding per Google. Cache discount 10\xD7 (vs 4\xD7 for 2.5 Pro). Use status=preview to flag rollback path until GA.",
521
+ // Frontier-Gemini preview — bumped one notch over 2.5 Pro on agentic
522
+ // coding / reasoning per Google's release notes. Preview status:
523
+ // chains should stay on 2.5 Pro until GA. Starter hypothesis.
524
+ archetypePerf: {
525
+ critique: 10,
526
+ // Google claims step-change on reasoning
527
+ plan: 10,
528
+ ask: 9,
529
+ generate: 9,
530
+ extract: 9,
531
+ transform: 8,
532
+ hunt: 9,
533
+ // step-change agentic per Google
534
+ summarize: 8,
535
+ classify: 7
536
+ }
321
537
  },
322
538
  // ── DeepSeek ──
323
539
  // 2026-05-08 audit (L-073): DeepSeek's `deepseek-chat` was silently aliased
@@ -357,7 +573,24 @@ var PROFILES_RAW = [
357
573
  ],
358
574
  strengths: ["cost", "1m_context", "json_output", "code", "reasoning"],
359
575
  weaknesses: ["parallel_tools", "large_tool_sets"],
360
- notes: "Cheap workhorse. 1M context, 384k max output. Cache-hit input $0.0028/M (1/50\xD7 of miss). Aliased as `deepseek-chat` (non-thinking) and `deepseek-reasoner` (thinking) \u2014 see ALIASES."
576
+ notes: "Cheap workhorse. 1M context, 384k max output. Cache-hit input $0.0028/M (1/50\xD7 of miss). Aliased as `deepseek-chat` (non-thinking) and `deepseek-reasoner` (thinking) \u2014 see ALIASES.",
577
+ // Master plan §6.2 anchor. Brain-validated tier 1 cross-provider for
578
+ // classify (169 rows, 0% empty). Tier 0 for summarize-with-no-tools.
579
+ // Falls off on hunt (sequential tools — L-040) and reasoning depth.
580
+ archetypePerf: {
581
+ classify: 7,
582
+ // brain-validated, 169 rows
583
+ summarize: 7,
584
+ // archetype-tolerant, no brain evidence yet
585
+ ask: 6,
586
+ transform: 6,
587
+ generate: 5,
588
+ plan: 5,
589
+ extract: 5,
590
+ critique: 4,
591
+ hunt: 4
592
+ // sequential tool calls only — L-040
593
+ }
361
594
  },
362
595
  {
363
596
  id: "deepseek-v4-pro",
@@ -393,7 +626,22 @@ var PROFILES_RAW = [
393
626
  ],
394
627
  strengths: ["quality", "reasoning", "1m_context", "json_output", "code", "extended_thinking"],
395
628
  weaknesses: ["parallel_tools", "large_tool_sets"],
396
- notes: "Pro tier. 1M context, 384k max output. Regular pricing $1.74/$3.48; 75% promo through 2026-05-31 ($0.435/$0.87). Default mode = thinking."
629
+ notes: "Pro tier. 1M context, 384k max output. Regular pricing $1.74/$3.48; 75% promo through 2026-05-31 ($0.435/$0.87). Default mode = thinking.",
630
+ // Master plan §3.3: tier 3 cross-provider for plan chain. plan/critique
631
+ // bumped two notches over V4-Flash, ask/generate/extract one; same parallel-tool ceiling.
632
+ archetypePerf: {
633
+ plan: 7,
634
+ // §3.3 tier 3 for plan
635
+ critique: 6,
636
+ ask: 7,
637
+ generate: 6,
638
+ classify: 7,
639
+ summarize: 7,
640
+ extract: 6,
641
+ transform: 6,
642
+ hunt: 4
643
+ // sequential tools — same as V4-Flash
644
+ }
397
645
  }
398
646
  ];
399
647
  var ALIASES = {
package/dist/profiles.mjs CHANGED
@@ -4,7 +4,7 @@ import {
4
4
  getProfile,
5
5
  profilesByProvider,
6
6
  tryGetProfile
7
- } from "./chunk-MBEI5UOM.mjs";
7
+ } from "./chunk-3KVKELZN.mjs";
8
8
  export {
9
9
  ALIASES,
10
10
  allProfiles,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@warmdrift/kgauto-compiler",
3
- "version": "2.0.0-alpha.7",
3
+ "version": "2.0.0-alpha.9",
4
4
  "description": "Prompt compiler + central learning brain for multi-model AI apps. Swap models without rewriting prompts.",
5
5
  "main": "./dist/index.js",
6
6
  "module": "./dist/index.mjs",