llmist 0.7.0 → 0.8.0

package/dist/index.cjs CHANGED
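
Reading guide (summarizing the hunks below): heredoc delimiters are namespaced as __GADGET_PARAM_*__ to avoid collisions with user content; the Anthropic provider gains prompt caching (cache_control breakpoints, cache-write pricing, cache-aware usage reporting); the Gemini and OpenAI model catalogs are refreshed (gpt-5.1-thinking removed; GPT-5 Pro, GPT-4.1, GPT-4o, and o-series entries added); and estimateCost now prices uncached, cached, and cache-creation tokens separately.
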
@@ -881,7 +881,7 @@ function findSafeDelimiter(content) {
  }
  let counter = 1;
  while (counter < 1e3) {
- const delimiter = `HEREDOC_${counter}`;
+ const delimiter = `__GADGET_PARAM_${counter}__`;
  const regex = new RegExp(`^${delimiter}\\s*$`);
  const isUsed = lines.some((line) => regex.test(line));
  if (!isUsed) {
@@ -988,7 +988,16 @@ var init_gadget = __esm({
  yaml = __toESM(require("js-yaml"), 1);
  init_schema_to_json();
  init_schema_validator();
- HEREDOC_DELIMITERS = ["EOF", "END", "DOC", "CONTENT", "TEXT", "HEREDOC", "DATA", "BLOCK"];
+ HEREDOC_DELIMITERS = [
+ "__GADGET_PARAM_EOF__",
+ "__GADGET_PARAM_END__",
+ "__GADGET_PARAM_DOC__",
+ "__GADGET_PARAM_CONTENT__",
+ "__GADGET_PARAM_TEXT__",
+ "__GADGET_PARAM_HEREDOC__",
+ "__GADGET_PARAM_DATA__",
+ "__GADGET_PARAM_BLOCK__"
+ ];
  BaseGadget = class {
  /**
  * The name of the gadget. Used for identification when LLM calls it.
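
Why these two hunks matter: bare delimiters like EOF or END plausibly appear verbatim in user-supplied content, while the namespaced __GADGET_PARAM_*__ forms are very unlikely to collide. A minimal sketch of the fallback logic as diffed (the content split and the final throw are assumptions; this hunk does not show the bundle's actual failure handling):

    // Sketch: scan numbered candidates until one does not appear
    // alone on any line of the content.
    function findSafeDelimiter(content) {
      const lines = content.split("\n");
      let counter = 1;
      while (counter < 1e3) {
        const delimiter = `__GADGET_PARAM_${counter}__`;
        const regex = new RegExp(`^${delimiter}\\s*$`);
        if (!lines.some((line) => regex.test(line))) return delimiter;
        counter++;
      }
      throw new Error("no safe delimiter found"); // assumed failure mode
    }
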
@@ -3096,7 +3105,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3120,7 +3130,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 1,
  output: 5,
- cachedInput: 0.1
+ cachedInput: 0.1,
+ cacheWriteInput: 1.25
  },
  knowledgeCutoff: "2025-02",
  features: {
@@ -3144,7 +3155,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2025-03",
  features: {
@@ -3168,7 +3180,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2024-11",
  features: {
@@ -3192,7 +3205,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 15,
  output: 75,
- cachedInput: 1.5
+ cachedInput: 1.5,
+ cacheWriteInput: 18.75
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3216,7 +3230,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 15,
  output: 75,
- cachedInput: 1.5
+ cachedInput: 1.5,
+ cacheWriteInput: 18.75
  },
  knowledgeCutoff: "2025-03",
  features: {
@@ -3239,7 +3254,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 0.8,
  output: 4,
- cachedInput: 0.08
+ cachedInput: 0.08,
+ cacheWriteInput: 1
  },
  knowledgeCutoff: "2024-07",
  features: {
@@ -3262,7 +3278,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 0.25,
  output: 1.25,
- cachedInput: 0.025
+ cachedInput: 0.025,
+ cacheWriteInput: 0.3125
  },
  knowledgeCutoff: "2023-08",
  features: {
@@ -3286,7 +3303,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 1,
  output: 5,
- cachedInput: 0.1
+ cachedInput: 0.1,
+ cacheWriteInput: 1.25
  },
  knowledgeCutoff: "2025-02",
  features: {
@@ -3310,7 +3328,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3334,7 +3353,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 5,
  output: 25,
- cachedInput: 0.5
+ cachedInput: 0.5,
+ cacheWriteInput: 6.25
  },
  knowledgeCutoff: "2025-03",
  features: {
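
Note the pattern in the new cacheWriteInput values across every Anthropic entry: each is 1.25x its input rate (3 -> 3.75, 1 -> 1.25, 15 -> 18.75, 0.8 -> 1, 0.25 -> 0.3125, 5 -> 6.25), i.e. a 25% per-million-token premium for writing the prompt cache, while cachedInput stays at 10% of the input rate for reads.
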
@@ -3449,15 +3469,27 @@ var init_anthropic = __esm({
  }
  buildRequestPayload(options, descriptor, spec, messages) {
  const systemMessages = messages.filter((message) => message.role === "system");
- const system = systemMessages.length > 0 ? systemMessages.map((m) => m.content).join("\n\n") : void 0;
- const conversation = messages.filter(
+ const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
+ type: "text",
+ text: m.content,
+ // Add cache_control to the LAST system message block
+ ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
+ })) : void 0;
+ const nonSystemMessages = messages.filter(
  (message) => message.role !== "system"
- ).map((message) => ({
+ );
+ const lastUserIndex = nonSystemMessages.reduce(
+ (lastIdx, msg, idx) => msg.role === "user" ? idx : lastIdx,
+ -1
+ );
+ const conversation = nonSystemMessages.map((message, index) => ({
  role: message.role,
  content: [
  {
  type: "text",
- text: message.content
+ text: message.content,
+ // Add cache_control to the LAST user message
+ ...message.role === "user" && index === lastUserIndex ? { cache_control: { type: "ephemeral" } } : {}
  }
  ]
  }));
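
The net effect of this hunk, sketched as the request body the Anthropic Messages API receives (values illustrative; field names follow the diffed code and the public API):

    // Illustrative payload: cache_control marks the last system block and
    // the last user message as ephemeral cache breakpoints.
    {
      system: [
        { type: "text", text: "System prompt, part 1" },
        { type: "text", text: "Part 2", cache_control: { type: "ephemeral" } }
      ],
      messages: [
        { role: "user", content: [{ type: "text", text: "First turn" }] },
        { role: "assistant", content: [{ type: "text", text: "Reply" }] },
        { role: "user", content: [{ type: "text", text: "Latest turn",
          cache_control: { type: "ephemeral" } }] }
      ]
    }
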
@@ -3483,15 +3515,22 @@ var init_anthropic = __esm({
  async *wrapStream(iterable) {
  const stream2 = iterable;
  let inputTokens = 0;
+ let cachedInputTokens = 0;
+ let cacheCreationInputTokens = 0;
  for await (const event of stream2) {
  if (event.type === "message_start") {
- inputTokens = event.message.usage.input_tokens;
+ const usage = event.message.usage;
+ cachedInputTokens = usage.cache_read_input_tokens ?? 0;
+ cacheCreationInputTokens = usage.cache_creation_input_tokens ?? 0;
+ inputTokens = usage.input_tokens + cachedInputTokens + cacheCreationInputTokens;
  yield {
  text: "",
  usage: {
  inputTokens,
  outputTokens: 0,
- totalTokens: inputTokens
+ totalTokens: inputTokens,
+ cachedInputTokens,
+ cacheCreationInputTokens
  },
  rawEvent: event
  };
@@ -3505,7 +3544,9 @@ var init_anthropic = __esm({
  const usage = event.usage ? {
  inputTokens,
  outputTokens: event.usage.output_tokens,
- totalTokens: inputTokens + event.usage.output_tokens
+ totalTokens: inputTokens + event.usage.output_tokens,
+ cachedInputTokens,
+ cacheCreationInputTokens
  } : void 0;
  if (event.delta.stop_reason || usage) {
  yield {
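
These two hunks make inputTokens report the full prompt size: as the summation implies, Anthropic's input_tokens field excludes tokens read from the cache (cache_read_input_tokens) and tokens written to it (cache_creation_input_tokens), so the wrapper adds all three. A worked example with illustrative numbers:

    // message_start usage from the API (illustrative):
    //   input_tokens: 200, cache_read_input_tokens: 4000,
    //   cache_creation_input_tokens: 1000
    // wrapStream then yields:
    //   cachedInputTokens        = 4000
    //   cacheCreationInputTokens = 1000
    //   inputTokens              = 200 + 4000 + 1000 = 5200
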
@@ -3586,6 +3627,7 @@ var init_gemini_models = __esm({
  "src/providers/gemini-models.ts"() {
  "use strict";
  GEMINI_MODELS = [
+ // Gemini 3 Pro (Preview)
  {
  provider: "gemini",
  modelId: "gemini-3-pro-preview",
@@ -3594,8 +3636,11 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 2,
+ // $2.00 for prompts <= 200k, $4.00 for > 200k (using lower tier)
  output: 12,
+ // $12.00 for prompts <= 200k, $18.00 for > 200k
  cachedInput: 0.2
+ // $0.20 for prompts <= 200k
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3608,9 +3653,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 3",
  releaseDate: "2025-11-18",
- notes: "Most advanced model. 1501 Elo LMArena, 91.9% GPQA Diamond, 76.2% SWE-bench. Deep Think mode available."
+ notes: "Best model for multimodal understanding, agentic and vibe-coding. Deep Think mode available."
  }
  },
+ // Gemini 2.5 Pro
  {
  provider: "gemini",
  modelId: "gemini-2.5-pro",
@@ -3619,8 +3665,11 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 1.25,
+ // $1.25 for prompts <= 200k, $2.50 for > 200k
  output: 10,
+ // $10.00 for prompts <= 200k, $15.00 for > 200k
  cachedInput: 0.125
+ // $0.125 for prompts <= 200k
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3633,9 +3682,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 2.5",
  releaseDate: "2025-06",
- notes: "Balanced multimodal model with 1M context. Best for complex agents and reasoning."
+ notes: "State-of-the-art multipurpose model. Excels at coding and complex reasoning."
  }
  },
+ // Gemini 2.5 Flash
  {
  provider: "gemini",
  modelId: "gemini-2.5-flash",
@@ -3644,8 +3694,10 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 0.3,
+ // $0.30 for text/image/video, $1.00 for audio
  output: 2.5,
  cachedInput: 0.03
+ // $0.03 for text/image/video
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3658,9 +3710,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 2.5",
  releaseDate: "2025-06",
- notes: "Best price-performance ratio with thinking enabled by default"
+ notes: "First hybrid reasoning model with 1M context and thinking budgets."
  }
  },
+ // Gemini 2.5 Flash-Lite
  {
  provider: "gemini",
  modelId: "gemini-2.5-flash-lite",
@@ -3669,8 +3722,10 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 0.1,
+ // $0.10 for text/image/video, $0.30 for audio
  output: 0.4,
  cachedInput: 0.01
+ // $0.01 for text/image/video
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3682,9 +3737,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 2.5",
  releaseDate: "2025-06",
- notes: "Fastest and most cost-efficient model for high-volume, low-latency tasks"
+ notes: "Smallest and most cost effective model, built for at scale usage."
  }
  },
+ // Gemini 2.0 Flash
  {
  provider: "gemini",
  modelId: "gemini-2.0-flash",
@@ -3693,8 +3749,10 @@ var init_gemini_models = __esm({
  maxOutputTokens: 8192,
  pricing: {
  input: 0.1,
+ // $0.10 for text/image/video, $0.70 for audio
  output: 0.4,
- cachedInput: 0.01
+ cachedInput: 0.025
+ // $0.025 for text/image/video
  },
  knowledgeCutoff: "2024-08",
  features: {
@@ -3705,9 +3763,10 @@ var init_gemini_models = __esm({
  },
  metadata: {
  family: "Gemini 2.0",
- notes: "Previous generation with 1M context and multimodal capabilities"
+ notes: "Balanced multimodal model with 1M context, built for the era of Agents."
  }
  },
+ // Gemini 2.0 Flash-Lite
  {
  provider: "gemini",
  modelId: "gemini-2.0-flash-lite",
@@ -3716,8 +3775,8 @@ var init_gemini_models = __esm({
  maxOutputTokens: 8192,
  pricing: {
  input: 0.075,
- output: 0.3,
- cachedInput: 75e-4
+ output: 0.3
+ // No context caching available for 2.0-flash-lite
  },
  knowledgeCutoff: "2024-08",
  features: {
@@ -3728,7 +3787,7 @@ var init_gemini_models = __esm({
  },
  metadata: {
  family: "Gemini 2.0",
- notes: "Lightweight previous generation model for cost-sensitive applications"
+ notes: "Smallest and most cost effective 2.0 model for at scale usage."
  }
  }
  ];
@@ -3898,7 +3957,9 @@ var init_gemini = __esm({
  return {
  inputTokens: usageMetadata.promptTokenCount ?? 0,
  outputTokens: usageMetadata.candidatesTokenCount ?? 0,
- totalTokens: usageMetadata.totalTokenCount ?? 0
+ totalTokens: usageMetadata.totalTokenCount ?? 0,
+ // Gemini returns cached token count in cachedContentTokenCount
+ cachedInputTokens: usageMetadata.cachedContentTokenCount ?? 0
  };
  }
  /**
@@ -3954,10 +4015,11 @@ var init_openai_models = __esm({
  "src/providers/openai-models.ts"() {
  "use strict";
  OPENAI_MODELS = [
+ // GPT-5 Family
  {
  provider: "openai",
  modelId: "gpt-5.1",
- displayName: "GPT-5.1 Instant",
+ displayName: "GPT-5.1",
  contextWindow: 128e3,
  maxOutputTokens: 32768,
  pricing: {
@@ -3977,34 +4039,7 @@ var init_openai_models = __esm({
  metadata: {
  family: "GPT-5",
  releaseDate: "2025-11-12",
- notes: "Warmer, more intelligent, better instruction following. 2-3x faster than GPT-5.",
- supportsTemperature: false
- }
- },
- {
- provider: "openai",
- modelId: "gpt-5.1-thinking",
- displayName: "GPT-5.1 Thinking",
- contextWindow: 196e3,
- maxOutputTokens: 32768,
- pricing: {
- input: 1.25,
- output: 10,
- cachedInput: 0.125
- },
- knowledgeCutoff: "2024-09-30",
- features: {
- streaming: true,
- functionCalling: true,
- vision: true,
- reasoning: true,
- structuredOutputs: true,
- fineTuning: true
- },
- metadata: {
- family: "GPT-5",
- releaseDate: "2025-11-12",
- notes: "Advanced reasoning with thinking levels: Light, Standard, Extended, Heavy. Best for complex tasks.",
+ notes: "Latest GPT-5 with improved instruction following. 2-3x faster than GPT-5.",
  supportsTemperature: false
  }
  },
@@ -4084,6 +4119,255 @@ var init_openai_models = __esm({
  notes: "Fastest, most cost-efficient version for well-defined tasks",
  supportsTemperature: false
  }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-5-pro",
+ displayName: "GPT-5 Pro",
+ contextWindow: 272e3,
+ maxOutputTokens: 128e3,
+ pricing: {
+ input: 15,
+ output: 120
+ // No cached input pricing for gpt-5-pro
+ },
+ knowledgeCutoff: "2024-09-30",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "GPT-5",
+ notes: "Premium tier with enhanced capabilities. Does not support prompt caching.",
+ supportsTemperature: false
+ }
+ },
+ // GPT-4.1 Family
+ {
+ provider: "openai",
+ modelId: "gpt-4.1",
+ displayName: "GPT-4.1",
+ contextWindow: 128e3,
+ maxOutputTokens: 32768,
+ pricing: {
+ input: 2,
+ output: 8,
+ cachedInput: 0.5
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4.1",
+ notes: "Improved GPT-4 with better instruction following"
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4.1-mini",
+ displayName: "GPT-4.1 Mini",
+ contextWindow: 128e3,
+ maxOutputTokens: 32768,
+ pricing: {
+ input: 0.4,
+ output: 1.6,
+ cachedInput: 0.1
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4.1",
+ notes: "Cost-efficient GPT-4.1 variant"
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4.1-nano",
+ displayName: "GPT-4.1 Nano",
+ contextWindow: 128e3,
+ maxOutputTokens: 32768,
+ pricing: {
+ input: 0.1,
+ output: 0.4,
+ cachedInput: 0.025
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4.1",
+ notes: "Fastest GPT-4.1 variant for simple tasks"
+ }
+ },
+ // GPT-4o Family
+ {
+ provider: "openai",
+ modelId: "gpt-4o",
+ displayName: "GPT-4o",
+ contextWindow: 128e3,
+ maxOutputTokens: 16384,
+ pricing: {
+ input: 2.5,
+ output: 10,
+ cachedInput: 1.25
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4o",
+ notes: "Multimodal model optimized for speed"
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4o-mini",
+ displayName: "GPT-4o Mini",
+ contextWindow: 128e3,
+ maxOutputTokens: 16384,
+ pricing: {
+ input: 0.15,
+ output: 0.6,
+ cachedInput: 0.075
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4o",
+ notes: "Fast and affordable multimodal model"
+ }
+ },
+ // o-series (Reasoning models)
+ {
+ provider: "openai",
+ modelId: "o1",
+ displayName: "o1",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 15,
+ output: 60,
+ cachedInput: 7.5
+ },
+ knowledgeCutoff: "2024-12-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Advanced reasoning model with chain-of-thought",
+ supportsTemperature: false
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "o3",
+ displayName: "o3",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 2,
+ output: 8,
+ cachedInput: 0.5
+ },
+ knowledgeCutoff: "2025-01-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Next-gen reasoning model, more efficient than o1",
+ supportsTemperature: false
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "o4-mini",
+ displayName: "o4 Mini",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 1.1,
+ output: 4.4,
+ cachedInput: 0.275
+ },
+ knowledgeCutoff: "2025-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Cost-efficient reasoning model",
+ supportsTemperature: false
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "o3-mini",
+ displayName: "o3 Mini",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 1.1,
+ output: 4.4,
+ cachedInput: 0.55
+ },
+ knowledgeCutoff: "2025-01-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Compact reasoning model for cost-sensitive applications",
+ supportsTemperature: false
+ }
  }
  ];
  }
@@ -4164,7 +4448,8 @@ var init_openai = __esm({
  const usage = chunk.usage ? {
  inputTokens: chunk.usage.prompt_tokens,
  outputTokens: chunk.usage.completion_tokens,
- totalTokens: chunk.usage.total_tokens
+ totalTokens: chunk.usage.total_tokens,
+ cachedInputTokens: chunk.usage.prompt_tokens_details?.cached_tokens ?? 0
  } : void 0;
  if (finishReason || usage) {
  yield { text: "", finishReason, usage, rawEvent: chunk };
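
OpenAI reports caching the other way around: prompt_tokens already includes any cached tokens, so cached_tokens is surfaced as a subset rather than added on top, matching the "subset of inputTokens" contract documented in estimateCost below. Illustrative mapping:

    // chunk.usage from OpenAI (illustrative numbers):
    //   { prompt_tokens: 5200, completion_tokens: 300, total_tokens: 5500,
    //     prompt_tokens_details: { cached_tokens: 4096 } }
    // mapped to:
    //   { inputTokens: 5200, outputTokens: 300, totalTokens: 5500,
    //     cachedInputTokens: 4096 }
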
@@ -4381,20 +4666,28 @@ var init_model_registry = __esm({
  /**
  * Estimate API cost for a given model and token usage
  * @param modelId - Full model identifier
- * @param inputTokens - Number of input tokens
+ * @param inputTokens - Number of input tokens (total, including cached and cache creation)
  * @param outputTokens - Number of output tokens
- * @param useCachedInput - Whether to use cached input pricing (if supported by provider)
+ * @param cachedInputTokens - Number of cached input tokens (subset of inputTokens)
+ * @param cacheCreationInputTokens - Number of cache creation tokens (subset of inputTokens, Anthropic only)
  * @returns CostEstimate if model found, undefined otherwise
  */
- estimateCost(modelId, inputTokens, outputTokens, useCachedInput = false) {
+ estimateCost(modelId, inputTokens, outputTokens, cachedInputTokens = 0, cacheCreationInputTokens = 0) {
  const spec = this.getModelSpec(modelId);
  if (!spec) return void 0;
- const inputRate = useCachedInput && spec.pricing.cachedInput !== void 0 ? spec.pricing.cachedInput : spec.pricing.input;
- const inputCost = inputTokens / 1e6 * inputRate;
+ const cachedRate = spec.pricing.cachedInput ?? spec.pricing.input;
+ const cacheWriteRate = spec.pricing.cacheWriteInput ?? spec.pricing.input;
+ const uncachedInputTokens = inputTokens - cachedInputTokens - cacheCreationInputTokens;
+ const uncachedInputCost = uncachedInputTokens / 1e6 * spec.pricing.input;
+ const cachedInputCost = cachedInputTokens / 1e6 * cachedRate;
+ const cacheCreationCost = cacheCreationInputTokens / 1e6 * cacheWriteRate;
+ const inputCost = uncachedInputCost + cachedInputCost + cacheCreationCost;
  const outputCost = outputTokens / 1e6 * spec.pricing.output;
  const totalCost = inputCost + outputCost;
  return {
  inputCost,
+ cachedInputCost,
+ cacheCreationCost,
  outputCost,
  totalCost,
  currency: "USD"
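
With the new signature, a request is priced bucket by bucket. A worked example using the Sonnet-class Anthropic rates above (input 3, cachedInput 0.3, cacheWriteInput 3.75, output 15 USD per million tokens; the model id is a placeholder, and the token counts reuse the streaming example):

    // estimateCost("<sonnet-class-model-id>", 5200, 300, 4000, 1000)
    // uncachedInputTokens = 5200 - 4000 - 1000 = 200
    // uncachedInputCost   = 200  / 1e6 * 3    = $0.00060
    // cachedInputCost     = 4000 / 1e6 * 0.3  = $0.00120
    // cacheCreationCost   = 1000 / 1e6 * 3.75 = $0.00375
    // inputCost           = $0.00555
    // outputCost          = 300 / 1e6 * 15    = $0.00450
    // totalCost           = $0.01005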