llmist 0.7.0 → 0.8.0

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.
@@ -864,7 +864,7 @@ function findSafeDelimiter(content) {
  }
  let counter = 1;
  while (counter < 1e3) {
- const delimiter = `HEREDOC_${counter}`;
+ const delimiter = `__GADGET_PARAM_${counter}__`;
  const regex = new RegExp(`^${delimiter}\\s*$`);
  const isUsed = lines.some((line) => regex.test(line));
  if (!isUsed) {
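The loop above is the fallback path of findSafeDelimiter, which picks a heredoc-style delimiter that never appears alone on a line of the content being wrapped. A minimal sketch of the whole routine, assuming `lines` is the content split on newlines and that the named delimiters in HEREDOC_DELIMITERS (see the next hunk) are tried first — an illustration, not the package's exact source:

    function findSafeDelimiter(content) {
      const lines = content.split("\n");
      // Prefer one of the well-known delimiters if the content never uses it
      for (const candidate of HEREDOC_DELIMITERS) {
        const regex = new RegExp(`^${candidate}\\s*$`);
        if (!lines.some((line) => regex.test(line))) return candidate;
      }
      // Otherwise fall back to numbered delimiters, as in the loop shown above
      let counter = 1;
      while (counter < 1e3) {
        const delimiter = `__GADGET_PARAM_${counter}__`;
        const regex = new RegExp(`^${delimiter}\\s*$`);
        if (!lines.some((line) => regex.test(line))) return delimiter;
        counter++;
      }
      throw new Error("No safe delimiter found");
    }

The rename from HEREDOC_* to __GADGET_PARAM_*__ makes collisions far less likely, since bare words like EOF or END do occur in ordinary content.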
@@ -971,7 +971,16 @@ var init_gadget = __esm({
  yaml = __toESM(require("js-yaml"), 1);
  init_schema_to_json();
  init_schema_validator();
- HEREDOC_DELIMITERS = ["EOF", "END", "DOC", "CONTENT", "TEXT", "HEREDOC", "DATA", "BLOCK"];
+ HEREDOC_DELIMITERS = [
+ "__GADGET_PARAM_EOF__",
+ "__GADGET_PARAM_END__",
+ "__GADGET_PARAM_DOC__",
+ "__GADGET_PARAM_CONTENT__",
+ "__GADGET_PARAM_TEXT__",
+ "__GADGET_PARAM_HEREDOC__",
+ "__GADGET_PARAM_DATA__",
+ "__GADGET_PARAM_BLOCK__"
+ ];
  BaseGadget = class {
  /**
  * The name of the gadget. Used for identification when LLM calls it.
@@ -3697,7 +3706,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3721,7 +3731,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 1,
  output: 5,
- cachedInput: 0.1
+ cachedInput: 0.1,
+ cacheWriteInput: 1.25
  },
  knowledgeCutoff: "2025-02",
  features: {
@@ -3745,7 +3756,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2025-03",
  features: {
@@ -3769,7 +3781,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2024-11",
  features: {
@@ -3793,7 +3806,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 15,
  output: 75,
- cachedInput: 1.5
+ cachedInput: 1.5,
+ cacheWriteInput: 18.75
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3817,7 +3831,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 15,
  output: 75,
- cachedInput: 1.5
+ cachedInput: 1.5,
+ cacheWriteInput: 18.75
  },
  knowledgeCutoff: "2025-03",
  features: {
@@ -3840,7 +3855,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 0.8,
  output: 4,
- cachedInput: 0.08
+ cachedInput: 0.08,
+ cacheWriteInput: 1
  },
  knowledgeCutoff: "2024-07",
  features: {
@@ -3863,7 +3879,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 0.25,
  output: 1.25,
- cachedInput: 0.025
+ cachedInput: 0.025,
+ cacheWriteInput: 0.3125
  },
  knowledgeCutoff: "2023-08",
  features: {
@@ -3887,7 +3904,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 1,
  output: 5,
- cachedInput: 0.1
+ cachedInput: 0.1,
+ cacheWriteInput: 1.25
  },
  knowledgeCutoff: "2025-02",
  features: {
@@ -3911,7 +3929,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3935,7 +3954,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 5,
  output: 25,
- cachedInput: 0.5
+ cachedInput: 0.5,
+ cacheWriteInput: 6.25
  },
  knowledgeCutoff: "2025-03",
  features: {
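In every Anthropic entry above, the new cacheWriteInput rate is 1.25x the base input rate (3 → 3.75, 15 → 18.75, 0.8 → 1, 0.25 → 0.3125, 1 → 1.25, 5 → 6.25), matching Anthropic's published surcharge for 5-minute prompt-cache writes, while cachedInput stays at 0.1x for cache reads. A quick invariant check (hypothetical snippet; the array name ANTHROPIC_MODELS is assumed by analogy with GEMINI_MODELS and OPENAI_MODELS below):

    for (const spec of ANTHROPIC_MODELS) {
      const { input, cachedInput, cacheWriteInput } = spec.pricing;
      // Cache writes cost 1.25x base input; cache reads cost 0.1x
      console.assert(Math.abs(cacheWriteInput - input * 1.25) < 1e-9, spec.modelId);
      console.assert(Math.abs(cachedInput - input * 0.1) < 1e-9, spec.modelId);
    }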
@@ -4050,15 +4070,27 @@ var init_anthropic = __esm({
  }
  buildRequestPayload(options, descriptor, spec, messages) {
  const systemMessages = messages.filter((message) => message.role === "system");
- const system = systemMessages.length > 0 ? systemMessages.map((m) => m.content).join("\n\n") : void 0;
- const conversation = messages.filter(
+ const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
+ type: "text",
+ text: m.content,
+ // Add cache_control to the LAST system message block
+ ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
+ })) : void 0;
+ const nonSystemMessages = messages.filter(
  (message) => message.role !== "system"
- ).map((message) => ({
+ );
+ const lastUserIndex = nonSystemMessages.reduce(
+ (lastIdx, msg, idx) => msg.role === "user" ? idx : lastIdx,
+ -1
+ );
+ const conversation = nonSystemMessages.map((message, index) => ({
  role: message.role,
  content: [
  {
  type: "text",
- text: message.content
+ text: message.content,
+ // Add cache_control to the LAST user message
+ ...message.role === "user" && index === lastUserIndex ? { cache_control: { type: "ephemeral" } } : {}
  }
  ]
  }));
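The net effect: the last system block and the most recent user message are now marked as prompt-cache breakpoints, so Anthropic can reuse everything up to those points on subsequent requests. An illustrative payload shape this produces (values are made up):

    {
      system: [
        { type: "text", text: "You are a helpful assistant." },
        { type: "text", text: "Long project context...",
          cache_control: { type: "ephemeral" } }
      ],
      messages: [
        { role: "user", content: [{ type: "text", text: "First question" }] },
        { role: "assistant", content: [{ type: "text", text: "First answer" }] },
        { role: "user", content: [{ type: "text", text: "Follow-up",
          cache_control: { type: "ephemeral" } }] }
      ]
    }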
@@ -4084,15 +4116,22 @@ var init_anthropic = __esm({
  async *wrapStream(iterable) {
  const stream2 = iterable;
  let inputTokens = 0;
+ let cachedInputTokens = 0;
+ let cacheCreationInputTokens = 0;
  for await (const event of stream2) {
  if (event.type === "message_start") {
- inputTokens = event.message.usage.input_tokens;
+ const usage = event.message.usage;
+ cachedInputTokens = usage.cache_read_input_tokens ?? 0;
+ cacheCreationInputTokens = usage.cache_creation_input_tokens ?? 0;
+ inputTokens = usage.input_tokens + cachedInputTokens + cacheCreationInputTokens;
  yield {
  text: "",
  usage: {
  inputTokens,
  outputTokens: 0,
- totalTokens: inputTokens
+ totalTokens: inputTokens,
+ cachedInputTokens,
+ cacheCreationInputTokens
  },
  rawEvent: event
  };
@@ -4106,7 +4145,9 @@ var init_anthropic = __esm({
  const usage = event.usage ? {
  inputTokens,
  outputTokens: event.usage.output_tokens,
- totalTokens: inputTokens + event.usage.output_tokens
+ totalTokens: inputTokens + event.usage.output_tokens,
+ cachedInputTokens,
+ cacheCreationInputTokens
  } : void 0;
  if (event.delta.stop_reason || usage) {
  yield {
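The accounting matters here: Anthropic reports input_tokens exclusive of cache activity, with cache reads and writes broken out as cache_read_input_tokens and cache_creation_input_tokens, so the three are summed to recover the true prompt size. For example, a message_start carrying input_tokens: 42, cache_read_input_tokens: 1800 and cache_creation_input_tokens: 500 yields:

    const inputTokens = 42 + 1800 + 500; // 2342 tokens of actual input
    // cachedInputTokens = 1800, cacheCreationInputTokens = 500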
@@ -4187,6 +4228,7 @@ var init_gemini_models = __esm({
  "src/providers/gemini-models.ts"() {
  "use strict";
  GEMINI_MODELS = [
+ // Gemini 3 Pro (Preview)
  {
  provider: "gemini",
  modelId: "gemini-3-pro-preview",
@@ -4195,8 +4237,11 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 2,
+ // $2.00 for prompts <= 200k, $4.00 for > 200k (using lower tier)
  output: 12,
+ // $12.00 for prompts <= 200k, $18.00 for > 200k
  cachedInput: 0.2
+ // $0.20 for prompts <= 200k
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -4209,9 +4254,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 3",
  releaseDate: "2025-11-18",
- notes: "Most advanced model. 1501 Elo LMArena, 91.9% GPQA Diamond, 76.2% SWE-bench. Deep Think mode available."
+ notes: "Best model for multimodal understanding, agentic and vibe-coding. Deep Think mode available."
  }
  },
+ // Gemini 2.5 Pro
  {
  provider: "gemini",
  modelId: "gemini-2.5-pro",
@@ -4220,8 +4266,11 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 1.25,
+ // $1.25 for prompts <= 200k, $2.50 for > 200k
  output: 10,
+ // $10.00 for prompts <= 200k, $15.00 for > 200k
  cachedInput: 0.125
+ // $0.125 for prompts <= 200k
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -4234,9 +4283,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 2.5",
  releaseDate: "2025-06",
- notes: "Balanced multimodal model with 1M context. Best for complex agents and reasoning."
+ notes: "State-of-the-art multipurpose model. Excels at coding and complex reasoning."
  }
  },
+ // Gemini 2.5 Flash
  {
  provider: "gemini",
  modelId: "gemini-2.5-flash",
@@ -4245,8 +4295,10 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 0.3,
+ // $0.30 for text/image/video, $1.00 for audio
  output: 2.5,
  cachedInput: 0.03
+ // $0.03 for text/image/video
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -4259,9 +4311,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 2.5",
  releaseDate: "2025-06",
- notes: "Best price-performance ratio with thinking enabled by default"
+ notes: "First hybrid reasoning model with 1M context and thinking budgets."
  }
  },
+ // Gemini 2.5 Flash-Lite
  {
  provider: "gemini",
  modelId: "gemini-2.5-flash-lite",
@@ -4270,8 +4323,10 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 0.1,
+ // $0.10 for text/image/video, $0.30 for audio
  output: 0.4,
  cachedInput: 0.01
+ // $0.01 for text/image/video
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -4283,9 +4338,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 2.5",
  releaseDate: "2025-06",
- notes: "Fastest and most cost-efficient model for high-volume, low-latency tasks"
+ notes: "Smallest and most cost effective model, built for at scale usage."
  }
  },
+ // Gemini 2.0 Flash
  {
  provider: "gemini",
  modelId: "gemini-2.0-flash",
@@ -4294,8 +4350,10 @@ var init_gemini_models = __esm({
  maxOutputTokens: 8192,
  pricing: {
  input: 0.1,
+ // $0.10 for text/image/video, $0.70 for audio
  output: 0.4,
- cachedInput: 0.01
+ cachedInput: 0.025
+ // $0.025 for text/image/video
  },
  knowledgeCutoff: "2024-08",
  features: {
@@ -4306,9 +4364,10 @@ var init_gemini_models = __esm({
  },
  metadata: {
  family: "Gemini 2.0",
- notes: "Previous generation with 1M context and multimodal capabilities"
+ notes: "Balanced multimodal model with 1M context, built for the era of Agents."
  }
  },
+ // Gemini 2.0 Flash-Lite
  {
  provider: "gemini",
  modelId: "gemini-2.0-flash-lite",
@@ -4317,8 +4376,8 @@ var init_gemini_models = __esm({
  maxOutputTokens: 8192,
  pricing: {
  input: 0.075,
- output: 0.3,
- cachedInput: 75e-4
+ output: 0.3
+ // No context caching available for 2.0-flash-lite
  },
  knowledgeCutoff: "2024-08",
  features: {
@@ -4329,7 +4388,7 @@ var init_gemini_models = __esm({
  },
  metadata: {
  family: "Gemini 2.0",
- notes: "Lightweight previous generation model for cost-sensitive applications"
+ notes: "Smallest and most cost effective 2.0 model for at scale usage."
  }
  }
  ];
@@ -4499,7 +4558,9 @@ var init_gemini = __esm({
  return {
  inputTokens: usageMetadata.promptTokenCount ?? 0,
  outputTokens: usageMetadata.candidatesTokenCount ?? 0,
- totalTokens: usageMetadata.totalTokenCount ?? 0
+ totalTokens: usageMetadata.totalTokenCount ?? 0,
+ // Gemini returns cached token count in cachedContentTokenCount
+ cachedInputTokens: usageMetadata.cachedContentTokenCount ?? 0
  };
  }
  /**
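Unlike Anthropic's counters above, Gemini's promptTokenCount already includes any cached content, so cachedInputTokens is reported as a subset of inputTokens rather than added to it — the convention the updated estimateCost at the end of this diff expects. Illustrative mapping (values made up):

    const usageMetadata = {
      promptTokenCount: 2000,        // includes the cached portion
      candidatesTokenCount: 300,
      totalTokenCount: 2300,
      cachedContentTokenCount: 1500
    };
    // -> { inputTokens: 2000, outputTokens: 300, totalTokens: 2300, cachedInputTokens: 1500 }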
@@ -4555,10 +4616,11 @@ var init_openai_models = __esm({
  "src/providers/openai-models.ts"() {
  "use strict";
  OPENAI_MODELS = [
+ // GPT-5 Family
  {
  provider: "openai",
  modelId: "gpt-5.1",
- displayName: "GPT-5.1 Instant",
+ displayName: "GPT-5.1",
  contextWindow: 128e3,
  maxOutputTokens: 32768,
  pricing: {
@@ -4578,34 +4640,7 @@ var init_openai_models = __esm({
  metadata: {
  family: "GPT-5",
  releaseDate: "2025-11-12",
- notes: "Warmer, more intelligent, better instruction following. 2-3x faster than GPT-5.",
- supportsTemperature: false
- }
- },
- {
- provider: "openai",
- modelId: "gpt-5.1-thinking",
- displayName: "GPT-5.1 Thinking",
- contextWindow: 196e3,
- maxOutputTokens: 32768,
- pricing: {
- input: 1.25,
- output: 10,
- cachedInput: 0.125
- },
- knowledgeCutoff: "2024-09-30",
- features: {
- streaming: true,
- functionCalling: true,
- vision: true,
- reasoning: true,
- structuredOutputs: true,
- fineTuning: true
- },
- metadata: {
- family: "GPT-5",
- releaseDate: "2025-11-12",
- notes: "Advanced reasoning with thinking levels: Light, Standard, Extended, Heavy. Best for complex tasks.",
+ notes: "Latest GPT-5 with improved instruction following. 2-3x faster than GPT-5.",
  supportsTemperature: false
  }
  },
@@ -4685,6 +4720,255 @@ var init_openai_models = __esm({
  notes: "Fastest, most cost-efficient version for well-defined tasks",
  supportsTemperature: false
  }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-5-pro",
+ displayName: "GPT-5 Pro",
+ contextWindow: 272e3,
+ maxOutputTokens: 128e3,
+ pricing: {
+ input: 15,
+ output: 120
+ // No cached input pricing for gpt-5-pro
+ },
+ knowledgeCutoff: "2024-09-30",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "GPT-5",
+ notes: "Premium tier with enhanced capabilities. Does not support prompt caching.",
+ supportsTemperature: false
+ }
+ },
+ // GPT-4.1 Family
+ {
+ provider: "openai",
+ modelId: "gpt-4.1",
+ displayName: "GPT-4.1",
+ contextWindow: 128e3,
+ maxOutputTokens: 32768,
+ pricing: {
+ input: 2,
+ output: 8,
+ cachedInput: 0.5
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4.1",
+ notes: "Improved GPT-4 with better instruction following"
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4.1-mini",
+ displayName: "GPT-4.1 Mini",
+ contextWindow: 128e3,
+ maxOutputTokens: 32768,
+ pricing: {
+ input: 0.4,
+ output: 1.6,
+ cachedInput: 0.1
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4.1",
+ notes: "Cost-efficient GPT-4.1 variant"
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4.1-nano",
+ displayName: "GPT-4.1 Nano",
+ contextWindow: 128e3,
+ maxOutputTokens: 32768,
+ pricing: {
+ input: 0.1,
+ output: 0.4,
+ cachedInput: 0.025
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4.1",
+ notes: "Fastest GPT-4.1 variant for simple tasks"
+ }
+ },
+ // GPT-4o Family
+ {
+ provider: "openai",
+ modelId: "gpt-4o",
+ displayName: "GPT-4o",
+ contextWindow: 128e3,
+ maxOutputTokens: 16384,
+ pricing: {
+ input: 2.5,
+ output: 10,
+ cachedInput: 1.25
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4o",
+ notes: "Multimodal model optimized for speed"
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4o-mini",
+ displayName: "GPT-4o Mini",
+ contextWindow: 128e3,
+ maxOutputTokens: 16384,
+ pricing: {
+ input: 0.15,
+ output: 0.6,
+ cachedInput: 0.075
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4o",
+ notes: "Fast and affordable multimodal model"
+ }
+ },
+ // o-series (Reasoning models)
+ {
+ provider: "openai",
+ modelId: "o1",
+ displayName: "o1",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 15,
+ output: 60,
+ cachedInput: 7.5
+ },
+ knowledgeCutoff: "2024-12-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Advanced reasoning model with chain-of-thought",
+ supportsTemperature: false
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "o3",
+ displayName: "o3",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 2,
+ output: 8,
+ cachedInput: 0.5
+ },
+ knowledgeCutoff: "2025-01-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Next-gen reasoning model, more efficient than o1",
+ supportsTemperature: false
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "o4-mini",
+ displayName: "o4 Mini",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 1.1,
+ output: 4.4,
+ cachedInput: 0.275
+ },
+ knowledgeCutoff: "2025-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Cost-efficient reasoning model",
+ supportsTemperature: false
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "o3-mini",
+ displayName: "o3 Mini",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 1.1,
+ output: 4.4,
+ cachedInput: 0.55
+ },
+ knowledgeCutoff: "2025-01-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Compact reasoning model for cost-sensitive applications",
+ supportsTemperature: false
+ }
  }
  ];
  }
@@ -4765,7 +5049,8 @@ var init_openai = __esm({
  const usage = chunk.usage ? {
  inputTokens: chunk.usage.prompt_tokens,
  outputTokens: chunk.usage.completion_tokens,
- totalTokens: chunk.usage.total_tokens
+ totalTokens: chunk.usage.total_tokens,
+ cachedInputTokens: chunk.usage.prompt_tokens_details?.cached_tokens ?? 0
  } : void 0;
  if (finishReason || usage) {
  yield { text: "", finishReason, usage, rawEvent: chunk };
@@ -4982,20 +5267,28 @@ var init_model_registry = __esm({
  /**
  * Estimate API cost for a given model and token usage
  * @param modelId - Full model identifier
- * @param inputTokens - Number of input tokens
+ * @param inputTokens - Number of input tokens (total, including cached and cache creation)
  * @param outputTokens - Number of output tokens
- * @param useCachedInput - Whether to use cached input pricing (if supported by provider)
+ * @param cachedInputTokens - Number of cached input tokens (subset of inputTokens)
+ * @param cacheCreationInputTokens - Number of cache creation tokens (subset of inputTokens, Anthropic only)
  * @returns CostEstimate if model found, undefined otherwise
  */
- estimateCost(modelId, inputTokens, outputTokens, useCachedInput = false) {
+ estimateCost(modelId, inputTokens, outputTokens, cachedInputTokens = 0, cacheCreationInputTokens = 0) {
  const spec = this.getModelSpec(modelId);
  if (!spec) return void 0;
- const inputRate = useCachedInput && spec.pricing.cachedInput !== void 0 ? spec.pricing.cachedInput : spec.pricing.input;
- const inputCost = inputTokens / 1e6 * inputRate;
+ const cachedRate = spec.pricing.cachedInput ?? spec.pricing.input;
+ const cacheWriteRate = spec.pricing.cacheWriteInput ?? spec.pricing.input;
+ const uncachedInputTokens = inputTokens - cachedInputTokens - cacheCreationInputTokens;
+ const uncachedInputCost = uncachedInputTokens / 1e6 * spec.pricing.input;
+ const cachedInputCost = cachedInputTokens / 1e6 * cachedRate;
+ const cacheCreationCost = cacheCreationInputTokens / 1e6 * cacheWriteRate;
+ const inputCost = uncachedInputCost + cachedInputCost + cacheCreationCost;
  const outputCost = outputTokens / 1e6 * spec.pricing.output;
  const totalCost = inputCost + outputCost;
  return {
  inputCost,
+ cachedInputCost,
+ cacheCreationCost,
  outputCost,
  totalCost,
  currency: "USD"