llmist 0.6.2 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -864,7 +864,7 @@ function findSafeDelimiter(content) {
  }
  let counter = 1;
  while (counter < 1e3) {
- const delimiter = `HEREDOC_${counter}`;
+ const delimiter = `__GADGET_PARAM_${counter}__`;
  const regex = new RegExp(`^${delimiter}\\s*$`);
  const isUsed = lines.some((line) => regex.test(line));
  if (!isUsed) {
@@ -922,6 +922,10 @@ function formatParamsAsYaml(params) {
  }
  return lines.join("\n");
  }
+ function formatTomlInlineTable(obj) {
+ const entries = Object.entries(obj).map(([k, v]) => `${k} = ${formatTomlValue(v)}`);
+ return `{ ${entries.join(", ")} }`;
+ }
  function formatTomlValue(value) {
  if (typeof value === "string") {
  if (value.includes("\n")) {
@@ -939,10 +943,17 @@ ${delimiter}`;
  return '""';
  }
  if (Array.isArray(value)) {
- return JSON.stringify(value);
+ if (value.length === 0) return "[]";
+ const items = value.map((item) => {
+ if (typeof item === "object" && item !== null && !Array.isArray(item)) {
+ return formatTomlInlineTable(item);
+ }
+ return formatTomlValue(item);
+ });
+ return `[${items.join(", ")}]`;
  }
  if (typeof value === "object") {
- return JSON.stringify(value);
+ return formatTomlInlineTable(value);
  }
  return JSON.stringify(value);
  }
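
The practical effect of these two hunks: objects, and objects inside arrays, now serialize as TOML inline tables rather than raw JSON (which is not valid TOML). A minimal before/after sketch with invented inputs, calling the functions from the hunk above:

```typescript
formatTomlValue({ name: "retry", max: 3 });
// 0.6.2: '{"name":"retry","max":3}'    <- JSON, not valid TOML
// 0.8.0: '{ name = "retry", max = 3 }' <- TOML inline table

formatTomlValue([{ a: 1 }, { a: 2 }]);
// 0.6.2: '[{"a":1},{"a":2}]'
// 0.8.0: '[{ a = 1 }, { a = 2 }]'

formatTomlValue([1, "two", true]);
// 0.8.0: '[1, "two", true]' (scalars still go through JSON.stringify)
```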
@@ -960,7 +971,16 @@ var init_gadget = __esm({
  yaml = __toESM(require("js-yaml"), 1);
  init_schema_to_json();
  init_schema_validator();
- HEREDOC_DELIMITERS = ["EOF", "END", "DOC", "CONTENT", "TEXT", "HEREDOC", "DATA", "BLOCK"];
+ HEREDOC_DELIMITERS = [
+ "__GADGET_PARAM_EOF__",
+ "__GADGET_PARAM_END__",
+ "__GADGET_PARAM_DOC__",
+ "__GADGET_PARAM_CONTENT__",
+ "__GADGET_PARAM_TEXT__",
+ "__GADGET_PARAM_HEREDOC__",
+ "__GADGET_PARAM_DATA__",
+ "__GADGET_PARAM_BLOCK__"
+ ];
  BaseGadget = class {
  /**
  * The name of the gadget. Used for identification when LLM calls it.
@@ -1958,6 +1978,14 @@ function preprocessTomlHeredoc(tomlStr) {
  }
  return result.join("\n");
  }
+ function stripMarkdownFences(content) {
+ let cleaned = content.trim();
+ const openingFence = /^```(?:toml|yaml|json)?\s*\n/i;
+ const closingFence = /\n?```\s*$/;
+ cleaned = cleaned.replace(openingFence, "");
+ cleaned = cleaned.replace(closingFence, "");
+ return cleaned.trim();
+ }
  var yaml2, import_js_toml, globalInvocationCounter, StreamParser;
  var init_parser = __esm({
  "src/gadgets/parser.ts"() {
@@ -2013,35 +2041,36 @@ var init_parser = __esm({
  * Parse parameter string according to configured format
  */
  parseParameters(raw) {
+ const cleaned = stripMarkdownFences(raw);
  if (this.parameterFormat === "json") {
  try {
- return { parameters: JSON.parse(raw) };
+ return { parameters: JSON.parse(cleaned) };
  } catch (error) {
  return { parseError: this.truncateParseError(error, "JSON") };
  }
  }
  if (this.parameterFormat === "yaml") {
  try {
- return { parameters: yaml2.load(preprocessYaml(raw)) };
+ return { parameters: yaml2.load(preprocessYaml(cleaned)) };
  } catch (error) {
  return { parseError: this.truncateParseError(error, "YAML") };
  }
  }
  if (this.parameterFormat === "toml") {
  try {
- return { parameters: (0, import_js_toml.load)(preprocessTomlHeredoc(raw)) };
+ return { parameters: (0, import_js_toml.load)(preprocessTomlHeredoc(cleaned)) };
  } catch (error) {
  return { parseError: this.truncateParseError(error, "TOML") };
  }
  }
  try {
- return { parameters: JSON.parse(raw) };
+ return { parameters: JSON.parse(cleaned) };
  } catch {
  try {
- return { parameters: (0, import_js_toml.load)(preprocessTomlHeredoc(raw)) };
+ return { parameters: (0, import_js_toml.load)(preprocessTomlHeredoc(cleaned)) };
  } catch {
  try {
- return { parameters: yaml2.load(preprocessYaml(raw)) };
+ return { parameters: yaml2.load(preprocessYaml(cleaned)) };
  } catch (error) {
  return { parseError: this.truncateParseError(error, "auto") };
  }
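
With fence stripping applied, the fallback chain (when `parameterFormat` is none of the explicit three) still tries JSON first, then TOML, then YAML, each on the cleaned text. A rough sketch of the resulting behavior (inputs invented; `parser` stands for a `StreamParser` left in auto-detection):

```typescript
parser.parseParameters('{"n": 1}'); // valid JSON -> parsed on the first attempt
parser.parseParameters("n = 1");    // JSON fails -> parsed as TOML
parser.parseParameters("n: 1");     // JSON and TOML fail -> parsed as YAML
// Only if all three fail does it return { parseError } tagged "auto".
```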
@@ -2587,6 +2616,7 @@ var init_agent = __esm({
  gadgetEndPrefix;
  onHumanInputRequired;
  textOnlyHandler;
+ textWithGadgetsHandler;
  stopOnGadgetError;
  shouldContinueAfterError;
  defaultGadgetTimeoutMs;
@@ -2617,6 +2647,7 @@ var init_agent = __esm({
  this.gadgetEndPrefix = options.gadgetEndPrefix;
  this.onHumanInputRequired = options.onHumanInputRequired;
  this.textOnlyHandler = options.textOnlyHandler ?? "terminate";
+ this.textWithGadgetsHandler = options.textWithGadgetsHandler;
  this.stopOnGadgetError = options.stopOnGadgetError ?? true;
  this.shouldContinueAfterError = options.shouldContinueAfterError;
  this.defaultGadgetTimeoutMs = options.defaultGadgetTimeoutMs;
@@ -2804,6 +2835,17 @@ var init_agent = __esm({
  }
  }
  if (result.didExecuteGadgets) {
+ if (this.textWithGadgetsHandler) {
+ const textContent = result.outputs.filter((output) => output.type === "text").map((output) => output.content).join("");
+ if (textContent.trim()) {
+ const { gadgetName, parameterMapping, resultMapping } = this.textWithGadgetsHandler;
+ this.conversation.addGadgetCall(
+ gadgetName,
+ parameterMapping(textContent),
+ resultMapping ? resultMapping(textContent) : textContent
+ );
+ }
+ }
  for (const output of result.outputs) {
  if (output.type === "gadget_result") {
  const gadgetResult = output.result;
@@ -2815,7 +2857,13 @@ var init_agent = __esm({
  }
  }
  } else {
- this.conversation.addAssistantMessage(finalMessage);
+ if (finalMessage.trim()) {
+ this.conversation.addGadgetCall(
+ "TellUser",
+ { message: finalMessage, done: false, type: "info" },
+ `\u2139\uFE0F ${finalMessage}`
+ );
+ }
  const shouldBreak = await this.handleTextOnlyResponse(finalMessage);
  if (shouldBreak) {
  break;
@@ -3000,6 +3048,7 @@ var AgentBuilder;
 var init_builder = __esm({
 "src/agent/builder.ts"() {
 "use strict";
+ init_constants();
 init_model_shortcuts();
 init_registry();
 init_agent();
@@ -3021,6 +3070,7 @@ var init_builder = __esm({
  gadgetStartPrefix;
  gadgetEndPrefix;
  textOnlyHandler;
+ textWithGadgetsHandler;
  stopOnGadgetError;
  shouldContinueAfterError;
  defaultGadgetTimeoutMs;
@@ -3283,6 +3333,30 @@ var init_builder = __esm({
  this.textOnlyHandler = handler;
  return this;
  }
+ /**
+ * Set the handler for text content that appears alongside gadget calls.
+ *
+ * When set, text accompanying gadget responses will be wrapped as a
+ * synthetic gadget call before the actual gadget results in the
+ * conversation history.
+ *
+ * @param handler - Configuration for wrapping text
+ * @returns This builder for chaining
+ *
+ * @example
+ * ```typescript
+ * // Wrap text as TellUser gadget
+ * .withTextWithGadgetsHandler({
+ *   gadgetName: "TellUser",
+ *   parameterMapping: (text) => ({ message: text, done: false, type: "info" }),
+ *   resultMapping: (text) => `ℹ️ ${text}`,
+ * })
+ * ```
+ */
+ withTextWithGadgetsHandler(handler) {
+ this.textWithGadgetsHandler = handler;
+ return this;
+ }
  /**
  * Set whether to stop gadget execution on first error.
  *
@@ -3397,6 +3471,69 @@ var init_builder = __esm({
  this.gadgetOutputLimitPercent = percent;
  return this;
  }
+ /**
+ * Add a synthetic gadget call to the conversation history.
+ *
+ * This is useful for in-context learning - showing the LLM what "past self"
+ * did correctly so it mimics the pattern. The call is formatted with proper
+ * markers and parameter format.
+ *
+ * @param gadgetName - Name of the gadget
+ * @param parameters - Parameters passed to the gadget
+ * @param result - Result returned by the gadget
+ * @returns This builder for chaining
+ *
+ * @example
+ * ```typescript
+ * .withSyntheticGadgetCall(
+ *   'TellUser',
+ *   {
+ *     message: '👋 Hello!\n\nHere\'s what I can do:\n- Analyze code\n- Run commands',
+ *     done: false,
+ *     type: 'info'
+ *   },
+ *   'ℹ️ 👋 Hello!\n\nHere\'s what I can do:\n- Analyze code\n- Run commands'
+ * )
+ * ```
+ */
+ withSyntheticGadgetCall(gadgetName, parameters, result) {
+ const startPrefix = this.gadgetStartPrefix ?? GADGET_START_PREFIX;
+ const endPrefix = this.gadgetEndPrefix ?? GADGET_END_PREFIX;
+ const format = this.parameterFormat ?? "yaml";
+ const paramStr = this.formatSyntheticParameters(parameters, format);
+ this.initialMessages.push({
+ role: "assistant",
+ content: `${startPrefix}${gadgetName}
+ ${paramStr}
+ ${endPrefix}`
+ });
+ this.initialMessages.push({
+ role: "user",
+ content: `Result: ${result}`
+ });
+ return this;
+ }
+ /**
+ * Format parameters for synthetic gadget calls.
+ * Uses heredoc for multiline string values.
+ */
+ formatSyntheticParameters(parameters, format) {
+ if (format === "json" || format === "auto") {
+ return JSON.stringify(parameters);
+ }
+ return Object.entries(parameters).map(([key, value]) => {
+ if (typeof value === "string" && value.includes("\n")) {
+ const separator = format === "yaml" ? ":" : " =";
+ return `${key}${separator} <<<EOF
+ ${value}
+ EOF`;
+ }
+ if (format === "yaml") {
+ return typeof value === "string" ? `${key}: ${value}` : `${key}: ${JSON.stringify(value)}`;
+ }
+ return `${key} = ${JSON.stringify(value)}`;
+ }).join("\n");
+ }
  /**
  * Build and create the agent with the given user prompt.
  * Returns the Agent instance ready to run.
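
For orientation, here is roughly what one synthetic call contributes to `initialMessages`, assuming the default yaml format; the literal marker strings come from `GADGET_START_PREFIX`/`GADGET_END_PREFIX` in the constants module pulled in by `init_constants()` above and are shown as placeholders:

```typescript
builder.withSyntheticGadgetCall("TellUser", { message: "Hi\nthere", done: false }, "ℹ️ Hi there");
// assistant message (markers abbreviated):
//   <start>TellUser
//   message: <<<EOF
//   Hi
//   there
//   EOF
//   done: false
//   <end>
// user message:
//   Result: ℹ️ Hi there
```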
@@ -3439,6 +3576,7 @@ var init_builder = __esm({
  gadgetStartPrefix: this.gadgetStartPrefix,
  gadgetEndPrefix: this.gadgetEndPrefix,
  textOnlyHandler: this.textOnlyHandler,
+ textWithGadgetsHandler: this.textWithGadgetsHandler,
  stopOnGadgetError: this.stopOnGadgetError,
  shouldContinueAfterError: this.shouldContinueAfterError,
  defaultGadgetTimeoutMs: this.defaultGadgetTimeoutMs,
@@ -3540,6 +3678,7 @@ var init_builder = __esm({
  gadgetStartPrefix: this.gadgetStartPrefix,
  gadgetEndPrefix: this.gadgetEndPrefix,
  textOnlyHandler: this.textOnlyHandler,
+ textWithGadgetsHandler: this.textWithGadgetsHandler,
  stopOnGadgetError: this.stopOnGadgetError,
  shouldContinueAfterError: this.shouldContinueAfterError,
  defaultGadgetTimeoutMs: this.defaultGadgetTimeoutMs,
@@ -3567,7 +3706,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3591,7 +3731,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 1,
  output: 5,
- cachedInput: 0.1
+ cachedInput: 0.1,
+ cacheWriteInput: 1.25
  },
  knowledgeCutoff: "2025-02",
  features: {
@@ -3615,7 +3756,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2025-03",
  features: {
@@ -3639,7 +3781,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2024-11",
  features: {
@@ -3663,7 +3806,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 15,
  output: 75,
- cachedInput: 1.5
+ cachedInput: 1.5,
+ cacheWriteInput: 18.75
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3687,7 +3831,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 15,
  output: 75,
- cachedInput: 1.5
+ cachedInput: 1.5,
+ cacheWriteInput: 18.75
  },
  knowledgeCutoff: "2025-03",
  features: {
@@ -3710,7 +3855,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 0.8,
  output: 4,
- cachedInput: 0.08
+ cachedInput: 0.08,
+ cacheWriteInput: 1
  },
  knowledgeCutoff: "2024-07",
  features: {
@@ -3733,7 +3879,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 0.25,
  output: 1.25,
- cachedInput: 0.025
+ cachedInput: 0.025,
+ cacheWriteInput: 0.3125
  },
  knowledgeCutoff: "2023-08",
  features: {
@@ -3757,7 +3904,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 1,
  output: 5,
- cachedInput: 0.1
+ cachedInput: 0.1,
+ cacheWriteInput: 1.25
  },
  knowledgeCutoff: "2025-02",
  features: {
@@ -3781,7 +3929,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3805,7 +3954,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 5,
  output: 25,
- cachedInput: 0.5
+ cachedInput: 0.5,
+ cacheWriteInput: 6.25
  },
  knowledgeCutoff: "2025-03",
  features: {
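
All eleven Anthropic entries follow the vendor's published ratios: cache reads bill at 0.1x the base input rate and cache writes (5-minute TTL) at 1.25x. A quick sanity check against the numbers in these hunks (USD per million tokens; epsilon comparison to sidestep floating-point rounding):

```typescript
// (input, cachedInput, cacheWriteInput) tuples taken from the hunks above.
const rates: Array<[number, number, number]> = [
  [3, 0.3, 3.75], [1, 0.1, 1.25], [15, 1.5, 18.75],
  [0.8, 0.08, 1], [0.25, 0.025, 0.3125], [5, 0.5, 6.25],
];
const ok = rates.every(([input, read, write]) =>
  Math.abs(read - input * 0.1) < 1e-9 && Math.abs(write - input * 1.25) < 1e-9
);
console.log(ok); // true
```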
@@ -3920,15 +4070,27 @@ var init_anthropic = __esm({
  }
  buildRequestPayload(options, descriptor, spec, messages) {
  const systemMessages = messages.filter((message) => message.role === "system");
- const system = systemMessages.length > 0 ? systemMessages.map((m) => m.content).join("\n\n") : void 0;
- const conversation = messages.filter(
+ const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
+ type: "text",
+ text: m.content,
+ // Add cache_control to the LAST system message block
+ ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
+ })) : void 0;
+ const nonSystemMessages = messages.filter(
  (message) => message.role !== "system"
- ).map((message) => ({
+ );
+ const lastUserIndex = nonSystemMessages.reduce(
+ (lastIdx, msg, idx) => msg.role === "user" ? idx : lastIdx,
+ -1
+ );
+ const conversation = nonSystemMessages.map((message, index) => ({
  role: message.role,
  content: [
  {
  type: "text",
- text: message.content
+ text: message.content,
+ // Add cache_control to the LAST user message
+ ...message.role === "user" && index === lastUserIndex ? { cache_control: { type: "ephemeral" } } : {}
  }
  ]
  }));
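
Net effect: each request carries at most two ephemeral cache breakpoints, one on the last system block and one on the last user message. A sketch of the payload shape this produces (shapes follow Anthropic's Messages API; the text values are invented):

```typescript
const payload = {
  system: [
    { type: "text", text: "You are...", cache_control: { type: "ephemeral" } },
  ],
  messages: [
    { role: "user", content: [{ type: "text", text: "First question" }] },
    { role: "assistant", content: [{ type: "text", text: "An answer" }] },
    // Only the final user turn gets the second breakpoint:
    {
      role: "user",
      content: [{ type: "text", text: "Follow-up", cache_control: { type: "ephemeral" } }],
    },
  ],
};
```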
@@ -3954,15 +4116,22 @@ var init_anthropic = __esm({
  async *wrapStream(iterable) {
  const stream2 = iterable;
  let inputTokens = 0;
+ let cachedInputTokens = 0;
+ let cacheCreationInputTokens = 0;
  for await (const event of stream2) {
  if (event.type === "message_start") {
- inputTokens = event.message.usage.input_tokens;
+ const usage = event.message.usage;
+ cachedInputTokens = usage.cache_read_input_tokens ?? 0;
+ cacheCreationInputTokens = usage.cache_creation_input_tokens ?? 0;
+ inputTokens = usage.input_tokens + cachedInputTokens + cacheCreationInputTokens;
  yield {
  text: "",
  usage: {
  inputTokens,
  outputTokens: 0,
- totalTokens: inputTokens
+ totalTokens: inputTokens,
+ cachedInputTokens,
+ cacheCreationInputTokens
  },
  rawEvent: event
  };
@@ -3976,7 +4145,9 @@ var init_anthropic = __esm({
  const usage = event.usage ? {
  inputTokens,
  outputTokens: event.usage.output_tokens,
- totalTokens: inputTokens + event.usage.output_tokens
+ totalTokens: inputTokens + event.usage.output_tokens,
+ cachedInputTokens,
+ cacheCreationInputTokens
  } : void 0;
  if (event.delta.stop_reason || usage) {
  yield {
@@ -4057,6 +4228,7 @@ var init_gemini_models = __esm({
  "src/providers/gemini-models.ts"() {
  "use strict";
  GEMINI_MODELS = [
+ // Gemini 3 Pro (Preview)
  {
  provider: "gemini",
  modelId: "gemini-3-pro-preview",
@@ -4065,8 +4237,11 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 2,
+ // $2.00 for prompts <= 200k, $4.00 for > 200k (using lower tier)
  output: 12,
+ // $12.00 for prompts <= 200k, $18.00 for > 200k
  cachedInput: 0.2
+ // $0.20 for prompts <= 200k
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -4079,9 +4254,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 3",
  releaseDate: "2025-11-18",
- notes: "Most advanced model. 1501 Elo LMArena, 91.9% GPQA Diamond, 76.2% SWE-bench. Deep Think mode available."
+ notes: "Best model for multimodal understanding, agentic and vibe-coding. Deep Think mode available."
  }
  },
+ // Gemini 2.5 Pro
  {
  provider: "gemini",
  modelId: "gemini-2.5-pro",
@@ -4090,8 +4266,11 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 1.25,
+ // $1.25 for prompts <= 200k, $2.50 for > 200k
  output: 10,
+ // $10.00 for prompts <= 200k, $15.00 for > 200k
  cachedInput: 0.125
+ // $0.125 for prompts <= 200k
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -4104,9 +4283,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 2.5",
  releaseDate: "2025-06",
- notes: "Balanced multimodal model with 1M context. Best for complex agents and reasoning."
+ notes: "State-of-the-art multipurpose model. Excels at coding and complex reasoning."
  }
  },
+ // Gemini 2.5 Flash
  {
  provider: "gemini",
  modelId: "gemini-2.5-flash",
@@ -4115,8 +4295,10 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 0.3,
+ // $0.30 for text/image/video, $1.00 for audio
  output: 2.5,
  cachedInput: 0.03
+ // $0.03 for text/image/video
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -4129,9 +4311,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 2.5",
  releaseDate: "2025-06",
- notes: "Best price-performance ratio with thinking enabled by default"
+ notes: "First hybrid reasoning model with 1M context and thinking budgets."
  }
  },
+ // Gemini 2.5 Flash-Lite
  {
  provider: "gemini",
  modelId: "gemini-2.5-flash-lite",
@@ -4140,8 +4323,10 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 0.1,
+ // $0.10 for text/image/video, $0.30 for audio
  output: 0.4,
  cachedInput: 0.01
+ // $0.01 for text/image/video
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -4153,9 +4338,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 2.5",
  releaseDate: "2025-06",
- notes: "Fastest and most cost-efficient model for high-volume, low-latency tasks"
+ notes: "Smallest and most cost effective model, built for at scale usage."
  }
  },
+ // Gemini 2.0 Flash
  {
  provider: "gemini",
  modelId: "gemini-2.0-flash",
@@ -4164,8 +4350,10 @@ var init_gemini_models = __esm({
  maxOutputTokens: 8192,
  pricing: {
  input: 0.1,
+ // $0.10 for text/image/video, $0.70 for audio
  output: 0.4,
- cachedInput: 0.01
+ cachedInput: 0.025
+ // $0.025 for text/image/video
  },
  knowledgeCutoff: "2024-08",
  features: {
@@ -4176,9 +4364,10 @@ var init_gemini_models = __esm({
  },
  metadata: {
  family: "Gemini 2.0",
- notes: "Previous generation with 1M context and multimodal capabilities"
+ notes: "Balanced multimodal model with 1M context, built for the era of Agents."
  }
  },
+ // Gemini 2.0 Flash-Lite
  {
  provider: "gemini",
  modelId: "gemini-2.0-flash-lite",
@@ -4187,8 +4376,8 @@ var init_gemini_models = __esm({
  maxOutputTokens: 8192,
  pricing: {
  input: 0.075,
- output: 0.3,
- cachedInput: 75e-4
+ output: 0.3
+ // No context caching available for 2.0-flash-lite
  },
  knowledgeCutoff: "2024-08",
  features: {
@@ -4199,7 +4388,7 @@ var init_gemini_models = __esm({
  },
  metadata: {
  family: "Gemini 2.0",
- notes: "Lightweight previous generation model for cost-sensitive applications"
+ notes: "Smallest and most cost effective 2.0 model for at scale usage."
  }
  }
  ];
@@ -4369,7 +4558,9 @@ var init_gemini = __esm({
  return {
  inputTokens: usageMetadata.promptTokenCount ?? 0,
  outputTokens: usageMetadata.candidatesTokenCount ?? 0,
- totalTokens: usageMetadata.totalTokenCount ?? 0
+ totalTokens: usageMetadata.totalTokenCount ?? 0,
+ // Gemini returns cached token count in cachedContentTokenCount
+ cachedInputTokens: usageMetadata.cachedContentTokenCount ?? 0
  };
  }
  /**
@@ -4425,10 +4616,11 @@ var init_openai_models = __esm({
  "src/providers/openai-models.ts"() {
  "use strict";
  OPENAI_MODELS = [
+ // GPT-5 Family
  {
  provider: "openai",
  modelId: "gpt-5.1",
- displayName: "GPT-5.1 Instant",
+ displayName: "GPT-5.1",
  contextWindow: 128e3,
  maxOutputTokens: 32768,
  pricing: {
@@ -4448,34 +4640,7 @@ var init_openai_models = __esm({
  metadata: {
  family: "GPT-5",
  releaseDate: "2025-11-12",
- notes: "Warmer, more intelligent, better instruction following. 2-3x faster than GPT-5.",
- supportsTemperature: false
- }
- },
- {
- provider: "openai",
- modelId: "gpt-5.1-thinking",
- displayName: "GPT-5.1 Thinking",
- contextWindow: 196e3,
- maxOutputTokens: 32768,
- pricing: {
- input: 1.25,
- output: 10,
- cachedInput: 0.125
- },
- knowledgeCutoff: "2024-09-30",
- features: {
- streaming: true,
- functionCalling: true,
- vision: true,
- reasoning: true,
- structuredOutputs: true,
- fineTuning: true
- },
- metadata: {
- family: "GPT-5",
- releaseDate: "2025-11-12",
- notes: "Advanced reasoning with thinking levels: Light, Standard, Extended, Heavy. Best for complex tasks.",
+ notes: "Latest GPT-5 with improved instruction following. 2-3x faster than GPT-5.",
  supportsTemperature: false
  }
  },
@@ -4555,6 +4720,255 @@ var init_openai_models = __esm({
  notes: "Fastest, most cost-efficient version for well-defined tasks",
  supportsTemperature: false
  }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-5-pro",
+ displayName: "GPT-5 Pro",
+ contextWindow: 272e3,
+ maxOutputTokens: 128e3,
+ pricing: {
+ input: 15,
+ output: 120
+ // No cached input pricing for gpt-5-pro
+ },
+ knowledgeCutoff: "2024-09-30",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "GPT-5",
+ notes: "Premium tier with enhanced capabilities. Does not support prompt caching.",
+ supportsTemperature: false
+ }
+ },
+ // GPT-4.1 Family
+ {
+ provider: "openai",
+ modelId: "gpt-4.1",
+ displayName: "GPT-4.1",
+ contextWindow: 128e3,
+ maxOutputTokens: 32768,
+ pricing: {
+ input: 2,
+ output: 8,
+ cachedInput: 0.5
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4.1",
+ notes: "Improved GPT-4 with better instruction following"
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4.1-mini",
+ displayName: "GPT-4.1 Mini",
+ contextWindow: 128e3,
+ maxOutputTokens: 32768,
+ pricing: {
+ input: 0.4,
+ output: 1.6,
+ cachedInput: 0.1
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4.1",
+ notes: "Cost-efficient GPT-4.1 variant"
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4.1-nano",
+ displayName: "GPT-4.1 Nano",
+ contextWindow: 128e3,
+ maxOutputTokens: 32768,
+ pricing: {
+ input: 0.1,
+ output: 0.4,
+ cachedInput: 0.025
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4.1",
+ notes: "Fastest GPT-4.1 variant for simple tasks"
+ }
+ },
+ // GPT-4o Family
+ {
+ provider: "openai",
+ modelId: "gpt-4o",
+ displayName: "GPT-4o",
+ contextWindow: 128e3,
+ maxOutputTokens: 16384,
+ pricing: {
+ input: 2.5,
+ output: 10,
+ cachedInput: 1.25
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4o",
+ notes: "Multimodal model optimized for speed"
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4o-mini",
+ displayName: "GPT-4o Mini",
+ contextWindow: 128e3,
+ maxOutputTokens: 16384,
+ pricing: {
+ input: 0.15,
+ output: 0.6,
+ cachedInput: 0.075
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4o",
+ notes: "Fast and affordable multimodal model"
+ }
+ },
+ // o-series (Reasoning models)
+ {
+ provider: "openai",
+ modelId: "o1",
+ displayName: "o1",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 15,
+ output: 60,
+ cachedInput: 7.5
+ },
+ knowledgeCutoff: "2024-12-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Advanced reasoning model with chain-of-thought",
+ supportsTemperature: false
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "o3",
+ displayName: "o3",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 2,
+ output: 8,
+ cachedInput: 0.5
+ },
+ knowledgeCutoff: "2025-01-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Next-gen reasoning model, more efficient than o1",
+ supportsTemperature: false
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "o4-mini",
+ displayName: "o4 Mini",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 1.1,
+ output: 4.4,
+ cachedInput: 0.275
+ },
+ knowledgeCutoff: "2025-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Cost-efficient reasoning model",
+ supportsTemperature: false
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "o3-mini",
+ displayName: "o3 Mini",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 1.1,
+ output: 4.4,
+ cachedInput: 0.55
+ },
+ knowledgeCutoff: "2025-01-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Compact reasoning model for cost-sensitive applications",
+ supportsTemperature: false
+ }
  }
  ];
  }
@@ -4635,7 +5049,8 @@ var init_openai = __esm({
  const usage = chunk.usage ? {
  inputTokens: chunk.usage.prompt_tokens,
  outputTokens: chunk.usage.completion_tokens,
- totalTokens: chunk.usage.total_tokens
+ totalTokens: chunk.usage.total_tokens,
+ cachedInputTokens: chunk.usage.prompt_tokens_details?.cached_tokens ?? 0
  } : void 0;
  if (finishReason || usage) {
  yield { text: "", finishReason, usage, rawEvent: chunk };
@@ -4852,20 +5267,28 @@ var init_model_registry = __esm({
  /**
  * Estimate API cost for a given model and token usage
  * @param modelId - Full model identifier
- * @param inputTokens - Number of input tokens
+ * @param inputTokens - Number of input tokens (total, including cached and cache creation)
  * @param outputTokens - Number of output tokens
- * @param useCachedInput - Whether to use cached input pricing (if supported by provider)
+ * @param cachedInputTokens - Number of cached input tokens (subset of inputTokens)
+ * @param cacheCreationInputTokens - Number of cache creation tokens (subset of inputTokens, Anthropic only)
  * @returns CostEstimate if model found, undefined otherwise
  */
- estimateCost(modelId, inputTokens, outputTokens, useCachedInput = false) {
+ estimateCost(modelId, inputTokens, outputTokens, cachedInputTokens = 0, cacheCreationInputTokens = 0) {
  const spec = this.getModelSpec(modelId);
  if (!spec) return void 0;
- const inputRate = useCachedInput && spec.pricing.cachedInput !== void 0 ? spec.pricing.cachedInput : spec.pricing.input;
- const inputCost = inputTokens / 1e6 * inputRate;
+ const cachedRate = spec.pricing.cachedInput ?? spec.pricing.input;
+ const cacheWriteRate = spec.pricing.cacheWriteInput ?? spec.pricing.input;
+ const uncachedInputTokens = inputTokens - cachedInputTokens - cacheCreationInputTokens;
+ const uncachedInputCost = uncachedInputTokens / 1e6 * spec.pricing.input;
+ const cachedInputCost = cachedInputTokens / 1e6 * cachedRate;
+ const cacheCreationCost = cacheCreationInputTokens / 1e6 * cacheWriteRate;
+ const inputCost = uncachedInputCost + cachedInputCost + cacheCreationCost;
  const outputCost = outputTokens / 1e6 * spec.pricing.output;
  const totalCost = inputCost + outputCost;
  return {
  inputCost,
+ cachedInputCost,
+ cacheCreationCost,
  outputCost,
  totalCost,
  currency: "USD"
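
A worked example of the new signature, using the Sonnet-class rates from this diff ($3 input / $15 output / $0.30 cache read / $3.75 cache write per million tokens); the model identifier string is illustrative:

```typescript
// 100k total input tokens: 80k cache reads + 10k cache writes + 10k uncached.
const est = registry.estimateCost("claude-sonnet-4-5", 100_000, 2_000, 80_000, 10_000);
// uncached: (10_000 / 1e6) * 3    = $0.0300
// cached:   (80_000 / 1e6) * 0.30 = $0.0240
// write:    (10_000 / 1e6) * 3.75 = $0.0375
// output:   ( 2_000 / 1e6) * 15   = $0.0300
// est.totalCost ≈ 0.1215, versus 0.33 if none of the prompt were cached.
```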