llmist 0.7.0 → 0.8.0

package/dist/index.cjs CHANGED
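
Reading guide (summarizing the hunks below): heredoc delimiters are namespaced as __GADGET_PARAM_*__ to avoid collisions with user content; the Anthropic provider gains prompt caching (cache_control breakpoints, cache-write pricing, cache-aware usage reporting); the Gemini and OpenAI model catalogs are refreshed (gpt-5.1-thinking removed; GPT-5 Pro, GPT-4.1, GPT-4o, and o-series entries added); and estimateCost now prices uncached, cached, and cache-creation tokens separately.
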
@@ -881,7 +881,7 @@ function findSafeDelimiter(content) {
  }
  let counter = 1;
  while (counter < 1e3) {
- const delimiter = `HEREDOC_${counter}`;
+ const delimiter = `__GADGET_PARAM_${counter}__`;
  const regex = new RegExp(`^${delimiter}\\s*$`);
  const isUsed = lines.some((line) => regex.test(line));
  if (!isUsed) {
@@ -988,7 +988,16 @@ var init_gadget = __esm({
  yaml = __toESM(require("js-yaml"), 1);
  init_schema_to_json();
  init_schema_validator();
- HEREDOC_DELIMITERS = ["EOF", "END", "DOC", "CONTENT", "TEXT", "HEREDOC", "DATA", "BLOCK"];
+ HEREDOC_DELIMITERS = [
+ "__GADGET_PARAM_EOF__",
+ "__GADGET_PARAM_END__",
+ "__GADGET_PARAM_DOC__",
+ "__GADGET_PARAM_CONTENT__",
+ "__GADGET_PARAM_TEXT__",
+ "__GADGET_PARAM_HEREDOC__",
+ "__GADGET_PARAM_DATA__",
+ "__GADGET_PARAM_BLOCK__"
+ ];
  BaseGadget = class {
  /**
  * The name of the gadget. Used for identification when LLM calls it.
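
Why these two hunks matter: bare delimiters like EOF or END plausibly appear verbatim in user-supplied content, while the namespaced __GADGET_PARAM_*__ forms are very unlikely to collide. A minimal sketch of the fallback logic as diffed (the content split and the final throw are assumptions; this hunk does not show the bundle's actual failure handling):

    // Sketch: scan numbered candidates until one does not appear
    // alone on any line of the content.
    function findSafeDelimiter(content) {
      const lines = content.split("\n");
      let counter = 1;
      while (counter < 1e3) {
        const delimiter = `__GADGET_PARAM_${counter}__`;
        const regex = new RegExp(`^${delimiter}\\s*$`);
        if (!lines.some((line) => regex.test(line))) return delimiter;
        counter++;
      }
      throw new Error("no safe delimiter found"); // assumed failure mode
    }
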
@@ -3096,7 +3105,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3120,7 +3130,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 1,
  output: 5,
- cachedInput: 0.1
+ cachedInput: 0.1,
+ cacheWriteInput: 1.25
  },
  knowledgeCutoff: "2025-02",
  features: {
@@ -3144,7 +3155,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2025-03",
  features: {
@@ -3168,7 +3180,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2024-11",
  features: {
@@ -3192,7 +3205,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 15,
  output: 75,
- cachedInput: 1.5
+ cachedInput: 1.5,
+ cacheWriteInput: 18.75
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3216,7 +3230,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 15,
  output: 75,
- cachedInput: 1.5
+ cachedInput: 1.5,
+ cacheWriteInput: 18.75
  },
  knowledgeCutoff: "2025-03",
  features: {
@@ -3239,7 +3254,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 0.8,
  output: 4,
- cachedInput: 0.08
+ cachedInput: 0.08,
+ cacheWriteInput: 1
  },
  knowledgeCutoff: "2024-07",
  features: {
@@ -3262,7 +3278,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 0.25,
  output: 1.25,
- cachedInput: 0.025
+ cachedInput: 0.025,
+ cacheWriteInput: 0.3125
  },
  knowledgeCutoff: "2023-08",
  features: {
@@ -3286,7 +3303,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 1,
  output: 5,
- cachedInput: 0.1
+ cachedInput: 0.1,
+ cacheWriteInput: 1.25
  },
  knowledgeCutoff: "2025-02",
  features: {
@@ -3310,7 +3328,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 3,
  output: 15,
- cachedInput: 0.3
+ cachedInput: 0.3,
+ cacheWriteInput: 3.75
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3334,7 +3353,8 @@ var init_anthropic_models = __esm({
  pricing: {
  input: 5,
  output: 25,
- cachedInput: 0.5
+ cachedInput: 0.5,
+ cacheWriteInput: 6.25
  },
  knowledgeCutoff: "2025-03",
  features: {
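
Note the pattern in the new cacheWriteInput values across every Anthropic entry: each is 1.25x its input rate (3 -> 3.75, 1 -> 1.25, 15 -> 18.75, 0.8 -> 1, 0.25 -> 0.3125, 5 -> 6.25), i.e. a 25% per-million-token premium for writing the prompt cache, while cachedInput stays at 10% of the input rate for reads.
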
@@ -3449,15 +3469,27 @@ var init_anthropic = __esm({
  }
  buildRequestPayload(options, descriptor, spec, messages) {
  const systemMessages = messages.filter((message) => message.role === "system");
- const system = systemMessages.length > 0 ? systemMessages.map((m) => m.content).join("\n\n") : void 0;
- const conversation = messages.filter(
+ const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
+ type: "text",
+ text: m.content,
+ // Add cache_control to the LAST system message block
+ ...index === systemMessages.length - 1 ? { cache_control: { type: "ephemeral" } } : {}
+ })) : void 0;
+ const nonSystemMessages = messages.filter(
  (message) => message.role !== "system"
- ).map((message) => ({
+ );
+ const lastUserIndex = nonSystemMessages.reduce(
+ (lastIdx, msg, idx) => msg.role === "user" ? idx : lastIdx,
+ -1
+ );
+ const conversation = nonSystemMessages.map((message, index) => ({
  role: message.role,
  content: [
  {
  type: "text",
- text: message.content
+ text: message.content,
+ // Add cache_control to the LAST user message
+ ...message.role === "user" && index === lastUserIndex ? { cache_control: { type: "ephemeral" } } : {}
  }
  ]
  }));
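
The net effect of this hunk, sketched as the request body the Anthropic Messages API receives (values illustrative; field names follow the diffed code and the public API):

    // Illustrative payload: cache_control marks the last system block and
    // the last user message as ephemeral cache breakpoints.
    {
      system: [
        { type: "text", text: "System prompt, part 1" },
        { type: "text", text: "Part 2", cache_control: { type: "ephemeral" } }
      ],
      messages: [
        { role: "user", content: [{ type: "text", text: "First turn" }] },
        { role: "assistant", content: [{ type: "text", text: "Reply" }] },
        { role: "user", content: [{ type: "text", text: "Latest turn",
          cache_control: { type: "ephemeral" } }] }
      ]
    }
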
@@ -3483,15 +3515,22 @@ var init_anthropic = __esm({
  async *wrapStream(iterable) {
  const stream2 = iterable;
  let inputTokens = 0;
+ let cachedInputTokens = 0;
+ let cacheCreationInputTokens = 0;
  for await (const event of stream2) {
  if (event.type === "message_start") {
- inputTokens = event.message.usage.input_tokens;
+ const usage = event.message.usage;
+ cachedInputTokens = usage.cache_read_input_tokens ?? 0;
+ cacheCreationInputTokens = usage.cache_creation_input_tokens ?? 0;
+ inputTokens = usage.input_tokens + cachedInputTokens + cacheCreationInputTokens;
  yield {
  text: "",
  usage: {
  inputTokens,
  outputTokens: 0,
- totalTokens: inputTokens
+ totalTokens: inputTokens,
+ cachedInputTokens,
+ cacheCreationInputTokens
  },
  rawEvent: event
  };
@@ -3505,7 +3544,9 @@ var init_anthropic = __esm({
  const usage = event.usage ? {
  inputTokens,
  outputTokens: event.usage.output_tokens,
- totalTokens: inputTokens + event.usage.output_tokens
+ totalTokens: inputTokens + event.usage.output_tokens,
+ cachedInputTokens,
+ cacheCreationInputTokens
  } : void 0;
  if (event.delta.stop_reason || usage) {
  yield {
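
These two hunks make inputTokens report the full prompt size: as the summation implies, Anthropic's input_tokens field excludes tokens read from the cache (cache_read_input_tokens) and tokens written to it (cache_creation_input_tokens), so the wrapper adds all three. A worked example with illustrative numbers:

    // message_start usage from the API (illustrative):
    //   input_tokens: 200, cache_read_input_tokens: 4000,
    //   cache_creation_input_tokens: 1000
    // wrapStream then yields:
    //   cachedInputTokens        = 4000
    //   cacheCreationInputTokens = 1000
    //   inputTokens              = 200 + 4000 + 1000 = 5200
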
@@ -3586,6 +3627,7 @@ var init_gemini_models = __esm({
  "src/providers/gemini-models.ts"() {
  "use strict";
  GEMINI_MODELS = [
+ // Gemini 3 Pro (Preview)
  {
  provider: "gemini",
  modelId: "gemini-3-pro-preview",
@@ -3594,8 +3636,11 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 2,
+ // $2.00 for prompts <= 200k, $4.00 for > 200k (using lower tier)
  output: 12,
+ // $12.00 for prompts <= 200k, $18.00 for > 200k
  cachedInput: 0.2
+ // $0.20 for prompts <= 200k
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3608,9 +3653,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 3",
  releaseDate: "2025-11-18",
- notes: "Most advanced model. 1501 Elo LMArena, 91.9% GPQA Diamond, 76.2% SWE-bench. Deep Think mode available."
+ notes: "Best model for multimodal understanding, agentic and vibe-coding. Deep Think mode available."
  }
  },
+ // Gemini 2.5 Pro
  {
  provider: "gemini",
  modelId: "gemini-2.5-pro",
@@ -3619,8 +3665,11 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 1.25,
+ // $1.25 for prompts <= 200k, $2.50 for > 200k
  output: 10,
+ // $10.00 for prompts <= 200k, $15.00 for > 200k
  cachedInput: 0.125
+ // $0.125 for prompts <= 200k
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3633,9 +3682,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 2.5",
  releaseDate: "2025-06",
- notes: "Balanced multimodal model with 1M context. Best for complex agents and reasoning."
+ notes: "State-of-the-art multipurpose model. Excels at coding and complex reasoning."
  }
  },
+ // Gemini 2.5 Flash
  {
  provider: "gemini",
  modelId: "gemini-2.5-flash",
@@ -3644,8 +3694,10 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 0.3,
+ // $0.30 for text/image/video, $1.00 for audio
  output: 2.5,
  cachedInput: 0.03
+ // $0.03 for text/image/video
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3658,9 +3710,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 2.5",
  releaseDate: "2025-06",
- notes: "Best price-performance ratio with thinking enabled by default"
+ notes: "First hybrid reasoning model with 1M context and thinking budgets."
  }
  },
+ // Gemini 2.5 Flash-Lite
  {
  provider: "gemini",
  modelId: "gemini-2.5-flash-lite",
@@ -3669,8 +3722,10 @@ var init_gemini_models = __esm({
  maxOutputTokens: 65536,
  pricing: {
  input: 0.1,
+ // $0.10 for text/image/video, $0.30 for audio
  output: 0.4,
  cachedInput: 0.01
+ // $0.01 for text/image/video
  },
  knowledgeCutoff: "2025-01",
  features: {
@@ -3682,9 +3737,10 @@ var init_gemini_models = __esm({
  metadata: {
  family: "Gemini 2.5",
  releaseDate: "2025-06",
- notes: "Fastest and most cost-efficient model for high-volume, low-latency tasks"
+ notes: "Smallest and most cost effective model, built for at scale usage."
  }
  },
+ // Gemini 2.0 Flash
  {
  provider: "gemini",
  modelId: "gemini-2.0-flash",
@@ -3693,8 +3749,10 @@ var init_gemini_models = __esm({
  maxOutputTokens: 8192,
  pricing: {
  input: 0.1,
+ // $0.10 for text/image/video, $0.70 for audio
  output: 0.4,
- cachedInput: 0.01
+ cachedInput: 0.025
+ // $0.025 for text/image/video
  },
  knowledgeCutoff: "2024-08",
  features: {
@@ -3705,9 +3763,10 @@ var init_gemini_models = __esm({
  },
  metadata: {
  family: "Gemini 2.0",
- notes: "Previous generation with 1M context and multimodal capabilities"
+ notes: "Balanced multimodal model with 1M context, built for the era of Agents."
  }
  },
+ // Gemini 2.0 Flash-Lite
  {
  provider: "gemini",
  modelId: "gemini-2.0-flash-lite",
@@ -3716,8 +3775,8 @@ var init_gemini_models = __esm({
  maxOutputTokens: 8192,
  pricing: {
  input: 0.075,
- output: 0.3,
- cachedInput: 75e-4
+ output: 0.3
+ // No context caching available for 2.0-flash-lite
  },
  knowledgeCutoff: "2024-08",
  features: {
@@ -3728,7 +3787,7 @@ var init_gemini_models = __esm({
  },
  metadata: {
  family: "Gemini 2.0",
- notes: "Lightweight previous generation model for cost-sensitive applications"
+ notes: "Smallest and most cost effective 2.0 model for at scale usage."
  }
  }
  ];
@@ -3898,7 +3957,9 @@ var init_gemini = __esm({
  return {
  inputTokens: usageMetadata.promptTokenCount ?? 0,
  outputTokens: usageMetadata.candidatesTokenCount ?? 0,
- totalTokens: usageMetadata.totalTokenCount ?? 0
+ totalTokens: usageMetadata.totalTokenCount ?? 0,
+ // Gemini returns cached token count in cachedContentTokenCount
+ cachedInputTokens: usageMetadata.cachedContentTokenCount ?? 0
  };
  }
  /**
@@ -3954,10 +4015,11 @@ var init_openai_models = __esm({
  "src/providers/openai-models.ts"() {
  "use strict";
  OPENAI_MODELS = [
+ // GPT-5 Family
  {
  provider: "openai",
  modelId: "gpt-5.1",
- displayName: "GPT-5.1 Instant",
+ displayName: "GPT-5.1",
  contextWindow: 128e3,
  maxOutputTokens: 32768,
  pricing: {
@@ -3977,34 +4039,7 @@ var init_openai_models = __esm({
  metadata: {
  family: "GPT-5",
  releaseDate: "2025-11-12",
- notes: "Warmer, more intelligent, better instruction following. 2-3x faster than GPT-5.",
- supportsTemperature: false
- }
- },
- {
- provider: "openai",
- modelId: "gpt-5.1-thinking",
- displayName: "GPT-5.1 Thinking",
- contextWindow: 196e3,
- maxOutputTokens: 32768,
- pricing: {
- input: 1.25,
- output: 10,
- cachedInput: 0.125
- },
- knowledgeCutoff: "2024-09-30",
- features: {
- streaming: true,
- functionCalling: true,
- vision: true,
- reasoning: true,
- structuredOutputs: true,
- fineTuning: true
- },
- metadata: {
- family: "GPT-5",
- releaseDate: "2025-11-12",
- notes: "Advanced reasoning with thinking levels: Light, Standard, Extended, Heavy. Best for complex tasks.",
+ notes: "Latest GPT-5 with improved instruction following. 2-3x faster than GPT-5.",
  supportsTemperature: false
  }
  },
@@ -4084,6 +4119,255 @@ var init_openai_models = __esm({
  notes: "Fastest, most cost-efficient version for well-defined tasks",
  supportsTemperature: false
  }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-5-pro",
+ displayName: "GPT-5 Pro",
+ contextWindow: 272e3,
+ maxOutputTokens: 128e3,
+ pricing: {
+ input: 15,
+ output: 120
+ // No cached input pricing for gpt-5-pro
+ },
+ knowledgeCutoff: "2024-09-30",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "GPT-5",
+ notes: "Premium tier with enhanced capabilities. Does not support prompt caching.",
+ supportsTemperature: false
+ }
+ },
+ // GPT-4.1 Family
+ {
+ provider: "openai",
+ modelId: "gpt-4.1",
+ displayName: "GPT-4.1",
+ contextWindow: 128e3,
+ maxOutputTokens: 32768,
+ pricing: {
+ input: 2,
+ output: 8,
+ cachedInput: 0.5
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4.1",
+ notes: "Improved GPT-4 with better instruction following"
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4.1-mini",
+ displayName: "GPT-4.1 Mini",
+ contextWindow: 128e3,
+ maxOutputTokens: 32768,
+ pricing: {
+ input: 0.4,
+ output: 1.6,
+ cachedInput: 0.1
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4.1",
+ notes: "Cost-efficient GPT-4.1 variant"
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4.1-nano",
+ displayName: "GPT-4.1 Nano",
+ contextWindow: 128e3,
+ maxOutputTokens: 32768,
+ pricing: {
+ input: 0.1,
+ output: 0.4,
+ cachedInput: 0.025
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4.1",
+ notes: "Fastest GPT-4.1 variant for simple tasks"
+ }
+ },
+ // GPT-4o Family
+ {
+ provider: "openai",
+ modelId: "gpt-4o",
+ displayName: "GPT-4o",
+ contextWindow: 128e3,
+ maxOutputTokens: 16384,
+ pricing: {
+ input: 2.5,
+ output: 10,
+ cachedInput: 1.25
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4o",
+ notes: "Multimodal model optimized for speed"
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "gpt-4o-mini",
+ displayName: "GPT-4o Mini",
+ contextWindow: 128e3,
+ maxOutputTokens: 16384,
+ pricing: {
+ input: 0.15,
+ output: 0.6,
+ cachedInput: 0.075
+ },
+ knowledgeCutoff: "2024-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "GPT-4o",
+ notes: "Fast and affordable multimodal model"
+ }
+ },
+ // o-series (Reasoning models)
+ {
+ provider: "openai",
+ modelId: "o1",
+ displayName: "o1",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 15,
+ output: 60,
+ cachedInput: 7.5
+ },
+ knowledgeCutoff: "2024-12-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Advanced reasoning model with chain-of-thought",
+ supportsTemperature: false
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "o3",
+ displayName: "o3",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 2,
+ output: 8,
+ cachedInput: 0.5
+ },
+ knowledgeCutoff: "2025-01-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Next-gen reasoning model, more efficient than o1",
+ supportsTemperature: false
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "o4-mini",
+ displayName: "o4 Mini",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 1.1,
+ output: 4.4,
+ cachedInput: 0.275
+ },
+ knowledgeCutoff: "2025-04-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true,
+ fineTuning: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Cost-efficient reasoning model",
+ supportsTemperature: false
+ }
+ },
+ {
+ provider: "openai",
+ modelId: "o3-mini",
+ displayName: "o3 Mini",
+ contextWindow: 2e5,
+ maxOutputTokens: 1e5,
+ pricing: {
+ input: 1.1,
+ output: 4.4,
+ cachedInput: 0.55
+ },
+ knowledgeCutoff: "2025-01-01",
+ features: {
+ streaming: true,
+ functionCalling: true,
+ vision: true,
+ reasoning: true,
+ structuredOutputs: true
+ },
+ metadata: {
+ family: "o-series",
+ notes: "Compact reasoning model for cost-sensitive applications",
+ supportsTemperature: false
+ }
  }
  ];
  }
@@ -4164,7 +4448,8 @@ var init_openai = __esm({
  const usage = chunk.usage ? {
  inputTokens: chunk.usage.prompt_tokens,
  outputTokens: chunk.usage.completion_tokens,
- totalTokens: chunk.usage.total_tokens
+ totalTokens: chunk.usage.total_tokens,
+ cachedInputTokens: chunk.usage.prompt_tokens_details?.cached_tokens ?? 0
  } : void 0;
  if (finishReason || usage) {
  yield { text: "", finishReason, usage, rawEvent: chunk };
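
OpenAI reports caching the other way around: prompt_tokens already includes any cached tokens, so cached_tokens is surfaced as a subset rather than added on top, matching the "subset of inputTokens" contract documented in estimateCost below. Illustrative mapping:

    // chunk.usage from OpenAI (illustrative numbers):
    //   { prompt_tokens: 5200, completion_tokens: 300, total_tokens: 5500,
    //     prompt_tokens_details: { cached_tokens: 4096 } }
    // mapped to:
    //   { inputTokens: 5200, outputTokens: 300, totalTokens: 5500,
    //     cachedInputTokens: 4096 }
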
@@ -4381,20 +4666,28 @@ var init_model_registry = __esm({
  /**
  * Estimate API cost for a given model and token usage
  * @param modelId - Full model identifier
- * @param inputTokens - Number of input tokens
+ * @param inputTokens - Number of input tokens (total, including cached and cache creation)
  * @param outputTokens - Number of output tokens
- * @param useCachedInput - Whether to use cached input pricing (if supported by provider)
+ * @param cachedInputTokens - Number of cached input tokens (subset of inputTokens)
+ * @param cacheCreationInputTokens - Number of cache creation tokens (subset of inputTokens, Anthropic only)
  * @returns CostEstimate if model found, undefined otherwise
  */
- estimateCost(modelId, inputTokens, outputTokens, useCachedInput = false) {
+ estimateCost(modelId, inputTokens, outputTokens, cachedInputTokens = 0, cacheCreationInputTokens = 0) {
  const spec = this.getModelSpec(modelId);
  if (!spec) return void 0;
- const inputRate = useCachedInput && spec.pricing.cachedInput !== void 0 ? spec.pricing.cachedInput : spec.pricing.input;
- const inputCost = inputTokens / 1e6 * inputRate;
+ const cachedRate = spec.pricing.cachedInput ?? spec.pricing.input;
+ const cacheWriteRate = spec.pricing.cacheWriteInput ?? spec.pricing.input;
+ const uncachedInputTokens = inputTokens - cachedInputTokens - cacheCreationInputTokens;
+ const uncachedInputCost = uncachedInputTokens / 1e6 * spec.pricing.input;
+ const cachedInputCost = cachedInputTokens / 1e6 * cachedRate;
+ const cacheCreationCost = cacheCreationInputTokens / 1e6 * cacheWriteRate;
+ const inputCost = uncachedInputCost + cachedInputCost + cacheCreationCost;
  const outputCost = outputTokens / 1e6 * spec.pricing.output;
  const totalCost = inputCost + outputCost;
  return {
  inputCost,
+ cachedInputCost,
+ cacheCreationCost,
  outputCost,
  totalCost,
  currency: "USD"
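
With the new signature, a request is priced bucket by bucket. A worked example using the Sonnet-class Anthropic rates above (input 3, cachedInput 0.3, cacheWriteInput 3.75, output 15 USD per million tokens; the model id is a placeholder, and the token counts reuse the streaming example):

    // estimateCost("<sonnet-class-model-id>", 5200, 300, 4000, 1000)
    // uncachedInputTokens = 5200 - 4000 - 1000 = 200
    // uncachedInputCost   = 200  / 1e6 * 3    = $0.00060
    // cachedInputCost     = 4000 / 1e6 * 0.3  = $0.00120
    // cacheCreationCost   = 1000 / 1e6 * 3.75 = $0.00375
    // inputCost           = $0.00555
    // outputCost          = 300 / 1e6 * 15    = $0.00450
    // totalCost           = $0.01005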