npm - smoltalk - Versions diffs - 0.3.1 → 0.4.0 - Mend

smoltalk 0.3.1 → 0.4.0

This diff compares the contents of two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.

Files changed (15)

package/dist/classes/message/AssistantMessage.d.ts +2 -0
package/dist/clients/anthropic.js +116 -4
package/dist/clients/google.js +5 -2
package/dist/clients/openai.js +5 -2
package/dist/clients/openaiResponses.js +5 -2
package/dist/model.d.ts +2 -0
package/dist/model.js +38 -5
package/dist/models.d.ts +123 -10
package/dist/models.js +65 -5
package/dist/types/costEstimate.d.ts +2 -0
package/dist/types/costEstimate.js +2 -0
package/dist/types/tokenUsage.d.ts +2 -0
package/dist/types/tokenUsage.js +2 -0
package/dist/types.d.ts +4 -0
package/package.json +1 -1

package/dist/classes/message/AssistantMessage.d.ts CHANGED

@@ -29,12 +29,14 @@ export declare const AssistantMessageJSONSchema: z.ZodObject<{
         inputTokens: z.ZodNumber;
         outputTokens: z.ZodNumber;
         cachedInputTokens: z.ZodOptional<z.ZodNumber>;
+        cacheCreationInputTokens: z.ZodOptional<z.ZodNumber>;
         totalTokens: z.ZodOptional<z.ZodNumber>;
     }, z.core.$strip>>;
     cost: z.ZodOptional<z.ZodObject<{
         inputCost: z.ZodNumber;
         outputCost: z.ZodNumber;
         cachedInputCost: z.ZodOptional<z.ZodNumber>;
+        cacheCreationInputCost: z.ZodOptional<z.ZodNumber>;
         totalCost: z.ZodNumber;
         currency: z.ZodString;
     }, z.core.$strip>>;

package/dist/clients/anthropic.js CHANGED

@@ -8,6 +8,82 @@ import { SmolContentPolicyError, SmolContextWindowExceededError, } from "../smol
 import { BaseClient } from "./baseClient.js";
 import { Model } from "../model.js";
 const DEFAULT_MAX_TOKENS = 4096;
+/**
+ * Attach ephemeral cache_control breakpoints to (up to) three places:
+ *   1. the last tool definition
+ *   2. the last system block (promoting system from string to array form)
+ *   3. the last block of the last user message
+ *
+ * Anthropic enforces minimum prefix sizes; smaller prefixes silently no-op.
+ */
+function applyCacheBreakpoints(req) {
+    const cc = { type: "ephemeral" };
+    // Tools: mark the last tool.
+    let tools = req.tools;
+    if (tools && tools.length > 0) {
+        const lastIdx = tools.length - 1;
+        const marked = [];
+        for (let i = 0; i < tools.length; i++) {
+            if (i === lastIdx) {
+                marked.push({ ...tools[i], cache_control: cc });
+            }
+            else {
+                marked.push(tools[i]);
+            }
+        }
+        tools = marked;
+    }
+    // System: promote string to array form so the last block can be marked.
+    let system = req.system;
+    if (typeof system === "string" && system.length > 0) {
+        system = [{ type: "text", text: system, cache_control: cc }];
+    }
+    else if (Array.isArray(system) && system.length > 0) {
+        const lastIdx = system.length - 1;
+        const marked = [];
+        for (let i = 0; i < system.length; i++) {
+            if (i === lastIdx) {
+                marked.push({ ...system[i], cache_control: cc });
+            }
+            else {
+                marked.push(system[i]);
+            }
+        }
+        system = marked;
+    }
+    // Messages: mark the last block of the last user message.
+    let messages = req.messages;
+    for (let i = messages.length - 1; i >= 0; i--) {
+        const m = messages[i];
+        if (m.role !== "user")
+            continue;
+        let blocks;
+        if (typeof m.content === "string") {
+            blocks = [{ type: "text", text: m.content }];
+        }
+        else {
+            blocks = [...m.content];
+        }
+        if (blocks.length === 0)
+            break;
+        blocks[blocks.length - 1] = {
+            ...blocks[blocks.length - 1],
+            cache_control: cc,
+        };
+        const rebuilt = [];
+        for (let j = 0; j < messages.length; j++) {
+            if (j === i) {
+                rebuilt.push({ ...m, content: blocks });
+            }
+            else {
+                rebuilt.push(messages[j]);
+            }
+        }
+        messages = rebuilt;
+        break;
+    }
+    return { system, messages, tools };
+}
 export class SmolAnthropic extends BaseClient {
     client;
     logger;
@@ -22,11 +98,22 @@ export class SmolAnthropic extends BaseClient {
         return this.model.getModel();
     }
     calculateUsageAndCost(usageData) {
+        const cacheRead = usageData.cache_read_input_tokens ?? 0;
+        const cacheCreation = usageData.cache_creation_input_tokens ?? 0;
         const usage = {
             inputTokens: usageData.input_tokens,
             outputTokens: usageData.output_tokens,
-            totalTokens: usageData.input_tokens + usageData.output_tokens,
+            totalTokens: usageData.input_tokens +
+                cacheRead +
+                cacheCreation +
+                usageData.output_tokens,
         };
+        if (cacheRead > 0) {
+            usage.cachedInputTokens = cacheRead;
+        }
+        if (cacheCreation > 0) {
+            usage.cacheCreationInputTokens = cacheCreation;
+        }
         const cost = this.model.calculateCost(usage) ?? undefined;
         return { usage, cost };
     }
@@ -81,7 +168,12 @@ export class SmolAnthropic extends BaseClient {
                     budget_tokens: reasoningBudgetMap[config.reasoningEffort],
                 }
                 : undefined;
-        return { system, messages: anthropicMessages, tools, thinking };
+        const cachingEnabled = config.caching?.enabled !== false;
+        const baseRequest = { system, messages: anthropicMessages, tools };
+        const finalRequest = cachingEnabled
+            ? applyCacheBreakpoints(baseRequest)
+            : baseRequest;
+        return { ...finalRequest, thinking };
     }
     rethrowAsSmolError(error) {
         if (error instanceof Anthropic.APIError) {
@@ -198,10 +290,15 @@ export class SmolAnthropic extends BaseClient {
         // Track thinking blocks by index: index -> { text, signature }
         const thinkingBlockMap = new Map();
         let inputTokens = 0;
+        let cacheReadTokens = 0;
+        let cacheCreationTokens = 0;
         let outputTokens = 0;
         for await (const event of stream) {
             if (event.type === "message_start") {
-                inputTokens = event.message.usage.input_tokens;
+                const u = event.message.usage;
+                inputTokens = u.input_tokens;
+                cacheReadTokens = u.cache_read_input_tokens ?? 0;
+                cacheCreationTokens = u.cache_creation_input_tokens ?? 0;
             }
             else if (event.type === "content_block_start") {
                 if (event.content_block.type === "tool_use") {
@@ -252,6 +349,15 @@ export class SmolAnthropic extends BaseClient {
             }
             else if (event.type === "message_delta") {
                 outputTokens = event.usage.output_tokens;
+                // Defensive: in practice Anthropic only sends cache fields on
+                // message_start, but read them here too so we don't miss an
+                // update if the SDK changes.
+                if (event.usage.cache_read_input_tokens != null) {
+                    cacheReadTokens = event.usage.cache_read_input_tokens;
+                }
+                if (event.usage.cache_creation_input_tokens != null) {
+                    cacheCreationTokens = event.usage.cache_creation_input_tokens;
+                }
             }
         }
         this.logger.debug("Streaming response completed from Anthropic");
@@ -269,8 +375,14 @@ export class SmolAnthropic extends BaseClient {
         const usage = {
             inputTokens,
             outputTokens,
-            totalTokens: inputTokens + outputTokens,
+            totalTokens: inputTokens + cacheReadTokens + cacheCreationTokens + outputTokens,
         };
+        if (cacheReadTokens > 0) {
+            usage.cachedInputTokens = cacheReadTokens;
+        }
+        if (cacheCreationTokens > 0) {
+            usage.cacheCreationInputTokens = cacheCreationTokens;
+        }
         const cost = this.model.calculateCost(usage) ?? undefined;
         yield {
             type: "done",

package/dist/clients/google.js CHANGED

@@ -31,12 +31,15 @@ export class SmolGoogle extends BaseClient {
         let usage;
         let cost;
         if (usageMetadata) {
+            const cached = usageMetadata.cachedContentTokenCount ?? 0;
             usage = {
-                inputTokens: usageMetadata.promptTokenCount || 0,
+                inputTokens: Math.max(0, (usageMetadata.promptTokenCount || 0) - cached),
                 outputTokens: usageMetadata.candidatesTokenCount || 0,
-                cachedInputTokens: usageMetadata.cachedContentTokenCount,
                 totalTokens: usageMetadata.totalTokenCount,
             };
+            if (cached > 0) {
+                usage.cachedInputTokens = cached;
+            }
             const calculatedCost = this.model.calculateCost(usage);
             if (calculatedCost) {
                 cost = calculatedCost;

package/dist/clients/openai.js CHANGED

@@ -30,12 +30,15 @@ export class SmolOpenAi extends BaseClient {
         let usage;
         let cost;
         if (usageData) {
+            const cached = usageData.prompt_tokens_details?.cached_tokens ?? 0;
             usage = {
-                inputTokens: usageData.prompt_tokens || 0,
+                inputTokens: Math.max(0, (usageData.prompt_tokens || 0) - cached),
                 outputTokens: usageData.completion_tokens || 0,
-                cachedInputTokens: usageData.prompt_tokens_details?.cached_tokens,
                 totalTokens: usageData.total_tokens,
             };
+            if (cached > 0) {
+                usage.cachedInputTokens = cached;
+            }
             const calculatedCost = this.model.calculateCost(usage);
             if (calculatedCost) {
                 cost = calculatedCost;

package/dist/clients/openaiResponses.js CHANGED

@@ -89,12 +89,15 @@ export class SmolOpenAiResponses extends BaseClient {
         let usage;
         let cost;
         if (usageData) {
+            const cached = usageData.input_tokens_details?.cached_tokens ?? 0;
             usage = {
-                inputTokens: usageData.input_tokens || 0,
+                inputTokens: Math.max(0, (usageData.input_tokens || 0) - cached),
                 outputTokens: usageData.output_tokens || 0,
-                cachedInputTokens: usageData.input_tokens_details?.cached_tokens,
                 totalTokens: usageData.total_tokens,
             };
+            if (cached > 0) {
+                usage.cachedInputTokens = cached;
+            }
             const calculatedCost = this.model.calculateCost(usage);
             if (calculatedCost) {
                 cost = calculatedCost;

package/dist/model.d.ts CHANGED

@@ -11,10 +11,12 @@ export declare class Model {
         inputTokens: number;
         outputTokens: number;
         cachedInputTokens?: number;
+        cacheCreationInputTokens?: number;
     }): {
         inputCost: number;
         outputCost: number;
         cachedInputCost?: number;
+        cacheCreationInputCost?: number;
         totalCost: number;
         currency: string;
     } | null;

package/dist/model.js CHANGED

@@ -26,16 +26,49 @@ export class Model {
         if (!model || !isTextModel(model)) {
             return null;
         }
+        const cachedTokens = usage.cachedInputTokens ?? 0;
+        const cacheCreationTokens = usage.cacheCreationInputTokens ?? 0;
+        // Disjoint buckets. If a discount price isn't defined for this model,
+        // the tokens were still billed by the provider — charge them at the
+        // full input rate so totalCost stays honest.
+        const cachedRate = model.cachedInputTokenCost ?? model.inputTokenCost ?? 0;
+        const cacheCreationRate = model.cacheCreationInputTokenCost ?? model.inputTokenCost ?? 0;
         const inputCost = round((usage.inputTokens * (model.inputTokenCost || 0)) / 1_000_000, 6);
         const outputCost = round((usage.outputTokens * (model.outputTokenCost || 0)) / 1_000_000, 6);
-        const cachedInputCost = usage.cachedInputTokens && model.cachedInputTokenCost
-            ? round((usage.cachedInputTokens * model.cachedInputTokenCost) / 1_000_000, 6)
-            : undefined;
-        const totalCost = round(inputCost + outputCost + (cachedInputCost || 0), 6);
+        // Only expose cachedInputCost / cacheCreationInputCost when the model
+        // actually has a distinct discount price. Otherwise, fold those dollars
+        // into inputCost so the user isn't misled by a $0 cached field.
+        let cachedInputCost;
+        let cacheCreationInputCost;
+        let foldedInputDollars = 0;
+        if (cachedTokens > 0) {
+            const dollars = (cachedTokens * cachedRate) / 1_000_000;
+            if (model.cachedInputTokenCost != null) {
+                cachedInputCost = round(dollars, 6);
+            }
+            else {
+                foldedInputDollars += dollars;
+            }
+        }
+        if (cacheCreationTokens > 0) {
+            const dollars = (cacheCreationTokens * cacheCreationRate) / 1_000_000;
+            if (model.cacheCreationInputTokenCost != null) {
+                cacheCreationInputCost = round(dollars, 6);
+            }
+            else {
+                foldedInputDollars += dollars;
+            }
+        }
+        const finalInputCost = round(inputCost + foldedInputDollars, 6);
+        const totalCost = round(finalInputCost +
+            outputCost +
+            (cachedInputCost || 0) +
+            (cacheCreationInputCost || 0), 6);
         return {
-            inputCost,
+            inputCost: finalInputCost,
             outputCost,
             cachedInputCost,
+            cacheCreationInputCost,
             totalCost,
             currency: "USD",
         };

package/dist/models.d.ts CHANGED

@@ -16,6 +16,7 @@ export type BaseModel = {
     description?: string;
     inputTokenCost?: number;
     cachedInputTokenCost?: number;
+    cacheCreationInputTokenCost?: number;
     outputTokenCost?: number;
     disabled?: boolean;
     costUnit?: "tokens" | "characters" | "minutes";
@@ -466,10 +467,27 @@ export declare const textModels: readonly [{
     readonly outputTokenCost: 12;
     readonly disabled: true;
     readonly provider: "google";
+}, {
+    readonly type: "text";
+    readonly modelName: "gemini-3.5-flash";
+    readonly description: "Latest Gemini 3.5 Flash model (GA May 2026). Outperforms Gemini 3.1 Pro on coding and agentic suites at 4x the speed. 1M context window, 64K output. Context caching: $0.15/1M read.";
+    readonly maxInputTokens: 1048576;
+    readonly maxOutputTokens: 65536;
+    readonly inputTokenCost: 1.5;
+    readonly cachedInputTokenCost: 0.15;
+    readonly outputTokenCost: 9;
+    readonly reasoning: {
+        readonly levels: readonly ["minimal", "low", "medium", "high"];
+        readonly defaultLevel: "high";
+        readonly canDisable: false;
+        readonly outputsThinking: true;
+        readonly outputsSignatures: true;
+    };
+    readonly provider: "google";
 }, {
     readonly type: "text";
     readonly modelName: "gemini-3-flash-preview";
-    readonly description: "Latest Gemini 3 flash model with 1M context window and 64K output. Outperforms 2.5 Pro while being 3x faster. Optimized for agentic workflows and coding. Includes context caching for 90% cost reductions.";
+    readonly description: "Gemini 3 Flash preview. Superseded by gemini-3.5-flash. 1M context window and 64K output. Optimized for agentic workflows and coding.";
     readonly maxInputTokens: 1048576;
     readonly maxOutputTokens: 65536;
     readonly inputTokenCost: 0.5;
@@ -485,8 +503,8 @@ export declare const textModels: readonly [{
     readonly provider: "google";
 }, {
     readonly type: "text";
-    readonly modelName: "gemini-3.1-flash-lite-preview";
-    readonly description: "Most cost-effective Gemini 3.1 model with thinking support and 1M context window. 2.5x faster TTFA and 45% faster output than 2.5 Flash. Released March 2026.";
+    readonly modelName: "gemini-3.1-flash-lite";
+    readonly description: "Most cost-effective Gemini 3.1 model (GA). Thinking support, 1M context window, 64K output. 2.5x faster TTFA and 45% faster output than 2.5 Flash.";
     readonly maxInputTokens: 1048576;
     readonly maxOutputTokens: 65536;
     readonly inputTokenCost: 0.25;
@@ -500,6 +518,16 @@ export declare const textModels: readonly [{
         readonly outputsSignatures: true;
     };
     readonly provider: "google";
+}, {
+    readonly type: "text";
+    readonly modelName: "gemini-3.1-flash-lite-preview";
+    readonly description: "DEPRECATED: Preview version, discontinued July 9, 2026. Use gemini-3.1-flash-lite instead.";
+    readonly maxInputTokens: 1048576;
+    readonly maxOutputTokens: 65536;
+    readonly inputTokenCost: 0.25;
+    readonly outputTokenCost: 1.5;
+    readonly disabled: true;
+    readonly provider: "google";
 }, {
     readonly type: "text";
     readonly modelName: "gemini-2.5-pro";
@@ -507,6 +535,7 @@ export declare const textModels: readonly [{
     readonly maxInputTokens: 2097152;
     readonly maxOutputTokens: 65536;
     readonly inputTokenCost: 1.25;
+    readonly cachedInputTokenCost: 0.31;
     readonly outputTokenCost: 10;
     readonly outputTokensPerSecond: 145;
     readonly reasoning: {
@@ -522,6 +551,7 @@ export declare const textModels: readonly [{
     readonly maxInputTokens: 1048576;
     readonly maxOutputTokens: 65536;
     readonly inputTokenCost: 0.3;
+    readonly cachedInputTokenCost: 0.075;
     readonly outputTokenCost: 2.5;
     readonly outputTokensPerSecond: 245;
     readonly reasoning: {
@@ -537,6 +567,7 @@ export declare const textModels: readonly [{
     readonly maxInputTokens: 1048576;
     readonly maxOutputTokens: 65536;
     readonly inputTokenCost: 0.1;
+    readonly cachedInputTokenCost: 0.025;
     readonly outputTokenCost: 0.4;
     readonly outputTokensPerSecond: 400;
     readonly reasoning: {
@@ -611,14 +642,30 @@ export declare const textModels: readonly [{
     readonly costUnit: "characters";
     readonly disabled: true;
     readonly provider: "google";
+}, {
+    readonly type: "text";
+    readonly modelName: "claude-opus-4-8";
+    readonly description: "The most capable Claude model for complex reasoning and agentic coding. Same per-token pricing as Opus 4.7 with improved tool-use efficiency (~290 tokens for tool-use system prompt vs 675 on 4.7). 1M context window, 128K max output.";
+    readonly maxInputTokens: 1000000;
+    readonly maxOutputTokens: 128000;
+    readonly inputTokenCost: 5;
+    readonly cachedInputTokenCost: 0.5;
+    readonly outputTokenCost: 25;
+    readonly reasoning: {
+        readonly canDisable: false;
+        readonly outputsThinking: true;
+        readonly outputsSignatures: true;
+    };
+    readonly provider: "anthropic";
 }, {
     readonly type: "text";
     readonly modelName: "claude-opus-4-7";
-    readonly description: "The most capable Claude model for complex reasoning and agentic coding. Features Adaptive Thinking that auto-tunes reasoning compute per request. 1M context window, 128K max output. Knowledge cutoff: January 2026.";
+    readonly description: "Claude Opus 4.7 for complex reasoning and agentic coding. Features Adaptive Thinking that auto-tunes reasoning compute per request. 1M context window, 128K max output. Knowledge cutoff: January 2026.";
     readonly maxInputTokens: 1000000;
     readonly maxOutputTokens: 128000;
     readonly inputTokenCost: 5;
     readonly cachedInputTokenCost: 0.5;
+    readonly cacheCreationInputTokenCost: 6.25;
     readonly outputTokenCost: 25;
     readonly outputTokensPerSecond: 72;
     readonly reasoning: {
@@ -635,6 +682,7 @@ export declare const textModels: readonly [{
     readonly maxOutputTokens: 128000;
     readonly inputTokenCost: 5;
     readonly cachedInputTokenCost: 0.5;
+    readonly cacheCreationInputTokenCost: 6.25;
     readonly outputTokenCost: 25;
     readonly outputTokensPerSecond: 53;
     readonly reasoning: {
@@ -651,6 +699,7 @@ export declare const textModels: readonly [{
     readonly maxOutputTokens: 64000;
     readonly inputTokenCost: 3;
     readonly cachedInputTokenCost: 0.3;
+    readonly cacheCreationInputTokenCost: 3.75;
     readonly outputTokenCost: 15;
     readonly outputTokensPerSecond: 52;
     readonly reasoning: {
@@ -667,6 +716,7 @@ export declare const textModels: readonly [{
     readonly maxOutputTokens: 64000;
     readonly inputTokenCost: 1;
     readonly cachedInputTokenCost: 0.1;
+    readonly cacheCreationInputTokenCost: 1.25;
     readonly outputTokenCost: 5;
     readonly outputTokensPerSecond: 97;
     readonly reasoning: {
@@ -734,7 +784,14 @@ export declare const imageModels: readonly [{
     readonly type: "image";
     readonly modelName: "gemini-3.1-flash-image-preview";
     readonly provider: "google";
-    readonly description: "Fast image generation with Gemini 3.1 Flash. Supports resolutions from 512px to 4096px. ~$0.067/image at 1K resolution.";
+    readonly description: "DEPRECATED: Preview version. Use gemini-3.1-flash-image instead.";
+    readonly costPerImage: 0.067;
+    readonly disabled: true;
+}, {
+    readonly type: "image";
+    readonly modelName: "gemini-3.1-flash-image";
+    readonly provider: "google";
+    readonly description: "Fast image generation with Gemini 3.1 Flash (GA). Supports resolutions from 512px to 4096px. ~$0.045/image at 512px, $0.067 at 1K, $0.101 at 2K, $0.151 at 4K.";
     readonly costPerImage: 0.067;
 }];
 export declare const embeddingsModels: EmbeddingsModel[];
@@ -1156,10 +1213,27 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
     readonly outputTokenCost: 12;
     readonly disabled: true;
     readonly provider: "google";
+} | {
+    readonly type: "text";
+    readonly modelName: "gemini-3.5-flash";
+    readonly description: "Latest Gemini 3.5 Flash model (GA May 2026). Outperforms Gemini 3.1 Pro on coding and agentic suites at 4x the speed. 1M context window, 64K output. Context caching: $0.15/1M read.";
+    readonly maxInputTokens: 1048576;
+    readonly maxOutputTokens: 65536;
+    readonly inputTokenCost: 1.5;
+    readonly cachedInputTokenCost: 0.15;
+    readonly outputTokenCost: 9;
+    readonly reasoning: {
+        readonly levels: readonly ["minimal", "low", "medium", "high"];
+        readonly defaultLevel: "high";
+        readonly canDisable: false;
+        readonly outputsThinking: true;
+        readonly outputsSignatures: true;
+    };
+    readonly provider: "google";
 } | {
     readonly type: "text";
     readonly modelName: "gemini-3-flash-preview";
-    readonly description: "Latest Gemini 3 flash model with 1M context window and 64K output. Outperforms 2.5 Pro while being 3x faster. Optimized for agentic workflows and coding. Includes context caching for 90% cost reductions.";
+    readonly description: "Gemini 3 Flash preview. Superseded by gemini-3.5-flash. 1M context window and 64K output. Optimized for agentic workflows and coding.";
     readonly maxInputTokens: 1048576;
     readonly maxOutputTokens: 65536;
     readonly inputTokenCost: 0.5;
@@ -1175,8 +1249,8 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
     readonly provider: "google";
 } | {
     readonly type: "text";
-    readonly modelName: "gemini-3.1-flash-lite-preview";
-    readonly description: "Most cost-effective Gemini 3.1 model with thinking support and 1M context window. 2.5x faster TTFA and 45% faster output than 2.5 Flash. Released March 2026.";
+    readonly modelName: "gemini-3.1-flash-lite";
+    readonly description: "Most cost-effective Gemini 3.1 model (GA). Thinking support, 1M context window, 64K output. 2.5x faster TTFA and 45% faster output than 2.5 Flash.";
     readonly maxInputTokens: 1048576;
     readonly maxOutputTokens: 65536;
     readonly inputTokenCost: 0.25;
@@ -1190,6 +1264,16 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
         readonly outputsSignatures: true;
     };
     readonly provider: "google";
+} | {
+    readonly type: "text";
+    readonly modelName: "gemini-3.1-flash-lite-preview";
+    readonly description: "DEPRECATED: Preview version, discontinued July 9, 2026. Use gemini-3.1-flash-lite instead.";
+    readonly maxInputTokens: 1048576;
+    readonly maxOutputTokens: 65536;
+    readonly inputTokenCost: 0.25;
+    readonly outputTokenCost: 1.5;
+    readonly disabled: true;
+    readonly provider: "google";
 } | {
     readonly type: "text";
     readonly modelName: "gemini-2.5-pro";
@@ -1197,6 +1281,7 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
     readonly maxInputTokens: 2097152;
     readonly maxOutputTokens: 65536;
     readonly inputTokenCost: 1.25;
+    readonly cachedInputTokenCost: 0.31;
     readonly outputTokenCost: 10;
     readonly outputTokensPerSecond: 145;
     readonly reasoning: {
@@ -1212,6 +1297,7 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
     readonly maxInputTokens: 1048576;
     readonly maxOutputTokens: 65536;
     readonly inputTokenCost: 0.3;
+    readonly cachedInputTokenCost: 0.075;
     readonly outputTokenCost: 2.5;
     readonly outputTokensPerSecond: 245;
     readonly reasoning: {
@@ -1227,6 +1313,7 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
     readonly maxInputTokens: 1048576;
     readonly maxOutputTokens: 65536;
     readonly inputTokenCost: 0.1;
+    readonly cachedInputTokenCost: 0.025;
     readonly outputTokenCost: 0.4;
     readonly outputTokensPerSecond: 400;
     readonly reasoning: {
@@ -1301,14 +1388,30 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
     readonly costUnit: "characters";
     readonly disabled: true;
     readonly provider: "google";
+} | {
+    readonly type: "text";
+    readonly modelName: "claude-opus-4-8";
+    readonly description: "The most capable Claude model for complex reasoning and agentic coding. Same per-token pricing as Opus 4.7 with improved tool-use efficiency (~290 tokens for tool-use system prompt vs 675 on 4.7). 1M context window, 128K max output.";
+    readonly maxInputTokens: 1000000;
+    readonly maxOutputTokens: 128000;
+    readonly inputTokenCost: 5;
+    readonly cachedInputTokenCost: 0.5;
+    readonly outputTokenCost: 25;
+    readonly reasoning: {
+        readonly canDisable: false;
+        readonly outputsThinking: true;
+        readonly outputsSignatures: true;
+    };
+    readonly provider: "anthropic";
 } | {
     readonly type: "text";
     readonly modelName: "claude-opus-4-7";
-    readonly description: "The most capable Claude model for complex reasoning and agentic coding. Features Adaptive Thinking that auto-tunes reasoning compute per request. 1M context window, 128K max output. Knowledge cutoff: January 2026.";
+    readonly description: "Claude Opus 4.7 for complex reasoning and agentic coding. Features Adaptive Thinking that auto-tunes reasoning compute per request. 1M context window, 128K max output. Knowledge cutoff: January 2026.";
     readonly maxInputTokens: 1000000;
     readonly maxOutputTokens: 128000;
     readonly inputTokenCost: 5;
     readonly cachedInputTokenCost: 0.5;
+    readonly cacheCreationInputTokenCost: 6.25;
     readonly outputTokenCost: 25;
     readonly outputTokensPerSecond: 72;
     readonly reasoning: {
@@ -1325,6 +1428,7 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
     readonly maxOutputTokens: 128000;
     readonly inputTokenCost: 5;
     readonly cachedInputTokenCost: 0.5;
+    readonly cacheCreationInputTokenCost: 6.25;
     readonly outputTokenCost: 25;
     readonly outputTokensPerSecond: 53;
     readonly reasoning: {
@@ -1341,6 +1445,7 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
     readonly maxOutputTokens: 64000;
     readonly inputTokenCost: 3;
     readonly cachedInputTokenCost: 0.3;
+    readonly cacheCreationInputTokenCost: 3.75;
     readonly outputTokenCost: 15;
     readonly outputTokensPerSecond: 52;
     readonly reasoning: {
@@ -1357,6 +1462,7 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
     readonly maxOutputTokens: 64000;
     readonly inputTokenCost: 1;
     readonly cachedInputTokenCost: 0.1;
+    readonly cacheCreationInputTokenCost: 1.25;
     readonly outputTokenCost: 5;
     readonly outputTokensPerSecond: 97;
     readonly reasoning: {
@@ -1423,7 +1529,14 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
     readonly type: "image";
     readonly modelName: "gemini-3.1-flash-image-preview";
     readonly provider: "google";
-    readonly description: "Fast image generation with Gemini 3.1 Flash. Supports resolutions from 512px to 4096px. ~$0.067/image at 1K resolution.";
+    readonly description: "DEPRECATED: Preview version. Use gemini-3.1-flash-image instead.";
+    readonly costPerImage: 0.067;
+    readonly disabled: true;
+} | {
+    readonly type: "image";
+    readonly modelName: "gemini-3.1-flash-image";
+    readonly provider: "google";
+    readonly description: "Fast image generation with Gemini 3.1 Flash (GA). Supports resolutions from 512px to 4096px. ~$0.045/image at 512px, $0.067 at 1K, $0.101 at 2K, $0.151 at 4K.";
     readonly costPerImage: 0.067;
 } | undefined;
 export declare function isImageModel(model: ModelType): model is ImageModel;

package/dist/models.js CHANGED

@@ -459,10 +459,28 @@ export const textModels = [
         disabled: true,
         provider: "google",
     },
+    {
+        type: "text",
+        modelName: "gemini-3.5-flash",
+        description: "Latest Gemini 3.5 Flash model (GA May 2026). Outperforms Gemini 3.1 Pro on coding and agentic suites at 4x the speed. 1M context window, 64K output. Context caching: $0.15/1M read.",
+        maxInputTokens: 1_048_576,
+        maxOutputTokens: 65536,
+        inputTokenCost: 1.5,
+        cachedInputTokenCost: 0.15,
+        outputTokenCost: 9.0,
+        reasoning: {
+            levels: ["minimal", "low", "medium", "high"],
+            defaultLevel: "high",
+            canDisable: false,
+            outputsThinking: true,
+            outputsSignatures: true,
+        },
+        provider: "google",
+    },
     {
         type: "text",
         modelName: "gemini-3-flash-preview",
-        description: "Latest Gemini 3 flash model with 1M context window and 64K output. Outperforms 2.5 Pro while being 3x faster. Optimized for agentic workflows and coding. Includes context caching for 90% cost reductions.",
+        description: "Gemini 3 Flash preview. Superseded by gemini-3.5-flash. 1M context window and 64K output. Optimized for agentic workflows and coding.",
         maxInputTokens: 1_048_576,
         maxOutputTokens: 65536,
         inputTokenCost: 0.5,
@@ -479,8 +497,8 @@ export const textModels = [
     },
     {
         type: "text",
-        modelName: "gemini-3.1-flash-lite-preview",
-        description: "Most cost-effective Gemini 3.1 model with thinking support and 1M context window. 2.5x faster TTFA and 45% faster output than 2.5 Flash. Released March 2026.",
+        modelName: "gemini-3.1-flash-lite",
+        description: "Most cost-effective Gemini 3.1 model (GA). Thinking support, 1M context window, 64K output. 2.5x faster TTFA and 45% faster output than 2.5 Flash.",
         maxInputTokens: 1_048_576,
         maxOutputTokens: 65536,
         inputTokenCost: 0.25,
@@ -495,6 +513,17 @@ export const textModels = [
         },
         provider: "google",
     },
+    {
+        type: "text",
+        modelName: "gemini-3.1-flash-lite-preview",
+        description: "DEPRECATED: Preview version, discontinued July 9, 2026. Use gemini-3.1-flash-lite instead.",
+        maxInputTokens: 1_048_576,
+        maxOutputTokens: 65536,
+        inputTokenCost: 0.25,
+        outputTokenCost: 1.5,
+        disabled: true,
+        provider: "google",
+    },
     {
         type: "text",
         modelName: "gemini-2.5-pro",
@@ -502,6 +531,7 @@ export const textModels = [
         maxInputTokens: 2_097_152,
         maxOutputTokens: 65536,
         inputTokenCost: 1.25,
+        cachedInputTokenCost: 0.31,
         outputTokenCost: 10.0,
         outputTokensPerSecond: 145,
         reasoning: {
@@ -518,6 +548,7 @@ export const textModels = [
         maxInputTokens: 1_048_576,
         maxOutputTokens: 65536,
         inputTokenCost: 0.3,
+        cachedInputTokenCost: 0.075,
         outputTokenCost: 2.5,
         outputTokensPerSecond: 245,
         reasoning: {
@@ -534,6 +565,7 @@ export const textModels = [
         maxInputTokens: 1_048_576,
         maxOutputTokens: 65536,
         inputTokenCost: 0.1,
+        cachedInputTokenCost: 0.025,
         outputTokenCost: 0.4,
         outputTokensPerSecond: 400,
         reasoning: {
@@ -615,14 +647,31 @@ export const textModels = [
         disabled: true,
         provider: "google",
     },
+    {
+        type: "text",
+        modelName: "claude-opus-4-8",
+        description: "The most capable Claude model for complex reasoning and agentic coding. Same per-token pricing as Opus 4.7 with improved tool-use efficiency (~290 tokens for tool-use system prompt vs 675 on 4.7). 1M context window, 128K max output.",
+        maxInputTokens: 1_000_000,
+        maxOutputTokens: 128_000,
+        inputTokenCost: 5,
+        cachedInputTokenCost: 0.5,
+        outputTokenCost: 25,
+        reasoning: {
+            canDisable: false,
+            outputsThinking: true,
+            outputsSignatures: true,
+        },
+        provider: "anthropic",
+    },
     {
         type: "text",
         modelName: "claude-opus-4-7",
-        description: "The most capable Claude model for complex reasoning and agentic coding. Features Adaptive Thinking that auto-tunes reasoning compute per request. 1M context window, 128K max output. Knowledge cutoff: January 2026.",
+        description: "Claude Opus 4.7 for complex reasoning and agentic coding. Features Adaptive Thinking that auto-tunes reasoning compute per request. 1M context window, 128K max output. Knowledge cutoff: January 2026.",
         maxInputTokens: 1_000_000,
         maxOutputTokens: 128_000,
         inputTokenCost: 5,
         cachedInputTokenCost: 0.5,
+        cacheCreationInputTokenCost: 6.25,
         outputTokenCost: 25,
         outputTokensPerSecond: 72,
         reasoning: {
@@ -640,6 +689,7 @@ export const textModels = [
         maxOutputTokens: 128_000,
         inputTokenCost: 5,
         cachedInputTokenCost: 0.5,
+        cacheCreationInputTokenCost: 6.25,
         outputTokenCost: 25,
         outputTokensPerSecond: 53,
         reasoning: {
@@ -657,6 +707,7 @@ export const textModels = [
         maxOutputTokens: 64_000,
         inputTokenCost: 3,
         cachedInputTokenCost: 0.3,
+        cacheCreationInputTokenCost: 3.75,
         outputTokenCost: 15,
         outputTokensPerSecond: 52,
         reasoning: {
@@ -674,6 +725,7 @@ export const textModels = [
         maxOutputTokens: 64_000,
         inputTokenCost: 1,
         cachedInputTokenCost: 0.1,
+        cacheCreationInputTokenCost: 1.25,
         outputTokenCost: 5,
         outputTokensPerSecond: 97,
         reasoning: {
@@ -751,7 +803,15 @@ export const imageModels = [
         type: "image",
         modelName: "gemini-3.1-flash-image-preview",
         provider: "google",
-        description: "Fast image generation with Gemini 3.1 Flash. Supports resolutions from 512px to 4096px. ~$0.067/image at 1K resolution.",
+        description: "DEPRECATED: Preview version. Use gemini-3.1-flash-image instead.",
+        costPerImage: 0.067,
+        disabled: true,
+    },
+    {
+        type: "image",
+        modelName: "gemini-3.1-flash-image",
+        provider: "google",
+        description: "Fast image generation with Gemini 3.1 Flash (GA). Supports resolutions from 512px to 4096px. ~$0.045/image at 512px, $0.067 at 1K, $0.101 at 2K, $0.151 at 4K.",
         costPerImage: 0.067,
     },
 ];

package/dist/types/costEstimate.d.ts CHANGED Viewed

@@ -3,6 +3,7 @@ export type CostEstimate = {
     inputCost: number;
     outputCost: number;
     cachedInputCost?: number;
+    cacheCreationInputCost?: number;
     totalCost: number;
     currency: string;
 };
@@ -10,6 +11,7 @@ export declare const CostEstimateSchema: z.ZodObject<{
     inputCost: z.ZodNumber;
     outputCost: z.ZodNumber;
     cachedInputCost: z.ZodOptional<z.ZodNumber>;
+    cacheCreationInputCost: z.ZodOptional<z.ZodNumber>;
     totalCost: z.ZodNumber;
     currency: z.ZodString;
 }, z.core.$strip>;

package/dist/types/costEstimate.js CHANGED Viewed

@@ -3,6 +3,7 @@ export const CostEstimateSchema = z.object({
     inputCost: z.number(),
     outputCost: z.number(),
     cachedInputCost: z.number().optional(),
+    cacheCreationInputCost: z.number().optional(),
     totalCost: z.number(),
     currency: z.string(),
 });
@@ -24,6 +25,7 @@ export function addCosts(_a, _b) {
         inputCost: a.inputCost + b.inputCost,
         outputCost: a.outputCost + b.outputCost,
         cachedInputCost: (a.cachedInputCost || 0) + (b.cachedInputCost || 0),
+        cacheCreationInputCost: (a.cacheCreationInputCost || 0) + (b.cacheCreationInputCost || 0),
         totalCost: a.totalCost + b.totalCost,
         currency: a.currency,
     };

package/dist/types/tokenUsage.d.ts CHANGED Viewed

@@ -3,12 +3,14 @@ export type TokenUsage = {
     inputTokens: number;
     outputTokens: number;
     cachedInputTokens?: number;
+    cacheCreationInputTokens?: number;
     totalTokens?: number;
 };
 export declare const TokenUsageSchema: z.ZodObject<{
     inputTokens: z.ZodNumber;
     outputTokens: z.ZodNumber;
     cachedInputTokens: z.ZodOptional<z.ZodNumber>;
+    cacheCreationInputTokens: z.ZodOptional<z.ZodNumber>;
     totalTokens: z.ZodOptional<z.ZodNumber>;
 }, z.core.$strip>;
 export declare function addTokenUsage(_a?: TokenUsage, _b?: TokenUsage): TokenUsage;

package/dist/types/tokenUsage.js CHANGED Viewed

@@ -3,6 +3,7 @@ export const TokenUsageSchema = z.object({
     inputTokens: z.number(),
     outputTokens: z.number(),
     cachedInputTokens: z.number().optional(),
+    cacheCreationInputTokens: z.number().optional(),
     totalTokens: z.number().optional(),
 });
 export function addTokenUsage(_a, _b) {
@@ -20,6 +21,7 @@ export function addTokenUsage(_a, _b) {
         inputTokens: a.inputTokens + b.inputTokens,
         outputTokens: a.outputTokens + b.outputTokens,
         cachedInputTokens: (a.cachedInputTokens || 0) + (b.cachedInputTokens || 0),
+        cacheCreationInputTokens: (a.cacheCreationInputTokens || 0) + (b.cacheCreationInputTokens || 0),
         totalTokens: (a.totalTokens || 0) + (b.totalTokens || 0),
     };
 }

package/dist/types.d.ts CHANGED Viewed

@@ -69,6 +69,10 @@ export type SmolConfig = {
         enabled: boolean;
         budgetTokens?: number;
     };
+    /** Prompt caching. Currently used by Anthropic; OpenAI/Google cache automatically. Defaults to enabled. */
+    caching?: {
+        enabled?: boolean;
+    };
     /** Provider-agnostic reasoning effort level. */
     reasoningEffort?: "low" | "medium" | "high";
     responseFormatOptions?: Partial<{

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "smoltalk",
-  "version": "0.3.1",
+  "version": "0.4.0",
   "description": "A common interface for LLM APIs",
   "homepage": "https://github.com/egonSchiele/smoltalk",
   "files": [