smoltalk 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -29,12 +29,14 @@ export declare const AssistantMessageJSONSchema: z.ZodObject<{
29
29
  inputTokens: z.ZodNumber;
30
30
  outputTokens: z.ZodNumber;
31
31
  cachedInputTokens: z.ZodOptional<z.ZodNumber>;
32
+ cacheCreationInputTokens: z.ZodOptional<z.ZodNumber>;
32
33
  totalTokens: z.ZodOptional<z.ZodNumber>;
33
34
  }, z.core.$strip>>;
34
35
  cost: z.ZodOptional<z.ZodObject<{
35
36
  inputCost: z.ZodNumber;
36
37
  outputCost: z.ZodNumber;
37
38
  cachedInputCost: z.ZodOptional<z.ZodNumber>;
39
+ cacheCreationInputCost: z.ZodOptional<z.ZodNumber>;
38
40
  totalCost: z.ZodNumber;
39
41
  currency: z.ZodString;
40
42
  }, z.core.$strip>>;
@@ -8,6 +8,82 @@ import { SmolContentPolicyError, SmolContextWindowExceededError, } from "../smol
8
8
  import { BaseClient } from "./baseClient.js";
9
9
  import { Model } from "../model.js";
10
10
  const DEFAULT_MAX_TOKENS = 4096;
11
/**
 * Build a copy of an Anthropic request with ephemeral `cache_control`
 * breakpoints attached to (up to) three places:
 *   1. the last tool definition
 *   2. the last system block (a non-empty string system prompt is promoted
 *      to a single-element array of text blocks)
 *   3. the last block of the last user message (string content is promoted
 *      to a single text block)
 *
 * The request passed in is never mutated: only the arrays and the single
 * element that receive a marker are shallow-copied, every other element is
 * reused by reference. Parts that cannot take a breakpoint (absent or empty
 * tools/system/messages, no user message, empty trailing text) are returned
 * exactly as they were given.
 *
 * Anthropic enforces minimum prefix sizes; smaller prefixes silently no-op.
 *
 * @param req Request parts: `system` (string or array of blocks, optional),
 *   `messages` (array of `{ role, content }`), `tools` (array, optional).
 * @returns A new `{ system, messages, tools }` object with breakpoints applied.
 */
function applyCacheBreakpoints(req) {
    const cc = { type: "ephemeral" };
    // Shallow-copy `items`, swapping only the final element for a marked copy.
    const markLast = (items) => items.map((item, idx) => idx === items.length - 1 ? { ...item, cache_control: cc } : item);
    // Tools: mark the last tool.
    const tools = Array.isArray(req.tools) && req.tools.length > 0
        ? markLast(req.tools)
        : req.tools;
    // System: promote string to array form so the last block can be marked.
    let system = req.system;
    if (typeof system === "string") {
        if (system.length > 0) {
            system = [{ type: "text", text: system, cache_control: cc }];
        }
    }
    else if (Array.isArray(system) && system.length > 0) {
        system = markLast(system);
    }
    // Messages: mark the last block of the last user message.
    let messages = req.messages;
    if (Array.isArray(messages)) {
        const lastUserIdx = messages.map((m) => m.role).lastIndexOf("user");
        const content = lastUserIdx === -1 ? undefined : messages[lastUserIdx].content;
        let blocks = [];
        if (typeof content === "string") {
            // Same guard as the system prompt: an empty string is not promoted.
            if (content.length > 0) {
                blocks = [{ type: "text", text: content }];
            }
        }
        else if (Array.isArray(content)) {
            blocks = content;
        }
        const tail = blocks[blocks.length - 1];
        // Empty text blocks cannot be cached, so leave the message untouched
        // rather than attaching a breakpoint to one.
        const cacheable = tail != null && !(tail.type === "text" && tail.text === "");
        if (cacheable) {
            const marked = { ...messages[lastUserIdx], content: markLast(blocks) };
            messages = messages.map((m, idx) => (idx === lastUserIdx ? marked : m));
        }
    }
    return { system, messages, tools };
}
11
87
  export class SmolAnthropic extends BaseClient {
12
88
  client;
13
89
  logger;
@@ -22,11 +98,22 @@ export class SmolAnthropic extends BaseClient {
22
98
  return this.model.getModel();
23
99
  }
24
100
  calculateUsageAndCost(usageData) {
101
+ const cacheRead = usageData.cache_read_input_tokens ?? 0;
102
+ const cacheCreation = usageData.cache_creation_input_tokens ?? 0;
25
103
  const usage = {
26
104
  inputTokens: usageData.input_tokens,
27
105
  outputTokens: usageData.output_tokens,
28
- totalTokens: usageData.input_tokens + usageData.output_tokens,
106
+ totalTokens: usageData.input_tokens +
107
+ cacheRead +
108
+ cacheCreation +
109
+ usageData.output_tokens,
29
110
  };
111
+ if (cacheRead > 0) {
112
+ usage.cachedInputTokens = cacheRead;
113
+ }
114
+ if (cacheCreation > 0) {
115
+ usage.cacheCreationInputTokens = cacheCreation;
116
+ }
30
117
  const cost = this.model.calculateCost(usage) ?? undefined;
31
118
  return { usage, cost };
32
119
  }
@@ -81,7 +168,12 @@ export class SmolAnthropic extends BaseClient {
81
168
  budget_tokens: reasoningBudgetMap[config.reasoningEffort],
82
169
  }
83
170
  : undefined;
84
- return { system, messages: anthropicMessages, tools, thinking };
171
+ const cachingEnabled = config.caching?.enabled !== false;
172
+ const baseRequest = { system, messages: anthropicMessages, tools };
173
+ const finalRequest = cachingEnabled
174
+ ? applyCacheBreakpoints(baseRequest)
175
+ : baseRequest;
176
+ return { ...finalRequest, thinking };
85
177
  }
86
178
  rethrowAsSmolError(error) {
87
179
  if (error instanceof Anthropic.APIError) {
@@ -198,10 +290,15 @@ export class SmolAnthropic extends BaseClient {
198
290
  // Track thinking blocks by index: index -> { text, signature }
199
291
  const thinkingBlockMap = new Map();
200
292
  let inputTokens = 0;
293
+ let cacheReadTokens = 0;
294
+ let cacheCreationTokens = 0;
201
295
  let outputTokens = 0;
202
296
  for await (const event of stream) {
203
297
  if (event.type === "message_start") {
204
- inputTokens = event.message.usage.input_tokens;
298
+ const u = event.message.usage;
299
+ inputTokens = u.input_tokens;
300
+ cacheReadTokens = u.cache_read_input_tokens ?? 0;
301
+ cacheCreationTokens = u.cache_creation_input_tokens ?? 0;
205
302
  }
206
303
  else if (event.type === "content_block_start") {
207
304
  if (event.content_block.type === "tool_use") {
@@ -252,6 +349,15 @@ export class SmolAnthropic extends BaseClient {
252
349
  }
253
350
  else if (event.type === "message_delta") {
254
351
  outputTokens = event.usage.output_tokens;
352
+ // Defensive: in practice Anthropic only sends cache fields on
353
+ // message_start, but read them here too so we don't miss an
354
+ // update if the SDK changes.
355
+ if (event.usage.cache_read_input_tokens != null) {
356
+ cacheReadTokens = event.usage.cache_read_input_tokens;
357
+ }
358
+ if (event.usage.cache_creation_input_tokens != null) {
359
+ cacheCreationTokens = event.usage.cache_creation_input_tokens;
360
+ }
255
361
  }
256
362
  }
257
363
  this.logger.debug("Streaming response completed from Anthropic");
@@ -269,8 +375,14 @@ export class SmolAnthropic extends BaseClient {
269
375
  const usage = {
270
376
  inputTokens,
271
377
  outputTokens,
272
- totalTokens: inputTokens + outputTokens,
378
+ totalTokens: inputTokens + cacheReadTokens + cacheCreationTokens + outputTokens,
273
379
  };
380
+ if (cacheReadTokens > 0) {
381
+ usage.cachedInputTokens = cacheReadTokens;
382
+ }
383
+ if (cacheCreationTokens > 0) {
384
+ usage.cacheCreationInputTokens = cacheCreationTokens;
385
+ }
274
386
  const cost = this.model.calculateCost(usage) ?? undefined;
275
387
  yield {
276
388
  type: "done",
@@ -31,12 +31,15 @@ export class SmolGoogle extends BaseClient {
31
31
  let usage;
32
32
  let cost;
33
33
  if (usageMetadata) {
34
+ const cached = usageMetadata.cachedContentTokenCount ?? 0;
34
35
  usage = {
35
- inputTokens: usageMetadata.promptTokenCount || 0,
36
+ inputTokens: Math.max(0, (usageMetadata.promptTokenCount || 0) - cached),
36
37
  outputTokens: usageMetadata.candidatesTokenCount || 0,
37
- cachedInputTokens: usageMetadata.cachedContentTokenCount,
38
38
  totalTokens: usageMetadata.totalTokenCount,
39
39
  };
40
+ if (cached > 0) {
41
+ usage.cachedInputTokens = cached;
42
+ }
40
43
  const calculatedCost = this.model.calculateCost(usage);
41
44
  if (calculatedCost) {
42
45
  cost = calculatedCost;
@@ -30,12 +30,15 @@ export class SmolOpenAi extends BaseClient {
30
30
  let usage;
31
31
  let cost;
32
32
  if (usageData) {
33
+ const cached = usageData.prompt_tokens_details?.cached_tokens ?? 0;
33
34
  usage = {
34
- inputTokens: usageData.prompt_tokens || 0,
35
+ inputTokens: Math.max(0, (usageData.prompt_tokens || 0) - cached),
35
36
  outputTokens: usageData.completion_tokens || 0,
36
- cachedInputTokens: usageData.prompt_tokens_details?.cached_tokens,
37
37
  totalTokens: usageData.total_tokens,
38
38
  };
39
+ if (cached > 0) {
40
+ usage.cachedInputTokens = cached;
41
+ }
39
42
  const calculatedCost = this.model.calculateCost(usage);
40
43
  if (calculatedCost) {
41
44
  cost = calculatedCost;
@@ -89,12 +89,15 @@ export class SmolOpenAiResponses extends BaseClient {
89
89
  let usage;
90
90
  let cost;
91
91
  if (usageData) {
92
+ const cached = usageData.input_tokens_details?.cached_tokens ?? 0;
92
93
  usage = {
93
- inputTokens: usageData.input_tokens || 0,
94
+ inputTokens: Math.max(0, (usageData.input_tokens || 0) - cached),
94
95
  outputTokens: usageData.output_tokens || 0,
95
- cachedInputTokens: usageData.input_tokens_details?.cached_tokens,
96
96
  totalTokens: usageData.total_tokens,
97
97
  };
98
+ if (cached > 0) {
99
+ usage.cachedInputTokens = cached;
100
+ }
98
101
  const calculatedCost = this.model.calculateCost(usage);
99
102
  if (calculatedCost) {
100
103
  cost = calculatedCost;
package/dist/model.d.ts CHANGED
@@ -11,10 +11,12 @@ export declare class Model {
11
11
  inputTokens: number;
12
12
  outputTokens: number;
13
13
  cachedInputTokens?: number;
14
+ cacheCreationInputTokens?: number;
14
15
  }): {
15
16
  inputCost: number;
16
17
  outputCost: number;
17
18
  cachedInputCost?: number;
19
+ cacheCreationInputCost?: number;
18
20
  totalCost: number;
19
21
  currency: string;
20
22
  } | null;
package/dist/model.js CHANGED
@@ -26,16 +26,49 @@ export class Model {
26
26
  if (!model || !isTextModel(model)) {
27
27
  return null;
28
28
  }
29
+ const cachedTokens = usage.cachedInputTokens ?? 0;
30
+ const cacheCreationTokens = usage.cacheCreationInputTokens ?? 0;
31
+ // Disjoint buckets. If a discount price isn't defined for this model,
32
+ // the tokens were still billed by the provider — charge them at the
33
+ // full input rate so totalCost stays honest.
34
+ const cachedRate = model.cachedInputTokenCost ?? model.inputTokenCost ?? 0;
35
+ const cacheCreationRate = model.cacheCreationInputTokenCost ?? model.inputTokenCost ?? 0;
29
36
  const inputCost = round((usage.inputTokens * (model.inputTokenCost || 0)) / 1_000_000, 6);
30
37
  const outputCost = round((usage.outputTokens * (model.outputTokenCost || 0)) / 1_000_000, 6);
31
- const cachedInputCost = usage.cachedInputTokens && model.cachedInputTokenCost
32
- ? round((usage.cachedInputTokens * model.cachedInputTokenCost) / 1_000_000, 6)
33
- : undefined;
34
- const totalCost = round(inputCost + outputCost + (cachedInputCost || 0), 6);
38
+ // Only expose cachedInputCost / cacheCreationInputCost when the model
39
+ // actually has a distinct discount price. Otherwise, fold those dollars
40
+ // into inputCost so the user isn't misled by a $0 cached field.
41
+ let cachedInputCost;
42
+ let cacheCreationInputCost;
43
+ let foldedInputDollars = 0;
44
+ if (cachedTokens > 0) {
45
+ const dollars = (cachedTokens * cachedRate) / 1_000_000;
46
+ if (model.cachedInputTokenCost != null) {
47
+ cachedInputCost = round(dollars, 6);
48
+ }
49
+ else {
50
+ foldedInputDollars += dollars;
51
+ }
52
+ }
53
+ if (cacheCreationTokens > 0) {
54
+ const dollars = (cacheCreationTokens * cacheCreationRate) / 1_000_000;
55
+ if (model.cacheCreationInputTokenCost != null) {
56
+ cacheCreationInputCost = round(dollars, 6);
57
+ }
58
+ else {
59
+ foldedInputDollars += dollars;
60
+ }
61
+ }
62
+ const finalInputCost = round(inputCost + foldedInputDollars, 6);
63
+ const totalCost = round(finalInputCost +
64
+ outputCost +
65
+ (cachedInputCost || 0) +
66
+ (cacheCreationInputCost || 0), 6);
35
67
  return {
36
- inputCost,
68
+ inputCost: finalInputCost,
37
69
  outputCost,
38
70
  cachedInputCost,
71
+ cacheCreationInputCost,
39
72
  totalCost,
40
73
  currency: "USD",
41
74
  };
package/dist/models.d.ts CHANGED
@@ -16,6 +16,7 @@ export type BaseModel = {
16
16
  description?: string;
17
17
  inputTokenCost?: number;
18
18
  cachedInputTokenCost?: number;
19
+ cacheCreationInputTokenCost?: number;
19
20
  outputTokenCost?: number;
20
21
  disabled?: boolean;
21
22
  costUnit?: "tokens" | "characters" | "minutes";
@@ -466,10 +467,27 @@ export declare const textModels: readonly [{
466
467
  readonly outputTokenCost: 12;
467
468
  readonly disabled: true;
468
469
  readonly provider: "google";
470
+ }, {
471
+ readonly type: "text";
472
+ readonly modelName: "gemini-3.5-flash";
473
+ readonly description: "Latest Gemini 3.5 Flash model (GA May 2026). Outperforms Gemini 3.1 Pro on coding and agentic suites at 4x the speed. 1M context window, 64K output. Context caching: $0.15/1M read.";
474
+ readonly maxInputTokens: 1048576;
475
+ readonly maxOutputTokens: 65536;
476
+ readonly inputTokenCost: 1.5;
477
+ readonly cachedInputTokenCost: 0.15;
478
+ readonly outputTokenCost: 9;
479
+ readonly reasoning: {
480
+ readonly levels: readonly ["minimal", "low", "medium", "high"];
481
+ readonly defaultLevel: "high";
482
+ readonly canDisable: false;
483
+ readonly outputsThinking: true;
484
+ readonly outputsSignatures: true;
485
+ };
486
+ readonly provider: "google";
469
487
  }, {
470
488
  readonly type: "text";
471
489
  readonly modelName: "gemini-3-flash-preview";
472
- readonly description: "Latest Gemini 3 flash model with 1M context window and 64K output. Outperforms 2.5 Pro while being 3x faster. Optimized for agentic workflows and coding. Includes context caching for 90% cost reductions.";
490
+ readonly description: "Gemini 3 Flash preview. Superseded by gemini-3.5-flash. 1M context window and 64K output. Optimized for agentic workflows and coding.";
473
491
  readonly maxInputTokens: 1048576;
474
492
  readonly maxOutputTokens: 65536;
475
493
  readonly inputTokenCost: 0.5;
@@ -485,8 +503,8 @@ export declare const textModels: readonly [{
485
503
  readonly provider: "google";
486
504
  }, {
487
505
  readonly type: "text";
488
- readonly modelName: "gemini-3.1-flash-lite-preview";
489
- readonly description: "Most cost-effective Gemini 3.1 model with thinking support and 1M context window. 2.5x faster TTFA and 45% faster output than 2.5 Flash. Released March 2026.";
506
+ readonly modelName: "gemini-3.1-flash-lite";
507
+ readonly description: "Most cost-effective Gemini 3.1 model (GA). Thinking support, 1M context window, 64K output. 2.5x faster TTFA and 45% faster output than 2.5 Flash.";
490
508
  readonly maxInputTokens: 1048576;
491
509
  readonly maxOutputTokens: 65536;
492
510
  readonly inputTokenCost: 0.25;
@@ -500,6 +518,16 @@ export declare const textModels: readonly [{
500
518
  readonly outputsSignatures: true;
501
519
  };
502
520
  readonly provider: "google";
521
+ }, {
522
+ readonly type: "text";
523
+ readonly modelName: "gemini-3.1-flash-lite-preview";
524
+ readonly description: "DEPRECATED: Preview version, discontinued July 9, 2026. Use gemini-3.1-flash-lite instead.";
525
+ readonly maxInputTokens: 1048576;
526
+ readonly maxOutputTokens: 65536;
527
+ readonly inputTokenCost: 0.25;
528
+ readonly outputTokenCost: 1.5;
529
+ readonly disabled: true;
530
+ readonly provider: "google";
503
531
  }, {
504
532
  readonly type: "text";
505
533
  readonly modelName: "gemini-2.5-pro";
@@ -507,6 +535,7 @@ export declare const textModels: readonly [{
507
535
  readonly maxInputTokens: 2097152;
508
536
  readonly maxOutputTokens: 65536;
509
537
  readonly inputTokenCost: 1.25;
538
+ readonly cachedInputTokenCost: 0.31;
510
539
  readonly outputTokenCost: 10;
511
540
  readonly outputTokensPerSecond: 145;
512
541
  readonly reasoning: {
@@ -522,6 +551,7 @@ export declare const textModels: readonly [{
522
551
  readonly maxInputTokens: 1048576;
523
552
  readonly maxOutputTokens: 65536;
524
553
  readonly inputTokenCost: 0.3;
554
+ readonly cachedInputTokenCost: 0.075;
525
555
  readonly outputTokenCost: 2.5;
526
556
  readonly outputTokensPerSecond: 245;
527
557
  readonly reasoning: {
@@ -537,6 +567,7 @@ export declare const textModels: readonly [{
537
567
  readonly maxInputTokens: 1048576;
538
568
  readonly maxOutputTokens: 65536;
539
569
  readonly inputTokenCost: 0.1;
570
+ readonly cachedInputTokenCost: 0.025;
540
571
  readonly outputTokenCost: 0.4;
541
572
  readonly outputTokensPerSecond: 400;
542
573
  readonly reasoning: {
@@ -611,14 +642,30 @@ export declare const textModels: readonly [{
611
642
  readonly costUnit: "characters";
612
643
  readonly disabled: true;
613
644
  readonly provider: "google";
645
+ }, {
646
+ readonly type: "text";
647
+ readonly modelName: "claude-opus-4-8";
648
+ readonly description: "The most capable Claude model for complex reasoning and agentic coding. Same per-token pricing as Opus 4.7 with improved tool-use efficiency (~290 tokens for tool-use system prompt vs 675 on 4.7). 1M context window, 128K max output.";
649
+ readonly maxInputTokens: 1000000;
650
+ readonly maxOutputTokens: 128000;
651
+ readonly inputTokenCost: 5;
652
+ readonly cachedInputTokenCost: 0.5;
653
+ readonly outputTokenCost: 25;
654
+ readonly reasoning: {
655
+ readonly canDisable: false;
656
+ readonly outputsThinking: true;
657
+ readonly outputsSignatures: true;
658
+ };
659
+ readonly provider: "anthropic";
614
660
  }, {
615
661
  readonly type: "text";
616
662
  readonly modelName: "claude-opus-4-7";
617
- readonly description: "The most capable Claude model for complex reasoning and agentic coding. Features Adaptive Thinking that auto-tunes reasoning compute per request. 1M context window, 128K max output. Knowledge cutoff: January 2026.";
663
+ readonly description: "Claude Opus 4.7 for complex reasoning and agentic coding. Features Adaptive Thinking that auto-tunes reasoning compute per request. 1M context window, 128K max output. Knowledge cutoff: January 2026.";
618
664
  readonly maxInputTokens: 1000000;
619
665
  readonly maxOutputTokens: 128000;
620
666
  readonly inputTokenCost: 5;
621
667
  readonly cachedInputTokenCost: 0.5;
668
+ readonly cacheCreationInputTokenCost: 6.25;
622
669
  readonly outputTokenCost: 25;
623
670
  readonly outputTokensPerSecond: 72;
624
671
  readonly reasoning: {
@@ -635,6 +682,7 @@ export declare const textModels: readonly [{
635
682
  readonly maxOutputTokens: 128000;
636
683
  readonly inputTokenCost: 5;
637
684
  readonly cachedInputTokenCost: 0.5;
685
+ readonly cacheCreationInputTokenCost: 6.25;
638
686
  readonly outputTokenCost: 25;
639
687
  readonly outputTokensPerSecond: 53;
640
688
  readonly reasoning: {
@@ -651,6 +699,7 @@ export declare const textModels: readonly [{
651
699
  readonly maxOutputTokens: 64000;
652
700
  readonly inputTokenCost: 3;
653
701
  readonly cachedInputTokenCost: 0.3;
702
+ readonly cacheCreationInputTokenCost: 3.75;
654
703
  readonly outputTokenCost: 15;
655
704
  readonly outputTokensPerSecond: 52;
656
705
  readonly reasoning: {
@@ -667,6 +716,7 @@ export declare const textModels: readonly [{
667
716
  readonly maxOutputTokens: 64000;
668
717
  readonly inputTokenCost: 1;
669
718
  readonly cachedInputTokenCost: 0.1;
719
+ readonly cacheCreationInputTokenCost: 1.25;
670
720
  readonly outputTokenCost: 5;
671
721
  readonly outputTokensPerSecond: 97;
672
722
  readonly reasoning: {
@@ -734,7 +784,14 @@ export declare const imageModels: readonly [{
734
784
  readonly type: "image";
735
785
  readonly modelName: "gemini-3.1-flash-image-preview";
736
786
  readonly provider: "google";
737
- readonly description: "Fast image generation with Gemini 3.1 Flash. Supports resolutions from 512px to 4096px. ~$0.067/image at 1K resolution.";
787
+ readonly description: "DEPRECATED: Preview version. Use gemini-3.1-flash-image instead.";
788
+ readonly costPerImage: 0.067;
789
+ readonly disabled: true;
790
+ }, {
791
+ readonly type: "image";
792
+ readonly modelName: "gemini-3.1-flash-image";
793
+ readonly provider: "google";
794
+ readonly description: "Fast image generation with Gemini 3.1 Flash (GA). Supports resolutions from 512px to 4096px. ~$0.045/image at 512px, $0.067 at 1K, $0.101 at 2K, $0.151 at 4K.";
738
795
  readonly costPerImage: 0.067;
739
796
  }];
740
797
  export declare const embeddingsModels: EmbeddingsModel[];
@@ -1156,10 +1213,27 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
1156
1213
  readonly outputTokenCost: 12;
1157
1214
  readonly disabled: true;
1158
1215
  readonly provider: "google";
1216
+ } | {
1217
+ readonly type: "text";
1218
+ readonly modelName: "gemini-3.5-flash";
1219
+ readonly description: "Latest Gemini 3.5 Flash model (GA May 2026). Outperforms Gemini 3.1 Pro on coding and agentic suites at 4x the speed. 1M context window, 64K output. Context caching: $0.15/1M read.";
1220
+ readonly maxInputTokens: 1048576;
1221
+ readonly maxOutputTokens: 65536;
1222
+ readonly inputTokenCost: 1.5;
1223
+ readonly cachedInputTokenCost: 0.15;
1224
+ readonly outputTokenCost: 9;
1225
+ readonly reasoning: {
1226
+ readonly levels: readonly ["minimal", "low", "medium", "high"];
1227
+ readonly defaultLevel: "high";
1228
+ readonly canDisable: false;
1229
+ readonly outputsThinking: true;
1230
+ readonly outputsSignatures: true;
1231
+ };
1232
+ readonly provider: "google";
1159
1233
  } | {
1160
1234
  readonly type: "text";
1161
1235
  readonly modelName: "gemini-3-flash-preview";
1162
- readonly description: "Latest Gemini 3 flash model with 1M context window and 64K output. Outperforms 2.5 Pro while being 3x faster. Optimized for agentic workflows and coding. Includes context caching for 90% cost reductions.";
1236
+ readonly description: "Gemini 3 Flash preview. Superseded by gemini-3.5-flash. 1M context window and 64K output. Optimized for agentic workflows and coding.";
1163
1237
  readonly maxInputTokens: 1048576;
1164
1238
  readonly maxOutputTokens: 65536;
1165
1239
  readonly inputTokenCost: 0.5;
@@ -1175,8 +1249,8 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
1175
1249
  readonly provider: "google";
1176
1250
  } | {
1177
1251
  readonly type: "text";
1178
- readonly modelName: "gemini-3.1-flash-lite-preview";
1179
- readonly description: "Most cost-effective Gemini 3.1 model with thinking support and 1M context window. 2.5x faster TTFA and 45% faster output than 2.5 Flash. Released March 2026.";
1252
+ readonly modelName: "gemini-3.1-flash-lite";
1253
+ readonly description: "Most cost-effective Gemini 3.1 model (GA). Thinking support, 1M context window, 64K output. 2.5x faster TTFA and 45% faster output than 2.5 Flash.";
1180
1254
  readonly maxInputTokens: 1048576;
1181
1255
  readonly maxOutputTokens: 65536;
1182
1256
  readonly inputTokenCost: 0.25;
@@ -1190,6 +1264,16 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
1190
1264
  readonly outputsSignatures: true;
1191
1265
  };
1192
1266
  readonly provider: "google";
1267
+ } | {
1268
+ readonly type: "text";
1269
+ readonly modelName: "gemini-3.1-flash-lite-preview";
1270
+ readonly description: "DEPRECATED: Preview version, discontinued July 9, 2026. Use gemini-3.1-flash-lite instead.";
1271
+ readonly maxInputTokens: 1048576;
1272
+ readonly maxOutputTokens: 65536;
1273
+ readonly inputTokenCost: 0.25;
1274
+ readonly outputTokenCost: 1.5;
1275
+ readonly disabled: true;
1276
+ readonly provider: "google";
1193
1277
  } | {
1194
1278
  readonly type: "text";
1195
1279
  readonly modelName: "gemini-2.5-pro";
@@ -1197,6 +1281,7 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
1197
1281
  readonly maxInputTokens: 2097152;
1198
1282
  readonly maxOutputTokens: 65536;
1199
1283
  readonly inputTokenCost: 1.25;
1284
+ readonly cachedInputTokenCost: 0.31;
1200
1285
  readonly outputTokenCost: 10;
1201
1286
  readonly outputTokensPerSecond: 145;
1202
1287
  readonly reasoning: {
@@ -1212,6 +1297,7 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
1212
1297
  readonly maxInputTokens: 1048576;
1213
1298
  readonly maxOutputTokens: 65536;
1214
1299
  readonly inputTokenCost: 0.3;
1300
+ readonly cachedInputTokenCost: 0.075;
1215
1301
  readonly outputTokenCost: 2.5;
1216
1302
  readonly outputTokensPerSecond: 245;
1217
1303
  readonly reasoning: {
@@ -1227,6 +1313,7 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
1227
1313
  readonly maxInputTokens: 1048576;
1228
1314
  readonly maxOutputTokens: 65536;
1229
1315
  readonly inputTokenCost: 0.1;
1316
+ readonly cachedInputTokenCost: 0.025;
1230
1317
  readonly outputTokenCost: 0.4;
1231
1318
  readonly outputTokensPerSecond: 400;
1232
1319
  readonly reasoning: {
@@ -1301,14 +1388,30 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
1301
1388
  readonly costUnit: "characters";
1302
1389
  readonly disabled: true;
1303
1390
  readonly provider: "google";
1391
+ } | {
1392
+ readonly type: "text";
1393
+ readonly modelName: "claude-opus-4-8";
1394
+ readonly description: "The most capable Claude model for complex reasoning and agentic coding. Same per-token pricing as Opus 4.7 with improved tool-use efficiency (~290 tokens for tool-use system prompt vs 675 on 4.7). 1M context window, 128K max output.";
1395
+ readonly maxInputTokens: 1000000;
1396
+ readonly maxOutputTokens: 128000;
1397
+ readonly inputTokenCost: 5;
1398
+ readonly cachedInputTokenCost: 0.5;
1399
+ readonly outputTokenCost: 25;
1400
+ readonly reasoning: {
1401
+ readonly canDisable: false;
1402
+ readonly outputsThinking: true;
1403
+ readonly outputsSignatures: true;
1404
+ };
1405
+ readonly provider: "anthropic";
1304
1406
  } | {
1305
1407
  readonly type: "text";
1306
1408
  readonly modelName: "claude-opus-4-7";
1307
- readonly description: "The most capable Claude model for complex reasoning and agentic coding. Features Adaptive Thinking that auto-tunes reasoning compute per request. 1M context window, 128K max output. Knowledge cutoff: January 2026.";
1409
+ readonly description: "Claude Opus 4.7 for complex reasoning and agentic coding. Features Adaptive Thinking that auto-tunes reasoning compute per request. 1M context window, 128K max output. Knowledge cutoff: January 2026.";
1308
1410
  readonly maxInputTokens: 1000000;
1309
1411
  readonly maxOutputTokens: 128000;
1310
1412
  readonly inputTokenCost: 5;
1311
1413
  readonly cachedInputTokenCost: 0.5;
1414
+ readonly cacheCreationInputTokenCost: 6.25;
1312
1415
  readonly outputTokenCost: 25;
1313
1416
  readonly outputTokensPerSecond: 72;
1314
1417
  readonly reasoning: {
@@ -1325,6 +1428,7 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
1325
1428
  readonly maxOutputTokens: 128000;
1326
1429
  readonly inputTokenCost: 5;
1327
1430
  readonly cachedInputTokenCost: 0.5;
1431
+ readonly cacheCreationInputTokenCost: 6.25;
1328
1432
  readonly outputTokenCost: 25;
1329
1433
  readonly outputTokensPerSecond: 53;
1330
1434
  readonly reasoning: {
@@ -1341,6 +1445,7 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
1341
1445
  readonly maxOutputTokens: 64000;
1342
1446
  readonly inputTokenCost: 3;
1343
1447
  readonly cachedInputTokenCost: 0.3;
1448
+ readonly cacheCreationInputTokenCost: 3.75;
1344
1449
  readonly outputTokenCost: 15;
1345
1450
  readonly outputTokensPerSecond: 52;
1346
1451
  readonly reasoning: {
@@ -1357,6 +1462,7 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
1357
1462
  readonly maxOutputTokens: 64000;
1358
1463
  readonly inputTokenCost: 1;
1359
1464
  readonly cachedInputTokenCost: 0.1;
1465
+ readonly cacheCreationInputTokenCost: 1.25;
1360
1466
  readonly outputTokenCost: 5;
1361
1467
  readonly outputTokensPerSecond: 97;
1362
1468
  readonly reasoning: {
@@ -1423,7 +1529,14 @@ export declare function getModel(modelName: ModelName): TextModel | EmbeddingsMo
1423
1529
  readonly type: "image";
1424
1530
  readonly modelName: "gemini-3.1-flash-image-preview";
1425
1531
  readonly provider: "google";
1426
- readonly description: "Fast image generation with Gemini 3.1 Flash. Supports resolutions from 512px to 4096px. ~$0.067/image at 1K resolution.";
1532
+ readonly description: "DEPRECATED: Preview version. Use gemini-3.1-flash-image instead.";
1533
+ readonly costPerImage: 0.067;
1534
+ readonly disabled: true;
1535
+ } | {
1536
+ readonly type: "image";
1537
+ readonly modelName: "gemini-3.1-flash-image";
1538
+ readonly provider: "google";
1539
+ readonly description: "Fast image generation with Gemini 3.1 Flash (GA). Supports resolutions from 512px to 4096px. ~$0.045/image at 512px, $0.067 at 1K, $0.101 at 2K, $0.151 at 4K.";
1427
1540
  readonly costPerImage: 0.067;
1428
1541
  } | undefined;
1429
1542
  export declare function isImageModel(model: ModelType): model is ImageModel;
package/dist/models.js CHANGED
@@ -459,10 +459,28 @@ export const textModels = [
459
459
  disabled: true,
460
460
  provider: "google",
461
461
  },
462
+ {
463
+ type: "text",
464
+ modelName: "gemini-3.5-flash",
465
+ description: "Latest Gemini 3.5 Flash model (GA May 2026). Outperforms Gemini 3.1 Pro on coding and agentic suites at 4x the speed. 1M context window, 64K output. Context caching: $0.15/1M read.",
466
+ maxInputTokens: 1_048_576,
467
+ maxOutputTokens: 65536,
468
+ inputTokenCost: 1.5,
469
+ cachedInputTokenCost: 0.15,
470
+ outputTokenCost: 9.0,
471
+ reasoning: {
472
+ levels: ["minimal", "low", "medium", "high"],
473
+ defaultLevel: "high",
474
+ canDisable: false,
475
+ outputsThinking: true,
476
+ outputsSignatures: true,
477
+ },
478
+ provider: "google",
479
+ },
462
480
  {
463
481
  type: "text",
464
482
  modelName: "gemini-3-flash-preview",
465
- description: "Latest Gemini 3 flash model with 1M context window and 64K output. Outperforms 2.5 Pro while being 3x faster. Optimized for agentic workflows and coding. Includes context caching for 90% cost reductions.",
483
+ description: "Gemini 3 Flash preview. Superseded by gemini-3.5-flash. 1M context window and 64K output. Optimized for agentic workflows and coding.",
466
484
  maxInputTokens: 1_048_576,
467
485
  maxOutputTokens: 65536,
468
486
  inputTokenCost: 0.5,
@@ -479,8 +497,8 @@ export const textModels = [
479
497
  },
480
498
  {
481
499
  type: "text",
482
- modelName: "gemini-3.1-flash-lite-preview",
483
- description: "Most cost-effective Gemini 3.1 model with thinking support and 1M context window. 2.5x faster TTFA and 45% faster output than 2.5 Flash. Released March 2026.",
500
+ modelName: "gemini-3.1-flash-lite",
501
+ description: "Most cost-effective Gemini 3.1 model (GA). Thinking support, 1M context window, 64K output. 2.5x faster TTFA and 45% faster output than 2.5 Flash.",
484
502
  maxInputTokens: 1_048_576,
485
503
  maxOutputTokens: 65536,
486
504
  inputTokenCost: 0.25,
@@ -495,6 +513,17 @@ export const textModels = [
495
513
  },
496
514
  provider: "google",
497
515
  },
516
+ {
517
+ type: "text",
518
+ modelName: "gemini-3.1-flash-lite-preview",
519
+ description: "DEPRECATED: Preview version, discontinued July 9, 2026. Use gemini-3.1-flash-lite instead.",
520
+ maxInputTokens: 1_048_576,
521
+ maxOutputTokens: 65536,
522
+ inputTokenCost: 0.25,
523
+ outputTokenCost: 1.5,
524
+ disabled: true,
525
+ provider: "google",
526
+ },
498
527
  {
499
528
  type: "text",
500
529
  modelName: "gemini-2.5-pro",
@@ -502,6 +531,7 @@ export const textModels = [
502
531
  maxInputTokens: 2_097_152,
503
532
  maxOutputTokens: 65536,
504
533
  inputTokenCost: 1.25,
534
+ cachedInputTokenCost: 0.31,
505
535
  outputTokenCost: 10.0,
506
536
  outputTokensPerSecond: 145,
507
537
  reasoning: {
@@ -518,6 +548,7 @@ export const textModels = [
518
548
  maxInputTokens: 1_048_576,
519
549
  maxOutputTokens: 65536,
520
550
  inputTokenCost: 0.3,
551
+ cachedInputTokenCost: 0.075,
521
552
  outputTokenCost: 2.5,
522
553
  outputTokensPerSecond: 245,
523
554
  reasoning: {
@@ -534,6 +565,7 @@ export const textModels = [
534
565
  maxInputTokens: 1_048_576,
535
566
  maxOutputTokens: 65536,
536
567
  inputTokenCost: 0.1,
568
+ cachedInputTokenCost: 0.025,
537
569
  outputTokenCost: 0.4,
538
570
  outputTokensPerSecond: 400,
539
571
  reasoning: {
@@ -615,14 +647,31 @@ export const textModels = [
615
647
  disabled: true,
616
648
  provider: "google",
617
649
  },
650
+ {
651
+ type: "text",
652
+ modelName: "claude-opus-4-8",
653
+ description: "The most capable Claude model for complex reasoning and agentic coding. Same per-token pricing as Opus 4.7 with improved tool-use efficiency (~290 tokens for tool-use system prompt vs 675 on 4.7). 1M context window, 128K max output.",
654
+ maxInputTokens: 1_000_000,
655
+ maxOutputTokens: 128_000,
656
+ inputTokenCost: 5,
657
+ cachedInputTokenCost: 0.5,
658
+ outputTokenCost: 25,
659
+ reasoning: {
660
+ canDisable: false,
661
+ outputsThinking: true,
662
+ outputsSignatures: true,
663
+ },
664
+ provider: "anthropic",
665
+ },
618
666
  {
619
667
  type: "text",
620
668
  modelName: "claude-opus-4-7",
621
- description: "The most capable Claude model for complex reasoning and agentic coding. Features Adaptive Thinking that auto-tunes reasoning compute per request. 1M context window, 128K max output. Knowledge cutoff: January 2026.",
669
+ description: "Claude Opus 4.7 for complex reasoning and agentic coding. Features Adaptive Thinking that auto-tunes reasoning compute per request. 1M context window, 128K max output. Knowledge cutoff: January 2026.",
622
670
  maxInputTokens: 1_000_000,
623
671
  maxOutputTokens: 128_000,
624
672
  inputTokenCost: 5,
625
673
  cachedInputTokenCost: 0.5,
674
+ cacheCreationInputTokenCost: 6.25,
626
675
  outputTokenCost: 25,
627
676
  outputTokensPerSecond: 72,
628
677
  reasoning: {
@@ -640,6 +689,7 @@ export const textModels = [
640
689
  maxOutputTokens: 128_000,
641
690
  inputTokenCost: 5,
642
691
  cachedInputTokenCost: 0.5,
692
+ cacheCreationInputTokenCost: 6.25,
643
693
  outputTokenCost: 25,
644
694
  outputTokensPerSecond: 53,
645
695
  reasoning: {
@@ -657,6 +707,7 @@ export const textModels = [
657
707
  maxOutputTokens: 64_000,
658
708
  inputTokenCost: 3,
659
709
  cachedInputTokenCost: 0.3,
710
+ cacheCreationInputTokenCost: 3.75,
660
711
  outputTokenCost: 15,
661
712
  outputTokensPerSecond: 52,
662
713
  reasoning: {
@@ -674,6 +725,7 @@ export const textModels = [
674
725
  maxOutputTokens: 64_000,
675
726
  inputTokenCost: 1,
676
727
  cachedInputTokenCost: 0.1,
728
+ cacheCreationInputTokenCost: 1.25,
677
729
  outputTokenCost: 5,
678
730
  outputTokensPerSecond: 97,
679
731
  reasoning: {
@@ -751,7 +803,15 @@ export const imageModels = [
751
803
  type: "image",
752
804
  modelName: "gemini-3.1-flash-image-preview",
753
805
  provider: "google",
754
- description: "Fast image generation with Gemini 3.1 Flash. Supports resolutions from 512px to 4096px. ~$0.067/image at 1K resolution.",
806
+ description: "DEPRECATED: Preview version. Use gemini-3.1-flash-image instead.",
807
+ costPerImage: 0.067,
808
+ disabled: true,
809
+ },
810
+ {
811
+ type: "image",
812
+ modelName: "gemini-3.1-flash-image",
813
+ provider: "google",
814
+ description: "Fast image generation with Gemini 3.1 Flash (GA). Supports resolutions from 512px to 4096px. ~$0.045/image at 512px, $0.067 at 1K, $0.101 at 2K, $0.151 at 4K.",
755
815
  costPerImage: 0.067,
756
816
  },
757
817
  ];
@@ -3,6 +3,7 @@ export type CostEstimate = {
3
3
  inputCost: number;
4
4
  outputCost: number;
5
5
  cachedInputCost?: number;
6
+ cacheCreationInputCost?: number;
6
7
  totalCost: number;
7
8
  currency: string;
8
9
  };
@@ -10,6 +11,7 @@ export declare const CostEstimateSchema: z.ZodObject<{
10
11
  inputCost: z.ZodNumber;
11
12
  outputCost: z.ZodNumber;
12
13
  cachedInputCost: z.ZodOptional<z.ZodNumber>;
14
+ cacheCreationInputCost: z.ZodOptional<z.ZodNumber>;
13
15
  totalCost: z.ZodNumber;
14
16
  currency: z.ZodString;
15
17
  }, z.core.$strip>;
@@ -3,6 +3,7 @@ export const CostEstimateSchema = z.object({
3
3
  inputCost: z.number(),
4
4
  outputCost: z.number(),
5
5
  cachedInputCost: z.number().optional(),
6
+ cacheCreationInputCost: z.number().optional(),
6
7
  totalCost: z.number(),
7
8
  currency: z.string(),
8
9
  });
@@ -24,6 +25,7 @@ export function addCosts(_a, _b) {
24
25
  inputCost: a.inputCost + b.inputCost,
25
26
  outputCost: a.outputCost + b.outputCost,
26
27
  cachedInputCost: (a.cachedInputCost || 0) + (b.cachedInputCost || 0),
28
+ cacheCreationInputCost: (a.cacheCreationInputCost || 0) + (b.cacheCreationInputCost || 0),
27
29
  totalCost: a.totalCost + b.totalCost,
28
30
  currency: a.currency,
29
31
  };
@@ -3,12 +3,14 @@ export type TokenUsage = {
3
3
  inputTokens: number;
4
4
  outputTokens: number;
5
5
  cachedInputTokens?: number;
6
+ cacheCreationInputTokens?: number;
6
7
  totalTokens?: number;
7
8
  };
8
9
  export declare const TokenUsageSchema: z.ZodObject<{
9
10
  inputTokens: z.ZodNumber;
10
11
  outputTokens: z.ZodNumber;
11
12
  cachedInputTokens: z.ZodOptional<z.ZodNumber>;
13
+ cacheCreationInputTokens: z.ZodOptional<z.ZodNumber>;
12
14
  totalTokens: z.ZodOptional<z.ZodNumber>;
13
15
  }, z.core.$strip>;
14
16
  export declare function addTokenUsage(_a?: TokenUsage, _b?: TokenUsage): TokenUsage;
@@ -3,6 +3,7 @@ export const TokenUsageSchema = z.object({
3
3
  inputTokens: z.number(),
4
4
  outputTokens: z.number(),
5
5
  cachedInputTokens: z.number().optional(),
6
+ cacheCreationInputTokens: z.number().optional(),
6
7
  totalTokens: z.number().optional(),
7
8
  });
8
9
  export function addTokenUsage(_a, _b) {
@@ -20,6 +21,7 @@ export function addTokenUsage(_a, _b) {
20
21
  inputTokens: a.inputTokens + b.inputTokens,
21
22
  outputTokens: a.outputTokens + b.outputTokens,
22
23
  cachedInputTokens: (a.cachedInputTokens || 0) + (b.cachedInputTokens || 0),
24
+ cacheCreationInputTokens: (a.cacheCreationInputTokens || 0) + (b.cacheCreationInputTokens || 0),
23
25
  totalTokens: (a.totalTokens || 0) + (b.totalTokens || 0),
24
26
  };
25
27
  }
package/dist/types.d.ts CHANGED
@@ -69,6 +69,10 @@ export type SmolConfig = {
69
69
  enabled: boolean;
70
70
  budgetTokens?: number;
71
71
  };
72
+ /** Prompt caching. Currently used by Anthropic; OpenAI/Google cache automatically. Defaults to enabled. */
73
+ caching?: {
74
+ enabled?: boolean;
75
+ };
72
76
  /** Provider-agnostic reasoning effort level. */
73
77
  reasoningEffort?: "low" | "medium" | "high";
74
78
  responseFormatOptions?: Partial<{
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "smoltalk",
3
- "version": "0.3.1",
3
+ "version": "0.4.0",
4
4
  "description": "A common interface for LLM APIs",
5
5
  "homepage": "https://github.com/egonSchiele/smoltalk",
6
6
  "files": [