llmist 2.2.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2497,7 +2497,27 @@ var init_cost_reporting_client = __esm({
2497
2497
  constructor(client, reportCost) {
2498
2498
  this.client = client;
2499
2499
  this.reportCost = reportCost;
2500
+ this.image = {
2501
+ generate: async (options) => {
2502
+ const result = await this.client.image.generate(options);
2503
+ if (result.cost !== void 0 && result.cost > 0) {
2504
+ this.reportCost(result.cost);
2505
+ }
2506
+ return result;
2507
+ }
2508
+ };
2509
+ this.speech = {
2510
+ generate: async (options) => {
2511
+ const result = await this.client.speech.generate(options);
2512
+ if (result.cost !== void 0 && result.cost > 0) {
2513
+ this.reportCost(result.cost);
2514
+ }
2515
+ return result;
2516
+ }
2517
+ };
2500
2518
  }
2519
+ image;
2520
+ speech;
2501
2521
  /**
2502
2522
  * Access to model registry for cost estimation.
2503
2523
  */
@@ -5385,6 +5405,28 @@ var init_anthropic = __esm({
5385
5405
  getModelSpecs() {
5386
5406
  return ANTHROPIC_MODELS;
5387
5407
  }
5408
+ // =========================================================================
5409
+ // Image Generation (Not Supported)
5410
+ // =========================================================================
5411
+ supportsImageGeneration(_modelId) {
5412
+ return false;
5413
+ }
5414
+ async generateImage() {
5415
+ throw new Error(
5416
+ "Anthropic does not support image generation. Use OpenAI (DALL-E, GPT Image) or Google Gemini (Imagen) instead."
5417
+ );
5418
+ }
5419
+ // =========================================================================
5420
+ // Speech Generation (Not Supported)
5421
+ // =========================================================================
5422
+ supportsSpeechGeneration(_modelId) {
5423
+ return false;
5424
+ }
5425
+ async generateSpeech() {
5426
+ throw new Error(
5427
+ "Anthropic does not support speech generation. Use OpenAI (TTS) or Google Gemini (TTS) instead."
5428
+ );
5429
+ }
5388
5430
  buildRequestPayload(options, descriptor, spec, messages) {
5389
5431
  const systemMessages = messages.filter((message) => message.role === "system");
5390
5432
  const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
@@ -5539,6 +5581,182 @@ var init_anthropic = __esm({
5539
5581
  }
5540
5582
  });
5541
5583
 
5584
+ // src/providers/gemini-image-models.ts
5585
+ function getGeminiImageModelSpec(modelId) {
5586
+ return geminiImageModels.find((m) => m.modelId === modelId);
5587
+ }
5588
+ function isGeminiImageModel(modelId) {
5589
+ return geminiImageModels.some((m) => m.modelId === modelId);
5590
+ }
5591
+ function calculateGeminiImageCost(modelId, size = "1:1", n = 1) {
5592
+ const spec = getGeminiImageModelSpec(modelId);
5593
+ if (!spec) return void 0;
5594
+ if (spec.pricing.perImage !== void 0) {
5595
+ return spec.pricing.perImage * n;
5596
+ }
5597
+ if (spec.pricing.bySize) {
5598
+ const sizePrice = spec.pricing.bySize[size];
5599
+ if (typeof sizePrice === "number") {
5600
+ return sizePrice * n;
5601
+ }
5602
+ }
5603
+ return void 0;
5604
+ }
5605
+ var IMAGEN4_ASPECT_RATIOS, GEMINI_IMAGE_ASPECT_RATIOS, geminiImageModels;
5606
+ var init_gemini_image_models = __esm({
5607
+ "src/providers/gemini-image-models.ts"() {
5608
+ "use strict";
5609
+ IMAGEN4_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
5610
+ GEMINI_IMAGE_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
5611
+ geminiImageModels = [
5612
+ // Imagen 4 Family (standalone image generation)
5613
+ {
5614
+ provider: "gemini",
5615
+ modelId: "imagen-4.0-fast-generate-001",
5616
+ displayName: "Imagen 4 Fast",
5617
+ pricing: {
5618
+ perImage: 0.02
5619
+ },
5620
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5621
+ maxImages: 4,
5622
+ defaultSize: "1:1",
5623
+ features: {
5624
+ textRendering: true
5625
+ }
5626
+ },
5627
+ {
5628
+ provider: "gemini",
5629
+ modelId: "imagen-4.0-generate-001",
5630
+ displayName: "Imagen 4",
5631
+ pricing: {
5632
+ perImage: 0.04
5633
+ },
5634
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5635
+ maxImages: 4,
5636
+ defaultSize: "1:1",
5637
+ features: {
5638
+ textRendering: true
5639
+ }
5640
+ },
5641
+ {
5642
+ provider: "gemini",
5643
+ modelId: "imagen-4.0-ultra-generate-001",
5644
+ displayName: "Imagen 4 Ultra",
5645
+ pricing: {
5646
+ perImage: 0.06
5647
+ },
5648
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5649
+ maxImages: 4,
5650
+ defaultSize: "1:1",
5651
+ features: {
5652
+ textRendering: true
5653
+ }
5654
+ },
5655
+ // Preview versions
5656
+ {
5657
+ provider: "gemini",
5658
+ modelId: "imagen-4.0-generate-preview-06-06",
5659
+ displayName: "Imagen 4 (Preview)",
5660
+ pricing: {
5661
+ perImage: 0.04
5662
+ },
5663
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5664
+ maxImages: 4,
5665
+ defaultSize: "1:1",
5666
+ features: {
5667
+ textRendering: true
5668
+ }
5669
+ },
5670
+ {
5671
+ provider: "gemini",
5672
+ modelId: "imagen-4.0-ultra-generate-preview-06-06",
5673
+ displayName: "Imagen 4 Ultra (Preview)",
5674
+ pricing: {
5675
+ perImage: 0.06
5676
+ },
5677
+ supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
5678
+ maxImages: 4,
5679
+ defaultSize: "1:1",
5680
+ features: {
5681
+ textRendering: true
5682
+ }
5683
+ },
5684
+ // Gemini Native Image Generation (multimodal models)
5685
+ {
5686
+ provider: "gemini",
5687
+ modelId: "gemini-2.5-flash-image",
5688
+ displayName: "Gemini 2.5 Flash Image",
5689
+ pricing: {
5690
+ perImage: 0.039
5691
+ },
5692
+ supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
5693
+ maxImages: 1,
5694
+ defaultSize: "1:1",
5695
+ features: {
5696
+ conversational: true,
5697
+ textRendering: true
5698
+ }
5699
+ },
5700
+ {
5701
+ provider: "gemini",
5702
+ modelId: "gemini-2.5-flash-image-preview",
5703
+ displayName: "Gemini 2.5 Flash Image (Preview)",
5704
+ pricing: {
5705
+ perImage: 0.039
5706
+ },
5707
+ supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
5708
+ maxImages: 1,
5709
+ defaultSize: "1:1",
5710
+ features: {
5711
+ conversational: true,
5712
+ textRendering: true
5713
+ }
5714
+ },
5715
+ {
5716
+ provider: "gemini",
5717
+ modelId: "gemini-3-pro-image-preview",
5718
+ displayName: "Gemini 3 Pro Image (Preview)",
5719
+ pricing: {
5720
+ // Token-based: ~$0.134 per 1K/2K image, $0.24 per 4K
5721
+ // Using 2K as default
5722
+ bySize: {
5723
+ "1K": 0.134,
5724
+ "2K": 0.134,
5725
+ "4K": 0.24
5726
+ }
5727
+ },
5728
+ supportedSizes: ["1K", "2K", "4K"],
5729
+ maxImages: 1,
5730
+ defaultSize: "2K",
5731
+ features: {
5732
+ conversational: true,
5733
+ textRendering: true
5734
+ }
5735
+ },
5736
+ // Alias: nano-banana-pro-preview is gemini-3-pro-image-preview
5737
+ {
5738
+ provider: "gemini",
5739
+ modelId: "nano-banana-pro-preview",
5740
+ displayName: "Nano Banana Pro (Gemini 3 Pro Image)",
5741
+ pricing: {
5742
+ bySize: {
5743
+ "1K": 0.134,
5744
+ "2K": 0.134,
5745
+ "4K": 0.24
5746
+ }
5747
+ },
5748
+ supportedSizes: ["1K", "2K", "4K"],
5749
+ maxImages: 1,
5750
+ defaultSize: "2K",
5751
+ features: {
5752
+ conversational: true,
5753
+ textRendering: true
5754
+ }
5755
+ }
5756
+ ];
5757
+ }
5758
+ });
5759
+
5542
5760
  // src/providers/gemini-models.ts
5543
5761
  var GEMINI_MODELS;
5544
5762
  var init_gemini_models = __esm({
@@ -5712,7 +5930,171 @@ var init_gemini_models = __esm({
5712
5930
  }
5713
5931
  });
5714
5932
 
5933
+ // src/providers/gemini-speech-models.ts
5934
+ function getGeminiSpeechModelSpec(modelId) {
5935
+ return geminiSpeechModels.find((m) => m.modelId === modelId);
5936
+ }
5937
+ function isGeminiSpeechModel(modelId) {
5938
+ return geminiSpeechModels.some((m) => m.modelId === modelId);
5939
+ }
5940
+ function calculateGeminiSpeechCost(modelId, characterCount, estimatedMinutes) {
5941
+ const spec = getGeminiSpeechModelSpec(modelId);
5942
+ if (!spec) return void 0;
5943
+ if (spec.pricing.perMinute !== void 0) {
5944
+ if (estimatedMinutes !== void 0) {
5945
+ return estimatedMinutes * spec.pricing.perMinute;
5946
+ }
5947
+ const approxMinutes = characterCount / 750;
5948
+ return approxMinutes * spec.pricing.perMinute;
5949
+ }
5950
+ return void 0;
5951
+ }
5952
+ var GEMINI_TTS_VOICES, GEMINI_TTS_FORMATS, geminiSpeechModels;
5953
+ var init_gemini_speech_models = __esm({
5954
+ "src/providers/gemini-speech-models.ts"() {
5955
+ "use strict";
5956
+ GEMINI_TTS_VOICES = [
5957
+ "Zephyr",
5958
+ // Bright
5959
+ "Puck",
5960
+ // Upbeat
5961
+ "Charon",
5962
+ // Informative
5963
+ "Kore",
5964
+ // Firm
5965
+ "Fenrir",
5966
+ // Excitable
5967
+ "Leda",
5968
+ // Youthful
5969
+ "Orus",
5970
+ // Firm
5971
+ "Aoede",
5972
+ // Breezy
5973
+ "Callirrhoe",
5974
+ // Easy-going
5975
+ "Autonoe",
5976
+ // Bright
5977
+ "Enceladus",
5978
+ // Breathy
5979
+ "Iapetus",
5980
+ // Clear
5981
+ "Umbriel",
5982
+ // Easy-going
5983
+ "Algieba",
5984
+ // Smooth
5985
+ "Despina",
5986
+ // Smooth
5987
+ "Erinome",
5988
+ // Clear
5989
+ "Algenib",
5990
+ // Gravelly
5991
+ "Rasalgethi",
5992
+ // Informative
5993
+ "Laomedeia",
5994
+ // Upbeat
5995
+ "Achernar",
5996
+ // Soft
5997
+ "Alnilam",
5998
+ // Firm
5999
+ "Schedar",
6000
+ // Even
6001
+ "Gacrux",
6002
+ // Mature
6003
+ "Pulcherrima",
6004
+ // Forward
6005
+ "Achird",
6006
+ // Friendly
6007
+ "Zubenelgenubi",
6008
+ // Casual
6009
+ "Vindemiatrix",
6010
+ // Gentle
6011
+ "Sadachbia",
6012
+ // Lively
6013
+ "Sadaltager",
6014
+ // Knowledgeable
6015
+ "Sulafat"
6016
+ // Warm
6017
+ ];
6018
+ GEMINI_TTS_FORMATS = ["pcm", "wav"];
6019
+ geminiSpeechModels = [
6020
+ {
6021
+ provider: "gemini",
6022
+ modelId: "gemini-2.5-flash-preview-tts",
6023
+ displayName: "Gemini 2.5 Flash TTS (Preview)",
6024
+ pricing: {
6025
+ // $0.50 per 1M input tokens = $0.0000005 per token
6026
+ perInputToken: 5e-7,
6027
+ // $10.00 per 1M audio output tokens = $0.00001 per token
6028
+ perAudioOutputToken: 1e-5,
6029
+ // Rough estimate: ~$0.01 per minute of audio
6030
+ perMinute: 0.01
6031
+ },
6032
+ voices: [...GEMINI_TTS_VOICES],
6033
+ formats: GEMINI_TTS_FORMATS,
6034
+ maxInputLength: 8e3,
6035
+ // bytes (text + prompt combined)
6036
+ defaultVoice: "Zephyr",
6037
+ defaultFormat: "wav",
6038
+ features: {
6039
+ multiSpeaker: true,
6040
+ languages: 24,
6041
+ voiceInstructions: true
6042
+ }
6043
+ },
6044
+ {
6045
+ provider: "gemini",
6046
+ modelId: "gemini-2.5-pro-preview-tts",
6047
+ displayName: "Gemini 2.5 Pro TTS (Preview)",
6048
+ pricing: {
6049
+ // $1.00 per 1M input tokens = $0.000001 per token
6050
+ perInputToken: 1e-6,
6051
+ // $20.00 per 1M audio output tokens = $0.00002 per token
6052
+ perAudioOutputToken: 2e-5,
6053
+ // Rough estimate: ~$0.02 per minute of audio
6054
+ perMinute: 0.02
6055
+ },
6056
+ voices: [...GEMINI_TTS_VOICES],
6057
+ formats: GEMINI_TTS_FORMATS,
6058
+ maxInputLength: 8e3,
6059
+ // bytes
6060
+ defaultVoice: "Zephyr",
6061
+ defaultFormat: "wav",
6062
+ features: {
6063
+ multiSpeaker: true,
6064
+ languages: 24,
6065
+ voiceInstructions: true
6066
+ }
6067
+ }
6068
+ ];
6069
+ }
6070
+ });
6071
+
5715
6072
  // src/providers/gemini.ts
6073
+ function wrapPcmInWav(pcmData, sampleRate, bitsPerSample, numChannels) {
6074
+ const byteRate = sampleRate * numChannels * bitsPerSample / 8;
6075
+ const blockAlign = numChannels * bitsPerSample / 8;
6076
+ const dataSize = pcmData.length;
6077
+ const headerSize = 44;
6078
+ const fileSize = headerSize + dataSize - 8;
6079
+ const buffer = new ArrayBuffer(headerSize + dataSize);
6080
+ const view = new DataView(buffer);
6081
+ const uint8 = new Uint8Array(buffer);
6082
+ view.setUint32(0, 1380533830, false);
6083
+ view.setUint32(4, fileSize, true);
6084
+ view.setUint32(8, 1463899717, false);
6085
+ view.setUint32(12, 1718449184, false);
6086
+ view.setUint32(16, 16, true);
6087
+ view.setUint16(20, 1, true);
6088
+ view.setUint16(22, numChannels, true);
6089
+ view.setUint32(24, sampleRate, true);
6090
+ view.setUint32(28, byteRate, true);
6091
+ view.setUint16(32, blockAlign, true);
6092
+ view.setUint16(34, bitsPerSample, true);
6093
+ view.setUint32(36, 1684108385, false);
6094
+ view.setUint32(40, dataSize, true);
6095
+ uint8.set(pcmData, headerSize);
6096
+ return buffer;
6097
+ }
5716
6098
  function createGeminiProviderFromEnv() {
5717
6099
  return createProviderFromEnv("GEMINI_API_KEY", import_genai.GoogleGenAI, GeminiGenerativeProvider);
5718
6100
  }
@@ -5723,7 +6105,9 @@ var init_gemini = __esm({
5723
6105
  import_genai = require("@google/genai");
5724
6106
  init_base_provider();
5725
6107
  init_constants2();
6108
+ init_gemini_image_models();
5726
6109
  init_gemini_models();
6110
+ init_gemini_speech_models();
5727
6111
  init_utils();
5728
6112
  GEMINI_ROLE_MAP = {
5729
6113
  system: "user",
@@ -5738,6 +6122,139 @@ var init_gemini = __esm({
5738
6122
  getModelSpecs() {
5739
6123
  return GEMINI_MODELS;
5740
6124
  }
6125
+ // =========================================================================
6126
+ // Image Generation
6127
+ // =========================================================================
6128
+ getImageModelSpecs() {
6129
+ return geminiImageModels;
6130
+ }
6131
+ supportsImageGeneration(modelId) {
6132
+ return isGeminiImageModel(modelId);
6133
+ }
6134
+ async generateImage(options) {
6135
+ const client = this.client;
6136
+ const spec = getGeminiImageModelSpec(options.model);
6137
+ const isImagenModel = options.model.startsWith("imagen");
6138
+ const aspectRatio = options.size ?? spec?.defaultSize ?? "1:1";
6139
+ const n = options.n ?? 1;
6140
+ if (isImagenModel) {
6141
+ const response2 = await client.models.generateImages({
6142
+ model: options.model,
6143
+ prompt: options.prompt,
6144
+ config: {
6145
+ numberOfImages: n,
6146
+ aspectRatio,
6147
+ outputMimeType: options.responseFormat === "b64_json" ? "image/png" : "image/jpeg"
6148
+ }
6149
+ });
6150
+ const images2 = response2.generatedImages ?? [];
6151
+ const cost2 = calculateGeminiImageCost(options.model, aspectRatio, images2.length);
6152
+ return {
6153
+ // Gemini's imageBytes is already base64 encoded, so use it directly
6154
+ images: images2.map((img) => ({
6155
+ b64Json: img.image?.imageBytes ?? void 0
6156
+ })),
6157
+ model: options.model,
6158
+ usage: {
6159
+ imagesGenerated: images2.length,
6160
+ size: aspectRatio,
6161
+ quality: "standard"
6162
+ },
6163
+ cost: cost2
6164
+ };
6165
+ }
6166
+ const response = await client.models.generateContent({
6167
+ model: options.model,
6168
+ contents: [{ role: "user", parts: [{ text: options.prompt }] }],
6169
+ config: {
6170
+ responseModalities: [import_genai.Modality.IMAGE, import_genai.Modality.TEXT]
6171
+ }
6172
+ });
6173
+ const images = [];
6174
+ const candidate = response.candidates?.[0];
6175
+ if (candidate?.content?.parts) {
6176
+ for (const part of candidate.content.parts) {
6177
+ if ("inlineData" in part && part.inlineData) {
6178
+ images.push({
6179
+ b64Json: part.inlineData.data
6180
+ });
6181
+ }
6182
+ }
6183
+ }
6184
+ const cost = calculateGeminiImageCost(options.model, aspectRatio, images.length);
6185
+ return {
6186
+ images,
6187
+ model: options.model,
6188
+ usage: {
6189
+ imagesGenerated: images.length,
6190
+ size: aspectRatio,
6191
+ quality: "standard"
6192
+ },
6193
+ cost
6194
+ };
6195
+ }
6196
+ // =========================================================================
6197
+ // Speech Generation
6198
+ // =========================================================================
6199
+ getSpeechModelSpecs() {
6200
+ return geminiSpeechModels;
6201
+ }
6202
+ supportsSpeechGeneration(modelId) {
6203
+ return isGeminiSpeechModel(modelId);
6204
+ }
6205
+ async generateSpeech(options) {
6206
+ const client = this.client;
6207
+ const spec = getGeminiSpeechModelSpec(options.model);
6208
+ const voice = options.voice ?? spec?.defaultVoice ?? "Zephyr";
6209
+ const response = await client.models.generateContent({
6210
+ model: options.model,
6211
+ contents: [
6212
+ {
6213
+ role: "user",
6214
+ parts: [{ text: options.input }]
6215
+ }
6216
+ ],
6217
+ config: {
6218
+ responseModalities: [import_genai.Modality.AUDIO],
6219
+ speechConfig: {
6220
+ voiceConfig: {
6221
+ prebuiltVoiceConfig: {
6222
+ voiceName: voice
6223
+ }
6224
+ }
6225
+ }
6226
+ }
6227
+ });
6228
+ let pcmData;
6229
+ const candidate = response.candidates?.[0];
6230
+ if (candidate?.content?.parts) {
6231
+ for (const part of candidate.content.parts) {
6232
+ if ("inlineData" in part && part.inlineData?.data) {
6233
+ const base64 = part.inlineData.data;
6234
+ const binary = atob(base64);
6235
+ pcmData = new Uint8Array(binary.length);
6236
+ for (let i = 0; i < binary.length; i++) {
6237
+ pcmData[i] = binary.charCodeAt(i);
6238
+ }
6239
+ break;
6240
+ }
6241
+ }
6242
+ }
6243
+ if (!pcmData) {
6244
+ throw new Error("No audio data in Gemini TTS response");
6245
+ }
6246
+ const audioData = wrapPcmInWav(pcmData, 24e3, 16, 1);
6247
+ const cost = calculateGeminiSpeechCost(options.model, options.input.length);
6248
+ return {
6249
+ audio: audioData,
6250
+ model: options.model,
6251
+ usage: {
6252
+ characterCount: options.input.length
6253
+ },
6254
+ cost,
6255
+ format: spec?.defaultFormat ?? "wav"
6256
+ };
6257
+ }
5741
6258
  buildRequestPayload(options, descriptor, _spec, messages) {
5742
6259
  const contents = this.convertMessagesToContents(messages);
5743
6260
  const generationConfig = this.buildGenerationConfig(options);
@@ -5933,6 +6450,121 @@ var init_gemini = __esm({
5933
6450
  }
5934
6451
  });
5935
6452
 
6453
+ // src/providers/openai-image-models.ts
6454
+ function getOpenAIImageModelSpec(modelId) {
6455
+ return openaiImageModels.find((m) => m.modelId === modelId);
6456
+ }
6457
+ function isOpenAIImageModel(modelId) {
6458
+ return openaiImageModels.some((m) => m.modelId === modelId);
6459
+ }
6460
+ function calculateOpenAIImageCost(modelId, size, quality = "standard", n = 1) {
6461
+ const spec = getOpenAIImageModelSpec(modelId);
6462
+ if (!spec) return void 0;
6463
+ const sizePrice = spec.pricing.bySize?.[size];
6464
+ if (sizePrice === void 0) return void 0;
6465
+ let pricePerImage;
6466
+ if (typeof sizePrice === "number") {
6467
+ pricePerImage = sizePrice;
6468
+ } else {
6469
+ pricePerImage = sizePrice[quality];
6470
+ if (pricePerImage === void 0) return void 0;
6471
+ }
6472
+ return pricePerImage * n;
6473
+ }
6474
+ var GPT_IMAGE_SIZES, GPT_IMAGE_QUALITIES, DALLE3_SIZES, DALLE3_QUALITIES, DALLE2_SIZES, openaiImageModels;
6475
+ var init_openai_image_models = __esm({
6476
+ "src/providers/openai-image-models.ts"() {
6477
+ "use strict";
6478
+ GPT_IMAGE_SIZES = ["1024x1024", "1024x1536", "1536x1024"];
6479
+ GPT_IMAGE_QUALITIES = ["low", "medium", "high"];
6480
+ DALLE3_SIZES = ["1024x1024", "1024x1792", "1792x1024"];
6481
+ DALLE3_QUALITIES = ["standard", "hd"];
6482
+ DALLE2_SIZES = ["256x256", "512x512", "1024x1024"];
6483
+ openaiImageModels = [
6484
+ // GPT Image 1 Family (flagship)
6485
+ {
6486
+ provider: "openai",
6487
+ modelId: "gpt-image-1",
6488
+ displayName: "GPT Image 1",
6489
+ pricing: {
6490
+ bySize: {
6491
+ "1024x1024": { low: 0.011, medium: 0.04, high: 0.17 },
6492
+ "1024x1536": { low: 0.016, medium: 0.06, high: 0.25 },
6493
+ "1536x1024": { low: 0.016, medium: 0.06, high: 0.25 }
6494
+ }
6495
+ },
6496
+ supportedSizes: [...GPT_IMAGE_SIZES],
6497
+ supportedQualities: [...GPT_IMAGE_QUALITIES],
6498
+ maxImages: 1,
6499
+ defaultSize: "1024x1024",
6500
+ defaultQuality: "medium",
6501
+ features: {
6502
+ textRendering: true,
6503
+ transparency: true
6504
+ }
6505
+ },
6506
+ {
6507
+ provider: "openai",
6508
+ modelId: "gpt-image-1-mini",
6509
+ displayName: "GPT Image 1 Mini",
6510
+ pricing: {
6511
+ bySize: {
6512
+ "1024x1024": { low: 5e-3, medium: 0.02, high: 0.052 },
6513
+ "1024x1536": { low: 75e-4, medium: 0.03, high: 0.078 },
6514
+ "1536x1024": { low: 75e-4, medium: 0.03, high: 0.078 }
6515
+ }
6516
+ },
6517
+ supportedSizes: [...GPT_IMAGE_SIZES],
6518
+ supportedQualities: [...GPT_IMAGE_QUALITIES],
6519
+ maxImages: 1,
6520
+ defaultSize: "1024x1024",
6521
+ defaultQuality: "medium",
6522
+ features: {
6523
+ textRendering: true,
6524
+ transparency: true
6525
+ }
6526
+ },
6527
+ // DALL-E Family
6528
+ {
6529
+ provider: "openai",
6530
+ modelId: "dall-e-3",
6531
+ displayName: "DALL-E 3",
6532
+ pricing: {
6533
+ bySize: {
6534
+ "1024x1024": { standard: 0.04, hd: 0.08 },
6535
+ "1024x1792": { standard: 0.08, hd: 0.12 },
6536
+ "1792x1024": { standard: 0.08, hd: 0.12 }
6537
+ }
6538
+ },
6539
+ supportedSizes: [...DALLE3_SIZES],
6540
+ supportedQualities: [...DALLE3_QUALITIES],
6541
+ maxImages: 1,
6542
+ // DALL-E 3 only supports n=1
6543
+ defaultSize: "1024x1024",
6544
+ defaultQuality: "standard",
6545
+ features: {
6546
+ textRendering: true
6547
+ }
6548
+ },
6549
+ {
6550
+ provider: "openai",
6551
+ modelId: "dall-e-2",
6552
+ displayName: "DALL-E 2 (Legacy)",
6553
+ pricing: {
6554
+ bySize: {
6555
+ "256x256": 0.016,
6556
+ "512x512": 0.018,
6557
+ "1024x1024": 0.02
6558
+ }
6559
+ },
6560
+ supportedSizes: [...DALLE2_SIZES],
6561
+ maxImages: 10,
6562
+ defaultSize: "1024x1024"
6563
+ }
6564
+ ];
6565
+ }
6566
+ });
6567
+
5936
6568
  // src/providers/openai-models.ts
5937
6569
  var OPENAI_MODELS;
5938
6570
  var init_openai_models = __esm({
@@ -6297,6 +6929,144 @@ var init_openai_models = __esm({
6297
6929
  }
6298
6930
  });
6299
6931
 
6932
+ // src/providers/openai-speech-models.ts
6933
+ function getOpenAISpeechModelSpec(modelId) {
6934
+ return openaiSpeechModels.find((m) => m.modelId === modelId);
6935
+ }
6936
+ function isOpenAISpeechModel(modelId) {
6937
+ return openaiSpeechModels.some((m) => m.modelId === modelId);
6938
+ }
6939
+ function calculateOpenAISpeechCost(modelId, characterCount, estimatedMinutes) {
6940
+ const spec = getOpenAISpeechModelSpec(modelId);
6941
+ if (!spec) return void 0;
6942
+ if (spec.pricing.perCharacter !== void 0) {
6943
+ return characterCount * spec.pricing.perCharacter;
6944
+ }
6945
+ if (spec.pricing.perMinute !== void 0 && estimatedMinutes !== void 0) {
6946
+ return estimatedMinutes * spec.pricing.perMinute;
6947
+ }
6948
+ if (spec.pricing.perMinute !== void 0) {
6949
+ const approxMinutes = characterCount / 750;
6950
+ return approxMinutes * spec.pricing.perMinute;
6951
+ }
6952
+ return void 0;
6953
+ }
6954
+ var OPENAI_TTS_VOICES, OPENAI_TTS_EXTENDED_VOICES, OPENAI_TTS_FORMATS, openaiSpeechModels;
6955
+ var init_openai_speech_models = __esm({
6956
+ "src/providers/openai-speech-models.ts"() {
6957
+ "use strict";
6958
+ OPENAI_TTS_VOICES = [
6959
+ "alloy",
6960
+ "echo",
6961
+ "fable",
6962
+ "onyx",
6963
+ "nova",
6964
+ "shimmer"
6965
+ ];
6966
+ OPENAI_TTS_EXTENDED_VOICES = [
6967
+ ...OPENAI_TTS_VOICES,
6968
+ "ash",
6969
+ "ballad",
6970
+ "coral",
6971
+ "sage",
6972
+ "verse"
6973
+ ];
6974
+ OPENAI_TTS_FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"];
6975
+ openaiSpeechModels = [
6976
+ // Standard TTS models (character-based pricing)
6977
+ {
6978
+ provider: "openai",
6979
+ modelId: "tts-1",
6980
+ displayName: "TTS-1",
6981
+ pricing: {
6982
+ // $15 per 1M characters = $0.000015 per character
6983
+ perCharacter: 15e-6
6984
+ },
6985
+ voices: [...OPENAI_TTS_VOICES],
6986
+ formats: OPENAI_TTS_FORMATS,
6987
+ maxInputLength: 4096,
6988
+ defaultVoice: "alloy",
6989
+ defaultFormat: "mp3",
6990
+ features: {
6991
+ voiceInstructions: false
6992
+ }
6993
+ },
6994
+ {
6995
+ provider: "openai",
6996
+ modelId: "tts-1-1106",
6997
+ displayName: "TTS-1 (Nov 2023)",
6998
+ pricing: {
6999
+ perCharacter: 15e-6
7000
+ },
7001
+ voices: [...OPENAI_TTS_VOICES],
7002
+ formats: OPENAI_TTS_FORMATS,
7003
+ maxInputLength: 4096,
7004
+ defaultVoice: "alloy",
7005
+ defaultFormat: "mp3",
7006
+ features: {
7007
+ voiceInstructions: false
7008
+ }
7009
+ },
7010
+ {
7011
+ provider: "openai",
7012
+ modelId: "tts-1-hd",
7013
+ displayName: "TTS-1 HD",
7014
+ pricing: {
7015
+ // $30 per 1M characters = $0.00003 per character
7016
+ perCharacter: 3e-5
7017
+ },
7018
+ voices: [...OPENAI_TTS_VOICES],
7019
+ formats: OPENAI_TTS_FORMATS,
7020
+ maxInputLength: 4096,
7021
+ defaultVoice: "alloy",
7022
+ defaultFormat: "mp3",
7023
+ features: {
7024
+ voiceInstructions: false
7025
+ }
7026
+ },
7027
+ {
7028
+ provider: "openai",
7029
+ modelId: "tts-1-hd-1106",
7030
+ displayName: "TTS-1 HD (Nov 2023)",
7031
+ pricing: {
7032
+ perCharacter: 3e-5
7033
+ },
7034
+ voices: [...OPENAI_TTS_VOICES],
7035
+ formats: OPENAI_TTS_FORMATS,
7036
+ maxInputLength: 4096,
7037
+ defaultVoice: "alloy",
7038
+ defaultFormat: "mp3",
7039
+ features: {
7040
+ voiceInstructions: false
7041
+ }
7042
+ },
7043
+ // Token-based TTS model with voice instructions support
7044
+ {
7045
+ provider: "openai",
7046
+ modelId: "gpt-4o-mini-tts",
7047
+ displayName: "GPT-4o Mini TTS",
7048
+ pricing: {
7049
+ // $0.60 per 1M input tokens = $0.0000006 per token
7050
+ perInputToken: 6e-7,
7051
+ // $12 per 1M audio output tokens = $0.000012 per token
7052
+ perAudioOutputToken: 12e-6,
7053
+ // ~$0.015 per minute of audio
7054
+ perMinute: 0.015
7055
+ },
7056
+ voices: [...OPENAI_TTS_EXTENDED_VOICES],
7057
+ formats: OPENAI_TTS_FORMATS,
7058
+ maxInputLength: 2e3,
7059
+ // tokens, not characters
7060
+ defaultVoice: "alloy",
7061
+ defaultFormat: "mp3",
7062
+ features: {
7063
+ voiceInstructions: true
7064
+ }
7065
+ }
7066
+ ];
7067
+ }
7068
+ });
7069
+
6300
7070
  // src/providers/openai.ts
6301
7071
  function sanitizeExtra(extra, allowTemperature) {
6302
7072
  if (!extra) {
@@ -6318,7 +7088,9 @@ var init_openai = __esm({
6318
7088
  import_tiktoken = require("tiktoken");
6319
7089
  init_base_provider();
6320
7090
  init_constants2();
7091
+ init_openai_image_models();
6321
7092
  init_openai_models();
7093
+ init_openai_speech_models();
6322
7094
  init_utils();
6323
7095
  ROLE_MAP = {
6324
7096
  system: "system",
@@ -6333,6 +7105,87 @@ var init_openai = __esm({
6333
7105
  getModelSpecs() {
6334
7106
  return OPENAI_MODELS;
6335
7107
  }
7108
+ // =========================================================================
7109
+ // Image Generation
7110
+ // =========================================================================
7111
+ getImageModelSpecs() {
7112
+ return openaiImageModels;
7113
+ }
7114
+ supportsImageGeneration(modelId) {
7115
+ return isOpenAIImageModel(modelId);
7116
+ }
7117
+ async generateImage(options) {
7118
+ const client = this.client;
7119
+ const spec = getOpenAIImageModelSpec(options.model);
7120
+ const size = options.size ?? spec?.defaultSize ?? "1024x1024";
7121
+ const quality = options.quality ?? spec?.defaultQuality ?? "standard";
7122
+ const n = options.n ?? 1;
7123
+ const isDallE2 = options.model === "dall-e-2";
7124
+ const isGptImage = options.model.startsWith("gpt-image");
7125
+ const requestParams = {
7126
+ model: options.model,
7127
+ prompt: options.prompt,
7128
+ size,
7129
+ n
7130
+ };
7131
+ if (!isDallE2 && !isGptImage) {
7132
+ requestParams.quality = quality;
7133
+ }
7134
+ if (isGptImage) {
7135
+ } else if (!isDallE2) {
7136
+ requestParams.response_format = options.responseFormat ?? "url";
7137
+ }
7138
+ const response = await client.images.generate(requestParams);
7139
+ const cost = calculateOpenAIImageCost(options.model, size, quality, n);
7140
+ const images = response.data ?? [];
7141
+ return {
7142
+ images: images.map((img) => ({
7143
+ url: img.url,
7144
+ b64Json: img.b64_json,
7145
+ revisedPrompt: img.revised_prompt
7146
+ })),
7147
+ model: options.model,
7148
+ usage: {
7149
+ imagesGenerated: images.length,
7150
+ size,
7151
+ quality
7152
+ },
7153
+ cost
7154
+ };
7155
+ }
7156
+ // =========================================================================
7157
+ // Speech Generation
7158
+ // =========================================================================
7159
+ getSpeechModelSpecs() {
7160
+ return openaiSpeechModels;
7161
+ }
7162
+ supportsSpeechGeneration(modelId) {
7163
+ return isOpenAISpeechModel(modelId);
7164
+ }
7165
+ async generateSpeech(options) {
7166
+ const client = this.client;
7167
+ const spec = getOpenAISpeechModelSpec(options.model);
7168
+ const format = options.responseFormat ?? spec?.defaultFormat ?? "mp3";
7169
+ const voice = options.voice ?? spec?.defaultVoice ?? "alloy";
7170
+ const response = await client.audio.speech.create({
7171
+ model: options.model,
7172
+ input: options.input,
7173
+ voice,
7174
+ response_format: format,
7175
+ speed: options.speed ?? 1
7176
+ });
7177
+ const audioBuffer = await response.arrayBuffer();
7178
+ const cost = calculateOpenAISpeechCost(options.model, options.input.length);
7179
+ return {
7180
+ audio: audioBuffer,
7181
+ model: options.model,
7182
+ usage: {
7183
+ characterCount: options.input.length
7184
+ },
7185
+ cost,
7186
+ format
7187
+ };
7188
+ }
6336
7189
  buildRequestPayload(options, descriptor, spec, messages) {
6337
7190
  const { maxTokens, temperature, topP, stopSequences, extra } = options;
6338
7191
  const supportsTemperature = spec?.metadata?.supportsTemperature !== false;
@@ -6673,30 +7526,109 @@ var init_model_registry = __esm({
6673
7526
  }
6674
7527
  });
6675
7528
 
6676
- // src/core/options.ts
6677
- var ModelIdentifierParser;
6678
- var init_options = __esm({
6679
- "src/core/options.ts"() {
7529
+ // src/core/namespaces/image.ts
7530
+ var ImageNamespace;
7531
+ var init_image = __esm({
7532
+ "src/core/namespaces/image.ts"() {
6680
7533
  "use strict";
6681
- ModelIdentifierParser = class {
6682
- constructor(defaultProvider = "openai") {
7534
+ ImageNamespace = class {
7535
+ constructor(adapters, defaultProvider) {
7536
+ this.adapters = adapters;
6683
7537
  this.defaultProvider = defaultProvider;
6684
7538
  }
6685
- parse(identifier) {
6686
- const trimmed = identifier.trim();
6687
- if (!trimmed) {
6688
- throw new Error("Model identifier cannot be empty");
7539
+ /**
7540
+ * Generate images from a text prompt.
7541
+ *
7542
+ * @param options - Image generation options
7543
+ * @returns Promise resolving to the generation result with images and cost
7544
+ * @throws Error if the provider doesn't support image generation
7545
+ */
7546
+ async generate(options) {
7547
+ const modelId = options.model;
7548
+ const adapter = this.findImageAdapter(modelId);
7549
+ if (!adapter || !adapter.generateImage) {
7550
+ throw new Error(
7551
+ `No provider supports image generation for model "${modelId}". Available image models: ${this.listModels().map((m) => m.modelId).join(", ")}`
7552
+ );
6689
7553
  }
6690
- const [maybeProvider, ...rest] = trimmed.split(":");
6691
- if (rest.length === 0) {
6692
- return { provider: this.defaultProvider, name: maybeProvider };
7554
+ return adapter.generateImage(options);
7555
+ }
7556
+ /**
7557
+ * List all available image generation models.
7558
+ */
7559
+ listModels() {
7560
+ const models = [];
7561
+ for (const adapter of this.adapters) {
7562
+ if (adapter.getImageModelSpecs) {
7563
+ models.push(...adapter.getImageModelSpecs());
7564
+ }
6693
7565
  }
6694
- const provider = maybeProvider;
6695
- const name = rest.join(":");
6696
- if (!name) {
6697
- throw new Error("Model name cannot be empty");
7566
+ return models;
7567
+ }
7568
+ /**
7569
+ * Check if a model is supported for image generation.
7570
+ */
7571
+ supportsModel(modelId) {
7572
+ return this.findImageAdapter(modelId) !== void 0;
7573
+ }
7574
+ findImageAdapter(modelId) {
7575
+ return this.adapters.find(
7576
+ (adapter) => adapter.supportsImageGeneration?.(modelId) ?? false
7577
+ );
7578
+ }
7579
+ };
7580
+ }
7581
+ });
7582
+
7583
+ // src/core/namespaces/speech.ts
7584
+ var SpeechNamespace;
7585
+ var init_speech = __esm({
7586
+ "src/core/namespaces/speech.ts"() {
7587
+ "use strict";
7588
+ SpeechNamespace = class {
7589
+ constructor(adapters, defaultProvider) {
7590
+ this.adapters = adapters;
7591
+ this.defaultProvider = defaultProvider;
7592
+ }
7593
+ /**
7594
+ * Generate speech audio from text.
7595
+ *
7596
+ * @param options - Speech generation options
7597
+ * @returns Promise resolving to the generation result with audio and cost
7598
+ * @throws Error if the provider doesn't support speech generation
7599
+ */
7600
+ async generate(options) {
7601
+ const modelId = options.model;
7602
+ const adapter = this.findSpeechAdapter(modelId);
7603
+ if (!adapter || !adapter.generateSpeech) {
7604
+ throw new Error(
7605
+ `No provider supports speech generation for model "${modelId}". Available speech models: ${this.listModels().map((m) => m.modelId).join(", ")}`
7606
+ );
6698
7607
  }
6699
- return { provider, name };
7608
+ return adapter.generateSpeech(options);
7609
+ }
7610
+ /**
7611
+ * List all available speech generation models.
7612
+ */
7613
+ listModels() {
7614
+ const models = [];
7615
+ for (const adapter of this.adapters) {
7616
+ if (adapter.getSpeechModelSpecs) {
7617
+ models.push(...adapter.getSpeechModelSpecs());
7618
+ }
7619
+ }
7620
+ return models;
7621
+ }
7622
+ /**
7623
+ * Check if a model is supported for speech generation.
7624
+ */
7625
+ supportsModel(modelId) {
7626
+ return this.findSpeechAdapter(modelId) !== void 0;
7627
+ }
7628
+ findSpeechAdapter(modelId) {
7629
+ return this.adapters.find(
7630
+ (adapter) => adapter.supportsSpeechGeneration?.(modelId) ?? false
7631
+ );
6700
7632
  }
6701
7633
  };
6702
7634
  }
@@ -6745,6 +7677,69 @@ var init_quick_methods = __esm({
6745
7677
  }
6746
7678
  });
6747
7679
 
7680
// src/core/namespaces/text.ts
var TextNamespace;
var init_text = __esm({
  "src/core/namespaces/text.ts"() {
    "use strict";
    init_quick_methods();
    /**
     * Thin convenience wrapper that surfaces the quick text helpers
     * (`complete` / `stream`) as `client.text.*` methods.
     */
    TextNamespace = class {
      /** @param client - The owning LLMist client, forwarded to each helper. */
      constructor(client) {
        this.client = client;
      }
      /**
       * Generate a complete text response.
       *
       * @param prompt - User prompt
       * @param options - Optional configuration
       * @returns Complete text response
       */
      async complete(prompt, options) {
        return complete(this.client, prompt, options);
      }
      /**
       * Stream text chunks.
       *
       * @param prompt - User prompt
       * @param options - Optional configuration
       * @returns Async generator yielding text chunks
       */
      stream(prompt, options) {
        return stream(this.client, prompt, options);
      }
    };
  }
});
7713
+
7714
// src/core/options.ts
var ModelIdentifierParser;
var init_options = __esm({
  "src/core/options.ts"() {
    "use strict";
    /**
     * Parses "provider:model" identifiers, falling back to a default
     * provider when the identifier carries no explicit prefix.
     */
    ModelIdentifierParser = class {
      /** @param defaultProvider - Provider used when none is specified. */
      constructor(defaultProvider = "openai") {
        this.defaultProvider = defaultProvider;
      }
      /**
       * Split a model identifier into `{ provider, name }`.
       *
       * Accepted shapes:
       *   "gpt-4"            -> { provider: <default>, name: "gpt-4" }
       *   "openai:gpt-4"     -> { provider: "openai", name: "gpt-4" }
       *   "ollama:llama3:8b" -> { provider: "ollama", name: "llama3:8b" }
       *     (only the first ":" separates provider from name)
       *
       * @param identifier - Raw model identifier string
       * @returns Parsed { provider, name } descriptor
       * @throws Error when the identifier, provider, or model name is empty
       */
      parse(identifier) {
        const trimmed = identifier.trim();
        if (!trimmed) {
          throw new Error("Model identifier cannot be empty");
        }
        const [maybeProvider, ...rest] = trimmed.split(":");
        if (rest.length === 0) {
          return { provider: this.defaultProvider, name: maybeProvider };
        }
        const provider = maybeProvider;
        // Fix: identifiers like ":gpt-4" previously yielded an empty
        // provider that only failed later with a confusing adapter-lookup
        // error; reject it here, symmetric with the empty-name check.
        if (!provider) {
          throw new Error("Provider cannot be empty");
        }
        const name = rest.join(":");
        if (!name) {
          throw new Error("Model name cannot be empty");
        }
        return { provider, name };
      }
    };
  }
});
7742
+
6748
7743
  // src/core/client.ts
6749
7744
  var client_exports = {};
6750
7745
  __export(client_exports, {
@@ -6757,12 +7752,20 @@ var init_client = __esm({
6757
7752
  init_builder();
6758
7753
  init_discovery();
6759
7754
  init_model_registry();
7755
+ init_image();
7756
+ init_speech();
7757
+ init_text();
6760
7758
  init_options();
6761
7759
  init_quick_methods();
6762
7760
  LLMist = class _LLMist {
6763
7761
  parser;
7762
+ defaultProvider;
6764
7763
  modelRegistry;
6765
7764
  adapters;
7765
+ // Namespaces for different generation types
7766
+ text;
7767
+ image;
7768
+ speech;
6766
7769
  constructor(...args) {
6767
7770
  let adapters = [];
6768
7771
  let defaultProvider;
@@ -6801,6 +7804,7 @@ var init_client = __esm({
6801
7804
  const priorityB = b.priority ?? 0;
6802
7805
  return priorityB - priorityA;
6803
7806
  });
7807
+ this.defaultProvider = resolvedDefaultProvider;
6804
7808
  this.parser = new ModelIdentifierParser(resolvedDefaultProvider);
6805
7809
  this.modelRegistry = new ModelRegistry();
6806
7810
  for (const adapter of this.adapters) {
@@ -6809,6 +7813,9 @@ var init_client = __esm({
6809
7813
  if (customModels.length > 0) {
6810
7814
  this.modelRegistry.registerModels(customModels);
6811
7815
  }
7816
+ this.text = new TextNamespace(this);
7817
+ this.image = new ImageNamespace(this.adapters, this.defaultProvider);
7818
+ this.speech = new SpeechNamespace(this.adapters, this.defaultProvider);
6812
7819
  }
6813
7820
  stream(options) {
6814
7821
  const descriptor = this.parser.parse(options.model);