llmist 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2542,7 +2542,27 @@ var init_cost_reporting_client = __esm({
2542
2542
  constructor(client, reportCost) {
2543
2543
  this.client = client;
2544
2544
  this.reportCost = reportCost;
2545
+ this.image = {
2546
+ generate: async (options) => {
2547
+ const result = await this.client.image.generate(options);
2548
+ if (result.cost !== void 0 && result.cost > 0) {
2549
+ this.reportCost(result.cost);
2550
+ }
2551
+ return result;
2552
+ }
2553
+ };
2554
+ this.speech = {
2555
+ generate: async (options) => {
2556
+ const result = await this.client.speech.generate(options);
2557
+ if (result.cost !== void 0 && result.cost > 0) {
2558
+ this.reportCost(result.cost);
2559
+ }
2560
+ return result;
2561
+ }
2562
+ };
2545
2563
  }
2564
+ image;
2565
+ speech;
2546
2566
  /**
2547
2567
  * Access to model registry for cost estimation.
2548
2568
  */
@@ -5430,6 +5450,28 @@ var init_anthropic = __esm({
5430
5450
  getModelSpecs() {
5431
5451
  return ANTHROPIC_MODELS;
5432
5452
  }
5453
+ // =========================================================================
5454
+ // Image Generation (Not Supported)
5455
+ // =========================================================================
5456
+ supportsImageGeneration(_modelId) {
5457
+ return false;
5458
+ }
5459
+ async generateImage() {
5460
+ throw new Error(
5461
+ "Anthropic does not support image generation. Use OpenAI (DALL-E, GPT Image) or Google Gemini (Imagen) instead."
5462
+ );
5463
+ }
5464
+ // =========================================================================
5465
+ // Speech Generation (Not Supported)
5466
+ // =========================================================================
5467
+ supportsSpeechGeneration(_modelId) {
5468
+ return false;
5469
+ }
5470
+ async generateSpeech() {
5471
+ throw new Error(
5472
+ "Anthropic does not support speech generation. Use OpenAI (TTS) or Google Gemini (TTS) instead."
5473
+ );
5474
+ }
5433
5475
  buildRequestPayload(options, descriptor, spec, messages) {
5434
5476
  const systemMessages = messages.filter((message) => message.role === "system");
5435
5477
  const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
@@ -5584,6 +5626,182 @@ var init_anthropic = __esm({
5584
5626
  }
5585
5627
  });
5586
5628
 
5629
// src/providers/gemini-image-models.ts
/** Look up the spec for a Gemini image model, or undefined when unknown. */
function getGeminiImageModelSpec(modelId) {
  for (const spec of geminiImageModels) {
    if (spec.modelId === modelId) {
      return spec;
    }
  }
  return void 0;
}
/** True when the id names a known Gemini image model. */
function isGeminiImageModel(modelId) {
  return getGeminiImageModelSpec(modelId) !== void 0;
}
/**
 * Price a Gemini image request in dollars.
 *
 * @param modelId - Gemini image model id
 * @param size - Aspect ratio or resolution key (defaults to "1:1")
 * @param n - Number of images (defaults to 1)
 * @returns Total cost, or undefined when the model or size is not priced
 */
function calculateGeminiImageCost(modelId, size = "1:1", n = 1) {
  const spec = getGeminiImageModelSpec(modelId);
  if (!spec) return void 0;
  if (spec.pricing.perImage !== void 0) {
    return spec.pricing.perImage * n;
  }
  const sizePrice = spec.pricing.bySize?.[size];
  return typeof sizePrice === "number" ? sizePrice * n : void 0;
}
var IMAGEN4_ASPECT_RATIOS, GEMINI_IMAGE_ASPECT_RATIOS, geminiImageModels;
var init_gemini_image_models = __esm({
  "src/providers/gemini-image-models.ts"() {
    "use strict";
    IMAGEN4_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
    GEMINI_IMAGE_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
    // Imagen 4 entries differ only in id, display name and flat per-image price.
    const imagenSpec = (modelId, displayName, perImage) => ({
      provider: "gemini",
      modelId,
      displayName,
      pricing: { perImage },
      supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
      maxImages: 4,
      defaultSize: "1:1",
      features: { textRendering: true }
    });
    // Conversational (multimodal) Gemini models that emit one image per call.
    const flashImageSpec = (modelId, displayName) => ({
      provider: "gemini",
      modelId,
      displayName,
      pricing: { perImage: 0.039 },
      supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
      maxImages: 1,
      defaultSize: "1:1",
      features: { conversational: true, textRendering: true }
    });
    // Token-priced Gemini 3 Pro image models: ~$0.134 per 1K/2K image,
    // $0.24 per 4K image; 2K is the default.
    const proImageSpec = (modelId, displayName) => ({
      provider: "gemini",
      modelId,
      displayName,
      pricing: { bySize: { "1K": 0.134, "2K": 0.134, "4K": 0.24 } },
      supportedSizes: ["1K", "2K", "4K"],
      maxImages: 1,
      defaultSize: "2K",
      features: { conversational: true, textRendering: true }
    });
    geminiImageModels = [
      // Imagen 4 family (standalone image generation)
      imagenSpec("imagen-4.0-fast-generate-001", "Imagen 4 Fast", 0.02),
      imagenSpec("imagen-4.0-generate-001", "Imagen 4", 0.04),
      imagenSpec("imagen-4.0-ultra-generate-001", "Imagen 4 Ultra", 0.06),
      // Preview versions
      imagenSpec("imagen-4.0-generate-preview-06-06", "Imagen 4 (Preview)", 0.04),
      imagenSpec("imagen-4.0-ultra-generate-preview-06-06", "Imagen 4 Ultra (Preview)", 0.06),
      // Gemini native image generation (multimodal models)
      flashImageSpec("gemini-2.5-flash-image", "Gemini 2.5 Flash Image"),
      flashImageSpec("gemini-2.5-flash-image-preview", "Gemini 2.5 Flash Image (Preview)"),
      proImageSpec("gemini-3-pro-image-preview", "Gemini 3 Pro Image (Preview)"),
      // Alias: nano-banana-pro-preview is gemini-3-pro-image-preview
      proImageSpec("nano-banana-pro-preview", "Nano Banana Pro (Gemini 3 Pro Image)")
    ];
  }
});
5804
+
5587
5805
  // src/providers/gemini-models.ts
5588
5806
  var GEMINI_MODELS;
5589
5807
  var init_gemini_models = __esm({
@@ -5757,8 +5975,172 @@ var init_gemini_models = __esm({
5757
5975
  }
5758
5976
  });
5759
5977
 
5978
// src/providers/gemini-speech-models.ts
/** Look up the spec for a Gemini TTS model, or undefined when unknown. */
function getGeminiSpeechModelSpec(modelId) {
  for (const spec of geminiSpeechModels) {
    if (spec.modelId === modelId) {
      return spec;
    }
  }
  return void 0;
}
/** True when the id names a known Gemini TTS model. */
function isGeminiSpeechModel(modelId) {
  return getGeminiSpeechModelSpec(modelId) !== void 0;
}
/**
 * Estimate the dollar cost of a Gemini TTS request.
 * Uses the caller-supplied minute estimate when given; otherwise assumes
 * roughly 750 characters of input text per minute of audio.
 *
 * @returns Total cost, or undefined for unknown/unpriced models
 */
function calculateGeminiSpeechCost(modelId, characterCount, estimatedMinutes) {
  const spec = getGeminiSpeechModelSpec(modelId);
  if (!spec) return void 0;
  const perMinute = spec.pricing.perMinute;
  if (perMinute === void 0) return void 0;
  const minutes = estimatedMinutes !== void 0 ? estimatedMinutes : characterCount / 750;
  return minutes * perMinute;
}
var GEMINI_TTS_VOICES, GEMINI_TTS_FORMATS, geminiSpeechModels;
var init_gemini_speech_models = __esm({
  "src/providers/gemini-speech-models.ts"() {
    "use strict";
    // Prebuilt voice names, each annotated with its documented character.
    GEMINI_TTS_VOICES = [
      "Zephyr", // Bright
      "Puck", // Upbeat
      "Charon", // Informative
      "Kore", // Firm
      "Fenrir", // Excitable
      "Leda", // Youthful
      "Orus", // Firm
      "Aoede", // Breezy
      "Callirrhoe", // Easy-going
      "Autonoe", // Bright
      "Enceladus", // Breathy
      "Iapetus", // Clear
      "Umbriel", // Easy-going
      "Algieba", // Smooth
      "Despina", // Smooth
      "Erinome", // Clear
      "Algenib", // Gravelly
      "Rasalgethi", // Informative
      "Laomedeia", // Upbeat
      "Achernar", // Soft
      "Alnilam", // Firm
      "Schedar", // Even
      "Gacrux", // Mature
      "Pulcherrima", // Forward
      "Achird", // Friendly
      "Zubenelgenubi", // Casual
      "Vindemiatrix", // Gentle
      "Sadachbia", // Lively
      "Sadaltager", // Knowledgeable
      "Sulafat" // Warm
    ];
    GEMINI_TTS_FORMATS = ["pcm", "wav"];
    // Both preview TTS models share every field except pricing.
    const ttsSpec = (modelId, displayName, perInputToken, perAudioOutputToken, perMinute) => ({
      provider: "gemini",
      modelId,
      displayName,
      pricing: { perInputToken, perAudioOutputToken, perMinute },
      voices: [...GEMINI_TTS_VOICES],
      formats: GEMINI_TTS_FORMATS,
      maxInputLength: 8e3, // bytes (text + prompt combined)
      defaultVoice: "Zephyr",
      defaultFormat: "wav",
      features: {
        multiSpeaker: true,
        languages: 24,
        voiceInstructions: true
      }
    });
    geminiSpeechModels = [
      // $0.50 / 1M input tokens, $10 / 1M audio output tokens, ~$0.01 per minute
      ttsSpec("gemini-2.5-flash-preview-tts", "Gemini 2.5 Flash TTS (Preview)", 5e-7, 1e-5, 0.01),
      // $1.00 / 1M input tokens, $20 / 1M audio output tokens, ~$0.02 per minute
      ttsSpec("gemini-2.5-pro-preview-tts", "Gemini 2.5 Pro TTS (Preview)", 1e-6, 2e-5, 0.02)
    ];
  }
});
6116
+
5760
6117
  // src/providers/gemini.ts
5761
- import { FunctionCallingConfigMode, GoogleGenAI } from "@google/genai";
6118
+ import { FunctionCallingConfigMode, GoogleGenAI, Modality } from "@google/genai";
6119
/**
 * Wrap raw PCM samples in a canonical 44-byte RIFF/WAVE header.
 *
 * @param pcmData - Uint8Array of raw little-endian PCM bytes
 * @param sampleRate - Samples per second (e.g. 24000)
 * @param bitsPerSample - Bits per sample (e.g. 16)
 * @param numChannels - Channel count (1 = mono)
 * @returns ArrayBuffer holding a complete playable WAV file
 */
function wrapPcmInWav(pcmData, sampleRate, bitsPerSample, numChannels) {
  const HEADER_BYTES = 44;
  const byteRate = sampleRate * numChannels * bitsPerSample / 8;
  const blockAlign = numChannels * bitsPerSample / 8;
  const dataSize = pcmData.length;
  const buffer = new ArrayBuffer(HEADER_BYTES + dataSize);
  const view = new DataView(buffer);
  // Writes a 4-character ASCII chunk tag at the given offset.
  const writeTag = (offset, tag) => {
    for (let i = 0; i < 4; i++) {
      view.setUint8(offset + i, tag.charCodeAt(i));
    }
  };
  writeTag(0, "RIFF");
  view.setUint32(4, HEADER_BYTES + dataSize - 8, true); // RIFF chunk size = file size - 8
  writeTag(8, "WAVE");
  writeTag(12, "fmt ");
  view.setUint32(16, 16, true); // fmt sub-chunk size for PCM
  view.setUint16(20, 1, true); // audio format 1 = uncompressed PCM
  view.setUint16(22, numChannels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, bitsPerSample, true);
  writeTag(36, "data");
  view.setUint32(40, dataSize, true);
  new Uint8Array(buffer).set(pcmData, HEADER_BYTES);
  return buffer;
}
5762
6144
  function createGeminiProviderFromEnv() {
5763
6145
  return createProviderFromEnv("GEMINI_API_KEY", GoogleGenAI, GeminiGenerativeProvider);
5764
6146
  }
@@ -5768,7 +6150,9 @@ var init_gemini = __esm({
5768
6150
  "use strict";
5769
6151
  init_base_provider();
5770
6152
  init_constants2();
6153
+ init_gemini_image_models();
5771
6154
  init_gemini_models();
6155
+ init_gemini_speech_models();
5772
6156
  init_utils();
5773
6157
  GEMINI_ROLE_MAP = {
5774
6158
  system: "user",
@@ -5783,6 +6167,139 @@ var init_gemini = __esm({
5783
6167
  getModelSpecs() {
5784
6168
  return GEMINI_MODELS;
5785
6169
  }
6170
+ // =========================================================================
6171
+ // Image Generation
6172
+ // =========================================================================
6173
+ getImageModelSpecs() {
6174
+ return geminiImageModels;
6175
+ }
6176
+ supportsImageGeneration(modelId) {
6177
+ return isGeminiImageModel(modelId);
6178
+ }
6179
+ async generateImage(options) {
6180
+ const client = this.client;
6181
+ const spec = getGeminiImageModelSpec(options.model);
6182
+ const isImagenModel = options.model.startsWith("imagen");
6183
+ const aspectRatio = options.size ?? spec?.defaultSize ?? "1:1";
6184
+ const n = options.n ?? 1;
6185
+ if (isImagenModel) {
6186
+ const response2 = await client.models.generateImages({
6187
+ model: options.model,
6188
+ prompt: options.prompt,
6189
+ config: {
6190
+ numberOfImages: n,
6191
+ aspectRatio,
6192
+ outputMimeType: options.responseFormat === "b64_json" ? "image/png" : "image/jpeg"
6193
+ }
6194
+ });
6195
+ const images2 = response2.generatedImages ?? [];
6196
+ const cost2 = calculateGeminiImageCost(options.model, aspectRatio, images2.length);
6197
+ return {
6198
+ // Gemini's imageBytes is already base64 encoded, so use it directly
6199
+ images: images2.map((img) => ({
6200
+ b64Json: img.image?.imageBytes ?? void 0
6201
+ })),
6202
+ model: options.model,
6203
+ usage: {
6204
+ imagesGenerated: images2.length,
6205
+ size: aspectRatio,
6206
+ quality: "standard"
6207
+ },
6208
+ cost: cost2
6209
+ };
6210
+ }
6211
+ const response = await client.models.generateContent({
6212
+ model: options.model,
6213
+ contents: [{ role: "user", parts: [{ text: options.prompt }] }],
6214
+ config: {
6215
+ responseModalities: [Modality.IMAGE, Modality.TEXT]
6216
+ }
6217
+ });
6218
+ const images = [];
6219
+ const candidate = response.candidates?.[0];
6220
+ if (candidate?.content?.parts) {
6221
+ for (const part of candidate.content.parts) {
6222
+ if ("inlineData" in part && part.inlineData) {
6223
+ images.push({
6224
+ b64Json: part.inlineData.data
6225
+ });
6226
+ }
6227
+ }
6228
+ }
6229
+ const cost = calculateGeminiImageCost(options.model, aspectRatio, images.length);
6230
+ return {
6231
+ images,
6232
+ model: options.model,
6233
+ usage: {
6234
+ imagesGenerated: images.length,
6235
+ size: aspectRatio,
6236
+ quality: "standard"
6237
+ },
6238
+ cost
6239
+ };
6240
+ }
6241
+ // =========================================================================
6242
+ // Speech Generation
6243
+ // =========================================================================
6244
+ getSpeechModelSpecs() {
6245
+ return geminiSpeechModels;
6246
+ }
6247
+ supportsSpeechGeneration(modelId) {
6248
+ return isGeminiSpeechModel(modelId);
6249
+ }
6250
+ async generateSpeech(options) {
6251
+ const client = this.client;
6252
+ const spec = getGeminiSpeechModelSpec(options.model);
6253
+ const voice = options.voice ?? spec?.defaultVoice ?? "Zephyr";
6254
+ const response = await client.models.generateContent({
6255
+ model: options.model,
6256
+ contents: [
6257
+ {
6258
+ role: "user",
6259
+ parts: [{ text: options.input }]
6260
+ }
6261
+ ],
6262
+ config: {
6263
+ responseModalities: [Modality.AUDIO],
6264
+ speechConfig: {
6265
+ voiceConfig: {
6266
+ prebuiltVoiceConfig: {
6267
+ voiceName: voice
6268
+ }
6269
+ }
6270
+ }
6271
+ }
6272
+ });
6273
+ let pcmData;
6274
+ const candidate = response.candidates?.[0];
6275
+ if (candidate?.content?.parts) {
6276
+ for (const part of candidate.content.parts) {
6277
+ if ("inlineData" in part && part.inlineData?.data) {
6278
+ const base64 = part.inlineData.data;
6279
+ const binary = atob(base64);
6280
+ pcmData = new Uint8Array(binary.length);
6281
+ for (let i = 0; i < binary.length; i++) {
6282
+ pcmData[i] = binary.charCodeAt(i);
6283
+ }
6284
+ break;
6285
+ }
6286
+ }
6287
+ }
6288
+ if (!pcmData) {
6289
+ throw new Error("No audio data in Gemini TTS response");
6290
+ }
6291
+ const audioData = wrapPcmInWav(pcmData, 24e3, 16, 1);
6292
+ const cost = calculateGeminiSpeechCost(options.model, options.input.length);
6293
+ return {
6294
+ audio: audioData,
6295
+ model: options.model,
6296
+ usage: {
6297
+ characterCount: options.input.length
6298
+ },
6299
+ cost,
6300
+ format: spec?.defaultFormat ?? "wav"
6301
+ };
6302
+ }
5786
6303
  buildRequestPayload(options, descriptor, _spec, messages) {
5787
6304
  const contents = this.convertMessagesToContents(messages);
5788
6305
  const generationConfig = this.buildGenerationConfig(options);
@@ -5978,6 +6495,121 @@ var init_gemini = __esm({
5978
6495
  }
5979
6496
  });
5980
6497
 
6498
// src/providers/openai-image-models.ts
/** Look up the spec for an OpenAI image model, or undefined when unknown. */
function getOpenAIImageModelSpec(modelId) {
  for (const spec of openaiImageModels) {
    if (spec.modelId === modelId) {
      return spec;
    }
  }
  return void 0;
}
/** True when the id names a known OpenAI image model. */
function isOpenAIImageModel(modelId) {
  return getOpenAIImageModelSpec(modelId) !== void 0;
}
/**
 * Price an OpenAI image request in dollars. Pricing is keyed by size; a size
 * maps either to a flat price (DALL-E 2) or to a per-quality table
 * (DALL-E 3, GPT Image).
 *
 * @returns Total cost, or undefined when model/size/quality is not priced
 */
function calculateOpenAIImageCost(modelId, size, quality = "standard", n = 1) {
  const spec = getOpenAIImageModelSpec(modelId);
  if (!spec) return void 0;
  const sizePrice = spec.pricing.bySize?.[size];
  if (sizePrice === void 0) return void 0;
  const pricePerImage = typeof sizePrice === "number" ? sizePrice : sizePrice[quality];
  return pricePerImage === void 0 ? void 0 : pricePerImage * n;
}
var GPT_IMAGE_SIZES, GPT_IMAGE_QUALITIES, DALLE3_SIZES, DALLE3_QUALITIES, DALLE2_SIZES, openaiImageModels;
var init_openai_image_models = __esm({
  "src/providers/openai-image-models.ts"() {
    "use strict";
    GPT_IMAGE_SIZES = ["1024x1024", "1024x1536", "1536x1024"];
    GPT_IMAGE_QUALITIES = ["low", "medium", "high"];
    DALLE3_SIZES = ["1024x1024", "1024x1792", "1792x1024"];
    DALLE3_QUALITIES = ["standard", "hd"];
    DALLE2_SIZES = ["256x256", "512x512", "1024x1024"];
    // GPT Image models share everything except id, name and price table.
    const gptImageSpec = (modelId, displayName, bySize) => ({
      provider: "openai",
      modelId,
      displayName,
      pricing: { bySize },
      supportedSizes: [...GPT_IMAGE_SIZES],
      supportedQualities: [...GPT_IMAGE_QUALITIES],
      maxImages: 1,
      defaultSize: "1024x1024",
      defaultQuality: "medium",
      features: { textRendering: true, transparency: true }
    });
    openaiImageModels = [
      // GPT Image 1 family (flagship)
      gptImageSpec("gpt-image-1", "GPT Image 1", {
        "1024x1024": { low: 0.011, medium: 0.04, high: 0.17 },
        "1024x1536": { low: 0.016, medium: 0.06, high: 0.25 },
        "1536x1024": { low: 0.016, medium: 0.06, high: 0.25 }
      }),
      gptImageSpec("gpt-image-1-mini", "GPT Image 1 Mini", {
        "1024x1024": { low: 5e-3, medium: 0.02, high: 0.052 },
        "1024x1536": { low: 75e-4, medium: 0.03, high: 0.078 },
        "1536x1024": { low: 75e-4, medium: 0.03, high: 0.078 }
      }),
      // DALL-E family
      {
        provider: "openai",
        modelId: "dall-e-3",
        displayName: "DALL-E 3",
        pricing: {
          bySize: {
            "1024x1024": { standard: 0.04, hd: 0.08 },
            "1024x1792": { standard: 0.08, hd: 0.12 },
            "1792x1024": { standard: 0.08, hd: 0.12 }
          }
        },
        supportedSizes: [...DALLE3_SIZES],
        supportedQualities: [...DALLE3_QUALITIES],
        maxImages: 1, // DALL-E 3 only supports n=1
        defaultSize: "1024x1024",
        defaultQuality: "standard",
        features: { textRendering: true }
      },
      {
        provider: "openai",
        modelId: "dall-e-2",
        displayName: "DALL-E 2 (Legacy)",
        pricing: {
          bySize: {
            "256x256": 0.016,
            "512x512": 0.018,
            "1024x1024": 0.02
          }
        },
        supportedSizes: [...DALLE2_SIZES],
        maxImages: 10,
        defaultSize: "1024x1024"
      }
    ];
  }
});
+
5981
6613
  // src/providers/openai-models.ts
5982
6614
  var OPENAI_MODELS;
5983
6615
  var init_openai_models = __esm({
@@ -6342,6 +6974,144 @@ var init_openai_models = __esm({
6342
6974
  }
6343
6975
  });
6344
6976
 
6977
// src/providers/openai-speech-models.ts
/** Look up the spec for an OpenAI TTS model, or undefined when unknown. */
function getOpenAISpeechModelSpec(modelId) {
  for (const spec of openaiSpeechModels) {
    if (spec.modelId === modelId) {
      return spec;
    }
  }
  return void 0;
}
/** True when the id names a known OpenAI TTS model. */
function isOpenAISpeechModel(modelId) {
  return getOpenAISpeechModelSpec(modelId) !== void 0;
}
/**
 * Estimate the dollar cost of an OpenAI TTS request. Character-priced models
 * use the exact character count; minute-priced models use the caller's
 * estimate or assume roughly 750 characters per minute of audio.
 *
 * @returns Total cost, or undefined for unknown/unpriced models
 */
function calculateOpenAISpeechCost(modelId, characterCount, estimatedMinutes) {
  const spec = getOpenAISpeechModelSpec(modelId);
  if (!spec) return void 0;
  const { perCharacter, perMinute } = spec.pricing;
  if (perCharacter !== void 0) {
    return characterCount * perCharacter;
  }
  if (perMinute !== void 0) {
    const minutes = estimatedMinutes !== void 0 ? estimatedMinutes : characterCount / 750;
    return minutes * perMinute;
  }
  return void 0;
}
var OPENAI_TTS_VOICES, OPENAI_TTS_EXTENDED_VOICES, OPENAI_TTS_FORMATS, openaiSpeechModels;
var init_openai_speech_models = __esm({
  "src/providers/openai-speech-models.ts"() {
    "use strict";
    OPENAI_TTS_VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"];
    OPENAI_TTS_EXTENDED_VOICES = [...OPENAI_TTS_VOICES, "ash", "ballad", "coral", "sage", "verse"];
    OPENAI_TTS_FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"];
    // tts-1 family entries differ only in id, name and per-character rate.
    const standardTtsSpec = (modelId, displayName, perCharacter) => ({
      provider: "openai",
      modelId,
      displayName,
      pricing: { perCharacter },
      voices: [...OPENAI_TTS_VOICES],
      formats: OPENAI_TTS_FORMATS,
      maxInputLength: 4096,
      defaultVoice: "alloy",
      defaultFormat: "mp3",
      features: { voiceInstructions: false }
    });
    openaiSpeechModels = [
      // Standard TTS models (character-based pricing)
      standardTtsSpec("tts-1", "TTS-1", 15e-6), // $15 per 1M characters
      standardTtsSpec("tts-1-1106", "TTS-1 (Nov 2023)", 15e-6),
      standardTtsSpec("tts-1-hd", "TTS-1 HD", 3e-5), // $30 per 1M characters
      standardTtsSpec("tts-1-hd-1106", "TTS-1 HD (Nov 2023)", 3e-5),
      // Token-based TTS model with voice-instruction support
      {
        provider: "openai",
        modelId: "gpt-4o-mini-tts",
        displayName: "GPT-4o Mini TTS",
        pricing: {
          perInputToken: 6e-7, // $0.60 per 1M input tokens
          perAudioOutputToken: 12e-6, // $12 per 1M audio output tokens
          perMinute: 0.015 // ~$0.015 per minute of audio
        },
        voices: [...OPENAI_TTS_EXTENDED_VOICES],
        formats: OPENAI_TTS_FORMATS,
        maxInputLength: 2e3, // tokens, not characters
        defaultVoice: "alloy",
        defaultFormat: "mp3",
        features: { voiceInstructions: true }
      }
    ];
  }
});
7114
+
6345
7115
  // src/providers/openai.ts
6346
7116
  import OpenAI from "openai";
6347
7117
  import { encoding_for_model } from "tiktoken";
@@ -6363,7 +7133,9 @@ var init_openai = __esm({
6363
7133
  "use strict";
6364
7134
  init_base_provider();
6365
7135
  init_constants2();
7136
+ init_openai_image_models();
6366
7137
  init_openai_models();
7138
+ init_openai_speech_models();
6367
7139
  init_utils();
6368
7140
  ROLE_MAP = {
6369
7141
  system: "system",
@@ -6378,6 +7150,87 @@ var init_openai = __esm({
6378
7150
  getModelSpecs() {
6379
7151
  return OPENAI_MODELS;
6380
7152
  }
7153
+ // =========================================================================
7154
+ // Image Generation
7155
+ // =========================================================================
7156
+ getImageModelSpecs() {
7157
+ return openaiImageModels;
7158
+ }
7159
+ supportsImageGeneration(modelId) {
7160
+ return isOpenAIImageModel(modelId);
7161
+ }
7162
+ async generateImage(options) {
7163
+ const client = this.client;
7164
+ const spec = getOpenAIImageModelSpec(options.model);
7165
+ const size = options.size ?? spec?.defaultSize ?? "1024x1024";
7166
+ const quality = options.quality ?? spec?.defaultQuality ?? "standard";
7167
+ const n = options.n ?? 1;
7168
+ const isDallE2 = options.model === "dall-e-2";
7169
+ const isGptImage = options.model.startsWith("gpt-image");
7170
+ const requestParams = {
7171
+ model: options.model,
7172
+ prompt: options.prompt,
7173
+ size,
7174
+ n
7175
+ };
7176
+ if (!isDallE2 && !isGptImage) {
7177
+ requestParams.quality = quality;
7178
+ }
7179
+ if (isGptImage) {
7180
+ } else if (!isDallE2) {
7181
+ requestParams.response_format = options.responseFormat ?? "url";
7182
+ }
7183
+ const response = await client.images.generate(requestParams);
7184
+ const cost = calculateOpenAIImageCost(options.model, size, quality, n);
7185
+ const images = response.data ?? [];
7186
+ return {
7187
+ images: images.map((img) => ({
7188
+ url: img.url,
7189
+ b64Json: img.b64_json,
7190
+ revisedPrompt: img.revised_prompt
7191
+ })),
7192
+ model: options.model,
7193
+ usage: {
7194
+ imagesGenerated: images.length,
7195
+ size,
7196
+ quality
7197
+ },
7198
+ cost
7199
+ };
7200
+ }
7201
+ // =========================================================================
7202
+ // Speech Generation
7203
+ // =========================================================================
7204
+ getSpeechModelSpecs() {
7205
+ return openaiSpeechModels;
7206
+ }
7207
+ supportsSpeechGeneration(modelId) {
7208
+ return isOpenAISpeechModel(modelId);
7209
+ }
7210
+ async generateSpeech(options) {
7211
+ const client = this.client;
7212
+ const spec = getOpenAISpeechModelSpec(options.model);
7213
+ const format = options.responseFormat ?? spec?.defaultFormat ?? "mp3";
7214
+ const voice = options.voice ?? spec?.defaultVoice ?? "alloy";
7215
+ const response = await client.audio.speech.create({
7216
+ model: options.model,
7217
+ input: options.input,
7218
+ voice,
7219
+ response_format: format,
7220
+ speed: options.speed ?? 1
7221
+ });
7222
+ const audioBuffer = await response.arrayBuffer();
7223
+ const cost = calculateOpenAISpeechCost(options.model, options.input.length);
7224
+ return {
7225
+ audio: audioBuffer,
7226
+ model: options.model,
7227
+ usage: {
7228
+ characterCount: options.input.length
7229
+ },
7230
+ cost,
7231
+ format
7232
+ };
7233
+ }
6381
7234
  buildRequestPayload(options, descriptor, spec, messages) {
6382
7235
  const { maxTokens, temperature, topP, stopSequences, extra } = options;
6383
7236
  const supportsTemperature = spec?.metadata?.supportsTemperature !== false;
@@ -6718,30 +7571,109 @@ var init_model_registry = __esm({
6718
7571
  }
6719
7572
  });
6720
7573
 
6721
- // src/core/options.ts
6722
- var ModelIdentifierParser;
6723
- var init_options = __esm({
6724
- "src/core/options.ts"() {
7574
// src/core/namespaces/image.ts
var ImageNamespace;
var init_image = __esm({
  "src/core/namespaces/image.ts"() {
    "use strict";
    // client.image.* facade: routes each request to the first registered
    // provider adapter that claims support for the requested model.
    ImageNamespace = class {
      constructor(adapters, defaultProvider) {
        this.adapters = adapters;
        this.defaultProvider = defaultProvider;
      }
      /**
       * Generate images from a text prompt.
       *
       * @param options - Image generation options (must include model)
       * @returns Promise resolving to the generation result with images and cost
       * @throws Error if no registered provider supports the model
       */
      async generate(options) {
        const modelId = options.model;
        const adapter = this.findImageAdapter(modelId);
        if (adapter?.generateImage) {
          return adapter.generateImage(options);
        }
        throw new Error(
          `No provider supports image generation for model "${modelId}". Available image models: ${this.listModels().map((m) => m.modelId).join(", ")}`
        );
      }
      /**
       * List every image model spec across all registered adapters.
       */
      listModels() {
        const specs = [];
        for (const adapter of this.adapters) {
          if (adapter.getImageModelSpecs) {
            specs.push(...adapter.getImageModelSpecs());
          }
        }
        return specs;
      }
      /**
       * Check whether some registered adapter can generate images for a model.
       */
      supportsModel(modelId) {
        return this.findImageAdapter(modelId) !== void 0;
      }
      // First adapter claiming support for the model, or undefined.
      findImageAdapter(modelId) {
        for (const adapter of this.adapters) {
          if (adapter.supportsImageGeneration?.(modelId)) {
            return adapter;
          }
        }
        return void 0;
      }
    };
  }
});
7627
+
7628
+ // src/core/namespaces/speech.ts
7629
+ var SpeechNamespace;
7630
+ var init_speech = __esm({
7631
+ "src/core/namespaces/speech.ts"() {
7632
+ "use strict";
7633
+ SpeechNamespace = class {
7634
+ constructor(adapters, defaultProvider) {
7635
+ this.adapters = adapters;
7636
+ this.defaultProvider = defaultProvider;
7637
+ }
7638
+ /**
7639
+ * Generate speech audio from text.
7640
+ *
7641
+ * @param options - Speech generation options
7642
+ * @returns Promise resolving to the generation result with audio and cost
7643
+ * @throws Error if the provider doesn't support speech generation
7644
+ */
7645
+ async generate(options) {
7646
+ const modelId = options.model;
7647
+ const adapter = this.findSpeechAdapter(modelId);
7648
+ if (!adapter || !adapter.generateSpeech) {
7649
+ throw new Error(
7650
+ `No provider supports speech generation for model "${modelId}". Available speech models: ${this.listModels().map((m) => m.modelId).join(", ")}`
7651
+ );
6743
7652
  }
6744
- return { provider, name };
7653
+ return adapter.generateSpeech(options);
7654
+ }
7655
+ /**
7656
+ * List all available speech generation models.
7657
+ */
7658
+ listModels() {
7659
+ const models = [];
7660
+ for (const adapter of this.adapters) {
7661
+ if (adapter.getSpeechModelSpecs) {
7662
+ models.push(...adapter.getSpeechModelSpecs());
7663
+ }
7664
+ }
7665
+ return models;
7666
+ }
7667
+ /**
7668
+ * Check if a model is supported for speech generation.
7669
+ */
7670
+ supportsModel(modelId) {
7671
+ return this.findSpeechAdapter(modelId) !== void 0;
7672
+ }
7673
+ findSpeechAdapter(modelId) {
7674
+ return this.adapters.find(
7675
+ (adapter) => adapter.supportsSpeechGeneration?.(modelId) ?? false
7676
+ );
6745
7677
  }
6746
7678
  };
6747
7679
  }
@@ -6790,6 +7722,69 @@ var init_quick_methods = __esm({
6790
7722
  }
6791
7723
  });
6792
7724
 
7725
+ // src/core/namespaces/text.ts
7726
+ var TextNamespace;
7727
+ var init_text = __esm({
7728
+ "src/core/namespaces/text.ts"() {
7729
+ "use strict";
7730
+ init_quick_methods();
7731
+ TextNamespace = class {
7732
+ constructor(client) {
7733
+ this.client = client;
7734
+ }
7735
+ /**
7736
+ * Generate a complete text response.
7737
+ *
7738
+ * @param prompt - User prompt
7739
+ * @param options - Optional configuration
7740
+ * @returns Complete text response
7741
+ */
7742
+ async complete(prompt, options) {
7743
+ return complete(this.client, prompt, options);
7744
+ }
7745
+ /**
7746
+ * Stream text chunks.
7747
+ *
7748
+ * @param prompt - User prompt
7749
+ * @param options - Optional configuration
7750
+ * @returns Async generator yielding text chunks
7751
+ */
7752
+ stream(prompt, options) {
7753
+ return stream(this.client, prompt, options);
7754
+ }
7755
+ };
7756
+ }
7757
+ });
7758
+
7759
+ // src/core/options.ts
7760
+ var ModelIdentifierParser;
7761
+ var init_options = __esm({
7762
+ "src/core/options.ts"() {
7763
+ "use strict";
7764
+ ModelIdentifierParser = class {
7765
+ constructor(defaultProvider = "openai") {
7766
+ this.defaultProvider = defaultProvider;
7767
+ }
7768
+ parse(identifier) {
7769
+ const trimmed = identifier.trim();
7770
+ if (!trimmed) {
7771
+ throw new Error("Model identifier cannot be empty");
7772
+ }
7773
+ const [maybeProvider, ...rest] = trimmed.split(":");
7774
+ if (rest.length === 0) {
7775
+ return { provider: this.defaultProvider, name: maybeProvider };
7776
+ }
7777
+ const provider = maybeProvider;
7778
+ const name = rest.join(":");
7779
+ if (!name) {
7780
+ throw new Error("Model name cannot be empty");
7781
+ }
7782
+ return { provider, name };
7783
+ }
7784
+ };
7785
+ }
7786
+ });
7787
+
6793
7788
  // src/core/client.ts
6794
7789
  var client_exports = {};
6795
7790
  __export(client_exports, {
@@ -6802,12 +7797,20 @@ var init_client = __esm({
6802
7797
  init_builder();
6803
7798
  init_discovery();
6804
7799
  init_model_registry();
7800
+ init_image();
7801
+ init_speech();
7802
+ init_text();
6805
7803
  init_options();
6806
7804
  init_quick_methods();
6807
7805
  LLMist = class _LLMist {
6808
7806
  parser;
7807
+ defaultProvider;
6809
7808
  modelRegistry;
6810
7809
  adapters;
7810
+ // Namespaces for different generation types
7811
+ text;
7812
+ image;
7813
+ speech;
6811
7814
  constructor(...args) {
6812
7815
  let adapters = [];
6813
7816
  let defaultProvider;
@@ -6846,6 +7849,7 @@ var init_client = __esm({
6846
7849
  const priorityB = b.priority ?? 0;
6847
7850
  return priorityB - priorityA;
6848
7851
  });
7852
+ this.defaultProvider = resolvedDefaultProvider;
6849
7853
  this.parser = new ModelIdentifierParser(resolvedDefaultProvider);
6850
7854
  this.modelRegistry = new ModelRegistry();
6851
7855
  for (const adapter of this.adapters) {
@@ -6854,6 +7858,9 @@ var init_client = __esm({
6854
7858
  if (customModels.length > 0) {
6855
7859
  this.modelRegistry.registerModels(customModels);
6856
7860
  }
7861
+ this.text = new TextNamespace(this);
7862
+ this.image = new ImageNamespace(this.adapters, this.defaultProvider);
7863
+ this.speech = new SpeechNamespace(this.adapters, this.defaultProvider);
6857
7864
  }
6858
7865
  stream(options) {
6859
7866
  const descriptor = this.parser.parse(options.model);
@@ -8450,11 +9457,11 @@ export {
8450
9457
  init_discovery,
8451
9458
  ModelRegistry,
8452
9459
  init_model_registry,
8453
- ModelIdentifierParser,
8454
- init_options,
8455
9460
  complete,
8456
9461
  stream,
8457
9462
  init_quick_methods,
9463
+ ModelIdentifierParser,
9464
+ init_options,
8458
9465
  LLMist,
8459
9466
  init_client,
8460
9467
  AgentBuilder,
@@ -8501,4 +9508,4 @@ export {
8501
9508
  MockPromptRecorder,
8502
9509
  waitFor
8503
9510
  };
8504
- //# sourceMappingURL=chunk-GANXNBIZ.js.map
9511
+ //# sourceMappingURL=chunk-6ZDUWO6N.js.map