llmist 2.3.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -2555,7 +2555,27 @@ var init_cost_reporting_client = __esm({
2555
2555
  constructor(client, reportCost) {
2556
2556
  this.client = client;
2557
2557
  this.reportCost = reportCost;
2558
+ this.image = {
2559
+ generate: async (options) => {
2560
+ const result = await this.client.image.generate(options);
2561
+ if (result.cost !== void 0 && result.cost > 0) {
2562
+ this.reportCost(result.cost);
2563
+ }
2564
+ return result;
2565
+ }
2566
+ };
2567
+ this.speech = {
2568
+ generate: async (options) => {
2569
+ const result = await this.client.speech.generate(options);
2570
+ if (result.cost !== void 0 && result.cost > 0) {
2571
+ this.reportCost(result.cost);
2572
+ }
2573
+ return result;
2574
+ }
2575
+ };
2558
2576
  }
2577
+ image;
2578
+ speech;
2559
2579
  /**
2560
2580
  * Access to model registry for cost estimation.
2561
2581
  */
@@ -4648,6 +4668,28 @@ var init_anthropic = __esm({
4648
4668
  getModelSpecs() {
4649
4669
  return ANTHROPIC_MODELS;
4650
4670
  }
4671
+ // =========================================================================
4672
+ // Image Generation (Not Supported)
4673
+ // =========================================================================
4674
+ supportsImageGeneration(_modelId) {
4675
+ return false;
4676
+ }
4677
+ async generateImage() {
4678
+ throw new Error(
4679
+ "Anthropic does not support image generation. Use OpenAI (DALL-E, GPT Image) or Google Gemini (Imagen) instead."
4680
+ );
4681
+ }
4682
+ // =========================================================================
4683
+ // Speech Generation (Not Supported)
4684
+ // =========================================================================
4685
+ supportsSpeechGeneration(_modelId) {
4686
+ return false;
4687
+ }
4688
+ async generateSpeech() {
4689
+ throw new Error(
4690
+ "Anthropic does not support speech generation. Use OpenAI (TTS) or Google Gemini (TTS) instead."
4691
+ );
4692
+ }
4651
4693
  buildRequestPayload(options, descriptor, spec, messages) {
4652
4694
  const systemMessages = messages.filter((message) => message.role === "system");
4653
4695
  const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
@@ -4802,6 +4844,182 @@ var init_anthropic = __esm({
4802
4844
  }
4803
4845
  });
4804
4846
 
4847
+ // src/providers/gemini-image-models.ts
4848
/**
 * Look up the spec of a Gemini image model by its exact model id.
 *
 * @param modelId - Model identifier to search for
 * @returns The first matching entry of `geminiImageModels`, or `undefined`
 */
function getGeminiImageModelSpec(modelId) {
  for (const candidate of geminiImageModels) {
    if (candidate.modelId === modelId) {
      return candidate;
    }
  }
  return void 0;
}
4851
/**
 * Tell whether a model id belongs to a known Gemini image model.
 *
 * @param modelId - Model identifier to test
 * @returns `true` when `geminiImageModels` contains an entry with that id
 */
function isGeminiImageModel(modelId) {
  const position = geminiImageModels.findIndex((candidate) => candidate.modelId === modelId);
  return position !== -1;
}
4854
/**
 * Estimate the cost of generating `n` images with a Gemini image model.
 *
 * Flat `perImage` pricing takes precedence. For models priced `bySize`, the
 * requested size selects the tier; when the requested size is not a pricing
 * tier (including this function's own "1:1" default, which is an aspect
 * ratio rather than a tier name) the model's `defaultSize` tier is used
 * instead of giving up with `undefined`.
 *
 * @param modelId - Gemini image model id
 * @param size - Requested size / aspect ratio (default "1:1")
 * @param n - Number of images (default 1)
 * @returns Estimated cost, or `undefined` for unknown models or models without usable pricing
 */
function calculateGeminiImageCost(modelId, size = "1:1", n = 1) {
  const spec = getGeminiImageModelSpec(modelId);
  if (!spec) return void 0;
  const { perImage, bySize } = spec.pricing;
  if (perImage !== void 0) {
    return perImage * n;
  }
  if (bySize) {
    const requestedPrice = bySize[size];
    // Fall back to the default tier so tiered models still produce an estimate.
    const sizePrice = typeof requestedPrice === "number" ? requestedPrice : bySize[spec.defaultSize];
    if (typeof sizePrice === "number") {
      return sizePrice * n;
    }
  }
  return void 0;
}
4868
+ var IMAGEN4_ASPECT_RATIOS, GEMINI_IMAGE_ASPECT_RATIOS, geminiImageModels;
4869
// Lazy initializer for the Gemini image model catalogue: fills the aspect
// ratio lists and `geminiImageModels` when the bundle first runs it.
var init_gemini_image_models = __esm({
  "src/providers/gemini-image-models.ts"() {
    "use strict";
    IMAGEN4_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
    GEMINI_IMAGE_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
    // Imagen 4 family (standalone image generation): flat per-image price, up to 4 images.
    const imagen4Spec = (modelId, displayName, perImage) => ({
      provider: "gemini",
      modelId,
      displayName,
      pricing: { perImage },
      supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
      maxImages: 4,
      defaultSize: "1:1",
      features: { textRendering: true }
    });
    // Gemini 2.5 Flash Image (native multimodal generation): one image per request.
    const flashImageSpec = (modelId, displayName) => ({
      provider: "gemini",
      modelId,
      displayName,
      pricing: { perImage: 0.039 },
      supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
      maxImages: 1,
      defaultSize: "1:1",
      features: { conversational: true, textRendering: true }
    });
    // Gemini 3 Pro Image: token-based, ~$0.134 per 1K/2K image, $0.24 per 4K; 2K is the default.
    const proImageSpec = (modelId, displayName) => ({
      provider: "gemini",
      modelId,
      displayName,
      pricing: {
        bySize: { "1K": 0.134, "2K": 0.134, "4K": 0.24 }
      },
      supportedSizes: ["1K", "2K", "4K"],
      maxImages: 1,
      defaultSize: "2K",
      features: { conversational: true, textRendering: true }
    });
    geminiImageModels = [
      imagen4Spec("imagen-4.0-fast-generate-001", "Imagen 4 Fast", 0.02),
      imagen4Spec("imagen-4.0-generate-001", "Imagen 4", 0.04),
      imagen4Spec("imagen-4.0-ultra-generate-001", "Imagen 4 Ultra", 0.06),
      // Preview versions
      imagen4Spec("imagen-4.0-generate-preview-06-06", "Imagen 4 (Preview)", 0.04),
      imagen4Spec("imagen-4.0-ultra-generate-preview-06-06", "Imagen 4 Ultra (Preview)", 0.06),
      flashImageSpec("gemini-2.5-flash-image", "Gemini 2.5 Flash Image"),
      flashImageSpec("gemini-2.5-flash-image-preview", "Gemini 2.5 Flash Image (Preview)"),
      proImageSpec("gemini-3-pro-image-preview", "Gemini 3 Pro Image (Preview)"),
      // Alias: nano-banana-pro-preview is gemini-3-pro-image-preview
      proImageSpec("nano-banana-pro-preview", "Nano Banana Pro (Gemini 3 Pro Image)")
    ];
  }
});
5022
+
4805
5023
  // src/providers/gemini-models.ts
4806
5024
  var GEMINI_MODELS;
4807
5025
  var init_gemini_models = __esm({
@@ -4975,7 +5193,171 @@ var init_gemini_models = __esm({
4975
5193
  }
4976
5194
  });
4977
5195
 
5196
+ // src/providers/gemini-speech-models.ts
5197
/**
 * Look up the spec of a Gemini speech (TTS) model by its exact model id.
 *
 * @param modelId - Model identifier to search for
 * @returns The first matching entry of `geminiSpeechModels`, or `undefined`
 */
function getGeminiSpeechModelSpec(modelId) {
  for (const candidate of geminiSpeechModels) {
    if (candidate.modelId === modelId) {
      return candidate;
    }
  }
  return void 0;
}
5200
/**
 * Tell whether a model id belongs to a known Gemini speech model.
 *
 * @param modelId - Model identifier to test
 * @returns `true` when `geminiSpeechModels` contains an entry with that id
 */
function isGeminiSpeechModel(modelId) {
  const position = geminiSpeechModels.findIndex((candidate) => candidate.modelId === modelId);
  return position !== -1;
}
5203
/**
 * Estimate the cost of a Gemini speech generation from per-minute pricing.
 *
 * @param modelId - Gemini speech model id
 * @param characterCount - Length of the input text; used to approximate duration
 *   (750 characters per minute) when `estimatedMinutes` is not supplied
 * @param estimatedMinutes - Optional audio duration in minutes; preferred over the approximation
 * @returns Estimated cost, or `undefined` for unknown models or models without `perMinute` pricing
 */
function calculateGeminiSpeechCost(modelId, characterCount, estimatedMinutes) {
  const spec = getGeminiSpeechModelSpec(modelId);
  if (!spec) return void 0;
  const { perMinute } = spec.pricing;
  if (perMinute === void 0) return void 0;
  const minutes = estimatedMinutes !== void 0 ? estimatedMinutes : characterCount / 750;
  return minutes * perMinute;
}
5215
+ var GEMINI_TTS_VOICES, GEMINI_TTS_FORMATS, geminiSpeechModels;
5216
// Lazy initializer for the Gemini TTS catalogue: fills the voice list, the
// format list and `geminiSpeechModels` when the bundle first runs it.
var init_gemini_speech_models = __esm({
  "src/providers/gemini-speech-models.ts"() {
    "use strict";
    GEMINI_TTS_VOICES = [
      "Zephyr", // Bright
      "Puck", // Upbeat
      "Charon", // Informative
      "Kore", // Firm
      "Fenrir", // Excitable
      "Leda", // Youthful
      "Orus", // Firm
      "Aoede", // Breezy
      "Callirrhoe", // Easy-going
      "Autonoe", // Bright
      "Enceladus", // Breathy
      "Iapetus", // Clear
      "Umbriel", // Easy-going
      "Algieba", // Smooth
      "Despina", // Smooth
      "Erinome", // Clear
      "Algenib", // Gravelly
      "Rasalgethi", // Informative
      "Laomedeia", // Upbeat
      "Achernar", // Soft
      "Alnilam", // Firm
      "Schedar", // Even
      "Gacrux", // Mature
      "Pulcherrima", // Forward
      "Achird", // Friendly
      "Zubenelgenubi", // Casual
      "Vindemiatrix", // Gentle
      "Sadachbia", // Lively
      "Sadaltager", // Knowledgeable
      "Sulafat" // Warm
    ];
    GEMINI_TTS_FORMATS = ["pcm", "wav"];
    // Both TTS models differ only in id, display name and pricing.
    // `voices` is copied per model; `formats` deliberately shares the one list.
    const ttsSpec = (modelId, displayName, pricing) => ({
      provider: "gemini",
      modelId,
      displayName,
      pricing,
      voices: [...GEMINI_TTS_VOICES],
      formats: GEMINI_TTS_FORMATS,
      // bytes (text + prompt combined)
      maxInputLength: 8e3,
      defaultVoice: "Zephyr",
      defaultFormat: "wav",
      features: {
        multiSpeaker: true,
        languages: 24,
        voiceInstructions: true
      }
    });
    geminiSpeechModels = [
      ttsSpec("gemini-2.5-flash-preview-tts", "Gemini 2.5 Flash TTS (Preview)", {
        // $0.50 per 1M input tokens = $0.0000005 per token
        perInputToken: 5e-7,
        // $10.00 per 1M audio output tokens = $0.00001 per token
        perAudioOutputToken: 1e-5,
        // Rough estimate: ~$0.01 per minute of audio
        perMinute: 0.01
      }),
      ttsSpec("gemini-2.5-pro-preview-tts", "Gemini 2.5 Pro TTS (Preview)", {
        // $1.00 per 1M input tokens = $0.000001 per token
        perInputToken: 1e-6,
        // $20.00 per 1M audio output tokens = $0.00002 per token
        perAudioOutputToken: 2e-5,
        // Rough estimate: ~$0.02 per minute of audio
        perMinute: 0.02
      })
    ];
  }
});
5334
+
4978
5335
  // src/providers/gemini.ts
5336
/**
 * Prefix raw PCM sample data with a 44-byte RIFF/WAVE header (format tag 1,
 * linear PCM) and return the result as a new ArrayBuffer.
 *
 * The input is normalised to a byte view first, so chunk sizes are always
 * measured in bytes: a `Uint8Array` behaves as before, while other typed
 * arrays (e.g. `Int16Array`), `DataView`s and plain `ArrayBuffer`s are copied
 * byte-for-byte instead of being sized by element count.
 *
 * @param pcmData - PCM sample bytes (typed array, DataView, ArrayBuffer or array of byte values)
 * @param sampleRate - Samples per second, e.g. 24000
 * @param bitsPerSample - Bits per sample, e.g. 16
 * @param numChannels - Number of interleaved channels, e.g. 1
 * @returns ArrayBuffer holding header + data
 */
function wrapPcmInWav(pcmData, sampleRate, bitsPerSample, numChannels) {
  const pcmBytes = ArrayBuffer.isView(pcmData)
    ? new Uint8Array(pcmData.buffer, pcmData.byteOffset, pcmData.byteLength)
    : new Uint8Array(pcmData);
  const headerSize = 44;
  const dataSize = pcmBytes.byteLength;
  const byteRate = sampleRate * numChannels * bitsPerSample / 8;
  const blockAlign = numChannels * bitsPerSample / 8;
  const buffer = new ArrayBuffer(headerSize + dataSize);
  const view = new DataView(buffer);
  // Four-character chunk ids are stored as plain ASCII bytes.
  const writeTag = (offset, tag) => {
    for (let i = 0; i < 4; i++) {
      view.setUint8(offset + i, tag.charCodeAt(i));
    }
  };
  writeTag(0, "RIFF");
  // RIFF size counts everything after the first 8 bytes ("RIFF" + this field).
  view.setUint32(4, headerSize + dataSize - 8, true);
  writeTag(8, "WAVE");
  writeTag(12, "fmt ");
  view.setUint32(16, 16, true); // size of the fmt chunk body
  view.setUint16(20, 1, true); // audio format 1 = uncompressed PCM
  view.setUint16(22, numChannels, true);
  view.setUint32(24, sampleRate, true);
  view.setUint32(28, byteRate, true);
  view.setUint16(32, blockAlign, true);
  view.setUint16(34, bitsPerSample, true);
  writeTag(36, "data");
  view.setUint32(40, dataSize, true);
  new Uint8Array(buffer).set(pcmBytes, headerSize);
  return buffer;
}
4979
5361
  function createGeminiProviderFromEnv() {
4980
5362
  return createProviderFromEnv("GEMINI_API_KEY", import_genai.GoogleGenAI, GeminiGenerativeProvider);
4981
5363
  }
@@ -4986,7 +5368,9 @@ var init_gemini = __esm({
4986
5368
  import_genai = require("@google/genai");
4987
5369
  init_base_provider();
4988
5370
  init_constants2();
5371
+ init_gemini_image_models();
4989
5372
  init_gemini_models();
5373
+ init_gemini_speech_models();
4990
5374
  init_utils();
4991
5375
  GEMINI_ROLE_MAP = {
4992
5376
  system: "user",
@@ -5001,6 +5385,139 @@ var init_gemini = __esm({
5001
5385
  getModelSpecs() {
5002
5386
  return GEMINI_MODELS;
5003
5387
  }
5388
+ // =========================================================================
5389
+ // Image Generation
5390
+ // =========================================================================
5391
+ getImageModelSpecs() {
5392
+ return geminiImageModels;
5393
+ }
5394
+ supportsImageGeneration(modelId) {
5395
+ return isGeminiImageModel(modelId);
5396
+ }
5397
+ async generateImage(options) {
5398
+ const client = this.client;
5399
+ const spec = getGeminiImageModelSpec(options.model);
5400
+ const isImagenModel = options.model.startsWith("imagen");
5401
+ const aspectRatio = options.size ?? spec?.defaultSize ?? "1:1";
5402
+ const n = options.n ?? 1;
5403
+ if (isImagenModel) {
5404
+ const response2 = await client.models.generateImages({
5405
+ model: options.model,
5406
+ prompt: options.prompt,
5407
+ config: {
5408
+ numberOfImages: n,
5409
+ aspectRatio,
5410
+ outputMimeType: options.responseFormat === "b64_json" ? "image/png" : "image/jpeg"
5411
+ }
5412
+ });
5413
+ const images2 = response2.generatedImages ?? [];
5414
+ const cost2 = calculateGeminiImageCost(options.model, aspectRatio, images2.length);
5415
+ return {
5416
+ // Gemini's imageBytes is already base64 encoded, so use it directly
5417
+ images: images2.map((img) => ({
5418
+ b64Json: img.image?.imageBytes ?? void 0
5419
+ })),
5420
+ model: options.model,
5421
+ usage: {
5422
+ imagesGenerated: images2.length,
5423
+ size: aspectRatio,
5424
+ quality: "standard"
5425
+ },
5426
+ cost: cost2
5427
+ };
5428
+ }
5429
+ const response = await client.models.generateContent({
5430
+ model: options.model,
5431
+ contents: [{ role: "user", parts: [{ text: options.prompt }] }],
5432
+ config: {
5433
+ responseModalities: [import_genai.Modality.IMAGE, import_genai.Modality.TEXT]
5434
+ }
5435
+ });
5436
+ const images = [];
5437
+ const candidate = response.candidates?.[0];
5438
+ if (candidate?.content?.parts) {
5439
+ for (const part of candidate.content.parts) {
5440
+ if ("inlineData" in part && part.inlineData) {
5441
+ images.push({
5442
+ b64Json: part.inlineData.data
5443
+ });
5444
+ }
5445
+ }
5446
+ }
5447
+ const cost = calculateGeminiImageCost(options.model, aspectRatio, images.length);
5448
+ return {
5449
+ images,
5450
+ model: options.model,
5451
+ usage: {
5452
+ imagesGenerated: images.length,
5453
+ size: aspectRatio,
5454
+ quality: "standard"
5455
+ },
5456
+ cost
5457
+ };
5458
+ }
5459
+ // =========================================================================
5460
+ // Speech Generation
5461
+ // =========================================================================
5462
+ getSpeechModelSpecs() {
5463
+ return geminiSpeechModels;
5464
+ }
5465
+ supportsSpeechGeneration(modelId) {
5466
+ return isGeminiSpeechModel(modelId);
5467
+ }
5468
+ async generateSpeech(options) {
5469
+ const client = this.client;
5470
+ const spec = getGeminiSpeechModelSpec(options.model);
5471
+ const voice = options.voice ?? spec?.defaultVoice ?? "Zephyr";
5472
+ const response = await client.models.generateContent({
5473
+ model: options.model,
5474
+ contents: [
5475
+ {
5476
+ role: "user",
5477
+ parts: [{ text: options.input }]
5478
+ }
5479
+ ],
5480
+ config: {
5481
+ responseModalities: [import_genai.Modality.AUDIO],
5482
+ speechConfig: {
5483
+ voiceConfig: {
5484
+ prebuiltVoiceConfig: {
5485
+ voiceName: voice
5486
+ }
5487
+ }
5488
+ }
5489
+ }
5490
+ });
5491
+ let pcmData;
5492
+ const candidate = response.candidates?.[0];
5493
+ if (candidate?.content?.parts) {
5494
+ for (const part of candidate.content.parts) {
5495
+ if ("inlineData" in part && part.inlineData?.data) {
5496
+ const base64 = part.inlineData.data;
5497
+ const binary = atob(base64);
5498
+ pcmData = new Uint8Array(binary.length);
5499
+ for (let i = 0; i < binary.length; i++) {
5500
+ pcmData[i] = binary.charCodeAt(i);
5501
+ }
5502
+ break;
5503
+ }
5504
+ }
5505
+ }
5506
+ if (!pcmData) {
5507
+ throw new Error("No audio data in Gemini TTS response");
5508
+ }
5509
+ const audioData = wrapPcmInWav(pcmData, 24e3, 16, 1);
5510
+ const cost = calculateGeminiSpeechCost(options.model, options.input.length);
5511
+ return {
5512
+ audio: audioData,
5513
+ model: options.model,
5514
+ usage: {
5515
+ characterCount: options.input.length
5516
+ },
5517
+ cost,
5518
+ format: spec?.defaultFormat ?? "wav"
5519
+ };
5520
+ }
5004
5521
  buildRequestPayload(options, descriptor, _spec, messages) {
5005
5522
  const contents = this.convertMessagesToContents(messages);
5006
5523
  const generationConfig = this.buildGenerationConfig(options);
@@ -5196,6 +5713,121 @@ var init_gemini = __esm({
5196
5713
  }
5197
5714
  });
5198
5715
 
5716
+ // src/providers/openai-image-models.ts
5717
/**
 * Look up the spec of an OpenAI image model by its exact model id.
 *
 * @param modelId - Model identifier to search for
 * @returns The first matching entry of `openaiImageModels`, or `undefined`
 */
function getOpenAIImageModelSpec(modelId) {
  for (const candidate of openaiImageModels) {
    if (candidate.modelId === modelId) {
      return candidate;
    }
  }
  return void 0;
}
5720
/**
 * Tell whether a model id belongs to a known OpenAI image model.
 *
 * @param modelId - Model identifier to test
 * @returns `true` when `openaiImageModels` contains an entry with that id
 */
function isOpenAIImageModel(modelId) {
  const position = openaiImageModels.findIndex((candidate) => candidate.modelId === modelId);
  return position !== -1;
}
5723
/**
 * Estimate the cost of generating `n` images with an OpenAI image model.
 *
 * Prices are looked up by size. A size entry is either a flat number or a
 * table keyed by quality tier. When the requested quality is not a tier of
 * that table (including this function's own "standard" default, which models
 * tiered as low/medium/high do not have) the model's `defaultQuality` tier is
 * used instead of giving up with `undefined`.
 *
 * @param modelId - OpenAI image model id
 * @param size - Requested image size, e.g. "1024x1024"
 * @param quality - Requested quality tier (default "standard")
 * @param n - Number of images (default 1)
 * @returns Estimated cost, or `undefined` when the model, size or tier has no price
 */
function calculateOpenAIImageCost(modelId, size, quality = "standard", n = 1) {
  const spec = getOpenAIImageModelSpec(modelId);
  if (!spec) return void 0;
  const sizePrice = spec.pricing.bySize?.[size];
  if (sizePrice === void 0) return void 0;
  if (typeof sizePrice === "number") {
    return sizePrice * n;
  }
  // Fall back to the model's default tier so tiered models still produce an estimate.
  const pricePerImage = sizePrice[quality] ?? sizePrice[spec.defaultQuality];
  if (pricePerImage === void 0) return void 0;
  return pricePerImage * n;
}
5737
+ var GPT_IMAGE_SIZES, GPT_IMAGE_QUALITIES, DALLE3_SIZES, DALLE3_QUALITIES, DALLE2_SIZES, openaiImageModels;
5738
// Lazy initializer for the OpenAI image model catalogue: fills the size and
// quality lists and `openaiImageModels` when the bundle first runs it.
var init_openai_image_models = __esm({
  "src/providers/openai-image-models.ts"() {
    "use strict";
    GPT_IMAGE_SIZES = ["1024x1024", "1024x1536", "1536x1024"];
    GPT_IMAGE_QUALITIES = ["low", "medium", "high"];
    DALLE3_SIZES = ["1024x1024", "1024x1792", "1792x1024"];
    DALLE3_QUALITIES = ["standard", "hd"];
    DALLE2_SIZES = ["256x256", "512x512", "1024x1024"];
    const gptTiers = (low, medium, high) => ({ low, medium, high });
    const dalle3Tiers = (standard, hd) => ({ standard, hd });
    // GPT Image models share everything but id, name and prices; both
    // non-square sizes are priced alike.
    const gptImageSpec = (modelId, displayName, squarePrices, rectPrices) => ({
      provider: "openai",
      modelId,
      displayName,
      pricing: {
        bySize: {
          "1024x1024": gptTiers(...squarePrices),
          "1024x1536": gptTiers(...rectPrices),
          "1536x1024": gptTiers(...rectPrices)
        }
      },
      supportedSizes: [...GPT_IMAGE_SIZES],
      supportedQualities: [...GPT_IMAGE_QUALITIES],
      maxImages: 1,
      defaultSize: "1024x1024",
      defaultQuality: "medium",
      features: {
        textRendering: true,
        transparency: true
      }
    });
    openaiImageModels = [
      // GPT Image 1 Family (flagship)
      gptImageSpec("gpt-image-1", "GPT Image 1", [0.011, 0.04, 0.17], [0.016, 0.06, 0.25]),
      gptImageSpec("gpt-image-1-mini", "GPT Image 1 Mini", [5e-3, 0.02, 0.052], [75e-4, 0.03, 0.078]),
      // DALL-E Family
      {
        provider: "openai",
        modelId: "dall-e-3",
        displayName: "DALL-E 3",
        pricing: {
          bySize: {
            "1024x1024": dalle3Tiers(0.04, 0.08),
            "1024x1792": dalle3Tiers(0.08, 0.12),
            "1792x1024": dalle3Tiers(0.08, 0.12)
          }
        },
        supportedSizes: [...DALLE3_SIZES],
        supportedQualities: [...DALLE3_QUALITIES],
        // DALL-E 3 only supports n=1
        maxImages: 1,
        defaultSize: "1024x1024",
        defaultQuality: "standard",
        features: {
          textRendering: true
        }
      },
      {
        provider: "openai",
        modelId: "dall-e-2",
        displayName: "DALL-E 2 (Legacy)",
        pricing: {
          bySize: {
            "256x256": 0.016,
            "512x512": 0.018,
            "1024x1024": 0.02
          }
        },
        supportedSizes: [...DALLE2_SIZES],
        maxImages: 10,
        defaultSize: "1024x1024"
      }
    ];
  }
});
5830
+
5199
5831
  // src/providers/openai-models.ts
5200
5832
  var OPENAI_MODELS;
5201
5833
  var init_openai_models = __esm({
@@ -5560,6 +6192,144 @@ var init_openai_models = __esm({
5560
6192
  }
5561
6193
  });
5562
6194
 
6195
+ // src/providers/openai-speech-models.ts
6196
/**
 * Look up the spec of an OpenAI speech (TTS) model by its exact model id.
 *
 * @param modelId - Model identifier to search for
 * @returns The first matching entry of `openaiSpeechModels`, or `undefined`
 */
function getOpenAISpeechModelSpec(modelId) {
  for (const candidate of openaiSpeechModels) {
    if (candidate.modelId === modelId) {
      return candidate;
    }
  }
  return void 0;
}
6199
/**
 * Tell whether a model id belongs to a known OpenAI speech model.
 *
 * @param modelId - Model identifier to test
 * @returns `true` when `openaiSpeechModels` contains an entry with that id
 */
function isOpenAISpeechModel(modelId) {
  const position = openaiSpeechModels.findIndex((candidate) => candidate.modelId === modelId);
  return position !== -1;
}
6202
/**
 * Estimate the cost of an OpenAI speech generation.
 *
 * Per-character pricing wins when the model has it. Otherwise per-minute
 * pricing is applied to `estimatedMinutes`, or to a duration approximated
 * from the text length (750 characters per minute) when no estimate is given.
 *
 * @param modelId - OpenAI speech model id
 * @param characterCount - Length of the input text
 * @param estimatedMinutes - Optional audio duration in minutes
 * @returns Estimated cost, or `undefined` for unknown models or models with neither pricing kind
 */
function calculateOpenAISpeechCost(modelId, characterCount, estimatedMinutes) {
  const spec = getOpenAISpeechModelSpec(modelId);
  if (!spec) return void 0;
  const { perCharacter, perMinute } = spec.pricing;
  if (perCharacter !== void 0) {
    return characterCount * perCharacter;
  }
  if (perMinute === void 0) {
    return void 0;
  }
  const minutes = estimatedMinutes !== void 0 ? estimatedMinutes : characterCount / 750;
  return minutes * perMinute;
}
6217
+ var OPENAI_TTS_VOICES, OPENAI_TTS_EXTENDED_VOICES, OPENAI_TTS_FORMATS, openaiSpeechModels;
6218
// Lazy initializer for the OpenAI TTS catalogue: fills the voice and format
// lists and `openaiSpeechModels` when the bundle first runs it.
var init_openai_speech_models = __esm({
  "src/providers/openai-speech-models.ts"() {
    "use strict";
    OPENAI_TTS_VOICES = ["alloy", "echo", "fable", "onyx", "nova", "shimmer"];
    OPENAI_TTS_EXTENDED_VOICES = [...OPENAI_TTS_VOICES, "ash", "ballad", "coral", "sage", "verse"];
    OPENAI_TTS_FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"];
    // The four tts-1* models differ only in id, display name and per-character price.
    // `voices` is copied per model; `formats` deliberately shares the one list.
    const classicTtsSpec = (modelId, displayName, perCharacter) => ({
      provider: "openai",
      modelId,
      displayName,
      pricing: { perCharacter },
      voices: [...OPENAI_TTS_VOICES],
      formats: OPENAI_TTS_FORMATS,
      maxInputLength: 4096,
      defaultVoice: "alloy",
      defaultFormat: "mp3",
      features: {
        voiceInstructions: false
      }
    });
    openaiSpeechModels = [
      // Standard TTS models (character-based pricing)
      // $15 per 1M characters = $0.000015 per character
      classicTtsSpec("tts-1", "TTS-1", 15e-6),
      classicTtsSpec("tts-1-1106", "TTS-1 (Nov 2023)", 15e-6),
      // $30 per 1M characters = $0.00003 per character
      classicTtsSpec("tts-1-hd", "TTS-1 HD", 3e-5),
      classicTtsSpec("tts-1-hd-1106", "TTS-1 HD (Nov 2023)", 3e-5),
      // Token-based TTS model with voice instructions support
      {
        provider: "openai",
        modelId: "gpt-4o-mini-tts",
        displayName: "GPT-4o Mini TTS",
        pricing: {
          // $0.60 per 1M input tokens = $0.0000006 per token
          perInputToken: 6e-7,
          // $12 per 1M audio output tokens = $0.000012 per token
          perAudioOutputToken: 12e-6,
          // ~$0.015 per minute of audio
          perMinute: 0.015
        },
        voices: [...OPENAI_TTS_EXTENDED_VOICES],
        formats: OPENAI_TTS_FORMATS,
        // tokens, not characters
        maxInputLength: 2e3,
        defaultVoice: "alloy",
        defaultFormat: "mp3",
        features: {
          voiceInstructions: true
        }
      }
    ];
  }
});
6332
+
5563
6333
  // src/providers/openai.ts
5564
6334
  function sanitizeExtra(extra, allowTemperature) {
5565
6335
  if (!extra) {
@@ -5581,7 +6351,9 @@ var init_openai = __esm({
5581
6351
  import_tiktoken = require("tiktoken");
5582
6352
  init_base_provider();
5583
6353
  init_constants2();
6354
+ init_openai_image_models();
5584
6355
  init_openai_models();
6356
+ init_openai_speech_models();
5585
6357
  init_utils();
5586
6358
  ROLE_MAP = {
5587
6359
  system: "system",
@@ -5596,6 +6368,87 @@ var init_openai = __esm({
5596
6368
  getModelSpecs() {
5597
6369
  return OPENAI_MODELS;
5598
6370
  }
6371
+ // =========================================================================
6372
+ // Image Generation
6373
+ // =========================================================================
6374
+ getImageModelSpecs() {
6375
+ return openaiImageModels;
6376
+ }
6377
+ supportsImageGeneration(modelId) {
6378
+ return isOpenAIImageModel(modelId);
6379
+ }
6380
+ async generateImage(options) {
6381
+ const client = this.client;
6382
+ const spec = getOpenAIImageModelSpec(options.model);
6383
+ const size = options.size ?? spec?.defaultSize ?? "1024x1024";
6384
+ const quality = options.quality ?? spec?.defaultQuality ?? "standard";
6385
+ const n = options.n ?? 1;
6386
+ const isDallE2 = options.model === "dall-e-2";
6387
+ const isGptImage = options.model.startsWith("gpt-image");
6388
+ const requestParams = {
6389
+ model: options.model,
6390
+ prompt: options.prompt,
6391
+ size,
6392
+ n
6393
+ };
6394
+ if (!isDallE2 && !isGptImage) {
6395
+ requestParams.quality = quality;
6396
+ }
6397
+ if (isGptImage) {
6398
+ } else if (!isDallE2) {
6399
+ requestParams.response_format = options.responseFormat ?? "url";
6400
+ }
6401
+ const response = await client.images.generate(requestParams);
6402
+ const cost = calculateOpenAIImageCost(options.model, size, quality, n);
6403
+ const images = response.data ?? [];
6404
+ return {
6405
+ images: images.map((img) => ({
6406
+ url: img.url,
6407
+ b64Json: img.b64_json,
6408
+ revisedPrompt: img.revised_prompt
6409
+ })),
6410
+ model: options.model,
6411
+ usage: {
6412
+ imagesGenerated: images.length,
6413
+ size,
6414
+ quality
6415
+ },
6416
+ cost
6417
+ };
6418
+ }
6419
+ // =========================================================================
6420
+ // Speech Generation
6421
+ // =========================================================================
6422
+ getSpeechModelSpecs() {
6423
+ return openaiSpeechModels;
6424
+ }
6425
+ supportsSpeechGeneration(modelId) {
6426
+ return isOpenAISpeechModel(modelId);
6427
+ }
6428
+ async generateSpeech(options) {
6429
+ const client = this.client;
6430
+ const spec = getOpenAISpeechModelSpec(options.model);
6431
+ const format = options.responseFormat ?? spec?.defaultFormat ?? "mp3";
6432
+ const voice = options.voice ?? spec?.defaultVoice ?? "alloy";
6433
+ const response = await client.audio.speech.create({
6434
+ model: options.model,
6435
+ input: options.input,
6436
+ voice,
6437
+ response_format: format,
6438
+ speed: options.speed ?? 1
6439
+ });
6440
+ const audioBuffer = await response.arrayBuffer();
6441
+ const cost = calculateOpenAISpeechCost(options.model, options.input.length);
6442
+ return {
6443
+ audio: audioBuffer,
6444
+ model: options.model,
6445
+ usage: {
6446
+ characterCount: options.input.length
6447
+ },
6448
+ cost,
6449
+ format
6450
+ };
6451
+ }
5599
6452
  buildRequestPayload(options, descriptor, spec, messages) {
5600
6453
  const { maxTokens, temperature, topP, stopSequences, extra } = options;
5601
6454
  const supportsTemperature = spec?.metadata?.supportsTemperature !== false;
@@ -5936,30 +6789,109 @@ var init_model_registry = __esm({
5936
6789
  }
5937
6790
  });
5938
6791
 
5939
- // src/core/options.ts
5940
- var ModelIdentifierParser;
5941
- var init_options = __esm({
5942
- "src/core/options.ts"() {
6792
+ // src/core/namespaces/image.ts
6793
+ var ImageNamespace;
6794
// Lazy initializer defining ImageNamespace, the image-generation facade that
// routes each request to the first adapter supporting the requested model.
var init_image = __esm({
  "src/core/namespaces/image.ts"() {
    "use strict";
    ImageNamespace = class {
      constructor(adapters, defaultProvider) {
        this.adapters = adapters;
        this.defaultProvider = defaultProvider;
      }
      /**
       * Generate images from a text prompt.
       *
       * @param options - Image generation options; `options.model` selects the adapter
       * @returns Promise resolving to the adapter's generation result
       * @throws Error listing the available image models when no adapter can serve the model
       */
      async generate(options) {
        const { model: modelId } = options;
        const adapter = this.findImageAdapter(modelId);
        if (adapter?.generateImage) {
          return adapter.generateImage(options);
        }
        const available = this.listModels().map((m) => m.modelId).join(", ");
        throw new Error(
          `No provider supports image generation for model "${modelId}". Available image models: ${available}`
        );
      }
      /**
       * List all available image generation models, in adapter order.
       */
      listModels() {
        return this.adapters.flatMap(
          (adapter) => adapter.getImageModelSpecs ? [...adapter.getImageModelSpecs()] : []
        );
      }
      /**
       * Check if a model is supported for image generation.
       */
      supportsModel(modelId) {
        const adapter = this.findImageAdapter(modelId);
        return adapter !== void 0;
      }
      /**
       * Find the first adapter that claims image support for the model.
       * Adapters without `supportsImageGeneration` never match.
       */
      findImageAdapter(modelId) {
        return this.adapters.find(
          (adapter) => Boolean(adapter.supportsImageGeneration?.(modelId))
        );
      }
    };
  }
});
6845
+
6846
+ // src/core/namespaces/speech.ts
6847
+ var SpeechNamespace;
6848
+ var init_speech = __esm({
6849
+ "src/core/namespaces/speech.ts"() {
6850
+ "use strict";
6851
// Speech-generation facade that routes each request to the first adapter
// supporting the requested model.
SpeechNamespace = class {
  constructor(adapters, defaultProvider) {
    this.adapters = adapters;
    this.defaultProvider = defaultProvider;
  }
  /**
   * Generate speech audio from text.
   *
   * @param options - Speech generation options; `options.model` selects the adapter
   * @returns Promise resolving to the adapter's generation result
   * @throws Error listing the available speech models when no adapter can serve the model
   */
  async generate(options) {
    const { model: modelId } = options;
    const adapter = this.findSpeechAdapter(modelId);
    if (adapter?.generateSpeech) {
      return adapter.generateSpeech(options);
    }
    const available = this.listModels().map((m) => m.modelId).join(", ");
    throw new Error(
      `No provider supports speech generation for model "${modelId}". Available speech models: ${available}`
    );
  }
  /**
   * List all available speech generation models, in adapter order.
   */
  listModels() {
    return this.adapters.flatMap(
      (adapter) => adapter.getSpeechModelSpecs ? [...adapter.getSpeechModelSpecs()] : []
    );
  }
  /**
   * Check if a model is supported for speech generation.
   */
  supportsModel(modelId) {
    const adapter = this.findSpeechAdapter(modelId);
    return adapter !== void 0;
  }
  /**
   * Find the first adapter that claims speech support for the model.
   * Adapters without `supportsSpeechGeneration` never match.
   */
  findSpeechAdapter(modelId) {
    return this.adapters.find(
      (adapter) => Boolean(adapter.supportsSpeechGeneration?.(modelId))
    );
  }
};
5965
6897
  }
@@ -6008,6 +6940,69 @@ var init_quick_methods = __esm({
6008
6940
  }
6009
6941
  });
6010
6942
 
6943
+ // src/core/namespaces/text.ts
6944
var TextNamespace;
var init_text = __esm({
  "src/core/namespaces/text.ts"() {
    "use strict";
    init_quick_methods();
    /**
     * Namespace grouping the text helpers for a single client. Both methods
     * forward to the module-level `complete` / `stream` helpers, which are
     * defined outside this block (presumably by the quick-methods module
     * initialised above; verify against that module).
     */
    TextNamespace = class {
      /**
       * @param client - Client instance forwarded as the first argument to the helpers
       */
      constructor(client) {
        this.client = client;
      }
      /**
       * Generate a complete text response.
       *
       * @param prompt - User prompt
       * @param options - Optional configuration
       * @returns Promise resolving to the result of the `complete` helper
       */
      async complete(prompt, options) {
        return complete(this.client, prompt, options);
      }
      /**
       * Stream text chunks.
       *
       * @param prompt - User prompt
       * @param options - Optional configuration
       * @returns Whatever the `stream` helper returns (documented upstream as
       *   an async generator yielding text chunks)
       */
      stream(prompt, options) {
        return stream(this.client, prompt, options);
      }
    };
  }
});
6976
+
6977
+ // src/core/options.ts
6978
var ModelIdentifierParser;
var init_options = __esm({
  "src/core/options.ts"() {
    "use strict";
    /**
     * Splits a model identifier of the form "provider:name" (or a bare
     * "name") into its provider and model-name parts.
     */
    ModelIdentifierParser = class {
      /**
       * @param defaultProvider - Provider used when the identifier has no
       *   "provider:" prefix; defaults to "openai"
       */
      constructor(defaultProvider = "openai") {
        this.defaultProvider = defaultProvider;
      }
      /**
       * Parse a model identifier.
       *
       * @param identifier - e.g. "gpt-4o" or "anthropic:claude-3"
       * @returns An object `{ provider, name }`
       * @throws TypeError if the identifier is not a string
       * @throws Error if the identifier, the provider part, or the name part is empty
       */
      parse(identifier) {
        if (typeof identifier !== "string") {
          throw new TypeError("Model identifier must be a string");
        }
        const trimmed = identifier.trim();
        if (!trimmed) {
          throw new Error("Model identifier cannot be empty");
        }
        // Only the first ":" separates provider from name, so model names
        // that themselves contain ":" are kept intact.
        const separatorIndex = trimmed.indexOf(":");
        if (separatorIndex === -1) {
          return { provider: this.defaultProvider, name: trimmed };
        }
        const provider = trimmed.slice(0, separatorIndex).trim();
        const name = trimmed.slice(separatorIndex + 1).trim();
        if (!provider) {
          throw new Error("Model provider cannot be empty");
        }
        if (!name) {
          throw new Error("Model name cannot be empty");
        }
        return { provider, name };
      }
    };
  }
});
7005
+
6011
7006
  // src/core/client.ts
6012
7007
  var client_exports = {};
6013
7008
  __export(client_exports, {
@@ -6020,12 +7015,20 @@ var init_client = __esm({
6020
7015
  init_builder();
6021
7016
  init_discovery();
6022
7017
  init_model_registry();
7018
+ init_image();
7019
+ init_speech();
7020
+ init_text();
6023
7021
  init_options();
6024
7022
  init_quick_methods();
6025
7023
  LLMist = class _LLMist {
6026
7024
  parser;
7025
+ defaultProvider;
6027
7026
  modelRegistry;
6028
7027
  adapters;
7028
+ // Namespaces for different generation types
7029
+ text;
7030
+ image;
7031
+ speech;
6029
7032
  constructor(...args) {
6030
7033
  let adapters = [];
6031
7034
  let defaultProvider;
@@ -6064,6 +7067,7 @@ var init_client = __esm({
6064
7067
  const priorityB = b.priority ?? 0;
6065
7068
  return priorityB - priorityA;
6066
7069
  });
7070
+ this.defaultProvider = resolvedDefaultProvider;
6067
7071
  this.parser = new ModelIdentifierParser(resolvedDefaultProvider);
6068
7072
  this.modelRegistry = new ModelRegistry();
6069
7073
  for (const adapter of this.adapters) {
@@ -6072,6 +7076,9 @@ var init_client = __esm({
6072
7076
  if (customModels.length > 0) {
6073
7077
  this.modelRegistry.registerModels(customModels);
6074
7078
  }
7079
+ this.text = new TextNamespace(this);
7080
+ this.image = new ImageNamespace(this.adapters, this.defaultProvider);
7081
+ this.speech = new SpeechNamespace(this.adapters, this.defaultProvider);
6075
7082
  }
6076
7083
  stream(options) {
6077
7084
  const descriptor = this.parser.parse(options.model);