llmist 2.2.0 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.cjs CHANGED
@@ -2498,7 +2498,27 @@ var init_cost_reporting_client = __esm({
   constructor(client, reportCost) {
     this.client = client;
     this.reportCost = reportCost;
+    this.image = {
+      generate: async (options) => {
+        const result = await this.client.image.generate(options);
+        if (result.cost !== void 0 && result.cost > 0) {
+          this.reportCost(result.cost);
+        }
+        return result;
+      }
+    };
+    this.speech = {
+      generate: async (options) => {
+        const result = await this.client.speech.generate(options);
+        if (result.cost !== void 0 && result.cost > 0) {
+          this.reportCost(result.cost);
+        }
+        return result;
+      }
+    };
   }
+  image;
+  speech;
   /**
    * Access to model registry for cost estimation.
    */
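A minimal usage sketch of the cost-forwarding behavior above (hedged: the diff only shows that this class — presumably the one exported from `init_cost_reporting_client`, called `CostReportingClient` below — receives an underlying client plus a `reportCost` callback):

    // Sketch: any image/speech generation returning a positive cost is
    // forwarded to the reportCost callback before the result is returned.
    let total = 0;
    const wrapped = new CostReportingClient(client, (cost: number) => { total += cost; });
    const result = await wrapped.image.generate({ model: "dall-e-3", prompt: "a red fox" });
    // total now includes result.cost (when it is defined and > 0)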
@@ -4591,6 +4611,28 @@ var init_anthropic = __esm({
   getModelSpecs() {
     return ANTHROPIC_MODELS;
   }
+  // =========================================================================
+  // Image Generation (Not Supported)
+  // =========================================================================
+  supportsImageGeneration(_modelId) {
+    return false;
+  }
+  async generateImage() {
+    throw new Error(
+      "Anthropic does not support image generation. Use OpenAI (DALL-E, GPT Image) or Google Gemini (Imagen) instead."
+    );
+  }
+  // =========================================================================
+  // Speech Generation (Not Supported)
+  // =========================================================================
+  supportsSpeechGeneration(_modelId) {
+    return false;
+  }
+  async generateSpeech() {
+    throw new Error(
+      "Anthropic does not support speech generation. Use OpenAI (TTS) or Google Gemini (TTS) instead."
+    );
+  }
   buildRequestPayload(options, descriptor, spec, messages) {
     const systemMessages = messages.filter((message) => message.role === "system");
     const system = systemMessages.length > 0 ? systemMessages.map((m, index) => ({
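These Anthropic stubs are defensive: the `ImageNamespace`/`SpeechNamespace` added later in this diff route requests by probing `supportsImageGeneration(modelId)`/`supportsSpeechGeneration(modelId)`, both false here, so callers normally see the namespace's "No provider supports ..." error together with the list of available models; the throws above are reached only if the adapter is invoked directly.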
@@ -4745,6 +4787,182 @@ var init_anthropic = __esm({
   }
 });

+// src/providers/gemini-image-models.ts
+function getGeminiImageModelSpec(modelId) {
+  return geminiImageModels.find((m) => m.modelId === modelId);
+}
+function isGeminiImageModel(modelId) {
+  return geminiImageModels.some((m) => m.modelId === modelId);
+}
+function calculateGeminiImageCost(modelId, size = "1:1", n = 1) {
+  const spec = getGeminiImageModelSpec(modelId);
+  if (!spec) return void 0;
+  if (spec.pricing.perImage !== void 0) {
+    return spec.pricing.perImage * n;
+  }
+  if (spec.pricing.bySize) {
+    const sizePrice = spec.pricing.bySize[size];
+    if (typeof sizePrice === "number") {
+      return sizePrice * n;
+    }
+  }
+  return void 0;
+}
+var IMAGEN4_ASPECT_RATIOS, GEMINI_IMAGE_ASPECT_RATIOS, geminiImageModels;
+var init_gemini_image_models = __esm({
+  "src/providers/gemini-image-models.ts"() {
+    "use strict";
+    IMAGEN4_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
+    GEMINI_IMAGE_ASPECT_RATIOS = ["1:1", "3:4", "4:3", "9:16", "16:9"];
+    geminiImageModels = [
+      // Imagen 4 Family (standalone image generation)
+      {
+        provider: "gemini",
+        modelId: "imagen-4.0-fast-generate-001",
+        displayName: "Imagen 4 Fast",
+        pricing: {
+          perImage: 0.02
+        },
+        supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
+        maxImages: 4,
+        defaultSize: "1:1",
+        features: {
+          textRendering: true
+        }
+      },
+      {
+        provider: "gemini",
+        modelId: "imagen-4.0-generate-001",
+        displayName: "Imagen 4",
+        pricing: {
+          perImage: 0.04
+        },
+        supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
+        maxImages: 4,
+        defaultSize: "1:1",
+        features: {
+          textRendering: true
+        }
+      },
+      {
+        provider: "gemini",
+        modelId: "imagen-4.0-ultra-generate-001",
+        displayName: "Imagen 4 Ultra",
+        pricing: {
+          perImage: 0.06
+        },
+        supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
+        maxImages: 4,
+        defaultSize: "1:1",
+        features: {
+          textRendering: true
+        }
+      },
+      // Preview versions
+      {
+        provider: "gemini",
+        modelId: "imagen-4.0-generate-preview-06-06",
+        displayName: "Imagen 4 (Preview)",
+        pricing: {
+          perImage: 0.04
+        },
+        supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
+        maxImages: 4,
+        defaultSize: "1:1",
+        features: {
+          textRendering: true
+        }
+      },
+      {
+        provider: "gemini",
+        modelId: "imagen-4.0-ultra-generate-preview-06-06",
+        displayName: "Imagen 4 Ultra (Preview)",
+        pricing: {
+          perImage: 0.06
+        },
+        supportedSizes: [...IMAGEN4_ASPECT_RATIOS],
+        maxImages: 4,
+        defaultSize: "1:1",
+        features: {
+          textRendering: true
+        }
+      },
+      // Gemini Native Image Generation (multimodal models)
+      {
+        provider: "gemini",
+        modelId: "gemini-2.5-flash-image",
+        displayName: "Gemini 2.5 Flash Image",
+        pricing: {
+          perImage: 0.039
+        },
+        supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
+        maxImages: 1,
+        defaultSize: "1:1",
+        features: {
+          conversational: true,
+          textRendering: true
+        }
+      },
+      {
+        provider: "gemini",
+        modelId: "gemini-2.5-flash-image-preview",
+        displayName: "Gemini 2.5 Flash Image (Preview)",
+        pricing: {
+          perImage: 0.039
+        },
+        supportedSizes: [...GEMINI_IMAGE_ASPECT_RATIOS],
+        maxImages: 1,
+        defaultSize: "1:1",
+        features: {
+          conversational: true,
+          textRendering: true
+        }
+      },
+      {
+        provider: "gemini",
+        modelId: "gemini-3-pro-image-preview",
+        displayName: "Gemini 3 Pro Image (Preview)",
+        pricing: {
+          // Token-based: ~$0.134 per 1K/2K image, $0.24 per 4K
+          // Using 2K as default
+          bySize: {
+            "1K": 0.134,
+            "2K": 0.134,
+            "4K": 0.24
+          }
+        },
+        supportedSizes: ["1K", "2K", "4K"],
+        maxImages: 1,
+        defaultSize: "2K",
+        features: {
+          conversational: true,
+          textRendering: true
+        }
+      },
+      // Alias: nano-banana-pro-preview is gemini-3-pro-image-preview
+      {
+        provider: "gemini",
+        modelId: "nano-banana-pro-preview",
+        displayName: "Nano Banana Pro (Gemini 3 Pro Image)",
+        pricing: {
+          bySize: {
+            "1K": 0.134,
+            "2K": 0.134,
+            "4K": 0.24
+          }
+        },
+        supportedSizes: ["1K", "2K", "4K"],
+        maxImages: 1,
+        defaultSize: "2K",
+        features: {
+          conversational: true,
+          textRendering: true
+        }
+      }
+    ];
+  }
+});
+
 // src/providers/gemini-models.ts
 var GEMINI_MODELS;
 var init_gemini_models = __esm({
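Worked cost examples against the specs above (a sketch; `calculateGeminiImageCost` is the helper defined at the top of this hunk):

    calculateGeminiImageCost("imagen-4.0-generate-001", "1:1", 4);   // perImage: 0.04 * 4 = 0.16
    calculateGeminiImageCost("imagen-4.0-generate-001", "16:9", 2);  // perImage ignores size: 0.08
    calculateGeminiImageCost("gemini-3-pro-image-preview", "4K", 1); // bySize["4K"]: 0.24
    calculateGeminiImageCost("unknown-model", "1:1", 1);             // undefined (no spec)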
@@ -4918,7 +5136,171 @@ var init_gemini_models = __esm({
   }
 });

+// src/providers/gemini-speech-models.ts
+function getGeminiSpeechModelSpec(modelId) {
+  return geminiSpeechModels.find((m) => m.modelId === modelId);
+}
+function isGeminiSpeechModel(modelId) {
+  return geminiSpeechModels.some((m) => m.modelId === modelId);
+}
+function calculateGeminiSpeechCost(modelId, characterCount, estimatedMinutes) {
+  const spec = getGeminiSpeechModelSpec(modelId);
+  if (!spec) return void 0;
+  if (spec.pricing.perMinute !== void 0) {
+    if (estimatedMinutes !== void 0) {
+      return estimatedMinutes * spec.pricing.perMinute;
+    }
+    const approxMinutes = characterCount / 750;
+    return approxMinutes * spec.pricing.perMinute;
+  }
+  return void 0;
+}
+var GEMINI_TTS_VOICES, GEMINI_TTS_FORMATS, geminiSpeechModels;
+var init_gemini_speech_models = __esm({
+  "src/providers/gemini-speech-models.ts"() {
+    "use strict";
+    GEMINI_TTS_VOICES = [
+      "Zephyr",
+      // Bright
+      "Puck",
+      // Upbeat
+      "Charon",
+      // Informative
+      "Kore",
+      // Firm
+      "Fenrir",
+      // Excitable
+      "Leda",
+      // Youthful
+      "Orus",
+      // Firm
+      "Aoede",
+      // Breezy
+      "Callirrhoe",
+      // Easy-going
+      "Autonoe",
+      // Bright
+      "Enceladus",
+      // Breathy
+      "Iapetus",
+      // Clear
+      "Umbriel",
+      // Easy-going
+      "Algieba",
+      // Smooth
+      "Despina",
+      // Smooth
+      "Erinome",
+      // Clear
+      "Algenib",
+      // Gravelly
+      "Rasalgethi",
+      // Informative
+      "Laomedeia",
+      // Upbeat
+      "Achernar",
+      // Soft
+      "Alnilam",
+      // Firm
+      "Schedar",
+      // Even
+      "Gacrux",
+      // Mature
+      "Pulcherrima",
+      // Forward
+      "Achird",
+      // Friendly
+      "Zubenelgenubi",
+      // Casual
+      "Vindemiatrix",
+      // Gentle
+      "Sadachbia",
+      // Lively
+      "Sadaltager",
+      // Knowledgeable
+      "Sulafat"
+      // Warm
+    ];
+    GEMINI_TTS_FORMATS = ["pcm", "wav"];
+    geminiSpeechModels = [
+      {
+        provider: "gemini",
+        modelId: "gemini-2.5-flash-preview-tts",
+        displayName: "Gemini 2.5 Flash TTS (Preview)",
+        pricing: {
+          // $0.50 per 1M input tokens = $0.0000005 per token
+          perInputToken: 5e-7,
+          // $10.00 per 1M audio output tokens = $0.00001 per token
+          perAudioOutputToken: 1e-5,
+          // Rough estimate: ~$0.01 per minute of audio
+          perMinute: 0.01
+        },
+        voices: [...GEMINI_TTS_VOICES],
+        formats: GEMINI_TTS_FORMATS,
+        maxInputLength: 8e3,
+        // bytes (text + prompt combined)
+        defaultVoice: "Zephyr",
+        defaultFormat: "wav",
+        features: {
+          multiSpeaker: true,
+          languages: 24,
+          voiceInstructions: true
+        }
+      },
+      {
+        provider: "gemini",
+        modelId: "gemini-2.5-pro-preview-tts",
+        displayName: "Gemini 2.5 Pro TTS (Preview)",
+        pricing: {
+          // $1.00 per 1M input tokens = $0.000001 per token
+          perInputToken: 1e-6,
+          // $20.00 per 1M audio output tokens = $0.00002 per token
+          perAudioOutputToken: 2e-5,
+          // Rough estimate: ~$0.02 per minute of audio
+          perMinute: 0.02
+        },
+        voices: [...GEMINI_TTS_VOICES],
+        formats: GEMINI_TTS_FORMATS,
+        maxInputLength: 8e3,
+        // bytes
+        defaultVoice: "Zephyr",
+        defaultFormat: "wav",
+        features: {
+          multiSpeaker: true,
+          languages: 24,
+          voiceInstructions: true
+        }
+      }
+    ];
+  }
+});
+
 // src/providers/gemini.ts
+function wrapPcmInWav(pcmData, sampleRate, bitsPerSample, numChannels) {
+  const byteRate = sampleRate * numChannels * bitsPerSample / 8;
+  const blockAlign = numChannels * bitsPerSample / 8;
+  const dataSize = pcmData.length;
+  const headerSize = 44;
+  const fileSize = headerSize + dataSize - 8;
+  const buffer = new ArrayBuffer(headerSize + dataSize);
+  const view = new DataView(buffer);
+  const uint8 = new Uint8Array(buffer);
+  view.setUint32(0, 1380533830, false);
+  view.setUint32(4, fileSize, true);
+  view.setUint32(8, 1463899717, false);
+  view.setUint32(12, 1718449184, false);
+  view.setUint32(16, 16, true);
+  view.setUint16(20, 1, true);
+  view.setUint16(22, numChannels, true);
+  view.setUint32(24, sampleRate, true);
+  view.setUint32(28, byteRate, true);
+  view.setUint16(32, blockAlign, true);
+  view.setUint16(34, bitsPerSample, true);
+  view.setUint32(36, 1684108385, false);
+  view.setUint32(40, dataSize, true);
+  uint8.set(pcmData, headerSize);
+  return buffer;
+}
 function createGeminiProviderFromEnv() {
   return createProviderFromEnv("GEMINI_API_KEY", import_genai.GoogleGenAI, GeminiGenerativeProvider);
 }
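The big-endian `setUint32` constants in `wrapPcmInWav` are the ASCII four-character chunk tags of a canonical 44-byte RIFF/WAVE header; the little-endian writes fill the PCM `fmt ` fields (audioFormat=1, channels, sample rate, byte rate, block align, bits per sample) and the `data` chunk size. A self-contained decoding sketch (the `fourCC` helper is hypothetical, not part of the package):

    const fourCC = (n: number) =>
      String.fromCharCode((n >>> 24) & 255, (n >>> 16) & 255, (n >>> 8) & 255, n & 255);
    fourCC(1380533830); // "RIFF"
    fourCC(1463899717); // "WAVE"
    fourCC(1718449184); // "fmt " (the trailing space is part of the tag)
    fourCC(1684108385); // "data"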
@@ -4929,7 +5311,9 @@ var init_gemini = __esm({
   import_genai = require("@google/genai");
   init_base_provider();
   init_constants2();
+  init_gemini_image_models();
   init_gemini_models();
+  init_gemini_speech_models();
   init_utils();
   GEMINI_ROLE_MAP = {
     system: "user",
@@ -4944,6 +5328,139 @@ var init_gemini = __esm({
   getModelSpecs() {
     return GEMINI_MODELS;
   }
+  // =========================================================================
+  // Image Generation
+  // =========================================================================
+  getImageModelSpecs() {
+    return geminiImageModels;
+  }
+  supportsImageGeneration(modelId) {
+    return isGeminiImageModel(modelId);
+  }
+  async generateImage(options) {
+    const client = this.client;
+    const spec = getGeminiImageModelSpec(options.model);
+    const isImagenModel = options.model.startsWith("imagen");
+    const aspectRatio = options.size ?? spec?.defaultSize ?? "1:1";
+    const n = options.n ?? 1;
+    if (isImagenModel) {
+      const response2 = await client.models.generateImages({
+        model: options.model,
+        prompt: options.prompt,
+        config: {
+          numberOfImages: n,
+          aspectRatio,
+          outputMimeType: options.responseFormat === "b64_json" ? "image/png" : "image/jpeg"
+        }
+      });
+      const images2 = response2.generatedImages ?? [];
+      const cost2 = calculateGeminiImageCost(options.model, aspectRatio, images2.length);
+      return {
+        // Gemini's imageBytes is already base64 encoded, so use it directly
+        images: images2.map((img) => ({
+          b64Json: img.image?.imageBytes ?? void 0
+        })),
+        model: options.model,
+        usage: {
+          imagesGenerated: images2.length,
+          size: aspectRatio,
+          quality: "standard"
+        },
+        cost: cost2
+      };
+    }
+    const response = await client.models.generateContent({
+      model: options.model,
+      contents: [{ role: "user", parts: [{ text: options.prompt }] }],
+      config: {
+        responseModalities: [import_genai.Modality.IMAGE, import_genai.Modality.TEXT]
+      }
+    });
+    const images = [];
+    const candidate = response.candidates?.[0];
+    if (candidate?.content?.parts) {
+      for (const part of candidate.content.parts) {
+        if ("inlineData" in part && part.inlineData) {
+          images.push({
+            b64Json: part.inlineData.data
+          });
+        }
+      }
+    }
+    const cost = calculateGeminiImageCost(options.model, aspectRatio, images.length);
+    return {
+      images,
+      model: options.model,
+      usage: {
+        imagesGenerated: images.length,
+        size: aspectRatio,
+        quality: "standard"
+      },
+      cost
+    };
+  }
+  // =========================================================================
+  // Speech Generation
+  // =========================================================================
+  getSpeechModelSpecs() {
+    return geminiSpeechModels;
+  }
+  supportsSpeechGeneration(modelId) {
+    return isGeminiSpeechModel(modelId);
+  }
+  async generateSpeech(options) {
+    const client = this.client;
+    const spec = getGeminiSpeechModelSpec(options.model);
+    const voice = options.voice ?? spec?.defaultVoice ?? "Zephyr";
+    const response = await client.models.generateContent({
+      model: options.model,
+      contents: [
+        {
+          role: "user",
+          parts: [{ text: options.input }]
+        }
+      ],
+      config: {
+        responseModalities: [import_genai.Modality.AUDIO],
+        speechConfig: {
+          voiceConfig: {
+            prebuiltVoiceConfig: {
+              voiceName: voice
+            }
+          }
+        }
+      }
+    });
+    let pcmData;
+    const candidate = response.candidates?.[0];
+    if (candidate?.content?.parts) {
+      for (const part of candidate.content.parts) {
+        if ("inlineData" in part && part.inlineData?.data) {
+          const base64 = part.inlineData.data;
+          const binary = atob(base64);
+          pcmData = new Uint8Array(binary.length);
+          for (let i = 0; i < binary.length; i++) {
+            pcmData[i] = binary.charCodeAt(i);
+          }
+          break;
+        }
+      }
+    }
+    if (!pcmData) {
+      throw new Error("No audio data in Gemini TTS response");
+    }
+    const audioData = wrapPcmInWav(pcmData, 24e3, 16, 1);
+    const cost = calculateGeminiSpeechCost(options.model, options.input.length);
+    return {
+      audio: audioData,
+      model: options.model,
+      usage: {
+        characterCount: options.input.length
+      },
+      cost,
+      format: spec?.defaultFormat ?? "wav"
+    };
+  }
   buildRequestPayload(options, descriptor, _spec, messages) {
     const contents = this.convertMessagesToContents(messages);
     const generationConfig = this.buildGenerationConfig(options);
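With the namespaces added later in this diff, the path above is reachable as `client.speech.generate(...)`. A hedged usage sketch (file handling is illustrative only):

    // Gemini TTS yields 24 kHz / 16-bit / mono PCM, wrapped via wrapPcmInWav:
    const speech = await client.speech.generate({
      model: "gemini-2.5-flash-preview-tts",
      input: "Hello from llmist",
      voice: "Kore" // any of the 30 GEMINI_TTS_VOICES; defaults to "Zephyr"
    });
    // speech.audio is an ArrayBuffer holding a WAV file; speech.format === "wav";
    // speech.cost is estimated as (input.length / 750 chars-per-minute) * perMinute.
    require("node:fs").writeFileSync("hello.wav", Buffer.from(speech.audio));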
@@ -5139,6 +5656,121 @@ var init_gemini = __esm({
   }
 });

+// src/providers/openai-image-models.ts
+function getOpenAIImageModelSpec(modelId) {
+  return openaiImageModels.find((m) => m.modelId === modelId);
+}
+function isOpenAIImageModel(modelId) {
+  return openaiImageModels.some((m) => m.modelId === modelId);
+}
+function calculateOpenAIImageCost(modelId, size, quality = "standard", n = 1) {
+  const spec = getOpenAIImageModelSpec(modelId);
+  if (!spec) return void 0;
+  const sizePrice = spec.pricing.bySize?.[size];
+  if (sizePrice === void 0) return void 0;
+  let pricePerImage;
+  if (typeof sizePrice === "number") {
+    pricePerImage = sizePrice;
+  } else {
+    pricePerImage = sizePrice[quality];
+    if (pricePerImage === void 0) return void 0;
+  }
+  return pricePerImage * n;
+}
+var GPT_IMAGE_SIZES, GPT_IMAGE_QUALITIES, DALLE3_SIZES, DALLE3_QUALITIES, DALLE2_SIZES, openaiImageModels;
+var init_openai_image_models = __esm({
+  "src/providers/openai-image-models.ts"() {
+    "use strict";
+    GPT_IMAGE_SIZES = ["1024x1024", "1024x1536", "1536x1024"];
+    GPT_IMAGE_QUALITIES = ["low", "medium", "high"];
+    DALLE3_SIZES = ["1024x1024", "1024x1792", "1792x1024"];
+    DALLE3_QUALITIES = ["standard", "hd"];
+    DALLE2_SIZES = ["256x256", "512x512", "1024x1024"];
+    openaiImageModels = [
+      // GPT Image 1 Family (flagship)
+      {
+        provider: "openai",
+        modelId: "gpt-image-1",
+        displayName: "GPT Image 1",
+        pricing: {
+          bySize: {
+            "1024x1024": { low: 0.011, medium: 0.04, high: 0.17 },
+            "1024x1536": { low: 0.016, medium: 0.06, high: 0.25 },
+            "1536x1024": { low: 0.016, medium: 0.06, high: 0.25 }
+          }
+        },
+        supportedSizes: [...GPT_IMAGE_SIZES],
+        supportedQualities: [...GPT_IMAGE_QUALITIES],
+        maxImages: 1,
+        defaultSize: "1024x1024",
+        defaultQuality: "medium",
+        features: {
+          textRendering: true,
+          transparency: true
+        }
+      },
+      {
+        provider: "openai",
+        modelId: "gpt-image-1-mini",
+        displayName: "GPT Image 1 Mini",
+        pricing: {
+          bySize: {
+            "1024x1024": { low: 5e-3, medium: 0.02, high: 0.052 },
+            "1024x1536": { low: 75e-4, medium: 0.03, high: 0.078 },
+            "1536x1024": { low: 75e-4, medium: 0.03, high: 0.078 }
+          }
+        },
+        supportedSizes: [...GPT_IMAGE_SIZES],
+        supportedQualities: [...GPT_IMAGE_QUALITIES],
+        maxImages: 1,
+        defaultSize: "1024x1024",
+        defaultQuality: "medium",
+        features: {
+          textRendering: true,
+          transparency: true
+        }
+      },
+      // DALL-E Family
+      {
+        provider: "openai",
+        modelId: "dall-e-3",
+        displayName: "DALL-E 3",
+        pricing: {
+          bySize: {
+            "1024x1024": { standard: 0.04, hd: 0.08 },
+            "1024x1792": { standard: 0.08, hd: 0.12 },
+            "1792x1024": { standard: 0.08, hd: 0.12 }
+          }
+        },
+        supportedSizes: [...DALLE3_SIZES],
+        supportedQualities: [...DALLE3_QUALITIES],
+        maxImages: 1,
+        // DALL-E 3 only supports n=1
+        defaultSize: "1024x1024",
+        defaultQuality: "standard",
+        features: {
+          textRendering: true
+        }
+      },
+      {
+        provider: "openai",
+        modelId: "dall-e-2",
+        displayName: "DALL-E 2 (Legacy)",
+        pricing: {
+          bySize: {
+            "256x256": 0.016,
+            "512x512": 0.018,
+            "1024x1024": 0.02
+          }
+        },
+        supportedSizes: [...DALLE2_SIZES],
+        maxImages: 10,
+        defaultSize: "1024x1024"
+      }
+    ];
+  }
+});
+
 // src/providers/openai-models.ts
 var OPENAI_MODELS;
 var init_openai_models = __esm({
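Worked examples against the tiered pricing above (sketch):

    calculateOpenAIImageCost("dall-e-3", "1024x1024", "hd", 2);       // 0.08 * 2 = 0.16
    calculateOpenAIImageCost("dall-e-2", "512x512", "standard", 3);   // flat 0.018 * 3 = 0.054
    calculateOpenAIImageCost("gpt-image-1-mini", "1024x1024", "low"); // 0.005
    calculateOpenAIImageCost("dall-e-3", "512x512");                  // undefined: size not listed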
@@ -5503,6 +6135,144 @@ var init_openai_models = __esm({
   }
 });

+// src/providers/openai-speech-models.ts
+function getOpenAISpeechModelSpec(modelId) {
+  return openaiSpeechModels.find((m) => m.modelId === modelId);
+}
+function isOpenAISpeechModel(modelId) {
+  return openaiSpeechModels.some((m) => m.modelId === modelId);
+}
+function calculateOpenAISpeechCost(modelId, characterCount, estimatedMinutes) {
+  const spec = getOpenAISpeechModelSpec(modelId);
+  if (!spec) return void 0;
+  if (spec.pricing.perCharacter !== void 0) {
+    return characterCount * spec.pricing.perCharacter;
+  }
+  if (spec.pricing.perMinute !== void 0 && estimatedMinutes !== void 0) {
+    return estimatedMinutes * spec.pricing.perMinute;
+  }
+  if (spec.pricing.perMinute !== void 0) {
+    const approxMinutes = characterCount / 750;
+    return approxMinutes * spec.pricing.perMinute;
+  }
+  return void 0;
+}
+var OPENAI_TTS_VOICES, OPENAI_TTS_EXTENDED_VOICES, OPENAI_TTS_FORMATS, openaiSpeechModels;
+var init_openai_speech_models = __esm({
+  "src/providers/openai-speech-models.ts"() {
+    "use strict";
+    OPENAI_TTS_VOICES = [
+      "alloy",
+      "echo",
+      "fable",
+      "onyx",
+      "nova",
+      "shimmer"
+    ];
+    OPENAI_TTS_EXTENDED_VOICES = [
+      ...OPENAI_TTS_VOICES,
+      "ash",
+      "ballad",
+      "coral",
+      "sage",
+      "verse"
+    ];
+    OPENAI_TTS_FORMATS = ["mp3", "opus", "aac", "flac", "wav", "pcm"];
+    openaiSpeechModels = [
+      // Standard TTS models (character-based pricing)
+      {
+        provider: "openai",
+        modelId: "tts-1",
+        displayName: "TTS-1",
+        pricing: {
+          // $15 per 1M characters = $0.000015 per character
+          perCharacter: 15e-6
+        },
+        voices: [...OPENAI_TTS_VOICES],
+        formats: OPENAI_TTS_FORMATS,
+        maxInputLength: 4096,
+        defaultVoice: "alloy",
+        defaultFormat: "mp3",
+        features: {
+          voiceInstructions: false
+        }
+      },
+      {
+        provider: "openai",
+        modelId: "tts-1-1106",
+        displayName: "TTS-1 (Nov 2023)",
+        pricing: {
+          perCharacter: 15e-6
+        },
+        voices: [...OPENAI_TTS_VOICES],
+        formats: OPENAI_TTS_FORMATS,
+        maxInputLength: 4096,
+        defaultVoice: "alloy",
+        defaultFormat: "mp3",
+        features: {
+          voiceInstructions: false
+        }
+      },
+      {
+        provider: "openai",
+        modelId: "tts-1-hd",
+        displayName: "TTS-1 HD",
+        pricing: {
+          // $30 per 1M characters = $0.00003 per character
+          perCharacter: 3e-5
+        },
+        voices: [...OPENAI_TTS_VOICES],
+        formats: OPENAI_TTS_FORMATS,
+        maxInputLength: 4096,
+        defaultVoice: "alloy",
+        defaultFormat: "mp3",
+        features: {
+          voiceInstructions: false
+        }
+      },
+      {
+        provider: "openai",
+        modelId: "tts-1-hd-1106",
+        displayName: "TTS-1 HD (Nov 2023)",
+        pricing: {
+          perCharacter: 3e-5
+        },
+        voices: [...OPENAI_TTS_VOICES],
+        formats: OPENAI_TTS_FORMATS,
+        maxInputLength: 4096,
+        defaultVoice: "alloy",
+        defaultFormat: "mp3",
+        features: {
+          voiceInstructions: false
+        }
+      },
+      // Token-based TTS model with voice instructions support
+      {
+        provider: "openai",
+        modelId: "gpt-4o-mini-tts",
+        displayName: "GPT-4o Mini TTS",
+        pricing: {
+          // $0.60 per 1M input tokens = $0.0000006 per token
+          perInputToken: 6e-7,
+          // $12 per 1M audio output tokens = $0.000012 per token
+          perAudioOutputToken: 12e-6,
+          // ~$0.015 per minute of audio
+          perMinute: 0.015
+        },
+        voices: [...OPENAI_TTS_EXTENDED_VOICES],
+        formats: OPENAI_TTS_FORMATS,
+        maxInputLength: 2e3,
+        // tokens, not characters
+        defaultVoice: "alloy",
+        defaultFormat: "mp3",
+        features: {
+          voiceInstructions: true
+        }
+      }
+    ];
+  }
+});
+
 // src/providers/openai.ts
 function sanitizeExtra(extra, allowTemperature) {
   if (!extra) {
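And for the speech specs (sketch):

    calculateOpenAISpeechCost("tts-1", 1000);    // 1000 * 0.000015 = 0.015
    calculateOpenAISpeechCost("tts-1-hd", 1000); // 1000 * 0.00003  = 0.03
    // gpt-4o-mini-tts has no perCharacter rate, so without estimatedMinutes it
    // falls back to the 750-characters-per-minute heuristic:
    calculateOpenAISpeechCost("gpt-4o-mini-tts", 1000); // (1000 / 750) * 0.015 = 0.02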
@@ -5524,7 +6294,9 @@ var init_openai = __esm({
   import_tiktoken = require("tiktoken");
   init_base_provider();
   init_constants2();
+  init_openai_image_models();
   init_openai_models();
+  init_openai_speech_models();
   init_utils();
   ROLE_MAP = {
     system: "system",
@@ -5539,6 +6311,87 @@ var init_openai = __esm({
   getModelSpecs() {
     return OPENAI_MODELS;
   }
+  // =========================================================================
+  // Image Generation
+  // =========================================================================
+  getImageModelSpecs() {
+    return openaiImageModels;
+  }
+  supportsImageGeneration(modelId) {
+    return isOpenAIImageModel(modelId);
+  }
+  async generateImage(options) {
+    const client = this.client;
+    const spec = getOpenAIImageModelSpec(options.model);
+    const size = options.size ?? spec?.defaultSize ?? "1024x1024";
+    const quality = options.quality ?? spec?.defaultQuality ?? "standard";
+    const n = options.n ?? 1;
+    const isDallE2 = options.model === "dall-e-2";
+    const isGptImage = options.model.startsWith("gpt-image");
+    const requestParams = {
+      model: options.model,
+      prompt: options.prompt,
+      size,
+      n
+    };
+    if (!isDallE2 && !isGptImage) {
+      requestParams.quality = quality;
+    }
+    if (isGptImage) {
+    } else if (!isDallE2) {
+      requestParams.response_format = options.responseFormat ?? "url";
+    }
+    const response = await client.images.generate(requestParams);
+    const cost = calculateOpenAIImageCost(options.model, size, quality, n);
+    const images = response.data ?? [];
+    return {
+      images: images.map((img) => ({
+        url: img.url,
+        b64Json: img.b64_json,
+        revisedPrompt: img.revised_prompt
+      })),
+      model: options.model,
+      usage: {
+        imagesGenerated: images.length,
+        size,
+        quality
+      },
+      cost
+    };
+  }
+  // =========================================================================
+  // Speech Generation
+  // =========================================================================
+  getSpeechModelSpecs() {
+    return openaiSpeechModels;
+  }
+  supportsSpeechGeneration(modelId) {
+    return isOpenAISpeechModel(modelId);
+  }
+  async generateSpeech(options) {
+    const client = this.client;
+    const spec = getOpenAISpeechModelSpec(options.model);
+    const format = options.responseFormat ?? spec?.defaultFormat ?? "mp3";
+    const voice = options.voice ?? spec?.defaultVoice ?? "alloy";
+    const response = await client.audio.speech.create({
+      model: options.model,
+      input: options.input,
+      voice,
+      response_format: format,
+      speed: options.speed ?? 1
+    });
+    const audioBuffer = await response.arrayBuffer();
+    const cost = calculateOpenAISpeechCost(options.model, options.input.length);
+    return {
+      audio: audioBuffer,
+      model: options.model,
+      usage: {
+        characterCount: options.input.length
+      },
+      cost,
+      format
+    };
+  }
   buildRequestPayload(options, descriptor, spec, messages) {
     const { maxTokens, temperature, topP, stopSequences, extra } = options;
     const supportsTemperature = spec?.metadata?.supportsTemperature !== false;
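A behavioral note on `generateImage` above: the intentionally empty `if (isGptImage) {}` branch means `gpt-image-*` requests send neither `quality` (in this request shape) nor `response_format`, so GPT Image results come back base64-encoded, while DALL-E 3 defaults to `response_format: "url"`. A speech usage sketch via the client namespace added later in this diff:

    const speech = await client.speech.generate({
      model: "tts-1",
      input: "Streaming tools, now with a voice.",
      voice: "nova",
      responseFormat: "mp3",
      speed: 1.25
    });
    // speech.audio is an ArrayBuffer; for tts-1, cost = input.length * 0.000015.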
@@ -5879,30 +6732,109 @@ var init_model_registry = __esm({
   }
 });

-// src/core/options.ts
-var ModelIdentifierParser;
-var init_options = __esm({
-  "src/core/options.ts"() {
+// src/core/namespaces/image.ts
+var ImageNamespace;
+var init_image = __esm({
+  "src/core/namespaces/image.ts"() {
     "use strict";
-    ModelIdentifierParser = class {
-      constructor(defaultProvider = "openai") {
+    ImageNamespace = class {
+      constructor(adapters, defaultProvider) {
+        this.adapters = adapters;
+        this.defaultProvider = defaultProvider;
+      }
+      /**
+       * Generate images from a text prompt.
+       *
+       * @param options - Image generation options
+       * @returns Promise resolving to the generation result with images and cost
+       * @throws Error if the provider doesn't support image generation
+       */
+      async generate(options) {
+        const modelId = options.model;
+        const adapter = this.findImageAdapter(modelId);
+        if (!adapter || !adapter.generateImage) {
+          throw new Error(
+            `No provider supports image generation for model "${modelId}". Available image models: ${this.listModels().map((m) => m.modelId).join(", ")}`
+          );
+        }
+        return adapter.generateImage(options);
+      }
+      /**
+       * List all available image generation models.
+       */
+      listModels() {
+        const models = [];
+        for (const adapter of this.adapters) {
+          if (adapter.getImageModelSpecs) {
+            models.push(...adapter.getImageModelSpecs());
+          }
+        }
+        return models;
+      }
+      /**
+       * Check if a model is supported for image generation.
+       */
+      supportsModel(modelId) {
+        return this.findImageAdapter(modelId) !== void 0;
+      }
+      findImageAdapter(modelId) {
+        return this.adapters.find(
+          (adapter) => adapter.supportsImageGeneration?.(modelId) ?? false
+        );
+      }
+    };
+  }
+});
+
+// src/core/namespaces/speech.ts
+var SpeechNamespace;
+var init_speech = __esm({
+  "src/core/namespaces/speech.ts"() {
+    "use strict";
+    SpeechNamespace = class {
+      constructor(adapters, defaultProvider) {
+        this.adapters = adapters;
         this.defaultProvider = defaultProvider;
       }
-      parse(identifier) {
-        const trimmed = identifier.trim();
-        if (!trimmed) {
-          throw new Error("Model identifier cannot be empty");
-        }
-        const [maybeProvider, ...rest] = trimmed.split(":");
-        if (rest.length === 0) {
-          return { provider: this.defaultProvider, name: maybeProvider };
+      /**
+       * Generate speech audio from text.
+       *
+       * @param options - Speech generation options
+       * @returns Promise resolving to the generation result with audio and cost
+       * @throws Error if the provider doesn't support speech generation
+       */
+      async generate(options) {
+        const modelId = options.model;
+        const adapter = this.findSpeechAdapter(modelId);
+        if (!adapter || !adapter.generateSpeech) {
+          throw new Error(
+            `No provider supports speech generation for model "${modelId}". Available speech models: ${this.listModels().map((m) => m.modelId).join(", ")}`
+          );
         }
-        const provider = maybeProvider;
-        const name = rest.join(":");
-        if (!name) {
-          throw new Error("Model name cannot be empty");
+        return adapter.generateSpeech(options);
+      }
+      /**
+       * List all available speech generation models.
+       */
+      listModels() {
+        const models = [];
+        for (const adapter of this.adapters) {
+          if (adapter.getSpeechModelSpecs) {
+            models.push(...adapter.getSpeechModelSpecs());
+          }
         }
-        return { provider, name };
+        return models;
+      }
+      /**
+       * Check if a model is supported for speech generation.
+       */
+      supportsModel(modelId) {
+        return this.findSpeechAdapter(modelId) !== void 0;
+      }
+      findSpeechAdapter(modelId) {
+        return this.adapters.find(
+          (adapter) => adapter.supportsSpeechGeneration?.(modelId) ?? false
+        );
       }
     };
   }
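Both namespaces share the same adapter-probing pattern; a sketch of the observable behavior (assuming OpenAI and Anthropic adapters are registered):

    client.image.supportsModel("dall-e-3");          // true: the OpenAI adapter claims it
    client.image.supportsModel("claude-sonnet-4-5"); // false: Anthropic opts out above
    client.image.listModels().map((m) => m.modelId); // union of every adapter's image specs
    // generate() with an unsupported model throws, listing the available model IDs.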
@@ -5951,6 +6883,69 @@ var init_quick_methods = __esm({
   }
 });

+// src/core/namespaces/text.ts
+var TextNamespace;
+var init_text = __esm({
+  "src/core/namespaces/text.ts"() {
+    "use strict";
+    init_quick_methods();
+    TextNamespace = class {
+      constructor(client) {
+        this.client = client;
+      }
+      /**
+       * Generate a complete text response.
+       *
+       * @param prompt - User prompt
+       * @param options - Optional configuration
+       * @returns Complete text response
+       */
+      async complete(prompt, options) {
+        return complete(this.client, prompt, options);
+      }
+      /**
+       * Stream text chunks.
+       *
+       * @param prompt - User prompt
+       * @param options - Optional configuration
+       * @returns Async generator yielding text chunks
+       */
+      stream(prompt, options) {
+        return stream(this.client, prompt, options);
+      }
+    };
+  }
+});
+
+// src/core/options.ts
+var ModelIdentifierParser;
+var init_options = __esm({
+  "src/core/options.ts"() {
+    "use strict";
+    ModelIdentifierParser = class {
+      constructor(defaultProvider = "openai") {
+        this.defaultProvider = defaultProvider;
+      }
+      parse(identifier) {
+        const trimmed = identifier.trim();
+        if (!trimmed) {
+          throw new Error("Model identifier cannot be empty");
+        }
+        const [maybeProvider, ...rest] = trimmed.split(":");
+        if (rest.length === 0) {
+          return { provider: this.defaultProvider, name: maybeProvider };
+        }
+        const provider = maybeProvider;
+        const name = rest.join(":");
+        if (!name) {
+          throw new Error("Model name cannot be empty");
+        }
+        return { provider, name };
+      }
+    };
+  }
+});
+
 // src/core/client.ts
 var client_exports = {};
 __export(client_exports, {
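`ModelIdentifierParser` is unchanged apart from being moved below the new namespaces; its `provider:name` grammar by example (sketch):

    const parser = new ModelIdentifierParser("openai");
    parser.parse("anthropic:claude-sonnet-4-5"); // { provider: "anthropic", name: "claude-sonnet-4-5" }
    parser.parse("gpt-5-nano");                  // { provider: "openai", name: "gpt-5-nano" } (default provider)
    parser.parse("custom:org:model-v1");         // { provider: "custom", name: "org:model-v1" } (rest rejoined)
    parser.parse("   ");                         // throws "Model identifier cannot be empty"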
@@ -5963,12 +6958,20 @@ var init_client = __esm({
   init_builder();
   init_discovery();
   init_model_registry();
+  init_image();
+  init_speech();
+  init_text();
   init_options();
   init_quick_methods();
   LLMist = class _LLMist {
     parser;
+    defaultProvider;
     modelRegistry;
     adapters;
+    // Namespaces for different generation types
+    text;
+    image;
+    speech;
     constructor(...args) {
       let adapters = [];
       let defaultProvider;
@@ -6007,6 +7010,7 @@ var init_client = __esm({
       const priorityB = b.priority ?? 0;
       return priorityB - priorityA;
     });
+    this.defaultProvider = resolvedDefaultProvider;
     this.parser = new ModelIdentifierParser(resolvedDefaultProvider);
     this.modelRegistry = new ModelRegistry();
     for (const adapter of this.adapters) {
@@ -6015,6 +7019,9 @@ var init_client = __esm({
     if (customModels.length > 0) {
       this.modelRegistry.registerModels(customModels);
     }
+    this.text = new TextNamespace(this);
+    this.image = new ImageNamespace(this.adapters, this.defaultProvider);
+    this.speech = new SpeechNamespace(this.adapters, this.defaultProvider);
   }
   stream(options) {
     const descriptor = this.parser.parse(options.model);
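After construction, the three namespaces hang off every client instance; a usage sketch (constructor arguments elided, as the surrounding code accepts several shapes):

    const llm = new LLMist();
    await llm.text.complete("Say hi");
    await llm.image.generate({ model: "dall-e-3", prompt: "a fox" });
    await llm.speech.generate({ model: "tts-1", input: "hi" });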
@@ -6995,7 +8002,9 @@ var COMMANDS = {
   complete: "complete",
   agent: "agent",
   models: "models",
-  gadget: "gadget"
+  gadget: "gadget",
+  image: "image",
+  speech: "speech"
 };
 var LOG_LEVELS = ["silly", "trace", "debug", "info", "warn", "error", "fatal"];
 var DEFAULT_MODEL = "openai:gpt-5-nano";
@@ -7016,7 +8025,17 @@ var OPTION_FLAGS = {
   docker: "--docker",
   dockerRo: "--docker-ro",
   noDocker: "--no-docker",
-  dockerDev: "--docker-dev"
+  dockerDev: "--docker-dev",
+  // Image generation options
+  imageSize: "--size <size>",
+  imageQuality: "--quality <quality>",
+  imageCount: "-n, --count <number>",
+  imageOutput: "-o, --output <path>",
+  // Speech generation options
+  voice: "--voice <name>",
+  speechFormat: "--format <format>",
+  speechSpeed: "--speed <value>",
+  speechOutput: "-o, --output <path>"
 };
 var OPTION_DESCRIPTIONS = {
   model: "Model identifier, e.g. openai:gpt-5-nano or anthropic:claude-sonnet-4-5.",
@@ -7035,7 +8054,17 @@ var OPTION_DESCRIPTIONS = {
   docker: "Run agent in a Docker sandbox container for security isolation.",
   dockerRo: "Run in Docker with current directory mounted read-only.",
   noDocker: "Disable Docker sandboxing (override config).",
-  dockerDev: "Run in Docker dev mode (mount local source instead of npm install)."
+  dockerDev: "Run in Docker dev mode (mount local source instead of npm install).",
+  // Image generation descriptions
+  imageSize: "Image size/aspect ratio, e.g. '1024x1024', '1:1', '16:9'.",
+  imageQuality: "Image quality: 'standard', 'hd', 'low', 'medium', 'high'.",
+  imageCount: "Number of images to generate (model dependent, usually 1-4).",
+  imageOutput: "Output path for the generated image. Defaults to stdout if not specified.",
+  // Speech generation descriptions
+  voice: "Voice name for speech generation, e.g. 'nova', 'alloy', 'Zephyr'.",
+  speechFormat: "Audio format: 'mp3', 'opus', 'aac', 'flac', 'wav', 'pcm'.",
+  speechSpeed: "Speech speed multiplier (0.25 to 4.0, default 1.0).",
+  speechOutput: "Output path for audio file. Defaults to stdout if not specified."
 };
 var SUMMARY_PREFIX = "[llmist]";

@@ -7045,8 +8074,8 @@ var import_commander2 = require("commander");
 // package.json
 var package_default = {
   name: "llmist",
-  version: "2.1.0",
-  description: "Universal TypeScript LLM client with streaming-first agent framework. Works with any model - no structured outputs or native tool calling required. Implements its own flexible grammar for function calling.",
+  version: "2.4.0",
+  description: "TypeScript LLM client with streaming tool execution. Tools fire mid-stream. Built-in function calling works with any model\u2014no structured outputs or native tool support required.",
   type: "module",
   main: "dist/index.cjs",
   module: "dist/index.js",
@@ -7120,9 +8149,16 @@ var package_default = {
     "universal-client",
     "multi-provider",
     "hooks",
-    "gadgets"
+    "gadgets",
+    "chatbot",
+    "chatgpt",
+    "agentic",
+    "language-model",
+    "generative-ai",
+    "bun",
+    "nodejs"
   ],
-  author: "",
+  author: "Zbigniew Sobiecki <zbigniew@sobiecki.name>",
   license: "MIT",
   dependencies: {
     "@anthropic-ai/sdk": "^0.69.0",
@@ -9120,6 +10156,22 @@ var AGENT_CONFIG_KEYS = /* @__PURE__ */ new Set([
   "docker-cwd-permission"
   // Override CWD mount permission for this profile
 ]);
+var IMAGE_CONFIG_KEYS = /* @__PURE__ */ new Set([
+  "model",
+  "size",
+  "quality",
+  "count",
+  "output",
+  "quiet"
+]);
+var SPEECH_CONFIG_KEYS = /* @__PURE__ */ new Set([
+  "model",
+  "voice",
+  "format",
+  "speed",
+  "output",
+  "quiet"
+]);
 var CUSTOM_CONFIG_KEYS = /* @__PURE__ */ new Set([
   ...COMPLETE_CONFIG_KEYS,
   ...AGENT_CONFIG_KEYS,
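A sketch of what the new validators (next hunk) accept and reject for the `[image]` and `[speech]` config tables:

    validateImageConfig({ model: "dall-e-3", count: 2, quiet: true }, "image"); // ok
    validateSpeechConfig({ voice: "nova", speed: 1.5 }, "speech");              // ok
    validateImageConfig({ resolution: "4K" }, "image");
    // -> ConfigError: [image].resolution is not a valid option
    validateSpeechConfig({ speed: 9 }, "speech"); // rejected: speed must be within 0.25-4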
@@ -9380,6 +10432,75 @@ function validateAgentConfig(raw, section) {
   }
   return result;
 }
+function validateImageConfig(raw, section) {
+  if (typeof raw !== "object" || raw === null) {
+    throw new ConfigError(`[${section}] must be a table`);
+  }
+  const rawObj = raw;
+  for (const key of Object.keys(rawObj)) {
+    if (!IMAGE_CONFIG_KEYS.has(key)) {
+      throw new ConfigError(`[${section}].${key} is not a valid option`);
+    }
+  }
+  const result = {};
+  if ("model" in rawObj) {
+    result.model = validateString(rawObj.model, "model", section);
+  }
+  if ("size" in rawObj) {
+    result.size = validateString(rawObj.size, "size", section);
+  }
+  if ("quality" in rawObj) {
+    result.quality = validateString(rawObj.quality, "quality", section);
+  }
+  if ("count" in rawObj) {
+    result.count = validateNumber(rawObj.count, "count", section, {
+      integer: true,
+      min: 1,
+      max: 10
+    });
+  }
+  if ("output" in rawObj) {
+    result.output = validateString(rawObj.output, "output", section);
+  }
+  if ("quiet" in rawObj) {
+    result.quiet = validateBoolean(rawObj.quiet, "quiet", section);
+  }
+  return result;
+}
+function validateSpeechConfig(raw, section) {
+  if (typeof raw !== "object" || raw === null) {
+    throw new ConfigError(`[${section}] must be a table`);
+  }
+  const rawObj = raw;
+  for (const key of Object.keys(rawObj)) {
+    if (!SPEECH_CONFIG_KEYS.has(key)) {
+      throw new ConfigError(`[${section}].${key} is not a valid option`);
+    }
+  }
+  const result = {};
+  if ("model" in rawObj) {
+    result.model = validateString(rawObj.model, "model", section);
+  }
+  if ("voice" in rawObj) {
+    result.voice = validateString(rawObj.voice, "voice", section);
+  }
+  if ("format" in rawObj) {
+    result.format = validateString(rawObj.format, "format", section);
+  }
+  if ("speed" in rawObj) {
+    result.speed = validateNumber(rawObj.speed, "speed", section, {
+      min: 0.25,
+      max: 4
+    });
+  }
+  if ("output" in rawObj) {
+    result.output = validateString(rawObj.output, "output", section);
+  }
+  if ("quiet" in rawObj) {
+    result.quiet = validateBoolean(rawObj.quiet, "quiet", section);
+  }
+  return result;
+}
 function validateStringOrBoolean(value, field, section) {
   if (typeof value === "string" || typeof value === "boolean") {
     return value;
@@ -9502,6 +10623,10 @@ function validateConfig(raw, configPath) {
     result.complete = validateCompleteConfig(value, key);
   } else if (key === "agent") {
     result.agent = validateAgentConfig(value, key);
+  } else if (key === "image") {
+    result.image = validateImageConfig(value, key);
+  } else if (key === "speech") {
+    result.speech = validateSpeechConfig(value, key);
   } else if (key === "prompts") {
     result.prompts = validatePromptsConfig(value, key);
   } else if (key === "docker") {
@@ -9546,7 +10671,7 @@ function loadConfig() {
   return resolveTemplatesInConfig(inherited, configPath);
 }
 function getCustomCommandNames(config) {
-  const reserved = /* @__PURE__ */ new Set(["global", "complete", "agent", "prompts", "docker"]);
+  const reserved = /* @__PURE__ */ new Set(["global", "complete", "agent", "image", "speech", "prompts", "docker"]);
   return Object.keys(config).filter((key) => !reserved.has(key));
 }
 function resolveTemplatesInConfig(config, configPath) {
@@ -11141,19 +12266,118 @@ function registerGadgetCommand(program, env) {
   );
 }

+// src/cli/image-command.ts
+var import_node_fs11 = require("fs");
+var DEFAULT_IMAGE_MODEL = "dall-e-3";
+async function executeImage(promptArg, options, env) {
+  const prompt = await resolvePrompt(promptArg, env);
+  const client = env.createClient();
+  const model = options.model;
+  const n = options.count ? Number.parseInt(options.count, 10) : 1;
+  const stderrTTY = env.stderr.isTTY === true;
+  if (!options.quiet && stderrTTY) {
+    env.stderr.write(`${SUMMARY_PREFIX} Generating image with ${model}...
+`);
+  }
+  const result = await client.image.generate({
+    model,
+    prompt,
+    size: options.size,
+    quality: options.quality,
+    n,
+    responseFormat: options.output ? "b64_json" : "url"
+  });
+  if (options.output) {
+    const imageData = result.images[0];
+    if (imageData.b64Json) {
+      const buffer = Buffer.from(imageData.b64Json, "base64");
+      (0, import_node_fs11.writeFileSync)(options.output, buffer);
+      if (!options.quiet) {
+        env.stderr.write(`${SUMMARY_PREFIX} Image saved to ${options.output}
+`);
+      }
+    } else if (imageData.url) {
+      env.stdout.write(`${imageData.url}
+`);
+    }
+  } else {
+    for (const image of result.images) {
+      if (image.url) {
+        env.stdout.write(`${image.url}
+`);
+      } else if (image.b64Json) {
+        env.stdout.write(image.b64Json);
+      }
+    }
+  }
+  if (!options.quiet && stderrTTY) {
+    const parts = [
+      `${result.images.length} image(s)`,
+      `size: ${result.usage.size}`,
+      `quality: ${result.usage.quality}`
+    ];
+    if (result.cost !== void 0) {
+      parts.push(`cost: ${formatCost(result.cost)}`);
+    }
+    env.stderr.write(`${SUMMARY_PREFIX} ${parts.join(" | ")}
+`);
+  }
+}
+function registerImageCommand(program, env, config) {
+  program.command(COMMANDS.image).description("Generate images from a text prompt.").argument("[prompt]", "Image generation prompt. If omitted, stdin is used when available.").option(
+    OPTION_FLAGS.model,
+    OPTION_DESCRIPTIONS.model,
+    config?.model ?? DEFAULT_IMAGE_MODEL
+  ).option(OPTION_FLAGS.imageSize, OPTION_DESCRIPTIONS.imageSize, config?.size).option(OPTION_FLAGS.imageQuality, OPTION_DESCRIPTIONS.imageQuality, config?.quality).option(OPTION_FLAGS.imageCount, OPTION_DESCRIPTIONS.imageCount, config?.count?.toString()).option(OPTION_FLAGS.imageOutput, OPTION_DESCRIPTIONS.imageOutput, config?.output).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, config?.quiet ?? false).action(
+    (prompt, options) => executeAction(() => executeImage(prompt, options, env), env)
+  );
+}
+
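In `executeImage` above, `-o/--output` switches the request to `b64_json` and writes the decoded first image to disk, while the no-output path requests URLs and prints one per line on stdout; progress and the cost summary go to stderr, and only when stderr is a TTY and `--quiet` is unset. An inferred invocation (illustrative, built from the flag definitions earlier in this diff): `llmist image "a red fox" --size 1024x1024 --quality hd -o fox.png`.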
 // src/cli/models-command.ts
 var import_chalk8 = __toESM(require("chalk"), 1);
 init_model_shortcuts();
 async function handleModelsCommand(options, env) {
   const client = env.createClient();
-  const models = client.modelRegistry.listModels(options.provider);
+  const showText = options.all || options.text || !options.image && !options.speech;
+  const showImage = options.all || options.image;
+  const showSpeech = options.all || options.speech;
+  const textModels = showText ? client.modelRegistry.listModels(options.provider) : [];
+  const imageModels = showImage ? client.image.listModels().filter((m) => !options.provider || m.provider === options.provider) : [];
+  const speechModels = showSpeech ? client.speech.listModels().filter((m) => !options.provider || m.provider === options.provider) : [];
   if (options.format === "json") {
-    renderJSON(models, env.stdout);
+    renderJSON(textModels, imageModels, speechModels, env.stdout);
   } else {
-    renderTable(models, options.verbose || false, env.stdout);
+    renderAllTables(textModels, imageModels, speechModels, options.verbose || false, env.stdout);
+  }
+}
+function renderAllTables(textModels, imageModels, speechModels, verbose, stream2) {
+  const hasAnyModels = textModels.length > 0 || imageModels.length > 0 || speechModels.length > 0;
+  if (!hasAnyModels) {
+    stream2.write(import_chalk8.default.yellow("\nNo models found matching the specified criteria.\n\n"));
+    return;
+  }
+  stream2.write(import_chalk8.default.bold.cyan("\nAvailable Models\n"));
+  stream2.write(import_chalk8.default.cyan("=".repeat(80)) + "\n\n");
+  if (textModels.length > 0) {
+    renderTextTable(textModels, verbose, stream2);
+  }
+  if (imageModels.length > 0) {
+    renderImageTable(imageModels, verbose, stream2);
+  }
+  if (speechModels.length > 0) {
+    renderSpeechTable(speechModels, verbose, stream2);
+  }
+  if (textModels.length > 0) {
+    stream2.write(import_chalk8.default.bold.magenta("Model Shortcuts\n"));
+    stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n");
+    const shortcuts = Object.entries(MODEL_ALIASES).sort((a, b) => a[0].localeCompare(b[0]));
+    for (const [shortcut, fullName] of shortcuts) {
+      stream2.write(import_chalk8.default.cyan(` ${shortcut.padEnd(15)}`) + import_chalk8.default.dim(" \u2192 ") + import_chalk8.default.white(fullName) + "\n");
+    }
+    stream2.write("\n");
   }
 }
-function renderTable(models, verbose, stream2) {
+function renderTextTable(models, verbose, stream2) {
   const grouped = /* @__PURE__ */ new Map();
   for (const model of models) {
     const provider = model.provider;
@@ -11162,13 +12386,13 @@ function renderTable(models, verbose, stream2) {
     }
     grouped.get(provider).push(model);
   }
-  stream2.write(import_chalk8.default.bold.cyan("\nAvailable Models\n"));
-  stream2.write(import_chalk8.default.cyan("=".repeat(80)) + "\n\n");
+  stream2.write(import_chalk8.default.bold.blue("\u{1F4DD} Text/LLM Models\n"));
+  stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n\n");
   const providers = Array.from(grouped.keys()).sort();
   for (const provider of providers) {
     const providerModels = grouped.get(provider);
     const providerName = provider.charAt(0).toUpperCase() + provider.slice(1);
-    stream2.write(import_chalk8.default.bold.yellow(`${providerName} Models
+    stream2.write(import_chalk8.default.bold.yellow(`${providerName}
 `));
     if (verbose) {
       renderVerboseTable(providerModels, stream2);
@@ -11177,13 +12401,6 @@ function renderTable(models, verbose, stream2) {
     }
     stream2.write("\n");
   }
-  stream2.write(import_chalk8.default.bold.magenta("Model Shortcuts\n"));
-  stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n");
-  const shortcuts = Object.entries(MODEL_ALIASES).sort((a, b) => a[0].localeCompare(b[0]));
-  for (const [shortcut, fullName] of shortcuts) {
-    stream2.write(import_chalk8.default.cyan(` ${shortcut.padEnd(15)}`) + import_chalk8.default.dim(" \u2192 ") + import_chalk8.default.white(fullName) + "\n");
-  }
-  stream2.write("\n");
 }
 function renderCompactTable(models, stream2) {
   const idWidth = 25;
@@ -11260,9 +12477,171 @@ function renderVerboseTable(models, stream2) {
11260
12477
  }
11261
12478
  stream2.write("\n");
11262
12479
  }
11263
- function renderJSON(models, stream2) {
11264
- const output = {
11265
- models: models.map((model) => ({
12480
+ function renderImageTable(models, verbose, stream2) {
12481
+ stream2.write(import_chalk8.default.bold.green("\u{1F3A8} Image Generation Models\n"));
12482
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n\n");
12483
+ const grouped = /* @__PURE__ */ new Map();
12484
+ for (const model of models) {
12485
+ if (!grouped.has(model.provider)) {
12486
+ grouped.set(model.provider, []);
12487
+ }
12488
+ grouped.get(model.provider).push(model);
12489
+ }
12490
+ for (const [provider, providerModels] of Array.from(grouped.entries()).sort()) {
12491
+ const providerName = provider.charAt(0).toUpperCase() + provider.slice(1);
12492
+ stream2.write(import_chalk8.default.bold.yellow(`${providerName}
12493
+ `));
12494
+ if (verbose) {
12495
+ for (const model of providerModels) {
12496
+ stream2.write(import_chalk8.default.bold.green(`
12497
+ ${model.modelId}
12498
+ `));
12499
+ stream2.write(import_chalk8.default.dim(" " + "\u2500".repeat(60)) + "\n");
12500
+ stream2.write(` ${import_chalk8.default.dim("Name:")} ${import_chalk8.default.white(model.displayName)}
12501
+ `);
12502
+ stream2.write(` ${import_chalk8.default.dim("Sizes:")} ${import_chalk8.default.yellow(model.supportedSizes.join(", "))}
12503
+ `);
12504
+ if (model.supportedQualities) {
12505
+ stream2.write(` ${import_chalk8.default.dim("Qualities:")} ${import_chalk8.default.yellow(model.supportedQualities.join(", "))}
12506
+ `);
12507
+ }
12508
+ stream2.write(` ${import_chalk8.default.dim("Max Images:")} ${import_chalk8.default.yellow(model.maxImages.toString())}
12509
+ `);
12510
+ stream2.write(` ${import_chalk8.default.dim("Pricing:")} ${import_chalk8.default.cyan(formatImagePrice(model))}
12511
+ `);
12512
+ if (model.features) {
12513
+ const features = [];
12514
+ if (model.features.textRendering) features.push("text-rendering");
12515
+ if (model.features.transparency) features.push("transparency");
12516
+ if (model.features.conversational) features.push("conversational");
12517
+ if (features.length > 0) {
12518
+ stream2.write(` ${import_chalk8.default.dim("Features:")} ${import_chalk8.default.blue(features.join(", "))}
12519
+ `);
12520
+ }
12521
+ }
12522
+ }
12523
+ } else {
12524
+ const idWidth = 32;
12525
+ const nameWidth = 25;
12526
+ const sizesWidth = 20;
12527
+ const priceWidth = 15;
12528
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + sizesWidth + priceWidth + 6)) + "\n");
12529
+ stream2.write(
12530
+ import_chalk8.default.bold(
12531
+ "Model ID".padEnd(idWidth) + " " + "Display Name".padEnd(nameWidth) + " " + "Sizes".padEnd(sizesWidth) + " " + "Price".padEnd(priceWidth)
12532
+ ) + "\n"
12533
+ );
12534
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + sizesWidth + priceWidth + 6)) + "\n");
12535
+ for (const model of providerModels) {
12536
+ const sizes = model.supportedSizes.length > 2 ? model.supportedSizes.slice(0, 2).join(", ") + "..." : model.supportedSizes.join(", ");
12537
+ stream2.write(
12538
+ import_chalk8.default.green(model.modelId.padEnd(idWidth)) + " " + import_chalk8.default.white(model.displayName.substring(0, nameWidth - 1).padEnd(nameWidth)) + " " + import_chalk8.default.yellow(sizes.padEnd(sizesWidth)) + " " + import_chalk8.default.cyan(formatImagePrice(model).padEnd(priceWidth)) + "\n"
12539
+ );
12540
+ }
12541
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + sizesWidth + priceWidth + 6)) + "\n");
12542
+ }
12543
+ stream2.write("\n");
12544
+ }
12545
+ }
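
Both render functions group models into per-provider buckets with a plain Map before printing. A minimal standalone sketch of that grouping, illustrative only and not the package's exported API:

function groupByProvider<T extends { provider: string }>(models: T[]): Map<string, T[]> {
  // Same Map-based group-by as renderImageTable/renderSpeechTable above.
  const grouped = new Map<string, T[]>();
  for (const model of models) {
    const bucket = grouped.get(model.provider);
    if (bucket) bucket.push(model);
    else grouped.set(model.provider, [model]);
  }
  return grouped;
}

Note that Array.from(grouped.entries()).sort() compares [key, value] pairs after string conversion, so the entries effectively sort by provider name.
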
12546
+ function renderSpeechTable(models, verbose, stream2) {
12547
+ stream2.write(import_chalk8.default.bold.magenta("\u{1F3A4} Speech (TTS) Models\n"));
12548
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(80)) + "\n\n");
12549
+ const grouped = /* @__PURE__ */ new Map();
12550
+ for (const model of models) {
12551
+ if (!grouped.has(model.provider)) {
12552
+ grouped.set(model.provider, []);
12553
+ }
12554
+ grouped.get(model.provider).push(model);
12555
+ }
12556
+ for (const [provider, providerModels] of Array.from(grouped.entries()).sort()) {
12557
+ const providerName = provider.charAt(0).toUpperCase() + provider.slice(1);
12558
+ stream2.write(import_chalk8.default.bold.yellow(`${providerName}
12559
+ `));
12560
+ if (verbose) {
12561
+ for (const model of providerModels) {
12562
+ stream2.write(import_chalk8.default.bold.green(`
12563
+ ${model.modelId}
12564
+ `));
12565
+ stream2.write(import_chalk8.default.dim(" " + "\u2500".repeat(60)) + "\n");
12566
+ stream2.write(` ${import_chalk8.default.dim("Name:")} ${import_chalk8.default.white(model.displayName)}
12567
+ `);
12568
+ stream2.write(` ${import_chalk8.default.dim("Voices:")} ${import_chalk8.default.yellow(model.voices.length.toString())} voices
12569
+ `);
12570
+ if (model.voices.length <= 6) {
12571
+ stream2.write(` ${import_chalk8.default.dim(model.voices.join(", "))}
12572
+ `);
12573
+ } else {
12574
+ stream2.write(` ${import_chalk8.default.dim(model.voices.slice(0, 6).join(", ") + "...")}
12575
+ `);
12576
+ }
12577
+ stream2.write(` ${import_chalk8.default.dim("Formats:")} ${import_chalk8.default.yellow(model.formats.join(", "))}
12578
+ `);
12579
+ stream2.write(` ${import_chalk8.default.dim("Max Input:")} ${import_chalk8.default.yellow(model.maxInputLength.toString())} chars
12580
+ `);
12581
+ stream2.write(` ${import_chalk8.default.dim("Pricing:")} ${import_chalk8.default.cyan(formatSpeechPrice(model))}
12582
+ `);
12583
+ if (model.features) {
12584
+ const features = [];
12585
+ if (model.features.multiSpeaker) features.push("multi-speaker");
12586
+ if (model.features.voiceInstructions) features.push("voice-instructions");
12587
+ if (model.features.languages) features.push(`${model.features.languages} languages`);
12588
+ if (features.length > 0) {
12589
+ stream2.write(` ${import_chalk8.default.dim("Features:")} ${import_chalk8.default.blue(features.join(", "))}
12590
+ `);
12591
+ }
12592
+ }
12593
+ }
12594
+ } else {
12595
+ const idWidth = 30;
12596
+ const nameWidth = 28;
12597
+ const voicesWidth = 12;
12598
+ const priceWidth = 18;
12599
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + voicesWidth + priceWidth + 6)) + "\n");
12600
+ stream2.write(
12601
+ import_chalk8.default.bold(
12602
+ "Model ID".padEnd(idWidth) + " " + "Display Name".padEnd(nameWidth) + " " + "Voices".padEnd(voicesWidth) + " " + "Price".padEnd(priceWidth)
12603
+ ) + "\n"
12604
+ );
12605
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + voicesWidth + priceWidth + 6)) + "\n");
12606
+ for (const model of providerModels) {
12607
+ stream2.write(
12608
+ import_chalk8.default.green(model.modelId.padEnd(idWidth)) + " " + import_chalk8.default.white(model.displayName.substring(0, nameWidth - 1).padEnd(nameWidth)) + " " + import_chalk8.default.yellow(`${model.voices.length} voices`.padEnd(voicesWidth)) + " " + import_chalk8.default.cyan(formatSpeechPrice(model).padEnd(priceWidth)) + "\n"
12609
+ );
12610
+ }
12611
+ stream2.write(import_chalk8.default.dim("\u2500".repeat(idWidth + nameWidth + voicesWidth + priceWidth + 6)) + "\n");
12612
+ }
12613
+ stream2.write("\n");
12614
+ }
12615
+ }
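
In both compact tables the separator width is the sum of the four column widths plus 6, i.e. the three two-space gaps between the four columns; and substring(0, nameWidth - 1) before padEnd(nameWidth) guarantees at least one trailing space even when a display name would otherwise fill its column exactly.
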
12616
+ function formatImagePrice(model) {
12617
+ if (model.pricing.perImage !== void 0) {
12618
+ return `$${model.pricing.perImage.toFixed(2)}/img`;
12619
+ }
12620
+ if (model.pricing.bySize) {
12621
+ const prices = Object.values(model.pricing.bySize);
12622
+ const minPrice = Math.min(...prices.flatMap((p) => typeof p === "number" ? [p] : Object.values(p)));
12623
+ const maxPrice = Math.max(...prices.flatMap((p) => typeof p === "number" ? [p] : Object.values(p)));
12624
+ if (minPrice === maxPrice) {
12625
+ return `$${minPrice.toFixed(2)}/img`;
12626
+ }
12627
+ return `$${minPrice.toFixed(2)}-${maxPrice.toFixed(2)}`;
12628
+ }
12629
+ return "varies";
12630
+ }
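
The image pricing metadata comes in two shapes, which is why the min/max pass flattens with flatMap: a bySize entry may be a flat number or a nested quality-to-price map. A hypothetical example (illustrative values, not real model pricing):

const flatPricing = { perImage: 0.04 };            // -> "$0.04/img"
const tieredPricing = {
  bySize: {
    "1024x1024": 0.04,                             // flat number
    "1536x1024": { standard: 0.08, hd: 0.12 },     // nested quality map
  },
};
// flatMap yields [0.04, 0.08, 0.12]; min 0.04, max 0.12 -> "$0.04-0.12"
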
12631
+ function formatSpeechPrice(model) {
12632
+ if (model.pricing.perCharacter !== void 0) {
12633
+ const perMillion = model.pricing.perCharacter * 1e6;
12634
+ return `$${perMillion.toFixed(0)}/1M chars`;
12635
+ }
12636
+ if (model.pricing.perMinute !== void 0) {
12637
+ return `~$${model.pricing.perMinute.toFixed(2)}/min`;
12638
+ }
12639
+ return "varies";
12640
+ }
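
formatSpeechPrice scales a per-character rate up to a per-million-characters figure: for a hypothetical rate of $0.000015 per character, 0.000015 × 1e6 = 15, rendered as "$15/1M chars". The per-minute fallback is prefixed with "~", presumably because billed audio duration can only be estimated from the input text.
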
12641
+ function renderJSON(textModels, imageModels, speechModels, stream2) {
12642
+ const output = {};
12643
+ if (textModels.length > 0) {
12644
+ output.textModels = textModels.map((model) => ({
11266
12645
  provider: model.provider,
11267
12646
  modelId: model.modelId,
11268
12647
  displayName: model.displayName,
@@ -11278,9 +12657,33 @@ function renderJSON(models, stream2) {
11278
12657
  knowledgeCutoff: model.knowledgeCutoff,
11279
12658
  features: model.features,
11280
12659
  metadata: model.metadata
11281
- })),
11282
- shortcuts: MODEL_ALIASES
11283
- };
12660
+ }));
12661
+ output.shortcuts = MODEL_ALIASES;
12662
+ }
12663
+ if (imageModels.length > 0) {
12664
+ output.imageModels = imageModels.map((model) => ({
12665
+ provider: model.provider,
12666
+ modelId: model.modelId,
12667
+ displayName: model.displayName,
12668
+ supportedSizes: model.supportedSizes,
12669
+ supportedQualities: model.supportedQualities,
12670
+ maxImages: model.maxImages,
12671
+ pricing: model.pricing,
12672
+ features: model.features
12673
+ }));
12674
+ }
12675
+ if (speechModels.length > 0) {
12676
+ output.speechModels = speechModels.map((model) => ({
12677
+ provider: model.provider,
12678
+ modelId: model.modelId,
12679
+ displayName: model.displayName,
12680
+ voices: model.voices,
12681
+ formats: model.formats,
12682
+ maxInputLength: model.maxInputLength,
12683
+ pricing: model.pricing,
12684
+ features: model.features
12685
+ }));
12686
+ }
11284
12687
  stream2.write(JSON.stringify(output, null, 2) + "\n");
11285
12688
  }
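
With --format json the output object carries only the sections that matched the selected type flags, and shortcuts (the MODEL_ALIASES map) is emitted only when text models are included. An illustrative shape for a speech-only listing (field values are placeholders, not real registry data):

{
  "speechModels": [
    {
      "provider": "openai",
      "modelId": "tts-1",
      "displayName": "...",
      "voices": ["nova", "..."],
      "formats": ["..."],
      "maxInputLength": 4096,
      "pricing": { "perCharacter": 0.000015 },
      "features": { "voiceInstructions": true }
    }
  ]
}
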
11286
12689
  function formatTokens2(count) {
@@ -11293,7 +12696,7 @@ function formatTokens2(count) {
11293
12696
  }
11294
12697
  }
11295
12698
  function registerModelsCommand(program, env) {
11296
- program.command(COMMANDS.models).description("List all available LLM models with pricing and capabilities.").option("--provider <name>", "Filter by provider (openai, anthropic, gemini)").option("--format <format>", "Output format: table or json", "table").option("--verbose", "Show detailed model information", false).action(
12699
+ program.command(COMMANDS.models).description("List available models with pricing and capabilities.").option("--provider <name>", "Filter by provider (openai, anthropic, gemini)").option("--format <format>", "Output format: table or json", "table").option("--verbose", "Show detailed model information", false).option("--text", "Show text/LLM models (default if no type specified)").option("--image", "Show image generation models").option("--speech", "Show speech/TTS models").option("--all", "Show all model types (text, image, speech)").action(
11297
12700
  (options) => executeAction(
11298
12701
  () => handleModelsCommand(options, env),
11299
12702
  env
@@ -11301,6 +12704,60 @@ function registerModelsCommand(program, env) {
11301
12704
  );
11302
12705
  }
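
The new type flags make each modality opt-in. Assuming the installed binary is named llmist (after the package; the flags themselves are confirmed by the registration above), typical invocations would be:

llmist models                       # text models (default when no type flag is given)
llmist models --image               # image generation models only
llmist models --speech --verbose    # TTS models with voices, formats, pricing
llmist models --all --format json   # every model type, as JSON
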
11303
12706
 
12707
+ // src/cli/speech-command.ts
12708
+ var import_node_fs12 = require("fs");
12709
+ var DEFAULT_SPEECH_MODEL = "tts-1";
12710
+ var DEFAULT_VOICE = "nova";
12711
+ async function executeSpeech(textArg, options, env) {
12712
+ const text = await resolvePrompt(textArg, env);
12713
+ const client = env.createClient();
12714
+ const model = options.model;
12715
+ const voice = options.voice ?? DEFAULT_VOICE;
12716
+ const speed = options.speed ? Number.parseFloat(options.speed) : void 0;
12717
+ const stderrTTY = env.stderr.isTTY === true;
12718
+ if (!options.quiet && stderrTTY) {
12719
+ env.stderr.write(`${SUMMARY_PREFIX} Generating speech with ${model} (voice: ${voice})...
12720
+ `);
12721
+ }
12722
+ const result = await client.speech.generate({
12723
+ model,
12724
+ input: text,
12725
+ voice,
12726
+ responseFormat: options.format,
12727
+ speed
12728
+ });
12729
+ const audioBuffer = Buffer.from(result.audio);
12730
+ if (options.output) {
12731
+ (0, import_node_fs12.writeFileSync)(options.output, audioBuffer);
12732
+ if (!options.quiet) {
12733
+ env.stderr.write(`${SUMMARY_PREFIX} Audio saved to ${options.output}
12734
+ `);
12735
+ }
12736
+ } else {
12737
+ env.stdout.write(audioBuffer);
12738
+ }
12739
+ if (!options.quiet && stderrTTY) {
12740
+ const parts = [
12741
+ `${result.usage.characterCount} characters`,
12742
+ `format: ${result.format}`
12743
+ ];
12744
+ if (result.cost !== void 0) {
12745
+ parts.push(`cost: ${formatCost(result.cost)}`);
12746
+ }
12747
+ env.stderr.write(`${SUMMARY_PREFIX} ${parts.join(" | ")}
12748
+ `);
12749
+ }
12750
+ }
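
executeSpeech is a thin wrapper over the client's speech surface. A minimal programmatic sketch of the same call, with request/response types inferred from the CLI code above (the SpeechClient type and the "mp3" format name are assumptions, not confirmed by this diff):

import { writeFileSync } from "node:fs";

// Shapes inferred from executeSpeech above; a sketch, not the real API surface.
type SpeechClient = {
  speech: {
    generate(opts: {
      model: string;
      input: string;
      voice: string;
      responseFormat?: string;
      speed?: number;
    }): Promise<{
      audio: Uint8Array;
      format: string;
      cost?: number;
      usage: { characterCount: number };
    }>;
  };
};

async function speak(client: SpeechClient): Promise<void> {
  const result = await client.speech.generate({
    model: "tts-1",        // the CLI's DEFAULT_SPEECH_MODEL
    input: "Hello from llmist.",
    voice: "nova",         // the CLI's DEFAULT_VOICE
    responseFormat: "mp3", // assumed format name
  });
  writeFileSync("speech.mp3", Buffer.from(result.audio));
  console.error(`${result.usage.characterCount} chars | format: ${result.format}`);
  if (result.cost !== undefined) console.error(`cost: $${result.cost.toFixed(4)}`);
}
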
12751
+ function registerSpeechCommand(program, env, config) {
12752
+ program.command(COMMANDS.speech).description("Generate speech audio from text.").argument("[text]", "Text to convert to speech. If omitted, stdin is used when available.").option(
12753
+ OPTION_FLAGS.model,
12754
+ OPTION_DESCRIPTIONS.model,
12755
+ config?.model ?? DEFAULT_SPEECH_MODEL
12756
+ ).option(OPTION_FLAGS.voice, OPTION_DESCRIPTIONS.voice, config?.voice ?? DEFAULT_VOICE).option(OPTION_FLAGS.speechFormat, OPTION_DESCRIPTIONS.speechFormat, config?.format).option(OPTION_FLAGS.speechSpeed, OPTION_DESCRIPTIONS.speechSpeed, config?.speed?.toString()).option(OPTION_FLAGS.speechOutput, OPTION_DESCRIPTIONS.speechOutput, config?.output).option(OPTION_FLAGS.quiet, OPTION_DESCRIPTIONS.quiet, config?.quiet ?? false).action(
12757
+ (text, options) => executeAction(() => executeSpeech(text, options, env), env)
12758
+ );
12759
+ }
12760
+
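
Because the [text] argument falls back to stdin, the speech command pipes naturally, and the raw audio goes to stdout when no output file is given. The flag spellings below come from OPTION_FLAGS constants this diff does not show, so treat them as assumptions:

echo "Hello there" | llmist speech --output hello.mp3
llmist speech "Hello there" > hello.mp3
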
11304
12761
  // src/cli/environment.ts
11305
12762
  var import_node_readline = __toESM(require("readline"), 1);
11306
12763
  var import_chalk9 = __toESM(require("chalk"), 1);
@@ -11452,6 +12909,8 @@ function createProgram(env, config) {
11452
12909
  });
11453
12910
  registerCompleteCommand(program, env, config?.complete);
11454
12911
  registerAgentCommand(program, env, config?.agent);
12912
+ registerImageCommand(program, env, config?.image);
12913
+ registerSpeechCommand(program, env, config?.speech);
11455
12914
  registerModelsCommand(program, env);
11456
12915
  registerGadgetCommand(program, env);
11457
12916
  if (config) {