@ai-sdk/xai 4.0.0-canary.71 → 4.0.0-canary.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,12 @@
1
1
  # @ai-sdk/xai
2
2
 
3
+ ## 4.0.0-canary.72
4
+
5
+ ### Patch Changes
6
+
7
+ - 7486744: Add xAI speech-to-text transcription support.
8
+ - 7486744: feat(provider/xai): add text-to-speech support
9
+
3
10
  ## 4.0.0-canary.71
4
11
 
5
12
  ### Patch Changes
package/dist/index.d.ts CHANGED
@@ -1,7 +1,7 @@
1
1
  import { z } from 'zod/v4';
2
2
  import * as _ai_sdk_provider_utils from '@ai-sdk/provider-utils';
3
3
  import { InferSchema, FetchFunction } from '@ai-sdk/provider-utils';
4
- import { ProviderV4, LanguageModelV4, ImageModelV4, Experimental_VideoModelV4, Experimental_RealtimeFactoryV4, FilesV4, Experimental_RealtimeModelV4, Experimental_RealtimeModelV4ClientSecretOptions, Experimental_RealtimeModelV4ClientSecretResult, Experimental_RealtimeModelV4ServerEvent, Experimental_RealtimeModelV4ClientEvent, Experimental_RealtimeModelV4SessionConfig } from '@ai-sdk/provider';
4
+ import { ProviderV4, LanguageModelV4, ImageModelV4, Experimental_VideoModelV4, Experimental_RealtimeFactoryV4, SpeechModelV4, TranscriptionModelV4, FilesV4, Experimental_RealtimeModelV4, Experimental_RealtimeModelV4ClientSecretOptions, Experimental_RealtimeModelV4ClientSecretResult, Experimental_RealtimeModelV4ServerEvent, Experimental_RealtimeModelV4ClientEvent, Experimental_RealtimeModelV4SessionConfig } from '@ai-sdk/provider';
5
5
 
6
6
  type XaiChatModelId = 'grok-4.20-non-reasoning' | 'grok-4.20-reasoning' | 'grok-4.3' | 'grok-latest' | (string & {});
7
7
  declare const xaiLanguageModelChatOptions: z.ZodObject<{
@@ -179,6 +179,27 @@ interface XaiLegacyReferenceToVideoOptions extends XaiVideoSharedOptions {
179
179
  */
180
180
  type XaiVideoModelOptions = XaiVideoGenerationOptions | XaiVideoEditModeOptions | XaiVideoExtendModeOptions | XaiVideoReferenceToVideoOptions | XaiLegacyEditVideoOptions | XaiLegacyReferenceToVideoOptions;
181
181
 
182
/**
 * Schema for the xAI-specific provider options of the text-to-speech model.
 * Every field is optional and may also be passed as `null`.
 */
declare const xaiSpeechModelOptionsSchema: _ai_sdk_provider_utils.LazySchema<{
    /** Sample rate of the generated audio in Hz. */
    sampleRate?:
        | 8000
        | 16000
        | 22050
        | 24000
        | 44100
        | 48000
        | null
        | undefined;
    /** MP3 bit rate in bits per second. Only applies when the output format is mp3. */
    bitRate?:
        | 32000
        | 64000
        | 96000
        | 128000
        | 192000
        | null
        | undefined;
    /** Reduce time to first audio chunk, trading some quality for latency. */
    optimizeStreamingLatency?:
        | 0
        | 1
        | 2
        | null
        | undefined;
    /** Normalize written-form text into spoken-form text before synthesis. */
    textNormalization?: boolean | null | undefined;
}>;
/** Options object type inferred from `xaiSpeechModelOptionsSchema`. */
type XaiSpeechModelOptions = InferSchema<typeof xaiSpeechModelOptionsSchema>;
189
+
190
/**
 * Schema for the xAI-specific provider options of the speech-to-text
 * (transcription) model. Every field is optional and may also be `null`.
 */
declare const xaiTranscriptionModelOptionsSchema: _ai_sdk_provider_utils.LazySchema<{
    /** Audio encoding for raw, headerless input audio. */
    audioFormat?:
        | "pcm"
        | "mulaw"
        | "alaw"
        | null
        | undefined;
    /** Sample rate of the input audio in Hz. */
    sampleRate?:
        | 8000
        | 16000
        | 22050
        | 24000
        | 44100
        | 48000
        | null
        | undefined;
    /** Language code used for inverse text normalization. */
    language?: string | null | undefined;
    /** Enable inverse text normalization. Requires `language`. */
    format?: boolean | null | undefined;
    /** Enable per-channel transcription for multichannel audio. */
    multichannel?: boolean | null | undefined;
    /** Number of interleaved audio channels. */
    channels?: number | null | undefined;
    /** Enable speaker diarization. */
    diarize?: boolean | null | undefined;
    /** Terms to bias transcription toward; a single term or a list of terms. */
    keyterm?: string | string[] | null | undefined;
    /** Include filler words such as "uh" and "um" in the transcript. */
    fillerWords?: boolean | null | undefined;
}>;
/** Options object type inferred from `xaiTranscriptionModelOptionsSchema`. */
type XaiTranscriptionModelOptions = InferSchema<typeof xaiTranscriptionModelOptionsSchema>;
202
+
182
203
  declare const xaiFilesOptionsSchema: _ai_sdk_provider_utils.LazySchema<{
183
204
  [x: string]: unknown;
184
205
  teamId?: string | undefined;
@@ -420,6 +441,22 @@ interface XaiProvider extends ProviderV4 {
420
441
  */
421
442
  videoModel(modelId: XaiVideoModelId): Experimental_VideoModelV4;
422
443
  experimental_realtime: Experimental_RealtimeFactoryV4;
444
+ /**
445
+ * Creates an xAI model for speech generation (text-to-speech).
446
+ */
447
+ speech(): SpeechModelV4;
448
+ /**
449
+ * Creates an xAI model for speech generation (text-to-speech).
450
+ */
451
+ speechModel(): SpeechModelV4;
452
+ /**
453
+ * Creates an xAI model for speech-to-text transcription.
454
+ */
455
+ transcription(): TranscriptionModelV4;
456
+ /**
457
+ * Creates an xAI model for speech-to-text transcription.
458
+ */
459
+ transcriptionModel(): TranscriptionModelV4;
423
460
  /**
424
461
  * Returns the xAI files interface for uploading files.
425
462
  */
@@ -482,4 +519,4 @@ declare class XaiRealtimeModel implements Experimental_RealtimeModelV4 {
482
519
 
483
520
  declare const VERSION: string;
484
521
 
485
- export { XaiRealtimeModel as Experimental_XaiRealtimeModel, type XaiRealtimeModelConfig as Experimental_XaiRealtimeModelConfig, VERSION, type XaiErrorData, type XaiFilesOptions, type XaiImageModelOptions, type XaiImageModelOptions as XaiImageProviderOptions, type XaiLanguageModelChatOptions, type XaiLanguageModelResponsesOptions, type XaiProvider, type XaiLanguageModelChatOptions as XaiProviderOptions, type XaiProviderSettings, type XaiLanguageModelResponsesOptions as XaiResponsesProviderOptions, type XaiVideoModelId, type XaiVideoModelOptions, type XaiVideoModelOptions as XaiVideoProviderOptions, codeExecution, createXai, mcpServer, viewImage, viewXVideo, webSearch, xSearch, xai, xaiTools };
522
+ export { XaiRealtimeModel as Experimental_XaiRealtimeModel, type XaiRealtimeModelConfig as Experimental_XaiRealtimeModelConfig, VERSION, type XaiErrorData, type XaiFilesOptions, type XaiImageModelOptions, type XaiImageModelOptions as XaiImageProviderOptions, type XaiLanguageModelChatOptions, type XaiLanguageModelResponsesOptions, type XaiProvider, type XaiLanguageModelChatOptions as XaiProviderOptions, type XaiProviderSettings, type XaiLanguageModelResponsesOptions as XaiResponsesProviderOptions, type XaiSpeechModelOptions, type XaiTranscriptionModelOptions, type XaiVideoModelId, type XaiVideoModelOptions, type XaiVideoModelOptions as XaiVideoProviderOptions, codeExecution, createXai, mcpServer, viewImage, viewXVideo, webSearch, xSearch, xai, xaiTools };
package/dist/index.js CHANGED
@@ -3393,7 +3393,7 @@ var xaiTools = {
3393
3393
  };
3394
3394
 
3395
3395
  // src/version.ts
3396
- var VERSION = true ? "4.0.0-canary.71" : "0.0.0-test";
3396
+ var VERSION = true ? "4.0.0-canary.72" : "0.0.0-test";
3397
3397
 
3398
3398
  // src/files/xai-files.ts
3399
3399
  import {
@@ -3845,6 +3845,357 @@ var xaiVideoStatusResponseSchema = z18.object({
3845
3845
  }).nullish()
3846
3846
  });
3847
3847
 
3848
+ // src/xai-speech-model.ts
3849
+ import {
3850
+ combineHeaders as combineHeaders6,
3851
+ createBinaryResponseHandler as createBinaryResponseHandler2,
3852
+ parseProviderOptions as parseProviderOptions6,
3853
+ postJsonToApi as postJsonToApi5,
3854
+ resolve,
3855
+ serializeModelOptions as serializeModelOptions4,
3856
+ WORKFLOW_DESERIALIZE as WORKFLOW_DESERIALIZE4,
3857
+ WORKFLOW_SERIALIZE as WORKFLOW_SERIALIZE4
3858
+ } from "@ai-sdk/provider-utils";
3859
+
3860
+ // src/xai-speech-model-options.ts
3861
+ import {
3862
+ lazySchema as lazySchema8,
3863
+ zodSchema as zodSchema8
3864
+ } from "@ai-sdk/provider-utils";
3865
+ import { z as z19 } from "zod/v4";
3866
/**
 * Provider-options schema for xAI text-to-speech requests. Construction is
 * deferred through `lazySchema` until the schema is first needed.
 */
var xaiSpeechModelOptionsSchema = lazySchema8(() => {
  // Union of numeric literals, one member per allowed value, in the given order.
  const oneOf = (allowedValues) => z19.union(allowedValues.map((value) => z19.literal(value)));

  const optionsShape = {
    /**
     * Sample rate of the generated audio in Hz.
     */
    sampleRate: oneOf([8000, 16000, 22050, 24000, 44100, 48000]).nullish(),
    /**
     * MP3 bit rate in bits per second. Only applies when outputFormat is mp3.
     */
    bitRate: oneOf([32000, 64000, 96000, 128000, 192000]).nullish(),
    /**
     * Reduce time to first audio chunk, trading some quality for latency.
     */
    optimizeStreamingLatency: oneOf([0, 1, 2]).nullish(),
    /**
     * Normalize written-form text into spoken-form text before synthesis.
     */
    textNormalization: z19.boolean().nullish()
  };

  return zodSchema8(z19.object(optionsShape));
});
3901
+
3902
+ // src/xai-speech-model.ts
3903
/**
 * Text-to-speech model that posts a JSON request to the xAI `/tts` endpoint
 * and returns the binary audio from the response.
 */
var XaiSpeechModel = class _XaiSpeechModel {
  /**
   * @param modelId Model identifier; reported back as `response.modelId`.
   * @param config Settings read by this class: `provider`, `baseURL`,
   *   `headers`, `fetch`, and the optional `_internal.currentDate` clock.
   */
  constructor(modelId, config) {
    this.modelId = modelId;
    this.config = config;
    this.specificationVersion = "v4";
  }

  /** Hook stored under `WORKFLOW_SERIALIZE`: passes the model ID and config to `serializeModelOptions`. */
  static [WORKFLOW_SERIALIZE4](model) {
    const { modelId, config } = model;
    return serializeModelOptions4({ modelId, config });
  }

  /** Hook stored under `WORKFLOW_DESERIALIZE`: rebuilds a model from `{ modelId, config }`. */
  static [WORKFLOW_DESERIALIZE4](options) {
    return new _XaiSpeechModel(options.modelId, options.config);
  }

  /** Provider name taken from the config. */
  get provider() {
    return this.config.provider;
  }

  /**
   * Translates generic speech call options into the xAI request body.
   *
   * @param text Text to synthesize.
   * @param voice Sent as `voice_id`; defaults to "eve".
   * @param outputFormat Requested codec; defaults to "mp3". Values outside
   *   mp3/wav/pcm/mulaw/alaw fall back to mp3 with a warning.
   * @param instructions Not supported; only produces a warning.
   * @param speed Forwarded unchanged.
   * @param language Forwarded unchanged; defaults to "auto".
   * @param providerOptions Parsed under the "xai" key with the speech options schema.
   * @returns `{ requestBody, warnings }`.
   */
  async getArgs({
    text,
    voice = "eve",
    outputFormat = "mp3",
    instructions,
    speed,
    language = "auto",
    providerOptions
  }) {
    const warnings = [];
    const xaiOptions = await parseProviderOptions6({
      provider: "xai",
      providerOptions,
      schema: xaiSpeechModelOptionsSchema
    });
    const sampleRate = xaiOptions == null ? void 0 : xaiOptions.sampleRate;
    const bitRate = xaiOptions == null ? void 0 : xaiOptions.bitRate;

    const supportedCodecs = ["mp3", "wav", "pcm", "mulaw", "alaw"];
    const codecIsSupported = supportedCodecs.includes(outputFormat);
    const codec = codecIsSupported ? outputFormat : "mp3";
    if (!codecIsSupported) {
      warnings.push({
        type: "unsupported",
        feature: "outputFormat",
        details: `Unsupported output format: ${outputFormat}. Using mp3 instead.`
      });
    }

    if (instructions != null) {
      warnings.push({
        type: "unsupported",
        feature: "instructions",
        details: "xAI speech models do not support the `instructions` option. Use xAI speech tags in `text` to control delivery."
      });
    }

    const output_format = { codec };
    if (sampleRate != null) {
      output_format.sample_rate = sampleRate;
    }
    if (bitRate != null) {
      // The bit rate is only forwarded for mp3; for other codecs it is dropped with a warning.
      if (codec === "mp3") {
        output_format.bit_rate = bitRate;
      } else {
        warnings.push({
          type: "unsupported",
          feature: "providerOptions",
          details: "xAI `bitRate` is supported only for mp3 output. It was ignored."
        });
      }
    }

    const requestBody = {
      text,
      voice_id: voice,
      language,
      output_format,
      speed,
      optimize_streaming_latency: xaiOptions == null ? void 0 : xaiOptions.optimizeStreamingLatency,
      text_normalization: xaiOptions == null ? void 0 : xaiOptions.textNormalization
    };
    return { requestBody, warnings };
  }

  /**
   * Sends the synthesis request and returns the audio with request/response metadata.
   *
   * @param options Speech call options; `headers` and `abortSignal` are
   *   forwarded to the HTTP call, the rest goes through `getArgs`.
   * @returns `{ audio, warnings, request: { body }, response: { timestamp, modelId, headers, body } }`.
   */
  async doGenerate(options) {
    // The timestamp is taken before the request is sent; an injected clock wins over `new Date()`.
    const internal = this.config._internal;
    const injectedDate = internal != null && internal.currentDate != null ? internal.currentDate() : void 0;
    const currentDate = injectedDate != null ? injectedDate : /* @__PURE__ */ new Date();

    const { requestBody, warnings } = await this.getArgs(options);

    // Same fallback as the transcription model, so a config without `baseURL`
    // does not produce the literal URL "undefined/tts".
    const baseURL = this.config.baseURL != null ? this.config.baseURL : "https://api.x.ai/v1";
    const configHeaders = this.config.headers ? await resolve(this.config.headers) : void 0;

    const {
      value: audio,
      responseHeaders,
      rawValue: rawResponse
    } = await postJsonToApi5({
      url: `${baseURL}/tts`,
      headers: combineHeaders6(configHeaders, options.headers),
      body: requestBody,
      failedResponseHandler: xaiFailedResponseHandler,
      successfulResponseHandler: createBinaryResponseHandler2(),
      abortSignal: options.abortSignal,
      fetch: this.config.fetch
    });

    return {
      audio,
      warnings,
      request: {
        body: JSON.stringify(requestBody)
      },
      response: {
        timestamp: currentDate,
        modelId: this.modelId,
        headers: responseHeaders,
        body: rawResponse
      }
    };
  }
};
4016
+
4017
+ // src/xai-transcription-model.ts
4018
+ import {
4019
+ combineHeaders as combineHeaders7,
4020
+ convertBase64ToUint8Array,
4021
+ createJsonResponseHandler as createJsonResponseHandler6,
4022
+ mediaTypeToExtension,
4023
+ parseProviderOptions as parseProviderOptions7,
4024
+ postFormDataToApi as postFormDataToApi2,
4025
+ serializeModelOptions as serializeModelOptions5,
4026
+ WORKFLOW_DESERIALIZE as WORKFLOW_DESERIALIZE5,
4027
+ WORKFLOW_SERIALIZE as WORKFLOW_SERIALIZE5
4028
+ } from "@ai-sdk/provider-utils";
4029
+ import { z as z21 } from "zod/v4";
4030
+
4031
+ // src/xai-transcription-model-options.ts
4032
+ import {
4033
+ lazySchema as lazySchema9,
4034
+ zodSchema as zodSchema9
4035
+ } from "@ai-sdk/provider-utils";
4036
+ import { z as z20 } from "zod/v4";
4037
/**
 * Provider-options schema for xAI speech-to-text requests. Construction is
 * deferred through `lazySchema` until the schema is first needed.
 */
var xaiTranscriptionModelOptionsSchema = lazySchema9(() => {
  // Each call returns a fresh optional/nullable boolean schema.
  const optionalFlag = () => z20.boolean().nullish();
  const allowedSampleRates = [8000, 16000, 22050, 24000, 44100, 48000];

  const optionsShape = {
    /**
     * Audio encoding for raw, headerless input audio.
     */
    audioFormat: z20.enum(["pcm", "mulaw", "alaw"]).nullish(),
    /**
     * Sample rate of the input audio in Hz.
     */
    sampleRate: z20.union(allowedSampleRates.map((rate) => z20.literal(rate))).nullish(),
    /**
     * Language code used for inverse text normalization.
     */
    language: z20.string().nullish(),
    /**
     * Enable inverse text normalization. Requires `language`.
     */
    format: optionalFlag(),
    /**
     * Enable per-channel transcription for multichannel audio.
     */
    multichannel: optionalFlag(),
    /**
     * Number of interleaved audio channels (an integer from 2 to 8).
     */
    channels: z20.number().int().min(2).max(8).nullish(),
    /**
     * Enable speaker diarization.
     */
    diarize: optionalFlag(),
    /**
     * Terms to bias transcription toward; a single string or a list of strings.
     */
    keyterm: z20.union([z20.string(), z20.array(z20.string())]).nullish(),
    /**
     * Include filler words such as "uh" and "um" in the transcript.
     */
    fillerWords: optionalFlag()
  };

  return zodSchema9(z20.object(optionsShape));
});
4086
+
4087
+ // src/xai-transcription-model.ts
4088
/**
 * Speech-to-text model that uploads audio as multipart form data to the xAI
 * `/stt` endpoint and maps the JSON response to a transcription result.
 */
var XaiTranscriptionModel = class _XaiTranscriptionModel {
  /**
   * @param modelId Model identifier; reported back as `response.modelId`.
   * @param config Settings read by this class: `provider`, `baseURL`,
   *   `headers`, `fetch`, and the optional `_internal.currentDate` clock.
   */
  constructor(modelId, config) {
    this.modelId = modelId;
    this.config = config;
    this.specificationVersion = "v4";
  }

  /** Hook stored under `WORKFLOW_SERIALIZE`: passes the model ID and config to `serializeModelOptions`. */
  static [WORKFLOW_SERIALIZE5](model) {
    const { modelId, config } = model;
    return serializeModelOptions5({ modelId, config });
  }

  /** Hook stored under `WORKFLOW_DESERIALIZE`: rebuilds a model from `{ modelId, config }`. */
  static [WORKFLOW_DESERIALIZE5](options) {
    return new _XaiTranscriptionModel(options.modelId, options.config);
  }

  /** Provider name taken from the config. */
  get provider() {
    return this.config.provider;
  }

  /**
   * Builds the multipart form body for a transcription request.
   *
   * @param audio Audio as a `Uint8Array`, or a base64 string that is decoded first.
   * @param mediaType Media type of the audio; sets the file's type and the upload file extension.
   * @param providerOptions Parsed under the "xai" key with the transcription options schema.
   * @returns `{ formData, warnings }`; `warnings` is currently always empty.
   */
  async getArgs({
    audio,
    mediaType,
    providerOptions
  }) {
    const warnings = [];
    const xaiOptions = await parseProviderOptions7({
      provider: "xai",
      providerOptions,
      schema: xaiTranscriptionModelOptionsSchema
    });
    const formData = new FormData();

    // [form field name, provider option name], listed in the order the fields are appended.
    const scalarFields = [
      ["audio_format", "audioFormat"],
      ["sample_rate", "sampleRate"],
      ["language", "language"],
      ["format", "format"],
      ["multichannel", "multichannel"],
      ["channels", "channels"],
      ["diarize", "diarize"],
      ["filler_words", "fillerWords"]
    ];
    for (const [fieldName, optionName] of scalarFields) {
      const value = xaiOptions == null ? void 0 : xaiOptions[optionName];
      // Unset options are left out of the form; everything else is sent as a string.
      if (value != null) {
        formData.append(fieldName, String(value));
      }
    }

    // `keyterm` is repeated as one form field per term.
    const keyterm = xaiOptions == null ? void 0 : xaiOptions.keyterm;
    if (keyterm != null) {
      for (const term of Array.isArray(keyterm) ? keyterm : [keyterm]) {
        formData.append("keyterm", term);
      }
    }

    const audioBytes = audio instanceof Uint8Array ? audio : convertBase64ToUint8Array(audio);
    const fileExtension = mediaTypeToExtension(mediaType);
    formData.append(
      "file",
      new File([new Blob([audioBytes])], "audio", { type: mediaType }),
      `audio.${fileExtension}`
    );
    return { formData, warnings };
  }

  /**
   * Sends the transcription request and maps the response.
   *
   * @param options Transcription call options; `headers` and `abortSignal`
   *   are forwarded to the HTTP call, the rest goes through `getArgs`.
   * @returns `{ text, segments, language, durationInSeconds, warnings, response }`.
   *   Each entry of `response.words` becomes one segment; `segments` is `[]`
   *   when the response carries no words.
   */
  async doGenerate(options) {
    // The timestamp is taken before the request is sent; an injected clock wins over `new Date()`.
    const internal = this.config._internal;
    const injectedDate = internal != null && internal.currentDate != null ? internal.currentDate() : void 0;
    const currentDate = injectedDate != null ? injectedDate : /* @__PURE__ */ new Date();

    const { formData, warnings } = await this.getArgs(options);

    const baseURL = this.config.baseURL != null ? this.config.baseURL : "https://api.x.ai/v1";
    // Resolve headers the same way the speech model does, so a promise-returning
    // function or a plain headers object works as well as a synchronous function.
    const configHeaders = this.config.headers ? await resolve(this.config.headers) : void 0;

    const {
      value: response,
      responseHeaders,
      rawValue: rawResponse
    } = await postFormDataToApi2({
      url: `${baseURL}/stt`,
      headers: combineHeaders7(configHeaders, options.headers),
      formData,
      failedResponseHandler: xaiFailedResponseHandler,
      successfulResponseHandler: createJsonResponseHandler6(
        xaiTranscriptionResponseSchema
      ),
      abortSignal: options.abortSignal,
      fetch: this.config.fetch
    });

    const segments = response.words != null ? response.words.map((word) => ({
      text: word.text,
      startSecond: word.start,
      endSecond: word.end
    })) : [];

    return {
      text: response.text,
      segments,
      // Empty string and null are both reported as "no language".
      language: response.language || void 0,
      durationInSeconds: response.duration != null ? response.duration : void 0,
      warnings,
      response: {
        timestamp: currentDate,
        modelId: this.modelId,
        headers: responseHeaders,
        body: rawResponse
      }
    };
  }
};
4186
// One entry of the response's `words` array: the word text with its start and end values.
var xaiTranscriptionWordSchema = z21.object({
  text: z21.string(),
  start: z21.number(),
  end: z21.number()
});

// JSON body of a successful `/stt` response. Only `text` is required;
// `language`, `duration` and `words` may be missing or null.
var xaiTranscriptionResponseSchema = z21.object({
  text: z21.string(),
  language: z21.string().nullish(),
  duration: z21.number().nullish(),
  words: z21.array(xaiTranscriptionWordSchema).nullish()
});
4198
+
3848
4199
  // src/xai-provider.ts
3849
4200
  function createXai(options = {}) {
3850
4201
  var _a;
@@ -3904,6 +4255,22 @@ function createXai(options = {}) {
3904
4255
  fetch: options.fetch
3905
4256
  });
3906
4257
  };
4258
// Factory for the text-to-speech model. Takes no model ID (an empty string is
// passed) and reuses the provider's base URL, header getter and fetch.
const createSpeechModel = () => new XaiSpeechModel("", {
  provider: "xai.speech",
  baseURL,
  headers: getHeaders,
  fetch: options.fetch
});
4266
// Factory for the speech-to-text model. Takes no model ID (an empty string is
// passed) and reuses the provider's base URL, header getter and fetch.
const createTranscriptionModel = () => new XaiTranscriptionModel("", {
  provider: "xai.transcription",
  baseURL,
  headers: getHeaders,
  fetch: options.fetch
});
3907
4274
  const experimentalRealtimeFactory = Object.assign(
3908
4275
  (modelId) => createRealtimeModel(modelId),
3909
4276
  {
@@ -3941,6 +4308,10 @@ function createXai(options = {}) {
3941
4308
  provider.videoModel = createVideoModel;
3942
4309
  provider.video = createVideoModel;
3943
4310
  provider.experimental_realtime = experimentalRealtimeFactory;
4311
+ provider.speechModel = createSpeechModel;
4312
+ provider.speech = createSpeechModel;
4313
+ provider.transcriptionModel = createTranscriptionModel;
4314
+ provider.transcription = createTranscriptionModel;
3944
4315
  provider.files = createFiles;
3945
4316
  provider.tools = xaiTools;
3946
4317
  return provider;