npm - @ai-sdk/xai - Versions diffs - 4.0.0-canary.71 → 4.0.0-canary.72 - Mend

@ai-sdk/xai 4.0.0-canary.71 → 4.0.0-canary.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/CHANGELOG.md +7 -0
package/dist/index.d.ts +39 -2
package/dist/index.js +372 -1
package/dist/index.js.map +1 -1
package/docs/01-xai.mdx +192 -0
package/package.json +1 -1
package/src/index.ts +2 -0
package/src/xai-provider.ts +46 -0
package/src/xai-speech-model-options.ts +55 -0
package/src/xai-speech-model.ts +167 -0
package/src/xai-transcription-model-options.ts +70 -0
package/src/xai-transcription-model.ts +166 -0

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,12 @@
 # @ai-sdk/xai
+## 4.0.0-canary.72
+### Patch Changes
+- 7486744: Add xAI speech-to-text transcription support.
+- 7486744: feat(provider/xai): add text-to-speech support
 ## 4.0.0-canary.71
 ### Patch Changes

package/dist/index.d.ts CHANGED Viewed

@@ -1,7 +1,7 @@
 import { z } from 'zod/v4';
 import * as _ai_sdk_provider_utils from '@ai-sdk/provider-utils';
 import { InferSchema, FetchFunction } from '@ai-sdk/provider-utils';
-import { ProviderV4, LanguageModelV4, ImageModelV4, Experimental_VideoModelV4, Experimental_RealtimeFactoryV4, FilesV4, Experimental_RealtimeModelV4, Experimental_RealtimeModelV4ClientSecretOptions, Experimental_RealtimeModelV4ClientSecretResult, Experimental_RealtimeModelV4ServerEvent, Experimental_RealtimeModelV4ClientEvent, Experimental_RealtimeModelV4SessionConfig } from '@ai-sdk/provider';
+import { ProviderV4, LanguageModelV4, ImageModelV4, Experimental_VideoModelV4, Experimental_RealtimeFactoryV4, SpeechModelV4, TranscriptionModelV4, FilesV4, Experimental_RealtimeModelV4, Experimental_RealtimeModelV4ClientSecretOptions, Experimental_RealtimeModelV4ClientSecretResult, Experimental_RealtimeModelV4ServerEvent, Experimental_RealtimeModelV4ClientEvent, Experimental_RealtimeModelV4SessionConfig } from '@ai-sdk/provider';
 type XaiChatModelId = 'grok-4.20-non-reasoning' | 'grok-4.20-reasoning' | 'grok-4.3' | 'grok-latest' | (string & {});
 declare const xaiLanguageModelChatOptions: z.ZodObject<{
@@ -179,6 +179,27 @@ interface XaiLegacyReferenceToVideoOptions extends XaiVideoSharedOptions {
  */
 type XaiVideoModelOptions = XaiVideoGenerationOptions | XaiVideoEditModeOptions | XaiVideoExtendModeOptions | XaiVideoReferenceToVideoOptions | XaiLegacyEditVideoOptions | XaiLegacyReferenceToVideoOptions;
+declare const xaiSpeechModelOptionsSchema: _ai_sdk_provider_utils.LazySchema<{ // provider-specific options for speech generation; every field is optional and may be null
+    sampleRate?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000 | null | undefined; // sample rate of the generated audio, in Hz
+    bitRate?: 32000 | 64000 | 96000 | 128000 | 192000 | null | undefined; // MP3 bit rate, in bits per second
+    optimizeStreamingLatency?: 0 | 1 | 2 | null | undefined; // trades some quality for a faster first audio chunk
+    textNormalization?: boolean | null | undefined; // normalize written-form text into spoken-form text before synthesis
+}>;
+type XaiSpeechModelOptions = InferSchema<typeof xaiSpeechModelOptionsSchema>; // option type inferred from the schema declared above
+declare const xaiTranscriptionModelOptionsSchema: _ai_sdk_provider_utils.LazySchema<{ // provider-specific options for transcription; every field is optional and may be null
+    audioFormat?: "pcm" | "mulaw" | "alaw" | null | undefined; // encoding of raw, headerless input audio
+    sampleRate?: 8000 | 16000 | 22050 | 24000 | 44100 | 48000 | null | undefined; // sample rate of the input audio, in Hz
+    language?: string | null | undefined; // language code used for inverse text normalization
+    format?: boolean | null | undefined; // enables inverse text normalization
+    multichannel?: boolean | null | undefined; // per-channel transcription for multichannel audio
+    channels?: number | null | undefined; // number of interleaved channels; the runtime schema restricts this to an integer from 2 to 8
+    diarize?: boolean | null | undefined; // speaker diarization
+    keyterm?: string | string[] | null | undefined; // one term or a list of terms to bias transcription toward
+    fillerWords?: boolean | null | undefined; // keep filler words such as "uh" and "um" in the transcript
+}>;
+type XaiTranscriptionModelOptions = InferSchema<typeof xaiTranscriptionModelOptionsSchema>; // option type inferred from the schema declared above
 declare const xaiFilesOptionsSchema: _ai_sdk_provider_utils.LazySchema<{
     [x: string]: unknown;
     teamId?: string | undefined;
@@ -420,6 +441,22 @@ interface XaiProvider extends ProviderV4 {
      */
     videoModel(modelId: XaiVideoModelId): Experimental_VideoModelV4;
     experimental_realtime: Experimental_RealtimeFactoryV4;
+    /**
+     * Creates an xAI model for speech generation (text-to-speech).
+     */
+    speech(): SpeechModelV4;
+    /**
+     * Creates an xAI model for speech generation (text-to-speech).
+     */
+    speechModel(): SpeechModelV4;
+    /**
+     * Creates an xAI model for speech-to-text transcription.
+     */
+    transcription(): TranscriptionModelV4;
+    /**
+     * Creates an xAI model for speech-to-text transcription.
+     */
+    transcriptionModel(): TranscriptionModelV4;
     /**
      * Returns the xAI files interface for uploading files.
      */
@@ -482,4 +519,4 @@ declare class XaiRealtimeModel implements Experimental_RealtimeModelV4 {
 declare const VERSION: string;
-export { XaiRealtimeModel as Experimental_XaiRealtimeModel, type XaiRealtimeModelConfig as Experimental_XaiRealtimeModelConfig, VERSION, type XaiErrorData, type XaiFilesOptions, type XaiImageModelOptions, type XaiImageModelOptions as XaiImageProviderOptions, type XaiLanguageModelChatOptions, type XaiLanguageModelResponsesOptions, type XaiProvider, type XaiLanguageModelChatOptions as XaiProviderOptions, type XaiProviderSettings, type XaiLanguageModelResponsesOptions as XaiResponsesProviderOptions, type XaiVideoModelId, type XaiVideoModelOptions, type XaiVideoModelOptions as XaiVideoProviderOptions, codeExecution, createXai, mcpServer, viewImage, viewXVideo, webSearch, xSearch, xai, xaiTools };
+export { XaiRealtimeModel as Experimental_XaiRealtimeModel, type XaiRealtimeModelConfig as Experimental_XaiRealtimeModelConfig, VERSION, type XaiErrorData, type XaiFilesOptions, type XaiImageModelOptions, type XaiImageModelOptions as XaiImageProviderOptions, type XaiLanguageModelChatOptions, type XaiLanguageModelResponsesOptions, type XaiProvider, type XaiLanguageModelChatOptions as XaiProviderOptions, type XaiProviderSettings, type XaiLanguageModelResponsesOptions as XaiResponsesProviderOptions, type XaiSpeechModelOptions, type XaiTranscriptionModelOptions, type XaiVideoModelId, type XaiVideoModelOptions, type XaiVideoModelOptions as XaiVideoProviderOptions, codeExecution, createXai, mcpServer, viewImage, viewXVideo, webSearch, xSearch, xai, xaiTools };

package/dist/index.js CHANGED Viewed

@@ -3393,7 +3393,7 @@ var xaiTools = {
 };
 // src/version.ts
-var VERSION = true ? "4.0.0-canary.71" : "0.0.0-test";
+var VERSION = true ? "4.0.0-canary.72" : "0.0.0-test";
 // src/files/xai-files.ts
 import {
@@ -3845,6 +3845,357 @@ var xaiVideoStatusResponseSchema = z18.object({
   }).nullish()
 });
+// src/xai-speech-model.ts
+import {
+  combineHeaders as combineHeaders6,
+  createBinaryResponseHandler as createBinaryResponseHandler2,
+  parseProviderOptions as parseProviderOptions6,
+  postJsonToApi as postJsonToApi5,
+  resolve,
+  serializeModelOptions as serializeModelOptions4,
+  WORKFLOW_DESERIALIZE as WORKFLOW_DESERIALIZE4,
+  WORKFLOW_SERIALIZE as WORKFLOW_SERIALIZE4
+} from "@ai-sdk/provider-utils";
+// src/xai-speech-model-options.ts
+import {
+  lazySchema as lazySchema8,
+  zodSchema as zodSchema8
+} from "@ai-sdk/provider-utils";
+import { z as z19 } from "zod/v4";
+var xaiSpeechModelOptionsSchema = lazySchema8(
+  () => zodSchema8(
+    z19.object({
+      /**
+       * Sample rate of the generated audio in Hz.
+       *
+       * Accepted values: 8000, 16000, 22050, 24000, 44100, 48000.
+       * When set, XaiSpeechModel.getArgs forwards it as
+       * `output_format.sample_rate`. Null or undefined leaves the field unset.
+       */
+      sampleRate: z19.union([
+        z19.literal(8e3),
+        z19.literal(16e3),
+        z19.literal(22050),
+        z19.literal(24e3),
+        z19.literal(44100),
+        z19.literal(48e3)
+      ]).nullish(),
+      /**
+       * MP3 bit rate in bits per second. Only applies when outputFormat is mp3.
+       *
+       * Accepted values: 32000, 64000, 96000, 128000, 192000.
+       * XaiSpeechModel.getArgs forwards it as `output_format.bit_rate` when the
+       * resolved codec is mp3. For any other codec the value is not sent and an
+       * `unsupported` warning is returned instead.
+       */
+      bitRate: z19.union([
+        z19.literal(32e3),
+        z19.literal(64e3),
+        z19.literal(96e3),
+        z19.literal(128e3),
+        z19.literal(192e3)
+      ]).nullish(),
+      /**
+       * Reduce time to first audio chunk, trading some quality for latency.
+       *
+       * Accepted values: 0, 1, 2. Placed in the request body as
+       * `optimize_streaming_latency`.
+       */
+      optimizeStreamingLatency: z19.union([z19.literal(0), z19.literal(1), z19.literal(2)]).nullish(),
+      /**
+       * Normalize written-form text into spoken-form text before synthesis.
+       *
+       * Placed in the request body as `text_normalization`.
+       */
+      textNormalization: z19.boolean().nullish()
+    })
+  )
+);
+// src/xai-speech-model.ts
+var XaiSpeechModel = class _XaiSpeechModel { // text-to-speech model: posts a JSON body to `${baseURL}/tts` and returns the response value as audio
+  constructor(modelId, config) {
+    this.modelId = modelId;
+    this.config = config; // fields read in this class: provider, baseURL, headers, fetch, and the optional _internal.currentDate
+    this.specificationVersion = "v4";
+  }
+  static [WORKFLOW_SERIALIZE4](model) { // serialization hook: hands modelId and config to serializeModelOptions
+    return serializeModelOptions4({
+      modelId: model.modelId,
+      config: model.config
+    });
+  }
+  static [WORKFLOW_DESERIALIZE4](options) { // inverse hook: rebuilds an instance from the serialized modelId and config
+    return new _XaiSpeechModel(options.modelId, options.config);
+  }
+  get provider() {
+    return this.config.provider;
+  }
+  async getArgs({ // builds the /tts request body and collects warnings for options that cannot be honored
+    text,
+    voice = "eve", // voice id used when the caller supplies none
+    outputFormat = "mp3", // output format used when the caller supplies none
+    instructions,
+    speed,
+    language = "auto", // language value used when the caller supplies none
+    providerOptions
+  }) {
+    const warnings = [];
+    const xaiOptions = await parseProviderOptions6({ // parses the "xai" provider options against the speech options schema
+      provider: "xai",
+      providerOptions,
+      schema: xaiSpeechModelOptionsSchema
+    });
+    let codec = "mp3"; // stays mp3 when outputFormat is not one of the codecs listed below
+    if (["mp3", "wav", "pcm", "mulaw", "alaw"].includes(outputFormat)) {
+      codec = outputFormat;
+    } else {
+      warnings.push({ // unknown format: warn and keep the mp3 fallback instead of throwing
+        type: "unsupported",
+        feature: "outputFormat",
+        details: `Unsupported output format: ${outputFormat}. Using mp3 instead.`
+      });
+    }
+    if (instructions != null) { // instructions are never added to the request body; only a warning is emitted
+      warnings.push({
+        type: "unsupported",
+        feature: "instructions",
+        details: "xAI speech models do not support the `instructions` option. Use xAI speech tags in `text` to control delivery."
+      });
+    }
+    const output_format = {
+      codec
+    };
+    if ((xaiOptions == null ? void 0 : xaiOptions.sampleRate) != null) { // sample_rate is only set when the option was provided
+      output_format.sample_rate = xaiOptions.sampleRate;
+    }
+    if ((xaiOptions == null ? void 0 : xaiOptions.bitRate) != null) {
+      if (codec === "mp3") { // bit_rate is forwarded for mp3 only
+        output_format.bit_rate = xaiOptions.bitRate;
+      } else {
+        warnings.push({ // bitRate with a non-mp3 codec: dropped with a warning
+          type: "unsupported",
+          feature: "providerOptions",
+          details: "xAI `bitRate` is supported only for mp3 output. It was ignored."
+        });
+      }
+    }
+    const requestBody = {
+      text,
+      voice_id: voice,
+      language,
+      output_format,
+      speed, // passed through unchanged; undefined when the caller did not set it
+      optimize_streaming_latency: xaiOptions == null ? void 0 : xaiOptions.optimizeStreamingLatency,
+      text_normalization: xaiOptions == null ? void 0 : xaiOptions.textNormalization
+    };
+    return { requestBody, warnings };
+  }
+  async doGenerate(options) { // sends the request built by getArgs and packages the audio together with request/response metadata
+    var _a, _b, _c;
+    const currentDate = (_c = (_b = (_a = this.config._internal) == null ? void 0 : _a.currentDate) == null ? void 0 : _b.call(_a)) != null ? _c : /* @__PURE__ */ new Date(); // config._internal.currentDate() when provided, otherwise a new Date()
+    const { requestBody, warnings } = await this.getArgs(options);
+    const {
+      value: audio,
+      responseHeaders,
+      rawValue: rawResponse
+    } = await postJsonToApi5({
+      url: `${this.config.baseURL}/tts`,
+      headers: combineHeaders6( // config headers (resolved via `resolve` when present) combined with the per-call headers
+        this.config.headers ? await resolve(this.config.headers) : void 0,
+        options.headers
+      ),
+      body: requestBody,
+      failedResponseHandler: xaiFailedResponseHandler,
+      successfulResponseHandler: createBinaryResponseHandler2(),
+      abortSignal: options.abortSignal,
+      fetch: this.config.fetch
+    });
+    return {
+      audio,
+      warnings,
+      request: {
+        body: JSON.stringify(requestBody) // request body reported back to the caller as a JSON string
+      },
+      response: {
+        timestamp: currentDate,
+        modelId: this.modelId,
+        headers: responseHeaders,
+        body: rawResponse
+      }
+    };
+  }
+};
+// src/xai-transcription-model.ts
+import {
+  combineHeaders as combineHeaders7,
+  convertBase64ToUint8Array,
+  createJsonResponseHandler as createJsonResponseHandler6,
+  mediaTypeToExtension,
+  parseProviderOptions as parseProviderOptions7,
+  postFormDataToApi as postFormDataToApi2,
+  serializeModelOptions as serializeModelOptions5,
+  WORKFLOW_DESERIALIZE as WORKFLOW_DESERIALIZE5,
+  WORKFLOW_SERIALIZE as WORKFLOW_SERIALIZE5
+} from "@ai-sdk/provider-utils";
+import { z as z21 } from "zod/v4";
+// src/xai-transcription-model-options.ts
+import {
+  lazySchema as lazySchema9,
+  zodSchema as zodSchema9
+} from "@ai-sdk/provider-utils";
+import { z as z20 } from "zod/v4";
+var xaiTranscriptionModelOptionsSchema = lazySchema9(
+  () => zodSchema9(
+    z20.object({
+      /**
+       * Audio encoding for raw, headerless input audio.
+       *
+       * Accepted values: "pcm", "mulaw", "alaw". Sent as the `audio_format`
+       * form field when set.
+       */
+      audioFormat: z20.enum(["pcm", "mulaw", "alaw"]).nullish(),
+      /**
+       * Sample rate of the input audio in Hz.
+       *
+       * Accepted values: 8000, 16000, 22050, 24000, 44100, 48000. Sent as the
+       * `sample_rate` form field when set.
+       */
+      sampleRate: z20.union([
+        z20.literal(8e3),
+        z20.literal(16e3),
+        z20.literal(22050),
+        z20.literal(24e3),
+        z20.literal(44100),
+        z20.literal(48e3)
+      ]).nullish(),
+      /**
+       * Language code used for inverse text normalization.
+       *
+       * Sent as the `language` form field when set.
+       */
+      language: z20.string().nullish(),
+      /**
+       * Enable inverse text normalization. Requires `language`.
+       *
+       * Sent as the `format` form field when set. This schema does not itself
+       * check that `language` is present alongside it.
+       */
+      format: z20.boolean().nullish(),
+      /**
+       * Enable per-channel transcription for multichannel audio.
+       *
+       * Sent as the `multichannel` form field when set.
+       */
+      multichannel: z20.boolean().nullish(),
+      /**
+       * Number of interleaved audio channels.
+       *
+       * Must be an integer from 2 to 8 inclusive. Sent as the `channels` form
+       * field when set.
+       */
+      channels: z20.number().int().min(2).max(8).nullish(),
+      /**
+       * Enable speaker diarization.
+       *
+       * Sent as the `diarize` form field when set.
+       */
+      diarize: z20.boolean().nullish(),
+      /**
+       * Terms to bias transcription toward.
+       *
+       * Either a single string or an array of strings. Each term is appended
+       * as its own `keyterm` form field.
+       */
+      keyterm: z20.union([z20.string(), z20.array(z20.string())]).nullish(),
+      /**
+       * Include filler words such as "uh" and "um" in the transcript.
+       *
+       * Sent as the `filler_words` form field when set.
+       */
+      fillerWords: z20.boolean().nullish()
+    })
+  )
+);
+// src/xai-transcription-model.ts
+var XaiTranscriptionModel = class _XaiTranscriptionModel { // speech-to-text model: posts multipart form data to `${baseURL}/stt` and maps the JSON reply to a transcription result
+  constructor(modelId, config) {
+    this.modelId = modelId;
+    this.config = config; // fields read in this class: provider, baseURL, headers, fetch, and the optional _internal.currentDate
+    this.specificationVersion = "v4";
+  }
+  static [WORKFLOW_SERIALIZE5](model) { // serialization hook: hands modelId and config to serializeModelOptions
+    return serializeModelOptions5({
+      modelId: model.modelId,
+      config: model.config
+    });
+  }
+  static [WORKFLOW_DESERIALIZE5](options) { // inverse hook: rebuilds an instance from the serialized modelId and config
+    return new _XaiTranscriptionModel(options.modelId, options.config);
+  }
+  get provider() {
+    return this.config.provider;
+  }
+  async getArgs({ // builds the multipart form (options + audio file) for the /stt request
+    audio,
+    mediaType,
+    providerOptions
+  }) {
+    const warnings = []; // nothing in this method pushes to it; it is returned empty
+    const xaiOptions = await parseProviderOptions7({ // parses the "xai" provider options against the transcription options schema
+      provider: "xai",
+      providerOptions,
+      schema: xaiTranscriptionModelOptionsSchema
+    });
+    const formData = new FormData();
+    const transcriptionOptions = { // camelCase options renamed to the snake_case form field names
+      audio_format: xaiOptions == null ? void 0 : xaiOptions.audioFormat,
+      sample_rate: xaiOptions == null ? void 0 : xaiOptions.sampleRate,
+      language: xaiOptions == null ? void 0 : xaiOptions.language,
+      format: xaiOptions == null ? void 0 : xaiOptions.format,
+      multichannel: xaiOptions == null ? void 0 : xaiOptions.multichannel,
+      channels: xaiOptions == null ? void 0 : xaiOptions.channels,
+      diarize: xaiOptions == null ? void 0 : xaiOptions.diarize,
+      filler_words: xaiOptions == null ? void 0 : xaiOptions.fillerWords
+    };
+    for (const [key, value] of Object.entries(transcriptionOptions)) {
+      if (value != null) { // null and undefined options are left out of the form entirely
+        formData.append(key, String(value)); // booleans and numbers are sent in their string form
+      }
+    }
+    if ((xaiOptions == null ? void 0 : xaiOptions.keyterm) != null) {
+      const keyterms = Array.isArray(xaiOptions.keyterm) ? xaiOptions.keyterm : [xaiOptions.keyterm]; // a single string is wrapped so both shapes share one loop
+      for (const keyterm of keyterms) {
+        formData.append("keyterm", keyterm); // one `keyterm` field per term
+      }
+    }
+    const blob = audio instanceof Uint8Array ? new Blob([audio]) : new Blob([convertBase64ToUint8Array(audio)]); // anything that is not a Uint8Array is treated as base64 and decoded first
+    const fileExtension = mediaTypeToExtension(mediaType);
+    formData.append(
+      "file",
+      new File([blob], "audio", { type: mediaType }),
+      `audio.${fileExtension}` // filename argument of FormData.append, built from the media type's extension
+    );
+    return { formData, warnings };
+  }
+  async doGenerate(options) { // sends the form built by getArgs and maps the parsed response onto the result fields
+    var _a, _b, _c, _d, _e, _f, _g, _h, _i;
+    const currentDate = (_c = (_b = (_a = this.config._internal) == null ? void 0 : _a.currentDate) == null ? void 0 : _b.call(_a)) != null ? _c : /* @__PURE__ */ new Date(); // config._internal.currentDate() when provided, otherwise a new Date()
+    const { formData, warnings } = await this.getArgs(options);
+    const {
+      value: response,
+      responseHeaders,
+      rawValue: rawResponse
+    } = await postFormDataToApi2({
+      url: `${(_d = this.config.baseURL) != null ? _d : "https://api.x.ai/v1"}/stt`, // falls back to the literal default URL when config.baseURL is null or undefined
+      headers: combineHeaders7((_f = (_e = this.config).headers) == null ? void 0 : _f.call(_e), options.headers), // config.headers is called synchronously (result not awaited) when defined, then combined with the per-call headers
+      formData,
+      failedResponseHandler: xaiFailedResponseHandler,
+      successfulResponseHandler: createJsonResponseHandler6(
+        xaiTranscriptionResponseSchema
+      ),
+      abortSignal: options.abortSignal,
+      fetch: this.config.fetch
+    });
+    return {
+      text: response.text,
+      segments: (_h = (_g = response.words) == null ? void 0 : _g.map((word) => ({ // one segment per entry in `words`
+        text: word.text,
+        startSecond: word.start,
+        endSecond: word.end
+      }))) != null ? _h : [], // empty array when the response has no `words`
+      language: response.language || void 0, // null, undefined, and the empty string all become undefined
+      durationInSeconds: (_i = response.duration) != null ? _i : void 0, // null is normalized to undefined
+      warnings,
+      response: {
+        timestamp: currentDate,
+        modelId: this.modelId,
+        headers: responseHeaders,
+        body: rawResponse
+      }
+    };
+  }
+};
+var xaiTranscriptionResponseSchema = z21.object({ // shape handed to createJsonResponseHandler for the /stt JSON response
+  text: z21.string(), // the only required top-level field; becomes the result's `text`
+  language: z21.string().nullish(), // optional; may be null
+  duration: z21.number().nullish(), // optional; becomes `durationInSeconds`
+  words: z21.array( // optional list of word entries; each one becomes a segment
+    z21.object({
+      text: z21.string(),
+      start: z21.number(), // mapped to `startSecond`
+      end: z21.number() // mapped to `endSecond`
+    })
+  ).nullish()
+});
 // src/xai-provider.ts
 function createXai(options = {}) {
   var _a;
@@ -3904,6 +4255,22 @@ function createXai(options = {}) {
       fetch: options.fetch
     });
   };
+  const createSpeechModel = () => {
+    return new XaiSpeechModel("", {
+      provider: "xai.speech",
+      baseURL,
+      headers: getHeaders,
+      fetch: options.fetch
+    });
+  };
+  const createTranscriptionModel = () => {
+    return new XaiTranscriptionModel("", {
+      provider: "xai.transcription",
+      baseURL,
+      headers: getHeaders,
+      fetch: options.fetch
+    });
+  };
   const experimentalRealtimeFactory = Object.assign(
     (modelId) => createRealtimeModel(modelId),
     {
@@ -3941,6 +4308,10 @@ function createXai(options = {}) {
   provider.videoModel = createVideoModel;
   provider.video = createVideoModel;
   provider.experimental_realtime = experimentalRealtimeFactory;
+  provider.speechModel = createSpeechModel;
+  provider.speech = createSpeechModel;
+  provider.transcriptionModel = createTranscriptionModel;
+  provider.transcription = createTranscriptionModel;
   provider.files = createFiles;
   provider.tools = xaiTools;
   return provider;