npm - @runpod/ai-sdk-provider - Versions diffs - 1.0.1 → 1.2.0 - Mend

@runpod/ai-sdk-provider 1.0.1 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,25 @@
 # @runpod/ai-sdk-provider
+## 1.2.0
+### Minor Changes
+- cf0c976: Add transcription model support with `pruna/whisper-v3-large`
+  - Add `transcriptionModel()` and `transcription()` methods to the provider
+  - Support audio transcription via RunPod's Whisper endpoint
+  - Accept audio as `Uint8Array`, base64 string, or URL via providerOptions
+  - Return transcription text, segments with timing, detected language, and duration
+## 1.1.0
+### Minor Changes
+- 7ec59bc: add image models and improvements
+  - alibaba/wan-2.6: text-to-image model (max 1024x1024)
+  - qwen/qwen-image-edit-2511: edit model (max 1536x1536), supports 1-3 input images and loras
+  - google/nano-banana-edit: renamed from nano-banana-edit (backwards compatible), fixed payload format
+  - added resolution and aspect ratios columns to supported models table
 ## 1.0.1
 ### Patch Changes

package/README.md CHANGED Viewed

@@ -1,7 +1,5 @@
 # Runpod AI SDK Provider
-![Runpod AI SDK Provider banner](https://image.runpod.ai/runpod/ai-sdk-provider/banner.jpg)
 The **Runpod provider** for the [AI SDK](https://ai-sdk.dev/docs) contains language model and image generation support for [Runpod's](https://runpod.io) public endpoints.
 ## Setup
@@ -280,20 +278,22 @@ Check out our [examples](https://github.com/runpod/examples/tree/main/ai-sdk/get
 ### Supported Models
-| Model ID                               | Type |
-| -------------------------------------- | ---- |
-| `pruna/p-image-t2i`                    | t2i  |
-| `pruna/p-image-edit`                   | edit |
-| `google/nano-banana-pro-edit`          | edit |
-| `bytedance/seedream-3.0`               | t2i  |
-| `bytedance/seedream-4.0`               | t2i  |
-| `bytedance/seedream-4.0-edit`          | edit |
-| `qwen/qwen-image`                      | t2i  |
-| `qwen/qwen-image-edit`                 | edit |
-| `nano-banana-edit`                     | edit |
-| `black-forest-labs/flux-1-schnell`     | t2i  |
-| `black-forest-labs/flux-1-dev`         | t2i  |
-| `black-forest-labs/flux-1-kontext-dev` | edit |
+| Model ID                               | Type | Resolution        | Aspect Ratios                                   |
+| -------------------------------------- | ---- | ----------------- | ----------------------------------------------- |
+| `alibaba/wan-2.6`                      | t2i  | 768x768–1280x1280 | 1:1, 16:9, 9:16, 4:3, 3:4, 3:2, 2:3, 21:9, 9:21 |
+| `pruna/p-image-t2i`                    | t2i  | up to 1440x1440   | 1:1, 16:9, 9:16, 4:3, 3:4, 3:2, 2:3             |
+| `pruna/p-image-edit`                   | edit | up to 1440x1440   | 1:1, 16:9, 9:16, 4:3, 3:4, 3:2, 2:3             |
+| `google/nano-banana-edit`              | edit | up to 4096x4096   | 1:1, 4:3, 3:4                                   |
+| `google/nano-banana-pro-edit`          | edit | 1k, 2k, 4k        | 1:1, 16:9, 9:16, 4:3, 3:4, 3:2, 2:3, 21:9       |
+| `bytedance/seedream-3.0`               | t2i  | up to 4096x4096   | 1:1, 4:3, 3:4                                   |
+| `bytedance/seedream-4.0`               | t2i  | up to 4096x4096   | 1:1, 4:3, 3:4                                   |
+| `bytedance/seedream-4.0-edit`          | edit | up to 4096x4096   | uses size                                       |
+| `qwen/qwen-image`                      | t2i  | up to 4096x4096   | 1:1, 4:3, 3:4                                   |
+| `qwen/qwen-image-edit`                 | edit | up to 4096x4096   | 1:1, 4:3, 3:4                                   |
+| `qwen/qwen-image-edit-2511`            | edit | up to 1536x1536   | 1:1, 4:3, 3:4                                   |
+| `black-forest-labs/flux-1-schnell`     | t2i  | up to 2048x2048   | 1:1, 4:3, 3:4                                   |
+| `black-forest-labs/flux-1-dev`         | t2i  | up to 2048x2048   | 1:1, 4:3, 3:4                                   |
+| `black-forest-labs/flux-1-kontext-dev` | edit | up to 2048x2048   | 1:1, 4:3, 3:4                                   |
 For the full list of models, see the [Runpod Public Endpoint Reference](https://docs.runpod.io/hub/public-endpoint-reference).
@@ -301,21 +301,15 @@ For the full list of models, see the [Runpod Public Endpoint Reference](https://
 Additional options through `providerOptions.runpod` (supported options depend on the model):
-| Option                   | Type       | Default | Description                                                 |
-| ------------------------ | ---------- | ------- | ----------------------------------------------------------- |
-| `negative_prompt`        | `string`   | `""`    | What to avoid in the image (model-dependent)                |
-| `enable_safety_checker`  | `boolean`  | `true`  | Content safety filtering (model-dependent)                  |
-| `disable_safety_checker` | `boolean`  | `false` | Disable safety checker (Pruna)                              |
-| `aspect_ratio`           | `string`   | -       | Model-specific aspect ratio (Pruna: supports `custom`)      |
-| `image`                  | `string`   | -       | Legacy: Single input image URL/base64 (use `prompt.images`) |
-| `images`                 | `string[]` | -       | Legacy: Multiple input images (use `prompt.images`)         |
-| `resolution`             | `string`   | `"1k"`  | Output resolution: 1k, 2k, 4k (Nano Banana Pro)             |
-| `width` / `height`       | `number`   | -       | Custom dimensions (Pruna t2i, 256-1440; multiples of 16)    |
-| `num_inference_steps`    | `number`   | Auto    | Denoising steps (model-dependent)                           |
-| `guidance`               | `number`   | Auto    | Prompt adherence strength (model-dependent)                 |
-| `output_format`          | `string`   | `"png"` | Output format: png, jpg, jpeg, webp (model-dependent)       |
-| `maxPollAttempts`        | `number`   | `60`    | Max polling attempts                                        |
-| `pollIntervalMillis`     | `number`   | `5000`  | Polling interval (ms)                                       |
+| Option                  | Type      | Default | Description                                  |
+| ----------------------- | --------- | ------- | -------------------------------------------- |
+| `negative_prompt`       | `string`  | `""`    | What to avoid in the image (model-dependent) |
+| `enable_safety_checker` | `boolean` | `true`  | Content safety filtering (model-dependent)   |
+| `num_inference_steps`   | `number`  | Auto    | Denoising steps (model-dependent)            |
+| `guidance`              | `number`  | Auto    | Prompt adherence strength (model-dependent)  |
+| `output_format`         | `string`  | `"png"` | Output format: png, jpg, jpeg, webp          |
+| `maxPollAttempts`       | `number`  | `60`    | Max polling attempts                         |
+| `pollIntervalMillis`    | `number`  | `5000`  | Polling interval (ms)                        |
 **Example (providerOptions):**
@@ -361,18 +355,85 @@ const { image } = await generateImage({
 });
 ```
+#### Alibaba (WAN 2.6)
+Text-to-image model with flexible resolution support.
+**Resolution constraints:**
+- Total pixels: 589,824 (768x768) to 1,638,400 (1280x1280)
+- Aspect ratio: 1:4 to 4:1
+- Default: 1280x1280
+**Recommended resolutions for common aspect ratios:**
+| Aspect Ratio | Resolution |
+| :----------- | :--------- |
+| 1:1          | 1280x1280  |
+| 2:3          | 800x1200   |
+| 3:2          | 1200x800   |
+| 3:4          | 960x1280   |
+| 4:3          | 1280x960   |
+| 9:16         | 720x1280   |
+| 16:9         | 1280x720   |
+| 21:9         | 1344x576   |
+| 9:21         | 576x1344   |
+```ts
+const { image } = await generateImage({
+  model: runpod.image('alibaba/wan-2.6'),
+  prompt: 'A serene mountain landscape at dawn',
+  aspectRatio: '16:9',
+});
+```
 #### Google (Nano Banana Pro)
-Supported model: `google/nano-banana-pro-edit`
+| Option                              | Values           |
+| :---------------------------------- | :--------------- |
+| `providerOptions.runpod.resolution` | `1k`, `2k`, `4k` |
+```ts
+const { image } = await generateImage({
+  model: runpod.image('google/nano-banana-pro-edit'),
+  prompt: 'A futuristic cityscape at sunset',
+  aspectRatio: '16:9',
+  providerOptions: {
+    runpod: {
+      resolution: '4k',
+    },
+  },
+});
+```
+#### Qwen (Image Edit 2511)
+| Option                         | Values                 |
+| :----------------------------- | :--------------------- |
+| `providerOptions.runpod.loras` | `[{path, scale}, ...]` |
-| Parameter                       | Supported Values                                                  | Notes                                |
-| :------------------------------ | :---------------------------------------------------------------- | :----------------------------------- |
-| `aspectRatio`                   | `1:1`, `16:9`, `9:16`, `4:3`, `3:4`, `3:2`, `2:3`, `21:9`, `9:21` | Standard AI SDK parameter            |
-| `resolution`                    | `1k`, `2k`, `4k`                                                  | Output resolution quality            |
-| `output_format`                 | `jpeg`, `png`, `webp`                                             | Output image format                  |
-| `prompt.images`                 | `string[]`                                                        | Recommended. Input image(s) to edit. |
-| `files`                         | `ImageModelV3File[]`                                              | Alternative (lower-level).           |
-| `providerOptions.runpod.images` | `string[]`                                                        | Legacy. Input image(s) to edit.      |
+Supports 1-3 input images.
+```ts
+const { image } = await generateImage({
+  model: runpod.image('qwen/qwen-image-edit-2511'),
+  prompt: {
+    text: 'Transform into anime style',
+    images: ['https://image.runpod.ai/asset/qwen/qwen-image-edit-2511.png'],
+  },
+  size: '1024x1024',
+  providerOptions: {
+    runpod: {
+      loras: [
+        {
+          path: 'https://huggingface.co/flymy-ai/qwen-image-anime-irl-lora/resolve/main/flymy_anime_irl.safetensors',
+          scale: 1,
+        },
+      ],
+    },
+  },
+});
+```
 ## Speech Models
@@ -474,7 +535,7 @@ const result = await generateSpeech({
   text: 'Hello!',
   providerOptions: {
     runpod: {
-      voice_url: 'https://example.com/voice.wav',
+      voice_url: 'https://your-audio-host.com/your-voice-sample.wav', // 5-10s audio sample
     },
   },
 });
@@ -504,6 +565,94 @@ const result = await generateSpeech({
 });
 ```
+## Transcription Models
+Transcribe audio using the AI SDK's `experimental_transcribe` and `runpod.transcription(...)`:
+```ts
+import { runpod } from '@runpod/ai-sdk-provider';
+import { experimental_transcribe as transcribe } from 'ai';
+const result = await transcribe({
+  model: runpod.transcription('pruna/whisper-v3-large'),
+  audio: new URL('https://image.runpod.ai/demo/transcription-demo.wav'),
+});
+console.log(result.text);
+```
+**Returns:**
+- `result.text` - Full transcription text
+- `result.segments` - Array of segments with timing info
+  - `segment.text` - Segment text
+  - `segment.startSecond` - Start time in seconds
+  - `segment.endSecond` - End time in seconds
+- `result.language` - Detected language code
+- `result.durationInSeconds` - Audio duration
+- `result.warnings` - Array of any warnings
+- `result.providerMetadata.runpod.jobId` - Runpod job ID
+### Audio Input
+You can provide audio in several ways:
+```ts
+// URL (recommended for large files)
+const result = await transcribe({
+  model: runpod.transcription('pruna/whisper-v3-large'),
+  audio: new URL('https://image.runpod.ai/demo/transcription-demo.wav'),
+});
+// Local file as Uint8Array
+import { readFileSync } from 'fs';
+const audioData = readFileSync('./audio.wav');
+const result = await transcribe({
+  model: runpod.transcription('pruna/whisper-v3-large'),
+  audio: audioData,
+});
+```
+### Examples
+Check out our [examples](https://github.com/runpod/examples/tree/main/ai-sdk/getting-started) for more code snippets on how to use all the different models.
+### Supported Models
+- `pruna/whisper-v3-large`
+### Provider Options
+Use `providerOptions.runpod` for model-specific parameters:
+| Option              | Type      | Default | Description                                    |
+| ------------------- | --------- | ------- | ---------------------------------------------- |
+| `audio`             | `string`  | -       | URL to audio file (alternative to binary data) |
+| `prompt`            | `string`  | -       | Context prompt to guide transcription          |
+| `language`          | `string`  | Auto    | ISO-639-1 language code (e.g., 'en', 'es')     |
+| `word_timestamps`   | `boolean` | `false` | Include word-level timestamps                  |
+| `translate`         | `boolean` | `false` | Translate audio to English                     |
+| `enable_vad`        | `boolean` | `false` | Enable voice activity detection                |
+| `maxPollAttempts`   | `number`  | `120`   | Max polling attempts                           |
+| `pollIntervalMillis` | `number`  | `2000`  | Polling interval (ms)                          |
+**Example (providerOptions):**
+```ts
+const result = await transcribe({
+  model: runpod.transcription('pruna/whisper-v3-large'),
+  audio: new URL('https://image.runpod.ai/demo/transcription-demo.wav'),
+  providerOptions: {
+    runpod: {
+      language: 'en',
+      prompt: 'This is a demo of audio transcription',
+      word_timestamps: true,
+    },
+  },
+});
+```
 ## About Runpod
 [Runpod](https://runpod.io) is the foundation for developers to build, deploy, and scale custom AI systems.

package/dist/index.d.mts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { LanguageModelV3, ImageModelV3, SpeechModelV3 } from '@ai-sdk/provider';
+import { LanguageModelV3, ImageModelV3, SpeechModelV3, TranscriptionModelV3 } from '@ai-sdk/provider';
 import { FetchFunction } from '@ai-sdk/provider-utils';
 export { OpenAICompatibleErrorData as RunpodErrorData } from '@ai-sdk/openai-compatible';
 import { z } from 'zod';
@@ -56,6 +56,14 @@ interface RunpodProvider {
   Creates a speech model for speech generation.
   */
     speech(modelId: string): SpeechModelV3;
+    /**
+  Creates a transcription model for audio transcription.
+  */
+    transcriptionModel(modelId: string): TranscriptionModelV3;
+    /**
+  Creates a transcription model for audio transcription.
+  */
+    transcription(modelId: string): TranscriptionModelV3;
 }
 declare function createRunpod(options?: RunpodProviderSettings): RunpodProvider;
 declare const runpod: RunpodProvider;
@@ -64,7 +72,66 @@ type RunpodChatModelId = 'qwen/qwen3-32b-awq' | (string & {});
 type RunpodCompletionModelId = 'qwen/qwen3-32b-awq' | (string & {});
-type RunpodImageModelId = 'qwen/qwen-image' | 'qwen/qwen-image-edit' | 'bytedance/seedream-3.0' | 'bytedance/seedream-4.0' | 'bytedance/seedream-4.0-edit' | 'black-forest-labs/flux-1-kontext-dev' | 'black-forest-labs/flux-1-schnell' | 'black-forest-labs/flux-1-dev' | 'nano-banana-edit';
+type RunpodImageModelId = 'qwen/qwen-image' | 'qwen/qwen-image-edit' | 'qwen/qwen-image-edit-2511' | 'bytedance/seedream-3.0' | 'bytedance/seedream-4.0' | 'bytedance/seedream-4.0-edit' | 'black-forest-labs/flux-1-kontext-dev' | 'black-forest-labs/flux-1-schnell' | 'black-forest-labs/flux-1-dev' | 'alibaba/wan-2.6' | 'google/nano-banana-edit' | 'nano-banana-edit';
+type RunpodTranscriptionModelId = 'pruna/whisper-v3-large' | (string & {});
+interface RunpodTranscriptionProviderOptions {
+    /**
+     * URL to audio file. Use this if you want to pass an audio URL directly
+     * instead of binary audio data.
+     */
+    audio?: string;
+    /**
+     * Optional context prompt to guide the transcription (initial_prompt in Whisper).
+     */
+    prompt?: string;
+    /**
+     * Alias for prompt - the initial prompt for the first window.
+     */
+    initial_prompt?: string;
+    /**
+     * Language of the audio in ISO-639-1 format (e.g., 'en', 'es', 'fr').
+     * If not specified, Whisper will auto-detect the language.
+     */
+    language?: string;
+    /**
+     * Whether to include word-level timestamps in the response.
+     * @default false
+     */
+    word_timestamps?: boolean;
+    /**
+     * Whisper model to use.
+     * Options: 'tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2', 'large-v3', 'turbo'
+     * @default 'base'
+     */
+    model?: string;
+    /**
+     * Output format for transcription.
+     * Options: 'plain_text', 'formatted_text', 'srt', 'vtt'
+     * @default 'plain_text'
+     */
+    transcription?: string;
+    /**
+     * Whether to translate the audio to English.
+     * @default false
+     */
+    translate?: boolean;
+    /**
+     * Whether to enable voice activity detection.
+     * @default false
+     */
+    enable_vad?: boolean;
+    /**
+     * Maximum number of polling attempts before timing out.
+     * @default 120
+     */
+    maxPollAttempts?: number;
+    /**
+     * Interval between polling attempts in milliseconds.
+     * @default 2000
+     */
+    pollIntervalMillis?: number;
+}
 declare const runpodImageErrorSchema: z.ZodObject<{
     error: z.ZodOptional<z.ZodString>;
@@ -78,4 +145,4 @@ declare const runpodImageErrorSchema: z.ZodObject<{
 }>;
 type RunpodImageErrorData = z.infer<typeof runpodImageErrorSchema>;
-export { type RunpodChatModelId, type RunpodCompletionModelId, type RunpodImageErrorData, type RunpodImageModelId, type RunpodProvider, type RunpodProviderSettings, createRunpod, runpod };
+export { type RunpodChatModelId, type RunpodCompletionModelId, type RunpodImageErrorData, type RunpodImageModelId, type RunpodProvider, type RunpodProviderSettings, type RunpodTranscriptionModelId, type RunpodTranscriptionProviderOptions, createRunpod, runpod };

package/dist/index.d.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { LanguageModelV3, ImageModelV3, SpeechModelV3 } from '@ai-sdk/provider';
+import { LanguageModelV3, ImageModelV3, SpeechModelV3, TranscriptionModelV3 } from '@ai-sdk/provider';
 import { FetchFunction } from '@ai-sdk/provider-utils';
 export { OpenAICompatibleErrorData as RunpodErrorData } from '@ai-sdk/openai-compatible';
 import { z } from 'zod';
@@ -56,6 +56,14 @@ interface RunpodProvider {
   Creates a speech model for speech generation.
   */
     speech(modelId: string): SpeechModelV3;
+    /**
+  Creates a transcription model for audio transcription.
+  */
+    transcriptionModel(modelId: string): TranscriptionModelV3;
+    /**
+  Creates a transcription model for audio transcription.
+  */
+    transcription(modelId: string): TranscriptionModelV3;
 }
 declare function createRunpod(options?: RunpodProviderSettings): RunpodProvider;
 declare const runpod: RunpodProvider;
@@ -64,7 +72,66 @@ type RunpodChatModelId = 'qwen/qwen3-32b-awq' | (string & {});
 type RunpodCompletionModelId = 'qwen/qwen3-32b-awq' | (string & {});
-type RunpodImageModelId = 'qwen/qwen-image' | 'qwen/qwen-image-edit' | 'bytedance/seedream-3.0' | 'bytedance/seedream-4.0' | 'bytedance/seedream-4.0-edit' | 'black-forest-labs/flux-1-kontext-dev' | 'black-forest-labs/flux-1-schnell' | 'black-forest-labs/flux-1-dev' | 'nano-banana-edit';
+type RunpodImageModelId = 'qwen/qwen-image' | 'qwen/qwen-image-edit' | 'qwen/qwen-image-edit-2511' | 'bytedance/seedream-3.0' | 'bytedance/seedream-4.0' | 'bytedance/seedream-4.0-edit' | 'black-forest-labs/flux-1-kontext-dev' | 'black-forest-labs/flux-1-schnell' | 'black-forest-labs/flux-1-dev' | 'alibaba/wan-2.6' | 'google/nano-banana-edit' | 'nano-banana-edit';
+type RunpodTranscriptionModelId = 'pruna/whisper-v3-large' | (string & {});
+interface RunpodTranscriptionProviderOptions {
+    /**
+     * URL to audio file. Use this if you want to pass an audio URL directly
+     * instead of binary audio data.
+     */
+    audio?: string;
+    /**
+     * Optional context prompt to guide the transcription (initial_prompt in Whisper).
+     */
+    prompt?: string;
+    /**
+     * Alias for prompt - the initial prompt for the first window.
+     */
+    initial_prompt?: string;
+    /**
+     * Language of the audio in ISO-639-1 format (e.g., 'en', 'es', 'fr').
+     * If not specified, Whisper will auto-detect the language.
+     */
+    language?: string;
+    /**
+     * Whether to include word-level timestamps in the response.
+     * @default false
+     */
+    word_timestamps?: boolean;
+    /**
+     * Whisper model to use.
+     * Options: 'tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2', 'large-v3', 'turbo'
+     * @default 'base'
+     */
+    model?: string;
+    /**
+     * Output format for transcription.
+     * Options: 'plain_text', 'formatted_text', 'srt', 'vtt'
+     * @default 'plain_text'
+     */
+    transcription?: string;
+    /**
+     * Whether to translate the audio to English.
+     * @default false
+     */
+    translate?: boolean;
+    /**
+     * Whether to enable voice activity detection.
+     * @default false
+     */
+    enable_vad?: boolean;
+    /**
+     * Maximum number of polling attempts before timing out.
+     * @default 120
+     */
+    maxPollAttempts?: number;
+    /**
+     * Interval between polling attempts in milliseconds.
+     * @default 2000
+     */
+    pollIntervalMillis?: number;
+}
 declare const runpodImageErrorSchema: z.ZodObject<{
     error: z.ZodOptional<z.ZodString>;
@@ -78,4 +145,4 @@ declare const runpodImageErrorSchema: z.ZodObject<{
 }>;
 type RunpodImageErrorData = z.infer<typeof runpodImageErrorSchema>;
-export { type RunpodChatModelId, type RunpodCompletionModelId, type RunpodImageErrorData, type RunpodImageModelId, type RunpodProvider, type RunpodProviderSettings, createRunpod, runpod };
+export { type RunpodChatModelId, type RunpodCompletionModelId, type RunpodImageErrorData, type RunpodImageModelId, type RunpodProvider, type RunpodProviderSettings, type RunpodTranscriptionModelId, type RunpodTranscriptionProviderOptions, createRunpod, runpod };