@runpod/ai-sdk-provider 1.1.0 → 1.3.0

package/CHANGELOG.md CHANGED
@@ -1,5 +1,21 @@
  # @runpod/ai-sdk-provider
 
+ ## 1.3.0
+
+ ### Minor Changes
+
+ - 973fae6: Add support for the Tongyi-MAI Z-Image Turbo image model with validated sizes and aspect ratios.
+
+ ## 1.2.0
+
+ ### Minor Changes
+
+ - cf0c976: Add transcription model support with `pruna/whisper-v3-large`
+   - Add `transcriptionModel()` and `transcription()` methods to the provider
+   - Support audio transcription via RunPod's Whisper endpoint
+   - Accept audio as `Uint8Array`, base64 string, or URL via providerOptions
+   - Return transcription text, segments with timing, detected language, and duration
+
  ## 1.1.0
 
  ### Minor Changes
package/README.md CHANGED
@@ -278,22 +278,23 @@ Check out our [examples](https://github.com/runpod/examples/tree/main/ai-sdk/get
 
  ### Supported Models
 
- | Model ID                               | Type | Resolution      | Aspect Ratios                             |
- | -------------------------------------- | ---- | --------------- | ----------------------------------------- |
- | `alibaba/wan-2.6`                      | t2i  | 1024x1024       | 1:1, 4:3, 3:4                             |
- | `pruna/p-image-t2i`                    | t2i  | up to 1440x1440 | 1:1, 16:9, 9:16, 4:3, 3:4, 3:2, 2:3       |
- | `pruna/p-image-edit`                   | edit | up to 1440x1440 | 1:1, 16:9, 9:16, 4:3, 3:4, 3:2, 2:3       |
- | `google/nano-banana-edit`              | edit | up to 4096x4096 | 1:1, 4:3, 3:4                             |
- | `google/nano-banana-pro-edit`          | edit | 1k, 2k, 4k      | 1:1, 16:9, 9:16, 4:3, 3:4, 3:2, 2:3, 21:9 |
- | `bytedance/seedream-3.0`               | t2i  | up to 4096x4096 | 1:1, 4:3, 3:4                             |
- | `bytedance/seedream-4.0`               | t2i  | up to 4096x4096 | 1:1, 4:3, 3:4                             |
- | `bytedance/seedream-4.0-edit`          | edit | up to 4096x4096 | uses size                                 |
- | `qwen/qwen-image`                      | t2i  | up to 4096x4096 | 1:1, 4:3, 3:4                             |
- | `qwen/qwen-image-edit`                 | edit | up to 4096x4096 | 1:1, 4:3, 3:4                             |
- | `qwen/qwen-image-edit-2511`            | edit | up to 1536x1536 | 1:1, 4:3, 3:4                             |
- | `black-forest-labs/flux-1-schnell`     | t2i  | up to 2048x2048 | 1:1, 4:3, 3:4                             |
- | `black-forest-labs/flux-1-dev`         | t2i  | up to 2048x2048 | 1:1, 4:3, 3:4                             |
- | `black-forest-labs/flux-1-kontext-dev` | edit | up to 2048x2048 | 1:1, 4:3, 3:4                             |
+ | Model ID                               | Type | Resolution        | Aspect Ratios                                   |
+ | -------------------------------------- | ---- | ----------------- | ----------------------------------------------- |
+ | `alibaba/wan-2.6`                      | t2i  | 768x768–1280x1280 | 1:1, 16:9, 9:16, 4:3, 3:4, 3:2, 2:3, 21:9, 9:21 |
+ | `pruna/p-image-t2i`                    | t2i  | up to 1440x1440   | 1:1, 16:9, 9:16, 4:3, 3:4, 3:2, 2:3             |
+ | `pruna/p-image-edit`                   | edit | up to 1440x1440   | 1:1, 16:9, 9:16, 4:3, 3:4, 3:2, 2:3             |
+ | `google/nano-banana-edit`              | edit | up to 4096x4096   | 1:1, 4:3, 3:4                                   |
+ | `google/nano-banana-pro-edit`          | edit | 1k, 2k, 4k        | 1:1, 16:9, 9:16, 4:3, 3:4, 3:2, 2:3, 21:9       |
+ | `bytedance/seedream-3.0`               | t2i  | up to 4096x4096   | 1:1, 4:3, 3:4                                   |
+ | `bytedance/seedream-4.0`               | t2i  | up to 4096x4096   | 1:1, 4:3, 3:4                                   |
+ | `bytedance/seedream-4.0-edit`          | edit | up to 4096x4096   | uses size                                       |
+ | `qwen/qwen-image`                      | t2i  | up to 4096x4096   | 1:1, 4:3, 3:4                                   |
+ | `qwen/qwen-image-edit`                 | edit | up to 4096x4096   | 1:1, 4:3, 3:4                                   |
+ | `qwen/qwen-image-edit-2511`            | edit | up to 1536x1536   | 1:1, 4:3, 3:4                                   |
+ | `tongyi-mai/z-image-turbo`             | t2i  | up to 1536x1536   | 1:1, 4:3, 3:4, 3:2, 2:3, 16:9, 9:16             |
+ | `black-forest-labs/flux-1-schnell`     | t2i  | up to 2048x2048   | 1:1, 4:3, 3:4                                   |
+ | `black-forest-labs/flux-1-dev`         | t2i  | up to 2048x2048   | 1:1, 4:3, 3:4                                   |
+ | `black-forest-labs/flux-1-kontext-dev` | edit | up to 2048x2048   | 1:1, 4:3, 3:4                                   |
 
  For the full list of models, see the [Runpod Public Endpoint Reference](https://docs.runpod.io/hub/public-endpoint-reference).
 
@@ -355,6 +356,38 @@ const { image } = await generateImage({
  });
  ```
 
+ #### Alibaba (WAN 2.6)
+
+ Text-to-image model with flexible resolution support.
+
+ **Resolution constraints:**
+
+ - Total pixels: 589,824 (768x768) to 1,638,400 (1280x1280)
+ - Aspect ratio: 1:4 to 4:1
+ - Default: 1280x1280
+
+ **Recommended resolutions for common aspect ratios:**
+
+ | Aspect Ratio | Resolution |
+ | :----------- | :--------- |
+ | 1:1          | 1280x1280  |
+ | 2:3          | 800x1200   |
+ | 3:2          | 1200x800   |
+ | 3:4          | 960x1280   |
+ | 4:3          | 1280x960   |
+ | 9:16         | 720x1280   |
+ | 16:9         | 1280x720   |
+ | 21:9         | 1344x576   |
+ | 9:21         | 576x1344   |
+
+ ```ts
+ const { image } = await generateImage({
+   model: runpod.image('alibaba/wan-2.6'),
+   prompt: 'A serene mountain landscape at dawn',
+   aspectRatio: '16:9',
+ });
+ ```
+
  #### Google (Nano Banana Pro)
 
  | Option | Values |
@@ -403,6 +436,14 @@ const { image } = await generateImage({
  });
  ```
 
+ #### Tongyi-MAI (Z-Image Turbo)
+
+ Supported model: `tongyi-mai/z-image-turbo`
+
+ - Supported sizes (validated by the provider): 512x512, 768x768, 1024x1024, 1280x1280, 1536x1536, 512x768, 768x512, 1024x768, 768x1024, 1328x1328, 1472x1140, 1140x1472, 768x432, 1024x576, 1280x720, 1536x864, 432x768, 576x1024, 720x1280, 864x1536
+ - Supported `aspectRatio` values: 1:1, 4:3, 3:4, 3:2, 2:3, 16:9, 9:16 (each maps to one of the sizes above; use `size` for exact dimensions)
+ - Additional parameters: `strength`, `output_format`, `enable_safety_checker`, `seed`
+
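+ A minimal sketch (assuming the extra parameters above are passed through `providerOptions.runpod`, as with the other model families in this README):
+
+ ```ts
+ import { runpod } from '@runpod/ai-sdk-provider';
+ import { experimental_generateImage as generateImage } from 'ai';
+
+ const { image } = await generateImage({
+   model: runpod.image('tongyi-mai/z-image-turbo'),
+   prompt: 'A neon-lit street market at night',
+   size: '1280x720', // one of the validated sizes listed above
+   providerOptions: {
+     runpod: {
+       seed: 42,
+       output_format: 'png', // assumed value; the accepted formats are not documented here
+     },
+   },
+ });
+ ```
+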
  ## Speech Models
 
  Generate speech using the AI SDK's `generateSpeech` and `runpod.speech(...)`:
@@ -533,6 +574,185 @@ const result = await generateSpeech({
  });
  ```
 
+ ## Transcription Models
+
+ Transcribe audio using the AI SDK's `experimental_transcribe` and `runpod.transcription(...)`:
+
+ ```ts
+ import { runpod } from '@runpod/ai-sdk-provider';
+ import { experimental_transcribe as transcribe } from 'ai';
+
+ const result = await transcribe({
+   model: runpod.transcription('pruna/whisper-v3-large'),
+   audio: new URL('https://image.runpod.ai/demo/transcription-demo.wav'),
+ });
+
+ console.log(result.text);
+ ```
+
+ **Returns:**
+
+ - `result.text` - Full transcription text
+ - `result.segments` - Array of segments with timing info
+   - `segment.text` - Segment text
+   - `segment.startSecond` - Start time in seconds
+   - `segment.endSecond` - End time in seconds
+ - `result.language` - Detected language code
+ - `result.durationInSeconds` - Audio duration
+ - `result.warnings` - Array of any warnings
+ - `result.providerMetadata.runpod.jobId` - Runpod job ID
+
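+ For example, the segments can be used to print a timestamped transcript (a short sketch; `result` comes from the `transcribe` call above):
+
+ ```ts
+ for (const segment of result.segments) {
+   // startSecond/endSecond are the timing fields described above
+   console.log(`[${segment.startSecond.toFixed(1)}s -> ${segment.endSecond.toFixed(1)}s] ${segment.text}`);
+ }
+ ```
+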
+ ### Audio Input
+
+ You can provide audio in several ways:
+
+ ```ts
+ import { readFileSync } from 'fs';
+
+ // URL (recommended for large files)
+ const fromUrl = await transcribe({
+   model: runpod.transcription('pruna/whisper-v3-large'),
+   audio: new URL('https://image.runpod.ai/demo/transcription-demo.wav'),
+ });
+
+ // Local file as Uint8Array
+ const audioData = readFileSync('./audio.wav');
+
+ const fromFile = await transcribe({
+   model: runpod.transcription('pruna/whisper-v3-large'),
+   audio: audioData,
+ });
+ ```
+
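+ Base64-encoded audio strings are also accepted (per the 1.2.0 changelog entry). A sketch, assuming the AI SDK's standard handling of base64 data in the `audio` argument:
+
+ ```ts
+ import { readFileSync } from 'fs';
+
+ // Encode the file as base64 and pass it as a plain string
+ const base64Audio = readFileSync('./audio.wav').toString('base64');
+
+ const result = await transcribe({
+   model: runpod.transcription('pruna/whisper-v3-large'),
+   audio: base64Audio,
+ });
+ ```
+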
+ ### Examples
+
+ Check out our [examples](https://github.com/runpod/examples/tree/main/ai-sdk/getting-started) for more code snippets on how to use all the different models.
+
+ ### Supported Models
+
+ - `pruna/whisper-v3-large`
+
+ ### Provider Options
+
+ Use `providerOptions.runpod` for model-specific parameters:
+
+ | Option               | Type      | Default | Description                                    |
+ | -------------------- | --------- | ------- | ---------------------------------------------- |
+ | `audio`              | `string`  | -       | URL to audio file (alternative to binary data) |
+ | `prompt`             | `string`  | -       | Context prompt to guide transcription          |
+ | `language`           | `string`  | Auto    | ISO-639-1 language code (e.g., 'en', 'es')     |
+ | `word_timestamps`    | `boolean` | `false` | Include word-level timestamps                  |
+ | `translate`          | `boolean` | `false` | Translate audio to English                     |
+ | `enable_vad`         | `boolean` | `false` | Enable voice activity detection                |
+ | `maxPollAttempts`    | `number`  | `120`   | Max polling attempts                           |
+ | `pollIntervalMillis` | `number`  | `2000`  | Polling interval (ms)                          |
+
+ **Example (providerOptions):**
+
+ ```ts
+ const result = await transcribe({
+   model: runpod.transcription('pruna/whisper-v3-large'),
+   audio: new URL('https://image.runpod.ai/demo/transcription-demo.wav'),
+   providerOptions: {
+     runpod: {
+       language: 'en',
+       prompt: 'This is a demo of audio transcription',
+       word_timestamps: true,
+     },
+   },
+ });
+ ```
+
+ ## Video Models
+
+ Generate videos using the AI SDK's `experimental_generateVideo` and `runpod.video(...)`:
+
+ ```ts
+ import { runpod } from '@runpod/ai-sdk-provider';
+ import { experimental_generateVideo as generateVideo } from 'ai';
+
+ // Text-to-video
+ const result = await generateVideo({
+   model: runpod.video('alibaba/wan-2.6-t2v'),
+   prompt: 'A golden retriever running on a sunny beach, cinematic, 4k',
+ });
+
+ console.log(result.video.url);
+ ```
+
+ ```ts
+ // Image-to-video
+ const result = await generateVideo({
+   model: runpod.video('alibaba/wan-2.6-i2v'),
+   prompt: 'Animate this scene with gentle camera movement',
+   image: new URL('https://example.com/image.png'),
+ });
+
+ console.log(result.video.url);
+ ```
+
+ **Returns:**
+
+ - `result.video` - Generated video (`{ type: 'url', url, mediaType: 'video/mp4' }`)
+ - `result.warnings` - Array of any warnings
+ - `result.providerMetadata.runpod.jobId` - Runpod job ID
+
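+ Because the video is returned as a URL, it can be downloaded and saved locally. A generic sketch (not provider-specific), using Node's global `fetch` and `fs/promises`:
+
+ ```ts
+ import { writeFile } from 'fs/promises';
+
+ // Fetch the generated video and write it to disk
+ const response = await fetch(result.video.url);
+ await writeFile('output.mp4', Buffer.from(await response.arrayBuffer()));
+ ```
+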
+ ### Examples
+
+ Check out our [examples](https://github.com/runpod/examples/tree/main/ai-sdk/getting-started) for more code snippets on how to use all the different models.
+
+ ### Supported Models
+
+ | Model ID                                | Type        | Company             |
+ | --------------------------------------- | ----------- | ------------------- |
+ | `pruna/p-video`                         | t2v         | Pruna AI            |
+ | `vidu/q3-t2v`                           | t2v         | Shengshu Technology |
+ | `vidu/q3-i2v`                           | i2v         | Shengshu Technology |
+ | `kwaivgi/kling-v2.6-std-motion-control` | i2v + video | KwaiVGI (Kuaishou)  |
+ | `kwaivgi/kling-video-o1-r2v`            | i2v         | KwaiVGI (Kuaishou)  |
+ | `kwaivgi/kling-v2.1-i2v-pro`            | i2v         | KwaiVGI (Kuaishou)  |
+ | `alibaba/wan-2.6-t2v`                   | t2v         | Alibaba             |
+ | `alibaba/wan-2.6-i2v`                   | i2v         | Alibaba             |
+ | `alibaba/wan-2.5`                       | i2v         | Alibaba             |
+ | `alibaba/wan-2.2-t2v-720-lora`          | t2v         | Alibaba             |
+ | `alibaba/wan-2.2-i2v-720`               | i2v         | Alibaba             |
+ | `alibaba/wan-2.1-i2v-720`               | i2v         | Alibaba             |
+ | `bytedance/seedance-v1.5-pro-i2v`       | i2v         | ByteDance           |
+ | `openai/sora-2-pro-i2v`                 | i2v         | OpenAI              |
+ | `openai/sora-2-i2v`                     | i2v         | OpenAI              |
+
+ ### Provider Options
+
+ Use `providerOptions.runpod` for model-specific parameters:
+
+ | Option                | Type     | Default | Description                           |
+ | --------------------- | -------- | ------- | ------------------------------------- |
+ | `negative_prompt`     | `string` | -       | What to avoid in the generated video  |
+ | `guidance_scale`      | `number` | -       | Guidance scale for prompt adherence   |
+ | `num_inference_steps` | `number` | -       | Number of inference steps             |
+ | `style`               | `string` | -       | Style preset (model-specific)         |
+ | `maxPollAttempts`     | `number` | `120`   | Max polling attempts                  |
+ | `pollIntervalMillis`  | `number` | `5000`  | Polling interval (ms)                 |
+
+ Any additional model-specific parameters can be passed through `providerOptions.runpod` and will be forwarded to the API.
+
+ **Example (providerOptions):**
+
+ ```ts
+ const result = await generateVideo({
+   model: runpod.video('alibaba/wan-2.6-t2v'),
+   prompt: 'A serene mountain landscape with flowing water',
+   duration: 5,
+   aspectRatio: '16:9',
+   seed: 42,
+   providerOptions: {
+     runpod: {
+       negative_prompt: 'blurry, low quality',
+       guidance_scale: 7.5,
+     },
+   },
+ });
+ ```
+
  ## About Runpod
 
  [Runpod](https://runpod.io) is the foundation for developers to build, deploy, and scale custom AI systems.
package/dist/index.d.mts CHANGED
@@ -1,4 +1,4 @@
- import { LanguageModelV3, ImageModelV3, SpeechModelV3 } from '@ai-sdk/provider';
+ import { LanguageModelV3, ImageModelV3, SpeechModelV3, TranscriptionModelV3, Experimental_VideoModelV3 } from '@ai-sdk/provider';
  import { FetchFunction } from '@ai-sdk/provider-utils';
  export { OpenAICompatibleErrorData as RunpodErrorData } from '@ai-sdk/openai-compatible';
  import { z } from 'zod';
@@ -56,6 +56,22 @@ interface RunpodProvider {
    Creates a speech model for speech generation.
    */
    speech(modelId: string): SpeechModelV3;
+   /**
+   Creates a transcription model for audio transcription.
+   */
+   transcriptionModel(modelId: string): TranscriptionModelV3;
+   /**
+   Creates a transcription model for audio transcription.
+   */
+   transcription(modelId: string): TranscriptionModelV3;
+   /**
+   Creates a video model for video generation.
+   */
+   videoModel(modelId: string): Experimental_VideoModelV3;
+   /**
+   Creates a video model for video generation.
+   */
+   video(modelId: string): Experimental_VideoModelV3;
  }
  declare function createRunpod(options?: RunpodProviderSettings): RunpodProvider;
  declare const runpod: RunpodProvider;
@@ -64,7 +80,101 @@ type RunpodChatModelId = 'qwen/qwen3-32b-awq' | (string & {});
 
  type RunpodCompletionModelId = 'qwen/qwen3-32b-awq' | (string & {});
 
- type RunpodImageModelId = 'qwen/qwen-image' | 'qwen/qwen-image-edit' | 'qwen/qwen-image-edit-2511' | 'bytedance/seedream-3.0' | 'bytedance/seedream-4.0' | 'bytedance/seedream-4.0-edit' | 'black-forest-labs/flux-1-kontext-dev' | 'black-forest-labs/flux-1-schnell' | 'black-forest-labs/flux-1-dev' | 'alibaba/wan-2.6' | 'google/nano-banana-edit' | 'nano-banana-edit';
+ type RunpodImageModelId = 'qwen/qwen-image' | 'qwen/qwen-image-edit' | 'qwen/qwen-image-edit-2511' | 'bytedance/seedream-3.0' | 'bytedance/seedream-4.0' | 'bytedance/seedream-4.0-edit' | 'black-forest-labs/flux-1-kontext-dev' | 'black-forest-labs/flux-1-schnell' | 'black-forest-labs/flux-1-dev' | 'alibaba/wan-2.6' | 'tongyi-mai/z-image-turbo' | 'google/nano-banana-edit' | 'nano-banana-edit';
+
+ type RunpodTranscriptionModelId = 'pruna/whisper-v3-large' | (string & {});
+ interface RunpodTranscriptionProviderOptions {
+   /**
+    * URL to audio file. Use this if you want to pass an audio URL directly
+    * instead of binary audio data.
+    */
+   audio?: string;
+   /**
+    * Optional context prompt to guide the transcription (initial_prompt in Whisper).
+    */
+   prompt?: string;
+   /**
+    * Alias for prompt - the initial prompt for the first window.
+    */
+   initial_prompt?: string;
+   /**
+    * Language of the audio in ISO-639-1 format (e.g., 'en', 'es', 'fr').
+    * If not specified, Whisper will auto-detect the language.
+    */
+   language?: string;
+   /**
+    * Whether to include word-level timestamps in the response.
+    * @default false
+    */
+   word_timestamps?: boolean;
+   /**
+    * Whisper model to use.
+    * Options: 'tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2', 'large-v3', 'turbo'
+    * @default 'base'
+    */
+   model?: string;
+   /**
+    * Output format for transcription.
+    * Options: 'plain_text', 'formatted_text', 'srt', 'vtt'
+    * @default 'plain_text'
+    */
+   transcription?: string;
+   /**
+    * Whether to translate the audio to English.
+    * @default false
+    */
+   translate?: boolean;
+   /**
+    * Whether to enable voice activity detection.
+    * @default false
+    */
+   enable_vad?: boolean;
+   /**
+    * Maximum number of polling attempts before timing out.
+    * @default 120
+    */
+   maxPollAttempts?: number;
+   /**
+    * Interval between polling attempts in milliseconds.
+    * @default 2000
+    */
+   pollIntervalMillis?: number;
+ }
+
+ type RunpodVideoModelId = 'pruna/p-video' | 'vidu/q3-t2v' | 'vidu/q3-i2v' | 'kwaivgi/kling-v2.6-std-motion-control' | 'kwaivgi/kling-video-o1-r2v' | 'kwaivgi/kling-v2.1-i2v-pro' | 'alibaba/wan-2.6-t2v' | 'alibaba/wan-2.6-i2v' | 'alibaba/wan-2.5' | 'alibaba/wan-2.2-t2v-720-lora' | 'alibaba/wan-2.2-i2v-720' | 'alibaba/wan-2.1-i2v-720' | 'bytedance/seedance-v1.5-pro-i2v' | 'openai/sora-2-pro-i2v' | 'openai/sora-2-i2v' | (string & {});
+ interface RunpodVideoProviderOptions {
+   /**
+    * Negative prompt to guide what to avoid in the generated video.
+    */
+   negative_prompt?: string;
+   /**
+    * Style preset for video generation (model-specific).
+    */
+   style?: string;
+   /**
+    * Guidance scale for prompt adherence.
+    */
+   guidance_scale?: number;
+   /**
+    * Number of inference steps.
+    */
+   num_inference_steps?: number;
+   /**
+    * Maximum number of polling attempts before timing out.
+    * @default 120
+    */
+   maxPollAttempts?: number;
+   /**
+    * Interval between polling attempts in milliseconds.
+    * @default 5000
+    */
+   pollIntervalMillis?: number;
+   /**
+    * Additional model-specific parameters are passed through via
+    * index signature.
+    */
+   [key: string]: unknown;
+ }
 
  declare const runpodImageErrorSchema: z.ZodObject<{
    error: z.ZodOptional<z.ZodString>;
@@ -78,4 +188,4 @@ declare const runpodImageErrorSchema: z.ZodObject<{
  }>;
  type RunpodImageErrorData = z.infer<typeof runpodImageErrorSchema>;
 
- export { type RunpodChatModelId, type RunpodCompletionModelId, type RunpodImageErrorData, type RunpodImageModelId, type RunpodProvider, type RunpodProviderSettings, createRunpod, runpod };
+ export { type RunpodChatModelId, type RunpodCompletionModelId, type RunpodImageErrorData, type RunpodImageModelId, type RunpodProvider, type RunpodProviderSettings, type RunpodTranscriptionModelId, type RunpodTranscriptionProviderOptions, type RunpodVideoModelId, type RunpodVideoProviderOptions, createRunpod, runpod };
package/dist/index.d.ts CHANGED
@@ -1,4 +1,4 @@
- import { LanguageModelV3, ImageModelV3, SpeechModelV3 } from '@ai-sdk/provider';
+ import { LanguageModelV3, ImageModelV3, SpeechModelV3, TranscriptionModelV3, Experimental_VideoModelV3 } from '@ai-sdk/provider';
  import { FetchFunction } from '@ai-sdk/provider-utils';
  export { OpenAICompatibleErrorData as RunpodErrorData } from '@ai-sdk/openai-compatible';
  import { z } from 'zod';
@@ -56,6 +56,22 @@ interface RunpodProvider {
    Creates a speech model for speech generation.
    */
    speech(modelId: string): SpeechModelV3;
+   /**
+   Creates a transcription model for audio transcription.
+   */
+   transcriptionModel(modelId: string): TranscriptionModelV3;
+   /**
+   Creates a transcription model for audio transcription.
+   */
+   transcription(modelId: string): TranscriptionModelV3;
+   /**
+   Creates a video model for video generation.
+   */
+   videoModel(modelId: string): Experimental_VideoModelV3;
+   /**
+   Creates a video model for video generation.
+   */
+   video(modelId: string): Experimental_VideoModelV3;
  }
  declare function createRunpod(options?: RunpodProviderSettings): RunpodProvider;
  declare const runpod: RunpodProvider;
@@ -64,7 +80,101 @@ type RunpodChatModelId = 'qwen/qwen3-32b-awq' | (string & {});
 
  type RunpodCompletionModelId = 'qwen/qwen3-32b-awq' | (string & {});
 
- type RunpodImageModelId = 'qwen/qwen-image' | 'qwen/qwen-image-edit' | 'qwen/qwen-image-edit-2511' | 'bytedance/seedream-3.0' | 'bytedance/seedream-4.0' | 'bytedance/seedream-4.0-edit' | 'black-forest-labs/flux-1-kontext-dev' | 'black-forest-labs/flux-1-schnell' | 'black-forest-labs/flux-1-dev' | 'alibaba/wan-2.6' | 'google/nano-banana-edit' | 'nano-banana-edit';
+ type RunpodImageModelId = 'qwen/qwen-image' | 'qwen/qwen-image-edit' | 'qwen/qwen-image-edit-2511' | 'bytedance/seedream-3.0' | 'bytedance/seedream-4.0' | 'bytedance/seedream-4.0-edit' | 'black-forest-labs/flux-1-kontext-dev' | 'black-forest-labs/flux-1-schnell' | 'black-forest-labs/flux-1-dev' | 'alibaba/wan-2.6' | 'tongyi-mai/z-image-turbo' | 'google/nano-banana-edit' | 'nano-banana-edit';
+
+ type RunpodTranscriptionModelId = 'pruna/whisper-v3-large' | (string & {});
+ interface RunpodTranscriptionProviderOptions {
+   /**
+    * URL to audio file. Use this if you want to pass an audio URL directly
+    * instead of binary audio data.
+    */
+   audio?: string;
+   /**
+    * Optional context prompt to guide the transcription (initial_prompt in Whisper).
+    */
+   prompt?: string;
+   /**
+    * Alias for prompt - the initial prompt for the first window.
+    */
+   initial_prompt?: string;
+   /**
+    * Language of the audio in ISO-639-1 format (e.g., 'en', 'es', 'fr').
+    * If not specified, Whisper will auto-detect the language.
+    */
+   language?: string;
+   /**
+    * Whether to include word-level timestamps in the response.
+    * @default false
+    */
+   word_timestamps?: boolean;
+   /**
+    * Whisper model to use.
+    * Options: 'tiny', 'base', 'small', 'medium', 'large-v1', 'large-v2', 'large-v3', 'turbo'
+    * @default 'base'
+    */
+   model?: string;
+   /**
+    * Output format for transcription.
+    * Options: 'plain_text', 'formatted_text', 'srt', 'vtt'
+    * @default 'plain_text'
+    */
+   transcription?: string;
+   /**
+    * Whether to translate the audio to English.
+    * @default false
+    */
+   translate?: boolean;
+   /**
+    * Whether to enable voice activity detection.
+    * @default false
+    */
+   enable_vad?: boolean;
+   /**
+    * Maximum number of polling attempts before timing out.
+    * @default 120
+    */
+   maxPollAttempts?: number;
+   /**
+    * Interval between polling attempts in milliseconds.
+    * @default 2000
+    */
+   pollIntervalMillis?: number;
+ }
+
+ type RunpodVideoModelId = 'pruna/p-video' | 'vidu/q3-t2v' | 'vidu/q3-i2v' | 'kwaivgi/kling-v2.6-std-motion-control' | 'kwaivgi/kling-video-o1-r2v' | 'kwaivgi/kling-v2.1-i2v-pro' | 'alibaba/wan-2.6-t2v' | 'alibaba/wan-2.6-i2v' | 'alibaba/wan-2.5' | 'alibaba/wan-2.2-t2v-720-lora' | 'alibaba/wan-2.2-i2v-720' | 'alibaba/wan-2.1-i2v-720' | 'bytedance/seedance-v1.5-pro-i2v' | 'openai/sora-2-pro-i2v' | 'openai/sora-2-i2v' | (string & {});
+ interface RunpodVideoProviderOptions {
+   /**
+    * Negative prompt to guide what to avoid in the generated video.
+    */
+   negative_prompt?: string;
+   /**
+    * Style preset for video generation (model-specific).
+    */
+   style?: string;
+   /**
+    * Guidance scale for prompt adherence.
+    */
+   guidance_scale?: number;
+   /**
+    * Number of inference steps.
+    */
+   num_inference_steps?: number;
+   /**
+    * Maximum number of polling attempts before timing out.
+    * @default 120
+    */
+   maxPollAttempts?: number;
+   /**
+    * Interval between polling attempts in milliseconds.
+    * @default 5000
+    */
+   pollIntervalMillis?: number;
+   /**
+    * Additional model-specific parameters are passed through via
+    * index signature.
+    */
+   [key: string]: unknown;
+ }
 
  declare const runpodImageErrorSchema: z.ZodObject<{
    error: z.ZodOptional<z.ZodString>;
@@ -78,4 +188,4 @@ declare const runpodImageErrorSchema: z.ZodObject<{
  }>;
  type RunpodImageErrorData = z.infer<typeof runpodImageErrorSchema>;
 
- export { type RunpodChatModelId, type RunpodCompletionModelId, type RunpodImageErrorData, type RunpodImageModelId, type RunpodProvider, type RunpodProviderSettings, createRunpod, runpod };
+ export { type RunpodChatModelId, type RunpodCompletionModelId, type RunpodImageErrorData, type RunpodImageModelId, type RunpodProvider, type RunpodProviderSettings, type RunpodTranscriptionModelId, type RunpodTranscriptionProviderOptions, type RunpodVideoModelId, type RunpodVideoProviderOptions, createRunpod, runpod };