vargai 0.4.0-alpha61 → 0.4.0-alpha63

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -40,7 +40,9 @@ async function testGrokTextToVideo() {
40
40
 
41
41
  // Save the video
42
42
  const outputPath = join(import.meta.dir, "../output/grok-t2v-test.mp4");
43
- await writeFile(outputPath, result.videos[0]!);
43
+ const firstVideo = result.videos[0];
44
+ if (!firstVideo) throw new Error("No video returned from model");
45
+ await writeFile(outputPath, firstVideo);
44
46
  console.log(`Video saved to: ${outputPath}`);
45
47
 
46
48
  return outputPath;
@@ -88,7 +90,9 @@ async function testGrokImageToVideo() {
88
90
 
89
91
  // Save the video
90
92
  const outputPath = join(import.meta.dir, "../output/grok-i2v-test.mp4");
91
- await writeFile(outputPath, result.videos[0]!);
93
+ const firstVideo = result.videos[0];
94
+ if (!firstVideo) throw new Error("No video returned from model");
95
+ await writeFile(outputPath, firstVideo);
92
96
  console.log(`Video saved to: ${outputPath}`);
93
97
 
94
98
  return outputPath;
@@ -136,7 +140,9 @@ async function testGrokEditVideo() {
136
140
 
137
141
  // Save the video
138
142
  const outputPath = join(import.meta.dir, "../output/grok-edit-test.mp4");
139
- await writeFile(outputPath, result.videos[0]!);
143
+ const firstVideo = result.videos[0];
144
+ if (!firstVideo) throw new Error("No video returned from model");
145
+ await writeFile(outputPath, firstVideo);
140
146
  console.log(`Video saved to: ${outputPath}`);
141
147
 
142
148
  return outputPath;
package/package.json CHANGED
@@ -70,7 +70,7 @@
70
70
  "zod": "^4.2.1"
71
71
  },
72
72
  "sideEffects": false,
73
- "version": "0.4.0-alpha61",
73
+ "version": "0.4.0-alpha63",
74
74
  "exports": {
75
75
  ".": "./src/index.ts",
76
76
  "./ai": "./src/ai-sdk/index.ts",
@@ -269,6 +269,12 @@ function buildBaseClipFilter(
269
269
  const layer = clipLocalOverlays[i];
270
270
  if (!layer) continue;
271
271
 
272
+ if (!baseLabel) {
273
+ throw new Error(
274
+ `Clip ${clipIndex} is missing a base layer for overlay placement — ensure it has at least one visual layer (video, image, or fill-color)`,
275
+ );
276
+ }
277
+
272
278
  const overlayFilter = getVideoFilter(
273
279
  layer,
274
280
  inputIdx,
@@ -283,7 +289,7 @@ function buildBaseClipFilter(
283
289
 
284
290
  const outputLabel = `clip${clipIndex}ov${i}`;
285
291
  const positionFilter = getOverlayFilter(
286
- baseLabel!,
292
+ baseLabel,
287
293
  overlayFilter.outputLabel,
288
294
  layer,
289
295
  width,
@@ -164,6 +164,8 @@ const LIPSYNC_MODELS: Record<string, string> = {
164
164
  "sync-v2": "fal-ai/sync-lipsync",
165
165
  "sync-v2-pro": "fal-ai/sync-lipsync/v2",
166
166
  lipsync: "fal-ai/sync-lipsync",
167
+ "omnihuman-v1.5": "fal-ai/bytedance/omnihuman/v1.5",
168
+ "veed-fabric-1.0": "veed/fabric-1.0",
167
169
  };
168
170
 
169
171
  const IMAGE_MODELS: Record<string, string> = {
@@ -173,9 +175,20 @@ const IMAGE_MODELS: Record<string, string> = {
173
175
  "recraft-v3": "fal-ai/recraft/v3/text-to-image",
174
176
  "nano-banana-pro": "fal-ai/nano-banana-pro",
175
177
  "nano-banana-pro/edit": "fal-ai/nano-banana-pro/edit",
178
+ "nano-banana-2": "fal-ai/nano-banana-2/edit",
179
+ "nano-banana-2/edit": "fal-ai/nano-banana-2/edit",
176
180
  "seedream-v4.5/edit": "fal-ai/bytedance/seedream/v4.5/edit",
181
+ // Qwen Image 2 - text-to-image and image-to-image editing (standard + pro)
182
+ "qwen-image-2": "fal-ai/qwen-image-2/text-to-image",
183
+ "qwen-image-2/edit": "fal-ai/qwen-image-2/edit",
184
+ "qwen-image-2-pro": "fal-ai/qwen-image-2/pro/text-to-image",
185
+ "qwen-image-2-pro/edit": "fal-ai/qwen-image-2/pro/edit",
177
186
  // Qwen Image Edit 2511 Multiple Angles - camera angle adjustment
178
187
  "qwen-angles": "fal-ai/qwen-image-edit-2511-multiple-angles",
188
+ // Recraft V4 Pro - text-to-image
189
+ "recraft-v4-pro": "fal-ai/recraft/v4/pro/text-to-image",
190
+ // Reve - image editing
191
+ "reve/edit": "fal-ai/reve/edit",
179
192
  };
180
193
 
181
194
  // Models that use image_size instead of aspect_ratio
@@ -184,11 +197,19 @@ const IMAGE_SIZE_MODELS = new Set([
184
197
  "flux-dev",
185
198
  "flux-pro",
186
199
  "seedream-v4.5/edit",
200
+ "qwen-image-2",
201
+ "qwen-image-2/edit",
202
+ "qwen-image-2-pro",
203
+ "qwen-image-2-pro/edit",
204
+ "recraft-v4-pro",
187
205
  ]);
188
206
 
189
207
  // Qwen Angles model - image-to-image with camera angle adjustment
190
208
  const QWEN_ANGLES_MODEL = "qwen-angles";
191
209
 
210
+ // Models that use singular image_url instead of image_urls array
211
+ const SINGULAR_IMAGE_URL_MODELS = new Set(["reve/edit"]);
212
+
192
213
  // Map aspect ratio to image_size for Qwen Angles (base dimension 1024)
193
214
  const ASPECT_RATIO_TO_QWEN_SIZE: Record<
194
215
  string,
@@ -474,20 +495,30 @@ class FalVideoModel implements VideoModelV3 {
474
495
  };
475
496
 
476
497
  if (isLipsync) {
477
- // Lipsync: video + audio input
498
+ // Lipsync: either (video + audio) or (image + audio), depending on model
478
499
  const videoFile = files?.find((f) =>
479
500
  getMediaType(f)?.startsWith("video/"),
480
501
  );
502
+ const imageFile = files?.find((f) =>
503
+ getMediaType(f)?.startsWith("image/"),
504
+ );
481
505
  const audioFile = files?.find((f) =>
482
506
  getMediaType(f)?.startsWith("audio/"),
483
507
  );
484
508
 
485
509
  if (videoFile) {
486
510
  input.video_url = await fileToUrl(videoFile);
511
+ } else if (imageFile) {
512
+ input.image_url = await fileToUrl(imageFile);
487
513
  }
488
514
  if (audioFile) {
489
515
  input.audio_url = await fileToUrl(audioFile);
490
516
  }
517
+
518
+ // OmniHuman supports an optional prompt
519
+ if (prompt && this.modelId === "omnihuman-v1.5") {
520
+ input.prompt = prompt;
521
+ }
491
522
  } else if (isMotionControl) {
492
523
  // Motion control: image + reference video input
493
524
  if (prompt) {
@@ -836,7 +867,13 @@ class FalImageModel implements ImageModelV3 {
836
867
  modelId: this.modelId,
837
868
  fileHashes,
838
869
  });
839
- input.image_urls = await pMap(files, fileToUrl, { concurrency: 2 });
870
+ const imageUrls = await pMap(files, fileToUrl, { concurrency: 2 });
871
+ // Reve uses singular image_url instead of image_urls array
872
+ if (SINGULAR_IMAGE_URL_MODELS.has(this.modelId)) {
873
+ input.image_url = imageUrls[0];
874
+ } else {
875
+ input.image_urls = imageUrls;
876
+ }
840
877
  }
841
878
 
842
879
  if (isQwenAngles && !input.image_urls) {
@@ -15,6 +15,11 @@ import { ffmpegProvider } from "../../providers/ffmpeg";
15
15
 
16
16
  // Input schema with Zod
17
17
  const syncInputSchema = z.object({
18
+ model: z
19
+ .enum(["wan-25", "omnihuman-v1.5", "veed-fabric-1.0"])
20
+ .optional()
21
+ .default("wan-25")
22
+ .describe("Lip sync / avatar backend model"),
18
23
  image: filePathSchema.describe("Input image"),
19
24
  audio: filePathSchema.describe("Audio file"),
20
25
  prompt: z.string().describe("Description of the scene"),
@@ -40,13 +45,14 @@ export const definition: ActionDefinition<typeof schema> = {
40
45
  schema,
41
46
  routes: [],
42
47
  execute: async (inputs) => {
43
- const { image, audio, prompt, duration, resolution } = inputs;
44
- return lipsync({ image, audio, prompt, duration, resolution });
48
+ const { model, image, audio, prompt, duration, resolution } = inputs;
49
+ return lipsync({ model, image, audio, prompt, duration, resolution });
45
50
  },
46
51
  };
47
52
 
48
53
  // Types
49
54
  export interface LipsyncOptions {
55
+ model?: "wan-25" | "omnihuman-v1.5" | "veed-fabric-1.0";
50
56
  image: string;
51
57
  audio: string;
52
58
  prompt: string;
@@ -65,20 +71,56 @@ export interface Wav2LipOptions {
65
71
  }
66
72
 
67
73
  /**
68
- * Generate lip-synced video using Wan-25
74
+ * Generate lip-synced / avatar video using selected backend.
69
75
  */
70
76
  export async function lipsync(options: LipsyncOptions): Promise<LipsyncResult> {
71
- const { image, audio, prompt, duration = "5", resolution = "480p" } = options;
77
+ const {
78
+ model = "wan-25",
79
+ image,
80
+ audio,
81
+ prompt,
82
+ duration = "5",
83
+ resolution = "480p",
84
+ } = options;
72
85
 
73
- console.log("[sync] generating lip-synced video with wan-25...");
86
+ console.log(`[sync] generating lip-synced video with ${model}...`);
74
87
 
75
- const result = await falProvider.wan25({
76
- imageUrl: image,
77
- audioUrl: audio,
78
- prompt,
79
- duration,
80
- resolution,
81
- });
88
+ if (model === "omnihuman-v1.5" && resolution === "480p") {
89
+ console.warn(
90
+ "[sync] omnihuman-v1.5 does not support 480p; using 720p instead",
91
+ );
92
+ }
93
+ if (model === "veed-fabric-1.0" && resolution === "1080p") {
94
+ console.warn(
95
+ "[sync] veed-fabric-1.0 does not support 1080p; using 720p instead",
96
+ );
97
+ }
98
+
99
+ const result =
100
+ model === "omnihuman-v1.5"
101
+ ? await falProvider.omnihuman15({
102
+ imageUrl: image,
103
+ audioUrl: audio,
104
+ prompt,
105
+ resolution: (resolution === "480p" ? "720p" : resolution) as
106
+ | "720p"
107
+ | "1080p",
108
+ })
109
+ : model === "veed-fabric-1.0"
110
+ ? await falProvider.veedFabric10({
111
+ imageUrl: image,
112
+ audioUrl: audio,
113
+ resolution: (resolution === "1080p" ? "720p" : resolution) as
114
+ | "480p"
115
+ | "720p",
116
+ })
117
+ : await falProvider.wan25({
118
+ imageUrl: image,
119
+ audioUrl: audio,
120
+ prompt,
121
+ duration,
122
+ resolution,
123
+ });
82
124
 
83
125
  const videoUrl = result.data?.video?.url;
84
126
  if (!videoUrl) {
@@ -6,9 +6,15 @@ export { definition as elevenlabsTts } from "./elevenlabs";
6
6
  export { definition as flux } from "./flux";
7
7
  export { definition as kling } from "./kling";
8
8
  export { definition as llama } from "./llama";
9
+ export { definition as nanoBanana2 } from "./nano-banana-2";
9
10
  export { definition as nanoBananaPro } from "./nano-banana-pro";
11
+ export { definition as omnihuman } from "./omnihuman";
12
+ export { definition as qwenImage2 } from "./qwen-image-2";
13
+ export { definition as recraftV4 } from "./recraft-v4";
14
+ export { definition as reve } from "./reve";
10
15
  export { definition as sonauto } from "./sonauto";
11
16
  export { definition as soul } from "./soul";
17
+ export { definition as veedFabric } from "./veed-fabric";
12
18
  export { definition as wan } from "./wan";
13
19
  export { definition as whisper } from "./whisper";
14
20
 
@@ -17,9 +23,15 @@ import { definition as elevenlabsDefinition } from "./elevenlabs";
17
23
  import { definition as fluxDefinition } from "./flux";
18
24
  import { definition as klingDefinition } from "./kling";
19
25
  import { definition as llamaDefinition } from "./llama";
26
+ import { definition as nanoBanana2Definition } from "./nano-banana-2";
20
27
  import { definition as nanoBananaProDefinition } from "./nano-banana-pro";
28
+ import { definition as omnihumanDefinition } from "./omnihuman";
29
+ import { definition as qwenImage2Definition } from "./qwen-image-2";
30
+ import { definition as recraftV4Definition } from "./recraft-v4";
31
+ import { definition as reveDefinition } from "./reve";
21
32
  import { definition as sonautoDefinition } from "./sonauto";
22
33
  import { definition as soulDefinition } from "./soul";
34
+ import { definition as veedFabricDefinition } from "./veed-fabric";
23
35
  import { definition as wanDefinition } from "./wan";
24
36
  import { definition as whisperDefinition } from "./whisper";
25
37
 
@@ -27,7 +39,13 @@ export const allModels = [
27
39
  klingDefinition,
28
40
  fluxDefinition,
29
41
  nanoBananaProDefinition,
42
+ nanoBanana2Definition,
43
+ qwenImage2Definition,
44
+ recraftV4Definition,
45
+ reveDefinition,
30
46
  wanDefinition,
47
+ omnihumanDefinition,
48
+ veedFabricDefinition,
31
49
  whisperDefinition,
32
50
  elevenlabsDefinition,
33
51
  soulDefinition,
@@ -0,0 +1,115 @@
1
+ /**
2
+ * Nano Banana 2 image editing model (Google's next-gen image generation/editing)
3
+ * Edit-only model requiring image_urls input
4
+ */
5
+
6
+ import { z } from "zod";
7
+ import type { ModelDefinition, ZodSchema } from "../../core/schema/types";
8
+
9
+ // Nano Banana 2 resolution options (includes 0.5K unlike nano-banana-pro)
10
+ const nanoBanana2ResolutionSchema = z.enum(["0.5K", "1K", "2K", "4K"]);
11
+
12
+ // Nano Banana 2 aspect ratio options (supports "auto" unlike nano-banana-pro)
13
+ const nanoBanana2AspectRatioSchema = z.enum([
14
+ "auto",
15
+ "21:9",
16
+ "16:9",
17
+ "3:2",
18
+ "4:3",
19
+ "5:4",
20
+ "1:1",
21
+ "4:5",
22
+ "3:4",
23
+ "2:3",
24
+ "9:16",
25
+ ]);
26
+
27
+ // Output format options
28
+ const nanoBanana2OutputFormatSchema = z.enum(["png", "jpeg", "webp"]);
29
+
30
+ // Safety tolerance level (string enum "1"-"6", unlike nano-banana-pro's semantic filter)
31
+ const nanoBanana2SafetyToleranceSchema = z.enum(["1", "2", "3", "4", "5", "6"]);
32
+
33
+ // Input schema with Zod
34
+ const nanoBanana2InputSchema = z.object({
35
+ prompt: z.string().describe("Text description for image editing"),
36
+ image_urls: z
37
+ .array(z.string().url())
38
+ .describe(
39
+ "Input image URLs for image-to-image editing. Required for this model.",
40
+ ),
41
+ resolution: nanoBanana2ResolutionSchema
42
+ .default("1K")
43
+ .describe(
44
+ "Output resolution: 0.5K (512px), 1K (1024px), 2K (2048px), or 4K",
45
+ ),
46
+ aspect_ratio: nanoBanana2AspectRatioSchema
47
+ .default("auto")
48
+ .describe("Output aspect ratio. 'auto' preserves input aspect ratio."),
49
+ output_format: nanoBanana2OutputFormatSchema
50
+ .default("png")
51
+ .describe("Output image format"),
52
+ safety_tolerance: nanoBanana2SafetyToleranceSchema
53
+ .default("4")
54
+ .describe("Safety tolerance level: 1 (most strict) to 6 (least strict)"),
55
+ num_images: z
56
+ .number()
57
+ .int()
58
+ .min(1)
59
+ .max(4)
60
+ .default(1)
61
+ .describe("Number of images to generate (1-4)"),
62
+ seed: z
63
+ .number()
64
+ .int()
65
+ .optional()
66
+ .describe("Seed for the random number generator"),
67
+ limit_generations: z
68
+ .boolean()
69
+ .default(true)
70
+ .describe(
71
+ "Limit generations from each round of prompting to 1. May affect quality.",
72
+ ),
73
+ enable_web_search: z
74
+ .boolean()
75
+ .default(false)
76
+ .describe(
77
+ "Enable web search to use latest information for image generation",
78
+ ),
79
+ });
80
+
81
+ // Output schema with Zod
82
+ const nanoBanana2OutputSchema = z.object({
83
+ images: z.array(
84
+ z.object({
85
+ url: z.string(),
86
+ file_name: z.string().optional(),
87
+ content_type: z.string().optional(),
88
+ }),
89
+ ),
90
+ description: z.string().optional(),
91
+ });
92
+
93
+ // Schema object for the definition
94
+ const schema: ZodSchema<
95
+ typeof nanoBanana2InputSchema,
96
+ typeof nanoBanana2OutputSchema
97
+ > = {
98
+ input: nanoBanana2InputSchema,
99
+ output: nanoBanana2OutputSchema,
100
+ };
101
+
102
+ export const definition: ModelDefinition<typeof schema> = {
103
+ type: "model",
104
+ name: "nano-banana-2",
105
+ description:
106
+ "Google Nano Banana 2 - next-gen image editing model. Requires image_urls for all operations.",
107
+ providers: ["fal"],
108
+ defaultProvider: "fal",
109
+ providerModels: {
110
+ fal: "fal-ai/nano-banana-2/edit",
111
+ },
112
+ schema,
113
+ };
114
+
115
+ export default definition;
@@ -0,0 +1,71 @@
1
+ /**
2
+ * Bytedance OmniHuman v1.5
3
+ * Image + audio -> video (full-body human animation)
4
+ */
5
+
6
+ import { z } from "zod";
7
+ import { urlSchema } from "../../core/schema/shared";
8
+ import type { ModelDefinition, ZodSchema } from "../../core/schema/types";
9
+
10
+ const omnihumanResolutionSchema = z
11
+ .enum(["720p", "1080p"])
12
+ .describe("Output resolution");
13
+
14
+ // Input schema with Zod
15
+ const omnihumanInputSchema = z.object({
16
+ prompt: z
17
+ .string()
18
+ .optional()
19
+ .describe("The text prompt used to guide the video generation"),
20
+ image_url: urlSchema.describe(
21
+ "The URL of the image used to generate the video",
22
+ ),
23
+ audio_url: urlSchema.describe(
24
+ "The URL of the audio file to generate the video",
25
+ ),
26
+ turbo_mode: z
27
+ .boolean()
28
+ .optional()
29
+ .default(false)
30
+ .describe("Faster generation with slight quality trade-off"),
31
+ resolution: omnihumanResolutionSchema
32
+ .optional()
33
+ .default("1080p")
34
+ .describe(
35
+ "The resolution of the generated video. 720p generation is faster and higher in quality",
36
+ ),
37
+ });
38
+
39
+ // Output schema with Zod
40
+ const omnihumanOutputSchema = z.object({
41
+ video: z.object({
42
+ url: z.string(),
43
+ }),
44
+ duration: z
45
+ .number()
46
+ .optional()
47
+ .describe("Duration of audio input/video output as used for billing"),
48
+ });
49
+
50
+ const schema: ZodSchema<
51
+ typeof omnihumanInputSchema,
52
+ typeof omnihumanOutputSchema
53
+ > = {
54
+ input: omnihumanInputSchema,
55
+ output: omnihumanOutputSchema,
56
+ };
57
+
58
+ export const definition: ModelDefinition<typeof schema> = {
59
+ type: "model",
60
+ name: "omnihuman",
61
+ description:
62
+ "OmniHuman v1.5 - generate a vivid talking video from an image and an audio file",
63
+ providers: ["fal"],
64
+ defaultProvider: "fal",
65
+ providerModels: {
66
+ fal: "fal-ai/bytedance/omnihuman/v1.5",
67
+ },
68
+ schema,
69
+ };
70
+
71
+ export default definition;
@@ -0,0 +1,113 @@
1
+ /**
2
+ * Qwen Image 2 generation and editing model
3
+ * Next-generation unified generation-and-editing model from Alibaba
4
+ * Supports both text-to-image and image-to-image editing
5
+ * Available in standard and pro tiers
6
+ */
7
+
8
+ import { z } from "zod";
9
+ import type { ModelDefinition, ZodSchema } from "../../core/schema/types";
10
+
11
+ // Image size can be an enum string or an object with width/height
12
+ const qwenImage2ImageSizeSchema = z.union([
13
+ z.enum([
14
+ "square_hd",
15
+ "square",
16
+ "landscape_4_3",
17
+ "landscape_16_9",
18
+ "portrait_4_3",
19
+ "portrait_16_9",
20
+ ]),
21
+ z.object({
22
+ width: z.number().int().min(512).max(2048),
23
+ height: z.number().int().min(512).max(2048),
24
+ }),
25
+ ]);
26
+
27
+ // Output format options
28
+ const qwenImage2OutputFormatSchema = z.enum(["png", "jpeg", "webp"]);
29
+
30
+ // Input schema with Zod
31
+ const qwenImage2InputSchema = z.object({
32
+ prompt: z
33
+ .string()
34
+ .describe(
35
+ "Text description for generation or editing. Supports Chinese and English.",
36
+ ),
37
+ negative_prompt: z
38
+ .string()
39
+ .default("")
40
+ .describe("Content to avoid in the generated image. Max 500 characters."),
41
+ image_size: qwenImage2ImageSizeSchema
42
+ .optional()
43
+ .describe(
44
+ "Output image size. Can be an enum (e.g. 'square_hd') or {width, height} object. Pixels must be between 512x512 and 2048x2048.",
45
+ ),
46
+ image_urls: z
47
+ .array(z.string().url())
48
+ .optional()
49
+ .describe(
50
+ "Reference images for editing (1-6 images). Order matters: reference as 'image 1', 'image 2' in prompt. Required for /edit endpoints.",
51
+ ),
52
+ enable_prompt_expansion: z
53
+ .boolean()
54
+ .default(true)
55
+ .describe("Enable LLM prompt optimization for better results"),
56
+ seed: z
57
+ .number()
58
+ .int()
59
+ .min(0)
60
+ .max(2147483647)
61
+ .optional()
62
+ .describe("Random seed for reproducibility"),
63
+ enable_safety_checker: z
64
+ .boolean()
65
+ .default(true)
66
+ .describe("Enable content moderation for input and output"),
67
+ num_images: z
68
+ .number()
69
+ .int()
70
+ .min(1)
71
+ .max(6)
72
+ .default(1)
73
+ .describe("Number of images to generate (1-4 for t2i, 1-6 for edit)"),
74
+ output_format: qwenImage2OutputFormatSchema
75
+ .default("png")
76
+ .describe("Output image format"),
77
+ });
78
+
79
+ // Output schema with Zod
80
+ const qwenImage2OutputSchema = z.object({
81
+ images: z.array(
82
+ z.object({
83
+ url: z.string(),
84
+ file_name: z.string().optional(),
85
+ content_type: z.string().optional(),
86
+ }),
87
+ ),
88
+ seed: z.number().int().optional(),
89
+ });
90
+
91
+ // Schema object for the definition
92
+ const schema: ZodSchema<
93
+ typeof qwenImage2InputSchema,
94
+ typeof qwenImage2OutputSchema
95
+ > = {
96
+ input: qwenImage2InputSchema,
97
+ output: qwenImage2OutputSchema,
98
+ };
99
+
100
+ export const definition: ModelDefinition<typeof schema> = {
101
+ type: "model",
102
+ name: "qwen-image-2",
103
+ description:
104
+ "Qwen Image 2.0 - next-gen unified generation-and-editing model. Supports text-to-image and image-to-image editing in standard and pro tiers.",
105
+ providers: ["fal"],
106
+ defaultProvider: "fal",
107
+ providerModels: {
108
+ fal: "fal-ai/qwen-image-2/text-to-image",
109
+ },
110
+ schema,
111
+ };
112
+
113
+ export default definition;
@@ -0,0 +1,94 @@
1
+ /**
2
+ * Recraft V4 Pro image generation model
3
+ * Built for brand systems and production-ready workflows
4
+ * Text-to-image only
5
+ */
6
+
7
+ import { z } from "zod";
8
+ import type { ModelDefinition, ZodSchema } from "../../core/schema/types";
9
+
10
+ // Image size can be an enum string or an object with width/height
11
+ const recraftV4ImageSizeSchema = z.union([
12
+ z.enum([
13
+ "square_hd",
14
+ "square",
15
+ "landscape_4_3",
16
+ "landscape_16_9",
17
+ "portrait_4_3",
18
+ "portrait_16_9",
19
+ ]),
20
+ z.object({
21
+ width: z.number().int(),
22
+ height: z.number().int(),
23
+ }),
24
+ ]);
25
+
26
+ // RGB color schema
27
+ const rgbColorSchema = z.object({
28
+ r: z.number().int().min(0).max(255),
29
+ g: z.number().int().min(0).max(255),
30
+ b: z.number().int().min(0).max(255),
31
+ });
32
+
33
+ // Output format - Recraft V4 outputs webp by default
34
+ const recraftV4OutputFormatSchema = z.enum(["png", "jpeg", "webp"]);
35
+
36
+ // Input schema with Zod
37
+ const recraftV4InputSchema = z.object({
38
+ prompt: z.string().describe("Text description for image generation"),
39
+ image_size: recraftV4ImageSizeSchema
40
+ .default("square_hd")
41
+ .describe(
42
+ "Output image size. Can be an enum (e.g. 'landscape_16_9') or {width, height} object.",
43
+ ),
44
+ colors: z
45
+ .array(rgbColorSchema)
46
+ .default([])
47
+ .describe("Array of preferable RGB colors for the generated image"),
48
+ background_color: rgbColorSchema
49
+ .optional()
50
+ .describe("Preferable background color of the generated image"),
51
+ enable_safety_checker: z
52
+ .boolean()
53
+ .default(true)
54
+ .describe("Enable content safety checker"),
55
+ output_format: recraftV4OutputFormatSchema
56
+ .optional()
57
+ .describe("Output image format"),
58
+ });
59
+
60
+ // Output schema with Zod
61
+ const recraftV4OutputSchema = z.object({
62
+ images: z.array(
63
+ z.object({
64
+ url: z.string(),
65
+ file_name: z.string().optional(),
66
+ file_size: z.number().optional(),
67
+ content_type: z.string().optional(),
68
+ }),
69
+ ),
70
+ });
71
+
72
+ // Schema object for the definition
73
+ const schema: ZodSchema<
74
+ typeof recraftV4InputSchema,
75
+ typeof recraftV4OutputSchema
76
+ > = {
77
+ input: recraftV4InputSchema,
78
+ output: recraftV4OutputSchema,
79
+ };
80
+
81
+ export const definition: ModelDefinition<typeof schema> = {
82
+ type: "model",
83
+ name: "recraft-v4-pro",
84
+ description:
85
+ "Recraft V4 Pro - professional text-to-image model built for brand systems and production-ready workflows. Strong composition, refined lighting, realistic materials.",
86
+ providers: ["fal"],
87
+ defaultProvider: "fal",
88
+ providerModels: {
89
+ fal: "fal-ai/recraft/v4/pro/text-to-image",
90
+ },
91
+ schema,
92
+ };
93
+
94
+ export default definition;
@@ -0,0 +1,66 @@
1
+ /**
2
+ * Reve image editing model
3
+ * Upload an existing image and transform it via a text prompt
4
+ * Edit-only model using singular image_url (not image_urls array)
5
+ */
6
+
7
+ import { z } from "zod";
8
+ import type { ModelDefinition, ZodSchema } from "../../core/schema/types";
9
+
10
+ // Output format options
11
+ const reveOutputFormatSchema = z.enum(["png", "jpeg", "webp"]);
12
+
13
+ // Input schema with Zod
14
+ const reveInputSchema = z.object({
15
+ prompt: z
16
+ .string()
17
+ .describe("Text description of how to edit the provided image"),
18
+ image_url: z
19
+ .string()
20
+ .url()
21
+ .describe(
22
+ "URL of the reference image to edit. Supports PNG, JPEG, WebP, AVIF, and HEIF formats.",
23
+ ),
24
+ num_images: z
25
+ .number()
26
+ .int()
27
+ .min(1)
28
+ .max(4)
29
+ .default(1)
30
+ .describe("Number of images to generate (1-4)"),
31
+ output_format: reveOutputFormatSchema
32
+ .default("png")
33
+ .describe("Output image format"),
34
+ });
35
+
36
+ // Output schema with Zod
37
+ const reveOutputSchema = z.object({
38
+ images: z.array(
39
+ z.object({
40
+ url: z.string(),
41
+ file_name: z.string().optional(),
42
+ content_type: z.string().optional(),
43
+ }),
44
+ ),
45
+ });
46
+
47
+ // Schema object for the definition
48
+ const schema: ZodSchema<typeof reveInputSchema, typeof reveOutputSchema> = {
49
+ input: reveInputSchema,
50
+ output: reveOutputSchema,
51
+ };
52
+
53
+ export const definition: ModelDefinition<typeof schema> = {
54
+ type: "model",
55
+ name: "reve",
56
+ description:
57
+ "Reve edit model - upload an existing image and transform it via a text prompt. Uses singular image_url input.",
58
+ providers: ["fal"],
59
+ defaultProvider: "fal",
60
+ providerModels: {
61
+ fal: "fal-ai/reve/edit",
62
+ },
63
+ schema,
64
+ };
65
+
66
+ export default definition;
@@ -0,0 +1,49 @@
1
+ /**
2
+ * VEED Fabric 1.0
3
+ * Image + audio -> talking video
4
+ */
5
+
6
+ import { z } from "zod";
7
+ import { urlSchema } from "../../core/schema/shared";
8
+ import type { ModelDefinition, ZodSchema } from "../../core/schema/types";
9
+
10
+ const fabricResolutionSchema = z
11
+ .enum(["480p", "720p"])
12
+ .describe("Output resolution");
13
+
14
+ // Input schema with Zod
15
+ const veedFabricInputSchema = z.object({
16
+ image_url: urlSchema.describe("Input image URL"),
17
+ audio_url: urlSchema.describe("Input audio URL"),
18
+ resolution: fabricResolutionSchema.describe("Output resolution"),
19
+ });
20
+
21
+ // Output schema with Zod
22
+ const veedFabricOutputSchema = z.object({
23
+ video: z.object({
24
+ content_type: z.string().optional(),
25
+ url: z.string().url(),
26
+ }),
27
+ });
28
+
29
+ const schema: ZodSchema<
30
+ typeof veedFabricInputSchema,
31
+ typeof veedFabricOutputSchema
32
+ > = {
33
+ input: veedFabricInputSchema,
34
+ output: veedFabricOutputSchema,
35
+ };
36
+
37
+ export const definition: ModelDefinition<typeof schema> = {
38
+ type: "model",
39
+ name: "veed-fabric",
40
+ description: "VEED Fabric 1.0 - turn an image into a talking video",
41
+ providers: ["fal"],
42
+ defaultProvider: "fal",
43
+ providerModels: {
44
+ fal: "veed/fabric-1.0",
45
+ },
46
+ schema,
47
+ };
48
+
49
+ export default definition;
@@ -54,6 +54,23 @@ export class FalProvider extends BaseProvider {
54
54
  return "fal-ai/nano-banana-pro/edit";
55
55
  }
56
56
  }
57
+ // Nano Banana 2: always route to /edit endpoint (edit-only model)
58
+ if (model === "fal-ai/nano-banana-2") {
59
+ return "fal-ai/nano-banana-2/edit";
60
+ }
61
+ // Qwen Image 2: route to /edit endpoint when image_urls are provided
62
+ if (model === "fal-ai/qwen-image-2/text-to-image") {
63
+ const imageUrls = inputs.image_urls as string[] | undefined;
64
+ if (imageUrls && imageUrls.length > 0) {
65
+ return "fal-ai/qwen-image-2/edit";
66
+ }
67
+ }
68
+ if (model === "fal-ai/qwen-image-2/pro/text-to-image") {
69
+ const imageUrls = inputs.image_urls as string[] | undefined;
70
+ if (imageUrls && imageUrls.length > 0) {
71
+ return "fal-ai/qwen-image-2/pro/edit";
72
+ }
73
+ }
57
74
  return model;
58
75
  }
59
76
 
@@ -332,6 +349,86 @@ export class FalProvider extends BaseProvider {
332
349
  return result;
333
350
  }
334
351
 
352
+ async omnihuman15(args: {
353
+ imageUrl: string;
354
+ audioUrl: string;
355
+ prompt?: string;
356
+ turboMode?: boolean;
357
+ resolution?: "720p" | "1080p";
358
+ }) {
359
+ const modelId: string = "fal-ai/bytedance/omnihuman/v1.5";
360
+
361
+ console.log(`[fal] starting omnihuman v1.5: ${modelId}`);
362
+
363
+ const imageUrl = await ensureUrl(args.imageUrl, (buffer) =>
364
+ this.uploadFile(buffer),
365
+ );
366
+ const audioUrl = await ensureUrl(args.audioUrl, (buffer) =>
367
+ this.uploadFile(buffer),
368
+ );
369
+
370
+ const input: Record<string, unknown> = {
371
+ ...(args.prompt ? { prompt: args.prompt } : {}),
372
+ image_url: imageUrl,
373
+ audio_url: audioUrl,
374
+ turbo_mode: args.turboMode ?? false,
375
+ resolution: args.resolution ?? "1080p",
376
+ };
377
+
378
+ const result = await fal.subscribe(modelId, {
379
+ input,
380
+ logs: true,
381
+ onQueueUpdate: (update) => {
382
+ if (update.status === "IN_PROGRESS") {
383
+ console.log(
384
+ `[fal] ${update.logs?.map((l) => l.message).join(" ") || "processing..."}`,
385
+ );
386
+ }
387
+ },
388
+ });
389
+
390
+ console.log("[fal] completed!");
391
+ return result;
392
+ }
393
+
394
+ async veedFabric10(args: {
395
+ imageUrl: string;
396
+ audioUrl: string;
397
+ resolution: "480p" | "720p";
398
+ }) {
399
+ const modelId: string = "veed/fabric-1.0";
400
+
401
+ console.log(`[fal] starting veed fabric 1.0: ${modelId}`);
402
+
403
+ const imageUrl = await ensureUrl(args.imageUrl, (buffer) =>
404
+ this.uploadFile(buffer),
405
+ );
406
+ const audioUrl = await ensureUrl(args.audioUrl, (buffer) =>
407
+ this.uploadFile(buffer),
408
+ );
409
+
410
+ const input: Record<string, unknown> = {
411
+ image_url: imageUrl,
412
+ audio_url: audioUrl,
413
+ resolution: args.resolution,
414
+ };
415
+
416
+ const result = await fal.subscribe(modelId, {
417
+ input,
418
+ logs: true,
419
+ onQueueUpdate: (update) => {
420
+ if (update.status === "IN_PROGRESS") {
421
+ console.log(
422
+ `[fal] ${update.logs?.map((l) => l.message).join(" ") || "processing..."}`,
423
+ );
424
+ }
425
+ },
426
+ });
427
+
428
+ console.log("[fal] completed!");
429
+ return result;
430
+ }
431
+
335
432
  async textToMusic(args: {
336
433
  prompt?: string;
337
434
  tags?: string[];
@@ -584,5 +681,10 @@ export const imageToImage = (
584
681
  ) => falProvider.imageToImage(args);
585
682
  export const wan25 = (args: Parameters<FalProvider["wan25"]>[0]) =>
586
683
  falProvider.wan25(args);
684
+ export const omnihuman15 = (args: Parameters<FalProvider["omnihuman15"]>[0]) =>
685
+ falProvider.omnihuman15(args);
686
+ export const veedFabric10 = (
687
+ args: Parameters<FalProvider["veedFabric10"]>[0],
688
+ ) => falProvider.veedFabric10(args);
587
689
  export const textToMusic = (args: Parameters<FalProvider["textToMusic"]>[0]) =>
588
690
  falProvider.textToMusic(args);
@@ -0,0 +1,75 @@
1
+ /**
2
+ * Longer talking head demo (VEED Fabric 1.0):
3
+ * - character image from nano-banana-pro
4
+ * - voice from ElevenLabs
5
+ * - talking video from veed/fabric-1.0 (image + audio)
6
+ *
7
+ * Run: bun run src/react/examples/veed-fabric-long-talking-head.tsx
8
+ * Output: output/veed-fabric-long-talking-head.mp4
9
+ */
10
+
11
+ import { elevenlabs, fal } from "../../ai-sdk";
12
+ import { Clip, Image, Render, render, Speech, Video } from "..";
13
+
14
+ const SCRIPT =
15
+ "Hey, I am Nova. In this quick demo, you will hear a clean voiceover, and see a talking avatar generated from a single portrait. We are using VEED Fabric for image-to-video lipsync, and ElevenLabs for the voice.";
16
+
17
+ const portrait = Image({
18
+ prompt:
19
+ "Ultra-realistic studio portrait of Nova, a confident friendly product designer in her early 30s, warm smile, expressive eyes, subtle freckles, natural makeup, shoulder-length dark auburn hair, modern minimal wardrobe, cinematic softbox lighting, shallow depth of field, clean neutral background, high-end camera look",
20
+ model: fal.imageModel("nano-banana-pro"),
21
+ aspectRatio: "9:16",
22
+ });
23
+
24
+ const voiceover = Speech({
25
+ model: elevenlabs.speechModel("eleven_v3"),
26
+ voice: "adam",
27
+ children: SCRIPT,
28
+ });
29
+
30
+ const talking = Video({
31
+ model: fal.videoModel("veed-fabric-1.0"),
32
+ keepAudio: true,
33
+ prompt: {
34
+ images: [portrait],
35
+ audio: voiceover,
36
+ },
37
+ providerOptions: {
38
+ fal: {
39
+ resolution: "720p",
40
+ },
41
+ },
42
+ });
43
+
44
+ const demo = (
45
+ <Render width={1080} height={1920}>
46
+ <Clip duration="auto">{talking}</Clip>
47
+ </Render>
48
+ );
49
+
50
+ async function main() {
51
+ if (!process.env.FAL_API_KEY && !process.env.FAL_KEY) {
52
+ console.error("ERROR: FAL_API_KEY/FAL_KEY not found in environment");
53
+ process.exit(1);
54
+ }
55
+ if (!process.env.ELEVENLABS_API_KEY) {
56
+ console.error("ERROR: ELEVENLABS_API_KEY not found in environment");
57
+ process.exit(1);
58
+ }
59
+
60
+ const result = await render(demo, {
61
+ output: "output/veed-fabric-long-talking-head.mp4",
62
+ cache: ".cache/ai-veed-fabric-long-talking-head",
63
+ });
64
+
65
+ console.log(
66
+ `ok: output/veed-fabric-long-talking-head.mp4 (${(result.video.byteLength / 1024 / 1024).toFixed(2)} MB)`,
67
+ );
68
+ }
69
+
70
+ if (import.meta.main) {
71
+ main().catch((err) => {
72
+ console.error(err);
73
+ process.exit(1);
74
+ });
75
+ }
@@ -0,0 +1,60 @@
1
+ /**
2
+ * VEED Fabric 1.0 React syntax test
3
+ *
4
+ * Uses a local image + local audio file to generate a talking video.
5
+ *
6
+ * Run: bun run src/react/examples/veed-fabric-react-test.tsx
7
+ * Output: output/veed-fabric-react-test.mp4
8
+ */
9
+
10
+ import { fal } from "../../ai-sdk/providers/fal";
11
+ import { Clip, Render, render, Video } from "..";
12
+
13
+ const IMAGE_PATH = "output/garry-tan-image.png";
14
+ const AUDIO_PATH = "output/garry-tan-voice.mp3";
15
+
16
+ const RESOLUTION =
17
+ (process.env.FABRIC_RESOLUTION as "480p" | "720p" | undefined) ?? "720p";
18
+
19
+ const video = (
20
+ <Render width={720} height={1280}>
21
+ <Clip duration={5}>
22
+ <Video
23
+ model={fal.videoModel("veed-fabric-1.0")}
24
+ keepAudio
25
+ prompt={{
26
+ images: [IMAGE_PATH],
27
+ audio: AUDIO_PATH,
28
+ }}
29
+ providerOptions={{
30
+ fal: {
31
+ resolution: RESOLUTION,
32
+ },
33
+ }}
34
+ />
35
+ </Clip>
36
+ </Render>
37
+ );
38
+
39
+ async function main() {
40
+ if (!process.env.FAL_API_KEY && !process.env.FAL_KEY) {
41
+ console.error("ERROR: FAL_API_KEY/FAL_KEY not found in environment");
42
+ process.exit(1);
43
+ }
44
+
45
+ const result = await render(video, {
46
+ output: `output/veed-fabric-react-test-${RESOLUTION}.mp4`,
47
+ cache: `.cache/ai-veed-fabric-${RESOLUTION}-keepaudio`,
48
+ });
49
+
50
+ console.log(
51
+ `ok: output/veed-fabric-react-test-${RESOLUTION}.mp4 (${(result.video.byteLength / 1024 / 1024).toFixed(2)} MB)`,
52
+ );
53
+ }
54
+
55
+ if (import.meta.main) {
56
+ main().catch((err) => {
57
+ console.error(err);
58
+ process.exit(1);
59
+ });
60
+ }
@@ -11,11 +11,13 @@ import type {
11
11
  PositionObject,
12
12
  SizeValue,
13
13
  TitleLayer,
14
+ VideoLayer,
14
15
  } from "../../ai-sdk/providers/editly/types";
15
16
  import type { PackshotProps, VargElement } from "../types";
16
17
  import type { RenderContext } from "./context";
17
18
  import { renderImage } from "./image";
18
19
  import { createBlinkingButton } from "./packshot/blinking-button";
20
+ import { renderVideo } from "./video";
19
21
 
20
22
  /**
21
23
  * Resolve an FFmpegOutput to a string path/URL via the backend.
@@ -118,8 +120,23 @@ export async function renderPackshot(
118
120
  type: "fill-color" as const,
119
121
  color: props.background,
120
122
  });
123
+ } else if (props.background.type === "video") {
124
+ const bgFile = await renderVideo(
125
+ props.background as VargElement<"video">,
126
+ ctx,
127
+ );
128
+ const bgPath = await ctx.backend.resolvePath(bgFile);
129
+ const videoLayer: VideoLayer = {
130
+ type: "video",
131
+ path: bgPath,
132
+ resizeMode: "cover",
133
+ };
134
+ layers.push(videoLayer);
121
135
  } else {
122
- const bgFile = await renderImage(props.background, ctx);
136
+ const bgFile = await renderImage(
137
+ props.background as VargElement<"image">,
138
+ ctx,
139
+ );
123
140
  const bgPath = await ctx.backend.resolvePath(bgFile);
124
141
  layers.push({
125
142
  type: "image" as const,
@@ -1,5 +1,6 @@
1
1
  import type { ImageModelV3 } from "@ai-sdk/provider";
2
2
  import { generateImage, wrapImageModel } from "ai";
3
+ import pMap from "p-map";
3
4
  import { type CacheStorage, withCache } from "../../ai-sdk/cache";
4
5
  import type { File, File as VargFile } from "../../ai-sdk/file";
5
6
  import { fileCache } from "../../ai-sdk/file-cache";
@@ -9,7 +10,6 @@ import {
9
10
  placeholderFallbackMiddleware,
10
11
  wrapVideoModel,
11
12
  } from "../../ai-sdk/middleware";
12
-
13
13
  import { editly, localBackend } from "../../ai-sdk/providers/editly";
14
14
  import type {
15
15
  AudioTrack,
@@ -236,15 +236,42 @@ export async function renderRoot(
236
236
  }
237
237
  }
238
238
 
239
- const clipResults = await Promise.allSettled(
240
- clipElements.map((clipElement) => renderClip(clipElement, ctx)),
239
+ const concurrency =
240
+ options.concurrency === undefined
241
+ ? Number.POSITIVE_INFINITY
242
+ : options.concurrency;
243
+
244
+ if (
245
+ concurrency !== Number.POSITIVE_INFINITY &&
246
+ (!Number.isInteger(concurrency) || concurrency < 1)
247
+ ) {
248
+ throw new Error("render option `concurrency` must be a positive integer");
249
+ }
250
+
251
+ const clipResults = await pMap(
252
+ clipElements,
253
+ async (clipElement, i) => {
254
+ try {
255
+ return {
256
+ status: "fulfilled" as const,
257
+ value: await renderClip(clipElement, ctx),
258
+ index: i,
259
+ };
260
+ } catch (reason) {
261
+ return {
262
+ status: "rejected" as const,
263
+ reason: reason as Error,
264
+ index: i,
265
+ };
266
+ }
267
+ },
268
+ { concurrency },
241
269
  );
242
270
 
243
- const failures = clipResults
244
- .map((r, i) =>
245
- r.status === "rejected" ? { index: i, reason: r.reason } : null,
246
- )
247
- .filter(Boolean) as { index: number; reason: Error }[];
271
+ const failures = clipResults.filter(
272
+ (r): r is Extract<typeof r, { status: "rejected" }> =>
273
+ r.status === "rejected",
274
+ );
248
275
 
249
276
  if (failures.length > 0) {
250
277
  const successCount = clipResults.length - failures.length;
@@ -266,11 +293,10 @@ export async function renderRoot(
266
293
  );
267
294
  }
268
295
 
269
- const renderedClips = clipResults.map(
270
- (r) =>
271
- (r as PromiseFulfilledResult<Awaited<ReturnType<typeof renderClip>>>)
272
- .value,
273
- );
296
+ const renderedClips = clipResults.map((r) => {
297
+ if (r.status !== "fulfilled") throw new Error("unexpected");
298
+ return r.value;
299
+ });
274
300
 
275
301
  const clips: Clip[] = [];
276
302
  let currentTime = 0;
@@ -209,7 +209,16 @@ export interface SwipeProps extends BaseProps {
209
209
  }
210
210
 
211
211
  export interface PackshotProps extends BaseProps {
212
- background?: VargElement<"image"> | string;
212
+ /**
213
+ * Packshot background.
214
+ *
215
+ * - `string` — treated as a solid fill color (e.g. `"#000000"`).
216
+ * - `VargElement<"image">` — a generated or static image, rendered and
217
+ * used as a full-bleed cover background.
218
+ * - `VargElement<"video">` — a generated or static video, rendered and
219
+ * used as a looping full-bleed cover background.
220
+ */
221
+ background?: VargElement<"image"> | VargElement<"video"> | string;
213
222
  logo?: string;
214
223
  /**
215
224
  * Logo position on screen.
@@ -276,6 +285,8 @@ export interface RenderOptions {
276
285
  defaults?: DefaultModels;
277
286
  backend?: FFmpegBackend;
278
287
  storage?: StorageProvider;
288
+ /** Max concurrent clip renders. Defaults to unlimited. */
289
+ concurrency?: number;
279
290
  }
280
291
 
281
292
  // Re-export from file module for convenience