vargai 0.4.0-alpha61 → 0.4.0-alpha62

This diff shows the published contents of the two package versions as they appear in their public registry and is provided for informational purposes only.
@@ -40,7 +40,9 @@ async function testGrokTextToVideo() {
 
  // Save the video
  const outputPath = join(import.meta.dir, "../output/grok-t2v-test.mp4");
- await writeFile(outputPath, result.videos[0]!);
+ const firstVideo = result.videos[0];
+ if (!firstVideo) throw new Error("No video returned from model");
+ await writeFile(outputPath, firstVideo);
  console.log(`Video saved to: ${outputPath}`);
 
  return outputPath;
@@ -88,7 +90,9 @@ async function testGrokImageToVideo() {
 
  // Save the video
  const outputPath = join(import.meta.dir, "../output/grok-i2v-test.mp4");
- await writeFile(outputPath, result.videos[0]!);
+ const firstVideo = result.videos[0];
+ if (!firstVideo) throw new Error("No video returned from model");
+ await writeFile(outputPath, firstVideo);
  console.log(`Video saved to: ${outputPath}`);
 
  return outputPath;
@@ -136,7 +140,9 @@ async function testGrokEditVideo() {
 
  // Save the video
  const outputPath = join(import.meta.dir, "../output/grok-edit-test.mp4");
- await writeFile(outputPath, result.videos[0]!);
+ const firstVideo = result.videos[0];
+ if (!firstVideo) throw new Error("No video returned from model");
+ await writeFile(outputPath, firstVideo);
  console.log(`Video saved to: ${outputPath}`);
 
  return outputPath;
package/package.json CHANGED
@@ -70,7 +70,7 @@
  "zod": "^4.2.1"
  },
  "sideEffects": false,
- "version": "0.4.0-alpha61",
+ "version": "0.4.0-alpha62",
  "exports": {
  ".": "./src/index.ts",
  "./ai": "./src/ai-sdk/index.ts",
@@ -269,6 +269,12 @@ function buildBaseClipFilter(
  const layer = clipLocalOverlays[i];
  if (!layer) continue;
 
+ if (!baseLabel) {
+ throw new Error(
+ `Clip ${clipIndex} is missing a base layer for overlay placement — ensure it has at least one visual layer (video, image, or fill-color)`,
+ );
+ }
+
  const overlayFilter = getVideoFilter(
  layer,
  inputIdx,
@@ -283,7 +289,7 @@ function buildBaseClipFilter(
 
  const outputLabel = `clip${clipIndex}ov${i}`;
  const positionFilter = getOverlayFilter(
- baseLabel!,
+ baseLabel,
  overlayFilter.outputLabel,
  layer,
  width,
@@ -164,6 +164,8 @@ const LIPSYNC_MODELS: Record<string, string> = {
  "sync-v2": "fal-ai/sync-lipsync",
  "sync-v2-pro": "fal-ai/sync-lipsync/v2",
  lipsync: "fal-ai/sync-lipsync",
+ "omnihuman-v1.5": "fal-ai/bytedance/omnihuman/v1.5",
+ "veed-fabric-1.0": "veed/fabric-1.0",
  };
 
  const IMAGE_MODELS: Record<string, string> = {
@@ -474,20 +476,30 @@ class FalVideoModel implements VideoModelV3 {
  };
 
  if (isLipsync) {
- // Lipsync: video + audio input
+ // Lipsync: either (video + audio) or (image + audio), depending on model
  const videoFile = files?.find((f) =>
  getMediaType(f)?.startsWith("video/"),
  );
+ const imageFile = files?.find((f) =>
+ getMediaType(f)?.startsWith("image/"),
+ );
  const audioFile = files?.find((f) =>
  getMediaType(f)?.startsWith("audio/"),
  );
 
  if (videoFile) {
  input.video_url = await fileToUrl(videoFile);
+ } else if (imageFile) {
+ input.image_url = await fileToUrl(imageFile);
  }
  if (audioFile) {
  input.audio_url = await fileToUrl(audioFile);
  }
+
+ // OmniHuman supports an optional prompt
+ if (prompt && this.modelId === "omnihuman-v1.5") {
+ input.prompt = prompt;
+ }
  } else if (isMotionControl) {
  // Motion control: image + reference video input
  if (prompt) {
@@ -15,6 +15,11 @@ import { ffmpegProvider } from "../../providers/ffmpeg";
 
  // Input schema with Zod
  const syncInputSchema = z.object({
+ model: z
+ .enum(["wan-25", "omnihuman-v1.5", "veed-fabric-1.0"])
+ .optional()
+ .default("wan-25")
+ .describe("Lip sync / avatar backend model"),
  image: filePathSchema.describe("Input image"),
  audio: filePathSchema.describe("Audio file"),
  prompt: z.string().describe("Description of the scene"),
@@ -40,13 +45,14 @@ export const definition: ActionDefinition<typeof schema> = {
  schema,
  routes: [],
  execute: async (inputs) => {
- const { image, audio, prompt, duration, resolution } = inputs;
- return lipsync({ image, audio, prompt, duration, resolution });
+ const { model, image, audio, prompt, duration, resolution } = inputs;
+ return lipsync({ model, image, audio, prompt, duration, resolution });
  },
  };
 
  // Types
  export interface LipsyncOptions {
+ model?: "wan-25" | "omnihuman-v1.5" | "veed-fabric-1.0";
  image: string;
  audio: string;
  prompt: string;
@@ -65,20 +71,56 @@ export interface Wav2LipOptions {
  }
 
  /**
- * Generate lip-synced video using Wan-25
+ * Generate lip-synced / avatar video using selected backend.
  */
  export async function lipsync(options: LipsyncOptions): Promise<LipsyncResult> {
- const { image, audio, prompt, duration = "5", resolution = "480p" } = options;
+ const {
+ model = "wan-25",
+ image,
+ audio,
+ prompt,
+ duration = "5",
+ resolution = "480p",
+ } = options;
 
- console.log("[sync] generating lip-synced video with wan-25...");
+ console.log(`[sync] generating lip-synced video with ${model}...`);
 
- const result = await falProvider.wan25({
- imageUrl: image,
- audioUrl: audio,
- prompt,
- duration,
- resolution,
- });
+ if (model === "omnihuman-v1.5" && resolution === "480p") {
+ console.warn(
+ "[sync] omnihuman-v1.5 does not support 480p; using 720p instead",
+ );
+ }
+ if (model === "veed-fabric-1.0" && resolution === "1080p") {
+ console.warn(
+ "[sync] veed-fabric-1.0 does not support 1080p; using 720p instead",
+ );
+ }
+
+ const result =
+ model === "omnihuman-v1.5"
+ ? await falProvider.omnihuman15({
+ imageUrl: image,
+ audioUrl: audio,
+ prompt,
+ resolution: (resolution === "480p" ? "720p" : resolution) as
+ | "720p"
+ | "1080p",
+ })
+ : model === "veed-fabric-1.0"
+ ? await falProvider.veedFabric10({
+ imageUrl: image,
+ audioUrl: audio,
+ resolution: (resolution === "1080p" ? "720p" : resolution) as
+ | "480p"
+ | "720p",
+ })
+ : await falProvider.wan25({
+ imageUrl: image,
+ audioUrl: audio,
+ prompt,
+ duration,
+ resolution,
+ });
 
  const videoUrl = result.data?.video?.url;
  if (!videoUrl) {
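
For orientation, a minimal sketch of calling the updated lipsync() helper with the new backend selector. This is not taken from the package: the import path and file paths are placeholders, and omitting model keeps the previous behaviour because the schema defaults to "wan-25".

import { lipsync } from "./sync"; // hypothetical import path for the sync action module

// Route the same image + audio through the new OmniHuman backend.
// "portrait.png" and "voiceover.mp3" are placeholder local files.
const result = await lipsync({
  model: "omnihuman-v1.5",
  image: "portrait.png",
  audio: "voiceover.mp3",
  prompt: "A person speaking directly to camera",
  resolution: "720p", // passing "480p" would be bumped to 720p with a warning
});
console.log(result);
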
@@ -7,8 +7,10 @@ export { definition as flux } from "./flux";
  export { definition as kling } from "./kling";
  export { definition as llama } from "./llama";
  export { definition as nanoBananaPro } from "./nano-banana-pro";
+ export { definition as omnihuman } from "./omnihuman";
  export { definition as sonauto } from "./sonauto";
  export { definition as soul } from "./soul";
+ export { definition as veedFabric } from "./veed-fabric";
  export { definition as wan } from "./wan";
  export { definition as whisper } from "./whisper";
 
@@ -18,8 +20,10 @@ import { definition as fluxDefinition } from "./flux";
  import { definition as klingDefinition } from "./kling";
  import { definition as llamaDefinition } from "./llama";
  import { definition as nanoBananaProDefinition } from "./nano-banana-pro";
+ import { definition as omnihumanDefinition } from "./omnihuman";
  import { definition as sonautoDefinition } from "./sonauto";
  import { definition as soulDefinition } from "./soul";
+ import { definition as veedFabricDefinition } from "./veed-fabric";
  import { definition as wanDefinition } from "./wan";
  import { definition as whisperDefinition } from "./whisper";
 
@@ -28,6 +32,8 @@ export const allModels = [
  fluxDefinition,
  nanoBananaProDefinition,
  wanDefinition,
+ omnihumanDefinition,
+ veedFabricDefinition,
  whisperDefinition,
  elevenlabsDefinition,
  soulDefinition,
@@ -0,0 +1,71 @@
+ /**
+ * Bytedance OmniHuman v1.5
+ * Image + audio -> video (full-body human animation)
+ */
+
+ import { z } from "zod";
+ import { urlSchema } from "../../core/schema/shared";
+ import type { ModelDefinition, ZodSchema } from "../../core/schema/types";
+
+ const omnihumanResolutionSchema = z
+ .enum(["720p", "1080p"])
+ .describe("Output resolution");
+
+ // Input schema with Zod
+ const omnihumanInputSchema = z.object({
+ prompt: z
+ .string()
+ .optional()
+ .describe("The text prompt used to guide the video generation"),
+ image_url: urlSchema.describe(
+ "The URL of the image used to generate the video",
+ ),
+ audio_url: urlSchema.describe(
+ "The URL of the audio file to generate the video",
+ ),
+ turbo_mode: z
+ .boolean()
+ .optional()
+ .default(false)
+ .describe("Faster generation with slight quality trade-off"),
+ resolution: omnihumanResolutionSchema
+ .optional()
+ .default("1080p")
+ .describe(
+ "The resolution of the generated video. 720p generation is faster and higher in quality",
+ ),
+ });
+
+ // Output schema with Zod
+ const omnihumanOutputSchema = z.object({
+ video: z.object({
+ url: z.string(),
+ }),
+ duration: z
+ .number()
+ .optional()
+ .describe("Duration of audio input/video output as used for billing"),
+ });
+
+ const schema: ZodSchema<
+ typeof omnihumanInputSchema,
+ typeof omnihumanOutputSchema
+ > = {
+ input: omnihumanInputSchema,
+ output: omnihumanOutputSchema,
+ };
+
+ export const definition: ModelDefinition<typeof schema> = {
+ type: "model",
+ name: "omnihuman",
+ description:
+ "OmniHuman v1.5 - generate a vivid talking video from an image and an audio file",
+ providers: ["fal"],
+ defaultProvider: "fal",
+ providerModels: {
+ fal: "fal-ai/bytedance/omnihuman/v1.5",
+ },
+ schema,
+ };
+
+ export default definition;
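
To illustrate the defaults above, a small sketch that parses a minimal payload through the definition's input schema; the import path and URLs are placeholders, not part of the diff.

import omnihuman from "./omnihuman"; // hypothetical relative path to this new module

// Only the two required URLs are supplied; Zod fills in the defaults.
const parsed = omnihuman.schema.input.parse({
  image_url: "https://example.com/portrait.png", // placeholder
  audio_url: "https://example.com/voice.mp3", // placeholder
});
console.log(parsed.turbo_mode, parsed.resolution); // false "1080p"
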
@@ -0,0 +1,49 @@
+ /**
+ * VEED Fabric 1.0
+ * Image + audio -> talking video
+ */
+
+ import { z } from "zod";
+ import { urlSchema } from "../../core/schema/shared";
+ import type { ModelDefinition, ZodSchema } from "../../core/schema/types";
+
+ const fabricResolutionSchema = z
+ .enum(["480p", "720p"])
+ .describe("Output resolution");
+
+ // Input schema with Zod
+ const veedFabricInputSchema = z.object({
+ image_url: urlSchema.describe("Input image URL"),
+ audio_url: urlSchema.describe("Input audio URL"),
+ resolution: fabricResolutionSchema.describe("Output resolution"),
+ });
+
+ // Output schema with Zod
+ const veedFabricOutputSchema = z.object({
+ video: z.object({
+ content_type: z.string().optional(),
+ url: z.string().url(),
+ }),
+ });
+
+ const schema: ZodSchema<
+ typeof veedFabricInputSchema,
+ typeof veedFabricOutputSchema
+ > = {
+ input: veedFabricInputSchema,
+ output: veedFabricOutputSchema,
+ };
+
+ export const definition: ModelDefinition<typeof schema> = {
+ type: "model",
+ name: "veed-fabric",
+ description: "VEED Fabric 1.0 - turn an image into a talking video",
+ providers: ["fal"],
+ defaultProvider: "fal",
+ providerModels: {
+ fal: "veed/fabric-1.0",
+ },
+ schema,
+ };
+
+ export default definition;
@@ -332,6 +332,86 @@ export class FalProvider extends BaseProvider {
  return result;
  }
 
+ async omnihuman15(args: {
+ imageUrl: string;
+ audioUrl: string;
+ prompt?: string;
+ turboMode?: boolean;
+ resolution?: "720p" | "1080p";
+ }) {
+ const modelId: string = "fal-ai/bytedance/omnihuman/v1.5";
+
+ console.log(`[fal] starting omnihuman v1.5: ${modelId}`);
+
+ const imageUrl = await ensureUrl(args.imageUrl, (buffer) =>
+ this.uploadFile(buffer),
+ );
+ const audioUrl = await ensureUrl(args.audioUrl, (buffer) =>
+ this.uploadFile(buffer),
+ );
+
+ const input: Record<string, unknown> = {
+ ...(args.prompt ? { prompt: args.prompt } : {}),
+ image_url: imageUrl,
+ audio_url: audioUrl,
+ turbo_mode: args.turboMode ?? false,
+ resolution: args.resolution ?? "1080p",
+ };
+
+ const result = await fal.subscribe(modelId, {
+ input,
+ logs: true,
+ onQueueUpdate: (update) => {
+ if (update.status === "IN_PROGRESS") {
+ console.log(
+ `[fal] ${update.logs?.map((l) => l.message).join(" ") || "processing..."}`,
+ );
+ }
+ },
+ });
+
+ console.log("[fal] completed!");
+ return result;
+ }
+
+ async veedFabric10(args: {
+ imageUrl: string;
+ audioUrl: string;
+ resolution: "480p" | "720p";
+ }) {
+ const modelId: string = "veed/fabric-1.0";
+
+ console.log(`[fal] starting veed fabric 1.0: ${modelId}`);
+
+ const imageUrl = await ensureUrl(args.imageUrl, (buffer) =>
+ this.uploadFile(buffer),
+ );
+ const audioUrl = await ensureUrl(args.audioUrl, (buffer) =>
+ this.uploadFile(buffer),
+ );
+
+ const input: Record<string, unknown> = {
+ image_url: imageUrl,
+ audio_url: audioUrl,
+ resolution: args.resolution,
+ };
+
+ const result = await fal.subscribe(modelId, {
+ input,
+ logs: true,
+ onQueueUpdate: (update) => {
+ if (update.status === "IN_PROGRESS") {
+ console.log(
+ `[fal] ${update.logs?.map((l) => l.message).join(" ") || "processing..."}`,
+ );
+ }
+ },
+ });
+
+ console.log("[fal] completed!");
+ return result;
+ }
+
 
  async textToMusic(args: {
  prompt?: string;
@@ -584,5 +664,10 @@ export const imageToImage = (
  ) => falProvider.imageToImage(args);
  export const wan25 = (args: Parameters<FalProvider["wan25"]>[0]) =>
  falProvider.wan25(args);
+ export const omnihuman15 = (args: Parameters<FalProvider["omnihuman15"]>[0]) =>
+ falProvider.omnihuman15(args);
+ export const veedFabric10 = (
+ args: Parameters<FalProvider["veedFabric10"]>[0],
+ ) => falProvider.veedFabric10(args);
  export const textToMusic = (args: Parameters<FalProvider["textToMusic"]>[0]) =>
  falProvider.textToMusic(args);
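
A brief sketch of calling the two new provider wrappers directly; the import path and media URLs are placeholders. Both methods route their inputs through ensureUrl, so values that are not already URLs appear to be uploaded first.

import { omnihuman15, veedFabric10 } from "./providers/fal"; // hypothetical import path

// OmniHuman v1.5: image + audio plus an optional prompt, 720p or 1080p output.
const human = await omnihuman15({
  imageUrl: "https://example.com/portrait.png", // placeholder
  audioUrl: "https://example.com/voice.mp3", // placeholder
  prompt: "Talking calmly to camera",
  resolution: "720p",
});

// VEED Fabric 1.0: image + audio only, 480p or 720p output.
const fabric = await veedFabric10({
  imageUrl: "https://example.com/portrait.png",
  audioUrl: "https://example.com/voice.mp3",
  resolution: "480p",
});

// Per the lipsync() change above, the video URL is read from result.data?.video?.url.
console.log(human.data?.video?.url, fabric.data?.video?.url);
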
@@ -0,0 +1,75 @@
+ /**
+ * Longer talking head demo (VEED Fabric 1.0):
+ * - character image from nano-banana-pro
+ * - voice from ElevenLabs
+ * - talking video from veed/fabric-1.0 (image + audio)
+ *
+ * Run: bun run src/react/examples/veed-fabric-long-talking-head.tsx
+ * Output: output/veed-fabric-long-talking-head.mp4
+ */
+
+ import { elevenlabs, fal } from "../../ai-sdk";
+ import { Clip, Image, Render, render, Speech, Video } from "..";
+
+ const SCRIPT =
+ "Hey, I am Nova. In this quick demo, you will hear a clean voiceover, and see a talking avatar generated from a single portrait. We are using VEED Fabric for image-to-video lipsync, and ElevenLabs for the voice.";
+
+ const portrait = Image({
+ prompt:
+ "Ultra-realistic studio portrait of Nova, a confident friendly product designer in her early 30s, warm smile, expressive eyes, subtle freckles, natural makeup, shoulder-length dark auburn hair, modern minimal wardrobe, cinematic softbox lighting, shallow depth of field, clean neutral background, high-end camera look",
+ model: fal.imageModel("nano-banana-pro"),
+ aspectRatio: "9:16",
+ });
+
+ const voiceover = Speech({
+ model: elevenlabs.speechModel("eleven_v3"),
+ voice: "adam",
+ children: SCRIPT,
+ });
+
+ const talking = Video({
+ model: fal.videoModel("veed-fabric-1.0"),
+ keepAudio: true,
+ prompt: {
+ images: [portrait],
+ audio: voiceover,
+ },
+ providerOptions: {
+ fal: {
+ resolution: "720p",
+ },
+ },
+ });
+
+ const demo = (
+ <Render width={1080} height={1920}>
+ <Clip duration="auto">{talking}</Clip>
+ </Render>
+ );
+
+ async function main() {
+ if (!process.env.FAL_API_KEY && !process.env.FAL_KEY) {
+ console.error("ERROR: FAL_API_KEY/FAL_KEY not found in environment");
+ process.exit(1);
+ }
+ if (!process.env.ELEVENLABS_API_KEY) {
+ console.error("ERROR: ELEVENLABS_API_KEY not found in environment");
+ process.exit(1);
+ }
+
+ const result = await render(demo, {
+ output: "output/veed-fabric-long-talking-head.mp4",
+ cache: ".cache/ai-veed-fabric-long-talking-head",
+ });
+
+ console.log(
+ `ok: output/veed-fabric-long-talking-head.mp4 (${(result.video.byteLength / 1024 / 1024).toFixed(2)} MB)`,
+ );
+ }
+
+ if (import.meta.main) {
+ main().catch((err) => {
+ console.error(err);
+ process.exit(1);
+ });
+ }
@@ -0,0 +1,60 @@
+ /**
+ * VEED Fabric 1.0 React syntax test
+ *
+ * Uses a local image + local audio file to generate a talking video.
+ *
+ * Run: bun run src/react/examples/veed-fabric-react-test.tsx
+ * Output: output/veed-fabric-react-test.mp4
+ */
+
+ import { fal } from "../../ai-sdk/providers/fal";
+ import { Clip, Render, render, Video } from "..";
+
+ const IMAGE_PATH = "output/garry-tan-image.png";
+ const AUDIO_PATH = "output/garry-tan-voice.mp3";
+
+ const RESOLUTION =
+ (process.env.FABRIC_RESOLUTION as "480p" | "720p" | undefined) ?? "720p";
+
+ const video = (
+ <Render width={720} height={1280}>
+ <Clip duration={5}>
+ <Video
+ model={fal.videoModel("veed-fabric-1.0")}
+ keepAudio
+ prompt={{
+ images: [IMAGE_PATH],
+ audio: AUDIO_PATH,
+ }}
+ providerOptions={{
+ fal: {
+ resolution: RESOLUTION,
+ },
+ }}
+ />
+ </Clip>
+ </Render>
+ );
+
+ async function main() {
+ if (!process.env.FAL_API_KEY && !process.env.FAL_KEY) {
+ console.error("ERROR: FAL_API_KEY/FAL_KEY not found in environment");
+ process.exit(1);
+ }
+
+ const result = await render(video, {
+ output: `output/veed-fabric-react-test-${RESOLUTION}.mp4`,
+ cache: `.cache/ai-veed-fabric-${RESOLUTION}-keepaudio`,
+ });
+
+ console.log(
+ `ok: output/veed-fabric-react-test-${RESOLUTION}.mp4 (${(result.video.byteLength / 1024 / 1024).toFixed(2)} MB)`,
+ );
+ }
+
+ if (import.meta.main) {
+ main().catch((err) => {
+ console.error(err);
+ process.exit(1);
+ });
+ }