vargai 0.4.0-alpha96 → 0.4.0-alpha98

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -104,7 +104,7 @@
104
104
  "license": "Apache-2.0",
105
105
  "author": "varg.ai <hello@varg.ai> (https://varg.ai)",
106
106
  "sideEffects": false,
107
- "version": "0.4.0-alpha96",
107
+ "version": "0.4.0-alpha98",
108
108
  "exports": {
109
109
  ".": "./src/index.ts",
110
110
  "./ai": "./src/ai-sdk/index.ts",
@@ -6,6 +6,17 @@ import { renderImage } from "./image";
6
6
  import { renderSpeech } from "./speech";
7
7
  import { renderVideo } from "./video";
8
8
 
9
+ /**
10
+ * Lipsync models that require a video input (not a static image).
11
+ * When TalkingHead uses one of these, we first generate a short video
12
+ * from the image via an image-to-video model, then lipsync.
13
+ */
14
+ const VIDEO_ONLY_LIPSYNC_MODELS = new Set([
15
+ "sync-v2",
16
+ "sync-v2-pro",
17
+ "lipsync",
18
+ ]);
19
+
9
20
  /**
10
21
  * Render a TalkingHead element into a video file.
11
22
  *
@@ -13,6 +24,9 @@ import { renderVideo } from "./video";
13
24
  * 1. Resolve the character image from `image` prop (VargElement or ResolvedElement)
14
25
  * 2. Resolve the speech audio from `audio` prop (VargElement or ResolvedElement)
15
26
  * 3. Generate a lipsync video via `model` (image + audio → video)
27
+ * - For models that accept images (veed-fabric, omnihuman): pass image directly
28
+ * - For models that require video (sync-v2, sync-v2-pro, lipsync): first animate image → video,
29
+ * then lipsync the video
16
30
  *
17
31
  * The result is a video File suitable for use as a VideoLayer.
18
32
  */
@@ -54,7 +68,51 @@ export async function renderTalkingHead(
54
68
  const characterImageData = await characterFile.arrayBuffer();
55
69
  const speechAudioData = await speechFile.arrayBuffer();
56
70
 
57
- // Create a synthetic video element for the lipsync generation
71
+ // Determine the model ID to check if it requires video input
72
+ const modelId =
73
+ typeof lipsyncModel === "string" ? lipsyncModel : lipsyncModel.modelId;
74
+ const requiresVideo = VIDEO_ONLY_LIPSYNC_MODELS.has(modelId);
75
+
76
+ if (requiresVideo) {
77
+ // Models like sync-v2-pro require a video, not a static image.
78
+ // First animate the image into a short video, then lipsync.
79
+ const animateVideoElement: VargElement<"video"> = {
80
+ type: "video",
81
+ props: {
82
+ prompt: {
83
+ images: [characterImageData],
84
+ text: "person looking at camera, subtle idle movement, breathing, blinking",
85
+ },
86
+ model: ctx.defaults?.video,
87
+ providerOptions: {
88
+ fal: { resolution: props.resolution ?? "720p" },
89
+ },
90
+ },
91
+ children: [],
92
+ };
93
+
94
+ const animatedFile = await renderVideo(animateVideoElement, ctx);
95
+ const animatedVideoData = await animatedFile.arrayBuffer();
96
+
97
+ // Now lipsync the animated video with the speech audio
98
+ const lipsyncElement: VargElement<"video"> = {
99
+ type: "video",
100
+ props: {
101
+ prompt: {
102
+ video: animatedVideoData,
103
+ audio: speechAudioData,
104
+ },
105
+ model: lipsyncModel,
106
+ keepAudio: true,
107
+ providerOptions: { fal: { resolution: props.resolution ?? "720p" } },
108
+ },
109
+ children: [],
110
+ };
111
+
112
+ return renderVideo(lipsyncElement, ctx);
113
+ }
114
+
115
+ // For models that accept images directly (veed-fabric, omnihuman, etc.)
58
116
  const videoElement: VargElement<"video"> = {
59
117
  type: "video",
60
118
  props: {
@@ -808,6 +808,9 @@ export async function resolveTalkingHeadElement(
808
808
  const lipsyncModel = props.lipsyncModel ?? model;
809
809
  const generateVideo = getCachedGenerateVideo();
810
810
 
811
+ // Pass image + audio to the lipsync model. Models like veed-fabric and
812
+ // omnihuman accept images directly. For standalone await TalkingHead(),
813
+ // we don't support the animate-then-lipsync path (use render() for that).
811
814
  const { video } = await generateVideo({
812
815
  model: lipsyncModel as Parameters<typeof generateVideoRaw>[0]["model"],
813
816
  prompt: {