npm - vargai - Versions diffs - 0.4.0-alpha96 → 0.4.0-alpha98 - Mend

vargai 0.4.0-alpha96 → 0.4.0-alpha98

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (3) hide show

package/package.json +1 -1
package/src/react/renderers/talking-head.ts +59 -1
package/src/react/resolve.ts +3 -0

package/package.json CHANGED Viewed

@@ -104,7 +104,7 @@
   "license": "Apache-2.0",
   "author": "varg.ai <hello@varg.ai> (https://varg.ai)",
   "sideEffects": false,
-  "version": "0.4.0-alpha96",
+  "version": "0.4.0-alpha98",
   "exports": {
     ".": "./src/index.ts",
     "./ai": "./src/ai-sdk/index.ts",

package/src/react/renderers/talking-head.ts CHANGED Viewed

@@ -6,6 +6,17 @@ import { renderImage } from "./image";
 import { renderSpeech } from "./speech";
 import { renderVideo } from "./video";
+/**
+ * Lipsync models that require a video input (not a static image).
+ * When TalkingHead uses one of these, we first generate a short video
+ * from the image via an image-to-video model, then lipsync.
+ */
+const VIDEO_ONLY_LIPSYNC_MODELS = new Set([
+  "sync-v2",
+  "sync-v2-pro",
+  "lipsync",
+]);
 /**
  * Render a TalkingHead element into a video file.
  *
@@ -13,6 +24,9 @@ import { renderVideo } from "./video";
  * 1. Resolve the character image from `image` prop (VargElement or ResolvedElement)
  * 2. Resolve the speech audio from `audio` prop (VargElement or ResolvedElement)
  * 3. Generate a lipsync video via `model` (image + audio → video)
+ *    - For models that accept images (e.g. veed-fabric, omnihuman): pass the image directly
+ *    - For models that require video (sync-v2, sync-v2-pro, lipsync): first animate
+ *      image → video, then lipsync the video
  *
  * The result is a video File suitable for use as a VideoLayer.
  */
@@ -54,7 +68,51 @@ export async function renderTalkingHead(
   const characterImageData = await characterFile.arrayBuffer();
   const speechAudioData = await speechFile.arrayBuffer();
-  // Create a synthetic video element for the lipsync generation
+  // Determine the model ID to check if it requires video input
+  const modelId =
+    typeof lipsyncModel === "string" ? lipsyncModel : lipsyncModel.modelId;
+  const requiresVideo = VIDEO_ONLY_LIPSYNC_MODELS.has(modelId);
+  if (requiresVideo) {
+    // Models like sync-v2-pro require a video, not a static image.
+    // First animate the image into a short video, then lipsync.
+    const animateVideoElement: VargElement<"video"> = {
+      type: "video",
+      props: {
+        prompt: {
+          images: [characterImageData],
+          text: "person looking at camera, subtle idle movement, breathing, blinking",
+        },
+        model: ctx.defaults?.video,
+        providerOptions: {
+          fal: { resolution: props.resolution ?? "720p" },
+        },
+      },
+      children: [],
+    };
+    const animatedFile = await renderVideo(animateVideoElement, ctx);
+    const animatedVideoData = await animatedFile.arrayBuffer();
+    // Now lipsync the animated video with the speech audio
+    const lipsyncElement: VargElement<"video"> = {
+      type: "video",
+      props: {
+        prompt: {
+          video: animatedVideoData,
+          audio: speechAudioData,
+        },
+        model: lipsyncModel,
+        keepAudio: true,
+        providerOptions: { fal: { resolution: props.resolution ?? "720p" } },
+      },
+      children: [],
+    };
+    return renderVideo(lipsyncElement, ctx);
+  }
+  // For models that accept images directly (veed-fabric, omnihuman, etc.)
   const videoElement: VargElement<"video"> = {
     type: "video",
     props: {

package/src/react/resolve.ts CHANGED Viewed

@@ -808,6 +808,9 @@ export async function resolveTalkingHeadElement(
   const lipsyncModel = props.lipsyncModel ?? model;
   const generateVideo = getCachedGenerateVideo();
+  // Pass image + audio straight to the lipsync model. Models like veed-fabric
+  // and omnihuman accept images directly. Standalone `await TalkingHead()` does
+  // not support the animate-then-lipsync path; use render() for video-only models.
   const { video } = await generateVideo({
     model: lipsyncModel as Parameters<typeof generateVideoRaw>[0]["model"],
     prompt: {