npm - vargai - Versions diffs - 0.4.0-alpha95 → 0.4.0-alpha97 - Mend

vargai 0.4.0-alpha95 → 0.4.0-alpha97

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/package.json +1 -1
package/src/react/elements.ts +7 -3
package/src/react/renderers/clip.ts +22 -0
package/src/react/renderers/talking-head.ts +109 -0
package/src/react/resolve.ts +95 -0
package/src/react/types.ts +8 -4

package/package.json CHANGED Viewed

@@ -104,7 +104,7 @@
   "license": "Apache-2.0",
   "author": "varg.ai <hello@varg.ai> (https://varg.ai)",
   "sideEffects": false,
-  "version": "0.4.0-alpha95",
+  "version": "0.4.0-alpha97",
   "exports": {
     ".": "./src/index.ts",
     "./ai": "./src/ai-sdk/index.ts",

package/src/react/elements.ts CHANGED Viewed

@@ -2,6 +2,7 @@ import {
   resolveImageElement,
   resolveMusicElement,
   resolveSpeechElement,
+  resolveTalkingHeadElement,
   resolveVideoElement,
 } from "./resolve";
 import type { ResolvedElement } from "./resolved-element";
@@ -141,11 +142,14 @@ export function Speech(
 export function TalkingHead(
   props: TalkingHeadProps,
-): VargElement<"talking-head"> {
-  return createElement(
+): VargElement<"talking-head"> & PromiseLike<ResolvedElement<"talking-head">> {
+  const element = createElement(
     "talking-head",
     props as Record<string, unknown>,
-    props.children,
+    undefined,
+  );
+  return makeThenable(element, (el) =>
+    resolveTalkingHeadElement(el, el.props as unknown as TalkingHeadProps),
   );
 }

package/src/react/renderers/clip.ts CHANGED Viewed

@@ -26,6 +26,7 @@ import { renderSlider } from "./slider";
 import { renderSpeech } from "./speech";
 import { renderSubtitle } from "./subtitle";
 import { renderSwipe } from "./swipe";
+import { renderTalkingHead } from "./talking-head";
 import { renderTitle } from "./title";
 import { resolvePath } from "./utils";
 import { renderVideo } from "./video";
@@ -151,6 +152,27 @@ async function renderClipLayers(
         break;
       }
+      case "talking-head": {
+        pending.push({
+          type: "async",
+          promise: renderTalkingHead(
+            element as VargElement<"talking-head">,
+            ctx,
+          )
+            .then((file) => ctx.backend.resolvePath(file))
+            .then(
+              (path) =>
+                ({
+                  type: "video",
+                  path,
+                  resizeMode: "cover",
+                  mixVolume: 1,
+                }) as VideoLayer,
+            ),
+        });
+        break;
+      }
       case "music": {
         const props = element.props as MusicProps;
         pending.push({

package/src/react/renderers/talking-head.ts ADDED Viewed

@@ -0,0 +1,109 @@
+import type { File } from "../../ai-sdk/file";
+import { ResolvedElement } from "../resolved-element";
+import type { TalkingHeadProps, VargElement } from "../types";
+import type { RenderContext } from "./context";
+import { renderImage } from "./image";
+import { renderSpeech } from "./speech";
+import { renderVideo } from "./video";
+/**
+ * Render a TalkingHead element into a video file.
+ *
+ * Pipeline:
+ * 1. Resolve the character image from `image` prop (VargElement or ResolvedElement)
+ * 2. Resolve the speech audio from `audio` prop (VargElement or ResolvedElement)
+ * 3. Generate a lipsync video via `lipsyncModel ?? model` (image + audio → video)
+ * @returns The video File from `renderVideo`, or the stored file when `element` is already a ResolvedElement.
+ * @throws Error when no model is available (prop or `defaults.video`), or when `image` / `audio` is missing.
+ */
+export async function renderTalkingHead(
+  element: VargElement<"talking-head">,
+  ctx: RenderContext,
+): Promise<File> {
+  // If already resolved via `await TalkingHead(...)`, record the pre-generated file in ctx.generatedFiles and reuse it
+  if (element instanceof ResolvedElement) {
+    ctx.generatedFiles.push(element.meta.file);
+    return element.meta.file;
+  }
+  const props = element.props as TalkingHeadProps;
+  const model = props.model ?? ctx.defaults?.video;
+  if (!model) {
+    throw new Error(
+      "TalkingHead requires 'model' prop (or set defaults.video in render options)",
+    );
+  }
+  if (!props.image) {
+    throw new Error("TalkingHead requires 'image' prop (an Image element)");
+  }
+  if (!props.audio) {
+    throw new Error("TalkingHead requires 'audio' prop (a Speech element)");
+  }
+  // Step 1 & 2: Resolve character image and speech audio in parallel
+  const [characterFile, speechFile] = await Promise.all([
+    resolveImageProp(props.image, ctx),
+    resolveAudioProp(props.audio, ctx),
+  ]);
+  // Step 3: Generate lipsync video (image + audio → video). `lipsyncModel` wins over `model`, but `model` was still required by the guard above.
+  const lipsyncModel = props.lipsyncModel ?? model;
+  const characterImageData = await characterFile.arrayBuffer();
+  const speechAudioData = await speechFile.arrayBuffer();
+  // Create a synthetic video element for the lipsync generation.
+  // Lipsync models (sync-v2-pro, etc.) require `video_url`, not `image_url`,
+  // so we pass the character image as the `video` input. The fal provider will
+  // upload it and set `video_url` in the API request. Fal.ai accepts image
+  // files as the video input for lipsync — it treats them as single-frame video.
+  const videoElement: VargElement<"video"> = {
+    type: "video",
+    props: {
+      prompt: {
+        video: characterImageData,
+        audio: speechAudioData,
+      },
+      model: lipsyncModel,
+      keepAudio: true,
+      providerOptions: { fal: { resolution: props.resolution ?? "720p" } },
+    },
+    children: [],
+  };
+  return renderVideo(videoElement, ctx);
+}
+/**
+ * Resolve an image prop to a File. A ResolvedElement has its `meta.file` recorded in
+ * `ctx.generatedFiles` and returned as-is; any other element is rendered via `renderImage`.
+ */
+async function resolveImageProp(
+  image: VargElement<"image">,
+  ctx: RenderContext,
+): Promise<File> {
+  if (image instanceof ResolvedElement) {
+    ctx.generatedFiles.push(image.meta.file);
+    return image.meta.file;
+  }
+  return renderImage(image, ctx);
+}
+/**
+ * Resolve an audio prop to a File. A ResolvedElement has its `meta.file` recorded in
+ * `ctx.generatedFiles` and returned as-is; any other element is rendered via `renderSpeech`.
+ */
+async function resolveAudioProp(
+  audio: VargElement<"speech">,
+  ctx: RenderContext,
+): Promise<File> {
+  if (audio instanceof ResolvedElement) {
+    ctx.generatedFiles.push(audio.meta.file);
+    return audio.meta.file;
+  }
+  return renderSpeech(audio, ctx);
+}

package/src/react/resolve.ts CHANGED Viewed

@@ -37,6 +37,7 @@ import type {
   ImageProps,
   MusicProps,
   SpeechProps,
+  TalkingHeadProps,
   VargElement,
 } from "./types";
@@ -748,3 +749,97 @@ export async function resolveMusicElement(
     duration,
   });
 }
+// ---------------------------------------------------------------------------
+// TalkingHead
+// ---------------------------------------------------------------------------
+/**
+ * Resolve a TalkingHead element into a ResolvedElement<"talking-head"> wrapping
+ * the generated lipsync video file and its probed duration.
+ *
+ * Pipeline:
+ * 1. Resolve the image from `image` prop (generate or reuse pre-resolved)
+ * 2. Resolve the speech from `audio` prop (generate or reuse pre-resolved)
+ * 3. Generate lipsync video from image + audio via `lipsyncModel ?? model`
+ * @throws Error when `model`, `image`, or `audio` is missing. NOTE(review): `props.resolution` is not read here — confirm intended.
+ */
+export async function resolveTalkingHeadElement(
+  element: VargElement<"talking-head">,
+  props: TalkingHeadProps,
+): Promise<ResolvedElement<"talking-head">> {
+  const model = props.model;
+  if (!model) {
+    throw new Error(
+      "await TalkingHead() requires 'model' prop for lipsync video generation",
+    );
+  }
+  if (!props.image) {
+    throw new Error(
+      "await TalkingHead() requires 'image' prop (an Image element).",
+    );
+  }
+  if (!props.audio) {
+    throw new Error(
+      "await TalkingHead() requires 'audio' prop (a Speech element).",
+    );
+  }
+  // Step 1: Resolve image — if it's a ResolvedElement, use its file directly;
+  // otherwise resolve the lazy Image element via resolveImageElement.
+  const resolvedImage =
+    props.image instanceof ResolvedElement
+      ? props.image
+      : await resolveImageElement(props.image, props.image.props as ImageProps);
+  const characterBytes = new Uint8Array(await resolvedImage.file.arrayBuffer());
+  // Step 2: Resolve speech — same pattern, via resolveSpeechElement. Awaited only after step 1 finishes (sequential, not parallel).
+  const resolvedSpeech =
+    props.audio instanceof ResolvedElement
+      ? props.audio
+      : await resolveSpeechElement(
+          props.audio,
+          props.audio.props as SpeechProps,
+        );
+  const speechBytes = new Uint8Array(await resolvedSpeech.file.arrayBuffer());
+  // Step 3: Generate lipsync video (image + audio → video); `lipsyncModel` takes precedence over `model`
+  const lipsyncModel = props.lipsyncModel ?? model;
+  const generateVideo = getCachedGenerateVideo();
+  // Lipsync models require `video_url`, not `image_url`, so pass the
+  // character image as the `video` input (fal accepts images as video input).
+  const { video } = await generateVideo({
+    model: lipsyncModel as Parameters<typeof generateVideoRaw>[0]["model"],
+    prompt: {
+      video: characterBytes,
+      audio: speechBytes,
+    },
+    duration: 0, // duration determined by audio length
+  });
+  const mediaType = video.mimeType ?? "video/mp4";
+  const modelId =
+    typeof lipsyncModel === "string" ? lipsyncModel : lipsyncModel.modelId;
+  const promptLabel =
+    getTextContent(element.children) ?? "talking-head lipsync";
+  const file = File.fromGenerated({
+    uint8Array: video.uint8Array,
+    mediaType,
+    url: (video as { url?: string }).url,
+  }).withMetadata({
+    type: "video",
+    model: modelId,
+    prompt: `talking-head: ${promptLabel.slice(0, 100)}`,
+  });
+  const duration = await probeDuration(file);
+  return new ResolvedElement(element, {
+    file,
+    duration,
+  });
+}

package/src/react/types.ts CHANGED Viewed

@@ -198,16 +198,20 @@ export interface SpeechProps extends BaseProps, VolumeProps {
 }
 export interface TalkingHeadProps extends BaseProps {
-  character?: string;
-  src?: string;
-  voice?: string;
+  /** Pre-resolved or lazy image element to use as the character face. */
+  image?: VargElement<"image">;
+  /** Pre-resolved or lazy speech element to use as the audio track. */
+  audio?: VargElement<"speech">;
+  /** Lipsync video model (e.g. fal.videoModel("sync-v2-pro")). */
   model?: VideoModelV3;
+  /** Separate lipsync model override (defaults to `model`). */
   lipsyncModel?: VideoModelV3;
+  /** Video resolution for lipsync generation (default: "720p") */
+  resolution?: "480p" | "720p" | "1080p";
   position?:
     | Position
     | { left?: string; right?: string; top?: string; bottom?: string };
   size?: { width: string; height: string };
-  children?: string;
 }
 export interface TitleProps extends BaseProps {