vargai 0.4.0-alpha96 → 0.4.0-alpha97

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/package.json CHANGED
@@ -104,7 +104,7 @@
104
104
  "license": "Apache-2.0",
105
105
  "author": "varg.ai <hello@varg.ai> (https://varg.ai)",
106
106
  "sideEffects": false,
107
- "version": "0.4.0-alpha96",
107
+ "version": "0.4.0-alpha97",
108
108
  "exports": {
109
109
  ".": "./src/index.ts",
110
110
  "./ai": "./src/ai-sdk/index.ts",
@@ -54,12 +54,16 @@ export async function renderTalkingHead(
54
54
  const characterImageData = await characterFile.arrayBuffer();
55
55
  const speechAudioData = await speechFile.arrayBuffer();
56
56
 
57
- // Create a synthetic video element for the lipsync generation
57
+ // Create a synthetic video element for the lipsync generation.
58
+ // Lipsync models (sync-v2-pro, etc.) require `video_url`, not `image_url`,
59
+ // so we pass the character image as the `video` input. The fal provider will
60
+ // upload it and set `video_url` in the API request. Fal.ai accepts image
61
+ // files as the video input for lipsync — it treats them as single-frame video.
58
62
  const videoElement: VargElement<"video"> = {
59
63
  type: "video",
60
64
  props: {
61
65
  prompt: {
62
- images: [characterImageData],
66
+ video: characterImageData,
63
67
  audio: speechAudioData,
64
68
  },
65
69
  model: lipsyncModel,
@@ -808,10 +808,12 @@ export async function resolveTalkingHeadElement(
808
808
  const lipsyncModel = props.lipsyncModel ?? model;
809
809
  const generateVideo = getCachedGenerateVideo();
810
810
 
811
+ // Lipsync models require `video_url`, not `image_url`, so pass the
812
+ // character image as the `video` input (fal accepts images as video input).
811
813
  const { video } = await generateVideo({
812
814
  model: lipsyncModel as Parameters<typeof generateVideoRaw>[0]["model"],
813
815
  prompt: {
814
- images: [characterBytes],
816
+ video: characterBytes,
815
817
  audio: speechBytes,
816
818
  },
817
819
  duration: 0, // duration determined by audio length