vargai 0.4.0-alpha96 → 0.4.0-alpha98
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json
CHANGED
|
@@ -104,7 +104,7 @@
|
|
|
104
104
|
"license": "Apache-2.0",
|
|
105
105
|
"author": "varg.ai <hello@varg.ai> (https://varg.ai)",
|
|
106
106
|
"sideEffects": false,
|
|
107
|
-
"version": "0.4.0-alpha96",
|
|
107
|
+
"version": "0.4.0-alpha98",
|
|
108
108
|
"exports": {
|
|
109
109
|
".": "./src/index.ts",
|
|
110
110
|
"./ai": "./src/ai-sdk/index.ts",
|
|
@@ -6,6 +6,17 @@ import { renderImage } from "./image";
|
|
|
6
6
|
import { renderSpeech } from "./speech";
|
|
7
7
|
import { renderVideo } from "./video";
|
|
8
8
|
|
|
9
|
+
/**
|
|
10
|
+
* Lipsync models that require a video input (not a static image).
|
|
11
|
+
* When TalkingHead uses one of these, we first generate a short video
|
|
12
|
+
* from the image via an image-to-video model, then lipsync.
|
|
13
|
+
*/
|
|
14
|
+
const VIDEO_ONLY_LIPSYNC_MODELS = new Set([
|
|
15
|
+
"sync-v2",
|
|
16
|
+
"sync-v2-pro",
|
|
17
|
+
"lipsync",
|
|
18
|
+
]);
|
|
19
|
+
|
|
9
20
|
/**
|
|
10
21
|
* Render a TalkingHead element into a video file.
|
|
11
22
|
*
|
|
@@ -13,6 +24,9 @@ import { renderVideo } from "./video";
|
|
|
13
24
|
* 1. Resolve the character image from `image` prop (VargElement or ResolvedElement)
|
|
14
25
|
* 2. Resolve the speech audio from `audio` prop (VargElement or ResolvedElement)
|
|
15
26
|
* 3. Generate a lipsync video via `model` (image + audio → video)
|
|
27
|
+
* - For models that accept images (veed-fabric, omnihuman): pass image directly
|
|
28
|
+
* - For models that require video (sync-v2-pro): first animate image → video,
|
|
29
|
+
* then lipsync the video
|
|
16
30
|
*
|
|
17
31
|
* The result is a video File suitable for use as a VideoLayer.
|
|
18
32
|
*/
|
|
@@ -54,7 +68,51 @@ export async function renderTalkingHead(
|
|
|
54
68
|
const characterImageData = await characterFile.arrayBuffer();
|
|
55
69
|
const speechAudioData = await speechFile.arrayBuffer();
|
|
56
70
|
|
|
57
|
-
//
|
|
71
|
+
// Determine the model ID to check if it requires video input
|
|
72
|
+
const modelId =
|
|
73
|
+
typeof lipsyncModel === "string" ? lipsyncModel : lipsyncModel.modelId;
|
|
74
|
+
const requiresVideo = VIDEO_ONLY_LIPSYNC_MODELS.has(modelId);
|
|
75
|
+
|
|
76
|
+
if (requiresVideo) {
|
|
77
|
+
// Models like sync-v2-pro require a video, not a static image.
|
|
78
|
+
// First animate the image into a short video, then lipsync.
|
|
79
|
+
const animateVideoElement: VargElement<"video"> = {
|
|
80
|
+
type: "video",
|
|
81
|
+
props: {
|
|
82
|
+
prompt: {
|
|
83
|
+
images: [characterImageData],
|
|
84
|
+
text: "person looking at camera, subtle idle movement, breathing, blinking",
|
|
85
|
+
},
|
|
86
|
+
model: ctx.defaults?.video,
|
|
87
|
+
providerOptions: {
|
|
88
|
+
fal: { resolution: props.resolution ?? "720p" },
|
|
89
|
+
},
|
|
90
|
+
},
|
|
91
|
+
children: [],
|
|
92
|
+
};
|
|
93
|
+
|
|
94
|
+
const animatedFile = await renderVideo(animateVideoElement, ctx);
|
|
95
|
+
const animatedVideoData = await animatedFile.arrayBuffer();
|
|
96
|
+
|
|
97
|
+
// Now lipsync the animated video with the speech audio
|
|
98
|
+
const lipsyncElement: VargElement<"video"> = {
|
|
99
|
+
type: "video",
|
|
100
|
+
props: {
|
|
101
|
+
prompt: {
|
|
102
|
+
video: animatedVideoData,
|
|
103
|
+
audio: speechAudioData,
|
|
104
|
+
},
|
|
105
|
+
model: lipsyncModel,
|
|
106
|
+
keepAudio: true,
|
|
107
|
+
providerOptions: { fal: { resolution: props.resolution ?? "720p" } },
|
|
108
|
+
},
|
|
109
|
+
children: [],
|
|
110
|
+
};
|
|
111
|
+
|
|
112
|
+
return renderVideo(lipsyncElement, ctx);
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
// For models that accept images directly (veed-fabric, omnihuman, etc.)
|
|
58
116
|
const videoElement: VargElement<"video"> = {
|
|
59
117
|
type: "video",
|
|
60
118
|
props: {
|
package/src/react/resolve.ts
CHANGED
|
@@ -808,6 +808,9 @@ export async function resolveTalkingHeadElement(
|
|
|
808
808
|
const lipsyncModel = props.lipsyncModel ?? model;
|
|
809
809
|
const generateVideo = getCachedGenerateVideo();
|
|
810
810
|
|
|
811
|
+
// Pass image + audio to the lipsync model. Models like veed-fabric and
|
|
812
|
+
// omnihuman accept images directly. For standalone await TalkingHead(),
|
|
813
|
+
// we don't support the animate-then-lipsync path (use render() for that).
|
|
811
814
|
const { video } = await generateVideo({
|
|
812
815
|
model: lipsyncModel as Parameters<typeof generateVideoRaw>[0]["model"],
|
|
813
816
|
prompt: {
|