vargai 0.4.0-alpha95 → 0.4.0-alpha97

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -104,7 +104,7 @@
104
104
  "license": "Apache-2.0",
105
105
  "author": "varg.ai <hello@varg.ai> (https://varg.ai)",
106
106
  "sideEffects": false,
107
- "version": "0.4.0-alpha95",
107
+ "version": "0.4.0-alpha97",
108
108
  "exports": {
109
109
  ".": "./src/index.ts",
110
110
  "./ai": "./src/ai-sdk/index.ts",
@@ -2,6 +2,7 @@ import {
2
2
  resolveImageElement,
3
3
  resolveMusicElement,
4
4
  resolveSpeechElement,
5
+ resolveTalkingHeadElement,
5
6
  resolveVideoElement,
6
7
  } from "./resolve";
7
8
  import type { ResolvedElement } from "./resolved-element";
@@ -141,11 +142,14 @@ export function Speech(
141
142
 
142
143
  export function TalkingHead(
143
144
  props: TalkingHeadProps,
144
- ): VargElement<"talking-head"> {
145
- return createElement(
145
+ ): VargElement<"talking-head"> & PromiseLike<ResolvedElement<"talking-head">> {
146
+ const element = createElement(
146
147
  "talking-head",
147
148
  props as Record<string, unknown>,
148
- props.children,
149
+ undefined,
150
+ );
151
+ return makeThenable(element, (el) =>
152
+ resolveTalkingHeadElement(el, el.props as unknown as TalkingHeadProps),
149
153
  );
150
154
  }
151
155
 
@@ -26,6 +26,7 @@ import { renderSlider } from "./slider";
26
26
  import { renderSpeech } from "./speech";
27
27
  import { renderSubtitle } from "./subtitle";
28
28
  import { renderSwipe } from "./swipe";
29
+ import { renderTalkingHead } from "./talking-head";
29
30
  import { renderTitle } from "./title";
30
31
  import { resolvePath } from "./utils";
31
32
  import { renderVideo } from "./video";
@@ -151,6 +152,27 @@ async function renderClipLayers(
151
152
  break;
152
153
  }
153
154
 
155
+ case "talking-head": {
156
+ pending.push({
157
+ type: "async",
158
+ promise: renderTalkingHead(
159
+ element as VargElement<"talking-head">,
160
+ ctx,
161
+ )
162
+ .then((file) => ctx.backend.resolvePath(file))
163
+ .then(
164
+ (path) =>
165
+ ({
166
+ type: "video",
167
+ path,
168
+ resizeMode: "cover",
169
+ mixVolume: 1,
170
+ }) as VideoLayer,
171
+ ),
172
+ });
173
+ break;
174
+ }
175
+
154
176
  case "music": {
155
177
  const props = element.props as MusicProps;
156
178
  pending.push({
@@ -0,0 +1,109 @@
1
+ import type { File } from "../../ai-sdk/file";
2
+ import { ResolvedElement } from "../resolved-element";
3
+ import type { TalkingHeadProps, VargElement } from "../types";
4
+ import type { RenderContext } from "./context";
5
+ import { renderImage } from "./image";
6
+ import { renderSpeech } from "./speech";
7
+ import { renderVideo } from "./video";
8
+
9
/**
 * Turn a TalkingHead element into a lipsync video file.
 *
 * Steps:
 * 1. Obtain the character image from the `image` prop (rendered now, or
 *    reused when it is already a ResolvedElement).
 * 2. Obtain the speech audio from the `audio` prop the same way.
 * 3. Feed both into `renderVideo` through a synthetic video element so the
 *    lipsync model produces the final clip (image + audio → video).
 *
 * @param element - The TalkingHead element, or the ResolvedElement produced
 *   by `await TalkingHead(...)`.
 * @param ctx - Render context; provides `defaults.video` as the fallback
 *   model and collects reused files in `generatedFiles`.
 * @returns The video File, suitable for use as a VideoLayer.
 * @throws Error when no model is available, or when the `image` or `audio`
 *   prop is missing.
 */
export async function renderTalkingHead(
  element: VargElement<"talking-head">,
  ctx: RenderContext,
): Promise<File> {
  // Already generated through `await TalkingHead(...)`: register and reuse it.
  if (element instanceof ResolvedElement) {
    const existing = element.meta.file;
    ctx.generatedFiles.push(existing);
    return existing;
  }

  const props = element.props as TalkingHeadProps;
  const { image, audio } = props;

  const baseModel = props.model ?? ctx.defaults?.video;
  if (!baseModel) {
    throw new Error(
      "TalkingHead requires 'model' prop (or set defaults.video in render options)",
    );
  }
  if (!image) {
    throw new Error("TalkingHead requires 'image' prop (an Image element)");
  }
  if (!audio) {
    throw new Error("TalkingHead requires 'audio' prop (a Speech element)");
  }

  // Steps 1 & 2: the image and the speech do not depend on each other, so
  // both are started before either is awaited.
  const imageTask = resolveImageProp(image, ctx);
  const audioTask = resolveAudioProp(audio, ctx);
  const [characterFile, speechFile] = await Promise.all([imageTask, audioTask]);

  // Step 3: lipsync generation. `lipsyncModel` overrides the base model.
  const lipsyncModel = props.lipsyncModel ?? baseModel;
  const videoInput = await characterFile.arrayBuffer();
  const audioInput = await speechFile.arrayBuffer();
  const resolution = props.resolution ?? "720p";

  // Lipsync models (sync-v2-pro, etc.) require `video_url`, not `image_url`,
  // so the character image travels as the `video` input. The fal provider
  // uploads it and sets `video_url` in the API request; fal.ai accepts an
  // image file there and treats it as a single-frame video.
  const lipsyncElement: VargElement<"video"> = {
    type: "video",
    props: {
      prompt: { video: videoInput, audio: audioInput },
      model: lipsyncModel,
      keepAudio: true,
      providerOptions: { fal: { resolution } },
    },
    children: [],
  };

  return renderVideo(lipsyncElement, ctx);
}
78
+
79
/**
 * Produce the File behind an `image` prop.
 *
 * A ResolvedElement already carries its file: that file is recorded in
 * `ctx.generatedFiles` and returned as-is. Any other element is still lazy
 * and is rendered through `renderImage`.
 *
 * @param image - Pre-resolved or lazy image element.
 * @param ctx - Render context that tracks generated files.
 * @returns The image File.
 */
async function resolveImageProp(
  image: VargElement<"image">,
  ctx: RenderContext,
): Promise<File> {
  if (!(image instanceof ResolvedElement)) {
    return renderImage(image, ctx);
  }

  const reused = image.meta.file;
  ctx.generatedFiles.push(reused);
  return reused;
}
94
+
95
/**
 * Produce the File behind an `audio` prop.
 *
 * A ResolvedElement already carries its file: that file is recorded in
 * `ctx.generatedFiles` and returned as-is. Any other element is still lazy
 * and is rendered through `renderSpeech`.
 *
 * @param audio - Pre-resolved or lazy speech element.
 * @param ctx - Render context that tracks generated files.
 * @returns The speech audio File.
 */
async function resolveAudioProp(
  audio: VargElement<"speech">,
  ctx: RenderContext,
): Promise<File> {
  if (!(audio instanceof ResolvedElement)) {
    return renderSpeech(audio, ctx);
  }

  const reused = audio.meta.file;
  ctx.generatedFiles.push(reused);
  return reused;
}
@@ -37,6 +37,7 @@ import type {
37
37
  ImageProps,
38
38
  MusicProps,
39
39
  SpeechProps,
40
+ TalkingHeadProps,
40
41
  VargElement,
41
42
  } from "./types";
42
43
 
@@ -748,3 +749,97 @@ export async function resolveMusicElement(
748
749
  duration,
749
750
  });
750
751
  }
752
+
753
+ // ---------------------------------------------------------------------------
754
+ // TalkingHead
755
+ // ---------------------------------------------------------------------------
756
/**
 * Resolve a TalkingHead element into a lipsync video. This is the path taken
 * by `await TalkingHead(...)`. Returns a ResolvedElement<"talking-head">
 * wrapping the final video.
 *
 * Pipeline:
 * 1. Resolve the image from the `image` prop (reuse a ResolvedElement, or
 *    generate via `resolveImageElement`).
 * 2. Resolve the speech from the `audio` prop (reuse a ResolvedElement, or
 *    generate via `resolveSpeechElement`).
 * 3. Generate the lipsync video from image + audio via `lipsyncModel`,
 *    falling back to `model`.
 *
 * Steps 1 and 2 are independent and run concurrently.
 *
 * @param element - The TalkingHead element being resolved; its children are
 *   only used to build the metadata prompt label.
 * @param props - The element's props. `model`, `image` and `audio` are
 *   required on this path; `resolution` defaults to "720p".
 * @returns A ResolvedElement holding the generated video file and its probed
 *   duration.
 * @throws Error when `model`, `image` or `audio` is missing.
 */
export async function resolveTalkingHeadElement(
  element: VargElement<"talking-head">,
  props: TalkingHeadProps,
): Promise<ResolvedElement<"talking-head">> {
  const model = props.model;
  if (!model) {
    throw new Error(
      "await TalkingHead() requires 'model' prop for lipsync video generation",
    );
  }

  const { image, audio } = props;

  if (!image) {
    throw new Error(
      "await TalkingHead() requires 'image' prop (an Image element).",
    );
  }

  if (!audio) {
    throw new Error(
      "await TalkingHead() requires 'audio' prop (a Speech element).",
    );
  }

  // Steps 1 & 2: a ResolvedElement is used directly; a lazy element is
  // generated. Neither input depends on the other, so both are started
  // before either is awaited.
  const [resolvedImage, resolvedSpeech] = await Promise.all([
    image instanceof ResolvedElement
      ? image
      : resolveImageElement(image, image.props as ImageProps),
    audio instanceof ResolvedElement
      ? audio
      : resolveSpeechElement(audio, audio.props as SpeechProps),
  ]);

  const [characterBuffer, speechBuffer] = await Promise.all([
    resolvedImage.file.arrayBuffer(),
    resolvedSpeech.file.arrayBuffer(),
  ]);
  const characterBytes = new Uint8Array(characterBuffer);
  const speechBytes = new Uint8Array(speechBuffer);

  // Step 3: Generate lipsync video (image + audio → video)
  const lipsyncModel = props.lipsyncModel ?? model;
  const generateVideo = getCachedGenerateVideo();

  // Lipsync models require `video_url`, not `image_url`, so pass the
  // character image as the `video` input (fal accepts images as video input).
  const { video } = await generateVideo({
    model: lipsyncModel as Parameters<typeof generateVideoRaw>[0]["model"],
    prompt: {
      video: characterBytes,
      audio: speechBytes,
    },
    duration: 0, // duration determined by audio length
    // Honor the `resolution` prop and its documented "720p" default, the
    // same way the render-time TalkingHead path forwards it to the provider.
    // NOTE(review): assumes generateVideo accepts `providerOptions` — confirm.
    providerOptions: { fal: { resolution: props.resolution ?? "720p" } },
  });

  const mediaType = video.mimeType ?? "video/mp4";
  const modelId =
    typeof lipsyncModel === "string" ? lipsyncModel : lipsyncModel.modelId;

  // Falls back to a fixed label when the element has no text children.
  const promptLabel =
    getTextContent(element.children) ?? "talking-head lipsync";

  const file = File.fromGenerated({
    uint8Array: video.uint8Array,
    mediaType,
    url: (video as { url?: string }).url,
  }).withMetadata({
    type: "video",
    model: modelId,
    // Label is truncated to 100 characters to keep the metadata compact.
    prompt: `talking-head: ${promptLabel.slice(0, 100)}`,
  });

  const duration = await probeDuration(file);

  return new ResolvedElement(element, {
    file,
    duration,
  });
}
@@ -198,16 +198,20 @@ export interface SpeechProps extends BaseProps, VolumeProps {
198
198
  }
199
199
 
200
200
  export interface TalkingHeadProps extends BaseProps {
201
- character?: string;
202
- src?: string;
203
- voice?: string;
201
+ /** Pre-resolved or lazy image element to use as the character face. */
202
+ image?: VargElement<"image">;
203
+ /** Pre-resolved or lazy speech element to use as the audio track. */
204
+ audio?: VargElement<"speech">;
205
+ /** Lipsync video model (e.g. fal.videoModel("sync-v2-pro")). */
204
206
  model?: VideoModelV3;
207
+ /** Separate lipsync model override (defaults to `model`). */
205
208
  lipsyncModel?: VideoModelV3;
209
+ /** Video resolution for lipsync generation (default: "720p") */
210
+ resolution?: "480p" | "720p" | "1080p";
206
211
  position?:
207
212
  | Position
208
213
  | { left?: string; right?: string; top?: string; bottom?: string };
209
214
  size?: { width: string; height: string };
210
- children?: string;
211
215
  }
212
216
 
213
217
  export interface TitleProps extends BaseProps {