vargai 0.4.0-alpha101 → 0.4.0-alpha104

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -104,7 +104,7 @@
104
104
  "license": "Apache-2.0",
105
105
  "author": "varg.ai <hello@varg.ai> (https://varg.ai)",
106
106
  "sideEffects": false,
107
- "version": "0.4.0-alpha101",
107
+ "version": "0.4.0-alpha104",
108
108
  "exports": {
109
109
  ".": "./src/index.ts",
110
110
  "./ai": "./src/ai-sdk/index.ts",
@@ -78,6 +78,12 @@ export {
78
78
  type GoogleProviderSettings,
79
79
  google,
80
80
  } from "./providers/google";
81
+ export {
82
+ createHeyGen,
83
+ type HeyGenProvider,
84
+ type HeyGenProviderSettings,
85
+ heygen,
86
+ } from "./providers/heygen";
81
87
  export {
82
88
  createHiggsfield,
83
89
  type HiggsfieldImageModelSettings,
@@ -174,6 +174,7 @@ const MOTION_CONTROL_MODELS: Record<string, string> = {
174
174
  const LIPSYNC_MODELS: Record<string, string> = {
175
175
  "sync-v2": "fal-ai/sync-lipsync",
176
176
  "sync-v2-pro": "fal-ai/sync-lipsync/v2",
177
+ "sync-v3": "fal-ai/sync-lipsync/v3",
177
178
  lipsync: "fal-ai/sync-lipsync",
178
179
  "omnihuman-v1.5": "fal-ai/bytedance/omnihuman/v1.5",
179
180
  "veed-fabric-1.0": "veed/fabric-1.0",
@@ -0,0 +1,436 @@
1
+ /**
2
+ * HeyGen AI SDK provider for avatar video generation.
3
+ *
4
+ * Exposes heygen.videoModel("avatar-iv") for use in JSX composition:
5
+ *
6
+ * import { heygen } from "vargai/ai-sdk";
7
+ *
8
+ * const talking = Video({
9
+ * prompt: { text: "Hello world", images: [portrait] },
10
+ * model: heygen.videoModel("avatar-iv"),
11
+ * providerOptions: {
12
+ * heygen: { voice_id: "abc123", speed: 1.1 }
13
+ * },
14
+ * });
15
+ */
16
+
17
+ import {
18
+ type EmbeddingModelV3,
19
+ type ImageModelV3,
20
+ type LanguageModelV3,
21
+ NoSuchModelError,
22
+ type ProviderV3,
23
+ type SharedV3Warning,
24
+ type SpeechModelV3,
25
+ } from "@ai-sdk/provider";
26
+ import type {
27
+ VideoModelV3,
28
+ VideoModelV3CallOptions,
29
+ VideoModelV3File,
30
+ } from "../video-model";
31
+
32
+ const HEYGEN_API_BASE = "https://api.heygen.com"; // host for the JSON endpoints used in this file: /v2/video/generate and /v1/video_status.get
33
+ const HEYGEN_UPLOAD_BASE = "https://upload.heygen.com"; // host for the binary upload endpoints used in this file: /v1/asset and /v1/talking_photo
34
+
35
+ // ---------------------------------------------------------------------------
36
+ // HeyGen response types
37
+ // ---------------------------------------------------------------------------
38
+
39
+ interface HeyGenVideoStatusData { // shape of the `data` field of the /v1/video_status.get response, as read by pollVideoStatus
40
+ id: string;
41
+ status: string; // lower-cased by the poller and compared against "completed" / "failed"
42
+ video_url?: string; // download URL; the poller throws if it is missing once status is "completed"
43
+ duration?: number; // NOTE(review): presumably seconds; not read anywhere in this file, confirm against the HeyGen docs
44
+ error?: string | null; // failure detail appended to the thrown error when status is "failed"
45
+ }
46
+
47
+ // ---------------------------------------------------------------------------
48
+ // Helpers
49
+ // ---------------------------------------------------------------------------
50
+
51
/**
 * Read the MIME type declared on a file entry.
 *
 * @param file - File taken from the video call options.
 * @returns The file's `mediaType` when that property exists and is truthy,
 *   otherwise `undefined`.
 */
function getMediaType(file: VideoModelV3File): string | undefined {
  const declared = "mediaType" in file ? file.mediaType : undefined;
  return declared ? declared : undefined;
}
55
+
56
/**
 * Materialize a file entry as raw bytes.
 *
 * @param file - File taken from the video call options.
 * @returns The `data` field unchanged when it is already a `Uint8Array`, or
 *   the base64-decoded bytes when it is a string.
 * @throws Error("HeyGen: file has no data") when the file carries no `data`
 *   field of either kind.
 */
async function fileToBytes(file: VideoModelV3File): Promise<Uint8Array> {
  const payload: unknown = "data" in file ? file.data : undefined;

  if (payload instanceof Uint8Array) return payload;
  // String payloads are treated as base64.
  if (typeof payload === "string") return Buffer.from(payload, "base64");

  throw new Error("HeyGen: file has no data");
}
63
+
64
/**
 * POST raw bytes to HeyGen's `/v1/asset` upload endpoint.
 *
 * @param apiKey - Sent as the `X-Api-Key` header.
 * @param data - Request body, sent as-is.
 * @param contentType - Sent as the `Content-Type` header.
 * @returns The `data.id` value from the JSON response.
 * @throws Error when the response is not OK (message carries status and
 *   response text) or when the response contains no `data.id`.
 */
async function uploadAssetToHeyGen(
  apiKey: string,
  data: Uint8Array,
  contentType: string,
): Promise<string> {
  const endpoint = `${HEYGEN_UPLOAD_BASE}/v1/asset`;
  const headers = { "X-Api-Key": apiKey, "Content-Type": contentType };
  const response = await fetch(endpoint, { method: "POST", headers, body: data });

  if (!response.ok) {
    const detail = await response.text();
    throw new Error(`HeyGen asset upload failed (${response.status}): ${detail}`);
  }

  const payload = (await response.json()) as { data?: { id?: string } };
  const uploadedId = payload.data?.id;
  if (uploadedId) return uploadedId;

  throw new Error("HeyGen asset upload returned no asset id");
}
93
+
94
/**
 * POST an image to HeyGen's `/v1/talking_photo` upload endpoint so it can be
 * referenced as a `talking_photo` character.
 *
 * @param apiKey - Sent as the `X-Api-Key` header.
 * @param data - Image bytes, sent as the request body.
 * @param contentType - Sent as the `Content-Type` header.
 * @returns The `data.talking_photo_id` value from the JSON response.
 * @throws Error when the response is not OK (message carries status and
 *   response text) or when no `talking_photo_id` is returned.
 */
async function uploadTalkingPhoto(
  apiKey: string,
  data: Uint8Array,
  contentType: string,
): Promise<string> {
  const endpoint = `${HEYGEN_UPLOAD_BASE}/v1/talking_photo`;
  const headers = { "X-Api-Key": apiKey, "Content-Type": contentType };
  const response = await fetch(endpoint, { method: "POST", headers, body: data });

  if (!response.ok) {
    const detail = await response.text();
    throw new Error(
      `HeyGen talking photo upload failed (${response.status}): ${detail}`,
    );
  }

  const payload = (await response.json()) as {
    data?: { talking_photo_id?: string };
  };
  const photoId = payload.data?.talking_photo_id;
  if (photoId) return photoId;

  throw new Error("HeyGen talking photo upload returned no talking_photo_id");
}
127
+
128
/**
 * Normalize the caller-supplied background option into a scene `background`
 * object.
 *
 * @param bg - A string starting with `#` (treated as a color), any other
 *   non-empty string (treated as an image URL), or an object that is passed
 *   through untouched.
 * @returns The background object, or `undefined` for falsy values and for any
 *   other type.
 */
function buildBackground(bg: unknown): Record<string, unknown> | undefined {
  if (!bg) return undefined;

  switch (typeof bg) {
    case "string":
      return bg.startsWith("#")
        ? { type: "color", value: bg }
        : { type: "image", url: bg, fit: "cover" };
    case "object":
      // Structured backgrounds are forwarded verbatim.
      return bg as Record<string, unknown>;
    default:
      return undefined;
  }
}
141
+
142
/**
 * Wait `ms` milliseconds, rejecting early if `signal` aborts.
 *
 * Rejects with the same "HeyGen: aborted" error the poll loop throws, so an
 * abort during the pause looks identical to one detected between polls.
 */
function waitForNextPoll(ms: number, signal?: AbortSignal): Promise<void> {
  return new Promise<void>((resolve, reject) => {
    if (signal?.aborted) {
      reject(new Error("HeyGen: aborted"));
      return;
    }
    const onAbort = (): void => {
      clearTimeout(timer);
      reject(new Error("HeyGen: aborted"));
    };
    const timer = setTimeout(() => {
      // Drop the listener so a long-lived signal does not accumulate handlers.
      signal?.removeEventListener("abort", onAbort);
      resolve();
    }, ms);
    signal?.addEventListener("abort", onAbort, { once: true });
  });
}

/**
 * Poll HeyGen's `/v1/video_status.get` endpoint until the video is completed
 * or failed.
 *
 * Polls every 5 seconds and gives up after 10 minutes.
 *
 * @param apiKey - Sent as the `X-Api-Key` header.
 * @param videoId - Id returned by the generate call; URL-encoded before being
 *   placed in the query string.
 * @param signal - Optional abort signal; honoured between polls, during the
 *   request itself, and during the pause between polls.
 * @returns The status payload once its status is "completed"; `video_url` is
 *   guaranteed to be present.
 * @throws Error when aborted, when a status request is not OK, when the video
 *   is reported as failed, when a completed video has no `video_url`, or when
 *   the 10 minute budget is exhausted.
 */
async function pollVideoStatus(
  apiKey: string,
  videoId: string,
  signal?: AbortSignal,
): Promise<HeyGenVideoStatusData> {
  const maxWait = 600_000; // 10 minutes
  const pollInterval = 5_000; // 5 seconds
  const start = Date.now();

  // Encode the id so reserved characters cannot alter the query string.
  const statusUrl = `${HEYGEN_API_BASE}/v1/video_status.get?video_id=${encodeURIComponent(videoId)}`;

  while (Date.now() - start < maxWait) {
    if (signal?.aborted) throw new Error("HeyGen: aborted");

    const res = await fetch(statusUrl, {
      headers: {
        "X-Api-Key": apiKey,
        Accept: "application/json",
      },
      signal,
    });

    if (!res.ok) {
      // Include the response body, like the upload and submit errors do;
      // a body that cannot be read must not mask the status code.
      const errorText = await res.text().catch(() => "");
      throw new Error(
        errorText
          ? `HeyGen status check failed (${res.status}): ${errorText}`
          : `HeyGen status check failed (${res.status})`,
      );
    }

    const body = (await res.json()) as {
      data?: HeyGenVideoStatusData;
    };
    // Status casing is normalized before comparison.
    const status = body.data?.status?.toLowerCase();

    if (status === "completed") {
      if (!body.data?.video_url) {
        throw new Error("HeyGen video completed but no video_url in response");
      }
      return body.data;
    }

    if (status === "failed") {
      throw new Error(
        `HeyGen video generation failed: ${body.data?.error ?? "unknown error"}`,
      );
    }

    // Any other status means "still working": pause, but wake up on abort.
    await waitForNextPoll(pollInterval, signal);
  }

  throw new Error(`HeyGen video generation timed out after ${maxWait / 1000}s`);
}
195
+
196
+ // ---------------------------------------------------------------------------
197
+ // Video model
198
+ // ---------------------------------------------------------------------------
199
+
200
/**
 * Video model that renders talking-avatar videos through HeyGen's Studio V2
 * endpoint (`POST /v2/video/generate`).
 *
 * One call resolves a character and a voice, submits a single scene, polls
 * until the render finishes and downloads the resulting video bytes.
 */
class HeyGenVideoModel implements VideoModelV3 {
  readonly specificationVersion = "v3" as const;
  readonly provider = "heygen";
  readonly modelId: string;
  readonly maxVideosPerCall = 1;

  private readonly apiKey: string;

  /**
   * @param modelId - Model identifier echoed back in the response metadata.
   * @param apiKey - HeyGen API key sent as `X-Api-Key` on every request.
   */
  constructor(modelId: string, apiKey: string) {
    this.modelId = modelId;
    this.apiKey = apiKey;
  }

  /**
   * Generate one video.
   *
   * Options read from `providerOptions.heygen`: `avatar_id`,
   * `talking_photo_id`, `voice_id`, `avatar_style`, `talking_style`,
   * `use_avatar_iv_model`, `matting`, `speed`, `emotion`, `background`,
   * `aspect_ratio`, `callback_url`, `title`, `caption`.
   *
   * Character: `avatar_id` wins; otherwise `talking_photo_id`; otherwise the
   * first file whose media type starts with `image/` is uploaded as a talking
   * photo.
   * Voice: the first file whose media type starts with `audio/` is uploaded
   * and used; otherwise the prompt is spoken with `voice_id`.
   *
   * @param options - Call options; `prompt`, `files`, `providerOptions`,
   *   `abortSignal` and `aspectRatio` are consulted.
   * @returns The downloaded video bytes, collected warnings, and response
   *   metadata.
   * @throws Error when an upload, the submit call, polling, or the download
   *   fails, or when HeyGen returns no `video_id`.
   */
  async doGenerate(options: VideoModelV3CallOptions) {
    const { prompt, files, providerOptions, abortSignal } = options;
    const warnings: SharedV3Warning[] = [];

    const heygenOpts = (providerOptions?.heygen ?? {}) as Record<
      string,
      unknown
    >;

    // ---- Resolve character source ----
    const avatarId = heygenOpts.avatar_id as string | undefined;
    const talkingPhotoId = heygenOpts.talking_photo_id as string | undefined;
    const voiceId = heygenOpts.voice_id as string | undefined;

    // If an image file is provided and no avatar/talking_photo specified,
    // upload it as a talking photo for use in Studio V2
    let resolvedTalkingPhotoId = talkingPhotoId;
    if (!avatarId && !talkingPhotoId) {
      const imageFile = files?.find((f) =>
        getMediaType(f)?.startsWith("image/"),
      );
      if (imageFile) {
        const bytes = await fileToBytes(imageFile);
        const contentType = getMediaType(imageFile) ?? "image/jpeg";
        resolvedTalkingPhotoId = await uploadTalkingPhoto(
          this.apiKey,
          bytes,
          contentType,
        );
      }
    }

    // Upload audio file if present (external audio mode)
    let audioAssetId: string | undefined;
    const audioFile = files?.find((f) => getMediaType(f)?.startsWith("audio/"));
    if (audioFile) {
      const audioBytes = await fileToBytes(audioFile);
      const audioContentType = getMediaType(audioFile) ?? "audio/mpeg";
      audioAssetId = await uploadAssetToHeyGen(
        this.apiKey,
        audioBytes,
        audioContentType,
      );
    }

    // NOTE(review): in this case the request below is still submitted with an
    // empty voice object; HeyGen presumably rejects it — consider failing fast.
    if (prompt && voiceId === undefined && !audioFile) {
      warnings.push({
        type: "other",
        message:
          "HeyGen requires voice_id when using script mode. Pass it via providerOptions.heygen.voice_id",
      });
    }

    // ---- Always use Studio V2 (POST /v2/video/generate) ----
    // Works for both pre-registered avatars and uploaded talking photos.

    // Build character object
    const character: Record<string, unknown> = {};
    if (avatarId) {
      character.type = "avatar";
      character.avatar_id = avatarId;
      character.avatar_style =
        (heygenOpts.avatar_style as string | undefined) ?? "normal";
    } else if (resolvedTalkingPhotoId) {
      character.type = "talking_photo";
      character.talking_photo_id = resolvedTalkingPhotoId;
      if (heygenOpts.talking_style)
        character.talking_style = heygenOpts.talking_style;
      if (heygenOpts.use_avatar_iv_model)
        character.use_avatar_iv_model = heygenOpts.use_avatar_iv_model;
      if (heygenOpts.matting) character.matting = heygenOpts.matting;
    }

    // Build voice object (uploaded audio takes precedence over script text)
    const voice: Record<string, unknown> = {};
    if (audioAssetId) {
      voice.type = "audio";
      voice.audio_asset_id = audioAssetId;
    } else if (prompt && voiceId) {
      voice.type = "text";
      voice.input_text = prompt;
      voice.voice_id = voiceId;
      if (heygenOpts.speed) voice.speed = heygenOpts.speed;
      if (heygenOpts.emotion) voice.emotion = heygenOpts.emotion;
    }

    // Build background object
    const background = buildBackground(heygenOpts.background);

    // Build scene
    const scene: Record<string, unknown> = { character, voice };
    if (background) scene.background = background;

    // Aspect ratio → dimension (provider option overrides the call option)
    const aspectRatio =
      (heygenOpts.aspect_ratio as string | undefined) ?? options.aspectRatio;
    if (
      aspectRatio !== undefined &&
      aspectRatio !== "16:9" &&
      aspectRatio !== "9:16"
    ) {
      // Only two layouts are mapped below; tell the caller instead of
      // silently rendering landscape.
      warnings.push({
        type: "other",
        message: `HeyGen: unsupported aspect ratio "${aspectRatio}", falling back to 16:9 (1280x720). Supported values are "16:9" and "9:16"`,
      });
    }
    const dim =
      aspectRatio === "9:16"
        ? { width: 720, height: 1280 }
        : { width: 1280, height: 720 };

    const studioPayload: Record<string, unknown> = {
      video_inputs: [scene],
      dimension: dim,
    };
    if (heygenOpts.callback_url)
      studioPayload.callback_url = heygenOpts.callback_url;
    if (heygenOpts.title) studioPayload.title = heygenOpts.title;
    if (heygenOpts.caption) studioPayload.caption = heygenOpts.caption;

    const submitUrl = `${HEYGEN_API_BASE}/v2/video/generate`;
    const submitBody = JSON.stringify(studioPayload);

    // ---- Submit ----
    const submitRes = await fetch(submitUrl, {
      method: "POST",
      headers: {
        "X-Api-Key": this.apiKey,
        "Content-Type": "application/json",
        Accept: "application/json",
      },
      body: submitBody,
      signal: abortSignal,
    });

    if (!submitRes.ok) {
      const errorText = await submitRes.text();
      throw new Error(
        `HeyGen video generation failed (${submitRes.status}): ${errorText}`,
      );
    }

    // The id is accepted either nested under `data` or at the top level.
    const submitData = (await submitRes.json()) as {
      data?: { video_id?: string };
      video_id?: string;
    };
    const videoId = submitData.data?.video_id ?? submitData.video_id;
    if (!videoId) throw new Error("HeyGen returned no video_id");

    // ---- Poll for completion ----
    const statusData = await pollVideoStatus(this.apiKey, videoId, abortSignal);

    // ---- Download video ----
    // Guard instead of a non-null assertion so a missing URL fails clearly.
    const videoUrl = statusData.video_url;
    if (!videoUrl) {
      throw new Error("HeyGen video completed but no video_url in response");
    }
    const videoRes = await fetch(videoUrl, {
      signal: abortSignal,
    });
    if (!videoRes.ok) {
      throw new Error(`Failed to download HeyGen video (${videoRes.status})`);
    }
    const videoBytes = new Uint8Array(await videoRes.arrayBuffer());

    return {
      videos: [videoBytes],
      warnings,
      response: {
        timestamp: new Date(),
        modelId: this.modelId,
        headers: undefined,
      },
    };
  }
}
374
+
375
+ // ---------------------------------------------------------------------------
376
+ // Provider factory
377
+ // ---------------------------------------------------------------------------
378
+
379
+ export interface HeyGenProviderSettings { // options accepted by createHeyGen
380
+ apiKey?: string; // HeyGen API key; createHeyGen falls back to process.env.HEYGEN_API_KEY when omitted
381
+ }
382
+
383
+ export interface HeyGenProvider extends ProviderV3 { // ProviderV3 plus a video-model factory
384
+ videoModel(modelId?: string): VideoModelV3; // createHeyGen's implementation defaults modelId to "avatar-iv"
385
+ }
386
+
387
/**
 * Build a HeyGen provider.
 *
 * The API key comes from `settings.apiKey`, falling back to the
 * `HEYGEN_API_KEY` environment variable. Only `videoModel` is usable; every
 * other model factory throws `NoSuchModelError`.
 *
 * @param settings - Optional provider settings.
 * @returns A provider whose `videoModel` defaults to the "avatar-iv" model id.
 * @throws Error("HEYGEN_API_KEY not set") when no key can be resolved.
 */
export function createHeyGen(
  settings: HeyGenProviderSettings = {},
): HeyGenProvider {
  const apiKey = settings.apiKey ?? process.env.HEYGEN_API_KEY;
  if (!apiKey) throw new Error("HEYGEN_API_KEY not set");

  // Shared rejection path for the model families this provider does not offer.
  const reject = (
    modelType: "languageModel" | "embeddingModel" | "imageModel" | "speechModel",
    modelId: string,
  ): never => {
    throw new NoSuchModelError({ modelId, modelType });
  };

  const provider: HeyGenProvider = {
    specificationVersion: "v3",
    videoModel: (modelId = "avatar-iv") => new HeyGenVideoModel(modelId, apiKey),
    languageModel: (modelId: string): LanguageModelV3 =>
      reject("languageModel", modelId),
    embeddingModel: (modelId: string): EmbeddingModelV3 =>
      reject("embeddingModel", modelId),
    imageModel: (modelId: string): ImageModelV3 => reject("imageModel", modelId),
    speechModel: (modelId: string): SpeechModelV3 =>
      reject("speechModel", modelId),
  };
  return provider;
}
426
+
427
// Shared provider instance, built on first property access so that importing
// this module does not call createHeyGen() (which throws without an API key).
let sharedHeyGen: HeyGenProvider | undefined;

/**
 * Default HeyGen provider. Every property read is forwarded to a single
 * `createHeyGen()` instance that is created the first time it is needed.
 */
export const heygen = new Proxy({} as HeyGenProvider, {
  get(_target, property) {
    sharedHeyGen = sharedHeyGen ?? createHeyGen();
    return sharedHeyGen[property as keyof HeyGenProvider];
  },
});
@@ -0,0 +1,61 @@
1
+ /**
2
+ * HeyGen avatar video model
3
+ * Generates talking avatar videos from script + voice + image/avatar
4
+ */
5
+
6
+ import { z } from "zod";
7
+ import type { ModelDefinition, ZodSchema } from "../../core/schema/types";
8
+
9
+ const heygenInputSchema = z.object({ // input contract for the "heygen-avatar" model definition
10
+ script: z.string().describe("Script text for the avatar to speak"), // required
11
+ voice_id: z.string().describe("HeyGen voice ID"), // required
12
+ avatar_id: z.string().optional().describe("Pre-registered HeyGen avatar ID"),
13
+ image_url: z
14
+ .string()
15
+ .optional()
16
+ .describe("Image URL to animate (alternative to avatar_id)"),
17
+ motion_prompt: z
18
+ .string()
19
+ .optional()
20
+ .describe("Natural language motion control prompt"),
21
+ expressiveness: z
22
+ .enum(["low", "medium", "high"])
23
+ .optional()
24
+ .default("medium") // value used when the caller omits the field
25
+ .describe("Expressiveness level of the avatar"),
26
+ aspect_ratio: z
27
+ .enum(["16:9", "9:16"])
28
+ .optional()
29
+ .default("16:9") // value used when the caller omits the field
30
+ .describe("Video aspect ratio"),
31
+ resolution: z
32
+ .enum(["720p", "1080p"])
33
+ .optional()
34
+ .default("1080p") // NOTE(review): the HeyGen provider in this package always requests 1280x720 or 720x1280; confirm where resolution is honoured
35
+ .describe("Video resolution"),
36
+ });
37
+
38
+ const heygenOutputSchema = z.object({ // output contract for the "heygen-avatar" model definition
39
+ videoUrl: z.string(), // required
40
+ duration: z.number().optional(), // NOTE(review): presumably seconds; unit is not established in this file
41
+ });
42
+
43
+ const schema: ZodSchema<typeof heygenInputSchema, typeof heygenOutputSchema> = { // pairs the input and output schemas for the definition below
44
+ input: heygenInputSchema,
45
+ output: heygenOutputSchema,
46
+ };
47
+
48
+ export const definition: ModelDefinition<typeof schema> = { // model definition for the HeyGen avatar model; the models index adds it to allModels
49
+ type: "model",
50
+ name: "heygen-avatar",
51
+ description:
52
+ "HeyGen Avatar IV model for generating talking avatar videos from script and voice",
53
+ providers: ["heygen"], // only one provider is listed, and it is also the default
54
+ defaultProvider: "heygen",
55
+ providerModels: {
56
+ heygen: "avatar-iv", // same id the HeyGen provider's videoModel() defaults to
57
+ },
58
+ schema,
59
+ };
60
+
61
+ export default definition;
@@ -4,6 +4,7 @@
4
4
 
5
5
  export { definition as elevenlabsTts } from "./elevenlabs";
6
6
  export { definition as flux } from "./flux";
7
+ export { definition as heygenAvatar } from "./heygen";
7
8
  export { definition as kling } from "./kling";
8
9
  export { definition as llama } from "./llama";
9
10
  export { definition as ltxA2v } from "./ltx-a2v";
@@ -31,6 +32,7 @@ export { definition as whisper } from "./whisper";
31
32
  // All model definitions for auto-loading
32
33
  import { definition as elevenlabsDefinition } from "./elevenlabs";
33
34
  import { definition as fluxDefinition } from "./flux";
35
+ import { definition as heygenAvatarDefinition } from "./heygen";
34
36
  import { definition as klingDefinition } from "./kling";
35
37
  import { definition as llamaDefinition } from "./llama";
36
38
  import { definition as ltxA2vDefinition } from "./ltx-a2v";
@@ -77,4 +79,5 @@ export const allModels = [
77
79
  seedance2FastPreviewDefinition,
78
80
  sonautoDefinition,
79
81
  llamaDefinition,
82
+ heygenAvatarDefinition,
80
83
  ];