vargai 0.4.0-alpha108 → 0.4.0-alpha111

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/package.json +4 -1
  2. package/src/ai-sdk/generate-video.ts +14 -8
  3. package/src/ai-sdk/providers/editly/backends/types.ts +4 -0
  4. package/src/ai-sdk/providers/editly/layers.ts +39 -19
  5. package/src/ai-sdk/providers/editly/rendi/index.ts +214 -3
  6. package/src/ai-sdk/providers/fal.ts +133 -2
  7. package/src/ai-sdk/providers/model-rules.ts +18 -0
  8. package/src/ai-sdk/providers/varg.ts +7 -4
  9. package/src/core/registry/resolver.ts +4 -0
  10. package/src/core/schema/types.ts +65 -0
  11. package/src/definitions/actions/video.ts +24 -4
  12. package/src/definitions/models/elevenlabs.ts +14 -1
  13. package/src/definitions/models/flux.ts +17 -1
  14. package/src/definitions/models/heygen.ts +20 -1
  15. package/src/definitions/models/index.ts +68 -2
  16. package/src/definitions/models/kling.ts +326 -1
  17. package/src/definitions/models/llama.ts +13 -1
  18. package/src/definitions/models/ltx-a2v.ts +17 -1
  19. package/src/definitions/models/nano-banana-2.ts +23 -1
  20. package/src/definitions/models/nano-banana-pro.ts +17 -1
  21. package/src/definitions/models/omnihuman.ts +13 -1
  22. package/src/definitions/models/phota.ts +29 -1
  23. package/src/definitions/models/qwen-image-2.ts +14 -1
  24. package/src/definitions/models/recraft-v4.ts +13 -1
  25. package/src/definitions/models/reve.ts +13 -1
  26. package/src/definitions/models/seedance.ts +21 -1
  27. package/src/definitions/models/sonauto.ts +13 -1
  28. package/src/definitions/models/soul.ts +13 -1
  29. package/src/definitions/models/veed-fabric.ts +16 -1
  30. package/src/definitions/models/wan.ts +23 -3
  31. package/src/definitions/models/whisper.ts +25 -1
  32. package/src/providers/fal.ts +97 -0
  33. package/src/react/elements.ts +81 -0
  34. package/src/react/index.ts +8 -0
  35. package/src/react/renderers/burn-captions.ts +83 -19
  36. package/src/react/renderers/captions.ts +292 -25
  37. package/src/react/renderers/emoji.ts +256 -0
  38. package/src/react/renderers/fonts.ts +509 -0
  39. package/src/react/renderers/progress.ts +12 -1
  40. package/src/react/renderers/render.ts +83 -4
  41. package/src/react/renderers/video.ts +24 -3
  42. package/src/react/resolve.ts +295 -4
  43. package/src/react/resolved-element.ts +13 -6
  44. package/src/react/types.ts +87 -4
  45. package/src/speech/map-segments.ts +2 -1
  46. package/src/speech/parse-alignment.ts +111 -6
  47. package/src/speech/word-segmenter.ts +172 -0
package/package.json CHANGED
@@ -28,6 +28,7 @@
28
28
  "@commitlint/config-conventional": "^20.0.0",
29
29
  "@size-limit/preset-small-lib": "^11.2.0",
30
30
  "@types/bun": "latest",
31
+ "@types/opentype.js": "^1.3.9",
31
32
  "@types/react": "^19.2.7",
32
33
  "husky": "^9.1.7",
33
34
  "lint-staged": "^16.2.7"
@@ -58,9 +59,11 @@
58
59
  "ai": "^6.0.26",
59
60
  "apify-client": "^2.20.0",
60
61
  "citty": "^0.1.6",
62
+ "fflate": "^0.8.2",
61
63
  "fluent-ffmpeg": "^2.1.3",
62
64
  "groq-sdk": "^0.36.0",
63
65
  "ink": "^6.5.1",
66
+ "opentype.js": "^1.3.4",
64
67
  "p-limit": "^6.2.0",
65
68
  "p-map": "^7.0.4",
66
69
  "react": "^19.2.0",
@@ -104,7 +107,7 @@
104
107
  "license": "Apache-2.0",
105
108
  "author": "varg.ai <hello@varg.ai> (https://varg.ai)",
106
109
  "sideEffects": false,
107
- "version": "0.4.0-alpha108",
110
+ "version": "0.4.0-alpha111",
108
111
  "exports": {
109
112
  ".": "./src/index.ts",
110
113
  "./ai": "./src/ai-sdk/index.ts",
@@ -10,9 +10,9 @@ export type GenerateVideoPrompt =
10
10
  | string
11
11
  | {
12
12
  text?: string;
13
- images?: Array<DataContent>;
14
- audio?: DataContent;
15
- video?: DataContent;
13
+ images?: DataContent | Array<DataContent>;
14
+ audio?: DataContent | Array<DataContent>;
15
+ video?: DataContent | Array<DataContent>;
16
16
  };
17
17
 
18
18
  export interface GenerateVideoOptions {
@@ -76,6 +76,12 @@ function toUint8Array(data: DataContent): Uint8Array {
76
76
  return data;
77
77
  }
78
78
 
79
+ /** Normalize singular or array to array */
80
+ function toArray<T>(value: T | T[] | undefined): T[] {
81
+ if (value == null) return [];
82
+ return Array.isArray(value) ? value : [value];
83
+ }
84
+
79
85
  function normalizePrompt(prompt: GenerateVideoPrompt): {
80
86
  prompt: string | undefined;
81
87
  files: ImageModelV3File[] | undefined;
@@ -86,7 +92,7 @@ function normalizePrompt(prompt: GenerateVideoPrompt): {
86
92
 
87
93
  const files: ImageModelV3File[] = [];
88
94
 
89
- for (const img of prompt.images ?? []) {
95
+ for (const img of toArray(prompt.images)) {
90
96
  files.push({
91
97
  type: "file",
92
98
  mediaType: "image/png",
@@ -94,19 +100,19 @@ function normalizePrompt(prompt: GenerateVideoPrompt): {
94
100
  });
95
101
  }
96
102
 
97
- if (prompt.audio) {
103
+ for (const aud of toArray(prompt.audio)) {
98
104
  files.push({
99
105
  type: "file",
100
106
  mediaType: "audio/mpeg",
101
- data: toUint8Array(prompt.audio),
107
+ data: toUint8Array(aud),
102
108
  });
103
109
  }
104
110
 
105
- if (prompt.video) {
111
+ for (const vid of toArray(prompt.video)) {
106
112
  files.push({
107
113
  type: "file",
108
114
  mediaType: "video/mp4",
109
- data: toUint8Array(prompt.video),
115
+ data: toUint8Array(vid),
110
116
  });
111
117
  }
112
118
 
@@ -47,6 +47,10 @@ export interface FFmpegRunOptions {
47
47
  verbose?: boolean;
48
48
  /** Max execution time in seconds (used by cloud backends like Rendi, ignored by local) */
49
49
  timeoutSeconds?: number;
50
+ /** Extra files (e.g. fonts) to include alongside inputs.
51
+ * When present, cloud backends like Rendi use compressed folder mode
52
+ * (input_compressed_folder) to bundle all files together. */
53
+ auxiliaryFiles?: { url: string; fileName: string }[];
50
54
  }
51
55
 
52
56
  export type FFmpegOutput =
@@ -43,12 +43,20 @@ function getCropPositionExpr(position: CropPosition | undefined): {
43
43
  }
44
44
 
45
45
  function escapeDrawText(text: string): string {
46
- return text
47
- .replace(/\\/g, "\\\\")
48
- .replace(/'/g, "'\\''")
49
- .replace(/:/g, "\\:")
50
- .replace(/\[/g, "\\[")
51
- .replace(/\]/g, "\\]");
46
+ return (
47
+ text
48
+ .replace(/\\/g, "\\\\")
49
+ .replace(/'/g, "'\\''")
50
+ .replace(/:/g, "\\:")
51
+ .replace(/\[/g, "\\[")
52
+ .replace(/\]/g, "\\]")
53
+ // Replace straight double quotes with typographic curly quotes.
54
+ // Straight " breaks Rendi's command parser (the -filter_complex value is
55
+ // wrapped in double quotes, so an unescaped " inside it terminates the
56
+ // argument and causes ffmpeg to interpret the next word as a file path).
57
+ .replace(/\u201C|\u201D/g, "\u201C") // normalise any existing curly quotes
58
+ .replace(/"/g, "\u201C")
59
+ );
52
60
  }
53
61
 
54
62
  function parseSize(val: number | string | undefined, base: number): number {
@@ -161,11 +169,15 @@ export function getVideoFilter(
161
169
  };
162
170
  }
163
171
 
164
- let scaleFilter = `scale=${width}:${height}:force_original_aspect_ratio=decrease`;
165
- if (layer.resizeMode === "cover") {
166
- scaleFilter = `scale=${width}:${height}:force_original_aspect_ratio=increase,crop=${width}:${height}`;
172
+ let scaleFilter: string;
173
+ if (layer.resizeMode === "contain") {
174
+ scaleFilter = `scale=${width}:${height}:force_original_aspect_ratio=decrease`;
167
175
  } else if (layer.resizeMode === "stretch") {
168
176
  scaleFilter = `scale=${width}:${height}`;
177
+ } else {
178
+ // Default ("cover" or undefined): scale up to fill canvas, crop excess
179
+ const { x, y } = getCropPositionExpr(layer.cropPosition);
180
+ scaleFilter = `scale=${width}:${height}:force_original_aspect_ratio=increase,crop=${width}:${height}:${x}:${y}`;
169
181
  }
170
182
 
171
183
  filters.push(scaleFilter);
@@ -219,11 +231,15 @@ export function getVideoFilterWithTrim(
219
231
  filters.push("fps=30");
220
232
  filters.push("settb=1/30");
221
233
  } else {
222
- let scaleFilter = `scale=${width}:${height}:force_original_aspect_ratio=decrease`;
223
- if (layer.resizeMode === "cover") {
224
- scaleFilter = `scale=${width}:${height}:force_original_aspect_ratio=increase,crop=${width}:${height}`;
234
+ let scaleFilter: string;
235
+ if (layer.resizeMode === "contain") {
236
+ scaleFilter = `scale=${width}:${height}:force_original_aspect_ratio=decrease`;
225
237
  } else if (layer.resizeMode === "stretch") {
226
238
  scaleFilter = `scale=${width}:${height}`;
239
+ } else {
240
+ // Default ("cover" or undefined): scale up to fill canvas, crop excess
241
+ const { x, y } = getCropPositionExpr(layer.cropPosition);
242
+ scaleFilter = `scale=${width}:${height}:force_original_aspect_ratio=increase,crop=${width}:${height}:${x}:${y}`;
227
243
  }
228
244
 
229
245
  filters.push(scaleFilter);
@@ -386,11 +402,14 @@ export function getImageFilter(
386
402
  };
387
403
  }
388
404
 
389
- let scaleFilter = `scale=${width}:${height}:force_original_aspect_ratio=decrease`;
390
- if (layer.resizeMode === "cover") {
391
- scaleFilter = `scale=${width}:${height}:force_original_aspect_ratio=increase,crop=${width}:${height}`;
405
+ let scaleFilter: string;
406
+ if (layer.resizeMode === "contain") {
407
+ scaleFilter = `scale=${width}:${height}:force_original_aspect_ratio=decrease`;
392
408
  } else if (layer.resizeMode === "stretch") {
393
409
  scaleFilter = `scale=${width}:${height}`;
410
+ } else {
411
+ // Default ("cover" or undefined): scale up to fill canvas, crop excess
412
+ scaleFilter = `scale=${width}:${height}:force_original_aspect_ratio=increase,crop=${width}:${height}`;
394
413
  }
395
414
  filters.push(scaleFilter);
396
415
  filters.push(`pad=${width}:${height}:(ow-iw)/2:(oh-ih)/2:black`);
@@ -532,13 +551,14 @@ export function getImageOverlayFilter(
532
551
  let scaleExpr: string;
533
552
  if (!hasExplicitHeight) {
534
553
  scaleExpr = `scale=${targetWidth}:-2`;
535
- } else if (layer.resizeMode === "cover") {
536
- const { x, y } = getCropPositionExpr(layer.cropPosition);
537
- scaleExpr = `scale=${targetWidth}:${targetHeight}:force_original_aspect_ratio=increase,crop=${targetWidth}:${targetHeight}:${x}:${y}`;
554
+ } else if (layer.resizeMode === "contain") {
555
+ scaleExpr = `scale=${targetWidth}:${targetHeight}:force_original_aspect_ratio=decrease,pad=${targetWidth}:${targetHeight}:(ow-iw)/2:(oh-ih)/2:black`;
538
556
  } else if (layer.resizeMode === "stretch") {
539
557
  scaleExpr = `scale=${targetWidth}:${targetHeight}`;
540
558
  } else {
541
- scaleExpr = `scale=${targetWidth}:${targetHeight}:force_original_aspect_ratio=decrease,pad=${targetWidth}:${targetHeight}:(ow-iw)/2:(oh-ih)/2:black`;
559
+ // Default ("cover" or undefined): scale up to fill, crop excess
560
+ const { x, y } = getCropPositionExpr(layer.cropPosition);
561
+ scaleExpr = `scale=${targetWidth}:${targetHeight}:force_original_aspect_ratio=increase,crop=${targetWidth}:${targetHeight}:${x}:${y}`;
542
562
  }
543
563
 
544
564
  const zoomDir = layer.zoomDirection ?? null;
@@ -1,3 +1,4 @@
1
+ import { zipSync } from "fflate";
1
2
  import sharp from "sharp";
2
3
  import { File } from "../../../file";
3
4
  import type { StorageProvider } from "../../../storage/types";
@@ -128,6 +129,11 @@ export class RendiBackend implements FFmpegBackend {
128
129
  }
129
130
 
130
131
  async run(options: FFmpegRunOptions): Promise<FFmpegRunResult> {
132
+ // When auxiliary files (e.g. fonts) are present, use compressed folder mode
133
+ if (options.auxiliaryFiles && options.auxiliaryFiles.length > 0) {
134
+ return this.runWithCompressedFolder(options);
135
+ }
136
+
131
137
  let {
132
138
  inputs,
133
139
  filterComplex,
@@ -287,6 +293,194 @@ export class RendiBackend implements FFmpegBackend {
287
293
  throw new Error("Rendi command timed out");
288
294
  }
289
295
 
296
+ /**
297
+ * Run an FFmpeg command using Rendi's input_compressed_folder mode.
298
+ *
299
+ * Used when auxiliary files (e.g. fonts for subtitle rendering) need to be
300
+ * bundled alongside regular inputs. Creates a ZIP containing all input files
301
+ * and auxiliary files, uploads it to storage, and submits to Rendi with
302
+ * `input_compressed_folder` instead of `input_files`.
303
+ *
304
+ * Inside the ZIP, all files are at the root level. The ffmpeg command
305
+ * references files by their bare filenames (not placeholders).
306
+ */
307
+ private async runWithCompressedFolder(
308
+ options: FFmpegRunOptions,
309
+ ): Promise<FFmpegRunResult> {
310
+ const {
311
+ inputs,
312
+ videoFilter,
313
+ filterComplex,
314
+ outputArgs = [],
315
+ outputPath,
316
+ verbose,
317
+ auxiliaryFiles = [],
318
+ } = options;
319
+
320
+ // 1. Resolve all input files to URLs
321
+ const inputEntries: { fileName: string; url: string }[] = [];
322
+ for (const input of inputs ?? []) {
323
+ const path = this.getInputPath(input);
324
+ const url = await this.resolvePath(path);
325
+ // Extract filename from URL or path
326
+ const fileName =
327
+ url.split("/").pop()?.split("?")[0] ?? `input_${inputEntries.length}`;
328
+ inputEntries.push({ fileName, url });
329
+ }
330
+
331
+ // 2. Download all files (inputs + auxiliary) into memory
332
+ const zipContents: Record<string, Uint8Array> = {};
333
+
334
+ const downloadTasks = [
335
+ ...inputEntries.map(async (entry) => {
336
+ const res = await fetch(entry.url);
337
+ if (!res.ok)
338
+ throw new Error(
339
+ `Failed to download input ${entry.fileName}: ${res.status}`,
340
+ );
341
+ zipContents[entry.fileName] = new Uint8Array(await res.arrayBuffer());
342
+ }),
343
+ ...auxiliaryFiles.map(async (file) => {
344
+ const res = await fetch(file.url);
345
+ if (!res.ok)
346
+ throw new Error(
347
+ `Failed to download auxiliary file ${file.fileName}: ${res.status}`,
348
+ );
349
+ zipContents[file.fileName] = new Uint8Array(await res.arrayBuffer());
350
+ }),
351
+ ];
352
+
353
+ await Promise.all(downloadTasks);
354
+
355
+ if (verbose) {
356
+ const totalSize = Object.values(zipContents).reduce(
357
+ (sum, buf) => sum + buf.length,
358
+ 0,
359
+ );
360
+ console.log(
361
+ `[rendi] creating ZIP with ${Object.keys(zipContents).length} files (${(totalSize / 1024 / 1024).toFixed(1)} MB)`,
362
+ );
363
+ }
364
+
365
+ // 3. Create ZIP
366
+ const zipData = zipSync(zipContents, { level: 1 }); // fast compression
367
+
368
+ // 4. Upload ZIP to storage
369
+ const zipKey = `internal/rendi-compressed-${Date.now()}.zip`;
370
+ const zipUrl = await this.storage.upload(
371
+ zipData,
372
+ zipKey,
373
+ "application/zip",
374
+ );
375
+
376
+ if (verbose) {
377
+ console.log(
378
+ `[rendi] uploaded ZIP (${(zipData.length / 1024 / 1024).toFixed(1)} MB) -> ${zipUrl}`,
379
+ );
380
+ }
381
+
382
+ // 5. Build ffmpeg command using bare filenames (not {{in_X}} placeholders)
383
+ const inputArgs: string[] = [];
384
+ for (const [i, input] of (inputs ?? []).entries()) {
385
+ if (typeof input !== "string" && "options" in input && input.options) {
386
+ inputArgs.push(...input.options);
387
+ }
388
+ inputArgs.push("-i", inputEntries[i]!.fileName);
389
+ }
390
+
391
+ const filterArgs: string[] = [];
392
+ if (filterComplex) {
393
+ filterArgs.push("-filter_complex", filterComplex);
394
+ }
395
+ if (videoFilter) {
396
+ // For compressed folder mode, the video filter references files by
397
+ // their bare filenames (already resolved in the working directory)
398
+ filterArgs.push("-vf", videoFilter);
399
+ }
400
+
401
+ const processedOutputArgs = outputArgs.filter((arg) => arg !== "-y");
402
+
403
+ const commandParts = [
404
+ ...inputArgs,
405
+ ...filterArgs,
406
+ ...processedOutputArgs,
407
+ "{{out_1}}",
408
+ ];
409
+ const ffmpegCommand = this.buildCommandString(commandParts);
410
+ const outputFilename = outputPath?.split("/").pop() ?? "output.mp4";
411
+
412
+ if (verbose) {
413
+ console.log("[rendi] input_compressed_folder:", zipUrl);
414
+ console.log("[rendi] ffmpeg_command:", ffmpegCommand);
415
+ }
416
+
417
+ // 6. Submit to Rendi with input_compressed_folder
418
+ const submitResponse = await fetch(`${RENDI_API_BASE}/run-ffmpeg-command`, {
419
+ method: "POST",
420
+ headers: {
421
+ "X-API-KEY": this.apiKey,
422
+ "Content-Type": "application/json",
423
+ },
424
+ body: JSON.stringify({
425
+ input_compressed_folder: zipUrl,
426
+ output_files: { out_1: outputFilename },
427
+ ffmpeg_command: ffmpegCommand,
428
+ max_command_run_seconds:
429
+ options.timeoutSeconds ?? this.maxCommandRunSeconds,
430
+ }),
431
+ });
432
+
433
+ if (!submitResponse.ok) {
434
+ const errorText = await submitResponse.text();
435
+ throw new Error(
436
+ `Rendi submit failed: ${submitResponse.status} - ${errorText}`,
437
+ );
438
+ }
439
+
440
+ const { command_id } =
441
+ (await submitResponse.json()) as RendiCommandResponse;
442
+
443
+ if (verbose) {
444
+ console.log("[rendi] command_id:", command_id);
445
+ }
446
+
447
+ // 7. Poll for completion (same as standard run)
448
+ let attempts = 0;
449
+ while (attempts < MAX_POLL_ATTEMPTS) {
450
+ const statusResponse = await fetch(
451
+ `${RENDI_API_BASE}/commands/${command_id}`,
452
+ {
453
+ headers: { "X-API-KEY": this.apiKey },
454
+ },
455
+ );
456
+
457
+ if (!statusResponse.ok) {
458
+ throw new Error(`Rendi poll failed: ${statusResponse.status}`);
459
+ }
460
+
461
+ const status = (await statusResponse.json()) as RendiStatusResponse;
462
+
463
+ if (status.status === "SUCCESS") {
464
+ const outputFile = status.output_files?.out_1;
465
+ if (!outputFile?.storage_url) {
466
+ throw new Error("Rendi completed but no output URL");
467
+ }
468
+ return { output: { type: "url", url: outputFile.storage_url } };
469
+ }
470
+
471
+ if (status.status === "FAILED") {
472
+ throw new Error(
473
+ `Rendi command failed: ${status.error_message ?? "unknown error"}`,
474
+ );
475
+ }
476
+
477
+ await this.sleep(POLL_INTERVAL_MS);
478
+ attempts++;
479
+ }
480
+
481
+ throw new Error("Rendi command timed out");
482
+ }
483
+
290
484
  async resolvePath(input: FilePath): Promise<string> {
291
485
  if (input instanceof File) {
292
486
  return input.upload(this.storage);
@@ -326,13 +520,30 @@ export class RendiBackend implements FFmpegBackend {
326
520
  private buildCommandString(args: string[]): string {
327
521
  return args
328
522
  .map((arg) => {
523
+ // Flags (e.g. -i, -filter_complex) and output placeholders pass through
329
524
  if (arg.startsWith("-") || arg.startsWith("{{")) {
330
525
  return arg;
331
526
  }
332
- if (arg.includes(" ") || arg.includes(":") || arg.includes("'")) {
333
- return `"${arg.replace(/"/g, '\\"')}"`;
527
+ // For values that need quoting (spaces, colons, single-quotes etc.):
528
+ // Rendi's server-side parser splits the command string like a POSIX
529
+ // shell. The old approach wrapped values in "..." and tried to escape
530
+ // inner " with \", but Rendi's parser does NOT reliably honour \"
531
+ // inside double-quoted strings — any literal " in user text (e.g.
532
+ // drawtext titles) would terminate the quoted arg and cause the next
533
+ // word to be treated as an output path.
534
+ //
535
+ // Defence-in-depth: replace any surviving straight " with the
536
+ // typographic curly-quote equivalent (the primary escaping happens in
537
+ // escapeDrawText, but filter strings can also come from other sources).
538
+ const sanitised = arg.replace(/"/g, "\u201C");
539
+ if (
540
+ sanitised.includes(" ") ||
541
+ sanitised.includes(":") ||
542
+ sanitised.includes("'")
543
+ ) {
544
+ return `"${sanitised}"`;
334
545
  }
335
- return arg;
546
+ return sanitised;
336
547
  })
337
548
  .join(" ");
338
549
  }
@@ -107,6 +107,11 @@ const VIDEO_MODELS: Record<string, { t2v: string; i2v: string }> = {
107
107
  t2v: "fal-ai/kling-video/o3/standard/text-to-video",
108
108
  i2v: "fal-ai/kling-video/o3/standard/image-to-video",
109
109
  },
110
+ // Kling O3 4K - native 4K output (i2v only, t2v falls back to pro)
111
+ "kling-v3-4k-image-to-video": {
112
+ t2v: "fal-ai/kling-video/o3/pro/text-to-video",
113
+ i2v: "fal-ai/kling-video/o3/4k/image-to-video",
114
+ },
110
115
  // Kling v2.6 - with native audio generation
111
116
  "kling-v2.6": {
112
117
  t2v: "fal-ai/kling-video/v2.6/pro/text-to-video",
@@ -163,8 +168,25 @@ const VIDEO_EDIT_MODELS: Record<string, string> = {
163
168
  "sora-2-remix": "fal-ai/sora-2/video-to-video/remix",
164
169
  };
165
170
 
171
+ // Reference-to-video models - images/elements + prompt → video with character consistency
172
+ const REFERENCE_VIDEO_MODELS: Record<string, string> = {
173
+ "kling-v3-pro-reference-to-video":
174
+ "fal-ai/kling-video/o3/pro/reference-to-video",
175
+ "kling-v3-4k-reference-to-video":
176
+ "fal-ai/kling-video/o3/4k/reference-to-video",
177
+ };
178
+
179
+ // Video-to-video reference models - reference video + prompt → new video preserving motion/camera
180
+ const V2V_REFERENCE_MODELS: Record<string, string> = {
181
+ "kling-v3-standard-v2v-reference":
182
+ "fal-ai/kling-video/o3/standard/video-to-video/reference",
183
+ };
184
+
166
185
  // Motion control models - video-to-video with motion transfer
167
186
  const MOTION_CONTROL_MODELS: Record<string, string> = {
187
+ "kling-v3-pro-motion-control": "fal-ai/kling-video/v3/pro/motion-control",
188
+ "kling-v3-standard-motion-control":
189
+ "fal-ai/kling-video/v3/standard/motion-control",
168
190
  "kling-v2.6-motion": "fal-ai/kling-video/v2.6/pro/motion-control",
169
191
  "kling-v2.6-motion-standard":
170
192
  "fal-ai/kling-video/v2.6/standard/motion-control",
@@ -520,8 +542,12 @@ class FalVideoModel implements VideoModelV3 {
520
542
  const isMotionControl = MOTION_CONTROL_MODELS[this.modelId] !== undefined;
521
543
  const isVideoEdit = VIDEO_EDIT_MODELS[this.modelId] !== undefined;
522
544
  const isVideoUpscale = VIDEO_UPSCALE_MODELS[this.modelId] !== undefined;
545
+ const isReferenceVideo = REFERENCE_VIDEO_MODELS[this.modelId] !== undefined;
546
+ const isV2VReference = V2V_REFERENCE_MODELS[this.modelId] !== undefined;
523
547
  const isKlingV3 =
524
- this.modelId === "kling-v3" || this.modelId === "kling-v3-standard";
548
+ this.modelId === "kling-v3" ||
549
+ this.modelId === "kling-v3-standard" ||
550
+ this.modelId === "kling-v3-4k-image-to-video";
525
551
  const isKlingV26 = this.modelId === "kling-v2.6";
526
552
  const isLtx2 = this.modelId === "ltx-2-19b-distilled";
527
553
  const isGrokImagine = this.modelId === "grok-imagine";
@@ -537,7 +563,11 @@ class FalVideoModel implements VideoModelV3 {
537
563
  ? this.resolveVideoEditEndpoint()
538
564
  : isVideoUpscale
539
565
  ? this.resolveVideoUpscaleEndpoint()
540
- : this.resolveEndpoint(hasImageInput ?? false);
566
+ : isReferenceVideo
567
+ ? this.resolveReferenceVideoEndpoint()
568
+ : isV2VReference
569
+ ? this.resolveV2VReferenceEndpoint()
570
+ : this.resolveEndpoint(hasImageInput ?? false);
541
571
 
542
572
  const input: Record<string, unknown> = {
543
573
  ...(providerOptions?.fal ?? {}),
@@ -600,6 +630,11 @@ class FalVideoModel implements VideoModelV3 {
600
630
  if (input.keep_original_sound === undefined) {
601
631
  input.keep_original_sound = true;
602
632
  }
633
+
634
+ // Pass aspect ratio so the provider returns the correct output dimensions
635
+ if (aspectRatio && !input.aspect_ratio) {
636
+ input.aspect_ratio = aspectRatio;
637
+ }
603
638
  } else if (isVideoEdit) {
604
639
  // Video edit: video input + prompt for editing instruction
605
640
  input.prompt = prompt;
@@ -625,6 +660,86 @@ class FalVideoModel implements VideoModelV3 {
625
660
  if (videoFile) {
626
661
  input.video_url = await fileToUrl(videoFile);
627
662
  }
663
+ } else if (isReferenceVideo) {
664
+ // Reference-to-video: prompt + optional start/end images + reference images
665
+ // Elements and multi_prompt are passed via providerOptions.fal
666
+ if (prompt) {
667
+ input.prompt = prompt;
668
+ }
669
+
670
+ if (files) {
671
+ const imageFiles = files.filter((f) =>
672
+ getMediaType(f)?.startsWith("image/"),
673
+ );
674
+ // First image → start_image_url, second → end_image_url
675
+ if (imageFiles[0]) {
676
+ input.start_image_url = await fileToUrl(imageFiles[0]);
677
+ }
678
+ if (imageFiles[1]) {
679
+ input.end_image_url = await fileToUrl(imageFiles[1]);
680
+ }
681
+ // Additional images (3+) → image_urls for style/appearance reference
682
+ if (imageFiles.length > 2) {
683
+ const additionalUrls: string[] = [];
684
+ for (let i = 2; i < imageFiles.length; i++) {
685
+ additionalUrls.push(await fileToUrl(imageFiles[i]!));
686
+ }
687
+ input.image_urls = additionalUrls;
688
+ }
689
+ }
690
+
691
+ // Duration as string integer for Kling O3
692
+ const normalized = normalizeProviderInput(this.modelId, { duration });
693
+ input.duration = normalized.duration;
694
+
695
+ if (!input.aspect_ratio) {
696
+ input.aspect_ratio = aspectRatio ?? "16:9";
697
+ }
698
+
699
+ // Default to generating audio
700
+ if (input.generate_audio === undefined) {
701
+ input.generate_audio = true;
702
+ }
703
+ } else if (isV2VReference) {
704
+ // Video-to-video reference: reference video + prompt → new video preserving motion/camera
705
+ // Elements and image_urls are passed via providerOptions.fal
706
+ if (prompt) {
707
+ input.prompt = prompt;
708
+ }
709
+
710
+ const videoFile = files?.find((f) =>
711
+ getMediaType(f)?.startsWith("video/"),
712
+ );
713
+ if (videoFile) {
714
+ input.video_url = await fileToUrl(videoFile);
715
+ }
716
+
717
+ // Reference images from file inputs (for style/appearance)
718
+ if (files) {
719
+ const imageFiles = files.filter((f) =>
720
+ getMediaType(f)?.startsWith("image/"),
721
+ );
722
+ if (imageFiles.length > 0) {
723
+ const imageUrls: string[] = [];
724
+ for (const imgFile of imageFiles) {
725
+ imageUrls.push(await fileToUrl(imgFile));
726
+ }
727
+ input.image_urls = imageUrls;
728
+ }
729
+ }
730
+
731
+ // Duration as string integer for Kling O3
732
+ const normalized = normalizeProviderInput(this.modelId, { duration });
733
+ input.duration = normalized.duration;
734
+
735
+ if (!input.aspect_ratio) {
736
+ input.aspect_ratio = aspectRatio ?? "auto";
737
+ }
738
+
739
+ // Default to keeping original audio from reference video
740
+ if (input.keep_audio === undefined) {
741
+ input.keep_audio = true;
742
+ }
628
743
  } else {
629
744
  // Standard video generation
630
745
  input.prompt = prompt;
@@ -825,6 +940,22 @@ class FalVideoModel implements VideoModelV3 {
825
940
 
826
941
  return VIDEO_UPSCALE_MODELS[this.modelId] ?? this.modelId;
827
942
  }
943
+
944
+ private resolveReferenceVideoEndpoint(): string {
945
+ if (this.modelId.startsWith("raw:")) {
946
+ return this.modelId.slice(4);
947
+ }
948
+
949
+ return REFERENCE_VIDEO_MODELS[this.modelId] ?? this.modelId;
950
+ }
951
+
952
+ private resolveV2VReferenceEndpoint(): string {
953
+ if (this.modelId.startsWith("raw:")) {
954
+ return this.modelId.slice(4);
955
+ }
956
+
957
+ return V2V_REFERENCE_MODELS[this.modelId] ?? this.modelId;
958
+ }
828
959
  }
829
960
 
830
961
  class FalImageModel implements ImageModelV3 {
@@ -70,6 +70,24 @@ const ModelDurationRules: Record<string, z.ZodType> = {
70
70
  "kling-v3": z.object({ duration: stringIntDuration(3, 15, 5) }),
71
71
  "kling-v3-standard": z.object({ duration: stringIntDuration(3, 15, 5) }),
72
72
 
73
+ // Kling O3 4K: same rules as v3
74
+ "kling-v3-4k-image-to-video": z.object({
75
+ duration: stringIntDuration(3, 15, 5),
76
+ }),
77
+
78
+ // Kling O3 reference-to-video: same duration range
79
+ "kling-v3-pro-reference-to-video": z.object({
80
+ duration: stringIntDuration(3, 15, 5),
81
+ }),
82
+ "kling-v3-4k-reference-to-video": z.object({
83
+ duration: stringIntDuration(3, 15, 5),
84
+ }),
85
+
86
+ // Kling O3 video-to-video reference: same duration range
87
+ "kling-v3-standard-v2v-reference": z.object({
88
+ duration: stringIntDuration(3, 15, 5),
89
+ }),
90
+
73
91
  // Kling v2.6: same rules as v3
74
92
  "kling-v2.6": z.object({ duration: stringIntDuration(3, 15, 5) }),
75
93