vargai 0.4.0-alpha77 → 0.4.0-alpha79

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -71,7 +71,7 @@
71
71
  "zod": "^4.2.1"
72
72
  },
73
73
  "sideEffects": false,
74
- "version": "0.4.0-alpha77",
74
+ "version": "0.4.0-alpha79",
75
75
  "exports": {
76
76
  ".": "./src/index.ts",
77
77
  "./ai": "./src/ai-sdk/index.ts",
@@ -419,6 +419,8 @@ interface TimedTextLayer {
419
419
  layer: TextLayer;
420
420
  startTime: number;
421
421
  duration: number;
422
+ transitionInDuration: number;
423
+ transitionOutDuration: number;
422
424
  }
423
425
 
424
426
  function collectTextLayers(clips: ProcessedClip[]): TimedTextLayer[] {
@@ -429,12 +431,19 @@ function collectTextLayers(clips: ProcessedClip[]): TimedTextLayer[] {
429
431
  const clip = clips[i];
430
432
  if (!clip) continue;
431
433
 
434
+ const transitionInDuration =
435
+ i > 0 ? (clips[i - 1]?.transition.duration ?? 0) : 0;
436
+ const transitionOutDuration =
437
+ i < clips.length - 1 ? clip.transition.duration : 0;
438
+
432
439
  for (const layer of clip.layers) {
433
440
  if (layer && isTextOverlayLayer(layer)) {
434
441
  textLayers.push({
435
442
  layer: layer as TextLayer,
436
443
  startTime: currentTime,
437
444
  duration: clip.duration,
445
+ transitionInDuration,
446
+ transitionOutDuration,
438
447
  });
439
448
  }
440
449
  }
@@ -845,13 +854,23 @@ export async function editly(config: EditlyConfig): Promise<EditlyResult> {
845
854
  const timedLayer = textLayers[i];
846
855
  if (!timedLayer) continue;
847
856
 
848
- const { layer, startTime, duration } = timedLayer;
857
+ const {
858
+ layer,
859
+ startTime,
860
+ duration,
861
+ transitionInDuration,
862
+ transitionOutDuration,
863
+ } = timedLayer;
849
864
  const outputLabel = `vwithtext${i}`;
850
865
 
866
+ // Shrink text visibility to avoid overlap during transitions
867
+ const effectiveStart = startTime + transitionInDuration;
868
+ const effectiveStop = startTime + duration - transitionOutDuration;
869
+
851
870
  const timedLayerWithEnable = {
852
871
  ...layer,
853
- start: layer.start ?? startTime,
854
- stop: layer.stop ?? startTime + duration,
872
+ start: layer.start != null ? layer.start + startTime : effectiveStart,
873
+ stop: layer.stop != null ? layer.stop + startTime : effectiveStop,
855
874
  };
856
875
 
857
876
  if (layer.type === "title") {
@@ -145,11 +145,21 @@ const VIDEO_MODELS: Record<string, { t2v: string; i2v: string }> = {
145
145
  t2v: "xai/grok-imagine-video/text-to-video",
146
146
  i2v: "xai/grok-imagine-video/image-to-video",
147
147
  },
148
+ // Sora 2 - OpenAI's video model via fal (t2v + i2v, with audio)
149
+ "sora-2": {
150
+ t2v: "fal-ai/sora-2/text-to-video",
151
+ i2v: "fal-ai/sora-2/image-to-video",
152
+ },
153
+ "sora-2-pro": {
154
+ t2v: "fal-ai/sora-2/text-to-video/pro",
155
+ i2v: "fal-ai/sora-2/image-to-video/pro",
156
+ },
148
157
  };
149
158
 
150
159
  // Video edit models - video-to-video editing
151
160
  const VIDEO_EDIT_MODELS: Record<string, string> = {
152
161
  "grok-imagine-edit": "xai/grok-imagine-video/edit-video",
162
+ "sora-2-remix": "fal-ai/sora-2/video-to-video/remix",
153
163
  };
154
164
 
155
165
  // Motion control models - video-to-video with motion transfer
@@ -183,6 +193,9 @@ const IMAGE_MODELS: Record<string, string> = {
183
193
  "qwen-image-2/edit": "fal-ai/qwen-image-2/edit",
184
194
  "qwen-image-2-pro": "fal-ai/qwen-image-2/pro/text-to-image",
185
195
  "qwen-image-2-pro/edit": "fal-ai/qwen-image-2/pro/edit",
196
+ // Grok Imagine Image - xAI text-to-image and image editing
197
+ "grok-imagine-image": "xai/grok-imagine-image",
198
+ "grok-imagine-image/edit": "xai/grok-imagine-image/edit",
186
199
  // Qwen Image Edit 2511 Multiple Angles - camera angle adjustment
187
200
  "qwen-angles": "fal-ai/qwen-image-edit-2511-multiple-angles",
188
201
  // Recraft V4 Pro - text-to-image
@@ -479,6 +492,7 @@ class FalVideoModel implements VideoModelV3 {
479
492
  const isKlingV26 = this.modelId === "kling-v2.6";
480
493
  const isLtx2 = this.modelId === "ltx-2-19b-distilled";
481
494
  const isGrokImagine = this.modelId === "grok-imagine";
495
+ const isSora2 = this.modelId === "sora-2" || this.modelId === "sora-2-pro";
482
496
 
483
497
  const fileHashes = await computeFileHashes(files as ImageModelV3File[]);
484
498
 
@@ -591,6 +605,23 @@ class FalVideoModel implements VideoModelV3 {
591
605
  if (!input.resolution) {
592
606
  input.resolution = "720p";
593
607
  }
608
+ } else if (isSora2) {
609
+ // Sora 2: only supports 4, 8, 12, 16, 20 second durations
610
+ const allowedDurations = [4, 8, 12, 16, 20];
611
+ const d = duration ?? 4;
612
+ if (!allowedDurations.includes(d)) {
613
+ warnings.push({
614
+ type: "other",
615
+ message: `Sora 2 only supports durations: ${allowedDurations.join(", ")}s. Got ${d}s, defaulting to 4s.`,
616
+ });
617
+ input.duration = 4;
618
+ } else {
619
+ input.duration = d;
620
+ }
621
+ // Disable video deletion so generated video URLs remain accessible
622
+ if (input.delete_video === undefined) {
623
+ input.delete_video = false;
624
+ }
594
625
  } else {
595
626
  input.duration = duration ?? 5;
596
627
  }
@@ -1,5 +1,6 @@
1
1
  import { existsSync, statSync } from "node:fs";
2
2
  import { resolve } from "node:path";
3
+ import { ResolvedElement } from "../resolved-element";
3
4
  import type { VargElement, VargNode } from "../types";
4
5
 
5
6
  export function resolvePath(path: string): string {
@@ -84,11 +85,22 @@ function serializeValue(v: unknown): string {
84
85
  }
85
86
  return v;
86
87
  }
88
+ // Never put raw binary data in cache keys — use semantic identity instead.
89
+ // Audio segments can be 48-110KB; base64-encoding them would exceed
90
+ // Upstash Redis' 32KB key size limit.
87
91
  if (v instanceof Uint8Array) {
88
- // Hash binary data instead of base64-encoding to keep cache keys small.
89
- // Raw base64 can produce 65-110KB strings for audio segments, exceeding
90
- // Upstash Redis' 32KB key size limit.
91
- return `uint8:${v.byteLength}:${Bun.hash(v).toString(16)}`;
92
+ return `uint8:${v.byteLength}`;
93
+ }
94
+ // ResolvedElement (e.g. a speech segment used as Video audio input):
95
+ // serialize by content identity (type + text + duration), not binary data.
96
+ if (v instanceof ResolvedElement) {
97
+ const parts = [v.type];
98
+ for (const child of v.children) {
99
+ if (typeof child === "string") parts.push(child);
100
+ }
101
+ if (v.meta.duration) parts.push(String(v.meta.duration));
102
+ if (v.meta.file?.url) parts.push(v.meta.file.url);
103
+ return `resolved(${parts.join(",")})`;
92
104
  }
93
105
  if (isVargElement(v)) {
94
106
  return `element:${computeCacheKey(v).join(":")}`;
@@ -97,6 +109,12 @@ function serializeValue(v: unknown): string {
97
109
  return `[${v.map(serializeValue).join(",")}]`;
98
110
  }
99
111
  if (v && typeof v === "object") {
112
+ // Skip File-like objects with binary data — use URL if available
113
+ if ("_data" in v && "_mediaType" in v) {
114
+ const url = (v as { _url?: string | null })._url;
115
+ const mediaType = (v as { _mediaType: string })._mediaType;
116
+ return url ? `file(${url})` : `file(${mediaType})`;
117
+ }
100
118
  const entries = Object.entries(v)
101
119
  .map(([key, val]) => `${key}:${serializeValue(val)}`)
102
120
  .join(",");
@@ -134,7 +152,7 @@ export function computeCacheKey(element: VargElement): CacheKeyPart[] {
134
152
  } else if (v === null || v === undefined) {
135
153
  key.push(k, v);
136
154
  } else if (v instanceof Uint8Array) {
137
- key.push(k, `uint8:${v.byteLength}:${Bun.hash(v).toString(16)}`);
155
+ key.push(k, `uint8:${v.byteLength}`);
138
156
  } else if (isVargElement(v)) {
139
157
  key.push(k, ...computeCacheKey(v));
140
158
  } else if (Array.isArray(v) || typeof v === "object") {
@@ -187,6 +187,12 @@ async function sliceSegments(
187
187
  descriptors.map(async (desc) => {
188
188
  const bytes = await sliceAudio(fullFile, desc.start, desc.end);
189
189
  const segmentFile = File.fromBuffer(bytes, "audio/mpeg");
190
+ // Upload segment to storage so downstream cache keys use the URL
191
+ // instead of serializing raw audio bytes (which can exceed Redis key limits).
192
+ const ctx = getResolveContext();
193
+ if (ctx?.storage) {
194
+ await segmentFile.upload(ctx.storage);
195
+ }
190
196
 
191
197
  // Rebase word timings relative to the segment's sliced audio (t=0)
192
198
  const segmentWords = allWords