vargai 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. package/.claude/settings.local.json +7 -0
  2. package/.env.example +27 -0
  3. package/.github/workflows/ci.yml +23 -0
  4. package/.husky/README.md +102 -0
  5. package/.husky/commit-msg +6 -0
  6. package/.husky/pre-commit +9 -0
  7. package/.husky/pre-push +6 -0
  8. package/.size-limit.json +8 -0
  9. package/.test-hooks.ts +5 -0
  10. package/CLAUDE.md +125 -0
  11. package/CONTRIBUTING.md +150 -0
  12. package/LICENSE.md +53 -0
  13. package/README.md +78 -0
  14. package/SKILLS.md +173 -0
  15. package/STRUCTURE.md +92 -0
  16. package/biome.json +34 -0
  17. package/bun.lock +1254 -0
  18. package/commitlint.config.js +22 -0
  19. package/docs/plan.md +66 -0
  20. package/docs/todo.md +14 -0
  21. package/docs/varg-sdk.md +812 -0
  22. package/ffmpeg/CLAUDE.md +68 -0
  23. package/package.json +69 -0
  24. package/pipeline/cookbooks/SKILL.md +285 -0
  25. package/pipeline/cookbooks/remotion-video.md +585 -0
  26. package/pipeline/cookbooks/round-video-character.md +337 -0
  27. package/pipeline/cookbooks/scripts/animate-frames-parallel.ts +84 -0
  28. package/pipeline/cookbooks/scripts/combine-scenes.sh +53 -0
  29. package/pipeline/cookbooks/scripts/generate-frames-parallel.ts +99 -0
  30. package/pipeline/cookbooks/scripts/still-to-video.sh +37 -0
  31. package/pipeline/cookbooks/talking-character.md +59 -0
  32. package/pipeline/cookbooks/text-to-tiktok.md +669 -0
  33. package/pipeline/cookbooks/trendwatching.md +156 -0
  34. package/plan.md +281 -0
  35. package/scripts/.gitkeep +0 -0
  36. package/src/ai-sdk/cache.ts +142 -0
  37. package/src/ai-sdk/examples/cached-generation.ts +53 -0
  38. package/src/ai-sdk/examples/duet-scene-4.ts +53 -0
  39. package/src/ai-sdk/examples/duet-scene-5-audio.ts +32 -0
  40. package/src/ai-sdk/examples/duet-video.ts +56 -0
  41. package/src/ai-sdk/examples/editly-composition.ts +63 -0
  42. package/src/ai-sdk/examples/editly-test.ts +57 -0
  43. package/src/ai-sdk/examples/editly-video-test.ts +52 -0
  44. package/src/ai-sdk/examples/fal-lipsync.ts +43 -0
  45. package/src/ai-sdk/examples/higgsfield-image.ts +61 -0
  46. package/src/ai-sdk/examples/music-generation.ts +19 -0
  47. package/src/ai-sdk/examples/openai-sora.ts +34 -0
  48. package/src/ai-sdk/examples/replicate-bg-removal.ts +52 -0
  49. package/src/ai-sdk/examples/simpsons-scene.ts +61 -0
  50. package/src/ai-sdk/examples/talking-lion.ts +55 -0
  51. package/src/ai-sdk/examples/video-generation.ts +39 -0
  52. package/src/ai-sdk/examples/workflow-animated-girl.ts +104 -0
  53. package/src/ai-sdk/examples/workflow-before-after.ts +114 -0
  54. package/src/ai-sdk/examples/workflow-character-grid.ts +112 -0
  55. package/src/ai-sdk/examples/workflow-slideshow.ts +161 -0
  56. package/src/ai-sdk/file-cache.ts +112 -0
  57. package/src/ai-sdk/file.ts +238 -0
  58. package/src/ai-sdk/generate-element.ts +92 -0
  59. package/src/ai-sdk/generate-music.ts +46 -0
  60. package/src/ai-sdk/generate-video.ts +165 -0
  61. package/src/ai-sdk/index.ts +72 -0
  62. package/src/ai-sdk/music-model.ts +110 -0
  63. package/src/ai-sdk/providers/editly/editly.test.ts +1108 -0
  64. package/src/ai-sdk/providers/editly/ffmpeg.ts +60 -0
  65. package/src/ai-sdk/providers/editly/index.ts +817 -0
  66. package/src/ai-sdk/providers/editly/layers.ts +772 -0
  67. package/src/ai-sdk/providers/editly/plan.md +144 -0
  68. package/src/ai-sdk/providers/editly/types.ts +328 -0
  69. package/src/ai-sdk/providers/elevenlabs-provider.ts +255 -0
  70. package/src/ai-sdk/providers/fal-provider.ts +512 -0
  71. package/src/ai-sdk/providers/higgsfield.ts +379 -0
  72. package/src/ai-sdk/providers/openai.ts +251 -0
  73. package/src/ai-sdk/providers/replicate.ts +16 -0
  74. package/src/ai-sdk/video-model.ts +185 -0
  75. package/src/cli/commands/find.tsx +137 -0
  76. package/src/cli/commands/help.tsx +85 -0
  77. package/src/cli/commands/index.ts +9 -0
  78. package/src/cli/commands/list.tsx +238 -0
  79. package/src/cli/commands/run.tsx +511 -0
  80. package/src/cli/commands/which.tsx +253 -0
  81. package/src/cli/index.ts +112 -0
  82. package/src/cli/quiet.ts +44 -0
  83. package/src/cli/types.ts +32 -0
  84. package/src/cli/ui/components/Badge.tsx +29 -0
  85. package/src/cli/ui/components/DataTable.tsx +51 -0
  86. package/src/cli/ui/components/Header.tsx +23 -0
  87. package/src/cli/ui/components/HelpBlock.tsx +44 -0
  88. package/src/cli/ui/components/KeyValue.tsx +33 -0
  89. package/src/cli/ui/components/OptionRow.tsx +81 -0
  90. package/src/cli/ui/components/Separator.tsx +23 -0
  91. package/src/cli/ui/components/StatusBox.tsx +108 -0
  92. package/src/cli/ui/components/VargBox.tsx +51 -0
  93. package/src/cli/ui/components/VargProgress.tsx +36 -0
  94. package/src/cli/ui/components/VargSpinner.tsx +34 -0
  95. package/src/cli/ui/components/VargText.tsx +56 -0
  96. package/src/cli/ui/components/index.ts +19 -0
  97. package/src/cli/ui/index.ts +12 -0
  98. package/src/cli/ui/render.ts +35 -0
  99. package/src/cli/ui/theme.ts +63 -0
  100. package/src/cli/utils.ts +78 -0
  101. package/src/core/executor/executor.ts +201 -0
  102. package/src/core/executor/index.ts +13 -0
  103. package/src/core/executor/job.ts +214 -0
  104. package/src/core/executor/pipeline.ts +222 -0
  105. package/src/core/index.ts +11 -0
  106. package/src/core/registry/index.ts +9 -0
  107. package/src/core/registry/loader.ts +149 -0
  108. package/src/core/registry/registry.ts +221 -0
  109. package/src/core/registry/resolver.ts +206 -0
  110. package/src/core/schema/helpers.ts +134 -0
  111. package/src/core/schema/index.ts +8 -0
  112. package/src/core/schema/shared.ts +102 -0
  113. package/src/core/schema/types.ts +279 -0
  114. package/src/core/schema/validator.ts +92 -0
  115. package/src/definitions/actions/captions.ts +261 -0
  116. package/src/definitions/actions/edit.ts +298 -0
  117. package/src/definitions/actions/image.ts +125 -0
  118. package/src/definitions/actions/index.ts +114 -0
  119. package/src/definitions/actions/music.ts +205 -0
  120. package/src/definitions/actions/sync.ts +128 -0
  121. package/src/definitions/actions/transcribe.ts +200 -0
  122. package/src/definitions/actions/upload.ts +111 -0
  123. package/src/definitions/actions/video.ts +163 -0
  124. package/src/definitions/actions/voice.ts +119 -0
  125. package/src/definitions/index.ts +23 -0
  126. package/src/definitions/models/elevenlabs.ts +50 -0
  127. package/src/definitions/models/flux.ts +56 -0
  128. package/src/definitions/models/index.ts +36 -0
  129. package/src/definitions/models/kling.ts +56 -0
  130. package/src/definitions/models/llama.ts +54 -0
  131. package/src/definitions/models/nano-banana-pro.ts +102 -0
  132. package/src/definitions/models/sonauto.ts +68 -0
  133. package/src/definitions/models/soul.ts +65 -0
  134. package/src/definitions/models/wan.ts +54 -0
  135. package/src/definitions/models/whisper.ts +44 -0
  136. package/src/definitions/skills/index.ts +12 -0
  137. package/src/definitions/skills/talking-character.ts +87 -0
  138. package/src/definitions/skills/text-to-tiktok.ts +97 -0
  139. package/src/index.ts +118 -0
  140. package/src/providers/apify.ts +269 -0
  141. package/src/providers/base.ts +264 -0
  142. package/src/providers/elevenlabs.ts +217 -0
  143. package/src/providers/fal.ts +392 -0
  144. package/src/providers/ffmpeg.ts +544 -0
  145. package/src/providers/fireworks.ts +193 -0
  146. package/src/providers/groq.ts +149 -0
  147. package/src/providers/higgsfield.ts +145 -0
  148. package/src/providers/index.ts +143 -0
  149. package/src/providers/replicate.ts +147 -0
  150. package/src/providers/storage.ts +206 -0
  151. package/src/tests/all.test.ts +509 -0
  152. package/src/tests/index.ts +33 -0
  153. package/src/tests/unit.test.ts +403 -0
  154. package/tsconfig.json +45 -0
@@ -0,0 +1,337 @@
1
+ # round video character cookbook
2
+
3
+ create realistic round selfie videos for telegram: front-facing camera POV videos with authentic camera shake, lighting, and audio
4
+
5
+ ## what this does
6
+
7
+ 1. generates 3 first frame options: person in specified setting (conference, station, etc)
8
+ 2. ai picks the best first frame from the 3 options
9
+ 3. generates voiceover from text script
10
+ 4. creates talking video using wan 2.5 with audio sync
11
+
12
+ ## inputs
13
+
14
+ - `text_script`: what the person will say
15
+ - `profile_photo`: photo of the person (e.g., media/friend/katia.jpg)
16
+ - `scene_location`: where they are (from script or default: conference/underground station)
17
+
18
+ ## steps
19
+
20
+ ### step 1: generate first frame options (person in setting)
21
+
22
+ generate 3 variations and let ai pick the best one:
23
+
24
+ ```bash
25
+ # generate 3 SELFIE-STYLE first frame options using nano banana pro
26
+ # CRITICAL: use the proven prompt structure below
27
+ # aspect_ratio "auto" preserves the original photo's aspect ratio (portrait/landscape)
28
+
29
+ # option 1
30
+ bun run lib/fal.ts image_to_image \
31
+ "selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it" \
32
+ media/friend/katia.jpg \
33
+ auto
34
+
35
+ # option 2
36
+ bun run lib/fal.ts image_to_image \
37
+ "selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it" \
38
+ media/friend/katia.jpg \
39
+ auto
40
+
41
+ # option 3
42
+ bun run lib/fal.ts image_to_image \
43
+ "selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it" \
44
+ media/friend/katia.jpg \
45
+ auto
46
+ ```
47
+
48
+ **important prompting for selfie style (image-to-image):**
49
+ - start with "selfie POV" - simple and effective
50
+ - include "camera with subtle natural wobble and shake throughout"
51
+ - specify "focus on subject with shallow depth of field"
52
+ - lighting: "dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast"
53
+ - background: "ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights"
54
+ - clothing: "wear black hoodie without any text on it" (or specify other clothing)
55
+ - location: flexible - adjust based on script (hackathon space, metro station, office, etc.)
56
+ - aspect ratio "auto" preserves original dimensions - critical for avoiding squashed/stretched video!
57
+
58
+ each command outputs a URL like: `https://v3b.fal.media/files/.../image.jpg`
59
+
60
+ download all 3 options:
61
+ ```bash
62
+ curl -o media/friend/katia/option1.jpg "https://url-from-option-1.jpg"
63
+ curl -o media/friend/katia/option2.jpg "https://url-from-option-2.jpg"
64
+ curl -o media/friend/katia/option3.jpg "https://url-from-option-3.jpg"
65
+ ```
66
+
67
+ **ai should review the 3 options and pick the best one based on:**
68
+ - face quality and recognition
69
+ - natural selfie look
70
+ - lighting and color balance
71
+ - background blur and composition
72
+ - overall authenticity
73
+
74
+ use the selected image (local path or url) for step 3 (wan 2.5)
75
+
76
+ ### step 2: generate voiceover
77
+
78
+ ```bash
79
+ # generate voice from script
80
+ # save to media/friend/[name]/voice.mp3 for organization
81
+ bun run lib/elevenlabs.ts tts \
82
+ "hey everyone! excited to share this update from the conference" \
83
+ rachel \
84
+ media/friend/katia/voice.mp3
85
+ ```
86
+
87
+ the audio is saved to `media/friend/[name]/voice.mp3`. no manual upload is needed: the fal wan command in step 3 auto-uploads local files.
88
+
89
+ ### step 3: generate talking video with wan 2.5 (via fal)
90
+
91
+ **important: audio must be at least 3 seconds long!**
92
+
93
+ fal's wan-25 endpoint requires audio duration of 3+ seconds. if your script is too short, extend it.
94
+
95
+ ```bash
96
+ # use fal's wan-25 endpoint (supports local files and urls)
97
+ # audio and image files will be auto-uploaded if local paths are provided
98
+ # duration MUST be 5 or 10 seconds only
99
+ bun run lib/fal.ts wan \
100
+ media/friend/katia/option2.jpg \
101
+ media/friend/katia/voice.mp3 \
102
+ "front-facing camera selfie POV video, handheld phone directly in front of face with continuous slight wobble and shake, subject in sharp focus with softly blurred background shallow depth of field, dramatic low-light scene with intense magenta hot pink light illuminating face and blue ambient lights in blurred background, dark indoor busy conference setting with abstract out-of-focus lights, conversational audio with muffled background crowd chatter and commotion" \
103
+ 10 \
104
+ 480p
105
+ ```
106
+
107
+ **command structure:**
108
+ ```bash
109
+ bun run lib/fal.ts wan <image_path_or_url> <audio_path_or_url> <prompt> [duration] [resolution]
110
+ ```
111
+
112
+ **parameters:**
113
+ - image: local path or url (auto-uploaded if local)
114
+ - audio: local path or url (auto-uploaded if local, **must be 3+ seconds**)
115
+ - prompt: detailed video style description
116
+ - duration: 5 or 10 (default: 5)
117
+ - resolution: 480p, 720p, or 1080p (default: 480p)
118
+
119
+ **detailed prompt structure for realistic selfie videos:**
120
+
121
+ the prompt should include ALL these elements for maximum authenticity:
122
+
123
+ **camera technique:**
124
+ - "front-facing camera selfie POV video"
125
+ - "handheld phone directly in front of face"
126
+ - "continuous slight wobble and shake"
127
+
128
+ **focus & composition:**
129
+ - "subject in sharp focus"
130
+ - "softly blurred background shallow depth of field"
131
+
132
+ **lighting:**
133
+ - "dramatic low-light scene"
134
+ - "intense magenta hot pink light illuminating face" (or specify your lighting color)
135
+ - "blue ambient lights in blurred background" (optional, for busy settings)
136
+
137
+ **setting:**
138
+ - "dark indoor busy setting with abstract out-of-focus lights" (adjust based on location)
139
+
140
+ **audio characteristics:**
141
+ - "conversational audio with muffled background crowd chatter and commotion"
142
+
143
+ this comprehensive prompting creates videos that look like authentic, quickly-recorded selfie messages with realistic imperfections.
144
+
145
+ this takes 2-4 minutes. the command will wait for completion and output the video url.
146
+
147
+ ### step 4: download result
148
+
149
+ ```bash
150
+ # fal wan-25 returns video url like: https://v3b.fal.media/files/.../video.mp4
151
+ curl -o media/friend/katia/talking-character.mp4 "https://v3b.fal.media/files/.../video.mp4"
152
+ ```
153
+
154
+ ## output
155
+
156
+ - first frame options: 3 variations (jpg) - `media/friend/[name]/option1.jpg`, `option2.jpg`, `option3.jpg`
157
+ - selected first frame: best option chosen by ai
158
+ - voiceover: `media/friend/[name]/voice.mp3`
159
+ - final video: `media/friend/[name]/talking-character.mp4`
160
+
161
+ ## timing
162
+
163
+ - first frame generation: 15-30s (3 options)
164
+ - ai selection: instant
165
+ - voiceover: 5-10s
166
+ - wan 2.5 processing (fal): 2-4min
167
+
168
+ **total: ~3-5 minutes**
169
+
170
+ ## scene context examples
171
+
172
+ choose setting based on script context. always include handheld camera description for authentic look:
173
+
174
+ | script mentions | step 1: first frame prompt | wan 2.5 prompt (detailed style) |
175
+ |----------------|---------------------------|--------------------------------|
176
+ | "at the conference" / "hackathon" | selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it | front-facing camera selfie POV video, handheld phone directly in front of face with continuous slight wobble, subject in sharp focus with softly blurred background, dramatic low-light with intense magenta hot pink light illuminating face and blue ambient lights in blurred background, dark indoor busy conference setting with abstract out-of-focus lights, conversational audio with muffled background crowd chatter |
177
+ | "subway" / "metro" | selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with harsh fluorescent lighting, ambient lights scattered in background, dark underground station setting, abstract out-of-focus lights, location: metro station, wear black hoodie without any text on it | front-facing camera selfie POV video, handheld phone with slight shake, sharp focus on subject with blurred metro background, harsh fluorescent lighting with cool tones, dark underground station with out-of-focus commuters and lights, audio with echoing background noise and distant train sounds |
178
+ | "office" | selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, soft indoor office lighting environment, ambient lights in background, modern workspace setting, abstract out-of-focus monitors and lights, location: office, wear black hoodie without any text on it | front-facing camera selfie POV video, handheld phone wobble, sharp subject focus with blurred office background, soft indoor office lighting, modern workspace with blurred monitors and colleagues in background, conversational audio with quiet office ambient noise |
179
+ | "street" | selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, natural daylight or street lighting environment, ambient lights in background, urban street setting, abstract out-of-focus pedestrians and lights, location: city street, wear black hoodie without any text on it | front-facing camera selfie POV video, handheld shake, sharp focus with blurred street background, natural daylight or street lighting, urban setting with out-of-focus pedestrians and traffic, audio with street noise and distant traffic sounds |
180
+ | no location | selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights, wear black hoodie without any text on it | front-facing camera selfie POV video, handheld phone with slight wobble, sharp subject with softly blurred background, natural indoor lighting, casual indoor setting, conversational audio (default) |
181
+
182
+ **key phrases for authentic selfie look:**
183
+
184
+ **step 1 (first frame - image-to-image):**
185
+
186
+ proven prompt structure (adjust location only):
187
+ ```
188
+ selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: [hackathon space/metro station/office/city street], wear black hoodie without any text on it
189
+ ```
190
+
191
+ - start with "selfie POV" - simple, no zoom confusion
192
+ - "camera with subtle natural wobble and shake throughout" - natural movement
193
+ - "focus on subject with shallow depth of field" - proper framing
194
+ - lighting: magenta/hot pink with blue ambient (adjust per setting)
195
+ - location: flexible - change based on script
196
+ - clothing: black hoodie without text (or adjust as needed)
197
+
198
+ **step 3 (wan 2.5) - comprehensive style elements:**
199
+
200
+ *camera technique:*
201
+ - "front-facing camera selfie POV video"
202
+ - "handheld phone directly in front of face"
203
+ - "continuous slight wobble and shake"
204
+
205
+ *focus & depth:*
206
+ - "subject in sharp focus"
207
+ - "softly blurred background"
208
+ - "shallow depth of field"
209
+
210
+ *lighting:*
211
+ - "dramatic low-light scene"
212
+ - "intense magenta hot pink light illuminating face" (adjust color per setting)
213
+ - "blue ambient lights in blurred background" (optional)
214
+
215
+ *setting:*
216
+ - "dark indoor busy setting"
217
+ - "abstract out-of-focus lights"
218
+ - adjust per location (conference/metro/office/street)
219
+
220
+ *audio:*
221
+ - "conversational audio with muffled background crowd chatter and commotion"
222
+ - adjust per setting (metro=echoing/train sounds, office=quiet ambient, street=traffic)
223
+
224
+ ## example: full workflow
225
+
226
+ ```bash
227
+ # scenario: katia sharing conference update
228
+ # script: "hey everyone! i'm so excited to share this amazing update with you from the conference today"
229
+ # photo: media/friend/katia.jpg
230
+ # note: audio must be 3+ seconds long for wan-25!
231
+
232
+ # step 1: generate 3 SELFIE first frame options with nano banana pro
233
+ bun run lib/fal.ts image_to_image \
234
+ "selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it" \
235
+ media/friend/katia.jpg \
236
+ auto
237
+ # output 1: https://v3b.fal.media/files/.../option1.png
238
+
239
+ bun run lib/fal.ts image_to_image \
240
+ "selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it" \
241
+ media/friend/katia.jpg \
242
+ auto
243
+ # output 2: https://v3b.fal.media/files/.../option2.png
244
+
245
+ bun run lib/fal.ts image_to_image \
246
+ "selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it" \
247
+ media/friend/katia.jpg \
248
+ auto
249
+ # output 3: https://v3b.fal.media/files/.../option3.png
250
+
251
+ # download all 3 options
252
+ curl -o media/friend/katia/option1.jpg "https://v3b.fal.media/files/.../option1.png"
253
+ curl -o media/friend/katia/option2.jpg "https://v3b.fal.media/files/.../option2.png"
254
+ curl -o media/friend/katia/option3.jpg "https://v3b.fal.media/files/.../option3.png"
255
+
256
+ # ai reviews the 3 options and picks the best one based on:
257
+ # - face quality and recognition
258
+ # - natural selfie look
259
+ # - lighting and color balance
260
+ # - background blur and composition
261
+ # - overall authenticity
262
+ # selected: option2 (example)
263
+
264
+ # step 2: generate voice (ensure 3+ seconds for wan-25)
265
+ bun run lib/elevenlabs.ts tts \
266
+ "hey everyone! i'm so excited to share this amazing update with you from the conference today" \
267
+ rachel \
268
+ media/friend/katia/voice.mp3
269
+ # output: media/friend/katia/voice.mp3
270
+
271
+ # step 3: run wan-25 (fal) - auto-uploads local files
272
+ bun run lib/fal.ts wan \
273
+ media/friend/katia/option2.jpg \
274
+ media/friend/katia/voice.mp3 \
275
+ "front-facing camera selfie POV video, handheld phone directly in front of face with continuous slight wobble and shake, subject in sharp focus with softly blurred background shallow depth of field, dramatic low-light scene with intense magenta hot pink light illuminating face and blue ambient lights in blurred background, dark indoor busy conference setting with abstract out-of-focus lights, conversational audio with muffled background crowd chatter and commotion" \
276
+ 10 \
277
+ 480p
278
+ # takes 2-4 minutes...
279
+ # output: { "data": { "video": { "url": "https://v3b.fal.media/files/.../video.mp4" } } }
280
+
281
+ # step 4: download
282
+ curl -o media/friend/katia/talking-character.mp4 "https://v3b.fal.media/files/.../video.mp4"
283
+ ```
284
+
285
+ **tested successfully** with katia.jpg and aleks - see media/friend/ for example outputs!
286
+
287
+ ## tips
288
+
289
+ - **selfie perspective**: CRITICAL - always use "selfie POV" in step 1 first frame generation!
290
+ - **audio duration**: CRITICAL - wan-25 requires audio to be at least 3 seconds long. extend short scripts!
291
+ - **duration constraint**: wan-25 only accepts 5 or 10 second videos
292
+ - **script length**: ensure script is at least 3 seconds when spoken, max 10 seconds
293
+ - **aspect ratio preservation**: CRITICAL - always use "auto" aspect ratio in image-to-image to avoid squashed/stretched videos!
294
+ - **nano banana pro**: uses aspect_ratio="auto" to preserve original photo dimensions (portrait/landscape)
295
+ - **local file support**: fal wan command auto-uploads local files - no need for manual upload step!
296
+ - **handheld camera**: always include "handheld phone" + "wobble and shake" in wan-25 prompt for authentic look
297
+ - **first frame quality**: this is the base - make it look natural and selfie-like!
298
+ - **scene matching**: extract location from script when mentioned
299
+ - **voice selection**: rachel (default) is clear and professional
300
+ - **resolution**: 480p is faster (2-3min), 720p/1080p takes longer (4-5min)
301
+ - **save intermediates**: store outputs in media/friend/[name]/ for organization and reuse
302
+ - **using fal instead of replicate**: fal's wan-25 endpoint is faster and more reliable than replicate
303
+
304
+ ## voice options
305
+
306
+ ```bash
307
+ # female voices (american english)
308
+ bun run lib/elevenlabs.ts tts "script" rachel media/friend/[name]/voice.mp3
309
+ bun run lib/elevenlabs.ts tts "script" bella media/friend/[name]/voice.mp3
310
+ bun run lib/elevenlabs.ts tts "script" elli media/friend/[name]/voice.mp3
311
+
312
+ # male voices (american english)
313
+ bun run lib/elevenlabs.ts tts "script" antoni media/friend/[name]/voice.mp3
314
+ bun run lib/elevenlabs.ts tts "script" josh media/friend/[name]/voice.mp3
315
+ ```
316
+
317
+ see all voices: `bun run lib/elevenlabs.ts voices`
318
+
319
+ ## environment setup
320
+
321
+ ```bash
322
+ # required api keys
323
+ export ELEVENLABS_API_KEY="your_key"
324
+ export FAL_KEY="your_key" # for wan-25 and image generation
325
+ ```
326
+
327
+ ## changelog
328
+
329
+ **2024-11-22:**
330
+ - switched to fal's wan-25-preview endpoint (faster, more reliable than replicate)
331
+ - added wan-25 support to lib/fal.ts with auto-upload for local files
332
+ - discovered: audio must be at least 3 seconds long for wan-25 (critical!)
333
+ - simplified workflow: no manual audio upload step needed
334
+ - tested successfully with aleks photo and "give me money" script
335
+ - switched from flux to nano banana pro for image-to-image (better aspect ratio preservation)
336
+ - fixed squashed video issue by using aspect_ratio="auto"
337
+ - clarified duration constraints (5 or 10 seconds only)
@@ -0,0 +1,84 @@
1
+ /**
2
+ * Animate multiple frames in parallel using kling
3
+ * Usage: bun run pipeline/cookbooks/scripts/animate-frames-parallel.ts
4
+ */
5
+
6
+ import { fal } from "@fal-ai/client";
7
+
8
+ interface VideoConfig {
9
+ name: string; // prefix of the saved file: `${name}_video.mp4`
10
+ framePath: string; // local image path, read with Bun.file and uploaded to fal storage
11
+ prompt: string; // motion prompt; ", NO talking NO lip movement" is appended before sending
12
+ duration?: "5" | "10"; // clip length option sent to the model ("5" when omitted); presumably seconds - TODO confirm
13
+ }
14
+ /** Uploads every frame, runs one Kling image-to-video job per config concurrently, and saves each result as `${outputDir}/${name}_video.mp4`. */
15
+ async function animateFrames(configs: VideoConfig[], outputDir: string) {
16
+ console.log(`Animating ${configs.length} frames in parallel...\n`);
17
+
18
+ // Upload all frames first, one at a time, so frameUrls[i] lines up with configs[i]
19
+ const frameUrls: string[] = [];
20
+ for (const config of configs) {
21
+ const url = await fal.storage.upload(Bun.file(config.framePath));
22
+ frameUrls.push(url);
23
+ }
24
+
25
+ const promises = configs.map((config, i) => {
26
+ return fal.subscribe("fal-ai/kling-video/v2.5-turbo/pro/image-to-video", {
27
+ input: {
28
+ prompt: `${config.prompt}, NO talking NO lip movement`,
29
+ image_url: frameUrls[i],
30
+ duration: config.duration || "5",
31
+ // note: aspect_ratio is determined by input image dimensions
32
+ },
33
+ });
34
+ });
35
+ const results = await Promise.all(promises); // Promise.all keeps order: results[i] belongs to configs[i]
36
+
37
+ for (let i = 0; i < results.length; i++) {
38
+ const url = (results[i] as { data?: { video?: { url?: string } } }).data?.video?.url;
39
+ const config = configs[i];
40
+ if (url && config) {
41
+ const response = await fetch(url);
42
+ if (response.ok) { // never write an HTTP error body to disk as if it were a video
43
+ await Bun.write(`${outputDir}/${config.name}_video.mp4`, await response.arrayBuffer());
44
+ console.log(`${config.name}_video.mp4 saved`);
45
+ } else {
46
+ console.error(`Download failed for ${config.name}: HTTP ${response.status}`);
47
+ }
48
+ } else {
49
+ console.error(`No URL for ${config?.name ?? "unknown"}`);
50
+ }
51
+ }
52
+ console.log("\nAll videos saved!");
53
+ }
54
+
55
+ // Example usage:
56
+ /** Animates three example scene frames that already exist in the project folder. */
57
+ async function main() {
58
+ // Folder that holds the input frames and also receives the output videos
59
+ const projectDir = "media/girl-ruined-you";
60
+
61
+ // [scene name, motion prompt] pairs; frame paths are derived from the names below
62
+ const scenes: [string, string][] = [
63
+ [
64
+ "scene6",
65
+ "3D pixar animation, two cats meet eyes in coffee shop, warm romantic moment",
66
+ ],
67
+ ["scene7", "3D pixar animation, two cats walking together, sunset, romantic"],
68
+ ["scene14", "3D pixar animation, cat looks at sunrise, hopeful realization"],
69
+ ];
70
+
71
+ // Build one VideoConfig per scene; every clip uses the "5" duration option
72
+ const configs = scenes.map(
73
+ ([name, prompt]): VideoConfig => ({
74
+ name,
75
+ framePath: `${projectDir}/${name}_frame.jpg`,
76
+ prompt,
77
+ duration: "5",
78
+ }),
79
+ );
80
+
81
+ await animateFrames(configs, projectDir);
82
+ }
83
+
84
+ main().catch(console.error);
@@ -0,0 +1,53 @@
1
+ #!/bin/bash
2
+ # Combine multiple scene videos with audio clips
3
+ # Usage: ./combine-scenes.sh <project_dir>
4
+
5
+ PROJECT_DIR=${1:-"media/girl-ruined-you"}
6
+
7
+ # Scene timing configuration (adjust as needed)
8
+ # Format: scene_num:start_time:duration
9
+ SCENES=(
10
+ "1:0:3.5"
11
+ "2:3.5:6.5"
12
+ "3:10:10"
13
+ "4:20:15"
14
+ "5:35:7"
15
+ )
16
+
17
+ echo "Extracting audio clips..."
18
+ for scene_config in "${SCENES[@]}"; do
19
+ IFS=':' read -r num start dur <<< "$scene_config"
20
+ ffmpeg -y -i "$PROJECT_DIR/voiceover.mp3" -ss "$start" -t "$dur" "$PROJECT_DIR/audio_scene${num}.mp3" 2>/dev/null
21
+ echo " audio_scene${num}.mp3 ($dur sec)"
22
+ done
23
+
24
+ echo ""
25
+ echo "Combining videos with audio..."
26
+ for scene_config in "${SCENES[@]}"; do
27
+ IFS=':' read -r num start dur <<< "$scene_config"
28
+
29
+ # Extra plays of the 5s clip needed to cover $dur: ceil(dur / 5) - 1 (bc integer division floors, so round up in milliseconds)
30
+ loops=$(echo "($dur * 1000 + 4999) / 5000 - 1" | bc)
31
+ if [ "$loops" -lt 0 ]; then loops=0; fi
32
+
33
+ ffmpeg -y -stream_loop "$loops" -i "$PROJECT_DIR/scene${num}_video.mp4" \
34
+ -i "$PROJECT_DIR/audio_scene${num}.mp3" \
35
+ -t "$dur" -c:v libx264 -preset fast -crf 20 -c:a aac -b:a 128k -shortest \
36
+ "$PROJECT_DIR/scene${num}_final.mp4" 2>/dev/null
37
+ echo " scene${num}_final.mp4"
38
+ done
39
+
40
+ echo ""
41
+ echo "Creating concat file..."
42
+ rm -f "$PROJECT_DIR/scenes.txt"
43
+ for scene_config in "${SCENES[@]}"; do
44
+ IFS=':' read -r num start dur <<< "$scene_config"
45
+ echo "file 'scene${num}_final.mp4'" >> "$PROJECT_DIR/scenes.txt"
46
+ done
47
+
48
+ echo "Concatenating all scenes..."
49
+ cd "$PROJECT_DIR" && ffmpeg -y -f concat -safe 0 -i scenes.txt -c copy combined_scenes.mp4 2>/dev/null
50
+
51
+ duration=$(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 combined_scenes.mp4)
52
+ echo ""
53
+ echo "Done! combined_scenes.mp4 ($duration sec)"
@@ -0,0 +1,99 @@
1
+ /**
2
+ * Generate multiple scene frames in parallel using flux kontext
3
+ * Usage: bun run pipeline/cookbooks/scripts/generate-frames-parallel.ts
4
+ */
5
+
6
+ import { fal } from "@fal-ai/client";
7
+
8
+ interface FrameConfig {
9
+ name: string;
10
+ prompt: string;
11
+ imageUrls: string[]; // character reference URLs
12
+ multi?: boolean; // use kontext/multi for multiple characters
13
+ }
14
+
15
+ async function generateFrames(configs: FrameConfig[], outputDir: string) {
16
+ console.log(`Generating ${configs.length} frames in parallel...\n`);
17
+ // start one fal request per config; multi-character scenes use the kontext/multi endpoint
18
+ const promises = configs.map((config) => {
19
+ if (config.multi) {
20
+ return fal.subscribe("fal-ai/flux-pro/kontext/multi", {
21
+ input: {
22
+ prompt: config.prompt,
23
+ image_urls: config.imageUrls,
24
+ aspect_ratio: "9:16" as const,
25
+ },
26
+ });
27
+ } else {
28
+ return fal.subscribe("fal-ai/flux-pro/kontext", {
29
+ input: {
30
+ prompt: config.prompt,
31
+ image_url: config.imageUrls[0] ?? "",
32
+ aspect_ratio: "9:16" as const,
33
+ },
34
+ });
35
+ }
36
+ });
37
+
38
+ const results = await Promise.all(promises);
39
+ // download each result to `${outputDir}/<name>_frame.jpg`; failed downloads are reported and skipped
40
+ for (let i = 0; i < results.length; i++) {
41
+ const result = results[i] as {
42
+ data?: { images?: Array<{ url?: string }> };
43
+ };
44
+ const url = result.data?.images?.[0]?.url;
45
+ const config = configs[i];
46
+ if (url && config) {
47
+ const response = await fetch(url);
48
+ if (!response.ok) { console.error(`Download failed for ${config.name}: HTTP ${response.status}`); continue; } // never write an error body as a .jpg
49
+ await Bun.write(`${outputDir}/${config.name}_frame.jpg`, await response.arrayBuffer());
50
+ console.log(`${config.name}_frame.jpg saved`);
51
+ } else {
52
+ console.error(`No URL for ${config?.name ?? "unknown"}`);
53
+ }
54
+ }
55
+
56
+ console.log("\nAll frames saved!");
57
+ }
58
+
59
+ // Example usage:
60
+ async function main() {
61
+ const outputDir = "media/girl-ruined-you";
62
+
63
+ // Upload character references first
64
+ const protagonist = await fal.storage.upload(
65
+ Bun.file(`${outputDir}/cat_protagonist.png`),
66
+ );
67
+ const secondGirl = await fal.storage.upload(
68
+ Bun.file(`${outputDir}/cat_second_girl.png`),
69
+ );
70
+
71
+ const configs: FrameConfig[] = [
72
+ {
73
+ name: "scene6",
74
+ prompt:
75
+ "3D pixar style: male cat in hoodie (first) and elegant female cat (second) meeting eyes in coffee shop, warm golden lighting, vertical portrait 9:16",
76
+ imageUrls: [protagonist, secondGirl],
77
+ multi: true,
78
+ },
79
+ {
80
+ name: "scene7",
81
+ prompt:
82
+ "3D pixar style: male cat and female cat walking together, sunset, romantic, vertical portrait 9:16",
83
+ imageUrls: [protagonist, secondGirl],
84
+ multi: true,
85
+ },
86
+ // Single character scene
87
+ {
88
+ name: "scene14",
89
+ prompt:
90
+ "Place this cat looking at sunrise through window, hopeful, vertical portrait 9:16",
91
+ imageUrls: [protagonist],
92
+ multi: false,
93
+ },
94
+ ];
95
+
96
+ await generateFrames(configs, outputDir);
97
+ }
98
+
99
+ main().catch(console.error);
@@ -0,0 +1,37 @@
1
+ #!/bin/bash
2
+ # Convert still frame to video with ken burns effect (slow zoom)
3
+ # Usage: ./still-to-video.sh <input.jpg> <output.mp4> <duration> [zoom_direction]
4
+ # zoom_direction: in (default), out
5
+
6
+ INPUT=$1
7
+ OUTPUT=$2
8
+ DURATION=$3
9
+ ZOOM=${4:-"in"}
10
+
11
+ if [ -z "$INPUT" ] || [ -z "$OUTPUT" ] || [ -z "$DURATION" ]; then
12
+ echo "Usage: ./still-to-video.sh <input.jpg> <output.mp4> <duration> [in|out]"
13
+ exit 1
14
+ fi
15
+
16
+ # Get input dimensions
17
+ WIDTH=$(ffprobe -v error -select_streams v:0 -show_entries stream=width -of csv=p=0 "$INPUT")
18
+ HEIGHT=$(ffprobe -v error -select_streams v:0 -show_entries stream=height -of csv=p=0 "$INPUT")
19
+
20
+ echo "Creating $DURATION sec video from $INPUT ($WIDTH x $HEIGHT)..."
21
+
22
+ if [ "$ZOOM" = "out" ]; then
23
+ # Zoom out: start zoomed in, end at normal
24
+ FILTER="zoompan=z='1.2-0.2*on/(${DURATION}*25)':x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':d=${DURATION}*25:s=${WIDTH}x${HEIGHT}:fps=25"
25
+ else
26
+ # Zoom in: start normal, end zoomed
27
+ FILTER="zoompan=z='1+0.2*on/(${DURATION}*25)':x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':d=${DURATION}*25:s=${WIDTH}x${HEIGHT}:fps=25"
28
+ fi
29
+
30
+ ffmpeg -y -loop 1 -i "$INPUT" \
31
+ -vf "$FILTER" \
32
+ -t "$DURATION" \
33
+ -c:v libx264 -preset fast -crf 20 \
34
+ -pix_fmt yuv420p \
35
+ "$OUTPUT"
36
+
37
+ echo "Done: $OUTPUT"
@@ -0,0 +1,59 @@
1
+ # talking character pipeline
2
+
3
+ create a talking character video with lipsync and captions
4
+
5
+ ## steps
6
+
7
+ ### 1. create character headshot
8
+ ```bash
9
+ # generate character using higgsfield soul
10
+ bun run service/image.ts soul "professional headshot of a friendly person, studio lighting" true
11
+ ```
12
+
13
+ ### 2. generate voiceover
14
+ ```bash
15
+ # use fal voice synthesis
16
+ bun run lib/fal.ts generate_speech "hello world, this is my voice" true
17
+ ```
18
+
19
+ ### 3. animate character
20
+ ```bash
21
+ # image-to-video with character talking
22
+ bun run service/video.ts from_image "person talking naturally, professional demeanor" <headshot_url> 5 true
23
+ ```
24
+
25
+ ### 4. add lipsync
26
+ ```bash
27
+ # sync lips with voiceover
28
+ bun run service/sync.ts overlay <video_url> <audio_url> output.mp4
29
+ ```
30
+
31
+ ### 5. add captions
32
+ ```bash
33
+ # add auto-generated captions with transcription
34
+ bun run service/captions.ts output.mp4 captioned.mp4 --provider fireworks
35
+ ```
36
+
37
+ ### 6. prepare for social media
38
+ ```bash
39
+ # resize and optimize for tiktok/instagram
40
+ bun run service/edit.ts social captioned.mp4 final-tiktok.mp4 tiktok
41
+ ```
42
+
43
+ ## expected output
44
+ - character headshot (png)
45
+ - voiceover audio (mp3)
46
+ - animated video (mp4)
47
+ - lipsynced video (mp4)
48
+ - captioned video (mp4)
49
+ - final social media ready video (mp4)
50
+
51
+ ## estimated time
52
+ - headshot: 30s
53
+ - voiceover: 10s
54
+ - animation: 2-3min
55
+ - lipsync: 30s
56
+ - captions: 15s (includes transcription)
57
+ - social prep: 5s
58
+
59
+ total: ~4-5min