vargai 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +7 -0
- package/.env.example +27 -0
- package/.github/workflows/ci.yml +23 -0
- package/.husky/README.md +102 -0
- package/.husky/commit-msg +6 -0
- package/.husky/pre-commit +9 -0
- package/.husky/pre-push +6 -0
- package/.size-limit.json +8 -0
- package/.test-hooks.ts +5 -0
- package/CLAUDE.md +125 -0
- package/CONTRIBUTING.md +150 -0
- package/LICENSE.md +53 -0
- package/README.md +78 -0
- package/SKILLS.md +173 -0
- package/STRUCTURE.md +92 -0
- package/biome.json +34 -0
- package/bun.lock +1254 -0
- package/commitlint.config.js +22 -0
- package/docs/plan.md +66 -0
- package/docs/todo.md +14 -0
- package/docs/varg-sdk.md +812 -0
- package/ffmpeg/CLAUDE.md +68 -0
- package/package.json +69 -0
- package/pipeline/cookbooks/SKILL.md +285 -0
- package/pipeline/cookbooks/remotion-video.md +585 -0
- package/pipeline/cookbooks/round-video-character.md +337 -0
- package/pipeline/cookbooks/scripts/animate-frames-parallel.ts +84 -0
- package/pipeline/cookbooks/scripts/combine-scenes.sh +53 -0
- package/pipeline/cookbooks/scripts/generate-frames-parallel.ts +99 -0
- package/pipeline/cookbooks/scripts/still-to-video.sh +37 -0
- package/pipeline/cookbooks/talking-character.md +59 -0
- package/pipeline/cookbooks/text-to-tiktok.md +669 -0
- package/pipeline/cookbooks/trendwatching.md +156 -0
- package/plan.md +281 -0
- package/scripts/.gitkeep +0 -0
- package/src/ai-sdk/cache.ts +142 -0
- package/src/ai-sdk/examples/cached-generation.ts +53 -0
- package/src/ai-sdk/examples/duet-scene-4.ts +53 -0
- package/src/ai-sdk/examples/duet-scene-5-audio.ts +32 -0
- package/src/ai-sdk/examples/duet-video.ts +56 -0
- package/src/ai-sdk/examples/editly-composition.ts +63 -0
- package/src/ai-sdk/examples/editly-test.ts +57 -0
- package/src/ai-sdk/examples/editly-video-test.ts +52 -0
- package/src/ai-sdk/examples/fal-lipsync.ts +43 -0
- package/src/ai-sdk/examples/higgsfield-image.ts +61 -0
- package/src/ai-sdk/examples/music-generation.ts +19 -0
- package/src/ai-sdk/examples/openai-sora.ts +34 -0
- package/src/ai-sdk/examples/replicate-bg-removal.ts +52 -0
- package/src/ai-sdk/examples/simpsons-scene.ts +61 -0
- package/src/ai-sdk/examples/talking-lion.ts +55 -0
- package/src/ai-sdk/examples/video-generation.ts +39 -0
- package/src/ai-sdk/examples/workflow-animated-girl.ts +104 -0
- package/src/ai-sdk/examples/workflow-before-after.ts +114 -0
- package/src/ai-sdk/examples/workflow-character-grid.ts +112 -0
- package/src/ai-sdk/examples/workflow-slideshow.ts +161 -0
- package/src/ai-sdk/file-cache.ts +112 -0
- package/src/ai-sdk/file.ts +238 -0
- package/src/ai-sdk/generate-element.ts +92 -0
- package/src/ai-sdk/generate-music.ts +46 -0
- package/src/ai-sdk/generate-video.ts +165 -0
- package/src/ai-sdk/index.ts +72 -0
- package/src/ai-sdk/music-model.ts +110 -0
- package/src/ai-sdk/providers/editly/editly.test.ts +1108 -0
- package/src/ai-sdk/providers/editly/ffmpeg.ts +60 -0
- package/src/ai-sdk/providers/editly/index.ts +817 -0
- package/src/ai-sdk/providers/editly/layers.ts +772 -0
- package/src/ai-sdk/providers/editly/plan.md +144 -0
- package/src/ai-sdk/providers/editly/types.ts +328 -0
- package/src/ai-sdk/providers/elevenlabs-provider.ts +255 -0
- package/src/ai-sdk/providers/fal-provider.ts +512 -0
- package/src/ai-sdk/providers/higgsfield.ts +379 -0
- package/src/ai-sdk/providers/openai.ts +251 -0
- package/src/ai-sdk/providers/replicate.ts +16 -0
- package/src/ai-sdk/video-model.ts +185 -0
- package/src/cli/commands/find.tsx +137 -0
- package/src/cli/commands/help.tsx +85 -0
- package/src/cli/commands/index.ts +9 -0
- package/src/cli/commands/list.tsx +238 -0
- package/src/cli/commands/run.tsx +511 -0
- package/src/cli/commands/which.tsx +253 -0
- package/src/cli/index.ts +112 -0
- package/src/cli/quiet.ts +44 -0
- package/src/cli/types.ts +32 -0
- package/src/cli/ui/components/Badge.tsx +29 -0
- package/src/cli/ui/components/DataTable.tsx +51 -0
- package/src/cli/ui/components/Header.tsx +23 -0
- package/src/cli/ui/components/HelpBlock.tsx +44 -0
- package/src/cli/ui/components/KeyValue.tsx +33 -0
- package/src/cli/ui/components/OptionRow.tsx +81 -0
- package/src/cli/ui/components/Separator.tsx +23 -0
- package/src/cli/ui/components/StatusBox.tsx +108 -0
- package/src/cli/ui/components/VargBox.tsx +51 -0
- package/src/cli/ui/components/VargProgress.tsx +36 -0
- package/src/cli/ui/components/VargSpinner.tsx +34 -0
- package/src/cli/ui/components/VargText.tsx +56 -0
- package/src/cli/ui/components/index.ts +19 -0
- package/src/cli/ui/index.ts +12 -0
- package/src/cli/ui/render.ts +35 -0
- package/src/cli/ui/theme.ts +63 -0
- package/src/cli/utils.ts +78 -0
- package/src/core/executor/executor.ts +201 -0
- package/src/core/executor/index.ts +13 -0
- package/src/core/executor/job.ts +214 -0
- package/src/core/executor/pipeline.ts +222 -0
- package/src/core/index.ts +11 -0
- package/src/core/registry/index.ts +9 -0
- package/src/core/registry/loader.ts +149 -0
- package/src/core/registry/registry.ts +221 -0
- package/src/core/registry/resolver.ts +206 -0
- package/src/core/schema/helpers.ts +134 -0
- package/src/core/schema/index.ts +8 -0
- package/src/core/schema/shared.ts +102 -0
- package/src/core/schema/types.ts +279 -0
- package/src/core/schema/validator.ts +92 -0
- package/src/definitions/actions/captions.ts +261 -0
- package/src/definitions/actions/edit.ts +298 -0
- package/src/definitions/actions/image.ts +125 -0
- package/src/definitions/actions/index.ts +114 -0
- package/src/definitions/actions/music.ts +205 -0
- package/src/definitions/actions/sync.ts +128 -0
- package/src/definitions/actions/transcribe.ts +200 -0
- package/src/definitions/actions/upload.ts +111 -0
- package/src/definitions/actions/video.ts +163 -0
- package/src/definitions/actions/voice.ts +119 -0
- package/src/definitions/index.ts +23 -0
- package/src/definitions/models/elevenlabs.ts +50 -0
- package/src/definitions/models/flux.ts +56 -0
- package/src/definitions/models/index.ts +36 -0
- package/src/definitions/models/kling.ts +56 -0
- package/src/definitions/models/llama.ts +54 -0
- package/src/definitions/models/nano-banana-pro.ts +102 -0
- package/src/definitions/models/sonauto.ts +68 -0
- package/src/definitions/models/soul.ts +65 -0
- package/src/definitions/models/wan.ts +54 -0
- package/src/definitions/models/whisper.ts +44 -0
- package/src/definitions/skills/index.ts +12 -0
- package/src/definitions/skills/talking-character.ts +87 -0
- package/src/definitions/skills/text-to-tiktok.ts +97 -0
- package/src/index.ts +118 -0
- package/src/providers/apify.ts +269 -0
- package/src/providers/base.ts +264 -0
- package/src/providers/elevenlabs.ts +217 -0
- package/src/providers/fal.ts +392 -0
- package/src/providers/ffmpeg.ts +544 -0
- package/src/providers/fireworks.ts +193 -0
- package/src/providers/groq.ts +149 -0
- package/src/providers/higgsfield.ts +145 -0
- package/src/providers/index.ts +143 -0
- package/src/providers/replicate.ts +147 -0
- package/src/providers/storage.ts +206 -0
- package/src/tests/all.test.ts +509 -0
- package/src/tests/index.ts +33 -0
- package/src/tests/unit.test.ts +403 -0
- package/tsconfig.json +45 -0
|
@@ -0,0 +1,337 @@
|
|
|
1
|
+
# round video character cookbook
|
|
2
|
+
|
|
3
|
+
create realistic round selfie videos for telegram: front-facing camera POV videos with authentic camera shake, lighting, and audio
|
|
4
|
+
|
|
5
|
+
## what this does
|
|
6
|
+
|
|
7
|
+
1. generates 3 first frame options: person in specified setting (conference, station, etc)
|
|
8
|
+
2. ai picks the best first frame from the 3 options
|
|
9
|
+
3. generates voiceover from text script
|
|
10
|
+
4. creates talking video using wan 2.5 with audio sync
|
|
11
|
+
|
|
12
|
+
## inputs
|
|
13
|
+
|
|
14
|
+
- `text_script`: what the person will say
|
|
15
|
+
- `profile_photo`: photo of the person (e.g., media/friend/katia.jpg)
|
|
16
|
+
- `scene_location`: where they are (from script or default: conference/underground station)
|
|
17
|
+
|
|
18
|
+
## steps
|
|
19
|
+
|
|
20
|
+
### step 1: generate first frame options (person in setting)
|
|
21
|
+
|
|
22
|
+
generate 3 variations and let ai pick the best one:
|
|
23
|
+
|
|
24
|
+
```bash
|
|
25
|
+
# generate 3 SELFIE-STYLE first frame options using nano banana pro
|
|
26
|
+
# CRITICAL: use the proven prompt structure below
|
|
27
|
+
# aspect_ratio "auto" preserves the original photo's aspect ratio (portrait/landscape)
|
|
28
|
+
|
|
29
|
+
# option 1
|
|
30
|
+
bun run lib/fal.ts image_to_image \
|
|
31
|
+
"selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it" \
|
|
32
|
+
media/friend/katia.jpg \
|
|
33
|
+
auto
|
|
34
|
+
|
|
35
|
+
# option 2
|
|
36
|
+
bun run lib/fal.ts image_to_image \
|
|
37
|
+
"selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it" \
|
|
38
|
+
media/friend/katia.jpg \
|
|
39
|
+
auto
|
|
40
|
+
|
|
41
|
+
# option 3
|
|
42
|
+
bun run lib/fal.ts image_to_image \
|
|
43
|
+
"selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it" \
|
|
44
|
+
media/friend/katia.jpg \
|
|
45
|
+
auto
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
**important prompting for selfie style (image-to-image):**
|
|
49
|
+
- start with "selfie POV" - simple and effective
|
|
50
|
+
- include "camera with subtle natural wobble and shake throughout"
|
|
51
|
+
- specify "focus on subject with shallow depth of field"
|
|
52
|
+
- lighting: "dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast"
|
|
53
|
+
- background: "ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights"
|
|
54
|
+
- clothing: "wear black hoodie without any text on it" (or specify other clothing)
|
|
55
|
+
- location: flexible - adjust based on script (hackathon space, metro station, office, etc.)
|
|
56
|
+
- aspect ratio "auto" preserves original dimensions - critical for avoiding squashed/stretched video!
|
|
57
|
+
|
|
58
|
+
each command outputs a URL like: `https://v3b.fal.media/files/.../image.jpg`
|
|
59
|
+
|
|
60
|
+
download all 3 options:
|
|
61
|
+
```bash
|
|
62
|
+
curl -o media/friend/katia/option1.jpg "https://url-from-option-1.jpg"
|
|
63
|
+
curl -o media/friend/katia/option2.jpg "https://url-from-option-2.jpg"
|
|
64
|
+
curl -o media/friend/katia/option3.jpg "https://url-from-option-3.jpg"
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
**ai should review the 3 options and pick the best one based on:**
|
|
68
|
+
- face quality and recognition
|
|
69
|
+
- natural selfie look
|
|
70
|
+
- lighting and color balance
|
|
71
|
+
- background blur and composition
|
|
72
|
+
- overall authenticity
|
|
73
|
+
|
|
74
|
+
use the selected image (downloaded file or url) for step 3 (wan 2.5)
|
|
75
|
+
|
|
76
|
+
### step 2: generate voiceover
|
|
77
|
+
|
|
78
|
+
```bash
|
|
79
|
+
# generate voice from script
|
|
80
|
+
# save to media/friend/[name]/voice.mp3 for organization
|
|
81
|
+
bun run lib/elevenlabs.ts tts \
|
|
82
|
+
"hey everyone! excited to share this update from the conference" \
|
|
83
|
+
rachel \
|
|
84
|
+
media/friend/katia/voice.mp3
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
the audio is saved to `media/friend/[name]/voice.mp3`. no manual upload is needed: the fal wan command in step 3 auto-uploads local files (a url also works).
|
|
88
|
+
|
|
89
|
+
### step 3: generate talking video with wan 2.5 (via fal)
|
|
90
|
+
|
|
91
|
+
**important: audio must be at least 3 seconds long!**
|
|
92
|
+
|
|
93
|
+
fal's wan-25 endpoint requires audio duration of 3+ seconds. if your script is too short, extend it.
|
|
94
|
+
|
|
95
|
+
```bash
|
|
96
|
+
# use fal's wan-25 endpoint (supports local files and urls)
|
|
97
|
+
# audio and image files will be auto-uploaded if local paths are provided
|
|
98
|
+
# duration MUST be 5 or 10 seconds only
|
|
99
|
+
bun run lib/fal.ts wan \
|
|
100
|
+
media/friend/katia/option2.jpg \
|
|
101
|
+
media/friend/katia/voice.mp3 \
|
|
102
|
+
"front-facing camera selfie POV video, handheld phone directly in front of face with continuous slight wobble and shake, subject in sharp focus with softly blurred background shallow depth of field, dramatic low-light scene with intense magenta hot pink light illuminating face and blue ambient lights in blurred background, dark indoor busy conference setting with abstract out-of-focus lights, conversational audio with muffled background crowd chatter and commotion" \
|
|
103
|
+
10 \
|
|
104
|
+
480p
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
**command structure:**
|
|
108
|
+
```bash
|
|
109
|
+
bun run lib/fal.ts wan <image_path_or_url> <audio_path_or_url> <prompt> [duration] [resolution]
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
**parameters:**
|
|
113
|
+
- image: local path or url (auto-uploaded if local)
|
|
114
|
+
- audio: local path or url (auto-uploaded if local, **must be 3+ seconds**)
|
|
115
|
+
- prompt: detailed video style description
|
|
116
|
+
- duration: 5 or 10 (default: 5)
|
|
117
|
+
- resolution: 480p, 720p, or 1080p (default: 480p)
|
|
118
|
+
|
|
119
|
+
**detailed prompt structure for realistic selfie videos:**
|
|
120
|
+
|
|
121
|
+
the prompt should include ALL these elements for maximum authenticity:
|
|
122
|
+
|
|
123
|
+
**camera technique:**
|
|
124
|
+
- "front-facing camera selfie POV video"
|
|
125
|
+
- "handheld phone directly in front of face"
|
|
126
|
+
- "continuous slight wobble and shake"
|
|
127
|
+
|
|
128
|
+
**focus & composition:**
|
|
129
|
+
- "subject in sharp focus"
|
|
130
|
+
- "softly blurred background shallow depth of field"
|
|
131
|
+
|
|
132
|
+
**lighting:**
|
|
133
|
+
- "dramatic low-light scene"
|
|
134
|
+
- "intense magenta hot pink light illuminating face" (or specify your lighting color)
|
|
135
|
+
- "blue ambient lights in blurred background" (optional, for busy settings)
|
|
136
|
+
|
|
137
|
+
**setting:**
|
|
138
|
+
- "dark indoor busy setting with abstract out-of-focus lights" (adjust based on location)
|
|
139
|
+
|
|
140
|
+
**audio characteristics:**
|
|
141
|
+
- "conversational audio with muffled background crowd chatter and commotion"
|
|
142
|
+
|
|
143
|
+
this comprehensive prompting creates videos that look like authentic, quickly-recorded selfie messages with realistic imperfections.
|
|
144
|
+
|
|
145
|
+
this takes 2-4 minutes. the command will wait for completion and output the video url.
|
|
146
|
+
|
|
147
|
+
### step 4: download result
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
# fal wan-25 returns video url like: https://v3b.fal.media/files/.../video.mp4
|
|
151
|
+
curl -o media/friend/katia/talking-character.mp4 "https://v3b.fal.media/files/.../video.mp4"
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
## output
|
|
155
|
+
|
|
156
|
+
- first frame options: 3 variations (jpg) - `media/friend/[name]/option1.jpg`, `option2.jpg`, `option3.jpg`
|
|
157
|
+
- selected first frame: best option chosen by ai
|
|
158
|
+
- voiceover: `media/friend/[name]/voice.mp3`
|
|
159
|
+
- final video: `media/friend/[name]/talking-character.mp4`
|
|
160
|
+
|
|
161
|
+
## timing
|
|
162
|
+
|
|
163
|
+
- first frame generation: 15-30s (3 options)
|
|
164
|
+
- ai selection: instant
|
|
165
|
+
- voiceover: 5-10s
|
|
166
|
+
- wan 2.5 processing (fal): 2-4min
|
|
167
|
+
|
|
168
|
+
**total: ~3-5 minutes**
|
|
169
|
+
|
|
170
|
+
## scene context examples
|
|
171
|
+
|
|
172
|
+
choose setting based on script context. always include handheld camera description for authentic look:
|
|
173
|
+
|
|
174
|
+
| script mentions | step 1: first frame prompt | wan 2.5 prompt (detailed style) |
|
|
175
|
+
|----------------|---------------------------|--------------------------------|
|
|
176
|
+
| "at the conference" / "hackathon" | selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it | front-facing camera selfie POV video, handheld phone directly in front of face with continuous slight wobble, subject in sharp focus with softly blurred background, dramatic low-light with intense magenta hot pink light illuminating face and blue ambient lights in blurred background, dark indoor busy conference setting with abstract out-of-focus lights, conversational audio with muffled background crowd chatter |
|
|
177
|
+
| "subway" / "metro" | selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with harsh fluorescent lighting, ambient lights scattered in background, dark underground station setting, abstract out-of-focus lights, location: metro station, wear black hoodie without any text on it | front-facing camera selfie POV video, handheld phone with slight shake, sharp focus on subject with blurred metro background, harsh fluorescent lighting with cool tones, dark underground station with out-of-focus commuters and lights, audio with echoing background noise and distant train sounds |
|
|
178
|
+
| "office" | selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, soft indoor office lighting environment, ambient lights in background, modern workspace setting, abstract out-of-focus monitors and lights, location: office, wear black hoodie without any text on it | front-facing camera selfie POV video, handheld phone wobble, sharp subject focus with blurred office background, soft indoor office lighting, modern workspace with blurred monitors and colleagues in background, conversational audio with quiet office ambient noise |
|
|
179
|
+
| "street" | selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, natural daylight or street lighting environment, ambient lights in background, urban street setting, abstract out-of-focus pedestrians and lights, location: city street, wear black hoodie without any text on it | front-facing camera selfie POV video, handheld shake, sharp focus with blurred street background, natural daylight or street lighting, urban setting with out-of-focus pedestrians and traffic, audio with street noise and distant traffic sounds |
|
|
180
|
+
| no location | selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights, wear black hoodie without any text on it | front-facing camera selfie POV video, handheld phone with slight wobble, sharp subject with softly blurred background, natural indoor lighting, casual indoor setting, conversational audio (default) |
|
|
181
|
+
|
|
182
|
+
**key phrases for authentic selfie look:**
|
|
183
|
+
|
|
184
|
+
**step 1 (first frame - image-to-image):**
|
|
185
|
+
|
|
186
|
+
proven prompt structure (adjust location only):
|
|
187
|
+
```
|
|
188
|
+
selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: [hackathon space/metro station/office/city street], wear black hoodie without any text on it
|
|
189
|
+
```
|
|
190
|
+
|
|
191
|
+
- start with "selfie POV" - simple, no zoom confusion
|
|
192
|
+
- "camera with subtle natural wobble and shake throughout" - natural movement
|
|
193
|
+
- "focus on subject with shallow depth of field" - proper framing
|
|
194
|
+
- lighting: magenta/hot pink with blue ambient (adjust per setting)
|
|
195
|
+
- location: flexible - change based on script
|
|
196
|
+
- clothing: black hoodie without text (or adjust as needed)
|
|
197
|
+
|
|
198
|
+
**step 3 (wan 2.5) - comprehensive style elements:**
|
|
199
|
+
|
|
200
|
+
*camera technique:*
|
|
201
|
+
- "front-facing camera selfie POV video"
|
|
202
|
+
- "handheld phone directly in front of face"
|
|
203
|
+
- "continuous slight wobble and shake"
|
|
204
|
+
|
|
205
|
+
*focus & depth:*
|
|
206
|
+
- "subject in sharp focus"
|
|
207
|
+
- "softly blurred background"
|
|
208
|
+
- "shallow depth of field"
|
|
209
|
+
|
|
210
|
+
*lighting:*
|
|
211
|
+
- "dramatic low-light scene"
|
|
212
|
+
- "intense magenta hot pink light illuminating face" (adjust color per setting)
|
|
213
|
+
- "blue ambient lights in blurred background" (optional)
|
|
214
|
+
|
|
215
|
+
*setting:*
|
|
216
|
+
- "dark indoor busy setting"
|
|
217
|
+
- "abstract out-of-focus lights"
|
|
218
|
+
- adjust per location (conference/metro/office/street)
|
|
219
|
+
|
|
220
|
+
*audio:*
|
|
221
|
+
- "conversational audio with muffled background crowd chatter and commotion"
|
|
222
|
+
- adjust per setting (metro=echoing/train sounds, office=quiet ambient, street=traffic)
|
|
223
|
+
|
|
224
|
+
## example: full workflow
|
|
225
|
+
|
|
226
|
+
```bash
|
|
227
|
+
# scenario: katia sharing conference update
|
|
228
|
+
# script: "hey everyone! i'm so excited to share this amazing update with you from the conference today"
|
|
229
|
+
# photo: media/friend/katia.jpg
|
|
230
|
+
# note: audio must be 3+ seconds long for wan-25!
|
|
231
|
+
|
|
232
|
+
# step 1: generate 3 SELFIE first frame options with nano banana pro
|
|
233
|
+
bun run lib/fal.ts image_to_image \
|
|
234
|
+
"selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it" \
|
|
235
|
+
media/friend/katia.jpg \
|
|
236
|
+
auto
|
|
237
|
+
# output 1: https://v3b.fal.media/files/.../option1.png
|
|
238
|
+
|
|
239
|
+
bun run lib/fal.ts image_to_image \
|
|
240
|
+
"selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it" \
|
|
241
|
+
media/friend/katia.jpg \
|
|
242
|
+
auto
|
|
243
|
+
# output 2: https://v3b.fal.media/files/.../option2.png
|
|
244
|
+
|
|
245
|
+
bun run lib/fal.ts image_to_image \
|
|
246
|
+
"selfie POV, camera with subtle natural wobble and shake throughout, focus on subject with shallow depth of field, dramatic low-light environment with intense magenta and hot pink lighting creating strong color cast, ambient blue lights scattered in background, dark indoor busy setting, abstract out-of-focus colorful lights creating busy background, location: hackathon space, wear black hoodie without any text on it" \
|
|
247
|
+
media/friend/katia.jpg \
|
|
248
|
+
auto
|
|
249
|
+
# output 3: https://v3b.fal.media/files/.../option3.png
|
|
250
|
+
|
|
251
|
+
# download all 3 options
|
|
252
|
+
curl -o media/friend/katia/option1.jpg "https://v3b.fal.media/files/.../option1.png"
|
|
253
|
+
curl -o media/friend/katia/option2.jpg "https://v3b.fal.media/files/.../option2.png"
|
|
254
|
+
curl -o media/friend/katia/option3.jpg "https://v3b.fal.media/files/.../option3.png"
|
|
255
|
+
|
|
256
|
+
# ai reviews the 3 options and picks the best one based on:
|
|
257
|
+
# - face quality and recognition
|
|
258
|
+
# - natural selfie look
|
|
259
|
+
# - lighting and color balance
|
|
260
|
+
# - background blur and composition
|
|
261
|
+
# - overall authenticity
|
|
262
|
+
# selected: option2 (example)
|
|
263
|
+
|
|
264
|
+
# step 2: generate voice (ensure 3+ seconds for wan-25)
|
|
265
|
+
bun run lib/elevenlabs.ts tts \
|
|
266
|
+
"hey everyone! i'm so excited to share this amazing update with you from the conference today" \
|
|
267
|
+
rachel \
|
|
268
|
+
media/friend/katia/voice.mp3
|
|
269
|
+
# output: media/friend/katia/voice.mp3
|
|
270
|
+
|
|
271
|
+
# step 3: run wan-25 (fal) - auto-uploads local files
|
|
272
|
+
bun run lib/fal.ts wan \
|
|
273
|
+
media/friend/katia/option2.jpg \
|
|
274
|
+
media/friend/katia/voice.mp3 \
|
|
275
|
+
"front-facing camera selfie POV video, handheld phone directly in front of face with continuous slight wobble and shake, subject in sharp focus with softly blurred background shallow depth of field, dramatic low-light scene with intense magenta hot pink light illuminating face and blue ambient lights in blurred background, dark indoor busy conference setting with abstract out-of-focus lights, conversational audio with muffled background crowd chatter and commotion" \
|
|
276
|
+
10 \
|
|
277
|
+
480p
|
|
278
|
+
# takes 2-4 minutes...
|
|
279
|
+
# output: { "data": { "video": { "url": "https://v3b.fal.media/files/.../video.mp4" } } }
|
|
280
|
+
|
|
281
|
+
# step 4: download
|
|
282
|
+
curl -o media/friend/katia/talking-character.mp4 "https://v3b.fal.media/files/.../video.mp4"
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
**tested successfully** with katia.jpg and aleks - see media/friend/ for example outputs!
|
|
286
|
+
|
|
287
|
+
## tips
|
|
288
|
+
|
|
289
|
+
- **selfie perspective**: CRITICAL - always use "selfie POV" in step 1 first frame generation!
|
|
290
|
+
- **audio duration**: CRITICAL - wan-25 requires audio to be at least 3 seconds long. extend short scripts!
|
|
291
|
+
- **duration constraint**: wan-25 only accepts 5 or 10 second videos
|
|
292
|
+
- **script length**: ensure script is at least 3 seconds when spoken, max 10 seconds
|
|
293
|
+
- **aspect ratio preservation**: CRITICAL - always use "auto" aspect ratio in image-to-image to avoid squashed/stretched videos!
|
|
294
|
+
- **nano banana pro**: uses aspect_ratio="auto" to preserve original photo dimensions (portrait/landscape)
|
|
295
|
+
- **local file support**: fal wan command auto-uploads local files - no need for manual upload step!
|
|
296
|
+
- **handheld camera**: always include "handheld phone" + "wobble and shake" in wan-25 prompt for authentic look
|
|
297
|
+
- **first frame quality**: this is the base - make it look natural and selfie-like!
|
|
298
|
+
- **scene matching**: extract location from script when mentioned
|
|
299
|
+
- **voice selection**: rachel (default) is clear and professional
|
|
300
|
+
- **resolution**: 480p is faster (2-3min), 720p/1080p takes longer (4-5min)
|
|
301
|
+
- **save intermediates**: store outputs in media/friend/[name]/ for organization and reuse
|
|
302
|
+
- **using fal instead of replicate**: fal's wan-25 endpoint is faster and more reliable than replicate
|
|
303
|
+
|
|
304
|
+
## voice options
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
# female voices (american english)
|
|
308
|
+
bun run lib/elevenlabs.ts tts "script" rachel media/friend/[name]/voice.mp3
|
|
309
|
+
bun run lib/elevenlabs.ts tts "script" bella media/friend/[name]/voice.mp3
|
|
310
|
+
bun run lib/elevenlabs.ts tts "script" elli media/friend/[name]/voice.mp3
|
|
311
|
+
|
|
312
|
+
# male voices (american english)
|
|
313
|
+
bun run lib/elevenlabs.ts tts "script" antoni media/friend/[name]/voice.mp3
|
|
314
|
+
bun run lib/elevenlabs.ts tts "script" josh media/friend/[name]/voice.mp3
|
|
315
|
+
```
|
|
316
|
+
|
|
317
|
+
see all voices: `bun run lib/elevenlabs.ts voices`
|
|
318
|
+
|
|
319
|
+
## environment setup
|
|
320
|
+
|
|
321
|
+
```bash
|
|
322
|
+
# required api keys
|
|
323
|
+
export ELEVENLABS_API_KEY="your_key"
|
|
324
|
+
export FAL_KEY="your_key" # for wan-25 and image generation
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
## changelog
|
|
328
|
+
|
|
329
|
+
**2024-11-22:**
|
|
330
|
+
- switched to fal's wan-25-preview endpoint (faster, more reliable than replicate)
|
|
331
|
+
- added wan-25 support to lib/fal.ts with auto-upload for local files
|
|
332
|
+
- discovered: audio must be at least 3 seconds long for wan-25 (critical!)
|
|
333
|
+
- simplified workflow: no manual audio upload step needed
|
|
334
|
+
- tested successfully with aleks photo and "give me money" script
|
|
335
|
+
- switched from flux to nano banana pro for image-to-image (better aspect ratio preservation)
|
|
336
|
+
- fixed squashed video issue by using aspect_ratio="auto"
|
|
337
|
+
- clarified duration constraints (5 or 10 seconds only)
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Animate multiple frames in parallel using kling
|
|
3
|
+
* Usage: bun run pipeline/cookbooks/scripts/animate-frames-parallel.ts
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { fal } from "@fal-ai/client";
|
|
7
|
+
|
|
8
|
+
/** Describes one clip to animate: the source frame, its prompt, and the output name. */
interface VideoConfig {
  /** Local path of the still frame that is uploaded and animated. */
  framePath: string;
  /** Base name of the result; the clip is written as `<name>_video.mp4`. */
  name: string;
  /** Text prompt sent along with the frame. */
  prompt: string;
  /** Clip length as a string literal; "5" is used when omitted. */
  duration?: "5" | "10";
}
|
|
14
|
+
|
|
15
|
+
/**
 * Animate every frame with kling image-to-video and save the resulting clips.
 *
 * Frames are uploaded to fal storage one after another; the generation
 * requests are then issued together and awaited as a group. A failure of one
 * clip (generation rejected, no video URL in the response, or a failed
 * download) no longer discards the clips that did succeed: each failure is
 * logged, the remaining clips are still saved, and a single error is thrown
 * at the end listing the clips that are missing.
 *
 * @param configs - one entry per clip (frame path, prompt, output name, optional duration)
 * @param outputDir - directory that receives `<name>_video.mp4` for each clip
 * @throws Error if a frame upload fails, or if at least one clip could not be
 *   generated or downloaded
 */
async function animateFrames(configs: VideoConfig[], outputDir: string) {
  console.log(`Animating ${configs.length} frames in parallel...\n`);

  // Upload all frames first
  const frameUrls: string[] = [];
  for (const config of configs) {
    const url = await fal.storage.upload(Bun.file(config.framePath));
    frameUrls.push(url);
  }

  // allSettled (not all): one rejected generation must not throw away the
  // results of the others.
  const settled = await Promise.allSettled(
    configs.map((config, i) =>
      fal.subscribe("fal-ai/kling-video/v2.5-turbo/pro/image-to-video", {
        input: {
          prompt: `${config.prompt}, NO talking NO lip movement`,
          image_url: frameUrls[i],
          duration: config.duration || "5",
          // note: aspect_ratio is determined by input image dimensions
        },
      }),
    ),
  );

  const failed: string[] = [];

  for (const [i, outcome] of settled.entries()) {
    const config = configs[i];
    if (!config) {
      continue;
    }

    if (outcome.status === "rejected") {
      console.error(`Generation failed for ${config.name}:`, outcome.reason);
      failed.push(config.name);
      continue;
    }

    const result = outcome.value as { data?: { video?: { url?: string } } };
    const url = result.data?.video?.url;
    if (!url) {
      console.error(`No URL for ${config.name}`);
      failed.push(config.name);
      continue;
    }

    try {
      const response = await fetch(url);
      // Without this check an HTTP error page would be written out as an .mp4.
      if (!response.ok) {
        throw new Error(`HTTP ${response.status} ${response.statusText}`);
      }
      const buffer = await response.arrayBuffer();
      await Bun.write(`${outputDir}/${config.name}_video.mp4`, buffer);
      console.log(`${config.name}_video.mp4 saved`);
    } catch (err) {
      console.error(`Download failed for ${config.name}:`, err);
      failed.push(config.name);
    }
  }

  if (failed.length > 0) {
    throw new Error(
      `Failed to animate ${failed.length} of ${configs.length} frames: ${failed.join(", ")}`,
    );
  }

  console.log("\nAll videos saved!");
}
|
|
54
|
+
|
|
55
|
+
// Example usage:
|
|
56
|
+
/**
 * Example entry point: animates three pre-generated scene frames that live in
 * `media/girl-ruined-you` and writes the videos next to them.
 */
async function main() {
  const outputDir = "media/girl-ruined-you";

  const configs: VideoConfig[] = [
    {
      name: "scene6",
      framePath: `${outputDir}/scene6_frame.jpg`,
      prompt:
        "3D pixar animation, two cats meet eyes in coffee shop, warm romantic moment",
      duration: "5",
    },
    {
      name: "scene7",
      framePath: `${outputDir}/scene7_frame.jpg`,
      prompt: "3D pixar animation, two cats walking together, sunset, romantic",
      duration: "5",
    },
    {
      name: "scene14",
      framePath: `${outputDir}/scene14_frame.jpg`,
      prompt: "3D pixar animation, cat looks at sunrise, hopeful realization",
      duration: "5",
    },
  ];

  await animateFrames(configs, outputDir);
}

main().catch((error) => {
  console.error(error);
  // Report failure to the shell so callers (scripts, CI) can detect it.
  process.exitCode = 1;
});
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
#!/bin/bash
# Combine multiple scene videos with audio clips
# Usage: ./combine-scenes.sh <project_dir>
#
# For every entry in SCENES this script:
#   1. cuts the matching slice out of <project_dir>/voiceover.mp3
#   2. loops scene<N>_video.mp4 until it covers that slice and muxes the two
#   3. concatenates all scene<N>_final.mp4 files into combined_scenes.mp4

# Stop at the first failing command instead of building on missing files.
set -euo pipefail

PROJECT_DIR=${1:-"media/girl-ruined-you"}

# Length in seconds of each source scene video (used to work out looping).
CLIP_SECONDS=5

# Scene timing configuration (adjust as needed)
# Format: scene_num:start_time:duration
SCENES=(
  "1:0:3.5"
  "2:3.5:6.5"
  "3:10:10"
  "4:20:15"
  "5:35:7"
)

if [ ! -d "$PROJECT_DIR" ]; then
  echo "Project directory not found: $PROJECT_DIR" >&2
  exit 1
fi

if [ ! -f "$PROJECT_DIR/voiceover.mp3" ]; then
  echo "Missing voiceover: $PROJECT_DIR/voiceover.mp3" >&2
  exit 1
fi

echo "Extracting audio clips..."
for scene_config in "${SCENES[@]}"; do
  IFS=':' read -r num start dur <<< "$scene_config"
  # -loglevel error keeps the output quiet but still shows real failures.
  ffmpeg -hide_banner -loglevel error -y -i "$PROJECT_DIR/voiceover.mp3" \
    -ss "$start" -t "$dur" "$PROJECT_DIR/audio_scene${num}.mp3"
  echo "  audio_scene${num}.mp3 ($dur sec)"
done

echo ""
echo "Combining videos with audio..."
for scene_config in "${SCENES[@]}"; do
  IFS=':' read -r num start dur <<< "$scene_config"

  # -stream_loop N plays the clip N+1 times, so we need ceil(dur / clip) - 1
  # extra loops. Truncating division would leave e.g. a 6.5s scene with a
  # single 5s pass, and -shortest would then cut the audio short.
  loops=$(awk -v d="$dur" -v c="$CLIP_SECONDS" \
    'BEGIN { n = int(d / c); if (n * c < d) n++; print (n > 1 ? n - 1 : 0) }')

  ffmpeg -hide_banner -loglevel error -y \
    -stream_loop "$loops" -i "$PROJECT_DIR/scene${num}_video.mp4" \
    -i "$PROJECT_DIR/audio_scene${num}.mp3" \
    -t "$dur" -c:v libx264 -preset fast -crf 20 -c:a aac -b:a 128k -shortest \
    "$PROJECT_DIR/scene${num}_final.mp4"
  echo "  scene${num}_final.mp4"
done

echo ""
echo "Creating concat file..."
rm -f "$PROJECT_DIR/scenes.txt"
for scene_config in "${SCENES[@]}"; do
  IFS=':' read -r num _ _ <<< "$scene_config"
  echo "file 'scene${num}_final.mp4'" >> "$PROJECT_DIR/scenes.txt"
done

echo "Concatenating all scenes..."
# Entries in scenes.txt are relative, so run the concat from the project dir.
cd "$PROJECT_DIR"
ffmpeg -hide_banner -loglevel error -y -f concat -safe 0 -i scenes.txt -c copy combined_scenes.mp4

duration=$(ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 combined_scenes.mp4)
echo ""
echo "Done! combined_scenes.mp4 ($duration sec)"
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Generate multiple scene frames in parallel using flux kontext
|
|
3
|
+
* Usage: bun run pipeline/cookbooks/scripts/generate-frames-parallel.ts
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
import { fal } from "@fal-ai/client";
|
|
7
|
+
|
|
8
|
+
/** Describes one scene frame to generate from character reference images. */
interface FrameConfig {
  /** Base name of the result; the image is written as `<name>_frame.jpg`. */
  name: string;

  /** Text prompt describing the scene. */
  prompt: string;

  /** Character reference URLs; only the first is used when `multi` is not set. */
  imageUrls: string[];

  /** Use kontext/multi (all reference images) for multiple characters. */
  multi?: boolean;
}
|
|
14
|
+
|
|
15
|
+
/**
 * Generate one 9:16 frame per config with Flux Kontext and save each result
 * as `<outputDir>/<name>_frame.jpg`.
 *
 * Configs with `multi` set go to the kontext/multi endpoint with every
 * reference image; the others go to the single-image endpoint with the first
 * reference image. All configs run concurrently and independently: a failure
 * in one does not discard the frames the others produced.
 *
 * @param configs   Frames to generate.
 * @param outputDir Directory the resulting images are written to.
 * @throws Error after all frames have settled if at least one of them failed
 *         (no reference image, generation error, missing image URL, or a
 *         non-2xx download response).
 */
async function generateFrames(configs: FrameConfig[], outputDir: string) {
  console.log(`Generating ${configs.length} frames in parallel...\n`);

  // allSettled instead of all: one rejected job must not throw away the
  // frames that were generated successfully.
  const outcomes = await Promise.allSettled(
    configs.map(async (config) => {
      const [primaryImage] = config.imageUrls;
      // Fail early rather than sending an empty image URL to the API.
      if (!primaryImage) {
        throw new Error(`No reference image for ${config.name}`);
      }

      const result = (
        config.multi
          ? await fal.subscribe("fal-ai/flux-pro/kontext/multi", {
              input: {
                prompt: config.prompt,
                image_urls: config.imageUrls,
                aspect_ratio: "9:16" as const,
              },
            })
          : await fal.subscribe("fal-ai/flux-pro/kontext", {
              input: {
                prompt: config.prompt,
                image_url: primaryImage,
                aspect_ratio: "9:16" as const,
              },
            })
      ) as { data?: { images?: Array<{ url?: string }> } };

      const imageUrl = result.data?.images?.[0]?.url;
      if (!imageUrl) {
        throw new Error(`No URL for ${config.name}`);
      }

      const response = await fetch(imageUrl);
      // Without this check an HTTP error page would be saved as the .jpg.
      if (!response.ok) {
        throw new Error(
          `Download failed for ${config.name}: HTTP ${response.status} ${response.statusText}`,
        );
      }

      const fileName = `${config.name}_frame.jpg`;
      await Bun.write(`${outputDir}/${fileName}`, await response.arrayBuffer());
      console.log(`${fileName} saved`);
    }),
  );

  let failed = 0;
  outcomes.forEach((outcome, i) => {
    if (outcome.status === "rejected") {
      failed++;
      console.error(`${configs[i]?.name ?? "unknown"} failed:`, outcome.reason);
    }
  });

  if (failed > 0) {
    throw new Error(`${failed} of ${configs.length} frames failed`);
  }

  console.log("\nAll frames saved!");
}
|
|
58
|
+
|
|
59
|
+
// Example usage:
|
|
60
|
+
/**
 * Example entry point: uploads the two character reference images from
 * `media/girl-ruined-you` and generates three scene frames from them.
 */
async function main() {
  const outputDir = "media/girl-ruined-you";

  // Upload character references first; the two uploads are independent, so
  // run them concurrently.
  const [protagonist, secondGirl] = await Promise.all([
    fal.storage.upload(Bun.file(`${outputDir}/cat_protagonist.png`)),
    fal.storage.upload(Bun.file(`${outputDir}/cat_second_girl.png`)),
  ]);

  const configs: FrameConfig[] = [
    {
      name: "scene6",
      prompt:
        "3D pixar style: male cat in hoodie (first) and elegant female cat (second) meeting eyes in coffee shop, warm golden lighting, vertical portrait 9:16",
      imageUrls: [protagonist, secondGirl],
      multi: true,
    },
    {
      name: "scene7",
      prompt:
        "3D pixar style: male cat and female cat walking together, sunset, romantic, vertical portrait 9:16",
      imageUrls: [protagonist, secondGirl],
      multi: true,
    },
    // Single character scene
    {
      name: "scene14",
      prompt:
        "Place this cat looking at sunrise through window, hopeful, vertical portrait 9:16",
      imageUrls: [protagonist],
      multi: false,
    },
  ];

  await generateFrames(configs, outputDir);
}

main().catch((error) => {
  console.error(error);
  // Report failure to the shell so callers (scripts, CI) can detect it.
  process.exitCode = 1;
});
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
#!/bin/bash
# Convert still frame to video with ken burns effect (slow zoom)
# Usage: ./still-to-video.sh <input.jpg> <output.mp4> <duration> [zoom_direction]
# zoom_direction: in (default), out
#
# The output keeps the input's dimensions (rounded down to even numbers) and
# is rendered at a fixed 25 fps.

INPUT=$1
OUTPUT=$2
DURATION=$3
ZOOM=${4:-"in"}

# Output frame rate; also used to turn the duration into a frame count.
FPS=25

if [ -z "$INPUT" ] || [ -z "$OUTPUT" ] || [ -z "$DURATION" ]; then
  echo "Usage: ./still-to-video.sh <input.jpg> <output.mp4> <duration> [in|out]" >&2
  exit 1
fi

if [ ! -f "$INPUT" ]; then
  echo "Input file not found: $INPUT" >&2
  exit 1
fi

# DURATION is pasted into the filter expression below, so insist on a plain
# positive number.
number_re='^[0-9]+([.][0-9]+)?$'
if ! [[ "$DURATION" =~ $number_re ]] || ! awk -v d="$DURATION" 'BEGIN { exit !(d > 0) }'; then
  echo "Duration must be a positive number of seconds, got: $DURATION" >&2
  exit 1
fi

# Get input dimensions (printed by ffprobe as WIDTHxHEIGHT)
DIMENSIONS=$(ffprobe -v error -select_streams v:0 -show_entries stream=width,height -of csv=s=x:p=0 "$INPUT")
IFS=x read -r WIDTH HEIGHT _ <<< "$DIMENSIONS"

integer_re='^[0-9]+$'
if ! [[ "$WIDTH" =~ $integer_re ]] || ! [[ "$HEIGHT" =~ $integer_re ]]; then
  echo "Could not read image dimensions from $INPUT" >&2
  exit 1
fi

# libx264 with yuv420p rejects odd dimensions, so round down to even values.
WIDTH=$(( WIDTH - WIDTH % 2 ))
HEIGHT=$(( HEIGHT - HEIGHT % 2 ))

echo "Creating $DURATION sec video from $INPUT ($WIDTH x $HEIGHT)..."

# Total number of output frames, left as an expression for ffmpeg to evaluate.
FRAMES="${DURATION}*${FPS}"

case "$ZOOM" in
  out)
    # Zoom out: start zoomed in, end at normal
    ZOOM_EXPR="1.2-0.2*on/(${FRAMES})"
    ;;
  *)
    # Zoom in: start normal, end zoomed
    ZOOM_EXPR="1+0.2*on/(${FRAMES})"
    ;;
esac

FILTER="zoompan=z='${ZOOM_EXPR}':x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':d=${FRAMES}:s=${WIDTH}x${HEIGHT}:fps=${FPS}"

if ! ffmpeg -y -loop 1 -i "$INPUT" \
  -vf "$FILTER" \
  -t "$DURATION" \
  -c:v libx264 -preset fast -crf 20 \
  -pix_fmt yuv420p \
  "$OUTPUT"; then
  echo "ffmpeg failed, $OUTPUT was not created correctly" >&2
  exit 1
fi

echo "Done: $OUTPUT"
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
# talking character pipeline
|
|
2
|
+
|
|
3
|
+
create a talking character video with lipsync and captions
|
|
4
|
+
|
|
5
|
+
## steps
|
|
6
|
+
|
|
7
|
+
### 1. create character headshot
|
|
8
|
+
```bash
|
|
9
|
+
# generate character using higgsfield soul
|
|
10
|
+
bun run service/image.ts soul "professional headshot of a friendly person, studio lighting" true
|
|
11
|
+
```
|
|
12
|
+
|
|
13
|
+
### 2. generate voiceover
|
|
14
|
+
```bash
|
|
15
|
+
# use fal voice synthesis
|
|
16
|
+
bun run lib/fal.ts generate_speech "hello world, this is my voice" true
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### 3. animate character
|
|
20
|
+
```bash
|
|
21
|
+
# image-to-video with character talking
|
|
22
|
+
bun run service/video.ts from_image "person talking naturally, professional demeanor" <headshot_url> 5 true
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
### 4. add lipsync
|
|
26
|
+
```bash
|
|
27
|
+
# sync lips with voiceover
|
|
28
|
+
bun run service/sync.ts overlay <video_url> <audio_url> output.mp4
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### 5. add captions
|
|
32
|
+
```bash
|
|
33
|
+
# add auto-generated captions with transcription
|
|
34
|
+
bun run service/captions.ts output.mp4 captioned.mp4 --provider fireworks
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### 6. prepare for social media
|
|
38
|
+
```bash
|
|
39
|
+
# resize and optimize for tiktok/instagram
|
|
40
|
+
bun run service/edit.ts social captioned.mp4 final-tiktok.mp4 tiktok
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
## expected output
|
|
44
|
+
- character headshot (png)
|
|
45
|
+
- voiceover audio (mp3)
|
|
46
|
+
- animated video (mp4)
|
|
47
|
+
- lipsynced video (mp4)
|
|
48
|
+
- captioned video (mp4)
|
|
49
|
+
- final social media ready video (mp4)
|
|
50
|
+
|
|
51
|
+
## estimated time
|
|
52
|
+
- headshot: 30s
|
|
53
|
+
- voiceover: 10s
|
|
54
|
+
- animation: 2-3min
|
|
55
|
+
- lipsync: 30s
|
|
56
|
+
- captions: 15s (includes transcription)
|
|
57
|
+
- social prep: 5s
|
|
58
|
+
|
|
59
|
+
total: ~4-5min
|