videowright 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +91 -0
- package/dist/cli/argv.d.ts +28 -0
- package/dist/cli/argv.d.ts.map +1 -0
- package/dist/cli/argv.js +115 -0
- package/dist/cli/argv.js.map +1 -0
- package/dist/cli/bin.d.ts +7 -0
- package/dist/cli/bin.d.ts.map +1 -0
- package/dist/cli/bin.js +10 -0
- package/dist/cli/bin.js.map +1 -0
- package/dist/cli/dev.d.ts +19 -0
- package/dist/cli/dev.d.ts.map +1 -0
- package/dist/cli/dev.js +104 -0
- package/dist/cli/dev.js.map +1 -0
- package/dist/cli/discover.d.ts +29 -0
- package/dist/cli/discover.d.ts.map +1 -0
- package/dist/cli/discover.js +104 -0
- package/dist/cli/discover.js.map +1 -0
- package/dist/cli/discover_project.d.ts +29 -0
- package/dist/cli/discover_project.d.ts.map +1 -0
- package/dist/cli/discover_project.js +108 -0
- package/dist/cli/discover_project.js.map +1 -0
- package/dist/cli/errors.d.ts +10 -0
- package/dist/cli/errors.d.ts.map +1 -0
- package/dist/cli/errors.js +13 -0
- package/dist/cli/errors.js.map +1 -0
- package/dist/cli/ffmpeg.d.ts +57 -0
- package/dist/cli/ffmpeg.d.ts.map +1 -0
- package/dist/cli/ffmpeg.js +122 -0
- package/dist/cli/ffmpeg.js.map +1 -0
- package/dist/cli/index.d.ts +7 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +152 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/playwright_check.d.ts +44 -0
- package/dist/cli/playwright_check.d.ts.map +1 -0
- package/dist/cli/playwright_check.js +20 -0
- package/dist/cli/playwright_check.js.map +1 -0
- package/dist/cli/prompt.d.ts +13 -0
- package/dist/cli/prompt.d.ts.map +1 -0
- package/dist/cli/prompt.js +47 -0
- package/dist/cli/prompt.js.map +1 -0
- package/dist/cli/render.d.ts +60 -0
- package/dist/cli/render.d.ts.map +1 -0
- package/dist/cli/render.js +471 -0
- package/dist/cli/render.js.map +1 -0
- package/dist/cli/script_cmd.d.ts +26 -0
- package/dist/cli/script_cmd.d.ts.map +1 -0
- package/dist/cli/script_cmd.js +88 -0
- package/dist/cli/script_cmd.js.map +1 -0
- package/dist/cli/time_shim.d.ts +44 -0
- package/dist/cli/time_shim.d.ts.map +1 -0
- package/dist/cli/time_shim.js +390 -0
- package/dist/cli/time_shim.js.map +1 -0
- package/dist/cli/ts_loader.d.ts +28 -0
- package/dist/cli/ts_loader.d.ts.map +1 -0
- package/dist/cli/ts_loader.js +95 -0
- package/dist/cli/ts_loader.js.map +1 -0
- package/dist/cli/vite_helpers.d.ts +62 -0
- package/dist/cli/vite_helpers.d.ts.map +1 -0
- package/dist/cli/vite_helpers.js +273 -0
- package/dist/cli/vite_helpers.js.map +1 -0
- package/dist/index.d.ts +11 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +14 -0
- package/dist/index.js.map +1 -0
- package/dist/player/hash_router.d.ts +23 -0
- package/dist/player/hash_router.d.ts.map +1 -0
- package/dist/player/hash_router.js +49 -0
- package/dist/player/hash_router.js.map +1 -0
- package/dist/player/hud.d.ts +33 -0
- package/dist/player/hud.d.ts.map +1 -0
- package/dist/player/hud.js +357 -0
- package/dist/player/hud.js.map +1 -0
- package/dist/player/index.d.ts +123 -0
- package/dist/player/index.d.ts.map +1 -0
- package/dist/player/index.js +848 -0
- package/dist/player/index.js.map +1 -0
- package/dist/player/input.d.ts +14 -0
- package/dist/player/input.d.ts.map +1 -0
- package/dist/player/input.js +90 -0
- package/dist/player/input.js.map +1 -0
- package/dist/player/slot.d.ts +22 -0
- package/dist/player/slot.d.ts.map +1 -0
- package/dist/player/slot.js +43 -0
- package/dist/player/slot.js.map +1 -0
- package/dist/player/transitions/cut.d.ts +7 -0
- package/dist/player/transitions/cut.d.ts.map +1 -0
- package/dist/player/transitions/cut.js +9 -0
- package/dist/player/transitions/cut.js.map +1 -0
- package/dist/player/transitions/fade.d.ts +7 -0
- package/dist/player/transitions/fade.d.ts.map +1 -0
- package/dist/player/transitions/fade.js +18 -0
- package/dist/player/transitions/fade.js.map +1 -0
- package/dist/player/transitions/index.d.ts +4 -0
- package/dist/player/transitions/index.d.ts.map +1 -0
- package/dist/player/transitions/index.js +4 -0
- package/dist/player/transitions/index.js.map +1 -0
- package/dist/player/transitions/slide.d.ts +6 -0
- package/dist/player/transitions/slide.d.ts.map +1 -0
- package/dist/player/transitions/slide.js +35 -0
- package/dist/player/transitions/slide.js.map +1 -0
- package/dist/script/index.d.ts +2 -0
- package/dist/script/index.d.ts.map +1 -0
- package/dist/script/index.js +2 -0
- package/dist/script/index.js.map +1 -0
- package/dist/script/script.d.ts +10 -0
- package/dist/script/script.d.ts.map +1 -0
- package/dist/script/script.js +41 -0
- package/dist/script/script.js.map +1 -0
- package/dist/segment/SegmentRunner.d.ts +52 -0
- package/dist/segment/SegmentRunner.d.ts.map +1 -0
- package/dist/segment/SegmentRunner.js +187 -0
- package/dist/segment/SegmentRunner.js.map +1 -0
- package/dist/segment/defineConfig.d.ts +6 -0
- package/dist/segment/defineConfig.d.ts.map +1 -0
- package/dist/segment/defineConfig.js +7 -0
- package/dist/segment/defineConfig.js.map +1 -0
- package/dist/segment/defineSegment.d.ts +7 -0
- package/dist/segment/defineSegment.d.ts.map +1 -0
- package/dist/segment/defineSegment.js +25 -0
- package/dist/segment/defineSegment.js.map +1 -0
- package/dist/segment/index.d.ts +5 -0
- package/dist/segment/index.d.ts.map +1 -0
- package/dist/segment/index.js +4 -0
- package/dist/segment/index.js.map +1 -0
- package/dist/timeline/index.d.ts +73 -0
- package/dist/timeline/index.d.ts.map +1 -0
- package/dist/timeline/index.js +142 -0
- package/dist/timeline/index.js.map +1 -0
- package/dist/timeline/loadAudioTrack.d.ts +18 -0
- package/dist/timeline/loadAudioTrack.d.ts.map +1 -0
- package/dist/timeline/loadAudioTrack.js +44 -0
- package/dist/timeline/loadAudioTrack.js.map +1 -0
- package/dist/timeline/loadVoiceover.d.ts +18 -0
- package/dist/timeline/loadVoiceover.d.ts.map +1 -0
- package/dist/timeline/loadVoiceover.js +38 -0
- package/dist/timeline/loadVoiceover.js.map +1 -0
- package/dist/timeline/resolveTiming.d.ts +28 -0
- package/dist/timeline/resolveTiming.d.ts.map +1 -0
- package/dist/timeline/resolveTiming.js +63 -0
- package/dist/timeline/resolveTiming.js.map +1 -0
- package/dist/timeline/validateTiming.d.ts +29 -0
- package/dist/timeline/validateTiming.d.ts.map +1 -0
- package/dist/timeline/validateTiming.js +62 -0
- package/dist/timeline/validateTiming.js.map +1 -0
- package/dist/types.d.ts +216 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +6 -0
- package/dist/types.js.map +1 -0
- package/package.json +47 -0
- package/skill/SKILL.md +64 -0
- package/skill/assets/hello_world/PLAN.md +31 -0
- package/skill/assets/hello_world/README.md +27 -0
- package/skill/assets/hello_world/audio/audio_plan.md +14 -0
- package/skill/assets/hello_world/segments/hello_intro.ts +69 -0
- package/skill/assets/hello_world/segments/hello_outro.ts +71 -0
- package/skill/assets/hello_world/timeline.ts +15 -0
- package/skill/assets/hello_world/voiceover_script/script.md +10 -0
- package/skill/assets/install/package.json +10 -0
- package/skill/assets/install/tsconfig.json +23 -0
- package/skill/assets/styles/editorial-mono/STYLE.md +124 -0
- package/skill/assets/styles/editorial-mono/brand.md +85 -0
- package/skill/assets/styles/editorial-mono/reference/animations.jsx +752 -0
- package/skill/assets/styles/editorial-mono/reference/scenes.html +563 -0
- package/skill/assets/styles/editorial-mono/sample/bullet.ts +101 -0
- package/skill/assets/styles/editorial-mono/sample/content.ts +104 -0
- package/skill/assets/styles/editorial-mono/sample/cta.ts +113 -0
- package/skill/assets/styles/editorial-mono/sample/feature.ts +111 -0
- package/skill/assets/styles/editorial-mono/sample/grid.ts +97 -0
- package/skill/assets/styles/editorial-mono/sample/kinetic.ts +96 -0
- package/skill/assets/styles/editorial-mono/sample/section.ts +101 -0
- package/skill/assets/styles/editorial-mono/sample/stat.ts +128 -0
- package/skill/assets/styles/editorial-mono/sample/title.ts +97 -0
- package/skill/assets/styles/editorial-mono/sample/ui-showcase.ts +159 -0
- package/skill/assets/styles/editorial-mono/tokens.css +44 -0
- package/skill/assets/styles/iso-diagram/STYLE.md +109 -0
- package/skill/assets/styles/iso-diagram/brand.md +32 -0
- package/skill/assets/styles/iso-diagram/reference/animations.jsx +673 -0
- package/skill/assets/styles/iso-diagram/reference/scenes.html +427 -0
- package/skill/assets/styles/iso-diagram/sample/bullet.ts +144 -0
- package/skill/assets/styles/iso-diagram/sample/content.ts +192 -0
- package/skill/assets/styles/iso-diagram/sample/cta.ts +162 -0
- package/skill/assets/styles/iso-diagram/sample/feature.ts +205 -0
- package/skill/assets/styles/iso-diagram/sample/grid.ts +181 -0
- package/skill/assets/styles/iso-diagram/sample/kinetic.ts +102 -0
- package/skill/assets/styles/iso-diagram/sample/section.ts +149 -0
- package/skill/assets/styles/iso-diagram/sample/stat.ts +164 -0
- package/skill/assets/styles/iso-diagram/sample/title.ts +173 -0
- package/skill/assets/styles/iso-diagram/sample/ui-showcase.ts +162 -0
- package/skill/assets/styles/iso-diagram/tokens.css +40 -0
- package/skill/assets/styles/motion-engineering/STYLE.md +106 -0
- package/skill/assets/styles/motion-engineering/brand.md +29 -0
- package/skill/assets/styles/motion-engineering/reference/animations.jsx +673 -0
- package/skill/assets/styles/motion-engineering/reference/scenes.html +513 -0
- package/skill/assets/styles/motion-engineering/sample/bullet.ts +176 -0
- package/skill/assets/styles/motion-engineering/sample/content.ts +228 -0
- package/skill/assets/styles/motion-engineering/sample/cta.ts +209 -0
- package/skill/assets/styles/motion-engineering/sample/feature.ts +299 -0
- package/skill/assets/styles/motion-engineering/sample/grid.ts +190 -0
- package/skill/assets/styles/motion-engineering/sample/kinetic.ts +159 -0
- package/skill/assets/styles/motion-engineering/sample/section.ts +196 -0
- package/skill/assets/styles/motion-engineering/sample/stat.ts +230 -0
- package/skill/assets/styles/motion-engineering/sample/title.ts +219 -0
- package/skill/assets/styles/motion-engineering/sample/ui-showcase.ts +267 -0
- package/skill/assets/styles/motion-engineering/tokens.css +40 -0
- package/skill/assets/styles/neon-terminal/STYLE.md +105 -0
- package/skill/assets/styles/neon-terminal/brand.md +27 -0
- package/skill/assets/styles/neon-terminal/reference/animations.jsx +673 -0
- package/skill/assets/styles/neon-terminal/reference/scenes.html +387 -0
- package/skill/assets/styles/neon-terminal/sample/bullet.ts +113 -0
- package/skill/assets/styles/neon-terminal/sample/content.ts +117 -0
- package/skill/assets/styles/neon-terminal/sample/cta.ts +131 -0
- package/skill/assets/styles/neon-terminal/sample/feature.ts +112 -0
- package/skill/assets/styles/neon-terminal/sample/grid.ts +128 -0
- package/skill/assets/styles/neon-terminal/sample/kinetic.ts +105 -0
- package/skill/assets/styles/neon-terminal/sample/section.ts +96 -0
- package/skill/assets/styles/neon-terminal/sample/stat.ts +123 -0
- package/skill/assets/styles/neon-terminal/sample/title.ts +122 -0
- package/skill/assets/styles/neon-terminal/sample/ui-showcase.ts +127 -0
- package/skill/assets/styles/neon-terminal/tokens.css +39 -0
- package/skill/assets/styles/risograph/STYLE.md +110 -0
- package/skill/assets/styles/risograph/brand.md +26 -0
- package/skill/assets/styles/risograph/reference/animations.jsx +673 -0
- package/skill/assets/styles/risograph/reference/scenes.html +403 -0
- package/skill/assets/styles/risograph/sample/bullet.ts +124 -0
- package/skill/assets/styles/risograph/sample/content.ts +135 -0
- package/skill/assets/styles/risograph/sample/cta.ts +149 -0
- package/skill/assets/styles/risograph/sample/feature.ts +152 -0
- package/skill/assets/styles/risograph/sample/grid.ts +123 -0
- package/skill/assets/styles/risograph/sample/kinetic.ts +125 -0
- package/skill/assets/styles/risograph/sample/section.ts +130 -0
- package/skill/assets/styles/risograph/sample/stat.ts +145 -0
- package/skill/assets/styles/risograph/sample/title.ts +132 -0
- package/skill/assets/styles/risograph/sample/ui-showcase.ts +147 -0
- package/skill/assets/styles/risograph/tokens.css +39 -0
- package/skill/assets/styles/swiss-console/STYLE.md +107 -0
- package/skill/assets/styles/swiss-console/brand.md +37 -0
- package/skill/assets/styles/swiss-console/reference/animations.jsx +673 -0
- package/skill/assets/styles/swiss-console/reference/scenes.html +420 -0
- package/skill/assets/styles/swiss-console/sample/bullet.ts +122 -0
- package/skill/assets/styles/swiss-console/sample/content.ts +137 -0
- package/skill/assets/styles/swiss-console/sample/cta.ts +109 -0
- package/skill/assets/styles/swiss-console/sample/feature.ts +163 -0
- package/skill/assets/styles/swiss-console/sample/grid.ts +145 -0
- package/skill/assets/styles/swiss-console/sample/kinetic.ts +117 -0
- package/skill/assets/styles/swiss-console/sample/section.ts +127 -0
- package/skill/assets/styles/swiss-console/sample/stat.ts +148 -0
- package/skill/assets/styles/swiss-console/sample/title.ts +148 -0
- package/skill/assets/styles/swiss-console/sample/ui-showcase.ts +198 -0
- package/skill/assets/styles/swiss-console/tokens.css +39 -0
- package/skill/install/INSTALL.md +400 -0
- package/skill/references/audio/audio_plan.md +199 -0
- package/skill/references/audio/build.md +208 -0
- package/skill/references/audio/cue_template.md +219 -0
- package/skill/references/audio/ffmpeg_cookbook.md +267 -0
- package/skill/references/audio/music/music.md +171 -0
- package/skill/references/audio/music/providers/elevenlabs.md +170 -0
- package/skill/references/audio/music/providers/manual.md +140 -0
- package/skill/references/audio/music/providers/openverse.md +265 -0
- package/skill/references/audio/sfx/providers/elevenlabs.md +152 -0
- package/skill/references/audio/sfx/providers/manual.md +117 -0
- package/skill/references/audio/sfx/providers/openverse.md +243 -0
- package/skill/references/audio/sfx/sfx.md +149 -0
- package/skill/references/audio/styles.md +102 -0
- package/skill/references/audio/sync.md +237 -0
- package/skill/references/audio/voiceover/animation_sync.md +142 -0
- package/skill/references/audio/voiceover/provider_script.md +153 -0
- package/skill/references/audio/voiceover/providers/elevenlabs.md +288 -0
- package/skill/references/audio/voiceover/providers/manual.md +100 -0
- package/skill/references/audio/voiceover/script_writing.md +100 -0
- package/skill/references/audio/voiceover/style_intake.md +56 -0
- package/skill/references/audio/voiceover/sync_algorithm.md +167 -0
- package/skill/references/audio/voiceover.md +296 -0
- package/skill/references/audio.md +135 -0
- package/skill/references/authoring_segment.md +446 -0
- package/skill/references/create_or_edit_video.md +232 -0
- package/skill/references/dev_server.md +157 -0
- package/skill/references/export.md +145 -0
- package/skill/references/new_video.md +117 -0
- package/skill/references/project_structure.md +144 -0
- package/skill/references/setup.md +109 -0
- package/skill/references/setup_new_style.md +158 -0
- package/skill/references/styles.md +154 -0
- package/skill/references/testing.md +115 -0
- package/skill/references/types.md +240 -0
- package/src/cli/entry/components/copy_button.ts +42 -0
- package/src/cli/entry/components/download_modal.ts +204 -0
- package/src/cli/entry/components/empty_state.ts +55 -0
- package/src/cli/entry/components/hide_hud_tab.ts +37 -0
- package/src/cli/entry/components/icons.ts +31 -0
- package/src/cli/entry/components/top_bar.ts +69 -0
- package/src/cli/entry/components/video_card.ts +57 -0
- package/src/cli/entry/dev_frame.ts +189 -0
- package/src/cli/entry/entry_index.ts +16 -0
- package/src/cli/entry/entry_video.ts +24 -0
- package/src/cli/entry/index.html +12 -0
- package/src/cli/entry/parse_slug.ts +14 -0
- package/src/cli/entry/render.html +17 -0
- package/src/cli/entry/render_entry.ts +121 -0
- package/src/cli/entry/styles/base.css +45 -0
- package/src/cli/entry/styles/components.css +605 -0
- package/src/cli/entry/styles/tokens.css +44 -0
- package/src/cli/entry/video.html +22 -0
- package/src/cli/entry/views/homepage.ts +66 -0
- package/src/cli/entry/views/video_view.ts +286 -0
- package/src/cli/entry/virtual.d.ts +8 -0
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
# Sync Algorithm
|
|
2
|
+
|
|
3
|
+
## When this is loaded
|
|
4
|
+
|
|
5
|
+
You have a voiceover audio file and provider timing data, and you need to compute a `Timing` object that syncs segment advances to the audio.
|
|
6
|
+
|
|
7
|
+
## Overview
|
|
8
|
+
|
|
9
|
+
This is an agent reasoning step, not a deterministic function. You read the per-word timing data from the provider, walk the per-segment script from PLAN.md, and produce a `Timing` object with advance times for each segment.
|
|
10
|
+
|
|
11
|
+
## Inputs
|
|
12
|
+
|
|
13
|
+
1. **Per-segment script** from PLAN.md (the `## Script` section with subsections per segment id).
|
|
14
|
+
2. **Provider timing JSON** at `audio/originals/voiceovers/<slug>/timing.json`. This contains per-word or per-character timestamps from the TTS provider or STT transcription.
|
|
15
|
+
3. **Segment ids** in timeline order, plus each segment's `notes` and `voiceover` hint string.
|
|
16
|
+
4. **Each segment's `advances` array** -- the current timing. You will be replacing these values in the `Timing`, but the array length tells you how many advances each segment needs.
|
|
17
|
+
|
|
18
|
+
## Output
|
|
19
|
+
|
|
20
|
+
A `Timing` object written into `voiceover.ts`:
|
|
21
|
+
|
|
22
|
+
```ts
|
|
23
|
+
timing: {
|
|
24
|
+
perSegment: {
|
|
25
|
+
'intro': [4.2],
|
|
26
|
+
'feature-cards': [2.1, 5.8, 9.3, 12.0],
|
|
27
|
+
'outro': [3.5],
|
|
28
|
+
},
|
|
29
|
+
},
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Each value array has the same length as the segment's `advances` array. Values are segment-relative seconds (same units as `SegmentSpec.advances`).
|
|
33
|
+
|
|
34
|
+
## Parsing provider timing JSON
|
|
35
|
+
|
|
36
|
+
### ElevenLabs TTS timing
|
|
37
|
+
|
|
38
|
+
ElevenLabs TTS can output per-word timing (via the API's with-timestamps endpoint, or extracted via STT after portal generation). The JSON format contains an array of word entries with start and end timestamps:
|
|
39
|
+
|
|
40
|
+
```json
|
|
41
|
+
{
|
|
42
|
+
"words": [
|
|
43
|
+
{ "word": "Welcome", "start": 0.0, "end": 0.45 },
|
|
44
|
+
{ "word": "to", "start": 0.47, "end": 0.55 },
|
|
45
|
+
{ "word": "Acme", "start": 0.58, "end": 0.92 },
|
|
46
|
+
...
|
|
47
|
+
]
|
|
48
|
+
}
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Timestamps are in seconds from the start of the audio file. The `end` of the last word in a segment's script section gives you the boundary for that segment's audio content.
|
|
52
|
+
|
|
53
|
+
### ElevenLabs Speech-to-Text timing
|
|
54
|
+
|
|
55
|
+
ElevenLabs STT output has a similar structure with word-level timestamps. The format may include additional fields like confidence scores -- ignore those. Focus on `word`, `start`, and `end`.
|
|
56
|
+
|
|
57
|
+
If the JSON structure differs from the above (ElevenLabs may update their format), adapt by looking for word-level entries with start/end time fields. The core need is: which word was spoken at which timestamp.
|
|
58
|
+
|
|
59
|
+
## The sync procedure
|
|
60
|
+
|
|
61
|
+
### Step 1: Map script text to timing words
|
|
62
|
+
|
|
63
|
+
Walk through the provider timing JSON word by word. For each segment's script section in PLAN.md, find the corresponding words in the timing data by text matching.
|
|
64
|
+
|
|
65
|
+
- Match is case-insensitive and ignores punctuation.
|
|
66
|
+
- Provider timing may include words from pause markers or annotations that were in the provider script but not the PLAN script -- skip those.
|
|
67
|
+
- If the provider timing has significantly different text (indicating the TTS changed wording), flag this to the user and ask which text to use.
|
|
68
|
+
|
|
69
|
+
### Step 2: Find segment boundaries
|
|
70
|
+
|
|
71
|
+
For each segment, identify the timestamp where the segment should transition to the next:
|
|
72
|
+
|
|
73
|
+
- **Align to the next segment's VO onset, not the current segment's VO offset.** Each segment's final advance should land just *before* the next segment's first spoken word, so the voiceover starts right after the transition -- not after a dead-air pause. The transition is a lead-in to the next VO, not a tail-out from the previous one.
|
|
74
|
+
- Find the first word of the *next* segment's script section. Place the segment boundary 0.1-0.3 seconds before that word's `start` timestamp, so the transition finishes right as the new narration begins.
|
|
75
|
+
- For the **last segment** (no next segment), find the last word in the segment's script section and add a small buffer (0.3-0.5 seconds) after its `end` timestamp.
|
|
76
|
+
|
|
77
|
+
### Step 3: Convert to segment-relative advances
|
|
78
|
+
|
|
79
|
+
The `Timing` uses segment-relative seconds (time since the segment started, not since the audio started). To convert:
|
|
80
|
+
|
|
81
|
+
```
|
|
82
|
+
segment_start = sum of all previous segments' durations (a segment's duration is its final advance value)
|
|
83
|
+
advance_time = absolute_timestamp - segment_start
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
For the last advance of each segment (the one that transitions to the next segment), set it so the transition lands just before the next segment's first VO word (per Step 2).
|
|
87
|
+
|
|
88
|
+
### Step 4: Handle multi-advance segments
|
|
89
|
+
|
|
90
|
+
Segments with multiple advances have internal beats (`waitForNext()` calls in their `play()` function). For these:
|
|
91
|
+
|
|
92
|
+
1. **Count the advances.** The segment's `advances` array length tells you how many beats are needed.
|
|
93
|
+
2. **Identify beat positions.** Look for natural break points in the segment's script:
|
|
94
|
+
- `[pause for animation]` markers in the PLAN.md script.
|
|
95
|
+
- Sentence boundaries that align with visual transitions (check the segment's `notes` or code).
|
|
96
|
+
- Content transition cues: "Next,...", "And now,...", "Finally,...", "Moving on,...".
|
|
97
|
+
3. **Place internal advances** at the timestamps corresponding to these break points. Each internal advance should land at the end of the narration chunk before the next visual beat.
|
|
98
|
+
4. **Place the final advance** (the segment transition) just before the next segment's first VO word, per Step 2.
|
|
99
|
+
|
|
100
|
+
Example for a segment with 4 advances:
|
|
101
|
+
|
|
102
|
+
```
|
|
103
|
+
Script: "First feature. [pause] Second feature. [pause] Third feature."
|
|
104
|
+
Advances array length: 4 (3 internal beats + 1 final transition)
|
|
105
|
+
|
|
106
|
+
advance[0] = end of "First feature" + buffer (first waitForNext resolves)
|
|
107
|
+
advance[1] = end of "Second feature" + buffer (second waitForNext resolves)
|
|
108
|
+
advance[2] = end of "Third feature" + buffer (third waitForNext resolves)
|
|
109
|
+
advance[3] = just before next segment's first VO word (transition to next segment)
|
|
110
|
+
```
|
|
111
|
+
|
|
112
|
+
### Step 5: Apply the "audio always wins" rule
|
|
113
|
+
|
|
114
|
+
The video duration adapts to match the audio duration:
|
|
115
|
+
|
|
116
|
+
- If the audio for a segment is shorter than the segment's current `advances` suggest, compress the advances.
|
|
117
|
+
- If the audio is longer, stretch the advances.
|
|
118
|
+
- The last advance of the last segment should land at (or very near) the end of the audio file.
|
|
119
|
+
- Never truncate audio. Never pad with silence.
|
|
120
|
+
|
|
121
|
+
## Presenting the timing to the user
|
|
122
|
+
|
|
123
|
+
After computing the `Timing`, present it with annotations:
|
|
124
|
+
|
|
125
|
+
```
|
|
126
|
+
Proposed timing:
|
|
127
|
+
|
|
128
|
+
intro (1 advance):
|
|
129
|
+
[0] 4.2s -- "...set us apart." (segment transition, just before the next segment's VO)
|
|
130
|
+
|
|
131
|
+
feature-cards (4 advances):
|
|
132
|
+
[0] 2.1s -- "...across devices." (end of collaboration section)
|
|
133
|
+
[1] 5.8s -- "...in one view." (end of analytics section)
|
|
134
|
+
[2] 9.3s -- "...and more." (end of integrations section)
|
|
135
|
+
[3] 12.0s -- transition (next VO starts at ~12.2s)
|
|
136
|
+
|
|
137
|
+
outro (1 advance):
|
|
138
|
+
[0] 3.5s -- "...Thanks for watching." (end of video)
|
|
139
|
+
|
|
140
|
+
Total audio duration: 19.7s
|
|
141
|
+
```
|
|
142
|
+
|
|
143
|
+
For each advance, show:
|
|
144
|
+
|
|
145
|
+
- The segment-relative time in seconds.
|
|
146
|
+
- A snippet of the script text that the advance lands on.
|
|
147
|
+
- What the advance does (internal beat vs. segment transition).
|
|
148
|
+
|
|
149
|
+
## Iteration
|
|
150
|
+
|
|
151
|
+
The user may request adjustments:
|
|
152
|
+
|
|
153
|
+
- "Move the second beat in feature-cards 0.5 seconds later" -- adjust that advance value.
|
|
154
|
+
- "The intro feels rushed" -- extend the intro's advance time by adding more buffer.
|
|
155
|
+
- "Combine the first two beats in feature-cards into one" -- this changes the number of advances, which means the segment's `play()` function needs a `waitForNext()` removed. Flag this as a code change.
|
|
156
|
+
|
|
157
|
+
After each adjustment, re-present the timing. When the user confirms, write it into `voiceover.ts`.
|
|
158
|
+
|
|
159
|
+
## Edge cases
|
|
160
|
+
|
|
161
|
+
| Situation | Behavior |
|
|
162
|
+
|---|---|
|
|
163
|
+
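| Provider timing JSON is missing | Error. The user must obtain it before syncing: download it from the provider portal (AI-generated audio) or run the audio through Speech-to-Text (user-provided audio). Direct them to the provider walkthrough. |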
|
|
164
|
+
| Words in timing do not match the script | Likely the TTS changed wording. Flag specific mismatches, ask user whether to use TTS text or original script text for alignment. |
|
|
165
|
+
| Segment has no script (silent segment) | Use the segment's existing `advances` values. The segment passes through without voiceover. |
|
|
166
|
+
| Audio is significantly shorter/longer than expected | Apply "audio always wins" -- compress or stretch. Flag the discrepancy so the user can decide if they want to re-record or adjust the video. |
|
|
167
|
+
| Provider timing has character-level rather than word-level data | Aggregate characters into words by grouping on whitespace boundaries. Use the first character's start time as the word's `start` and the last character's end time as the word's `end`. |
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
# Voiceover
|
|
2
|
+
|
|
3
|
+
## When this is loaded
|
|
4
|
+
|
|
5
|
+
You were routed here from [audio.md](../audio.md) or from another workflow that needs to work with voiceover content. This is the top-level reference for all voiceover functionality.
|
|
6
|
+
|
|
7
|
+
## Overview
|
|
8
|
+
|
|
9
|
+
Videowright supports voiceover audio integrated into video playback. A voiceover consists of an audio file (mp3 or wav), a `Timing` that syncs segment advances to the audio, and metadata stored in a typed `voiceover.ts` file. Audio plays in the dev server via an HTML `<audio>` element and is muxed into MP4 output by `render` via ffmpeg.
|
|
10
|
+
|
|
11
|
+
Two production flows are supported:
|
|
12
|
+
|
|
13
|
+
- **AI-generated** -- write a script, transform it with v2-targeted provider annotations, generate audio via ElevenLabs (API key or web portal), and import the audio and per-word timing JSON.
|
|
14
|
+
- **Manual** -- user provides their own audio file, then runs it through ElevenLabs Speech-to-Text to get per-word timing data for sync.
|
|
15
|
+
|
|
16
|
+
Both flows produce the same output: a `voiceover.ts` file with a `Voiceover` object that includes the audio path and a `Timing` object.
|
|
17
|
+
|
|
18
|
+
## Flow entry point
|
|
19
|
+
|
|
20
|
+
When the user asks to "add a voiceover" or "generate a voiceover", ask:
|
|
21
|
+
|
|
22
|
+
> Do you have an audio file already, or would you like to generate one with AI text-to-speech?
|
|
23
|
+
|
|
24
|
+
- **AI generation** -- follow Flow A below.
|
|
25
|
+
- **User-provided audio** -- follow Flow B below.
|
|
26
|
+
|
|
27
|
+
### Flow A: AI generation (ElevenLabs)
|
|
28
|
+
|
|
29
|
+
1. **Approach and voice selection.** Ask API key vs. portal, then (API only) which voice from the curated catalog. See [voiceover/providers/elevenlabs.md](voiceover/providers/elevenlabs.md) for the mode selection prompt, and the [curated voice catalog](#curated-voice-catalog) below for the voice options.
|
|
30
|
+
2. **Style intake.** Ask the user about tone and emotional arc preferences. See [voiceover/style_intake.md](voiceover/style_intake.md).
|
|
31
|
+
3. **Script.** Write or integrate the VO script into PLAN.md. See [voiceover/script_writing.md](voiceover/script_writing.md).
|
|
32
|
+
4. **Provider script.** Transform the PLAN script into `provider_script.md` with v2-targeted annotations (SSML `<break>` tags, punctuation-driven prosody -- no v3 emotion tags). See [voiceover/provider_script.md](voiceover/provider_script.md).
|
|
33
|
+
5. **Audio generation.** Follow the sub-flow for the approach chosen in step 1. See [voiceover/providers/elevenlabs.md](voiceover/providers/elevenlabs.md).
|
|
34
|
+
6. **Sync timing.** Read the provider timing JSON and compute a `Timing` object. See [voiceover/sync_algorithm.md](voiceover/sync_algorithm.md).
|
|
35
|
+
7. **Write `voiceover.ts`.** Create the typed module exporting a `Voiceover` object.
|
|
36
|
+
8. **Audio plan and build.** Create or update `audio/audio_plan.md` with a VO cue pointing at this voiceover. For VO-only videos, the plan is minimal (single cue, full file, placed at 0s -- see [audio_plan.md](audio_plan.md) for the VO-only shortcut). Then build the track via [build.md](build.md). The build workflow handles approval, timeline.ts update, and sync.
|
|
37
|
+
|
|
38
|
+
### Curated voice catalog
|
|
39
|
+
|
|
40
|
+
When the user picks the **API key** approach in step 1, immediately present this catalog (default is **Asher** if no preference):
|
|
41
|
+
|
|
42
|
+
| # | Voice | Description | Preview |
|
|
43
|
+
|---|---|---|---|
|
|
44
|
+
| 1 | **Asher** | Warm, clear, and conversational male voice with confident, grounded delivery. Natural pacing and friendly authority give him an engaging presence that holds attention without feeling forced. Ideal for podcasts, narration, explainers, and authentic commercial reads. Works especially well as a default voice because of his versatility across different content types and tones. | [Listen](https://elevenlabs.io/app/voice-library?voiceId=tMvyQtpCVQ0DkixuYm6J) |
|
|
45
|
+
| 2 | **Cecily** | Warm, versatile female voice from the West Coast with an engaging, approachable delivery. Her natural warmth and conversational style make her equally effective for advertisements, social media content, and brand storytelling. She can shift between polished and casual registers without losing authenticity. A strong choice when you want a voice that feels relatable and trustworthy across a range of content. | [Listen](https://elevenlabs.io/app/voice-library?voiceId=Uc7anshoV8mdBhDnEZEX) |
|
|
46
|
+
| 3 | **Don** | Young American male voice with a casual, approachable tone that feels natural and engaging. Light, clear, and expressive -- perfect for conversations with listeners in a relaxed way. This style works especially well for social media content, storytelling, and audiobooks, where relatability and flow are key. The voice carries warmth and clarity, making it easy to listen to over long sessions from narration to digital campaigns. | [Listen](https://elevenlabs.io/app/voice-library?voiceId=8IbUB2LiiCZ85IJAHNnZ) |
|
|
47
|
+
| 4 | **Hanna** | Professional American female voice with a polished, authoritative delivery. Clear articulation and steady pacing make her an excellent choice for informative narration, e-learning modules, and corporate voiceover. She conveys competence and credibility without sounding stiff or robotic. Best when you need a voice that commands attention while remaining approachable in instructional or business contexts. | [Listen](https://elevenlabs.io/app/voice-library?voiceId=Hh0rE70WfnSFN80K8uJC) |
|
|
48
|
+
| 5 | **Other** | Provide any ElevenLabs voice ID. Browse voices at the [ElevenLabs Voice Library](https://elevenlabs.io/app/voice-library) to find one that fits your project. | -- |
|
|
49
|
+
|
|
50
|
+
If the user does not pick, default to **Asher**. Save the selected voice ID to the `eleven_labs_voice_id` field in the `voiceover.ts` file (not as an env var). If the user picks "Other", ask them to provide the voice ID.
|
|
51
|
+
|
|
52
|
+
Portal users skip this catalog -- they pick a voice visually in the ElevenLabs UI during audio generation (step 5).
|
|
53
|
+
|
|
54
|
+
### Flow B: Manual (user-provided audio)
|
|
55
|
+
|
|
56
|
+
1. **Get the audio.** Ask the user to provide or drop an audio file into `audio/originals/voiceovers/<slug>/`.
|
|
57
|
+
2. **Generate transcript and timing.** Walk the user through ElevenLabs Speech-to-Text to get per-word timing data. See [voiceover/providers/manual.md](voiceover/providers/manual.md).
|
|
58
|
+
3. **Sync timing.** Same as Flow A step 6.
|
|
59
|
+
4. **Write `voiceover.ts`.** Same as Flow A step 7.
|
|
60
|
+
5. **Audio plan and build.** Same as Flow A step 8.
|
|
61
|
+
|
|
62
|
+
## File and folder conventions
|
|
63
|
+
|
|
64
|
+
Voiceover originals live per-video under `audio/originals/voiceovers/`:
|
|
65
|
+
|
|
66
|
+
```
|
|
67
|
+
videos/<video-slug>/
|
|
68
|
+
timeline.ts
|
|
69
|
+
PLAN.md
|
|
70
|
+
voiceover_script/
|
|
71
|
+
script.md
|
|
72
|
+
audio/
|
|
73
|
+
originals/
|
|
74
|
+
voiceovers/
|
|
75
|
+
<vo-slug>/
|
|
76
|
+
voiceover.ts # typed Voiceover object (default export)
|
|
77
|
+
audio.mp3 # audio file (mp3 or wav; any name works, referenced from voiceover.ts)
|
|
78
|
+
timing.json # provider-supplied per-word timings (optional)
|
|
79
|
+
provider_script.md # provider-annotated script (AI flow only)
|
|
80
|
+
generate.sh # API generation script (AI flow only)
|
|
81
|
+
tracks/
|
|
82
|
+
v1/
|
|
83
|
+
track.ts # typed AudioTrack object (default export)
|
|
84
|
+
track.mp3 # rendered audio
|
|
85
|
+
plan_snapshot.md # point-in-time copy of audio plan
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
**Slug naming.** Both auto-versioned slugs (`v1`, `v2`) and user-named slugs (`narrator-warm`, `take-3`) are valid. The slug is the folder name under `audio/originals/voiceovers/`.
|
|
89
|
+
|
|
90
|
+
**Multiple voiceovers.** Stored as separate sibling folders under `audio/originals/voiceovers/`. Each is independent and self-contained. The active audio is determined by the audio track referenced in `timeline.ts` via `default_audio_track`.
|
|
91
|
+
|
|
92
|
+
## Types
|
|
93
|
+
|
|
94
|
+
### `Voiceover`
|
|
95
|
+
|
|
96
|
+
```ts
|
|
97
|
+
type Voiceover = {
|
|
98
|
+
audio_file: string; // path relative to the voiceover.ts file
|
|
99
|
+
provider: "elevenlabs" | "manual";
|
|
100
|
+
provider_timing_file?: string; // path relative to the voiceover.ts file
|
|
101
|
+
timing: Timing;
|
|
102
|
+
notes?: string;
|
|
103
|
+
eleven_labs_voice_id?: string; // ElevenLabs voice ID; defaults to Asher if omitted
|
|
104
|
+
};
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
### `Timing`
|
|
108
|
+
|
|
109
|
+
```ts
|
|
110
|
+
type Timing = {
|
|
111
|
+
perSegment: Partial<Record<string, number[]>>;
|
|
112
|
+
};
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
A `Timing` overrides segment `advances` for any segments it lists. Segments not listed fall back to their own `advances` array.
|
|
116
|
+
|
|
117
|
+
### `Timeline` extensions
|
|
118
|
+
|
|
119
|
+
```ts
|
|
120
|
+
interface Timeline {
|
|
121
|
+
meta: TimelineMeta;
|
|
122
|
+
segments: TimelineEntry[];
|
|
123
|
+
default_timing?: Timing; // standalone timing overrides
|
|
124
|
+
default_audio_track?: AudioTrack; // default audio track for this video
|
|
125
|
+
}
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
## Writing `voiceover.ts`
|
|
129
|
+
|
|
130
|
+
A voiceover module default-exports a `Voiceover` object:
|
|
131
|
+
|
|
132
|
+
```ts
|
|
133
|
+
import type { Voiceover } from 'videowright';
|
|
134
|
+
|
|
135
|
+
const voiceover: Voiceover = {
|
|
136
|
+
audio_file: './audio.mp3',
|
|
137
|
+
provider: 'elevenlabs',
|
|
138
|
+
provider_timing_file: './timing.json',
|
|
139
|
+
eleven_labs_voice_id: 'tMvyQtpCVQ0DkixuYm6J', // Asher
|
|
140
|
+
timing: {
|
|
141
|
+
perSegment: {
|
|
142
|
+
'intro': [4.2],
|
|
143
|
+
'feature-cards': [2.1, 5.8, 9.3, 12.0],
|
|
144
|
+
'outro': [3.5],
|
|
145
|
+
},
|
|
146
|
+
},
|
|
147
|
+
notes: 'Warm male voice, conversational tone',
|
|
148
|
+
};
|
|
149
|
+
|
|
150
|
+
export default voiceover;
|
|
151
|
+
```
|
|
152
|
+
|
|
153
|
+
## Setting the default audio track
|
|
154
|
+
|
|
155
|
+
After generating a voiceover, it is combined into an audio track (see [../audio.md](../audio.md) for the full audio workflow). The active audio track is set in `timeline.ts`:
|
|
156
|
+
|
|
157
|
+
```ts
|
|
158
|
+
import '../../styles/editorial-mono/tokens.css';
|
|
159
|
+
import type { Timeline } from 'videowright';
|
|
160
|
+
import defaultAudioTrack from './audio/tracks/v1/track.js';
|
|
161
|
+
|
|
162
|
+
const timeline: Timeline = {
|
|
163
|
+
meta: { title: 'My Video' },
|
|
164
|
+
segments: [
|
|
165
|
+
{ id: 'intro' },
|
|
166
|
+
{ id: 'feature-cards', transition: 'fade' },
|
|
167
|
+
{ id: 'outro', transition: 'fade' },
|
|
168
|
+
],
|
|
169
|
+
default_audio_track: defaultAudioTrack,
|
|
170
|
+
};
|
|
171
|
+
|
|
172
|
+
export default timeline;
|
|
173
|
+
```
|
|
174
|
+
|
|
175
|
+
The `default_audio_track` import is the single source of truth for which audio track is active. Switching tracks means updating the import path.
|
|
176
|
+
|
|
177
|
+
## CLI usage
|
|
178
|
+
|
|
179
|
+
`render` accepts `--audio-track`:
|
|
180
|
+
|
|
181
|
+
```bash
|
|
182
|
+
# Use a specific audio track
|
|
183
|
+
npx videowright render --audio-track v1
|
|
184
|
+
|
|
185
|
+
# Suppress audio (ignore default_audio_track, use default_timing or segment advances)
|
|
186
|
+
npx videowright render --audio-track none
|
|
187
|
+
|
|
188
|
+
# No flag: use default_audio_track from timeline.ts if set, otherwise no audio
|
|
189
|
+
npx videowright render
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
`dev` does not accept `--audio-track`. It uses `default_audio_track` from `timeline.ts` if set, otherwise no audio.
|
|
193
|
+
|
|
194
|
+
## Audio playback by mode
|
|
195
|
+
|
|
196
|
+
| Mode | Audio mechanism | Behavior |
|
|
197
|
+
|---|---|---|
|
|
198
|
+
| `dev` | HTML `<audio>` element | Play button in HUD starts auto-advance with synced audio. Manual nav pauses audio. |
|
|
199
|
+
| `render` | ffmpeg audio mux | Audio file is muxed into the output MP4 as a second input to ffmpeg. No `<audio>` element. |
|
|
200
|
+
|
|
201
|
+
## Timing precedence
|
|
202
|
+
|
|
203
|
+
When determining advance schedules:
|
|
204
|
+
|
|
205
|
+
1. **Active audio track's `timing`** -- if an audio track is active (via `--audio-track <id>` or `default_audio_track`).
|
|
206
|
+
2. **`default_timing`** on `timeline.ts` -- if no audio track is active.
|
|
207
|
+
3. **`SegmentSpec.advances`** -- per-segment fallback.
|
|
208
|
+
|
|
209
|
+
`--audio-track none` suppresses level 1 (audio tracks) but preserves `default_timing` (level 2) and per-segment advances (level 3).
|
|
210
|
+
|
|
211
|
+
## The `voiceover` field on segments
|
|
212
|
+
|
|
213
|
+
Each segment can declare a `voiceover` string in `defineSegment`:
|
|
214
|
+
|
|
215
|
+
```ts
|
|
216
|
+
export default defineSegment({
|
|
217
|
+
id: 'intro',
|
|
218
|
+
advances: [3.0],
|
|
219
|
+
voiceover: 'Welcome to the product demo.',
|
|
220
|
+
async play(ctx) { await ctx.hold(3000); },
|
|
221
|
+
});
|
|
222
|
+
```
|
|
223
|
+
|
|
224
|
+
This field is:
|
|
225
|
+
|
|
226
|
+
- **Shown in the HUD** during dev mode.
|
|
227
|
+
- **Collected by `videowright script`** into a single markdown document.
|
|
228
|
+
- **Used by the agent** to understand the segment's narrative purpose when editing.
|
|
229
|
+
|
|
230
|
+
It is a display hint, not the canonical voiceover audio source. The canonical audio comes from the `Voiceover` object in `voiceover.ts`.
|
|
231
|
+
|
|
232
|
+
## VO-first authoring
|
|
233
|
+
|
|
234
|
+
The default authoring pattern for new videos with voiceover intent:
|
|
235
|
+
|
|
236
|
+
1. **Write the script first.** Draft the full VO copy organized by segment in PLAN.md.
|
|
237
|
+
2. **Scaffold segments from the script.** Each segment's content and timing follow from its VO text. A 30-word section suggests ~12s; a 100-word section suggests ~40s (based on ~150 WPM).
|
|
238
|
+
3. **Use `waitForNext()` for every VO-aligned beat.** Each content reveal that a voiceover line should cue must be gated by `waitForNext()`, not `hold()`. This is what makes voiceover-swapping possible — different voiceovers supply different advance timings, and segments respond by advancing at the right moment without code changes. Use `hold()` only for animation lead-in or fixed internal pauses within a beat.
|
|
239
|
+
4. **Set `voiceover` on each segment** to its section of the script.
|
|
240
|
+
5. **Generate the audio** using one of the two flows above.
|
|
241
|
+
6. **Sync timing** to align segment advances with the audio.
|
|
242
|
+
|
|
243
|
+
## VO-alignment smell
|
|
244
|
+
|
|
245
|
+
If you are adjusting `hold()` values inside a segment to make an animation line up with a specific voiceover recording, that is a code smell. It means the segment is coupled to one narration — any change to the voiceover (different voice, different pacing, re-recorded take) will require re-tuning those timers.
|
|
246
|
+
|
|
247
|
+
The fix is structural: content that needs to sync with the voiceover should be gated by `waitForNext()`, so timing comes from the `advances` / `Timing` data rather than from hardcoded milliseconds in the segment code. Add a new advance at the sync point, and let the timers within each beat use percentage-based durations so they scale when beat lengths shift. See [authoring_segment.md § Percentage-based timing within beats](../authoring_segment.md#percentage-based-timing-within-beats) for the pattern.
|
|
248
|
+
|
|
249
|
+
## `videowright script` CLI
|
|
250
|
+
|
|
251
|
+
The `script` command reads segments' `voiceover` fields and assembles them into markdown:
|
|
252
|
+
|
|
253
|
+
```bash
|
|
254
|
+
npx videowright script # print to stdout
|
|
255
|
+
npx videowright script --write # write to voiceover_script/script.md
|
|
256
|
+
```
|
|
257
|
+
|
|
258
|
+
See the subsections below for the output format and `--write` behavior.
|
|
259
|
+
|
|
260
|
+
### Output format
|
|
261
|
+
|
|
262
|
+
```markdown
|
|
263
|
+
# Video Title
|
|
264
|
+
|
|
265
|
+
## segment-id-1
|
|
266
|
+
Voiceover text for the first segment.
|
|
267
|
+
|
|
268
|
+
## segment-id-2
|
|
269
|
+
Voiceover text for the second segment.
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
*No voiceover: segment-id-3, segment-id-4*
|
|
274
|
+
```
|
|
275
|
+
|
|
276
|
+
### `--write` flag
|
|
277
|
+
|
|
278
|
+
With `--write`, the script is written to `videos/<name>/voiceover_script/script.md`. Without `--write`, it prints to stdout.
|
|
279
|
+
|
|
280
|
+
## Keeping things in sync
|
|
281
|
+
|
|
282
|
+
The `voiceover` field on each segment and `voiceover_script/script.md` are two representations of the same content:
|
|
283
|
+
|
|
284
|
+
- **After editing `voiceover` fields** on segments, run `npx videowright script --write` to regenerate `script.md`.
|
|
285
|
+
- **After editing `script.md`** directly, update each segment's `voiceover` field to match.
|
|
286
|
+
|
|
287
|
+
## Edge cases
|
|
288
|
+
|
|
289
|
+
| Situation | Behavior |
|
|
290
|
+
|---|---|
|
|
291
|
+
| User wants VO but has no script yet | Draft one during the build phase based on the video's purpose and segment outline. |
|
|
292
|
+
| User changes audio intent from silent to voiceover mid-project | Add `voiceover` fields to existing segments. Run `videowright script --write`. Follow the voiceover flow to generate audio and timing. |
|
|
293
|
+
| Audio file missing on disk | The CLI reports an error, with a clear message and the file path, before playback or render starts. |
|
|
294
|
+
| `--audio-track <id>` with non-existent track | CLI errors with a hint to check the `audio/tracks/` folder. |
|
|
295
|
+
| Browser autoplay blocked | Audio is silent until the user clicks the play button (which counts as a user gesture). |
|
|
296
|
+
| Default audio track set but user switches via `--audio-track <other-id>` | Advance timing updates automatically. In-segment animations remain tuned to the original default -- the user can re-run the animation sync pass if needed. |
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
# Audio
|
|
2
|
+
|
|
3
|
+
## When this is loaded
|
|
4
|
+
|
|
5
|
+
You were routed here from the intent dispatch table because the user wants to work with audio -- voiceover, sound effects, or background music.
|
|
6
|
+
|
|
7
|
+
## Overview
|
|
8
|
+
|
|
9
|
+
Videowright supports multi-source audio tracks: voice-over, sound effects, and music combined into a single rendered audio file. Videos reference an audio track (not a voice-over directly); the audio track drives video timing and is muxed into the final MP4.
|
|
10
|
+
|
|
11
|
+
The audio workflow is progressive: start with what the video needs, source the assets, then build and sync the track.
|
|
12
|
+
|
|
13
|
+
## Audio intent questions
|
|
14
|
+
|
|
15
|
+
Ask these three questions upfront. Skip any whose answer is already clear from the user's input.
|
|
16
|
+
|
|
17
|
+
> 1. Will this video have a **voice-over**? (yes / no)
|
|
18
|
+
> 2. Will it have **sound effects**? (yes / no)
|
|
19
|
+
> 3. Will it have **background music**? (yes / no)
|
|
20
|
+
|
|
21
|
+
Based on answers, load only the relevant sub-references below.
|
|
22
|
+
|
|
23
|
+
## Routing
|
|
24
|
+
|
|
25
|
+
### Voice-over
|
|
26
|
+
|
|
27
|
+
If the user wants a voice-over, load [audio/voiceover.md](audio/voiceover.md). This covers:
|
|
28
|
+
|
|
29
|
+
- AI-generated (ElevenLabs) and manual (user-provided audio) flows
|
|
30
|
+
- Script writing, provider script transformation
|
|
31
|
+
- Voice selection and style intake
|
|
32
|
+
- Sync timing computation
|
|
33
|
+
|
|
34
|
+
### Sound effects
|
|
35
|
+
|
|
36
|
+
If the user wants sound effects, load [audio/sfx/sfx.md](audio/sfx/sfx.md). This covers:
|
|
37
|
+
|
|
38
|
+
- BYO (user-provided audio), ElevenLabs (AI-generated), and Openverse (free search) sourcing flows
|
|
39
|
+
- `sfx.ts` metadata authoring
|
|
40
|
+
- Per-asset approval UX (Approve / Discard and request changes)
|
|
41
|
+
- Integration into the audio plan as cues
|
|
42
|
+
|
|
43
|
+
SFX assets live in `audio/originals/sfx/<slug>/` and are referenced by cues in the audio plan.
|
|
44
|
+
|
|
45
|
+
### Background music
|
|
46
|
+
|
|
47
|
+
If the user wants background music, load [audio/music/music.md](audio/music/music.md). This covers:
|
|
48
|
+
|
|
49
|
+
- BYO (user-provided audio), ElevenLabs (AI-generated), and Openverse (free search) sourcing flows
|
|
50
|
+
- `music.ts` metadata authoring (rich free-text notes for BPM, key, mood, structure)
|
|
51
|
+
- Per-asset approval UX (Approve / Discard and request changes)
|
|
52
|
+
- Integration into the audio plan as cues with volume curves and ducking
|
|
53
|
+
|
|
54
|
+
Music assets live in `audio/originals/music/<slug>/` and are referenced by cues in the audio plan.
|
|
55
|
+
|
|
56
|
+
### Audio plan, build, and sync
|
|
57
|
+
|
|
58
|
+
If any audio is present (VO, SFX, or music), the audio plan/build/sync workflow applies:
|
|
59
|
+
|
|
60
|
+
1. **Audio plan** -- author `audio/audio_plan.md` describing the mix composition. See [audio/audio_plan.md](audio/audio_plan.md) for the format spec, [audio/cue_template.md](audio/cue_template.md) for the per-cue field template, and [audio/styles.md](audio/styles.md) for mix-level guidance.
|
|
61
|
+
2. **Build** -- render the plan into an audio track via ffmpeg. See [audio/build.md](audio/build.md). Uses recipes from [audio/ffmpeg_cookbook.md](audio/ffmpeg_cookbook.md).
|
|
62
|
+
3. **Sync** -- compute per-segment timing from the track. See [audio/sync.md](audio/sync.md).
|
|
63
|
+
|
|
64
|
+
For **VO-only videos** (no SFX, no music), the plan is minimal (single cue, full file, placed at 0s) and is auto-emitted during the voiceover flow. The user does not need to understand the plan format -- it is created transparently and consumed by build and sync.
|
|
65
|
+
|
|
66
|
+
For **multi-source mixes** (VO + SFX, VO + music, or all three), the plan is authored explicitly with per-cue volume curves, fades, and a full ffmpeg mix command.
|
|
67
|
+
|
|
68
|
+
## File layout
|
|
69
|
+
|
|
70
|
+
Each video has an `audio/` directory:
|
|
71
|
+
|
|
72
|
+
```
|
|
73
|
+
videos/<video-slug>/
|
|
74
|
+
voiceover_script/
|
|
75
|
+
script.md # assembled VO script
|
|
76
|
+
audio/
|
|
77
|
+
audio_plan.md # audio composition plan + log
|
|
78
|
+
originals/ # source files (immutable after use)
|
|
79
|
+
voiceovers/
|
|
80
|
+
v1/
|
|
81
|
+
voiceover.ts
|
|
82
|
+
audio.mp3
|
|
83
|
+
timing.json
|
|
84
|
+
provider_script.md
|
|
85
|
+
generate.sh
|
|
86
|
+
sfx/ # slug-named subfolders
|
|
87
|
+
keyboard_typing/
|
|
88
|
+
audio.mp3
|
|
89
|
+
sfx.ts
|
|
90
|
+
generate.sh
|
|
91
|
+
music/ # slug-named subfolders
|
|
92
|
+
uplift_piano/
|
|
93
|
+
audio.mp3
|
|
94
|
+
music.ts
|
|
95
|
+
generate.sh
|
|
96
|
+
tracks/ # rendered audio tracks
|
|
97
|
+
v1/
|
|
98
|
+
track.ts # typed AudioTrack object
|
|
99
|
+
track.mp3 # rendered audio
|
|
100
|
+
plan_snapshot.md # point-in-time plan copy
|
|
101
|
+
```
|
|
102
|
+
|
|
103
|
+
### Active track
|
|
104
|
+
|
|
105
|
+
`timeline.ts` imports the active track via `default_audio_track`. The import is the single source of truth for which track is live. Switching tracks means updating the import.
|
|
106
|
+
|
|
107
|
+
```ts
|
|
108
|
+
import defaultAudioTrack from './audio/tracks/v1/track.js';
|
|
109
|
+
|
|
110
|
+
const timeline: Timeline = {
|
|
111
|
+
// ...
|
|
112
|
+
default_audio_track: defaultAudioTrack,
|
|
113
|
+
};
|
|
114
|
+
```
|
|
115
|
+
|
|
116
|
+
## CLI usage
|
|
117
|
+
|
|
118
|
+
```bash
|
|
119
|
+
# Use a specific audio track
|
|
120
|
+
npx videowright render --audio-track v1
|
|
121
|
+
|
|
122
|
+
# Suppress audio (use default_timing or segment advances)
|
|
123
|
+
npx videowright render --audio-track none
|
|
124
|
+
|
|
125
|
+
# No flag: use default_audio_track from timeline.ts if set
|
|
126
|
+
npx videowright render
|
|
127
|
+
```
|
|
128
|
+
|
|
129
|
+
## Timing precedence
|
|
130
|
+
|
|
131
|
+
1. **Active audio track's `timing`** -- via `--audio-track <id>` or `default_audio_track`
|
|
132
|
+
2. **`default_timing`** on `timeline.ts`
|
|
133
|
+
3. **`SegmentSpec.advances`** -- per-segment fallback
|
|
134
|
+
|
|
135
|
+
`--audio-track none` suppresses level 1 (audio tracks) but preserves levels 2 and 3.
|