@vargai/sdk 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/.env.example +24 -0
  2. package/CLAUDE.md +118 -0
  3. package/HIGGSFIELD_REWRITE_SUMMARY.md +300 -0
  4. package/README.md +231 -0
  5. package/SKILLS.md +157 -0
  6. package/STRUCTURE.md +92 -0
  7. package/TEST_RESULTS.md +122 -0
  8. package/action/captions/SKILL.md +170 -0
  9. package/action/captions/index.ts +169 -0
  10. package/action/edit/SKILL.md +235 -0
  11. package/action/edit/index.ts +437 -0
  12. package/action/image/SKILL.md +140 -0
  13. package/action/image/index.ts +105 -0
  14. package/action/sync/SKILL.md +136 -0
  15. package/action/sync/index.ts +145 -0
  16. package/action/transcribe/SKILL.md +179 -0
  17. package/action/transcribe/index.ts +210 -0
  18. package/action/video/SKILL.md +116 -0
  19. package/action/video/index.ts +125 -0
  20. package/action/voice/SKILL.md +125 -0
  21. package/action/voice/index.ts +136 -0
  22. package/biome.json +33 -0
  23. package/bun.lock +842 -0
  24. package/cli/commands/find.ts +58 -0
  25. package/cli/commands/help.ts +70 -0
  26. package/cli/commands/list.ts +49 -0
  27. package/cli/commands/run.ts +237 -0
  28. package/cli/commands/which.ts +66 -0
  29. package/cli/discover.ts +66 -0
  30. package/cli/index.ts +33 -0
  31. package/cli/runner.ts +65 -0
  32. package/cli/types.ts +49 -0
  33. package/cli/ui.ts +185 -0
  34. package/index.ts +75 -0
  35. package/lib/README.md +144 -0
  36. package/lib/ai-sdk/fal.ts +106 -0
  37. package/lib/ai-sdk/replicate.ts +107 -0
  38. package/lib/elevenlabs.ts +382 -0
  39. package/lib/fal.ts +467 -0
  40. package/lib/ffmpeg.ts +467 -0
  41. package/lib/fireworks.ts +235 -0
  42. package/lib/groq.ts +246 -0
  43. package/lib/higgsfield/MIGRATION.md +308 -0
  44. package/lib/higgsfield/README.md +273 -0
  45. package/lib/higgsfield/example.ts +228 -0
  46. package/lib/higgsfield/index.ts +241 -0
  47. package/lib/higgsfield/soul.ts +262 -0
  48. package/lib/higgsfield.ts +176 -0
  49. package/lib/remotion/SKILL.md +823 -0
  50. package/lib/remotion/cli.ts +115 -0
  51. package/lib/remotion/functions.ts +283 -0
  52. package/lib/remotion/index.ts +19 -0
  53. package/lib/remotion/templates.ts +73 -0
  54. package/lib/replicate.ts +304 -0
  55. package/output.txt +1 -0
  56. package/package.json +42 -0
  57. package/pipeline/cookbooks/SKILL.md +285 -0
  58. package/pipeline/cookbooks/remotion-video.md +585 -0
  59. package/pipeline/cookbooks/round-video-character.md +337 -0
  60. package/pipeline/cookbooks/talking-character.md +59 -0
  61. package/scripts/produce-menopause-campaign.sh +202 -0
  62. package/service/music/SKILL.md +229 -0
  63. package/service/music/index.ts +296 -0
  64. package/test-import.ts +7 -0
  65. package/test-services.ts +97 -0
  66. package/tsconfig.json +29 -0
  67. package/utilities/s3.ts +147 -0
package/SKILLS.md ADDED
@@ -0,0 +1,157 @@
1
+ # agent skills
2
+
3
+ this sdk includes claude code agent skills for each service. each skill is co-located with its service code.
4
+
5
+ ## available skills
6
+
7
+ ### service skills
8
+
9
+ located in `service/<name>/SKILL.md`:
10
+
11
+ 1. **image-generation** (`service/image/`)
12
+ - generate ai images using fal (flux models) or higgsfield soul characters
13
+ - cli: `bun run service/image fal|soul <prompt> [options]`
14
+
15
+ 2. **video-generation** (`service/video/`)
16
+ - generate videos from images (local or url) or text prompts using fal.ai
17
+ - supports local image files - automatically uploads to fal storage
18
+ - cli: `bun run service/video from_image|from_text <args>`
19
+
20
+ 3. **voice-synthesis** (`service/voice/`)
21
+ - generate realistic text-to-speech audio using elevenlabs
22
+ - cli: `bun run service/voice generate|elevenlabs <text> [options]`
23
+
24
+ 3b. **music-generation** (`lib/elevenlabs.ts`)
25
+ - generate music from text prompts using elevenlabs
26
+ - generate sound effects from descriptions
27
+ - cli: `bun run lib/elevenlabs.ts music|sfx <prompt> [options]`
28
+
29
+ 4. **video-lipsync** (`service/sync/`)
30
+ - sync video with audio using wav2lip or simple overlay
31
+ - cli: `bun run service/sync sync|wav2lip|overlay <args>`
32
+
33
+ 5. **video-captions** (`service/captions/`)
34
+ - add auto-generated or custom subtitles to videos
35
+ - cli: `bun run service/captions <videoPath> [options]`
36
+
37
+ 6. **video-editing** (`service/edit/`)
38
+ - edit videos with ffmpeg (resize, trim, concat, social media prep)
39
+ - cli: `bun run service/edit social|montage|trim|resize|merge_audio <args>`
40
+
41
+ 7. **audio-transcription** (`service/transcribe/`)
42
+ - transcribe audio to text or subtitles using groq/fireworks
43
+ - cli: `bun run service/transcribe <audioUrl> <provider> [outputPath]`
44
+
45
+ ### utility skills
46
+
47
+ 8. **telegram-send** (external: `/Users/aleks/Github/Badaboom1995/rumble-b2c`)
48
+ - send videos to telegram users/channels as round videos
49
+ - automatically converts to 512x512 square format for telegram
50
+ - cli: `cd /Users/aleks/Github/Badaboom1995/rumble-b2c && bun run scripts/telegram-send-video.ts <videoPath> <@username>`
51
+ - example: `cd /Users/aleks/Github/Badaboom1995/rumble-b2c && bun run scripts/telegram-send-video.ts /path/to/video.mp4 @caffeinum`
52
+
53
+ ### pipeline skills
54
+
55
+ located in `pipeline/cookbooks/SKILL.md`:
56
+
57
+ 9. **talking-character-pipeline** (`pipeline/cookbooks/`)
58
+ - complete workflow to create talking character videos
59
+ - combines: character generation → voiceover → animation → lipsync → captions → social prep
60
+
61
+ 10. **round-video-character** (`pipeline/cookbooks/round-video-character.md`)
62
+ - create realistic round selfie videos for telegram using nano banana pro + wan 2.5
63
+ - workflow: generate selfie first frame (person in setting) → voiceover → wan 2.5 video
64
+ - uses: `bun run lib/fal.ts`, `bun run lib/replicate.ts`, `bun run lib/elevenlabs.ts`
65
+ - input: text script + profile photo
66
+ - output: extreme close-up selfie video with authentic camera shake, lighting, and audio
67
+
68
+ ## structure
69
+
70
+ each skill follows this pattern:
71
+
72
+ ```
73
+ service/<name>/
74
+ ├── index.ts # service implementation
75
+ └── SKILL.md # claude code agent skill
76
+ ```
77
+
78
+ ## how skills work
79
+
80
+ skills are **model-invoked** - claude autonomously decides when to use them based on your request and the skill's description.
81
+
82
+ **example:**
83
+ - you say: "create a talking character video"
84
+ - claude reads `talking-character-pipeline` skill
85
+ - claude executes the workflow using the pipeline steps
86
+
87
+ ## using skills
88
+
89
+ ### in claude code
90
+
91
+ skills are automatically discovered when you're in the sdk directory:
92
+
93
+ ```
94
+ user: create an image of a sunset
95
+ claude: [uses image-generation skill]
96
+ bun run service/image fal "beautiful sunset over mountains"
97
+ ```
98
+
99
+ ### manually
100
+
101
+ you can also run services directly:
102
+
103
+ ```bash
104
+ # generate image
105
+ bun run service/image fal "sunset over mountains" true
106
+
107
+ # generate video from that image
108
+ bun run service/video from_image "camera pan" https://image-url.jpg 5 true
109
+
110
+ # add voice
111
+ bun run service/voice elevenlabs "this is a beautiful sunset" rachel true
112
+
113
+ # sync with video
114
+ bun run service/sync wav2lip https://video-url.mp4 https://audio-url.mp3
115
+ ```
116
+
117
+ ## skill features
118
+
119
+ each skill includes:
120
+
121
+ - **name**: unique skill identifier
122
+ - **description**: when claude should use this skill
123
+ - **allowed-tools**: restricted to Read, Bash for safety
124
+ - **usage examples**: cli and programmatic examples
125
+ - **when to use**: specific use cases
126
+ - **tips**: best practices
127
+ - **environment variables**: required api keys
128
+
129
+ ## benefits
130
+
131
+ - **discoverability**: claude knows all available services
132
+ - **context**: skills provide usage examples and best practices
133
+ - **safety**: `allowed-tools` limits to read-only and bash execution
134
+ - **documentation**: skills serve as living documentation
135
+
136
+ ## skill reference
137
+
138
+ | skill | service | primary use case |
139
+ |-------|---------|------------------|
140
+ | image-generation | image | create ai images, character headshots |
141
+ | video-generation | video | animate images, generate video clips |
142
+ | voice-synthesis | voice | text-to-speech, voiceovers |
143
+ | music-generation | elevenlabs | generate music, create sound effects |
144
+ | video-lipsync | sync | sync audio with video, talking characters |
145
+ | video-captions | captions | add subtitles, accessibility |
146
+ | video-editing | edit | resize, trim, social media optimization |
147
+ | audio-transcription | transcribe | speech-to-text, subtitle generation |
148
+ | telegram-send | external | send videos to telegram as round videos |
149
+ | talking-character-pipeline | pipeline | end-to-end talking character videos |
150
+ | round-video-character | pipeline | telegram round selfie videos with wan 2.5 |
151
+
152
+ ## see also
153
+
154
+ - [README.md](README.md) - sdk overview and installation
155
+ - [STRUCTURE.md](STRUCTURE.md) - detailed module organization
156
+ - [pipeline/cookbooks/talking-character.md](pipeline/cookbooks/talking-character.md) - talking character workflow
157
+ - [pipeline/cookbooks/round-video-character.md](pipeline/cookbooks/round-video-character.md) - telegram round selfie video cookbook
package/STRUCTURE.md ADDED
@@ -0,0 +1,92 @@
1
+ # sdk structure
2
+
3
+ ## lib/ - two fal implementations
4
+
5
+ ### lib/ai-sdk/fal.ts
6
+ uses `@ai-sdk/fal` with vercel ai sdk
7
+
8
+ **when to use:**
9
+ - standard image generation
10
+ - need consistent api across providers
11
+ - want automatic image format handling
12
+ - prefer typed aspect ratios
13
+
14
+ **commands:**
15
+ ```bash
16
+ bun run lib/ai-sdk/fal.ts generate_image <prompt> [model] [aspectRatio]
17
+ ```
18
+
19
+ ### lib/fal.ts
20
+ uses `@fal-ai/client` directly
21
+
22
+ **when to use:**
23
+ - video generation (image-to-video, text-to-video)
24
+ - advanced fal features
25
+ - need queue/streaming updates
26
+ - custom api parameters
27
+
28
+ **commands:**
29
+ ```bash
30
+ bun run lib/fal.ts generate_image <prompt> [model] [imageSize]
31
+ bun run lib/fal.ts image_to_video <prompt> <imageUrl> [duration]
32
+ bun run lib/fal.ts text_to_video <prompt> [duration]
33
+ ```
34
+
35
+ ### lib/higgsfield.ts
36
+ uses `@higgsfield/client` for soul character generation
37
+
38
+ **commands:**
39
+ ```bash
40
+ bun run lib/higgsfield.ts generate_soul <prompt> [customReferenceId]
41
+ bun run lib/higgsfield.ts create_character <name> <imageUrl1> [imageUrl2...]
42
+ bun run lib/higgsfield.ts list_styles
43
+ ```
44
+
45
+ ## service/ - high-level wrappers
46
+
47
+ ### service/image.ts
48
+ combines fal + higgsfield for image generation
49
+
50
+ ```bash
51
+ bun run service/image.ts fal <prompt> [model] [upload]
52
+ bun run service/image.ts soul <prompt> [customReferenceId] [upload]
53
+ ```
54
+
55
+ ### service/video.ts
56
+ video generation with optional s3 upload
57
+
58
+ ```bash
59
+ bun run service/video.ts from_image <prompt> <imageUrl> [duration] [upload]
60
+ bun run service/video.ts from_text <prompt> [duration] [upload]
61
+ ```
62
+
63
+ ## utilities/
64
+
65
+ ### utilities/s3.ts
66
+ cloudflare r2 / s3 storage operations
67
+
68
+ ```bash
69
+ bun run utilities/s3.ts upload <filePath> <objectKey>
70
+ bun run utilities/s3.ts upload_from_url <url> <objectKey>
71
+ bun run utilities/s3.ts presigned_url <objectKey> [expiresIn]
72
+ ```
73
+
74
+ ## pipeline/cookbooks/
75
+ markdown guides for complex workflows
76
+
77
+ - `talking-character.md`: create talking character videos
78
+
79
+ ## dependencies
80
+
81
+ - `@ai-sdk/fal` - vercel ai sdk fal provider
82
+ - `@fal-ai/client` - official fal client
83
+ - `@higgsfield/client` - higgsfield api client
84
+ - `@aws-sdk/client-s3` - s3 storage
85
+ - `ai` - vercel ai sdk core
86
+
87
+ ## key decisions
88
+
89
+ 1. **two fal implementations** - ai-sdk for simplicity, client for power
90
+ 2. **all scripts are cli + library** - can be run directly or imported
91
+ 3. **consistent logging** - `[module] message` format
92
+ 4. **auto image opening** - ai-sdk version opens images automatically
package/TEST_RESULTS.md ADDED
@@ -0,0 +1,122 @@
1
+ # test results
2
+
3
+ ## ✅ both fal approaches working
4
+
5
+ ### approach 1: lib/ai-sdk/fal.ts (vercel ai sdk)
6
+
7
+ ```bash
8
+ $ bun run lib/ai-sdk/fal.ts generate_image "futuristic spaceship" "fal-ai/flux/dev" "16:9"
9
+
10
+ [ai-sdk/fal] generating image with fal-ai/flux/dev
11
+ [ai-sdk/fal] prompt: futuristic spaceship interior
12
+ [ai-sdk/fal] aspect ratio: 16:9
13
+ [ai-sdk/fal] completed!
14
+
15
+ image saved to: /tmp/fal-ai-sdk-1763772836608.png
16
+
17
+ metadata:
18
+ {
19
+ "images": [
20
+ {
21
+ "width": 1024,
22
+ "height": 576,
23
+ "contentType": "image/jpeg",
24
+ "nsfw": false
25
+ }
26
+ ]
27
+ }
28
+ ```
29
+
30
+ ✅ benefits:
31
+ - clean typed api
32
+ - auto image save + open
33
+ - aspect ratio support
34
+ - consistent with other ai-sdk providers
35
+
36
+ ### approach 2: lib/fal.ts (fal client direct)
37
+
38
+ ```bash
39
+ $ bun run lib/fal.ts generate_image "ancient temple ruins"
40
+
41
+ [fal] generating image with fal-ai/flux-pro/v1.1
42
+ [fal] prompt: ancient temple ruins at sunset
43
+ [fal] processing...
44
+ [fal] completed!
45
+
46
+ {
47
+ "data": {
48
+ "images": [
49
+ {
50
+ "url": "https://v3b.fal.media/files/b/koala/L5LYGCHZ4aZ_CKZsmPbUe.jpg",
51
+ "width": 1024,
52
+ "height": 768,
53
+ "content_type": "image/jpeg"
54
+ }
55
+ ],
56
+ "seed": 2946158106
57
+ }
58
+ }
59
+ ```
60
+
61
+ ✅ benefits:
62
+ - full api access
63
+ - queue updates
64
+ - video support
65
+ - custom parameters
66
+
67
+ ## cli tests ✅
68
+
69
+ all help menus working:
70
+
71
+ ```bash
72
+ bun run lib/ai-sdk/fal.ts # ✓
73
+ bun run lib/fal.ts # ✓
74
+ bun run lib/higgsfield.ts # ✓
75
+ bun run service/image.ts # ✓
76
+ bun run service/video.ts # ✓
77
+ bun run utilities/s3.ts # ✓
78
+ ```
79
+
80
+ ## library imports ✅
81
+
82
+ ```typescript
83
+ import { generateImage } from "./index"
84
+ import * as aiSdkFal from "./index"
85
+
86
+ // both approaches available
87
+ ```
88
+
89
+ ## actual generation tests ✅
90
+
91
+ successfully generated and opened:
92
+ - cyberpunk city (16:9, ai-sdk)
93
+ - spaceship interior (16:9, ai-sdk)
94
+ - temple ruins (4:3, fal client)
95
+ - aurora borealis (4:3, fal client)
96
+
97
+ all images ~15-20 seconds generation time
98
+
99
+ ## what works
100
+
101
+ 1. **dual fal implementations** - ai-sdk for simplicity, client for power ✓
102
+ 2. **all cli scripts executable** with proper help menus ✓
103
+ 3. **library imports functional** ✓
104
+ 4. **actual image generation working** ✓
105
+ 5. **automatic image opening** (ai-sdk version) ✓
106
+ 6. **queue progress updates** (fal client) ✓
107
+
108
+ ## file structure
109
+
110
+ ```
111
+ lib/
112
+ ├── ai-sdk/
113
+ │ └── fal.ts # vercel ai sdk approach
114
+ ├── fal.ts # fal client approach
115
+ └── higgsfield.ts # soul character generation
116
+ ```
117
+
118
+ ## recommendations
119
+
120
+ - **use lib/ai-sdk/fal.ts** for standard image generation
121
+ - **use lib/fal.ts** for video or advanced features
122
+ - **use service/*.ts** for high-level operations with s3 upload
package/action/captions/SKILL.md ADDED
@@ -0,0 +1,170 @@
1
+ ---
2
+ name: video-captions
3
+ description: add auto-generated or custom subtitles to videos using groq/fireworks transcription and ffmpeg overlay. use when adding captions, subtitles, or text overlays to videos for accessibility or social media.
4
+ allowed-tools: Read, Bash
5
+ ---
6
+
7
+ # video captions
8
+
9
+ automatically generate and overlay subtitles on videos with customizable styling.
10
+
11
+ ## features
12
+
13
+ - **auto-generation**: transcribe video audio using groq or fireworks
14
+ - **custom srt support**: use existing subtitle files
15
+ - **styling**: customize font, size, colors, position
16
+ - **word-level timing**: fireworks provides precise word timestamps
17
+ - **instant overlay**: ffmpeg-based subtitle rendering
18
+
19
+ ## usage
20
+
21
+ ### auto-generate captions
22
+ ```bash
23
+ bun run service/captions.ts <videoPath> [outputPath] [options]
24
+ ```
25
+
26
+ **basic example:**
27
+ ```bash
28
+ bun run service/captions.ts media/video.mp4
29
+ # outputs: media/video-captioned.mp4
30
+ ```
31
+
32
+ **with options:**
33
+ ```bash
34
+ bun run service/captions.ts media/video.mp4 output.mp4 --provider fireworks --font Arial --size 28
35
+ ```
36
+
37
+ ### use existing srt file
38
+ ```bash
39
+ bun run service/captions.ts media/video.mp4 output.mp4 --srt media/video.srt
40
+ ```
41
+
42
+ ## options
43
+
44
+ - `--srt <path>` - use existing srt file instead of auto-generating
45
+ - `--provider <name>` - groq or fireworks (default: fireworks)
46
+ - `--font <name>` - font name (default: Arial)
47
+ - `--size <number>` - font size (default: 24)
48
+ - `--color <hex>` - primary color in ASS format (default: &HFFFFFF white)
49
+ - `--outline <hex>` - outline color in ASS format (default: &H000000 black)
50
+
51
+ ## as library
52
+
53
+ ```typescript
54
+ import { addCaptions } from "./service/captions"
55
+
56
+ const result = await addCaptions({
57
+ videoPath: "media/video.mp4",
58
+ output: "captioned.mp4",
59
+ provider: "fireworks", // or "groq"
60
+ style: {
61
+ fontName: "Helvetica",
62
+ fontSize: 28,
63
+ primaryColor: "&HFFFFFF",
64
+ outlineColor: "&H000000",
65
+ bold: true,
66
+ alignment: 2, // bottom center
67
+ marginV: 20
68
+ }
69
+ })
70
+ ```
71
+
72
+ ## providers
73
+
74
+ ### fireworks (recommended)
75
+ - **word-level timestamps** for precise timing
76
+ - generates `.srt` format with detailed timing
77
+ - better for social media content
78
+ - slightly slower transcription
79
+
80
+ ### groq
81
+ - **ultra-fast** transcription
82
+ - plain text output (converted to srt)
83
+ - sentence-level timing
84
+ - great for quick previews
85
+
86
+ ## styling options
87
+
88
+ ```typescript
89
+ interface SubtitleStyle {
90
+ fontName?: string // default: Arial
91
+ fontSize?: number // default: 24
92
+ primaryColor?: string // default: &HFFFFFF (white)
93
+ outlineColor?: string // default: &H000000 (black)
94
+ bold?: boolean // default: true
95
+ alignment?: number // 1-9, default: 2 (bottom center)
96
+ marginV?: number // vertical margin, default: 20
97
+ }
98
+ ```
99
+
100
+ **alignment values:**
101
+ ```
102
+ 1 = bottom left 2 = bottom center 3 = bottom right
103
+ 4 = middle left 5 = middle center 6 = middle right
104
+ 7 = top left 8 = top center 9 = top right
105
+ ```
106
+
107
+ ## when to use
108
+
109
+ use this skill when:
110
+ - preparing videos for social media (tiktok, instagram, youtube)
111
+ - adding accessibility features
112
+ - creating educational or tutorial content
113
+ - need word-level caption timing
114
+ - translating videos with custom srt files
115
+
116
+ ## typical workflow
117
+
118
+ 1. create or edit video
119
+ 2. add captions with auto-transcription (this service)
120
+ 3. customize style for platform
121
+ 4. prepare for social media (edit service)
122
+
123
+ ## examples
124
+
125
+ **tiktok/instagram style captions:**
126
+ ```bash
127
+ bun run service/captions.ts video.mp4 captioned.mp4 \
128
+ --provider fireworks \
129
+ --font "Arial Black" \
130
+ --size 32 \
131
+ --color "&H00FFFF"
132
+ ```
133
+
134
+ **professional style:**
135
+ ```bash
136
+ bun run service/captions.ts video.mp4 output.mp4 \
137
+ --provider fireworks \
138
+ --font "Helvetica" \
139
+ --size 24
140
+ ```
141
+
142
+ **with existing subtitles:**
143
+ ```bash
144
+ bun run service/captions.ts video.mp4 final.mp4 \
145
+ --srt custom-subtitles.srt \
146
+ --font "Arial" \
147
+ --size 26
148
+ ```
149
+
150
+ ## output
151
+
152
+ - generates `.srt` file if auto-transcribing
153
+ - creates new video file with burned-in subtitles
154
+ - preserves original video quality
155
+ - audio is copied without re-encoding
156
+
157
+ ## environment variables
158
+
159
+ required (for auto-transcription):
160
+ - `GROQ_API_KEY` - for groq provider
161
+ - `FIREWORKS_API_KEY` - for fireworks provider
162
+
163
+ **system requirements:**
164
+ - ffmpeg must be installed
165
+
166
+ ## processing time
167
+
168
+ - transcription: 5-30 seconds (depending on video length)
169
+ - overlay: 5-15 seconds (depending on video length)
170
+ - total: typically under 1 minute
package/action/captions/index.ts ADDED
@@ -0,0 +1,169 @@
1
+ #!/usr/bin/env bun
2
+
3
+ /**
4
+ * video captioning service
5
+ * generates and overlays subtitles on videos using ffmpeg
6
+ * supports auto-generation via groq/fireworks or custom srt files
7
+ */
8
+
9
+ import { existsSync } from "node:fs";
10
+ import ffmpeg from "fluent-ffmpeg";
11
+ import type { ActionMeta } from "../../cli/types";
12
+ import { transcribe } from "../transcribe";
13
+
14
+ export const meta: ActionMeta = {
15
+ name: "captions",
16
+ type: "action",
17
+ description: "add subtitles to video",
18
+ inputType: "video",
19
+ outputType: "video",
20
+ schema: {
21
+ input: {
22
+ type: "object",
23
+ required: ["video", "output"],
24
+ properties: {
25
+ video: {
26
+ type: "string",
27
+ format: "file-path",
28
+ description: "input video file",
29
+ },
30
+ output: {
31
+ type: "string",
32
+ format: "file-path",
33
+ description: "output video path",
34
+ },
35
+ srt: {
36
+ type: "string",
37
+ format: "file-path",
38
+ description: "existing srt file (auto-generates if not provided)",
39
+ },
40
+ provider: {
41
+ type: "string",
42
+ enum: ["groq", "fireworks"],
43
+ default: "fireworks",
44
+ description: "transcription provider for auto-generation",
45
+ },
46
+ },
47
+ },
48
+ output: { type: "string", format: "file-path", description: "video path" },
49
+ },
50
+ async run(options) {
51
+ const { video, output, srt, provider } = options as {
52
+ video: string;
53
+ output: string;
54
+ srt?: string;
55
+ provider?: "groq" | "fireworks";
56
+ };
57
+ return addCaptions({ videoPath: video, output, srtPath: srt, provider });
58
+ },
59
+ };
60
+
61
+ // types
62
+ export interface AddCaptionsOptions {
63
+ videoPath: string;
64
+ srtPath?: string; // optional existing srt file
65
+ output: string;
66
+ provider?: "groq" | "fireworks"; // only used if srtPath not provided
67
+ style?: SubtitleStyle;
68
+ }
69
+
70
+ export interface SubtitleStyle {
71
+ fontName?: string; // default: Arial
72
+ fontSize?: number; // default: 24
73
+ primaryColor?: string; // default: &HFFFFFF (white)
74
+ outlineColor?: string; // default: &H000000 (black)
75
+ bold?: boolean; // default: true
76
+ alignment?: number; // 1-9, default: 2 (bottom center)
77
+ marginV?: number; // vertical margin, default: 20
78
+ }
79
+
80
+ // default subtitle style
81
+ const DEFAULT_STYLE: Required<SubtitleStyle> = {
82
+ fontName: "Arial",
83
+ fontSize: 24,
84
+ primaryColor: "&HFFFFFF", // white
85
+ outlineColor: "&H000000", // black
86
+ bold: true,
87
+ alignment: 2, // bottom center
88
+ marginV: 20,
89
+ };
90
+
91
+ // main function to add captions to video
92
+ export async function addCaptions(
93
+ options: AddCaptionsOptions,
94
+ ): Promise<string> {
95
+ const { videoPath, srtPath, output, provider = "fireworks", style } = options;
96
+
97
+ if (!videoPath) {
98
+ throw new Error("videoPath is required");
99
+ }
100
+ if (!output) {
101
+ throw new Error("output is required");
102
+ }
103
+ if (!existsSync(videoPath)) {
104
+ throw new Error(`video file not found: ${videoPath}`);
105
+ }
106
+
107
+ console.log("[captions] adding captions to video...");
108
+
109
+ // determine srt file path
110
+ let finalSrtPath = srtPath;
111
+
112
+ // if no srt file provided, auto-generate it
113
+ if (!finalSrtPath) {
114
+ console.log(
115
+ `[captions] no srt file provided, auto-generating with ${provider}...`,
116
+ );
117
+
118
+ // generate srt file from video audio
119
+ const tempSrtPath = videoPath.replace(/\.[^.]+$/, ".srt");
120
+
121
+ const result = await transcribe({
122
+ audioUrl: videoPath,
123
+ provider,
124
+ outputFormat: "srt",
125
+ outputPath: tempSrtPath,
126
+ });
127
+
128
+ if (!result.success) {
129
+ throw new Error(`failed to generate subtitles: ${result.error}`);
130
+ }
131
+
132
+ finalSrtPath = tempSrtPath;
133
+ console.log(`[captions] generated subtitles at ${finalSrtPath}`);
134
+ }
135
+
136
+ if (!existsSync(finalSrtPath)) {
137
+ throw new Error(`srt file not found: ${finalSrtPath}`);
138
+ }
139
+
140
+ // merge style with defaults
141
+ const finalStyle = { ...DEFAULT_STYLE, ...style };
142
+
143
+ // build subtitle filter with style
144
+ const subtitlesFilter = `subtitles=${finalSrtPath}:force_style='FontName=${finalStyle.fontName},FontSize=${finalStyle.fontSize},PrimaryColour=${finalStyle.primaryColor},OutlineColour=${finalStyle.outlineColor},Bold=${finalStyle.bold ? -1 : 0},Alignment=${finalStyle.alignment},MarginV=${finalStyle.marginV}'`;
145
+
146
+ console.log("[captions] overlaying subtitles on video...");
147
+
148
+ return new Promise((resolve, reject) => {
149
+ ffmpeg(videoPath)
150
+ .videoFilters(subtitlesFilter)
151
+ .outputOptions(["-c:a", "copy"]) // copy audio without re-encoding
152
+ .output(output)
153
+ .on("end", () => {
154
+ console.log(`[captions] saved to ${output}`);
155
+ resolve(output);
156
+ })
157
+ .on("error", (err) => {
158
+ console.error("[captions] error:", err);
159
+ reject(err);
160
+ })
161
+ .run();
162
+ });
163
+ }
164
+
165
+ // cli
166
+ if (import.meta.main) {
167
+ const { runCli } = await import("../../cli/runner");
168
+ runCli(meta);
169
+ }