@vargai/sdk 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.example +24 -0
- package/CLAUDE.md +118 -0
- package/HIGGSFIELD_REWRITE_SUMMARY.md +300 -0
- package/README.md +231 -0
- package/SKILLS.md +157 -0
- package/STRUCTURE.md +92 -0
- package/TEST_RESULTS.md +122 -0
- package/action/captions/SKILL.md +170 -0
- package/action/captions/index.ts +169 -0
- package/action/edit/SKILL.md +235 -0
- package/action/edit/index.ts +437 -0
- package/action/image/SKILL.md +140 -0
- package/action/image/index.ts +105 -0
- package/action/sync/SKILL.md +136 -0
- package/action/sync/index.ts +145 -0
- package/action/transcribe/SKILL.md +179 -0
- package/action/transcribe/index.ts +210 -0
- package/action/video/SKILL.md +116 -0
- package/action/video/index.ts +125 -0
- package/action/voice/SKILL.md +125 -0
- package/action/voice/index.ts +136 -0
- package/biome.json +33 -0
- package/bun.lock +842 -0
- package/cli/commands/find.ts +58 -0
- package/cli/commands/help.ts +70 -0
- package/cli/commands/list.ts +49 -0
- package/cli/commands/run.ts +237 -0
- package/cli/commands/which.ts +66 -0
- package/cli/discover.ts +66 -0
- package/cli/index.ts +33 -0
- package/cli/runner.ts +65 -0
- package/cli/types.ts +49 -0
- package/cli/ui.ts +185 -0
- package/index.ts +75 -0
- package/lib/README.md +144 -0
- package/lib/ai-sdk/fal.ts +106 -0
- package/lib/ai-sdk/replicate.ts +107 -0
- package/lib/elevenlabs.ts +382 -0
- package/lib/fal.ts +467 -0
- package/lib/ffmpeg.ts +467 -0
- package/lib/fireworks.ts +235 -0
- package/lib/groq.ts +246 -0
- package/lib/higgsfield/MIGRATION.md +308 -0
- package/lib/higgsfield/README.md +273 -0
- package/lib/higgsfield/example.ts +228 -0
- package/lib/higgsfield/index.ts +241 -0
- package/lib/higgsfield/soul.ts +262 -0
- package/lib/higgsfield.ts +176 -0
- package/lib/remotion/SKILL.md +823 -0
- package/lib/remotion/cli.ts +115 -0
- package/lib/remotion/functions.ts +283 -0
- package/lib/remotion/index.ts +19 -0
- package/lib/remotion/templates.ts +73 -0
- package/lib/replicate.ts +304 -0
- package/output.txt +1 -0
- package/package.json +42 -0
- package/pipeline/cookbooks/SKILL.md +285 -0
- package/pipeline/cookbooks/remotion-video.md +585 -0
- package/pipeline/cookbooks/round-video-character.md +337 -0
- package/pipeline/cookbooks/talking-character.md +59 -0
- package/scripts/produce-menopause-campaign.sh +202 -0
- package/service/music/SKILL.md +229 -0
- package/service/music/index.ts +296 -0
- package/test-import.ts +7 -0
- package/test-services.ts +97 -0
- package/tsconfig.json +29 -0
- package/utilities/s3.ts +147 -0
package/SKILLS.md
ADDED
|
@@ -0,0 +1,157 @@
|
|
|
1
|
+
# agent skills
|
|
2
|
+
|
|
3
|
+
this sdk includes claude code agent skills for each service. each skill is co-located with its service code.
|
|
4
|
+
|
|
5
|
+
## available skills
|
|
6
|
+
|
|
7
|
+
### service skills
|
|
8
|
+
|
|
9
|
+
located in `service/<name>/SKILL.md`:
|
|
10
|
+
|
|
11
|
+
1. **image-generation** (`service/image/`)
|
|
12
|
+
- generate ai images using fal (flux models) or higgsfield soul characters
|
|
13
|
+
- cli: `bun run service/image fal|soul <prompt> [options]`
|
|
14
|
+
|
|
15
|
+
2. **video-generation** (`service/video/`)
|
|
16
|
+
- generate videos from images (local or url) or text prompts using fal.ai
|
|
17
|
+
- supports local image files - automatically uploads to fal storage
|
|
18
|
+
- cli: `bun run service/video from_image|from_text <args>`
|
|
19
|
+
|
|
20
|
+
3. **voice-synthesis** (`service/voice/`)
|
|
21
|
+
- generate realistic text-to-speech audio using elevenlabs
|
|
22
|
+
- cli: `bun run service/voice generate|elevenlabs <text> [options]`
|
|
23
|
+
|
|
24
|
+
3b. **music-generation** (`lib/elevenlabs.ts`)
|
|
25
|
+
- generate music from text prompts using elevenlabs
|
|
26
|
+
- generate sound effects from descriptions
|
|
27
|
+
- cli: `bun run lib/elevenlabs.ts music|sfx <prompt> [options]`
|
|
28
|
+
|
|
29
|
+
4. **video-lipsync** (`service/sync/`)
|
|
30
|
+
- sync video with audio using wav2lip or simple overlay
|
|
31
|
+
- cli: `bun run service/sync sync|wav2lip|overlay <args>`
|
|
32
|
+
|
|
33
|
+
5. **video-captions** (`service/captions/`)
|
|
34
|
+
- add auto-generated or custom subtitles to videos
|
|
35
|
+
- cli: `bun run service/captions <videoPath> [options]`
|
|
36
|
+
|
|
37
|
+
6. **video-editing** (`service/edit/`)
|
|
38
|
+
- edit videos with ffmpeg (resize, trim, concat, social media prep)
|
|
39
|
+
- cli: `bun run service/edit social|montage|trim|resize|merge_audio <args>`
|
|
40
|
+
|
|
41
|
+
7. **audio-transcription** (`service/transcribe/`)
|
|
42
|
+
- transcribe audio to text or subtitles using groq/fireworks
|
|
43
|
+
- cli: `bun run service/transcribe <audioUrl> <provider> [outputPath]`
|
|
44
|
+
|
|
45
|
+
### utility skills
|
|
46
|
+
|
|
47
|
+
8. **telegram-send** (external: `/Users/aleks/Github/Badaboom1995/rumble-b2c`)
|
|
48
|
+
- send videos to telegram users/channels as round videos
|
|
49
|
+
- automatically converts to 512x512 square format for telegram
|
|
50
|
+
- cli: `cd /Users/aleks/Github/Badaboom1995/rumble-b2c && bun run scripts/telegram-send-video.ts <videoPath> <@username>`
|
|
51
|
+
- example: `cd /Users/aleks/Github/Badaboom1995/rumble-b2c && bun run scripts/telegram-send-video.ts /path/to/video.mp4 @caffeinum`
|
|
52
|
+
|
|
53
|
+
### pipeline skills
|
|
54
|
+
|
|
55
|
+
located in `pipeline/cookbooks/SKILL.md`:
|
|
56
|
+
|
|
57
|
+
9. **talking-character-pipeline** (`pipeline/cookbooks/`)
|
|
58
|
+
- complete workflow to create talking character videos
|
|
59
|
+
- combines: character generation → voiceover → animation → lipsync → captions → social prep
|
|
60
|
+
|
|
61
|
+
10. **round-video-character** (`pipeline/cookbooks/round-video-character.md`)
|
|
62
|
+
- create realistic round selfie videos for telegram using nano banana pro + wan 2.5
|
|
63
|
+
- workflow: generate selfie first frame (person in setting) → voiceover → wan 2.5 video
|
|
64
|
+
- uses: `bun run lib/fal.ts`, `bun run lib/replicate.ts`, `bun run lib/elevenlabs.ts`
|
|
65
|
+
- input: text script + profile photo
|
|
66
|
+
- output: extreme close-up selfie video with authentic camera shake, lighting, and audio
|
|
67
|
+
|
|
68
|
+
## structure
|
|
69
|
+
|
|
70
|
+
each skill follows this pattern:
|
|
71
|
+
|
|
72
|
+
```
|
|
73
|
+
service/<name>/
|
|
74
|
+
├── index.ts # service implementation
|
|
75
|
+
└── SKILL.md # claude code agent skill
|
|
76
|
+
```
|
|
77
|
+
|
|
78
|
+
## how skills work
|
|
79
|
+
|
|
80
|
+
skills are **model-invoked** - claude autonomously decides when to use them based on your request and the skill's description.
|
|
81
|
+
|
|
82
|
+
**example:**
|
|
83
|
+
- you say: "create a talking character video"
|
|
84
|
+
- claude reads `talking-character-pipeline` skill
|
|
85
|
+
- claude executes the workflow using the pipeline steps
|
|
86
|
+
|
|
87
|
+
## using skills
|
|
88
|
+
|
|
89
|
+
### in claude code
|
|
90
|
+
|
|
91
|
+
skills are automatically discovered when you're in the sdk directory:
|
|
92
|
+
|
|
93
|
+
```
|
|
94
|
+
user: create an image of a sunset
|
|
95
|
+
claude: [uses image-generation skill]
|
|
96
|
+
bun run service/image fal "beautiful sunset over mountains"
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
### manually
|
|
100
|
+
|
|
101
|
+
you can also run services directly:
|
|
102
|
+
|
|
103
|
+
```bash
|
|
104
|
+
# generate image
|
|
105
|
+
bun run service/image fal "sunset over mountains" true
|
|
106
|
+
|
|
107
|
+
# generate video from that image
|
|
108
|
+
bun run service/video from_image "camera pan" https://image-url.jpg 5 true
|
|
109
|
+
|
|
110
|
+
# add voice
|
|
111
|
+
bun run service/voice elevenlabs "this is a beautiful sunset" rachel true
|
|
112
|
+
|
|
113
|
+
# sync with video
|
|
114
|
+
bun run service/sync wav2lip https://video-url.mp4 https://audio-url.mp3
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
## skill features
|
|
118
|
+
|
|
119
|
+
each skill includes:
|
|
120
|
+
|
|
121
|
+
- **name**: unique skill identifier
|
|
122
|
+
- **description**: when claude should use this skill
|
|
123
|
+
- **allowed-tools**: restricted to Read, Bash for safety
|
|
124
|
+
- **usage examples**: cli and programmatic examples
|
|
125
|
+
- **when to use**: specific use cases
|
|
126
|
+
- **tips**: best practices
|
|
127
|
+
- **environment variables**: required api keys
|
|
128
|
+
|
|
129
|
+
## benefits
|
|
130
|
+
|
|
131
|
+
- **discoverability**: claude knows all available services
|
|
132
|
+
- **context**: skills provide usage examples and best practices
|
|
133
|
+
- **safety**: `allowed-tools` limits to read-only and bash execution
|
|
134
|
+
- **documentation**: skills serve as living documentation
|
|
135
|
+
|
|
136
|
+
## skill reference
|
|
137
|
+
|
|
138
|
+
| skill | service | primary use case |
|
|
139
|
+
|-------|---------|------------------|
|
|
140
|
+
| image-generation | image | create ai images, character headshots |
|
|
141
|
+
| video-generation | video | animate images, generate video clips |
|
|
142
|
+
| voice-synthesis | voice | text-to-speech, voiceovers |
|
|
143
|
+
| music-generation | elevenlabs | generate music, create sound effects |
|
|
144
|
+
| video-lipsync | sync | sync audio with video, talking characters |
|
|
145
|
+
| video-captions | captions | add subtitles, accessibility |
|
|
146
|
+
| video-editing | edit | resize, trim, social media optimization |
|
|
147
|
+
| audio-transcription | transcribe | speech-to-text, subtitle generation |
|
|
148
|
+
| telegram-send | external | send videos to telegram as round videos |
|
|
149
|
+
| talking-character-pipeline | pipeline | end-to-end talking character videos |
|
|
150
|
+
| round-video-character | pipeline | telegram round selfie videos with wan 2.5 |
|
|
151
|
+
|
|
152
|
+
## see also
|
|
153
|
+
|
|
154
|
+
- [README.md](README.md) - sdk overview and installation
|
|
155
|
+
- [STRUCTURE.md](STRUCTURE.md) - detailed module organization
|
|
156
|
+
- [pipeline/cookbooks/talking-character.md](pipeline/cookbooks/talking-character.md) - talking character workflow
|
|
157
|
+
- [pipeline/cookbooks/round-video-character.md](pipeline/cookbooks/round-video-character.md) - telegram round selfie video cookbook
|
package/STRUCTURE.md
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
# sdk structure
|
|
2
|
+
|
|
3
|
+
## lib/ - two fal implementations
|
|
4
|
+
|
|
5
|
+
### lib/ai-sdk/fal.ts
|
|
6
|
+
uses `@ai-sdk/fal` with vercel ai sdk
|
|
7
|
+
|
|
8
|
+
**when to use:**
|
|
9
|
+
- standard image generation
|
|
10
|
+
- need consistent api across providers
|
|
11
|
+
- want automatic image format handling
|
|
12
|
+
- prefer typed aspect ratios
|
|
13
|
+
|
|
14
|
+
**commands:**
|
|
15
|
+
```bash
|
|
16
|
+
bun run lib/ai-sdk/fal.ts generate_image <prompt> [model] [aspectRatio]
|
|
17
|
+
```
|
|
18
|
+
|
|
19
|
+
### lib/fal.ts
|
|
20
|
+
uses `@fal-ai/client` directly
|
|
21
|
+
|
|
22
|
+
**when to use:**
|
|
23
|
+
- video generation (image-to-video, text-to-video)
|
|
24
|
+
- advanced fal features
|
|
25
|
+
- need queue/streaming updates
|
|
26
|
+
- custom api parameters
|
|
27
|
+
|
|
28
|
+
**commands:**
|
|
29
|
+
```bash
|
|
30
|
+
bun run lib/fal.ts generate_image <prompt> [model] [imageSize]
|
|
31
|
+
bun run lib/fal.ts image_to_video <prompt> <imageUrl> [duration]
|
|
32
|
+
bun run lib/fal.ts text_to_video <prompt> [duration]
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
### lib/higgsfield.ts
|
|
36
|
+
uses `@higgsfield/client` for soul character generation
|
|
37
|
+
|
|
38
|
+
**commands:**
|
|
39
|
+
```bash
|
|
40
|
+
bun run lib/higgsfield.ts generate_soul <prompt> [customReferenceId]
|
|
41
|
+
bun run lib/higgsfield.ts create_character <name> <imageUrl1> [imageUrl2...]
|
|
42
|
+
bun run lib/higgsfield.ts list_styles
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
## service/ - high-level wrappers
|
|
46
|
+
|
|
47
|
+
### service/image.ts
|
|
48
|
+
combines fal + higgsfield for image generation
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
bun run service/image.ts fal <prompt> [model] [upload]
|
|
52
|
+
bun run service/image.ts soul <prompt> [customReferenceId] [upload]
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### service/video.ts
|
|
56
|
+
video generation with optional s3 upload
|
|
57
|
+
|
|
58
|
+
```bash
|
|
59
|
+
bun run service/video.ts from_image <prompt> <imageUrl> [duration] [upload]
|
|
60
|
+
bun run service/video.ts from_text <prompt> [duration] [upload]
|
|
61
|
+
```
|
|
62
|
+
|
|
63
|
+
## utilities/
|
|
64
|
+
|
|
65
|
+
### utilities/s3.ts
|
|
66
|
+
cloudflare r2 / s3 storage operations
|
|
67
|
+
|
|
68
|
+
```bash
|
|
69
|
+
bun run utilities/s3.ts upload <filePath> <objectKey>
|
|
70
|
+
bun run utilities/s3.ts upload_from_url <url> <objectKey>
|
|
71
|
+
bun run utilities/s3.ts presigned_url <objectKey> [expiresIn]
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## pipeline/cookbooks/
|
|
75
|
+
markdown guides for complex workflows
|
|
76
|
+
|
|
77
|
+
- `talking-character.md`: create talking character videos
|
|
78
|
+
|
|
79
|
+
## dependencies
|
|
80
|
+
|
|
81
|
+
- `@ai-sdk/fal` - vercel ai sdk fal provider
|
|
82
|
+
- `@fal-ai/client` - official fal client
|
|
83
|
+
- `@higgsfield/client` - higgsfield api client
|
|
84
|
+
- `@aws-sdk/client-s3` - s3 storage
|
|
85
|
+
- `ai` - vercel ai sdk core
|
|
86
|
+
|
|
87
|
+
## key decisions
|
|
88
|
+
|
|
89
|
+
1. **two fal implementations** - ai-sdk for simplicity, client for power
|
|
90
|
+
2. **all scripts are cli + library** - can be run directly or imported
|
|
91
|
+
3. **consistent logging** - `[module] message` format
|
|
92
|
+
4. **auto image opening** - ai-sdk version opens images automatically
|
package/TEST_RESULTS.md
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
# test results
|
|
2
|
+
|
|
3
|
+
## ✅ both fal approaches working
|
|
4
|
+
|
|
5
|
+
### approach 1: lib/ai-sdk/fal.ts (vercel ai sdk)
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
$ bun run lib/ai-sdk/fal.ts generate_image "futuristic spaceship interior" "fal-ai/flux/dev" "16:9"
|
|
9
|
+
|
|
10
|
+
[ai-sdk/fal] generating image with fal-ai/flux/dev
|
|
11
|
+
[ai-sdk/fal] prompt: futuristic spaceship interior
|
|
12
|
+
[ai-sdk/fal] aspect ratio: 16:9
|
|
13
|
+
[ai-sdk/fal] completed!
|
|
14
|
+
|
|
15
|
+
image saved to: /tmp/fal-ai-sdk-1763772836608.png
|
|
16
|
+
|
|
17
|
+
metadata:
|
|
18
|
+
{
|
|
19
|
+
"images": [
|
|
20
|
+
{
|
|
21
|
+
"width": 1024,
|
|
22
|
+
"height": 576,
|
|
23
|
+
"contentType": "image/jpeg",
|
|
24
|
+
"nsfw": false
|
|
25
|
+
}
|
|
26
|
+
]
|
|
27
|
+
}
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
✅ benefits:
|
|
31
|
+
- clean typed api
|
|
32
|
+
- auto image save + open
|
|
33
|
+
- aspect ratio support
|
|
34
|
+
- consistent with other ai-sdk providers
|
|
35
|
+
|
|
36
|
+
### approach 2: lib/fal.ts (fal client direct)
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
$ bun run lib/fal.ts generate_image "ancient temple ruins at sunset"
|
|
40
|
+
|
|
41
|
+
[fal] generating image with fal-ai/flux-pro/v1.1
|
|
42
|
+
[fal] prompt: ancient temple ruins at sunset
|
|
43
|
+
[fal] processing...
|
|
44
|
+
[fal] completed!
|
|
45
|
+
|
|
46
|
+
{
|
|
47
|
+
"data": {
|
|
48
|
+
"images": [
|
|
49
|
+
{
|
|
50
|
+
"url": "https://v3b.fal.media/files/b/koala/L5LYGCHZ4aZ_CKZsmPbUe.jpg",
|
|
51
|
+
"width": 1024,
|
|
52
|
+
"height": 768,
|
|
53
|
+
"content_type": "image/jpeg"
|
|
54
|
+
}
|
|
55
|
+
],
|
|
56
|
+
"seed": 2946158106
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
✅ benefits:
|
|
62
|
+
- full api access
|
|
63
|
+
- queue updates
|
|
64
|
+
- video support
|
|
65
|
+
- custom parameters
|
|
66
|
+
|
|
67
|
+
## cli tests ✅
|
|
68
|
+
|
|
69
|
+
all help menus working:
|
|
70
|
+
|
|
71
|
+
```bash
|
|
72
|
+
bun run lib/ai-sdk/fal.ts # ✓
|
|
73
|
+
bun run lib/fal.ts # ✓
|
|
74
|
+
bun run lib/higgsfield.ts # ✓
|
|
75
|
+
bun run service/image.ts # ✓
|
|
76
|
+
bun run service/video.ts # ✓
|
|
77
|
+
bun run utilities/s3.ts # ✓
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## library imports ✅
|
|
81
|
+
|
|
82
|
+
```typescript
|
|
83
|
+
import { generateImage } from "./index"
|
|
84
|
+
import * as aiSdkFal from "./index"
|
|
85
|
+
|
|
86
|
+
// both approaches available
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
## actual generation tests ✅
|
|
90
|
+
|
|
91
|
+
successfully generated and opened:
|
|
92
|
+
- cyberpunk city (16:9, ai-sdk)
|
|
93
|
+
- spaceship interior (16:9, ai-sdk)
|
|
94
|
+
- temple ruins (4:3, fal client)
|
|
95
|
+
- aurora borealis (4:3, fal client)
|
|
96
|
+
|
|
97
|
+
all images ~15-20 seconds generation time
|
|
98
|
+
|
|
99
|
+
## what works
|
|
100
|
+
|
|
101
|
+
1. **dual fal implementations** - ai-sdk for simplicity, client for power ✓
|
|
102
|
+
2. **all cli scripts executable** with proper help menus ✓
|
|
103
|
+
3. **library imports functional** ✓
|
|
104
|
+
4. **actual image generation working** ✓
|
|
105
|
+
5. **automatic image opening** (ai-sdk version) ✓
|
|
106
|
+
6. **queue progress updates** (fal client) ✓
|
|
107
|
+
|
|
108
|
+
## file structure
|
|
109
|
+
|
|
110
|
+
```
|
|
111
|
+
lib/
|
|
112
|
+
├── ai-sdk/
|
|
113
|
+
│ └── fal.ts # vercel ai sdk approach
|
|
114
|
+
├── fal.ts # fal client approach
|
|
115
|
+
└── higgsfield.ts # soul character generation
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## recommendations
|
|
119
|
+
|
|
120
|
+
- **use lib/ai-sdk/fal.ts** for standard image generation
|
|
121
|
+
- **use lib/fal.ts** for video or advanced features
|
|
122
|
+
- **use `service/*.ts`** for high-level operations with s3 upload
|
|
@@ -0,0 +1,170 @@
|
|
|
1
|
+
---
|
|
2
|
+
name: video-captions
|
|
3
|
+
description: add auto-generated or custom subtitles to videos using groq/fireworks transcription and ffmpeg overlay. use when adding captions, subtitles, or text overlays to videos for accessibility or social media.
|
|
4
|
+
allowed-tools: Read, Bash
|
|
5
|
+
---
|
|
6
|
+
|
|
7
|
+
# video captions
|
|
8
|
+
|
|
9
|
+
automatically generate and overlay subtitles on videos with customizable styling.
|
|
10
|
+
|
|
11
|
+
## features
|
|
12
|
+
|
|
13
|
+
- **auto-generation**: transcribe video audio using groq or fireworks
|
|
14
|
+
- **custom srt support**: use existing subtitle files
|
|
15
|
+
- **styling**: customize font, size, colors, position
|
|
16
|
+
- **word-level timing**: fireworks provides precise word timestamps
|
|
17
|
+
- **instant overlay**: ffmpeg-based subtitle rendering
|
|
18
|
+
|
|
19
|
+
## usage
|
|
20
|
+
|
|
21
|
+
### auto-generate captions
|
|
22
|
+
```bash
|
|
23
|
+
bun run service/captions.ts <videoPath> [outputPath] [options]
|
|
24
|
+
```
|
|
25
|
+
|
|
26
|
+
**basic example:**
|
|
27
|
+
```bash
|
|
28
|
+
bun run service/captions.ts media/video.mp4
|
|
29
|
+
# outputs: media/video-captioned.mp4
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
**with options:**
|
|
33
|
+
```bash
|
|
34
|
+
bun run service/captions.ts media/video.mp4 output.mp4 --provider fireworks --font Arial --size 28
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
### use existing srt file
|
|
38
|
+
```bash
|
|
39
|
+
bun run service/captions.ts media/video.mp4 output.mp4 --srt media/video.srt
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
## options
|
|
43
|
+
|
|
44
|
+
- `--srt <path>` - use existing srt file instead of auto-generating
|
|
45
|
+
- `--provider <name>` - groq or fireworks (default: fireworks)
|
|
46
|
+
- `--font <name>` - font name (default: Arial)
|
|
47
|
+
- `--size <number>` - font size (default: 24)
|
|
48
|
+
- `--color <hex>` - primary color in ASS format (default: &HFFFFFF white)
|
|
49
|
+
- `--outline <hex>` - outline color in ASS format (default: &H000000 black)
|
|
50
|
+
|
|
51
|
+
## as library
|
|
52
|
+
|
|
53
|
+
```typescript
|
|
54
|
+
import { addCaptions } from "./service/captions"
|
|
55
|
+
|
|
56
|
+
const result = await addCaptions({
|
|
57
|
+
videoPath: "media/video.mp4",
|
|
58
|
+
output: "captioned.mp4",
|
|
59
|
+
provider: "fireworks", // or "groq"
|
|
60
|
+
style: {
|
|
61
|
+
fontName: "Helvetica",
|
|
62
|
+
fontSize: 28,
|
|
63
|
+
primaryColor: "&HFFFFFF",
|
|
64
|
+
outlineColor: "&H000000",
|
|
65
|
+
bold: true,
|
|
66
|
+
alignment: 2, // bottom center
|
|
67
|
+
marginV: 20
|
|
68
|
+
}
|
|
69
|
+
})
|
|
70
|
+
```
|
|
71
|
+
|
|
72
|
+
## providers
|
|
73
|
+
|
|
74
|
+
### fireworks (recommended)
|
|
75
|
+
- **word-level timestamps** for precise timing
|
|
76
|
+
- generates `.srt` format with detailed timing
|
|
77
|
+
- better for social media content
|
|
78
|
+
- slightly slower transcription
|
|
79
|
+
|
|
80
|
+
### groq
|
|
81
|
+
- **ultra-fast** transcription
|
|
82
|
+
- plain text output (converted to srt)
|
|
83
|
+
- sentence-level timing
|
|
84
|
+
- great for quick previews
|
|
85
|
+
|
|
86
|
+
## styling options
|
|
87
|
+
|
|
88
|
+
```typescript
|
|
89
|
+
interface SubtitleStyle {
|
|
90
|
+
fontName?: string // default: Arial
|
|
91
|
+
fontSize?: number // default: 24
|
|
92
|
+
primaryColor?: string // default: &HFFFFFF (white)
|
|
93
|
+
outlineColor?: string // default: &H000000 (black)
|
|
94
|
+
bold?: boolean // default: true
|
|
95
|
+
alignment?: number // 1-9, default: 2 (bottom center)
|
|
96
|
+
marginV?: number // vertical margin, default: 20
|
|
97
|
+
}
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
**alignment values:**
|
|
101
|
+
```
|
|
102
|
+
1 = bottom left 2 = bottom center 3 = bottom right
|
|
103
|
+
4 = middle left 5 = middle center 6 = middle right
|
|
104
|
+
7 = top left 8 = top center 9 = top right
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
## when to use
|
|
108
|
+
|
|
109
|
+
use this skill when:
|
|
110
|
+
- preparing videos for social media (tiktok, instagram, youtube)
|
|
111
|
+
- adding accessibility features
|
|
112
|
+
- creating educational or tutorial content
|
|
113
|
+
- need word-level caption timing
|
|
114
|
+
- translating videos with custom srt files
|
|
115
|
+
|
|
116
|
+
## typical workflow
|
|
117
|
+
|
|
118
|
+
1. create or edit video
|
|
119
|
+
2. add captions with auto-transcription (this service)
|
|
120
|
+
3. customize style for platform
|
|
121
|
+
4. prepare for social media (edit service)
|
|
122
|
+
|
|
123
|
+
## examples
|
|
124
|
+
|
|
125
|
+
**tiktok/instagram style captions:**
|
|
126
|
+
```bash
|
|
127
|
+
bun run service/captions.ts video.mp4 captioned.mp4 \
|
|
128
|
+
--provider fireworks \
|
|
129
|
+
--font "Arial Black" \
|
|
130
|
+
--size 32 \
|
|
131
|
+
--color "&H00FFFF"
|
|
132
|
+
```
|
|
133
|
+
|
|
134
|
+
**professional style:**
|
|
135
|
+
```bash
|
|
136
|
+
bun run service/captions.ts video.mp4 output.mp4 \
|
|
137
|
+
--provider fireworks \
|
|
138
|
+
--font "Helvetica" \
|
|
139
|
+
--size 24
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
**with existing subtitles:**
|
|
143
|
+
```bash
|
|
144
|
+
bun run service/captions.ts video.mp4 final.mp4 \
|
|
145
|
+
--srt custom-subtitles.srt \
|
|
146
|
+
--font "Arial" \
|
|
147
|
+
--size 26
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
## output
|
|
151
|
+
|
|
152
|
+
- generates `.srt` file if auto-transcribing
|
|
153
|
+
- creates new video file with burned-in subtitles
|
|
154
|
+
- preserves original video quality
|
|
155
|
+
- audio is copied without re-encoding
|
|
156
|
+
|
|
157
|
+
## environment variables
|
|
158
|
+
|
|
159
|
+
required (for auto-transcription):
|
|
160
|
+
- `GROQ_API_KEY` - for groq provider
|
|
161
|
+
- `FIREWORKS_API_KEY` - for fireworks provider
|
|
162
|
+
|
|
163
|
+
**system requirements:**
|
|
164
|
+
- ffmpeg must be installed
|
|
165
|
+
|
|
166
|
+
## processing time
|
|
167
|
+
|
|
168
|
+
- transcription: 5-30 seconds (depending on video length)
|
|
169
|
+
- overlay: 5-15 seconds (depending on video length)
|
|
170
|
+
- total: typically under 1 minute
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
#!/usr/bin/env bun
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* video captioning service
|
|
5
|
+
* generates and overlays subtitles on videos using ffmpeg
|
|
6
|
+
* supports auto-generation via groq/fireworks or custom srt files
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { existsSync } from "node:fs";
|
|
10
|
+
import ffmpeg from "fluent-ffmpeg";
|
|
11
|
+
import type { ActionMeta } from "../../cli/types";
|
|
12
|
+
import { transcribe } from "../transcribe";
|
|
13
|
+
|
|
14
|
+
// action metadata consumed by the cli runner (shape declared in cli/types):
// names the action, describes its input/output contract in a json-schema
// style, and exposes the `run` entrypoint the runner invokes.
export const meta: ActionMeta = {
  name: "captions",
  type: "action",
  description: "add subtitles to video",
  inputType: "video",
  outputType: "video",
  schema: {
    input: {
      type: "object",
      // only video + output are mandatory; srt/provider tune generation
      required: ["video", "output"],
      properties: {
        video: {
          type: "string",
          format: "file-path",
          description: "input video file",
        },
        output: {
          type: "string",
          format: "file-path",
          description: "output video path",
        },
        srt: {
          type: "string",
          format: "file-path",
          description: "existing srt file (auto-generates if not provided)",
        },
        provider: {
          type: "string",
          enum: ["groq", "fireworks"],
          default: "fireworks",
          description: "transcription provider for auto-generation",
        },
      },
    },
    output: { type: "string", format: "file-path", description: "video path" },
  },
  // thin adapter: narrow the untyped cli options and delegate to addCaptions
  async run(options) {
    const { video, output, srt, provider } = options as {
      video: string;
      output: string;
      srt?: string;
      provider?: "groq" | "fireworks";
    };
    return addCaptions({ videoPath: video, output, srtPath: srt, provider });
  },
};
|
|
60
|
+
|
|
61
|
+
// types
|
|
62
|
+
// types

/** options accepted by {@link addCaptions}. */
export interface AddCaptionsOptions {
  /** input video file (local path; must exist) */
  videoPath: string;
  srtPath?: string; // optional existing srt file
  /** path the captioned video is written to */
  output: string;
  provider?: "groq" | "fireworks"; // only used if srtPath not provided
  /** styling overrides, merged over the module defaults */
  style?: SubtitleStyle;
}
|
|
69
|
+
|
|
70
|
+
/**
 * subtitle styling passed to ffmpeg's force_style (ASS style fields).
 * colors use ASS &HBBGGRR hex notation.
 */
export interface SubtitleStyle {
  fontName?: string; // default: Arial
  fontSize?: number; // default: 24
  primaryColor?: string; // default: &HFFFFFF (white)
  outlineColor?: string; // default: &H000000 (black)
  bold?: boolean; // default: true
  alignment?: number; // 1-9, default: 2 (bottom center)
  marginV?: number; // vertical margin, default: 20
}
|
|
79
|
+
|
|
80
|
+
// default subtitle style
// every field is populated (Required<...>) so a user-supplied partial style
// can simply be spread over these defaults.
const DEFAULT_STYLE: Required<SubtitleStyle> = {
  fontName: "Arial",
  fontSize: 24,
  primaryColor: "&HFFFFFF", // white
  outlineColor: "&H000000", // black
  bold: true,
  alignment: 2, // bottom center
  marginV: 20,
};
|
|
90
|
+
|
|
91
|
+
// main function to add captions to video
|
|
92
|
+
export async function addCaptions(
|
|
93
|
+
options: AddCaptionsOptions,
|
|
94
|
+
): Promise<string> {
|
|
95
|
+
const { videoPath, srtPath, output, provider = "fireworks", style } = options;
|
|
96
|
+
|
|
97
|
+
if (!videoPath) {
|
|
98
|
+
throw new Error("videoPath is required");
|
|
99
|
+
}
|
|
100
|
+
if (!output) {
|
|
101
|
+
throw new Error("output is required");
|
|
102
|
+
}
|
|
103
|
+
if (!existsSync(videoPath)) {
|
|
104
|
+
throw new Error(`video file not found: ${videoPath}`);
|
|
105
|
+
}
|
|
106
|
+
|
|
107
|
+
console.log("[captions] adding captions to video...");
|
|
108
|
+
|
|
109
|
+
// determine srt file path
|
|
110
|
+
let finalSrtPath = srtPath;
|
|
111
|
+
|
|
112
|
+
// if no srt file provided, auto-generate it
|
|
113
|
+
if (!finalSrtPath) {
|
|
114
|
+
console.log(
|
|
115
|
+
`[captions] no srt file provided, auto-generating with ${provider}...`,
|
|
116
|
+
);
|
|
117
|
+
|
|
118
|
+
// generate srt file from video audio
|
|
119
|
+
const tempSrtPath = videoPath.replace(/\.[^.]+$/, ".srt");
|
|
120
|
+
|
|
121
|
+
const result = await transcribe({
|
|
122
|
+
audioUrl: videoPath,
|
|
123
|
+
provider,
|
|
124
|
+
outputFormat: "srt",
|
|
125
|
+
outputPath: tempSrtPath,
|
|
126
|
+
});
|
|
127
|
+
|
|
128
|
+
if (!result.success) {
|
|
129
|
+
throw new Error(`failed to generate subtitles: ${result.error}`);
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
finalSrtPath = tempSrtPath;
|
|
133
|
+
console.log(`[captions] generated subtitles at ${finalSrtPath}`);
|
|
134
|
+
}
|
|
135
|
+
|
|
136
|
+
if (!existsSync(finalSrtPath)) {
|
|
137
|
+
throw new Error(`srt file not found: ${finalSrtPath}`);
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
// merge style with defaults
|
|
141
|
+
const finalStyle = { ...DEFAULT_STYLE, ...style };
|
|
142
|
+
|
|
143
|
+
// build subtitle filter with style
|
|
144
|
+
const subtitlesFilter = `subtitles=${finalSrtPath}:force_style='FontName=${finalStyle.fontName},FontSize=${finalStyle.fontSize},PrimaryColour=${finalStyle.primaryColor},OutlineColour=${finalStyle.outlineColor},Bold=${finalStyle.bold ? -1 : 0},Alignment=${finalStyle.alignment},MarginV=${finalStyle.marginV}'`;
|
|
145
|
+
|
|
146
|
+
console.log("[captions] overlaying subtitles on video...");
|
|
147
|
+
|
|
148
|
+
return new Promise((resolve, reject) => {
|
|
149
|
+
ffmpeg(videoPath)
|
|
150
|
+
.videoFilters(subtitlesFilter)
|
|
151
|
+
.outputOptions(["-c:a", "copy"]) // copy audio without re-encoding
|
|
152
|
+
.output(output)
|
|
153
|
+
.on("end", () => {
|
|
154
|
+
console.log(`[captions] saved to ${output}`);
|
|
155
|
+
resolve(output);
|
|
156
|
+
})
|
|
157
|
+
.on("error", (err) => {
|
|
158
|
+
console.error("[captions] error:", err);
|
|
159
|
+
reject(err);
|
|
160
|
+
})
|
|
161
|
+
.run();
|
|
162
|
+
});
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
// cli
// when this module is executed directly (bun script), hand argument parsing
// and invocation to the shared cli runner. the runner is imported lazily so
// library consumers of addCaptions don't pull in cli machinery.
if (import.meta.main) {
  const { runCli } = await import("../../cli/runner");
  runCli(meta);
}
|