varg.ai-sdk 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/.claude/settings.local.json +7 -0
  2. package/.env.example +24 -0
  3. package/CLAUDE.md +118 -0
  4. package/README.md +231 -0
  5. package/SKILLS.md +157 -0
  6. package/STRUCTURE.md +92 -0
  7. package/TEST_RESULTS.md +122 -0
  8. package/action/captions/SKILL.md +170 -0
  9. package/action/captions/index.ts +227 -0
  10. package/action/edit/SKILL.md +235 -0
  11. package/action/edit/index.ts +493 -0
  12. package/action/image/SKILL.md +140 -0
  13. package/action/image/index.ts +112 -0
  14. package/action/sync/SKILL.md +136 -0
  15. package/action/sync/index.ts +187 -0
  16. package/action/transcribe/SKILL.md +179 -0
  17. package/action/transcribe/index.ts +227 -0
  18. package/action/video/SKILL.md +116 -0
  19. package/action/video/index.ts +135 -0
  20. package/action/voice/SKILL.md +125 -0
  21. package/action/voice/index.ts +201 -0
  22. package/biome.json +33 -0
  23. package/index.ts +38 -0
  24. package/lib/README.md +144 -0
  25. package/lib/ai-sdk/fal.ts +106 -0
  26. package/lib/ai-sdk/replicate.ts +107 -0
  27. package/lib/elevenlabs.ts +382 -0
  28. package/lib/fal.ts +478 -0
  29. package/lib/ffmpeg.ts +467 -0
  30. package/lib/fireworks.ts +235 -0
  31. package/lib/groq.ts +246 -0
  32. package/lib/higgsfield.ts +176 -0
  33. package/lib/remotion/SKILL.md +823 -0
  34. package/lib/remotion/cli.ts +115 -0
  35. package/lib/remotion/functions.ts +283 -0
  36. package/lib/remotion/index.ts +19 -0
  37. package/lib/remotion/templates.ts +73 -0
  38. package/lib/replicate.ts +304 -0
  39. package/output.txt +1 -0
  40. package/package.json +35 -0
  41. package/pipeline/cookbooks/SKILL.md +285 -0
  42. package/pipeline/cookbooks/remotion-video.md +585 -0
  43. package/pipeline/cookbooks/round-video-character.md +337 -0
  44. package/pipeline/cookbooks/talking-character.md +59 -0
  45. package/test-import.ts +7 -0
  46. package/test-services.ts +97 -0
  47. package/tsconfig.json +29 -0
  48. package/utilities/s3.ts +147 -0
package/action/voice/SKILL.md ADDED
@@ -0,0 +1,125 @@
+ ---
+ name: voice-synthesis
+ description: generate realistic text-to-speech audio using elevenlabs with multiple voice options. use when user needs voiceovers, narration, character voices, or audio for lipsync videos.
+ allowed-tools: Read, Bash
+ ---
+
+ # voice synthesis
+
+ generate high-quality text-to-speech audio with elevenlabs.
+
+ ## available voices
+
+ - **rachel** - clear, professional female voice
+ - **domi** - warm, friendly female voice
+ - **bella** - energetic female voice
+ - **antoni** - friendly male voice
+ - **elli** - young, clear female voice
+ - **josh** - deep, clear male voice
+ - **arnold** - strong, authoritative male voice
+ - **adam** - natural, conversational male voice
+ - **sam** - raspy, character male voice
+
+ ## usage
+
+ ### generate voice
+ ```bash
+ bun run service/voice.ts generate <text> [voice] [provider] [upload]
+ ```
+
+ **parameters:**
+ - `text` (required): text to convert to speech
+ - `voice` (optional): voice name (default: rachel)
+ - `provider` (optional): elevenlabs (default)
+ - `upload` (optional): "true" to upload to s3
+
+ **example:**
+ ```bash
+ bun run service/voice.ts generate "hello world, this is my voice" rachel elevenlabs true
+ ```
+
+ ### shorthand for elevenlabs
+ ```bash
+ bun run service/voice.ts elevenlabs <text> [voice] [upload]
+ ```
+
+ **example:**
+ ```bash
+ bun run service/voice.ts elevenlabs "welcome to our video" josh true
+ ```
+
+ ## as library
+
+ ```typescript
+ import { generateVoice } from "./service/voice"
+
+ const result = await generateVoice({
+   text: "hello world",
+   voice: "rachel",
+   provider: "elevenlabs",
+   upload: true,
+   outputPath: "media/voiceover.mp3"
+ })
+
+ console.log(result.provider)
+ console.log(result.voiceId)
+ console.log(result.uploadUrl)
+ ```
+
+ ## output
+
+ returns `VoiceResult`:
+ ```typescript
+ {
+   audio: Buffer,      // raw audio buffer
+   provider: string,   // "elevenlabs"
+   voiceId: string,    // actual voice id used
+   uploadUrl?: string  // s3 url if upload requested
+ }
+ ```
+
+ saves audio file to `media/voice-{timestamp}.mp3`
+
+ ## when to use
+
+ use this skill when:
+ - creating voiceovers for videos
+ - generating narration or character dialogue
+ - preparing audio for lipsync videos
+ - need text-to-speech for talking character pipeline
+ - testing different voice options
+
+ ## tips
+
+ **voice selection:**
+ - use **rachel** or **josh** for professional narration
+ - use **bella** or **antoni** for friendly, casual content
+ - use **arnold** for authoritative or dramatic content
+ - use **sam** for character or stylized voices
+
+ **text formatting:**
+ - add punctuation for natural pauses
+ - use shorter sentences for clearer speech
+ - spell out numbers and abbreviations
+
+ ## integration with other services
+
+ perfect companion for:
+ - **lipsync service** - sync generated voice with video
+ - **video generation** - create talking character videos
+ - **captions service** - auto-generate subtitles from voiceover
+
+ ## environment variables
+
+ required:
+ - `ELEVENLABS_API_KEY` - for voice generation
+
+ optional (for s3 upload):
+ - `CLOUDFLARE_R2_API_URL`
+ - `CLOUDFLARE_ACCESS_KEY_ID`
+ - `CLOUDFLARE_ACCESS_SECRET`
+ - `CLOUDFLARE_R2_BUCKET`
+
+ ## generation time
+
+ expect 5-15 seconds depending on text length
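
The skill expects the ElevenLabs key (and, for uploads, the Cloudflare R2 credentials) in the environment. A minimal shell sketch of one end-to-end run, assuming valid credentials and the `service/voice.ts` path used throughout the skill doc; the placeholder values are not real keys or endpoints:

```bash
# hypothetical values - export the variables listed above, then invoke the cli
export ELEVENLABS_API_KEY="sk_..."
export CLOUDFLARE_R2_API_URL="https://<account-id>.r2.cloudflarestorage.com"
export CLOUDFLARE_ACCESS_KEY_ID="..."
export CLOUDFLARE_ACCESS_SECRET="..."
export CLOUDFLARE_R2_BUCKET="my-bucket"

# generate narration with the josh voice and upload the resulting mp3
bun run service/voice.ts elevenlabs "welcome to our video" josh true
```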
package/action/voice/index.ts ADDED
@@ -0,0 +1,201 @@
+ #!/usr/bin/env bun
+
+ /**
+  * voice service - high-level voice generation combining multiple providers
+  * supports elevenlabs and future providers
+  */
+
+ import { textToSpeech, VOICES } from "../../lib/elevenlabs";
+ import { uploadFile } from "../../utilities/s3";
+
+ // types
+ export interface GenerateVoiceOptions {
+   text: string;
+   voice?: string;
+   provider?: "elevenlabs";
+   upload?: boolean;
+   outputPath?: string;
+ }
+
+ export interface VoiceResult {
+   audio: Buffer;
+   provider: string;
+   voiceId: string;
+   uploadUrl?: string;
+ }
+
+ // core functions
+ export async function generateVoice(
+   options: GenerateVoiceOptions,
+ ): Promise<VoiceResult> {
+   const {
+     text,
+     voice = "rachel",
+     provider = "elevenlabs",
+     upload = false,
+     outputPath,
+   } = options;
+
+   if (!text) {
+     throw new Error("text is required");
+   }
+
+   console.log(`[voice] generating with ${provider} (${voice})...`);
+
+   let audio: Buffer;
+   let voiceId: string;
+
+   switch (provider) {
+     case "elevenlabs": {
+       // map friendly names to voice ids
+       const voiceMap: Record<string, string> = {
+         rachel: VOICES.RACHEL,
+         domi: VOICES.DOMI,
+         bella: VOICES.BELLA,
+         antoni: VOICES.ANTONI,
+         elli: VOICES.ELLI,
+         josh: VOICES.JOSH,
+         arnold: VOICES.ARNOLD,
+         adam: VOICES.ADAM,
+         sam: VOICES.SAM,
+       };
+
+       voiceId = voiceMap[voice.toLowerCase()] || voice;
+
+       audio = await textToSpeech({
+         text,
+         voiceId,
+         outputPath,
+       });
+       break;
+     }
+
+     default:
+       throw new Error(`unsupported provider: ${provider}`);
+   }
+
+   const result: VoiceResult = {
+     audio,
+     provider,
+     voiceId,
+   };
+
+   // upload to s3 if requested
+   if (upload && outputPath) {
+     const objectKey = `voice/${Date.now()}-${voice}.mp3`;
+     const uploadUrl = await uploadFile(outputPath, objectKey);
+     result.uploadUrl = uploadUrl;
+     console.log(`[voice] uploaded to ${uploadUrl}`);
+   }
+
+   return result;
+ }
+
+ // cli
+ async function cli() {
+   const args = process.argv.slice(2);
+   const command = args[0];
+
+   if (!command || command === "help") {
+     console.log(`
+ usage:
+   bun run service/voice.ts <command> [args]
+
+ commands:
+   generate <text> [voice] [provider] [upload]   generate voice from text
+   elevenlabs <text> [voice] [upload]            generate with elevenlabs
+   help                                          show this help
+
+ examples:
+   bun run service/voice.ts generate "hello world" rachel elevenlabs false
+   bun run service/voice.ts elevenlabs "hello world" josh true
+   bun run service/voice.ts generate "welcome to ai" bella
+
+ available voices:
+   rachel, domi, bella, antoni, elli, josh, arnold, adam, sam
+
+ providers:
+   elevenlabs (default)
+
+ environment:
+   ELEVENLABS_API_KEY - required for elevenlabs
+   CLOUDFLARE_* - required for upload
+ `);
+     process.exit(0);
+   }
+
+   try {
+     switch (command) {
+       case "generate": {
+         const text = args[1];
+         const voice = args[2];
+         const provider = (args[3] || "elevenlabs") as "elevenlabs";
+         const upload = args[4] === "true";
+
+         if (!text) {
+           throw new Error("text is required");
+         }
+
+         const outputPath = `media/voice-${Date.now()}.mp3`;
+
+         const result = await generateVoice({
+           text,
+           voice,
+           provider,
+           upload,
+           outputPath,
+         });
+
+         console.log(`[voice] result:`, {
+           provider: result.provider,
+           voiceId: result.voiceId,
+           audioSize: result.audio.length,
+           outputPath,
+           uploadUrl: result.uploadUrl,
+         });
+         break;
+       }
+
+       case "elevenlabs": {
+         const text = args[1];
+         const voice = args[2];
+         const upload = args[3] === "true";
+
+         if (!text) {
+           throw new Error("text is required");
+         }
+
+         const outputPath = `media/voice-${Date.now()}.mp3`;
+
+         const result = await generateVoice({
+           text,
+           voice,
+           provider: "elevenlabs",
+           upload,
+           outputPath,
+         });
+
+         console.log(`[voice] result:`, {
+           provider: result.provider,
+           voiceId: result.voiceId,
+           audioSize: result.audio.length,
+           outputPath,
+           uploadUrl: result.uploadUrl,
+         });
+         break;
+       }
+
+       default:
+         console.error(`unknown command: ${command}`);
+         console.log(`run 'bun run service/voice.ts help' for usage`);
+         process.exit(1);
+     }
+   } catch (error) {
+     console.error(`[voice] error:`, error);
+     process.exit(1);
+   }
+ }
+
+ if (import.meta.main) {
+   cli();
+ }
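
One detail worth noting in the code above: the upload branch runs only when both `upload` and `outputPath` are set (`if (upload && outputPath)`). A minimal library sketch, assuming the import path documented in SKILL.md (the diff itself places the file under `action/voice/`):

```typescript
// sketch only: upload requires BOTH flags, otherwise result.uploadUrl stays undefined
import { generateVoice } from "./service/voice"; // path as documented; assumed resolvable

const result = await generateVoice({
  text: "short narration line",
  voice: "adam",
  upload: true,                       // requests the upload...
  outputPath: "media/narration.mp3",  // ...and is required for the branch to run
});

console.log(result.uploadUrl); // undefined when outputPath is omitted
```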
package/biome.json ADDED
@@ -0,0 +1,33 @@
+ {
+   "$schema": "https://biomejs.dev/schemas/2.3.7/schema.json",
+   "vcs": {
+     "enabled": true,
+     "clientKind": "git",
+     "useIgnoreFile": true
+   },
+   "files": {
+     "ignoreUnknown": false
+   },
+   "formatter": {
+     "enabled": true,
+     "indentStyle": "space"
+   },
+   "linter": {
+     "enabled": true,
+     "rules": {
+       "recommended": true
+     }
+   },
+   "assist": {
+     "actions": {
+       "source": {
+         "organizeImports": "on"
+       }
+     }
+   },
+   "javascript": {
+     "formatter": {
+       "quoteStyle": "double"
+     }
+   }
+ }
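
With this config in place, the usual workflow is to run Biome over the repo; a short sketch, assuming `@biomejs/biome` is available (for example as a dev dependency invoked through `bunx`):

```bash
# check formatting, lints, and import organization against the config above
bunx biome check .

# apply safe fixes in place
bunx biome check --write .
```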
package/index.ts ADDED
@@ -0,0 +1,38 @@
+ /**
+  * varg.ai sdk
+  * video generation and editing tools
+  */
+
+ // re-export external clients
+ export { fal } from "@ai-sdk/fal";
+ export { replicate } from "@ai-sdk/replicate";
+ export { fal as falClient } from "@fal-ai/client";
+ export { HiggsfieldClient } from "@higgsfield/client";
+ // lib exports - ai-sdk/fal (provider)
+ export * as aiSdkFal from "./lib/ai-sdk/fal";
+ // lib exports - ai-sdk/replicate (provider)
+ export * as aiSdkReplicate from "./lib/ai-sdk/replicate";
+ // lib exports - elevenlabs
+ export * from "./lib/elevenlabs";
+ // lib exports - fal (client)
+ export * from "./lib/fal";
+ // lib exports - ffmpeg
+ export * from "./lib/ffmpeg";
+ // lib exports - fireworks
+ export * from "./lib/fireworks";
+ // lib exports - groq
+ export * from "./lib/groq";
+ // lib exports - higgsfield
+ export * from "./lib/higgsfield";
+ // lib exports - replicate
+ export * from "./lib/replicate";
+ // service exports
+ export * from "./service/captions";
+ export * from "./service/edit";
+ export * from "./service/image";
+ export * from "./service/sync";
+ export * from "./service/transcribe";
+ export * from "./service/video";
+ export * from "./service/voice";
+ // utilities exports
+ export * from "./utilities/s3";
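
Because the entry point re-exports the lib, service, and utility modules, consumers can import everything from the package root. A minimal sketch, assuming the package is installed under the name shown in the header (`varg.ai-sdk`) and that the `service/*` paths it references resolve at publish time:

```typescript
// hypothetical consumer usage; the install name is an assumption taken from the page header
import { generateVoice, textToSpeech } from "varg.ai-sdk";

const result = await generateVoice({
  text: "hello from the sdk",
  voice: "rachel",
  provider: "elevenlabs",
});

console.log(result.voiceId, result.audio.length);
```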
package/lib/README.md ADDED
@@ -0,0 +1,144 @@
+ # lib/ modules
+
+ ## two fal implementations
+
+ ### lib/ai-sdk/fal.ts - ai-sdk provider (recommended for images)
+
+ uses `@ai-sdk/fal` with the vercel ai sdk's `experimental_generateImage`
+
+ **benefits:**
+ - clean, typed api via vercel ai sdk
+ - automatic image format handling (uint8array)
+ - consistent interface with other ai providers
+ - built-in aspect ratio support
+ - better for standard image generation
+
+ **example:**
+ ```bash
+ bun run lib/ai-sdk/fal.ts generate_image "cyberpunk city" "fal-ai/flux/dev" "16:9"
+ ```
+
+ **code:**
+ ```typescript
+ import { fal } from "@ai-sdk/fal"
+ import { experimental_generateImage as generateImage } from "ai"
+
+ const { image, providerMetadata } = await generateImage({
+   model: fal.image("fal-ai/flux/dev"),
+   prompt: "beautiful sunset",
+   aspectRatio: "16:9",
+ })
+ ```
+
+ ### lib/fal.ts - fal client direct (for video & advanced features)
+
+ uses `@fal-ai/client` directly with the raw fal api
+
+ **benefits:**
+ - access to all fal features (video, advanced params)
+ - streaming/queue updates
+ - full control over api parameters
+ - required for video generation (no ai-sdk support yet)
+ - **supports local images** - automatically uploads local files to fal storage
+
+ **examples:**
+ ```bash
+ # image generation
+ bun run lib/fal.ts generate_image "aurora borealis" "fal-ai/flux-pro/v1.1"
+
+ # video from url
+ bun run lib/fal.ts image_to_video "person talking" "https://image.url" 5
+
+ # video from local file (auto-uploads)
+ bun run lib/fal.ts image_to_video "ocean waves" "./media/beach.jpg" 10
+ ```
+
+ **code:**
+ ```typescript
+ import { imageToVideo } from "./lib/fal"
+
+ // works with both urls and local files
+ const result = await imageToVideo({
+   prompt: "person talking",
+   imageUrl: "./local/image.jpg", // or "https://..."
+   duration: 5,
+ })
+
+ // local files are automatically uploaded to fal storage
+ ```
+
+ ## when to use which?
+
+ | use case | approach |
+ |----------|----------|
+ | standard image generation | ai-sdk provider ✓ |
+ | video generation | fal client direct ✓ |
+ | advanced fal features | fal client direct ✓ |
+ | multi-provider app | ai-sdk provider ✓ |
+ | custom queue handling | fal client direct ✓ |
+
+ ## higgsfield.ts
+
+ uses `@higgsfield/client` for soul character generation
+
+ **features:**
+ - generate soul images with custom styles
+ - create and manage character references
+ - list available soul styles
+ - poll for job completion
+
+ **example:**
+ ```bash
+ HF_API_KEY=xxx HF_API_SECRET=xxx bun run lib/higgsfield.ts generate_soul "professional headshot"
+ ```
+
+ ## elevenlabs.ts
+
+ uses `@elevenlabs/elevenlabs-js` for voice, music, and sound effects generation
+
+ **features:**
+ - text-to-speech with multiple voices
+ - music generation from text prompts
+ - sound effects generation
+ - voice management
+
+ **examples:**
+ ```bash
+ # text-to-speech
+ bun run lib/elevenlabs.ts tts "hello world" rachel output.mp3
+
+ # music generation
+ bun run lib/elevenlabs.ts music "upbeat electronic dance music" 30000 music.mp3
+
+ # sound effects
+ bun run lib/elevenlabs.ts sfx "ocean waves crashing" 5 waves.mp3
+
+ # list voices
+ bun run lib/elevenlabs.ts voices
+ ```
+
+ **code:**
+ ```typescript
+ import { textToSpeech, generateMusic, generateSoundEffect } from "./lib/elevenlabs"
+
+ // voice
+ const audio = await textToSpeech({
+   text: "hello world",
+   voiceId: "rachel",
+   outputPath: "output.mp3"
+ })
+
+ // music
+ const music = await generateMusic({
+   prompt: "epic orchestral music",
+   musicLengthMs: 60000,
+   outputPath: "music.mp3"
+ })
+
+ // sound effects
+ const sfx = await generateSoundEffect({
+   text: "thunder and rain",
+   durationSeconds: 10,
+   outputPath: "sfx.mp3"
+ })
+ ```
package/lib/ai-sdk/fal.ts ADDED
@@ -0,0 +1,106 @@
+ #!/usr/bin/env bun
+ /**
+  * fal.ai wrapper using @ai-sdk/fal provider
+  * recommended for standard image generation with vercel ai sdk
+  *
+  * usage: bun run lib/ai-sdk/fal.ts <command> <args>
+  */
+
+ import { fal } from "@ai-sdk/fal";
+ import { experimental_generateImage as generateImageAI } from "ai";
+
+ export async function generateImage(args: {
+   prompt: string;
+   model?: string;
+   aspectRatio?: "1:1" | "16:9" | "9:16" | "4:3" | "3:4";
+ }) {
+   const modelId = args.model || "fal-ai/flux/dev";
+
+   console.log(`[ai-sdk/fal] generating image with ${modelId}`);
+   console.log(`[ai-sdk/fal] prompt: ${args.prompt}`);
+   if (args.aspectRatio) {
+     console.log(`[ai-sdk/fal] aspect ratio: ${args.aspectRatio}`);
+   }
+
+   try {
+     const { image, providerMetadata } = await generateImageAI({
+       model: fal.image(modelId),
+       prompt: args.prompt,
+       aspectRatio: args.aspectRatio,
+     });
+
+     console.log("[ai-sdk/fal] completed!");
+
+     // return in consistent format
+     return {
+       image: {
+         url: image.base64 ? `data:image/png;base64,${image.base64}` : undefined,
+         uint8Array: image.uint8Array,
+       },
+       metadata: providerMetadata?.fal,
+     };
+   } catch (error) {
+     console.error("[ai-sdk/fal] error:", error);
+     throw error;
+   }
+ }
+
+ // cli runner
+ if (import.meta.main) {
+   const [command, ...args] = process.argv.slice(2);
+
+   switch (command) {
+     case "generate_image": {
+       if (!args[0]) {
+         console.log(`
+ usage:
+   bun run lib/ai-sdk/fal.ts generate_image <prompt> [model] [aspectRatio]
+
+ examples:
+   bun run lib/ai-sdk/fal.ts generate_image "sunset over ocean" "fal-ai/flux/dev" "16:9"
+   bun run lib/ai-sdk/fal.ts generate_image "portrait photo" "fal-ai/flux-pro/v1.1" "9:16"
+
+ available models:
+   - fal-ai/flux/dev (default, fast)
+   - fal-ai/flux-pro/v1.1 (high quality)
+   - fal-ai/flux/schnell (very fast)
+   - fal-ai/ideogram/character (character consistency)
+ `);
+         process.exit(1);
+       }
+
+       const result = await generateImage({
+         prompt: args[0],
+         model: args[1],
+         aspectRatio: args[2] as
+           | "1:1"
+           | "16:9"
+           | "9:16"
+           | "4:3"
+           | "3:4"
+           | undefined,
+       });
+
+       // save image to file
+       if (result.image.uint8Array) {
+         const filename = `/tmp/fal-ai-sdk-${Date.now()}.png`;
+         await Bun.write(filename, result.image.uint8Array);
+         console.log(`\nimage saved to: ${filename}`);
+
+         // open image
+         await Bun.spawn(["open", filename]);
+       }
+
+       console.log("\nmetadata:");
+       console.log(JSON.stringify(result.metadata, null, 2));
+       break;
+     }
+
+     default:
+       console.log(`
+ usage:
+   bun run lib/ai-sdk/fal.ts generate_image <prompt> [model] [aspectRatio]
+ `);
+       process.exit(1);
+   }
+ }
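
The exported `generateImage` can also be called as a library rather than through the CLI; a minimal sketch, assuming the fal credentials expected by `@ai-sdk/fal` are set in the environment:

```typescript
// library usage of the wrapper above; the output filename is an arbitrary choice
import { generateImage } from "./lib/ai-sdk/fal";

const { image, metadata } = await generateImage({
  prompt: "sunset over ocean",
  model: "fal-ai/flux/dev",
  aspectRatio: "16:9",
});

if (image.uint8Array) {
  await Bun.write("sunset.png", image.uint8Array); // write the raw bytes returned by the provider
}

console.log(metadata);
```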