varg.ai-sdk 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48) hide show
  1. package/.claude/settings.local.json +7 -0
  2. package/.env.example +24 -0
  3. package/CLAUDE.md +118 -0
  4. package/README.md +231 -0
  5. package/SKILLS.md +157 -0
  6. package/STRUCTURE.md +92 -0
  7. package/TEST_RESULTS.md +122 -0
  8. package/action/captions/SKILL.md +170 -0
  9. package/action/captions/index.ts +227 -0
  10. package/action/edit/SKILL.md +235 -0
  11. package/action/edit/index.ts +493 -0
  12. package/action/image/SKILL.md +140 -0
  13. package/action/image/index.ts +112 -0
  14. package/action/sync/SKILL.md +136 -0
  15. package/action/sync/index.ts +187 -0
  16. package/action/transcribe/SKILL.md +179 -0
  17. package/action/transcribe/index.ts +227 -0
  18. package/action/video/SKILL.md +116 -0
  19. package/action/video/index.ts +135 -0
  20. package/action/voice/SKILL.md +125 -0
  21. package/action/voice/index.ts +201 -0
  22. package/biome.json +33 -0
  23. package/index.ts +38 -0
  24. package/lib/README.md +144 -0
  25. package/lib/ai-sdk/fal.ts +106 -0
  26. package/lib/ai-sdk/replicate.ts +107 -0
  27. package/lib/elevenlabs.ts +382 -0
  28. package/lib/fal.ts +478 -0
  29. package/lib/ffmpeg.ts +467 -0
  30. package/lib/fireworks.ts +235 -0
  31. package/lib/groq.ts +246 -0
  32. package/lib/higgsfield.ts +176 -0
  33. package/lib/remotion/SKILL.md +823 -0
  34. package/lib/remotion/cli.ts +115 -0
  35. package/lib/remotion/functions.ts +283 -0
  36. package/lib/remotion/index.ts +19 -0
  37. package/lib/remotion/templates.ts +73 -0
  38. package/lib/replicate.ts +304 -0
  39. package/output.txt +1 -0
  40. package/package.json +35 -0
  41. package/pipeline/cookbooks/SKILL.md +285 -0
  42. package/pipeline/cookbooks/remotion-video.md +585 -0
  43. package/pipeline/cookbooks/round-video-character.md +337 -0
  44. package/pipeline/cookbooks/talking-character.md +59 -0
  45. package/test-import.ts +7 -0
  46. package/test-services.ts +97 -0
  47. package/tsconfig.json +29 -0
  48. package/utilities/s3.ts +147 -0
@@ -0,0 +1,179 @@
1
+ ---
2
+ name: audio-transcription
3
+ description: transcribe audio to text or subtitles using groq whisper or fireworks with srt/vtt support. use when converting speech to text, generating subtitles, or need word-level timestamps for captions.
4
+ allowed-tools: Read, Bash
5
+ ---
6
+
7
+ # audio transcription
8
+
9
+ convert audio to text or subtitle files using ai transcription.
10
+
11
+ ## providers
12
+
13
+ ### groq (ultra-fast)
14
+ - uses whisper-large-v3
15
+ - fastest transcription (~5-10 seconds)
16
+ - plain text output
17
+ - sentence-level timing
18
+ - best for: quick transcripts, text extraction
19
+
20
+ ### fireworks (word-level)
21
+ - uses whisper-v3
22
+ - word-level timestamps
23
+ - outputs srt or vtt format
24
+ - precise subtitle timing
25
+ - best for: captions, subtitles, timed transcripts
26
+
27
+ ## usage
28
+
29
+ ### basic transcription
30
+ ```bash
31
+ bun run action/transcribe/index.ts <audioUrl> [provider] [outputPath]
32
+ ```
33
+
34
+ **example:**
35
+ ```bash
36
+ bun run action/transcribe/index.ts media/audio.mp3 groq
37
+ bun run action/transcribe/index.ts media/audio.mp3 fireworks output.srt
38
+ ```
39
+
40
+ ### with output format
41
+ ```bash
42
+ bun run lib/fireworks.ts <audioPath> <outputPath>
43
+ ```
44
+
45
+ **example:**
46
+ ```bash
47
+ bun run lib/fireworks.ts media/audio.mp3 output.srt
48
+ ```
49
+
50
+ ## as library
51
+
52
+ ```typescript
53
+ import { transcribe } from "./action/transcribe"
54
+
55
+ // groq transcription
56
+ const groqResult = await transcribe({
57
+ audioUrl: "media/audio.mp3",
58
+ provider: "groq",
59
+ outputFormat: "text"
60
+ })
61
+ console.log(groqResult.text)
62
+
63
+ // fireworks with srt
64
+ const fireworksResult = await transcribe({
65
+ audioUrl: "media/audio.mp3",
66
+ provider: "fireworks",
67
+ outputFormat: "srt",
68
+ outputPath: "subtitles.srt"
69
+ })
70
+ console.log(fireworksResult.text)
71
+ console.log(fireworksResult.srt) // srt content (also written to subtitles.srt)
72
+ ```
73
+
74
+ ## output formats
75
+
76
+ ### text (groq default)
77
+ ```
78
+ This is the transcribed text from the audio file.
79
+ All words in plain text format.
80
+ ```
81
+
82
+ ### srt (subtitle format)
83
+ ```
84
+ 1
85
+ 00:00:00,000 --> 00:00:02,500
86
+ This is the first subtitle
87
+
88
+ 2
89
+ 00:00:02,500 --> 00:00:05,000
90
+ This is the second subtitle
91
+ ```
92
+
93
+ ### vtt (web video text tracks)
94
+ ```
95
+ WEBVTT
96
+
97
+ 00:00:00.000 --> 00:00:02.500
98
+ This is the first subtitle
99
+
100
+ 00:00:02.500 --> 00:00:05.000
101
+ This is the second subtitle
102
+ ```
103
+
104
+ ## when to use
105
+
106
+ use this skill when:
107
+ - converting speech to text
108
+ - generating subtitles for videos
109
+ - creating accessible content
110
+ - need word-level timing for captions
111
+ - extracting dialogue from media
112
+ - preparing transcripts for analysis
113
+
114
+ ## provider comparison
115
+
116
+ | feature | groq | fireworks |
117
+ |---------|------|-----------|
118
+ | speed | ultra-fast (5-10s) | moderate (15-30s) |
119
+ | output | plain text | srt/vtt with timestamps |
120
+ | timing | sentence-level | word-level |
121
+ | use case | quick transcripts | precise subtitles |
122
+
123
+ ## typical workflows
124
+
125
+ ### for captions
126
+ 1. record or generate audio (voice service)
127
+ 2. transcribe with fireworks (this service)
128
+ 3. add captions to video (captions service)
129
+
130
+ ### for transcripts
131
+ 1. extract audio from video
132
+ 2. transcribe with groq (this service)
133
+ 3. use text for analysis or documentation
134
+
135
+ ## tips
136
+
137
+ **provider selection:**
138
+ - use **groq** when you just need the text fast
139
+ - use **fireworks** when you need subtitle files
140
+ - use **fireworks** for captions on social media videos
141
+
142
+ **audio quality:**
143
+ - clear audio transcribes more accurately
144
+ - reduce background noise when possible
145
+ - supports mp3, wav, m4a, and most audio formats
146
+
147
+ **timing accuracy:**
148
+ - fireworks provides word-level timestamps
149
+ - perfect for lip-sync verification
150
+ - great for precise subtitle placement
151
+
152
+ ## integration with other services
153
+
154
+ perfect companion for:
155
+ - **captions service** - auto-generate video subtitles
156
+ - **voice service** - transcribe generated speech
157
+ - **sync service** - verify audio timing
158
+
159
+ ## environment variables
160
+
161
+ required:
162
+ - `GROQ_API_KEY` - for groq provider
163
+ - `FIREWORKS_API_KEY` - for fireworks provider
164
+
165
+ ## processing time
166
+
167
+ - **groq**: 5-10 seconds (any audio length)
168
+ - **fireworks**: 15-30 seconds (depending on audio length)
169
+
170
+ ## supported formats
171
+
172
+ input audio:
173
+ - mp3, wav, m4a, ogg, flac
174
+ - video files (extracts audio automatically)
175
+
176
+ output formats:
177
+ - text (plain text)
178
+ - srt (subtitles)
179
+ - vtt (web video text tracks)
@@ -0,0 +1,227 @@
1
+ #!/usr/bin/env bun
2
+
3
+ /**
4
+ * audio transcription service
5
+ * supports groq whisper, fireworks api, and future providers
6
+ */
7
+
8
+ import { writeFileSync } from "node:fs";
9
+ import { join } from "node:path";
10
+ import { toFile } from "groq-sdk/uploads";
11
+ import {
12
+ convertFireworksToSRT,
13
+ transcribeWithFireworks as fireworksTranscribe,
14
+ } from "../../lib/fireworks";
15
+ import { GROQ_MODELS, transcribeAudio as groqTranscribe } from "../../lib/groq";
16
+
17
+ // types
18
/** Options accepted by `transcribe`. */
export interface TranscribeOptions {
  /** Remote `http(s)` url or local file path of the audio to transcribe. */
  audioUrl: string;
  /** Transcription backend; `transcribe` uses "groq" when omitted. */
  provider?: "groq" | "fireworks";
  /** Model identifier; only forwarded to the groq provider. */
  model?: string;
  /** Language hint; only forwarded to the groq provider. */
  language?: string;
  /** Selects whether `srt` or `text` is written to `outputPath` ("text" when omitted). */
  outputFormat?: "text" | "srt";
  /** When set, the selected transcription content is also written to this file. */
  outputPath?: string;
}
26
+
27
/** Outcome of a transcription attempt; provider failures are reported here rather than thrown. */
export interface TranscribeResult {
  /** `true` when the provider call completed, `false` when it failed. */
  success: boolean;
  /** Plain transcript text, when available. */
  text?: string;
  /** Subtitle content; for groq "srt" requests this mirrors the plain text. */
  srt?: string;
  /** Failure description, set when `success` is `false`. */
  error?: string;
}
33
+
34
+ // groq transcription
35
/**
 * Transcribe audio with groq whisper.
 *
 * Loads the audio from a remote `http(s)` url or a local path, wraps it with
 * `toFile`, and hands it to the groq helper. Failures are never thrown: they
 * are logged and reported as `success: false` with an `error` message.
 *
 * @param audioUrl - remote url or local file path of the audio
 * @param options - optional model / language overrides and the requested format
 * @returns the transcription result; for "srt" requests the plain text is
 *   mirrored into `srt` because only plain text is returned here
 */
async function transcribeWithGroq(
  audioUrl: string,
  options: {
    model?: string;
    language?: string;
    outputFormat?: "text" | "srt";
  },
): Promise<TranscribeResult> {
  try {
    console.log("[transcribe] using groq whisper...");

    // load audio file (local or remote)
    let audioBuffer: ArrayBuffer;
    let fileName = "audio.mp3";

    if (audioUrl.startsWith("http://") || audioUrl.startsWith("https://")) {
      // fetch remote file
      const audioResponse = await fetch(audioUrl);
      // fetch resolves on HTTP errors too; without this check a 404/500
      // error page would be uploaded to groq as if it were audio
      if (!audioResponse.ok) {
        throw new Error(
          `failed to fetch audio (${audioResponse.status} ${audioResponse.statusText}): ${audioUrl}`,
        );
      }
      audioBuffer = await audioResponse.arrayBuffer();
    } else {
      // read local file with bun
      const file = Bun.file(audioUrl);
      audioBuffer = await file.arrayBuffer();
      // keep the real file name so the upload carries the original extension
      fileName = audioUrl.split("/").pop() || "audio.mp3";
    }

    const audioFile = await toFile(audioBuffer, fileName);

    // transcribe with groq
    const text = await groqTranscribe({
      file: audioFile,
      model: options.model || GROQ_MODELS.WHISPER_LARGE,
      language: options.language,
    });

    console.log("[transcribe] groq transcription complete");

    if (options.outputFormat === "srt") {
      // groq returns plain text, so there is nothing to build real srt from;
      // for now return the text in both fields and warn the caller
      console.warn(
        "[transcribe] groq returns plain text, use fireworks for srt format",
      );
      return { success: true, text, srt: text };
    }

    return { success: true, text };
  } catch (error) {
    console.error("[transcribe] groq error:", error);
    return {
      success: false,
      error:
        error instanceof Error ? error.message : "groq transcription failed",
    };
  }
}
91
+
92
+ // fireworks transcription (with srt support)
93
/**
 * Transcribe audio through the fireworks helper and build srt subtitles
 * from the returned words.
 *
 * Failures are logged and reported as `success: false`; nothing is thrown.
 *
 * @param audioUrl - audio location, passed to the fireworks helper as `audioPath`
 * @returns result carrying both the srt content and the plain text
 */
async function transcribeWithFireworks(
  audioUrl: string,
): Promise<TranscribeResult> {
  try {
    console.log("[transcribe] using fireworks api...");

    const response = await fireworksTranscribe({ audioPath: audioUrl });

    // a response without words still yields a (possibly empty) srt string
    const words = response.words || [];
    const srt = convertFireworksToSRT(words);
    console.log("[transcribe] fireworks transcription complete");

    return { success: true, srt, text: response.text };
  } catch (error) {
    console.error("[transcribe] fireworks error:", error);

    let message = "fireworks transcription failed";
    if (error instanceof Error) {
      message = error.message;
    }
    return { success: false, error: message };
  }
}
118
+
119
+ // main transcription function
120
/**
 * Transcribe audio with the selected provider and optionally save the result.
 *
 * @param options - see `TranscribeOptions`; `provider` defaults to "groq" and
 *   `outputFormat` to "text"
 * @returns the provider result; provider failures come back as `success: false`
 * @throws Error when `audioUrl` is missing or `provider` is not recognised
 */
export async function transcribe(
  options: TranscribeOptions,
): Promise<TranscribeResult> {
  const {
    audioUrl,
    provider = "groq",
    model,
    language,
    outputFormat = "text",
    outputPath,
  } = options;

  if (!audioUrl) {
    throw new Error("audioUrl is required");
  }

  console.log(`[transcribe] transcribing ${audioUrl} with ${provider}...`);

  // dispatch to the chosen provider
  let result: TranscribeResult;
  switch (provider) {
    case "groq":
      result = await transcribeWithGroq(audioUrl, {
        model,
        language,
        outputFormat,
      });
      break;
    case "fireworks":
      result = await transcribeWithFireworks(audioUrl);
      break;
    default:
      throw new Error(`unknown provider: ${provider}`);
  }

  // nothing to persist on failure or when no path was requested
  if (!result.success || !outputPath) {
    return result;
  }

  const content = outputFormat === "srt" ? result.srt : result.text;
  if (content) {
    writeFileSync(outputPath, content);
    console.log(`[transcribe] saved to ${outputPath}`);
  }

  return result;
}
164
+
165
+ // cli
166
/**
 * Command-line entry point: `<audioPath> [provider] [outputPath]`.
 *
 * Prints usage and exits 0 when called without arguments or with `help`.
 * Otherwise runs `transcribe`, prints the transcription, and exits 1 on any
 * failure. Fireworks runs request "srt" output, groq runs request "text".
 * The result is always saved: to `outputPath` when given, otherwise to
 * `output.txt` in the current working directory.
 */
async function cli() {
  const args = process.argv.slice(2);
  const audioUrl = args[0];

  if (!audioUrl || audioUrl === "help") {
    console.log(`
usage:
  bun run action/transcribe/index.ts <audioPath> [provider] [outputPath]

arguments:
  audioPath  - url or local path to audio file
  provider   - groq (default) | fireworks
  outputPath - optional path to save transcription (default: ./output.txt)

examples:
  bun run action/transcribe/index.ts https://example.com/audio.mp3
  bun run action/transcribe/index.ts media/dora.ogg groq
  bun run action/transcribe/index.ts https://example.com/audio.mp3 fireworks output.srt
  bun run action/transcribe/index.ts media/audio.mp3 groq output.txt

providers:
  groq      - ultra-fast whisper (text only, free tier available)
  fireworks - slower but includes srt timestamps (uses reels-srt api)

environment:
  GROQ_API_KEY      - your groq api key (for groq provider)
  FIREWORKS_API_KEY - your fireworks api key (for fireworks provider)
`);
    process.exit(0);
  }

  try {
    // validate instead of casting so a typo is rejected with a clear message
    const provider = args[1] || "groq";
    if (provider !== "groq" && provider !== "fireworks") {
      throw new Error(`unknown provider: ${provider}`);
    }
    const outputPath = args[2];

    const result = await transcribe({
      audioUrl,
      provider,
      outputFormat: provider === "fireworks" ? "srt" : "text",
      // the cli always persists the result; fall back to ./output.txt
      outputPath: outputPath || join(process.cwd(), "output.txt"),
    });

    if (result.success) {
      console.log("\ntranscription:");
      console.log(result.srt || result.text);
    } else {
      console.error(`\nerror: ${result.error}`);
      process.exit(1);
    }
  } catch (error) {
    console.error("[transcribe] error:", error);
    process.exit(1);
  }
}
224
+
225
+ if (import.meta.main) {
226
+ cli();
227
+ }
@@ -0,0 +1,116 @@
1
+ ---
2
+ name: video-generation
3
+ description: generate videos from images or text prompts using fal.ai. use when user wants to animate images, create videos from text, or needs ai video generation with 5-10 second clips.
4
+ allowed-tools: Read, Bash
5
+ ---
6
+
7
+ # video generation
8
+
9
+ generate ai videos from images or text using fal.ai with automatic s3 upload support.
10
+
11
+ ## capabilities
12
+
13
+ - **image-to-video**: animate static images with motion prompts
14
+ - **text-to-video**: generate videos directly from text descriptions
15
+ - supports 5 or 10 second duration
16
+ - automatic s3 upload
17
+
18
+ ## usage
19
+
20
+ ### generate from image
21
+ ```bash
22
+ bun run action/video/index.ts from_image <prompt> <imageUrl> [duration] [upload]
23
+ ```
24
+
25
+ **parameters:**
26
+ - `prompt` (required): motion description (e.g., "camera pan left")
27
+ - `imageUrl` (required): url of the source image
28
+ - `duration` (optional): 5 or 10 seconds (default: 5)
29
+ - `upload` (optional): "true" to upload to s3
30
+
31
+ **example:**
32
+ ```bash
33
+ bun run action/video/index.ts from_image "person talking naturally" https://example.com/headshot.jpg 5 true
34
+ ```
35
+
36
+ ### generate from text
37
+ ```bash
38
+ bun run action/video/index.ts from_text <prompt> [duration] [upload]
39
+ ```
40
+
41
+ **parameters:**
42
+ - `prompt` (required): video scene description
43
+ - `duration` (optional): 5 or 10 seconds (default: 5)
44
+ - `upload` (optional): "true" to upload to s3
45
+
46
+ **example:**
47
+ ```bash
48
+ bun run action/video/index.ts from_text "waves crashing on beach at sunset" 10 true
49
+ ```
50
+
51
+ ## as library
52
+
53
+ ```typescript
54
+ import { generateVideoFromImage, generateVideoFromText } from "./action/video"
55
+
56
+ // animate an image
57
+ const videoResult = await generateVideoFromImage(
58
+ "camera zoom in slowly",
59
+ "https://example.com/portrait.jpg",
60
+ { duration: 5, upload: true }
61
+ )
62
+ console.log(videoResult.videoUrl)
63
+ console.log(videoResult.uploaded) // s3 url if upload=true
64
+
65
+ // generate from text
66
+ const textVideo = await generateVideoFromText(
67
+ "forest path with sunlight filtering through trees",
68
+ { duration: 10, upload: true }
69
+ )
70
+ ```
71
+
72
+ ## output
73
+
74
+ returns `VideoGenerationResult`:
75
+ ```typescript
76
+ {
77
+ videoUrl: string, // direct video url
78
+ duration?: number, // actual video duration
79
+ uploaded?: string // s3 url if upload requested
80
+ }
81
+ ```
82
+
83
+ ## when to use
84
+
85
+ use this skill when:
86
+ - animating character headshots or portraits
87
+ - creating motion from static images
88
+ - generating video clips from text descriptions
89
+ - preparing videos for lipsync or editing pipeline
90
+ - need short form video content (5-10s)
91
+
92
+ ## tips
93
+
94
+ **for character animation:**
95
+ - use subtle prompts like "person talking naturally" or "slight head movement"
96
+ - keep duration at 5 seconds for character shots
97
+ - combine with lipsync for talking videos
98
+
99
+ **for scene generation:**
100
+ - be descriptive about camera movement and scene dynamics
101
+ - 10 seconds works better for landscape/scene videos
102
+
103
+ ## environment variables
104
+
105
+ required:
106
+ - `FAL_API_KEY` - for fal video generation
107
+
108
+ optional (for s3 upload):
109
+ - `CLOUDFLARE_R2_API_URL`
110
+ - `CLOUDFLARE_ACCESS_KEY_ID`
111
+ - `CLOUDFLARE_ACCESS_SECRET`
112
+ - `CLOUDFLARE_R2_BUCKET`
113
+
114
+ ## generation time
115
+
116
+ expect 2-3 minutes per video clip
@@ -0,0 +1,135 @@
1
+ #!/usr/bin/env bun
2
+ /**
3
+ * video generation service built on fal (image-to-video and text-to-video)
4
+ * usage: bun run action/video/index.ts <command> <args>
5
+ */
6
+
7
+ import { imageToVideo, textToVideo } from "../../lib/fal";
8
+ import { uploadFromUrl } from "../../utilities/s3";
9
+
10
/** Result returned by the video generation helpers in this module. */
export interface VideoGenerationResult {
  /** Url of the generated video as reported by the provider. */
  videoUrl: string;
  /** Duration reported by the provider, when present in the response. */
  duration?: number;
  /** Value returned by the upload helper; only set when an upload was requested. */
  uploaded?: string;
}
15
+
16
/**
 * Animate a source image into a video clip.
 *
 * @param prompt - motion description forwarded to `imageToVideo`
 * @param imageUrl - url of the source image
 * @param options - optional `duration` (5 or 10) and `upload` flag; when
 *   `upload` is truthy the video is copied to `videos/generated/<timestamp>.mp4`
 * @returns the video url, the reported duration and the upload result (if any)
 * @throws Error when the generation response contains no video url
 */
export async function generateVideoFromImage(
  prompt: string,
  imageUrl: string,
  options: { duration?: 5 | 10; upload?: boolean } = {},
): Promise<VideoGenerationResult> {
  console.log("[service/video] generating video from image");

  const requestedDuration = options.duration;
  const generation = await imageToVideo({
    prompt,
    imageUrl,
    duration: requestedDuration,
  });

  const videoUrl = generation.data?.video?.url;
  if (!videoUrl) {
    throw new Error("no video url in result");
  }

  // without an upload request the result carries an explicit undefined
  if (!options.upload) {
    return {
      videoUrl,
      duration: generation.data?.duration,
      uploaded: undefined,
    };
  }

  // timestamp-based key keeps successive uploads from overwriting each other
  const objectKey = `videos/generated/${Date.now()}.mp4`;
  const uploaded: string | undefined = await uploadFromUrl(videoUrl, objectKey);
  console.log(`[service/video] uploaded to ${uploaded}`);

  return {
    videoUrl,
    duration: generation.data?.duration,
    uploaded,
  };
}
48
+
49
/**
 * Generate a video clip from a text prompt.
 *
 * @param prompt - scene description forwarded to `textToVideo`
 * @param options - optional `duration` (5 or 10) and `upload` flag; when
 *   `upload` is truthy the video is copied to `videos/generated/<timestamp>.mp4`
 * @returns the video url, the reported duration and the upload result (if any)
 * @throws Error when the generation response contains no video url
 */
export async function generateVideoFromText(
  prompt: string,
  options: { duration?: 5 | 10; upload?: boolean } = {},
): Promise<VideoGenerationResult> {
  console.log("[service/video] generating video from text");

  const requestedDuration = options.duration;
  const generation = await textToVideo({
    prompt,
    duration: requestedDuration,
  });

  const videoUrl = generation.data?.video?.url;
  if (!videoUrl) {
    throw new Error("no video url in result");
  }

  // without an upload request the result carries an explicit undefined
  if (!options.upload) {
    return {
      videoUrl,
      duration: generation.data?.duration,
      uploaded: undefined,
    };
  }

  // timestamp-based key keeps successive uploads from overwriting each other
  const objectKey = `videos/generated/${Date.now()}.mp4`;
  const uploaded: string | undefined = await uploadFromUrl(videoUrl, objectKey);
  console.log(`[service/video] uploaded to ${uploaded}`);

  return {
    videoUrl,
    duration: generation.data?.duration,
    uploaded,
  };
}
79
+
80
+ // cli runner
81
// Runs only when this file is executed directly (not when imported).
// Commands: from_image <prompt> <imageUrl> [duration] [upload]
//           from_text  <prompt> [duration] [upload]
// Exits with status 1 on bad arguments or when generation fails.
if (import.meta.main) {
  const [command, ...args] = process.argv.slice(2);

  const fromImageUsage =
    "bun run action/video/index.ts from_image <prompt> <imageUrl> [duration] [upload]";
  const fromTextUsage =
    "bun run action/video/index.ts from_text <prompt> [duration] [upload]";

  // Print the given usage lines and terminate with a failure status.
  const exitWithUsage = (...lines: string[]): never => {
    console.log(`\nusage:\n${lines.map((line) => `  ${line}`).join("\n")}\n`);
    process.exit(1);
  };

  // Validate the optional duration argument; anything other than "5" or "10"
  // is rejected, and a missing value falls back to 5 seconds.
  const parseDuration = (raw: string | undefined): 5 | 10 => {
    if (raw && raw !== "5" && raw !== "10") {
      console.error("duration must be 5 or 10");
      process.exit(1);
    }
    return raw === "10" ? 10 : 5;
  };

  try {
    switch (command) {
      case "from_image": {
        const [prompt, imageUrl, rawDuration, rawUpload] = args;
        if (!prompt || !imageUrl) {
          exitWithUsage(fromImageUsage);
          break;
        }
        const imgResult = await generateVideoFromImage(prompt, imageUrl, {
          duration: parseDuration(rawDuration),
          // only the literal string "true" enables the upload
          upload: rawUpload === "true",
        });
        console.log(JSON.stringify(imgResult, null, 2));
        break;
      }

      case "from_text": {
        const [prompt, rawDuration, rawUpload] = args;
        if (!prompt) {
          exitWithUsage(fromTextUsage);
          break;
        }
        const txtResult = await generateVideoFromText(prompt, {
          duration: parseDuration(rawDuration),
          upload: rawUpload === "true",
        });
        console.log(JSON.stringify(txtResult, null, 2));
        break;
      }

      default:
        exitWithUsage(fromImageUsage, fromTextUsage);
    }
  } catch (error) {
    // report generation/upload failures explicitly instead of leaving an
    // unhandled rejection, matching how the transcribe cli reports errors
    console.error("[service/video] error:", error);
    process.exit(1);
  }
}