vargai 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (154) hide show
  1. package/.claude/settings.local.json +7 -0
  2. package/.env.example +27 -0
  3. package/.github/workflows/ci.yml +23 -0
  4. package/.husky/README.md +102 -0
  5. package/.husky/commit-msg +6 -0
  6. package/.husky/pre-commit +9 -0
  7. package/.husky/pre-push +6 -0
  8. package/.size-limit.json +8 -0
  9. package/.test-hooks.ts +5 -0
  10. package/CLAUDE.md +125 -0
  11. package/CONTRIBUTING.md +150 -0
  12. package/LICENSE.md +53 -0
  13. package/README.md +78 -0
  14. package/SKILLS.md +173 -0
  15. package/STRUCTURE.md +92 -0
  16. package/biome.json +34 -0
  17. package/bun.lock +1254 -0
  18. package/commitlint.config.js +22 -0
  19. package/docs/plan.md +66 -0
  20. package/docs/todo.md +14 -0
  21. package/docs/varg-sdk.md +812 -0
  22. package/ffmpeg/CLAUDE.md +68 -0
  23. package/package.json +69 -0
  24. package/pipeline/cookbooks/SKILL.md +285 -0
  25. package/pipeline/cookbooks/remotion-video.md +585 -0
  26. package/pipeline/cookbooks/round-video-character.md +337 -0
  27. package/pipeline/cookbooks/scripts/animate-frames-parallel.ts +84 -0
  28. package/pipeline/cookbooks/scripts/combine-scenes.sh +53 -0
  29. package/pipeline/cookbooks/scripts/generate-frames-parallel.ts +99 -0
  30. package/pipeline/cookbooks/scripts/still-to-video.sh +37 -0
  31. package/pipeline/cookbooks/talking-character.md +59 -0
  32. package/pipeline/cookbooks/text-to-tiktok.md +669 -0
  33. package/pipeline/cookbooks/trendwatching.md +156 -0
  34. package/plan.md +281 -0
  35. package/scripts/.gitkeep +0 -0
  36. package/src/ai-sdk/cache.ts +142 -0
  37. package/src/ai-sdk/examples/cached-generation.ts +53 -0
  38. package/src/ai-sdk/examples/duet-scene-4.ts +53 -0
  39. package/src/ai-sdk/examples/duet-scene-5-audio.ts +32 -0
  40. package/src/ai-sdk/examples/duet-video.ts +56 -0
  41. package/src/ai-sdk/examples/editly-composition.ts +63 -0
  42. package/src/ai-sdk/examples/editly-test.ts +57 -0
  43. package/src/ai-sdk/examples/editly-video-test.ts +52 -0
  44. package/src/ai-sdk/examples/fal-lipsync.ts +43 -0
  45. package/src/ai-sdk/examples/higgsfield-image.ts +61 -0
  46. package/src/ai-sdk/examples/music-generation.ts +19 -0
  47. package/src/ai-sdk/examples/openai-sora.ts +34 -0
  48. package/src/ai-sdk/examples/replicate-bg-removal.ts +52 -0
  49. package/src/ai-sdk/examples/simpsons-scene.ts +61 -0
  50. package/src/ai-sdk/examples/talking-lion.ts +55 -0
  51. package/src/ai-sdk/examples/video-generation.ts +39 -0
  52. package/src/ai-sdk/examples/workflow-animated-girl.ts +104 -0
  53. package/src/ai-sdk/examples/workflow-before-after.ts +114 -0
  54. package/src/ai-sdk/examples/workflow-character-grid.ts +112 -0
  55. package/src/ai-sdk/examples/workflow-slideshow.ts +161 -0
  56. package/src/ai-sdk/file-cache.ts +112 -0
  57. package/src/ai-sdk/file.ts +238 -0
  58. package/src/ai-sdk/generate-element.ts +92 -0
  59. package/src/ai-sdk/generate-music.ts +46 -0
  60. package/src/ai-sdk/generate-video.ts +165 -0
  61. package/src/ai-sdk/index.ts +72 -0
  62. package/src/ai-sdk/music-model.ts +110 -0
  63. package/src/ai-sdk/providers/editly/editly.test.ts +1108 -0
  64. package/src/ai-sdk/providers/editly/ffmpeg.ts +60 -0
  65. package/src/ai-sdk/providers/editly/index.ts +817 -0
  66. package/src/ai-sdk/providers/editly/layers.ts +772 -0
  67. package/src/ai-sdk/providers/editly/plan.md +144 -0
  68. package/src/ai-sdk/providers/editly/types.ts +328 -0
  69. package/src/ai-sdk/providers/elevenlabs-provider.ts +255 -0
  70. package/src/ai-sdk/providers/fal-provider.ts +512 -0
  71. package/src/ai-sdk/providers/higgsfield.ts +379 -0
  72. package/src/ai-sdk/providers/openai.ts +251 -0
  73. package/src/ai-sdk/providers/replicate.ts +16 -0
  74. package/src/ai-sdk/video-model.ts +185 -0
  75. package/src/cli/commands/find.tsx +137 -0
  76. package/src/cli/commands/help.tsx +85 -0
  77. package/src/cli/commands/index.ts +9 -0
  78. package/src/cli/commands/list.tsx +238 -0
  79. package/src/cli/commands/run.tsx +511 -0
  80. package/src/cli/commands/which.tsx +253 -0
  81. package/src/cli/index.ts +112 -0
  82. package/src/cli/quiet.ts +44 -0
  83. package/src/cli/types.ts +32 -0
  84. package/src/cli/ui/components/Badge.tsx +29 -0
  85. package/src/cli/ui/components/DataTable.tsx +51 -0
  86. package/src/cli/ui/components/Header.tsx +23 -0
  87. package/src/cli/ui/components/HelpBlock.tsx +44 -0
  88. package/src/cli/ui/components/KeyValue.tsx +33 -0
  89. package/src/cli/ui/components/OptionRow.tsx +81 -0
  90. package/src/cli/ui/components/Separator.tsx +23 -0
  91. package/src/cli/ui/components/StatusBox.tsx +108 -0
  92. package/src/cli/ui/components/VargBox.tsx +51 -0
  93. package/src/cli/ui/components/VargProgress.tsx +36 -0
  94. package/src/cli/ui/components/VargSpinner.tsx +34 -0
  95. package/src/cli/ui/components/VargText.tsx +56 -0
  96. package/src/cli/ui/components/index.ts +19 -0
  97. package/src/cli/ui/index.ts +12 -0
  98. package/src/cli/ui/render.ts +35 -0
  99. package/src/cli/ui/theme.ts +63 -0
  100. package/src/cli/utils.ts +78 -0
  101. package/src/core/executor/executor.ts +201 -0
  102. package/src/core/executor/index.ts +13 -0
  103. package/src/core/executor/job.ts +214 -0
  104. package/src/core/executor/pipeline.ts +222 -0
  105. package/src/core/index.ts +11 -0
  106. package/src/core/registry/index.ts +9 -0
  107. package/src/core/registry/loader.ts +149 -0
  108. package/src/core/registry/registry.ts +221 -0
  109. package/src/core/registry/resolver.ts +206 -0
  110. package/src/core/schema/helpers.ts +134 -0
  111. package/src/core/schema/index.ts +8 -0
  112. package/src/core/schema/shared.ts +102 -0
  113. package/src/core/schema/types.ts +279 -0
  114. package/src/core/schema/validator.ts +92 -0
  115. package/src/definitions/actions/captions.ts +261 -0
  116. package/src/definitions/actions/edit.ts +298 -0
  117. package/src/definitions/actions/image.ts +125 -0
  118. package/src/definitions/actions/index.ts +114 -0
  119. package/src/definitions/actions/music.ts +205 -0
  120. package/src/definitions/actions/sync.ts +128 -0
  121. package/src/definitions/actions/transcribe.ts +200 -0
  122. package/src/definitions/actions/upload.ts +111 -0
  123. package/src/definitions/actions/video.ts +163 -0
  124. package/src/definitions/actions/voice.ts +119 -0
  125. package/src/definitions/index.ts +23 -0
  126. package/src/definitions/models/elevenlabs.ts +50 -0
  127. package/src/definitions/models/flux.ts +56 -0
  128. package/src/definitions/models/index.ts +36 -0
  129. package/src/definitions/models/kling.ts +56 -0
  130. package/src/definitions/models/llama.ts +54 -0
  131. package/src/definitions/models/nano-banana-pro.ts +102 -0
  132. package/src/definitions/models/sonauto.ts +68 -0
  133. package/src/definitions/models/soul.ts +65 -0
  134. package/src/definitions/models/wan.ts +54 -0
  135. package/src/definitions/models/whisper.ts +44 -0
  136. package/src/definitions/skills/index.ts +12 -0
  137. package/src/definitions/skills/talking-character.ts +87 -0
  138. package/src/definitions/skills/text-to-tiktok.ts +97 -0
  139. package/src/index.ts +118 -0
  140. package/src/providers/apify.ts +269 -0
  141. package/src/providers/base.ts +264 -0
  142. package/src/providers/elevenlabs.ts +217 -0
  143. package/src/providers/fal.ts +392 -0
  144. package/src/providers/ffmpeg.ts +544 -0
  145. package/src/providers/fireworks.ts +193 -0
  146. package/src/providers/groq.ts +149 -0
  147. package/src/providers/higgsfield.ts +145 -0
  148. package/src/providers/index.ts +143 -0
  149. package/src/providers/replicate.ts +147 -0
  150. package/src/providers/storage.ts +206 -0
  151. package/src/tests/all.test.ts +509 -0
  152. package/src/tests/index.ts +33 -0
  153. package/src/tests/unit.test.ts +403 -0
  154. package/tsconfig.json +45 -0
@@ -0,0 +1,114 @@
1
+ /**
2
+ * Action definitions index
3
+ */
4
+
5
+ export type { AddCaptionsOptions, SubtitleStyle } from "./captions";
6
+ // Captions
7
+ export { addCaptions, definition as captions } from "./captions";
8
+ export type {
9
+ CutOptions,
10
+ CutResult,
11
+ FadeOptions,
12
+ FadeResult,
13
+ MergeOptions,
14
+ MergeResult,
15
+ RemoveOptions,
16
+ RemoveResult,
17
+ SplitOptions,
18
+ SplitResult,
19
+ TransitionOptions,
20
+ TransitionResult,
21
+ TrimOptions,
22
+ TrimResult,
23
+ } from "./edit";
24
+ // Video editing (FFmpeg)
25
+ export {
26
+ cut,
27
+ cutDefinition,
28
+ fade,
29
+ fadeDefinition,
30
+ merge,
31
+ mergeDefinition,
32
+ remove,
33
+ removeDefinition,
34
+ split,
35
+ splitDefinition,
36
+ transition,
37
+ transitionDefinition,
38
+ trim,
39
+ trimDefinition,
40
+ } from "./edit";
41
+ export type { ImageGenerationResult } from "./image";
42
+ // Image generation
43
+ export {
44
+ definition as image,
45
+ generateWithFal,
46
+ generateWithSoul,
47
+ } from "./image";
48
+ export type { GenerateMusicOptions, MusicResult } from "./music";
49
+ // Music generation
50
+ export { definition as music, generateMusic } from "./music";
51
+ export type { LipsyncOptions, LipsyncResult, Wav2LipOptions } from "./sync";
52
+ // Lip sync
53
+ export {
54
+ definition as sync,
55
+ lipsync,
56
+ lipsyncOverlay,
57
+ lipsyncWav2Lip,
58
+ } from "./sync";
59
+ export type { TranscribeOptions, TranscribeResult } from "./transcribe";
60
+ // Transcription
61
+ export {
62
+ definition as transcribe,
63
+ transcribe as transcribeAudio,
64
+ } from "./transcribe";
65
+ export type { UploadOptions, UploadResult } from "./upload";
66
+ // Upload
67
+ export { definition as uploadDef, upload } from "./upload";
68
+ export type { VideoGenerationResult } from "./video";
69
+ // Video generation
70
+ export {
71
+ definition as video,
72
+ generateVideoFromImage,
73
+ generateVideoFromText,
74
+ } from "./video";
75
+ export type { GenerateVoiceOptions, VoiceResult } from "./voice";
76
+ // Voice generation
77
+ export { definition as voice, generateVoice } from "./voice";
78
+
79
+ // All action definitions for auto-loading
80
+ import { definition as captionsDefinition } from "./captions";
81
+ import {
82
+ cutDefinition,
83
+ fadeDefinition,
84
+ mergeDefinition,
85
+ removeDefinition,
86
+ splitDefinition,
87
+ transitionDefinition,
88
+ trimDefinition,
89
+ } from "./edit";
90
+ import { definition as imageDefinition } from "./image";
91
+ import { definition as musicDefinition } from "./music";
92
+ import { definition as syncDefinition } from "./sync";
93
+ import { definition as transcribeDefinition } from "./transcribe";
94
+ import { definition as uploadDefinition } from "./upload";
95
+ import { definition as videoDefinition } from "./video";
96
+ import { definition as voiceDefinition } from "./voice";
97
+
98
/**
 * Every action definition exposed by this directory, gathered in one array
 * so they can be loaded together.
 *
 * The element order is part of the exported value and is kept as-is.
 */
export const allActions = [
  // Generation and analysis actions
  videoDefinition,
  imageDefinition,
  voiceDefinition,
  transcribeDefinition,
  musicDefinition,
  syncDefinition,

  // Captions
  captionsDefinition,

  // Video editing (FFmpeg)
  trimDefinition,
  cutDefinition,
  mergeDefinition,
  splitDefinition,
  fadeDefinition,
  transitionDefinition,
  removeDefinition,

  // Upload
  uploadDefinition,
];
@@ -0,0 +1,205 @@
1
+ /**
2
+ * Music generation action
3
+ * Text-to-music via Fal/Sonauto
4
+ */
5
+
6
import { writeFile } from "node:fs/promises";
import { format as formatPath, parse as parsePath } from "node:path";
import { z } from "zod";
import { audioFormatSchema, filePathSchema } from "../../core/schema/shared";
import type { ActionDefinition, ZodSchema } from "../../core/schema/types";
import { falProvider } from "../../providers/fal";
import { storageProvider } from "../../providers/storage";
12
+
13
/**
 * Zod schema describing the inputs accepted by the `music` action.
 *
 * Every field is optional at the schema level; `format` falls back to
 * "mp3" and `numSongs` to 1 when omitted.
 */
const musicInputSchema = z.object({
  // Free-text description of the desired music.
  prompt: z.string().optional().describe("Description of music to generate"),

  // Style keywords.
  tags: z.array(z.string()).optional().describe("Style tags like 'rock', 'energetic'"),

  // Lyrics prompt.
  lyrics: z.string().optional().describe("Optional lyrics prompt"),

  // Audio container/codec of the generated files.
  format: audioFormatSchema.default("mp3").describe("Output format"),

  // Only one or two songs may be requested.
  numSongs: z
    .union([z.literal(1), z.literal(2)])
    .default(1)
    .describe("Number of songs to generate"),

  // Where to save the result locally, if anywhere.
  output: filePathSchema.optional().describe("Output file path"),
});
28
+
29
/** Shape of a single generated audio file inside the action output. */
const generatedAudioSchema = z.object({
  url: z.string(),
  fileName: z.string(),
  contentType: z.string(),
  fileSize: z.number(),
});

/**
 * Zod schema describing what the `music` action returns: the seed, the
 * optional tags/lyrics, the list of generated audio files and, optionally,
 * the URLs of uploaded copies.
 */
const musicOutputSchema = z.object({
  seed: z.number(),
  tags: z.array(z.string()).optional(),
  lyrics: z.string().optional(),
  audio: z.array(generatedAudioSchema),
  uploadUrls: z.array(z.string()).optional(),
});
44
+
45
/** Input/output schema pair handed to the `music` action definition. */
const schema: ZodSchema<typeof musicInputSchema, typeof musicOutputSchema> = {
  output: musicOutputSchema,
  input: musicInputSchema,
};
50
+
51
/**
 * Action definition for `music`.
 *
 * `execute` forwards the validated inputs to `generateMusic`, mapping the
 * schema's `output` field onto the function's `outputPath` option.
 */
export const definition: ActionDefinition<typeof schema> = {
  type: "action",
  name: "music",
  description: "Generate music from text prompt or tags",
  schema,
  routes: [],
  execute: async (inputs) => {
    const { prompt, tags, lyrics, format, numSongs, output } = inputs;
    return generateMusic({
      prompt,
      tags,
      lyrics,
      format,
      numSongs,
      outputPath: output,
    });
  },
};
68
+
69
// Types

/** Options accepted by `generateMusic`. */
export interface GenerateMusicOptions {
  /** Text description of the music; `generateMusic` requires this or `tags`. */
  prompt?: string;
  /** Style tags; `generateMusic` requires this or `prompt`. */
  tags?: string[];
  /** Lyrics prompt, forwarded to the provider as `lyricsPrompt`. */
  lyrics?: string;
  /** Seed forwarded to the provider unchanged. */
  seed?: number;
  /** Forwarded to the provider; `generateMusic` defaults it to 2. */
  promptStrength?: number;
  /** Forwarded to the provider; `generateMusic` defaults it to 0.7. */
  balanceStrength?: number;
  /** How many songs to request; `generateMusic` defaults it to 1. */
  numSongs?: 1 | 2;
  /** Output audio format; `generateMusic` defaults it to "mp3". */
  format?: "flac" | "mp3" | "wav" | "ogg" | "m4a";
  /** Forwarded to the provider as `outputBitRate`. */
  bitRate?: 128 | 192 | 256 | 320;
  /** Tempo, or "auto"; `generateMusic` defaults it to "auto". */
  bpm?: number | "auto";
  /** When true, each generated file is also uploaded to storage. */
  upload?: boolean;
  /** When set, generated audio is downloaded and written to this path. */
  outputPath?: string;
}
84
+
85
/** Value returned by `generateMusic`. */
export interface MusicResult {
  /** Seed reported by the provider. */
  seed: number;
  /** Tags reported by the provider, if any. */
  tags?: string[];
  /** Lyrics reported by the provider, if any. */
  lyrics?: string;
  /** One entry per generated audio file. */
  audio: {
    url: string;
    fileName: string;
    contentType: string;
    fileSize: number;
  }[];
  /** URLs of the uploaded copies; only set when an upload was requested. */
  uploadUrls?: string[];
}
97
+
98
/**
 * Build the path for the n-th song when several are saved from one
 * `outputPath`: `-<n>` is appended to the file name and the extension is
 * set to `ext`.
 *
 * Works on the parsed file name rather than a regex over the whole string,
 * so paths without an extension ("out/song") and paths whose only dot is in
 * a directory part ("./out/song") still get a distinct name per song.
 */
function numberedOutputPath(
  outputPath: string,
  n: number,
  ext: string,
): string {
  const { root, dir, name } = parsePath(outputPath);
  return formatPath({ root, dir, name: `${name}-${n}`, ext: `.${ext}` });
}

/**
 * Generate one or two songs through `falProvider.textToMusic`, optionally
 * saving them to disk and/or uploading them to storage.
 *
 * @param options - Generation settings. At least one of `prompt` or a
 *   non-empty `tags` list must be supplied.
 * @returns The provider's seed, tags and lyrics, the generated audio files,
 *   and `uploadUrls` when `upload` was requested.
 * @throws Error when neither a prompt nor any tag is given, or when a
 *   generated file cannot be downloaded for saving.
 */
export async function generateMusic(
  options: GenerateMusicOptions,
): Promise<MusicResult> {
  const {
    prompt,
    tags,
    lyrics,
    seed,
    promptStrength = 2,
    balanceStrength = 0.7,
    numSongs = 1,
    format = "mp3",
    bitRate,
    bpm = "auto",
    upload = false,
    outputPath,
  } = options;

  // An empty tag list carries no information, so it counts as "no tags".
  const tagList = tags ?? [];
  if (!prompt && tagList.length === 0) {
    throw new Error("Either prompt or tags is required");
  }

  console.log(`[music] generating ${numSongs} song(s)...`);
  if (prompt) console.log(`[music] prompt: ${prompt}`);
  if (tagList.length > 0) console.log(`[music] tags: ${tagList.join(", ")}`);

  const result = await falProvider.textToMusic({
    prompt,
    tags,
    lyricsPrompt: lyrics,
    seed,
    promptStrength,
    balanceStrength,
    numSongs,
    outputFormat: format,
    outputBitRate: bitRate,
    bpm,
  });

  // The provider may return either a single audio object or an array of
  // them; normalise to an array before converting to camelCase fields.
  const rawAudio: Array<{
    url: string;
    file_name: string;
    content_type: string;
    file_size: number;
  }> = Array.isArray(result.data.audio)
    ? result.data.audio
    : [result.data.audio];

  const musicResult: MusicResult = {
    seed: result.data.seed,
    tags: result.data.tags,
    lyrics: result.data.lyrics,
    audio: rawAudio.map((a) => ({
      url: a.url,
      fileName: a.file_name,
      contentType: a.content_type,
      fileSize: a.file_size,
    })),
  };

  // Save files locally if requested
  if (outputPath) {
    const severalFiles = musicResult.audio.length > 1;

    for (const [index, audio] of musicResult.audio.entries()) {
      // A single song goes exactly where the caller asked; several songs
      // each get a numbered sibling path so they do not overwrite each other.
      const filePath = severalFiles
        ? numberedOutputPath(outputPath, index + 1, format)
        : outputPath;

      const response = await fetch(audio.url);
      if (!response.ok) {
        // Without this check an HTTP error page would be written to disk
        // as if it were audio.
        throw new Error(
          `Failed to download ${audio.url}: ${response.status} ${response.statusText}`,
        );
      }

      await writeFile(filePath, Buffer.from(await response.arrayBuffer()));
      console.log(`[music] saved to ${filePath}`);
    }
  }

  // Upload to storage if requested
  if (upload) {
    const uploadUrls: string[] = [];

    for (const [index, audio] of musicResult.audio.entries()) {
      const objectKey = `music/${Date.now()}-${index + 1}.${format}`;
      const uploadUrl = await storageProvider.uploadFromUrl(
        audio.url,
        objectKey,
      );
      uploadUrls.push(uploadUrl);
      console.log(`[music] uploaded to ${uploadUrl}`);
    }

    musicResult.uploadUrls = uploadUrls;
  }

  return musicResult;
}
204
+
205
+ export default definition;
@@ -0,0 +1,128 @@
1
+ /**
2
+ * Lip sync action
3
+ * Audio-to-video synchronization
4
+ */
5
+
6
+ import { z } from "zod";
7
+ import {
8
+ filePathSchema,
9
+ resolutionSchema,
10
+ videoDurationStringSchema,
11
+ } from "../../core/schema/shared";
12
+ import type { ActionDefinition, ZodSchema } from "../../core/schema/types";
13
+ import { falProvider } from "../../providers/fal";
14
+ import { ffmpegProvider } from "../../providers/ffmpeg";
15
+
16
/**
 * Zod schema describing the inputs accepted by the `sync` action.
 *
 * `duration` falls back to "5" and `resolution` to "480p" when omitted.
 */
const syncInputSchema = z.object({
  // Still image to use as the input.
  image: filePathSchema.describe("Input image"),

  // Audio track for the lip sync.
  audio: filePathSchema.describe("Audio file"),

  // Scene description.
  prompt: z.string().describe("Description of the scene"),

  duration: videoDurationStringSchema.default("5").describe("Output duration"),

  resolution: resolutionSchema.default("480p").describe("Output resolution"),
});
24
+
25
/** Zod schema for the `sync` action output: the URL of the generated video. */
const syncOutputSchema = z.object({ videoUrl: z.string() });
29
+
30
/** Input/output schema pair handed to the `sync` action definition. */
const schema: ZodSchema<typeof syncInputSchema, typeof syncOutputSchema> = {
  output: syncOutputSchema,
  input: syncInputSchema,
};
35
+
36
/**
 * Action definition for `sync`.
 *
 * `execute` passes the validated inputs straight through to `lipsync`.
 */
export const definition: ActionDefinition<typeof schema> = {
  type: "action",
  name: "sync",
  description: "Lip sync audio to video/image",
  schema,
  routes: [],
  execute: async ({ image, audio, prompt, duration, resolution }) =>
    lipsync({ image, audio, prompt, duration, resolution }),
};
47
+
48
// Types

/** Options accepted by `lipsync`. */
export interface LipsyncOptions {
  /** Input image; forwarded to the provider as `imageUrl`. */
  image: string;
  /** Audio track; forwarded to the provider as `audioUrl`. */
  audio: string;
  /** Scene description forwarded to the provider. */
  prompt: string;
  /** Output duration; `lipsync` defaults it to "5". */
  duration?: "5" | "10";
  /** Output resolution; `lipsync` defaults it to "480p". */
  resolution?: "480p" | "720p" | "1080p";
}
56
+
57
/** Value returned by `lipsync`. */
export interface LipsyncResult {
  /** URL of the generated video as reported by the provider. */
  videoUrl: string;
}
60
+
61
/** Options accepted by `lipsyncWav2Lip`. */
export interface Wav2LipOptions {
  /** Source video. */
  videoPath: string;
  /** Audio track intended for the lip sync. */
  audioPath: string;
  /** Where the resulting video is written. */
  outputPath: string;
}
66
+
67
/**
 * Generate a lip-synced video with Wan-25 via `falProvider.wan25`.
 *
 * @param options - Image, audio and prompt, plus optional duration
 *   (default "5") and resolution (default "480p").
 * @returns The URL of the generated video.
 * @throws Error when the provider response carries no video URL.
 */
export async function lipsync(options: LipsyncOptions): Promise<LipsyncResult> {
  const {
    image: imageUrl,
    audio: audioUrl,
    prompt,
    duration = "5",
    resolution = "480p",
  } = options;

  console.log("[sync] generating lip-synced video with wan-25...");

  const { data } = await falProvider.wan25({
    imageUrl,
    audioUrl,
    prompt,
    duration,
    resolution,
  });

  const videoUrl = data?.video?.url;
  if (videoUrl) {
    return { videoUrl };
  }

  throw new Error("No video URL in result");
}
90
+
91
/**
 * Overlay a lip-synced face onto the original video.
 *
 * Currently a placeholder: `originalVideo` is not read. The lip-synced
 * clip is passed through `ffmpegProvider.convertFormat` and written to
 * `outputPath` as-is.
 *
 * @returns The `outputPath` that was written.
 */
export async function lipsyncOverlay(options: {
  originalVideo: string;
  lipsyncedVideo: string;
  outputPath: string;
}): Promise<string> {
  console.log("[sync] overlaying lip-synced video...");

  // A real overlay would need more complex ffmpeg operations; for now the
  // lip-synced video is simply converted to the requested output.
  await ffmpegProvider.convertFormat({
    input: options.lipsyncedVideo,
    output: options.outputPath,
  });

  return options.outputPath;
}
112
+
113
/**
 * Wav2Lip-style lip sync (placeholder for future implementation).
 *
 * No lip sync is performed yet: `audioPath` is not read, and the input
 * video is only passed through `ffmpegProvider.convertFormat` to
 * `outputPath`. The warning says exactly that, so callers are not led to
 * believe a wan-25 fallback ran.
 *
 * @returns The `outputPath` that was written.
 */
export async function lipsyncWav2Lip(options: Wav2LipOptions): Promise<string> {
  const { videoPath, outputPath } = options;

  console.warn(
    "[sync] wav2lip not yet implemented; converting input video without lip sync",
  );

  // For now, just copy the video
  await ffmpegProvider.convertFormat({
    input: videoPath,
    output: outputPath,
  });

  return outputPath;
}
127
+
128
+ export default definition;
@@ -0,0 +1,200 @@
1
+ /**
2
+ * Transcription action
3
+ * Speech-to-text via Groq or Fireworks
4
+ */
5
+
6
+ import { writeFileSync } from "node:fs";
7
+ import { toFile } from "groq-sdk/uploads";
8
+ import { z } from "zod";
9
+ import {
10
+ filePathSchema,
11
+ transcriptionProviderSchema,
12
+ } from "../../core/schema/shared";
13
+ import type { ActionDefinition, ZodSchema } from "../../core/schema/types";
14
+ import {
15
+ convertFireworksToSRT,
16
+ fireworksProvider,
17
+ } from "../../providers/fireworks";
18
+ import { GROQ_MODELS, groqProvider } from "../../providers/groq";
19
+
20
/**
 * Zod schema describing the inputs accepted by the `transcribe` action.
 *
 * `provider` falls back to "groq" when omitted.
 */
const transcribeInputSchema = z.object({
  // Media file whose speech is transcribed.
  audio: filePathSchema.describe("Audio/video file to transcribe"),

  provider: transcriptionProviderSchema.default("groq").describe("Transcription provider"),

  // Where to save the transcript, if anywhere.
  output: filePathSchema.optional().describe("Output file path"),
});
28
+
29
/**
 * Zod schema for the `transcribe` action output: a success flag plus an
 * optional plain-text transcript, SRT transcript and error message.
 */
const transcribeOutputSchema = z.object({
  success: z.boolean(),
  text: z.string().optional(),
  srt: z.string().optional(),
  error: z.string().optional(),
});
36
+
37
/** Input/output schema pair handed to the `transcribe` action definition. */
const schema: ZodSchema<typeof transcribeInputSchema, typeof transcribeOutputSchema> = {
  output: transcribeOutputSchema,
  input: transcribeInputSchema,
};
45
+
46
/**
 * Action definition for `transcribe`.
 *
 * `execute` maps the schema's `audio` and `output` fields onto the
 * `audioUrl` and `outputPath` options of `transcribe`.
 */
export const definition: ActionDefinition<typeof schema> = {
  type: "action",
  name: "transcribe",
  description: "Speech to text transcription",
  schema,
  routes: [],
  execute: async ({ audio, provider, output }) =>
    transcribe({ audioUrl: audio, provider, outputPath: output }),
};
57
+
58
// Types

/** Options accepted by `transcribe`. */
export interface TranscribeOptions {
  /**
   * Audio source. With the Groq provider an http(s) URL is downloaded and
   * anything else is read as a local file; with Fireworks the value is
   * passed to the provider as `audioPath`.
   */
  audioUrl: string;
  /** Transcription backend; `transcribe` defaults it to "groq". */
  provider?: "groq" | "fireworks";
  /** Model name; only forwarded to the Groq provider. */
  model?: string;
  /** Language hint; only forwarded to the Groq provider. */
  language?: string;
  /**
   * Selects whether the SRT or the plain-text transcript is written to
   * `outputPath`; `transcribe` defaults it to "text".
   */
  outputFormat?: "text" | "srt";
  /** When set, the transcript is written to this path on success. */
  outputPath?: string;
}
67
+
68
/** Value returned by `transcribe` and the per-provider helpers. */
export interface TranscribeResult {
  /** False when the provider call failed; `error` then holds the message. */
  success: boolean;
  /** Plain-text transcript. */
  text?: string;
  /** Transcript in the `srt` field (plain text when produced by Groq). */
  srt?: string;
  /** Failure message, set when `success` is false. */
  error?: string;
}
74
+
75
/**
 * Transcribe audio with Groq Whisper.
 *
 * An http(s) `audioUrl` is downloaded; anything else is read as a local
 * file through `Bun.file`. Failures are not thrown: they are logged and
 * reported as `{ success: false, error }`.
 *
 * @param audioUrl - Remote URL or local path of the audio.
 * @param options - Optional model (defaults to `GROQ_MODELS.WHISPER_LARGE`),
 *   language and requested output format.
 * @returns The transcript as `text`. When "srt" is requested the same plain
 *   text is also returned as `srt`, because Groq yields plain text only.
 */
async function transcribeWithGroq(
  audioUrl: string,
  options: {
    model?: string;
    language?: string;
    outputFormat?: "text" | "srt";
  },
): Promise<TranscribeResult> {
  try {
    console.log("[transcribe] using groq whisper...");

    // Load audio file
    let audioBuffer: ArrayBuffer;
    let fileName = "audio.mp3";

    if (audioUrl.startsWith("http://") || audioUrl.startsWith("https://")) {
      const audioResponse = await fetch(audioUrl);
      if (!audioResponse.ok) {
        // Otherwise an HTTP error page would be sent to Groq as "audio".
        throw new Error(
          `Failed to download audio (${audioResponse.status} ${audioResponse.statusText}): ${audioUrl}`,
        );
      }
      audioBuffer = await audioResponse.arrayBuffer();
    } else {
      const file = Bun.file(audioUrl);
      audioBuffer = await file.arrayBuffer();
      fileName = audioUrl.split("/").pop() || "audio.mp3";
    }

    const audioFile = await toFile(audioBuffer, fileName);

    const text = await groqProvider.transcribeAudio({
      file: audioFile,
      model: options.model || GROQ_MODELS.WHISPER_LARGE,
      language: options.language,
    });

    console.log("[transcribe] groq transcription complete");

    if (options.outputFormat === "srt") {
      console.warn(
        "[transcribe] groq returns plain text, use fireworks for SRT format",
      );
      return { success: true, text, srt: text };
    }

    return { success: true, text };
  } catch (error) {
    console.error("[transcribe] groq error:", error);
    return {
      success: false,
      error:
        error instanceof Error ? error.message : "Groq transcription failed",
    };
  }
}
127
+
128
/**
 * Transcribe audio with the Fireworks API (with SRT support).
 *
 * The returned words are converted to SRT via `convertFireworksToSRT`.
 * Failures are not thrown: they are logged and reported as
 * `{ success: false, error }`.
 *
 * @param audioUrl - Passed to the provider as `audioPath`.
 */
async function transcribeWithFireworks(
  audioUrl: string,
): Promise<TranscribeResult> {
  try {
    console.log("[transcribe] using fireworks api...");

    const response = await fireworksProvider.transcribe({ audioPath: audioUrl });
    const srt = convertFireworksToSRT(response.words || []);

    console.log("[transcribe] fireworks transcription complete");

    return { success: true, srt, text: response.text };
  } catch (error) {
    console.error("[transcribe] fireworks error:", error);

    const message =
      error instanceof Error
        ? error.message
        : "Fireworks transcription failed";
    return { success: false, error: message };
  }
}
154
+
155
/**
 * Main transcription entry point: dispatches to the selected provider and
 * optionally writes the transcript to disk.
 *
 * @param options - See `TranscribeOptions`; `provider` defaults to "groq"
 *   and `outputFormat` to "text".
 * @returns The provider result. Provider failures come back as
 *   `success: false` rather than being thrown.
 * @throws Error when `audioUrl` is empty or the provider is not recognised.
 */
export async function transcribe(
  options: TranscribeOptions,
): Promise<TranscribeResult> {
  const {
    audioUrl,
    provider = "groq",
    model,
    language,
    outputFormat = "text",
    outputPath,
  } = options;

  if (!audioUrl) {
    throw new Error("audioUrl is required");
  }

  console.log(`[transcribe] transcribing ${audioUrl} with ${provider}...`);

  let result: TranscribeResult;
  switch (provider) {
    case "groq":
      result = await transcribeWithGroq(audioUrl, {
        model,
        language,
        outputFormat,
      });
      break;
    case "fireworks":
      result = await transcribeWithFireworks(audioUrl);
      break;
    default:
      throw new Error(`Unknown provider: ${provider}`);
  }

  // Save to file if requested; nothing is written for a failed or empty
  // transcript.
  if (outputPath && result.success) {
    const content = outputFormat === "srt" ? result.srt : result.text;
    if (content) {
      writeFileSync(outputPath, content);
      console.log(`[transcribe] saved to ${outputPath}`);
    }
  }

  return result;
}
199
+
200
+ export default definition;