npm - varg.ai-sdk - Versions diffs - 0.1.0 - Mend

varg.ai-sdk 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48) hide show

package/.claude/settings.local.json +7 -0
package/.env.example +24 -0
package/CLAUDE.md +118 -0
package/README.md +231 -0
package/SKILLS.md +157 -0
package/STRUCTURE.md +92 -0
package/TEST_RESULTS.md +122 -0
package/action/captions/SKILL.md +170 -0
package/action/captions/index.ts +227 -0
package/action/edit/SKILL.md +235 -0
package/action/edit/index.ts +493 -0
package/action/image/SKILL.md +140 -0
package/action/image/index.ts +112 -0
package/action/sync/SKILL.md +136 -0
package/action/sync/index.ts +187 -0
package/action/transcribe/SKILL.md +179 -0
package/action/transcribe/index.ts +227 -0
package/action/video/SKILL.md +116 -0
package/action/video/index.ts +135 -0
package/action/voice/SKILL.md +125 -0
package/action/voice/index.ts +201 -0
package/biome.json +33 -0
package/index.ts +38 -0
package/lib/README.md +144 -0
package/lib/ai-sdk/fal.ts +106 -0
package/lib/ai-sdk/replicate.ts +107 -0
package/lib/elevenlabs.ts +382 -0
package/lib/fal.ts +478 -0
package/lib/ffmpeg.ts +467 -0
package/lib/fireworks.ts +235 -0
package/lib/groq.ts +246 -0
package/lib/higgsfield.ts +176 -0
package/lib/remotion/SKILL.md +823 -0
package/lib/remotion/cli.ts +115 -0
package/lib/remotion/functions.ts +283 -0
package/lib/remotion/index.ts +19 -0
package/lib/remotion/templates.ts +73 -0
package/lib/replicate.ts +304 -0
package/output.txt +1 -0
package/package.json +35 -0
package/pipeline/cookbooks/SKILL.md +285 -0
package/pipeline/cookbooks/remotion-video.md +585 -0
package/pipeline/cookbooks/round-video-character.md +337 -0
package/pipeline/cookbooks/talking-character.md +59 -0
package/test-import.ts +7 -0
package/test-services.ts +97 -0
package/tsconfig.json +29 -0
package/utilities/s3.ts +147 -0

package/action/transcribe/SKILL.md ADDED Viewed

@@ -0,0 +1,179 @@
+---
+name: audio-transcription
+description: transcribe audio to text or subtitles using groq whisper or fireworks with srt/vtt support. use when converting speech to text, generating subtitles, or need word-level timestamps for captions.
+allowed-tools: Read, Bash
+---
+# audio transcription
+convert audio to text or subtitle files using ai transcription.
+## providers
+### groq (ultra-fast)
+- uses whisper-large-v3
+- fastest transcription (~5-10 seconds)
+- plain text output
+- sentence-level timing
+- best for: quick transcripts, text extraction
+### fireworks (word-level)
+- uses whisper-v3
+- word-level timestamps
+- outputs srt or vtt format
+- precise subtitle timing
+- best for: captions, subtitles, timed transcripts
+## usage
+### basic transcription
+```bash
+bun run service/transcribe.ts <audioUrl> [provider] [outputPath]
+```
+**example:**
+```bash
+bun run service/transcribe.ts media/audio.mp3 groq
+bun run service/transcribe.ts media/audio.mp3 fireworks output.srt
+```
+### with output format
+```bash
+bun run lib/fireworks.ts <audioPath> <outputPath>
+```
+**example:**
+```bash
+bun run lib/fireworks.ts media/audio.mp3 output.srt
+```
+## as library
+```typescript
+import { transcribe } from "./service/transcribe"
+// groq transcription
+const groqResult = await transcribe({
+  audioUrl: "media/audio.mp3",
+  provider: "groq",
+  outputFormat: "text"
+})
+console.log(groqResult.text)
+// fireworks with srt
+const fireworksResult = await transcribe({
+  audioUrl: "media/audio.mp3",
+  provider: "fireworks",
+  outputFormat: "srt",
+  outputPath: "subtitles.srt"
+})
+console.log(fireworksResult.text)
+console.log(fireworksResult.srt) // srt content (also saved to subtitles.srt)
+```
+## output formats
+### text (groq default)
+```
+This is the transcribed text from the audio file.
+All words in plain text format.
+```
+### srt (subtitle format)
+```
+1
+00:00:00,000 --> 00:00:02,500
+This is the first subtitle
+2
+00:00:02,500 --> 00:00:05,000
+This is the second subtitle
+```
+### vtt (web video text tracks)
+```
+WEBVTT
+00:00:00.000 --> 00:00:02.500
+This is the first subtitle
+00:00:02.500 --> 00:00:05.000
+This is the second subtitle
+```
+## when to use
+use this skill when:
+- converting speech to text
+- generating subtitles for videos
+- creating accessible content
+- need word-level timing for captions
+- extracting dialogue from media
+- preparing transcripts for analysis
+## provider comparison
+| feature | groq | fireworks |
+|---------|------|-----------|
+| speed | ultra-fast (5-10s) | moderate (15-30s) |
+| output | plain text | srt/vtt with timestamps |
+| timing | sentence-level | word-level |
+| use case | quick transcripts | precise subtitles |
+## typical workflows
+### for captions
+1. record or generate audio (voice service)
+2. transcribe with fireworks (this service)
+3. add captions to video (captions service)
+### for transcripts
+1. extract audio from video
+2. transcribe with groq (this service)
+3. use text for analysis or documentation
+## tips
+**provider selection:**
+- use **groq** when you just need the text fast
+- use **fireworks** when you need subtitle files
+- use **fireworks** for captions on social media videos
+**audio quality:**
+- clear audio transcribes more accurately
+- reduce background noise when possible
+- supports mp3, wav, m4a, and most audio formats
+**timing accuracy:**
+- fireworks provides word-level timestamps
+- perfect for lip-sync verification
+- great for precise subtitle placement
+## integration with other services
+perfect companion for:
+- **captions service** - auto-generate video subtitles
+- **voice service** - transcribe generated speech
+- **sync service** - verify audio timing
+## environment variables
+required:
+- `GROQ_API_KEY` - for groq provider
+- `FIREWORKS_API_KEY` - for fireworks provider
+## processing time
+- **groq**: 5-10 seconds (any audio length)
+- **fireworks**: 15-30 seconds (depending on audio length)
+## supported formats
+input audio:
+- mp3, wav, m4a, ogg, flac
+- video files (extracts audio automatically)
+output formats:
+- text (plain text)
+- srt (subtitles)
+- vtt (web video text tracks)

package/action/transcribe/index.ts ADDED Viewed

@@ -0,0 +1,227 @@
+#!/usr/bin/env bun
+/**
+ * audio transcription service
+ * supports groq whisper, fireworks api, and future providers
+ */
+import { writeFileSync } from "node:fs";
+import { join } from "node:path";
+import { toFile } from "groq-sdk/uploads";
+import {
+  convertFireworksToSRT,
+  transcribeWithFireworks as fireworksTranscribe,
+} from "../../lib/fireworks";
+import { GROQ_MODELS, transcribeAudio as groqTranscribe } from "../../lib/groq";
+// types
+/**
+ * Options accepted by `transcribe`.
+ */
+export interface TranscribeOptions {
+  /** Audio source: an http(s) url or a local file path. */
+  audioUrl: string; // url or local file path
+  /** Transcription backend; `transcribe` falls back to "groq" when omitted. */
+  provider?: "groq" | "fireworks";
+  /** Model name forwarded to the groq provider only (the fireworks path does not receive it). */
+  model?: string;
+  /** Language hint forwarded to the groq provider only. */
+  language?: string;
+  /** Selects which result field (`srt` or `text`) is written to `outputPath`; defaults to "text". */
+  outputFormat?: "text" | "srt";
+  /** When set and the transcription succeeded, the selected content is written to this file. */
+  outputPath?: string;
+}
+/**
+ * Outcome of a transcription. Provider failures are reported here
+ * (`success: false` plus `error`) rather than thrown.
+ */
+export interface TranscribeResult {
+  /** True when the provider call completed without throwing. */
+  success: boolean;
+  /** Transcript text returned by the provider. */
+  text?: string;
+  /**
+   * Subtitle content. The fireworks path builds it from the returned words;
+   * the groq path only fills it (with the plain text) when "srt" was requested.
+   */
+  srt?: string;
+  /** Failure message; set only when `success` is false. */
+  error?: string;
+}
+// groq transcription
+/**
+ * Transcribe audio with groq whisper.
+ *
+ * Remote sources (http/https) are downloaded with `fetch`; anything else is
+ * read as a local file through `Bun.file`. Every failure is caught and
+ * reported through the returned result instead of being thrown.
+ *
+ * @param audioUrl - http(s) url or local path of the audio to transcribe
+ * @param options - groq model (defaults to `GROQ_MODELS.WHISPER_LARGE`),
+ *   optional language, and the requested output format
+ * @returns `{ success: true, text }` on success (with `srt` set to the same
+ *   plain text when "srt" was requested), or `{ success: false, error }`
+ */
+async function transcribeWithGroq(
+  audioUrl: string,
+  options: {
+    model?: string;
+    language?: string;
+    outputFormat?: "text" | "srt";
+  },
+): Promise<TranscribeResult> {
+  try {
+    console.log("[transcribe] using groq whisper...");
+    // load audio file (local or remote)
+    let audioBuffer: ArrayBuffer;
+    let fileName = "audio.mp3";
+    if (audioUrl.startsWith("http://") || audioUrl.startsWith("https://")) {
+      // fetch remote file
+      const audioResponse = await fetch(audioUrl);
+      // fetch resolves on http error statuses too; without this check the
+      // body of a 404/500 page would be uploaded as if it were audio
+      if (!audioResponse.ok) {
+        throw new Error(
+          `failed to fetch audio (${audioResponse.status} ${audioResponse.statusText}): ${audioUrl}`,
+        );
+      }
+      audioBuffer = await audioResponse.arrayBuffer();
+    } else {
+      // read local file with bun
+      const file = Bun.file(audioUrl);
+      audioBuffer = await file.arrayBuffer();
+      fileName = audioUrl.split("/").pop() || "audio.mp3";
+    }
+    const audioFile = await toFile(audioBuffer, fileName);
+    // transcribe with groq
+    const text = await groqTranscribe({
+      file: audioFile,
+      model: options.model || GROQ_MODELS.WHISPER_LARGE,
+      language: options.language,
+    });
+    console.log("[transcribe] groq transcription complete");
+    if (options.outputFormat === "srt") {
+      // groq returns plain text, so we need to convert to srt
+      // for now just return text with warning
+      console.warn(
+        "[transcribe] groq returns plain text, use fireworks for srt format",
+      );
+      return { success: true, text, srt: text };
+    }
+    return { success: true, text };
+  } catch (error) {
+    console.error("[transcribe] groq error:", error);
+    return {
+      success: false,
+      error:
+        error instanceof Error ? error.message : "groq transcription failed",
+    };
+  }
+}
+// fireworks transcription (with srt support)
+/**
+ * Transcribe audio through the fireworks helper and build srt content from
+ * the words it returns. Failures are caught and reported in the result.
+ *
+ * @param audioUrl - audio location, handed to the fireworks helper as `audioPath`
+ * @returns `{ success: true, srt, text }` on success, `{ success: false, error }` otherwise
+ */
+async function transcribeWithFireworks(
+  audioUrl: string,
+): Promise<TranscribeResult> {
+  try {
+    console.log("[transcribe] using fireworks api...");
+    const response = await fireworksTranscribe({ audioPath: audioUrl });
+    // a response without words still yields an (empty) srt conversion
+    const srt = convertFireworksToSRT(response.words || []);
+    console.log("[transcribe] fireworks transcription complete");
+    return { success: true, srt, text: response.text };
+  } catch (failure) {
+    console.error("[transcribe] fireworks error:", failure);
+    const message =
+      failure instanceof Error
+        ? failure.message
+        : "fireworks transcription failed";
+    return { success: false, error: message };
+  }
+}
+// main transcription function
+/**
+ * Transcribe audio with the selected provider and optionally save the result.
+ *
+ * @param options - see `TranscribeOptions`; `provider` defaults to "groq" and
+ *   `outputFormat` to "text"
+ * @returns the provider result; provider failures come back as `success: false`
+ * @throws Error when `audioUrl` is missing or `provider` is not a known value
+ */
+export async function transcribe(
+  options: TranscribeOptions,
+): Promise<TranscribeResult> {
+  const {
+    audioUrl,
+    model,
+    language,
+    outputPath,
+    provider = "groq",
+    outputFormat = "text",
+  } = options;
+  if (!audioUrl) {
+    throw new Error("audioUrl is required");
+  }
+  console.log(`[transcribe] transcribing ${audioUrl} with ${provider}...`);
+  // dispatch to the chosen provider
+  let outcome: TranscribeResult;
+  switch (provider) {
+    case "groq":
+      outcome = await transcribeWithGroq(audioUrl, {
+        model,
+        language,
+        outputFormat,
+      });
+      break;
+    case "fireworks":
+      outcome = await transcribeWithFireworks(audioUrl);
+      break;
+    default:
+      throw new Error(`unknown provider: ${provider}`);
+  }
+  // nothing to persist on failure or when no path was requested
+  if (!outcome.success || !outputPath) {
+    return outcome;
+  }
+  const payload = outputFormat === "srt" ? outcome.srt : outcome.text;
+  if (payload) {
+    writeFileSync(outputPath, payload);
+    console.log(`[transcribe] saved to ${outputPath}`);
+  }
+  return outcome;
+}
+// cli
+/**
+ * Command-line entry point.
+ *
+ * Reads `<audioPath> [provider] [outputPath]` from argv, prints the help text
+ * (exit code 0) when no argument or `help` is given, otherwise runs
+ * `transcribe` and prints the result. Fireworks runs request "srt" output,
+ * groq runs request "text". When no outputPath is given the transcription is
+ * still written to `output.txt` in the current working directory.
+ * Exits with code 1 on any failure.
+ */
+async function cli() {
+  const args = process.argv.slice(2);
+  const audioUrl = args[0];
+  if (!audioUrl || audioUrl === "help") {
+    console.log(`
+usage:
+  bun run service/transcribe.ts <audioPath> [provider] [outputPath]
+arguments:
+  audioPath      - url or local path to audio file
+  provider       - groq (default) | fireworks
+  outputPath     - optional path to save transcription (default: ./output.txt)
+examples:
+  bun run service/transcribe.ts https://example.com/audio.mp3
+  bun run service/transcribe.ts media/dora.ogg groq
+  bun run service/transcribe.ts https://example.com/audio.mp3 fireworks output.srt
+  bun run service/transcribe.ts media/audio.mp3 groq output.txt
+providers:
+  groq        - ultra-fast whisper (text only, free tier available)
+  fireworks   - slower but includes srt timestamps (uses reels-srt api)
+environment:
+  GROQ_API_KEY - your groq api key (for groq provider)
+  FIREWORKS_API_KEY - your fireworks api key (for fireworks provider)
+    `);
+    process.exit(0);
+  }
+  try {
+    const provider = args[1] || "groq";
+    // validate at runtime instead of casting, so the type below is truthful
+    if (provider !== "groq" && provider !== "fireworks") {
+      throw new Error(`unknown provider: ${provider}`);
+    }
+    const outputPath = args[2];
+    const result = await transcribe({
+      audioUrl,
+      provider,
+      outputFormat: provider === "fireworks" ? "srt" : "text",
+      outputPath: outputPath || join(process.cwd(), "output.txt"),
+    });
+    if (result.success) {
+      console.log("\ntranscription:");
+      console.log(result.srt || result.text);
+    } else {
+      console.error(`\nerror: ${result.error}`);
+      process.exit(1);
+    }
+  } catch (error) {
+    console.error("[transcribe] error:", error);
+    process.exit(1);
+  }
+}
+// only run the cli when this file is executed directly, not when imported
+if (import.meta.main) {
+  cli();
+}

package/action/video/SKILL.md ADDED Viewed

@@ -0,0 +1,116 @@
+---
+name: video-generation
+description: generate videos from images or text prompts using fal.ai. use when user wants to animate images, create videos from text, or needs ai video generation with 5-10 second clips.
+allowed-tools: Read, Bash
+---
+# video generation
+generate ai videos from images or text using fal.ai with automatic s3 upload support.
+## capabilities
+- **image-to-video**: animate static images with motion prompts
+- **text-to-video**: generate videos directly from text descriptions
+- supports 5 or 10 second duration
+- automatic s3 upload
+## usage
+### generate from image
+```bash
+bun run service/video.ts from_image <prompt> <imageUrl> [duration] [upload]
+```
+**parameters:**
+- `prompt` (required): motion description (e.g., "camera pan left")
+- `imageUrl` (required): url of the source image
+- `duration` (optional): 5 or 10 seconds (default: 5)
+- `upload` (optional): "true" to upload to s3
+**example:**
+```bash
+bun run service/video.ts from_image "person talking naturally" https://example.com/headshot.jpg 5 true
+```
+### generate from text
+```bash
+bun run service/video.ts from_text <prompt> [duration] [upload]
+```
+**parameters:**
+- `prompt` (required): video scene description
+- `duration` (optional): 5 or 10 seconds (default: 5)
+- `upload` (optional): "true" to upload to s3
+**example:**
+```bash
+bun run service/video.ts from_text "waves crashing on beach at sunset" 10 true
+```
+## as library
+```typescript
+import { generateVideoFromImage, generateVideoFromText } from "./service/video"
+// animate an image
+const videoResult = await generateVideoFromImage(
+  "camera zoom in slowly",
+  "https://example.com/portrait.jpg",
+  { duration: 5, upload: true }
+)
+console.log(videoResult.videoUrl)
+console.log(videoResult.uploaded) // s3 url if upload=true
+// generate from text
+const textVideo = await generateVideoFromText(
+  "forest path with sunlight filtering through trees",
+  { duration: 10, upload: true }
+)
+```
+## output
+returns `VideoGenerationResult`:
+```typescript
+{
+  videoUrl: string,      // direct video url
+  duration?: number,     // actual video duration
+  uploaded?: string      // s3 url if upload requested
+}
+```
+## when to use
+use this skill when:
+- animating character headshots or portraits
+- creating motion from static images
+- generating video clips from text descriptions
+- preparing videos for lipsync or editing pipeline
+- need short form video content (5-10s)
+## tips
+**for character animation:**
+- use subtle prompts like "person talking naturally" or "slight head movement"
+- keep duration at 5 seconds for character shots
+- combine with lipsync for talking videos
+**for scene generation:**
+- be descriptive about camera movement and scene dynamics
+- 10 seconds works better for landscape/scene videos
+## environment variables
+required:
+- `FAL_API_KEY` - for fal video generation
+optional (for s3 upload):
+- `CLOUDFLARE_R2_API_URL`
+- `CLOUDFLARE_ACCESS_KEY_ID`
+- `CLOUDFLARE_ACCESS_SECRET`
+- `CLOUDFLARE_R2_BUCKET`
+## generation time
+expect 2-3 minutes per video clip

package/action/video/index.ts ADDED Viewed

@@ -0,0 +1,135 @@
+#!/usr/bin/env bun
+/**
+ * video generation service built on fal (image-to-video and text-to-video) with optional s3 upload
+ * usage: bun run service/video.ts <command> <args>
+ */
+import { imageToVideo, textToVideo } from "../../lib/fal";
+import { uploadFromUrl } from "../../utilities/s3";
+/**
+ * Result returned by `generateVideoFromImage` and `generateVideoFromText`.
+ */
+export interface VideoGenerationResult {
+  /** Video url taken from the generation result (`data.video.url`). */
+  videoUrl: string;
+  /** Duration as reported in the generation result (`data.duration`), if present. */
+  duration?: number;
+  /** Value returned by `uploadFromUrl`; set only when the `upload` option was enabled. */
+  uploaded?: string;
+}
+/**
+ * Generate a video from a source image and a prompt.
+ *
+ * @param prompt - prompt passed to the image-to-video call
+ * @param imageUrl - url of the source image
+ * @param options - `duration` (5 or 10) is forwarded to the generator; when
+ *   `upload` is truthy the video is also copied to storage under
+ *   `videos/generated/<timestamp>.mp4`
+ * @returns the video url, the reported duration, and the upload result if any
+ * @throws Error when the generation result contains no video url
+ */
+export async function generateVideoFromImage(
+  prompt: string,
+  imageUrl: string,
+  options: { duration?: 5 | 10; upload?: boolean } = {},
+): Promise<VideoGenerationResult> {
+  console.log("[service/video] generating video from image");
+  const { duration, upload } = options;
+  const generation = await imageToVideo({ prompt, imageUrl, duration });
+  const videoUrl = generation.data?.video?.url;
+  if (!videoUrl) {
+    throw new Error("no video url in result");
+  }
+  const output: VideoGenerationResult = {
+    videoUrl,
+    duration: generation.data?.duration,
+    uploaded: undefined,
+  };
+  if (upload) {
+    // key is derived from the current time in milliseconds
+    const objectKey = `videos/generated/${Date.now()}.mp4`;
+    output.uploaded = await uploadFromUrl(videoUrl, objectKey);
+    console.log(`[service/video] uploaded to ${output.uploaded}`);
+  }
+  return output;
+}
+/**
+ * Generate a video from a text prompt alone.
+ *
+ * @param prompt - prompt passed to the text-to-video call
+ * @param options - `duration` (5 or 10) is forwarded to the generator; when
+ *   `upload` is truthy the video is also copied to storage under
+ *   `videos/generated/<timestamp>.mp4`
+ * @returns the video url, the reported duration, and the upload result if any
+ * @throws Error when the generation result contains no video url
+ */
+export async function generateVideoFromText(
+  prompt: string,
+  options: { duration?: 5 | 10; upload?: boolean } = {},
+): Promise<VideoGenerationResult> {
+  console.log("[service/video] generating video from text");
+  const generation = await textToVideo({ prompt, duration: options.duration });
+  const videoUrl = generation.data?.video?.url;
+  if (!videoUrl) {
+    throw new Error("no video url in result");
+  }
+  // stays undefined unless an upload was requested
+  let storedAt: string | undefined;
+  if (options.upload) {
+    storedAt = await uploadFromUrl(
+      videoUrl,
+      `videos/generated/${Date.now()}.mp4`,
+    );
+    console.log(`[service/video] uploaded to ${storedAt}`);
+  }
+  return {
+    videoUrl,
+    duration: generation.data?.duration,
+    uploaded: storedAt,
+  };
+}
+// cli runner: dispatches `from_image` / `from_text`, prints usage and exits 1 otherwise
+if (import.meta.main) {
+  const [command, ...rest] = process.argv.slice(2);
+  // validates the optional duration argument; anything other than "5" or "10" aborts
+  const readDuration = (raw: string | undefined): 5 | 10 => {
+    if (raw && raw !== "5" && raw !== "10") {
+      console.error("duration must be 5 or 10");
+      process.exit(1);
+    }
+    return raw === "10" ? 10 : 5;
+  };
+  if (command === "from_image") {
+    const [prompt, imageUrl, durationArg, uploadArg] = rest;
+    if (!prompt || !imageUrl) {
+      console.log(`
+usage:
+  bun run service/video.ts from_image <prompt> <imageUrl> [duration] [upload]
+        `);
+      process.exit(1);
+    }
+    const imgResult = await generateVideoFromImage(prompt, imageUrl, {
+      duration: readDuration(durationArg),
+      upload: uploadArg === "true",
+    });
+    console.log(JSON.stringify(imgResult, null, 2));
+  } else if (command === "from_text") {
+    const [prompt, durationArg, uploadArg] = rest;
+    if (!prompt) {
+      console.log(`
+usage:
+  bun run service/video.ts from_text <prompt> [duration] [upload]
+        `);
+      process.exit(1);
+    }
+    const txtResult = await generateVideoFromText(prompt, {
+      duration: readDuration(durationArg),
+      upload: uploadArg === "true",
+    });
+    console.log(JSON.stringify(txtResult, null, 2));
+  } else {
+    console.log(`
+usage:
+  bun run service/video.ts from_image <prompt> <imageUrl> [duration] [upload]
+  bun run service/video.ts from_text <prompt> [duration] [upload]
+      `);
+    process.exit(1);
+  }
+}