npm - @twick/cloud-transcript - Versions diffs - 0.15.14 → 0.15.16 - Mend

@twick/cloud-transcript 0.15.14 → 0.15.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (8) hide show

package/README.md CHANGED Viewed

@@ -2,7 +2,7 @@
 **Transcribe audio/video to JSON captions using Google GenAI (Vertex AI) with Gemini models.**
-Extract text from audio content with precise millisecond timestamps. Perfect for generating subtitle data from audio files or video URLs.
+Extract text from audio content with precise millisecond timestamps. Perfect for generating caption data from audio files or video URLs.
 ## What Problem Does This Solve?

package/core/audio.utils.js ADDED Viewed

@@ -0,0 +1,161 @@
+import fs from "fs";
+import { join } from "path";
+import { mkdtemp, readFile, rm } from "fs/promises";
+import { tmpdir } from "os";
+import { execFile } from "child_process";
+import { promisify } from "util";
+import { Readable, pipeline } from "stream";
+// These packages ship prebuilt ffmpeg/ffprobe binaries; each default export
+// exposes the location of its binary through the `.path` property.
+import ffmpeg from "@ffmpeg-installer/ffmpeg";
+import ffprobe from "@ffprobe-installer/ffprobe";
+const execFileAsync = promisify(execFile);
+const pipelineAsync = promisify(pipeline);
+const ffmpegPath = ffmpeg.path;
+const ffprobePath = ffprobe.path;
+/**
+ * Audio encoding configuration for different formats.
+ * Currently supports FLAC format optimized for Google Speech-to-Text API.
+ * @type {Object<string, Object>}
+ */
+export const AUDIO_CONFIG = {
+  "FLAC": {
+    "codec": "flac",
+    "encoding": "FLAC",
+    "sampleRate": 16000,
+    "channelCount": 1,
+    "extension": "flac",
+    "contentType": "audio/flac",
+  },
+}
+/**
+ * Extracts audio from a video URL and converts it to a format suitable for transcription.
+ * Downloads the video, extracts audio using ffmpeg, and returns the audio buffer and duration.
+ *
+ * @param {string} videoUrl - Publicly accessible HTTP(S) URL to the video file
+ * @param {string} [format="FLAC"] - Audio output format (currently only "FLAC" supported)
+ * @returns {Promise<Object>} Object containing audioBuffer (Buffer) and duration (number in seconds)
+ * @throws {Error} If video download, extraction, or processing fails
+ */
+export const extractAudioBufferFromVideo = async (videoUrl, format = "FLAC") => {
+    const videoResponse = await fetch(videoUrl);
+    if (!videoResponse.ok) {
+      throw new Error(`Failed to download video: ${videoResponse.status} ${videoResponse.statusText}`);
+    }
+    const tmpBase = await mkdtemp(join(tmpdir(), 'mcp-'));
+    const inputPath = join(tmpBase, 'input_video');
+    // Change extension to .flac
+    const outputPath = join(tmpBase, `output_audio.${format}`);
+    if (!videoResponse.body) {
+      await rm(tmpBase, { recursive: true, force: true });
+      throw new Error("Video response has no body");
+    }
+    const videoStream = Readable.fromWeb(videoResponse.body);
+    const fileWriteStream = fs.createWriteStream(inputPath);
+    await pipelineAsync(videoStream, fileWriteStream);
+    let duration = 0;
+    try {
+      const { stdout } = await execFileAsync(ffprobePath, [
+        '-v', 'error',
+        '-show_entries', 'format=duration',
+        '-of', 'default=noprint_wrappers=1:nokey=1',
+        inputPath
+      ]);
+      duration = parseFloat(stdout.toString().trim()) || 0;
+    } catch (err) {
+      console.warn('Failed to get duration using ffprobe');
+    }
+    try {
+      await execFileAsync(ffmpegPath, [
+        '-y',
+        '-i', inputPath,
+        '-vn',             // Strip video
+        '-ac', '1',         // Mono channel (Required for STT)
+        '-ar', AUDIO_CONFIG[format].sampleRate,     // 16kHz is ideal for Chirp
+        '-c:a', AUDIO_CONFIG[format].codec,     // Use FLAC codec
+        outputPath
+      ]);
+    } catch (err) {
+      await rm(tmpBase, { recursive: true, force: true });
+      const stderr = err?.stderr?.toString?.().trim?.() || "";
+      throw new Error(`ffmpeg extraction failed: ${stderr}`);
+    }
+    // Use the promise-based readFile for consistency
+    const audioBuffer = await readFile(outputPath);
+    await rm(tmpBase, { recursive: true, force: true });
+    return { audioBuffer, duration };
+};
+/**
+ * Downloads audio from a URL, converts it to the specified format, and returns the buffer.
+ * Uses ffmpeg to transcode the audio (e.g. to FLAC for Speech-to-Text).
+ *
+ * @param {string} audioUrl - Publicly accessible HTTP(S) URL to the audio file
+ * @param {string} [format="FLAC"] - Audio output format (must be key in AUDIO_CONFIG)
+ * @returns {Promise<Object>} Object containing audioBuffer (Buffer) and duration (number in seconds)
+ * @throws {Error} If download, conversion, or processing fails
+ */
+export const extractAudioBufferFromAudioUrl = async (audioUrl, format = "FLAC") => {
+  const config = AUDIO_CONFIG[format];
+  if (!config) {
+    throw new Error(`Unsupported audio format: ${format}`);
+  }
+  const audioResponse = await fetch(audioUrl);
+  if (!audioResponse.ok) {
+    throw new Error(`Failed to download audio: ${audioResponse.status} ${audioResponse.statusText}`);
+  }
+  const tmpBase = await mkdtemp(join(tmpdir(), 'audio-'));
+  const inputPath = join(tmpBase, 'input_audio');
+  const outputPath = join(tmpBase, `output_audio.${config.extension}`);
+  if (!audioResponse.body) {
+    await rm(tmpBase, { recursive: true, force: true });
+    throw new Error("Audio response has no body");
+  }
+  const audioStream = Readable.fromWeb(audioResponse.body);
+  const fileWriteStream = fs.createWriteStream(inputPath);
+  await pipelineAsync(audioStream, fileWriteStream);
+  let duration = 0;
+  try {
+    const { stdout } = await execFileAsync(ffprobePath, [
+      '-v', 'error',
+      '-show_entries', 'format=duration',
+      '-of', 'default=noprint_wrappers=1:nokey=1',
+      inputPath
+    ]);
+    duration = parseFloat(stdout.toString().trim()) || 0;
+  } catch (err) {
+    console.warn('Failed to get duration using ffprobe');
+  }
+  try {
+    await execFileAsync(ffmpegPath, [
+      '-y',
+      '-i', inputPath,
+      '-ac', '1',
+      '-ar', config.sampleRate,
+      '-c:a', config.codec,
+      outputPath
+    ]);
+  } catch (err) {
+    await rm(tmpBase, { recursive: true, force: true });
+    const stderr = err?.stderr?.toString?.().trim?.() || "";
+    throw new Error(`ffmpeg conversion failed: ${stderr}`);
+  }
+  const audioBuffer = await readFile(outputPath);
+  await rm(tmpBase, { recursive: true, force: true });
+  return { audioBuffer, duration };
+};

package/core/gc.utils.js ADDED Viewed

@@ -0,0 +1,177 @@
+import { Storage } from "@google-cloud/storage";
+import {
+  SecretsManagerClient,
+  GetSecretValueCommand,
+} from "@aws-sdk/client-secrets-manager";
+import fs from "fs";
+/**
+ * Google Cloud Project ID, read from the GOOGLE_CLOUD_PROJECT environment variable.
+ * @type {string}
+ */
+export const CLOUD_PROJECT_ID = process.env.GOOGLE_CLOUD_PROJECT;
+/**
+ * Google Cloud region for Speech-to-Text API. Currently set to "global".
+ * @type {string}
+ */
+export const CLOUD_REGION = "global";
+/**
+ * AWS region, read from the AWS_REGION environment variable (undefined when unset).
+ * @type {string}
+ */
+export const AWS_REGION = process.env.AWS_REGION;
+/**
+ * Google Cloud Storage bucket name for storing audio files and project exports.
+ * Read from the GOOGLE_CLOUD_STORAGE_BUCKET environment variable.
+ * @type {string}
+ */
+export const CLOUD_STORAGE_BUCKET = process.env.GOOGLE_CLOUD_STORAGE_BUCKET;
+let googleCredentials = null;
+/**
+ * Retrieves Google Cloud service account credentials from AWS Secrets Manager.
+ *
+ * If GCP_SERVICE_ACCOUNT_SECRET_NAME is set, fetches the JSON credentials from AWS Secrets Manager.
+ * If not set, returns undefined (useful when credentials are provided via GOOGLE_APPLICATION_CREDENTIALS).
+ *
+ * @returns {Promise<Object|undefined>} Parsed JSON credentials object or undefined
+ * @throws {Error} If fetching from Secrets Manager fails
+ */
+export const getGoogleCredentials = async () => {
+  if (googleCredentials) {
+    return googleCredentials;
+  }
+  try {
+    const secretName = process.env.GCP_SERVICE_ACCOUNT_SECRET_NAME;
+    if (!secretName) {
+      console.log(
+        "No secret name configured, skipping Google credentials initialization"
+      );
+      return;
+    }
+    const client = new SecretsManagerClient({
+      region: process.env.AWS_REGION || "ap-south-1",
+    });
+    const response = await client.send(
+      new GetSecretValueCommand({
+        SecretId: secretName,
+        VersionStage: "AWSCURRENT", // VersionStage defaults to AWSCURRENT if unspecified
+      })
+    );
+    const parsedCredentials = JSON.parse(response.SecretString);
+    // Validate that the credentials contain required fields
+    if (!parsedCredentials.client_email) {
+      throw new Error(
+        `Invalid Google Cloud credentials: missing 'client_email' field. ` +
+          `The secret must contain a valid service account JSON with 'client_email', ` +
+          `'private_key', and 'type' fields.`
+      );
+    }
+    if (!parsedCredentials.private_key) {
+      throw new Error(
+        `Invalid Google Cloud credentials: missing 'private_key' field.`
+      );
+    }
+    if (parsedCredentials.type !== "service_account") {
+      console.warn(
+        `Warning: credentials type is '${parsedCredentials.type}', expected 'service_account'`
+      );
+    }
+    googleCredentials = parsedCredentials;
+    return googleCredentials;
+  } catch (error) {
+    console.error(
+      `Failed to initialize Google credentials from secret ::`,
+      error
+    );
+    throw error;
+  }
+};
+let storage = null;
+/**
+ * Gets or initializes the Google Cloud Storage client instance.
+ *
+ * @returns {Promise<Storage>} Initialized Storage client
+ */
+const getStorage = async () => {
+  if (!storage) {
+    storage = new Storage({
+      projectId: CLOUD_PROJECT_ID,
+      credentials: await getGoogleCredentials(),
+    });
+  }
+  return storage;
+};
+/**
+ * Uploads a file to Google Cloud Storage.
+ *
+ * @param {Object} params - Upload parameters
+ * @param {Buffer|string} params.data - File data to upload (Buffer or string)
+ * @param {string} [params.folder] - Optional folder path in the bucket
+ * @param {string} params.fileName - Name of the file to create
+ * @param {string} params.contentType - MIME type of the file
+ * @param {boolean} [params.isPublic=false] - If true, returns a signed URL valid for 1 hour
+ * @returns {Promise<string>} Public URL or signed URL (if isPublic=true) to the uploaded file
+ */
+export const uploadFile = async ({
+  data,
+  folder,
+  fileName,
+  contentType,
+  isPublic = false,
+}) => {
+  const bucket = (await getStorage()).bucket(CLOUD_STORAGE_BUCKET);
+  const bucketName = CLOUD_STORAGE_BUCKET;
+  // 2. Define the path including the folder 'content'
+  const destinationPath = `${folder ? `${folder}/` : ""}${fileName}`;
+  const file = bucket.file(destinationPath);
+  // 3. Save the file.
+  await file.save(data, {
+    contentType: contentType,
+    resumable: false,
+  });
+  if (isPublic) {
+    // Generate a signed URL valid for 1 hour instead of making the file public
+    const expires = new Date();
+    expires.setHours(expires.getHours() + 1); // 1 hour from now
+    const [signedUrl] = await file.getSignedUrl({
+      version: "v4",
+      action: "read",
+      expires: expires,
+    });
+    return signedUrl;
+  }
+  return `https://storage.googleapis.com/${bucketName}/${destinationPath}`;
+};
+/**
+ * Converts a Google Cloud Storage URL to a gs:// URI format.
+ *
+ * @param {string} URI - GCS URL (https://storage.googleapis.com/...) or gs:// URI
+ * @returns {string} gs:// URI format
+ * @throws {Error} If the URI format is invalid
+ */
+export const getGCSUri = (URI) => {
+  if (URI.startsWith("https://storage.googleapis.com/")) {
+    const path = URI.replace("https://storage.googleapis.com/", "");
+    return `gs://${path}`;
+  } else if (!URI.startsWith("gs://")) {
+    throw new Error(
+      `Invalid audio URI format. Expected gs://bucket/path or https://storage.googleapis.com/bucket/path, got: ${URI}`
+    );
+  }
+  return URI;
+};

package/core/index.js ADDED Viewed

	@@ -0,0 +1 @@
1	+ export { transcribe } from "./workflow.js";

package/core/transcriber.js CHANGED Viewed

@@ -1,347 +1,231 @@
-import { GoogleGenAI } from "@google/genai";
+import { SpeechClient } from "@google-cloud/speech/build/src/v2/index.js";
 import {
-  SecretsManagerClient,
-  GetSecretValueCommand,
-} from "@aws-sdk/client-secrets-manager";
-import fs from "fs";
-import path, { join } from "path";
-import { mkdtemp, readFile, rm } from "fs/promises";
-import { tmpdir } from "os";
-import { execFile } from "child_process";
-import { promisify } from "util";
-import { Readable, pipeline } from "stream";
-// These packages provide prebuilt ffmpeg/ffprobe binaries. Types are not bundled,
-// so we import them as `any` to keep TypeScript satisfied.
-import ffmpeg from "@ffmpeg-installer/ffmpeg";
-import ffprobe from "@ffprobe-installer/ffprobe";
-const execFileAsync = promisify(execFile);
-const pipelineAsync = promisify(pipeline);
-const ffmpegPath = ffmpeg.path;
-const ffprobePath = ffprobe.path;
+  CLOUD_PROJECT_ID,
+  CLOUD_REGION,
+  getGCSUri,
+  getGoogleCredentials,
+  uploadFile,
+} from "./gc.utils.js";
+import { AUDIO_CONFIG } from "./audio.utils.js";
 /**
- * Read a required environment variable, optionally falling back to a default.
- * Throws if neither value is available, making configuration errors obvious.
- *
- * @param {string} name - Environment variable to read.
- * @param {string | undefined} defaultValue - Optional fallback value.
- * @returns {string} The resolved value.
- * @throws {Error} If no value is found.
+ * Language code mapping for Google Speech-to-Text API.
+ * @type {Object<string, string>}
  */
-const ensureEnv = (name, defaultValue) => {
-  const value = process.env[name] ?? defaultValue;
-  if (!value) {
-    throw new Error(`Missing required environment variable: ${name}`);
-  }
-  return value;
+const LANGUAGE_CODE = {
+  english: "en-US",
 };
 /**
- * Ensure GOOGLE_APPLICATION_CREDENTIALS points to a JSON key file.
- *
- * In AWS Lambda, the raw service-account JSON is expected to live in
- * AWS Secrets Manager. When GCP_SERVICE_ACCOUNT_SECRET_NAME is present, the
- * secret is fetched, written to `/tmp/gcp-sa-key.json`, and the environment
- * variable is updated to point at that file to avoid stale Lambda values.
- *
- * @returns {Promise<void>} Resolves once credentials are ready.
- * @throws {Error} When the secret cannot be read or written.
+ * Speech recognition model to use. "long" model is optimized for longer audio files.
+ * @type {string}
  */
-const ensureGoogleCredentialsFromSecret = async () => {
-  const secretName = process.env.GCP_SERVICE_ACCOUNT_SECRET_NAME;
-  if (!secretName) {
-    console.log(
-      "No secret name configured, skipping Google credentials initialization"
-    );
-    return;
-  }
+const MODEL = "long";
-  try {
-    const client = new SecretsManagerClient({
-      region: process.env.AWS_REGION || "ap-south-1",
-    });
+let speechClient = null;
-    const response = await client.send(
-      new GetSecretValueCommand({
-        SecretId: secretName,
-        VersionStage: "AWSCURRENT", // VersionStage defaults to AWSCURRENT if unspecified
-      })
-    );
-    const secret = response.SecretString;
-    const credPath = path.join("/tmp", "gcp-sa-key.json");
-    fs.writeFileSync(credPath, secret, { encoding: "utf8" });
-    process.env.GOOGLE_APPLICATION_CREDENTIALS = credPath;
-    console.log(
-      `Wrote Google service account credentials to ${credPath} from Secrets Manager`
-    );
-  } catch (error) {
-    console.error(
-      `Failed to initialize Google credentials from secret ::`,
-      error
-    );
-    throw error;
+/**
+ * Gets or initializes the Google Cloud Speech-to-Text client.
+ *
+ * @returns {Promise<SpeechClient>} Initialized SpeechClient instance
+ */
+export const getSpeechClient = async () => {
+  if (!speechClient) {
+    speechClient = new SpeechClient({
+      projectId: CLOUD_PROJECT_ID,
+      region: CLOUD_REGION,
+      credentials: await getGoogleCredentials(),
+    });
   }
+  return speechClient;
 };
 /**
- * Initialize a Google GenAI client configured for Vertex AI.
- * Ensures credentials, project, and location are available before instantiating.
- *
- * @returns {Promise<GoogleGenAI>} Configured GenAI client instance.
- * @throws {Error} When required environment variables are missing.
+ * Recognizer resource path for Google Speech-to-Text API v2.
+ * @type {string}
  */
-const createGenAIClient = async () => {
-  await ensureGoogleCredentialsFromSecret();
-  const project = ensureEnv("GOOGLE_CLOUD_PROJECT");
-  const location = ensureEnv("GOOGLE_CLOUD_LOCATION", "global");
-  const client = new GoogleGenAI({
-    vertexai: true,
-    project: project,
-    location: location,
-  });
-  return client;
-};
+const recognizer = `projects/${CLOUD_PROJECT_ID}/locations/${CLOUD_REGION}/recognizers/_`;
-const extractAudioBufferFromVideo = async (videoUrl) => {
-  const videoResponse = await fetch(videoUrl);
-  if (!videoResponse.ok) {
-    throw new Error(`Failed to download video: ${videoResponse.status} ${videoResponse.statusText}`);
-  }
-  const tmpBase = await mkdtemp(join(tmpdir(), 'mcp-'));
-  const inputPath = join(tmpBase, 'input_video');
-  const outputPath = join(tmpBase, 'output_audio.mp3');
+/**
+ * Processes Speech-to-Text API response and groups words into phrases of 4 words each.
+ *
+ * @param {Object} results - API response results object
+ * @returns {Array<Object>} Array of phrase objects with text, start time, end time, and word timings
+ */
+const processResponse = (results) => {
+  // Extract words from response
+  const words = results?.alternatives?.[0]?.words || [];
-  // Stream the video response directly to disk to avoid holding the full video in memory
-  if (!videoResponse.body) {
-    await rm(tmpBase, { recursive: true, force: true });
-    throw new Error("Video response has no body");
+  if (words.length === 0) {
+    return [];
   }
-  const videoStream = Readable.fromWeb(videoResponse.body);
-  const fileWriteStream = fs.createWriteStream(inputPath);
-  await pipelineAsync(videoStream, fileWriteStream);
-  // Get duration using bundled ffprobe
-  let duration = 0;
-  try {
-    const { stdout } = await execFileAsync(ffprobePath, [
-      '-v', 'error',
-      '-show_entries', 'format=duration',
-      '-of', 'default=noprint_wrappers=1:nokey=1',
-      inputPath
-    ]);
-    duration = parseFloat(stdout.toString().trim()) || 0;
-  } catch (err) {
-    console.warn('Failed to get duration using ffprobe, duration will be 0');
-  }
+  // Convert time offsets to milliseconds
+  const convertToMs = (offset) => {
+    if (!offset) return 0;
+    const seconds = Number(offset.seconds || 0);
+    const nanos = Number(offset.nanos || 0);
+    return seconds * 1000 + nanos / 1e6;
+  };
-  try {
-    await execFileAsync(ffmpegPath, [
-      '-y',
-      '-i', inputPath,
-      '-vn',
-      '-acodec', 'libmp3lame',
-      '-q:a', '2',
-      outputPath
-    ]);
-  } catch (err) {
-    await rm(tmpBase, { recursive: true, force: true });
-    const stderr = err?.stderr?.toString?.().trim?.() || "";
-    const msg = stderr || (err instanceof Error ? err.message : String(err));
-    throw new Error(`ffmpeg execution failed: ${msg}`);
+  // Process words into individual word timings
+  const processedWords = words.map((w) => ({
+    word: w.word,
+    startMs: convertToMs(w.startOffset),
+    endMs: convertToMs(w.endOffset),
+  }));
+  // Group words into phrases of 4 words each
+  const phrases = [];
+  for (let i = 0; i < processedWords.length; i += 4) {
+    const group = processedWords.slice(i, i + 4);
+    const text = group.map((w) => w.word).join(" ");
+    const startMs = group[0].startMs;
+    const endMs = group[group.length - 1].endMs;
+    const wordStarts = group.map((w) => w.startMs);
+    phrases.push({
+      t: text,
+      s: Math.round(startMs),
+      e: Math.round(endMs),
+      w: wordStarts.map((ms) => Math.round(ms)),
+    });
   }
-  const audioBuffer = await readFile(outputPath);
-  await rm(tmpBase, { recursive: true, force: true });
-  return { audioBuffer, duration };
+  return phrases;
 };
 /**
- * Build the captioning prompt passed to the Gemini model.
- *
- * @param {number} duration - Audio duration in seconds.
- * @param {string} language - Human-readable target language.
- * @param {string} languageFont - Desired script/font name.
- * @returns {string} Instruction prompt for the model.
+ * Transcribes short audio (typically under 60 seconds) using Google Speech-to-Text API.
+ * Uses synchronous recognize method for faster processing.
+ *
+ * @param {Object} params - Transcription parameters
+ * @param {Buffer} params.audioBuffer - Audio data buffer (FLAC format)
+ * @param {string} [params.language="english"] - Language code (e.g., "english")
+ * @param {string} [params.format="FLAC"] - Audio format (currently only "FLAC" supported)
+ * @returns {Promise<Array<Object>>} Array of phrase objects with text, timings, and word offsets
+ * @throws {Error} If transcription fails
  */
-const buildPrompt = (duration, language, languageFont) => {
-  // Convert duration from seconds to milliseconds for the prompt
-  const durationMs = Math.round(duration * 1000);
-  return `You are a professional subtitle and transcription engine.
-## INPUT
-- Audio duration: ${durationMs} milliseconds
-- Target language: ${language}
-- Subtitle font script: ${languageFont}
-## OBJECTIVE
-Transcribe the audio into clear, readable subtitles.
-If the spoken audio is NOT in ${language}, translate it into ${language} before generating subtitles.
-## SUBTITLE SEGMENTATION RULES
-- Split speech into short, natural phrases.
-- Each subtitle phrase MUST contain a maximum of 4 words.
-- Do NOT split words across phrases.
-- Avoid breaking phrases mid-sentence unless required by timing constraints.
-## TIMING RULES (STRICT — MUST FOLLOW)
-- All timestamps are in **milliseconds**.
-- Each subtitle object MUST include:
-  - 's': start timestamp
-  - 'e': end timestamp
-- Duration of each phrase = 'e - s'
-- Minimum phrase duration: **100 ms**
-- 'e' MUST be greater than 's'
-- 'e' MUST be **less than or equal to ${durationMs}**
-- Subtitles MUST be sequential:
-  - 's' of the next phrase MUST be **greater than or equal to** the previous 'e'
-  - NO overlapping timestamps
-- Prefer aligning timestamps with natural speech pauses.
-## TEXT RULES
-- 't' MUST be written using ${languageFont} characters.
-- No emojis.
-- No punctuation-only subtitles.
-- Normalize casing according to the target language's writing system.
-- Remove filler sounds (e.g., “um”, “uh”) unless semantically important.
-## OUTPUT FORMAT (CRITICAL)
-Return ONLY a valid JSON array.
-- No markdown
-- No code blocks
-- No explanations
-- No additional text
-- Output MUST start with '[' and end with ']'
+export async function transcribeShort({
+  audioBuffer,
+  language = "english",
+  format = "FLAC",
+}) {
+  const client = await getSpeechClient();
+  const audioContent = audioBuffer.toString("base64");
+  const request = {
+    recognizer: recognizer,
+    config: {
+      explicitDecodingConfig: {
+        encoding: AUDIO_CONFIG[format].encoding,
+        sampleRateHertz: AUDIO_CONFIG[format].sampleRate,
+        audioChannelCount: 1,
+      },
+      languageCodes: [LANGUAGE_CODE[language]],
+      model: MODEL,
+      features: {
+        enableWordTimeOffsets: true,
+      },
+    },
+    content: audioContent,
+  };
-## OUTPUT SCHEMA
-[
-  {
-    "t": "Subtitle text",
-    "s": 0,
-    "e": 1200
+  try {
+    const [response] = await client.recognize(request);
+    return processResponse(response.results?.[0]);
+  } catch (err) {
+    console.error("Transcription Error:", err.message);
+    throw err;
   }
-]
-`.trim();
-};
+}
 /**
- * Transcribe an audio URL to JSON subtitles using Google GenAI (Vertex AI),
- * mirroring the Python implementation in `playground/vertex/transcript.py`.
- *
- * @param {Object} params
- * @param {string} params.videoUrl - Publicly reachable video URL.
- * @param {string} [params.language="english"] - Target transcription language (human-readable).
- * @param {string} [params.languageFont="english"] - Target font/script for subtitles.
- * @returns {Promise<{ subtitles: Array<{t: string, s: number, e: number}> }>} Subtitles array with text, start time, and end time.
- * @throws {Error} When audioUrl is missing or downstream calls fail.
+ * Transcribes long audio (typically over 60 seconds) using Google Speech-to-Text API.
+ * Uses asynchronous batchRecognize method and requires audio to be uploaded to GCS first.
+ *
+ * @param {Object} params - Transcription parameters
+ * @param {Buffer} [params.audioBuffer] - Audio data buffer (required if audioUrl not provided)
+ * @param {string} [params.audioUrl] - GCS URI (gs://) or HTTPS URL to audio file (required if audioBuffer not provided)
+ * @param {string} [params.language="english"] - Language code (e.g., "english")
+ * @param {string} [params.format="FLAC"] - Audio format (currently only "FLAC" supported)
+ * @returns {Promise<Array<Object>>} Array of phrase objects with text, timings, and word offsets
+ * @throws {Error} If transcription fails
  */
-export const transcribeVideoUrl = async (params) => {
-  const {
-    videoUrl,
-    language = "english",
-    languageFont = "english",
-  } = params || {};
-  if (!videoUrl) {
-    throw new Error("Missing required parameter: videoUrl");
-  }
-  const { audioBuffer, duration } = await extractAudioBufferFromVideo(videoUrl);
-  if (!duration) {
-    throw new Error("Failed to get duration of video");
+export async function transcribeLong({
+  audioBuffer,
+  audioUrl,
+  language = "english",
+  format = "FLAC",
+}) {
+  let gcsUri;
+  if (audioUrl) {
+    gcsUri = getGCSUri(audioUrl);
+  } else {
+    const audioUri = await uploadFile({
+      data: audioBuffer,
+      folder: "audio",
+      fileName: `audio-${Date.now()}.${AUDIO_CONFIG[format].extension}`,
+      contentType: AUDIO_CONFIG[format].contentType,
+    });
+    gcsUri = getGCSUri(audioUri);
   }
-  const prompt = buildPrompt(duration, language, languageFont);
+  console.log("GCS URI:", gcsUri);
+  const client = await getSpeechClient();
-  const client = await createGenAIClient();
-  const modelName = process.env.GOOGLE_VERTEX_MODEL || "gemini-2.5-flash-lite";
-  const generationConfig = {
-    maxOutputTokens: 65535,
-    temperature: 1,
-    topP: 0.95,
-    thinkingConfig: {
-      thinkingBudget: 0,
-    },
-    safetySettings: [
-      {
-        category: "HARM_CATEGORY_HATE_SPEECH",
-        threshold: "OFF",
-      },
-      {
-        category: "HARM_CATEGORY_DANGEROUS_CONTENT",
-        threshold: "OFF",
-      },
-      {
-        category: "HARM_CATEGORY_SEXUALLY_EXPLICIT",
-        threshold: "OFF",
+  const request = {
+    recognizer: recognizer,
+    config: {
+      explicitDecodingConfig: {
+        encoding: AUDIO_CONFIG[format].encoding,
+        sampleRateHertz: AUDIO_CONFIG[format].sampleRate,
+        audioChannelCount: 1,
       },
-      {
-        category: "HARM_CATEGORY_HARASSMENT",
-        threshold: "OFF",
+      languageCodes: [LANGUAGE_CODE[language]],
+      model: MODEL,
+      features: {
+        enableWordTimeOffsets: true,
       },
-    ],
-  };
-  const req = {
-    model: modelName,
-    contents: [
+    },
+    files: [
       {
-        role: "user",
-        parts: [
-          {
-            inlineData: {
-              data: audioBuffer.toString("base64"),
-              mimeType: "audio/mpeg",
-            },
-          },
-          { text: prompt },
-        ],
+        uri: gcsUri,
       },
     ],
-    config: generationConfig,
+    recognitionOutputConfig: {
+      inlineResponseConfig: {},
+    },
   };
-  const response = await client.models.generateContent(req);
-  let textPart = response.text || "";
-  // Strip markdown code fences if present (```json ... ``` or ``` ... ```)
-  textPart = textPart
-    .replace(/^```json\s*/i, "") // Remove opening ```json
-    .replace(/^```\s*/i, "") // Remove opening ```
-    .replace(/\s*```$/i, "") // Remove closing ```
-    .trim();
+  try {
+    console.log("Waiting for operation to complete...");
+    const [operation] = await client.batchRecognize(request);
+    const [response] = await operation.promise();
+    // Extract results for the audio URI (use the GCS URI as the key)
+    const fileResult = response.results?.[gcsUri];
+    if (!fileResult || !fileResult.transcript) {
+      return [];
+    }
+    // Extract words from all results (batchRecognize can return multiple result segments)
+    const allPhrases = [];
+    const results = fileResult.transcript.results || [];
-  let subtitles = [];
-  try {
-    // Try to find JSON array in the text (in case there's extra text)
-    const jsonMatch = textPart.match(/\[[\s\S]*\]/);
-    const jsonText = jsonMatch ? jsonMatch[0] : textPart;
+    for (const result of results) {
+      const phrases = processResponse(result);
+      console.log("Phrases:", phrases);
+      console.log("Transcription Result:", result);
+      allPhrases.push(...phrases);
+    }
-    subtitles = JSON.parse(jsonText);
-    if (!Array.isArray(subtitles)) {
-      throw new Error("Parsed subtitles are not an array");
+    if (allPhrases.length === 0) {
+      return [];
     }
+    return allPhrases;
   } catch (err) {
-    console.warn(
-      "Failed to parse model output as JSON subtitles, returning raw text",
-      err
-    );
-    console.warn("Raw response text:", textPart.substring(0, 500));
-    subtitles = [];
+    console.error("Transcription Error:", err.message);
+    throw err;
   }
-  return {
-    subtitles,
-    duration,
-    videoUrl
-  };
-};
+}

package/core/workflow.js ADDED Viewed

@@ -0,0 +1,48 @@
+import { extractAudioBufferFromAudioUrl, extractAudioBufferFromVideo } from "./audio.utils.js";
+import { transcribeLong, transcribeShort } from "./transcriber.js";
+/**
+ * Creates a complete caption video project from a video URL.
+ * Downloads video, extracts audio, transcribes it using Google Speech-to-Text,
+ * and builds a Twick project JSON structure.
+ *
+ * @param {Object} params - Project creation parameters
+ * @param {string} params.videoUrl - Publicly accessible HTTP(S) URL to the video file
+ * @param {Object} [params.videoSize] - Video dimensions {width, height} (defaults to 720x1280)
+ * @param {string} [params.language="english"] - Transcription language code
+ * @param {string} [params.languageFont="english"] - Font/script for captions
+ * @returns {Promise<Object>} Twick project JSON structure
+ * @throws {Error} If video processing, transcription, or project building fails
+ */
+export const transcribe = async (params) => {
+  const { videoSize, videoUrl, audioUrl, language, languageFont } = params;
+  const { audioBuffer, duration } = audioUrl
+    ? await extractAudioBufferFromAudioUrl(audioUrl)
+    : await extractAudioBufferFromVideo(videoUrl);
+  let captions = [];
+  if (!duration) {
+    throw new Error("Failed to get duration of video");
+  } else if (!audioBuffer) {
+    throw new Error("Failed to get audio buffer from video");
+  } else if (duration > 6) {
+    captions = await transcribeLong({ audioBuffer, language });
+  } else {
+    captions = await transcribeShort({ audioBuffer, language });
+  }
+  if (!captions.length) {
+    throw new Error("No captions found");
+  }
+  console.log("Transcription successful");
+  return ({
+    captions,
+    duration,
+    audioUrl,
+    videoUrl,
+    videoSize,
+    language,
+    languageFont,
+  });
+};

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@twick/cloud-transcript",
-  "version": "0.15.14",
+  "version": "0.15.16",
   "description": "Twick cloud function for generating JSON captions from audio using Google Cloud Speech-to-Text",
   "type": "module",
   "main": "core/transcriber.js",
@@ -46,10 +46,12 @@
     "node": ">=20.0.0"
   },
   "dependencies": {
-    "@google/genai": "^1.0.0",
     "@aws-sdk/client-secrets-manager": "^3.679.0",
+    "fluent-ffmpeg": "^2.1.2",
     "@ffmpeg-installer/ffmpeg": "^1.1.0",
-    "@ffprobe-installer/ffprobe": "^1.1.0"
+    "@ffprobe-installer/ffprobe": "^1.1.0",
+    "@google-cloud/speech": "^7.2.1",
+    "@google-cloud/storage": "^7.18.0"
   },
   "devDependencies": {
     "typescript": "~5.4.5",

package/platform/aws/handler.js CHANGED Viewed

@@ -1,4 +1,4 @@
-import { transcribeVideoUrl } from '@twick/cloud-transcript';
+import { transcribe } from '../../core/workflow.js';
 const jsonResponse = (statusCode, body) => ({
   statusCode,
@@ -16,18 +16,19 @@ const jsonResponse = (statusCode, body) => ({
  *
  * Expected JSON payload (e.g. via AppSync / Lambda resolver):
  * {
- *   "videoUrl": "https://example.com/audio.mp3", // or "gs://bucket/object"
- *   "languageCode": "en-US", // optional, defaults to "en-US"
- *   "encoding": "MP3",        // optional
- *   "sampleRateHertz": 16000  // optional
+ *   "videoUrl": "https://example.com/video.mp4",   // for video input
+ *   "audioUrl": "https://example.com/audio.mp3",   // OR for audio input
+ *   "videoSize": { "width": 720, "height": 1280 }, // optional
+ *   "language": "english",                         // optional
+ *   "languageFont": "english"                      // optional
  * }
  *
  * Environment variables:
  * - GOOGLE_CLOUD_PROJECT: Explicit Google Cloud project id.
- * - GOOGLE_CLOUD_LOCATION (optional): Location of the Google Cloud project.
+ * - GOOGLE_CLOUD_LOCATION (optional): Location of the Google Cloud project.
  * - GOOGLE_VERTEX_MODEL (optional): Model to use for transcription.
  *
- * Returns: JSON payload containing transcript text, caption segments, and word-level timings.
+ * Returns: JSON payload with captions, duration, and project metadata.
  */
 export const handler = async (event) => {
   console.log('Transcript function invoked');
@@ -51,23 +52,26 @@ export const handler = async (event) => {
       (event?.body ? JSON.parse(event.body) : {}) ||
       {};
-    const { videoUrl, language,languageFont } =
+    const { videoUrl, audioUrl, videoSize, language, languageFont } =
       argumentsPayload;
-    if (!videoUrl) {
+    if (!videoUrl && !audioUrl) {
       return jsonResponse(400, {
-        error: 'Missing required field: videoUrl',
+        error: 'Missing required field: provide either videoUrl or audioUrl',
         expectedFormat: {
-          videoUrl:
-            'Publicly reachable audio URL or "gs://bucket/object" for GCS',
-          language: 'Optional language (e.g., "english", "hindi")',
-          languageFont: 'Optional font/script for captions (e.g., "english")',
+          videoUrl: 'Publicly reachable video URL (e.g. https://...)',
+          audioUrl: 'Publicly reachable audio URL (e.g. https://... or gs://...)',
+          videoSize: 'Optional { width, height }',
+          language: 'Optional (e.g. "english", "hindi")',
+          languageFont: 'Optional font/script (e.g. "english")',
         },
       });
     }
-    const result = await transcribeVideoUrl({
-      videoUrl,
+    const result = await transcribe({
+      videoUrl: videoUrl || undefined,
+      audioUrl: audioUrl || undefined,
+      videoSize,
       language,
       languageFont,
     });