@twick/cloud-transcript 0.15.14 → 0.15.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  **Transcribe audio/video to JSON captions using Google GenAI (Vertex AI) with Gemini models.**
4
4
 
5
- Extract text from audio content with precise millisecond timestamps. Perfect for generating subtitle data from audio files or video URLs.
5
+ Extract text from audio content with precise millisecond timestamps. Perfect for generating caption data from audio files or video URLs.
6
6
 
7
7
  ## What Problem Does This Solve?
8
8
 
@@ -0,0 +1,161 @@
1
+ import fs from "fs";
2
+ import { join } from "path";
3
+ import { mkdtemp, readFile, rm } from "fs/promises";
4
+ import { tmpdir } from "os";
5
+ import { execFile } from "child_process";
6
+ import { promisify } from "util";
7
+ import { Readable, pipeline } from "stream";
8
+
9
+ // These packages provide prebuilt ffmpeg/ffprobe binaries. Types are not bundled,
10
+ // so we import them as `any` to keep TypeScript satisfied.
11
+ import ffmpeg from "@ffmpeg-installer/ffmpeg";
12
+ import ffprobe from "@ffprobe-installer/ffprobe";
13
+
14
+
15
+ const execFileAsync = promisify(execFile);
16
+ const pipelineAsync = promisify(pipeline);
17
+ const ffmpegPath = ffmpeg.path;
18
+ const ffprobePath = ffprobe.path;
19
+
20
/**
 * Per-format audio encoding settings, keyed by format name.
 *
 * Each entry holds the ffmpeg codec name, the encoding label, the target
 * sample rate in Hz, the channel count, the output file extension and the
 * MIME content type. "FLAC" is the only format defined.
 * @type {Object<string, Object>}
 */
export const AUDIO_CONFIG = {
  FLAC: {
    codec: "flac",
    encoding: "FLAC",
    sampleRate: 16000,
    channelCount: 1,
    extension: "flac",
    contentType: "audio/flac",
  },
};
35
+
36
/**
 * Downloads a video and extracts its audio track into the requested format.
 *
 * The response body is streamed into a temporary directory, ffprobe is asked
 * for the container duration (best effort), and ffmpeg transcodes the audio to
 * a mono file using the codec and sample rate from AUDIO_CONFIG. The temporary
 * directory is removed on every exit path, including failures.
 *
 * @param {string} videoUrl - URL passed to fetch() to download the video
 * @param {string} [format="FLAC"] - Key into AUDIO_CONFIG selecting the output encoding
 * @returns {Promise<{audioBuffer: Buffer, duration: number}>} Transcoded audio bytes and the
 *   duration in seconds reported by ffprobe (0 when ffprobe fails or prints no number)
 * @throws {Error} If the format is unknown, the download fails, the response has no body,
 *   or ffmpeg fails
 */
export const extractAudioBufferFromVideo = async (videoUrl, format = "FLAC") => {
  // Reject unknown formats before any network or disk work, matching
  // extractAudioBufferFromAudioUrl, instead of failing later with a TypeError.
  const config = AUDIO_CONFIG[format];
  if (!config) {
    throw new Error(`Unsupported audio format: ${format}`);
  }

  const videoResponse = await fetch(videoUrl);
  if (!videoResponse.ok) {
    throw new Error(`Failed to download video: ${videoResponse.status} ${videoResponse.statusText}`);
  }
  if (!videoResponse.body) {
    throw new Error("Video response has no body");
  }

  const tmpBase = await mkdtemp(join(tmpdir(), 'mcp-'));
  try {
    const inputPath = join(tmpBase, 'input_video');
    // Use the configured extension (e.g. "flac") rather than the raw format key.
    const outputPath = join(tmpBase, `output_audio.${config.extension}`);

    // Stream to disk so the whole video is never held in memory.
    await pipelineAsync(
      Readable.fromWeb(videoResponse.body),
      fs.createWriteStream(inputPath)
    );

    // Duration is best effort: a probe failure leaves it at 0 for the caller to handle.
    let duration = 0;
    try {
      const { stdout } = await execFileAsync(ffprobePath, [
        '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        inputPath
      ]);
      duration = parseFloat(stdout.toString().trim()) || 0;
    } catch (err) {
      console.warn('Failed to get duration using ffprobe', err?.message ?? err);
    }

    try {
      await execFileAsync(ffmpegPath, [
        '-y',
        '-i', inputPath,
        '-vn',                                // Strip video
        '-ac', '1',                           // Mono channel
        '-ar', String(config.sampleRate),     // execFile args are documented as strings
        '-c:a', config.codec,
        outputPath
      ]);
    } catch (err) {
      const stderr = err?.stderr?.toString?.().trim?.() || "";
      // Fall back to the error message so the failure reason is never blank.
      const detail = stderr || (err instanceof Error ? err.message : String(err));
      throw new Error(`ffmpeg extraction failed: ${detail}`, { cause: err });
    }

    const audioBuffer = await readFile(outputPath);
    return { audioBuffer, duration };
  } finally {
    // Single cleanup point: covers streaming, ffmpeg and readFile failures alike.
    await rm(tmpBase, { recursive: true, force: true });
  }
};
97
+
98
/**
 * Downloads an audio file and transcodes it into the requested format.
 *
 * The response body is streamed into a temporary directory, ffprobe is asked
 * for the duration (best effort), and ffmpeg converts the file to mono audio
 * using the codec and sample rate from AUDIO_CONFIG. The temporary directory
 * is removed on every exit path, including failures.
 *
 * @param {string} audioUrl - URL passed to fetch() to download the audio
 * @param {string} [format="FLAC"] - Key into AUDIO_CONFIG selecting the output encoding
 * @returns {Promise<{audioBuffer: Buffer, duration: number}>} Transcoded audio bytes and the
 *   duration in seconds reported by ffprobe (0 when ffprobe fails or prints no number)
 * @throws {Error} If the format is unknown, the download fails, the response has no body,
 *   or ffmpeg fails
 */
export const extractAudioBufferFromAudioUrl = async (audioUrl, format = "FLAC") => {
  const config = AUDIO_CONFIG[format];
  if (!config) {
    throw new Error(`Unsupported audio format: ${format}`);
  }

  const audioResponse = await fetch(audioUrl);
  if (!audioResponse.ok) {
    throw new Error(`Failed to download audio: ${audioResponse.status} ${audioResponse.statusText}`);
  }
  if (!audioResponse.body) {
    throw new Error("Audio response has no body");
  }

  const tmpBase = await mkdtemp(join(tmpdir(), 'audio-'));
  try {
    const inputPath = join(tmpBase, 'input_audio');
    const outputPath = join(tmpBase, `output_audio.${config.extension}`);

    // Stream to disk so the whole download is never held in memory.
    await pipelineAsync(
      Readable.fromWeb(audioResponse.body),
      fs.createWriteStream(inputPath)
    );

    // Duration is best effort: a probe failure leaves it at 0 for the caller to handle.
    let duration = 0;
    try {
      const { stdout } = await execFileAsync(ffprobePath, [
        '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        inputPath
      ]);
      duration = parseFloat(stdout.toString().trim()) || 0;
    } catch (err) {
      console.warn('Failed to get duration using ffprobe', err?.message ?? err);
    }

    try {
      await execFileAsync(ffmpegPath, [
        '-y',
        '-i', inputPath,
        '-ac', '1',                           // Mono channel
        '-ar', String(config.sampleRate),     // execFile args are documented as strings
        '-c:a', config.codec,
        outputPath
      ]);
    } catch (err) {
      const stderr = err?.stderr?.toString?.().trim?.() || "";
      // Fall back to the error message so the failure reason is never blank.
      const detail = stderr || (err instanceof Error ? err.message : String(err));
      throw new Error(`ffmpeg conversion failed: ${detail}`, { cause: err });
    }

    const audioBuffer = await readFile(outputPath);
    return { audioBuffer, duration };
  } finally {
    // Single cleanup point: covers streaming, ffmpeg and readFile failures alike.
    await rm(tmpBase, { recursive: true, force: true });
  }
};
@@ -0,0 +1,177 @@
1
+ import { Storage } from "@google-cloud/storage";
2
+ import {
3
+ SecretsManagerClient,
4
+ GetSecretValueCommand,
5
+ } from "@aws-sdk/client-secrets-manager";
6
+ import fs from "fs";
7
+
8
// Environment-backed values are captured once, when this module is loaded.
const {
  GOOGLE_CLOUD_PROJECT: envProjectId,
  AWS_REGION: envAwsRegion,
  GOOGLE_CLOUD_STORAGE_BUCKET: envStorageBucket,
} = process.env;

/**
 * Google Cloud project ID, taken from the GOOGLE_CLOUD_PROJECT environment variable.
 * Undefined when the variable is not set.
 * @type {string|undefined}
 */
export const CLOUD_PROJECT_ID = envProjectId;

/**
 * Google Cloud region for the Speech-to-Text API. Fixed to "global".
 * @type {string}
 */
export const CLOUD_REGION = "global";

/**
 * AWS region, taken from the AWS_REGION environment variable.
 * Undefined when the variable is not set.
 * @type {string|undefined}
 */
export const AWS_REGION = envAwsRegion;

/**
 * Google Cloud Storage bucket name, taken from the GOOGLE_CLOUD_STORAGE_BUCKET
 * environment variable. Undefined when the variable is not set.
 * @type {string|undefined}
 */
export const CLOUD_STORAGE_BUCKET = envStorageBucket;
27
+
28
// Module-level cache: once credentials load successfully they are reused for
// the lifetime of the process and Secrets Manager is not queried again.
let googleCredentials = null;

/**
 * Retrieves Google Cloud service account credentials from AWS Secrets Manager.
 *
 * When GCP_SERVICE_ACCOUNT_SECRET_NAME is unset, logs a message and resolves to
 * undefined. Otherwise the secret's current version is fetched (from
 * SecretString, or SecretBinary decoded as UTF-8), parsed as JSON and checked
 * for `client_email` and `private_key`. A `type` other than "service_account"
 * only produces a warning.
 *
 * @returns {Promise<Object|undefined>} Parsed JSON credentials object or undefined
 * @throws {Error} If fetching fails, the secret has no payload, the payload is not a
 *   JSON object, or a required field is missing
 */
export const getGoogleCredentials = async () => {
  if (googleCredentials) {
    return googleCredentials;
  }
  try {
    const secretName = process.env.GCP_SERVICE_ACCOUNT_SECRET_NAME;
    if (!secretName) {
      console.log(
        "No secret name configured, skipping Google credentials initialization"
      );
      return;
    }

    const client = new SecretsManagerClient({
      region: process.env.AWS_REGION || "ap-south-1",
    });

    const response = await client.send(
      new GetSecretValueCommand({
        SecretId: secretName,
        VersionStage: "AWSCURRENT", // VersionStage defaults to AWSCURRENT if unspecified
      })
    );

    // Secrets stored as binary arrive in SecretBinary with SecretString unset.
    let secretText = response.SecretString;
    if (secretText == null && response.SecretBinary != null) {
      secretText = Buffer.from(response.SecretBinary).toString("utf8");
    }
    if (!secretText) {
      throw new Error(
        "Invalid Google Cloud credentials: secret has no SecretString or SecretBinary payload."
      );
    }

    let parsedCredentials;
    try {
      parsedCredentials = JSON.parse(secretText);
    } catch {
      // The SyntaxError is deliberately not chained: its message can quote part
      // of the input, and this function's catch block logs the error it receives.
      throw new Error(
        "Invalid Google Cloud credentials: secret payload is not valid JSON."
      );
    }
    if (parsedCredentials === null || typeof parsedCredentials !== "object") {
      throw new Error(
        "Invalid Google Cloud credentials: secret payload must be a JSON object."
      );
    }

    // Validate that the credentials contain required fields
    if (!parsedCredentials.client_email) {
      throw new Error(
        `Invalid Google Cloud credentials: missing 'client_email' field. ` +
          `The secret must contain a valid service account JSON with 'client_email', ` +
          `'private_key', and 'type' fields.`
      );
    }

    if (!parsedCredentials.private_key) {
      throw new Error(
        `Invalid Google Cloud credentials: missing 'private_key' field.`
      );
    }

    if (parsedCredentials.type !== "service_account") {
      console.warn(
        `Warning: credentials type is '${parsedCredentials.type}', expected 'service_account'`
      );
    }

    googleCredentials = parsedCredentials;
    return googleCredentials;
  } catch (error) {
    console.error(
      `Failed to initialize Google credentials from secret ::`,
      error
    );
    throw error;
  }
};
95
+
96
// Cached Storage client; created on the first getStorage() call.
let storage = null;

/**
 * Returns the shared Google Cloud Storage client, creating it on first use
 * with CLOUD_PROJECT_ID and the credentials from getGoogleCredentials().
 *
 * @returns {Promise<Storage>} The cached Storage client
 */
const getStorage = async () => {
  if (storage) {
    return storage;
  }
  const credentials = await getGoogleCredentials();
  storage = new Storage({ projectId: CLOUD_PROJECT_ID, credentials });
  return storage;
};
112
+
113
/**
 * Saves data as an object in the CLOUD_STORAGE_BUCKET bucket and returns a URL for it.
 *
 * @param {Object} params - Upload parameters
 * @param {Buffer|string} params.data - Content passed to file.save()
 * @param {string} [params.folder] - Optional prefix; when set the object is stored at `folder/fileName`
 * @param {string} params.fileName - Object name within the folder
 * @param {string} params.contentType - MIME type recorded on the object
 * @param {boolean} [params.isPublic=false] - When true, a v4 signed read URL expiring one
 *   hour from now is returned instead of the plain storage.googleapis.com URL
 * @returns {Promise<string>} `https://storage.googleapis.com/<bucket>/<path>` or the signed URL
 */
export const uploadFile = async ({
  data,
  folder,
  fileName,
  contentType,
  isPublic = false,
}) => {
  const storageClient = await getStorage();
  const bucket = storageClient.bucket(CLOUD_STORAGE_BUCKET);
  const bucketName = CLOUD_STORAGE_BUCKET;

  // Object path: optional folder prefix followed by the file name.
  const prefix = folder ? `${folder}/` : "";
  const destinationPath = `${prefix}${fileName}`;
  const file = bucket.file(destinationPath);

  // Non-resumable upload.
  await file.save(data, { contentType, resumable: false });

  if (!isPublic) {
    return `https://storage.googleapis.com/${bucketName}/${destinationPath}`;
  }

  // Hand out a time-limited signed URL rather than making the object public.
  const expires = new Date();
  expires.setHours(expires.getHours() + 1);

  const [signedUrl] = await file.getSignedUrl({
    version: "v4",
    action: "read",
    expires,
  });
  return signedUrl;
};
159
+
160
/**
 * Normalizes a Google Cloud Storage location to gs:// form.
 *
 * A URL starting with https://storage.googleapis.com/ has that prefix swapped
 * for gs://; a string already starting with gs:// is returned unchanged.
 *
 * @param {string} URI - GCS URL (https://storage.googleapis.com/...) or gs:// URI
 * @returns {string} gs:// URI
 * @throws {Error} If the string starts with neither prefix
 */
export const getGCSUri = (URI) => {
  const httpsPrefix = "https://storage.googleapis.com/";

  if (URI.startsWith(httpsPrefix)) {
    return `gs://${URI.slice(httpsPrefix.length)}`;
  }
  if (URI.startsWith("gs://")) {
    return URI;
  }
  throw new Error(
    `Invalid audio URI format. Expected gs://bucket/path or https://storage.googleapis.com/bucket/path, got: ${URI}`
  );
};
package/core/index.js ADDED
@@ -0,0 +1 @@
1
+ export { transcribe } from "./workflow.js";
@@ -1,347 +1,231 @@
1
- import { GoogleGenAI } from "@google/genai";
1
+ import { SpeechClient } from "@google-cloud/speech/build/src/v2/index.js";
2
2
  import {
3
- SecretsManagerClient,
4
- GetSecretValueCommand,
5
- } from "@aws-sdk/client-secrets-manager";
6
- import fs from "fs";
7
- import path, { join } from "path";
8
- import { mkdtemp, readFile, rm } from "fs/promises";
9
- import { tmpdir } from "os";
10
- import { execFile } from "child_process";
11
- import { promisify } from "util";
12
- import { Readable, pipeline } from "stream";
13
-
14
- // These packages provide prebuilt ffmpeg/ffprobe binaries. Types are not bundled,
15
- // so we import them as `any` to keep TypeScript satisfied.
16
- import ffmpeg from "@ffmpeg-installer/ffmpeg";
17
- import ffprobe from "@ffprobe-installer/ffprobe";
18
-
19
-
20
- const execFileAsync = promisify(execFile);
21
- const pipelineAsync = promisify(pipeline);
22
- const ffmpegPath = ffmpeg.path;
23
- const ffprobePath = ffprobe.path;
3
+ CLOUD_PROJECT_ID,
4
+ CLOUD_REGION,
5
+ getGCSUri,
6
+ getGoogleCredentials,
7
+ uploadFile,
8
+ } from "./gc.utils.js";
9
+ import { AUDIO_CONFIG } from "./audio.utils.js";
24
10
 
25
11
  /**
26
- * Read a required environment variable, optionally falling back to a default.
27
- * Throws if neither value is available, making configuration errors obvious.
28
- *
29
- * @param {string} name - Environment variable to read.
30
- * @param {string | undefined} defaultValue - Optional fallback value.
31
- * @returns {string} The resolved value.
32
- * @throws {Error} If no value is found.
12
+ * Language code mapping for Google Speech-to-Text API.
13
+ * @type {Object<string, string>}
33
14
  */
34
- const ensureEnv = (name, defaultValue) => {
35
- const value = process.env[name] ?? defaultValue;
36
- if (!value) {
37
- throw new Error(`Missing required environment variable: ${name}`);
38
- }
39
- return value;
15
+ const LANGUAGE_CODE = {
16
+ english: "en-US",
40
17
  };
41
18
 
42
19
  /**
43
- * Ensure GOOGLE_APPLICATION_CREDENTIALS points to a JSON key file.
44
- *
45
- * In AWS Lambda, the raw service-account JSON is expected to live in
46
- * AWS Secrets Manager. When GCP_SERVICE_ACCOUNT_SECRET_NAME is present, the
47
- * secret is fetched, written to `/tmp/gcp-sa-key.json`, and the environment
48
- * variable is updated to point at that file to avoid stale Lambda values.
49
- *
50
- * @returns {Promise<void>} Resolves once credentials are ready.
51
- * @throws {Error} When the secret cannot be read or written.
20
+ * Speech recognition model to use. "long" model is optimized for longer audio files.
21
+ * @type {string}
52
22
  */
53
- const ensureGoogleCredentialsFromSecret = async () => {
54
- const secretName = process.env.GCP_SERVICE_ACCOUNT_SECRET_NAME;
55
- if (!secretName) {
56
- console.log(
57
- "No secret name configured, skipping Google credentials initialization"
58
- );
59
- return;
60
- }
23
+ const MODEL = "long";
61
24
 
62
- try {
63
- const client = new SecretsManagerClient({
64
- region: process.env.AWS_REGION || "ap-south-1",
65
- });
25
+ let speechClient = null;
66
26
 
67
- const response = await client.send(
68
- new GetSecretValueCommand({
69
- SecretId: secretName,
70
- VersionStage: "AWSCURRENT", // VersionStage defaults to AWSCURRENT if unspecified
71
- })
72
- );
73
- const secret = response.SecretString;
74
- const credPath = path.join("/tmp", "gcp-sa-key.json");
75
- fs.writeFileSync(credPath, secret, { encoding: "utf8" });
76
- process.env.GOOGLE_APPLICATION_CREDENTIALS = credPath;
77
- console.log(
78
- `Wrote Google service account credentials to ${credPath} from Secrets Manager`
79
- );
80
- } catch (error) {
81
- console.error(
82
- `Failed to initialize Google credentials from secret ::`,
83
- error
84
- );
85
- throw error;
27
/**
 * Returns the shared Speech-to-Text client, creating it on first use with
 * CLOUD_PROJECT_ID, CLOUD_REGION and the credentials from getGoogleCredentials().
 *
 * @returns {Promise<SpeechClient>} The cached SpeechClient instance
 */
export const getSpeechClient = async () => {
  if (speechClient) {
    return speechClient;
  }
  const credentials = await getGoogleCredentials();
  speechClient = new SpeechClient({
    projectId: CLOUD_PROJECT_ID,
    region: CLOUD_REGION,
    credentials,
  });
  return speechClient;
};
88
42
 
89
43
  /**
90
- * Initialize a Google GenAI client configured for Vertex AI.
91
- * Ensures credentials, project, and location are available before instantiating.
92
- *
93
- * @returns {Promise<GoogleGenAI>} Configured GenAI client instance.
94
- * @throws {Error} When required environment variables are missing.
44
+ * Recognizer resource path for Google Speech-to-Text API v2.
45
+ * @type {string}
95
46
  */
96
- const createGenAIClient = async () => {
97
- await ensureGoogleCredentialsFromSecret();
98
- const project = ensureEnv("GOOGLE_CLOUD_PROJECT");
99
- const location = ensureEnv("GOOGLE_CLOUD_LOCATION", "global");
100
- const client = new GoogleGenAI({
101
- vertexai: true,
102
- project: project,
103
- location: location,
104
- });
105
-
106
- return client;
107
- };
47
+ const recognizer = `projects/${CLOUD_PROJECT_ID}/locations/${CLOUD_REGION}/recognizers/_`;
108
48
 
109
- const extractAudioBufferFromVideo = async (videoUrl) => {
110
- const videoResponse = await fetch(videoUrl);
111
- if (!videoResponse.ok) {
112
- throw new Error(`Failed to download video: ${videoResponse.status} ${videoResponse.statusText}`);
113
- }
114
- const tmpBase = await mkdtemp(join(tmpdir(), 'mcp-'));
115
- const inputPath = join(tmpBase, 'input_video');
116
- const outputPath = join(tmpBase, 'output_audio.mp3');
49
/**
 * Turns one recognition result into caption phrases of up to four words.
 *
 * Words are read from the first alternative. Each phrase carries the joined
 * text (`t`), the rounded start of its first word (`s`), the rounded end of
 * its last word (`e`) and the rounded start of every word (`w`), all in
 * milliseconds. A missing offset counts as 0.
 *
 * @param {Object} results - A single result object with an `alternatives` array
 * @returns {Array<Object>} Phrase objects `{ t, s, e, w }`; empty when there are no words
 */
const processResponse = (results) => {
  const rawWords = results?.alternatives?.[0]?.words || [];
  if (rawWords.length === 0) {
    return [];
  }

  // Offsets arrive as { seconds, nanos }; either part may be absent.
  const toMillis = (offset) =>
    offset
      ? Number(offset.seconds || 0) * 1000 + Number(offset.nanos || 0) / 1e6
      : 0;

  const timedWords = rawWords.map(({ word, startOffset, endOffset }) => ({
    word,
    startMs: toMillis(startOffset),
    endMs: toMillis(endOffset),
  }));

  const phraseCount = Math.ceil(timedWords.length / 4);
  return Array.from({ length: phraseCount }, (_, phraseIndex) => {
    const from = phraseIndex * 4;
    const chunk = timedWords.slice(from, from + 4);
    return {
      t: chunk.map(({ word }) => word).join(" "),
      s: Math.round(chunk[0].startMs),
      e: Math.round(chunk[chunk.length - 1].endMs),
      w: chunk.map(({ startMs }) => Math.round(startMs)),
    };
  });
};
161
96
 
162
-
163
97
  /**
164
- * Build the captioning prompt passed to the Gemini model.
165
- *
166
- * @param {number} duration - Audio duration in seconds.
167
- * @param {string} language - Human-readable target language.
168
- * @param {string} languageFont - Desired script/font name.
169
- * @returns {string} Instruction prompt for the model.
98
+ * Transcribes short audio (typically under 60 seconds) using Google Speech-to-Text API.
99
+ * Uses synchronous recognize method for faster processing.
100
+ *
101
+ * @param {Object} params - Transcription parameters
102
+ * @param {Buffer} params.audioBuffer - Audio data buffer (FLAC format)
103
+ * @param {string} [params.language="english"] - Language code (e.g., "english")
104
+ * @param {string} [params.format="FLAC"] - Audio format (currently only "FLAC" supported)
105
+ * @returns {Promise<Array<Object>>} Array of phrase objects with text, timings, and word offsets
106
+ * @throws {Error} If transcription fails
170
107
  */
171
- const buildPrompt = (duration, language, languageFont) => {
172
- // Convert duration from seconds to milliseconds for the prompt
173
- const durationMs = Math.round(duration * 1000);
174
-
175
- return `You are a professional subtitle and transcription engine.
176
-
177
- ## INPUT
178
- - Audio duration: ${durationMs} milliseconds
179
- - Target language: ${language}
180
- - Subtitle font script: ${languageFont}
181
-
182
- ## OBJECTIVE
183
- Transcribe the audio into clear, readable subtitles.
184
-
185
- If the spoken audio is NOT in ${language}, translate it into ${language} before generating subtitles.
186
-
187
- ## SUBTITLE SEGMENTATION RULES
188
- - Split speech into short, natural phrases.
189
- - Each subtitle phrase MUST contain a maximum of 4 words.
190
- - Do NOT split words across phrases.
191
- - Avoid breaking phrases mid-sentence unless required by timing constraints.
192
-
193
- ## TIMING RULES (STRICT — MUST FOLLOW)
194
- - All timestamps are in **milliseconds**.
195
- - Each subtitle object MUST include:
196
- - 's': start timestamp
197
- - 'e': end timestamp
198
- - Duration of each phrase = 'e - s'
199
- - Minimum phrase duration: **100 ms**
200
- - 'e' MUST be greater than 's'
201
- - 'e' MUST be **less than or equal to ${durationMs}**
202
- - Subtitles MUST be sequential:
203
- - 's' of the next phrase MUST be **greater than or equal to** the previous 'e'
204
- - NO overlapping timestamps
205
- - Prefer aligning timestamps with natural speech pauses.
206
-
207
- ## TEXT RULES
208
- - 't' MUST be written using ${languageFont} characters.
209
- - No emojis.
210
- - No punctuation-only subtitles.
211
- - Normalize casing according to the target language's writing system.
212
- - Remove filler sounds (e.g., “um”, “uh”) unless semantically important.
213
-
214
- ## OUTPUT FORMAT (CRITICAL)
215
- Return ONLY a valid JSON array.
216
- - No markdown
217
- - No code blocks
218
- - No explanations
219
- - No additional text
220
- - Output MUST start with '[' and end with ']'
108
/**
 * Transcribes an in-memory audio buffer with the synchronous `recognize` call.
 *
 * The buffer is sent inline as base64 with an explicit decoding config taken
 * from AUDIO_CONFIG[format]. Every result in the response is converted to
 * phrases via processResponse and the phrases are concatenated in order.
 *
 * @param {Object} params - Transcription parameters
 * @param {Buffer} params.audioBuffer - Audio data encoded in the given format
 * @param {string} [params.language="english"] - Key into LANGUAGE_CODE
 * @param {string} [params.format="FLAC"] - Key into AUDIO_CONFIG
 * @returns {Promise<Array<Object>>} Phrase objects with text, timings, and word offsets
 * @throws {Error} If the format or language is unknown, or the recognize call fails
 */
export async function transcribeShort({
  audioBuffer,
  language = "english",
  format = "FLAC",
}) {
  // Fail with a clear message instead of a TypeError or `languageCodes: [undefined]`.
  const audioConfig = AUDIO_CONFIG[format];
  if (!audioConfig) {
    throw new Error(`Unsupported audio format: ${format}`);
  }
  const languageCode = LANGUAGE_CODE[language];
  if (typeof languageCode !== "string") {
    throw new Error(`Unsupported language: ${language}`);
  }

  const client = await getSpeechClient();

  const request = {
    recognizer: recognizer,
    config: {
      explicitDecodingConfig: {
        encoding: audioConfig.encoding,
        sampleRateHertz: audioConfig.sampleRate,
        audioChannelCount: 1,
      },
      languageCodes: [languageCode],
      model: MODEL,
      features: {
        enableWordTimeOffsets: true,
      },
    },
    content: audioBuffer.toString("base64"),
  };

  try {
    const [response] = await client.recognize(request);
    // The response may hold several sequential results; keep all of them
    // rather than only the first, as transcribeLong does.
    return (response.results ?? []).flatMap((result) => processResponse(result));
  } catch (err) {
    console.error("Transcription Error:", err.message);
    throw err;
  }
}
232
142
 
233
143
  /**
234
- * Transcribe an audio URL to JSON subtitles using Google GenAI (Vertex AI),
235
- * mirroring the Python implementation in `playground/vertex/transcript.py`.
236
- *
237
- * @param {Object} params
238
- * @param {string} params.videoUrl - Publicly reachable video URL.
239
- * @param {string} [params.language="english"] - Target transcription language (human-readable).
240
- * @param {string} [params.languageFont="english"] - Target font/script for subtitles.
241
- * @returns {Promise<{ subtitles: Array<{t: string, s: number, e: number}> }>} Subtitles array with text, start time, and end time.
242
- * @throws {Error} When audioUrl is missing or downstream calls fail.
144
+ * Transcribes long audio (typically over 60 seconds) using Google Speech-to-Text API.
145
+ * Uses asynchronous batchRecognize method and requires audio to be uploaded to GCS first.
146
+ *
147
+ * @param {Object} params - Transcription parameters
148
+ * @param {Buffer} [params.audioBuffer] - Audio data buffer (required if audioUrl not provided)
149
+ * @param {string} [params.audioUrl] - GCS URI (gs://) or HTTPS URL to audio file (required if audioBuffer not provided)
150
+ * @param {string} [params.language="english"] - Language code (e.g., "english")
151
+ * @param {string} [params.format="FLAC"] - Audio format (currently only "FLAC" supported)
152
+ * @returns {Promise<Array<Object>>} Array of phrase objects with text, timings, and word offsets
153
+ * @throws {Error} If transcription fails
243
154
  */
244
- export const transcribeVideoUrl = async (params) => {
245
- const {
246
- videoUrl,
247
- language = "english",
248
- languageFont = "english",
249
- } = params || {};
250
-
251
- if (!videoUrl) {
252
- throw new Error("Missing required parameter: videoUrl");
253
- }
254
-
255
- const { audioBuffer, duration } = await extractAudioBufferFromVideo(videoUrl);
256
- if (!duration) {
257
- throw new Error("Failed to get duration of video");
155
/**
 * Transcribes audio stored in Google Cloud Storage with `batchRecognize`.
 *
 * When `audioUrl` is given it is converted to a gs:// URI; otherwise
 * `audioBuffer` is uploaded to the "audio" folder via uploadFile and the
 * resulting URL is converted. The operation is awaited, the entry keyed by the
 * gs:// URI is read from the response, and every result segment is converted
 * to phrases via processResponse.
 *
 * @param {Object} params - Transcription parameters
 * @param {Buffer} [params.audioBuffer] - Audio data to upload (required if audioUrl is absent)
 * @param {string} [params.audioUrl] - gs:// URI or storage.googleapis.com URL (takes precedence)
 * @param {string} [params.language="english"] - Key into LANGUAGE_CODE
 * @param {string} [params.format="FLAC"] - Key into AUDIO_CONFIG
 * @returns {Promise<Array<Object>>} Phrase objects with text, timings, and word offsets;
 *   empty when the response holds no transcript for the file
 * @throws {Error} If inputs are invalid, the upload or API call fails, or the file
 *   result reports an error
 */
export async function transcribeLong({
  audioBuffer,
  audioUrl,
  language = "english",
  format = "FLAC",
}) {
  // Validate before uploading anything or calling the API.
  const audioConfig = AUDIO_CONFIG[format];
  if (!audioConfig) {
    throw new Error(`Unsupported audio format: ${format}`);
  }
  const languageCode = LANGUAGE_CODE[language];
  if (typeof languageCode !== "string") {
    throw new Error(`Unsupported language: ${language}`);
  }
  if (!audioUrl && !audioBuffer) {
    throw new Error("Either audioBuffer or audioUrl must be provided");
  }

  let gcsUri;
  if (audioUrl) {
    gcsUri = getGCSUri(audioUrl);
  } else {
    const audioUri = await uploadFile({
      data: audioBuffer,
      folder: "audio",
      fileName: `audio-${Date.now()}.${audioConfig.extension}`,
      contentType: audioConfig.contentType,
    });
    gcsUri = getGCSUri(audioUri);
  }

  console.log("GCS URI:", gcsUri);
  const client = await getSpeechClient();

  const request = {
    recognizer: recognizer,
    config: {
      explicitDecodingConfig: {
        encoding: audioConfig.encoding,
        sampleRateHertz: audioConfig.sampleRate,
        audioChannelCount: 1,
      },
      languageCodes: [languageCode],
      model: MODEL,
      features: {
        enableWordTimeOffsets: true,
      },
    },
    files: [
      {
        uri: gcsUri,
      },
    ],
    // Ask for the transcript inline in the operation response.
    recognitionOutputConfig: {
      inlineResponseConfig: {},
    },
  };

  try {
    console.log("Waiting for operation to complete...");
    const [operation] = await client.batchRecognize(request);
    const [response] = await operation.promise();

    // Per-file results are keyed by the gs:// URI that was submitted.
    const fileResult = response.results?.[gcsUri];

    // Surface a per-file failure instead of reporting it as "no captions".
    const fileError = fileResult?.error;
    if (fileError && (fileError.code || fileError.message)) {
      throw new Error(
        `Batch recognition failed for ${gcsUri}: ${fileError.message || `code ${fileError.code}`}`
      );
    }

    if (!fileResult || !fileResult.transcript) {
      return [];
    }

    // A batch transcript can contain several result segments; keep them in order.
    // Full transcripts are intentionally not logged here.
    const segments = fileResult.transcript.results || [];
    return segments.flatMap((segment) => processResponse(segment));
  } catch (err) {
    console.error("Transcription Error:", err.message);
    throw err;
  }
}
@@ -0,0 +1,48 @@
1
+ import { extractAudioBufferFromAudioUrl, extractAudioBufferFromVideo } from "./audio.utils.js";
2
+ import { transcribeLong, transcribeShort } from "./transcriber.js";
3
+
4
/**
 * Transcribes the audio of a remote audio or video file into captions.
 *
 * The audio buffer and its duration are obtained with
 * `extractAudioBufferFromAudioUrl(audioUrl)` when `audioUrl` is supplied,
 * otherwise with `extractAudioBufferFromVideo(videoUrl)`. The buffer is then
 * handed to `transcribeLong` when the duration exceeds the cutoff below, or to
 * `transcribeShort` otherwise.
 *
 * @param {Object} params - Transcription parameters
 * @param {string} [params.videoUrl] - URL of the video to transcribe; used only when `audioUrl` is not given
 * @param {string} [params.audioUrl] - URL of the audio to transcribe; takes precedence over `videoUrl`
 * @param {Object} [params.videoSize] - Not used here; echoed back unchanged in the result
 * @param {string} [params.language] - Forwarded to the transcriber and echoed back in the result
 * @param {string} [params.languageFont] - Not used here; echoed back unchanged in the result
 * @returns {Promise<Object>} `{ captions, duration, audioUrl, videoUrl, videoSize, language, languageFont }`
 * @throws {Error} If neither URL is given, if the duration or audio buffer is
 *   missing, if no captions are produced, or if extraction/transcription rejects
 */
export const transcribe = async (params) => {
  // Tolerate a missing params object so the caller gets the explicit
  // validation error below instead of a destructuring TypeError.
  const { videoSize, videoUrl, audioUrl, language, languageFont } = params ?? {};

  // Fail fast: without this guard the video extractor is called with `undefined`.
  if (!audioUrl && !videoUrl) {
    throw new Error("Missing required parameter: provide either videoUrl or audioUrl");
  }

  // NOTE(review): the cutoff is 6, in whatever unit `duration` is reported
  // (presumably seconds). Confirm it is intentional and not meant to be 60.
  const LONG_AUDIO_THRESHOLD = 6;

  const { audioBuffer, duration } = audioUrl
    ? await extractAudioBufferFromAudioUrl(audioUrl)
    : await extractAudioBufferFromVideo(videoUrl);

  if (!duration) {
    throw new Error("Failed to get duration of video");
  }
  if (!audioBuffer) {
    throw new Error("Failed to get audio buffer from video");
  }

  const captions =
    duration > LONG_AUDIO_THRESHOLD
      ? await transcribeLong({ audioBuffer, language })
      : await transcribeShort({ audioBuffer, language });

  // Optional chaining treats a nullish transcriber result like an empty one.
  if (!captions?.length) {
    throw new Error("No captions found");
  }

  console.log("Transcription successful");

  return {
    captions,
    duration,
    audioUrl,
    videoUrl,
    videoSize,
    language,
    languageFont,
  };
};
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@twick/cloud-transcript",
3
- "version": "0.15.14",
3
+ "version": "0.15.16",
4
4
  "description": "Twick cloud function for generating JSON captions from audio using Google Cloud Speech-to-Text",
5
5
  "type": "module",
6
6
  "main": "core/transcriber.js",
@@ -46,10 +46,12 @@
46
46
  "node": ">=20.0.0"
47
47
  },
48
48
  "dependencies": {
49
- "@google/genai": "^1.0.0",
50
49
  "@aws-sdk/client-secrets-manager": "^3.679.0",
50
+ "fluent-ffmpeg": "^2.1.2",
51
51
  "@ffmpeg-installer/ffmpeg": "^1.1.0",
52
- "@ffprobe-installer/ffprobe": "^1.1.0"
52
+ "@ffprobe-installer/ffprobe": "^1.1.0",
53
+ "@google-cloud/speech": "^7.2.1",
54
+ "@google-cloud/storage": "^7.18.0"
53
55
  },
54
56
  "devDependencies": {
55
57
  "typescript": "~5.4.5",
@@ -1,4 +1,4 @@
1
- import { transcribeVideoUrl } from '@twick/cloud-transcript';
1
+ import { transcribe } from '../../core/workflow.js';
2
2
 
3
3
  const jsonResponse = (statusCode, body) => ({
4
4
  statusCode,
@@ -16,18 +16,19 @@ const jsonResponse = (statusCode, body) => ({
16
16
  *
17
17
  * Expected JSON payload (e.g. via AppSync / Lambda resolver):
18
18
  * {
19
- * "videoUrl": "https://example.com/audio.mp3", // or "gs://bucket/object"
20
- * "languageCode": "en-US", // optional, defaults to "en-US"
21
- * "encoding": "MP3", // optional
22
- * "sampleRateHertz": 16000 // optional
19
+ * "videoUrl": "https://example.com/video.mp4", // for video input
20
+ * "audioUrl": "https://example.com/audio.mp3", // OR for audio input
21
+ * "videoSize": { "width": 720, "height": 1280 }, // optional
22
+ * "language": "english", // optional
23
+ * "languageFont": "english" // optional
23
24
  * }
24
25
  *
25
26
  * Environment variables:
26
27
  * - GOOGLE_CLOUD_PROJECT: Explicit Google Cloud project id.
27
- * - GOOGLE_CLOUD_LOCATION (optional): Location of the Google Cloud project.
28
+ * - GOOGLE_CLOUD_LOCATION (optional): Location of the Google Cloud project.
28
29
  * - GOOGLE_VERTEX_MODEL (optional): Model to use for transcription.
29
30
  *
30
- * Returns: JSON payload containing transcript text, caption segments, and word-level timings.
31
+ * Returns: JSON payload with captions, duration, and project metadata.
31
32
  */
32
33
  export const handler = async (event) => {
33
34
  console.log('Transcript function invoked');
@@ -51,23 +52,26 @@ export const handler = async (event) => {
51
52
  (event?.body ? JSON.parse(event.body) : {}) ||
52
53
  {};
53
54
 
54
- const { videoUrl, language,languageFont } =
55
+ const { videoUrl, audioUrl, videoSize, language, languageFont } =
55
56
  argumentsPayload;
56
57
 
57
- if (!videoUrl) {
58
+ if (!videoUrl && !audioUrl) {
58
59
  return jsonResponse(400, {
59
- error: 'Missing required field: videoUrl',
60
+ error: 'Missing required field: provide either videoUrl or audioUrl',
60
61
  expectedFormat: {
61
- videoUrl:
62
- 'Publicly reachable audio URL or "gs://bucket/object" for GCS',
63
- language: 'Optional language (e.g., "english", "hindi")',
64
- languageFont: 'Optional font/script for captions (e.g., "english")',
62
+ videoUrl: 'Publicly reachable video URL (e.g. https://...)',
63
+ audioUrl: 'Publicly reachable audio URL (e.g. https://... or gs://...)',
64
+ videoSize: 'Optional { width, height }',
65
+ language: 'Optional (e.g. "english", "hindi")',
66
+ languageFont: 'Optional font/script (e.g. "english")',
65
67
  },
66
68
  });
67
69
  }
68
70
 
69
- const result = await transcribeVideoUrl({
70
- videoUrl,
71
+ const result = await transcribe({
72
+ videoUrl: videoUrl || undefined,
73
+ audioUrl: audioUrl || undefined,
74
+ videoSize,
71
75
  language,
72
76
  languageFont,
73
77
  });