@twick/cloud-transcript 0.15.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,131 @@
1
+ # @twick/cloud-transcript
2
+
3
+ **Transcribe audio/video to JSON captions using Google GenAI (Vertex AI) with Gemini models.**
4
+
5
+ Extract text from audio content with precise millisecond timestamps. Perfect for generating subtitle data from audio files or video URLs.
6
+
7
+ ## What Problem Does This Solve?
8
+
9
+ - **AI-powered transcription** — Use Google's Gemini models for accurate audio-to-text conversion
10
+ - **Precise timestamps** — Get millisecond-level timing for each caption segment
11
+ - **Serverless processing** — Deploy as AWS Lambda for automatic scaling
12
+ - **Multiple languages** — Support various languages and fonts
13
+
14
+ ## Input → Output
15
+
16
+ **Input:** Video URL + optional configuration
17
+ ```json
18
+ {
19
+ "videoUrl": "https://example.com/video.mp4",
20
+ "language": "english",
21
+ "languageFont": "english"
22
+ }
23
+ ```
24
+
25
+ **Output:** JSON captions with timestamps
26
+ ```json
27
+ {
28
+ "subtitles": [
29
+ {
30
+ "t": "Example phrase 1",
31
+ "s": 0,
32
+ "e": 1500
33
+ },
34
+ {
35
+ "t": "Another short example",
36
+ "s": 1500,
37
+ "e": 2800
38
+ }
39
+ ],
40
+ "duration": 12.3,
+ "videoUrl": "https://example.com/video.mp4"
41
+ }
42
+ ```
43
+
44
+ **Where it runs:** AWS Lambda container image (Linux/AMD64)
45
+
46
+ ## Installation
47
+
48
+ ```bash
49
+ npm install -D @twick/cloud-transcript
50
+ ```
51
+
52
+ ## Quick Start
53
+
54
+ ### 1. Scaffold AWS Lambda Template
55
+
56
+ ```bash
57
+ npx twick-transcript init
58
+ ```
59
+
60
+ ### 2. Build Docker Image
61
+
62
+ ```bash
63
+ npx twick-transcript build twick-transcript:latest
64
+ ```
65
+
66
+ ### 3. Configure Google Cloud
67
+
68
+ **Required:**
69
+ - Google Cloud project with Vertex AI API enabled
70
+ - Service account with Vertex AI permissions
71
+
72
+ **Environment variables:**
73
+ - `GOOGLE_CLOUD_PROJECT` (required) — Your GCP project ID
74
+ - `GOOGLE_CLOUD_LOCATION` (optional) — Vertex AI location (default: `"global"`)
75
+ - `GOOGLE_VERTEX_MODEL` (optional) — Model name (default: `"gemini-2.5-flash-lite"`)
76
+
77
+ **Credentials (choose one):**
78
+ - **File path** (recommended):
79
+ - Mount service account JSON and set `GOOGLE_APPLICATION_CREDENTIALS` to the file path
80
+ - **Environment JSON** (alternative):
81
+ - Set `GOOGLE_KEY` to the service account JSON string
82
+
83
+ ### 4. Deploy to AWS Lambda
84
+
85
+ ```bash
86
+ # Login to ECR
87
+ npx twick-transcript ecr-login us-east-1 YOUR_ACCOUNT_ID
88
+
89
+ # Push to ECR
90
+ npx twick-transcript push twick-transcript:latest us-east-1 YOUR_ACCOUNT_ID
91
+ ```
92
+
93
+ ## Deployment (High Level)
94
+
95
+ 1. **Scaffold** the Lambda container template
96
+ 2. **Configure** Google Cloud credentials (file mount or environment variable)
97
+ 3. **Set environment variables** (GCP project, location, model)
98
+ 4. **Build and push** Docker image to ECR
99
+ 5. **Create Lambda function** using the ECR image
100
+
101
+ The Lambda handler expects:
102
+ - **Event format:** `{ videoUrl, language?, languageFont? }`
103
+ - **Response:** JSON with `subtitles` array, `duration` (seconds), and the echoed `videoUrl`
104
+
105
+ **Note:** The video URL must be publicly accessible via HTTP(S). Google Cloud Storage URIs (`gs://`) are not directly supported—use signed URLs instead.
106
+
107
+ ## Programmatic Usage
108
+
109
+ Use the core transcriber directly:
110
+
111
+ ```js
112
+ import { transcribeVideoUrl } from '@twick/cloud-transcript';
+
+ const result = await transcribeVideoUrl({
+ videoUrl: 'https://example.com/video.mp4',
+ language: 'english',
+ languageFont: 'english',
+ });
+
+ console.log(result.subtitles); // Array of {t, s, e} objects
+ console.log(result.duration); // Audio duration in seconds
122
+ ```
123
+
124
+ ## Technical Details
125
+
126
+ - **Model:** Google Gemini (default: `gemini-2.5-flash-lite`, configurable via `GOOGLE_VERTEX_MODEL`)
127
+ - **Format:** Captions segmented into max 4 words per segment
128
+ - **Timestamps:** Millisecond precision, non-overlapping segments
129
+ - **API:** Google Vertex AI (GenAI)
130
+
131
+ For detailed setup instructions, see the complete deployment guide in the repository.
@@ -0,0 +1,125 @@
1
+ #!/usr/bin/env node
2
+ import { fileURLToPath } from 'url';
3
+ import { dirname, join } from 'path';
4
+ import fs from 'fs';
5
+ import { spawn } from 'child_process';
6
+
7
+ const __filename = fileURLToPath(import.meta.url);
8
+ const __dirname = dirname(__filename);
9
+ const pkgRoot = join(__dirname, '..');
10
+
11
/**
 * Scaffold the AWS Lambda container template into `destDir`.
 *
 * Copies the Dockerfile to the target root and handler.js into
 * `platform/aws/` so the image's CMD ["platform/aws/handler.handler"]
 * resolves, and writes a minimal package.json (when absent) so the Docker
 * build can cache the dependency layer.
 *
 * @param {string} destDir - Directory to scaffold into (created if missing).
 */
function copyTemplate(destDir) {
  const templateDir = join(pkgRoot, 'platform', 'aws');

  // recursive mkdir is a no-op when the directory already exists.
  fs.mkdirSync(destDir, { recursive: true });

  // Keep the platform/aws layout so the Dockerfile CMD path stays valid.
  const platformAwsDir = join(destDir, 'platform', 'aws');
  fs.mkdirSync(platformAwsDir, { recursive: true });

  // Dockerfile lives at the build-context root (it references
  // platform/aws/handler.handler); handler.js goes under platform/aws/.
  fs.copyFileSync(join(templateDir, 'Dockerfile'), join(destDir, 'Dockerfile'));
  fs.copyFileSync(join(templateDir, 'handler.js'), join(platformAwsDir, 'handler.js'));

  // Write a minimal runtime package.json unless the user already has one.
  const pkgJsonPath = join(destDir, 'package.json');
  if (!fs.existsSync(pkgJsonPath)) {
    const runtimePkg = {
      name: 'twick-transcript-runtime',
      private: true,
      type: 'module',
      dependencies: {
        '@twick/cloud-transcript': 'latest',
        '@ffmpeg-installer/ffmpeg': '^1.1.0',
        '@ffprobe-installer/ffprobe': '^1.1.0',
      },
    };
    fs.writeFileSync(pkgJsonPath, JSON.stringify(runtimePkg, null, 2));
  }
}
47
+
48
/**
 * Spawn a command and resolve when it exits with code 0.
 *
 * When `cmd` is a full shell command string and `args` is empty, the string
 * is handed to the shell as-is (used for piped commands such as
 * `aws ecr get-login-password | docker login`).
 *
 * @param {string} cmd - Executable name or a full shell command line.
 * @param {string[]} args - Arguments; pass [] to run `cmd` as a shell string.
 * @param {object} [opts] - Extra options forwarded to child_process.spawn.
 * @returns {Promise<void>} Resolves on exit code 0; rejects on a non-zero
 *   exit or a spawn failure.
 */
function run(cmd, args, opts = {}) {
  return new Promise((resolve, reject) => {
    const spawnOpts = { stdio: 'inherit', shell: true, ...opts };
    const ps = typeof cmd === 'string' && Array.isArray(args) && args.length === 0
      ? spawn(cmd, spawnOpts)
      : spawn(cmd, args, spawnOpts);
    // Bug fix: without an 'error' listener a spawn failure (e.g. missing
    // shell) left the Promise pending forever instead of rejecting.
    ps.on('error', reject);
    ps.on('close', (code) => (code === 0 ? resolve() : reject(new Error(`${cmd} exited ${code}`))));
  });
}
56
+
57
+ async function main() {
58
+ const [command, ...rest] = process.argv.slice(2);
59
+
60
+ if (!command || ['-h', '--help', 'help'].includes(command)) {
61
+ console.log(`
62
+ Usage: twick-transcript <command> [options]
63
+
64
+ Commands:
65
+ init [dir] Scaffold AWS container template into [dir] (default: ./twick-transcript-aws)
66
+ build <image> [dir] Docker build image from [dir] (default: ./twick-transcript-aws)
67
+ ecr-login <region> <accountId> Login docker to ECR
68
+ push <image> <region> <accountId> Push image to ECR (repo must exist)
69
+
70
+ Examples:
71
+ twick-transcript init
72
+ twick-transcript build my-repo:latest
73
+ twick-transcript ecr-login us-east-1 123456789012
74
+ twick-transcript push my-repo:latest us-east-1 123456789012
75
+ `);
76
+ return;
77
+ }
78
+
79
+ if (command === 'init') {
80
+ const dir = rest[0] || 'twick-transcript-aws';
81
+ copyTemplate(dir);
82
+ console.log(`✔ Scaffolded AWS runtime into ./${dir}`);
83
+ return;
84
+ }
85
+
86
+ if (command === 'build') {
87
+ const image = rest[0];
88
+ const dir = rest[1] || 'twick-transcript-aws';
89
+ if (!image) throw new Error('Image name required. e.g., my-repo:latest');
90
+ // Build for linux/amd64 platform to avoid creating multi-arch manifest index
91
+ // This reduces the number of artifacts pushed to the registry
92
+ await run('docker', ['build', '--platform', 'linux/amd64', '-t', image, dir]);
93
+ return;
94
+ }
95
+
96
+ if (command === 'ecr-login') {
97
+ const region = rest[0];
98
+ const accountId = rest[1];
99
+ if (!region || !accountId) throw new Error('Usage: ecr-login <region> <accountId>');
100
+ const registry = `${accountId}.dkr.ecr.${region}.amazonaws.com`;
101
+ await run(`aws ecr get-login-password --region ${region} | docker login --username AWS --password-stdin ${registry}`, []);
102
+ return;
103
+ }
104
+
105
+ if (command === 'push') {
106
+ const image = rest[0];
107
+ const region = rest[1];
108
+ const accountId = rest[2];
109
+ if (!image || !region || !accountId) throw new Error('Usage: push <image> <region> <accountId>');
110
+ const [repo, tag = 'latest'] = image.split(':');
111
+ const registry = `${accountId}.dkr.ecr.${region}.amazonaws.com`;
112
+ const remote = `${registry}/${repo}:${tag}`;
113
+ await run('docker', ['tag', `${repo}:${tag}`, remote]);
114
+ await run('docker', ['push', remote]);
115
+ console.log(`✔ Pushed ${remote}`);
116
+ return;
117
+ }
118
+
119
+ throw new Error(`Unknown command: ${command}`);
120
+ }
121
+
122
+ main().catch((err) => {
123
+ console.error(err.message || err);
124
+ process.exit(1);
125
+ });
@@ -0,0 +1,347 @@
1
+ import { GoogleGenAI } from "@google/genai";
2
+ import {
3
+ SecretsManagerClient,
4
+ GetSecretValueCommand,
5
+ } from "@aws-sdk/client-secrets-manager";
6
+ import fs from "fs";
7
+ import path, { join } from "path";
8
+ import { mkdtemp, readFile, rm } from "fs/promises";
9
+ import { tmpdir } from "os";
10
+ import { execFile } from "child_process";
11
+ import { promisify } from "util";
12
+ import { Readable, pipeline } from "stream";
13
+
14
+ // These packages provide prebuilt ffmpeg/ffprobe binaries. Types are not bundled,
15
+ // so we import them as `any` to keep TypeScript satisfied.
16
+ import ffmpeg from "@ffmpeg-installer/ffmpeg";
17
+ import ffprobe from "@ffprobe-installer/ffprobe";
18
+
19
+
20
+ const execFileAsync = promisify(execFile);
21
+ const pipelineAsync = promisify(pipeline);
22
+ const ffmpegPath = ffmpeg.path;
23
+ const ffprobePath = ffprobe.path;
24
+
25
/**
 * Resolve an environment variable with an optional fallback.
 * Throws when neither value is available, making configuration errors obvious.
 *
 * @param {string} name - Environment variable to read.
 * @param {string | undefined} defaultValue - Optional fallback value.
 * @returns {string} The resolved value.
 * @throws {Error} If no (truthy) value is found.
 */
const ensureEnv = (name, defaultValue) => {
  // `??` keeps an explicitly-set empty string, which then fails the
  // truthiness check below — matching the original behavior exactly.
  const resolved = process.env[name] ?? defaultValue;
  if (resolved) {
    return resolved;
  }
  throw new Error(`Missing required environment variable: ${name}`);
};
41
+
42
/**
 * Ensure GOOGLE_APPLICATION_CREDENTIALS points to a JSON key file.
 *
 * In AWS Lambda, the raw service-account JSON is expected to live in
 * AWS Secrets Manager. When GCP_SERVICE_ACCOUNT_SECRET_NAME is present, the
 * secret is fetched, written to `/tmp/gcp-sa-key.json`, and the environment
 * variable is updated to point at that file to avoid stale Lambda values.
 *
 * @returns {Promise<void>} Resolves once credentials are ready (or when no
 *   secret name is configured, in which case initialization is skipped).
 * @throws {Error} When the secret cannot be read, is not a string secret,
 *   or cannot be written to disk.
 */
const ensureGoogleCredentialsFromSecret = async () => {
  const secretName = process.env.GCP_SERVICE_ACCOUNT_SECRET_NAME;
  if (!secretName) {
    console.log(
      "No secret name configured, skipping Google credentials initialization"
    );
    return;
  }

  try {
    const client = new SecretsManagerClient({
      region: process.env.AWS_REGION || "ap-south-1",
    });

    const response = await client.send(
      new GetSecretValueCommand({
        SecretId: secretName,
        VersionStage: "AWSCURRENT", // VersionStage defaults to AWSCURRENT if unspecified
      })
    );
    const secret = response.SecretString;
    // Bug fix: binary secrets come back as SecretBinary with SecretString
    // undefined; fail loudly here instead of letting writeFileSync throw a
    // cryptic TypeError (or persisting garbage credentials).
    if (typeof secret !== "string" || secret.length === 0) {
      throw new Error(
        `Secret ${secretName} has no SecretString; store the service-account JSON as a string secret`
      );
    }
    const credPath = path.join("/tmp", "gcp-sa-key.json");
    fs.writeFileSync(credPath, secret, { encoding: "utf8" });
    process.env.GOOGLE_APPLICATION_CREDENTIALS = credPath;
    console.log(
      `Wrote Google service account credentials to ${credPath} from Secrets Manager`
    );
  } catch (error) {
    console.error(
      `Failed to initialize Google credentials from secret ::`,
      error
    );
    throw error;
  }
};
88
+
89
/**
 * Initialize a Google GenAI client configured for Vertex AI.
 * Loads credentials (from Secrets Manager when configured) and reads the
 * project/location from the environment before instantiating the client.
 *
 * @returns {Promise<GoogleGenAI>} Configured GenAI client instance.
 * @throws {Error} When required environment variables are missing.
 */
const createGenAIClient = async () => {
  await ensureGoogleCredentialsFromSecret();
  return new GoogleGenAI({
    vertexai: true,
    project: ensureEnv("GOOGLE_CLOUD_PROJECT"),
    location: ensureEnv("GOOGLE_CLOUD_LOCATION", "global"),
  });
};
108
+
109
/**
 * Download a video and extract its audio track as an MP3 buffer.
 *
 * Streams the HTTP response straight to disk (never holds the full video in
 * memory), probes the duration with the bundled ffprobe, then encodes the
 * audio with the bundled ffmpeg.
 *
 * @param {string} videoUrl - Publicly reachable HTTP(S) video URL.
 * @returns {Promise<{audioBuffer: Buffer, duration: number}>} MP3 audio
 *   bytes plus the duration in seconds (0 when ffprobe fails).
 * @throws {Error} When the download, the response stream, or the ffmpeg
 *   extraction fails.
 */
const extractAudioBufferFromVideo = async (videoUrl) => {
  const videoResponse = await fetch(videoUrl);
  if (!videoResponse.ok) {
    throw new Error(`Failed to download video: ${videoResponse.status} ${videoResponse.statusText}`);
  }
  const tmpBase = await mkdtemp(join(tmpdir(), 'mcp-'));
  const inputPath = join(tmpBase, 'input_video');
  const outputPath = join(tmpBase, 'output_audio.mp3');

  // Bug fix: the original only removed tmpBase on the ffmpeg-failure and
  // success paths, leaking the temp dir (and the downloaded video) when the
  // download pipeline or the final readFile threw. try/finally guarantees
  // cleanup on every exit path.
  try {
    // Stream the video response directly to disk to avoid holding the full video in memory
    if (!videoResponse.body) {
      throw new Error("Video response has no body");
    }
    const videoStream = Readable.fromWeb(videoResponse.body);
    const fileWriteStream = fs.createWriteStream(inputPath);
    await pipelineAsync(videoStream, fileWriteStream);

    // Get duration using bundled ffprobe; a probe failure is non-fatal.
    let duration = 0;
    try {
      const { stdout } = await execFileAsync(ffprobePath, [
        '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        inputPath
      ]);
      duration = parseFloat(stdout.toString().trim()) || 0;
    } catch (err) {
      // Log the underlying error (the original discarded it) so probe
      // failures are diagnosable from CloudWatch logs.
      console.warn('Failed to get duration using ffprobe, duration will be 0', err);
    }

    try {
      await execFileAsync(ffmpegPath, [
        '-y',
        '-i', inputPath,
        '-vn',                     // drop the video stream
        '-acodec', 'libmp3lame',
        '-q:a', '2',
        outputPath
      ]);
    } catch (err) {
      // Prefer ffmpeg's stderr for the error message; it is far more
      // actionable than the generic "command failed" message.
      const stderr = err?.stderr?.toString?.().trim?.() || "";
      const msg = stderr || (err instanceof Error ? err.message : String(err));
      throw new Error(`ffmpeg execution failed: ${msg}`);
    }

    const audioBuffer = await readFile(outputPath);
    return { audioBuffer, duration };
  } finally {
    await rm(tmpBase, { recursive: true, force: true });
  }
};
161
+
162
+
163
/**
 * Build the captioning prompt passed to the Gemini model.
 *
 * @param {number} duration - Audio duration in seconds.
 * @param {string} language - Human-readable target language.
 * @param {string} languageFont - Desired script/font name.
 * @returns {string} Instruction prompt for the model.
 */
const buildPrompt = (duration, language, languageFont) => {
  // The model is instructed in milliseconds; callers pass seconds.
  const durationMs = Math.round(duration * 1000);

  return `You are a professional subtitle and transcription engine.

## INPUT
- Audio duration: ${durationMs} milliseconds
- Target language: ${language}
- Subtitle font script: ${languageFont}

## OBJECTIVE
Transcribe the audio into clear, readable subtitles.

If the spoken audio is NOT in ${language}, translate it into ${language} before generating subtitles.

## SUBTITLE SEGMENTATION RULES
- Split speech into short, natural phrases.
- Each subtitle phrase MUST contain a maximum of 4 words.
- Do NOT split words across phrases.
- Avoid breaking phrases mid-sentence unless required by timing constraints.

## TIMING RULES (STRICT — MUST FOLLOW)
- All timestamps are in **milliseconds**.
- Each subtitle object MUST include:
  - 's': start timestamp
  - 'e': end timestamp
- Duration of each phrase = 'e - s'
- Minimum phrase duration: **100 ms**
- 'e' MUST be greater than 's'
- 'e' MUST be **less than or equal to ${durationMs}**
- Subtitles MUST be sequential:
  - 's' of the next phrase MUST be **greater than or equal to** the previous 'e'
  - NO overlapping timestamps
- Prefer aligning timestamps with natural speech pauses.

## TEXT RULES
- 't' MUST be written using ${languageFont} characters.
- No emojis.
- No punctuation-only subtitles.
- Normalize casing according to the target language's writing system.
- Remove filler sounds (e.g., “um”, “uh”) unless semantically important.

## OUTPUT FORMAT (CRITICAL)
Return ONLY a valid JSON array.
- No markdown
- No code blocks
- No explanations
- No additional text
- Output MUST start with '[' and end with ']'

## OUTPUT SCHEMA
[
  {
    "t": "Subtitle text",
    "s": 0,
    "e": 1200
  }
]
`.trim();
};
232
+
233
/**
 * Transcribe the audio track of a video URL into JSON subtitles using
 * Google GenAI (Vertex AI).
 *
 * (Doc fix: the previous JSDoc documented an `audioUrl` parameter, a
 * `{ subtitles }`-only return value, and referenced a Python playground file
 * not shipped in this package — none of which matched the code.)
 *
 * @param {Object} params
 * @param {string} params.videoUrl - Publicly reachable video URL.
 * @param {string} [params.language="english"] - Target transcription language (human-readable).
 * @param {string} [params.languageFont="english"] - Target font/script for subtitles.
 * @returns {Promise<{subtitles: Array<{t: string, s: number, e: number}>, duration: number, videoUrl: string}>}
 *   Subtitle segments (empty array when the model output cannot be parsed as
 *   JSON), the audio duration in seconds, and the input URL echoed back.
 * @throws {Error} When videoUrl is missing, the video duration cannot be
 *   determined, or downstream (download/ffmpeg/GenAI) calls fail.
 */
export const transcribeVideoUrl = async (params) => {
  const {
    videoUrl,
    language = "english",
    languageFont = "english",
  } = params || {};

  if (!videoUrl) {
    throw new Error("Missing required parameter: videoUrl");
  }

  const { audioBuffer, duration } = await extractAudioBufferFromVideo(videoUrl);
  if (!duration) {
    throw new Error("Failed to get duration of video");
  }

  const prompt = buildPrompt(duration, language, languageFont);

  const client = await createGenAIClient();
  const modelName = process.env.GOOGLE_VERTEX_MODEL || "gemini-2.5-flash-lite";

  const generationConfig = {
    maxOutputTokens: 65535,
    temperature: 1,
    topP: 0.95,
    // Disable "thinking" tokens; plain transcription does not need them.
    thinkingConfig: {
      thinkingBudget: 0,
    },
    // Transcription must not be blocked by safety filters on the source audio.
    safetySettings: [
      {
        category: "HARM_CATEGORY_HATE_SPEECH",
        threshold: "OFF",
      },
      {
        category: "HARM_CATEGORY_DANGEROUS_CONTENT",
        threshold: "OFF",
      },
      {
        category: "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        threshold: "OFF",
      },
      {
        category: "HARM_CATEGORY_HARASSMENT",
        threshold: "OFF",
      },
    ],
  };

  const req = {
    model: modelName,
    contents: [
      {
        role: "user",
        parts: [
          {
            // The extracted MP3 is sent inline, base64-encoded.
            inlineData: {
              data: audioBuffer.toString("base64"),
              mimeType: "audio/mpeg",
            },
          },
          { text: prompt },
        ],
      },
    ],
    config: generationConfig,
  };

  const response = await client.models.generateContent(req);

  let textPart = response.text || "";

  // Strip markdown code fences if present (```json ... ``` or ``` ... ```)
  textPart = textPart
    .replace(/^```json\s*/i, "") // Remove opening ```json
    .replace(/^```\s*/i, "") // Remove opening ```
    .replace(/\s*```$/i, "") // Remove closing ```
    .trim();

  let subtitles = [];
  try {
    // Try to find JSON array in the text (in case there's extra text)
    const jsonMatch = textPart.match(/\[[\s\S]*\]/);
    const jsonText = jsonMatch ? jsonMatch[0] : textPart;

    subtitles = JSON.parse(jsonText);
    if (!Array.isArray(subtitles)) {
      throw new Error("Parsed subtitles are not an array");
    }
  } catch (err) {
    // Best-effort: an unparseable model response yields an empty subtitle
    // list rather than failing the whole Lambda invocation.
    console.warn(
      "Failed to parse model output as JSON subtitles, returning raw text",
      err
    );
    console.warn("Raw response text:", textPart.substring(0, 500));
    subtitles = [];
  }

  return {
    subtitles,
    duration,
    videoUrl
  };
};
package/package.json ADDED
@@ -0,0 +1,58 @@
1
+ {
2
+ "name": "@twick/cloud-transcript",
3
+ "version": "0.15.1",
4
+ "description": "Twick cloud function for generating JSON captions from video/audio using Google GenAI (Vertex AI) Gemini models",
5
+ "type": "module",
6
+ "main": "core/transcriber.js",
7
+ "exports": {
8
+ ".": "./core/transcriber.js",
9
+ "./aws": "./platform/aws/handler.js",
10
+ "./platform/aws/*": "./platform/aws/*"
11
+ },
12
+ "bin": {
13
+ "twick-transcript": "bin/twick-transcript.js"
14
+ },
15
+ "files": [
16
+ "core",
17
+ "platform",
18
+ "bin",
19
+ "README.md"
20
+ ],
21
+ "scripts": {
22
+ "test": "node --test test/transcriber.test.js",
23
+ "verify:aws": "node -e \"require('fs').accessSync('platform/aws/Dockerfile'); require('fs').accessSync('platform/aws/handler.js'); console.log('AWS transcript function assets present')\"",
24
+ "pack:aws": "npm run verify:aws && npm pack",
25
+ "release:aws": "npm run verify:aws && npm publish --access public --tag aws",
26
+ "deploy:aws": "node scripts/deploy-aws.js",
27
+ "prepublishOnly": "npm run verify:aws"
28
+ },
29
+ "publishConfig": {
30
+ "access": "public",
31
+ "tag": "aws"
32
+ },
33
+ "keywords": [
34
+ "twick",
35
+ "audio",
36
+ "transcript",
37
+ "caption",
38
+ "lambda",
39
+ "aws",
40
+ "docker",
41
+ "google-cloud-speech"
42
+ ],
43
+ "author": "",
44
+ "license": "SEE LICENSE IN LICENSE.md",
45
+ "engines": {
46
+ "node": ">=20.0.0"
47
+ },
48
+ "dependencies": {
49
+ "@google/genai": "^1.0.0",
50
+ "@aws-sdk/client-secrets-manager": "^3.679.0",
51
+ "@ffmpeg-installer/ffmpeg": "^1.1.0",
52
+ "@ffprobe-installer/ffprobe": "^1.1.0"
53
+ },
54
+ "devDependencies": {
55
+ "typescript": "~5.4.5",
56
+ "dotenv": "^16.4.5"
57
+ }
58
+ }
@@ -0,0 +1,14 @@
1
FROM --platform=linux/amd64 public.ecr.aws/lambda/nodejs:20

# Copy package files first so the dependency layer is cached across builds
COPY package.json package-lock.json* ./

# Install production dependencies only; devDependencies are never needed in
# the Lambda runtime image. (`npm ci` would be preferable but requires a
# lockfile, which the scaffold does not generate.)
RUN npm install --omit=dev

# Copy source code
COPY . ./

# Default Lambda handler
CMD ["platform/aws/handler.handler"]
13
+
14
+
@@ -0,0 +1,90 @@
1
+ import { transcribeVideoUrl } from '@twick/cloud-transcript';
2
+
3
/**
 * Build an API-Gateway-style JSON response with permissive CORS headers.
 *
 * @param {number} statusCode - HTTP status code.
 * @param {object} body - Payload; serialized with JSON.stringify.
 * @returns {{statusCode: number, headers: Object<string, string>, body: string}}
 */
const jsonResponse = (statusCode, body) => {
  const headers = {
    'Content-Type': 'application/json',
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Allow-Methods': 'POST, OPTIONS',
  };
  return { statusCode, headers, body: JSON.stringify(body) };
};
13
+
14
/**
 * AWS Lambda handler that generates JSON subtitles for a video URL using
 * Google GenAI (Vertex AI) Gemini models.
 *
 * (Doc fix: the previous JSDoc described a Google Cloud Speech-to-Text API
 * with `languageCode`/`encoding`/`sampleRateHertz` fields the code never
 * reads; the 400 hint also wrongly claimed `gs://` URIs are supported.)
 *
 * Expected JSON payload (direct invoke, API Gateway body, or AppSync arguments):
 * {
 *   "videoUrl": "https://example.com/video.mp4", // required, public HTTP(S) URL
 *   "language": "english",                        // optional
 *   "languageFont": "english"                     // optional
 * }
 *
 * Environment variables:
 * - GOOGLE_CLOUD_PROJECT: Explicit Google Cloud project id (required).
 * - GOOGLE_CLOUD_LOCATION (optional): Vertex AI location (default "global").
 * - GOOGLE_VERTEX_MODEL (optional): Model to use for transcription.
 *
 * Returns: JSON payload containing `subtitles`, `duration`, and `videoUrl`.
 */
export const handler = async (event) => {
  console.log('Transcript function invoked');
  console.log('Event:', JSON.stringify(event));

  // CORS preflight: reply immediately with no body.
  if (event.httpMethod === 'OPTIONS') {
    return {
      statusCode: 204,
      headers: {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Allow-Methods': 'POST, OPTIONS',
      },
      body: '',
    };
  }

  try {
    // Accept AppSync resolver events (arguments) or HTTP events (JSON body).
    const argumentsPayload =
      event?.arguments ||
      (event?.body ? JSON.parse(event.body) : {}) ||
      {};

    const { videoUrl, language, languageFont } = argumentsPayload;

    if (!videoUrl) {
      return jsonResponse(400, {
        error: 'Missing required field: videoUrl',
        expectedFormat: {
          videoUrl: 'Publicly reachable HTTP(S) video URL',
          language: 'Optional language (e.g., "english", "hindi")',
          languageFont: 'Optional font/script for captions (e.g., "english")',
        },
      });
    }

    const result = await transcribeVideoUrl({
      videoUrl,
      language,
      languageFont,
    });

    console.log('Transcription completed successfully');

    return jsonResponse(200, {
      ...result,
    });
  } catch (error) {
    console.error('Error generating transcript:', error);

    return jsonResponse(500, {
      error: 'Internal server error',
      message: error instanceof Error ? error.message : 'Unknown error',
    });
  }
};
89
+
90
+