@twick/cloud-transcript 0.15.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +131 -0
- package/bin/twick-transcript.js +125 -0
- package/core/transcriber.js +347 -0
- package/package.json +58 -0
- package/platform/aws/Dockerfile +14 -0
- package/platform/aws/handler.js +90 -0
package/README.md
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
# @twick/cloud-transcript
|
|
2
|
+
|
|
3
|
+
**Transcribe audio/video to JSON captions using Google GenAI (Vertex AI) with Gemini models.**
|
|
4
|
+
|
|
5
|
+
Extract text from audio content with precise millisecond timestamps. Perfect for generating subtitle data from audio files or video URLs.
|
|
6
|
+
|
|
7
|
+
## What Problem Does This Solve?
|
|
8
|
+
|
|
9
|
+
- **AI-powered transcription** — Use Google's Gemini models for accurate audio-to-text conversion
|
|
10
|
+
- **Precise timestamps** — Get millisecond-level timing for each caption segment
|
|
11
|
+
- **Serverless processing** — Deploy as AWS Lambda for automatic scaling
|
|
12
|
+
- **Multiple languages** — Support various languages and fonts
|
|
13
|
+
|
|
14
|
+
## Input → Output
|
|
15
|
+
|
|
16
|
+
**Input:** Audio URL + optional configuration
|
|
17
|
+
```json
|
|
18
|
+
{
|
|
19
|
+
"videoUrl": "https://example.com/audio.mp3",
|
|
20
|
+
"language": "english",
|
|
21
|
+
"languageFont": "english"
|
|
22
|
+
}
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
**Output:** JSON captions with timestamps
|
|
26
|
+
```json
|
|
27
|
+
{
|
|
28
|
+
"subtitles": [
|
|
29
|
+
{
|
|
30
|
+
"t": "Example phrase 1",
|
|
31
|
+
"s": 0,
|
|
32
|
+
"e": 1500
|
|
33
|
+
},
|
|
34
|
+
{
|
|
35
|
+
"t": "Another short example",
|
|
36
|
+
"s": 1500,
|
|
37
|
+
"e": 2800
|
|
38
|
+
}
|
|
39
|
+
],
|
|
40
|
+
"rawText": "Full raw response text from the model..."
|
|
41
|
+
}
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
**Where it runs:** AWS Lambda container image (Linux/AMD64)
|
|
45
|
+
|
|
46
|
+
## Installation
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
npm install -D @twick/cloud-transcript
|
|
50
|
+
```
|
|
51
|
+
|
|
52
|
+
## Quick Start
|
|
53
|
+
|
|
54
|
+
### 1. Scaffold AWS Lambda Template
|
|
55
|
+
|
|
56
|
+
```bash
|
|
57
|
+
npx twick-transcript init
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### 2. Build Docker Image
|
|
61
|
+
|
|
62
|
+
```bash
|
|
63
|
+
npx twick-transcript build twick-transcript:latest
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
### 3. Configure Google Cloud
|
|
67
|
+
|
|
68
|
+
**Required:**
|
|
69
|
+
- Google Cloud project with Vertex AI API enabled
|
|
70
|
+
- Service account with Vertex AI permissions
|
|
71
|
+
|
|
72
|
+
**Environment variables:**
|
|
73
|
+
- `GOOGLE_CLOUD_PROJECT` (required) — Your GCP project ID
|
|
74
|
+
- `GOOGLE_CLOUD_LOCATION` (optional) — Vertex AI location (default: `"global"`)
|
|
75
|
+
- `GOOGLE_VERTEX_MODEL` (optional) — Model name (default: `"gemini-2.5-flash-lite"`)
|
|
76
|
+
|
|
77
|
+
**Credentials (choose one):**
|
|
78
|
+
- **File path** (recommended):
|
|
79
|
+
- Mount service account JSON and set `GOOGLE_APPLICATION_CREDENTIALS` to the file path
|
|
80
|
+
- **Environment JSON** (alternative):
|
|
81
|
+
- Set `GOOGLE_KEY` to the service account JSON string
|
|
82
|
+
|
|
83
|
+
### 4. Deploy to AWS Lambda
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
# Login to ECR
|
|
87
|
+
npx twick-transcript ecr-login us-east-1 YOUR_ACCOUNT_ID
|
|
88
|
+
|
|
89
|
+
# Push to ECR
|
|
90
|
+
npx twick-transcript push twick-transcript:latest us-east-1 YOUR_ACCOUNT_ID
|
|
91
|
+
```
|
|
92
|
+
|
|
93
|
+
## Deployment (High Level)
|
|
94
|
+
|
|
95
|
+
1. **Scaffold** the Lambda container template
|
|
96
|
+
2. **Configure** Google Cloud credentials (file mount or environment variable)
|
|
97
|
+
3. **Set environment variables** (GCP project, location, model)
|
|
98
|
+
4. **Build and push** Docker image to ECR
|
|
99
|
+
5. **Create Lambda function** using the ECR image
|
|
100
|
+
|
|
101
|
+
The Lambda handler expects:
|
|
102
|
+
- **Event format:** `{ videoUrl, language?, languageFont? }`
|
|
103
|
+
- **Response:** JSON with a `subtitles` array, the media `duration` (in seconds), and the original `videoUrl`
|
|
104
|
+
|
|
105
|
+
**Note:** The audio URL must be publicly accessible via HTTP(S). Google Cloud Storage URIs (`gs://`) are not directly supported—use signed URLs instead.
|
|
106
|
+
|
|
107
|
+
## Programmatic Usage
|
|
108
|
+
|
|
109
|
+
Use the core transcriber directly:
|
|
110
|
+
|
|
111
|
+
```js
|
|
112
|
+
import { transcribeVideoUrl } from '@twick/cloud-transcript';
|
|
113
|
+
|
|
114
|
+
const result = await transcribeVideoUrl({
|
|
115
|
+
videoUrl: 'https://example.com/audio.mp3',
|
|
116
|
+
language: 'english',
|
|
117
|
+
languageFont: 'english',
|
|
118
|
+
});
|
|
119
|
+
|
|
120
|
+
console.log(result.subtitles); // Array of {t, s, e} objects
|
|
121
|
+
console.log(result.duration); // Media duration in seconds
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
## Technical Details
|
|
125
|
+
|
|
126
|
+
- **Model:** Google Gemini (default: `gemini-2.5-flash-lite`, configurable via `GOOGLE_VERTEX_MODEL`)
|
|
127
|
+
- **Format:** Captions segmented into max 4 words per segment
|
|
128
|
+
- **Timestamps:** Millisecond precision, non-overlapping segments
|
|
129
|
+
- **API:** Google Vertex AI (GenAI)
|
|
130
|
+
|
|
131
|
+
For detailed setup instructions, see the complete deployment guide in the repository.
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { fileURLToPath } from 'url';
|
|
3
|
+
import { dirname, join } from 'path';
|
|
4
|
+
import fs from 'fs';
|
|
5
|
+
import { spawn } from 'child_process';
|
|
6
|
+
|
|
7
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
8
|
+
const __dirname = dirname(__filename);
|
|
9
|
+
const pkgRoot = join(__dirname, '..');
|
|
10
|
+
|
|
11
|
+
/**
 * Scaffold the AWS Lambda container template into `destDir`.
 *
 * Three things are written: the Dockerfile at the root of `destDir`, the
 * handler at `platform/aws/handler.js`, and (only when none exists yet) a
 * minimal `package.json` for the runtime image.
 *
 * @param {string} destDir - Target directory; created when missing.
 */
function copyTemplate(destDir) {
  const sourceDir = join(pkgRoot, 'platform', 'aws');
  // The handler keeps its platform/aws/ prefix so that the Dockerfile's
  // CMD ["platform/aws/handler.handler"] resolves inside the image.
  const handlerDir = join(destDir, 'platform', 'aws');

  for (const dir of [destDir, handlerDir]) {
    if (!fs.existsSync(dir)) {
      fs.mkdirSync(dir, { recursive: true });
    }
  }

  // Dockerfile lives at the scaffold root, handler.js under platform/aws/.
  fs.copyFileSync(join(sourceDir, 'Dockerfile'), join(destDir, 'Dockerfile'));
  fs.copyFileSync(join(sourceDir, 'handler.js'), join(handlerDir, 'handler.js'));

  // Minimal manifest so the image build can install dependencies in its own layer.
  const manifestPath = join(destDir, 'package.json');
  if (fs.existsSync(manifestPath)) {
    return; // never overwrite a manifest that is already there
  }

  const manifest = {
    name: 'twick-transcript-runtime',
    private: true,
    type: 'module',
    dependencies: {
      '@twick/cloud-transcript': 'latest',
      '@ffmpeg-installer/ffmpeg': '^1.1.0',
      '@ffprobe-installer/ffprobe': '^1.1.0',
    },
  };
  fs.writeFileSync(manifestPath, JSON.stringify(manifest, null, 2));
}
|
|
47
|
+
|
|
48
|
+
/**
 * Run a child process with inherited stdio and wait for it to finish.
 *
 * Two calling conventions are supported:
 *  - `run('a | b', [])`: `cmd` is a complete command line; it is executed
 *    through the shell so that pipes and redirections work.
 *  - `run('docker', ['build', ...])`: `cmd` is an executable and `args` its
 *    argument vector; it is spawned directly, without a shell, so arguments
 *    containing spaces or shell metacharacters are passed through verbatim.
 *
 * @param {string} cmd - Executable name or full shell command line.
 * @param {string[]} args - Argument vector; pass `[]` for a shell command line.
 * @param {object} [opts] - Extra `child_process.spawn` options; these override
 *   the defaults chosen here (including `shell`).
 * @returns {Promise<void>} Resolves when the process exits with code 0.
 * @throws {Error} Rejects when the process cannot be started, exits non-zero,
 *   or is terminated by a signal.
 */
function run(cmd, args, opts = {}) {
  const isCommandLine = typeof cmd === 'string' && Array.isArray(args) && args.length === 0;

  return new Promise((resolve, reject) => {
    const spawnOpts = { stdio: 'inherit', shell: isCommandLine, ...opts };
    const ps = isCommandLine ? spawn(cmd, spawnOpts) : spawn(cmd, args, spawnOpts);

    // Without this listener a failed spawn (e.g. ENOENT) is an unhandled
    // 'error' event and crashes the CLI instead of rejecting.
    ps.on('error', reject);

    ps.on('close', (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }
      const reason = code === null ? `terminated by signal ${signal}` : `exited ${code}`;
      reject(new Error(`${cmd} ${reason}`));
    });
  });
}
|
|
56
|
+
|
|
57
|
+
/**
 * Build the ECR registry host name after validating both parts.
 *
 * The values come straight from the command line and, for `ecr-login`, end up
 * inside a shell command line, so anything that is not a plain region name or
 * a 12-digit account id is rejected up front.
 *
 * @param {string} region - AWS region, e.g. `us-east-1`.
 * @param {string} accountId - 12-digit AWS account id.
 * @returns {string} `<accountId>.dkr.ecr.<region>.amazonaws.com`
 * @throws {Error} When either value has an unexpected shape.
 */
function ecrRegistryHost(region, accountId) {
  if (!/^[a-z0-9-]+$/.test(region)) {
    throw new Error(`Invalid AWS region: ${region}`);
  }
  if (!/^\d{12}$/.test(accountId)) {
    throw new Error(`Invalid AWS account id (expected 12 digits): ${accountId}`);
  }
  return `${accountId}.dkr.ecr.${region}.amazonaws.com`;
}

/**
 * CLI entry point: dispatches on the first positional argument.
 *
 * Commands: `init [dir]`, `build <image> [dir]`, `ecr-login <region> <accountId>`,
 * `push <image> <region> <accountId>`. With no command (or -h/--help/help) the
 * usage text is printed.
 *
 * @returns {Promise<void>}
 * @throws {Error} On missing/invalid arguments, unknown commands, or when a
 *   spawned tool fails.
 */
async function main() {
  const [command, ...rest] = process.argv.slice(2);

  if (!command || ['-h', '--help', 'help'].includes(command)) {
    console.log(`
Usage: twick-transcript <command> [options]

Commands:
init [dir] Scaffold AWS container template into [dir] (default: ./twick-transcript-aws)
build <image> [dir] Docker build image from [dir] (default: ./twick-transcript-aws)
ecr-login <region> <accountId> Login docker to ECR
push <image> <region> <accountId> Push image to ECR (repo must exist)

Examples:
twick-transcript init
twick-transcript build my-repo:latest
twick-transcript ecr-login us-east-1 123456789012
twick-transcript push my-repo:latest us-east-1 123456789012
`);
    return;
  }

  if (command === 'init') {
    const dir = rest[0] || 'twick-transcript-aws';
    copyTemplate(dir);
    console.log(`✔ Scaffolded AWS runtime into ./${dir}`);
    return;
  }

  if (command === 'build') {
    const [image, dir = 'twick-transcript-aws'] = rest;
    if (!image) throw new Error('Image name required. e.g., my-repo:latest');
    // Build for linux/amd64 only: avoids a multi-arch manifest index and so
    // reduces the number of artifacts pushed to the registry.
    await run('docker', ['build', '--platform', 'linux/amd64', '-t', image, dir]);
    return;
  }

  if (command === 'ecr-login') {
    const [region, accountId] = rest;
    if (!region || !accountId) throw new Error('Usage: ecr-login <region> <accountId>');
    const registry = ecrRegistryHost(region, accountId);
    // The pipe requires a shell; both interpolated values were validated above.
    await run(`aws ecr get-login-password --region ${region} | docker login --username AWS --password-stdin ${registry}`, []);
    return;
  }

  if (command === 'push') {
    const [image, region, accountId] = rest;
    if (!image || !region || !accountId) throw new Error('Usage: push <image> <region> <accountId>');

    // A ':' only introduces a tag when it comes after the last '/', otherwise
    // it belongs to a registry port such as localhost:5000/repo.
    const tagSep = image.lastIndexOf(':');
    const hasTag = tagSep > image.lastIndexOf('/');
    const repo = hasTag ? image.slice(0, tagSep) : image;
    const tag = (hasTag && image.slice(tagSep + 1)) || 'latest';

    const registry = ecrRegistryHost(region, accountId);
    const remote = `${registry}/${repo}:${tag}`;
    await run('docker', ['tag', `${repo}:${tag}`, remote]);
    await run('docker', ['push', remote]);
    console.log(`✔ Pushed ${remote}`);
    return;
  }

  throw new Error(`Unknown command: ${command}`);
}

// Print a short message and exit non-zero on any failure.
main().catch((err) => {
  console.error(err.message || err);
  process.exit(1);
});
|
|
@@ -0,0 +1,347 @@
|
|
|
1
|
+
import { GoogleGenAI } from "@google/genai";
|
|
2
|
+
import {
|
|
3
|
+
SecretsManagerClient,
|
|
4
|
+
GetSecretValueCommand,
|
|
5
|
+
} from "@aws-sdk/client-secrets-manager";
|
|
6
|
+
import fs from "fs";
|
|
7
|
+
import path, { join } from "path";
|
|
8
|
+
import { mkdtemp, readFile, rm } from "fs/promises";
|
|
9
|
+
import { tmpdir } from "os";
|
|
10
|
+
import { execFile } from "child_process";
|
|
11
|
+
import { promisify } from "util";
|
|
12
|
+
import { Readable, pipeline } from "stream";
|
|
13
|
+
|
|
14
|
+
// These packages provide prebuilt ffmpeg/ffprobe binaries. Types are not bundled,
|
|
15
|
+
// so we import them as `any` to keep TypeScript satisfied.
|
|
16
|
+
import ffmpeg from "@ffmpeg-installer/ffmpeg";
|
|
17
|
+
import ffprobe from "@ffprobe-installer/ffprobe";
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
const execFileAsync = promisify(execFile);
|
|
21
|
+
const pipelineAsync = promisify(pipeline);
|
|
22
|
+
const ffmpegPath = ffmpeg.path;
|
|
23
|
+
const ffprobePath = ffprobe.path;
|
|
24
|
+
|
|
25
|
+
/**
 * Read an environment variable, optionally falling back to a default.
 *
 * A variable that is set but empty is treated the same as an unset one, so
 * the default still applies (e.g. `GOOGLE_CLOUD_LOCATION=""` in a deployment
 * template). Throws if neither a value nor a default is available, which
 * makes configuration errors obvious.
 *
 * @param {string} name - Environment variable to read.
 * @param {string | undefined} defaultValue - Optional fallback value.
 * @returns {string} The resolved value.
 * @throws {Error} If no non-empty value is found.
 */
const ensureEnv = (name, defaultValue) => {
  // `||` rather than `??`: an empty string must not shadow the default.
  const value = process.env[name] || defaultValue;
  if (!value) {
    throw new Error(`Missing required environment variable: ${name}`);
  }
  return value;
};
|
|
41
|
+
|
|
42
|
+
/**
 * Ensure GOOGLE_APPLICATION_CREDENTIALS points to a JSON key file.
 *
 * When GCP_SERVICE_ACCOUNT_SECRET_NAME is set, the service-account JSON is
 * fetched from AWS Secrets Manager (region from AWS_REGION, falling back to
 * "ap-south-1"), written to `/tmp/gcp-sa-key.json` with owner-only
 * permissions, and GOOGLE_APPLICATION_CREDENTIALS is pointed at that file.
 * When the variable is not set, nothing is changed.
 *
 * The secret may be stored either as a string or as binary; binary payloads
 * are decoded as UTF-8.
 *
 * @returns {Promise<void>} Resolves once credentials are ready (or skipped).
 * @throws {Error} When the secret cannot be read, is empty, or cannot be written.
 */
const ensureGoogleCredentialsFromSecret = async () => {
  const secretName = process.env.GCP_SERVICE_ACCOUNT_SECRET_NAME;
  if (!secretName) {
    console.log(
      "No secret name configured, skipping Google credentials initialization"
    );
    return;
  }

  try {
    const client = new SecretsManagerClient({
      region: process.env.AWS_REGION || "ap-south-1",
    });

    const response = await client.send(
      new GetSecretValueCommand({
        SecretId: secretName,
        VersionStage: "AWSCURRENT", // VersionStage defaults to AWSCURRENT if unspecified
      })
    );

    // Only one of SecretString / SecretBinary is populated, depending on how
    // the secret was stored.
    const secret =
      response.SecretString ??
      (response.SecretBinary
        ? Buffer.from(response.SecretBinary).toString("utf8")
        : "");
    if (!secret) {
      throw new Error(
        `Secret "${secretName}" has no SecretString or SecretBinary payload`
      );
    }

    const credPath = path.join("/tmp", "gcp-sa-key.json");
    // mode 0o600: the file holds a private key, keep it owner-readable only.
    // (The mode applies when the file is created.)
    fs.writeFileSync(credPath, secret, { encoding: "utf8", mode: 0o600 });
    process.env.GOOGLE_APPLICATION_CREDENTIALS = credPath;
    console.log(
      `Wrote Google service account credentials to ${credPath} from Secrets Manager`
    );
  } catch (error) {
    console.error(
      `Failed to initialize Google credentials from secret ::`,
      error
    );
    throw error;
  }
};
|
|
88
|
+
|
|
89
|
+
/**
 * Build a Google GenAI client that talks to Vertex AI.
 *
 * Credentials are prepared first (see ensureGoogleCredentialsFromSecret), then
 * the project and location are resolved from GOOGLE_CLOUD_PROJECT (required)
 * and GOOGLE_CLOUD_LOCATION (defaults to "global").
 *
 * @returns {Promise<GoogleGenAI>} Configured GenAI client instance.
 * @throws {Error} When credentials cannot be prepared or GOOGLE_CLOUD_PROJECT is missing.
 */
const createGenAIClient = async () => {
  await ensureGoogleCredentialsFromSecret();

  return new GoogleGenAI({
    vertexai: true,
    project: ensureEnv("GOOGLE_CLOUD_PROJECT"),
    location: ensureEnv("GOOGLE_CLOUD_LOCATION", "global"),
  });
};
|
|
108
|
+
|
|
109
|
+
/**
 * Download a media file and extract its audio track as MP3.
 *
 * The response body is streamed to a temporary directory, probed with the
 * bundled ffprobe for its duration, transcoded with the bundled ffmpeg
 * (`-vn`, libmp3lame, `-q:a 2`), and the resulting MP3 is read into memory.
 * The temporary directory is removed on every exit path.
 *
 * @param {string} videoUrl - URL that `fetch` can download.
 * @returns {Promise<{ audioBuffer: Buffer, duration: number }>} MP3 bytes and
 *   the duration in seconds as reported by ffprobe (0 when probing fails).
 * @throws {Error} When the download fails, the response has no body, or
 *   ffmpeg fails.
 */
const extractAudioBufferFromVideo = async (videoUrl) => {
  const videoResponse = await fetch(videoUrl);
  if (!videoResponse.ok) {
    throw new Error(`Failed to download video: ${videoResponse.status} ${videoResponse.statusText}`);
  }
  if (!videoResponse.body) {
    throw new Error("Video response has no body");
  }

  const tmpBase = await mkdtemp(join(tmpdir(), 'mcp-'));
  try {
    const inputPath = join(tmpBase, 'input_video');
    const outputPath = join(tmpBase, 'output_audio.mp3');

    // Stream the response directly to disk to avoid holding the full video in memory.
    await pipelineAsync(
      Readable.fromWeb(videoResponse.body),
      fs.createWriteStream(inputPath)
    );

    // Duration is best-effort here: a probe failure is reported as 0 and left
    // for the caller to judge.
    let duration = 0;
    try {
      const { stdout } = await execFileAsync(ffprobePath, [
        '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        inputPath,
      ]);
      duration = parseFloat(stdout.toString().trim()) || 0;
    } catch (err) {
      console.warn(
        'Failed to get duration using ffprobe, duration will be 0',
        err instanceof Error ? err.message : err
      );
    }

    try {
      await execFileAsync(ffmpegPath, [
        '-y',
        '-i', inputPath,
        '-vn',
        '-acodec', 'libmp3lame',
        '-q:a', '2',
        outputPath,
      ]);
    } catch (err) {
      // Prefer ffmpeg's own stderr; it is usually more specific than the exec error.
      const stderr = err?.stderr?.toString?.().trim?.() || "";
      const msg = stderr || (err instanceof Error ? err.message : String(err));
      throw new Error(`ffmpeg execution failed: ${msg}`, { cause: err });
    }

    const audioBuffer = await readFile(outputPath);
    return { audioBuffer, duration };
  } finally {
    // Runs on success and on every failure above, so /tmp never accumulates
    // downloads across warm invocations.
    await rm(tmpBase, { recursive: true, force: true });
  }
};
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
/**
 * Compose the subtitle-generation instructions sent to the Gemini model.
 *
 * The duration is given in seconds and rendered into the prompt in
 * milliseconds (rounded), both as an input fact and as the upper bound for
 * every end timestamp.
 *
 * @param {number} duration - Audio duration in seconds.
 * @param {string} language - Human-readable target language.
 * @param {string} languageFont - Desired script/font name.
 * @returns {string} Instruction prompt for the model.
 */
const buildPrompt = (duration, language, languageFont) => {
  const durationMs = Math.round(duration * 1000);

  const promptLines = [
    `You are a professional subtitle and transcription engine.`,
    ``,
    `## INPUT`,
    `- Audio duration: ${durationMs} milliseconds`,
    `- Target language: ${language}`,
    `- Subtitle font script: ${languageFont}`,
    ``,
    `## OBJECTIVE`,
    `Transcribe the audio into clear, readable subtitles.`,
    ``,
    `If the spoken audio is NOT in ${language}, translate it into ${language} before generating subtitles.`,
    ``,
    `## SUBTITLE SEGMENTATION RULES`,
    `- Split speech into short, natural phrases.`,
    `- Each subtitle phrase MUST contain a maximum of 4 words.`,
    `- Do NOT split words across phrases.`,
    `- Avoid breaking phrases mid-sentence unless required by timing constraints.`,
    ``,
    `## TIMING RULES (STRICT — MUST FOLLOW)`,
    `- All timestamps are in **milliseconds**.`,
    `- Each subtitle object MUST include:`,
    `- 's': start timestamp`,
    `- 'e': end timestamp`,
    `- Duration of each phrase = 'e - s'`,
    `- Minimum phrase duration: **100 ms**`,
    `- 'e' MUST be greater than 's'`,
    `- 'e' MUST be **less than or equal to ${durationMs}**`,
    `- Subtitles MUST be sequential:`,
    `- 's' of the next phrase MUST be **greater than or equal to** the previous 'e'`,
    `- NO overlapping timestamps`,
    `- Prefer aligning timestamps with natural speech pauses.`,
    ``,
    `## TEXT RULES`,
    `- 't' MUST be written using ${languageFont} characters.`,
    `- No emojis.`,
    `- No punctuation-only subtitles.`,
    `- Normalize casing according to the target language's writing system.`,
    `- Remove filler sounds (e.g., “um”, “uh”) unless semantically important.`,
    ``,
    `## OUTPUT FORMAT (CRITICAL)`,
    `Return ONLY a valid JSON array.`,
    `- No markdown`,
    `- No code blocks`,
    `- No explanations`,
    `- No additional text`,
    `- Output MUST start with '[' and end with ']'`,
    ``,
    `## OUTPUT SCHEMA`,
    `[`,
    `{`,
    `"t": "Subtitle text",`,
    `"s": 0,`,
    `"e": 1200`,
    `}`,
    `]`,
  ];

  return promptLines.join("\n");
};
|
|
232
|
+
|
|
233
|
+
/**
 * Transcribe the audio track of a video (or audio) URL into JSON subtitles
 * using Google GenAI on Vertex AI.
 *
 * The media is downloaded and converted to MP3, sent inline to the model
 * together with the captioning prompt, and the model's reply is parsed as a
 * JSON array. If the reply cannot be parsed, a warning is logged and an empty
 * `subtitles` array is returned rather than throwing.
 *
 * The model is taken from GOOGLE_VERTEX_MODEL, defaulting to
 * "gemini-2.5-flash-lite".
 *
 * @param {Object} params
 * @param {string} params.videoUrl - Publicly reachable video/audio URL.
 * @param {string} [params.language="english"] - Target transcription language (human-readable).
 * @param {string} [params.languageFont="english"] - Target font/script for subtitles.
 * @returns {Promise<{ subtitles: Array<{t: string, s: number, e: number}>, duration: number, videoUrl: string }>}
 *   Parsed subtitles (text, start ms, end ms), the media duration in seconds,
 *   and the input URL.
 * @throws {Error} When videoUrl is missing, the duration cannot be determined,
 *   or downstream calls (download, ffmpeg, model) fail.
 */
export const transcribeVideoUrl = async (params) => {
  const {
    videoUrl,
    language = "english",
    languageFont = "english",
  } = params || {};

  if (!videoUrl) {
    throw new Error("Missing required parameter: videoUrl");
  }

  const { audioBuffer, duration } = await extractAudioBufferFromVideo(videoUrl);
  if (!duration) {
    // The prompt bounds every timestamp by the duration, so it is mandatory.
    throw new Error("Failed to get duration of video");
  }

  const prompt = buildPrompt(duration, language, languageFont);

  const client = await createGenAIClient();
  const modelName = process.env.GOOGLE_VERTEX_MODEL || "gemini-2.5-flash-lite";

  const generationConfig = {
    maxOutputTokens: 65535,
    temperature: 1,
    topP: 0.95,
    thinkingConfig: {
      thinkingBudget: 0,
    },
    safetySettings: [
      { category: "HARM_CATEGORY_HATE_SPEECH", threshold: "OFF" },
      { category: "HARM_CATEGORY_DANGEROUS_CONTENT", threshold: "OFF" },
      { category: "HARM_CATEGORY_SEXUALLY_EXPLICIT", threshold: "OFF" },
      { category: "HARM_CATEGORY_HARASSMENT", threshold: "OFF" },
    ],
  };

  const req = {
    model: modelName,
    contents: [
      {
        role: "user",
        parts: [
          {
            inlineData: {
              data: audioBuffer.toString("base64"),
              mimeType: "audio/mpeg",
            },
          },
          { text: prompt },
        ],
      },
    ],
    config: generationConfig,
  };

  const response = await client.models.generateContent(req);

  // Trim first: the fence patterns below are anchored to the start/end of the
  // text and would miss a fence preceded by whitespace or a newline.
  const textPart = (response.text || "")
    .trim()
    .replace(/^```json\s*/i, "") // Remove opening ```json
    .replace(/^```\s*/i, "") // Remove opening ```
    .replace(/\s*```$/i, "") // Remove closing ```
    .trim();

  let subtitles = [];
  try {
    // Take the outermost [...] span in case the model added extra text around it.
    const jsonMatch = textPart.match(/\[[\s\S]*\]/);
    const jsonText = jsonMatch ? jsonMatch[0] : textPart;

    subtitles = JSON.parse(jsonText);
    if (!Array.isArray(subtitles)) {
      throw new Error("Parsed subtitles are not an array");
    }
  } catch (err) {
    console.warn(
      "Failed to parse model output as JSON subtitles, returning empty subtitles",
      err
    );
    console.warn("Raw response text:", textPart.substring(0, 500));
    subtitles = [];
  }

  return {
    subtitles,
    duration,
    videoUrl,
  };
};
|
package/package.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@twick/cloud-transcript",
|
|
3
|
+
"version": "0.15.1",
|
|
4
|
+
"description": "Twick cloud function for generating JSON captions from audio or video using Google Gemini models on Vertex AI",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "core/transcriber.js",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": "./core/transcriber.js",
|
|
9
|
+
"./aws": "./platform/aws/handler.js",
|
|
10
|
+
"./platform/aws/*": "./platform/aws/*"
|
|
11
|
+
},
|
|
12
|
+
"bin": {
|
|
13
|
+
"twick-transcript": "bin/twick-transcript.js"
|
|
14
|
+
},
|
|
15
|
+
"files": [
|
|
16
|
+
"core",
|
|
17
|
+
"platform",
|
|
18
|
+
"bin",
|
|
19
|
+
"README.md"
|
|
20
|
+
],
|
|
21
|
+
"scripts": {
|
|
22
|
+
"test": "node --test test/transcriber.test.js",
|
|
23
|
+
"verify:aws": "node -e \"require('fs').accessSync('platform/aws/Dockerfile'); require('fs').accessSync('platform/aws/handler.js'); console.log('AWS transcript function assets present')\"",
|
|
24
|
+
"pack:aws": "npm run verify:aws && npm pack",
|
|
25
|
+
"release:aws": "npm run verify:aws && npm publish --access public --tag aws",
|
|
26
|
+
"deploy:aws": "node scripts/deploy-aws.js",
|
|
27
|
+
"prepublishOnly": "npm run verify:aws"
|
|
28
|
+
},
|
|
29
|
+
"publishConfig": {
|
|
30
|
+
"access": "public",
|
|
31
|
+
"tag": "aws"
|
|
32
|
+
},
|
|
33
|
+
"keywords": [
|
|
34
|
+
"twick",
|
|
35
|
+
"audio",
|
|
36
|
+
"transcript",
|
|
37
|
+
"caption",
|
|
38
|
+
"lambda",
|
|
39
|
+
"aws",
|
|
40
|
+
"docker",
|
|
41
|
+
"google-cloud-speech"
|
|
42
|
+
],
|
|
43
|
+
"author": "",
|
|
44
|
+
"license": "SEE LICENSE IN LICENSE.md",
|
|
45
|
+
"engines": {
|
|
46
|
+
"node": ">=20.0.0"
|
|
47
|
+
},
|
|
48
|
+
"dependencies": {
|
|
49
|
+
"@google/genai": "^1.0.0",
|
|
50
|
+
"@aws-sdk/client-secrets-manager": "^3.679.0",
|
|
51
|
+
"@ffmpeg-installer/ffmpeg": "^1.1.0",
|
|
52
|
+
"@ffprobe-installer/ffprobe": "^1.1.0"
|
|
53
|
+
},
|
|
54
|
+
"devDependencies": {
|
|
55
|
+
"typescript": "~5.4.5",
|
|
56
|
+
"dotenv": "^16.4.5"
|
|
57
|
+
}
|
|
58
|
+
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
FROM --platform=linux/amd64 public.ecr.aws/lambda/nodejs:20
|
|
2
|
+
|
|
3
|
+
# Copy package files for better caching
|
|
4
|
+
COPY package.json package-lock.json* ./
|
|
5
|
+
|
|
6
|
+
RUN npm install
|
|
7
|
+
|
|
8
|
+
# Copy source code
|
|
9
|
+
COPY . ./
|
|
10
|
+
|
|
11
|
+
# Default Lambda handler
|
|
12
|
+
CMD ["platform/aws/handler.handler"]
|
|
13
|
+
|
|
14
|
+
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
import { transcribeVideoUrl } from '@twick/cloud-transcript';
|
|
2
|
+
|
|
3
|
+
/**
 * Wrap a payload in a Lambda proxy-style response with JSON and CORS headers.
 *
 * @param {number} statusCode - HTTP status code to report.
 * @param {*} body - Value serialized with JSON.stringify into the response body.
 * @returns {{ statusCode: number, headers: Object<string, string>, body: string }}
 */
const jsonResponse = (statusCode, body) => {
  const headers = {
    'Content-Type': 'application/json',
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Allow-Methods': 'POST, OPTIONS',
  };

  return { statusCode, headers, body: JSON.stringify(body) };
};
|
|
13
|
+
|
|
14
|
+
/**
 * AWS Lambda handler that generates subtitles for a video/audio URL.
 *
 * The request arguments are taken from `event.arguments` (resolver-style
 * invocation) or, failing that, from the JSON in `event.body` (HTTP proxy
 * invocation; base64-encoded and already-parsed bodies are accepted too):
 * {
 *   "videoUrl": "https://example.com/audio.mp3", // required
 *   "language": "english",                       // optional
 *   "languageFont": "english"                    // optional
 * }
 *
 * Responses:
 *  - 204 for CORS preflight (OPTIONS), for both `event.httpMethod` and
 *    `event.requestContext.http.method` style events.
 *  - 400 when the body is not valid JSON or `videoUrl` is missing.
 *  - 200 with the fields returned by `transcribeVideoUrl`.
 *  - 500 with the error message when transcription fails.
 *
 * @param {object} event - Lambda invocation event.
 * @returns {Promise<{ statusCode: number, headers: object, body: string }>}
 */
export const handler = async (event) => {
  console.log('Transcript function invoked');
  console.log('Event:', JSON.stringify(event));

  // REST-style events expose httpMethod; HTTP API v2 events nest the method
  // under requestContext.http.
  const method = event?.httpMethod ?? event?.requestContext?.http?.method;
  if (method === 'OPTIONS') {
    return {
      statusCode: 204,
      headers: {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Allow-Methods': 'POST, OPTIONS',
      },
      body: '',
    };
  }

  try {
    let argumentsPayload = event?.arguments;

    if (!argumentsPayload && event?.body) {
      if (typeof event.body === 'string') {
        const rawBody = event.isBase64Encoded
          ? Buffer.from(event.body, 'base64').toString('utf8')
          : event.body;
        try {
          argumentsPayload = JSON.parse(rawBody);
        } catch {
          // A malformed request is the caller's fault, not a server error.
          return jsonResponse(400, { error: 'Invalid JSON in request body' });
        }
      } else {
        // Direct invocations may pass an already-parsed body.
        argumentsPayload = event.body;
      }
    }

    if (argumentsPayload === null || typeof argumentsPayload !== 'object') {
      argumentsPayload = {};
    }

    const { videoUrl, language, languageFont } = argumentsPayload;

    if (!videoUrl) {
      return jsonResponse(400, {
        error: 'Missing required field: videoUrl',
        expectedFormat: {
          videoUrl: 'Publicly reachable HTTP(S) URL of the video or audio file',
          language: 'Optional language (e.g., "english", "hindi")',
          languageFont: 'Optional font/script for captions (e.g., "english")',
        },
      });
    }

    const result = await transcribeVideoUrl({
      videoUrl,
      language,
      languageFont,
    });

    console.log('Transcription completed successfully');

    return jsonResponse(200, {
      ...result,
    });
  } catch (error) {
    console.error('Error generating transcript:', error);

    return jsonResponse(500, {
      error: 'Internal server error',
      message: error instanceof Error ? error.message : 'Unknown error',
    });
  }
};
|
|
89
|
+
|
|
90
|
+
|