@twick/cloud-transcript 0.15.14 → 0.15.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -1
- package/core/audio.utils.js +161 -0
- package/core/gc.utils.js +177 -0
- package/core/index.js +1 -0
- package/core/transcriber.js +185 -301
- package/core/workflow.js +48 -0
- package/package.json +5 -3
- package/platform/aws/handler.js +20 -16
package/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
**Transcribe audio/video to JSON captions using Google GenAI (Vertex AI) with Gemini models.**
|
|
4
4
|
|
|
5
|
-
Extract text from audio content with precise millisecond timestamps. Perfect for generating
|
|
5
|
+
Extract text from audio content with precise millisecond timestamps. Perfect for generating caption data from audio files or video URLs.
|
|
6
6
|
|
|
7
7
|
## What Problem Does This Solve?
|
|
8
8
|
|
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
import fs from "fs";
|
|
2
|
+
import { join } from "path";
|
|
3
|
+
import { mkdtemp, readFile, rm } from "fs/promises";
|
|
4
|
+
import { tmpdir } from "os";
|
|
5
|
+
import { execFile } from "child_process";
|
|
6
|
+
import { promisify } from "util";
|
|
7
|
+
import { Readable, pipeline } from "stream";
|
|
8
|
+
|
|
9
|
+
// These packages provide prebuilt ffmpeg/ffprobe binaries. Types are not bundled,
|
|
10
|
+
// so we import them as `any` to keep TypeScript satisfied.
|
|
11
|
+
import ffmpeg from "@ffmpeg-installer/ffmpeg";
|
|
12
|
+
import ffprobe from "@ffprobe-installer/ffprobe";
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
const execFileAsync = promisify(execFile);
|
|
16
|
+
const pipelineAsync = promisify(pipeline);
|
|
17
|
+
const ffmpegPath = ffmpeg.path;
|
|
18
|
+
const ffprobePath = ffprobe.path;
|
|
19
|
+
|
|
20
|
+
/**
 * Per-format audio encoding settings, keyed by format name.
 *
 * Each entry holds the codec name handed to ffmpeg, the encoding label, the
 * target sample rate, the channel count, the output file extension and the
 * MIME content type. "FLAC" is the only format defined.
 * @type {Object<string, Object>}
 */
export const AUDIO_CONFIG = {
  FLAC: {
    codec: "flac",
    encoding: "FLAC",
    sampleRate: 16000,
    channelCount: 1,
    extension: "flac",
    contentType: "audio/flac",
  },
};
|
|
35
|
+
|
|
36
|
+
/**
 * Extracts the audio track of a remote video as an encoded audio buffer.
 *
 * Streams the video into a temporary directory, reads its duration with
 * ffprobe (best effort), transcodes the audio with ffmpeg using the settings
 * in AUDIO_CONFIG, and returns the encoded bytes. The temporary directory is
 * removed on every exit path, including download and read failures.
 *
 * @param {string} videoUrl - Publicly accessible HTTP(S) URL to the video file
 * @param {string} [format="FLAC"] - Audio output format (must be a key of AUDIO_CONFIG)
 * @returns {Promise<Object>} Object containing audioBuffer (Buffer) and duration
 *   (number in seconds; 0 when ffprobe could not determine it)
 * @throws {Error} If the format is unsupported, or the download or ffmpeg extraction fails
 */
export const extractAudioBufferFromVideo = async (videoUrl, format = "FLAC") => {
  // Validate up front so an unknown format is not reported as an ffmpeg failure.
  const config = AUDIO_CONFIG[format];
  if (!config) {
    throw new Error(`Unsupported audio format: ${format}`);
  }

  const videoResponse = await fetch(videoUrl);
  if (!videoResponse.ok) {
    throw new Error(`Failed to download video: ${videoResponse.status} ${videoResponse.statusText}`);
  }
  if (!videoResponse.body) {
    throw new Error("Video response has no body");
  }

  const tmpBase = await mkdtemp(join(tmpdir(), 'mcp-'));
  try {
    const inputPath = join(tmpBase, 'input_video');
    // Use the configured extension (e.g. "flac") rather than the format key.
    const outputPath = join(tmpBase, `output_audio.${config.extension}`);

    await pipelineAsync(
      Readable.fromWeb(videoResponse.body),
      fs.createWriteStream(inputPath)
    );

    // Duration is best effort: a probe failure leaves it at 0 and does not abort.
    let duration = 0;
    try {
      const { stdout } = await execFileAsync(ffprobePath, [
        '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        inputPath,
      ]);
      duration = parseFloat(stdout.toString().trim()) || 0;
    } catch (err) {
      console.warn('Failed to get duration using ffprobe', err?.message ?? err);
    }

    try {
      await execFileAsync(ffmpegPath, [
        '-y',
        '-i', inputPath,
        '-vn', // Strip video
        '-ac', String(config.channelCount),
        // execFile arguments are documented as strings, so convert the number.
        '-ar', String(config.sampleRate),
        '-c:a', config.codec,
        outputPath,
      ]);
    } catch (err) {
      const stderr = err?.stderr?.toString?.().trim?.() || "";
      throw new Error(`ffmpeg extraction failed: ${stderr}`, { cause: err });
    }

    const audioBuffer = await readFile(outputPath);
    return { audioBuffer, duration };
  } finally {
    await rm(tmpBase, { recursive: true, force: true });
  }
};
|
|
97
|
+
|
|
98
|
+
/**
 * Downloads audio from a URL, converts it to the requested format, and returns the buffer.
 *
 * Streams the audio into a temporary directory, reads its duration with
 * ffprobe (best effort), and transcodes it with ffmpeg using the settings in
 * AUDIO_CONFIG. The temporary directory is removed on every exit path,
 * including download and read failures.
 *
 * @param {string} audioUrl - Publicly accessible HTTP(S) URL to the audio file
 * @param {string} [format="FLAC"] - Audio output format (must be a key of AUDIO_CONFIG)
 * @returns {Promise<Object>} Object containing audioBuffer (Buffer) and duration
 *   (number in seconds; 0 when ffprobe could not determine it)
 * @throws {Error} If the format is unsupported, or the download or ffmpeg conversion fails
 */
export const extractAudioBufferFromAudioUrl = async (audioUrl, format = "FLAC") => {
  const config = AUDIO_CONFIG[format];
  if (!config) {
    throw new Error(`Unsupported audio format: ${format}`);
  }

  const audioResponse = await fetch(audioUrl);
  if (!audioResponse.ok) {
    throw new Error(`Failed to download audio: ${audioResponse.status} ${audioResponse.statusText}`);
  }
  if (!audioResponse.body) {
    throw new Error("Audio response has no body");
  }

  const tmpBase = await mkdtemp(join(tmpdir(), 'audio-'));
  try {
    const inputPath = join(tmpBase, 'input_audio');
    const outputPath = join(tmpBase, `output_audio.${config.extension}`);

    await pipelineAsync(
      Readable.fromWeb(audioResponse.body),
      fs.createWriteStream(inputPath)
    );

    // Duration is best effort: a probe failure leaves it at 0 and does not abort.
    let duration = 0;
    try {
      const { stdout } = await execFileAsync(ffprobePath, [
        '-v', 'error',
        '-show_entries', 'format=duration',
        '-of', 'default=noprint_wrappers=1:nokey=1',
        inputPath,
      ]);
      duration = parseFloat(stdout.toString().trim()) || 0;
    } catch (err) {
      console.warn('Failed to get duration using ffprobe', err?.message ?? err);
    }

    try {
      await execFileAsync(ffmpegPath, [
        '-y',
        '-i', inputPath,
        '-ac', String(config.channelCount),
        // execFile arguments are documented as strings, so convert the number.
        '-ar', String(config.sampleRate),
        '-c:a', config.codec,
        outputPath,
      ]);
    } catch (err) {
      const stderr = err?.stderr?.toString?.().trim?.() || "";
      throw new Error(`ffmpeg conversion failed: ${stderr}`, { cause: err });
    }

    const audioBuffer = await readFile(outputPath);
    return { audioBuffer, duration };
  } finally {
    await rm(tmpBase, { recursive: true, force: true });
  }
};
|
package/core/gc.utils.js
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
import { Storage } from "@google-cloud/storage";
|
|
2
|
+
import {
|
|
3
|
+
SecretsManagerClient,
|
|
4
|
+
GetSecretValueCommand,
|
|
5
|
+
} from "@aws-sdk/client-secrets-manager";
|
|
6
|
+
import fs from "fs";
|
|
7
|
+
|
|
8
|
+
/**
 * Google Cloud Project ID, read from the GOOGLE_CLOUD_PROJECT environment variable.
 * Undefined when the variable is not set.
 * @type {string}
 */
export const CLOUD_PROJECT_ID = process.env.GOOGLE_CLOUD_PROJECT;

/**
 * Google Cloud region for Speech-to-Text API. Currently set to "global".
 * @type {string}
 */
export const CLOUD_REGION = "global";

/**
 * AWS region, read from the AWS_REGION environment variable.
 * Undefined when the variable is not set.
 * @type {string}
 */
export const AWS_REGION = process.env.AWS_REGION;

/**
 * Google Cloud Storage bucket name for storing audio files and project exports.
 * Read from the GOOGLE_CLOUD_STORAGE_BUCKET environment variable; undefined
 * when the variable is not set.
 * @type {string}
 */
export const CLOUD_STORAGE_BUCKET = process.env.GOOGLE_CLOUD_STORAGE_BUCKET;
|
|
27
|
+
|
|
28
|
+
// Module-level cache so the secret is fetched at most once per successful load.
let googleCredentials = null;

/**
 * Retrieves Google Cloud service account credentials from AWS Secrets Manager.
 *
 * If GCP_SERVICE_ACCOUNT_SECRET_NAME is set, the secret's string value is
 * fetched, parsed as JSON, validated, and cached for later calls.
 * If not set, returns undefined (useful when credentials are provided via
 * GOOGLE_APPLICATION_CREDENTIALS).
 *
 * @returns {Promise<Object|undefined>} Parsed JSON credentials object or undefined
 * @throws {Error} If fetching from Secrets Manager fails, the secret has no
 *   string value, the value is not a JSON object, or required fields are missing
 */
export const getGoogleCredentials = async () => {
  if (googleCredentials) {
    return googleCredentials;
  }
  try {
    const secretName = process.env.GCP_SERVICE_ACCOUNT_SECRET_NAME;
    if (!secretName) {
      console.log(
        "No secret name configured, skipping Google credentials initialization"
      );
      return;
    }

    const client = new SecretsManagerClient({
      region: process.env.AWS_REGION || "ap-south-1",
    });

    const response = await client.send(
      new GetSecretValueCommand({
        SecretId: secretName,
        VersionStage: "AWSCURRENT", // VersionStage defaults to AWSCURRENT if unspecified
      })
    );

    // A missing or blank SecretString would otherwise surface as an opaque
    // JSON.parse error.
    const secretString = response?.SecretString;
    if (typeof secretString !== "string" || secretString.trim() === "") {
      throw new Error(
        `Secret '${secretName}' has no SecretString value; expected service account JSON.`
      );
    }

    let parsedCredentials;
    try {
      parsedCredentials = JSON.parse(secretString);
    } catch (parseError) {
      throw new Error(`Secret '${secretName}' does not contain valid JSON.`, {
        cause: parseError,
      });
    }
    if (parsedCredentials === null || typeof parsedCredentials !== "object") {
      throw new Error(
        `Invalid Google Cloud credentials: secret '${secretName}' must contain a JSON object.`
      );
    }

    // Validate that the credentials contain required fields
    if (!parsedCredentials.client_email) {
      throw new Error(
        `Invalid Google Cloud credentials: missing 'client_email' field. ` +
          `The secret must contain a valid service account JSON with 'client_email', ` +
          `'private_key', and 'type' fields.`
      );
    }

    if (!parsedCredentials.private_key) {
      throw new Error(
        `Invalid Google Cloud credentials: missing 'private_key' field.`
      );
    }

    // An unexpected type is only warned about, not rejected.
    if (parsedCredentials.type !== "service_account") {
      console.warn(
        `Warning: credentials type is '${parsedCredentials.type}', expected 'service_account'`
      );
    }

    googleCredentials = parsedCredentials;
    return googleCredentials;
  } catch (error) {
    console.error(
      `Failed to initialize Google credentials from secret ::`,
      error
    );
    throw error;
  }
};
|
|
95
|
+
|
|
96
|
+
let storage = null;

/**
 * Returns the shared Google Cloud Storage client, building it on first use
 * from CLOUD_PROJECT_ID and the credentials returned by getGoogleCredentials().
 *
 * @returns {Promise<Storage>} Initialized Storage client
 */
const getStorage = async () => {
  if (storage) {
    return storage;
  }

  const credentials = await getGoogleCredentials();
  storage = new Storage({ projectId: CLOUD_PROJECT_ID, credentials });
  return storage;
};
|
|
112
|
+
|
|
113
|
+
/**
 * Uploads a file to Google Cloud Storage.
 *
 * The object is written to `<folder>/<fileName>` (or just `<fileName>` when no
 * folder is given) in the bucket named by CLOUD_STORAGE_BUCKET, using a
 * non-resumable upload.
 *
 * @param {Object} params - Upload parameters
 * @param {Buffer|string} params.data - File data to upload (Buffer or string)
 * @param {string} [params.folder] - Optional folder path in the bucket
 * @param {string} params.fileName - Name of the file to create
 * @param {string} params.contentType - MIME type of the file
 * @param {boolean} [params.isPublic=false] - If true, returns a signed URL valid for 1 hour
 * @returns {Promise<string>} Signed read URL (if isPublic=true), otherwise the
 *   https://storage.googleapis.com/<bucket>/<path> URL of the uploaded file
 * @throws {Error} If the bucket is not configured, fileName is missing, or the upload fails
 */
export const uploadFile = async ({
  data,
  folder,
  fileName,
  contentType,
  isPublic = false,
}) => {
  const bucketName = CLOUD_STORAGE_BUCKET;
  if (!bucketName) {
    throw new Error(
      "Cloud Storage bucket is not configured (GOOGLE_CLOUD_STORAGE_BUCKET)"
    );
  }
  if (!fileName) {
    throw new Error("uploadFile requires a fileName");
  }

  const bucket = (await getStorage()).bucket(bucketName);

  const destinationPath = `${folder ? `${folder}/` : ""}${fileName}`;
  const file = bucket.file(destinationPath);

  await file.save(data, {
    contentType,
    resumable: false,
  });

  if (isPublic) {
    // Generate a signed URL valid for 1 hour instead of making the file public.
    // Computed from the epoch so it is exactly one hour regardless of local
    // time-zone or DST transitions.
    const ONE_HOUR_MS = 60 * 60 * 1000;
    const expires = new Date(Date.now() + ONE_HOUR_MS);

    const [signedUrl] = await file.getSignedUrl({
      version: "v4",
      action: "read",
      expires,
    });
    return signedUrl;
  }

  return `https://storage.googleapis.com/${bucketName}/${destinationPath}`;
};
|
|
159
|
+
|
|
160
|
+
/**
 * Converts a Google Cloud Storage URL to a gs:// URI format.
 *
 * An https://storage.googleapis.com/<bucket>/<path> URL has its prefix swapped
 * for gs://; a value that already starts with gs:// is returned unchanged.
 *
 * @param {string} URI - GCS URL (https://storage.googleapis.com/...) or gs:// URI
 * @returns {string} gs:// URI format
 * @throws {Error} If the URI is not a string or its format is invalid
 */
export const getGCSUri = (URI) => {
  const httpsPrefix = "https://storage.googleapis.com/";

  // Non-string input falls through to the descriptive error below instead of
  // failing with "startsWith is not a function".
  if (typeof URI === "string") {
    if (URI.startsWith(httpsPrefix)) {
      return `gs://${URI.slice(httpsPrefix.length)}`;
    }
    if (URI.startsWith("gs://")) {
      return URI;
    }
  }

  throw new Error(
    `Invalid audio URI format. Expected gs://bucket/path or https://storage.googleapis.com/bucket/path, got: ${String(URI)}`
  );
};
|
package/core/index.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export { transcribe } from "./workflow.js";
|
package/core/transcriber.js
CHANGED
|
@@ -1,347 +1,231 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { SpeechClient } from "@google-cloud/speech/build/src/v2/index.js";
|
|
2
2
|
import {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
import {
|
|
10
|
-
import { execFile } from "child_process";
|
|
11
|
-
import { promisify } from "util";
|
|
12
|
-
import { Readable, pipeline } from "stream";
|
|
13
|
-
|
|
14
|
-
// These packages provide prebuilt ffmpeg/ffprobe binaries. Types are not bundled,
|
|
15
|
-
// so we import them as `any` to keep TypeScript satisfied.
|
|
16
|
-
import ffmpeg from "@ffmpeg-installer/ffmpeg";
|
|
17
|
-
import ffprobe from "@ffprobe-installer/ffprobe";
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
const execFileAsync = promisify(execFile);
|
|
21
|
-
const pipelineAsync = promisify(pipeline);
|
|
22
|
-
const ffmpegPath = ffmpeg.path;
|
|
23
|
-
const ffprobePath = ffprobe.path;
|
|
3
|
+
CLOUD_PROJECT_ID,
|
|
4
|
+
CLOUD_REGION,
|
|
5
|
+
getGCSUri,
|
|
6
|
+
getGoogleCredentials,
|
|
7
|
+
uploadFile,
|
|
8
|
+
} from "./gc.utils.js";
|
|
9
|
+
import { AUDIO_CONFIG } from "./audio.utils.js";
|
|
24
10
|
|
|
25
11
|
/**
 * Language code mapping for Google Speech-to-Text API.
 * Keys are the language names accepted by the transcribe functions in this
 * file; values are the codes placed in the request's languageCodes list.
 * Only "english" is mapped.
 * @type {Object<string, string>}
 */
const LANGUAGE_CODE = {
  english: "en-US",
};

/**
 * Speech recognition model to use. "long" model is optimized for longer audio files.
 * Sent as the `model` field of every recognition request built in this file.
 * @type {string}
 */
const MODEL = "long";
|
|
61
24
|
|
|
62
|
-
|
|
63
|
-
const client = new SecretsManagerClient({
|
|
64
|
-
region: process.env.AWS_REGION || "ap-south-1",
|
|
65
|
-
});
|
|
25
|
+
let speechClient = null;

/**
 * Returns the shared Speech-to-Text client, building it on first use from
 * CLOUD_PROJECT_ID, CLOUD_REGION and the credentials returned by
 * getGoogleCredentials().
 *
 * @returns {Promise<SpeechClient>} Initialized SpeechClient instance
 */
export const getSpeechClient = async () => {
  if (speechClient) {
    return speechClient;
  }

  const credentials = await getGoogleCredentials();
  speechClient = new SpeechClient({
    projectId: CLOUD_PROJECT_ID,
    region: CLOUD_REGION,
    credentials,
  });
  return speechClient;
};
|
|
88
42
|
|
|
89
43
|
/**
 * Recognizer resource path for Google Speech-to-Text API v2.
 * Built once at module load from CLOUD_PROJECT_ID and CLOUD_REGION, so if the
 * project ID is undefined at that point the path contains the literal text
 * "undefined".
 * @type {string}
 */
const recognizer = `projects/${CLOUD_PROJECT_ID}/locations/${CLOUD_REGION}/recognizers/_`;
|
|
108
48
|
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
49
|
+
/**
 * Processes one Speech-to-Text result and groups its words into fixed-size phrases.
 *
 * Words are taken from the first alternative of the result. Each phrase is
 * `{ t, s, e, w }`: `t` is the words joined by spaces, `s` the start of the
 * first word, `e` the end of the last word, and `w` the start of every word,
 * all rounded to whole milliseconds.
 *
 * @param {Object} results - A single result object holding `alternatives[0].words`
 * @param {number} [wordsPerPhrase=4] - Maximum number of words per phrase
 * @returns {Array<Object>} Array of phrase objects with text, start time, end time, and word timings
 * @throws {RangeError} If wordsPerPhrase is not a positive integer
 */
const processResponse = (results, wordsPerPhrase = 4) => {
  // A zero or negative step would make the grouping loop never terminate.
  if (!Number.isInteger(wordsPerPhrase) || wordsPerPhrase < 1) {
    throw new RangeError(
      `wordsPerPhrase must be a positive integer, got: ${wordsPerPhrase}`
    );
  }

  const words = results?.alternatives?.[0]?.words || [];
  if (words.length === 0) {
    return [];
  }

  // Offsets are { seconds, nanos }; a missing offset or field counts as 0.
  const convertToMs = (offset) => {
    if (!offset) return 0;
    const seconds = Number(offset.seconds || 0);
    const nanos = Number(offset.nanos || 0);
    return seconds * 1000 + nanos / 1e6;
  };

  const timedWords = words.map((w) => ({
    word: w.word,
    startMs: convertToMs(w.startOffset),
    endMs: convertToMs(w.endOffset),
  }));

  const phrases = [];
  for (let i = 0; i < timedWords.length; i += wordsPerPhrase) {
    const group = timedWords.slice(i, i + wordsPerPhrase);
    phrases.push({
      t: group.map((w) => w.word).join(" "),
      s: Math.round(group[0].startMs),
      e: Math.round(group[group.length - 1].endMs),
      w: group.map((w) => Math.round(w.startMs)),
    });
  }
  return phrases;
};
|
|
161
96
|
|
|
162
|
-
|
|
163
97
|
/**
 * Transcribes short audio (typically under 60 seconds) using Google Speech-to-Text API.
 * Uses the synchronous recognize method and converts every returned result
 * segment into phrases, in order.
 *
 * @param {Object} params - Transcription parameters
 * @param {Buffer} params.audioBuffer - Audio data buffer, encoded in the given format
 * @param {string} [params.language="english"] - Language name (must be a key of LANGUAGE_CODE)
 * @param {string} [params.format="FLAC"] - Audio format (must be a key of AUDIO_CONFIG)
 * @returns {Promise<Array<Object>>} Array of phrase objects with text, timings, and word offsets
 * @throws {Error} If the format or language is unsupported, audioBuffer is missing,
 *   or transcription fails
 */
export async function transcribeShort({
  audioBuffer,
  language = "english",
  format = "FLAC",
}) {
  // Reject bad input before any network call instead of sending `undefined`
  // values to the API.
  const audioConfig = AUDIO_CONFIG[format];
  if (!audioConfig) {
    throw new Error(`Unsupported audio format: ${format}`);
  }
  const languageCode = LANGUAGE_CODE[language];
  if (!languageCode) {
    throw new Error(`Unsupported language: ${language}`);
  }
  if (!audioBuffer) {
    throw new Error("audioBuffer is required");
  }

  const client = await getSpeechClient();

  const request = {
    recognizer: recognizer,
    config: {
      explicitDecodingConfig: {
        encoding: audioConfig.encoding,
        sampleRateHertz: audioConfig.sampleRate,
        audioChannelCount: 1,
      },
      languageCodes: [languageCode],
      model: MODEL,
      features: {
        enableWordTimeOffsets: true,
      },
    },
    content: audioBuffer.toString("base64"),
  };

  try {
    const [response] = await client.recognize(request);
    // The response may contain several result segments; keep all of them
    // rather than only the first.
    const results = response?.results ?? [];
    return results.flatMap((result) => processResponse(result));
  } catch (err) {
    console.error("Transcription Error:", err.message);
    throw err;
  }
}
|
|
232
142
|
|
|
233
143
|
/**
 * Transcribes long audio (typically over 60 seconds) using Google Speech-to-Text API.
 * Uses the asynchronous batchRecognize method, which reads the audio from
 * Google Cloud Storage: an audioUrl is converted to a gs:// URI, otherwise the
 * audioBuffer is uploaded under the "audio" folder first.
 *
 * @param {Object} params - Transcription parameters
 * @param {Buffer} [params.audioBuffer] - Audio data buffer (required if audioUrl not provided)
 * @param {string} [params.audioUrl] - GCS URI (gs://) or https://storage.googleapis.com/ URL
 *   to the audio file (required if audioBuffer not provided)
 * @param {string} [params.language="english"] - Language name (must be a key of LANGUAGE_CODE)
 * @param {string} [params.format="FLAC"] - Audio format (must be a key of AUDIO_CONFIG)
 * @returns {Promise<Array<Object>>} Array of phrase objects with text, timings, and word offsets;
 *   empty when the service returned no transcript for the file
 * @throws {Error} If the input is invalid, the service reports an error for the file,
 *   or transcription fails
 */
export async function transcribeLong({
  audioBuffer,
  audioUrl,
  language = "english",
  format = "FLAC",
}) {
  // Validate before uploading so bad input does not leave a stray object in the bucket.
  const audioConfig = AUDIO_CONFIG[format];
  if (!audioConfig) {
    throw new Error(`Unsupported audio format: ${format}`);
  }
  const languageCode = LANGUAGE_CODE[language];
  if (!languageCode) {
    throw new Error(`Unsupported language: ${language}`);
  }
  if (!audioUrl && !audioBuffer) {
    throw new Error("Either audioBuffer or audioUrl is required");
  }

  let gcsUri;
  if (audioUrl) {
    gcsUri = getGCSUri(audioUrl);
  } else {
    const audioUri = await uploadFile({
      data: audioBuffer,
      folder: "audio",
      fileName: `audio-${Date.now()}.${audioConfig.extension}`,
      contentType: audioConfig.contentType,
    });
    gcsUri = getGCSUri(audioUri);
  }

  console.log("GCS URI:", gcsUri);
  const client = await getSpeechClient();

  const request = {
    recognizer: recognizer,
    config: {
      explicitDecodingConfig: {
        encoding: audioConfig.encoding,
        sampleRateHertz: audioConfig.sampleRate,
        audioChannelCount: 1,
      },
      languageCodes: [languageCode],
      model: MODEL,
      features: {
        enableWordTimeOffsets: true,
      },
    },
    files: [
      {
        uri: gcsUri,
      },
    ],
    recognitionOutputConfig: {
      inlineResponseConfig: {},
    },
  };

  try {
    const [operation] = await client.batchRecognize(request);
    console.log("Waiting for operation to complete...");
    const [response] = await operation.promise();

    // Results are keyed by the GCS URI of each input file.
    const fileResult = response.results?.[gcsUri];

    // Surface a per-file failure instead of reporting it as "no captions".
    const fileError = fileResult?.error;
    if (fileError && (fileError.code || fileError.message)) {
      throw new Error(
        `Batch recognition failed for ${gcsUri}: ${fileError.message || `code ${fileError.code}`}`
      );
    }

    // NOTE(review): the v2 API is believed to return inline output under
    // `inlineResult.transcript`, with the top-level `transcript` deprecated;
    // both are read here — confirm against the client library version in use.
    const transcript = fileResult?.inlineResult?.transcript ?? fileResult?.transcript;
    if (!transcript) {
      return [];
    }

    // batchRecognize can return multiple result segments for one file.
    const segments = transcript.results || [];
    return segments.flatMap((result) => processResponse(result));
  } catch (err) {
    console.error("Transcription Error:", err.message);
    throw err;
  }
}
|
package/core/workflow.js
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { extractAudioBufferFromAudioUrl, extractAudioBufferFromVideo } from "./audio.utils.js";
|
|
2
|
+
import { transcribeLong, transcribeShort } from "./transcriber.js";
|
|
3
|
+
|
|
4
|
+
/**
 * Transcribes the audio of a media URL into captions.
 *
 * Obtains an audio buffer and its duration from either an audio URL or a
 * video URL (`audioUrl` wins when both are supplied), hands the buffer to
 * `transcribeLong` or `transcribeShort` depending on the duration, and
 * returns the captions together with the request metadata.
 *
 * `videoSize` and `languageFont` are not used by the transcription itself;
 * they are echoed back unchanged in the result. No defaults are applied to
 * any of the optional fields here.
 *
 * @param {Object} params - Transcription parameters
 * @param {string} [params.videoUrl] - URL of a video whose audio should be transcribed
 * @param {string} [params.audioUrl] - URL of an audio file; takes precedence over `videoUrl`
 * @param {Object} [params.videoSize] - Video dimensions ({ width, height }); passed through
 * @param {string} [params.language] - Transcription language, forwarded to the transcriber
 * @param {string} [params.languageFont] - Font/script for captions; passed through
 * @returns {Promise<Object>} `{ captions, duration, audioUrl, videoUrl, videoSize, language, languageFont }`
 * @throws {Error} If neither URL is provided, if the duration or audio buffer
 *   cannot be obtained, if transcription fails, or if no captions are produced
 */
export const transcribe = async (params) => {
  // Tolerate a missing params object so the caller gets the explicit error
  // below instead of a destructuring TypeError.
  const { videoSize, videoUrl, audioUrl, language, languageFont } = params ?? {};

  // Fail fast: without this guard an undefined URL would be handed to the
  // video extractor and surface as an unrelated download/ffmpeg failure.
  if (!videoUrl && !audioUrl) {
    throw new Error("Missing required input: provide either videoUrl or audioUrl");
  }

  const { audioBuffer, duration } = audioUrl
    ? await extractAudioBufferFromAudioUrl(audioUrl)
    : await extractAudioBufferFromVideo(videoUrl);

  if (!duration) {
    throw new Error("Failed to get duration of video");
  }
  if (!audioBuffer) {
    throw new Error("Failed to get audio buffer from video");
  }

  // NOTE(review): the long/short cut-off is 6 (presumably seconds) — confirm
  // this is intentional and not meant to match a service-side limit.
  const captions =
    duration > 6
      ? await transcribeLong({ audioBuffer, language })
      : await transcribeShort({ audioBuffer, language });

  if (!captions.length) {
    throw new Error("No captions found");
  }

  console.log("Transcription successful");

  return {
    captions,
    duration,
    audioUrl,
    videoUrl,
    videoSize,
    language,
    languageFont,
  };
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@twick/cloud-transcript",
|
|
3
|
-
"version": "0.15.
|
|
3
|
+
"version": "0.15.16",
|
|
4
4
|
"description": "Twick cloud function for generating JSON captions from audio using Google Cloud Speech-to-Text",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "core/transcriber.js",
|
|
@@ -46,10 +46,12 @@
|
|
|
46
46
|
"node": ">=20.0.0"
|
|
47
47
|
},
|
|
48
48
|
"dependencies": {
|
|
49
|
-
"@google/genai": "^1.0.0",
|
|
50
49
|
"@aws-sdk/client-secrets-manager": "^3.679.0",
|
|
50
|
+
"fluent-ffmpeg": "^2.1.2",
|
|
51
51
|
"@ffmpeg-installer/ffmpeg": "^1.1.0",
|
|
52
|
-
"@ffprobe-installer/ffprobe": "^1.1.0"
|
|
52
|
+
"@ffprobe-installer/ffprobe": "^1.1.0",
|
|
53
|
+
"@google-cloud/speech": "^7.2.1",
|
|
54
|
+
"@google-cloud/storage": "^7.18.0"
|
|
53
55
|
},
|
|
54
56
|
"devDependencies": {
|
|
55
57
|
"typescript": "~5.4.5",
|
package/platform/aws/handler.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { transcribe } from '../../core/workflow.js';
|
|
2
2
|
|
|
3
3
|
const jsonResponse = (statusCode, body) => ({
|
|
4
4
|
statusCode,
|
|
@@ -16,18 +16,19 @@ const jsonResponse = (statusCode, body) => ({
|
|
|
16
16
|
*
|
|
17
17
|
* Expected JSON payload (e.g. via AppSync / Lambda resolver):
|
|
18
18
|
* {
|
|
19
|
-
* "videoUrl": "https://example.com/
|
|
20
|
-
* "
|
|
21
|
-
* "
|
|
22
|
-
* "
|
|
19
|
+
* "videoUrl": "https://example.com/video.mp4", // for video input
|
|
20
|
+
* "audioUrl": "https://example.com/audio.mp3", // OR for audio input
|
|
21
|
+
* "videoSize": { "width": 720, "height": 1280 }, // optional
|
|
22
|
+
* "language": "english", // optional
|
|
23
|
+
* "languageFont": "english" // optional
|
|
23
24
|
* }
|
|
24
25
|
*
|
|
25
26
|
* Environment variables:
|
|
26
27
|
* - GOOGLE_CLOUD_PROJECT: Explicit Google Cloud project id.
|
|
27
|
-
* - GOOGLE_CLOUD_LOCATION (optional): Location of the Google Cloud project.
|
|
28
|
+
* - GOOGLE_CLOUD_LOCATION (optional): Location of the Google Cloud project.
|
|
28
29
|
* - GOOGLE_VERTEX_MODEL (optional): Model to use for transcription.
|
|
29
30
|
*
|
|
30
|
-
* Returns: JSON payload
|
|
31
|
+
* Returns: JSON payload with captions, duration, and project metadata.
|
|
31
32
|
*/
|
|
32
33
|
export const handler = async (event) => {
|
|
33
34
|
console.log('Transcript function invoked');
|
|
@@ -51,23 +52,26 @@ export const handler = async (event) => {
|
|
|
51
52
|
(event?.body ? JSON.parse(event.body) : {}) ||
|
|
52
53
|
{};
|
|
53
54
|
|
|
54
|
-
const { videoUrl, language,languageFont } =
|
|
55
|
+
const { videoUrl, audioUrl, videoSize, language, languageFont } =
|
|
55
56
|
argumentsPayload;
|
|
56
57
|
|
|
57
|
-
if (!videoUrl) {
|
|
58
|
+
if (!videoUrl && !audioUrl) {
|
|
58
59
|
return jsonResponse(400, {
|
|
59
|
-
error: 'Missing required field: videoUrl',
|
|
60
|
+
error: 'Missing required field: provide either videoUrl or audioUrl',
|
|
60
61
|
expectedFormat: {
|
|
61
|
-
videoUrl:
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
62
|
+
videoUrl: 'Publicly reachable video URL (e.g. https://...)',
|
|
63
|
+
audioUrl: 'Publicly reachable audio URL (e.g. https://... or gs://...)',
|
|
64
|
+
videoSize: 'Optional { width, height }',
|
|
65
|
+
language: 'Optional (e.g. "english", "hindi")',
|
|
66
|
+
languageFont: 'Optional font/script (e.g. "english")',
|
|
65
67
|
},
|
|
66
68
|
});
|
|
67
69
|
}
|
|
68
70
|
|
|
69
|
-
const result = await
|
|
70
|
-
videoUrl,
|
|
71
|
+
const result = await transcribe({
|
|
72
|
+
videoUrl: videoUrl || undefined,
|
|
73
|
+
audioUrl: audioUrl || undefined,
|
|
74
|
+
videoSize,
|
|
71
75
|
language,
|
|
72
76
|
languageFont,
|
|
73
77
|
});
|