@twick/cloud-transcript 0.15.14 → 0.15.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  **Transcribe audio/video to JSON captions using Google GenAI (Vertex AI) with Gemini models.**
4
4
 
5
- Extract text from audio content with precise millisecond timestamps. Perfect for generating subtitle data from audio files or video URLs.
5
+ Extract text from audio content with precise millisecond timestamps. Perfect for generating caption data from audio files or video URLs.
6
6
 
7
7
  ## What Problem Does This Solve?
8
8
 
@@ -172,34 +172,34 @@ const buildPrompt = (duration, language, languageFont) => {
172
172
  // Convert duration from seconds to milliseconds for the prompt
173
173
  const durationMs = Math.round(duration * 1000);
174
174
 
175
- return `You are a professional subtitle and transcription engine.
175
+ return `You are a professional caption and transcription engine.
176
176
 
177
177
  ## INPUT
178
178
  - Audio duration: ${durationMs} milliseconds
179
179
  - Target language: ${language}
180
- - Subtitle font script: ${languageFont}
180
+ - Caption font script: ${languageFont}
181
181
 
182
182
  ## OBJECTIVE
183
- Transcribe the audio into clear, readable subtitles.
183
+ Transcribe the audio into clear, readable captions.
184
184
 
185
- If the spoken audio is NOT in ${language}, translate it into ${language} before generating subtitles.
185
+ If the spoken audio is NOT in ${language}, translate it into ${language} before generating captions.
186
186
 
187
- ## SUBTITLE SEGMENTATION RULES
187
+ ## CAPTION SEGMENTATION RULES
188
188
  - Split speech into short, natural phrases.
189
- - Each subtitle phrase MUST contain a maximum of 4 words.
189
+ - Each caption phrase MUST contain a maximum of 4 words.
190
190
  - Do NOT split words across phrases.
191
191
  - Avoid breaking phrases mid-sentence unless required by timing constraints.
192
192
 
193
193
  ## TIMING RULES (STRICT — MUST FOLLOW)
194
194
  - All timestamps are in **milliseconds**.
195
- - Each subtitle object MUST include:
195
+ - Each caption object MUST include:
196
196
  - 's': start timestamp
197
197
  - 'e': end timestamp
198
198
  - Duration of each phrase = 'e - s'
199
199
  - Minimum phrase duration: **100 ms**
200
200
  - 'e' MUST be greater than 's'
201
201
  - 'e' MUST be **less than or equal to ${durationMs}**
202
- - Subtitles MUST be sequential:
202
+ - Captions MUST be sequential:
203
203
  - 's' of the next phrase MUST be **greater than or equal to** the previous 'e'
204
204
  - NO overlapping timestamps
205
205
  - Prefer aligning timestamps with natural speech pauses.
@@ -207,7 +207,7 @@ If the spoken audio is NOT in ${language}, translate it into ${language} before
207
207
  ## TEXT RULES
208
208
  - 't' MUST be written using ${languageFont} characters.
209
209
  - No emojis.
210
- - No punctuation-only subtitles.
210
+ - No punctuation-only captions.
211
211
  - Normalize casing according to the target language's writing system.
212
212
  - Remove filler sounds (e.g., “um”, “uh”) unless semantically important.
213
213
 
@@ -222,7 +222,7 @@ Return ONLY a valid JSON array.
222
222
  ## OUTPUT SCHEMA
223
223
  [
224
224
  {
225
- "t": "Subtitle text",
225
+ "t": "Caption text",
226
226
  "s": 0,
227
227
  "e": 1200
228
228
  }
@@ -231,14 +231,14 @@ Return ONLY a valid JSON array.
231
231
  };
232
232
 
233
233
  /**
234
- * Transcribe an audio URL to JSON subtitles using Google GenAI (Vertex AI),
234
+ * Transcribe an audio URL to JSON captions using Google GenAI (Vertex AI),
235
235
  * mirroring the Python implementation in `playground/vertex/transcript.py`.
236
236
  *
237
237
  * @param {Object} params
238
238
  * @param {string} params.videoUrl - Publicly reachable video URL.
239
239
  * @param {string} [params.language="english"] - Target transcription language (human-readable).
240
- * @param {string} [params.languageFont="english"] - Target font/script for subtitles.
241
- * @returns {Promise<{ subtitles: Array<{t: string, s: number, e: number}> }>} Subtitles array with text, start time, and end time.
240
+ * @param {string} [params.languageFont="english"] - Target font/script for captions.
241
+ * @returns {Promise<{ captions: Array<{t: string, s: number, e: number}> }>} Captions array with text, start time, and end time.
242
242
  * @throws {Error} When videoUrl is missing or downstream calls fail.
243
243
  */
244
244
  export const transcribeVideoUrl = async (params) => {
@@ -320,27 +320,27 @@ export const transcribeVideoUrl = async (params) => {
320
320
  .trim();
321
321
 
322
322
 
323
- let subtitles = [];
323
+ let captions = [];
324
324
  try {
325
325
  // Try to find JSON array in the text (in case there's extra text)
326
326
  const jsonMatch = textPart.match(/\[[\s\S]*\]/);
327
327
  const jsonText = jsonMatch ? jsonMatch[0] : textPart;
328
328
 
329
- subtitles = JSON.parse(jsonText);
330
- if (!Array.isArray(subtitles)) {
331
- throw new Error("Parsed subtitles are not an array");
329
+ captions = JSON.parse(jsonText);
330
+ if (!Array.isArray(captions)) {
331
+ throw new Error("Parsed captions are not an array");
332
332
  }
333
333
  } catch (err) {
334
334
  console.warn(
335
- "Failed to parse model output as JSON subtitles, returning raw text",
335
+ "Failed to parse model output as JSON captions, returning raw text",
336
336
  err
337
337
  );
338
338
  console.warn("Raw response text:", textPart.substring(0, 500));
339
- subtitles = [];
339
+ captions = [];
340
340
  }
341
341
 
342
342
  return {
343
- subtitles,
343
+ captions,
344
344
  duration,
345
345
  videoUrl
346
346
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@twick/cloud-transcript",
3
- "version": "0.15.14",
3
+ "version": "0.15.15",
4
4
  "description": "Twick cloud function for generating JSON captions from audio using Google GenAI (Vertex AI) with Gemini models",
5
5
  "type": "module",
6
6
  "main": "core/transcriber.js",