@wovin/tranz 0.1.36 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,572 @@
1
+ /**
2
+ * Transcription provider types and interfaces
3
+ * Defines the contract for all transcription providers
4
+ */
5
+
6
+ import { spawn } from "node:child_process"
7
+ import * as fs from "node:fs"
8
+ import path from "node:path"
9
+ import { getName } from "../file-utils.ts"
10
+ import { pipeline } from "node:stream"
11
+ import { promisify } from "node:util"
12
+ import { PipelineSource } from "node:stream"
13
+
14
/**
 * One unit of transcribed speech: either a single diarized turn or one
 * segment-granularity unit.
 *
 * `startMs` / `endMs` are integer milliseconds. They are normalized at the SDK
 * boundary so consumers can pass them straight into the wovin annotation schema
 * (see docs/annotation-schema.md).
 *
 * `diarization` holds the anonymous, per-recording label exactly as the provider
 * returned it (Mistral: `"speaker_1"`, Deepgram: `0`, AssemblyAI: `"A"`, …).
 * It is NOT a real-world speaker identity — a separate (future) `speakerId`
 * field is reserved for that.
 *
 * Per-chunk labels are not comparable across chunks, so when
 * `mergeTranscriptionResults` joins several chunks it rewrites `diarization`
 * as `` `chunk${index}/${value}` ``.
 */
export interface TranscriptSegment {
  /** Segment start, in integer milliseconds */
  startMs: number
  /** Segment end, in integer milliseconds */
  endMs: number
  /** Text spoken within this segment */
  text: string
  /** Anonymous provider-assigned diarization label (not a speaker identity) */
  diarization?: number | string
  // future: speakerId?: string (identified profile)
  // future: speakerName?: string (display name)
}
36
+
37
/**
 * What every transcription operation resolves to.
 * Always carries `text`; everything else is optional, provider-dependent metadata.
 */
export interface TranscriptionResult {
  /** Transcribed text content */
  text: string
  /** Unprocessed provider response, kept for debugging (optional) */
  rawResponse?: any
  /** Error message, set when the transcription failed */
  error?: string
  /** Transcription confidence score (0-1) */
  confidence?: number
  /**
   * Word-level data. Populated only when granularity='word' or when the provider
   * returns it; otherwise left undefined (never `[]`).
   */
  words?: any[]
  /** Segment-level data, populated when granularity='segment' (or when the provider returns it) */
  segments?: TranscriptSegment[]
  /** Audio duration in seconds */
  duration?: number
  /** Language code, either detected or as specified */
  language?: string
  /** Model that produced the transcription, as reported by the provider */
  model?: string
}
61
+
62
/**
 * Contract that every transcription provider implements.
 * Gives callers one standard way to run a transcription regardless of backend.
 */
export interface TranscriptionProvider {
  /** Name/identifier of the provider */
  name: string
  /** Longest supported audio, in seconds (undefined = no limit) */
  maxAudioDurationSec?: number
  /**
   * Run a transcription for the supplied parameters.
   * @param params - Transcription parameters
   * @returns Promise that resolves to the transcription result
   */
  transcribe(params: TranscribeParams): Promise<TranscriptionResult>
}
78
+
79
/**
 * Input accepted by `TranscriptionProvider.transcribe`.
 * Mixes options shared by all providers with provider-specific ones; a provider
 * ignores the fields it does not understand.
 */
export interface TranscribeParams {
  /** Filesystem path of the audio file to transcribe */
  audioPath?: string
  /** In-memory audio to transcribe */
  audioBuffer?: Buffer
  /** MIME type of `audioBuffer` (auto-detected when omitted) */
  mimeType?: string
  /** URL of the audio file (e.g., an IPFS gateway URL) */
  audioUrl?: string
  /** Transcription model to use (provider-specific) */
  model?: string
  /** Language code for the transcription (e.g., 'en', 'fr') */
  language?: string
  /** API key used for authentication (provider-specific) */
  apiKey?: string
  /**
   * Enable speaker diarization.
   * Defaults differ per provider in this module: Whisper treats a missing value as
   * off, Mistral treats it as on, and GreenPT only forwards the flag when it is set.
   */
  diarize?: boolean
  /** Timestamp granularity for the transcription (Mistral-specific) */
  timestampGranularity?: 'segment' | 'word'
  /**
   * Context biasing terms (Voxtral/Mistral-specific).
   * Up to `VOXTRAL_LIMITS.maxContextBiasingTerms` (100) custom-vocabulary terms,
   * sent to the Voxtral transcribe endpoint as `context_bias[]` form fields.
   * Ignored by non-Mistral providers.
   */
  contextBias?: string[]
  /** Path to the model file (Whisper-specific) */
  modelPath?: string
  /** Directory that receives result files (Whisper-specific) */
  outputDir?: string
  /** Provider configuration object */
  config?: any
}
116
+
117
/**
 * Names of the providers this module can construct.
 */
export type ProviderName =
  | 'whisper'
  | 'mistral'
  | 'greenpt'
121
+
122
/**
 * Build the transcription provider that matches `providerName`.
 *
 * Only the Whisper provider receives `config`; the Mistral and GreenPT providers
 * are constructed without arguments.
 *
 * @param providerName - Which provider to instantiate
 * @param config - Optional provider configuration object
 * @returns A new transcription provider instance
 * @throws Error when `providerName` is not one of the known providers
 */
export function createProvider(providerName: ProviderName, config?: any): TranscriptionProvider {
  if (providerName === 'whisper') {
    return new WhisperProvider(config)
  }
  if (providerName === 'mistral') {
    return new MistralProvider()
  }
  if (providerName === 'greenpt') {
    return new GreenPTProvider()
  }
  throw new Error(`Unknown provider: ${providerName}`)
}
140
+
141
+ import { detectAudioMimeType } from './mime-detection.ts'
142
+
143
/**
 * Loosely typed shape of the JSON document written by whisper.cpp.
 */
type WhisperJsonOutput = { [field: string]: any }
147
+
148
/**
 * Whisper provider for local whisper.cpp transcription.
 *
 * Spawns the `whisper-cli` binary, reads the JSON file it writes next to the
 * requested output path, and returns the joined text. Model files are downloaded
 * into a local cache directory the first time they are needed.
 */
export class WhisperProvider implements TranscriptionProvider {
  name = 'whisper'

  /** Root directory under which downloaded model files are cached. */
  private cacheDir: string

  static DEFAULTS = {
    DIARIZE: false,
    SILDUR: "1.3",
    SILBUF: 0.2,
    SILTHR: "-35dB",
    MODEL_KEYS: {
      tinyd: "ggml-small.en-tdrz.bin",
      small: "ggml-small.bin",
      medium: "ggml-medium.bin",
    },
    MODELS: {
      tinyd:
        "https://huggingface.co/akashmjn/tinydiarize-whisper.cpp/resolve/main/ggml-small.en-tdrz.bin",
      small:
        "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin",
      medium:
        "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin",
    },
  }

  /**
   * @param config - Optional settings; `config.cacheDir` overrides the default
   *   cache directory (`<home>/.cache/whisper-models`).
   */
  constructor(config?: any) {
    // HOME is unset on some platforms; fall back so the path never starts with "undefined".
    const homeDir = process.env.HOME ?? process.env.USERPROFILE ?? "."
    this.cacheDir = config?.cacheDir || `${homeDir}/.cache/whisper-models`
  }

  /**
   * Transcribe a local audio file with `whisper-cli`.
   *
   * @param params - Uses `audioPath` (required), `outputDir` (default `./out`),
   *   `diarize` (default off) and `modelPath` (default: a cached model, downloaded
   *   on demand — the tinydiarize model when `diarize` is set, otherwise "small").
   * @returns The transcription, or a result with `error` set when `audioPath` is
   *   missing, the output directory cannot be created, the process fails to start
   *   or exits with a non-zero code, or its JSON output cannot be read.
   * @throws Error when the model has to be downloaded and the download fails.
   */
  async transcribe(params: TranscribeParams): Promise<TranscriptionResult> {
    const {
      audioPath,
      outputDir = "./out",
      diarize = WhisperProvider.DEFAULTS.DIARIZE,
      modelPath: providedModelPath,
    } = params

    // Validate before doing any expensive work (model download, process spawn).
    if (!audioPath) {
      return { text: "", error: "audioPath is required for Whisper provider" }
    }

    const modelKey = (diarize ? "tinyd" : "small") as keyof typeof WhisperProvider.DEFAULTS.MODEL_KEYS
    const modelPath = providedModelPath || (await this.ensureRequestedModelIsCached(modelKey))

    const sourceFileName = getName(audioPath)
    const outTransPath = `${outputDir}/${sourceFileName}-transcript`

    // whisper-cli does not create the output directory itself.
    try {
      fs.mkdirSync(outputDir, { recursive: true })
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error)
      return { text: "", error: `Failed to create output directory: ${errorMessage}` }
    }

    const args = [
      ...(diarize ? ["-tdrz"] : []),
      "-t",
      "8",
      "-oj",
      "-ng", // TODO: consider withGPU option
      "-f",
      audioPath,
      "-m",
      modelPath,
      "-of",
      outTransPath,
    ]

    const cmd = `whisper-cli ${args.join(" ")}`
    console.log("spawning ", cmd)
    const whisperThread = spawn(`whisper-cli`, args)

    return new Promise<TranscriptionResult>((resolveFx) => {
      let whisperOutput = ""
      const handleOut = (data: Buffer | string) => {
        const str = data.toString()
        // Echo progress/summary lines exactly once.
        if (WhisperProvider.isProgressLine(str)) console.log(str)
        whisperOutput += str
      }
      whisperThread.stdout.on("data", handleOut)
      whisperThread.stderr.on("data", handleOut)

      whisperThread.on("close", (code) => {
        // A failed run must not be answered from a JSON file left by an earlier run.
        if (code !== 0) {
          resolveFx({
            text: "",
            error: `whisper-cli exited with code ${code}: ${whisperOutput.slice(-500).trim()}`,
          })
          return
        }
        try {
          const trans: WhisperJsonOutput = JSON.parse(
            fs.readFileSync(`${outTransPath}.json`).toString()
          )
          resolveFx({
            text: WhisperProvider.extractText(trans),
            rawResponse: trans,
          })
        } catch (error) {
          const errorMessage = error instanceof Error ? error.message : String(error)
          resolveFx({
            text: "",
            error: `Failed to parse transcription result: ${errorMessage}`,
            rawResponse: undefined,
          })
        }
      })
      whisperThread.on("error", (err) => {
        console.error("Whisper Error", { err, outTransPath, args })
        resolveFx({
          text: "",
          error: `Whisper process error: ${err.message}`,
        })
      })
    }).catch((whisperError) => {
      console.error("Uncaught Whisper Error", whisperError)
      return {
        text: "",
        error: `Uncaught error: ${whisperError instanceof Error ? whisperError.message : String(whisperError)}`,
      }
    })
  }

  /** True for whisper-cli output worth echoing: lines starting with "[" or "main:", or timing summaries. */
  private static isProgressLine(str: string): boolean {
    return str.startsWith("[") || str.startsWith("main:") || str.includes("total time")
  }

  /**
   * Join the text of all transcription entries in a whisper.cpp JSON document.
   * Reads the top-level `transcription` array and falls back to
   * `result.transcription`, which earlier versions of this code expected.
   */
  private static extractText(trans: WhisperJsonOutput): string {
    const entries: any[] = Array.isArray(trans?.transcription)
      ? trans.transcription
      : Array.isArray(trans?.result?.transcription)
        ? trans.result.transcription
        : []
    return entries
      .map((entry: any) => entry?.text)
      .filter((t: any) => t)
      .join(" ")
      .trim()
  }

  /**
   * Return the local path of the model for `modelKey`, downloading it first when
   * it is not in the cache yet.
   *
   * The download is streamed to a temporary `.download` file and renamed only on
   * success, so an interrupted or failed download is never mistaken for a cached model.
   *
   * @throws Error when the key is unknown or the download fails.
   */
  private async ensureRequestedModelIsCached(
    modelKey: keyof typeof WhisperProvider.DEFAULTS.MODEL_KEYS
  ): Promise<string> {
    const modelFileName = WhisperProvider.DEFAULTS.MODEL_KEYS[modelKey]
    if (!modelFileName)
      throw new Error(`${modelKey} not known`)

    const cachedModelsDirPath = `${this.cacheDir}/models`
    fs.mkdirSync(cachedModelsDirPath, { recursive: true })

    const modelPath = `${cachedModelsDirPath}/${modelFileName}`
    if (fs.existsSync(modelPath)) {
      console.log(`Found ${modelPath} \n`)
      return modelPath
    }

    const srcURL = WhisperProvider.DEFAULTS.MODELS[modelKey]
    console.log(`
requested model is missing
Fetching ${srcURL} into ${modelPath}
`)
    const response = await fetch(srcURL)
    // Without this check an HTTP error page would be saved as the model file.
    if (!response.ok)
      throw new Error(`fetch failed: ${response.status} ${response.statusText}`)
    if (!response.body) throw new Error("fetch failed")

    const partialPath = `${modelPath}.download`
    const streamPipeline = promisify(pipeline)
    try {
      await streamPipeline(
        response.body as unknown as PipelineSource<any>,
        fs.createWriteStream(partialPath)
      )
      fs.renameSync(partialPath, modelPath)
    } catch (error) {
      fs.rmSync(partialPath, { force: true })
      throw error
    }

    return modelPath
  }
}
312
+
313
/**
 * Limits of the Voxtral/Mistral API.
 * Adjust these if the provider's actual constraints turn out to differ.
 */
export const VOXTRAL_LIMITS = {
  /** Longest accepted audio in seconds: 3 hours (Voxtral Transcribe 2) = 10800 seconds */
  maxAudioDurationSec: 3 * 3600,
  /** Largest number of context biasing words/phrases */
  maxContextBiasingTerms: 100,
  /** Largest accepted file in bytes (1GB) */
  maxFileSizeBytes: 1024 ** 3,
}
325
+
326
/**
 * Mistral (Voxtral) transcription provider.
 *
 * Sends audio — by URL, in-memory buffer, or file path — to the Mistral audio
 * transcription endpoint and normalizes the response into a `TranscriptionResult`.
 */
export class MistralProvider implements TranscriptionProvider {
  name = 'mistral'
  maxAudioDurationSec = VOXTRAL_LIMITS.maxAudioDurationSec

  /**
   * Transcribe audio through the Mistral API.
   *
   * Input precedence is `audioUrl`, then `audioBuffer`, then `audioPath`.
   * Diarization is on unless `params.diarize` is `false`. Timestamp granularity
   * defaults to `'segment'` and is omitted entirely when `language` is given.
   *
   * @param params - Transcription parameters; `apiKey` is required.
   * @returns The transcription, or a result with `error` set when the API key or
   *   the audio input is missing, the API answers with a non-OK status, or the
   *   response carries no text.
   * @throws Error when `language` is combined with `timestampGranularity`, when
   *   diarization is combined with word granularity, or when `contextBias` holds
   *   more than `VOXTRAL_LIMITS.maxContextBiasingTerms` terms.
   */
  async transcribe(params: TranscribeParams): Promise<TranscriptionResult> {
    // Validate API constraints
    if (params.language && params.timestampGranularity) {
      throw new Error('Cannot use both language and timestampGranularity (Mistral API limitation)')
    }

    // Validate diarize + timestampGranularity constraint
    const diarize = params.diarize ?? true
    if (diarize && params.timestampGranularity === 'word') {
      throw new Error('When diarize is set to true, the timestamp granularity must be set to ["segment"], got ["word"]')
    }

    // Without this guard the request would go out as "Bearer undefined".
    if (!params.apiKey) {
      return { text: '', error: 'API key is required for Mistral provider' }
    }

    const formData = new FormData()

    if (params.audioUrl) {
      // URL input - use file_url parameter (no file upload needed)
      formData.append('file_url', params.audioUrl)
    } else {
      // File or buffer input
      let audioBuffer: Buffer
      let mimeType: string

      if (params.audioBuffer) {
        audioBuffer = params.audioBuffer
        mimeType = params.mimeType || detectAudioMimeType(audioBuffer)
      } else if (params.audioPath) {
        audioBuffer = fs.readFileSync(params.audioPath)
        mimeType = detectAudioMimeType(audioBuffer)
      } else {
        return { text: '', error: 'No audio input provided (audioPath, audioBuffer, or audioUrl required)' }
      }

      const extension = MistralProvider.extensionForMimeType(mimeType)
      const audioBlob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType })
      formData.append('file', audioBlob, `audio.${extension}`)
    }

    formData.append('model', params.model || 'voxtral-mini-latest')

    if (params.language) {
      formData.append('language', params.language)
    }

    // Diarization is enabled by default (no extra cost)
    if (diarize) {
      formData.append('diarize', 'true')
    }

    // Timestamp granularity defaults to 'segment' (required when diarize is true).
    // Note: timestamp_granularities is incompatible with the language parameter.
    // See: https://docs.mistral.ai/capabilities/audio_transcription
    const timestampGranularity = params.language ? undefined : (params.timestampGranularity ?? 'segment')
    if (timestampGranularity) {
      formData.append('timestamp_granularities', timestampGranularity)
    }

    // Context biasing terms (custom vocabulary) — Voxtral-specific.
    // Sent as repeated `context_bias[]` form fields per Mistral API spec.
    if (params.contextBias && params.contextBias.length > 0) {
      if (params.contextBias.length > VOXTRAL_LIMITS.maxContextBiasingTerms) {
        throw new Error(
          `contextBias has ${params.contextBias.length} terms; Voxtral limit is ${VOXTRAL_LIMITS.maxContextBiasingTerms}`
        )
      }
      for (const term of params.contextBias) {
        formData.append('context_bias[]', term)
      }
    }

    const response = await fetch('https://api.mistral.ai/v1/audio/transcriptions', {
      method: 'POST',
      headers: {
        'Authorization': `Bearer ${params.apiKey}`,
      },
      body: formData,
    })

    if (!response.ok) {
      const errorText = await response.text()
      return { text: '', error: `API returned ${response.status}: ${errorText}` }
    }

    const result = await response.json()

    if (!result?.text) {
      return { text: '', error: 'No transcription returned', rawResponse: result }
    }

    // Normalize segments to ms + canonical `diarization` field.
    // Voxtral 2 returns seconds; the wovin annotation schema mandates integer ms.
    const segments: TranscriptSegment[] | undefined = Array.isArray(result.segments) && result.segments.length > 0
      ? result.segments.map((seg: any): TranscriptSegment => ({
          startMs: Math.round((seg.start ?? 0) * 1000),
          endMs: Math.round((seg.end ?? 0) * 1000),
          text: seg.text ?? '',
          ...(seg.speaker_id !== undefined ? { diarization: seg.speaker_id } : {}),
        }))
      : undefined

    // Only surface `words` when the API actually returned them (granularity='word' path,
    // or nested seg.words). Do not fabricate an empty array — consumers check presence.
    let words: any[] | undefined
    if (Array.isArray(result.words) && result.words.length > 0) {
      words = result.words
    } else if (Array.isArray(result.segments)) {
      const nested = result.segments.flatMap((seg: any) => seg.words ?? [])
      if (nested.length > 0) words = nested
    }

    return {
      text: result.text,
      language: result.language ?? params.language,
      model: result.model,
      duration: result.usage?.prompt_audio_seconds,
      ...(words ? { words } : {}),
      ...(segments ? { segments } : {}),
      rawResponse: result,
    }
  }

  /** Map a MIME type to the upload file extension; anything unrecognized is sent as `ogg`. */
  private static extensionForMimeType(mimeType: string): string {
    switch (mimeType) {
      case 'audio/mpeg':
        return 'mp3'
      case 'audio/wav':
        return 'wav'
      case 'audio/flac':
        return 'flac'
      default:
        return 'ogg'
    }
  }
}
466
+
467
/**
 * GreenPT transcription provider.
 * Uses the GreenPT API for audio transcription; the response is read in the
 * Deepgram-compatible format (`results.channels[0].alternatives[0]`).
 */
export class GreenPTProvider implements TranscriptionProvider {
  name = 'greenpt'

  /**
   * Transcribe audio through the GreenPT API.
   *
   * Audio comes from `audioBuffer` when given, otherwise from the file at
   * `audioPath`. `mimeType` is honored for buffer input and auto-detected otherwise.
   *
   * @param params - Transcription parameters; `apiKey` and one of
   *   `audioBuffer` / `audioPath` are required.
   * @returns The transcription. Failures never throw: a missing key or input, a
   *   non-OK API status, an empty response, or any thrown error is reported through
   *   the `error` field of the result.
   */
  async transcribe(params: TranscribeParams): Promise<TranscriptionResult> {
    if (!params.apiKey) {
      return { text: '', error: 'API key is required for GreenPT provider' }
    }

    if (!params.audioBuffer && !params.audioPath) {
      return { text: '', error: 'No audio input provided (audioPath or audioBuffer required)' }
    }

    try {
      // Buffer input takes precedence, mirroring the Mistral provider.
      let audioBuffer: Buffer
      let mimeType: string
      if (params.audioBuffer) {
        audioBuffer = params.audioBuffer
        mimeType = params.mimeType || detectAudioMimeType(audioBuffer)
      } else {
        audioBuffer = fs.readFileSync(params.audioPath as string)
        mimeType = detectAudioMimeType(audioBuffer)
      }

      // Create FormData with audio Blob
      const formData = new FormData()
      const extension = GreenPTProvider.extensionForMimeType(mimeType)
      const audioBlob = new Blob([new Uint8Array(audioBuffer)], { type: mimeType })
      formData.append('file', audioBlob, `audio.${extension}`)

      // Build query parameters
      const queryParams = new URLSearchParams()
      queryParams.append('model', params.model || 'green-s-pro')

      if (params.language) {
        queryParams.append('language', params.language)
      }

      // Forward diarize only when the caller set it, leaving the API default otherwise.
      if (params.diarize !== undefined) {
        queryParams.append('diarize', String(params.diarize))
      }

      // Default punctuate to true
      queryParams.append('punctuate', 'true')

      const url = `https://api.greenpt.ai/v1/listen?${queryParams.toString()}`

      const response = await fetch(url, {
        method: 'POST',
        headers: {
          'Authorization': `Bearer ${params.apiKey}`,
        },
        body: formData,
      })

      if (!response.ok) {
        const errorText = await response.text()
        return {
          text: '',
          error: `API returned ${response.status}: ${errorText}`,
        }
      }

      const result = await response.json()

      // Extract transcription from Deepgram-compatible response format
      const transcript = result?.results?.channels?.[0]?.alternatives?.[0]

      if (!transcript) {
        return {
          text: '',
          error: 'No transcription returned',
          rawResponse: result,
        }
      }

      // `words` is surfaced only when non-empty; consumers check for presence.
      const words: any[] | undefined =
        Array.isArray(transcript.words) && transcript.words.length > 0 ? transcript.words : undefined

      // Build text from transcript field, or from words if transcript is missing
      let text = transcript.transcript
      if (!text && words) {
        text = words.map((w: any) => w.word).join(' ')
      }

      return {
        text: text || '',
        confidence: transcript.confidence,
        ...(words ? { words } : {}),
        duration: result?.metadata?.duration,
        rawResponse: result,
      }
    } catch (error) {
      const errorMessage = error instanceof Error ? error.message : String(error)
      return {
        text: '',
        error: `Transcription failed: ${errorMessage}`,
      }
    }
  }

  /** Map a MIME type to the upload file extension; anything unrecognized is sent as `ogg`. */
  private static extensionForMimeType(mimeType: string): string {
    switch (mimeType) {
      case 'audio/mpeg':
        return 'mp3'
      case 'audio/wav':
        return 'wav'
      case 'audio/flac':
        return 'flac'
      default:
        return 'ogg'
    }
  }
}