@wovin/tranz 0.1.35 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +8 -5
- package/dist/{audio.min.js → audio.js} +32 -18
- package/dist/index.d.ts +3 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/{index.min.js → index.js} +161 -29
- package/dist/providers.d.ts +1 -1
- package/dist/providers.d.ts.map +1 -1
- package/dist/{providers.min.js → providers.js} +68 -24
- package/dist/utils/audio/merge-results.d.ts +14 -12
- package/dist/utils/audio/merge-results.d.ts.map +1 -1
- package/dist/utils/transcription/format.d.ts +27 -0
- package/dist/utils/transcription/format.d.ts.map +1 -1
- package/dist/utils/transcription/providers.d.ts +30 -1
- package/dist/utils/transcription/providers.d.ts.map +1 -1
- package/dist/utils/transcription/transcribe.d.ts +5 -0
- package/dist/utils/transcription/transcribe.d.ts.map +1 -1
- package/package.json +10 -8
- package/src/audio.ts +25 -0
- package/src/index.ts +61 -0
- package/src/providers.ts +23 -0
- package/src/realtime.ts +58 -0
- package/src/utils/audio/index.ts +6 -0
- package/src/utils/audio/merge-results.ts +198 -0
- package/src/utils/audio/split.ts +504 -0
- package/src/utils/file-utils.ts +16 -0
- package/src/utils/transcription/format.ts +208 -0
- package/src/utils/transcription/mime-detection.ts +80 -0
- package/src/utils/transcription/providers.ts +572 -0
- package/src/utils/transcription/realtime.ts +821 -0
- package/src/utils/transcription/runtime.ts +40 -0
- package/src/utils/transcription/transcribe.ts +366 -0
- /package/dist/{realtime.min.js → realtime.js} +0 -0
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Runtime environment detection for WebSocket implementations
|
|
3
|
+
*
|
|
4
|
+
* Provides environment-aware WebSocket constructor selection:
|
|
5
|
+
* - Browser/Deno: Uses global WebSocket API
|
|
6
|
+
* - Node.js: Dynamically imports 'ws' package
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
/**
 * Get the appropriate WebSocket implementation for the current runtime.
 *
 * Resolution order:
 * 1. Browser-like runtimes (a global `document` or `navigator` exists) that
 *    expose a global `WebSocket` get that global.
 * 2. Otherwise the `ws` package is imported dynamically. This also covers
 *    runtimes that define `navigator` but have no global `WebSocket`
 *    (e.g. Node.js 21 without `--experimental-websocket`), which would
 *    otherwise fail without ever trying `ws`.
 * 3. If `ws` cannot be loaded, any global `WebSocket` is used as a last resort.
 *
 * @returns WebSocket constructor (global WebSocket or the `ws` package export)
 * @throws Error if WebSocket is not available in any form
 */
export async function getWebSocketImpl(): Promise<any> {
  const hasGlobalThis = typeof globalThis !== "undefined";

  // Browser/Deno heuristic: a DOM or a navigator object is present
  const isBrowser = hasGlobalThis &&
    (typeof globalThis.document !== "undefined" || typeof globalThis.navigator !== "undefined");
  const hasGlobalWebSocket = hasGlobalThis && typeof globalThis.WebSocket !== "undefined";

  // Browser/Deno - use global WebSocket
  if (isBrowser && hasGlobalWebSocket) {
    return globalThis.WebSocket;
  }

  // Everything else - dynamically import ws
  try {
    const WS = await import("ws");
    return WS.default || WS;
  } catch {
    // ws is unavailable; a global WebSocket is still better than failing
    if (hasGlobalWebSocket) {
      return globalThis.WebSocket;
    }

    // Read `process` through a cast so this compiles without Node typings
    const nodeVersion = hasGlobalThis
      ? (globalThis as { process?: { versions?: { node?: string } } }).process?.versions?.node
      : undefined;

    if (!isBrowser || nodeVersion) {
      throw new Error(
        "WebSocket not available. In Node.js, install 'ws' package: npm install ws"
      );
    }
    throw new Error(
      "WebSocket not available in this environment"
    );
  }
}
|
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Simple high-level transcription API with good defaults
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import * as fs from 'node:fs'
|
|
6
|
+
import * as https from 'node:https'
|
|
7
|
+
import * as http from 'node:http'
|
|
8
|
+
import * as os from 'node:os'
|
|
9
|
+
import * as path from 'node:path'
|
|
10
|
+
import { MistralProvider, VOXTRAL_LIMITS, type TranscriptionResult } from './providers.ts'
|
|
11
|
+
import { autoSplitAudio, getAudioDuration, type AudioSegment } from '../audio/split.ts'
|
|
12
|
+
import { mergeTranscriptionResults, type MergedTranscriptionResult } from '../audio/merge-results.ts'
|
|
13
|
+
|
|
14
|
+
/**
 * Sink for progress messages emitted while transcribing.
 * Every method receives a single, already formatted message string.
 */
export interface TranscribeLogger {
  /** Routine progress messages */
  info: (msg: string) => void
  /** Non-fatal problems the caller may want to see */
  warn: (msg: string) => void
  /** Diagnostic detail */
  debug: (msg: string) => void
}
|
|
20
|
+
|
|
21
|
+
// Fallback logger: info and warn go to the console with a `[tranz]` prefix,
// debug messages are discarded.
const defaultLogger: TranscribeLogger = {
  info(msg) {
    console.log(`[tranz] ${msg}`)
  },
  warn(msg) {
    console.warn(`[tranz] ${msg}`)
  },
  debug() {
    // silent by default
  },
}
|
|
26
|
+
|
|
27
|
+
export interface TranscribeOptions {
  // --- Audio input: provide one of audioPath, audioBuffer or audioUrl ---

  /** Local filesystem path of the audio file */
  audioPath?: string
  /** In-memory audio data to transcribe */
  audioBuffer?: Buffer
  /** MIME type describing `audioBuffer` (auto-detected if not provided) */
  mimeType?: string
  /** Remote location of the audio file (e.g., IPFS gateway URL) */
  audioUrl?: string
  /** Duration in seconds, when already known (skips duration detection) */
  duration?: number

  // --- Transcription settings ---

  /** Language code (e.g. 'en', 'fr') - note: disables word timestamps for Mistral */
  language?: string
  /** Model to use (default: voxtral-mini-latest) */
  model?: string
  /** Enable speaker diarization (default: true) */
  diarize?: boolean
  /**
   * Timestamp granularity: 'word' | 'segment'.
   * Defaults to 'segment'; left unset by default when `language` is given.
   */
  timestamps?: 'word' | 'segment'
  /**
   * Context biasing terms — up to `VOXTRAL_LIMITS.maxContextBiasingTerms` (100)
   * custom-vocabulary entries passed to Voxtral as `context_bias[]`. Mistral only.
   */
  contextBias?: string[]

  // --- Splitting ---

  /** Auto-split long audio (default: true). For URLs, detects duration first. */
  autoSplit?: boolean
  /** Directory that receives split segments (default: system temp) */
  splitOutputDir?: string

  // --- Logging ---

  /** Custom logger (default: console) */
  logger?: TranscribeLogger
  /** Enable verbose/debug logging */
  verbose?: boolean
}
|
|
60
|
+
|
|
61
|
+
/** Settings fixed once per transcriber instance */
export interface MistralTranscriberConfig {
  /** Mistral API key, sent with every transcription request */
  apiKey: string
  /** Model used when a call does not name one (default: voxtral-mini-latest) */
  model?: string
}
|
|
67
|
+
|
|
68
|
+
/** Map of MIME types to file extensions */
const MIME_TO_EXT: Record<string, string> = {
  'audio/mpeg': '.mp3',
  'audio/mp3': '.mp3',
  'audio/wav': '.wav',
  'audio/x-wav': '.wav',
  'audio/ogg': '.ogg',
  'audio/flac': '.flac',
  'audio/x-flac': '.flac',
  'audio/mp4': '.m4a',
  'audio/m4a': '.m4a',
  'audio/aac': '.aac',
  'audio/webm': '.webm',
  'audio/opus': '.opus',
}

/** Extensions accepted from a URL path; derived from the map so the two lists cannot drift apart */
const KNOWN_AUDIO_EXTS: ReadonlySet<string> = new Set(Object.values(MIME_TO_EXT))

/**
 * Get file extension from Content-Type header or URL
 *
 * @param contentType - Raw Content-Type header value; parameters after `;` are ignored
 * @param url - URL the audio came from, used when the header gives no known type
 * @returns A known audio extension including the leading dot, or `.audio` when
 *          neither source yields one
 */
function getExtFromContentType(contentType: string | undefined, url: string): string {
  // Try Content-Type first
  if (contentType) {
    const mimeType = contentType.split(';')[0].trim().toLowerCase()
    // Own-property check: the header is remote input, and a value such as
    // "constructor" must not resolve to something inherited from Object.prototype
    if (Object.prototype.hasOwnProperty.call(MIME_TO_EXT, mimeType)) {
      return MIME_TO_EXT[mimeType]
    }
  }

  // Fall back to URL path extension
  try {
    const ext = path.extname(new URL(url).pathname).toLowerCase()
    if (KNOWN_AUDIO_EXTS.has(ext)) {
      return ext
    }
  } catch {
    // Unparseable URL: use the default below
  }

  // Default to .audio (ffprobe will probe the format)
  return '.audio'
}
|
|
106
|
+
|
|
107
|
+
/**
 * Download a URL to a temporary file
 *
 * Follows HTTP redirects (301, 302, 303, 307, 308), resolving relative
 * `Location` headers against the current URL, up to `maxRedirects` hops.
 * The file extension is chosen from the response Content-Type or the URL.
 *
 * @param url - http(s) URL to fetch
 * @param outputDir - Existing directory that receives the downloaded file
 * @param maxRedirects - Remaining redirect hops allowed (default: 5)
 * @returns Path of the written file
 * @throws Error on an invalid URL, a non-200 final status, too many redirects,
 *         or a network/filesystem failure (any partial file is removed)
 */
async function downloadToTempFile(url: string, outputDir: string, maxRedirects = 5): Promise<string> {
  return new Promise((resolve, reject) => {
    let target: URL
    try {
      target = new URL(url)
    } catch {
      reject(new Error(`Failed to download: invalid URL ${url}`))
      return
    }

    const protocol = target.protocol === 'https:' ? https : http
    protocol.get(target, (response) => {
      const status = response.statusCode ?? 0
      const redirectUrl = response.headers.location

      if ([301, 302, 303, 307, 308].includes(status) && redirectUrl) {
        // Drain the redirect response so its socket is released
        response.resume()
        if (maxRedirects <= 0) {
          reject(new Error('Failed to download: too many redirects'))
          return
        }
        let nextUrl: string
        try {
          // Location may be relative to the URL that produced it
          nextUrl = new URL(redirectUrl, target).toString()
        } catch {
          reject(new Error(`Failed to download: invalid redirect location ${redirectUrl}`))
          return
        }
        downloadToTempFile(nextUrl, outputDir, maxRedirects - 1).then(resolve, reject)
        return
      }

      if (status !== 200) {
        response.resume()
        reject(new Error(`Failed to download: HTTP ${response.statusCode}`))
        return
      }

      // Determine file extension from Content-Type or URL
      const ext = getExtFromContentType(response.headers['content-type'], url)
      // Random suffix keeps concurrent downloads in the same millisecond apart
      const unique = `${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
      const tempPath = path.join(outputDir, `download-${unique}${ext}`)
      const file = fs.createWriteStream(tempPath)

      // Shared failure path: stop both streams and remove the partial file
      const fail = (err: Error): void => {
        response.destroy()
        file.destroy()
        fs.unlink(tempPath, () => {})
        reject(err)
      }

      // Without this, a connection dropped mid-body would leave the promise pending
      response.on('error', fail)
      file.on('error', fail)
      file.on('finish', () => {
        file.close()
        resolve(tempPath)
      })
      response.pipe(file)
    }).on('error', (err) => {
      reject(err)
    })
  })
}
|
|
146
|
+
|
|
147
|
+
/**
 * Best-effort duration probe for a remote audio URL.
 *
 * Delegates to `getAudioDuration`; any failure is reported as `undefined`
 * instead of being thrown. The original comment states that the probe uses
 * ffprobe with HTTP range requests; that behavior lives in `getAudioDuration`
 * and is not visible here.
 *
 * @param url - URL of the audio to probe
 * @returns Duration in seconds, or `undefined` if detection fails
 */
async function tryGetUrlDuration(url: string): Promise<number | undefined> {
  let seconds: number | undefined
  try {
    seconds = await getAudioDuration(url)
  } catch {
    seconds = undefined
  }
  return seconds
}
|
|
158
|
+
|
|
159
|
+
/** Transcriber interface returned by createMistralTranscriber */
export interface MistralTranscriber {
  transcribe(options: TranscribeOptions): Promise<MergedTranscriptionResult>
}

/**
 * Simple Mistral transcriber with auto-splitting and good defaults
 *
 * The returned `transcribe` does not depend on `this`, so it can be
 * destructured or passed around as a plain function.
 *
 * @param config - API key and optional default model
 * @returns Transcriber whose `transcribe` accepts a file path, URL or buffer
 *
 * @example
 * ```ts
 * const transcriber = createMistralTranscriber({ apiKey: process.env.MISTRAL_API_KEY })
 *
 * // From file (supports auto-split for long audio)
 * const result = await transcriber.transcribe({ audioPath: './interview.mp3' })
 *
 * // From URL (auto-detects if splitting needed, downloads only if necessary)
 * const result = await transcriber.transcribe({ audioUrl: 'https://gateway.ipfs.io/ipfs/Qm...' })
 *
 * // From URL with known duration (skips detection)
 * const result = await transcriber.transcribe({ audioUrl: '...', duration: 120 })
 *
 * // From buffer
 * const result = await transcriber.transcribe({ audioBuffer: buffer, mimeType: 'audio/mpeg' })
 * ```
 */
export function createMistralTranscriber(config: MistralTranscriberConfig): MistralTranscriber {
  const provider = new MistralProvider()
  const defaultModel = config.model || 'voxtral-mini-latest'

  /**
   * Transcribe audio with smart auto-splitting
   * - For buffers: transcribes directly (no splitting); takes priority over other inputs
   * - For URLs: probes duration, downloads only if the duration is unknown or too long
   * - For files: checks duration and splits if needed
   *
   * Declared as a closure rather than an object method so the URL download
   * path can call it again without relying on `this`.
   */
  async function transcribeAudio(options: TranscribeOptions): Promise<MergedTranscriptionResult> {
    const {
      audioPath,
      audioBuffer,
      mimeType,
      audioUrl,
      duration: knownDuration,
      language,
      model = defaultModel,
      contextBias,
      diarize = true,
      timestamps = language ? undefined : 'segment',
      autoSplit,
      splitOutputDir,
      logger: customLogger,
      verbose,
    } = options

    const baseLog = customLogger || defaultLogger
    // Verbose promotes debug messages to info. Use a wrapper instead of assigning
    // to `debug`, which would permanently alter the caller's logger (or the
    // shared default logger) for every later call.
    const log: TranscribeLogger = verbose
      ? {
          info: (msg) => baseLog.info(msg),
          warn: (msg) => baseLog.warn(msg),
          debug: (msg) => baseLog.info(msg),
        }
      : baseLog

    const maxDuration = VOXTRAL_LIMITS.maxAudioDurationSec

    // Request fields common to every provider call
    const requestDefaults = {
      apiKey: config.apiKey,
      model,
      language,
      diarize,
      timestampGranularity: timestamps,
      contextBias,
    }

    // Buffer input - no auto-split support, transcribe directly
    if (audioBuffer) {
      log.info(`Transcribing from buffer (${(audioBuffer.length / 1024 / 1024).toFixed(2)} MB)`)
      return provider.transcribe({ audioBuffer, mimeType, ...requestDefaults })
    }

    // URL input - smart handling
    if (audioUrl) {
      // If autoSplit explicitly disabled, use URL directly
      if (autoSplit === false) {
        log.info(`Transcribing URL directly (autoSplit disabled)`)
        return provider.transcribe({ audioUrl, ...requestDefaults })
      }

      // Check duration (use known or detect)
      let duration = knownDuration
      if (duration === undefined) {
        log.info(`Probing URL duration via ffprobe...`)
        duration = await tryGetUrlDuration(audioUrl)
        if (duration !== undefined) {
          log.info(`Duration detected: ${duration.toFixed(1)}s`)
        } else {
          log.warn(`Duration detection failed, will download to check`)
        }
      } else {
        log.debug(`Using provided duration: ${duration.toFixed(1)}s`)
      }

      // If duration known and short enough, use URL directly
      if (duration !== undefined && duration <= maxDuration) {
        log.info(`Duration ${duration.toFixed(1)}s <= ${maxDuration}s, using URL directly`)
        return provider.transcribe({ audioUrl, ...requestDefaults })
      }

      // Duration unknown or too long - download and process as file
      log.info(`Downloading URL to temp file for processing...`)
      const outDir = splitOutputDir || path.join(os.tmpdir(), `tranz-${Date.now()}`)
      fs.mkdirSync(outDir, { recursive: true })

      const tempFile = await downloadToTempFile(audioUrl, outDir)
      log.info(`Downloaded to ${tempFile}`)

      try {
        // Re-enter with the file path; a duration that is already known is
        // forwarded so the file is not probed a second time
        return await transcribeAudio({
          audioPath: tempFile,
          duration,
          language,
          model,
          diarize,
          timestamps,
          contextBias,
          autoSplit: true,
          splitOutputDir: outDir,
          logger: customLogger,
          verbose,
        })
      } finally {
        // Cleanup temp file even when transcription fails (segments stay in outDir)
        try {
          fs.unlinkSync(tempFile)
        } catch {
          // best effort: the file may already be gone
        }
      }
    }

    // File path input
    if (!audioPath) {
      return { text: '', error: 'No audio input provided (audioPath, audioBuffer, or audioUrl required)' }
    }

    log.debug(`Processing file: ${audioPath}`)
    const duration = knownDuration ?? await getAudioDuration(audioPath)
    log.info(`Audio duration: ${duration.toFixed(1)}s`)
    const needsSplit = autoSplit !== false && duration > maxDuration

    if (!needsSplit) {
      log.info(`Transcribing file directly (no split needed)`)
      return provider.transcribe({ audioPath, ...requestDefaults })
    }

    // Auto-split and transcribe segments
    log.info(`Duration ${duration.toFixed(1)}s > ${maxDuration}s, splitting audio...`)
    const outDir = splitOutputDir || path.join(os.tmpdir(), `tranz-split-${Date.now()}`)
    fs.mkdirSync(outDir, { recursive: true })

    const segments = await autoSplitAudio(audioPath, outDir, {
      maxDurationSec: maxDuration,
    })
    log.info(`Split into ${segments.length} segments`)

    // Transcribe each segment, one at a time and in order
    const results: TranscriptionResult[] = []
    for (const [i, segment] of segments.entries()) {
      log.info(`Transcribing segment ${i + 1}/${segments.length} (${segment.durationSec.toFixed(1)}s)`)
      results.push(await provider.transcribe({ audioPath: segment.outputPath, ...requestDefaults }))
    }

    log.info(`Merging ${segments.length} segments`)
    return mergeTranscriptionResults(results, segments)
  }

  return { transcribe: transcribeAudio }
}

/** Alias of the `createMistralTranscriber` factory, for simpler import (it creates a transcriber; it does not transcribe by itself) */
export const transcribe = createMistralTranscriber
|
|
File without changes
|