@wovin/tranz 0.1.4 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +67 -0
- package/dist/index.min.js +7 -4
- package/dist/providers.min.js +7 -4
- package/dist/realtime.d.ts +38 -0
- package/dist/realtime.d.ts.map +1 -0
- package/dist/realtime.min.js +95 -0
- package/dist/utils/transcription/providers.d.ts.map +1 -1
- package/dist/utils/transcription/realtime.d.ts +197 -0
- package/dist/utils/transcription/realtime.d.ts.map +1 -0
- package/package.json +14 -2
package/README.md
CHANGED
|
@@ -8,6 +8,7 @@ Audio transcription library with provider support and auto-splitting for long au
|
|
|
8
8
|
## Features
|
|
9
9
|
|
|
10
10
|
- **Multiple Transcription Providers**: Mistral Voxtral, Whisper, GreenPT
|
|
11
|
+
- **Realtime Transcription**: Stream audio from microphone or other sources for live transcription
|
|
11
12
|
- **Automatic Audio Splitting**: Handles long audio files by intelligently splitting at silence points
|
|
12
13
|
- **Smart Input Support**: Files, URLs (with HTTP range probing), or buffers
|
|
13
14
|
- **Speaker Diarization**: Identify different speakers in audio
|
|
@@ -147,6 +148,72 @@ const result = await whisper.transcribe({
|
|
|
147
148
|
})
|
|
148
149
|
```
|
|
149
150
|
|
|
151
|
+
## Realtime Transcription
|
|
152
|
+
|
|
153
|
+
Stream audio for real-time transcription using Mistral's WebSocket API:
|
|
154
|
+
|
|
155
|
+
```typescript
|
|
156
|
+
import {
|
|
157
|
+
createRealtimeTranscriber,
|
|
158
|
+
captureAudioFromMicrophone,
|
|
159
|
+
} from '@wovin/tranz/realtime'
|
|
160
|
+
|
|
161
|
+
// Create realtime transcriber
|
|
162
|
+
const transcriber = createRealtimeTranscriber({
|
|
163
|
+
apiKey: process.env.MISTRAL_API_KEY,
|
|
164
|
+
})
|
|
165
|
+
|
|
166
|
+
// Capture audio from microphone (requires SoX)
|
|
167
|
+
const { stream, stop } = captureAudioFromMicrophone(16000)
|
|
168
|
+
|
|
169
|
+
try {
|
|
170
|
+
for await (const event of transcriber.transcribe(stream)) {
|
|
171
|
+
if (event.type === 'transcription.text.delta') {
|
|
172
|
+
process.stdout.write(event.text)
|
|
173
|
+
} else if (event.type === 'transcription.done') {
|
|
174
|
+
console.log('\nComplete:', event.text)
|
|
175
|
+
break
|
|
176
|
+
}
|
|
177
|
+
}
|
|
178
|
+
} finally {
|
|
179
|
+
stop()
|
|
180
|
+
}
|
|
181
|
+
```
|
|
182
|
+
|
|
183
|
+
### Custom Audio Source
|
|
184
|
+
|
|
185
|
+
You can provide any `AsyncIterable<Uint8Array>` as an audio source:
|
|
186
|
+
|
|
187
|
+
```typescript
|
|
188
|
+
async function* myAudioSource() {
|
|
189
|
+
// Read from file, socket, etc.
|
|
190
|
+
const buffer = await readSomeAudio()
|
|
191
|
+
yield new Uint8Array(buffer)
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
for await (const event of transcriber.transcribe(myAudioSource())) {
|
|
195
|
+
// Handle events
|
|
196
|
+
}
|
|
197
|
+
```
|
|
198
|
+
|
|
199
|
+
### Realtime Event Types
|
|
200
|
+
|
|
201
|
+
- `session.created` - WebSocket connection established
|
|
202
|
+
- `session.updated` - Audio format confirmed
|
|
203
|
+
- `transcription.text.delta` - Transcription text chunks (use for live display)
|
|
204
|
+
- `transcription.language` - Detected audio language
|
|
205
|
+
- `transcription.done` - Complete transcript available
|
|
206
|
+
- `error` - Error occurred
|
|
207
|
+
|
|
208
|
+
### Limitations
|
|
209
|
+
|
|
210
|
+
The WebSocket realtime API has some limitations compared to batch transcription:
|
|
211
|
+
- No timestamp information (no word or segment timing)
|
|
212
|
+
- No speaker diarization
|
|
213
|
+
- Designed for streaming/live use cases, not long audio files
|
|
214
|
+
|
|
215
|
+
For timestamped transcriptions or speaker identification, use the batch API instead.
|
|
216
|
+
|
|
150
217
|
## Advanced: Audio Utilities
|
|
151
218
|
|
|
152
219
|
```typescript
|
package/dist/index.min.js
CHANGED
|
@@ -215,6 +215,9 @@ var MistralProvider = class {
|
|
|
215
215
|
return VOXTRAL_LIMITS.recommendedMaxDurationSec;
|
|
216
216
|
}
|
|
217
217
|
async transcribe(params) {
|
|
218
|
+
if (params.language && params.timestampGranularity) {
|
|
219
|
+
return { text: "", error: "Cannot use both language and timestampGranularity (Mistral API limitation)" };
|
|
220
|
+
}
|
|
218
221
|
const formData = new FormData();
|
|
219
222
|
if (params.audioUrl) {
|
|
220
223
|
formData.append("file_url", params.audioUrl);
|
|
@@ -239,14 +242,14 @@ var MistralProvider = class {
|
|
|
239
242
|
if (params.language) {
|
|
240
243
|
formData.append("language", params.language);
|
|
241
244
|
}
|
|
242
|
-
const timestampGranularity = params.timestampGranularity ?? (params.language ? void 0 : "word");
|
|
243
|
-
if (timestampGranularity) {
|
|
244
|
-
formData.append("timestamp_granularities", timestampGranularity);
|
|
245
|
-
}
|
|
246
245
|
const diarize = params.diarize ?? true;
|
|
247
246
|
if (diarize) {
|
|
248
247
|
formData.append("diarize", "true");
|
|
249
248
|
}
|
|
249
|
+
const timestampGranularity = params.language ? void 0 : params.timestampGranularity ?? "segment";
|
|
250
|
+
if (timestampGranularity) {
|
|
251
|
+
formData.append("timestamp_granularities", timestampGranularity);
|
|
252
|
+
}
|
|
250
253
|
const response = await fetch("https://api.mistral.ai/v1/audio/transcriptions", {
|
|
251
254
|
method: "POST",
|
|
252
255
|
headers: {
|
package/dist/providers.min.js
CHANGED
|
@@ -215,6 +215,9 @@ var MistralProvider = class {
|
|
|
215
215
|
return VOXTRAL_LIMITS.recommendedMaxDurationSec;
|
|
216
216
|
}
|
|
217
217
|
async transcribe(params) {
|
|
218
|
+
if (params.language && params.timestampGranularity) {
|
|
219
|
+
return { text: "", error: "Cannot use both language and timestampGranularity (Mistral API limitation)" };
|
|
220
|
+
}
|
|
218
221
|
const formData = new FormData();
|
|
219
222
|
if (params.audioUrl) {
|
|
220
223
|
formData.append("file_url", params.audioUrl);
|
|
@@ -239,14 +242,14 @@ var MistralProvider = class {
|
|
|
239
242
|
if (params.language) {
|
|
240
243
|
formData.append("language", params.language);
|
|
241
244
|
}
|
|
242
|
-
const timestampGranularity = params.timestampGranularity ?? (params.language ? void 0 : "word");
|
|
243
|
-
if (timestampGranularity) {
|
|
244
|
-
formData.append("timestamp_granularities", timestampGranularity);
|
|
245
|
-
}
|
|
246
245
|
const diarize = params.diarize ?? true;
|
|
247
246
|
if (diarize) {
|
|
248
247
|
formData.append("diarize", "true");
|
|
249
248
|
}
|
|
249
|
+
const timestampGranularity = params.language ? void 0 : params.timestampGranularity ?? "segment";
|
|
250
|
+
if (timestampGranularity) {
|
|
251
|
+
formData.append("timestamp_granularities", timestampGranularity);
|
|
252
|
+
}
|
|
250
253
|
const response = await fetch("https://api.mistral.ai/v1/audio/transcriptions", {
|
|
251
254
|
method: "POST",
|
|
252
255
|
headers: {
|
|
package/dist/realtime.d.ts
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Realtime transcription API
|
|
3
|
+
*
|
|
4
|
+
* This module provides a simple, event-driven interface for streaming audio
|
|
5
|
+
* transcription using Mistral's realtime WebSocket API.
|
|
6
|
+
*
|
|
7
|
+
* @example
|
|
8
|
+
* ```typescript
|
|
9
|
+
* import {
|
|
10
|
+
* createRealtimeTranscriber,
|
|
11
|
+
* captureAudioFromMicrophone,
|
|
12
|
+
* } from '@wovin/tranz/realtime'
|
|
13
|
+
*
|
|
14
|
+
* const transcriber = createRealtimeTranscriber({
|
|
15
|
+
* apiKey: process.env.MISTRAL_API_KEY,
|
|
16
|
+
* })
|
|
17
|
+
*
|
|
18
|
+
* const { stream, stop } = captureAudioFromMicrophone(16000)
|
|
19
|
+
*
|
|
20
|
+
* try {
|
|
21
|
+
* for await (const event of transcriber.transcribe(stream)) {
|
|
22
|
+
* if (event.type === 'transcription.text.delta') {
|
|
23
|
+
* process.stdout.write(event.text)
|
|
24
|
+
* } else if (event.type === 'transcription.done') {
|
|
25
|
+
* console.log('\nComplete!')
|
|
26
|
+
* break
|
|
27
|
+
* }
|
|
28
|
+
* }
|
|
29
|
+
* } finally {
|
|
30
|
+
* stop()
|
|
31
|
+
* }
|
|
32
|
+
* ```
|
|
33
|
+
*
|
|
34
|
+
* @module @wovin/tranz/realtime
|
|
35
|
+
*/
|
|
36
|
+
export { createRealtimeTranscriber, captureAudioFromMicrophone, type RealtimeEvent, type RealtimeConfig, type RealtimeTranscriber, type TranscribeOptions, type AudioFormat, type AudioCaptureResult, type SessionCreatedEvent, type SessionUpdatedEvent, type TranscriptionTextDeltaEvent, type TranscriptionLanguageEvent, type TranscriptionSegmentEvent, type TranscriptionDoneEvent, type ErrorEvent, } from "./utils/transcription/realtime.js";
|
|
37
|
+
export { AudioEncoding } from "@mistralai/mistralai/extra/realtime";
|
|
38
|
+
//# sourceMappingURL=realtime.d.ts.map
|
|
package/dist/realtime.d.ts.map
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"realtime.d.ts","sourceRoot":"","sources":["../src/realtime.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAkCG;AAEH,OAAO,EACL,yBAAyB,EACzB,0BAA0B,EAC1B,KAAK,aAAa,EAClB,KAAK,cAAc,EACnB,KAAK,mBAAmB,EACxB,KAAK,iBAAiB,EACtB,KAAK,WAAW,EAChB,KAAK,kBAAkB,EACvB,KAAK,mBAAmB,EACxB,KAAK,mBAAmB,EACxB,KAAK,2BAA2B,EAChC,KAAK,0BAA0B,EAC/B,KAAK,yBAAyB,EAC9B,KAAK,sBAAsB,EAC3B,KAAK,UAAU,GAChB,MAAM,mCAAmC,CAAC;AAG3C,OAAO,EAAE,aAAa,EAAE,MAAM,qCAAqC,CAAC"}
|
|
package/dist/realtime.min.js
@@ -0,0 +1,95 @@
|
|
|
1
|
+
// src/utils/transcription/realtime.ts
|
|
2
|
+
import { spawn } from "child_process";
|
|
3
|
+
import {
|
|
4
|
+
AudioEncoding,
|
|
5
|
+
RealtimeTranscription
|
|
6
|
+
} from "@mistralai/mistralai/extra/realtime";
|
|
7
|
+
function createRealtimeTranscriber(config) {
|
|
8
|
+
const model = config.model ?? "voxtral-mini-transcribe-realtime-2602";
|
|
9
|
+
const baseUrl = config.baseUrl ?? "wss://api.mistral.ai";
|
|
10
|
+
const client = new RealtimeTranscription({
|
|
11
|
+
apiKey: config.apiKey,
|
|
12
|
+
serverURL: baseUrl
|
|
13
|
+
});
|
|
14
|
+
return {
|
|
15
|
+
async *transcribe(audioStream, options) {
|
|
16
|
+
const audioFormat = {
|
|
17
|
+
encoding: options?.audioFormat?.encoding ?? AudioEncoding.PcmS16le,
|
|
18
|
+
sampleRate: options?.audioFormat?.sampleRate ?? 16e3
|
|
19
|
+
};
|
|
20
|
+
const eventStream = client.transcribeStream(audioStream, model, {
|
|
21
|
+
audioFormat
|
|
22
|
+
});
|
|
23
|
+
for await (const event of eventStream) {
|
|
24
|
+
yield event;
|
|
25
|
+
}
|
|
26
|
+
}
|
|
27
|
+
};
|
|
28
|
+
}
|
|
29
|
+
function captureAudioFromMicrophone(sampleRate = 16e3) {
|
|
30
|
+
const recorder = spawn(
|
|
31
|
+
"rec",
|
|
32
|
+
[
|
|
33
|
+
"-q",
|
|
34
|
+
// Quiet mode
|
|
35
|
+
"-t",
|
|
36
|
+
"raw",
|
|
37
|
+
// Raw PCM output
|
|
38
|
+
"-b",
|
|
39
|
+
"16",
|
|
40
|
+
// 16-bit samples
|
|
41
|
+
"-e",
|
|
42
|
+
"signed-integer",
|
|
43
|
+
// Signed PCM
|
|
44
|
+
"-r",
|
|
45
|
+
String(sampleRate),
|
|
46
|
+
// Sample rate
|
|
47
|
+
"-c",
|
|
48
|
+
"1",
|
|
49
|
+
// Mono (1 channel)
|
|
50
|
+
"-"
|
|
51
|
+
// Output to stdout
|
|
52
|
+
],
|
|
53
|
+
{ stdio: ["ignore", "pipe", "ignore"] }
|
|
54
|
+
);
|
|
55
|
+
recorder.on("error", (err) => {
|
|
56
|
+
const error = err;
|
|
57
|
+
if (error.code === "ENOENT") {
|
|
58
|
+
console.error(
|
|
59
|
+
"\nError: 'rec' command not found. Please install SoX:",
|
|
60
|
+
"\n macOS: brew install sox",
|
|
61
|
+
"\n Linux: sudo apt install sox"
|
|
62
|
+
);
|
|
63
|
+
process.exit(1);
|
|
64
|
+
}
|
|
65
|
+
throw err;
|
|
66
|
+
});
|
|
67
|
+
const stream = (async function* () {
|
|
68
|
+
try {
|
|
69
|
+
if (!recorder.stdout) {
|
|
70
|
+
throw new Error("Failed to create audio capture stream");
|
|
71
|
+
}
|
|
72
|
+
for await (const chunk of recorder.stdout) {
|
|
73
|
+
yield new Uint8Array(chunk);
|
|
74
|
+
}
|
|
75
|
+
} finally {
|
|
76
|
+
if (!recorder.killed) {
|
|
77
|
+
recorder.kill("SIGTERM");
|
|
78
|
+
}
|
|
79
|
+
}
|
|
80
|
+
})();
|
|
81
|
+
const stop = () => {
|
|
82
|
+
if (!recorder.killed) {
|
|
83
|
+
recorder.kill("SIGTERM");
|
|
84
|
+
}
|
|
85
|
+
};
|
|
86
|
+
return { stream, stop };
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// src/realtime.ts
|
|
90
|
+
import { AudioEncoding as AudioEncoding2 } from "@mistralai/mistralai/extra/realtime";
|
|
91
|
+
export {
|
|
92
|
+
AudioEncoding2 as AudioEncoding,
|
|
93
|
+
captureAudioFromMicrophone,
|
|
94
|
+
createRealtimeTranscriber
|
|
95
|
+
};
|
|
package/dist/utils/transcription/providers.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"providers.d.ts","sourceRoot":"","sources":["../../../src/utils/transcription/providers.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAUH;;;GAGG;AACH,MAAM,WAAW,mBAAmB;IAClC,mCAAmC;IACnC,IAAI,EAAE,MAAM,CAAA;IACZ,sEAAsE;IACtE,WAAW,CAAC,EAAE,GAAG,CAAA;IACjB,4CAA4C;IAC5C,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,kDAAkD;IAClD,UAAU,CAAC,EAAE,MAAM,CAAA;IACnB,iDAAiD;IACjD,KAAK,CAAC,EAAE,GAAG,EAAE,CAAA;IACb,mCAAmC;IACnC,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,0CAA0C;IAC1C,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,6DAA6D;IAC7D,KAAK,CAAC,EAAE,MAAM,CAAA;CACf;AAED;;;GAGG;AACH,MAAM,WAAW,qBAAqB;IACpC,+BAA+B;IAC/B,IAAI,EAAE,MAAM,CAAA;IACZ,+DAA+D;IAC/D,mBAAmB,CAAC,EAAE,MAAM,CAAA;IAC5B;;;;OAIG;IACH,UAAU,CAAC,MAAM,EAAE,gBAAgB,GAAG,OAAO,CAAC,mBAAmB,CAAC,CAAA;CACnE;AAED;;;GAGG;AACH,MAAM,WAAW,gBAAgB;IAC/B,2CAA2C;IAC3C,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,iCAAiC;IACjC,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,gEAAgE;IAChE,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,iDAAiD;IACjD,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,yDAAyD;IACzD,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,yDAAyD;IACzD,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,qDAAqD;IACrD,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,oDAAoD;IACpD,OAAO,CAAC,EAAE,OAAO,CAAA;IACjB,iEAAiE;IACjE,oBAAoB,CAAC,EAAE,SAAS,GAAG,MAAM,CAAA;IACzC,4CAA4C;IAC5C,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,sDAAsD;IACtD,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,oCAAoC;IACpC,MAAM,CAAC,EAAE,GAAG,CAAA;CACb;AAED;;GAEG;AACH,MAAM,MAAM,YAAY,GAAG,SAAS,GAAG,SAAS,GAAG,SAAS,CAAA;AAE5D;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,YAAY,EAAE,YAAY,EAAE,MAAM,CAAC,EAAE,GAAG,GAAG,qBAAqB,CAW9F;AASD;;;GAGG;AACH,qBAAa,eAAgB,YAAW,qBAAqB;IAC3D,IAAI,SAAY;IAEhB,OAAO,CAAC,QAAQ,CAAQ;IAExB,MAAM,CAAC,QAAQ;;;;;;;;;;;;;;;MAkBd;gBAEW,MAAM,CAAC,EAAE,GAAG;IAKlB,UAAU,CAAC,MAAM,EAAE,gBAAgB,GAAG,OAAO,CAAC,mBAAmB,CAAC;YA+F1D,4BAA4B;CAkC3C;AAED;;;GAGG;AACH,eAAO,MAAM,cAAc;IACzB,2EAA2E;;IAE3E,kEAAkE;;IAElE,4CAA4C;;IAE5C,uCAAuC;;CAExC,CAAA;AAED,qBAAa,eAAgB,YAAW,qBAAqB;IAC3D,IAAI,SAAY;IAChB,mBAAmB,SAAqC;IAExD;;OAEG;IACH,MAAM,CAAC,WAAW,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO;IAIhD;;OAEG;IACH,MAAM,CAAC,wBAAwB,IAAI,MAAM;IAInC,UAAU,CAAC,MAAM,EAA
E,gBAAgB,GAAG,OAAO,CAAC,mBAAmB,CAAC;
|
|
1
|
+
{"version":3,"file":"providers.d.ts","sourceRoot":"","sources":["../../../src/utils/transcription/providers.ts"],"names":[],"mappings":"AAAA;;;GAGG;AAUH;;;GAGG;AACH,MAAM,WAAW,mBAAmB;IAClC,mCAAmC;IACnC,IAAI,EAAE,MAAM,CAAA;IACZ,sEAAsE;IACtE,WAAW,CAAC,EAAE,GAAG,CAAA;IACjB,4CAA4C;IAC5C,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,kDAAkD;IAClD,UAAU,CAAC,EAAE,MAAM,CAAA;IACnB,iDAAiD;IACjD,KAAK,CAAC,EAAE,GAAG,EAAE,CAAA;IACb,mCAAmC;IACnC,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,0CAA0C;IAC1C,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,6DAA6D;IAC7D,KAAK,CAAC,EAAE,MAAM,CAAA;CACf;AAED;;;GAGG;AACH,MAAM,WAAW,qBAAqB;IACpC,+BAA+B;IAC/B,IAAI,EAAE,MAAM,CAAA;IACZ,+DAA+D;IAC/D,mBAAmB,CAAC,EAAE,MAAM,CAAA;IAC5B;;;;OAIG;IACH,UAAU,CAAC,MAAM,EAAE,gBAAgB,GAAG,OAAO,CAAC,mBAAmB,CAAC,CAAA;CACnE;AAED;;;GAGG;AACH,MAAM,WAAW,gBAAgB;IAC/B,2CAA2C;IAC3C,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,iCAAiC;IACjC,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,gEAAgE;IAChE,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,iDAAiD;IACjD,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,yDAAyD;IACzD,KAAK,CAAC,EAAE,MAAM,CAAA;IACd,yDAAyD;IACzD,QAAQ,CAAC,EAAE,MAAM,CAAA;IACjB,qDAAqD;IACrD,MAAM,CAAC,EAAE,MAAM,CAAA;IACf,oDAAoD;IACpD,OAAO,CAAC,EAAE,OAAO,CAAA;IACjB,iEAAiE;IACjE,oBAAoB,CAAC,EAAE,SAAS,GAAG,MAAM,CAAA;IACzC,4CAA4C;IAC5C,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,sDAAsD;IACtD,SAAS,CAAC,EAAE,MAAM,CAAA;IAClB,oCAAoC;IACpC,MAAM,CAAC,EAAE,GAAG,CAAA;CACb;AAED;;GAEG;AACH,MAAM,MAAM,YAAY,GAAG,SAAS,GAAG,SAAS,GAAG,SAAS,CAAA;AAE5D;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,YAAY,EAAE,YAAY,EAAE,MAAM,CAAC,EAAE,GAAG,GAAG,qBAAqB,CAW9F;AASD;;;GAGG;AACH,qBAAa,eAAgB,YAAW,qBAAqB;IAC3D,IAAI,SAAY;IAEhB,OAAO,CAAC,QAAQ,CAAQ;IAExB,MAAM,CAAC,QAAQ;;;;;;;;;;;;;;;MAkBd;gBAEW,MAAM,CAAC,EAAE,GAAG;IAKlB,UAAU,CAAC,MAAM,EAAE,gBAAgB,GAAG,OAAO,CAAC,mBAAmB,CAAC;YA+F1D,4BAA4B;CAkC3C;AAED;;;GAGG;AACH,eAAO,MAAM,cAAc;IACzB,2EAA2E;;IAE3E,kEAAkE;;IAElE,4CAA4C;;IAE5C,uCAAuC;;CAExC,CAAA;AAED,qBAAa,eAAgB,YAAW,qBAAqB;IAC3D,IAAI,SAAY;IAChB,mBAAmB,SAAqC;IAExD;;OAEG;IACH,MAAM,CAAC,WAAW,CAAC,WAAW,EAAE,MAAM,GAAG,OAAO;IAIhD;;OAEG;IACH,MAAM,CAAC,wBAAwB,IAAI,MAAM;IAInC,UAAU,CAAC,MAAM,EAA
E,gBAAgB,GAAG,OAAO,CAAC,mBAAmB,CAAC;CAmGzE;AAED;;;GAGG;AACH,qBAAa,eAAgB,YAAW,qBAAqB;IAC3D,IAAI,SAAY;IAEV,UAAU,CAAC,MAAM,EAAE,gBAAgB,GAAG,OAAO,CAAC,mBAAmB,CAAC;CAkGzE"}
|
|
package/dist/utils/transcription/realtime.d.ts
@@ -0,0 +1,197 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Realtime transcription API for Mistral's WebSocket-based transcription service
|
|
3
|
+
*
|
|
4
|
+
* Provides a simple, event-driven interface for streaming audio transcription.
|
|
5
|
+
* Users provide audio as AsyncIterable<Uint8Array> and receive typed events.
|
|
6
|
+
*
|
|
7
|
+
* @example
|
|
8
|
+
* ```typescript
|
|
9
|
+
* import { createRealtimeTranscriber } from '@wovin/tranz/realtime'
|
|
10
|
+
*
|
|
11
|
+
* const transcriber = createRealtimeTranscriber({
|
|
12
|
+
* apiKey: process.env.MISTRAL_API_KEY,
|
|
13
|
+
* })
|
|
14
|
+
*
|
|
15
|
+
* for await (const event of transcriber.transcribe(audioStream)) {
|
|
16
|
+
* if (event.type === 'transcription.text.delta') {
|
|
17
|
+
* process.stdout.write(event.text)
|
|
18
|
+
* }
|
|
19
|
+
* }
|
|
20
|
+
* ```
|
|
21
|
+
*/
|
|
22
|
+
import { AudioEncoding } from "@mistralai/mistralai/extra/realtime";
|
|
23
|
+
/**
|
|
24
|
+
* Audio format configuration for realtime transcription
|
|
25
|
+
*/
|
|
26
|
+
export interface AudioFormat {
|
|
27
|
+
/** Audio encoding format (default: pcm_s16le) */
|
|
28
|
+
encoding: AudioEncoding;
|
|
29
|
+
/** Sample rate in Hz (default: 16000) */
|
|
30
|
+
sampleRate: number;
|
|
31
|
+
}
|
|
32
|
+
/**
|
|
33
|
+
* Configuration for creating a realtime transcriber
|
|
34
|
+
*/
|
|
35
|
+
export interface RealtimeConfig {
|
|
36
|
+
/** Mistral API key */
|
|
37
|
+
apiKey: string;
|
|
38
|
+
/** Model ID (default: voxtral-mini-transcribe-realtime-2602) */
|
|
39
|
+
model?: string;
|
|
40
|
+
/** WebSocket base URL (default: wss://api.mistral.ai) */
|
|
41
|
+
baseUrl?: string;
|
|
42
|
+
}
|
|
43
|
+
/**
|
|
44
|
+
* Options for transcription
|
|
45
|
+
*/
|
|
46
|
+
export interface TranscribeOptions {
|
|
47
|
+
/** Audio format configuration (optional, uses defaults if not provided) */
|
|
48
|
+
audioFormat?: Partial<AudioFormat>;
|
|
49
|
+
}
|
|
50
|
+
/**
|
|
51
|
+
* Union type for all realtime transcription events
|
|
52
|
+
* These events are yielded as the transcription progresses
|
|
53
|
+
*/
|
|
54
|
+
export type RealtimeEvent = SessionCreatedEvent | SessionUpdatedEvent | TranscriptionTextDeltaEvent | TranscriptionLanguageEvent | TranscriptionSegmentEvent | TranscriptionDoneEvent | ErrorEvent;
|
|
55
|
+
/**
|
|
56
|
+
* Session created event - emitted when WebSocket connection is established
|
|
57
|
+
*/
|
|
58
|
+
export interface SessionCreatedEvent {
|
|
59
|
+
type: "session.created";
|
|
60
|
+
session: {
|
|
61
|
+
id: string;
|
|
62
|
+
};
|
|
63
|
+
}
|
|
64
|
+
/**
|
|
65
|
+
* Session updated event - emitted when audio format is confirmed
|
|
66
|
+
*/
|
|
67
|
+
export interface SessionUpdatedEvent {
|
|
68
|
+
type: "session.updated";
|
|
69
|
+
session: {
|
|
70
|
+
audioFormat: AudioFormat;
|
|
71
|
+
};
|
|
72
|
+
}
|
|
73
|
+
/**
|
|
74
|
+
* Text delta event - emitted as transcription text arrives in chunks
|
|
75
|
+
* This is the primary event for displaying real-time transcription
|
|
76
|
+
*/
|
|
77
|
+
export interface TranscriptionTextDeltaEvent {
|
|
78
|
+
type: "transcription.text.delta";
|
|
79
|
+
text: string;
|
|
80
|
+
}
|
|
81
|
+
/**
|
|
82
|
+
* Language detection event - emitted when audio language is detected
|
|
83
|
+
*/
|
|
84
|
+
export interface TranscriptionLanguageEvent {
|
|
85
|
+
type: "transcription.language";
|
|
86
|
+
audioLanguage: string;
|
|
87
|
+
}
|
|
88
|
+
/**
|
|
89
|
+
* Segment event - emitted for timestamped segments
|
|
90
|
+
* NOTE: WebSocket realtime API does NOT support this - included for completeness
|
|
91
|
+
*/
|
|
92
|
+
export interface TranscriptionSegmentEvent {
|
|
93
|
+
type: "transcription.segment";
|
|
94
|
+
start?: number;
|
|
95
|
+
end?: number;
|
|
96
|
+
text: string;
|
|
97
|
+
speakerId?: string;
|
|
98
|
+
}
|
|
99
|
+
/**
|
|
100
|
+
* Transcription done event - emitted when transcription completes
|
|
101
|
+
* Contains the complete transcript
|
|
102
|
+
*/
|
|
103
|
+
export interface TranscriptionDoneEvent {
|
|
104
|
+
type: "transcription.done";
|
|
105
|
+
text: string;
|
|
106
|
+
language?: string;
|
|
107
|
+
}
|
|
108
|
+
/**
|
|
109
|
+
* Error event - emitted when an error occurs
|
|
110
|
+
*/
|
|
111
|
+
export interface ErrorEvent {
|
|
112
|
+
type: "error";
|
|
113
|
+
error: {
|
|
114
|
+
message: string | unknown;
|
|
115
|
+
};
|
|
116
|
+
}
|
|
117
|
+
/**
|
|
118
|
+
* Realtime transcriber interface
|
|
119
|
+
*/
|
|
120
|
+
export interface RealtimeTranscriber {
|
|
121
|
+
/**
|
|
122
|
+
* Transcribe audio stream and yield events as they arrive
|
|
123
|
+
*
|
|
124
|
+
* @param audioStream - AsyncIterable of audio chunks (Uint8Array)
|
|
125
|
+
* @param options - Optional transcription options
|
|
126
|
+
* @returns AsyncIterable of transcription events
|
|
127
|
+
*
|
|
128
|
+
* @example
|
|
129
|
+
* ```typescript
|
|
130
|
+
* const transcriber = createRealtimeTranscriber({ apiKey: 'xxx' })
|
|
131
|
+
*
|
|
132
|
+
* for await (const event of transcriber.transcribe(audioStream)) {
|
|
133
|
+
* if (event.type === 'transcription.text.delta') {
|
|
134
|
+
* console.log(event.text)
|
|
135
|
+
* } else if (event.type === 'transcription.done') {
|
|
136
|
+
* console.log('Complete:', event.text)
|
|
137
|
+
* break
|
|
138
|
+
* }
|
|
139
|
+
* }
|
|
140
|
+
* ```
|
|
141
|
+
*/
|
|
142
|
+
transcribe(audioStream: AsyncIterable<Uint8Array>, options?: TranscribeOptions): AsyncIterable<RealtimeEvent>;
|
|
143
|
+
}
|
|
144
|
+
/**
|
|
145
|
+
* Create a realtime transcriber instance
|
|
146
|
+
*
|
|
147
|
+
* @param config - Configuration including API key and optional model/baseUrl
|
|
148
|
+
* @returns RealtimeTranscriber instance
|
|
149
|
+
*
|
|
150
|
+
* @example
|
|
151
|
+
* ```typescript
|
|
152
|
+
* const transcriber = createRealtimeTranscriber({
|
|
153
|
+
* apiKey: process.env.MISTRAL_API_KEY,
|
|
154
|
+
* model: 'voxtral-mini-transcribe-realtime-2602', // optional
|
|
155
|
+
* baseUrl: 'wss://api.mistral.ai', // optional
|
|
156
|
+
* })
|
|
157
|
+
* ```
|
|
158
|
+
*/
|
|
159
|
+
export declare function createRealtimeTranscriber(config: RealtimeConfig): RealtimeTranscriber;
|
|
160
|
+
/**
|
|
161
|
+
* Result from audio capture - includes stream and stop function
|
|
162
|
+
*/
|
|
163
|
+
export interface AudioCaptureResult {
|
|
164
|
+
/** AsyncGenerator yielding audio chunks */
|
|
165
|
+
stream: AsyncGenerator<Uint8Array, void, unknown>;
|
|
166
|
+
/** Function to stop audio capture */
|
|
167
|
+
stop: () => void;
|
|
168
|
+
}
|
|
169
|
+
/**
|
|
170
|
+
* Capture audio from microphone using SoX `rec` command
|
|
171
|
+
*
|
|
172
|
+
* Yields PCM 16-bit signed little-endian mono audio chunks suitable for
|
|
173
|
+
* realtime transcription.
|
|
174
|
+
*
|
|
175
|
+
* **Requirements:**
|
|
176
|
+
* - SoX audio tools must be installed
|
|
177
|
+
* - macOS: `brew install sox`
|
|
178
|
+
* - Linux: `sudo apt install sox`
|
|
179
|
+
*
|
|
180
|
+
* @param sampleRate - Sample rate in Hz (default: 16000)
|
|
181
|
+
* @returns Object with audio stream and stop function
|
|
182
|
+
*
|
|
183
|
+
* @example
|
|
184
|
+
* ```typescript
|
|
185
|
+
* const { stream, stop } = captureAudioFromMicrophone(16000)
|
|
186
|
+
*
|
|
187
|
+
* try {
|
|
188
|
+
* for await (const event of transcriber.transcribe(stream)) {
|
|
189
|
+
* // ... handle events
|
|
190
|
+
* }
|
|
191
|
+
* } finally {
|
|
192
|
+
* stop() // Clean up audio capture
|
|
193
|
+
* }
|
|
194
|
+
* ```
|
|
195
|
+
*/
|
|
196
|
+
export declare function captureAudioFromMicrophone(sampleRate?: number): AudioCaptureResult;
|
|
197
|
+
//# sourceMappingURL=realtime.d.ts.map
|
|
package/dist/utils/transcription/realtime.d.ts.map
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"realtime.d.ts","sourceRoot":"","sources":["../../../src/utils/transcription/realtime.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;GAoBG;AAGH,OAAO,EACL,aAAa,EAEd,MAAM,qCAAqC,CAAC;AAM7C;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,iDAAiD;IACjD,QAAQ,EAAE,aAAa,CAAC;IACxB,yCAAyC;IACzC,UAAU,EAAE,MAAM,CAAC;CACpB;AAED;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,sBAAsB;IACtB,MAAM,EAAE,MAAM,CAAC;IACf,gEAAgE;IAChE,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,yDAAyD;IACzD,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED;;GAEG;AACH,MAAM,WAAW,iBAAiB;IAChC,2EAA2E;IAC3E,WAAW,CAAC,EAAE,OAAO,CAAC,WAAW,CAAC,CAAC;CACpC;AAED;;;GAGG;AACH,MAAM,MAAM,aAAa,GACrB,mBAAmB,GACnB,mBAAmB,GACnB,2BAA2B,GAC3B,0BAA0B,GAC1B,yBAAyB,GACzB,sBAAsB,GACtB,UAAU,CAAC;AAEf;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,iBAAiB,CAAC;IACxB,OAAO,EAAE;QACP,EAAE,EAAE,MAAM,CAAC;KACZ,CAAC;CACH;AAED;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,iBAAiB,CAAC;IACxB,OAAO,EAAE;QACP,WAAW,EAAE,WAAW,CAAC;KAC1B,CAAC;CACH;AAED;;;GAGG;AACH,MAAM,WAAW,2BAA2B;IAC1C,IAAI,EAAE,0BAA0B,CAAC;IACjC,IAAI,EAAE,MAAM,CAAC;CACd;AAED;;GAEG;AACH,MAAM,WAAW,0BAA0B;IACzC,IAAI,EAAE,wBAAwB,CAAC;IAC/B,aAAa,EAAE,MAAM,CAAC;CACvB;AAED;;;GAGG;AACH,MAAM,WAAW,yBAAyB;IACxC,IAAI,EAAE,uBAAuB,CAAC;IAC9B,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,GAAG,CAAC,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,SAAS,CAAC,EAAE,MAAM,CAAC;CACpB;AAED;;;GAGG;AACH,MAAM,WAAW,sBAAsB;IACrC,IAAI,EAAE,oBAAoB,CAAC;IAC3B,IAAI,EAAE,MAAM,CAAC;IACb,QAAQ,CAAC,EAAE,MAAM,CAAC;CACnB;AAED;;GAEG;AACH,MAAM,WAAW,UAAU;IACzB,IAAI,EAAE,OAAO,CAAC;IACd,KAAK,EAAE;QACL,OAAO,EAAE,MAAM,GAAG,OAAO,CAAC;KAC3B,CAAC;CACH;AAMD;;GAEG;AACH,MAAM,WAAW,mBAAmB;IAClC;;;;;;;;;;;;;;;;;;;;OAoBG;IACH,UAAU,CACR,WAAW,EAAE,aAAa,CAAC,UAAU,CAAC,EACtC,OAAO,CAAC,EAAE,iBAAiB,GAC1B,aAAa,CAAC,aAAa,CAAC,CAAC;CACjC;AAED;;;;;;;;;;;;;;GAcG;AACH,wBAAgB,yBAAyB,CACvC,MAAM,EAAE,cAAc,GACrB,mBAAmB,CA+BrB;AAMD;;GAEG;AACH,MAAM,WAAW,kBAAkB;IACjC,2CAA2C;IAC3C,MAAM,EAAE,cAAc,CAAC,UAAU,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;IAClD,qCAAqC;IACrC,IAAI,EAAE,MAAM,IAAI,CAAC;CAClB;AAED;;;;;;;;;;;;;;;;;;;;;;
;;;;GA0BG;AACH,wBAAgB,0BAA0B,CACxC,UAAU,GAAE,MAAc,GACzB,kBAAkB,CAuDpB"}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@wovin/tranz",
|
|
3
|
-
"version": "0.1.4",
|
|
3
|
+
"version": "0.1.6",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Audio transcription library with provider support and auto-splitting",
|
|
6
6
|
"author": "gotjoshua @gotjoshua",
|
|
@@ -27,6 +27,10 @@
|
|
|
27
27
|
"./audio": {
|
|
28
28
|
"import": "./dist/audio.min.js",
|
|
29
29
|
"types": "./dist/audio.d.ts"
|
|
30
|
+
},
|
|
31
|
+
"./realtime": {
|
|
32
|
+
"import": "./dist/realtime.min.js",
|
|
33
|
+
"types": "./dist/realtime.d.ts"
|
|
30
34
|
}
|
|
31
35
|
},
|
|
32
36
|
"files": [
|
|
@@ -36,14 +40,20 @@
|
|
|
36
40
|
"access": "public"
|
|
37
41
|
},
|
|
38
42
|
"dependencies": {
|
|
43
|
+
"@mistralai/mistralai": "^1.14.0",
|
|
39
44
|
"fluent-ffmpeg": "^2.1.2"
|
|
40
45
|
},
|
|
41
46
|
"devDependencies": {
|
|
42
47
|
"@types/fluent-ffmpeg": "^2.1.21",
|
|
43
48
|
"@types/node": "^24.10.1",
|
|
49
|
+
"@types/ws": "^8.5.13",
|
|
50
|
+
"@types/yargs": "^17.0.33",
|
|
44
51
|
"concurrently": "^8.2.2",
|
|
45
52
|
"tsup": "^8.5.0",
|
|
53
|
+
"tsx": "^4.19.2",
|
|
46
54
|
"typescript": "^5.9.3",
|
|
55
|
+
"ws": "^8.18.0",
|
|
56
|
+
"yargs": "^17.7.2",
|
|
47
57
|
"tsupconfig": "^0.0.0"
|
|
48
58
|
},
|
|
49
59
|
"keywords": [
|
|
@@ -60,6 +70,8 @@
|
|
|
60
70
|
"dev": "concurrently \"pnpm dev:code\" \"pnpm dev:types\"",
|
|
61
71
|
"dev:code": "tsup --watch",
|
|
62
72
|
"dev:types": "tsc --emitDeclarationOnly --declaration --watch",
|
|
63
|
-
"clean": "rm -rf .turbo && rm -rf node_modules && rm -rf dist"
|
|
73
|
+
"clean": "rm -rf .turbo && rm -rf node_modules && rm -rf dist",
|
|
74
|
+
"test:realtime": "tsx test/realtime-transcription.ts",
|
|
75
|
+
"test:realtime-api": "tsx test/realtime-api-test.ts"
|
|
64
76
|
}
|
|
65
77
|
}
|