node-av 4.0.0 → 5.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (109)
  1. package/README.md +23 -0
  2. package/binding.gyp +19 -11
  3. package/dist/api/bitstream-filter.d.ts +13 -12
  4. package/dist/api/bitstream-filter.js +33 -29
  5. package/dist/api/bitstream-filter.js.map +1 -1
  6. package/dist/api/decoder.d.ts +211 -96
  7. package/dist/api/decoder.js +396 -375
  8. package/dist/api/decoder.js.map +1 -1
  9. package/dist/api/demuxer.d.ts +10 -10
  10. package/dist/api/demuxer.js +7 -10
  11. package/dist/api/demuxer.js.map +1 -1
  12. package/dist/api/encoder.d.ts +155 -122
  13. package/dist/api/encoder.js +368 -541
  14. package/dist/api/encoder.js.map +1 -1
  15. package/dist/api/filter-complex.d.ts +769 -0
  16. package/dist/api/filter-complex.js +1596 -0
  17. package/dist/api/filter-complex.js.map +1 -0
  18. package/dist/api/filter-presets.d.ts +68 -0
  19. package/dist/api/filter-presets.js +96 -0
  20. package/dist/api/filter-presets.js.map +1 -1
  21. package/dist/api/filter.d.ts +183 -113
  22. package/dist/api/filter.js +347 -365
  23. package/dist/api/filter.js.map +1 -1
  24. package/dist/api/fmp4-stream.d.ts +18 -2
  25. package/dist/api/fmp4-stream.js +45 -4
  26. package/dist/api/fmp4-stream.js.map +1 -1
  27. package/dist/api/hardware.d.ts +47 -0
  28. package/dist/api/hardware.js +45 -0
  29. package/dist/api/hardware.js.map +1 -1
  30. package/dist/api/index.d.ts +2 -0
  31. package/dist/api/index.js +3 -0
  32. package/dist/api/index.js.map +1 -1
  33. package/dist/api/io-stream.d.ts +3 -3
  34. package/dist/api/io-stream.js.map +1 -1
  35. package/dist/api/muxer.d.ts +10 -10
  36. package/dist/api/muxer.js +6 -6
  37. package/dist/api/muxer.js.map +1 -1
  38. package/dist/api/pipeline.d.ts +2 -2
  39. package/dist/api/pipeline.js +22 -22
  40. package/dist/api/pipeline.js.map +1 -1
  41. package/dist/api/rtp-stream.d.ts +5 -2
  42. package/dist/api/rtp-stream.js +33 -4
  43. package/dist/api/rtp-stream.js.map +1 -1
  44. package/dist/api/types.d.ts +63 -7
  45. package/dist/api/utilities/audio-sample.d.ts +10 -0
  46. package/dist/api/utilities/audio-sample.js +10 -0
  47. package/dist/api/utilities/audio-sample.js.map +1 -1
  48. package/dist/api/utilities/channel-layout.d.ts +1 -0
  49. package/dist/api/utilities/channel-layout.js +1 -0
  50. package/dist/api/utilities/channel-layout.js.map +1 -1
  51. package/dist/api/utilities/image.d.ts +38 -0
  52. package/dist/api/utilities/image.js +38 -0
  53. package/dist/api/utilities/image.js.map +1 -1
  54. package/dist/api/utilities/index.d.ts +1 -0
  55. package/dist/api/utilities/index.js +2 -0
  56. package/dist/api/utilities/index.js.map +1 -1
  57. package/dist/api/utilities/media-type.d.ts +1 -0
  58. package/dist/api/utilities/media-type.js +1 -0
  59. package/dist/api/utilities/media-type.js.map +1 -1
  60. package/dist/api/utilities/pixel-format.d.ts +3 -0
  61. package/dist/api/utilities/pixel-format.js +3 -0
  62. package/dist/api/utilities/pixel-format.js.map +1 -1
  63. package/dist/api/utilities/sample-format.d.ts +5 -0
  64. package/dist/api/utilities/sample-format.js +5 -0
  65. package/dist/api/utilities/sample-format.js.map +1 -1
  66. package/dist/api/utilities/scheduler.d.ts +21 -52
  67. package/dist/api/utilities/scheduler.js +20 -58
  68. package/dist/api/utilities/scheduler.js.map +1 -1
  69. package/dist/api/utilities/streaming.d.ts +32 -1
  70. package/dist/api/utilities/streaming.js +32 -1
  71. package/dist/api/utilities/streaming.js.map +1 -1
  72. package/dist/api/utilities/timestamp.d.ts +14 -0
  73. package/dist/api/utilities/timestamp.js +14 -0
  74. package/dist/api/utilities/timestamp.js.map +1 -1
  75. package/dist/api/utilities/whisper-model.d.ts +310 -0
  76. package/dist/api/utilities/whisper-model.js +528 -0
  77. package/dist/api/utilities/whisper-model.js.map +1 -0
  78. package/dist/api/whisper.d.ts +324 -0
  79. package/dist/api/whisper.js +362 -0
  80. package/dist/api/whisper.js.map +1 -0
  81. package/dist/constants/constants.d.ts +3 -1
  82. package/dist/constants/constants.js +1 -0
  83. package/dist/constants/constants.js.map +1 -1
  84. package/dist/ffmpeg/index.d.ts +3 -3
  85. package/dist/ffmpeg/index.js +3 -3
  86. package/dist/ffmpeg/utils.d.ts +27 -0
  87. package/dist/ffmpeg/utils.js +28 -16
  88. package/dist/ffmpeg/utils.js.map +1 -1
  89. package/dist/lib/binding.d.ts +4 -4
  90. package/dist/lib/binding.js.map +1 -1
  91. package/dist/lib/codec-parameters.d.ts +47 -1
  92. package/dist/lib/codec-parameters.js +55 -0
  93. package/dist/lib/codec-parameters.js.map +1 -1
  94. package/dist/lib/fifo.d.ts +416 -0
  95. package/dist/lib/fifo.js +453 -0
  96. package/dist/lib/fifo.js.map +1 -0
  97. package/dist/lib/frame.d.ts +96 -1
  98. package/dist/lib/frame.js +139 -1
  99. package/dist/lib/frame.js.map +1 -1
  100. package/dist/lib/index.d.ts +1 -0
  101. package/dist/lib/index.js +2 -0
  102. package/dist/lib/index.js.map +1 -1
  103. package/dist/lib/native-types.d.ts +29 -2
  104. package/dist/lib/rational.d.ts +18 -0
  105. package/dist/lib/rational.js +19 -0
  106. package/dist/lib/rational.js.map +1 -1
  107. package/dist/lib/types.d.ts +23 -1
  108. package/install/check.js +2 -2
  109. package/package.json +31 -21
@@ -0,0 +1,324 @@
1
+ import type { Frame } from '../lib/frame.js';
2
+ import type { WhisperModelName, WhisperVADModelName } from './utilities/whisper-model.js';
3
+ /**
4
+ * Transcribed audio segment from Whisper.
5
+ *
6
+ * Represents a single transcribed segment with timing information.
7
+ * Start and end times are in milliseconds from the beginning of the audio.
8
+ */
9
+ export interface WhisperSegment {
10
+ /**
11
+ * Start time of the segment in milliseconds.
12
+ */
13
+ start: number;
14
+ /**
15
+ * End time of the segment in milliseconds.
16
+ */
17
+ end: number;
18
+ /**
19
+ * Transcribed text content.
20
+ */
21
+ text: string;
22
+ /**
23
+ * Indicates if this segment represents a speaker turn.
24
+ * Only available when VAD (Voice Activity Detection) is enabled.
25
+ */
26
+ turn?: boolean;
27
+ }
28
+ /**
29
+ * Options for configuring Whisper transcriber.
30
+ *
31
+ * Controls model selection, language, GPU acceleration, VAD, and output behavior.
32
+ */
33
+ export interface WhisperTranscriberOptions {
34
+ /**
35
+ * Path to whisper.cpp GGML model file.
36
+ *
37
+ * Required. Download models using {@link WhisperDownloader}.
38
+ * ```
39
+ */
40
+ model: WhisperModelName;
41
+ /**
42
+ * Path to VAD (Voice Activity Detection) model file.
43
+ *
44
+ * Optional. Enables better audio segmentation using Silero VAD.
45
+ * Download VAD models using {@link WhisperDownloader.downloadVADModel}.
46
+ * ```
47
+ */
48
+ vadModel?: WhisperVADModelName;
49
+ /**
50
+ * Directory where models will be downloaded if not already present.
51
+ *
52
+ * @default '<PROJECT_DIR>/models'
53
+ */
54
+ modelDir?: string;
55
+ /**
56
+ * Language code for transcription.
57
+ *
58
+ * Use 'auto' for automatic language detection.
59
+ *
60
+ * @default 'auto'
61
+ */
62
+ language?: string;
63
+ /**
64
+ * Audio queue size in seconds.
65
+ *
66
+ * Maximum duration of audio buffered before processing.
67
+ * Increase when using VAD for better segmentation.
68
+ *
69
+ * @default 3
70
+ */
71
+ queue?: number;
72
+ /**
73
+ * Enable GPU acceleration for processing.
74
+ *
75
+ * Requires whisper.cpp built with GPU support (CUDA/Vulkan/Metal).
76
+ *
77
+ * @default true
78
+ */
79
+ useGpu?: boolean;
80
+ /**
81
+ * GPU device index to use.
82
+ *
83
+ * Only relevant when multiple GPUs are available.
84
+ *
85
+ * @default 0
86
+ */
87
+ gpuDevice?: number;
88
+ /**
89
+ * VAD threshold for voice activity detection.
90
+ *
91
+ * Higher values are more conservative (less likely to detect speech).
92
+ * Range: 0.0 to 1.0
93
+ *
94
+ * @default 0.5
95
+ */
96
+ vadThreshold?: number;
97
+ /**
98
+ * Minimum speech duration for VAD in seconds.
99
+ *
100
+ * Audio chunks shorter than this will be filtered out.
101
+ *
102
+ * @default 0.1
103
+ */
104
+ vadMinSpeechDuration?: number;
105
+ /**
106
+ * Minimum silence duration for VAD in seconds.
107
+ *
108
+ * Silence shorter than this won't trigger segment boundaries.
109
+ *
110
+ * @default 0.5
111
+ */
112
+ vadMinSilenceDuration?: number;
113
+ }
114
+ /**
115
+ * High-level Whisper transcriber for automatic speech recognition.
116
+ *
117
+ * Provides streaming audio transcription using OpenAI's Whisper model via whisper.cpp.
118
+ * Supports GPU acceleration, VAD (Voice Activity Detection), and real-time processing.
119
+ * Built on FFmpeg's whisper filter with automatic frame metadata extraction.
120
+ *
121
+ * Features:
122
+ * - Real-time streaming transcription
123
+ * - GPU acceleration (CUDA/Vulkan/Metal)
124
+ * - Voice Activity Detection for better segmentation
125
+ * - Automatic language detection
126
+ * - Type-safe transcription segments
127
+ * - Frame-based API for flexible integration
128
+ *
129
+ * @example
130
+ * ```typescript
131
+ * import { Demuxer, Decoder, WhisperTranscriber } from 'node-av/api';
132
+ * import { WhisperDownloader } from 'node-av/api/utilities/whisper-model';
133
+ *
134
+ * // Download model
135
+ * const modelPath = await WhisperDownloader.downloadModel({
136
+ * model: 'base.en',
137
+ * outputPath: './models'
138
+ * });
139
+ *
140
+ * // Open audio and create decoder
141
+ * await using input = await Demuxer.open('podcast.mp3');
142
+ * using decoder = await Decoder.create(input.audio());
143
+ *
144
+ * // Create transcriber
145
+ * await using transcriber = await WhisperTranscriber.create({
146
+ * model: modelPath,
147
+ * language: 'en'
148
+ * });
149
+ *
150
+ * // Transcribe using decoded frames
151
+ * for await (const segment of transcriber.transcribe(decoder.frames(input.packets()))) {
152
+ * const timestamp = `[${(segment.start / 1000).toFixed(1)}s - ${(segment.end / 1000).toFixed(1)}s]`;
153
+ * console.log(`${timestamp}: ${segment.text}`);
154
+ * }
155
+ * ```
156
+ *
157
+ * @example
158
+ * ```typescript
159
+ * // Real-time microphone transcription with VAD
160
+ * import { Demuxer, Decoder, WhisperTranscriber } from 'node-av/api';
161
+ * import { WhisperDownloader } from 'node-av/api/utilities/whisper-model';
162
+ *
163
+ * // Download VAD model
164
+ * const vadPath = await WhisperDownloader.downloadVADModel('silero-v5.1.2', './models');
165
+ *
166
+ * // Setup transcriber with VAD
167
+ * await using transcriber = await WhisperTranscriber.create({
168
+ * model: './models/ggml-medium.bin',
169
+ * language: 'en',
170
+ * queue: 10,
171
+ * vadModel: vadPath,
172
+ * vadThreshold: 0.5
173
+ * });
174
+ *
175
+ * // Live transcription from decoded audio frames
176
+ * using decoder = await Decoder.create(microphoneStream);
177
+ * for await (const segment of transcriber.transcribe(decoder.frames(microphonePackets))) {
178
+ * if (segment.turn) {
179
+ * console.log('\n--- New speaker turn ---');
180
+ * }
181
+ * console.log(segment.text);
182
+ * }
183
+ * ```
184
+ *
185
+ * @see {@link WhisperDownloader} For downloading Whisper and VAD models
186
+ * @see {@link Decoder} For audio decoding
187
+ * @see {@link Demuxer} For reading media files
188
+ */
189
+ export declare class WhisperTranscriber implements Disposable {
190
+ private options;
191
+ private isClosed;
192
+ /**
193
+ * @param options - Transcriber configuration
194
+ *
195
+ * Use {@link create} factory method instead
196
+ *
197
+ * @internal
198
+ */
199
+ private constructor();
200
+ /**
201
+ * Create a Whisper transcriber instance.
202
+ *
203
+ * Initializes the transcriber with the specified model and configuration.
204
+ * The transcriber can then process audio frames from any source.
205
+ *
206
+ * @param options - Transcriber configuration
207
+ *
208
+ * @returns Configured transcriber instance
209
+ *
210
+ * @throws {Error} If model file does not exist
211
+ *
212
+ * @throws {Error} If VAD model file does not exist (when vadModel specified)
213
+ *
214
+ * @example
215
+ * ```typescript
216
+ * import { WhisperTranscriber } from 'node-av/api';
217
+ *
218
+ * // Create transcriber with basic options
219
+ * await using transcriber = await WhisperTranscriber.create({
220
+ * model: './models/ggml-base.en.bin',
221
+ * language: 'en'
222
+ * });
223
+ * ```
224
+ *
225
+ * @example
226
+ * ```typescript
227
+ * // Create transcriber with GPU and VAD support
228
+ * await using transcriber = await WhisperTranscriber.create({
229
+ * model: './models/ggml-base.bin',
230
+ * language: 'auto',
231
+ * useGpu: true,
232
+ * gpuDevice: 0,
233
+ * vadModel: './models/ggml-silero-v5.1.2.bin',
234
+ * vadThreshold: 0.5,
235
+ * queue: 10
236
+ * });
237
+ * ```
238
+ */
239
+ static create(options: WhisperTranscriberOptions): Promise<WhisperTranscriber>;
240
+ /**
241
+ * Transcribe audio frames to text segments.
242
+ *
243
+ * Processes audio frames through the Whisper filter and yields transcribed segments.
244
+ * Each segment contains start/end timestamps and the transcribed text.
245
+ * Reads metadata directly from frame metadata tags (lavfi.whisper.text, lavfi.whisper.duration).
246
+ *
247
+ * The generator continues until the input stream ends or close() is called.
248
+ * Always use with `for await...of` to properly handle async iteration.
249
+ *
250
+ * @param frames - Audio frames (from Decoder.frames()) or single frame to transcribe
251
+ *
252
+ * @yields {WhisperSegment} Transcribed audio segments with timing and text
253
+ *
254
+ * @throws {FFmpegError} If filter initialization fails
255
+ *
256
+ * @example
257
+ * ```typescript
258
+ * import { Demuxer, Decoder, WhisperTranscriber } from 'node-av/api';
259
+ *
260
+ * await using input = await Demuxer.open('podcast.mp3');
261
+ * using decoder = await Decoder.create(input.audio());
262
+ * await using transcriber = await WhisperTranscriber.create({
263
+ * model: './models/ggml-base.en.bin',
264
+ * language: 'en'
265
+ * });
266
+ *
267
+ * // Transcribe decoded frames
268
+ * for await (const segment of transcriber.transcribe(decoder.frames(input.packets()))) {
269
+ * console.log(`[${segment.start}ms]: ${segment.text}`);
270
+ * }
271
+ * ```
272
+ *
273
+ * @example
274
+ * ```typescript
275
+ * // With custom timing format
276
+ * const audioFrames = decoder.frames(input.packets());
277
+ * for await (const segment of transcriber.transcribe(audioFrames)) {
278
+ * const startSec = (segment.start / 1000).toFixed(2);
279
+ * const endSec = (segment.end / 1000).toFixed(2);
280
+ * console.log(`[${startSec}s - ${endSec}s]: ${segment.text}`);
281
+ * }
282
+ * ```
283
+ *
284
+ * @example
285
+ * ```typescript
286
+ * // Process single frame
287
+ * using frame = decoder.decodeSync(packet);
288
+ * for await (const segment of transcriber.transcribe(frame)) {
289
+ * console.log(`Transcribed: ${segment.text}`);
290
+ * }
291
+ * ```
292
+ */
293
+ transcribe(frames: AsyncIterable<Frame | null> | Frame | null): AsyncGenerator<WhisperSegment, void, unknown>;
294
+ /**
295
+ * Close transcriber and clean up resources.
296
+ *
297
+ * Releases filter graph and stops frame processing.
298
+ * Called automatically when using `await using` syntax.
299
+ *
300
+ * @example
301
+ * ```typescript
302
+ * // Automatic cleanup
303
+ * {
304
+ * await using transcriber = await WhisperTranscriber.create(options);
305
+ * // Use transcriber
306
+ * } // Automatically calls close()
307
+ *
308
+ * // Manual cleanup
309
+ * const transcriber = await WhisperTranscriber.create(options);
310
+ * try {
311
+ * // Use transcriber
312
+ * } finally {
313
+ * await transcriber.close();
314
+ * }
315
+ * ```
316
+ */
317
+ close(): void;
318
+ /**
319
+ * Symbol.asyncDispose implementation for `await using` syntax.
320
+ *
321
+ * @internal
322
+ */
323
+ [Symbol.dispose](): void;
324
+ }
@@ -0,0 +1,362 @@
1
+ var __addDisposableResource = (this && this.__addDisposableResource) || function (env, value, async) {
2
+ if (value !== null && value !== void 0) {
3
+ if (typeof value !== "object" && typeof value !== "function") throw new TypeError("Object expected.");
4
+ var dispose, inner;
5
+ if (async) {
6
+ if (!Symbol.asyncDispose) throw new TypeError("Symbol.asyncDispose is not defined.");
7
+ dispose = value[Symbol.asyncDispose];
8
+ }
9
+ if (dispose === void 0) {
10
+ if (!Symbol.dispose) throw new TypeError("Symbol.dispose is not defined.");
11
+ dispose = value[Symbol.dispose];
12
+ if (async) inner = dispose;
13
+ }
14
+ if (typeof dispose !== "function") throw new TypeError("Object not disposable.");
15
+ if (inner) dispose = function() { try { inner.call(this); } catch (e) { return Promise.reject(e); } };
16
+ env.stack.push({ value: value, dispose: dispose, async: async });
17
+ }
18
+ else if (async) {
19
+ env.stack.push({ async: true });
20
+ }
21
+ return value;
22
+ };
23
+ var __disposeResources = (this && this.__disposeResources) || (function (SuppressedError) {
24
+ return function (env) {
25
+ function fail(e) {
26
+ env.error = env.hasError ? new SuppressedError(e, env.error, "An error was suppressed during disposal.") : e;
27
+ env.hasError = true;
28
+ }
29
+ var r, s = 0;
30
+ function next() {
31
+ while (r = env.stack.pop()) {
32
+ try {
33
+ if (!r.async && s === 1) return s = 0, env.stack.push(r), Promise.resolve().then(next);
34
+ if (r.dispose) {
35
+ var result = r.dispose.call(r.value);
36
+ if (r.async) return s |= 2, Promise.resolve(result).then(next, function(e) { fail(e); return next(); });
37
+ }
38
+ else s |= 1;
39
+ }
40
+ catch (e) {
41
+ fail(e);
42
+ }
43
+ }
44
+ if (s === 1) return env.hasError ? Promise.reject(env.error) : Promise.resolve();
45
+ if (env.hasError) throw env.error;
46
+ }
47
+ return next();
48
+ };
49
+ })(typeof SuppressedError === "function" ? SuppressedError : function (error, suppressed, message) {
50
+ var e = new Error(message);
51
+ return e.name = "SuppressedError", e.error = error, e.suppressed = suppressed, e;
52
+ });
53
+ import { FilterPreset } from './filter-presets.js';
54
+ import { FilterAPI } from './filter.js';
55
+ import { WhisperDownloader } from './utilities/whisper-model.js';
56
+ /**
57
+ * High-level Whisper transcriber for automatic speech recognition.
58
+ *
59
+ * Provides streaming audio transcription using OpenAI's Whisper model via whisper.cpp.
60
+ * Supports GPU acceleration, VAD (Voice Activity Detection), and real-time processing.
61
+ * Built on FFmpeg's whisper filter with automatic frame metadata extraction.
62
+ *
63
+ * Features:
64
+ * - Real-time streaming transcription
65
+ * - GPU acceleration (CUDA/Vulkan/Metal)
66
+ * - Voice Activity Detection for better segmentation
67
+ * - Automatic language detection
68
+ * - Type-safe transcription segments
69
+ * - Frame-based API for flexible integration
70
+ *
71
+ * @example
72
+ * ```typescript
73
+ * import { Demuxer, Decoder, WhisperTranscriber } from 'node-av/api';
74
+ * import { WhisperDownloader } from 'node-av/api/utilities/whisper-model';
75
+ *
76
+ * // Download model
77
+ * const modelPath = await WhisperDownloader.downloadModel({
78
+ * model: 'base.en',
79
+ * outputPath: './models'
80
+ * });
81
+ *
82
+ * // Open audio and create decoder
83
+ * await using input = await Demuxer.open('podcast.mp3');
84
+ * using decoder = await Decoder.create(input.audio());
85
+ *
86
+ * // Create transcriber
87
+ * await using transcriber = await WhisperTranscriber.create({
88
+ * model: modelPath,
89
+ * language: 'en'
90
+ * });
91
+ *
92
+ * // Transcribe using decoded frames
93
+ * for await (const segment of transcriber.transcribe(decoder.frames(input.packets()))) {
94
+ * const timestamp = `[${(segment.start / 1000).toFixed(1)}s - ${(segment.end / 1000).toFixed(1)}s]`;
95
+ * console.log(`${timestamp}: ${segment.text}`);
96
+ * }
97
+ * ```
98
+ *
99
+ * @example
100
+ * ```typescript
101
+ * // Real-time microphone transcription with VAD
102
+ * import { Demuxer, Decoder, WhisperTranscriber } from 'node-av/api';
103
+ * import { WhisperDownloader } from 'node-av/api/utilities/whisper-model';
104
+ *
105
+ * // Download VAD model
106
+ * const vadPath = await WhisperDownloader.downloadVADModel('silero-v5.1.2', './models');
107
+ *
108
+ * // Setup transcriber with VAD
109
+ * await using transcriber = await WhisperTranscriber.create({
110
+ * model: './models/ggml-medium.bin',
111
+ * language: 'en',
112
+ * queue: 10,
113
+ * vadModel: vadPath,
114
+ * vadThreshold: 0.5
115
+ * });
116
+ *
117
+ * // Live transcription from decoded audio frames
118
+ * using decoder = await Decoder.create(microphoneStream);
119
+ * for await (const segment of transcriber.transcribe(decoder.frames(microphonePackets))) {
120
+ * if (segment.turn) {
121
+ * console.log('\n--- New speaker turn ---');
122
+ * }
123
+ * console.log(segment.text);
124
+ * }
125
+ * ```
126
+ *
127
+ * @see {@link WhisperDownloader} For downloading Whisper and VAD models
128
+ * @see {@link Decoder} For audio decoding
129
+ * @see {@link Demuxer} For reading media files
130
+ */
131
+ export class WhisperTranscriber {
132
+ options;
133
+ isClosed = false;
134
+ /**
135
+ * @param options - Transcriber configuration
136
+ *
137
+ * Use {@link create} factory method instead
138
+ *
139
+ * @internal
140
+ */
141
+ constructor(options) {
142
+ this.options = options;
143
+ }
144
+ /**
145
+ * Create a Whisper transcriber instance.
146
+ *
147
+ * Initializes the transcriber with the specified model and configuration.
148
+ * The transcriber can then process audio frames from any source.
149
+ *
150
+ * @param options - Transcriber configuration
151
+ *
152
+ * @returns Configured transcriber instance
153
+ *
154
+ * @throws {Error} If model file does not exist
155
+ *
156
+ * @throws {Error} If VAD model file does not exist (when vadModel specified)
157
+ *
158
+ * @example
159
+ * ```typescript
160
+ * import { WhisperTranscriber } from 'node-av/api';
161
+ *
162
+ * // Create transcriber with basic options
163
+ * await using transcriber = await WhisperTranscriber.create({
164
+ * model: './models/ggml-base.en.bin',
165
+ * language: 'en'
166
+ * });
167
+ * ```
168
+ *
169
+ * @example
170
+ * ```typescript
171
+ * // Create transcriber with GPU and VAD support
172
+ * await using transcriber = await WhisperTranscriber.create({
173
+ * model: './models/ggml-base.bin',
174
+ * language: 'auto',
175
+ * useGpu: true,
176
+ * gpuDevice: 0,
177
+ * vadModel: './models/ggml-silero-v5.1.2.bin',
178
+ * vadThreshold: 0.5,
179
+ * queue: 10
180
+ * });
181
+ * ```
182
+ */
183
+ static async create(options) {
184
+ const modelsToDownload = [options.model, options.vadModel].filter(Boolean);
185
+ const [modelPath, vadModelPath] = await WhisperDownloader.downloadModels(modelsToDownload, options.modelDir);
186
+ const fullOptions = {
187
+ model: modelPath,
188
+ vadModel: vadModelPath,
189
+ modelDir: options.modelDir ?? WhisperDownloader.DEFAULT_MODEL_PATH,
190
+ language: options.language ?? 'auto',
191
+ queue: options.queue ?? 3,
192
+ useGpu: options.useGpu ?? true,
193
+ gpuDevice: options.gpuDevice ?? 0,
194
+ vadThreshold: options.vadThreshold ?? 0.5,
195
+ vadMinSpeechDuration: options.vadMinSpeechDuration ?? 0.1,
196
+ vadMinSilenceDuration: options.vadMinSilenceDuration ?? 0.5,
197
+ };
198
+ return new WhisperTranscriber(fullOptions);
199
+ }
200
+ /**
201
+ * Transcribe audio frames to text segments.
202
+ *
203
+ * Processes audio frames through the Whisper filter and yields transcribed segments.
204
+ * Each segment contains start/end timestamps and the transcribed text.
205
+ * Reads metadata directly from frame metadata tags (lavfi.whisper.text, lavfi.whisper.duration).
206
+ *
207
+ * The generator continues until the input stream ends or close() is called.
208
+ * Always use with `for await...of` to properly handle async iteration.
209
+ *
210
+ * @param frames - Audio frames (from Decoder.frames()) or single frame to transcribe
211
+ *
212
+ * @yields {WhisperSegment} Transcribed audio segments with timing and text
213
+ *
214
+ * @throws {FFmpegError} If filter initialization fails
215
+ *
216
+ * @example
217
+ * ```typescript
218
+ * import { Demuxer, Decoder, WhisperTranscriber } from 'node-av/api';
219
+ *
220
+ * await using input = await Demuxer.open('podcast.mp3');
221
+ * using decoder = await Decoder.create(input.audio());
222
+ * await using transcriber = await WhisperTranscriber.create({
223
+ * model: './models/ggml-base.en.bin',
224
+ * language: 'en'
225
+ * });
226
+ *
227
+ * // Transcribe decoded frames
228
+ * for await (const segment of transcriber.transcribe(decoder.frames(input.packets()))) {
229
+ * console.log(`[${segment.start}ms]: ${segment.text}`);
230
+ * }
231
+ * ```
232
+ *
233
+ * @example
234
+ * ```typescript
235
+ * // With custom timing format
236
+ * const audioFrames = decoder.frames(input.packets());
237
+ * for await (const segment of transcriber.transcribe(audioFrames)) {
238
+ * const startSec = (segment.start / 1000).toFixed(2);
239
+ * const endSec = (segment.end / 1000).toFixed(2);
240
+ * console.log(`[${startSec}s - ${endSec}s]: ${segment.text}`);
241
+ * }
242
+ * ```
243
+ *
244
+ * @example
245
+ * ```typescript
246
+ * // Process single frame
247
+ * using frame = decoder.decodeSync(packet);
248
+ * for await (const segment of transcriber.transcribe(frame)) {
249
+ * console.log(`Transcribed: ${segment.text}`);
250
+ * }
251
+ * ```
252
+ */
253
+ async *transcribe(frames) {
254
+ const env_1 = { stack: [], error: void 0, hasError: false };
255
+ try {
256
+ const chain = FilterPreset.chain()
257
+ .whisper({
258
+ model: this.options.model,
259
+ language: this.options.language,
260
+ queue: this.options.queue,
261
+ useGpu: this.options.useGpu,
262
+ gpuDevice: this.options.gpuDevice,
263
+ vadModel: this.options.vadModel,
264
+ vadThreshold: this.options.vadThreshold,
265
+ vadMinSpeechDuration: this.options.vadMinSpeechDuration,
266
+ vadMinSilenceDuration: this.options.vadMinSilenceDuration,
267
+ })
268
+ .build();
269
+ // Create filter API
270
+ const filter = __addDisposableResource(env_1, FilterAPI.create(chain, {
271
+ allowReinit: true,
272
+ dropOnChange: false,
273
+ }), false);
274
+ // Track cumulative time for start/end timestamps
275
+ let cumulativeTime = 0; // in milliseconds
276
+ const filterGenerator = filter.frames(frames);
277
+ // Decode and process frames through filter
278
+ for await (const frame_1 of filterGenerator) {
279
+ const env_2 = { stack: [], error: void 0, hasError: false };
280
+ try {
281
+ const frame = __addDisposableResource(env_2, frame_1, false);
282
+ if (this.isClosed) {
283
+ break;
284
+ }
285
+ if (!frame?.isAudio()) {
286
+ continue;
287
+ }
288
+ // Get frame metadata
289
+ const metadata = frame.getMetadata();
290
+ const text = metadata.get('lavfi.whisper.text');
291
+ const durationStr = metadata.get('lavfi.whisper.duration');
292
+ if (text?.trim()) {
293
+ // Parse duration (in seconds)
294
+ const duration = durationStr ? parseFloat(durationStr) * 1000 : 0;
295
+ // Yield transcribed segment
296
+ yield {
297
+ start: cumulativeTime,
298
+ end: cumulativeTime + duration,
299
+ text: text.trim(),
300
+ };
301
+ // Update cumulative time
302
+ if (duration > 0) {
303
+ cumulativeTime += duration;
304
+ }
305
+ }
306
+ }
307
+ catch (e_1) {
308
+ env_2.error = e_1;
309
+ env_2.hasError = true;
310
+ }
311
+ finally {
312
+ __disposeResources(env_2);
313
+ }
314
+ }
315
+ }
316
+ catch (e_2) {
317
+ env_1.error = e_2;
318
+ env_1.hasError = true;
319
+ }
320
+ finally {
321
+ __disposeResources(env_1);
322
+ }
323
+ }
324
+ /**
325
+ * Close transcriber and clean up resources.
326
+ *
327
+ * Releases filter graph and stops frame processing.
328
+ * Called automatically when using `await using` syntax.
329
+ *
330
+ * @example
331
+ * ```typescript
332
+ * // Automatic cleanup
333
+ * {
334
+ * await using transcriber = await WhisperTranscriber.create(options);
335
+ * // Use transcriber
336
+ * } // Automatically calls close()
337
+ *
338
+ * // Manual cleanup
339
+ * const transcriber = await WhisperTranscriber.create(options);
340
+ * try {
341
+ * // Use transcriber
342
+ * } finally {
343
+ * await transcriber.close();
344
+ * }
345
+ * ```
346
+ */
347
+ close() {
348
+ if (this.isClosed) {
349
+ return;
350
+ }
351
+ this.isClosed = true;
352
+ }
353
+ /**
354
+ * Symbol.asyncDispose implementation for `await using` syntax.
355
+ *
356
+ * @internal
357
+ */
358
+ [Symbol.dispose]() {
359
+ this.close();
360
+ }
361
+ }
362
+ //# sourceMappingURL=whisper.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"whisper.js","sourceRoot":"","sources":["../../src/api/whisper.ts"],"names":[],"mappings":";;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAAA,OAAO,EAAE,YAAY,EAAE,MAAM,qBAAqB,CAAC;AACnD,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,iBAAiB,EAAE,MAAM,8BAA8B,CAAC;AAkIjE;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA0EG;AACH,MAAM,OAAO,kBAAkB;IACrB,OAAO,CAAsC;IAC7C,QAAQ,GAAG,KAAK,CAAC;IAEzB;;;;;;OAMG;IACH,YAAoB,OAA4C;QAC9D,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;IACzB,CAAC;IAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;OAsCG;IACH,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,OAAkC;QACpD,MAAM,gBAAgB,GAAG,CAAC,OAAO,CAAC,KAAK,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,OAAO,CAA+C,CAAC;QACzH,MAAM,CAAC,SAAS,EAAE,YAAY,CAAC,GAAG,MAAM,iBAAiB,CAAC,cAAc,CAAC,gBAAgB,EAAE,OAAO,CAAC,QAAQ,CAAC,CAAC;QAE7G,MAAM,WAAW,GAAwC;YACvD,KAAK,EAAE,SAA6B;YACpC,QAAQ,EAAE,YAAmC;YAC7C,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,iBAAiB,CAAC,kBAAkB;YAClE,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,MAAM;YACpC,KAAK,EAAE,OAAO,CAAC,KAAK,IAAI,CAAC;YACzB,MAAM,EAAE,OAAO,CAAC,MAAM,IAAI,IAAI;YAC9B,SAAS,EAAE,OAAO,CAAC,SAAS,IAAI,CAAC;YACjC,YAAY,EAAE,OAAO,CAAC,YAAY,IAAI,GAAG;YACzC,oBAAoB,EAAE,OAAO,CAAC,oBAAoB,IAAI,GAAG;YACzD,qBAAqB,EAAE,OAAO,CAAC,qBAAqB,IAAI,GAAG;SAC5D,CAAC;QAEF,OAAO,IAAI,kBAAkB,CAAC,WAAW,CAAC,CAAC;IAC7C,CAAC;IAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;OAoDG;IACH,KAAK,CAAC,CAAC,UAAU,CAAC,MAAkD;;;YAClE,MAAM,KAAK,GAAG,YAAY,CAAC,KAAK,EAAE;iBAC/B,OAAO,CAAC;gBACP,KAAK,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK;gBACzB,QAAQ,EAAE,IAAI,CAAC,OAAO,CAAC,QAAQ;gBAC/B,KAAK,EAAE,IAAI,CAAC,OAAO,CAAC,KAAK;gBACzB,MAAM,EAAE,IAAI,CAAC,OAAO,CAAC,MAAM;gBAC3B,SAAS,EAAE,IAAI,CAAC,OAAO,CAAC,SAAS;gBACjC,QAAQ,EAAE,IAAI,CAAC,OAAO,CAAC,QAAQ;gBAC/B,YAAY,EAAE,IAAI,CAAC,OAAO,CAAC,YAAY;gBACvC,oBAAoB,EAAE,IAAI,CAAC,OAAO,CAAC,oBAAoB;gBACvD,qBAAqB,EAAE,IAAI,CAAC,OAAO,CAAC,qBAAqB;aAC1D,CAAC;iBACD,KAAK,EAAE,CAAC;YAEX,oBAAoB;YACpB,MAAM,MAAM,kCAAG,SAAS,CAAC,MAAM,CAAC,KAAK,EAAE;gBACrC,WAAW,EAAE,IAAI;gBACjB,YAAY,EAAE,KAAK;aACpB,CAAC,QAAA,CAAC
;YAEH,iDAAiD;YACjD,IAAI,cAAc,GAAG,CAAC,CAAC,CAAC,kBAAkB;YAC1C,MAAM,eAAe,GAAG,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC;YAE9C,2CAA2C;YAC3C,IAAI,KAAK,mBAAiB,eAAe,EAAE,CAAC;;;0BAA3B,KAAK,iDAAA;oBACpB,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;wBAClB,MAAM;oBACR,CAAC;oBAED,IAAI,CAAC,KAAK,EAAE,OAAO,EAAE,EAAE,CAAC;wBACtB,SAAS;oBACX,CAAC;oBAED,qBAAqB;oBACrB,MAAM,QAAQ,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC;oBACrC,MAAM,IAAI,GAAG,QAAQ,CAAC,GAAG,CAAC,oBAAoB,CAAC,CAAC;oBAChD,MAAM,WAAW,GAAG,QAAQ,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC;oBAE3D,IAAI,IAAI,EAAE,IAAI,EAAE,EAAE,CAAC;wBACjB,8BAA8B;wBAC9B,MAAM,QAAQ,GAAG,WAAW,CAAC,CAAC,CAAC,UAAU,CAAC,WAAW,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;wBAElE,4BAA4B;wBAC5B,MAAM;4BACJ,KAAK,EAAE,cAAc;4BACrB,GAAG,EAAE,cAAc,GAAG,QAAQ;4BAC9B,IAAI,EAAE,IAAI,CAAC,IAAI,EAAE;yBAClB,CAAC;wBAEF,yBAAyB;wBACzB,IAAI,QAAQ,GAAG,CAAC,EAAE,CAAC;4BACjB,cAAc,IAAI,QAAQ,CAAC;wBAC7B,CAAC;oBACH,CAAC;;;;;;;;;aACF;;;;;;;;;KACF;IAED;;;;;;;;;;;;;;;;;;;;;;OAsBG;IACH,KAAK;QACH,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC;YAClB,OAAO;QACT,CAAC;QAED,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC;IACvB,CAAC;IAED;;;;OAIG;IACH,CAAC,MAAM,CAAC,OAAO,CAAC;QACd,IAAI,CAAC,KAAK,EAAE,CAAC;IACf,CAAC;CACF"}