react-native-executorch 0.5.1-rc.0 → 0.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. package/README.md +132 -0
  2. package/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +4 -10
  3. package/common/rnexecutorch/models/speech_to_text/SpeechToText.h +1 -1
  4. package/common/rnexecutorch/models/speech_to_text/SpeechToTextStrategy.h +3 -2
  5. package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.cpp +16 -4
  6. package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.h +2 -2
  7. package/ios/RnExecutorch.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
  8. package/ios/RnExecutorch.xcodeproj/project.xcworkspace/xcuserdata/jakubchmura.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
  9. package/ios/RnExecutorch.xcodeproj/xcuserdata/jakubchmura.xcuserdatad/xcschemes/xcschememanagement.plist +14 -0
  10. package/lib/module/constants/modelUrls.js +61 -36
  11. package/lib/module/constants/modelUrls.js.map +1 -1
  12. package/lib/module/constants/ocr/models.js +1 -1
  13. package/lib/module/hooks/natural_language_processing/useSpeechToText.js +71 -34
  14. package/lib/module/hooks/natural_language_processing/useSpeechToText.js.map +1 -1
  15. package/lib/module/index.js +2 -3
  16. package/lib/module/index.js.map +1 -1
  17. package/lib/module/modules/natural_language_processing/SpeechToTextModule.js +72 -31
  18. package/lib/module/modules/natural_language_processing/SpeechToTextModule.js.map +1 -1
  19. package/lib/module/types/stt.js +1 -85
  20. package/lib/module/types/stt.js.map +1 -1
  21. package/lib/module/utils/SpeechToTextModule/ASR.js +191 -0
  22. package/lib/module/utils/SpeechToTextModule/ASR.js.map +1 -0
  23. package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js +73 -0
  24. package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js.map +1 -0
  25. package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js +56 -0
  26. package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js.map +1 -0
  27. package/lib/tsconfig.tsbuildinfo +1 -0
  28. package/lib/typescript/constants/modelUrls.d.ts +24 -7
  29. package/lib/typescript/constants/modelUrls.d.ts.map +1 -1
  30. package/lib/typescript/constants/ocr/models.d.ts +126 -126
  31. package/lib/typescript/hooks/natural_language_processing/useSpeechToText.d.ts +15 -24
  32. package/lib/typescript/hooks/natural_language_processing/useSpeechToText.d.ts.map +1 -1
  33. package/lib/typescript/index.d.ts +2 -3
  34. package/lib/typescript/index.d.ts.map +1 -1
  35. package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts +19 -22
  36. package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts.map +1 -1
  37. package/lib/typescript/types/stt.d.ts +17 -91
  38. package/lib/typescript/types/stt.d.ts.map +1 -1
  39. package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts +27 -0
  40. package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts.map +1 -0
  41. package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts +23 -0
  42. package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts.map +1 -0
  43. package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts +13 -0
  44. package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts.map +1 -0
  45. package/package.json +5 -3
  46. package/src/constants/modelUrls.ts +70 -37
  47. package/src/constants/ocr/models.ts +1 -1
  48. package/src/hooks/natural_language_processing/useSpeechToText.ts +87 -92
  49. package/src/index.ts +6 -8
  50. package/src/modules/natural_language_processing/SpeechToTextModule.ts +81 -69
  51. package/src/types/stt.ts +97 -92
  52. package/src/utils/SpeechToTextModule/ASR.ts +303 -0
  53. package/src/utils/SpeechToTextModule/OnlineProcessor.ts +87 -0
  54. package/src/utils/SpeechToTextModule/hypothesisBuffer.ts +79 -0
  55. package/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/project.xcworkspace/xcuserdata/jakubchmura.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
  56. package/common/rnexecutorch/models/speech_to_text/MoonshineStrategy.cpp +0 -31
  57. package/common/rnexecutorch/models/speech_to_text/MoonshineStrategy.h +0 -21
  58. package/lib/common/Logger.d.ts +0 -8
  59. package/lib/common/Logger.js +0 -19
  60. package/lib/constants/modelUrls.d.ts +0 -89
  61. package/lib/constants/modelUrls.js +0 -116
  62. package/lib/constants/sttDefaults.js +0 -66
  63. package/lib/controllers/LLMController.js +0 -210
  64. package/lib/controllers/OCRController.js +0 -65
  65. package/lib/controllers/SpeechToTextController.d.ts +0 -52
  66. package/lib/controllers/SpeechToTextController.js +0 -343
  67. package/lib/hooks/natural_language_processing/useSpeechToText.js +0 -44
  68. package/lib/index.d.ts +0 -50
  69. package/lib/index.js +0 -59
  70. package/lib/module/constants/sttDefaults.js +0 -74
  71. package/lib/module/constants/sttDefaults.js.map +0 -1
  72. package/lib/module/controllers/SpeechToTextController.js +0 -320
  73. package/lib/module/controllers/SpeechToTextController.js.map +0 -1
  74. package/lib/modules/natural_language_processing/SpeechToTextModule.d.ts +0 -14
  75. package/lib/modules/natural_language_processing/SpeechToTextModule.js +0 -30
  76. package/lib/modules/natural_language_processing/TokenizerModule.js +0 -29
  77. package/lib/native/RnExecutorchModules.d.ts +0 -3
  78. package/lib/native/RnExecutorchModules.js +0 -16
  79. package/lib/typescript/constants/sttDefaults.d.ts +0 -29
  80. package/lib/typescript/constants/sttDefaults.d.ts.map +0 -1
  81. package/lib/typescript/controllers/SpeechToTextController.d.ts +0 -57
  82. package/lib/typescript/controllers/SpeechToTextController.d.ts.map +0 -1
  83. package/lib/utils/ResourceFetcherUtils.js +0 -119
  84. package/lib/utils/llm.js +0 -72
  85. package/src/constants/sttDefaults.ts +0 -82
  86. package/src/controllers/SpeechToTextController.ts +0 -471
  87. package/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/project.xcworkspace/xcuserdata/norbertklockiewicz.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
  88. /package/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/xcuserdata/{norbertklockiewicz.xcuserdatad → jakubchmura.xcuserdatad}/xcschemes/xcschememanagement.plist +0 -0
@@ -1,471 +0,0 @@
1
- import {
2
- HAMMING_DIST_THRESHOLD,
3
- MODEL_CONFIGS,
4
- SECOND,
5
- MODES,
6
- NUM_TOKENS_TO_TRIM,
7
- STREAMING_ACTION,
8
- } from '../constants/sttDefaults';
9
- import { AvailableModels, ModelConfig } from '../types/stt';
10
- import { TokenizerModule } from '../modules/natural_language_processing/TokenizerModule';
11
- import { ResourceSource } from '../types/common';
12
- import { ResourceFetcher } from '../utils/ResourceFetcher';
13
- import { longCommonInfPref } from '../utils/stt';
14
- import { SpeechToTextLanguage } from '../types/stt';
15
- import { ETError, getError } from '../Error';
16
- import { Logger } from '../common/Logger';
17
-
18
- export class SpeechToTextController {
19
- private speechToTextNativeModule: any;
20
-
21
- public sequence: number[] = [];
22
- public isReady = false;
23
- public isGenerating = false;
24
-
25
- private tokenizerModule: TokenizerModule;
26
- private overlapSeconds!: number;
27
- private windowSize!: number;
28
- private chunks: number[][] = [];
29
- private seqs: number[][] = [];
30
- private prevSeq: number[] = [];
31
- private waveform: number[] = [];
32
- private numOfChunks = 0;
33
- private streaming = false;
34
-
35
- // User callbacks
36
- private decodedTranscribeCallback: (sequence: number[]) => void;
37
- private isReadyCallback: (isReady: boolean) => void;
38
- private isGeneratingCallback: (isGenerating: boolean) => void;
39
- private onErrorCallback: (error: any) => void;
40
- private config!: ModelConfig;
41
-
42
/**
 * Creates the controller and wraps the user-supplied callbacks so that the
 * controller's own `isReady` / `isGenerating` flags stay in sync with them.
 *
 * @param transcribeCallback receives the decoded text for every (partial) token sequence.
 * @param isReadyCallback optional, notified whenever readiness changes.
 * @param isGeneratingCallback optional, notified whenever generation starts or stops.
 * @param onErrorCallback optional, receives an `Error`, or `undefined` when the error is cleared.
 * @param overlapSeconds optional override, forwarded to `configureStreaming`.
 * @param windowSize optional override, forwarded to `configureStreaming`.
 * @param streamingConfig preset name from `MODES`; falls back to `'balanced'`.
 */
constructor({
  transcribeCallback,
  isReadyCallback,
  isGeneratingCallback,
  onErrorCallback,
  overlapSeconds,
  windowSize,
  streamingConfig,
}: {
  transcribeCallback: (sequence: string) => void;
  isReadyCallback?: (isReady: boolean) => void;
  isGeneratingCallback?: (isGenerating: boolean) => void;
  onErrorCallback?: (error: Error | undefined) => void;
  overlapSeconds?: number;
  windowSize?: number;
  streamingConfig?: keyof typeof MODES;
}) {
  this.tokenizerModule = new TokenizerModule();
  // Token ids are converted to text before they reach the user callback.
  this.decodedTranscribeCallback = async (seq) =>
    transcribeCallback(await this.tokenIdsToText(seq));
  this.isReadyCallback = (isReady) => {
    this.isReady = isReady;
    isReadyCallback?.(isReady);
  };
  this.isGeneratingCallback = (isGenerating) => {
    this.isGenerating = isGenerating;
    isGeneratingCallback?.(isGenerating);
  };
  this.onErrorCallback = (error) => {
    if (onErrorCallback) {
      onErrorCallback(error ? new Error(getError(error)) : undefined);
      return;
    }
    // `load()` and `resetState()` call this with `undefined` to clear a
    // previous error. Without a user handler that must be a no-op; throwing
    // here would make every load/transcribe fail before doing any work.
    if (error == null) {
      return;
    }
    throw new Error(getError(error));
  };
  this.configureStreaming(
    overlapSeconds,
    windowSize,
    streamingConfig || 'balanced'
  );
}
84
-
85
/**
 * Loads the tokenizer, fetches the encoder/decoder resources and creates the
 * native speech-to-text module. Clears the error state and marks the
 * controller as not ready before starting; marks it ready on success.
 * Failures are reported through the error callback instead of being thrown
 * from here.
 *
 * @param modelName key into `MODEL_CONFIGS`, also passed to the native loader.
 * @param encoderSource optional override for the configured encoder source.
 * @param decoderSource optional override for the configured decoder source.
 * @param tokenizerSource optional override for the configured tokenizer source.
 * @param onDownloadProgressCallback forwarded to `ResourceFetcher.fetch`.
 */
public async load({
  modelName,
  encoderSource,
  decoderSource,
  tokenizerSource,
  onDownloadProgressCallback,
}: {
  modelName: AvailableModels;
  encoderSource?: ResourceSource;
  decoderSource?: ResourceSource;
  tokenizerSource?: ResourceSource;
  onDownloadProgressCallback?: (downloadProgress: number) => void;
}) {
  this.onErrorCallback(undefined);
  this.isReadyCallback(false);
  this.config = MODEL_CONFIGS[modelName];

  let encoderPath: ResourceSource | undefined;
  let decoderPath: ResourceSource | undefined;
  try {
    // Tokenizer loading and model download are started together and awaited jointly.
    const tokenizerLoading = this.tokenizerModule.load({
      tokenizerSource: tokenizerSource || this.config.tokenizer.source,
    });
    const modelFetching = ResourceFetcher.fetch(
      onDownloadProgressCallback,
      encoderSource || this.config.sources.encoder,
      decoderSource || this.config.sources.decoder
    );
    const settled = await Promise.all([tokenizerLoading, modelFetching]);
    const fetchedPaths = settled[1];
    encoderPath = fetchedPaths?.[0];
    decoderPath = fetchedPaths?.[1];
    if (!encoderPath || !decoderPath) {
      throw new Error('Download interrupted.');
    }
  } catch (e) {
    this.onErrorCallback(e);
    return;
  }

  // The underlying native class is instantiated based on the name of the model.
  // The multilingual Whisper uses the same native class as the regular one; the
  // distinction only matters in TS (start tokens and such), so the name is
  // mapped back to the plain Whisper one before calling into native code.
  const nativeModelName =
    modelName === 'whisperMultilingual' ? AvailableModels.WHISPER : modelName;

  try {
    this.speechToTextNativeModule = await global.loadSpeechToText(
      encoderPath,
      decoderPath,
      nativeModelName
    );
    this.isReadyCallback(true);
  } catch (e) {
    this.onErrorCallback(e);
  }
}
145
-
146
/**
 * Sets the chunking parameters used for transcription.
 *
 * A preset from `MODES` is applied first (when given); explicit non-zero
 * `windowSize` / `overlapSeconds` values then override it. Values are passed
 * in seconds and stored multiplied by `SECOND`.
 *
 * If `windowSize + 2 * overlapSeconds` exceeds 30 seconds, `windowSize` is
 * reduced so that the total is exactly 30 seconds.
 *
 * @param overlapSeconds overlap on each side of a window, in seconds.
 * @param windowSize length of a window, in seconds.
 * @param streamingConfig name of a preset in `MODES`.
 */
public configureStreaming(
  overlapSeconds?: number,
  windowSize?: number,
  streamingConfig?: keyof typeof MODES
) {
  if (streamingConfig) {
    this.windowSize = MODES[streamingConfig].windowSize * SECOND;
    this.overlapSeconds = MODES[streamingConfig].overlapSeconds * SECOND;
  }
  if (streamingConfig && (windowSize || overlapSeconds)) {
    Logger.warn(
      `windowSize and overlapSeconds overrides values from streamingConfig ${streamingConfig}.`
    );
  }
  // A missing or zero override keeps the value that is already set.
  this.windowSize = (windowSize || 0) * SECOND || this.windowSize;
  this.overlapSeconds = (overlapSeconds || 0) * SECOND || this.overlapSeconds;

  const maxChunkLength = 30 * SECOND;
  const requestedChunkLength = this.windowSize + 2 * this.overlapSeconds;
  // Exactly 30 seconds is valid, so only a strictly larger total is rejected.
  if (requestedChunkLength > maxChunkLength) {
    const clampedWindowSize = maxChunkLength - 2 * this.overlapSeconds;
    // Values are reported in seconds so they are comparable with the stated limit.
    Logger.warn(
      `Invalid values for overlapSeconds and/or windowSize provided. Expected windowSize + 2 * overlapSeconds (== ${requestedChunkLength / SECOND}) <= 30. Setting windowSize to ${clampedWindowSize / SECOND}.`
    );
    // NOTE(review): an overlap of 15 seconds or more makes this non-positive;
    // callers are presumably expected to pass a smaller overlap - confirm.
    this.windowSize = clampedWindowSize;
  }
}
169
-
170
/**
 * Splits `this.waveform` into windows of `this.windowSize` samples, each
 * extended by `this.overlapSeconds` samples on both sides (clamped to the
 * waveform bounds), and appends them to `this.chunks`.
 */
private chunkWaveform() {
  const totalLength = this.waveform.length;
  this.numOfChunks = Math.ceil(totalLength / this.windowSize);
  for (let index = 0; index < this.numOfChunks; index++) {
    const start = Math.max(this.windowSize * index - this.overlapSeconds, 0);
    const end = Math.min(
      this.windowSize * (index + 1) + this.overlapSeconds,
      totalLength
    );
    this.chunks.push(this.waveform.slice(start, end));
  }
}
183
-
184
/**
 * Clears all per-transcription buffers, then notifies the transcription
 * callback with an empty sequence and clears the reported error.
 */
private resetState() {
  this.waveform = [];
  this.chunks = [];
  this.seqs = [];
  this.prevSeq = [];
  this.sequence = [];

  this.decodedTranscribeCallback([]);
  this.onErrorCallback(undefined);
}
193
-
194
/**
 * Returns the number of samples the next streaming chunk needs.
 * The first chunk (nothing decoded yet) carries the overlap on one side
 * only; every later chunk carries it on both sides.
 */
private expectedChunkLength() {
  const isFirstChunk = this.seqs.length === 0;
  const overlapSides = isFirstChunk ? 1 : 2;
  return this.windowSize + overlapSides * this.overlapSeconds;
}
200
-
201
/**
 * Builds the token ids every decoded sequence starts with.
 *
 * Without `audioLanguage` only the configured BOS token is used. With a
 * language the result is [BOS, <|lang|>, <|transcribe|>, <|notimestamps|>];
 * the last one is included because timestamps are not supported yet.
 *
 * @param audioLanguage language code inserted into the `<|...|>` language token.
 */
private async getStartingTokenIds(audioLanguage?: string): Promise<number[]> {
  if (!audioLanguage) {
    return [this.config.tokenizer.bos];
  }
  // FIXME: BOS should be resolved through the tokenizer as well and removed from the config
  const specialTokens = [
    `<|${audioLanguage}|>`,
    '<|transcribe|>',
    '<|notimestamps|>',
  ];
  const specialTokenIds: number[] = [];
  // Resolved one at a time, in order.
  for (const token of specialTokens) {
    specialTokenIds.push(await this.tokenizerModule.tokenToId(token));
  }
  return [this.config.tokenizer.bos, ...specialTokenIds];
}
225
-
226
/**
 * Encodes one audio chunk and decodes tokens until the configured EOS token
 * is produced.
 *
 * While decoding, tokens of the previously decoded sequence are pushed onto
 * `this.prevSeq` and reported through the transcription callback, so the
 * user sees intermediate output before the new chunk is merged.
 *
 * @param chunk audio samples of the chunk.
 * @param audioLanguage language used to build the starting tokens.
 * @returns the decoded token ids; `[]` when encoding fails, or the tokens
 *          decoded so far followed by EOS when decoding fails.
 */
private async decodeChunk(
  chunk: number[],
  audioLanguage?: SpeechToTextLanguage
): Promise<number[]> {
  const tokens = await this.getStartingTokenIds(audioLanguage);
  let previewIdx = 0;
  this.prevSeq = this.sequence.slice();

  try {
    await this.encode(new Float32Array(chunk));
  } catch (error) {
    this.onErrorCallback(new Error(getError(error) + ' encoding error'));
    return [];
  }

  let newestToken = tokens.at(-1) as number;
  while (newestToken !== this.config.tokenizer.eos) {
    try {
      newestToken = await this.decode(tokens);
    } catch (error) {
      this.onErrorCallback(new Error(getError(error) + ' decoding error'));
      return [...tokens, this.config.tokenizer.eos];
    }
    tokens.push(newestToken);

    const previousSeq = this.seqs.at(-1);
    if (
      previousSeq !== undefined &&
      tokens.length < previousSeq.length &&
      tokens.length % 3 !== 0
    ) {
      // Reveal one more token of the previous sequence as a preview.
      this.prevSeq.push(previousSeq[previewIdx++]!);
      this.decodedTranscribeCallback(this.prevSeq);
    }
  }
  return tokens;
}
259
-
260
/**
 * Appends the part of the second-to-last sequence that precedes its overlap
 * with the last sequence to `this.sequence`, reports the result through the
 * transcription callback and returns a copy of it.
 *
 * @param seqs decoded sequences; must contain at least two entries.
 */
private async handleOverlaps(seqs: number[][]): Promise<number[]> {
  const olderSeq = seqs.at(-2)!;
  const newerSeq = seqs.at(-1)!;
  const maxInd = longCommonInfPref(olderSeq, newerSeq, HAMMING_DIST_THRESHOLD);
  this.sequence = this.sequence.concat(olderSeq.slice(0, maxInd));
  this.decodedTranscribeCallback(this.sequence);
  return this.sequence.slice();
}
270
-
271
/**
 * Drops the first `numOfTokensToTrim` tokens of the newest sequence, but
 * only when that sequence still starts with the configured BOS token.
 */
private trimLeft(numOfTokensToTrim: number) {
  const lastIdx = this.seqs.length - 1;
  const newestSeq = this.seqs[lastIdx]!;
  if (newestSeq[0] !== this.config.tokenizer.bos) {
    return;
  }
  this.seqs[lastIdx] = newestSeq.slice(numOfTokensToTrim);
}
277
-
278
/**
 * Drops the last `numOfTokensToTrim` tokens of the second-to-last sequence,
 * but only when that sequence still ends with the configured EOS token.
 * Does nothing when there is no such sequence or nothing to trim.
 */
private trimRight(numOfTokensToTrim: number) {
  const idx = this.seqs.length - 2;
  const target = this.seqs[idx];
  // `slice(0, -0)` equals `slice(0, 0)` and would wipe the whole sequence,
  // so a zero (or negative) count is treated as "nothing to trim".
  if (!target || numOfTokensToTrim <= 0) {
    return;
  }
  if (target.at(-1) === this.config.tokenizer.eos) {
    const keep = Math.max(target.length - numOfTokensToTrim, 0);
    this.seqs[idx] = target.slice(0, keep);
  }
}
284
-
285
/**
 * Removes the starting tokens plus `NUM_TOKENS_TO_TRIM` extra tokens from
 * the two newest sequences. This runs after every pushed sequence except the
 * first, so only the left side of the newest sequence and the right side of
 * the one before it still need trimming.
 */
private async trimSequences(audioLanguage?: string) {
  const startingTokenIds = await this.getStartingTokenIds(audioLanguage);
  const tokensToTrim = startingTokenIds.length + NUM_TOKENS_TO_TRIM;
  this.trimLeft(tokensToTrim);
  this.trimRight(tokensToTrim);
}
293
-
294
/**
 * If the last chunk is shorter than 5 seconds and fits together with the
 * second-to-last one in under 30 seconds, merges the two into a single chunk
 * to improve transcription quality.
 */
private validateAndFixLastChunk() {
  if (this.chunks.length < 2) return;

  const lastChunk = this.chunks.at(-1)!;
  const secondToLastChunk = this.chunks.at(-2)!;
  const lastChunkLength = lastChunk.length / SECOND;
  const secondToLastChunkLength = secondToLastChunk.length / SECOND;
  if (!(lastChunkLength < 5 && secondToLastChunkLength + lastChunkLength < 30)) {
    return;
  }

  // Two neighbouring chunks normally share 2 * overlap samples. When the
  // waveform ends inside the trailing overlap of the second-to-last chunk,
  // the last chunk lies entirely inside it and the shared part is only as
  // long as the last chunk. Removing a fixed 2 * overlap would then cut out
  // audio that is in neither chunk, so the shared length is capped.
  const sharedLength = Math.min(
    2 * this.overlapSeconds,
    lastChunk.length,
    secondToLastChunk.length
  );
  // An explicit end index also avoids `slice(0, -0)`, which would drop the
  // whole chunk when the overlap is 0.
  const merged = [
    ...secondToLastChunk.slice(0, secondToLastChunk.length - sharedLength),
    ...lastChunk,
  ];
  this.chunks = [...this.chunks.slice(0, -2), merged];
}
308
-
309
/**
 * Decodes token ids into text with the tokenizer module. On failure the
 * error is reported through the error callback and an empty string is
 * returned.
 */
private async tokenIdsToText(tokenIds: number[]): Promise<string> {
  let text = '';
  try {
    text = await this.tokenizerModule.decode(tokenIds, true);
  } catch (e) {
    this.onErrorCallback(
      new Error(`An error has occurred when decoding the token ids: ${e}`)
    );
  }
  return text;
}
319
-
320
/**
 * Transcribes a complete waveform.
 *
 * The waveform is split into overlapping chunks, each chunk is decoded, and
 * the decoded sequences are stitched together at their overlaps. Partial
 * results are reported through the transcription callback as they appear.
 *
 * @param waveform audio samples to transcribe.
 * @param audioLanguage must be given exactly when the loaded model is multilingual.
 * @returns the decoded text, or `''` when a precondition fails (the failure
 *          is reported through the error callback).
 */
public async transcribe(
  waveform: number[],
  audioLanguage?: SpeechToTextLanguage
): Promise<string> {
  try {
    if (!this.isReady) throw Error(getError(ETError.ModuleNotLoaded));
    if (this.isGenerating || this.streaming)
      throw Error(getError(ETError.ModelGenerating));
    if (!!audioLanguage !== this.config.isMultilingual)
      throw new Error(getError(ETError.MultilingualConfiguration));
  } catch (e) {
    this.onErrorCallback(e);
    return '';
  }

  this.isGeneratingCallback(true);
  try {
    // resetState also clears any previously reported error.
    this.resetState();
    this.waveform = waveform;
    this.chunkWaveform();
    this.validateAndFixLastChunk();

    for (let chunkId = 0; chunkId < this.chunks.length; chunkId++) {
      const seq = await this.decodeChunk(this.chunks[chunkId]!, audioLanguage);
      // whole audio is inside one chunk, no processing required
      if (this.chunks.length === 1) {
        this.sequence = seq;
        this.decodedTranscribeCallback(seq);
        break;
      }
      this.seqs.push(seq);

      if (this.seqs.length < 2) continue;

      // Remove starting tokenIds and some additional ones
      await this.trimSequences(audioLanguage);

      this.prevSeq = await this.handleOverlaps(this.seqs);

      // last sequence processed
      // overlaps are already handled, so just append the last seq
      if (this.seqs.length === this.chunks.length) {
        this.sequence = [...this.sequence, ...this.seqs.at(-1)!];
        this.decodedTranscribeCallback(this.sequence);
        this.prevSeq = this.sequence;
      }
    }
    return await this.tokenIdsToText(this.sequence);
  } finally {
    // Always leave the generating state, even when something above throws;
    // otherwise every later call would be rejected as "model generating".
    this.isGeneratingCallback(false);
  }
}
374
-
375
/**
 * Incremental transcription driven by START / DATA / STOP actions.
 *
 * START resets the state and begins a stream. Incoming audio is buffered
 * and decoded whenever a full chunk is available. STOP decodes whatever is
 * left in the buffer, padded with audio from the last decoded chunk, and
 * ends the stream.
 *
 * @param streamAction which phase of the stream this call represents.
 * @param waveform new audio samples; required for DATA.
 * @param audioLanguage must be given exactly when the loaded model is multilingual.
 * @returns text decoded from the committed sequence so far, or `''` when a
 *          precondition fails (the failure is reported through the error callback).
 */
public async streamingTranscribe(
  streamAction: STREAMING_ACTION,
  waveform?: number[],
  audioLanguage?: SpeechToTextLanguage
): Promise<string> {
  try {
    if (!this.isReady) throw Error(getError(ETError.ModuleNotLoaded));
    if (!!audioLanguage !== this.config.isMultilingual)
      throw new Error(getError(ETError.MultilingualConfiguration));

    // A stream cannot start while another stream or a transcription is running.
    if (
      streamAction === STREAMING_ACTION.START &&
      (this.streaming || this.isGenerating)
    )
      throw Error(getError(ETError.ModelGenerating));
    if (
      (streamAction === STREAMING_ACTION.DATA ||
        streamAction === STREAMING_ACTION.STOP) &&
      !this.streaming
    )
      throw Error(getError(ETError.StreamingNotStarted));
    if (streamAction === STREAMING_ACTION.DATA && !waveform)
      throw new Error(getError(ETError.MissingDataChunk));
  } catch (e) {
    this.onErrorCallback(e);
    return '';
  }

  if (streamAction === STREAMING_ACTION.START) {
    this.resetState();
    this.streaming = true;
    this.isGeneratingCallback(true);
  }

  this.waveform = [...this.waveform, ...(waveform || [])];

  // while buffer has at least required size get chunk and decode
  while (this.waveform.length >= this.expectedChunkLength()) {
    const isFirstChunk = this.seqs.length === 0;
    const chunk = this.waveform.slice(0, this.expectedChunkLength());
    this.chunks = [chunk]; //save last chunk for STREAMING_ACTION.STOP
    // The first chunk has no leading overlap, so the buffer must keep
    // `overlapSeconds` extra samples to serve as the next chunk's left overlap.
    this.waveform = this.waveform.slice(
      this.windowSize - (isFirstChunk ? this.overlapSeconds : 0)
    );
    const seq = await this.decodeChunk(chunk, audioLanguage);
    this.seqs.push(seq);

    if (this.seqs.length < 2) continue;

    await this.trimSequences(audioLanguage);
    await this.handleOverlaps(this.seqs);
  }

  // got final package, process all remaining waveform data
  // since we run the loop above the waveform has at most one chunk in it
  if (streamAction === STREAMING_ACTION.STOP) {
    // pad remaining waveform data with previous chunk to this.windowSize + 2 * this.overlapSeconds
    let chunk = this.waveform;
    if (this.chunks.length) {
      // The buffer starts where the last decoded chunk's window ends. For the
      // very first chunk that point is `overlapSeconds` earlier (see the loop
      // above), so less of the chunk is reused; otherwise the same audio would
      // appear twice in the padded chunk.
      const reusedLength =
        this.seqs.length === 1
          ? Math.max(this.windowSize - this.overlapSeconds, 0)
          : this.windowSize;
      chunk = [
        ...this.chunks[0]!.slice(0, reusedLength),
        ...this.waveform,
      ].slice(-this.windowSize - 2 * this.overlapSeconds);
    }

    this.waveform = [];
    const seq = await this.decodeChunk(chunk, audioLanguage);
    this.seqs.push(seq);

    if (this.seqs.length === 1) {
      this.sequence = this.seqs[0]!;
    } else {
      await this.trimSequences(audioLanguage);
      await this.handleOverlaps(this.seqs);
      this.sequence = [...this.sequence, ...this.seqs.at(-1)!];
    }
    this.decodedTranscribeCallback(this.sequence);
    this.isGeneratingCallback(false);
    this.streaming = false;
  }

  const decodedText = await this.tokenIdsToText(this.sequence);

  return decodedText;
}
463
-
464
/**
 * Passes the waveform to the native module's `encode` and returns its result.
 */
public async encode(waveform: Float32Array): Promise<null> {
  const encoded = await this.speechToTextNativeModule.encode(waveform);
  return encoded;
}
467
-
468
/**
 * Passes the token sequence to the native module's `decode` and returns the
 * token it produces.
 */
public async decode(seq: number[]): Promise<number> {
  const nextToken = await this.speechToTextNativeModule.decode(seq);
  return nextToken;
}
471
- }