react-native-executorch 0.5.3 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. package/android/src/main/cpp/CMakeLists.txt +2 -1
  2. package/common/rnexecutorch/data_processing/Numerical.cpp +27 -19
  3. package/common/rnexecutorch/data_processing/Numerical.h +53 -4
  4. package/common/rnexecutorch/data_processing/dsp.cpp +1 -1
  5. package/common/rnexecutorch/data_processing/dsp.h +1 -1
  6. package/common/rnexecutorch/data_processing/gzip.cpp +47 -0
  7. package/common/rnexecutorch/data_processing/gzip.h +7 -0
  8. package/common/rnexecutorch/host_objects/ModelHostObject.h +24 -0
  9. package/common/rnexecutorch/metaprogramming/TypeConcepts.h +21 -1
  10. package/common/rnexecutorch/models/BaseModel.cpp +3 -2
  11. package/common/rnexecutorch/models/BaseModel.h +3 -2
  12. package/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +100 -39
  13. package/common/rnexecutorch/models/speech_to_text/SpeechToText.h +43 -21
  14. package/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp +307 -0
  15. package/common/rnexecutorch/models/speech_to_text/asr/ASR.h +61 -0
  16. package/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.cpp +80 -0
  17. package/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.h +27 -0
  18. package/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp +96 -0
  19. package/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h +36 -0
  20. package/common/rnexecutorch/models/speech_to_text/types/DecodingOptions.h +15 -0
  21. package/common/rnexecutorch/models/speech_to_text/types/GenerationResult.h +12 -0
  22. package/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h +12 -0
  23. package/common/rnexecutorch/models/speech_to_text/types/Segment.h +14 -0
  24. package/common/rnexecutorch/models/speech_to_text/types/Word.h +13 -0
  25. package/lib/module/modules/natural_language_processing/SpeechToTextModule.js +75 -53
  26. package/lib/module/modules/natural_language_processing/SpeechToTextModule.js.map +1 -1
  27. package/lib/typescript/hooks/natural_language_processing/useSpeechToText.d.ts +5 -5
  28. package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts +7 -12
  29. package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts.map +1 -1
  30. package/lib/typescript/types/stt.d.ts +0 -9
  31. package/lib/typescript/types/stt.d.ts.map +1 -1
  32. package/package.json +1 -1
  33. package/react-native-executorch.podspec +2 -0
  34. package/src/modules/natural_language_processing/SpeechToTextModule.ts +118 -54
  35. package/src/types/stt.ts +0 -12
  36. package/common/rnexecutorch/models/EncoderDecoderBase.cpp +0 -21
  37. package/common/rnexecutorch/models/EncoderDecoderBase.h +0 -31
  38. package/common/rnexecutorch/models/speech_to_text/SpeechToTextStrategy.h +0 -27
  39. package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.cpp +0 -50
  40. package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.h +0 -25
  41. package/lib/module/utils/SpeechToTextModule/ASR.js +0 -191
  42. package/lib/module/utils/SpeechToTextModule/ASR.js.map +0 -1
  43. package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js +0 -73
  44. package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js.map +0 -1
  45. package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js +0 -56
  46. package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js.map +0 -1
  47. package/lib/module/utils/stt.js +0 -22
  48. package/lib/module/utils/stt.js.map +0 -1
  49. package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts +0 -27
  50. package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts.map +0 -1
  51. package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts +0 -23
  52. package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts.map +0 -1
  53. package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts +0 -13
  54. package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts.map +0 -1
  55. package/lib/typescript/utils/stt.d.ts +0 -2
  56. package/lib/typescript/utils/stt.d.ts.map +0 -1
  57. package/src/utils/SpeechToTextModule/ASR.ts +0 -303
  58. package/src/utils/SpeechToTextModule/OnlineProcessor.ts +0 -87
  59. package/src/utils/SpeechToTextModule/hypothesisBuffer.ts +0 -79
  60. package/src/utils/stt.ts +0 -28
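The changed-files list shows the streaming ASR pipeline moving from TypeScript (src/utils/SpeechToTextModule/*) into C++ (speech_to_text/asr and speech_to_text/stream), with the JS SpeechToTextModule now delegating to a JSI host object. For orientation, a rough TypeScript sketch of that native surface as it can be inferred from the calls made in the diffs below; the interface name and the exact return representations are assumptions, not declarations shipped by the package:

    // Hypothetical shape of the object returned by global.loadSpeechToText(...),
    // reconstructed only from how SpeechToTextModule calls it in this diff.
    interface NativeSpeechToText {
      // The JS layer wraps these results in Float32Array, so the native side
      // presumably returns an ArrayBuffer or array-like of floats.
      encode(waveform: Float32Array): Promise<ArrayBuffer | number[]>;
      decode(tokens: Int32Array, encoderOutput: Float32Array): Promise<ArrayBuffer | number[]>;
      transcribe(waveform: Float32Array, language: string): Promise<string>;
      // Delivers partial results via the callback until isDone is true.
      stream(
        callback: (committed: string, nonCommitted: string, isDone: boolean) => void,
        language: string
      ): Promise<void>;
      streamInsert(waveform: Float32Array): Promise<void>;
      streamStop(): Promise<void>;
    }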
package/lib/module/modules/natural_language_processing/SpeechToTextModule.js CHANGED
@@ -1,71 +1,98 @@
  "use strict";

- import { ASR } from '../../utils/SpeechToTextModule/ASR';
- import { OnlineASRProcessor } from '../../utils/SpeechToTextModule/OnlineProcessor';
+ import { Logger } from '../../common/Logger';
+ import { ResourceFetcher } from '../../utils/ResourceFetcher';
  export class SpeechToTextModule {
- asr = new ASR();
- processor = new OnlineASRProcessor(this.asr);
- isStreaming = false;
- readyToProcess = false;
- minAudioSamples = 1 * 16000; // 1 second
-
  async load(model, onDownloadProgressCallback = () => {}) {
  this.modelConfig = model;
- return this.asr.load(model, onDownloadProgressCallback);
+ const tokenizerLoadPromise = ResourceFetcher.fetch(undefined, model.tokenizerSource);
+ const encoderDecoderPromise = ResourceFetcher.fetch(onDownloadProgressCallback, model.encoderSource, model.decoderSource);
+ const [tokenizerSources, encoderDecoderResults] = await Promise.all([tokenizerLoadPromise, encoderDecoderPromise]);
+ const encoderSource = encoderDecoderResults?.[0];
+ const decoderSource = encoderDecoderResults?.[1];
+ if (!encoderSource || !decoderSource || !tokenizerSources) {
+ throw new Error('Download interrupted.');
+ }
+ this.nativeModule = await global.loadSpeechToText(encoderSource, decoderSource, tokenizerSources[0]);
  }
  async encode(waveform) {
- return this.asr.encode(waveform);
+ if (Array.isArray(waveform)) {
+ Logger.info('Passing waveform as number[] is deprecated, use Float32Array instead');
+ waveform = new Float32Array(waveform);
+ }
+ return new Float32Array(await this.nativeModule.encode(waveform));
  }
- async decode(tokens) {
- return this.asr.decode(tokens);
+ async decode(tokens, encoderOutput) {
+ if (Array.isArray(tokens)) {
+ Logger.info('Passing tokens as number[] is deprecated, use Int32Array instead');
+ tokens = new Int32Array(tokens);
+ }
+ if (Array.isArray(encoderOutput)) {
+ Logger.info('Passing encoderOutput as number[] is deprecated, use Float32Array instead');
+ encoderOutput = new Float32Array(encoderOutput);
+ }
+ return new Float32Array(await this.nativeModule.decode(tokens, encoderOutput));
  }
  async transcribe(waveform, options = {}) {
  this.validateOptions(options);
- const segments = await this.asr.transcribe(waveform, options);
- let transcription = '';
- for (const segment of segments) {
- for (const word of segment.words) {
- transcription += ` ${word.word}`;
- }
+ if (Array.isArray(waveform)) {
+ Logger.info('Passing waveform as number[] is deprecated, use Float32Array instead');
+ waveform = new Float32Array(waveform);
  }
- return transcription.trim();
+ return this.nativeModule.transcribe(waveform, options.language || '');
  }
  async *stream(options = {}) {
- if (this.isStreaming) {
- throw new Error('Streaming is already in progress');
- }
  this.validateOptions(options);
- this.resetStreamState();
- this.isStreaming = true;
- while (this.isStreaming) {
- if (!this.readyToProcess || this.processor.audioBuffer.length < this.minAudioSamples) {
- await new Promise(resolve => setTimeout(resolve, 100));
+ const queue = [];
+ let waiter = null;
+ let finished = false;
+ let error;
+ const wake = () => {
+ waiter?.();
+ waiter = null;
+ };
+ (async () => {
+ try {
+ await this.nativeModule.stream((committed, nonCommitted, isDone) => {
+ queue.push({
+ committed,
+ nonCommitted
+ });
+ if (isDone) {
+ finished = true;
+ }
+ wake();
+ }, options.language || '');
+ finished = true;
+ wake();
+ } catch (e) {
+ error = e;
+ finished = true;
+ wake();
+ }
+ })();
+ while (true) {
+ if (queue.length > 0) {
+ yield queue.shift();
+ if (finished && queue.length === 0) {
+ return;
+ }
  continue;
  }
- const {
- committed,
- nonCommitted
- } = await this.processor.processIter(options);
- yield {
- committed,
- nonCommitted
- };
- this.readyToProcess = false;
+ if (error) throw error;
+ if (finished) return;
+ await new Promise(r => waiter = r);
  }
- const {
- committed
- } = await this.processor.finish();
- yield {
- committed,
- nonCommitted: ''
- };
  }
- streamStop() {
- this.isStreaming = false;
+ async streamInsert(waveform) {
+ if (Array.isArray(waveform)) {
+ Logger.info('Passing waveform as number[] is deprecated, use Float32Array instead');
+ waveform = new Float32Array(waveform);
+ }
+ return this.nativeModule.streamInsert(waveform);
  }
- streamInsert(waveform) {
- this.processor.insertAudioChunk(waveform);
- this.readyToProcess = true;
+ async streamStop() {
+ return this.nativeModule.streamStop();
  }
  validateOptions(options) {
  if (!this.modelConfig.isMultilingual && options.language) {
@@ -75,10 +102,5 @@ export class SpeechToTextModule {
  throw new Error('Model is multilingual, provide a language');
  }
  }
- resetStreamState() {
- this.isStreaming = false;
- this.readyToProcess = false;
- this.processor = new OnlineASRProcessor(this.asr);
- }
  }
  //# sourceMappingURL=SpeechToTextModule.js.map
package/lib/module/modules/natural_language_processing/SpeechToTextModule.js.map CHANGED
@@ -1 +1 @@
- {"version":3,"names":["ASR","OnlineASRProcessor","SpeechToTextModule","asr","processor","isStreaming","readyToProcess","minAudioSamples","load","model","onDownloadProgressCallback","modelConfig","encode","waveform","decode","tokens","transcribe","options","validateOptions","segments","transcription","segment","word","words","trim","stream","Error","resetStreamState","audioBuffer","length","Promise","resolve","setTimeout","committed","nonCommitted","processIter","finish","streamStop","streamInsert","insertAudioChunk","isMultilingual","language"],"sourceRoot":"../../../../src","sources":["modules/natural_language_processing/SpeechToTextModule.ts"],"mappings":";;AACA,SAASA,GAAG,QAAQ,oCAAoC;AACxD,SAASC,kBAAkB,QAAQ,gDAAgD;AAEnF,OAAO,MAAMC,kBAAkB,CAAC;EAEtBC,GAAG,GAAQ,IAAIH,GAAG,CAAC,CAAC;EAEpBI,SAAS,GAAuB,IAAIH,kBAAkB,CAAC,IAAI,CAACE,GAAG,CAAC;EAChEE,WAAW,GAAG,KAAK;EACnBC,cAAc,GAAG,KAAK;EACtBC,eAAe,GAAW,CAAC,GAAG,KAAK,CAAC,CAAC;;EAE7C,MAAaC,IAAIA,CACfC,KAA8B,EAC9BC,0BAAsD,GAAGA,CAAA,KAAM,CAAC,CAAC,EACjE;IACA,IAAI,CAACC,WAAW,GAAGF,KAAK;IACxB,OAAO,IAAI,CAACN,GAAG,CAACK,IAAI,CAACC,KAAK,EAAEC,0BAA0B,CAAC;EACzD;EAEA,MAAaE,MAAMA,CAACC,QAAsB,EAAiB;IACzD,OAAO,IAAI,CAACV,GAAG,CAACS,MAAM,CAACC,QAAQ,CAAC;EAClC;EAEA,MAAaC,MAAMA,CAACC,MAAgB,EAAyB;IAC3D,OAAO,IAAI,CAACZ,GAAG,CAACW,MAAM,CAACC,MAAM,CAAC;EAChC;EAEA,MAAaC,UAAUA,CACrBH,QAAkB,EAClBI,OAAwB,GAAG,CAAC,CAAC,EACZ;IACjB,IAAI,CAACC,eAAe,CAACD,OAAO,CAAC;IAE7B,MAAME,QAAQ,GAAG,MAAM,IAAI,CAAChB,GAAG,CAACa,UAAU,CAACH,QAAQ,EAAEI,OAAO,CAAC;IAE7D,IAAIG,aAAa,GAAG,EAAE;IACtB,KAAK,MAAMC,OAAO,IAAIF,QAAQ,EAAE;MAC9B,KAAK,MAAMG,IAAI,IAAID,OAAO,CAACE,KAAK,EAAE;QAChCH,aAAa,IAAI,IAAIE,IAAI,CAACA,IAAI,EAAE;MAClC;IACF;IAEA,OAAOF,aAAa,CAACI,IAAI,CAAC,CAAC;EAC7B;EAEA,OAAcC,MAAMA,CAACR,OAAwB,GAAG,CAAC,CAAC,EAAE;IAClD,IAAI,IAAI,CAACZ,WAAW,EAAE;MACpB,MAAM,IAAIqB,KAAK,CAAC,kCAAkC,CAAC;IACrD;IACA,IAAI,CAACR,eAAe,CAACD,OAAO,CAAC;IAC7B,IAAI,CAACU,gBAAgB,CAAC,CAAC;IAEvB,IAAI,CAACtB,WAAW,GAAG,IAAI;IACvB,OAAO,IAAI,CAACA,WAAW,EAAE;MACvB,IACE,CAAC,IAAI,CAACC,cAAc,IACpB,IAAI,CAACF,SAAS,CAACwB,WAAW,CAACC,MAAM,GAAG,IAAI,CAACtB,eAAe,EACxD;QACA,MAAM,IAAIuB,OAAO,CAAEC,OAAO,IAAKC,UAAU,CAACD,OAAO,EAAE,GAAG,CAAC,CAAC;QACxD;MACF;MAEA,MAAM;QAAEE,SAAS;QAAEC;MAAa,CAAC,GAC/B,MAAM,IAAI,CAAC9B,SAAS,CAAC+B,WAAW,CAAClB,OAAO,CAAC;MAC3C,MAAM;QAAEgB,SAAS;QAAEC;MAAa,CAAC;MACjC,IAAI,CAAC5B,cAAc,GAAG,KAAK;IAC7B;IAEA,MAAM;MAAE2B;IAAU,CAAC,GAAG,MAAM,IAAI,CAAC7B,SAAS,CAACgC,MAAM,CAAC,CAAC;IACnD,MAAM;MAAEH,SAAS;MAAEC,YAAY,EAAE;IAAG,CAAC;EACvC;EAEOG,UAAUA,CAAA,EAAG;IAClB,IAAI,CAAChC,WAAW,GAAG,KAAK;EAC1B;EAEOiC,YAAYA,CAACzB,QAAkB,EAAE;IACtC,IAAI,CAACT,SAAS,CAACmC,gBAAgB,CAAC1B,QAAQ,CAAC;IACzC,IAAI,CAACP,cAAc,GAAG,IAAI;EAC5B;EAEQY,eAAeA,CAACD,OAAwB,EAAE;IAChD,IAAI,CAAC,IAAI,CAACN,WAAW,CAAC6B,cAAc,IAAIvB,OAAO,CAACwB,QAAQ,EAAE;MACxD,MAAM,IAAIf,KAAK,CAAC,gDAAgD,CAAC;IACnE;IACA,IAAI,IAAI,CAACf,WAAW,CAAC6B,cAAc,IAAI,CAACvB,OAAO,CAACwB,QAAQ,EAAE;MACxD,MAAM,IAAIf,KAAK,CAAC,2CAA2C,CAAC;IAC9D;EACF;EAEQC,gBAAgBA,CAAA,EAAG;IACzB,IAAI,CAACtB,WAAW,GAAG,KAAK;IACxB,IAAI,CAACC,cAAc,GAAG,KAAK;IAC3B,IAAI,CAACF,SAAS,GAAG,IAAIH,kBAAkB,CAAC,IAAI,CAACE,GAAG,CAAC;EACnD;AACF","ignoreList":[]}
+ {"version":3,"names":["Logger","ResourceFetcher","SpeechToTextModule","load","model","onDownloadProgressCallback","modelConfig","tokenizerLoadPromise","fetch","undefined","tokenizerSource","encoderDecoderPromise","encoderSource","decoderSource","tokenizerSources","encoderDecoderResults","Promise","all","Error","nativeModule","global","loadSpeechToText","encode","waveform","Array","isArray","info","Float32Array","decode","tokens","encoderOutput","Int32Array","transcribe","options","validateOptions","language","stream","queue","waiter","finished","error","wake","committed","nonCommitted","isDone","push","e","length","shift","r","streamInsert","streamStop","isMultilingual"],"sourceRoot":"../../../../src","sources":["modules/natural_language_processing/SpeechToTextModule.ts"],"mappings":";;AAAA,SAASA,MAAM,QAAQ,qBAAqB;AAE5C,SAASC,eAAe,QAAQ,6BAA6B;AAE7D,OAAO,MAAMC,kBAAkB,CAAC;EAK9B,MAAaC,IAAIA,CACfC,KAA8B,EAC9BC,0BAAsD,GAAGA,CAAA,KAAM,CAAC,CAAC,EACjE;IACA,IAAI,CAACC,WAAW,GAAGF,KAAK;IAExB,MAAMG,oBAAoB,GAAGN,eAAe,CAACO,KAAK,CAChDC,SAAS,EACTL,KAAK,CAACM,eACR,CAAC;IACD,MAAMC,qBAAqB,GAAGV,eAAe,CAACO,KAAK,CACjDH,0BAA0B,EAC1BD,KAAK,CAACQ,aAAa,EACnBR,KAAK,CAACS,aACR,CAAC;IACD,MAAM,CAACC,gBAAgB,EAAEC,qBAAqB,CAAC,GAAG,MAAMC,OAAO,CAACC,GAAG,CAAC,CAClEV,oBAAoB,EACpBI,qBAAqB,CACtB,CAAC;IACF,MAAMC,aAAa,GAAGG,qBAAqB,GAAG,CAAC,CAAC;IAChD,MAAMF,aAAa,GAAGE,qBAAqB,GAAG,CAAC,CAAC;IAChD,IAAI,CAACH,aAAa,IAAI,CAACC,aAAa,IAAI,CAACC,gBAAgB,EAAE;MACzD,MAAM,IAAII,KAAK,CAAC,uBAAuB,CAAC;IAC1C;IACA,IAAI,CAACC,YAAY,GAAG,MAAMC,MAAM,CAACC,gBAAgB,CAC/CT,aAAa,EACbC,aAAa,EACbC,gBAAgB,CAAC,CAAC,CACpB,CAAC;EACH;EAEA,MAAaQ,MAAMA,CACjBC,QAAiC,EACV;IACvB,IAAIC,KAAK,CAACC,OAAO,CAACF,QAAQ,CAAC,EAAE;MAC3BvB,MAAM,CAAC0B,IAAI,CACT,sEACF,CAAC;MACDH,QAAQ,GAAG,IAAII,YAAY,CAACJ,QAAQ,CAAC;IACvC;IACA,OAAO,IAAII,YAAY,CAAC,MAAM,IAAI,CAACR,YAAY,CAACG,MAAM,CAACC,QAAQ,CAAC,CAAC;EACnE;EAEA,MAAaK,MAAMA,CACjBC,MAA6B,EAC7BC,aAAsC,EACf;IACvB,IAAIN,KAAK,CAACC,OAAO,CAACI,MAAM,CAAC,EAAE;MACzB7B,MAAM,CAAC0B,IAAI,CACT,kEACF,CAAC;MACDG,MAAM,GAAG,IAAIE,UAAU,CAACF,MAAM,CAAC;IACjC;IACA,IAAIL,KAAK,CAACC,OAAO,CAACK,aAAa,CAAC,EAAE;MAChC9B,MAAM,CAAC0B,IAAI,CACT,2EACF,CAAC;MACDI,aAAa,GAAG,IAAIH,YAAY,CAACG,aAAa,CAAC;IACjD;IACA,OAAO,IAAIH,YAAY,CACrB,MAAM,IAAI,CAACR,YAAY,CAACS,MAAM,CAACC,MAAM,EAAEC,aAAa,CACtD,CAAC;EACH;EAEA,MAAaE,UAAUA,CACrBT,QAAiC,EACjCU,OAAwB,GAAG,CAAC,CAAC,EACZ;IACjB,IAAI,CAACC,eAAe,CAACD,OAAO,CAAC;IAE7B,IAAIT,KAAK,CAACC,OAAO,CAACF,QAAQ,CAAC,EAAE;MAC3BvB,MAAM,CAAC0B,IAAI,CACT,sEACF,CAAC;MACDH,QAAQ,GAAG,IAAII,YAAY,CAACJ,QAAQ,CAAC;IACvC;IAEA,OAAO,IAAI,CAACJ,YAAY,CAACa,UAAU,CAACT,QAAQ,EAAEU,OAAO,CAACE,QAAQ,IAAI,EAAE,CAAC;EACvE;EAEA,OAAcC,MAAMA,CAClBH,OAAwB,GAAG,CAAC,CAAC,EACgC;IAC7D,IAAI,CAACC,eAAe,CAACD,OAAO,CAAC;IAE7B,MAAMI,KAAoD,GAAG,EAAE;IAC/D,IAAIC,MAA2B,GAAG,IAAI;IACtC,IAAIC,QAAQ,GAAG,KAAK;IACpB,IAAIC,KAAc;IAElB,MAAMC,IAAI,GAAGA,CAAA,KAAM;MACjBH,MAAM,GAAG,CAAC;MACVA,MAAM,GAAG,IAAI;IACf,CAAC;IAED,CAAC,YAAY;MACX,IAAI;QACF,MAAM,IAAI,CAACnB,YAAY,CAACiB,MAAM,CAC5B,CAACM,SAAiB,EAAEC,YAAoB,EAAEC,MAAe,KAAK;UAC5DP,KAAK,CAACQ,IAAI,CAAC;YAAEH,SAAS;YAAEC;UAAa,CAAC,CAAC;UACvC,IAAIC,MAAM,EAAE;YACVL,QAAQ,GAAG,IAAI;UACjB;UACAE,IAAI,CAAC,CAAC;QACR,CAAC,EACDR,OAAO,CAACE,QAAQ,IAAI,EACtB,CAAC;QACDI,QAAQ,GAAG,IAAI;QACfE,IAAI,CAAC,CAAC;MACR,CAAC,CAAC,OAAOK,CAAC,EAAE;QACVN,KAAK,GAAGM,CAAC;QACTP,QAAQ,GAAG,IAAI;QACfE,IAAI,CAAC,CAAC;MACR;IACF,CAAC,EAAE,CAAC;IAEJ,OAAO,IAAI,EAAE;MACX,IAAIJ,KAAK,CAACU,MAAM,GAAG,CAAC,EAAE;QACpB,MAAMV,KAAK,CAACW,KAAK,CAAC,CAAE;QACpB,IAAIT,QAAQ,IAAIF,KAAK,CAACU,MAAM,KAAK,CAAC,EAAE;UAClC;QACF;QACA;MACF;MACA,IAAIP,KAAK,EAAE,MAAMA,KAAK;MACtB,IAAID,QA
AQ,EAAE;MACd,MAAM,IAAIvB,OAAO,CAAQiC,CAAC,IAAMX,MAAM,GAAGW,CAAE,CAAC;IAC9C;EACF;EAEA,MAAaC,YAAYA,CAAC3B,QAAiC,EAAiB;IAC1E,IAAIC,KAAK,CAACC,OAAO,CAACF,QAAQ,CAAC,EAAE;MAC3BvB,MAAM,CAAC0B,IAAI,CACT,sEACF,CAAC;MACDH,QAAQ,GAAG,IAAII,YAAY,CAACJ,QAAQ,CAAC;IACvC;IACA,OAAO,IAAI,CAACJ,YAAY,CAAC+B,YAAY,CAAC3B,QAAQ,CAAC;EACjD;EAEA,MAAa4B,UAAUA,CAAA,EAAkB;IACvC,OAAO,IAAI,CAAChC,YAAY,CAACgC,UAAU,CAAC,CAAC;EACvC;EAEQjB,eAAeA,CAACD,OAAwB,EAAE;IAChD,IAAI,CAAC,IAAI,CAAC3B,WAAW,CAAC8C,cAAc,IAAInB,OAAO,CAACE,QAAQ,EAAE;MACxD,MAAM,IAAIjB,KAAK,CAAC,gDAAgD,CAAC;IACnE;IACA,IAAI,IAAI,CAACZ,WAAW,CAAC8C,cAAc,IAAI,CAACnB,OAAO,CAACE,QAAQ,EAAE;MACxD,MAAM,IAAIjB,KAAK,CAAC,2CAA2C,CAAC;IAC9D;EACF;AACF","ignoreList":[]}
package/lib/typescript/hooks/natural_language_processing/useSpeechToText.d.ts CHANGED
@@ -9,11 +9,11 @@ export declare const useSpeechToText: ({ model, preventLoad, }: {
  downloadProgress: number;
  committedTranscription: string;
  nonCommittedTranscription: string;
- encode: (waveform: Float32Array<ArrayBufferLike>) => Promise<void>;
- decode: (tokens: number[]) => Promise<Float32Array<ArrayBufferLike>>;
- transcribe: (waveform: number[], options?: import("../../types/stt").DecodingOptions | undefined) => Promise<string>;
+ encode: (waveform: number[] | Float32Array<ArrayBufferLike>) => Promise<Float32Array<ArrayBufferLike>>;
+ decode: (tokens: number[] | Int32Array<ArrayBufferLike>, encoderOutput: number[] | Float32Array<ArrayBufferLike>) => Promise<Float32Array<ArrayBufferLike>>;
+ transcribe: (waveform: number[] | Float32Array<ArrayBufferLike>, options?: import("../../types/stt").DecodingOptions | undefined) => Promise<string>;
  stream: () => Promise<string>;
- streamStop: () => void;
- streamInsert: (waveform: number[]) => void;
+ streamStop: () => Promise<void>;
+ streamInsert: (waveform: number[] | Float32Array<ArrayBufferLike>) => Promise<void>;
  };
  //# sourceMappingURL=useSpeechToText.d.ts.map
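For context, a minimal sketch of consuming the updated hook surface shown above. Only the returned fields come from the typings in this diff; the component itself, the shape of the `model` prop, and the omission of audio capture and of starting the stream are illustrative assumptions:

    import React from 'react';
    import { Button, Text, View } from 'react-native';
    import { useSpeechToText } from 'react-native-executorch';

    // `model` is a SpeechToTextModelConfig (encoderSource, decoderSource,
    // tokenizerSource, isMultilingual); pass whichever config your app uses.
    export function TranscriptionView({ model }: { model: any }) {
      const {
        downloadProgress,
        committedTranscription,
        nonCommittedTranscription,
        streamStop,
      } = useSpeechToText({ model });

      // Starting the stream and feeding audio (stream() / streamInsert()) is
      // omitted here; this only renders the streaming transcription state.
      return (
        <View>
          <Text>Download progress: {downloadProgress}</Text>
          {/* Committed text is final; non-committed text may still change. */}
          <Text>{committedTranscription}</Text>
          <Text style={{ opacity: 0.5 }}>{nonCommittedTranscription}</Text>
          <Button title="Stop" onPress={() => { streamStop(); }} />
        </View>
      );
    }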
package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts CHANGED
@@ -1,22 +1,17 @@
  import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt';
  export declare class SpeechToTextModule {
+ private nativeModule;
  private modelConfig;
- private asr;
- private processor;
- private isStreaming;
- private readyToProcess;
- private minAudioSamples;
  load(model: SpeechToTextModelConfig, onDownloadProgressCallback?: (progress: number) => void): Promise<void>;
- encode(waveform: Float32Array): Promise<void>;
- decode(tokens: number[]): Promise<Float32Array>;
- transcribe(waveform: number[], options?: DecodingOptions): Promise<string>;
+ encode(waveform: Float32Array | number[]): Promise<Float32Array>;
+ decode(tokens: Int32Array | number[], encoderOutput: Float32Array | number[]): Promise<Float32Array>;
+ transcribe(waveform: Float32Array | number[], options?: DecodingOptions): Promise<string>;
  stream(options?: DecodingOptions): AsyncGenerator<{
  committed: string;
  nonCommitted: string;
- }, void, unknown>;
- streamStop(): void;
- streamInsert(waveform: number[]): void;
+ }>;
+ streamInsert(waveform: Float32Array | number[]): Promise<void>;
+ streamStop(): Promise<void>;
  private validateOptions;
- private resetStreamState;
  }
  //# sourceMappingURL=SpeechToTextModule.d.ts.map
package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"SpeechToTextModule.d.ts","sourceRoot":"","sources":["../../../../src/modules/natural_language_processing/SpeechToTextModule.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAE,uBAAuB,EAAE,MAAM,iBAAiB,CAAC;AAI3E,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,WAAW,CAA2B;IAC9C,OAAO,CAAC,GAAG,CAAkB;IAE7B,OAAO,CAAC,SAAS,CAAwD;IACzE,OAAO,CAAC,WAAW,CAAS;IAC5B,OAAO,CAAC,cAAc,CAAS;IAC/B,OAAO,CAAC,eAAe,CAAqB;IAE/B,IAAI,CACf,KAAK,EAAE,uBAAuB,EAC9B,0BAA0B,GAAE,CAAC,QAAQ,EAAE,MAAM,KAAK,IAAe;IAMtD,MAAM,CAAC,QAAQ,EAAE,YAAY,GAAG,OAAO,CAAC,IAAI,CAAC;IAI7C,MAAM,CAAC,MAAM,EAAE,MAAM,EAAE,GAAG,OAAO,CAAC,YAAY,CAAC;IAI/C,UAAU,CACrB,QAAQ,EAAE,MAAM,EAAE,EAClB,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,MAAM,CAAC;IAeJ,MAAM,CAAC,OAAO,GAAE,eAAoB;;;;IA2B3C,UAAU;IAIV,YAAY,CAAC,QAAQ,EAAE,MAAM,EAAE;IAKtC,OAAO,CAAC,eAAe;IASvB,OAAO,CAAC,gBAAgB;CAKzB"}
+ {"version":3,"file":"SpeechToTextModule.d.ts","sourceRoot":"","sources":["../../../../src/modules/natural_language_processing/SpeechToTextModule.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,eAAe,EAAE,uBAAuB,EAAE,MAAM,iBAAiB,CAAC;AAG3E,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,YAAY,CAAM;IAE1B,OAAO,CAAC,WAAW,CAA2B;IAEjC,IAAI,CACf,KAAK,EAAE,uBAAuB,EAC9B,0BAA0B,GAAE,CAAC,QAAQ,EAAE,MAAM,KAAK,IAAe;IA6BtD,MAAM,CACjB,QAAQ,EAAE,YAAY,GAAG,MAAM,EAAE,GAChC,OAAO,CAAC,YAAY,CAAC;IAUX,MAAM,CACjB,MAAM,EAAE,UAAU,GAAG,MAAM,EAAE,EAC7B,aAAa,EAAE,YAAY,GAAG,MAAM,EAAE,GACrC,OAAO,CAAC,YAAY,CAAC;IAkBX,UAAU,CACrB,QAAQ,EAAE,YAAY,GAAG,MAAM,EAAE,EACjC,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,MAAM,CAAC;IAaJ,MAAM,CAClB,OAAO,GAAE,eAAoB,GAC5B,cAAc,CAAC;QAAE,SAAS,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAA;KAAE,CAAC;IAgDjD,YAAY,CAAC,QAAQ,EAAE,YAAY,GAAG,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAU9D,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAIxC,OAAO,CAAC,eAAe;CAQxB"}
package/lib/typescript/types/stt.d.ts CHANGED
@@ -1,13 +1,4 @@
  import { ResourceSource } from './common';
- export type WordTuple = [number, number, string];
- export interface WordObject {
- start: number;
- end: number;
- word: string;
- }
- export interface Segment {
- words: WordObject[];
- }
  export type SpeechToTextLanguage = 'af' | 'sq' | 'ar' | 'hy' | 'az' | 'eu' | 'be' | 'bn' | 'bs' | 'bg' | 'my' | 'ca' | 'zh' | 'hr' | 'cs' | 'da' | 'nl' | 'et' | 'en' | 'fi' | 'fr' | 'gl' | 'ka' | 'de' | 'el' | 'gu' | 'ht' | 'he' | 'hi' | 'hu' | 'is' | 'id' | 'it' | 'ja' | 'kn' | 'kk' | 'km' | 'ko' | 'lo' | 'lv' | 'lt' | 'mk' | 'mg' | 'ms' | 'ml' | 'mt' | 'mr' | 'ne' | 'no' | 'fa' | 'pl' | 'pt' | 'pa' | 'ro' | 'ru' | 'sr' | 'si' | 'sk' | 'sl' | 'es' | 'su' | 'sw' | 'sv' | 'tl' | 'tg' | 'ta' | 'te' | 'th' | 'tr' | 'uk' | 'ur' | 'uz' | 'vi' | 'cy' | 'yi';
  export interface DecodingOptions {
  language?: SpeechToTextLanguage;
package/lib/typescript/types/stt.d.ts.map CHANGED
@@ -1 +1 @@
- {"version":3,"file":"stt.d.ts","sourceRoot":"","sources":["../../../src/types/stt.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,UAAU,CAAC;AAE1C,MAAM,MAAM,SAAS,GAAG,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC,CAAC;AAEjD,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,MAAM,CAAC;IACd,GAAG,EAAE,MAAM,CAAC;IACZ,IAAI,EAAE,MAAM,CAAC;CACd;AAED,MAAM,WAAW,OAAO;IACtB,KAAK,EAAE,UAAU,EAAE,CAAC;CACrB;AAGD,MAAM,MAAM,oBAAoB,GAC5B,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,CAAC;AAET,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,EAAE,oBAAoB,CAAC;CACjC;AAED,MAAM,WAAW,uBAAuB;IACtC,cAAc,EAAE,OAAO,CAAC;IACxB,aAAa,EAAE,cAAc,CAAC;IAC9B,aAAa,EAAE,cAAc,CAAC;IAC9B,eAAe,EAAE,cAAc,CAAC;CACjC"}
+ {"version":3,"file":"stt.d.ts","sourceRoot":"","sources":["../../../src/types/stt.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,UAAU,CAAC;AAG1C,MAAM,MAAM,oBAAoB,GAC5B,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,CAAC;AAET,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,EAAE,oBAAoB,CAAC;CACjC;AAED,MAAM,WAAW,uBAAuB;IACtC,cAAc,EAAE,OAAO,CAAC;IACxB,aAAa,EAAE,cAAc,CAAC;IAC9B,aAAa,EAAE,cAAc,CAAC;IAC9B,eAAe,EAAE,cAAc,CAAC;CACjC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "react-native-executorch",
- "version": "0.5.3",
+ "version": "0.5.4",
  "description": "An easy way to run AI models in React Native with ExecuTorch",
  "source": "./src/index.ts",
  "main": "./lib/module/index.js",
package/react-native-executorch.podspec CHANGED
@@ -75,6 +75,8 @@ Pod::Spec.new do |s|
  "common/**/*.{cpp,c,h,hpp}",
  ]

+ s.libraries = "z"
+
  # Exclude file with tests to not introduce gtest dependency.
  # Do not include the headers from common/rnexecutorch/jsi/ as source files.
  # Xcode/Cocoapods leaks them to other pods that an app also depends on, so if
package/src/modules/natural_language_processing/SpeechToTextModule.ts CHANGED
@@ -1,84 +1,154 @@
+ import { Logger } from '../../common/Logger';
  import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt';
- import { ASR } from '../../utils/SpeechToTextModule/ASR';
- import { OnlineASRProcessor } from '../../utils/SpeechToTextModule/OnlineProcessor';
+ import { ResourceFetcher } from '../../utils/ResourceFetcher';

  export class SpeechToTextModule {
- private modelConfig!: SpeechToTextModelConfig;
- private asr: ASR = new ASR();
+ private nativeModule: any;

- private processor: OnlineASRProcessor = new OnlineASRProcessor(this.asr);
- private isStreaming = false;
- private readyToProcess = false;
- private minAudioSamples: number = 1 * 16000; // 1 second
+ private modelConfig!: SpeechToTextModelConfig;

  public async load(
  model: SpeechToTextModelConfig,
  onDownloadProgressCallback: (progress: number) => void = () => {}
  ) {
  this.modelConfig = model;
- return this.asr.load(model, onDownloadProgressCallback);
+
+ const tokenizerLoadPromise = ResourceFetcher.fetch(
+ undefined,
+ model.tokenizerSource
+ );
+ const encoderDecoderPromise = ResourceFetcher.fetch(
+ onDownloadProgressCallback,
+ model.encoderSource,
+ model.decoderSource
+ );
+ const [tokenizerSources, encoderDecoderResults] = await Promise.all([
+ tokenizerLoadPromise,
+ encoderDecoderPromise,
+ ]);
+ const encoderSource = encoderDecoderResults?.[0];
+ const decoderSource = encoderDecoderResults?.[1];
+ if (!encoderSource || !decoderSource || !tokenizerSources) {
+ throw new Error('Download interrupted.');
+ }
+ this.nativeModule = await global.loadSpeechToText(
+ encoderSource,
+ decoderSource,
+ tokenizerSources[0]!
+ );
  }

- public async encode(waveform: Float32Array): Promise<void> {
- return this.asr.encode(waveform);
+ public async encode(
+ waveform: Float32Array | number[]
+ ): Promise<Float32Array> {
+ if (Array.isArray(waveform)) {
+ Logger.info(
+ 'Passing waveform as number[] is deprecated, use Float32Array instead'
+ );
+ waveform = new Float32Array(waveform);
+ }
+ return new Float32Array(await this.nativeModule.encode(waveform));
  }

- public async decode(tokens: number[]): Promise<Float32Array> {
- return this.asr.decode(tokens);
+ public async decode(
+ tokens: Int32Array | number[],
+ encoderOutput: Float32Array | number[]
+ ): Promise<Float32Array> {
+ if (Array.isArray(tokens)) {
+ Logger.info(
+ 'Passing tokens as number[] is deprecated, use Int32Array instead'
+ );
+ tokens = new Int32Array(tokens);
+ }
+ if (Array.isArray(encoderOutput)) {
+ Logger.info(
+ 'Passing encoderOutput as number[] is deprecated, use Float32Array instead'
+ );
+ encoderOutput = new Float32Array(encoderOutput);
+ }
+ return new Float32Array(
+ await this.nativeModule.decode(tokens, encoderOutput)
+ );
  }

  public async transcribe(
- waveform: number[],
+ waveform: Float32Array | number[],
  options: DecodingOptions = {}
  ): Promise<string> {
  this.validateOptions(options);

- const segments = await this.asr.transcribe(waveform, options);
-
- let transcription = '';
- for (const segment of segments) {
- for (const word of segment.words) {
- transcription += ` ${word.word}`;
- }
+ if (Array.isArray(waveform)) {
+ Logger.info(
+ 'Passing waveform as number[] is deprecated, use Float32Array instead'
+ );
+ waveform = new Float32Array(waveform);
  }

- return transcription.trim();
+ return this.nativeModule.transcribe(waveform, options.language || '');
  }

- public async *stream(options: DecodingOptions = {}) {
- if (this.isStreaming) {
- throw new Error('Streaming is already in progress');
- }
+ public async *stream(
+ options: DecodingOptions = {}
+ ): AsyncGenerator<{ committed: string; nonCommitted: string }> {
  this.validateOptions(options);
- this.resetStreamState();
-
- this.isStreaming = true;
- while (this.isStreaming) {
- if (
- !this.readyToProcess ||
- this.processor.audioBuffer.length < this.minAudioSamples
- ) {
- await new Promise((resolve) => setTimeout(resolve, 100));
+
+ const queue: { committed: string; nonCommitted: string }[] = [];
+ let waiter: (() => void) | null = null;
+ let finished = false;
+ let error: unknown;
+
+ const wake = () => {
+ waiter?.();
+ waiter = null;
+ };
+
+ (async () => {
+ try {
+ await this.nativeModule.stream(
+ (committed: string, nonCommitted: string, isDone: boolean) => {
+ queue.push({ committed, nonCommitted });
+ if (isDone) {
+ finished = true;
+ }
+ wake();
+ },
+ options.language || ''
+ );
+ finished = true;
+ wake();
+ } catch (e) {
+ error = e;
+ finished = true;
+ wake();
+ }
+ })();
+
+ while (true) {
+ if (queue.length > 0) {
+ yield queue.shift()!;
+ if (finished && queue.length === 0) {
+ return;
+ }
  continue;
  }
-
- const { committed, nonCommitted } =
- await this.processor.processIter(options);
- yield { committed, nonCommitted };
- this.readyToProcess = false;
+ if (error) throw error;
+ if (finished) return;
+ await new Promise<void>((r) => (waiter = r));
  }
-
- const { committed } = await this.processor.finish();
- yield { committed, nonCommitted: '' };
  }

- public streamStop() {
- this.isStreaming = false;
+ public async streamInsert(waveform: Float32Array | number[]): Promise<void> {
+ if (Array.isArray(waveform)) {
+ Logger.info(
+ 'Passing waveform as number[] is deprecated, use Float32Array instead'
+ );
+ waveform = new Float32Array(waveform);
+ }
+ return this.nativeModule.streamInsert(waveform);
  }

- public streamInsert(waveform: number[]) {
- this.processor.insertAudioChunk(waveform);
- this.readyToProcess = true;
+ public async streamStop(): Promise<void> {
+ return this.nativeModule.streamStop();
  }

  private validateOptions(options: DecodingOptions) {
@@ -89,10 +159,4 @@ export class SpeechToTextModule {
  throw new Error('Model is multilingual, provide a language');
  }
  }
-
- private resetStreamState() {
- this.isStreaming = false;
- this.readyToProcess = false;
- this.processor = new OnlineASRProcessor(this.asr);
- }
  }
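Taken together, the refactored module exposes roughly the following call pattern. A minimal sketch, assuming the class is exported from the package root, that `modelConfig` carries the encoderSource/decoderSource/tokenizerSource/isMultilingual fields referenced above, and that some audio source yields Float32Array chunks (chunk size and sample rate are not dictated by this diff); the comment on streamStop() reflects a reading of the diff, not documented behavior:

    import { SpeechToTextModule } from 'react-native-executorch';

    async function runStreamingTranscription(
      modelConfig: any, // SpeechToTextModelConfig: encoderSource, decoderSource, tokenizerSource, isMultilingual
      audioChunks: AsyncIterable<Float32Array>
    ) {
      const stt = new SpeechToTextModule();
      await stt.load(modelConfig, (progress) => console.log('download', progress));

      // Feed audio in the background; the native side buffers it.
      (async () => {
        for await (const chunk of audioChunks) {
          await stt.streamInsert(chunk);
        }
        await stt.streamStop(); // presumably signals end of input so stream() can finish
      })();

      // Consume committed / non-committed partial results as they arrive.
      for await (const { committed, nonCommitted } of stt.stream()) {
        console.log('committed:', committed, 'pending:', nonCommitted);
      }

      // One-shot transcription now also accepts Float32Array (number[] is deprecated):
      // const text = await stt.transcribe(fullWaveform);
    }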
package/src/types/stt.ts CHANGED
@@ -1,17 +1,5 @@
  import { ResourceSource } from './common';

- export type WordTuple = [number, number, string];
-
- export interface WordObject {
- start: number;
- end: number;
- word: string;
- }
-
- export interface Segment {
- words: WordObject[];
- }
-
  // Languages supported by whisper (not whisper.en)
  export type SpeechToTextLanguage =
  | 'af'
package/common/rnexecutorch/models/EncoderDecoderBase.cpp DELETED
@@ -1,21 +0,0 @@
- #include <rnexecutorch/models/EncoderDecoderBase.h>
-
- namespace rnexecutorch::models {
-
- EncoderDecoderBase::EncoderDecoderBase(
- const std::string &encoderPath, const std::string &decoderPath,
- std::shared_ptr<react::CallInvoker> callInvoker)
- : callInvoker(callInvoker),
- encoder_(std::make_unique<BaseModel>(encoderPath, callInvoker)),
- decoder_(std::make_unique<BaseModel>(decoderPath, callInvoker)) {};
-
- size_t EncoderDecoderBase::getMemoryLowerBound() const noexcept {
- return encoder_->getMemoryLowerBound() + decoder_->getMemoryLowerBound();
- }
-
- void EncoderDecoderBase::unload() noexcept {
- encoder_.reset(nullptr);
- decoder_.reset(nullptr);
- }
-
- } // namespace rnexecutorch::models
package/common/rnexecutorch/models/EncoderDecoderBase.h DELETED
@@ -1,31 +0,0 @@
- #pragma once
-
- #include <ReactCommon/CallInvoker.h>
- #include <memory>
- #include <rnexecutorch/models/BaseModel.h>
- #include <string>
-
- namespace rnexecutorch::models {
-
- using namespace facebook;
- using executorch::aten::Tensor;
- using executorch::runtime::EValue;
-
- class EncoderDecoderBase {
- public:
- explicit EncoderDecoderBase(const std::string &encoderPath,
- const std::string &decoderPath,
- std::shared_ptr<react::CallInvoker> callInvoker);
- size_t getMemoryLowerBound() const noexcept;
- void unload() noexcept;
-
- protected:
- std::shared_ptr<react::CallInvoker> callInvoker;
- std::unique_ptr<BaseModel> encoder_;
- std::unique_ptr<BaseModel> decoder_;
-
- private:
- size_t memorySizeLowerBound;
- };
-
- } // namespace rnexecutorch::models
package/common/rnexecutorch/models/speech_to_text/SpeechToTextStrategy.h DELETED
@@ -1,27 +0,0 @@
- #pragma once
-
- #include "executorch/extension/tensor/tensor_ptr.h"
- #include <rnexecutorch/host_objects/JSTensorViewOut.h>
- #include <span>
- #include <vector>
-
- namespace rnexecutorch::models::speech_to_text {
-
- using TensorPtr = ::executorch::extension::TensorPtr;
-
- class SpeechToTextStrategy {
- public:
- virtual ~SpeechToTextStrategy() = default;
-
- virtual TensorPtr prepareAudioInput(std::span<float> waveform) = 0;
-
- virtual TensorPtr
- prepareTokenInput(const std::vector<int64_t> &prevTokens) = 0;
-
- virtual std::string getDecoderMethod() const = 0;
-
- virtual std::shared_ptr<OwningArrayBuffer> extractOutputToken(
- const executorch::aten::Tensor &decoderOutputTensor) const = 0;
- };
-
- } // namespace rnexecutorch::models::speech_to_text
package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.cpp DELETED
@@ -1,50 +0,0 @@
- #include "executorch/extension/tensor/tensor_ptr.h"
- #include "rnexecutorch/data_processing/dsp.h"
- #include <rnexecutorch/models/speech_to_text/WhisperStrategy.h>
-
- namespace rnexecutorch::models::speech_to_text {
-
- using namespace ::executorch::extension;
- using namespace ::executorch::aten;
-
- TensorPtr WhisperStrategy::prepareAudioInput(std::span<float> waveform) {
- constexpr auto fftWindowSize = 512;
- constexpr auto stftHopLength = 160;
- constexpr auto innerDim = 256;
- preprocessedData =
- dsp::stftFromWaveform(waveform, fftWindowSize, stftHopLength);
- const auto numFrames = preprocessedData.size() / innerDim;
- std::vector<int32_t> inputShape = {static_cast<int32_t>(numFrames), innerDim};
- return make_tensor_ptr(std::move(inputShape), std::move(preprocessedData));
- }
-
- TensorPtr
- WhisperStrategy::prepareTokenInput(const std::vector<int64_t> &prevTokens) {
- tokens32.clear();
- tokens32.reserve(prevTokens.size());
- for (auto token : prevTokens) {
- tokens32.push_back(static_cast<int32_t>(token));
- }
- auto tensorSizes = {1, static_cast<int32_t>(tokens32.size())};
- return make_tensor_ptr(std::move(tensorSizes), std::move(tokens32));
- }
-
- std::shared_ptr<OwningArrayBuffer> WhisperStrategy::extractOutputToken(
- const executorch::aten::Tensor &decoderOutputTensor) const {
- const auto innerDim = decoderOutputTensor.size(1);
- const auto dictSize = decoderOutputTensor.size(2);
- auto outputNumel = decoderOutputTensor.numel();
- auto dataPtr =
- static_cast<const float *>(decoderOutputTensor.const_data_ptr()) +
- (innerDim - 1) * dictSize;
-
- std::span<const float> modelOutput(dataPtr, outputNumel / innerDim);
- auto createBuffer = [](const auto &data, size_t size) {
- auto buffer = std::make_shared<OwningArrayBuffer>(size);
- std::memcpy(buffer->data(), data, size);
- return buffer;
- };
- return createBuffer(modelOutput.data(), modelOutput.size_bytes());
- }
-
- } // namespace rnexecutorch::models::speech_to_text