react-native-executorch 0.5.3 → 0.5.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/cpp/CMakeLists.txt +2 -1
- package/common/rnexecutorch/data_processing/Numerical.cpp +27 -19
- package/common/rnexecutorch/data_processing/Numerical.h +53 -4
- package/common/rnexecutorch/data_processing/dsp.cpp +1 -1
- package/common/rnexecutorch/data_processing/dsp.h +1 -1
- package/common/rnexecutorch/data_processing/gzip.cpp +47 -0
- package/common/rnexecutorch/data_processing/gzip.h +7 -0
- package/common/rnexecutorch/host_objects/ModelHostObject.h +24 -0
- package/common/rnexecutorch/metaprogramming/TypeConcepts.h +21 -1
- package/common/rnexecutorch/models/BaseModel.cpp +3 -2
- package/common/rnexecutorch/models/BaseModel.h +3 -2
- package/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +103 -39
- package/common/rnexecutorch/models/speech_to_text/SpeechToText.h +39 -21
- package/common/rnexecutorch/models/speech_to_text/asr/ASR.cpp +310 -0
- package/common/rnexecutorch/models/speech_to_text/asr/ASR.h +62 -0
- package/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.cpp +82 -0
- package/common/rnexecutorch/models/speech_to_text/stream/HypothesisBuffer.h +25 -0
- package/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.cpp +99 -0
- package/common/rnexecutorch/models/speech_to_text/stream/OnlineASRProcessor.h +33 -0
- package/common/rnexecutorch/models/speech_to_text/types/DecodingOptions.h +15 -0
- package/common/rnexecutorch/models/speech_to_text/types/GenerationResult.h +12 -0
- package/common/rnexecutorch/models/speech_to_text/types/ProcessResult.h +12 -0
- package/common/rnexecutorch/models/speech_to_text/types/Segment.h +14 -0
- package/common/rnexecutorch/models/speech_to_text/types/Word.h +13 -0
- package/lib/module/modules/natural_language_processing/SpeechToTextModule.js +75 -53
- package/lib/module/modules/natural_language_processing/SpeechToTextModule.js.map +1 -1
- package/lib/typescript/hooks/natural_language_processing/useSpeechToText.d.ts +5 -5
- package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts +7 -12
- package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts.map +1 -1
- package/lib/typescript/types/stt.d.ts +0 -9
- package/lib/typescript/types/stt.d.ts.map +1 -1
- package/package.json +1 -1
- package/react-native-executorch.podspec +2 -0
- package/src/modules/natural_language_processing/SpeechToTextModule.ts +118 -54
- package/src/types/stt.ts +0 -12
- package/common/rnexecutorch/models/EncoderDecoderBase.cpp +0 -21
- package/common/rnexecutorch/models/EncoderDecoderBase.h +0 -31
- package/common/rnexecutorch/models/speech_to_text/SpeechToTextStrategy.h +0 -27
- package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.cpp +0 -50
- package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.h +0 -25
- package/lib/Error.js +0 -53
- package/lib/ThreadPool.d.ts +0 -10
- package/lib/ThreadPool.js +0 -28
- package/lib/common/Logger.d.ts +0 -8
- package/lib/common/Logger.js +0 -19
- package/lib/constants/directories.js +0 -2
- package/lib/constants/llmDefaults.d.ts +0 -6
- package/lib/constants/llmDefaults.js +0 -16
- package/lib/constants/modelUrls.d.ts +0 -223
- package/lib/constants/modelUrls.js +0 -322
- package/lib/constants/ocr/models.d.ts +0 -882
- package/lib/constants/ocr/models.js +0 -182
- package/lib/constants/ocr/symbols.js +0 -139
- package/lib/constants/sttDefaults.d.ts +0 -28
- package/lib/constants/sttDefaults.js +0 -68
- package/lib/controllers/LLMController.d.ts +0 -47
- package/lib/controllers/LLMController.js +0 -213
- package/lib/controllers/OCRController.js +0 -67
- package/lib/controllers/SpeechToTextController.d.ts +0 -56
- package/lib/controllers/SpeechToTextController.js +0 -349
- package/lib/controllers/VerticalOCRController.js +0 -70
- package/lib/hooks/computer_vision/useClassification.d.ts +0 -15
- package/lib/hooks/computer_vision/useClassification.js +0 -7
- package/lib/hooks/computer_vision/useImageEmbeddings.d.ts +0 -15
- package/lib/hooks/computer_vision/useImageEmbeddings.js +0 -7
- package/lib/hooks/computer_vision/useImageSegmentation.d.ts +0 -38
- package/lib/hooks/computer_vision/useImageSegmentation.js +0 -7
- package/lib/hooks/computer_vision/useOCR.d.ts +0 -20
- package/lib/hooks/computer_vision/useOCR.js +0 -41
- package/lib/hooks/computer_vision/useObjectDetection.d.ts +0 -15
- package/lib/hooks/computer_vision/useObjectDetection.js +0 -7
- package/lib/hooks/computer_vision/useStyleTransfer.d.ts +0 -15
- package/lib/hooks/computer_vision/useStyleTransfer.js +0 -7
- package/lib/hooks/computer_vision/useVerticalOCR.d.ts +0 -21
- package/lib/hooks/computer_vision/useVerticalOCR.js +0 -43
- package/lib/hooks/general/useExecutorchModule.d.ts +0 -13
- package/lib/hooks/general/useExecutorchModule.js +0 -7
- package/lib/hooks/natural_language_processing/useLLM.d.ts +0 -10
- package/lib/hooks/natural_language_processing/useLLM.js +0 -78
- package/lib/hooks/natural_language_processing/useSpeechToText.d.ts +0 -27
- package/lib/hooks/natural_language_processing/useSpeechToText.js +0 -49
- package/lib/hooks/natural_language_processing/useTextEmbeddings.d.ts +0 -16
- package/lib/hooks/natural_language_processing/useTextEmbeddings.js +0 -7
- package/lib/hooks/natural_language_processing/useTokenizer.d.ts +0 -17
- package/lib/hooks/natural_language_processing/useTokenizer.js +0 -52
- package/lib/hooks/useModule.js +0 -45
- package/lib/hooks/useNonStaticModule.d.ts +0 -20
- package/lib/hooks/useNonStaticModule.js +0 -49
- package/lib/index.d.ts +0 -48
- package/lib/index.js +0 -58
- package/lib/module/utils/SpeechToTextModule/ASR.js +0 -191
- package/lib/module/utils/SpeechToTextModule/ASR.js.map +0 -1
- package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js +0 -73
- package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js.map +0 -1
- package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js +0 -56
- package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js.map +0 -1
- package/lib/module/utils/stt.js +0 -22
- package/lib/module/utils/stt.js.map +0 -1
- package/lib/modules/BaseModule.js +0 -25
- package/lib/modules/BaseNonStaticModule.js +0 -14
- package/lib/modules/computer_vision/ClassificationModule.d.ts +0 -8
- package/lib/modules/computer_vision/ClassificationModule.js +0 -17
- package/lib/modules/computer_vision/ImageEmbeddingsModule.d.ts +0 -8
- package/lib/modules/computer_vision/ImageEmbeddingsModule.js +0 -17
- package/lib/modules/computer_vision/ImageSegmentationModule.d.ts +0 -11
- package/lib/modules/computer_vision/ImageSegmentationModule.js +0 -27
- package/lib/modules/computer_vision/OCRModule.d.ts +0 -14
- package/lib/modules/computer_vision/OCRModule.js +0 -17
- package/lib/modules/computer_vision/ObjectDetectionModule.d.ts +0 -9
- package/lib/modules/computer_vision/ObjectDetectionModule.js +0 -17
- package/lib/modules/computer_vision/StyleTransferModule.d.ts +0 -8
- package/lib/modules/computer_vision/StyleTransferModule.js +0 -17
- package/lib/modules/computer_vision/VerticalOCRModule.d.ts +0 -14
- package/lib/modules/computer_vision/VerticalOCRModule.js +0 -19
- package/lib/modules/general/ExecutorchModule.d.ts +0 -7
- package/lib/modules/general/ExecutorchModule.js +0 -14
- package/lib/modules/natural_language_processing/LLMModule.d.ts +0 -28
- package/lib/modules/natural_language_processing/LLMModule.js +0 -45
- package/lib/modules/natural_language_processing/SpeechToTextModule.d.ts +0 -24
- package/lib/modules/natural_language_processing/SpeechToTextModule.js +0 -36
- package/lib/modules/natural_language_processing/TextEmbeddingsModule.d.ts +0 -9
- package/lib/modules/natural_language_processing/TextEmbeddingsModule.js +0 -21
- package/lib/modules/natural_language_processing/TokenizerModule.d.ts +0 -12
- package/lib/modules/natural_language_processing/TokenizerModule.js +0 -30
- package/lib/native/NativeETInstaller.js +0 -2
- package/lib/native/NativeOCR.js +0 -2
- package/lib/native/NativeVerticalOCR.js +0 -2
- package/lib/native/RnExecutorchModules.d.ts +0 -7
- package/lib/native/RnExecutorchModules.js +0 -18
- package/lib/tsconfig.tsbuildinfo +0 -1
- package/lib/types/common.d.ts +0 -32
- package/lib/types/common.js +0 -25
- package/lib/types/imageSegmentation.js +0 -26
- package/lib/types/llm.d.ts +0 -46
- package/lib/types/llm.js +0 -9
- package/lib/types/objectDetection.js +0 -94
- package/lib/types/ocr.js +0 -1
- package/lib/types/stt.d.ts +0 -94
- package/lib/types/stt.js +0 -85
- package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts +0 -27
- package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts.map +0 -1
- package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts +0 -23
- package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts.map +0 -1
- package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts +0 -13
- package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts.map +0 -1
- package/lib/typescript/utils/stt.d.ts +0 -2
- package/lib/typescript/utils/stt.d.ts.map +0 -1
- package/lib/utils/ResourceFetcher.d.ts +0 -24
- package/lib/utils/ResourceFetcher.js +0 -305
- package/lib/utils/ResourceFetcherUtils.d.ts +0 -54
- package/lib/utils/ResourceFetcherUtils.js +0 -127
- package/lib/utils/llm.d.ts +0 -6
- package/lib/utils/llm.js +0 -72
- package/lib/utils/stt.js +0 -21
- package/src/utils/SpeechToTextModule/ASR.ts +0 -303
- package/src/utils/SpeechToTextModule/OnlineProcessor.ts +0 -87
- package/src/utils/SpeechToTextModule/hypothesisBuffer.ts +0 -79
- package/src/utils/stt.ts +0 -28
package/lib/module/modules/natural_language_processing/SpeechToTextModule.js
CHANGED
@@ -1,71 +1,98 @@
 "use strict";

-import {
-import {
+import { Logger } from '../../common/Logger';
+import { ResourceFetcher } from '../../utils/ResourceFetcher';
 export class SpeechToTextModule {
-  asr = new ASR();
-  processor = new OnlineASRProcessor(this.asr);
-  isStreaming = false;
-  readyToProcess = false;
-  minAudioSamples = 1 * 16000; // 1 second
-
   async load(model, onDownloadProgressCallback = () => {}) {
     this.modelConfig = model;
-
+    const tokenizerLoadPromise = ResourceFetcher.fetch(undefined, model.tokenizerSource);
+    const encoderDecoderPromise = ResourceFetcher.fetch(onDownloadProgressCallback, model.encoderSource, model.decoderSource);
+    const [tokenizerSources, encoderDecoderResults] = await Promise.all([tokenizerLoadPromise, encoderDecoderPromise]);
+    const encoderSource = encoderDecoderResults?.[0];
+    const decoderSource = encoderDecoderResults?.[1];
+    if (!encoderSource || !decoderSource || !tokenizerSources) {
+      throw new Error('Download interrupted.');
+    }
+    this.nativeModule = await global.loadSpeechToText(encoderSource, decoderSource, tokenizerSources[0]);
   }
   async encode(waveform) {
-
+    if (Array.isArray(waveform)) {
+      Logger.info('Passing waveform as number[] is deprecated, use Float32Array instead');
+      waveform = new Float32Array(waveform);
+    }
+    return new Float32Array(await this.nativeModule.encode(waveform));
   }
-  async decode(tokens) {
-
+  async decode(tokens, encoderOutput) {
+    if (Array.isArray(tokens)) {
+      Logger.info('Passing tokens as number[] is deprecated, use Int32Array instead');
+      tokens = new Int32Array(tokens);
+    }
+    if (Array.isArray(encoderOutput)) {
+      Logger.info('Passing encoderOutput as number[] is deprecated, use Float32Array instead');
+      encoderOutput = new Float32Array(encoderOutput);
+    }
+    return new Float32Array(await this.nativeModule.decode(tokens, encoderOutput));
   }
   async transcribe(waveform, options = {}) {
     this.validateOptions(options);
-
-
-
-    for (const word of segment.words) {
-      transcription += ` ${word.word}`;
-    }
+    if (Array.isArray(waveform)) {
+      Logger.info('Passing waveform as number[] is deprecated, use Float32Array instead');
+      waveform = new Float32Array(waveform);
     }
-    return
+    return this.nativeModule.transcribe(waveform, options.language || '');
   }
   async *stream(options = {}) {
-    if (this.isStreaming) {
-      throw new Error('Streaming is already in progress');
-    }
     this.validateOptions(options);
-
-
-
-
-
+    const queue = [];
+    let waiter = null;
+    let finished = false;
+    let error;
+    const wake = () => {
+      waiter?.();
+      waiter = null;
+    };
+    (async () => {
+      try {
+        await this.nativeModule.stream((committed, nonCommitted, isDone) => {
+          queue.push({
+            committed,
+            nonCommitted
+          });
+          if (isDone) {
+            finished = true;
+          }
+          wake();
+        }, options.language || '');
+        finished = true;
+        wake();
+      } catch (e) {
+        error = e;
+        finished = true;
+        wake();
+      }
+    })();
+    while (true) {
+      if (queue.length > 0) {
+        yield queue.shift();
+        if (finished && queue.length === 0) {
+          return;
+        }
         continue;
       }
-
-
-
-      } = await this.processor.processIter(options);
-      yield {
-        committed,
-        nonCommitted
-      };
-      this.readyToProcess = false;
+      if (error) throw error;
+      if (finished) return;
+      await new Promise(r => waiter = r);
     }
-    const {
-      committed
-    } = await this.processor.finish();
-    yield {
-      committed,
-      nonCommitted: ''
-    };
   }
-
-
+  async streamInsert(waveform) {
+    if (Array.isArray(waveform)) {
+      Logger.info('Passing waveform as number[] is deprecated, use Float32Array instead');
+      waveform = new Float32Array(waveform);
+    }
+    return this.nativeModule.streamInsert(waveform);
   }
-
-    this.
-    this.readyToProcess = true;
+  async streamStop() {
+    return this.nativeModule.streamStop();
   }
   validateOptions(options) {
     if (!this.modelConfig.isMultilingual && options.language) {
@@ -75,10 +102,5 @@ export class SpeechToTextModule {
       throw new Error('Model is multilingual, provide a language');
     }
   }
-  resetStreamState() {
-    this.isStreaming = false;
-    this.readyToProcess = false;
-    this.processor = new OnlineASRProcessor(this.asr);
-  }
 }
 //# sourceMappingURL=SpeechToTextModule.js.map
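The rewritten stream() above bridges the native module's callback-based streaming API into an async generator using a queue plus a one-shot "waiter" promise. The following is a minimal, generic sketch of that pattern for reference only; CallbackProducer and bridge are illustrative names, not part of the package API.

// Adapts a callback-based producer into an async generator.
type CallbackProducer<T> = (
  emit: (value: T, isDone: boolean) => void
) => Promise<void>;

async function* bridge<T>(produce: CallbackProducer<T>): AsyncGenerator<T> {
  const queue: T[] = [];
  let waiter: (() => void) | null = null;
  let finished = false;
  let error: unknown;

  const wake = () => {
    waiter?.();
    waiter = null;
  };

  // Run the producer in the background; every emitted value is queued and the
  // consumer loop below is woken up.
  (async () => {
    try {
      await produce((value, isDone) => {
        queue.push(value);
        if (isDone) finished = true;
        wake();
      });
      finished = true;
    } catch (e) {
      error = e;
      finished = true;
    } finally {
      wake();
    }
  })();

  while (true) {
    if (queue.length > 0) {
      yield queue.shift()!;
      continue;
    }
    if (error) throw error;
    if (finished) return;
    // Park until the producer calls wake().
    await new Promise<void>((resolve) => (waiter = resolve));
  }
}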
package/lib/module/modules/natural_language_processing/SpeechToTextModule.js.map
CHANGED
@@ -1 +1 @@
-{"version":3,"names":["
+
{"version":3,"names":["Logger","ResourceFetcher","SpeechToTextModule","load","model","onDownloadProgressCallback","modelConfig","tokenizerLoadPromise","fetch","undefined","tokenizerSource","encoderDecoderPromise","encoderSource","decoderSource","tokenizerSources","encoderDecoderResults","Promise","all","Error","nativeModule","global","loadSpeechToText","encode","waveform","Array","isArray","info","Float32Array","decode","tokens","encoderOutput","Int32Array","transcribe","options","validateOptions","language","stream","queue","waiter","finished","error","wake","committed","nonCommitted","isDone","push","e","length","shift","r","streamInsert","streamStop","isMultilingual"],"sourceRoot":"../../../../src","sources":["modules/natural_language_processing/SpeechToTextModule.ts"],"mappings":";;AAAA,SAASA,MAAM,QAAQ,qBAAqB;AAE5C,SAASC,eAAe,QAAQ,6BAA6B;AAE7D,OAAO,MAAMC,kBAAkB,CAAC;EAK9B,MAAaC,IAAIA,CACfC,KAA8B,EAC9BC,0BAAsD,GAAGA,CAAA,KAAM,CAAC,CAAC,EACjE;IACA,IAAI,CAACC,WAAW,GAAGF,KAAK;IAExB,MAAMG,oBAAoB,GAAGN,eAAe,CAACO,KAAK,CAChDC,SAAS,EACTL,KAAK,CAACM,eACR,CAAC;IACD,MAAMC,qBAAqB,GAAGV,eAAe,CAACO,KAAK,CACjDH,0BAA0B,EAC1BD,KAAK,CAACQ,aAAa,EACnBR,KAAK,CAACS,aACR,CAAC;IACD,MAAM,CAACC,gBAAgB,EAAEC,qBAAqB,CAAC,GAAG,MAAMC,OAAO,CAACC,GAAG,CAAC,CAClEV,oBAAoB,EACpBI,qBAAqB,CACtB,CAAC;IACF,MAAMC,aAAa,GAAGG,qBAAqB,GAAG,CAAC,CAAC;IAChD,MAAMF,aAAa,GAAGE,qBAAqB,GAAG,CAAC,CAAC;IAChD,IAAI,CAACH,aAAa,IAAI,CAACC,aAAa,IAAI,CAACC,gBAAgB,EAAE;MACzD,MAAM,IAAII,KAAK,CAAC,uBAAuB,CAAC;IAC1C;IACA,IAAI,CAACC,YAAY,GAAG,MAAMC,MAAM,CAACC,gBAAgB,CAC/CT,aAAa,EACbC,aAAa,EACbC,gBAAgB,CAAC,CAAC,CACpB,CAAC;EACH;EAEA,MAAaQ,MAAMA,CACjBC,QAAiC,EACV;IACvB,IAAIC,KAAK,CAACC,OAAO,CAACF,QAAQ,CAAC,EAAE;MAC3BvB,MAAM,CAAC0B,IAAI,CACT,sEACF,CAAC;MACDH,QAAQ,GAAG,IAAII,YAAY,CAACJ,QAAQ,CAAC;IACvC;IACA,OAAO,IAAII,YAAY,CAAC,MAAM,IAAI,CAACR,YAAY,CAACG,MAAM,CAACC,QAAQ,CAAC,CAAC;EACnE;EAEA,MAAaK,MAAMA,CACjBC,MAA6B,EAC7BC,aAAsC,EACf;IACvB,IAAIN,KAAK,CAACC,OAAO,CAACI,MAAM,CAAC,EAAE;MACzB7B,MAAM,CAAC0B,IAAI,CACT,kEACF,CAAC;MACDG,MAAM,GAAG,IAAIE,UAAU,CAACF,MAAM,CAAC;IACjC;IACA,IAAIL,KAAK,CAACC,OAAO,CAACK,aAAa,CAAC,EAAE;MAChC9B,MAAM,CAAC0B,IAAI,CACT,2EACF,CAAC;MACDI,aAAa,GAAG,IAAIH,YAAY,CAACG,aAAa,CAAC;IACjD;IACA,OAAO,IAAIH,YAAY,CACrB,MAAM,IAAI,CAACR,YAAY,CAACS,MAAM,CAACC,MAAM,EAAEC,aAAa,CACtD,CAAC;EACH;EAEA,MAAaE,UAAUA,CACrBT,QAAiC,EACjCU,OAAwB,GAAG,CAAC,CAAC,EACZ;IACjB,IAAI,CAACC,eAAe,CAACD,OAAO,CAAC;IAE7B,IAAIT,KAAK,CAACC,OAAO,CAACF,QAAQ,CAAC,EAAE;MAC3BvB,MAAM,CAAC0B,IAAI,CACT,sEACF,CAAC;MACDH,QAAQ,GAAG,IAAII,YAAY,CAACJ,QAAQ,CAAC;IACvC;IAEA,OAAO,IAAI,CAACJ,YAAY,CAACa,UAAU,CAACT,QAAQ,EAAEU,OAAO,CAACE,QAAQ,IAAI,EAAE,CAAC;EACvE;EAEA,OAAcC,MAAMA,CAClBH,OAAwB,GAAG,CAAC,CAAC,EACgC;IAC7D,IAAI,CAACC,eAAe,CAACD,OAAO,CAAC;IAE7B,MAAMI,KAAoD,GAAG,EAAE;IAC/D,IAAIC,MAA2B,GAAG,IAAI;IACtC,IAAIC,QAAQ,GAAG,KAAK;IACpB,IAAIC,KAAc;IAElB,MAAMC,IAAI,GAAGA,CAAA,KAAM;MACjBH,MAAM,GAAG,CAAC;MACVA,MAAM,GAAG,IAAI;IACf,CAAC;IAED,CAAC,YAAY;MACX,IAAI;QACF,MAAM,IAAI,CAACnB,YAAY,CAACiB,MAAM,CAC5B,CAACM,SAAiB,EAAEC,YAAoB,EAAEC,MAAe,KAAK;UAC5DP,KAAK,CAACQ,IAAI,CAAC;YAAEH,SAAS;YAAEC;UAAa,CAAC,CAAC;UACvC,IAAIC,MAAM,EAAE;YACVL,QAAQ,GAAG,IAAI;UACjB;UACAE,IAAI,CAAC,CAAC;QACR,CAAC,EACDR,OAAO,CAACE,QAAQ,IAAI,EACtB,CAAC;QACDI,QAAQ,GAAG,IAAI;QACfE,IAAI,CAAC,CAAC;MACR,CAAC,CAAC,OAAOK,CAAC,EAAE;QACVN,KAAK,GAAGM,CAAC;QACTP,QAAQ,GAAG,IAAI;QACfE,IAAI,CAAC,CAAC;MACR;IACF,CAAC,EAAE,CAAC;IAEJ,OAAO,IAAI,EAAE;MACX,IAAIJ,KAAK,CAACU,MAAM,GAAG,CAAC,EAAE;QACpB,MAAMV,KAAK,CAACW,KAAK,CAAC,CAAE;QACpB,IAAIT,QAAQ,IAAIF,KAAK,CAACU,MAAM,KAAK,CAAC,EAAE;UAClC;QACF;QACA;MACF;MACA,IAAIP,KAAK,EAAE,MAAMA,KAAK;MACtB,IAAID,QAAQ
,EAAE;MACd,MAAM,IAAIvB,OAAO,CAAQiC,CAAC,IAAMX,MAAM,GAAGW,CAAE,CAAC;IAC9C;EACF;EAEA,MAAaC,YAAYA,CAAC3B,QAAiC,EAAiB;IAC1E,IAAIC,KAAK,CAACC,OAAO,CAACF,QAAQ,CAAC,EAAE;MAC3BvB,MAAM,CAAC0B,IAAI,CACT,sEACF,CAAC;MACDH,QAAQ,GAAG,IAAII,YAAY,CAACJ,QAAQ,CAAC;IACvC;IACA,OAAO,IAAI,CAACJ,YAAY,CAAC+B,YAAY,CAAC3B,QAAQ,CAAC;EACjD;EAEA,MAAa4B,UAAUA,CAAA,EAAkB;IACvC,OAAO,IAAI,CAAChC,YAAY,CAACgC,UAAU,CAAC,CAAC;EACvC;EAEQjB,eAAeA,CAACD,OAAwB,EAAE;IAChD,IAAI,CAAC,IAAI,CAAC3B,WAAW,CAAC8C,cAAc,IAAInB,OAAO,CAACE,QAAQ,EAAE;MACxD,MAAM,IAAIjB,KAAK,CAAC,gDAAgD,CAAC;IACnE;IACA,IAAI,IAAI,CAACZ,WAAW,CAAC8C,cAAc,IAAI,CAACnB,OAAO,CAACE,QAAQ,EAAE;MACxD,MAAM,IAAIjB,KAAK,CAAC,2CAA2C,CAAC;IAC9D;EACF;AACF","ignoreList":[]}
package/lib/typescript/hooks/natural_language_processing/useSpeechToText.d.ts
CHANGED
@@ -9,11 +9,11 @@ export declare const useSpeechToText: ({ model, preventLoad, }: {
     downloadProgress: number;
     committedTranscription: string;
     nonCommittedTranscription: string;
-    encode: (waveform: Float32Array<ArrayBufferLike>) => Promise<
-    decode: (tokens: number[]) => Promise<Float32Array<ArrayBufferLike>>;
-    transcribe: (waveform: number[]
+    encode: (waveform: number[] | Float32Array<ArrayBufferLike>) => Promise<Float32Array<ArrayBufferLike>>;
+    decode: (tokens: number[] | Int32Array<ArrayBufferLike>, encoderOutput: number[] | Float32Array<ArrayBufferLike>) => Promise<Float32Array<ArrayBufferLike>>;
+    transcribe: (waveform: number[] | Float32Array<ArrayBufferLike>, options?: import("../../types/stt").DecodingOptions | undefined) => Promise<string>;
     stream: () => Promise<string>;
-    streamStop: () => void
-    streamInsert: (waveform: number[]) => void
+    streamStop: () => Promise<void>;
+    streamInsert: (waveform: number[] | Float32Array<ArrayBufferLike>) => Promise<void>;
 };
 //# sourceMappingURL=useSpeechToText.d.ts.map
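For orientation, a hypothetical component using the hook surface declared above. Only the fields and signatures visible in this diff are assumed; the model object is a placeholder (its field names follow SpeechToTextModelConfig as used elsewhere in this diff, the URLs are made up), and the progress value is shown exactly as the hook reports it.

import React from 'react';
import { Text, View } from 'react-native';
import { useSpeechToText } from 'react-native-executorch';

// Placeholder model config; replace the URLs with real resources.
const model = {
  isMultilingual: false,
  encoderSource: 'https://example.com/whisper_encoder.pte',
  decoderSource: 'https://example.com/whisper_decoder.pte',
  tokenizerSource: 'https://example.com/tokenizer.json',
};

export function TranscriptionView() {
  const stt = useSpeechToText({ model });

  // committedTranscription is text the streaming pipeline no longer revises;
  // nonCommittedTranscription is the still-changing tail of the hypothesis.
  return (
    <View>
      <Text>Download progress: {stt.downloadProgress}</Text>
      <Text>
        {stt.committedTranscription} {stt.nonCommittedTranscription}
      </Text>
    </View>
  );
}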
package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts
CHANGED
@@ -1,22 +1,17 @@
 import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt';
 export declare class SpeechToTextModule {
+    private nativeModule;
     private modelConfig;
-    private asr;
-    private processor;
-    private isStreaming;
-    private readyToProcess;
-    private minAudioSamples;
     load(model: SpeechToTextModelConfig, onDownloadProgressCallback?: (progress: number) => void): Promise<void>;
-    encode(waveform: Float32Array): Promise<
-    decode(tokens: number[]): Promise<Float32Array>;
-    transcribe(waveform: number[], options?: DecodingOptions): Promise<string>;
+    encode(waveform: Float32Array | number[]): Promise<Float32Array>;
+    decode(tokens: Int32Array | number[], encoderOutput: Float32Array | number[]): Promise<Float32Array>;
+    transcribe(waveform: Float32Array | number[], options?: DecodingOptions): Promise<string>;
     stream(options?: DecodingOptions): AsyncGenerator<{
         committed: string;
         nonCommitted: string;
-    }
-
-
+    }>;
+    streamInsert(waveform: Float32Array | number[]): Promise<void>;
+    streamStop(): Promise<void>;
     private validateOptions;
-    private resetStreamState;
 }
 //# sourceMappingURL=SpeechToTextModule.d.ts.map
package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"SpeechToTextModule.d.ts","sourceRoot":"","sources":["../../../../src/modules/natural_language_processing/SpeechToTextModule.ts"],"names":[],"mappings":"
+
{"version":3,"file":"SpeechToTextModule.d.ts","sourceRoot":"","sources":["../../../../src/modules/natural_language_processing/SpeechToTextModule.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,eAAe,EAAE,uBAAuB,EAAE,MAAM,iBAAiB,CAAC;AAG3E,qBAAa,kBAAkB;IAC7B,OAAO,CAAC,YAAY,CAAM;IAE1B,OAAO,CAAC,WAAW,CAA2B;IAEjC,IAAI,CACf,KAAK,EAAE,uBAAuB,EAC9B,0BAA0B,GAAE,CAAC,QAAQ,EAAE,MAAM,KAAK,IAAe;IA6BtD,MAAM,CACjB,QAAQ,EAAE,YAAY,GAAG,MAAM,EAAE,GAChC,OAAO,CAAC,YAAY,CAAC;IAUX,MAAM,CACjB,MAAM,EAAE,UAAU,GAAG,MAAM,EAAE,EAC7B,aAAa,EAAE,YAAY,GAAG,MAAM,EAAE,GACrC,OAAO,CAAC,YAAY,CAAC;IAkBX,UAAU,CACrB,QAAQ,EAAE,YAAY,GAAG,MAAM,EAAE,EACjC,OAAO,GAAE,eAAoB,GAC5B,OAAO,CAAC,MAAM,CAAC;IAaJ,MAAM,CAClB,OAAO,GAAE,eAAoB,GAC5B,cAAc,CAAC;QAAE,SAAS,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,MAAM,CAAA;KAAE,CAAC;IAgDjD,YAAY,CAAC,QAAQ,EAAE,YAAY,GAAG,MAAM,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IAU9D,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IAIxC,OAAO,CAAC,eAAe;CAQxB"}
package/lib/typescript/types/stt.d.ts
CHANGED
@@ -1,13 +1,4 @@
 import { ResourceSource } from './common';
-export type WordTuple = [number, number, string];
-export interface WordObject {
-    start: number;
-    end: number;
-    word: string;
-}
-export interface Segment {
-    words: WordObject[];
-}
 export type SpeechToTextLanguage = 'af' | 'sq' | 'ar' | 'hy' | 'az' | 'eu' | 'be' | 'bn' | 'bs' | 'bg' | 'my' | 'ca' | 'zh' | 'hr' | 'cs' | 'da' | 'nl' | 'et' | 'en' | 'fi' | 'fr' | 'gl' | 'ka' | 'de' | 'el' | 'gu' | 'ht' | 'he' | 'hi' | 'hu' | 'is' | 'id' | 'it' | 'ja' | 'kn' | 'kk' | 'km' | 'ko' | 'lo' | 'lv' | 'lt' | 'mk' | 'mg' | 'ms' | 'ml' | 'mt' | 'mr' | 'ne' | 'no' | 'fa' | 'pl' | 'pt' | 'pa' | 'ro' | 'ru' | 'sr' | 'si' | 'sk' | 'sl' | 'es' | 'su' | 'sw' | 'sv' | 'tl' | 'tg' | 'ta' | 'te' | 'th' | 'tr' | 'uk' | 'ur' | 'uz' | 'vi' | 'cy' | 'yi';
 export interface DecodingOptions {
     language?: SpeechToTextLanguage;
package/lib/typescript/types/stt.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"stt.d.ts","sourceRoot":"","sources":["../../../src/types/stt.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,UAAU,CAAC;
+
{"version":3,"file":"stt.d.ts","sourceRoot":"","sources":["../../../src/types/stt.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,cAAc,EAAE,MAAM,UAAU,CAAC;AAG1C,MAAM,MAAM,oBAAoB,GAC5B,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,GACJ,IAAI,CAAC;AAET,MAAM,WAAW,eAAe;IAC9B,QAAQ,CAAC,EAAE,oBAAoB,CAAC;CACjC;AAED,MAAM,WAAW,uBAAuB;IACtC,cAAc,EAAE,OAAO,CAAC;IACxB,aAAa,EAAE,cAAc,CAAC;IAC9B,aAAa,EAAE,cAAc,CAAC;IAC9B,eAAe,EAAE,cAAc,CAAC;CACjC"}
package/package.json
CHANGED

package/react-native-executorch.podspec
CHANGED
@@ -75,6 +75,8 @@ Pod::Spec.new do |s|
     "common/**/*.{cpp,c,h,hpp}",
   ]

+  s.libraries = "z"
+
   # Exclude file with tests to not introduce gtest dependency.
   # Do not include the headers from common/rnexecutorch/jsi/ as source files.
   # Xcode/Cocoapods leaks them to other pods that an app also depends on, so if
package/src/modules/natural_language_processing/SpeechToTextModule.ts
CHANGED
@@ -1,84 +1,154 @@
+import { Logger } from '../../common/Logger';
 import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt';
-import {
-import { OnlineASRProcessor } from '../../utils/SpeechToTextModule/OnlineProcessor';
+import { ResourceFetcher } from '../../utils/ResourceFetcher';

 export class SpeechToTextModule {
-  private
-  private asr: ASR = new ASR();
+  private nativeModule: any;

-  private
-  private isStreaming = false;
-  private readyToProcess = false;
-  private minAudioSamples: number = 1 * 16000; // 1 second
+  private modelConfig!: SpeechToTextModelConfig;

   public async load(
     model: SpeechToTextModelConfig,
     onDownloadProgressCallback: (progress: number) => void = () => {}
   ) {
     this.modelConfig = model;
-
+
+    const tokenizerLoadPromise = ResourceFetcher.fetch(
+      undefined,
+      model.tokenizerSource
+    );
+    const encoderDecoderPromise = ResourceFetcher.fetch(
+      onDownloadProgressCallback,
+      model.encoderSource,
+      model.decoderSource
+    );
+    const [tokenizerSources, encoderDecoderResults] = await Promise.all([
+      tokenizerLoadPromise,
+      encoderDecoderPromise,
+    ]);
+    const encoderSource = encoderDecoderResults?.[0];
+    const decoderSource = encoderDecoderResults?.[1];
+    if (!encoderSource || !decoderSource || !tokenizerSources) {
+      throw new Error('Download interrupted.');
+    }
+    this.nativeModule = await global.loadSpeechToText(
+      encoderSource,
+      decoderSource,
+      tokenizerSources[0]!
+    );
   }

-  public async encode(
-
+  public async encode(
+    waveform: Float32Array | number[]
+  ): Promise<Float32Array> {
+    if (Array.isArray(waveform)) {
+      Logger.info(
+        'Passing waveform as number[] is deprecated, use Float32Array instead'
+      );
+      waveform = new Float32Array(waveform);
+    }
+    return new Float32Array(await this.nativeModule.encode(waveform));
   }

-  public async decode(
-
+  public async decode(
+    tokens: Int32Array | number[],
+    encoderOutput: Float32Array | number[]
+  ): Promise<Float32Array> {
+    if (Array.isArray(tokens)) {
+      Logger.info(
+        'Passing tokens as number[] is deprecated, use Int32Array instead'
+      );
+      tokens = new Int32Array(tokens);
+    }
+    if (Array.isArray(encoderOutput)) {
+      Logger.info(
+        'Passing encoderOutput as number[] is deprecated, use Float32Array instead'
+      );
+      encoderOutput = new Float32Array(encoderOutput);
+    }
+    return new Float32Array(
+      await this.nativeModule.decode(tokens, encoderOutput)
+    );
   }

   public async transcribe(
-    waveform: number[],
+    waveform: Float32Array | number[],
     options: DecodingOptions = {}
   ): Promise<string> {
     this.validateOptions(options);

-
-
-
-
-
-      transcription += ` ${word.word}`;
-    }
+    if (Array.isArray(waveform)) {
+      Logger.info(
+        'Passing waveform as number[] is deprecated, use Float32Array instead'
+      );
+      waveform = new Float32Array(waveform);
     }

-    return
+    return this.nativeModule.transcribe(waveform, options.language || '');
   }

-  public async *stream(
-
-
-  }
+  public async *stream(
+    options: DecodingOptions = {}
+  ): AsyncGenerator<{ committed: string; nonCommitted: string }> {
     this.validateOptions(options);
-
-
-
-
-
-
-
-    )
-
+
+    const queue: { committed: string; nonCommitted: string }[] = [];
+    let waiter: (() => void) | null = null;
+    let finished = false;
+    let error: unknown;
+
+    const wake = () => {
+      waiter?.();
+      waiter = null;
+    };
+
+    (async () => {
+      try {
+        await this.nativeModule.stream(
+          (committed: string, nonCommitted: string, isDone: boolean) => {
+            queue.push({ committed, nonCommitted });
+            if (isDone) {
+              finished = true;
+            }
+            wake();
+          },
+          options.language || ''
+        );
+        finished = true;
+        wake();
+      } catch (e) {
+        error = e;
+        finished = true;
+        wake();
+      }
+    })();
+
+    while (true) {
+      if (queue.length > 0) {
+        yield queue.shift()!;
+        if (finished && queue.length === 0) {
+          return;
+        }
         continue;
       }
-
-
-
-      yield { committed, nonCommitted };
-      this.readyToProcess = false;
+      if (error) throw error;
+      if (finished) return;
+      await new Promise<void>((r) => (waiter = r));
     }
-
-    const { committed } = await this.processor.finish();
-    yield { committed, nonCommitted: '' };
   }

-  public
-
+  public async streamInsert(waveform: Float32Array | number[]): Promise<void> {
+    if (Array.isArray(waveform)) {
+      Logger.info(
+        'Passing waveform as number[] is deprecated, use Float32Array instead'
+      );
+      waveform = new Float32Array(waveform);
+    }
+    return this.nativeModule.streamInsert(waveform);
   }

-  public
-    this.
-    this.readyToProcess = true;
+  public async streamStop(): Promise<void> {
+    return this.nativeModule.streamStop();
   }

   private validateOptions(options: DecodingOptions) {
@@ -89,10 +159,4 @@ export class SpeechToTextModule {
       throw new Error('Model is multilingual, provide a language');
     }
   }
-
-  private resetStreamState() {
-    this.isStreaming = false;
-    this.readyToProcess = false;
-    this.processor = new OnlineASRProcessor(this.asr);
-  }
 }
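A hypothetical end-to-end consumer of the rewritten module above, assuming SpeechToTextModule is exported from the package root. The model sources and the getMicrophoneChunks() audio source are placeholders, and the exact ordering contract between stream(), streamInsert(), and streamStop() is not specified by this diff.

import { SpeechToTextModule } from 'react-native-executorch';

// Placeholder audio source; in a real app this would wrap a microphone stream.
declare function getMicrophoneChunks(): AsyncIterable<Float32Array>;

export async function runStreamingTranscription() {
  const stt = new SpeechToTextModule();
  await stt.load({
    isMultilingual: false,
    encoderSource: 'https://example.com/whisper_encoder.pte', // placeholder
    decoderSource: 'https://example.com/whisper_decoder.pte', // placeholder
    tokenizerSource: 'https://example.com/tokenizer.json', // placeholder
  });

  // Feed audio chunks to the native streaming pipeline in the background...
  (async () => {
    for await (const chunk of getMicrophoneChunks()) {
      await stt.streamInsert(chunk);
    }
    await stt.streamStop();
  })();

  // ...and consume committed / non-committed text as it becomes available.
  for await (const { committed, nonCommitted } of stt.stream()) {
    console.log('committed:', committed, 'pending:', nonCommitted);
  }
}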
package/src/types/stt.ts
CHANGED
@@ -1,17 +1,5 @@
 import { ResourceSource } from './common';

-export type WordTuple = [number, number, string];
-
-export interface WordObject {
-  start: number;
-  end: number;
-  word: string;
-}
-
-export interface Segment {
-  words: WordObject[];
-}
-
 // Languages supported by whisper (not whisper.en)
 export type SpeechToTextLanguage =
   | 'af'
package/common/rnexecutorch/models/EncoderDecoderBase.cpp
DELETED
@@ -1,21 +0,0 @@
-#include <rnexecutorch/models/EncoderDecoderBase.h>
-
-namespace rnexecutorch::models {
-
-EncoderDecoderBase::EncoderDecoderBase(
-    const std::string &encoderPath, const std::string &decoderPath,
-    std::shared_ptr<react::CallInvoker> callInvoker)
-    : callInvoker(callInvoker),
-      encoder_(std::make_unique<BaseModel>(encoderPath, callInvoker)),
-      decoder_(std::make_unique<BaseModel>(decoderPath, callInvoker)) {};
-
-size_t EncoderDecoderBase::getMemoryLowerBound() const noexcept {
-  return encoder_->getMemoryLowerBound() + decoder_->getMemoryLowerBound();
-}
-
-void EncoderDecoderBase::unload() noexcept {
-  encoder_.reset(nullptr);
-  decoder_.reset(nullptr);
-}
-
-} // namespace rnexecutorch::models
package/common/rnexecutorch/models/EncoderDecoderBase.h
DELETED
@@ -1,31 +0,0 @@
-#pragma once
-
-#include <ReactCommon/CallInvoker.h>
-#include <memory>
-#include <rnexecutorch/models/BaseModel.h>
-#include <string>
-
-namespace rnexecutorch::models {
-
-using namespace facebook;
-using executorch::aten::Tensor;
-using executorch::runtime::EValue;
-
-class EncoderDecoderBase {
-public:
-  explicit EncoderDecoderBase(const std::string &encoderPath,
-                              const std::string &decoderPath,
-                              std::shared_ptr<react::CallInvoker> callInvoker);
-  size_t getMemoryLowerBound() const noexcept;
-  void unload() noexcept;
-
-protected:
-  std::shared_ptr<react::CallInvoker> callInvoker;
-  std::unique_ptr<BaseModel> encoder_;
-  std::unique_ptr<BaseModel> decoder_;
-
-private:
-  size_t memorySizeLowerBound;
-};
-
-} // namespace rnexecutorch::models
package/common/rnexecutorch/models/speech_to_text/SpeechToTextStrategy.h
DELETED
@@ -1,27 +0,0 @@
-#pragma once
-
-#include "executorch/extension/tensor/tensor_ptr.h"
-#include <rnexecutorch/host_objects/JSTensorViewOut.h>
-#include <span>
-#include <vector>
-
-namespace rnexecutorch::models::speech_to_text {
-
-using TensorPtr = ::executorch::extension::TensorPtr;
-
-class SpeechToTextStrategy {
-public:
-  virtual ~SpeechToTextStrategy() = default;
-
-  virtual TensorPtr prepareAudioInput(std::span<float> waveform) = 0;
-
-  virtual TensorPtr
-  prepareTokenInput(const std::vector<int64_t> &prevTokens) = 0;
-
-  virtual std::string getDecoderMethod() const = 0;
-
-  virtual std::shared_ptr<OwningArrayBuffer> extractOutputToken(
-      const executorch::aten::Tensor &decoderOutputTensor) const = 0;
-};
-
-} // namespace rnexecutorch::models::speech_to_text
package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.cpp
DELETED
@@ -1,50 +0,0 @@
-#include "executorch/extension/tensor/tensor_ptr.h"
-#include "rnexecutorch/data_processing/dsp.h"
-#include <rnexecutorch/models/speech_to_text/WhisperStrategy.h>
-
-namespace rnexecutorch::models::speech_to_text {
-
-using namespace ::executorch::extension;
-using namespace ::executorch::aten;
-
-TensorPtr WhisperStrategy::prepareAudioInput(std::span<float> waveform) {
-  constexpr auto fftWindowSize = 512;
-  constexpr auto stftHopLength = 160;
-  constexpr auto innerDim = 256;
-  preprocessedData =
-      dsp::stftFromWaveform(waveform, fftWindowSize, stftHopLength);
-  const auto numFrames = preprocessedData.size() / innerDim;
-  std::vector<int32_t> inputShape = {static_cast<int32_t>(numFrames), innerDim};
-  return make_tensor_ptr(std::move(inputShape), std::move(preprocessedData));
-}
-
-TensorPtr
-WhisperStrategy::prepareTokenInput(const std::vector<int64_t> &prevTokens) {
-  tokens32.clear();
-  tokens32.reserve(prevTokens.size());
-  for (auto token : prevTokens) {
-    tokens32.push_back(static_cast<int32_t>(token));
-  }
-  auto tensorSizes = {1, static_cast<int32_t>(tokens32.size())};
-  return make_tensor_ptr(std::move(tensorSizes), std::move(tokens32));
-}
-
-std::shared_ptr<OwningArrayBuffer> WhisperStrategy::extractOutputToken(
-    const executorch::aten::Tensor &decoderOutputTensor) const {
-  const auto innerDim = decoderOutputTensor.size(1);
-  const auto dictSize = decoderOutputTensor.size(2);
-  auto outputNumel = decoderOutputTensor.numel();
-  auto dataPtr =
-      static_cast<const float *>(decoderOutputTensor.const_data_ptr()) +
-      (innerDim - 1) * dictSize;
-
-  std::span<const float> modelOutput(dataPtr, outputNumel / innerDim);
-  auto createBuffer = [](const auto &data, size_t size) {
-    auto buffer = std::make_shared<OwningArrayBuffer>(size);
-    std::memcpy(buffer->data(), data, size);
-    return buffer;
-  };
-  return createBuffer(modelOutput.data(), modelOutput.size_bytes());
-}
-
-} // namespace rnexecutorch::models::speech_to_text