react-native-executorch 0.5.1-rc.0 → 0.5.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +132 -0
- package/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +4 -10
- package/common/rnexecutorch/models/speech_to_text/SpeechToText.h +1 -1
- package/common/rnexecutorch/models/speech_to_text/SpeechToTextStrategy.h +3 -2
- package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.cpp +16 -4
- package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.h +2 -2
- package/ios/RnExecutorch.xcodeproj/project.xcworkspace/contents.xcworkspacedata +7 -0
- package/ios/RnExecutorch.xcodeproj/project.xcworkspace/xcuserdata/jakubchmura.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
- package/ios/RnExecutorch.xcodeproj/xcuserdata/jakubchmura.xcuserdatad/xcschemes/xcschememanagement.plist +14 -0
- package/lib/module/constants/modelUrls.js +61 -36
- package/lib/module/constants/modelUrls.js.map +1 -1
- package/lib/module/constants/ocr/models.js +1 -1
- package/lib/module/hooks/natural_language_processing/useSpeechToText.js +71 -34
- package/lib/module/hooks/natural_language_processing/useSpeechToText.js.map +1 -1
- package/lib/module/index.js +2 -3
- package/lib/module/index.js.map +1 -1
- package/lib/module/modules/natural_language_processing/SpeechToTextModule.js +72 -31
- package/lib/module/modules/natural_language_processing/SpeechToTextModule.js.map +1 -1
- package/lib/module/types/stt.js +1 -85
- package/lib/module/types/stt.js.map +1 -1
- package/lib/module/utils/SpeechToTextModule/ASR.js +191 -0
- package/lib/module/utils/SpeechToTextModule/ASR.js.map +1 -0
- package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js +73 -0
- package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js.map +1 -0
- package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js +56 -0
- package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js.map +1 -0
- package/lib/tsconfig.tsbuildinfo +1 -0
- package/lib/typescript/constants/modelUrls.d.ts +24 -7
- package/lib/typescript/constants/modelUrls.d.ts.map +1 -1
- package/lib/typescript/constants/ocr/models.d.ts +126 -126
- package/lib/typescript/hooks/natural_language_processing/useSpeechToText.d.ts +15 -24
- package/lib/typescript/hooks/natural_language_processing/useSpeechToText.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +2 -3
- package/lib/typescript/index.d.ts.map +1 -1
- package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts +19 -22
- package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts.map +1 -1
- package/lib/typescript/types/stt.d.ts +17 -91
- package/lib/typescript/types/stt.d.ts.map +1 -1
- package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts +27 -0
- package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts.map +1 -0
- package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts +23 -0
- package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts.map +1 -0
- package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts +13 -0
- package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts.map +1 -0
- package/package.json +5 -3
- package/src/constants/modelUrls.ts +70 -37
- package/src/constants/ocr/models.ts +1 -1
- package/src/hooks/natural_language_processing/useSpeechToText.ts +87 -92
- package/src/index.ts +6 -8
- package/src/modules/natural_language_processing/SpeechToTextModule.ts +81 -69
- package/src/types/stt.ts +97 -92
- package/src/utils/SpeechToTextModule/ASR.ts +303 -0
- package/src/utils/SpeechToTextModule/OnlineProcessor.ts +87 -0
- package/src/utils/SpeechToTextModule/hypothesisBuffer.ts +79 -0
- package/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/project.xcworkspace/xcuserdata/jakubchmura.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
- package/common/rnexecutorch/models/speech_to_text/MoonshineStrategy.cpp +0 -31
- package/common/rnexecutorch/models/speech_to_text/MoonshineStrategy.h +0 -21
- package/lib/common/Logger.d.ts +0 -8
- package/lib/common/Logger.js +0 -19
- package/lib/constants/modelUrls.d.ts +0 -89
- package/lib/constants/modelUrls.js +0 -116
- package/lib/constants/sttDefaults.js +0 -66
- package/lib/controllers/LLMController.js +0 -210
- package/lib/controllers/OCRController.js +0 -65
- package/lib/controllers/SpeechToTextController.d.ts +0 -52
- package/lib/controllers/SpeechToTextController.js +0 -343
- package/lib/hooks/natural_language_processing/useSpeechToText.js +0 -44
- package/lib/index.d.ts +0 -50
- package/lib/index.js +0 -59
- package/lib/module/constants/sttDefaults.js +0 -74
- package/lib/module/constants/sttDefaults.js.map +0 -1
- package/lib/module/controllers/SpeechToTextController.js +0 -320
- package/lib/module/controllers/SpeechToTextController.js.map +0 -1
- package/lib/modules/natural_language_processing/SpeechToTextModule.d.ts +0 -14
- package/lib/modules/natural_language_processing/SpeechToTextModule.js +0 -30
- package/lib/modules/natural_language_processing/TokenizerModule.js +0 -29
- package/lib/native/RnExecutorchModules.d.ts +0 -3
- package/lib/native/RnExecutorchModules.js +0 -16
- package/lib/typescript/constants/sttDefaults.d.ts +0 -29
- package/lib/typescript/constants/sttDefaults.d.ts.map +0 -1
- package/lib/typescript/controllers/SpeechToTextController.d.ts +0 -57
- package/lib/typescript/controllers/SpeechToTextController.d.ts.map +0 -1
- package/lib/utils/ResourceFetcherUtils.js +0 -119
- package/lib/utils/llm.js +0 -72
- package/src/constants/sttDefaults.ts +0 -82
- package/src/controllers/SpeechToTextController.ts +0 -471
- package/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/project.xcworkspace/xcuserdata/norbertklockiewicz.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
- /package/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/xcuserdata/{norbertklockiewicz.xcuserdatad → jakubchmura.xcuserdatad}/xcschemes/xcschememanagement.plist +0 -0
|
@@ -1,116 +1,111 @@
|
|
|
1
|
-
import { useEffect,
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import {
|
|
5
|
-
import { AvailableModels, SpeechToTextLanguage } from '../../types/stt';
|
|
6
|
-
|
|
7
|
-
interface SpeechToTextModule {
|
|
8
|
-
isReady: boolean;
|
|
9
|
-
isGenerating: boolean;
|
|
10
|
-
sequence: string;
|
|
11
|
-
downloadProgress: number;
|
|
12
|
-
configureStreaming: SpeechToTextController['configureStreaming'];
|
|
13
|
-
error: Error | undefined;
|
|
14
|
-
transcribe: (
|
|
15
|
-
input: number[],
|
|
16
|
-
audioLanguage?: SpeechToTextLanguage
|
|
17
|
-
) => ReturnType<SpeechToTextController['transcribe']>;
|
|
18
|
-
streamingTranscribe: (
|
|
19
|
-
streamAction: STREAMING_ACTION,
|
|
20
|
-
input?: number[],
|
|
21
|
-
audioLanguage?: SpeechToTextLanguage
|
|
22
|
-
) => ReturnType<SpeechToTextController['streamingTranscribe']>;
|
|
23
|
-
}
|
|
1
|
+
import { useEffect, useCallback, useState } from 'react';
|
|
2
|
+
import { ETError, getError } from '../../Error';
|
|
3
|
+
import { SpeechToTextModule } from '../../modules/natural_language_processing/SpeechToTextModule';
|
|
4
|
+
import { SpeechToTextModelConfig } from '../../types/stt';
|
|
24
5
|
|
|
25
6
|
export const useSpeechToText = ({
|
|
26
7
|
model,
|
|
27
|
-
overlapSeconds,
|
|
28
|
-
windowSize,
|
|
29
|
-
streamingConfig,
|
|
30
8
|
preventLoad = false,
|
|
31
9
|
}: {
|
|
32
|
-
model:
|
|
33
|
-
modelName: AvailableModels;
|
|
34
|
-
encoderSource: ResourceSource;
|
|
35
|
-
decoderSource: ResourceSource;
|
|
36
|
-
tokenizerSource: ResourceSource;
|
|
37
|
-
};
|
|
38
|
-
overlapSeconds?: ConstructorParameters<
|
|
39
|
-
typeof SpeechToTextController
|
|
40
|
-
>['0']['overlapSeconds'];
|
|
41
|
-
windowSize?: ConstructorParameters<
|
|
42
|
-
typeof SpeechToTextController
|
|
43
|
-
>['0']['windowSize'];
|
|
44
|
-
streamingConfig?: ConstructorParameters<
|
|
45
|
-
typeof SpeechToTextController
|
|
46
|
-
>['0']['streamingConfig'];
|
|
10
|
+
model: SpeechToTextModelConfig;
|
|
47
11
|
preventLoad?: boolean;
|
|
48
|
-
})
|
|
49
|
-
const [
|
|
12
|
+
}) => {
|
|
13
|
+
const [error, setError] = useState<null | string>(null);
|
|
50
14
|
const [isReady, setIsReady] = useState(false);
|
|
51
|
-
const [downloadProgress, setDownloadProgress] = useState(0);
|
|
52
15
|
const [isGenerating, setIsGenerating] = useState(false);
|
|
53
|
-
const [
|
|
54
|
-
|
|
55
|
-
const controllerInstance = useMemo(
|
|
56
|
-
() =>
|
|
57
|
-
new SpeechToTextController({
|
|
58
|
-
transcribeCallback: setSequence,
|
|
59
|
-
isReadyCallback: setIsReady,
|
|
60
|
-
isGeneratingCallback: setIsGenerating,
|
|
61
|
-
onErrorCallback: setError,
|
|
62
|
-
}),
|
|
63
|
-
[]
|
|
64
|
-
);
|
|
16
|
+
const [downloadProgress, setDownloadProgress] = useState(0);
|
|
65
17
|
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
streamingConfig
|
|
71
|
-
);
|
|
72
|
-
}, [controllerInstance, overlapSeconds, windowSize, streamingConfig]);
|
|
18
|
+
const [modelInstance] = useState(() => new SpeechToTextModule());
|
|
19
|
+
const [committedTranscription, setCommittedTranscription] = useState('');
|
|
20
|
+
const [nonCommittedTranscription, setNonCommittedTranscription] =
|
|
21
|
+
useState('');
|
|
73
22
|
|
|
74
23
|
useEffect(() => {
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
24
|
+
if (preventLoad) return;
|
|
25
|
+
(async () => {
|
|
26
|
+
setDownloadProgress(0);
|
|
27
|
+
setError(null);
|
|
28
|
+
try {
|
|
29
|
+
setIsReady(false);
|
|
30
|
+
await modelInstance.load(
|
|
31
|
+
{
|
|
32
|
+
isMultilingual: model.isMultilingual,
|
|
33
|
+
encoderSource: model.encoderSource,
|
|
34
|
+
decoderSource: model.decoderSource,
|
|
35
|
+
tokenizerSource: model.tokenizerSource,
|
|
36
|
+
},
|
|
37
|
+
setDownloadProgress
|
|
38
|
+
);
|
|
39
|
+
setIsReady(true);
|
|
40
|
+
} catch (err) {
|
|
41
|
+
setError((err as Error).message);
|
|
42
|
+
}
|
|
43
|
+
})();
|
|
87
44
|
}, [
|
|
88
|
-
|
|
89
|
-
model.
|
|
45
|
+
modelInstance,
|
|
46
|
+
model.isMultilingual,
|
|
90
47
|
model.encoderSource,
|
|
91
48
|
model.decoderSource,
|
|
92
49
|
model.tokenizerSource,
|
|
93
50
|
preventLoad,
|
|
94
51
|
]);
|
|
95
52
|
|
|
53
|
+
const stateWrapper = useCallback(
|
|
54
|
+
<T extends (...args: any[]) => Promise<any>>(fn: T) =>
|
|
55
|
+
async (...args: Parameters<T>): Promise<Awaited<ReturnType<T>>> => {
|
|
56
|
+
if (!isReady) throw new Error(getError(ETError.ModuleNotLoaded));
|
|
57
|
+
if (isGenerating) throw new Error(getError(ETError.ModelGenerating));
|
|
58
|
+
setIsGenerating(true);
|
|
59
|
+
try {
|
|
60
|
+
return await fn.apply(modelInstance, args);
|
|
61
|
+
} finally {
|
|
62
|
+
setIsGenerating(false);
|
|
63
|
+
}
|
|
64
|
+
},
|
|
65
|
+
[isReady, isGenerating, modelInstance]
|
|
66
|
+
);
|
|
67
|
+
|
|
68
|
+
const stream = useCallback(async () => {
|
|
69
|
+
if (!isReady) throw new Error(getError(ETError.ModuleNotLoaded));
|
|
70
|
+
if (isGenerating) throw new Error(getError(ETError.ModelGenerating));
|
|
71
|
+
setIsGenerating(true);
|
|
72
|
+
setCommittedTranscription('');
|
|
73
|
+
setNonCommittedTranscription('');
|
|
74
|
+
let transcription = '';
|
|
75
|
+
try {
|
|
76
|
+
for await (const { committed, nonCommitted } of modelInstance.stream()) {
|
|
77
|
+
setCommittedTranscription((prev) => prev + committed);
|
|
78
|
+
setNonCommittedTranscription(nonCommitted);
|
|
79
|
+
transcription += committed;
|
|
80
|
+
}
|
|
81
|
+
} finally {
|
|
82
|
+
setIsGenerating(false);
|
|
83
|
+
}
|
|
84
|
+
return transcription;
|
|
85
|
+
}, [isReady, isGenerating, modelInstance]);
|
|
86
|
+
|
|
87
|
+
const wrapper = useCallback(
|
|
88
|
+
<T extends (...args: any[]) => any>(fn: T) => {
|
|
89
|
+
return (...args: Parameters<T>): ReturnType<T> => {
|
|
90
|
+
if (!isReady) throw new Error(getError(ETError.ModuleNotLoaded));
|
|
91
|
+
return fn.apply(modelInstance, args);
|
|
92
|
+
};
|
|
93
|
+
},
|
|
94
|
+
[isReady, modelInstance]
|
|
95
|
+
);
|
|
96
|
+
|
|
96
97
|
return {
|
|
98
|
+
error,
|
|
97
99
|
isReady,
|
|
98
100
|
isGenerating,
|
|
99
101
|
downloadProgress,
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
audioLanguage?: SpeechToTextLanguage
|
|
109
|
-
) =>
|
|
110
|
-
controllerInstance.streamingTranscribe(
|
|
111
|
-
streamAction,
|
|
112
|
-
waveform,
|
|
113
|
-
audioLanguage
|
|
114
|
-
),
|
|
102
|
+
committedTranscription,
|
|
103
|
+
nonCommittedTranscription,
|
|
104
|
+
encode: stateWrapper(SpeechToTextModule.prototype.encode),
|
|
105
|
+
decode: stateWrapper(SpeechToTextModule.prototype.decode),
|
|
106
|
+
transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe),
|
|
107
|
+
stream,
|
|
108
|
+
streamStop: wrapper(SpeechToTextModule.prototype.streamStop),
|
|
109
|
+
streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert),
|
|
115
110
|
};
|
|
116
111
|
};
|
package/src/index.ts
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import { SpeechToTextLanguage } from './types/stt';
|
|
2
|
-
|
|
3
1
|
import { ETInstallerNativeModule } from './native/RnExecutorchModules';
|
|
4
2
|
|
|
5
3
|
// eslint-disable no-var
|
|
@@ -95,14 +93,14 @@ export * from './types/objectDetection';
|
|
|
95
93
|
export * from './types/ocr';
|
|
96
94
|
export * from './types/imageSegmentation';
|
|
97
95
|
export * from './types/llm';
|
|
98
|
-
export
|
|
96
|
+
export * from './types/common';
|
|
97
|
+
export {
|
|
98
|
+
SpeechToTextLanguage,
|
|
99
|
+
SpeechToTextModelConfig,
|
|
100
|
+
DecodingOptions,
|
|
101
|
+
} from './types/stt';
|
|
99
102
|
|
|
100
103
|
// constants
|
|
101
104
|
export * from './constants/modelUrls';
|
|
102
105
|
export * from './constants/ocr/models';
|
|
103
106
|
export * from './constants/llmDefaults';
|
|
104
|
-
export {
|
|
105
|
-
STREAMING_ACTION,
|
|
106
|
-
MODES,
|
|
107
|
-
AvailableModels,
|
|
108
|
-
} from './constants/sttDefaults';
|
|
@@ -1,86 +1,98 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import {
|
|
3
|
-
import {
|
|
4
|
-
import { STREAMING_ACTION } from '../../constants/sttDefaults';
|
|
1
|
+
import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt';
|
|
2
|
+
import { ASR } from '../../utils/SpeechToTextModule/ASR';
|
|
3
|
+
import { OnlineASRProcessor } from '../../utils/SpeechToTextModule/OnlineProcessor';
|
|
5
4
|
|
|
6
5
|
export class SpeechToTextModule {
|
|
7
|
-
private
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
}: {
|
|
15
|
-
transcribeCallback?: (sequence: string) => void;
|
|
16
|
-
overlapSeconds?: ConstructorParameters<
|
|
17
|
-
typeof SpeechToTextController
|
|
18
|
-
>['0']['overlapSeconds'];
|
|
19
|
-
windowSize?: ConstructorParameters<
|
|
20
|
-
typeof SpeechToTextController
|
|
21
|
-
>['0']['windowSize'];
|
|
22
|
-
streamingConfig?: ConstructorParameters<
|
|
23
|
-
typeof SpeechToTextController
|
|
24
|
-
>['0']['streamingConfig'];
|
|
25
|
-
} = {}) {
|
|
26
|
-
this.module = new SpeechToTextController({
|
|
27
|
-
transcribeCallback: transcribeCallback || (() => {}),
|
|
28
|
-
overlapSeconds,
|
|
29
|
-
windowSize,
|
|
30
|
-
streamingConfig,
|
|
31
|
-
});
|
|
32
|
-
}
|
|
6
|
+
private modelConfig!: SpeechToTextModelConfig;
|
|
7
|
+
private asr: ASR = new ASR();
|
|
8
|
+
|
|
9
|
+
private processor: OnlineASRProcessor = new OnlineASRProcessor(this.asr);
|
|
10
|
+
private isStreaming = false;
|
|
11
|
+
private readyToProcess = false;
|
|
12
|
+
private minAudioSamples: number = 1 * 16000; // 1 second
|
|
33
13
|
|
|
34
|
-
async load(
|
|
35
|
-
model:
|
|
36
|
-
modelName: AvailableModels;
|
|
37
|
-
encoderSource?: ResourceSource;
|
|
38
|
-
decoderSource?: ResourceSource;
|
|
39
|
-
tokenizerSource?: ResourceSource;
|
|
40
|
-
},
|
|
14
|
+
public async load(
|
|
15
|
+
model: SpeechToTextModelConfig,
|
|
41
16
|
onDownloadProgressCallback: (progress: number) => void = () => {}
|
|
42
17
|
) {
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
encoderSource: model.encoderSource,
|
|
46
|
-
decoderSource: model.decoderSource,
|
|
47
|
-
tokenizerSource: model.tokenizerSource,
|
|
48
|
-
onDownloadProgressCallback,
|
|
49
|
-
});
|
|
18
|
+
this.modelConfig = model;
|
|
19
|
+
return this.asr.load(model, onDownloadProgressCallback);
|
|
50
20
|
}
|
|
51
21
|
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
windowSize: Parameters<SpeechToTextController['configureStreaming']>[1],
|
|
55
|
-
streamingConfig: Parameters<SpeechToTextController['configureStreaming']>[2]
|
|
56
|
-
) {
|
|
57
|
-
this.module.configureStreaming(overlapSeconds, windowSize, streamingConfig);
|
|
22
|
+
public async encode(waveform: Float32Array): Promise<void> {
|
|
23
|
+
return this.asr.encode(waveform);
|
|
58
24
|
}
|
|
59
25
|
|
|
60
|
-
async
|
|
61
|
-
return
|
|
26
|
+
public async decode(tokens: number[]): Promise<Float32Array> {
|
|
27
|
+
return this.asr.decode(tokens);
|
|
62
28
|
}
|
|
63
29
|
|
|
64
|
-
async
|
|
65
|
-
|
|
30
|
+
public async transcribe(
|
|
31
|
+
waveform: number[],
|
|
32
|
+
options: DecodingOptions = {}
|
|
33
|
+
): Promise<string> {
|
|
34
|
+
this.validateOptions(options);
|
|
35
|
+
|
|
36
|
+
const segments = await this.asr.transcribe(waveform, options);
|
|
37
|
+
|
|
38
|
+
let transcription = '';
|
|
39
|
+
for (const segment of segments) {
|
|
40
|
+
for (const word of segment.words) {
|
|
41
|
+
transcription += ` ${word.word}`;
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
return transcription.trim();
|
|
66
46
|
}
|
|
67
47
|
|
|
68
|
-
async
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
48
|
+
public async *stream(options: DecodingOptions = {}) {
|
|
49
|
+
if (this.isStreaming) {
|
|
50
|
+
throw new Error('Streaming is already in progress');
|
|
51
|
+
}
|
|
52
|
+
this.validateOptions(options);
|
|
53
|
+
this.resetStreamState();
|
|
54
|
+
|
|
55
|
+
this.isStreaming = true;
|
|
56
|
+
while (this.isStreaming) {
|
|
57
|
+
if (
|
|
58
|
+
!this.readyToProcess ||
|
|
59
|
+
this.processor.audioBuffer.length < this.minAudioSamples
|
|
60
|
+
) {
|
|
61
|
+
await new Promise((resolve) => setTimeout(resolve, 100));
|
|
62
|
+
continue;
|
|
63
|
+
}
|
|
64
|
+
|
|
65
|
+
const { committed, nonCommitted } =
|
|
66
|
+
await this.processor.processIter(options);
|
|
67
|
+
yield { committed, nonCommitted };
|
|
68
|
+
this.readyToProcess = false;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
const { committed } = await this.processor.finish();
|
|
72
|
+
yield { committed, nonCommitted: '' };
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
public streamStop() {
|
|
76
|
+
this.isStreaming = false;
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
public streamInsert(waveform: number[]) {
|
|
80
|
+
this.processor.insertAudioChunk(waveform);
|
|
81
|
+
this.readyToProcess = true;
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
private validateOptions(options: DecodingOptions) {
|
|
85
|
+
if (!this.modelConfig.isMultilingual && options.language) {
|
|
86
|
+
throw new Error('Model is not multilingual, cannot set language');
|
|
87
|
+
}
|
|
88
|
+
if (this.modelConfig.isMultilingual && !options.language) {
|
|
89
|
+
throw new Error('Model is multilingual, provide a language');
|
|
90
|
+
}
|
|
73
91
|
}
|
|
74
92
|
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
): ReturnType<SpeechToTextController['streamingTranscribe']> {
|
|
80
|
-
return await this.module.streamingTranscribe(
|
|
81
|
-
streamAction,
|
|
82
|
-
waveform,
|
|
83
|
-
audioLanguage
|
|
84
|
-
);
|
|
93
|
+
private resetStreamState() {
|
|
94
|
+
this.isStreaming = false;
|
|
95
|
+
this.readyToProcess = false;
|
|
96
|
+
this.processor = new OnlineASRProcessor(this.asr);
|
|
85
97
|
}
|
|
86
98
|
}
|
package/src/types/stt.ts
CHANGED
|
@@ -1,97 +1,102 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
eos: number;
|
|
10
|
-
};
|
|
11
|
-
isMultilingual: boolean;
|
|
1
|
+
import { ResourceSource } from './common';
|
|
2
|
+
|
|
3
|
+
export type WordTuple = [number, number, string];
|
|
4
|
+
|
|
5
|
+
export interface WordObject {
|
|
6
|
+
start: number;
|
|
7
|
+
end: number;
|
|
8
|
+
word: string;
|
|
12
9
|
}
|
|
13
10
|
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
Afrikaans = 'af',
|
|
17
|
-
Albanian = 'sq',
|
|
18
|
-
Arabic = 'ar',
|
|
19
|
-
Armenian = 'hy',
|
|
20
|
-
Azerbaijani = 'az',
|
|
21
|
-
Basque = 'eu',
|
|
22
|
-
Belarusian = 'be',
|
|
23
|
-
Bengali = 'bn',
|
|
24
|
-
Bosnian = 'bs',
|
|
25
|
-
Bulgarian = 'bg',
|
|
26
|
-
Burmese = 'my',
|
|
27
|
-
Catalan = 'ca',
|
|
28
|
-
Chinese = 'zh',
|
|
29
|
-
Croatian = 'hr',
|
|
30
|
-
Czech = 'cs',
|
|
31
|
-
Danish = 'da',
|
|
32
|
-
Dutch = 'nl',
|
|
33
|
-
Estonian = 'et',
|
|
34
|
-
English = 'en',
|
|
35
|
-
Finnish = 'fi',
|
|
36
|
-
French = 'fr',
|
|
37
|
-
Galician = 'gl',
|
|
38
|
-
Georgian = 'ka',
|
|
39
|
-
German = 'de',
|
|
40
|
-
Greek = 'el',
|
|
41
|
-
Gujarati = 'gu',
|
|
42
|
-
HaitianCreole = 'ht',
|
|
43
|
-
Hebrew = 'he',
|
|
44
|
-
Hindi = 'hi',
|
|
45
|
-
Hungarian = 'hu',
|
|
46
|
-
Icelandic = 'is',
|
|
47
|
-
Indonesian = 'id',
|
|
48
|
-
Italian = 'it',
|
|
49
|
-
Japanese = 'ja',
|
|
50
|
-
Kannada = 'kn',
|
|
51
|
-
Kazakh = 'kk',
|
|
52
|
-
Khmer = 'km',
|
|
53
|
-
Korean = 'ko',
|
|
54
|
-
Lao = 'lo',
|
|
55
|
-
Latvian = 'lv',
|
|
56
|
-
Lithuanian = 'lt',
|
|
57
|
-
Macedonian = 'mk',
|
|
58
|
-
Malagasy = 'mg',
|
|
59
|
-
Malay = 'ms',
|
|
60
|
-
Malayalam = 'ml',
|
|
61
|
-
Maltese = 'mt',
|
|
62
|
-
Marathi = 'mr',
|
|
63
|
-
Nepali = 'ne',
|
|
64
|
-
Norwegian = 'no',
|
|
65
|
-
Persian = 'fa',
|
|
66
|
-
Polish = 'pl',
|
|
67
|
-
Portuguese = 'pt',
|
|
68
|
-
Punjabi = 'pa',
|
|
69
|
-
Romanian = 'ro',
|
|
70
|
-
Russian = 'ru',
|
|
71
|
-
Serbian = 'sr',
|
|
72
|
-
Sinhala = 'si',
|
|
73
|
-
Slovak = 'sk',
|
|
74
|
-
Slovenian = 'sl',
|
|
75
|
-
Spanish = 'es',
|
|
76
|
-
Sundanese = 'su',
|
|
77
|
-
Swahili = 'sw',
|
|
78
|
-
Swedish = 'sv',
|
|
79
|
-
Tagalog = 'tl',
|
|
80
|
-
Tajik = 'tg',
|
|
81
|
-
Tamil = 'ta',
|
|
82
|
-
Telugu = 'te',
|
|
83
|
-
Thai = 'th',
|
|
84
|
-
Turkish = 'tr',
|
|
85
|
-
Ukrainian = 'uk',
|
|
86
|
-
Urdu = 'ur',
|
|
87
|
-
Uzbek = 'uz',
|
|
88
|
-
Vietnamese = 'vi',
|
|
89
|
-
Welsh = 'cy',
|
|
90
|
-
Yiddish = 'yi',
|
|
11
|
+
export interface Segment {
|
|
12
|
+
words: WordObject[];
|
|
91
13
|
}
|
|
92
14
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
15
|
+
// Languages supported by whisper (not whisper.en)
|
|
16
|
+
export type SpeechToTextLanguage =
|
|
17
|
+
| 'af'
|
|
18
|
+
| 'sq'
|
|
19
|
+
| 'ar'
|
|
20
|
+
| 'hy'
|
|
21
|
+
| 'az'
|
|
22
|
+
| 'eu'
|
|
23
|
+
| 'be'
|
|
24
|
+
| 'bn'
|
|
25
|
+
| 'bs'
|
|
26
|
+
| 'bg'
|
|
27
|
+
| 'my'
|
|
28
|
+
| 'ca'
|
|
29
|
+
| 'zh'
|
|
30
|
+
| 'hr'
|
|
31
|
+
| 'cs'
|
|
32
|
+
| 'da'
|
|
33
|
+
| 'nl'
|
|
34
|
+
| 'et'
|
|
35
|
+
| 'en'
|
|
36
|
+
| 'fi'
|
|
37
|
+
| 'fr'
|
|
38
|
+
| 'gl'
|
|
39
|
+
| 'ka'
|
|
40
|
+
| 'de'
|
|
41
|
+
| 'el'
|
|
42
|
+
| 'gu'
|
|
43
|
+
| 'ht'
|
|
44
|
+
| 'he'
|
|
45
|
+
| 'hi'
|
|
46
|
+
| 'hu'
|
|
47
|
+
| 'is'
|
|
48
|
+
| 'id'
|
|
49
|
+
| 'it'
|
|
50
|
+
| 'ja'
|
|
51
|
+
| 'kn'
|
|
52
|
+
| 'kk'
|
|
53
|
+
| 'km'
|
|
54
|
+
| 'ko'
|
|
55
|
+
| 'lo'
|
|
56
|
+
| 'lv'
|
|
57
|
+
| 'lt'
|
|
58
|
+
| 'mk'
|
|
59
|
+
| 'mg'
|
|
60
|
+
| 'ms'
|
|
61
|
+
| 'ml'
|
|
62
|
+
| 'mt'
|
|
63
|
+
| 'mr'
|
|
64
|
+
| 'ne'
|
|
65
|
+
| 'no'
|
|
66
|
+
| 'fa'
|
|
67
|
+
| 'pl'
|
|
68
|
+
| 'pt'
|
|
69
|
+
| 'pa'
|
|
70
|
+
| 'ro'
|
|
71
|
+
| 'ru'
|
|
72
|
+
| 'sr'
|
|
73
|
+
| 'si'
|
|
74
|
+
| 'sk'
|
|
75
|
+
| 'sl'
|
|
76
|
+
| 'es'
|
|
77
|
+
| 'su'
|
|
78
|
+
| 'sw'
|
|
79
|
+
| 'sv'
|
|
80
|
+
| 'tl'
|
|
81
|
+
| 'tg'
|
|
82
|
+
| 'ta'
|
|
83
|
+
| 'te'
|
|
84
|
+
| 'th'
|
|
85
|
+
| 'tr'
|
|
86
|
+
| 'uk'
|
|
87
|
+
| 'ur'
|
|
88
|
+
| 'uz'
|
|
89
|
+
| 'vi'
|
|
90
|
+
| 'cy'
|
|
91
|
+
| 'yi';
|
|
92
|
+
|
|
93
|
+
export interface DecodingOptions {
|
|
94
|
+
language?: SpeechToTextLanguage;
|
|
95
|
+
}
|
|
96
|
+
|
|
97
|
+
export interface SpeechToTextModelConfig {
|
|
98
|
+
isMultilingual: boolean;
|
|
99
|
+
encoderSource: ResourceSource;
|
|
100
|
+
decoderSource: ResourceSource;
|
|
101
|
+
tokenizerSource: ResourceSource;
|
|
97
102
|
}
|