react-native-executorch 0.5.1-rc.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178)
  1. package/README.md +132 -0
  2. package/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +4 -10
  3. package/common/rnexecutorch/models/speech_to_text/SpeechToText.h +1 -1
  4. package/common/rnexecutorch/models/speech_to_text/SpeechToTextStrategy.h +3 -2
  5. package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.cpp +16 -4
  6. package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.h +2 -2
  7. package/lib/Error.d.ts +30 -0
  8. package/lib/Error.js +50 -0
  9. package/lib/constants/directories.d.ts +1 -0
  10. package/lib/constants/directories.js +2 -0
  11. package/lib/constants/llmDefaults.d.ts +6 -0
  12. package/lib/constants/llmDefaults.js +16 -0
  13. package/lib/constants/modelUrls.d.ts +217 -83
  14. package/lib/constants/modelUrls.js +304 -98
  15. package/lib/constants/ocr/models.d.ts +882 -0
  16. package/lib/constants/ocr/models.js +182 -0
  17. package/lib/constants/ocr/symbols.d.ts +75 -0
  18. package/lib/constants/ocr/symbols.js +139 -0
  19. package/lib/{typescript/constants → constants}/sttDefaults.d.ts +0 -1
  20. package/lib/constants/sttDefaults.js +12 -10
  21. package/lib/controllers/LLMController.d.ts +47 -0
  22. package/lib/controllers/LLMController.js +14 -11
  23. package/lib/controllers/OCRController.d.ts +23 -0
  24. package/lib/controllers/OCRController.js +12 -5
  25. package/lib/controllers/SpeechToTextController.d.ts +8 -4
  26. package/lib/controllers/SpeechToTextController.js +15 -9
  27. package/lib/controllers/VerticalOCRController.d.ts +25 -0
  28. package/lib/controllers/VerticalOCRController.js +75 -0
  29. package/lib/hooks/computer_vision/useClassification.d.ts +15 -0
  30. package/lib/hooks/computer_vision/useClassification.js +7 -0
  31. package/lib/hooks/computer_vision/useImageEmbeddings.d.ts +15 -0
  32. package/lib/hooks/computer_vision/useImageEmbeddings.js +7 -0
  33. package/lib/hooks/computer_vision/useImageSegmentation.d.ts +38 -0
  34. package/lib/hooks/computer_vision/useImageSegmentation.js +7 -0
  35. package/lib/hooks/computer_vision/useOCR.d.ts +20 -0
  36. package/lib/hooks/computer_vision/useOCR.js +42 -0
  37. package/lib/hooks/computer_vision/useObjectDetection.d.ts +15 -0
  38. package/lib/hooks/computer_vision/useObjectDetection.js +7 -0
  39. package/lib/hooks/computer_vision/useStyleTransfer.d.ts +15 -0
  40. package/lib/hooks/computer_vision/useStyleTransfer.js +7 -0
  41. package/lib/hooks/computer_vision/useVerticalOCR.d.ts +21 -0
  42. package/lib/hooks/computer_vision/useVerticalOCR.js +45 -0
  43. package/lib/hooks/general/useExecutorchModule.d.ts +13 -0
  44. package/lib/hooks/general/useExecutorchModule.js +7 -0
  45. package/lib/hooks/natural_language_processing/useLLM.d.ts +10 -0
  46. package/lib/hooks/natural_language_processing/useLLM.js +78 -0
  47. package/lib/hooks/natural_language_processing/useSpeechToText.d.ts +27 -0
  48. package/lib/hooks/natural_language_processing/useSpeechToText.js +19 -14
  49. package/lib/hooks/natural_language_processing/useTextEmbeddings.d.ts +16 -0
  50. package/lib/hooks/natural_language_processing/useTextEmbeddings.js +7 -0
  51. package/lib/hooks/natural_language_processing/useTokenizer.d.ts +17 -0
  52. package/lib/hooks/natural_language_processing/useTokenizer.js +52 -0
  53. package/lib/hooks/useModule.d.ts +17 -0
  54. package/lib/hooks/useModule.js +45 -0
  55. package/lib/hooks/useNonStaticModule.d.ts +20 -0
  56. package/lib/hooks/useNonStaticModule.js +49 -0
  57. package/lib/index.d.ts +1 -1
  58. package/lib/index.js +3 -2
  59. package/lib/module/constants/modelUrls.js +61 -36
  60. package/lib/module/constants/modelUrls.js.map +1 -1
  61. package/lib/module/constants/ocr/models.js +1 -1
  62. package/lib/module/hooks/natural_language_processing/useSpeechToText.js +71 -34
  63. package/lib/module/hooks/natural_language_processing/useSpeechToText.js.map +1 -1
  64. package/lib/module/index.js +2 -3
  65. package/lib/module/index.js.map +1 -1
  66. package/lib/module/modules/natural_language_processing/SpeechToTextModule.js +72 -31
  67. package/lib/module/modules/natural_language_processing/SpeechToTextModule.js.map +1 -1
  68. package/lib/module/types/stt.js +1 -85
  69. package/lib/module/types/stt.js.map +1 -1
  70. package/lib/module/utils/ResourceFetcher.js +6 -8
  71. package/lib/module/utils/ResourceFetcher.js.map +1 -1
  72. package/lib/module/utils/ResourceFetcherUtils.js +20 -20
  73. package/lib/module/utils/ResourceFetcherUtils.js.map +1 -1
  74. package/lib/module/utils/SpeechToTextModule/ASR.js +191 -0
  75. package/lib/module/utils/SpeechToTextModule/ASR.js.map +1 -0
  76. package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js +73 -0
  77. package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js.map +1 -0
  78. package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js +56 -0
  79. package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js.map +1 -0
  80. package/lib/modules/BaseModule.d.ts +8 -0
  81. package/lib/modules/BaseModule.js +25 -0
  82. package/lib/modules/BaseNonStaticModule.d.ts +9 -0
  83. package/lib/modules/BaseNonStaticModule.js +14 -0
  84. package/lib/modules/computer_vision/ClassificationModule.d.ts +8 -0
  85. package/lib/modules/computer_vision/ClassificationModule.js +17 -0
  86. package/lib/modules/computer_vision/ImageEmbeddingsModule.d.ts +8 -0
  87. package/lib/modules/computer_vision/ImageEmbeddingsModule.js +17 -0
  88. package/lib/modules/computer_vision/ImageSegmentationModule.d.ts +11 -0
  89. package/lib/modules/computer_vision/ImageSegmentationModule.js +27 -0
  90. package/lib/modules/computer_vision/OCRModule.d.ts +15 -0
  91. package/lib/modules/computer_vision/OCRModule.js +20 -0
  92. package/lib/modules/computer_vision/ObjectDetectionModule.d.ts +9 -0
  93. package/lib/modules/computer_vision/ObjectDetectionModule.js +17 -0
  94. package/lib/modules/computer_vision/StyleTransferModule.d.ts +8 -0
  95. package/lib/modules/computer_vision/StyleTransferModule.js +17 -0
  96. package/lib/modules/computer_vision/VerticalOCRModule.d.ts +15 -0
  97. package/lib/modules/computer_vision/VerticalOCRModule.js +22 -0
  98. package/lib/modules/general/ExecutorchModule.d.ts +7 -0
  99. package/lib/modules/general/ExecutorchModule.js +14 -0
  100. package/lib/modules/natural_language_processing/LLMModule.d.ts +28 -0
  101. package/lib/modules/natural_language_processing/LLMModule.js +45 -0
  102. package/lib/modules/natural_language_processing/SpeechToTextModule.d.ts +18 -8
  103. package/lib/modules/natural_language_processing/SpeechToTextModule.js +21 -15
  104. package/lib/modules/natural_language_processing/TextEmbeddingsModule.d.ts +9 -0
  105. package/lib/modules/natural_language_processing/TextEmbeddingsModule.js +21 -0
  106. package/lib/modules/natural_language_processing/TokenizerModule.d.ts +12 -0
  107. package/lib/modules/natural_language_processing/TokenizerModule.js +5 -4
  108. package/lib/native/NativeETInstaller.d.ts +6 -0
  109. package/lib/native/NativeETInstaller.js +2 -0
  110. package/lib/native/NativeOCR.d.ts +8 -0
  111. package/lib/native/NativeOCR.js +2 -0
  112. package/lib/native/NativeVerticalOCR.d.ts +8 -0
  113. package/lib/native/NativeVerticalOCR.js +2 -0
  114. package/lib/types/common.d.ts +31 -0
  115. package/lib/types/common.js +25 -0
  116. package/lib/types/imageSegmentation.d.ts +24 -0
  117. package/lib/types/imageSegmentation.js +26 -0
  118. package/lib/types/llm.d.ts +46 -0
  119. package/lib/types/llm.js +9 -0
  120. package/lib/types/objectDetection.d.ts +104 -0
  121. package/lib/types/objectDetection.js +94 -0
  122. package/lib/types/ocr.d.ts +11 -0
  123. package/lib/types/ocr.js +1 -0
  124. package/lib/types/stt.d.ts +94 -0
  125. package/lib/types/stt.js +85 -0
  126. package/lib/typescript/constants/modelUrls.d.ts +24 -7
  127. package/lib/typescript/constants/modelUrls.d.ts.map +1 -1
  128. package/lib/typescript/constants/ocr/models.d.ts +126 -126
  129. package/lib/typescript/hooks/natural_language_processing/useSpeechToText.d.ts +15 -24
  130. package/lib/typescript/hooks/natural_language_processing/useSpeechToText.d.ts.map +1 -1
  131. package/lib/typescript/index.d.ts +2 -3
  132. package/lib/typescript/index.d.ts.map +1 -1
  133. package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts +19 -22
  134. package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts.map +1 -1
  135. package/lib/typescript/types/stt.d.ts +17 -91
  136. package/lib/typescript/types/stt.d.ts.map +1 -1
  137. package/lib/typescript/utils/ResourceFetcher.d.ts.map +1 -1
  138. package/lib/typescript/utils/ResourceFetcherUtils.d.ts.map +1 -1
  139. package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts +27 -0
  140. package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts.map +1 -0
  141. package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts +23 -0
  142. package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts.map +1 -0
  143. package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts +13 -0
  144. package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts.map +1 -0
  145. package/lib/utils/ResourceFetcher.d.ts +24 -0
  146. package/lib/utils/ResourceFetcher.js +305 -0
  147. package/lib/utils/ResourceFetcherUtils.d.ts +54 -0
  148. package/lib/utils/ResourceFetcherUtils.js +9 -0
  149. package/lib/utils/llm.d.ts +6 -0
  150. package/lib/utils/llm.js +1 -0
  151. package/lib/utils/stt.d.ts +1 -0
  152. package/lib/utils/stt.js +21 -0
  153. package/package.json +5 -3
  154. package/src/constants/modelUrls.ts +70 -37
  155. package/src/constants/ocr/models.ts +1 -1
  156. package/src/hooks/natural_language_processing/useSpeechToText.ts +87 -92
  157. package/src/index.ts +6 -8
  158. package/src/modules/natural_language_processing/SpeechToTextModule.ts +81 -69
  159. package/src/types/stt.ts +97 -92
  160. package/src/utils/ResourceFetcher.ts +9 -7
  161. package/src/utils/ResourceFetcherUtils.ts +15 -17
  162. package/src/utils/SpeechToTextModule/ASR.ts +303 -0
  163. package/src/utils/SpeechToTextModule/OnlineProcessor.ts +87 -0
  164. package/src/utils/SpeechToTextModule/hypothesisBuffer.ts +79 -0
  165. package/common/rnexecutorch/models/speech_to_text/MoonshineStrategy.cpp +0 -31
  166. package/common/rnexecutorch/models/speech_to_text/MoonshineStrategy.h +0 -21
  167. package/lib/module/constants/sttDefaults.js +0 -74
  168. package/lib/module/constants/sttDefaults.js.map +0 -1
  169. package/lib/module/controllers/SpeechToTextController.js +0 -320
  170. package/lib/module/controllers/SpeechToTextController.js.map +0 -1
  171. package/lib/typescript/constants/sttDefaults.d.ts.map +0 -1
  172. package/lib/typescript/controllers/SpeechToTextController.d.ts +0 -57
  173. package/lib/typescript/controllers/SpeechToTextController.d.ts.map +0 -1
  174. package/src/constants/sttDefaults.ts +0 -82
  175. package/src/controllers/SpeechToTextController.ts +0 -471
  176. package/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/project.xcworkspace/contents.xcworkspacedata +0 -7
  177. package/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/project.xcworkspace/xcuserdata/norbertklockiewicz.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
  178. package/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/xcuserdata/norbertklockiewicz.xcuserdatad/xcschemes/xcschememanagement.plist +0 -14
package/src/hooks/natural_language_processing/useSpeechToText.ts CHANGED
@@ -1,116 +1,111 @@
- import { useEffect, useMemo, useState } from 'react';
- import { SpeechToTextController } from '../../controllers/SpeechToTextController';
- import { ResourceSource } from '../../types/common';
- import { STREAMING_ACTION } from '../../constants/sttDefaults';
- import { AvailableModels, SpeechToTextLanguage } from '../../types/stt';
-
- interface SpeechToTextModule {
- isReady: boolean;
- isGenerating: boolean;
- sequence: string;
- downloadProgress: number;
- configureStreaming: SpeechToTextController['configureStreaming'];
- error: Error | undefined;
- transcribe: (
- input: number[],
- audioLanguage?: SpeechToTextLanguage
- ) => ReturnType<SpeechToTextController['transcribe']>;
- streamingTranscribe: (
- streamAction: STREAMING_ACTION,
- input?: number[],
- audioLanguage?: SpeechToTextLanguage
- ) => ReturnType<SpeechToTextController['streamingTranscribe']>;
- }
+ import { useEffect, useCallback, useState } from 'react';
+ import { ETError, getError } from '../../Error';
+ import { SpeechToTextModule } from '../../modules/natural_language_processing/SpeechToTextModule';
+ import { SpeechToTextModelConfig } from '../../types/stt';
 
  export const useSpeechToText = ({
  model,
- overlapSeconds,
- windowSize,
- streamingConfig,
  preventLoad = false,
  }: {
- model: {
- modelName: AvailableModels;
- encoderSource: ResourceSource;
- decoderSource: ResourceSource;
- tokenizerSource: ResourceSource;
- };
- overlapSeconds?: ConstructorParameters<
- typeof SpeechToTextController
- >['0']['overlapSeconds'];
- windowSize?: ConstructorParameters<
- typeof SpeechToTextController
- >['0']['windowSize'];
- streamingConfig?: ConstructorParameters<
- typeof SpeechToTextController
- >['0']['streamingConfig'];
+ model: SpeechToTextModelConfig;
  preventLoad?: boolean;
- }): SpeechToTextModule => {
- const [sequence, setSequence] = useState<string>('');
+ }) => {
+ const [error, setError] = useState<null | string>(null);
  const [isReady, setIsReady] = useState(false);
- const [downloadProgress, setDownloadProgress] = useState(0);
  const [isGenerating, setIsGenerating] = useState(false);
- const [error, setError] = useState<Error | undefined>();
-
- const controllerInstance = useMemo(
- () =>
- new SpeechToTextController({
- transcribeCallback: setSequence,
- isReadyCallback: setIsReady,
- isGeneratingCallback: setIsGenerating,
- onErrorCallback: setError,
- }),
- []
- );
+ const [downloadProgress, setDownloadProgress] = useState(0);
 
- useEffect(() => {
- controllerInstance.configureStreaming(
- overlapSeconds,
- windowSize,
- streamingConfig
- );
- }, [controllerInstance, overlapSeconds, windowSize, streamingConfig]);
+ const [modelInstance] = useState(() => new SpeechToTextModule());
+ const [committedTranscription, setCommittedTranscription] = useState('');
+ const [nonCommittedTranscription, setNonCommittedTranscription] =
+ useState('');
 
  useEffect(() => {
- const loadModel = async () => {
- await controllerInstance.load({
- modelName: model.modelName,
- encoderSource: model.encoderSource,
- decoderSource: model.decoderSource,
- tokenizerSource: model.tokenizerSource,
- onDownloadProgressCallback: setDownloadProgress,
- });
- };
- if (!preventLoad) {
- loadModel();
- }
+ if (preventLoad) return;
+ (async () => {
+ setDownloadProgress(0);
+ setError(null);
+ try {
+ setIsReady(false);
+ await modelInstance.load(
+ {
+ isMultilingual: model.isMultilingual,
+ encoderSource: model.encoderSource,
+ decoderSource: model.decoderSource,
+ tokenizerSource: model.tokenizerSource,
+ },
+ setDownloadProgress
+ );
+ setIsReady(true);
+ } catch (err) {
+ setError((err as Error).message);
+ }
+ })();
  }, [
- controllerInstance,
- model.modelName,
+ modelInstance,
+ model.isMultilingual,
  model.encoderSource,
  model.decoderSource,
  model.tokenizerSource,
  preventLoad,
  ]);
 
+ const stateWrapper = useCallback(
+ <T extends (...args: any[]) => Promise<any>>(fn: T) =>
+ async (...args: Parameters<T>): Promise<Awaited<ReturnType<T>>> => {
+ if (!isReady) throw new Error(getError(ETError.ModuleNotLoaded));
+ if (isGenerating) throw new Error(getError(ETError.ModelGenerating));
+ setIsGenerating(true);
+ try {
+ return await fn.apply(modelInstance, args);
+ } finally {
+ setIsGenerating(false);
+ }
+ },
+ [isReady, isGenerating, modelInstance]
+ );
+
+ const stream = useCallback(async () => {
+ if (!isReady) throw new Error(getError(ETError.ModuleNotLoaded));
+ if (isGenerating) throw new Error(getError(ETError.ModelGenerating));
+ setIsGenerating(true);
+ setCommittedTranscription('');
+ setNonCommittedTranscription('');
+ let transcription = '';
+ try {
+ for await (const { committed, nonCommitted } of modelInstance.stream()) {
+ setCommittedTranscription((prev) => prev + committed);
+ setNonCommittedTranscription(nonCommitted);
+ transcription += committed;
+ }
+ } finally {
+ setIsGenerating(false);
+ }
+ return transcription;
+ }, [isReady, isGenerating, modelInstance]);
+
+ const wrapper = useCallback(
+ <T extends (...args: any[]) => any>(fn: T) => {
+ return (...args: Parameters<T>): ReturnType<T> => {
+ if (!isReady) throw new Error(getError(ETError.ModuleNotLoaded));
+ return fn.apply(modelInstance, args);
+ };
+ },
+ [isReady, modelInstance]
+ );
+
  return {
+ error,
  isReady,
  isGenerating,
  downloadProgress,
- configureStreaming: controllerInstance.configureStreaming,
- sequence,
- error,
- transcribe: (waveform: number[], audioLanguage?: SpeechToTextLanguage) =>
- controllerInstance.transcribe(waveform, audioLanguage),
- streamingTranscribe: (
- streamAction: STREAMING_ACTION,
- waveform?: number[],
- audioLanguage?: SpeechToTextLanguage
- ) =>
- controllerInstance.streamingTranscribe(
- streamAction,
- waveform,
- audioLanguage
- ),
+ committedTranscription,
+ nonCommittedTranscription,
+ encode: stateWrapper(SpeechToTextModule.prototype.encode),
+ decode: stateWrapper(SpeechToTextModule.prototype.decode),
+ transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe),
+ stream,
+ streamStop: wrapper(SpeechToTextModule.prototype.streamStop),
+ streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert),
  };
  };
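The reworked hook drops `configureStreaming`, `sequence`, and `streamingTranscribe` in favor of `stream`, `streamInsert`, and `streamStop`, and splits the live transcription into `committedTranscription` and `nonCommittedTranscription`. Below is a minimal usage sketch of the new surface, assuming the hook is re-exported from the package root as before; the model URLs and the audio-chunk callback are placeholders, not part of this diff.

```tsx
import React from 'react';
import { Button, Text, View } from 'react-native';
import { useSpeechToText } from 'react-native-executorch';

// Placeholder artifact URLs; the real model sources are not part of this diff.
const ENCODER = 'https://example.com/whisper_tiny_en_encoder.pte';
const DECODER = 'https://example.com/whisper_tiny_en_decoder.pte';
const TOKENIZER = 'https://example.com/whisper_tokenizer.json';

export function TranscriptionScreen({
  nextAudioChunk, // hypothetical recorder callback returning 16 kHz samples
}: {
  nextAudioChunk: () => number[];
}) {
  const stt = useSpeechToText({
    model: {
      isMultilingual: false,
      encoderSource: ENCODER,
      decoderSource: DECODER,
      tokenizerSource: TOKENIZER,
    },
  });

  const handleStart = async () => {
    // Keep feeding audio while stream() runs; stream() resolves after streamStop().
    const feeder = setInterval(() => stt.streamInsert(nextAudioChunk()), 500);
    try {
      console.log(await stt.stream());
    } finally {
      clearInterval(feeder);
    }
  };

  return (
    <View>
      <Button title="Start" onPress={handleStart} disabled={!stt.isReady} />
      <Button title="Stop" onPress={stt.streamStop} disabled={!stt.isReady} />
      <Text>
        {stt.committedTranscription} {stt.nonCommittedTranscription}
      </Text>
    </View>
  );
}
```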
package/src/index.ts CHANGED
@@ -1,5 +1,3 @@
- import { SpeechToTextLanguage } from './types/stt';
-
  import { ETInstallerNativeModule } from './native/RnExecutorchModules';
 
  // eslint-disable no-var
@@ -95,14 +93,14 @@ export * from './types/objectDetection';
  export * from './types/ocr';
  export * from './types/imageSegmentation';
  export * from './types/llm';
- export { SpeechToTextLanguage };
+ export * from './types/common';
+ export {
+ SpeechToTextLanguage,
+ SpeechToTextModelConfig,
+ DecodingOptions,
+ } from './types/stt';
 
  // constants
  export * from './constants/modelUrls';
  export * from './constants/ocr/models';
  export * from './constants/llmDefaults';
- export {
- STREAMING_ACTION,
- MODES,
- AvailableModels,
- } from './constants/sttDefaults';
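For consumers, the root export surface changes accordingly: `STREAMING_ACTION`, `MODES`, and `AvailableModels` are no longer re-exported, and the speech-to-text types now come from `./types/stt`. A sketch of the updated imports, assuming you import from the package root:

```ts
// Updated consumer-side imports; the old STREAMING_ACTION, MODES and
// AvailableModels exports from constants/sttDefaults are gone.
// SpeechToTextModelConfig is exported from the same place.
import type {
  DecodingOptions,
  SpeechToTextLanguage,
} from 'react-native-executorch';

// SpeechToTextLanguage is now a string union rather than an enum member.
export const language: SpeechToTextLanguage = 'en';
export const options: DecodingOptions = { language };
```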
package/src/modules/natural_language_processing/SpeechToTextModule.ts CHANGED
@@ -1,86 +1,98 @@
- import { ResourceSource } from '../../types/common';
- import { SpeechToTextController } from '../../controllers/SpeechToTextController';
- import { AvailableModels, SpeechToTextLanguage } from '../../types/stt';
- import { STREAMING_ACTION } from '../../constants/sttDefaults';
+ import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt';
+ import { ASR } from '../../utils/SpeechToTextModule/ASR';
+ import { OnlineASRProcessor } from '../../utils/SpeechToTextModule/OnlineProcessor';
 
  export class SpeechToTextModule {
- private module: SpeechToTextController;
-
- constructor({
- transcribeCallback,
- overlapSeconds,
- windowSize,
- streamingConfig,
- }: {
- transcribeCallback?: (sequence: string) => void;
- overlapSeconds?: ConstructorParameters<
- typeof SpeechToTextController
- >['0']['overlapSeconds'];
- windowSize?: ConstructorParameters<
- typeof SpeechToTextController
- >['0']['windowSize'];
- streamingConfig?: ConstructorParameters<
- typeof SpeechToTextController
- >['0']['streamingConfig'];
- } = {}) {
- this.module = new SpeechToTextController({
- transcribeCallback: transcribeCallback || (() => {}),
- overlapSeconds,
- windowSize,
- streamingConfig,
- });
- }
+ private modelConfig!: SpeechToTextModelConfig;
+ private asr: ASR = new ASR();
+
+ private processor: OnlineASRProcessor = new OnlineASRProcessor(this.asr);
+ private isStreaming = false;
+ private readyToProcess = false;
+ private minAudioSamples: number = 1 * 16000; // 1 second
 
- async load(
- model: {
- modelName: AvailableModels;
- encoderSource?: ResourceSource;
- decoderSource?: ResourceSource;
- tokenizerSource?: ResourceSource;
- },
+ public async load(
+ model: SpeechToTextModelConfig,
  onDownloadProgressCallback: (progress: number) => void = () => {}
  ) {
- await this.module.load({
- modelName: model.modelName,
- encoderSource: model.encoderSource,
- decoderSource: model.decoderSource,
- tokenizerSource: model.tokenizerSource,
- onDownloadProgressCallback,
- });
+ this.modelConfig = model;
+ return this.asr.load(model, onDownloadProgressCallback);
  }
 
- configureStreaming(
- overlapSeconds: Parameters<SpeechToTextController['configureStreaming']>[0],
- windowSize: Parameters<SpeechToTextController['configureStreaming']>[1],
- streamingConfig: Parameters<SpeechToTextController['configureStreaming']>[2]
- ) {
- this.module.configureStreaming(overlapSeconds, windowSize, streamingConfig);
+ public async encode(waveform: Float32Array): Promise<void> {
+ return this.asr.encode(waveform);
  }
 
- async encode(waveform: Float32Array) {
- return await this.module.encode(waveform);
+ public async decode(tokens: number[]): Promise<Float32Array> {
+ return this.asr.decode(tokens);
  }
 
- async decode(seq: number[]) {
- return await this.module.decode(seq);
+ public async transcribe(
+ waveform: number[],
+ options: DecodingOptions = {}
+ ): Promise<string> {
+ this.validateOptions(options);
+
+ const segments = await this.asr.transcribe(waveform, options);
+
+ let transcription = '';
+ for (const segment of segments) {
+ for (const word of segment.words) {
+ transcription += ` ${word.word}`;
+ }
+ }
+
+ return transcription.trim();
  }
 
- async transcribe(
- waveform: number[],
- audioLanguage?: SpeechToTextLanguage
- ): ReturnType<SpeechToTextController['transcribe']> {
- return await this.module.transcribe(waveform, audioLanguage);
+ public async *stream(options: DecodingOptions = {}) {
+ if (this.isStreaming) {
+ throw new Error('Streaming is already in progress');
+ }
+ this.validateOptions(options);
+ this.resetStreamState();
+
+ this.isStreaming = true;
+ while (this.isStreaming) {
+ if (
+ !this.readyToProcess ||
+ this.processor.audioBuffer.length < this.minAudioSamples
+ ) {
+ await new Promise((resolve) => setTimeout(resolve, 100));
+ continue;
+ }
+
+ const { committed, nonCommitted } =
+ await this.processor.processIter(options);
+ yield { committed, nonCommitted };
+ this.readyToProcess = false;
+ }
+
+ const { committed } = await this.processor.finish();
+ yield { committed, nonCommitted: '' };
+ }
+
+ public streamStop() {
+ this.isStreaming = false;
+ }
+
+ public streamInsert(waveform: number[]) {
+ this.processor.insertAudioChunk(waveform);
+ this.readyToProcess = true;
+ }
+
+ private validateOptions(options: DecodingOptions) {
+ if (!this.modelConfig.isMultilingual && options.language) {
+ throw new Error('Model is not multilingual, cannot set language');
+ }
+ if (this.modelConfig.isMultilingual && !options.language) {
+ throw new Error('Model is multilingual, provide a language');
+ }
  }
 
- async streamingTranscribe(
- streamAction: STREAMING_ACTION,
- waveform?: number[],
- audioLanguage?: SpeechToTextLanguage
- ): ReturnType<SpeechToTextController['streamingTranscribe']> {
- return await this.module.streamingTranscribe(
- streamAction,
- waveform,
- audioLanguage
- );
+ private resetStreamState() {
+ this.isStreaming = false;
+ this.readyToProcess = false;
+ this.processor = new OnlineASRProcessor(this.asr);
  }
  }
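Outside of React, the rewritten class can be driven directly: `stream()` is now an async generator that is fed by `streamInsert()` and terminated by `streamStop()`, and multilingual models must be given a `language`. A minimal sketch, assuming `SpeechToTextModule` is importable from the package root and that the model sources are plain URLs (placeholders below):

```ts
import { SpeechToTextModule } from 'react-native-executorch';

export async function transcribeLive(chunks: AsyncIterable<number[]>) {
  const stt = new SpeechToTextModule();
  await stt.load({
    isMultilingual: true,
    encoderSource: 'https://example.com/whisper_encoder.pte', // placeholder
    decoderSource: 'https://example.com/whisper_decoder.pte', // placeholder
    tokenizerSource: 'https://example.com/tokenizer.json', // placeholder
  });

  // Feed audio in the background; stop streaming once the source runs dry.
  (async () => {
    for await (const chunk of chunks) stt.streamInsert(chunk);
    stt.streamStop();
  })();

  // validateOptions requires a language for multilingual models.
  let text = '';
  for await (const { committed, nonCommitted } of stt.stream({ language: 'de' })) {
    console.log('pending:', nonCommitted);
    text += committed;
  }
  return text;
}
```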
package/src/types/stt.ts CHANGED
@@ -1,97 +1,102 @@
- export interface ModelConfig {
- sources: {
- encoder: string;
- decoder: string;
- };
- tokenizer: {
- source: string;
- bos: number;
- eos: number;
- };
- isMultilingual: boolean;
+ import { ResourceSource } from './common';
+
+ export type WordTuple = [number, number, string];
+
+ export interface WordObject {
+ start: number;
+ end: number;
+ word: string;
  }
 
- // Those languages are supported just by whisper multilingual
- export enum SpeechToTextLanguage {
- Afrikaans = 'af',
- Albanian = 'sq',
- Arabic = 'ar',
- Armenian = 'hy',
- Azerbaijani = 'az',
- Basque = 'eu',
- Belarusian = 'be',
- Bengali = 'bn',
- Bosnian = 'bs',
- Bulgarian = 'bg',
- Burmese = 'my',
- Catalan = 'ca',
- Chinese = 'zh',
- Croatian = 'hr',
- Czech = 'cs',
- Danish = 'da',
- Dutch = 'nl',
- Estonian = 'et',
- English = 'en',
- Finnish = 'fi',
- French = 'fr',
- Galician = 'gl',
- Georgian = 'ka',
- German = 'de',
- Greek = 'el',
- Gujarati = 'gu',
- HaitianCreole = 'ht',
- Hebrew = 'he',
- Hindi = 'hi',
- Hungarian = 'hu',
- Icelandic = 'is',
- Indonesian = 'id',
- Italian = 'it',
- Japanese = 'ja',
- Kannada = 'kn',
- Kazakh = 'kk',
- Khmer = 'km',
- Korean = 'ko',
- Lao = 'lo',
- Latvian = 'lv',
- Lithuanian = 'lt',
- Macedonian = 'mk',
- Malagasy = 'mg',
- Malay = 'ms',
- Malayalam = 'ml',
- Maltese = 'mt',
- Marathi = 'mr',
- Nepali = 'ne',
- Norwegian = 'no',
- Persian = 'fa',
- Polish = 'pl',
- Portuguese = 'pt',
- Punjabi = 'pa',
- Romanian = 'ro',
- Russian = 'ru',
- Serbian = 'sr',
- Sinhala = 'si',
- Slovak = 'sk',
- Slovenian = 'sl',
- Spanish = 'es',
- Sundanese = 'su',
- Swahili = 'sw',
- Swedish = 'sv',
- Tagalog = 'tl',
- Tajik = 'tg',
- Tamil = 'ta',
- Telugu = 'te',
- Thai = 'th',
- Turkish = 'tr',
- Ukrainian = 'uk',
- Urdu = 'ur',
- Uzbek = 'uz',
- Vietnamese = 'vi',
- Welsh = 'cy',
- Yiddish = 'yi',
+ export interface Segment {
+ words: WordObject[];
  }
 
- export enum AvailableModels {
- WHISPER = 'whisper',
- MOONSHINE = 'moonshine',
- WHISPER_MULTILINGUAL = 'whisperMultilingual',
+ // Languages supported by whisper (not whisper.en)
+ export type SpeechToTextLanguage =
+ | 'af'
+ | 'sq'
+ | 'ar'
+ | 'hy'
+ | 'az'
+ | 'eu'
+ | 'be'
+ | 'bn'
+ | 'bs'
+ | 'bg'
+ | 'my'
+ | 'ca'
+ | 'zh'
+ | 'hr'
+ | 'cs'
+ | 'da'
+ | 'nl'
+ | 'et'
+ | 'en'
+ | 'fi'
+ | 'fr'
+ | 'gl'
+ | 'ka'
+ | 'de'
+ | 'el'
+ | 'gu'
+ | 'ht'
+ | 'he'
+ | 'hi'
+ | 'hu'
+ | 'is'
+ | 'id'
+ | 'it'
+ | 'ja'
+ | 'kn'
+ | 'kk'
+ | 'km'
+ | 'ko'
+ | 'lo'
+ | 'lv'
+ | 'lt'
+ | 'mk'
+ | 'mg'
+ | 'ms'
+ | 'ml'
+ | 'mt'
+ | 'mr'
+ | 'ne'
+ | 'no'
+ | 'fa'
+ | 'pl'
+ | 'pt'
+ | 'pa'
+ | 'ro'
+ | 'ru'
+ | 'sr'
+ | 'si'
+ | 'sk'
+ | 'sl'
+ | 'es'
+ | 'su'
+ | 'sw'
+ | 'sv'
+ | 'tl'
+ | 'tg'
+ | 'ta'
+ | 'te'
+ | 'th'
+ | 'tr'
+ | 'uk'
+ | 'ur'
+ | 'uz'
+ | 'vi'
+ | 'cy'
+ | 'yi';
+
+ export interface DecodingOptions {
+ language?: SpeechToTextLanguage;
+ }
+
+ export interface SpeechToTextModelConfig {
+ isMultilingual: boolean;
+ encoderSource: ResourceSource;
+ decoderSource: ResourceSource;
+ tokenizerSource: ResourceSource;
  }
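With the enum-based `SpeechToTextLanguage` and `AvailableModels` gone, model selection is expressed through an explicit `isMultilingual` flag and plain string language codes. A sketch of the new shapes (the URLs are placeholders, not real artifacts):

```ts
import type {
  DecodingOptions,
  SpeechToTextModelConfig,
} from 'react-native-executorch';

// Any ResourceSource works for the sources; hosted URLs are used here as placeholders.
export const whisperMultilingual: SpeechToTextModelConfig = {
  isMultilingual: true,
  encoderSource: 'https://example.com/whisper_encoder.pte',
  decoderSource: 'https://example.com/whisper_decoder.pte',
  tokenizerSource: 'https://example.com/tokenizer.json',
};

// 'pl' replaces the old SpeechToTextLanguage.Polish enum member.
export const options: DecodingOptions = { language: 'pl' };
```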
package/src/utils/ResourceFetcher.ts CHANGED
@@ -29,6 +29,7 @@
  */
  import {
  cacheDirectory,
+ copyAsync,
  createDownloadResumable,
  moveAsync,
  FileSystemSessionType,
@@ -38,6 +39,7 @@ import {
  readDirectoryAsync,
  } from 'expo-file-system';
  import { Asset } from 'expo-asset';
+ import { Platform } from 'react-native';
  import { RNEDirectory } from '../constants/directories';
  import { ResourceSource } from '../types/common';
  import {
@@ -61,7 +63,6 @@ export class ResourceFetcher {
  }
  const { results: info, totalLength } =
  await ResourceFetcherUtils.getFilesSizes(sources);
-
  const head: ResourceSourceExtended = {
  source: info[0]!.source,
  sourceType: info[0]!.type,
@@ -316,16 +317,17 @@ export class ResourceFetcher {
  const uri = asset.uri;
  const filename = ResourceFetcherUtils.getFilenameFromUri(uri);
  const fileUri = `${RNEDirectory}${filename}`;
- const fileUriWithType = `${fileUri}.${asset.type}`;
+ // On Android, file uri does not contain file extension, so we add it manually
+ const fileUriWithType =
+ Platform.OS === 'android' ? `${fileUri}.${asset.type}` : fileUri;
  if (await ResourceFetcherUtils.checkFileExists(fileUri)) {
  return ResourceFetcherUtils.removeFilePrefix(fileUri);
  }
  await ResourceFetcherUtils.createDirectoryIfNoExists();
- await asset.downloadAsync();
- if (!asset.localUri) {
- throw new Error(`Asset local URI is not available for ${source}`);
- }
- await moveAsync({ from: asset.localUri, to: fileUriWithType });
+ await copyAsync({
+ from: asset.uri,
+ to: fileUriWithType,
+ });
  return ResourceFetcherUtils.removeFilePrefix(fileUriWithType);
  }