react-native-executorch 0.5.1-rc.0 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (178) hide show
  1. package/README.md +132 -0
  2. package/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +4 -10
  3. package/common/rnexecutorch/models/speech_to_text/SpeechToText.h +1 -1
  4. package/common/rnexecutorch/models/speech_to_text/SpeechToTextStrategy.h +3 -2
  5. package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.cpp +16 -4
  6. package/common/rnexecutorch/models/speech_to_text/WhisperStrategy.h +2 -2
  7. package/lib/Error.d.ts +30 -0
  8. package/lib/Error.js +50 -0
  9. package/lib/constants/directories.d.ts +1 -0
  10. package/lib/constants/directories.js +2 -0
  11. package/lib/constants/llmDefaults.d.ts +6 -0
  12. package/lib/constants/llmDefaults.js +16 -0
  13. package/lib/constants/modelUrls.d.ts +217 -83
  14. package/lib/constants/modelUrls.js +304 -98
  15. package/lib/constants/ocr/models.d.ts +882 -0
  16. package/lib/constants/ocr/models.js +182 -0
  17. package/lib/constants/ocr/symbols.d.ts +75 -0
  18. package/lib/constants/ocr/symbols.js +139 -0
  19. package/lib/{typescript/constants → constants}/sttDefaults.d.ts +0 -1
  20. package/lib/constants/sttDefaults.js +12 -10
  21. package/lib/controllers/LLMController.d.ts +47 -0
  22. package/lib/controllers/LLMController.js +14 -11
  23. package/lib/controllers/OCRController.d.ts +23 -0
  24. package/lib/controllers/OCRController.js +12 -5
  25. package/lib/controllers/SpeechToTextController.d.ts +8 -4
  26. package/lib/controllers/SpeechToTextController.js +15 -9
  27. package/lib/controllers/VerticalOCRController.d.ts +25 -0
  28. package/lib/controllers/VerticalOCRController.js +75 -0
  29. package/lib/hooks/computer_vision/useClassification.d.ts +15 -0
  30. package/lib/hooks/computer_vision/useClassification.js +7 -0
  31. package/lib/hooks/computer_vision/useImageEmbeddings.d.ts +15 -0
  32. package/lib/hooks/computer_vision/useImageEmbeddings.js +7 -0
  33. package/lib/hooks/computer_vision/useImageSegmentation.d.ts +38 -0
  34. package/lib/hooks/computer_vision/useImageSegmentation.js +7 -0
  35. package/lib/hooks/computer_vision/useOCR.d.ts +20 -0
  36. package/lib/hooks/computer_vision/useOCR.js +42 -0
  37. package/lib/hooks/computer_vision/useObjectDetection.d.ts +15 -0
  38. package/lib/hooks/computer_vision/useObjectDetection.js +7 -0
  39. package/lib/hooks/computer_vision/useStyleTransfer.d.ts +15 -0
  40. package/lib/hooks/computer_vision/useStyleTransfer.js +7 -0
  41. package/lib/hooks/computer_vision/useVerticalOCR.d.ts +21 -0
  42. package/lib/hooks/computer_vision/useVerticalOCR.js +45 -0
  43. package/lib/hooks/general/useExecutorchModule.d.ts +13 -0
  44. package/lib/hooks/general/useExecutorchModule.js +7 -0
  45. package/lib/hooks/natural_language_processing/useLLM.d.ts +10 -0
  46. package/lib/hooks/natural_language_processing/useLLM.js +78 -0
  47. package/lib/hooks/natural_language_processing/useSpeechToText.d.ts +27 -0
  48. package/lib/hooks/natural_language_processing/useSpeechToText.js +19 -14
  49. package/lib/hooks/natural_language_processing/useTextEmbeddings.d.ts +16 -0
  50. package/lib/hooks/natural_language_processing/useTextEmbeddings.js +7 -0
  51. package/lib/hooks/natural_language_processing/useTokenizer.d.ts +17 -0
  52. package/lib/hooks/natural_language_processing/useTokenizer.js +52 -0
  53. package/lib/hooks/useModule.d.ts +17 -0
  54. package/lib/hooks/useModule.js +45 -0
  55. package/lib/hooks/useNonStaticModule.d.ts +20 -0
  56. package/lib/hooks/useNonStaticModule.js +49 -0
  57. package/lib/index.d.ts +1 -1
  58. package/lib/index.js +3 -2
  59. package/lib/module/constants/modelUrls.js +61 -36
  60. package/lib/module/constants/modelUrls.js.map +1 -1
  61. package/lib/module/constants/ocr/models.js +1 -1
  62. package/lib/module/hooks/natural_language_processing/useSpeechToText.js +71 -34
  63. package/lib/module/hooks/natural_language_processing/useSpeechToText.js.map +1 -1
  64. package/lib/module/index.js +2 -3
  65. package/lib/module/index.js.map +1 -1
  66. package/lib/module/modules/natural_language_processing/SpeechToTextModule.js +72 -31
  67. package/lib/module/modules/natural_language_processing/SpeechToTextModule.js.map +1 -1
  68. package/lib/module/types/stt.js +1 -85
  69. package/lib/module/types/stt.js.map +1 -1
  70. package/lib/module/utils/ResourceFetcher.js +6 -8
  71. package/lib/module/utils/ResourceFetcher.js.map +1 -1
  72. package/lib/module/utils/ResourceFetcherUtils.js +20 -20
  73. package/lib/module/utils/ResourceFetcherUtils.js.map +1 -1
  74. package/lib/module/utils/SpeechToTextModule/ASR.js +191 -0
  75. package/lib/module/utils/SpeechToTextModule/ASR.js.map +1 -0
  76. package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js +73 -0
  77. package/lib/module/utils/SpeechToTextModule/OnlineProcessor.js.map +1 -0
  78. package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js +56 -0
  79. package/lib/module/utils/SpeechToTextModule/hypothesisBuffer.js.map +1 -0
  80. package/lib/modules/BaseModule.d.ts +8 -0
  81. package/lib/modules/BaseModule.js +25 -0
  82. package/lib/modules/BaseNonStaticModule.d.ts +9 -0
  83. package/lib/modules/BaseNonStaticModule.js +14 -0
  84. package/lib/modules/computer_vision/ClassificationModule.d.ts +8 -0
  85. package/lib/modules/computer_vision/ClassificationModule.js +17 -0
  86. package/lib/modules/computer_vision/ImageEmbeddingsModule.d.ts +8 -0
  87. package/lib/modules/computer_vision/ImageEmbeddingsModule.js +17 -0
  88. package/lib/modules/computer_vision/ImageSegmentationModule.d.ts +11 -0
  89. package/lib/modules/computer_vision/ImageSegmentationModule.js +27 -0
  90. package/lib/modules/computer_vision/OCRModule.d.ts +15 -0
  91. package/lib/modules/computer_vision/OCRModule.js +20 -0
  92. package/lib/modules/computer_vision/ObjectDetectionModule.d.ts +9 -0
  93. package/lib/modules/computer_vision/ObjectDetectionModule.js +17 -0
  94. package/lib/modules/computer_vision/StyleTransferModule.d.ts +8 -0
  95. package/lib/modules/computer_vision/StyleTransferModule.js +17 -0
  96. package/lib/modules/computer_vision/VerticalOCRModule.d.ts +15 -0
  97. package/lib/modules/computer_vision/VerticalOCRModule.js +22 -0
  98. package/lib/modules/general/ExecutorchModule.d.ts +7 -0
  99. package/lib/modules/general/ExecutorchModule.js +14 -0
  100. package/lib/modules/natural_language_processing/LLMModule.d.ts +28 -0
  101. package/lib/modules/natural_language_processing/LLMModule.js +45 -0
  102. package/lib/modules/natural_language_processing/SpeechToTextModule.d.ts +18 -8
  103. package/lib/modules/natural_language_processing/SpeechToTextModule.js +21 -15
  104. package/lib/modules/natural_language_processing/TextEmbeddingsModule.d.ts +9 -0
  105. package/lib/modules/natural_language_processing/TextEmbeddingsModule.js +21 -0
  106. package/lib/modules/natural_language_processing/TokenizerModule.d.ts +12 -0
  107. package/lib/modules/natural_language_processing/TokenizerModule.js +5 -4
  108. package/lib/native/NativeETInstaller.d.ts +6 -0
  109. package/lib/native/NativeETInstaller.js +2 -0
  110. package/lib/native/NativeOCR.d.ts +8 -0
  111. package/lib/native/NativeOCR.js +2 -0
  112. package/lib/native/NativeVerticalOCR.d.ts +8 -0
  113. package/lib/native/NativeVerticalOCR.js +2 -0
  114. package/lib/types/common.d.ts +31 -0
  115. package/lib/types/common.js +25 -0
  116. package/lib/types/imageSegmentation.d.ts +24 -0
  117. package/lib/types/imageSegmentation.js +26 -0
  118. package/lib/types/llm.d.ts +46 -0
  119. package/lib/types/llm.js +9 -0
  120. package/lib/types/objectDetection.d.ts +104 -0
  121. package/lib/types/objectDetection.js +94 -0
  122. package/lib/types/ocr.d.ts +11 -0
  123. package/lib/types/ocr.js +1 -0
  124. package/lib/types/stt.d.ts +94 -0
  125. package/lib/types/stt.js +85 -0
  126. package/lib/typescript/constants/modelUrls.d.ts +24 -7
  127. package/lib/typescript/constants/modelUrls.d.ts.map +1 -1
  128. package/lib/typescript/constants/ocr/models.d.ts +126 -126
  129. package/lib/typescript/hooks/natural_language_processing/useSpeechToText.d.ts +15 -24
  130. package/lib/typescript/hooks/natural_language_processing/useSpeechToText.d.ts.map +1 -1
  131. package/lib/typescript/index.d.ts +2 -3
  132. package/lib/typescript/index.d.ts.map +1 -1
  133. package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts +19 -22
  134. package/lib/typescript/modules/natural_language_processing/SpeechToTextModule.d.ts.map +1 -1
  135. package/lib/typescript/types/stt.d.ts +17 -91
  136. package/lib/typescript/types/stt.d.ts.map +1 -1
  137. package/lib/typescript/utils/ResourceFetcher.d.ts.map +1 -1
  138. package/lib/typescript/utils/ResourceFetcherUtils.d.ts.map +1 -1
  139. package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts +27 -0
  140. package/lib/typescript/utils/SpeechToTextModule/ASR.d.ts.map +1 -0
  141. package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts +23 -0
  142. package/lib/typescript/utils/SpeechToTextModule/OnlineProcessor.d.ts.map +1 -0
  143. package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts +13 -0
  144. package/lib/typescript/utils/SpeechToTextModule/hypothesisBuffer.d.ts.map +1 -0
  145. package/lib/utils/ResourceFetcher.d.ts +24 -0
  146. package/lib/utils/ResourceFetcher.js +305 -0
  147. package/lib/utils/ResourceFetcherUtils.d.ts +54 -0
  148. package/lib/utils/ResourceFetcherUtils.js +9 -0
  149. package/lib/utils/llm.d.ts +6 -0
  150. package/lib/utils/llm.js +1 -0
  151. package/lib/utils/stt.d.ts +1 -0
  152. package/lib/utils/stt.js +21 -0
  153. package/package.json +5 -3
  154. package/src/constants/modelUrls.ts +70 -37
  155. package/src/constants/ocr/models.ts +1 -1
  156. package/src/hooks/natural_language_processing/useSpeechToText.ts +87 -92
  157. package/src/index.ts +6 -8
  158. package/src/modules/natural_language_processing/SpeechToTextModule.ts +81 -69
  159. package/src/types/stt.ts +97 -92
  160. package/src/utils/ResourceFetcher.ts +9 -7
  161. package/src/utils/ResourceFetcherUtils.ts +15 -17
  162. package/src/utils/SpeechToTextModule/ASR.ts +303 -0
  163. package/src/utils/SpeechToTextModule/OnlineProcessor.ts +87 -0
  164. package/src/utils/SpeechToTextModule/hypothesisBuffer.ts +79 -0
  165. package/common/rnexecutorch/models/speech_to_text/MoonshineStrategy.cpp +0 -31
  166. package/common/rnexecutorch/models/speech_to_text/MoonshineStrategy.h +0 -21
  167. package/lib/module/constants/sttDefaults.js +0 -74
  168. package/lib/module/constants/sttDefaults.js.map +0 -1
  169. package/lib/module/controllers/SpeechToTextController.js +0 -320
  170. package/lib/module/controllers/SpeechToTextController.js.map +0 -1
  171. package/lib/typescript/constants/sttDefaults.d.ts.map +0 -1
  172. package/lib/typescript/controllers/SpeechToTextController.d.ts +0 -57
  173. package/lib/typescript/controllers/SpeechToTextController.d.ts.map +0 -1
  174. package/src/constants/sttDefaults.ts +0 -82
  175. package/src/controllers/SpeechToTextController.ts +0 -471
  176. package/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/project.xcworkspace/contents.xcworkspacedata +0 -7
  177. package/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/project.xcworkspace/xcuserdata/norbertklockiewicz.xcuserdatad/UserInterfaceState.xcuserstate +0 -0
  178. package/third-party/ios/ExecutorchLib/ExecutorchLib.xcodeproj/xcuserdata/norbertklockiewicz.xcuserdatad/xcschemes/xcschememanagement.plist +0 -14
@@ -53,17 +53,16 @@ export namespace ResourceFetcherUtils {
53
53
  return SourceType.OBJECT;
54
54
  } else if (typeof source === 'number') {
55
55
  const uri = Asset.fromModule(source).uri;
56
- if (!uri.includes('://')) {
57
- return SourceType.RELEASE_MODE_FILE;
56
+ if (uri.startsWith('http')) {
57
+ return SourceType.DEV_MODE_FILE;
58
58
  }
59
- return SourceType.DEV_MODE_FILE;
60
- } else {
61
- // typeof source == 'string'
62
- if (source.startsWith('file://')) {
63
- return SourceType.LOCAL_FILE;
64
- }
65
- return SourceType.REMOTE_FILE;
59
+ return SourceType.RELEASE_MODE_FILE;
60
+ }
61
+ // typeof source == 'string'
62
+ if (source.startsWith('file://')) {
63
+ return SourceType.LOCAL_FILE;
66
64
  }
65
+ return SourceType.REMOTE_FILE;
67
66
  }
68
67
 
69
68
  export async function getFilesSizes(sources: ResourceSource[]) {
@@ -78,9 +77,8 @@ export namespace ResourceFetcherUtils {
78
77
  for (const source of sources) {
79
78
  const type = await ResourceFetcherUtils.getType(source);
80
79
  let length = 0;
81
-
82
- if (type === SourceType.REMOTE_FILE && typeof source === 'string') {
83
- try {
80
+ try {
81
+ if (type === SourceType.REMOTE_FILE && typeof source === 'string') {
84
82
  const response = await fetch(source, { method: 'HEAD' });
85
83
  if (!response.ok) {
86
84
  Logger.warn(
@@ -97,14 +95,14 @@ export namespace ResourceFetcherUtils {
97
95
  length = contentLength ? parseInt(contentLength, 10) : 0;
98
96
  previousFilesTotalLength = totalLength;
99
97
  totalLength += length;
100
- } catch (error) {
101
- Logger.warn(`Error fetching HEAD for ${source}:`, error);
102
- continue;
103
98
  }
99
+ } catch (error) {
100
+ Logger.warn(`Error fetching HEAD for ${source}:`, error);
101
+ continue;
102
+ } finally {
103
+ results.push({ source, type, length, previousFilesTotalLength });
104
104
  }
105
- results.push({ source, type, length, previousFilesTotalLength });
106
105
  }
107
-
108
106
  return { results, totalLength };
109
107
  }
110
108
 
@@ -0,0 +1,303 @@
1
+ // NOTE: This will be implemented in C++
2
+
3
+ import { TokenizerModule } from '../../modules/natural_language_processing/TokenizerModule';
4
+ import {
5
+ DecodingOptions,
6
+ Segment,
7
+ SpeechToTextModelConfig,
8
+ WordObject,
9
+ WordTuple,
10
+ } from '../../types/stt';
11
+ import { ResourceFetcher } from '../ResourceFetcher';
12
+
13
/**
 * JS-side Whisper transcription pipeline: loads the encoder/decoder through
 * the native `loadSpeechToText` binding, decodes token sequences chunk by
 * chunk and converts them to segments of words with estimated timestamps.
 */
export class ASR {
  private nativeModule: any;
  private tokenizerModule: TokenizerModule = new TokenizerModule();

  private timePrecision: number = 0.02; // Whisper timestamp precision
  private maxDecodeLength: number = 128;
  private chunkSize: number = 30; // 30 seconds
  private minChunkSamples: number = 1 * 16000; // 1 second
  private samplingRate: number = 16000;

  private startOfTranscriptToken!: number;
  private endOfTextToken!: number;
  private timestampBeginToken!: number;

  /**
   * Loads the tokenizer and fetches the encoder/decoder in parallel, creates
   * the native module and caches the ids of the special tokens used later.
   *
   * @param model - Model configuration (tokenizer, encoder and decoder sources).
   * @param onDownloadProgressCallback - Forwarded to `ResourceFetcher.fetch`.
   * @throws Error('Download interrupted.') when either model file is missing
   *   from the fetch result.
   */
  public async load(
    model: SpeechToTextModelConfig,
    onDownloadProgressCallback: (progress: number) => void
  ) {
    const tokenizerLoadPromise = this.tokenizerModule.load(model);
    const encoderDecoderPromise = ResourceFetcher.fetch(
      onDownloadProgressCallback,
      model.encoderSource,
      model.decoderSource
    );
    const [_, encoderDecoderResults] = await Promise.all([
      tokenizerLoadPromise,
      encoderDecoderPromise,
    ]);
    const encoderSource = encoderDecoderResults?.[0];
    const decoderSource = encoderDecoderResults?.[1];
    if (!encoderSource || !decoderSource) {
      throw new Error('Download interrupted.');
    }
    this.nativeModule = await global.loadSpeechToText(
      encoderSource,
      decoderSource,
      'whisper'
    );

    this.startOfTranscriptToken = await this.tokenizerModule.tokenToId(
      '<|startoftranscript|>'
    );
    this.endOfTextToken = await this.tokenizerModule.tokenToId('<|endoftext|>');
    this.timestampBeginToken = await this.tokenizerModule.tokenToId('<|0.00|>');
  }

  /**
   * Builds the decoder prompt: start-of-transcript, optionally the language
   * and `<|transcribe|>` tokens (only when `options.language` is set), then
   * the `<|0.00|>` timestamp token.
   */
  private async getInitialSequence(
    options: DecodingOptions
  ): Promise<number[]> {
    const initialSequence: number[] = [this.startOfTranscriptToken];
    if (options.language) {
      const languageToken = await this.tokenizerModule.tokenToId(
        `<|${options.language}|>`
      );
      const taskToken = await this.tokenizerModule.tokenToId('<|transcribe|>');
      initialSequence.push(languageToken);
      initialSequence.push(taskToken);
    }
    initialSequence.push(this.timestampBeginToken);
    return initialSequence;
  }

  /**
   * Encodes `audio` and decodes tokens one at a time until the end-of-text
   * token is produced or the sequence exceeds `maxDecodeLength`.
   *
   * @param audio - Audio samples for a single chunk.
   * @param temperature - 0 selects the most probable token (greedy); any
   *   other value samples from the temperature-scaled distribution.
   * @param options - Decoding options used to build the initial prompt.
   * @returns The generated token ids (prompt excluded) and, index-aligned
   *   with them, the probability assigned to each generated token.
   */
  private async generate(
    audio: number[],
    temperature: number,
    options: DecodingOptions
  ): Promise<{
    sequencesIds: number[];
    scores: number[];
  }> {
    await this.encode(new Float32Array(audio));
    const initialSequence = await this.getInitialSequence(options);
    const sequencesIds = [...initialSequence];
    const scores: number[] = [];

    while (sequencesIds.length <= this.maxDecodeLength) {
      const probs = this.softmaxWithTemperature(
        Array.from(await this.decode(sequencesIds)),
        temperature === 0 ? 1 : temperature
      );
      const nextTokenId =
        temperature === 0
          ? this.argmax(probs)
          : this.sampleFromDistribution(probs);
      const nextTokenProb = probs[nextTokenId]!;
      sequencesIds.push(nextTokenId);
      scores.push(nextTokenProb);
      if (nextTokenId === this.endOfTextToken) {
        break;
      }
    }

    return {
      sequencesIds: sequencesIds.slice(initialSequence.length),
      // `scores` only ever receives entries for generated tokens, so it must
      // not be sliced by the prompt length (that would drop valid scores).
      scores,
    };
  }

  /** Index of the first largest element, or -1 for an empty array. */
  private argmax(values: number[]): number {
    let best = values.length > 0 ? 0 : -1;
    for (let i = 1; i < values.length; i++) {
      if (values[i]! > values[best]!) {
        best = i;
      }
    }
    return best;
  }

  /**
   * Numerically stable softmax of `logits` divided by `temperature`.
   * The maximum is found with a loop rather than `Math.max(...logits)` so a
   * vocabulary-sized array is never spread into call arguments.
   */
  private softmaxWithTemperature(logits: number[], temperature = 1.0) {
    let max = -Infinity;
    for (const logit of logits) {
      if (logit > max) {
        max = logit;
      }
    }
    const exps = logits.map((logit) => Math.exp((logit - max) / temperature));
    const sum = exps.reduce((a, b) => a + b, 0);
    return exps.map((exp) => exp / sum);
  }

  /**
   * Draws an index from the discrete distribution `probs` by walking the
   * cumulative sum; falls back to the last index if rounding keeps the sum
   * below the random draw.
   */
  private sampleFromDistribution(probs: number[]): number {
    const r = Math.random();
    let cumulative = 0;
    for (let i = 0; i < probs.length; i++) {
      cumulative += probs[i]!;
      if (r < cumulative) {
        return i;
      }
    }
    return probs.length - 1;
  }

  /**
   * Decodes `audio` with increasing temperatures until the average
   * log-probability of the generated tokens reaches -1.0. If no temperature
   * reaches the threshold, the result of the last attempt is used instead of
   * an empty token list.
   */
  private async generateWithFallback(
    audio: number[],
    options: DecodingOptions
  ) {
    const temperatures = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0];
    let generatedTokens: number[] = [];

    for (const temperature of temperatures) {
      const { sequencesIds, scores } = await this.generate(
        audio,
        temperature,
        options
      );
      // Always remember the latest attempt so there is output even when
      // every temperature fails the confidence check.
      generatedTokens = sequencesIds;

      const cumLogProb = scores.reduce(
        (acc, score) => acc + Math.log(score),
        0
      );
      const avgLogProb = cumLogProb / sequencesIds.length;

      if (avgLogProb >= -1.0) {
        break;
      }
    }

    return this.calculateWordLevelTimestamps(generatedTokens, audio);
  }

  /**
   * Splits the generated tokens into segments at every pair of consecutive
   * timestamp tokens (ids >= `timestampBeginToken`) and estimates per-word
   * times inside each segment. When the last timestamp exceeds the real
   * audio duration, all word times are scaled down proportionally.
   *
   * @returns The segments, or an empty array when fewer than two tokens were
   *   generated (the closing timestamp is read from the second-to-last token).
   */
  private async calculateWordLevelTimestamps(
    generatedTokens: number[],
    audio: number[]
  ): Promise<Segment[]> {
    const segments: Segment[] = [];
    if (generatedTokens.length < 2) {
      return segments;
    }

    let tokens: number[] = [];
    let prevTimestamp = this.timestampBeginToken;
    for (let i = 0; i < generatedTokens.length; i++) {
      const current = generatedTokens[i]!;
      if (current < this.timestampBeginToken) {
        tokens.push(current);
      }

      if (
        i > 0 &&
        generatedTokens[i - 1]! >= this.timestampBeginToken &&
        current >= this.timestampBeginToken
      ) {
        // Two timestamps in a row: the first closes the running segment,
        // the second opens the next one.
        const wordObjects = await this.estimateWordTimestampsLinear(
          tokens,
          prevTimestamp,
          generatedTokens[i - 1]!
        );
        segments.push({
          words: wordObjects,
        });
        tokens = [];
        prevTimestamp = current;
      }
    }

    // NOTE(review): assumes the sequence ends with [timestamp, end-of-text];
    // if decoding stopped at maxDecodeLength this token may not be a
    // timestamp - confirm against the native decoder.
    const start = prevTimestamp;
    const end = generatedTokens.at(-2)!;
    const wordObjects = await this.estimateWordTimestampsLinear(
      tokens,
      start,
      end
    );
    segments.push({
      words: wordObjects,
    });

    const scalingFactor =
      audio.length /
      this.samplingRate /
      ((end - this.timestampBeginToken) * this.timePrecision);
    if (scalingFactor < 1) {
      for (const segment of segments) {
        for (const word of segment.words) {
          word.start *= scalingFactor;
          word.end *= scalingFactor;
        }
      }
    }

    return segments;
  }

  /**
   * Decodes `tokens` to text and spreads the segment duration over its
   * words proportionally to their character counts (each word is stored
   * with a leading space).
   *
   * @param tokens - Text token ids of one segment.
   * @param start - Timestamp token id that opens the segment.
   * @param end - Timestamp token id that closes the segment.
   */
  private async estimateWordTimestampsLinear(
    tokens: number[],
    start: number,
    end: number
  ): Promise<WordObject[]> {
    const duration = (end - start) * this.timePrecision;
    const segmentText = (
      (await this.tokenizerModule.decode(tokens)) as string
    ).trim();

    const words = segmentText.split(' ').map((w) => ` ${w}`);
    const numOfCharacters = words.reduce(
      (acc: number, word: string) => acc + word.length,
      0
    );

    const timePerCharacter = duration / numOfCharacters;

    const wordObjects: WordObject[] = [];
    const startTimeOffset =
      (start - this.timestampBeginToken) * this.timePrecision;

    let prevCharNum = 0;
    for (const word of words) {
      const wordStart = startTimeOffset + prevCharNum * timePerCharacter;
      const wordEnd = wordStart + timePerCharacter * word.length;
      wordObjects.push({ word, start: wordStart, end: wordEnd });
      prevCharNum += word.length;
    }

    return wordObjects;
  }

  /**
   * Transcribes `audio` in windows of up to `chunkSize` seconds. Each window
   * starts where the previous window's last word ended; a trailing window
   * shorter than `minChunkSamples` is ignored.
   *
   * @returns All segments with word times expressed relative to the start
   *   of `audio`.
   */
  public async transcribe(
    audio: number[],
    options: DecodingOptions
  ): Promise<Segment[]> {
    let seek = 0;
    const allSegments: Segment[] = [];

    while (seek * this.samplingRate < audio.length) {
      const chunk = audio.slice(
        seek * this.samplingRate,
        (seek + this.chunkSize) * this.samplingRate
      );
      if (chunk.length < this.minChunkSamples) {
        return allSegments;
      }
      const segments = await this.generateWithFallback(chunk, options);
      for (const segment of segments) {
        for (const word of segment.words) {
          word.start += seek;
          word.end += seek;
        }
      }
      allSegments.push(...segments);

      const lastTimeStamp = segments.at(-1)?.words.at(-1)?.end;
      // Guarantee forward progress: without a usable, strictly later
      // timestamp skip the whole window instead of re-decoding it forever.
      seek =
        lastTimeStamp !== undefined &&
        Number.isFinite(lastTimeStamp) &&
        lastTimeStamp > seek
          ? lastTimeStamp
          : seek + chunk.length / this.samplingRate;
    }

    return allSegments;
  }

  /** Flattens segments into `[start, end, word]` tuples. */
  public tsWords(segments: Segment[]): WordTuple[] {
    const o: WordTuple[] = [];
    for (const segment of segments) {
      for (const word of segment.words) {
        o.push([word.start, word.end, word.word]);
      }
    }
    return o;
  }

  /** End time of the last word of every segment. */
  public segmentsEndTs(res: Segment[]) {
    return res.map((segment) => segment.words.at(-1)!.end);
  }

  /** Runs the native encoder on `waveform`. */
  public async encode(waveform: Float32Array): Promise<void> {
    await this.nativeModule.encode(waveform);
  }

  /** Runs the native decoder on `tokens` and wraps its output in a Float32Array. */
  public async decode(tokens: number[]): Promise<Float32Array> {
    return new Float32Array(await this.nativeModule.decode(tokens));
  }
}
@@ -0,0 +1,87 @@
1
+ // NOTE: This will be implemented in C++
2
+
3
+ import { WordTuple, DecodingOptions, Segment } from '../../types/stt';
4
+ import { ASR } from './ASR';
5
+ import { HypothesisBuffer } from './hypothesisBuffer';
6
+
7
/**
 * Streaming wrapper around `ASR`: accumulates audio, re-transcribes the
 * buffered audio on every iteration and commits the words on which two
 * consecutive transcriptions agree (via `HypothesisBuffer`).
 */
export class OnlineASRProcessor {
  private asr: ASR;

  private samplingRate: number = 16000;
  public audioBuffer: number[] = [];
  private transcriptBuffer: HypothesisBuffer = new HypothesisBuffer();
  private bufferTimeOffset: number = 0;
  private committed: WordTuple[] = [];

  constructor(asr: ASR) {
    this.asr = asr;
  }

  /**
   * Appends `audio` to the internal buffer.
   * Samples are pushed one by one: `push(...audio)` passes every sample as a
   * call argument and throws a RangeError for large chunks.
   */
  public insertAudioChunk(audio: number[]) {
    for (const sample of audio) {
      this.audioBuffer.push(sample);
    }
  }

  /**
   * Transcribes the buffered audio, commits the confirmed words and, once
   * more than 15 seconds are buffered, tries to trim the buffer at a
   * committed segment boundary.
   *
   * @returns `committed` - text committed in this iteration;
   *   `nonCommitted` - text of the still unconfirmed hypothesis.
   */
  public async processIter(options: DecodingOptions) {
    const res = await this.asr.transcribe(this.audioBuffer, options);
    const tsw = this.asr.tsWords(res);
    this.transcriptBuffer.insert(tsw, this.bufferTimeOffset);
    const o = this.transcriptBuffer.flush();
    this.committed.push(...o);

    const maxBufferedSeconds = 15;
    if (this.audioBuffer.length / this.samplingRate > maxBufferedSeconds) {
      this.chunkCompletedSegment(res);
    }

    // NOTE(review): committed words are joined with ' ' (see toFlush) while
    // non-committed ones are joined with '' - confirm this is intended.
    const committed = this.toFlush(o)[2];
    const nonCommitted = this.transcriptBuffer
      .complete()
      .map((x) => x[2])
      .join('');
    return { committed, nonCommitted };
  }

  /**
   * Cuts the audio buffer at the latest segment end (excluding the last
   * segment) that does not lie after the last committed word. Does nothing
   * when no word has been committed yet or fewer than two segments exist.
   */
  private chunkCompletedSegment(res: Segment[]) {
    if (this.committed.length === 0) {
      return;
    }

    const ends = this.asr.segmentsEndTs(res);
    const t = this.committed.at(-1)![1];

    if (ends.length > 1) {
      let e = ends.at(-2)! + this.bufferTimeOffset;
      while (ends.length > 2 && e > t) {
        ends.pop();
        e = ends.at(-2)! + this.bufferTimeOffset;
      }

      if (e <= t) {
        this.chunkAt(e);
      }
    }
  }

  /**
   * Drops committed words ending at or before `time`, removes the
   * corresponding audio from the buffer and moves the buffer offset to `time`.
   */
  private chunkAt(time: number) {
    this.transcriptBuffer.popCommitted(time);
    const cutSeconds = time - this.bufferTimeOffset;
    this.audioBuffer = this.audioBuffer.slice(
      Math.floor(cutSeconds * this.samplingRate)
    );
    this.bufferTimeOffset = time;
  }

  /**
   * Returns the remaining unconfirmed hypothesis as committed text and
   * advances the buffer offset by the duration of the buffered audio.
   */
  public async finish() {
    const o = this.transcriptBuffer.complete();
    const f = this.toFlush(o);
    this.bufferTimeOffset += this.audioBuffer.length / this.samplingRate;
    return { committed: f[2] };
  }

  /**
   * Collapses word tuples into `[start of first word, end of last word,
   * words joined with ' ']`; both times are `null` for an empty list.
   */
  private toFlush(words: WordTuple[]): [number | null, number | null, string] {
    const t = words.map((s) => s[2]).join(' ');
    const b = words.length === 0 ? null : words[0]![0];
    const e = words.length === 0 ? null : words.at(-1)![1];
    return [b, e, t];
  }
}
@@ -0,0 +1,79 @@
1
+ // NOTE: This will be implemented in C++
2
+
3
+ import { WordTuple } from '../../types/stt';
4
+
5
/**
 * Keeps the previous and the current transcription hypotheses and commits
 * the leading words on which both agree.
 */
export class HypothesisBuffer {
  private committedInBuffer: WordTuple[] = [];
  private buffer: WordTuple[] = [];
  private new: WordTuple[] = [];

  private lastCommittedTime: number = 0;
  public lastCommittedWord: string | null = null;

  /**
   * Stores `newWords` (shifted by `offset`) as the current hypothesis.
   * Words starting at or before `lastCommittedTime - 0.5` are discarded, and
   * a leading run of up to 5 words that repeats the tail of the already
   * committed words is removed.
   */
  public insert(newWords: WordTuple[], offset: number) {
    const earliestStart = this.lastCommittedTime - 0.5;
    const incoming: WordTuple[] = [];
    for (const [begin, end, text] of newWords) {
      const shiftedBegin = begin + offset;
      if (shiftedBegin > earliestStart) {
        incoming.push([shiftedBegin, end + offset, text]);
      }
    }
    this.new = incoming;

    const firstWord = this.new[0];
    if (firstWord === undefined) {
      return;
    }
    const startsNearLastCommit =
      Math.abs(firstWord[0] - this.lastCommittedTime) < 1;
    if (!startsNearLastCommit || this.committedInBuffer.length === 0) {
      return;
    }

    const maxOverlap = Math.min(
      this.committedInBuffer.length,
      this.new.length,
      5
    );
    const joinWords = (words: WordTuple[]) =>
      words.map((w) => w[2]).join(' ');
    for (let size = 1; size <= maxOverlap; size++) {
      const committedTail = joinWords(this.committedInBuffer.slice(-size));
      const incomingHead = joinWords(this.new.slice(0, size));
      if (committedTail === incomingHead) {
        // The hypothesis starts with words that were already committed.
        this.new.splice(0, size);
        break;
      }
    }
  }

  /**
   * Commits the longest common prefix (compared by word text) of the
   * previous and the current hypothesis, then makes the rest of the current
   * hypothesis the new "previous" one.
   *
   * @returns The words committed by this call.
   */
  public flush(): WordTuple[] {
    const limit = Math.min(this.new.length, this.buffer.length);
    let agreed = 0;
    while (
      agreed < limit &&
      this.new[agreed]![2] === this.buffer[agreed]![2]
    ) {
      agreed++;
    }

    // Remove the agreed prefix from both arrays in place.
    const commit: WordTuple[] = this.new.splice(0, agreed);
    this.buffer.splice(0, agreed);

    const lastCommitted = commit.at(-1);
    if (lastCommitted !== undefined) {
      this.lastCommittedWord = lastCommitted[2];
      this.lastCommittedTime = lastCommitted[1];
    }

    this.buffer = this.new;
    this.new = [];
    this.committedInBuffer.push(...commit);
    return commit;
  }

  /** Forgets committed words whose end time is not later than `time`. */
  public popCommitted(time: number) {
    this.committedInBuffer = this.committedInBuffer.filter(
      (word) => word[1] > time
    );
  }

  /** Returns the current unconfirmed hypothesis. */
  public complete(): WordTuple[] {
    return this.buffer;
  }
}
@@ -1,31 +0,0 @@
1
- #include "MoonshineStrategy.h"
2
- #include "executorch/runtime/core/exec_aten/exec_aten.h"
3
- #include <executorch/extension/tensor/tensor_ptr_maker.h>
4
- #include <executorch/runtime/core/portable_type/scalar_type.h>
5
-
6
- namespace rnexecutorch {
7
-
8
- using namespace ::executorch::extension;
9
- using namespace ::executorch::aten;
10
-
11
- TensorPtr MoonshineStrategy::prepareAudioInput(std::span<float> waveform) {
12
- std::vector<int32_t> inputShape = {1, static_cast<int32_t>(waveform.size())};
13
- return make_tensor_ptr(std::move(inputShape), waveform.data(),
14
- ScalarType::Float);
15
- }
16
-
17
- TensorPtr
18
- MoonshineStrategy::prepareTokenInput(const std::vector<int64_t> &prevTokens) {
19
- std::vector<int32_t> tensorSizes = {1,
20
- static_cast<int32_t>(prevTokens.size())};
21
- // prevTokens gets copied!!
22
- return make_tensor_ptr(std::move(tensorSizes), prevTokens);
23
- }
24
-
25
- int64_t MoonshineStrategy::extractOutputToken(const void *outputPtr,
26
- size_t innerDim) const {
27
- const auto *data = static_cast<const int64_t *>(outputPtr);
28
- return data[innerDim - 1];
29
- }
30
-
31
- } // namespace rnexecutorch
@@ -1,21 +0,0 @@
1
- #pragma once
2
-
3
- #include "SpeechToTextStrategy.h"
4
- #include <span>
5
- #include <vector>
6
-
7
- namespace rnexecutorch {
8
-
9
- class MoonshineStrategy final : public SpeechToTextStrategy {
10
- public:
11
- TensorPtr prepareAudioInput(std::span<float> waveform) override;
12
-
13
- TensorPtr prepareTokenInput(const std::vector<int64_t> &prevTokens) override;
14
-
15
- std::string getDecoderMethod() const override { return "forward_cached"; }
16
-
17
- int64_t extractOutputToken(const void *outputPtr,
18
- size_t innerDim) const override;
19
- };
20
-
21
- } // namespace rnexecutorch
@@ -1,74 +0,0 @@
1
- "use strict";
2
-
3
- import { MOONSHINE_TINY, WHISPER_TINY, WHISPER_TINY_MULTILINGUAL } from './modelUrls';
4
- import { AvailableModels } from '../types/stt';
5
- export const SAMPLE_RATE = 16_000;
6
- export const SECOND = SAMPLE_RATE;
7
- export const HAMMING_DIST_THRESHOLD = 1;
8
- const whisperTinyModelConfig = {
9
- sources: {
10
- encoder: WHISPER_TINY.encoderSource,
11
- decoder: WHISPER_TINY.decoderSource
12
- },
13
- tokenizer: {
14
- source: WHISPER_TINY.tokenizerSource,
15
- bos: 50257,
16
- // FIXME: this is a placeholder and needs to be changed
17
- eos: 50256 // FIXME: this is a placeholder and needs to be changed
18
- },
19
- isMultilingual: false
20
- };
21
- const moonshineTinyModelConfig = {
22
- sources: {
23
- encoder: MOONSHINE_TINY.encoderSource,
24
- decoder: MOONSHINE_TINY.decoderSource
25
- },
26
- tokenizer: {
27
- source: MOONSHINE_TINY.tokenizerSource,
28
- bos: 1,
29
- // FIXME: this is a placeholder and needs to be changed
30
- eos: 2 // FIXME: this is a placeholder and needs to be changed
31
- },
32
- isMultilingual: false
33
- };
34
- const whisperTinyMultilingualModelConfig = {
35
- sources: {
36
- encoder: WHISPER_TINY_MULTILINGUAL.encoderSource,
37
- decoder: WHISPER_TINY_MULTILINGUAL.decoderSource
38
- },
39
- tokenizer: {
40
- source: WHISPER_TINY_MULTILINGUAL.tokenizerSource,
41
- bos: 50258,
42
- // FIXME: this is a placeholder and needs to be changed
43
- eos: 50257 // FIXME: this is a placeholder and needs to be changed
44
- },
45
- isMultilingual: true
46
- };
47
- export const MODEL_CONFIGS = {
48
- moonshine: moonshineTinyModelConfig,
49
- whisper: whisperTinyModelConfig,
50
- whisperMultilingual: whisperTinyMultilingualModelConfig
51
- };
52
- export const MODES = {
53
- fast: {
54
- windowSize: 5,
55
- overlapSeconds: 1.2
56
- },
57
- balanced: {
58
- windowSize: 12,
59
- overlapSeconds: 2
60
- },
61
- quality: {
62
- windowSize: 24,
63
- overlapSeconds: 3
64
- }
65
- };
66
- export const NUM_TOKENS_TO_TRIM = 3;
67
- export let STREAMING_ACTION = /*#__PURE__*/function (STREAMING_ACTION) {
68
- STREAMING_ACTION[STREAMING_ACTION["START"] = 0] = "START";
69
- STREAMING_ACTION[STREAMING_ACTION["DATA"] = 1] = "DATA";
70
- STREAMING_ACTION[STREAMING_ACTION["STOP"] = 2] = "STOP";
71
- return STREAMING_ACTION;
72
- }({});
73
- export { AvailableModels };
74
- //# sourceMappingURL=sttDefaults.js.map
@@ -1 +0,0 @@
1
- {"version":3,"names":["MOONSHINE_TINY","WHISPER_TINY","WHISPER_TINY_MULTILINGUAL","AvailableModels","SAMPLE_RATE","SECOND","HAMMING_DIST_THRESHOLD","whisperTinyModelConfig","sources","encoder","encoderSource","decoder","decoderSource","tokenizer","source","tokenizerSource","bos","eos","isMultilingual","moonshineTinyModelConfig","whisperTinyMultilingualModelConfig","MODEL_CONFIGS","moonshine","whisper","whisperMultilingual","MODES","fast","windowSize","overlapSeconds","balanced","quality","NUM_TOKENS_TO_TRIM","STREAMING_ACTION"],"sourceRoot":"../../../src","sources":["constants/sttDefaults.ts"],"mappings":";;AAAA,SACEA,cAAc,EACdC,YAAY,EACZC,yBAAyB,QACpB,aAAa;AACpB,SAASC,eAAe,QAAqB,cAAc;AAE3D,OAAO,MAAMC,WAAW,GAAG,MAAM;AACjC,OAAO,MAAMC,MAAM,GAAGD,WAAW;AACjC,OAAO,MAAME,sBAAsB,GAAG,CAAC;AAEvC,MAAMC,sBAAsB,GAAG;EAC7BC,OAAO,EAAE;IACPC,OAAO,EAAER,YAAY,CAACS,aAAa;IACnCC,OAAO,EAAEV,YAAY,CAACW;EACxB,CAAC;EACDC,SAAS,EAAE;IACTC,MAAM,EAAEb,YAAY,CAACc,eAAe;IACpCC,GAAG,EAAE,KAAK;IAAE;IACZC,GAAG,EAAE,KAAK,CAAE;EACd,CAAC;EACDC,cAAc,EAAE;AAClB,CAAC;AAED,MAAMC,wBAAwB,GAAG;EAC/BX,OAAO,EAAE;IACPC,OAAO,EAAET,cAAc,CAACU,aAAa;IACrCC,OAAO,EAAEX,cAAc,CAACY;EAC1B,CAAC;EACDC,SAAS,EAAE;IACTC,MAAM,EAAEd,cAAc,CAACe,eAAe;IACtCC,GAAG,EAAE,CAAC;IAAE;IACRC,GAAG,EAAE,CAAC,CAAE;EACV,CAAC;EACDC,cAAc,EAAE;AAClB,CAAC;AAED,MAAME,kCAAkC,GAAG;EACzCZ,OAAO,EAAE;IACPC,OAAO,EAAEP,yBAAyB,CAACQ,aAAa;IAChDC,OAAO,EAAET,yBAAyB,CAACU;EACrC,CAAC;EACDC,SAAS,EAAE;IACTC,MAAM,EAAEZ,yBAAyB,CAACa,eAAe;IACjDC,GAAG,EAAE,KAAK;IAAE;IACZC,GAAG,EAAE,KAAK,CAAE;EACd,CAAC;EACDC,cAAc,EAAE;AAClB,CAAC;AAED,OAAO,MAAMG,aAEZ,GAAG;EACFC,SAAS,EAAEH,wBAAwB;EACnCI,OAAO,EAAEhB,sBAAsB;EAC/BiB,mBAAmB,EAAEJ;AACvB,CAAC;AAED,OAAO,MAAMK,KAAK,GAAG;EACnBC,IAAI,EAAE;IACJC,UAAU,EAAE,CAAC;IACbC,cAAc,EAAE;EAClB,CAAC;EACDC,QAAQ,EAAE;IACRF,UAAU,EAAE,EAAE;IACdC,cAAc,EAAE;EAClB,CAAC;EACDE,OAAO,EAAE;IACPH,UAAU,EAAE,EAAE;IACdC,cAAc,EAAE;EAClB;AACF,CAAC;AAED,OAAO,MAAMG,kBAAkB,GAAG,CAAC;AAEnC,WAAYC,gBAAgB,0BAAhBA,gBAAgB;EAAhBA,gBAAgB,CAAhBA,gBAAgB;EAAhBA,gBAAgB,C
AAhBA,gBAAgB;EAAhBA,gBAAgB,CAAhBA,gBAAgB;EAAA,OAAhBA,gBAAgB;AAAA;AAM5B,SAAS7B,eAAe","ignoreList":[]}