npm - react-native-executorch - Versions diffs - 0.5.1-rc.0 → 0.5.1 - Mend

react-native-executorch 0.5.1-rc.0 → 0.5.1

This diff shows the differences between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions exactly as they appear in their respective public registries.

Files changed (88)

package/src/hooks/natural_language_processing/useSpeechToText.ts CHANGED Viewed

@@ -1,116 +1,111 @@
-import { useEffect, useMemo, useState } from 'react';
-import { SpeechToTextController } from '../../controllers/SpeechToTextController';
-import { ResourceSource } from '../../types/common';
-import { STREAMING_ACTION } from '../../constants/sttDefaults';
-import { AvailableModels, SpeechToTextLanguage } from '../../types/stt';
-interface SpeechToTextModule {
-  isReady: boolean;
-  isGenerating: boolean;
-  sequence: string;
-  downloadProgress: number;
-  configureStreaming: SpeechToTextController['configureStreaming'];
-  error: Error | undefined;
-  transcribe: (
-    input: number[],
-    audioLanguage?: SpeechToTextLanguage
-  ) => ReturnType<SpeechToTextController['transcribe']>;
-  streamingTranscribe: (
-    streamAction: STREAMING_ACTION,
-    input?: number[],
-    audioLanguage?: SpeechToTextLanguage
-  ) => ReturnType<SpeechToTextController['streamingTranscribe']>;
-}
+import { useEffect, useCallback, useState } from 'react';
+import { ETError, getError } from '../../Error';
+import { SpeechToTextModule } from '../../modules/natural_language_processing/SpeechToTextModule';
+import { SpeechToTextModelConfig } from '../../types/stt';
 export const useSpeechToText = ({
   model,
-  overlapSeconds,
-  windowSize,
-  streamingConfig,
   preventLoad = false,
 }: {
-  model: {
-    modelName: AvailableModels;
-    encoderSource: ResourceSource;
-    decoderSource: ResourceSource;
-    tokenizerSource: ResourceSource;
-  };
-  overlapSeconds?: ConstructorParameters<
-    typeof SpeechToTextController
-  >['0']['overlapSeconds'];
-  windowSize?: ConstructorParameters<
-    typeof SpeechToTextController
-  >['0']['windowSize'];
-  streamingConfig?: ConstructorParameters<
-    typeof SpeechToTextController
-  >['0']['streamingConfig'];
+  model: SpeechToTextModelConfig;
   preventLoad?: boolean;
-}): SpeechToTextModule => {
-  const [sequence, setSequence] = useState<string>('');
+}) => {
+  const [error, setError] = useState<null | string>(null);
   const [isReady, setIsReady] = useState(false);
-  const [downloadProgress, setDownloadProgress] = useState(0);
   const [isGenerating, setIsGenerating] = useState(false);
-  const [error, setError] = useState<Error | undefined>();
-  const controllerInstance = useMemo(
-    () =>
-      new SpeechToTextController({
-        transcribeCallback: setSequence,
-        isReadyCallback: setIsReady,
-        isGeneratingCallback: setIsGenerating,
-        onErrorCallback: setError,
-      }),
-    []
-  );
+  const [downloadProgress, setDownloadProgress] = useState(0);
-  useEffect(() => {
-    controllerInstance.configureStreaming(
-      overlapSeconds,
-      windowSize,
-      streamingConfig
-    );
-  }, [controllerInstance, overlapSeconds, windowSize, streamingConfig]);
+  const [modelInstance] = useState(() => new SpeechToTextModule());
+  const [committedTranscription, setCommittedTranscription] = useState('');
+  const [nonCommittedTranscription, setNonCommittedTranscription] =
+    useState('');
   useEffect(() => {
-    const loadModel = async () => {
-      await controllerInstance.load({
-        modelName: model.modelName,
-        encoderSource: model.encoderSource,
-        decoderSource: model.decoderSource,
-        tokenizerSource: model.tokenizerSource,
-        onDownloadProgressCallback: setDownloadProgress,
-      });
-    };
-    if (!preventLoad) {
-      loadModel();
-    }
+    if (preventLoad) return;
+    (async () => {
+      setDownloadProgress(0);
+      setError(null);
+      try {
+        setIsReady(false);
+        await modelInstance.load(
+          {
+            isMultilingual: model.isMultilingual,
+            encoderSource: model.encoderSource,
+            decoderSource: model.decoderSource,
+            tokenizerSource: model.tokenizerSource,
+          },
+          setDownloadProgress
+        );
+        setIsReady(true);
+      } catch (err) {
+        setError((err as Error).message);
+      }
+    })();
   }, [
-    controllerInstance,
-    model.modelName,
+    modelInstance,
+    model.isMultilingual,
     model.encoderSource,
     model.decoderSource,
     model.tokenizerSource,
     preventLoad,
   ]);
+  const stateWrapper = useCallback(
+    <T extends (...args: any[]) => Promise<any>>(fn: T) =>
+      async (...args: Parameters<T>): Promise<Awaited<ReturnType<T>>> => {
+        if (!isReady) throw new Error(getError(ETError.ModuleNotLoaded));
+        if (isGenerating) throw new Error(getError(ETError.ModelGenerating));
+        setIsGenerating(true);
+        try {
+          return await fn.apply(modelInstance, args);
+        } finally {
+          setIsGenerating(false);
+        }
+      },
+    [isReady, isGenerating, modelInstance]
+  );
+  const stream = useCallback(async () => {
+    if (!isReady) throw new Error(getError(ETError.ModuleNotLoaded));
+    if (isGenerating) throw new Error(getError(ETError.ModelGenerating));
+    setIsGenerating(true);
+    setCommittedTranscription('');
+    setNonCommittedTranscription('');
+    let transcription = '';
+    try {
+      for await (const { committed, nonCommitted } of modelInstance.stream()) {
+        setCommittedTranscription((prev) => prev + committed);
+        setNonCommittedTranscription(nonCommitted);
+        transcription += committed;
+      }
+    } finally {
+      setIsGenerating(false);
+    }
+    return transcription;
+  }, [isReady, isGenerating, modelInstance]);
+  const wrapper = useCallback(
+    <T extends (...args: any[]) => any>(fn: T) => {
+      return (...args: Parameters<T>): ReturnType<T> => {
+        if (!isReady) throw new Error(getError(ETError.ModuleNotLoaded));
+        return fn.apply(modelInstance, args);
+      };
+    },
+    [isReady, modelInstance]
+  );
   return {
+    error,
     isReady,
     isGenerating,
     downloadProgress,
-    configureStreaming: controllerInstance.configureStreaming,
-    sequence,
-    error,
-    transcribe: (waveform: number[], audioLanguage?: SpeechToTextLanguage) =>
-      controllerInstance.transcribe(waveform, audioLanguage),
-    streamingTranscribe: (
-      streamAction: STREAMING_ACTION,
-      waveform?: number[],
-      audioLanguage?: SpeechToTextLanguage
-    ) =>
-      controllerInstance.streamingTranscribe(
-        streamAction,
-        waveform,
-        audioLanguage
-      ),
+    committedTranscription,
+    nonCommittedTranscription,
+    encode: stateWrapper(SpeechToTextModule.prototype.encode),
+    decode: stateWrapper(SpeechToTextModule.prototype.decode),
+    transcribe: stateWrapper(SpeechToTextModule.prototype.transcribe),
+    stream,
+    streamStop: wrapper(SpeechToTextModule.prototype.streamStop),
+    streamInsert: wrapper(SpeechToTextModule.prototype.streamInsert),
   };
 };

package/src/index.ts CHANGED Viewed

@@ -1,5 +1,3 @@
-import { SpeechToTextLanguage } from './types/stt';
 import { ETInstallerNativeModule } from './native/RnExecutorchModules';
 // eslint-disable no-var
@@ -95,14 +93,14 @@ export * from './types/objectDetection';
 export * from './types/ocr';
 export * from './types/imageSegmentation';
 export * from './types/llm';
-export { SpeechToTextLanguage };
+export * from './types/common';
+export {
+  SpeechToTextLanguage,
+  SpeechToTextModelConfig,
+  DecodingOptions,
+} from './types/stt';
 // constants
 export * from './constants/modelUrls';
 export * from './constants/ocr/models';
 export * from './constants/llmDefaults';
-export {
-  STREAMING_ACTION,
-  MODES,
-  AvailableModels,
-} from './constants/sttDefaults';

package/src/modules/natural_language_processing/SpeechToTextModule.ts CHANGED Viewed

@@ -1,86 +1,98 @@
-import { ResourceSource } from '../../types/common';
-import { SpeechToTextController } from '../../controllers/SpeechToTextController';
-import { AvailableModels, SpeechToTextLanguage } from '../../types/stt';
-import { STREAMING_ACTION } from '../../constants/sttDefaults';
+import { DecodingOptions, SpeechToTextModelConfig } from '../../types/stt';
+import { ASR } from '../../utils/SpeechToTextModule/ASR';
+import { OnlineASRProcessor } from '../../utils/SpeechToTextModule/OnlineProcessor';
 export class SpeechToTextModule {
-  private module: SpeechToTextController;
-  constructor({
-    transcribeCallback,
-    overlapSeconds,
-    windowSize,
-    streamingConfig,
-  }: {
-    transcribeCallback?: (sequence: string) => void;
-    overlapSeconds?: ConstructorParameters<
-      typeof SpeechToTextController
-    >['0']['overlapSeconds'];
-    windowSize?: ConstructorParameters<
-      typeof SpeechToTextController
-    >['0']['windowSize'];
-    streamingConfig?: ConstructorParameters<
-      typeof SpeechToTextController
-    >['0']['streamingConfig'];
-  } = {}) {
-    this.module = new SpeechToTextController({
-      transcribeCallback: transcribeCallback || (() => {}),
-      overlapSeconds,
-      windowSize,
-      streamingConfig,
-    });
-  }
+  private modelConfig!: SpeechToTextModelConfig;
+  private asr: ASR = new ASR();
+  private processor: OnlineASRProcessor = new OnlineASRProcessor(this.asr);
+  private isStreaming = false;
+  private readyToProcess = false;
+  private minAudioSamples: number = 1 * 16000; // 1 second
-  async load(
-    model: {
-      modelName: AvailableModels;
-      encoderSource?: ResourceSource;
-      decoderSource?: ResourceSource;
-      tokenizerSource?: ResourceSource;
-    },
+  public async load(
+    model: SpeechToTextModelConfig,
     onDownloadProgressCallback: (progress: number) => void = () => {}
   ) {
-    await this.module.load({
-      modelName: model.modelName,
-      encoderSource: model.encoderSource,
-      decoderSource: model.decoderSource,
-      tokenizerSource: model.tokenizerSource,
-      onDownloadProgressCallback,
-    });
+    this.modelConfig = model;
+    return this.asr.load(model, onDownloadProgressCallback);
   }
-  configureStreaming(
-    overlapSeconds: Parameters<SpeechToTextController['configureStreaming']>[0],
-    windowSize: Parameters<SpeechToTextController['configureStreaming']>[1],
-    streamingConfig: Parameters<SpeechToTextController['configureStreaming']>[2]
-  ) {
-    this.module.configureStreaming(overlapSeconds, windowSize, streamingConfig);
+  public async encode(waveform: Float32Array): Promise<void> {
+    return this.asr.encode(waveform);
   }
-  async encode(waveform: Float32Array) {
-    return await this.module.encode(waveform);
+  public async decode(tokens: number[]): Promise<Float32Array> {
+    return this.asr.decode(tokens);
   }
-  async decode(seq: number[]) {
-    return await this.module.decode(seq);
+  public async transcribe(
+    waveform: number[],
+    options: DecodingOptions = {}
+  ): Promise<string> {
+    this.validateOptions(options);
+    const segments = await this.asr.transcribe(waveform, options);
+    let transcription = '';
+    for (const segment of segments) {
+      for (const word of segment.words) {
+        transcription += ` ${word.word}`;
+      }
+    }
+    return transcription.trim();
   }
-  async transcribe(
-    waveform: number[],
-    audioLanguage?: SpeechToTextLanguage
-  ): ReturnType<SpeechToTextController['transcribe']> {
-    return await this.module.transcribe(waveform, audioLanguage);
+  public async *stream(options: DecodingOptions = {}) {
+    if (this.isStreaming) {
+      throw new Error('Streaming is already in progress');
+    }
+    this.validateOptions(options);
+    this.resetStreamState();
+    this.isStreaming = true;
+    while (this.isStreaming) {
+      if (
+        !this.readyToProcess ||
+        this.processor.audioBuffer.length < this.minAudioSamples
+      ) {
+        await new Promise((resolve) => setTimeout(resolve, 100));
+        continue;
+      }
+      const { committed, nonCommitted } =
+        await this.processor.processIter(options);
+      yield { committed, nonCommitted };
+      this.readyToProcess = false;
+    }
+    const { committed } = await this.processor.finish();
+    yield { committed, nonCommitted: '' };
+  }
+  public streamStop() {
+    this.isStreaming = false;
+  }
+  public streamInsert(waveform: number[]) {
+    this.processor.insertAudioChunk(waveform);
+    this.readyToProcess = true;
+  }
+  private validateOptions(options: DecodingOptions) {
+    if (!this.modelConfig.isMultilingual && options.language) {
+      throw new Error('Model is not multilingual, cannot set language');
+    }
+    if (this.modelConfig.isMultilingual && !options.language) {
+      throw new Error('Model is multilingual, provide a language');
+    }
   }
-  async streamingTranscribe(
-    streamAction: STREAMING_ACTION,
-    waveform?: number[],
-    audioLanguage?: SpeechToTextLanguage
-  ): ReturnType<SpeechToTextController['streamingTranscribe']> {
-    return await this.module.streamingTranscribe(
-      streamAction,
-      waveform,
-      audioLanguage
-    );
+  private resetStreamState() {
+    this.isStreaming = false;
+    this.readyToProcess = false;
+    this.processor = new OnlineASRProcessor(this.asr);
   }
 }

package/src/types/stt.ts CHANGED Viewed

@@ -1,97 +1,102 @@
-export interface ModelConfig {
-  sources: {
-    encoder: string;
-    decoder: string;
-  };
-  tokenizer: {
-    source: string;
-    bos: number;
-    eos: number;
-  };
-  isMultilingual: boolean;
+import { ResourceSource } from './common';
+export type WordTuple = [number, number, string];
+export interface WordObject {
+  start: number;
+  end: number;
+  word: string;
 }
-// Those languages are supported just by whisper multilingual
-export enum SpeechToTextLanguage {
-  Afrikaans = 'af',
-  Albanian = 'sq',
-  Arabic = 'ar',
-  Armenian = 'hy',
-  Azerbaijani = 'az',
-  Basque = 'eu',
-  Belarusian = 'be',
-  Bengali = 'bn',
-  Bosnian = 'bs',
-  Bulgarian = 'bg',
-  Burmese = 'my',
-  Catalan = 'ca',
-  Chinese = 'zh',
-  Croatian = 'hr',
-  Czech = 'cs',
-  Danish = 'da',
-  Dutch = 'nl',
-  Estonian = 'et',
-  English = 'en',
-  Finnish = 'fi',
-  French = 'fr',
-  Galician = 'gl',
-  Georgian = 'ka',
-  German = 'de',
-  Greek = 'el',
-  Gujarati = 'gu',
-  HaitianCreole = 'ht',
-  Hebrew = 'he',
-  Hindi = 'hi',
-  Hungarian = 'hu',
-  Icelandic = 'is',
-  Indonesian = 'id',
-  Italian = 'it',
-  Japanese = 'ja',
-  Kannada = 'kn',
-  Kazakh = 'kk',
-  Khmer = 'km',
-  Korean = 'ko',
-  Lao = 'lo',
-  Latvian = 'lv',
-  Lithuanian = 'lt',
-  Macedonian = 'mk',
-  Malagasy = 'mg',
-  Malay = 'ms',
-  Malayalam = 'ml',
-  Maltese = 'mt',
-  Marathi = 'mr',
-  Nepali = 'ne',
-  Norwegian = 'no',
-  Persian = 'fa',
-  Polish = 'pl',
-  Portuguese = 'pt',
-  Punjabi = 'pa',
-  Romanian = 'ro',
-  Russian = 'ru',
-  Serbian = 'sr',
-  Sinhala = 'si',
-  Slovak = 'sk',
-  Slovenian = 'sl',
-  Spanish = 'es',
-  Sundanese = 'su',
-  Swahili = 'sw',
-  Swedish = 'sv',
-  Tagalog = 'tl',
-  Tajik = 'tg',
-  Tamil = 'ta',
-  Telugu = 'te',
-  Thai = 'th',
-  Turkish = 'tr',
-  Ukrainian = 'uk',
-  Urdu = 'ur',
-  Uzbek = 'uz',
-  Vietnamese = 'vi',
-  Welsh = 'cy',
-  Yiddish = 'yi',
+export interface Segment {
+  words: WordObject[];
 }
-export enum AvailableModels {
-  WHISPER = 'whisper',
-  MOONSHINE = 'moonshine',
-  WHISPER_MULTILINGUAL = 'whisperMultilingual',
+// Languages supported by whisper (not whisper.en)
+export type SpeechToTextLanguage =
+  | 'af'
+  | 'sq'
+  | 'ar'
+  | 'hy'
+  | 'az'
+  | 'eu'
+  | 'be'
+  | 'bn'
+  | 'bs'
+  | 'bg'
+  | 'my'
+  | 'ca'
+  | 'zh'
+  | 'hr'
+  | 'cs'
+  | 'da'
+  | 'nl'
+  | 'et'
+  | 'en'
+  | 'fi'
+  | 'fr'
+  | 'gl'
+  | 'ka'
+  | 'de'
+  | 'el'
+  | 'gu'
+  | 'ht'
+  | 'he'
+  | 'hi'
+  | 'hu'
+  | 'is'
+  | 'id'
+  | 'it'
+  | 'ja'
+  | 'kn'
+  | 'kk'
+  | 'km'
+  | 'ko'
+  | 'lo'
+  | 'lv'
+  | 'lt'
+  | 'mk'
+  | 'mg'
+  | 'ms'
+  | 'ml'
+  | 'mt'
+  | 'mr'
+  | 'ne'
+  | 'no'
+  | 'fa'
+  | 'pl'
+  | 'pt'
+  | 'pa'
+  | 'ro'
+  | 'ru'
+  | 'sr'
+  | 'si'
+  | 'sk'
+  | 'sl'
+  | 'es'
+  | 'su'
+  | 'sw'
+  | 'sv'
+  | 'tl'
+  | 'tg'
+  | 'ta'
+  | 'te'
+  | 'th'
+  | 'tr'
+  | 'uk'
+  | 'ur'
+  | 'uz'
+  | 'vi'
+  | 'cy'
+  | 'yi';
+export interface DecodingOptions {
+  language?: SpeechToTextLanguage;
+}
+export interface SpeechToTextModelConfig {
+  isMultilingual: boolean;
+  encoderSource: ResourceSource;
+  decoderSource: ResourceSource;
+  tokenizerSource: ResourceSource;
 }