npm - @gmessier/nitro-speech - Versions diffs - 0.4.2 → 0.4.4 - Mend

@gmessier/nitro-speech 0.4.2 → 0.4.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

package/README.md CHANGED Viewed

@@ -34,7 +34,7 @@
 - 👆 Configurable Haptic Feedback on start and finish
 - 🎚️ Speech-quality configurations:
   - Result is grouped by speech segments into Batches.
-  - Param `iosPreset` - `shortForm` or `general` enables best transcriber for your situation
+  - Param `iosPreset` - enables best transcriber for your situation
   - Param `disableRepeatingFilter` - filters out consecutive duplicate words.
   - Param `androidDisableBatchHandling` - disables empty partial results
   - Many more, see `SpeechRecognitionConfig`
@@ -126,7 +126,7 @@ Both permissions are required for speech recognition to work on iOS.
 | **Reset Auto-finish Time** | Resets the Timer to the threshold | ✅ | ✅ |
 | **Voice input volume** | `useVoiceInputVolume`, `getVoiceInputVolume()`, `onVolumeChange` | ✅ | ✅ |
 | **Reset Auto-finish Sensitivity** | The voice detector sensitivity to reset the Auto-finish time | ✅ | ✅ |
-| **Prewarm** | Prepares resources, downloads assets, confirms locale availability | ✅ | ✅ |
+| **Prewarm** | Prepares resources, downloads assets, confirms locale availability, requests permissions | ✅ | ✅ |
 | **Update config** | Static method `updateConfig` allows updating the config on the fly | ✅ | ✅ |
 | **Is Active** | Static method `getIsActive()` | ✅ | ✅ |
 | **Haptic feedback** | Haptic feedback on recording start/stop | ✅ | ✅ |
@@ -138,7 +138,7 @@ Both permissions are required for speech recognition to work on iOS.
 | **Language model selection** | Choose between web search vs free-form models | Auto | ✅ |
 | **Batch handling** | Filters out empty or repeated results | Auto | ✅ |
 | **Formatting quality** | Prefer quality vs speed in formatting | Auto | ✅ |
-| **Transcription preset** | `iosPreset` adapts for short phrases (`shortForm`) or `general` conversation | ✅ | Auto |
+| **Transcription preset** | `iosPreset` adapts for different scenarios | ✅ | Auto |
 | **Automatic punctuation** | Adds punctuation to transcription (iOS 16+) | ✅ | Auto |
 | **Atypical speech hint** | Hint iOS that speech may include accent, lisp, or other confounding traits | ✅ | Auto |
 | **getSupportedLocalesIOS** | Supported locales for iOS (No available API for Android) | ✅ | X |
@@ -230,12 +230,23 @@ function MyComponent() {
           )>
         <Text>Update Timer to 12s, 500ms interval, 0.65 sensitivity, with reset</Text>
       </TouchableOpacity>
+      <TouchableOpacity
+        onPress={() => {
+          scheduleOnRuntime(workletRuntime, () => {
+            RecognizerRef.prewarm({
+              iosPreset: 'speed',
+            }, { requestPermission: true });
+          });
+        }}
+      >
+        <Text>Prewarm from worklet with permission request (default behavior)</Text>
+      </TouchableOpacity>
     </View>
   );
 }
 ```
-On iOS 26+, the recognizer prefers the newer `SpeechTranscriber` path for general cases. Setting `iosPreset: 'shortForm'`, `iosAddPunctuation: false`, or `iosAtypicalSpeech: true` switches priority to `DictationTranscriber` that is better suited for short utterances or non-standard speech patterns.
+On iOS 26+, the recognizer prefers the newer `SpeechTranscriber` path for general cases. Setting `iosPreset` to `'shortForm'` or `'speed'`, `iosAddPunctuation: false`, or `iosAtypicalSpeech: true` switches priority to `DictationTranscriber`, which is better suited for short utterances or non-standard speech patterns.
 ### With React Navigation (important)
@@ -262,7 +273,7 @@ If you need to call recognizer methods from other components without prop drilli
 ```typescript
 import { RecognizerRef } from '@gmessier/nitro-speech';
-RecognizerRef.prewarm({ locale: 'en-US' });
+RecognizerRef.prewarm({ locale: 'en-US' }, { requestPermission: true });
 RecognizerRef.startListening({ locale: 'en-US' });
 RecognizerRef.addAutoFinishTime(5000);
 RecognizerRef.resetAutoFinishTime();
@@ -407,7 +418,8 @@ SpeechRecognizer.onVolumeChange = (volume) => {
 SpeechRecognizer.prewarm({
   locale: 'en-US',
   // ... your config to prepare
-});
+}, { requestPermission: true });
+);
 // OR `await` if you want to react to the success
 await SpeechRecognizer.prewarm({
   locale: 'en-US',
@@ -418,7 +430,7 @@ scheduleOnRuntime(workletRuntime, () => {
   SpeechRecognizer.prewarm({
     locale: 'en-US',
     // ... your config to prepare
-  });
+  }, { requestPermission: false });
 });
 // Start listening

package/android/src/main/java/com/margelo/nitro/nitrospeech/recognizer/AudioPermissionRequester.kt CHANGED Viewed

@@ -6,6 +6,8 @@ import android.content.pm.PackageManager
 import androidx.activity.ComponentActivity
 import androidx.activity.result.contract.ActivityResultContracts
 import androidx.core.content.ContextCompat
+import kotlinx.coroutines.suspendCancellableCoroutine
+import kotlin.coroutines.resume
 class AudioPermissionRequester (
   private val activity: Activity
@@ -16,12 +18,13 @@ class AudioPermissionRequester (
   private var callback: ((Boolean) -> Unit)? = null
   private val launcher = componentActivity.activityResultRegistry.register(
-    "record_audio_key", ActivityResultContracts.RequestPermission()
+    "record_audio_key",
+    ActivityResultContracts.RequestPermission()
   ) { granted ->
     callback?.invoke(granted)
   }
-  fun checkAndRequest(onResult: (Boolean) -> Unit) {
+  suspend fun checkAndRequest(): Boolean {
     val audioGranted =
       ContextCompat.checkSelfPermission(
         activity,
@@ -29,11 +32,14 @@ class AudioPermissionRequester (
       ) == PackageManager.PERMISSION_GRANTED
     if (audioGranted) {
-      onResult(true)
-      return
+      return true
     }
-    callback = onResult
-    launcher.launch(recordAudioPermission)
+    return suspendCancellableCoroutine { cont ->
+      callback = { granted ->
+        if (cont.isActive) cont.resume(granted)
+      }
+      launcher.launch(recordAudioPermission)
+    }
   }
 }

package/android/src/main/java/com/margelo/nitro/nitrospeech/recognizer/HybridRecognizer.kt CHANGED Viewed

@@ -14,6 +14,7 @@ import com.margelo.nitro.core.Promise
 import com.margelo.nitro.nitrospeech.MutableSpeechRecognitionConfig
 import com.margelo.nitro.nitrospeech.HybridRecognizerSpec
 import com.margelo.nitro.nitrospeech.SpeechRecognitionConfig
+import com.margelo.nitro.nitrospeech.SpeechRecognitionPrewarm
 import com.margelo.nitro.nitrospeech.VolumeChangeEvent
 @DoNotStrip
@@ -43,52 +44,27 @@ class HybridRecognizer: HybridRecognizerSpec() {
   @DoNotStrip
   @Keep
-  override fun prewarm(defaultParams: SpeechRecognitionConfig?): Promise<Unit> {
-    // no-op
+  override fun prewarm(
+    defaultParams: SpeechRecognitionConfig?,
+    options: SpeechRecognitionPrewarm?
+  ): Promise<Unit> {
+    logger.log("prewarm called")
     // nothing to prewarm
-    return Promise()
+    // only request permissions
+    return Promise.async {
+      // Enabled by default for user
+      if (options?.requestPermission != false) {
+        preparePermissions(null, isPrewarm = true)
+      }
+    }
   }
   @DoNotStrip
   @Keep
   override fun startListening(params: SpeechRecognitionConfig?) {
     logger.log("startListening: $params")
-    if (isActive) {
-      onFinishRecognition(
-        null,
-        "Error at startListening: Previous SpeechRecognizer is still active",
-        false
-      )
-      return
-    }
-    val context = NitroModules.applicationContext
-    if (context == null) {
-      onFinishRecognition(
-        null,
-        "Error at startListening: Context not available",
-        true
-      )
-      return
-    }
-    val activity = context.currentActivity
-    if (activity == null) {
-      onFinishRecognition(
-        null,
-        "Error at startListening: Activity not found",
-        true
-      )
-      return
-    }
-    val permissionRequester = AudioPermissionRequester(activity)
-    permissionRequester.checkAndRequest { granted ->
-      if (!granted) {
-        onPermissionDenied?.invoke()
-        return@checkAndRequest
-      }
-      config = params
-      start(context)
+    Promise.async {
+      preparePermissions(params, isPrewarm = false)
     }
   }
@@ -196,6 +172,48 @@ class HybridRecognizer: HybridRecognizerSpec() {
     stopListening()
   }
+  private suspend fun preparePermissions(params: SpeechRecognitionConfig?, isPrewarm: Boolean) {
+    if (isActive) {
+      onFinishRecognition(
+        null,
+        "Error: SpeechRecognizer is already active",
+        false
+      )
+      return
+    }
+    val context = NitroModules.applicationContext
+    if (context == null) {
+      onFinishRecognition(
+        null,
+        "Error: Context not available",
+        true
+      )
+      return
+    }
+    val activity = context.currentActivity
+    if (activity == null) {
+      onFinishRecognition(
+        null,
+        "Error: Activity not found",
+        true
+      )
+      return
+    }
+    val permissionRequester = AudioPermissionRequester(activity)
+    val granted = permissionRequester.checkAndRequest()
+    if (!granted) {
+      onPermissionDenied?.invoke()
+      return
+    }
+    if (isPrewarm) {
+      return
+    }
+    config = params
+    start(context)
+  }
   private fun start(context: Context) {
     mainHandler.post {
       try {

package/ios/Audio/AudioLevelTracker.swift CHANGED Viewed

@@ -20,7 +20,7 @@ final class AudioLevelTracker {
     var currentSample: AudioLevelSample?
-    private let lg = Lg(prefix: "RecognizerEngine")
+    private let lg = Lg(prefix: "RecognizerEngine", disable: true)
     func reset() {
         smoothedLevel = 0

package/ios/Coordinator.swift CHANGED Viewed

@@ -39,6 +39,7 @@ final class Coordinator {
         }
         if params?.iosPreset == IosPreset.shortform
+        || params?.iosPreset == IosPreset.speed
         || params?.iosAddPunctuation == false
         || params?.iosAtypicalSpeech == true {
             // DictationTranscriber priority

package/ios/Engines/AnalyzerEngine.swift CHANGED Viewed

@@ -46,17 +46,21 @@ final class AnalyzerEngine: RecognizerEngine {
         }
     }
-    override func prewarm(for type: FailureType) async {
-        await super.prewarm(for: type)
+    override func prewarm(for type: PrewarmType, _ options: SpeechRecognitionPrewarm? = nil) async {
+        await super.prewarm(for: type, options)
         do {
             // Create transcriber and install assets
             try await transcriber.create(config: self.recognizerDelegate?.config)
         }
         catch {
+            let failureType: FailureType = switch type {
+                case .prewarm: .prewarm
+                case .start: .start
+            }
             self.reportFailure(
                 from: "prewarm.assets",
                 message: "Failed to create transcriber",
-                type: type
+                type: failureType
             )
         }
     }
@@ -93,7 +97,7 @@ final class AnalyzerEngine: RecognizerEngine {
                     self?.outputContinuation?.yield(buffer)
                 }
             )
-            guard let hardwareFormat else { return }
+            guard let hardwareFormat = recognizerDelegate?.hardwareFormat else { return }
             let stream = AsyncStream(
                 AVAudioPCMBuffer.self,
                 bufferingPolicy: .unbounded
@@ -225,14 +229,14 @@ final class AnalyzerEngine: RecognizerEngine {
         if !disableRepeatingFilter {
             newBatch = Utils.repeatingFilter(newBatch)
         }
-        Log.log("[1] lastBatch: \(self.resultBatches.last ?? "") | newBatch: \(newBatch)")
+//        Log.log("[1] lastBatch: \(self.resultBatches.last ?? "") | newBatch: \(newBatch)")
         if self.resultBatches.isEmpty {
             self.resultBatches.append(newBatch)
         } else if CMTimeGetSeconds(rangeStart) == self.lastBatchStartTime || isFinal {
-            Log.log("[2] replace, isFinal: \(isFinal)")
+//            Log.log("[2] replace, isFinal: \(isFinal)")
             self.resultBatches[self.resultBatches.count - 1] = newBatch
         } else {
-            Log.log("[2] add new batch")
+//            Log.log("[2] add new batch")
             self.resultBatches.append(newBatch)
         }
         self.lastBatchStartTime = CMTimeGetSeconds(rangeStart)

package/ios/Engines/RecognizerEngine.swift CHANGED Viewed

@@ -12,10 +12,14 @@ enum FailureType {
     case onSession
 }
+enum PrewarmType {
+    case start
+    case prewarm
+}
 class RecognizerEngine {
     var isActive = false
     var isStopping = false
-    var hardwareFormat: AVAudioFormat?
     weak var recognizerDelegate: RecognizerDelegate?
     private let audioLevelTracker = AudioLevelTracker()
@@ -33,21 +37,48 @@ class RecognizerEngine {
     // MARK: - Recognizer Methods
-    func prewarm(for: FailureType) async {
-        self.prepareAudioEngine()
+    func prewarm(for type: PrewarmType, _ options: SpeechRecognitionPrewarm? = nil) async {
+        // Prepare audioEngine
+        audioEngine = AVAudioEngine()
+        lg.log("[prewarm.audioEngine]")
+        guard let recognizerDelegate else { return }
+        // Everything is set, return early
+        if type == .prewarm, recognizerDelegate.hardwareFormat != nil {
+            lg.log("[prewarm.return]: Everything set")
+            return
+        }
+        // User explicitly asked for prewarm without requesting permissions, return early
+        if type == .prewarm, options?.requestPermission == false {
+            lg.log("[prewarm.return]: requestPermission: false")
+            return
+        }
+        if type == .prewarm {
+            // options.requestPermission is true by default
+            // Start Permission sequence
+            let granted = await requestPermissions()
+            if granted {
+                self.prewarmAudioSession(for: type)
+            }
+        } else {
+            self.prewarmAudioSession(for: type)
+        }
         // for SpeechTranscriber: .isAvailable and async assets
         // for Dictation: only async assets
         // for legacy SF: only sync .isAvailable
     }
-    func start() {
-        guard let recognizerDelegate, !isActive else { return }
+    func start() async {
+        guard !isActive else { return }
-        Permissions(
-            onGranted: self.startSession,
-            onDenied: recognizerDelegate.permissionDenied,
-            onError: recognizerDelegate.error
-        ).requestAuthorization()
+        let granted = await requestPermissions()
+        if granted {
+            await startSession()
+        }
     }
     func stop() {
@@ -56,6 +87,55 @@ class RecognizerEngine {
         HapticImpact.trigger(with: self.recognizerDelegate?.config?.stopHapticFeedbackStyle)
     }
+    func updateSession(
+        newConfig: MutableSpeechRecognitionConfig? = nil,
+        addMsToTimer: Double? = nil,
+        resetTimer: Bool? = nil
+    ) {
+        guard let recognizerDelegate, isActive, !isStopping else { return }
+        let currentConfig = recognizerDelegate.config
+        // Update AutoFinish time
+        if let newAutoFinish = newConfig?.autoFinishRecognitionMs,
+           newAutoFinish != currentConfig?.autoFinishRecognitionMs {
+            autoStopper?.updateThreshold(
+                newAutoFinish,
+                from: "updateSession"
+            )
+        }
+        // Update AutoFinish progress interval
+        if let newInterval = newConfig?.autoFinishProgressIntervalMs,
+           newInterval != currentConfig?.autoFinishProgressIntervalMs {
+            autoStopper?.updateProgressInterval(
+                newInterval,
+                from: "updateSession"
+            )
+        }
+        if let addMsToTimer {
+            // Add time to the timer once
+            autoStopper?.addMsOnce(
+                addMsToTimer,
+                from: "updateSession"
+            )
+        } else if resetTimer == true {
+            // Reset to current baseline threshold.
+            autoStopper?.resetTimer(from: "updateSession")
+        }
+        // Only update new non-nil values in the config
+        recognizerDelegate.softlyUpdateConfig(newConfig: newConfig)
+    }
+    func getVoiceInputVolume() -> VolumeChangeEvent? {
+        guard let currentSample = audioLevelTracker.currentSample else { return nil }
+        return VolumeChangeEvent(
+            smoothedVolume: currentSample.smoothed,
+            rawVolume: currentSample.raw,
+            db: currentSample.db
+        )
+    }
+    // MARK: Helpers
     func startSession() async {
         lg.log("[startSession.startSession]")
         // Init everything
@@ -66,15 +146,13 @@ class RecognizerEngine {
         lg.log("[startSession.initAutoStop]")
         startAppStateObserver()
         lg.log("[startSession.startAppStateObserver]")
-        startAudioSession()
-        lg.log("[startSession.startAudioSession]")
     }
     func startAudioEngine(
         onBuffer: @escaping (AVAudioPCMBuffer) -> Void
     ) {
         lg.log("[startAudioEngine]")
-        guard let audioEngine, let hardwareFormat else { return }
+        guard let audioEngine, let hardwareFormat = self.recognizerDelegate?.hardwareFormat else { return }
         audioEngine.inputNode.installTap(
             onBus: 0,
             bufferSize: 1024,
@@ -124,53 +202,6 @@ class RecognizerEngine {
         recognizerDelegate.readyForSpeech()
         recognizerDelegate.result(batches: [])
     }
-    func updateSession(
-        newConfig: MutableSpeechRecognitionConfig? = nil,
-        addMsToTimer: Double? = nil,
-        resetTimer: Bool? = nil
-    ) {
-        guard let recognizerDelegate, isActive, !isStopping else { return }
-        let currentConfig = recognizerDelegate.config
-        // Update AutoFinish time
-        if let newAutoFinish = newConfig?.autoFinishRecognitionMs,
-           newAutoFinish != currentConfig?.autoFinishRecognitionMs {
-            autoStopper?.updateThreshold(
-                newAutoFinish,
-                from: "updateSession"
-            )
-        }
-        // Update AutoFinish progress interval
-        if let newInterval = newConfig?.autoFinishProgressIntervalMs,
-           newInterval != currentConfig?.autoFinishProgressIntervalMs {
-            autoStopper?.updateProgressInterval(
-                newInterval,
-                from: "updateSession"
-            )
-        }
-        if let addMsToTimer {
-            // Add time to the timer once
-            autoStopper?.addMsOnce(
-                addMsToTimer,
-                from: "updateSession"
-            )
-        } else if resetTimer == true {
-            // Reset to current baseline threshold.
-            autoStopper?.resetTimer(from: "updateSession")
-        }
-        // Only update new non-nil values in the config
-        recognizerDelegate.softlyUpdateConfig(newConfig: newConfig)
-    }
-    func getVoiceInputVolume() -> VolumeChangeEvent? {
-        guard let currentSample = audioLevelTracker.currentSample else { return nil }
-        return VolumeChangeEvent(
-            smoothedVolume: currentSample.smoothed,
-            rawVolume: currentSample.raw,
-            db: currentSample.db
-        )
-    }
     func cleanup(from: String) {
         lg.log("[cleanup]: \(from)")
@@ -226,29 +257,32 @@ class RecognizerEngine {
         }
     }
-    // MARK: - AudioEngine heavy prepare
+    // MARK: Permissions
-    private func prepareAudioEngine() {
-        lg.log("[prewarm.start]")
-        audioEngine = AVAudioEngine()
-        guard let audioEngine else {
-            self.reportFailure(
-                from: "Audio Engine",
-                message: "Audio Engine failed to initiate",
-                // RecognizerEngine-agnostic Error
-                type: .system
-            )
-            return
+    private func requestPermissions() async -> Bool {
+        guard let recognizerDelegate else { return false }
+        let authStatus = await Permissions.requestAuthorization()
+        if authStatus == .denied || authStatus == .restricted {
+            recognizerDelegate.permissionDenied()
+            return false
         }
-        lg.log("[prewarm.audioEngine]")
-        // heavy first hardwareFormat retrieval
-        if hardwareFormat == nil {
-            hardwareFormat = audioEngine.inputNode.outputFormat(forBus: 0)
-            lg.log("[prewarm.hardwareFormat]")
+        if authStatus != .authorized {
+            // .notDetermined or unknown issue
+            recognizerDelegate.error(message: "Speech recognition permission is not determined")
+            return false
+        }
+        if !(await Permissions.requestMicrophonePermission()) {
+            recognizerDelegate.permissionDenied()
+            return false
         }
+        // Granted
+        return true
     }
-    // MARK: - AutoStopper
+    // MARK: Auto Stopper
     private func initAutoStop() {
         let config = self.recognizerDelegate?.config
@@ -271,7 +305,7 @@ class RecognizerEngine {
         autoStopper = nil
     }
-    // MARK: - App State Observer
+    // MARK: App State Observer
     private func startAppStateObserver() {
         appStateObserver = AppStateObserver { [weak self] in
@@ -285,12 +319,37 @@ class RecognizerEngine {
         appStateObserver = nil
     }
-    // MARK: - Audio Session
+    // MARK: Audio Session
+    private func prewarmAudioSession(for type: PrewarmType) {
+        guard let audioEngine else {
+            self.reportFailure(
+                from: "Audio Engine",
+                message: "Audio Engine failed to initiate",
+                // RecognizerEngine-agnostic Error
+                type: .system
+            )
+            return
+        }
+        startAudioSession()
+        lg.log("[prewarmAudioSession.audioSession]")
+        // heavy first hardwareFormat retrieval
+        if let recognizerDelegate, recognizerDelegate.hardwareFormat == nil {
+            let format = audioEngine.inputNode.outputFormat(forBus: 0)
+            recognizerDelegate.setHardwareFormat(format: format)
+            lg.log("[prewarmAudioSession.hardwareFormat]")
+        }
+        if type == .prewarm {
+            stopAudioSession()
+            lg.log("[prewarmAudioSession.stopAudioSession]")
+        }
+    }
     private func startAudioSession() {
         do {
             let audioSession = AVAudioSession.sharedInstance()
-            try audioSession.setCategory(.record, mode: .measurement, options: .duckOthers)
+            try audioSession.setCategory(.playAndRecord, mode: .default, options: .duckOthers)
             // Required for haptic feedback
             try audioSession.setAllowHapticsAndSystemSoundsDuringRecording(true)
             try audioSession.setActive(true, options: .notifyOthersOnDeactivation)
@@ -305,7 +364,6 @@ class RecognizerEngine {
     }
     private func stopAudioSession() {
         do {
-            // TODO: check unduck
             try AVAudioSession.sharedInstance().setActive(false)
         } catch {
             // Just log and no-op - not critical

package/ios/Engines/SFSpeechEngine.swift CHANGED Viewed

@@ -15,18 +15,22 @@ final class SFSpeechEngine: RecognizerEngine {
         recognitionTask?.finish()
     }
-    override func prewarm(for type: FailureType) async {
+    override func prewarm(for type: PrewarmType, _ options: SpeechRecognitionPrewarm? = nil) async {
         speechRecognizer = SFSpeechRecognizer(
             locale: Locale(identifier: self.recognizerDelegate?.config?.locale ?? "en-US")
         )
         if speechRecognizer?.isAvailable != true {
+            let failureType: FailureType = switch type {
+                case .prewarm: .prewarm
+                case .start: .start
+            }
             self.reportFailure(
                 from: "prewarm",
                 message: "SFSpeechRecognizer is not available",
-                type: type
+                type: failureType
             )
         }
-        await super.prewarm(for: type)
+        await super.prewarm(for: type, options)
     }
     override func startSession() async {