react-native-davoice-tts 1.0.291 → 1.0.293

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (22) hide show
  1. package/TTSRNBridge.podspec +1 -1
  2. package/android/libs/com/davoice/tts/1.0.0/tts-1.0.0.aar +0 -0
  3. package/android/libs/com/davoice/tts/1.0.0/tts-1.0.0.aar.md5 +1 -1
  4. package/android/libs/com/davoice/tts/1.0.0/tts-1.0.0.aar.sha1 +1 -1
  5. package/ios/TTSRNBridge/DavoiceTTS.xcframework/Info.plist +5 -5
  6. package/ios/TTSRNBridge/DavoiceTTS.xcframework/{ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift.ofer → ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift.AEC.CRASH.ETC} +663 -125
  7. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DavoiceTTS +0 -0
  8. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.abi.json +4945 -4931
  9. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.private.swiftinterface +12 -12
  10. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.swiftinterface +12 -12
  11. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift.AEC.CRASH.ETC +2853 -0
  12. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DavoiceTTS +0 -0
  13. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.abi.json +8306 -8292
  14. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.private.swiftinterface +83 -83
  15. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.swiftinterface +83 -83
  16. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.abi.json +8306 -8292
  17. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.private.swiftinterface +83 -83
  18. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.swiftinterface +83 -83
  19. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeDirectory +0 -0
  20. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeRequirements-1 +0 -0
  21. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeResources +29 -44
  22. package/package.json +1 -1
@@ -18,6 +18,22 @@ public final class STT: NSObject, SFSpeechRecognizerDelegate {
18
18
 
19
19
  // Global AEC toggle (default ON to keep existing behavior)
20
20
  public var aecEnabled: Bool = true
21
+ // If true, force VP/AEC ON for a short window after session activation while routes settle.
22
+ public var forceAECDuringRouteWarmup: Bool = true
23
+ public var aecRouteWarmupSeconds: Double = 20.0
24
+ // If true, always request 16k input sample rate from AVAudioSession.
25
+ // iOS may still override this depending on route / voice processing constraints.
26
+ public var force16kMicSampleRate: Bool = false
27
+ // If true, use old SV gate behavior (immediate open/close + full pre-roll flush).
28
+ public var useLegacySpeakerGateBehavior: Bool = false
29
+ // If true, keep gate open for a short hangover after the last positive match.
30
+ public var useSpeakerGateHangover: Bool = true
31
+ public var speakerGateHangoverSeconds: Double = 0.40
32
+ // If true, override SV tailSeconds to 0.5s for faster switching tests.
33
+ public var useShortSpeakerVerificationTailWindow: Bool = true
34
+ public var shortSpeakerVerificationTailSeconds: Float = 0.5
35
+ // In protected mode, flush only this much recent pre-roll when gate reopens.
36
+ public var speakerPreRollFlushMaxSeconds: Double = 0.5
21
37
 
22
38
  // MARK: - Private
23
39
  private var speechRecognizer: SFSpeechRecognizer?
@@ -25,6 +41,9 @@ public final class STT: NSObject, SFSpeechRecognizerDelegate {
25
41
  private var audioEngine: AVAudioEngine?
26
42
  private var recognitionTask: SFSpeechRecognitionTask?
27
43
  private var audioSession: AVAudioSession?
44
+ private let aecSessionActivationLock = NSLock()
45
+ private var lastAECSessionActivationAt: CFTimeInterval = 0
46
+ private var aecSessionIsActive: Bool = false
28
47
  private var isTearingDown: Bool = false
29
48
  private var sessionId: String?
30
49
  private var priorAudioCategory: AVAudioSession.Category?
@@ -32,8 +51,7 @@ public final class STT: NSObject, SFSpeechRecognizerDelegate {
32
51
  private var averagePowerForChannel1: Float = 0
33
52
  // Add to STT
34
53
  private var isAdjustingRoute = false
35
- private var lastRouteTune: CFAbsoluteTime = 0
36
- private let routeTuneCooldown: CFTimeInterval = 0.5
54
+ private var lastRouteSignature: String = ""
37
55
 
38
56
  private var playbackNode: AVAudioPlayerNode?
39
57
  private var seenRealSpeech = false // flips true after first non-blank token
@@ -75,11 +93,8 @@ public final class STT: NSObject, SFSpeechRecognizerDelegate {
75
93
 
76
94
  private(set) var sttActive = false
77
95
  // STT.swift (add near `private var playbackNode: AVAudioPlayerNode?`)
78
- private var ttsEQ: AVAudioUnitEQ?
96
+ // private var ttsEQ: AVAudioUnitEQ?
79
97
 
80
- // partial cadence monitor
81
- private var emaPartialGap: Double = 0 // exponential moving average of time between partials
82
- private let emaAlpha: Double = 0.3
83
98
  // Add near your other state:
84
99
  private var ioLatchActiveGen: UInt64 = 0
85
100
 
@@ -105,11 +120,45 @@ public final class STT: NSObject, SFSpeechRecognizerDelegate {
105
120
  private var activeTaskGen: UInt64 = 0
106
121
  private var micPaused: Bool = false
107
122
 
123
+ // --- Optional speaker verification gate ---
124
+ private struct SpeakerVerificationStartConfig {
125
+ let enrollment: SpeakerEnrollment
126
+ let config: SpeakerVerificationConfig
127
+ }
128
+ private let speakerVerificationQueue = DispatchQueue(label: "stt.sv.queue")
129
+ private let speakerVerificationStateLock = NSLock()
130
+ private var speakerVerificationStartConfig: SpeakerVerificationStartConfig?
131
+ private var speakerVerificationEngine: SpeakerVerificationEngine?
132
+ private var speakerVerificationFrameSize: Int = 0
133
+ private var speakerVerificationInputBuffer: [Float] = []
134
+ private var speakerGateOpen: Bool = true
135
+ private var speakerGateEnabled: Bool = false
136
+ private var speakerVerificationErrorSent: Bool = false
137
+ private var speakerPreRollBuffers: [AVAudioPCMBuffer] = []
138
+ private var speakerPreRollFrames: Int = 0
139
+ private var speakerPreRollMaxFrames: Int = 0
140
+ private var speakerPendingPreRollFlush: Bool = false
141
+ private let speakerPreRollSeconds: Double = 1.0
142
+ private var speakerVerificationThreshold: Float = 0
143
+ private var speakerVerificationFrameSeq: UInt64 = 0
144
+ private var speakerVerificationSourceSampleRate: Int = 0
145
+ private var speakerVerificationTargetSampleRate: Int = 0
146
+ private var speakerVerificationResampleCarry: [Float] = []
147
+ private var speakerVerificationResamplePos: Double = 0
148
+ private var speakerLastPositiveMatchAt: CFTimeInterval = 0
149
+
108
150
  // --- Speech recognition lite pause (counter-based) ---
109
151
  private let speechPauseLock = NSLock()
110
152
  private var speechRecognitionPauseCount: Int = 0
111
153
  private var speechRecognitionPaused: Bool = false
112
154
  @inline(__always)
155
+ private func isSpeechRecognitionLitePaused() -> Bool { // Returns the current value of speechRecognitionPaused, read while holding speechPauseLock.
156
+ speechPauseLock.lock()
157
+ let paused = speechRecognitionPaused // copy the flag so it can be returned after the lock is released
158
+ speechPauseLock.unlock()
159
+ return paused
160
+ }
161
+ @inline(__always)
113
162
  private func resetSpeechRecognitionLitePauseState(_ why: String) {
114
163
  speechPauseLock.lock()
115
164
  speechRecognitionPauseCount = 0
@@ -394,10 +443,6 @@ private func pollOnMain(timeoutSec: TimeInterval,
394
443
  }
395
444
  micPaused = true
396
445
 
397
- // ✅ CRITICAL: invalidate any pending async tap installs/probes/latches.
398
- // This prevents closures scheduled earlier from touching nodes after engine is torn down.
399
- bumpGraphGen()
400
-
401
446
  let session = AVAudioSession.sharedInstance()
402
447
 
403
448
  // Save current session config (so we can restore on unpause)
@@ -459,6 +504,7 @@ private func pollOnMain(timeoutSec: TimeInterval,
459
504
  // Use this if we ever have duck others
460
505
  // try session.setActive(false, options: [.notifyOthersOnDeactivation])
461
506
  try session.setActive(false, options: [])
507
+ markAECSessionActivation(false, reason: "pauseMicrophone-pre")
462
508
  NSLog("[STT] pauseMicrophone(): setActive false")
463
509
  } catch {
464
510
  NSLog("[STT] pauseMicrophone(): failed to switch setActive false: \(error.localizedDescription)")
@@ -473,6 +519,7 @@ private func pollOnMain(timeoutSec: TimeInterval,
473
519
  // Switch to playback-only session so iOS releases the mic (indicator off)
474
520
  do {
475
521
  try session.setActive(true, options: [])
522
+ markAECSessionActivation(true, reason: "pauseMicrophone-playback")
476
523
  NSLog("[STT] pauseMicrophone(): session set to .playback (mic released)")
477
524
  } catch {
478
525
  NSLog("[STT] pauseMicrophone(): failed to switch to session.setActive with .playback: \(error.localizedDescription)")
@@ -494,6 +541,7 @@ private func pollOnMain(timeoutSec: TimeInterval,
494
541
  // Restore previous session category/mode/options and IO prefs
495
542
  do {
496
543
  try session.setActive(false, options: [.notifyOthersOnDeactivation])
544
+ markAECSessionActivation(false, reason: "unPauseMicrophone-pre")
497
545
  } catch {
498
546
  NSLog("[STT] unPauseMicrophone: setActive(false) failed: \(error.localizedDescription)")
499
547
  }
@@ -521,6 +569,7 @@ private func pollOnMain(timeoutSec: TimeInterval,
521
569
 
522
570
  do {
523
571
  try session.setActive(true, options: [])
572
+ markAECSessionActivation(true, reason: "unPauseMicrophone")
524
573
  } catch {
525
574
  NSLog("[STT] unPauseMicrophone: setActive(true) failed: \(error.localizedDescription)")
526
575
  }
@@ -613,32 +662,11 @@ private func pollOnMain(timeoutSec: TimeInterval,
613
662
  return true
614
663
  }
615
664
 
665
+
616
666
  @inline(__always)
617
667
  private func safeRemoveTap(_ node: AVAudioNode?, bus: AVAudioNodeBus = 0) { // Removes the tap on `bus` from `node`; does nothing when node is nil or not attached to an engine.
618
- guard let n = node else { return }
619
- // MUST be called on main to avoid races with teardown/rebuild
620
- if !Thread.isMainThread {
621
- DispatchQueue.main.async { [weak self] in self?.safeRemoveTap(n, bus: bus) }
622
- return
623
- }
624
- guard n.engine != nil else { return }
625
- n.removeTap(onBus: bus)
626
- }
627
-
628
- @inline(__always)
629
- private func safeInstallTap(_ node: AVAudioNode?,
630
- bus: AVAudioNodeBus = 0,
631
- bufferSize: AVAudioFrameCount = 128,
632
- format: AVAudioFormat? = nil,
633
- block: @escaping AVAudioNodeTapBlock) {
634
- guard let n = node else { return }
635
- // MUST be called on main to avoid races with teardown/rebuild
636
- if !Thread.isMainThread {
637
- DispatchQueue.main.async { [weak self] in self?.safeInstallTap(n, bus: bus, bufferSize: bufferSize, format: format, block: block) }
638
- return
639
- }
640
- guard n.engine != nil else { return }
641
- n.installTap(onBus: bus, bufferSize: bufferSize, format: format, block: block)
668
+ guard let n = node, n.engine != nil else { return } // only remove if still attached
669
+ try? n.removeTap(onBus: bus) // NOTE(review): removeTap(onBus:) appears to be non-throwing, so `try?` would be a no-op here — confirm. NOTE(review): the removed version above hopped to the main thread first; this one runs on the caller's thread — confirm every caller is already on main.
642
670
  }
643
671
 
644
672
  // MARK: - Public API (native replacements for the former RCT methods)
@@ -668,7 +696,7 @@ private func pollOnMain(timeoutSec: TimeInterval,
668
696
  // >>> IMPORTANT: ensure no previous tap is left behind
669
697
  self.safeRemoveTap(out, bus: 0)
670
698
 
671
- self.safeInstallTap(out, bus: 0, bufferSize: 128, format: nil) { [weak self, weak out] _, _ in
699
+ out.installTap(onBus: 0, bufferSize: 128, format: nil) { [weak self, weak out] _, _ in
672
700
  guard let self = self, gen == self.graphGen else { return }
673
701
  if fired { return }
674
702
  fired = true
@@ -758,6 +786,10 @@ private func pollOnMain(timeoutSec: TimeInterval,
758
786
  NSLog("[STT] rearmTask(\(reason)) suppressed (micPaused)")
759
787
  return
760
788
  }
789
+ if isSpeechRecognitionLitePaused() {
790
+ NSLog("[STT] rearmTask(\(reason)) suppressed (speechRecognitionPaused)")
791
+ return
792
+ }
761
793
 
762
794
  // -----------------
763
795
  recognitionTask?.cancel()
@@ -774,6 +806,10 @@ private func checkTaskHealth() {
774
806
  NSLog("[STT] watchdog: isTearingDown || isTelephonyInterrupted || isRecoveringAfterTelephony -- DOING NOTHING")
775
807
  return
776
808
  }
809
+ if isSpeechRecognitionLitePaused() {
810
+ NSLog("[STT] watchdog: speechRecognitionPaused -- DOING NOTHING")
811
+ return
812
+ }
777
813
  if micPaused {
778
814
  NSLog("[STT] watchdog: micPaused -- DOING NOTHING")
779
815
  return
@@ -848,8 +884,26 @@ private func checkTaskHealth() {
848
884
  }
849
885
 
850
886
  public func startSpeech(localeStr: String?) { // Starts recognition for the given locale with no speaker-verification config (passes nil).
851
- NSLog("[STT] startSpeech(locale=\(localeStr ?? "nil"))")
852
- lastLocaleStr = localeStr!
887
+ startSpeechInternal(localeStr: localeStr, speakerVerificationConfig: nil)
888
+ }
889
+
890
+ public func startSpeech(localeStr: String?, onboardingJsonPath: String) { // Starts recognition with a speaker-verification config loaded from onboardingJsonPath.
891
+ do {
892
+ let loaded = try loadSpeakerVerificationStartConfig(onboardingJsonPath: onboardingJsonPath)
893
+ startSpeechInternal(localeStr: localeStr, speakerVerificationConfig: loaded)
894
+ } catch { // loading failed: report the error through sendResult; startSpeechInternal is not called
895
+ sendResult(error: ["message": "Failed to load onboarding JSON: \(error.localizedDescription)"],
896
+ bestTranscription: nil,
897
+ transcriptions: nil,
898
+ isFinal: nil)
899
+ }
900
+ }
901
+
902
+ private func startSpeechInternal(localeStr: String?,
903
+ speakerVerificationConfig: SpeakerVerificationStartConfig?) {
904
+ NSLog("[STT] startSpeech(locale=\(localeStr ?? "nil"), sv=\(speakerVerificationConfig == nil ? "off" : "on"))")
905
+ lastLocaleStr = localeStr ?? ""
906
+ speakerVerificationStartConfig = speakerVerificationConfig
853
907
  if recognitionTask != nil {
854
908
  sendResult(error: ["code": "already_started", "message": "Speech recognition already started!"],
855
909
  bestTranscription: nil, transcriptions: nil, isFinal: nil)
@@ -960,8 +1014,13 @@ private func checkTaskHealth() {
960
1014
  catch { NSLog("[STT] setCategory failed: \(error.localizedDescription)") }
961
1015
  }
962
1016
 
963
- do { try s.setActive(true, options: []) }
964
- catch { NSLog("[STT] setActive failed: \(error.localizedDescription)") }
1017
+ do {
1018
+ try s.setActive(true, options: [])
1019
+ self.markAECSessionActivation(true, reason: "updateSessionRouting")
1020
+ } catch {
1021
+ NSLog("[STT] setActive failed: \(error.localizedDescription)")
1022
+ self.markAECSessionActivation(false, reason: "updateSessionRouting-failed")
1023
+ }
965
1024
 
966
1025
  // Optional: force 16k after activation
967
1026
  self.force16kIfPossible(s)
@@ -978,6 +1037,10 @@ private func checkTaskHealth() {
978
1037
 
979
1038
  // ↓↓↓ preferred settings helper
980
1039
  private func force16kIfPossible(_ session: AVAudioSession) {
1040
+ if force16kMicSampleRate {
1041
+ try? session.setPreferredSampleRate(16_000)
1042
+ }
1043
+
981
1044
  let hasExternalOutput = session.currentRoute.outputs.contains {
982
1045
  switch $0.portType {
983
1046
  case .headphones, .bluetoothA2DP, .bluetoothHFP, .bluetoothLE, .airPlay, .carAudio, .usbAudio:
@@ -1004,6 +1067,27 @@ private func checkTaskHealth() {
1004
1067
  try? session.setPreferredIOBufferDuration(0.02)
1005
1068
  }
1006
1069
 
1070
+ private func markAECSessionActivation(_ active: Bool, reason: String) { // Records whether the audio session is active and, when active, the activation time; `reason` is used only in the log line.
1071
+ let now = CACurrentMediaTime()
1072
+ aecSessionActivationLock.lock() // both fields below are written under this lock
1073
+ aecSessionIsActive = active
1074
+ if active { lastAECSessionActivationAt = now }
1075
+ else { lastAECSessionActivationAt = 0 } // 0 means "no activation time"; isInAECRouteWarmupWindow() requires a value > 0
1076
+ aecSessionActivationLock.unlock()
1077
+ NSLog("[STT] AEC session activation(\(reason)): active=\(active ? "YES" : "NO") t=\(String(format: "%.3f", now))")
1078
+ }
1079
+
1080
+ private func isInAECRouteWarmupWindow() -> Bool { // True while fewer than aecRouteWarmupSeconds have elapsed since the last recorded session activation.
1081
+ guard aecEnabled, forceAECDuringRouteWarmup, aecRouteWarmupSeconds > 0 else { return false } // feature off, or window length is not positive
1082
+ let now = CACurrentMediaTime()
1083
+ aecSessionActivationLock.lock()
1084
+ let isActive = aecSessionIsActive // snapshot both fields under the lock
1085
+ let lastActiveAt = lastAECSessionActivationAt
1086
+ aecSessionActivationLock.unlock()
1087
+ guard isActive, lastActiveAt > 0 else { return false } // no active session or no activation timestamp recorded
1088
+ return (now - lastActiveAt) < aecRouteWarmupSeconds
1089
+ }
1090
+
1007
1091
  // MARK: - Core logic (kept intact, including AEC order/steps)
1008
1092
 
1009
1093
  /// Returns true if no errors occurred (identical flow & calls as ObjC).
@@ -1016,6 +1100,7 @@ private func checkTaskHealth() {
1016
1100
 
1017
1101
  do { try session.setActive(false, options: [.notifyOthersOnDeactivation]) }
1018
1102
  catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
1103
+ markAECSessionActivation(false, reason: "setupAudioSession-pre")
1019
1104
 
1020
1105
  // Build options to match our routing rules
1021
1106
  // (defaultToSpeaker only when no external output is active)
@@ -1040,10 +1125,17 @@ private func checkTaskHealth() {
1040
1125
 
1041
1126
  do { try session.setActive(false, options: [.notifyOthersOnDeactivation]) }
1042
1127
  catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
1128
+ markAECSessionActivation(false, reason: "setupAudioSession-reconfigure")
1043
1129
 
1044
1130
  // Force 16k before and after activation (some routes settle only after setActive)
1045
1131
  force16kIfPossible(session)
1046
- do { try session.setActive(true) } catch { err = error as NSError }
1132
+ do {
1133
+ try session.setActive(true)
1134
+ markAECSessionActivation(true, reason: "setupAudioSession")
1135
+ } catch {
1136
+ err = error as NSError
1137
+ markAECSessionActivation(false, reason: "setupAudioSession-failed")
1138
+ }
1047
1139
  NSLog("[STT] session SR=%.1f inCh=%d outCh=%d (wanted 16000)",
1048
1140
  session.sampleRate,
1049
1141
  Int(session.inputNumberOfChannels),
@@ -1061,6 +1153,65 @@ private func checkTaskHealth() {
1061
1153
  return true
1062
1154
  }
1063
1155
 
1156
+ private func shouldUseVoiceProcessingForCurrentRoute() -> Bool { // Decides whether voice processing should be enabled: never when aecEnabled is false, always inside the warm-up window, otherwise only on built-in speaker + built-in mic.
1157
+ guard aecEnabled else { return false }
1158
+ if isInAECRouteWarmupWindow() { return true } // the warm-up window overrides the route check below
1159
+ let s = AVAudioSession.sharedInstance()
1160
+ let speakerRoute = s.currentRoute.outputs.contains { $0.portType == .builtInSpeaker }
1161
+ let usingBuiltInMic = (s.preferredInput?.portType == .builtInMic) || // either the preferred input or the first current input is the built-in mic
1162
+ (s.currentRoute.inputs.first?.portType == .builtInMic)
1163
+ return speakerRoute && usingBuiltInMic
1164
+ }
1165
+
1166
+ private func configureVoiceProcessingDucking(_ inputNode: AVAudioInputNode) { // Applies a ducking configuration (advanced ducking off, level .min) to inputNode; does nothing below iOS 17.
1167
+ if #available(iOS 17.0, *) {
1168
+ var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
1169
+ duck.enableAdvancedDucking = false
1170
+ duck.duckingLevel = .min
1171
+ inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
1172
+ }
1173
+ }
1174
+
1175
+ private func reconcileAEC(on engine: AVAudioEngine?, reason: String, allowRebuild: Bool = true) { // Brings the engine input node's voice-processing state in line with shouldUseVoiceProcessingForCurrentRoute(); `reason` is used in logs and in the rebuild reason string.
1176
+ guard let engine = engine else { return } // nothing to reconcile without an engine
1177
+ let inputNode = engine.inputNode
1178
+ let desiredVP = shouldUseVoiceProcessingForCurrentRoute()
1179
+
1180
+ if #available(iOS 13.0, *) {
1181
+ let currentVP = inputNode.isVoiceProcessingEnabled
1182
+ if currentVP == desiredVP { // already in the desired state: only re-apply ducking (when ON), log, and return
1183
+ if desiredVP { configureVoiceProcessingDucking(inputNode) }
1184
+ NSLog("[STT] AEC reconcile(\(reason)): unchanged vp=\(currentVP ? "ON" : "OFF")")
1185
+ return
1186
+ }
1187
+ }
1188
+
1189
+ do {
1190
+ try inputNode.setVoiceProcessingEnabled(desiredVP)
1191
+ if desiredVP { configureVoiceProcessingDucking(inputNode) }
1192
+ NSLog("[STT] AEC reconcile(\(reason)): set vp=\(desiredVP ? "ON" : "OFF")")
1193
+ } catch {
1194
+ NSLog("[STT] AEC reconcile(\(reason)) failed: \(error.localizedDescription)")
1195
+ if allowRebuild && sttActive && !isTearingDown && !micPaused && // rebuild only when the caller allows it, STT is active, and no teardown / mic pause / telephony state is in effect
1196
+ !isTelephonyInterrupted && !isRecoveringAfterTelephony {
1197
+ rebuildEngineGraphAndRestart(reason: "aec-reconcile-\(reason)")
1198
+ }
1199
+ }
1200
+ }
1201
+
1202
+ private func scheduleAECReconcileRetries(reason: String, // Schedules `attempts` delayed reconcileAEC calls on the main queue, spaced stepSec apart.
1203
+ attempts: Int = 3,
1204
+ stepSec: TimeInterval = 0.20) {
1205
+ guard attempts > 0 else { return } // also keeps the 1...attempts range below valid
1206
+ for i in 1...attempts {
1207
+ DispatchQueue.main.asyncAfter(deadline: .now() + stepSec * Double(i)) { [weak self] in // retry i fires i * stepSec from now
1208
+ guard let self = self else { return }
1209
+ if self.isTearingDown || self.micPaused || self.isTelephonyInterrupted { return } // state is re-checked when the retry fires, not when it was scheduled
1210
+ self.reconcileAEC(on: self.audioEngine, reason: "\(reason)-retry\(i)", allowRebuild: false) // retries pass allowRebuild: false, so a failed retry does not trigger a rebuild
1211
+ }
1212
+ }
1213
+ }
1214
+
1064
1215
  private func currentInputFormat(_ engine: AVAudioEngine) -> AVAudioFormat? {
1065
1216
  // Prefer whatever CoreAudio currently provides; avoid cached formats.
1066
1217
  let fmt = engine.inputNode.outputFormat(forBus: 0)
@@ -1097,22 +1248,8 @@ private func recoverAfterTelephonyInterruption() {
1097
1248
  _ = setupAudioSession() // ✅ ensures correct mode/options
1098
1249
  forceSpeakerIfReceiver("recoverAfterTelephony") // ✅ receiver -> speaker now
1099
1250
 
1100
- let s = AVAudioSession.sharedInstance()
1101
- let speakerRoute = s.currentRoute.outputs.contains { $0.portType == .builtInSpeaker }
1102
- let usingBuiltInMic = (s.preferredInput?.portType == .builtInMic) ||
1103
- (s.currentRoute.inputs.first?.portType == .builtInMic)
1104
- let willUseVP = speakerRoute && usingBuiltInMic
1105
-
1106
1251
  let inputNode = eng.inputNode
1107
- if aecEnabled && willUseVP {
1108
- try? inputNode.setVoiceProcessingEnabled(true)
1109
- if #available(iOS 17.0, *) {
1110
- var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
1111
- duck.enableAdvancedDucking = false
1112
- duck.duckingLevel = .min
1113
- inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
1114
- }
1115
- }
1252
+ reconcileAEC(on: eng, reason: "recover-after-telephony-prestart", allowRebuild: false)
1116
1253
 
1117
1254
  eng.reset()
1118
1255
  let micMixer = AVAudioMixerNode()
@@ -1131,6 +1268,8 @@ private func recoverAfterTelephonyInterruption() {
1131
1268
  try eng.start()
1132
1269
  armFirstIOCycleLatch(on: eng)
1133
1270
  tryClearCaptureLossAfterStartSucceeded()
1271
+ reconcileAEC(on: eng, reason: "recover-after-telephony-poststart", allowRebuild: false)
1272
+ scheduleAECReconcileRetries(reason: "recover-after-telephony")
1134
1273
  } catch {
1135
1274
  NSLog("[STT] recover: engine.start failed → will let watchdog retry: \(error)")
1136
1275
  return
@@ -1176,7 +1315,11 @@ private func recoverAfterTelephonyInterruption() {
1176
1315
  return []
1177
1316
  }
1178
1317
  do {
1179
- let contents = try String(contentsOfFile: filePath, encoding: .utf8)
1318
+ var contents = try String(contentsOfFile: filePath, encoding: .utf8)
1319
+ // ✅ MIN FIX: remove UTF-8 BOM if present (often only affects the first token)
1320
+ if contents.unicodeScalars.first == "\u{FEFF}" {
1321
+ contents.unicodeScalars.removeFirst()
1322
+ }
1180
1323
  let rawItems = contents.components(separatedBy: ",")
1181
1324
  var cleaned: [String] = []
1182
1325
  cleaned.reserveCapacity(rawItems.count)
@@ -1210,13 +1353,20 @@ private func recoverAfterTelephonyInterruption() {
1210
1353
  if #available(iOS 16, *) { req.addsPunctuation = true }
1211
1354
  req.shouldReportPartialResults = true
1212
1355
  //if #available(iOS 13.0, *) { req.taskHint = .dictation }
1213
- req.contextualStrings = loadContextualStrings()
1356
+ let cs: [String] = loadContextualStrings()
1357
+ req.contextualStrings = cs
1358
+ NSLog("[STT] makeFreshRequest contextualStrings count=\(cs.count) sample=\(cs.prefix(10)) file=\(Bundle.main.path(forResource: "words_flattened", ofType: "txt") ?? "nil")")
1359
+
1214
1360
  self.recognitionRequest = req
1215
- NSLog("makeFreshRequest()")
1361
+ NSLog("[STT] makeFreshRequest()")
1216
1362
  return req
1217
1363
  }
1218
1364
 
1219
1365
  private func startTask(_ req: SFSpeechAudioBufferRecognitionRequest) {
1366
+ if isSpeechRecognitionLitePaused() {
1367
+ NSLog("[STT] startTask suppressed (speechRecognitionPaused)")
1368
+ return
1369
+ }
1220
1370
  NSLog("starting recognitionTask")
1221
1371
  lastTaskStartAt = CACurrentMediaTime()
1222
1372
  lastResultAt = lastTaskStartAt
@@ -1240,14 +1390,17 @@ private func recoverAfterTelephonyInterruption() {
1240
1390
 
1241
1391
  func markIfReal(_ r: SFSpeechRecognitionResult?) {
1242
1392
  guard let r = r else { return }
1243
- let best = r.bestTranscription.formattedString.trimmingCharacters(in: .whitespacesAndNewlines)
1244
- if !best.isEmpty ||
1245
- r.transcriptions.contains(where: { !$0.formattedString.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }) {
1246
- if !self.seenRealSpeech {
1247
- self.seenRealSpeech = true
1248
- NSLog("first real speech detected -> onSpeechStart to JS")
1249
- self.sendEvent(name: "onSpeechStart", body: nil)
1250
- }
1393
+
1394
+ // ✅ Do NOT use formattedString here (it normalizes spacing/punctuation/number formatting).
1395
+ // Instead, treat "real speech" as "we have at least one non-empty segment substring".
1396
+ let hasReal = r.bestTranscription.segments.contains {
1397
+ !$0.substring.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
1398
+ }
1399
+
1400
+ if hasReal && !self.seenRealSpeech {
1401
+ self.seenRealSpeech = true
1402
+ NSLog("first real speech detected -> onSpeechStart to JS")
1403
+ self.sendEvent(name: "onSpeechStart", body: nil)
1251
1404
  }
1252
1405
  }
1253
1406
  markIfReal(result)
@@ -1276,9 +1429,9 @@ private func recoverAfterTelephonyInterruption() {
1276
1429
  }
1277
1430
 
1278
1431
  let isFinal = result.isFinal
1279
- let parts = result.transcriptions.map { $0.formattedString }
1432
+ let parts = result.transcriptions.map { $0.segments.map { $0.substring }.joined(separator: " ") }
1280
1433
  self.sendResult(error: nil,
1281
- bestTranscription: result.bestTranscription.formattedString,
1434
+ bestTranscription: result.bestTranscription.segments.map { $0.substring }.joined(separator: " "),
1282
1435
  transcriptions: parts,
1283
1436
  isFinal: isFinal)
1284
1437
 
@@ -1338,6 +1491,24 @@ private func recoverAfterTelephonyInterruption() {
1338
1491
  }
1339
1492
  mixerProbeActive = false
1340
1493
  mixerProbeCompletions.removeAll()
1494
+ speakerVerificationEngine = nil
1495
+ speakerVerificationFrameSize = 0
1496
+ speakerVerificationInputBuffer.removeAll(keepingCapacity: false)
1497
+ speakerVerificationThreshold = 0
1498
+ speakerVerificationFrameSeq = 0
1499
+ speakerVerificationSourceSampleRate = 0
1500
+ speakerVerificationTargetSampleRate = 0
1501
+ speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
1502
+ speakerVerificationResamplePos = 0
1503
+ speakerLastPositiveMatchAt = 0
1504
+ setSpeakerGateState(enabled: false, open: true)
1505
+ speakerVerificationErrorSent = false
1506
+ speakerPreRollBuffers.removeAll(keepingCapacity: false)
1507
+ speakerPreRollFrames = 0
1508
+ speakerPreRollMaxFrames = 0
1509
+ speakerPendingPreRollFlush = false
1510
+ lastRouteSignature = ""
1511
+ markAECSessionActivation(false, reason: "teardown")
1341
1512
 
1342
1513
  resetAudioSession()
1343
1514
  savedSessionBeforePause = nil
@@ -1358,6 +1529,10 @@ private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
1358
1529
  NSLog("[STT] ensureEngineRunning suppressed (telephony/recovering)")
1359
1530
  return
1360
1531
  }
1532
+ if isSpeechRecognitionLitePaused() {
1533
+ NSLog("[STT] ensureEngineRunning(\(reason)) suppressed (speechRecognitionPaused)")
1534
+ return
1535
+ }
1361
1536
  if micPaused {
1362
1537
  NSLog("[STT] ensureEngineRunning(\(reason)) suppressed (micPaused)")
1363
1538
  return
@@ -1402,7 +1577,9 @@ private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
1402
1577
 
1403
1578
  // ensure a task is running
1404
1579
  if recognitionTask == nil {
1405
- if let req = recognitionRequest {
1580
+ if isSpeechRecognitionLitePaused() {
1581
+ NSLog("[STT] ensureEngineRunning(\(reason)): skip startTask (speechRecognitionPaused)")
1582
+ } else if let req = recognitionRequest {
1406
1583
  startTask(req)
1407
1584
  } else {
1408
1585
  startTask(makeFreshRequest())
@@ -1482,6 +1659,10 @@ private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
1482
1659
  bumpGraphGen()
1483
1660
  NSLog("[STT] 🔄 rebuildEngineGraphAndRestart (\(reason))")
1484
1661
  if isTelephonyInterrupted { NSLog("[STT] rebuild suppressed during telephony"); return }
1662
+ if isSpeechRecognitionLitePaused() {
1663
+ NSLog("[STT] rebuild suppressed (speechRecognitionPaused)")
1664
+ return
1665
+ }
1485
1666
 
1486
1667
  guard hasValidCaptureNow() else {
1487
1668
  markCaptureLost()
@@ -1511,27 +1692,8 @@ private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
1511
1692
  _ = setupAudioSession() // ✅ keep session policy consistent
1512
1693
  forceSpeakerIfReceiver("rebuild:\(reason)") // ✅ receiver -> speaker now
1513
1694
 
1514
- let s = AVAudioSession.sharedInstance()
1515
- let speakerRoute = s.currentRoute.outputs.contains { $0.portType == .builtInSpeaker }
1516
- let usingBuiltInMic = (s.preferredInput?.portType == .builtInMic) ||
1517
- (s.currentRoute.inputs.first?.portType == .builtInMic)
1518
-
1519
1695
  let inputNode = newEngine.inputNode
1520
-
1521
- if aecEnabled, speakerRoute && usingBuiltInMic {
1522
- // AEC makes sense here
1523
- do { try inputNode.setVoiceProcessingEnabled(true) } catch {
1524
- NSLog("Voice processing not available: \(error)")
1525
- }
1526
- if #available(iOS 17.0, *) {
1527
- var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
1528
- duck.enableAdvancedDucking = false
1529
- duck.duckingLevel = .min
1530
- inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
1531
- }
1532
- } else {
1533
- // Headsets / car / AirPlay → skip AEC
1534
- }
1696
+ reconcileAEC(on: newEngine, reason: "rebuild-\(reason)-prestart", allowRebuild: false)
1535
1697
 
1536
1698
  var inFmt = inputNode.outputFormat(forBus: 0)
1537
1699
 
@@ -1543,11 +1705,20 @@ private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
1543
1705
  micMixer.outputVolume = 0.0
1544
1706
 
1545
1707
  // TTS player → (de-esser) → mainMixer
1546
- if playbackNode == nil { playbackNode = AVAudioPlayerNode() }
1708
+ if let existing = playbackNode, existing.engine !== newEngine {
1709
+ // Node is owned by a different engine instance; recreate for this graph.
1710
+ existing.stop()
1711
+ playbackNode = nil
1712
+ }
1713
+ if playbackNode == nil {
1714
+ playbackNode = AVAudioPlayerNode()
1715
+ }
1547
1716
  if let player = playbackNode {
1548
- if player.engine == nil { newEngine.attach(player) }
1549
- newEngine.connect(player, to: newEngine.mainMixerNode, format: nil)
1550
- }
1717
+ if player.engine == nil {
1718
+ newEngine.attach(player)
1719
+ }
1720
+ newEngine.connect(player, to: newEngine.mainMixerNode, format: nil)
1721
+ }
1551
1722
 
1552
1723
  // // --- Aggressive low-pass only ---
1553
1724
  // let deEss = AVAudioUnitEQ(numberOfBands: 1)
@@ -1572,6 +1743,8 @@ if let player = playbackNode {
1572
1743
  try newEngine.start()
1573
1744
  armFirstIOCycleLatch(on: newEngine)
1574
1745
  tryClearCaptureLossAfterStartSucceeded()
1746
+ reconcileAEC(on: newEngine, reason: "rebuild-\(reason)-poststart", allowRebuild: false)
1747
+ scheduleAECReconcileRetries(reason: "rebuild-\(reason)")
1575
1748
  NSLog("[STT] rebuild: engine.start() ok, running=\(newEngine.isRunning)")
1576
1749
  } catch {
1577
1750
  markCaptureLost()
@@ -1647,7 +1820,11 @@ if let player = playbackNode {
1647
1820
  }
1648
1821
  }
1649
1822
  if self.recognitionTask == nil {
1650
- startTask(self.recognitionRequest!)
1823
+ if isSpeechRecognitionLitePaused() {
1824
+ NSLog("[STT] rebuild: skip startTask (speechRecognitionPaused)")
1825
+ } else {
1826
+ startTask(self.recognitionRequest!)
1827
+ }
1651
1828
  }
1652
1829
  if self.sttActive && !self.micPaused {
1653
1830
  self.installPlaybackHooks()
@@ -1656,6 +1833,10 @@ if let player = playbackNode {
1656
1833
 
1657
1834
  @objc private func handleEngineConfigChange(_ note: Notification) {
1658
1835
  if isTearingDown { return } // ← add
1836
+ if isSpeechRecognitionLitePaused() {
1837
+ NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange (ignored: speechRecognitionPaused)")
1838
+ return
1839
+ }
1659
1840
  if micPaused {
1660
1841
  NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange (ignored: micPaused)")
1661
1842
  return
@@ -1667,12 +1848,18 @@ if let player = playbackNode {
1667
1848
  playbackNode = nil
1668
1849
  }
1669
1850
  ensureEngineRunning(reason: "engine-config-change")
1851
+ reconcileAEC(on: audioEngine, reason: "engine-config-change")
1852
+ scheduleAECReconcileRetries(reason: "engine-config-change")
1670
1853
  }
1671
1854
 
1672
1855
  @objc private func handleMediaServicesReset(_ note: Notification) {
1673
1856
  if isTearingDown { return } // ← add
1674
1857
 
1675
- if micPaused {
1858
+ if isSpeechRecognitionLitePaused() {
1859
+ NSLog("[STT] 📺 Media services RESET (ignored: speechRecognitionPaused)")
1860
+ return
1861
+ }
1862
+ if micPaused {
1676
1863
  NSLog("[STT] 📺 Media services RESET (ignored: micPaused)")
1677
1864
  return
1678
1865
  }
@@ -1681,6 +1868,8 @@ if let player = playbackNode {
1681
1868
  bumpGraphGen()
1682
1869
  _ = setupAudioSession()
1683
1870
  ensureEngineRunning(reason: "media-services-reset")
1871
+ reconcileAEC(on: audioEngine, reason: "media-services-reset")
1872
+ scheduleAECReconcileRetries(reason: "media-services-reset")
1684
1873
  }
1685
1874
 
1686
1875
  /*?????????? Why so many changes???
@@ -1720,19 +1909,52 @@ if let player = playbackNode {
1720
1909
  */
1721
1910
  @objc private func handleRouteChange(_ note: Notification) {
1722
1911
  if isTearingDown { return }
1723
- if micPaused {
1912
+ if !sttActive {
1913
+ NSLog("[STT] 🔀 route change (ignored: sttInactive) \(note.userInfo ?? [:])")
1914
+ return
1915
+ }
1916
+ if isSpeechRecognitionLitePaused() {
1917
+ NSLog("[STT] 🔀 route change (ignored: speechRecognitionPaused) \(note.userInfo ?? [:])")
1918
+ return
1919
+ }
1920
+ if micPaused {
1724
1921
  NSLog("[STT] 🔀 route change (ignored: micPaused) \(note.userInfo ?? [:])")
1725
1922
  return
1726
1923
  }
1727
1924
 
1728
- NSLog("[STT] 🔀 route change: \(note.userInfo ?? [:])")
1925
+ let info = note.userInfo ?? [:]
1926
+ NSLog("[STT] 🔀 route change: \(info)")
1729
1927
  if isTelephonyInterrupted || isRecoveringAfterTelephony {
1730
- NSLog("[STT] 🔀 route change (ignored during telephony/recovering): \(note.userInfo ?? [:])")
1928
+ NSLog("[STT] 🔀 route change (ignored during telephony/recovering): \(info)")
1731
1929
  return
1732
1930
  }
1733
1931
 
1734
- updateSessionRouting(selectBestInput: true)
1932
+ let session = AVAudioSession.sharedInstance()
1933
+ let outSig = session.currentRoute.outputs.map { $0.portType.rawValue }.joined(separator: ",")
1934
+ let inSig = session.currentRoute.inputs.map { $0.portType.rawValue }.joined(separator: ",")
1935
+ let routeSig = "outs=\(outSig)|ins=\(inSig)"
1936
+ if routeSig == lastRouteSignature {
1937
+ NSLog("[STT] 🔀 route change ignored (same route signature)")
1938
+ return
1939
+ }
1940
+ lastRouteSignature = routeSig
1941
+
1942
+ if let reasonVal = info[AVAudioSessionRouteChangeReasonKey] as? UInt,
1943
+ let reason = AVAudioSession.RouteChangeReason(rawValue: reasonVal) {
1944
+ switch reason {
1945
+ // Match AVAudioWrapper behavior: handle concrete hardware events + route config changes.
1946
+ case .newDeviceAvailable, .oldDeviceUnavailable, .routeConfigurationChange:
1947
+ updateSessionRouting(selectBestInput: true)
1948
+ default:
1949
+ NSLog("[STT] 🔀 route change reason=\(reason.rawValue) -> skip updateSessionRouting")
1950
+ }
1951
+ } else {
1952
+ NSLog("[STT] 🔀 route change reason missing -> skip updateSessionRouting")
1953
+ }
1954
+
1735
1955
  forceSpeakerIfReceiver("routeChange")
1956
+ reconcileAEC(on: audioEngine, reason: "route-change", allowRebuild: false)
1957
+ scheduleAECReconcileRetries(reason: "route-change")
1736
1958
 
1737
1959
  ensureEngineRunning(reason: "route-change", skipCooldown: true)
1738
1960
  }
@@ -1757,7 +1979,7 @@ if let player = playbackNode {
1757
1979
  var fired = false
1758
1980
  self.safeRemoveTap(mixer, bus: 0)
1759
1981
 
1760
- self.safeInstallTap(mixer, bus: 0, bufferSize: 128, format: nil) { [weak self, weak mixer] _, _ in
1982
+ mixer.installTap(onBus: 0, bufferSize: 128, format: nil) { [weak self, weak mixer] _, _ in
1761
1983
  guard let self = self, gen == self.graphGen else { return }
1762
1984
  if fired { return }
1763
1985
  fired = true
@@ -1772,7 +1994,6 @@ if let player = playbackNode {
1772
1994
  }
1773
1995
  }
1774
1996
 
1775
-
1776
1997
  DispatchQueue.main.asyncAfter(deadline: .now() + timeout) { [weak self, weak mixer] in
1777
1998
  guard let self = self, gen == self.graphGen else { return }
1778
1999
  if fired { return }
@@ -1857,7 +2078,12 @@ if let player = playbackNode {
1857
2078
  tapFramesTotal = 0
1858
2079
 
1859
2080
  // Re-activate the session (safe if already active)
1860
- try? AVAudioSession.sharedInstance().setActive(true, options: [])
2081
+ do {
2082
+ try AVAudioSession.sharedInstance().setActive(true, options: [])
2083
+ markAECSessionActivation(true, reason: "interruption-ended")
2084
+ } catch {
2085
+ markAECSessionActivation(false, reason: "interruption-ended-failed")
2086
+ }
1861
2087
 
1862
2088
  // Give routes/formats a moment to settle *before* we rebuild
1863
2089
  DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) {
@@ -2006,12 +2232,27 @@ if let player = playbackNode {
2006
2232
  lastNoInputRecoveryAt = 0
2007
2233
  lastRearmAt = 0
2008
2234
  lastReclaimAttempt = 0
2009
- emaPartialGap = 0
2010
2235
  tapFramesTotal = 0
2011
2236
  lastTapFramesSeen = 0
2012
2237
  pausedForCaptureLoss = false
2013
2238
  mixerProbeActive = false
2014
2239
  mixerProbeCompletions.removeAll()
2240
+ speakerVerificationEngine = nil
2241
+ speakerVerificationFrameSize = 0
2242
+ speakerVerificationInputBuffer.removeAll(keepingCapacity: false)
2243
+ speakerVerificationThreshold = 0
2244
+ speakerVerificationFrameSeq = 0
2245
+ speakerVerificationSourceSampleRate = 0
2246
+ speakerVerificationTargetSampleRate = 0
2247
+ speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
2248
+ speakerVerificationResamplePos = 0
2249
+ speakerLastPositiveMatchAt = 0
2250
+ setSpeakerGateState(enabled: false, open: true)
2251
+ speakerVerificationErrorSent = false
2252
+ speakerPreRollBuffers.removeAll(keepingCapacity: false)
2253
+ speakerPreRollFrames = 0
2254
+ speakerPreRollMaxFrames = 0
2255
+ speakerPendingPreRollFlush = false
2015
2256
 
2016
2257
  audioSession = AVAudioSession.sharedInstance()
2017
2258
  guard let session = audioSession else { return }
@@ -2060,7 +2301,9 @@ if let player = playbackNode {
2060
2301
  }
2061
2302
  request.shouldReportPartialResults = true
2062
2303
  //if #available(iOS 13.0, *) { request.taskHint = .dictation }
2063
- request.contextualStrings = loadContextualStrings()
2304
+ let cs: [String] = loadContextualStrings()
2305
+ request.contextualStrings = cs
2306
+ NSLog("[STT] makeFreshRequest contextualStrings count=\(cs.count) sample=\(cs.prefix(10)) file=\(Bundle.main.path(forResource: "words_flattened", ofType: "txt") ?? "nil")")
2064
2307
 
2065
2308
  guard recognitionRequest != nil else {
2066
2309
  sendResult(error: ["code": "recognition_init"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
@@ -2079,25 +2322,7 @@ if let player = playbackNode {
2079
2322
  let inputNode = engine.inputNode
2080
2323
  let _ = inputNode // presence check
2081
2324
 
2082
- let s = AVAudioSession.sharedInstance()
2083
- let speakerRoute = s.currentRoute.outputs.contains { $0.portType == .builtInSpeaker }
2084
- let usingBuiltInMic = (s.preferredInput?.portType == .builtInMic) ||
2085
- (s.currentRoute.inputs.first?.portType == .builtInMic)
2086
-
2087
- if aecEnabled, speakerRoute && usingBuiltInMic {
2088
- // AEC makes sense here
2089
- do { try inputNode.setVoiceProcessingEnabled(true) } catch {
2090
- NSLog("Voice processing not available: \(error)")
2091
- }
2092
- if #available(iOS 17.0, *) {
2093
- var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
2094
- duck.enableAdvancedDucking = false // disable advanced (VAD-based) ducking
2095
- duck.duckingLevel = .min // “as loud as possible” for other audio
2096
- inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
2097
- }
2098
- } else {
2099
- // Headsets / car / AirPlay → skip AEC
2100
- }
2325
+ reconcileAEC(on: engine, reason: "setup-start-prestart", allowRebuild: false)
2101
2326
 
2102
2327
  // if output node voice processing is ever needed, keep commented as in original:
2103
2328
  // do { try engine.outputNode.setVoiceProcessingEnabled(true) } catch { ... }
@@ -2151,7 +2376,72 @@ if let player = playbackNode {
2151
2376
  return
2152
2377
  }
2153
2378
 
2154
- inputNode.installTap(onBus: 0, bufferSize: 1024, format: format) { [weak self] buffer, _ in
2379
+ var tapBufferSize: AVAudioFrameCount = 1024
2380
+ if let svStart = speakerVerificationStartConfig {
2381
+ do {
2382
+ var svConfig = svStart.config
2383
+ let routeSampleRate = Int(round(format.sampleRate))
2384
+ if useShortSpeakerVerificationTailWindow {
2385
+ let forcedTail = max(0.1, shortSpeakerVerificationTailSeconds)
2386
+ svConfig.tailSeconds = forcedTail
2387
+ if svConfig.maxTailSeconds < forcedTail {
2388
+ svConfig.maxTailSeconds = forcedTail
2389
+ }
2390
+ NSLog("[STT] SV tail override enabled tailSeconds=\(forcedTail)")
2391
+ }
2392
+
2393
+ speakerVerificationFrameSize = svConfig.frameSize
2394
+ speakerVerificationThreshold = svConfig.decisionThreshold
2395
+ speakerVerificationFrameSeq = 0
2396
+ speakerVerificationSourceSampleRate = routeSampleRate
2397
+ speakerVerificationTargetSampleRate = svConfig.sampleRate
2398
+ speakerVerificationResampleCarry.removeAll(keepingCapacity: true)
2399
+ speakerVerificationResamplePos = 0
2400
+ speakerLastPositiveMatchAt = 0
2401
+ speakerVerificationInputBuffer.removeAll(keepingCapacity: true)
2402
+ setSpeakerGateState(enabled: false, open: false)
2403
+ speakerVerificationErrorSent = false
2404
+ speakerPreRollBuffers.removeAll(keepingCapacity: true)
2405
+ speakerPreRollFrames = 0
2406
+ speakerPendingPreRollFlush = false
2407
+ speakerPreRollMaxFrames = max(1, Int(round(format.sampleRate * speakerPreRollSeconds)))
2408
+
2409
+ svConfig.logLevel = .off
2410
+ let svEngine = try SpeakerVerificationEngine(config: svConfig)
2411
+ svEngine.setEnrollment(svStart.enrollment)
2412
+ svEngine.resetStreamingState()
2413
+
2414
+ speakerVerificationEngine = svEngine
2415
+ setSpeakerGateState(enabled: true, open: false)
2416
+ tapBufferSize = AVAudioFrameCount(max(64, svConfig.frameSize))
2417
+ NSLog("[STT] Speaker verification gate enabled frameSize=\(svConfig.frameSize) tailSeconds=\(svConfig.tailSeconds) threshold=\(svConfig.decisionThreshold) hangover=\(useSpeakerGateHangover ? "ON" : "OFF") hangSec=\(String(format: "%.3f", speakerGateHangoverSeconds))")
2418
+ if routeSampleRate != svConfig.sampleRate {
2419
+ NSLog("[STT] SV resampling enabled \(routeSampleRate)Hz -> \(svConfig.sampleRate)Hz")
2420
+ } else {
2421
+ NSLog("[STT] SV sampleRate already matched at \(routeSampleRate)Hz")
2422
+ }
2423
+ } catch {
2424
+ speakerVerificationEngine = nil
2425
+ speakerVerificationThreshold = 0
2426
+ speakerVerificationFrameSeq = 0
2427
+ speakerVerificationSourceSampleRate = 0
2428
+ speakerVerificationTargetSampleRate = 0
2429
+ speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
2430
+ speakerVerificationResamplePos = 0
2431
+ speakerLastPositiveMatchAt = 0
2432
+ setSpeakerGateState(enabled: false, open: true)
2433
+ speakerPreRollBuffers.removeAll(keepingCapacity: false)
2434
+ speakerPreRollFrames = 0
2435
+ speakerPreRollMaxFrames = 0
2436
+ speakerPendingPreRollFlush = false
2437
+ sendResult(error: ["message": "Speaker verification disabled: \(error.localizedDescription)"],
2438
+ bestTranscription: nil,
2439
+ transcriptions: nil,
2440
+ isFinal: nil)
2441
+ }
2442
+ }
2443
+
2444
+ inputNode.installTap(onBus: 0, bufferSize: tapBufferSize, format: format) { [weak self] buffer, _ in
2155
2445
  // Strongify self once
2156
2446
  guard let self = self else { return }
2157
2447
  // ✅ Count frames globally so the watchdog can see forward progress
@@ -2192,8 +2482,24 @@ if let player = playbackNode {
2192
2482
  let value = self.averagePowerForChannel1
2193
2483
  self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": value])
2194
2484
 
2485
+ if self.currentSpeakerGateState().enabled, let ch0 = buffer.floatChannelData?[0] {
2486
+ let mono = Array(UnsafeBufferPointer(start: ch0, count: Int(buffer.frameLength)))
2487
+ self.processSpeakerVerificationSamples(mono)
2488
+ }
2489
+
2195
2490
  // Append to recognition
2196
- self.recognitionRequest?.append(buffer)
2491
+ let gate = self.currentSpeakerGateState()
2492
+ if !gate.enabled {
2493
+ self.recognitionRequest?.append(buffer)
2494
+ NSLog("[STT][SV][TAP] samples=\(buffer.frameLength) gate=DISABLED action=append")
2495
+ } else if gate.open {
2496
+ self.flushSpeakerPreRollIfNeeded()
2497
+ self.recognitionRequest?.append(buffer)
2498
+ NSLog("[STT][SV][TAP] samples=\(buffer.frameLength) gate=OPEN action=append preRollFrames=\(self.currentSpeakerPreRollFrames())")
2499
+ } else {
2500
+ self.enqueueSpeakerPreRoll(buffer)
2501
+ NSLog("[STT][SV][TAP] samples=\(buffer.frameLength) gate=CLOSED action=buffer preRollFrames=\(self.currentSpeakerPreRollFrames())")
2502
+ }
2197
2503
 
2198
2504
  // inside inputNode.installTap { buffer, _ in
2199
2505
  self.lastBufferAt = CACurrentMediaTime()
@@ -2205,6 +2511,8 @@ if let player = playbackNode {
2205
2511
  do {
2206
2512
  try engine.start()
2207
2513
  armFirstIOCycleLatch(on: engine)
2514
+ reconcileAEC(on: engine, reason: "setup-start-poststart", allowRebuild: false)
2515
+ scheduleAECReconcileRetries(reason: "setup-start")
2208
2516
  } catch {
2209
2517
  audioSessionError = error as NSError
2210
2518
  }
@@ -2251,6 +2559,236 @@ if let player = playbackNode {
2251
2559
  }
2252
2560
  }
2253
2561
 
2562
/// Builds a speaker-verification start config from an onboarding JSON file.
///
/// - Parameter onboardingJsonPath: File-system path of the serialized enrollment.
/// - Returns: A start config holding the deserialized enrollment and the
///   config snapshot stored inside it.
/// - Throws: Any error raised while reading the file or by
///   `SpeakerEnrollment.deserialize`.
private func loadSpeakerVerificationStartConfig(onboardingJsonPath: String) throws -> SpeakerVerificationStartConfig {
    let fileURL = URL(fileURLWithPath: onboardingJsonPath)
    let rawBytes = try Data(contentsOf: fileURL)
    let parsedEnrollment = try SpeakerEnrollment.deserialize(rawBytes)
    return SpeakerVerificationStartConfig(
        enrollment: parsedEnrollment,
        config: parsedEnrollment.configSnapshot
    )
}
2567
+
2568
/// Returns a snapshot of the speaker-gate flags, read while holding
/// `speakerVerificationStateLock`.
private func currentSpeakerGateState() -> (enabled: Bool, open: Bool) {
    speakerVerificationStateLock.lock()
    defer { speakerVerificationStateLock.unlock() }
    return (enabled: speakerGateEnabled, open: speakerGateOpen)
}
2574
+
2575
/// Updates the speaker-gate flags under `speakerVerificationStateLock`.
///
/// A pre-roll flush is flagged when the gate becomes enabled-and-open from any
/// other state, and the flag is cleared whenever the gate is disabled.
/// A log line is written (outside the lock) only when either flag changed.
///
/// - Parameters:
///   - enabled: New value for `speakerGateEnabled`.
///   - open: New value for `speakerGateOpen`.
private func setSpeakerGateState(enabled: Bool, open: Bool) {
    speakerVerificationStateLock.lock()
    let previous = (enabled: speakerGateEnabled, open: speakerGateOpen)
    speakerGateEnabled = enabled
    speakerGateOpen = open
    if !enabled {
        speakerPendingPreRollFlush = false
    } else if open && !(previous.enabled && previous.open) {
        speakerPendingPreRollFlush = true
    }
    speakerVerificationStateLock.unlock()

    guard previous.open != open || previous.enabled != enabled else { return }
    NSLog("[STT][SV][GATE] enabled=\(enabled ? "YES" : "NO") open=\(open ? "YES" : "NO") th=\(speakerVerificationThreshold)")
}
2593
+
2594
/// Returns the number of frames currently held in the pre-roll queue, read
/// while holding `speakerVerificationStateLock`.
private func currentSpeakerPreRollFrames() -> Int {
    speakerVerificationStateLock.lock()
    defer { speakerVerificationStateLock.unlock() }
    return speakerPreRollFrames
}
2600
+
2601
/// Stores a copy of `buffer` in the pre-roll queue and trims the oldest
/// entries until the queued frame count is within `speakerPreRollMaxFrames`.
///
/// Does nothing when the pre-roll limit is not positive or when the buffer
/// cannot be copied. All queue state is touched under
/// `speakerVerificationStateLock`.
///
/// - Parameter buffer: The tap buffer to retain; it is copied, not referenced.
private func enqueueSpeakerPreRoll(_ buffer: AVAudioPCMBuffer) {
    speakerVerificationStateLock.lock()
    defer { speakerVerificationStateLock.unlock() }

    if speakerPreRollMaxFrames <= 0 { return }
    guard let snapshot = copyPCMBuffer(buffer) else { return }

    speakerPreRollBuffers.append(snapshot)
    speakerPreRollFrames += Int(snapshot.frameLength)

    // Drop from the front (oldest audio) until we are back under the cap.
    while !speakerPreRollBuffers.isEmpty && speakerPreRollFrames > speakerPreRollMaxFrames {
        let oldest = speakerPreRollBuffers.removeFirst()
        speakerPreRollFrames -= Int(oldest.frameLength)
    }
}
2614
+
2615
/// Sends queued pre-roll audio to the recognition request when a flush has
/// been flagged, then empties the queue.
///
/// With legacy gate behavior every queued buffer is sent. Otherwise only the
/// newest buffers are sent, enough to reach about
/// `speakerPreRollFlushMaxSeconds` at the source sample rate; older buffers
/// are discarded. Selection and queue reset happen under
/// `speakerVerificationStateLock`; the appends happen after it is released.
private func flushSpeakerPreRollIfNeeded() {
    speakerVerificationStateLock.lock()
    guard speakerPendingPreRollFlush else {
        speakerVerificationStateLock.unlock()
        return
    }

    let bufferedFrames = speakerPreRollFrames
    var outgoing: [AVAudioPCMBuffer] = speakerPreRollBuffers
    if !useLegacySpeakerGateBehavior {
        let sourceRate = max(1, speakerVerificationSourceSampleRate)
        let frameBudget = max(1, Int(round(Double(sourceRate) * speakerPreRollFlushMaxSeconds)))
        if bufferedFrames > frameBudget {
            // Walk backwards from the newest buffer until the budget is met.
            var firstKept = speakerPreRollBuffers.endIndex
            var gathered = 0
            while firstKept > speakerPreRollBuffers.startIndex && gathered < frameBudget {
                firstKept -= 1
                gathered += Int(speakerPreRollBuffers[firstKept].frameLength)
            }
            outgoing = Array(speakerPreRollBuffers[firstKept...])
        }
    }
    let outgoingFrames = outgoing.reduce(0) { $0 + Int($1.frameLength) }

    speakerPreRollBuffers.removeAll(keepingCapacity: false)
    speakerPreRollFrames = 0
    speakerPendingPreRollFlush = false
    speakerVerificationStateLock.unlock()

    guard !outgoing.isEmpty else { return }
    NSLog("[STT][SV][PREROLL] flushing buffers=\(outgoing.count) frames=\(outgoingFrames) totalBuffered=\(bufferedFrames)")
    for queued in outgoing {
        recognitionRequest?.append(queued)
    }
}
2653
+
2654
/// Returns a deep copy of `source` (same format, same `frameLength`).
///
/// The copy is made from the buffer's underlying `AudioBufferList` rather than
/// from the typed per-channel pointers. For interleaved formats those channel
/// pointers all refer to one interleaved buffer (with `stride` equal to the
/// channel count), so copying `frameLength` samples per channel would miss
/// most of the data. Copying the raw buffers is correct for interleaved and
/// deinterleaved layouts and for every PCM sample type.
///
/// - Parameter source: The buffer to duplicate.
/// - Returns: An independent copy, or `nil` when `source` holds no frames, the
///   destination cannot be allocated, or the buffer lists do not line up.
private func copyPCMBuffer(_ source: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
    // An empty buffer carries no audio, so there is nothing worth keeping.
    guard source.frameLength > 0,
          let dst = AVAudioPCMBuffer(pcmFormat: source.format, frameCapacity: source.frameLength) else {
        return nil
    }
    // Setting frameLength also sizes the mDataByteSize fields of dst's buffer list.
    dst.frameLength = source.frameLength

    let srcBuffers = UnsafeMutableAudioBufferListPointer(
        UnsafeMutablePointer(mutating: source.audioBufferList)
    )
    let dstBuffers = UnsafeMutableAudioBufferListPointer(dst.mutableAudioBufferList)
    guard srcBuffers.count == dstBuffers.count else { return nil }

    for (from, to) in zip(srcBuffers, dstBuffers) {
        guard let fromData = from.mData, let toData = to.mData else { return nil }
        // Same format and frame length, so the sizes should match; min() guards against overrun.
        let byteCount = Int(min(from.mDataByteSize, to.mDataByteSize))
        memcpy(toData, fromData, byteCount)
    }
    return dst
}
2688
+
2689
/// Converts `input` from `speakerVerificationSourceSampleRate` to
/// `speakerVerificationTargetSampleRate` using linear interpolation.
///
/// The resampler is streaming: leftover source samples are kept in
/// `speakerVerificationResampleCarry` and the fractional read position in
/// `speakerVerificationResamplePos`, so consecutive calls continue where the
/// previous one stopped.
///
/// - Parameter input: Mono samples at the source rate.
/// - Returns: Samples at the target rate. `input` is returned unchanged when
///   either rate is not positive or the rates are equal; an empty array is
///   returned when there is not yet enough data to interpolate.
private func resampleSamplesForSpeakerVerificationIfNeeded(_ input: [Float]) -> [Float] {
    guard !input.isEmpty else { return [] }
    let srcRate = speakerVerificationSourceSampleRate
    let dstRate = speakerVerificationTargetSampleRate
    guard srcRate > 0, dstRate > 0 else { return input }
    if srcRate == dstRate { return input }

    let ratio = Double(srcRate) / Double(dstRate) // source samples per output sample
    let source = speakerVerificationResampleCarry + input
    guard source.count >= 2 else {
        // Need two samples to interpolate; hold on to what we have.
        speakerVerificationResampleCarry = source
        return []
    }

    var out: [Float] = []
    out.reserveCapacity(Int(Double(input.count) * Double(dstRate) / Double(srcRate)) + 8)

    var pos = speakerVerificationResamplePos
    while pos + 1.0 < Double(source.count) {
        let i = Int(pos)
        let frac = Float(pos - Double(i))
        let a = source[i]
        let b = source[i + 1]
        out.append(a + (b - a) * frac)
        pos += ratio
    }

    // With a ratio above 3 the read position can land more than one sample past
    // the end of `source`; clamp so the carry slice never starts out of range.
    // The overshoot is preserved in the stored position, so phase is not lost.
    let keepStart = min(source.count, max(0, Int(floor(pos)) - 1))
    speakerVerificationResampleCarry = Array(source[keepStart...])
    speakerVerificationResamplePos = pos - Double(keepStart)
    return out
}
2721
+
2722
/// Feeds captured samples to the speaker-verification engine on
/// `speakerVerificationQueue` and updates the speaker gate from its results.
///
/// Samples are resampled if needed, accumulated in
/// `speakerVerificationInputBuffer`, and consumed in frames of
/// `speakerVerificationFrameSize`. A `.result` outcome opens or closes the
/// gate, optionally holding it open for `speakerGateHangoverSeconds` after the
/// last match. If the engine throws, verification is switched off, the gate is
/// disabled (left open), pre-roll state is cleared and a single error result is
/// reported on the main queue.
///
/// - Parameter samples: Mono samples from the input tap; ignored when empty.
private func processSpeakerVerificationSamples(_ samples: [Float]) {
    if samples.isEmpty { return }
    speakerVerificationQueue.async { [weak self] in
        guard let self = self,
              let verifier = self.speakerVerificationEngine else { return }
        let samplesPerFrame = self.speakerVerificationFrameSize
        if samplesPerFrame <= 0 { return }

        let converted = self.resampleSamplesForSpeakerVerificationIfNeeded(samples)
        guard !converted.isEmpty else { return }
        self.speakerVerificationInputBuffer.append(contentsOf: converted)

        while self.speakerVerificationInputBuffer.count >= samplesPerFrame {
            let chunk = Array(self.speakerVerificationInputBuffer.prefix(samplesPerFrame))
            self.speakerVerificationInputBuffer.removeFirst(samplesPerFrame)
            self.speakerVerificationFrameSeq &+= 1
            let frameNumber = self.speakerVerificationFrameSeq

            do {
                switch try verifier.processFrame(frame: chunk) {
                case .pending(let progress):
                    let gateNow = self.currentSpeakerGateState()
                    NSLog("[STT][SV][FRAME #\(frameNumber)] pending buffered=\(progress.bufferedSamples) neededSec=\(progress.neededSeconds) gate=\(gateNow.open ? "OPEN" : "CLOSED") th=\(self.speakerVerificationThreshold)")

                case .result(let verdict):
                    let shouldOpen: Bool
                    if self.useSpeakerGateHangover && !self.useLegacySpeakerGateBehavior {
                        // Hangover mode: a miss keeps the gate open for a short
                        // window after the most recent match.
                        let timestamp = CACurrentMediaTime()
                        if verdict.isMatch {
                            self.speakerLastPositiveMatchAt = timestamp
                            shouldOpen = true
                        } else {
                            let lastHit = self.speakerLastPositiveMatchAt
                            shouldOpen = lastHit > 0 &&
                                (timestamp - lastHit) <= max(0, self.speakerGateHangoverSeconds)
                        }
                    } else {
                        shouldOpen = verdict.isMatch
                    }
                    self.setSpeakerGateState(enabled: true, open: shouldOpen)

                    let gateNow = self.currentSpeakerGateState()
                    NSLog("[STT][SV][FRAME #\(frameNumber)] scoreBest=\(String(format: "%.4f", verdict.scoreBest)) raw=\(String(format: "%.4f", verdict.scoreBestRaw)) meancombo=\(String(format: "%.4f", verdict.scoreBestMeancombo)) mean=\(String(format: "%.4f", verdict.scoreMean)) match=\(verdict.isMatch ? "YES" : "NO") gate=\(gateNow.open ? "OPEN" : "CLOSED") th=\(String(format: "%.4f", self.speakerVerificationThreshold)) hangover=\(self.useSpeakerGateHangover ? "ON" : "OFF") hangSec=\(String(format: "%.3f", self.speakerGateHangoverSeconds))")
                }
            } catch {
                // Engine failure: turn verification off and stop gating audio.
                self.speakerVerificationEngine = nil
                self.speakerVerificationInputBuffer.removeAll(keepingCapacity: false)
                self.speakerVerificationSourceSampleRate = 0
                self.speakerVerificationTargetSampleRate = 0
                self.speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
                self.speakerVerificationResamplePos = 0
                self.speakerLastPositiveMatchAt = 0
                self.setSpeakerGateState(enabled: false, open: true)

                self.speakerVerificationStateLock.lock()
                self.speakerPreRollBuffers.removeAll(keepingCapacity: false)
                self.speakerPreRollFrames = 0
                self.speakerPreRollMaxFrames = 0
                self.speakerVerificationStateLock.unlock()

                // Report the failure at most once.
                if !self.speakerVerificationErrorSent {
                    self.speakerVerificationErrorSent = true
                    DispatchQueue.main.async { [weak self] in
                        self?.sendResult(error: ["message": "Speaker verification stopped: \(error.localizedDescription)"],
                                         bestTranscription: nil,
                                         transcriptions: nil,
                                         isFinal: nil)
                    }
                }
                return
            }
        }
    }
}
2791
+
2254
2792
  // MARK: - Helpers
2255
2793
  private func _normalizedPowerLevelFromDecibels(_ decibels: CGFloat) -> CGFloat {
2256
2794
  if decibels < -80.0 || decibels == 0.0 { return 0.0 }