react-native-davoice-tts 1.0.291 → 1.0.293
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/TTSRNBridge.podspec +1 -1
- package/android/libs/com/davoice/tts/1.0.0/tts-1.0.0.aar +0 -0
- package/android/libs/com/davoice/tts/1.0.0/tts-1.0.0.aar.md5 +1 -1
- package/android/libs/com/davoice/tts/1.0.0/tts-1.0.0.aar.sha1 +1 -1
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/Info.plist +5 -5
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/{ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift.ofer → ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift.AEC.CRASH.ETC} +663 -125
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DavoiceTTS +0 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.abi.json +4945 -4931
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.private.swiftinterface +12 -12
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.swiftinterface +12 -12
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift.AEC.CRASH.ETC +2853 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DavoiceTTS +0 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.abi.json +8306 -8292
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.private.swiftinterface +83 -83
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.swiftinterface +83 -83
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.abi.json +8306 -8292
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.private.swiftinterface +83 -83
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.swiftinterface +83 -83
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeDirectory +0 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeRequirements-1 +0 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeResources +29 -44
- package/package.json +1 -1
|
@@ -18,6 +18,22 @@ public final class STT: NSObject, SFSpeechRecognizerDelegate {
|
|
|
18
18
|
|
|
19
19
|
// Global AEC toggle (default ON to keep existing behavior)
|
|
20
20
|
public var aecEnabled: Bool = true
|
|
21
|
+
// If true, force VP/AEC ON for a short window after session activation while routes settle.
|
|
22
|
+
public var forceAECDuringRouteWarmup: Bool = true
|
|
23
|
+
public var aecRouteWarmupSeconds: Double = 20.0
|
|
24
|
+
// If true, always request 16k input sample rate from AVAudioSession.
|
|
25
|
+
// iOS may still override this depending on route / voice processing constraints.
|
|
26
|
+
public var force16kMicSampleRate: Bool = false
|
|
27
|
+
// If true, use old SV gate behavior (immediate open/close + full pre-roll flush).
|
|
28
|
+
public var useLegacySpeakerGateBehavior: Bool = false
|
|
29
|
+
// If true, keep gate open for a short hangover after the last positive match.
|
|
30
|
+
public var useSpeakerGateHangover: Bool = true
|
|
31
|
+
public var speakerGateHangoverSeconds: Double = 0.40
|
|
32
|
+
// If true, override SV tailSeconds to 0.5s for faster switching tests.
|
|
33
|
+
public var useShortSpeakerVerificationTailWindow: Bool = true
|
|
34
|
+
public var shortSpeakerVerificationTailSeconds: Float = 0.5
|
|
35
|
+
// In protected mode, flush only this much recent pre-roll when gate reopens.
|
|
36
|
+
public var speakerPreRollFlushMaxSeconds: Double = 0.5
|
|
21
37
|
|
|
22
38
|
// MARK: - Private
|
|
23
39
|
private var speechRecognizer: SFSpeechRecognizer?
|
|
@@ -25,6 +41,9 @@ public final class STT: NSObject, SFSpeechRecognizerDelegate {
|
|
|
25
41
|
private var audioEngine: AVAudioEngine?
|
|
26
42
|
private var recognitionTask: SFSpeechRecognitionTask?
|
|
27
43
|
private var audioSession: AVAudioSession?
|
|
44
|
+
private let aecSessionActivationLock = NSLock()
|
|
45
|
+
private var lastAECSessionActivationAt: CFTimeInterval = 0
|
|
46
|
+
private var aecSessionIsActive: Bool = false
|
|
28
47
|
private var isTearingDown: Bool = false
|
|
29
48
|
private var sessionId: String?
|
|
30
49
|
private var priorAudioCategory: AVAudioSession.Category?
|
|
@@ -32,8 +51,7 @@ public final class STT: NSObject, SFSpeechRecognizerDelegate {
|
|
|
32
51
|
private var averagePowerForChannel1: Float = 0
|
|
33
52
|
// Add to STT
|
|
34
53
|
private var isAdjustingRoute = false
|
|
35
|
-
private var
|
|
36
|
-
private let routeTuneCooldown: CFTimeInterval = 0.5
|
|
54
|
+
private var lastRouteSignature: String = ""
|
|
37
55
|
|
|
38
56
|
private var playbackNode: AVAudioPlayerNode?
|
|
39
57
|
private var seenRealSpeech = false // flips true after first non-blank token
|
|
@@ -75,11 +93,8 @@ public final class STT: NSObject, SFSpeechRecognizerDelegate {
|
|
|
75
93
|
|
|
76
94
|
private(set) var sttActive = false
|
|
77
95
|
// STT.swift (add near `private var playbackNode: AVAudioPlayerNode?`)
|
|
78
|
-
private var ttsEQ: AVAudioUnitEQ?
|
|
96
|
+
// private var ttsEQ: AVAudioUnitEQ?
|
|
79
97
|
|
|
80
|
-
// partial cadence monitor
|
|
81
|
-
private var emaPartialGap: Double = 0 // exponential moving average of time between partials
|
|
82
|
-
private let emaAlpha: Double = 0.3
|
|
83
98
|
// Add near your other state:
|
|
84
99
|
private var ioLatchActiveGen: UInt64 = 0
|
|
85
100
|
|
|
@@ -105,11 +120,45 @@ public final class STT: NSObject, SFSpeechRecognizerDelegate {
|
|
|
105
120
|
private var activeTaskGen: UInt64 = 0
|
|
106
121
|
private var micPaused: Bool = false
|
|
107
122
|
|
|
123
|
+
// --- Optional speaker verification gate ---
|
|
124
|
+
private struct SpeakerVerificationStartConfig {
|
|
125
|
+
let enrollment: SpeakerEnrollment
|
|
126
|
+
let config: SpeakerVerificationConfig
|
|
127
|
+
}
|
|
128
|
+
private let speakerVerificationQueue = DispatchQueue(label: "stt.sv.queue")
|
|
129
|
+
private let speakerVerificationStateLock = NSLock()
|
|
130
|
+
private var speakerVerificationStartConfig: SpeakerVerificationStartConfig?
|
|
131
|
+
private var speakerVerificationEngine: SpeakerVerificationEngine?
|
|
132
|
+
private var speakerVerificationFrameSize: Int = 0
|
|
133
|
+
private var speakerVerificationInputBuffer: [Float] = []
|
|
134
|
+
private var speakerGateOpen: Bool = true
|
|
135
|
+
private var speakerGateEnabled: Bool = false
|
|
136
|
+
private var speakerVerificationErrorSent: Bool = false
|
|
137
|
+
private var speakerPreRollBuffers: [AVAudioPCMBuffer] = []
|
|
138
|
+
private var speakerPreRollFrames: Int = 0
|
|
139
|
+
private var speakerPreRollMaxFrames: Int = 0
|
|
140
|
+
private var speakerPendingPreRollFlush: Bool = false
|
|
141
|
+
private let speakerPreRollSeconds: Double = 1.0
|
|
142
|
+
private var speakerVerificationThreshold: Float = 0
|
|
143
|
+
private var speakerVerificationFrameSeq: UInt64 = 0
|
|
144
|
+
private var speakerVerificationSourceSampleRate: Int = 0
|
|
145
|
+
private var speakerVerificationTargetSampleRate: Int = 0
|
|
146
|
+
private var speakerVerificationResampleCarry: [Float] = []
|
|
147
|
+
private var speakerVerificationResamplePos: Double = 0
|
|
148
|
+
private var speakerLastPositiveMatchAt: CFTimeInterval = 0
|
|
149
|
+
|
|
108
150
|
// --- Speech recognition lite pause (counter-based) ---
|
|
109
151
|
private let speechPauseLock = NSLock()
|
|
110
152
|
private var speechRecognitionPauseCount: Int = 0
|
|
111
153
|
private var speechRecognitionPaused: Bool = false
|
|
112
154
|
/// Thread-safe read of the lite-pause flag.
///
/// - Returns: The value of `speechRecognitionPaused` as observed while holding `speechPauseLock`.
@inline(__always)
private func isSpeechRecognitionLitePaused() -> Bool {
    speechPauseLock.lock()
    // Release the lock on every exit path, after the return value has been read.
    defer { speechPauseLock.unlock() }
    return speechRecognitionPaused
}
|
|
161
|
+
@inline(__always)
|
|
113
162
|
private func resetSpeechRecognitionLitePauseState(_ why: String) {
|
|
114
163
|
speechPauseLock.lock()
|
|
115
164
|
speechRecognitionPauseCount = 0
|
|
@@ -394,10 +443,6 @@ private func pollOnMain(timeoutSec: TimeInterval,
|
|
|
394
443
|
}
|
|
395
444
|
micPaused = true
|
|
396
445
|
|
|
397
|
-
// ✅ CRITICAL: invalidate any pending async tap installs/probes/latches.
|
|
398
|
-
// This prevents closures scheduled earlier from touching nodes after engine is torn down.
|
|
399
|
-
bumpGraphGen()
|
|
400
|
-
|
|
401
446
|
let session = AVAudioSession.sharedInstance()
|
|
402
447
|
|
|
403
448
|
// Save current session config (so we can restore on unpause)
|
|
@@ -459,6 +504,7 @@ private func pollOnMain(timeoutSec: TimeInterval,
|
|
|
459
504
|
// Use this if we ever have duck others
|
|
460
505
|
// try session.setActive(false, options: [.notifyOthersOnDeactivation])
|
|
461
506
|
try session.setActive(false, options: [])
|
|
507
|
+
markAECSessionActivation(false, reason: "pauseMicrophone-pre")
|
|
462
508
|
NSLog("[STT] pauseMicrophone(): setActive false")
|
|
463
509
|
} catch {
|
|
464
510
|
NSLog("[STT] pauseMicrophone(): failed to switch setActive false: \(error.localizedDescription)")
|
|
@@ -473,6 +519,7 @@ private func pollOnMain(timeoutSec: TimeInterval,
|
|
|
473
519
|
// Switch to playback-only session so iOS releases the mic (indicator off)
|
|
474
520
|
do {
|
|
475
521
|
try session.setActive(true, options: [])
|
|
522
|
+
markAECSessionActivation(true, reason: "pauseMicrophone-playback")
|
|
476
523
|
NSLog("[STT] pauseMicrophone(): session set to .playback (mic released)")
|
|
477
524
|
} catch {
|
|
478
525
|
NSLog("[STT] pauseMicrophone(): failed to switch to session.setActive with .playback: \(error.localizedDescription)")
|
|
@@ -494,6 +541,7 @@ private func pollOnMain(timeoutSec: TimeInterval,
|
|
|
494
541
|
// Restore previous session category/mode/options and IO prefs
|
|
495
542
|
do {
|
|
496
543
|
try session.setActive(false, options: [.notifyOthersOnDeactivation])
|
|
544
|
+
markAECSessionActivation(false, reason: "unPauseMicrophone-pre")
|
|
497
545
|
} catch {
|
|
498
546
|
NSLog("[STT] unPauseMicrophone: setActive(false) failed: \(error.localizedDescription)")
|
|
499
547
|
}
|
|
@@ -521,6 +569,7 @@ private func pollOnMain(timeoutSec: TimeInterval,
|
|
|
521
569
|
|
|
522
570
|
do {
|
|
523
571
|
try session.setActive(true, options: [])
|
|
572
|
+
markAECSessionActivation(true, reason: "unPauseMicrophone")
|
|
524
573
|
} catch {
|
|
525
574
|
NSLog("[STT] unPauseMicrophone: setActive(true) failed: \(error.localizedDescription)")
|
|
526
575
|
}
|
|
@@ -613,32 +662,11 @@ private func pollOnMain(timeoutSec: TimeInterval,
|
|
|
613
662
|
return true
|
|
614
663
|
}
|
|
615
664
|
|
|
665
|
+
|
|
616
666
|
/// Removes the tap on `bus` from `node`, doing nothing when the node is `nil`
/// or is no longer attached to an engine.
///
/// - Parameters:
///   - node: Node whose tap should be removed; `nil` is ignored.
///   - bus: Bus the tap was installed on (defaults to 0).
@inline(__always)
private func safeRemoveTap(_ node: AVAudioNode?, bus: AVAudioNodeBus = 0) {
    // Only remove while the node is still attached to an engine.
    guard let attachedNode = node, attachedNode.engine != nil else { return }
    // `removeTap(onBus:)` is not a throwing call, so it is invoked directly
    // (a `try?` here only produced a compiler warning and handled nothing).
    attachedNode.removeTap(onBus: bus)
}
|
|
643
671
|
|
|
644
672
|
// MARK: - Public API (native replacements for the former RCT methods)
|
|
@@ -668,7 +696,7 @@ private func pollOnMain(timeoutSec: TimeInterval,
|
|
|
668
696
|
// >>> IMPORTANT: ensure no previous tap is left behind
|
|
669
697
|
self.safeRemoveTap(out, bus: 0)
|
|
670
698
|
|
|
671
|
-
|
|
699
|
+
out.installTap(onBus: 0, bufferSize: 128, format: nil) { [weak self, weak out] _, _ in
|
|
672
700
|
guard let self = self, gen == self.graphGen else { return }
|
|
673
701
|
if fired { return }
|
|
674
702
|
fired = true
|
|
@@ -758,6 +786,10 @@ private func pollOnMain(timeoutSec: TimeInterval,
|
|
|
758
786
|
NSLog("[STT] rearmTask(\(reason)) suppressed (micPaused)")
|
|
759
787
|
return
|
|
760
788
|
}
|
|
789
|
+
if isSpeechRecognitionLitePaused() {
|
|
790
|
+
NSLog("[STT] rearmTask(\(reason)) suppressed (speechRecognitionPaused)")
|
|
791
|
+
return
|
|
792
|
+
}
|
|
761
793
|
|
|
762
794
|
// -----------------
|
|
763
795
|
recognitionTask?.cancel()
|
|
@@ -774,6 +806,10 @@ private func checkTaskHealth() {
|
|
|
774
806
|
NSLog("[STT] watchdog: isTearingDown || isTelephonyInterrupted || isRecoveringAfterTelephony -- DOING NOTHING")
|
|
775
807
|
return
|
|
776
808
|
}
|
|
809
|
+
if isSpeechRecognitionLitePaused() {
|
|
810
|
+
NSLog("[STT] watchdog: speechRecognitionPaused -- DOING NOTHING")
|
|
811
|
+
return
|
|
812
|
+
}
|
|
777
813
|
if micPaused {
|
|
778
814
|
NSLog("[STT] watchdog: micPaused -- DOING NOTHING")
|
|
779
815
|
return
|
|
@@ -848,8 +884,26 @@ private func checkTaskHealth() {
|
|
|
848
884
|
}
|
|
849
885
|
|
|
850
886
|
/// Starts speech recognition with no speaker-verification configuration.
///
/// - Parameter localeStr: Locale identifier, forwarded unchanged to `startSpeechInternal`.
public func startSpeech(localeStr: String?) {
    // Passing `nil` as the config is what distinguishes this overload from the onboarding one.
    self.startSpeechInternal(localeStr: localeStr,
                             speakerVerificationConfig: nil)
}
|
|
889
|
+
|
|
890
|
+
/// Starts speech recognition using a speaker-verification config loaded from disk.
///
/// - Parameters:
///   - localeStr: Locale identifier, forwarded unchanged to `startSpeechInternal`.
///   - onboardingJsonPath: Path handed to `loadSpeakerVerificationStartConfig`.
///
/// If loading throws, recognition is not started and the failure is reported
/// through `sendResult` as an error payload.
public func startSpeech(localeStr: String?, onboardingJsonPath: String) {
    // Capture the throwing load as a value so both outcomes are handled in one switch.
    let loadOutcome = Result {
        try loadSpeakerVerificationStartConfig(onboardingJsonPath: onboardingJsonPath)
    }

    switch loadOutcome {
    case .success(let loaded):
        startSpeechInternal(localeStr: localeStr, speakerVerificationConfig: loaded)
    case .failure(let error):
        sendResult(error: ["message": "Failed to load onboarding JSON: \(error.localizedDescription)"],
                   bestTranscription: nil,
                   transcriptions: nil,
                   isFinal: nil)
    }
}
|
|
901
|
+
|
|
902
|
+
private func startSpeechInternal(localeStr: String?,
|
|
903
|
+
speakerVerificationConfig: SpeakerVerificationStartConfig?) {
|
|
904
|
+
NSLog("[STT] startSpeech(locale=\(localeStr ?? "nil"), sv=\(speakerVerificationConfig == nil ? "off" : "on"))")
|
|
905
|
+
lastLocaleStr = localeStr ?? ""
|
|
906
|
+
speakerVerificationStartConfig = speakerVerificationConfig
|
|
853
907
|
if recognitionTask != nil {
|
|
854
908
|
sendResult(error: ["code": "already_started", "message": "Speech recognition already started!"],
|
|
855
909
|
bestTranscription: nil, transcriptions: nil, isFinal: nil)
|
|
@@ -960,8 +1014,13 @@ private func checkTaskHealth() {
|
|
|
960
1014
|
catch { NSLog("[STT] setCategory failed: \(error.localizedDescription)") }
|
|
961
1015
|
}
|
|
962
1016
|
|
|
963
|
-
do {
|
|
964
|
-
|
|
1017
|
+
do {
|
|
1018
|
+
try s.setActive(true, options: [])
|
|
1019
|
+
self.markAECSessionActivation(true, reason: "updateSessionRouting")
|
|
1020
|
+
} catch {
|
|
1021
|
+
NSLog("[STT] setActive failed: \(error.localizedDescription)")
|
|
1022
|
+
self.markAECSessionActivation(false, reason: "updateSessionRouting-failed")
|
|
1023
|
+
}
|
|
965
1024
|
|
|
966
1025
|
// Optional: force 16k after activation
|
|
967
1026
|
self.force16kIfPossible(s)
|
|
@@ -978,6 +1037,10 @@ private func checkTaskHealth() {
|
|
|
978
1037
|
|
|
979
1038
|
// ↓↓↓ preferred settings helper
|
|
980
1039
|
private func force16kIfPossible(_ session: AVAudioSession) {
|
|
1040
|
+
if force16kMicSampleRate {
|
|
1041
|
+
try? session.setPreferredSampleRate(16_000)
|
|
1042
|
+
}
|
|
1043
|
+
|
|
981
1044
|
let hasExternalOutput = session.currentRoute.outputs.contains {
|
|
982
1045
|
switch $0.portType {
|
|
983
1046
|
case .headphones, .bluetoothA2DP, .bluetoothHFP, .bluetoothLE, .airPlay, .carAudio, .usbAudio:
|
|
@@ -1004,6 +1067,27 @@ private func checkTaskHealth() {
|
|
|
1004
1067
|
try? session.setPreferredIOBufferDuration(0.02)
|
|
1005
1068
|
}
|
|
1006
1069
|
|
|
1070
|
+
/// Records whether the audio session is currently considered active for AEC purposes.
///
/// While holding `aecSessionActivationLock`, stores `active` in `aecSessionIsActive`
/// and sets `lastAECSessionActivationAt` to the current media time on activation,
/// or to 0 on deactivation. The transition is logged after the lock is released.
///
/// - Parameters:
///   - active: New activation state to record.
///   - reason: Tag used only in the log line.
private func markAECSessionActivation(_ active: Bool, reason: String) {
    // Timestamp is taken before acquiring the lock.
    let now = CACurrentMediaTime()

    aecSessionActivationLock.lock()
    aecSessionIsActive = active
    // A zero timestamp marks "no activation on record".
    lastAECSessionActivationAt = active ? now : 0
    aecSessionActivationLock.unlock()

    NSLog("[STT] AEC session activation(\(reason)): active=\(active ? "YES" : "NO") t=\(String(format: "%.3f", now))")
}
|
|
1079
|
+
|
|
1080
|
+
/// Reports whether the session was activated recently enough to still be in the AEC warm-up window.
///
/// - Returns: `true` only when `aecEnabled` and `forceAECDuringRouteWarmup` are set,
///   `aecRouteWarmupSeconds` is positive, the session is recorded as active with a
///   non-zero activation timestamp, and less than `aecRouteWarmupSeconds` have
///   elapsed since that timestamp.
private func isInAECRouteWarmupWindow() -> Bool {
    let warmupConfigured = aecEnabled && forceAECDuringRouteWarmup && aecRouteWarmupSeconds > 0
    if !warmupConfigured { return false }

    let now = CACurrentMediaTime()

    // Snapshot both fields under the lock so they are read consistently.
    aecSessionActivationLock.lock()
    let (sessionActive, activatedAt) = (aecSessionIsActive, lastAECSessionActivationAt)
    aecSessionActivationLock.unlock()

    let elapsed = now - activatedAt
    return sessionActive && activatedAt > 0 && elapsed < aecRouteWarmupSeconds
}
|
|
1090
|
+
|
|
1007
1091
|
// MARK: - Core logic (kept intact, including AEC order/steps)
|
|
1008
1092
|
|
|
1009
1093
|
/// Returns true if no errors occurred (identical flow & calls as ObjC).
|
|
@@ -1016,6 +1100,7 @@ private func checkTaskHealth() {
|
|
|
1016
1100
|
|
|
1017
1101
|
do { try session.setActive(false, options: [.notifyOthersOnDeactivation]) }
|
|
1018
1102
|
catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
|
|
1103
|
+
markAECSessionActivation(false, reason: "setupAudioSession-pre")
|
|
1019
1104
|
|
|
1020
1105
|
// Build options to match our routing rules
|
|
1021
1106
|
// (defaultToSpeaker only when no external output is active)
|
|
@@ -1040,10 +1125,17 @@ private func checkTaskHealth() {
|
|
|
1040
1125
|
|
|
1041
1126
|
do { try session.setActive(false, options: [.notifyOthersOnDeactivation]) }
|
|
1042
1127
|
catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
|
|
1128
|
+
markAECSessionActivation(false, reason: "setupAudioSession-reconfigure")
|
|
1043
1129
|
|
|
1044
1130
|
// Force 16k before and after activation (some routes settle only after setActive)
|
|
1045
1131
|
force16kIfPossible(session)
|
|
1046
|
-
do {
|
|
1132
|
+
do {
|
|
1133
|
+
try session.setActive(true)
|
|
1134
|
+
markAECSessionActivation(true, reason: "setupAudioSession")
|
|
1135
|
+
} catch {
|
|
1136
|
+
err = error as NSError
|
|
1137
|
+
markAECSessionActivation(false, reason: "setupAudioSession-failed")
|
|
1138
|
+
}
|
|
1047
1139
|
NSLog("[STT] session SR=%.1f inCh=%d outCh=%d (wanted 16000)",
|
|
1048
1140
|
session.sampleRate,
|
|
1049
1141
|
Int(session.inputNumberOfChannels),
|
|
@@ -1061,6 +1153,65 @@ private func checkTaskHealth() {
|
|
|
1061
1153
|
return true
|
|
1062
1154
|
}
|
|
1063
1155
|
|
|
1156
|
+
/// Decides whether voice processing should be on for the current audio route.
///
/// - Returns: `false` when `aecEnabled` is off; `true` while `isInAECRouteWarmupWindow()`
///   reports true; otherwise `true` only when the current route outputs to the built-in
///   speaker and the built-in mic is either the preferred input or the first routed input.
private func shouldUseVoiceProcessingForCurrentRoute() -> Bool {
    if !aecEnabled { return false }
    if isInAECRouteWarmupWindow() { return true }

    let session = AVAudioSession.sharedInstance()
    let outputsToSpeaker = session.currentRoute.outputs.contains(where: { port in
        port.portType == .builtInSpeaker
    })
    let preferredIsBuiltInMic = session.preferredInput?.portType == .builtInMic
    let routedFromBuiltInMic = session.currentRoute.inputs.first?.portType == .builtInMic

    return outputsToSpeaker && (preferredIsBuiltInMic || routedFromBuiltInMic)
}
|
|
1165
|
+
|
|
1166
|
+
/// Applies the other-audio ducking configuration to `inputNode` on iOS 17 and later.
///
/// The configuration disables advanced ducking and selects the `.min` ducking level.
/// On earlier OS versions this does nothing.
///
/// - Parameter inputNode: Input node that receives the ducking configuration.
private func configureVoiceProcessingDucking(_ inputNode: AVAudioInputNode) {
    guard #available(iOS 17.0, *) else { return }

    var duckingConfig = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
    duckingConfig.enableAdvancedDucking = false
    duckingConfig.duckingLevel = .min
    inputNode.voiceProcessingOtherAudioDuckingConfiguration = duckingConfig
}
|
|
1174
|
+
|
|
1175
|
+
/// Brings the input node's voice-processing state in line with what the current route calls for.
///
/// - Parameters:
///   - engine: Engine whose input node is reconciled; `nil` makes this a no-op.
///   - reason: Tag included in log lines and in the rebuild reason.
///   - allowRebuild: When `true`, a failed toggle may trigger `rebuildEngineGraphAndRestart`,
///     provided STT is active and no teardown, mic pause, or telephony state is in effect.
///
/// When the node already matches the desired state, only the ducking configuration is
/// refreshed (if voice processing is desired) and the function returns.
private func reconcileAEC(on engine: AVAudioEngine?, reason: String, allowRebuild: Bool = true) {
    guard let engine = engine else { return }
    let micNode = engine.inputNode
    let wantsVP = shouldUseVoiceProcessingForCurrentRoute()

    // Fast path: nothing to toggle, just re-apply ducking when VP stays on.
    if #available(iOS 13.0, *), micNode.isVoiceProcessingEnabled == wantsVP {
        if wantsVP { configureVoiceProcessingDucking(micNode) }
        NSLog("[STT] AEC reconcile(\(reason)): unchanged vp=\(wantsVP ? "ON" : "OFF")")
        return
    }

    do {
        try micNode.setVoiceProcessingEnabled(wantsVP)
    } catch {
        NSLog("[STT] AEC reconcile(\(reason)) failed: \(error.localizedDescription)")
        // A rebuild is attempted only when the caller allows it and no conflicting state is active.
        let blocked = isTearingDown || micPaused || isTelephonyInterrupted || isRecoveringAfterTelephony
        if allowRebuild && sttActive && !blocked {
            rebuildEngineGraphAndRestart(reason: "aec-reconcile-\(reason)")
        }
        return
    }

    if wantsVP { configureVoiceProcessingDucking(micNode) }
    NSLog("[STT] AEC reconcile(\(reason)): set vp=\(wantsVP ? "ON" : "OFF")")
}
|
|
1201
|
+
|
|
1202
|
+
/// Schedules a series of delayed `reconcileAEC` calls on the main queue.
///
/// - Parameters:
///   - reason: Tag for log lines; each retry appends `-retry<N>`.
///   - attempts: Number of retries to schedule; values of 0 or less schedule nothing.
///   - stepSec: Spacing between retries; retry `N` fires `stepSec * N` seconds from now.
///
/// Each retry is skipped if the instance has been released, or if teardown, mic pause,
/// or a telephony interruption is in effect when it fires. Retries never trigger a rebuild.
private func scheduleAECReconcileRetries(reason: String,
                                         attempts: Int = 3,
                                         stepSec: TimeInterval = 0.20) {
    if attempts <= 0 { return }

    (1...attempts).forEach { i in
        let fireAt = DispatchTime.now() + stepSec * Double(i)
        DispatchQueue.main.asyncAfter(deadline: fireAt) { [weak self] in
            guard let self = self else { return }
            let suppressed = self.isTearingDown || self.micPaused || self.isTelephonyInterrupted
            guard !suppressed else { return }
            self.reconcileAEC(on: self.audioEngine, reason: "\(reason)-retry\(i)", allowRebuild: false)
        }
    }
}
|
|
1214
|
+
|
|
1064
1215
|
private func currentInputFormat(_ engine: AVAudioEngine) -> AVAudioFormat? {
|
|
1065
1216
|
// Prefer whatever CoreAudio currently provides; avoid cached formats.
|
|
1066
1217
|
let fmt = engine.inputNode.outputFormat(forBus: 0)
|
|
@@ -1097,22 +1248,8 @@ private func recoverAfterTelephonyInterruption() {
|
|
|
1097
1248
|
_ = setupAudioSession() // ✅ ensures correct mode/options
|
|
1098
1249
|
forceSpeakerIfReceiver("recoverAfterTelephony") // ✅ receiver -> speaker now
|
|
1099
1250
|
|
|
1100
|
-
let s = AVAudioSession.sharedInstance()
|
|
1101
|
-
let speakerRoute = s.currentRoute.outputs.contains { $0.portType == .builtInSpeaker }
|
|
1102
|
-
let usingBuiltInMic = (s.preferredInput?.portType == .builtInMic) ||
|
|
1103
|
-
(s.currentRoute.inputs.first?.portType == .builtInMic)
|
|
1104
|
-
let willUseVP = speakerRoute && usingBuiltInMic
|
|
1105
|
-
|
|
1106
1251
|
let inputNode = eng.inputNode
|
|
1107
|
-
|
|
1108
|
-
try? inputNode.setVoiceProcessingEnabled(true)
|
|
1109
|
-
if #available(iOS 17.0, *) {
|
|
1110
|
-
var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
|
|
1111
|
-
duck.enableAdvancedDucking = false
|
|
1112
|
-
duck.duckingLevel = .min
|
|
1113
|
-
inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
|
|
1114
|
-
}
|
|
1115
|
-
}
|
|
1252
|
+
reconcileAEC(on: eng, reason: "recover-after-telephony-prestart", allowRebuild: false)
|
|
1116
1253
|
|
|
1117
1254
|
eng.reset()
|
|
1118
1255
|
let micMixer = AVAudioMixerNode()
|
|
@@ -1131,6 +1268,8 @@ private func recoverAfterTelephonyInterruption() {
|
|
|
1131
1268
|
try eng.start()
|
|
1132
1269
|
armFirstIOCycleLatch(on: eng)
|
|
1133
1270
|
tryClearCaptureLossAfterStartSucceeded()
|
|
1271
|
+
reconcileAEC(on: eng, reason: "recover-after-telephony-poststart", allowRebuild: false)
|
|
1272
|
+
scheduleAECReconcileRetries(reason: "recover-after-telephony")
|
|
1134
1273
|
} catch {
|
|
1135
1274
|
NSLog("[STT] recover: engine.start failed → will let watchdog retry: \(error)")
|
|
1136
1275
|
return
|
|
@@ -1176,7 +1315,11 @@ private func recoverAfterTelephonyInterruption() {
|
|
|
1176
1315
|
return []
|
|
1177
1316
|
}
|
|
1178
1317
|
do {
|
|
1179
|
-
|
|
1318
|
+
var contents = try String(contentsOfFile: filePath, encoding: .utf8)
|
|
1319
|
+
// ✅ MIN FIX: remove UTF-8 BOM if present (often only affects the first token)
|
|
1320
|
+
if contents.unicodeScalars.first == "\u{FEFF}" {
|
|
1321
|
+
contents.unicodeScalars.removeFirst()
|
|
1322
|
+
}
|
|
1180
1323
|
let rawItems = contents.components(separatedBy: ",")
|
|
1181
1324
|
var cleaned: [String] = []
|
|
1182
1325
|
cleaned.reserveCapacity(rawItems.count)
|
|
@@ -1210,13 +1353,20 @@ private func recoverAfterTelephonyInterruption() {
|
|
|
1210
1353
|
if #available(iOS 16, *) { req.addsPunctuation = true }
|
|
1211
1354
|
req.shouldReportPartialResults = true
|
|
1212
1355
|
//if #available(iOS 13.0, *) { req.taskHint = .dictation }
|
|
1213
|
-
|
|
1356
|
+
let cs: [String] = loadContextualStrings()
|
|
1357
|
+
req.contextualStrings = cs
|
|
1358
|
+
NSLog("[STT] makeFreshRequest contextualStrings count=\(cs.count) sample=\(cs.prefix(10)) file=\(Bundle.main.path(forResource: "words_flattened", ofType: "txt") ?? "nil")")
|
|
1359
|
+
|
|
1214
1360
|
self.recognitionRequest = req
|
|
1215
|
-
NSLog("makeFreshRequest()")
|
|
1361
|
+
NSLog("[STT] makeFreshRequest()")
|
|
1216
1362
|
return req
|
|
1217
1363
|
}
|
|
1218
1364
|
|
|
1219
1365
|
private func startTask(_ req: SFSpeechAudioBufferRecognitionRequest) {
|
|
1366
|
+
if isSpeechRecognitionLitePaused() {
|
|
1367
|
+
NSLog("[STT] startTask suppressed (speechRecognitionPaused)")
|
|
1368
|
+
return
|
|
1369
|
+
}
|
|
1220
1370
|
NSLog("starting recognitionTask")
|
|
1221
1371
|
lastTaskStartAt = CACurrentMediaTime()
|
|
1222
1372
|
lastResultAt = lastTaskStartAt
|
|
@@ -1240,14 +1390,17 @@ private func recoverAfterTelephonyInterruption() {
|
|
|
1240
1390
|
|
|
1241
1391
|
/// Emits `onSpeechStart` the first time a result carries at least one non-blank segment.
///
/// - Parameter r: Recognition result to inspect; `nil` is ignored.
func markIfReal(_ r: SFSpeechRecognitionResult?) {
    guard let r = r else { return }

    // Inspect the raw segment substrings rather than `formattedString`: "real speech"
    // here means at least one segment is non-empty after trimming whitespace/newlines.
    let everySegmentBlank = r.bestTranscription.segments.allSatisfy { segment in
        segment.substring.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
    }
    let hasReal = !everySegmentBlank

    // Fire only once per `seenRealSpeech` cycle.
    guard hasReal, !self.seenRealSpeech else { return }
    self.seenRealSpeech = true
    NSLog("first real speech detected -> onSpeechStart to JS")
    self.sendEvent(name: "onSpeechStart", body: nil)
}
|
|
1253
1406
|
markIfReal(result)
|
|
@@ -1276,9 +1429,9 @@ private func recoverAfterTelephonyInterruption() {
|
|
|
1276
1429
|
}
|
|
1277
1430
|
|
|
1278
1431
|
let isFinal = result.isFinal
|
|
1279
|
-
let parts = result.transcriptions.map { $0.
|
|
1432
|
+
let parts = result.transcriptions.map { $0.segments.map { $0.substring }.joined(separator: " ") }
|
|
1280
1433
|
self.sendResult(error: nil,
|
|
1281
|
-
bestTranscription: result.bestTranscription.
|
|
1434
|
+
bestTranscription: result.bestTranscription.segments.map { $0.substring }.joined(separator: " "),
|
|
1282
1435
|
transcriptions: parts,
|
|
1283
1436
|
isFinal: isFinal)
|
|
1284
1437
|
|
|
@@ -1338,6 +1491,24 @@ private func recoverAfterTelephonyInterruption() {
|
|
|
1338
1491
|
}
|
|
1339
1492
|
mixerProbeActive = false
|
|
1340
1493
|
mixerProbeCompletions.removeAll()
|
|
1494
|
+
speakerVerificationEngine = nil
|
|
1495
|
+
speakerVerificationFrameSize = 0
|
|
1496
|
+
speakerVerificationInputBuffer.removeAll(keepingCapacity: false)
|
|
1497
|
+
speakerVerificationThreshold = 0
|
|
1498
|
+
speakerVerificationFrameSeq = 0
|
|
1499
|
+
speakerVerificationSourceSampleRate = 0
|
|
1500
|
+
speakerVerificationTargetSampleRate = 0
|
|
1501
|
+
speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
|
|
1502
|
+
speakerVerificationResamplePos = 0
|
|
1503
|
+
speakerLastPositiveMatchAt = 0
|
|
1504
|
+
setSpeakerGateState(enabled: false, open: true)
|
|
1505
|
+
speakerVerificationErrorSent = false
|
|
1506
|
+
speakerPreRollBuffers.removeAll(keepingCapacity: false)
|
|
1507
|
+
speakerPreRollFrames = 0
|
|
1508
|
+
speakerPreRollMaxFrames = 0
|
|
1509
|
+
speakerPendingPreRollFlush = false
|
|
1510
|
+
lastRouteSignature = ""
|
|
1511
|
+
markAECSessionActivation(false, reason: "teardown")
|
|
1341
1512
|
|
|
1342
1513
|
resetAudioSession()
|
|
1343
1514
|
savedSessionBeforePause = nil
|
|
@@ -1358,6 +1529,10 @@ private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
|
|
|
1358
1529
|
NSLog("[STT] ensureEngineRunning suppressed (telephony/recovering)")
|
|
1359
1530
|
return
|
|
1360
1531
|
}
|
|
1532
|
+
if isSpeechRecognitionLitePaused() {
|
|
1533
|
+
NSLog("[STT] ensureEngineRunning(\(reason)) suppressed (speechRecognitionPaused)")
|
|
1534
|
+
return
|
|
1535
|
+
}
|
|
1361
1536
|
if micPaused {
|
|
1362
1537
|
NSLog("[STT] ensureEngineRunning(\(reason)) suppressed (micPaused)")
|
|
1363
1538
|
return
|
|
@@ -1402,7 +1577,9 @@ private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
|
|
|
1402
1577
|
|
|
1403
1578
|
// ensure a task is running
|
|
1404
1579
|
if recognitionTask == nil {
|
|
1405
|
-
if
|
|
1580
|
+
if isSpeechRecognitionLitePaused() {
|
|
1581
|
+
NSLog("[STT] ensureEngineRunning(\(reason)): skip startTask (speechRecognitionPaused)")
|
|
1582
|
+
} else if let req = recognitionRequest {
|
|
1406
1583
|
startTask(req)
|
|
1407
1584
|
} else {
|
|
1408
1585
|
startTask(makeFreshRequest())
|
|
@@ -1482,6 +1659,10 @@ private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
|
|
|
1482
1659
|
bumpGraphGen()
|
|
1483
1660
|
NSLog("[STT] 🔄 rebuildEngineGraphAndRestart (\(reason))")
|
|
1484
1661
|
if isTelephonyInterrupted { NSLog("[STT] rebuild suppressed during telephony"); return }
|
|
1662
|
+
if isSpeechRecognitionLitePaused() {
|
|
1663
|
+
NSLog("[STT] rebuild suppressed (speechRecognitionPaused)")
|
|
1664
|
+
return
|
|
1665
|
+
}
|
|
1485
1666
|
|
|
1486
1667
|
guard hasValidCaptureNow() else {
|
|
1487
1668
|
markCaptureLost()
|
|
@@ -1511,27 +1692,8 @@ private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
|
|
|
1511
1692
|
_ = setupAudioSession() // ✅ keep session policy consistent
|
|
1512
1693
|
forceSpeakerIfReceiver("rebuild:\(reason)") // ✅ receiver -> speaker now
|
|
1513
1694
|
|
|
1514
|
-
let s = AVAudioSession.sharedInstance()
|
|
1515
|
-
let speakerRoute = s.currentRoute.outputs.contains { $0.portType == .builtInSpeaker }
|
|
1516
|
-
let usingBuiltInMic = (s.preferredInput?.portType == .builtInMic) ||
|
|
1517
|
-
(s.currentRoute.inputs.first?.portType == .builtInMic)
|
|
1518
|
-
|
|
1519
1695
|
let inputNode = newEngine.inputNode
|
|
1520
|
-
|
|
1521
|
-
if aecEnabled, speakerRoute && usingBuiltInMic {
|
|
1522
|
-
// AEC makes sense here
|
|
1523
|
-
do { try inputNode.setVoiceProcessingEnabled(true) } catch {
|
|
1524
|
-
NSLog("Voice processing not available: \(error)")
|
|
1525
|
-
}
|
|
1526
|
-
if #available(iOS 17.0, *) {
|
|
1527
|
-
var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
|
|
1528
|
-
duck.enableAdvancedDucking = false
|
|
1529
|
-
duck.duckingLevel = .min
|
|
1530
|
-
inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
|
|
1531
|
-
}
|
|
1532
|
-
} else {
|
|
1533
|
-
// Headsets / car / AirPlay → skip AEC
|
|
1534
|
-
}
|
|
1696
|
+
reconcileAEC(on: newEngine, reason: "rebuild-\(reason)-prestart", allowRebuild: false)
|
|
1535
1697
|
|
|
1536
1698
|
var inFmt = inputNode.outputFormat(forBus: 0)
|
|
1537
1699
|
|
|
@@ -1543,11 +1705,20 @@ private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
|
|
|
1543
1705
|
micMixer.outputVolume = 0.0
|
|
1544
1706
|
|
|
1545
1707
|
// TTS player → (de-esser) → mainMixer
|
|
1546
|
-
if
|
|
1708
|
+
if let existing = playbackNode, existing.engine !== newEngine {
|
|
1709
|
+
// Node is owned by a different engine instance; recreate for this graph.
|
|
1710
|
+
existing.stop()
|
|
1711
|
+
playbackNode = nil
|
|
1712
|
+
}
|
|
1713
|
+
if playbackNode == nil {
|
|
1714
|
+
playbackNode = AVAudioPlayerNode()
|
|
1715
|
+
}
|
|
1547
1716
|
if let player = playbackNode {
|
|
1548
|
-
if player.engine == nil {
|
|
1549
|
-
|
|
1550
|
-
|
|
1717
|
+
if player.engine == nil {
|
|
1718
|
+
newEngine.attach(player)
|
|
1719
|
+
}
|
|
1720
|
+
newEngine.connect(player, to: newEngine.mainMixerNode, format: nil)
|
|
1721
|
+
}
|
|
1551
1722
|
|
|
1552
1723
|
// // --- Aggressive low-pass only ---
|
|
1553
1724
|
// let deEss = AVAudioUnitEQ(numberOfBands: 1)
|
|
@@ -1572,6 +1743,8 @@ if let player = playbackNode {
|
|
|
1572
1743
|
try newEngine.start()
|
|
1573
1744
|
armFirstIOCycleLatch(on: newEngine)
|
|
1574
1745
|
tryClearCaptureLossAfterStartSucceeded()
|
|
1746
|
+
reconcileAEC(on: newEngine, reason: "rebuild-\(reason)-poststart", allowRebuild: false)
|
|
1747
|
+
scheduleAECReconcileRetries(reason: "rebuild-\(reason)")
|
|
1575
1748
|
NSLog("[STT] rebuild: engine.start() ok, running=\(newEngine.isRunning)")
|
|
1576
1749
|
} catch {
|
|
1577
1750
|
markCaptureLost()
|
|
@@ -1647,7 +1820,11 @@ if let player = playbackNode {
|
|
|
1647
1820
|
}
|
|
1648
1821
|
}
|
|
1649
1822
|
if self.recognitionTask == nil {
|
|
1650
|
-
|
|
1823
|
+
if isSpeechRecognitionLitePaused() {
|
|
1824
|
+
NSLog("[STT] rebuild: skip startTask (speechRecognitionPaused)")
|
|
1825
|
+
} else {
|
|
1826
|
+
startTask(self.recognitionRequest!)
|
|
1827
|
+
}
|
|
1651
1828
|
}
|
|
1652
1829
|
if self.sttActive && !self.micPaused {
|
|
1653
1830
|
self.installPlaybackHooks()
|
|
@@ -1656,6 +1833,10 @@ if let player = playbackNode {
|
|
|
1656
1833
|
|
|
1657
1834
|
@objc private func handleEngineConfigChange(_ note: Notification) {
|
|
1658
1835
|
if isTearingDown { return } // ← add
|
|
1836
|
+
if isSpeechRecognitionLitePaused() {
|
|
1837
|
+
NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange (ignored: speechRecognitionPaused)")
|
|
1838
|
+
return
|
|
1839
|
+
}
|
|
1659
1840
|
if micPaused {
|
|
1660
1841
|
NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange (ignored: micPaused)")
|
|
1661
1842
|
return
|
|
@@ -1667,12 +1848,18 @@ if let player = playbackNode {
|
|
|
1667
1848
|
playbackNode = nil
|
|
1668
1849
|
}
|
|
1669
1850
|
ensureEngineRunning(reason: "engine-config-change")
|
|
1851
|
+
reconcileAEC(on: audioEngine, reason: "engine-config-change")
|
|
1852
|
+
scheduleAECReconcileRetries(reason: "engine-config-change")
|
|
1670
1853
|
}
|
|
1671
1854
|
|
|
1672
1855
|
@objc private func handleMediaServicesReset(_ note: Notification) {
|
|
1673
1856
|
if isTearingDown { return } // ← add
|
|
1674
1857
|
|
|
1675
|
-
|
|
1858
|
+
if isSpeechRecognitionLitePaused() {
|
|
1859
|
+
NSLog("[STT] 📺 Media services RESET (ignored: speechRecognitionPaused)")
|
|
1860
|
+
return
|
|
1861
|
+
}
|
|
1862
|
+
if micPaused {
|
|
1676
1863
|
NSLog("[STT] 📺 Media services RESET (ignored: micPaused)")
|
|
1677
1864
|
return
|
|
1678
1865
|
}
|
|
@@ -1681,6 +1868,8 @@ if let player = playbackNode {
|
|
|
1681
1868
|
bumpGraphGen()
|
|
1682
1869
|
_ = setupAudioSession()
|
|
1683
1870
|
ensureEngineRunning(reason: "media-services-reset")
|
|
1871
|
+
reconcileAEC(on: audioEngine, reason: "media-services-reset")
|
|
1872
|
+
scheduleAECReconcileRetries(reason: "media-services-reset")
|
|
1684
1873
|
}
|
|
1685
1874
|
|
|
1686
1875
|
/*?????????? Why so many changes???
|
|
@@ -1720,19 +1909,52 @@ if let player = playbackNode {
|
|
|
1720
1909
|
*/
|
|
1721
1910
|
/// Responds to an audio-session route-change notification.
///
/// The notification is ignored (with a log line) while tearing down, while STT is
/// inactive, while speech recognition or the mic is paused, during telephony
/// interruption/recovery, and when the current input/output port signature equals
/// `lastRouteSignature`. Otherwise the new signature is stored, session routing is
/// refreshed for hardware/route-configuration reasons, AEC is reconciled, and the
/// engine is (re)started without the cooldown.
///
/// - Parameter note: The route-change notification; its `userInfo` is expected to
///   carry `AVAudioSessionRouteChangeReasonKey`.
@objc private func handleRouteChange(_ note: Notification) {
    guard !isTearingDown else { return }
    guard sttActive else {
        NSLog("[STT] 🔀 route change (ignored: sttInactive) \(note.userInfo ?? [:])")
        return
    }
    guard !isSpeechRecognitionLitePaused() else {
        NSLog("[STT] 🔀 route change (ignored: speechRecognitionPaused) \(note.userInfo ?? [:])")
        return
    }
    guard !micPaused else {
        NSLog("[STT] 🔀 route change (ignored: micPaused) \(note.userInfo ?? [:])")
        return
    }

    let info = note.userInfo ?? [:]
    NSLog("[STT] 🔀 route change: \(info)")
    guard !(isTelephonyInterrupted || isRecoveringAfterTelephony) else {
        NSLog("[STT] 🔀 route change (ignored during telephony/recovering): \(info)")
        return
    }

    // Build a compact "outs=...|ins=..." signature of the active ports so that
    // repeated notifications for an unchanged route can be dropped.
    let session = AVAudioSession.sharedInstance()
    let describePorts: ([AVAudioSessionPortDescription]) -> String = { ports in
        ports.map { $0.portType.rawValue }.joined(separator: ",")
    }
    let outSig = describePorts(session.currentRoute.outputs)
    let inSig = describePorts(session.currentRoute.inputs)
    let routeSig = "outs=\(outSig)|ins=\(inSig)"
    guard routeSig != lastRouteSignature else {
        NSLog("[STT] 🔀 route change ignored (same route signature)")
        return
    }
    lastRouteSignature = routeSig

    let parsedReason = (info[AVAudioSessionRouteChangeReasonKey] as? UInt)
        .flatMap { AVAudioSession.RouteChangeReason(rawValue: $0) }
    switch parsedReason {
    // Match AVAudioWrapper behavior: handle concrete hardware events + route config changes.
    case .newDeviceAvailable?, .oldDeviceUnavailable?, .routeConfigurationChange?:
        updateSessionRouting(selectBestInput: true)
    case let reason?:
        NSLog("[STT] 🔀 route change reason=\(reason.rawValue) -> skip updateSessionRouting")
    case nil:
        NSLog("[STT] 🔀 route change reason missing -> skip updateSessionRouting")
    }

    forceSpeakerIfReceiver("routeChange")
    reconcileAEC(on: audioEngine, reason: "route-change", allowRebuild: false)
    scheduleAECReconcileRetries(reason: "route-change")

    ensureEngineRunning(reason: "route-change", skipCooldown: true)
}
|
|
@@ -1757,7 +1979,7 @@ if let player = playbackNode {
|
|
|
1757
1979
|
var fired = false
|
|
1758
1980
|
self.safeRemoveTap(mixer, bus: 0)
|
|
1759
1981
|
|
|
1760
|
-
|
|
1982
|
+
mixer.installTap(onBus: 0, bufferSize: 128, format: nil) { [weak self, weak mixer] _, _ in
|
|
1761
1983
|
guard let self = self, gen == self.graphGen else { return }
|
|
1762
1984
|
if fired { return }
|
|
1763
1985
|
fired = true
|
|
@@ -1772,7 +1994,6 @@ if let player = playbackNode {
|
|
|
1772
1994
|
}
|
|
1773
1995
|
}
|
|
1774
1996
|
|
|
1775
|
-
|
|
1776
1997
|
DispatchQueue.main.asyncAfter(deadline: .now() + timeout) { [weak self, weak mixer] in
|
|
1777
1998
|
guard let self = self, gen == self.graphGen else { return }
|
|
1778
1999
|
if fired { return }
|
|
@@ -1857,7 +2078,12 @@ if let player = playbackNode {
|
|
|
1857
2078
|
tapFramesTotal = 0
|
|
1858
2079
|
|
|
1859
2080
|
// Re-activate the session (safe if already active)
|
|
1860
|
-
|
|
2081
|
+
do {
|
|
2082
|
+
try AVAudioSession.sharedInstance().setActive(true, options: [])
|
|
2083
|
+
markAECSessionActivation(true, reason: "interruption-ended")
|
|
2084
|
+
} catch {
|
|
2085
|
+
markAECSessionActivation(false, reason: "interruption-ended-failed")
|
|
2086
|
+
}
|
|
1861
2087
|
|
|
1862
2088
|
// Give routes/formats a moment to settle *before* we rebuild
|
|
1863
2089
|
DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) {
|
|
@@ -2006,12 +2232,27 @@ if let player = playbackNode {
|
|
|
2006
2232
|
lastNoInputRecoveryAt = 0
|
|
2007
2233
|
lastRearmAt = 0
|
|
2008
2234
|
lastReclaimAttempt = 0
|
|
2009
|
-
emaPartialGap = 0
|
|
2010
2235
|
tapFramesTotal = 0
|
|
2011
2236
|
lastTapFramesSeen = 0
|
|
2012
2237
|
pausedForCaptureLoss = false
|
|
2013
2238
|
mixerProbeActive = false
|
|
2014
2239
|
mixerProbeCompletions.removeAll()
|
|
2240
|
+
speakerVerificationEngine = nil
|
|
2241
|
+
speakerVerificationFrameSize = 0
|
|
2242
|
+
speakerVerificationInputBuffer.removeAll(keepingCapacity: false)
|
|
2243
|
+
speakerVerificationThreshold = 0
|
|
2244
|
+
speakerVerificationFrameSeq = 0
|
|
2245
|
+
speakerVerificationSourceSampleRate = 0
|
|
2246
|
+
speakerVerificationTargetSampleRate = 0
|
|
2247
|
+
speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
|
|
2248
|
+
speakerVerificationResamplePos = 0
|
|
2249
|
+
speakerLastPositiveMatchAt = 0
|
|
2250
|
+
setSpeakerGateState(enabled: false, open: true)
|
|
2251
|
+
speakerVerificationErrorSent = false
|
|
2252
|
+
speakerPreRollBuffers.removeAll(keepingCapacity: false)
|
|
2253
|
+
speakerPreRollFrames = 0
|
|
2254
|
+
speakerPreRollMaxFrames = 0
|
|
2255
|
+
speakerPendingPreRollFlush = false
|
|
2015
2256
|
|
|
2016
2257
|
audioSession = AVAudioSession.sharedInstance()
|
|
2017
2258
|
guard let session = audioSession else { return }
|
|
@@ -2060,7 +2301,9 @@ if let player = playbackNode {
|
|
|
2060
2301
|
}
|
|
2061
2302
|
request.shouldReportPartialResults = true
|
|
2062
2303
|
//if #available(iOS 13.0, *) { request.taskHint = .dictation }
|
|
2063
|
-
|
|
2304
|
+
let cs: [String] = loadContextualStrings()
|
|
2305
|
+
request.contextualStrings = cs
|
|
2306
|
+
NSLog("[STT] makeFreshRequest contextualStrings count=\(cs.count) sample=\(cs.prefix(10)) file=\(Bundle.main.path(forResource: "words_flattened", ofType: "txt") ?? "nil")")
|
|
2064
2307
|
|
|
2065
2308
|
guard recognitionRequest != nil else {
|
|
2066
2309
|
sendResult(error: ["code": "recognition_init"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
|
|
@@ -2079,25 +2322,7 @@ if let player = playbackNode {
|
|
|
2079
2322
|
let inputNode = engine.inputNode
|
|
2080
2323
|
let _ = inputNode // presence check
|
|
2081
2324
|
|
|
2082
|
-
|
|
2083
|
-
let speakerRoute = s.currentRoute.outputs.contains { $0.portType == .builtInSpeaker }
|
|
2084
|
-
let usingBuiltInMic = (s.preferredInput?.portType == .builtInMic) ||
|
|
2085
|
-
(s.currentRoute.inputs.first?.portType == .builtInMic)
|
|
2086
|
-
|
|
2087
|
-
if aecEnabled, speakerRoute && usingBuiltInMic {
|
|
2088
|
-
// AEC makes sense here
|
|
2089
|
-
do { try inputNode.setVoiceProcessingEnabled(true) } catch {
|
|
2090
|
-
NSLog("Voice processing not available: \(error)")
|
|
2091
|
-
}
|
|
2092
|
-
if #available(iOS 17.0, *) {
|
|
2093
|
-
var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
|
|
2094
|
-
duck.enableAdvancedDucking = false // disable advanced (VAD-based) ducking
|
|
2095
|
-
duck.duckingLevel = .min // “as loud as possible” for other audio
|
|
2096
|
-
inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
|
|
2097
|
-
}
|
|
2098
|
-
} else {
|
|
2099
|
-
// Headsets / car / AirPlay → skip AEC
|
|
2100
|
-
}
|
|
2325
|
+
reconcileAEC(on: engine, reason: "setup-start-prestart", allowRebuild: false)
|
|
2101
2326
|
|
|
2102
2327
|
// if output node voice processing is ever needed, keep commented as in original:
|
|
2103
2328
|
// do { try engine.outputNode.setVoiceProcessingEnabled(true) } catch { ... }
|
|
@@ -2151,7 +2376,72 @@ if let player = playbackNode {
|
|
|
2151
2376
|
return
|
|
2152
2377
|
}
|
|
2153
2378
|
|
|
2154
|
-
|
|
2379
|
+
var tapBufferSize: AVAudioFrameCount = 1024
|
|
2380
|
+
if let svStart = speakerVerificationStartConfig {
|
|
2381
|
+
do {
|
|
2382
|
+
var svConfig = svStart.config
|
|
2383
|
+
let routeSampleRate = Int(round(format.sampleRate))
|
|
2384
|
+
if useShortSpeakerVerificationTailWindow {
|
|
2385
|
+
let forcedTail = max(0.1, shortSpeakerVerificationTailSeconds)
|
|
2386
|
+
svConfig.tailSeconds = forcedTail
|
|
2387
|
+
if svConfig.maxTailSeconds < forcedTail {
|
|
2388
|
+
svConfig.maxTailSeconds = forcedTail
|
|
2389
|
+
}
|
|
2390
|
+
NSLog("[STT] SV tail override enabled tailSeconds=\(forcedTail)")
|
|
2391
|
+
}
|
|
2392
|
+
|
|
2393
|
+
speakerVerificationFrameSize = svConfig.frameSize
|
|
2394
|
+
speakerVerificationThreshold = svConfig.decisionThreshold
|
|
2395
|
+
speakerVerificationFrameSeq = 0
|
|
2396
|
+
speakerVerificationSourceSampleRate = routeSampleRate
|
|
2397
|
+
speakerVerificationTargetSampleRate = svConfig.sampleRate
|
|
2398
|
+
speakerVerificationResampleCarry.removeAll(keepingCapacity: true)
|
|
2399
|
+
speakerVerificationResamplePos = 0
|
|
2400
|
+
speakerLastPositiveMatchAt = 0
|
|
2401
|
+
speakerVerificationInputBuffer.removeAll(keepingCapacity: true)
|
|
2402
|
+
setSpeakerGateState(enabled: false, open: false)
|
|
2403
|
+
speakerVerificationErrorSent = false
|
|
2404
|
+
speakerPreRollBuffers.removeAll(keepingCapacity: true)
|
|
2405
|
+
speakerPreRollFrames = 0
|
|
2406
|
+
speakerPendingPreRollFlush = false
|
|
2407
|
+
speakerPreRollMaxFrames = max(1, Int(round(format.sampleRate * speakerPreRollSeconds)))
|
|
2408
|
+
|
|
2409
|
+
svConfig.logLevel = .off
|
|
2410
|
+
let svEngine = try SpeakerVerificationEngine(config: svConfig)
|
|
2411
|
+
svEngine.setEnrollment(svStart.enrollment)
|
|
2412
|
+
svEngine.resetStreamingState()
|
|
2413
|
+
|
|
2414
|
+
speakerVerificationEngine = svEngine
|
|
2415
|
+
setSpeakerGateState(enabled: true, open: false)
|
|
2416
|
+
tapBufferSize = AVAudioFrameCount(max(64, svConfig.frameSize))
|
|
2417
|
+
NSLog("[STT] Speaker verification gate enabled frameSize=\(svConfig.frameSize) tailSeconds=\(svConfig.tailSeconds) threshold=\(svConfig.decisionThreshold) hangover=\(useSpeakerGateHangover ? "ON" : "OFF") hangSec=\(String(format: "%.3f", speakerGateHangoverSeconds))")
|
|
2418
|
+
if routeSampleRate != svConfig.sampleRate {
|
|
2419
|
+
NSLog("[STT] SV resampling enabled \(routeSampleRate)Hz -> \(svConfig.sampleRate)Hz")
|
|
2420
|
+
} else {
|
|
2421
|
+
NSLog("[STT] SV sampleRate already matched at \(routeSampleRate)Hz")
|
|
2422
|
+
}
|
|
2423
|
+
} catch {
|
|
2424
|
+
speakerVerificationEngine = nil
|
|
2425
|
+
speakerVerificationThreshold = 0
|
|
2426
|
+
speakerVerificationFrameSeq = 0
|
|
2427
|
+
speakerVerificationSourceSampleRate = 0
|
|
2428
|
+
speakerVerificationTargetSampleRate = 0
|
|
2429
|
+
speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
|
|
2430
|
+
speakerVerificationResamplePos = 0
|
|
2431
|
+
speakerLastPositiveMatchAt = 0
|
|
2432
|
+
setSpeakerGateState(enabled: false, open: true)
|
|
2433
|
+
speakerPreRollBuffers.removeAll(keepingCapacity: false)
|
|
2434
|
+
speakerPreRollFrames = 0
|
|
2435
|
+
speakerPreRollMaxFrames = 0
|
|
2436
|
+
speakerPendingPreRollFlush = false
|
|
2437
|
+
sendResult(error: ["message": "Speaker verification disabled: \(error.localizedDescription)"],
|
|
2438
|
+
bestTranscription: nil,
|
|
2439
|
+
transcriptions: nil,
|
|
2440
|
+
isFinal: nil)
|
|
2441
|
+
}
|
|
2442
|
+
}
|
|
2443
|
+
|
|
2444
|
+
inputNode.installTap(onBus: 0, bufferSize: tapBufferSize, format: format) { [weak self] buffer, _ in
|
|
2155
2445
|
// Strongify self once
|
|
2156
2446
|
guard let self = self else { return }
|
|
2157
2447
|
// ✅ Count frames globally so the watchdog can see forward progress
|
|
@@ -2192,8 +2482,24 @@ if let player = playbackNode {
|
|
|
2192
2482
|
let value = self.averagePowerForChannel1
|
|
2193
2483
|
self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": value])
|
|
2194
2484
|
|
|
2485
|
+
if self.currentSpeakerGateState().enabled, let ch0 = buffer.floatChannelData?[0] {
|
|
2486
|
+
let mono = Array(UnsafeBufferPointer(start: ch0, count: Int(buffer.frameLength)))
|
|
2487
|
+
self.processSpeakerVerificationSamples(mono)
|
|
2488
|
+
}
|
|
2489
|
+
|
|
2195
2490
|
// Append to recognition
|
|
2196
|
-
self.
|
|
2491
|
+
let gate = self.currentSpeakerGateState()
|
|
2492
|
+
if !gate.enabled {
|
|
2493
|
+
self.recognitionRequest?.append(buffer)
|
|
2494
|
+
NSLog("[STT][SV][TAP] samples=\(buffer.frameLength) gate=DISABLED action=append")
|
|
2495
|
+
} else if gate.open {
|
|
2496
|
+
self.flushSpeakerPreRollIfNeeded()
|
|
2497
|
+
self.recognitionRequest?.append(buffer)
|
|
2498
|
+
NSLog("[STT][SV][TAP] samples=\(buffer.frameLength) gate=OPEN action=append preRollFrames=\(self.currentSpeakerPreRollFrames())")
|
|
2499
|
+
} else {
|
|
2500
|
+
self.enqueueSpeakerPreRoll(buffer)
|
|
2501
|
+
NSLog("[STT][SV][TAP] samples=\(buffer.frameLength) gate=CLOSED action=buffer preRollFrames=\(self.currentSpeakerPreRollFrames())")
|
|
2502
|
+
}
|
|
2197
2503
|
|
|
2198
2504
|
// inside inputNode.installTap { buffer, _ in
|
|
2199
2505
|
self.lastBufferAt = CACurrentMediaTime()
|
|
@@ -2205,6 +2511,8 @@ if let player = playbackNode {
|
|
|
2205
2511
|
do {
|
|
2206
2512
|
try engine.start()
|
|
2207
2513
|
armFirstIOCycleLatch(on: engine)
|
|
2514
|
+
reconcileAEC(on: engine, reason: "setup-start-poststart", allowRebuild: false)
|
|
2515
|
+
scheduleAECReconcileRetries(reason: "setup-start")
|
|
2208
2516
|
} catch {
|
|
2209
2517
|
audioSessionError = error as NSError
|
|
2210
2518
|
}
|
|
@@ -2251,6 +2559,236 @@ if let player = playbackNode {
|
|
|
2251
2559
|
}
|
|
2252
2560
|
}
|
|
2253
2561
|
|
|
2562
|
+
/// Builds the speaker-verification start configuration from an onboarding file.
///
/// - Parameter onboardingJsonPath: File-system path of the serialized enrollment.
/// - Returns: A start config pairing the decoded enrollment with its `configSnapshot`.
/// - Throws: Any error raised while reading the file or by `SpeakerEnrollment.deserialize`.
private func loadSpeakerVerificationStartConfig(onboardingJsonPath: String) throws -> SpeakerVerificationStartConfig {
    let fileURL = URL(fileURLWithPath: onboardingJsonPath)
    let payload = try Data(contentsOf: fileURL)
    let profile = try SpeakerEnrollment.deserialize(payload)
    return SpeakerVerificationStartConfig(enrollment: profile, config: profile.configSnapshot)
}
|
|
2567
|
+
|
|
2568
|
+
/// Returns a snapshot of the speaker gate flags, read while holding
/// `speakerVerificationStateLock`.
///
/// - Returns: `enabled` mirrors `speakerGateEnabled`; `open` mirrors `speakerGateOpen`.
private func currentSpeakerGateState() -> (enabled: Bool, open: Bool) {
    speakerVerificationStateLock.lock()
    defer { speakerVerificationStateLock.unlock() }
    return (speakerGateEnabled, speakerGateOpen)
}
|
|
2574
|
+
|
|
2575
|
+
/// Updates the speaker gate flags under `speakerVerificationStateLock`.
///
/// A pre-roll flush is requested when the gate becomes enabled-and-open from any
/// other state, and any pending flush request is cancelled when the gate is
/// disabled. A log line is written (outside the lock) only if either flag changed.
///
/// - Parameters:
///   - enabled: New value for `speakerGateEnabled`.
///   - open: New value for `speakerGateOpen`.
private func setSpeakerGateState(enabled: Bool, open: Bool) {
    speakerVerificationStateLock.lock()
    let previous = (enabled: speakerGateEnabled, open: speakerGateOpen)
    speakerGateEnabled = enabled
    speakerGateOpen = open
    let didChange = previous.enabled != enabled || previous.open != open
    if !enabled {
        speakerPendingPreRollFlush = false
    } else if open && !(previous.enabled && previous.open) {
        // Transition into enabled+open: buffered pre-roll should be flushed next.
        speakerPendingPreRollFlush = true
    }
    speakerVerificationStateLock.unlock()

    guard didChange else { return }
    NSLog("[STT][SV][GATE] enabled=\(enabled ? "YES" : "NO") open=\(open ? "YES" : "NO") th=\(speakerVerificationThreshold)")
}
|
|
2593
|
+
|
|
2594
|
+
/// Returns the number of frames currently held in the pre-roll queue, read while
/// holding `speakerVerificationStateLock`.
private func currentSpeakerPreRollFrames() -> Int {
    speakerVerificationStateLock.lock()
    defer { speakerVerificationStateLock.unlock() }
    return speakerPreRollFrames
}
|
|
2600
|
+
|
|
2601
|
+
/// Appends a copy of `buffer` to the pre-roll queue and trims the oldest entries
/// until the queued frame count no longer exceeds `speakerPreRollMaxFrames`.
///
/// Does nothing when `speakerPreRollMaxFrames` is not positive or the buffer
/// cannot be copied. All work happens while holding `speakerVerificationStateLock`.
///
/// - Parameter buffer: The buffer to retain; it is copied via `copyPCMBuffer`.
private func enqueueSpeakerPreRoll(_ buffer: AVAudioPCMBuffer) {
    speakerVerificationStateLock.lock()
    defer { speakerVerificationStateLock.unlock() }

    guard speakerPreRollMaxFrames > 0, let snapshot = copyPCMBuffer(buffer) else { return }
    speakerPreRollBuffers.append(snapshot)
    speakerPreRollFrames += Int(snapshot.frameLength)

    // Evict from the front (oldest first) while over budget.
    while speakerPreRollFrames > speakerPreRollMaxFrames, let oldest = speakerPreRollBuffers.first {
        speakerPreRollBuffers.removeFirst()
        speakerPreRollFrames -= Int(oldest.frameLength)
    }
}
|
|
2614
|
+
|
|
2615
|
+
/// Hands queued pre-roll audio to the recognition request if a flush is pending.
///
/// Under `speakerVerificationStateLock` the queue is drained and the pending flag
/// cleared. In legacy mode every queued buffer is forwarded; otherwise only the
/// most recent buffers are kept, enough to reach
/// `speakerVerificationSourceSampleRate * speakerPreRollFlushMaxSeconds` frames.
/// The selected buffers are appended, oldest first, after the lock is released.
private func flushSpeakerPreRollIfNeeded() {
    var toFlush: [AVAudioPCMBuffer] = []
    var totalFrames = 0

    speakerVerificationStateLock.lock()
    if speakerPendingPreRollFlush {
        totalFrames = speakerPreRollFrames
        toFlush = speakerPreRollBuffers

        if !useLegacySpeakerGateBehavior {
            let rate = max(1, speakerVerificationSourceSampleRate)
            let frameBudget = max(1, Int(round(Double(rate) * speakerPreRollFlushMaxSeconds)))
            if totalFrames > frameBudget {
                // Walk backwards from the newest buffer until the budget is met,
                // then keep exactly that many trailing buffers.
                var tailCount = 0
                var tailFrames = 0
                for candidate in speakerPreRollBuffers.reversed() {
                    tailCount += 1
                    tailFrames += Int(candidate.frameLength)
                    if tailFrames >= frameBudget { break }
                }
                toFlush = Array(speakerPreRollBuffers.suffix(tailCount))
            }
        }

        speakerPreRollBuffers.removeAll(keepingCapacity: false)
        speakerPreRollFrames = 0
        speakerPendingPreRollFlush = false
    }
    speakerVerificationStateLock.unlock()

    guard !toFlush.isEmpty else { return }
    let selectedFrames = toFlush.reduce(0) { $0 + Int($1.frameLength) }
    NSLog("[STT][SV][PREROLL] flushing buffers=\(toFlush.count) frames=\(selectedFrames) totalBuffered=\(totalFrames)")
    toFlush.forEach { recognitionRequest?.append($0) }
}
|
|
2653
|
+
|
|
2654
|
+
/// Creates an independent copy of a PCM buffer (same format, same frame length).
///
/// Supports 32-bit float, 16-bit integer and 32-bit integer sample data. For
/// deinterleaved formats each channel plane is copied separately. For interleaved
/// formats the channel pointers alias one region in which frames are spaced by
/// `stride` samples, so that whole region (`frameLength * stride` samples) is
/// copied once through channel pointer 0; copying `frameLength` samples per
/// channel pointer would drop most of a multi-channel interleaved buffer.
///
/// - Parameter source: The buffer to duplicate.
/// - Returns: The copy, or `nil` if allocation fails or the sample format is not
///   one of the three supported ones.
private func copyPCMBuffer(_ source: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
    guard let dst = AVAudioPCMBuffer(pcmFormat: source.format, frameCapacity: source.frameLength) else {
        return nil
    }
    dst.frameLength = source.frameLength

    let frames = Int(source.frameLength)
    let interleaved = source.format.isInterleaved
    // Interleaved data is one contiguous plane; deinterleaved data has one plane per channel.
    let planeCount = interleaved ? 1 : Int(source.format.channelCount)
    let samplesPerPlane = interleaved ? frames * source.stride : frames

    // Copies every plane of one sample type from `src` into `out`.
    func copyPlanes<Sample>(_ src: UnsafePointer<UnsafeMutablePointer<Sample>>,
                            _ out: UnsafePointer<UnsafeMutablePointer<Sample>>) {
        let bytes = samplesPerPlane * MemoryLayout<Sample>.size
        for plane in 0..<planeCount {
            memcpy(out[plane], src[plane], bytes)
        }
    }

    if let src = source.floatChannelData, let out = dst.floatChannelData {
        copyPlanes(src, out)
        return dst
    }
    if let src = source.int16ChannelData, let out = dst.int16ChannelData {
        copyPlanes(src, out)
        return dst
    }
    if let src = source.int32ChannelData, let out = dst.int32ChannelData {
        copyPlanes(src, out)
        return dst
    }

    return nil
}
|
|
2688
|
+
|
|
2689
|
+
/// Converts mono samples from the capture rate to the speaker-verification rate
/// using streaming linear interpolation.
///
/// State is carried between calls: `speakerVerificationResampleCarry` holds the
/// trailing source samples still needed for interpolation and
/// `speakerVerificationResamplePos` holds the fractional read position relative
/// to the start of that carry.
///
/// - Parameter input: Samples at `speakerVerificationSourceSampleRate`.
/// - Returns: Samples at `speakerVerificationTargetSampleRate`. The input is
///   returned unchanged when either rate is non-positive or the rates are equal;
///   an empty array is returned when the input is empty or fewer than two source
///   samples are available so far.
private func resampleSamplesForSpeakerVerificationIfNeeded(_ input: [Float]) -> [Float] {
    guard !input.isEmpty else { return [] }
    let srcRate = speakerVerificationSourceSampleRate
    let dstRate = speakerVerificationTargetSampleRate
    guard srcRate > 0, dstRate > 0 else { return input }
    if srcRate == dstRate { return input }

    let ratio = Double(srcRate) / Double(dstRate) // source samples per output sample
    let source = speakerVerificationResampleCarry + input
    guard source.count >= 2 else {
        // Not enough samples to interpolate yet; keep them for the next call.
        speakerVerificationResampleCarry = source
        return []
    }

    var out: [Float] = []
    out.reserveCapacity(Int(Double(input.count) * Double(dstRate) / Double(srcRate)) + 8)

    var pos = speakerVerificationResamplePos
    let limit = Double(source.count)
    while pos + 1.0 < limit {
        let i = Int(pos)
        let frac = Float(pos - Double(i))
        let a = source[i]
        let b = source[i + 1]
        out.append(a + (b - a) * frac)
        pos += ratio
    }

    // Keep one sample before the next read position. With a ratio above 3 the
    // read position can land more than one sample past the end of `source`, so
    // the start index is clamped to `source.count`; slicing beyond that would
    // trap. The remaining offset is carried over and applied to future input.
    let keepStart = min(source.count, max(0, Int(floor(pos)) - 1))
    speakerVerificationResampleCarry = Array(source[keepStart...])
    speakerVerificationResamplePos = pos - Double(keepStart)
    return out
}
|
|
2721
|
+
|
|
2722
|
+
/// Feeds captured mono samples to the speaker-verification engine and updates
/// the speaker gate from its decisions.
///
/// The work is dispatched asynchronously onto `speakerVerificationQueue`. Samples
/// are resampled if needed, accumulated in `speakerVerificationInputBuffer`, and
/// consumed in chunks of `speakerVerificationFrameSize`. A `.result` outcome opens
/// or closes the gate (optionally holding it open for the hangover window after
/// the last positive match). If the engine throws, verification is torn down, the
/// gate is disabled (left open), pre-roll state is cleared, and a single error is
/// reported on the main queue.
///
/// - Parameter samples: Mono samples at the capture sample rate; ignored if empty.
private func processSpeakerVerificationSamples(_ samples: [Float]) {
    if samples.isEmpty { return }
    speakerVerificationQueue.async { [weak self] in
        guard let self = self,
              let engine = self.speakerVerificationEngine else { return }
        let frameSize = self.speakerVerificationFrameSize
        if frameSize <= 0 { return }

        let normalized = self.resampleSamplesForSpeakerVerificationIfNeeded(samples)
        guard !normalized.isEmpty else { return }
        self.speakerVerificationInputBuffer.append(contentsOf: normalized)

        while self.speakerVerificationInputBuffer.count >= frameSize {
            // Pop exactly one frame off the front of the accumulator.
            let frame = Array(self.speakerVerificationInputBuffer[..<frameSize])
            self.speakerVerificationInputBuffer.removeFirst(frameSize)
            self.speakerVerificationFrameSeq &+= 1
            let seq = self.speakerVerificationFrameSeq

            do {
                switch try engine.processFrame(frame: frame) {
                case .pending(let p):
                    let gate = self.currentSpeakerGateState()
                    NSLog("[STT][SV][FRAME #\(seq)] pending buffered=\(p.bufferedSamples) neededSec=\(p.neededSeconds) gate=\(gate.open ? "OPEN" : "CLOSED") th=\(self.speakerVerificationThreshold)")
                case .result(let result):
                    let shouldOpen: Bool
                    if self.useLegacySpeakerGateBehavior || !self.useSpeakerGateHangover {
                        // No hangover: the gate follows the latest decision directly.
                        shouldOpen = result.isMatch
                    } else {
                        let now = CACurrentMediaTime()
                        if result.isMatch {
                            self.speakerLastPositiveMatchAt = now
                            shouldOpen = true
                        } else {
                            // Stay open while still inside the hangover window
                            // measured from the last positive match.
                            shouldOpen = self.speakerLastPositiveMatchAt > 0 &&
                                (now - self.speakerLastPositiveMatchAt) <= max(0, self.speakerGateHangoverSeconds)
                        }
                    }
                    self.setSpeakerGateState(enabled: true, open: shouldOpen)

                    let gate = self.currentSpeakerGateState()
                    NSLog("[STT][SV][FRAME #\(seq)] scoreBest=\(String(format: "%.4f", result.scoreBest)) raw=\(String(format: "%.4f", result.scoreBestRaw)) meancombo=\(String(format: "%.4f", result.scoreBestMeancombo)) mean=\(String(format: "%.4f", result.scoreMean)) match=\(result.isMatch ? "YES" : "NO") gate=\(gate.open ? "OPEN" : "CLOSED") th=\(String(format: "%.4f", self.speakerVerificationThreshold)) hangover=\(self.useSpeakerGateHangover ? "ON" : "OFF") hangSec=\(String(format: "%.3f", self.speakerGateHangoverSeconds))")
                }
            } catch {
                // Engine failure: drop verification state and leave the gate
                // disabled so audio is appended without gating from now on.
                self.speakerVerificationEngine = nil
                self.speakerVerificationInputBuffer.removeAll(keepingCapacity: false)
                self.speakerVerificationSourceSampleRate = 0
                self.speakerVerificationTargetSampleRate = 0
                self.speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
                self.speakerVerificationResamplePos = 0
                self.speakerLastPositiveMatchAt = 0
                self.setSpeakerGateState(enabled: false, open: true)

                self.speakerVerificationStateLock.lock()
                self.speakerPreRollBuffers.removeAll(keepingCapacity: false)
                self.speakerPreRollFrames = 0
                self.speakerPreRollMaxFrames = 0
                self.speakerVerificationStateLock.unlock()

                // Report the failure to the client only once.
                if !self.speakerVerificationErrorSent {
                    self.speakerVerificationErrorSent = true
                    DispatchQueue.main.async { [weak self] in
                        self?.sendResult(error: ["message": "Speaker verification stopped: \(error.localizedDescription)"],
                                         bestTranscription: nil,
                                         transcriptions: nil,
                                         isFinal: nil)
                    }
                }
                return
            }
        }
    }
}
|
|
2791
|
+
|
|
2254
2792
|
// MARK: - Helpers
|
|
2255
2793
|
private func _normalizedPowerLevelFromDecibels(_ decibels: CGFloat) -> CGFloat {
|
|
2256
2794
|
if decibels < -80.0 || decibels == 0.0 { return 0.0 }
|