react-native-davoice-tts 1.0.305 → 1.0.307
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/TTSRNBridge.podspec +1 -1
- package/ios/SpeechBridge/SpeechBridge.m +17 -2
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DavoiceTTS +0 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.abi.json +8831 -8831
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.private.swiftinterface +48 -48
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.swiftinterface +48 -48
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DavoiceTTS +0 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.abi.json +4092 -4092
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.private.swiftinterface +12 -12
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.swiftinterface +12 -12
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.abi.json +4092 -4092
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.private.swiftinterface +12 -12
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.swiftinterface +12 -12
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeDirectory +0 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeRequirements-1 +0 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeResources +24 -24
- package/package.json +1 -1
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift.AEC.CRASH.ETC +0 -2853
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift.AEC.CRASH.ETC +0 -2853
|
@@ -1,2853 +0,0 @@
|
|
|
1
|
-
// STT.swift
|
|
2
|
-
// Native iOS Swift version (AEC flow preserved 1:1)
|
|
3
|
-
|
|
4
|
-
import Foundation
|
|
5
|
-
import UIKit
|
|
6
|
-
import Speech
|
|
7
|
-
import Accelerate
|
|
8
|
-
import AVFAudio // or import AVFoundation
|
|
9
|
-
|
|
10
|
-
@objc public protocol STTDelegate: AnyObject {
|
|
11
|
-
@objc func stt(_ stt: STT, didEmitEvent name: String, body: [String: Any]?)
|
|
12
|
-
}
|
|
13
|
-
|
|
14
|
-
@objcMembers
|
|
15
|
-
public final class STT: NSObject, SFSpeechRecognizerDelegate {
|
|
16
|
-
public weak var delegate: STTDelegate?
|
|
17
|
-
public var continuous: Bool = true
|
|
18
|
-
|
|
19
|
-
// Global AEC toggle (default ON to keep existing behavior)
|
|
20
|
-
public var aecEnabled: Bool = true
|
|
21
|
-
// If true, force VP/AEC ON for a short window after session activation while routes settle.
|
|
22
|
-
public var forceAECDuringRouteWarmup: Bool = true
|
|
23
|
-
public var aecRouteWarmupSeconds: Double = 20.0
|
|
24
|
-
// If true, always request 16k input sample rate from AVAudioSession.
|
|
25
|
-
// iOS may still override this depending on route / voice processing constraints.
|
|
26
|
-
public var force16kMicSampleRate: Bool = false
|
|
27
|
-
// If true, use old SV gate behavior (immediate open/close + full pre-roll flush).
|
|
28
|
-
public var useLegacySpeakerGateBehavior: Bool = false
|
|
29
|
-
// If true, keep gate open for a short hangover after the last positive match.
|
|
30
|
-
public var useSpeakerGateHangover: Bool = true
|
|
31
|
-
public var speakerGateHangoverSeconds: Double = 0.40
|
|
32
|
-
// If true, override SV tailSeconds to 0.5s for faster switching tests.
|
|
33
|
-
public var useShortSpeakerVerificationTailWindow: Bool = true
|
|
34
|
-
public var shortSpeakerVerificationTailSeconds: Float = 0.5
|
|
35
|
-
// In protected mode, flush only this much recent pre-roll when gate reopens.
|
|
36
|
-
public var speakerPreRollFlushMaxSeconds: Double = 0.5
|
|
37
|
-
|
|
38
|
-
// MARK: - Private
|
|
39
|
-
private var speechRecognizer: SFSpeechRecognizer?
|
|
40
|
-
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
|
|
41
|
-
private var audioEngine: AVAudioEngine?
|
|
42
|
-
private var recognitionTask: SFSpeechRecognitionTask?
|
|
43
|
-
private var audioSession: AVAudioSession?
|
|
44
|
-
private let aecSessionActivationLock = NSLock()
|
|
45
|
-
private var lastAECSessionActivationAt: CFTimeInterval = 0
|
|
46
|
-
private var aecSessionIsActive: Bool = false
|
|
47
|
-
private var isTearingDown: Bool = false
|
|
48
|
-
private var sessionId: String?
|
|
49
|
-
private var priorAudioCategory: AVAudioSession.Category?
|
|
50
|
-
private var averagePowerForChannel0: Float = 0
|
|
51
|
-
private var averagePowerForChannel1: Float = 0
|
|
52
|
-
// Add to STT
|
|
53
|
-
private var isAdjustingRoute = false
|
|
54
|
-
private var lastRouteSignature: String = ""
|
|
55
|
-
|
|
56
|
-
private var playbackNode: AVAudioPlayerNode?
|
|
57
|
-
private var seenRealSpeech = false // flips true after first non-blank token
|
|
58
|
-
private var engineHotAt: CFTimeInterval = 0 // when engine actually started
|
|
59
|
-
private let warmupKeepAlive: CFTimeInterval = 4.0 // seconds we’ll keep re-arming in silence
|
|
60
|
-
|
|
61
|
-
// Keep-engine-alive helpers
|
|
62
|
-
private var lastReclaimAttempt: CFAbsoluteTime = 0
|
|
63
|
-
private let reclaimCooldown: CFTimeInterval = 1.0
|
|
64
|
-
|
|
65
|
-
// Serialize pause/unpause (and their waits)
|
|
66
|
-
private let micPauseLock = NSRecursiveLock()
|
|
67
|
-
|
|
68
|
-
// --- Task health ---
|
|
69
|
-
private var lastBufferAt: CFTimeInterval = 0 // updated from tap
|
|
70
|
-
private var lastResultAt: CFTimeInterval = 0 // updated from recognition callback
|
|
71
|
-
private var lastTaskStartAt: CFTimeInterval = 0
|
|
72
|
-
private var stallWatchdog: Timer?
|
|
73
|
-
private var consecutiveStallCount = 0
|
|
74
|
-
private let stallThreshold: CFTimeInterval = 8.0 // seconds w/o results while engine is hot
|
|
75
|
-
private let rearmCooldownTask: CFTimeInterval = 2.0
|
|
76
|
-
private var lastRearmAt: CFTimeInterval = 0
|
|
77
|
-
private var engineHot = false
|
|
78
|
-
private var hotAt: CFTimeInterval = 0
|
|
79
|
-
private var lastLocaleStr: String = ""
|
|
80
|
-
// --- Recovery & diagnostics ---
|
|
81
|
-
private var recoverySeq = 0
|
|
82
|
-
private var lastRecoveryAt: CFTimeInterval = 0
|
|
83
|
-
private var lastTaskOrigin: String = "cold"
|
|
84
|
-
private var savedSessionBeforePause: (
|
|
85
|
-
category: AVAudioSession.Category,
|
|
86
|
-
mode: AVAudioSession.Mode,
|
|
87
|
-
options: AVAudioSession.CategoryOptions,
|
|
88
|
-
sr: Double,
|
|
89
|
-
inCh: Int,
|
|
90
|
-
outCh: Int,
|
|
91
|
-
ioDur: TimeInterval
|
|
92
|
-
)?
|
|
93
|
-
|
|
94
|
-
private(set) var sttActive = false
|
|
95
|
-
// STT.swift (add near `private var playbackNode: AVAudioPlayerNode?`)
|
|
96
|
-
// private var ttsEQ: AVAudioUnitEQ?
|
|
97
|
-
|
|
98
|
-
// Add near your other state:
|
|
99
|
-
private var ioLatchActiveGen: UInt64 = 0
|
|
100
|
-
|
|
101
|
-
// TTS probe state
|
|
102
|
-
private var mixerProbeActive = false
|
|
103
|
-
private var mixerProbeCompletions: [(Bool) -> Void] = []
|
|
104
|
-
private let ttsSerial = DispatchQueue(label: "stt.tts.serial") // serialize TTS schedule/play
|
|
105
|
-
private var engineHasRenderedOnce = false
|
|
106
|
-
|
|
107
|
-
private var tapFramesTotal: UInt64 = 0 // monotonically increases inside input tap
|
|
108
|
-
private var lastTapFramesSeen: UInt64 = 0 // snapshot seen by watchdog
|
|
109
|
-
private var lastNoInputRecoveryAt: CFTimeInterval = 0
|
|
110
|
-
private var consecutiveNoInputResets = 0
|
|
111
|
-
// thresholds / cool-downs
|
|
112
|
-
private let noInputThreshold: CFTimeInterval = 1.0 // seconds without any buffers
|
|
113
|
-
private let noInputCooldown: CFTimeInterval = 5.0 // avoid thrashing recoveries
|
|
114
|
-
private let maxGentleRetries = 2 // try start() a couple times before rebuild
|
|
115
|
-
private var isTelephonyInterrupted = false
|
|
116
|
-
private var isRecoveringAfterTelephony = false // NEW
|
|
117
|
-
// MARK: - Post-telephony recognition kick
|
|
118
|
-
private var startedRecognitionAfterCall = false
|
|
119
|
-
// Add near other state:
|
|
120
|
-
private var activeTaskGen: UInt64 = 0
|
|
121
|
-
private var micPaused: Bool = false
|
|
122
|
-
|
|
123
|
-
// --- Optional speaker verification gate ---
|
|
124
|
-
private struct SpeakerVerificationStartConfig {
|
|
125
|
-
let enrollment: SpeakerEnrollment
|
|
126
|
-
let config: SpeakerVerificationConfig
|
|
127
|
-
}
|
|
128
|
-
private let speakerVerificationQueue = DispatchQueue(label: "stt.sv.queue")
|
|
129
|
-
private let speakerVerificationStateLock = NSLock()
|
|
130
|
-
private var speakerVerificationStartConfig: SpeakerVerificationStartConfig?
|
|
131
|
-
private var speakerVerificationEngine: SpeakerVerificationEngine?
|
|
132
|
-
private var speakerVerificationFrameSize: Int = 0
|
|
133
|
-
private var speakerVerificationInputBuffer: [Float] = []
|
|
134
|
-
private var speakerGateOpen: Bool = true
|
|
135
|
-
private var speakerGateEnabled: Bool = false
|
|
136
|
-
private var speakerVerificationErrorSent: Bool = false
|
|
137
|
-
private var speakerPreRollBuffers: [AVAudioPCMBuffer] = []
|
|
138
|
-
private var speakerPreRollFrames: Int = 0
|
|
139
|
-
private var speakerPreRollMaxFrames: Int = 0
|
|
140
|
-
private var speakerPendingPreRollFlush: Bool = false
|
|
141
|
-
private let speakerPreRollSeconds: Double = 1.0
|
|
142
|
-
private var speakerVerificationThreshold: Float = 0
|
|
143
|
-
private var speakerVerificationFrameSeq: UInt64 = 0
|
|
144
|
-
private var speakerVerificationSourceSampleRate: Int = 0
|
|
145
|
-
private var speakerVerificationTargetSampleRate: Int = 0
|
|
146
|
-
private var speakerVerificationResampleCarry: [Float] = []
|
|
147
|
-
private var speakerVerificationResamplePos: Double = 0
|
|
148
|
-
private var speakerLastPositiveMatchAt: CFTimeInterval = 0
|
|
149
|
-
|
|
150
|
-
// --- Speech recognition lite pause (counter-based) ---
|
|
151
|
-
private let speechPauseLock = NSLock()
|
|
152
|
-
private var speechRecognitionPauseCount: Int = 0
|
|
153
|
-
private var speechRecognitionPaused: Bool = false
|
|
154
|
-
@inline(__always)
|
|
155
|
-
private func isSpeechRecognitionLitePaused() -> Bool {
|
|
156
|
-
speechPauseLock.lock()
|
|
157
|
-
let paused = speechRecognitionPaused
|
|
158
|
-
speechPauseLock.unlock()
|
|
159
|
-
return paused
|
|
160
|
-
}
|
|
161
|
-
@inline(__always)
|
|
162
|
-
private func resetSpeechRecognitionLitePauseState(_ why: String) {
|
|
163
|
-
speechPauseLock.lock()
|
|
164
|
-
speechRecognitionPauseCount = 0
|
|
165
|
-
speechRecognitionPaused = false
|
|
166
|
-
speechPauseLock.unlock()
|
|
167
|
-
NSLog("[STT] resetSpeechRecognitionLitePauseState(\(why)) -> count=0 paused=NO")
|
|
168
|
-
}
|
|
169
|
-
|
|
170
|
-
// MARK: - Event names (unchanged)
|
|
171
|
-
public static let supportedEvents: [String] = [
|
|
172
|
-
"onSpeechResults",
|
|
173
|
-
"onSpeechStart",
|
|
174
|
-
"onSpeechPartialResults",
|
|
175
|
-
"onSpeechError",
|
|
176
|
-
"onSpeechEnd",
|
|
177
|
-
"onSpeechRecognized",
|
|
178
|
-
"onSpeechVolumeChanged"
|
|
179
|
-
]
|
|
180
|
-
private func removeEngineObservers() {
|
|
181
|
-
let nc = NotificationCenter.default
|
|
182
|
-
if let engine = audioEngine {
|
|
183
|
-
nc.removeObserver(self,
|
|
184
|
-
name: .AVAudioEngineConfigurationChange,
|
|
185
|
-
object: engine)
|
|
186
|
-
}
|
|
187
|
-
nc.removeObserver(self,
|
|
188
|
-
name: AVAudioSession.interruptionNotification,
|
|
189
|
-
object: AVAudioSession.sharedInstance())
|
|
190
|
-
nc.removeObserver(self,
|
|
191
|
-
name: AVAudioSession.routeChangeNotification,
|
|
192
|
-
object: AVAudioSession.sharedInstance())
|
|
193
|
-
nc.removeObserver(self,
|
|
194
|
-
name: AVAudioSession.mediaServicesWereResetNotification,
|
|
195
|
-
object: nil)
|
|
196
|
-
}
|
|
197
|
-
|
|
198
|
-
private func hasExternalOutput(_ s: AVAudioSession) -> Bool {
|
|
199
|
-
return s.currentRoute.outputs.contains {
|
|
200
|
-
switch $0.portType {
|
|
201
|
-
case .headphones, .bluetoothA2DP, .bluetoothHFP, .bluetoothLE, .airPlay, .carAudio, .usbAudio:
|
|
202
|
-
return true
|
|
203
|
-
default:
|
|
204
|
-
return false
|
|
205
|
-
}
|
|
206
|
-
}
|
|
207
|
-
}
|
|
208
|
-
|
|
209
|
-
// Force loudspeaker if iOS routes to receiver while we want speaker.
|
|
210
|
-
private func forceSpeakerIfReceiver(_ why: String) {
|
|
211
|
-
let s = AVAudioSession.sharedInstance()
|
|
212
|
-
|
|
213
|
-
// If there is ANY external output, never fight it.
|
|
214
|
-
if hasExternalOutput(s) { return }
|
|
215
|
-
|
|
216
|
-
let isReceiver = s.currentRoute.outputs.contains { $0.portType == .builtInReceiver }
|
|
217
|
-
if !isReceiver { return }
|
|
218
|
-
|
|
219
|
-
do {
|
|
220
|
-
try s.overrideOutputAudioPort(.speaker)
|
|
221
|
-
NSLog("[STT] 🔊 forceSpeakerIfReceiver(\(why)): receiver -> speaker")
|
|
222
|
-
} catch {
|
|
223
|
-
NSLog("[STT] 🔊 forceSpeakerIfReceiver(\(why)) failed: \(error.localizedDescription)")
|
|
224
|
-
}
|
|
225
|
-
}
|
|
226
|
-
|
|
227
|
-
private func stopRecognitionTaskLite(_ why: String) {
|
|
228
|
-
// Don't fight teardown / telephony / mic pause.
|
|
229
|
-
if isTearingDown || isTelephonyInterrupted || isRecoveringAfterTelephony || micPaused { return }
|
|
230
|
-
|
|
231
|
-
if recognitionTask != nil || recognitionRequest != nil {
|
|
232
|
-
NSLog("[STT] stopRecognitionTaskLite(\(why)) cancel+drop req/task")
|
|
233
|
-
}
|
|
234
|
-
|
|
235
|
-
recognitionTask?.cancel()
|
|
236
|
-
recognitionTask = nil
|
|
237
|
-
|
|
238
|
-
recognitionRequest?.endAudio()
|
|
239
|
-
recognitionRequest = nil
|
|
240
|
-
|
|
241
|
-
// reset "speech started" gating so we emit cleanly after resume
|
|
242
|
-
seenRealSpeech = false
|
|
243
|
-
}
|
|
244
|
-
|
|
245
|
-
@objc public func pauseSpeechRecognitionLite() {
|
|
246
|
-
// Update counter under lock
|
|
247
|
-
if isTearingDown || isTelephonyInterrupted || isRecoveringAfterTelephony || micPaused { return }
|
|
248
|
-
|
|
249
|
-
speechPauseLock.lock()
|
|
250
|
-
let wasZero = (speechRecognitionPauseCount == 0)
|
|
251
|
-
speechRecognitionPauseCount += 1
|
|
252
|
-
speechRecognitionPaused = true
|
|
253
|
-
let c = speechRecognitionPauseCount
|
|
254
|
-
speechPauseLock.unlock()
|
|
255
|
-
|
|
256
|
-
NSLog("[STT] pauseSpeechRecognitionLite(): count=\(c) (speechRecognitionPaused=YES)")
|
|
257
|
-
|
|
258
|
-
// Only act on the FIRST pause (0->1)
|
|
259
|
-
guard wasZero else { return }
|
|
260
|
-
|
|
261
|
-
// Lite behavior: cancel current speech task & drop request so buffers stop accumulating
|
|
262
|
-
DispatchQueue.main.async { [weak self] in
|
|
263
|
-
guard let self = self else { return }
|
|
264
|
-
|
|
265
|
-
// Only if STT is actually active and mic is not paused
|
|
266
|
-
if !self.sttActive { NSLog("[STT] pauseSpeechRecognitionLite: ignored (sttActive=NO)"); return }
|
|
267
|
-
if self.micPaused { NSLog("[STT] pauseSpeechRecognitionLite: ignored (micPaused=YES)"); return }
|
|
268
|
-
|
|
269
|
-
self.stopRecognitionTaskLite("lite-pause")
|
|
270
|
-
}
|
|
271
|
-
}
|
|
272
|
-
|
|
273
|
-
// NEW: times == -1 => clear all pauses (force resume)
|
|
274
|
-
// times >= 1 => decrement by N
|
|
275
|
-
@objc public func unPauseSpeechRecognitionLite(_ times: NSNumber) {
|
|
276
|
-
let n = times.intValue
|
|
277
|
-
|
|
278
|
-
speechPauseLock.lock()
|
|
279
|
-
if n == -1 {
|
|
280
|
-
speechRecognitionPauseCount = 0
|
|
281
|
-
} else if n > 0 {
|
|
282
|
-
speechRecognitionPauseCount = max(0, speechRecognitionPauseCount - n)
|
|
283
|
-
} else {
|
|
284
|
-
// 0 or weird negatives (except -1): do nothing
|
|
285
|
-
}
|
|
286
|
-
|
|
287
|
-
let reachedZero = (speechRecognitionPauseCount == 0)
|
|
288
|
-
if reachedZero { speechRecognitionPaused = false }
|
|
289
|
-
|
|
290
|
-
let c = speechRecognitionPauseCount
|
|
291
|
-
let paused = speechRecognitionPaused
|
|
292
|
-
speechPauseLock.unlock()
|
|
293
|
-
|
|
294
|
-
NSLog("[STT] unPauseSpeechRecognitionLite(times=\(n)): count=\(c) (speechRecognitionPaused=\(paused ? "YES" : "NO"))")
|
|
295
|
-
|
|
296
|
-
guard reachedZero else { return }
|
|
297
|
-
|
|
298
|
-
DispatchQueue.main.async { [weak self] in
|
|
299
|
-
guard let self = self else { return }
|
|
300
|
-
|
|
301
|
-
// Conditions you asked for
|
|
302
|
-
if self.isTearingDown { NSLog("[STT] lite-unpause: ignored (isTearingDown=YES)"); return }
|
|
303
|
-
if self.isTelephonyInterrupted || self.isRecoveringAfterTelephony {
|
|
304
|
-
NSLog("[STT] lite-unpause: ignored (telephony/recovering)")
|
|
305
|
-
return
|
|
306
|
-
}
|
|
307
|
-
if !self.sttActive { NSLog("[STT] lite-unpause: ignored (sttActive=NO)"); return }
|
|
308
|
-
if self.micPaused { NSLog("[STT] lite-unpause: ignored (micPaused=YES)"); return }
|
|
309
|
-
|
|
310
|
-
// If we don't currently have a task, create a FRESH request/task.
|
|
311
|
-
// Use your existing engine/task bring-up logic (keeps it “lite”).
|
|
312
|
-
self.ensureEngineRunning(reason: "lite-unpause", skipCooldown: true)
|
|
313
|
-
|
|
314
|
-
// Extra defensive: if engine is running but task didn't start, force a fresh task.
|
|
315
|
-
if self.recognitionTask == nil {
|
|
316
|
-
self.startTask(self.makeFreshRequest())
|
|
317
|
-
NSLog("[STT] lite-unpause: forced startTask(makeFreshRequest())")
|
|
318
|
-
}
|
|
319
|
-
}
|
|
320
|
-
}
|
|
321
|
-
|
|
322
|
-
@objc public func pauseMicrophoneAndWait(_ timeoutMs: NSNumber,
|
|
323
|
-
completion: @escaping (Bool, String?) -> Void) {
|
|
324
|
-
micPauseLock.lock()
|
|
325
|
-
|
|
326
|
-
// If already paused, just wait for settle condition (idempotent)
|
|
327
|
-
if !micPaused {
|
|
328
|
-
pauseMicrophone()
|
|
329
|
-
}
|
|
330
|
-
|
|
331
|
-
let timeoutSec = max(0.1, timeoutMs.doubleValue / 1000.0)
|
|
332
|
-
|
|
333
|
-
pollOnMain(timeoutSec: timeoutSec, intervalSec: 0.05,
|
|
334
|
-
condition: { [weak self] in
|
|
335
|
-
guard let self = self else { return false }
|
|
336
|
-
return self.isPausedSettled()
|
|
337
|
-
},
|
|
338
|
-
done: { [weak self] ok in
|
|
339
|
-
// IMPORTANT: unlock BEFORE calling completion (completion may call pause/unpause again)
|
|
340
|
-
self?.micPauseLock.unlock()
|
|
341
|
-
completion(ok, ok ? nil : "pause_timeout")
|
|
342
|
-
})
|
|
343
|
-
}
|
|
344
|
-
|
|
345
|
-
@objc public func unPauseMicrophoneAndWait(_ timeoutMs: NSNumber,
|
|
346
|
-
completion: @escaping (Bool, String?) -> Void) {
|
|
347
|
-
micPauseLock.lock()
|
|
348
|
-
|
|
349
|
-
// If not paused, still ensure we're “live” (idempotent)
|
|
350
|
-
if micPaused {
|
|
351
|
-
unPauseMicrophone()
|
|
352
|
-
}
|
|
353
|
-
|
|
354
|
-
let timeoutSec = max(0.1, timeoutMs.doubleValue / 1000.0)
|
|
355
|
-
|
|
356
|
-
pollOnMain(timeoutSec: timeoutSec, intervalSec: 0.05,
|
|
357
|
-
condition: { [weak self] in
|
|
358
|
-
guard let self = self else { return false }
|
|
359
|
-
return self.isUnpausedSettled()
|
|
360
|
-
},
|
|
361
|
-
done: { [weak self] ok in
|
|
362
|
-
self?.micPauseLock.unlock()
|
|
363
|
-
completion(ok, ok ? nil : "unpause_timeout")
|
|
364
|
-
})
|
|
365
|
-
}
|
|
366
|
-
|
|
367
|
-
// MARK: - settle conditions
|
|
368
|
-
|
|
369
|
-
private func isPausedSettled() -> Bool {
|
|
370
|
-
let s = AVAudioSession.sharedInstance()
|
|
371
|
-
|
|
372
|
-
// What "settled" means for PAUSE in your implementation:
|
|
373
|
-
// - micPaused flag latched
|
|
374
|
-
// - engine + task gone
|
|
375
|
-
// - session in playback
|
|
376
|
-
// - no input ports visible
|
|
377
|
-
if micPaused != true { return false }
|
|
378
|
-
if audioEngine != nil { return false }
|
|
379
|
-
if recognitionTask != nil { return false }
|
|
380
|
-
if recognitionRequest != nil { return false }
|
|
381
|
-
if s.category != .playback { return false }
|
|
382
|
-
if !s.currentRoute.inputs.isEmpty { return false }
|
|
383
|
-
|
|
384
|
-
return true
|
|
385
|
-
}
|
|
386
|
-
|
|
387
|
-
private func isUnpausedSettled() -> Bool {
|
|
388
|
-
let s = AVAudioSession.sharedInstance()
|
|
389
|
-
|
|
390
|
-
// What "settled" means for UNPAUSE:
|
|
391
|
-
// - micPaused false
|
|
392
|
-
// - engine running
|
|
393
|
-
// - recognitionTask exists + running
|
|
394
|
-
// - request exists
|
|
395
|
-
// - capture is valid
|
|
396
|
-
if micPaused != false { return false }
|
|
397
|
-
guard let eng = audioEngine, eng.isRunning else { return false }
|
|
398
|
-
guard let task = recognitionTask, task.state == .running else { return false }
|
|
399
|
-
guard recognitionRequest != nil else { return false }
|
|
400
|
-
if s.category != .playAndRecord { return false }
|
|
401
|
-
if !hasValidCaptureNow(allowColdEngine: true) { return false }
|
|
402
|
-
|
|
403
|
-
return true
|
|
404
|
-
}
|
|
405
|
-
|
|
406
|
-
// MARK: - polling helper (MAIN QUEUE)
|
|
407
|
-
|
|
408
|
-
private func pollOnMain(timeoutSec: TimeInterval,
|
|
409
|
-
intervalSec: TimeInterval,
|
|
410
|
-
condition: @escaping () -> Bool,
|
|
411
|
-
done: @escaping (Bool) -> Void) {
|
|
412
|
-
let deadline = CACurrentMediaTime() + timeoutSec
|
|
413
|
-
|
|
414
|
-
func step() {
|
|
415
|
-
// Always on main
|
|
416
|
-
if condition() {
|
|
417
|
-
done(true)
|
|
418
|
-
return
|
|
419
|
-
}
|
|
420
|
-
if CACurrentMediaTime() >= deadline {
|
|
421
|
-
done(false)
|
|
422
|
-
return
|
|
423
|
-
}
|
|
424
|
-
DispatchQueue.main.asyncAfter(deadline: .now() + intervalSec) {
|
|
425
|
-
step()
|
|
426
|
-
}
|
|
427
|
-
}
|
|
428
|
-
|
|
429
|
-
DispatchQueue.main.async {
|
|
430
|
-
step()
|
|
431
|
-
}
|
|
432
|
-
}
|
|
433
|
-
|
|
434
|
-
public func pauseMicrophone() {
|
|
435
|
-
NSLog("[STT] pauseMicrophone() requested")
|
|
436
|
-
|
|
437
|
-
// ✅ HARD reset speech-lite pause state on mic pause
|
|
438
|
-
resetSpeechRecognitionLitePauseState("pauseMicrophone")
|
|
439
|
-
|
|
440
|
-
guard !micPaused else {
|
|
441
|
-
NSLog("[STT] pauseMicrophone(): already paused")
|
|
442
|
-
return
|
|
443
|
-
}
|
|
444
|
-
micPaused = true
|
|
445
|
-
|
|
446
|
-
let session = AVAudioSession.sharedInstance()
|
|
447
|
-
|
|
448
|
-
// Save current session config (so we can restore on unpause)
|
|
449
|
-
if savedSessionBeforePause == nil {
|
|
450
|
-
let sr = session.sampleRate
|
|
451
|
-
let inCh = Int(session.inputNumberOfChannels)
|
|
452
|
-
let outCh = Int(session.outputNumberOfChannels)
|
|
453
|
-
let ioDur = session.ioBufferDuration
|
|
454
|
-
|
|
455
|
-
savedSessionBeforePause = (
|
|
456
|
-
category: session.category,
|
|
457
|
-
mode: session.mode,
|
|
458
|
-
options: session.categoryOptions,
|
|
459
|
-
sr: sr,
|
|
460
|
-
inCh: inCh,
|
|
461
|
-
outCh: outCh,
|
|
462
|
-
ioDur: ioDur
|
|
463
|
-
)
|
|
464
|
-
}
|
|
465
|
-
// Watchdog is pointless while paused (and would try to “heal” us)
|
|
466
|
-
stopWatchdog()
|
|
467
|
-
|
|
468
|
-
// Stop mic capture but keep TTS safe — remove taps first
|
|
469
|
-
if let eng = audioEngine {
|
|
470
|
-
safeRemoveTap(eng.inputNode)
|
|
471
|
-
safeRemoveTap(eng.mainMixerNode)
|
|
472
|
-
safeRemoveTap(eng.outputNode)
|
|
473
|
-
}
|
|
474
|
-
// 🔴 NEW: fully stop and tear down the engine so session can really deactivate
|
|
475
|
-
if let eng = audioEngine {
|
|
476
|
-
if eng.isRunning {
|
|
477
|
-
eng.stop()
|
|
478
|
-
}
|
|
479
|
-
eng.reset()
|
|
480
|
-
}
|
|
481
|
-
audioEngine = nil
|
|
482
|
-
|
|
483
|
-
// Clear playback node in this engine; unpause will rebuild a fresh engine+player graph
|
|
484
|
-
if let p = playbackNode {
|
|
485
|
-
p.stop()
|
|
486
|
-
}
|
|
487
|
-
playbackNode = nil
|
|
488
|
-
|
|
489
|
-
// Clear AudioPlaybackHook engine-based callbacks (defensive)
|
|
490
|
-
AudioPlaybackHook.currentEngine = nil
|
|
491
|
-
AudioPlaybackHook.engineScheduleFile = nil
|
|
492
|
-
AudioPlaybackHook.isEngineReady = nil
|
|
493
|
-
AudioPlaybackHook.useOnlyEnginePlayback = nil
|
|
494
|
-
AudioPlaybackHook.stopEnginePlayback = nil
|
|
495
|
-
|
|
496
|
-
// Stop recognition cleanly (we'll re-create on unpause)
|
|
497
|
-
recognitionTask?.cancel()
|
|
498
|
-
recognitionTask = nil
|
|
499
|
-
recognitionRequest?.endAudio()
|
|
500
|
-
recognitionRequest = nil
|
|
501
|
-
|
|
502
|
-
// Switch to playback-only session so iOS releases the mic (indicator off)
|
|
503
|
-
do {
|
|
504
|
-
// Use this if we ever have duck others
|
|
505
|
-
// try session.setActive(false, options: [.notifyOthersOnDeactivation])
|
|
506
|
-
try session.setActive(false, options: [])
|
|
507
|
-
markAECSessionActivation(false, reason: "pauseMicrophone-pre")
|
|
508
|
-
NSLog("[STT] pauseMicrophone(): setActive false")
|
|
509
|
-
} catch {
|
|
510
|
-
NSLog("[STT] pauseMicrophone(): failed to switch setActive false: \(error.localizedDescription)")
|
|
511
|
-
}
|
|
512
|
-
// Switch to playback-only session so iOS releases the mic (indicator off)
|
|
513
|
-
do {
|
|
514
|
-
try session.setCategory(.playback, options: [/*.mixWithOthers*/])
|
|
515
|
-
NSLog("[STT] pauseMicrophone(): session set to .playback (mic released)")
|
|
516
|
-
} catch {
|
|
517
|
-
NSLog("[STT] pauseMicrophone(): failed to switch to .playback: \(error.localizedDescription)")
|
|
518
|
-
}
|
|
519
|
-
// Switch to playback-only session so iOS releases the mic (indicator off)
|
|
520
|
-
do {
|
|
521
|
-
try session.setActive(true, options: [])
|
|
522
|
-
markAECSessionActivation(true, reason: "pauseMicrophone-playback")
|
|
523
|
-
NSLog("[STT] pauseMicrophone(): session set to .playback (mic released)")
|
|
524
|
-
} catch {
|
|
525
|
-
NSLog("[STT] pauseMicrophone(): failed to switch to session.setActive with .playback: \(error.localizedDescription)")
|
|
526
|
-
}
|
|
527
|
-
}
|
|
528
|
-
|
|
529
|
-
public func unPauseMicrophone() {
|
|
530
|
-
NSLog("[STT] unPauseMicrophone() requested")
|
|
531
|
-
guard micPaused else {
|
|
532
|
-
NSLog("[STT] unPauseMicrophone(): not paused")
|
|
533
|
-
return
|
|
534
|
-
}
|
|
535
|
-
// ✅ HARD reset speech-lite pause state on mic unpause
|
|
536
|
-
resetSpeechRecognitionLitePauseState("unPauseMicrophone")
|
|
537
|
-
|
|
538
|
-
let session = AVAudioSession.sharedInstance()
|
|
539
|
-
|
|
540
|
-
if let saved = savedSessionBeforePause {
|
|
541
|
-
// Restore previous session category/mode/options and IO prefs
|
|
542
|
-
do {
|
|
543
|
-
try session.setActive(false, options: [.notifyOthersOnDeactivation])
|
|
544
|
-
markAECSessionActivation(false, reason: "unPauseMicrophone-pre")
|
|
545
|
-
} catch {
|
|
546
|
-
NSLog("[STT] unPauseMicrophone: setActive(false) failed: \(error.localizedDescription)")
|
|
547
|
-
}
|
|
548
|
-
|
|
549
|
-
do {
|
|
550
|
-
try session.setCategory(saved.category,
|
|
551
|
-
mode: saved.mode,
|
|
552
|
-
options: saved.options)
|
|
553
|
-
} catch {
|
|
554
|
-
NSLog("[STT] unPauseMicrophone: restoring category failed: \(error.localizedDescription)")
|
|
555
|
-
}
|
|
556
|
-
|
|
557
|
-
if saved.sr > 0 {
|
|
558
|
-
try? session.setPreferredSampleRate(saved.sr)
|
|
559
|
-
}
|
|
560
|
-
if saved.inCh > 0 && session.isInputAvailable {
|
|
561
|
-
try? session.setPreferredInputNumberOfChannels(saved.inCh)
|
|
562
|
-
}
|
|
563
|
-
if saved.outCh > 0 {
|
|
564
|
-
try? session.setPreferredOutputNumberOfChannels(saved.outCh)
|
|
565
|
-
}
|
|
566
|
-
if saved.ioDur > 0 {
|
|
567
|
-
try? session.setPreferredIOBufferDuration(saved.ioDur)
|
|
568
|
-
}
|
|
569
|
-
|
|
570
|
-
do {
|
|
571
|
-
try session.setActive(true, options: [])
|
|
572
|
-
markAECSessionActivation(true, reason: "unPauseMicrophone")
|
|
573
|
-
} catch {
|
|
574
|
-
NSLog("[STT] unPauseMicrophone: setActive(true) failed: \(error.localizedDescription)")
|
|
575
|
-
}
|
|
576
|
-
_ = setupAudioSession()
|
|
577
|
-
|
|
578
|
-
// !!! IMPORTANT if micPaused = true then rebuildEngineGraphAndRestart will not activate necessary things!
|
|
579
|
-
micPaused = false
|
|
580
|
-
// Rebuild graph + reinstall tap + restart recognition
|
|
581
|
-
rebuildEngineGraphAndRestart(reason: "unpause-mic")
|
|
582
|
-
NSLog("[STT] unPauseMicrophone(): session restored + rebuildEngineGraphAndRestart() called")
|
|
583
|
-
startWatchdog()
|
|
584
|
-
|
|
585
|
-
// Clear so next pause re-snapshots the current config
|
|
586
|
-
savedSessionBeforePause = nil
|
|
587
|
-
} else {
|
|
588
|
-
// Fallback if we never saved a session (very defensive)
|
|
589
|
-
NSLog("[STT] unPauseMicrophone(): no savedSessionBeforePause, using setupAudioSession()")
|
|
590
|
-
_ = setupAudioSession()
|
|
591
|
-
rebuildEngineGraphAndRestart(reason: "unpause-mic-nosaved")
|
|
592
|
-
startWatchdog()
|
|
593
|
-
}
|
|
594
|
-
micPaused = false
|
|
595
|
-
}
|
|
596
|
-
|
|
597
|
-
private var graphGen: UInt64 = 0
|
|
598
|
-
@inline(__always) private func bumpGraphGen() { graphGen &+= 1; ioLatchActiveGen = 0 }
|
|
599
|
-
// Add near other state
|
|
600
|
-
private var pausedForCaptureLoss = false
|
|
601
|
-
|
|
602
|
-
private func markCaptureLost() {
|
|
603
|
-
pausedForCaptureLoss = true
|
|
604
|
-
}
|
|
605
|
-
|
|
606
|
-
private func tryClearCaptureLossAfterStartSucceeded() {
|
|
607
|
-
// Only clear after we actually start the engine
|
|
608
|
-
pausedForCaptureLoss = false
|
|
609
|
-
}
|
|
610
|
-
|
|
611
|
-
// MARK: - AEC Toggle API
|
|
612
|
-
|
|
613
|
-
/// Enable/disable iOS voice-processing (AEC + ducking).
|
|
614
|
-
/// If STT is already active, we rebuild the session/graph so it takes effect.
|
|
615
|
-
public func setAECEnabled(_ enabled: Bool) {
|
|
616
|
-
NSLog("[STT] setAECEnabled(\(enabled))")
|
|
617
|
-
aecEnabled = enabled
|
|
618
|
-
|
|
619
|
-
// If recognition is live, re-apply session + graph so change is effective
|
|
620
|
-
if sttActive {
|
|
621
|
-
_ = setupAudioSession()
|
|
622
|
-
rebuildEngineGraphAndRestart(reason: enabled ? "aec-on" : "aec-off")
|
|
623
|
-
}
|
|
624
|
-
}
|
|
625
|
-
|
|
626
|
-
public func isAECEnabled() -> Bool {
|
|
627
|
-
return aecEnabled
|
|
628
|
-
}
|
|
629
|
-
|
|
630
|
-
private func startRecognitionAfterCall() {
|
|
631
|
-
guard !startedRecognitionAfterCall else { return }
|
|
632
|
-
startedRecognitionAfterCall = true
|
|
633
|
-
|
|
634
|
-
// The local speech daemon can be in a funky state after telephony.
|
|
635
|
-
recreateSpeechRecognizerPreservingLocale()
|
|
636
|
-
|
|
637
|
-
let req = makeFreshRequest()
|
|
638
|
-
startTask(req)
|
|
639
|
-
|
|
640
|
-
// Recovery window ends only after we have a live task
|
|
641
|
-
isRecoveringAfterTelephony = false
|
|
642
|
-
startWatchdog() // resume health checks now that we're live again
|
|
643
|
-
NSLog("[STT] recovery: recognition task started after buffers observed")
|
|
644
|
-
}
|
|
645
|
-
|
|
646
|
-
/// Returns `true` when the audio session currently exposes a usable input.
/// - Parameter allowColdEngine: when `false` AND the engine is running, the
///   engine's live input-node format is also validated; when `true` (default)
///   a cold engine is acceptable — the session alone decides.
private func hasValidCaptureNow(allowColdEngine: Bool = true) -> Bool {
    let s = AVAudioSession.sharedInstance()

    // Real “no mic” conditions (A2DP only, telephony, etc.)
    guard s.isInputAvailable,
          !s.currentRoute.inputs.isEmpty,
          s.inputNumberOfChannels > 0,
          s.sampleRate > 0 else { return false }

    // If we require a hot engine, check the node *only when running*.
    if !allowColdEngine, let eng = audioEngine, eng.isRunning {
        let f = eng.inputNode.outputFormat(forBus: 0)
        return f.sampleRate > 0 && f.channelCount > 0
    }

    // Session says we have input; engine may be cold — that’s fine to attempt start().
    return true
}
|
|
664
|
-
|
|
665
|
-
|
|
666
|
-
/// Removes the tap on `bus` only when `node` is still attached to an engine;
/// removing a tap from a detached node is unsafe, hence the guard.
/// - Parameters:
///   - node: the node to clean up; `nil` is a silent no-op.
///   - bus: the tap bus (defaults to 0, matching every install site here).
@inline(__always)
private func safeRemoveTap(_ node: AVAudioNode?, bus: AVAudioNodeBus = 0) {
    guard let n = node, n.engine != nil else { return } // only remove if still attached
    // Fix: `removeTap(onBus:)` is non-throwing, so the previous `try?` was a
    // no-op that only produced a "no calls to throwing functions" warning.
    n.removeTap(onBus: bus)
}
|
|
671
|
-
|
|
672
|
-
// MARK: - Public API (native replacements for the former RCT methods)
|
|
673
|
-
|
|
674
|
-
/// Requests speech-recognition authorization and reports availability.
/// `completion` receives `true` only for `.authorized`; any other status
/// (denied, restricted, not determined, unknown) yields `false`.
/// Note: the framework may invoke the callback off the main queue.
public func isSpeechAvailable(_ completion: @escaping (Bool) -> Void) {
    SFSpeechRecognizer.requestAuthorization { status in
        completion(status == .authorized)
    }
}
|
|
682
|
-
|
|
683
|
-
/// Arms a one-shot "the engine has rendered at least one IO cycle" latch.
/// Installs a tiny tap on the output node; the first buffer sets
/// `engineHasRenderedOnce`, then the tap self-removes. A 2-second timeout
/// fails open (sets the flag anyway) so downstream logic never deadlocks.
/// Latches are keyed to `graphGen` so a stale latch from a torn-down graph
/// cannot fire against a newer one.
private func armFirstIOCycleLatch(on engine: AVAudioEngine) {
    engineHasRenderedOnce = false
    let gen = graphGen

    // Prevent overlapping latches against the same graph generation.
    if ioLatchActiveGen == gen { return }
    ioLatchActiveGen = gen

    DispatchQueue.main.async { [weak self, weak engine] in
        // Bail if the graph generation moved on while we hopped to main.
        guard let self = self, let eng = engine, gen == self.graphGen else { return }
        let out = eng.outputNode
        var fired = false

        // >>> IMPORTANT: ensure no previous tap is left behind
        self.safeRemoveTap(out, bus: 0)

        // NOTE(review): `fired` / `engineHasRenderedOnce` are written from the
        // audio tap callback (not the main thread) — presumably benign for a
        // one-shot Bool latch, but worth confirming.
        out.installTap(onBus: 0, bufferSize: 128, format: nil) { [weak self, weak out] _, _ in
            guard let self = self, gen == self.graphGen else { return }
            if fired { return }
            fired = true
            self.safeRemoveTap(out, bus: 0)
            self.engineHasRenderedOnce = true
            // latch finished for this gen
            if self.ioLatchActiveGen == gen { self.ioLatchActiveGen = 0 }
        }

        // Fail-open timeout: if no buffer arrived within 2s, clean up the tap
        // and set the flag anyway.
        DispatchQueue.main.asyncAfter(deadline: .now() + 2.0) { [weak self, weak out] in
            guard let self = self, gen == self.graphGen else { return }
            if fired { return }
            self.safeRemoveTap(out, bus: 0)
            self.engineHasRenderedOnce = true // fail-open
            if self.ioLatchActiveGen == gen { self.ioLatchActiveGen = 0 }
        }
    }
}
|
|
718
|
-
|
|
719
|
-
/// True while a recognition task exists and is in the `.running` state.
public func isRecognizing() -> Bool {
    recognitionTask?.state == .running
}
|
|
723
|
-
|
|
724
|
-
/// Returns a player node attached to `engine`, reusing the cached one when it
/// already belongs to this engine, otherwise creating, attaching, and wiring
/// a new node directly into the main mixer.
/// (A de-esser/EQ stage between player and mixer existed here only as
/// commented-out prototype code and was never active.)
private func ensurePlaybackNode(in engine: AVAudioEngine) -> AVAudioPlayerNode {
    // Reuse only when the cached node is attached to *this* engine instance.
    if let cached = playbackNode, cached.engine === engine {
        return cached
    }

    let node = AVAudioPlayerNode()
    playbackNode = node
    engine.attach(node)

    // Direct route: player -> mainMixer.
    engine.connect(node, to: engine.mainMixerNode, format: nil)
    return node
}
|
|
765
|
-
|
|
766
|
-
/// (Re)starts the 2-second recognition health-check timer.
/// Any previous watchdog is invalidated first so at most one timer runs.
private func startWatchdog() {
    stallWatchdog?.invalidate()
    // Bind to a local so we never force-unwrap the optional property below.
    let timer = Timer.scheduledTimer(withTimeInterval: 2.0, repeats: true) { [weak self] _ in
        self?.checkTaskHealth()
    }
    stallWatchdog = timer
    // .common mode keeps the watchdog firing during UI tracking (scrolling, etc.).
    RunLoop.main.add(timer, forMode: .common)
}
|
|
773
|
-
|
|
774
|
-
/// Stops and releases the health-check timer, if one is running.
private func stopWatchdog() {
    guard let timer = stallWatchdog else { return }
    timer.invalidate()
    stallWatchdog = nil
}
|
|
778
|
-
|
|
779
|
-
/// Replaces the recognition task with a fresh one while keeping the engine
/// and input tap running. Suppressed entirely during telephony, recovery,
/// mic pause, or lite-pause — those states own the task lifecycle.
/// - Parameter reason: free-form tag used only for logging.
private func rearmTask(reason: String) {
    // Cancel old task only — keep the engine and tap running.
    if isTelephonyInterrupted || isRecoveringAfterTelephony {
        NSLog("[STT] rearmTask(\(reason)) suppressed (telephony/recovering)")
        return
    }
    if micPaused {
        NSLog("[STT] rearmTask(\(reason)) suppressed (micPaused)")
        return
    }
    if isSpeechRecognitionLitePaused() {
        NSLog("[STT] rearmTask(\(reason)) suppressed (speechRecognitionPaused)")
        return
    }

    // -----------------
    recognitionTask?.cancel()
    recognitionTask = nil

    // Reset per-task state and stamp the start time so the watchdog's grace
    // window (see checkTaskHealth) applies to the new task.
    seenRealSpeech = false
    lastTaskStartAt = CACurrentMediaTime()
    startTask(makeFreshRequest())
    NSLog("[STT] rearmTask(\(reason)) -> new task started")
}
|
|
803
|
-
|
|
804
|
-
/// Watchdog body, run every 2 s by `startWatchdog()`.
/// Escalation ladder (each step returns if it acts):
///   suppressors → grace window → 0) capture gone → 1) engine down →
///   2) recognizer unavailable → 3) no task → 4) no input buffers →
///   5) buffers but no results (stall).
/// Cooldowns (`rearmCooldownTask`, `noInputCooldown`) keep the ladder from
/// thrashing; repeated failures escalate to graph rebuild / recognizer reset.
private func checkTaskHealth() {
    if isTearingDown || isTelephonyInterrupted || isRecoveringAfterTelephony {
        NSLog("[STT] watchdog: isTearingDown || isTelephonyInterrupted || isRecoveringAfterTelephony -- DOING NOTHING")
        return
    }
    if isSpeechRecognitionLitePaused() {
        NSLog("[STT] watchdog: speechRecognitionPaused -- DOING NOTHING")
        return
    }
    if micPaused {
        NSLog("[STT] watchdog: micPaused -- DOING NOTHING")
        return
    }

    let now = CACurrentMediaTime()

    // ⛳️ GRACE: don’t call "stall" right after a new task begins
    if now - lastTaskStartAt < 5.0 { return }

    // 0) No capture? Wait quietly.
    if !hasValidCaptureNow(allowColdEngine: true) {
        markCaptureLost()
        NSLog("[STT] watchdog: capture not available; waiting…")
        return
    }

    // 1) Engine down? Bring it up (bypass cooldown from watchdog).
    if audioEngine == nil || !(audioEngine?.isRunning ?? false) {
        NSLog("[STT] watchdog: engine down → ensureEngineRunning")
        ensureEngineRunning(reason: "watchdog-engine-down", skipCooldown: true)
        return
    }

    // 2) Recognizer unavailable? wait.
    if let rec = speechRecognizer, rec.isAvailable == false {
        NSLog("[STT] watchdog: recognizer unavailable; waiting…")
        return
    }

    // 3) No task? start one.
    if recognitionTask == nil {
        if now - lastRearmAt > rearmCooldownTask {
            lastRearmAt = now
            startTask(makeFreshRequest())
        }
        return
    }

    // 4) No input buffers? gentle nudge, then rebuild if repeated.
    let timeSinceBuffer = now - lastBufferAt
    if timeSinceBuffer > noInputThreshold {
        if now - lastNoInputRecoveryAt > noInputCooldown {
            lastNoInputRecoveryAt = now
            consecutiveNoInputResets += 1
            ensureEngineRunning(reason: "watchdog-no-input", skipCooldown: true)
            // After maxGentleRetries gentle nudges in a row, escalate to a
            // full graph rebuild and reset the counter.
            if consecutiveNoInputResets >= maxGentleRetries {
                consecutiveNoInputResets = 0
                rebuildEngineGraphAndRestart(reason: "watchdog-no-input-rebuild")
            }
        }
        return
    } else {
        consecutiveNoInputResets = 0
    }

    // 5) Buffers flowing but no results → rearm task.
    let noResultsFor = now - lastResultAt
    if noResultsFor > stallThreshold {
        if now - lastRearmAt > rearmCooldownTask {
            lastRearmAt = now
            consecutiveStallCount += 1
            rearmTask(reason: "watchdog-stall")
            // Three consecutive stalls → recreate the recognizer itself.
            if consecutiveStallCount >= 3 {
                recreateSpeechRecognizerPreservingLocale()
                consecutiveStallCount = 0
            }
        }
    } else {
        consecutiveStallCount = 0
    }
}
|
|
885
|
-
|
|
886
|
-
/// Starts speech recognition without speaker verification.
/// - Parameter localeStr: locale identifier, or nil for the default locale.
public func startSpeech(localeStr: String?) {
    startSpeechInternal(localeStr: localeStr, speakerVerificationConfig: nil)
}
|
|
889
|
-
|
|
890
|
-
/// Starts speech recognition with speaker verification.
/// Loads the verification config from `onboardingJsonPath` first; on load
/// failure an error result is emitted and recognition is not started.
public func startSpeech(localeStr: String?, onboardingJsonPath: String) {
    let config: SpeakerVerificationStartConfig
    do {
        config = try loadSpeakerVerificationStartConfig(onboardingJsonPath: onboardingJsonPath)
    } catch {
        sendResult(error: ["message": "Failed to load onboarding JSON: \(error.localizedDescription)"],
                   bestTranscription: nil,
                   transcriptions: nil,
                   isFinal: nil)
        return
    }
    startSpeechInternal(localeStr: localeStr, speakerVerificationConfig: config)
}
|
|
901
|
-
|
|
902
|
-
/// Shared entry point for both `startSpeech` overloads.
/// Stores the locale and optional speaker-verification config, rejects a
/// second start while a task is live, then requests authorization and, if
/// authorized, hands off to `setupAndStartRecognizing`.
/// Note: the stored `speakerVerificationStartConfig` is read later by the
/// recognition pipeline, not passed to `setupAndStartRecognizing` directly.
private func startSpeechInternal(localeStr: String?,
                                 speakerVerificationConfig: SpeakerVerificationStartConfig?) {
    NSLog("[STT] startSpeech(locale=\(localeStr ?? "nil"), sv=\(speakerVerificationConfig == nil ? "off" : "on"))")
    lastLocaleStr = localeStr ?? ""
    speakerVerificationStartConfig = speakerVerificationConfig
    // A live task means recognition is already running — report, don't restart.
    if recognitionTask != nil {
        sendResult(error: ["code": "already_started", "message": "Speech recognition already started!"],
                   bestTranscription: nil, transcriptions: nil, isFinal: nil)
        return
    }

    SFSpeechRecognizer.requestAuthorization { [weak self] status in
        guard let self = self else { return }
        switch status {
        case .notDetermined:
            self.sendResult(error: ["message": "Speech recognition not yet authorized"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
        case .denied:
            self.sendResult(error: ["message": "User denied access to speech recognition"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
        case .restricted:
            self.sendResult(error: ["message": "Speech recognition restricted on this device"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
        case .authorized:
            self.setupAndStartRecognizing(localeStr: localeStr)
        @unknown default:
            self.sendResult(error: ["message": "Unknown authorization status"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
        }
    }
}
|
|
929
|
-
|
|
930
|
-
/// Gracefully finishes the active recognition task (final results may still
/// be delivered by the task callback). The completion, if supplied, is
/// always invoked with `false`.
public func stopSpeech(_ completion: ((Bool) -> Void)? = nil) {
    NSLog("[STT] stopSpeech() requested by app")
    if let task = recognitionTask {
        task.finish()
    }
    completion?(false)
}
|
|
935
|
-
|
|
936
|
-
/// Cancels the active recognition task immediately (no final results).
/// The completion, if supplied, is always invoked with `false`.
public func cancelSpeech(_ completion: ((Bool) -> Void)? = nil) {
    NSLog("[STT] cancelSpeech() requested by app")

    if let task = recognitionTask {
        task.cancel()
    }
    completion?(false)
}
|
|
942
|
-
|
|
943
|
-
/// Tears down the whole recognition stack via `teardown()`.
/// The completion, if supplied, is always invoked with `false`.
public func destroySpeech(_ completion: ((Bool) -> Void)? = nil) {
    NSLog("[STT] **** destroySpeech!!!")
    teardown()
    completion?(false)
}
|
|
948
|
-
|
|
949
|
-
/// Re-applies input selection, category/options, and session activation for
/// the current route. Heavy work runs on a background queue; the route log
/// hops back to main so log ordering stays readable.
/// - Parameter selectBestInput: when true, prefers BT-HFP then wired mic as
///   `preferredInput`, otherwise clears the preference.
/// NOTE(review): `defer { isAdjustingRoute = false }` runs when this function
/// returns — i.e. *before* the async block finishes — so the re-entrancy
/// guard only covers the synchronous prologue. Confirm whether that is
/// intentional.
private func updateSessionRouting(selectBestInput: Bool = true) {
    NSLog("[STT] ⚠️ updateSessionRouting??? why???")

    if isAdjustingRoute { return }

    isAdjustingRoute = true
    defer { isAdjustingRoute = false }

    let s = AVAudioSession.sharedInstance()

    let hasInputRoute = s.isInputAvailable && !s.currentRoute.inputs.isEmpty
    if !hasInputRoute {
        // Transient during category/route settle — do NOT bail.
        NSLog("[STT] route: input not visible yet (transient) — proceeding to activate session")
    }

    DispatchQueue.global(qos: .userInitiated).async { [weak self] in
        guard let self = self else { return }

        // External outputs (headset/BT/car/etc.) mean we must NOT force
        // .defaultToSpeaker below.
        let hasWiredOrCar = s.currentRoute.outputs.contains {
            switch $0.portType {
            case .headphones,
                 .bluetoothA2DP,
                 .bluetoothHFP,
                 .bluetoothLE,
                 .airPlay,
                 .carAudio,
                 .usbAudio:
                return true
            default:
                return false
            }
        }
        if selectBestInput, let all = s.availableInputs {
            let btHFP = all.first { $0.portType == .bluetoothHFP || $0.portType == .bluetoothLE }
            let wired = all.first { $0.portType == .headsetMic }
            let built = all.first { $0.portType == .builtInMic }
            //let best = btHFP ?? wired ?? built
            // Prefer BT HFP (mic), then wired mic; otherwise leave preferredInput as-is.
            let desired = btHFP ?? wired

            do {
                if let desired, s.preferredInput?.uid != desired.uid {
                    try s.setPreferredInput(desired)
                } else if desired == nil {
                    // No headset mic → clear preference; do NOT force built-in
                    if s.preferredInput != nil { try s.setPreferredInput(nil) }
                }
                // If built-in is already what the system selected, we need no action.
                if let builtIn = built, (desired == nil), s.preferredInput?.uid == nil {
                    // Optionally hint bottom/back data source, but don’t fight routes
                    if let ds = builtIn.dataSources?.first(where: { $0.orientation == .bottom || $0.orientation == .back }) {
                        try? builtIn.setPreferredDataSource(ds)
                    }
                }
            } catch {
                NSLog("[STT] setPreferredInput failed: \(error.localizedDescription)")
            }
        }

        var opts: AVAudioSession.CategoryOptions = [.allowBluetooth]
        if !hasWiredOrCar { opts.insert(.defaultToSpeaker) }

        // Only touch the category when something actually differs — setCategory
        // can glitch the route.
        if s.category != .playAndRecord || s.mode != .default || s.categoryOptions != opts {
            do { try s.setCategory(.playAndRecord, mode: .default, options: opts) }
            catch { NSLog("[STT] setCategory failed: \(error.localizedDescription)") }
        }

        do {
            try s.setActive(true, options: [])
            self.markAECSessionActivation(true, reason: "updateSessionRouting")
        } catch {
            NSLog("[STT] setActive failed: \(error.localizedDescription)")
            self.markAECSessionActivation(false, reason: "updateSessionRouting-failed")
        }

        // Optional: force 16k after activation
        self.force16kIfPossible(s)
        self.forceSpeakerIfReceiver("updateSessionRouting")

        // Log route back on main so logs stay ordered
        DispatchQueue.main.async {
            let inPorts = s.currentRoute.inputs.map { "\($0.portType.rawValue):\($0.portName)" }.joined(separator:", ")
            let outPorts = s.currentRoute.outputs.map { "\($0.portType.rawValue):\($0.portName)" }.joined(separator:", ")
            NSLog("[STT] route in=[\(inPorts)] out=[\(outPorts)]")
        }
    }
}
|
|
1037
|
-
|
|
1038
|
-
// ↓↓↓ preferred settings helper
|
|
1039
|
-
/// Applies preferred sample-rate / channel hints to `session`.
/// 16 kHz is requested when `force16kMicSampleRate` is set, and again when the
/// route is fully built-in (mic + speaker). Input is hinted to mono; output
/// channel count is deliberately left alone (many routes require stereo).
/// All calls are best-effort (`try?`) — these are hints, not requirements.
private func force16kIfPossible(_ session: AVAudioSession) {
    if force16kMicSampleRate {
        try? session.setPreferredSampleRate(16_000)
    }

    // Fix: the former `hasExternalOutput` local was computed here and never
    // read — dead work plus an unused-variable warning; removed.

    // NOTE(review): `allSatisfy` is true for an *empty* outputs/inputs list,
    // so a route with no ports counts as "built-in" here — confirm intended.
    let builtInOut = session.currentRoute.outputs.allSatisfy { $0.portType == .builtInSpeaker }
    let builtInIn = session.currentRoute.inputs.allSatisfy { $0.portType == .builtInMic }

    // Prefer 16k only on built-in mic+speaker (voice pipeline). Otherwise leave SR to route.
    if builtInIn && builtInOut {
        try? session.setPreferredSampleRate(16_000)
        if session.isInputAvailable { try? session.setPreferredInputNumberOfChannels(1) }
        // ⚠️ Do NOT force output channels to 1; many routes require 2ch.
    } else {
        // Input mono is generally OK, but don’t touch output channels
        if session.isInputAvailable { try? session.setPreferredInputNumberOfChannels(1) }
    }

    // A small IO buffer is fine across routes
    try? session.setPreferredIOBufferDuration(0.02)
}
|
|
1069
|
-
|
|
1070
|
-
/// Records the session-activation state used by the AEC route-warmup window.
/// Writes are guarded by `aecSessionActivationLock`; readers run off-thread
/// (see `isInAECRouteWarmupWindow`).
private func markAECSessionActivation(_ active: Bool, reason: String) {
    let timestamp = CACurrentMediaTime()
    aecSessionActivationLock.lock()
    aecSessionIsActive = active
    lastAECSessionActivationAt = active ? timestamp : 0
    aecSessionActivationLock.unlock()
    NSLog("[STT] AEC session activation(\(reason)): active=\(active ? "YES" : "NO") t=\(String(format: "%.3f", timestamp))")
}
|
|
1079
|
-
|
|
1080
|
-
/// True while we are inside the post-activation warmup window during which
/// voice processing is forced on regardless of the current route.
/// Requires AEC to be enabled, the warmup feature on, and a recent
/// lock-protected activation timestamp.
private func isInAECRouteWarmupWindow() -> Bool {
    guard aecEnabled, forceAECDuringRouteWarmup, aecRouteWarmupSeconds > 0 else { return false }
    let now = CACurrentMediaTime()

    // Snapshot the shared state under the lock, then decide outside it.
    aecSessionActivationLock.lock()
    let snapshot = (active: aecSessionIsActive, activatedAt: lastAECSessionActivationAt)
    aecSessionActivationLock.unlock()

    guard snapshot.active, snapshot.activatedAt > 0 else { return false }
    return (now - snapshot.activatedAt) < aecRouteWarmupSeconds
}
|
|
1090
|
-
|
|
1091
|
-
// MARK: - Core logic (kept intact, including AEC order/steps)
|
|
1092
|
-
|
|
1093
|
-
/// Configures and activates the shared audio session for capture+playback.
/// Returns true if no errors occurred (identical flow & calls as ObjC) +
/// keep-alive opts. Sequence: deactivate → setCategory(.playAndRecord) →
/// deactivate again → 16k hints → activate → 16k hints again (some routes
/// settle only after `setActive`) → speaker override. On error an error
/// result is also emitted via `sendResult`.
private func setupAudioSession() -> Bool {
    var err: NSError?
    let session = AVAudioSession.sharedInstance()
    self.audioSession = session

    // Deactivate first so the category change below applies cleanly.
    do { try session.setActive(false, options: [.notifyOthersOnDeactivation]) }
    catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
    markAECSessionActivation(false, reason: "setupAudioSession-pre")

    // Build options to match our routing rules
    // (defaultToSpeaker only when no external output is active)
    let hasExternalOutput: Bool = session.currentRoute.outputs.contains {
        switch $0.portType {
        case .headphones, .bluetoothA2DP, .bluetoothHFP, .bluetoothLE, .airPlay, .carAudio, .usbAudio:
            return true
        default:
            return false
        }
    }

    var opts: AVAudioSession.CategoryOptions = [.allowBluetooth]
    if !hasExternalOutput { opts.insert(.defaultToSpeaker) }
    if #available(iOS 14.5, *) {
        // Prevent muted switch / mic mute from killing our capture pipeline
        opts.insert(.overrideMutedMicrophoneInterruption)
    }
    do {
        try session.setCategory(.playAndRecord, mode: .default, options: opts)
    } catch { err = error as NSError }

    do { try session.setActive(false, options: [.notifyOthersOnDeactivation]) }
    catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
    markAECSessionActivation(false, reason: "setupAudioSession-reconfigure")

    // Force 16k before and after activation (some routes settle only after setActive)
    force16kIfPossible(session)
    do {
        try session.setActive(true)
        markAECSessionActivation(true, reason: "setupAudioSession")
    } catch {
        err = error as NSError
        markAECSessionActivation(false, reason: "setupAudioSession-failed")
    }
    NSLog("[STT] session SR=%.1f inCh=%d outCh=%d (wanted 16000)",
          session.sampleRate,
          Int(session.inputNumberOfChannels),
          Int(session.outputNumberOfChannels))
    force16kIfPossible(session)
    forceSpeakerIfReceiver("setupAudioSession")

    // Report the first error captured anywhere above; later steps still ran.
    if let e = err {
        NSLog("[STT] setupAudioSession error: \(e.localizedDescription)")
        sendResult(error: ["code": "audio", "message": e.localizedDescription],
                   bestTranscription: nil, transcriptions: nil, isFinal: nil)
        return false
    }

    return true
}
|
|
1155
|
-
|
|
1156
|
-
/// Decides whether voice processing (AEC) should be on for the current route:
/// only when AEC is enabled AND we are either inside the post-activation
/// warmup window, or routed built-in speaker + built-in mic.
private func shouldUseVoiceProcessingForCurrentRoute() -> Bool {
    if !aecEnabled { return false }
    if isInAECRouteWarmupWindow() { return true }

    let session = AVAudioSession.sharedInstance()
    let route = session.currentRoute

    guard route.outputs.contains(where: { $0.portType == .builtInSpeaker }) else {
        return false
    }

    let preferredIsBuiltIn = session.preferredInput?.portType == .builtInMic
    let currentIsBuiltIn = route.inputs.first?.portType == .builtInMic
    return preferredIsBuiltIn || currentIsBuiltIn
}
|
|
1165
|
-
|
|
1166
|
-
/// On iOS 17+, configures voice-processing so other audio is ducked as
/// little as possible (minimum level, no advanced ducking). No-op earlier.
private func configureVoiceProcessingDucking(_ inputNode: AVAudioInputNode) {
    guard #available(iOS 17.0, *) else { return }
    var config = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
    config.enableAdvancedDucking = false
    config.duckingLevel = .min
    inputNode.voiceProcessingOtherAudioDuckingConfiguration = config
}
|
|
1174
|
-
|
|
1175
|
-
/// Brings the input node's voice-processing state in line with what the
/// current route wants. If toggling fails and `allowRebuild` is set (and no
/// suppressing state is active), escalates to a full graph rebuild.
/// - Parameters:
///   - engine: engine whose input node to reconcile; nil is a no-op.
///   - reason: log tag.
///   - allowRebuild: permit escalation to `rebuildEngineGraphAndRestart`.
private func reconcileAEC(on engine: AVAudioEngine?, reason: String, allowRebuild: Bool = true) {
    guard let engine = engine else { return }
    let inputNode = engine.inputNode
    let desiredVP = shouldUseVoiceProcessingForCurrentRoute()

    // Fast path: already in the desired state — just (re)apply ducking config.
    if #available(iOS 13.0, *) {
        let currentVP = inputNode.isVoiceProcessingEnabled
        if currentVP == desiredVP {
            if desiredVP { configureVoiceProcessingDucking(inputNode) }
            NSLog("[STT] AEC reconcile(\(reason)): unchanged vp=\(currentVP ? "ON" : "OFF")")
            return
        }
    }

    do {
        try inputNode.setVoiceProcessingEnabled(desiredVP)
        if desiredVP { configureVoiceProcessingDucking(inputNode) }
        NSLog("[STT] AEC reconcile(\(reason)): set vp=\(desiredVP ? "ON" : "OFF")")
    } catch {
        NSLog("[STT] AEC reconcile(\(reason)) failed: \(error.localizedDescription)")
        // Escalate only while recognition is genuinely live and not paused,
        // tearing down, or in/recovering from a phone call.
        if allowRebuild && sttActive && !isTearingDown && !micPaused &&
           !isTelephonyInterrupted && !isRecoveringAfterTelephony {
            rebuildEngineGraphAndRestart(reason: "aec-reconcile-\(reason)")
        }
    }
}
|
|
1201
|
-
|
|
1202
|
-
/// Schedules `attempts` delayed AEC reconcile passes at `stepSec` intervals
/// (rebuild escalation disabled). Each pass bails if the pipeline is tearing
/// down, mic-paused, or in a telephony interruption by the time it fires.
private func scheduleAECReconcileRetries(reason: String,
                                         attempts: Int = 3,
                                         stepSec: TimeInterval = 0.20) {
    guard attempts > 0 else { return }
    for attempt in 1...attempts {
        let delay = stepSec * Double(attempt)
        DispatchQueue.main.asyncAfter(deadline: .now() + delay) { [weak self] in
            guard let self else { return }
            guard !self.isTearingDown, !self.micPaused, !self.isTelephonyInterrupted else { return }
            self.reconcileAEC(on: self.audioEngine, reason: "\(reason)-retry\(attempt)", allowRebuild: false)
        }
    }
}
|
|
1214
|
-
|
|
1215
|
-
/// Returns the live input format from CoreAudio when it is valid; otherwise
/// falls back to a mono Float32 format derived from the session sample rate
/// (floored at 8 kHz). Avoids trusting any cached format.
private func currentInputFormat(_ engine: AVAudioEngine) -> AVAudioFormat? {
    let live = engine.inputNode.outputFormat(forBus: 0)
    if live.sampleRate > 0, live.channelCount > 0 {
        return live
    }

    // Fallback: synthesize a sane mono format from the session.
    let sampleRate = max(8000, AVAudioSession.sharedInstance().sampleRate)
    return AVAudioFormat(commonFormat: .pcmFormatFloat32,
                         sampleRate: sampleRate,
                         channels: 1,
                         interleaved: false)
}
|
|
1223
|
-
|
|
1224
|
-
/// True when the current route outputs through wired headphones or a
/// Bluetooth A2DP device.
private func isHeadsetPluggedIn() -> Bool {
    let outputs = AVAudioSession.sharedInstance().currentRoute.outputs
    return outputs.contains {
        $0.portType == .headphones || $0.portType == .bluetoothA2DP
    }
}
|
|
1233
|
-
|
|
1234
|
-
/// Rebuilds the audio graph after a phone call ends.
/// Strict ordering: verify capture → bump graph generation → (re)create
/// engine + observers → session/AEC → reset graph (muted mic-mixer keeps the
/// input path alive without monitoring) → start engine → install input tap.
/// Recognition itself is NOT started here — the tap kicks
/// `startRecognitionAfterCall()` once real buffers are observed.
private func recoverAfterTelephonyInterruption() {
    guard hasValidCaptureNow(allowColdEngine: true) else {
        NSLog("[STT] recoverAfterTelephonyInterruption: no capture yet; will rely on watchdog/next route change")
        return
    }

    // Invalidate any latches/taps keyed to the old graph generation.
    bumpGraphGen()
    NSLog("[STT] 🔄 recovering graph after telephony")

    if audioEngine == nil { audioEngine = AVAudioEngine() }
    AudioPlaybackHook.currentEngine = { [weak self] in self?.audioEngine } // ⬅️ add this
    guard let eng = audioEngine else { return }
    installEngineObservers()

    _ = setupAudioSession() // ✅ ensures correct mode/options
    forceSpeakerIfReceiver("recoverAfterTelephony") // ✅ receiver -> speaker now

    let inputNode = eng.inputNode
    reconcileAEC(on: eng, reason: "recover-after-telephony-prestart", allowRebuild: false)

    // Rebuild the minimal graph: input -> muted mixer -> mainMixer keeps the
    // input chain rendered without audible mic monitoring.
    eng.reset()
    let micMixer = AVAudioMixerNode()
    eng.attach(micMixer)
    eng.connect(inputNode, to: micMixer, format: nil)
    eng.connect(micMixer, to: eng.mainMixerNode, format: nil)
    micMixer.outputVolume = 0.0

    if playbackNode == nil { playbackNode = AVAudioPlayerNode() }
    if let p = playbackNode {
        if p.engine == nil { eng.attach(p) }
        eng.connect(p, to: eng.mainMixerNode, format: nil)
    }

    do {
        try eng.start()
        armFirstIOCycleLatch(on: eng)
        tryClearCaptureLossAfterStartSucceeded()
        reconcileAEC(on: eng, reason: "recover-after-telephony-poststart", allowRebuild: false)
        scheduleAECReconcileRetries(reason: "recover-after-telephony")
    } catch {
        NSLog("[STT] recover: engine.start failed → will let watchdog retry: \(error)")
        return
    }

    // IMPORTANT: install tap and start recognition only after we *see* buffers again
    safeRemoveTap(inputNode)
    let tapFmt = inputNode.outputFormat(forBus: 0)
    guard tapFmt.sampleRate > 0, tapFmt.channelCount > 0 else {
        NSLog("[STT] recover: invalid input format post-start (sr=%.1f ch=%d)", tapFmt.sampleRate, Int(tapFmt.channelCount))
        return
    }

    // Reset flow counters so the "first real audio" trigger below is accurate.
    lastBufferAt = 0
    tapFramesTotal = 0
    inputNode.installTap(onBus: 0, bufferSize: 1024, format: tapFmt) { [weak self] buffer, _ in
        guard let self = self else { return }
        self.recognitionRequest?.append(buffer)

        // mark that input is flowing again
        self.tapFramesTotal &+= UInt64(buffer.frameLength)
        self.lastBufferAt = CACurrentMediaTime()

        // Kick recognition exactly once, the first time we see real audio post-call
        if !self.startedRecognitionAfterCall && self.tapFramesTotal > 1024 {
            DispatchQueue.main.async {
                self.startRecognitionAfterCall()
            }
        }
    }
    // In recoverAfterTelephonyInterruption(), after engine/graph is rebuilt (near the end is fine):
    if self.sttActive && !self.micPaused {
        self.installPlaybackHooks()
    }

    NSLog("[STT] recovery: IO + tap ready; waiting for buffers to start recognition")
    // do NOT set isRecoveringAfterTelephony = false here — we clear it in startRecognitionAfterCall()
}
|
|
1311
|
-
|
|
1312
|
-
/// Loads the contextual-strings hint list from `words_flattened.txt` in the
/// main bundle. The file is a comma-separated list; each token is trimmed,
/// stripped of double quotes, and empty tokens are dropped. Returns [] when
/// the file is missing or unreadable.
private func loadContextualStrings() -> [String] {
    guard let filePath = Bundle.main.path(forResource: "words_flattened", ofType: "txt") else {
        NSLog("words_flattened.txt not found in bundle")
        return []
    }
    do {
        var contents = try String(contentsOfFile: filePath, encoding: .utf8)
        // Strip a UTF-8 BOM if present (it would pollute the first token).
        if contents.unicodeScalars.first == "\u{FEFF}" {
            contents.unicodeScalars.removeFirst()
        }
        return contents
            .components(separatedBy: ",")
            .map {
                $0.trimmingCharacters(in: .whitespacesAndNewlines)
                  .replacingOccurrences(of: "\"", with: "")
            }
            .filter { !$0.isEmpty }
    } catch {
        NSLog("Error reading contextualStrings: \(error)")
        return []
    }
}
|
|
1337
|
-
|
|
1338
|
-
/// Drops the cached audio-session reference when the session's category no
/// longer matches `priorAudioCategory`, so the next use reconfigures fresh.
/// Keeps the cached reference when the category is unchanged.
private func resetAudioSession() {
    if audioSession == nil {
        audioSession = AVAudioSession.sharedInstance()
    }
    guard let session = audioSession else { return }
    // Keep the cache only while the category matches what we recorded.
    guard priorAudioCategory != session.category else { return }
    audioSession = nil
}
|
|
1349
|
-
|
|
1350
|
-
|
|
1351
|
-
/// Builds a new buffer-based recognition request, wires in contextual
/// strings, enables partial results (and punctuation on iOS 16+), stores it
/// in `recognitionRequest` (so the input tap can append buffers), and
/// returns it.
private func makeFreshRequest() -> SFSpeechAudioBufferRecognitionRequest {
    let req = SFSpeechAudioBufferRecognitionRequest()
    if #available(iOS 16, *) { req.addsPunctuation = true }
    req.shouldReportPartialResults = true
    //if #available(iOS 13.0, *) { req.taskHint = .dictation }
    let cs: [String] = loadContextualStrings()
    req.contextualStrings = cs
    NSLog("[STT] makeFreshRequest contextualStrings count=\(cs.count) sample=\(cs.prefix(10)) file=\(Bundle.main.path(forResource: "words_flattened", ofType: "txt") ?? "nil")")

    // The input tap appends buffers to this property; publish before return.
    self.recognitionRequest = req
    NSLog("[STT] makeFreshRequest()")
    return req
}
|
|
1364
|
-
|
|
1365
|
-
/// Starts a recognition task for `req` on the shared `speechRecognizer`.
/// Bumps `activeTaskGen` so callbacks from any previously started task can be
/// detected and dropped, and records start timestamps used by the watchdog.
/// Suppressed entirely while speech-lite is paused.
private func startTask(_ req: SFSpeechAudioBufferRecognitionRequest) {
    if isSpeechRecognitionLitePaused() {
        NSLog("[STT] startTask suppressed (speechRecognitionPaused)")
        return
    }
    NSLog("starting recognitionTask")
    lastTaskStartAt = CACurrentMediaTime()
    lastResultAt = lastTaskStartAt
    // Bump generation and capture it for THIS task
    // (&+= wraps on overflow rather than trapping)
    activeTaskGen &+= 1
    let myGen = activeTaskGen

    // Capture the session id at start time; results arriving after the
    // session changed are discarded below.
    let taskSessionId = self.sessionId
    self.recognitionTask = self.speechRecognizer?.recognitionTask(with: req) { [weak self] result, error in
        guard let self = self else { return }

        // ❗️Drop callbacks from older tasks
        guard myGen == self.activeTaskGen else {
            NSLog("[STT] stale task callback (gen \(myGen) != \(self.activeTaskGen)) → ignore")
            return
        }

        // Second staleness guard: the surrounding session must still match.
        if taskSessionId != self.sessionId { NSLog("task session mismatch -> ignore"); return }
        self.lastResultAt = CACurrentMediaTime()

        // Emits "onSpeechStart" to JS exactly once, on the first result that
        // contains a non-empty transcription segment.
        func markIfReal(_ r: SFSpeechRecognitionResult?) {
            guard let r = r else { return }

            // ✅ Do NOT use formattedString here (it normalizes spacing/punctuation/number formatting).
            // Instead, treat "real speech" as "we have at least one non-empty segment substring".
            let hasReal = r.bestTranscription.segments.contains {
                !$0.substring.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
            }

            if hasReal && !self.seenRealSpeech {
                self.seenRealSpeech = true
                NSLog("first real speech detected -> onSpeechStart to JS")
                self.sendEvent(name: "onSpeechStart", body: nil)
            }
        }
        markIfReal(result)

        // NOTE(review): this local helper appears unused — the error/nil/final
        // paths below call the instance method `rearmTask(reason:)` instead.
        // Left in place to preserve behavior; confirm whether it can be removed.
        func rearm(_ why: String, delay: TimeInterval = 0.05) {
            guard self.continuous else { return }
            NSLog("REARM (\(why))")
            self.recognitionTask?.cancel()
            self.recognitionTask = nil
            DispatchQueue.main.asyncAfter(deadline: .now() + delay) {
                self.startTask(self.makeFreshRequest())
            }
        }

        if let error = error {
            NSLog("task error \(error._code): \(error.localizedDescription)")
            // treat as transient for continuous mode
            rearmTask(reason: "error")
            return
        }

        guard let result = result else {
            NSLog("task nil result")
            rearmTask(reason: "nil-result")
            return
        }

        // Forward every (partial or final) result to JS. Transcriptions are
        // rebuilt from raw segment substrings, not formattedString.
        let isFinal = result.isFinal
        let parts = result.transcriptions.map { $0.segments.map { $0.substring }.joined(separator: " ") }
        self.sendResult(error: nil,
                        bestTranscription: result.bestTranscription.segments.map { $0.substring }.joined(separator: " "),
                        transcriptions: parts,
                        isFinal: isFinal)

        if isFinal {
            NSLog("task final -> onSpeechEnd")
            self.sendEvent(name: "onSpeechEnd", body: nil)
            if self.continuous {
                // Continuous mode: spin up a new task after a final result.
                self.rearmTask(reason: "final")
            } else {
                NSLog("non-continuous final -> teardown")
                self.teardown()
            }
        }
    }
}
|
|
1450
|
-
|
|
1451
|
-
/// Fully dismantles the recognition pipeline: cancels the task, ends the
/// request, removes all taps, stops/releases the engine, clears playback
/// hooks, and resets every piece of speaker-verification and pre-roll state.
/// `isTearingDown` is held true for the duration so notification handlers
/// and watchdogs do not re-enter.
public func teardown() {
    bumpGraphGen()
    NSLog("[STT] teardown() begin")
    isTearingDown = true
    // ✅ HARD reset speech-lite pause state on teardown
    resetSpeechRecognitionLitePauseState("teardown")
    stopWatchdog()
    consecutiveStallCount = 0
    removeEngineObservers()
    if let task = recognitionTask {
        task.cancel()
        recognitionTask = nil
    }
    // Disconnect the TTS playback hook layer from this instance.
    AudioPlaybackHook.engineScheduleFile = nil
    AudioPlaybackHook.isEngineReady = nil
    AudioPlaybackHook.useOnlyEnginePlayback = nil
    AudioPlaybackHook.stopEnginePlayback = nil // ← NEW
    sttActive = false

    if let p = playbackNode {
        p.stop()
    }
    playbackNode = nil

    // Signal end-of-audio so the recognizer can finish cleanly.
    if let req = recognitionRequest {
        req.endAudio()
        recognitionRequest = nil
    }

    // Remove every tap this class may have installed before stopping the
    // engine; stopping with live taps can crash on some iOS versions.
    if let engine = audioEngine {
        safeRemoveTap(engine.outputNode, bus: 0) // <- clear IO latch if present
        safeRemoveTap(engine.mainMixerNode, bus: 0) // <- clear mixer probe if present
        if engine.inputNode != nil {
            safeRemoveTap(engine.inputNode, bus: 0)
            engine.inputNode.reset()
        }
        if engine.isRunning { engine.stop() }
        engine.reset()
        audioEngine = nil
        AudioPlaybackHook.currentEngine = nil // ⬅️ add this
    }
    // Clear the mixer-probe (waitForIOCycle) bookkeeping.
    mixerProbeActive = false
    mixerProbeCompletions.removeAll()
    // Reset all speaker-verification state to its initial values.
    speakerVerificationEngine = nil
    speakerVerificationFrameSize = 0
    speakerVerificationInputBuffer.removeAll(keepingCapacity: false)
    speakerVerificationThreshold = 0
    speakerVerificationFrameSeq = 0
    speakerVerificationSourceSampleRate = 0
    speakerVerificationTargetSampleRate = 0
    speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
    speakerVerificationResamplePos = 0
    speakerLastPositiveMatchAt = 0
    setSpeakerGateState(enabled: false, open: true)
    speakerVerificationErrorSent = false
    speakerPreRollBuffers.removeAll(keepingCapacity: false)
    speakerPreRollFrames = 0
    speakerPreRollMaxFrames = 0
    speakerPendingPreRollFlush = false
    lastRouteSignature = ""
    markAECSessionActivation(false, reason: "teardown")

    resetAudioSession()
    savedSessionBeforePause = nil

    sessionId = nil
    isTearingDown = false
}
|
|
1519
|
-
|
|
1520
|
-
/// Heuristic connectivity check: a player node counts as "connected" to the
/// given engine when it is attached to that exact engine instance and its
/// bus-0 output format carries a non-zero sample rate and channel count.
private func isPlayerConnected(_ player: AVAudioPlayerNode?, to engine: AVAudioEngine?) -> Bool {
    guard let node = player, let host = engine, node.engine === host else {
        return false
    }
    let format = node.outputFormat(forBus: 0)
    return format.channelCount > 0 && format.sampleRate > 0
}
|
|
1526
|
-
|
|
1527
|
-
/// Best-effort "make sure audio is flowing" entry point: restarts a stopped
/// engine (or rebuilds the graph when start fails / no engine exists) and
/// spins up a recognition task if none is active. Guarded against telephony
/// interruptions, speech-lite pause, mic pause, teardown, missing capture
/// hardware, and a reclaim cooldown (bypassable via `skipCooldown`).
private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
    if isTelephonyInterrupted || isRecoveringAfterTelephony {
        NSLog("[STT] ensureEngineRunning suppressed (telephony/recovering)")
        return
    }
    if isSpeechRecognitionLitePaused() {
        NSLog("[STT] ensureEngineRunning(\(reason)) suppressed (speechRecognitionPaused)")
        return
    }
    if micPaused {
        NSLog("[STT] ensureEngineRunning(\(reason)) suppressed (micPaused)")
        return
    }

    if isTearingDown { return }

    // No usable input route yet — flag it and wait for a route change.
    if !hasValidCaptureNow(allowColdEngine: true) {
        markCaptureLost()
        NSLog("[STT] ensureEngineRunning(\(reason)): capture not available; waiting")
        return
    }

    // Rate-limit reclaim attempts unless the caller explicitly opts out.
    let now = CFAbsoluteTimeGetCurrent()
    if !skipCooldown, (now - lastReclaimAttempt) < reclaimCooldown {
        NSLog("[STT] ensureEngineRunning(\(reason)) skipped (cooldown)")
        return
    }
    lastReclaimAttempt = now

    // (re)start engine
    if let eng = audioEngine {
        if !eng.isRunning {
            do {
                // Drop the player node: it belongs to the stalled graph and
                // will be recreated on demand after restart.
                playbackNode?.stop()
                playbackNode = nil
                try eng.start()
                armFirstIOCycleLatch(on: eng)
                tryClearCaptureLossAfterStartSucceeded()
                NSLog("🔄 AVAudioEngine restarted. running=\(eng.isRunning)")
            } catch {
                NSLog("❌ engine.start() failed: \(error) → rebuild")
                rebuildEngineGraphAndRestart(reason: reason)
                return
            }
        }
    } else {
        NSLog("[STT] ensureEngineRunning(\(reason)): no engine → rebuild")
        rebuildEngineGraphAndRestart(reason: reason)
        return
    }

    // ensure a task is running
    if recognitionTask == nil {
        if isSpeechRecognitionLitePaused() {
            NSLog("[STT] ensureEngineRunning(\(reason)): skip startTask (speechRecognitionPaused)")
        } else if let req = recognitionRequest {
            // Reuse the live request so buffered audio keeps flowing into it.
            startTask(req)
        } else {
            startTask(makeFreshRequest())
        }
    }
}
|
|
1589
|
-
/*
|
|
1590
|
-
private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
|
|
1591
|
-
if isTearingDown { return } // ← add
|
|
1592
|
-
|
|
1593
|
-
// If no mic, don’t touch the graph. Wait for route/interruption end.
|
|
1594
|
-
|
|
1595
|
-
if !hasValidCaptureNow() {
|
|
1596
|
-
markCaptureLost()
|
|
1597
|
-
NSLog("[STT] ensureEngineRunning(\(reason)): capture not available; waiting")
|
|
1598
|
-
return
|
|
1599
|
-
}
|
|
1600
|
-
|
|
1601
|
-
let now = CFAbsoluteTimeGetCurrent()
|
|
1602
|
-
if (now - lastReclaimAttempt) < reclaimCooldown {
|
|
1603
|
-
NSLog("[STT] ensureEngineRunning(\(reason)) skipped (cooldown)")
|
|
1604
|
-
return
|
|
1605
|
-
}
|
|
1606
|
-
lastReclaimAttempt = now
|
|
1607
|
-
|
|
1608
|
-
if (audioEngine != nil) && !audioEngine!.isRunning {
|
|
1609
|
-
do {
|
|
1610
|
-
playbackNode?.stop()
|
|
1611
|
-
playbackNode = nil
|
|
1612
|
-
// Possibly re-apply your format or re-install taps if the hardware changed sample rates
|
|
1613
|
-
try audioEngine!.start()
|
|
1614
|
-
armFirstIOCycleLatch(on: audioEngine!)
|
|
1615
|
-
tryClearCaptureLossAfterStartSucceeded()
|
|
1616
|
-
|
|
1617
|
-
print("🔄 AVAudioEngine restarted after config change. isRunning=%@",
|
|
1618
|
-
audioEngine!.isRunning ? "YES":"NO")
|
|
1619
|
-
} catch {
|
|
1620
|
-
print("❌ Could not re-start after config change: \(error)")
|
|
1621
|
-
}
|
|
1622
|
-
}
|
|
1623
|
-
|
|
1624
|
-
guard let engine = audioEngine else {
|
|
1625
|
-
NSLog("[STT] ensureEngineRunning(\(reason)): no engine → rebuild")
|
|
1626
|
-
rebuildEngineGraphAndRestart(reason: reason)
|
|
1627
|
-
return
|
|
1628
|
-
}
|
|
1629
|
-
|
|
1630
|
-
if !engine.isRunning {
|
|
1631
|
-
do {
|
|
1632
|
-
try engine.start()
|
|
1633
|
-
armFirstIOCycleLatch(on: engine)
|
|
1634
|
-
tryClearCaptureLossAfterStartSucceeded() // ← add this line
|
|
1635
|
-
|
|
1636
|
-
NSLog("[STT] ensureEngineRunning(\(reason)): engine.start() -> running=\(engine.isRunning)")
|
|
1637
|
-
} catch {
|
|
1638
|
-
NSLog("[STT] ensureEngineRunning(\(reason)): engine.start() failed: \(error) → rebuild")
|
|
1639
|
-
rebuildEngineGraphAndRestart(reason: reason)
|
|
1640
|
-
return
|
|
1641
|
-
}
|
|
1642
|
-
}
|
|
1643
|
-
|
|
1644
|
-
// If we have no active task, spin one up against the current request
|
|
1645
|
-
if recognitionTask == nil {
|
|
1646
|
-
if let req = recognitionRequest {
|
|
1647
|
-
NSLog("[STT] ensureEngineRunning(\(reason)): no task -> startTask(existing req)")
|
|
1648
|
-
startTask(req)
|
|
1649
|
-
} else {
|
|
1650
|
-
NSLog("[STT] ensureEngineRunning(\(reason)): no req -> makeFreshRequest + startTask")
|
|
1651
|
-
startTask(makeFreshRequest())
|
|
1652
|
-
}
|
|
1653
|
-
}
|
|
1654
|
-
}
|
|
1655
|
-
*/
|
|
1656
|
-
/// Rebuilds AVAudioEngine graph (mic→mute mixer, player→mainMixer), reinstalls tap,
/// and restarts the engine. Does NOT nuke the current recognitionRequest/task unless required.
///
/// Flow: tear down the old engine only (session + request survive), create a
/// fresh engine, rewire mic and TTS player, start it, then install the input
/// tap (formats are only valid after IO is running) and restore request/task.
/// Suppressed during telephony, speech-lite pause, or when no capture exists.
private func rebuildEngineGraphAndRestart(reason: String) {
    bumpGraphGen()
    NSLog("[STT] 🔄 rebuildEngineGraphAndRestart (\(reason))")
    if isTelephonyInterrupted { NSLog("[STT] rebuild suppressed during telephony"); return }
    if isSpeechRecognitionLitePaused() {
        NSLog("[STT] rebuild suppressed (speechRecognitionPaused)")
        return
    }

    guard hasValidCaptureNow() else {
        markCaptureLost()
        NSLog("[STT] rebuild: no valid input yet (skip)")
        return
    }

    // Keep current request if present; we'll keep appending into it
    let existingReq = self.recognitionRequest

    // Tear down engine ONLY (keep session, request)
    if let engine = audioEngine {
        if engine.inputNode != nil {
            safeRemoveTap(engine.inputNode, bus: 0)
            engine.inputNode.reset()
        }
        if engine.isRunning { engine.stop() }
        engine.reset()
    }

    // Recreate engine and graph
    let newEngine = AVAudioEngine()
    self.audioEngine = newEngine
    AudioPlaybackHook.currentEngine = { [weak self] in self?.audioEngine } // ⬅️ add this

    installEngineObservers() // <-- IMPORTANT: observers were bound to old engine object
    _ = setupAudioSession() // ✅ keep session policy consistent
    forceSpeakerIfReceiver("rebuild:\(reason)") // ✅ receiver -> speaker now

    let inputNode = newEngine.inputNode
    reconcileAEC(on: newEngine, reason: "rebuild-\(reason)-prestart", allowRebuild: false)

    // NOTE(review): `inFmt` is declared `var` but never mutated here — could
    // be `let`; left as-is to keep this rewrite comment-only.
    var inFmt = inputNode.outputFormat(forBus: 0)

    // mic → mute mixer → mainMixer
    // (the mute mixer lets the input feed the graph without being audible)
    let micMixer = AVAudioMixerNode()
    newEngine.attach(micMixer)
    newEngine.connect(inputNode, to: micMixer, format: inFmt) // live input format
    newEngine.connect(micMixer, to: newEngine.mainMixerNode, format: nil) // let mixer choose
    micMixer.outputVolume = 0.0

    // TTS player → (de-esser) → mainMixer
    if let existing = playbackNode, existing.engine !== newEngine {
        // Node is owned by a different engine instance; recreate for this graph.
        existing.stop()
        playbackNode = nil
    }
    if playbackNode == nil {
        playbackNode = AVAudioPlayerNode()
    }
    if let player = playbackNode {
        if player.engine == nil {
            newEngine.attach(player)
        }
        newEngine.connect(player, to: newEngine.mainMixerNode, format: nil)
    }

    // // --- Aggressive low-pass only ---
    // let deEss = AVAudioUnitEQ(numberOfBands: 1)
    // let lpf = deEss.bands[0]
    // lpf.filterType = .lowPass
    // lpf.frequency = 6500 // try 6000–7500
    // lpf.bandwidth = 0.35 // fairly steep
    // lpf.gain = 0.0
    // lpf.bypass = false

    // self.ttsEQ = deEss
    // newEngine.attach(deEss)

    // newEngine.disconnectNodeOutput(player)
    // newEngine.connect(player, to: deEss, format: nil)
    // newEngine.connect(deEss, to: newEngine.mainMixerNode, format: nil)
    // }

    // Tap uses nil to follow the node’s current output format
    newEngine.prepare()
    do {
        try newEngine.start()
        armFirstIOCycleLatch(on: newEngine)
        tryClearCaptureLossAfterStartSucceeded()
        reconcileAEC(on: newEngine, reason: "rebuild-\(reason)-poststart", allowRebuild: false)
        scheduleAECReconcileRetries(reason: "rebuild-\(reason)")
        NSLog("[STT] rebuild: engine.start() ok, running=\(newEngine.isRunning)")
    } catch {
        markCaptureLost()
        NSLog("[STT] rebuild: engine.start() failed: \(error)")
    }


    // 2) NOW that IO is running, install the tap (format will be valid)
    safeRemoveTap(inputNode)
    let tapFmt = inputNode.outputFormat(forBus: 0)
    guard tapFmt.sampleRate > 0, tapFmt.channelCount > 0 else {
        markCaptureLost()
        NSLog("[STT] rebuild: invalid input format after start (sr=%.1f ch=%d)",
              tapFmt.sampleRate, Int(tapFmt.channelCount))
        return
    }

    inputNode.installTap(onBus: 0, bufferSize: 1024, format: tapFmt) { [weak self] buffer, _ in
        guard let self = self else { return }

        // 👇 EXACT same logic as in setupAndStartRecognizing
        // Heartbeat log roughly every ~2s worth of 44.1kHz frames.
        self.tapFramesTotal &+= UInt64(buffer.frameLength)
        if self.tapFramesTotal % (44100 * 2) < 1024 {
            NSLog("[STT] tap alive, totalFrames=\(self.tapFramesTotal)")
        }

        let frames: vDSP_Length = vDSP_Length(buffer.frameLength)
        let LEVEL_LOWPASS_TRIG: Float = 0.5

        // CH0 — peak level in dB, exponentially smoothed.
        if buffer.format.channelCount > 0, let ch0 = buffer.floatChannelData?[0] {
            var peak0: Float = 0
            vDSP_maxmgv(ch0, 1, &peak0, frames)
            let db0: Float = (peak0 == 0) ? -100 : 20.0 * log10f(peak0)

            let smoothed0 = LEVEL_LOWPASS_TRIG * db0
                + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel0
            self.averagePowerForChannel0 = smoothed0
            self.averagePowerForChannel1 = smoothed0
        }

        // CH1 — only when a second channel exists; overrides the CH0 copy.
        if buffer.format.channelCount > 1, let ch1 = buffer.floatChannelData?[1] {
            var peak1: Float = 0
            vDSP_maxmgv(ch1, 1, &peak1, frames)
            let db1: Float = (peak1 == 0) ? -100 : 20.0 * log10f(peak1)

            let smoothed1 = LEVEL_LOWPASS_TRIG * db1
                + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel1
            self.averagePowerForChannel1 = smoothed1
        }

        // Normalize 0–10 and emit
        self.averagePowerForChannel1 = Float(
            self._normalizedPowerLevelFromDecibels(CGFloat(self.averagePowerForChannel1)) * 10.0
        )
        let value = self.averagePowerForChannel1
        self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": value])

        // Append to recognition
        self.recognitionRequest?.append(buffer)

        // mark that input is flowing again
        self.lastBufferAt = CACurrentMediaTime()
    }

    // If we lost the request during rebuild, recreate + start task.
    if self.recognitionRequest == nil {
        if let old = existingReq {
            self.recognitionRequest = old
        } else {
            self.recognitionRequest = makeFreshRequest()
        }
    }
    if self.recognitionTask == nil {
        if isSpeechRecognitionLitePaused() {
            NSLog("[STT] rebuild: skip startTask (speechRecognitionPaused)")
        } else {
            // Safe: the branch above guarantees recognitionRequest is non-nil.
            startTask(self.recognitionRequest!)
        }
    }
    if self.sttActive && !self.micPaused {
        self.installPlaybackHooks()
    }
}
|
|
1833
|
-
|
|
1834
|
-
/// Responds to `AVAudioEngineConfigurationChange` (hardware format/route
/// reconfiguration) by dropping the stale player node and re-ensuring the
/// engine + AEC state. Ignored while tearing down, while speech-lite is
/// paused, or while the mic is paused.
@objc private func handleEngineConfigChange(_ note: Notification) {
    if isTearingDown { return }
    if isSpeechRecognitionLitePaused() {
        NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange (ignored: speechRecognitionPaused)")
        return
    }
    if micPaused {
        NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange (ignored: micPaused)")
        return
    }

    NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange: ensuring engine running")
    // Fix: bind the optional instead of `audioEngine != nil` + `audioEngine!`;
    // the force-unwrap could trap if teardown nils the engine between checks.
    if let engine = audioEngine, !engine.isRunning {
        // The player node belongs to the stalled graph; drop it so the
        // restart path can recreate/reattach it cleanly.
        playbackNode?.stop()
        playbackNode = nil
    }
    ensureEngineRunning(reason: "engine-config-change")
    reconcileAEC(on: audioEngine, reason: "engine-config-change")
    scheduleAECReconcileRetries(reason: "engine-config-change")
}
|
|
1854
|
-
|
|
1855
|
-
/// Handles `mediaServicesWereResetNotification`: after the media daemon
/// restarts, all audio objects are invalid, so re-apply the session policy
/// and bring the engine/AEC back up. Ignored while tearing down, while
/// speech-lite is paused, or while the mic is paused.
@objc private func handleMediaServicesReset(_ note: Notification) {
    guard !isTearingDown else { return }
    guard !isSpeechRecognitionLitePaused() else {
        NSLog("[STT] 📺 Media services RESET (ignored: speechRecognitionPaused)")
        return
    }
    guard !micPaused else {
        NSLog("[STT] 📺 Media services RESET (ignored: micPaused)")
        return
    }

    NSLog("[STT] 📺 Media services were RESET: reclaiming mic & session")
    // Re-apply audio session and try to rebuild graph if needed
    bumpGraphGen()
    _ = setupAudioSession()
    ensureEngineRunning(reason: "media-services-reset")
    reconcileAEC(on: audioEngine, reason: "media-services-reset")
    scheduleAECReconcileRetries(reason: "media-services-reset")
}
|
|
1874
|
-
|
|
1875
|
-
/* NOTE(review): superseded handleRouteChange implementation, kept commented out for reference — unclear why so many changes were needed; consider deleting once the active handler below is proven stable:
|
|
1876
|
-
@objc private func handleRouteChange(_ note: Notification) {
|
|
1877
|
-
if isTearingDown { return } // ← add
|
|
1878
|
-
|
|
1879
|
-
let info = note.userInfo ?? [:]
|
|
1880
|
-
NSLog("[STT] 🔀 route change: \(info)")
|
|
1881
|
-
updateSessionRouting(selectBestInput: true) // ← add this
|
|
1882
|
-
|
|
1883
|
-
guard let reasonVal = info[AVAudioSessionRouteChangeReasonKey] as? UInt,
|
|
1884
|
-
let reason = AVAudioSession.RouteChangeReason(rawValue: reasonVal) else {
|
|
1885
|
-
ensureEngineRunning(reason: "route-change-unknown")
|
|
1886
|
-
return
|
|
1887
|
-
}
|
|
1888
|
-
// Ignore route-change spam caused by our own adjustments
|
|
1889
|
-
if isAdjustingRoute {
|
|
1890
|
-
NSLog("[STT] route change (self-induced) → ignore")
|
|
1891
|
-
return
|
|
1892
|
-
}
|
|
1893
|
-
|
|
1894
|
-
// Only rebalance on real hardware changes; avoid .categoryChange / .override
|
|
1895
|
-
switch reason {
|
|
1896
|
-
case .newDeviceAvailable, .oldDeviceUnavailable, .routeConfigurationChange:
|
|
1897
|
-
let now = CFAbsoluteTimeGetCurrent()
|
|
1898
|
-
if now - lastRouteTune > routeTuneCooldown {
|
|
1899
|
-
lastRouteTune = now
|
|
1900
|
-
DispatchQueue.global(qos: .userInitiated).async { [weak self] in
|
|
1901
|
-
self?.updateSessionRouting(selectBestInput: true)
|
|
1902
|
-
}
|
|
1903
|
-
}
|
|
1904
|
-
default:
|
|
1905
|
-
break
|
|
1906
|
-
}
|
|
1907
|
-
ensureEngineRunning(reason: "route-change-\(reason.rawValue)")
|
|
1908
|
-
}
|
|
1909
|
-
*/
|
|
1910
|
-
/// Handles `AVAudioSession.routeChangeNotification`. Filters out changes that
/// arrive while inactive/paused/interrupted, deduplicates by a route
/// signature built from the current input/output port types, and only
/// re-selects inputs for concrete hardware events. Always finishes by
/// re-forcing the speaker, reconciling AEC, and re-ensuring the engine.
@objc private func handleRouteChange(_ note: Notification) {
    if isTearingDown { return }
    if !sttActive {
        NSLog("[STT] 🔀 route change (ignored: sttInactive) \(note.userInfo ?? [:])")
        return
    }
    if isSpeechRecognitionLitePaused() {
        NSLog("[STT] 🔀 route change (ignored: speechRecognitionPaused) \(note.userInfo ?? [:])")
        return
    }
    if micPaused {
        NSLog("[STT] 🔀 route change (ignored: micPaused) \(note.userInfo ?? [:])")
        return
    }

    let info = note.userInfo ?? [:]
    NSLog("[STT] 🔀 route change: \(info)")
    if isTelephonyInterrupted || isRecoveringAfterTelephony {
        NSLog("[STT] 🔀 route change (ignored during telephony/recovering): \(info)")
        return
    }

    // Deduplicate: build a signature from the current port types and bail
    // if it matches the last one we handled (iOS can fire duplicates).
    let session = AVAudioSession.sharedInstance()
    let outSig = session.currentRoute.outputs.map { $0.portType.rawValue }.joined(separator: ",")
    let inSig = session.currentRoute.inputs.map { $0.portType.rawValue }.joined(separator: ",")
    let routeSig = "outs=\(outSig)|ins=\(inSig)"
    if routeSig == lastRouteSignature {
        NSLog("[STT] 🔀 route change ignored (same route signature)")
        return
    }
    lastRouteSignature = routeSig

    if let reasonVal = info[AVAudioSessionRouteChangeReasonKey] as? UInt,
       let reason = AVAudioSession.RouteChangeReason(rawValue: reasonVal) {
        switch reason {
        // Match AVAudioWrapper behavior: handle concrete hardware events + route config changes.
        case .newDeviceAvailable, .oldDeviceUnavailable, .routeConfigurationChange:
            updateSessionRouting(selectBestInput: true)
        default:
            NSLog("[STT] 🔀 route change reason=\(reason.rawValue) -> skip updateSessionRouting")
        }
    } else {
        NSLog("[STT] 🔀 route change reason missing -> skip updateSessionRouting")
    }

    forceSpeakerIfReceiver("routeChange")
    reconcileAEC(on: audioEngine, reason: "route-change", allowRebuild: false)
    scheduleAECReconcileRetries(reason: "route-change")

    // Bypass the reclaim cooldown: a real route change must act immediately.
    ensureEngineRunning(reason: "route-change", skipCooldown: true)
}
|
|
1961
|
-
|
|
1962
|
-
/// Waits until the engine's render loop has produced at least one buffer, by
/// installing a throwaway tap on the main mixer. Calls `done(true)` on the
/// first buffer, or `done(false)` after `timeout`. Concurrent callers while a
/// probe is active are queued onto `mixerProbeCompletions` and completed
/// together. All completions fire on the main queue; a `graphGen` snapshot
/// aborts every stage if the engine graph is rebuilt mid-probe.
private func waitForIOCycle(_ engine: AVAudioEngine,
                            timeout: TimeInterval = 0.7,
                            done: @escaping (Bool) -> Void) {
    // Snapshot the graph generation; any later mismatch means the graph was
    // torn down/rebuilt and this probe must silently abort.
    let gen = graphGen
    ttsSerial.async { [weak self, weak engine] in
        guard let self = self, let eng = engine, gen == self.graphGen else { return }

        // A probe is already in flight: piggyback on its result.
        if self.mixerProbeActive {
            self.mixerProbeCompletions.append(done)
            return
        }
        self.mixerProbeActive = true
        self.mixerProbeCompletions = [done]

        DispatchQueue.main.async { [weak self, weak eng] in
            guard let self = self, let eng = eng, gen == self.graphGen else { return }
            let mixer = eng.mainMixerNode
            // `fired` guards against the tap and the timeout both resolving.
            var fired = false
            self.safeRemoveTap(mixer, bus: 0)

            mixer.installTap(onBus: 0, bufferSize: 128, format: nil) { [weak self, weak mixer] _, _ in
                guard let self = self, gen == self.graphGen else { return }
                if fired { return }
                fired = true
                self.safeRemoveTap(mixer, bus: 0)

                // Drain completions on the serial queue, then deliver
                // success on main.
                self.ttsSerial.async { [weak self] in
                    guard let self = self else { return }
                    let completions = self.mixerProbeCompletions
                    self.mixerProbeActive = false
                    self.mixerProbeCompletions.removeAll()
                    DispatchQueue.main.async { if gen == self.graphGen { completions.forEach { $0(true) } } }
                }
            }

            // Timeout path: same drain, but deliver failure.
            DispatchQueue.main.asyncAfter(deadline: .now() + timeout) { [weak self, weak mixer] in
                guard let self = self, gen == self.graphGen else { return }
                if fired { return }
                self.safeRemoveTap(mixer, bus: 0)
                self.ttsSerial.async { [weak self] in
                    guard let self = self else { return }
                    let completions = self.mixerProbeCompletions
                    self.mixerProbeActive = false
                    self.mixerProbeCompletions.removeAll()
                    DispatchQueue.main.async { if gen == self.graphGen { completions.forEach { $0(false) } } }
                }
            }
        }
    }
}
|
|
2012
|
-
|
|
2013
|
-
// Call once after the engine is created (or from setupAudioSession).
/// Re-registers every NotificationCenter observer this recognizer needs.
/// Clears previous registrations first so nothing is observed twice, and
/// binds the configuration-change observer to the *current* engine instance.
private func installEngineObservers() {
    removeEngineObservers()

    let nc = NotificationCenter.default

    // Engine configuration changes are engine-object-specific.
    if let engine = audioEngine {
        nc.addObserver(self,
                       selector: #selector(handleEngineConfigChange(_:)),
                       name: .AVAudioEngineConfigurationChange,
                       object: engine)
    }

    // Session-level notifications, registered in a fixed order:
    // interruption → route change → media-services reset.
    let session = AVAudioSession.sharedInstance()
    let registrations: [(Selector, Notification.Name, Any?)] = [
        (#selector(handleSessionInterruption(_:)), AVAudioSession.interruptionNotification, session),
        (#selector(handleRouteChange(_:)), AVAudioSession.routeChangeNotification, session),
        (#selector(handleMediaServicesReset(_:)), AVAudioSession.mediaServicesWereResetNotification, nil),
    ]
    for (selector, name, object) in registrations {
        nc.addObserver(self, selector: selector, name: name, object: object)
    }
}
|
|
2042
|
-
|
|
2043
|
-
/// Handles `AVAudioSession.interruptionNotification` (e.g. phone calls).
/// `.began`: stops IO, cancels the task and request (tasks rarely survive a
/// hard break). `.ended`: reactivates the session and, after a short settle
/// delay, re-applies the session policy and runs the recovery path.
@objc private func handleSessionInterruption(_ note: Notification) {
    guard let info = note.userInfo,
          let typeRaw = info[AVAudioSessionInterruptionTypeKey] as? UInt,
          let type = AVAudioSession.InterruptionType(rawValue: typeRaw) else { return }

    switch type {
    case .began:
        resetSpeechRecognitionLitePauseState("telephony-began")

        NSLog("[STT] 📞 Interruption began")
        isTelephonyInterrupted = true
        isRecoveringAfterTelephony = false
        markCaptureLost()

        // Stop IO safely
        if let eng = audioEngine {
            safeRemoveTap(eng.inputNode); safeRemoveTap(eng.mainMixerNode); safeRemoveTap(eng.outputNode)
            if eng.isRunning { eng.stop() }
            eng.reset()
        }

        // Cancel the task and nil out the request — iOS STT tasks rarely recover after a hard break
        recognitionTask?.cancel()
        recognitionTask = nil
        recognitionRequest = nil

    case .ended:
        isTelephonyInterrupted = false
        NSLog("[STT] ✅ Interruption ended")

        // Keep the system from thrashing us during recovery
        isRecoveringAfterTelephony = true
        stopWatchdog() // <- don't let the watchdog rearm-loop during recovery
        startedRecognitionAfterCall = false
        lastBufferAt = 0
        tapFramesTotal = 0

        // Re-activate the session (safe if already active)
        do {
            try AVAudioSession.sharedInstance().setActive(true, options: [])
            markAECSessionActivation(true, reason: "interruption-ended")
        } catch {
            markAECSessionActivation(false, reason: "interruption-ended-failed")
        }

        // Give routes/formats a moment to settle *before* we rebuild
        // NOTE(review): this closure captures `self` strongly for 0.5s —
        // presumably acceptable for a long-lived bridge object; confirm.
        DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) {
            _ = self.setupAudioSession() // ✅ ensures defaultToSpeaker + mode
            self.forceSpeakerIfReceiver("telephonyEnded")// ✅ if iOS still stuck on receiver
            self.recoverAfterTelephonyInterruption()
        }

    default: break
    }
}
|
|
2098
|
-
|
|
2099
|
-
private func installPlaybackHooks() {
|
|
2100
|
-
// Expose current engine to the hook layer (safe to overwrite each time)
|
|
2101
|
-
AudioPlaybackHook.currentEngine = { [weak self] in self?.audioEngine }
|
|
2102
|
-
|
|
2103
|
-
// Engine readiness
|
|
2104
|
-
AudioPlaybackHook.isEngineReady = { [weak self] in
|
|
2105
|
-
guard let eng = self?.audioEngine else { return false }
|
|
2106
|
-
return eng.isRunning
|
|
2107
|
-
}
|
|
2108
|
-
|
|
2109
|
-
// Tell TTS layer: do NOT use AVAudioPlayer fallback while STT is active
|
|
2110
|
-
AudioPlaybackHook.useOnlyEnginePlayback = { [weak self] in
|
|
2111
|
-
guard let self = self else { return false }
|
|
2112
|
-
return self.sttActive && !self.micPaused
|
|
2113
|
-
}
|
|
2114
|
-
|
|
2115
|
-
// Schedule & play a file through the engine-owned AVAudioPlayerNode
|
|
2116
|
-
AudioPlaybackHook.engineScheduleFile = { [weak self] url, done in
|
|
2117
|
-
guard let self = self else { return false }
|
|
2118
|
-
|
|
2119
|
-
self.ttsSerial.async { [weak self] in
|
|
2120
|
-
guard let self = self else { return }
|
|
2121
|
-
|
|
2122
|
-
DispatchQueue.main.async {
|
|
2123
|
-
guard !self.isTearingDown,
|
|
2124
|
-
let engine = self.audioEngine else { return }
|
|
2125
|
-
|
|
2126
|
-
// If player belongs to a different engine (or got detached), recreate it
|
|
2127
|
-
if self.playbackNode?.engine !== engine || !self.isPlayerConnected(self.playbackNode, to: engine) {
|
|
2128
|
-
self.playbackNode?.stop()
|
|
2129
|
-
self.playbackNode = nil
|
|
2130
|
-
}
|
|
2131
|
-
|
|
2132
|
-
// Ensure engine is running
|
|
2133
|
-
if !engine.isRunning {
|
|
2134
|
-
do {
|
|
2135
|
-
try engine.start()
|
|
2136
|
-
self.armFirstIOCycleLatch(on: engine)
|
|
2137
|
-
} catch {
|
|
2138
|
-
NSLog("[STT] TTS: engine.start() failed: \(error)")
|
|
2139
|
-
return
|
|
2140
|
-
}
|
|
2141
|
-
}
|
|
2142
|
-
|
|
2143
|
-
let mixer = engine.mainMixerNode
|
|
2144
|
-
mixer.auAudioUnit.inputBusses[0].isEnabled = true
|
|
2145
|
-
|
|
2146
|
-
let player = self.ensurePlaybackNode(in: engine)
|
|
2147
|
-
|
|
2148
|
-
// Prime a silent buffer on a freshly attached player (stabilizes first play)
|
|
2149
|
-
if player.lastRenderTime == nil {
|
|
2150
|
-
let fmt = mixer.outputFormat(forBus: 0)
|
|
2151
|
-
if let prime = AVAudioPCMBuffer(pcmFormat: fmt, frameCapacity: 128) {
|
|
2152
|
-
prime.frameLength = 128
|
|
2153
|
-
if let ch = prime.floatChannelData {
|
|
2154
|
-
memset(ch[0], 0, Int(prime.frameLength) * MemoryLayout<Float>.size)
|
|
2155
|
-
if fmt.channelCount > 1 {
|
|
2156
|
-
memset(ch[1], 0, Int(prime.frameLength) * MemoryLayout<Float>.size)
|
|
2157
|
-
}
|
|
2158
|
-
}
|
|
2159
|
-
player.scheduleBuffer(prime, completionHandler: nil)
|
|
2160
|
-
}
|
|
2161
|
-
}
|
|
2162
|
-
|
|
2163
|
-
do {
|
|
2164
|
-
let file = try AVAudioFile(forReading: url)
|
|
2165
|
-
player.scheduleFile(file, at: nil) {
|
|
2166
|
-
DispatchQueue.main.async { done() }
|
|
2167
|
-
}
|
|
2168
|
-
} catch {
|
|
2169
|
-
NSLog("[STT] TTS schedule error: \(error)")
|
|
2170
|
-
return
|
|
2171
|
-
}
|
|
2172
|
-
|
|
2173
|
-
player.volume = 0.5
|
|
2174
|
-
|
|
2175
|
-
// Gate play on "engine has rendered at least one IO cycle"
|
|
2176
|
-
let startPlay = {
|
|
2177
|
-
if !player.isPlaying { player.play() }
|
|
2178
|
-
}
|
|
2179
|
-
|
|
2180
|
-
if self.engineHasRenderedOnce {
|
|
2181
|
-
startPlay()
|
|
2182
|
-
} else {
|
|
2183
|
-
func tryStart(after ms: Int = 0) {
|
|
2184
|
-
DispatchQueue.main.asyncAfter(deadline: .now() + .milliseconds(ms)) {
|
|
2185
|
-
if self.engineHasRenderedOnce {
|
|
2186
|
-
startPlay()
|
|
2187
|
-
} else if ms < 1500 {
|
|
2188
|
-
tryStart(after: ms + 100)
|
|
2189
|
-
} else {
|
|
2190
|
-
NSLog("[STT] TTS: no IO cycle observed; skipping play to avoid crash")
|
|
2191
|
-
}
|
|
2192
|
-
}
|
|
2193
|
-
}
|
|
2194
|
-
tryStart()
|
|
2195
|
-
}
|
|
2196
|
-
}
|
|
2197
|
-
}
|
|
2198
|
-
|
|
2199
|
-
return true
|
|
2200
|
-
}
|
|
2201
|
-
|
|
2202
|
-
// Stop only the engine playback node (keep STT engine running)
|
|
2203
|
-
AudioPlaybackHook.stopEnginePlayback = { [weak self] in
|
|
2204
|
-
DispatchQueue.main.async {
|
|
2205
|
-
self?.playbackNode?.stop()
|
|
2206
|
-
}
|
|
2207
|
-
}
|
|
2208
|
-
}
|
|
2209
|
-
|
|
2210
|
-
private func setupAndStartRecognizing(localeStr: String?) {
|
|
2211
|
-
NSLog("[STT] setupAndStartRecognizing begin")
|
|
2212
|
-
sttActive = true
|
|
2213
|
-
// ✅ HARD reset speech-lite pause state on start/reinit
|
|
2214
|
-
resetSpeechRecognitionLitePauseState("setupAndStartRecognizing")
|
|
2215
|
-
|
|
2216
|
-
// --- HARD RESET OF STATE (first-run safety) ---
|
|
2217
|
-
isTearingDown = false
|
|
2218
|
-
isTelephonyInterrupted = false
|
|
2219
|
-
isRecoveringAfterTelephony = false
|
|
2220
|
-
|
|
2221
|
-
engineHasRenderedOnce = false
|
|
2222
|
-
ioLatchActiveGen = 0
|
|
2223
|
-
graphGen = 0
|
|
2224
|
-
|
|
2225
|
-
seenRealSpeech = false
|
|
2226
|
-
engineHotAt = 0
|
|
2227
|
-
lastBufferAt = 0
|
|
2228
|
-
lastResultAt = 0
|
|
2229
|
-
lastTaskStartAt = 0
|
|
2230
|
-
consecutiveStallCount = 0
|
|
2231
|
-
consecutiveNoInputResets = 0
|
|
2232
|
-
lastNoInputRecoveryAt = 0
|
|
2233
|
-
lastRearmAt = 0
|
|
2234
|
-
lastReclaimAttempt = 0
|
|
2235
|
-
tapFramesTotal = 0
|
|
2236
|
-
lastTapFramesSeen = 0
|
|
2237
|
-
pausedForCaptureLoss = false
|
|
2238
|
-
mixerProbeActive = false
|
|
2239
|
-
mixerProbeCompletions.removeAll()
|
|
2240
|
-
speakerVerificationEngine = nil
|
|
2241
|
-
speakerVerificationFrameSize = 0
|
|
2242
|
-
speakerVerificationInputBuffer.removeAll(keepingCapacity: false)
|
|
2243
|
-
speakerVerificationThreshold = 0
|
|
2244
|
-
speakerVerificationFrameSeq = 0
|
|
2245
|
-
speakerVerificationSourceSampleRate = 0
|
|
2246
|
-
speakerVerificationTargetSampleRate = 0
|
|
2247
|
-
speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
|
|
2248
|
-
speakerVerificationResamplePos = 0
|
|
2249
|
-
speakerLastPositiveMatchAt = 0
|
|
2250
|
-
setSpeakerGateState(enabled: false, open: true)
|
|
2251
|
-
speakerVerificationErrorSent = false
|
|
2252
|
-
speakerPreRollBuffers.removeAll(keepingCapacity: false)
|
|
2253
|
-
speakerPreRollFrames = 0
|
|
2254
|
-
speakerPreRollMaxFrames = 0
|
|
2255
|
-
speakerPendingPreRollFlush = false
|
|
2256
|
-
|
|
2257
|
-
audioSession = AVAudioSession.sharedInstance()
|
|
2258
|
-
guard let session = audioSession else { return }
|
|
2259
|
-
var err: NSError?
|
|
2260
|
-
|
|
2261
|
-
priorAudioCategory = session.category
|
|
2262
|
-
|
|
2263
|
-
// Tear down resources before starting speech recognition..
|
|
2264
|
-
NSLog("[STT] pre-teardown")
|
|
2265
|
-
teardown()
|
|
2266
|
-
// ** IMPORTANT ** Call this again as teardown marks this false
|
|
2267
|
-
sttActive = true
|
|
2268
|
-
|
|
2269
|
-
sessionId = UUID().uuidString
|
|
2270
|
-
|
|
2271
|
-
let locale: Locale? = {
|
|
2272
|
-
if let s = localeStr, !s.isEmpty { return Locale(identifier: s) }
|
|
2273
|
-
sttActive = false
|
|
2274
|
-
return nil
|
|
2275
|
-
}()
|
|
2276
|
-
|
|
2277
|
-
if let loc = locale {
|
|
2278
|
-
speechRecognizer = SFSpeechRecognizer(locale: loc)
|
|
2279
|
-
} else {
|
|
2280
|
-
speechRecognizer = SFSpeechRecognizer()
|
|
2281
|
-
}
|
|
2282
|
-
speechRecognizer?.delegate = self
|
|
2283
|
-
|
|
2284
|
-
// Start audio session...
|
|
2285
|
-
NSLog("[STT] setupAudioSession()")
|
|
2286
|
-
guard setupAudioSession() else {
|
|
2287
|
-
NSLog("[STT] ERROR ERROR ******** setupAudioSession()")
|
|
2288
|
-
teardown()
|
|
2289
|
-
sttActive = false
|
|
2290
|
-
return
|
|
2291
|
-
}
|
|
2292
|
-
installEngineObservers()
|
|
2293
|
-
|
|
2294
|
-
let request = SFSpeechAudioBufferRecognitionRequest()
|
|
2295
|
-
recognitionRequest = request
|
|
2296
|
-
|
|
2297
|
-
if #available(iOS 16, *) {
|
|
2298
|
-
request.addsPunctuation = true
|
|
2299
|
-
} else {
|
|
2300
|
-
// Fallback on earlier versions
|
|
2301
|
-
}
|
|
2302
|
-
request.shouldReportPartialResults = true
|
|
2303
|
-
//if #available(iOS 13.0, *) { request.taskHint = .dictation }
|
|
2304
|
-
let cs: [String] = loadContextualStrings()
|
|
2305
|
-
request.contextualStrings = cs
|
|
2306
|
-
NSLog("[STT] makeFreshRequest contextualStrings count=\(cs.count) sample=\(cs.prefix(10)) file=\(Bundle.main.path(forResource: "words_flattened", ofType: "txt") ?? "nil")")
|
|
2307
|
-
|
|
2308
|
-
guard recognitionRequest != nil else {
|
|
2309
|
-
sendResult(error: ["code": "recognition_init"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
|
|
2310
|
-
teardown()
|
|
2311
|
-
return
|
|
2312
|
-
}
|
|
2313
|
-
|
|
2314
|
-
if audioEngine == nil {
|
|
2315
|
-
bumpGraphGen();
|
|
2316
|
-
audioEngine = AVAudioEngine()
|
|
2317
|
-
}
|
|
2318
|
-
AudioPlaybackHook.currentEngine = { [weak self] in self?.audioEngine } // ⬅️ add this
|
|
2319
|
-
|
|
2320
|
-
do {
|
|
2321
|
-
guard let engine = audioEngine else { throw NSError(domain: "voice.audio", code: -1) }
|
|
2322
|
-
let inputNode = engine.inputNode
|
|
2323
|
-
let _ = inputNode // presence check
|
|
2324
|
-
|
|
2325
|
-
reconcileAEC(on: engine, reason: "setup-start-prestart", allowRebuild: false)
|
|
2326
|
-
|
|
2327
|
-
// if output node voice processing is ever needed, keep commented as in original:
|
|
2328
|
-
// do { try engine.outputNode.setVoiceProcessingEnabled(true) } catch { ... }
|
|
2329
|
-
|
|
2330
|
-
NSLog("[STT] AEC enable done")
|
|
2331
|
-
|
|
2332
|
-
// --- FIXED WIRING: use live format on first hop, nil downstream, nil for tap ---
|
|
2333
|
-
let inFmt = engine.inputNode.outputFormat(forBus: 0)
|
|
2334
|
-
|
|
2335
|
-
// 1) Mute only the mic path, not the whole main mixer
|
|
2336
|
-
let micMixer = AVAudioMixerNode()
|
|
2337
|
-
engine.attach(micMixer)
|
|
2338
|
-
// Use the live input format for input → micMixer
|
|
2339
|
-
engine.connect(inputNode, to: micMixer, format: inFmt)
|
|
2340
|
-
// Let main mixer pick downstream format
|
|
2341
|
-
engine.connect(micMixer, to: engine.mainMixerNode, format: nil)
|
|
2342
|
-
micMixer.outputVolume = 0.0 // ← you won't hear your own mic
|
|
2343
|
-
|
|
2344
|
-
// 2) Prepare a player node for TTS inside the SAME engine/graph
|
|
2345
|
-
let player = AVAudioPlayerNode()
|
|
2346
|
-
self.playbackNode = player
|
|
2347
|
-
engine.attach(player)
|
|
2348
|
-
|
|
2349
|
-
// // --- Aggressive low-pass only ---
|
|
2350
|
-
// let deEss = AVAudioUnitEQ(numberOfBands: 1)
|
|
2351
|
-
// let lpf = deEss.bands[0]
|
|
2352
|
-
// lpf.filterType = .lowPass
|
|
2353
|
-
// lpf.frequency = 6500 // try 6000–7500
|
|
2354
|
-
// lpf.bandwidth = 0.35 // fairly steep
|
|
2355
|
-
// lpf.gain = 0.0
|
|
2356
|
-
// lpf.bypass = false
|
|
2357
|
-
|
|
2358
|
-
// self.ttsEQ = deEss
|
|
2359
|
-
// engine.attach(deEss)
|
|
2360
|
-
|
|
2361
|
-
// engine.disconnectNodeOutput(player)
|
|
2362
|
-
// engine.connect(player, to: deEss, format: nil)
|
|
2363
|
-
// engine.connect(deEss, to: engine.mainMixerNode, format: nil)
|
|
2364
|
-
engine.connect(player, to: engine.mainMixerNode, format: nil)
|
|
2365
|
-
|
|
2366
|
-
|
|
2367
|
-
NSLog("[STT] graph connected (mic->mute mixer, player->mainMixer)")
|
|
2368
|
-
|
|
2369
|
-
var tapFrames: UInt64 = 0
|
|
2370
|
-
// Tap uses nil so it follows the node’s current output format (survives route SR changes)
|
|
2371
|
-
|
|
2372
|
-
safeRemoveTap(inputNode, bus: 0)
|
|
2373
|
-
let format = inputNode.outputFormat(forBus: 0) // <- prefer explicit format
|
|
2374
|
-
guard format.sampleRate > 0, format.channelCount > 0 else {
|
|
2375
|
-
NSLog("[STT] skip tap: invalid input format (sr=\(format.sampleRate), ch=\(format.channelCount))")
|
|
2376
|
-
return
|
|
2377
|
-
}
|
|
2378
|
-
|
|
2379
|
-
var tapBufferSize: AVAudioFrameCount = 1024
|
|
2380
|
-
if let svStart = speakerVerificationStartConfig {
|
|
2381
|
-
do {
|
|
2382
|
-
var svConfig = svStart.config
|
|
2383
|
-
let routeSampleRate = Int(round(format.sampleRate))
|
|
2384
|
-
if useShortSpeakerVerificationTailWindow {
|
|
2385
|
-
let forcedTail = max(0.1, shortSpeakerVerificationTailSeconds)
|
|
2386
|
-
svConfig.tailSeconds = forcedTail
|
|
2387
|
-
if svConfig.maxTailSeconds < forcedTail {
|
|
2388
|
-
svConfig.maxTailSeconds = forcedTail
|
|
2389
|
-
}
|
|
2390
|
-
NSLog("[STT] SV tail override enabled tailSeconds=\(forcedTail)")
|
|
2391
|
-
}
|
|
2392
|
-
|
|
2393
|
-
speakerVerificationFrameSize = svConfig.frameSize
|
|
2394
|
-
speakerVerificationThreshold = svConfig.decisionThreshold
|
|
2395
|
-
speakerVerificationFrameSeq = 0
|
|
2396
|
-
speakerVerificationSourceSampleRate = routeSampleRate
|
|
2397
|
-
speakerVerificationTargetSampleRate = svConfig.sampleRate
|
|
2398
|
-
speakerVerificationResampleCarry.removeAll(keepingCapacity: true)
|
|
2399
|
-
speakerVerificationResamplePos = 0
|
|
2400
|
-
speakerLastPositiveMatchAt = 0
|
|
2401
|
-
speakerVerificationInputBuffer.removeAll(keepingCapacity: true)
|
|
2402
|
-
setSpeakerGateState(enabled: false, open: false)
|
|
2403
|
-
speakerVerificationErrorSent = false
|
|
2404
|
-
speakerPreRollBuffers.removeAll(keepingCapacity: true)
|
|
2405
|
-
speakerPreRollFrames = 0
|
|
2406
|
-
speakerPendingPreRollFlush = false
|
|
2407
|
-
speakerPreRollMaxFrames = max(1, Int(round(format.sampleRate * speakerPreRollSeconds)))
|
|
2408
|
-
|
|
2409
|
-
svConfig.logLevel = .off
|
|
2410
|
-
let svEngine = try SpeakerVerificationEngine(config: svConfig)
|
|
2411
|
-
svEngine.setEnrollment(svStart.enrollment)
|
|
2412
|
-
svEngine.resetStreamingState()
|
|
2413
|
-
|
|
2414
|
-
speakerVerificationEngine = svEngine
|
|
2415
|
-
setSpeakerGateState(enabled: true, open: false)
|
|
2416
|
-
tapBufferSize = AVAudioFrameCount(max(64, svConfig.frameSize))
|
|
2417
|
-
NSLog("[STT] Speaker verification gate enabled frameSize=\(svConfig.frameSize) tailSeconds=\(svConfig.tailSeconds) threshold=\(svConfig.decisionThreshold) hangover=\(useSpeakerGateHangover ? "ON" : "OFF") hangSec=\(String(format: "%.3f", speakerGateHangoverSeconds))")
|
|
2418
|
-
if routeSampleRate != svConfig.sampleRate {
|
|
2419
|
-
NSLog("[STT] SV resampling enabled \(routeSampleRate)Hz -> \(svConfig.sampleRate)Hz")
|
|
2420
|
-
} else {
|
|
2421
|
-
NSLog("[STT] SV sampleRate already matched at \(routeSampleRate)Hz")
|
|
2422
|
-
}
|
|
2423
|
-
} catch {
|
|
2424
|
-
speakerVerificationEngine = nil
|
|
2425
|
-
speakerVerificationThreshold = 0
|
|
2426
|
-
speakerVerificationFrameSeq = 0
|
|
2427
|
-
speakerVerificationSourceSampleRate = 0
|
|
2428
|
-
speakerVerificationTargetSampleRate = 0
|
|
2429
|
-
speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
|
|
2430
|
-
speakerVerificationResamplePos = 0
|
|
2431
|
-
speakerLastPositiveMatchAt = 0
|
|
2432
|
-
setSpeakerGateState(enabled: false, open: true)
|
|
2433
|
-
speakerPreRollBuffers.removeAll(keepingCapacity: false)
|
|
2434
|
-
speakerPreRollFrames = 0
|
|
2435
|
-
speakerPreRollMaxFrames = 0
|
|
2436
|
-
speakerPendingPreRollFlush = false
|
|
2437
|
-
sendResult(error: ["message": "Speaker verification disabled: \(error.localizedDescription)"],
|
|
2438
|
-
bestTranscription: nil,
|
|
2439
|
-
transcriptions: nil,
|
|
2440
|
-
isFinal: nil)
|
|
2441
|
-
}
|
|
2442
|
-
}
|
|
2443
|
-
|
|
2444
|
-
inputNode.installTap(onBus: 0, bufferSize: tapBufferSize, format: format) { [weak self] buffer, _ in
|
|
2445
|
-
// Strongify self once
|
|
2446
|
-
guard let self = self else { return }
|
|
2447
|
-
// ✅ Count frames globally so the watchdog can see forward progress
|
|
2448
|
-
self.tapFramesTotal &+= UInt64(buffer.frameLength)
|
|
2449
|
-
|
|
2450
|
-
if self.tapFramesTotal % (44100 * 2) < 1024 { // ~every 2s at 44.1k
|
|
2451
|
-
NSLog("[STT] tap alive, totalFrames=\(self.tapFramesTotal)")
|
|
2452
|
-
}
|
|
2453
|
-
|
|
2454
|
-
let frames: vDSP_Length = vDSP_Length(buffer.frameLength)
|
|
2455
|
-
let LEVEL_LOWPASS_TRIG: Float = 0.5
|
|
2456
|
-
|
|
2457
|
-
// CH0
|
|
2458
|
-
if buffer.format.channelCount > 0, let ch0 = buffer.floatChannelData?[0] {
|
|
2459
|
-
var peak0: Float = 0
|
|
2460
|
-
vDSP_maxmgv(ch0, 1, &peak0, frames)
|
|
2461
|
-
let db0: Float = (peak0 == 0) ? -100 : 20.0 * log10f(peak0)
|
|
2462
|
-
|
|
2463
|
-
let smoothed0 = LEVEL_LOWPASS_TRIG * db0
|
|
2464
|
-
+ (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel0
|
|
2465
|
-
self.averagePowerForChannel0 = smoothed0
|
|
2466
|
-
self.averagePowerForChannel1 = smoothed0
|
|
2467
|
-
}
|
|
2468
|
-
|
|
2469
|
-
// CH1
|
|
2470
|
-
if buffer.format.channelCount > 1, let ch1 = buffer.floatChannelData?[1] {
|
|
2471
|
-
var peak1: Float = 0
|
|
2472
|
-
vDSP_maxmgv(ch1, 1, &peak1, frames)
|
|
2473
|
-
let db1: Float = (peak1 == 0) ? -100 : 20.0 * log10f(peak1)
|
|
2474
|
-
|
|
2475
|
-
let smoothed1 = LEVEL_LOWPASS_TRIG * db1
|
|
2476
|
-
+ (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel1
|
|
2477
|
-
self.averagePowerForChannel1 = smoothed1
|
|
2478
|
-
}
|
|
2479
|
-
|
|
2480
|
-
// Normalize 0–10 and emit
|
|
2481
|
-
self.averagePowerForChannel1 = Float(self._normalizedPowerLevelFromDecibels(CGFloat(self.averagePowerForChannel1)) * 10.0)
|
|
2482
|
-
let value = self.averagePowerForChannel1
|
|
2483
|
-
self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": value])
|
|
2484
|
-
|
|
2485
|
-
if self.currentSpeakerGateState().enabled, let ch0 = buffer.floatChannelData?[0] {
|
|
2486
|
-
let mono = Array(UnsafeBufferPointer(start: ch0, count: Int(buffer.frameLength)))
|
|
2487
|
-
self.processSpeakerVerificationSamples(mono)
|
|
2488
|
-
}
|
|
2489
|
-
|
|
2490
|
-
// Append to recognition
|
|
2491
|
-
let gate = self.currentSpeakerGateState()
|
|
2492
|
-
if !gate.enabled {
|
|
2493
|
-
self.recognitionRequest?.append(buffer)
|
|
2494
|
-
NSLog("[STT][SV][TAP] samples=\(buffer.frameLength) gate=DISABLED action=append")
|
|
2495
|
-
} else if gate.open {
|
|
2496
|
-
self.flushSpeakerPreRollIfNeeded()
|
|
2497
|
-
self.recognitionRequest?.append(buffer)
|
|
2498
|
-
NSLog("[STT][SV][TAP] samples=\(buffer.frameLength) gate=OPEN action=append preRollFrames=\(self.currentSpeakerPreRollFrames())")
|
|
2499
|
-
} else {
|
|
2500
|
-
self.enqueueSpeakerPreRoll(buffer)
|
|
2501
|
-
NSLog("[STT][SV][TAP] samples=\(buffer.frameLength) gate=CLOSED action=buffer preRollFrames=\(self.currentSpeakerPreRollFrames())")
|
|
2502
|
-
}
|
|
2503
|
-
|
|
2504
|
-
// inside inputNode.installTap { buffer, _ in
|
|
2505
|
-
self.lastBufferAt = CACurrentMediaTime()
|
|
2506
|
-
}
|
|
2507
|
-
|
|
2508
|
-
engine.prepare()
|
|
2509
|
-
NSLog("[STT] audioEngine prepare")
|
|
2510
|
-
var audioSessionError: NSError?
|
|
2511
|
-
do {
|
|
2512
|
-
try engine.start()
|
|
2513
|
-
armFirstIOCycleLatch(on: engine)
|
|
2514
|
-
reconcileAEC(on: engine, reason: "setup-start-poststart", allowRebuild: false)
|
|
2515
|
-
scheduleAECReconcileRetries(reason: "setup-start")
|
|
2516
|
-
} catch {
|
|
2517
|
-
audioSessionError = error as NSError
|
|
2518
|
-
}
|
|
2519
|
-
|
|
2520
|
-
// after engine.start() success:
|
|
2521
|
-
engineHotAt = CACurrentMediaTime()
|
|
2522
|
-
seenRealSpeech = false
|
|
2523
|
-
NSLog("engine HOT at \(engineHotAt)")
|
|
2524
|
-
sendEvent(name: "onSpeechStart", body: nil) // engine hot signal (keep if you want)
|
|
2525
|
-
startTask(makeFreshRequest())
|
|
2526
|
-
|
|
2527
|
-
installPlaybackHooks()
|
|
2528
|
-
|
|
2529
|
-
startWatchdog()
|
|
2530
|
-
|
|
2531
|
-
NSLog("audioEngine startAndReturnError")
|
|
2532
|
-
if let audioSessionError = audioSessionError {
|
|
2533
|
-
NotificationCenter.default.addObserver(self,
|
|
2534
|
-
selector: #selector(self.handleEngineConfigChange(_:)),
|
|
2535
|
-
name: .AVAudioEngineConfigurationChange,
|
|
2536
|
-
object: engine)
|
|
2537
|
-
NSLog("audioEngine audioSessionError!=nil")
|
|
2538
|
-
self.sendResult(error: ["code": "audio", "message": audioSessionError.localizedDescription],
|
|
2539
|
-
bestTranscription: nil, transcriptions: nil, isFinal: nil)
|
|
2540
|
-
NSLog("[STT] self sendResult")
|
|
2541
|
-
// self.teardown()
|
|
2542
|
-
NSLog("[STT] Removed self teardown")
|
|
2543
|
-
return
|
|
2544
|
-
}
|
|
2545
|
-
NSLog("After Start recording and append recording")
|
|
2546
|
-
DispatchQueue.main.asyncAfter(deadline: .now() + 3.0) { [weak self] in
|
|
2547
|
-
guard let self = self else { return }
|
|
2548
|
-
let running = self.audioEngine?.isRunning ?? false
|
|
2549
|
-
let taskState = self.recognitionTask?.state.rawValue ?? -1
|
|
2550
|
-
NSLog("[STT] health: engineRunning=\(running) taskState=\(taskState)")
|
|
2551
|
-
}
|
|
2552
|
-
|
|
2553
|
-
NSLog("After if audioSessionError != nil")
|
|
2554
|
-
} catch let e as NSError {
|
|
2555
|
-
sendResult(error: ["code": "start_recording", "message": e.localizedDescription],
|
|
2556
|
-
bestTranscription: nil, transcriptions: nil, isFinal: nil)
|
|
2557
|
-
NSLog("End of init...")
|
|
2558
|
-
return
|
|
2559
|
-
}
|
|
2560
|
-
}
|
|
2561
|
-
|
|
2562
|
-
private func loadSpeakerVerificationStartConfig(onboardingJsonPath: String) throws -> SpeakerVerificationStartConfig {
|
|
2563
|
-
let data = try Data(contentsOf: URL(fileURLWithPath: onboardingJsonPath))
|
|
2564
|
-
let enrollment = try SpeakerEnrollment.deserialize(data)
|
|
2565
|
-
return SpeakerVerificationStartConfig(enrollment: enrollment, config: enrollment.configSnapshot)
|
|
2566
|
-
}
|
|
2567
|
-
|
|
2568
|
-
private func currentSpeakerGateState() -> (enabled: Bool, open: Bool) {
|
|
2569
|
-
speakerVerificationStateLock.lock()
|
|
2570
|
-
let state = (speakerGateEnabled, speakerGateOpen)
|
|
2571
|
-
speakerVerificationStateLock.unlock()
|
|
2572
|
-
return state
|
|
2573
|
-
}
|
|
2574
|
-
|
|
2575
|
-
private func setSpeakerGateState(enabled: Bool, open: Bool) {
|
|
2576
|
-
speakerVerificationStateLock.lock()
|
|
2577
|
-
let wasOpen = speakerGateOpen
|
|
2578
|
-
let wasEnabled = speakerGateEnabled
|
|
2579
|
-
speakerGateEnabled = enabled
|
|
2580
|
-
speakerGateOpen = open
|
|
2581
|
-
let changed = (wasOpen != open) || (wasEnabled != enabled)
|
|
2582
|
-
if enabled && open && (!wasEnabled || !wasOpen) {
|
|
2583
|
-
speakerPendingPreRollFlush = true
|
|
2584
|
-
}
|
|
2585
|
-
if !enabled {
|
|
2586
|
-
speakerPendingPreRollFlush = false
|
|
2587
|
-
}
|
|
2588
|
-
speakerVerificationStateLock.unlock()
|
|
2589
|
-
if changed {
|
|
2590
|
-
NSLog("[STT][SV][GATE] enabled=\(enabled ? "YES" : "NO") open=\(open ? "YES" : "NO") th=\(speakerVerificationThreshold)")
|
|
2591
|
-
}
|
|
2592
|
-
}
|
|
2593
|
-
|
|
2594
|
-
private func currentSpeakerPreRollFrames() -> Int {
|
|
2595
|
-
speakerVerificationStateLock.lock()
|
|
2596
|
-
let n = speakerPreRollFrames
|
|
2597
|
-
speakerVerificationStateLock.unlock()
|
|
2598
|
-
return n
|
|
2599
|
-
}
|
|
2600
|
-
|
|
2601
|
-
private func enqueueSpeakerPreRoll(_ buffer: AVAudioPCMBuffer) {
|
|
2602
|
-
speakerVerificationStateLock.lock()
|
|
2603
|
-
defer { speakerVerificationStateLock.unlock() }
|
|
2604
|
-
guard speakerPreRollMaxFrames > 0 else { return }
|
|
2605
|
-
guard let copy = copyPCMBuffer(buffer) else { return }
|
|
2606
|
-
speakerPreRollBuffers.append(copy)
|
|
2607
|
-
speakerPreRollFrames += Int(copy.frameLength)
|
|
2608
|
-
|
|
2609
|
-
while speakerPreRollFrames > speakerPreRollMaxFrames, !speakerPreRollBuffers.isEmpty {
|
|
2610
|
-
let dropped = speakerPreRollBuffers.removeFirst()
|
|
2611
|
-
speakerPreRollFrames -= Int(dropped.frameLength)
|
|
2612
|
-
}
|
|
2613
|
-
}
|
|
2614
|
-
|
|
2615
|
-
private func flushSpeakerPreRollIfNeeded() {
|
|
2616
|
-
var toFlush: [AVAudioPCMBuffer] = []
|
|
2617
|
-
var totalFrames = 0
|
|
2618
|
-
var selectedFrames = 0
|
|
2619
|
-
speakerVerificationStateLock.lock()
|
|
2620
|
-
if speakerPendingPreRollFlush {
|
|
2621
|
-
totalFrames = speakerPreRollFrames
|
|
2622
|
-
if useLegacySpeakerGateBehavior {
|
|
2623
|
-
toFlush = speakerPreRollBuffers
|
|
2624
|
-
} else {
|
|
2625
|
-
let sr = max(1, speakerVerificationSourceSampleRate)
|
|
2626
|
-
let maxFrames = max(1, Int(round(Double(sr) * speakerPreRollFlushMaxSeconds)))
|
|
2627
|
-
if totalFrames <= maxFrames {
|
|
2628
|
-
toFlush = speakerPreRollBuffers
|
|
2629
|
-
} else {
|
|
2630
|
-
var kept: [AVAudioPCMBuffer] = []
|
|
2631
|
-
var keptFrames = 0
|
|
2632
|
-
for b in speakerPreRollBuffers.reversed() {
|
|
2633
|
-
kept.append(b)
|
|
2634
|
-
keptFrames += Int(b.frameLength)
|
|
2635
|
-
if keptFrames >= maxFrames { break }
|
|
2636
|
-
}
|
|
2637
|
-
toFlush = kept.reversed()
|
|
2638
|
-
}
|
|
2639
|
-
}
|
|
2640
|
-
selectedFrames = toFlush.reduce(0) { $0 + Int($1.frameLength) }
|
|
2641
|
-
speakerPreRollBuffers.removeAll(keepingCapacity: false)
|
|
2642
|
-
speakerPreRollFrames = 0
|
|
2643
|
-
speakerPendingPreRollFlush = false
|
|
2644
|
-
}
|
|
2645
|
-
speakerVerificationStateLock.unlock()
|
|
2646
|
-
|
|
2647
|
-
if toFlush.isEmpty { return }
|
|
2648
|
-
NSLog("[STT][SV][PREROLL] flushing buffers=\(toFlush.count) frames=\(selectedFrames) totalBuffered=\(totalFrames)")
|
|
2649
|
-
for b in toFlush {
|
|
2650
|
-
recognitionRequest?.append(b)
|
|
2651
|
-
}
|
|
2652
|
-
}
|
|
2653
|
-
|
|
2654
|
-
private func copyPCMBuffer(_ source: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
|
|
2655
|
-
guard let dst = AVAudioPCMBuffer(pcmFormat: source.format, frameCapacity: source.frameLength) else {
|
|
2656
|
-
return nil
|
|
2657
|
-
}
|
|
2658
|
-
dst.frameLength = source.frameLength
|
|
2659
|
-
let channels = Int(source.format.channelCount)
|
|
2660
|
-
let frames = Int(source.frameLength)
|
|
2661
|
-
|
|
2662
|
-
if let src = source.floatChannelData, let out = dst.floatChannelData {
|
|
2663
|
-
let bytes = frames * MemoryLayout<Float>.size
|
|
2664
|
-
for ch in 0..<channels {
|
|
2665
|
-
memcpy(out[ch], src[ch], bytes)
|
|
2666
|
-
}
|
|
2667
|
-
return dst
|
|
2668
|
-
}
|
|
2669
|
-
|
|
2670
|
-
if let src = source.int16ChannelData, let out = dst.int16ChannelData {
|
|
2671
|
-
let bytes = frames * MemoryLayout<Int16>.size
|
|
2672
|
-
for ch in 0..<channels {
|
|
2673
|
-
memcpy(out[ch], src[ch], bytes)
|
|
2674
|
-
}
|
|
2675
|
-
return dst
|
|
2676
|
-
}
|
|
2677
|
-
|
|
2678
|
-
if let src = source.int32ChannelData, let out = dst.int32ChannelData {
|
|
2679
|
-
let bytes = frames * MemoryLayout<Int32>.size
|
|
2680
|
-
for ch in 0..<channels {
|
|
2681
|
-
memcpy(out[ch], src[ch], bytes)
|
|
2682
|
-
}
|
|
2683
|
-
return dst
|
|
2684
|
-
}
|
|
2685
|
-
|
|
2686
|
-
return nil
|
|
2687
|
-
}
|
|
2688
|
-
|
|
2689
|
-
private func resampleSamplesForSpeakerVerificationIfNeeded(_ input: [Float]) -> [Float] {
|
|
2690
|
-
guard !input.isEmpty else { return [] }
|
|
2691
|
-
let srcRate = speakerVerificationSourceSampleRate
|
|
2692
|
-
let dstRate = speakerVerificationTargetSampleRate
|
|
2693
|
-
guard srcRate > 0, dstRate > 0 else { return input }
|
|
2694
|
-
if srcRate == dstRate { return input }
|
|
2695
|
-
|
|
2696
|
-
let ratio = Double(srcRate) / Double(dstRate) // source samples per output sample
|
|
2697
|
-
let source = speakerVerificationResampleCarry + input
|
|
2698
|
-
guard source.count >= 2 else {
|
|
2699
|
-
speakerVerificationResampleCarry = source
|
|
2700
|
-
return []
|
|
2701
|
-
}
|
|
2702
|
-
|
|
2703
|
-
var out: [Float] = []
|
|
2704
|
-
out.reserveCapacity(Int(Double(input.count) * Double(dstRate) / Double(srcRate)) + 8)
|
|
2705
|
-
|
|
2706
|
-
var pos = speakerVerificationResamplePos
|
|
2707
|
-
while pos + 1.0 < Double(source.count) {
|
|
2708
|
-
let i = Int(pos)
|
|
2709
|
-
let frac = Float(pos - Double(i))
|
|
2710
|
-
let a = source[i]
|
|
2711
|
-
let b = source[i + 1]
|
|
2712
|
-
out.append(a + (b - a) * frac)
|
|
2713
|
-
pos += ratio
|
|
2714
|
-
}
|
|
2715
|
-
|
|
2716
|
-
let keepStart = max(0, Int(floor(pos)) - 1)
|
|
2717
|
-
speakerVerificationResampleCarry = Array(source[keepStart...])
|
|
2718
|
-
speakerVerificationResamplePos = pos - Double(keepStart)
|
|
2719
|
-
return out
|
|
2720
|
-
}
|
|
2721
|
-
|
|
2722
|
-
private func processSpeakerVerificationSamples(_ samples: [Float]) {
|
|
2723
|
-
guard !samples.isEmpty else { return }
|
|
2724
|
-
speakerVerificationQueue.async { [weak self] in
|
|
2725
|
-
guard let self = self else { return }
|
|
2726
|
-
guard let engine = self.speakerVerificationEngine else { return }
|
|
2727
|
-
let frameSize = self.speakerVerificationFrameSize
|
|
2728
|
-
guard frameSize > 0 else { return }
|
|
2729
|
-
|
|
2730
|
-
let normalized = self.resampleSamplesForSpeakerVerificationIfNeeded(samples)
|
|
2731
|
-
if normalized.isEmpty { return }
|
|
2732
|
-
self.speakerVerificationInputBuffer.append(contentsOf: normalized)
|
|
2733
|
-
|
|
2734
|
-
while self.speakerVerificationInputBuffer.count >= frameSize {
|
|
2735
|
-
let frame = Array(self.speakerVerificationInputBuffer.prefix(frameSize))
|
|
2736
|
-
self.speakerVerificationInputBuffer.removeFirst(frameSize)
|
|
2737
|
-
self.speakerVerificationFrameSeq &+= 1
|
|
2738
|
-
let seq = self.speakerVerificationFrameSeq
|
|
2739
|
-
|
|
2740
|
-
do {
|
|
2741
|
-
let out = try engine.processFrame(frame: frame)
|
|
2742
|
-
switch out {
|
|
2743
|
-
case .pending(let p):
|
|
2744
|
-
let gate = self.currentSpeakerGateState()
|
|
2745
|
-
NSLog("[STT][SV][FRAME #\(seq)] pending buffered=\(p.bufferedSamples) neededSec=\(p.neededSeconds) gate=\(gate.open ? "OPEN" : "CLOSED") th=\(self.speakerVerificationThreshold)")
|
|
2746
|
-
case .result(let result):
|
|
2747
|
-
if self.useLegacySpeakerGateBehavior || !self.useSpeakerGateHangover {
|
|
2748
|
-
self.setSpeakerGateState(enabled: true, open: result.isMatch)
|
|
2749
|
-
} else {
|
|
2750
|
-
let now = CACurrentMediaTime()
|
|
2751
|
-
if result.isMatch {
|
|
2752
|
-
self.speakerLastPositiveMatchAt = now
|
|
2753
|
-
self.setSpeakerGateState(enabled: true, open: true)
|
|
2754
|
-
} else {
|
|
2755
|
-
let keepOpen = self.speakerLastPositiveMatchAt > 0 &&
|
|
2756
|
-
(now - self.speakerLastPositiveMatchAt) <= max(0, self.speakerGateHangoverSeconds)
|
|
2757
|
-
self.setSpeakerGateState(enabled: true, open: keepOpen)
|
|
2758
|
-
}
|
|
2759
|
-
}
|
|
2760
|
-
let gate = self.currentSpeakerGateState()
|
|
2761
|
-
NSLog("[STT][SV][FRAME #\(seq)] scoreBest=\(String(format: "%.4f", result.scoreBest)) raw=\(String(format: "%.4f", result.scoreBestRaw)) meancombo=\(String(format: "%.4f", result.scoreBestMeancombo)) mean=\(String(format: "%.4f", result.scoreMean)) match=\(result.isMatch ? "YES" : "NO") gate=\(gate.open ? "OPEN" : "CLOSED") th=\(String(format: "%.4f", self.speakerVerificationThreshold)) hangover=\(self.useSpeakerGateHangover ? "ON" : "OFF") hangSec=\(String(format: "%.3f", self.speakerGateHangoverSeconds))")
|
|
2762
|
-
}
|
|
2763
|
-
} catch {
|
|
2764
|
-
self.speakerVerificationEngine = nil
|
|
2765
|
-
self.speakerVerificationInputBuffer.removeAll(keepingCapacity: false)
|
|
2766
|
-
self.speakerVerificationSourceSampleRate = 0
|
|
2767
|
-
self.speakerVerificationTargetSampleRate = 0
|
|
2768
|
-
self.speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
|
|
2769
|
-
self.speakerVerificationResamplePos = 0
|
|
2770
|
-
self.speakerLastPositiveMatchAt = 0
|
|
2771
|
-
self.setSpeakerGateState(enabled: false, open: true)
|
|
2772
|
-
self.speakerVerificationStateLock.lock()
|
|
2773
|
-
self.speakerPreRollBuffers.removeAll(keepingCapacity: false)
|
|
2774
|
-
self.speakerPreRollFrames = 0
|
|
2775
|
-
self.speakerPreRollMaxFrames = 0
|
|
2776
|
-
self.speakerVerificationStateLock.unlock()
|
|
2777
|
-
if !self.speakerVerificationErrorSent {
|
|
2778
|
-
self.speakerVerificationErrorSent = true
|
|
2779
|
-
DispatchQueue.main.async { [weak self] in
|
|
2780
|
-
self?.sendResult(error: ["message": "Speaker verification stopped: \(error.localizedDescription)"],
|
|
2781
|
-
bestTranscription: nil,
|
|
2782
|
-
transcriptions: nil,
|
|
2783
|
-
isFinal: nil)
|
|
2784
|
-
}
|
|
2785
|
-
}
|
|
2786
|
-
return
|
|
2787
|
-
}
|
|
2788
|
-
}
|
|
2789
|
-
}
|
|
2790
|
-
}
|
|
2791
|
-
|
|
2792
|
-
// MARK: - Helpers
|
|
2793
|
-
/// Maps an audio power reading in decibels onto a normalized [0, 1] level.
///
/// - Parameter decibels: Raw dB value; -80 dB is treated as the silence floor,
///   and exactly 0.0 dB is treated as a "no signal" sentinel (behavior carried
///   over from the original ObjC implementation).
/// - Returns: 0.0 at or below the floor (or for the 0.0 sentinel), 1.0 once
///   the linear amplitude reaches full scale, and a square-root-shaped ramp
///   in between.
private func _normalizedPowerLevelFromDecibels(_ decibels: CGFloat) -> CGFloat {
    if decibels < -80.0 || decibels == 0.0 { return 0.0 }
    let floorDb: Float = -80.0
    // Convert the floor and the reading from dB to linear amplitude (10^(dB/20)).
    let floorAmplitude = powf(10.0, 0.05 * floorDb)
    let amplitude = powf(10.0, 0.05 * Float(decibels))
    // Rescale relative to the floor, then take the square root for a perceptual ramp.
    let level = powf((amplitude - floorAmplitude) * (1.0 / (1.0 - floorAmplitude)), 1.0 / 2.0)
    return level < 1.0 ? CGFloat(level) : 1.0
}
|
|
2801
|
-
|
|
2802
|
-
/// Forwards a named event (with an optional payload) to the delegate.
/// No-op when no delegate is attached.
private func sendEvent(name: String, body: [String: Any]?) {
    guard let delegate = delegate else { return }
    delegate.stt(self, didEmitEvent: name, body: body)
}
|
|
2805
|
-
|
|
2806
|
-
/// Exact event behavior preserved from ObjC `sendResult`.
///
/// Fans each non-nil argument out as exactly one delegate event, in this
/// fixed order: error, best transcription, partial transcriptions, final flag.
/// All events are suppressed (with a log line naming the reason) while the
/// mic or the speech recognizer is paused.
private func sendResult(error: [String: Any]?,
                        bestTranscription: String?,
                        transcriptions: [String]?,
                        isFinal: Bool?) {
    NSLog("[STT] sendResult called")

    // Snapshot both pause flags once, then log which one(s) caused suppression.
    let mic = self.micPaused
    let recognition = self.speechRecognitionPaused
    if mic || recognition {
        switch (mic, recognition) {
        case (true, true):
            NSLog("[STT] sendResult suppressed: micPaused + speechRecognitionPaused")
        case (true, false):
            NSLog("[STT] sendResult suppressed: micPaused")
        default:
            NSLog("[STT] sendResult suppressed: speechRecognitionPaused")
        }
        return
    }

    if let error = error {
        sendEvent(name: "onSpeechError", body: ["error": error])
    }
    if let best = bestTranscription {
        // Best transcription is wrapped in a single-element array by contract.
        sendEvent(name: "onSpeechResults", body: ["value": [best]])
    }
    if let partials = transcriptions {
        sendEvent(name: "onSpeechPartialResults", body: ["value": partials])
    }
    if let finalFlag = isFinal {
        sendEvent(name: "onSpeechRecognized", body: ["isFinal": finalFlag])
    }
}
|
|
2836
|
-
|
|
2837
|
-
// MARK: - SFSpeechRecognizerDelegate
|
|
2838
|
-
|
|
2839
|
-
/// SFSpeechRecognizerDelegate callback for availability changes.
/// Only the "became unavailable" transition is surfaced as an error event;
/// availability coming back is silent, matching the original behavior.
public func speechRecognizer(_ speechRecognizer: SFSpeechRecognizer, availabilityDidChange available: Bool) {
    guard !available else { return }
    sendResult(error: ["message": "Speech recognition is not available now"],
               bestTranscription: nil, transcriptions: nil, isFinal: nil)
}
|
|
2845
|
-
|
|
2846
|
-
// MARK: - Small helper to recreate recognizer (used by watchdog)
|
|
2847
|
-
/// Tears down and recreates the `SFSpeechRecognizer`, keeping the previously
/// configured locale when one was set (used by the watchdog to recover a
/// wedged recognizer). Re-attaches `self` as the delegate.
private func recreateSpeechRecognizerPreservingLocale() {
    let preservedLocale = speechRecognizer?.locale
    // SFSpeechRecognizer(locale:) is failable, so the property may legitimately
    // end up nil; optional binding replaces the previous guarded force-unwrap
    // (`loc != nil ? ...(locale: loc!) : ...`).
    if let preservedLocale = preservedLocale {
        speechRecognizer = SFSpeechRecognizer(locale: preservedLocale)
    } else {
        speechRecognizer = SFSpeechRecognizer()
    }
    speechRecognizer?.delegate = self
    NSLog("[STT] recreated SFSpeechRecognizer (locale preserved: \(preservedLocale?.identifier ?? "default"))")
}
|
|
2853
|
-
}
|