react-native-davoice-tts 1.0.305 → 1.0.307

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (19) hide show
  1. package/TTSRNBridge.podspec +1 -1
  2. package/ios/SpeechBridge/SpeechBridge.m +17 -2
  3. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DavoiceTTS +0 -0
  4. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.abi.json +8831 -8831
  5. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.private.swiftinterface +48 -48
  6. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.swiftinterface +48 -48
  7. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DavoiceTTS +0 -0
  8. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.abi.json +4092 -4092
  9. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.private.swiftinterface +12 -12
  10. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.swiftinterface +12 -12
  11. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.abi.json +4092 -4092
  12. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.private.swiftinterface +12 -12
  13. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.swiftinterface +12 -12
  14. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeDirectory +0 -0
  15. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeRequirements-1 +0 -0
  16. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeResources +24 -24
  17. package/package.json +1 -1
  18. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift.AEC.CRASH.ETC +0 -2853
  19. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift.AEC.CRASH.ETC +0 -2853
@@ -1,2853 +0,0 @@
1
- // STT.swift
2
- // Native iOS Swift version (AEC flow preserved 1:1)
3
-
4
- import Foundation
5
- import UIKit
6
- import Speech
7
- import Accelerate
8
- import AVFAudio // or import AVFoundation
9
-
10
- @objc public protocol STTDelegate: AnyObject {
11
- @objc func stt(_ stt: STT, didEmitEvent name: String, body: [String: Any]?)
12
- }
13
-
14
- @objcMembers
15
- public final class STT: NSObject, SFSpeechRecognizerDelegate {
16
- public weak var delegate: STTDelegate?
17
- public var continuous: Bool = true
18
-
19
- // Global AEC toggle (default ON to keep existing behavior)
20
- public var aecEnabled: Bool = true
21
- // If true, force VP/AEC ON for a short window after session activation while routes settle.
22
- public var forceAECDuringRouteWarmup: Bool = true
23
- public var aecRouteWarmupSeconds: Double = 20.0
24
- // If true, always request 16k input sample rate from AVAudioSession.
25
- // iOS may still override this depending on route / voice processing constraints.
26
- public var force16kMicSampleRate: Bool = false
27
- // If true, use old SV gate behavior (immediate open/close + full pre-roll flush).
28
- public var useLegacySpeakerGateBehavior: Bool = false
29
- // If true, keep gate open for a short hangover after the last positive match.
30
- public var useSpeakerGateHangover: Bool = true
31
- public var speakerGateHangoverSeconds: Double = 0.40
32
- // If true, override SV tailSeconds to 0.5s for faster switching tests.
33
- public var useShortSpeakerVerificationTailWindow: Bool = true
34
- public var shortSpeakerVerificationTailSeconds: Float = 0.5
35
- // In protected mode, flush only this much recent pre-roll when gate reopens.
36
- public var speakerPreRollFlushMaxSeconds: Double = 0.5
37
-
38
- // MARK: - Private
39
- private var speechRecognizer: SFSpeechRecognizer?
40
- private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
41
- private var audioEngine: AVAudioEngine?
42
- private var recognitionTask: SFSpeechRecognitionTask?
43
- private var audioSession: AVAudioSession?
44
- private let aecSessionActivationLock = NSLock()
45
- private var lastAECSessionActivationAt: CFTimeInterval = 0
46
- private var aecSessionIsActive: Bool = false
47
- private var isTearingDown: Bool = false
48
- private var sessionId: String?
49
- private var priorAudioCategory: AVAudioSession.Category?
50
- private var averagePowerForChannel0: Float = 0
51
- private var averagePowerForChannel1: Float = 0
52
- // Add to STT
53
- private var isAdjustingRoute = false
54
- private var lastRouteSignature: String = ""
55
-
56
- private var playbackNode: AVAudioPlayerNode?
57
- private var seenRealSpeech = false // flips true after first non-blank token
58
- private var engineHotAt: CFTimeInterval = 0 // when engine actually started
59
- private let warmupKeepAlive: CFTimeInterval = 4.0 // seconds we’ll keep re-arming in silence
60
-
61
- // Keep-engine-alive helpers
62
- private var lastReclaimAttempt: CFAbsoluteTime = 0
63
- private let reclaimCooldown: CFTimeInterval = 1.0
64
-
65
- // Serialize pause/unpause (and their waits)
66
- private let micPauseLock = NSRecursiveLock()
67
-
68
- // --- Task health ---
69
- private var lastBufferAt: CFTimeInterval = 0 // updated from tap
70
- private var lastResultAt: CFTimeInterval = 0 // updated from recognition callback
71
- private var lastTaskStartAt: CFTimeInterval = 0
72
- private var stallWatchdog: Timer?
73
- private var consecutiveStallCount = 0
74
- private let stallThreshold: CFTimeInterval = 8.0 // seconds w/o results while engine is hot
75
- private let rearmCooldownTask: CFTimeInterval = 2.0
76
- private var lastRearmAt: CFTimeInterval = 0
77
- private var engineHot = false
78
- private var hotAt: CFTimeInterval = 0
79
- private var lastLocaleStr: String = ""
80
- // --- Recovery & diagnostics ---
81
- private var recoverySeq = 0
82
- private var lastRecoveryAt: CFTimeInterval = 0
83
- private var lastTaskOrigin: String = "cold"
84
- private var savedSessionBeforePause: (
85
- category: AVAudioSession.Category,
86
- mode: AVAudioSession.Mode,
87
- options: AVAudioSession.CategoryOptions,
88
- sr: Double,
89
- inCh: Int,
90
- outCh: Int,
91
- ioDur: TimeInterval
92
- )?
93
-
94
- private(set) var sttActive = false
95
- // STT.swift (add near `private var playbackNode: AVAudioPlayerNode?`)
96
- // private var ttsEQ: AVAudioUnitEQ?
97
-
98
- // Add near your other state:
99
- private var ioLatchActiveGen: UInt64 = 0
100
-
101
- // TTS probe state
102
- private var mixerProbeActive = false
103
- private var mixerProbeCompletions: [(Bool) -> Void] = []
104
- private let ttsSerial = DispatchQueue(label: "stt.tts.serial") // serialize TTS schedule/play
105
- private var engineHasRenderedOnce = false
106
-
107
- private var tapFramesTotal: UInt64 = 0 // monotonically increases inside input tap
108
- private var lastTapFramesSeen: UInt64 = 0 // snapshot seen by watchdog
109
- private var lastNoInputRecoveryAt: CFTimeInterval = 0
110
- private var consecutiveNoInputResets = 0
111
- // thresholds / cool-downs
112
- private let noInputThreshold: CFTimeInterval = 1.0 // seconds without any buffers
113
- private let noInputCooldown: CFTimeInterval = 5.0 // avoid thrashing recoveries
114
- private let maxGentleRetries = 2 // try start() a couple times before rebuild
115
- private var isTelephonyInterrupted = false
116
- private var isRecoveringAfterTelephony = false // NEW
117
- // MARK: - Post-telephony recognition kick
118
- private var startedRecognitionAfterCall = false
119
- // Add near other state:
120
- private var activeTaskGen: UInt64 = 0
121
- private var micPaused: Bool = false
122
-
123
- // --- Optional speaker verification gate ---
124
- private struct SpeakerVerificationStartConfig {
125
- let enrollment: SpeakerEnrollment
126
- let config: SpeakerVerificationConfig
127
- }
128
- private let speakerVerificationQueue = DispatchQueue(label: "stt.sv.queue")
129
- private let speakerVerificationStateLock = NSLock()
130
- private var speakerVerificationStartConfig: SpeakerVerificationStartConfig?
131
- private var speakerVerificationEngine: SpeakerVerificationEngine?
132
- private var speakerVerificationFrameSize: Int = 0
133
- private var speakerVerificationInputBuffer: [Float] = []
134
- private var speakerGateOpen: Bool = true
135
- private var speakerGateEnabled: Bool = false
136
- private var speakerVerificationErrorSent: Bool = false
137
- private var speakerPreRollBuffers: [AVAudioPCMBuffer] = []
138
- private var speakerPreRollFrames: Int = 0
139
- private var speakerPreRollMaxFrames: Int = 0
140
- private var speakerPendingPreRollFlush: Bool = false
141
- private let speakerPreRollSeconds: Double = 1.0
142
- private var speakerVerificationThreshold: Float = 0
143
- private var speakerVerificationFrameSeq: UInt64 = 0
144
- private var speakerVerificationSourceSampleRate: Int = 0
145
- private var speakerVerificationTargetSampleRate: Int = 0
146
- private var speakerVerificationResampleCarry: [Float] = []
147
- private var speakerVerificationResamplePos: Double = 0
148
- private var speakerLastPositiveMatchAt: CFTimeInterval = 0
149
-
150
- // --- Speech recognition lite pause (counter-based) ---
151
- private let speechPauseLock = NSLock()
152
- private var speechRecognitionPauseCount: Int = 0
153
- private var speechRecognitionPaused: Bool = false
154
- @inline(__always)
155
- private func isSpeechRecognitionLitePaused() -> Bool {
156
- speechPauseLock.lock()
157
- let paused = speechRecognitionPaused
158
- speechPauseLock.unlock()
159
- return paused
160
- }
161
- @inline(__always)
162
- private func resetSpeechRecognitionLitePauseState(_ why: String) {
163
- speechPauseLock.lock()
164
- speechRecognitionPauseCount = 0
165
- speechRecognitionPaused = false
166
- speechPauseLock.unlock()
167
- NSLog("[STT] resetSpeechRecognitionLitePauseState(\(why)) -> count=0 paused=NO")
168
- }
169
-
170
- // MARK: - Event names (unchanged)
171
- public static let supportedEvents: [String] = [
172
- "onSpeechResults",
173
- "onSpeechStart",
174
- "onSpeechPartialResults",
175
- "onSpeechError",
176
- "onSpeechEnd",
177
- "onSpeechRecognized",
178
- "onSpeechVolumeChanged"
179
- ]
180
- private func removeEngineObservers() {
181
- let nc = NotificationCenter.default
182
- if let engine = audioEngine {
183
- nc.removeObserver(self,
184
- name: .AVAudioEngineConfigurationChange,
185
- object: engine)
186
- }
187
- nc.removeObserver(self,
188
- name: AVAudioSession.interruptionNotification,
189
- object: AVAudioSession.sharedInstance())
190
- nc.removeObserver(self,
191
- name: AVAudioSession.routeChangeNotification,
192
- object: AVAudioSession.sharedInstance())
193
- nc.removeObserver(self,
194
- name: AVAudioSession.mediaServicesWereResetNotification,
195
- object: nil)
196
- }
197
-
198
- private func hasExternalOutput(_ s: AVAudioSession) -> Bool {
199
- return s.currentRoute.outputs.contains {
200
- switch $0.portType {
201
- case .headphones, .bluetoothA2DP, .bluetoothHFP, .bluetoothLE, .airPlay, .carAudio, .usbAudio:
202
- return true
203
- default:
204
- return false
205
- }
206
- }
207
- }
208
-
209
- // Force loudspeaker if iOS routes to receiver while we want speaker.
210
- private func forceSpeakerIfReceiver(_ why: String) {
211
- let s = AVAudioSession.sharedInstance()
212
-
213
- // If there is ANY external output, never fight it.
214
- if hasExternalOutput(s) { return }
215
-
216
- let isReceiver = s.currentRoute.outputs.contains { $0.portType == .builtInReceiver }
217
- if !isReceiver { return }
218
-
219
- do {
220
- try s.overrideOutputAudioPort(.speaker)
221
- NSLog("[STT] 🔊 forceSpeakerIfReceiver(\(why)): receiver -> speaker")
222
- } catch {
223
- NSLog("[STT] 🔊 forceSpeakerIfReceiver(\(why)) failed: \(error.localizedDescription)")
224
- }
225
- }
226
-
227
- private func stopRecognitionTaskLite(_ why: String) {
228
- // Don't fight teardown / telephony / mic pause.
229
- if isTearingDown || isTelephonyInterrupted || isRecoveringAfterTelephony || micPaused { return }
230
-
231
- if recognitionTask != nil || recognitionRequest != nil {
232
- NSLog("[STT] stopRecognitionTaskLite(\(why)) cancel+drop req/task")
233
- }
234
-
235
- recognitionTask?.cancel()
236
- recognitionTask = nil
237
-
238
- recognitionRequest?.endAudio()
239
- recognitionRequest = nil
240
-
241
- // reset "speech started" gating so we emit cleanly after resume
242
- seenRealSpeech = false
243
- }
244
-
245
- @objc public func pauseSpeechRecognitionLite() {
246
- // Update counter under lock
247
- if isTearingDown || isTelephonyInterrupted || isRecoveringAfterTelephony || micPaused { return }
248
-
249
- speechPauseLock.lock()
250
- let wasZero = (speechRecognitionPauseCount == 0)
251
- speechRecognitionPauseCount += 1
252
- speechRecognitionPaused = true
253
- let c = speechRecognitionPauseCount
254
- speechPauseLock.unlock()
255
-
256
- NSLog("[STT] pauseSpeechRecognitionLite(): count=\(c) (speechRecognitionPaused=YES)")
257
-
258
- // Only act on the FIRST pause (0->1)
259
- guard wasZero else { return }
260
-
261
- // Lite behavior: cancel current speech task & drop request so buffers stop accumulating
262
- DispatchQueue.main.async { [weak self] in
263
- guard let self = self else { return }
264
-
265
- // Only if STT is actually active and mic is not paused
266
- if !self.sttActive { NSLog("[STT] pauseSpeechRecognitionLite: ignored (sttActive=NO)"); return }
267
- if self.micPaused { NSLog("[STT] pauseSpeechRecognitionLite: ignored (micPaused=YES)"); return }
268
-
269
- self.stopRecognitionTaskLite("lite-pause")
270
- }
271
- }
272
-
273
- // NEW: times == -1 => clear all pauses (force resume)
274
- // times >= 1 => decrement by N
275
- @objc public func unPauseSpeechRecognitionLite(_ times: NSNumber) {
276
- let n = times.intValue
277
-
278
- speechPauseLock.lock()
279
- if n == -1 {
280
- speechRecognitionPauseCount = 0
281
- } else if n > 0 {
282
- speechRecognitionPauseCount = max(0, speechRecognitionPauseCount - n)
283
- } else {
284
- // 0 or weird negatives (except -1): do nothing
285
- }
286
-
287
- let reachedZero = (speechRecognitionPauseCount == 0)
288
- if reachedZero { speechRecognitionPaused = false }
289
-
290
- let c = speechRecognitionPauseCount
291
- let paused = speechRecognitionPaused
292
- speechPauseLock.unlock()
293
-
294
- NSLog("[STT] unPauseSpeechRecognitionLite(times=\(n)): count=\(c) (speechRecognitionPaused=\(paused ? "YES" : "NO"))")
295
-
296
- guard reachedZero else { return }
297
-
298
- DispatchQueue.main.async { [weak self] in
299
- guard let self = self else { return }
300
-
301
- // Conditions you asked for
302
- if self.isTearingDown { NSLog("[STT] lite-unpause: ignored (isTearingDown=YES)"); return }
303
- if self.isTelephonyInterrupted || self.isRecoveringAfterTelephony {
304
- NSLog("[STT] lite-unpause: ignored (telephony/recovering)")
305
- return
306
- }
307
- if !self.sttActive { NSLog("[STT] lite-unpause: ignored (sttActive=NO)"); return }
308
- if self.micPaused { NSLog("[STT] lite-unpause: ignored (micPaused=YES)"); return }
309
-
310
- // If we don't currently have a task, create a FRESH request/task.
311
- // Use your existing engine/task bring-up logic (keeps it “lite”).
312
- self.ensureEngineRunning(reason: "lite-unpause", skipCooldown: true)
313
-
314
- // Extra defensive: if engine is running but task didn't start, force a fresh task.
315
- if self.recognitionTask == nil {
316
- self.startTask(self.makeFreshRequest())
317
- NSLog("[STT] lite-unpause: forced startTask(makeFreshRequest())")
318
- }
319
- }
320
- }
321
-
322
- @objc public func pauseMicrophoneAndWait(_ timeoutMs: NSNumber,
323
- completion: @escaping (Bool, String?) -> Void) {
324
- micPauseLock.lock()
325
-
326
- // If already paused, just wait for settle condition (idempotent)
327
- if !micPaused {
328
- pauseMicrophone()
329
- }
330
-
331
- let timeoutSec = max(0.1, timeoutMs.doubleValue / 1000.0)
332
-
333
- pollOnMain(timeoutSec: timeoutSec, intervalSec: 0.05,
334
- condition: { [weak self] in
335
- guard let self = self else { return false }
336
- return self.isPausedSettled()
337
- },
338
- done: { [weak self] ok in
339
- // IMPORTANT: unlock BEFORE calling completion (completion may call pause/unpause again)
340
- self?.micPauseLock.unlock()
341
- completion(ok, ok ? nil : "pause_timeout")
342
- })
343
- }
344
-
345
- @objc public func unPauseMicrophoneAndWait(_ timeoutMs: NSNumber,
346
- completion: @escaping (Bool, String?) -> Void) {
347
- micPauseLock.lock()
348
-
349
- // If not paused, still ensure we're “live” (idempotent)
350
- if micPaused {
351
- unPauseMicrophone()
352
- }
353
-
354
- let timeoutSec = max(0.1, timeoutMs.doubleValue / 1000.0)
355
-
356
- pollOnMain(timeoutSec: timeoutSec, intervalSec: 0.05,
357
- condition: { [weak self] in
358
- guard let self = self else { return false }
359
- return self.isUnpausedSettled()
360
- },
361
- done: { [weak self] ok in
362
- self?.micPauseLock.unlock()
363
- completion(ok, ok ? nil : "unpause_timeout")
364
- })
365
- }
366
-
367
- // MARK: - settle conditions
368
-
369
- private func isPausedSettled() -> Bool {
370
- let s = AVAudioSession.sharedInstance()
371
-
372
- // What "settled" means for PAUSE in your implementation:
373
- // - micPaused flag latched
374
- // - engine + task gone
375
- // - session in playback
376
- // - no input ports visible
377
- if micPaused != true { return false }
378
- if audioEngine != nil { return false }
379
- if recognitionTask != nil { return false }
380
- if recognitionRequest != nil { return false }
381
- if s.category != .playback { return false }
382
- if !s.currentRoute.inputs.isEmpty { return false }
383
-
384
- return true
385
- }
386
-
387
- private func isUnpausedSettled() -> Bool {
388
- let s = AVAudioSession.sharedInstance()
389
-
390
- // What "settled" means for UNPAUSE:
391
- // - micPaused false
392
- // - engine running
393
- // - recognitionTask exists + running
394
- // - request exists
395
- // - capture is valid
396
- if micPaused != false { return false }
397
- guard let eng = audioEngine, eng.isRunning else { return false }
398
- guard let task = recognitionTask, task.state == .running else { return false }
399
- guard recognitionRequest != nil else { return false }
400
- if s.category != .playAndRecord { return false }
401
- if !hasValidCaptureNow(allowColdEngine: true) { return false }
402
-
403
- return true
404
- }
405
-
406
- // MARK: - polling helper (MAIN QUEUE)
407
-
408
- private func pollOnMain(timeoutSec: TimeInterval,
409
- intervalSec: TimeInterval,
410
- condition: @escaping () -> Bool,
411
- done: @escaping (Bool) -> Void) {
412
- let deadline = CACurrentMediaTime() + timeoutSec
413
-
414
- func step() {
415
- // Always on main
416
- if condition() {
417
- done(true)
418
- return
419
- }
420
- if CACurrentMediaTime() >= deadline {
421
- done(false)
422
- return
423
- }
424
- DispatchQueue.main.asyncAfter(deadline: .now() + intervalSec) {
425
- step()
426
- }
427
- }
428
-
429
- DispatchQueue.main.async {
430
- step()
431
- }
432
- }
433
-
434
- public func pauseMicrophone() {
435
- NSLog("[STT] pauseMicrophone() requested")
436
-
437
- // ✅ HARD reset speech-lite pause state on mic pause
438
- resetSpeechRecognitionLitePauseState("pauseMicrophone")
439
-
440
- guard !micPaused else {
441
- NSLog("[STT] pauseMicrophone(): already paused")
442
- return
443
- }
444
- micPaused = true
445
-
446
- let session = AVAudioSession.sharedInstance()
447
-
448
- // Save current session config (so we can restore on unpause)
449
- if savedSessionBeforePause == nil {
450
- let sr = session.sampleRate
451
- let inCh = Int(session.inputNumberOfChannels)
452
- let outCh = Int(session.outputNumberOfChannels)
453
- let ioDur = session.ioBufferDuration
454
-
455
- savedSessionBeforePause = (
456
- category: session.category,
457
- mode: session.mode,
458
- options: session.categoryOptions,
459
- sr: sr,
460
- inCh: inCh,
461
- outCh: outCh,
462
- ioDur: ioDur
463
- )
464
- }
465
- // Watchdog is pointless while paused (and would try to “heal” us)
466
- stopWatchdog()
467
-
468
- // Stop mic capture but keep TTS safe — remove taps first
469
- if let eng = audioEngine {
470
- safeRemoveTap(eng.inputNode)
471
- safeRemoveTap(eng.mainMixerNode)
472
- safeRemoveTap(eng.outputNode)
473
- }
474
- // 🔴 NEW: fully stop and tear down the engine so session can really deactivate
475
- if let eng = audioEngine {
476
- if eng.isRunning {
477
- eng.stop()
478
- }
479
- eng.reset()
480
- }
481
- audioEngine = nil
482
-
483
- // Clear playback node in this engine; unpause will rebuild a fresh engine+player graph
484
- if let p = playbackNode {
485
- p.stop()
486
- }
487
- playbackNode = nil
488
-
489
- // Clear AudioPlaybackHook engine-based callbacks (defensive)
490
- AudioPlaybackHook.currentEngine = nil
491
- AudioPlaybackHook.engineScheduleFile = nil
492
- AudioPlaybackHook.isEngineReady = nil
493
- AudioPlaybackHook.useOnlyEnginePlayback = nil
494
- AudioPlaybackHook.stopEnginePlayback = nil
495
-
496
- // Stop recognition cleanly (we'll re-create on unpause)
497
- recognitionTask?.cancel()
498
- recognitionTask = nil
499
- recognitionRequest?.endAudio()
500
- recognitionRequest = nil
501
-
502
- // Switch to playback-only session so iOS releases the mic (indicator off)
503
- do {
504
- // Use this if we ever have duck others
505
- // try session.setActive(false, options: [.notifyOthersOnDeactivation])
506
- try session.setActive(false, options: [])
507
- markAECSessionActivation(false, reason: "pauseMicrophone-pre")
508
- NSLog("[STT] pauseMicrophone(): setActive false")
509
- } catch {
510
- NSLog("[STT] pauseMicrophone(): failed to switch setActive false: \(error.localizedDescription)")
511
- }
512
- // Switch to playback-only session so iOS releases the mic (indicator off)
513
- do {
514
- try session.setCategory(.playback, options: [/*.mixWithOthers*/])
515
- NSLog("[STT] pauseMicrophone(): session set to .playback (mic released)")
516
- } catch {
517
- NSLog("[STT] pauseMicrophone(): failed to switch to .playback: \(error.localizedDescription)")
518
- }
519
- // Switch to playback-only session so iOS releases the mic (indicator off)
520
- do {
521
- try session.setActive(true, options: [])
522
- markAECSessionActivation(true, reason: "pauseMicrophone-playback")
523
- NSLog("[STT] pauseMicrophone(): session set to .playback (mic released)")
524
- } catch {
525
- NSLog("[STT] pauseMicrophone(): failed to switch to session.setActive with .playback: \(error.localizedDescription)")
526
- }
527
- }
528
-
529
- public func unPauseMicrophone() {
530
- NSLog("[STT] unPauseMicrophone() requested")
531
- guard micPaused else {
532
- NSLog("[STT] unPauseMicrophone(): not paused")
533
- return
534
- }
535
- // ✅ HARD reset speech-lite pause state on mic unpause
536
- resetSpeechRecognitionLitePauseState("unPauseMicrophone")
537
-
538
- let session = AVAudioSession.sharedInstance()
539
-
540
- if let saved = savedSessionBeforePause {
541
- // Restore previous session category/mode/options and IO prefs
542
- do {
543
- try session.setActive(false, options: [.notifyOthersOnDeactivation])
544
- markAECSessionActivation(false, reason: "unPauseMicrophone-pre")
545
- } catch {
546
- NSLog("[STT] unPauseMicrophone: setActive(false) failed: \(error.localizedDescription)")
547
- }
548
-
549
- do {
550
- try session.setCategory(saved.category,
551
- mode: saved.mode,
552
- options: saved.options)
553
- } catch {
554
- NSLog("[STT] unPauseMicrophone: restoring category failed: \(error.localizedDescription)")
555
- }
556
-
557
- if saved.sr > 0 {
558
- try? session.setPreferredSampleRate(saved.sr)
559
- }
560
- if saved.inCh > 0 && session.isInputAvailable {
561
- try? session.setPreferredInputNumberOfChannels(saved.inCh)
562
- }
563
- if saved.outCh > 0 {
564
- try? session.setPreferredOutputNumberOfChannels(saved.outCh)
565
- }
566
- if saved.ioDur > 0 {
567
- try? session.setPreferredIOBufferDuration(saved.ioDur)
568
- }
569
-
570
- do {
571
- try session.setActive(true, options: [])
572
- markAECSessionActivation(true, reason: "unPauseMicrophone")
573
- } catch {
574
- NSLog("[STT] unPauseMicrophone: setActive(true) failed: \(error.localizedDescription)")
575
- }
576
- _ = setupAudioSession()
577
-
578
- // !!! IMPORTANT if micPaused = true then rebuildEngineGraphAndRestart will not activate necessary things!
579
- micPaused = false
580
- // Rebuild graph + reinstall tap + restart recognition
581
- rebuildEngineGraphAndRestart(reason: "unpause-mic")
582
- NSLog("[STT] unPauseMicrophone(): session restored + rebuildEngineGraphAndRestart() called")
583
- startWatchdog()
584
-
585
- // Clear so next pause re-snapshots the current config
586
- savedSessionBeforePause = nil
587
- } else {
588
- // Fallback if we never saved a session (very defensive)
589
- NSLog("[STT] unPauseMicrophone(): no savedSessionBeforePause, using setupAudioSession()")
590
- _ = setupAudioSession()
591
- rebuildEngineGraphAndRestart(reason: "unpause-mic-nosaved")
592
- startWatchdog()
593
- }
594
- micPaused = false
595
- }
596
-
597
- private var graphGen: UInt64 = 0
598
- @inline(__always) private func bumpGraphGen() { graphGen &+= 1; ioLatchActiveGen = 0 }
599
- // Add near other state
600
- private var pausedForCaptureLoss = false
601
-
602
- private func markCaptureLost() {
603
- pausedForCaptureLoss = true
604
- }
605
-
606
- private func tryClearCaptureLossAfterStartSucceeded() {
607
- // Only clear after we actually start the engine
608
- pausedForCaptureLoss = false
609
- }
610
-
611
- // MARK: - AEC Toggle API
612
-
613
- /// Enable/disable iOS voice-processing (AEC + ducking).
614
- /// If STT is already active, we rebuild the session/graph so it takes effect.
615
- public func setAECEnabled(_ enabled: Bool) {
616
- NSLog("[STT] setAECEnabled(\(enabled))")
617
- aecEnabled = enabled
618
-
619
- // If recognition is live, re-apply session + graph so change is effective
620
- if sttActive {
621
- _ = setupAudioSession()
622
- rebuildEngineGraphAndRestart(reason: enabled ? "aec-on" : "aec-off")
623
- }
624
- }
625
-
626
- public func isAECEnabled() -> Bool {
627
- return aecEnabled
628
- }
629
-
630
- private func startRecognitionAfterCall() {
631
- guard !startedRecognitionAfterCall else { return }
632
- startedRecognitionAfterCall = true
633
-
634
- // The local speech daemon can be in a funky state after telephony.
635
- recreateSpeechRecognizerPreservingLocale()
636
-
637
- let req = makeFreshRequest()
638
- startTask(req)
639
-
640
- // Recovery window ends only after we have a live task
641
- isRecoveringAfterTelephony = false
642
- startWatchdog() // resume health checks now that we're live again
643
- NSLog("[STT] recovery: recognition task started after buffers observed")
644
- }
645
-
646
- private func hasValidCaptureNow(allowColdEngine: Bool = true) -> Bool {
647
- let s = AVAudioSession.sharedInstance()
648
-
649
- // Real “no mic” conditions (A2DP only, telephony, etc.)
650
- guard s.isInputAvailable,
651
- !s.currentRoute.inputs.isEmpty,
652
- s.inputNumberOfChannels > 0,
653
- s.sampleRate > 0 else { return false }
654
-
655
- // If we require a hot engine, check the node *only when running*.
656
- if !allowColdEngine, let eng = audioEngine, eng.isRunning {
657
- let f = eng.inputNode.outputFormat(forBus: 0)
658
- return f.sampleRate > 0 && f.channelCount > 0
659
- }
660
-
661
- // Session says we have input; engine may be cold — that’s fine to attempt start().
662
- return true
663
- }
664
-
665
-
666
- @inline(__always)
667
- private func safeRemoveTap(_ node: AVAudioNode?, bus: AVAudioNodeBus = 0) {
668
- guard let n = node, n.engine != nil else { return } // only remove if still attached
669
- try? n.removeTap(onBus: bus)
670
- }
671
-
672
- // MARK: - Public API (native replacements for the former RCT methods)
673
-
674
- public func isSpeechAvailable(_ completion: @escaping (Bool) -> Void) {
675
- SFSpeechRecognizer.requestAuthorization { status in
676
- switch status {
677
- case .authorized: completion(true)
678
- default: completion(false)
679
- }
680
- }
681
- }
682
-
683
- private func armFirstIOCycleLatch(on engine: AVAudioEngine) {
684
- engineHasRenderedOnce = false
685
- let gen = graphGen
686
-
687
- // Prevent overlapping latches against the same graph generation.
688
- if ioLatchActiveGen == gen { return }
689
- ioLatchActiveGen = gen
690
-
691
- DispatchQueue.main.async { [weak self, weak engine] in
692
- guard let self = self, let eng = engine, gen == self.graphGen else { return }
693
- let out = eng.outputNode
694
- var fired = false
695
-
696
- // >>> IMPORTANT: ensure no previous tap is left behind
697
- self.safeRemoveTap(out, bus: 0)
698
-
699
- out.installTap(onBus: 0, bufferSize: 128, format: nil) { [weak self, weak out] _, _ in
700
- guard let self = self, gen == self.graphGen else { return }
701
- if fired { return }
702
- fired = true
703
- self.safeRemoveTap(out, bus: 0)
704
- self.engineHasRenderedOnce = true
705
- // latch finished for this gen
706
- if self.ioLatchActiveGen == gen { self.ioLatchActiveGen = 0 }
707
- }
708
-
709
- DispatchQueue.main.asyncAfter(deadline: .now() + 2.0) { [weak self, weak out] in
710
- guard let self = self, gen == self.graphGen else { return }
711
- if fired { return }
712
- self.safeRemoveTap(out, bus: 0)
713
- self.engineHasRenderedOnce = true // fail-open
714
- if self.ioLatchActiveGen == gen { self.ioLatchActiveGen = 0 }
715
- }
716
- }
717
- }
718
-
719
- public func isRecognizing() -> Bool {
720
- guard let task = recognitionTask else { return false }
721
- return task.state == .running
722
- }
723
-
724
/// Returns the playback node for `engine`, reusing the cached one when it is
/// already attached to this engine, otherwise creating, attaching, and wiring
/// a fresh `AVAudioPlayerNode` straight into the main mixer.
private func ensurePlaybackNode(in engine: AVAudioEngine) -> AVAudioPlayerNode {
    if let existing = playbackNode, existing.engine === engine {
        return existing
    }

    let node = AVAudioPlayerNode()
    playbackNode = node
    engine.attach(node)

    // NOTE: an earlier revision experimented with routing through an
    // AVAudioUnitEQ de-esser/low-pass here; the direct connection below is
    // the active path.
    engine.connect(node, to: engine.mainMixerNode, format: nil)
    return node
}
765
-
766
/// (Re)starts the 2-second stall watchdog that drives `checkTaskHealth()`.
///
/// Fix: the previous implementation force-unwrapped `stallWatchdog!` when
/// adding it to the run loop; the timer is now held in a local constant so no
/// force unwrap is needed.
private func startWatchdog() {
    stallWatchdog?.invalidate()
    let timer = Timer.scheduledTimer(withTimeInterval: 2.0, repeats: true) { [weak self] _ in
        self?.checkTaskHealth()
    }
    stallWatchdog = timer
    // .common mode keeps the watchdog firing during UI tracking (scrolling etc.).
    RunLoop.main.add(timer, forMode: .common)
}
773
-
774
/// Stops and releases the stall watchdog timer, if one is active.
private func stopWatchdog() {
    if let timer = stallWatchdog {
        timer.invalidate()
        stallWatchdog = nil
    }
}
778
-
779
/// Cancels the current recognition task and starts a fresh one while keeping
/// the audio engine and input tap alive. Suppressed while telephony handling,
/// mic pause, or the lite speech-recognition pause is in effect.
private func rearmTask(reason: String) {
    guard !isTelephonyInterrupted, !isRecoveringAfterTelephony else {
        NSLog("[STT] rearmTask(\(reason)) suppressed (telephony/recovering)")
        return
    }
    guard !micPaused else {
        NSLog("[STT] rearmTask(\(reason)) suppressed (micPaused)")
        return
    }
    guard !isSpeechRecognitionLitePaused() else {
        NSLog("[STT] rearmTask(\(reason)) suppressed (speechRecognitionPaused)")
        return
    }

    // Cancel old task only — keep the engine and tap running.
    recognitionTask?.cancel()
    recognitionTask = nil

    seenRealSpeech = false
    lastTaskStartAt = CACurrentMediaTime()
    startTask(makeFreshRequest())
    NSLog("[STT] rearmTask(\(reason)) -> new task started")
}
803
-
804
/// Watchdog tick (fired roughly every 2s by `startWatchdog()`).
///
/// Applies an escalating recovery ladder — the ORDER of checks matters:
///   suppressors → start grace → 0) capture → 1) engine → 2) recognizer →
///   3) missing task → 4) no input buffers → 5) buffers but no results.
/// Each stage returns early so at most one recovery action happens per tick.
private func checkTaskHealth() {
    // Hard suppressors: never touch the graph mid-teardown or while telephony
    // interruption handling owns the session.
    if isTearingDown || isTelephonyInterrupted || isRecoveringAfterTelephony {
        NSLog("[STT] watchdog: isTearingDown || isTelephonyInterrupted || isRecoveringAfterTelephony -- DOING NOTHING")
        return
    }
    if isSpeechRecognitionLitePaused() {
        NSLog("[STT] watchdog: speechRecognitionPaused -- DOING NOTHING")
        return
    }
    if micPaused {
        NSLog("[STT] watchdog: micPaused -- DOING NOTHING")
        return
    }

    let now = CACurrentMediaTime()

    // ⛳️ GRACE: don’t call "stall" right after a new task begins
    if now - lastTaskStartAt < 5.0 { return }

    // 0) No capture? Wait quietly.
    if !hasValidCaptureNow(allowColdEngine: true) {
        markCaptureLost()
        NSLog("[STT] watchdog: capture not available; waiting…")
        return
    }

    // 1) Engine down? Bring it up (bypass cooldown from watchdog).
    if audioEngine == nil || !(audioEngine?.isRunning ?? false) {
        NSLog("[STT] watchdog: engine down → ensureEngineRunning")
        ensureEngineRunning(reason: "watchdog-engine-down", skipCooldown: true)
        return
    }

    // 2) Recognizer unavailable? wait.
    if let rec = speechRecognizer, rec.isAvailable == false {
        NSLog("[STT] watchdog: recognizer unavailable; waiting…")
        return
    }

    // 3) No task? start one (rate-limited by rearmCooldownTask).
    if recognitionTask == nil {
        if now - lastRearmAt > rearmCooldownTask {
            lastRearmAt = now
            startTask(makeFreshRequest())
        }
        return
    }

    // 4) No input buffers? gentle nudge, then rebuild if repeated.
    let timeSinceBuffer = now - lastBufferAt
    if timeSinceBuffer > noInputThreshold {
        if now - lastNoInputRecoveryAt > noInputCooldown {
            lastNoInputRecoveryAt = now
            consecutiveNoInputResets += 1
            ensureEngineRunning(reason: "watchdog-no-input", skipCooldown: true)
            // Escalate to a full graph rebuild after repeated gentle retries.
            if consecutiveNoInputResets >= maxGentleRetries {
                consecutiveNoInputResets = 0
                rebuildEngineGraphAndRestart(reason: "watchdog-no-input-rebuild")
            }
        }
        return
    } else {
        consecutiveNoInputResets = 0
    }

    // 5) Buffers flowing but no results → rearm task; after 3 consecutive
    //    stalls, recreate the recognizer itself (keeping the chosen locale).
    let noResultsFor = now - lastResultAt
    if noResultsFor > stallThreshold {
        if now - lastRearmAt > rearmCooldownTask {
            lastRearmAt = now
            consecutiveStallCount += 1
            rearmTask(reason: "watchdog-stall")
            if consecutiveStallCount >= 3 {
                recreateSpeechRecognizerPreservingLocale()
                consecutiveStallCount = 0
            }
        }
    } else {
        consecutiveStallCount = 0
    }
}
885
-
886
/// Public entry point: starts recognition without speaker verification.
public func startSpeech(localeStr: String?) {
    startSpeechInternal(localeStr: localeStr, speakerVerificationConfig: nil)
}
889
-
890
/// Public entry point: starts recognition with speaker verification, loading
/// the verification configuration from the given onboarding JSON file.
/// On load failure an error result is sent to JS instead of starting.
public func startSpeech(localeStr: String?, onboardingJsonPath: String) {
    do {
        let config = try loadSpeakerVerificationStartConfig(onboardingJsonPath: onboardingJsonPath)
        startSpeechInternal(localeStr: localeStr, speakerVerificationConfig: config)
    } catch {
        sendResult(
            error: ["message": "Failed to load onboarding JSON: \(error.localizedDescription)"],
            bestTranscription: nil,
            transcriptions: nil,
            isFinal: nil
        )
    }
}
901
-
902
/// Shared start path: records the requested locale and (optional) speaker
/// verification config, rejects double starts, then requests speech
/// authorization and begins recognizing once authorized.
private func startSpeechInternal(localeStr: String?,
                                 speakerVerificationConfig: SpeakerVerificationStartConfig?) {
    NSLog("[STT] startSpeech(locale=\(localeStr ?? "nil"), sv=\(speakerVerificationConfig == nil ? "off" : "on"))")
    lastLocaleStr = localeStr ?? ""
    speakerVerificationStartConfig = speakerVerificationConfig

    guard recognitionTask == nil else {
        sendResult(error: ["code": "already_started", "message": "Speech recognition already started!"],
                   bestTranscription: nil, transcriptions: nil, isFinal: nil)
        return
    }

    SFSpeechRecognizer.requestAuthorization { [weak self] status in
        guard let self = self else { return }
        if status == .authorized {
            self.setupAndStartRecognizing(localeStr: localeStr)
            return
        }
        let message: String
        switch status {
        case .notDetermined:
            message = "Speech recognition not yet authorized"
        case .denied:
            message = "User denied access to speech recognition"
        case .restricted:
            message = "Speech recognition restricted on this device"
        case .authorized:
            return // handled above
        @unknown default:
            message = "Unknown authorization status"
        }
        self.sendResult(error: ["message": message],
                        bestTranscription: nil, transcriptions: nil, isFinal: nil)
    }
}
929
-
930
/// Gracefully finishes the current recognition task (lets it deliver a final
/// result) and immediately reports `false` to the optional completion.
public func stopSpeech(_ completion: ((Bool) -> Void)? = nil) {
    NSLog("[STT] stopSpeech() requested by app")
    recognitionTask?.finish()
    if let completion = completion {
        completion(false)
    }
}
935
-
936
/// Cancels the current recognition task (no final result will be delivered)
/// and immediately reports `false` to the optional completion.
public func cancelSpeech(_ completion: ((Bool) -> Void)? = nil) {
    NSLog("[STT] cancelSpeech() requested by app")
    recognitionTask?.cancel()
    if let completion = completion {
        completion(false)
    }
}
942
-
943
/// Fully tears down the recognition pipeline (engine, taps, session state)
/// and immediately reports `false` to the optional completion.
public func destroySpeech(_ completion: ((Bool) -> Void)? = nil) {
    NSLog("[STT] **** destroySpeech!!!")
    teardown()
    if let completion = completion {
        completion(false)
    }
}
948
-
949
/// Re-evaluates audio routing: optionally selects the best input (BT HFP >
/// wired headset > leave as-is), aligns category/options with the current
/// outputs, re-activates the session, and re-applies preferred settings.
/// The heavy session work runs on a background queue.
///
/// Fix: `isAdjustingRoute` was previously reset via `defer` in this
/// synchronous scope, so it was false again before the dispatched work even
/// began — the reentrancy guard never actually guarded anything. The flag is
/// now cleared at the end of the async block.
private func updateSessionRouting(selectBestInput: Bool = true) {
    NSLog("[STT] ⚠️ updateSessionRouting??? why???")

    if isAdjustingRoute { return }
    isAdjustingRoute = true

    let s = AVAudioSession.sharedInstance()

    let hasInputRoute = s.isInputAvailable && !s.currentRoute.inputs.isEmpty
    if !hasInputRoute {
        // Transient during category/route settle — do NOT bail.
        NSLog("[STT] route: input not visible yet (transient) — proceeding to activate session")
    }

    DispatchQueue.global(qos: .userInitiated).async { [weak self] in
        guard let self = self else { return }
        // Clear the reentrancy flag only once the routing work is done.
        defer { self.isAdjustingRoute = false }

        // External outputs mean we should NOT force .defaultToSpeaker.
        let hasWiredOrCar = s.currentRoute.outputs.contains {
            switch $0.portType {
            case .headphones,
                 .bluetoothA2DP,
                 .bluetoothHFP,
                 .bluetoothLE,
                 .airPlay,
                 .carAudio,
                 .usbAudio:
                return true
            default:
                return false
            }
        }

        if selectBestInput, let all = s.availableInputs {
            let btHFP = all.first { $0.portType == .bluetoothHFP || $0.portType == .bluetoothLE }
            let wired = all.first { $0.portType == .headsetMic }
            let built = all.first { $0.portType == .builtInMic }
            // Prefer BT HFP (mic), then wired mic; otherwise leave preferredInput as-is.
            let desired = btHFP ?? wired

            do {
                if let desired, s.preferredInput?.uid != desired.uid {
                    try s.setPreferredInput(desired)
                } else if desired == nil {
                    // No headset mic → clear preference; do NOT force built-in
                    if s.preferredInput != nil { try s.setPreferredInput(nil) }
                }
                // If built-in is already what the system selected, we need no action.
                if let builtIn = built, (desired == nil), s.preferredInput?.uid == nil {
                    // Optionally hint bottom/back data source, but don’t fight routes
                    if let ds = builtIn.dataSources?.first(where: { $0.orientation == .bottom || $0.orientation == .back }) {
                        try? builtIn.setPreferredDataSource(ds)
                    }
                }
            } catch {
                NSLog("[STT] setPreferredInput failed: \(error.localizedDescription)")
            }
        }

        var opts: AVAudioSession.CategoryOptions = [.allowBluetooth]
        if !hasWiredOrCar { opts.insert(.defaultToSpeaker) }

        // Only touch the category when something actually differs.
        if s.category != .playAndRecord || s.mode != .default || s.categoryOptions != opts {
            do { try s.setCategory(.playAndRecord, mode: .default, options: opts) }
            catch { NSLog("[STT] setCategory failed: \(error.localizedDescription)") }
        }

        do {
            try s.setActive(true, options: [])
            self.markAECSessionActivation(true, reason: "updateSessionRouting")
        } catch {
            NSLog("[STT] setActive failed: \(error.localizedDescription)")
            self.markAECSessionActivation(false, reason: "updateSessionRouting-failed")
        }

        // Optional: force 16k after activation
        self.force16kIfPossible(s)
        self.forceSpeakerIfReceiver("updateSessionRouting")

        // Log route back on main so logs stay ordered
        DispatchQueue.main.async {
            let inPorts = s.currentRoute.inputs.map { "\($0.portType.rawValue):\($0.portName)" }.joined(separator:", ")
            let outPorts = s.currentRoute.outputs.map { "\($0.portType.rawValue):\($0.portName)" }.joined(separator:", ")
            NSLog("[STT] route in=[\(inPorts)] out=[\(outPorts)]")
        }
    }
}
1037
-
1038
- // ↓↓↓ preferred settings helper
1039
/// Applies "preferred" session settings: optionally forces a 16 kHz mic rate
/// regardless of route, prefers 16 kHz mono capture on the built-in
/// mic+speaker voice pipeline, and requests a small IO buffer everywhere.
/// All calls are `try?` best-effort — the session may ignore preferences.
///
/// Fix: removed an unused `hasExternalOutput` computation (its result was
/// never read); the route decision relies only on the built-in checks below.
private func force16kIfPossible(_ session: AVAudioSession) {
    if force16kMicSampleRate {
        try? session.setPreferredSampleRate(16_000)
    }

    // `allSatisfy` is true for an empty port list; with no routes yet the
    // 16 kHz preference is harmless, so that edge case is acceptable.
    let builtInOut = session.currentRoute.outputs.allSatisfy { $0.portType == .builtInSpeaker }
    let builtInIn = session.currentRoute.inputs.allSatisfy { $0.portType == .builtInMic }

    // Prefer 16k only on built-in mic+speaker (voice pipeline). Otherwise leave SR to route.
    if builtInIn && builtInOut {
        try? session.setPreferredSampleRate(16_000)
        if session.isInputAvailable { try? session.setPreferredInputNumberOfChannels(1) }
        // ⚠️ Do NOT force output channels to 1; many routes require 2ch.
    } else {
        // Input mono is generally OK, but don’t touch output channels
        if session.isInputAvailable { try? session.setPreferredInputNumberOfChannels(1) }
    }

    // A small IO buffer is fine across routes
    try? session.setPreferredIOBufferDuration(0.02)
}
1069
-
1070
/// Records (under `aecSessionActivationLock`) whether the audio session is
/// active for AEC purposes, stamping the activation time on activate and
/// zeroing it on deactivate.
private func markAECSessionActivation(_ active: Bool, reason: String) {
    let timestamp = CACurrentMediaTime()
    aecSessionActivationLock.lock()
    aecSessionIsActive = active
    lastAECSessionActivationAt = active ? timestamp : 0
    aecSessionActivationLock.unlock()
    NSLog("[STT] AEC session activation(\(reason)): active=\(active ? "YES" : "NO") t=\(String(format: "%.3f", timestamp))")
}
1079
-
1080
/// True while the session is active and we are still inside the configured
/// AEC warm-up window following the last session activation.
private func isInAECRouteWarmupWindow() -> Bool {
    guard aecEnabled, forceAECDuringRouteWarmup, aecRouteWarmupSeconds > 0 else { return false }
    let now = CACurrentMediaTime()
    aecSessionActivationLock.lock()
    defer { aecSessionActivationLock.unlock() }
    guard aecSessionIsActive, lastAECSessionActivationAt > 0 else { return false }
    return (now - lastAECSessionActivationAt) < aecRouteWarmupSeconds
}
1090
-
1091
- // MARK: - Core logic (kept intact, including AEC order/steps)
1092
-
1093
- /// Returns true if no errors occurred (identical flow & calls as ObjC).
1094
- /// Returns true if no errors occurred (identical flow & calls as ObjC) + keep-alive opts.
1095
- /// Returns true if no errors occurred (identical flow & calls as ObjC) + keep-alive opts.
1096
/// Configures and activates the shared AVAudioSession for play-and-record.
///
/// The deactivate → setCategory → deactivate → activate sequence is
/// deliberate: some routes only settle their category/options after a clean
/// re-activation, and 16 kHz preferences are applied both before and after
/// `setActive(true)` for the same reason. Returns true if no errors occurred;
/// on error an error result is also sent to JS.
private func setupAudioSession() -> Bool {
    var err: NSError?
    let session = AVAudioSession.sharedInstance()
    self.audioSession = session

    // First deactivation: start from a known-inactive state.
    do { try session.setActive(false, options: [.notifyOthersOnDeactivation]) }
    catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
    markAECSessionActivation(false, reason: "setupAudioSession-pre")

    // Build options to match our routing rules
    // (defaultToSpeaker only when no external output is active)
    let hasExternalOutput: Bool = session.currentRoute.outputs.contains {
        switch $0.portType {
        case .headphones, .bluetoothA2DP, .bluetoothHFP, .bluetoothLE, .airPlay, .carAudio, .usbAudio:
            return true
        default:
            return false
        }
    }

    var opts: AVAudioSession.CategoryOptions = [.allowBluetooth]
    if !hasExternalOutput { opts.insert(.defaultToSpeaker) }
    if #available(iOS 14.5, *) {
        // Prevent muted switch / mic mute from killing our capture pipeline
        opts.insert(.overrideMutedMicrophoneInterruption)
    }
    do {
        try session.setCategory(.playAndRecord, mode: .default, options: opts)
    } catch { err = error as NSError }

    // Second deactivation after reconfiguring the category.
    do { try session.setActive(false, options: [.notifyOthersOnDeactivation]) }
    catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
    markAECSessionActivation(false, reason: "setupAudioSession-reconfigure")

    // Force 16k before and after activation (some routes settle only after setActive)
    force16kIfPossible(session)
    do {
        try session.setActive(true)
        markAECSessionActivation(true, reason: "setupAudioSession")
    } catch {
        err = error as NSError
        markAECSessionActivation(false, reason: "setupAudioSession-failed")
    }
    NSLog("[STT] session SR=%.1f inCh=%d outCh=%d (wanted 16000)",
          session.sampleRate,
          Int(session.inputNumberOfChannels),
          Int(session.outputNumberOfChannels))
    force16kIfPossible(session)
    forceSpeakerIfReceiver("setupAudioSession")

    // Only the LAST captured error is reported (earlier ones are overwritten);
    // any failure along the way marks the whole setup as failed.
    if let e = err {
        NSLog("[STT] setupAudioSession error: \(e.localizedDescription)")
        sendResult(error: ["code": "audio", "message": e.localizedDescription],
                   bestTranscription: nil, transcriptions: nil, isFinal: nil)
        return false
    }

    return true
}
1155
-
1156
/// Decides whether voice processing (AEC) should be on for the current route:
/// always during the post-activation warm-up window, otherwise only when both
/// the built-in speaker and the built-in mic are in use.
private func shouldUseVoiceProcessingForCurrentRoute() -> Bool {
    guard aecEnabled else { return false }
    guard !isInAECRouteWarmupWindow() else { return true }

    let session = AVAudioSession.sharedInstance()
    let outputIsSpeaker = session.currentRoute.outputs.contains { $0.portType == .builtInSpeaker }
    let inputIsBuiltIn = (session.preferredInput?.portType == .builtInMic)
        || (session.currentRoute.inputs.first?.portType == .builtInMic)
    return outputIsSpeaker && inputIsBuiltIn
}
1165
-
1166
/// On iOS 17+, configures voice-processing ducking on the input node so other
/// audio is ducked as little as possible (no advanced ducking, minimum level).
/// No-op on earlier systems.
private func configureVoiceProcessingDucking(_ inputNode: AVAudioInputNode) {
    guard #available(iOS 17.0, *) else { return }
    var config = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
    config.enableAdvancedDucking = false
    config.duckingLevel = .min
    inputNode.voiceProcessingOtherAudioDuckingConfiguration = config
}
1174
-
1175
/// Aligns the input node's voice-processing (AEC) state with what the current
/// route requires. No-op when already matching; on a failed toggle it may
/// escalate to a full graph rebuild (guarded by lifecycle flags) because
/// toggling voice processing can invalidate the engine graph.
private func reconcileAEC(on engine: AVAudioEngine?, reason: String, allowRebuild: Bool = true) {
    guard let engine = engine else { return }
    let inputNode = engine.inputNode
    let desiredVP = shouldUseVoiceProcessingForCurrentRoute()

    if #available(iOS 13.0, *) {
        // Fast path: nothing to change; still refresh ducking config when VP is on.
        let currentVP = inputNode.isVoiceProcessingEnabled
        if currentVP == desiredVP {
            if desiredVP { configureVoiceProcessingDucking(inputNode) }
            NSLog("[STT] AEC reconcile(\(reason)): unchanged vp=\(currentVP ? "ON" : "OFF")")
            return
        }
    }

    do {
        try inputNode.setVoiceProcessingEnabled(desiredVP)
        if desiredVP { configureVoiceProcessingDucking(inputNode) }
        NSLog("[STT] AEC reconcile(\(reason)): set vp=\(desiredVP ? "ON" : "OFF")")
    } catch {
        NSLog("[STT] AEC reconcile(\(reason)) failed: \(error.localizedDescription)")
        // Rebuild only when STT is genuinely live and no other lifecycle
        // transition (teardown/pause/telephony) is in progress.
        if allowRebuild && sttActive && !isTearingDown && !micPaused &&
           !isTelephonyInterrupted && !isRecoveringAfterTelephony {
            rebuildEngineGraphAndRestart(reason: "aec-reconcile-\(reason)")
        }
    }
}
1201
-
1202
/// Schedules a short burst of delayed AEC reconcile attempts (without rebuild)
/// at `stepSec`, `2*stepSec`, … after route/session changes settle.
private func scheduleAECReconcileRetries(reason: String,
                                         attempts: Int = 3,
                                         stepSec: TimeInterval = 0.20) {
    guard attempts > 0 else { return }
    for attempt in 1...attempts {
        let delay = stepSec * Double(attempt)
        DispatchQueue.main.asyncAfter(deadline: .now() + delay) { [weak self] in
            guard let self = self else { return }
            guard !self.isTearingDown, !self.micPaused, !self.isTelephonyInterrupted else { return }
            self.reconcileAEC(on: self.audioEngine, reason: "\(reason)-retry\(attempt)", allowRebuild: false)
        }
    }
}
1214
-
1215
/// Returns the live hardware input format when CoreAudio reports a valid one,
/// otherwise synthesizes a mono float32 format from the session sample rate
/// (clamped to at least 8 kHz). Avoids relying on cached formats.
private func currentInputFormat(_ engine: AVAudioEngine) -> AVAudioFormat? {
    let live = engine.inputNode.outputFormat(forBus: 0)
    if live.sampleRate > 0, live.channelCount > 0 {
        return live
    }
    let fallbackRate = max(8000, AVAudioSession.sharedInstance().sampleRate)
    return AVAudioFormat(commonFormat: .pcmFormatFloat32,
                         sampleRate: fallbackRate,
                         channels: 1,
                         interleaved: false)
}
1223
-
1224
/// True when any current output port is wired headphones or Bluetooth A2DP.
private func isHeadsetPluggedIn() -> Bool {
    let outputs = AVAudioSession.sharedInstance().currentRoute.outputs
    return outputs.contains { $0.portType == .headphones || $0.portType == .bluetoothA2DP }
}
1233
-
1234
/// Rebuilds the capture graph after a phone call / telephony interruption.
///
/// Order matters throughout: bump the graph generation, re-create the engine,
/// reconfigure the session, reset + rewire the graph, start the engine, and
/// only then install the input tap. Recognition itself restarts lazily — the
/// tap waits until real buffers flow again before kicking
/// `startRecognitionAfterCall()` (which is also where
/// `isRecoveringAfterTelephony` is cleared, NOT here).
private func recoverAfterTelephonyInterruption() {
    guard hasValidCaptureNow(allowColdEngine: true) else {
        NSLog("[STT] recoverAfterTelephonyInterruption: no capture yet; will rely on watchdog/next route change")
        return
    }

    // Invalidate any in-flight callbacks tied to the old graph generation.
    bumpGraphGen()
    NSLog("[STT] 🔄 recovering graph after telephony")

    if audioEngine == nil { audioEngine = AVAudioEngine() }
    AudioPlaybackHook.currentEngine = { [weak self] in self?.audioEngine } // ⬅️ add this
    guard let eng = audioEngine else { return }
    installEngineObservers()

    _ = setupAudioSession() // ✅ ensures correct mode/options
    forceSpeakerIfReceiver("recoverAfterTelephony") // ✅ receiver -> speaker now

    let inputNode = eng.inputNode
    reconcileAEC(on: eng, reason: "recover-after-telephony-prestart", allowRebuild: false)

    // Fresh graph: mic -> (muted) mixer -> main mixer keeps input pulled
    // without echoing it to the output.
    eng.reset()
    let micMixer = AVAudioMixerNode()
    eng.attach(micMixer)
    eng.connect(inputNode, to: micMixer, format: nil)
    eng.connect(micMixer, to: eng.mainMixerNode, format: nil)
    micMixer.outputVolume = 0.0

    if playbackNode == nil { playbackNode = AVAudioPlayerNode() }
    if let p = playbackNode {
        if p.engine == nil { eng.attach(p) }
        eng.connect(p, to: eng.mainMixerNode, format: nil)
    }

    do {
        try eng.start()
        armFirstIOCycleLatch(on: eng)
        tryClearCaptureLossAfterStartSucceeded()
        reconcileAEC(on: eng, reason: "recover-after-telephony-poststart", allowRebuild: false)
        scheduleAECReconcileRetries(reason: "recover-after-telephony")
    } catch {
        NSLog("[STT] recover: engine.start failed → will let watchdog retry: \(error)")
        return
    }

    // IMPORTANT: install tap and start recognition only after we *see* buffers again
    safeRemoveTap(inputNode)
    let tapFmt = inputNode.outputFormat(forBus: 0)
    guard tapFmt.sampleRate > 0, tapFmt.channelCount > 0 else {
        NSLog("[STT] recover: invalid input format post-start (sr=%.1f ch=%d)", tapFmt.sampleRate, Int(tapFmt.channelCount))
        return
    }

    lastBufferAt = 0
    tapFramesTotal = 0
    inputNode.installTap(onBus: 0, bufferSize: 1024, format: tapFmt) { [weak self] buffer, _ in
        guard let self = self else { return }
        self.recognitionRequest?.append(buffer)

        // mark that input is flowing again
        self.tapFramesTotal &+= UInt64(buffer.frameLength)
        self.lastBufferAt = CACurrentMediaTime()

        // Kick recognition exactly once, the first time we see real audio post-call
        if !self.startedRecognitionAfterCall && self.tapFramesTotal > 1024 {
            DispatchQueue.main.async {
                self.startRecognitionAfterCall()
            }
        }
    }
    // In recoverAfterTelephonyInterruption(), after engine/graph is rebuilt (near the end is fine):
    if self.sttActive && !self.micPaused {
        self.installPlaybackHooks()
    }

    NSLog("[STT] recovery: IO + tap ready; waiting for buffers to start recognition")
    // do NOT set isRecoveringAfterTelephony = false here — we clear it in startRecognitionAfterCall()
}
1311
-
1312
/// Loads comma-separated contextual phrases from the bundled
/// words_flattened.txt, stripping a UTF-8 BOM, surrounding whitespace, and
/// double quotes, and dropping empty entries. Returns [] on any failure.
private func loadContextualStrings() -> [String] {
    guard let filePath = Bundle.main.path(forResource: "words_flattened", ofType: "txt") else {
        NSLog("words_flattened.txt not found in bundle")
        return []
    }
    do {
        var contents = try String(contentsOfFile: filePath, encoding: .utf8)
        // A UTF-8 BOM would otherwise pollute the first token.
        if contents.unicodeScalars.first == "\u{FEFF}" {
            contents.unicodeScalars.removeFirst()
        }
        return contents
            .components(separatedBy: ",")
            .map {
                $0.trimmingCharacters(in: .whitespacesAndNewlines)
                  .replacingOccurrences(of: "\"", with: "")
            }
            .filter { !$0.isEmpty }
    } catch {
        NSLog("Error reading contextualStrings: \(error)")
        return []
    }
}
1337
-
1338
/// Drops our cached audio-session reference when its category has drifted
/// from `priorAudioCategory`; keeps it when the category is unchanged.
private func resetAudioSession() {
    if audioSession == nil {
        audioSession = AVAudioSession.sharedInstance()
    }
    guard let session = audioSession, priorAudioCategory != session.category else {
        return
    }
    audioSession = nil
}
1349
-
1350
-
1351
/// Builds a fresh buffer-based recognition request (partial results on,
/// punctuation on iOS 16+, contextual phrases loaded from the bundle),
/// stores it in `recognitionRequest`, and returns it.
private func makeFreshRequest() -> SFSpeechAudioBufferRecognitionRequest {
    let request = SFSpeechAudioBufferRecognitionRequest()
    if #available(iOS 16, *) { request.addsPunctuation = true }
    request.shouldReportPartialResults = true

    let phrases = loadContextualStrings()
    request.contextualStrings = phrases
    NSLog("[STT] makeFreshRequest contextualStrings count=\(phrases.count) sample=\(phrases.prefix(10)) file=\(Bundle.main.path(forResource: "words_flattened", ofType: "txt") ?? "nil")")

    self.recognitionRequest = request
    NSLog("[STT] makeFreshRequest()")
    return request
}
1364
-
1365
/// Starts a recognition task for `req`, guarding its callback with both a
/// task-generation counter and a session id so results from superseded tasks
/// are dropped. Emits onSpeechStart on first real speech, forwards every
/// result to JS, and on final either rearms (continuous) or tears down.
private func startTask(_ req: SFSpeechAudioBufferRecognitionRequest) {
    if isSpeechRecognitionLitePaused() {
        NSLog("[STT] startTask suppressed (speechRecognitionPaused)")
        return
    }
    NSLog("starting recognitionTask")
    lastTaskStartAt = CACurrentMediaTime()
    lastResultAt = lastTaskStartAt
    // Bump generation and capture it for THIS task
    activeTaskGen &+= 1
    let myGen = activeTaskGen


    let taskSessionId = self.sessionId
    self.recognitionTask = self.speechRecognizer?.recognitionTask(with: req) { [weak self] result, error in
        guard let self = self else { return }

        // ❗️Drop callbacks from older tasks
        guard myGen == self.activeTaskGen else {
            NSLog("[STT] stale task callback (gen \(myGen) != \(self.activeTaskGen)) → ignore")
            return
        }

        if taskSessionId != self.sessionId { NSLog("task session mismatch -> ignore"); return }
        self.lastResultAt = CACurrentMediaTime()

        // Fires onSpeechStart exactly once per task, the first time a
        // non-empty transcription segment arrives.
        func markIfReal(_ r: SFSpeechRecognitionResult?) {
            guard let r = r else { return }

            // ✅ Do NOT use formattedString here (it normalizes spacing/punctuation/number formatting).
            // Instead, treat "real speech" as "we have at least one non-empty segment substring".
            let hasReal = r.bestTranscription.segments.contains {
                !$0.substring.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
            }

            if hasReal && !self.seenRealSpeech {
                self.seenRealSpeech = true
                NSLog("first real speech detected -> onSpeechStart to JS")
                self.sendEvent(name: "onSpeechStart", body: nil)
            }
        }
        markIfReal(result)

        // NOTE(review): this local helper appears to be unused — the error
        // and nil-result paths below call rearmTask(reason:) instead.
        func rearm(_ why: String, delay: TimeInterval = 0.05) {
            guard self.continuous else { return }
            NSLog("REARM (\(why))")
            self.recognitionTask?.cancel()
            self.recognitionTask = nil
            DispatchQueue.main.asyncAfter(deadline: .now() + delay) {
                self.startTask(self.makeFreshRequest())
            }
        }

        if let error = error {
            NSLog("task error \(error._code): \(error.localizedDescription)")
            // treat as transient for continuous mode
            rearmTask(reason: "error")
            return
        }

        guard let result = result else {
            NSLog("task nil result")
            rearmTask(reason: "nil-result")
            return
        }

        // Segments are re-joined with spaces rather than using
        // formattedString, keeping raw token text.
        let isFinal = result.isFinal
        let parts = result.transcriptions.map { $0.segments.map { $0.substring }.joined(separator: " ") }
        self.sendResult(error: nil,
                        bestTranscription: result.bestTranscription.segments.map { $0.substring }.joined(separator: " "),
                        transcriptions: parts,
                        isFinal: isFinal)

        if isFinal {
            NSLog("task final -> onSpeechEnd")
            self.sendEvent(name: "onSpeechEnd", body: nil)
            if self.continuous {
                self.rearmTask(reason: "final")
            } else {
                NSLog("non-continuous final -> teardown")
                self.teardown()
            }
        }
    }
}
1450
-
1451
/// Tears down the entire recognition pipeline: cancels the task, ends the
/// request, removes taps, stops/resets the engine, clears playback hooks and
/// all speaker-verification state, deactivates AEC bookkeeping, and resets
/// the session. `isTearingDown` suppresses watchdog/recovery re-entry for the
/// duration.
///
/// Fix: removed `if engine.inputNode != nil` — `AVAudioEngine.inputNode` is
/// non-optional in Swift, so the comparison was always true (and a compiler
/// warning); the tap removal and reset now run unconditionally.
public func teardown() {
    bumpGraphGen()
    NSLog("[STT] teardown() begin")
    isTearingDown = true
    // ✅ HARD reset speech-lite pause state on teardown
    resetSpeechRecognitionLitePauseState("teardown")
    stopWatchdog()
    consecutiveStallCount = 0
    removeEngineObservers()
    if let task = recognitionTask {
        task.cancel()
        recognitionTask = nil
    }
    // Detach all playback hooks so TTS no longer schedules into this graph.
    AudioPlaybackHook.engineScheduleFile = nil
    AudioPlaybackHook.isEngineReady = nil
    AudioPlaybackHook.useOnlyEnginePlayback = nil
    AudioPlaybackHook.stopEnginePlayback = nil // ← NEW
    sttActive = false

    if let p = playbackNode {
        p.stop()
    }
    playbackNode = nil

    if let req = recognitionRequest {
        req.endAudio()
        recognitionRequest = nil
    }

    if let engine = audioEngine {
        safeRemoveTap(engine.outputNode, bus: 0)   // <- clear IO latch if present
        safeRemoveTap(engine.mainMixerNode, bus: 0) // <- clear mixer probe if present
        safeRemoveTap(engine.inputNode, bus: 0)
        engine.inputNode.reset()
        if engine.isRunning { engine.stop() }
        engine.reset()
        audioEngine = nil
        AudioPlaybackHook.currentEngine = nil
    }

    // Reset mixer-probe and the whole speaker-verification pipeline state.
    mixerProbeActive = false
    mixerProbeCompletions.removeAll()
    speakerVerificationEngine = nil
    speakerVerificationFrameSize = 0
    speakerVerificationInputBuffer.removeAll(keepingCapacity: false)
    speakerVerificationThreshold = 0
    speakerVerificationFrameSeq = 0
    speakerVerificationSourceSampleRate = 0
    speakerVerificationTargetSampleRate = 0
    speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
    speakerVerificationResamplePos = 0
    speakerLastPositiveMatchAt = 0
    setSpeakerGateState(enabled: false, open: true)
    speakerVerificationErrorSent = false
    speakerPreRollBuffers.removeAll(keepingCapacity: false)
    speakerPreRollFrames = 0
    speakerPreRollMaxFrames = 0
    speakerPendingPreRollFlush = false
    lastRouteSignature = ""
    markAECSessionActivation(false, reason: "teardown")

    resetAudioSession()
    savedSessionBeforePause = nil

    sessionId = nil
    isTearingDown = false
}
1519
-
1520
/// Heuristic connectivity check: a player counts as connected to `engine`
/// when it is attached to that engine and its output bus reports a non-zero
/// channel count and sample rate.
private func isPlayerConnected(_ player: AVAudioPlayerNode?, to engine: AVAudioEngine?) -> Bool {
    guard let player = player, let engine = engine, player.engine === engine else {
        return false
    }
    let format = player.outputFormat(forBus: 0)
    return format.channelCount > 0 && format.sampleRate > 0
}
1526
-
1527
/// Ensures the engine is running and a recognition task exists, respecting
/// suppression flags, capture availability, and a reclaim cooldown
/// (bypassable via `skipCooldown`, e.g. from the watchdog). Falls back to a
/// full graph rebuild when a restart fails or there is no engine at all.
private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
    // Suppressors mirror rearmTask(): never fight telephony handling,
    // mic pause, or the lite pause.
    if isTelephonyInterrupted || isRecoveringAfterTelephony {
        NSLog("[STT] ensureEngineRunning suppressed (telephony/recovering)")
        return
    }
    if isSpeechRecognitionLitePaused() {
        NSLog("[STT] ensureEngineRunning(\(reason)) suppressed (speechRecognitionPaused)")
        return
    }
    if micPaused {
        NSLog("[STT] ensureEngineRunning(\(reason)) suppressed (micPaused)")
        return
    }

    if isTearingDown { return }

    // No mic capture available → mark lost and wait for route change/watchdog.
    if !hasValidCaptureNow(allowColdEngine: true) {
        markCaptureLost()
        NSLog("[STT] ensureEngineRunning(\(reason)): capture not available; waiting")
        return
    }

    let now = CFAbsoluteTimeGetCurrent()
    if !skipCooldown, (now - lastReclaimAttempt) < reclaimCooldown {
        NSLog("[STT] ensureEngineRunning(\(reason)) skipped (cooldown)")
        return
    }
    lastReclaimAttempt = now

    // (re)start engine
    if let eng = audioEngine {
        if !eng.isRunning {
            do {
                // Drop the playback node before restart; it is lazily
                // re-created (see ensurePlaybackNode) on next use.
                playbackNode?.stop()
                playbackNode = nil
                try eng.start()
                armFirstIOCycleLatch(on: eng)
                tryClearCaptureLossAfterStartSucceeded()
                NSLog("🔄 AVAudioEngine restarted. running=\(eng.isRunning)")
            } catch {
                NSLog("❌ engine.start() failed: \(error) → rebuild")
                rebuildEngineGraphAndRestart(reason: reason)
                return
            }
        }
    } else {
        NSLog("[STT] ensureEngineRunning(\(reason)): no engine → rebuild")
        rebuildEngineGraphAndRestart(reason: reason)
        return
    }

    // ensure a task is running; reuse an existing request when present,
    // otherwise mint a fresh one.
    if recognitionTask == nil {
        if isSpeechRecognitionLitePaused() {
            NSLog("[STT] ensureEngineRunning(\(reason)): skip startTask (speechRecognitionPaused)")
        } else if let req = recognitionRequest {
            startTask(req)
        } else {
            startTask(makeFreshRequest())
        }
    }
}
1589
- /*
1590
- private func ensureEngineRunning(reason: String, skipCooldown: Bool = false) {
1591
- if isTearingDown { return } // ← add
1592
-
1593
- // If no mic, don’t touch the graph. Wait for route/interruption end.
1594
-
1595
- if !hasValidCaptureNow() {
1596
- markCaptureLost()
1597
- NSLog("[STT] ensureEngineRunning(\(reason)): capture not available; waiting")
1598
- return
1599
- }
1600
-
1601
- let now = CFAbsoluteTimeGetCurrent()
1602
- if (now - lastReclaimAttempt) < reclaimCooldown {
1603
- NSLog("[STT] ensureEngineRunning(\(reason)) skipped (cooldown)")
1604
- return
1605
- }
1606
- lastReclaimAttempt = now
1607
-
1608
- if (audioEngine != nil) && !audioEngine!.isRunning {
1609
- do {
1610
- playbackNode?.stop()
1611
- playbackNode = nil
1612
- // Possibly re-apply your format or re-install taps if the hardware changed sample rates
1613
- try audioEngine!.start()
1614
- armFirstIOCycleLatch(on: audioEngine!)
1615
- tryClearCaptureLossAfterStartSucceeded()
1616
-
1617
- print("🔄 AVAudioEngine restarted after config change. isRunning=%@",
1618
- audioEngine!.isRunning ? "YES":"NO")
1619
- } catch {
1620
- print("❌ Could not re-start after config change: \(error)")
1621
- }
1622
- }
1623
-
1624
- guard let engine = audioEngine else {
1625
- NSLog("[STT] ensureEngineRunning(\(reason)): no engine → rebuild")
1626
- rebuildEngineGraphAndRestart(reason: reason)
1627
- return
1628
- }
1629
-
1630
- if !engine.isRunning {
1631
- do {
1632
- try engine.start()
1633
- armFirstIOCycleLatch(on: engine)
1634
- tryClearCaptureLossAfterStartSucceeded() // ← add this line
1635
-
1636
- NSLog("[STT] ensureEngineRunning(\(reason)): engine.start() -> running=\(engine.isRunning)")
1637
- } catch {
1638
- NSLog("[STT] ensureEngineRunning(\(reason)): engine.start() failed: \(error) → rebuild")
1639
- rebuildEngineGraphAndRestart(reason: reason)
1640
- return
1641
- }
1642
- }
1643
-
1644
- // If we have no active task, spin one up against the current request
1645
- if recognitionTask == nil {
1646
- if let req = recognitionRequest {
1647
- NSLog("[STT] ensureEngineRunning(\(reason)): no task -> startTask(existing req)")
1648
- startTask(req)
1649
- } else {
1650
- NSLog("[STT] ensureEngineRunning(\(reason)): no req -> makeFreshRequest + startTask")
1651
- startTask(makeFreshRequest())
1652
- }
1653
- }
1654
- }
1655
- */
1656
- /// Rebuilds AVAudioEngine graph (mic→mute mixer, player→mainMixer), reinstalls tap,
1657
- /// and restarts the engine. Does NOT nuke the current recognitionRequest/task unless required.
1658
- private func rebuildEngineGraphAndRestart(reason: String) {
1659
- bumpGraphGen()
1660
- NSLog("[STT] 🔄 rebuildEngineGraphAndRestart (\(reason))")
1661
- if isTelephonyInterrupted { NSLog("[STT] rebuild suppressed during telephony"); return }
1662
- if isSpeechRecognitionLitePaused() {
1663
- NSLog("[STT] rebuild suppressed (speechRecognitionPaused)")
1664
- return
1665
- }
1666
-
1667
- guard hasValidCaptureNow() else {
1668
- markCaptureLost()
1669
- NSLog("[STT] rebuild: no valid input yet (skip)")
1670
- return
1671
- }
1672
-
1673
- // Keep current request if present; we'll keep appending into it
1674
- let existingReq = self.recognitionRequest
1675
-
1676
- // Tear down engine ONLY (keep session, request)
1677
- if let engine = audioEngine {
1678
- if engine.inputNode != nil {
1679
- safeRemoveTap(engine.inputNode, bus: 0)
1680
- engine.inputNode.reset()
1681
- }
1682
- if engine.isRunning { engine.stop() }
1683
- engine.reset()
1684
- }
1685
-
1686
- // Recreate engine and graph
1687
- let newEngine = AVAudioEngine()
1688
- self.audioEngine = newEngine
1689
- AudioPlaybackHook.currentEngine = { [weak self] in self?.audioEngine } // ⬅️ add this
1690
-
1691
- installEngineObservers() // <-- IMPORTANT: observers were bound to old engine object
1692
- _ = setupAudioSession() // ✅ keep session policy consistent
1693
- forceSpeakerIfReceiver("rebuild:\(reason)") // ✅ receiver -> speaker now
1694
-
1695
- let inputNode = newEngine.inputNode
1696
- reconcileAEC(on: newEngine, reason: "rebuild-\(reason)-prestart", allowRebuild: false)
1697
-
1698
- var inFmt = inputNode.outputFormat(forBus: 0)
1699
-
1700
- // mic → mute mixer → mainMixer
1701
- let micMixer = AVAudioMixerNode()
1702
- newEngine.attach(micMixer)
1703
- newEngine.connect(inputNode, to: micMixer, format: inFmt) // live input format
1704
- newEngine.connect(micMixer, to: newEngine.mainMixerNode, format: nil) // let mixer choose
1705
- micMixer.outputVolume = 0.0
1706
-
1707
- // TTS player → (de-esser) → mainMixer
1708
- if let existing = playbackNode, existing.engine !== newEngine {
1709
- // Node is owned by a different engine instance; recreate for this graph.
1710
- existing.stop()
1711
- playbackNode = nil
1712
- }
1713
- if playbackNode == nil {
1714
- playbackNode = AVAudioPlayerNode()
1715
- }
1716
- if let player = playbackNode {
1717
- if player.engine == nil {
1718
- newEngine.attach(player)
1719
- }
1720
- newEngine.connect(player, to: newEngine.mainMixerNode, format: nil)
1721
- }
1722
-
1723
- // // --- Aggressive low-pass only ---
1724
- // let deEss = AVAudioUnitEQ(numberOfBands: 1)
1725
- // let lpf = deEss.bands[0]
1726
- // lpf.filterType = .lowPass
1727
- // lpf.frequency = 6500 // try 6000–7500
1728
- // lpf.bandwidth = 0.35 // fairly steep
1729
- // lpf.gain = 0.0
1730
- // lpf.bypass = false
1731
-
1732
- // self.ttsEQ = deEss
1733
- // newEngine.attach(deEss)
1734
-
1735
- // newEngine.disconnectNodeOutput(player)
1736
- // newEngine.connect(player, to: deEss, format: nil)
1737
- // newEngine.connect(deEss, to: newEngine.mainMixerNode, format: nil)
1738
- // }
1739
-
1740
- // Tap uses nil to follow the node’s current output format
1741
- newEngine.prepare()
1742
- do {
1743
- try newEngine.start()
1744
- armFirstIOCycleLatch(on: newEngine)
1745
- tryClearCaptureLossAfterStartSucceeded()
1746
- reconcileAEC(on: newEngine, reason: "rebuild-\(reason)-poststart", allowRebuild: false)
1747
- scheduleAECReconcileRetries(reason: "rebuild-\(reason)")
1748
- NSLog("[STT] rebuild: engine.start() ok, running=\(newEngine.isRunning)")
1749
- } catch {
1750
- markCaptureLost()
1751
- NSLog("[STT] rebuild: engine.start() failed: \(error)")
1752
- }
1753
-
1754
-
1755
- // 2) NOW that IO is running, install the tap (format will be valid)
1756
- safeRemoveTap(inputNode)
1757
- let tapFmt = inputNode.outputFormat(forBus: 0)
1758
- guard tapFmt.sampleRate > 0, tapFmt.channelCount > 0 else {
1759
- markCaptureLost()
1760
- NSLog("[STT] rebuild: invalid input format after start (sr=%.1f ch=%d)",
1761
- tapFmt.sampleRate, Int(tapFmt.channelCount))
1762
- return
1763
- }
1764
-
1765
- inputNode.installTap(onBus: 0, bufferSize: 1024, format: tapFmt) { [weak self] buffer, _ in
1766
- guard let self = self else { return }
1767
-
1768
- // 👇 EXACT same logic as in setupAndStartRecognizing
1769
- self.tapFramesTotal &+= UInt64(buffer.frameLength)
1770
- if self.tapFramesTotal % (44100 * 2) < 1024 {
1771
- NSLog("[STT] tap alive, totalFrames=\(self.tapFramesTotal)")
1772
- }
1773
-
1774
- let frames: vDSP_Length = vDSP_Length(buffer.frameLength)
1775
- let LEVEL_LOWPASS_TRIG: Float = 0.5
1776
-
1777
- // CH0
1778
- if buffer.format.channelCount > 0, let ch0 = buffer.floatChannelData?[0] {
1779
- var peak0: Float = 0
1780
- vDSP_maxmgv(ch0, 1, &peak0, frames)
1781
- let db0: Float = (peak0 == 0) ? -100 : 20.0 * log10f(peak0)
1782
-
1783
- let smoothed0 = LEVEL_LOWPASS_TRIG * db0
1784
- + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel0
1785
- self.averagePowerForChannel0 = smoothed0
1786
- self.averagePowerForChannel1 = smoothed0
1787
- }
1788
-
1789
- // CH1
1790
- if buffer.format.channelCount > 1, let ch1 = buffer.floatChannelData?[1] {
1791
- var peak1: Float = 0
1792
- vDSP_maxmgv(ch1, 1, &peak1, frames)
1793
- let db1: Float = (peak1 == 0) ? -100 : 20.0 * log10f(peak1)
1794
-
1795
- let smoothed1 = LEVEL_LOWPASS_TRIG * db1
1796
- + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel1
1797
- self.averagePowerForChannel1 = smoothed1
1798
- }
1799
-
1800
- // Normalize 0–10 and emit
1801
- self.averagePowerForChannel1 = Float(
1802
- self._normalizedPowerLevelFromDecibels(CGFloat(self.averagePowerForChannel1)) * 10.0
1803
- )
1804
- let value = self.averagePowerForChannel1
1805
- self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": value])
1806
-
1807
- // Append to recognition
1808
- self.recognitionRequest?.append(buffer)
1809
-
1810
- // mark that input is flowing again
1811
- self.lastBufferAt = CACurrentMediaTime()
1812
- }
1813
-
1814
- // If we lost the request during rebuild, recreate + start task.
1815
- if self.recognitionRequest == nil {
1816
- if let old = existingReq {
1817
- self.recognitionRequest = old
1818
- } else {
1819
- self.recognitionRequest = makeFreshRequest()
1820
- }
1821
- }
1822
- if self.recognitionTask == nil {
1823
- if isSpeechRecognitionLitePaused() {
1824
- NSLog("[STT] rebuild: skip startTask (speechRecognitionPaused)")
1825
- } else {
1826
- startTask(self.recognitionRequest!)
1827
- }
1828
- }
1829
- if self.sttActive && !self.micPaused {
1830
- self.installPlaybackHooks()
1831
- }
1832
- }
1833
-
1834
- @objc private func handleEngineConfigChange(_ note: Notification) {
1835
- if isTearingDown { return } // ← add
1836
- if isSpeechRecognitionLitePaused() {
1837
- NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange (ignored: speechRecognitionPaused)")
1838
- return
1839
- }
1840
- if micPaused {
1841
- NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange (ignored: micPaused)")
1842
- return
1843
- }
1844
-
1845
- NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange: ensuring engine running")
1846
- if (audioEngine != nil) && !audioEngine!.isRunning {
1847
- playbackNode?.stop()
1848
- playbackNode = nil
1849
- }
1850
- ensureEngineRunning(reason: "engine-config-change")
1851
- reconcileAEC(on: audioEngine, reason: "engine-config-change")
1852
- scheduleAECReconcileRetries(reason: "engine-config-change")
1853
- }
1854
-
1855
- @objc private func handleMediaServicesReset(_ note: Notification) {
1856
- if isTearingDown { return } // ← add
1857
-
1858
- if isSpeechRecognitionLitePaused() {
1859
- NSLog("[STT] 📺 Media services RESET (ignored: speechRecognitionPaused)")
1860
- return
1861
- }
1862
- if micPaused {
1863
- NSLog("[STT] 📺 Media services RESET (ignored: micPaused)")
1864
- return
1865
- }
1866
- NSLog("[STT] 📺 Media services were RESET: reclaiming mic & session")
1867
- // Re-apply audio session and try to rebuild graph if needed
1868
- bumpGraphGen()
1869
- _ = setupAudioSession()
1870
- ensureEngineRunning(reason: "media-services-reset")
1871
- reconcileAEC(on: audioEngine, reason: "media-services-reset")
1872
- scheduleAECReconcileRetries(reason: "media-services-reset")
1873
- }
1874
-
1875
- /*?????????? Why so many changes???
1876
- @objc private func handleRouteChange(_ note: Notification) {
1877
- if isTearingDown { return } // ← add
1878
-
1879
- let info = note.userInfo ?? [:]
1880
- NSLog("[STT] 🔀 route change: \(info)")
1881
- updateSessionRouting(selectBestInput: true) // ← add this
1882
-
1883
- guard let reasonVal = info[AVAudioSessionRouteChangeReasonKey] as? UInt,
1884
- let reason = AVAudioSession.RouteChangeReason(rawValue: reasonVal) else {
1885
- ensureEngineRunning(reason: "route-change-unknown")
1886
- return
1887
- }
1888
- // Ignore route-change spam caused by our own adjustments
1889
- if isAdjustingRoute {
1890
- NSLog("[STT] route change (self-induced) → ignore")
1891
- return
1892
- }
1893
-
1894
- // Only rebalance on real hardware changes; avoid .categoryChange / .override
1895
- switch reason {
1896
- case .newDeviceAvailable, .oldDeviceUnavailable, .routeConfigurationChange:
1897
- let now = CFAbsoluteTimeGetCurrent()
1898
- if now - lastRouteTune > routeTuneCooldown {
1899
- lastRouteTune = now
1900
- DispatchQueue.global(qos: .userInitiated).async { [weak self] in
1901
- self?.updateSessionRouting(selectBestInput: true)
1902
- }
1903
- }
1904
- default:
1905
- break
1906
- }
1907
- ensureEngineRunning(reason: "route-change-\(reason.rawValue)")
1908
- }
1909
- */
1910
- @objc private func handleRouteChange(_ note: Notification) {
1911
- if isTearingDown { return }
1912
- if !sttActive {
1913
- NSLog("[STT] 🔀 route change (ignored: sttInactive) \(note.userInfo ?? [:])")
1914
- return
1915
- }
1916
- if isSpeechRecognitionLitePaused() {
1917
- NSLog("[STT] 🔀 route change (ignored: speechRecognitionPaused) \(note.userInfo ?? [:])")
1918
- return
1919
- }
1920
- if micPaused {
1921
- NSLog("[STT] 🔀 route change (ignored: micPaused) \(note.userInfo ?? [:])")
1922
- return
1923
- }
1924
-
1925
- let info = note.userInfo ?? [:]
1926
- NSLog("[STT] 🔀 route change: \(info)")
1927
- if isTelephonyInterrupted || isRecoveringAfterTelephony {
1928
- NSLog("[STT] 🔀 route change (ignored during telephony/recovering): \(info)")
1929
- return
1930
- }
1931
-
1932
- let session = AVAudioSession.sharedInstance()
1933
- let outSig = session.currentRoute.outputs.map { $0.portType.rawValue }.joined(separator: ",")
1934
- let inSig = session.currentRoute.inputs.map { $0.portType.rawValue }.joined(separator: ",")
1935
- let routeSig = "outs=\(outSig)|ins=\(inSig)"
1936
- if routeSig == lastRouteSignature {
1937
- NSLog("[STT] 🔀 route change ignored (same route signature)")
1938
- return
1939
- }
1940
- lastRouteSignature = routeSig
1941
-
1942
- if let reasonVal = info[AVAudioSessionRouteChangeReasonKey] as? UInt,
1943
- let reason = AVAudioSession.RouteChangeReason(rawValue: reasonVal) {
1944
- switch reason {
1945
- // Match AVAudioWrapper behavior: handle concrete hardware events + route config changes.
1946
- case .newDeviceAvailable, .oldDeviceUnavailable, .routeConfigurationChange:
1947
- updateSessionRouting(selectBestInput: true)
1948
- default:
1949
- NSLog("[STT] 🔀 route change reason=\(reason.rawValue) -> skip updateSessionRouting")
1950
- }
1951
- } else {
1952
- NSLog("[STT] 🔀 route change reason missing -> skip updateSessionRouting")
1953
- }
1954
-
1955
- forceSpeakerIfReceiver("routeChange")
1956
- reconcileAEC(on: audioEngine, reason: "route-change", allowRebuild: false)
1957
- scheduleAECReconcileRetries(reason: "route-change")
1958
-
1959
- ensureEngineRunning(reason: "route-change", skipCooldown: true)
1960
- }
1961
-
1962
- private func waitForIOCycle(_ engine: AVAudioEngine,
1963
- timeout: TimeInterval = 0.7,
1964
- done: @escaping (Bool) -> Void) {
1965
- let gen = graphGen
1966
- ttsSerial.async { [weak self, weak engine] in
1967
- guard let self = self, let eng = engine, gen == self.graphGen else { return }
1968
-
1969
- if self.mixerProbeActive {
1970
- self.mixerProbeCompletions.append(done)
1971
- return
1972
- }
1973
- self.mixerProbeActive = true
1974
- self.mixerProbeCompletions = [done]
1975
-
1976
- DispatchQueue.main.async { [weak self, weak eng] in
1977
- guard let self = self, let eng = eng, gen == self.graphGen else { return }
1978
- let mixer = eng.mainMixerNode
1979
- var fired = false
1980
- self.safeRemoveTap(mixer, bus: 0)
1981
-
1982
- mixer.installTap(onBus: 0, bufferSize: 128, format: nil) { [weak self, weak mixer] _, _ in
1983
- guard let self = self, gen == self.graphGen else { return }
1984
- if fired { return }
1985
- fired = true
1986
- self.safeRemoveTap(mixer, bus: 0)
1987
-
1988
- self.ttsSerial.async { [weak self] in
1989
- guard let self = self else { return }
1990
- let completions = self.mixerProbeCompletions
1991
- self.mixerProbeActive = false
1992
- self.mixerProbeCompletions.removeAll()
1993
- DispatchQueue.main.async { if gen == self.graphGen { completions.forEach { $0(true) } } }
1994
- }
1995
- }
1996
-
1997
- DispatchQueue.main.asyncAfter(deadline: .now() + timeout) { [weak self, weak mixer] in
1998
- guard let self = self, gen == self.graphGen else { return }
1999
- if fired { return }
2000
- self.safeRemoveTap(mixer, bus: 0)
2001
- self.ttsSerial.async { [weak self] in
2002
- guard let self = self else { return }
2003
- let completions = self.mixerProbeCompletions
2004
- self.mixerProbeActive = false
2005
- self.mixerProbeCompletions.removeAll()
2006
- DispatchQueue.main.async { if gen == self.graphGen { completions.forEach { $0(false) } } }
2007
- }
2008
- }
2009
- }
2010
- }
2011
- }
2012
-
2013
- // Call once, right after you create the engine (or inside setupAudioSession)
2014
- // Call once after engine is created
2015
- private func installEngineObservers() {
2016
- removeEngineObservers()
2017
-
2018
- let nc = NotificationCenter.default
2019
-
2020
- if let engine = audioEngine {
2021
- nc.addObserver(self,
2022
- selector: #selector(handleEngineConfigChange(_:)),
2023
- name: .AVAudioEngineConfigurationChange,
2024
- object: engine)
2025
- }
2026
-
2027
- nc.addObserver(self,
2028
- selector: #selector(handleSessionInterruption(_:)),
2029
- name: AVAudioSession.interruptionNotification,
2030
- object: AVAudioSession.sharedInstance())
2031
-
2032
- nc.addObserver(self,
2033
- selector: #selector(handleRouteChange(_:)),
2034
- name: AVAudioSession.routeChangeNotification,
2035
- object: AVAudioSession.sharedInstance())
2036
-
2037
- nc.addObserver(self,
2038
- selector: #selector(handleMediaServicesReset(_:)),
2039
- name: AVAudioSession.mediaServicesWereResetNotification,
2040
- object: nil)
2041
- }
2042
-
2043
- @objc private func handleSessionInterruption(_ note: Notification) {
2044
- guard let info = note.userInfo,
2045
- let typeRaw = info[AVAudioSessionInterruptionTypeKey] as? UInt,
2046
- let type = AVAudioSession.InterruptionType(rawValue: typeRaw) else { return }
2047
-
2048
- switch type {
2049
- case .began:
2050
- resetSpeechRecognitionLitePauseState("telephony-began")
2051
-
2052
- NSLog("[STT] 📞 Interruption began")
2053
- isTelephonyInterrupted = true
2054
- isRecoveringAfterTelephony = false
2055
- markCaptureLost()
2056
-
2057
- // Stop IO safely
2058
- if let eng = audioEngine {
2059
- safeRemoveTap(eng.inputNode); safeRemoveTap(eng.mainMixerNode); safeRemoveTap(eng.outputNode)
2060
- if eng.isRunning { eng.stop() }
2061
- eng.reset()
2062
- }
2063
-
2064
- // Cancel the task and nil out the request — iOS STT tasks rarely recover after a hard break
2065
- recognitionTask?.cancel()
2066
- recognitionTask = nil
2067
- recognitionRequest = nil
2068
-
2069
- case .ended:
2070
- isTelephonyInterrupted = false
2071
- NSLog("[STT] ✅ Interruption ended")
2072
-
2073
- // Keep the system from thrashing us during recovery
2074
- isRecoveringAfterTelephony = true
2075
- stopWatchdog() // <- don't let the watchdog rearm-loop during recovery
2076
- startedRecognitionAfterCall = false
2077
- lastBufferAt = 0
2078
- tapFramesTotal = 0
2079
-
2080
- // Re-activate the session (safe if already active)
2081
- do {
2082
- try AVAudioSession.sharedInstance().setActive(true, options: [])
2083
- markAECSessionActivation(true, reason: "interruption-ended")
2084
- } catch {
2085
- markAECSessionActivation(false, reason: "interruption-ended-failed")
2086
- }
2087
-
2088
- // Give routes/formats a moment to settle *before* we rebuild
2089
- DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) {
2090
- _ = self.setupAudioSession() // ✅ ensures defaultToSpeaker + mode
2091
- self.forceSpeakerIfReceiver("telephonyEnded")// ✅ if iOS still stuck on receiver
2092
- self.recoverAfterTelephonyInterruption()
2093
- }
2094
-
2095
- default: break
2096
- }
2097
- }
2098
-
2099
- private func installPlaybackHooks() {
2100
- // Expose current engine to the hook layer (safe to overwrite each time)
2101
- AudioPlaybackHook.currentEngine = { [weak self] in self?.audioEngine }
2102
-
2103
- // Engine readiness
2104
- AudioPlaybackHook.isEngineReady = { [weak self] in
2105
- guard let eng = self?.audioEngine else { return false }
2106
- return eng.isRunning
2107
- }
2108
-
2109
- // Tell TTS layer: do NOT use AVAudioPlayer fallback while STT is active
2110
- AudioPlaybackHook.useOnlyEnginePlayback = { [weak self] in
2111
- guard let self = self else { return false }
2112
- return self.sttActive && !self.micPaused
2113
- }
2114
-
2115
- // Schedule & play a file through the engine-owned AVAudioPlayerNode
2116
- AudioPlaybackHook.engineScheduleFile = { [weak self] url, done in
2117
- guard let self = self else { return false }
2118
-
2119
- self.ttsSerial.async { [weak self] in
2120
- guard let self = self else { return }
2121
-
2122
- DispatchQueue.main.async {
2123
- guard !self.isTearingDown,
2124
- let engine = self.audioEngine else { return }
2125
-
2126
- // If player belongs to a different engine (or got detached), recreate it
2127
- if self.playbackNode?.engine !== engine || !self.isPlayerConnected(self.playbackNode, to: engine) {
2128
- self.playbackNode?.stop()
2129
- self.playbackNode = nil
2130
- }
2131
-
2132
- // Ensure engine is running
2133
- if !engine.isRunning {
2134
- do {
2135
- try engine.start()
2136
- self.armFirstIOCycleLatch(on: engine)
2137
- } catch {
2138
- NSLog("[STT] TTS: engine.start() failed: \(error)")
2139
- return
2140
- }
2141
- }
2142
-
2143
- let mixer = engine.mainMixerNode
2144
- mixer.auAudioUnit.inputBusses[0].isEnabled = true
2145
-
2146
- let player = self.ensurePlaybackNode(in: engine)
2147
-
2148
- // Prime a silent buffer on a freshly attached player (stabilizes first play)
2149
- if player.lastRenderTime == nil {
2150
- let fmt = mixer.outputFormat(forBus: 0)
2151
- if let prime = AVAudioPCMBuffer(pcmFormat: fmt, frameCapacity: 128) {
2152
- prime.frameLength = 128
2153
- if let ch = prime.floatChannelData {
2154
- memset(ch[0], 0, Int(prime.frameLength) * MemoryLayout<Float>.size)
2155
- if fmt.channelCount > 1 {
2156
- memset(ch[1], 0, Int(prime.frameLength) * MemoryLayout<Float>.size)
2157
- }
2158
- }
2159
- player.scheduleBuffer(prime, completionHandler: nil)
2160
- }
2161
- }
2162
-
2163
- do {
2164
- let file = try AVAudioFile(forReading: url)
2165
- player.scheduleFile(file, at: nil) {
2166
- DispatchQueue.main.async { done() }
2167
- }
2168
- } catch {
2169
- NSLog("[STT] TTS schedule error: \(error)")
2170
- return
2171
- }
2172
-
2173
- player.volume = 0.5
2174
-
2175
- // Gate play on "engine has rendered at least one IO cycle"
2176
- let startPlay = {
2177
- if !player.isPlaying { player.play() }
2178
- }
2179
-
2180
- if self.engineHasRenderedOnce {
2181
- startPlay()
2182
- } else {
2183
- func tryStart(after ms: Int = 0) {
2184
- DispatchQueue.main.asyncAfter(deadline: .now() + .milliseconds(ms)) {
2185
- if self.engineHasRenderedOnce {
2186
- startPlay()
2187
- } else if ms < 1500 {
2188
- tryStart(after: ms + 100)
2189
- } else {
2190
- NSLog("[STT] TTS: no IO cycle observed; skipping play to avoid crash")
2191
- }
2192
- }
2193
- }
2194
- tryStart()
2195
- }
2196
- }
2197
- }
2198
-
2199
- return true
2200
- }
2201
-
2202
- // Stop only the engine playback node (keep STT engine running)
2203
- AudioPlaybackHook.stopEnginePlayback = { [weak self] in
2204
- DispatchQueue.main.async {
2205
- self?.playbackNode?.stop()
2206
- }
2207
- }
2208
- }
2209
-
2210
- private func setupAndStartRecognizing(localeStr: String?) {
2211
- NSLog("[STT] setupAndStartRecognizing begin")
2212
- sttActive = true
2213
- // ✅ HARD reset speech-lite pause state on start/reinit
2214
- resetSpeechRecognitionLitePauseState("setupAndStartRecognizing")
2215
-
2216
- // --- HARD RESET OF STATE (first-run safety) ---
2217
- isTearingDown = false
2218
- isTelephonyInterrupted = false
2219
- isRecoveringAfterTelephony = false
2220
-
2221
- engineHasRenderedOnce = false
2222
- ioLatchActiveGen = 0
2223
- graphGen = 0
2224
-
2225
- seenRealSpeech = false
2226
- engineHotAt = 0
2227
- lastBufferAt = 0
2228
- lastResultAt = 0
2229
- lastTaskStartAt = 0
2230
- consecutiveStallCount = 0
2231
- consecutiveNoInputResets = 0
2232
- lastNoInputRecoveryAt = 0
2233
- lastRearmAt = 0
2234
- lastReclaimAttempt = 0
2235
- tapFramesTotal = 0
2236
- lastTapFramesSeen = 0
2237
- pausedForCaptureLoss = false
2238
- mixerProbeActive = false
2239
- mixerProbeCompletions.removeAll()
2240
- speakerVerificationEngine = nil
2241
- speakerVerificationFrameSize = 0
2242
- speakerVerificationInputBuffer.removeAll(keepingCapacity: false)
2243
- speakerVerificationThreshold = 0
2244
- speakerVerificationFrameSeq = 0
2245
- speakerVerificationSourceSampleRate = 0
2246
- speakerVerificationTargetSampleRate = 0
2247
- speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
2248
- speakerVerificationResamplePos = 0
2249
- speakerLastPositiveMatchAt = 0
2250
- setSpeakerGateState(enabled: false, open: true)
2251
- speakerVerificationErrorSent = false
2252
- speakerPreRollBuffers.removeAll(keepingCapacity: false)
2253
- speakerPreRollFrames = 0
2254
- speakerPreRollMaxFrames = 0
2255
- speakerPendingPreRollFlush = false
2256
-
2257
- audioSession = AVAudioSession.sharedInstance()
2258
- guard let session = audioSession else { return }
2259
- var err: NSError?
2260
-
2261
- priorAudioCategory = session.category
2262
-
2263
- // Tear down resources before starting speech recognition..
2264
- NSLog("[STT] pre-teardown")
2265
- teardown()
2266
- // ** IMPORTANT ** Call this again as teardown marks this false
2267
- sttActive = true
2268
-
2269
- sessionId = UUID().uuidString
2270
-
2271
- let locale: Locale? = {
2272
- if let s = localeStr, !s.isEmpty { return Locale(identifier: s) }
2273
- sttActive = false
2274
- return nil
2275
- }()
2276
-
2277
- if let loc = locale {
2278
- speechRecognizer = SFSpeechRecognizer(locale: loc)
2279
- } else {
2280
- speechRecognizer = SFSpeechRecognizer()
2281
- }
2282
- speechRecognizer?.delegate = self
2283
-
2284
- // Start audio session...
2285
- NSLog("[STT] setupAudioSession()")
2286
- guard setupAudioSession() else {
2287
- NSLog("[STT] ERROR ERROR ******** setupAudioSession()")
2288
- teardown()
2289
- sttActive = false
2290
- return
2291
- }
2292
- installEngineObservers()
2293
-
2294
- let request = SFSpeechAudioBufferRecognitionRequest()
2295
- recognitionRequest = request
2296
-
2297
- if #available(iOS 16, *) {
2298
- request.addsPunctuation = true
2299
- } else {
2300
- // Fallback on earlier versions
2301
- }
2302
- request.shouldReportPartialResults = true
2303
- //if #available(iOS 13.0, *) { request.taskHint = .dictation }
2304
- let cs: [String] = loadContextualStrings()
2305
- request.contextualStrings = cs
2306
- NSLog("[STT] makeFreshRequest contextualStrings count=\(cs.count) sample=\(cs.prefix(10)) file=\(Bundle.main.path(forResource: "words_flattened", ofType: "txt") ?? "nil")")
2307
-
2308
- guard recognitionRequest != nil else {
2309
- sendResult(error: ["code": "recognition_init"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
2310
- teardown()
2311
- return
2312
- }
2313
-
2314
- if audioEngine == nil {
2315
- bumpGraphGen();
2316
- audioEngine = AVAudioEngine()
2317
- }
2318
- AudioPlaybackHook.currentEngine = { [weak self] in self?.audioEngine } // ⬅️ add this
2319
-
2320
- do {
2321
- guard let engine = audioEngine else { throw NSError(domain: "voice.audio", code: -1) }
2322
- let inputNode = engine.inputNode
2323
- let _ = inputNode // presence check
2324
-
2325
- reconcileAEC(on: engine, reason: "setup-start-prestart", allowRebuild: false)
2326
-
2327
- // if output node voice processing is ever needed, keep commented as in original:
2328
- // do { try engine.outputNode.setVoiceProcessingEnabled(true) } catch { ... }
2329
-
2330
- NSLog("[STT] AEC enable done")
2331
-
2332
- // --- FIXED WIRING: use live format on first hop, nil downstream, nil for tap ---
2333
- let inFmt = engine.inputNode.outputFormat(forBus: 0)
2334
-
2335
- // 1) Mute only the mic path, not the whole main mixer
2336
- let micMixer = AVAudioMixerNode()
2337
- engine.attach(micMixer)
2338
- // Use the live input format for input → micMixer
2339
- engine.connect(inputNode, to: micMixer, format: inFmt)
2340
- // Let main mixer pick downstream format
2341
- engine.connect(micMixer, to: engine.mainMixerNode, format: nil)
2342
- micMixer.outputVolume = 0.0 // ← you won't hear your own mic
2343
-
2344
- // 2) Prepare a player node for TTS inside the SAME engine/graph
2345
- let player = AVAudioPlayerNode()
2346
- self.playbackNode = player
2347
- engine.attach(player)
2348
-
2349
- // // --- Aggressive low-pass only ---
2350
- // let deEss = AVAudioUnitEQ(numberOfBands: 1)
2351
- // let lpf = deEss.bands[0]
2352
- // lpf.filterType = .lowPass
2353
- // lpf.frequency = 6500 // try 6000–7500
2354
- // lpf.bandwidth = 0.35 // fairly steep
2355
- // lpf.gain = 0.0
2356
- // lpf.bypass = false
2357
-
2358
- // self.ttsEQ = deEss
2359
- // engine.attach(deEss)
2360
-
2361
- // engine.disconnectNodeOutput(player)
2362
- // engine.connect(player, to: deEss, format: nil)
2363
- // engine.connect(deEss, to: engine.mainMixerNode, format: nil)
2364
- engine.connect(player, to: engine.mainMixerNode, format: nil)
2365
-
2366
-
2367
- NSLog("[STT] graph connected (mic->mute mixer, player->mainMixer)")
2368
-
2369
- var tapFrames: UInt64 = 0
2370
- // Tap uses nil so it follows the node’s current output format (survives route SR changes)
2371
-
2372
- safeRemoveTap(inputNode, bus: 0)
2373
- let format = inputNode.outputFormat(forBus: 0) // <- prefer explicit format
2374
- guard format.sampleRate > 0, format.channelCount > 0 else {
2375
- NSLog("[STT] skip tap: invalid input format (sr=\(format.sampleRate), ch=\(format.channelCount))")
2376
- return
2377
- }
2378
-
2379
- var tapBufferSize: AVAudioFrameCount = 1024
2380
- if let svStart = speakerVerificationStartConfig {
2381
- do {
2382
- var svConfig = svStart.config
2383
- let routeSampleRate = Int(round(format.sampleRate))
2384
- if useShortSpeakerVerificationTailWindow {
2385
- let forcedTail = max(0.1, shortSpeakerVerificationTailSeconds)
2386
- svConfig.tailSeconds = forcedTail
2387
- if svConfig.maxTailSeconds < forcedTail {
2388
- svConfig.maxTailSeconds = forcedTail
2389
- }
2390
- NSLog("[STT] SV tail override enabled tailSeconds=\(forcedTail)")
2391
- }
2392
-
2393
- speakerVerificationFrameSize = svConfig.frameSize
2394
- speakerVerificationThreshold = svConfig.decisionThreshold
2395
- speakerVerificationFrameSeq = 0
2396
- speakerVerificationSourceSampleRate = routeSampleRate
2397
- speakerVerificationTargetSampleRate = svConfig.sampleRate
2398
- speakerVerificationResampleCarry.removeAll(keepingCapacity: true)
2399
- speakerVerificationResamplePos = 0
2400
- speakerLastPositiveMatchAt = 0
2401
- speakerVerificationInputBuffer.removeAll(keepingCapacity: true)
2402
- setSpeakerGateState(enabled: false, open: false)
2403
- speakerVerificationErrorSent = false
2404
- speakerPreRollBuffers.removeAll(keepingCapacity: true)
2405
- speakerPreRollFrames = 0
2406
- speakerPendingPreRollFlush = false
2407
- speakerPreRollMaxFrames = max(1, Int(round(format.sampleRate * speakerPreRollSeconds)))
2408
-
2409
- svConfig.logLevel = .off
2410
- let svEngine = try SpeakerVerificationEngine(config: svConfig)
2411
- svEngine.setEnrollment(svStart.enrollment)
2412
- svEngine.resetStreamingState()
2413
-
2414
- speakerVerificationEngine = svEngine
2415
- setSpeakerGateState(enabled: true, open: false)
2416
- tapBufferSize = AVAudioFrameCount(max(64, svConfig.frameSize))
2417
- NSLog("[STT] Speaker verification gate enabled frameSize=\(svConfig.frameSize) tailSeconds=\(svConfig.tailSeconds) threshold=\(svConfig.decisionThreshold) hangover=\(useSpeakerGateHangover ? "ON" : "OFF") hangSec=\(String(format: "%.3f", speakerGateHangoverSeconds))")
2418
- if routeSampleRate != svConfig.sampleRate {
2419
- NSLog("[STT] SV resampling enabled \(routeSampleRate)Hz -> \(svConfig.sampleRate)Hz")
2420
- } else {
2421
- NSLog("[STT] SV sampleRate already matched at \(routeSampleRate)Hz")
2422
- }
2423
- } catch {
2424
- speakerVerificationEngine = nil
2425
- speakerVerificationThreshold = 0
2426
- speakerVerificationFrameSeq = 0
2427
- speakerVerificationSourceSampleRate = 0
2428
- speakerVerificationTargetSampleRate = 0
2429
- speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
2430
- speakerVerificationResamplePos = 0
2431
- speakerLastPositiveMatchAt = 0
2432
- setSpeakerGateState(enabled: false, open: true)
2433
- speakerPreRollBuffers.removeAll(keepingCapacity: false)
2434
- speakerPreRollFrames = 0
2435
- speakerPreRollMaxFrames = 0
2436
- speakerPendingPreRollFlush = false
2437
- sendResult(error: ["message": "Speaker verification disabled: \(error.localizedDescription)"],
2438
- bestTranscription: nil,
2439
- transcriptions: nil,
2440
- isFinal: nil)
2441
- }
2442
- }
2443
-
2444
- inputNode.installTap(onBus: 0, bufferSize: tapBufferSize, format: format) { [weak self] buffer, _ in
2445
- // Strongify self once
2446
- guard let self = self else { return }
2447
- // ✅ Count frames globally so the watchdog can see forward progress
2448
- self.tapFramesTotal &+= UInt64(buffer.frameLength)
2449
-
2450
- if self.tapFramesTotal % (44100 * 2) < 1024 { // ~every 2s at 44.1k
2451
- NSLog("[STT] tap alive, totalFrames=\(self.tapFramesTotal)")
2452
- }
2453
-
2454
- let frames: vDSP_Length = vDSP_Length(buffer.frameLength)
2455
- let LEVEL_LOWPASS_TRIG: Float = 0.5
2456
-
2457
- // CH0
2458
- if buffer.format.channelCount > 0, let ch0 = buffer.floatChannelData?[0] {
2459
- var peak0: Float = 0
2460
- vDSP_maxmgv(ch0, 1, &peak0, frames)
2461
- let db0: Float = (peak0 == 0) ? -100 : 20.0 * log10f(peak0)
2462
-
2463
- let smoothed0 = LEVEL_LOWPASS_TRIG * db0
2464
- + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel0
2465
- self.averagePowerForChannel0 = smoothed0
2466
- self.averagePowerForChannel1 = smoothed0
2467
- }
2468
-
2469
- // CH1
2470
- if buffer.format.channelCount > 1, let ch1 = buffer.floatChannelData?[1] {
2471
- var peak1: Float = 0
2472
- vDSP_maxmgv(ch1, 1, &peak1, frames)
2473
- let db1: Float = (peak1 == 0) ? -100 : 20.0 * log10f(peak1)
2474
-
2475
- let smoothed1 = LEVEL_LOWPASS_TRIG * db1
2476
- + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel1
2477
- self.averagePowerForChannel1 = smoothed1
2478
- }
2479
-
2480
- // Normalize 0–10 and emit
2481
- self.averagePowerForChannel1 = Float(self._normalizedPowerLevelFromDecibels(CGFloat(self.averagePowerForChannel1)) * 10.0)
2482
- let value = self.averagePowerForChannel1
2483
- self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": value])
2484
-
2485
- if self.currentSpeakerGateState().enabled, let ch0 = buffer.floatChannelData?[0] {
2486
- let mono = Array(UnsafeBufferPointer(start: ch0, count: Int(buffer.frameLength)))
2487
- self.processSpeakerVerificationSamples(mono)
2488
- }
2489
-
2490
- // Append to recognition
2491
- let gate = self.currentSpeakerGateState()
2492
- if !gate.enabled {
2493
- self.recognitionRequest?.append(buffer)
2494
- NSLog("[STT][SV][TAP] samples=\(buffer.frameLength) gate=DISABLED action=append")
2495
- } else if gate.open {
2496
- self.flushSpeakerPreRollIfNeeded()
2497
- self.recognitionRequest?.append(buffer)
2498
- NSLog("[STT][SV][TAP] samples=\(buffer.frameLength) gate=OPEN action=append preRollFrames=\(self.currentSpeakerPreRollFrames())")
2499
- } else {
2500
- self.enqueueSpeakerPreRoll(buffer)
2501
- NSLog("[STT][SV][TAP] samples=\(buffer.frameLength) gate=CLOSED action=buffer preRollFrames=\(self.currentSpeakerPreRollFrames())")
2502
- }
2503
-
2504
- // inside inputNode.installTap { buffer, _ in
2505
- self.lastBufferAt = CACurrentMediaTime()
2506
- }
2507
-
2508
- engine.prepare()
2509
- NSLog("[STT] audioEngine prepare")
2510
- var audioSessionError: NSError?
2511
- do {
2512
- try engine.start()
2513
- armFirstIOCycleLatch(on: engine)
2514
- reconcileAEC(on: engine, reason: "setup-start-poststart", allowRebuild: false)
2515
- scheduleAECReconcileRetries(reason: "setup-start")
2516
- } catch {
2517
- audioSessionError = error as NSError
2518
- }
2519
-
2520
- // after engine.start() success:
2521
- engineHotAt = CACurrentMediaTime()
2522
- seenRealSpeech = false
2523
- NSLog("engine HOT at \(engineHotAt)")
2524
- sendEvent(name: "onSpeechStart", body: nil) // engine hot signal (keep if you want)
2525
- startTask(makeFreshRequest())
2526
-
2527
- installPlaybackHooks()
2528
-
2529
- startWatchdog()
2530
-
2531
- NSLog("audioEngine startAndReturnError")
2532
- if let audioSessionError = audioSessionError {
2533
- NotificationCenter.default.addObserver(self,
2534
- selector: #selector(self.handleEngineConfigChange(_:)),
2535
- name: .AVAudioEngineConfigurationChange,
2536
- object: engine)
2537
- NSLog("audioEngine audioSessionError!=nil")
2538
- self.sendResult(error: ["code": "audio", "message": audioSessionError.localizedDescription],
2539
- bestTranscription: nil, transcriptions: nil, isFinal: nil)
2540
- NSLog("[STT] self sendResult")
2541
- // self.teardown()
2542
- NSLog("[STT] Removed self teardown")
2543
- return
2544
- }
2545
- NSLog("After Start recording and append recording")
2546
- DispatchQueue.main.asyncAfter(deadline: .now() + 3.0) { [weak self] in
2547
- guard let self = self else { return }
2548
- let running = self.audioEngine?.isRunning ?? false
2549
- let taskState = self.recognitionTask?.state.rawValue ?? -1
2550
- NSLog("[STT] health: engineRunning=\(running) taskState=\(taskState)")
2551
- }
2552
-
2553
- NSLog("After if audioSessionError != nil")
2554
- } catch let e as NSError {
2555
- sendResult(error: ["code": "start_recording", "message": e.localizedDescription],
2556
- bestTranscription: nil, transcriptions: nil, isFinal: nil)
2557
- NSLog("End of init...")
2558
- return
2559
- }
2560
- }
2561
-
2562
/// Loads a serialized speaker enrollment from disk and wraps it in a start config.
/// - Parameter onboardingJsonPath: Absolute path to the enrollment JSON produced by onboarding.
/// - Throws: Any error from reading the file or from `SpeakerEnrollment.deserialize`.
private func loadSpeakerVerificationStartConfig(onboardingJsonPath: String) throws -> SpeakerVerificationStartConfig {
    let fileURL = URL(fileURLWithPath: onboardingJsonPath)
    let rawBytes = try Data(contentsOf: fileURL)
    let enrollment = try SpeakerEnrollment.deserialize(rawBytes)
    // The enrollment carries a snapshot of the SV config it was created with.
    return SpeakerVerificationStartConfig(enrollment: enrollment, config: enrollment.configSnapshot)
}
2567
-
2568
/// Returns a consistent snapshot of the speaker-gate flags under the state lock.
private func currentSpeakerGateState() -> (enabled: Bool, open: Bool) {
    speakerVerificationStateLock.lock()
    defer { speakerVerificationStateLock.unlock() }
    return (speakerGateEnabled, speakerGateOpen)
}
2574
-
2575
/// Atomically updates the speaker-gate flags, arming a pre-roll flush on the
/// transition into enabled+open, and logs only when either flag actually changed.
private func setSpeakerGateState(enabled: Bool, open: Bool) {
    speakerVerificationStateLock.lock()
    let previous = (enabled: speakerGateEnabled, open: speakerGateOpen)
    speakerGateEnabled = enabled
    speakerGateOpen = open
    let changed = previous.open != open || previous.enabled != enabled
    // Entering the enabled+open state from any other state means audio buffered
    // while the gate was closed should be forwarded to recognition.
    if enabled, open, !(previous.enabled && previous.open) {
        speakerPendingPreRollFlush = true
    }
    // Disabling the gate cancels any pending flush.
    if !enabled {
        speakerPendingPreRollFlush = false
    }
    speakerVerificationStateLock.unlock()
    if changed {
        NSLog("[STT][SV][GATE] enabled=\(enabled ? "YES" : "NO") open=\(open ? "YES" : "NO") th=\(speakerVerificationThreshold)")
    }
}
2593
-
2594
/// Returns the number of frames currently buffered in the pre-roll ring,
/// read under the state lock.
private func currentSpeakerPreRollFrames() -> Int {
    speakerVerificationStateLock.lock()
    defer { speakerVerificationStateLock.unlock() }
    return speakerPreRollFrames
}
2600
-
2601
/// Appends a deep copy of `buffer` to the closed-gate pre-roll ring, then trims
/// the oldest buffers until the total frame count is back under budget.
private func enqueueSpeakerPreRoll(_ buffer: AVAudioPCMBuffer) {
    speakerVerificationStateLock.lock()
    defer { speakerVerificationStateLock.unlock() }
    // A zero budget means pre-roll is disabled; a failed copy is silently dropped.
    guard speakerPreRollMaxFrames > 0, let snapshot = copyPCMBuffer(buffer) else { return }
    speakerPreRollBuffers.append(snapshot)
    speakerPreRollFrames += Int(snapshot.frameLength)
    // Evict from the front (oldest audio first) until we fit the budget.
    while speakerPreRollFrames > speakerPreRollMaxFrames && !speakerPreRollBuffers.isEmpty {
        let evicted = speakerPreRollBuffers.removeFirst()
        speakerPreRollFrames -= Int(evicted.frameLength)
    }
}
2614
-
2615
/// Forwards buffered pre-roll audio to the recognizer once the gate has opened.
/// In legacy mode everything buffered is flushed; otherwise only the newest
/// `speakerPreRollFlushMaxSeconds` worth of audio is kept. The appends happen
/// outside the state lock to keep the lock hold time short.
private func flushSpeakerPreRollIfNeeded() {
    var pending: [AVAudioPCMBuffer] = []
    var bufferedFrames = 0
    var flushedFrames = 0

    speakerVerificationStateLock.lock()
    if speakerPendingPreRollFlush {
        bufferedFrames = speakerPreRollFrames
        if useLegacySpeakerGateBehavior {
            pending = speakerPreRollBuffers
        } else {
            let sampleRate = max(1, speakerVerificationSourceSampleRate)
            let frameBudget = max(1, Int(round(Double(sampleRate) * speakerPreRollFlushMaxSeconds)))
            if bufferedFrames <= frameBudget {
                pending = speakerPreRollBuffers
            } else {
                // Walk backwards so only the newest audio is kept, then restore
                // chronological order for the recognizer.
                var newest: [AVAudioPCMBuffer] = []
                var newestFrames = 0
                for candidate in speakerPreRollBuffers.reversed() {
                    newest.append(candidate)
                    newestFrames += Int(candidate.frameLength)
                    if newestFrames >= frameBudget { break }
                }
                pending = newest.reversed()
            }
        }
        flushedFrames = pending.reduce(0) { $0 + Int($1.frameLength) }
        // Reset the ring; the flush request is one-shot.
        speakerPreRollBuffers.removeAll(keepingCapacity: false)
        speakerPreRollFrames = 0
        speakerPendingPreRollFlush = false
    }
    speakerVerificationStateLock.unlock()

    guard !pending.isEmpty else { return }
    NSLog("[STT][SV][PREROLL] flushing buffers=\(pending.count) frames=\(flushedFrames) totalBuffered=\(bufferedFrames)")
    for chunk in pending {
        recognitionRequest?.append(chunk)
    }
}
2653
-
2654
/// Deep-copies a PCM buffer so tapped audio can outlive the tap callback.
/// Supports float32, int16 and int32 channel layouts; returns nil for any
/// other layout or when a buffer of the same format cannot be allocated.
private func copyPCMBuffer(_ source: AVAudioPCMBuffer) -> AVAudioPCMBuffer? {
    guard let clone = AVAudioPCMBuffer(pcmFormat: source.format, frameCapacity: source.frameLength) else {
        return nil
    }
    clone.frameLength = source.frameLength
    let channelCount = Int(source.format.channelCount)
    let frameCount = Int(source.frameLength)

    // Copies frameCount samples per channel between parallel channel pointer arrays.
    func blit<Sample>(_ from: UnsafePointer<UnsafeMutablePointer<Sample>>,
                      _ to: UnsafePointer<UnsafeMutablePointer<Sample>>) {
        let byteCount = frameCount * MemoryLayout<Sample>.size
        for channel in 0..<channelCount {
            memcpy(to[channel], from[channel], byteCount)
        }
    }

    if let src = source.floatChannelData, let dst = clone.floatChannelData {
        blit(src, dst)
        return clone
    }
    if let src = source.int16ChannelData, let dst = clone.int16ChannelData {
        blit(src, dst)
        return clone
    }
    if let src = source.int32ChannelData, let dst = clone.int32ChannelData {
        blit(src, dst)
        return clone
    }
    return nil
}
2688
-
2689
/// Linearly resamples `input` from the tap's source rate
/// (`speakerVerificationSourceSampleRate`) to the SV engine's target rate
/// (`speakerVerificationTargetSampleRate`), carrying trailing samples and the
/// fractional read position across calls so chunk boundaries stay continuous.
/// Returns `input` unchanged when either rate is unset or the rates match.
/// NOTE(review): the carry/position state appears to be touched only from
/// `speakerVerificationQueue` — confirm no other queue calls this.
private func resampleSamplesForSpeakerVerificationIfNeeded(_ input: [Float]) -> [Float] {
    guard !input.isEmpty else { return [] }
    let srcRate = speakerVerificationSourceSampleRate
    let dstRate = speakerVerificationTargetSampleRate
    // Rates not configured yet: pass audio through untouched.
    guard srcRate > 0, dstRate > 0 else { return input }
    if srcRate == dstRate { return input }

    let ratio = Double(srcRate) / Double(dstRate) // source samples per output sample
    // Prepend the samples carried over from the previous call so interpolation
    // can straddle the chunk boundary.
    let source = speakerVerificationResampleCarry + input
    guard source.count >= 2 else {
        // Not enough material to interpolate even one output sample; stash it.
        speakerVerificationResampleCarry = source
        return []
    }

    var out: [Float] = []
    out.reserveCapacity(Int(Double(input.count) * Double(dstRate) / Double(srcRate)) + 8)

    // Walk the source at fractional steps of `ratio`, emitting one linearly
    // interpolated sample per step.
    var pos = speakerVerificationResamplePos
    while pos + 1.0 < Double(source.count) {
        let i = Int(pos)
        let frac = Float(pos - Double(i))
        let a = source[i]
        let b = source[i + 1]
        out.append(a + (b - a) * frac)
        pos += ratio
    }

    // Keep the sample preceding the current position (and everything after it)
    // for the next call, and re-base the fractional position onto that slice.
    let keepStart = max(0, Int(floor(pos)) - 1)
    speakerVerificationResampleCarry = Array(source[keepStart...])
    speakerVerificationResamplePos = pos - Double(keepStart)
    return out
}
2721
-
2722
/// Feeds tapped mono samples through the speaker-verification engine on a
/// dedicated serial queue. Samples are resampled to the engine's rate,
/// accumulated, and processed in fixed-size frames; each result drives the
/// speaker gate (directly, or via the hangover window when enabled). On an
/// engine error, all SV state is torn down, the gate is left open so
/// recognition keeps working, and a single error event is emitted.
private func processSpeakerVerificationSamples(_ samples: [Float]) {
    guard !samples.isEmpty else { return }
    speakerVerificationQueue.async { [weak self] in
        guard let self = self else { return }
        // Engine may have been torn down between tap callback and this hop.
        guard let engine = self.speakerVerificationEngine else { return }
        let frameSize = self.speakerVerificationFrameSize
        guard frameSize > 0 else { return }

        // Convert to the engine's sample rate; may return [] while the
        // resampler accumulates enough input.
        let normalized = self.resampleSamplesForSpeakerVerificationIfNeeded(samples)
        if normalized.isEmpty { return }
        self.speakerVerificationInputBuffer.append(contentsOf: normalized)

        // Drain the accumulator one full frame at a time.
        while self.speakerVerificationInputBuffer.count >= frameSize {
            let frame = Array(self.speakerVerificationInputBuffer.prefix(frameSize))
            self.speakerVerificationInputBuffer.removeFirst(frameSize)
            // Wrapping increment: seq is only used for log correlation.
            self.speakerVerificationFrameSeq &+= 1
            let seq = self.speakerVerificationFrameSeq

            do {
                let out = try engine.processFrame(frame: frame)
                switch out {
                case .pending(let p):
                    // Engine still buffering toward a decision; log progress only.
                    let gate = self.currentSpeakerGateState()
                    NSLog("[STT][SV][FRAME #\(seq)] pending buffered=\(p.bufferedSamples) neededSec=\(p.neededSeconds) gate=\(gate.open ? "OPEN" : "CLOSED") th=\(self.speakerVerificationThreshold)")
                case .result(let result):
                    if self.useLegacySpeakerGateBehavior || !self.useSpeakerGateHangover {
                        // Gate tracks the match decision directly.
                        self.setSpeakerGateState(enabled: true, open: result.isMatch)
                    } else {
                        // Hangover mode: a miss keeps the gate open for a grace
                        // window after the last positive match.
                        let now = CACurrentMediaTime()
                        if result.isMatch {
                            self.speakerLastPositiveMatchAt = now
                            self.setSpeakerGateState(enabled: true, open: true)
                        } else {
                            let keepOpen = self.speakerLastPositiveMatchAt > 0 &&
                            (now - self.speakerLastPositiveMatchAt) <= max(0, self.speakerGateHangoverSeconds)
                            self.setSpeakerGateState(enabled: true, open: keepOpen)
                        }
                    }
                    let gate = self.currentSpeakerGateState()
                    NSLog("[STT][SV][FRAME #\(seq)] scoreBest=\(String(format: "%.4f", result.scoreBest)) raw=\(String(format: "%.4f", result.scoreBestRaw)) meancombo=\(String(format: "%.4f", result.scoreBestMeancombo)) mean=\(String(format: "%.4f", result.scoreMean)) match=\(result.isMatch ? "YES" : "NO") gate=\(gate.open ? "OPEN" : "CLOSED") th=\(String(format: "%.4f", self.speakerVerificationThreshold)) hangover=\(self.useSpeakerGateHangover ? "ON" : "OFF") hangSec=\(String(format: "%.3f", self.speakerGateHangoverSeconds))")
                }
            } catch {
                // Engine failure: disable SV entirely and leave the gate OPEN so
                // recognition continues without speaker filtering.
                self.speakerVerificationEngine = nil
                self.speakerVerificationInputBuffer.removeAll(keepingCapacity: false)
                self.speakerVerificationSourceSampleRate = 0
                self.speakerVerificationTargetSampleRate = 0
                self.speakerVerificationResampleCarry.removeAll(keepingCapacity: false)
                self.speakerVerificationResamplePos = 0
                self.speakerLastPositiveMatchAt = 0
                self.setSpeakerGateState(enabled: false, open: true)
                // Pre-roll state is shared with the tap thread; clear under lock.
                self.speakerVerificationStateLock.lock()
                self.speakerPreRollBuffers.removeAll(keepingCapacity: false)
                self.speakerPreRollFrames = 0
                self.speakerPreRollMaxFrames = 0
                self.speakerVerificationStateLock.unlock()
                // Emit the error at most once per session.
                if !self.speakerVerificationErrorSent {
                    self.speakerVerificationErrorSent = true
                    DispatchQueue.main.async { [weak self] in
                        self?.sendResult(error: ["message": "Speaker verification stopped: \(error.localizedDescription)"],
                                         bestTranscription: nil,
                                         transcriptions: nil,
                                         isFinal: nil)
                    }
                }
                return
            }
        }
    }
}
2791
-
2792
// MARK: - Helpers

/// Maps a peak level in decibels to a normalized 0…1 loudness value.
/// Levels below the −80 dB floor — and the exact 0.0 sentinel used for
/// "no signal" — map to 0; the result is clamped to at most 1.
private func _normalizedPowerLevelFromDecibels(_ decibels: CGFloat) -> CGFloat {
    guard decibels >= -80.0, decibels != 0.0 else { return 0.0 }
    let floorDb: Float = -80.0
    let minAmp = powf(10.0, 0.05 * floorDb)
    let amp = powf(10.0, 0.05 * Float(decibels))
    // Rescale amplitude into [0, 1] relative to the floor, then take the square
    // root to flatten the response curve.
    let level = powf((amp - minAmp) * (1.0 / (1.0 - minAmp)), 1.0 / 2.0)
    return min(CGFloat(level), 1.0)
}
2801
-
2802
/// Forwards a named event payload to the delegate, if one is attached.
private func sendEvent(name: String, body: [String: Any]?) {
    guard let target = delegate else { return }
    target.stt(self, didEmitEvent: name, body: body)
}
2805
-
2806
/// Exact event behavior preserved from ObjC `sendResult`.
/// Emits up to four delegate events, one per non-nil argument, in a fixed
/// order; all output is suppressed while the mic or recognition is paused.
private func sendResult(error: [String: Any]?,
                        bestTranscription: String?,
                        transcriptions: [String]?,
                        isFinal: Bool?) {
    NSLog("[STT] sendResult called")
    // Suppression: log which pause flag(s) caused it, then bail.
    switch (micPaused, speechRecognitionPaused) {
    case (true, true):
        NSLog("[STT] sendResult suppressed: micPaused + speechRecognitionPaused")
        return
    case (true, false):
        NSLog("[STT] sendResult suppressed: micPaused")
        return
    case (false, true):
        NSLog("[STT] sendResult suppressed: speechRecognitionPaused")
        return
    case (false, false):
        break
    }

    if let error = error {
        sendEvent(name: "onSpeechError", body: ["error": error])
    }
    if let best = bestTranscription {
        sendEvent(name: "onSpeechResults", body: ["value": [best]])
    }
    if let trans = transcriptions {
        sendEvent(name: "onSpeechPartialResults", body: ["value": trans])
    }
    if let isFinal = isFinal {
        sendEvent(name: "onSpeechRecognized", body: ["isFinal": isFinal])
    }
}
2836
-
2837
// MARK: - SFSpeechRecognizerDelegate

/// Delegate callback: emits an error event when recognition becomes
/// unavailable. No event is emitted when availability is restored.
public func speechRecognizer(_ speechRecognizer: SFSpeechRecognizer, availabilityDidChange available: Bool) {
    guard !available else { return }
    sendResult(error: ["message": "Speech recognition is not available now"],
               bestTranscription: nil, transcriptions: nil, isFinal: nil)
}
2845
-
2846
// MARK: - Small helper to recreate recognizer (used by watchdog)

/// Tears down and recreates the `SFSpeechRecognizer`, preserving the previous
/// instance's locale when one exists. Called by the watchdog after a stall.
private func recreateSpeechRecognizerPreservingLocale() {
    let loc = speechRecognizer?.locale
    // Fix: bind the optional instead of the `loc != nil ? f(loc!) : g()` ternary,
    // which force-unwrapped. `SFSpeechRecognizer(locale:)` is failable, so the
    // property may legitimately end up nil either way.
    if let loc = loc {
        speechRecognizer = SFSpeechRecognizer(locale: loc)
    } else {
        speechRecognizer = SFSpeechRecognizer()
    }
    speechRecognizer?.delegate = self
    NSLog("[STT] recreated SFSpeechRecognizer (locale preserved: \(loc?.identifier ?? "default"))")
}
2853
- }