react-native-davoice-tts 1.0.209 → 1.0.211

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (21)
  1. package/TTSRNBridge.podspec +1 -1
  2. package/ios/SpeechBridge/SpeechBridge.m +7 -5
  3. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift__ +1329 -0
  4. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DavoiceTTS +0 -0
  5. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Headers/DavoiceTTS-Swift.h +2 -2
  6. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.abi.json +1356 -1350
  7. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.private.swiftinterface +12 -12
  8. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.swiftinterface +12 -12
  9. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift__ +1329 -0
  10. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DavoiceTTS +0 -0
  11. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Headers/DavoiceTTS-Swift.h +4 -4
  12. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.abi.json +2004 -1998
  13. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.private.swiftinterface +30 -30
  14. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.swiftinterface +30 -30
  15. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.abi.json +2004 -1998
  16. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.private.swiftinterface +30 -30
  17. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.swiftinterface +30 -30
  18. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeDirectory +0 -0
  19. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeRequirements-1 +0 -0
  20. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeResources +42 -27
  21. package/package.json +1 -1
@@ -0,0 +1,1329 @@
1
+ // STT.swift
2
+ // Native iOS Swift version (AEC flow preserved 1:1)
3
+
4
+ import Foundation
5
+ import UIKit
6
+ import Speech
7
+ import Accelerate
8
+ import AVFAudio // or import AVFoundation
9
+
10
+ @objc public protocol STTDelegate: AnyObject {
11
+ @objc func stt(_ stt: STT, didEmitEvent name: String, body: [String: Any]?)
12
+ }
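(For context, not part of the published diff: a minimal sketch of how a host object might adopt STTDelegate and forward the emitted events to JavaScript. The EventForwarder type and its emit closure are illustrative assumptions, not package API.)

    // Illustrative only — not shipped in this package.
    final class EventForwarder: NSObject, STTDelegate {
        // Assumed host-side hook, e.g. wired to an event emitter's send method.
        var emit: ((String, [String: Any]?) -> Void)?

        func stt(_ stt: STT, didEmitEvent name: String, body: [String: Any]?) {
            // Forward every event name listed in STT.supportedEvents.
            emit?(name, body)
        }
    }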
13
+
14
+ @objcMembers
15
+ public final class STT: NSObject, SFSpeechRecognizerDelegate {
16
+ public weak var delegate: STTDelegate?
17
+ public var continuous: Bool = true
18
+
19
+ // MARK: - Private
20
+ private var speechRecognizer: SFSpeechRecognizer?
21
+ private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
22
+ private var audioEngine: AVAudioEngine?
23
+ private var recognitionTask: SFSpeechRecognitionTask?
24
+ private var audioSession: AVAudioSession?
25
+ private var isTearingDown: Bool = false
26
+ private var sessionId: String?
27
+ private var priorAudioCategory: AVAudioSession.Category?
28
+ private var averagePowerForChannel0: Float = 0
29
+ private var averagePowerForChannel1: Float = 0
30
+
31
+ private var playbackNode: AVAudioPlayerNode?
32
+ private var seenRealSpeech = false // flips true after first non-blank token
33
+ private var engineHotAt: CFTimeInterval = 0 // when engine actually started
34
+ private let warmupKeepAlive: CFTimeInterval = 4.0 // seconds we’ll keep re-arming in silence
35
+
36
+ // Keep-engine-alive helpers
37
+ private var lastReclaimAttempt: CFAbsoluteTime = 0
38
+ private let reclaimCooldown: CFTimeInterval = 1.0
39
+
40
+ // --- Task health ---
41
+ private var lastBufferAt: CFTimeInterval = 0 // updated from tap
42
+ private var lastResultAt: CFTimeInterval = 0 // updated from recognition callback
43
+ private var lastTaskStartAt: CFTimeInterval = 0
44
+ private var stallWatchdog: Timer?
45
+ private var consecutiveStallCount = 0
46
+ private let stallThreshold: CFTimeInterval = 8.0 // seconds w/o results while engine is hot
47
+ private let rearmCooldownTask: CFTimeInterval = 2.0
48
+ private var lastRearmAt: CFTimeInterval = 0
49
+ private var engineHot = false
50
+ private var hotAt: CFTimeInterval = 0
51
+ private var booting = false
52
+ private var reqConsumed = false // tracks if a request has been used to start a task
53
+
54
+ // --- Recovery & diagnostics ---
55
+ private var recoverySeq = 0
56
+ private var lastRecoveryAt: CFTimeInterval = 0
57
+ private var lastTaskOrigin: String = "cold"
58
+ // --- Mic pause state ---
59
+ private var savedSessionBeforePause: (
60
+ category: AVAudioSession.Category,
61
+ mode: AVAudioSession.Mode,
62
+ options: AVAudioSession.CategoryOptions,
63
+ sr: Double,
64
+ inCh: Int,
65
+ outCh: Int,
66
+ ioDur: TimeInterval
67
+ )?
68
+
69
+ private(set) var sttActive = false
70
+
71
+ // partial cadence monitor
72
+ private var emaPartialGap: Double = 0 // exponential moving average of time between partials
73
+ private let emaAlpha: Double = 0.3
74
+ // IO-cycle latch state
75
+ private var ioLatchActiveGen: UInt64 = 0
76
+
77
+ // TTS probe state
78
+ private var mixerProbeActive = false
79
+ private var mixerProbeCompletions: [(Bool) -> Void] = []
80
+ private let ttsSerial = DispatchQueue(label: "stt.tts.serial") // serialize TTS schedule/play
81
+ private var engineHasRenderedOnce = false
82
+ // Last-used locale & cold-restart state
83
+ private var lastLocaleStr: String?
84
+ private var restartInFlight = false
85
+
86
+ private func restartSpeechCold(_ reason: String) {
87
+ guard !restartInFlight else { return }
88
+ restartInFlight = true
89
+ NSLog("[STT] 🔁 cold-restart (\(reason))")
90
+
91
+ // Full teardown, brief breather, then fresh start with same locale
92
+ teardown()
93
+ let locale = lastLocaleStr
94
+ DispatchQueue.main.asyncAfter(deadline: .now() + 0.25) { [weak self] in
95
+ guard let self = self else { return }
96
+ self.restartInFlight = false
97
+ self.setupAndStartRecognizing(localeStr: locale)
98
+ }
99
+ }
100
+
101
+ // MARK: - Event names (unchanged)
102
+ public static let supportedEvents: [String] = [
103
+ "onSpeechResults",
104
+ "onSpeechStart",
105
+ "onSpeechPartialResults",
106
+ "onSpeechError",
107
+ "onSpeechEnd",
108
+ "onSpeechRecognized",
109
+ "onSpeechVolumeChanged"
110
+ ]
111
+
112
+ private var graphGen: UInt64 = 0
113
+ @inline(__always) private func bumpGraphGen() { graphGen &+= 1; ioLatchActiveGen = 0 }
114
+
115
+ @inline(__always)
116
+ private func safeRemoveTap(_ node: AVAudioNode?, bus: AVAudioNodeBus = 0) {
117
+ guard let n = node, n.engine != nil else { return } // only remove if still attached
118
+ n.removeTap(onBus: bus) // removeTap(onBus:) does not throw
119
+ }
120
+
121
+ // MARK: - Public API (native replacements for the former RCT methods)
122
+
123
+ public func isSpeechAvailable(_ completion: @escaping (Bool) -> Void) {
124
+ SFSpeechRecognizer.requestAuthorization { status in
125
+ switch status {
126
+ case .authorized: completion(true)
127
+ default: completion(false)
128
+ }
129
+ }
130
+ }
131
+
132
+ private func armFirstIOCycleLatch(on engine: AVAudioEngine) {
133
+ engineHasRenderedOnce = false
134
+ let gen = graphGen
135
+
136
+ // Prevent overlapping latches against the same graph generation.
137
+ if ioLatchActiveGen == gen { return }
138
+ ioLatchActiveGen = gen
139
+
140
+ DispatchQueue.main.async { [weak self, weak engine] in
141
+ guard let self = self, let eng = engine, gen == self.graphGen else { return }
142
+ let out = eng.outputNode
143
+ var fired = false
144
+
145
+ // >>> IMPORTANT: ensure no previous tap is left behind
146
+ self.safeRemoveTap(out, bus: 0)
147
+
148
+ out.installTap(onBus: 0, bufferSize: 128, format: nil) { [weak self, weak out] _, _ in
149
+ guard let self = self, gen == self.graphGen else { return }
150
+ if fired { return }
151
+ fired = true
152
+ self.safeRemoveTap(out, bus: 0)
153
+ self.engineHasRenderedOnce = true
154
+ // latch finished for this gen
155
+ if self.ioLatchActiveGen == gen { self.ioLatchActiveGen = 0 }
156
+ }
157
+
158
+ DispatchQueue.main.asyncAfter(deadline: .now() + 2.0) { [weak self, weak out] in
159
+ guard let self = self, gen == self.graphGen else { return }
160
+ if fired { return }
161
+ self.safeRemoveTap(out, bus: 0)
162
+ self.engineHasRenderedOnce = true // fail-open
163
+ if self.ioLatchActiveGen == gen { self.ioLatchActiveGen = 0 }
164
+ }
165
+ }
166
+ }
167
+
168
+ public func isRecognizing() -> Bool {
169
+ guard let task = recognitionTask else { return false }
170
+ return task.state == .running
171
+ }
172
+
173
+ private func ensurePlaybackNode(in engine: AVAudioEngine) -> AVAudioPlayerNode {
174
+ // If we have a node but it's tied to a different engine or got disconnected, recreate it.
175
+ if let p = playbackNode, p.engine === engine {
176
+ return p
177
+ }
178
+ let p = AVAudioPlayerNode()
179
+ playbackNode = p
180
+ engine.attach(p)
181
+ // Connect with nil format so the mixer does SRC if needed
182
+ engine.connect(p, to: engine.mainMixerNode, format: nil)
183
+ return p
184
+ }
185
+
186
+ private func startWatchdog() {
187
+ stallWatchdog?.invalidate()
188
+ stallWatchdog = Timer.scheduledTimer(withTimeInterval: 2.0, repeats: true) { [weak self] _ in
189
+ self?.checkTaskHealth()
190
+ }
191
+ RunLoop.main.add(stallWatchdog!, forMode: .common)
192
+ }
193
+
194
+ private func stopWatchdog() {
195
+ stallWatchdog?.invalidate()
196
+ stallWatchdog = nil
197
+ }
198
+
199
+ // MARK: - Public pause/unpause API
200
+ @objc public func pauseMicrophone() {
201
+ DispatchQueue.main.async {
202
+
203
+ self.teardown() // full cleanup of recognizer + engine
204
+
205
+ NSLog("[STT] 🔇 Microphone paused (clean teardown; TTS in playback mode)")
206
+ }
207
+ }
208
+
209
+ @objc public func unPauseMicrophone() {
210
+ DispatchQueue.main.async {
211
+ self.booting = true
212
+ // Cold start through your existing, stable path
213
+ self.startSpeech(localeStr: self.lastLocaleStr)
214
+ self.booting = false
215
+
216
+ NSLog("[STT] 🎙️ Microphone unpaused (clean restart via startSpeech)")
217
+ }
218
+ }
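(Usage sketch, illustrative only: muting capture around a TTS utterance with the pause/unpause pair above. The playTTS call and its completion block are hypothetical host-side code, not package API.)

    // Illustrative only — host app code around a TTS utterance.
    stt.pauseMicrophone()            // full teardown of recognizer + engine
    playTTS("Hello there") {         // hypothetical TTS call with a completion block
        stt.unPauseMicrophone()      // cold restart via startSpeech(lastLocaleStr)
    }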
219
+
220
+ private func rearmTask(reason: String) {
221
+ if self.booting { return } // ignore while (re)booting
222
+ // Cancel old task only — keep the engine and tap running.
223
+ recognitionTask?.cancel()
224
+ recognitionTask = nil
225
+
226
+ seenRealSpeech = false
227
+ lastTaskStartAt = CACurrentMediaTime()
228
+ startTask(makeFreshRequest())
229
+ NSLog("[STT] rearmTask(\(reason)) -> new task started")
230
+ }
231
+
232
+ private func checkTaskHealth() {
233
+ if self.booting { return } // ignore while (re)booting
234
+ guard let engine = audioEngine else { return }
235
+ let now = CACurrentMediaTime()
236
+
237
+ // Engine down? Let your existing logic handle it; just bail.
238
+ if !engine.isRunning { return }
239
+
240
+ // If recognizer is globally unavailable, don’t thrash — wait until it flips back.
241
+ if let rec = speechRecognizer, rec.isAvailable == false {
242
+ NSLog("[STT] watchdog: recognizer unavailable; waiting…")
243
+ return
244
+ }
245
+
246
+ // No task at all? Spin one up.
247
+ if recognitionTask == nil {
248
+ if now - lastRearmAt > rearmCooldownTask {
249
+ NSLog("[STT] watchdog: no task -> start fresh request")
250
+ lastRearmAt = now
251
+ startTask(makeFreshRequest())
252
+ }
253
+ return
254
+ }
255
+
256
+ // If we’ve had buffers recently but no results for a while, assume the task is stuck.
257
+ let noResultsFor = now - lastResultAt
258
+ let hadRecentAudio = (now - lastBufferAt) < max(2.0, stallThreshold) // tap is alive
259
+
260
+ if hadRecentAudio && noResultsFor > stallThreshold {
261
+ if now - lastRearmAt > rearmCooldownTask {
262
+ consecutiveStallCount += 1
263
+ NSLog("[STT] watchdog: stall detected (no results for \(Int(noResultsFor))s, audio flowing). rearm #\(consecutiveStallCount)")
264
+
265
+ rearmTask(reason: "watchdog-stall")
266
+ lastRearmAt = now
267
+
268
+ // If we stall repeatedly, recreate the recognizer itself (server/session could be hosed)
269
+ if consecutiveStallCount >= 3 {
270
+ recreateSpeechRecognizerPreservingLocale()
271
+ consecutiveStallCount = 0
272
+ }
273
+ }
274
+ } else if hadRecentAudio {
275
+ // Healthy path: audio & results are flowing; reset stall counter
276
+ consecutiveStallCount = 0
277
+ }
278
+ }
279
+
280
+ public func startSpeech(localeStr: String?) {
281
+ NSLog("[STT] startSpeech(locale=\(localeStr ?? "nil"))")
282
+ lastLocaleStr = localeStr
283
+
284
+ if recognitionTask != nil {
285
+ sendResult(error: ["code": "already_started", "message": "Speech recognition already started!"],
286
+ bestTranscription: nil, transcriptions: nil, isFinal: nil)
287
+ return
288
+ }
289
+
290
+ SFSpeechRecognizer.requestAuthorization { [weak self] status in
291
+ guard let self = self else { return }
292
+ switch status {
293
+ case .notDetermined:
294
+ self.sendResult(error: ["message": "Speech recognition not yet authorized"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
295
+ case .denied:
296
+ self.sendResult(error: ["message": "User denied access to speech recognition"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
297
+ case .restricted:
298
+ self.sendResult(error: ["message": "Speech recognition restricted on this device"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
299
+ case .authorized:
300
+ self.setupAndStartRecognizing(localeStr: localeStr)
301
+ @unknown default:
302
+ self.sendResult(error: ["message": "Unknown authorization status"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
303
+ }
304
+ }
305
+ }
306
+
307
+ public func stopSpeech(_ completion: ((Bool) -> Void)? = nil) {
308
+ NSLog("[STT] stopSpeech() requested by app")
309
+ recognitionTask?.finish()
310
+ completion?(false)
311
+ }
312
+
313
+ public func cancelSpeech(_ completion: ((Bool) -> Void)? = nil) {
314
+ NSLog("[STT] cancelSpeech() requested by app")
315
+
316
+ recognitionTask?.cancel()
317
+ completion?(false)
318
+ }
319
+
320
+ public func destroySpeech(_ completion: ((Bool) -> Void)? = nil) {
321
+ NSLog("[STT] **** destroySpeech!!!")
322
+ teardown()
323
+ completion?(false)
324
+ }
325
+
326
+ private func updateSessionRouting(selectBestInput: Bool = true) {
327
+ NSLog("[STT] ⚠️ updateSessionRouting??? why???")
328
+
329
+ let s = AVAudioSession.sharedInstance()
330
+
331
+ // fast checks & logs can run on main
332
+ let inputs = s.currentRoute.inputs
333
+ guard !inputs.isEmpty else {
334
+ NSLog("[STT] ⚠️ No capture route (likely A2DP). Deferring engine start.")
335
+ return
336
+ }
337
+
338
+ DispatchQueue.global(qos: .userInitiated).async { [weak self] in
339
+ guard let self = self else { return }
340
+ do { try s.setActive(false, options: [.notifyOthersOnDeactivation]) }
341
+ catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
342
+
343
+ let hasWiredOrCar = s.currentRoute.outputs.contains {
344
+ $0.portType == .headphones || $0.portType == .carAudio || $0.portType == .usbAudio
345
+ }
346
+ if selectBestInput, let all = s.availableInputs {
347
+ let bt = all.first { $0.portType == .bluetoothHFP || $0.portType == .bluetoothLE }
348
+ let wired = all.first { $0.portType == .headsetMic }
349
+ let built = all.first { $0.portType == .builtInMic }
350
+ let best = bt ?? wired ?? built
351
+ do {
352
+ if s.preferredInput?.uid != best?.uid { try s.setPreferredInput(best) }
353
+ if let builtIn = best, builtIn.portType == .builtInMic,
354
+ let ds = builtIn.dataSources?.first(where: { $0.orientation == .bottom || $0.orientation == .back }) {
355
+ try? builtIn.setPreferredDataSource(ds)
356
+ }
357
+ } catch {
358
+ NSLog("[STT] setPreferredInput failed: \(error.localizedDescription)")
359
+ }
360
+ }
361
+
362
+ var opts: AVAudioSession.CategoryOptions = [.allowBluetooth]
363
+ if !hasWiredOrCar { opts.insert(.defaultToSpeaker) }
364
+
365
+ if s.category != .playAndRecord || s.mode != .voiceChat || s.categoryOptions != opts {
366
+ do { try s.setCategory(.playAndRecord, mode: .voiceChat, options: opts) }
367
+ catch { NSLog("[STT] setCategory failed: \(error.localizedDescription)") }
368
+ }
369
+
370
+ do { try s.setActive(true, options: []) }
371
+ catch { NSLog("[STT] setActive failed: \(error.localizedDescription)") }
372
+
373
+ // Optional: force 16k after activation
374
+ self.force16kIfPossible(s)
375
+
376
+ // Log route back on main so logs stay ordered
377
+ DispatchQueue.main.async {
378
+ let inPorts = s.currentRoute.inputs.map { "\($0.portType.rawValue):\($0.portName)" }.joined(separator:", ")
379
+ let outPorts = s.currentRoute.outputs.map { "\($0.portType.rawValue):\($0.portName)" }.joined(separator:", ")
380
+ NSLog("[STT] route in=[\(inPorts)] out=[\(outPorts)]")
381
+ }
382
+ }
383
+ }
384
+
385
+ // ↓↓↓ preferred settings helper
386
+ private func force16kIfPossible(_ session: AVAudioSession) {
387
+ let hasExternalOutput = session.currentRoute.outputs.contains {
388
+ switch $0.portType {
389
+ case .headphones, .bluetoothA2DP, .bluetoothHFP, .bluetoothLE, .airPlay, .carAudio, .usbAudio:
390
+ return true
391
+ default: return false
392
+ }
393
+ }
394
+ if hasExternalOutput { return }
395
+
396
+ try? session.setPreferredSampleRate(16_000)
397
+ if session.isInputAvailable { try? session.setPreferredInputNumberOfChannels(1) }
398
+ try? session.setPreferredOutputNumberOfChannels(1)
399
+ try? session.setPreferredIOBufferDuration(0.02) // ~20 ms frames
400
+ }
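(Note, illustrative: the setPreferred… calls above are requests, not guarantees; the session reports what the hardware actually granted. A minimal check, assuming it runs after activation, might look like this.)

    // Illustrative only — read back the actuals after the session is active.
    let s = AVAudioSession.sharedInstance()
    NSLog("[STT] actual SR=\(s.sampleRate) ioBuf=\(s.ioBufferDuration) inCh=\(s.inputNumberOfChannels)")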
401
+
402
+ // MARK: - Core logic (kept intact, including AEC order/steps)
403
+
404
+ /// Returns true if no errors occurred (identical flow & calls as ObjC), plus keep-alive options.
407
+ private func setupAudioSession() -> Bool {
408
+ var err: NSError?
409
+ let session = AVAudioSession.sharedInstance()
410
+ self.audioSession = session
411
+
412
+ do { try session.setActive(false, options: [.notifyOthersOnDeactivation]) }
413
+ catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
414
+
415
+ // Build options to match our routing rules
416
+ // (defaultToSpeaker only when no external output is active)
417
+ let hasExternalOutput: Bool = session.currentRoute.outputs.contains {
418
+ switch $0.portType {
419
+ case .headphones, .bluetoothA2DP, .bluetoothHFP, .bluetoothLE, .airPlay, .carAudio, .usbAudio:
420
+ return true
421
+ default:
422
+ return false
423
+ }
424
+ }
425
+
426
+ var opts: AVAudioSession.CategoryOptions = [.allowBluetooth]
427
+ if !hasExternalOutput { opts.insert(.defaultToSpeaker) }
428
+ if #available(iOS 14.5, *) {
429
+ // Prevent muted switch / mic mute from killing our capture pipeline
430
+ opts.insert(.overrideMutedMicrophoneInterruption)
431
+ }
432
+
433
+ do {
434
+ try session.setCategory(.playAndRecord, mode: .voiceChat, options: opts)
435
+ } catch { err = error as NSError }
436
+
437
+ // Force 16k before and after activation (some routes settle only after setActive)
438
+ force16kIfPossible(session)
439
+ do { try session.setActive(true) } catch { err = error as NSError }
440
+ NSLog("[STT] session SR=%.1f inCh=%d outCh=%d (wanted 16000)",
441
+ session.sampleRate,
442
+ Int(session.inputNumberOfChannels),
443
+ Int(session.outputNumberOfChannels))
444
+ force16kIfPossible(session)
445
+
446
+ if let e = err {
447
+ NSLog("[STT] setupAudioSession error: \(e.localizedDescription)")
448
+ sendResult(error: ["code": "audio", "message": e.localizedDescription],
449
+ bestTranscription: nil, transcriptions: nil, isFinal: nil)
450
+ return false
451
+ }
452
+ return true
453
+ }
454
+
455
+ private func currentInputFormat(_ engine: AVAudioEngine) -> AVAudioFormat? {
456
+ // Prefer whatever CoreAudio currently provides; avoid cached formats.
457
+ let fmt = engine.inputNode.outputFormat(forBus: 0)
458
+ if fmt.sampleRate > 0 && fmt.channelCount > 0 { return fmt }
459
+ // Fallback: build a sane mono format from session if ever needed.
460
+ let sr = max(8000, AVAudioSession.sharedInstance().sampleRate)
461
+ return AVAudioFormat(commonFormat: .pcmFormatFloat32, sampleRate: sr, channels: 1, interleaved: false)
462
+ }
463
+
464
+ private func isHeadsetPluggedIn() -> Bool {
465
+ let route = AVAudioSession.sharedInstance().currentRoute
466
+ for out in route.outputs {
467
+ if out.portType == .headphones || out.portType == .bluetoothA2DP {
468
+ return true
469
+ }
470
+ }
471
+ return false
472
+ }
473
+
474
+ private func isHeadSetBluetooth() -> Bool {
475
+ for port in AVAudioSession.sharedInstance().availableInputs ?? [] {
476
+ if port.portType == .bluetoothHFP { return true }
477
+ }
478
+ return false
479
+ }
480
+
481
+ private func loadContextualStrings() -> [String] {
482
+ guard let filePath = Bundle.main.path(forResource: "words_flattened", ofType: "txt") else {
483
+ NSLog("words_flattened.txt not found in bundle")
484
+ return []
485
+ }
486
+ do {
487
+ let contents = try String(contentsOfFile: filePath, encoding: .utf8)
488
+ let rawItems = contents.components(separatedBy: ",")
489
+ var cleaned: [String] = []
490
+ cleaned.reserveCapacity(rawItems.count)
491
+ for item in rawItems {
492
+ var t = item.trimmingCharacters(in: .whitespacesAndNewlines)
493
+ t = t.replacingOccurrences(of: "\"", with: "")
494
+ if !t.isEmpty { cleaned.append(t) }
495
+ }
496
+ return cleaned
497
+ } catch {
498
+ NSLog("Error reading contextualStrings: \(error)")
499
+ return []
500
+ }
501
+ }
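(Assumed input format, for illustration: words_flattened.txt is parsed as one comma-separated list, with whitespace trimmed and double quotes stripped from each item.)

    // Illustrative only — example file contents and the parsed result.
    // words_flattened.txt:  "DaVoice", "wake word", "hot word"
    // loadContextualStrings() would then return:
    // ["DaVoice", "wake word", "hot word"]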
502
+
503
+ private func resetAudioSession()
504
+ {
505
+ if audioSession == nil {
506
+ audioSession = AVAudioSession.sharedInstance()
507
+ }
508
+ guard let session = audioSession else { return }
509
+ // Preserve & compare category exactly as original logic
510
+ let current = session.category
511
+ if priorAudioCategory == current { return }
512
+ audioSession = nil
513
+ }
514
+
515
+
516
+ private func makeFreshRequest() -> SFSpeechAudioBufferRecognitionRequest {
517
+ let req = SFSpeechAudioBufferRecognitionRequest()
518
+ if #available(iOS 16, *) { req.addsPunctuation = true }
519
+ req.shouldReportPartialResults = true
520
+ //if #available(iOS 13.0, *) { req.taskHint = .dictation }
521
+ req.contextualStrings = loadContextualStrings()
522
+ self.recognitionRequest = req
523
+ self.reqConsumed = false // <— reset here
524
+ NSLog("makeFreshRequest()")
525
+ return req
526
+ }
527
+
528
+ private func startTask(_ req: SFSpeechAudioBufferRecognitionRequest) {
529
+ NSLog("starting recognitionTask")
530
+ lastTaskStartAt = CACurrentMediaTime()
531
+ lastResultAt = lastTaskStartAt
532
+ let taskSessionId = self.sessionId
533
+
534
+ self.recognitionTask = self.speechRecognizer?.recognitionTask(with: req) { [weak self] result, error in
535
+ guard let self = self else { return }
536
+ if taskSessionId != self.sessionId { NSLog("task session mismatch -> ignore"); return }
537
+ self.lastResultAt = CACurrentMediaTime()
538
+
539
+ // ----- ERROR HANDLING (single block) -----
540
+ if let error = error {
541
+ let nserr = error as NSError
542
+ NSLog("task error \(nserr.code): \(nserr.localizedDescription)")
543
+
544
+ // Cold restart for local speech/XPC faults
545
+ if (nserr.domain == "kAFAssistantErrorDomain" && (nserr.code == 1107 || nserr.code == 1101)) ||
546
+ (nserr.domain == NSCocoaErrorDomain && nserr.code == 4097) {
547
+ self.restartSpeechCold("localspeech-\(nserr.code)")
548
+ return
549
+ }
550
+
551
+ // Otherwise: transient stall → rearm the task
552
+ self.rearmTask(reason: "error")
553
+ return
554
+ }
555
+
556
+ // ----- RESULT HANDLING -----
557
+ guard let result = result else {
558
+ NSLog("task nil result")
559
+ self.rearmTask(reason: "nil-result")
560
+ return
561
+ }
562
+
563
+ // Mark first real speech once
564
+ let best = result.bestTranscription.formattedString.trimmingCharacters(in: .whitespacesAndNewlines)
565
+ if !best.isEmpty || result.transcriptions.contains(where: { !$0.formattedString.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }) {
566
+ if !self.seenRealSpeech {
567
+ self.seenRealSpeech = true
568
+ NSLog("first real speech detected -> onSpeechStart to JS")
569
+ self.sendEvent(name: "onSpeechStart", body: nil)
570
+ }
571
+ }
572
+
573
+ // Emit partials/finals
574
+ let parts = result.transcriptions.map { $0.formattedString }
575
+ self.sendResult(error: nil,
576
+ bestTranscription: result.bestTranscription.formattedString,
577
+ transcriptions: parts,
578
+ isFinal: result.isFinal)
579
+
580
+ if result.isFinal {
581
+ NSLog("task final -> onSpeechEnd")
582
+ self.sendEvent(name: "onSpeechEnd", body: nil)
583
+ if self.continuous {
584
+ self.rearmTask(reason: "final")
585
+ } else {
586
+ NSLog("non-continuous final -> teardown")
587
+ self.teardown()
588
+ }
589
+ }
590
+ }
591
+ }
592
+
593
+ public func teardown() {
594
+ bumpGraphGen()
595
+ NSLog("[STT] teardown() begin")
596
+ isTearingDown = true
597
+ stopWatchdog()
598
+ consecutiveStallCount = 0
599
+
600
+ // 1) Tell the request we're done so LocalSpeech sees EOF
601
+ if let req = recognitionRequest {
602
+ req.endAudio()
603
+ }
604
+ // 2) Cancel the task (don’t nil the request yet, let it flush)
605
+ recognitionTask?.cancel()
606
+ recognitionTask = nil
607
+
608
+ // 3) Small grace period before ripping the engine/session
609
+ let grace = DispatchTime.now() + .milliseconds(300)
610
+ DispatchQueue.main.asyncAfter(deadline: grace) { [weak self] in
611
+ guard let self = self else { return }
612
+ AudioPlaybackHook.engineScheduleFile = nil
613
+ AudioPlaybackHook.isEngineReady = nil
614
+ AudioPlaybackHook.useOnlyEnginePlayback = nil
615
+ AudioPlaybackHook.stopEnginePlayback = nil // ← NEW
616
+ sttActive = false
617
+
618
+ if let p = playbackNode {
619
+ p.stop()
620
+ }
621
+ playbackNode = nil
622
+
623
+ if let engine = audioEngine {
624
+ safeRemoveTap(engine.outputNode, bus: 0) // <- clear IO latch if present
625
+ safeRemoveTap(engine.mainMixerNode, bus: 0) // <- clear mixer probe if present
626
+ if engine.inputNode != nil {
627
+ safeRemoveTap(engine.inputNode, bus: 0)
628
+ engine.inputNode.reset()
629
+ }
630
+ if engine.isRunning { engine.stop() }
631
+ engine.reset()
632
+ audioEngine = nil
633
+ }
634
+ mixerProbeActive = false
635
+ mixerProbeCompletions.removeAll()
636
+
637
+ resetAudioSession()
638
+ savedSessionBeforePause = nil
639
+
640
+ sessionId = nil
641
+ isTearingDown = false
642
+ }
643
+ }
644
+
645
+ private func isPlayerConnected(_ player: AVAudioPlayerNode?, to engine: AVAudioEngine?) -> Bool {
646
+ guard let p = player, let e = engine else { return false }
647
+ // If the node is attached and has a non-zero channel count on its output, it’s effectively connected.
648
+ let fmt = p.outputFormat(forBus: 0)
649
+ return (p.engine === e) && (fmt.channelCount > 0) && (fmt.sampleRate > 0)
650
+ }
651
+
652
+ private func waitForStableInputThen(_ block: @escaping () -> Void) {
653
+ let s = AVAudioSession.sharedInstance()
654
+ if !s.currentRoute.inputs.isEmpty {
655
+ block(); return
656
+ }
657
+ // try again shortly; this resolves within a few ticks after unpause
658
+ DispatchQueue.main.asyncAfter(deadline: .now() + 0.15) { [weak self] in
659
+ guard self != nil else { return }
660
+ self?.waitForStableInputThen(block)
661
+ }
662
+ }
663
+
664
+ /// Try to keep the capture alive without tearing down recognition.
665
+ /// 1) If engine exists but not running → try start()
666
+ /// 2) If start fails or graph became invalid → rebuild graph and start
667
+ /// 3) If we don’t have a task yet, start one.
668
+ private func ensureEngineRunning(reason: String) {
669
+ if booting { return } // <— ignore while booting/pause
670
+
671
+ let now = CFAbsoluteTimeGetCurrent()
672
+ let skipCooldown = reason.hasPrefix("route-change") // ← bypass for route changes
673
+ if !skipCooldown && (now - lastReclaimAttempt) < reclaimCooldown {
674
+ NSLog("[STT] ensureEngineRunning(\(reason)) skipped (cooldown)")
675
+ return
676
+ }
677
+ lastReclaimAttempt = now
678
+
679
+ if (audioEngine != nil) && !audioEngine!.isRunning {
680
+ do {
681
+ playbackNode?.stop()
682
+ playbackNode = nil
683
+ // Possibly re-apply your format or re-install taps if the hardware changed sample rates
684
+ try audioEngine!.start()
685
+ armFirstIOCycleLatch(on: audioEngine!)
686
+
687
+ print("🔄 AVAudioEngine restarted after config change. isRunning=%@",
688
+ audioEngine!.isRunning ? "YES":"NO")
689
+ } catch {
690
+ print("❌ Could not re-start after config change: \(error)")
691
+ }
692
+ }
693
+
694
+ guard let engine = audioEngine else {
695
+ NSLog("[STT] ensureEngineRunning(\(reason)): no engine → rebuild")
696
+ rebuildEngineGraphAndRestart(reason: reason)
697
+ return
698
+ }
699
+
700
+ if !engine.isRunning {
701
+ do {
702
+ try engine.start()
703
+ armFirstIOCycleLatch(on: engine)
704
+ NSLog("[STT] ensureEngineRunning(\(reason)): engine.start() -> running=\(engine.isRunning)")
705
+ } catch {
706
+ NSLog("[STT] ensureEngineRunning(\(reason)): engine.start() failed: \(error) → rebuild")
707
+ rebuildEngineGraphAndRestart(reason: reason)
708
+ return
709
+ }
710
+ }
711
+
712
+ // If we have no active task, spin one up against the current request
713
+ if recognitionTask == nil {
714
+ if let req = recognitionRequest {
715
+ NSLog("[STT] ensureEngineRunning(\(reason)): no task -> startTask(existing req)")
716
+ reqConsumed = true
717
+ startTask(req)
718
+ } else {
719
+ NSLog("[STT] ensureEngineRunning(\(reason)): no req -> makeFreshRequest + startTask")
720
+ let newReq = makeFreshRequest()
721
+ reqConsumed = true
722
+ startTask(newReq)
723
+ }
724
+ }
725
+ }
726
+
727
+ /// Rebuilds AVAudioEngine graph (mic→mute mixer, player→mainMixer), reinstalls tap,
728
+ /// and restarts the engine. Does NOT nuke the current recognitionRequest/task unless required.
729
+ private func rebuildEngineGraphAndRestart(reason: String) {
730
+ bumpGraphGen()
731
+ NSLog("[STT] 🔄 rebuildEngineGraphAndRestart (\(reason))")
732
+
733
+ // Keep current request if present; we'll keep appending into it
734
+ let existingReq = self.recognitionRequest
735
+
736
+ // Tear down engine ONLY (keep session, request)
737
+ if let engine = audioEngine {
738
+ if engine.inputNode != nil {
739
+ safeRemoveTap(engine.inputNode, bus: 0)
740
+ engine.inputNode.reset()
741
+ }
742
+ if engine.isRunning { engine.stop() }
743
+ engine.reset()
744
+ }
745
+
746
+ // Recreate engine and graph
747
+ let newEngine = AVAudioEngine()
748
+ self.audioEngine = newEngine
749
+
750
+ let inputNode = newEngine.inputNode
751
+
752
+ let s = AVAudioSession.sharedInstance()
753
+ let speakerRoute = s.currentRoute.outputs.contains { $0.portType == .builtInSpeaker }
754
+ let usingBuiltInMic = (s.preferredInput?.portType == .builtInMic) ||
755
+ (s.currentRoute.inputs.first?.portType == .builtInMic)
756
+
757
+ if speakerRoute && usingBuiltInMic {
758
+ do {
759
+ try inputNode.setVoiceProcessingEnabled(true)
760
+ } catch {
761
+ NSLog("[STT] rebuild: failed to enable voice processing: \(error)")
762
+ }
763
+ }
764
+ if #available(iOS 17.0, *) {
765
+ var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
766
+ duck.enableAdvancedDucking = false
767
+ duck.duckingLevel = .min
768
+ inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
769
+ }
770
+
771
+ // --- WIRING: live input format on the first hop, nil downstream, live input format for the tap ---
772
+ let inFmt = newEngine.inputNode.outputFormat(forBus: 0)
773
+
774
+ // mic → mute mixer → mainMixer
775
+ let micMixer = AVAudioMixerNode()
776
+ newEngine.attach(micMixer)
777
+ newEngine.connect(inputNode, to: micMixer, format: inFmt) // live input format
778
+ newEngine.connect(micMixer, to: newEngine.mainMixerNode, format: nil) // let mixer choose
779
+ micMixer.outputVolume = 0.0
780
+
781
+ playbackNode?.stop()
782
+
783
+ // TTS player → mainMixer (keep same player if possible, else recreate)
784
+ let player = AVAudioPlayerNode()
785
+ playbackNode = player
786
+ newEngine.attach(player)
787
+ newEngine.connect(player, to: newEngine.mainMixerNode, format: nil)
788
+
789
+ safeRemoveTap(inputNode, bus: 0)
790
+ let format = inputNode.outputFormat(forBus: 0) // <- prefer explicit format
791
+
792
+ // Tap uses the node's current live output format (re-queried after each rebuild)
793
+ inputNode.installTap(onBus: 0, bufferSize: 1024, format: format) { [weak self] buffer, _ in
794
+ guard let self = self else { return }
795
+
796
+ // (same level metering as your current code)
797
+ let frames: vDSP_Length = vDSP_Length(buffer.frameLength)
798
+ let LP: Float = 0.5
799
+
800
+ if buffer.format.channelCount > 0, let ch0 = buffer.floatChannelData?[0] {
801
+ var peak0: Float = 0
802
+ vDSP_maxmgv(ch0, 1, &peak0, frames)
803
+ let db0: Float = (peak0 == 0) ? -100 : 20.0 * log10f(peak0)
804
+ let sm0 = LP * db0 + (1 - LP) * self.averagePowerForChannel0
805
+ self.averagePowerForChannel0 = sm0
806
+ self.averagePowerForChannel1 = sm0
807
+ }
808
+ if buffer.format.channelCount > 1, let ch1 = buffer.floatChannelData?[1] {
809
+ var peak1: Float = 0
810
+ vDSP_maxmgv(ch1, 1, &peak1, frames)
811
+ let db1: Float = (peak1 == 0) ? -100 : 20.0 * log10f(peak1)
812
+ let sm1 = LP * db1 + (1 - LP) * self.averagePowerForChannel1
813
+ self.averagePowerForChannel1 = sm1
814
+ }
815
+ self.averagePowerForChannel1 = Float(self._normalizedPowerLevelFromDecibels(CGFloat(self.averagePowerForChannel1)) * 10.0)
816
+ self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": self.averagePowerForChannel1])
817
+
818
+ self.recognitionRequest?.append(buffer)
819
+ self.lastBufferAt = CACurrentMediaTime()
820
+ }
821
+
822
+ newEngine.prepare()
823
+ do {
824
+ try newEngine.start()
825
+ armFirstIOCycleLatch(on: newEngine)
826
+ NSLog("[STT] rebuild: engine.start() ok, running=\(newEngine.isRunning)")
827
+ } catch {
828
+ NSLog("[STT] rebuild: engine.start() failed: \(error)")
829
+ }
830
+
831
+ if self.recognitionRequest == nil {
832
+ self.recognitionRequest = makeFreshRequest()
833
+ }
834
+ if self.recognitionTask == nil, let req = self.recognitionRequest {
835
+ startTask(req)
836
+ }
837
+ }
838
+
839
+ @objc private func handleEngineConfigChange(_ note: Notification) {
840
+ if self.booting { return } // ignore while (re)booting
841
+ NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange: ensuring engine running")
842
+ if (audioEngine != nil) && !audioEngine!.isRunning {
843
+ playbackNode?.stop()
844
+ playbackNode = nil
845
+ }
846
+ ensureEngineRunning(reason: "engine-config-change")
847
+ }
848
+
849
+ @objc private func handleMediaServicesReset(_ note: Notification) {
850
+ if self.booting { return } // ignore while (re)booting
851
+ NSLog("[STT] 📺 Media services were RESET: reclaiming mic & session")
852
+ // Re-apply audio session and try to rebuild graph if needed
853
+ bumpGraphGen()
854
+ _ = setupAudioSession()
855
+ ensureEngineRunning(reason: "media-services-reset")
856
+ }
857
+
858
+ @objc private func handleRouteChange(_ note: Notification) {
859
+ if booting { return } // ignore while (re)booting
860
+ let info = note.userInfo ?? [:]
861
+ NSLog("[STT] 🔀 route change: \(info)")
862
+
863
+ guard let reasonVal = info[AVAudioSessionRouteChangeReasonKey] as? UInt,
864
+ let reason = AVAudioSession.RouteChangeReason(rawValue: reasonVal) else {
865
+ ensureEngineRunning(reason: "route-change-unknown")
866
+ return
867
+ }
868
+
869
+ // On any meaningful route change, reclaim mic
870
+ switch reason {
871
+ case .oldDeviceUnavailable, .newDeviceAvailable, .categoryChange, .routeConfigurationChange, .override:
872
+ ensureEngineRunning(reason: "route-change-\(reason.rawValue)")
873
+ default:
874
+ break
875
+ }
876
+ }
877
+
878
+ private func waitForIOCycle(_ engine: AVAudioEngine,
879
+ timeout: TimeInterval = 0.7,
880
+ done: @escaping (Bool) -> Void) {
881
+ let gen = graphGen
882
+ ttsSerial.async { [weak self, weak engine] in
883
+ guard let self = self, let eng = engine, gen == self.graphGen else { return }
884
+
885
+ if self.mixerProbeActive {
886
+ self.mixerProbeCompletions.append(done)
887
+ return
888
+ }
889
+ self.mixerProbeActive = true
890
+ self.mixerProbeCompletions = [done]
891
+
892
+ DispatchQueue.main.async { [weak self, weak eng] in
893
+ guard let self = self, let eng = eng, gen == self.graphGen else { return }
894
+ let mixer = eng.mainMixerNode
895
+ var fired = false
896
+ self.safeRemoveTap(mixer, bus: 0)
897
+
898
+ mixer.installTap(onBus: 0, bufferSize: 128, format: nil) { [weak self, weak mixer] _, _ in
899
+ guard let self = self, gen == self.graphGen else { return }
900
+ if fired { return }
901
+ fired = true
902
+ self.safeRemoveTap(mixer, bus: 0)
903
+
904
+ self.ttsSerial.async { [weak self] in
905
+ guard let self = self else { return }
906
+ let completions = self.mixerProbeCompletions
907
+ self.mixerProbeActive = false
908
+ self.mixerProbeCompletions.removeAll()
909
+ DispatchQueue.main.async { if gen == self.graphGen { completions.forEach { $0(true) } } }
910
+ }
911
+ }
912
+
913
+ DispatchQueue.main.asyncAfter(deadline: .now() + timeout) { [weak self, weak mixer] in
914
+ guard let self = self, gen == self.graphGen else { return }
915
+ if fired { return }
916
+ self.safeRemoveTap(mixer, bus: 0)
917
+ self.ttsSerial.async { [weak self] in
918
+ guard let self = self else { return }
919
+ let completions = self.mixerProbeCompletions
920
+ self.mixerProbeActive = false
921
+ self.mixerProbeCompletions.removeAll()
922
+ DispatchQueue.main.async { if gen == self.graphGen { completions.forEach { $0(false) } } }
923
+ }
924
+ }
925
+ }
926
+ }
927
+ }
928
+
929
+ // Call once, right after the engine is created (or inside setupAudioSession)
931
+ private func installEngineObservers() {
932
+ let nc = NotificationCenter.default
933
+
934
+ if let engine = audioEngine {
935
+ nc.addObserver(self,
936
+ selector: #selector(handleEngineConfigChange(_:)),
937
+ name: .AVAudioEngineConfigurationChange,
938
+ object: engine)
939
+ }
940
+
941
+ nc.addObserver(self,
942
+ selector: #selector(handleSessionInterruption(_:)),
943
+ name: AVAudioSession.interruptionNotification,
944
+ object: AVAudioSession.sharedInstance())
945
+
946
+ nc.addObserver(self,
947
+ selector: #selector(handleRouteChange(_:)),
948
+ name: AVAudioSession.routeChangeNotification,
949
+ object: AVAudioSession.sharedInstance())
950
+
951
+ nc.addObserver(self,
952
+ selector: #selector(handleMediaServicesReset(_:)),
953
+ name: AVAudioSession.mediaServicesWereResetNotification,
954
+ object: nil)
955
+ }
956
+
957
+ @objc private func handleSessionInterruption(_ note: Notification) {
958
+ if booting { return } // ignore while (re)booting
959
+ guard
960
+ let info = note.userInfo,
961
+ let typeVal = info[AVAudioSessionInterruptionTypeKey] as? UInt,
962
+ let type = AVAudioSession.InterruptionType(rawValue: typeVal)
963
+ else { return }
964
+
965
+ if type == .ended {
966
+ // On a real render error, Core Audio posts an interruption .ended
967
+ NSLog("Session interruption ended (possible render err):")
968
+ }
969
+ }
970
+
971
+ // Add to STT
972
+ private func safeMixerFormat(_ engine: AVAudioEngine) -> AVAudioFormat {
973
+ // Try the mainMixer's current format first
974
+ var fmt = engine.mainMixerNode.outputFormat(forBus: 0)
975
+ if fmt.sampleRate > 0 && fmt.channelCount > 0 { return fmt }
976
+
977
+ // Fallbacks if it’s still 0 Hz (race during route changes)
978
+ let sessionSR = AVAudioSession.sharedInstance().sampleRate
979
+ let ioFmt = engine.outputNode.inputFormat(forBus: 0) // IO unit’s input
980
+ let sr = (fmt.sampleRate > 0 ? fmt.sampleRate
981
+ : (ioFmt.sampleRate > 0 ? ioFmt.sampleRate
982
+ : (sessionSR > 0 ? sessionSR : 48000)))
983
+ let ch = max(1, Int((fmt.channelCount > 0 ? fmt.channelCount : ioFmt.channelCount)))
984
+ return AVAudioFormat(standardFormatWithSampleRate: sr, channels: AVAudioChannelCount(ch))!
985
+ }
986
+
987
+ private func setupAndStartRecognizing(localeStr: String?) {
988
+ if booting { return }
989
+ booting = true
990
+ NSLog("[STT] setupAndStartRecognizing begin")
991
+ sttActive = true
992
+
993
+ audioSession = AVAudioSession.sharedInstance()
994
+ guard let session = audioSession else { return }
995
+ var err: NSError?
996
+
997
+ priorAudioCategory = session.category
998
+
999
+ // Tear down resources before starting speech recognition.
1000
+ NSLog("[STT] pre-teardown")
1001
+ teardown()
1002
+ // ** IMPORTANT ** Call this again as teardown marks this false
1003
+ sttActive = true
1004
+
1005
+ sessionId = UUID().uuidString
1006
+
1007
+ let locale: Locale? = {
1008
+ if let s = localeStr, !s.isEmpty { return Locale(identifier: s) }
1009
+ sttActive = false
1010
+ return nil
1011
+ }()
1012
+
1013
+ if let loc = locale {
1014
+ speechRecognizer = SFSpeechRecognizer(locale: loc)
1015
+ } else {
1016
+ speechRecognizer = SFSpeechRecognizer()
1017
+ }
1018
+ speechRecognizer?.delegate = self
1019
+
1020
+ // Start audio session...
1021
+ NSLog("[STT] setupAudioSession()")
1022
+ guard setupAudioSession() else {
1023
+ NSLog("[STT] ERROR ERROR ******** setupAudioSession()")
1024
+ teardown()
1025
+ sttActive = false
1026
+ return
1027
+ }
1028
+ installEngineObservers()
1029
+
1030
+ let request = makeFreshRequest()
1031
+ recognitionRequest = request
1032
+
1033
+ guard recognitionRequest != nil else {
1034
+ sendResult(error: ["code": "recognition_init"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
1035
+ teardown()
1036
+ return
1037
+ }
1038
+
1039
+ if audioEngine == nil {
1040
+ bumpGraphGen();
1041
+ audioEngine = AVAudioEngine()
1042
+ }
1043
+ do {
1044
+ guard let engine = audioEngine else { throw NSError(domain: "voice.audio", code: -1) }
1045
+ let inputNode = engine.inputNode
1046
+ let _ = inputNode // presence check
1047
+
1048
+ let s = AVAudioSession.sharedInstance()
1049
+ let speakerRoute = s.currentRoute.outputs.contains { $0.portType == .builtInSpeaker }
1050
+ let usingBuiltInMic = (s.preferredInput?.portType == .builtInMic) ||
1051
+ (s.currentRoute.inputs.first?.portType == .builtInMic)
1052
+
1053
+ if speakerRoute && usingBuiltInMic {
1054
+ // Enable voice processing (AEC)
1055
+ do {
1056
+ try inputNode.setVoiceProcessingEnabled(true)
1057
+ } catch {
1058
+ NSLog("Failed to enable voice processing for AEC on input node: \(error)")
1059
+ }
1060
+ }
1061
+
1062
+ if #available(iOS 17.0, *) {
1063
+ var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
1064
+ duck.enableAdvancedDucking = false // disable advanced (VAD-based) ducking
1065
+ duck.duckingLevel = .min // “as loud as possible” for other audio
1066
+ inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
1067
+ }
1068
+
1069
+ // if output node voice processing is ever needed, keep commented as in original:
1070
+ // do { try engine.outputNode.setVoiceProcessingEnabled(true) } catch { ... }
1071
+
1072
+ NSLog("[STT] AEC enable done")
1073
+
1074
+ // --- WIRING: live input format on the first hop, nil downstream, live input format for the tap ---
1075
+ let inFmt = engine.inputNode.outputFormat(forBus: 0)
1076
+
1077
+ // 1) Mute only the mic path, not the whole main mixer
1078
+ let micMixer = AVAudioMixerNode()
1079
+ engine.attach(micMixer)
1080
+ // Use the live input format for input → micMixer
1081
+ engine.connect(inputNode, to: micMixer, format: inFmt)
1082
+ // Let main mixer pick downstream format
1083
+ engine.connect(micMixer, to: engine.mainMixerNode, format: nil)
1084
+ micMixer.outputVolume = 0.0 // ← you won't hear your own mic
1085
+
1086
+ // 2) Prepare a player node for TTS inside the SAME engine/graph
1087
+ let player = AVAudioPlayerNode()
1088
+ self.playbackNode = player
1089
+ engine.attach(player)
1090
+ // Let the mixer choose the format for TTS
1091
+ engine.connect(player, to: engine.mainMixerNode, format: nil)
1092
+
1093
+ NSLog("[STT] graph connected (mic->mute mixer, player->mainMixer)")
1094
+
1095
+ var tapFrames: UInt64 = 0
1096
+ // Tap uses the node's current live output format (re-queried on each start, so it tracks route SR changes)
1097
+
1098
+ safeRemoveTap(inputNode, bus: 0)
1099
+ let format = inputNode.outputFormat(forBus: 0) // <- prefer explicit format
1100
+
1101
+ inputNode.installTap(onBus: 0, bufferSize: 1024, format: format) { [weak self] buffer, _ in
1102
+ // Strongify self once
1103
+ guard let self = self else { return }
1104
+ tapFrames &+= UInt64(buffer.frameLength)
1105
+ if tapFrames % (44100 * 2) < 1024 { // roughly every 2 s at 44.1 kHz
1106
+ NSLog("[STT] tap alive, totalFrames=\(tapFrames)")
1107
+ }
1108
+
1109
+ let frames: vDSP_Length = vDSP_Length(buffer.frameLength)
1110
+ let LEVEL_LOWPASS_TRIG: Float = 0.5
1111
+
1112
+ // CH0
1113
+ if buffer.format.channelCount > 0, let ch0 = buffer.floatChannelData?[0] {
1114
+ var peak0: Float = 0
1115
+ vDSP_maxmgv(ch0, 1, &peak0, frames)
1116
+ let db0: Float = (peak0 == 0) ? -100 : 20.0 * log10f(peak0)
1117
+
1118
+ let smoothed0 = LEVEL_LOWPASS_TRIG * db0
1119
+ + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel0
1120
+ self.averagePowerForChannel0 = smoothed0
1121
+ self.averagePowerForChannel1 = smoothed0
1122
+ }
1123
+
1124
+ // CH1
1125
+ if buffer.format.channelCount > 1, let ch1 = buffer.floatChannelData?[1] {
1126
+ var peak1: Float = 0
1127
+ vDSP_maxmgv(ch1, 1, &peak1, frames)
1128
+ let db1: Float = (peak1 == 0) ? -100 : 20.0 * log10f(peak1)
1129
+
1130
+ let smoothed1 = LEVEL_LOWPASS_TRIG * db1
1131
+ + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel1
1132
+ self.averagePowerForChannel1 = smoothed1
1133
+ }
1134
+
1135
+ // Normalize 0–10 and emit
1136
+ self.averagePowerForChannel1 = Float(self._normalizedPowerLevelFromDecibels(CGFloat(self.averagePowerForChannel1)) * 10.0)
1137
+ let value = self.averagePowerForChannel1
1138
+ self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": value])
1139
+
1140
+ // Append to recognition
1141
+ self.recognitionRequest?.append(buffer)
1142
+
1143
+ // inside inputNode.installTap { buffer, _ in
1144
+ self.lastBufferAt = CACurrentMediaTime()
1145
+ }
1146
+
1147
+ engine.prepare()
1148
+ NSLog("[STT] audioEngine prepare")
1149
+ var audioSessionError: NSError?
1150
+ do {
1151
+ try engine.start()
1152
+ armFirstIOCycleLatch(on: engine)
1153
+ } catch {
1154
+ audioSessionError = error as NSError
1155
+ }
1156
+
1157
+ // after engine.start() success:
1158
+ engineHotAt = CACurrentMediaTime()
1159
+ seenRealSpeech = false
1160
+ NSLog("engine HOT at \(engineHotAt)")
1161
+ sendEvent(name: "onSpeechStart", body: nil) // engine-hot signal (optional)
1162
+ self.reqConsumed = true
1163
+ startTask(recognitionRequest!)
1164
+ self.booting = false
1165
+
1166
+ // Engine is up; expose readiness
1167
+ AudioPlaybackHook.isEngineReady = { [weak self] in
1168
+ guard let eng = self?.audioEngine else { return false }
1169
+ return eng.isRunning
1170
+ }
1171
+
1172
+ // Tell TTS layer: do NOT use AVAudioPlayer fallback while STT is active
1173
+ AudioPlaybackHook.useOnlyEnginePlayback = { [weak self] in
1174
+ return self?.sttActive == true
1175
+ }
1176
+
1177
+ startWatchdog()
1178
+
1179
+ AudioPlaybackHook.engineScheduleFile = { [weak self] url, done in
1180
+ self?.ttsSerial.async { [weak self] in
1181
+ DispatchQueue.main.async {
1182
+ guard let self = self, !self.isTearingDown, let engine = self.audioEngine else { return }
1183
+ // Always recreate the player if lastRenderTime might be stale
1184
+ if self.playbackNode?.engine !== engine || !self.isPlayerConnected(self.playbackNode, to: engine) {
1185
+ self.playbackNode?.stop()
1186
+ self.playbackNode = nil
1187
+ }
1188
+ // Ensure running
1189
+ if !engine.isRunning {
1190
+ do { try engine.start(); self.armFirstIOCycleLatch(on: engine) } catch {
1191
+ NSLog("[STT] TTS: engine.start() failed: \(error)"); return
1192
+ }
1193
+ }
1194
+ let mixer = engine.mainMixerNode
1195
+ mixer.auAudioUnit.inputBusses[0].isEnabled = true
1196
+
1197
+ let player = self.ensurePlaybackNode(in: engine)
1198
+
1199
+ // Always prime a silent buffer on a freshly attached player
1200
+ if player.lastRenderTime == nil {
1201
+ let fmt = self.safeMixerFormat(engine)
1202
+ if let prime = AVAudioPCMBuffer(pcmFormat: fmt, frameCapacity: 128) {
1203
+ prime.frameLength = 128
1204
+ if let ch = prime.floatChannelData {
1205
+ memset(ch[0], 0, Int(prime.frameLength) * MemoryLayout<Float>.size)
1206
+ if fmt.channelCount > 1 { memset(ch[1], 0, Int(prime.frameLength) * MemoryLayout<Float>.size) }
1207
+ }
1208
+ player.scheduleBuffer(prime, completionHandler: nil)
1209
+ }
1210
+ }
1211
+
1212
+ do {
1213
+ let file = try AVAudioFile(forReading: url)
1214
+ player.scheduleFile(file, at: nil) { DispatchQueue.main.async { done() } }
1215
+ } catch {
1216
+ NSLog("[STT] TTS schedule error: \(error)"); return
1217
+ }
1218
+
1219
+ // Gate first-ever play on the engineHasRenderedOnce latch
1220
+ let startPlay = {
1221
+ if !player.isPlaying { player.play() }
1222
+ }
1223
+ if self.engineHasRenderedOnce {
1224
+ startPlay()
1225
+ } else {
1226
+ // Poll briefly until first IO cycle observed
1227
+ func tryStart(after ms: Int = 0) {
1228
+ DispatchQueue.main.asyncAfter(deadline: .now() + .milliseconds(ms)) {
1229
+ if self.engineHasRenderedOnce { startPlay() }
1230
+ else if ms < 1500 { tryStart(after: ms + 100) } // up to ~1.5s on BT
1231
+ else { NSLog("[STT] TTS: no IO cycle observed; skipping play to avoid crash") }
1232
+ }
1233
+ }
1234
+ tryStart()
1235
+ }
1236
+ }
1237
+ }
1238
+ return true
1239
+ }
1240
+
1241
+ AudioPlaybackHook.stopEnginePlayback = { [weak self] in
1242
+ DispatchQueue.main.async {
1243
+ guard let self = self else { return }
1244
+ // Stop only the TTS playback node; keep the engine running for STT
1245
+ self.playbackNode?.stop()
1246
+ }
1247
+ }
1248
+
1249
+ NSLog("audioEngine startAndReturnError")
1250
+ if let audioSessionError = audioSessionError {
1251
+ NotificationCenter.default.addObserver(self,
1252
+ selector: #selector(self.handleEngineConfigChange(_:)),
1253
+ name: .AVAudioEngineConfigurationChange,
1254
+ object: engine)
1255
+ NSLog("audioEngine audioSessionError!=nil")
1256
+ self.sendResult(error: ["code": "audio", "message": audioSessionError.localizedDescription],
1257
+ bestTranscription: nil, transcriptions: nil, isFinal: nil)
1258
+ NSLog("[STT] self sendResult")
1259
+ // self.teardown()
1260
+ NSLog("[STT] Removed self teardown")
1261
+ return
1262
+ }
1263
+ NSLog("After Start recording and append recording")
1264
+ DispatchQueue.main.asyncAfter(deadline: .now() + 3.0) { [weak self] in
1265
+ guard let self = self else { return }
1266
+ let running = self.audioEngine?.isRunning ?? false
1267
+ let taskState = self.recognitionTask?.state.rawValue ?? -1
1268
+ NSLog("[STT] health: engineRunning=\(running) taskState=\(taskState)")
1269
+ }
1270
+
1271
+ NSLog("After if audioSessionError != nil")
1272
+ } catch let e as NSError {
1273
+ sendResult(error: ["code": "start_recording", "message": e.localizedDescription],
1274
+ bestTranscription: nil, transcriptions: nil, isFinal: nil)
1275
+ NSLog("End of init...")
1276
+ return
1277
+ }
1278
+ }
1279
+
1280
+ // MARK: - Helpers
1281
+ private func _normalizedPowerLevelFromDecibels(_ decibels: CGFloat) -> CGFloat {
1282
+ if decibels < -80.0 || decibels == 0.0 { return 0.0 }
1283
+ let minDb: Float = -80.0
1284
+ let pow10_min = powf(10.0, 0.05 * minDb)
1285
+ let pow10_db = powf(10.0, 0.05 * Float(decibels))
1286
+ let power = powf((pow10_db - pow10_min) * (1.0 / (1.0 - pow10_min)), 1.0 / 2.0)
1287
+ if power < 1.0 { return CGFloat(power) } else { return 1.0 }
1288
+ }
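(Worked example of the normalization above: for an input of -20 dB, 10^(0.05·(-20)) = 0.1 and 10^(0.05·(-80)) = 0.0001, so the result is sqrt((0.1 - 0.0001) / (1 - 0.0001)) ≈ 0.316; after the ×10 scaling applied in the input tap, onSpeechVolumeChanged would carry a value of roughly 3.16.)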
1289
+
1290
+ private func sendEvent(name: String, body: [String: Any]?) {
1291
+ delegate?.stt(self, didEmitEvent: name, body: body)
1292
+ }
1293
+
1294
+ /// Exact event behavior preserved from ObjC `sendResult`.
1295
+ private func sendResult(error: [String: Any]?,
1296
+ bestTranscription: String?,
1297
+ transcriptions: [String]?,
1298
+ isFinal: Bool?) {
1299
+ if let error = error {
1300
+ sendEvent(name: "onSpeechError", body: ["error": error])
1301
+ }
1302
+ if let best = bestTranscription {
1303
+ sendEvent(name: "onSpeechResults", body: ["value": [best]])
1304
+ }
1305
+ if let trans = transcriptions {
1306
+ sendEvent(name: "onSpeechPartialResults", body: ["value": trans])
1307
+ }
1308
+ if let isFinal = isFinal {
1309
+ sendEvent(name: "onSpeechRecognized", body: ["isFinal": isFinal])
1310
+ }
1311
+ }
1312
+
1313
+ // MARK: - SFSpeechRecognizerDelegate
1314
+
1315
+ public func speechRecognizer(_ speechRecognizer: SFSpeechRecognizer, availabilityDidChange available: Bool) {
1316
+ if available == false {
1317
+ sendResult(error: ["message": "Speech recognition is not available now"],
1318
+ bestTranscription: nil, transcriptions: nil, isFinal: nil)
1319
+ }
1320
+ }
1321
+
1322
+ // MARK: - Small helper to recreate recognizer (used by watchdog)
1323
+ private func recreateSpeechRecognizerPreservingLocale() {
1324
+ let loc = speechRecognizer?.locale
1325
+ speechRecognizer = loc != nil ? SFSpeechRecognizer(locale: loc!) : SFSpeechRecognizer()
1326
+ speechRecognizer?.delegate = self
1327
+ NSLog("[STT] recreated SFSpeechRecognizer (locale preserved: \(loc?.identifier ?? "default"))")
1328
+ }
1329
+ }
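(End-to-end usage sketch, illustrative only and not taken from the package documentation: create the recognizer, attach a delegate, and drive it through the public API shown above. The "en-US" locale and the EventForwarder conformer are example values/assumptions.)

    // Illustrative only — host-side wiring.
    let stt = STT()
    let forwarder = EventForwarder()        // any STTDelegate conformer (see the sketch after the protocol)
    stt.delegate = forwarder
    stt.continuous = true

    stt.isSpeechAvailable { available in
        guard available else { return }
        stt.startSpeech(localeStr: "en-US") // nil falls back to the device default
    }

    // Later, when capture is no longer needed:
    stt.stopSpeech()                        // lets the current task finish
    stt.destroySpeech()                     // full teardown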