react-native-davoice-tts 1.0.218 → 1.0.219

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. package/TTSRNBridge.podspec +1 -1
  2. package/ios/SpeechBridge/SpeechBridge.m +153 -0
  3. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DavoiceTTS +0 -0
  4. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.abi.json +3388 -3388
  5. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.private.swiftinterface +20 -20
  6. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.swiftinterface +20 -20
  7. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DavoiceTTS +0 -0
  8. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.abi.json +3316 -3316
  9. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.private.swiftinterface +32 -32
  10. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.swiftinterface +32 -32
  11. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.abi.json +3316 -3316
  12. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.private.swiftinterface +32 -32
  13. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.swiftinterface +32 -32
  14. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeDirectory +0 -0
  15. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeRequirements-1 +0 -0
  16. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeResources +24 -99
  17. package/package.json +1 -1
  18. package/speech/index.ts +106 -0
  19. package/android/src/main/java/com/davoice/tts/rn/DaVoiceTTSPackage.java_old_using_new_for_both_stt_and_tts +0 -26
  20. package/ios/STTRNBridge/STTBridge.m_wtf +0 -109
  21. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT copy.swift____ +0 -1202
  22. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift.bkup +0 -1000
  23. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift.latest +0 -1359
  24. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift1.swift__ +0 -1134
  25. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift__ +0 -1329
  26. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT copy.swift____ +0 -1202
  27. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift.bkup +0 -1000
  28. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift.latest +0 -1359
  29. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift1.swift__ +0 -1134
  30. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift__ +0 -1329
@@ -1,1359 +0,0 @@
1
- // STT.swift
2
- // Native iOS Swift version (AEC flow preserved 1:1)
3
-
4
- import Foundation
5
- import UIKit
6
- import Speech
7
- import Accelerate
8
- import AVFAudio // or import AVFoundation
9
-
10
- @objc public protocol STTDelegate: AnyObject {
11
- @objc func stt(_ stt: STT, didEmitEvent name: String, body: [String: Any]?)
12
- }
13
-
14
- @objcMembers
15
- public final class STT: NSObject, SFSpeechRecognizerDelegate {
16
- public weak var delegate: STTDelegate?
17
- public var continuous: Bool = true
18
-
19
- // MARK: - Private
20
- private var speechRecognizer: SFSpeechRecognizer?
21
- private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
22
- private var audioEngine: AVAudioEngine?
23
- private var recognitionTask: SFSpeechRecognitionTask?
24
- private var audioSession: AVAudioSession?
25
- private var isTearingDown: Bool = false
26
- private var sessionId: String?
27
- private var priorAudioCategory: AVAudioSession.Category?
28
- private var averagePowerForChannel0: Float = 0
29
- private var averagePowerForChannel1: Float = 0
30
-
31
- private var playbackNode: AVAudioPlayerNode?
32
- private var seenRealSpeech = false // flips true after first non-blank token
33
- private var engineHotAt: CFTimeInterval = 0 // when engine actually started
34
- private let warmupKeepAlive: CFTimeInterval = 4.0 // seconds we’ll keep re-arming in silence
35
-
36
- // Keep-engine-alive helpers
37
- private var lastReclaimAttempt: CFAbsoluteTime = 0
38
- private let reclaimCooldown: CFTimeInterval = 1.0
39
-
40
- // --- Task health ---
41
- private var lastBufferAt: CFTimeInterval = 0 // updated from tap
42
- private var lastResultAt: CFTimeInterval = 0 // updated from recognition callback
43
- private var lastTaskStartAt: CFTimeInterval = 0
44
- private var stallWatchdog: Timer?
45
- private var consecutiveStallCount = 0
46
- private let stallThreshold: CFTimeInterval = 8.0 // seconds w/o results while engine is hot
47
- private let rearmCooldownTask: CFTimeInterval = 2.0
48
- private var lastRearmAt: CFTimeInterval = 0
49
- private var engineHot = false
50
- private var hotAt: CFTimeInterval = 0
51
-
52
- private var observedEngineForConfigChange: AVAudioEngine?
53
- // Pending TTS while engine warms/recovers
54
- private var pendingTTSSchedules: [(url: URL, done: () -> Void)] = []
55
- private let ttsSerial = DispatchQueue(label: "stt.tts.serial")
56
-
57
- // --- Recovery & diagnostics ---
58
- private var recoverySeq = 0
59
- private var lastRecoveryAt: CFTimeInterval = 0
60
- private var lastTaskOrigin: String = "cold"
61
- private enum GraphState { case cold, starting, hot, unstable }
62
- private var graphState: GraphState = .cold
63
- private var stabilityTimer: Timer?
64
-
65
- private func setGraphState(_ s: GraphState, why: String) {
66
- graphState = s
67
- NSLog("[STT] graphState -> \(s) (\(why))")
68
- }
69
-
70
- private func markUnstableThenRecheck(after seconds: TimeInterval = 0.35, why: String) {
71
- setGraphState(.unstable, why: why)
72
- stabilityTimer?.invalidate()
73
- stabilityTimer = Timer.scheduledTimer(withTimeInterval: seconds, repeats: false) { [weak self] _ in
74
- guard let self = self, let eng = self.audioEngine else { return }
75
- if eng.isRunning {
76
- self.setGraphState(.hot, why: "debounce elapsed & engine running")
77
- self.tryFlushPendingTTS() // flush any TTS queued while the graph was unstable
78
- } else {
79
- do {
80
- try eng.start()
81
- self.setGraphState(.hot, why: "restarted after debounce")
82
- self.tryFlushPendingTTS() // flush queued TTS after the restart
83
- } catch {
84
- self.setGraphState(.starting, why: "start failed: \(error.localizedDescription)")
85
- }
86
- }
87
- }
88
- RunLoop.main.add(stabilityTimer!, forMode: .common)
89
- }
90
-
91
- private func tryFlushPendingTTS() {
92
- ttsSerial.async { [weak self] in
93
- guard let self = self, let engine = self.audioEngine else { return }
94
- // Check readiness: engine running + mixer has valid format
95
- let mixFmt = engine.mainMixerNode.outputFormat(forBus: 0)
96
- guard engine.isRunning, mixFmt.sampleRate > 0, mixFmt.channelCount > 0 else { return }
97
-
98
- // Drain queue in-order
99
- while !self.pendingTTSSchedules.isEmpty {
100
- let item = self.pendingTTSSchedules.removeFirst()
101
- DispatchQueue.main.async { [weak self] in
102
- guard let self = self else { return }
103
- // Ensure player is attached & connected
104
- if self.playbackNode?.engine !== engine || !self.isPlayerConnected(self.playbackNode, to: engine) {
105
- self.playbackNode?.stop()
106
- self.playbackNode = nil
107
- }
108
- let player = self.ensurePlaybackNode(in: engine)
109
-
110
- // Prime → play → schedule
111
- self.primePlayer(player, engine: engine)
112
- if !player.isPlaying { player.play() }
113
-
114
- do {
115
- let file = try AVAudioFile(forReading: item.url)
116
- player.scheduleFile(file, at: nil) {
117
- DispatchQueue.main.async { item.done() }
118
- }
119
- NSLog("[STT] TTS: scheduled pending via AVAudioEngine: \(item.url.lastPathComponent)")
120
- } catch {
121
- NSLog("[STT] TTS pending schedule error: \(error)")
122
- // We still *don’t* fallback by design.
123
- }
124
- }
125
- }
126
- }
127
- }
128
-
129
- private func engineReadyForPlayback(_ engine: AVAudioEngine?) -> Bool {
130
- guard let e = engine, e.isRunning else { return false }
131
- let fmt = e.mainMixerNode.outputFormat(forBus: 0)
132
- // Non-zero SR/ch and we declared the graph "hot"
133
- return fmt.sampleRate > 0 && fmt.channelCount > 0 && graphState == .hot
134
- }
135
-
136
- // Prime the player with a tiny silent buffer so its first pull has data
137
- private func primePlayer(_ player: AVAudioPlayerNode, engine: AVAudioEngine) {
138
- let fmt = engine.mainMixerNode.outputFormat(forBus: 0)
139
- guard fmt.sampleRate > 0, fmt.channelCount > 0 else { return }
140
- if let buf = AVAudioPCMBuffer(pcmFormat: fmt, frameCapacity: 128) {
141
- buf.frameLength = 128
142
- if let ch = buf.floatChannelData {
143
- memset(ch[0], 0, Int(buf.frameLength) * MemoryLayout<Float>.size)
144
- }
145
- player.scheduleBuffer(buf, at: nil, options: .interrupts, completionHandler: nil)
146
- }
147
- }
148
-
149
- private(set) var sttActive = false
150
-
151
- // partial cadence monitor
152
- private var emaPartialGap: Double = 0 // exponential moving average of time between partials
153
- private let emaAlpha: Double = 0.3
154
-
155
- // MARK: - Event names (unchanged)
156
- public static let supportedEvents: [String] = [
157
- "onSpeechResults",
158
- "onSpeechStart",
159
- "onSpeechPartialResults",
160
- "onSpeechError",
161
- "onSpeechEnd",
162
- "onSpeechRecognized",
163
- "onSpeechVolumeChanged"
164
- ]
165
-
166
- // MARK: - Public API (native replacements for the former RCT methods)
167
-
168
- public func isSpeechAvailable(_ completion: @escaping (Bool) -> Void) {
169
- SFSpeechRecognizer.requestAuthorization { status in
170
- switch status {
171
- case .authorized: completion(true)
172
- default: completion(false)
173
- }
174
- }
175
- }
176
-
177
- public func isRecognizing() -> Bool {
178
- guard let task = recognitionTask else { return false }
179
- return task.state == .running
180
- }
181
-
182
- private func rebindEngineConfigObserver(to newEngine: AVAudioEngine?) {
183
- let nc = NotificationCenter.default
184
- if let old = observedEngineForConfigChange {
185
- nc.removeObserver(self,
186
- name: .AVAudioEngineConfigurationChange,
187
- object: old)
188
- }
189
- observedEngineForConfigChange = newEngine
190
- if let e = newEngine {
191
- nc.addObserver(self,
192
- selector: #selector(handleEngineConfigChange(_:)),
193
- name: .AVAudioEngineConfigurationChange,
194
- object: e)
195
- }
196
- }
197
-
198
- private func ensurePlaybackNode(in engine: AVAudioEngine) -> AVAudioPlayerNode {
199
- // If we have a node but it's tied to a different engine or got disconnected, recreate it.
200
- if let p = playbackNode, p.engine === engine {
201
- return p
202
- }
203
- let p = AVAudioPlayerNode()
204
- playbackNode = p
205
- engine.attach(p)
206
- // Connect with nil format so the mixer does SRC if needed
207
- engine.connect(p, to: engine.mainMixerNode, format: nil)
208
- return p
209
- }
210
-
211
- private func startWatchdog() {
212
- stallWatchdog?.invalidate()
213
- stallWatchdog = Timer.scheduledTimer(withTimeInterval: 2.0, repeats: true) { [weak self] _ in
214
- self?.checkTaskHealth()
215
- }
216
- RunLoop.main.add(stallWatchdog!, forMode: .common)
217
- }
218
-
219
- private func stopWatchdog() {
220
- stallWatchdog?.invalidate()
221
- stallWatchdog = nil
222
- }
223
-
224
- private func rearmTask(reason: String) {
225
- // Cancel old task only — keep the engine and tap running.
226
- recognitionTask?.cancel()
227
- recognitionTask = nil
228
-
229
- seenRealSpeech = false
230
- lastTaskStartAt = CACurrentMediaTime()
231
- startTask(makeFreshRequest())
232
- NSLog("[STT] rearmTask(\(reason)) -> new task started")
233
- }
234
-
235
- private func checkTaskHealth() {
236
- guard let engine = audioEngine else { return }
237
- let now = CACurrentMediaTime()
238
-
239
- // Engine down? Let your existing logic handle it; just bail.
240
- if !engine.isRunning { return }
241
-
242
- // If recognizer is globally unavailable, don’t thrash — wait until it flips back.
243
- if let rec = speechRecognizer, rec.isAvailable == false {
244
- NSLog("[STT] watchdog: recognizer unavailable; waiting…")
245
- return
246
- }
247
-
248
- // No task at all? Spin one up.
249
- if recognitionTask == nil {
250
- if now - lastRearmAt > rearmCooldownTask {
251
- NSLog("[STT] watchdog: no task -> start fresh request")
252
- lastRearmAt = now
253
- startTask(makeFreshRequest())
254
- }
255
- return
256
- }
257
-
258
- // If we’ve had buffers recently but no results for a while, assume the task is stuck.
259
- let noResultsFor = now - lastResultAt
260
- let hadRecentAudio = (now - lastBufferAt) < max(2.0, stallThreshold) // tap is alive
261
-
262
- if hadRecentAudio && noResultsFor > stallThreshold {
263
- if now - lastRearmAt > rearmCooldownTask {
264
- consecutiveStallCount += 1
265
- NSLog("[STT] watchdog: stall detected (no results for \(Int(noResultsFor))s, audio flowing). rearm #\(consecutiveStallCount)")
266
-
267
- rearmTask(reason: "watchdog-stall")
268
- lastRearmAt = now
269
-
270
- // If we stall repeatedly, recreate the recognizer itself (server/session could be hosed)
271
- if consecutiveStallCount >= 3 {
272
- recreateSpeechRecognizerPreservingLocale()
273
- consecutiveStallCount = 0
274
- }
275
- }
276
- } else if hadRecentAudio {
277
- // Healthy path: audio & results are flowing; reset stall counter
278
- consecutiveStallCount = 0
279
- }
280
- }
281
-
282
- public func startSpeech(localeStr: String?) {
283
- NSLog("[STT] startSpeech(locale=\(localeStr ?? "nil"))")
284
-
285
- if recognitionTask != nil {
286
- sendResult(error: ["code": "already_started", "message": "Speech recognition already started!"],
287
- bestTranscription: nil, transcriptions: nil, isFinal: nil)
288
- return
289
- }
290
-
291
- SFSpeechRecognizer.requestAuthorization { [weak self] status in
292
- guard let self = self else { return }
293
- switch status {
294
- case .notDetermined:
295
- self.sendResult(error: ["message": "Speech recognition not yet authorized"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
296
- case .denied:
297
- self.sendResult(error: ["message": "User denied access to speech recognition"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
298
- case .restricted:
299
- self.sendResult(error: ["message": "Speech recognition restricted on this device"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
300
- case .authorized:
301
- self.setupAndStartRecognizing(localeStr: localeStr)
302
- @unknown default:
303
- self.sendResult(error: ["message": "Unknown authorization status"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
304
- }
305
- }
306
- }
307
-
308
- public func stopSpeech(_ completion: ((Bool) -> Void)? = nil) {
309
- NSLog("[STT] stopSpeech() requested by app")
310
- recognitionTask?.finish()
311
- completion?(false)
312
- }
313
-
314
- public func cancelSpeech(_ completion: ((Bool) -> Void)? = nil) {
315
- NSLog("[STT] cancelSpeech() requested by app")
316
-
317
- recognitionTask?.cancel()
318
- completion?(false)
319
- }
320
-
321
- public func destroySpeech(_ completion: ((Bool) -> Void)? = nil) {
322
- NSLog("[STT] **** destroySpeech!!!")
323
- teardown()
324
- completion?(false)
325
- }
326
-
327
- private func updateSessionRouting(selectBestInput: Bool = true) {
328
- let s = AVAudioSession.sharedInstance()
329
-
330
- // fast checks & logs can run on main
331
- let inputs = s.currentRoute.inputs
332
- guard !inputs.isEmpty else {
333
- NSLog("[STT] ⚠️ No capture route (likely A2DP). Deferring engine start.")
334
- return
335
- }
336
-
337
- DispatchQueue.global(qos: .userInitiated).async { [weak self] in
338
- guard let self = self else { return }
339
- do { try s.setActive(false, options: [.notifyOthersOnDeactivation]) }
340
- catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
341
-
342
- let hasWiredOrCar = s.currentRoute.outputs.contains {
343
- $0.portType == .headphones || $0.portType == .carAudio || $0.portType == .usbAudio
344
- }
345
- if selectBestInput, let all = s.availableInputs {
346
- let btHFP = all.first { $0.portType == .bluetoothHFP }
347
- let wired = all.first { $0.portType == .headsetMic }
348
- let built = all.first { $0.portType == .builtInMic }
349
- let best = btHFP ?? wired ?? built
350
- do {
351
- if s.preferredInput?.uid != best?.uid { try s.setPreferredInput(best) }
352
- if let builtIn = best, builtIn.portType == .builtInMic,
353
- let ds = builtIn.dataSources?.first(where: { $0.orientation == .bottom || $0.orientation == .back }) {
354
- try? builtIn.setPreferredDataSource(ds)
355
- }
356
- } catch {
357
- NSLog("[STT] setPreferredInput failed: \(error.localizedDescription)")
358
- }
359
- }
360
-
361
- var opts: AVAudioSession.CategoryOptions = [.allowBluetooth]
362
- if !hasWiredOrCar { opts.insert(.defaultToSpeaker) }
363
-
364
- if s.category != .playAndRecord || s.mode != .voiceChat || s.categoryOptions != opts {
365
- do { try s.setCategory(.playAndRecord, mode: .voiceChat, options: opts) }
366
- catch { NSLog("[STT] setCategory failed: \(error.localizedDescription)") }
367
- }
368
-
369
- do { try s.setActive(true, options: []) }
370
- catch { NSLog("[STT] setActive failed: \(error.localizedDescription)") }
371
-
372
- // Optional: force 16k after activation
373
- self.force16kIfPossible(s)
374
-
375
- // Log route back on main so logs stay ordered
376
- DispatchQueue.main.async {
377
- let inPorts = s.currentRoute.inputs.map { "\($0.portType.rawValue):\($0.portName)" }.joined(separator:", ")
378
- let outPorts = s.currentRoute.outputs.map { "\($0.portType.rawValue):\($0.portName)" }.joined(separator:", ")
379
- NSLog("[STT] route in=[\(inPorts)] out=[\(outPorts)]")
380
- }
381
- }
382
- }
383
-
384
- // ↓↓↓ preferred settings helper
385
- private func force16kIfPossible(_ session: AVAudioSession) {
386
- try? session.setPreferredSampleRate(16_000)
387
- if session.isInputAvailable { try? session.setPreferredInputNumberOfChannels(1) }
388
- try? session.setPreferredOutputNumberOfChannels(1)
389
- try? session.setPreferredIOBufferDuration(0.02) // ~20 ms frames
390
- }
391
-
392
- // MARK: - Core logic (kept intact, including AEC order/steps)
393
-
394
- /// Returns true if no errors occurred (identical flow & calls as ObjC) + keep-alive opts.
395
- private func setupAudioSession() -> Bool {
396
- var err: NSError?
397
- let session = AVAudioSession.sharedInstance()
398
- self.audioSession = session
399
-
400
- do { try session.setActive(false, options: [.notifyOthersOnDeactivation]) }
401
- catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
402
-
403
- // Build options to match our routing rules
404
- // (defaultToSpeaker only when no external output is active)
405
- let hasExternalOutput: Bool = session.currentRoute.outputs.contains {
406
- switch $0.portType {
407
- case .headphones, .bluetoothA2DP, .bluetoothHFP, .bluetoothLE, .airPlay, .carAudio, .usbAudio:
408
- return true
409
- default:
410
- return false
411
- }
412
- }
413
-
414
- var opts: AVAudioSession.CategoryOptions = [.allowBluetooth]
415
- if !hasExternalOutput { opts.insert(.defaultToSpeaker) }
416
- if #available(iOS 14.5, *) {
417
- // Prevent muted switch / mic mute from killing our capture pipeline
418
- opts.insert(.overrideMutedMicrophoneInterruption)
419
- }
420
-
421
- do {
422
- try session.setCategory(.playAndRecord, mode: .voiceChat, options: opts)
423
- } catch { err = error as NSError }
424
-
425
- do { try session.setActive(false, options: [.notifyOthersOnDeactivation]) }
426
- catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
427
-
428
- // Force 16k before and after activation (some routes settle only after setActive)
429
- force16kIfPossible(session)
430
- do { try session.setActive(true) } catch { err = error as NSError }
431
- NSLog("[STT] session SR=%.1f inCh=%d outCh=%d (wanted 16000)",
432
- session.sampleRate,
433
- Int(session.inputNumberOfChannels),
434
- Int(session.outputNumberOfChannels))
435
- force16kIfPossible(session)
436
-
437
- if let e = err {
438
- NSLog("[STT] setupAudioSession error: \(e.localizedDescription)")
439
- sendResult(error: ["code": "audio", "message": e.localizedDescription],
440
- bestTranscription: nil, transcriptions: nil, isFinal: nil)
441
- return false
442
- }
443
- return true
444
- }
445
-
446
- private func currentInputFormat(_ engine: AVAudioEngine) -> AVAudioFormat? {
447
- // Prefer whatever CoreAudio currently provides; avoid cached formats.
448
- let fmt = engine.inputNode.outputFormat(forBus: 0)
449
- if fmt.sampleRate > 0 && fmt.channelCount > 0 { return fmt }
450
- // Fallback: build a sane mono format from session if ever needed.
451
- let sr = max(8000, AVAudioSession.sharedInstance().sampleRate)
452
- return AVAudioFormat(commonFormat: .pcmFormatFloat32,
453
- sampleRate: sr,
454
- channels: 1,
455
- interleaved: false)
456
- }
457
-
458
- private func isHeadsetPluggedIn() -> Bool {
459
- let route = AVAudioSession.sharedInstance().currentRoute
460
- for out in route.outputs {
461
- if out.portType == .headphones || out.portType == .bluetoothA2DP {
462
- return true
463
- }
464
- }
465
- return false
466
- }
467
-
468
- private func isHeadSetBluetooth() -> Bool {
469
- for port in AVAudioSession.sharedInstance().availableInputs ?? [] {
470
- if port.portType == .bluetoothHFP { return true }
471
- }
472
- return false
473
- }
474
-
475
- private func loadContextualStrings() -> [String] {
476
- guard let filePath = Bundle.main.path(forResource: "words_flattened", ofType: "txt") else {
477
- NSLog("words_flattened.txt not found in bundle")
478
- return []
479
- }
480
- do {
481
- let contents = try String(contentsOfFile: filePath, encoding: .utf8)
482
- let rawItems = contents.components(separatedBy: ",")
483
- var cleaned: [String] = []
484
- cleaned.reserveCapacity(rawItems.count)
485
- for item in rawItems {
486
- var t = item.trimmingCharacters(in: .whitespacesAndNewlines)
487
- t = t.replacingOccurrences(of: "\"", with: "")
488
- if !t.isEmpty { cleaned.append(t) }
489
- }
490
- return cleaned
491
- } catch {
492
- NSLog("Error reading contextualStrings: \(error)")
493
- return []
494
- }
495
- }
496
-
497
- // Add helpers
498
- private func makeFreshRequest() -> SFSpeechAudioBufferRecognitionRequest {
499
- let req = SFSpeechAudioBufferRecognitionRequest()
500
- if #available(iOS 16, *) { req.addsPunctuation = true }
501
- req.shouldReportPartialResults = true
502
- //if #available(iOS 13.0, *) { req.taskHint = .dictation }
503
- req.contextualStrings = loadContextualStrings()
504
- self.recognitionRequest = req
505
- NSLog("makeFreshRequest()")
506
- return req
507
- }
508
-
509
- private func startTask(_ req: SFSpeechAudioBufferRecognitionRequest) {
510
- NSLog("starting recognitionTask")
511
- lastTaskStartAt = CACurrentMediaTime()
512
- lastResultAt = lastTaskStartAt
513
- let taskSessionId = self.sessionId
514
- self.recognitionTask = self.speechRecognizer?.recognitionTask(with: req) { [weak self] result, error in
515
- guard let self = self else { return }
516
- if taskSessionId != self.sessionId { NSLog("task session mismatch -> ignore"); return }
517
- self.lastResultAt = CACurrentMediaTime()
518
-
519
- func markIfReal(_ r: SFSpeechRecognitionResult?) {
520
- guard let r = r else { return }
521
- let best = r.bestTranscription.formattedString.trimmingCharacters(in: .whitespacesAndNewlines)
522
- if !best.isEmpty ||
523
- r.transcriptions.contains(where: { !$0.formattedString.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }) {
524
- if !self.seenRealSpeech {
525
- self.seenRealSpeech = true
526
- NSLog("first real speech detected -> onSpeechStart to JS")
527
- self.sendEvent(name: "onSpeechStart", body: nil)
528
- }
529
- }
530
- }
531
- markIfReal(result)
532
-
533
- func rearm(_ why: String, delay: TimeInterval = 0.05) {
534
- guard self.continuous else { return }
535
- NSLog("REARM (\(why))")
536
- self.recognitionTask?.cancel()
537
- self.recognitionTask = nil
538
- DispatchQueue.main.asyncAfter(deadline: .now() + delay) {
539
- self.startTask(self.makeFreshRequest())
540
- }
541
- }
542
-
543
- if let error = error {
544
- NSLog("task error \(error._code): \(error.localizedDescription)")
545
- // treat as transient for continuous mode
546
- self.rearmTask(reason: "error")
547
- return
548
- }
549
-
550
- guard let result = result else {
551
- NSLog("task nil result")
552
- self.rearmTask(reason: "nil-result")
553
- return
554
- }
555
-
556
- let isFinal = result.isFinal
557
- let parts = result.transcriptions.map { $0.formattedString }
558
- self.sendResult(error: nil,
559
- bestTranscription: result.bestTranscription.formattedString,
560
- transcriptions: parts,
561
- isFinal: isFinal)
562
-
563
- if isFinal {
564
- NSLog("task final -> onSpeechEnd")
565
- self.sendEvent(name: "onSpeechEnd", body: nil)
566
- if self.continuous {
567
- self.rearmTask(reason: "final")
568
- } else {
569
- NSLog("non-continuous final -> teardown")
570
- self.teardown()
571
- }
572
- }
573
- }
574
- }
575
-
576
- public func teardown() {
577
- NSLog("[STT] teardown() begin")
578
- setGraphState(.cold, why: "teardown")
579
- isTearingDown = true
580
- stopWatchdog()
581
- consecutiveStallCount = 0
582
-
583
- if let task = recognitionTask {
584
- task.cancel()
585
- recognitionTask = nil
586
- }
587
- AudioPlaybackHook.engineScheduleFile = nil
588
- AudioPlaybackHook.isEngineReady = nil
589
- AudioPlaybackHook.useOnlyEnginePlayback = nil
590
- AudioPlaybackHook.stopEnginePlayback = nil
591
- sttActive = false
592
-
593
- if let p = playbackNode {
594
- p.stop()
595
- }
596
- playbackNode = nil
597
-
598
- if let req = recognitionRequest {
599
- req.endAudio()
600
- recognitionRequest = nil
601
- }
602
-
603
- if let engine = audioEngine {
604
- // AVAudioEngine.inputNode is non-optional, so no nil check is needed
- engine.inputNode.removeTap(onBus: 0)
- engine.inputNode.reset()
608
- if engine.isRunning {
609
- engine.stop()
610
- }
611
- engine.reset()
612
- rebindEngineConfigObserver(to: nil)
613
- audioEngine = nil // Crucial step!
614
- }
615
-
616
- resetAudioSession()
617
-
618
- sessionId = nil
619
- isTearingDown = false
620
- }
621
-
622
- private func resetAudioSession() {
623
- if audioSession == nil {
624
- audioSession = AVAudioSession.sharedInstance()
625
- }
626
- guard let session = audioSession else { return }
627
-
628
- // Preserve & compare category exactly as original logic
629
- let current = session.category
630
- if priorAudioCategory == current { return }
631
-
632
- // (kept commented as in your code)
633
- // do {
634
- // try session.setCategory(priorAudioCategory ?? .soloAmbient,
635
- // mode: .default,
636
- // options: [.allowBluetooth,
637
- // .defaultToSpeaker,
638
- // .allowAirPlay,
639
- // .mixWithOthers])
640
- // } catch { }
641
- audioSession = nil
642
- }
643
-
644
- // LATEST assertAEC
645
- private func assertAEC(_ engine: AVAudioEngine) {
646
- do { try engine.inputNode.setVoiceProcessingEnabled(true) }
647
- catch { NSLog("[STT] assertAEC: setVoiceProcessingEnabled(true) failed: \(error)") }
648
- }
649
-
650
- private func isPlayerConnected(_ player: AVAudioPlayerNode?, to engine: AVAudioEngine?) -> Bool {
651
- guard let p = player, let e = engine else { return false }
652
- // If the node is attached and has a non-zero channel count on its output, it’s effectively connected.
653
- let fmt = p.outputFormat(forBus: 0)
654
- return (p.engine === e) && (fmt.channelCount > 0) && (fmt.sampleRate > 0)
655
- }
656
-
657
- /// Try to keep the capture alive without tearing down recognition.
658
- /// 1) If engine exists but not running → try start()
659
- /// 2) If start fails or graph became invalid → rebuild graph and start
660
- /// 3) If we don’t have a task yet, start one.
661
- private func ensureEngineRunning(reason: String) {
662
- let now = CFAbsoluteTimeGetCurrent()
663
- if (now - lastReclaimAttempt) < reclaimCooldown {
664
- NSLog("[STT] ensureEngineRunning(\(reason)) skipped (cooldown)")
665
- return
666
- }
667
- lastReclaimAttempt = now
668
-
669
- if let e = audioEngine, !e.isRunning {
670
- assertAEC(e)
671
- do {
672
- playbackNode?.stop()
673
- playbackNode = nil
674
- try e.start()
675
- NSLog("🔄 AVAudioEngine restarted after config change. isRunning=\(e.isRunning)")
676
- } catch {
677
- NSLog("❌ Could not re-start after config change: \(error)")
678
- }
679
- }
680
-
681
- // --- full recovery path (this was previously dead code) ---
682
- guard let engine = audioEngine else {
683
- NSLog("[STT] ensureEngineRunning(\(reason)): no engine → rebuild")
684
- rebuildEngineGraphAndRestart(reason: reason)
685
- return
686
- }
687
-
688
- assertAEC(engine)
689
-
690
- if !engine.isRunning {
691
- setGraphState(.starting, why: "ensureEngineRunning(\(reason))")
692
- do {
693
- try engine.start()
694
- setGraphState(.hot, why: "engine.start() ok (ensureEngineRunning)")
695
- self.tryFlushPendingTTS() // flush any TTS queued while the engine was down
696
- NSLog("[STT] ensureEngineRunning(\(reason)): engine.start() -> \(engine.isRunning)")
697
- } catch {
698
- setGraphState(.unstable, why: "engine.start() failed (ensureEngineRunning)")
699
- NSLog("[STT] ensureEngineRunning(\(reason)): engine.start() failed: \(error) → rebuild")
700
- rebuildEngineGraphAndRestart(reason: reason)
701
- return
702
- }
703
- }
704
-
705
- if recognitionTask == nil {
706
- if let req = recognitionRequest {
707
- NSLog("[STT] ensureEngineRunning(\(reason)): no task -> startTask(existing req)")
708
- startTask(req)
709
- } else {
710
- NSLog("[STT] ensureEngineRunning(\(reason)): no req -> makeFreshRequest + startTask")
711
- startTask(makeFreshRequest())
712
- }
713
- }
714
- }
715
-
716
- /// Rebuilds AVAudioEngine graph (mic→mute mixer, player→mainMixer), reinstalls tap,
717
- /// and restarts the engine. Does NOT nuke the current recognitionRequest/task unless required.
718
- private func rebuildEngineGraphAndRestart(reason: String) {
719
- NSLog("[STT] 🔄 rebuildEngineGraphAndRestart (\(reason))")
720
-
721
- // Keep current request if present; we'll keep appending into it
722
- let existingReq = self.recognitionRequest
723
-
724
- // Tear down engine ONLY (keep session, request)
725
- if let engine = audioEngine {
726
- // AVAudioEngine.inputNode is non-optional, so no nil check is needed
- engine.inputNode.removeTap(onBus: 0)
- engine.inputNode.reset()
730
- if engine.isRunning { engine.stop() }
731
- engine.reset()
732
- }
733
-
734
- // Recreate engine and graph
735
- let newEngine = AVAudioEngine()
736
- self.audioEngine = newEngine
737
-
738
- let inputNode = newEngine.inputNode
739
- do {
740
- try inputNode.setVoiceProcessingEnabled(true)
741
- } catch {
742
- NSLog("[STT] rebuild: failed to enable voice processing: \(error)")
743
- }
744
- if #available(iOS 17.0, *) {
745
- var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
746
- duck.enableAdvancedDucking = false
747
- duck.duckingLevel = .min
748
- inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
749
- }
750
-
751
- // Live format (may be 0 Hz briefly during route churn)
752
- let liveFmt = newEngine.inputNode.outputFormat(forBus: 0)
753
- guard liveFmt.sampleRate > 0, liveFmt.channelCount > 0 else {
754
- NSLog("[STT] rebuild: input format invalid (0 Hz) — retry shortly")
755
- DispatchQueue.main.asyncAfter(deadline: .now() + 0.05) { [weak self] in
756
- self?.ensureEngineRunning(reason: "wait-valid-input-format(rebuild)")
757
- }
758
- return
759
- }
760
-
761
- // mic → mute mixer → mainMixer
762
- let micMixer = AVAudioMixerNode()
763
- newEngine.attach(micMixer)
764
- // Use nil to let engine pick a valid format (avoids 0 Hz assertion)
765
- newEngine.connect(inputNode, to: micMixer, format: nil)
766
- newEngine.connect(micMixer, to: newEngine.mainMixerNode, format: nil)
767
- micMixer.outputVolume = 0.0
768
-
769
- // TTS player → mainMixer (keep same player if possible, else recreate)
770
- if playbackNode == nil { playbackNode = AVAudioPlayerNode() }
771
- if let player = playbackNode {
772
- if player.engine == nil { newEngine.attach(player) }
773
- newEngine.connect(player, to: newEngine.mainMixerNode, format: nil)
774
- }
775
-
776
- // removeTap(onBus:) does not throw, so the previous do/try?/catch wrapper was unreachable error handling
- inputNode.removeTap(onBus: 0)
781
-
782
- let targetFmt = AVAudioFormat(commonFormat: .pcmFormatFloat32,
783
- sampleRate: 16_000,
784
- channels: 1,
785
- interleaved: false)!
786
-
787
- // Tap with nil so it follows route changes automatically
788
- inputNode.installTap(onBus: 0, bufferSize: 1024, format: nil) { [weak self] buffer, _ in
789
- guard let self = self else { return }
790
-
791
- // (same level metering as your current code)
792
- let frames: vDSP_Length = vDSP_Length(buffer.frameLength)
793
- let LP: Float = 0.5
794
-
795
- if buffer.format.channelCount > 0, let ch0 = buffer.floatChannelData?[0] {
796
- var peak0: Float = 0
797
- vDSP_maxmgv(ch0, 1, &peak0, frames)
798
- let db0: Float = (peak0 == 0) ? -100 : 20.0 * log10f(peak0)
799
- let sm0 = LP * db0 + (1 - LP) * self.averagePowerForChannel0
800
- self.averagePowerForChannel0 = sm0
801
- self.averagePowerForChannel1 = sm0
802
- }
803
- if buffer.format.channelCount > 1, let ch1 = buffer.floatChannelData?[1] {
804
- var peak1: Float = 0
805
- vDSP_maxmgv(ch1, 1, &peak1, frames)
806
- let db1: Float = (peak1 == 0) ? -100 : 20.0 * log10f(peak1)
807
- let sm1 = LP * db1 + (1 - LP) * self.averagePowerForChannel1
808
- self.averagePowerForChannel1 = sm1
809
- }
810
- self.averagePowerForChannel1 = Float(self._normalizedPowerLevelFromDecibels(CGFloat(self.averagePowerForChannel1)) * 10.0)
811
- self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": self.averagePowerForChannel1])
812
-
813
- // ---- Convert to 16 kHz MONO for STT request
814
- let inFmt = buffer.format
815
- if inFmt.sampleRate != 16_000 || inFmt.channelCount != 1 {
816
- if let conv = AVAudioConverter(from: inFmt, to: targetFmt) {
817
- let ratio = targetFmt.sampleRate / inFmt.sampleRate
818
- let outCap = AVAudioFrameCount(Double(buffer.frameLength) * ratio) + 8
819
- if let outBuf = AVAudioPCMBuffer(pcmFormat: targetFmt, frameCapacity: outCap) {
820
- var err: NSError? = nil
821
- var fed = false
822
- conv.convert(to: outBuf, error: &err) { _, outStatus -> AVAudioBuffer? in
823
- if fed {
824
- outStatus.pointee = .endOfStream
825
- return nil
826
- } else {
827
- fed = true
828
- outStatus.pointee = .haveData
829
- return buffer
830
- }
831
- }
832
- if err == nil {
833
- self.recognitionRequest?.append(outBuf)
834
- } else {
835
- self.recognitionRequest?.append(buffer) // fallback
836
- }
837
- } else {
838
- self.recognitionRequest?.append(buffer)
839
- }
840
- } else {
841
- self.recognitionRequest?.append(buffer)
842
- }
843
- } else {
844
- self.recognitionRequest?.append(buffer)
845
- }
846
- self.lastBufferAt = CACurrentMediaTime()
847
- }
848
-
849
- newEngine.prepare()
850
- setGraphState(.starting, why: "pre start in rebuild")
851
- do {
852
- try newEngine.start()
853
- setGraphState(.hot, why: "engine.start() ok (rebuild)")
854
- self.tryFlushPendingTTS() // flush queued TTS now that the rebuilt engine is hot
855
- let f = newEngine.inputNode.outputFormat(forBus: 0)
856
- NSLog("[STT] rebuild: engine.start() ok, running=\(newEngine.isRunning) (fmt=%.1f Hz / %d ch)",
857
- f.sampleRate, Int(f.channelCount))
858
- } catch {
859
- setGraphState(.unstable, why: "engine.start() failed (rebuild)")
860
- NSLog("[STT] rebuild: engine.start() failed: \(error)")
861
- }
862
-
863
- // If we lost the request during rebuild, recreate + start task.
864
- if self.recognitionRequest == nil {
865
- if let old = existingReq {
866
- self.recognitionRequest = old
867
- } else {
868
- self.recognitionRequest = makeFreshRequest()
869
- }
870
- }
871
- if self.recognitionTask == nil {
872
- startTask(self.recognitionRequest!)
873
- }
874
- rebindEngineConfigObserver(to: newEngine)
875
- }
876
-
877
- @objc private func handleEngineConfigChange(_ note: Notification) {
878
- NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange")
879
- if playbackNode?.isPlaying == true { playbackNode?.stop() }
880
- markUnstableThenRecheck(why: "AVAudioEngineConfigurationChange")
881
-
882
- // If engine stopped, drop the player node (it will be lazily recreated)
883
- if let e = audioEngine, !e.isRunning {
884
- playbackNode?.stop()
885
- playbackNode = nil
886
- }
887
-
888
- // Re-assert a mic-capable route (HFP/wired/built-in)
889
- updateSessionRouting(selectBestInput: true)
890
-
891
- // Re-enable VoiceProcessingIO (AEC) and restart if needed
892
- ensureEngineRunning(reason: "engine-config-change")
893
- self.tryFlushPendingTTS() // flush any TTS queued during the config change
894
- }
895
-
896
- @objc private func handleMediaServicesReset(_ note: Notification) {
897
- NSLog("[STT] 📺 Media services were RESET: reclaiming mic & session")
898
- // Re-apply audio session and try to rebuild graph if needed
899
- _ = setupAudioSession()
900
- ensureEngineRunning(reason: "media-services-reset")
901
- self.tryFlushPendingTTS() // flush any TTS queued during the media-services reset
902
- }
903
-
904
- @objc private func handleRouteChange(_ note: Notification) {
905
- let info = note.userInfo ?? [:]
906
- NSLog("[STT] 🔀 route change: \(info)")
907
- if playbackNode?.isPlaying == true { playbackNode?.stop() }
908
- markUnstableThenRecheck(why: "route-change")
909
-
910
- let s = AVAudioSession.sharedInstance()
911
-
912
- // 1) Re-apply a mic-safe category/mode and prefer HFP/built-in mic.
913
- updateSessionRouting(selectBestInput: true)
914
- if let inputs = s.availableInputs {
915
- let preferred = inputs.first { $0.portType == .bluetoothHFP }
916
- ?? inputs.first { $0.portType == .headsetMic }
917
- ?? inputs.first { $0.portType == .builtInMic }
918
- try? s.setPreferredInput(preferred)
919
- }
920
-
921
- // 2) If there’s still no input, don’t thrash; wait for a usable route.
922
- let inputs = s.currentRoute.inputs
923
- NSLog("[STT] 🎤 inputs after route fix: \(inputs.map { $0.portType.rawValue })")
924
- guard !inputs.isEmpty else {
925
- NSLog("[STT] ⚠️ No mic route available (likely A2DP/AirPlay). Not restarting engine.")
926
- return
927
- }
928
-
929
- // 3) Now recover the engine/task
930
- ensureEngineRunning(reason: "route-change")
931
- self.tryFlushPendingTTS() // flush any TTS queued during the route change
932
- }
933
-
934
- // Call once after engine is created
935
- private func installEngineObservers() {
936
- let nc = NotificationCenter.default
937
-
938
- nc.addObserver(self,
939
- selector: #selector(handleSessionInterruption(_:)),
940
- name: AVAudioSession.interruptionNotification,
941
- object: AVAudioSession.sharedInstance())
942
-
943
- nc.addObserver(self,
944
- selector: #selector(handleRouteChange(_:)),
945
- name: AVAudioSession.routeChangeNotification,
946
- object: AVAudioSession.sharedInstance())
947
-
948
- nc.addObserver(self,
949
- selector: #selector(handleMediaServicesReset(_:)),
950
- name: AVAudioSession.mediaServicesWereResetNotification,
951
- object: nil)
952
- }
953
-
954
- @objc private func handleSessionInterruption(_ note: Notification) {
955
- guard
956
- let info = note.userInfo,
957
- let typeVal = info[AVAudioSessionInterruptionTypeKey] as? UInt,
958
- let type = AVAudioSession.InterruptionType(rawValue: typeVal)
959
- else { return }
960
-
961
- if type == .ended {
962
- // On real “render err” Core Audio posts an interruption END
963
- NSLog("Session interruption ended (possible render err):")
964
- }
965
- }
966
-
967
- // Wait for one IO cycle so player won't throw "did not see an IO cycle"
968
- private func awaitOneIOCycle(_ engine: AVAudioEngine,
969
- timeout: TimeInterval = 0.7,
970
- done: @escaping (Bool) -> Void) {
971
- let mixer = engine.mainMixerNode
972
- var fired = false
973
- mixer.installTap(onBus: 0, bufferSize: 128, format: nil) { _, _ in
974
- if !fired {
975
- fired = true
976
- mixer.removeTap(onBus: 0)
977
- DispatchQueue.main.async { done(true) }
978
- }
979
- }
980
- DispatchQueue.main.asyncAfter(deadline: .now() + timeout) {
981
- if !fired {
982
- mixer.removeTap(onBus: 0)
983
- done(false)
984
- }
985
- }
986
- }
987
-
988
- private func setupAndStartRecognizing(localeStr: String?) {
989
- NSLog("[STT] setupAndStartRecognizing begin")
990
- sttActive = true
991
-
992
- audioSession = AVAudioSession.sharedInstance()
993
- guard let session = audioSession else { return }
994
- var err: NSError?
995
-
996
- priorAudioCategory = session.category
997
-
998
- // Tear down resources before starting speech recognition..
999
- NSLog("[STT] pre-teardown")
1000
- teardown()
1001
- // ** IMPORTANT ** Call this again as teardown marks this false
1002
- sttActive = true
1003
-
1004
- sessionId = UUID().uuidString
1005
-
1006
- let locale: Locale? = {
1007
- if let s = localeStr, !s.isEmpty { return Locale(identifier: s) }
1008
- sttActive = false
1009
- return nil
1010
- }()
1011
-
1012
- if let loc = locale {
1013
- speechRecognizer = SFSpeechRecognizer(locale: loc)
1014
- } else {
1015
- speechRecognizer = SFSpeechRecognizer()
1016
- }
1017
- speechRecognizer?.delegate = self
1018
-
1019
- // Start audio session...
1020
- NSLog("[STT] setupAudioSession()")
1021
- guard setupAudioSession() else {
1022
- NSLog("[STT] ERROR ERROR ******** setupAudioSession()")
1023
- teardown()
1024
- sttActive = false
1025
- return
1026
- }
1027
- installEngineObservers()
1028
-
1029
- let request = SFSpeechAudioBufferRecognitionRequest()
1030
- recognitionRequest = request
1031
-
1032
- if #available(iOS 16, *) {
1033
- request.addsPunctuation = true
1034
- }
1035
- request.shouldReportPartialResults = true
1036
- //if #available(iOS 13.0, *) { request.taskHint = .dictation }
1037
- request.contextualStrings = loadContextualStrings()
1038
-
1039
- guard recognitionRequest != nil else {
1040
- sendResult(error: ["code": "recognition_init"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
1041
- teardown()
1042
- return
1043
- }
1044
-
1045
- if audioEngine == nil {
1046
- audioEngine = AVAudioEngine()
1047
- rebindEngineConfigObserver(to: audioEngine)
1048
- }
1049
- do {
1050
- guard let engine = audioEngine else { throw NSError(domain: "voice.audio", code: -1) }
1051
- let inputNode = engine.inputNode
1052
- _ = inputNode // presence check
1053
-
1054
- // Enable voice processing (AEC)
1055
- do {
1056
- try inputNode.setVoiceProcessingEnabled(true)
1057
- } catch {
1058
- NSLog("Failed to enable voice processing for AEC on input node: \(error)")
1059
- }
1060
-
1061
- if #available(iOS 17.0, *) {
1062
- var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
1063
- duck.enableAdvancedDucking = false // disable advanced (VAD-based) ducking
1064
- duck.duckingLevel = .min // “as loud as possible” for other audio
1065
- inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
1066
- }
1067
-
1068
- NSLog("[STT] AEC enable done")
1069
-
1070
- // Live format guard (can briefly be 0 Hz on route churn)
1071
- let liveFmt = engine.inputNode.outputFormat(forBus: 0)
1072
- guard liveFmt.sampleRate > 0, liveFmt.channelCount > 0 else {
1073
- NSLog("[STT] start: input format invalid (0 Hz) — retry shortly")
1074
- DispatchQueue.main.asyncAfter(deadline: .now() + 0.05) { [weak self] in
1075
- self?.ensureEngineRunning(reason: "wait-valid-input-format(start)")
1076
- }
1077
- return
1078
- }
1079
-
1080
- // 1) Mute only the mic path, not the whole main mixer
1081
- let micMixer = AVAudioMixerNode()
1082
- engine.attach(micMixer)
1083
- // Let engine choose format to avoid 0 Hz assertions
1084
- engine.connect(inputNode, to: micMixer, format: nil)
1085
- engine.connect(micMixer, to: engine.mainMixerNode, format: nil)
1086
- micMixer.outputVolume = 0.0 // ← you won't hear your own mic
1087
-
1088
- // 2) Prepare a player node for TTS inside the SAME engine/graph
1089
- let player = AVAudioPlayerNode()
1090
- self.playbackNode = player
1091
- engine.attach(player)
1092
- engine.connect(player, to: engine.mainMixerNode, format: nil)
1093
-
1094
- NSLog("[STT] graph connected (mic->mute mixer, player->mainMixer)")
1095
-
1096
- var tapFrames: UInt64 = 0
1097
-
1098
- // removeTap(onBus:) does not throw; no error handling is needed here
- inputNode.removeTap(onBus: 0)
1101
-
1102
- let targetFmt = AVAudioFormat(commonFormat: .pcmFormatFloat32,
1103
- sampleRate: 16_000,
1104
- channels: 1,
1105
- interleaved: false)!
1106
-
1107
- // Tap with nil so it follows the node’s live format automatically
1108
- inputNode.installTap(onBus: 0, bufferSize: 1024, format: nil) { [weak self] buffer, _ in
1109
- // Strongify self once
1110
- guard let self = self else { return }
1111
- tapFrames &+= UInt64(buffer.frameLength)
1112
- if tapFrames % (44100 * 2) < 1024 { // ~every ~2s at 44.1k
1113
- NSLog("[STT] tap alive, totalFrames=\(tapFrames)")
1114
- }
1115
-
1116
- let frames: vDSP_Length = vDSP_Length(buffer.frameLength)
1117
- let LEVEL_LOWPASS_TRIG: Float = 0.5
1118
-
1119
- // CH0
1120
- if buffer.format.channelCount > 0, let ch0 = buffer.floatChannelData?[0] {
1121
- var peak0: Float = 0
1122
- vDSP_maxmgv(ch0, 1, &peak0, frames)
1123
- let db0: Float = (peak0 == 0) ? -100 : 20.0 * log10f(peak0)
1124
-
1125
- let smoothed0 = LEVEL_LOWPASS_TRIG * db0
1126
- + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel0
1127
- self.averagePowerForChannel0 = smoothed0
1128
- self.averagePowerForChannel1 = smoothed0
1129
- }
1130
-
1131
- // CH1
1132
- if buffer.format.channelCount > 1, let ch1 = buffer.floatChannelData?[1] {
1133
- var peak1: Float = 0
1134
- vDSP_maxmgv(ch1, 1, &peak1, frames)
1135
- let db1: Float = (peak1 == 0) ? -100 : 20.0 * log10f(peak1)
1136
-
1137
- let smoothed1 = LEVEL_LOWPASS_TRIG * db1
1138
- + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel1
1139
- self.averagePowerForChannel1 = smoothed1
1140
- }
1141
-
1142
- // Normalize 0–10 and emit
1143
- self.averagePowerForChannel1 = Float(self._normalizedPowerLevelFromDecibels(CGFloat(self.averagePowerForChannel1)) * 10.0)
1144
- let value = self.averagePowerForChannel1
1145
- self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": value])
1146
-
1147
- // ---- Convert to 16 kHz MONO for STT request
1148
- let inFmt = buffer.format
1149
- if inFmt.sampleRate != 16_000 || inFmt.channelCount != 1 {
1150
- if let conv = AVAudioConverter(from: inFmt, to: targetFmt) {
1151
- // Conservative capacity +8 frames
1152
- let ratio = targetFmt.sampleRate / inFmt.sampleRate
1153
- let outCap = AVAudioFrameCount(Double(buffer.frameLength) * ratio) + 8
1154
- if let outBuf = AVAudioPCMBuffer(pcmFormat: targetFmt, frameCapacity: outCap) {
1155
- var err: NSError? = nil
1156
- var fed = false
1157
- conv.convert(to: outBuf, error: &err) { _, outStatus -> AVAudioBuffer? in
1158
- if fed {
1159
- outStatus.pointee = .endOfStream
1160
- return nil
1161
- } else {
1162
- fed = true
1163
- outStatus.pointee = .haveData
1164
- return buffer
1165
- }
1166
- }
1167
- if err == nil {
1168
- self.recognitionRequest?.append(outBuf)
1169
- } else {
1170
- self.recognitionRequest?.append(buffer) // fallback
1171
- }
1172
- } else {
1173
- self.recognitionRequest?.append(buffer)
1174
- }
1175
- } else {
1176
- self.recognitionRequest?.append(buffer)
1177
- }
1178
- } else {
1179
- self.recognitionRequest?.append(buffer)
1180
- }
1181
-
1182
- self.lastBufferAt = CACurrentMediaTime()
1183
- }
1184
-
1185
- engine.prepare()
1186
- NSLog("[STT] audioEngine prepare")
1187
- setGraphState(.starting, why: "pre start in setupAndStartRecognizing")
1188
- var audioSessionError: NSError?
1189
- do {
1190
- try engine.start()
1191
- setGraphState(.hot, why: "engine.start() ok (setupAndStartRecognizing)")
1192
- self.tryFlushPendingTTS() // flush any TTS queued before the engine came up
1193
- } catch {
1194
- audioSessionError = error as NSError
1195
- setGraphState(.unstable, why: "engine.start() failed (setupAndStartRecognizing)")
1196
- }
1197
-
1198
- // after engine.start() success:
1199
- engineHotAt = CACurrentMediaTime()
1200
- seenRealSpeech = false
1201
- let f = engine.inputNode.outputFormat(forBus: 0)
1202
- NSLog("engine HOT at \(engineHotAt) (fmt=%.1f Hz / %d ch)", f.sampleRate, Int(f.channelCount))
1203
- sendEvent(name: "onSpeechStart", body: nil) // engine hot signal
1204
- startTask(makeFreshRequest())
1205
-
1206
- // Engine is up; expose readiness
1207
- AudioPlaybackHook.isEngineReady = { [weak self] in
1208
- guard let e = self?.audioEngine else { return false }
1209
- let fmt = e.mainMixerNode.outputFormat(forBus: 0)
1210
- return e.isRunning && fmt.sampleRate > 0 && fmt.channelCount > 0
1211
- }
1212
-
1213
- AudioPlaybackHook.useOnlyEnginePlayback = { [weak self] in
1214
- guard let self = self, let e = self.audioEngine else { return false }
1215
- let fmt = e.mainMixerNode.outputFormat(forBus: 0)
1216
- return self.sttActive && e.isRunning && fmt.sampleRate > 0 && fmt.channelCount > 0
1217
- }
1218
-
1219
- startWatchdog()
1220
-
1221
- AudioPlaybackHook.engineScheduleFile = { [weak self] url, done in
1222
- // If STT is active we NEVER fallback; we queue until the engine is ready.
1223
- guard let self = self else { return true }
1224
-
1225
- let scheduleOrQueue: () -> Void = {
1226
- guard let engine = self.audioEngine else {
1227
- // No engine yet — queue
1228
- self.ttsSerial.async { self.pendingTTSSchedules.append((url, done)) }
1229
- return
1230
- }
1231
-
1232
- let mixFmt = engine.mainMixerNode.outputFormat(forBus: 0)
1233
- let ready = engine.isRunning && mixFmt.sampleRate > 0 && mixFmt.channelCount > 0
1234
-
1235
- if ready {
1236
- // Schedule immediately
1237
- if self.playbackNode?.engine !== engine || !self.isPlayerConnected(self.playbackNode, to: engine) {
1238
- self.playbackNode?.stop()
1239
- self.playbackNode = nil
1240
- }
1241
- let player = self.ensurePlaybackNode(in: engine)
1242
- self.primePlayer(player, engine: engine)
1243
- if !player.isPlaying { player.play() }
1244
-
1245
- do {
1246
- let file = try AVAudioFile(forReading: url)
1247
- player.scheduleFile(file, at: nil) {
1248
- DispatchQueue.main.async { done() }
1249
- }
1250
- NSLog("[STT] TTS: scheduled via AVAudioEngine: \(url.lastPathComponent)")
1251
- } catch {
1252
- NSLog("[STT] TTS schedule error: \(error) — queuing instead (no fallback).")
1253
- self.ttsSerial.async { self.pendingTTSSchedules.append((url, done)) }
1254
- }
1255
- } else {
1256
- // Not ready — queue and try to wake the engine
1257
- self.ttsSerial.async { self.pendingTTSSchedules.append((url, done)) }
1258
-
1259
- // Kick the engine and wait for one IO cycle; then flush
1260
- do { if !engine.isRunning { try engine.start() } } catch { }
1261
- self.awaitOneIOCycle(engine, timeout: 0.7) { _ in
1262
- self.tryFlushPendingTTS()
1263
- }
1264
- }
1265
- }
1266
-
1267
- if Thread.isMainThread {
1268
- scheduleOrQueue()
1269
- } else {
1270
- DispatchQueue.main.async { scheduleOrQueue() }
1271
- }
1272
-
1273
- // IMPORTANT: Always “true” while STT is active => no fallback path is taken.
1274
- return true
1275
- }
1276
-
1277
- AudioPlaybackHook.stopEnginePlayback = { [weak self] in
1278
- DispatchQueue.main.async {
1279
- guard let self = self else { return }
1280
- // Stop only the TTS playback node; keep the engine running for STT
1281
- self.playbackNode?.stop()
1282
- }
1283
- }
1284
-
1285
- NSLog("audioEngine startAndReturnError")
1286
- if let audioSessionError = audioSessionError {
1287
- NSLog("audioEngine start error: \(audioSessionError.localizedDescription)")
1288
- self.sendResult(error: ["code": "audio", "message": audioSessionError.localizedDescription],
1289
- bestTranscription: nil, transcriptions: nil, isFinal: nil)
1290
- return
1291
- }
1292
- NSLog("After Start recording and append recording")
1293
- DispatchQueue.main.asyncAfter(deadline: .now() + 3.0) { [weak self] in
1294
- guard let self = self else { return }
1295
- let running = self.audioEngine?.isRunning ?? false
1296
- let taskState = self.recognitionTask?.state.rawValue ?? -1
1297
- NSLog("[STT] health: engineRunning=\(running) taskState=\(taskState)")
1298
- }
1299
-
1300
- NSLog("After if audioSessionError != nil")
1301
- } catch let e as NSError {
1302
- sendResult(error: ["code": "start_recording", "message": e.localizedDescription],
1303
- bestTranscription: nil, transcriptions: nil, isFinal: nil)
1304
- NSLog("End of init...")
1305
- return
1306
- }
1307
- }
1308
-
1309
- // MARK: - Helpers
1310
-
1311
- private func _normalizedPowerLevelFromDecibels(_ decibels: CGFloat) -> CGFloat {
1312
- if decibels < -80.0 || decibels == 0.0 { return 0.0 }
1313
- let minDb: Float = -80.0
1314
- let pow10_min = powf(10.0, 0.05 * minDb)
1315
- let pow10_db = powf(10.0, 0.05 * Float(decibels))
1316
- let power = powf((pow10_db - pow10_min) * (1.0 / (1.0 - pow10_min)), 1.0 / 2.0)
1317
- if power < 1.0 { return CGFloat(power) } else { return 1.0 }
1318
- }
1319
-
1320
- private func sendEvent(name: String, body: [String: Any]?) {
1321
- delegate?.stt(self, didEmitEvent: name, body: body)
1322
- }
1323
-
1324
- /// Exact event behavior preserved from ObjC `sendResult`.
1325
- private func sendResult(error: [String: Any]?,
1326
- bestTranscription: String?,
1327
- transcriptions: [String]?,
1328
- isFinal: Bool?) {
1329
- if let error = error {
1330
- sendEvent(name: "onSpeechError", body: ["error": error])
1331
- }
1332
- if let best = bestTranscription {
1333
- sendEvent(name: "onSpeechResults", body: ["value": [best]])
1334
- }
1335
- if let trans = transcriptions {
1336
- sendEvent(name: "onSpeechPartialResults", body: ["value": trans])
1337
- }
1338
- if let isFinal = isFinal {
1339
- sendEvent(name: "onSpeechRecognized", body: ["isFinal": isFinal])
1340
- }
1341
- }
1342
-
1343
- // MARK: - SFSpeechRecognizerDelegate
1344
-
1345
- public func speechRecognizer(_ speechRecognizer: SFSpeechRecognizer, availabilityDidChange available: Bool) {
1346
- if available == false {
1347
- sendResult(error: ["message": "Speech recognition is not available now"],
1348
- bestTranscription: nil, transcriptions: nil, isFinal: nil)
1349
- }
1350
- }
1351
-
1352
- // MARK: - Small helper to recreate recognizer (used by watchdog)
1353
- private func recreateSpeechRecognizerPreservingLocale() {
1354
- let loc = speechRecognizer?.locale
1355
- speechRecognizer = loc != nil ? SFSpeechRecognizer(locale: loc!) : SFSpeechRecognizer()
1356
- speechRecognizer?.delegate = self
1357
- NSLog("[STT] recreated SFSpeechRecognizer (locale preserved: \(loc?.identifier ?? "default"))")
1358
- }
1359
- }
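
For reference, a minimal sketch (not part of the package) of how a host app could consume the STTDelegate events defined in the removed file above. The event names and payload keys come from STT.supportedEvents and sendResult; the logging sink itself is only an assumption:

import Foundation

// Hypothetical consumer: logs the events the removed STT class emits through STTDelegate.
final class SpeechEventLogger: NSObject, STTDelegate {
    func stt(_ stt: STT, didEmitEvent name: String, body: [String: Any]?) {
        switch name {
        case "onSpeechResults", "onSpeechPartialResults":
            // Both events carry an array of transcriptions under "value".
            let values = body?["value"] as? [String] ?? []
            NSLog("[App] %@: %@", name, values.joined(separator: " | "))
        case "onSpeechVolumeChanged":
            // Normalized 0-10 level under "value".
            NSLog("[App] level: %@", String(describing: body?["value"] ?? 0))
        default:
            NSLog("[App] %@", name)
        }
    }
}

// Usage sketch: keep a strong reference to the logger, since STT.delegate is weak.
// let logger = SpeechEventLogger()
// let stt = STT()
// stt.delegate = logger
// stt.startSpeech(localeStr: "en-US")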