react-native-davoice-tts 1.0.217 → 1.0.219
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/TTSRNBridge.podspec +1 -1
- package/android/libs/com/davoice/tts/1.0.0/tts-1.0.0.aar +0 -0
- package/android/libs/com/davoice/tts/1.0.0/tts-1.0.0.aar.md5 +1 -1
- package/android/libs/com/davoice/tts/1.0.0/tts-1.0.0.aar.sha1 +1 -1
- package/ios/SpeechBridge/SpeechBridge.m +153 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DavoiceTTS +0 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.abi.json +3 -3
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DavoiceTTS +0 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.abi.json +1174 -1174
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.private.swiftinterface +12 -12
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.swiftinterface +12 -12
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.abi.json +1174 -1174
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.private.swiftinterface +12 -12
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.swiftinterface +12 -12
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeDirectory +0 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeRequirements-1 +0 -0
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeResources +24 -99
- package/package.json +1 -1
- package/speech/index.ts +106 -0
- package/android/src/main/java/com/davoice/tts/rn/DaVoiceTTSPackage.java_old_using_new_for_both_stt_and_tts +0 -26
- package/ios/STTRNBridge/STTBridge.m_wtf +0 -109
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT copy.swift____ +0 -1202
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift.bkup +0 -1000
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift.latest +0 -1359
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift1.swift__ +0 -1134
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift__ +0 -1329
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT copy.swift____ +0 -1202
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift.bkup +0 -1000
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift.latest +0 -1359
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift1.swift__ +0 -1134
- package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift__ +0 -1329

@@ -1,1359 +0,0 @@
-// STT.swift
-// Native iOS Swift version (AEC flow preserved 1:1)
-
-import Foundation
-import UIKit
-import Speech
-import Accelerate
-import AVFAudio // or import AVFoundation
-
-@objc public protocol STTDelegate: AnyObject {
-    @objc func stt(_ stt: STT, didEmitEvent name: String, body: [String: Any]?)
-}
-
-@objcMembers
-public final class STT: NSObject, SFSpeechRecognizerDelegate {
-    public weak var delegate: STTDelegate?
-    public var continuous: Bool = true
-
-    // MARK: - Private
-    private var speechRecognizer: SFSpeechRecognizer?
-    private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
-    private var audioEngine: AVAudioEngine?
-    private var recognitionTask: SFSpeechRecognitionTask?
-    private var audioSession: AVAudioSession?
-    private var isTearingDown: Bool = false
-    private var sessionId: String?
-    private var priorAudioCategory: AVAudioSession.Category?
-    private var averagePowerForChannel0: Float = 0
-    private var averagePowerForChannel1: Float = 0
-
-    private var playbackNode: AVAudioPlayerNode?
-    private var seenRealSpeech = false          // flips true after first non-blank token
-    private var engineHotAt: CFTimeInterval = 0 // when engine actually started
-    private let warmupKeepAlive: CFTimeInterval = 4.0 // seconds we’ll keep re-arming in silence
-
-    // Keep-engine-alive helpers
-    private var lastReclaimAttempt: CFAbsoluteTime = 0
-    private let reclaimCooldown: CFTimeInterval = 1.0
-
-    // --- Task health ---
-    private var lastBufferAt: CFTimeInterval = 0   // updated from tap
-    private var lastResultAt: CFTimeInterval = 0   // updated from recognition callback
-    private var lastTaskStartAt: CFTimeInterval = 0
-    private var stallWatchdog: Timer?
-    private var consecutiveStallCount = 0
-    private let stallThreshold: CFTimeInterval = 8.0 // seconds w/o results while engine is hot
-    private let rearmCooldownTask: CFTimeInterval = 2.0
-    private var lastRearmAt: CFTimeInterval = 0
-    private var engineHot = false
-    private var hotAt: CFTimeInterval = 0
-
-    private var observedEngineForConfigChange: AVAudioEngine?
-    // Pending TTS while engine warms/recovers
-    private var pendingTTSSchedules: [(url: URL, done: () -> Void)] = []
-    private let ttsSerial = DispatchQueue(label: "stt.tts.serial")
-
-    // --- Recovery & diagnostics ---
-    private var recoverySeq = 0
-    private var lastRecoveryAt: CFTimeInterval = 0
-    private var lastTaskOrigin: String = "cold"
-    private enum GraphState { case cold, starting, hot, unstable }
-    private var graphState: GraphState = .cold
-    private var stabilityTimer: Timer?
-
-    private func setGraphState(_ s: GraphState, why: String) {
-        graphState = s
-        NSLog("[STT] graphState -> \(s) (\(why))")
-    }
-
-    private func markUnstableThenRecheck(after seconds: TimeInterval = 0.35, why: String) {
-        setGraphState(.unstable, why: why)
-        stabilityTimer?.invalidate()
-        stabilityTimer = Timer.scheduledTimer(withTimeInterval: seconds, repeats: false) { [weak self] _ in
-            guard let self = self, let eng = self.audioEngine else { return }
-            if eng.isRunning {
-                self.setGraphState(.hot, why: "debounce elapsed & engine running")
-                self.tryFlushPendingTTS() // ← ADD THIS LINE
-            } else {
-                do {
-                    try eng.start()
-                    self.setGraphState(.hot, why: "restarted after debounce")
-                    self.tryFlushPendingTTS() // ← ADD THIS LINE
-                } catch {
-                    self.setGraphState(.starting, why: "start failed: \(error.localizedDescription)")
-                }
-            }
-        }
-        RunLoop.main.add(stabilityTimer!, forMode: .common)
-    }
-
-    private func tryFlushPendingTTS() {
-        ttsSerial.async { [weak self] in
-            guard let self = self, let engine = self.audioEngine else { return }
-            // Check readiness: engine running + mixer has valid format
-            let mixFmt = engine.mainMixerNode.outputFormat(forBus: 0)
-            guard engine.isRunning, mixFmt.sampleRate > 0, mixFmt.channelCount > 0 else { return }
-
-            // Drain queue in-order
-            while !self.pendingTTSSchedules.isEmpty {
-                let item = self.pendingTTSSchedules.removeFirst()
-                DispatchQueue.main.async { [weak self] in
-                    guard let self = self else { return }
-                    // Ensure player is attached & connected
-                    if self.playbackNode?.engine !== engine || !self.isPlayerConnected(self.playbackNode, to: engine) {
-                        self.playbackNode?.stop()
-                        self.playbackNode = nil
-                    }
-                    let player = self.ensurePlaybackNode(in: engine)
-
-                    // Prime → play → schedule
-                    self.primePlayer(player, engine: engine)
-                    if !player.isPlaying { player.play() }
-
-                    do {
-                        let file = try AVAudioFile(forReading: item.url)
-                        player.scheduleFile(file, at: nil) {
-                            DispatchQueue.main.async { item.done() }
-                        }
-                        NSLog("[STT] TTS: scheduled pending via AVAudioEngine: \(item.url.lastPathComponent)")
-                    } catch {
-                        NSLog("[STT] TTS pending schedule error: \(error)")
-                        // We still *don’t* fallback by design.
-                    }
-                }
-            }
-        }
-    }
-
-    private func engineReadyForPlayback(_ engine: AVAudioEngine?) -> Bool {
-        guard let e = engine, e.isRunning else { return false }
-        let fmt = e.mainMixerNode.outputFormat(forBus: 0)
-        // Non-zero SR/ch and we declared the graph "hot"
-        return fmt.sampleRate > 0 && fmt.channelCount > 0 && graphState == .hot
-    }
-
-    // Prime the player with a tiny silent buffer so its first pull has data
-    private func primePlayer(_ player: AVAudioPlayerNode, engine: AVAudioEngine) {
-        let fmt = engine.mainMixerNode.outputFormat(forBus: 0)
-        guard fmt.sampleRate > 0, fmt.channelCount > 0 else { return }
-        if let buf = AVAudioPCMBuffer(pcmFormat: fmt, frameCapacity: 128) {
-            buf.frameLength = 128
-            if let ch = buf.floatChannelData {
-                memset(ch[0], 0, Int(buf.frameLength) * MemoryLayout<Float>.size)
-            }
-            player.scheduleBuffer(buf, at: nil, options: .interrupts, completionHandler: nil)
-        }
-    }
-
-    private(set) var sttActive = false
-
-    // partial cadence monitor
-    private var emaPartialGap: Double = 0 // exponential moving average of time between partials
-    private let emaAlpha: Double = 0.3
-
-    // MARK: - Event names (unchanged)
-    public static let supportedEvents: [String] = [
-        "onSpeechResults",
-        "onSpeechStart",
-        "onSpeechPartialResults",
-        "onSpeechError",
-        "onSpeechEnd",
-        "onSpeechRecognized",
-        "onSpeechVolumeChanged"
-    ]
-
-    // MARK: - Public API (native replacements for the former RCT methods)
-
-    public func isSpeechAvailable(_ completion: @escaping (Bool) -> Void) {
-        SFSpeechRecognizer.requestAuthorization { status in
-            switch status {
-            case .authorized: completion(true)
-            default: completion(false)
-            }
-        }
-    }
-
-    public func isRecognizing() -> Bool {
-        guard let task = recognitionTask else { return false }
-        return task.state == .running
-    }
-
-    private func rebindEngineConfigObserver(to newEngine: AVAudioEngine?) {
-        let nc = NotificationCenter.default
-        if let old = observedEngineForConfigChange {
-            nc.removeObserver(self,
-                              name: .AVAudioEngineConfigurationChange,
-                              object: old)
-        }
-        observedEngineForConfigChange = newEngine
-        if let e = newEngine {
-            nc.addObserver(self,
-                           selector: #selector(handleEngineConfigChange(_:)),
-                           name: .AVAudioEngineConfigurationChange,
-                           object: e)
-        }
-    }
-
-    private func ensurePlaybackNode(in engine: AVAudioEngine) -> AVAudioPlayerNode {
-        // If we have a node but it's tied to a different engine or got disconnected, recreate it.
-        if let p = playbackNode, p.engine === engine {
-            return p
-        }
-        let p = AVAudioPlayerNode()
-        playbackNode = p
-        engine.attach(p)
-        // Connect with nil format so the mixer does SRC if needed
-        engine.connect(p, to: engine.mainMixerNode, format: nil)
-        return p
-    }
-
-    private func startWatchdog() {
-        stallWatchdog?.invalidate()
-        stallWatchdog = Timer.scheduledTimer(withTimeInterval: 2.0, repeats: true) { [weak self] _ in
-            self?.checkTaskHealth()
-        }
-        RunLoop.main.add(stallWatchdog!, forMode: .common)
-    }
-
-    private func stopWatchdog() {
-        stallWatchdog?.invalidate()
-        stallWatchdog = nil
-    }
-
-    private func rearmTask(reason: String) {
-        // Cancel old task only — keep the engine and tap running.
-        recognitionTask?.cancel()
-        recognitionTask = nil
-
-        seenRealSpeech = false
-        lastTaskStartAt = CACurrentMediaTime()
-        startTask(makeFreshRequest())
-        NSLog("[STT] rearmTask(\(reason)) -> new task started")
-    }
-
-    private func checkTaskHealth() {
-        guard let engine = audioEngine else { return }
-        let now = CACurrentMediaTime()
-
-        // Engine down? Let your existing logic handle it; just bail.
-        if !engine.isRunning { return }
-
-        // If recognizer is globally unavailable, don’t thrash — wait until it flips back.
-        if let rec = speechRecognizer, rec.isAvailable == false {
-            NSLog("[STT] watchdog: recognizer unavailable; waiting…")
-            return
-        }
-
-        // No task at all? Spin one up.
-        if recognitionTask == nil {
-            if now - lastRearmAt > rearmCooldownTask {
-                NSLog("[STT] watchdog: no task -> start fresh request")
-                lastRearmAt = now
-                startTask(makeFreshRequest())
-            }
-            return
-        }
-
-        // If we’ve had buffers recently but no results for a while, assume the task is stuck.
-        let noResultsFor = now - lastResultAt
-        let hadRecentAudio = (now - lastBufferAt) < max(2.0, stallThreshold) // tap is alive
-
-        if hadRecentAudio && noResultsFor > stallThreshold {
-            if now - lastRearmAt > rearmCooldownTask {
-                consecutiveStallCount += 1
-                NSLog("[STT] watchdog: stall detected (no results for \(Int(noResultsFor))s, audio flowing). rearm #\(consecutiveStallCount)")
-
-                rearmTask(reason: "watchdog-stall")
-                lastRearmAt = now
-
-                // If we stall repeatedly, recreate the recognizer itself (server/session could be hosed)
-                if consecutiveStallCount >= 3 {
-                    recreateSpeechRecognizerPreservingLocale()
-                    consecutiveStallCount = 0
-                }
-            }
-        } else if hadRecentAudio {
-            // Healthy path: audio & results are flowing; reset stall counter
-            consecutiveStallCount = 0
-        }
-    }
-
-    public func startSpeech(localeStr: String?) {
-        NSLog("[STT] startSpeech(locale=\(localeStr ?? "nil"))")
-
-        if recognitionTask != nil {
-            sendResult(error: ["code": "already_started", "message": "Speech recognition already started!"],
-                       bestTranscription: nil, transcriptions: nil, isFinal: nil)
-            return
-        }
-
-        SFSpeechRecognizer.requestAuthorization { [weak self] status in
-            guard let self = self else { return }
-            switch status {
-            case .notDetermined:
-                self.sendResult(error: ["message": "Speech recognition not yet authorized"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
-            case .denied:
-                self.sendResult(error: ["message": "User denied access to speech recognition"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
-            case .restricted:
-                self.sendResult(error: ["message": "Speech recognition restricted on this device"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
-            case .authorized:
-                self.setupAndStartRecognizing(localeStr: localeStr)
-            @unknown default:
-                self.sendResult(error: ["message": "Unknown authorization status"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
-            }
-        }
-    }
-
-    public func stopSpeech(_ completion: ((Bool) -> Void)? = nil) {
-        NSLog("[STT] stopSpeech() requested by app")
-        recognitionTask?.finish()
-        completion?(false)
-    }
-
-    public func cancelSpeech(_ completion: ((Bool) -> Void)? = nil) {
-        NSLog("[STT] cancelSpeech() requested by app")
-
-        recognitionTask?.cancel()
-        completion?(false)
-    }
-
-    public func destroySpeech(_ completion: ((Bool) -> Void)? = nil) {
-        NSLog("[STT] **** destroySpeech!!!")
-        teardown()
-        completion?(false)
-    }
-
-    private func updateSessionRouting(selectBestInput: Bool = true) {
-        let s = AVAudioSession.sharedInstance()
-
-        // fast checks & logs can run on main
-        let inputs = s.currentRoute.inputs
-        guard !inputs.isEmpty else {
-            NSLog("[STT] ⚠️ No capture route (likely A2DP). Deferring engine start.")
-            return
-        }
-
-        DispatchQueue.global(qos: .userInitiated).async { [weak self] in
-            guard let self = self else { return }
-            do { try s.setActive(false, options: [.notifyOthersOnDeactivation]) }
-            catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
-
-            let hasWiredOrCar = s.currentRoute.outputs.contains {
-                $0.portType == .headphones || $0.portType == .carAudio || $0.portType == .usbAudio
-            }
-            if selectBestInput, let all = s.availableInputs {
-                let btHFP = all.first { $0.portType == .bluetoothHFP }
-                let wired = all.first { $0.portType == .headsetMic }
-                let built = all.first { $0.portType == .builtInMic }
-                let best = btHFP ?? wired ?? built
-                do {
-                    if s.preferredInput?.uid != best?.uid { try s.setPreferredInput(best) }
-                    if let builtIn = best, builtIn.portType == .builtInMic,
-                       let ds = builtIn.dataSources?.first(where: { $0.orientation == .bottom || $0.orientation == .back }) {
-                        try? builtIn.setPreferredDataSource(ds)
-                    }
-                } catch {
-                    NSLog("[STT] setPreferredInput failed: \(error.localizedDescription)")
-                }
-            }
-
-            var opts: AVAudioSession.CategoryOptions = [.allowBluetooth]
-            if !hasWiredOrCar { opts.insert(.defaultToSpeaker) }
-
-            if s.category != .playAndRecord || s.mode != .voiceChat || s.categoryOptions != opts {
-                do { try s.setCategory(.playAndRecord, mode: .voiceChat, options: opts) }
-                catch { NSLog("[STT] setCategory failed: \(error.localizedDescription)") }
-            }
-
-            do { try s.setActive(true, options: []) }
-            catch { NSLog("[STT] setActive failed: \(error.localizedDescription)") }
-
-            // Optional: force 16k after activation
-            self.force16kIfPossible(s)
-
-            // Log route back on main so logs stay ordered
-            DispatchQueue.main.async {
-                let inPorts = s.currentRoute.inputs.map { "\($0.portType.rawValue):\($0.portName)" }.joined(separator: ", ")
-                let outPorts = s.currentRoute.outputs.map { "\($0.portType.rawValue):\($0.portName)" }.joined(separator: ", ")
-                NSLog("[STT] route in=[\(inPorts)] out=[\(outPorts)]")
-            }
-        }
-    }
-
-    // ↓↓↓ preferred settings helper
-    private func force16kIfPossible(_ session: AVAudioSession) {
-        try? session.setPreferredSampleRate(16_000)
-        if session.isInputAvailable { try? session.setPreferredInputNumberOfChannels(1) }
-        try? session.setPreferredOutputNumberOfChannels(1)
-        try? session.setPreferredIOBufferDuration(0.02) // ~20 ms frames
-    }
-
-    // MARK: - Core logic (kept intact, including AEC order/steps)
-
-    /// Returns true if no errors occurred (identical flow & calls as ObjC) + keep-alive opts.
-    private func setupAudioSession() -> Bool {
-        var err: NSError?
-        let session = AVAudioSession.sharedInstance()
-        self.audioSession = session
-
-        do { try session.setActive(false, options: [.notifyOthersOnDeactivation]) }
-        catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
-
-        // Build options to match our routing rules
-        // (defaultToSpeaker only when no external output is active)
-        let hasExternalOutput: Bool = session.currentRoute.outputs.contains {
-            switch $0.portType {
-            case .headphones, .bluetoothA2DP, .bluetoothHFP, .bluetoothLE, .airPlay, .carAudio, .usbAudio:
-                return true
-            default:
-                return false
-            }
-        }
-
-        var opts: AVAudioSession.CategoryOptions = [.allowBluetooth]
-        if !hasExternalOutput { opts.insert(.defaultToSpeaker) }
-        if #available(iOS 14.5, *) {
-            // Prevent muted switch / mic mute from killing our capture pipeline
-            opts.insert(.overrideMutedMicrophoneInterruption)
-        }
-
-        do {
-            try session.setCategory(.playAndRecord, mode: .voiceChat, options: opts)
-        } catch { err = error as NSError }
-
-        do { try session.setActive(false, options: [.notifyOthersOnDeactivation]) }
-        catch { NSLog("[STT] setActive false failed: \(error.localizedDescription)") }
-
-        // Force 16k before and after activation (some routes settle only after setActive)
-        force16kIfPossible(session)
-        do { try session.setActive(true) } catch { err = error as NSError }
-        NSLog("[STT] session SR=%.1f inCh=%d outCh=%d (wanted 16000)",
-              session.sampleRate,
-              Int(session.inputNumberOfChannels),
-              Int(session.outputNumberOfChannels))
-        force16kIfPossible(session)
-
-        if let e = err {
-            NSLog("[STT] setupAudioSession error: \(e.localizedDescription)")
-            sendResult(error: ["code": "audio", "message": e.localizedDescription],
-                       bestTranscription: nil, transcriptions: nil, isFinal: nil)
-            return false
-        }
-        return true
-    }
-
-    private func currentInputFormat(_ engine: AVAudioEngine) -> AVAudioFormat? {
-        // Prefer whatever CoreAudio currently provides; avoid cached formats.
-        let fmt = engine.inputNode.outputFormat(forBus: 0)
-        if fmt.sampleRate > 0 && fmt.channelCount > 0 { return fmt }
-        // Fallback: build a sane mono format from session if ever needed.
-        let sr = max(8000, AVAudioSession.sharedInstance().sampleRate)
-        return AVAudioFormat(commonFormat: .pcmFormatFloat32,
-                             sampleRate: sr,
-                             channels: 1,
-                             interleaved: false)
-    }
-
-    private func isHeadsetPluggedIn() -> Bool {
-        let route = AVAudioSession.sharedInstance().currentRoute
-        for out in route.outputs {
-            if out.portType == .headphones || out.portType == .bluetoothA2DP {
-                return true
-            }
-        }
-        return false
-    }
-
-    private func isHeadSetBluetooth() -> Bool {
-        for port in AVAudioSession.sharedInstance().availableInputs ?? [] {
-            if port.portType == .bluetoothHFP { return true }
-        }
-        return false
-    }
-
-    private func loadContextualStrings() -> [String] {
-        guard let filePath = Bundle.main.path(forResource: "words_flattened", ofType: "txt") else {
-            NSLog("words_flattened.txt not found in bundle")
-            return []
-        }
-        do {
-            let contents = try String(contentsOfFile: filePath, encoding: .utf8)
-            let rawItems = contents.components(separatedBy: ",")
-            var cleaned: [String] = []
-            cleaned.reserveCapacity(rawItems.count)
-            for item in rawItems {
-                var t = item.trimmingCharacters(in: .whitespacesAndNewlines)
-                t = t.replacingOccurrences(of: "\"", with: "")
-                if !t.isEmpty { cleaned.append(t) }
-            }
-            return cleaned
-        } catch {
-            NSLog("Error reading contextualStrings: \(error)")
-            return []
-        }
-    }
-
-    // Add helpers
-    private func makeFreshRequest() -> SFSpeechAudioBufferRecognitionRequest {
-        let req = SFSpeechAudioBufferRecognitionRequest()
-        if #available(iOS 16, *) { req.addsPunctuation = true }
-        req.shouldReportPartialResults = true
-        //if #available(iOS 13.0, *) { req.taskHint = .dictation }
-        req.contextualStrings = loadContextualStrings()
-        self.recognitionRequest = req
-        NSLog("makeFreshRequest()")
-        return req
-    }
-
-    private func startTask(_ req: SFSpeechAudioBufferRecognitionRequest) {
-        NSLog("starting recognitionTask")
-        lastTaskStartAt = CACurrentMediaTime()
-        lastResultAt = lastTaskStartAt
-        let taskSessionId = self.sessionId
-        self.recognitionTask = self.speechRecognizer?.recognitionTask(with: req) { [weak self] result, error in
-            guard let self = self else { return }
-            if taskSessionId != self.sessionId { NSLog("task session mismatch -> ignore"); return }
-            self.lastResultAt = CACurrentMediaTime()
-
-            func markIfReal(_ r: SFSpeechRecognitionResult?) {
-                guard let r = r else { return }
-                let best = r.bestTranscription.formattedString.trimmingCharacters(in: .whitespacesAndNewlines)
-                if !best.isEmpty ||
-                   r.transcriptions.contains(where: { !$0.formattedString.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }) {
-                    if !self.seenRealSpeech {
-                        self.seenRealSpeech = true
-                        NSLog("first real speech detected -> onSpeechStart to JS")
-                        self.sendEvent(name: "onSpeechStart", body: nil)
-                    }
-                }
-            }
-            markIfReal(result)
-
-            func rearm(_ why: String, delay: TimeInterval = 0.05) {
-                guard self.continuous else { return }
-                NSLog("REARM (\(why))")
-                self.recognitionTask?.cancel()
-                self.recognitionTask = nil
-                DispatchQueue.main.asyncAfter(deadline: .now() + delay) {
-                    self.startTask(self.makeFreshRequest())
-                }
-            }
-
-            if let error = error {
-                NSLog("task error \(error._code): \(error.localizedDescription)")
-                // treat as transient for continuous mode
-                self.rearmTask(reason: "error")
-                return
-            }
-
-            guard let result = result else {
-                NSLog("task nil result")
-                self.rearmTask(reason: "nil-result")
-                return
-            }
-
-            let isFinal = result.isFinal
-            let parts = result.transcriptions.map { $0.formattedString }
-            self.sendResult(error: nil,
-                            bestTranscription: result.bestTranscription.formattedString,
-                            transcriptions: parts,
-                            isFinal: isFinal)
-
-            if isFinal {
-                NSLog("task final -> onSpeechEnd")
-                self.sendEvent(name: "onSpeechEnd", body: nil)
-                if self.continuous {
-                    self.rearmTask(reason: "final")
-                } else {
-                    NSLog("non-continuous final -> teardown")
-                    self.teardown()
-                }
-            }
-        }
-    }
-
-    public func teardown() {
-        NSLog("[STT] teardown() begin")
-        setGraphState(.cold, why: "teardown")
-        isTearingDown = true
-        stopWatchdog()
-        consecutiveStallCount = 0
-
-        if let task = recognitionTask {
-            task.cancel()
-            recognitionTask = nil
-        }
-        AudioPlaybackHook.engineScheduleFile = nil
-        AudioPlaybackHook.isEngineReady = nil
-        AudioPlaybackHook.useOnlyEnginePlayback = nil
-        AudioPlaybackHook.stopEnginePlayback = nil // ← NEW
-        sttActive = false
-
-        if let p = playbackNode {
-            p.stop()
-        }
-        playbackNode = nil
-
-        if let req = recognitionRequest {
-            req.endAudio()
-            recognitionRequest = nil
-        }
-
-        if let engine = audioEngine {
-            if engine.inputNode != nil {
-                engine.inputNode.removeTap(onBus: 0)
-                engine.inputNode.reset()
-            }
-            if engine.isRunning {
-                engine.stop()
-            }
-            engine.reset()
-            rebindEngineConfigObserver(to: nil)
-            audioEngine = nil // Crucial step!
-        }
-
-        resetAudioSession()
-
-        sessionId = nil
-        isTearingDown = false
-    }
-
-    private func resetAudioSession() {
-        if audioSession == nil {
-            audioSession = AVAudioSession.sharedInstance()
-        }
-        guard let session = audioSession else { return }
-
-        // Preserve & compare category exactly as original logic
-        let current = session.category
-        if priorAudioCategory == current { return }
-
-        // (kept commented as in your code)
-        // do {
-        //     try session.setCategory(priorAudioCategory ?? .soloAmbient,
-        //                             mode: .default,
-        //                             options: [.allowBluetooth,
-        //                                       .defaultToSpeaker,
-        //                                       .allowAirPlay,
-        //                                       .mixWithOthers])
-        // } catch { }
-        audioSession = nil
-    }
-
-    // LATEST assertAEC
-    private func assertAEC(_ engine: AVAudioEngine) {
-        do { try engine.inputNode.setVoiceProcessingEnabled(true) }
-        catch { NSLog("[STT] assertAEC: setVoiceProcessingEnabled(true) failed: \(error)") }
-    }
-
-    private func isPlayerConnected(_ player: AVAudioPlayerNode?, to engine: AVAudioEngine?) -> Bool {
-        guard let p = player, let e = engine else { return false }
-        // If the node is attached and has a non-zero channel count on its output, it’s effectively connected.
-        let fmt = p.outputFormat(forBus: 0)
-        return (p.engine === e) && (fmt.channelCount > 0) && (fmt.sampleRate > 0)
-    }
-
-    /// Try to keep the capture alive without tearing down recognition.
-    /// 1) If engine exists but not running → try start()
-    /// 2) If start fails or graph became invalid → rebuild graph and start
-    /// 3) If we don’t have a task yet, start one.
-    private func ensureEngineRunning(reason: String) {
-        let now = CFAbsoluteTimeGetCurrent()
-        if (now - lastReclaimAttempt) < reclaimCooldown {
-            NSLog("[STT] ensureEngineRunning(\(reason)) skipped (cooldown)")
-            return
-        }
-        lastReclaimAttempt = now
-
-        if let e = audioEngine, !e.isRunning {
-            assertAEC(e)
-            do {
-                playbackNode?.stop()
-                playbackNode = nil
-                try e.start()
-                NSLog("🔄 AVAudioEngine restarted after config change. isRunning=\(e.isRunning)")
-            } catch {
-                NSLog("❌ Could not re-start after config change: \(error)")
-            }
-        }
-
-        // --- full recovery path (this was previously dead code) ---
-        guard let engine = audioEngine else {
-            NSLog("[STT] ensureEngineRunning(\(reason)): no engine → rebuild")
-            rebuildEngineGraphAndRestart(reason: reason)
-            return
-        }
-
-        assertAEC(engine)
-
-        if !engine.isRunning {
-            setGraphState(.starting, why: "ensureEngineRunning(\(reason))")
-            do {
-                try engine.start()
-                setGraphState(.hot, why: "engine.start() ok (ensureEngineRunning)")
-                self.tryFlushPendingTTS() // ← ADD THIS LINE
-                NSLog("[STT] ensureEngineRunning(\(reason)): engine.start() -> \(engine.isRunning)")
-            } catch {
-                setGraphState(.unstable, why: "engine.start() failed (ensureEngineRunning)")
-                NSLog("[STT] ensureEngineRunning(\(reason)): engine.start() failed: \(error) → rebuild")
-                rebuildEngineGraphAndRestart(reason: reason)
-                return
-            }
-        }
-
-        if recognitionTask == nil {
-            if let req = recognitionRequest {
-                NSLog("[STT] ensureEngineRunning(\(reason)): no task -> startTask(existing req)")
-                startTask(req)
-            } else {
-                NSLog("[STT] ensureEngineRunning(\(reason)): no req -> makeFreshRequest + startTask")
-                startTask(makeFreshRequest())
-            }
-        }
-    }
-
-    /// Rebuilds AVAudioEngine graph (mic→mute mixer, player→mainMixer), reinstalls tap,
-    /// and restarts the engine. Does NOT nuke the current recognitionRequest/task unless required.
-    private func rebuildEngineGraphAndRestart(reason: String) {
-        NSLog("[STT] 🔄 rebuildEngineGraphAndRestart (\(reason))")
-
-        // Keep current request if present; we'll keep appending into it
-        let existingReq = self.recognitionRequest
-
-        // Tear down engine ONLY (keep session, request)
-        if let engine = audioEngine {
-            if engine.inputNode != nil {
-                engine.inputNode.removeTap(onBus: 0)
-                engine.inputNode.reset()
-            }
-            if engine.isRunning { engine.stop() }
-            engine.reset()
-        }
-
-        // Recreate engine and graph
-        let newEngine = AVAudioEngine()
-        self.audioEngine = newEngine
-
-        let inputNode = newEngine.inputNode
-        do {
-            try inputNode.setVoiceProcessingEnabled(true)
-        } catch {
-            NSLog("[STT] rebuild: failed to enable voice processing: \(error)")
-        }
-        if #available(iOS 17.0, *) {
-            var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
-            duck.enableAdvancedDucking = false
-            duck.duckingLevel = .min
-            inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
-        }
-
-        // Live format (may be 0 Hz briefly during route churn)
-        let liveFmt = newEngine.inputNode.outputFormat(forBus: 0)
-        guard liveFmt.sampleRate > 0, liveFmt.channelCount > 0 else {
-            NSLog("[STT] rebuild: input format invalid (0 Hz) — retry shortly")
-            DispatchQueue.main.asyncAfter(deadline: .now() + 0.05) { [weak self] in
-                self?.ensureEngineRunning(reason: "wait-valid-input-format(rebuild)")
-            }
-            return
-        }
-
-        // mic → mute mixer → mainMixer
-        let micMixer = AVAudioMixerNode()
-        newEngine.attach(micMixer)
-        // Use nil to let engine pick a valid format (avoids 0 Hz assertion)
-        newEngine.connect(inputNode, to: micMixer, format: nil)
-        newEngine.connect(micMixer, to: newEngine.mainMixerNode, format: nil)
-        micMixer.outputVolume = 0.0
-
-        // TTS player → mainMixer (keep same player if possible, else recreate)
-        if playbackNode == nil { playbackNode = AVAudioPlayerNode() }
-        if let player = playbackNode {
-            if player.engine == nil { newEngine.attach(player) }
-            newEngine.connect(player, to: newEngine.mainMixerNode, format: nil)
-        }
-
-        do {
-            try? inputNode.removeTap(onBus: 0)
-        } catch {
-            NSLog("[STT] removeTap error: \(error)")
-        }
-
-        let targetFmt = AVAudioFormat(commonFormat: .pcmFormatFloat32,
-                                      sampleRate: 16_000,
-                                      channels: 1,
-                                      interleaved: false)!
-
-        // Tap with nil so it follows route changes automatically
-        inputNode.installTap(onBus: 0, bufferSize: 1024, format: nil) { [weak self] buffer, _ in
-            guard let self = self else { return }
-
-            // (same level metering as your current code)
-            let frames: vDSP_Length = vDSP_Length(buffer.frameLength)
-            let LP: Float = 0.5
-
-            if buffer.format.channelCount > 0, let ch0 = buffer.floatChannelData?[0] {
-                var peak0: Float = 0
-                vDSP_maxmgv(ch0, 1, &peak0, frames)
-                let db0: Float = (peak0 == 0) ? -100 : 20.0 * log10f(peak0)
-                let sm0 = LP * db0 + (1 - LP) * self.averagePowerForChannel0
-                self.averagePowerForChannel0 = sm0
-                self.averagePowerForChannel1 = sm0
-            }
-            if buffer.format.channelCount > 1, let ch1 = buffer.floatChannelData?[1] {
-                var peak1: Float = 0
-                vDSP_maxmgv(ch1, 1, &peak1, frames)
-                let db1: Float = (peak1 == 0) ? -100 : 20.0 * log10f(peak1)
-                let sm1 = LP * db1 + (1 - LP) * self.averagePowerForChannel1
-                self.averagePowerForChannel1 = sm1
-            }
-            self.averagePowerForChannel1 = Float(self._normalizedPowerLevelFromDecibels(CGFloat(self.averagePowerForChannel1)) * 10.0)
-            self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": self.averagePowerForChannel1])
-
-            // ---- Convert to 16 kHz MONO for STT request
-            let inFmt = buffer.format
-            if inFmt.sampleRate != 16_000 || inFmt.channelCount != 1 {
-                if let conv = AVAudioConverter(from: inFmt, to: targetFmt) {
-                    let ratio = targetFmt.sampleRate / inFmt.sampleRate
-                    let outCap = AVAudioFrameCount(Double(buffer.frameLength) * ratio) + 8
-                    if let outBuf = AVAudioPCMBuffer(pcmFormat: targetFmt, frameCapacity: outCap) {
-                        var err: NSError? = nil
-                        var fed = false
-                        conv.convert(to: outBuf, error: &err) { _, outStatus -> AVAudioBuffer? in
-                            if fed {
-                                outStatus.pointee = .endOfStream
-                                return nil
-                            } else {
-                                fed = true
-                                outStatus.pointee = .haveData
-                                return buffer
-                            }
-                        }
-                        if err == nil {
-                            self.recognitionRequest?.append(outBuf)
-                        } else {
-                            self.recognitionRequest?.append(buffer) // fallback
-                        }
-                    } else {
-                        self.recognitionRequest?.append(buffer)
-                    }
-                } else {
-                    self.recognitionRequest?.append(buffer)
-                }
-            } else {
-                self.recognitionRequest?.append(buffer)
-            }
-            self.lastBufferAt = CACurrentMediaTime()
-        }
-
-        newEngine.prepare()
-        setGraphState(.starting, why: "pre start in rebuild")
-        do {
-            try newEngine.start()
-            setGraphState(.hot, why: "engine.start() ok (rebuild)")
-            self.tryFlushPendingTTS() // ← ADD THIS LINE
-            let f = newEngine.inputNode.outputFormat(forBus: 0)
-            NSLog("[STT] rebuild: engine.start() ok, running=\(newEngine.isRunning) (fmt=%.1f Hz / %d ch)",
-                  f.sampleRate, Int(f.channelCount))
-        } catch {
-            setGraphState(.unstable, why: "engine.start() failed (rebuild)")
-            NSLog("[STT] rebuild: engine.start() failed: \(error)")
-        }
-
-        // If we lost the request during rebuild, recreate + start task.
-        if self.recognitionRequest == nil {
-            if let old = existingReq {
-                self.recognitionRequest = old
-            } else {
-                self.recognitionRequest = makeFreshRequest()
-            }
-        }
-        if self.recognitionTask == nil {
-            startTask(self.recognitionRequest!)
-        }
-        rebindEngineConfigObserver(to: newEngine)
-    }
-
-    @objc private func handleEngineConfigChange(_ note: Notification) {
-        NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange")
-        if playbackNode?.isPlaying == true { playbackNode?.stop() }
-        markUnstableThenRecheck(why: "AVAudioEngineConfigurationChange")
-
-        // If engine stopped, drop the player node (it will be lazily recreated)
-        if let e = audioEngine, !e.isRunning {
-            playbackNode?.stop()
-            playbackNode = nil
-        }
-
-        // Re-assert a mic-capable route (HFP/wired/built-in)
-        updateSessionRouting(selectBestInput: true)
-
-        // Re-enable VoiceProcessingIO (AEC) and restart if needed
-        ensureEngineRunning(reason: "engine-config-change")
-        self.tryFlushPendingTTS() // ← ADD THIS LINE
-    }
-
-    @objc private func handleMediaServicesReset(_ note: Notification) {
-        NSLog("[STT] 📺 Media services were RESET: reclaiming mic & session")
-        // Re-apply audio session and try to rebuild graph if needed
-        _ = setupAudioSession()
-        ensureEngineRunning(reason: "media-services-reset")
-        self.tryFlushPendingTTS() // ← OPTIONAL ADD
-    }
-
-    @objc private func handleRouteChange(_ note: Notification) {
-        let info = note.userInfo ?? [:]
-        NSLog("[STT] 🔀 route change: \(info)")
-        if playbackNode?.isPlaying == true { playbackNode?.stop() }
-        markUnstableThenRecheck(why: "route-change")
-
-        let s = AVAudioSession.sharedInstance()
-
-        // 1) Re-apply a mic-safe category/mode and prefer HFP/built-in mic.
-        updateSessionRouting(selectBestInput: true)
-        if let inputs = s.availableInputs {
-            let preferred = inputs.first { $0.portType == .bluetoothHFP }
-                ?? inputs.first { $0.portType == .headsetMic }
-                ?? inputs.first { $0.portType == .builtInMic }
-            try? s.setPreferredInput(preferred)
-        }
-
-        // 2) If there’s still no input, don’t thrash; wait for a usable route.
-        let inputs = s.currentRoute.inputs
-        NSLog("[STT] 🎤 inputs after route fix: \(inputs.map { $0.portType.rawValue })")
-        guard !inputs.isEmpty else {
-            NSLog("[STT] ⚠️ No mic route available (likely A2DP/AirPlay). Not restarting engine.")
-            return
-        }
-
-        // 3) Now recover the engine/task
-        ensureEngineRunning(reason: "route-change")
-        self.tryFlushPendingTTS() // ← ADD THIS LINE
-    }
-
-    // Call once after engine is created
-    private func installEngineObservers() {
-        let nc = NotificationCenter.default
-
-        nc.addObserver(self,
-                       selector: #selector(handleSessionInterruption(_:)),
-                       name: AVAudioSession.interruptionNotification,
-                       object: AVAudioSession.sharedInstance())
-
-        nc.addObserver(self,
-                       selector: #selector(handleRouteChange(_:)),
-                       name: AVAudioSession.routeChangeNotification,
-                       object: AVAudioSession.sharedInstance())
-
-        nc.addObserver(self,
-                       selector: #selector(handleMediaServicesReset(_:)),
-                       name: AVAudioSession.mediaServicesWereResetNotification,
-                       object: nil)
-    }
-
-    @objc private func handleSessionInterruption(_ note: Notification) {
-        guard
-            let info = note.userInfo,
-            let typeVal = info[AVAudioSessionInterruptionTypeKey] as? UInt,
-            let type = AVAudioSession.InterruptionType(rawValue: typeVal)
-        else { return }
-
-        if type == .ended {
-            // On real “render err” Core Audio posts an interruption END
-            NSLog("Session interruption ended (possible render err):")
-        }
-    }
-
-    // Wait for one IO cycle so player won't throw "did not see an IO cycle"
-    private func awaitOneIOCycle(_ engine: AVAudioEngine,
-                                 timeout: TimeInterval = 0.7,
-                                 done: @escaping (Bool) -> Void) {
-        let mixer = engine.mainMixerNode
-        var fired = false
-        mixer.installTap(onBus: 0, bufferSize: 128, format: nil) { _, _ in
-            if !fired {
-                fired = true
-                mixer.removeTap(onBus: 0)
-                DispatchQueue.main.async { done(true) }
-            }
-        }
-        DispatchQueue.main.asyncAfter(deadline: .now() + timeout) {
-            if !fired {
-                mixer.removeTap(onBus: 0)
-                done(false)
-            }
-        }
-    }
-
-    private func setupAndStartRecognizing(localeStr: String?) {
-        NSLog("[STT] setupAndStartRecognizing begin")
-        sttActive = true
-
-        audioSession = AVAudioSession.sharedInstance()
-        guard let session = audioSession else { return }
-        var err: NSError?
-
-        priorAudioCategory = session.category
-
-        // Tear down resources before starting speech recognition..
-        NSLog("[STT] pre-teardown")
-        teardown()
-        // ** IMPORTANT ** Call this again as teardown marks this false
-        sttActive = true
-
-        sessionId = UUID().uuidString
-
-        let locale: Locale? = {
-            if let s = localeStr, !s.isEmpty { return Locale(identifier: s) }
-            sttActive = false
-            return nil
-        }()
-
-        if let loc = locale {
-            speechRecognizer = SFSpeechRecognizer(locale: loc)
-        } else {
-            speechRecognizer = SFSpeechRecognizer()
-        }
-        speechRecognizer?.delegate = self
-
-        // Start audio session...
-        NSLog("[STT] setupAudioSession()")
-        guard setupAudioSession() else {
-            NSLog("[STT] ERROR ERROR ******** setupAudioSession()")
-            teardown()
-            sttActive = false
-            return
-        }
-        installEngineObservers()
-
-        let request = SFSpeechAudioBufferRecognitionRequest()
-        recognitionRequest = request
-
-        if #available(iOS 16, *) {
-            request.addsPunctuation = true
-        }
-        request.shouldReportPartialResults = true
-        //if #available(iOS 13.0, *) { request.taskHint = .dictation }
-        request.contextualStrings = loadContextualStrings()
-
-        guard recognitionRequest != nil else {
-            sendResult(error: ["code": "recognition_init"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
-            teardown()
-            return
-        }
-
-        if audioEngine == nil {
-            audioEngine = AVAudioEngine()
-            rebindEngineConfigObserver(to: audioEngine)
-        }
-        do {
-            guard let engine = audioEngine else { throw NSError(domain: "voice.audio", code: -1) }
-            let inputNode = engine.inputNode
-            _ = inputNode // presence check
-
-            // Enable voice processing (AEC)
-            do {
-                try inputNode.setVoiceProcessingEnabled(true)
-            } catch {
-                NSLog("Failed to enable voice processing for AEC on input node: \(error)")
-            }
-
-            if #available(iOS 17.0, *) {
-                var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
-                duck.enableAdvancedDucking = false // disable advanced (VAD-based) ducking
-                duck.duckingLevel = .min           // “as loud as possible” for other audio
-                inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
-            }
-
-            NSLog("[STT] AEC enable done")
-
-            // Live format guard (can briefly be 0 Hz on route churn)
-            let liveFmt = engine.inputNode.outputFormat(forBus: 0)
-            guard liveFmt.sampleRate > 0, liveFmt.channelCount > 0 else {
-                NSLog("[STT] start: input format invalid (0 Hz) — retry shortly")
-                DispatchQueue.main.asyncAfter(deadline: .now() + 0.05) { [weak self] in
-                    self?.ensureEngineRunning(reason: "wait-valid-input-format(start)")
-                }
-                return
-            }
-
-            // 1) Mute only the mic path, not the whole main mixer
-            let micMixer = AVAudioMixerNode()
-            engine.attach(micMixer)
-            // Let engine choose format to avoid 0 Hz assertions
-            engine.connect(inputNode, to: micMixer, format: nil)
-            engine.connect(micMixer, to: engine.mainMixerNode, format: nil)
-            micMixer.outputVolume = 0.0 // ← you won't hear your own mic
-
-            // 2) Prepare a player node for TTS inside the SAME engine/graph
-            let player = AVAudioPlayerNode()
-            self.playbackNode = player
-            engine.attach(player)
-            engine.connect(player, to: engine.mainMixerNode, format: nil)
-
-            NSLog("[STT] graph connected (mic->mute mixer, player->mainMixer)")
-
-            var tapFrames: UInt64 = 0
-
-            do { try? inputNode.removeTap(onBus: 0) } catch {
-                NSLog("[STT] removeTap error: \(error)")
-            }
-
-            let targetFmt = AVAudioFormat(commonFormat: .pcmFormatFloat32,
-                                          sampleRate: 16_000,
-                                          channels: 1,
-                                          interleaved: false)!
-
-            // Tap with nil so it follows the node’s live format automatically
-            inputNode.installTap(onBus: 0, bufferSize: 1024, format: nil) { [weak self] buffer, _ in
-                // Strongify self once
-                guard let self = self else { return }
-                tapFrames &+= UInt64(buffer.frameLength)
-                if tapFrames % (44100 * 2) < 1024 { // ~every ~2s at 44.1k
-                    NSLog("[STT] tap alive, totalFrames=\(tapFrames)")
-                }
-
-                let frames: vDSP_Length = vDSP_Length(buffer.frameLength)
-                let LEVEL_LOWPASS_TRIG: Float = 0.5
-
-                // CH0
-                if buffer.format.channelCount > 0, let ch0 = buffer.floatChannelData?[0] {
-                    var peak0: Float = 0
-                    vDSP_maxmgv(ch0, 1, &peak0, frames)
-                    let db0: Float = (peak0 == 0) ? -100 : 20.0 * log10f(peak0)
-
-                    let smoothed0 = LEVEL_LOWPASS_TRIG * db0
-                        + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel0
-                    self.averagePowerForChannel0 = smoothed0
-                    self.averagePowerForChannel1 = smoothed0
-                }
-
-                // CH1
-                if buffer.format.channelCount > 1, let ch1 = buffer.floatChannelData?[1] {
-                    var peak1: Float = 0
-                    vDSP_maxmgv(ch1, 1, &peak1, frames)
-                    let db1: Float = (peak1 == 0) ? -100 : 20.0 * log10f(peak1)
-
-                    let smoothed1 = LEVEL_LOWPASS_TRIG * db1
-                        + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel1
-                    self.averagePowerForChannel1 = smoothed1
-                }
-
-                // Normalize 0–10 and emit
-                self.averagePowerForChannel1 = Float(self._normalizedPowerLevelFromDecibels(CGFloat(self.averagePowerForChannel1)) * 10.0)
-                let value = self.averagePowerForChannel1
-                self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": value])
-
-                // ---- Convert to 16 kHz MONO for STT request
-                let inFmt = buffer.format
-                if inFmt.sampleRate != 16_000 || inFmt.channelCount != 1 {
-                    if let conv = AVAudioConverter(from: inFmt, to: targetFmt) {
-                        // Conservative capacity +8 frames
-                        let ratio = targetFmt.sampleRate / inFmt.sampleRate
-                        let outCap = AVAudioFrameCount(Double(buffer.frameLength) * ratio) + 8
-                        if let outBuf = AVAudioPCMBuffer(pcmFormat: targetFmt, frameCapacity: outCap) {
-                            var err: NSError? = nil
-                            var fed = false
-                            conv.convert(to: outBuf, error: &err) { _, outStatus -> AVAudioBuffer? in
-                                if fed {
-                                    outStatus.pointee = .endOfStream
-                                    return nil
-                                } else {
-                                    fed = true
-                                    outStatus.pointee = .haveData
-                                    return buffer
-                                }
-                            }
-                            if err == nil {
-                                self.recognitionRequest?.append(outBuf)
-                            } else {
-                                self.recognitionRequest?.append(buffer) // fallback
-                            }
-                        } else {
-                            self.recognitionRequest?.append(buffer)
-                        }
-                    } else {
-                        self.recognitionRequest?.append(buffer)
-                    }
-                } else {
-                    self.recognitionRequest?.append(buffer)
-                }
-
-                self.lastBufferAt = CACurrentMediaTime()
-            }
-
-            engine.prepare()
-            NSLog("[STT] audioEngine prepare")
-            setGraphState(.starting, why: "pre start in setupAndStartRecognizing")
-            var audioSessionError: NSError?
-            do {
-                try engine.start()
-                setGraphState(.hot, why: "engine.start() ok (setupAndStartRecognizing)")
-                self.tryFlushPendingTTS() // ← ADD THIS LINE
-            } catch {
-                audioSessionError = error as NSError
-                setGraphState(.unstable, why: "engine.start() failed (setupAndStartRecognizing)")
-            }
-
-            // after engine.start() success:
-            engineHotAt = CACurrentMediaTime()
-            seenRealSpeech = false
-            let f = engine.inputNode.outputFormat(forBus: 0)
-            NSLog("engine HOT at \(engineHotAt) (fmt=%.1f Hz / %d ch)", f.sampleRate, Int(f.channelCount))
-            sendEvent(name: "onSpeechStart", body: nil) // engine hot signal
-            startTask(makeFreshRequest())
-
-            // Engine is up; expose readiness
-            AudioPlaybackHook.isEngineReady = { [weak self] in
-                guard let e = self?.audioEngine else { return false }
-                let fmt = e.mainMixerNode.outputFormat(forBus: 0)
-                return e.isRunning && fmt.sampleRate > 0 && fmt.channelCount > 0
-            }
-
-            AudioPlaybackHook.useOnlyEnginePlayback = { [weak self] in
-                guard let self = self, let e = self.audioEngine else { return false }
-                let fmt = e.mainMixerNode.outputFormat(forBus: 0)
-                return self.sttActive && e.isRunning && fmt.sampleRate > 0 && fmt.channelCount > 0
-            }
-
-            startWatchdog()
-
-            AudioPlaybackHook.engineScheduleFile = { [weak self] url, done in
-                // If STT is active we NEVER fallback; we queue until the engine is ready.
-                guard let self = self else { return true }
-
-                let scheduleOrQueue: () -> Void = {
-                    guard let engine = self.audioEngine else {
-                        // No engine yet — queue
-                        self.ttsSerial.async { self.pendingTTSSchedules.append((url, done)) }
-                        return
-                    }
-
-                    let mixFmt = engine.mainMixerNode.outputFormat(forBus: 0)
-                    let ready = engine.isRunning && mixFmt.sampleRate > 0 && mixFmt.channelCount > 0
-
-                    if ready {
-                        // Schedule immediately
-                        if self.playbackNode?.engine !== engine || !self.isPlayerConnected(self.playbackNode, to: engine) {
-                            self.playbackNode?.stop()
-                            self.playbackNode = nil
-                        }
-                        let player = self.ensurePlaybackNode(in: engine)
-                        self.primePlayer(player, engine: engine)
-                        if !player.isPlaying { player.play() }
-
-                        do {
-                            let file = try AVAudioFile(forReading: url)
-                            player.scheduleFile(file, at: nil) {
-                                DispatchQueue.main.async { done() }
-                            }
-                            NSLog("[STT] TTS: scheduled via AVAudioEngine: \(url.lastPathComponent)")
-                        } catch {
-                            NSLog("[STT] TTS schedule error: \(error) — queuing instead (no fallback).")
-                            self.ttsSerial.async { self.pendingTTSSchedules.append((url, done)) }
-                        }
-                    } else {
-                        // Not ready — queue and try to wake the engine
-                        self.ttsSerial.async { self.pendingTTSSchedules.append((url, done)) }
-
-                        // Kick the engine and wait for one IO cycle; then flush
-                        do { if !engine.isRunning { try engine.start() } } catch { }
-                        self.awaitOneIOCycle(engine, timeout: 0.7) { _ in
-                            self.tryFlushPendingTTS()
-                        }
-                    }
-                }
-
-                if Thread.isMainThread {
-                    scheduleOrQueue()
-                } else {
-                    DispatchQueue.main.async { scheduleOrQueue() }
-                }
-
-                // IMPORTANT: Always “true” while STT is active => no fallback path is taken.
-                return true
-            }
-
-            AudioPlaybackHook.stopEnginePlayback = { [weak self] in
-                DispatchQueue.main.async {
-                    guard let self = self else { return }
-                    // Stop only the TTS playback node; keep the engine running for STT
-                    self.playbackNode?.stop()
-                }
-            }
-
-            NSLog("audioEngine startAndReturnError")
-            if let audioSessionError = audioSessionError {
-                NSLog("audioEngine start error: \(audioSessionError.localizedDescription)")
-                self.sendResult(error: ["code": "audio", "message": audioSessionError.localizedDescription],
-                                bestTranscription: nil, transcriptions: nil, isFinal: nil)
-                return
-            }
-            NSLog("After Start recording and append recording")
-            DispatchQueue.main.asyncAfter(deadline: .now() + 3.0) { [weak self] in
-                guard let self = self else { return }
-                let running = self.audioEngine?.isRunning ?? false
-                let taskState = self.recognitionTask?.state.rawValue ?? -1
-                NSLog("[STT] health: engineRunning=\(running) taskState=\(taskState)")
-            }
-
-            NSLog("After if audioSessionError != nil")
-        } catch let e as NSError {
-            sendResult(error: ["code": "start_recording", "message": e.localizedDescription],
-                       bestTranscription: nil, transcriptions: nil, isFinal: nil)
-            NSLog("End of init...")
-            return
-        }
-    }
-
-    // MARK: - Helpers
-
-    private func _normalizedPowerLevelFromDecibels(_ decibels: CGFloat) -> CGFloat {
-        if decibels < -80.0 || decibels == 0.0 { return 0.0 }
-        let minDb: Float = -80.0
-        let pow10_min = powf(10.0, 0.05 * minDb)
-        let pow10_db = powf(10.0, 0.05 * Float(decibels))
-        let power = powf((pow10_db - pow10_min) * (1.0 / (1.0 - pow10_min)), 1.0 / 2.0)
-        if power < 1.0 { return CGFloat(power) } else { return 1.0 }
-    }
-
-    private func sendEvent(name: String, body: [String: Any]?) {
-        delegate?.stt(self, didEmitEvent: name, body: body)
-    }
-
-    /// Exact event behavior preserved from ObjC `sendResult`.
-    private func sendResult(error: [String: Any]?,
-                            bestTranscription: String?,
-                            transcriptions: [String]?,
-                            isFinal: Bool?) {
-        if let error = error {
-            sendEvent(name: "onSpeechError", body: ["error": error])
-        }
-        if let best = bestTranscription {
-            sendEvent(name: "onSpeechResults", body: ["value": [best]])
-        }
-        if let trans = transcriptions {
-            sendEvent(name: "onSpeechPartialResults", body: ["value": trans])
-        }
-        if let isFinal = isFinal {
-            sendEvent(name: "onSpeechRecognized", body: ["isFinal": isFinal])
-        }
-    }
-
-    // MARK: - SFSpeechRecognizerDelegate
-
-    public func speechRecognizer(_ speechRecognizer: SFSpeechRecognizer, availabilityDidChange available: Bool) {
-        if available == false {
-            sendResult(error: ["message": "Speech recognition is not available now"],
-                       bestTranscription: nil, transcriptions: nil, isFinal: nil)
-        }
-    }
-
-    // MARK: - Small helper to recreate recognizer (used by watchdog)
-    private func recreateSpeechRecognizerPreservingLocale() {
-        let loc = speechRecognizer?.locale
-        speechRecognizer = loc != nil ? SFSpeechRecognizer(locale: loc!) : SFSpeechRecognizer()
-        speechRecognizer?.delegate = self
-        NSLog("[STT] recreated SFSpeechRecognizer (locale preserved: \(loc?.identifier ?? "default"))")
-    }
-}
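
The removal hunk above matches the DaVoiceSTT.swift.latest backup entries listed earlier (1,359 lines each). For orientation only: the class it contained exposes a delegate-based surface — STT, STTDelegate, startSpeech(localeStr:)/stopSpeech, and the onSpeech* event names in STT.supportedEvents. A minimal usage sketch against that surface might look like the following; the SpeechHost name and the logging body are illustrative assumptions, not part of the package.

import Foundation

// Hypothetical host object; only the STT/STTDelegate API from the removed file is assumed.
final class SpeechHost: NSObject, STTDelegate {
    private let stt = STT()

    func begin() {
        stt.delegate = self
        stt.continuous = true
        stt.isSpeechAvailable { available in
            guard available else { return }
            self.stt.startSpeech(localeStr: "en-US")
        }
    }

    func end() {
        stt.stopSpeech { _ in }
    }

    // Event names mirror STT.supportedEvents ("onSpeechResults", "onSpeechPartialResults", ...).
    func stt(_ stt: STT, didEmitEvent name: String, body: [String: Any]?) {
        NSLog("STT event %@: %@", name, String(describing: body))
    }
}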