react-native-davoice-tts 1.0.218 → 1.0.220

This diff shows the content of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in the public registry.
Files changed (35)
  1. package/TTSRNBridge.podspec +1 -1
  2. package/ios/SpeechBridge/SpeechBridge.m +153 -0
  3. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DavoiceTTS +0 -0
  4. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Headers/DavoiceTTS-Swift.h +7 -0
  5. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.abi.json +1324 -1238
  6. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.private.swiftinterface +14 -12
  7. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.swiftdoc +0 -0
  8. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios.swiftinterface +14 -12
  9. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DavoiceTTS +0 -0
  10. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Headers/DavoiceTTS-Swift.h +14 -0
  11. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.abi.json +3405 -3319
  12. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.private.swiftinterface +34 -32
  13. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.swiftdoc +0 -0
  14. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/arm64-apple-ios-simulator.swiftinterface +34 -32
  15. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.abi.json +3405 -3319
  16. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.private.swiftinterface +34 -32
  17. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.swiftdoc +0 -0
  18. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/Modules/DavoiceTTS.swiftmodule/x86_64-apple-ios-simulator.swiftinterface +34 -32
  19. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeDirectory +0 -0
  20. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeRequirements-1 +0 -0
  21. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/_CodeSignature/CodeResources +33 -108
  22. package/package.json +1 -1
  23. package/speech/index.ts +106 -0
  24. package/android/src/main/java/com/davoice/tts/rn/DaVoiceTTSPackage.java_old_using_new_for_both_stt_and_tts +0 -26
  25. package/ios/STTRNBridge/STTBridge.m_wtf +0 -109
  26. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT copy.swift____ +0 -1202
  27. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift.bkup +0 -1000
  28. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift.latest +0 -1359
  29. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift1.swift__ +0 -1134
  30. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64/DavoiceTTS.framework/DaVoiceSTT.swift__ +0 -1329
  31. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT copy.swift____ +0 -1202
  32. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift.bkup +0 -1000
  33. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift.latest +0 -1359
  34. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift1.swift__ +0 -1134
  35. package/ios/TTSRNBridge/DavoiceTTS.xcframework/ios-arm64_x86_64-simulator/DavoiceTTS.framework/DaVoiceSTT.swift__ +0 -1329
@@ -1,1202 +0,0 @@
1
- // STT.swift
2
- // Native iOS Swift version (AEC flow preserved 1:1)
3
-
4
- import Foundation
5
- import UIKit
6
- import Speech
7
- import Accelerate
8
- import AVFAudio // or import AVFoundation
9
-
10
- @objc public protocol STTDelegate: AnyObject {
11
- @objc func stt(_ stt: STT, didEmitEvent name: String, body: [String: Any]?)
12
- }
13
-
14
- @objcMembers
15
- public final class STT: NSObject, SFSpeechRecognizerDelegate {
16
- public weak var delegate: STTDelegate?
17
- public var continuous: Bool = true
18
-
19
- // MARK: - Private
20
- private var speechRecognizer: SFSpeechRecognizer?
21
- private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
22
- private var audioEngine: AVAudioEngine?
23
- private var recognitionTask: SFSpeechRecognitionTask?
24
- private var audioSession: AVAudioSession?
25
- private var isTearingDown: Bool = false
26
- private var sessionId: String?
27
- private var priorAudioCategory: AVAudioSession.Category?
28
- private var averagePowerForChannel0: Float = 0
29
- private var averagePowerForChannel1: Float = 0
30
-
31
- private var playbackNode: AVAudioPlayerNode?
32
- private var seenRealSpeech = false // flips true after first non-blank token
33
- private var engineHotAt: CFTimeInterval = 0 // when engine actually started
34
- private let warmupKeepAlive: CFTimeInterval = 4.0 // seconds we’ll keep re-arming in silence
35
-
36
- // Keep-engine-alive helpers
37
- private var lastReclaimAttempt: CFAbsoluteTime = 0
38
- private let reclaimCooldown: CFTimeInterval = 1.0
39
-
40
- // --- Task health ---
41
- private var lastBufferAt: CFTimeInterval = 0 // updated from tap
42
- private var lastResultAt: CFTimeInterval = 0 // updated from recognition callback
43
- private var lastTaskStartAt: CFTimeInterval = 0
44
- private var stallWatchdog: Timer?
45
- private var consecutiveStallCount = 0
46
- private let stallThreshold: CFTimeInterval = 8.0 // seconds w/o results while engine is hot
47
- private let rearmCooldownTask: CFTimeInterval = 2.0
48
- private var lastRearmAt: CFTimeInterval = 0
49
- private var engineHot = false
50
- private var hotAt: CFTimeInterval = 0
51
-
52
- private var observedEngineForConfigChange: AVAudioEngine?
53
-
54
- // --- Recovery & diagnostics ---
55
- private var recoverySeq = 0
56
- private var lastRecoveryAt: CFTimeInterval = 0
57
- private var lastTaskOrigin: String = "cold"
58
-
59
- private(set) var sttActive = false
60
-
61
- // partial cadence monitor
62
- private var emaPartialGap: Double = 0 // exponential moving average of time between partials
63
- private let emaAlpha: Double = 0.3
64
-
65
- // MARK: - Event names (unchanged)
66
- public static let supportedEvents: [String] = [
67
- "onSpeechResults",
68
- "onSpeechStart",
69
- "onSpeechPartialResults",
70
- "onSpeechError",
71
- "onSpeechEnd",
72
- "onSpeechRecognized",
73
- "onSpeechVolumeChanged"
74
- ]
75
-
76
- // MARK: - Public API (native replacements for the former RCT methods)
77
-
78
- public func isSpeechAvailable(_ completion: @escaping (Bool) -> Void) {
79
- SFSpeechRecognizer.requestAuthorization { status in
80
- switch status {
81
- case .authorized: completion(true)
82
- default: completion(false)
83
- }
84
- }
85
- }
86
-
87
- public func isRecognizing() -> Bool {
88
- guard let task = recognitionTask else { return false }
89
- return task.state == .running
90
- }
91
-
92
- private func rebindEngineConfigObserver(to newEngine: AVAudioEngine?) {
93
- let nc = NotificationCenter.default
94
- if let old = observedEngineForConfigChange {
95
- nc.removeObserver(self,
96
- name: .AVAudioEngineConfigurationChange,
97
- object: old)
98
- }
99
- observedEngineForConfigChange = newEngine
100
- if let e = newEngine {
101
- nc.addObserver(self,
102
- selector: #selector(handleEngineConfigChange(_:)),
103
- name: .AVAudioEngineConfigurationChange,
104
- object: e)
105
- }
106
- }
107
-
108
- private func ensurePlaybackNode(in engine: AVAudioEngine) -> AVAudioPlayerNode {
109
- // If we have a node but it's tied to a different engine or got disconnected, recreate it.
110
- if let p = playbackNode, p.engine === engine {
111
- return p
112
- }
113
- let p = AVAudioPlayerNode()
114
- playbackNode = p
115
- engine.attach(p)
116
- // Connect with nil format so the mixer does SRC if needed
117
- engine.connect(p, to: engine.mainMixerNode, format: nil)
118
- return p
119
- }
120
-
121
- private func startWatchdog() {
122
- stallWatchdog?.invalidate()
123
- stallWatchdog = Timer.scheduledTimer(withTimeInterval: 2.0, repeats: true) { [weak self] _ in
124
- self?.checkTaskHealth()
125
- }
126
- RunLoop.main.add(stallWatchdog!, forMode: .common)
127
- }
128
-
129
- private func stopWatchdog() {
130
- stallWatchdog?.invalidate()
131
- stallWatchdog = nil
132
- }
133
-
134
- private func rearmTask(reason: String) {
135
- // Cancel old task only — keep the engine and tap running.
136
- recognitionTask?.cancel()
137
- recognitionTask = nil
138
-
139
- seenRealSpeech = false
140
- lastTaskStartAt = CACurrentMediaTime()
141
- startTask(makeFreshRequest())
142
- NSLog("[STT] rearmTask(\(reason)) -> new task started")
143
- }
144
-
145
- private func checkTaskHealth() {
146
- guard let engine = audioEngine else { return }
147
- let now = CACurrentMediaTime()
148
-
149
- // Engine down? Let your existing logic handle it; just bail.
150
- if !engine.isRunning { return }
151
-
152
- // If recognizer is globally unavailable, don’t thrash — wait until it flips back.
153
- if let rec = speechRecognizer, rec.isAvailable == false {
154
- NSLog("[STT] watchdog: recognizer unavailable; waiting…")
155
- return
156
- }
157
-
158
- // No task at all? Spin one up.
159
- if recognitionTask == nil {
160
- if now - lastRearmAt > rearmCooldownTask {
161
- NSLog("[STT] watchdog: no task -> start fresh request")
162
- lastRearmAt = now
163
- startTask(makeFreshRequest())
164
- }
165
- return
166
- }
167
-
168
- // If we’ve had buffers recently but no results for a while, assume the task is stuck.
169
- let noResultsFor = now - lastResultAt
170
- let hadRecentAudio = (now - lastBufferAt) < max(2.0, stallThreshold) // tap is alive
171
-
172
- if hadRecentAudio && noResultsFor > stallThreshold {
173
- if now - lastRearmAt > rearmCooldownTask {
174
- consecutiveStallCount += 1
175
- NSLog("[STT] watchdog: stall detected (no results for \(Int(noResultsFor))s, audio flowing). rearm #\(consecutiveStallCount)")
176
-
177
- rearmTask(reason: "watchdog-stall")
178
- lastRearmAt = now
179
-
180
- // If we stall repeatedly, recreate the recognizer itself (server/session could be hosed)
181
- if consecutiveStallCount >= 3 {
182
- recreateSpeechRecognizerPreservingLocale()
183
- consecutiveStallCount = 0
184
- }
185
- }
186
- } else if hadRecentAudio {
187
- // Healthy path: audio & results are flowing; reset stall counter
188
- consecutiveStallCount = 0
189
- }
190
- }
191
-
192
- public func startSpeech(localeStr: String?) {
193
- NSLog("[STT] startSpeech(locale=\(localeStr ?? "nil"))")
194
-
195
- if recognitionTask != nil {
196
- sendResult(error: ["code": "already_started", "message": "Speech recognition already started!"],
197
- bestTranscription: nil, transcriptions: nil, isFinal: nil)
198
- return
199
- }
200
-
201
- SFSpeechRecognizer.requestAuthorization { [weak self] status in
202
- guard let self = self else { return }
203
- switch status {
204
- case .notDetermined:
205
- self.sendResult(error: ["message": "Speech recognition not yet authorized"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
206
- case .denied:
207
- self.sendResult(error: ["message": "User denied access to speech recognition"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
208
- case .restricted:
209
- self.sendResult(error: ["message": "Speech recognition restricted on this device"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
210
- case .authorized:
211
- self.setupAndStartRecognizing(localeStr: localeStr)
212
- @unknown default:
213
- self.sendResult(error: ["message": "Unknown authorization status"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
214
- }
215
- }
216
- }
217
-
218
- public func stopSpeech(_ completion: ((Bool) -> Void)? = nil) {
219
- NSLog("[STT] stopSpeech() requested by app")
220
- recognitionTask?.finish()
221
- completion?(false)
222
- }
223
-
224
- public func cancelSpeech(_ completion: ((Bool) -> Void)? = nil) {
225
- NSLog("[STT] cancelSpeech() requested by app")
226
-
227
- recognitionTask?.cancel()
228
- completion?(false)
229
- }
230
-
231
- public func destroySpeech(_ completion: ((Bool) -> Void)? = nil) {
232
- NSLog("[STT] **** destroySpeech!!!")
233
- teardown()
234
- completion?(false)
235
- }
236
-
237
- /// Try to avoide this!!!
238
- // Pick the best input and (optionally) prefer speaker only when nothing external is present.
239
- private func updateSessionRouting(selectBestInput: Bool = true) {
240
- let s = AVAudioSession.sharedInstance()
241
-
242
- // External *playback* devices are irrelevant for capture; don't “opt out” to A2DP.
243
- let hasWiredOrCar = s.currentRoute.outputs.contains { $0.portType == .headphones || $0.portType == .carAudio || $0.portType == .usbAudio }
244
-
245
- // *** NEW: log availableInputs so we can see what iOS exposes on iPhone 15 + AirPods
246
- let avail = (s.availableInputs ?? []).map { "\($0.portType.rawValue):\($0.portName)" }.joined(separator: ", ")
247
- NSLog("[STT] availableInputs=[\(avail)]")
248
-
249
- if selectBestInput, let inputs = s.availableInputs {
250
- // Prefer true two-way BT (HFP). AirPods NEVER show up as .bluetoothLE here.
251
- let btHFP = inputs.first { $0.portType == .bluetoothHFP }
252
- let wired = inputs.first { $0.portType == .headsetMic }
253
- let built = inputs.first { $0.portType == .builtInMic }
254
- let best = btHFP ?? wired ?? built
255
-
256
- do {
257
- if s.preferredInput?.uid != best?.uid {
258
- try s.setPreferredInput(best)
259
- }
260
- } catch {
261
- NSLog("[STT] setPreferredInput failed: \(error.localizedDescription)")
262
- }
263
-
264
- if let builtIn = best, builtIn.portType == .builtInMic,
265
- let ds = builtIn.dataSources?.first(where: { $0.orientation == .bottom || $0.orientation == .back }) {
266
- try? builtIn.setPreferredDataSource(ds)
267
- }
268
- }
269
-
270
- // Don’t advertise A2DP; keep it out to avoid output-only routing.
271
- var opts: AVAudioSession.CategoryOptions = [.allowBluetooth]
272
- if !hasWiredOrCar { opts.insert(.defaultToSpeaker) } // speaker only when nothing wired
273
-
274
- // Voice chat engages VoiceProcessingIO (AEC) and prefers HFP over A2DP automatically.
275
- if s.category != .playAndRecord || s.mode != .voiceChat || s.categoryOptions != opts {
276
- NSLog("[STT] reapply .playAndRecord / .voiceChat (opts=\(opts))")
277
- try? s.setCategory(.playAndRecord, mode: .voiceChat, options: opts)
278
- }
279
-
280
- // Always (re)activate after (re)applying the route
281
- try? s.setActive(true, options: [])
282
-
283
- // DEBUG: verify we actually got HFP when BT is connected
284
- let inPorts = s.currentRoute.inputs.map { "\($0.portType.rawValue):\($0.portName)" }.joined(separator:", ")
285
- let outPorts = s.currentRoute.outputs.map { "\($0.portType.rawValue):\($0.portName)" }.joined(separator:", ")
286
- NSLog("[STT] route in=[\(inPorts)] out=[\(outPorts)]")
287
- }
288
-
289
- // ↓↓↓ preferred settings helper
290
- private func force16kIfPossible(_ session: AVAudioSession) {
291
- try? session.setPreferredSampleRate(16_000)
292
- if session.isInputAvailable { try? session.setPreferredInputNumberOfChannels(1) }
293
- try? session.setPreferredOutputNumberOfChannels(1)
294
- try? session.setPreferredIOBufferDuration(0.02) // ~20 ms frames
295
- }
296
-
297
- // MARK: - Core logic (kept intact, including AEC order/steps)
298
-
299
- /// Returns true if no errors occurred (identical flow & calls as ObjC) + keep-alive opts.
300
- private func setupAudioSession() -> Bool {
301
- var err: NSError?
302
- let session = AVAudioSession.sharedInstance()
303
- self.audioSession = session
304
-
305
- // Build options to match our routing rules
306
- // (defaultToSpeaker only when no external output is active)
307
- let hasExternalOutput: Bool = session.currentRoute.outputs.contains {
308
- switch $0.portType {
309
- case .headphones, .bluetoothA2DP, .bluetoothHFP, .bluetoothLE, .airPlay, .carAudio, .usbAudio:
310
- return true
311
- default:
312
- return false
313
- }
314
- }
315
-
316
- var opts: AVAudioSession.CategoryOptions = [.allowBluetooth]
317
- if !hasExternalOutput { opts.insert(.defaultToSpeaker) }
318
- if #available(iOS 14.5, *) {
319
- // Prevent muted switch / mic mute from killing our capture pipeline
320
- opts.insert(.overrideMutedMicrophoneInterruption)
321
- }
322
-
323
- do {
324
- try session.setCategory(.playAndRecord, mode: .voiceChat, options: opts)
325
- } catch { err = error as NSError }
326
-
327
- // Force 16k before and after activation (some routes settle only after setActive)
328
- force16kIfPossible(session)
329
- do { try session.setActive(true) } catch { err = error as NSError }
330
- NSLog("[STT] session SR=%.1f inCh=%d outCh=%d (wanted 16000)",
331
- session.sampleRate,
332
- Int(session.inputNumberOfChannels),
333
- Int(session.outputNumberOfChannels))
334
- force16kIfPossible(session)
335
-
336
- if let e = err {
337
- NSLog("[STT] setupAudioSession error: \(e.localizedDescription)")
338
- sendResult(error: ["code": "audio", "message": e.localizedDescription],
339
- bestTranscription: nil, transcriptions: nil, isFinal: nil)
340
- return false
341
- }
342
- return true
343
- }
344
-
345
- private func currentInputFormat(_ engine: AVAudioEngine) -> AVAudioFormat? {
346
- // Prefer whatever CoreAudio currently provides; avoid cached formats.
347
- let fmt = engine.inputNode.outputFormat(forBus: 0)
348
- if fmt.sampleRate > 0 && fmt.channelCount > 0 { return fmt }
349
- // Fallback: build a sane mono format from session if ever needed.
350
- let sr = max(8000, AVAudioSession.sharedInstance().sampleRate)
351
- return AVAudioFormat(commonFormat: .pcmFormatFloat32,
352
- sampleRate: sr,
353
- channels: 1,
354
- interleaved: false)
355
- }
356
-
357
- private func isHeadsetPluggedIn() -> Bool {
358
- let route = AVAudioSession.sharedInstance().currentRoute
359
- for out in route.outputs {
360
- if out.portType == .headphones || out.portType == .bluetoothA2DP {
361
- return true
362
- }
363
- }
364
- return false
365
- }
366
-
367
- private func isHeadSetBluetooth() -> Bool {
368
- for port in AVAudioSession.sharedInstance().availableInputs ?? [] {
369
- if port.portType == .bluetoothHFP { return true }
370
- }
371
- return false
372
- }
373
-
374
- private func loadContextualStrings() -> [String] {
375
- guard let filePath = Bundle.main.path(forResource: "words_flattened", ofType: "txt") else {
376
- NSLog("words_flattened.txt not found in bundle")
377
- return []
378
- }
379
- do {
380
- let contents = try String(contentsOfFile: filePath, encoding: .utf8)
381
- let rawItems = contents.components(separatedBy: ",")
382
- var cleaned: [String] = []
383
- cleaned.reserveCapacity(rawItems.count)
384
- for item in rawItems {
385
- var t = item.trimmingCharacters(in: .whitespacesAndNewlines)
386
- t = t.replacingOccurrences(of: "\"", with: "")
387
- if !t.isEmpty { cleaned.append(t) }
388
- }
389
- return cleaned
390
- } catch {
391
- NSLog("Error reading contextualStrings: \(error)")
392
- return []
393
- }
394
- }
395
-
396
- // Add helpers
397
- private func makeFreshRequest() -> SFSpeechAudioBufferRecognitionRequest {
398
- let req = SFSpeechAudioBufferRecognitionRequest()
399
- if #available(iOS 16, *) { req.addsPunctuation = true }
400
- req.shouldReportPartialResults = true
401
- //if #available(iOS 13.0, *) { req.taskHint = .dictation }
402
- req.contextualStrings = loadContextualStrings()
403
- self.recognitionRequest = req
404
- NSLog("makeFreshRequest()")
405
- return req
406
- }
407
-
408
- private func startTask(_ req: SFSpeechAudioBufferRecognitionRequest) {
409
- NSLog("starting recognitionTask")
410
- lastTaskStartAt = CACurrentMediaTime()
411
- lastResultAt = lastTaskStartAt
412
- let taskSessionId = self.sessionId
413
- self.recognitionTask = self.speechRecognizer?.recognitionTask(with: req) { [weak self] result, error in
414
- guard let self = self else { return }
415
- if taskSessionId != self.sessionId { NSLog("task session mismatch -> ignore"); return }
416
- self.lastResultAt = CACurrentMediaTime()
417
-
418
- func markIfReal(_ r: SFSpeechRecognitionResult?) {
419
- guard let r = r else { return }
420
- let best = r.bestTranscription.formattedString.trimmingCharacters(in: .whitespacesAndNewlines)
421
- if !best.isEmpty ||
422
- r.transcriptions.contains(where: { !$0.formattedString.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty }) {
423
- if !self.seenRealSpeech {
424
- self.seenRealSpeech = true
425
- NSLog("first real speech detected -> onSpeechStart to JS")
426
- self.sendEvent(name: "onSpeechStart", body: nil)
427
- }
428
- }
429
- }
430
- markIfReal(result)
431
-
432
- func rearm(_ why: String, delay: TimeInterval = 0.05) {
433
- guard self.continuous else { return }
434
- NSLog("REARM (\(why))")
435
- self.recognitionTask?.cancel()
436
- self.recognitionTask = nil
437
- DispatchQueue.main.asyncAfter(deadline: .now() + delay) {
438
- self.startTask(self.makeFreshRequest())
439
- }
440
- }
441
-
442
- if let error = error {
443
- NSLog("task error \(error._code): \(error.localizedDescription)")
444
- // treat as transient for continuous mode
445
- self.rearmTask(reason: "error")
446
- return
447
- }
448
-
449
- guard let result = result else {
450
- NSLog("task nil result")
451
- self.rearmTask(reason: "nil-result")
452
- return
453
- }
454
-
455
- let isFinal = result.isFinal
456
- let parts = result.transcriptions.map { $0.formattedString }
457
- self.sendResult(error: nil,
458
- bestTranscription: result.bestTranscription.formattedString,
459
- transcriptions: parts,
460
- isFinal: isFinal)
461
-
462
- if isFinal {
463
- NSLog("task final -> onSpeechEnd")
464
- self.sendEvent(name: "onSpeechEnd", body: nil)
465
- if self.continuous {
466
- self.rearmTask(reason: "final")
467
- } else {
468
- NSLog("non-continuous final -> teardown")
469
- self.teardown()
470
- }
471
- }
472
- }
473
- }
474
-
475
- public func teardown() {
476
- NSLog("[STT] teardown() begin")
477
- isTearingDown = true
478
- stopWatchdog()
479
- consecutiveStallCount = 0
480
-
481
- if let task = recognitionTask {
482
- task.cancel()
483
- recognitionTask = nil
484
- }
485
- AudioPlaybackHook.engineScheduleFile = nil
486
- AudioPlaybackHook.isEngineReady = nil
487
- AudioPlaybackHook.useOnlyEnginePlayback = nil
488
- AudioPlaybackHook.stopEnginePlayback = nil // ← NEW
489
- sttActive = false
490
-
491
- if let p = playbackNode {
492
- p.stop()
493
- }
494
- playbackNode = nil
495
-
496
- if let req = recognitionRequest {
497
- req.endAudio()
498
- recognitionRequest = nil
499
- }
500
-
501
- if let engine = audioEngine {
502
- if engine.inputNode != nil {
503
- engine.inputNode.removeTap(onBus: 0)
504
- engine.inputNode.reset()
505
- }
506
- if engine.isRunning {
507
- engine.stop()
508
- }
509
- engine.reset()
510
- rebindEngineConfigObserver(to: nil)
511
- audioEngine = nil // Crucial step!
512
- }
513
-
514
- resetAudioSession()
515
-
516
- sessionId = nil
517
- isTearingDown = false
518
- }
519
-
520
- private func resetAudioSession() {
521
- if audioSession == nil {
522
- audioSession = AVAudioSession.sharedInstance()
523
- }
524
- guard let session = audioSession else { return }
525
-
526
- // Preserve & compare category exactly as original logic
527
- let current = session.category
528
- if priorAudioCategory == current { return }
529
-
530
- // (kept commented as in your code)
531
- // do {
532
- // try session.setCategory(priorAudioCategory ?? .soloAmbient,
533
- // mode: .default,
534
- // options: [.allowBluetooth,
535
- // .defaultToSpeaker,
536
- // .allowAirPlay,
537
- // .mixWithOthers])
538
- // } catch { }
539
- audioSession = nil
540
- }
541
-
542
- // LATEST assertAEC
543
- private func assertAEC(_ engine: AVAudioEngine) {
544
- do { try engine.inputNode.setVoiceProcessingEnabled(true) }
545
- catch { NSLog("[STT] assertAEC: setVoiceProcessingEnabled(true) failed: \(error)") }
546
- }
547
-
548
- private func isPlayerConnected(_ player: AVAudioPlayerNode?, to engine: AVAudioEngine?) -> Bool {
549
- guard let p = player, let e = engine else { return false }
550
- // If the node is attached and has a non-zero channel count on its output, it’s effectively connected.
551
- let fmt = p.outputFormat(forBus: 0)
552
- return (p.engine === e) && (fmt.channelCount > 0) && (fmt.sampleRate > 0)
553
- }
554
-
555
- /// Try to keep the capture alive without tearing down recognition.
556
- /// 1) If engine exists but not running → try start()
557
- /// 2) If start fails or graph became invalid → rebuild graph and start
558
- /// 3) If we don’t have a task yet, start one.
559
- private func ensureEngineRunning(reason: String) {
560
- let now = CFAbsoluteTimeGetCurrent()
561
- if (now - lastReclaimAttempt) < reclaimCooldown {
562
- NSLog("[STT] ensureEngineRunning(\(reason)) skipped (cooldown)")
563
- return
564
- }
565
- lastReclaimAttempt = now
566
-
567
- if let e = audioEngine, !e.isRunning {
568
- assertAEC(e)
569
- do {
570
- playbackNode?.stop()
571
- playbackNode = nil
572
- try e.start()
573
- NSLog("🔄 AVAudioEngine restarted after config change. isRunning=\(e.isRunning)")
574
- } catch {
575
- NSLog("❌ Could not re-start after config change: \(error)")
576
- }
577
- }
578
-
579
- // --- full recovery path (this was previously dead code) ---
580
- guard let engine = audioEngine else {
581
- NSLog("[STT] ensureEngineRunning(\(reason)): no engine → rebuild")
582
- rebuildEngineGraphAndRestart(reason: reason)
583
- return
584
- }
585
-
586
- assertAEC(engine)
587
-
588
- if !engine.isRunning {
589
- do {
590
- try engine.start()
591
- NSLog("[STT] ensureEngineRunning(\(reason)): engine.start() -> \(engine.isRunning)")
592
- } catch {
593
- NSLog("[STT] ensureEngineRunning(\(reason)): engine.start() failed: \(error) → rebuild")
594
- rebuildEngineGraphAndRestart(reason: reason)
595
- return
596
- }
597
- }
598
-
599
- if recognitionTask == nil {
600
- if let req = recognitionRequest {
601
- NSLog("[STT] ensureEngineRunning(\(reason)): no task -> startTask(existing req)")
602
- startTask(req)
603
- } else {
604
- NSLog("[STT] ensureEngineRunning(\(reason)): no req -> makeFreshRequest + startTask")
605
- startTask(makeFreshRequest())
606
- }
607
- }
608
- }
609
-
610
- /// Rebuilds AVAudioEngine graph (mic→mute mixer, player→mainMixer), reinstalls tap,
611
- /// and restarts the engine. Does NOT nuke the current recognitionRequest/task unless required.
612
- private func rebuildEngineGraphAndRestart(reason: String) {
613
- NSLog("[STT] 🔄 rebuildEngineGraphAndRestart (\(reason))")
614
-
615
- // Keep current request if present; we'll keep appending into it
616
- let existingReq = self.recognitionRequest
617
-
618
- // Tear down engine ONLY (keep session, request)
619
- if let engine = audioEngine {
620
- if engine.inputNode != nil {
621
- engine.inputNode.removeTap(onBus: 0)
622
- engine.inputNode.reset()
623
- }
624
- if engine.isRunning { engine.stop() }
625
- engine.reset()
626
- }
627
-
628
- // Recreate engine and graph
629
- let newEngine = AVAudioEngine()
630
- self.audioEngine = newEngine
631
-
632
- let inputNode = newEngine.inputNode
633
- do {
634
- try inputNode.setVoiceProcessingEnabled(true)
635
- } catch {
636
- NSLog("[STT] rebuild: failed to enable voice processing: \(error)")
637
- }
638
- if #available(iOS 17.0, *) {
639
- var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
640
- duck.enableAdvancedDucking = false
641
- duck.duckingLevel = .min
642
- inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
643
- }
644
-
645
- // Live format (may be 0 Hz briefly during route churn)
646
- let liveFmt = newEngine.inputNode.outputFormat(forBus: 0)
647
- guard liveFmt.sampleRate > 0, liveFmt.channelCount > 0 else {
648
- NSLog("[STT] rebuild: input format invalid (0 Hz) — retry shortly")
649
- DispatchQueue.main.asyncAfter(deadline: .now() + 0.05) { [weak self] in
650
- self?.ensureEngineRunning(reason: "wait-valid-input-format(rebuild)")
651
- }
652
- return
653
- }
654
-
655
- // mic → mute mixer → mainMixer
656
- let micMixer = AVAudioMixerNode()
657
- newEngine.attach(micMixer)
658
- // Use nil to let engine pick a valid format (avoids 0 Hz assertion)
659
- newEngine.connect(inputNode, to: micMixer, format: nil)
660
- newEngine.connect(micMixer, to: newEngine.mainMixerNode, format: nil)
661
- micMixer.outputVolume = 0.0
662
-
663
- // TTS player → mainMixer (keep same player if possible, else recreate)
664
- if playbackNode == nil { playbackNode = AVAudioPlayerNode() }
665
- if let player = playbackNode {
666
- if player.engine == nil { newEngine.attach(player) }
667
- newEngine.connect(player, to: newEngine.mainMixerNode, format: nil)
668
- }
669
-
670
- do {
671
- try? inputNode.removeTap(onBus: 0)
672
- } catch {
673
- NSLog("[STT] removeTap error: \(error)")
674
- }
675
-
676
- let targetFmt = AVAudioFormat(commonFormat: .pcmFormatFloat32,
677
- sampleRate: 16_000,
678
- channels: 1,
679
- interleaved: false)!
680
-
681
- // Tap with nil so it follows route changes automatically
682
- inputNode.installTap(onBus: 0, bufferSize: 1024, format: nil) { [weak self] buffer, _ in
683
- guard let self = self else { return }
684
-
685
- // (same level metering as your current code)
686
- let frames: vDSP_Length = vDSP_Length(buffer.frameLength)
687
- let LP: Float = 0.5
688
-
689
- if buffer.format.channelCount > 0, let ch0 = buffer.floatChannelData?[0] {
690
- var peak0: Float = 0
691
- vDSP_maxmgv(ch0, 1, &peak0, frames)
692
- let db0: Float = (peak0 == 0) ? -100 : 20.0 * log10f(peak0)
693
- let sm0 = LP * db0 + (1 - LP) * self.averagePowerForChannel0
694
- self.averagePowerForChannel0 = sm0
695
- self.averagePowerForChannel1 = sm0
696
- }
697
- if buffer.format.channelCount > 1, let ch1 = buffer.floatChannelData?[1] {
698
- var peak1: Float = 0
699
- vDSP_maxmgv(ch1, 1, &peak1, frames)
700
- let db1: Float = (peak1 == 0) ? -100 : 20.0 * log10f(peak1)
701
- let sm1 = LP * db1 + (1 - LP) * self.averagePowerForChannel1
702
- self.averagePowerForChannel1 = sm1
703
- }
704
- self.averagePowerForChannel1 = Float(self._normalizedPowerLevelFromDecibels(CGFloat(self.averagePowerForChannel1)) * 10.0)
705
- self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": self.averagePowerForChannel1])
706
-
707
- // ---- Convert to 16 kHz MONO for STT request
708
- let inFmt = buffer.format
709
- if inFmt.sampleRate != 16_000 || inFmt.channelCount != 1 {
710
- if let conv = AVAudioConverter(from: inFmt, to: targetFmt) {
711
- let ratio = targetFmt.sampleRate / inFmt.sampleRate
712
- let outCap = AVAudioFrameCount(Double(buffer.frameLength) * ratio) + 8
713
- if let outBuf = AVAudioPCMBuffer(pcmFormat: targetFmt, frameCapacity: outCap) {
714
- var err: NSError? = nil
715
- var fed = false
716
- conv.convert(to: outBuf, error: &err) { _, outStatus -> AVAudioBuffer? in
717
- if fed {
718
- outStatus.pointee = .endOfStream
719
- return nil
720
- } else {
721
- fed = true
722
- outStatus.pointee = .haveData
723
- return buffer
724
- }
725
- }
726
- if err == nil {
727
- self.recognitionRequest?.append(outBuf)
728
- } else {
729
- self.recognitionRequest?.append(buffer) // fallback
730
- }
731
- } else {
732
- self.recognitionRequest?.append(buffer)
733
- }
734
- } else {
735
- self.recognitionRequest?.append(buffer)
736
- }
737
- } else {
738
- self.recognitionRequest?.append(buffer)
739
- }
740
- self.lastBufferAt = CACurrentMediaTime()
741
- }
742
-
743
- newEngine.prepare()
744
- do {
745
- try newEngine.start()
746
- let f = newEngine.inputNode.outputFormat(forBus: 0)
747
- NSLog("[STT] rebuild: engine.start() ok, running=\(newEngine.isRunning) (fmt=%.1f Hz / %d ch)",
748
- f.sampleRate, Int(f.channelCount))
749
- } catch {
750
- NSLog("[STT] rebuild: engine.start() failed: \(error)")
751
- }
752
-
753
- // If we lost the request during rebuild, recreate + start task.
754
- if self.recognitionRequest == nil {
755
- if let old = existingReq {
756
- self.recognitionRequest = old
757
- } else {
758
- self.recognitionRequest = makeFreshRequest()
759
- }
760
- }
761
- if self.recognitionTask == nil {
762
- startTask(self.recognitionRequest!)
763
- }
764
- rebindEngineConfigObserver(to: newEngine)
765
- }
766
-
767
- @objc private func handleEngineConfigChange(_ note: Notification) {
768
- NSLog("[STT] ⚙️ AVAudioEngineConfigurationChange")
769
-
770
- // If engine stopped, drop the player node (it will be lazily recreated)
771
- if let e = audioEngine, !e.isRunning {
772
- playbackNode?.stop()
773
- playbackNode = nil
774
- }
775
-
776
- // Re-assert a mic-capable route (HFP/wired/built-in)
777
- updateSessionRouting(selectBestInput: true)
778
-
779
- // Re-enable VoiceProcessingIO (AEC) and restart if needed
780
- ensureEngineRunning(reason: "engine-config-change")
781
- }
782
-
783
- @objc private func handleMediaServicesReset(_ note: Notification) {
784
- NSLog("[STT] 📺 Media services were RESET: reclaiming mic & session")
785
- // Re-apply audio session and try to rebuild graph if needed
786
- _ = setupAudioSession()
787
- ensureEngineRunning(reason: "media-services-reset")
788
- }
789
-
790
- @objc private func handleRouteChange(_ note: Notification) {
791
- let info = note.userInfo ?? [:]
792
- NSLog("[STT] 🔀 route change: \(info)")
793
-
794
- let s = AVAudioSession.sharedInstance()
795
-
796
- // 1) Re-apply a mic-safe category/mode and prefer HFP/built-in mic.
797
- updateSessionRouting(selectBestInput: true)
798
- if let inputs = s.availableInputs {
799
- let preferred = inputs.first { $0.portType == .bluetoothHFP }
800
- ?? inputs.first { $0.portType == .headsetMic }
801
- ?? inputs.first { $0.portType == .builtInMic }
802
- try? s.setPreferredInput(preferred)
803
- }
804
-
805
- // 2) If there’s still no input, don’t thrash; wait for a usable route.
806
- let inputs = s.currentRoute.inputs
807
- NSLog("[STT] 🎤 inputs after route fix: \(inputs.map { $0.portType.rawValue })")
808
- guard !inputs.isEmpty else {
809
- NSLog("[STT] ⚠️ No mic route available (likely A2DP/AirPlay). Not restarting engine.")
810
- return
811
- }
812
-
813
- // 3) Now recover the engine/task
814
- ensureEngineRunning(reason: "route-change")
815
- }
816
-
817
- // Call once after engine is created
818
- private func installEngineObservers() {
819
- let nc = NotificationCenter.default
820
-
821
- nc.addObserver(self,
822
- selector: #selector(handleSessionInterruption(_:)),
823
- name: AVAudioSession.interruptionNotification,
824
- object: AVAudioSession.sharedInstance())
825
-
826
- nc.addObserver(self,
827
- selector: #selector(handleRouteChange(_:)),
828
- name: AVAudioSession.routeChangeNotification,
829
- object: AVAudioSession.sharedInstance())
830
-
831
- nc.addObserver(self,
832
- selector: #selector(handleMediaServicesReset(_:)),
833
- name: AVAudioSession.mediaServicesWereResetNotification,
834
- object: nil)
835
- }
836
-
837
- @objc private func handleSessionInterruption(_ note: Notification) {
838
- guard
839
- let info = note.userInfo,
840
- let typeVal = info[AVAudioSessionInterruptionTypeKey] as? UInt,
841
- let type = AVAudioSession.InterruptionType(rawValue: typeVal)
842
- else { return }
843
-
844
- if type == .ended {
845
- // On real “render err” Core Audio posts an interruption END
846
- NSLog("Session interruption ended (possible render err):")
847
- }
848
- }
849
-
850
- private func setupAndStartRecognizing(localeStr: String?) {
851
- NSLog("[STT] setupAndStartRecognizing begin")
852
- sttActive = true
853
-
854
- audioSession = AVAudioSession.sharedInstance()
855
- guard let session = audioSession else { return }
856
- var err: NSError?
857
-
858
- priorAudioCategory = session.category
859
-
860
- // Tear down resources before starting speech recognition..
861
- NSLog("[STT] pre-teardown")
862
- teardown()
863
- // ** IMPORTANT ** Call this again as teardown marks this false
864
- sttActive = true
865
-
866
- sessionId = UUID().uuidString
867
-
868
- let locale: Locale? = {
869
- if let s = localeStr, !s.isEmpty { return Locale(identifier: s) }
870
- sttActive = false
871
- return nil
872
- }()
873
-
874
- if let loc = locale {
875
- speechRecognizer = SFSpeechRecognizer(locale: loc)
876
- } else {
877
- speechRecognizer = SFSpeechRecognizer()
878
- }
879
- speechRecognizer?.delegate = self
880
-
881
- // Start audio session...
882
- NSLog("[STT] setupAudioSession()")
883
- guard setupAudioSession() else {
884
- NSLog("[STT] ERROR ERROR ******** setupAudioSession()")
885
- teardown()
886
- sttActive = false
887
- return
888
- }
889
- installEngineObservers()
890
-
891
- let request = SFSpeechAudioBufferRecognitionRequest()
892
- recognitionRequest = request
893
-
894
- if #available(iOS 16, *) {
895
- request.addsPunctuation = true
896
- }
897
- request.shouldReportPartialResults = true
898
- //if #available(iOS 13.0, *) { request.taskHint = .dictation }
899
- request.contextualStrings = loadContextualStrings()
900
-
901
- guard recognitionRequest != nil else {
902
- sendResult(error: ["code": "recognition_init"], bestTranscription: nil, transcriptions: nil, isFinal: nil)
903
- teardown()
904
- return
905
- }
906
-
907
- if audioEngine == nil {
908
- audioEngine = AVAudioEngine()
909
- rebindEngineConfigObserver(to: audioEngine)
910
- }
911
- do {
912
- guard let engine = audioEngine else { throw NSError(domain: "voice.audio", code: -1) }
913
- let inputNode = engine.inputNode
914
- _ = inputNode // presence check
915
-
916
- // Enable voice processing (AEC)
917
- do {
918
- try inputNode.setVoiceProcessingEnabled(true)
919
- } catch {
920
- NSLog("Failed to enable voice processing for AEC on input node: \(error)")
921
- }
922
-
923
- if #available(iOS 17.0, *) {
924
- var duck = AVAudioVoiceProcessingOtherAudioDuckingConfiguration()
925
- duck.enableAdvancedDucking = false // disable advanced (VAD-based) ducking
926
- duck.duckingLevel = .min // “as loud as possible” for other audio
927
- inputNode.voiceProcessingOtherAudioDuckingConfiguration = duck
928
- }
929
-
930
- NSLog("[STT] AEC enable done")
931
-
932
- // Live format guard (can briefly be 0 Hz on route churn)
933
- let liveFmt = engine.inputNode.outputFormat(forBus: 0)
934
- guard liveFmt.sampleRate > 0, liveFmt.channelCount > 0 else {
935
- NSLog("[STT] start: input format invalid (0 Hz) — retry shortly")
936
- DispatchQueue.main.asyncAfter(deadline: .now() + 0.05) { [weak self] in
937
- self?.ensureEngineRunning(reason: "wait-valid-input-format(start)")
938
- }
939
- return
940
- }
941
-
942
- // 1) Mute only the mic path, not the whole main mixer
943
- let micMixer = AVAudioMixerNode()
944
- engine.attach(micMixer)
945
- // Let engine choose format to avoid 0 Hz assertions
946
- engine.connect(inputNode, to: micMixer, format: nil)
947
- engine.connect(micMixer, to: engine.mainMixerNode, format: nil)
948
- micMixer.outputVolume = 0.0 // ← you won't hear your own mic
949
-
950
- // 2) Prepare a player node for TTS inside the SAME engine/graph
951
- let player = AVAudioPlayerNode()
952
- self.playbackNode = player
953
- engine.attach(player)
954
- engine.connect(player, to: engine.mainMixerNode, format: nil)
955
-
956
- NSLog("[STT] graph connected (mic->mute mixer, player->mainMixer)")
957
-
958
- var tapFrames: UInt64 = 0
959
-
960
- do { try? inputNode.removeTap(onBus: 0) } catch {
961
- NSLog("[STT] removeTap error: \(error)")
962
- }
963
-
964
- let targetFmt = AVAudioFormat(commonFormat: .pcmFormatFloat32,
965
- sampleRate: 16_000,
966
- channels: 1,
967
- interleaved: false)!
968
-
969
- // Tap with nil so it follows the node’s live format automatically
970
- inputNode.installTap(onBus: 0, bufferSize: 1024, format: nil) { [weak self] buffer, _ in
971
- // Strongify self once
972
- guard let self = self else { return }
973
- tapFrames &+= UInt64(buffer.frameLength)
974
- if tapFrames % (44100 * 2) < 1024 { // ~every ~2s at 44.1k
975
- NSLog("[STT] tap alive, totalFrames=\(tapFrames)")
976
- }
977
-
978
- let frames: vDSP_Length = vDSP_Length(buffer.frameLength)
979
- let LEVEL_LOWPASS_TRIG: Float = 0.5
980
-
981
- // CH0
982
- if buffer.format.channelCount > 0, let ch0 = buffer.floatChannelData?[0] {
983
- var peak0: Float = 0
984
- vDSP_maxmgv(ch0, 1, &peak0, frames)
985
- let db0: Float = (peak0 == 0) ? -100 : 20.0 * log10f(peak0)
986
-
987
- let smoothed0 = LEVEL_LOWPASS_TRIG * db0
988
- + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel0
989
- self.averagePowerForChannel0 = smoothed0
990
- self.averagePowerForChannel1 = smoothed0
991
- }
992
-
993
- // CH1
994
- if buffer.format.channelCount > 1, let ch1 = buffer.floatChannelData?[1] {
995
- var peak1: Float = 0
996
- vDSP_maxmgv(ch1, 1, &peak1, frames)
997
- let db1: Float = (peak1 == 0) ? -100 : 20.0 * log10f(peak1)
998
-
999
- let smoothed1 = LEVEL_LOWPASS_TRIG * db1
1000
- + (1 - LEVEL_LOWPASS_TRIG) * self.averagePowerForChannel1
1001
- self.averagePowerForChannel1 = smoothed1
1002
- }
1003
-
1004
- // Normalize 0–10 and emit
1005
- self.averagePowerForChannel1 = Float(self._normalizedPowerLevelFromDecibels(CGFloat(self.averagePowerForChannel1)) * 10.0)
1006
- let value = self.averagePowerForChannel1
1007
- self.sendEvent(name: "onSpeechVolumeChanged", body: ["value": value])
1008
-
1009
- // ---- Convert to 16 kHz MONO for STT request
1010
- let inFmt = buffer.format
1011
- if inFmt.sampleRate != 16_000 || inFmt.channelCount != 1 {
1012
- if let conv = AVAudioConverter(from: inFmt, to: targetFmt) {
1013
- // Conservative capacity +8 frames
1014
- let ratio = targetFmt.sampleRate / inFmt.sampleRate
1015
- let outCap = AVAudioFrameCount(Double(buffer.frameLength) * ratio) + 8
1016
- if let outBuf = AVAudioPCMBuffer(pcmFormat: targetFmt, frameCapacity: outCap) {
1017
- var err: NSError? = nil
1018
- var fed = false
1019
- conv.convert(to: outBuf, error: &err) { _, outStatus -> AVAudioBuffer? in
1020
- if fed {
1021
- outStatus.pointee = .endOfStream
1022
- return nil
1023
- } else {
1024
- fed = true
1025
- outStatus.pointee = .haveData
1026
- return buffer
1027
- }
1028
- }
1029
- if err == nil {
1030
- self.recognitionRequest?.append(outBuf)
1031
- } else {
1032
- self.recognitionRequest?.append(buffer) // fallback
1033
- }
1034
- } else {
1035
- self.recognitionRequest?.append(buffer)
1036
- }
1037
- } else {
1038
- self.recognitionRequest?.append(buffer)
1039
- }
1040
- } else {
1041
- self.recognitionRequest?.append(buffer)
1042
- }
1043
-
1044
- self.lastBufferAt = CACurrentMediaTime()
1045
- }
1046
-
1047
- engine.prepare()
1048
- NSLog("[STT] audioEngine prepare")
1049
- var audioSessionError: NSError?
1050
- do {
1051
- try engine.start()
1052
- } catch {
1053
- audioSessionError = error as NSError
1054
- }
1055
-
1056
- // after engine.start() success:
1057
- engineHotAt = CACurrentMediaTime()
1058
- seenRealSpeech = false
1059
- let f = engine.inputNode.outputFormat(forBus: 0)
1060
- NSLog("engine HOT at \(engineHotAt) (fmt=%.1f Hz / %d ch)", f.sampleRate, Int(f.channelCount))
1061
- sendEvent(name: "onSpeechStart", body: nil) // engine hot signal
1062
- startTask(makeFreshRequest())
1063
-
1064
- // Engine is up; expose readiness
1065
- AudioPlaybackHook.isEngineReady = { [weak self] in
1066
- guard let eng = self?.audioEngine else { return false }
1067
- return eng.isRunning
1068
- }
1069
-
1070
- // Tell TTS layer: do NOT use AVAudioPlayer fallback while STT is active
1071
- AudioPlaybackHook.useOnlyEnginePlayback = { [weak self] in
1072
- return self?.sttActive == true
1073
- }
1074
-
1075
- startWatchdog()
1076
-
1077
- // After engine.start() succeeds:
1078
- AudioPlaybackHook.engineScheduleFile = { [weak self] url, done in
1079
- // Always run on main because AVAudioEngine/Nodes are main-thread-y for our usage
1080
- DispatchQueue.main.async {
1081
- guard let self = self else { return }
1082
- if self.isTearingDown { return } // guard against teardown races
1083
-
1084
- guard let engine = self.audioEngine else { return }
1085
-
1086
- do {
1087
-
1088
- // If the graph changed or the node isn't tied to this engine, recreate it.
1089
- if self.playbackNode?.engine !== engine || !self.isPlayerConnected(self.playbackNode, to: engine) {
1090
- self.playbackNode?.stop()
1091
- self.playbackNode = nil
1092
- }
1093
-
1094
- // Ensure the player node is attached/connected to THIS engine
1095
- let player = self.ensurePlaybackNode(in: engine)
1096
-
1097
- // Make sure engine is running before we play
1098
- if !engine.isRunning {
1099
- do { try engine.start() } catch {
1100
- NSLog("[STT] TTS: engine.start() failed: \(error)")
1101
- return
1102
- }
1103
- }
1104
-
1105
- let file = try AVAudioFile(forReading: url)
1106
-
1107
- // Start player after we know it's attached and engine runs
1108
- if !player.isPlaying { player.play() }
1109
-
1110
- player.scheduleFile(file, at: nil) {
1111
- DispatchQueue.main.async { done() }
1112
- }
1113
- } catch {
1114
- NSLog("[STT] TTS schedule error: \(error)")
1115
- }
1116
- }
1117
- return true
1118
- }
1119
-
1120
- AudioPlaybackHook.stopEnginePlayback = { [weak self] in
1121
- DispatchQueue.main.async {
1122
- guard let self = self else { return }
1123
- // Stop only the TTS playback node; keep the engine running for STT
1124
- self.playbackNode?.stop()
1125
- }
1126
- }
1127
-
1128
- NSLog("audioEngine startAndReturnError")
1129
- if let audioSessionError = audioSessionError {
1130
- NSLog("audioEngine start error: \(audioSessionError.localizedDescription)")
1131
- self.sendResult(error: ["code": "audio", "message": audioSessionError.localizedDescription],
1132
- bestTranscription: nil, transcriptions: nil, isFinal: nil)
1133
- return
1134
- }
1135
- NSLog("After Start recording and append recording")
1136
- DispatchQueue.main.asyncAfter(deadline: .now() + 3.0) { [weak self] in
1137
- guard let self = self else { return }
1138
- let running = self.audioEngine?.isRunning ?? false
1139
- let taskState = self.recognitionTask?.state.rawValue ?? -1
1140
- NSLog("[STT] health: engineRunning=\(running) taskState=\(taskState)")
1141
- }
1142
-
1143
- NSLog("After if audioSessionError != nil")
1144
- } catch let e as NSError {
1145
- sendResult(error: ["code": "start_recording", "message": e.localizedDescription],
1146
- bestTranscription: nil, transcriptions: nil, isFinal: nil)
1147
- NSLog("End of init...")
1148
- return
1149
- }
1150
- }
1151
-
1152
- // MARK: - Helpers
1153
-
1154
- private func _normalizedPowerLevelFromDecibels(_ decibels: CGFloat) -> CGFloat {
1155
- if decibels < -80.0 || decibels == 0.0 { return 0.0 }
1156
- let minDb: Float = -80.0
1157
- let pow10_min = powf(10.0, 0.05 * minDb)
1158
- let pow10_db = powf(10.0, 0.05 * Float(decibels))
1159
- let power = powf((pow10_db - pow10_min) * (1.0 / (1.0 - pow10_min)), 1.0 / 2.0)
1160
- if power < 1.0 { return CGFloat(power) } else { return 1.0 }
1161
- }
1162
-
1163
- private func sendEvent(name: String, body: [String: Any]?) {
1164
- delegate?.stt(self, didEmitEvent: name, body: body)
1165
- }
1166
-
1167
- /// Exact event behavior preserved from ObjC `sendResult`.
1168
- private func sendResult(error: [String: Any]?,
1169
- bestTranscription: String?,
1170
- transcriptions: [String]?,
1171
- isFinal: Bool?) {
1172
- if let error = error {
1173
- sendEvent(name: "onSpeechError", body: ["error": error])
1174
- }
1175
- if let best = bestTranscription {
1176
- sendEvent(name: "onSpeechResults", body: ["value": [best]])
1177
- }
1178
- if let trans = transcriptions {
1179
- sendEvent(name: "onSpeechPartialResults", body: ["value": trans])
1180
- }
1181
- if let isFinal = isFinal {
1182
- sendEvent(name: "onSpeechRecognized", body: ["isFinal": isFinal])
1183
- }
1184
- }
1185
-
1186
- // MARK: - SFSpeechRecognizerDelegate
1187
-
1188
- public func speechRecognizer(_ speechRecognizer: SFSpeechRecognizer, availabilityDidChange available: Bool) {
1189
- if available == false {
1190
- sendResult(error: ["message": "Speech recognition is not available now"],
1191
- bestTranscription: nil, transcriptions: nil, isFinal: nil)
1192
- }
1193
- }
1194
-
1195
- // MARK: - Small helper to recreate recognizer (used by watchdog)
1196
- private func recreateSpeechRecognizerPreservingLocale() {
1197
- let loc = speechRecognizer?.locale
1198
- speechRecognizer = loc != nil ? SFSpeechRecognizer(locale: loc!) : SFSpeechRecognizer()
1199
- speechRecognizer?.delegate = self
1200
- NSLog("[STT] recreated SFSpeechRecognizer (locale preserved: \(loc?.identifier ?? "default"))")
1201
- }
1202
- }
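
For context, the backup file removed above defines an STT wrapper around SFSpeechRecognizer/AVAudioEngine that reports everything to its host through a single STTDelegate callback, using the event names listed in supportedEvents (onSpeechStart, onSpeechPartialResults, onSpeechResults, onSpeechError, onSpeechEnd, onSpeechRecognized, onSpeechVolumeChanged). The following minimal sketch shows how that delegate API would be driven from Swift. It is an illustration only: the SpeechEventLogger type and its method names are hypothetical and not part of the package, and the import depends on how (or whether) the STT class is exposed by your target.

// import DavoiceTTS   // assumption: STT/STTDelegate are visible to the host target
import Foundation

final class SpeechEventLogger: NSObject, STTDelegate {
    private let stt = STT()

    override init() {
        super.init()
        stt.delegate = self
        stt.continuous = true          // keep re-arming recognition after final results
    }

    func begin() {
        stt.isSpeechAvailable { available in
            guard available else {
                NSLog("Speech recognition not authorized/available")
                return
            }
            // Pass nil to fall back to the device locale, mirroring startSpeech(localeStr:).
            self.stt.startSpeech(localeStr: "en-US")
        }
    }

    func end() {
        // stopSpeech finishes the current task; destroySpeech would tear the engine down.
        stt.stopSpeech()
    }

    // MARK: - STTDelegate
    func stt(_ stt: STT, didEmitEvent name: String, body: [String: Any]?) {
        switch name {
        case "onSpeechPartialResults", "onSpeechResults":
            NSLog("\(name): \((body?["value"] as? [String]) ?? [])")
        case "onSpeechError":
            NSLog("onSpeechError: \(String(describing: body?["error"]))")
        default:
            break // onSpeechStart / onSpeechEnd / onSpeechVolumeChanged / onSpeechRecognized
        }
    }
}

In the published package the equivalent wiring presumably lives in the newly added package/ios/SpeechBridge/SpeechBridge.m and package/speech/index.ts, whose contents are not included in this excerpt.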