@elizaos/capacitor-talkmode 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ElizaosCapacitorTalkmode.podspec +18 -0
- package/android/build.gradle +46 -0
- package/android/src/main/AndroidManifest.xml +7 -0
- package/android/src/main/java/ai/eliza/plugins/talkmode/TalkModePlugin.kt +1202 -0
- package/dist/esm/definitions.d.ts +277 -0
- package/dist/esm/definitions.d.ts.map +1 -0
- package/dist/esm/definitions.js +1 -0
- package/dist/esm/index.d.ts +4 -0
- package/dist/esm/index.d.ts.map +1 -0
- package/dist/esm/index.js +6 -0
- package/dist/esm/web.d.ts +46 -0
- package/dist/esm/web.d.ts.map +1 -0
- package/dist/esm/web.js +201 -0
- package/dist/plugin.cjs.js +214 -0
- package/dist/plugin.cjs.js.map +1 -0
- package/dist/plugin.js +217 -0
- package/dist/plugin.js.map +1 -0
- package/ios/Sources/TalkModePlugin/TalkModePlugin.swift +1121 -0
- package/package.json +83 -0
|
@@ -0,0 +1,1121 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import Capacitor
|
|
3
|
+
import AVFoundation
|
|
4
|
+
import Speech
|
|
5
|
+
|
|
6
|
+
// MARK: - TalkModePlugin
|
|
7
|
+
|
|
8
|
+
@objc(TalkModePlugin)
|
|
9
|
+
public class TalkModePlugin: CAPPlugin, CAPBridgedPlugin {
|
|
10
|
+
/// Identifier under which this plugin class is registered.
public let identifier = "TalkModePlugin"
/// Name the plugin is exposed under on the JavaScript side.
public let jsName = "TalkMode"
/// Methods bridged to JavaScript; every one of them returns a promise.
public let pluginMethods: [CAPPluginMethod] = [
    CAPPluginMethod(name: "start", returnType: CAPPluginReturnPromise),
    CAPPluginMethod(name: "stop", returnType: CAPPluginReturnPromise),
    CAPPluginMethod(name: "isEnabled", returnType: CAPPluginReturnPromise),
    CAPPluginMethod(name: "getState", returnType: CAPPluginReturnPromise),
    CAPPluginMethod(name: "updateConfig", returnType: CAPPluginReturnPromise),
    CAPPluginMethod(name: "speak", returnType: CAPPluginReturnPromise),
    CAPPluginMethod(name: "stopSpeaking", returnType: CAPPluginReturnPromise),
    CAPPluginMethod(name: "isSpeaking", returnType: CAPPluginReturnPromise),
    CAPPluginMethod(name: "checkPermissions", returnType: CAPPluginReturnPromise),
    CAPPluginMethod(name: "requestPermissions", returnType: CAPPluginReturnPromise),
]

/// Model id used whenever configuration supplies none (or an empty one).
private static let defaultModelId = "eleven_flash_v2_5"

// MARK: - State

/// True between a successful `start` and the next `stop`.
private var enabled = false
/// Current state name reported by `getState` ("idle", "listening", "processing", "speaking").
private var state = "idle"
/// Human-readable companion to `state`, also reported by `getState`.
private var statusText = "Off"

// MARK: - Speech Recognition

/// Engine whose input node is tapped to feed the recognition request.
private let audioEngine = AVAudioEngine()
/// Recognizer created in `load()` and replaced when an STT language is configured.
private var speechRecognizer: SFSpeechRecognizer?
/// Request currently receiving microphone buffers, if recognition is running.
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
/// Task delivering recognition results, if recognition is running.
private var recognitionTask: SFSpeechRecognitionTask?
/// Polling task that watches for end-of-utterance silence.
private var silenceTask: Task<Void, Never>?
/// Most recent non-final transcript awaiting finalization.
private var lastTranscript = ""
/// Time the last non-empty transcript was received.
private var lastHeard: Date?
/// Seconds of silence after which the pending transcript is finalized.
private var silenceWindow: TimeInterval = 0.7

// MARK: - TTS

/// Synthesizer used for the system TTS path.
private let systemSynthesizer = AVSpeechSynthesizer()
/// Strong reference that keeps the system TTS delegate alive during an utterance.
private var systemSpeechDelegate: SystemSpeechDelegate?
/// True while a `speak` request is being played back.
private var isSpeakingValue = false
/// Whether the current/last utterance went through system TTS.
private var usedSystemTts = false
/// Text of the current/last utterance; used to filter out microphone echo.
private var lastSpokenText: String?
/// Playback offset reported as `interruptedAt` after an interruption.
private var lastInterruptedAtSeconds: Double?

// MARK: - PCM Streaming Playback

/// Engine for the in-flight PCM stream, nil when no stream is playing.
private var pcmEngine: AVAudioEngine?
/// Player node for the in-flight PCM stream, nil when no stream is playing.
private var pcmPlayerNode: AVAudioPlayerNode?
/// Flag polled by the streaming loop to abandon playback early.
/// NOTE(review): presumably set by `stopSpeakingInternal`, which is outside this view — confirm.
private var pcmStopRequested = false
/// Time PCM playback was started.
private var pcmPlaybackStartTime: Date?

// MARK: - MP3 Playback

/// Player for the downloaded (non-PCM) audio path.
private var audioPlayer: AVAudioPlayer?
/// Time downloaded-audio playback was started.
private var mp3PlaybackStartTime: Date?

// MARK: - Active Tasks

/// Task running the current `speak` request; cancelled when a new one arrives.
private var speakTask: Task<Void, Error>?

// MARK: - Config

/// ElevenLabs API key from `tts.apiKey`.
private var apiKey: String?
/// Voice configured through `tts.voiceId`.
private var defaultVoiceId: String?
/// Voice in effect; follows `defaultVoiceId` unless a directive override is active.
private var currentVoiceId: String?
/// Model configured through `tts.modelId`.
private var defaultModelId: String? = TalkModePlugin.defaultModelId
/// Model in effect; follows `defaultModelId` unless a directive override is active.
private var currentModelId: String? = TalkModePlugin.defaultModelId
/// Output format used when a directive does not name one.
private var defaultOutputFormat: String? = "pcm_24000"
/// Lower-cased alias name -> voice id.
private var voiceAliases: [String: String] = [:]
/// Whether detected user speech may interrupt TTS playback.
private var interruptOnSpeech = true
/// Session key taken from configuration.
/// NOTE(review): no reader is visible in this part of the file — confirm where it is consumed.
private var sessionKey = "main"
/// Set once a directive has pinned the voice; config updates then stop changing `currentVoiceId`.
private var voiceOverrideActive = false
/// Set once a directive has pinned the model; config updates then stop changing `currentModelId`.
private var modelOverrideActive = false
|
83
|
+
// MARK: - Lifecycle
|
|
84
|
+
|
|
85
|
+
/// Plugin lifecycle hook: creates the initial speech recognizer with the
/// default (locale-less) initializer. `applyConfig` may replace it later.
public override func load() {
    let recognizer = SFSpeechRecognizer()
    speechRecognizer = recognizer
}
|
|
88
|
+
|
|
89
|
+
// MARK: - Plugin Methods
|
|
90
|
+
|
|
91
|
+
/// Turns talk mode on.
///
/// Applies the optional `config` object, verifies that a speech recognizer is
/// available, asks for microphone and then speech-recognition permission, and
/// finally configures the audio session and begins listening.
///
/// The call is always resolved (never rejected) with `started` and, on
/// failure, an `error` message.
@objc func start(_ call: CAPPluginCall) {
    // Configuration goes first so the STT language is in place before the
    // availability check below.
    if let config = call.getObject("config") {
        applyConfig(config)
    }

    let recognizerReady = speechRecognizer?.isAvailable ?? false
    if !recognizerReady {
        call.resolve(["started": false, "error": "Speech recognition not available"])
        return
    }

    Task { @MainActor in
        let microphoneGranted = await self.requestMicrophonePermission()
        if !microphoneGranted {
            call.resolve(["started": false, "error": "Microphone permission denied"])
            return
        }

        // Speech permission is only requested once the microphone is granted.
        let speechGranted = await self.requestSpeechPermission()
        if !speechGranted {
            call.resolve(["started": false, "error": "Speech recognition permission denied"])
            return
        }

        do {
            try self.configureAudioSession()
            try self.startRecognition()
        } catch {
            let message = error.localizedDescription
            self.emitError(code: "start_failed", message: message, recoverable: true)
            call.resolve(["started": false, "error": message])
            return
        }

        self.enabled = true
        self.setState("listening", "Listening")
        self.startSilenceMonitor()
        call.resolve(["started": true])
    }
}
|
|
128
|
+
|
|
129
|
+
/// Turns talk mode off: stops recognition and playback, cancels the silence
/// monitor, clears pending transcript state, reports the "idle" state and
/// releases the audio session. Always resolves with no payload.
@objc func stop(_ call: CAPPluginCall) {
    enabled = false

    stopRecognition()
    stopSpeakingInternal()

    if let monitor = silenceTask {
        monitor.cancel()
    }
    silenceTask = nil

    lastTranscript = ""
    lastHeard = nil
    // Cleared after stopping playback so nothing from this session lingers.
    lastInterruptedAtSeconds = nil
    setState("idle", "Off")

    // Deactivation is best effort; a failure here is deliberately ignored.
    try? AVAudioSession.sharedInstance().setActive(false, options: [.notifyOthersOnDeactivation])

    call.resolve()
}
|
|
148
|
+
|
|
149
|
+
/// Resolves with `enabled`: whether talk mode is currently switched on.
@objc func isEnabled(_ call: CAPPluginCall) {
    let isOn = enabled
    call.resolve(["enabled": isOn])
}
|
|
152
|
+
|
|
153
|
+
/// Resolves with the current `state` name and its `statusText` label.
@objc func getState(_ call: CAPPluginCall) {
    let currentState = state
    let label = statusText
    call.resolve(["state": currentState, "statusText": label])
}
|
|
156
|
+
|
|
157
|
+
/// Applies the `config` object if the call carries one, then resolves.
/// A missing `config` is not an error; the call still resolves.
@objc func updateConfig(_ call: CAPPluginCall) {
    if let config = call.getObject("config") {
        applyConfig(config)
    }
    call.resolve()
}
|
|
165
|
+
|
|
166
|
+
/// Speaks `text`, replacing any utterance that is already in flight.
///
/// Reads `text`, the optional `useSystemTts` flag and the optional `directive`
/// object from the call. Blank text resolves immediately as completed. The
/// call is otherwise resolved by `speakInternal` once playback ends.
@objc func speak(_ call: CAPPluginCall) {
    let text = (call.getString("text") ?? "").trimmingCharacters(in: .whitespacesAndNewlines)
    if text.isEmpty {
        call.resolve(["completed": true, "interrupted": false, "usedSystemTts": false])
        return
    }

    let forceSystem = call.getBool("useSystemTts") ?? false
    let directive = call.getObject("directive")

    // Only one speak task is tracked at a time; the previous one is cancelled.
    speakTask?.cancel()
    speakTask = Task { @MainActor in
        await self.speakInternal(text: text, forceSystemTts: forceSystem, directive: directive, call: call)
    }
}
|
|
181
|
+
|
|
182
|
+
/// Stops playback and resolves with `interruptedAt` when
/// `stopSpeakingInternal` reports a position; otherwise with an empty object.
@objc func stopSpeaking(_ call: CAPPluginCall) {
    var payload: JSObject = [:]
    if let position = stopSpeakingInternal() {
        payload["interruptedAt"] = position
    }
    call.resolve(payload)
}
|
|
190
|
+
|
|
191
|
+
/// Resolves with `speaking`: whether a `speak` request is currently playing.
@objc func isSpeaking(_ call: CAPPluginCall) {
    let speakingNow = isSpeakingValue
    call.resolve(["speaking": speakingNow])
}
|
|
194
|
+
|
|
195
|
+
/// Resolves with the current permission snapshot from
/// `buildPermissionResult()` without prompting the user.
@objc public override func checkPermissions(_ call: CAPPluginCall) {
    let snapshot = buildPermissionResult()
    call.resolve(snapshot)
}
|
|
198
|
+
|
|
199
|
+
/// Requests microphone and then speech-recognition permission, and resolves
/// with the resulting permission snapshot whatever the user chose.
@objc public override func requestPermissions(_ call: CAPPluginCall) {
    Task { @MainActor in
        // The individual outcomes are not needed; the snapshot below reports them.
        _ = await self.requestMicrophonePermission()
        _ = await self.requestSpeechPermission()

        let snapshot = self.buildPermissionResult()
        call.resolve(snapshot)
    }
}
|
|
206
|
+
|
|
207
|
+
// MARK: - Config Application
|
|
208
|
+
|
|
209
|
+
/// Merges a JS configuration object into the plugin's settings.
///
/// Only keys that are present and of the expected type are applied; all
/// others leave the current value untouched. Recognized keys:
/// - `tts`: `apiKey`, `voiceId`, `modelId`, `outputFormat`,
///   `interruptOnSpeech`, `voiceAliases`
/// - `stt`: `language`
/// - top level: `silenceWindowMs`, `interruptOnSpeech`, `sessionKey`
private func applyConfig(_ config: JSObject) {
    let trimmed: (String) -> String = { $0.trimmingCharacters(in: .whitespacesAndNewlines) }

    if let tts = config["tts"] as? [String: Any] {
        if let key = tts["apiKey"] as? String {
            apiKey = trimmed(key)
        }

        if let voice = tts["voiceId"] as? String {
            defaultVoiceId = trimmed(voice)
            // A directive-pinned voice outlives config changes.
            if !voiceOverrideActive {
                currentVoiceId = defaultVoiceId
            }
        }

        if let model = tts["modelId"] as? String {
            let candidate = trimmed(model)
            // A blank model id falls back to the built-in default.
            defaultModelId = candidate.isEmpty ? Self.defaultModelId : candidate
            if !modelOverrideActive {
                currentModelId = defaultModelId
            }
        }

        if let format = tts["outputFormat"] as? String {
            defaultOutputFormat = trimmed(format)
        }

        if let interrupt = tts["interruptOnSpeech"] as? Bool {
            interruptOnSpeech = interrupt
        }

        if let aliases = tts["voiceAliases"] as? [String: String] {
            // Alias names are matched case-insensitively, so store them lower-cased;
            // entries that are blank on either side are dropped.
            voiceAliases = aliases.reduce(into: [String: String]()) { table, entry in
                let alias = trimmed(entry.key).lowercased()
                let target = trimmed(entry.value)
                if !alias.isEmpty, !target.isEmpty {
                    table[alias] = target
                }
            }
        }
    }

    if let stt = config["stt"] as? [String: Any],
       let lang = stt["language"] as? String,
       !lang.isEmpty {
        speechRecognizer = SFSpeechRecognizer(locale: Locale(identifier: lang))
    }

    if let silenceMs = config["silenceWindowMs"] as? Int, silenceMs > 0 {
        // Milliseconds in, seconds stored.
        silenceWindow = TimeInterval(silenceMs) / 1000.0
    }

    // The top-level flag is applied after `tts.interruptOnSpeech`, so it wins when both are given.
    if let interrupt = config["interruptOnSpeech"] as? Bool {
        interruptOnSpeech = interrupt
    }

    if let key = config["sessionKey"] as? String {
        sessionKey = key
    }
}
|
|
259
|
+
|
|
260
|
+
// MARK: - Speech Recognition
|
|
261
|
+
|
|
262
|
+
/// Starts (or restarts) streaming speech recognition from the microphone.
///
/// Any recognition already running is stopped first. A tap on the audio
/// engine's input node feeds buffers into a new recognition request, and
/// results are forwarded to `handleTranscript` on the main queue.
///
/// If the engine fails to start, the tap and the request are rolled back so
/// that a failed start leaves no half-configured recognition state behind.
///
/// - Throws: An `NSError` in the "TalkMode" domain when running on the
///   simulator (code 1), when the recognizer is unavailable (code 2) or when
///   the input format is invalid (code 3); or whatever `AVAudioEngine.start()`
///   throws.
private func startRecognition() throws {
    #if targetEnvironment(simulator)
    throw NSError(domain: "TalkMode", code: 1, userInfo: [
        NSLocalizedDescriptionKey: "Speech recognition not supported on simulator"
    ])
    #endif

    stopRecognition()

    guard let recognizer = speechRecognizer, recognizer.isAvailable else {
        throw NSError(domain: "TalkMode", code: 2, userInfo: [
            NSLocalizedDescriptionKey: "Speech recognizer unavailable"
        ])
    }

    let input = audioEngine.inputNode
    let format = input.outputFormat(forBus: 0)

    guard format.sampleRate > 0, format.channelCount > 0 else {
        throw NSError(domain: "TalkMode", code: 3, userInfo: [
            NSLocalizedDescriptionKey: "Invalid audio input format"
        ])
    }

    // The request is published only after the format check, so an invalid
    // format no longer leaves a dangling request behind.
    let request = SFSpeechAudioBufferRecognitionRequest()
    request.shouldReportPartialResults = true
    recognitionRequest = request

    // Remove any stale tap before installing a new one on the same bus.
    input.removeTap(onBus: 0)
    input.installTap(onBus: 0, bufferSize: 2048, format: format) { buffer, _ in
        request.append(buffer)
    }

    audioEngine.prepare()
    do {
        try audioEngine.start()
    } catch {
        // Undo the partial setup; the caller only sees the original error.
        input.removeTap(onBus: 0)
        recognitionRequest = nil
        throw error
    }

    recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
        guard let self else { return }

        if let error {
            // Errors while TTS is playing are not logged.
            if !self.isSpeakingValue {
                print("[TalkMode] Recognition error: \(error.localizedDescription)")
            }
            return
        }

        guard let result else { return }
        let transcript = result.bestTranscription.formattedString

        DispatchQueue.main.async {
            self.handleTranscript(transcript: transcript, isFinal: result.isFinal)
        }
    }
}
|
|
317
|
+
|
|
318
|
+
/// Tears down speech recognition: cancels the task, ends and drops the
/// request, removes the input tap and stops the audio engine.
/// Safe to call when recognition is not running.
private func stopRecognition() {
    if let task = recognitionTask {
        task.cancel()
    }
    recognitionTask = nil

    if let request = recognitionRequest {
        request.endAudio()
    }
    recognitionRequest = nil

    let inputNode = audioEngine.inputNode
    inputNode.removeTap(onBus: 0)
    audioEngine.stop()
}
|
|
326
|
+
|
|
327
|
+
/// Processes one recognition result.
///
/// While TTS is playing with interruption enabled, the transcript is used
/// only to decide whether to stop playback and is not forwarded. Otherwise,
/// when talk mode is enabled, the pending transcript state is updated and a
/// `transcript` event is emitted.
///
/// - Parameters:
///   - transcript: Recognized text; surrounding whitespace is removed.
///   - isFinal: Whether the recognizer marked this result as final.
private func handleTranscript(transcript: String, isFinal: Bool) {
    let text = transcript.trimmingCharacters(in: .whitespacesAndNewlines)

    // During playback the microphone is only an interrupt trigger.
    if isSpeakingValue && interruptOnSpeech {
        if shouldInterrupt(with: text) {
            stopSpeakingInternal()
        }
        return
    }

    if !enabled { return }

    // Only non-empty speech refreshes the silence timer.
    if !text.isEmpty {
        lastHeard = Date()
    }
    // A final result replaces the pending transcript even when it is empty.
    if isFinal || !text.isEmpty {
        lastTranscript = text
    }

    let payload: [String: Any] = [
        "transcript": text,
        "isFinal": isFinal
    ]
    notifyListeners("transcript", data: payload)
}
|
|
354
|
+
|
|
355
|
+
/// Decides whether recognized speech should cut off the current TTS playback.
///
/// Transcripts shorter than three characters never interrupt. A transcript
/// that appears (case-insensitively) inside the text being spoken is treated
/// as the microphone picking up our own output and is ignored as well.
///
/// - Parameter transcript: Text recognized while playback is in progress.
/// - Returns: `true` when playback should be interrupted.
private func shouldInterrupt(with transcript: String) -> Bool {
    let candidate = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
    if candidate.count < 3 {
        return false
    }

    // With nothing on record as being spoken there is no echo to filter.
    guard let spoken = lastSpokenText else {
        return true
    }

    let isEcho = spoken.lowercased().contains(candidate.lowercased())
    return !isEcho
}
|
|
370
|
+
|
|
371
|
+
// MARK: - Silence Detection
|
|
372
|
+
|
|
373
|
+
/// Starts the background task that polls for end-of-utterance silence.
///
/// Any previous monitor is cancelled first. The new task wakes every 200 ms
/// and runs `checkSilence()` on the main actor. It ends as soon as it is
/// cancelled, talk mode is disabled, or the plugin is deallocated.
///
/// Cancellation must end the loop explicitly: `Task.sleep` throws at once in
/// a cancelled task, so ignoring that error would leave a replaced monitor
/// spinning without any delay for as long as talk mode stays enabled.
private func startSilenceMonitor() {
    silenceTask?.cancel()
    silenceTask = Task { [weak self] in
        while !Task.isCancelled {
            do {
                try await Task.sleep(nanoseconds: 200_000_000) // 200ms poll
            } catch {
                // Sleep only fails on cancellation: this monitor was replaced or stopped.
                return
            }

            // `self` is re-captured weakly for the nested closure; the outer
            // capture list does not carry into `MainActor.run`, and Swift 6
            // strict concurrency rejects referencing the outer captured var.
            let keepPolling = await MainActor.run { [weak self] () -> Bool in
                guard let self, self.enabled else { return false }
                self.checkSilence()
                return true
            }
            if !keepPolling { return }
        }
    }
}
|
|
389
|
+
|
|
390
|
+
/// Finalizes the pending transcript once the user has been quiet long enough.
///
/// Does nothing unless talk mode is enabled, nothing is being spoken, the
/// state is "listening", and a non-empty transcript with a last-heard
/// timestamp is pending. When the silence reaches `silenceWindow` seconds,
/// the transcript is handed to `finalizeTranscript`.
private func checkSilence() {
    let isListening = enabled && !isSpeakingValue && state == "listening"
    if !isListening { return }

    let pending = lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
    guard !pending.isEmpty, let heardAt = lastHeard else { return }

    let quietFor = Date().timeIntervalSince(heardAt)
    if quietFor >= silenceWindow {
        finalizeTranscript(pending)
    }
}
|
|
403
|
+
|
|
404
|
+
/// Publishes `transcript` as final and moves to the "processing" state.
///
/// Pending transcript state is cleared and recognition is stopped before the
/// `transcript` event (with `isFinal` set to true) is emitted for the JS layer.
///
/// - Parameter transcript: The completed utterance to emit.
private func finalizeTranscript(_ transcript: String) {
    // Reset first so the silence monitor cannot finalize the same text twice.
    lastHeard = nil
    lastTranscript = ""

    setState("processing", "Processing")
    stopRecognition()

    let payload: [String: Any] = [
        "transcript": transcript,
        "isFinal": true
    ]
    notifyListeners("transcript", data: payload)
}
|
|
417
|
+
|
|
418
|
+
// MARK: - TTS Orchestration
|
|
419
|
+
|
|
420
|
+
/// Speaks `text` and resolves `call` with the outcome.
///
/// Voice, model and output format come from the directive first, then from
/// the active overrides, then from configuration. ElevenLabs is used when it
/// is not disabled and both an API key and a voice id are available;
/// otherwise, or when ElevenLabs fails, system TTS is used.
///
/// A failure caused by a deliberate stop (`pcmStopRequested`) or by
/// cancellation of the surrounding task is reported as an interruption
/// instead of triggering the system TTS fallback, so interrupted text is not
/// spoken a second time.
///
/// - Parameters:
///   - text: Non-empty text to speak.
///   - forceSystemTts: Skip ElevenLabs even if it is configured.
///   - directive: Optional per-utterance settings. `voiceId` and `modelId`
///     become sticky overrides unless `once` is true.
///   - call: Plugin call resolved with `completed`, `interrupted`,
///     `usedSystemTts` and, where applicable, `interruptedAt` or `error`.
private func speakInternal(
    text: String,
    forceSystemTts: Bool,
    directive: [String: Any]?,
    call: CAPPluginCall
) async {
    isSpeakingValue = true
    usedSystemTts = false
    pcmStopRequested = false
    lastSpokenText = text
    setState("speaking", "Speaking")

    // Resolve voice/model from directive, with override persistence
    let requestedVoice = (directive?["voiceId"] as? String)?
        .trimmingCharacters(in: .whitespacesAndNewlines)
    let resolvedVoice = resolveVoiceAlias(requestedVoice)
    let isOnce = directive?["once"] as? Bool ?? false

    // A blank model id means "not specified", both for the sticky override
    // and for this utterance, matching how `applyConfig` treats blank ids.
    let requestedModel = (directive?["modelId"] as? String)
        .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
        .flatMap { $0.isEmpty ? nil : $0 }

    if let voice = resolvedVoice, !isOnce {
        currentVoiceId = voice
        voiceOverrideActive = true
    }
    if let model = requestedModel, !isOnce {
        currentModelId = model
        modelOverrideActive = true
    }

    let effectiveVoiceId = resolvedVoice ?? currentVoiceId ?? defaultVoiceId
    let effectiveModelId = requestedModel
        ?? currentModelId ?? defaultModelId ?? Self.defaultModelId
    let rawFormat = (directive?["outputFormat"] as? String)
        ?? defaultOutputFormat ?? "pcm_24000"
    let effectiveFormat = Self.validatedOutputFormat(rawFormat) ?? "pcm_24000"
    let effectiveApiKey = apiKey?.trimmingCharacters(in: .whitespacesAndNewlines)

    let canUseElevenLabs = !forceSystemTts
        && !(effectiveApiKey ?? "").isEmpty
        && !(effectiveVoiceId ?? "").isEmpty

    notifyListeners("speaking", data: [
        "text": text,
        "isSystemTts": !canUseElevenLabs
    ])

    // Enable STT during playback for interrupt detection
    if interruptOnSpeech {
        do { try startRecognition() } catch {
            print("[TalkMode] Recognition for interrupt detection failed: \(error)")
        }
    } else {
        stopRecognition()
    }

    var interrupted = false
    let language = Self.validatedLanguage(directive?["language"] as? String)

    do {
        if canUseElevenLabs {
            do {
                try await streamElevenLabsTts(
                    text: text,
                    voiceId: effectiveVoiceId ?? "",
                    apiKey: effectiveApiKey ?? "",
                    modelId: effectiveModelId,
                    outputFormat: effectiveFormat,
                    directive: directive
                )
                interrupted = pcmStopRequested
            } catch {
                if pcmStopRequested || Task.isCancelled {
                    // Playback was stopped on purpose; falling back would
                    // re-speak text the user or a newer request cut off.
                    interrupted = true
                } else {
                    // Fallback to system TTS on ElevenLabs failure
                    print("[TalkMode] ElevenLabs failed, falling back to system TTS: \(error)")
                    emitError(
                        code: "elevenlabs_failed",
                        message: error.localizedDescription,
                        recoverable: true
                    )
                    try await speakWithSystemTts(text: text, language: language)
                }
            }
        } else {
            try await speakWithSystemTts(text: text, language: language)
        }
    } catch {
        emitError(code: "tts_failed", message: error.localizedDescription, recoverable: true)
        call.resolve([
            "completed": false,
            "interrupted": false,
            "usedSystemTts": usedSystemTts,
            "error": error.localizedDescription
        ])
        finishSpeaking()
        return
    }

    var result: JSObject = [
        "completed": !interrupted,
        "interrupted": interrupted,
        "usedSystemTts": usedSystemTts
    ]
    if interrupted, let at = lastInterruptedAtSeconds {
        result["interruptedAt"] = at
    }
    call.resolve(result)

    notifyListeners("speakComplete", data: [
        "completed": !interrupted
    ])

    finishSpeaking()
}
|
|
529
|
+
|
|
530
|
+
/// Resets playback flags after an utterance and returns to listening when
/// talk mode is still enabled; otherwise reports the "idle" state.
///
/// The recognition used for interrupt detection is always stopped first. A
/// failure to restart recognition is logged and emitted as a recoverable
/// `recognition_restart_failed` error.
private func finishSpeaking() {
    isSpeakingValue = false
    pcmStopRequested = false
    stopRecognition()

    guard enabled else {
        setState("idle", "Off")
        return
    }

    setState("listening", "Listening")
    do {
        try startRecognition()
        startSilenceMonitor()
    } catch {
        print("[TalkMode] Failed to restart recognition: \(error)")
        emitError(
            code: "recognition_restart_failed",
            message: error.localizedDescription,
            recoverable: true
        )
    }
}
|
|
553
|
+
|
|
554
|
+
// MARK: - ElevenLabs Streaming TTS
|
|
555
|
+
|
|
556
|
+
/// Requests speech for `text` from the ElevenLabs streaming endpoint and
/// plays it.
///
/// `pcm_*` output formats with a parsable sample rate are streamed through
/// `streamPCMPlayback`. If that fails without a stop having been requested,
/// the request is retried once as `mp3_44100_128` via `downloadAndPlayAudio`.
/// All other formats go straight to `downloadAndPlayAudio`.
///
/// - Parameters:
///   - text: Text to synthesize.
///   - voiceId: ElevenLabs voice id. It is percent-encoded as a single path
///     segment, so reserved characters cannot change the request path.
///   - apiKey: Value sent in the `xi-api-key` header.
///   - modelId: Value sent as `model_id`.
///   - outputFormat: Value sent as `output_format`.
///   - directive: Optional per-utterance settings (`speed`, `rateWpm`,
///     `stability`, `similarity`, `style`, `speakerBoost`, `seed`,
///     `normalize`, `language`, `latencyTier`), each validated before use.
/// - Throws: An `NSError` (domain "TalkMode", code 1) for an unusable URL, a
///   JSON serialization error, or whatever the playback helpers throw.
private func streamElevenLabsTts(
    text: String,
    voiceId: String,
    apiKey: String,
    modelId: String,
    outputFormat: String,
    directive: [String: Any]?
) async throws {
    // `urlPathAllowed` permits "/", which would let a voice id span several
    // path segments; remove it so the id always stays a single segment.
    var segmentAllowed = CharacterSet.urlPathAllowed
    segmentAllowed.remove(charactersIn: "/")

    guard let encodedVoiceId = voiceId.addingPercentEncoding(withAllowedCharacters: segmentAllowed),
          !encodedVoiceId.isEmpty,
          let url = URL(string: "https://api.elevenlabs.io/v1/text-to-speech/\(encodedVoiceId)/stream") else {
        throw NSError(domain: "TalkMode", code: 1, userInfo: [
            NSLocalizedDescriptionKey: "Invalid ElevenLabs URL"
        ])
    }

    var request = URLRequest(url: url)
    request.httpMethod = "POST"
    request.setValue("application/json", forHTTPHeaderField: "Content-Type")
    request.setValue(apiKey, forHTTPHeaderField: "xi-api-key")

    // Build voice settings from directive values
    let speed = Self.resolveSpeed(
        speed: directive?["speed"] as? Double,
        rateWpm: directive?["rateWpm"] as? Int
    )
    let stability = Self.validatedUnit(directive?["stability"] as? Double) ?? 0.5
    let similarity = Self.validatedUnit(directive?["similarity"] as? Double) ?? 0.75

    var voiceSettings: [String: Any] = [
        "stability": stability,
        "similarity_boost": similarity
    ]
    if let speed { voiceSettings["speed"] = speed }
    if let style = Self.validatedUnit(directive?["style"] as? Double) {
        voiceSettings["style"] = style
    }
    if let boost = directive?["speakerBoost"] as? Bool {
        voiceSettings["use_speaker_boost"] = boost
    }

    var body: [String: Any] = [
        "text": text,
        "model_id": modelId,
        "output_format": outputFormat,
        "voice_settings": voiceSettings
    ]
    // Optional fields are only sent when the directive supplies a valid value.
    if let seed = Self.validatedSeed(directive?["seed"] as? Int) {
        body["seed"] = seed
    }
    if let normalize = Self.validatedNormalize(directive?["normalize"] as? String) {
        body["apply_text_normalization"] = normalize
    }
    if let language = Self.validatedLanguage(directive?["language"] as? String) {
        body["language_code"] = language
    }
    if let tier = Self.validatedLatencyTier(directive?["latencyTier"] as? Int) {
        body["optimize_streaming_latency"] = tier
    }

    request.httpBody = try JSONSerialization.data(withJSONObject: body)

    let isPCM = outputFormat.hasPrefix("pcm_")
    let sampleRate = Self.pcmSampleRate(from: outputFormat)

    if isPCM, let sampleRate {
        do {
            try await streamPCMPlayback(request: request, sampleRate: sampleRate)
        } catch {
            // PCM playback failed; retry as MP3 as a fallback.
            // A requested stop is not a failure, so it ends here quietly.
            guard !pcmStopRequested else { return }
            print("[TalkMode] PCM playback failed, retrying as MP3: \(error)")

            let mp3Format = "mp3_44100_128"
            var retryBody = body
            retryBody["output_format"] = mp3Format

            var retryRequest = request
            retryRequest.httpBody = try JSONSerialization.data(withJSONObject: retryBody)
            try await downloadAndPlayAudio(request: retryRequest)
        }
    } else {
        try await downloadAndPlayAudio(request: request)
    }
}
|
|
640
|
+
|
|
641
|
+
/// Streams 16-bit mono PCM from the network into an `AVAudioPlayerNode`.
///
/// Bytes are gathered into chunks of about half a second and scheduled on
/// the player as they arrive. The function returns once every scheduled
/// chunk has finished playing, or early when `pcmStopRequested` becomes true.
/// The engine is stopped and the `pcmEngine` / `pcmPlayerNode` references
/// are cleared on every exit path after the engine has started.
///
/// - Parameters:
///   - request: Fully prepared ElevenLabs request.
///   - sampleRate: Sample rate of the PCM stream in Hz.
/// - Throws: An `NSError` in the "TalkMode" domain when the audio format
///   cannot be created (code 3), the response is not HTTP (code 2), or the
///   status is not 200 (code = HTTP status); plus engine, network and
///   chunk-scheduling errors.
private func streamPCMPlayback(request: URLRequest, sampleRate: Double) async throws {
    let engine = AVAudioEngine()
    let playerNode = AVAudioPlayerNode()

    // Throwing instead of force-unwrapping lets the caller fall back to MP3
    // when the sample rate does not yield a usable format.
    guard let format = AVAudioFormat(
        commonFormat: .pcmFormatInt16,
        sampleRate: sampleRate,
        channels: 1,
        interleaved: true
    ) else {
        throw NSError(domain: "TalkMode", code: 3, userInfo: [
            NSLocalizedDescriptionKey: "Failed to create PCM format"
        ])
    }

    engine.attach(playerNode)
    engine.connect(playerNode, to: engine.mainMixerNode, format: format)
    try engine.start()

    pcmEngine = engine
    pcmPlayerNode = playerNode
    pcmPlaybackStartTime = Date()
    playerNode.play()

    defer {
        engine.stop()
        pcmEngine = nil
        pcmPlayerNode = nil
    }

    let (bytes, response) = try await URLSession.shared.bytes(for: request)

    guard let httpResponse = response as? HTTPURLResponse else {
        throw NSError(domain: "TalkMode", code: 2, userInfo: [
            NSLocalizedDescriptionKey: "Invalid HTTP response from ElevenLabs"
        ])
    }

    guard httpResponse.statusCode == 200 else {
        // Read a bit of the error body for diagnostics
        var errorData = Data()
        for try await byte in bytes {
            errorData.append(byte)
            if errorData.count > 2048 { break }
        }
        let errorMsg = String(data: errorData, encoding: .utf8) ?? "status \(httpResponse.statusCode)"
        throw NSError(domain: "TalkMode", code: httpResponse.statusCode, userInfo: [
            NSLocalizedDescriptionKey: "ElevenLabs API error: \(errorMsg)"
        ])
    }

    // 16-bit mono PCM is sampleRate * 2 bytes per second, so `sampleRate`
    // bytes is roughly 0.5s of audio: smooth playback without much latency.
    let chunkSize = Int(sampleRate)
    var buffer = Data()
    buffer.reserveCapacity(chunkSize)
    var scheduledCount = 0
    let completionGroup = DispatchGroup()

    for try await byte in bytes {
        if pcmStopRequested { break }

        buffer.append(byte)

        if buffer.count >= chunkSize {
            try scheduleChunk(buffer, on: playerNode, format: format, group: completionGroup)
            scheduledCount += 1
            // `scheduleChunk` copies the bytes, so the storage can be reused.
            buffer.removeAll(keepingCapacity: true)
        }
    }

    // Schedule any remaining data
    if !buffer.isEmpty, !pcmStopRequested {
        try scheduleChunk(buffer, on: playerNode, format: format, group: completionGroup)
        scheduledCount += 1
    }

    // Wait for all scheduled buffers to finish playback
    if scheduledCount > 0, !pcmStopRequested {
        await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
            completionGroup.notify(queue: .main) {
                continuation.resume()
            }
        }
    }
}
|
|
725
|
+
|
|
726
|
+
/// Copies raw 16-bit PCM bytes into an `AVAudioPCMBuffer` and schedules it
/// on `playerNode`.
///
/// Only whole samples are copied: with an odd byte count the trailing byte
/// is dropped, so the copy never writes past the capacity allocated for the
/// buffer. Data shorter than one sample is ignored.
///
/// - Parameters:
///   - data: Raw little-endian 16-bit mono samples.
///   - playerNode: Node that will play the buffer.
///   - format: Format the buffer is created with; expected to be 16-bit integer PCM.
///   - group: Entered before scheduling and left from the buffer's completion handler.
/// - Throws: An `NSError` (domain "TalkMode", code 3) when the buffer cannot
///   be created or exposes no 16-bit channel data.
private func scheduleChunk(
    _ data: Data,
    on playerNode: AVAudioPlayerNode,
    format: AVAudioFormat,
    group: DispatchGroup
) throws {
    let bytesPerFrame = MemoryLayout<Int16>.size // 16-bit = 2 bytes per sample
    let frameCount = UInt32(data.count / bytesPerFrame)
    guard frameCount > 0 else { return }

    guard let pcmBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount),
          let destination = pcmBuffer.int16ChannelData?[0] else {
        throw NSError(domain: "TalkMode", code: 3, userInfo: [
            NSLocalizedDescriptionKey: "Failed to create PCM buffer"
        ])
    }

    pcmBuffer.frameLength = frameCount

    // Copy whole frames only; `data.count` may be one byte larger than this.
    let byteCount = Int(frameCount) * bytesPerFrame
    data.withUnsafeBytes { bytes in
        guard let baseAddress = bytes.baseAddress else { return }
        memcpy(destination, baseAddress, byteCount)
    }

    group.enter()
    playerNode.scheduleBuffer(pcmBuffer) {
        group.leave()
    }
}
|
|
753
|
+
|
|
754
|
+
/// Downloads a complete audio response (MP3 etc.) and plays it with
/// `AVAudioPlayer`, returning when the player's delegate reports completion.
///
/// `audioPlayer` and `mp3PlaybackStartTime` are cleared on every exit path,
/// including when the player cannot be created. If the player refuses to
/// start, an error is thrown instead of waiting for a delegate callback that
/// would never arrive.
///
/// - Parameter request: Fully prepared ElevenLabs request.
/// - Throws: Network errors, an `NSError` (domain "TalkMode", code 2) for a
///   non-200 or non-HTTP response, the `AVAudioPlayer` initializer's error,
///   or an `NSError` (domain "TalkMode", code 4) when playback fails to start.
private func downloadAndPlayAudio(request: URLRequest) async throws {
    let (data, response) = try await URLSession.shared.data(for: request)

    guard let httpResponse = response as? HTTPURLResponse, httpResponse.statusCode == 200 else {
        // Include at most the first 2 KB of the body in the message.
        let msg = String(data: data.prefix(2048), encoding: .utf8) ?? "Unknown error"
        throw NSError(domain: "TalkMode", code: 2, userInfo: [
            NSLocalizedDescriptionKey: "ElevenLabs API error: \(msg)"
        ])
    }

    mp3PlaybackStartTime = Date()
    defer {
        audioPlayer = nil
        mp3PlaybackStartTime = nil
    }

    let player = try AVAudioPlayer(data: data)
    audioPlayer = player
    player.prepareToPlay()

    try await withCheckedThrowingContinuation { (continuation: CheckedContinuation<Void, Error>) in
        let delegate = AudioPlayerDelegate {
            continuation.resume()
        }
        // `AVAudioPlayer.delegate` is weak, so attach the delegate to the
        // player to keep it alive; the key is never used to read it back.
        objc_setAssociatedObject(player, "delegate", delegate, .OBJC_ASSOCIATION_RETAIN)
        player.delegate = delegate

        if !player.play() {
            // No playback means no delegate callback: detach the delegate and
            // resume here so the continuation is resumed exactly once.
            player.delegate = nil
            continuation.resume(throwing: NSError(domain: "TalkMode", code: 4, userInfo: [
                NSLocalizedDescriptionKey: "Audio playback failed to start"
            ]))
        }
    }
}
|
|
784
|
+
|
|
785
|
+
// MARK: - System TTS
|
|
786
|
+
|
|
787
|
+
/// Speak `text` through `systemSynthesizer` and suspend until the speech
/// delegate finishes, the watchdog fires, or the task is cancelled.
///
/// - Parameters:
///   - text: Text to speak.
///   - language: Preferred voice language; the current locale's language
///     (or "en") is used when it is nil or yields no voice.
/// - Throws: The error passed to the delegate's `finish(error:)`, including the
///   watchdog timeout (code 408) and task cancellation (code -999).
private func speakWithSystemTts(text: String, language: String? = nil) async throws {
    usedSystemTts = true
    setState("speaking", "Speaking (System)")

    let utterance = AVSpeechUtterance(string: text)
    let requestedVoice = language.flatMap { AVSpeechSynthesisVoice(language: $0) }
    utterance.voice = requestedVoice
        ?? AVSpeechSynthesisVoice(language: Locale.current.languageCode ?? "en")

    // Watchdog budget: 0.08s per character, clamped to 3...180 seconds.
    let watchdogSeconds = min(180.0, max(3.0, Double(text.count) * 0.08))
    let watchdogNanoseconds = UInt64(watchdogSeconds * 1_000_000_000)

    try await withTaskCancellationHandler {
        try await withCheckedThrowingContinuation { (pending: CheckedContinuation<Void, Error>) in
            let speechDelegate = SystemSpeechDelegate(continuation: pending)
            self.systemSpeechDelegate = speechDelegate // strong reference keeps it alive
            self.systemSynthesizer.delegate = speechDelegate
            self.systemSynthesizer.speak(utterance)

            // Force-finish with a timeout error if speech outlives the budget.
            speechDelegate.watchdog = Task { @MainActor in
                try? await Task.sleep(nanoseconds: watchdogNanoseconds)
                if speechDelegate.isFinished { return }
                self.systemSynthesizer.stopSpeaking(at: .immediate)
                let timeoutError = NSError(domain: "TalkMode", code: 408, userInfo: [
                    NSLocalizedDescriptionKey: "System TTS timed out after \(Int(watchdogSeconds))s"
                ])
                speechDelegate.finish(error: timeoutError)
            }
        }
    } onCancel: {
        Task { @MainActor in
            self.systemSynthesizer.stopSpeaking(at: .immediate)
            let cancellationError = NSError(domain: "TalkMode", code: -999, userInfo: [
                NSLocalizedDescriptionKey: "System TTS cancelled"
            ])
            self.systemSpeechDelegate?.finish(error: cancellationError)
        }
    }
}
|
|
830
|
+
|
|
831
|
+
// MARK: - Stop Speaking
|
|
832
|
+
|
|
833
|
+
/// Stop all TTS playback. Returns the interrupted-at time in seconds, if available.
///
/// Does nothing and returns nil when `isSpeakingValue` is false. Otherwise it
/// records the elapsed time since the PCM (or, failing that, MP3) start time in
/// `lastInterruptedAtSeconds`, tears down the PCM engine, the MP3 player and the
/// system synthesizer, cancels `speakTask`, and clears `isSpeakingValue`.
@discardableResult
private func stopSpeakingInternal() -> Double? {
    if !isSpeakingValue { return nil }

    pcmStopRequested = true

    // How far into playback we were; the PCM start time takes precedence over MP3.
    let playbackStart = pcmPlaybackStartTime ?? mp3PlaybackStartTime
    let elapsed: Double? = playbackStart.map { Date().timeIntervalSince($0) }
    lastInterruptedAtSeconds = elapsed

    // PCM streaming engine
    pcmPlayerNode?.stop()
    pcmEngine?.stop()
    pcmEngine = nil
    pcmPlayerNode = nil
    pcmPlaybackStartTime = nil

    // MP3 player
    audioPlayer?.stop()
    audioPlayer = nil
    mp3PlaybackStartTime = nil

    // System TTS
    systemSynthesizer.stopSpeaking(at: .immediate)
    let interruption = NSError(domain: "TalkMode", code: -1, userInfo: [
        NSLocalizedDescriptionKey: "Speech interrupted by user"
    ])
    systemSpeechDelegate?.finish(error: interruption)

    // In-flight speak task
    speakTask?.cancel()

    isSpeakingValue = false

    return elapsed
}
|
|
876
|
+
|
|
877
|
+
// MARK: - Permissions
|
|
878
|
+
|
|
879
|
+
/// Ask for record permission and return whether it was granted.
/// Uses `AVAudioApplication` on iOS 17+ and `AVAudioSession` on earlier systems.
private func requestMicrophonePermission() async -> Bool {
    await withCheckedContinuation { (pending: CheckedContinuation<Bool, Never>) in
        guard #available(iOS 17.0, *) else {
            AVAudioSession.sharedInstance().requestRecordPermission { isGranted in
                pending.resume(returning: isGranted)
            }
            return
        }
        AVAudioApplication.requestRecordPermission { isGranted in
            pending.resume(returning: isGranted)
        }
    }
}
|
|
892
|
+
|
|
893
|
+
/// Ask for speech-recognition authorization; true only for the `.authorized` status.
private func requestSpeechPermission() async -> Bool {
    await withCheckedContinuation { (pending: CheckedContinuation<Bool, Never>) in
        SFSpeechRecognizer.requestAuthorization { authorization in
            let isAuthorized = (authorization == .authorized)
            pending.resume(returning: isAuthorized)
        }
    }
}
|
|
900
|
+
|
|
901
|
+
/// Build the permission payload with the current microphone and speech-recognition status.
///
/// Each status is one of "granted", "denied" or "prompt". The microphone status
/// is read from `AVAudioApplication` on iOS 17+ (matching the API used to
/// request the permission) and from `AVAudioSession` on earlier systems.
///
/// - Returns: An object with the keys `microphone` and `speechRecognition`.
private func buildPermissionResult() -> JSObject {
    let micStatus: String
    if #available(iOS 17.0, *) {
        switch AVAudioApplication.shared.recordPermission {
        case .granted: micStatus = "granted"
        case .denied: micStatus = "denied"
        case .undetermined: micStatus = "prompt"
        @unknown default: micStatus = "prompt"
        }
    } else {
        switch AVAudioSession.sharedInstance().recordPermission {
        case .granted: micStatus = "granted"
        case .denied: micStatus = "denied"
        case .undetermined: micStatus = "prompt"
        @unknown default: micStatus = "prompt"
        }
    }

    let speechStatus: String
    switch SFSpeechRecognizer.authorizationStatus() {
    case .authorized: speechStatus = "granted"
    // A restricted device cannot be granted access, so it is reported as denied.
    case .denied, .restricted: speechStatus = "denied"
    case .notDetermined: speechStatus = "prompt"
    @unknown default: speechStatus = "prompt"
    }

    return [
        "microphone": micStatus,
        "speechRecognition": speechStatus
    ]
}
|
|
924
|
+
|
|
925
|
+
// MARK: - Audio Session
|
|
926
|
+
|
|
927
|
+
/// Put the shared audio session into play-and-record / voice-chat mode and activate it.
/// - Throws: Any error from `setCategory` or `setActive`.
private func configureAudioSession() throws {
    let categoryOptions: AVAudioSession.CategoryOptions = [
        .duckOthers,
        .mixWithOthers,
        .allowBluetoothA2DP,
        .defaultToSpeaker
    ]

    let audioSession = AVAudioSession.sharedInstance()
    try audioSession.setCategory(.playAndRecord, mode: .voiceChat, options: categoryOptions)
    try audioSession.setActive(true)
}
|
|
937
|
+
|
|
938
|
+
// MARK: - State & Events
|
|
939
|
+
|
|
940
|
+
/// Store the new state and status text, then emit a `stateChange` event that
/// carries the new state, the state it replaced, the status text, and `usedSystemTts`.
private func setState(_ newState: String, _ newStatusText: String) {
    // Capture the outgoing state before it is overwritten.
    let outgoingState = state
    state = newState
    statusText = newStatusText

    let payload: [String: Any] = [
        "state": newState,
        "previousState": outgoingState,
        "statusText": newStatusText,
        "usingSystemTts": usedSystemTts
    ]
    notifyListeners("stateChange", data: payload)
}
|
|
952
|
+
|
|
953
|
+
/// Emit an `error` event carrying the given code, message and recoverable flag.
private func emitError(code: String, message: String, recoverable: Bool) {
    let payload: [String: Any] = [
        "code": code,
        "message": message,
        "recoverable": recoverable
    ]
    notifyListeners("error", data: payload)
}
|
|
960
|
+
|
|
961
|
+
// MARK: - Voice Alias Resolution
|
|
962
|
+
|
|
963
|
+
/// Resolve a user-supplied voice name or ID.
///
/// Returns the `voiceAliases` entry for the lowercased input when one exists.
/// Otherwise returns the trimmed input itself if it case-insensitively equals
/// one of the alias values, or if it is at least 10 characters of letters,
/// digits, "-" or "_". Returns nil for nil/blank input and anything else.
private func resolveVoiceAlias(_ value: String?) -> String? {
    let candidate = value?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
    if candidate.isEmpty { return nil }

    // Alias lookup uses the lowercased form of the input.
    if let aliased = voiceAliases[candidate.lowercased()] {
        return aliased
    }

    // Already one of the voice IDs the alias map points at?
    let isKnownVoiceID = voiceAliases.values.contains { knownID in
        knownID.caseInsensitiveCompare(candidate) == .orderedSame
    }

    // Shaped like a raw ElevenLabs voice ID (10+ chars of letters, digits, "-" or "_")?
    let looksLikeRawID = candidate.count >= 10
        && candidate.allSatisfy { $0.isLetter || $0.isNumber || $0 == "-" || $0 == "_" }

    return (isKnownVoiceID || looksLikeRawID) ? candidate : nil
}
|
|
987
|
+
|
|
988
|
+
// MARK: - TTS Parameter Validation
|
|
989
|
+
|
|
990
|
+
/// Resolve the speech speed from a words-per-minute rate or an explicit speed.
///
/// A positive `rateWpm` takes precedence and is divided by the 175 WPM baseline;
/// otherwise `speed` is used. The chosen value is returned only when it lies in
/// 0.5...2.0; an out-of-range rate does not fall back to `speed`.
private static func resolveSpeed(speed: Double?, rateWpm: Int?) -> Double? {
    let candidate: Double?
    if let rateWpm, rateWpm > 0 {
        candidate = Double(rateWpm) / 175.0
    } else {
        candidate = speed
    }

    guard let candidate, (0.5...2.0).contains(candidate) else { return nil }
    return candidate
}
|
|
1004
|
+
|
|
1005
|
+
/// Validate a 0–1 unit range parameter (stability, similarity, style).
/// Returns the value unchanged when it lies in 0...1, otherwise nil.
private static func validatedUnit(_ value: Double?) -> Double? {
    guard let value else { return nil }
    return (0.0...1.0).contains(value) ? value : nil
}
|
|
1010
|
+
|
|
1011
|
+
/// Validate seed (unsigned 32-bit integer range).
/// Returns the value unchanged when it lies in 0...4294967295, otherwise nil.
private static func validatedSeed(_ value: Int?) -> Int? {
    let seedRange = 0...4_294_967_295
    return value.flatMap { seedRange.contains($0) ? $0 : nil }
}
|
|
1016
|
+
|
|
1017
|
+
/// Validate text normalization mode (auto/on/off).
/// The input is trimmed and lowercased before matching; anything else yields nil.
private static func validatedNormalize(_ value: String?) -> String? {
    guard let mode = value?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() else {
        return nil
    }
    switch mode {
    case "auto", "on", "off":
        return mode
    default:
        return nil
    }
}
|
|
1023
|
+
|
|
1024
|
+
/// Validate language code (2-letter ISO only).
///
/// The input is trimmed and lowercased, then accepted only when it is exactly
/// two ASCII letters. Non-ASCII letters are rejected so that values such as two
/// CJK characters are not passed on as a language code.
///
/// - Parameter value: Candidate language code, or nil.
/// - Returns: The normalized two-letter code, or nil when the input is nil or invalid.
static func validatedLanguage(_ value: String?) -> String? {
    guard let value else { return nil }
    let trimmed = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()
    guard trimmed.count == 2, trimmed.allSatisfy({ $0.isASCII && $0.isLetter }) else { return nil }
    return trimmed
}
|
|
1031
|
+
|
|
1032
|
+
/// Validate latency optimization tier (1–4).
/// Returns the value unchanged when it lies in 1...4, otherwise nil.
private static func validatedLatencyTier(_ value: Int?) -> Int? {
    guard let value else { return nil }
    return (1...4).contains(value) ? value : nil
}
|
|
1037
|
+
|
|
1038
|
+
/// Validate ElevenLabs output format string.
/// The input is trimmed and lowercased; it is returned only when it matches one
/// of the MP3, PCM or u-law formats listed below, otherwise nil.
static func validatedOutputFormat(_ value: String?) -> String? {
    guard let candidate = value?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() else {
        return nil
    }

    let mp3Formats = [
        "mp3_22050_32", "mp3_44100_32", "mp3_44100_64",
        "mp3_44100_96", "mp3_44100_128", "mp3_44100_192"
    ]
    let pcmFormats = ["pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100"]
    let supportedFormats = Set(mp3Formats + pcmFormats + ["ulaw_8000"])

    guard supportedFormats.contains(candidate) else { return nil }
    return candidate
}
|
|
1050
|
+
|
|
1051
|
+
/// Extract sample rate from a PCM output format string (e.g. "pcm_24000" → 24000).
/// Returns nil unless the string starts with "pcm_" and contains one of the
/// rates 44100, 24000, 22050 or 16000 (checked in that order).
static func pcmSampleRate(from format: String?) -> Double? {
    guard let format, format.hasPrefix("pcm_") else { return nil }

    let recognizedRates = ["44100", "24000", "22050", "16000"]
    let matchedRate = recognizedRates.first(where: { format.contains($0) })
    return matchedRate.flatMap { Double($0) }
}
|
|
1060
|
+
}
|
|
1061
|
+
|
|
1062
|
+
// MARK: - SystemSpeechDelegate
|
|
1063
|
+
|
|
1064
|
+
/// Bridges `AVSpeechSynthesizer` delegate callbacks to async/await by resuming a
/// stored `CheckedContinuation` once, and owns the watchdog task that is
/// cancelled when speech finishes.
private class SystemSpeechDelegate: NSObject, AVSpeechSynthesizerDelegate {
    private var continuation: CheckedContinuation<Void, Error>?
    var isFinished = false
    var watchdog: Task<Void, Never>?

    init(continuation: CheckedContinuation<Void, Error>) {
        self.continuation = continuation
        super.init()
    }

    /// Resume the continuation (throwing `error` when given) and cancel the
    /// watchdog. Calls after the first one are ignored.
    func finish(error: Error? = nil) {
        if isFinished { return }
        isFinished = true

        watchdog?.cancel()
        watchdog = nil

        // Detach the continuation before resuming so it cannot be resumed twice.
        guard let pending = continuation else { return }
        continuation = nil

        if let error {
            pending.resume(throwing: error)
        } else {
            pending.resume(returning: ())
        }
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        finish()
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
        let cancellation = NSError(domain: "TalkMode", code: -1, userInfo: [
            NSLocalizedDescriptionKey: "System TTS cancelled"
        ])
        finish(error: cancellation)
    }
}
|
|
1100
|
+
|
|
1101
|
+
// MARK: - AudioPlayerDelegate
|
|
1102
|
+
|
|
1103
|
+
/// `AVAudioPlayer` delegate (MP3 playback) that runs a completion closure when
/// playback finishes or a decode error is reported, then drops the closure so
/// it cannot run again.
private class AudioPlayerDelegate: NSObject, AVAudioPlayerDelegate {
    private var onComplete: (() -> Void)?

    init(onComplete: @escaping () -> Void) {
        self.onComplete = onComplete
        super.init()
    }

    /// Run the stored closure, if any, and release it.
    private func signalCompletion() {
        onComplete?()
        onComplete = nil
    }

    func audioPlayerDidFinishPlaying(_ player: AVAudioPlayer, successfully flag: Bool) {
        signalCompletion()
    }

    func audioPlayerDecodeErrorDidOccur(_ player: AVAudioPlayer, error: Error?) {
        signalCompletion()
    }
}
|