@elizaos/capacitor-talkmode 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1121 @@
1
+ import Foundation
2
+ import Capacitor
3
+ import AVFoundation
4
+ import Speech
5
+
6
+ // MARK: - TalkModePlugin
7
+
8
+ @objc(TalkModePlugin)
9
+ public class TalkModePlugin: CAPPlugin, CAPBridgedPlugin {
10
+ public let identifier = "TalkModePlugin"
11
+ public let jsName = "TalkMode"
12
+ public let pluginMethods: [CAPPluginMethod] = [
13
+ CAPPluginMethod(name: "start", returnType: CAPPluginReturnPromise),
14
+ CAPPluginMethod(name: "stop", returnType: CAPPluginReturnPromise),
15
+ CAPPluginMethod(name: "isEnabled", returnType: CAPPluginReturnPromise),
16
+ CAPPluginMethod(name: "getState", returnType: CAPPluginReturnPromise),
17
+ CAPPluginMethod(name: "updateConfig", returnType: CAPPluginReturnPromise),
18
+ CAPPluginMethod(name: "speak", returnType: CAPPluginReturnPromise),
19
+ CAPPluginMethod(name: "stopSpeaking", returnType: CAPPluginReturnPromise),
20
+ CAPPluginMethod(name: "isSpeaking", returnType: CAPPluginReturnPromise),
21
+ CAPPluginMethod(name: "checkPermissions", returnType: CAPPluginReturnPromise),
22
+ CAPPluginMethod(name: "requestPermissions", returnType: CAPPluginReturnPromise),
23
+ ]
24
+
25
+ private static let defaultModelId = "eleven_flash_v2_5"
26
+
27
+ // MARK: - State
28
+
29
+ private var enabled = false
30
+ private var state: String = "idle"
31
+ private var statusText: String = "Off"
32
+
33
+ // MARK: - Speech Recognition
34
+
35
+ private let audioEngine = AVAudioEngine()
36
+ private var speechRecognizer: SFSpeechRecognizer?
37
+ private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
38
+ private var recognitionTask: SFSpeechRecognitionTask?
39
+ private var silenceTask: Task<Void, Never>?
40
+ private var lastTranscript = ""
41
+ private var lastHeard: Date?
42
+ private var silenceWindow: TimeInterval = 0.7
43
+
44
+ // MARK: - TTS
45
+
46
+ private let systemSynthesizer = AVSpeechSynthesizer()
47
+ private var systemSpeechDelegate: SystemSpeechDelegate?
48
+ private var isSpeakingValue = false
49
+ private var usedSystemTts = false
50
+ private var lastSpokenText: String?
51
+ private var lastInterruptedAtSeconds: Double?
52
+
53
+ // MARK: - PCM Streaming Playback
54
+
55
+ private var pcmEngine: AVAudioEngine?
56
+ private var pcmPlayerNode: AVAudioPlayerNode?
57
+ private var pcmStopRequested = false
58
+ private var pcmPlaybackStartTime: Date?
59
+
60
+ // MARK: - MP3 Playback
61
+
62
+ private var audioPlayer: AVAudioPlayer?
63
+ private var mp3PlaybackStartTime: Date?
64
+
65
+ // MARK: - Active Tasks
66
+
67
+ private var speakTask: Task<Void, Error>?
68
+
69
+ // MARK: - Config
70
+
71
+ private var apiKey: String?
72
+ private var defaultVoiceId: String?
73
+ private var currentVoiceId: String?
74
+ private var defaultModelId: String? = TalkModePlugin.defaultModelId
75
+ private var currentModelId: String? = TalkModePlugin.defaultModelId
76
+ private var defaultOutputFormat: String? = "pcm_24000"
77
+ private var voiceAliases: [String: String] = [:]
78
+ private var interruptOnSpeech = true
79
+ private var sessionKey = "main"
80
+ private var voiceOverrideActive = false
81
+ private var modelOverrideActive = false
82
+
83
+ // MARK: - Lifecycle
84
+
85
/// Capacitor lifecycle hook. Creates the initial speech recognizer with the
/// parameterless initializer; `applyConfig` may later replace it with a
/// locale-specific one.
public override func load() {
    self.speechRecognizer = SFSpeechRecognizer()
}
88
+
89
+ // MARK: - Plugin Methods
90
+
91
/// Enables talk mode: applies the optional `config`, requests microphone and
/// speech permissions, configures the audio session and starts listening.
///
/// Always resolves `call` (never rejects) with `started` and, on failure, an
/// `error` message.
@objc func start(_ call: CAPPluginCall) {
    // Apply config before the availability check so a configured STT language
    // replaces the recognizer first.
    if let config = call.getObject("config") {
        applyConfig(config)
    }

    guard speechRecognizer?.isAvailable == true else {
        call.resolve(["started": false, "error": "Speech recognition not available"])
        return
    }

    Task { @MainActor in
        guard await self.requestMicrophonePermission() else {
            call.resolve(["started": false, "error": "Microphone permission denied"])
            return
        }

        guard await self.requestSpeechPermission() else {
            call.resolve(["started": false, "error": "Speech recognition permission denied"])
            return
        }

        do {
            try self.configureAudioSession()
            try self.startRecognition()
        } catch {
            let reason = error.localizedDescription
            self.emitError(code: "start_failed", message: reason, recoverable: true)
            call.resolve(["started": false, "error": reason])
            return
        }

        self.enabled = true
        self.setState("listening", "Listening")
        self.startSilenceMonitor()
        call.resolve(["started": true])
    }
}
128
+
129
/// Disables talk mode: stops recognition and any speech output, cancels the
/// silence monitor, clears transcript bookkeeping, moves to the idle state and
/// deactivates the shared audio session.
@objc func stop(_ call: CAPPluginCall) {
    enabled = false
    stopRecognition()
    stopSpeakingInternal()

    silenceTask?.cancel()
    silenceTask = nil

    lastTranscript = ""
    lastHeard = nil
    lastInterruptedAtSeconds = nil
    setState("idle", "Off")

    // Deactivation is best-effort; a failure here is deliberately ignored.
    try? AVAudioSession.sharedInstance().setActive(false, options: [.notifyOthersOnDeactivation])

    call.resolve()
}
148
+
149
/// Resolves with `enabled`, the current value of the talk-mode flag.
@objc func isEnabled(_ call: CAPPluginCall) {
    let isOn = enabled
    call.resolve(["enabled": isOn])
}
152
+
153
/// Resolves with the current `state` identifier and its `statusText`.
@objc func getState(_ call: CAPPluginCall) {
    let currentState = state
    let currentStatus = statusText
    call.resolve(["state": currentState, "statusText": currentStatus])
}
156
+
157
/// Applies the `config` object when one is supplied; resolves either way.
@objc func updateConfig(_ call: CAPPluginCall) {
    if let config = call.getObject("config") {
        applyConfig(config)
    }
    call.resolve()
}
165
+
166
/// Speaks the `text` argument. Blank text resolves immediately as completed;
/// otherwise any previous speak task is cancelled and a new one is started,
/// which resolves `call` when playback ends.
@objc func speak(_ call: CAPPluginCall) {
    let spokenText = (call.getString("text") ?? "")
        .trimmingCharacters(in: .whitespacesAndNewlines)
    if spokenText.isEmpty {
        call.resolve(["completed": true, "interrupted": false, "usedSystemTts": false])
        return
    }

    let forceSystem = call.getBool("useSystemTts") ?? false
    let directive = call.getObject("directive")

    speakTask?.cancel()
    speakTask = Task { @MainActor in
        await self.speakInternal(
            text: spokenText,
            forceSystemTts: forceSystem,
            directive: directive,
            call: call
        )
    }
}
181
+
182
/// Stops any speech output. The result carries `interruptedAt` (seconds) only
/// when `stopSpeakingInternal` could compute it.
@objc func stopSpeaking(_ call: CAPPluginCall) {
    var payload: JSObject = [:]
    if let seconds = stopSpeakingInternal() {
        payload["interruptedAt"] = seconds
    }
    call.resolve(payload)
}
190
+
191
/// Resolves with `speaking`, the current value of the speaking flag.
@objc func isSpeaking(_ call: CAPPluginCall) {
    let speakingNow = isSpeakingValue
    call.resolve(["speaking": speakingNow])
}
194
+
195
/// Resolves with the current microphone and speech-recognition permission
/// states without prompting the user.
@objc public override func checkPermissions(_ call: CAPPluginCall) {
    let status = buildPermissionResult()
    call.resolve(status)
}
198
+
199
/// Requests microphone and then speech-recognition permission, and resolves
/// with the resulting permission states.
@objc public override func requestPermissions(_ call: CAPPluginCall) {
    Task { @MainActor in
        // The individual grant results are not needed; the final state is
        // re-read by buildPermissionResult().
        let _ = await self.requestMicrophonePermission()
        let _ = await self.requestSpeechPermission()
        let status = self.buildPermissionResult()
        call.resolve(status)
    }
}
206
+
207
+ // MARK: - Config Application
208
+
209
/// Merges a JS config object into the plugin's settings.
///
/// Only keys that are present with the expected type are applied; everything
/// else keeps its current value. Recognised keys: `tts` (`apiKey`, `voiceId`,
/// `modelId`, `outputFormat`, `interruptOnSpeech`, `voiceAliases`),
/// `stt.language`, `silenceWindowMs`, `interruptOnSpeech` and `sessionKey`.
private func applyConfig(_ config: JSObject) {
    func clean(_ raw: String) -> String {
        raw.trimmingCharacters(in: .whitespacesAndNewlines)
    }

    if let ttsConfig = config["tts"] as? [String: Any] {
        if let rawKey = ttsConfig["apiKey"] as? String {
            apiKey = clean(rawKey)
        }
        if let rawVoice = ttsConfig["voiceId"] as? String {
            defaultVoiceId = clean(rawVoice)
            // A directive-driven override wins over the configured default.
            if !voiceOverrideActive { currentVoiceId = defaultVoiceId }
        }
        if let rawModel = ttsConfig["modelId"] as? String {
            let modelId = clean(rawModel)
            defaultModelId = modelId.isEmpty ? Self.defaultModelId : modelId
            if !modelOverrideActive { currentModelId = defaultModelId }
        }
        if let rawFormat = ttsConfig["outputFormat"] as? String {
            defaultOutputFormat = clean(rawFormat)
        }
        if let flag = ttsConfig["interruptOnSpeech"] as? Bool {
            interruptOnSpeech = flag
        }

        if let rawAliases = ttsConfig["voiceAliases"] as? [String: String] {
            // Alias names are matched case-insensitively, so store them lowercased;
            // entries that are blank on either side are dropped.
            voiceAliases = rawAliases.reduce(into: [:]) { table, entry in
                let alias = clean(entry.key).lowercased()
                let voice = clean(entry.value)
                if !alias.isEmpty, !voice.isEmpty { table[alias] = voice }
            }
        }
    }

    if let sttConfig = config["stt"] as? [String: Any],
       let languageId = sttConfig["language"] as? String,
       !languageId.isEmpty {
        speechRecognizer = SFSpeechRecognizer(locale: Locale(identifier: languageId))
    }

    if let milliseconds = config["silenceWindowMs"] as? Int, milliseconds > 0 {
        silenceWindow = TimeInterval(milliseconds) / 1000.0
    }

    // A top-level flag is applied after (and therefore overrides) tts.interruptOnSpeech.
    if let flag = config["interruptOnSpeech"] as? Bool {
        interruptOnSpeech = flag
    }

    if let session = config["sessionKey"] as? String {
        sessionKey = session
    }
}
259
+
260
+ // MARK: - Speech Recognition
261
+
262
/// Starts a fresh speech-recognition session fed from the microphone.
///
/// Any running session is stopped first. Partial results are delivered to
/// `handleTranscript` on the main queue.
///
/// - Throws: An `NSError` in the `TalkMode` domain when running on the
///   simulator (code 1), when the recognizer is missing or unavailable
///   (code 2), or when the input format is invalid (code 3); also rethrows
///   any error from starting the audio engine.
private func startRecognition() throws {
    #if targetEnvironment(simulator)
    throw NSError(domain: "TalkMode", code: 1, userInfo: [
        NSLocalizedDescriptionKey: "Speech recognition not supported on simulator"
    ])
    #else
    // `#else` keeps the device path out of simulator builds, where it would
    // otherwise be unreachable code after the throw.
    stopRecognition()

    guard let recognizer = speechRecognizer, recognizer.isAvailable else {
        throw NSError(domain: "TalkMode", code: 2, userInfo: [
            NSLocalizedDescriptionKey: "Speech recognizer unavailable"
        ])
    }

    let input = audioEngine.inputNode
    let format = input.outputFormat(forBus: 0)

    guard format.sampleRate > 0, format.channelCount > 0 else {
        throw NSError(domain: "TalkMode", code: 3, userInfo: [
            NSLocalizedDescriptionKey: "Invalid audio input format"
        ])
    }

    let request = SFSpeechAudioBufferRecognitionRequest()
    request.shouldReportPartialResults = true

    input.removeTap(onBus: 0)
    input.installTap(onBus: 0, bufferSize: 2048, format: format) { buffer, _ in
        request.append(buffer)
    }

    audioEngine.prepare()
    do {
        try audioEngine.start()
    } catch {
        // Do not leave a tap feeding a request that will never be recognised.
        input.removeTap(onBus: 0)
        throw error
    }

    // Publish the request only once the engine is running, so a failed start
    // leaves no half-initialised session behind.
    recognitionRequest = request

    recognitionTask = recognizer.recognitionTask(with: request) { [weak self] result, error in
        guard let self else { return }

        if let error {
            // Errors while TTS is playing are not logged.
            if !self.isSpeakingValue {
                print("[TalkMode] Recognition error: \(error.localizedDescription)")
            }
            return
        }

        guard let result else { return }
        let transcript = result.bestTranscription.formattedString

        DispatchQueue.main.async {
            self.handleTranscript(transcript: transcript, isFinal: result.isFinal)
        }
    }
    #endif
}
317
+
318
/// Tears down the current recognition session: cancels the task, ends the
/// request's audio, removes the microphone tap and stops the input engine.
private func stopRecognition() {
    if let task = recognitionTask {
        task.cancel()
    }
    recognitionTask = nil

    if let request = recognitionRequest {
        request.endAudio()
    }
    recognitionRequest = nil

    let input = audioEngine.inputNode
    input.removeTap(onBus: 0)
    audioEngine.stop()
}
326
+
327
/// Processes one recognition result.
///
/// While TTS is playing with `interruptOnSpeech` on, the transcript is used
/// only to decide whether to interrupt playback. Otherwise, when talk mode is
/// enabled, it updates the pending transcript / last-heard time and emits a
/// `transcript` event.
private func handleTranscript(transcript: String, isFinal: Bool) {
    let heard = transcript.trimmingCharacters(in: .whitespacesAndNewlines)

    if isSpeakingValue && interruptOnSpeech {
        if shouldInterrupt(with: heard) {
            stopSpeakingInternal()
        }
        return
    }

    if !enabled { return }

    // A final result always replaces the pending transcript, even when empty;
    // partial results only replace it when they carry text.
    if isFinal || !heard.isEmpty {
        lastTranscript = heard
    }
    if !heard.isEmpty {
        lastHeard = Date()
    }

    notifyListeners("transcript", data: [
        "transcript": heard,
        "isFinal": isFinal
    ])
}
354
+
355
/// Decides whether a transcript heard during TTS playback should interrupt it.
///
/// Returns `false` for transcripts shorter than three characters, and for
/// transcripts that occur (case-insensitively) inside the text currently
/// being spoken, since that is most likely the microphone picking up our own
/// output rather than the user.
private func shouldInterrupt(with transcript: String) -> Bool {
    let candidate = transcript.trimmingCharacters(in: .whitespacesAndNewlines)
    if candidate.count < 3 { return false }

    guard let spokenText = lastSpokenText else { return true }

    let isEcho = spokenText.lowercased().contains(candidate.lowercased())
    return !isEcho
}
370
+
371
+ // MARK: - Silence Detection
372
+
373
/// (Re)starts the background task that polls `checkSilence()` every 200 ms
/// while talk mode is enabled. Any previous monitor is cancelled first.
private func startSilenceMonitor() {
    silenceTask?.cancel()
    silenceTask = Task { [weak self] in
        while !Task.isCancelled {
            do {
                try await Task.sleep(nanoseconds: 200_000_000) // 200ms poll
            } catch {
                // Sleep throws as soon as the task is cancelled. Exit instead of
                // swallowing the error: with `try?` a cancelled monitor would keep
                // looping without ever sleeping for as long as `enabled` stays true.
                return
            }

            // `self` is re-captured explicitly in the MainActor closure because
            // the outer `[weak self]` list does not propagate into the nested
            // closure under strict concurrency checking. Reading `enabled`
            // here also keeps that access on the main actor.
            let keepPolling = await MainActor.run { [weak self] () -> Bool in
                guard let self, self.enabled else { return false }
                self.checkSilence()
                return true
            }
            guard keepPolling else { return }
        }
    }
}
389
+
390
/// Finalizes the pending transcript once the user has been silent for at
/// least `silenceWindow` seconds.
///
/// Does nothing unless talk mode is enabled, nothing is being spoken, the
/// state is `listening`, and there is a non-empty transcript with a recorded
/// last-heard time.
private func checkSilence() {
    guard enabled, !isSpeakingValue, state == "listening" else { return }

    let pending = lastTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
    guard !pending.isEmpty, let heardAt = lastHeard else { return }

    let quietFor = Date().timeIntervalSince(heardAt)
    if quietFor >= silenceWindow {
        finalizeTranscript(pending)
    }
}
403
+
404
/// Clears the pending transcript, switches to the `processing` state, stops
/// recognition, and emits `transcript` with `isFinal: true` so the JS layer
/// can forward it to the agent.
private func finalizeTranscript(_ transcript: String) {
    lastTranscript = ""
    lastHeard = nil

    setState("processing", "Processing")
    stopRecognition()

    let payload: [String: Any] = [
        "transcript": transcript,
        "isFinal": true
    ]
    notifyListeners("transcript", data: payload)
}
417
+
418
+ // MARK: - TTS Orchestration
419
+
420
/// Speaks `text` and resolves `call` with the outcome.
///
/// ElevenLabs streaming is used when it is not overridden by `forceSystemTts`
/// and both an API key and a voice id are available; a genuine ElevenLabs
/// failure falls back to system TTS. A user interruption or task cancellation
/// is reported as `interrupted: true` and never triggers the fallback or a
/// `tts_failed` error.
///
/// - Parameters:
///   - text: The already-trimmed, non-empty text to speak.
///   - forceSystemTts: When `true`, ElevenLabs is skipped.
///   - directive: Optional per-utterance overrides (`voiceId`, `modelId`,
///     `outputFormat`, `language`, `once`, voice settings).
///   - call: The plugin call to resolve; it is resolved exactly once.
private func speakInternal(
    text: String,
    forceSystemTts: Bool,
    directive: [String: Any]?,
    call: CAPPluginCall
) async {
    isSpeakingValue = true
    usedSystemTts = false
    pcmStopRequested = false
    // A new utterance has not been interrupted yet; do not report a previous
    // utterance's interruption time.
    lastInterruptedAtSeconds = nil
    lastSpokenText = text
    setState("speaking", "Speaking")

    // Resolve voice/model from directive, with override persistence:
    // unless `once` is set, a directive's voice/model becomes the current one.
    let requestedVoice = (directive?["voiceId"] as? String)?
        .trimmingCharacters(in: .whitespacesAndNewlines)
    let resolvedVoice = resolveVoiceAlias(requestedVoice)
    let isOnce = directive?["once"] as? Bool ?? false

    if let voice = resolvedVoice, !isOnce {
        currentVoiceId = voice
        voiceOverrideActive = true
    }
    if let model = directive?["modelId"] as? String, !model.isEmpty, !isOnce {
        currentModelId = model
        modelOverrideActive = true
    }

    let effectiveVoiceId = resolvedVoice ?? currentVoiceId ?? defaultVoiceId
    let effectiveModelId = (directive?["modelId"] as? String)
        ?? currentModelId ?? defaultModelId ?? Self.defaultModelId
    let rawFormat = (directive?["outputFormat"] as? String)
        ?? defaultOutputFormat ?? "pcm_24000"
    let effectiveFormat = Self.validatedOutputFormat(rawFormat) ?? "pcm_24000"
    let effectiveApiKey = apiKey?.trimmingCharacters(in: .whitespacesAndNewlines)

    let canUseElevenLabs = !forceSystemTts
        && !(effectiveApiKey ?? "").isEmpty
        && !(effectiveVoiceId ?? "").isEmpty

    notifyListeners("speaking", data: [
        "text": text,
        "isSystemTts": !canUseElevenLabs
    ])

    // Enable STT during playback for interrupt detection
    if interruptOnSpeech {
        do { try startRecognition() } catch {
            print("[TalkMode] Recognition for interrupt detection failed: \(error)")
        }
    } else {
        stopRecognition()
    }

    var interrupted = false
    var failure: Error?
    let language = Self.validatedLanguage(directive?["language"] as? String)

    do {
        if canUseElevenLabs {
            do {
                try await streamElevenLabsTts(
                    text: text,
                    voiceId: effectiveVoiceId ?? "",
                    apiKey: effectiveApiKey ?? "",
                    modelId: effectiveModelId,
                    outputFormat: effectiveFormat,
                    directive: directive
                )
            } catch {
                // An error caused by a stop request or cancellation is not a
                // provider failure: do not start speaking again via system TTS.
                if pcmStopRequested || Task.isCancelled { throw error }

                // Fallback to system TTS on ElevenLabs failure
                print("[TalkMode] ElevenLabs failed, falling back to system TTS: \(error)")
                emitError(
                    code: "elevenlabs_failed",
                    message: error.localizedDescription,
                    recoverable: true
                )
                try await speakWithSystemTts(text: text, language: language)
            }
        } else {
            try await speakWithSystemTts(text: text, language: language)
        }
        interrupted = pcmStopRequested
    } catch {
        if pcmStopRequested || Task.isCancelled {
            interrupted = true
        } else {
            failure = error
        }
    }

    if let failure {
        emitError(code: "tts_failed", message: failure.localizedDescription, recoverable: true)
        call.resolve([
            "completed": false,
            "interrupted": false,
            "usedSystemTts": usedSystemTts,
            "error": failure.localizedDescription
        ])
        finishSpeaking()
        return
    }

    var result: JSObject = [
        "completed": !interrupted,
        "interrupted": interrupted,
        "usedSystemTts": usedSystemTts
    ]
    if interrupted, let at = lastInterruptedAtSeconds {
        result["interruptedAt"] = at
    }
    call.resolve(result)

    notifyListeners("speakComplete", data: [
        "completed": !interrupted
    ])

    // Cancelled without a stop request means a newer `speak` replaced this
    // task. The newer utterance now owns the speaking state and the
    // recognizer, so this one must not reset them.
    let superseded = Task.isCancelled && !pcmStopRequested
    if !superseded {
        finishSpeaking()
    }
}
529
+
530
/// Resets the speaking flags and stops recognition after an utterance. If
/// talk mode is still enabled it returns to `listening` and restarts
/// recognition plus the silence monitor; otherwise it goes `idle`.
private func finishSpeaking() {
    isSpeakingValue = false
    pcmStopRequested = false
    stopRecognition()

    guard enabled else {
        setState("idle", "Off")
        return
    }

    setState("listening", "Listening")
    do {
        try startRecognition()
        startSilenceMonitor()
    } catch {
        print("[TalkMode] Failed to restart recognition: \(error)")
        let reason = error.localizedDescription
        emitError(code: "recognition_restart_failed", message: reason, recoverable: true)
    }
}
553
+
554
+ // MARK: - ElevenLabs Streaming TTS
555
+
556
/// Requests speech from the ElevenLabs streaming endpoint and plays it.
///
/// PCM output formats are streamed through `streamPCMPlayback`; if that fails
/// for a reason other than a stop request, the request is retried once as
/// `mp3_44100_128` and played via `downloadAndPlayAudio`. Non-PCM formats go
/// straight to `downloadAndPlayAudio`.
///
/// - Parameters:
///   - text: Text to synthesize.
///   - voiceId: ElevenLabs voice id; percent-encoded into the URL path.
///   - apiKey: Value for the `xi-api-key` header.
///   - modelId: Value for `model_id`.
///   - outputFormat: Requested output format, e.g. `pcm_24000`.
///   - directive: Optional voice settings (`speed`, `rateWpm`, `stability`,
///     `similarity`, `style`, `speakerBoost`, `seed`, `normalize`, `language`,
///     `latencyTier`).
/// - Throws: An `NSError` (domain `TalkMode`, code 1) when no valid URL can be
///   built, plus any serialization, network or playback error.
private func streamElevenLabsTts(
    text: String,
    voiceId: String,
    apiKey: String,
    modelId: String,
    outputFormat: String,
    directive: [String: Any]?
) async throws {
    // Escape the voice id as a single path segment so characters such as
    // "/", "?" or spaces cannot alter the request path.
    var segmentAllowed = CharacterSet.urlPathAllowed
    segmentAllowed.remove(charactersIn: "/")
    guard !voiceId.isEmpty,
          let encodedVoiceId = voiceId.addingPercentEncoding(withAllowedCharacters: segmentAllowed),
          let endpoint = URLComponents(
              string: "https://api.elevenlabs.io/v1/text-to-speech/\(encodedVoiceId)/stream"
          ) else {
        throw NSError(domain: "TalkMode", code: 1, userInfo: [
            NSLocalizedDescriptionKey: "Invalid ElevenLabs URL"
        ])
    }

    // Build voice settings from directive values
    let speed = Self.resolveSpeed(
        speed: directive?["speed"] as? Double,
        rateWpm: directive?["rateWpm"] as? Int
    )
    let stability = Self.validatedUnit(directive?["stability"] as? Double) ?? 0.5
    let similarity = Self.validatedUnit(directive?["similarity"] as? Double) ?? 0.75

    var voiceSettings: [String: Any] = [
        "stability": stability,
        "similarity_boost": similarity
    ]
    if let speed { voiceSettings["speed"] = speed }
    if let style = Self.validatedUnit(directive?["style"] as? Double) {
        voiceSettings["style"] = style
    }
    if let boost = directive?["speakerBoost"] as? Bool {
        voiceSettings["use_speaker_boost"] = boost
    }

    let latencyTier = Self.validatedLatencyTier(directive?["latencyTier"] as? Int)

    var body: [String: Any] = [
        "text": text,
        "model_id": modelId,
        "output_format": outputFormat,
        "voice_settings": voiceSettings
    ]
    if let seed = Self.validatedSeed(directive?["seed"] as? Int) {
        body["seed"] = seed
    }
    if let normalize = Self.validatedNormalize(directive?["normalize"] as? String) {
        body["apply_text_normalization"] = normalize
    }
    if let language = Self.validatedLanguage(directive?["language"] as? String) {
        body["language_code"] = language
    }
    if let latencyTier {
        body["optimize_streaming_latency"] = latencyTier
    }

    /// Builds the POST request for one output format.
    ///
    /// `output_format` and `optimize_streaming_latency` are documented as query
    /// parameters of the ElevenLabs endpoint, so they are sent in the query
    /// string. The same values are kept in the JSON body, as before, so
    /// existing behavior is not lost.
    func makeRequest(format: String) throws -> URLRequest {
        var components = endpoint
        var query = [URLQueryItem(name: "output_format", value: format)]
        if let latencyTier {
            query.append(URLQueryItem(name: "optimize_streaming_latency", value: "\(latencyTier)"))
        }
        components.queryItems = query

        guard let url = components.url else {
            throw NSError(domain: "TalkMode", code: 1, userInfo: [
                NSLocalizedDescriptionKey: "Invalid ElevenLabs URL"
            ])
        }

        var payload = body
        payload["output_format"] = format

        var request = URLRequest(url: url)
        request.httpMethod = "POST"
        request.setValue("application/json", forHTTPHeaderField: "Content-Type")
        request.setValue(apiKey, forHTTPHeaderField: "xi-api-key")
        request.httpBody = try JSONSerialization.data(withJSONObject: payload)
        return request
    }

    let primaryRequest = try makeRequest(format: outputFormat)

    guard outputFormat.hasPrefix("pcm_"),
          let sampleRate = Self.pcmSampleRate(from: outputFormat) else {
        try await downloadAndPlayAudio(request: primaryRequest)
        return
    }

    do {
        try await streamPCMPlayback(request: primaryRequest, sampleRate: sampleRate)
    } catch {
        // PCM playback failed; retry as MP3 as a fallback, unless the failure
        // was caused by a stop request.
        guard !pcmStopRequested else { return }
        print("[TalkMode] PCM playback failed, retrying as MP3: \(error)")

        let retryRequest = try makeRequest(format: "mp3_44100_128")
        try await downloadAndPlayAudio(request: retryRequest)
    }
}
640
+
641
/// Streams 16-bit mono PCM audio from the network into an `AVAudioPlayerNode`.
///
/// Bytes are accumulated into roughly half-second chunks that are scheduled on
/// the player as they arrive. The function returns once all scheduled chunks
/// have played, or early when `pcmStopRequested` becomes `true`.
///
/// - Parameters:
///   - request: The prepared ElevenLabs request.
///   - sampleRate: Sample rate of the PCM stream in Hz.
/// - Throws: An `NSError` in the `TalkMode` domain for an unsupported format
///   (code 3), a non-HTTP response (code 2) or a non-200 status (code = HTTP
///   status); also rethrows engine, network and buffer errors.
private func streamPCMPlayback(request: URLRequest, sampleRate: Double) async throws {
    guard let format = AVAudioFormat(
        commonFormat: .pcmFormatInt16,
        sampleRate: sampleRate,
        channels: 1,
        interleaved: true
    ) else {
        throw NSError(domain: "TalkMode", code: 3, userInfo: [
            NSLocalizedDescriptionKey: "Unsupported PCM sample rate: \(sampleRate)"
        ])
    }

    let engine = AVAudioEngine()
    let playerNode = AVAudioPlayerNode()

    engine.attach(playerNode)
    engine.connect(playerNode, to: engine.mainMixerNode, format: format)
    try engine.start()

    pcmEngine = engine
    pcmPlayerNode = playerNode
    pcmPlaybackStartTime = Date()
    playerNode.play()

    defer {
        engine.stop()
        // Only clear shared state that still belongs to this playback. The
        // start time is cleared as well; otherwise a later MP3 or system-TTS
        // interruption would compute `interruptedAt` from this stale value.
        if pcmEngine === engine {
            pcmEngine = nil
            pcmPlayerNode = nil
            pcmPlaybackStartTime = nil
        }
    }

    let (bytes, response) = try await URLSession.shared.bytes(for: request)

    guard let httpResponse = response as? HTTPURLResponse else {
        throw NSError(domain: "TalkMode", code: 2, userInfo: [
            NSLocalizedDescriptionKey: "Invalid HTTP response from ElevenLabs"
        ])
    }

    guard httpResponse.statusCode == 200 else {
        // Read a bit of the error body for diagnostics
        var errorData = Data()
        for try await byte in bytes {
            errorData.append(byte)
            if errorData.count > 2048 { break }
        }
        let errorMsg = String(data: errorData, encoding: .utf8) ?? "status \(httpResponse.statusCode)"
        throw NSError(domain: "TalkMode", code: httpResponse.statusCode, userInfo: [
            NSLocalizedDescriptionKey: "ElevenLabs API error: \(errorMsg)"
        ])
    }

    // 16-bit mono PCM is sampleRate * 2 bytes per second, so `sampleRate`
    // bytes is ~0.5s of audio: smooth playback without excessive latency.
    let chunkSize = Int(sampleRate)
    var buffer = Data()
    buffer.reserveCapacity(chunkSize)
    var scheduledCount = 0
    let completionGroup = DispatchGroup()

    for try await byte in bytes {
        if pcmStopRequested { break }

        buffer.append(byte)

        if buffer.count >= chunkSize {
            try scheduleChunk(buffer, on: playerNode, format: format, group: completionGroup)
            scheduledCount += 1
            buffer = Data()
        }
    }

    // Schedule any remaining data
    if !buffer.isEmpty, !pcmStopRequested {
        try scheduleChunk(buffer, on: playerNode, format: format, group: completionGroup)
        scheduledCount += 1
    }

    // Wait for all scheduled buffers to finish playback
    if scheduledCount > 0, !pcmStopRequested {
        await withCheckedContinuation { (continuation: CheckedContinuation<Void, Never>) in
            completionGroup.notify(queue: .main) {
                continuation.resume()
            }
        }
    }
}
725
+
726
/// Copies raw 16-bit PCM bytes into an `AVAudioPCMBuffer` and schedules it on
/// the player node.
///
/// `group` is entered once per scheduled buffer and left from the buffer's
/// completion handler, so callers can wait for playback to drain. A trailing
/// odd byte (half a sample) is dropped. Data shorter than one sample is
/// ignored.
///
/// - Throws: An `NSError` (domain `TalkMode`, code 3) when the buffer cannot
///   be created or exposes no Int16 channel data.
private func scheduleChunk(
    _ data: Data,
    on playerNode: AVAudioPlayerNode,
    format: AVAudioFormat,
    group: DispatchGroup
) throws {
    let bytesPerFrame = MemoryLayout<Int16>.size // 16-bit mono = 2 bytes per frame
    let frameCount = AVAudioFrameCount(data.count / bytesPerFrame)
    guard frameCount > 0 else { return }

    guard let pcmBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount),
          let samples = pcmBuffer.int16ChannelData?[0] else {
        throw NSError(domain: "TalkMode", code: 3, userInfo: [
            NSLocalizedDescriptionKey: "Failed to create PCM buffer"
        ])
    }

    pcmBuffer.frameLength = frameCount

    // Copy whole frames only. Copying `data.count` bytes would write one byte
    // past the buffer's capacity whenever the chunk has an odd length.
    let byteCount = Int(frameCount) * bytesPerFrame
    data.withUnsafeBytes { raw in
        guard let baseAddress = raw.baseAddress else { return }
        memcpy(samples, baseAddress, byteCount)
    }

    group.enter()
    playerNode.scheduleBuffer(pcmBuffer) {
        group.leave()
    }
}
753
+
754
/// Downloads a complete audio response (MP3 etc.) and plays it with
/// `AVAudioPlayer`, returning when playback finishes, is stopped, or the
/// surrounding task is cancelled.
///
/// - Throws: An `NSError` (domain `TalkMode`, code 2) for a non-200 response,
///   an `NSError` (code 4) when playback cannot be started, and any network or
///   decoding error.
private func downloadAndPlayAudio(request: URLRequest) async throws {
    /// One-shot gate that resumes the waiting continuation exactly once, no
    /// matter whether the finish signal arrives before or after the
    /// continuation is installed, or from which thread.
    final class PlaybackGate: @unchecked Sendable {
        private let lock = NSLock()
        private var continuation: CheckedContinuation<Bool, Never>?
        private var outcome: Bool?

        /// Installs the continuation; resumes at once if already opened.
        func wait(on continuation: CheckedContinuation<Bool, Never>) {
            lock.lock()
            if let outcome {
                lock.unlock()
                continuation.resume(returning: outcome)
                return
            }
            self.continuation = continuation
            lock.unlock()
        }

        /// Opens the gate; only the first call has any effect.
        func open(started: Bool) {
            lock.lock()
            guard outcome == nil else {
                lock.unlock()
                return
            }
            outcome = started
            let pending = continuation
            continuation = nil
            lock.unlock()
            pending?.resume(returning: started)
        }
    }

    let (data, response) = try await URLSession.shared.data(for: request)

    guard let httpResponse = response as? HTTPURLResponse, httpResponse.statusCode == 200 else {
        let msg = String(data: data.prefix(2048), encoding: .utf8) ?? "Unknown error"
        throw NSError(domain: "TalkMode", code: 2, userInfo: [
            NSLocalizedDescriptionKey: "ElevenLabs API error: \(msg)"
        ])
    }

    let player = try AVAudioPlayer(data: data)
    let gate = PlaybackGate()
    let delegate = AudioPlayerDelegate {
        gate.open(started: true)
    }
    player.delegate = delegate
    player.prepareToPlay()

    audioPlayer = player
    mp3PlaybackStartTime = Date()
    defer {
        // Runs on every exit, so no stale player or start time is left behind.
        if audioPlayer === player { audioPlayer = nil }
        mp3PlaybackStartTime = nil
    }

    // Stopping the player does not invoke the delegate's finish callback, so
    // cancellation must open the gate too. `stopSpeakingInternal` cancels the
    // speak task, which triggers `onCancel` here. Otherwise this await would
    // never return and the JS call would never resolve.
    let started = await withTaskCancellationHandler {
        await withCheckedContinuation { (continuation: CheckedContinuation<Bool, Never>) in
            gate.wait(on: continuation)
            if !player.play() {
                gate.open(started: false)
            }
        }
    } onCancel: {
        gate.open(started: true)
    }

    player.stop()
    // `AVAudioPlayer.delegate` is weak; keep the delegate alive until here.
    withExtendedLifetime(delegate) {}

    if !started {
        throw NSError(domain: "TalkMode", code: 4, userInfo: [
            NSLocalizedDescriptionKey: "Audio playback could not be started"
        ])
    }
}
784
+
785
+ // MARK: - System TTS
786
+
787
/// Speaks `text` with `AVSpeechSynthesizer` and returns when the delegate
/// reports completion.
///
/// Uses a voice for `language` when one exists, otherwise a voice for the
/// current locale's language code (`"en"` if that is unavailable). A watchdog
/// fails the utterance with code 408 if it runs longer than an estimate
/// derived from the text length; task cancellation stops the synthesizer and
/// fails with code -999.
///
/// - Throws: Whatever error the speech delegate finishes with.
private func speakWithSystemTts(text: String, language: String? = nil) async throws {
    usedSystemTts = true
    setState("speaking", "Speaking (System)")

    let utterance = AVSpeechUtterance(string: text)
    let requestedVoice = language.flatMap { AVSpeechSynthesisVoice(language: $0) }
    utterance.voice = requestedVoice
        ?? AVSpeechSynthesisVoice(language: Locale.current.languageCode ?? "en")

    // Watchdog timeout: 0.08s per character, clamped to 3...180 seconds.
    let timeoutSeconds = min(max(Double(text.count) * 0.08, 3.0), 180.0)

    try await withTaskCancellationHandler {
        try await withCheckedThrowingContinuation { (continuation: CheckedContinuation<Void, Error>) in
            let speechDelegate = SystemSpeechDelegate(continuation: continuation)
            // The synthesizer does not retain its delegate; the property does.
            self.systemSpeechDelegate = speechDelegate
            self.systemSynthesizer.delegate = speechDelegate
            self.systemSynthesizer.speak(utterance)

            // Watchdog: force-finish if TTS takes too long
            speechDelegate.watchdog = Task { @MainActor in
                try? await Task.sleep(nanoseconds: UInt64(timeoutSeconds * 1_000_000_000))
                guard !speechDelegate.isFinished else { return }
                self.systemSynthesizer.stopSpeaking(at: .immediate)
                let timeout = NSError(domain: "TalkMode", code: 408, userInfo: [
                    NSLocalizedDescriptionKey: "System TTS timed out after \(Int(timeoutSeconds))s"
                ])
                speechDelegate.finish(error: timeout)
            }
        }
    } onCancel: {
        Task { @MainActor in
            self.systemSynthesizer.stopSpeaking(at: .immediate)
            let cancelled = NSError(domain: "TalkMode", code: -999, userInfo: [
                NSLocalizedDescriptionKey: "System TTS cancelled"
            ])
            self.systemSpeechDelegate?.finish(error: cancelled)
        }
    }
}
830
+
831
+ // MARK: - Stop Speaking
832
+
833
/// Stops every TTS output path (PCM stream, MP3 player, system synthesizer)
/// and cancels the in-flight speak task.
///
/// - Returns: Seconds elapsed since the recorded PCM (preferred) or MP3
///   playback start, or `nil` when nothing is being spoken or no start time
///   was recorded. The same value is stored in `lastInterruptedAtSeconds`.
@discardableResult
private func stopSpeakingInternal() -> Double? {
    guard isSpeakingValue else { return nil }

    pcmStopRequested = true

    // Compute how far into playback we were.
    let playbackStart = pcmPlaybackStartTime ?? mp3PlaybackStartTime
    let elapsed = playbackStart.map { Date().timeIntervalSince($0) }
    lastInterruptedAtSeconds = elapsed

    // PCM streaming engine
    pcmPlayerNode?.stop()
    pcmEngine?.stop()
    pcmEngine = nil
    pcmPlayerNode = nil
    pcmPlaybackStartTime = nil

    // MP3 player
    audioPlayer?.stop()
    audioPlayer = nil
    mp3PlaybackStartTime = nil

    // System TTS
    systemSynthesizer.stopSpeaking(at: .immediate)
    let interruption = NSError(domain: "TalkMode", code: -1, userInfo: [
        NSLocalizedDescriptionKey: "Speech interrupted by user"
    ])
    systemSpeechDelegate?.finish(error: interruption)

    // In-flight speak task
    speakTask?.cancel()

    isSpeakingValue = false
    return elapsed
}
876
+
877
+ // MARK: - Permissions
878
+
879
/// Asks for microphone (record) permission.
///
/// Uses `AVAudioApplication` on iOS 17 and later, and `AVAudioSession` on
/// earlier versions.
///
/// - Returns: `true` when the permission was granted.
private func requestMicrophonePermission() async -> Bool {
    await withCheckedContinuation { (continuation: CheckedContinuation<Bool, Never>) in
        if #available(iOS 17.0, *) {
            AVAudioApplication.requestRecordPermission { isGranted in
                continuation.resume(returning: isGranted)
            }
            return
        }

        AVAudioSession.sharedInstance().requestRecordPermission { isGranted in
            continuation.resume(returning: isGranted)
        }
    }
}
892
+
893
/// Asks for speech-recognition authorization.
///
/// - Returns: `true` only when the resulting status is `.authorized`.
private func requestSpeechPermission() async -> Bool {
    await withCheckedContinuation { (continuation: CheckedContinuation<Bool, Never>) in
        SFSpeechRecognizer.requestAuthorization { authorization in
            let isAuthorized = authorization == .authorized
            continuation.resume(returning: isAuthorized)
        }
    }
}
900
+
901
/// Maps the current microphone and speech-recognition authorization to the
/// strings `"granted"`, `"denied"` or `"prompt"`.
///
/// A restricted speech status is reported as `"denied"`; unknown future cases
/// are reported as `"prompt"`.
private func buildPermissionResult() -> JSObject {
    func microphoneState() -> String {
        switch AVAudioSession.sharedInstance().recordPermission {
        case .granted: return "granted"
        case .denied: return "denied"
        case .undetermined: return "prompt"
        @unknown default: return "prompt"
        }
    }

    func speechState() -> String {
        switch SFSpeechRecognizer.authorizationStatus() {
        case .authorized: return "granted"
        case .denied, .restricted: return "denied"
        case .notDetermined: return "prompt"
        @unknown default: return "prompt"
        }
    }

    let microphone = microphoneState()
    let speech = speechState()
    return [
        "microphone": microphone,
        "speechRecognition": speech
    ]
}
924
+
925
+ // MARK: - Audio Session
926
+
927
/// Configures the shared audio session for simultaneous playback and
/// recording in voice-chat mode, then activates it.
///
/// - Throws: Any error from setting the category or activating the session.
private func configureAudioSession() throws {
    let options: AVAudioSession.CategoryOptions = [
        .duckOthers,
        .mixWithOthers,
        .allowBluetoothA2DP,
        .defaultToSpeaker
    ]

    let sharedSession = AVAudioSession.sharedInstance()
    try sharedSession.setCategory(.playAndRecord, mode: .voiceChat, options: options)
    try sharedSession.setActive(true)
}
937
+
938
+ // MARK: - State & Events
939
+
940
/// Stores the new state and status text, then emits a `stateChange` event.
///
/// - Parameters:
///   - newState: The state to switch to.
///   - newStatusText: The status text stored alongside the state.
private func setState(_ newState: String, _ newStatusText: String) {
    // Remember the outgoing state before overwriting it so the event can report it.
    let outgoingState = state
    state = newState
    statusText = newStatusText

    let payload: [String: Any] = [
        "state": newState,
        "previousState": outgoingState,
        "statusText": newStatusText,
        "usingSystemTts": usedSystemTts
    ]
    notifyListeners("stateChange", data: payload)
}
952
+
953
/// Emits an `error` event to listeners.
///
/// - Parameters:
///   - code: Value sent under the `code` key.
///   - message: Value sent under the `message` key.
///   - recoverable: Value sent under the `recoverable` key.
private func emitError(code: String, message: String, recoverable: Bool) {
    let payload: [String: Any] = [
        "code": code,
        "message": message,
        "recoverable": recoverable
    ]
    notifyListeners("error", data: payload)
}
960
+
961
+ // MARK: - Voice Alias Resolution
962
+
963
/// Resolves a caller-supplied voice name or ID to the voice ID to use.
///
/// Resolution order:
/// 1. a key of `voiceAliases`, looked up with the lowercased input;
/// 2. a value of `voiceAliases` (case-insensitive match), returned as supplied;
/// 3. anything shaped like a raw ElevenLabs voice ID: at least 10 characters, all of
///    them ASCII letters, ASCII digits, `-` or `_`.
///
/// - Parameter value: The requested voice; surrounding whitespace is ignored.
/// - Returns: The resolved voice ID, or `nil` when the input is missing, blank or
///   not recognized.
private func resolveVoiceAlias(_ value: String?) -> String? {
    guard let trimmed = value?.trimmingCharacters(in: .whitespacesAndNewlines),
          !trimmed.isEmpty else {
        return nil
    }

    // Check the alias map.
    if let mapped = voiceAliases[trimmed.lowercased()] { return mapped }

    // Check whether the value is already one of the voice IDs the aliases point at.
    if voiceAliases.values.contains(where: { $0.caseInsensitiveCompare(trimmed) == .orderedSame }) {
        return trimmed
    }

    // Pass through values shaped like a raw voice ID. The ASCII check matters:
    // `isLetter` / `isNumber` on their own also accept accented letters, CJK
    // characters and symbols such as "½", none of which can be part of an ID.
    let looksLikeRawID = trimmed.count >= 10 && trimmed.allSatisfy { character in
        character.isASCII
            && (character.isLetter || character.isNumber || character == "-" || character == "_")
    }
    return looksLikeRawID ? trimmed : nil
}
987
+
988
+ // MARK: - TTS Parameter Validation
989
+
990
+ /// Resolve speed from either explicit speed or words-per-minute rate.
991
+ /// ElevenLabs accepts 0.5–2.0; WPM is normalized against 175 WPM baseline.
992
///
/// - Parameters:
///   - speed: Explicit speed multiplier; only consulted when `rateWpm` is `nil` or not positive.
///   - rateWpm: Words per minute; a positive value takes precedence over `speed`.
/// - Returns: The chosen speed when it lies within 0.5...2.0, otherwise `nil`.
private static func resolveSpeed(speed: Double?, rateWpm: Int?) -> Double? {
    let acceptedRange = 0.5...2.0

    // A positive WPM wins outright: when it is out of range the result is nil,
    // not a fallback to `speed`.
    let candidate: Double?
    if let rateWpm, rateWpm > 0 {
        candidate = Double(rateWpm) / 175.0
    } else {
        candidate = speed
    }

    guard let candidate, acceptedRange.contains(candidate) else { return nil }
    return candidate
}
1004
+
1005
+ /// Validate a 0–1 unit range parameter (stability, similarity, style).
1006
///
/// - Parameter value: The candidate value.
/// - Returns: `value` when it lies within 0...1 inclusive, otherwise `nil`.
private static func validatedUnit(_ value: Double?) -> Double? {
    value.flatMap { candidate in
        (0.0...1.0).contains(candidate) ? candidate : nil
    }
}
1010
+
1011
+ /// Validate seed (unsigned 32-bit integer range).
1012
///
/// - Parameter value: The candidate seed.
/// - Returns: `value` when it is exactly representable as a `UInt32`, otherwise `nil`.
private static func validatedSeed(_ value: Int?) -> Int? {
    guard let value, UInt32(exactly: value) != nil else { return nil }
    return value
}
1016
+
1017
+ /// Validate text normalization mode (auto/on/off).
1018
///
/// - Parameter value: The requested mode; whitespace and letter case are ignored.
/// - Returns: `"auto"`, `"on"` or `"off"`, or `nil` for any other input.
private static func validatedNormalize(_ value: String?) -> String? {
    guard let mode = value?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() else {
        return nil
    }

    switch mode {
    case "auto", "on", "off":
        return mode
    default:
        return nil
    }
}
1023
+
1024
+ /// Validate language code (2-letter ISO only).
1025
///
/// - Parameter value: The requested language code; whitespace and letter case are ignored.
/// - Returns: The lowercased code when it is exactly two ASCII letters, otherwise `nil`.
static func validatedLanguage(_ value: String?) -> String? {
    guard let value else { return nil }
    let code = value.trimmingCharacters(in: .whitespacesAndNewlines).lowercased()

    // `isLetter` alone also accepts non-ASCII letters (e.g. "日本", "éé"), which are
    // never valid two-letter ISO codes, so the ASCII check is required as well.
    guard code.count == 2, code.allSatisfy({ $0.isASCII && $0.isLetter }) else { return nil }
    return code
}
1031
+
1032
+ /// Validate latency optimization tier (1–4).
1033
///
/// - Parameter value: The candidate tier.
/// - Returns: `value` when it lies within 1...4 inclusive, otherwise `nil`.
private static func validatedLatencyTier(_ value: Int?) -> Int? {
    guard let tier = value else { return nil }
    return (1...4).contains(tier) ? tier : nil
}
1037
+
1038
+ /// Validate ElevenLabs output format string.
1039
///
/// - Parameter value: The requested format; whitespace and letter case are ignored.
/// - Returns: The lowercased format when it is one of the accepted identifiers, otherwise `nil`.
static func validatedOutputFormat(_ value: String?) -> String? {
    guard let candidate = value?.trimmingCharacters(in: .whitespacesAndNewlines).lowercased() else {
        return nil
    }

    switch candidate {
    case "mp3_22050_32", "mp3_44100_32", "mp3_44100_64",
         "mp3_44100_96", "mp3_44100_128", "mp3_44100_192",
         "pcm_16000", "pcm_22050", "pcm_24000", "pcm_44100",
         "ulaw_8000":
        return candidate
    default:
        return nil
    }
}
1050
+
1051
+ /// Extract sample rate from a PCM output format string (e.g. "pcm_24000" → 24000).
1052
///
/// - Parameter format: An output format string.
/// - Returns: 44100, 24000, 22050 or 16000 when `format` starts with `pcm_` and contains
///   that number; `nil` otherwise.
static func pcmSampleRate(from format: String?) -> Double? {
    guard let format, format.hasPrefix("pcm_") else { return nil }

    // Probed in this fixed order; the first token found in the string decides the rate.
    let knownRates: [(token: String, hertz: Double)] = [
        ("44100", 44100),
        ("24000", 24000),
        ("22050", 22050),
        ("16000", 16000)
    ]
    let match = knownRates.first { format.contains($0.token) }
    return match?.hertz
}
1060
+ }
1061
+
1062
+ // MARK: - SystemSpeechDelegate
1063
+
1064
+ /// Delegate for AVSpeechSynthesizer that bridges the callback-based API to async/await
1065
+ /// via a CheckedContinuation, with a watchdog timeout for safety.
1066
private class SystemSpeechDelegate: NSObject, AVSpeechSynthesizerDelegate {
    /// Continuation waiting on the utterance; cleared as soon as it is about to be resumed.
    private var continuation: CheckedContinuation<Void, Error>?
    /// Set by the first `finish(error:)` call; every later call returns immediately.
    var isFinished = false
    /// Task that `finish(error:)` cancels and releases.
    var watchdog: Task<Void, Never>?

    init(continuation: CheckedContinuation<Void, Error>) {
        self.continuation = continuation
        super.init()
    }

    /// Resumes the stored continuation at most once.
    ///
    /// - Parameter error: When non-nil the continuation throws it; otherwise it returns normally.
    func finish(error: Error? = nil) {
        if isFinished { return }
        isFinished = true

        watchdog?.cancel()
        watchdog = nil

        // Drop the stored reference before resuming so it cannot be resumed again.
        guard let pending = continuation else { return }
        continuation = nil

        switch error {
        case .some(let failure):
            pending.resume(throwing: failure)
        case .none:
            pending.resume(returning: ())
        }
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance) {
        finish()
    }

    func speechSynthesizer(_ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance) {
        // Cancellation is reported to the awaiting caller as a thrown error.
        let cancellation = NSError(
            domain: "TalkMode",
            code: -1,
            userInfo: [NSLocalizedDescriptionKey: "System TTS cancelled"]
        )
        finish(error: cancellation)
    }
}
1100
+
1101
+ // MARK: - AudioPlayerDelegate
1102
+
1103
+ /// Delegate for AVAudioPlayer (MP3 playback) that signals completion via a closure.
1104
private class AudioPlayerDelegate: NSObject, AVAudioPlayerDelegate {
    /// Completion closure; released after its first invocation.
    private var onComplete: (() -> Void)?

    init(onComplete: @escaping () -> Void) {
        self.onComplete = onComplete
        super.init()
    }

    func audioPlayerDidFinishPlaying(_ player: AVAudioPlayer, successfully flag: Bool) {
        // The `flag` value is not inspected; any finish signals completion.
        signalCompletion()
    }

    func audioPlayerDecodeErrorDidOccur(_ player: AVAudioPlayer, error: Error?) {
        // A decode error is reported to the owner the same way as a normal finish.
        signalCompletion()
    }

    /// Invokes the completion closure if it is still set, then releases it.
    private func signalCompletion() {
        onComplete?()
        onComplete = nil
    }
}