@elizaos/capacitor-talkmode 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1202 @@
1
+ package ai.eliza.plugins.talkmode
2
+
3
+ import android.Manifest
4
+ import android.content.Intent
5
+ import android.media.AudioAttributes
6
+ import android.media.AudioFormat
7
+ import android.media.AudioManager
8
+ import android.media.AudioTrack
9
+ import android.os.Bundle
10
+ import android.os.Handler
11
+ import android.os.Looper
12
+ import android.os.SystemClock
13
+ import android.speech.RecognitionListener
14
+ import android.speech.RecognizerIntent
15
+ import android.speech.SpeechRecognizer
16
+ import android.speech.tts.TextToSpeech
17
+ import android.speech.tts.UtteranceProgressListener
18
+ import android.util.Log
19
+ import com.getcapacitor.JSObject
20
+ import com.getcapacitor.PermissionState
21
+ import com.getcapacitor.Plugin
22
+ import com.getcapacitor.PluginCall
23
+ import com.getcapacitor.PluginMethod
24
+ import com.getcapacitor.annotation.CapacitorPlugin
25
+ import com.getcapacitor.annotation.Permission
26
+ import com.getcapacitor.annotation.PermissionCallback
27
+ import kotlinx.coroutines.*
28
+ import java.io.BufferedInputStream
29
+ import java.net.HttpURLConnection
30
+ import java.net.URL
31
+ import java.util.Locale
32
+ import java.util.UUID
33
+ import java.util.concurrent.atomic.AtomicBoolean
34
+ import kotlin.math.max
35
+ import org.json.JSONObject
36
+
37
+ @CapacitorPlugin(
38
+ name = "TalkMode",
39
+ permissions = [
40
+ Permission(alias = "microphone", strings = [Manifest.permission.RECORD_AUDIO])
41
+ ]
42
+ )
43
+ class TalkModePlugin : Plugin() {
44
+ companion object {
45
+ private const val TAG = "TalkMode"
46
+ private const val DEFAULT_MODEL_ID = "eleven_flash_v2_5"
47
+ private const val DEFAULT_OUTPUT_FORMAT = "pcm_24000"
48
+ }
49
+
50
+ private val mainHandler = Handler(Looper.getMainLooper())
51
+ private val scope = CoroutineScope(Dispatchers.Main + SupervisorJob())
52
+
53
+ // State
54
+ private var enabled = false
55
+ private var state = "idle"
56
+ private var statusText = "Off"
57
+
58
+ // Speech recognition
59
+ private var recognizer: SpeechRecognizer? = null
60
+ private var isListening = false
61
+ private var listeningMode = false
62
+ private var stopRequested = false
63
+ private var restartJob: Job? = null
64
+ private var lastTranscript = ""
65
+ private var lastHeardAtMs: Long? = null
66
+ private var silenceJob: Job? = null
67
+ private val silenceWindowMs = 700L
68
+
69
+ // TTS
70
+ private var systemTts: TextToSpeech? = null
71
+ private var systemTtsReady = false
72
+ private var systemTtsPendingId: String? = null
73
+ private var systemTtsPending: CompletableDeferred<Unit>? = null
74
+ private var pcmTrack: AudioTrack? = null
75
+ private val pcmStopRequested = AtomicBoolean(false)
76
+ private var speakingJob: Job? = null
77
+ private var isSpeaking = false
78
+ private var usedSystemTts = false
79
+ private var lastSpokenText: String? = null
80
+ private var speakStartTimeMs: Long = 0
81
+ private var lastInterruptedAtSeconds: Double? = null
82
+
83
+ // Audio focus
84
+ private var audioManager: AudioManager? = null
85
+ private var audioFocusRequest: AudioManager.OnAudioFocusChangeListener? = null
86
+
87
+ // Config
88
+ private var apiKey: String? = null
89
+ private var voiceId: String? = null
90
+ private var modelId: String? = DEFAULT_MODEL_ID
91
+ private var outputFormat: String? = DEFAULT_OUTPUT_FORMAT
92
+ private var voiceAliases: Map<String, String> = emptyMap()
93
+ private var interruptOnSpeech = true
94
+ private var sessionKey = "main"
95
+ private var sttLanguage: String? = null
96
+
97
+ // ── Recognition listener ────────────────────────────────────────────
98
+
99
/**
 * Listener attached to every [SpeechRecognizer] this plugin creates.
 *
 * Recognised text is handed to [handleTranscript]; the end of an utterance,
 * a final result, and every recoverable error each schedule a recognizer
 * restart through [scheduleRestart].
 */
private val recognitionListener = object : RecognitionListener {

    /** Text used in logs and `error` events for a [SpeechRecognizer] error code. */
    private fun describe(code: Int): String = when (code) {
        SpeechRecognizer.ERROR_AUDIO -> "Audio recording error"
        SpeechRecognizer.ERROR_CLIENT -> "Client error"
        SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS -> "Insufficient permissions"
        SpeechRecognizer.ERROR_NETWORK -> "Network error"
        SpeechRecognizer.ERROR_NETWORK_TIMEOUT -> "Network timeout"
        SpeechRecognizer.ERROR_NO_MATCH -> "No match"
        SpeechRecognizer.ERROR_RECOGNIZER_BUSY -> "Recognizer busy"
        SpeechRecognizer.ERROR_SERVER -> "Server error"
        SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> "Speech timeout"
        else -> "Unknown error"
    }

    /** Sends an `error` event carrying the fixed `recognition_error` code. */
    private fun emitRecognitionError(message: String, recoverable: Boolean) {
        val payload = JSObject()
        payload.put("code", "recognition_error")
        payload.put("message", message)
        payload.put("recoverable", recoverable)
        notifyListeners("error", payload)
    }

    /** First recognition hypothesis in [bundle], trimmed; empty when there is none. */
    private fun topHypothesis(bundle: Bundle?): String =
        bundle?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION)?.firstOrNull()?.trim() ?: ""

    override fun onReadyForSpeech(params: Bundle?) {
        Log.d(TAG, "Ready for speech")
        val activelyListening = enabled && isListening
        if (activelyListening) setState("listening", "Listening")
    }

    override fun onBeginningOfSpeech() {
        Log.d(TAG, "Beginning of speech")
    }

    override fun onRmsChanged(rmsdB: Float) = Unit

    override fun onBufferReceived(buffer: ByteArray?) = Unit

    override fun onEndOfSpeech() {
        Log.d(TAG, "End of speech")
        scheduleRestart()
    }

    override fun onError(error: Int) {
        // Errors that arrive after a stop request are ignored entirely.
        if (stopRequested) return

        val errorMsg = describe(error)
        Log.d(TAG, "Recognition error: $errorMsg ($error)")

        when (error) {
            SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS -> {
                // Not recoverable: report it and do not schedule a restart.
                emitRecognitionError("Microphone permission required", recoverable = false)
                return
            }
            // No-match and speech-timeout are not reported; the recognizer is just restarted.
            SpeechRecognizer.ERROR_NO_MATCH,
            SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> Unit
            else -> emitRecognitionError(errorMsg, recoverable = true)
        }

        scheduleRestart(delayMs = 600)
    }

    override fun onResults(results: Bundle?) {
        val heard = topHypothesis(results)
        if (heard.isNotEmpty()) handleTranscript(heard, isFinal = true)
        scheduleRestart()
    }

    override fun onPartialResults(partialResults: Bundle?) {
        val heard = topHypothesis(partialResults)
        if (heard.isNotEmpty()) handleTranscript(heard, isFinal = false)
    }

    override fun onEvent(eventType: Int, params: Bundle?) = Unit
}
178
+
179
+ // ── Lifecycle ────────────────────────────────────────────────────────
180
+
181
/**
 * Plugin load hook: looks up the system [AudioManager] (left null when the
 * service is not an [AudioManager]) and starts system TTS initialisation.
 */
override fun load() {
    super.load()
    val audioService = context.getSystemService(android.content.Context.AUDIO_SERVICE)
    audioManager = audioService as? AudioManager
    initSystemTts()
}
186
+
187
/**
 * Creates the platform [TextToSpeech] engine. When initialisation succeeds the
 * engine is set to the default locale and given a progress listener that
 * settles [systemTtsPending] for the utterance identified by [systemTtsPendingId].
 */
private fun initSystemTts() {
    val progressListener = object : UtteranceProgressListener() {
        /**
         * Settles the pending deferred if [id] is the utterance being awaited:
         * normally when [failureMessage] is null, exceptionally otherwise.
         * Callbacks for any other utterance are ignored.
         */
        private fun settle(id: String?, failureMessage: String?) {
            if (id == null || id != systemTtsPendingId) return
            if (failureMessage == null) {
                systemTtsPending?.complete(Unit)
            } else {
                systemTtsPending?.completeExceptionally(IllegalStateException(failureMessage))
            }
            systemTtsPending = null
            systemTtsPendingId = null
        }

        override fun onStart(id: String?) = Unit

        override fun onDone(id: String?) = settle(id, failureMessage = null)

        @Deprecated("Deprecated in Java")
        override fun onError(id: String?) = settle(id, failureMessage = "System TTS error")

        override fun onError(id: String?, errorCode: Int) =
            settle(id, failureMessage = "System TTS error $errorCode")
    }

    systemTts = TextToSpeech(context) { status ->
        systemTtsReady = status == TextToSpeech.SUCCESS
        if (!systemTtsReady) {
            Log.w(TAG, "System TTS init failed")
        } else {
            systemTts?.language = Locale.getDefault()
            systemTts?.setOnUtteranceProgressListener(progressListener)
            Log.d(TAG, "System TTS initialized")
        }
    }
}
230
+
231
+ // ── Plugin methods ──────────────────────────────────────────────────
232
+
233
/**
 * Plugin entry point that turns talk mode on.
 *
 * Resolves with `started=false` when speech recognition is unavailable,
 * requests the microphone permission when it has not been granted, and
 * otherwise continues in [startInternal].
 */
@PluginMethod
fun start(call: PluginCall) {
    when {
        !SpeechRecognizer.isRecognitionAvailable(context) -> {
            val unavailable = JSObject()
            unavailable.put("started", false)
            unavailable.put("error", "Speech recognition not available")
            call.resolve(unavailable)
        }
        getPermissionState("microphone") != PermissionState.GRANTED ->
            requestPermissionForAlias("microphone", call, "handleStartPermission")
        else -> startInternal(call)
    }
}
250
+
251
/**
 * Permission callback for [start]: continues in [startInternal] when the
 * microphone permission is now granted, otherwise resolves with `started=false`.
 */
@PermissionCallback
private fun handleStartPermission(call: PluginCall) {
    val granted = getPermissionState("microphone") == PermissionState.GRANTED
    if (!granted) {
        val denied = JSObject()
        denied.put("started", false)
        denied.put("error", "Microphone permission denied")
        call.resolve(denied)
        return
    }
    startInternal(call)
}
262
+
263
/**
 * Applies the optional `config` object of [call], marks talk mode as enabled
 * and, on the main handler, replaces the recognizer, starts listening and
 * starts the silence monitor. [call] is resolved with `started` (plus `error`
 * when set-up throws).
 */
private fun startInternal(call: PluginCall) {
    call.getObject("config")?.let { suppliedConfig -> applyConfig(suppliedConfig) }

    enabled = true
    stopRequested = false
    listeningMode = true
    setState("listening", "Listening")

    mainHandler.post(Runnable {
        try {
            // Any earlier recognizer is destroyed before a fresh one is created.
            recognizer?.destroy()
            val fresh = SpeechRecognizer.createSpeechRecognizer(context)
            fresh.setRecognitionListener(recognitionListener)
            recognizer = fresh

            startListeningInternal(markListening = true)
            startSilenceMonitor()

            val success = JSObject()
            success.put("started", true)
            call.resolve(success)
        } catch (e: Exception) {
            Log.e(TAG, "Failed to start", e)
            val failure = JSObject()
            failure.put("started", false)
            failure.put("error", e.message ?: "Failed to start")
            call.resolve(failure)
        }
    })
}
296
+
297
/**
 * Plugin entry point that turns talk mode off: clears the listening flags and
 * pending jobs, tears the recognizer down on the main handler, stops any
 * speech in progress and reports the `idle` state.
 */
@PluginMethod
fun stop(call: PluginCall) {
    // Flags first, so callbacks that consult them see the stop.
    enabled = false
    stopRequested = true
    listeningMode = false
    isListening = false

    // Pending restart / silence timers.
    restartJob?.cancel()
    restartJob = null
    silenceJob?.cancel()
    silenceJob = null

    // Transcript not yet finalised is dropped.
    lastTranscript = ""
    lastHeardAtMs = null

    mainHandler.post(Runnable {
        recognizer?.let { active ->
            active.cancel()
            active.destroy()
        }
        recognizer = null
    })

    stopSpeakingInternal()
    setState("idle", "Off")
    call.resolve()
}
320
+
321
/** Resolves [call] with the current `enabled` flag. */
@PluginMethod
fun isEnabled(call: PluginCall) {
    val result = JSObject()
    result.put("enabled", enabled)
    call.resolve(result)
}
327
+
328
/** Resolves [call] with the current `state` and `statusText`. */
@PluginMethod
fun getState(call: PluginCall) {
    val snapshot = JSObject()
    snapshot.put("state", state)
    snapshot.put("statusText", statusText)
    call.resolve(snapshot)
}
335
+
336
/**
 * Applies the `config` object of [call] through [applyConfig] when one is
 * present; resolves without a payload either way.
 */
@PluginMethod
fun updateConfig(call: PluginCall) {
    val suppliedConfig = call.getObject("config")
    if (suppliedConfig != null) {
        applyConfig(suppliedConfig)
    }
    call.resolve()
}
345
+
346
/**
 * Plugin entry point that speaks the `text` argument.
 *
 * Missing or blank-after-trim text resolves immediately as completed. Otherwise
 * the work is launched on [scope] via [speakInternal], which resolves [call];
 * the launched job is remembered in [speakingJob].
 */
@PluginMethod
fun speak(call: PluginCall) {
    val text = call.getString("text")?.trim()
    if (text.isNullOrEmpty()) {
        // Nothing to say: report a trivially completed utterance.
        val nothingSpoken = JSObject()
        nothingSpoken.put("completed", true)
        nothingSpoken.put("interrupted", false)
        nothingSpoken.put("usedSystemTts", false)
        call.resolve(nothingSpoken)
        return
    }

    val directive = call.getObject("directive")
    val useSystemTts = call.getBoolean("useSystemTts", false) ?: false

    speakingJob = scope.launch {
        speakInternal(text, useSystemTts, directive, call)
    }
}
373
+
374
/**
 * Stops any speech in progress. The result carries `interruptedAt` (seconds
 * since the utterance started) only when something was being spoken.
 */
@PluginMethod
fun stopSpeaking(call: PluginCall) {
    // Must be sampled before stopping: stopSpeakingInternal clears isSpeaking.
    val interruptedAt = computeInterruptedAt()
    stopSpeakingInternal()

    val result = JSObject()
    interruptedAt?.let { seconds -> result.put("interruptedAt", seconds) }
    call.resolve(result)
}
384
+
385
/** Resolves [call] with the current `speaking` flag. */
@PluginMethod
fun isSpeaking(call: PluginCall) {
    val result = JSObject()
    result.put("speaking", isSpeaking)
    call.resolve(result)
}
391
+
392
/** Resolves [call] with the result built by `buildPermissionResult()`. */
@PluginMethod
override fun checkPermissions(call: PluginCall) {
    val permissions = buildPermissionResult()
    call.resolve(permissions)
}
396
+
397
/**
 * Resolves with the current permission result when RECORD_AUDIO is already
 * granted; otherwise requests the `microphone` alias and finishes in
 * [handlePermissionResult].
 */
@PluginMethod
override fun requestPermissions(call: PluginCall) {
    if (isPermissionGranted(Manifest.permission.RECORD_AUDIO)) {
        call.resolve(buildPermissionResult())
        return
    }
    requestPermissionForAlias("microphone", call, "handlePermissionResult")
}
405
+
406
/** Permission callback for [requestPermissions]: resolves with the permission result. */
@PermissionCallback
private fun handlePermissionResult(call: PluginCall) {
    val permissions = buildPermissionResult()
    call.resolve(permissions)
}
410
+
411
+ // ── Config ──────────────────────────────────────────────────────────
412
+
413
/**
 * Merges [config] into the plugin settings.
 *
 * String settings are only overwritten by non-empty values. An `outputFormat`
 * rejected by `validatedOutputFormat` leaves the current format in place.
 * `voiceAliases`, when present, replaces the whole alias map (keys trimmed and
 * lower-cased, blank values skipped). `silenceWindowMs` is logged and ignored.
 */
private fun applyConfig(config: JSObject) {
    config.optJSONObject("tts")?.let { ttsSection ->
        val newApiKey = ttsSection.stringOrNull("apiKey")
        if (!newApiKey.isNullOrEmpty()) apiKey = newApiKey

        val newVoiceId = ttsSection.stringOrNull("voiceId")
        if (!newVoiceId.isNullOrEmpty()) voiceId = newVoiceId

        val newModelId = ttsSection.stringOrNull("modelId")
        if (!newModelId.isNullOrEmpty()) modelId = newModelId

        val newFormat = ttsSection.stringOrNull("outputFormat")
        if (!newFormat.isNullOrEmpty()) {
            outputFormat = validatedOutputFormat(newFormat) ?: outputFormat
        }

        if (ttsSection.has("interruptOnSpeech")) {
            interruptOnSpeech = ttsSection.optBoolean("interruptOnSpeech", true)
        }

        ttsSection.optJSONObject("voiceAliases")?.let { aliasSection ->
            val collected = mutableMapOf<String, String>()
            for (aliasName in aliasSection.keys()) {
                val target = aliasSection.stringOrNull(aliasName)?.trim()
                if (target.isNullOrEmpty()) continue
                collected[aliasName.trim().lowercase()] = target
            }
            voiceAliases = collected
        }
    }

    config.optJSONObject("stt")?.let { sttSection ->
        val requestedLanguage = sttSection.stringOrNull("language")
        if (!requestedLanguage.isNullOrEmpty()) {
            sttLanguage = validatedLanguage(requestedLanguage)
        }
    }

    val newSessionKey = config.stringOrNull("sessionKey")
    if (!newSessionKey.isNullOrEmpty()) sessionKey = newSessionKey

    if (config.has("silenceWindowMs")) {
        // The silence window is a fixed value here; the request is only logged.
        Log.d(TAG, "silenceWindowMs config ignored on Android (fixed at ${silenceWindowMs}ms)")
    }
}
453
+
454
+ // ── STT internals ───────────────────────────────────────────────────
455
+
456
/**
 * Starts a recognition session on the current recognizer. Does nothing after a
 * stop request or when no recognizer exists.
 *
 * @param markListening when true, also sets [isListening] and reports the
 *   `listening` state; callers pass false when the session is only used to
 *   detect the user talking over TTS playback.
 */
private fun startListeningInternal(markListening: Boolean) {
    val activeRecognizer = recognizer
    if (stopRequested || activeRecognizer == null) return

    val request = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH)
    request.putExtra(RecognizerIntent.EXTRA_LANGUAGE_MODEL, RecognizerIntent.LANGUAGE_MODEL_FREE_FORM)
    request.putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
    request.putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
    request.putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)
    val language = sttLanguage
    if (language != null) {
        request.putExtra(RecognizerIntent.EXTRA_LANGUAGE, language)
    }

    if (markListening) {
        isListening = true
        setState("listening", "Listening")
    }

    try {
        activeRecognizer.startListening(request)
    } catch (e: Exception) {
        // Logged only; the state set above is left as is.
        Log.e(TAG, "Failed to start listening", e)
    }
}
479
+
480
/**
 * Schedules a recognizer restart after [delayMs], replacing any restart that is
 * already pending. When the delay elapses the recognizer is cancelled and a new
 * session is started if the plugin is in listening mode or is speaking with
 * interrupt-on-speech enabled. Nothing happens once a stop has been requested.
 */
private fun scheduleRestart(delayMs: Long = 350) {
    if (stopRequested) return

    val restart = Runnable {
        if (!stopRequested) {
            try {
                recognizer?.cancel()
                val wantsTranscripts = listeningMode
                val wantsInterruptDetection = isSpeaking && interruptOnSpeech
                if (wantsTranscripts || wantsInterruptDetection) {
                    startListeningInternal(markListening = wantsTranscripts)
                }
            } catch (_: Throwable) {
                // Will be picked up by onError and retry again
            }
        }
    }

    restartJob?.cancel()
    restartJob = scope.launch {
        delay(delayMs)
        mainHandler.post(restart)
    }
}
499
+
500
/**
 * (Re)starts the polling job that calls [checkSilence] every 200 ms for as
 * long as [enabled] stays true. Any previous monitor job is cancelled first.
 */
private fun startSilenceMonitor() {
    silenceJob?.cancel()
    val monitor = scope.launch {
        while (true) {
            if (!enabled) break
            delay(200)
            checkSilence()
        }
    }
    silenceJob = monitor
}
509
+
510
/**
 * Finalises the pending transcript once nothing new has been heard for at
 * least [silenceWindowMs]: emits it as a `transcript` event with
 * `isFinal=true` and clears the pending text and timestamp.
 */
private fun checkSilence() {
    val pendingText = lastTranscript.trim()
    val heardAt = lastHeardAtMs
    if (!isListening || pendingText.isEmpty() || heardAt == null) return

    val quietForMs = SystemClock.elapsedRealtime() - heardAt
    if (quietForMs >= silenceWindowMs) {
        val event = JSObject()
        event.put("transcript", pendingText)
        event.put("isFinal", true)
        notifyListeners("transcript", event)

        lastTranscript = ""
        lastHeardAtMs = null
    }
}
526
+
527
/**
 * Routes a recognised [transcript].
 *
 * While the plugin is speaking with interrupt-on-speech enabled, the text is
 * used only to decide whether playback should be interrupted and is never
 * forwarded. Otherwise, when listening, it is emitted as a `transcript` event.
 *
 * Partial text is kept as the pending transcript so that [checkSilence] can
 * finalise it after the silence window. Text the recognizer already marked as
 * final is emitted here and the pending transcript is cleared; otherwise
 * [checkSilence] would emit the same text as a second final event.
 *
 * @param transcript recognised text; empty text is ignored.
 * @param isFinal whether the recognizer reported this as a final result.
 */
private fun handleTranscript(transcript: String, isFinal: Boolean) {
    if (transcript.isEmpty()) return

    // If speaking and interrupt enabled, check for interruption
    if (isSpeaking && interruptOnSpeech) {
        if (shouldInterrupt(transcript)) {
            // Sample the position first: stopSpeakingInternal clears isSpeaking.
            val interruptedAt = computeInterruptedAt()
            stopSpeakingInternal()
            lastInterruptedAtSeconds = interruptedAt
        }
        return
    }

    if (!isListening) return

    if (isFinal) {
        // Delivered as final just below; leave nothing for the silence monitor to re-send.
        lastTranscript = ""
        lastHeardAtMs = null
    } else {
        lastTranscript = transcript
        lastHeardAtMs = SystemClock.elapsedRealtime()
    }

    notifyListeners("transcript", JSObject().apply {
        put("transcript", transcript)
        put("isFinal", isFinal)
    })
}
552
+
553
/**
 * Decides whether heard text should cut off the current utterance.
 *
 * Text shorter than three characters after trimming never interrupts, and
 * neither does text that occurs (case-insensitively) inside the text currently
 * being spoken, since that is taken to be the speaker's own echo.
 */
private fun shouldInterrupt(transcript: String): Boolean {
    val heard = transcript.trim()
    if (heard.length < 3) return false
    val isEchoOfOwnSpeech = lastSpokenText?.lowercase()?.contains(heard.lowercase()) == true
    return !isEchoOfOwnSpeech
}
564
+
565
/**
 * Keeps a recognition session running while speaking so the user can be heard
 * talking over playback. Only acts when interrupt-on-speech and talk mode are
 * both enabled; the work runs on the main handler and creates a recognizer if
 * none exists. Failures are ignored.
 */
private fun ensureInterruptListener() {
    val interruptDetectionWanted = interruptOnSpeech && enabled
    if (!interruptDetectionWanted) return

    mainHandler.post(Runnable {
        if (stopRequested || !SpeechRecognizer.isRecognitionAvailable(context)) return@Runnable
        try {
            val active = recognizer ?: SpeechRecognizer.createSpeechRecognizer(context).also { created ->
                created.setRecognitionListener(recognitionListener)
                recognizer = created
            }
            active.cancel()
            // markListening = false: this session does not switch the plugin to "listening".
            startListeningInternal(markListening = false)
        } catch (_: Throwable) {
        }
    })
}
585
+
586
+ // ── TTS internals ───────────────────────────────────────────────────
587
+
588
/**
 * Speaks [text] and resolves [call] with the outcome.
 *
 * ElevenLabs streaming is used when it is not overridden by [forceSystemTts]
 * and both an API key and a voice id are available; any failure on that path
 * that was not caused by a stop request falls back to system TTS. Whatever the
 * outcome, a `speakComplete` event is emitted afterwards and the plugin
 * returns to listening (or to idle when talk mode is off).
 *
 * @param text text to speak (already trimmed and non-empty at the call site).
 * @param forceSystemTts skip ElevenLabs even when it is configured.
 * @param directive optional per-utterance overrides; `voiceId` is read here,
 *   the rest in [buildElevenLabsRequest].
 * @param call plugin call resolved with `completed` / `interrupted` /
 *   `usedSystemTts` and optionally `interruptedAt` or `error`.
 */
private suspend fun speakInternal(
    text: String,
    forceSystemTts: Boolean,
    directive: JSObject?,
    call: PluginCall
) {
    isSpeaking = true
    usedSystemTts = false
    lastSpokenText = text
    // Forget the previous utterance's interruption point so it cannot leak into this one's results.
    lastInterruptedAtSeconds = null
    speakStartTimeMs = SystemClock.elapsedRealtime()
    pcmStopRequested.set(false)
    setState("speaking", "Speaking")

    val effectiveVoiceId = directive.stringOrNull("voiceId")?.let(::resolveVoiceAlias) ?: voiceId
    val effectiveApiKey = apiKey

    // Result payload for an ElevenLabs utterance that was stopped part-way.
    fun interruptedResult(): JSObject = JSObject().apply {
        put("completed", false)
        put("interrupted", true)
        put("usedSystemTts", false)
        lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
    }

    notifyListeners("speaking", JSObject().apply {
        put("text", text)
        put("isSystemTts", forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
    })

    // Stop listening during speech (we keep recognizer for interrupt detection)
    mainHandler.post { recognizer?.stopListening() }
    ensureInterruptListener()

    // Request audio focus
    requestAudioFocus()

    try {
        // Checked inline so both values are smart-cast to non-null below.
        if (!forceSystemTts && !effectiveApiKey.isNullOrEmpty() && !effectiveVoiceId.isNullOrEmpty()) {
            try {
                val request = buildElevenLabsRequest(text, directive)
                streamAndPlayPcm(
                    voiceId = effectiveVoiceId,
                    apiKey = effectiveApiKey,
                    request = request
                )

                if (pcmStopRequested.get()) {
                    call.resolve(interruptedResult())
                } else {
                    call.resolve(JSObject().apply {
                        put("completed", true)
                        put("interrupted", false)
                        put("usedSystemTts", false)
                    })
                }
            } catch (e: Exception) {
                if (pcmStopRequested.get()) {
                    // The failure is a consequence of the stop, not a TTS error.
                    call.resolve(interruptedResult())
                } else {
                    Log.w(TAG, "ElevenLabs TTS failed, falling back to system", e)
                    speakWithSystemTts(text, call)
                }
            }
        } else {
            speakWithSystemTts(text, call)
        }
    } catch (e: Exception) {
        Log.e(TAG, "Speak failed", e)
        call.resolve(JSObject().apply {
            put("completed", false)
            put("interrupted", false)
            put("usedSystemTts", usedSystemTts)
            put("error", e.message ?: "Speak failed")
        })
    } finally {
        // Read the stop flag while clearing it; reading it after the reset
        // would always report the utterance as completed.
        val wasStopped = pcmStopRequested.getAndSet(false)
        isSpeaking = false
        abandonAudioFocus()

        notifyListeners("speakComplete", JSObject().apply {
            put("completed", !wasStopped)
            lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
        })

        if (enabled) {
            listeningMode = true
            setState("listening", "Listening")
            mainHandler.post { startListeningInternal(markListening = true) }
        } else {
            setState("idle", "Off")
        }
    }
}
686
+
687
/**
 * Assembles the ElevenLabs request for [text] from the per-utterance
 * [directive] and the configured defaults.
 *
 * Numeric directive values are read with a negative sentinel so that absent
 * keys become null; `speed` and `rateWpm` must be strictly positive, the other
 * numeric options non-negative. Each value is then passed through its
 * `validated*` / `resolveSpeed` helper.
 */
private fun buildElevenLabsRequest(text: String, directive: JSObject?): ElevenLabsRequest {
    // Directive readers: null when the directive or key is absent or below the accepted minimum.
    fun positiveDouble(key: String): Double? = directive?.optDouble(key, -1.0)?.takeIf { it > 0 }
    fun nonNegativeDouble(key: String): Double? = directive?.optDouble(key, -1.0)?.takeIf { it >= 0 }
    fun positiveInt(key: String): Int? = directive?.optInt(key, -1)?.takeIf { it > 0 }
    fun nonNegativeInt(key: String): Int? = directive?.optInt(key, -1)?.takeIf { it >= 0 }
    fun nonNegativeLong(key: String): Long? = directive?.optLong(key, -1)?.takeIf { it >= 0 }

    // Model: directive override, then configured model, then the built-in default.
    val chosenModel = directive.stringOrNull("modelId")?.takeIf { it.isNotEmpty() }
        ?: modelId
        ?: DEFAULT_MODEL_ID

    val requestedFormat = directive.stringOrNull("outputFormat") ?: outputFormat

    return ElevenLabsRequest(
        text = text,
        modelId = chosenModel,
        outputFormat = validatedOutputFormat(requestedFormat) ?: DEFAULT_OUTPUT_FORMAT,
        speed = resolveSpeed(positiveDouble("speed"), positiveInt("rateWpm")),
        stability = validatedStability(nonNegativeDouble("stability"), chosenModel),
        similarity = validatedUnit(nonNegativeDouble("similarity")),
        style = validatedUnit(nonNegativeDouble("style")),
        // Stays null unless the directive explicitly carries the key.
        speakerBoost = directive?.takeIf { it.has("speakerBoost") }?.optBoolean("speakerBoost", false),
        seed = validatedSeed(nonNegativeLong("seed")),
        normalize = validatedNormalize(directive.stringOrNull("normalize")),
        language = validatedLanguage(directive.stringOrNull("language")),
        latencyTier = validatedLatencyTier(nonNegativeInt("latencyTier"))
    )
}
743
+
744
/**
 * String form of the value stored under [key], or null when the receiver is
 * null or the key is absent or holds JSON null. Non-string values are
 * converted with `toString()`.
 */
private fun JSObject?.stringOrNull(key: String): String? {
    val source = this ?: return null
    if (!source.has(key) || source.isNull(key)) return null
    return source.opt(key)?.takeUnless { it === JSONObject.NULL }?.toString()
}
749
+
750
/**
 * String form of the value stored under [key], or null when the receiver is
 * null or the key is absent or holds JSON null. Non-string values are
 * converted with `toString()`.
 */
private fun JSONObject?.stringOrNull(key: String): String? {
    val source = this ?: return null
    if (!source.has(key) || source.isNull(key)) return null
    return source.opt(key)?.takeUnless { it === JSONObject.NULL }?.toString()
}
755
+
756
/**
 * Streams 16-bit mono PCM from the ElevenLabs endpoint and plays it through an
 * [AudioTrack] in streaming mode, on the IO dispatcher.
 *
 * Returns normally both when playback finished and when a stop was requested
 * through [pcmStopRequested]; callers tell the two apart by reading that flag.
 * The track and the HTTP connection are always released, including when
 * starting playback or opening the connection fails.
 *
 * @param voiceId ElevenLabs voice to synthesise with.
 * @param apiKey ElevenLabs API key.
 * @param request synthesis parameters; its `outputFormat` determines the sample rate.
 * @throws IllegalStateException when the track cannot be set up or written to,
 *   or when the API answers with a status of 400 or above.
 */
private suspend fun streamAndPlayPcm(
    voiceId: String,
    apiKey: String,
    request: ElevenLabsRequest
) = withContext(Dispatchers.IO) {
    pcmStopRequested.set(false)

    val sampleRate = parsePcmSampleRate(request.outputFormat) ?: 24000
    val minBuffer = AudioTrack.getMinBufferSize(
        sampleRate,
        AudioFormat.CHANNEL_OUT_MONO,
        AudioFormat.ENCODING_PCM_16BIT
    )
    if (minBuffer <= 0) {
        throw IllegalStateException("AudioTrack buffer size invalid: $minBuffer")
    }

    val bufferSize = max(minBuffer * 2, 8 * 1024)
    val track = AudioTrack.Builder()
        .setAudioAttributes(
            AudioAttributes.Builder()
                .setUsage(AudioAttributes.USAGE_ASSISTANT)
                .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                .build()
        )
        .setAudioFormat(
            AudioFormat.Builder()
                .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
                .setSampleRate(sampleRate)
                .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
                .build()
        )
        .setBufferSizeInBytes(bufferSize)
        .setTransferMode(AudioTrack.MODE_STREAM)
        .build()

    if (track.state != AudioTrack.STATE_INITIALIZED) {
        track.release()
        throw IllegalStateException("AudioTrack init failed")
    }
    pcmTrack = track

    // Everything from here on runs under the finally below, so a failure while
    // starting playback or opening the connection still releases the track.
    var openedConnection: HttpURLConnection? = null
    try {
        track.play()
        Log.d(TAG, "PCM play start sampleRate=$sampleRate bufferSize=$bufferSize")

        val conn = openTtsConnection(voiceId, apiKey, request).also { openedConnection = it }
        val payload = buildRequestPayload(request)
        conn.outputStream.use { it.write(payload.toByteArray()) }

        val code = conn.responseCode
        if (code >= 400) {
            val errBody = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
            throw IllegalStateException("ElevenLabs API error: $code $errBody")
        }

        var totalBytesWritten = 0L
        BufferedInputStream(conn.inputStream).use { input ->
            val buffer = ByteArray(8 * 1024)
            while (true) {
                if (pcmStopRequested.get()) return@withContext
                val bytesRead = input.read(buffer)
                if (bytesRead <= 0) break

                // Write all bytes, handling partial writes
                var offset = 0
                while (offset < bytesRead) {
                    if (pcmStopRequested.get()) return@withContext
                    val wrote = try {
                        track.write(buffer, offset, bytesRead - offset)
                    } catch (e: Throwable) {
                        // A write that fails after a stop request is not treated as an error.
                        if (pcmStopRequested.get()) return@withContext
                        throw e
                    }
                    if (wrote <= 0) {
                        if (pcmStopRequested.get()) return@withContext
                        throw IllegalStateException("AudioTrack write failed: $wrote")
                    }
                    offset += wrote
                }
                totalBytesWritten += bytesRead
            }
        }

        if (!pcmStopRequested.get()) {
            track.stop()
            // stop() lets the audio already queued play out, but the cleanup in
            // the finally block flushes the track. Wait for at most one buffer's
            // worth of audio (16-bit mono = 2 bytes per frame) so the tail is heard.
            val pendingBytes = minOf(totalBytesWritten, bufferSize.toLong())
            val drainMs = pendingBytes * 1000L / (sampleRate * 2L) + 50L
            val drainDeadline = SystemClock.elapsedRealtime() + drainMs
            while (!pcmStopRequested.get() && SystemClock.elapsedRealtime() < drainDeadline) {
                delay(20)
            }
        }
        Log.d(TAG, "PCM play done")
    } finally {
        cleanupPcmTrack()
        openedConnection?.disconnect()
    }
}
851
+
852
/**
 * Prepares (without sending) a POST to the ElevenLabs streaming TTS endpoint
 * for [voiceId].
 *
 * The voice id is percent-encoded before it is placed in the URL path, so an
 * id containing reserved characters cannot alter the path or add query
 * parameters. `optimize_streaming_latency` is appended only when the request
 * carries a latency tier.
 *
 * @param voiceId ElevenLabs voice identifier.
 * @param apiKey value sent in the `xi-api-key` header.
 * @param request supplies the latency tier and the output format used for the
 *   `Accept` header.
 * @return a connection configured for output with 30 s connect and read timeouts.
 */
private fun openTtsConnection(
    voiceId: String,
    apiKey: String,
    request: ElevenLabsRequest
): HttpURLConnection {
    // URLEncoder targets form encoding and writes spaces as '+'; a path segment needs %20.
    val encodedVoiceId = java.net.URLEncoder.encode(voiceId, "UTF-8").replace("+", "%20")
    val baseUrl = "https://api.elevenlabs.io/v1/text-to-speech/$encodedVoiceId/stream"
    val url = if (request.latencyTier != null) {
        URL("$baseUrl?optimize_streaming_latency=${request.latencyTier}")
    } else {
        URL(baseUrl)
    }

    val conn = url.openConnection() as HttpURLConnection
    conn.requestMethod = "POST"
    conn.connectTimeout = 30_000
    conn.readTimeout = 30_000
    conn.setRequestProperty("Content-Type", "application/json")
    conn.setRequestProperty("Accept", resolveAcceptHeader(request.outputFormat))
    conn.setRequestProperty("xi-api-key", apiKey)
    conn.doOutput = true
    return conn
}
878
+
879
/**
 * `Accept` header value for an output format: `audio/pcm` when the trimmed,
 * lower-cased format starts with `pcm_`, otherwise (including null)
 * `audio/mpeg`.
 */
private fun resolveAcceptHeader(outputFormat: String?): String {
    val isPcm = outputFormat?.trim()?.lowercase()?.startsWith("pcm_") == true
    return when {
        isPcm -> "audio/pcm"
        else -> "audio/mpeg"
    }
}
883
+
884
/**
 * Serialises [request] into the JSON body sent to ElevenLabs.
 *
 * `text` is always present. Every other top-level field is written only when
 * it is set (and, for the model id and output format, non-empty). The
 * `voice_settings` object is written only when at least one of its fields is set.
 */
private fun buildRequestPayload(request: ElevenLabsRequest): String = buildString {
    append("{")
    append("\"text\":").append(jsonString(request.text))

    val model = request.modelId
    if (!model.isNullOrEmpty()) {
        append(",\"model_id\":").append(jsonString(model))
    }
    val format = request.outputFormat
    if (!format.isNullOrEmpty()) {
        append(",\"output_format\":").append(jsonString(format))
    }
    request.seed?.let { seed -> append(",\"seed\":$seed") }
    request.normalize?.let { mode -> append(",\"apply_text_normalization\":").append(jsonString(mode)) }
    request.language?.let { code -> append(",\"language_code\":").append(jsonString(code)) }

    // voice_settings sub-object: only the fields that are set, in a fixed order
    val voiceSettings = listOfNotNull(
        request.speed?.let { "\"speed\":$it" },
        request.stability?.let { "\"stability\":$it" },
        request.similarity?.let { "\"similarity_boost\":$it" },
        request.style?.let { "\"style\":$it" },
        request.speakerBoost?.let { "\"use_speaker_boost\":$it" }
    )
    if (voiceSettings.isNotEmpty()) {
        append(",\"voice_settings\":{")
        append(voiceSettings.joinToString(","))
        append("}")
    }

    append("}")
}
917
+
918
/**
 * Encodes [value] as a quoted JSON string literal.
 *
 * Backslash, double quote and every control character below U+0020 are
 * escaped: the short forms `\n`, `\r`, `\t`, `\b` and `\f` where they exist,
 * `\u00XX` otherwise. A control character left raw would make the surrounding
 * document invalid JSON.
 */
private fun jsonString(value: String): String = buildString(value.length + 2) {
    append('"')
    for (ch in value) {
        when (ch) {
            '\\' -> append("\\\\")
            '"' -> append("\\\"")
            '\n' -> append("\\n")
            '\r' -> append("\\r")
            '\t' -> append("\\t")
            '\b' -> append("\\b")
            '\u000C' -> append("\\f")
            else ->
                if (ch < ' ') {
                    // Remaining control characters have no short escape.
                    append("\\u").append(ch.code.toString(16).padStart(4, '0'))
                } else {
                    append(ch)
                }
        }
    }
    append('"')
}
928
+
929
/**
 * Speaks [text] with the platform [TextToSpeech] engine and resolves [call]
 * with the outcome. This function does not throw: every failure is reported
 * through [call].
 *
 * Outcomes:
 * - engine not ready: `completed=false` with an `error`;
 * - utterance finished: `completed=true`;
 * - stopped through [stopSpeakingInternal] (detected via [pcmStopRequested]):
 *   `interrupted=true`, plus `interruptedAt` when one was recorded;
 * - engine rejected the utterance, reported an error, or 180 s elapsed:
 *   `completed=false` with an `error`.
 *
 * @param text text to speak.
 * @param call plugin call to resolve.
 */
private suspend fun speakWithSystemTts(text: String, call: PluginCall) {
    usedSystemTts = true
    setState("speaking", "Speaking (System)")

    // Payload shared by every outcome of this path.
    fun outcome(completed: Boolean, interrupted: Boolean, error: String? = null): JSObject =
        JSObject().apply {
            put("completed", completed)
            put("interrupted", interrupted)
            put("usedSystemTts", true)
            error?.let { put("error", it) }
        }

    val engine = systemTts
    if (!systemTtsReady || engine == null) {
        call.resolve(outcome(completed = false, interrupted = false, error = "System TTS not available"))
        return
    }

    val utteranceId = "talkmode-${UUID.randomUUID()}"
    val deferred = CompletableDeferred<Unit>()
    systemTtsPending?.cancel()
    systemTtsPending = deferred
    systemTtsPendingId = utteranceId

    try {
        val queued = withContext(Dispatchers.Main) {
            engine.speak(text, TextToSpeech.QUEUE_FLUSH, Bundle(), utteranceId)
        }
        // A rejected utterance never reaches the progress listener; fail now
        // instead of waiting for the timeout below.
        if (queued != TextToSpeech.SUCCESS) {
            throw IllegalStateException("System TTS rejected the utterance")
        }

        withContext(Dispatchers.IO) {
            kotlinx.coroutines.withTimeout(180_000) { deferred.await() }
        }
        call.resolve(outcome(completed = true, interrupted = false))
    } catch (e: Exception) {
        if (pcmStopRequested.get()) {
            // stopSpeakingInternal cancelled the wait: an interruption, not an error.
            call.resolve(outcome(completed = false, interrupted = true).apply {
                lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
            })
        } else {
            call.resolve(
                outcome(completed = false, interrupted = false, error = e.message ?: "System TTS error")
            )
        }
    } finally {
        // Nothing awaits this utterance any more; clear the bookkeeping if it is still ours.
        if (systemTtsPendingId == utteranceId) {
            systemTtsPending = null
            systemTtsPendingId = null
        }
    }
}
972
+
973
+ // ── Audio focus ─────────────────────────────────────────────────────
974
+
975
/**
 * Request transient, duck-capable audio focus on the music stream.
 *
 * Does nothing when no [audioManager] is available. The listener is stored in
 * [audioFocusRequest] so it can be handed back to the audio manager when focus is abandoned.
 * On a permanent or transient focus loss it stops any speech that is in progress.
 */
private fun requestAudioFocus() {
    val manager = audioManager ?: return

    val onFocusChange = AudioManager.OnAudioFocusChangeListener { change ->
        val focusLost = change == AudioManager.AUDIOFOCUS_LOSS ||
            change == AudioManager.AUDIOFOCUS_LOSS_TRANSIENT
        // Another app took audio; only act if we are actually speaking.
        if (focusLost && isSpeaking) {
            stopSpeakingInternal()
        }
    }
    audioFocusRequest = onFocusChange

    @Suppress("DEPRECATION")
    manager.requestAudioFocus(
        onFocusChange,
        AudioManager.STREAM_MUSIC,
        AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_MAY_DUCK
    )
}
997
+
998
/**
 * Give audio focus back using the listener stored in [audioFocusRequest], then clear it.
 *
 * Does nothing when there is no [audioManager] or no stored listener.
 */
private fun abandonAudioFocus() {
    val manager = audioManager ?: return
    audioFocusRequest?.let { registered ->
        @Suppress("DEPRECATION")
        manager.abandonAudioFocus(registered)
        audioFocusRequest = null
    }
}
1005
+
1006
+ // ── Cleanup helpers ─────────────────────────────────────────────────
1007
+
1008
/**
 * Stop speech output on every path and mark the plugin as no longer speaking.
 *
 * In order: raises the PCM stop flag and releases the PCM track, stops the system TTS
 * engine and drops the pending utterance waiter, then cancels the speaking job.
 */
private fun stopSpeakingInternal() {
    // PCM playback: raise the stop flag before the track is torn down.
    pcmStopRequested.set(true)
    cleanupPcmTrack()

    // System TTS: stop the engine, then cancel and forget the pending utterance.
    systemTts?.stop()
    systemTtsPending?.also { pending -> pending.cancel() }
    systemTtsPending = null
    systemTtsPendingId = null

    speakingJob?.also { job -> job.cancel() }
    isSpeaking = false
}
1018
+
1019
/**
 * Tear down the current PCM [AudioTrack], if there is one, and clear [pcmTrack].
 *
 * Pause, flush and stop are best effort: anything they throw is ignored. The track is
 * released regardless of whether those calls succeeded.
 */
private fun cleanupPcmTrack() {
    pcmTrack?.let { active ->
        try {
            active.pause()
            active.flush()
            active.stop()
        } catch (_: Throwable) {
            // Cleanup is best effort; the track is released below either way.
        } finally {
            active.release()
        }
        pcmTrack = null
    }
}
1032
+
1033
/**
 * Seconds elapsed since [speakStartTimeMs], measured with [SystemClock.elapsedRealtime].
 *
 * @return the elapsed time in seconds, or null when nothing is being spoken.
 */
private fun computeInterruptedAt(): Double? =
    if (isSpeaking) {
        val elapsedMs = SystemClock.elapsedRealtime() - speakStartTimeMs
        elapsedMs.toDouble() / 1000.0
    } else {
        null
    }
1038
+
1039
+ // ── Voice alias resolution ──────────────────────────────────────────
1040
+
1041
/**
 * Turn a user-supplied voice name or id into a voice id.
 *
 * Resolution order:
 * 1. the lower-cased input is a key in [voiceAliases] -> the mapped id;
 * 2. the input equals one of the mapped ids (ignoring case) -> the trimmed input;
 * 3. the input passes [isLikelyVoiceId] -> the trimmed input.
 *
 * @return the resolved id, or null for null/blank input or when nothing matches.
 */
private fun resolveVoiceAlias(value: String?): String? {
    val candidate = value?.trim()
    if (candidate.isNullOrEmpty()) return null

    val aliasTarget = voiceAliases[candidate.lowercase()]
    if (aliasTarget != null) return aliasTarget

    val isKnownId = voiceAliases.values.any { known -> known.equals(candidate, ignoreCase = true) }
    return if (isKnownId || isLikelyVoiceId(candidate)) candidate else null
}
1058
+
1059
/**
 * Heuristic for a raw voice id: at least 10 characters, each one a letter, a digit,
 * `-` or `_`.
 */
private fun isLikelyVoiceId(value: String): Boolean =
    value.length >= 10 && value.all { ch -> ch.isLetterOrDigit() || ch == '-' || ch == '_' }
1063
+
1064
+ // ── Validation helpers (from classic TalkModeRuntime) ───────────────
1065
+
1066
/**
 * Pick the speech speed multiplier from either a words-per-minute rate or an explicit speed.
 *
 * A positive [rateWpm] takes precedence and is converted by dividing by 175. Otherwise
 * [speed] is used as given. The chosen value is only accepted when it lies strictly
 * between 0.5 and 2.0.
 *
 * @return the accepted multiplier, or null when neither input applies or the value is
 *   out of range.
 */
private fun resolveSpeed(speed: Double?, rateWpm: Int?): Double? {
    val candidate: Double = when {
        rateWpm != null && rateWpm > 0 -> rateWpm.toDouble() / 175.0
        speed != null -> speed
        else -> return null
    }
    return if (candidate <= 0.5 || candidate >= 2.0) null else candidate
}
1078
+
1079
/** Return [value] unless it is null, below 0 or above 1; in those cases return null. */
private fun validatedUnit(value: Double?): Double? =
    value?.takeUnless { it < 0 || it > 1 }
1084
+
1085
/**
 * Validate a stability setting for the given model.
 *
 * For model id `eleven_v3` (compared after trimming and lower-casing) only the discrete
 * values 0.0, 0.5 and 1.0 are accepted. Every other model falls back to [validatedUnit].
 *
 * @return the accepted value, or null when [value] is null or not allowed.
 */
private fun validatedStability(value: Double?, modelId: String?): Double? {
    if (value == null) return null

    val isV3 = modelId?.trim()?.lowercase() == "eleven_v3"
    if (!isV3) return validatedUnit(value)

    val isDiscreteStep = value == 0.0 || value == 0.5 || value == 1.0
    return if (isDiscreteStep) value else null
}
1094
+
1095
/** Return [value] when it lies in 0..4294967295 (inclusive), otherwise null. */
private fun validatedSeed(value: Long?): Long? =
    value?.takeIf { it in 0L..4294967295L }
1100
+
1101
/**
 * Normalize a text-normalization mode: trims and lower-cases [value] and accepts only
 * `auto`, `on` or `off`.
 *
 * @return the canonical mode, or null for null input or any other value.
 */
private fun validatedNormalize(value: String?): String? =
    when (val mode = value?.trim()?.lowercase()) {
        "auto", "on", "off" -> mode
        else -> null
    }
1105
+
1106
/**
 * Validate a language code: after trimming and lower-casing it must be exactly two
 * characters, both in `a`..`z`.
 *
 * @return the lower-cased code, or null when [value] is null or does not match.
 */
private fun validatedLanguage(value: String?): String? {
    val code = value?.trim()?.lowercase()
    val isTwoAsciiLetters = code != null && code.length == 2 && code.all { ch -> ch in 'a'..'z' }
    return if (isTwoAsciiLetters) code else null
}
1112
+
1113
/**
 * Validate an audio output format name.
 *
 * The trimmed, lower-cased value is accepted when it starts with `mp3_`, or when
 * [parsePcmSampleRate] recognises it as a PCM format.
 *
 * @return the lower-cased format, or null for null/blank input or an unrecognised format.
 */
private fun validatedOutputFormat(value: String?): String? {
    val format = value?.trim()?.lowercase()
    if (format.isNullOrEmpty()) return null

    val accepted = format.startsWith("mp3_") || parsePcmSampleRate(format) != null
    return if (accepted) format else null
}
1119
+
1120
/** Return [value] when it lies in 0..4 (inclusive), otherwise null. */
private fun validatedLatencyTier(value: Int?): Int? =
    value?.takeIf { it in 0..4 }
1125
+
1126
/**
 * Extract the sample rate from a `pcm_<rate>` format name.
 *
 * The value is trimmed and lower-cased, the leading run of digits after `pcm_` is parsed,
 * and anything following those digits is ignored. Only 16000, 22050, 24000 and 44100 are
 * accepted.
 *
 * @return the sample rate in Hz, or null when [value] is null, is not a `pcm_` format,
 *   has no digits after the prefix, or names a rate outside the accepted set.
 */
private fun parsePcmSampleRate(value: String?): Int? {
    val format = value?.trim()?.lowercase()
    if (format == null || !format.startsWith("pcm_")) return null

    val rate = format.substring("pcm_".length).takeWhile(Char::isDigit).toIntOrNull()
    return when (rate) {
        16000, 22050, 24000, 44100 -> rate
        else -> null
    }
}
1134
+
1135
+ // ── State management ────────────────────────────────────────────────
1136
+
1137
/**
 * Record the new state and status text, then emit a `stateChange` event to listeners.
 *
 * The event carries the new state, the state that was current before this call, the new
 * status text and the current value of [usedSystemTts].
 */
private fun setState(newState: String, newStatusText: String) {
    // Capture the outgoing state before it is overwritten so listeners can see the transition.
    val priorState = state
    state = newState
    statusText = newStatusText

    val payload = JSObject()
    payload.put("state", newState)
    payload.put("previousState", priorState)
    payload.put("statusText", newStatusText)
    payload.put("usingSystemTts", usedSystemTts)
    notifyListeners("stateChange", payload)
}
1149
+
1150
/**
 * Build the permission status object returned to the web layer.
 *
 * - `microphone`: `granted` or `denied`, from [isPermissionGranted] for RECORD_AUDIO.
 * - `speechRecognition`: `not_supported` when [SpeechRecognizer.isRecognitionAvailable]
 *   is false; otherwise `granted` when the microphone is granted and `prompt` when it is not.
 */
private fun buildPermissionResult(): JSObject {
    val hasMic = isPermissionGranted(Manifest.permission.RECORD_AUDIO)
    val speechStatus = when {
        !SpeechRecognizer.isRecognitionAvailable(context) -> "not_supported"
        hasMic -> "granted"
        else -> "prompt"
    }

    val result = JSObject()
    result.put("microphone", if (hasMic) "granted" else "denied")
    result.put("speechRecognition", speechStatus)
    return result
}
1163
+
1164
/**
 * Report whether [permission] is currently granted.
 *
 * [permission] may be a Capacitor permission alias declared on this plugin or an Android
 * manifest permission string. Capacitor's `getPermissionState` is keyed by alias whenever
 * an alias is declared, so a manifest string such as `Manifest.permission.RECORD_AUDIO`
 * has no entry there and can never compare equal to GRANTED. Those are therefore checked
 * against the platform directly.
 *
 * @param permission a permission alias or an Android permission name.
 * @return true when either lookup reports the permission as granted.
 */
private fun isPermissionGranted(permission: String): Boolean {
    // Alias lookup: keeps working for callers that pass a declared alias.
    if (getPermissionState(permission) == com.getcapacitor.PermissionState.GRANTED) return true

    // Manifest permission string: ask the platform. An unknown name simply reports "denied".
    val platformResult = context.checkPermission(
        permission,
        android.os.Process.myPid(),
        android.os.Process.myUid()
    )
    return platformResult == android.content.pm.PackageManager.PERMISSION_GRANTED
}
1167
+
1168
+ // ── Cleanup ─────────────────────────────────────────────────────────
1169
+
1170
/**
 * Plugin teardown: disables the plugin, flags a stop, releases the speech recognizer,
 * the system TTS engine and the PCM track, cancels the outstanding jobs, abandons audio
 * focus and finally cancels the coroutine scope.
 */
override fun handleOnDestroy() {
    super.handleOnDestroy()

    enabled = false
    stopRequested = true

    // Release platform resources and drop the references to them.
    recognizer?.destroy()
    recognizer = null
    systemTts?.shutdown()
    systemTts = null
    cleanupPcmTrack()

    for (job in listOf(silenceJob, restartJob, speakingJob)) {
        job?.cancel()
    }

    abandonAudioFocus()
    scope.cancel()
}
1185
+
1186
+ // ── Data class ──────────────────────────────────────────────────────
1187
+
1188
/**
 * Immutable bundle of parameters for one ElevenLabs synthesis request.
 *
 * Only [text] is required; every other field is nullable. When the request body is built,
 * [style] and [speakerBoost] are written under `voice_settings` (as `style` and
 * `use_speaker_boost`) only when they are non-null. The remaining optional fields
 * presumably follow the same convention — confirm against the request builder.
 */
private data class ElevenLabsRequest(
    val text: String,
    val modelId: String?,
    val outputFormat: String?,
    val speed: Double?,
    val stability: Double?,
    val similarity: Double?,
    val style: Double?,
    val speakerBoost: Boolean?,
    val seed: Long?,
    val normalize: String?,
    val language: String?,
    val latencyTier: Int?,
)
1202
+ }