@elizaos/capacitor-talkmode 1.0.0 → 2.0.3-beta.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,10 +2,17 @@ package ai.eliza.plugins.talkmode
2
2
 
3
3
  import android.Manifest
4
4
  import android.content.Intent
5
+ import android.content.pm.PackageManager
5
6
  import android.media.AudioAttributes
7
+ import android.media.AudioDeviceInfo
8
+ import android.media.AudioFocusRequest
6
9
  import android.media.AudioFormat
7
10
  import android.media.AudioManager
11
+ import android.media.AudioRecord
8
12
  import android.media.AudioTrack
13
+ import android.media.MediaRecorder
14
+ import android.util.Base64
15
+ import android.os.Build
9
16
  import android.os.Bundle
10
17
  import android.os.Handler
11
18
  import android.os.Looper
@@ -25,7 +32,15 @@ import com.getcapacitor.annotation.CapacitorPlugin
25
32
  import com.getcapacitor.annotation.Permission
26
33
  import com.getcapacitor.annotation.PermissionCallback
27
34
  import kotlinx.coroutines.*
35
+ import android.net.LocalSocket
36
+ import android.net.LocalSocketAddress
28
37
  import java.io.BufferedInputStream
38
+ import java.io.ByteArrayInputStream
39
+ import java.io.DataInputStream
40
+ import java.io.DataOutputStream
41
+ import java.nio.ByteBuffer
42
+ import java.nio.ByteOrder
43
+ import java.io.File
29
44
  import java.net.HttpURLConnection
30
45
  import java.net.URL
31
46
  import java.util.Locale
@@ -45,6 +60,15 @@ class TalkModePlugin : Plugin() {
45
60
  private const val TAG = "TalkMode"
46
61
  private const val DEFAULT_MODEL_ID = "eleven_flash_v2_5"
47
62
  private const val DEFAULT_OUTPUT_FORMAT = "pcm_24000"
63
+ private const val LOCAL_INFERENCE_TTS_URL = "http://127.0.0.1:31337/api/tts/local-inference"
64
+ // Abstract-namespace UDS of ElizaBionicInferenceServer (the bionic app
65
+ // process that has libelizainference loaded). Kept in sync with
66
+ // BIONIC_INFERENCE_SOCKET_NAME in ElizaAgentService.
67
+ private const val BIONIC_INFER_SOCKET = "eliza_bionic_infer_v1"
68
+ // 16 kHz mono is the rate VAD / diarizer / wake-word models expect; 20 ms
69
+ // (320 samples) is the standard VAD frame size.
70
+ private const val DEFAULT_FRAME_SAMPLE_RATE = 16000
71
+ private const val DEFAULT_FRAME_MS = 20
48
72
  }
49
73
 
50
74
  private val mainHandler = Handler(Looper.getMainLooper())
@@ -65,6 +89,10 @@ class TalkModePlugin : Plugin() {
65
89
  private var lastHeardAtMs: Long? = null
66
90
  private var silenceJob: Job? = null
67
91
  private val silenceWindowMs = 700L
92
+ // The recognizer's own onResults AND our silence monitor can both finalize
93
+ // the same utterance; dedup so a turn is emitted (and sent) exactly once.
94
+ private var lastEmittedFinal = ""
95
+ private var lastEmittedFinalAtMs = 0L
68
96
 
69
97
  // TTS
70
98
  private var systemTts: TextToSpeech? = null
@@ -79,10 +107,37 @@ class TalkModePlugin : Plugin() {
79
107
  private var lastSpokenText: String? = null
80
108
  private var speakStartTimeMs: Long = 0
81
109
  private var lastInterruptedAtSeconds: Double? = null
110
+ @Volatile private var activePcmConnection: HttpURLConnection? = null
82
111
 
83
- // Audio focus
112
+ // Voice audio session (communication-mode routing + focus, mirrors the iOS
113
+ // .playAndRecord/.voiceChat/.defaultToSpeaker session). Held for the whole
114
+ // conversation so the platform AEC has a stable speaker reference to cancel.
84
115
  private var audioManager: AudioManager? = null
85
- private var audioFocusRequest: AudioManager.OnAudioFocusChangeListener? = null
116
+ private var audioFocusRequest: AudioFocusRequest? = null
117
+ private var audioSessionActive = false
118
+ private var savedAudioMode = AudioManager.MODE_NORMAL
119
+ private var savedSpeakerphoneOn = false
120
+ // Streams we mute for the session to suppress the platform recognizer's
121
+ // start/stop earcons (the "on/off" beeps heard as it re-arms continuously).
122
+ // TTS plays on STREAM_VOICE_CALL (USAGE_VOICE_COMMUNICATION) so it stays
123
+ // audible. Tracked so we only unmute streams we muted.
124
+ private val earconStreams = intArrayOf(
125
+ AudioManager.STREAM_MUSIC,
126
+ AudioManager.STREAM_SYSTEM,
127
+ AudioManager.STREAM_NOTIFICATION,
128
+ )
129
+ private var earconStreamsMuted = false
130
+
131
+ // Raw PCM frame capture (diarization / VAD / wake-word source). Opt-in and
132
+ // mutually exclusive with SpeechRecognizer on the mic: Android only lets one
133
+ // capture client own a given input source at a time, so starting frame
134
+ // capture SUSPENDS any active SpeechRecognizer and stopping it resumes STT.
135
+ private var audioRecord: AudioRecord? = null
136
+ private var audioFrameJob: Job? = null
137
+ private val audioFrameRunning = AtomicBoolean(false)
138
+ private var sttSuspendedForFrames = false
139
+ private var lastFrameSampleRate = DEFAULT_FRAME_SAMPLE_RATE
140
+ private var lastFrameSamples = 0
86
141
 
87
142
  // Config
88
143
  private var apiKey: String? = null
@@ -189,6 +244,7 @@ class TalkModePlugin : Plugin() {
189
244
  systemTtsReady = status == TextToSpeech.SUCCESS
190
245
  if (systemTtsReady) {
191
246
  systemTts?.language = Locale.getDefault()
247
+ systemTts?.setAudioAttributes(voiceAudioAttributes())
192
248
  systemTts?.setOnUtteranceProgressListener(object : UtteranceProgressListener() {
193
249
  override fun onStart(id: String?) {}
194
250
 
@@ -270,6 +326,7 @@ class TalkModePlugin : Plugin() {
270
326
  enabled = true
271
327
  stopRequested = false
272
328
  listeningMode = true
329
+ configureVoiceAudioSession()
273
330
  setState("listening", "Listening")
274
331
 
275
332
  mainHandler.post {
@@ -286,6 +343,13 @@ class TalkModePlugin : Plugin() {
286
343
  })
287
344
  } catch (e: Exception) {
288
345
  Log.e(TAG, "Failed to start", e)
346
+ // Recognizer creation failed AFTER the audio session was
347
+ // configured — release it so the earcon streams aren't left
348
+ // muted and the device isn't stuck in MODE_IN_COMMUNICATION.
349
+ enabled = false
350
+ listeningMode = false
351
+ releaseVoiceAudioSession()
352
+ setState("idle", "Off")
289
353
  call.resolve(JSObject().apply {
290
354
  put("started", false)
291
355
  put("error", e.message ?: "Failed to start")
@@ -307,6 +371,10 @@ class TalkModePlugin : Plugin() {
307
371
  lastTranscript = ""
308
372
  lastHeardAtMs = null
309
373
 
374
+ // Release any raw-PCM capture; `enabled` is already false so this won't
375
+ // re-arm SpeechRecognizer.
376
+ stopAudioFramesInternal()
377
+
310
378
  mainHandler.post {
311
379
  recognizer?.cancel()
312
380
  recognizer?.destroy()
@@ -314,6 +382,7 @@ class TalkModePlugin : Plugin() {
314
382
  }
315
383
 
316
384
  stopSpeakingInternal()
385
+ releaseVoiceAudioSession()
317
386
  setState("idle", "Off")
318
387
  call.resolve()
319
388
  }
@@ -364,16 +433,18 @@ class TalkModePlugin : Plugin() {
364
433
  }
365
434
 
366
435
  val useSystemTts = call.getBoolean("useSystemTts", false) ?: false
436
+ val useLocalInferenceTts = call.getBoolean("useLocalInferenceTts", false) ?: false
367
437
  val directive = call.getObject("directive")
368
438
 
369
439
  speakingJob = scope.launch {
370
- speakInternal(text, useSystemTts, directive, call)
440
+ speakInternal(text, useSystemTts, useLocalInferenceTts, directive, call)
371
441
  }
372
442
  }
373
443
 
374
444
  @PluginMethod
375
445
  fun stopSpeaking(call: PluginCall) {
376
446
  val interruptedAt = computeInterruptedAt()
447
+ lastInterruptedAtSeconds = interruptedAt
377
448
  stopSpeakingInternal()
378
449
  call.resolve(JSObject().apply {
379
450
  if (interruptedAt != null) {
@@ -408,6 +479,279 @@ class TalkModePlugin : Plugin() {
408
479
  call.resolve(buildPermissionResult())
409
480
  }
410
481
 
482
+ // ── Raw PCM frame capture (diarization / VAD / wake-word) ────────────
483
+
484
/**
 * Plugin entry point for raw PCM frame capture.
 *
 * When the "microphone" permission alias is already granted the capture is started right away;
 * otherwise the permission is requested and the call is continued by the
 * "handleStartAudioFramesPermission" callback.
 */
@PluginMethod
fun startAudioFrames(call: PluginCall) {
    val micGranted = getPermissionState("microphone") == PermissionState.GRANTED
    if (micGranted) {
        startAudioFramesInternal(call)
    } else {
        requestPermissionForAlias("microphone", call, "handleStartAudioFramesPermission")
    }
}
492
+
493
/**
 * Continuation of [startAudioFrames] after the microphone permission prompt.
 *
 * Starts capture when the permission was granted; otherwise resolves the call with
 * `started = false` and an error message.
 */
@PermissionCallback
private fun handleStartAudioFramesPermission(call: PluginCall) {
    if (getPermissionState("microphone") != PermissionState.GRANTED) {
        val denied = JSObject()
        denied.put("started", false)
        denied.put("error", "Microphone permission denied")
        call.resolve(denied)
        return
    }
    startAudioFramesInternal(call)
}
504
+
505
/**
 * Open the microphone as an [AudioRecord] and start the frame loop.
 *
 * Reads the optional `sampleRate` and `frameMs` call options (falling back to the defaults),
 * suspends SpeechRecognizer when a listening session is active, and resolves [call] with either
 * `started = true` plus the effective `sampleRate` / `frameSamples` / `suspendedStt`, or
 * `started = false` plus an `error` message. Every failure path re-arms the recognizer if this
 * call suspended it.
 */
private fun startAudioFramesInternal(call: PluginCall) {
    // Already capturing: report the parameters of the running session instead of re-opening.
    if (audioFrameRunning.get()) {
        val running = JSObject()
        running.put("started", true)
        running.put("sampleRate", lastFrameSampleRate)
        running.put("frameSamples", lastFrameSamples)
        running.put("suspendedStt", sttSuspendedForFrames)
        call.resolve(running)
        return
    }

    val requestedRate = call.getInt("sampleRate") ?: DEFAULT_FRAME_SAMPLE_RATE
    val frameMs = call.getInt("frameMs") ?: DEFAULT_FRAME_MS

    // SpeechRecognizer holds the mic; a second capture client on the same input generally
    // fails, so hand the mic over for the duration of the capture.
    if (isListening || listeningMode) {
        suspendSpeechRecognizerForFrames()
    }

    // Shared failure epilogue: optionally drop the half-open record, give the mic back to STT
    // if we took it, and report the failure to the caller.
    fun reportFailure(message: String, releaseRecord: Boolean) {
        if (releaseRecord) releaseAudioRecord()
        if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
        val failure = JSObject()
        failure.put("started", false)
        failure.put("error", message)
        call.resolve(failure)
    }

    val record = try {
        openAudioRecord(requestedRate)
    } catch (e: Exception) {
        Log.e(TAG, "AudioRecord open failed", e)
        reportFailure(e.message ?: "AudioRecord open failed", releaseRecord = false)
        return
    }

    // Use the rate the record actually reports, not the requested one, to size frames.
    val actualRate = record.sampleRate
    val frameSamples = max(1, actualRate * frameMs / 1000)
    audioRecord = record
    lastFrameSampleRate = actualRate
    lastFrameSamples = frameSamples

    try {
        record.startRecording()
    } catch (e: Exception) {
        Log.e(TAG, "AudioRecord startRecording failed", e)
        reportFailure(e.message ?: "AudioRecord start failed", releaseRecord = true)
        return
    }

    // startRecording() can return without the record actually entering RECORDING.
    if (record.recordingState != AudioRecord.RECORDSTATE_RECORDING) {
        Log.e(TAG, "AudioRecord did not enter RECORDING state")
        reportFailure(
            "AudioRecord did not start (mic likely held by SpeechRecognizer)",
            releaseRecord = true,
        )
        return
    }

    audioFrameRunning.set(true)
    launchFrameLoop(record, frameSamples)

    val started = JSObject()
    started.put("started", true)
    started.put("sampleRate", actualRate)
    started.put("frameSamples", frameSamples)
    started.put("suspendedStt", sttSuspendedForFrames)
    call.resolve(started)
}
578
+
579
/**
 * Plugin entry point: stop raw PCM frame capture (a no-op when nothing is capturing) and
 * resolve the call with no payload.
 */
@PluginMethod
fun stopAudioFrames(call: PluginCall) {
    stopAudioFramesInternal()
    call.resolve()
}
584
+
585
/** Plugin entry point: resolve with `capturing`, the current value of the frame-capture flag. */
@PluginMethod
fun isCapturingAudioFrames(call: PluginCall) {
    val status = JSObject()
    status.put("capturing", audioFrameRunning.get())
    call.resolve(status)
}
591
+
592
/**
 * Open a mono, 16-bit PCM [AudioRecord] at [sampleRate].
 *
 * Audio sources are tried in order: VOICE_RECOGNITION first, then MIC. The first record that
 * reports STATE_INITIALIZED is returned; records that fail to initialize are released.
 *
 * @throws IllegalStateException if the minimum buffer size is invalid for [sampleRate], or if
 *   no source could be initialized (the last failure is attached as the cause).
 */
private fun openAudioRecord(sampleRate: Int): AudioRecord {
    val channelConfig = AudioFormat.CHANNEL_IN_MONO
    val encoding = AudioFormat.ENCODING_PCM_16BIT

    val minBuffer = AudioRecord.getMinBufferSize(sampleRate, channelConfig, encoding)
    check(minBuffer > 0) { "AudioRecord min buffer invalid ($minBuffer) for ${sampleRate}Hz" }

    // Twice the minimum, but never below 4 KiB.
    val bufferBytes = max(minBuffer * 2, 4 * 1024)
    val candidateSources = intArrayOf(
        MediaRecorder.AudioSource.VOICE_RECOGNITION,
        MediaRecorder.AudioSource.MIC,
    )

    var lastError: Throwable? = null
    for (source in candidateSources) {
        try {
            @Suppress("MissingPermission")
            val candidate = AudioRecord(source, sampleRate, channelConfig, encoding, bufferBytes)
            if (candidate.state != AudioRecord.STATE_INITIALIZED) {
                candidate.release()
                lastError = IllegalStateException("AudioRecord uninitialized for source $source")
                continue
            }
            return candidate
        } catch (e: Exception) {
            lastError = e
        }
    }

    throw IllegalStateException("AudioRecord could not initialize at ${sampleRate}Hz", lastError)
}
635
+
636
/**
 * Launch the background loop that reads PCM frames from [record] and forwards them to JS.
 *
 * Any previously launched frame job is cancelled first. Each successful read is emitted as an
 * "audioFrame" event carrying the little-endian 16-bit samples as Base64 (`pcm16`), the record's
 * sample rate, the channel count (1), the number of samples read, the frame RMS normalised by
 * 32768, an elapsed-realtime timestamp and a running frame index.
 *
 * If the loop ends on its own (negative read result or an exception) while it still owns the
 * current record, the capture is torn down via [stopAudioFramesInternal], so the capturing flag
 * is cleared, the record is released and a suspended SpeechRecognizer is resumed. Without this
 * the plugin kept reporting an active capture although no more frames were produced.
 *
 * @param record an AudioRecord that is already recording.
 * @param frameSamples number of 16-bit samples requested per read.
 */
private fun launchFrameLoop(record: AudioRecord, frameSamples: Int) {
    audioFrameJob?.cancel()
    // IO dispatcher: a tight blocking read loop must not sit on the main thread.
    audioFrameJob = scope.launch(Dispatchers.IO) {
        val buffer = ShortArray(frameSamples)
        val bytes = ByteArray(frameSamples * 2)
        // Loop-invariant: read the rate once instead of on every frame.
        val sampleRate = record.sampleRate
        var frameIndex = 0L
        try {
            while (audioFrameRunning.get() && isActive) {
                val read = record.read(buffer, 0, frameSamples)
                if (read < 0) {
                    // Negative results are AudioRecord error codes. Only report them when the
                    // capture was not stopped deliberately (a stop releases the record, which
                    // can also make the pending read fail).
                    if (audioFrameRunning.get() && audioRecord === record) {
                        Log.w(TAG, "AudioRecord read failed ($read); stopping frame capture")
                        notifyListeners("error", JSObject().apply {
                            put("message", "Audio frame capture stopped: AudioRecord read error $read")
                            put("fatal", false)
                        })
                    }
                    break
                }
                if (read == 0) continue

                // Pack samples little-endian and accumulate energy in the same pass.
                var sumSquares = 0.0
                var b = 0
                for (i in 0 until read) {
                    val s = buffer[i].toInt()
                    bytes[b] = (s and 0xff).toByte()
                    bytes[b + 1] = ((s shr 8) and 0xff).toByte()
                    b += 2
                    sumSquares += (s.toDouble() * s.toDouble())
                }
                val rms = Math.sqrt(sumSquares / read) / 32768.0
                val pcmBase64 = Base64.encodeToString(bytes, 0, read * 2, Base64.NO_WRAP)
                val idx = frameIndex
                frameIndex += 1
                val ts = SystemClock.elapsedRealtime()
                notifyListeners("audioFrame", JSObject().apply {
                    put("pcm16", pcmBase64)
                    put("sampleRate", sampleRate)
                    put("channels", 1)
                    put("samples", read)
                    put("rms", rms)
                    put("timestamp", ts)
                    put("frameIndex", idx)
                })
            }
        } catch (e: Throwable) {
            Log.e(TAG, "Audio frame loop error", e)
            notifyListeners("error", JSObject().apply {
                put("message", "Audio frame capture stopped: ${e.message}")
                put("fatal", false)
            })
        } finally {
            // The loop died by itself (not via stop or a newer loop, which clear or replace
            // audioRecord, and not via job cancellation): release the mic and clear the flag
            // so a later start can succeed and STT is re-armed.
            if (isActive && audioFrameRunning.get() && audioRecord === record) {
                stopAudioFramesInternal()
            }
        }
    }
}
690
+
691
/**
 * Tear down frame capture: clear the running flag, cancel the frame job, release the
 * AudioRecord and, if SpeechRecognizer was suspended for the capture, resume it.
 *
 * Returns early when capture was not running and no record is held.
 */
private fun stopAudioFramesInternal() {
    // The flag is always cleared, even when we end up returning early.
    val wasRunning = audioFrameRunning.getAndSet(false)
    val nothingToRelease = !wasRunning && audioRecord == null
    if (nothingToRelease) return

    audioFrameJob?.cancel()
    audioFrameJob = null
    releaseAudioRecord()

    if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
}
702
+
703
/**
 * Stop (if recording) and release the held AudioRecord, clearing the field first.
 *
 * Best-effort: failures from stop/release are deliberately ignored. No-op when no record is held.
 */
private fun releaseAudioRecord() {
    val held = audioRecord ?: return
    audioRecord = null

    // Stop and release are attempted independently so a failing stop cannot skip the release.
    runCatching {
        if (held.recordingState == AudioRecord.RECORDSTATE_RECORDING) held.stop()
    }
    runCatching { held.release() }
}
717
+
718
/**
 * Hand the microphone over to AudioRecord: mark STT as suspended, clear the listening flags,
 * cancel the restart and silence jobs, and cancel + destroy the recognizer on the main handler.
 */
private fun suspendSpeechRecognizerForFrames() {
    sttSuspendedForFrames = true
    isListening = false
    listeningMode = false
    restartJob?.cancel()
    silenceJob?.cancel()

    mainHandler.post {
        // Failures while tearing the recognizer down are ignored; the reference is always dropped.
        runCatching {
            recognizer?.cancel()
            recognizer?.destroy()
        }
        recognizer = null
    }
}
734
+
735
/**
 * Clear the suspended marker and, when a session is still enabled and no stop was requested,
 * rebuild the SpeechRecognizer on the main handler and restart listening plus the silence monitor.
 *
 * Nothing is restarted if recognition is unavailable; failures are logged, not thrown.
 */
private fun resumeSpeechRecognizerAfterFrames() {
    sttSuspendedForFrames = false

    val sessionActive = enabled && !stopRequested
    if (!sessionActive) return

    listeningMode = true
    mainHandler.post {
        try {
            if (SpeechRecognizer.isRecognitionAvailable(context)) {
                recognizer?.destroy()
                val fresh = SpeechRecognizer.createSpeechRecognizer(context)
                fresh.setRecognitionListener(recognitionListener)
                recognizer = fresh
                startListeningInternal(markListening = true)
                startSilenceMonitor()
            }
        } catch (e: Exception) {
            Log.e(TAG, "Failed to resume STT after frames", e)
        }
    }
}
754
+
411
755
  // ── Config ──────────────────────────────────────────────────────────
412
756
 
413
757
  private fun applyConfig(config: JSObject) {
@@ -462,6 +806,13 @@ class TalkModePlugin : Plugin() {
462
806
  putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
463
807
  putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
464
808
  putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)
809
+ // On-device recognizer (no network round-trip; works offline). The
810
+ // platform recognizer's open/close cadence during continuous use is
811
+ // intrinsic and not controllable via the silence-length extras (the
812
+ // on-device SODA engine ignores them); we silence the AUDIBLE part of
813
+ // that churn by muting the earcon streams for the session instead
814
+ // (see configureVoiceAudioSession).
815
+ putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true)
465
816
  sttLanguage?.let { putExtra(RecognizerIntent.EXTRA_LANGUAGE, it) }
466
817
  }
467
818
 
@@ -515,13 +866,14 @@ class TalkModePlugin : Plugin() {
515
866
  val elapsed = SystemClock.elapsedRealtime() - lastHeard
516
867
  if (elapsed < silenceWindowMs) return
517
868
 
518
- // Finalize: emit a final transcript event
519
- notifyListeners("transcript", JSObject().apply {
520
- put("transcript", transcript)
521
- put("isFinal", true)
522
- })
869
+ // Finalize this turn (deduped against the recognizer's own onResults),
870
+ // then restart the recognizer so the next utterance is a CLEAN session —
871
+ // Android SpeechRecognizer accumulates within a session, so without the
872
+ // restart the next turn's partials would prepend the words we just sent.
523
873
  lastTranscript = ""
524
874
  lastHeardAtMs = null
875
+ emitFinalOnce(transcript)
876
+ scheduleRestart()
525
877
  }
526
878
 
527
879
  private fun handleTranscript(transcript: String, isFinal: Boolean) {
@@ -531,34 +883,71 @@ class TalkModePlugin : Plugin() {
531
883
  if (isSpeaking && interruptOnSpeech) {
532
884
  if (shouldInterrupt(transcript)) {
533
885
  val interruptedAt = computeInterruptedAt()
534
- stopSpeakingInternal()
535
886
  lastInterruptedAtSeconds = interruptedAt
887
+ stopSpeakingInternal()
536
888
  }
537
889
  return
538
890
  }
539
891
 
540
892
  if (!isListening) return
541
893
 
542
- if (transcript.isNotEmpty()) {
894
+ if (isFinal) {
895
+ // A real end-of-turn from the recognizer: emit once and clear the
896
+ // pending buffer so the silence monitor doesn't re-finalize the same
897
+ // words (the double-send bug).
898
+ lastTranscript = ""
899
+ lastHeardAtMs = null
900
+ emitFinalOnce(transcript)
901
+ } else {
543
902
  lastTranscript = transcript
544
903
  lastHeardAtMs = SystemClock.elapsedRealtime()
904
+ notifyListeners("transcript", JSObject().apply {
905
+ put("transcript", transcript)
906
+ put("isFinal", false)
907
+ })
545
908
  }
909
+ }
546
910
 
911
/**
 * Emit a final "transcript" event for [transcript], at most once per utterance.
 *
 * The text is trimmed; blank text is ignored. A final identical to the previously emitted one
 * is dropped when it arrives less than 2 seconds (elapsed realtime) after it, which collapses
 * the two finalization paths (recognizer results and the silence monitor) into a single event.
 */
private fun emitFinalOnce(transcript: String) {
    val finalText = transcript.trim()
    if (finalText.isEmpty()) return

    val nowMs = SystemClock.elapsedRealtime()
    val isRepeat = finalText == lastEmittedFinal && nowMs - lastEmittedFinalAtMs < 2000L
    if (isRepeat) return

    lastEmittedFinal = finalText
    lastEmittedFinalAtMs = nowMs

    val event = JSObject()
    event.put("transcript", finalText)
    event.put("isFinal", true)
    notifyListeners("transcript", event)
}
552
928
 
553
929
/**
 * Decide whether heard speech should barge in on the text currently being spoken.
 *
 * Returns false (no interrupt) when:
 *  - the heard text is a single word shorter than 8 characters (or empty);
 *  - the lower-cased heard text is a substring of the lower-cased spoken text (exact echo);
 *  - at least 60% of the heard words occur as substrings of the spoken text (fuzzy echo).
 *
 * Returns true otherwise, including when nothing has been recorded as spoken.
 */
private fun shouldInterrupt(transcript: String): Boolean {
    val heard = transcript.trim()
    val heardLower = heard.lowercase()
    val tokens = heardLower.split(Regex("\\s+")).filter { it.isNotBlank() }

    // Require real intent: at least two words, or one word of at least 8 characters.
    val hasIntent = tokens.size >= 2 || heard.length >= 8
    if (!hasIntent) return false

    val speaking = lastSpokenText?.lowercase() ?: return true

    // Exact echo of the spoken text: treat as speaker bleed.
    if (heardLower in speaking) return false

    // Fuzzy echo: most heard words appear somewhere in the spoken text.
    val echoedCount = tokens.count { it in speaking }
    val mostlyEcho = tokens.isNotEmpty() && echoedCount.toDouble() / tokens.size >= 0.6
    return !mostlyEcho
}
564
953
 
@@ -588,6 +977,7 @@ class TalkModePlugin : Plugin() {
588
977
  private suspend fun speakInternal(
589
978
  text: String,
590
979
  forceSystemTts: Boolean,
980
+ useLocalInferenceTts: Boolean,
591
981
  directive: JSObject?,
592
982
  call: PluginCall
593
983
  ) {
@@ -596,6 +986,7 @@ class TalkModePlugin : Plugin() {
596
986
  lastSpokenText = text
597
987
  speakStartTimeMs = SystemClock.elapsedRealtime()
598
988
  pcmStopRequested.set(false)
989
+ lastInterruptedAtSeconds = null
599
990
  setState("speaking", "Speaking")
600
991
 
601
992
  val effectiveVoiceId = directive.stringOrNull("voiceId")?.let(::resolveVoiceAlias) ?: voiceId
@@ -603,27 +994,74 @@ class TalkModePlugin : Plugin() {
603
994
 
604
995
  notifyListeners("speaking", JSObject().apply {
605
996
  put("text", text)
606
- put("isSystemTts", forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
997
+ put(
998
+ "isSystemTts",
999
+ !useLocalInferenceTts &&
1000
+ (forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
1001
+ )
607
1002
  })
608
1003
 
609
1004
  // Stop listening during speech (we keep recognizer for interrupt detection)
610
1005
  mainHandler.post { recognizer?.stopListening() }
611
1006
  ensureInterruptListener()
612
1007
 
613
- // Request audio focus
614
- requestAudioFocus()
1008
+ // Ensure the communication-mode session + audio focus are active even
1009
+ // for a standalone speak() that wasn't preceded by start().
1010
+ configureVoiceAudioSession()
1011
+ // Re-assert loudspeaker routing right before playback. configureVoice…
1012
+ // only routes on the FIRST activation; if the session was already up (the
1013
+ // STT path opened it) the speaker route may have drifted, leaving TTS on
1014
+ // the earpiece. Re-route here so replies are audible out the speaker.
1015
+ audioManager?.let { routeVoiceOutput(it) }
615
1016
 
616
1017
  try {
617
- val canUseElevenLabs = !forceSystemTts &&
1018
+ val canUseLocalInference = useLocalInferenceTts && !forceSystemTts
1019
+ val canUseElevenLabs = !canUseLocalInference &&
1020
+ !forceSystemTts &&
618
1021
  !effectiveApiKey.isNullOrEmpty() &&
619
1022
  !effectiveVoiceId.isNullOrEmpty()
620
1023
 
621
- if (canUseElevenLabs) {
1024
+ if (canUseLocalInference) {
1025
+ try {
1026
+ streamAndPlayLocalInferenceTts(text, directive)
1027
+
1028
+ if (!pcmStopRequested.get()) {
1029
+ call.resolve(JSObject().apply {
1030
+ put("completed", true)
1031
+ put("interrupted", false)
1032
+ put("usedSystemTts", false)
1033
+ })
1034
+ } else {
1035
+ call.resolve(JSObject().apply {
1036
+ put("completed", false)
1037
+ put("interrupted", true)
1038
+ put("usedSystemTts", false)
1039
+ lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
1040
+ })
1041
+ }
1042
+ } catch (e: Exception) {
1043
+ if (pcmStopRequested.get()) {
1044
+ call.resolve(JSObject().apply {
1045
+ put("completed", false)
1046
+ put("interrupted", true)
1047
+ put("usedSystemTts", false)
1048
+ })
1049
+ } else {
1050
+ // The on-device OmniVoice TTS assets aren't always staged
1051
+ // (it 502s "TEXT_TO_SPEECH not available"). Rather than go
1052
+ // silent — the JS browser-SpeechSynthesis fallback doesn't
1053
+ // exist in the Android WebView — fall back to the platform
1054
+ // TextToSpeech so replies are always spoken aloud.
1055
+ Log.w(TAG, "Local inference TTS failed, falling back to system TTS", e)
1056
+ speakWithSystemTts(text, call)
1057
+ }
1058
+ }
1059
+ } else if (canUseElevenLabs) {
622
1060
  try {
623
1061
  val request = buildElevenLabsRequest(text, directive)
624
1062
  streamAndPlayPcm(
625
- voiceId = effectiveVoiceId!!,
626
- apiKey = effectiveApiKey!!,
1063
+ voiceId = effectiveVoiceId,
1064
+ apiKey = effectiveApiKey,
627
1065
  request = request
628
1066
  )
629
1067
 
@@ -665,13 +1103,16 @@ class TalkModePlugin : Plugin() {
665
1103
  put("error", e.message ?: "Speak failed")
666
1104
  })
667
1105
  } finally {
1106
+ val wasInterrupted = pcmStopRequested.get()
1107
+ val interruptedAt = lastInterruptedAtSeconds
668
1108
  isSpeaking = false
669
1109
  pcmStopRequested.set(false)
670
- abandonAudioFocus()
671
1110
 
672
1111
  notifyListeners("speakComplete", JSObject().apply {
673
- put("completed", !pcmStopRequested.get())
674
- lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
1112
+ put("completed", !wasInterrupted)
1113
+ if (wasInterrupted) {
1114
+ interruptedAt?.let { put("interruptedAt", it) }
1115
+ }
675
1116
  })
676
1117
 
677
1118
  if (enabled) {
@@ -679,6 +1120,8 @@ class TalkModePlugin : Plugin() {
679
1120
  setState("listening", "Listening")
680
1121
  mainHandler.post { startListeningInternal(markListening = true) }
681
1122
  } else {
1123
+ // Standalone speak (no active conversation): release the session.
1124
+ releaseVoiceAudioSession()
682
1125
  setState("idle", "Off")
683
1126
  }
684
1127
  }
@@ -753,6 +1196,363 @@ class TalkModePlugin : Plugin() {
753
1196
  return if (value == null || value === JSONObject.NULL) null else value.toString()
754
1197
  }
755
1198
 
1199
/**
 * Describes a block of raw PCM audio that is about to be handed to an AudioTrack.
 *
 * @property sampleRate sample rate in Hz.
 * @property channels number of interleaved channels.
 * @property bitsPerSample width of a single sample, in bits.
 * @property dataBytes number of PCM payload bytes the consumer should read.
 */
private data class PcmStreamFormat(
    val sampleRate: Int,
    val channels: Int,
    val bitsPerSample: Int,
    val dataBytes: Int,
)
1205
+
1206
/**
 * Speak [text] with the on-device ("local inference") TTS and play the audio
 * natively through an AudioTrack instead of the WebView.
 *
 * The bionic Kokoro host is tried first. Only when it reports that it did not
 * handle the request is the text POSTed to the embedded agent's HTTP endpoint;
 * the WAV response is then parsed and streamed to the track as it arrives.
 *
 * Runs on [Dispatchers.IO] and clears `pcmStopRequested` before starting.
 *
 * @param text text to synthesize.
 * @param directive optional per-utterance overrides forwarded to the synthesizer.
 * @throws IllegalStateException when the agent replies with an HTTP status >= 400;
 *   failures raised by the helpers it calls propagate unchanged.
 */
private suspend fun streamAndPlayLocalInferenceTts(
    text: String,
    directive: JSObject?
) = withContext(Dispatchers.IO) {
    pcmStopRequested.set(false)

    // Preferred path: the bionic inference host synthesizes and plays in-process.
    val handledByBionicHost = streamAndPlayBionicKokoroTts(text, directive)
    if (handledByBionicHost) {
        return@withContext
    }

    // Fallback path: the HTTP agent endpoint.
    val connection = openLocalInferenceTtsConnection()
    // Published so stopSpeakingInternal() can disconnect an in-flight request.
    activePcmConnection = connection
    try {
        val requestBody = buildLocalInferenceTtsPayload(text, directive)
        connection.outputStream.use { out -> out.write(requestBody.toByteArray(Charsets.UTF_8)) }

        val status = connection.responseCode
        if (status >= 400) {
            val detail = connection.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
            throw IllegalStateException("Local inference TTS error: $status $detail")
        }

        BufferedInputStream(connection.inputStream).use { wav ->
            // Consume the WAV header; the stream is left at the first PCM byte.
            val format = readWavPcmFormat(wav)
            val track = createPcmAudioTrack(format)
            pcmTrack = track
            track.play()

            Log.d(
                TAG,
                "Local inference PCM play start sampleRate=${format.sampleRate} channels=${format.channels}"
            )
            val startEvent = JSObject()
            startEvent.put("provider", "local-inference")
            startEvent.put("sampleRate", format.sampleRate)
            startEvent.put("channels", format.channels)
            notifyListeners("playbackStart", startEvent)

            val framesWritten = writePcmStreamToTrack(wav, track, format)
            drainPcmTrack(track, framesWritten, format.sampleRate)
            // When a stop was requested, leave the track to the cleanup below.
            if (!pcmStopRequested.get()) {
                track.stop()
            }
            Log.d(TAG, "Local inference PCM play done frames=$framesWritten")
        }
    } finally {
        cleanupPcmTrack()
        // Only clear the shared reference if it still points at this request.
        if (activePcmConnection === connection) {
            activePcmConnection = null
        }
        connection.disconnect()
    }
}
1266
+
1267
/**
 * Synthesize [text] through the bionic inference host (op `"tts"`) over its
 * abstract-namespace local socket and play the returned audio natively.
 *
 * Wire format, as implemented here: the request and the response are each one
 * JSON document preceded by a 4-byte big-endian length. The response carries
 * `ok`, an optional `error`, `sampleRate` (default 24000) and `pcmBase64`,
 * which holds little-endian float32 mono samples. The samples are converted to
 * 16-bit PCM because the playback path is `ENCODING_PCM_16BIT`.
 *
 * Per the original notes this path exists so replies use the Kokoro voice
 * instead of ending up on the platform TextToSpeech fallback.
 *
 * The `speed` directive is only honoured when it is a finite, positive number;
 * anything else falls back to `1.0`. This matches the HTTP payload builder and
 * avoids `JSONObject.put` rejecting a NaN/Infinity value.
 *
 * @param text text to synthesize; blank text is not handled.
 * @param directive optional per-utterance overrides; only `speed` is read here.
 * @return `true` when the host handled the request (including the case where a
 *   stop was requested meanwhile); `false` when the text is blank or the host
 *   socket cannot be connected, so the caller can fall through to another path.
 * @throws IllegalStateException on a bad frame length, a non-`ok` response, or
 *   an empty sample buffer. I/O and JSON errors propagate unchanged.
 */
private suspend fun streamAndPlayBionicKokoroTts(
    text: String,
    directive: JSObject?
): Boolean = withContext(Dispatchers.IO) {
    val trimmed = text.trim()
    if (trimmed.isEmpty()) return@withContext false

    // Reject NaN / Infinity / non-positive speeds instead of forwarding them.
    val speed = (directive?.optDouble("speed", Double.NaN) ?: Double.NaN)
        .toFloat()
        .takeIf { it.isFinite() && it > 0f }
        ?: 1.0f

    val sock = LocalSocket()
    try {
        sock.connect(
            LocalSocketAddress(BIONIC_INFER_SOCKET, LocalSocketAddress.Namespace.ABSTRACT)
        )
    } catch (e: Exception) {
        // An unreachable host is an expected condition, not an error.
        Log.d(TAG, "bionic Kokoro TTS host unreachable: ${e.message}")
        try { sock.close() } catch (_: Exception) {}
        return@withContext false
    }
    try {
        val req = JSONObject().apply {
            put("op", "tts")
            put("text", trimmed)
            put("speed", speed.toDouble())
        }.toString().toByteArray(Charsets.UTF_8)
        DataOutputStream(sock.outputStream).apply {
            writeInt(req.size) // big-endian length prefix
            write(req)
            flush()
        }

        val din = DataInputStream(sock.inputStream)
        val len = din.readInt()
        // Bound the allocation: the length comes straight off the wire.
        if (len <= 0 || len > 64 * 1024 * 1024) {
            throw IllegalStateException("bionic TTS bad frame length $len")
        }
        val respBytes = ByteArray(len)
        din.readFully(respBytes)
        val resp = JSONObject(String(respBytes, Charsets.UTF_8))
        if (!resp.optBoolean("ok", false)) {
            throw IllegalStateException("bionic TTS error: ${resp.optString("error")}")
        }
        val sampleRate = resp.optInt("sampleRate", 24000)
        val pcmF32 = Base64.decode(resp.getString("pcmBase64"), Base64.NO_WRAP)

        // fp32 LE → int16 PCM (the play path is ENCODING_PCM_16BIT).
        val fb = ByteBuffer.wrap(pcmF32).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer()
        val nSamples = fb.remaining()
        if (nSamples == 0) {
            throw IllegalStateException("bionic TTS returned 0 samples")
        }
        val pcm16 = ByteArray(nSamples * 2)
        val ob = ByteBuffer.wrap(pcm16).order(ByteOrder.LITTLE_ENDIAN)
        for (i in 0 until nSamples) {
            val s = (fb.get(i) * 32767f).coerceIn(-32768f, 32767f).toInt().toShort()
            ob.putShort(s)
        }

        // A stop may have arrived while the host was synthesizing. Do not open a
        // track or announce playback for an utterance that is already cancelled.
        if (pcmStopRequested.get()) {
            Log.d(TAG, "bionic Kokoro TTS stopped before playback")
            return@withContext true
        }

        val format = PcmStreamFormat(sampleRate, 1, 16, pcm16.size)
        val track = createPcmAudioTrack(format)
        pcmTrack = track
        track.play()
        notifyListeners("playbackStart", JSObject().apply {
            put("provider", "local-inference")
            put("sampleRate", sampleRate)
            put("channels", 1)
        })
        val framesWritten = writePcmStreamToTrack(
            BufferedInputStream(ByteArrayInputStream(pcm16)), track, format
        )
        drainPcmTrack(track, framesWritten, sampleRate)
        if (!pcmStopRequested.get()) track.stop()
        Log.d(TAG, "bionic Kokoro TTS played $nSamples samples @ $sampleRate Hz")
        true
    } finally {
        cleanupPcmTrack()
        try { sock.close() } catch (_: Exception) {}
    }
}
1350
+
1351
/**
 * Build the (not yet sent) POST request for the local-inference TTS endpoint.
 *
 * The bearer token is read from `auth/local-agent-token` under the app's files
 * directory. The request asks for `audio/wav` and sends a JSON body.
 *
 * @return a connection configured for output; the caller writes the body,
 *   reads the response and disconnects.
 * @throws IllegalStateException when the token file is missing or blank.
 */
private fun openLocalInferenceTtsConnection(): HttpURLConnection {
    val tokenFile = File(context.filesDir, "auth/local-agent-token")
    val token = if (tokenFile.isFile) tokenFile.readText().trim() else ""
    if (token.isEmpty()) {
        throw IllegalStateException("Local agent auth token is missing")
    }

    val connection = URL(LOCAL_INFERENCE_TTS_URL).openConnection() as HttpURLConnection
    return connection.apply {
        requestMethod = "POST"
        connectTimeout = 30_000
        readTimeout = 180_000
        setRequestProperty("Authorization", "Bearer $token")
        setRequestProperty("Content-Type", "application/json")
        setRequestProperty("Accept", "audio/wav")
        doOutput = true
    }
}
1368
+
1369
/**
 * Serialize the JSON request body for the local-inference TTS endpoint.
 *
 * `text` is always present. `voiceId`, `voice`, `modelId` and `model` are
 * copied from [directive] when it provides them, and `speed` is included only
 * when it is a finite number greater than zero.
 *
 * @param text text to synthesize.
 * @param directive optional per-utterance overrides.
 * @return the JSON document as a string.
 */
private fun buildLocalInferenceTtsPayload(text: String, directive: JSObject?): String {
    val body = JSONObject().put("text", text)

    // Optional string overrides, copied through under the same key.
    for (key in listOf("voiceId", "voice", "modelId", "model")) {
        val value = directive.stringOrNull(key) ?: continue
        body.put(key, value)
    }

    // NaN is the "not provided" marker, so it is filtered out with the rest.
    directive?.optDouble("speed", Double.NaN)
        ?.takeIf { it.isFinite() && it > 0.0 }
        ?.let { body.put("speed", it) }

    return body.toString()
}
1382
+
1383
/**
 * Read exactly [size] bytes from [input], looping over short reads.
 *
 * @return a new array holding the [size] bytes read.
 * @throws IllegalStateException if the stream ends before [size] bytes arrive.
 */
private fun readExactly(input: BufferedInputStream, size: Int): ByteArray {
    val result = ByteArray(size)
    var filled = 0
    while (filled < size) {
        val count = input.read(result, filled, size - filled)
        // A negative count is end-of-stream: the caller asked for more than exists.
        if (count < 0) throw IllegalStateException("Unexpected end of WAV stream")
        filled += count
    }
    return result
}
1395
+
1396
/**
 * Discard exactly [count] bytes from [input].
 *
 * `skip` may legitimately skip nothing, so when it makes no progress a single
 * byte is read instead; that read is also how end-of-stream is detected.
 *
 * @throws IllegalStateException if the stream ends before [count] bytes are skipped.
 */
private fun skipFully(input: BufferedInputStream, count: Int) {
    var left = count
    while (left > 0) {
        val skipped = input.skip(left.toLong()).toInt()
        left -= when {
            skipped > 0 -> skipped
            input.read() < 0 -> throw IllegalStateException("Unexpected end of WAV stream")
            else -> 1
        }
    }
}
1410
+
1411
/**
 * Decode the unsigned 16-bit little-endian value stored at [offset] in [bytes].
 *
 * @return the value in the range 0..65535.
 */
private fun littleEndianShort(bytes: ByteArray, offset: Int): Int {
    val low = bytes[offset].toInt() and 0xff
    val high = bytes[offset + 1].toInt() and 0xff
    return (high shl 8) or low
}
1415
+
1416
/**
 * Decode the 32-bit little-endian value stored at [offset] in [bytes].
 *
 * The result is a signed Int, so values with the top bit set come back negative.
 */
private fun littleEndianInt(bytes: ByteArray, offset: Int): Int {
    var value = 0
    for (i in 0 until 4) {
        // Byte i contributes bits 8*i .. 8*i+7.
        value = value or ((bytes[offset + i].toInt() and 0xff) shl (8 * i))
    }
    return value
}
1422
+
1423
/** Return the first four bytes of [bytes] decoded as US-ASCII — a RIFF chunk identifier. */
private fun chunkId(bytes: ByteArray): String =
    String(bytes, 0, 4, Charsets.US_ASCII)
1426
+
1427
/**
 * Parse a RIFF/WAVE header from [input] up to the start of the PCM payload.
 *
 * Chunks are walked in order: `fmt ` is decoded and validated, unknown chunks
 * are skipped (honouring the pad byte after odd-sized chunks), and parsing
 * stops at `data`, leaving [input] positioned on the first sample byte.
 *
 * Only 16-bit integer PCM (format tag 1) with one or two channels is accepted.
 *
 * Two hardening points:
 * - the `fmt ` chunk size is bounds-checked *before* its buffer is allocated,
 *   so a corrupt header cannot trigger a huge allocation;
 * - a `data` size with the top bit set is treated as "unknown length" and
 *   reported as [Int.MAX_VALUE] rather than rejected. This covers the
 *   0xFFFFFFFF placeholder that streaming WAV writers emit, so the consumer
 *   plays until end of stream.
 *
 * @return the stream format, with `dataBytes` set from the `data` chunk size.
 * @throws IllegalStateException if the stream is not WAV, is truncated, has an
 *   invalid chunk size, or uses an unsupported encoding.
 */
private fun readWavPcmFormat(input: BufferedInputStream): PcmStreamFormat {
    val riff = readExactly(input, 12)
    if (
        String(riff, 0, 4, Charsets.US_ASCII) != "RIFF" ||
        String(riff, 8, 4, Charsets.US_ASCII) != "WAVE"
    ) {
        throw IllegalStateException("Local inference TTS returned non-WAV audio")
    }

    // A fmt chunk needs at least the 16 fixed bytes read below; the upper
    // bound only stops a corrupt size from driving the allocation.
    val minFmtChunkBytes = 16
    val maxFmtChunkBytes = 4 * 1024

    var format: PcmStreamFormat? = null
    while (true) {
        val header = readExactly(input, 8)
        val id = chunkId(header)
        val size = littleEndianInt(header, 4)

        if (id == "data") {
            val parsed = format ?: throw IllegalStateException("WAV data arrived before fmt chunk")
            // Negative here means an unsigned size >= 2 GiB, including the
            // 0xFFFFFFFF "length unknown" marker: read until end of stream.
            return parsed.copy(dataBytes = if (size < 0) Int.MAX_VALUE else size)
        }

        if (size < 0) {
            throw IllegalStateException("Invalid WAV chunk size for $id")
        }

        if (id == "fmt ") {
            if (size < minFmtChunkBytes || size > maxFmtChunkBytes) {
                throw IllegalStateException("Invalid WAV fmt chunk")
            }
            val fmt = readExactly(input, size)
            val audioFormat = littleEndianShort(fmt, 0)
            val channels = littleEndianShort(fmt, 2)
            val sampleRate = littleEndianInt(fmt, 4)
            val bitsPerSample = littleEndianShort(fmt, 14)
            if (audioFormat != 1) {
                throw IllegalStateException("Only PCM WAV is supported, got format=$audioFormat")
            }
            if (bitsPerSample != 16) {
                throw IllegalStateException("Only 16-bit PCM WAV is supported, got bits=$bitsPerSample")
            }
            if (channels !in 1..2 || sampleRate <= 0) {
                throw IllegalStateException("Invalid WAV format sampleRate=$sampleRate channels=$channels")
            }
            format = PcmStreamFormat(sampleRate, channels, bitsPerSample, 0)
            // RIFF chunks are word-aligned: skip the pad byte after an odd size.
            if (size % 2 == 1) skipFully(input, 1)
            continue
        }

        // Any other chunk (LIST, fact, ...) is irrelevant for playback.
        skipFully(input, size)
        if (size % 2 == 1) skipFully(input, 1)
    }
}
1477
+
1478
/**
 * Create a streaming, 16-bit PCM [AudioTrack] for [format].
 *
 * The buffer is twice the platform minimum, but never less than 8 KiB. The
 * track is returned initialized but not started.
 *
 * @throws IllegalStateException for a channel count other than 1 or 2, when
 *   the platform reports no valid minimum buffer size, or when the track fails
 *   to initialize (it is released first in that case).
 */
private fun createPcmAudioTrack(format: PcmStreamFormat): AudioTrack {
    val channelMask =
        if (format.channels == 1) {
            AudioFormat.CHANNEL_OUT_MONO
        } else if (format.channels == 2) {
            AudioFormat.CHANNEL_OUT_STEREO
        } else {
            throw IllegalStateException("Unsupported PCM channel count ${format.channels}")
        }

    val minBuffer = AudioTrack.getMinBufferSize(
        format.sampleRate,
        channelMask,
        AudioFormat.ENCODING_PCM_16BIT
    )
    if (minBuffer <= 0) {
        throw IllegalStateException("AudioTrack buffer size invalid: $minBuffer")
    }

    val trackBuilder = AudioTrack.Builder()
    trackBuilder.setAudioAttributes(voiceAudioAttributes())
    trackBuilder.setAudioFormat(
        AudioFormat.Builder()
            .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
            .setSampleRate(format.sampleRate)
            .setChannelMask(channelMask)
            .build()
    )
    trackBuilder.setBufferSizeInBytes(maxOf(minBuffer * 2, 8 * 1024))
    trackBuilder.setTransferMode(AudioTrack.MODE_STREAM)
    val track = trackBuilder.build()

    if (track.state == AudioTrack.STATE_INITIALIZED) {
        return track
    }
    // Do not leak the native resources of a track that failed to initialize.
    track.release()
    throw IllegalStateException("AudioTrack init failed")
}
1512
+
1513
/**
 * Copy up to `format.dataBytes` bytes of PCM from [input] into [track].
 *
 * Copying stops early at end of stream or as soon as `pcmStopRequested` is set.
 *
 * `AudioTrack.write` only accepts whole frames, while a stream read may end in
 * the middle of one. An incomplete trailing frame is therefore kept in the
 * buffer and completed by the next read, instead of being offered to the track
 * (which would report 0 bytes written and be mistaken for a failure). A
 * partial frame left over at the very end of the stream is dropped.
 *
 * @param input stream positioned at the first PCM byte.
 * @param track a started streaming track matching [format].
 * @param format describes the frame layout and the number of bytes to copy.
 * @return the number of frames handed to the track (0 if the frame size
 *   computed from [format] is not positive).
 * @throws IllegalStateException if the track rejects a write while no stop has
 *   been requested.
 */
private fun writePcmStreamToTrack(
    input: BufferedInputStream,
    track: AudioTrack,
    format: PcmStreamFormat
): Long {
    val bytesPerFrame = format.channels * (format.bitsPerSample / 8)
    // Alignment unit for writes; 1 keeps a degenerate format byte-granular.
    val alignment = if (bytesPerFrame > 0) bytesPerFrame else 1
    val buffer = ByteArray(8 * 1024)
    var bytesWrittenTotal = 0L
    var remainingBytes = format.dataBytes
    // Bytes of an incomplete frame carried over at the start of `buffer`.
    var pending = 0

    streaming@ while (remainingBytes > 0) {
        if (pcmStopRequested.get()) break
        val requestBytes = minOf(remainingBytes, buffer.size - pending)
        val bytesRead = input.read(buffer, pending, requestBytes)
        if (bytesRead <= 0) break
        remainingBytes -= bytesRead

        val available = pending + bytesRead
        val writable = available - (available % alignment)

        var offset = 0
        while (offset < writable) {
            if (pcmStopRequested.get()) break@streaming
            val wrote = track.write(buffer, offset, writable - offset)
            if (wrote <= 0) {
                // The track is torn down when speech is stopped from another
                // thread; that is an interruption, not a playback failure.
                if (pcmStopRequested.get()) break@streaming
                throw IllegalStateException("AudioTrack write failed: $wrote")
            }
            offset += wrote
            bytesWrittenTotal += wrote.toLong()
        }

        // Move the incomplete tail frame to the front for the next read.
        pending = available - writable
        if (pending > 0) {
            System.arraycopy(buffer, writable, buffer, 0, pending)
        }
    }
    return if (bytesPerFrame > 0) bytesWrittenTotal / bytesPerFrame else 0L
}
1542
+
1543
/**
 * Block until [track] has played [framesWritten] frames, a stop is requested,
 * or a deadline passes.
 *
 * The deadline is the nominal duration of the written audio, capped at 30 s,
 * plus one second of slack. The playback position is polled every 20 ms.
 * Returns immediately when there is nothing to wait for.
 */
private fun drainPcmTrack(track: AudioTrack, framesWritten: Long, sampleRate: Int) {
    if (framesWritten <= 0L || sampleRate <= 0) return

    val nominalMs = framesWritten * 1000L / sampleRate
    val budgetMs = minOf(nominalMs, 30_000L) + 1_000L
    val giveUpAt = SystemClock.elapsedRealtime() + budgetMs

    while (true) {
        if (pcmStopRequested.get()) return
        if (track.playbackHeadPosition.toLong() >= framesWritten) return
        if (SystemClock.elapsedRealtime() >= giveUpAt) return
        SystemClock.sleep(20)
    }
}
1555
+
756
1556
  /**
757
1557
  * Stream PCM audio from ElevenLabs and play via AudioTrack.
758
1558
  * Ported from classic TalkModeManager with proper offset-based writes.
@@ -776,12 +1576,7 @@ class TalkModePlugin : Plugin() {
776
1576
 
777
1577
  val bufferSize = max(minBuffer * 2, 8 * 1024)
778
1578
  val track = AudioTrack.Builder()
779
- .setAudioAttributes(
780
- AudioAttributes.Builder()
781
- .setUsage(AudioAttributes.USAGE_ASSISTANT)
782
- .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
783
- .build()
784
- )
1579
+ .setAudioAttributes(voiceAudioAttributes())
785
1580
  .setAudioFormat(
786
1581
  AudioFormat.Builder()
787
1582
  .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
@@ -802,6 +1597,7 @@ class TalkModePlugin : Plugin() {
802
1597
 
803
1598
  Log.d(TAG, "PCM play start sampleRate=$sampleRate bufferSize=$bufferSize")
804
1599
  val conn = openTtsConnection(voiceId, apiKey, request)
1600
+ activePcmConnection = conn
805
1601
  try {
806
1602
  val payload = buildRequestPayload(request)
807
1603
  conn.outputStream.use { it.write(payload.toByteArray()) }
@@ -845,6 +1641,9 @@ class TalkModePlugin : Plugin() {
845
1641
  Log.d(TAG, "PCM play done")
846
1642
  } finally {
847
1643
  cleanupPcmTrack()
1644
+ if (activePcmConnection === conn) {
1645
+ activePcmConnection = null
1646
+ }
848
1647
  conn.disconnect()
849
1648
  }
850
1649
  }
@@ -970,43 +1769,125 @@ class TalkModePlugin : Plugin() {
970
1769
  }
971
1770
  }
972
1771
 
973
- // ── Audio focus ─────────────────────────────────────────────────────
1772
+ // ── Voice audio session ─────────────────────────────────────────────
1773
+ //
1774
+ // The Android analog of the iOS `.playAndRecord` / `.voiceChat` /
1775
+ // `.defaultToSpeaker` session. Putting the device in MODE_IN_COMMUNICATION
1776
+ // for the whole conversation routes capture + playback through the
1777
+ // telephony path, which engages the platform hardware AEC so TTS coming out
1778
+ // the speaker is cancelled from the mic (the core fix for the mic+speaker
1779
+ // echo loop in hands-free mode). We also hold voice-communication audio
1780
+ // focus and route to the loudspeaker (unless a headset is connected) so
1781
+ // hands-free playback is audible.
1782
+
1783
/**
 * Audio attributes shared by the voice session: speech content played with
 * the voice-communication usage.
 */
private fun voiceAudioAttributes(): AudioAttributes {
    val builder = AudioAttributes.Builder()
    builder.setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION)
    builder.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
    return builder.build()
}
974
1788
 
975
- private fun requestAudioFocus() {
1789
+ private fun configureVoiceAudioSession() {
1790
+ if (audioSessionActive) return
976
1791
  val am = audioManager ?: return
977
- val focusListener = AudioManager.OnAudioFocusChangeListener { focusChange ->
978
- when (focusChange) {
979
- AudioManager.AUDIOFOCUS_LOSS,
980
- AudioManager.AUDIOFOCUS_LOSS_TRANSIENT -> {
981
- // Another app took audio; stop speaking if we are
982
- if (isSpeaking) {
983
- stopSpeakingInternal()
984
- }
1792
+
1793
+ savedAudioMode = am.mode
1794
+ @Suppress("DEPRECATION")
1795
+ savedSpeakerphoneOn = am.isSpeakerphoneOn
1796
+
1797
+ val request = AudioFocusRequest.Builder(AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_EXCLUSIVE)
1798
+ .setAudioAttributes(voiceAudioAttributes())
1799
+ .setOnAudioFocusChangeListener { focusChange ->
1800
+ if (
1801
+ focusChange == AudioManager.AUDIOFOCUS_LOSS ||
1802
+ focusChange == AudioManager.AUDIOFOCUS_LOSS_TRANSIENT
1803
+ ) {
1804
+ // Another app took audio; stop speaking if we are.
1805
+ if (isSpeaking) stopSpeakingInternal()
985
1806
  }
986
1807
  }
1808
+ .build()
1809
+ audioFocusRequest = request
1810
+ am.requestAudioFocus(request)
1811
+
1812
+ am.mode = AudioManager.MODE_IN_COMMUNICATION
1813
+ routeVoiceOutput(am)
1814
+ muteEarconStreams(am)
1815
+ audioSessionActive = true
1816
+ Log.d(TAG, "Voice audio session active (communication mode)")
1817
+ }
1818
+
1819
/**
 * Mute every stream listed in `earconStreams` for the duration of the session.
 *
 * Idempotent: does nothing when the streams are already marked as muted.
 * Muting is best effort — a failure on one stream is ignored so the remaining
 * streams are still attempted (some OEMs disallow muting certain streams
 * without DND access).
 */
private fun muteEarconStreams(am: AudioManager) {
    if (earconStreamsMuted) return
    for (streamType in earconStreams) {
        // Failures are swallowed on purpose; see the note above.
        runCatching { am.adjustStreamVolume(streamType, AudioManager.ADJUST_MUTE, 0) }
    }
    earconStreamsMuted = true
}
1831
+
1832
+ private fun unmuteEarconStreams(am: AudioManager) {
1833
+ if (!earconStreamsMuted) return
1834
+ for (stream in earconStreams) {
1835
+ try {
1836
+ am.adjustStreamVolume(stream, AudioManager.ADJUST_UNMUTE, 0)
1837
+ } catch (_: Throwable) {}
987
1838
  }
988
- audioFocusRequest = focusListener
1839
+ earconStreamsMuted = false
1840
+ }
989
1841
 
1842
+ /**
1843
+ * Default playback to the loudspeaker for hands-free use, but let a wired or
1844
+ * Bluetooth headset win — the iOS `.defaultToSpeaker` semantic.
1845
+ */
1846
+ private fun routeVoiceOutput(am: AudioManager) {
1847
+ val hasHeadset = am.getDevices(AudioManager.GET_DEVICES_OUTPUTS).any { device ->
1848
+ device.type == AudioDeviceInfo.TYPE_WIRED_HEADSET ||
1849
+ device.type == AudioDeviceInfo.TYPE_WIRED_HEADPHONES ||
1850
+ device.type == AudioDeviceInfo.TYPE_USB_HEADSET ||
1851
+ device.type == AudioDeviceInfo.TYPE_BLUETOOTH_SCO ||
1852
+ device.type == AudioDeviceInfo.TYPE_BLUETOOTH_A2DP
1853
+ }
1854
+ if (hasHeadset) {
1855
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) am.clearCommunicationDevice()
1856
+ @Suppress("DEPRECATION")
1857
+ am.isSpeakerphoneOn = false
1858
+ return
1859
+ }
1860
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
1861
+ val speaker = am.availableCommunicationDevices.firstOrNull {
1862
+ it.type == AudioDeviceInfo.TYPE_BUILTIN_SPEAKER
1863
+ }
1864
+ if (speaker != null && am.setCommunicationDevice(speaker)) return
1865
+ }
990
1866
  @Suppress("DEPRECATION")
991
- am.requestAudioFocus(
992
- focusListener,
993
- AudioManager.STREAM_MUSIC,
994
- AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_MAY_DUCK
995
- )
1867
+ am.isSpeakerphoneOn = true
996
1868
  }
997
1869
 
998
- private fun abandonAudioFocus() {
1870
+ private fun releaseVoiceAudioSession() {
1871
+ if (!audioSessionActive) return
999
1872
  val am = audioManager ?: return
1000
- val listener = audioFocusRequest ?: return
1001
- @Suppress("DEPRECATION")
1002
- am.abandonAudioFocus(listener)
1873
+ unmuteEarconStreams(am)
1874
+ audioFocusRequest?.let { am.abandonAudioFocusRequest(it) }
1003
1875
  audioFocusRequest = null
1876
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) am.clearCommunicationDevice()
1877
+ @Suppress("DEPRECATION")
1878
+ am.isSpeakerphoneOn = savedSpeakerphoneOn
1879
+ am.mode = savedAudioMode
1880
+ audioSessionActive = false
1881
+ Log.d(TAG, "Voice audio session released")
1004
1882
  }
1005
1883
 
1006
1884
  // ── Cleanup helpers ─────────────────────────────────────────────────
1007
1885
 
1008
1886
  private fun stopSpeakingInternal() {
1009
1887
  pcmStopRequested.set(true)
1888
+ val conn = activePcmConnection
1889
+ activePcmConnection = null
1890
+ conn?.disconnect()
1010
1891
  cleanupPcmTrack()
1011
1892
  systemTts?.stop()
1012
1893
  systemTtsPending?.cancel()
@@ -1162,6 +2043,9 @@ class TalkModePlugin : Plugin() {
1162
2043
  }
1163
2044
 
1164
2045
  private fun isPermissionGranted(permission: String): Boolean {
2046
+ if (permission == Manifest.permission.RECORD_AUDIO) {
2047
+ return context.checkSelfPermission(permission) == PackageManager.PERMISSION_GRANTED
2048
+ }
1165
2049
  return getPermissionState(permission) == com.getcapacitor.PermissionState.GRANTED
1166
2050
  }
1167
2051
 
@@ -1176,10 +2060,13 @@ class TalkModePlugin : Plugin() {
1176
2060
  systemTts?.shutdown()
1177
2061
  systemTts = null
1178
2062
  cleanupPcmTrack()
2063
+ audioFrameRunning.set(false)
2064
+ audioFrameJob?.cancel()
2065
+ releaseAudioRecord()
1179
2066
  silenceJob?.cancel()
1180
2067
  restartJob?.cancel()
1181
2068
  speakingJob?.cancel()
1182
- abandonAudioFocus()
2069
+ releaseVoiceAudioSession()
1183
2070
  scope.cancel()
1184
2071
  }
1185
2072