@elizaos/capacitor-talkmode 1.0.0 → 2.0.11-beta.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,10 +2,17 @@ package ai.eliza.plugins.talkmode
2
2
 
3
3
  import android.Manifest
4
4
  import android.content.Intent
5
+ import android.content.pm.PackageManager
5
6
  import android.media.AudioAttributes
7
+ import android.media.AudioDeviceInfo
8
+ import android.media.AudioFocusRequest
6
9
  import android.media.AudioFormat
7
10
  import android.media.AudioManager
11
+ import android.media.AudioRecord
8
12
  import android.media.AudioTrack
13
+ import android.media.MediaRecorder
14
+ import android.util.Base64
15
+ import android.os.Build
9
16
  import android.os.Bundle
10
17
  import android.os.Handler
11
18
  import android.os.Looper
@@ -26,6 +33,7 @@ import com.getcapacitor.annotation.Permission
26
33
  import com.getcapacitor.annotation.PermissionCallback
27
34
  import kotlinx.coroutines.*
28
35
  import java.io.BufferedInputStream
36
+ import java.io.File
29
37
  import java.net.HttpURLConnection
30
38
  import java.net.URL
31
39
  import java.util.Locale
@@ -45,6 +53,11 @@ class TalkModePlugin : Plugin() {
45
53
  private const val TAG = "TalkMode"
46
54
  private const val DEFAULT_MODEL_ID = "eleven_flash_v2_5"
47
55
  private const val DEFAULT_OUTPUT_FORMAT = "pcm_24000"
56
+ private const val LOCAL_INFERENCE_TTS_URL = "http://127.0.0.1:31337/api/tts/local-inference"
57
+ // 16 kHz mono is the rate VAD / diarizer / wake-word models expect; 20 ms
58
+ // (320 samples) is the standard VAD frame size.
59
+ private const val DEFAULT_FRAME_SAMPLE_RATE = 16000
60
+ private const val DEFAULT_FRAME_MS = 20
48
61
  }
49
62
 
50
63
  private val mainHandler = Handler(Looper.getMainLooper())
@@ -65,6 +78,10 @@ class TalkModePlugin : Plugin() {
65
78
  private var lastHeardAtMs: Long? = null
66
79
  private var silenceJob: Job? = null
67
80
  private val silenceWindowMs = 700L
81
+ // The recognizer's own onResults AND our silence monitor can both finalize
82
+ // the same utterance; dedup so a turn is emitted (and sent) exactly once.
83
+ private var lastEmittedFinal = ""
84
+ private var lastEmittedFinalAtMs = 0L
68
85
 
69
86
  // TTS
70
87
  private var systemTts: TextToSpeech? = null
@@ -79,10 +96,37 @@ class TalkModePlugin : Plugin() {
79
96
  private var lastSpokenText: String? = null
80
97
  private var speakStartTimeMs: Long = 0
81
98
  private var lastInterruptedAtSeconds: Double? = null
99
+ @Volatile private var activePcmConnection: HttpURLConnection? = null
82
100
 
83
- // Audio focus
101
+ // Voice audio session (communication-mode routing + focus, mirrors the iOS
102
+ // .playAndRecord/.voiceChat/.defaultToSpeaker session). Held for the whole
103
+ // conversation so the platform AEC has a stable speaker reference to cancel.
84
104
  private var audioManager: AudioManager? = null
85
- private var audioFocusRequest: AudioManager.OnAudioFocusChangeListener? = null
105
+ private var audioFocusRequest: AudioFocusRequest? = null
106
+ private var audioSessionActive = false
107
+ private var savedAudioMode = AudioManager.MODE_NORMAL
108
+ private var savedSpeakerphoneOn = false
109
+ // Streams we mute for the session to suppress the platform recognizer's
110
+ // start/stop earcons (the "on/off" beeps heard as it re-arms continuously).
111
+ // TTS plays on STREAM_VOICE_CALL (USAGE_VOICE_COMMUNICATION) so it stays
112
+ // audible. Tracked so we only unmute streams we muted.
113
+ private val earconStreams = intArrayOf(
114
+ AudioManager.STREAM_MUSIC,
115
+ AudioManager.STREAM_SYSTEM,
116
+ AudioManager.STREAM_NOTIFICATION,
117
+ )
118
+ private var earconStreamsMuted = false
119
+
120
+ // Raw PCM frame capture (diarization / VAD / wake-word source). Opt-in and
121
+ // mutually exclusive with SpeechRecognizer on the mic: Android only lets one
122
+ // capture client own a given input source at a time, so starting frame
123
+ // capture SUSPENDS any active SpeechRecognizer and stopping it resumes STT.
124
+ private var audioRecord: AudioRecord? = null
125
+ private var audioFrameJob: Job? = null
126
+ private val audioFrameRunning = AtomicBoolean(false)
127
+ private var sttSuspendedForFrames = false
128
+ private var lastFrameSampleRate = DEFAULT_FRAME_SAMPLE_RATE
129
+ private var lastFrameSamples = 0
86
130
 
87
131
  // Config
88
132
  private var apiKey: String? = null
@@ -189,6 +233,7 @@ class TalkModePlugin : Plugin() {
189
233
  systemTtsReady = status == TextToSpeech.SUCCESS
190
234
  if (systemTtsReady) {
191
235
  systemTts?.language = Locale.getDefault()
236
+ systemTts?.setAudioAttributes(voiceAudioAttributes())
192
237
  systemTts?.setOnUtteranceProgressListener(object : UtteranceProgressListener() {
193
238
  override fun onStart(id: String?) {}
194
239
 
@@ -270,6 +315,7 @@ class TalkModePlugin : Plugin() {
270
315
  enabled = true
271
316
  stopRequested = false
272
317
  listeningMode = true
318
+ configureVoiceAudioSession()
273
319
  setState("listening", "Listening")
274
320
 
275
321
  mainHandler.post {
@@ -286,6 +332,13 @@ class TalkModePlugin : Plugin() {
286
332
  })
287
333
  } catch (e: Exception) {
288
334
  Log.e(TAG, "Failed to start", e)
335
+ // Recognizer creation failed AFTER the audio session was
336
+ // configured — release it so the earcon streams aren't left
337
+ // muted and the device isn't stuck in MODE_IN_COMMUNICATION.
338
+ enabled = false
339
+ listeningMode = false
340
+ releaseVoiceAudioSession()
341
+ setState("idle", "Off")
289
342
  call.resolve(JSObject().apply {
290
343
  put("started", false)
291
344
  put("error", e.message ?: "Failed to start")
@@ -307,6 +360,10 @@ class TalkModePlugin : Plugin() {
307
360
  lastTranscript = ""
308
361
  lastHeardAtMs = null
309
362
 
363
+ // Release any raw-PCM capture; `enabled` is already false so this won't
364
+ // re-arm SpeechRecognizer.
365
+ stopAudioFramesInternal()
366
+
310
367
  mainHandler.post {
311
368
  recognizer?.cancel()
312
369
  recognizer?.destroy()
@@ -314,6 +371,7 @@ class TalkModePlugin : Plugin() {
314
371
  }
315
372
 
316
373
  stopSpeakingInternal()
374
+ releaseVoiceAudioSession()
317
375
  setState("idle", "Off")
318
376
  call.resolve()
319
377
  }
@@ -364,16 +422,18 @@ class TalkModePlugin : Plugin() {
364
422
  }
365
423
 
366
424
  val useSystemTts = call.getBoolean("useSystemTts", false) ?: false
425
+ val useLocalInferenceTts = call.getBoolean("useLocalInferenceTts", false) ?: false
367
426
  val directive = call.getObject("directive")
368
427
 
369
428
  speakingJob = scope.launch {
370
- speakInternal(text, useSystemTts, directive, call)
429
+ speakInternal(text, useSystemTts, useLocalInferenceTts, directive, call)
371
430
  }
372
431
  }
373
432
 
374
433
  @PluginMethod
375
434
  fun stopSpeaking(call: PluginCall) {
376
435
  val interruptedAt = computeInterruptedAt()
436
+ lastInterruptedAtSeconds = interruptedAt
377
437
  stopSpeakingInternal()
378
438
  call.resolve(JSObject().apply {
379
439
  if (interruptedAt != null) {
@@ -408,6 +468,279 @@ class TalkModePlugin : Plugin() {
408
468
  call.resolve(buildPermissionResult())
409
469
  }
410
470
 
471
+ // ── Raw PCM frame capture (diarization / VAD / wake-word) ────────────
472
+
473
+ @PluginMethod
474
+ fun startAudioFrames(call: PluginCall) {
475
+ if (getPermissionState("microphone") != PermissionState.GRANTED) {
476
+ requestPermissionForAlias("microphone", call, "handleStartAudioFramesPermission")
477
+ return
478
+ }
479
+ startAudioFramesInternal(call)
480
+ }
481
+
482
+ @PermissionCallback
483
+ private fun handleStartAudioFramesPermission(call: PluginCall) {
484
+ if (getPermissionState("microphone") == PermissionState.GRANTED) {
485
+ startAudioFramesInternal(call)
486
+ } else {
487
+ call.resolve(JSObject().apply {
488
+ put("started", false)
489
+ put("error", "Microphone permission denied")
490
+ })
491
+ }
492
+ }
493
+
494
+ private fun startAudioFramesInternal(call: PluginCall) {
495
+ if (audioFrameRunning.get()) {
496
+ call.resolve(JSObject().apply {
497
+ put("started", true)
498
+ put("sampleRate", lastFrameSampleRate)
499
+ put("frameSamples", lastFrameSamples)
500
+ put("suspendedStt", sttSuspendedForFrames)
501
+ })
502
+ return
503
+ }
504
+
505
+ val requestedRate = call.getInt("sampleRate") ?: DEFAULT_FRAME_SAMPLE_RATE
506
+ val frameMs = call.getInt("frameMs") ?: DEFAULT_FRAME_MS
507
+ // SpeechRecognizer (SODA) holds the mic; a parallel AudioRecord on the
508
+ // same input fails on virtually every device. Suspend it for the
509
+ // duration of capture and remember to resume on stop.
510
+ val wasListening = isListening || listeningMode
511
+ if (wasListening) {
512
+ suspendSpeechRecognizerForFrames()
513
+ }
514
+
515
+ val record = try {
516
+ openAudioRecord(requestedRate)
517
+ } catch (e: Exception) {
518
+ Log.e(TAG, "AudioRecord open failed", e)
519
+ if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
520
+ call.resolve(JSObject().apply {
521
+ put("started", false)
522
+ put("error", e.message ?: "AudioRecord open failed")
523
+ })
524
+ return
525
+ }
526
+
527
+ val actualRate = record.sampleRate
528
+ val frameSamples = max(1, actualRate * frameMs / 1000)
529
+ audioRecord = record
530
+ lastFrameSampleRate = actualRate
531
+ lastFrameSamples = frameSamples
532
+
533
+ try {
534
+ record.startRecording()
535
+ } catch (e: Exception) {
536
+ Log.e(TAG, "AudioRecord startRecording failed", e)
537
+ releaseAudioRecord()
538
+ if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
539
+ call.resolve(JSObject().apply {
540
+ put("started", false)
541
+ put("error", e.message ?: "AudioRecord start failed")
542
+ })
543
+ return
544
+ }
545
+
546
+ if (record.recordingState != AudioRecord.RECORDSTATE_RECORDING) {
547
+ Log.e(TAG, "AudioRecord did not enter RECORDING state")
548
+ releaseAudioRecord()
549
+ if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
550
+ call.resolve(JSObject().apply {
551
+ put("started", false)
552
+ put("error", "AudioRecord did not start (mic likely held by SpeechRecognizer)")
553
+ })
554
+ return
555
+ }
556
+
557
+ audioFrameRunning.set(true)
558
+ launchFrameLoop(record, frameSamples)
559
+
560
+ call.resolve(JSObject().apply {
561
+ put("started", true)
562
+ put("sampleRate", actualRate)
563
+ put("frameSamples", frameSamples)
564
+ put("suspendedStt", sttSuspendedForFrames)
565
+ })
566
+ }
567
+
568
+ @PluginMethod
569
+ fun stopAudioFrames(call: PluginCall) {
570
+ stopAudioFramesInternal()
571
+ call.resolve()
572
+ }
573
+
574
+ @PluginMethod
575
+ fun isCapturingAudioFrames(call: PluginCall) {
576
+ call.resolve(JSObject().apply {
577
+ put("capturing", audioFrameRunning.get())
578
+ })
579
+ }
580
+
581
+ /**
582
+ * Open a mono 16-bit AudioRecord at the given sampleRate (callers default to 16 kHz). Tries VOICE_RECOGNITION first (the
583
+ * pre-processing-light source diarization wants), then falls back to MIC.
584
+ */
585
+ private fun openAudioRecord(sampleRate: Int): AudioRecord {
586
+ val minBuffer = AudioRecord.getMinBufferSize(
587
+ sampleRate,
588
+ AudioFormat.CHANNEL_IN_MONO,
589
+ AudioFormat.ENCODING_PCM_16BIT
590
+ )
591
+ if (minBuffer <= 0) {
592
+ throw IllegalStateException("AudioRecord min buffer invalid ($minBuffer) for ${sampleRate}Hz")
593
+ }
594
+ val bufferBytes = max(minBuffer * 2, 4 * 1024)
595
+ val sources = intArrayOf(
596
+ MediaRecorder.AudioSource.VOICE_RECOGNITION,
597
+ MediaRecorder.AudioSource.MIC,
598
+ )
599
+ var lastError: Throwable? = null
600
+ for (source in sources) {
601
+ try {
602
+ @Suppress("MissingPermission")
603
+ val record = AudioRecord(
604
+ source,
605
+ sampleRate,
606
+ AudioFormat.CHANNEL_IN_MONO,
607
+ AudioFormat.ENCODING_PCM_16BIT,
608
+ bufferBytes
609
+ )
610
+ if (record.state == AudioRecord.STATE_INITIALIZED) {
611
+ return record
612
+ }
613
+ record.release()
614
+ lastError = IllegalStateException("AudioRecord uninitialized for source $source")
615
+ } catch (e: Exception) {
616
+ lastError = e
617
+ }
618
+ }
619
+ throw IllegalStateException(
620
+ "AudioRecord could not initialize at ${sampleRate}Hz",
621
+ lastError
622
+ )
623
+ }
624
+
625
+ private fun launchFrameLoop(record: AudioRecord, frameSamples: Int) {
626
+ audioFrameJob?.cancel()
627
+ // IO dispatcher: a tight blocking read loop must not sit on the main
628
+ // thread. Frames are marshalled to JS via notifyListeners (thread-safe).
629
+ audioFrameJob = scope.launch(Dispatchers.IO) {
630
+ val buffer = ShortArray(frameSamples)
631
+ val bytes = ByteArray(frameSamples * 2)
632
+ var frameIndex = 0L
633
+ try {
634
+ while (audioFrameRunning.get() && isActive) {
635
+ val read = record.read(buffer, 0, frameSamples)
636
+ if (read <= 0) {
637
+ // ERROR_INVALID_OPERATION (-3) / ERROR_BAD_VALUE (-2):
638
+ // the record was released or the mic was taken; stop.
639
+ if (read < 0) break
640
+ continue
641
+ }
642
+ var sumSquares = 0.0
643
+ var b = 0
644
+ for (i in 0 until read) {
645
+ val s = buffer[i].toInt()
646
+ bytes[b] = (s and 0xff).toByte()
647
+ bytes[b + 1] = ((s shr 8) and 0xff).toByte()
648
+ b += 2
649
+ sumSquares += (s.toDouble() * s.toDouble())
650
+ }
651
+ val rms = if (read > 0) {
652
+ Math.sqrt(sumSquares / read) / 32768.0
653
+ } else 0.0
654
+ val pcmBase64 = Base64.encodeToString(
655
+ bytes, 0, read * 2, Base64.NO_WRAP
656
+ )
657
+ val idx = frameIndex
658
+ frameIndex += 1
659
+ val ts = SystemClock.elapsedRealtime()
660
+ notifyListeners("audioFrame", JSObject().apply {
661
+ put("pcm16", pcmBase64)
662
+ put("sampleRate", record.sampleRate)
663
+ put("channels", 1)
664
+ put("samples", read)
665
+ put("rms", rms)
666
+ put("timestamp", ts)
667
+ put("frameIndex", idx)
668
+ })
669
+ }
670
+ } catch (e: Throwable) {
671
+ Log.e(TAG, "Audio frame loop error", e)
672
+ notifyListeners("error", JSObject().apply {
673
+ put("message", "Audio frame capture stopped: ${e.message}")
674
+ put("fatal", false)
675
+ })
676
+ }
677
+ }
678
+ }
679
+
680
+ private fun stopAudioFramesInternal() {
681
+ if (!audioFrameRunning.getAndSet(false) && audioRecord == null) {
682
+ return
683
+ }
684
+ audioFrameJob?.cancel()
685
+ audioFrameJob = null
686
+ releaseAudioRecord()
687
+ if (sttSuspendedForFrames) {
688
+ resumeSpeechRecognizerAfterFrames()
689
+ }
690
+ }
691
+
692
+ private fun releaseAudioRecord() {
693
+ val record = audioRecord ?: return
694
+ audioRecord = null
695
+ try {
696
+ if (record.recordingState == AudioRecord.RECORDSTATE_RECORDING) {
697
+ record.stop()
698
+ }
699
+ } catch (_: Throwable) {
700
+ }
701
+ try {
702
+ record.release()
703
+ } catch (_: Throwable) {
704
+ }
705
+ }
706
+
707
+ /** Suspend SpeechRecognizer so AudioRecord can own the mic. */
708
+ private fun suspendSpeechRecognizerForFrames() {
709
+ sttSuspendedForFrames = true
710
+ listeningMode = false
711
+ isListening = false
712
+ restartJob?.cancel()
713
+ silenceJob?.cancel()
714
+ mainHandler.post {
715
+ try {
716
+ recognizer?.cancel()
717
+ recognizer?.destroy()
718
+ } catch (_: Throwable) {
719
+ }
720
+ recognizer = null
721
+ }
722
+ }
723
+
724
+ /** Re-arm SpeechRecognizer after frame capture ends, if a session is active. */
725
+ private fun resumeSpeechRecognizerAfterFrames() {
726
+ sttSuspendedForFrames = false
727
+ if (!enabled || stopRequested) return
728
+ listeningMode = true
729
+ mainHandler.post {
730
+ try {
731
+ if (!SpeechRecognizer.isRecognitionAvailable(context)) return@post
732
+ recognizer?.destroy()
733
+ recognizer = SpeechRecognizer.createSpeechRecognizer(context).apply {
734
+ setRecognitionListener(recognitionListener)
735
+ }
736
+ startListeningInternal(markListening = true)
737
+ startSilenceMonitor()
738
+ } catch (e: Exception) {
739
+ Log.e(TAG, "Failed to resume STT after frames", e)
740
+ }
741
+ }
742
+ }
743
+
411
744
  // ── Config ──────────────────────────────────────────────────────────
412
745
 
413
746
  private fun applyConfig(config: JSObject) {
@@ -462,6 +795,13 @@ class TalkModePlugin : Plugin() {
462
795
  putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
463
796
  putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
464
797
  putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)
798
+ // On-device recognizer (no network round-trip; works offline). The
799
+ // platform recognizer's open/close cadence during continuous use is
800
+ // intrinsic and not controllable via the silence-length extras (the
801
+ // on-device SODA engine ignores them); we silence the AUDIBLE part of
802
+ // that churn by muting the earcon streams for the session instead
803
+ // (see configureVoiceAudioSession).
804
+ putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true)
465
805
  sttLanguage?.let { putExtra(RecognizerIntent.EXTRA_LANGUAGE, it) }
466
806
  }
467
807
 
@@ -515,13 +855,14 @@ class TalkModePlugin : Plugin() {
515
855
  val elapsed = SystemClock.elapsedRealtime() - lastHeard
516
856
  if (elapsed < silenceWindowMs) return
517
857
 
518
- // Finalize: emit a final transcript event
519
- notifyListeners("transcript", JSObject().apply {
520
- put("transcript", transcript)
521
- put("isFinal", true)
522
- })
858
+ // Finalize this turn (deduped against the recognizer's own onResults),
859
+ // then restart the recognizer so the next utterance is a CLEAN session —
860
+ // Android SpeechRecognizer accumulates within a session, so without the
861
+ // restart the next turn's partials would prepend the words we just sent.
523
862
  lastTranscript = ""
524
863
  lastHeardAtMs = null
864
+ emitFinalOnce(transcript)
865
+ scheduleRestart()
525
866
  }
526
867
 
527
868
  private fun handleTranscript(transcript: String, isFinal: Boolean) {
@@ -531,34 +872,71 @@ class TalkModePlugin : Plugin() {
531
872
  if (isSpeaking && interruptOnSpeech) {
532
873
  if (shouldInterrupt(transcript)) {
533
874
  val interruptedAt = computeInterruptedAt()
534
- stopSpeakingInternal()
535
875
  lastInterruptedAtSeconds = interruptedAt
876
+ stopSpeakingInternal()
536
877
  }
537
878
  return
538
879
  }
539
880
 
540
881
  if (!isListening) return
541
882
 
542
- if (transcript.isNotEmpty()) {
883
+ if (isFinal) {
884
+ // A real end-of-turn from the recognizer: emit once and clear the
885
+ // pending buffer so the silence monitor doesn't re-finalize the same
886
+ // words (the double-send bug).
887
+ lastTranscript = ""
888
+ lastHeardAtMs = null
889
+ emitFinalOnce(transcript)
890
+ } else {
543
891
  lastTranscript = transcript
544
892
  lastHeardAtMs = SystemClock.elapsedRealtime()
893
+ notifyListeners("transcript", JSObject().apply {
894
+ put("transcript", transcript)
895
+ put("isFinal", false)
896
+ })
545
897
  }
898
+ }
546
899
 
900
+ /**
901
+ * Emit a FINAL transcript exactly once. Both the recognizer's `onResults`
902
+ * and the silence monitor can finalize the same utterance; collapse them so
903
+ * the turn is sent a single time (a repeated final within 2s is dropped).
904
+ */
905
+ private fun emitFinalOnce(transcript: String) {
906
+ val text = transcript.trim()
907
+ if (text.isEmpty()) return
908
+ val now = SystemClock.elapsedRealtime()
909
+ if (text == lastEmittedFinal && now - lastEmittedFinalAtMs < 2000L) return
910
+ lastEmittedFinal = text
911
+ lastEmittedFinalAtMs = now
547
912
  notifyListeners("transcript", JSObject().apply {
548
- put("transcript", transcript)
549
- put("isFinal", isFinal)
913
+ put("transcript", text)
914
+ put("isFinal", true)
550
915
  })
551
916
  }
552
917
 
553
918
  /**
554
- * Avoid false interrupts: don't interrupt if the heard text is just a
555
- * substring of what we're currently speaking (echo from speaker).
919
+ * Decide whether heard speech should barge in on the agent's TTS. Tuned to
920
+ * avoid FALSE interrupts (which cut the reply mid-sentence and read as
921
+ * "intermittent audio"): a one-word ASR blip, background noise, or the
922
+ * agent's own voice bleeding back into the mic must NOT interrupt — only a
923
+ * genuine utterance from the user (two or more words, or a single word of 8+ characters) does.
556
924
  */
557
925
  private fun shouldInterrupt(transcript: String): Boolean {
558
926
  val trimmed = transcript.trim()
559
- if (trimmed.length < 3) return false
560
- val spoken = lastSpokenText?.lowercase()
561
- if (spoken != null && spoken.contains(trimmed.lowercase())) return false
927
+ val lower = trimmed.lowercase()
928
+ val words = lower.split(Regex("\\s+")).filter { it.isNotBlank() }
929
+ // Need real intent: at least two words, or one long word (≥ 8 chars).
930
+ if (words.size < 2 && trimmed.length < 8) return false
931
+ val spoken = lastSpokenText?.lowercase() ?: return true
932
+ // Exact echo of what we're saying → speaker bleed, not the user.
933
+ if (spoken.contains(lower)) return false
934
+ // Fuzzy echo: if most of the heard words appear in the text we're
935
+ // currently speaking, treat it as echo (ASR mishears of our own audio).
936
+ val echoed = words.count { spoken.contains(it) }
937
+ if (words.isNotEmpty() && echoed.toDouble() / words.size >= 0.6) {
938
+ return false
939
+ }
562
940
  return true
563
941
  }
564
942
 
@@ -588,6 +966,7 @@ class TalkModePlugin : Plugin() {
588
966
  private suspend fun speakInternal(
589
967
  text: String,
590
968
  forceSystemTts: Boolean,
969
+ useLocalInferenceTts: Boolean,
591
970
  directive: JSObject?,
592
971
  call: PluginCall
593
972
  ) {
@@ -596,6 +975,7 @@ class TalkModePlugin : Plugin() {
596
975
  lastSpokenText = text
597
976
  speakStartTimeMs = SystemClock.elapsedRealtime()
598
977
  pcmStopRequested.set(false)
978
+ lastInterruptedAtSeconds = null
599
979
  setState("speaking", "Speaking")
600
980
 
601
981
  val effectiveVoiceId = directive.stringOrNull("voiceId")?.let(::resolveVoiceAlias) ?: voiceId
@@ -603,27 +983,74 @@ class TalkModePlugin : Plugin() {
603
983
 
604
984
  notifyListeners("speaking", JSObject().apply {
605
985
  put("text", text)
606
- put("isSystemTts", forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
986
+ put(
987
+ "isSystemTts",
988
+ !useLocalInferenceTts &&
989
+ (forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
990
+ )
607
991
  })
608
992
 
609
993
  // Stop listening during speech (we keep recognizer for interrupt detection)
610
994
  mainHandler.post { recognizer?.stopListening() }
611
995
  ensureInterruptListener()
612
996
 
613
- // Request audio focus
614
- requestAudioFocus()
997
+ // Ensure the communication-mode session + audio focus are active even
998
+ // for a standalone speak() that wasn't preceded by start().
999
+ configureVoiceAudioSession()
1000
+ // Re-assert loudspeaker routing right before playback. configureVoiceAudioSession()
1001
+ // only routes on the FIRST activation; if the session was already up (the
1002
+ // STT path opened it) the speaker route may have drifted, leaving TTS on
1003
+ // the earpiece. Re-route here so replies are audible out the speaker.
1004
+ audioManager?.let { routeVoiceOutput(it) }
615
1005
 
616
1006
  try {
617
- val canUseElevenLabs = !forceSystemTts &&
1007
+ val canUseLocalInference = useLocalInferenceTts && !forceSystemTts
1008
+ val canUseElevenLabs = !canUseLocalInference &&
1009
+ !forceSystemTts &&
618
1010
  !effectiveApiKey.isNullOrEmpty() &&
619
1011
  !effectiveVoiceId.isNullOrEmpty()
620
1012
 
621
- if (canUseElevenLabs) {
1013
+ if (canUseLocalInference) {
1014
+ try {
1015
+ streamAndPlayLocalInferenceTts(text, directive)
1016
+
1017
+ if (!pcmStopRequested.get()) {
1018
+ call.resolve(JSObject().apply {
1019
+ put("completed", true)
1020
+ put("interrupted", false)
1021
+ put("usedSystemTts", false)
1022
+ })
1023
+ } else {
1024
+ call.resolve(JSObject().apply {
1025
+ put("completed", false)
1026
+ put("interrupted", true)
1027
+ put("usedSystemTts", false)
1028
+ lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
1029
+ })
1030
+ }
1031
+ } catch (e: Exception) {
1032
+ if (pcmStopRequested.get()) {
1033
+ call.resolve(JSObject().apply {
1034
+ put("completed", false)
1035
+ put("interrupted", true)
1036
+ put("usedSystemTts", false)
1037
+ })
1038
+ } else {
1039
+ // The on-device OmniVoice TTS assets aren't always staged
1040
+ // (it 502s "TEXT_TO_SPEECH not available"). Rather than go
1041
+ // silent — the JS browser-SpeechSynthesis fallback doesn't
1042
+ // exist in the Android WebView — fall back to the platform
1043
+ // TextToSpeech so replies are always spoken aloud.
1044
+ Log.w(TAG, "Local inference TTS failed, falling back to system TTS", e)
1045
+ speakWithSystemTts(text, call)
1046
+ }
1047
+ }
1048
+ } else if (canUseElevenLabs) {
622
1049
  try {
623
1050
  val request = buildElevenLabsRequest(text, directive)
624
1051
  streamAndPlayPcm(
625
- voiceId = effectiveVoiceId!!,
626
- apiKey = effectiveApiKey!!,
1052
+ voiceId = effectiveVoiceId,
1053
+ apiKey = effectiveApiKey,
627
1054
  request = request
628
1055
  )
629
1056
 
@@ -665,13 +1092,16 @@ class TalkModePlugin : Plugin() {
665
1092
  put("error", e.message ?: "Speak failed")
666
1093
  })
667
1094
  } finally {
1095
+ val wasInterrupted = pcmStopRequested.get()
1096
+ val interruptedAt = lastInterruptedAtSeconds
668
1097
  isSpeaking = false
669
1098
  pcmStopRequested.set(false)
670
- abandonAudioFocus()
671
1099
 
672
1100
  notifyListeners("speakComplete", JSObject().apply {
673
- put("completed", !pcmStopRequested.get())
674
- lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
1101
+ put("completed", !wasInterrupted)
1102
+ if (wasInterrupted) {
1103
+ interruptedAt?.let { put("interruptedAt", it) }
1104
+ }
675
1105
  })
676
1106
 
677
1107
  if (enabled) {
@@ -679,6 +1109,8 @@ class TalkModePlugin : Plugin() {
679
1109
  setState("listening", "Listening")
680
1110
  mainHandler.post { startListeningInternal(markListening = true) }
681
1111
  } else {
1112
+ // Standalone speak (no active conversation): release the session.
1113
+ releaseVoiceAudioSession()
682
1114
  setState("idle", "Off")
683
1115
  }
684
1116
  }
@@ -753,6 +1185,273 @@ class TalkModePlugin : Plugin() {
753
1185
  return if (value == null || value === JSONObject.NULL) null else value.toString()
754
1186
  }
755
1187
 
1188
+ private data class PcmStreamFormat(
1189
+ val sampleRate: Int,
1190
+ val channels: Int,
1191
+ val bitsPerSample: Int,
1192
+ val dataBytes: Int
1193
+ )
1194
+
1195
+ /**
1196
+ * Stream local-inference TTS from the embedded agent and play it natively.
1197
+ *
1198
+ * The agent currently returns a buffered WAV, but keeping playback in
1199
+ * AudioTrack means this path is ready for a chunked PCM/WAV response without
1200
+ * going back through WebView decodeAudioData.
1201
+ */
1202
+ private suspend fun streamAndPlayLocalInferenceTts(
1203
+ text: String,
1204
+ directive: JSObject?
1205
+ ) = withContext(Dispatchers.IO) {
1206
+ pcmStopRequested.set(false)
1207
+ val conn = openLocalInferenceTtsConnection()
1208
+ activePcmConnection = conn
1209
+ try {
1210
+ val payload = buildLocalInferenceTtsPayload(text, directive)
1211
+ conn.outputStream.use { it.write(payload.toByteArray(Charsets.UTF_8)) }
1212
+
1213
+ val code = conn.responseCode
1214
+ if (code >= 400) {
1215
+ val errBody = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
1216
+ throw IllegalStateException("Local inference TTS error: $code $errBody")
1217
+ }
1218
+
1219
+ BufferedInputStream(conn.inputStream).use { input ->
1220
+ val format = readWavPcmFormat(input)
1221
+ val track = createPcmAudioTrack(format)
1222
+ pcmTrack = track
1223
+ track.play()
1224
+
1225
+ Log.d(
1226
+ TAG,
1227
+ "Local inference PCM play start sampleRate=${format.sampleRate} channels=${format.channels}"
1228
+ )
1229
+ notifyListeners("playbackStart", JSObject().apply {
1230
+ put("provider", "local-inference")
1231
+ put("sampleRate", format.sampleRate)
1232
+ put("channels", format.channels)
1233
+ })
1234
+ val framesWritten = writePcmStreamToTrack(input, track, format)
1235
+ drainPcmTrack(track, framesWritten, format.sampleRate)
1236
+ if (!pcmStopRequested.get()) {
1237
+ track.stop()
1238
+ }
1239
+ Log.d(TAG, "Local inference PCM play done frames=$framesWritten")
1240
+ }
1241
+ } finally {
1242
+ cleanupPcmTrack()
1243
+ if (activePcmConnection === conn) {
1244
+ activePcmConnection = null
1245
+ }
1246
+ conn.disconnect()
1247
+ }
1248
+ }
1249
+
1250
+ private fun openLocalInferenceTtsConnection(): HttpURLConnection {
1251
+ val tokenFile = File(context.filesDir, "auth/local-agent-token")
1252
+ val token = tokenFile.takeIf { it.isFile }?.readText()?.trim().orEmpty()
1253
+ if (token.isEmpty()) {
1254
+ throw IllegalStateException("Local agent auth token is missing")
1255
+ }
1256
+
1257
+ val conn = URL(LOCAL_INFERENCE_TTS_URL).openConnection() as HttpURLConnection
1258
+ conn.requestMethod = "POST"
1259
+ conn.connectTimeout = 30_000
1260
+ conn.readTimeout = 180_000
1261
+ conn.setRequestProperty("Authorization", "Bearer $token")
1262
+ conn.setRequestProperty("Content-Type", "application/json")
1263
+ conn.setRequestProperty("Accept", "audio/wav")
1264
+ conn.doOutput = true
1265
+ return conn
1266
+ }
1267
+
1268
+ private fun buildLocalInferenceTtsPayload(text: String, directive: JSObject?): String {
1269
+ val payload = JSONObject()
1270
+ payload.put("text", text)
1271
+ directive.stringOrNull("voiceId")?.let { payload.put("voiceId", it) }
1272
+ directive.stringOrNull("voice")?.let { payload.put("voice", it) }
1273
+ directive.stringOrNull("modelId")?.let { payload.put("modelId", it) }
1274
+ directive.stringOrNull("model")?.let { payload.put("model", it) }
1275
+ val speed = directive?.optDouble("speed", Double.NaN)
1276
+ if (speed != null && speed.isFinite() && speed > 0.0) {
1277
+ payload.put("speed", speed)
1278
+ }
1279
+ return payload.toString()
1280
+ }
1281
+
1282
+ private fun readExactly(input: BufferedInputStream, size: Int): ByteArray {
1283
+ val bytes = ByteArray(size)
1284
+ var offset = 0
1285
+ while (offset < size) {
1286
+ val read = input.read(bytes, offset, size - offset)
1287
+ if (read < 0) {
1288
+ throw IllegalStateException("Unexpected end of WAV stream")
1289
+ }
1290
+ offset += read
1291
+ }
1292
+ return bytes
1293
+ }
1294
+
1295
+ private fun skipFully(input: BufferedInputStream, count: Int) {
1296
+ var remaining = count
1297
+ while (remaining > 0) {
1298
+ val skipped = input.skip(remaining.toLong()).toInt()
1299
+ if (skipped > 0) {
1300
+ remaining -= skipped
1301
+ continue
1302
+ }
1303
+ if (input.read() < 0) {
1304
+ throw IllegalStateException("Unexpected end of WAV stream")
1305
+ }
1306
+ remaining -= 1
1307
+ }
1308
+ }
1309
+
1310
+ private fun littleEndianShort(bytes: ByteArray, offset: Int): Int {
1311
+ return (bytes[offset].toInt() and 0xff) or
1312
+ ((bytes[offset + 1].toInt() and 0xff) shl 8)
1313
+ }
1314
+
1315
+ private fun littleEndianInt(bytes: ByteArray, offset: Int): Int {
1316
+ return (bytes[offset].toInt() and 0xff) or
1317
+ ((bytes[offset + 1].toInt() and 0xff) shl 8) or
1318
+ ((bytes[offset + 2].toInt() and 0xff) shl 16) or
1319
+ ((bytes[offset + 3].toInt() and 0xff) shl 24)
1320
+ }
1321
+
1322
+ private fun chunkId(bytes: ByteArray): String {
1323
+ return String(bytes, 0, 4, Charsets.US_ASCII)
1324
+ }
1325
+
1326
/**
 * Parses a RIFF/WAVE header from [input] up to and including the `data` chunk
 * header, leaving the stream positioned at the first byte of sample data.
 *
 * Chunks other than `fmt ` and `data` are skipped (with the RIFF pad byte for
 * odd sizes). Only 16-bit integer PCM with one or two channels is accepted.
 *
 * @return the parsed format with `dataBytes` set to the declared `data` chunk size.
 * @throws IllegalStateException if the stream is not WAV, is truncated, declares
 *   a negative chunk size, has a malformed or unsupported `fmt ` chunk, or
 *   reaches `data` before any `fmt ` chunk.
 */
private fun readWavPcmFormat(input: BufferedInputStream): PcmStreamFormat {
    val riff = readExactly(input, 12)
    if (
        String(riff, 0, 4, Charsets.US_ASCII) != "RIFF" ||
        String(riff, 8, 4, Charsets.US_ASCII) != "WAVE"
    ) {
        throw IllegalStateException("Local inference TTS returned non-WAV audio")
    }

    // Size of the mandatory part of a `fmt ` chunk; every field read below lives in it.
    val fmtCoreBytes = 16

    var format: PcmStreamFormat? = null
    while (true) {
        val header = readExactly(input, 8)
        val id = chunkId(header)
        val size = littleEndianInt(header, 4)
        if (size < 0) {
            throw IllegalStateException("Invalid WAV chunk size for $id")
        }

        when (id) {
            "fmt " -> {
                // Validate the declared size BEFORE allocating anything: the size
                // comes straight from the stream and may be as large as 2 GiB.
                if (size < fmtCoreBytes) {
                    throw IllegalStateException("Invalid WAV fmt chunk")
                }
                val fmt = readExactly(input, fmtCoreBytes)
                val audioFormat = littleEndianShort(fmt, 0)
                val channels = littleEndianShort(fmt, 2)
                val sampleRate = littleEndianInt(fmt, 4)
                val bitsPerSample = littleEndianShort(fmt, 14)
                if (audioFormat != 1) {
                    throw IllegalStateException("Only PCM WAV is supported, got format=$audioFormat")
                }
                if (bitsPerSample != 16) {
                    throw IllegalStateException("Only 16-bit PCM WAV is supported, got bits=$bitsPerSample")
                }
                if (channels !in 1..2 || sampleRate <= 0) {
                    throw IllegalStateException("Invalid WAV format sampleRate=$sampleRate channels=$channels")
                }
                format = PcmStreamFormat(sampleRate, channels, bitsPerSample, 0)
                // Discard any extension bytes plus the pad byte of an odd-sized
                // chunk. size >= 16 here, so this sum cannot overflow.
                skipFully(input, (size - fmtCoreBytes) + (size and 1))
            }

            "data" -> {
                val parsed = format ?: throw IllegalStateException("WAV data arrived before fmt chunk")
                return parsed.copy(dataBytes = size)
            }

            else -> {
                // Unknown chunk: skip payload and pad byte separately so an odd
                // size of Int.MAX_VALUE cannot overflow.
                skipFully(input, size)
                if (size % 2 == 1) skipFully(input, 1)
            }
        }
    }
}
1376
+
1377
/**
 * Builds a streaming-mode [AudioTrack] for 16-bit PCM matching [format]'s
 * sample rate and channel count, using the shared voice audio attributes.
 *
 * The buffer is twice the platform minimum, but never less than 8 KiB.
 *
 * @throws IllegalStateException if the channel count is not 1 or 2, the
 *   platform reports a non-positive minimum buffer size, or the track does
 *   not reach the initialized state (it is released before throwing).
 */
private fun createPcmAudioTrack(format: PcmStreamFormat): AudioTrack {
    val outputMask = when (format.channels) {
        1 -> AudioFormat.CHANNEL_OUT_MONO
        2 -> AudioFormat.CHANNEL_OUT_STEREO
        else -> throw IllegalStateException("Unsupported PCM channel count ${format.channels}")
    }

    val minBuffer = AudioTrack.getMinBufferSize(
        format.sampleRate,
        outputMask,
        AudioFormat.ENCODING_PCM_16BIT
    )
    check(minBuffer > 0) { "AudioTrack buffer size invalid: $minBuffer" }

    val pcmFormat = AudioFormat.Builder()
        .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
        .setSampleRate(format.sampleRate)
        .setChannelMask(outputMask)
        .build()

    val pcmTrack = AudioTrack.Builder()
        .setAudioAttributes(voiceAudioAttributes())
        .setAudioFormat(pcmFormat)
        .setBufferSizeInBytes((minBuffer * 2).coerceAtLeast(8 * 1024))
        .setTransferMode(AudioTrack.MODE_STREAM)
        .build()

    if (pcmTrack.state == AudioTrack.STATE_INITIALIZED) {
        return pcmTrack
    }
    // Release the half-constructed track before reporting the failure.
    pcmTrack.release()
    throw IllegalStateException("AudioTrack init failed")
}
1411
+
1412
/**
 * Copies up to `format.dataBytes` bytes of PCM from [input] into [track],
 * stopping early when the stream ends or `pcmStopRequested` is set.
 *
 * A read or write failure that occurs after a stop was requested is treated as
 * the end of playback rather than an error. NOTE(review): this assumes the stop
 * path may close the stream and/or release the track while this loop is
 * running; confirm against `stopSpeakingInternal` / `cleanupPcmTrack`.
 *
 * @return the number of whole frames written to [track].
 * @throws IllegalStateException if [track] rejects a write and no stop was requested.
 * @throws java.io.IOException if reading [input] fails and no stop was requested.
 */
private fun writePcmStreamToTrack(
    input: BufferedInputStream,
    track: AudioTrack,
    format: PcmStreamFormat
): Long {
    val bytesPerFrame = format.channels * (format.bitsPerSample / 8)
    var bytesWrittenTotal = 0L
    var remainingBytes = format.dataBytes
    val buffer = ByteArray(8 * 1024)

    streaming@ while (remainingBytes > 0 && !pcmStopRequested.get()) {
        // Never read past the declared end of the data chunk.
        val requestBytes = minOf(remainingBytes, buffer.size)
        val bytesRead = try {
            input.read(buffer, 0, requestBytes)
        } catch (e: java.io.IOException) {
            // A read that fails after a stop request is part of the stop, not an error.
            if (pcmStopRequested.get()) break@streaming
            throw e
        }
        if (bytesRead <= 0) break
        remainingBytes -= bytesRead

        var offset = 0
        while (offset < bytesRead && !pcmStopRequested.get()) {
            val wrote = track.write(buffer, offset, bytesRead - offset)
            if (wrote <= 0) {
                // Same reasoning for a write that fails once stop has been requested.
                if (pcmStopRequested.get()) break@streaming
                throw IllegalStateException("AudioTrack write failed: $wrote")
            }
            offset += wrote
            bytesWrittenTotal += wrote.toLong()
        }
    }
    return if (bytesPerFrame > 0) bytesWrittenTotal / bytesPerFrame else 0L
}
1441
+
1442
/**
 * Blocks until [track] has played [framesWritten] frames, a stop is requested,
 * or a deadline passes, polling every 20 ms.
 *
 * The deadline is the audio's duration at [sampleRate] (capped at 30 s) plus a
 * 1 s margin. Returns immediately when there is nothing to drain.
 */
private fun drainPcmTrack(track: AudioTrack, framesWritten: Long, sampleRate: Int) {
    if (framesWritten <= 0L || sampleRate <= 0) return

    val playbackMs = framesWritten * 1000L / sampleRate
    val budgetMs = minOf(playbackMs, 30_000L) + 1_000L
    val deadline = SystemClock.elapsedRealtime() + budgetMs

    while (true) {
        if (pcmStopRequested.get()) return
        if (track.playbackHeadPosition.toLong() >= framesWritten) return
        if (SystemClock.elapsedRealtime() >= deadline) return
        SystemClock.sleep(20)
    }
}
1454
+
756
1455
  /**
757
1456
  * Stream PCM audio from ElevenLabs and play via AudioTrack.
758
1457
  * Ported from classic TalkModeManager with proper offset-based writes.
@@ -776,12 +1475,7 @@ class TalkModePlugin : Plugin() {
776
1475
 
777
1476
  val bufferSize = max(minBuffer * 2, 8 * 1024)
778
1477
  val track = AudioTrack.Builder()
779
- .setAudioAttributes(
780
- AudioAttributes.Builder()
781
- .setUsage(AudioAttributes.USAGE_ASSISTANT)
782
- .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
783
- .build()
784
- )
1478
+ .setAudioAttributes(voiceAudioAttributes())
785
1479
  .setAudioFormat(
786
1480
  AudioFormat.Builder()
787
1481
  .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
@@ -802,6 +1496,7 @@ class TalkModePlugin : Plugin() {
802
1496
 
803
1497
  Log.d(TAG, "PCM play start sampleRate=$sampleRate bufferSize=$bufferSize")
804
1498
  val conn = openTtsConnection(voiceId, apiKey, request)
1499
+ activePcmConnection = conn
805
1500
  try {
806
1501
  val payload = buildRequestPayload(request)
807
1502
  conn.outputStream.use { it.write(payload.toByteArray()) }
@@ -845,6 +1540,9 @@ class TalkModePlugin : Plugin() {
845
1540
  Log.d(TAG, "PCM play done")
846
1541
  } finally {
847
1542
  cleanupPcmTrack()
1543
+ if (activePcmConnection === conn) {
1544
+ activePcmConnection = null
1545
+ }
848
1546
  conn.disconnect()
849
1547
  }
850
1548
  }
@@ -970,43 +1668,125 @@ class TalkModePlugin : Plugin() {
970
1668
  }
971
1669
  }
972
1670
 
973
- // ── Audio focus ─────────────────────────────────────────────────────
1671
+ // ── Voice audio session ─────────────────────────────────────────────
1672
+ //
1673
+ // The Android analog of the iOS `.playAndRecord` / `.voiceChat` /
1674
+ // `.defaultToSpeaker` session. Putting the device in MODE_IN_COMMUNICATION
1675
+ // for the whole conversation routes capture + playback through the
1676
+ // telephony path, which engages the platform hardware AEC so TTS coming out
1677
+ // the speaker is cancelled from the mic (the core fix for the mic+speaker
1678
+ // echo loop in hands-free mode). We also hold voice-communication audio
1679
+ // focus and route to the loudspeaker (unless a headset is connected) so
1680
+ // hands-free playback is audible.
1681
+
1682
/**
 * Audio attributes shared by playback and the focus request: voice-communication
 * usage with speech content type. A fresh instance is built on every call.
 */
private fun voiceAudioAttributes(): AudioAttributes {
    val builder = AudioAttributes.Builder()
    builder.setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION)
    builder.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
    return builder.build()
}
974
1687
 
975
- private fun requestAudioFocus() {
1688
/**
 * Puts the device into the voice-conversation audio configuration.
 *
 * No-op when a session is already active or no audio manager is available.
 * Otherwise, in order: remembers the current audio mode and speakerphone
 * state, requests transient-exclusive audio focus, switches to
 * `MODE_IN_COMMUNICATION`, routes output, mutes the earcon streams, and marks
 * the session active.
 */
private fun configureVoiceAudioSession() {
    if (audioSessionActive) return
    val manager = audioManager ?: return

    // Snapshot what releaseVoiceAudioSession() will restore.
    savedAudioMode = manager.mode
    @Suppress("DEPRECATION")
    savedSpeakerphoneOn = manager.isSpeakerphoneOn

    val onFocusChange = AudioManager.OnAudioFocusChangeListener { change ->
        val focusLost = when (change) {
            AudioManager.AUDIOFOCUS_LOSS,
            AudioManager.AUDIOFOCUS_LOSS_TRANSIENT -> true
            else -> false
        }
        // Another app took audio; stop speaking if we are.
        if (focusLost && isSpeaking) stopSpeakingInternal()
    }
    val focusRequest = AudioFocusRequest.Builder(AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_EXCLUSIVE)
        .setAudioAttributes(voiceAudioAttributes())
        .setOnAudioFocusChangeListener(onFocusChange)
        .build()
    audioFocusRequest = focusRequest
    manager.requestAudioFocus(focusRequest)

    manager.mode = AudioManager.MODE_IN_COMMUNICATION
    routeVoiceOutput(manager)
    muteEarconStreams(manager)
    audioSessionActive = true
    Log.d(TAG, "Voice audio session active (communication mode)")
}
1717
+
1718
/**
 * Mutes every stream in `earconStreams` for the duration of the session.
 *
 * Idempotent: does nothing when the streams are already flagged as muted.
 * Failures on individual streams are ignored on purpose (best effort).
 */
private fun muteEarconStreams(am: AudioManager) {
    if (!earconStreamsMuted) {
        earconStreams.forEach { stream ->
            // Some OEMs disallow muting certain streams without DND access.
            runCatching { am.adjustStreamVolume(stream, AudioManager.ADJUST_MUTE, 0) }
        }
        earconStreamsMuted = true
    }
}
989
1730
 
1731
/**
 * Unmutes every stream in `earconStreams`.
 *
 * Does nothing unless the streams are currently flagged as muted. Failures on
 * individual streams are ignored (best effort).
 */
private fun unmuteEarconStreams(am: AudioManager) {
    if (earconStreamsMuted) {
        earconStreams.forEach { stream ->
            runCatching { am.adjustStreamVolume(stream, AudioManager.ADJUST_UNMUTE, 0) }
        }
        earconStreamsMuted = false
    }
}
1740
+
1741
/**
 * Chooses the playback route for the voice session: the loudspeaker by
 * default, unless a wired, USB or Bluetooth headset output is present — the
 * iOS `.defaultToSpeaker` semantic.
 *
 * On Android S and later the communication-device API is tried first; the
 * deprecated speakerphone flag is the fallback.
 */
private fun routeVoiceOutput(am: AudioManager) {
    val headsetTypes = setOf(
        AudioDeviceInfo.TYPE_WIRED_HEADSET,
        AudioDeviceInfo.TYPE_WIRED_HEADPHONES,
        AudioDeviceInfo.TYPE_USB_HEADSET,
        AudioDeviceInfo.TYPE_BLUETOOTH_SCO,
        AudioDeviceInfo.TYPE_BLUETOOTH_A2DP
    )
    val headsetAttached = am.getDevices(AudioManager.GET_DEVICES_OUTPUTS)
        .any { it.type in headsetTypes }

    if (headsetAttached) {
        // Drop any explicit device choice and make sure the speakerphone is off.
        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) am.clearCommunicationDevice()
        @Suppress("DEPRECATION")
        am.isSpeakerphoneOn = false
        return
    }

    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
        // Only the first built-in speaker entry is considered.
        for (device in am.availableCommunicationDevices) {
            if (device.type != AudioDeviceInfo.TYPE_BUILTIN_SPEAKER) continue
            if (am.setCommunicationDevice(device)) return
            break
        }
    }

    // Older platforms, or the communication-device route was not accepted.
    @Suppress("DEPRECATION")
    am.isSpeakerphoneOn = true
}
997
1768
 
998
- private fun abandonAudioFocus() {
1769
/**
 * Tears down the voice audio session set up by `configureVoiceAudioSession`.
 *
 * No-op when no session is active or no audio manager is available.
 * Otherwise, in order: unmutes the earcon streams, abandons audio focus,
 * clears the communication device (Android S+), restores the saved
 * speakerphone state and audio mode, and marks the session inactive.
 */
private fun releaseVoiceAudioSession() {
    if (!audioSessionActive) return
    val manager = audioManager ?: return

    unmuteEarconStreams(manager)

    val heldFocus = audioFocusRequest
    if (heldFocus != null) {
        manager.abandonAudioFocusRequest(heldFocus)
    }
    audioFocusRequest = null

    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) manager.clearCommunicationDevice()
    @Suppress("DEPRECATION")
    manager.isSpeakerphoneOn = savedSpeakerphoneOn
    manager.mode = savedAudioMode

    audioSessionActive = false
    Log.d(TAG, "Voice audio session released")
}
1005
1782
 
1006
1783
  // ── Cleanup helpers ─────────────────────────────────────────────────
1007
1784
 
1008
1785
  private fun stopSpeakingInternal() {
1009
1786
  pcmStopRequested.set(true)
1787
+ val conn = activePcmConnection
1788
+ activePcmConnection = null
1789
+ conn?.disconnect()
1010
1790
  cleanupPcmTrack()
1011
1791
  systemTts?.stop()
1012
1792
  systemTtsPending?.cancel()
@@ -1162,6 +1942,9 @@ class TalkModePlugin : Plugin() {
1162
1942
  }
1163
1943
 
1164
1944
  private fun isPermissionGranted(permission: String): Boolean {
1945
+ if (permission == Manifest.permission.RECORD_AUDIO) {
1946
+ return context.checkSelfPermission(permission) == PackageManager.PERMISSION_GRANTED
1947
+ }
1165
1948
  return getPermissionState(permission) == com.getcapacitor.PermissionState.GRANTED
1166
1949
  }
1167
1950
 
@@ -1176,10 +1959,13 @@ class TalkModePlugin : Plugin() {
1176
1959
  systemTts?.shutdown()
1177
1960
  systemTts = null
1178
1961
  cleanupPcmTrack()
1962
+ audioFrameRunning.set(false)
1963
+ audioFrameJob?.cancel()
1964
+ releaseAudioRecord()
1179
1965
  silenceJob?.cancel()
1180
1966
  restartJob?.cancel()
1181
1967
  speakingJob?.cancel()
1182
- abandonAudioFocus()
1968
+ releaseVoiceAudioSession()
1183
1969
  scope.cancel()
1184
1970
  }
1185
1971