@elizaos/capacitor-talkmode 2.0.0-beta.1 → 2.0.3-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,10 +2,17 @@ package ai.eliza.plugins.talkmode
2
2
 
3
3
  import android.Manifest
4
4
  import android.content.Intent
5
+ import android.content.pm.PackageManager
5
6
  import android.media.AudioAttributes
7
+ import android.media.AudioDeviceInfo
8
+ import android.media.AudioFocusRequest
6
9
  import android.media.AudioFormat
7
10
  import android.media.AudioManager
11
+ import android.media.AudioRecord
8
12
  import android.media.AudioTrack
13
+ import android.media.MediaRecorder
14
+ import android.util.Base64
15
+ import android.os.Build
9
16
  import android.os.Bundle
10
17
  import android.os.Handler
11
18
  import android.os.Looper
@@ -25,7 +32,15 @@ import com.getcapacitor.annotation.CapacitorPlugin
25
32
  import com.getcapacitor.annotation.Permission
26
33
  import com.getcapacitor.annotation.PermissionCallback
27
34
  import kotlinx.coroutines.*
35
+ import android.net.LocalSocket
36
+ import android.net.LocalSocketAddress
28
37
  import java.io.BufferedInputStream
38
+ import java.io.ByteArrayInputStream
39
+ import java.io.DataInputStream
40
+ import java.io.DataOutputStream
41
+ import java.nio.ByteBuffer
42
+ import java.nio.ByteOrder
43
+ import java.io.File
29
44
  import java.net.HttpURLConnection
30
45
  import java.net.URL
31
46
  import java.util.Locale
@@ -45,6 +60,15 @@ class TalkModePlugin : Plugin() {
45
60
  private const val TAG = "TalkMode"
46
61
  private const val DEFAULT_MODEL_ID = "eleven_flash_v2_5"
47
62
  private const val DEFAULT_OUTPUT_FORMAT = "pcm_24000"
63
+ private const val LOCAL_INFERENCE_TTS_URL = "http://127.0.0.1:31337/api/tts/local-inference"
64
+ // Abstract-namespace UDS of ElizaBionicInferenceServer (the bionic app
65
+ // process that has libelizainference loaded). Kept in sync with
66
+ // BIONIC_INFERENCE_SOCKET_NAME in ElizaAgentService.
67
+ private const val BIONIC_INFER_SOCKET = "eliza_bionic_infer_v1"
68
+ // 16 kHz mono is the rate VAD / diarizer / wake-word models expect; 20 ms
69
+ // (320 samples) is the standard VAD frame size.
70
+ private const val DEFAULT_FRAME_SAMPLE_RATE = 16000
71
+ private const val DEFAULT_FRAME_MS = 20
48
72
  }
49
73
 
50
74
  private val mainHandler = Handler(Looper.getMainLooper())
@@ -60,11 +84,19 @@ class TalkModePlugin : Plugin() {
60
84
  private var isListening = false
61
85
  private var listeningMode = false
62
86
  private var stopRequested = false
87
+ // Consecutive ERROR_NO_MATCH/SPEECH_TIMEOUT count, for exponential restart
88
+ // backoff so an idle always-on session settles instead of re-arming (and,
89
+ // with the system recognizer, beeping) every ~600ms when nobody is talking.
90
+ private var consecutiveNoMatch = 0
63
91
  private var restartJob: Job? = null
64
92
  private var lastTranscript = ""
65
93
  private var lastHeardAtMs: Long? = null
66
94
  private var silenceJob: Job? = null
67
95
  private val silenceWindowMs = 700L
96
+ // The recognizer's own onResults AND our silence monitor can both finalize
97
+ // the same utterance; dedup so a turn is emitted (and sent) exactly once.
98
+ private var lastEmittedFinal = ""
99
+ private var lastEmittedFinalAtMs = 0L
68
100
 
69
101
  // TTS
70
102
  private var systemTts: TextToSpeech? = null
@@ -79,10 +111,37 @@ class TalkModePlugin : Plugin() {
79
111
  private var lastSpokenText: String? = null
80
112
  private var speakStartTimeMs: Long = 0
81
113
  private var lastInterruptedAtSeconds: Double? = null
114
+ @Volatile private var activePcmConnection: HttpURLConnection? = null
82
115
 
83
- // Audio focus
116
+ // Voice audio session (communication-mode routing + focus, mirrors the iOS
117
+ // .playAndRecord/.voiceChat/.defaultToSpeaker session). Held for the whole
118
+ // conversation so the platform AEC has a stable speaker reference to cancel.
84
119
  private var audioManager: AudioManager? = null
85
- private var audioFocusRequest: AudioManager.OnAudioFocusChangeListener? = null
120
+ private var audioFocusRequest: AudioFocusRequest? = null
121
+ private var audioSessionActive = false
122
+ private var savedAudioMode = AudioManager.MODE_NORMAL
123
+ private var savedSpeakerphoneOn = false
124
+ // Streams we mute for the session to suppress the platform recognizer's
125
+ // start/stop earcons (the "on/off" beeps heard as it re-arms continuously).
126
+ // TTS plays on STREAM_VOICE_CALL (USAGE_VOICE_COMMUNICATION) so it stays
127
+ // audible. Tracked so we only unmute streams we muted.
128
+ private val earconStreams = intArrayOf(
129
+ AudioManager.STREAM_MUSIC,
130
+ AudioManager.STREAM_SYSTEM,
131
+ AudioManager.STREAM_NOTIFICATION,
132
+ )
133
+ private var earconStreamsMuted = false
134
+
135
+ // Raw PCM frame capture (diarization / VAD / wake-word source). Opt-in and
136
+ // mutually exclusive with SpeechRecognizer on the mic: Android only lets one
137
+ // capture client own a given input source at a time, so starting frame
138
+ // capture SUSPENDS any active SpeechRecognizer and stopping it resumes STT.
139
+ private var audioRecord: AudioRecord? = null
140
+ private var audioFrameJob: Job? = null
141
+ private val audioFrameRunning = AtomicBoolean(false)
142
+ private var sttSuspendedForFrames = false
143
+ private var lastFrameSampleRate = DEFAULT_FRAME_SAMPLE_RATE
144
+ private var lastFrameSamples = 0
86
145
 
87
146
  // Config
88
147
  private var apiKey: String? = null
@@ -106,6 +165,7 @@ class TalkModePlugin : Plugin() {
106
165
 
107
166
  override fun onBeginningOfSpeech() {
108
167
  Log.d(TAG, "Beginning of speech")
168
+ consecutiveNoMatch = 0
109
169
  }
110
170
 
111
171
  override fun onRmsChanged(rmsdB: Float) {}
@@ -142,24 +202,34 @@ class TalkModePlugin : Plugin() {
142
202
  return
143
203
  }
144
204
 
145
- // Don't notify error for no-match / speech-timeout, just restart
146
- if (error != SpeechRecognizer.ERROR_NO_MATCH &&
147
- error != SpeechRecognizer.ERROR_SPEECH_TIMEOUT
205
+ // Don't notify error for no-match / speech-timeout, just restart.
206
+ // These fire continuously when the always-on session hears only
207
+ // silence, so back off exponentially (600ms → 8s cap) instead of
208
+ // re-arming the recognizer every 600ms. onBeginningOfSpeech /
209
+ // onResults reset the counter the moment real speech arrives.
210
+ if (error == SpeechRecognizer.ERROR_NO_MATCH ||
211
+ error == SpeechRecognizer.ERROR_SPEECH_TIMEOUT
148
212
  ) {
213
+ consecutiveNoMatch++
214
+ scheduleRestart(
215
+ delayMs = minOf(600L * (1L shl minOf(consecutiveNoMatch, 4)), 8000L),
216
+ )
217
+ } else {
218
+ consecutiveNoMatch = 0
149
219
  notifyListeners("error", JSObject().apply {
150
220
  put("code", "recognition_error")
151
221
  put("message", errorMsg)
152
222
  put("recoverable", true)
153
223
  })
224
+ scheduleRestart(delayMs = 600)
154
225
  }
155
-
156
- scheduleRestart(delayMs = 600)
157
226
  }
158
227
 
159
228
  override fun onResults(results: Bundle?) {
160
229
  val matches = results?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION)
161
230
  val transcript = matches?.firstOrNull()?.trim() ?: ""
162
231
  if (transcript.isNotEmpty()) {
232
+ consecutiveNoMatch = 0
163
233
  handleTranscript(transcript, isFinal = true)
164
234
  }
165
235
  scheduleRestart()
@@ -189,6 +259,7 @@ class TalkModePlugin : Plugin() {
189
259
  systemTtsReady = status == TextToSpeech.SUCCESS
190
260
  if (systemTtsReady) {
191
261
  systemTts?.language = Locale.getDefault()
262
+ systemTts?.setAudioAttributes(voiceAudioAttributes())
192
263
  systemTts?.setOnUtteranceProgressListener(object : UtteranceProgressListener() {
193
264
  override fun onStart(id: String?) {}
194
265
 
@@ -270,14 +341,13 @@ class TalkModePlugin : Plugin() {
270
341
  enabled = true
271
342
  stopRequested = false
272
343
  listeningMode = true
344
+ configureVoiceAudioSession()
273
345
  setState("listening", "Listening")
274
346
 
275
347
  mainHandler.post {
276
348
  try {
277
349
  recognizer?.destroy()
278
- recognizer = SpeechRecognizer.createSpeechRecognizer(context).apply {
279
- setRecognitionListener(recognitionListener)
280
- }
350
+ recognizer = createRecognizer()
281
351
  startListeningInternal(markListening = true)
282
352
  startSilenceMonitor()
283
353
 
@@ -286,6 +356,13 @@ class TalkModePlugin : Plugin() {
286
356
  })
287
357
  } catch (e: Exception) {
288
358
  Log.e(TAG, "Failed to start", e)
359
+ // Recognizer creation failed AFTER the audio session was
360
+ // configured — release it so the earcon streams aren't left
361
+ // muted and the device isn't stuck in MODE_IN_COMMUNICATION.
362
+ enabled = false
363
+ listeningMode = false
364
+ releaseVoiceAudioSession()
365
+ setState("idle", "Off")
289
366
  call.resolve(JSObject().apply {
290
367
  put("started", false)
291
368
  put("error", e.message ?: "Failed to start")
@@ -307,6 +384,10 @@ class TalkModePlugin : Plugin() {
307
384
  lastTranscript = ""
308
385
  lastHeardAtMs = null
309
386
 
387
+ // Release any raw-PCM capture; `enabled` is already false so this won't
388
+ // re-arm SpeechRecognizer.
389
+ stopAudioFramesInternal()
390
+
310
391
  mainHandler.post {
311
392
  recognizer?.cancel()
312
393
  recognizer?.destroy()
@@ -314,6 +395,7 @@ class TalkModePlugin : Plugin() {
314
395
  }
315
396
 
316
397
  stopSpeakingInternal()
398
+ releaseVoiceAudioSession()
317
399
  setState("idle", "Off")
318
400
  call.resolve()
319
401
  }
@@ -364,16 +446,18 @@ class TalkModePlugin : Plugin() {
364
446
  }
365
447
 
366
448
  val useSystemTts = call.getBoolean("useSystemTts", false) ?: false
449
+ val useLocalInferenceTts = call.getBoolean("useLocalInferenceTts", false) ?: false
367
450
  val directive = call.getObject("directive")
368
451
 
369
452
  speakingJob = scope.launch {
370
- speakInternal(text, useSystemTts, directive, call)
453
+ speakInternal(text, useSystemTts, useLocalInferenceTts, directive, call)
371
454
  }
372
455
  }
373
456
 
374
457
  @PluginMethod
375
458
  fun stopSpeaking(call: PluginCall) {
376
459
  val interruptedAt = computeInterruptedAt()
460
+ lastInterruptedAtSeconds = interruptedAt
377
461
  stopSpeakingInternal()
378
462
  call.resolve(JSObject().apply {
379
463
  if (interruptedAt != null) {
@@ -408,6 +492,277 @@ class TalkModePlugin : Plugin() {
408
492
  call.resolve(buildPermissionResult())
409
493
  }
410
494
 
495
+ // ── Raw PCM frame capture (diarization / VAD / wake-word) ────────────
496
+
497
/**
 * Plugin entry point for raw PCM frame capture.
 *
 * Starts capture right away when the "microphone" permission alias is already
 * granted; otherwise requests it and continues in
 * `handleStartAudioFramesPermission`.
 */
@PluginMethod
fun startAudioFrames(call: PluginCall) {
    val micGranted = getPermissionState("microphone") == PermissionState.GRANTED
    if (micGranted) {
        startAudioFramesInternal(call)
    } else {
        requestPermissionForAlias("microphone", call, "handleStartAudioFramesPermission")
    }
}
505
+
506
/**
 * Permission callback for `startAudioFrames`: starts capture when the
 * microphone permission was granted, otherwise resolves the call with
 * `started = false` and an error message.
 */
@PermissionCallback
private fun handleStartAudioFramesPermission(call: PluginCall) {
    if (getPermissionState("microphone") != PermissionState.GRANTED) {
        val denied = JSObject()
        denied.put("started", false)
        denied.put("error", "Microphone permission denied")
        call.resolve(denied)
        return
    }
    startAudioFramesInternal(call)
}
517
+
518
/**
 * Start raw PCM capture and resolve [call] with the outcome.
 *
 * Call options: `sampleRate` (defaults to DEFAULT_FRAME_SAMPLE_RATE) and
 * `frameMs` (defaults to DEFAULT_FRAME_MS; a zero or negative value also falls
 * back to the default).
 *
 * Resolves with `started`, and on success `sampleRate`, `frameSamples` and
 * `suspendedStt`; on failure with `started = false` and `error`. If capture is
 * already running, the parameters of the running session are reported and
 * nothing is restarted.
 */
private fun startAudioFramesInternal(call: PluginCall) {
    if (audioFrameRunning.get()) {
        call.resolve(JSObject().apply {
            put("started", true)
            put("sampleRate", lastFrameSampleRate)
            put("frameSamples", lastFrameSamples)
            put("suspendedStt", sttSuspendedForFrames)
        })
        return
    }

    val requestedRate = call.getInt("sampleRate") ?: DEFAULT_FRAME_SAMPLE_RATE
    // A non-positive frame length would otherwise collapse to 1-sample frames,
    // i.e. one bridge event per audio sample. Treat it as "not specified".
    val frameMs = (call.getInt("frameMs") ?: DEFAULT_FRAME_MS)
        .takeIf { it > 0 } ?: DEFAULT_FRAME_MS

    // Single failure path: drop any AudioRecord we hold (no-op when none),
    // hand the mic back to STT if we suspended it, and report the error.
    fun fail(message: String) {
        releaseAudioRecord()
        if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
        call.resolve(JSObject().apply {
            put("started", false)
            put("error", message)
        })
    }

    // SpeechRecognizer (SODA) holds the mic; a parallel AudioRecord on the
    // same input fails on virtually every device. Suspend it for the
    // duration of capture and remember to resume on stop.
    // NOTE(review): suspendSpeechRecognizerForFrames tears the recognizer down
    // via mainHandler.post, so it may still be alive when openAudioRecord runs
    // below if this method is not on the main thread — confirm.
    if (isListening || listeningMode) {
        suspendSpeechRecognizerForFrames()
    }

    val record = try {
        openAudioRecord(requestedRate)
    } catch (e: Exception) {
        Log.e(TAG, "AudioRecord open failed", e)
        fail(e.message ?: "AudioRecord open failed")
        return
    }

    val actualRate = record.sampleRate
    // Long arithmetic: rate * frameMs can exceed Int range for large frameMs.
    val frameSamples = (actualRate.toLong() * frameMs / 1000L)
        .coerceIn(1L, Int.MAX_VALUE.toLong())
        .toInt()
    audioRecord = record
    lastFrameSampleRate = actualRate
    lastFrameSamples = frameSamples

    try {
        record.startRecording()
    } catch (e: Exception) {
        Log.e(TAG, "AudioRecord startRecording failed", e)
        fail(e.message ?: "AudioRecord start failed")
        return
    }

    if (record.recordingState != AudioRecord.RECORDSTATE_RECORDING) {
        Log.e(TAG, "AudioRecord did not enter RECORDING state")
        fail("AudioRecord did not start (mic likely held by SpeechRecognizer)")
        return
    }

    audioFrameRunning.set(true)
    launchFrameLoop(record, frameSamples)

    call.resolve(JSObject().apply {
        put("started", true)
        put("sampleRate", actualRate)
        put("frameSamples", frameSamples)
        put("suspendedStt", sttSuspendedForFrames)
    })
}
591
+
592
/** Plugin entry point: stop raw PCM frame capture (if any) and resolve the call. */
@PluginMethod
fun stopAudioFrames(call: PluginCall) {
    stopAudioFramesInternal().also { call.resolve() }
}
597
+
598
/** Plugin entry point: resolve with `capturing`, the current value of the frame-capture flag. */
@PluginMethod
fun isCapturingAudioFrames(call: PluginCall) {
    val status = JSObject()
    status.put("capturing", audioFrameRunning.get())
    call.resolve(status)
}
604
+
605
/**
 * Open a mono, 16-bit PCM [AudioRecord] at [sampleRate].
 *
 * Tries the VOICE_RECOGNITION source first and falls back to MIC. The buffer
 * is twice the platform minimum, and never smaller than 4 KiB.
 *
 * @throws IllegalStateException when the platform reports no valid minimum
 *   buffer for [sampleRate], or when no source yields an initialized record
 *   (the last failure is attached as the cause).
 */
private fun openAudioRecord(sampleRate: Int): AudioRecord {
    val channelConfig = AudioFormat.CHANNEL_IN_MONO
    val encoding = AudioFormat.ENCODING_PCM_16BIT

    val minBuffer = AudioRecord.getMinBufferSize(sampleRate, channelConfig, encoding)
    if (minBuffer <= 0) {
        throw IllegalStateException("AudioRecord min buffer invalid ($minBuffer) for ${sampleRate}Hz")
    }
    val bufferBytes = maxOf(minBuffer * 2, 4 * 1024)

    val candidateSources = intArrayOf(
        MediaRecorder.AudioSource.VOICE_RECOGNITION,
        MediaRecorder.AudioSource.MIC,
    )
    var lastError: Throwable? = null
    for (source in candidateSources) {
        try {
            @Suppress("MissingPermission")
            val candidate = AudioRecord(source, sampleRate, channelConfig, encoding, bufferBytes)
            if (candidate.state != AudioRecord.STATE_INITIALIZED) {
                // Constructed but unusable: free it and try the next source.
                candidate.release()
                lastError = IllegalStateException("AudioRecord uninitialized for source $source")
                continue
            }
            return candidate
        } catch (e: Exception) {
            lastError = e
        }
    }
    throw IllegalStateException(
        "AudioRecord could not initialize at ${sampleRate}Hz",
        lastError
    )
}
648
+
649
/**
 * Run the blocking capture loop for [record] on the IO dispatcher, emitting
 * one "audioFrame" event per successful read of up to [frameSamples] samples.
 *
 * Each event carries the frame as base64 little-endian 16-bit PCM (`pcm16`),
 * plus `sampleRate`, `channels`, `samples`, a normalized `rms`, an
 * elapsed-realtime `timestamp` and a running `frameIndex`.
 *
 * The loop ends when capture is stopped, the coroutine is cancelled, or
 * `AudioRecord.read` reports an error. If it ends on its own while capture is
 * still marked as running, an "error" event is emitted and the capture state
 * is torn down via `stopAudioFramesInternal`, so `isCapturingAudioFrames`
 * does not keep reporting a dead session and a later start can succeed.
 */
private fun launchFrameLoop(record: AudioRecord, frameSamples: Int) {
    audioFrameJob?.cancel()
    // IO dispatcher: a tight blocking read loop must not sit on the main thread.
    audioFrameJob = scope.launch(Dispatchers.IO) {
        val buffer = ShortArray(frameSamples)
        val bytes = ByteArray(frameSamples * 2)
        var frameIndex = 0L
        try {
            while (audioFrameRunning.get() && isActive) {
                val read = record.read(buffer, 0, frameSamples)
                if (read == 0) continue
                if (read < 0) {
                    // Negative = AudioRecord error code (record released or mic
                    // taken). Only report it when nobody asked us to stop.
                    if (audioFrameRunning.get() && audioRecord === record) {
                        Log.e(TAG, "AudioRecord.read failed with code $read")
                        notifyListeners("error", JSObject().apply {
                            put("message", "Audio frame capture stopped: AudioRecord.read returned $read")
                            put("fatal", false)
                        })
                    }
                    break
                }

                // Serialize samples as little-endian bytes while accumulating
                // the energy for the RMS level.
                var sumSquares = 0.0
                for (i in 0 until read) {
                    val s = buffer[i].toInt()
                    bytes[2 * i] = (s and 0xff).toByte()
                    bytes[2 * i + 1] = ((s shr 8) and 0xff).toByte()
                    sumSquares += s.toDouble() * s.toDouble()
                }
                val rms = Math.sqrt(sumSquares / read) / 32768.0
                val pcmBase64 = Base64.encodeToString(bytes, 0, read * 2, Base64.NO_WRAP)
                val idx = frameIndex
                frameIndex += 1
                val ts = SystemClock.elapsedRealtime()
                notifyListeners("audioFrame", JSObject().apply {
                    put("pcm16", pcmBase64)
                    put("sampleRate", record.sampleRate)
                    put("channels", 1)
                    put("samples", read)
                    put("rms", rms)
                    put("timestamp", ts)
                    put("frameIndex", idx)
                })
            }
        } catch (e: CancellationException) {
            // Never swallow coroutine cancellation.
            throw e
        } catch (e: Throwable) {
            Log.e(TAG, "Audio frame loop error", e)
            notifyListeners("error", JSObject().apply {
                put("message", "Audio frame capture stopped: ${e.message}")
                put("fatal", false)
            })
        } finally {
            // Still "running" with our record installed means the loop died on
            // its own rather than through stopAudioFramesInternal: clean up.
            if (audioRecord === record && audioFrameRunning.get()) {
                stopAudioFramesInternal()
            }
        }
    }
}
703
+
704
/**
 * Stop frame capture: clear the running flag, cancel the loop job, release
 * the AudioRecord and, if STT was suspended for capture, resume it.
 * Does nothing when capture was not running and no record is held.
 */
private fun stopAudioFramesInternal() {
    val wasRunning = audioFrameRunning.getAndSet(false)
    val nothingToRelease = audioRecord == null
    if (!wasRunning && nothingToRelease) return

    audioFrameJob?.cancel()
    audioFrameJob = null
    releaseAudioRecord()
    if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
}
715
+
716
/**
 * Detach the current AudioRecord (if any) from the plugin, then stop and
 * release it. Both steps are best-effort: failures are deliberately ignored.
 */
private fun releaseAudioRecord() {
    val held = audioRecord ?: return
    audioRecord = null
    runCatching {
        if (held.recordingState == AudioRecord.RECORDSTATE_RECORDING) held.stop()
    }
    runCatching { held.release() }
}
730
+
731
/**
 * Suspend SpeechRecognizer so AudioRecord can own the mic: mark STT as
 * suspended, clear the listening flags, cancel the restart and silence jobs,
 * and post the recognizer's cancel/destroy to the main handler.
 */
private fun suspendSpeechRecognizerForFrames() {
    sttSuspendedForFrames = true
    listeningMode = false
    isListening = false
    restartJob?.cancel()
    silenceJob?.cancel()
    mainHandler.post {
        // Best-effort teardown; the reference is cleared even if it throws.
        runCatching {
            recognizer?.cancel()
            recognizer?.destroy()
        }
        recognizer = null
    }
}
747
+
748
/**
 * Re-arm SpeechRecognizer after frame capture ends.
 *
 * Always clears the suspended flag. The recognizer is only rebuilt when a
 * session is enabled and no stop was requested; the rebuild itself is posted
 * to the main handler and is skipped if recognition is unavailable.
 */
private fun resumeSpeechRecognizerAfterFrames() {
    sttSuspendedForFrames = false
    val sessionActive = enabled && !stopRequested
    if (!sessionActive) return

    listeningMode = true
    mainHandler.post {
        try {
            if (SpeechRecognizer.isRecognitionAvailable(context)) {
                recognizer?.destroy()
                recognizer = createRecognizer()
                startListeningInternal(markListening = true)
                startSilenceMonitor()
            }
        } catch (e: Exception) {
            Log.e(TAG, "Failed to resume STT after frames", e)
        }
    }
}
765
+
411
766
  // ── Config ──────────────────────────────────────────────────────────
412
767
 
413
768
  private fun applyConfig(config: JSObject) {
@@ -462,6 +817,13 @@ class TalkModePlugin : Plugin() {
462
817
  putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
463
818
  putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
464
819
  putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)
820
+ // On-device recognizer (no network round-trip; works offline). The
821
+ // platform recognizer's open/close cadence during continuous use is
822
+ // intrinsic and not controllable via the silence-length extras (the
823
+ // on-device SODA engine ignores them); we silence the AUDIBLE part of
824
+ // that churn by muting the earcon streams for the session instead
825
+ // (see configureVoiceAudioSession).
826
+ putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true)
465
827
  sttLanguage?.let { putExtra(RecognizerIntent.EXTRA_LANGUAGE, it) }
466
828
  }
467
829
 
@@ -477,6 +839,28 @@ class TalkModePlugin : Plugin() {
477
839
  }
478
840
  }
479
841
 
842
/**
 * Build a SpeechRecognizer wired to `recognitionListener`.
 *
 * On API 31+ the on-device recognizer is used when the platform reports it
 * available; it is preferred because it plays no start/error earcons, unlike
 * the system recognizer service, whose beeps cannot be muted without controls
 * this plugin does not hold. Otherwise the system recognizer is created.
 */
private fun createRecognizer(): SpeechRecognizer {
    if (
        Build.VERSION.SDK_INT >= Build.VERSION_CODES.S &&
        SpeechRecognizer.isOnDeviceRecognitionAvailable(context)
    ) {
        return SpeechRecognizer.createOnDeviceSpeechRecognizer(context)
            .also { it.setRecognitionListener(recognitionListener) }
    }
    return SpeechRecognizer.createSpeechRecognizer(context)
        .also { it.setRecognitionListener(recognitionListener) }
}
863
+
480
864
  private fun scheduleRestart(delayMs: Long = 350) {
481
865
  if (stopRequested) return
482
866
  restartJob?.cancel()
@@ -515,13 +899,14 @@ class TalkModePlugin : Plugin() {
515
899
  val elapsed = SystemClock.elapsedRealtime() - lastHeard
516
900
  if (elapsed < silenceWindowMs) return
517
901
 
518
- // Finalize: emit a final transcript event
519
- notifyListeners("transcript", JSObject().apply {
520
- put("transcript", transcript)
521
- put("isFinal", true)
522
- })
902
+ // Finalize this turn (deduped against the recognizer's own onResults),
903
+ // then restart the recognizer so the next utterance is a CLEAN session —
904
+ // Android SpeechRecognizer accumulates within a session, so without the
905
+ // restart the next turn's partials would prepend the words we just sent.
523
906
  lastTranscript = ""
524
907
  lastHeardAtMs = null
908
+ emitFinalOnce(transcript)
909
+ scheduleRestart()
525
910
  }
526
911
 
527
912
  private fun handleTranscript(transcript: String, isFinal: Boolean) {
@@ -531,34 +916,71 @@ class TalkModePlugin : Plugin() {
531
916
  if (isSpeaking && interruptOnSpeech) {
532
917
  if (shouldInterrupt(transcript)) {
533
918
  val interruptedAt = computeInterruptedAt()
534
- stopSpeakingInternal()
535
919
  lastInterruptedAtSeconds = interruptedAt
920
+ stopSpeakingInternal()
536
921
  }
537
922
  return
538
923
  }
539
924
 
540
925
  if (!isListening) return
541
926
 
542
- if (transcript.isNotEmpty()) {
927
+ if (isFinal) {
928
+ // A real end-of-turn from the recognizer: emit once and clear the
929
+ // pending buffer so the silence monitor doesn't re-finalize the same
930
+ // words (the double-send bug).
931
+ lastTranscript = ""
932
+ lastHeardAtMs = null
933
+ emitFinalOnce(transcript)
934
+ } else {
543
935
  lastTranscript = transcript
544
936
  lastHeardAtMs = SystemClock.elapsedRealtime()
937
+ notifyListeners("transcript", JSObject().apply {
938
+ put("transcript", transcript)
939
+ put("isFinal", false)
940
+ })
545
941
  }
942
+ }
546
943
 
944
/**
 * Emit a final "transcript" event at most once per utterance.
 *
 * Both the recognizer's `onResults` and the silence monitor can finalize the
 * same words, so a final whose trimmed text equals the previous one and that
 * arrives less than 2 seconds after it is dropped. Blank text is never emitted.
 */
private fun emitFinalOnce(transcript: String) {
    val text = transcript.trim().ifEmpty { return }
    val now = SystemClock.elapsedRealtime()
    val isRepeat = text == lastEmittedFinal && now - lastEmittedFinalAtMs < 2000L
    if (isRepeat) return

    lastEmittedFinal = text
    lastEmittedFinalAtMs = now
    val event = JSObject()
    event.put("transcript", text)
    event.put("isFinal", true)
    notifyListeners("transcript", event)
}
552
961
 
553
962
  /**
554
- * Avoid false interrupts: don't interrupt if the heard text is just a
555
- * substring of what we're currently speaking (echo from speaker).
963
+ * Decide whether heard speech should barge in on the agent's TTS. Tuned to
964
+ * avoid FALSE interrupts (which cut the reply mid-sentence and read as
965
+ * "intermittent audio"): a one-word ASR blip, background noise, or the
966
+ * agent's own voice bleeding back into the mic must NOT interrupt — only a
967
+ * genuine couple-of-words utterance from the user does.
556
968
  */
557
969
  private fun shouldInterrupt(transcript: String): Boolean {
558
970
  val trimmed = transcript.trim()
559
- if (trimmed.length < 3) return false
560
- val spoken = lastSpokenText?.lowercase()
561
- if (spoken != null && spoken.contains(trimmed.lowercase())) return false
971
+ val lower = trimmed.lowercase()
972
+ val words = lower.split(Regex("\\s+")).filter { it.isNotBlank() }
973
+ // Need real intent: at least two words, or one long word ( 8 chars).
974
+ if (words.size < 2 && trimmed.length < 8) return false
975
+ val spoken = lastSpokenText?.lowercase() ?: return true
976
+ // Exact echo of what we're saying → speaker bleed, not the user.
977
+ if (spoken.contains(lower)) return false
978
+ // Fuzzy echo: if most of the heard words appear in the text we're
979
+ // currently speaking, treat it as echo (ASR mishears of our own audio).
980
+ val echoed = words.count { spoken.contains(it) }
981
+ if (words.isNotEmpty() && echoed.toDouble() / words.size >= 0.6) {
982
+ return false
983
+ }
562
984
  return true
563
985
  }
564
986
 
@@ -573,9 +995,7 @@ class TalkModePlugin : Plugin() {
573
995
  if (!SpeechRecognizer.isRecognitionAvailable(context)) return@post
574
996
  try {
575
997
  if (recognizer == null) {
576
- recognizer = SpeechRecognizer.createSpeechRecognizer(context).apply {
577
- setRecognitionListener(recognitionListener)
578
- }
998
+ recognizer = createRecognizer()
579
999
  }
580
1000
  recognizer?.cancel()
581
1001
  startListeningInternal(markListening = false)
@@ -588,6 +1008,7 @@ class TalkModePlugin : Plugin() {
588
1008
  private suspend fun speakInternal(
589
1009
  text: String,
590
1010
  forceSystemTts: Boolean,
1011
+ useLocalInferenceTts: Boolean,
591
1012
  directive: JSObject?,
592
1013
  call: PluginCall
593
1014
  ) {
@@ -596,6 +1017,7 @@ class TalkModePlugin : Plugin() {
596
1017
  lastSpokenText = text
597
1018
  speakStartTimeMs = SystemClock.elapsedRealtime()
598
1019
  pcmStopRequested.set(false)
1020
+ lastInterruptedAtSeconds = null
599
1021
  setState("speaking", "Speaking")
600
1022
 
601
1023
  val effectiveVoiceId = directive.stringOrNull("voiceId")?.let(::resolveVoiceAlias) ?: voiceId
@@ -603,27 +1025,74 @@ class TalkModePlugin : Plugin() {
603
1025
 
604
1026
  notifyListeners("speaking", JSObject().apply {
605
1027
  put("text", text)
606
- put("isSystemTts", forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
1028
+ put(
1029
+ "isSystemTts",
1030
+ !useLocalInferenceTts &&
1031
+ (forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
1032
+ )
607
1033
  })
608
1034
 
609
1035
  // Stop listening during speech (we keep recognizer for interrupt detection)
610
1036
  mainHandler.post { recognizer?.stopListening() }
611
1037
  ensureInterruptListener()
612
1038
 
613
- // Request audio focus
614
- requestAudioFocus()
1039
+ // Ensure the communication-mode session + audio focus are active even
1040
+ // for a standalone speak() that wasn't preceded by start().
1041
+ configureVoiceAudioSession()
1042
+ // Re-assert loudspeaker routing right before playback. configureVoice…
1043
+ // only routes on the FIRST activation; if the session was already up (the
1044
+ // STT path opened it) the speaker route may have drifted, leaving TTS on
1045
+ // the earpiece. Re-route here so replies are audible out the speaker.
1046
+ audioManager?.let { routeVoiceOutput(it) }
615
1047
 
616
1048
  try {
617
- val canUseElevenLabs = !forceSystemTts &&
1049
+ val canUseLocalInference = useLocalInferenceTts && !forceSystemTts
1050
+ val canUseElevenLabs = !canUseLocalInference &&
1051
+ !forceSystemTts &&
618
1052
  !effectiveApiKey.isNullOrEmpty() &&
619
1053
  !effectiveVoiceId.isNullOrEmpty()
620
1054
 
621
- if (canUseElevenLabs) {
1055
+ if (canUseLocalInference) {
1056
+ try {
1057
+ streamAndPlayLocalInferenceTts(text, directive)
1058
+
1059
+ if (!pcmStopRequested.get()) {
1060
+ call.resolve(JSObject().apply {
1061
+ put("completed", true)
1062
+ put("interrupted", false)
1063
+ put("usedSystemTts", false)
1064
+ })
1065
+ } else {
1066
+ call.resolve(JSObject().apply {
1067
+ put("completed", false)
1068
+ put("interrupted", true)
1069
+ put("usedSystemTts", false)
1070
+ lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
1071
+ })
1072
+ }
1073
+ } catch (e: Exception) {
1074
+ if (pcmStopRequested.get()) {
1075
+ call.resolve(JSObject().apply {
1076
+ put("completed", false)
1077
+ put("interrupted", true)
1078
+ put("usedSystemTts", false)
1079
+ })
1080
+ } else {
1081
+ // The on-device OmniVoice TTS assets aren't always staged
1082
+ // (it 502s "TEXT_TO_SPEECH not available"). Rather than go
1083
+ // silent — the JS browser-SpeechSynthesis fallback doesn't
1084
+ // exist in the Android WebView — fall back to the platform
1085
+ // TextToSpeech so replies are always spoken aloud.
1086
+ Log.w(TAG, "Local inference TTS failed, falling back to system TTS", e)
1087
+ speakWithSystemTts(text, call)
1088
+ }
1089
+ }
1090
+ } else if (canUseElevenLabs) {
622
1091
  try {
623
1092
  val request = buildElevenLabsRequest(text, directive)
624
1093
  streamAndPlayPcm(
625
- voiceId = effectiveVoiceId!!,
626
- apiKey = effectiveApiKey!!,
1094
+ voiceId = effectiveVoiceId,
1095
+ apiKey = effectiveApiKey,
627
1096
  request = request
628
1097
  )
629
1098
 
@@ -665,13 +1134,16 @@ class TalkModePlugin : Plugin() {
665
1134
  put("error", e.message ?: "Speak failed")
666
1135
  })
667
1136
  } finally {
1137
+ val wasInterrupted = pcmStopRequested.get()
1138
+ val interruptedAt = lastInterruptedAtSeconds
668
1139
  isSpeaking = false
669
1140
  pcmStopRequested.set(false)
670
- abandonAudioFocus()
671
1141
 
672
1142
  notifyListeners("speakComplete", JSObject().apply {
673
- put("completed", !pcmStopRequested.get())
674
- lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
1143
+ put("completed", !wasInterrupted)
1144
+ if (wasInterrupted) {
1145
+ interruptedAt?.let { put("interruptedAt", it) }
1146
+ }
675
1147
  })
676
1148
 
677
1149
  if (enabled) {
@@ -679,6 +1151,8 @@ class TalkModePlugin : Plugin() {
679
1151
  setState("listening", "Listening")
680
1152
  mainHandler.post { startListeningInternal(markListening = true) }
681
1153
  } else {
1154
+ // Standalone speak (no active conversation): release the session.
1155
+ releaseVoiceAudioSession()
682
1156
  setState("idle", "Off")
683
1157
  }
684
1158
  }
@@ -753,6 +1227,363 @@ class TalkModePlugin : Plugin() {
753
1227
  return if (value == null || value === JSONObject.NULL) null else value.toString()
754
1228
  }
755
1229
 
1230
/**
 * Format of a PCM stream that is about to be written to an AudioTrack.
 *
 * @property sampleRate sample rate in Hz.
 * @property channels number of interleaved channels.
 * @property bitsPerSample width of one sample in bits.
 * @property dataBytes number of payload bytes the writer should consume from the stream.
 */
private data class PcmStreamFormat(
    val sampleRate: Int,
    val channels: Int,
    val bitsPerSample: Int,
    val dataBytes: Int,
)
1236
+
1237
/**
 * Speaks [text] via local inference, playing the audio through a native AudioTrack.
 *
 * The bionic Kokoro host is tried first. Only when it reports that it did not handle the
 * request is the HTTP endpoint used: the JSON payload is POSTed, the WAV header is parsed,
 * and the PCM payload is written to the track as it is read from the response.
 *
 * @param text text to synthesize.
 * @param directive optional per-utterance settings forwarded to both backends.
 * @throws IllegalStateException if the HTTP endpoint answers with status 400 or above.
 */
private suspend fun streamAndPlayLocalInferenceTts(
    text: String,
    directive: JSObject?
) = withContext(Dispatchers.IO) {
    pcmStopRequested.set(false)

    // In-process Kokoro voice first; HTTP is only the fallback when that host did not handle it.
    val handledByBionicHost = streamAndPlayBionicKokoroTts(text, directive)
    if (handledByBionicHost) {
        return@withContext
    }

    val connection = openLocalInferenceTtsConnection()
    // Published so a stop request can disconnect the in-flight request.
    activePcmConnection = connection
    try {
        val requestBody = buildLocalInferenceTtsPayload(text, directive)
        connection.outputStream.use { out -> out.write(requestBody.toByteArray(Charsets.UTF_8)) }

        val status = connection.responseCode
        if (status >= 400) {
            val errorText = connection.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
            throw IllegalStateException("Local inference TTS error: $status $errorText")
        }

        BufferedInputStream(connection.inputStream).use { wav ->
            val format = readWavPcmFormat(wav)
            val track = createPcmAudioTrack(format)
            pcmTrack = track
            track.play()

            Log.d(
                TAG,
                "Local inference PCM play start sampleRate=${format.sampleRate} channels=${format.channels}"
            )
            val startEvent = JSObject()
            startEvent.put("provider", "local-inference")
            startEvent.put("sampleRate", format.sampleRate)
            startEvent.put("channels", format.channels)
            notifyListeners("playbackStart", startEvent)

            val framesWritten = writePcmStreamToTrack(wav, track, format)
            drainPcmTrack(track, framesWritten, format.sampleRate)
            // Skip the explicit stop when an interruption was requested.
            if (!pcmStopRequested.get()) {
                track.stop()
            }
            Log.d(TAG, "Local inference PCM play done frames=$framesWritten")
        }
    } finally {
        cleanupPcmTrack()
        // Only clear the shared reference if it still points at this request.
        if (activePcmConnection === connection) {
            activePcmConnection = null
        }
        connection.disconnect()
    }
}
1297
+
1298
/**
 * Synthesizes [text] with the bionic inference host's "tts" op and plays the result natively.
 *
 * Protocol, as implemented below: a 4-byte big-endian length prefix followed by a UTF-8 JSON
 * request is written to the abstract-namespace socket [BIONIC_INFER_SOCKET]. The reply uses the
 * same framing and carries `ok`, `sampleRate` (24000 when absent) and `pcmBase64`, which is
 * decoded as little-endian fp32 samples and converted to mono 16-bit PCM for the AudioTrack.
 *
 * @param text text to speak; surrounding whitespace is trimmed before sending.
 * @param directive optional settings; only `speed` is read here. Values that are not finite
 *   and strictly positive are replaced by 1.0, matching the HTTP payload builder's rule.
 * @return `true` once playback has run; `false` when [text] is blank or the socket connect
 *   fails, so the caller can fall through to another backend.
 * @throws IllegalStateException if the reply frame length is out of range, the host reports
 *   failure, or the reply contains no samples. I/O and JSON parsing errors also propagate.
 */
private suspend fun streamAndPlayBionicKokoroTts(
    text: String,
    directive: JSObject?
): Boolean = withContext(Dispatchers.IO) {
    val trimmed = text.trim()
    if (trimmed.isEmpty()) return@withContext false

    // A NaN/infinite speed would make JSONObject.put throw and abort this path, and a zero or
    // negative speed is meaningless to the synthesizer, so fall back to the neutral 1.0.
    val speed = (directive?.optDouble("speed", 1.0) ?: 1.0).toFloat()
        .takeIf { it.isFinite() && it > 0f } ?: 1.0f

    val sock = LocalSocket()
    try {
        sock.connect(
            LocalSocketAddress(BIONIC_INFER_SOCKET, LocalSocketAddress.Namespace.ABSTRACT)
        )
    } catch (e: Exception) {
        // Connect failure is the one case reported as "not handled" rather than as an error.
        Log.d(TAG, "bionic Kokoro TTS host unreachable: ${e.message}")
        try { sock.close() } catch (_: Exception) {}
        return@withContext false
    }
    try {
        val req = JSONObject().apply {
            put("op", "tts")
            put("text", trimmed)
            put("speed", speed.toDouble())
        }.toString().toByteArray(Charsets.UTF_8)
        DataOutputStream(sock.outputStream).apply {
            writeInt(req.size) // big-endian length prefix
            write(req)
            flush()
        }

        val din = DataInputStream(sock.inputStream)
        val len = din.readInt()
        // Reject empty or implausibly large frames before allocating the buffer.
        if (len <= 0 || len > 64 * 1024 * 1024) {
            throw IllegalStateException("bionic TTS bad frame length $len")
        }
        val respBytes = ByteArray(len)
        din.readFully(respBytes)
        val resp = JSONObject(String(respBytes, Charsets.UTF_8))
        if (!resp.optBoolean("ok", false)) {
            throw IllegalStateException("bionic TTS error: ${resp.optString("error")}")
        }

        val sampleRate = resp.optInt("sampleRate", 24000)
        val pcmF32 = Base64.decode(resp.getString("pcmBase64"), Base64.NO_WRAP)
        // fp32 LE → int16 PCM (the play path is ENCODING_PCM_16BIT).
        val fb = ByteBuffer.wrap(pcmF32).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer()
        val nSamples = fb.remaining()
        if (nSamples == 0) {
            throw IllegalStateException("bionic TTS returned 0 samples")
        }
        val pcm16 = ByteArray(nSamples * 2)
        val ob = ByteBuffer.wrap(pcm16).order(ByteOrder.LITTLE_ENDIAN)
        for (i in 0 until nSamples) {
            // Scale to the int16 range and clamp so out-of-range floats saturate instead of wrapping.
            val s = (fb.get(i) * 32767f).coerceIn(-32768f, 32767f).toInt().toShort()
            ob.putShort(s)
        }

        val format = PcmStreamFormat(sampleRate, 1, 16, pcm16.size)
        val track = createPcmAudioTrack(format)
        pcmTrack = track
        track.play()
        notifyListeners("playbackStart", JSObject().apply {
            put("provider", "local-inference")
            put("sampleRate", sampleRate)
            put("channels", 1)
        })
        val framesWritten = writePcmStreamToTrack(
            BufferedInputStream(ByteArrayInputStream(pcm16)), track, format
        )
        drainPcmTrack(track, framesWritten, sampleRate)
        if (!pcmStopRequested.get()) track.stop()
        Log.d(TAG, "bionic Kokoro TTS played $nSamples samples @ $sampleRate Hz")
        true
    } finally {
        cleanupPcmTrack()
        try { sock.close() } catch (_: Exception) {}
    }
}
1381
+
1382
/**
 * Creates the (not yet connected) POST request for the local-inference TTS endpoint.
 *
 * The bearer token is read from `auth/local-agent-token` under the app's files directory.
 *
 * @return a connection configured for a JSON request body and a WAV response.
 * @throws IllegalStateException if the token file is absent or blank.
 */
private fun openLocalInferenceTtsConnection(): HttpURLConnection {
    val tokenFile = File(context.filesDir, "auth/local-agent-token")
    val token = if (tokenFile.isFile) tokenFile.readText().trim() else ""
    check(token.isNotEmpty()) { "Local agent auth token is missing" }

    val connection = URL(LOCAL_INFERENCE_TTS_URL).openConnection() as HttpURLConnection
    return connection.apply {
        requestMethod = "POST"
        connectTimeout = 30_000
        readTimeout = 180_000
        setRequestProperty("Authorization", "Bearer $token")
        setRequestProperty("Content-Type", "application/json")
        setRequestProperty("Accept", "audio/wav")
        doOutput = true
    }
}
1399
+
1400
/**
 * Serializes the JSON request body for the local-inference TTS endpoint.
 *
 * `text` is always present. `voiceId`, `voice`, `modelId` and `model` are copied from
 * [directive] when they resolve to a string, and `speed` only when it is finite and positive.
 *
 * @param text text to synthesize.
 * @param directive optional per-utterance settings; may be null.
 * @return the JSON document as a string.
 */
private fun buildLocalInferenceTtsPayload(text: String, directive: JSObject?): String {
    val payload = JSONObject()
    payload.put("text", text)

    // Pass-through string fields, inserted in a fixed order.
    for (key in listOf("voiceId", "voice", "modelId", "model")) {
        directive.stringOrNull(key)?.let { value -> payload.put(key, value) }
    }

    // NaN is the "absent" marker, so a missing speed fails the same check as an invalid one.
    directive?.optDouble("speed", Double.NaN)
        ?.takeIf { it.isFinite() && it > 0.0 }
        ?.let { speed -> payload.put("speed", speed) }

    return payload.toString()
}
1413
+
1414
/**
 * Reads exactly [size] bytes from [input], looping over short reads.
 *
 * @return a new array of length [size].
 * @throws IllegalStateException if the stream ends before [size] bytes were read.
 */
private fun readExactly(input: BufferedInputStream, size: Int): ByteArray {
    val result = ByteArray(size)
    var filled = 0
    while (filled < size) {
        val count = input.read(result, filled, size - filled)
        check(count >= 0) { "Unexpected end of WAV stream" }
        filled += count
    }
    return result
}
1426
+
1427
/**
 * Discards exactly [count] bytes from [input].
 *
 * When `skip` makes no progress, a single byte is read instead so that end-of-stream can be
 * told apart from a skip that merely returned zero.
 *
 * @throws IllegalStateException if the stream ends before [count] bytes were discarded.
 */
private fun skipFully(input: BufferedInputStream, count: Int) {
    var pending = count
    while (pending > 0) {
        val advanced = input.skip(pending.toLong()).toInt()
        pending -= when {
            advanced > 0 -> advanced
            input.read() >= 0 -> 1
            else -> throw IllegalStateException("Unexpected end of WAV stream")
        }
    }
}
1441
+
1442
/**
 * Decodes the unsigned 16-bit little-endian value stored at [offset] in [bytes].
 *
 * @return the value widened to an Int in the range 0..65535.
 */
private fun littleEndianShort(bytes: ByteArray, offset: Int): Int {
    val low = bytes[offset].toInt() and 0xff
    val high = bytes[offset + 1].toInt() and 0xff
    return low or (high shl 8)
}
1446
+
1447
/**
 * Decodes the 32-bit little-endian value stored at [offset] in [bytes].
 *
 * @return the value as a signed Int; inputs with the top bit set come back negative.
 */
private fun littleEndianInt(bytes: ByteArray, offset: Int): Int =
    (0 until 4).fold(0) { acc, index ->
        // Byte `index` contributes bits 8*index .. 8*index+7.
        acc or ((bytes[offset + index].toInt() and 0xff) shl (8 * index))
    }
1453
+
1454
/** Returns the first four bytes of [bytes] decoded as US-ASCII. */
private fun chunkId(bytes: ByteArray): String =
    bytes.toString(Charsets.US_ASCII).substring(0, 4)
1457
+
1458
/**
 * Reads a RIFF/WAVE header from [input] and leaves the stream positioned at the first byte of
 * the `data` payload.
 *
 * Chunks other than `fmt ` and `data` are skipped, including their pad byte when the size is odd.
 * Only 16-bit integer PCM with one or two channels is accepted.
 *
 * A `data` chunk whose size field is 0xFFFFFFFF is treated as "length unknown" and reported with
 * `dataBytes = Int.MAX_VALUE`; writePcmStreamToTrack stops at end of stream, so the payload is
 * then played until the stream ends.
 *
 * @return the parsed format, with `dataBytes` set from the `data` chunk header.
 * @throws IllegalStateException if the stream is not RIFF/WAVE, a chunk size is invalid, the
 *   format is unsupported, `data` precedes `fmt `, or the stream ends early.
 */
private fun readWavPcmFormat(input: BufferedInputStream): PcmStreamFormat {
    val riff = readExactly(input, 12)
    if (
        String(riff, 0, 4, Charsets.US_ASCII) != "RIFF" ||
        String(riff, 8, 4, Charsets.US_ASCII) != "WAVE"
    ) {
        throw IllegalStateException("Local inference TTS returned non-WAV audio")
    }

    var format: PcmStreamFormat? = null
    while (true) {
        val header = readExactly(input, 8)
        val id = chunkId(header)
        val size = littleEndianInt(header, 4)

        // 0xFFFFFFFF reads back as -1. Encoders writing to a non-seekable output commonly use
        // it for `data` because the final length is not known when the header is emitted.
        val isUnboundedData = id == "data" && size == -1
        if (size < 0 && !isUnboundedData) {
            throw IllegalStateException("Invalid WAV chunk size for $id")
        }

        when (id) {
            "fmt " -> {
                // Validate before allocating: the fields read below need 16 bytes, and the size
                // comes straight from the stream, so cap it instead of trusting it.
                if (size !in 16..4096) {
                    throw IllegalStateException("Invalid WAV fmt chunk")
                }
                val fmt = readExactly(input, size)
                val audioFormat = littleEndianShort(fmt, 0)
                val channels = littleEndianShort(fmt, 2)
                val sampleRate = littleEndianInt(fmt, 4)
                val bitsPerSample = littleEndianShort(fmt, 14)
                if (audioFormat != 1) {
                    throw IllegalStateException("Only PCM WAV is supported, got format=$audioFormat")
                }
                if (bitsPerSample != 16) {
                    throw IllegalStateException("Only 16-bit PCM WAV is supported, got bits=$bitsPerSample")
                }
                if (channels !in 1..2 || sampleRate <= 0) {
                    throw IllegalStateException("Invalid WAV format sampleRate=$sampleRate channels=$channels")
                }
                format = PcmStreamFormat(sampleRate, channels, bitsPerSample, 0)
                if (size % 2 == 1) skipFully(input, 1)
            }

            "data" -> {
                val parsed = format ?: throw IllegalStateException("WAV data arrived before fmt chunk")
                return parsed.copy(dataBytes = if (isUnboundedData) Int.MAX_VALUE else size)
            }

            else -> {
                skipFully(input, size)
                if (size % 2 == 1) skipFully(input, 1)
            }
        }
    }
}
1508
+
1509
/**
 * Builds a streaming-mode, 16-bit PCM AudioTrack for [format] using the voice audio attributes.
 *
 * The buffer is twice the platform minimum, but never smaller than 8 KiB.
 *
 * @throws IllegalStateException if the channel count is not 1 or 2, the platform reports no
 *   valid minimum buffer size, or the track does not reach the initialized state.
 */
private fun createPcmAudioTrack(format: PcmStreamFormat): AudioTrack {
    val channelMask =
        if (format.channels == 1) {
            AudioFormat.CHANNEL_OUT_MONO
        } else if (format.channels == 2) {
            AudioFormat.CHANNEL_OUT_STEREO
        } else {
            throw IllegalStateException("Unsupported PCM channel count ${format.channels}")
        }

    val encoding = AudioFormat.ENCODING_PCM_16BIT
    val minBuffer = AudioTrack.getMinBufferSize(format.sampleRate, channelMask, encoding)
    if (minBuffer <= 0) {
        throw IllegalStateException("AudioTrack buffer size invalid: $minBuffer")
    }

    val audioFormat = AudioFormat.Builder()
        .setEncoding(encoding)
        .setSampleRate(format.sampleRate)
        .setChannelMask(channelMask)
        .build()

    val track = AudioTrack.Builder()
        .setAudioAttributes(voiceAudioAttributes())
        .setAudioFormat(audioFormat)
        .setBufferSizeInBytes(maxOf(minBuffer * 2, 8 * 1024))
        .setTransferMode(AudioTrack.MODE_STREAM)
        .build()

    if (track.state == AudioTrack.STATE_INITIALIZED) {
        return track
    }
    // Release the native resources before reporting the failure.
    track.release()
    throw IllegalStateException("AudioTrack init failed")
}
1543
+
1544
/**
 * Copies up to `format.dataBytes` bytes of PCM from [input] into [track].
 *
 * Copying ends early when the stream is exhausted or a stop has been requested.
 *
 * @return the number of whole frames written, or 0 when the frame size computes to zero.
 * @throws IllegalStateException if the track accepts no bytes or reports an error code.
 */
private fun writePcmStreamToTrack(
    input: BufferedInputStream,
    track: AudioTrack,
    format: PcmStreamFormat
): Long {
    val frameSize = format.channels * (format.bitsPerSample / 8)
    val chunk = ByteArray(8 * 1024)
    var pending = format.dataBytes
    var totalWritten = 0L

    while (pending > 0 && !pcmStopRequested.get()) {
        val got = input.read(chunk, 0, minOf(pending, chunk.size))
        if (got <= 0) break
        pending -= got

        // A single write may take fewer bytes than offered, so keep offering the remainder.
        var cursor = 0
        while (cursor < got && !pcmStopRequested.get()) {
            val accepted = track.write(chunk, cursor, got - cursor)
            if (accepted <= 0) {
                throw IllegalStateException("AudioTrack write failed: $accepted")
            }
            cursor += accepted
            totalWritten += accepted.toLong()
        }
    }

    return if (frameSize > 0) totalWritten / frameSize else 0L
}
1573
+
1574
/**
 * Blocks until [track] has played [framesWritten] frames, a stop is requested, or a deadline
 * passes. The deadline is the nominal playback duration, capped at 30 s, plus 1 s of slack.
 *
 * Returns immediately when [framesWritten] or [sampleRate] is not positive.
 */
private fun drainPcmTrack(track: AudioTrack, framesWritten: Long, sampleRate: Int) {
    if (framesWritten <= 0L || sampleRate <= 0) return

    val playbackMs = framesWritten * 1000L / sampleRate
    val deadline = SystemClock.elapsedRealtime() + (minOf(playbackMs, 30_000L) + 1_000L)

    while (true) {
        if (pcmStopRequested.get()) return
        if (track.playbackHeadPosition.toLong() >= framesWritten) return
        if (SystemClock.elapsedRealtime() >= deadline) return
        SystemClock.sleep(20)
    }
}
1586
+
756
1587
  /**
757
1588
  * Stream PCM audio from ElevenLabs and play via AudioTrack.
758
1589
  * Ported from classic TalkModeManager with proper offset-based writes.
@@ -776,12 +1607,7 @@ class TalkModePlugin : Plugin() {
776
1607
 
777
1608
  val bufferSize = max(minBuffer * 2, 8 * 1024)
778
1609
  val track = AudioTrack.Builder()
779
- .setAudioAttributes(
780
- AudioAttributes.Builder()
781
- .setUsage(AudioAttributes.USAGE_ASSISTANT)
782
- .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
783
- .build()
784
- )
1610
+ .setAudioAttributes(voiceAudioAttributes())
785
1611
  .setAudioFormat(
786
1612
  AudioFormat.Builder()
787
1613
  .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
@@ -802,6 +1628,7 @@ class TalkModePlugin : Plugin() {
802
1628
 
803
1629
  Log.d(TAG, "PCM play start sampleRate=$sampleRate bufferSize=$bufferSize")
804
1630
  val conn = openTtsConnection(voiceId, apiKey, request)
1631
+ activePcmConnection = conn
805
1632
  try {
806
1633
  val payload = buildRequestPayload(request)
807
1634
  conn.outputStream.use { it.write(payload.toByteArray()) }
@@ -845,6 +1672,9 @@ class TalkModePlugin : Plugin() {
845
1672
  Log.d(TAG, "PCM play done")
846
1673
  } finally {
847
1674
  cleanupPcmTrack()
1675
+ if (activePcmConnection === conn) {
1676
+ activePcmConnection = null
1677
+ }
848
1678
  conn.disconnect()
849
1679
  }
850
1680
  }
@@ -970,43 +1800,125 @@ class TalkModePlugin : Plugin() {
970
1800
  }
971
1801
  }
972
1802
 
973
- // ── Audio focus ─────────────────────────────────────────────────────
1803
+ // ── Voice audio session ─────────────────────────────────────────────
1804
+ //
1805
+ // The Android analog of the iOS `.playAndRecord` / `.voiceChat` /
1806
+ // `.defaultToSpeaker` session. Putting the device in MODE_IN_COMMUNICATION
1807
+ // for the whole conversation routes capture + playback through the
1808
+ // telephony path, which engages the platform hardware AEC so TTS coming out
1809
+ // the speaker is cancelled from the mic (the core fix for the mic+speaker
1810
+ // echo loop in hands-free mode). We also hold voice-communication audio
1811
+ // focus and route to the loudspeaker (unless a headset is connected) so
1812
+ // hands-free playback is audible.
1813
+
1814
/** Builds the speech / voice-communication attributes used for playback and the focus request. */
private fun voiceAudioAttributes(): AudioAttributes {
    val builder = AudioAttributes.Builder()
    builder.setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION)
    builder.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
    return builder.build()
}
974
1819
 
975
- private fun requestAudioFocus() {
1820
+ private fun configureVoiceAudioSession() {
1821
+ if (audioSessionActive) return
976
1822
  val am = audioManager ?: return
977
- val focusListener = AudioManager.OnAudioFocusChangeListener { focusChange ->
978
- when (focusChange) {
979
- AudioManager.AUDIOFOCUS_LOSS,
980
- AudioManager.AUDIOFOCUS_LOSS_TRANSIENT -> {
981
- // Another app took audio; stop speaking if we are
982
- if (isSpeaking) {
983
- stopSpeakingInternal()
984
- }
1823
+
1824
+ savedAudioMode = am.mode
1825
+ @Suppress("DEPRECATION")
1826
+ savedSpeakerphoneOn = am.isSpeakerphoneOn
1827
+
1828
+ val request = AudioFocusRequest.Builder(AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_EXCLUSIVE)
1829
+ .setAudioAttributes(voiceAudioAttributes())
1830
+ .setOnAudioFocusChangeListener { focusChange ->
1831
+ if (
1832
+ focusChange == AudioManager.AUDIOFOCUS_LOSS ||
1833
+ focusChange == AudioManager.AUDIOFOCUS_LOSS_TRANSIENT
1834
+ ) {
1835
+ // Another app took audio; stop speaking if we are.
1836
+ if (isSpeaking) stopSpeakingInternal()
985
1837
  }
986
1838
  }
1839
+ .build()
1840
+ audioFocusRequest = request
1841
+ am.requestAudioFocus(request)
1842
+
1843
+ am.mode = AudioManager.MODE_IN_COMMUNICATION
1844
+ routeVoiceOutput(am)
1845
+ muteEarconStreams(am)
1846
+ audioSessionActive = true
1847
+ Log.d(TAG, "Voice audio session active (communication mode)")
1848
+ }
1849
+
1850
/**
 * Mutes every stream in `earconStreams` for the duration of the session.
 *
 * Does nothing when the streams are already marked as muted. The flag is set even if individual
 * mute calls fail, because those failures are deliberately ignored.
 */
private fun muteEarconStreams(am: AudioManager) {
    if (earconStreamsMuted) return
    earconStreams.forEach { stream ->
        try {
            am.adjustStreamVolume(stream, AudioManager.ADJUST_MUTE, 0)
        } catch (_: Throwable) {
            // Some OEMs disallow muting certain streams without DND access.
        }
    }
    earconStreamsMuted = true
}
1862
+
1863
+ private fun unmuteEarconStreams(am: AudioManager) {
1864
+ if (!earconStreamsMuted) return
1865
+ for (stream in earconStreams) {
1866
+ try {
1867
+ am.adjustStreamVolume(stream, AudioManager.ADJUST_UNMUTE, 0)
1868
+ } catch (_: Throwable) {}
987
1869
  }
988
- audioFocusRequest = focusListener
1870
+ earconStreamsMuted = false
1871
+ }
989
1872
 
1873
+ /**
1874
+ * Default playback to the loudspeaker for hands-free use, but let a wired or
1875
+ * Bluetooth headset win — the iOS `.defaultToSpeaker` semantic.
1876
+ */
1877
+ private fun routeVoiceOutput(am: AudioManager) {
1878
+ val hasHeadset = am.getDevices(AudioManager.GET_DEVICES_OUTPUTS).any { device ->
1879
+ device.type == AudioDeviceInfo.TYPE_WIRED_HEADSET ||
1880
+ device.type == AudioDeviceInfo.TYPE_WIRED_HEADPHONES ||
1881
+ device.type == AudioDeviceInfo.TYPE_USB_HEADSET ||
1882
+ device.type == AudioDeviceInfo.TYPE_BLUETOOTH_SCO ||
1883
+ device.type == AudioDeviceInfo.TYPE_BLUETOOTH_A2DP
1884
+ }
1885
+ if (hasHeadset) {
1886
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) am.clearCommunicationDevice()
1887
+ @Suppress("DEPRECATION")
1888
+ am.isSpeakerphoneOn = false
1889
+ return
1890
+ }
1891
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
1892
+ val speaker = am.availableCommunicationDevices.firstOrNull {
1893
+ it.type == AudioDeviceInfo.TYPE_BUILTIN_SPEAKER
1894
+ }
1895
+ if (speaker != null && am.setCommunicationDevice(speaker)) return
1896
+ }
990
1897
  @Suppress("DEPRECATION")
991
- am.requestAudioFocus(
992
- focusListener,
993
- AudioManager.STREAM_MUSIC,
994
- AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_MAY_DUCK
995
- )
1898
+ am.isSpeakerphoneOn = true
996
1899
  }
997
1900
 
998
- private fun abandonAudioFocus() {
1901
+ private fun releaseVoiceAudioSession() {
1902
+ if (!audioSessionActive) return
999
1903
  val am = audioManager ?: return
1000
- val listener = audioFocusRequest ?: return
1001
- @Suppress("DEPRECATION")
1002
- am.abandonAudioFocus(listener)
1904
+ unmuteEarconStreams(am)
1905
+ audioFocusRequest?.let { am.abandonAudioFocusRequest(it) }
1003
1906
  audioFocusRequest = null
1907
+ if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) am.clearCommunicationDevice()
1908
+ @Suppress("DEPRECATION")
1909
+ am.isSpeakerphoneOn = savedSpeakerphoneOn
1910
+ am.mode = savedAudioMode
1911
+ audioSessionActive = false
1912
+ Log.d(TAG, "Voice audio session released")
1004
1913
  }
1005
1914
 
1006
1915
  // ── Cleanup helpers ─────────────────────────────────────────────────
1007
1916
 
1008
1917
  private fun stopSpeakingInternal() {
1009
1918
  pcmStopRequested.set(true)
1919
+ val conn = activePcmConnection
1920
+ activePcmConnection = null
1921
+ conn?.disconnect()
1010
1922
  cleanupPcmTrack()
1011
1923
  systemTts?.stop()
1012
1924
  systemTtsPending?.cancel()
@@ -1162,6 +2074,9 @@ class TalkModePlugin : Plugin() {
1162
2074
  }
1163
2075
 
1164
2076
  private fun isPermissionGranted(permission: String): Boolean {
2077
+ if (permission == Manifest.permission.RECORD_AUDIO) {
2078
+ return context.checkSelfPermission(permission) == PackageManager.PERMISSION_GRANTED
2079
+ }
1165
2080
  return getPermissionState(permission) == com.getcapacitor.PermissionState.GRANTED
1166
2081
  }
1167
2082
 
@@ -1176,10 +2091,13 @@ class TalkModePlugin : Plugin() {
1176
2091
  systemTts?.shutdown()
1177
2092
  systemTts = null
1178
2093
  cleanupPcmTrack()
2094
+ audioFrameRunning.set(false)
2095
+ audioFrameJob?.cancel()
2096
+ releaseAudioRecord()
1179
2097
  silenceJob?.cancel()
1180
2098
  restartJob?.cancel()
1181
2099
  speakingJob?.cancel()
1182
- abandonAudioFocus()
2100
+ releaseVoiceAudioSession()
1183
2101
  scope.cancel()
1184
2102
  }
1185
2103