npm - @elizaos/capacitor-talkmode - Versions diffs - 2.0.0-beta.1 → 2.0.3-beta.3 - Mend

@elizaos/capacitor-talkmode 2.0.0-beta.1 → 2.0.3-beta.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/LICENSE +21 -0
package/README.md +137 -0
package/android/build.gradle +16 -2
package/android/src/main/java/ai/eliza/plugins/talkmode/TalkModePlugin.kt +983 -65
package/dist/esm/definitions.d.ts +149 -3
package/dist/esm/definitions.d.ts.map +1 -1
package/dist/esm/web.d.ts +6 -1
package/dist/esm/web.d.ts.map +1 -1
package/dist/esm/web.js +34 -5
package/dist/esm/web.test.d.ts +2 -0
package/dist/esm/web.test.d.ts.map +1 -0
package/dist/esm/web.test.js +137 -0
package/dist/plugin.cjs.js +34 -5
package/dist/plugin.cjs.js.map +1 -1
package/dist/plugin.js +34 -5
package/dist/plugin.js.map +1 -1
package/ios/Sources/TalkModePlugin/TalkModePlugin.swift +266 -16
package/package.json +14 -11

package/android/src/main/java/ai/eliza/plugins/talkmode/TalkModePlugin.kt CHANGED Viewed

@@ -2,10 +2,17 @@ package ai.eliza.plugins.talkmode
 import android.Manifest
 import android.content.Intent
+import android.content.pm.PackageManager
 import android.media.AudioAttributes
+import android.media.AudioDeviceInfo
+import android.media.AudioFocusRequest
 import android.media.AudioFormat
 import android.media.AudioManager
+import android.media.AudioRecord
 import android.media.AudioTrack
+import android.media.MediaRecorder
+import android.util.Base64
+import android.os.Build
 import android.os.Bundle
 import android.os.Handler
 import android.os.Looper
@@ -25,7 +32,15 @@ import com.getcapacitor.annotation.CapacitorPlugin
 import com.getcapacitor.annotation.Permission
 import com.getcapacitor.annotation.PermissionCallback
 import kotlinx.coroutines.*
+import android.net.LocalSocket
+import android.net.LocalSocketAddress
 import java.io.BufferedInputStream
+import java.io.ByteArrayInputStream
+import java.io.DataInputStream
+import java.io.DataOutputStream
+import java.nio.ByteBuffer
+import java.nio.ByteOrder
+import java.io.File
 import java.net.HttpURLConnection
 import java.net.URL
 import java.util.Locale
@@ -45,6 +60,15 @@ class TalkModePlugin : Plugin() {
         private const val TAG = "TalkMode"
         private const val DEFAULT_MODEL_ID = "eleven_flash_v2_5"
         private const val DEFAULT_OUTPUT_FORMAT = "pcm_24000"
+        private const val LOCAL_INFERENCE_TTS_URL = "http://127.0.0.1:31337/api/tts/local-inference"
+        // Abstract-namespace UDS of ElizaBionicInferenceServer (the bionic app
+        // process that has libelizainference loaded). Kept in sync with
+        // BIONIC_INFERENCE_SOCKET_NAME in ElizaAgentService.
+        private const val BIONIC_INFER_SOCKET = "eliza_bionic_infer_v1"
+        // 16 kHz mono is the rate VAD / diarizer / wake-word models expect; 20 ms
+        // (320 samples) is the standard VAD frame size.
+        private const val DEFAULT_FRAME_SAMPLE_RATE = 16000
+        private const val DEFAULT_FRAME_MS = 20
     }
     private val mainHandler = Handler(Looper.getMainLooper())
@@ -60,11 +84,19 @@ class TalkModePlugin : Plugin() {
     private var isListening = false
     private var listeningMode = false
     private var stopRequested = false
+    // Consecutive ERROR_NO_MATCH/SPEECH_TIMEOUT count, for exponential restart
+    // backoff so an idle always-on session settles instead of re-arming (and,
+    // with the system recognizer, beeping) every ~600ms when nobody is talking.
+    private var consecutiveNoMatch = 0
     private var restartJob: Job? = null
     private var lastTranscript = ""
     private var lastHeardAtMs: Long? = null
     private var silenceJob: Job? = null
     private val silenceWindowMs = 700L
+    // The recognizer's own onResults AND our silence monitor can both finalize
+    // the same utterance; dedup so a turn is emitted (and sent) exactly once.
+    private var lastEmittedFinal = ""
+    private var lastEmittedFinalAtMs = 0L
     // TTS
     private var systemTts: TextToSpeech? = null
@@ -79,10 +111,37 @@ class TalkModePlugin : Plugin() {
     private var lastSpokenText: String? = null
     private var speakStartTimeMs: Long = 0
     private var lastInterruptedAtSeconds: Double? = null
+    @Volatile private var activePcmConnection: HttpURLConnection? = null
-    // Audio focus
+    // Voice audio session (communication-mode routing + focus, mirrors the iOS
+    // .playAndRecord/.voiceChat/.defaultToSpeaker session). Held for the whole
+    // conversation so the platform AEC has a stable speaker reference to cancel.
     private var audioManager: AudioManager? = null
-    private var audioFocusRequest: AudioManager.OnAudioFocusChangeListener? = null
+    private var audioFocusRequest: AudioFocusRequest? = null
+    private var audioSessionActive = false
+    private var savedAudioMode = AudioManager.MODE_NORMAL
+    private var savedSpeakerphoneOn = false
+    // Streams we mute for the session to suppress the platform recognizer's
+    // start/stop earcons (the "on/off" beeps heard as it re-arms continuously).
+    // TTS plays on STREAM_VOICE_CALL (USAGE_VOICE_COMMUNICATION) so it stays
+    // audible. Tracked so we only unmute streams we muted.
+    private val earconStreams = intArrayOf(
+        AudioManager.STREAM_MUSIC,
+        AudioManager.STREAM_SYSTEM,
+        AudioManager.STREAM_NOTIFICATION,
+    )
+    private var earconStreamsMuted = false
+    // Raw PCM frame capture (diarization / VAD / wake-word source). Opt-in and
+    // mutually exclusive with SpeechRecognizer on the mic: Android only lets one
+    // capture client own a given input source at a time, so starting frame
+    // capture SUSPENDS any active SpeechRecognizer and stopping it resumes STT.
+    private var audioRecord: AudioRecord? = null
+    private var audioFrameJob: Job? = null
+    private val audioFrameRunning = AtomicBoolean(false)
+    private var sttSuspendedForFrames = false
+    private var lastFrameSampleRate = DEFAULT_FRAME_SAMPLE_RATE
+    private var lastFrameSamples = 0
     // Config
     private var apiKey: String? = null
@@ -106,6 +165,7 @@ class TalkModePlugin : Plugin() {
         override fun onBeginningOfSpeech() {
             Log.d(TAG, "Beginning of speech")
+            consecutiveNoMatch = 0
         }
         override fun onRmsChanged(rmsdB: Float) {}
@@ -142,24 +202,34 @@ class TalkModePlugin : Plugin() {
                 return
             }
-            // Don't notify error for no-match / speech-timeout, just restart
-            if (error != SpeechRecognizer.ERROR_NO_MATCH &&
-                error != SpeechRecognizer.ERROR_SPEECH_TIMEOUT
+            // Don't notify error for no-match / speech-timeout, just restart.
+            // These fire continuously when the always-on session hears only
+            // silence, so back off exponentially (1.2s, 2.4s, 4.8s, then an 8s cap) instead of
+            // re-arming the recognizer every 600ms. onBeginningOfSpeech /
+            // onResults reset the counter the moment real speech arrives.
+            if (error == SpeechRecognizer.ERROR_NO_MATCH ||
+                error == SpeechRecognizer.ERROR_SPEECH_TIMEOUT
             ) {
+                consecutiveNoMatch++
+                scheduleRestart(
+                    delayMs = minOf(600L * (1L shl minOf(consecutiveNoMatch, 4)), 8000L),
+                )
+            } else {
+                consecutiveNoMatch = 0
                 notifyListeners("error", JSObject().apply {
                     put("code", "recognition_error")
                     put("message", errorMsg)
                     put("recoverable", true)
                 })
+                scheduleRestart(delayMs = 600)
             }
-            scheduleRestart(delayMs = 600)
         }
         override fun onResults(results: Bundle?) {
             val matches = results?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION)
             val transcript = matches?.firstOrNull()?.trim() ?: ""
             if (transcript.isNotEmpty()) {
+                consecutiveNoMatch = 0
                 handleTranscript(transcript, isFinal = true)
             }
             scheduleRestart()
@@ -189,6 +259,7 @@ class TalkModePlugin : Plugin() {
             systemTtsReady = status == TextToSpeech.SUCCESS
             if (systemTtsReady) {
                 systemTts?.language = Locale.getDefault()
+                systemTts?.setAudioAttributes(voiceAudioAttributes())
                 systemTts?.setOnUtteranceProgressListener(object : UtteranceProgressListener() {
                     override fun onStart(id: String?) {}
@@ -270,14 +341,13 @@ class TalkModePlugin : Plugin() {
         enabled = true
         stopRequested = false
         listeningMode = true
+        configureVoiceAudioSession()
         setState("listening", "Listening")
         mainHandler.post {
             try {
                 recognizer?.destroy()
-                recognizer = SpeechRecognizer.createSpeechRecognizer(context).apply {
-                    setRecognitionListener(recognitionListener)
-                }
+                recognizer = createRecognizer()
                 startListeningInternal(markListening = true)
                 startSilenceMonitor()
@@ -286,6 +356,13 @@ class TalkModePlugin : Plugin() {
                 })
             } catch (e: Exception) {
                 Log.e(TAG, "Failed to start", e)
+                // Recognizer creation failed AFTER the audio session was
+                // configured — release it so the earcon streams aren't left
+                // muted and the device isn't stuck in MODE_IN_COMMUNICATION.
+                enabled = false
+                listeningMode = false
+                releaseVoiceAudioSession()
+                setState("idle", "Off")
                 call.resolve(JSObject().apply {
                     put("started", false)
                     put("error", e.message ?: "Failed to start")
@@ -307,6 +384,10 @@ class TalkModePlugin : Plugin() {
         lastTranscript = ""
         lastHeardAtMs = null
+        // Release any raw-PCM capture; `enabled` is already false so this won't
+        // re-arm SpeechRecognizer.
+        stopAudioFramesInternal()
         mainHandler.post {
             recognizer?.cancel()
             recognizer?.destroy()
@@ -314,6 +395,7 @@ class TalkModePlugin : Plugin() {
         }
         stopSpeakingInternal()
+        releaseVoiceAudioSession()
         setState("idle", "Off")
         call.resolve()
     }
@@ -364,16 +446,18 @@ class TalkModePlugin : Plugin() {
         }
         val useSystemTts = call.getBoolean("useSystemTts", false) ?: false
+        val useLocalInferenceTts = call.getBoolean("useLocalInferenceTts", false) ?: false
         val directive = call.getObject("directive")
         speakingJob = scope.launch {
-            speakInternal(text, useSystemTts, directive, call)
+            speakInternal(text, useSystemTts, useLocalInferenceTts, directive, call)
         }
     }
     @PluginMethod
     fun stopSpeaking(call: PluginCall) {
         val interruptedAt = computeInterruptedAt()
+        lastInterruptedAtSeconds = interruptedAt
         stopSpeakingInternal()
         call.resolve(JSObject().apply {
             if (interruptedAt != null) {
@@ -408,6 +492,277 @@ class TalkModePlugin : Plugin() {
         call.resolve(buildPermissionResult())
     }
+    // ── Raw PCM frame capture (diarization / VAD / wake-word) ────────────
+    /**
+     * Plugin entry point: begin raw PCM frame capture. When the "microphone"
+     * permission alias is not yet granted it is requested first, with
+     * "handleStartAudioFramesPermission" named as the callback.
+     */
+    @PluginMethod
+    fun startAudioFrames(call: PluginCall) {
+        val micGranted = getPermissionState("microphone") == PermissionState.GRANTED
+        if (micGranted) {
+            startAudioFramesInternal(call)
+        } else {
+            requestPermissionForAlias("microphone", call, "handleStartAudioFramesPermission")
+        }
+    }
+    /**
+     * Permission callback for the microphone request: starts capture when the
+     * permission is now granted, otherwise resolves [call] with started=false
+     * and an error message.
+     */
+    @PermissionCallback
+    private fun handleStartAudioFramesPermission(call: PluginCall) {
+        if (getPermissionState("microphone") != PermissionState.GRANTED) {
+            val denied = JSObject()
+            denied.put("started", false)
+            denied.put("error", "Microphone permission denied")
+            call.resolve(denied)
+            return
+        }
+        startAudioFramesInternal(call)
+    }
+    /**
+     * Start raw PCM capture and resolve [call] with the outcome.
+     *
+     * If capture is already running, resolves immediately with the last known
+     * frame parameters. Otherwise reads `sampleRate` / `frameMs` from the call
+     * (falling back to DEFAULT_FRAME_SAMPLE_RATE / DEFAULT_FRAME_MS), suspends
+     * the SpeechRecognizer when it is listening, opens and starts an
+     * AudioRecord, and launches the frame loop. Every failure path resumes STT
+     * when it is flagged as suspended and resolves with started=false plus an
+     * error message.
+     */
+    private fun startAudioFramesInternal(call: PluginCall) {
+        if (audioFrameRunning.get()) {
+            val alreadyRunning = JSObject()
+            alreadyRunning.put("started", true)
+            alreadyRunning.put("sampleRate", lastFrameSampleRate)
+            alreadyRunning.put("frameSamples", lastFrameSamples)
+            alreadyRunning.put("suspendedStt", sttSuspendedForFrames)
+            call.resolve(alreadyRunning)
+            return
+        }
+
+        // Shared tail of every failure path: undo the STT suspension (if any)
+        // and report the failure to the caller.
+        fun failStart(message: String) {
+            if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
+            call.resolve(JSObject().apply {
+                put("started", false)
+                put("error", message)
+            })
+        }
+
+        val requestedRate = call.getInt("sampleRate") ?: DEFAULT_FRAME_SAMPLE_RATE
+        val frameMs = call.getInt("frameMs") ?: DEFAULT_FRAME_MS
+
+        // SpeechRecognizer (SODA) holds the mic; a parallel AudioRecord on the
+        // same input fails on virtually every device. Suspend it for the
+        // duration of capture; the failure/stop paths resume it.
+        if (isListening || listeningMode) {
+            suspendSpeechRecognizerForFrames()
+        }
+
+        val recorder = try {
+            openAudioRecord(requestedRate)
+        } catch (e: Exception) {
+            Log.e(TAG, "AudioRecord open failed", e)
+            failStart(e.message ?: "AudioRecord open failed")
+            return
+        }
+
+        // Use the rate the platform actually opened; at least one sample per frame.
+        val grantedRate = recorder.sampleRate
+        val samplesPerFrame = (grantedRate * frameMs / 1000).coerceAtLeast(1)
+        audioRecord = recorder
+        lastFrameSampleRate = grantedRate
+        lastFrameSamples = samplesPerFrame
+
+        try {
+            recorder.startRecording()
+        } catch (e: Exception) {
+            Log.e(TAG, "AudioRecord startRecording failed", e)
+            releaseAudioRecord()
+            failStart(e.message ?: "AudioRecord start failed")
+            return
+        }
+
+        // startRecording() can return without throwing yet leave the record idle.
+        val isRecording = recorder.recordingState == AudioRecord.RECORDSTATE_RECORDING
+        if (!isRecording) {
+            Log.e(TAG, "AudioRecord did not enter RECORDING state")
+            releaseAudioRecord()
+            failStart("AudioRecord did not start (mic likely held by SpeechRecognizer)")
+            return
+        }
+
+        audioFrameRunning.set(true)
+        launchFrameLoop(recorder, samplesPerFrame)
+        val started = JSObject()
+        started.put("started", true)
+        started.put("sampleRate", grantedRate)
+        started.put("frameSamples", samplesPerFrame)
+        started.put("suspendedStt", sttSuspendedForFrames)
+        call.resolve(started)
+    }
+    /** Plugin entry point: stop raw PCM frame capture, then resolve [call]. */
+    @PluginMethod
+    fun stopAudioFrames(call: PluginCall) {
+        stopAudioFramesInternal()
+        call.resolve()
+    }
+    /** Plugin entry point: resolve [call] with `capturing` set to the frame-loop running flag. */
+    @PluginMethod
+    fun isCapturingAudioFrames(call: PluginCall) {
+        val status = JSObject()
+        status.put("capturing", audioFrameRunning.get())
+        call.resolve(status)
+    }
+    /**
+     * Open a mono, 16-bit PCM AudioRecord at [sampleRate].
+     *
+     * Audio sources are tried in order: VOICE_RECOGNITION first (the
+     * pre-processing-light source diarization wants), then MIC. The first
+     * record reporting STATE_INITIALIZED is returned; a record that fails to
+     * initialize is released before the next source is tried.
+     *
+     * @throws IllegalStateException when no valid minimum buffer size exists
+     *   for [sampleRate], or when no source initializes (the last failure is
+     *   attached as the cause).
+     */
+    private fun openAudioRecord(sampleRate: Int): AudioRecord {
+        val channelConfig = AudioFormat.CHANNEL_IN_MONO
+        val encoding = AudioFormat.ENCODING_PCM_16BIT
+        val minBuffer = AudioRecord.getMinBufferSize(sampleRate, channelConfig, encoding)
+        check(minBuffer > 0) { "AudioRecord min buffer invalid ($minBuffer) for ${sampleRate}Hz" }
+        // Twice the platform minimum, with a 4 KiB floor.
+        val bufferBytes = (minBuffer * 2).coerceAtLeast(4 * 1024)
+
+        val candidateSources = intArrayOf(
+            MediaRecorder.AudioSource.VOICE_RECOGNITION,
+            MediaRecorder.AudioSource.MIC,
+        )
+        var lastFailure: Throwable? = null
+        for (source in candidateSources) {
+            try {
+                @Suppress("MissingPermission")
+                val candidate = AudioRecord(source, sampleRate, channelConfig, encoding, bufferBytes)
+                if (candidate.state == AudioRecord.STATE_INITIALIZED) {
+                    return candidate
+                }
+                // Constructed but unusable: free it and try the next source.
+                candidate.release()
+                lastFailure = IllegalStateException("AudioRecord uninitialized for source $source")
+            } catch (e: Exception) {
+                lastFailure = e
+            }
+        }
+        throw IllegalStateException("AudioRecord could not initialize at ${sampleRate}Hz", lastFailure)
+    }
+    /**
+     * Launch the background loop that reads PCM frames from [record] and emits
+     * one "audioFrame" event per successful read.
+     *
+     * Any previously launched frame job is cancelled first. Each event carries
+     * the samples as base64-encoded 16-bit little-endian PCM together with the
+     * sample rate, channel count, sample count, RMS (normalised by 32768), an
+     * elapsedRealtime timestamp and a frame index that restarts at 0 for every
+     * launch.
+     *
+     * A negative read result ends the loop. When capture is still flagged as
+     * running at that point, an "error" event is emitted so listeners learn
+     * that frames have stopped; the capture state itself is left for
+     * stopAudioFramesInternal to tear down.
+     *
+     * @param record AudioRecord to read from
+     * @param frameSamples number of samples requested per read
+     */
+    private fun launchFrameLoop(record: AudioRecord, frameSamples: Int) {
+        audioFrameJob?.cancel()
+        // IO dispatcher: a tight blocking read loop must not sit on the main
+        // thread. Frames are marshalled to JS via notifyListeners.
+        audioFrameJob = scope.launch(Dispatchers.IO) {
+            val buffer = ShortArray(frameSamples)
+            val bytes = ByteArray(frameSamples * 2)
+            var frameIndex = 0L
+            try {
+                while (audioFrameRunning.get() && isActive) {
+                    val read = record.read(buffer, 0, frameSamples)
+                    if (read < 0) {
+                        // Negative values are AudioRecord error codes (the
+                        // record was released or the mic was taken). A stop
+                        // clears the running flag before releasing, so a
+                        // still-set flag means capture died unexpectedly:
+                        // report it instead of going silent.
+                        if (audioFrameRunning.get() && isActive) {
+                            Log.e(TAG, "AudioRecord.read failed: $read")
+                            notifyListeners("error", JSObject().apply {
+                                put("message", "Audio frame capture stopped: AudioRecord.read returned $read")
+                                put("fatal", false)
+                            })
+                        }
+                        break
+                    }
+                    if (read == 0) continue
+                    var sumSquares = 0.0
+                    for (i in 0 until read) {
+                        val s = buffer[i].toInt()
+                        // Little-endian: low byte first.
+                        bytes[2 * i] = (s and 0xff).toByte()
+                        bytes[2 * i + 1] = ((s shr 8) and 0xff).toByte()
+                        sumSquares += s.toDouble() * s.toDouble()
+                    }
+                    // read > 0 is guaranteed here, so the division is safe.
+                    val rms = Math.sqrt(sumSquares / read) / 32768.0
+                    val pcmBase64 = Base64.encodeToString(
+                        bytes, 0, read * 2, Base64.NO_WRAP
+                    )
+                    val idx = frameIndex
+                    frameIndex += 1
+                    val ts = SystemClock.elapsedRealtime()
+                    notifyListeners("audioFrame", JSObject().apply {
+                        put("pcm16", pcmBase64)
+                        put("sampleRate", record.sampleRate)
+                        put("channels", 1)
+                        put("samples", read)
+                        put("rms", rms)
+                        put("timestamp", ts)
+                        put("frameIndex", idx)
+                    })
+                }
+            } catch (e: Throwable) {
+                // Never swallow coroutine cancellation in a catch-all.
+                if (e is CancellationException) throw e
+                Log.e(TAG, "Audio frame loop error", e)
+                notifyListeners("error", JSObject().apply {
+                    put("message", "Audio frame capture stopped: ${e.message}")
+                    put("fatal", false)
+                })
+            }
+        }
+    }
+    /**
+     * Stop frame capture: clear the running flag, cancel the frame job, release
+     * the AudioRecord, and resume the SpeechRecognizer when it is flagged as
+     * suspended for frames. Returns early when capture was not running and no
+     * AudioRecord is held.
+     */
+    private fun stopAudioFramesInternal() {
+        // The flag is always cleared, even when we bail out below.
+        val wasRunning = audioFrameRunning.getAndSet(false)
+        val nothingHeld = audioRecord == null
+        if (!wasRunning && nothingHeld) return
+
+        audioFrameJob?.cancel()
+        audioFrameJob = null
+        releaseAudioRecord()
+        if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
+    }
+    /**
+     * Detach and dispose of the current AudioRecord, if any. The field is
+     * cleared first; stop() (only while recording) and release() are each
+     * attempted best-effort, so a failure in one does not skip the other.
+     */
+    private fun releaseAudioRecord() {
+        val current = audioRecord ?: return
+        audioRecord = null
+        runCatching {
+            if (current.recordingState == AudioRecord.RECORDSTATE_RECORDING) current.stop()
+        }
+        runCatching { current.release() }
+    }
+    /**
+     * Suspend speech recognition so AudioRecord can own the mic: flag the
+     * suspension, clear both listening flags, cancel the restart and silence
+     * jobs, then cancel and destroy the recognizer on the main handler.
+     */
+    private fun suspendSpeechRecognizerForFrames() {
+        sttSuspendedForFrames = true
+        isListening = false
+        listeningMode = false
+        silenceJob?.cancel()
+        restartJob?.cancel()
+        mainHandler.post {
+            // Best-effort teardown; the reference is dropped regardless.
+            runCatching {
+                recognizer?.cancel()
+                recognizer?.destroy()
+            }
+            recognizer = null
+        }
+    }
+    /**
+     * Re-arm speech recognition after frame capture ends. The suspension flag
+     * is always cleared; recognition is only restarted when a session is
+     * enabled and no stop was requested. The recognizer is rebuilt on the main
+     * handler, and only if recognition is available.
+     */
+    private fun resumeSpeechRecognizerAfterFrames() {
+        sttSuspendedForFrames = false
+        val sessionActive = enabled && !stopRequested
+        if (!sessionActive) return
+
+        listeningMode = true
+        mainHandler.post {
+            try {
+                val available = SpeechRecognizer.isRecognitionAvailable(context)
+                if (!available) return@post
+                recognizer?.destroy()
+                recognizer = createRecognizer()
+                startListeningInternal(markListening = true)
+                startSilenceMonitor()
+            } catch (e: Exception) {
+                Log.e(TAG, "Failed to resume STT after frames", e)
+            }
+        }
+    }
     // ── Config ──────────────────────────────────────────────────────────
     private fun applyConfig(config: JSObject) {
@@ -462,6 +817,13 @@ class TalkModePlugin : Plugin() {
             putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
             putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
             putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)
+            // On-device recognizer (no network round-trip; works offline). The
+            // platform recognizer's open/close cadence during continuous use is
+            // intrinsic and not controllable via the silence-length extras (the
+            // on-device SODA engine ignores them); we silence the AUDIBLE part of
+            // that churn by muting the earcon streams for the session instead
+            // (see configureVoiceAudioSession).
+            putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true)
             sttLanguage?.let { putExtra(RecognizerIntent.EXTRA_LANGUAGE, it) }
         }
@@ -477,6 +839,28 @@ class TalkModePlugin : Plugin() {
         }
     }
+    /**
+     * Build the SpeechRecognizer for this session and attach
+     * recognitionListener to it.
+     *
+     * On API 31+ with on-device recognition available, the ON-DEVICE
+     * recognizer is preferred. Per the original author's notes this is the
+     * in-process SODA engine, which plays NO start/error earcons — avoiding the
+     * audible "open"/"failure" beeps of the system com.google.android.tts
+     * recognizer service (which also can't be muted without
+     * ACCESS_NOTIFICATION_POLICY / STREAM_SYSTEM_ENFORCED control the plugin
+     * doesn't hold). Otherwise the system recognizer is the fallback.
+     */
+    private fun createRecognizer(): SpeechRecognizer {
+        val created: SpeechRecognizer
+        if (
+            Build.VERSION.SDK_INT >= Build.VERSION_CODES.S &&
+            SpeechRecognizer.isOnDeviceRecognitionAvailable(context)
+        ) {
+            created = SpeechRecognizer.createOnDeviceSpeechRecognizer(context)
+        } else {
+            created = SpeechRecognizer.createSpeechRecognizer(context)
+        }
+        return created.apply { setRecognitionListener(recognitionListener) }
+    }
     private fun scheduleRestart(delayMs: Long = 350) {
         if (stopRequested) return
         restartJob?.cancel()
@@ -515,13 +899,14 @@ class TalkModePlugin : Plugin() {
         val elapsed = SystemClock.elapsedRealtime() - lastHeard
         if (elapsed < silenceWindowMs) return
-        // Finalize: emit a final transcript event
-        notifyListeners("transcript", JSObject().apply {
-            put("transcript", transcript)
-            put("isFinal", true)
-        })
+        // Finalize this turn (deduped against the recognizer's own onResults),
+        // then restart the recognizer so the next utterance is a CLEAN session —
+        // Android SpeechRecognizer accumulates within a session, so without the
+        // restart the next turn's partials would prepend the words we just sent.
         lastTranscript = ""
         lastHeardAtMs = null
+        emitFinalOnce(transcript)
+        scheduleRestart()
     }
     private fun handleTranscript(transcript: String, isFinal: Boolean) {
@@ -531,34 +916,71 @@ class TalkModePlugin : Plugin() {
         if (isSpeaking && interruptOnSpeech) {
             if (shouldInterrupt(transcript)) {
                 val interruptedAt = computeInterruptedAt()
-                stopSpeakingInternal()
                 lastInterruptedAtSeconds = interruptedAt
+                stopSpeakingInternal()
             }
             return
         }
         if (!isListening) return
-        if (transcript.isNotEmpty()) {
+        if (isFinal) {
+            // A real end-of-turn from the recognizer: emit once and clear the
+            // pending buffer so the silence monitor doesn't re-finalize the same
+            // words (the double-send bug).
+            lastTranscript = ""
+            lastHeardAtMs = null
+            emitFinalOnce(transcript)
+        } else {
             lastTranscript = transcript
             lastHeardAtMs = SystemClock.elapsedRealtime()
+            notifyListeners("transcript", JSObject().apply {
+                put("transcript", transcript)
+                put("isFinal", false)
+            })
         }
+    }
+    /**
+     * Emit a FINAL transcript event at most once per utterance. The
+     * recognizer's `onResults` and the silence monitor can both finalize the
+     * same words, so a final whose trimmed text equals the previously emitted
+     * one and arrives less than 2s after it is dropped. Blank input is ignored.
+     */
+    private fun emitFinalOnce(transcript: String) {
+        val text = transcript.trim().ifEmpty { return }
+        val nowMs = SystemClock.elapsedRealtime()
+        val isRepeat = text == lastEmittedFinal && nowMs - lastEmittedFinalAtMs < 2000L
+        if (isRepeat) return
+        lastEmittedFinalAtMs = nowMs
+        lastEmittedFinal = text
         notifyListeners("transcript", JSObject().apply {
-            put("transcript", transcript)
-            put("isFinal", isFinal)
+            put("transcript", text)
+            put("isFinal", true)
         })
     }
     /**
-     * Avoid false interrupts: don't interrupt if the heard text is just a
-     * substring of what we're currently speaking (echo from speaker).
+     * Decide whether heard speech should barge in on the agent's TTS. Tuned to
+     * avoid FALSE interrupts (which cut the reply mid-sentence and read as
+     * "intermittent audio"): a one-word ASR blip, background noise, or the
+     * agent's own voice bleeding back into the mic must NOT interrupt — only a
+     * genuine couple-of-words utterance from the user does.
      */
     private fun shouldInterrupt(transcript: String): Boolean {
         val trimmed = transcript.trim()
-        if (trimmed.length < 3) return false
-        val spoken = lastSpokenText?.lowercase()
-        if (spoken != null && spoken.contains(trimmed.lowercase())) return false
+        val lower = trimmed.lowercase()
+        val words = lower.split(Regex("\\s+")).filter { it.isNotBlank() }
+        // Need real intent: at least two words, or one long word (≥ 8 chars).
+        if (words.size < 2 && trimmed.length < 8) return false
+        // Nothing has been spoken yet, so there is nothing to echo.
+        val spoken = lastSpokenText?.lowercase() ?: return true
+        // Exact echo of what we're saying → speaker bleed, not the user.
+        if (spoken.contains(lower)) return false
+        // Fuzzy echo: if most of the heard WORDS appear in the text we're
+        // currently speaking, treat it as echo (ASR mishears of our own audio).
+        // Whole words are compared, not substrings: a substring test counts
+        // short words ("no", "is", "a") as echoed whenever those letters occur
+        // anywhere in the reply (e.g. "no" inside "know"), which suppresses
+        // genuine barge-ins. Punctuation is stripped on both sides so "world."
+        // in the reply still matches a heard "world".
+        val wordBreak = Regex("[^\\p{L}\\p{N}']+")
+        val spokenWords = spoken.split(wordBreak).filter { it.isNotEmpty() }.toSet()
+        val heardWords = lower.split(wordBreak).filter { it.isNotEmpty() }
+        val echoed = heardWords.count { it in spokenWords }
+        if (heardWords.isNotEmpty() && echoed.toDouble() / heardWords.size >= 0.6) {
+            return false
+        }
         return true
     }
@@ -573,9 +995,7 @@ class TalkModePlugin : Plugin() {
             if (!SpeechRecognizer.isRecognitionAvailable(context)) return@post
             try {
                 if (recognizer == null) {
-                    recognizer = SpeechRecognizer.createSpeechRecognizer(context).apply {
-                        setRecognitionListener(recognitionListener)
-                    }
+                    recognizer = createRecognizer()
                 }
                 recognizer?.cancel()
                 startListeningInternal(markListening = false)
@@ -588,6 +1008,7 @@ class TalkModePlugin : Plugin() {
     private suspend fun speakInternal(
         text: String,
         forceSystemTts: Boolean,
+        useLocalInferenceTts: Boolean,
         directive: JSObject?,
         call: PluginCall
     ) {
@@ -596,6 +1017,7 @@ class TalkModePlugin : Plugin() {
         lastSpokenText = text
         speakStartTimeMs = SystemClock.elapsedRealtime()
         pcmStopRequested.set(false)
+        lastInterruptedAtSeconds = null
         setState("speaking", "Speaking")
         val effectiveVoiceId = directive.stringOrNull("voiceId")?.let(::resolveVoiceAlias) ?: voiceId
@@ -603,27 +1025,74 @@ class TalkModePlugin : Plugin() {
         notifyListeners("speaking", JSObject().apply {
             put("text", text)
-            put("isSystemTts", forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
+            put(
+                "isSystemTts",
+                !useLocalInferenceTts &&
+                    (forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
+            )
         })
         // Stop listening during speech (we keep recognizer for interrupt detection)
         mainHandler.post { recognizer?.stopListening() }
         ensureInterruptListener()
-        // Request audio focus
-        requestAudioFocus()
+        // Ensure the communication-mode session + audio focus are active even
+        // for a standalone speak() that wasn't preceded by start().
+        configureVoiceAudioSession()
+        // Re-assert loudspeaker routing right before playback. configureVoice…
+        // only routes on the FIRST activation; if the session was already up (the
+        // STT path opened it) the speaker route may have drifted, leaving TTS on
+        // the earpiece. Re-route here so replies are audible out the speaker.
+        audioManager?.let { routeVoiceOutput(it) }
         try {
-            val canUseElevenLabs = !forceSystemTts &&
+            val canUseLocalInference = useLocalInferenceTts && !forceSystemTts
+            val canUseElevenLabs = !canUseLocalInference &&
+                !forceSystemTts &&
                 !effectiveApiKey.isNullOrEmpty() &&
                 !effectiveVoiceId.isNullOrEmpty()
-            if (canUseElevenLabs) {
+            if (canUseLocalInference) {
+                try {
+                    streamAndPlayLocalInferenceTts(text, directive)
+                    if (!pcmStopRequested.get()) {
+                        call.resolve(JSObject().apply {
+                            put("completed", true)
+                            put("interrupted", false)
+                            put("usedSystemTts", false)
+                        })
+                    } else {
+                        call.resolve(JSObject().apply {
+                            put("completed", false)
+                            put("interrupted", true)
+                            put("usedSystemTts", false)
+                            lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
+                        })
+                    }
+                } catch (e: Exception) {
+                    if (pcmStopRequested.get()) {
+                        call.resolve(JSObject().apply {
+                            put("completed", false)
+                            put("interrupted", true)
+                            put("usedSystemTts", false)
+                        })
+                    } else {
+                        // The on-device OmniVoice TTS assets aren't always staged
+                        // (it 502s "TEXT_TO_SPEECH not available"). Rather than go
+                        // silent — the JS browser-SpeechSynthesis fallback doesn't
+                        // exist in the Android WebView — fall back to the platform
+                        // TextToSpeech so replies are always spoken aloud.
+                        Log.w(TAG, "Local inference TTS failed, falling back to system TTS", e)
+                        speakWithSystemTts(text, call)
+                    }
+                }
+            } else if (canUseElevenLabs) {
                 try {
                     val request = buildElevenLabsRequest(text, directive)
                     streamAndPlayPcm(
-                        voiceId = effectiveVoiceId!!,
-                        apiKey = effectiveApiKey!!,
+                        voiceId = effectiveVoiceId,
+                        apiKey = effectiveApiKey,
                         request = request
                     )
@@ -665,13 +1134,16 @@ class TalkModePlugin : Plugin() {
                 put("error", e.message ?: "Speak failed")
             })
         } finally {
+            val wasInterrupted = pcmStopRequested.get()
+            val interruptedAt = lastInterruptedAtSeconds
             isSpeaking = false
             pcmStopRequested.set(false)
-            abandonAudioFocus()
             notifyListeners("speakComplete", JSObject().apply {
-                put("completed", !pcmStopRequested.get())
-                lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
+                put("completed", !wasInterrupted)
+                if (wasInterrupted) {
+                    interruptedAt?.let { put("interruptedAt", it) }
+                }
             })
             if (enabled) {
@@ -679,6 +1151,8 @@ class TalkModePlugin : Plugin() {
                 setState("listening", "Listening")
                 mainHandler.post { startListeningInternal(markListening = true) }
             } else {
+                // Standalone speak (no active conversation): release the session.
+                releaseVoiceAudioSession()
                 setState("idle", "Off")
             }
         }
@@ -753,6 +1227,363 @@ class TalkModePlugin : Plugin() {
         return if (value == null || value === JSONObject.NULL) null else value.toString()
     }
+    /**
+     * Describes a PCM stream handed to the native playback helpers.
+     *
+     * @property sampleRate sample rate in Hz
+     * @property channels number of audio channels
+     * @property bitsPerSample bits per sample
+     * @property dataBytes number of PCM payload bytes to play
+     */
+    private data class PcmStreamFormat(
+        val sampleRate: Int,
+        val channels: Int,
+        val bitsPerSample: Int,
+        val dataBytes: Int,
+    )
+    /**
+     * Speak [text] through the local-inference voice and play the audio natively.
+     *
+     * The in-process Kokoro host is tried first. When it reports that it did not
+     * take the request, the text is POSTed to the embedded agent's TTS endpoint
+     * and the WAV response is written into an AudioTrack as it is read, so
+     * playback never goes back through WebView decodeAudioData.
+     *
+     * @param text text to synthesize
+     * @param directive optional per-utterance overrides forwarded to the synthesizer
+     * @throws IllegalStateException when the agent answers with HTTP status 400 or above
+     */
+    private suspend fun streamAndPlayLocalInferenceTts(
+        text: String,
+        directive: JSObject?
+    ) = withContext(Dispatchers.IO) {
+        pcmStopRequested.set(false)
+        // Kokoro in the bionic inference host comes first; the HTTP agent endpoint
+        // is only used when that host did not take the request.
+        val handledInProcess = streamAndPlayBionicKokoroTts(text, directive)
+        if (handledInProcess) {
+            return@withContext
+        }
+        val connection = openLocalInferenceTtsConnection()
+        activePcmConnection = connection
+        try {
+            val body = buildLocalInferenceTtsPayload(text, directive).toByteArray(Charsets.UTF_8)
+            connection.outputStream.use { out -> out.write(body) }
+            val status = connection.responseCode
+            check(status < 400) {
+                val detail = connection.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
+                "Local inference TTS error: $status $detail"
+            }
+            BufferedInputStream(connection.inputStream).use { wav ->
+                val format = readWavPcmFormat(wav)
+                val track = createPcmAudioTrack(format)
+                pcmTrack = track
+                track.play()
+                Log.d(
+                    TAG,
+                    "Local inference PCM play start sampleRate=${format.sampleRate} channels=${format.channels}"
+                )
+                val startEvent = JSObject().apply {
+                    put("provider", "local-inference")
+                    put("sampleRate", format.sampleRate)
+                    put("channels", format.channels)
+                }
+                notifyListeners("playbackStart", startEvent)
+                val frames = writePcmStreamToTrack(wav, track, format)
+                drainPcmTrack(track, frames, format.sampleRate)
+                // A stop request tears the track down elsewhere; only stop it on a clean finish.
+                if (!pcmStopRequested.get()) {
+                    track.stop()
+                }
+                Log.d(TAG, "Local inference PCM play done frames=$frames")
+            }
+        } finally {
+            cleanupPcmTrack()
+            // Leave the field alone if another request has already replaced it.
+            if (activePcmConnection === connection) {
+                activePcmConnection = null
+            }
+            connection.disconnect()
+        }
+    }
+    /**
+     * Synthesize + play with the fused Kokoro-82M head in the bionic inference
+     * host (ElizaBionicInferenceServer, op "tts") over its abstract-namespace
+     * UDS. The host loads the same libelizainference that runs GPU text and
+     * synthesizes Kokoro PCM in-process — no musl agent, no HTTP, no 502 → no
+     * fallback to the platform TextToSpeech (the bug this fixes: the app was
+     * speaking with the Android system voice).
+     *
+     * Wire format, as used below: a 4-byte big-endian length followed by a UTF-8
+     * JSON document in each direction. The reply carries `ok`, an optional
+     * `sampleRate` (24000 when absent) and `pcmBase64`, which is decoded as
+     * little-endian fp32 mono samples and converted to 16-bit PCM for playback.
+     *
+     * A `speed` directive that is missing, non-finite or not positive is sent as
+     * 1.0, mirroring the finite/positive check applied to the HTTP payload.
+     *
+     * @param text text to speak; surrounding whitespace is trimmed
+     * @param directive optional per-utterance overrides; only `speed` is read here
+     * @return true once the reply has been handed to playback; false when [text]
+     *   is blank or the host socket cannot be connected, so the caller can fall through
+     * @throws IllegalStateException on a bad frame length, an `ok=false` reply or
+     *   an empty sample buffer
+     */
+    private suspend fun streamAndPlayBionicKokoroTts(
+        text: String,
+        directive: JSObject?
+    ): Boolean = withContext(Dispatchers.IO) {
+        val trimmed = text.trim()
+        if (trimmed.isEmpty()) return@withContext false
+        // Validate after narrowing to Float so a tiny positive double that rounds
+        // to 0f is rejected too; JSONObject.put also refuses non-finite doubles.
+        val requestedSpeed = (directive?.optDouble("speed", 1.0) ?: 1.0).toFloat()
+        val speed = if (requestedSpeed.isFinite() && requestedSpeed > 0f) requestedSpeed else 1f
+        val sock = LocalSocket()
+        try {
+            sock.connect(
+                LocalSocketAddress(BIONIC_INFER_SOCKET, LocalSocketAddress.Namespace.ABSTRACT)
+            )
+        } catch (e: Exception) {
+            Log.d(TAG, "bionic Kokoro TTS host unreachable: ${e.message}")
+            try { sock.close() } catch (_: Exception) {}
+            return@withContext false
+        }
+        try {
+            val req = JSONObject().apply {
+                put("op", "tts")
+                put("text", trimmed)
+                put("speed", speed.toDouble())
+            }.toString().toByteArray(Charsets.UTF_8)
+            DataOutputStream(sock.outputStream).apply {
+                writeInt(req.size) // big-endian length prefix
+                write(req)
+                flush()
+            }
+            val din = DataInputStream(sock.inputStream)
+            val len = din.readInt()
+            // Bound the reply so a corrupt prefix cannot force an oversized allocation.
+            if (len <= 0 || len > 64 * 1024 * 1024) {
+                throw IllegalStateException("bionic TTS bad frame length $len")
+            }
+            val respBytes = ByteArray(len)
+            din.readFully(respBytes)
+            val resp = JSONObject(String(respBytes, Charsets.UTF_8))
+            if (!resp.optBoolean("ok", false)) {
+                throw IllegalStateException("bionic TTS error: ${resp.optString("error")}")
+            }
+            val sampleRate = resp.optInt("sampleRate", 24000)
+            val pcmF32 = Base64.decode(resp.getString("pcmBase64"), Base64.NO_WRAP)
+            // fp32 LE → int16 PCM (the play path is ENCODING_PCM_16BIT).
+            val fb = ByteBuffer.wrap(pcmF32).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer()
+            val nSamples = fb.remaining()
+            if (nSamples == 0) {
+                throw IllegalStateException("bionic TTS returned 0 samples")
+            }
+            val pcm16 = ByteArray(nSamples * 2)
+            val ob = ByteBuffer.wrap(pcm16).order(ByteOrder.LITTLE_ENDIAN)
+            for (i in 0 until nSamples) {
+                // Clamp before narrowing so out-of-range samples saturate instead of wrapping.
+                val s = (fb.get(i) * 32767f).coerceIn(-32768f, 32767f).toInt().toShort()
+                ob.putShort(s)
+            }
+            val format = PcmStreamFormat(sampleRate, 1, 16, pcm16.size)
+            val track = createPcmAudioTrack(format)
+            pcmTrack = track
+            track.play()
+            notifyListeners("playbackStart", JSObject().apply {
+                put("provider", "local-inference")
+                put("sampleRate", sampleRate)
+                put("channels", 1)
+            })
+            val framesWritten = writePcmStreamToTrack(
+                BufferedInputStream(ByteArrayInputStream(pcm16)), track, format
+            )
+            drainPcmTrack(track, framesWritten, sampleRate)
+            if (!pcmStopRequested.get()) track.stop()
+            Log.d(TAG, "bionic Kokoro TTS played $nSamples samples @ $sampleRate Hz")
+            true
+        } finally {
+            cleanupPcmTrack()
+            try { sock.close() } catch (_: Exception) {}
+        }
+    }
+    /**
+     * Prepare the authenticated POST connection to the local agent TTS endpoint.
+     *
+     * The bearer token is read from `auth/local-agent-token` under the app's
+     * files directory. The connection is configured for a JSON request body and
+     * a WAV response; nothing is written to it here.
+     *
+     * @return the configured connection
+     * @throws IllegalStateException when the token file is missing or blank
+     */
+    private fun openLocalInferenceTtsConnection(): HttpURLConnection {
+        val token = File(context.filesDir, "auth/local-agent-token")
+            .takeIf { candidate -> candidate.isFile }
+            ?.readText()
+            ?.trim()
+            .orEmpty()
+        check(token.isNotEmpty()) { "Local agent auth token is missing" }
+        val connection = URL(LOCAL_INFERENCE_TTS_URL).openConnection() as HttpURLConnection
+        return connection.apply {
+            requestMethod = "POST"
+            connectTimeout = 30_000
+            readTimeout = 180_000
+            setRequestProperty("Authorization", "Bearer $token")
+            setRequestProperty("Content-Type", "application/json")
+            setRequestProperty("Accept", "audio/wav")
+            doOutput = true
+        }
+    }
+    /**
+     * Serialize the JSON request body for the local agent TTS endpoint.
+     *
+     * `text` is always present. `voiceId`, `voice`, `modelId` and `model` are
+     * copied from [directive] when it supplies them, and `speed` is included only
+     * when it is a finite, positive number.
+     *
+     * @param text text to synthesize
+     * @param directive optional per-utterance overrides
+     * @return the payload as a JSON string
+     */
+    private fun buildLocalInferenceTtsPayload(text: String, directive: JSObject?): String {
+        val body = JSONObject()
+        body.put("text", text)
+        for (key in listOf("voiceId", "voice", "modelId", "model")) {
+            val value = directive.stringOrNull(key) ?: continue
+            body.put(key, value)
+        }
+        // NaN stands in for "absent" so one finite/positive check covers every case.
+        val speed = directive?.optDouble("speed", Double.NaN) ?: Double.NaN
+        if (speed.isFinite() && speed > 0.0) {
+            body.put("speed", speed)
+        }
+        return body.toString()
+    }
+    /**
+     * Read exactly [size] bytes from [input], looping over short reads.
+     *
+     * @return a new array holding the bytes read
+     * @throws IllegalStateException when the stream ends before [size] bytes arrive
+     */
+    private fun readExactly(input: BufferedInputStream, size: Int): ByteArray {
+        val out = ByteArray(size)
+        var filled = 0
+        while (filled < size) {
+            val count = input.read(out, filled, size - filled)
+            check(count >= 0) { "Unexpected end of WAV stream" }
+            filled += count
+        }
+        return out
+    }
+    /**
+     * Discard exactly [count] bytes from [input].
+     *
+     * When `skip` makes no progress, a single byte is read instead, which also
+     * detects the end of the stream.
+     *
+     * @throws IllegalStateException when the stream ends before [count] bytes are consumed
+     */
+    private fun skipFully(input: BufferedInputStream, count: Int) {
+        var left = count
+        while (left > 0) {
+            val jumped = input.skip(left.toLong()).toInt()
+            left -= when {
+                jumped > 0 -> jumped
+                input.read() >= 0 -> 1
+                else -> throw IllegalStateException("Unexpected end of WAV stream")
+            }
+        }
+    }
+    /** Decode the unsigned 16-bit little-endian value stored at [offset] in [bytes]. */
+    private fun littleEndianShort(bytes: ByteArray, offset: Int): Int {
+        val low = bytes[offset].toInt() and 0xff
+        val high = bytes[offset + 1].toInt() and 0xff
+        return low or (high shl 8)
+    }
+    /** Decode the 32-bit little-endian value stored at [offset] in [bytes] as a signed Int. */
+    private fun littleEndianInt(bytes: ByteArray, offset: Int): Int =
+        (0 until 4).fold(0) { acc, index ->
+            acc or ((bytes[offset + index].toInt() and 0xff) shl (8 * index))
+        }
+    /** Return the first four bytes of [bytes] decoded as US-ASCII (a RIFF chunk identifier). */
+    private fun chunkId(bytes: ByteArray): String =
+        String(bytes, 0, 4, Charsets.US_ASCII)
+    /**
+     * Parse a RIFF/WAVE header from [input] and stop at the first byte of the
+     * `data` payload.
+     *
+     * Chunks other than `fmt ` and `data` are skipped, honouring the one-byte pad
+     * that follows an odd-sized chunk. Only the 16 mandatory bytes of the `fmt `
+     * chunk are buffered; any extension bytes are skipped, so a corrupt chunk
+     * size cannot force a large allocation.
+     *
+     * @param input stream positioned at the start of the RIFF header
+     * @return the stream format, with `dataBytes` set to the declared size of the `data` chunk
+     * @throws IllegalStateException when the stream is not RIFF/WAVE, ends early,
+     *   declares a negative chunk size, has a `fmt ` chunk shorter than 16 bytes,
+     *   is not 16-bit mono/stereo PCM, or reaches `data` before `fmt `
+     */
+    private fun readWavPcmFormat(input: BufferedInputStream): PcmStreamFormat {
+        val riff = readExactly(input, 12)
+        if (
+            String(riff, 0, 4, Charsets.US_ASCII) != "RIFF" ||
+            String(riff, 8, 4, Charsets.US_ASCII) != "WAVE"
+        ) {
+            throw IllegalStateException("Local inference TTS returned non-WAV audio")
+        }
+        var format: PcmStreamFormat? = null
+        while (true) {
+            val header = readExactly(input, 8)
+            val id = chunkId(header)
+            val size = littleEndianInt(header, 4)
+            // Sizes are unsigned on the wire; anything above Int.MAX_VALUE shows up negative.
+            if (size < 0) {
+                throw IllegalStateException("Invalid WAV chunk size for $id")
+            }
+            when (id) {
+                "fmt " -> {
+                    if (size < 16) {
+                        throw IllegalStateException("Invalid WAV fmt chunk")
+                    }
+                    // Buffer only the fixed part; the declared size comes from the stream.
+                    val fmt = readExactly(input, 16)
+                    skipFully(input, size - 16)
+                    val audioFormat = littleEndianShort(fmt, 0)
+                    val channels = littleEndianShort(fmt, 2)
+                    val sampleRate = littleEndianInt(fmt, 4)
+                    val bitsPerSample = littleEndianShort(fmt, 14)
+                    if (audioFormat != 1) {
+                        throw IllegalStateException("Only PCM WAV is supported, got format=$audioFormat")
+                    }
+                    if (bitsPerSample != 16) {
+                        throw IllegalStateException("Only 16-bit PCM WAV is supported, got bits=$bitsPerSample")
+                    }
+                    if (channels !in 1..2 || sampleRate <= 0) {
+                        throw IllegalStateException("Invalid WAV format sampleRate=$sampleRate channels=$channels")
+                    }
+                    format = PcmStreamFormat(sampleRate, channels, bitsPerSample, 0)
+                }
+                "data" -> {
+                    val parsed = format ?: throw IllegalStateException("WAV data arrived before fmt chunk")
+                    return parsed.copy(dataBytes = size)
+                }
+                else -> skipFully(input, size)
+            }
+            // RIFF chunks are word-aligned: an odd-sized chunk is followed by one pad byte.
+            if (size % 2 == 1) skipFully(input, 1)
+        }
+    }
+    /**
+     * Create a streaming 16-bit PCM AudioTrack for [format] using the voice
+     * audio attributes.
+     *
+     * The buffer is twice the platform minimum, but never below 8 KiB.
+     *
+     * @return an initialized track that has not been started
+     * @throws IllegalStateException for an unsupported channel count, an invalid
+     *   minimum buffer size, or a track that fails to initialize
+     */
+    private fun createPcmAudioTrack(format: PcmStreamFormat): AudioTrack {
+        val channelMask = when (format.channels) {
+            1 -> AudioFormat.CHANNEL_OUT_MONO
+            2 -> AudioFormat.CHANNEL_OUT_STEREO
+            else -> error("Unsupported PCM channel count ${format.channels}")
+        }
+        val minBytes = AudioTrack.getMinBufferSize(
+            format.sampleRate,
+            channelMask,
+            AudioFormat.ENCODING_PCM_16BIT
+        )
+        check(minBytes > 0) { "AudioTrack buffer size invalid: $minBytes" }
+        val pcmFormat = AudioFormat.Builder()
+            .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
+            .setSampleRate(format.sampleRate)
+            .setChannelMask(channelMask)
+            .build()
+        val track = AudioTrack.Builder()
+            .setAudioAttributes(voiceAudioAttributes())
+            .setAudioFormat(pcmFormat)
+            .setBufferSizeInBytes(max(minBytes * 2, 8 * 1024))
+            .setTransferMode(AudioTrack.MODE_STREAM)
+            .build()
+        if (track.state == AudioTrack.STATE_INITIALIZED) {
+            return track
+        }
+        // Free the native resources before reporting the failure.
+        track.release()
+        throw IllegalStateException("AudioTrack init failed")
+    }
+    /**
+     * Copy up to `format.dataBytes` bytes of PCM from [input] into [track].
+     *
+     * The loop ends when the declared payload is exhausted, the stream ends, or
+     * a stop is requested. A read or write failure that happens after a stop was
+     * requested is treated as that interruption rather than as an error.
+     * NOTE(review): this assumes the stop path tears down the track and the
+     * connection from another thread — confirm against `cleanupPcmTrack`.
+     *
+     * @param input stream positioned at the first PCM byte
+     * @param track track to write into
+     * @param format stream format; supplies the payload size and the frame size
+     * @return number of whole frames written to [track]
+     * @throws IllegalStateException when the track rejects a write and no stop was requested
+     */
+    private fun writePcmStreamToTrack(
+        input: BufferedInputStream,
+        track: AudioTrack,
+        format: PcmStreamFormat
+    ): Long {
+        val bytesPerFrame = format.channels * (format.bitsPerSample / 8)
+        var bytesWrittenTotal = 0L
+        var remainingBytes = format.dataBytes
+        val buffer = ByteArray(8 * 1024)
+        while (remainingBytes > 0) {
+            if (pcmStopRequested.get()) break
+            val requestBytes = if (remainingBytes < buffer.size) remainingBytes else buffer.size
+            val bytesRead = try {
+                input.read(buffer, 0, requestBytes)
+            } catch (e: java.io.IOException) {
+                // A read that fails once a stop was requested is the stop itself.
+                if (pcmStopRequested.get()) break
+                throw e
+            }
+            if (bytesRead <= 0) break
+            remainingBytes -= bytesRead
+            var offset = 0
+            while (offset < bytesRead) {
+                if (pcmStopRequested.get()) break
+                val wrote = track.write(buffer, offset, bytesRead - offset)
+                if (wrote <= 0) {
+                    // Same reasoning for a write rejected after a stop request;
+                    // the outer loop re-checks the flag and exits.
+                    if (pcmStopRequested.get()) break
+                    throw IllegalStateException("AudioTrack write failed: $wrote")
+                }
+                offset += wrote
+                bytesWrittenTotal += wrote.toLong()
+            }
+        }
+        return if (bytesPerFrame > 0) bytesWrittenTotal / bytesPerFrame else 0L
+    }
+    /**
+     * Block until [track] has played [framesWritten] frames, a stop is requested,
+     * or the drain deadline passes.
+     *
+     * The deadline is the audio duration capped at 30 s, plus 1 s of slack. The
+     * head position is polled every 20 ms and read as an unsigned 32-bit
+     * counter, as the platform documents it.
+     *
+     * @param track the playing track
+     * @param framesWritten total frames handed to the track
+     * @param sampleRate sample rate in Hz, used to derive the deadline
+     */
+    private fun drainPcmTrack(track: AudioTrack, framesWritten: Long, sampleRate: Int) {
+        if (framesWritten <= 0L || sampleRate <= 0) return
+        val maxDrainMs = (framesWritten * 1000L / sampleRate).coerceAtMost(30_000L) + 1_000L
+        val deadline = SystemClock.elapsedRealtime() + maxDrainMs
+        while (
+            !pcmStopRequested.get() &&
+            // Mask to unsigned: the signed Int turns negative past 0x7FFFFFFF frames.
+            (track.playbackHeadPosition.toLong() and 0xFFFF_FFFFL) < framesWritten &&
+            SystemClock.elapsedRealtime() < deadline
+        ) {
+            SystemClock.sleep(20)
+        }
+    }
     /**
      * Stream PCM audio from ElevenLabs and play via AudioTrack.
      * Ported from classic TalkModeManager with proper offset-based writes.
@@ -776,12 +1607,7 @@ class TalkModePlugin : Plugin() {
         val bufferSize = max(minBuffer * 2, 8 * 1024)
         val track = AudioTrack.Builder()
-            .setAudioAttributes(
-                AudioAttributes.Builder()
-                    .setUsage(AudioAttributes.USAGE_ASSISTANT)
-                    .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
-                    .build()
-            )
+            .setAudioAttributes(voiceAudioAttributes())
             .setAudioFormat(
                 AudioFormat.Builder()
                     .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
@@ -802,6 +1628,7 @@ class TalkModePlugin : Plugin() {
         Log.d(TAG, "PCM play start sampleRate=$sampleRate bufferSize=$bufferSize")
         val conn = openTtsConnection(voiceId, apiKey, request)
+        activePcmConnection = conn
         try {
             val payload = buildRequestPayload(request)
             conn.outputStream.use { it.write(payload.toByteArray()) }
@@ -845,6 +1672,9 @@ class TalkModePlugin : Plugin() {
             Log.d(TAG, "PCM play done")
         } finally {
             cleanupPcmTrack()
+            if (activePcmConnection === conn) {
+                activePcmConnection = null
+            }
             conn.disconnect()
         }
     }
@@ -970,43 +1800,125 @@ class TalkModePlugin : Plugin() {
         }
     }
-    // ── Audio focus ─────────────────────────────────────────────────────
+    // ── Voice audio session ─────────────────────────────────────────────
+    //
+    // The Android analog of the iOS `.playAndRecord` / `.voiceChat` /
+    // `.defaultToSpeaker` session. Putting the device in MODE_IN_COMMUNICATION
+    // for the whole conversation routes capture + playback through the
+    // telephony path, which engages the platform hardware AEC so TTS coming out
+    // the speaker is cancelled from the mic (the core fix for the mic+speaker
+    // echo loop in hands-free mode). We also hold voice-communication audio
+    // focus and route to the loudspeaker (unless a headset is connected) so
+    // hands-free playback is audible.
+    /** Build the audio attributes used by the voice session: voice-communication usage, speech content. */
+    private fun voiceAudioAttributes(): AudioAttributes {
+        val builder = AudioAttributes.Builder()
+        builder.setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION)
+        builder.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
+        return builder.build()
+    }
-    private fun requestAudioFocus() {
+    /**
+     * Enter the voice audio session; does nothing when the session is already
+     * active or no AudioManager is available.
+     *
+     * Saves the current audio mode and speakerphone state, requests transient
+     * exclusive audio focus with the voice attributes, switches the device to
+     * MODE_IN_COMMUNICATION, routes output, and mutes the earcon streams. The
+     * focus listener stops any speech in progress on AUDIOFOCUS_LOSS or
+     * AUDIOFOCUS_LOSS_TRANSIENT. The result of the focus request is not checked.
+     */
+    private fun configureVoiceAudioSession() {
+        if (audioSessionActive) return
         val am = audioManager ?: return
-        val focusListener = AudioManager.OnAudioFocusChangeListener { focusChange ->
-            when (focusChange) {
-                AudioManager.AUDIOFOCUS_LOSS,
-                AudioManager.AUDIOFOCUS_LOSS_TRANSIENT -> {
-                    // Another app took audio; stop speaking if we are
-                    if (isSpeaking) {
-                        stopSpeakingInternal()
-                    }
+        savedAudioMode = am.mode
+        @Suppress("DEPRECATION")
+        savedSpeakerphoneOn = am.isSpeakerphoneOn
+        val request = AudioFocusRequest.Builder(AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_EXCLUSIVE)
+            .setAudioAttributes(voiceAudioAttributes())
+            .setOnAudioFocusChangeListener { focusChange ->
+                if (
+                    focusChange == AudioManager.AUDIOFOCUS_LOSS ||
+                    focusChange == AudioManager.AUDIOFOCUS_LOSS_TRANSIENT
+                ) {
+                    // Another app took audio; stop speaking if we are.
+                    if (isSpeaking) stopSpeakingInternal()
                 }
             }
+            .build()
+        audioFocusRequest = request
+        am.requestAudioFocus(request)
+        am.mode = AudioManager.MODE_IN_COMMUNICATION
+        routeVoiceOutput(am)
+        muteEarconStreams(am)
+        audioSessionActive = true
+        Log.d(TAG, "Voice audio session active (communication mode)")
+    }
+    /** Silence every stream in `earconStreams` once per session; repeat calls are no-ops. */
+    private fun muteEarconStreams(am: AudioManager) {
+        if (!earconStreamsMuted) {
+            earconStreams.forEach { stream ->
+                // Best effort: some OEMs disallow muting certain streams without DND access.
+                runCatching { am.adjustStreamVolume(stream, AudioManager.ADJUST_MUTE, 0) }
+            }
+            earconStreamsMuted = true
+        }
+    }
+    /**
+     * Undo [muteEarconStreams]: unmute every stream in `earconStreams`, ignoring
+     * per-stream failures. Does nothing when the streams are not currently
+     * marked as muted.
+     */
+    private fun unmuteEarconStreams(am: AudioManager) {
+        if (!earconStreamsMuted) return
+        for (stream in earconStreams) {
+            try {
+                am.adjustStreamVolume(stream, AudioManager.ADJUST_UNMUTE, 0)
+            } catch (_: Throwable) {}
         }
-        audioFocusRequest = focusListener
+        earconStreamsMuted = false
+    }
+    /**
+     * Default playback to the loudspeaker for hands-free use, but let a wired or
+     * Bluetooth headset win — the iOS `.defaultToSpeaker` semantic.
+     *
+     * A wired, USB or Bluetooth (SCO/A2DP) output counts as a headset. On
+     * Android S and later the built-in speaker is selected through the
+     * communication-device API; the deprecated speakerphone flag is the fallback
+     * on older releases or when that selection fails.
+     */
+    private fun routeVoiceOutput(am: AudioManager) {
+        val hasHeadset = am.getDevices(AudioManager.GET_DEVICES_OUTPUTS).any { device ->
+            device.type == AudioDeviceInfo.TYPE_WIRED_HEADSET ||
+                device.type == AudioDeviceInfo.TYPE_WIRED_HEADPHONES ||
+                device.type == AudioDeviceInfo.TYPE_USB_HEADSET ||
+                device.type == AudioDeviceInfo.TYPE_BLUETOOTH_SCO ||
+                device.type == AudioDeviceInfo.TYPE_BLUETOOTH_A2DP
+        }
+        if (hasHeadset) {
+            // Headset present: drop any explicit communication device and turn the speakerphone off.
+            if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) am.clearCommunicationDevice()
+            @Suppress("DEPRECATION")
+            am.isSpeakerphoneOn = false
+            return
+        }
+        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
+            val speaker = am.availableCommunicationDevices.firstOrNull {
+                it.type == AudioDeviceInfo.TYPE_BUILTIN_SPEAKER
+            }
+            // Done when the speaker was accepted; otherwise fall through to the legacy flag.
+            if (speaker != null && am.setCommunicationDevice(speaker)) return
+        }
         @Suppress("DEPRECATION")
-        am.requestAudioFocus(
-            focusListener,
-            AudioManager.STREAM_MUSIC,
-            AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_MAY_DUCK
-        )
+        am.isSpeakerphoneOn = true
     }
-    private fun abandonAudioFocus() {
+    /**
+     * Leave the voice audio session; does nothing when no session is active or
+     * no AudioManager is available.
+     *
+     * Unmutes the earcon streams, abandons the held audio focus request, clears
+     * the communication device on Android S and later, and restores the
+     * speakerphone state and audio mode saved when the session was configured.
+     */
+    private fun releaseVoiceAudioSession() {
+        if (!audioSessionActive) return
         val am = audioManager ?: return
-        val listener = audioFocusRequest ?: return
-        @Suppress("DEPRECATION")
-        am.abandonAudioFocus(listener)
+        unmuteEarconStreams(am)
+        audioFocusRequest?.let { am.abandonAudioFocusRequest(it) }
         audioFocusRequest = null
+        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) am.clearCommunicationDevice()
+        @Suppress("DEPRECATION")
+        am.isSpeakerphoneOn = savedSpeakerphoneOn
+        am.mode = savedAudioMode
+        audioSessionActive = false
+        Log.d(TAG, "Voice audio session released")
     }
     // ── Cleanup helpers ─────────────────────────────────────────────────
     private fun stopSpeakingInternal() {
         pcmStopRequested.set(true)
+        val conn = activePcmConnection
+        activePcmConnection = null
+        conn?.disconnect()
         cleanupPcmTrack()
         systemTts?.stop()
         systemTtsPending?.cancel()
@@ -1162,6 +2074,9 @@ class TalkModePlugin : Plugin() {
     }
     private fun isPermissionGranted(permission: String): Boolean {
+        // RECORD_AUDIO is checked against the OS grant directly rather than through
+        // Capacitor's permission state.
+        // NOTE(review): presumably because the two can disagree — confirm the motivation.
+        if (permission == Manifest.permission.RECORD_AUDIO) {
+            return context.checkSelfPermission(permission) == PackageManager.PERMISSION_GRANTED
+        }
         return getPermissionState(permission) == com.getcapacitor.PermissionState.GRANTED
     }
@@ -1176,10 +2091,13 @@ class TalkModePlugin : Plugin() {
         systemTts?.shutdown()
         systemTts = null
         cleanupPcmTrack()
+        audioFrameRunning.set(false)
+        audioFrameJob?.cancel()
+        releaseAudioRecord()
         silenceJob?.cancel()
         restartJob?.cancel()
         speakingJob?.cancel()
-        abandonAudioFocus()
+        releaseVoiceAudioSession()
         scope.cancel()
     }