npm - @elizaos/capacitor-talkmode - Versions diffs - 1.0.0 → 2.0.11-beta.7 - Mend

@elizaos/capacitor-talkmode 1.0.0 → 2.0.11-beta.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

package/LICENSE +21 -0
package/README.md +137 -0
package/android/build.gradle +17 -3
package/android/src/main/java/ai/eliza/plugins/talkmode/TalkModePlugin.kt +840 -54
package/dist/esm/definitions.d.ts +146 -0
package/dist/esm/definitions.d.ts.map +1 -1
package/dist/esm/web.d.ts +6 -1
package/dist/esm/web.d.ts.map +1 -1
package/dist/esm/web.js +34 -5
package/dist/esm/web.test.d.ts +2 -0
package/dist/esm/web.test.d.ts.map +1 -0
package/dist/esm/web.test.js +137 -0
package/dist/plugin.cjs.js +34 -5
package/dist/plugin.cjs.js.map +1 -1
package/dist/plugin.js +34 -5
package/dist/plugin.js.map +1 -1
package/ios/Sources/TalkModePlugin/TalkModePlugin.swift +266 -16
package/package.json +18 -14

package/android/src/main/java/ai/eliza/plugins/talkmode/TalkModePlugin.kt CHANGED Viewed

@@ -2,10 +2,17 @@ package ai.eliza.plugins.talkmode
 import android.Manifest
 import android.content.Intent
+import android.content.pm.PackageManager
 import android.media.AudioAttributes
+import android.media.AudioDeviceInfo
+import android.media.AudioFocusRequest
 import android.media.AudioFormat
 import android.media.AudioManager
+import android.media.AudioRecord
 import android.media.AudioTrack
+import android.media.MediaRecorder
+import android.util.Base64
+import android.os.Build
 import android.os.Bundle
 import android.os.Handler
 import android.os.Looper
@@ -26,6 +33,7 @@ import com.getcapacitor.annotation.Permission
 import com.getcapacitor.annotation.PermissionCallback
 import kotlinx.coroutines.*
 import java.io.BufferedInputStream
+import java.io.File
 import java.net.HttpURLConnection
 import java.net.URL
 import java.util.Locale
@@ -45,6 +53,11 @@ class TalkModePlugin : Plugin() {
         private const val TAG = "TalkMode"
         private const val DEFAULT_MODEL_ID = "eleven_flash_v2_5"
         private const val DEFAULT_OUTPUT_FORMAT = "pcm_24000"
+        private const val LOCAL_INFERENCE_TTS_URL = "http://127.0.0.1:31337/api/tts/local-inference"
+        // 16 kHz mono is the rate VAD / diarizer / wake-word models expect; 20 ms
+        // (320 samples) is the standard VAD frame size.
+        private const val DEFAULT_FRAME_SAMPLE_RATE = 16000
+        private const val DEFAULT_FRAME_MS = 20
     }
     private val mainHandler = Handler(Looper.getMainLooper())
@@ -65,6 +78,10 @@ class TalkModePlugin : Plugin() {
     private var lastHeardAtMs: Long? = null
     private var silenceJob: Job? = null
     private val silenceWindowMs = 700L
+    // The recognizer's own onResults AND our silence monitor can both finalize
+    // the same utterance; dedup so a turn is emitted (and sent) exactly once.
+    private var lastEmittedFinal = ""
+    private var lastEmittedFinalAtMs = 0L
     // TTS
     private var systemTts: TextToSpeech? = null
@@ -79,10 +96,37 @@ class TalkModePlugin : Plugin() {
     private var lastSpokenText: String? = null
     private var speakStartTimeMs: Long = 0
     private var lastInterruptedAtSeconds: Double? = null
+    @Volatile private var activePcmConnection: HttpURLConnection? = null
-    // Audio focus
+    // Voice audio session (communication-mode routing + focus, mirrors the iOS
+    // .playAndRecord/.voiceChat/.defaultToSpeaker session). Held for the whole
+    // conversation so the platform AEC has a stable speaker reference to cancel.
     private var audioManager: AudioManager? = null
-    private var audioFocusRequest: AudioManager.OnAudioFocusChangeListener? = null
+    private var audioFocusRequest: AudioFocusRequest? = null
+    private var audioSessionActive = false
+    private var savedAudioMode = AudioManager.MODE_NORMAL
+    private var savedSpeakerphoneOn = false
+    // Streams we mute for the session to suppress the platform recognizer's
+    // start/stop earcons (the "on/off" beeps heard as it re-arms continuously).
+    // TTS plays on STREAM_VOICE_CALL (USAGE_VOICE_COMMUNICATION) so it stays
+    // audible. Tracked so we only unmute streams we muted.
+    private val earconStreams = intArrayOf(
+        AudioManager.STREAM_MUSIC,
+        AudioManager.STREAM_SYSTEM,
+        AudioManager.STREAM_NOTIFICATION,
+    )
+    private var earconStreamsMuted = false
+    // Raw PCM frame capture (diarization / VAD / wake-word source). Opt-in and
+    // mutually exclusive with SpeechRecognizer on the mic: Android only lets one
+    // capture client own a given input source at a time, so starting frame
+    // capture SUSPENDS any active SpeechRecognizer and stopping it resumes STT.
+    private var audioRecord: AudioRecord? = null
+    private var audioFrameJob: Job? = null
+    private val audioFrameRunning = AtomicBoolean(false)
+    private var sttSuspendedForFrames = false
+    private var lastFrameSampleRate = DEFAULT_FRAME_SAMPLE_RATE
+    private var lastFrameSamples = 0
     // Config
     private var apiKey: String? = null
@@ -189,6 +233,7 @@ class TalkModePlugin : Plugin() {
             systemTtsReady = status == TextToSpeech.SUCCESS
             if (systemTtsReady) {
                 systemTts?.language = Locale.getDefault()
+                systemTts?.setAudioAttributes(voiceAudioAttributes())
                 systemTts?.setOnUtteranceProgressListener(object : UtteranceProgressListener() {
                     override fun onStart(id: String?) {}
@@ -270,6 +315,7 @@ class TalkModePlugin : Plugin() {
         enabled = true
         stopRequested = false
         listeningMode = true
+        configureVoiceAudioSession()
         setState("listening", "Listening")
         mainHandler.post {
@@ -286,6 +332,13 @@ class TalkModePlugin : Plugin() {
                 })
             } catch (e: Exception) {
                 Log.e(TAG, "Failed to start", e)
+                // Recognizer creation failed AFTER the audio session was
+                // configured — release it so the earcon streams aren't left
+                // muted and the device isn't stuck in MODE_IN_COMMUNICATION.
+                enabled = false
+                listeningMode = false
+                releaseVoiceAudioSession()
+                setState("idle", "Off")
                 call.resolve(JSObject().apply {
                     put("started", false)
                     put("error", e.message ?: "Failed to start")
@@ -307,6 +360,10 @@ class TalkModePlugin : Plugin() {
         lastTranscript = ""
         lastHeardAtMs = null
+        // Release any raw-PCM capture; `enabled` is already false so this won't
+        // re-arm SpeechRecognizer.
+        stopAudioFramesInternal()
         mainHandler.post {
             recognizer?.cancel()
             recognizer?.destroy()
@@ -314,6 +371,7 @@ class TalkModePlugin : Plugin() {
         }
         stopSpeakingInternal()
+        releaseVoiceAudioSession()
         setState("idle", "Off")
         call.resolve()
     }
@@ -364,16 +422,18 @@ class TalkModePlugin : Plugin() {
         }
         val useSystemTts = call.getBoolean("useSystemTts", false) ?: false
+        val useLocalInferenceTts = call.getBoolean("useLocalInferenceTts", false) ?: false
         val directive = call.getObject("directive")
         speakingJob = scope.launch {
-            speakInternal(text, useSystemTts, directive, call)
+            speakInternal(text, useSystemTts, useLocalInferenceTts, directive, call)
         }
     }
     @PluginMethod
     fun stopSpeaking(call: PluginCall) {
         val interruptedAt = computeInterruptedAt()
+        lastInterruptedAtSeconds = interruptedAt
         stopSpeakingInternal()
         call.resolve(JSObject().apply {
             if (interruptedAt != null) {
@@ -408,6 +468,279 @@ class TalkModePlugin : Plugin() {
         call.resolve(buildPermissionResult())
     }
+    // ── Raw PCM frame capture (diarization / VAD / wake-word) ────────────
+    /**
+     * Plugin entry point that begins raw PCM frame capture.
+     *
+     * When the "microphone" permission alias is not granted yet, the permission
+     * request is issued and the call continues in
+     * `handleStartAudioFramesPermission`; otherwise capture starts right away.
+     */
+    @PluginMethod
+    fun startAudioFrames(call: PluginCall) {
+        val micGranted = getPermissionState("microphone") == PermissionState.GRANTED
+        if (micGranted) {
+            startAudioFramesInternal(call)
+        } else {
+            requestPermissionForAlias("microphone", call, "handleStartAudioFramesPermission")
+        }
+    }
+    /**
+     * Permission callback for `startAudioFrames`: starts capture when the
+     * "microphone" alias was granted, otherwise resolves the call with
+     * `started = false` and a denial message.
+     */
+    @PermissionCallback
+    private fun handleStartAudioFramesPermission(call: PluginCall) {
+        if (getPermissionState("microphone") != PermissionState.GRANTED) {
+            val denied = JSObject()
+            denied.put("started", false)
+            denied.put("error", "Microphone permission denied")
+            call.resolve(denied)
+            return
+        }
+        startAudioFramesInternal(call)
+    }
+    /**
+     * Starts raw PCM frame capture and resolves [call] with the outcome.
+     *
+     * Reads the optional `sampleRate` (Hz) and `frameMs` call options. Missing or
+     * non-positive values fall back to the defaults, and `frameMs` is capped at
+     * 10 s so a bad option cannot produce one-sample frames or an oversized
+     * buffer. If capture is already running the call resolves immediately with
+     * the parameters of the running capture.
+     *
+     * On success the call resolves with `started = true`, the actual
+     * `sampleRate`, `frameSamples`, and `suspendedStt`. On any failure it
+     * resolves with `started = false` and an `error` message, after resuming a
+     * SpeechRecognizer that was suspended for this attempt.
+     */
+    private fun startAudioFramesInternal(call: PluginCall) {
+        if (audioFrameRunning.get()) {
+            // Already capturing: report the parameters of the running capture.
+            call.resolve(JSObject().apply {
+                put("started", true)
+                put("sampleRate", lastFrameSampleRate)
+                put("frameSamples", lastFrameSamples)
+                put("suspendedStt", sttSuspendedForFrames)
+            })
+            return
+        }
+        // Ignore non-positive options and cap the frame length: a zero/negative
+        // frameMs would otherwise collapse to one-sample frames (one JS event
+        // per sample), and a huge one would overflow the sample-count math.
+        val maxFrameMs = 10_000
+        val requestedRate = call.getInt("sampleRate")?.takeIf { it > 0 } ?: DEFAULT_FRAME_SAMPLE_RATE
+        val frameMs = (call.getInt("frameMs")?.takeIf { it > 0 } ?: DEFAULT_FRAME_MS)
+            .coerceAtMost(maxFrameMs)
+        // Shared failure path: hand the mic back to STT if we took it, then
+        // report the failure to the caller.
+        fun failStart(message: String) {
+            if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
+            call.resolve(JSObject().apply {
+                put("started", false)
+                put("error", message)
+            })
+        }
+        // SpeechRecognizer (SODA) holds the mic; a parallel AudioRecord on the
+        // same input fails on virtually every device. Suspend it for the
+        // duration of capture and remember to resume on stop.
+        if (isListening || listeningMode) {
+            suspendSpeechRecognizerForFrames()
+        }
+        val record = try {
+            openAudioRecord(requestedRate)
+        } catch (e: Exception) {
+            Log.e(TAG, "AudioRecord open failed", e)
+            failStart(e.message ?: "AudioRecord open failed")
+            return
+        }
+        val actualRate = record.sampleRate
+        // Long arithmetic so rate × duration cannot overflow Int.
+        val frameSamples = (actualRate.toLong() * frameMs / 1000L).toInt().coerceAtLeast(1)
+        audioRecord = record
+        lastFrameSampleRate = actualRate
+        lastFrameSamples = frameSamples
+        try {
+            record.startRecording()
+        } catch (e: Exception) {
+            Log.e(TAG, "AudioRecord startRecording failed", e)
+            releaseAudioRecord()
+            failStart(e.message ?: "AudioRecord start failed")
+            return
+        }
+        // startRecording() can return without throwing yet leave the recorder
+        // stopped; verify the state before launching the read loop.
+        if (record.recordingState != AudioRecord.RECORDSTATE_RECORDING) {
+            Log.e(TAG, "AudioRecord did not enter RECORDING state")
+            releaseAudioRecord()
+            failStart("AudioRecord did not start (mic likely held by SpeechRecognizer)")
+            return
+        }
+        audioFrameRunning.set(true)
+        launchFrameLoop(record, frameSamples)
+        call.resolve(JSObject().apply {
+            put("started", true)
+            put("sampleRate", actualRate)
+            put("frameSamples", frameSamples)
+            put("suspendedStt", sttSuspendedForFrames)
+        })
+    }
+    /**
+     * Plugin entry point that stops raw PCM frame capture through
+     * `stopAudioFramesInternal` and then resolves the call with no payload.
+     */
+    @PluginMethod
+    fun stopAudioFrames(call: PluginCall) {
+        stopAudioFramesInternal()
+        call.resolve()
+    }
+    /** Resolves the call with `capturing` set to the current frame-capture flag. */
+    @PluginMethod
+    fun isCapturingAudioFrames(call: PluginCall) {
+        val status = JSObject()
+        status.put("capturing", audioFrameRunning.get())
+        call.resolve(status)
+    }
+    /**
+     * Opens a mono, 16-bit PCM [AudioRecord] at [sampleRate].
+     *
+     * The VOICE_RECOGNITION source is tried first and MIC second; the first
+     * recorder that reports STATE_INITIALIZED is returned. The recorder buffer
+     * is twice the platform minimum and never smaller than 4 KiB.
+     *
+     * @throws IllegalStateException if the platform reports a non-positive
+     *   minimum buffer size for [sampleRate], or if no source produced an
+     *   initialized recorder (the last failure is attached as the cause).
+     */
+    private fun openAudioRecord(sampleRate: Int): AudioRecord {
+        val channelMask = AudioFormat.CHANNEL_IN_MONO
+        val encoding = AudioFormat.ENCODING_PCM_16BIT
+        val minBuffer = AudioRecord.getMinBufferSize(sampleRate, channelMask, encoding)
+        check(minBuffer > 0) { "AudioRecord min buffer invalid ($minBuffer) for ${sampleRate}Hz" }
+        val bufferBytes = max(minBuffer * 2, 4 * 1024)
+        val candidateSources = intArrayOf(
+            MediaRecorder.AudioSource.VOICE_RECOGNITION,
+            MediaRecorder.AudioSource.MIC,
+        )
+        // Remember the most recent failure so the final exception carries a cause.
+        var lastError: Throwable? = null
+        for (source in candidateSources) {
+            try {
+                @Suppress("MissingPermission")
+                val candidate = AudioRecord(source, sampleRate, channelMask, encoding, bufferBytes)
+                if (candidate.state == AudioRecord.STATE_INITIALIZED) {
+                    return candidate
+                }
+                // Constructed but unusable: free it before trying the next source.
+                candidate.release()
+                lastError = IllegalStateException("AudioRecord uninitialized for source $source")
+            } catch (e: Exception) {
+                lastError = e
+            }
+        }
+        throw IllegalStateException("AudioRecord could not initialize at ${sampleRate}Hz", lastError)
+    }
+    /**
+     * Launches the background read loop that turns [record] into "audioFrame"
+     * events of up to [frameSamples] samples each.
+     *
+     * Each event carries the little-endian PCM16 payload as Base64 (`pcm16`),
+     * `sampleRate`, `channels` (always 1), `samples`, a normalized `rms`, an
+     * elapsed-realtime `timestamp`, and a monotonically increasing `frameIndex`.
+     *
+     * If the loop dies on its own (negative read code or an exception) while it
+     * still owns the live capture, it emits an "error" event and tears the
+     * capture down through `stopAudioFramesInternal`, so the capturing flag, the
+     * recorder, and any suspended STT do not stay stuck in the "capturing" state.
+     */
+    private fun launchFrameLoop(record: AudioRecord, frameSamples: Int) {
+        audioFrameJob?.cancel()
+        // IO dispatcher: a tight blocking read loop must not sit on the main
+        // thread. Frames are marshalled to JS via notifyListeners.
+        audioFrameJob = scope.launch(Dispatchers.IO) {
+            val buffer = ShortArray(frameSamples)
+            val bytes = ByteArray(frameSamples * 2)
+            var frameIndex = 0L
+            var readError: Int? = null
+            var threw = false
+            try {
+                while (audioFrameRunning.get() && isActive) {
+                    val read = record.read(buffer, 0, frameSamples)
+                    if (read < 0) {
+                        // ERROR_INVALID_OPERATION (-3) / ERROR_BAD_VALUE (-2):
+                        // the record was released or the mic was taken; stop.
+                        readError = read
+                        break
+                    }
+                    if (read == 0) continue
+                    var sumSquares = 0.0
+                    var b = 0
+                    for (i in 0 until read) {
+                        val s = buffer[i].toInt()
+                        // Little-endian PCM16: low byte first.
+                        bytes[b] = (s and 0xff).toByte()
+                        bytes[b + 1] = ((s shr 8) and 0xff).toByte()
+                        b += 2
+                        sumSquares += (s.toDouble() * s.toDouble())
+                    }
+                    // read > 0 is guaranteed here, so the division is safe.
+                    val rms = Math.sqrt(sumSquares / read) / 32768.0
+                    val pcmBase64 = Base64.encodeToString(
+                        bytes, 0, read * 2, Base64.NO_WRAP
+                    )
+                    val idx = frameIndex
+                    frameIndex += 1
+                    val ts = SystemClock.elapsedRealtime()
+                    notifyListeners("audioFrame", JSObject().apply {
+                        put("pcm16", pcmBase64)
+                        put("sampleRate", record.sampleRate)
+                        put("channels", 1)
+                        put("samples", read)
+                        put("rms", rms)
+                        put("timestamp", ts)
+                        put("frameIndex", idx)
+                    })
+                }
+            } catch (e: Throwable) {
+                Log.e(TAG, "Audio frame loop error", e)
+                notifyListeners("error", JSObject().apply {
+                    put("message", "Audio frame capture stopped: ${e.message}")
+                    put("fatal", false)
+                })
+                threw = true
+            }
+            // Clean up only when this loop still owns the live capture. After a
+            // normal stop the flag is already false, and after a stop+restart
+            // `audioRecord` points at the new recorder, so neither case is
+            // touched here.
+            val ownsLiveCapture = audioFrameRunning.get() && audioRecord === record
+            val failedCode = readError
+            if (ownsLiveCapture && (threw || failedCode != null)) {
+                if (failedCode != null) {
+                    Log.e(TAG, "AudioRecord read failed ($failedCode)")
+                    notifyListeners("error", JSObject().apply {
+                        put("message", "Audio frame capture stopped: AudioRecord read failed ($failedCode)")
+                        put("fatal", false)
+                    })
+                }
+                stopAudioFramesInternal()
+            }
+        }
+    }
+    /**
+     * Stops frame capture: clears the running flag, cancels the read loop,
+     * releases the recorder, and resumes SpeechRecognizer if it had been
+     * suspended for capture. Does nothing when capture was not running and no
+     * recorder is held.
+     */
+    private fun stopAudioFramesInternal() {
+        val wasRunning = audioFrameRunning.getAndSet(false)
+        val nothingToRelease = audioRecord == null
+        if (!wasRunning && nothingToRelease) return
+        audioFrameJob?.cancel()
+        audioFrameJob = null
+        releaseAudioRecord()
+        if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
+    }
+    /**
+     * Stops (if recording) and releases the held [AudioRecord], clearing the
+     * field first so the recorder is released at most once. Both steps are
+     * best-effort: a failure is logged and never propagated, so teardown always
+     * runs to completion.
+     */
+    private fun releaseAudioRecord() {
+        val record = audioRecord ?: return
+        audioRecord = null
+        try {
+            if (record.recordingState == AudioRecord.RECORDSTATE_RECORDING) {
+                record.stop()
+            }
+        } catch (e: Throwable) {
+            Log.w(TAG, "AudioRecord stop failed during release", e)
+        }
+        try {
+            record.release()
+        } catch (e: Throwable) {
+            Log.w(TAG, "AudioRecord release failed", e)
+        }
+    }
+    /**
+     * Hands the microphone over to AudioRecord: marks STT as suspended, clears
+     * the listening flags, cancels the restart and silence jobs, and destroys
+     * the recognizer on the main thread (errors during teardown are ignored).
+     */
+    private fun suspendSpeechRecognizerForFrames() {
+        sttSuspendedForFrames = true
+        listeningMode = false
+        isListening = false
+        restartJob?.cancel()
+        silenceJob?.cancel()
+        mainHandler.post {
+            // Best-effort teardown; the reference is dropped regardless.
+            runCatching {
+                recognizer?.cancel()
+                recognizer?.destroy()
+            }
+            recognizer = null
+        }
+    }
+    /**
+     * Clears the suspended-for-frames flag and, when a session is still enabled
+     * and no stop was requested, recreates the recognizer on the main thread
+     * and restarts listening plus the silence monitor. Failures are logged.
+     */
+    private fun resumeSpeechRecognizerAfterFrames() {
+        sttSuspendedForFrames = false
+        val sessionActive = enabled && !stopRequested
+        if (!sessionActive) return
+        listeningMode = true
+        mainHandler.post {
+            try {
+                if (SpeechRecognizer.isRecognitionAvailable(context)) {
+                    recognizer?.destroy()
+                    val fresh = SpeechRecognizer.createSpeechRecognizer(context)
+                    fresh.setRecognitionListener(recognitionListener)
+                    recognizer = fresh
+                    startListeningInternal(markListening = true)
+                    startSilenceMonitor()
+                }
+            } catch (e: Exception) {
+                Log.e(TAG, "Failed to resume STT after frames", e)
+            }
+        }
+    }
     // ── Config ──────────────────────────────────────────────────────────
     private fun applyConfig(config: JSObject) {
@@ -462,6 +795,13 @@ class TalkModePlugin : Plugin() {
             putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
             putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
             putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)
+            // On-device recognizer (no network round-trip; works offline). The
+            // platform recognizer's open/close cadence during continuous use is
+            // intrinsic and not controllable via the silence-length extras (the
+            // on-device SODA engine ignores them); we silence the AUDIBLE part of
+            // that churn by muting the earcon streams for the session instead
+            // (see configureVoiceAudioSession).
+            putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true)
             sttLanguage?.let { putExtra(RecognizerIntent.EXTRA_LANGUAGE, it) }
         }
@@ -515,13 +855,14 @@ class TalkModePlugin : Plugin() {
         val elapsed = SystemClock.elapsedRealtime() - lastHeard
         if (elapsed < silenceWindowMs) return
-        // Finalize: emit a final transcript event
-        notifyListeners("transcript", JSObject().apply {
-            put("transcript", transcript)
-            put("isFinal", true)
-        })
+        // Finalize this turn (deduped against the recognizer's own onResults),
+        // then restart the recognizer so the next utterance is a CLEAN session —
+        // Android SpeechRecognizer accumulates within a session, so without the
+        // restart the next turn's partials would prepend the words we just sent.
         lastTranscript = ""
         lastHeardAtMs = null
+        emitFinalOnce(transcript)
+        scheduleRestart()
     }
     private fun handleTranscript(transcript: String, isFinal: Boolean) {
@@ -531,34 +872,71 @@ class TalkModePlugin : Plugin() {
         if (isSpeaking && interruptOnSpeech) {
             if (shouldInterrupt(transcript)) {
                 val interruptedAt = computeInterruptedAt()
-                stopSpeakingInternal()
                 lastInterruptedAtSeconds = interruptedAt
+                stopSpeakingInternal()
             }
             return
         }
         if (!isListening) return
-        if (transcript.isNotEmpty()) {
+        if (isFinal) {
+            // A real end-of-turn from the recognizer: emit once and clear the
+            // pending buffer so the silence monitor doesn't re-finalize the same
+            // words (the double-send bug).
+            lastTranscript = ""
+            lastHeardAtMs = null
+            emitFinalOnce(transcript)
+        } else {
             lastTranscript = transcript
             lastHeardAtMs = SystemClock.elapsedRealtime()
+            notifyListeners("transcript", JSObject().apply {
+                put("transcript", transcript)
+                put("isFinal", false)
+            })
         }
+    }
+    /**
+     * Emits a final "transcript" event at most once per utterance.
+     *
+     * The transcript is trimmed and blank input is ignored. A final whose text
+     * equals the previously emitted final is dropped when it arrives less than
+     * 2 s after it, which collapses the two finalization paths (the
+     * recognizer's `onResults` and the silence monitor) into a single send.
+     */
+    private fun emitFinalOnce(transcript: String) {
+        val finalText = transcript.trim()
+        if (finalText.isEmpty()) return
+        val nowMs = SystemClock.elapsedRealtime()
+        val isRecentRepeat = finalText == lastEmittedFinal && nowMs - lastEmittedFinalAtMs < 2000L
+        if (isRecentRepeat) return
+        lastEmittedFinal = finalText
+        lastEmittedFinalAtMs = nowMs
         notifyListeners("transcript", JSObject().apply {
-            put("transcript", transcript)
-            put("isFinal", isFinal)
+            put("transcript", finalText)
+            put("isFinal", true)
         })
     }
     /**
-     * Avoid false interrupts: don't interrupt if the heard text is just a
-     * substring of what we're currently speaking (echo from speaker).
+     * Decide whether heard speech should barge in on the agent's TTS. Tuned to
+     * avoid FALSE interrupts (which cut the reply mid-sentence and read as
+     * "intermittent audio"): a one-word ASR blip, background noise, or the
+     * agent's own voice bleeding back into the mic must NOT interrupt — only a
+     * genuine couple-of-words utterance from the user does.
+     *
+     * Returns false for fewer than two words unless the text is at least 8
+     * characters long, for an exact substring of the text being spoken, and
+     * when at least 60% of the heard words are whole words of that text.
+     * Returns true otherwise, including when nothing is being spoken.
      */
     private fun shouldInterrupt(transcript: String): Boolean {
         val trimmed = transcript.trim()
-        if (trimmed.length < 3) return false
-        val spoken = lastSpokenText?.lowercase()
-        if (spoken != null && spoken.contains(trimmed.lowercase())) return false
+        val lower = trimmed.lowercase()
+        val words = lower.split(Regex("\\s+")).filter { it.isNotBlank() }
+        // Need real intent: at least two words, or one long word (≥ 8 chars).
+        if (words.size < 2 && trimmed.length < 8) return false
+        val spoken = lastSpokenText?.lowercase() ?: return true
+        // Exact echo of what we're saying → speaker bleed, not the user.
+        if (spoken.contains(lower)) return false
+        // Fuzzy echo: count heard words that are WHOLE words of the spoken text.
+        // A substring test would match short words inside unrelated ones ("no"
+        // in "know", "it" in "with") and swallow genuine barge-ins.
+        val spokenWords = spoken
+            .split(Regex("[^\\p{L}\\p{N}']+"))
+            .filter { it.isNotEmpty() }
+            .toSet()
+        val echoed = words.count { heard ->
+            heard.trim { c -> !c.isLetterOrDigit() } in spokenWords
+        }
+        if (words.isNotEmpty() && echoed.toDouble() / words.size >= 0.6) {
+            return false
+        }
         return true
     }
@@ -588,6 +966,7 @@ class TalkModePlugin : Plugin() {
     private suspend fun speakInternal(
         text: String,
         forceSystemTts: Boolean,
+        useLocalInferenceTts: Boolean,
         directive: JSObject?,
         call: PluginCall
     ) {
@@ -596,6 +975,7 @@ class TalkModePlugin : Plugin() {
         lastSpokenText = text
         speakStartTimeMs = SystemClock.elapsedRealtime()
         pcmStopRequested.set(false)
+        lastInterruptedAtSeconds = null
         setState("speaking", "Speaking")
         val effectiveVoiceId = directive.stringOrNull("voiceId")?.let(::resolveVoiceAlias) ?: voiceId
@@ -603,27 +983,74 @@ class TalkModePlugin : Plugin() {
         notifyListeners("speaking", JSObject().apply {
             put("text", text)
-            put("isSystemTts", forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
+            put(
+                "isSystemTts",
+                !useLocalInferenceTts &&
+                    (forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
+            )
         })
         // Stop listening during speech (we keep recognizer for interrupt detection)
         mainHandler.post { recognizer?.stopListening() }
         ensureInterruptListener()
-        // Request audio focus
-        requestAudioFocus()
+        // Ensure the communication-mode session + audio focus are active even
+        // for a standalone speak() that wasn't preceded by start().
+        configureVoiceAudioSession()
+        // Re-assert loudspeaker routing right before playback. configureVoice…
+        // only routes on the FIRST activation; if the session was already up (the
+        // STT path opened it) the speaker route may have drifted, leaving TTS on
+        // the earpiece. Re-route here so replies are audible out the speaker.
+        audioManager?.let { routeVoiceOutput(it) }
         try {
-            val canUseElevenLabs = !forceSystemTts &&
+            val canUseLocalInference = useLocalInferenceTts && !forceSystemTts
+            val canUseElevenLabs = !canUseLocalInference &&
+                !forceSystemTts &&
                 !effectiveApiKey.isNullOrEmpty() &&
                 !effectiveVoiceId.isNullOrEmpty()
-            if (canUseElevenLabs) {
+            if (canUseLocalInference) {
+                try {
+                    streamAndPlayLocalInferenceTts(text, directive)
+                    if (!pcmStopRequested.get()) {
+                        call.resolve(JSObject().apply {
+                            put("completed", true)
+                            put("interrupted", false)
+                            put("usedSystemTts", false)
+                        })
+                    } else {
+                        call.resolve(JSObject().apply {
+                            put("completed", false)
+                            put("interrupted", true)
+                            put("usedSystemTts", false)
+                            lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
+                        })
+                    }
+                } catch (e: Exception) {
+                    if (pcmStopRequested.get()) {
+                        call.resolve(JSObject().apply {
+                            put("completed", false)
+                            put("interrupted", true)
+                            put("usedSystemTts", false)
+                        })
+                    } else {
+                        // The on-device OmniVoice TTS assets aren't always staged
+                        // (it 502s "TEXT_TO_SPEECH not available"). Rather than go
+                        // silent — the JS browser-SpeechSynthesis fallback doesn't
+                        // exist in the Android WebView — fall back to the platform
+                        // TextToSpeech so replies are always spoken aloud.
+                        Log.w(TAG, "Local inference TTS failed, falling back to system TTS", e)
+                        speakWithSystemTts(text, call)
+                    }
+                }
+            } else if (canUseElevenLabs) {
                 try {
                     val request = buildElevenLabsRequest(text, directive)
                     streamAndPlayPcm(
-                        voiceId = effectiveVoiceId!!,
-                        apiKey = effectiveApiKey!!,
+                        voiceId = effectiveVoiceId,
+                        apiKey = effectiveApiKey,
                         request = request
                     )
@@ -665,13 +1092,16 @@ class TalkModePlugin : Plugin() {
                 put("error", e.message ?: "Speak failed")
             })
         } finally {
+            val wasInterrupted = pcmStopRequested.get()
+            val interruptedAt = lastInterruptedAtSeconds
             isSpeaking = false
             pcmStopRequested.set(false)
-            abandonAudioFocus()
             notifyListeners("speakComplete", JSObject().apply {
-                put("completed", !pcmStopRequested.get())
-                lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
+                put("completed", !wasInterrupted)
+                if (wasInterrupted) {
+                    interruptedAt?.let { put("interruptedAt", it) }
+                }
             })
             if (enabled) {
@@ -679,6 +1109,8 @@ class TalkModePlugin : Plugin() {
                 setState("listening", "Listening")
                 mainHandler.post { startListeningInternal(markListening = true) }
             } else {
+                // Standalone speak (no active conversation): release the session.
+                releaseVoiceAudioSession()
                 setState("idle", "Off")
             }
         }
@@ -753,6 +1185,273 @@ class TalkModePlugin : Plugin() {
         return if (value == null || value === JSONObject.NULL) null else value.toString()
     }
+    /**
+     * Description of a PCM stream as declared by a WAV header.
+     *
+     * @property sampleRate sample rate read from the `fmt ` chunk.
+     * @property channels channel count read from the `fmt ` chunk.
+     * @property bitsPerSample sample width in bits read from the `fmt ` chunk.
+     * @property dataBytes declared size in bytes of the `data` chunk.
+     */
+    private data class PcmStreamFormat(val sampleRate: Int, val channels: Int, val bitsPerSample: Int, val dataBytes: Int)
+    /**
+     * Stream local-inference TTS from the embedded agent and play it natively.
+     *
+     * The agent currently returns a buffered WAV; because the header is parsed and
+     * the samples are fed to an AudioTrack as they are read, the same path would
+     * serve a chunked PCM/WAV response without a WebView decodeAudioData round trip.
+     *
+     * @param text text to synthesize.
+     * @param directive optional overrides forwarded to the request payload builder.
+     * @throws IllegalStateException if the endpoint answers with HTTP status 400 or above.
+     */
+    private suspend fun streamAndPlayLocalInferenceTts(
+        text: String,
+        directive: JSObject?
+    ) = withContext(Dispatchers.IO) {
+        // Reset the stop flag before starting this utterance.
+        pcmStopRequested.set(false)
+        val connection = openLocalInferenceTtsConnection()
+        // Published so the stop path can disconnect an in-flight request.
+        activePcmConnection = connection
+        try {
+            val requestJson = buildLocalInferenceTtsPayload(text, directive)
+            connection.outputStream.use { body -> body.write(requestJson.toByteArray(Charsets.UTF_8)) }
+            val status = connection.responseCode
+            if (status >= 400) {
+                val detail = connection.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
+                throw IllegalStateException("Local inference TTS error: $status $detail")
+            }
+            BufferedInputStream(connection.inputStream).use { wav ->
+                val pcm = readWavPcmFormat(wav)
+                val track = createPcmAudioTrack(pcm).also { pcmTrack = it }
+                track.play()
+                Log.d(TAG, "Local inference PCM play start sampleRate=${pcm.sampleRate} channels=${pcm.channels}")
+                val startEvent = JSObject()
+                startEvent.put("provider", "local-inference")
+                startEvent.put("sampleRate", pcm.sampleRate)
+                startEvent.put("channels", pcm.channels)
+                notifyListeners("playbackStart", startEvent)
+                val framesWritten = writePcmStreamToTrack(wav, track, pcm)
+                drainPcmTrack(track, framesWritten, pcm.sampleRate)
+                // A requested stop is handled by the stop path; only stop the track
+                // here when playback finished on its own.
+                if (!pcmStopRequested.get()) {
+                    track.stop()
+                }
+                Log.d(TAG, "Local inference PCM play done frames=$framesWritten")
+            }
+        } finally {
+            cleanupPcmTrack()
+            // Only clear the shared reference if it still points at this request.
+            if (activePcmConnection === connection) {
+                activePcmConnection = null
+            }
+            connection.disconnect()
+        }
+    }
+    /**
+     * Prepare the authenticated POST connection to the local-inference TTS endpoint.
+     *
+     * The bearer token is read from `auth/local-agent-token` under the app's files
+     * directory and trimmed of surrounding whitespace.
+     *
+     * @return a connection configured for a JSON request body and a WAV response.
+     * @throws IllegalStateException if the token file is missing or empty.
+     */
+    private fun openLocalInferenceTtsConnection(): HttpURLConnection {
+        val tokenFile = File(context.filesDir, "auth/local-agent-token")
+        val bearer = if (tokenFile.isFile) tokenFile.readText().trim() else ""
+        check(bearer.isNotEmpty()) { "Local agent auth token is missing" }
+        val connection = URL(LOCAL_INFERENCE_TTS_URL).openConnection() as HttpURLConnection
+        return connection.apply {
+            requestMethod = "POST"
+            connectTimeout = 30_000
+            readTimeout = 180_000
+            setRequestProperty("Authorization", "Bearer $bearer")
+            setRequestProperty("Content-Type", "application/json")
+            setRequestProperty("Accept", "audio/wav")
+            doOutput = true
+        }
+    }
+    /**
+     * Serialize the JSON request body for the local-inference TTS endpoint.
+     *
+     * `text` is always present. `voiceId`, `voice`, `modelId` and `model` are copied
+     * from [directive] when it provides them, and `speed` is copied only when it is
+     * a finite value greater than zero.
+     *
+     * @return the payload as a JSON string.
+     */
+    private fun buildLocalInferenceTtsPayload(text: String, directive: JSObject?): String {
+        val body = JSONObject()
+        body.put("text", text)
+        for (key in listOf("voiceId", "voice", "modelId", "model")) {
+            directive.stringOrNull(key)?.let { value -> body.put(key, value) }
+        }
+        directive
+            ?.optDouble("speed", Double.NaN)
+            ?.takeIf { it.isFinite() && it > 0.0 }
+            ?.let { speed -> body.put("speed", speed) }
+        return body.toString()
+    }
+    /**
+     * Read exactly [size] bytes from [input], looping over short reads.
+     *
+     * @return a new array of length [size].
+     * @throws IllegalStateException if the stream ends before [size] bytes were read.
+     */
+    private fun readExactly(input: BufferedInputStream, size: Int): ByteArray {
+        val buffer = ByteArray(size)
+        var filled = 0
+        while (filled < size) {
+            val count = input.read(buffer, filled, size - filled)
+            check(count >= 0) { "Unexpected end of WAV stream" }
+            filled += count
+        }
+        return buffer
+    }
+    /**
+     * Discard exactly [count] bytes from [input].
+     *
+     * When `skip` makes no progress, a single byte is read instead so that the end
+     * of the stream can be detected.
+     *
+     * @throws IllegalStateException if the stream ends before [count] bytes were discarded.
+     */
+    private fun skipFully(input: BufferedInputStream, count: Int) {
+        var left = count
+        while (left > 0) {
+            val jumped = input.skip(left.toLong()).toInt()
+            left -= if (jumped > 0) {
+                jumped
+            } else {
+                check(input.read() >= 0) { "Unexpected end of WAV stream" }
+                1
+            }
+        }
+    }
+    /** Decode the unsigned 16-bit little-endian value stored at [offset] in [bytes]. */
+    private fun littleEndianShort(bytes: ByteArray, offset: Int): Int {
+        val low = bytes[offset].toInt() and 0xff
+        val high = bytes[offset + 1].toInt() and 0xff
+        return (high shl 8) or low
+    }
+    /**
+     * Decode the 32-bit little-endian value stored at [offset] in [bytes].
+     *
+     * The result is a signed [Int], so values with the top bit set come back negative.
+     */
+    private fun littleEndianInt(bytes: ByteArray, offset: Int): Int =
+        (0 until 4).fold(0) { acc, i -> acc or ((bytes[offset + i].toInt() and 0xff) shl (8 * i)) }
+    /** Return the first four bytes of [bytes] decoded as an ASCII chunk identifier. */
+    private fun chunkId(bytes: ByteArray): String = String(bytes, 0, 4, Charsets.US_ASCII)
+    /**
+     * Read the RIFF/WAVE header from [input] and return the PCM format it declares.
+     *
+     * Chunks are consumed in order until the `data` chunk header has been read, so on
+     * return [input] is positioned at the first byte of sample data. Chunks other than
+     * `fmt ` and `data` are skipped, including the pad byte after an odd-sized chunk.
+     *
+     * @param input stream positioned at the start of the WAV response.
+     * @return the parsed format, with `dataBytes` set to the declared `data` chunk size.
+     * @throws IllegalStateException if the stream is not RIFF/WAVE, ends early, has a
+     *   malformed chunk, is not 16-bit mono/stereo PCM, or has `data` before `fmt `.
+     */
+    private fun readWavPcmFormat(input: BufferedInputStream): PcmStreamFormat {
+        // A plain PCM fmt chunk is 16 bytes; extended forms add a 16-bit cbSize field
+        // plus at most 65535 extension bytes. Checking the declared size before the
+        // read keeps a corrupt header from forcing a huge allocation in readExactly.
+        val minFmtChunkBytes = 16
+        val maxFmtChunkBytes = 18 + 0xFFFF
+        val riff = readExactly(input, 12)
+        if (chunkId(riff) != "RIFF" || String(riff, 8, 4, Charsets.US_ASCII) != "WAVE") {
+            throw IllegalStateException("Local inference TTS returned non-WAV audio")
+        }
+        var format: PcmStreamFormat? = null
+        while (true) {
+            val header = readExactly(input, 8)
+            val id = chunkId(header)
+            val size = littleEndianInt(header, 4)
+            // NOTE(review): sizes of 2^31 and above (including the 0xFFFFFFFF value some
+            // streaming encoders write for "unknown length") decode as negative and are
+            // rejected here — confirm the agent always sends a real data size.
+            if (size < 0) {
+                throw IllegalStateException("Invalid WAV chunk size for $id")
+            }
+            if (id == "fmt ") {
+                if (size < minFmtChunkBytes || size > maxFmtChunkBytes) {
+                    throw IllegalStateException("Invalid WAV fmt chunk")
+                }
+                val fmt = readExactly(input, size)
+                val audioFormat = littleEndianShort(fmt, 0)
+                val channels = littleEndianShort(fmt, 2)
+                val sampleRate = littleEndianInt(fmt, 4)
+                val bitsPerSample = littleEndianShort(fmt, 14)
+                if (audioFormat != 1) {
+                    throw IllegalStateException("Only PCM WAV is supported, got format=$audioFormat")
+                }
+                if (bitsPerSample != 16) {
+                    throw IllegalStateException("Only 16-bit PCM WAV is supported, got bits=$bitsPerSample")
+                }
+                if (channels !in 1..2 || sampleRate <= 0) {
+                    throw IllegalStateException("Invalid WAV format sampleRate=$sampleRate channels=$channels")
+                }
+                // dataBytes is filled in once the data chunk header is reached.
+                format = PcmStreamFormat(sampleRate, channels, bitsPerSample, 0)
+                // RIFF chunks are word-aligned: an odd-sized chunk is followed by one pad byte.
+                if (size % 2 == 1) skipFully(input, 1)
+                continue
+            }
+            if (id == "data") {
+                val parsed = format ?: throw IllegalStateException("WAV data arrived before fmt chunk")
+                return parsed.copy(dataBytes = size)
+            }
+            // Any other chunk (LIST, fact, ...) is irrelevant for playback.
+            skipFully(input, size)
+            if (size % 2 == 1) skipFully(input, 1)
+        }
+    }
+    /**
+     * Create a streaming 16-bit PCM [AudioTrack] for [format], using the voice
+     * audio attributes.
+     *
+     * The buffer is twice the platform minimum, but never smaller than 8 KiB.
+     *
+     * @throws IllegalStateException for an unsupported channel count, an invalid
+     *   minimum buffer size, or a track that fails to initialize.
+     */
+    private fun createPcmAudioTrack(format: PcmStreamFormat): AudioTrack {
+        val channelMask = when (format.channels) {
+            1 -> AudioFormat.CHANNEL_OUT_MONO
+            2 -> AudioFormat.CHANNEL_OUT_STEREO
+            else -> throw IllegalStateException("Unsupported PCM channel count ${format.channels}")
+        }
+        val encoding = AudioFormat.ENCODING_PCM_16BIT
+        val minBuffer = AudioTrack.getMinBufferSize(format.sampleRate, channelMask, encoding)
+        check(minBuffer > 0) { "AudioTrack buffer size invalid: $minBuffer" }
+        val pcmFormat = AudioFormat.Builder()
+            .setEncoding(encoding)
+            .setSampleRate(format.sampleRate)
+            .setChannelMask(channelMask)
+            .build()
+        val track = AudioTrack.Builder()
+            .setAudioAttributes(voiceAudioAttributes())
+            .setAudioFormat(pcmFormat)
+            .setBufferSizeInBytes((minBuffer * 2).coerceAtLeast(8 * 1024))
+            .setTransferMode(AudioTrack.MODE_STREAM)
+            .build()
+        if (track.state == AudioTrack.STATE_INITIALIZED) return track
+        // Release the half-built track before reporting the failure.
+        track.release()
+        throw IllegalStateException("AudioTrack init failed")
+    }
+    /**
+     * Copy up to `format.dataBytes` bytes of PCM from [input] into [track].
+     *
+     * The copy ends early when the stream is exhausted or when a stop has been
+     * requested. The stop path sets the stop flag and disconnects the active
+     * connection, so a read or write that fails while the flag is set is treated
+     * as a normal stop rather than an error.
+     *
+     * @param input stream positioned at the first sample byte.
+     * @param track playing AudioTrack to write into.
+     * @param format stream format; supplies the frame size and the byte budget.
+     * @return the number of whole frames written to [track].
+     * @throws IllegalStateException if the track rejects a write while no stop is pending.
+     */
+    private fun writePcmStreamToTrack(
+        input: BufferedInputStream,
+        track: AudioTrack,
+        format: PcmStreamFormat
+    ): Long {
+        val bytesPerFrame = format.channels * (format.bitsPerSample / 8)
+        var bytesWrittenTotal = 0L
+        var remainingBytes = format.dataBytes
+        val buffer = ByteArray(8 * 1024)
+        while (remainingBytes > 0) {
+            if (pcmStopRequested.get()) break
+            val requestBytes = minOf(remainingBytes, buffer.size)
+            val bytesRead = try {
+                input.read(buffer, 0, requestBytes)
+            } catch (e: java.io.IOException) {
+                // Disconnecting the connection is how a stop unblocks this read;
+                // only surface the failure when nobody asked to stop.
+                if (pcmStopRequested.get()) break
+                throw e
+            }
+            if (bytesRead <= 0) break
+            remainingBytes -= bytesRead
+            var offset = 0
+            while (offset < bytesRead) {
+                if (pcmStopRequested.get()) break
+                val wrote = track.write(buffer, offset, bytesRead - offset)
+                if (wrote <= 0) {
+                    // A stop can land between the flag check above and the write;
+                    // presumably the track is torn down by then — treat it as a stop.
+                    if (pcmStopRequested.get()) break
+                    throw IllegalStateException("AudioTrack write failed: $wrote")
+                }
+                offset += wrote
+                bytesWrittenTotal += wrote.toLong()
+            }
+        }
+        return if (bytesPerFrame > 0) bytesWrittenTotal / bytesPerFrame else 0L
+    }
+    /**
+     * Block until [track] has played [framesWritten] frames, a stop is requested,
+     * or a time budget runs out.
+     *
+     * The budget is the nominal duration of the written audio, capped at 30 s,
+     * plus one second of slack. The playback head is polled every 20 ms.
+     */
+    private fun drainPcmTrack(track: AudioTrack, framesWritten: Long, sampleRate: Int) {
+        if (framesWritten <= 0L || sampleRate <= 0) return
+        val nominalMs = framesWritten * 1000L / sampleRate
+        val budgetMs = minOf(nominalMs, 30_000L) + 1_000L
+        val giveUpAt = SystemClock.elapsedRealtime() + budgetMs
+        while (true) {
+            if (pcmStopRequested.get()) break
+            if (track.playbackHeadPosition.toLong() >= framesWritten) break
+            if (SystemClock.elapsedRealtime() >= giveUpAt) break
+            SystemClock.sleep(20)
+        }
+    }
     /**
      * Stream PCM audio from ElevenLabs and play via AudioTrack.
      * Ported from classic TalkModeManager with proper offset-based writes.
@@ -776,12 +1475,7 @@ class TalkModePlugin : Plugin() {
         val bufferSize = max(minBuffer * 2, 8 * 1024)
         val track = AudioTrack.Builder()
-            .setAudioAttributes(
-                AudioAttributes.Builder()
-                    .setUsage(AudioAttributes.USAGE_ASSISTANT)
-                    .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
-                    .build()
-            )
+            .setAudioAttributes(voiceAudioAttributes())
             .setAudioFormat(
                 AudioFormat.Builder()
                     .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
@@ -802,6 +1496,7 @@ class TalkModePlugin : Plugin() {
         Log.d(TAG, "PCM play start sampleRate=$sampleRate bufferSize=$bufferSize")
         val conn = openTtsConnection(voiceId, apiKey, request)
+        activePcmConnection = conn
         try {
             val payload = buildRequestPayload(request)
             conn.outputStream.use { it.write(payload.toByteArray()) }
@@ -845,6 +1540,9 @@ class TalkModePlugin : Plugin() {
             Log.d(TAG, "PCM play done")
         } finally {
             cleanupPcmTrack()
+            if (activePcmConnection === conn) {
+                activePcmConnection = null
+            }
             conn.disconnect()
         }
     }
@@ -970,43 +1668,125 @@ class TalkModePlugin : Plugin() {
         }
     }
-    // ── Audio focus ─────────────────────────────────────────────────────
+    // ── Voice audio session ─────────────────────────────────────────────
+    //
+    // The Android analog of the iOS `.playAndRecord` / `.voiceChat` /
+    // `.defaultToSpeaker` session. Putting the device in MODE_IN_COMMUNICATION
+    // for the whole conversation routes capture + playback through the
+    // telephony path, which engages the platform hardware AEC so TTS coming out
+    // the speaker is cancelled from the mic (the core fix for the mic+speaker
+    // echo loop in hands-free mode). We also hold voice-communication audio
+    // focus and route to the loudspeaker (unless a headset is connected) so
+    // hands-free playback is audible.
+    /** Audio attributes shared by the voice session: voice-communication usage, speech content. */
+    private fun voiceAudioAttributes(): AudioAttributes {
+        val builder = AudioAttributes.Builder()
+        builder.setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION)
+        builder.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
+        return builder.build()
+    }
-    private fun requestAudioFocus() {
+    private fun configureVoiceAudioSession() { // Enter the voice session: save state, take focus, switch mode, route output, mute earcons.
+        if (audioSessionActive) return // already configured; keeps the state saved by the first call
         val am = audioManager ?: return
-        val focusListener = AudioManager.OnAudioFocusChangeListener { focusChange ->
-            when (focusChange) {
-                AudioManager.AUDIOFOCUS_LOSS,
-                AudioManager.AUDIOFOCUS_LOSS_TRANSIENT -> {
-                    // Another app took audio; stop speaking if we are
-                    if (isSpeaking) {
-                        stopSpeakingInternal()
-                    }
+        savedAudioMode = am.mode // restored by releaseVoiceAudioSession
+        @Suppress("DEPRECATION")
+        savedSpeakerphoneOn = am.isSpeakerphoneOn // restored by releaseVoiceAudioSession
+        val request = AudioFocusRequest.Builder(AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_EXCLUSIVE)
+            .setAudioAttributes(voiceAudioAttributes())
+            .setOnAudioFocusChangeListener { focusChange ->
+                if (
+                    focusChange == AudioManager.AUDIOFOCUS_LOSS ||
+                    focusChange == AudioManager.AUDIOFOCUS_LOSS_TRANSIENT
+                ) {
+                    // Another app took audio; stop speaking if we are.
+                    if (isSpeaking) stopSpeakingInternal()
                 }
             }
+            .build()
+        audioFocusRequest = request // kept so the same request can be abandoned on release
+        am.requestAudioFocus(request) // NOTE(review): the result is not checked; the session proceeds either way
+        am.mode = AudioManager.MODE_IN_COMMUNICATION
+        routeVoiceOutput(am)
+        muteEarconStreams(am)
+        audioSessionActive = true
+        Log.d(TAG, "Voice audio session active (communication mode)")
+    }
+    /** Mute the recognizer earcon streams for the session; idempotent. */
+    private fun muteEarconStreams(am: AudioManager) {
+        if (earconStreamsMuted) return // already muted by an earlier call
+        for (stream in earconStreams) {
+            try {
+                am.adjustStreamVolume(stream, AudioManager.ADJUST_MUTE, 0) // flags = 0
+            } catch (_: Throwable) {
+                // Some OEMs disallow muting certain streams without DND access.
+            }
         }
-        audioFocusRequest = focusListener
+        earconStreamsMuted = true // set even if some streams failed, so unmuteEarconStreams still runs
+    }
+    /** Undo [muteEarconStreams]; does nothing when the streams are not marked as muted. */
+    private fun unmuteEarconStreams(am: AudioManager) {
+        if (earconStreamsMuted) {
+            earconStreams.forEach { stream ->
+                // Best effort: a stream that refuses the adjustment is ignored.
+                runCatching { am.adjustStreamVolume(stream, AudioManager.ADJUST_UNMUTE, 0) }
+            }
+            earconStreamsMuted = false
+        }
+    }
+    /**
+     * Default playback to the loudspeaker for hands-free use, but let a wired or
+     * Bluetooth headset win — the iOS `.defaultToSpeaker` semantic.
+     */
+    private fun routeVoiceOutput(am: AudioManager) {
+        val hasHeadset = am.getDevices(AudioManager.GET_DEVICES_OUTPUTS).any { device -> // any wired, USB or Bluetooth output counts
+            device.type == AudioDeviceInfo.TYPE_WIRED_HEADSET ||
+                device.type == AudioDeviceInfo.TYPE_WIRED_HEADPHONES ||
+                device.type == AudioDeviceInfo.TYPE_USB_HEADSET ||
+                device.type == AudioDeviceInfo.TYPE_BLUETOOTH_SCO ||
+                device.type == AudioDeviceInfo.TYPE_BLUETOOTH_A2DP
+        }
+        if (hasHeadset) {
+            if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) am.clearCommunicationDevice() // API 31+: drop any explicit device selection
+            @Suppress("DEPRECATION")
+            am.isSpeakerphoneOn = false
+            return
+        }
+        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) { // API 31+: select the built-in speaker explicitly
+            val speaker = am.availableCommunicationDevices.firstOrNull {
+                it.type == AudioDeviceInfo.TYPE_BUILTIN_SPEAKER
+            }
+            if (speaker != null && am.setCommunicationDevice(speaker)) return // accepted; otherwise fall through to the legacy toggle
+        }
         @Suppress("DEPRECATION")
-        am.requestAudioFocus(
-            focusListener,
-            AudioManager.STREAM_MUSIC,
-            AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_MAY_DUCK
-        )
+        am.isSpeakerphoneOn = true // older devices, or the explicit speaker selection did not succeed
     }
-    private fun abandonAudioFocus() {
+    private fun releaseVoiceAudioSession() { // Leave the voice session and restore the audio state saved by configureVoiceAudioSession.
+        if (!audioSessionActive) return // nothing to release
         val am = audioManager ?: return
-        val listener = audioFocusRequest ?: return
-        @Suppress("DEPRECATION")
-        am.abandonAudioFocus(listener)
+        unmuteEarconStreams(am)
+        audioFocusRequest?.let { am.abandonAudioFocusRequest(it) } // abandon the request made in configureVoiceAudioSession
         audioFocusRequest = null
+        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) am.clearCommunicationDevice() // API 31+: drop any explicit device selection
+        @Suppress("DEPRECATION")
+        am.isSpeakerphoneOn = savedSpeakerphoneOn // value captured when the session was configured
+        am.mode = savedAudioMode // value captured when the session was configured
+        audioSessionActive = false
+        Log.d(TAG, "Voice audio session released")
     }
     // ── Cleanup helpers ─────────────────────────────────────────────────
     private fun stopSpeakingInternal() {
         pcmStopRequested.set(true)
+        val conn = activePcmConnection
+        activePcmConnection = null
+        conn?.disconnect()
         cleanupPcmTrack()
         systemTts?.stop()
         systemTtsPending?.cancel()
@@ -1162,6 +1942,9 @@ class TalkModePlugin : Plugin() {
     }
     private fun isPermissionGranted(permission: String): Boolean {
+        if (permission == Manifest.permission.RECORD_AUDIO) { // RECORD_AUDIO is answered by the direct OS check below, not the Capacitor permission state
+            return context.checkSelfPermission(permission) == PackageManager.PERMISSION_GRANTED
+        }
         return getPermissionState(permission) == com.getcapacitor.PermissionState.GRANTED
     }
@@ -1176,10 +1959,13 @@ class TalkModePlugin : Plugin() {
         systemTts?.shutdown()
         systemTts = null
         cleanupPcmTrack()
+        audioFrameRunning.set(false)
+        audioFrameJob?.cancel()
+        releaseAudioRecord()
         silenceJob?.cancel()
         restartJob?.cancel()
         speakingJob?.cancel()
-        abandonAudioFocus()
+        releaseVoiceAudioSession()
         scope.cancel()
     }