@elizaos/capacitor-talkmode 1.0.0 → 2.0.3-beta.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +137 -0
- package/android/build.gradle +17 -3
- package/android/src/main/java/ai/eliza/plugins/talkmode/TalkModePlugin.kt +941 -54
- package/dist/esm/definitions.d.ts +146 -0
- package/dist/esm/definitions.d.ts.map +1 -1
- package/dist/esm/web.d.ts +6 -1
- package/dist/esm/web.d.ts.map +1 -1
- package/dist/esm/web.js +34 -5
- package/dist/esm/web.test.d.ts +2 -0
- package/dist/esm/web.test.d.ts.map +1 -0
- package/dist/esm/web.test.js +137 -0
- package/dist/plugin.cjs.js +34 -5
- package/dist/plugin.cjs.js.map +1 -1
- package/dist/plugin.js +34 -5
- package/dist/plugin.js.map +1 -1
- package/ios/Sources/TalkModePlugin/TalkModePlugin.swift +266 -16
- package/package.json +18 -14
|
@@ -2,10 +2,17 @@ package ai.eliza.plugins.talkmode
|
|
|
2
2
|
|
|
3
3
|
import android.Manifest
|
|
4
4
|
import android.content.Intent
|
|
5
|
+
import android.content.pm.PackageManager
|
|
5
6
|
import android.media.AudioAttributes
|
|
7
|
+
import android.media.AudioDeviceInfo
|
|
8
|
+
import android.media.AudioFocusRequest
|
|
6
9
|
import android.media.AudioFormat
|
|
7
10
|
import android.media.AudioManager
|
|
11
|
+
import android.media.AudioRecord
|
|
8
12
|
import android.media.AudioTrack
|
|
13
|
+
import android.media.MediaRecorder
|
|
14
|
+
import android.util.Base64
|
|
15
|
+
import android.os.Build
|
|
9
16
|
import android.os.Bundle
|
|
10
17
|
import android.os.Handler
|
|
11
18
|
import android.os.Looper
|
|
@@ -25,7 +32,15 @@ import com.getcapacitor.annotation.CapacitorPlugin
|
|
|
25
32
|
import com.getcapacitor.annotation.Permission
|
|
26
33
|
import com.getcapacitor.annotation.PermissionCallback
|
|
27
34
|
import kotlinx.coroutines.*
|
|
35
|
+
import android.net.LocalSocket
|
|
36
|
+
import android.net.LocalSocketAddress
|
|
28
37
|
import java.io.BufferedInputStream
|
|
38
|
+
import java.io.ByteArrayInputStream
|
|
39
|
+
import java.io.DataInputStream
|
|
40
|
+
import java.io.DataOutputStream
|
|
41
|
+
import java.nio.ByteBuffer
|
|
42
|
+
import java.nio.ByteOrder
|
|
43
|
+
import java.io.File
|
|
29
44
|
import java.net.HttpURLConnection
|
|
30
45
|
import java.net.URL
|
|
31
46
|
import java.util.Locale
|
|
@@ -45,6 +60,15 @@ class TalkModePlugin : Plugin() {
|
|
|
45
60
|
private const val TAG = "TalkMode"
|
|
46
61
|
private const val DEFAULT_MODEL_ID = "eleven_flash_v2_5"
|
|
47
62
|
private const val DEFAULT_OUTPUT_FORMAT = "pcm_24000"
|
|
63
|
+
private const val LOCAL_INFERENCE_TTS_URL = "http://127.0.0.1:31337/api/tts/local-inference"
|
|
64
|
+
// Abstract-namespace UDS of ElizaBionicInferenceServer (the bionic app
|
|
65
|
+
// process that has libelizainference loaded). Kept in sync with
|
|
66
|
+
// BIONIC_INFERENCE_SOCKET_NAME in ElizaAgentService.
|
|
67
|
+
private const val BIONIC_INFER_SOCKET = "eliza_bionic_infer_v1"
|
|
68
|
+
// 16 kHz mono is the rate VAD / diarizer / wake-word models expect; 20 ms
|
|
69
|
+
// (320 samples) is the standard VAD frame size.
|
|
70
|
+
private const val DEFAULT_FRAME_SAMPLE_RATE = 16000
|
|
71
|
+
private const val DEFAULT_FRAME_MS = 20
|
|
48
72
|
}
|
|
49
73
|
|
|
50
74
|
private val mainHandler = Handler(Looper.getMainLooper())
|
|
@@ -65,6 +89,10 @@ class TalkModePlugin : Plugin() {
|
|
|
65
89
|
private var lastHeardAtMs: Long? = null
|
|
66
90
|
private var silenceJob: Job? = null
|
|
67
91
|
private val silenceWindowMs = 700L
|
|
92
|
+
// The recognizer's own onResults AND our silence monitor can both finalize
|
|
93
|
+
// the same utterance; dedup so a turn is emitted (and sent) exactly once.
|
|
94
|
+
private var lastEmittedFinal = ""
|
|
95
|
+
private var lastEmittedFinalAtMs = 0L
|
|
68
96
|
|
|
69
97
|
// TTS
|
|
70
98
|
private var systemTts: TextToSpeech? = null
|
|
@@ -79,10 +107,37 @@ class TalkModePlugin : Plugin() {
|
|
|
79
107
|
private var lastSpokenText: String? = null
|
|
80
108
|
private var speakStartTimeMs: Long = 0
|
|
81
109
|
private var lastInterruptedAtSeconds: Double? = null
|
|
110
|
+
@Volatile private var activePcmConnection: HttpURLConnection? = null
|
|
82
111
|
|
|
83
|
-
//
|
|
112
|
+
// Voice audio session (communication-mode routing + focus, mirrors the iOS
|
|
113
|
+
// .playAndRecord/.voiceChat/.defaultToSpeaker session). Held for the whole
|
|
114
|
+
// conversation so the platform AEC has a stable speaker reference to cancel.
|
|
84
115
|
private var audioManager: AudioManager? = null
|
|
85
|
-
private var audioFocusRequest:
|
|
116
|
+
private var audioFocusRequest: AudioFocusRequest? = null
|
|
117
|
+
private var audioSessionActive = false
|
|
118
|
+
private var savedAudioMode = AudioManager.MODE_NORMAL
|
|
119
|
+
private var savedSpeakerphoneOn = false
|
|
120
|
+
// Streams we mute for the session to suppress the platform recognizer's
|
|
121
|
+
// start/stop earcons (the "on/off" beeps heard as it re-arms continuously).
|
|
122
|
+
// TTS plays on STREAM_VOICE_CALL (USAGE_VOICE_COMMUNICATION) so it stays
|
|
123
|
+
// audible. Tracked so we only unmute streams we muted.
|
|
124
|
+
private val earconStreams = intArrayOf(
|
|
125
|
+
AudioManager.STREAM_MUSIC,
|
|
126
|
+
AudioManager.STREAM_SYSTEM,
|
|
127
|
+
AudioManager.STREAM_NOTIFICATION,
|
|
128
|
+
)
|
|
129
|
+
private var earconStreamsMuted = false
|
|
130
|
+
|
|
131
|
+
// Raw PCM frame capture (diarization / VAD / wake-word source). Opt-in and
|
|
132
|
+
// mutually exclusive with SpeechRecognizer on the mic: Android only lets one
|
|
133
|
+
// capture client own a given input source at a time, so starting frame
|
|
134
|
+
// capture SUSPENDS any active SpeechRecognizer and stopping it resumes STT.
|
|
135
|
+
private var audioRecord: AudioRecord? = null
|
|
136
|
+
private var audioFrameJob: Job? = null
|
|
137
|
+
private val audioFrameRunning = AtomicBoolean(false)
|
|
138
|
+
private var sttSuspendedForFrames = false
|
|
139
|
+
private var lastFrameSampleRate = DEFAULT_FRAME_SAMPLE_RATE
|
|
140
|
+
private var lastFrameSamples = 0
|
|
86
141
|
|
|
87
142
|
// Config
|
|
88
143
|
private var apiKey: String? = null
|
|
@@ -189,6 +244,7 @@ class TalkModePlugin : Plugin() {
|
|
|
189
244
|
systemTtsReady = status == TextToSpeech.SUCCESS
|
|
190
245
|
if (systemTtsReady) {
|
|
191
246
|
systemTts?.language = Locale.getDefault()
|
|
247
|
+
systemTts?.setAudioAttributes(voiceAudioAttributes())
|
|
192
248
|
systemTts?.setOnUtteranceProgressListener(object : UtteranceProgressListener() {
|
|
193
249
|
override fun onStart(id: String?) {}
|
|
194
250
|
|
|
@@ -270,6 +326,7 @@ class TalkModePlugin : Plugin() {
|
|
|
270
326
|
enabled = true
|
|
271
327
|
stopRequested = false
|
|
272
328
|
listeningMode = true
|
|
329
|
+
configureVoiceAudioSession()
|
|
273
330
|
setState("listening", "Listening")
|
|
274
331
|
|
|
275
332
|
mainHandler.post {
|
|
@@ -286,6 +343,13 @@ class TalkModePlugin : Plugin() {
|
|
|
286
343
|
})
|
|
287
344
|
} catch (e: Exception) {
|
|
288
345
|
Log.e(TAG, "Failed to start", e)
|
|
346
|
+
// Recognizer creation failed AFTER the audio session was
|
|
347
|
+
// configured — release it so the earcon streams aren't left
|
|
348
|
+
// muted and the device isn't stuck in MODE_IN_COMMUNICATION.
|
|
349
|
+
enabled = false
|
|
350
|
+
listeningMode = false
|
|
351
|
+
releaseVoiceAudioSession()
|
|
352
|
+
setState("idle", "Off")
|
|
289
353
|
call.resolve(JSObject().apply {
|
|
290
354
|
put("started", false)
|
|
291
355
|
put("error", e.message ?: "Failed to start")
|
|
@@ -307,6 +371,10 @@ class TalkModePlugin : Plugin() {
|
|
|
307
371
|
lastTranscript = ""
|
|
308
372
|
lastHeardAtMs = null
|
|
309
373
|
|
|
374
|
+
// Release any raw-PCM capture; `enabled` is already false so this won't
|
|
375
|
+
// re-arm SpeechRecognizer.
|
|
376
|
+
stopAudioFramesInternal()
|
|
377
|
+
|
|
310
378
|
mainHandler.post {
|
|
311
379
|
recognizer?.cancel()
|
|
312
380
|
recognizer?.destroy()
|
|
@@ -314,6 +382,7 @@ class TalkModePlugin : Plugin() {
|
|
|
314
382
|
}
|
|
315
383
|
|
|
316
384
|
stopSpeakingInternal()
|
|
385
|
+
releaseVoiceAudioSession()
|
|
317
386
|
setState("idle", "Off")
|
|
318
387
|
call.resolve()
|
|
319
388
|
}
|
|
@@ -364,16 +433,18 @@ class TalkModePlugin : Plugin() {
|
|
|
364
433
|
}
|
|
365
434
|
|
|
366
435
|
val useSystemTts = call.getBoolean("useSystemTts", false) ?: false
|
|
436
|
+
val useLocalInferenceTts = call.getBoolean("useLocalInferenceTts", false) ?: false
|
|
367
437
|
val directive = call.getObject("directive")
|
|
368
438
|
|
|
369
439
|
speakingJob = scope.launch {
|
|
370
|
-
speakInternal(text, useSystemTts, directive, call)
|
|
440
|
+
speakInternal(text, useSystemTts, useLocalInferenceTts, directive, call)
|
|
371
441
|
}
|
|
372
442
|
}
|
|
373
443
|
|
|
374
444
|
@PluginMethod
|
|
375
445
|
fun stopSpeaking(call: PluginCall) {
|
|
376
446
|
val interruptedAt = computeInterruptedAt()
|
|
447
|
+
lastInterruptedAtSeconds = interruptedAt
|
|
377
448
|
stopSpeakingInternal()
|
|
378
449
|
call.resolve(JSObject().apply {
|
|
379
450
|
if (interruptedAt != null) {
|
|
@@ -408,6 +479,279 @@ class TalkModePlugin : Plugin() {
|
|
|
408
479
|
call.resolve(buildPermissionResult())
|
|
409
480
|
}
|
|
410
481
|
|
|
482
|
+
// ── Raw PCM frame capture (diarization / VAD / wake-word) ────────────
|
|
483
|
+
|
|
484
|
+
/**
 * Plugin entry point for raw PCM frame capture.
 *
 * When the "microphone" permission alias is already granted, capture starts
 * immediately. Otherwise the permission is requested and the call resumes in
 * the "handleStartAudioFramesPermission" callback.
 */
@PluginMethod
fun startAudioFrames(call: PluginCall) {
    val micGranted = getPermissionState("microphone") == PermissionState.GRANTED
    if (micGranted) {
        startAudioFramesInternal(call)
    } else {
        requestPermissionForAlias("microphone", call, "handleStartAudioFramesPermission")
    }
}
|
|
492
|
+
|
|
493
|
+
/**
 * Continuation of [startAudioFrames] after the microphone permission prompt.
 *
 * Starts capture when the permission was granted; otherwise resolves the call
 * with `started = false` and an explanatory `error`.
 */
@PermissionCallback
private fun handleStartAudioFramesPermission(call: PluginCall) {
    if (getPermissionState("microphone") != PermissionState.GRANTED) {
        val denied = JSObject()
        denied.put("started", false)
        denied.put("error", "Microphone permission denied")
        call.resolve(denied)
        return
    }
    startAudioFramesInternal(call)
}
|
|
504
|
+
|
|
505
|
+
/**
 * Open the microphone with AudioRecord and start emitting "audioFrame" events.
 *
 * Reads the optional `sampleRate` and `frameMs` call options (falling back to
 * DEFAULT_FRAME_SAMPLE_RATE / DEFAULT_FRAME_MS). If speech recognition is
 * active it is suspended first so AudioRecord can own the mic. The call is
 * always resolved: with `started = true` plus the effective `sampleRate`,
 * `frameSamples` and `suspendedStt` on success, or `started = false` plus an
 * `error` message on failure. Calling it while capture is already running
 * resolves with the parameters of the running session.
 */
private fun startAudioFramesInternal(call: PluginCall) {
    if (audioFrameRunning.get()) {
        val running = JSObject()
        running.put("started", true)
        running.put("sampleRate", lastFrameSampleRate)
        running.put("frameSamples", lastFrameSamples)
        running.put("suspendedStt", sttSuspendedForFrames)
        call.resolve(running)
        return
    }

    // Shared tail of every failure path: hand the mic back to the recognizer
    // (if we took it) and tell JS why capture did not start.
    fun rejectStart(reason: String) {
        if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
        val failure = JSObject()
        failure.put("started", false)
        failure.put("error", reason)
        call.resolve(failure)
    }

    val requestedRate = call.getInt("sampleRate") ?: DEFAULT_FRAME_SAMPLE_RATE
    val frameMs = call.getInt("frameMs") ?: DEFAULT_FRAME_MS

    // SpeechRecognizer (SODA) holds the mic; a parallel AudioRecord on the
    // same input fails on virtually every device. Suspend it for the
    // duration of capture; it is resumed on stop or on any failure below.
    if (isListening || listeningMode) {
        suspendSpeechRecognizerForFrames()
    }

    val record = try {
        openAudioRecord(requestedRate)
    } catch (e: Exception) {
        Log.e(TAG, "AudioRecord open failed", e)
        rejectStart(e.message ?: "AudioRecord open failed")
        return
    }

    // The device may not honour the requested rate exactly; report what we got.
    val actualRate = record.sampleRate
    val frameSamples = max(1, actualRate * frameMs / 1000)
    audioRecord = record
    lastFrameSampleRate = actualRate
    lastFrameSamples = frameSamples

    try {
        record.startRecording()
    } catch (e: Exception) {
        Log.e(TAG, "AudioRecord startRecording failed", e)
        releaseAudioRecord()
        rejectStart(e.message ?: "AudioRecord start failed")
        return
    }

    // startRecording() can return without throwing yet leave the record idle.
    val recording = record.recordingState == AudioRecord.RECORDSTATE_RECORDING
    if (!recording) {
        Log.e(TAG, "AudioRecord did not enter RECORDING state")
        releaseAudioRecord()
        rejectStart("AudioRecord did not start (mic likely held by SpeechRecognizer)")
        return
    }

    audioFrameRunning.set(true)
    launchFrameLoop(record, frameSamples)

    val started = JSObject()
    started.put("started", true)
    started.put("sampleRate", actualRate)
    started.put("frameSamples", frameSamples)
    started.put("suspendedStt", sttSuspendedForFrames)
    call.resolve(started)
}
|
|
578
|
+
|
|
579
|
+
/** Plugin entry point: stop raw PCM frame capture, then resolve the call. */
@PluginMethod
fun stopAudioFrames(call: PluginCall) {
    stopAudioFramesInternal().also { call.resolve() }
}
|
|
584
|
+
|
|
585
|
+
/** Plugin entry point: resolve with `capturing`, the current value of the frame-capture flag. */
@PluginMethod
fun isCapturingAudioFrames(call: PluginCall) {
    val status = JSObject()
    status.put("capturing", audioFrameRunning.get())
    call.resolve(status)
}
|
|
591
|
+
|
|
592
|
+
/**
 * Open a mono, 16-bit PCM AudioRecord at [sampleRate].
 *
 * Audio sources are tried in order: VOICE_RECOGNITION first, then MIC as a
 * fallback. The buffer is twice the platform minimum and never smaller than
 * 4 KiB.
 *
 * @param sampleRate requested capture rate in Hz.
 * @return an AudioRecord in STATE_INITIALIZED; the caller owns and must release it.
 * @throws IllegalStateException when the platform reports no valid minimum
 *   buffer for this rate, or when no source could be initialized (the last
 *   per-source failure is attached as the cause).
 */
private fun openAudioRecord(sampleRate: Int): AudioRecord {
    val channelConfig = AudioFormat.CHANNEL_IN_MONO
    val encoding = AudioFormat.ENCODING_PCM_16BIT

    val minBuffer = AudioRecord.getMinBufferSize(sampleRate, channelConfig, encoding)
    if (minBuffer <= 0) {
        throw IllegalStateException("AudioRecord min buffer invalid ($minBuffer) for ${sampleRate}Hz")
    }
    val bufferBytes = max(minBuffer * 2, 4 * 1024)

    val candidates = intArrayOf(
        MediaRecorder.AudioSource.VOICE_RECOGNITION,
        MediaRecorder.AudioSource.MIC,
    )
    var lastError: Throwable? = null
    for (source in candidates) {
        try {
            @Suppress("MissingPermission")
            val candidate = AudioRecord(source, sampleRate, channelConfig, encoding, bufferBytes)
            if (candidate.state != AudioRecord.STATE_INITIALIZED) {
                // Constructed but unusable: free it and move on to the next source.
                candidate.release()
                lastError = IllegalStateException("AudioRecord uninitialized for source $source")
                continue
            }
            return candidate
        } catch (e: Exception) {
            lastError = e
        }
    }
    throw IllegalStateException(
        "AudioRecord could not initialize at ${sampleRate}Hz",
        lastError
    )
}
|
|
635
|
+
|
|
636
|
+
/**
 * Start the background loop that reads PCM from [record] and emits one
 * "audioFrame" event per successful read.
 *
 * Each event carries the samples as base64 little-endian 16-bit PCM
 * (`pcm16`), plus `sampleRate`, `channels` (always 1), `samples`, a
 * normalized `rms` (0..1), an elapsed-realtime `timestamp` and a
 * monotonically increasing `frameIndex`.
 *
 * The loop ends when capture is stopped (flag cleared / job cancelled), when
 * AudioRecord.read returns a negative error code, or when an exception is
 * thrown. If the loop dies on its own while this record is still the active
 * session, the session is torn down via stopAudioFramesInternal so the
 * capture flag is cleared, the record is released and a suspended
 * SpeechRecognizer is resumed; otherwise the plugin would keep reporting
 * "capturing" and refuse to restart.
 *
 * @param record an initialized AudioRecord that is already recording.
 * @param frameSamples number of 16-bit samples requested per read.
 */
private fun launchFrameLoop(record: AudioRecord, frameSamples: Int) {
    audioFrameJob?.cancel()
    // IO dispatcher: a tight blocking read loop must not sit on the main
    // thread. Frames are marshalled to JS via notifyListeners.
    audioFrameJob = scope.launch(Dispatchers.IO) {
        val buffer = ShortArray(frameSamples)
        val bytes = ByteArray(frameSamples * 2)
        var frameIndex = 0L
        var readError: Int? = null
        var crashed = false
        try {
            while (audioFrameRunning.get() && isActive) {
                val read = record.read(buffer, 0, frameSamples)
                if (read < 0) {
                    // Negative return is an AudioRecord error code: the record
                    // was released or the mic was taken away. Stop reading.
                    readError = read
                    break
                }
                if (read == 0) continue

                var sumSquares = 0.0
                var b = 0
                for (i in 0 until read) {
                    val s = buffer[i].toInt()
                    // Little-endian 16-bit: low byte first.
                    bytes[b] = (s and 0xff).toByte()
                    bytes[b + 1] = ((s shr 8) and 0xff).toByte()
                    b += 2
                    sumSquares += (s.toDouble() * s.toDouble())
                }
                // read > 0 here, so the division is safe.
                val rms = Math.sqrt(sumSquares / read) / 32768.0
                val pcmBase64 = Base64.encodeToString(
                    bytes, 0, read * 2, Base64.NO_WRAP
                )
                val idx = frameIndex
                frameIndex += 1
                val ts = SystemClock.elapsedRealtime()
                notifyListeners("audioFrame", JSObject().apply {
                    put("pcm16", pcmBase64)
                    put("sampleRate", record.sampleRate)
                    put("channels", 1)
                    put("samples", read)
                    put("rms", rms)
                    put("timestamp", ts)
                    put("frameIndex", idx)
                })
            }
        } catch (e: CancellationException) {
            // Never swallow coroutine cancellation.
            throw e
        } catch (e: Throwable) {
            crashed = true
            Log.e(TAG, "Audio frame loop error", e)
            notifyListeners("error", JSObject().apply {
                put("message", "Audio frame capture stopped: ${e.message}")
                put("fatal", false)
            })
        }

        // An explicit stop clears the flag (and a restart swaps in a different
        // record) before the read fails, so this only fires when capture died
        // underneath a session that still believes it is running.
        val sessionStillLive = audioFrameRunning.get() && audioRecord === record
        if ((crashed || readError != null) && sessionStillLive) {
            if (readError != null) {
                Log.w(TAG, "AudioRecord read failed ($readError); stopping frame capture")
                notifyListeners("error", JSObject().apply {
                    put("message", "Audio frame capture stopped: AudioRecord read error $readError")
                    put("fatal", false)
                })
            }
            stopAudioFramesInternal()
        }
    }
}
|
|
690
|
+
|
|
691
|
+
/**
 * Stop raw PCM capture and undo its side effects.
 *
 * Always clears the running flag. If capture was neither running nor holding
 * an AudioRecord this is a no-op; otherwise the frame job is cancelled, the
 * record is released and speech recognition is resumed when it had been
 * suspended for frame capture.
 */
private fun stopAudioFramesInternal() {
    val wasRunning = audioFrameRunning.getAndSet(false)
    val nothingToRelease = audioRecord == null
    if (!wasRunning && nothingToRelease) return

    audioFrameJob?.cancel()
    audioFrameJob = null
    releaseAudioRecord()

    if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
}
|
|
702
|
+
|
|
703
|
+
/**
 * Detach and free the current AudioRecord, if any.
 *
 * The field is cleared before touching the record so a repeated call is a
 * no-op. Stop and release are each best-effort: a failure in one does not
 * prevent the other, and nothing is propagated to the caller.
 */
private fun releaseAudioRecord() {
    val held = audioRecord
    if (held == null) return
    audioRecord = null

    runCatching {
        if (held.recordingState == AudioRecord.RECORDSTATE_RECORDING) {
            held.stop()
        }
    }
    runCatching { held.release() }
}
|
|
717
|
+
|
|
718
|
+
/**
 * Hand the microphone over to AudioRecord by shutting speech recognition down.
 *
 * Marks STT as suspended-for-frames, clears the listening flags, cancels the
 * pending restart and silence jobs, and posts the recognizer teardown
 * (cancel + destroy, errors ignored) to the main handler.
 */
private fun suspendSpeechRecognizerForFrames() {
    sttSuspendedForFrames = true
    isListening = false
    listeningMode = false

    restartJob?.cancel()
    silenceJob?.cancel()

    mainHandler.post {
        runCatching {
            recognizer?.cancel()
            recognizer?.destroy()
        }
        recognizer = null
    }
}
|
|
734
|
+
|
|
735
|
+
/**
 * Bring speech recognition back after frame capture ends.
 *
 * Always clears the suspended-for-frames marker. Recognition is only re-armed
 * when a session is enabled and no stop has been requested: a fresh
 * SpeechRecognizer is created on the main handler, listening is restarted and
 * the silence monitor is started. Failures are logged, not thrown.
 */
private fun resumeSpeechRecognizerAfterFrames() {
    sttSuspendedForFrames = false

    val sessionActive = enabled && !stopRequested
    if (!sessionActive) return

    listeningMode = true
    mainHandler.post {
        try {
            if (SpeechRecognizer.isRecognitionAvailable(context)) {
                recognizer?.destroy()
                val fresh = SpeechRecognizer.createSpeechRecognizer(context)
                fresh.setRecognitionListener(recognitionListener)
                recognizer = fresh
                startListeningInternal(markListening = true)
                startSilenceMonitor()
            }
        } catch (e: Exception) {
            Log.e(TAG, "Failed to resume STT after frames", e)
        }
    }
}
|
|
754
|
+
|
|
411
755
|
// ── Config ──────────────────────────────────────────────────────────
|
|
412
756
|
|
|
413
757
|
private fun applyConfig(config: JSObject) {
|
|
@@ -462,6 +806,13 @@ class TalkModePlugin : Plugin() {
|
|
|
462
806
|
putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
|
|
463
807
|
putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
|
|
464
808
|
putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)
|
|
809
|
+
// On-device recognizer (no network round-trip; works offline). The
|
|
810
|
+
// platform recognizer's open/close cadence during continuous use is
|
|
811
|
+
// intrinsic and not controllable via the silence-length extras (the
|
|
812
|
+
// on-device SODA engine ignores them); we silence the AUDIBLE part of
|
|
813
|
+
// that churn by muting the earcon streams for the session instead
|
|
814
|
+
// (see configureVoiceAudioSession).
|
|
815
|
+
putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true)
|
|
465
816
|
sttLanguage?.let { putExtra(RecognizerIntent.EXTRA_LANGUAGE, it) }
|
|
466
817
|
}
|
|
467
818
|
|
|
@@ -515,13 +866,14 @@ class TalkModePlugin : Plugin() {
|
|
|
515
866
|
val elapsed = SystemClock.elapsedRealtime() - lastHeard
|
|
516
867
|
if (elapsed < silenceWindowMs) return
|
|
517
868
|
|
|
518
|
-
// Finalize
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
})
|
|
869
|
+
// Finalize this turn (deduped against the recognizer's own onResults),
|
|
870
|
+
// then restart the recognizer so the next utterance is a CLEAN session —
|
|
871
|
+
// Android SpeechRecognizer accumulates within a session, so without the
|
|
872
|
+
// restart the next turn's partials would prepend the words we just sent.
|
|
523
873
|
lastTranscript = ""
|
|
524
874
|
lastHeardAtMs = null
|
|
875
|
+
emitFinalOnce(transcript)
|
|
876
|
+
scheduleRestart()
|
|
525
877
|
}
|
|
526
878
|
|
|
527
879
|
private fun handleTranscript(transcript: String, isFinal: Boolean) {
|
|
@@ -531,34 +883,71 @@ class TalkModePlugin : Plugin() {
|
|
|
531
883
|
if (isSpeaking && interruptOnSpeech) {
|
|
532
884
|
if (shouldInterrupt(transcript)) {
|
|
533
885
|
val interruptedAt = computeInterruptedAt()
|
|
534
|
-
stopSpeakingInternal()
|
|
535
886
|
lastInterruptedAtSeconds = interruptedAt
|
|
887
|
+
stopSpeakingInternal()
|
|
536
888
|
}
|
|
537
889
|
return
|
|
538
890
|
}
|
|
539
891
|
|
|
540
892
|
if (!isListening) return
|
|
541
893
|
|
|
542
|
-
if (
|
|
894
|
+
if (isFinal) {
|
|
895
|
+
// A real end-of-turn from the recognizer: emit once and clear the
|
|
896
|
+
// pending buffer so the silence monitor doesn't re-finalize the same
|
|
897
|
+
// words (the double-send bug).
|
|
898
|
+
lastTranscript = ""
|
|
899
|
+
lastHeardAtMs = null
|
|
900
|
+
emitFinalOnce(transcript)
|
|
901
|
+
} else {
|
|
543
902
|
lastTranscript = transcript
|
|
544
903
|
lastHeardAtMs = SystemClock.elapsedRealtime()
|
|
904
|
+
notifyListeners("transcript", JSObject().apply {
|
|
905
|
+
put("transcript", transcript)
|
|
906
|
+
put("isFinal", false)
|
|
907
|
+
})
|
|
545
908
|
}
|
|
909
|
+
}
|
|
546
910
|
|
|
911
|
+
/**
 * Emit a final "transcript" event, suppressing immediate duplicates.
 *
 * The transcript is trimmed; blank text is ignored. If the trimmed text
 * equals the previously emitted final and less than 2 seconds have elapsed
 * since that emission, nothing is sent. Otherwise the text and time are
 * recorded and a `transcript` event with `isFinal = true` is dispatched.
 */
private fun emitFinalOnce(transcript: String) {
    val text = transcript.trim()
    if (text.isEmpty()) return

    val now = SystemClock.elapsedRealtime()
    val sameAsLast = text == lastEmittedFinal
    val withinDedupWindow = now - lastEmittedFinalAtMs < 2000L
    if (sameAsLast && withinDedupWindow) return

    lastEmittedFinal = text
    lastEmittedFinalAtMs = now

    val event = JSObject()
    event.put("transcript", text)
    event.put("isFinal", true)
    notifyListeners("transcript", event)
}
|
|
552
928
|
|
|
553
929
|
/**
 * Decide whether heard speech should barge in on the agent's TTS.
 *
 * Tuned to avoid FALSE interrupts (which cut the reply mid-sentence): a
 * one-word ASR blip or the agent's own voice bleeding back into the mic must
 * not interrupt.
 *
 * Rules, in order:
 *  1. Fewer than two words AND fewer than 8 characters -> no interrupt.
 *  2. Nothing is recorded as being spoken -> interrupt.
 *  3. The heard text is a substring of the spoken text -> echo, no interrupt.
 *  4. At least 60% of the heard words occur in the spoken text -> echo, no
 *     interrupt. A heard word counts when it equals a spoken word, or, for
 *     words of 4+ characters, when it is part of a spoken word (covers
 *     inflections such as "interrupt" inside "interrupted"). Short words are
 *     matched whole because "a", "i", "it", "am" are substrings of almost any
 *     sentence and would otherwise mark real user speech as echo.
 *  5. Otherwise -> interrupt.
 *
 * @param transcript text reported by the recognizer while TTS is playing.
 * @return true when playback should be interrupted.
 */
private fun shouldInterrupt(transcript: String): Boolean {
    val trimmed = transcript.trim()
    val lower = trimmed.lowercase()
    val words = lower.split(Regex("\\s+")).filter { it.isNotBlank() }
    // Need real intent: at least two words, or one long word (>= 8 chars).
    if (words.size < 2 && trimmed.length < 8) return false
    val spoken = lastSpokenText?.lowercase() ?: return true
    // Exact echo of what we're saying -> speaker bleed, not the user.
    if (spoken.contains(lower)) return false

    // Fuzzy echo: compare word-by-word, ignoring punctuation on both sides.
    val separators = Regex("[^\\p{L}\\p{N}']+")
    val spokenWords = spoken.split(separators).filter { it.isNotEmpty() }.toSet()
    val heardWords = lower.split(separators).filter { it.isNotEmpty() }
    val minStemLength = 4
    val echoed = heardWords.count { heard ->
        heard in spokenWords ||
            (heard.length >= minStemLength && spokenWords.any { it.contains(heard) })
    }
    if (heardWords.isNotEmpty() && echoed.toDouble() / heardWords.size >= 0.6) {
        return false
    }
    return true
}
|
|
564
953
|
|
|
@@ -588,6 +977,7 @@ class TalkModePlugin : Plugin() {
|
|
|
588
977
|
private suspend fun speakInternal(
|
|
589
978
|
text: String,
|
|
590
979
|
forceSystemTts: Boolean,
|
|
980
|
+
useLocalInferenceTts: Boolean,
|
|
591
981
|
directive: JSObject?,
|
|
592
982
|
call: PluginCall
|
|
593
983
|
) {
|
|
@@ -596,6 +986,7 @@ class TalkModePlugin : Plugin() {
|
|
|
596
986
|
lastSpokenText = text
|
|
597
987
|
speakStartTimeMs = SystemClock.elapsedRealtime()
|
|
598
988
|
pcmStopRequested.set(false)
|
|
989
|
+
lastInterruptedAtSeconds = null
|
|
599
990
|
setState("speaking", "Speaking")
|
|
600
991
|
|
|
601
992
|
val effectiveVoiceId = directive.stringOrNull("voiceId")?.let(::resolveVoiceAlias) ?: voiceId
|
|
@@ -603,27 +994,74 @@ class TalkModePlugin : Plugin() {
|
|
|
603
994
|
|
|
604
995
|
notifyListeners("speaking", JSObject().apply {
|
|
605
996
|
put("text", text)
|
|
606
|
-
put(
|
|
997
|
+
put(
|
|
998
|
+
"isSystemTts",
|
|
999
|
+
!useLocalInferenceTts &&
|
|
1000
|
+
(forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
|
|
1001
|
+
)
|
|
607
1002
|
})
|
|
608
1003
|
|
|
609
1004
|
// Stop listening during speech (we keep recognizer for interrupt detection)
|
|
610
1005
|
mainHandler.post { recognizer?.stopListening() }
|
|
611
1006
|
ensureInterruptListener()
|
|
612
1007
|
|
|
613
|
-
//
|
|
614
|
-
|
|
1008
|
+
// Ensure the communication-mode session + audio focus are active even
|
|
1009
|
+
// for a standalone speak() that wasn't preceded by start().
|
|
1010
|
+
configureVoiceAudioSession()
|
|
1011
|
+
// Re-assert loudspeaker routing right before playback. configureVoice…
|
|
1012
|
+
// only routes on the FIRST activation; if the session was already up (the
|
|
1013
|
+
// STT path opened it) the speaker route may have drifted, leaving TTS on
|
|
1014
|
+
// the earpiece. Re-route here so replies are audible out the speaker.
|
|
1015
|
+
audioManager?.let { routeVoiceOutput(it) }
|
|
615
1016
|
|
|
616
1017
|
try {
|
|
617
|
-
val
|
|
1018
|
+
val canUseLocalInference = useLocalInferenceTts && !forceSystemTts
|
|
1019
|
+
val canUseElevenLabs = !canUseLocalInference &&
|
|
1020
|
+
!forceSystemTts &&
|
|
618
1021
|
!effectiveApiKey.isNullOrEmpty() &&
|
|
619
1022
|
!effectiveVoiceId.isNullOrEmpty()
|
|
620
1023
|
|
|
621
|
-
if (
|
|
1024
|
+
if (canUseLocalInference) {
|
|
1025
|
+
try {
|
|
1026
|
+
streamAndPlayLocalInferenceTts(text, directive)
|
|
1027
|
+
|
|
1028
|
+
if (!pcmStopRequested.get()) {
|
|
1029
|
+
call.resolve(JSObject().apply {
|
|
1030
|
+
put("completed", true)
|
|
1031
|
+
put("interrupted", false)
|
|
1032
|
+
put("usedSystemTts", false)
|
|
1033
|
+
})
|
|
1034
|
+
} else {
|
|
1035
|
+
call.resolve(JSObject().apply {
|
|
1036
|
+
put("completed", false)
|
|
1037
|
+
put("interrupted", true)
|
|
1038
|
+
put("usedSystemTts", false)
|
|
1039
|
+
lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
|
|
1040
|
+
})
|
|
1041
|
+
}
|
|
1042
|
+
} catch (e: Exception) {
|
|
1043
|
+
if (pcmStopRequested.get()) {
|
|
1044
|
+
call.resolve(JSObject().apply {
|
|
1045
|
+
put("completed", false)
|
|
1046
|
+
put("interrupted", true)
|
|
1047
|
+
put("usedSystemTts", false)
|
|
1048
|
+
})
|
|
1049
|
+
} else {
|
|
1050
|
+
// The on-device OmniVoice TTS assets aren't always staged
|
|
1051
|
+
// (it 502s "TEXT_TO_SPEECH not available"). Rather than go
|
|
1052
|
+
// silent — the JS browser-SpeechSynthesis fallback doesn't
|
|
1053
|
+
// exist in the Android WebView — fall back to the platform
|
|
1054
|
+
// TextToSpeech so replies are always spoken aloud.
|
|
1055
|
+
Log.w(TAG, "Local inference TTS failed, falling back to system TTS", e)
|
|
1056
|
+
speakWithSystemTts(text, call)
|
|
1057
|
+
}
|
|
1058
|
+
}
|
|
1059
|
+
} else if (canUseElevenLabs) {
|
|
622
1060
|
try {
|
|
623
1061
|
val request = buildElevenLabsRequest(text, directive)
|
|
624
1062
|
streamAndPlayPcm(
|
|
625
|
-
voiceId = effectiveVoiceId
|
|
626
|
-
apiKey = effectiveApiKey
|
|
1063
|
+
voiceId = effectiveVoiceId,
|
|
1064
|
+
apiKey = effectiveApiKey,
|
|
627
1065
|
request = request
|
|
628
1066
|
)
|
|
629
1067
|
|
|
@@ -665,13 +1103,16 @@ class TalkModePlugin : Plugin() {
|
|
|
665
1103
|
put("error", e.message ?: "Speak failed")
|
|
666
1104
|
})
|
|
667
1105
|
} finally {
|
|
1106
|
+
val wasInterrupted = pcmStopRequested.get()
|
|
1107
|
+
val interruptedAt = lastInterruptedAtSeconds
|
|
668
1108
|
isSpeaking = false
|
|
669
1109
|
pcmStopRequested.set(false)
|
|
670
|
-
abandonAudioFocus()
|
|
671
1110
|
|
|
672
1111
|
notifyListeners("speakComplete", JSObject().apply {
|
|
673
|
-
put("completed", !
|
|
674
|
-
|
|
1112
|
+
put("completed", !wasInterrupted)
|
|
1113
|
+
if (wasInterrupted) {
|
|
1114
|
+
interruptedAt?.let { put("interruptedAt", it) }
|
|
1115
|
+
}
|
|
675
1116
|
})
|
|
676
1117
|
|
|
677
1118
|
if (enabled) {
|
|
@@ -679,6 +1120,8 @@ class TalkModePlugin : Plugin() {
|
|
|
679
1120
|
setState("listening", "Listening")
|
|
680
1121
|
mainHandler.post { startListeningInternal(markListening = true) }
|
|
681
1122
|
} else {
|
|
1123
|
+
// Standalone speak (no active conversation): release the session.
|
|
1124
|
+
releaseVoiceAudioSession()
|
|
682
1125
|
setState("idle", "Off")
|
|
683
1126
|
}
|
|
684
1127
|
}
|
|
@@ -753,6 +1196,363 @@ class TalkModePlugin : Plugin() {
|
|
|
753
1196
|
return if (value == null || value === JSONObject.NULL) null else value.toString()
|
|
754
1197
|
}
|
|
755
1198
|
|
|
1199
|
+
/**
 * Description of a PCM audio stream handed to the native player.
 *
 * @property sampleRate samples per second.
 * @property channels number of audio channels.
 * @property bitsPerSample bit depth of one sample.
 * @property dataBytes byte count of the audio payload — presumably the WAV
 *   data-chunk length; confirm against the code that parses the header.
 */
private data class PcmStreamFormat(
    val sampleRate: Int,
    val channels: Int,
    val bitsPerSample: Int,
    val dataBytes: Int,
)
|
|
1205
|
+
|
|
1206
|
+
/**
 * Synthesize [text] with local inference and play it through an AudioTrack.
 *
 * Runs on the IO dispatcher. The bionic Kokoro path is tried first; only when
 * it reports that it did not handle the request is the HTTP endpoint used.
 * On the HTTP path the request payload is posted, a status of 400 or above is
 * raised as IllegalStateException (including the error body), and the
 * response is parsed as WAV and written to a PCM track. A "playbackStart"
 * event with provider "local-inference" is emitted before audio is written.
 *
 * The agent currently returns a buffered WAV, but keeping playback in
 * AudioTrack means this path is ready for a chunked PCM/WAV response without
 * going back through WebView decodeAudioData.
 *
 * The PCM track and the connection are always cleaned up, whether playback
 * completes, is interrupted, or fails.
 */
private suspend fun streamAndPlayLocalInferenceTts(
    text: String,
    directive: JSObject?
) = withContext(Dispatchers.IO) {
    pcmStopRequested.set(false)

    // Prefer the in-process fused Kokoro voice via the bionic inference host;
    // fall through to the HTTP agent endpoint only if that path declined.
    val handledByBionicHost = streamAndPlayBionicKokoroTts(text, directive)
    if (handledByBionicHost) {
        return@withContext
    }

    val conn = openLocalInferenceTtsConnection()
    // Published so a stop request can reach the in-flight connection.
    activePcmConnection = conn
    try {
        val requestBody = buildLocalInferenceTtsPayload(text, directive).toByteArray(Charsets.UTF_8)
        conn.outputStream.use { out -> out.write(requestBody) }

        val status = conn.responseCode
        if (status >= 400) {
            val errBody = conn.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
            throw IllegalStateException("Local inference TTS error: $status $errBody")
        }

        BufferedInputStream(conn.inputStream).use { wav ->
            val format = readWavPcmFormat(wav)
            val track = createPcmAudioTrack(format)
            pcmTrack = track
            track.play()

            Log.d(
                TAG,
                "Local inference PCM play start sampleRate=${format.sampleRate} channels=${format.channels}"
            )
            val startEvent = JSObject()
            startEvent.put("provider", "local-inference")
            startEvent.put("sampleRate", format.sampleRate)
            startEvent.put("channels", format.channels)
            notifyListeners("playbackStart", startEvent)

            val framesWritten = writePcmStreamToTrack(wav, track, format)
            drainPcmTrack(track, framesWritten, format.sampleRate)
            // Skip stop() when a stop was requested during playback.
            val interrupted = pcmStopRequested.get()
            if (!interrupted) {
                track.stop()
            }
            Log.d(TAG, "Local inference PCM play done frames=$framesWritten")
        }
    } finally {
        cleanupPcmTrack()
        // Only clear the shared reference if it still points at our connection.
        if (activePcmConnection === conn) {
            activePcmConnection = null
        }
        conn.disconnect()
    }
}
|
|
1266
|
+
|
|
1267
|
+
/**
 * Synthesize + play with the fused Kokoro-82M head in the bionic inference
 * host (ElizaBionicInferenceServer, op "tts") over its abstract-namespace
 * UDS. The host loads the same libelizainference that runs GPU text and
 * synthesizes Kokoro PCM in-process — no musl agent, no HTTP, no 502 → no
 * fallback to the platform TextToSpeech (the bug this fixes: the app was
 * speaking with the Android system voice).
 *
 * Wire format: a big-endian 4-byte length prefix followed by a UTF-8 JSON
 * document, in both directions. The response carries `ok`, `sampleRate` and
 * `pcmBase64` (little-endian fp32 mono samples).
 *
 * @param text the utterance; blank text is not sent to the host.
 * @param directive optional overrides; only `speed` is read here.
 * @return true when the host handled the request (including when playback was
 *   stopped early); false when the text is blank or the host is unreachable, so
 *   the caller can fall through to another backend.
 * @throws IllegalStateException when the host is reachable but returns a bad
 *   frame length, an error response, or zero samples.
 */
private suspend fun streamAndPlayBionicKokoroTts(
    text: String,
    directive: JSObject?
): Boolean = withContext(Dispatchers.IO) {
    val trimmed = text.trim()
    if (trimmed.isEmpty()) return@withContext false
    // Same rule as the HTTP payload builder: only a finite, positive speed is
    // honoured; anything else falls back to the neutral 1.0.
    val speed = (directive?.optDouble("speed", 1.0) ?: 1.0)
        .toFloat()
        .takeIf { it.isFinite() && it > 0f }
        ?: 1.0f
    val sock = LocalSocket()
    try {
        sock.connect(
            LocalSocketAddress(BIONIC_INFER_SOCKET, LocalSocketAddress.Namespace.ABSTRACT)
        )
    } catch (e: Exception) {
        Log.d(TAG, "bionic Kokoro TTS host unreachable: ${e.message}")
        try { sock.close() } catch (_: Exception) {}
        return@withContext false
    }
    // NOTE(review): no read timeout is set on this socket, so a hung host blocks
    // the read below indefinitely — consider a soTimeout matching the HTTP path.
    try {
        val req = JSONObject().apply {
            put("op", "tts")
            put("text", trimmed)
            put("speed", speed.toDouble())
        }.toString().toByteArray(Charsets.UTF_8)
        DataOutputStream(sock.outputStream).apply {
            writeInt(req.size) // big-endian length prefix
            write(req)
            flush()
        }
        val din = DataInputStream(sock.inputStream)
        val len = din.readInt()
        // Reject nonsensical or oversized frames before allocating the buffer.
        if (len <= 0 || len > 64 * 1024 * 1024) {
            throw IllegalStateException("bionic TTS bad frame length $len")
        }
        val respBytes = ByteArray(len)
        din.readFully(respBytes)
        val resp = JSONObject(String(respBytes, Charsets.UTF_8))
        if (!resp.optBoolean("ok", false)) {
            throw IllegalStateException("bionic TTS error: ${resp.optString("error")}")
        }
        // A stop may have arrived while the host was synthesizing; do not create
        // a track or announce playback for an utterance that was already cancelled.
        if (pcmStopRequested.get()) {
            Log.d(TAG, "bionic Kokoro TTS stopped before playback")
            return@withContext true
        }
        val sampleRate = resp.optInt("sampleRate", 24000)
        val pcmF32 = Base64.decode(resp.getString("pcmBase64"), Base64.NO_WRAP)
        // fp32 LE → int16 PCM (the play path is ENCODING_PCM_16BIT).
        val fb = ByteBuffer.wrap(pcmF32).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer()
        val nSamples = fb.remaining()
        if (nSamples == 0) {
            throw IllegalStateException("bionic TTS returned 0 samples")
        }
        val pcm16 = ByteArray(nSamples * 2)
        val ob = ByteBuffer.wrap(pcm16).order(ByteOrder.LITTLE_ENDIAN)
        for (i in 0 until nSamples) {
            // Scale to the int16 range and clamp so out-of-range samples saturate.
            val s = (fb.get(i) * 32767f).coerceIn(-32768f, 32767f).toInt().toShort()
            ob.putShort(s)
        }
        val format = PcmStreamFormat(sampleRate, 1, 16, pcm16.size)
        val track = createPcmAudioTrack(format)
        pcmTrack = track
        track.play()
        notifyListeners("playbackStart", JSObject().apply {
            put("provider", "local-inference")
            put("sampleRate", sampleRate)
            put("channels", 1)
        })
        val framesWritten = writePcmStreamToTrack(
            BufferedInputStream(ByteArrayInputStream(pcm16)), track, format
        )
        drainPcmTrack(track, framesWritten, sampleRate)
        if (!pcmStopRequested.get()) track.stop()
        Log.d(TAG, "bionic Kokoro TTS played $nSamples samples @ $sampleRate Hz")
        true
    } finally {
        cleanupPcmTrack()
        try { sock.close() } catch (_: Exception) {}
    }
}
|
|
1350
|
+
|
|
1351
|
+
/**
 * Prepare (but do not yet send) an authenticated POST to the local agent's TTS endpoint.
 *
 * The bearer token is read from `auth/local-agent-token` under the app's files directory.
 *
 * @return a connection configured for a JSON request body and a WAV response.
 * @throws IllegalStateException when the token file is absent or blank.
 */
private fun openLocalInferenceTtsConnection(): HttpURLConnection {
    val tokenFile = File(context.filesDir, "auth/local-agent-token")
    val token = if (tokenFile.isFile) tokenFile.readText().trim() else ""
    check(token.isNotEmpty()) { "Local agent auth token is missing" }

    return (URL(LOCAL_INFERENCE_TTS_URL).openConnection() as HttpURLConnection).apply {
        requestMethod = "POST"
        connectTimeout = 30_000
        readTimeout = 180_000
        setRequestProperty("Authorization", "Bearer $token")
        setRequestProperty("Content-Type", "application/json")
        setRequestProperty("Accept", "audio/wav")
        doOutput = true
    }
}
|
|
1368
|
+
|
|
1369
|
+
/**
 * Build the JSON request body for the local agent TTS endpoint.
 *
 * `text` is always present. `voiceId`, `voice`, `modelId` and `model` are copied
 * from [directive] when they resolve to a string, and `speed` is copied only
 * when it is a finite, positive number.
 *
 * @return the serialized JSON document.
 */
private fun buildLocalInferenceTtsPayload(text: String, directive: JSObject?): String {
    val body = JSONObject()
    body.put("text", text)

    for (key in arrayOf("voiceId", "voice", "modelId", "model")) {
        val value = directive.stringOrNull(key) ?: continue
        body.put(key, value)
    }

    // NaN stands in for "absent" so one check covers missing and invalid values.
    val speed = directive?.optDouble("speed", Double.NaN) ?: Double.NaN
    if (speed.isFinite() && speed > 0.0) {
        body.put("speed", speed)
    }
    return body.toString()
}
|
|
1382
|
+
|
|
1383
|
+
/**
 * Read exactly [size] bytes from [input], looping over short reads.
 *
 * @return a new array holding the bytes read.
 * @throws IllegalStateException when the stream ends before [size] bytes arrive.
 */
private fun readExactly(input: BufferedInputStream, size: Int): ByteArray {
    val result = ByteArray(size)
    var filled = 0
    while (filled < size) {
        val count = input.read(result, filled, size - filled)
        check(count >= 0) { "Unexpected end of WAV stream" }
        filled += count
    }
    return result
}
|
|
1395
|
+
|
|
1396
|
+
/**
 * Discard exactly [count] bytes from [input].
 *
 * @throws IllegalStateException when the stream ends before [count] bytes were discarded.
 */
private fun skipFully(input: BufferedInputStream, count: Int) {
    var left = count
    while (left > 0) {
        val jumped = input.skip(left.toLong()).toInt()
        if (jumped > 0) {
            left -= jumped
        } else {
            // skip() made no progress: read a single byte to tell a stall from end-of-stream.
            check(input.read() >= 0) { "Unexpected end of WAV stream" }
            left -= 1
        }
    }
}
|
|
1410
|
+
|
|
1411
|
+
/**
 * Decode the two bytes at [offset] as an unsigned little-endian 16-bit value.
 *
 * @return a value in 0..65535.
 */
private fun littleEndianShort(bytes: ByteArray, offset: Int): Int {
    val low = bytes[offset].toInt() and 0xff
    val high = bytes[offset + 1].toInt() and 0xff
    return (high shl 8) or low
}
|
|
1415
|
+
|
|
1416
|
+
/**
 * Decode the four bytes at [offset] as a little-endian 32-bit value.
 *
 * The result is a signed Int, so values of 0x80000000 and above come back negative.
 */
private fun littleEndianInt(bytes: ByteArray, offset: Int): Int =
    (0..3).fold(0) { acc, index ->
        acc or ((bytes[offset + index].toInt() and 0xff) shl (8 * index))
    }
|
|
1422
|
+
|
|
1423
|
+
/** Return the first four bytes of [bytes] decoded as an ASCII chunk identifier. */
private fun chunkId(bytes: ByteArray): String =
    String(bytes, 0, 4, Charsets.US_ASCII)
|
|
1426
|
+
|
|
1427
|
+
/**
 * Parse a RIFF/WAVE header from [input] and leave the stream positioned at the
 * first byte of the `data` chunk payload.
 *
 * Chunks other than `fmt ` and `data` are skipped (including their odd-size pad
 * byte). Only 16-bit integer PCM with one or two channels is accepted.
 *
 * A `data` chunk size of 0xFFFFFFFF — the placeholder used when the producer
 * streams audio without knowing its length — is reported as [Int.MAX_VALUE] so
 * the writer simply plays until end-of-stream.
 *
 * @return the stream format, with `dataBytes` set to the size of the `data` chunk.
 * @throws IllegalStateException when the stream is not a WAV file, is truncated,
 *   has an invalid chunk size, or uses an unsupported encoding.
 */
private fun readWavPcmFormat(input: BufferedInputStream): PcmStreamFormat {
    // Generous upper bound for a `fmt ` chunk; guards the allocation below
    // against a corrupt size field.
    val maxFmtChunkBytes = 64 * 1024

    val riff = readExactly(input, 12)
    if (
        String(riff, 0, 4, Charsets.US_ASCII) != "RIFF" ||
        String(riff, 8, 4, Charsets.US_ASCII) != "WAVE"
    ) {
        throw IllegalStateException("Local inference TTS returned non-WAV audio")
    }

    var format: PcmStreamFormat? = null
    while (true) {
        val header = readExactly(input, 8)
        val id = chunkId(header)
        val size = littleEndianInt(header, 4)
        // 0xFFFFFFFF decodes to -1; on the data chunk it means "length unknown".
        val unknownLengthData = id == "data" && size == -1
        if (size < 0 && !unknownLengthData) {
            throw IllegalStateException("Invalid WAV chunk size for $id")
        }

        if (id == "fmt ") {
            // Validate before reading so a bogus size never drives the allocation.
            if (size < 16 || size > maxFmtChunkBytes) {
                throw IllegalStateException("Invalid WAV fmt chunk")
            }
            val fmt = readExactly(input, size)
            val audioFormat = littleEndianShort(fmt, 0)
            val channels = littleEndianShort(fmt, 2)
            val sampleRate = littleEndianInt(fmt, 4)
            val bitsPerSample = littleEndianShort(fmt, 14)
            if (audioFormat != 1) {
                throw IllegalStateException("Only PCM WAV is supported, got format=$audioFormat")
            }
            if (bitsPerSample != 16) {
                throw IllegalStateException("Only 16-bit PCM WAV is supported, got bits=$bitsPerSample")
            }
            if (channels !in 1..2 || sampleRate <= 0) {
                throw IllegalStateException("Invalid WAV format sampleRate=$sampleRate channels=$channels")
            }
            format = PcmStreamFormat(sampleRate, channels, bitsPerSample, 0)
            // RIFF chunks are word-aligned: an odd-sized chunk is followed by one pad byte.
            if (size % 2 == 1) skipFully(input, 1)
            continue
        }

        if (id == "data") {
            val parsed = format ?: throw IllegalStateException("WAV data arrived before fmt chunk")
            return parsed.copy(dataBytes = if (unknownLengthData) Int.MAX_VALUE else size)
        }

        skipFully(input, size)
        if (size % 2 == 1) skipFully(input, 1)
    }
}
|
|
1477
|
+
|
|
1478
|
+
/**
 * Create a streaming 16-bit PCM AudioTrack for [format] using the voice audio attributes.
 *
 * The buffer is twice the platform minimum, but never smaller than 8 KiB.
 *
 * @throws IllegalStateException for an unsupported channel count, an invalid
 *   minimum buffer size, or a track that fails to initialize.
 */
private fun createPcmAudioTrack(format: PcmStreamFormat): AudioTrack {
    val channelMask = when (format.channels) {
        1 -> AudioFormat.CHANNEL_OUT_MONO
        2 -> AudioFormat.CHANNEL_OUT_STEREO
        else -> error("Unsupported PCM channel count ${format.channels}")
    }

    val minBuffer = AudioTrack.getMinBufferSize(
        format.sampleRate,
        channelMask,
        AudioFormat.ENCODING_PCM_16BIT
    )
    check(minBuffer > 0) { "AudioTrack buffer size invalid: $minBuffer" }
    val bufferSize = (minBuffer * 2).coerceAtLeast(8 * 1024)

    val pcmFormat = AudioFormat.Builder()
        .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
        .setSampleRate(format.sampleRate)
        .setChannelMask(channelMask)
        .build()

    val track = AudioTrack.Builder()
        .setAudioAttributes(voiceAudioAttributes())
        .setAudioFormat(pcmFormat)
        .setBufferSizeInBytes(bufferSize)
        .setTransferMode(AudioTrack.MODE_STREAM)
        .build()

    if (track.state == AudioTrack.STATE_INITIALIZED) {
        return track
    }
    // Free the native resources of a track that did not come up.
    track.release()
    throw IllegalStateException("AudioTrack init failed")
}
|
|
1512
|
+
|
|
1513
|
+
/**
 * Copy up to `format.dataBytes` bytes of PCM from [input] into [track].
 *
 * AudioTrack only accepts whole frames: a write smaller than one frame returns
 * 0. Stream reads can end in the middle of a frame, so any trailing partial
 * frame is held back and prepended to the next read instead of being handed to
 * the track. A partial frame still pending at end-of-stream is dropped.
 *
 * The loop ends early when a stop is requested or the stream ends.
 *
 * @return the number of frames written to the track.
 * @throws IllegalStateException when the track reports a write error or makes no progress.
 */
private fun writePcmStreamToTrack(
    input: BufferedInputStream,
    track: AudioTrack,
    format: PcmStreamFormat
): Long {
    val bytesPerFrame = format.channels * (format.bitsPerSample / 8)
    // Alignment unit for writes; falls back to 1 so a degenerate format cannot divide by zero.
    val alignment = bytesPerFrame.coerceAtLeast(1)
    var bytesWrittenTotal = 0L
    var remainingBytes = format.dataBytes
    val buffer = ByteArray(8 * 1024)
    // Bytes of an incomplete frame carried over at the start of the buffer.
    var carried = 0
    while (remainingBytes > 0) {
        if (pcmStopRequested.get()) break
        val requestBytes = minOf(remainingBytes, buffer.size - carried)
        val bytesRead = input.read(buffer, carried, requestBytes)
        if (bytesRead <= 0) break
        remainingBytes -= bytesRead

        val available = carried + bytesRead
        val writable = available - (available % alignment)
        var offset = 0
        while (offset < writable) {
            if (pcmStopRequested.get()) break
            val wrote = track.write(buffer, offset, writable - offset)
            if (wrote <= 0) {
                throw IllegalStateException("AudioTrack write failed: $wrote")
            }
            offset += wrote
            bytesWrittenTotal += wrote.toLong()
        }
        // The inner loop only exits early when a stop was requested.
        if (offset < writable) break

        carried = available - writable
        if (carried > 0 && writable > 0) {
            System.arraycopy(buffer, writable, buffer, 0, carried)
        }
    }
    return if (bytesPerFrame > 0) bytesWrittenTotal / bytesPerFrame else 0L
}
|
|
1542
|
+
|
|
1543
|
+
/**
 * Block until [track] has played [framesWritten] frames, a stop is requested,
 * or a deadline passes.
 *
 * The deadline is the audio duration (capped at 30 s) plus one second of slack,
 * so a stalled track cannot hold the caller forever. Polls every 20 ms.
 */
private fun drainPcmTrack(track: AudioTrack, framesWritten: Long, sampleRate: Int) {
    if (framesWritten <= 0L || sampleRate <= 0) return
    val maxDrainMs = (framesWritten * 1000L / sampleRate).coerceAtMost(30_000L) + 1_000L
    val deadline = SystemClock.elapsedRealtime() + maxDrainMs
    while (
        !pcmStopRequested.get() &&
        // The head position is a wrapping unsigned 32-bit counter returned as an
        // Int; mask it so values past 2^31 do not compare as negative.
        (track.playbackHeadPosition.toLong() and 0xFFFF_FFFFL) < framesWritten &&
        SystemClock.elapsedRealtime() < deadline
    ) {
        SystemClock.sleep(20)
    }
}
|
|
1555
|
+
|
|
756
1556
|
/**
|
|
757
1557
|
* Stream PCM audio from ElevenLabs and play via AudioTrack.
|
|
758
1558
|
* Ported from classic TalkModeManager with proper offset-based writes.
|
|
@@ -776,12 +1576,7 @@ class TalkModePlugin : Plugin() {
|
|
|
776
1576
|
|
|
777
1577
|
val bufferSize = max(minBuffer * 2, 8 * 1024)
|
|
778
1578
|
val track = AudioTrack.Builder()
|
|
779
|
-
.setAudioAttributes(
|
|
780
|
-
AudioAttributes.Builder()
|
|
781
|
-
.setUsage(AudioAttributes.USAGE_ASSISTANT)
|
|
782
|
-
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
|
|
783
|
-
.build()
|
|
784
|
-
)
|
|
1579
|
+
.setAudioAttributes(voiceAudioAttributes())
|
|
785
1580
|
.setAudioFormat(
|
|
786
1581
|
AudioFormat.Builder()
|
|
787
1582
|
.setEncoding(AudioFormat.ENCODING_PCM_16BIT)
|
|
@@ -802,6 +1597,7 @@ class TalkModePlugin : Plugin() {
|
|
|
802
1597
|
|
|
803
1598
|
Log.d(TAG, "PCM play start sampleRate=$sampleRate bufferSize=$bufferSize")
|
|
804
1599
|
val conn = openTtsConnection(voiceId, apiKey, request)
|
|
1600
|
+
activePcmConnection = conn
|
|
805
1601
|
try {
|
|
806
1602
|
val payload = buildRequestPayload(request)
|
|
807
1603
|
conn.outputStream.use { it.write(payload.toByteArray()) }
|
|
@@ -845,6 +1641,9 @@ class TalkModePlugin : Plugin() {
|
|
|
845
1641
|
Log.d(TAG, "PCM play done")
|
|
846
1642
|
} finally {
|
|
847
1643
|
cleanupPcmTrack()
|
|
1644
|
+
if (activePcmConnection === conn) {
|
|
1645
|
+
activePcmConnection = null
|
|
1646
|
+
}
|
|
848
1647
|
conn.disconnect()
|
|
849
1648
|
}
|
|
850
1649
|
}
|
|
@@ -970,43 +1769,125 @@ class TalkModePlugin : Plugin() {
|
|
|
970
1769
|
}
|
|
971
1770
|
}
|
|
972
1771
|
|
|
973
|
-
// ──
|
|
1772
|
+
// ── Voice audio session ─────────────────────────────────────────────
|
|
1773
|
+
//
|
|
1774
|
+
// The Android analog of the iOS `.playAndRecord` / `.voiceChat` /
|
|
1775
|
+
// `.defaultToSpeaker` session. Putting the device in MODE_IN_COMMUNICATION
|
|
1776
|
+
// for the whole conversation routes capture + playback through the
|
|
1777
|
+
// telephony path, which engages the platform hardware AEC so TTS coming out
|
|
1778
|
+
// the speaker is cancelled from the mic (the core fix for the mic+speaker
|
|
1779
|
+
// echo loop in hands-free mode). We also hold voice-communication audio
|
|
1780
|
+
// focus and route to the loudspeaker (unless a headset is connected) so
|
|
1781
|
+
// hands-free playback is audible.
|
|
1782
|
+
|
|
1783
|
+
/** Audio attributes shared by the focus request and the PCM tracks: voice-communication usage, speech content. */
private fun voiceAudioAttributes(): AudioAttributes {
    val builder = AudioAttributes.Builder()
    builder.setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION)
    builder.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
    return builder.build()
}
|
|
974
1788
|
|
|
975
|
-
private fun
|
|
1789
|
+
/**
 * Enter the voice audio session; a no-op when it is already active or no
 * AudioManager is available.
 *
 * Saves the current audio mode and speakerphone state for later restoration,
 * requests transient-exclusive audio focus, switches to MODE_IN_COMMUNICATION,
 * routes output and mutes the earcon streams.
 */
private fun configureVoiceAudioSession() {
    if (audioSessionActive) return
    val am = audioManager ?: return

    // Snapshot the state that releaseVoiceAudioSession() puts back.
    savedAudioMode = am.mode
    @Suppress("DEPRECATION")
    savedSpeakerphoneOn = am.isSpeakerphoneOn

    val request = AudioFocusRequest.Builder(AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_EXCLUSIVE)
        .setAudioAttributes(voiceAudioAttributes())
        .setOnAudioFocusChangeListener { focusChange ->
            if (
                focusChange == AudioManager.AUDIOFOCUS_LOSS ||
                focusChange == AudioManager.AUDIOFOCUS_LOSS_TRANSIENT
            ) {
                // Another app took audio; stop speaking if we are.
                if (isSpeaking) stopSpeakingInternal()
            }
        }
        .build()
    audioFocusRequest = request
    val focusResult = am.requestAudioFocus(request)
    if (focusResult != AudioManager.AUDIOFOCUS_REQUEST_GRANTED) {
        // Best effort: the session still proceeds, but make the denial visible.
        Log.w(TAG, "Voice audio focus not granted (result=$focusResult)")
    }

    am.mode = AudioManager.MODE_IN_COMMUNICATION
    routeVoiceOutput(am)
    muteEarconStreams(am)
    audioSessionActive = true
    Log.d(TAG, "Voice audio session active (communication mode)")
}
|
|
1818
|
+
|
|
1819
|
+
/**
 * Mute every stream in `earconStreams` for the duration of the session.
 *
 * Idempotent: does nothing when the streams are already flagged as muted.
 * Failures on individual streams are ignored.
 */
private fun muteEarconStreams(am: AudioManager) {
    if (!earconStreamsMuted) {
        for (streamType in earconStreams) {
            try {
                am.adjustStreamVolume(streamType, AudioManager.ADJUST_MUTE, 0)
            } catch (_: Throwable) {
                // Some OEMs disallow muting certain streams without DND access.
            }
        }
        earconStreamsMuted = true
    }
}
|
|
1831
|
+
|
|
1832
|
+
/**
 * Undo [muteEarconStreams]: unmute every stream in `earconStreams`.
 *
 * Does nothing unless the streams are currently flagged as muted. Failures on
 * individual streams are ignored.
 */
private fun unmuteEarconStreams(am: AudioManager) {
    if (earconStreamsMuted) {
        for (streamType in earconStreams) {
            try {
                am.adjustStreamVolume(streamType, AudioManager.ADJUST_UNMUTE, 0)
            } catch (_: Throwable) {
                // Best effort, mirroring the mute path.
            }
        }
        earconStreamsMuted = false
    }
}
|
|
989
1841
|
|
|
1842
|
+
/**
 * Default playback to the loudspeaker for hands-free use, but let a wired,
 * USB, Bluetooth (classic or LE) headset or a hearing aid win — the iOS
 * `.defaultToSpeaker` semantic.
 *
 * On Android 12+ the built-in speaker is selected as the communication device;
 * when that is unavailable or refused, the legacy speakerphone flag is used.
 */
private fun routeVoiceOutput(am: AudioManager) {
    val hasHeadset = am.getDevices(AudioManager.GET_DEVICES_OUTPUTS).any { device ->
        when (device.type) {
            AudioDeviceInfo.TYPE_WIRED_HEADSET,
            AudioDeviceInfo.TYPE_WIRED_HEADPHONES,
            AudioDeviceInfo.TYPE_USB_HEADSET,
            AudioDeviceInfo.TYPE_BLUETOOTH_SCO,
            AudioDeviceInfo.TYPE_BLUETOOTH_A2DP,
            // Personal outputs that should also take precedence over the loudspeaker.
            AudioDeviceInfo.TYPE_BLE_HEADSET,
            AudioDeviceInfo.TYPE_HEARING_AID -> true
            else -> false
        }
    }
    if (hasHeadset) {
        // Drop any explicit speaker selection so the platform routes to the headset.
        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) am.clearCommunicationDevice()
        @Suppress("DEPRECATION")
        am.isSpeakerphoneOn = false
        return
    }
    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
        val speaker = am.availableCommunicationDevices.firstOrNull {
            it.type == AudioDeviceInfo.TYPE_BUILTIN_SPEAKER
        }
        if (speaker != null && am.setCommunicationDevice(speaker)) return
    }
    // Pre-Android 12, or the explicit device selection did not succeed.
    @Suppress("DEPRECATION")
    am.isSpeakerphoneOn = true
}
|
|
997
1869
|
|
|
998
|
-
private fun
|
|
1870
|
+
/**
 * Leave the voice audio session and restore the state saved when it was entered.
 *
 * Does nothing when no session is active or no AudioManager is available.
 * Unmutes the earcon streams, abandons audio focus, clears the communication
 * device (Android 12+), then restores the speakerphone flag and audio mode.
 */
private fun releaseVoiceAudioSession() {
    if (!audioSessionActive) return
    val am = audioManager ?: return

    unmuteEarconStreams(am)

    val heldFocus = audioFocusRequest
    if (heldFocus != null) {
        am.abandonAudioFocusRequest(heldFocus)
    }
    audioFocusRequest = null

    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
        am.clearCommunicationDevice()
    }
    @Suppress("DEPRECATION")
    am.isSpeakerphoneOn = savedSpeakerphoneOn
    am.mode = savedAudioMode

    audioSessionActive = false
    Log.d(TAG, "Voice audio session released")
}
|
|
1005
1883
|
|
|
1006
1884
|
// ── Cleanup helpers ─────────────────────────────────────────────────
|
|
1007
1885
|
|
|
1008
1886
|
private fun stopSpeakingInternal() {
|
|
1009
1887
|
pcmStopRequested.set(true)
|
|
1888
|
+
val conn = activePcmConnection
|
|
1889
|
+
activePcmConnection = null
|
|
1890
|
+
conn?.disconnect()
|
|
1010
1891
|
cleanupPcmTrack()
|
|
1011
1892
|
systemTts?.stop()
|
|
1012
1893
|
systemTtsPending?.cancel()
|
|
@@ -1162,6 +2043,9 @@ class TalkModePlugin : Plugin() {
|
|
|
1162
2043
|
}
|
|
1163
2044
|
|
|
1164
2045
|
/**
 * Report whether [permission] is granted.
 *
 * RECORD_AUDIO is checked directly against the context; every other permission
 * goes through the plugin's `getPermissionState`.
 */
private fun isPermissionGranted(permission: String): Boolean =
    when (permission) {
        Manifest.permission.RECORD_AUDIO ->
            context.checkSelfPermission(permission) == PackageManager.PERMISSION_GRANTED
        else ->
            getPermissionState(permission) == com.getcapacitor.PermissionState.GRANTED
    }
|
|
1167
2051
|
|
|
@@ -1176,10 +2060,13 @@ class TalkModePlugin : Plugin() {
|
|
|
1176
2060
|
systemTts?.shutdown()
|
|
1177
2061
|
systemTts = null
|
|
1178
2062
|
cleanupPcmTrack()
|
|
2063
|
+
audioFrameRunning.set(false)
|
|
2064
|
+
audioFrameJob?.cancel()
|
|
2065
|
+
releaseAudioRecord()
|
|
1179
2066
|
silenceJob?.cancel()
|
|
1180
2067
|
restartJob?.cancel()
|
|
1181
2068
|
speakingJob?.cancel()
|
|
1182
|
-
|
|
2069
|
+
releaseVoiceAudioSession()
|
|
1183
2070
|
scope.cancel()
|
|
1184
2071
|
}
|
|
1185
2072
|
|