@elizaos/capacitor-talkmode 2.0.0-beta.1 → 2.0.3-beta.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +137 -0
- package/android/build.gradle +16 -2
- package/android/src/main/java/ai/eliza/plugins/talkmode/TalkModePlugin.kt +983 -65
- package/dist/esm/definitions.d.ts +149 -3
- package/dist/esm/definitions.d.ts.map +1 -1
- package/dist/esm/web.d.ts +6 -1
- package/dist/esm/web.d.ts.map +1 -1
- package/dist/esm/web.js +34 -5
- package/dist/esm/web.test.d.ts +2 -0
- package/dist/esm/web.test.d.ts.map +1 -0
- package/dist/esm/web.test.js +137 -0
- package/dist/plugin.cjs.js +34 -5
- package/dist/plugin.cjs.js.map +1 -1
- package/dist/plugin.js +34 -5
- package/dist/plugin.js.map +1 -1
- package/ios/Sources/TalkModePlugin/TalkModePlugin.swift +266 -16
- package/package.json +14 -11
|
@@ -2,10 +2,17 @@ package ai.eliza.plugins.talkmode
|
|
|
2
2
|
|
|
3
3
|
import android.Manifest
|
|
4
4
|
import android.content.Intent
|
|
5
|
+
import android.content.pm.PackageManager
|
|
5
6
|
import android.media.AudioAttributes
|
|
7
|
+
import android.media.AudioDeviceInfo
|
|
8
|
+
import android.media.AudioFocusRequest
|
|
6
9
|
import android.media.AudioFormat
|
|
7
10
|
import android.media.AudioManager
|
|
11
|
+
import android.media.AudioRecord
|
|
8
12
|
import android.media.AudioTrack
|
|
13
|
+
import android.media.MediaRecorder
|
|
14
|
+
import android.util.Base64
|
|
15
|
+
import android.os.Build
|
|
9
16
|
import android.os.Bundle
|
|
10
17
|
import android.os.Handler
|
|
11
18
|
import android.os.Looper
|
|
@@ -25,7 +32,15 @@ import com.getcapacitor.annotation.CapacitorPlugin
|
|
|
25
32
|
import com.getcapacitor.annotation.Permission
|
|
26
33
|
import com.getcapacitor.annotation.PermissionCallback
|
|
27
34
|
import kotlinx.coroutines.*
|
|
35
|
+
import android.net.LocalSocket
|
|
36
|
+
import android.net.LocalSocketAddress
|
|
28
37
|
import java.io.BufferedInputStream
|
|
38
|
+
import java.io.ByteArrayInputStream
|
|
39
|
+
import java.io.DataInputStream
|
|
40
|
+
import java.io.DataOutputStream
|
|
41
|
+
import java.nio.ByteBuffer
|
|
42
|
+
import java.nio.ByteOrder
|
|
43
|
+
import java.io.File
|
|
29
44
|
import java.net.HttpURLConnection
|
|
30
45
|
import java.net.URL
|
|
31
46
|
import java.util.Locale
|
|
@@ -45,6 +60,15 @@ class TalkModePlugin : Plugin() {
|
|
|
45
60
|
private const val TAG = "TalkMode"
|
|
46
61
|
private const val DEFAULT_MODEL_ID = "eleven_flash_v2_5"
|
|
47
62
|
private const val DEFAULT_OUTPUT_FORMAT = "pcm_24000"
|
|
63
|
+
private const val LOCAL_INFERENCE_TTS_URL = "http://127.0.0.1:31337/api/tts/local-inference"
|
|
64
|
+
// Abstract-namespace UDS of ElizaBionicInferenceServer (the bionic app
|
|
65
|
+
// process that has libelizainference loaded). Kept in sync with
|
|
66
|
+
// BIONIC_INFERENCE_SOCKET_NAME in ElizaAgentService.
|
|
67
|
+
private const val BIONIC_INFER_SOCKET = "eliza_bionic_infer_v1"
|
|
68
|
+
// 16 kHz mono is the rate VAD / diarizer / wake-word models expect; 20 ms
|
|
69
|
+
// (320 samples) is the standard VAD frame size.
|
|
70
|
+
private const val DEFAULT_FRAME_SAMPLE_RATE = 16000
|
|
71
|
+
private const val DEFAULT_FRAME_MS = 20
|
|
48
72
|
}
|
|
49
73
|
|
|
50
74
|
private val mainHandler = Handler(Looper.getMainLooper())
|
|
@@ -60,11 +84,19 @@ class TalkModePlugin : Plugin() {
|
|
|
60
84
|
private var isListening = false
|
|
61
85
|
private var listeningMode = false
|
|
62
86
|
private var stopRequested = false
|
|
87
|
+
// Consecutive ERROR_NO_MATCH/SPEECH_TIMEOUT count, for exponential restart
|
|
88
|
+
// backoff so an idle always-on session settles instead of re-arming (and,
|
|
89
|
+
// with the system recognizer, beeping) every ~600ms when nobody is talking.
|
|
90
|
+
private var consecutiveNoMatch = 0
|
|
63
91
|
private var restartJob: Job? = null
|
|
64
92
|
private var lastTranscript = ""
|
|
65
93
|
private var lastHeardAtMs: Long? = null
|
|
66
94
|
private var silenceJob: Job? = null
|
|
67
95
|
private val silenceWindowMs = 700L
|
|
96
|
+
// The recognizer's own onResults AND our silence monitor can both finalize
|
|
97
|
+
// the same utterance; dedup so a turn is emitted (and sent) exactly once.
|
|
98
|
+
private var lastEmittedFinal = ""
|
|
99
|
+
private var lastEmittedFinalAtMs = 0L
|
|
68
100
|
|
|
69
101
|
// TTS
|
|
70
102
|
private var systemTts: TextToSpeech? = null
|
|
@@ -79,10 +111,37 @@ class TalkModePlugin : Plugin() {
|
|
|
79
111
|
private var lastSpokenText: String? = null
|
|
80
112
|
private var speakStartTimeMs: Long = 0
|
|
81
113
|
private var lastInterruptedAtSeconds: Double? = null
|
|
114
|
+
@Volatile private var activePcmConnection: HttpURLConnection? = null
|
|
82
115
|
|
|
83
|
-
//
|
|
116
|
+
// Voice audio session (communication-mode routing + focus, mirrors the iOS
|
|
117
|
+
// .playAndRecord/.voiceChat/.defaultToSpeaker session). Held for the whole
|
|
118
|
+
// conversation so the platform AEC has a stable speaker reference to cancel.
|
|
84
119
|
private var audioManager: AudioManager? = null
|
|
85
|
-
private var audioFocusRequest:
|
|
120
|
+
private var audioFocusRequest: AudioFocusRequest? = null
|
|
121
|
+
private var audioSessionActive = false
|
|
122
|
+
private var savedAudioMode = AudioManager.MODE_NORMAL
|
|
123
|
+
private var savedSpeakerphoneOn = false
|
|
124
|
+
// Streams we mute for the session to suppress the platform recognizer's
|
|
125
|
+
// start/stop earcons (the "on/off" beeps heard as it re-arms continuously).
|
|
126
|
+
// TTS plays on STREAM_VOICE_CALL (USAGE_VOICE_COMMUNICATION) so it stays
|
|
127
|
+
// audible. Tracked so we only unmute streams we muted.
|
|
128
|
+
private val earconStreams = intArrayOf(
|
|
129
|
+
AudioManager.STREAM_MUSIC,
|
|
130
|
+
AudioManager.STREAM_SYSTEM,
|
|
131
|
+
AudioManager.STREAM_NOTIFICATION,
|
|
132
|
+
)
|
|
133
|
+
private var earconStreamsMuted = false
|
|
134
|
+
|
|
135
|
+
// Raw PCM frame capture (diarization / VAD / wake-word source). Opt-in and
|
|
136
|
+
// mutually exclusive with SpeechRecognizer on the mic: Android only lets one
|
|
137
|
+
// capture client own a given input source at a time, so starting frame
|
|
138
|
+
// capture SUSPENDS any active SpeechRecognizer and stopping it resumes STT.
|
|
139
|
+
private var audioRecord: AudioRecord? = null
|
|
140
|
+
private var audioFrameJob: Job? = null
|
|
141
|
+
private val audioFrameRunning = AtomicBoolean(false)
|
|
142
|
+
private var sttSuspendedForFrames = false
|
|
143
|
+
private var lastFrameSampleRate = DEFAULT_FRAME_SAMPLE_RATE
|
|
144
|
+
private var lastFrameSamples = 0
|
|
86
145
|
|
|
87
146
|
// Config
|
|
88
147
|
private var apiKey: String? = null
|
|
@@ -106,6 +165,7 @@ class TalkModePlugin : Plugin() {
|
|
|
106
165
|
|
|
107
166
|
/**
 * Recognizer callback fired when the user starts speaking. Clears the
 * consecutive no-match counter so the restart backoff starts over, and
 * logs the event.
 */
override fun onBeginningOfSpeech() {
    // Speech was detected: reset the backoff counter used by onError.
    consecutiveNoMatch = 0
    Log.d(TAG, "Beginning of speech")
}
|
|
110
170
|
|
|
111
171
|
override fun onRmsChanged(rmsdB: Float) {}
|
|
@@ -142,24 +202,34 @@ class TalkModePlugin : Plugin() {
|
|
|
142
202
|
return
|
|
143
203
|
}
|
|
144
204
|
|
|
145
|
-
// Don't notify error for no-match / speech-timeout, just restart
|
|
146
|
-
|
|
147
|
-
|
|
205
|
+
// Don't notify error for no-match / speech-timeout, just restart.
|
|
206
|
+
// These fire continuously when the always-on session hears only
|
|
207
|
+
// silence, so back off exponentially (600ms → 8s cap) instead of
|
|
208
|
+
// re-arming the recognizer every 600ms. onBeginningOfSpeech /
|
|
209
|
+
// onResults reset the counter the moment real speech arrives.
|
|
210
|
+
if (error == SpeechRecognizer.ERROR_NO_MATCH ||
|
|
211
|
+
error == SpeechRecognizer.ERROR_SPEECH_TIMEOUT
|
|
148
212
|
) {
|
|
213
|
+
consecutiveNoMatch++
|
|
214
|
+
scheduleRestart(
|
|
215
|
+
delayMs = minOf(600L * (1L shl minOf(consecutiveNoMatch, 4)), 8000L),
|
|
216
|
+
)
|
|
217
|
+
} else {
|
|
218
|
+
consecutiveNoMatch = 0
|
|
149
219
|
notifyListeners("error", JSObject().apply {
|
|
150
220
|
put("code", "recognition_error")
|
|
151
221
|
put("message", errorMsg)
|
|
152
222
|
put("recoverable", true)
|
|
153
223
|
})
|
|
224
|
+
scheduleRestart(delayMs = 600)
|
|
154
225
|
}
|
|
155
|
-
|
|
156
|
-
scheduleRestart(delayMs = 600)
|
|
157
226
|
}
|
|
158
227
|
|
|
159
228
|
override fun onResults(results: Bundle?) {
|
|
160
229
|
val matches = results?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION)
|
|
161
230
|
val transcript = matches?.firstOrNull()?.trim() ?: ""
|
|
162
231
|
if (transcript.isNotEmpty()) {
|
|
232
|
+
consecutiveNoMatch = 0
|
|
163
233
|
handleTranscript(transcript, isFinal = true)
|
|
164
234
|
}
|
|
165
235
|
scheduleRestart()
|
|
@@ -189,6 +259,7 @@ class TalkModePlugin : Plugin() {
|
|
|
189
259
|
systemTtsReady = status == TextToSpeech.SUCCESS
|
|
190
260
|
if (systemTtsReady) {
|
|
191
261
|
systemTts?.language = Locale.getDefault()
|
|
262
|
+
systemTts?.setAudioAttributes(voiceAudioAttributes())
|
|
192
263
|
systemTts?.setOnUtteranceProgressListener(object : UtteranceProgressListener() {
|
|
193
264
|
override fun onStart(id: String?) {}
|
|
194
265
|
|
|
@@ -270,14 +341,13 @@ class TalkModePlugin : Plugin() {
|
|
|
270
341
|
enabled = true
|
|
271
342
|
stopRequested = false
|
|
272
343
|
listeningMode = true
|
|
344
|
+
configureVoiceAudioSession()
|
|
273
345
|
setState("listening", "Listening")
|
|
274
346
|
|
|
275
347
|
mainHandler.post {
|
|
276
348
|
try {
|
|
277
349
|
recognizer?.destroy()
|
|
278
|
-
recognizer =
|
|
279
|
-
setRecognitionListener(recognitionListener)
|
|
280
|
-
}
|
|
350
|
+
recognizer = createRecognizer()
|
|
281
351
|
startListeningInternal(markListening = true)
|
|
282
352
|
startSilenceMonitor()
|
|
283
353
|
|
|
@@ -286,6 +356,13 @@ class TalkModePlugin : Plugin() {
|
|
|
286
356
|
})
|
|
287
357
|
} catch (e: Exception) {
|
|
288
358
|
Log.e(TAG, "Failed to start", e)
|
|
359
|
+
// Recognizer creation failed AFTER the audio session was
|
|
360
|
+
// configured — release it so the earcon streams aren't left
|
|
361
|
+
// muted and the device isn't stuck in MODE_IN_COMMUNICATION.
|
|
362
|
+
enabled = false
|
|
363
|
+
listeningMode = false
|
|
364
|
+
releaseVoiceAudioSession()
|
|
365
|
+
setState("idle", "Off")
|
|
289
366
|
call.resolve(JSObject().apply {
|
|
290
367
|
put("started", false)
|
|
291
368
|
put("error", e.message ?: "Failed to start")
|
|
@@ -307,6 +384,10 @@ class TalkModePlugin : Plugin() {
|
|
|
307
384
|
lastTranscript = ""
|
|
308
385
|
lastHeardAtMs = null
|
|
309
386
|
|
|
387
|
+
// Release any raw-PCM capture; `enabled` is already false so this won't
|
|
388
|
+
// re-arm SpeechRecognizer.
|
|
389
|
+
stopAudioFramesInternal()
|
|
390
|
+
|
|
310
391
|
mainHandler.post {
|
|
311
392
|
recognizer?.cancel()
|
|
312
393
|
recognizer?.destroy()
|
|
@@ -314,6 +395,7 @@ class TalkModePlugin : Plugin() {
|
|
|
314
395
|
}
|
|
315
396
|
|
|
316
397
|
stopSpeakingInternal()
|
|
398
|
+
releaseVoiceAudioSession()
|
|
317
399
|
setState("idle", "Off")
|
|
318
400
|
call.resolve()
|
|
319
401
|
}
|
|
@@ -364,16 +446,18 @@ class TalkModePlugin : Plugin() {
|
|
|
364
446
|
}
|
|
365
447
|
|
|
366
448
|
val useSystemTts = call.getBoolean("useSystemTts", false) ?: false
|
|
449
|
+
val useLocalInferenceTts = call.getBoolean("useLocalInferenceTts", false) ?: false
|
|
367
450
|
val directive = call.getObject("directive")
|
|
368
451
|
|
|
369
452
|
speakingJob = scope.launch {
|
|
370
|
-
speakInternal(text, useSystemTts, directive, call)
|
|
453
|
+
speakInternal(text, useSystemTts, useLocalInferenceTts, directive, call)
|
|
371
454
|
}
|
|
372
455
|
}
|
|
373
456
|
|
|
374
457
|
@PluginMethod
|
|
375
458
|
fun stopSpeaking(call: PluginCall) {
|
|
376
459
|
val interruptedAt = computeInterruptedAt()
|
|
460
|
+
lastInterruptedAtSeconds = interruptedAt
|
|
377
461
|
stopSpeakingInternal()
|
|
378
462
|
call.resolve(JSObject().apply {
|
|
379
463
|
if (interruptedAt != null) {
|
|
@@ -408,6 +492,277 @@ class TalkModePlugin : Plugin() {
|
|
|
408
492
|
call.resolve(buildPermissionResult())
|
|
409
493
|
}
|
|
410
494
|
|
|
495
|
+
// ── Raw PCM frame capture (diarization / VAD / wake-word) ────────────
|
|
496
|
+
|
|
497
|
+
/**
 * Plugin entry point for raw PCM frame capture. If the "microphone"
 * permission alias is already granted the capture is started right away;
 * otherwise the permission is requested and the call is continued in
 * [handleStartAudioFramesPermission].
 */
@PluginMethod
fun startAudioFrames(call: PluginCall) {
    val micGranted = getPermissionState("microphone") == PermissionState.GRANTED
    if (micGranted) {
        startAudioFramesInternal(call)
    } else {
        requestPermissionForAlias("microphone", call, "handleStartAudioFramesPermission")
    }
}
|
|
505
|
+
|
|
506
|
+
/**
 * Continuation of [startAudioFrames] after the microphone permission
 * prompt. Starts capture when the permission was granted; otherwise
 * resolves [call] with `started = false` and an error message.
 */
@PermissionCallback
private fun handleStartAudioFramesPermission(call: PluginCall) {
    if (getPermissionState("microphone") != PermissionState.GRANTED) {
        val denied = JSObject()
        denied.put("started", false)
        denied.put("error", "Microphone permission denied")
        call.resolve(denied)
        return
    }
    startAudioFramesInternal(call)
}
|
|
517
|
+
|
|
518
|
+
/**
 * Starts raw PCM frame capture and resolves [call] with the outcome.
 *
 * Reads the optional `sampleRate` (default [DEFAULT_FRAME_SAMPLE_RATE]) and
 * `frameMs` (default [DEFAULT_FRAME_MS]) options from [call]. On success the
 * call resolves with `started = true`, the actual `sampleRate`, the
 * `frameSamples` per frame and `suspendedStt`; on failure it resolves with
 * `started = false` and an `error` message. If capture is already running
 * the current parameters are returned without restarting.
 *
 * Any active speech recognition is suspended before the AudioRecord is
 * opened and resumed again on every failure path.
 */
private fun startAudioFramesInternal(call: PluginCall) {
    if (audioFrameRunning.get()) {
        call.resolve(JSObject().apply {
            put("started", true)
            put("sampleRate", lastFrameSampleRate)
            put("frameSamples", lastFrameSamples)
            put("suspendedStt", sttSuspendedForFrames)
        })
        return
    }

    val requestedRate = call.getInt("sampleRate") ?: DEFAULT_FRAME_SAMPLE_RATE
    val frameMs = call.getInt("frameMs") ?: DEFAULT_FRAME_MS

    // Reject nonsensical options before touching the recognizer: a zero or
    // negative frameMs would otherwise collapse to 1-sample frames and flood
    // the JS bridge with events.
    if (requestedRate <= 0 || frameMs <= 0) {
        call.resolve(JSObject().apply {
            put("started", false)
            put(
                "error",
                "sampleRate and frameMs must be positive " +
                    "(got sampleRate=$requestedRate, frameMs=$frameMs)"
            )
        })
        return
    }

    // Shared failure path: resume STT if we suspended it, then report.
    fun fail(message: String) {
        if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
        call.resolve(JSObject().apply {
            put("started", false)
            put("error", message)
        })
    }

    // SpeechRecognizer (SODA) holds the mic; a parallel AudioRecord on the
    // same input fails on virtually every device. Suspend it for the
    // duration of capture and remember to resume on stop.
    val wasListening = isListening || listeningMode
    if (wasListening) {
        suspendSpeechRecognizerForFrames()
    }

    val record = try {
        openAudioRecord(requestedRate)
    } catch (e: Exception) {
        Log.e(TAG, "AudioRecord open failed", e)
        fail(e.message ?: "AudioRecord open failed")
        return
    }

    val actualRate = record.sampleRate
    // Long arithmetic so a large frameMs cannot overflow Int and wrap to a
    // negative value; the upper bound keeps the byte buffer size
    // (frameSamples * 2) within Int range.
    val frameSamples = (actualRate.toLong() * frameMs / 1000L)
        .coerceIn(1L, (Int.MAX_VALUE / 2).toLong())
        .toInt()
    audioRecord = record
    lastFrameSampleRate = actualRate
    lastFrameSamples = frameSamples

    try {
        record.startRecording()
    } catch (e: Exception) {
        Log.e(TAG, "AudioRecord startRecording failed", e)
        releaseAudioRecord()
        fail(e.message ?: "AudioRecord start failed")
        return
    }

    if (record.recordingState != AudioRecord.RECORDSTATE_RECORDING) {
        Log.e(TAG, "AudioRecord did not enter RECORDING state")
        releaseAudioRecord()
        fail("AudioRecord did not start (mic likely held by SpeechRecognizer)")
        return
    }

    audioFrameRunning.set(true)
    launchFrameLoop(record, frameSamples)

    call.resolve(JSObject().apply {
        put("started", true)
        put("sampleRate", actualRate)
        put("frameSamples", frameSamples)
        put("suspendedStt", sttSuspendedForFrames)
    })
}
|
|
591
|
+
|
|
592
|
+
/**
 * Plugin entry point that stops raw PCM frame capture (a no-op when no
 * capture is active) and then resolves [call] with no payload.
 */
@PluginMethod
fun stopAudioFrames(call: PluginCall) {
    // Tear down capture first so the resolve reflects the stopped state.
    stopAudioFramesInternal()
    call.resolve()
}
|
|
597
|
+
|
|
598
|
+
/**
 * Plugin entry point reporting whether the frame-capture flag is set.
 * Resolves [call] with a single boolean field, `capturing`.
 */
@PluginMethod
fun isCapturingAudioFrames(call: PluginCall) {
    val status = JSObject()
    status.put("capturing", audioFrameRunning.get())
    call.resolve(status)
}
|
|
604
|
+
|
|
605
|
+
/**
 * Opens a mono, 16-bit PCM [AudioRecord] at [sampleRate].
 *
 * The VOICE_RECOGNITION source is tried first and MIC second; the first
 * source that yields an initialized record wins. The internal buffer is
 * twice the platform minimum, but never below 4 KiB.
 *
 * @throws IllegalStateException if the platform reports no valid minimum
 *   buffer for [sampleRate], or if no source could be initialized (the last
 *   underlying failure is attached as the cause).
 */
private fun openAudioRecord(sampleRate: Int): AudioRecord {
    val channelConfig = AudioFormat.CHANNEL_IN_MONO
    val encoding = AudioFormat.ENCODING_PCM_16BIT

    val minBuffer = AudioRecord.getMinBufferSize(sampleRate, channelConfig, encoding)
    if (minBuffer <= 0) {
        throw IllegalStateException("AudioRecord min buffer invalid ($minBuffer) for ${sampleRate}Hz")
    }
    val bufferBytes = max(minBuffer * 2, 4 * 1024)

    var lastError: Throwable? = null
    val candidateSources = intArrayOf(
        MediaRecorder.AudioSource.VOICE_RECOGNITION,
        MediaRecorder.AudioSource.MIC,
    )
    for (source in candidateSources) {
        try {
            @Suppress("MissingPermission")
            val candidate = AudioRecord(source, sampleRate, channelConfig, encoding, bufferBytes)
            if (candidate.state != AudioRecord.STATE_INITIALIZED) {
                // Constructed but unusable: free it and try the next source.
                candidate.release()
                lastError = IllegalStateException("AudioRecord uninitialized for source $source")
                continue
            }
            return candidate
        } catch (e: Exception) {
            lastError = e
        }
    }

    throw IllegalStateException(
        "AudioRecord could not initialize at ${sampleRate}Hz",
        lastError
    )
}
|
|
648
|
+
|
|
649
|
+
/**
 * Launches the background read loop that turns [record] into "audioFrame"
 * events of up to [frameSamples] samples each.
 *
 * Each event carries the little-endian 16-bit PCM as base64 (`pcm16`), the
 * `sampleRate`, `channels` (always 1), the number of `samples` read, the
 * normalized `rms`, an elapsed-realtime `timestamp` and a running
 * `frameIndex`.
 *
 * If the loop ends on its own (negative read code or an exception) while
 * this capture is still the active one, the capture is torn down via
 * [stopAudioFramesInternal] and an "error" event is emitted, so the
 * running flag, the AudioRecord and the suspended recognizer are not left
 * dangling. A loop that ends because the caller stopped capture exits
 * silently.
 */
private fun launchFrameLoop(record: AudioRecord, frameSamples: Int) {
    audioFrameJob?.cancel()
    // IO dispatcher: a tight blocking read loop must not sit on the main
    // thread. Frames are marshalled to JS via notifyListeners.
    audioFrameJob = scope.launch(Dispatchers.IO) {
        val buffer = ShortArray(frameSamples)
        val bytes = ByteArray(frameSamples * 2)
        var frameIndex = 0L
        // Non-null once the loop has ended for a reason other than a stop request.
        var failure: String? = null
        try {
            while (audioFrameRunning.get() && isActive) {
                val read = record.read(buffer, 0, frameSamples)
                if (read < 0) {
                    // Negative values are AudioRecord error codes: the record
                    // was released or the mic was taken.
                    failure = "AudioRecord read error $read"
                    break
                }
                if (read == 0) continue

                var sumSquares = 0.0
                var b = 0
                for (i in 0 until read) {
                    val s = buffer[i].toInt()
                    // Little-endian 16-bit sample.
                    bytes[b] = (s and 0xff).toByte()
                    bytes[b + 1] = ((s shr 8) and 0xff).toByte()
                    b += 2
                    sumSquares += (s.toDouble() * s.toDouble())
                }
                // read > 0 is guaranteed here, so the division is safe.
                val rms = Math.sqrt(sumSquares / read) / 32768.0
                val pcmBase64 = Base64.encodeToString(
                    bytes, 0, read * 2, Base64.NO_WRAP
                )
                val idx = frameIndex
                frameIndex += 1
                val ts = SystemClock.elapsedRealtime()
                notifyListeners("audioFrame", JSObject().apply {
                    put("pcm16", pcmBase64)
                    put("sampleRate", record.sampleRate)
                    put("channels", 1)
                    put("samples", read)
                    put("rms", rms)
                    put("timestamp", ts)
                    put("frameIndex", idx)
                })
            }
        } catch (e: Throwable) {
            Log.e(TAG, "Audio frame loop error", e)
            failure = e.message ?: e.javaClass.simpleName
        }

        // Only clean up and report when this loop still owns the active
        // capture. After an explicit stop the flag is already false (or a
        // newer capture has replaced `audioRecord`), and the failure is just
        // the released record surfacing in read().
        val endedUnexpectedly = failure != null &&
            audioFrameRunning.get() &&
            audioRecord === record
        if (endedUnexpectedly) {
            Log.e(TAG, "Audio frame capture ended unexpectedly: $failure")
            stopAudioFramesInternal()
            notifyListeners("error", JSObject().apply {
                put("message", "Audio frame capture stopped: $failure")
                put("fatal", false)
            })
        }
    }
}
|
|
703
|
+
|
|
704
|
+
/**
 * Stops frame capture: clears the running flag, cancels the read loop,
 * releases the AudioRecord and, if speech recognition was suspended for
 * capture, asks for it to be resumed. Does nothing when capture was not
 * running and no AudioRecord is held.
 */
private fun stopAudioFramesInternal() {
    // The flag is always cleared, even when we return early below.
    val wasRunning = audioFrameRunning.getAndSet(false)
    val nothingToStop = !wasRunning && audioRecord == null
    if (nothingToStop) return

    audioFrameJob?.cancel()
    audioFrameJob = null
    releaseAudioRecord()

    if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
}
|
|
715
|
+
|
|
716
|
+
/**
 * Detaches the current AudioRecord from the plugin and releases it.
 * Stopping and releasing are each best-effort: a failure in either step is
 * ignored so the other still runs.
 */
private fun releaseAudioRecord() {
    val held = audioRecord
    if (held == null) return
    // Clear the field first so nothing else picks up a record being torn down.
    audioRecord = null

    runCatching {
        if (held.recordingState == AudioRecord.RECORDSTATE_RECORDING) {
            held.stop()
        }
    }
    runCatching { held.release() }
}
|
|
730
|
+
|
|
731
|
+
/**
 * Puts speech recognition on hold so an AudioRecord can take the mic.
 *
 * Marks STT as suspended-for-frames, clears the listening flags, cancels
 * the pending restart and silence jobs, and posts a main-thread task that
 * cancels and destroys the recognizer (ignoring any failure) before
 * dropping the reference.
 */
private fun suspendSpeechRecognizerForFrames() {
    restartJob?.cancel()
    silenceJob?.cancel()

    isListening = false
    listeningMode = false
    sttSuspendedForFrames = true

    mainHandler.post {
        try {
            recognizer?.cancel()
            recognizer?.destroy()
        } catch (_: Throwable) {
            // Best-effort teardown; the reference is dropped regardless.
        } finally {
            recognizer = null
        }
    }
}
|
|
747
|
+
|
|
748
|
+
/**
 * Clears the suspended-for-frames flag and, when a session is still
 * enabled and no stop was requested, re-enters listening mode and posts a
 * main-thread task that rebuilds the recognizer and restarts listening and
 * the silence monitor. The rebuild is skipped when recognition is
 * unavailable; failures are logged.
 */
private fun resumeSpeechRecognizerAfterFrames() {
    sttSuspendedForFrames = false

    val sessionStillActive = enabled && !stopRequested
    if (!sessionStillActive) return

    listeningMode = true
    mainHandler.post {
        try {
            if (SpeechRecognizer.isRecognitionAvailable(context)) {
                // Always start from a fresh recognizer instance.
                recognizer?.destroy()
                recognizer = createRecognizer()
                startListeningInternal(markListening = true)
                startSilenceMonitor()
            }
        } catch (e: Exception) {
            Log.e(TAG, "Failed to resume STT after frames", e)
        }
    }
}
|
|
765
|
+
|
|
411
766
|
// ── Config ──────────────────────────────────────────────────────────
|
|
412
767
|
|
|
413
768
|
private fun applyConfig(config: JSObject) {
|
|
@@ -462,6 +817,13 @@ class TalkModePlugin : Plugin() {
|
|
|
462
817
|
putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
|
|
463
818
|
putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
|
|
464
819
|
putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)
|
|
820
|
+
// On-device recognizer (no network round-trip; works offline). The
|
|
821
|
+
// platform recognizer's open/close cadence during continuous use is
|
|
822
|
+
// intrinsic and not controllable via the silence-length extras (the
|
|
823
|
+
// on-device SODA engine ignores them); we silence the AUDIBLE part of
|
|
824
|
+
// that churn by muting the earcon streams for the session instead
|
|
825
|
+
// (see configureVoiceAudioSession).
|
|
826
|
+
putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true)
|
|
465
827
|
sttLanguage?.let { putExtra(RecognizerIntent.EXTRA_LANGUAGE, it) }
|
|
466
828
|
}
|
|
467
829
|
|
|
@@ -477,6 +839,28 @@ class TalkModePlugin : Plugin() {
|
|
|
477
839
|
}
|
|
478
840
|
}
|
|
479
841
|
|
|
842
|
+
/**
 * Builds a [SpeechRecognizer] wired to [recognitionListener].
 *
 * On API 31+ devices that report on-device recognition as available, the
 * on-device recognizer is used; every other case falls back to the
 * default system recognizer. The on-device engine is preferred because,
 * per the original design notes, it plays no start/error earcons, whereas
 * the system recognizer service's beeps cannot be muted without
 * permissions this plugin does not hold.
 */
private fun createRecognizer(): SpeechRecognizer =
    if (
        Build.VERSION.SDK_INT >= Build.VERSION_CODES.S &&
        SpeechRecognizer.isOnDeviceRecognitionAvailable(context)
    ) {
        SpeechRecognizer.createOnDeviceSpeechRecognizer(context)
    } else {
        SpeechRecognizer.createSpeechRecognizer(context)
    }.apply {
        setRecognitionListener(recognitionListener)
    }
|
|
863
|
+
|
|
480
864
|
private fun scheduleRestart(delayMs: Long = 350) {
|
|
481
865
|
if (stopRequested) return
|
|
482
866
|
restartJob?.cancel()
|
|
@@ -515,13 +899,14 @@ class TalkModePlugin : Plugin() {
|
|
|
515
899
|
val elapsed = SystemClock.elapsedRealtime() - lastHeard
|
|
516
900
|
if (elapsed < silenceWindowMs) return
|
|
517
901
|
|
|
518
|
-
// Finalize
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
})
|
|
902
|
+
// Finalize this turn (deduped against the recognizer's own onResults),
|
|
903
|
+
// then restart the recognizer so the next utterance is a CLEAN session —
|
|
904
|
+
// Android SpeechRecognizer accumulates within a session, so without the
|
|
905
|
+
// restart the next turn's partials would prepend the words we just sent.
|
|
523
906
|
lastTranscript = ""
|
|
524
907
|
lastHeardAtMs = null
|
|
908
|
+
emitFinalOnce(transcript)
|
|
909
|
+
scheduleRestart()
|
|
525
910
|
}
|
|
526
911
|
|
|
527
912
|
private fun handleTranscript(transcript: String, isFinal: Boolean) {
|
|
@@ -531,34 +916,71 @@ class TalkModePlugin : Plugin() {
|
|
|
531
916
|
if (isSpeaking && interruptOnSpeech) {
|
|
532
917
|
if (shouldInterrupt(transcript)) {
|
|
533
918
|
val interruptedAt = computeInterruptedAt()
|
|
534
|
-
stopSpeakingInternal()
|
|
535
919
|
lastInterruptedAtSeconds = interruptedAt
|
|
920
|
+
stopSpeakingInternal()
|
|
536
921
|
}
|
|
537
922
|
return
|
|
538
923
|
}
|
|
539
924
|
|
|
540
925
|
if (!isListening) return
|
|
541
926
|
|
|
542
|
-
if (
|
|
927
|
+
if (isFinal) {
|
|
928
|
+
// A real end-of-turn from the recognizer: emit once and clear the
|
|
929
|
+
// pending buffer so the silence monitor doesn't re-finalize the same
|
|
930
|
+
// words (the double-send bug).
|
|
931
|
+
lastTranscript = ""
|
|
932
|
+
lastHeardAtMs = null
|
|
933
|
+
emitFinalOnce(transcript)
|
|
934
|
+
} else {
|
|
543
935
|
lastTranscript = transcript
|
|
544
936
|
lastHeardAtMs = SystemClock.elapsedRealtime()
|
|
937
|
+
notifyListeners("transcript", JSObject().apply {
|
|
938
|
+
put("transcript", transcript)
|
|
939
|
+
put("isFinal", false)
|
|
940
|
+
})
|
|
545
941
|
}
|
|
942
|
+
}
|
|
546
943
|
|
|
944
|
+
/**
 * Sends a final "transcript" event for [transcript], suppressing
 * duplicates.
 *
 * The text is trimmed; empty text is ignored. A final identical to the
 * previously emitted one is dropped when it arrives less than 2000 ms
 * after it, which collapses the two finalization paths (recognizer results
 * and the silence monitor) into a single emission per turn.
 */
private fun emitFinalOnce(transcript: String) {
    val finalText = transcript.trim()
    if (finalText.isEmpty()) return

    val nowMs = SystemClock.elapsedRealtime()
    val isRepeat = finalText == lastEmittedFinal && nowMs - lastEmittedFinalAtMs < 2000L
    if (isRepeat) return

    // Record what was sent and when, for the next dedup check.
    lastEmittedFinal = finalText
    lastEmittedFinalAtMs = nowMs

    val payload = JSObject()
    payload.put("transcript", finalText)
    payload.put("isFinal", true)
    notifyListeners("transcript", payload)
}
|
|
552
961
|
|
|
553
962
|
/**
 * Decide whether heard speech should barge in on the agent's TTS. Tuned to
 * avoid FALSE interrupts (which cut the reply mid-sentence and read as
 * "intermittent audio"): a one-word ASR blip, background noise, or the
 * agent's own voice bleeding back into the mic must NOT interrupt — only a
 * genuine couple-of-words utterance from the user does.
 *
 * @param transcript text heard by the recognizer while TTS is playing.
 * @return `true` when playback should be interrupted; `false` when the
 *   transcript is too short or looks like an echo of [lastSpokenText].
 */
private fun shouldInterrupt(transcript: String): Boolean {
    val trimmed = transcript.trim()
    val lower = trimmed.lowercase()
    val words = lower.split(Regex("\\s+")).filter { it.isNotBlank() }
    // Need real intent: at least two words, or one long word (≥ 8 chars).
    if (words.size < 2 && trimmed.length < 8) return false
    val spoken = lastSpokenText?.lowercase() ?: return true
    // Exact echo of what we're saying → speaker bleed, not the user.
    if (spoken.contains(lower)) return false

    // Fuzzy echo: if most of the heard words appear in the text we're
    // currently speaking, treat it as echo (ASR mishears of our own audio).
    // Compare WHOLE words: a substring test would let short words such as
    // "i", "a" or "no" match inside unrelated words ("information", "now"),
    // inflating the ratio and suppressing genuine interruptions.
    val nonWordChars = Regex("[^\\p{L}\\p{N}'’]+")
    val spokenWords = spoken.split(nonWordChars).filterTo(HashSet()) { it.isNotEmpty() }
    val echoed = words.count { heard ->
        // Strip punctuation the recognizer may have attached to the token.
        val normalized = heard.replace(nonWordChars, "")
        normalized.isNotEmpty() && normalized in spokenWords
    }
    if (words.isNotEmpty() && echoed.toDouble() / words.size >= 0.6) {
        return false
    }
    return true
}
|
|
564
986
|
|
|
@@ -573,9 +995,7 @@ class TalkModePlugin : Plugin() {
|
|
|
573
995
|
if (!SpeechRecognizer.isRecognitionAvailable(context)) return@post
|
|
574
996
|
try {
|
|
575
997
|
if (recognizer == null) {
|
|
576
|
-
recognizer =
|
|
577
|
-
setRecognitionListener(recognitionListener)
|
|
578
|
-
}
|
|
998
|
+
recognizer = createRecognizer()
|
|
579
999
|
}
|
|
580
1000
|
recognizer?.cancel()
|
|
581
1001
|
startListeningInternal(markListening = false)
|
|
@@ -588,6 +1008,7 @@ class TalkModePlugin : Plugin() {
|
|
|
588
1008
|
private suspend fun speakInternal(
|
|
589
1009
|
text: String,
|
|
590
1010
|
forceSystemTts: Boolean,
|
|
1011
|
+
useLocalInferenceTts: Boolean,
|
|
591
1012
|
directive: JSObject?,
|
|
592
1013
|
call: PluginCall
|
|
593
1014
|
) {
|
|
@@ -596,6 +1017,7 @@ class TalkModePlugin : Plugin() {
|
|
|
596
1017
|
lastSpokenText = text
|
|
597
1018
|
speakStartTimeMs = SystemClock.elapsedRealtime()
|
|
598
1019
|
pcmStopRequested.set(false)
|
|
1020
|
+
lastInterruptedAtSeconds = null
|
|
599
1021
|
setState("speaking", "Speaking")
|
|
600
1022
|
|
|
601
1023
|
val effectiveVoiceId = directive.stringOrNull("voiceId")?.let(::resolveVoiceAlias) ?: voiceId
|
|
@@ -603,27 +1025,74 @@ class TalkModePlugin : Plugin() {
|
|
|
603
1025
|
|
|
604
1026
|
notifyListeners("speaking", JSObject().apply {
|
|
605
1027
|
put("text", text)
|
|
606
|
-
put(
|
|
1028
|
+
put(
|
|
1029
|
+
"isSystemTts",
|
|
1030
|
+
!useLocalInferenceTts &&
|
|
1031
|
+
(forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
|
|
1032
|
+
)
|
|
607
1033
|
})
|
|
608
1034
|
|
|
609
1035
|
// Stop listening during speech (we keep recognizer for interrupt detection)
|
|
610
1036
|
mainHandler.post { recognizer?.stopListening() }
|
|
611
1037
|
ensureInterruptListener()
|
|
612
1038
|
|
|
613
|
-
//
|
|
614
|
-
|
|
1039
|
+
// Ensure the communication-mode session + audio focus are active even
|
|
1040
|
+
// for a standalone speak() that wasn't preceded by start().
|
|
1041
|
+
configureVoiceAudioSession()
|
|
1042
|
+
// Re-assert loudspeaker routing right before playback. configureVoice…
|
|
1043
|
+
// only routes on the FIRST activation; if the session was already up (the
|
|
1044
|
+
// STT path opened it) the speaker route may have drifted, leaving TTS on
|
|
1045
|
+
// the earpiece. Re-route here so replies are audible out the speaker.
|
|
1046
|
+
audioManager?.let { routeVoiceOutput(it) }
|
|
615
1047
|
|
|
616
1048
|
try {
|
|
617
|
-
val
|
|
1049
|
+
val canUseLocalInference = useLocalInferenceTts && !forceSystemTts
|
|
1050
|
+
val canUseElevenLabs = !canUseLocalInference &&
|
|
1051
|
+
!forceSystemTts &&
|
|
618
1052
|
!effectiveApiKey.isNullOrEmpty() &&
|
|
619
1053
|
!effectiveVoiceId.isNullOrEmpty()
|
|
620
1054
|
|
|
621
|
-
if (
|
|
1055
|
+
if (canUseLocalInference) {
|
|
1056
|
+
try {
|
|
1057
|
+
streamAndPlayLocalInferenceTts(text, directive)
|
|
1058
|
+
|
|
1059
|
+
if (!pcmStopRequested.get()) {
|
|
1060
|
+
call.resolve(JSObject().apply {
|
|
1061
|
+
put("completed", true)
|
|
1062
|
+
put("interrupted", false)
|
|
1063
|
+
put("usedSystemTts", false)
|
|
1064
|
+
})
|
|
1065
|
+
} else {
|
|
1066
|
+
call.resolve(JSObject().apply {
|
|
1067
|
+
put("completed", false)
|
|
1068
|
+
put("interrupted", true)
|
|
1069
|
+
put("usedSystemTts", false)
|
|
1070
|
+
lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
|
|
1071
|
+
})
|
|
1072
|
+
}
|
|
1073
|
+
} catch (e: Exception) {
|
|
1074
|
+
if (pcmStopRequested.get()) {
|
|
1075
|
+
call.resolve(JSObject().apply {
|
|
1076
|
+
put("completed", false)
|
|
1077
|
+
put("interrupted", true)
|
|
1078
|
+
put("usedSystemTts", false)
|
|
1079
|
+
})
|
|
1080
|
+
} else {
|
|
1081
|
+
// The on-device OmniVoice TTS assets aren't always staged
|
|
1082
|
+
// (it 502s "TEXT_TO_SPEECH not available"). Rather than go
|
|
1083
|
+
// silent — the JS browser-SpeechSynthesis fallback doesn't
|
|
1084
|
+
// exist in the Android WebView — fall back to the platform
|
|
1085
|
+
// TextToSpeech so replies are always spoken aloud.
|
|
1086
|
+
Log.w(TAG, "Local inference TTS failed, falling back to system TTS", e)
|
|
1087
|
+
speakWithSystemTts(text, call)
|
|
1088
|
+
}
|
|
1089
|
+
}
|
|
1090
|
+
} else if (canUseElevenLabs) {
|
|
622
1091
|
try {
|
|
623
1092
|
val request = buildElevenLabsRequest(text, directive)
|
|
624
1093
|
streamAndPlayPcm(
|
|
625
|
-
voiceId = effectiveVoiceId
|
|
626
|
-
apiKey = effectiveApiKey
|
|
1094
|
+
voiceId = effectiveVoiceId,
|
|
1095
|
+
apiKey = effectiveApiKey,
|
|
627
1096
|
request = request
|
|
628
1097
|
)
|
|
629
1098
|
|
|
@@ -665,13 +1134,16 @@ class TalkModePlugin : Plugin() {
|
|
|
665
1134
|
put("error", e.message ?: "Speak failed")
|
|
666
1135
|
})
|
|
667
1136
|
} finally {
|
|
1137
|
+
val wasInterrupted = pcmStopRequested.get()
|
|
1138
|
+
val interruptedAt = lastInterruptedAtSeconds
|
|
668
1139
|
isSpeaking = false
|
|
669
1140
|
pcmStopRequested.set(false)
|
|
670
|
-
abandonAudioFocus()
|
|
671
1141
|
|
|
672
1142
|
notifyListeners("speakComplete", JSObject().apply {
|
|
673
|
-
put("completed", !
|
|
674
|
-
|
|
1143
|
+
put("completed", !wasInterrupted)
|
|
1144
|
+
if (wasInterrupted) {
|
|
1145
|
+
interruptedAt?.let { put("interruptedAt", it) }
|
|
1146
|
+
}
|
|
675
1147
|
})
|
|
676
1148
|
|
|
677
1149
|
if (enabled) {
|
|
@@ -679,6 +1151,8 @@ class TalkModePlugin : Plugin() {
|
|
|
679
1151
|
setState("listening", "Listening")
|
|
680
1152
|
mainHandler.post { startListeningInternal(markListening = true) }
|
|
681
1153
|
} else {
|
|
1154
|
+
// Standalone speak (no active conversation): release the session.
|
|
1155
|
+
releaseVoiceAudioSession()
|
|
682
1156
|
setState("idle", "Off")
|
|
683
1157
|
}
|
|
684
1158
|
}
|
|
@@ -753,6 +1227,363 @@ class TalkModePlugin : Plugin() {
|
|
|
753
1227
|
return if (value == null || value === JSONObject.NULL) null else value.toString()
|
|
754
1228
|
}
|
|
755
1229
|
|
|
1230
|
+
/**
 * Describes a raw PCM stream that is about to be handed to an [AudioTrack].
 *
 * @property sampleRate sample rate in Hz.
 * @property channels number of interleaved channels.
 * @property bitsPerSample width of a single sample in bits.
 * @property dataBytes number of PCM payload bytes expected in the stream.
 */
private data class PcmStreamFormat(
    val sampleRate: Int,
    val channels: Int,
    val bitsPerSample: Int,
    val dataBytes: Int,
)
|
|
1236
|
+
|
|
1237
|
+
/**
 * Speaks [text] through the local-inference TTS stack and plays it natively.
 *
 * The in-process bionic Kokoro host is tried first; only when it reports that
 * it could not be used does this fall through to the HTTP agent endpoint. The
 * HTTP response is parsed as a WAV stream and fed to an [AudioTrack], so
 * playback never goes back through the WebView.
 *
 * Runs on [Dispatchers.IO]. Any failure (HTTP status >= 400, malformed WAV,
 * AudioTrack error) is thrown to the caller.
 *
 * @param text the text to synthesize.
 * @param directive optional per-utterance overrides forwarded to the backend.
 */
private suspend fun streamAndPlayLocalInferenceTts(
    text: String,
    directive: JSObject?
) = withContext(Dispatchers.IO) {
    pcmStopRequested.set(false)

    // First choice: the fused Kokoro voice served by the bionic inference host.
    // It returns false when that host cannot be used (e.g. desktop/Electrobun,
    // or a build without it), in which case the HTTP agent endpoint is next.
    if (streamAndPlayBionicKokoroTts(text, directive)) {
        return@withContext
    }

    val connection = openLocalInferenceTtsConnection()
    // Publish the connection so a stop request can disconnect it mid-stream.
    activePcmConnection = connection
    try {
        val requestBody = buildLocalInferenceTtsPayload(text, directive).toByteArray(Charsets.UTF_8)
        connection.outputStream.use { out -> out.write(requestBody) }

        val status = connection.responseCode
        if (status >= 400) {
            val detail = connection.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
            throw IllegalStateException("Local inference TTS error: $status $detail")
        }

        BufferedInputStream(connection.inputStream).use { wav ->
            val format = readWavPcmFormat(wav)
            val player = createPcmAudioTrack(format)
            pcmTrack = player
            player.play()

            Log.d(
                TAG,
                "Local inference PCM play start sampleRate=${format.sampleRate} channels=${format.channels}"
            )
            val startEvent = JSObject()
            startEvent.put("provider", "local-inference")
            startEvent.put("sampleRate", format.sampleRate)
            startEvent.put("channels", format.channels)
            notifyListeners("playbackStart", startEvent)

            val framesWritten = writePcmStreamToTrack(wav, player, format)
            drainPcmTrack(player, framesWritten, format.sampleRate)
            // On an interrupt the track is torn down elsewhere; only stop it on a clean finish.
            if (!pcmStopRequested.get()) {
                player.stop()
            }
            Log.d(TAG, "Local inference PCM play done frames=$framesWritten")
        }
    } finally {
        cleanupPcmTrack()
        // Only clear the shared slot if it still points at our connection.
        if (activePcmConnection === connection) {
            activePcmConnection = null
        }
        connection.disconnect()
    }
}
|
|
1297
|
+
|
|
1298
|
+
/**
 * Synthesize + play with the fused Kokoro-82M head in the bionic inference
 * host (ElizaBionicInferenceServer, op "tts") over its abstract-namespace
 * UDS, so no HTTP round trip to the agent is needed.
 *
 * Wire format (as used here): a big-endian 4-byte length prefix followed by a
 * UTF-8 JSON document, in both directions. The response carries `ok`,
 * `sampleRate` (default 24000) and `pcmBase64`, which is decoded as
 * little-endian fp32 mono samples and converted to 16-bit PCM for playback.
 *
 * Reads from the host are bounded by a socket timeout so a wedged host cannot
 * block the speak call indefinitely; the socket is not registered anywhere a
 * stop request could reach, so the timeout is the only way out of a stalled read.
 *
 * @param text text to speak; blank text is not sent to the host.
 * @param directive optional overrides; only `speed` is read here. Values that
 *   are not finite and positive fall back to 1.0, mirroring the validation in
 *   [buildLocalInferenceTtsPayload].
 * @return true when the host handled the request; false when the text is blank
 *   or the host is unreachable, so the caller can fall through.
 * @throws IllegalStateException when the host answers with a bad frame, an
 *   error payload, or zero samples. I/O failures after connecting (including
 *   read timeouts) also propagate.
 */
private suspend fun streamAndPlayBionicKokoroTts(
    text: String,
    directive: JSObject?
): Boolean = withContext(Dispatchers.IO) {
    val trimmed = text.trim()
    if (trimmed.isEmpty()) return@withContext false

    // Never forward 0, negative, or non-finite speeds to the synthesizer.
    val speed = (directive?.optDouble("speed", 1.0) ?: 1.0).toFloat()
        .takeIf { it.isFinite() && it > 0f } ?: 1.0f

    val sock = LocalSocket()
    try {
        sock.connect(
            LocalSocketAddress(BIONIC_INFER_SOCKET, LocalSocketAddress.Namespace.ABSTRACT)
        )
    } catch (e: Exception) {
        Log.d(TAG, "bionic Kokoro TTS host unreachable: ${e.message}")
        try { sock.close() } catch (_: Exception) {}
        return@withContext false
    }
    try {
        // Same budget as the HTTP path's readTimeout. Must be set after connect():
        // the underlying socket does not exist before that. Best-effort only.
        try {
            sock.soTimeout = 180_000
        } catch (e: Exception) {
            Log.w(TAG, "bionic Kokoro TTS: could not set read timeout", e)
        }

        val req = JSONObject().apply {
            put("op", "tts")
            put("text", trimmed)
            put("speed", speed.toDouble())
        }.toString().toByteArray(Charsets.UTF_8)
        DataOutputStream(sock.outputStream).apply {
            writeInt(req.size) // big-endian length prefix
            write(req)
            flush()
        }

        val din = DataInputStream(sock.inputStream)
        val len = din.readInt()
        // Reject empty or implausibly large frames before allocating for them.
        if (len <= 0 || len > 64 * 1024 * 1024) {
            throw IllegalStateException("bionic TTS bad frame length $len")
        }
        val respBytes = ByteArray(len)
        din.readFully(respBytes)
        val resp = JSONObject(String(respBytes, Charsets.UTF_8))
        if (!resp.optBoolean("ok", false)) {
            throw IllegalStateException("bionic TTS error: ${resp.optString("error")}")
        }
        val sampleRate = resp.optInt("sampleRate", 24000)
        val pcmF32 = Base64.decode(resp.getString("pcmBase64"), Base64.NO_WRAP)

        // fp32 LE → int16 PCM (the play path is ENCODING_PCM_16BIT).
        val fb = ByteBuffer.wrap(pcmF32).order(ByteOrder.LITTLE_ENDIAN).asFloatBuffer()
        val nSamples = fb.remaining()
        if (nSamples == 0) {
            throw IllegalStateException("bionic TTS returned 0 samples")
        }
        val pcm16 = ByteArray(nSamples * 2)
        val ob = ByteBuffer.wrap(pcm16).order(ByteOrder.LITTLE_ENDIAN)
        for (i in 0 until nSamples) {
            // Scale to the int16 range and clamp so out-of-range samples saturate.
            val s = (fb.get(i) * 32767f).coerceIn(-32768f, 32767f).toInt().toShort()
            ob.putShort(s)
        }

        val format = PcmStreamFormat(sampleRate, 1, 16, pcm16.size)
        val track = createPcmAudioTrack(format)
        pcmTrack = track
        track.play()
        notifyListeners("playbackStart", JSObject().apply {
            put("provider", "local-inference")
            put("sampleRate", sampleRate)
            put("channels", 1)
        })
        val framesWritten = writePcmStreamToTrack(
            BufferedInputStream(ByteArrayInputStream(pcm16)), track, format
        )
        drainPcmTrack(track, framesWritten, sampleRate)
        if (!pcmStopRequested.get()) track.stop()
        Log.d(TAG, "bionic Kokoro TTS played $nSamples samples @ $sampleRate Hz")
        true
    } finally {
        cleanupPcmTrack()
        try { sock.close() } catch (_: Exception) {}
    }
}
|
|
1381
|
+
|
|
1382
|
+
/**
 * Prepares (but does not send) an authenticated POST to the local-inference
 * TTS endpoint at [LOCAL_INFERENCE_TTS_URL].
 *
 * The bearer token is read from `auth/local-agent-token` under the app's
 * files directory on every call.
 *
 * @return a connection configured for a JSON request body and a WAV response.
 * @throws IllegalStateException when the token file is absent or blank.
 */
private fun openLocalInferenceTtsConnection(): HttpURLConnection {
    val tokenFile = File(context.filesDir, "auth/local-agent-token")
    val token = if (tokenFile.isFile) tokenFile.readText().trim() else ""
    check(token.isNotEmpty()) { "Local agent auth token is missing" }

    val connection = URL(LOCAL_INFERENCE_TTS_URL).openConnection() as HttpURLConnection
    return connection.apply {
        requestMethod = "POST"
        connectTimeout = 30_000
        readTimeout = 180_000
        setRequestProperty("Authorization", "Bearer $token")
        setRequestProperty("Content-Type", "application/json")
        setRequestProperty("Accept", "audio/wav")
        doOutput = true
    }
}
|
|
1399
|
+
|
|
1400
|
+
/**
 * Builds the JSON request body for the local-inference TTS endpoint.
 *
 * `text` is always present. `voiceId`, `voice`, `modelId` and `model` are
 * copied from [directive] when it provides them, and `speed` is included only
 * when it is a finite, strictly positive number.
 *
 * @return the serialized JSON document.
 */
private fun buildLocalInferenceTtsPayload(text: String, directive: JSObject?): String =
    JSONObject().apply {
        put("text", text)
        // Optional string overrides, forwarded under the same key they arrived with.
        for (key in listOf("voiceId", "voice", "modelId", "model")) {
            directive.stringOrNull(key)?.let { value -> put(key, value) }
        }
        // NaN is the "not provided" sentinel, so it is filtered out along with bad values.
        directive?.optDouble("speed", Double.NaN)
            ?.takeIf { it.isFinite() && it > 0.0 }
            ?.let { validSpeed -> put("speed", validSpeed) }
    }.toString()
|
|
1413
|
+
|
|
1414
|
+
/**
 * Reads exactly [size] bytes from [input], looping over short reads.
 *
 * @return a new array of length [size].
 * @throws IllegalStateException if the stream ends before [size] bytes arrive.
 */
private fun readExactly(input: BufferedInputStream, size: Int): ByteArray {
    val result = ByteArray(size)
    var filled = 0
    while (filled < size) {
        val count = input.read(result, filled, size - filled)
        check(count >= 0) { "Unexpected end of WAV stream" }
        filled += count
    }
    return result
}
|
|
1426
|
+
|
|
1427
|
+
/**
 * Discards exactly [count] bytes from [input].
 *
 * `skip()` is allowed to make no progress, so whenever it does, a single byte
 * is read instead; that both advances the stream and detects end-of-stream.
 *
 * @throws IllegalStateException if the stream ends before [count] bytes were skipped.
 */
private fun skipFully(input: BufferedInputStream, count: Int) {
    var left = count
    while (left > 0) {
        val advanced = input.skip(left.toLong()).toInt()
        if (advanced <= 0) {
            check(input.read() >= 0) { "Unexpected end of WAV stream" }
            left -= 1
        } else {
            left -= advanced
        }
    }
}
|
|
1441
|
+
|
|
1442
|
+
/**
 * Decodes the unsigned 16-bit little-endian value stored at [offset] in [bytes].
 *
 * @return the value widened to an Int in the range 0..65535.
 */
private fun littleEndianShort(bytes: ByteArray, offset: Int): Int {
    val low = bytes[offset].toInt() and 0xff
    val high = bytes[offset + 1].toInt() and 0xff
    return (high shl 8) or low
}
|
|
1446
|
+
|
|
1447
|
+
/**
 * Decodes the 32-bit little-endian value stored at [offset] in [bytes].
 *
 * The result is a signed Int, so stored values of 0x80000000 and above come
 * back negative.
 */
private fun littleEndianInt(bytes: ByteArray, offset: Int): Int =
    (0 until 4).fold(0) { acc, index ->
        acc or ((bytes[offset + index].toInt() and 0xff) shl (8 * index))
    }
|
|
1453
|
+
|
|
1454
|
+
/** Returns the first four bytes of [bytes] decoded as US-ASCII (a RIFF chunk identifier). */
private fun chunkId(bytes: ByteArray): String =
    String(bytes, 0, 4, Charsets.US_ASCII)
|
|
1457
|
+
|
|
1458
|
+
/**
 * Parses a RIFF/WAVE header from [input] and leaves the stream positioned at
 * the first byte of the `data` chunk payload.
 *
 * Chunks other than `fmt ` and `data` are skipped, including the pad byte that
 * follows an odd-sized chunk. Only uncompressed (format tag 1), 16-bit, mono
 * or stereo audio is accepted.
 *
 * The `fmt ` chunk size is validated before it is used to allocate a buffer,
 * so a corrupt header cannot trigger a huge allocation.
 *
 * @return the parsed format, with `dataBytes` set to the declared size of the
 *   `data` chunk.
 * @throws IllegalStateException when the stream is not RIFF/WAVE, a chunk size
 *   is negative, the `fmt ` chunk is malformed or unsupported, `data` precedes
 *   `fmt `, or the stream ends early.
 */
private fun readWavPcmFormat(input: BufferedInputStream): PcmStreamFormat {
    // Standard fmt chunks are 16, 18 or 40 bytes; anything far beyond that is corrupt.
    val maxFmtChunkBytes = 4 * 1024

    val riff = readExactly(input, 12)
    val container = String(riff, 0, 4, Charsets.US_ASCII)
    val formType = String(riff, 8, 4, Charsets.US_ASCII)
    if (container != "RIFF" || formType != "WAVE") {
        throw IllegalStateException("Local inference TTS returned non-WAV audio")
    }

    var format: PcmStreamFormat? = null
    while (true) {
        val header = readExactly(input, 8)
        val id = chunkId(header)
        val size = littleEndianInt(header, 4)
        // Sizes are unsigned on disk; anything >= 2 GiB decodes as negative here.
        if (size < 0) {
            throw IllegalStateException("Invalid WAV chunk size for $id")
        }

        when (id) {
            "fmt " -> {
                if (size < 16 || size > maxFmtChunkBytes) {
                    throw IllegalStateException("Invalid WAV fmt chunk")
                }
                val fmt = readExactly(input, size)
                val audioFormat = littleEndianShort(fmt, 0)
                val channels = littleEndianShort(fmt, 2)
                val sampleRate = littleEndianInt(fmt, 4)
                val bitsPerSample = littleEndianShort(fmt, 14)
                if (audioFormat != 1) {
                    throw IllegalStateException("Only PCM WAV is supported, got format=$audioFormat")
                }
                if (bitsPerSample != 16) {
                    throw IllegalStateException("Only 16-bit PCM WAV is supported, got bits=$bitsPerSample")
                }
                if (channels !in 1..2 || sampleRate <= 0) {
                    throw IllegalStateException("Invalid WAV format sampleRate=$sampleRate channels=$channels")
                }
                format = PcmStreamFormat(sampleRate, channels, bitsPerSample, 0)
            }
            "data" -> {
                val parsed = format ?: throw IllegalStateException("WAV data arrived before fmt chunk")
                return parsed.copy(dataBytes = size)
            }
            else -> skipFully(input, size)
        }

        // RIFF chunks are word-aligned: an odd-sized chunk is followed by one pad byte.
        if (size % 2 == 1) skipFully(input, 1)
    }
}
|
|
1508
|
+
|
|
1509
|
+
/**
 * Creates a streaming, 16-bit PCM [AudioTrack] for [format] using the shared
 * voice audio attributes.
 *
 * The buffer is twice the platform minimum, but never smaller than 8 KiB.
 *
 * @throws IllegalStateException for an unsupported channel count, an invalid
 *   minimum buffer size, or a track that fails to initialize (in which case it
 *   is released first).
 */
private fun createPcmAudioTrack(format: PcmStreamFormat): AudioTrack {
    val encoding = AudioFormat.ENCODING_PCM_16BIT
    val channelMask = when (format.channels) {
        1 -> AudioFormat.CHANNEL_OUT_MONO
        2 -> AudioFormat.CHANNEL_OUT_STEREO
        else -> error("Unsupported PCM channel count ${format.channels}")
    }

    val minBuffer = AudioTrack.getMinBufferSize(format.sampleRate, channelMask, encoding)
    check(minBuffer > 0) { "AudioTrack buffer size invalid: $minBuffer" }
    val bufferSize = (minBuffer * 2).coerceAtLeast(8 * 1024)

    val attributes = voiceAudioAttributes()
    val pcmFormat = AudioFormat.Builder()
        .setEncoding(encoding)
        .setSampleRate(format.sampleRate)
        .setChannelMask(channelMask)
        .build()
    val track = AudioTrack.Builder()
        .setAudioAttributes(attributes)
        .setAudioFormat(pcmFormat)
        .setBufferSizeInBytes(bufferSize)
        .setTransferMode(AudioTrack.MODE_STREAM)
        .build()

    if (track.state == AudioTrack.STATE_INITIALIZED) return track
    // Free the native resources of the half-built track before reporting the failure.
    track.release()
    throw IllegalStateException("AudioTrack init failed")
}
|
|
1543
|
+
|
|
1544
|
+
/**
 * Copies up to `format.dataBytes` bytes of PCM from [input] into [track] in
 * 8 KiB chunks.
 *
 * Copying ends when the declared byte count has been consumed, the stream
 * stops yielding data, or a stop has been requested. The stop flag is checked
 * before each read and before each write.
 *
 * @return the number of whole frames written (bytes written divided by the
 *   frame size), or 0 when the frame size is not positive.
 * @throws IllegalStateException when [AudioTrack.write] reports zero or an error code.
 */
private fun writePcmStreamToTrack(
    input: BufferedInputStream,
    track: AudioTrack,
    format: PcmStreamFormat
): Long {
    val frameSize = format.channels * (format.bitsPerSample / 8)
    val chunk = ByteArray(8 * 1024)
    var totalWritten = 0L
    var left = format.dataBytes

    while (left > 0 && !pcmStopRequested.get()) {
        val got = input.read(chunk, 0, minOf(left, chunk.size))
        if (got <= 0) break
        left -= got

        // write() may accept fewer bytes than offered, so keep pushing the remainder.
        var cursor = 0
        while (cursor < got && !pcmStopRequested.get()) {
            val accepted = track.write(chunk, cursor, got - cursor)
            check(accepted > 0) { "AudioTrack write failed: $accepted" }
            cursor += accepted
            totalWritten += accepted.toLong()
        }
    }

    return if (frameSize > 0) totalWritten / frameSize else 0L
}
|
|
1573
|
+
|
|
1574
|
+
/**
 * Blocks until [track] has played [framesWritten] frames, a stop is requested,
 * or a deadline passes.
 *
 * The deadline is the audio's duration at [sampleRate], capped at 30 s, plus
 * one second of slack. The playback head is polled every 20 ms. Returns
 * immediately when there is nothing to wait for.
 */
private fun drainPcmTrack(track: AudioTrack, framesWritten: Long, sampleRate: Int) {
    if (framesWritten <= 0L || sampleRate <= 0) return

    val playbackMs = framesWritten * 1000L / sampleRate
    val waitBudgetMs = minOf(playbackMs, 30_000L) + 1_000L
    val deadline = SystemClock.elapsedRealtime() + waitBudgetMs

    while (true) {
        if (pcmStopRequested.get()) return
        if (track.playbackHeadPosition.toLong() >= framesWritten) return
        if (SystemClock.elapsedRealtime() >= deadline) return
        SystemClock.sleep(20)
    }
}
|
|
1586
|
+
|
|
756
1587
|
/**
|
|
757
1588
|
* Stream PCM audio from ElevenLabs and play via AudioTrack.
|
|
758
1589
|
* Ported from classic TalkModeManager with proper offset-based writes.
|
|
@@ -776,12 +1607,7 @@ class TalkModePlugin : Plugin() {
|
|
|
776
1607
|
|
|
777
1608
|
val bufferSize = max(minBuffer * 2, 8 * 1024)
|
|
778
1609
|
val track = AudioTrack.Builder()
|
|
779
|
-
.setAudioAttributes(
|
|
780
|
-
AudioAttributes.Builder()
|
|
781
|
-
.setUsage(AudioAttributes.USAGE_ASSISTANT)
|
|
782
|
-
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
|
|
783
|
-
.build()
|
|
784
|
-
)
|
|
1610
|
+
.setAudioAttributes(voiceAudioAttributes())
|
|
785
1611
|
.setAudioFormat(
|
|
786
1612
|
AudioFormat.Builder()
|
|
787
1613
|
.setEncoding(AudioFormat.ENCODING_PCM_16BIT)
|
|
@@ -802,6 +1628,7 @@ class TalkModePlugin : Plugin() {
|
|
|
802
1628
|
|
|
803
1629
|
Log.d(TAG, "PCM play start sampleRate=$sampleRate bufferSize=$bufferSize")
|
|
804
1630
|
val conn = openTtsConnection(voiceId, apiKey, request)
|
|
1631
|
+
activePcmConnection = conn
|
|
805
1632
|
try {
|
|
806
1633
|
val payload = buildRequestPayload(request)
|
|
807
1634
|
conn.outputStream.use { it.write(payload.toByteArray()) }
|
|
@@ -845,6 +1672,9 @@ class TalkModePlugin : Plugin() {
|
|
|
845
1672
|
Log.d(TAG, "PCM play done")
|
|
846
1673
|
} finally {
|
|
847
1674
|
cleanupPcmTrack()
|
|
1675
|
+
if (activePcmConnection === conn) {
|
|
1676
|
+
activePcmConnection = null
|
|
1677
|
+
}
|
|
848
1678
|
conn.disconnect()
|
|
849
1679
|
}
|
|
850
1680
|
}
|
|
@@ -970,43 +1800,125 @@ class TalkModePlugin : Plugin() {
|
|
|
970
1800
|
}
|
|
971
1801
|
}
|
|
972
1802
|
|
|
973
|
-
// ──
|
|
1803
|
+
// ── Voice audio session ─────────────────────────────────────────────
|
|
1804
|
+
//
|
|
1805
|
+
// The Android analog of the iOS `.playAndRecord` / `.voiceChat` /
|
|
1806
|
+
// `.defaultToSpeaker` session. Putting the device in MODE_IN_COMMUNICATION
|
|
1807
|
+
// for the whole conversation routes capture + playback through the
|
|
1808
|
+
// telephony path, which engages the platform hardware AEC so TTS coming out
|
|
1809
|
+
// the speaker is cancelled from the mic (the core fix for the mic+speaker
|
|
1810
|
+
// echo loop in hands-free mode). We also hold voice-communication audio
|
|
1811
|
+
// focus and route to the loudspeaker (unless a headset is connected) so
|
|
1812
|
+
// hands-free playback is audible.
|
|
1813
|
+
|
|
1814
|
+
/** Audio attributes used for both playback tracks and the audio-focus request: voice-communication usage, speech content. */
private fun voiceAudioAttributes(): AudioAttributes {
    val builder = AudioAttributes.Builder()
    builder.setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION)
    builder.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
    return builder.build()
}
|
|
974
1819
|
|
|
975
|
-
private fun
|
|
1820
|
+
/**
 * Activates the voice audio session; a no-op when it is already active or no
 * [AudioManager] is available.
 *
 * Remembers the current audio mode and speakerphone state so they can be
 * restored on release, requests transient-exclusive audio focus, switches to
 * MODE_IN_COMMUNICATION, routes output, and mutes the earcon streams. Losing
 * audio focus while speaking stops the current utterance.
 */
private fun configureVoiceAudioSession() {
    if (audioSessionActive) return
    val manager = audioManager ?: return

    // Snapshot what we are about to change, for releaseVoiceAudioSession().
    savedAudioMode = manager.mode
    @Suppress("DEPRECATION")
    savedSpeakerphoneOn = manager.isSpeakerphoneOn

    val onFocusChange = AudioManager.OnAudioFocusChangeListener { change ->
        val focusLost = change == AudioManager.AUDIOFOCUS_LOSS ||
            change == AudioManager.AUDIOFOCUS_LOSS_TRANSIENT
        // Another app took audio; stop speaking if we are.
        if (focusLost && isSpeaking) stopSpeakingInternal()
    }
    val focusRequest = AudioFocusRequest.Builder(AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_EXCLUSIVE)
        .setAudioAttributes(voiceAudioAttributes())
        .setOnAudioFocusChangeListener(onFocusChange)
        .build()
    audioFocusRequest = focusRequest
    manager.requestAudioFocus(focusRequest)

    manager.mode = AudioManager.MODE_IN_COMMUNICATION
    routeVoiceOutput(manager)
    muteEarconStreams(manager)
    audioSessionActive = true
    Log.d(TAG, "Voice audio session active (communication mode)")
}
|
|
1849
|
+
|
|
1850
|
+
/**
 * Mutes every stream in `earconStreams` for the duration of the session.
 *
 * Idempotent: does nothing when the streams are already marked muted. Failures
 * on individual streams are ignored, and the muted flag is set regardless.
 */
private fun muteEarconStreams(am: AudioManager) {
    if (earconStreamsMuted) return
    earconStreams.forEach { stream ->
        try {
            am.adjustStreamVolume(stream, AudioManager.ADJUST_MUTE, 0)
        } catch (_: Throwable) {
            // Some OEMs disallow muting certain streams without DND access.
        }
    }
    earconStreamsMuted = true
}
|
|
1862
|
+
|
|
1863
|
+
/**
 * Reverses [muteEarconStreams]: unmutes every stream in `earconStreams`.
 *
 * Does nothing unless the streams are currently marked muted. Failures on
 * individual streams are ignored, and the muted flag is cleared regardless.
 */
private fun unmuteEarconStreams(am: AudioManager) {
    if (!earconStreamsMuted) return
    earconStreams.forEach { stream ->
        try {
            am.adjustStreamVolume(stream, AudioManager.ADJUST_UNMUTE, 0)
        } catch (_: Throwable) {
        }
    }
    earconStreamsMuted = false
}
|
|
989
1872
|
|
|
1873
|
+
/**
 * Picks the output route for voice playback: the loudspeaker by default, but a
 * connected wired, USB or Bluetooth headset takes precedence (the iOS
 * `.defaultToSpeaker` semantic).
 *
 * With a headset present, any explicit communication device is cleared
 * (Android S+) and speakerphone is turned off. Otherwise, on Android S+ the
 * built-in speaker is selected as the communication device; if that is not
 * possible, or on older releases, the legacy speakerphone switch is used.
 */
private fun routeVoiceOutput(am: AudioManager) {
    val headsetTypes = setOf(
        AudioDeviceInfo.TYPE_WIRED_HEADSET,
        AudioDeviceInfo.TYPE_WIRED_HEADPHONES,
        AudioDeviceInfo.TYPE_USB_HEADSET,
        AudioDeviceInfo.TYPE_BLUETOOTH_SCO,
        AudioDeviceInfo.TYPE_BLUETOOTH_A2DP,
    )
    val headsetConnected = am.getDevices(AudioManager.GET_DEVICES_OUTPUTS)
        .any { output -> output.type in headsetTypes }

    if (headsetConnected) {
        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) am.clearCommunicationDevice()
        @Suppress("DEPRECATION")
        am.isSpeakerphoneOn = false
        return
    }

    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
        val builtInSpeaker = am.availableCommunicationDevices
            .firstOrNull { candidate -> candidate.type == AudioDeviceInfo.TYPE_BUILTIN_SPEAKER }
        if (builtInSpeaker != null && am.setCommunicationDevice(builtInSpeaker)) return
    }

    // Fallback for pre-S devices, or when the speaker could not be selected above.
    @Suppress("DEPRECATION")
    am.isSpeakerphoneOn = true
}
|
|
997
1900
|
|
|
998
|
-
private fun
|
|
1901
|
+
/**
 * Tears down the voice audio session; a no-op when it is not active or no
 * [AudioManager] is available.
 *
 * Unmutes the earcon streams, abandons audio focus, clears any explicit
 * communication device (Android S+), and restores the speakerphone state and
 * audio mode that were saved when the session was configured.
 */
private fun releaseVoiceAudioSession() {
    if (!audioSessionActive) return
    val manager = audioManager ?: return

    unmuteEarconStreams(manager)

    val heldFocus = audioFocusRequest
    if (heldFocus != null) manager.abandonAudioFocusRequest(heldFocus)
    audioFocusRequest = null

    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) manager.clearCommunicationDevice()
    @Suppress("DEPRECATION")
    manager.isSpeakerphoneOn = savedSpeakerphoneOn
    manager.mode = savedAudioMode

    audioSessionActive = false
    Log.d(TAG, "Voice audio session released")
}
|
|
1005
1914
|
|
|
1006
1915
|
// ── Cleanup helpers ─────────────────────────────────────────────────
|
|
1007
1916
|
|
|
1008
1917
|
private fun stopSpeakingInternal() {
|
|
1009
1918
|
pcmStopRequested.set(true)
|
|
1919
|
+
val conn = activePcmConnection
|
|
1920
|
+
activePcmConnection = null
|
|
1921
|
+
conn?.disconnect()
|
|
1010
1922
|
cleanupPcmTrack()
|
|
1011
1923
|
systemTts?.stop()
|
|
1012
1924
|
systemTtsPending?.cancel()
|
|
@@ -1162,6 +2074,9 @@ class TalkModePlugin : Plugin() {
|
|
|
1162
2074
|
}
|
|
1163
2075
|
|
|
1164
2076
|
/**
 * Reports whether [permission] is currently granted.
 *
 * `RECORD_AUDIO` is checked directly against the Android runtime permission;
 * every other value is resolved through the plugin's permission state.
 */
private fun isPermissionGranted(permission: String): Boolean =
    when (permission) {
        Manifest.permission.RECORD_AUDIO ->
            context.checkSelfPermission(permission) == PackageManager.PERMISSION_GRANTED
        else ->
            getPermissionState(permission) == com.getcapacitor.PermissionState.GRANTED
    }
|
|
1167
2082
|
|
|
@@ -1176,10 +2091,13 @@ class TalkModePlugin : Plugin() {
|
|
|
1176
2091
|
systemTts?.shutdown()
|
|
1177
2092
|
systemTts = null
|
|
1178
2093
|
cleanupPcmTrack()
|
|
2094
|
+
audioFrameRunning.set(false)
|
|
2095
|
+
audioFrameJob?.cancel()
|
|
2096
|
+
releaseAudioRecord()
|
|
1179
2097
|
silenceJob?.cancel()
|
|
1180
2098
|
restartJob?.cancel()
|
|
1181
2099
|
speakingJob?.cancel()
|
|
1182
|
-
|
|
2100
|
+
releaseVoiceAudioSession()
|
|
1183
2101
|
scope.cancel()
|
|
1184
2102
|
}
|
|
1185
2103
|
|