@elizaos/capacitor-talkmode 1.0.0 → 2.0.11-beta.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +137 -0
- package/android/build.gradle +17 -3
- package/android/src/main/java/ai/eliza/plugins/talkmode/TalkModePlugin.kt +840 -54
- package/dist/esm/definitions.d.ts +146 -0
- package/dist/esm/definitions.d.ts.map +1 -1
- package/dist/esm/web.d.ts +6 -1
- package/dist/esm/web.d.ts.map +1 -1
- package/dist/esm/web.js +34 -5
- package/dist/esm/web.test.d.ts +2 -0
- package/dist/esm/web.test.d.ts.map +1 -0
- package/dist/esm/web.test.js +137 -0
- package/dist/plugin.cjs.js +34 -5
- package/dist/plugin.cjs.js.map +1 -1
- package/dist/plugin.js +34 -5
- package/dist/plugin.js.map +1 -1
- package/ios/Sources/TalkModePlugin/TalkModePlugin.swift +266 -16
- package/package.json +18 -14
|
@@ -2,10 +2,17 @@ package ai.eliza.plugins.talkmode
|
|
|
2
2
|
|
|
3
3
|
import android.Manifest
|
|
4
4
|
import android.content.Intent
|
|
5
|
+
import android.content.pm.PackageManager
|
|
5
6
|
import android.media.AudioAttributes
|
|
7
|
+
import android.media.AudioDeviceInfo
|
|
8
|
+
import android.media.AudioFocusRequest
|
|
6
9
|
import android.media.AudioFormat
|
|
7
10
|
import android.media.AudioManager
|
|
11
|
+
import android.media.AudioRecord
|
|
8
12
|
import android.media.AudioTrack
|
|
13
|
+
import android.media.MediaRecorder
|
|
14
|
+
import android.util.Base64
|
|
15
|
+
import android.os.Build
|
|
9
16
|
import android.os.Bundle
|
|
10
17
|
import android.os.Handler
|
|
11
18
|
import android.os.Looper
|
|
@@ -26,6 +33,7 @@ import com.getcapacitor.annotation.Permission
|
|
|
26
33
|
import com.getcapacitor.annotation.PermissionCallback
|
|
27
34
|
import kotlinx.coroutines.*
|
|
28
35
|
import java.io.BufferedInputStream
|
|
36
|
+
import java.io.File
|
|
29
37
|
import java.net.HttpURLConnection
|
|
30
38
|
import java.net.URL
|
|
31
39
|
import java.util.Locale
|
|
@@ -45,6 +53,11 @@ class TalkModePlugin : Plugin() {
|
|
|
45
53
|
private const val TAG = "TalkMode"
|
|
46
54
|
private const val DEFAULT_MODEL_ID = "eleven_flash_v2_5"
|
|
47
55
|
private const val DEFAULT_OUTPUT_FORMAT = "pcm_24000"
|
|
56
|
+
private const val LOCAL_INFERENCE_TTS_URL = "http://127.0.0.1:31337/api/tts/local-inference"
|
|
57
|
+
// 16 kHz mono is the rate VAD / diarizer / wake-word models expect; 20 ms
|
|
58
|
+
// (320 samples) is the standard VAD frame size.
|
|
59
|
+
private const val DEFAULT_FRAME_SAMPLE_RATE = 16000
|
|
60
|
+
private const val DEFAULT_FRAME_MS = 20
|
|
48
61
|
}
|
|
49
62
|
|
|
50
63
|
private val mainHandler = Handler(Looper.getMainLooper())
|
|
@@ -65,6 +78,10 @@ class TalkModePlugin : Plugin() {
|
|
|
65
78
|
private var lastHeardAtMs: Long? = null
|
|
66
79
|
private var silenceJob: Job? = null
|
|
67
80
|
private val silenceWindowMs = 700L
|
|
81
|
+
// The recognizer's own onResults AND our silence monitor can both finalize
|
|
82
|
+
// the same utterance; dedup so a turn is emitted (and sent) exactly once.
|
|
83
|
+
private var lastEmittedFinal = ""
|
|
84
|
+
private var lastEmittedFinalAtMs = 0L
|
|
68
85
|
|
|
69
86
|
// TTS
|
|
70
87
|
private var systemTts: TextToSpeech? = null
|
|
@@ -79,10 +96,37 @@ class TalkModePlugin : Plugin() {
|
|
|
79
96
|
private var lastSpokenText: String? = null
|
|
80
97
|
private var speakStartTimeMs: Long = 0
|
|
81
98
|
private var lastInterruptedAtSeconds: Double? = null
|
|
99
|
+
@Volatile private var activePcmConnection: HttpURLConnection? = null
|
|
82
100
|
|
|
83
|
-
//
|
|
101
|
+
// Voice audio session (communication-mode routing + focus, mirrors the iOS
|
|
102
|
+
// .playAndRecord/.voiceChat/.defaultToSpeaker session). Held for the whole
|
|
103
|
+
// conversation so the platform AEC has a stable speaker reference to cancel.
|
|
84
104
|
private var audioManager: AudioManager? = null
|
|
85
|
-
private var audioFocusRequest:
|
|
105
|
+
private var audioFocusRequest: AudioFocusRequest? = null
|
|
106
|
+
private var audioSessionActive = false
|
|
107
|
+
private var savedAudioMode = AudioManager.MODE_NORMAL
|
|
108
|
+
private var savedSpeakerphoneOn = false
|
|
109
|
+
// Streams we mute for the session to suppress the platform recognizer's
|
|
110
|
+
// start/stop earcons (the "on/off" beeps heard as it re-arms continuously).
|
|
111
|
+
// TTS plays on STREAM_VOICE_CALL (USAGE_VOICE_COMMUNICATION) so it stays
|
|
112
|
+
// audible. Tracked so we only unmute streams we muted.
|
|
113
|
+
private val earconStreams = intArrayOf(
|
|
114
|
+
AudioManager.STREAM_MUSIC,
|
|
115
|
+
AudioManager.STREAM_SYSTEM,
|
|
116
|
+
AudioManager.STREAM_NOTIFICATION,
|
|
117
|
+
)
|
|
118
|
+
private var earconStreamsMuted = false
|
|
119
|
+
|
|
120
|
+
// Raw PCM frame capture (diarization / VAD / wake-word source). Opt-in and
|
|
121
|
+
// mutually exclusive with SpeechRecognizer on the mic: Android only lets one
|
|
122
|
+
// capture client own a given input source at a time, so starting frame
|
|
123
|
+
// capture SUSPENDS any active SpeechRecognizer and stopping it resumes STT.
|
|
124
|
+
private var audioRecord: AudioRecord? = null
|
|
125
|
+
private var audioFrameJob: Job? = null
|
|
126
|
+
private val audioFrameRunning = AtomicBoolean(false)
|
|
127
|
+
private var sttSuspendedForFrames = false
|
|
128
|
+
private var lastFrameSampleRate = DEFAULT_FRAME_SAMPLE_RATE
|
|
129
|
+
private var lastFrameSamples = 0
|
|
86
130
|
|
|
87
131
|
// Config
|
|
88
132
|
private var apiKey: String? = null
|
|
@@ -189,6 +233,7 @@ class TalkModePlugin : Plugin() {
|
|
|
189
233
|
systemTtsReady = status == TextToSpeech.SUCCESS
|
|
190
234
|
if (systemTtsReady) {
|
|
191
235
|
systemTts?.language = Locale.getDefault()
|
|
236
|
+
systemTts?.setAudioAttributes(voiceAudioAttributes())
|
|
192
237
|
systemTts?.setOnUtteranceProgressListener(object : UtteranceProgressListener() {
|
|
193
238
|
override fun onStart(id: String?) {}
|
|
194
239
|
|
|
@@ -270,6 +315,7 @@ class TalkModePlugin : Plugin() {
|
|
|
270
315
|
enabled = true
|
|
271
316
|
stopRequested = false
|
|
272
317
|
listeningMode = true
|
|
318
|
+
configureVoiceAudioSession()
|
|
273
319
|
setState("listening", "Listening")
|
|
274
320
|
|
|
275
321
|
mainHandler.post {
|
|
@@ -286,6 +332,13 @@ class TalkModePlugin : Plugin() {
|
|
|
286
332
|
})
|
|
287
333
|
} catch (e: Exception) {
|
|
288
334
|
Log.e(TAG, "Failed to start", e)
|
|
335
|
+
// Recognizer creation failed AFTER the audio session was
|
|
336
|
+
// configured — release it so the earcon streams aren't left
|
|
337
|
+
// muted and the device isn't stuck in MODE_IN_COMMUNICATION.
|
|
338
|
+
enabled = false
|
|
339
|
+
listeningMode = false
|
|
340
|
+
releaseVoiceAudioSession()
|
|
341
|
+
setState("idle", "Off")
|
|
289
342
|
call.resolve(JSObject().apply {
|
|
290
343
|
put("started", false)
|
|
291
344
|
put("error", e.message ?: "Failed to start")
|
|
@@ -307,6 +360,10 @@ class TalkModePlugin : Plugin() {
|
|
|
307
360
|
lastTranscript = ""
|
|
308
361
|
lastHeardAtMs = null
|
|
309
362
|
|
|
363
|
+
// Release any raw-PCM capture; `enabled` is already false so this won't
|
|
364
|
+
// re-arm SpeechRecognizer.
|
|
365
|
+
stopAudioFramesInternal()
|
|
366
|
+
|
|
310
367
|
mainHandler.post {
|
|
311
368
|
recognizer?.cancel()
|
|
312
369
|
recognizer?.destroy()
|
|
@@ -314,6 +371,7 @@ class TalkModePlugin : Plugin() {
|
|
|
314
371
|
}
|
|
315
372
|
|
|
316
373
|
stopSpeakingInternal()
|
|
374
|
+
releaseVoiceAudioSession()
|
|
317
375
|
setState("idle", "Off")
|
|
318
376
|
call.resolve()
|
|
319
377
|
}
|
|
@@ -364,16 +422,18 @@ class TalkModePlugin : Plugin() {
|
|
|
364
422
|
}
|
|
365
423
|
|
|
366
424
|
val useSystemTts = call.getBoolean("useSystemTts", false) ?: false
|
|
425
|
+
val useLocalInferenceTts = call.getBoolean("useLocalInferenceTts", false) ?: false
|
|
367
426
|
val directive = call.getObject("directive")
|
|
368
427
|
|
|
369
428
|
speakingJob = scope.launch {
|
|
370
|
-
speakInternal(text, useSystemTts, directive, call)
|
|
429
|
+
speakInternal(text, useSystemTts, useLocalInferenceTts, directive, call)
|
|
371
430
|
}
|
|
372
431
|
}
|
|
373
432
|
|
|
374
433
|
@PluginMethod
|
|
375
434
|
fun stopSpeaking(call: PluginCall) {
|
|
376
435
|
val interruptedAt = computeInterruptedAt()
|
|
436
|
+
lastInterruptedAtSeconds = interruptedAt
|
|
377
437
|
stopSpeakingInternal()
|
|
378
438
|
call.resolve(JSObject().apply {
|
|
379
439
|
if (interruptedAt != null) {
|
|
@@ -408,6 +468,279 @@ class TalkModePlugin : Plugin() {
|
|
|
408
468
|
call.resolve(buildPermissionResult())
|
|
409
469
|
}
|
|
410
470
|
|
|
471
|
+
// ── Raw PCM frame capture (diarization / VAD / wake-word) ────────────
|
|
472
|
+
|
|
473
|
+
/**
 * Plugin entry point: begin raw PCM frame capture.
 *
 * Capture starts immediately when the "microphone" permission alias is
 * already granted. Otherwise the permission is requested and
 * [handleStartAudioFramesPermission] continues once the user has answered.
 */
@PluginMethod
fun startAudioFrames(call: PluginCall) {
    val micGranted = getPermissionState("microphone") == PermissionState.GRANTED
    if (micGranted) {
        startAudioFramesInternal(call)
    } else {
        requestPermissionForAlias("microphone", call, "handleStartAudioFramesPermission")
    }
}
|
|
481
|
+
|
|
482
|
+
/**
 * Permission callback for [startAudioFrames].
 *
 * Resolves the call with `started = false` when the microphone permission
 * was not granted; otherwise proceeds with the actual capture start.
 */
@PermissionCallback
private fun handleStartAudioFramesPermission(call: PluginCall) {
    if (getPermissionState("microphone") != PermissionState.GRANTED) {
        val denied = JSObject()
        denied.put("started", false)
        denied.put("error", "Microphone permission denied")
        call.resolve(denied)
        return
    }
    startAudioFramesInternal(call)
}
|
|
493
|
+
|
|
494
|
+
/**
 * Open the microphone through [AudioRecord] and start emitting "audioFrame"
 * events.
 *
 * Reads the optional `sampleRate` (default [DEFAULT_FRAME_SAMPLE_RATE]) and
 * `frameMs` (default [DEFAULT_FRAME_MS]) call options. `frameMs` is clamped
 * to 1..1000 ms so a zero, negative or absurdly large value can neither
 * collapse the frame to a single sample nor overflow the sample-count
 * arithmetic / allocate an unbounded buffer.
 *
 * The call is always resolved (never rejected) with:
 *  - `started`      whether capture is running,
 *  - `sampleRate`   the rate reported by the opened record,
 *  - `frameSamples` samples per emitted frame,
 *  - `suspendedStt` whether the speech recognizer was suspended for capture,
 *  - `error`        a message, only when `started` is false.
 *
 * If capture is already running the current parameters are reported and
 * nothing is restarted.
 */
private fun startAudioFramesInternal(call: PluginCall) {
    if (audioFrameRunning.get()) {
        call.resolve(JSObject().apply {
            put("started", true)
            put("sampleRate", lastFrameSampleRate)
            put("frameSamples", lastFrameSamples)
            put("suspendedStt", sttSuspendedForFrames)
        })
        return
    }

    // Single failure path: drop any half-opened record, give the mic back to
    // the speech recognizer if we took it, and report the reason to JS.
    fun failStart(message: String) {
        releaseAudioRecord()
        if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
        call.resolve(JSObject().apply {
            put("started", false)
            put("error", message)
        })
    }

    val maxFrameMs = 1_000
    val requestedRate = call.getInt("sampleRate") ?: DEFAULT_FRAME_SAMPLE_RATE
    val frameMs = (call.getInt("frameMs") ?: DEFAULT_FRAME_MS).coerceIn(1, maxFrameMs)

    // SpeechRecognizer holds the mic; a parallel AudioRecord on the same
    // input is expected to fail, so suspend it for the duration of capture.
    // NOTE(review): the recognizer teardown is posted to the main thread and
    // may not have completed when the record is opened below — confirm.
    val wasListening = isListening || listeningMode
    if (wasListening) {
        suspendSpeechRecognizerForFrames()
    }

    val record = try {
        openAudioRecord(requestedRate)
    } catch (e: Exception) {
        Log.e(TAG, "AudioRecord open failed", e)
        failStart(e.message ?: "AudioRecord open failed")
        return
    }
    audioRecord = record

    try {
        record.startRecording()
    } catch (e: Exception) {
        Log.e(TAG, "AudioRecord startRecording failed", e)
        failStart(e.message ?: "AudioRecord start failed")
        return
    }

    if (record.recordingState != AudioRecord.RECORDSTATE_RECORDING) {
        Log.e(TAG, "AudioRecord did not enter RECORDING state")
        failStart("AudioRecord did not start (mic likely held by SpeechRecognizer)")
        return
    }

    // Publish the frame geometry only once capture is really running, so a
    // failed start never leaves stale values behind.
    val actualRate = record.sampleRate
    val frameSamples = (actualRate * frameMs / 1000).coerceAtLeast(1)
    lastFrameSampleRate = actualRate
    lastFrameSamples = frameSamples

    audioFrameRunning.set(true)
    launchFrameLoop(record, frameSamples)

    call.resolve(JSObject().apply {
        put("started", true)
        put("sampleRate", actualRate)
        put("frameSamples", frameSamples)
        put("suspendedStt", sttSuspendedForFrames)
    })
}
|
|
567
|
+
|
|
568
|
+
/**
 * Plugin entry point: stop raw PCM frame capture.
 *
 * Delegates the teardown to [stopAudioFramesInternal] and then resolves the
 * call with no payload.
 */
@PluginMethod
fun stopAudioFrames(call: PluginCall) {
    stopAudioFramesInternal()
    call.resolve()
}
|
|
573
|
+
|
|
574
|
+
/**
 * Plugin entry point: report whether raw PCM frame capture is running.
 *
 * Resolves with `{ capturing: Boolean }`, taken from the capture-running flag.
 */
@PluginMethod
fun isCapturingAudioFrames(call: PluginCall) {
    val status = JSObject()
    status.put("capturing", audioFrameRunning.get())
    call.resolve(status)
}
|
|
580
|
+
|
|
581
|
+
/**
 * Open a mono, 16-bit PCM [AudioRecord] at [sampleRate].
 *
 * Audio sources are tried in order — VOICE_RECOGNITION first, then MIC — and
 * the first record that reaches STATE_INITIALIZED is returned. Records that
 * fail to initialize are released before the next source is tried.
 *
 * The buffer is twice the platform minimum, but never smaller than 4 KiB.
 *
 * @throws IllegalStateException when the platform reports no valid minimum
 *   buffer size for [sampleRate], or when no source could be initialized
 *   (the last failure is attached as the cause).
 */
private fun openAudioRecord(sampleRate: Int): AudioRecord {
    val channelConfig = AudioFormat.CHANNEL_IN_MONO
    val encoding = AudioFormat.ENCODING_PCM_16BIT

    val minBuffer = AudioRecord.getMinBufferSize(sampleRate, channelConfig, encoding)
    check(minBuffer > 0) { "AudioRecord min buffer invalid ($minBuffer) for ${sampleRate}Hz" }
    val bufferBytes = (minBuffer * 2).coerceAtLeast(4 * 1024)

    val candidateSources = intArrayOf(
        MediaRecorder.AudioSource.VOICE_RECOGNITION,
        MediaRecorder.AudioSource.MIC,
    )
    var lastError: Throwable? = null
    for (source in candidateSources) {
        try {
            @Suppress("MissingPermission")
            val candidate = AudioRecord(source, sampleRate, channelConfig, encoding, bufferBytes)
            if (candidate.state == AudioRecord.STATE_INITIALIZED) {
                return candidate
            }
            candidate.release()
            lastError = IllegalStateException("AudioRecord uninitialized for source $source")
        } catch (e: Exception) {
            lastError = e
        }
    }
    throw IllegalStateException("AudioRecord could not initialize at ${sampleRate}Hz", lastError)
}
|
|
624
|
+
|
|
625
|
+
/**
 * Start the background loop that reads [frameSamples]-sized frames from
 * [record] and forwards each one to JS as an "audioFrame" event.
 *
 * Every event carries the frame as base64 little-endian 16-bit PCM
 * (`pcm16`), plus `sampleRate`, `channels` (always 1), `samples`, a
 * normalized `rms` level, an elapsed-realtime `timestamp` and a
 * monotonically increasing `frameIndex`.
 *
 * The loop runs on [Dispatchers.IO] because `AudioRecord.read` blocks. It
 * ends when the running flag is cleared, the job is cancelled, `read`
 * returns a negative error code, or an exception is thrown.
 *
 * If the loop dies on its own (read error or exception while capture was
 * still supposed to be running) it tears the capture down itself: clears the
 * running flag, releases the record, resumes the speech recognizer if it was
 * suspended, and emits a non-fatal "error" event. A deliberate stop stays
 * silent, even if releasing the record made the in-flight read fail.
 */
private fun launchFrameLoop(record: AudioRecord, frameSamples: Int) {
    audioFrameJob?.cancel()
    audioFrameJob = scope.launch(Dispatchers.IO) {
        val samples = ShortArray(frameSamples)
        val bytes = ByteArray(frameSamples * 2)
        var frameIndex = 0L
        var failure: Throwable? = null

        try {
            while (audioFrameRunning.get() && isActive) {
                val read = record.read(samples, 0, frameSamples)
                if (read < 0) {
                    // Negative values are AudioRecord error codes: the record
                    // was released or the mic was taken away.
                    failure = IllegalStateException("AudioRecord.read returned $read")
                    break
                }
                if (read == 0) continue

                // Serialize to little-endian bytes and accumulate energy in
                // one pass over the samples actually read.
                var sumSquares = 0.0
                for (i in 0 until read) {
                    val s = samples[i].toInt()
                    bytes[2 * i] = (s and 0xff).toByte()
                    bytes[2 * i + 1] = ((s shr 8) and 0xff).toByte()
                    sumSquares += s.toDouble() * s.toDouble()
                }
                val rms = Math.sqrt(sumSquares / read) / 32768.0
                val pcmBase64 = Base64.encodeToString(bytes, 0, read * 2, Base64.NO_WRAP)
                val idx = frameIndex
                frameIndex += 1
                val ts = SystemClock.elapsedRealtime()

                notifyListeners("audioFrame", JSObject().apply {
                    put("pcm16", pcmBase64)
                    put("sampleRate", record.sampleRate)
                    put("channels", 1)
                    put("samples", read)
                    put("rms", rms)
                    put("timestamp", ts)
                    put("frameIndex", idx)
                })
            }
        } catch (e: CancellationException) {
            // Never swallow coroutine cancellation.
            throw e
        } catch (e: Throwable) {
            failure = e
        }

        // Job cancelled (stop, restart, or scope teardown): whoever cancelled
        // us owns the cleanup.
        if (!isActive) return@launch
        // A newer capture replaced this record: not ours to tear down.
        if (audioRecord !== record) return@launch
        // Flag already cleared means a deliberate stop is in progress; only
        // the side that flips it from true to false performs the teardown.
        if (!audioFrameRunning.compareAndSet(true, false)) return@launch

        Log.e(TAG, "Audio frame loop error", failure)
        releaseAudioRecord()
        if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
        notifyListeners("error", JSObject().apply {
            put("message", "Audio frame capture stopped: ${failure?.message}")
            put("fatal", false)
        })
    }
}
|
|
679
|
+
|
|
680
|
+
/**
 * Tear down raw PCM frame capture.
 *
 * Clears the running flag unconditionally, then — unless capture was neither
 * running nor holding a record — cancels the frame job, releases the
 * [AudioRecord] and resumes the speech recognizer if it had been suspended
 * for capture.
 */
private fun stopAudioFramesInternal() {
    // The flag is cleared even when we bail out just below.
    val wasRunning = audioFrameRunning.getAndSet(false)
    val nothingToRelease = audioRecord == null
    if (!wasRunning && nothingToRelease) return

    audioFrameJob?.cancel()
    audioFrameJob = null
    releaseAudioRecord()

    if (sttSuspendedForFrames) resumeSpeechRecognizerAfterFrames()
}
|
|
691
|
+
|
|
692
|
+
/**
 * Stop (if recording) and release the current [AudioRecord], if any.
 *
 * The field is cleared before the record is touched. Both steps are
 * best-effort: a failure in either one is ignored, and a failing stop does
 * not prevent the release attempt.
 */
private fun releaseAudioRecord() {
    val active = audioRecord
    if (active == null) return
    audioRecord = null

    runCatching {
        if (active.recordingState == AudioRecord.RECORDSTATE_RECORDING) {
            active.stop()
        }
    }
    runCatching { active.release() }
}
|
|
706
|
+
|
|
707
|
+
/**
 * Suspend SpeechRecognizer so AudioRecord can own the mic.
 *
 * Marks STT as suspended for frame capture, clears the listening flags,
 * cancels the pending restart and silence-monitor jobs, and posts the
 * recognizer teardown to the main thread.
 *
 * `cancel()` and `destroy()` are attempted independently: a failing
 * `cancel()` must not skip `destroy()`, otherwise the recognizer would be
 * dropped without being torn down. Failures are logged instead of being
 * silently swallowed.
 */
private fun suspendSpeechRecognizerForFrames() {
    sttSuspendedForFrames = true
    listeningMode = false
    isListening = false
    restartJob?.cancel()
    silenceJob?.cancel()
    mainHandler.post {
        val active = recognizer
        recognizer = null
        if (active != null) {
            try {
                active.cancel()
            } catch (e: Throwable) {
                Log.w(TAG, "SpeechRecognizer cancel failed while suspending for frames", e)
            }
            try {
                active.destroy()
            } catch (e: Throwable) {
                Log.w(TAG, "SpeechRecognizer destroy failed while suspending for frames", e)
            }
        }
    }
}
|
|
723
|
+
|
|
724
|
+
/**
 * Re-arm SpeechRecognizer after frame capture ends, if a session is active.
 *
 * Always clears the "suspended for frames" marker. When the talk session is
 * enabled and no stop was requested, it sets listening mode and posts the
 * recognizer re-creation to the main thread.
 *
 * The session state is checked again when the posted block actually runs:
 * between scheduling and execution the session may have been stopped, or
 * frame capture may have been started again, and re-creating the recognizer
 * in either case would grab the mic when it must stay free.
 */
private fun resumeSpeechRecognizerAfterFrames() {
    sttSuspendedForFrames = false
    if (!enabled || stopRequested) return
    listeningMode = true
    mainHandler.post {
        // Re-validate: state may have changed since this block was queued.
        if (!enabled || stopRequested || sttSuspendedForFrames) return@post
        try {
            if (!SpeechRecognizer.isRecognitionAvailable(context)) {
                Log.w(TAG, "Speech recognition unavailable; STT not resumed after frames")
                return@post
            }
            recognizer?.destroy()
            recognizer = SpeechRecognizer.createSpeechRecognizer(context).apply {
                setRecognitionListener(recognitionListener)
            }
            startListeningInternal(markListening = true)
            startSilenceMonitor()
        } catch (e: Exception) {
            Log.e(TAG, "Failed to resume STT after frames", e)
        }
    }
}
|
|
743
|
+
|
|
411
744
|
// ── Config ──────────────────────────────────────────────────────────
|
|
412
745
|
|
|
413
746
|
private fun applyConfig(config: JSObject) {
|
|
@@ -462,6 +795,13 @@ class TalkModePlugin : Plugin() {
|
|
|
462
795
|
putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
|
|
463
796
|
putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
|
|
464
797
|
putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)
|
|
798
|
+
// On-device recognizer (no network round-trip; works offline). The
|
|
799
|
+
// platform recognizer's open/close cadence during continuous use is
|
|
800
|
+
// intrinsic and not controllable via the silence-length extras (the
|
|
801
|
+
// on-device SODA engine ignores them); we silence the AUDIBLE part of
|
|
802
|
+
// that churn by muting the earcon streams for the session instead
|
|
803
|
+
// (see configureVoiceAudioSession).
|
|
804
|
+
putExtra(RecognizerIntent.EXTRA_PREFER_OFFLINE, true)
|
|
465
805
|
sttLanguage?.let { putExtra(RecognizerIntent.EXTRA_LANGUAGE, it) }
|
|
466
806
|
}
|
|
467
807
|
|
|
@@ -515,13 +855,14 @@ class TalkModePlugin : Plugin() {
|
|
|
515
855
|
val elapsed = SystemClock.elapsedRealtime() - lastHeard
|
|
516
856
|
if (elapsed < silenceWindowMs) return
|
|
517
857
|
|
|
518
|
-
// Finalize
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
})
|
|
858
|
+
// Finalize this turn (deduped against the recognizer's own onResults),
|
|
859
|
+
// then restart the recognizer so the next utterance is a CLEAN session —
|
|
860
|
+
// Android SpeechRecognizer accumulates within a session, so without the
|
|
861
|
+
// restart the next turn's partials would prepend the words we just sent.
|
|
523
862
|
lastTranscript = ""
|
|
524
863
|
lastHeardAtMs = null
|
|
864
|
+
emitFinalOnce(transcript)
|
|
865
|
+
scheduleRestart()
|
|
525
866
|
}
|
|
526
867
|
|
|
527
868
|
private fun handleTranscript(transcript: String, isFinal: Boolean) {
|
|
@@ -531,34 +872,71 @@ class TalkModePlugin : Plugin() {
|
|
|
531
872
|
if (isSpeaking && interruptOnSpeech) {
|
|
532
873
|
if (shouldInterrupt(transcript)) {
|
|
533
874
|
val interruptedAt = computeInterruptedAt()
|
|
534
|
-
stopSpeakingInternal()
|
|
535
875
|
lastInterruptedAtSeconds = interruptedAt
|
|
876
|
+
stopSpeakingInternal()
|
|
536
877
|
}
|
|
537
878
|
return
|
|
538
879
|
}
|
|
539
880
|
|
|
540
881
|
if (!isListening) return
|
|
541
882
|
|
|
542
|
-
if (
|
|
883
|
+
if (isFinal) {
|
|
884
|
+
// A real end-of-turn from the recognizer: emit once and clear the
|
|
885
|
+
// pending buffer so the silence monitor doesn't re-finalize the same
|
|
886
|
+
// words (the double-send bug).
|
|
887
|
+
lastTranscript = ""
|
|
888
|
+
lastHeardAtMs = null
|
|
889
|
+
emitFinalOnce(transcript)
|
|
890
|
+
} else {
|
|
543
891
|
lastTranscript = transcript
|
|
544
892
|
lastHeardAtMs = SystemClock.elapsedRealtime()
|
|
893
|
+
notifyListeners("transcript", JSObject().apply {
|
|
894
|
+
put("transcript", transcript)
|
|
895
|
+
put("isFinal", false)
|
|
896
|
+
})
|
|
545
897
|
}
|
|
898
|
+
}
|
|
546
899
|
|
|
900
|
+
/**
|
|
901
|
+
* Emit a FINAL transcript exactly once. Both the recognizer's `onResults`
|
|
902
|
+
* and the silence monitor can finalize the same utterance; collapse them so
|
|
903
|
+
* the turn is sent a single time (a repeated final within 2s is dropped).
|
|
904
|
+
*/
|
|
905
|
+
private fun emitFinalOnce(transcript: String) {
|
|
906
|
+
val text = transcript.trim()
|
|
907
|
+
if (text.isEmpty()) return
|
|
908
|
+
val now = SystemClock.elapsedRealtime()
|
|
909
|
+
if (text == lastEmittedFinal && now - lastEmittedFinalAtMs < 2000L) return
|
|
910
|
+
lastEmittedFinal = text
|
|
911
|
+
lastEmittedFinalAtMs = now
|
|
547
912
|
notifyListeners("transcript", JSObject().apply {
|
|
548
|
-
put("transcript",
|
|
549
|
-
put("isFinal",
|
|
913
|
+
put("transcript", text)
|
|
914
|
+
put("isFinal", true)
|
|
550
915
|
})
|
|
551
916
|
}
|
|
552
917
|
|
|
553
918
|
/**
|
|
554
|
-
*
|
|
555
|
-
*
|
|
919
|
+
* Decide whether heard speech should barge in on the agent's TTS. Tuned to
|
|
920
|
+
* avoid FALSE interrupts (which cut the reply mid-sentence and read as
|
|
921
|
+
* "intermittent audio"): a one-word ASR blip, background noise, or the
|
|
922
|
+
* agent's own voice bleeding back into the mic must NOT interrupt — only a
|
|
923
|
+
* genuine couple-of-words utterance from the user does.
|
|
556
924
|
*/
|
|
557
925
|
private fun shouldInterrupt(transcript: String): Boolean {
|
|
558
926
|
val trimmed = transcript.trim()
|
|
559
|
-
|
|
560
|
-
val
|
|
561
|
-
|
|
927
|
+
val lower = trimmed.lowercase()
|
|
928
|
+
val words = lower.split(Regex("\\s+")).filter { it.isNotBlank() }
|
|
929
|
+
// Need real intent: at least two words, or one long word (≥ 8 chars).
|
|
930
|
+
if (words.size < 2 && trimmed.length < 8) return false
|
|
931
|
+
val spoken = lastSpokenText?.lowercase() ?: return true
|
|
932
|
+
// Exact echo of what we're saying → speaker bleed, not the user.
|
|
933
|
+
if (spoken.contains(lower)) return false
|
|
934
|
+
// Fuzzy echo: if most of the heard words appear in the text we're
|
|
935
|
+
// currently speaking, treat it as echo (ASR mishears of our own audio).
|
|
936
|
+
val echoed = words.count { spoken.contains(it) }
|
|
937
|
+
if (words.isNotEmpty() && echoed.toDouble() / words.size >= 0.6) {
|
|
938
|
+
return false
|
|
939
|
+
}
|
|
562
940
|
return true
|
|
563
941
|
}
|
|
564
942
|
|
|
@@ -588,6 +966,7 @@ class TalkModePlugin : Plugin() {
|
|
|
588
966
|
private suspend fun speakInternal(
|
|
589
967
|
text: String,
|
|
590
968
|
forceSystemTts: Boolean,
|
|
969
|
+
useLocalInferenceTts: Boolean,
|
|
591
970
|
directive: JSObject?,
|
|
592
971
|
call: PluginCall
|
|
593
972
|
) {
|
|
@@ -596,6 +975,7 @@ class TalkModePlugin : Plugin() {
|
|
|
596
975
|
lastSpokenText = text
|
|
597
976
|
speakStartTimeMs = SystemClock.elapsedRealtime()
|
|
598
977
|
pcmStopRequested.set(false)
|
|
978
|
+
lastInterruptedAtSeconds = null
|
|
599
979
|
setState("speaking", "Speaking")
|
|
600
980
|
|
|
601
981
|
val effectiveVoiceId = directive.stringOrNull("voiceId")?.let(::resolveVoiceAlias) ?: voiceId
|
|
@@ -603,27 +983,74 @@ class TalkModePlugin : Plugin() {
|
|
|
603
983
|
|
|
604
984
|
notifyListeners("speaking", JSObject().apply {
|
|
605
985
|
put("text", text)
|
|
606
|
-
put(
|
|
986
|
+
put(
|
|
987
|
+
"isSystemTts",
|
|
988
|
+
!useLocalInferenceTts &&
|
|
989
|
+
(forceSystemTts || effectiveApiKey.isNullOrEmpty() || effectiveVoiceId.isNullOrEmpty())
|
|
990
|
+
)
|
|
607
991
|
})
|
|
608
992
|
|
|
609
993
|
// Stop listening during speech (we keep recognizer for interrupt detection)
|
|
610
994
|
mainHandler.post { recognizer?.stopListening() }
|
|
611
995
|
ensureInterruptListener()
|
|
612
996
|
|
|
613
|
-
//
|
|
614
|
-
|
|
997
|
+
// Ensure the communication-mode session + audio focus are active even
|
|
998
|
+
// for a standalone speak() that wasn't preceded by start().
|
|
999
|
+
configureVoiceAudioSession()
|
|
1000
|
+
// Re-assert loudspeaker routing right before playback. configureVoice…
|
|
1001
|
+
// only routes on the FIRST activation; if the session was already up (the
|
|
1002
|
+
// STT path opened it) the speaker route may have drifted, leaving TTS on
|
|
1003
|
+
// the earpiece. Re-route here so replies are audible out the speaker.
|
|
1004
|
+
audioManager?.let { routeVoiceOutput(it) }
|
|
615
1005
|
|
|
616
1006
|
try {
|
|
617
|
-
val
|
|
1007
|
+
val canUseLocalInference = useLocalInferenceTts && !forceSystemTts
|
|
1008
|
+
val canUseElevenLabs = !canUseLocalInference &&
|
|
1009
|
+
!forceSystemTts &&
|
|
618
1010
|
!effectiveApiKey.isNullOrEmpty() &&
|
|
619
1011
|
!effectiveVoiceId.isNullOrEmpty()
|
|
620
1012
|
|
|
621
|
-
if (
|
|
1013
|
+
if (canUseLocalInference) {
|
|
1014
|
+
try {
|
|
1015
|
+
streamAndPlayLocalInferenceTts(text, directive)
|
|
1016
|
+
|
|
1017
|
+
if (!pcmStopRequested.get()) {
|
|
1018
|
+
call.resolve(JSObject().apply {
|
|
1019
|
+
put("completed", true)
|
|
1020
|
+
put("interrupted", false)
|
|
1021
|
+
put("usedSystemTts", false)
|
|
1022
|
+
})
|
|
1023
|
+
} else {
|
|
1024
|
+
call.resolve(JSObject().apply {
|
|
1025
|
+
put("completed", false)
|
|
1026
|
+
put("interrupted", true)
|
|
1027
|
+
put("usedSystemTts", false)
|
|
1028
|
+
lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
|
|
1029
|
+
})
|
|
1030
|
+
}
|
|
1031
|
+
} catch (e: Exception) {
|
|
1032
|
+
if (pcmStopRequested.get()) {
|
|
1033
|
+
call.resolve(JSObject().apply {
|
|
1034
|
+
put("completed", false)
|
|
1035
|
+
put("interrupted", true)
|
|
1036
|
+
put("usedSystemTts", false)
|
|
1037
|
+
})
|
|
1038
|
+
} else {
|
|
1039
|
+
// The on-device OmniVoice TTS assets aren't always staged
|
|
1040
|
+
// (it 502s "TEXT_TO_SPEECH not available"). Rather than go
|
|
1041
|
+
// silent — the JS browser-SpeechSynthesis fallback doesn't
|
|
1042
|
+
// exist in the Android WebView — fall back to the platform
|
|
1043
|
+
// TextToSpeech so replies are always spoken aloud.
|
|
1044
|
+
Log.w(TAG, "Local inference TTS failed, falling back to system TTS", e)
|
|
1045
|
+
speakWithSystemTts(text, call)
|
|
1046
|
+
}
|
|
1047
|
+
}
|
|
1048
|
+
} else if (canUseElevenLabs) {
|
|
622
1049
|
try {
|
|
623
1050
|
val request = buildElevenLabsRequest(text, directive)
|
|
624
1051
|
streamAndPlayPcm(
|
|
625
|
-
voiceId = effectiveVoiceId
|
|
626
|
-
apiKey = effectiveApiKey
|
|
1052
|
+
voiceId = effectiveVoiceId,
|
|
1053
|
+
apiKey = effectiveApiKey,
|
|
627
1054
|
request = request
|
|
628
1055
|
)
|
|
629
1056
|
|
|
@@ -665,13 +1092,16 @@ class TalkModePlugin : Plugin() {
|
|
|
665
1092
|
put("error", e.message ?: "Speak failed")
|
|
666
1093
|
})
|
|
667
1094
|
} finally {
|
|
1095
|
+
val wasInterrupted = pcmStopRequested.get()
|
|
1096
|
+
val interruptedAt = lastInterruptedAtSeconds
|
|
668
1097
|
isSpeaking = false
|
|
669
1098
|
pcmStopRequested.set(false)
|
|
670
|
-
abandonAudioFocus()
|
|
671
1099
|
|
|
672
1100
|
notifyListeners("speakComplete", JSObject().apply {
|
|
673
|
-
put("completed", !
|
|
674
|
-
|
|
1101
|
+
put("completed", !wasInterrupted)
|
|
1102
|
+
if (wasInterrupted) {
|
|
1103
|
+
interruptedAt?.let { put("interruptedAt", it) }
|
|
1104
|
+
}
|
|
675
1105
|
})
|
|
676
1106
|
|
|
677
1107
|
if (enabled) {
|
|
@@ -679,6 +1109,8 @@ class TalkModePlugin : Plugin() {
|
|
|
679
1109
|
setState("listening", "Listening")
|
|
680
1110
|
mainHandler.post { startListeningInternal(markListening = true) }
|
|
681
1111
|
} else {
|
|
1112
|
+
// Standalone speak (no active conversation): release the session.
|
|
1113
|
+
releaseVoiceAudioSession()
|
|
682
1114
|
setState("idle", "Off")
|
|
683
1115
|
}
|
|
684
1116
|
}
|
|
@@ -753,6 +1185,273 @@ class TalkModePlugin : Plugin() {
|
|
|
753
1185
|
return if (value == null || value === JSONObject.NULL) null else value.toString()
|
|
754
1186
|
}
|
|
755
1187
|
|
|
1188
|
+
/**
 * Describes a PCM audio stream that is about to be played natively.
 *
 * NOTE(review): presumably filled in from a WAV header by the WAV reader —
 * confirm against `readWavPcmFormat`.
 *
 * @property sampleRate sample rate of the stream.
 * @property channels number of audio channels.
 * @property bitsPerSample bit depth of one sample.
 * @property dataBytes byte count of the audio payload
 *   (assumed to be the WAV data-chunk size — TODO confirm).
 */
private data class PcmStreamFormat(
    val sampleRate: Int,
    val channels: Int,
    val bitsPerSample: Int,
    val dataBytes: Int,
)
|
|
1194
|
+
|
|
1195
|
+
/**
 * Request speech for [text] from the local-inference TTS endpoint and play
 * the returned WAV through a native audio track.
 *
 * Runs on [Dispatchers.IO]. The stop flag is reset first, the open
 * connection is published in `activePcmConnection` for the duration of the
 * request, and the track and connection are always cleaned up on exit.
 *
 * Emits a "playbackStart" event (provider, sample rate, channels) once the
 * track has started playing.
 *
 * @param text the text to synthesize.
 * @param directive optional per-utterance overrides forwarded in the payload.
 * @throws IllegalStateException when the endpoint answers with HTTP >= 400;
 *   the message carries the status code and the error body.
 */
private suspend fun streamAndPlayLocalInferenceTts(
    text: String,
    directive: JSObject?
) = withContext(Dispatchers.IO) {
    pcmStopRequested.set(false)
    val connection = openLocalInferenceTtsConnection()
    activePcmConnection = connection
    try {
        val requestBody = buildLocalInferenceTtsPayload(text, directive)
        connection.outputStream.use { out ->
            out.write(requestBody.toByteArray(Charsets.UTF_8))
        }

        val status = connection.responseCode
        if (status >= 400) {
            val detail = connection.errorStream?.readBytes()?.toString(Charsets.UTF_8) ?: ""
            throw IllegalStateException("Local inference TTS error: $status $detail")
        }

        BufferedInputStream(connection.inputStream).use { wavStream ->
            val format = readWavPcmFormat(wavStream)
            val track = createPcmAudioTrack(format)
            pcmTrack = track
            track.play()

            Log.d(
                TAG,
                "Local inference PCM play start sampleRate=${format.sampleRate} channels=${format.channels}"
            )
            val startEvent = JSObject()
            startEvent.put("provider", "local-inference")
            startEvent.put("sampleRate", format.sampleRate)
            startEvent.put("channels", format.channels)
            notifyListeners("playbackStart", startEvent)

            val framesWritten = writePcmStreamToTrack(wavStream, track, format)
            drainPcmTrack(track, framesWritten, format.sampleRate)
            // Only stop the track ourselves when playback was not interrupted.
            val interrupted = pcmStopRequested.get()
            if (!interrupted) {
                track.stop()
            }
            Log.d(TAG, "Local inference PCM play done frames=$framesWritten")
        }
    } finally {
        cleanupPcmTrack()
        // Clear the shared reference only if it still points at our connection.
        if (activePcmConnection === connection) {
            activePcmConnection = null
        }
        connection.disconnect()
    }
}
|
|
1249
|
+
|
|
1250
|
+
/**
 * Creates and configures the POST connection to [LOCAL_INFERENCE_TTS_URL].
 *
 * The bearer token is read from `auth/local-agent-token` under the app's files
 * directory and trimmed. The request is set up for a JSON body and a WAV response.
 *
 * @return the configured connection with output enabled.
 * @throws IllegalStateException if the token file is missing or its content is blank.
 */
private fun openLocalInferenceTtsConnection(): HttpURLConnection {
    val bearer = File(context.filesDir, "auth/local-agent-token")
        .takeIf { candidate -> candidate.isFile }
        ?.readText()
        ?.trim()
        .orEmpty()
    check(bearer.isNotEmpty()) { "Local agent auth token is missing" }

    val connection = URL(LOCAL_INFERENCE_TTS_URL).openConnection() as HttpURLConnection
    return connection.apply {
        requestMethod = "POST"
        connectTimeout = 30_000
        readTimeout = 180_000
        setRequestProperty("Authorization", "Bearer $bearer")
        setRequestProperty("Content-Type", "application/json")
        setRequestProperty("Accept", "audio/wav")
        doOutput = true
    }
}
|
|
1267
|
+
|
|
1268
|
+
/**
 * Serializes the JSON request body for the local inference TTS endpoint.
 *
 * `text` is always present. `voiceId`, `voice`, `modelId` and `model` are copied
 * from [directive] when `stringOrNull` yields a value for them. `speed` is
 * copied only when it is a finite number greater than zero.
 *
 * @param text the text to synthesize.
 * @param directive optional per-utterance overrides; may be null.
 * @return the JSON document as a string.
 */
private fun buildLocalInferenceTtsPayload(text: String, directive: JSObject?): String {
    val body = JSONObject()
    body.put("text", text)

    for (key in listOf("voiceId", "voice", "modelId", "model")) {
        directive.stringOrNull(key)?.let { value -> body.put(key, value) }
    }

    // NaN is the "absent" sentinel, so a missing or non-numeric speed is filtered out below.
    directive?.optDouble("speed", Double.NaN)
        ?.takeIf { it.isFinite() && it > 0.0 }
        ?.let { rate -> body.put("speed", rate) }

    return body.toString()
}
|
|
1281
|
+
|
|
1282
|
+
/**
 * Reads exactly [size] bytes from [input], repeating the read until the buffer is full.
 *
 * @return a new array of length [size].
 * @throws IllegalStateException if the stream ends before [size] bytes were read.
 */
private fun readExactly(input: BufferedInputStream, size: Int): ByteArray {
    val result = ByteArray(size)
    var filled = 0
    while (filled < size) {
        val count = input.read(result, filled, size - filled)
        if (count < 0) error("Unexpected end of WAV stream")
        filled += count
    }
    return result
}
|
|
1294
|
+
|
|
1295
|
+
/**
 * Discards exactly [count] bytes from [input]; does nothing when [count] is zero or negative.
 *
 * @throws IllegalStateException if the stream ends before [count] bytes were discarded.
 */
private fun skipFully(input: BufferedInputStream, count: Int) {
    var left = count
    while (left > 0) {
        val jumped = input.skip(left.toLong()).toInt()
        left -= if (jumped > 0) {
            jumped
        } else {
            // skip() made no progress; a single-byte read tells "nothing skipped" apart from end of stream.
            if (input.read() < 0) error("Unexpected end of WAV stream")
            1
        }
    }
}
|
|
1309
|
+
|
|
1310
|
+
/**
 * Decodes the two bytes at [offset] in [bytes] as an unsigned little-endian
 * 16-bit value.
 *
 * @return a value in `0..0xFFFF`.
 */
private fun littleEndianShort(bytes: ByteArray, offset: Int): Int {
    val low = bytes[offset].toInt() and 0xff
    val high = bytes[offset + 1].toInt() and 0xff
    return low or (high shl 8)
}
|
|
1314
|
+
|
|
1315
|
+
/**
 * Decodes the four bytes at [offset] in [bytes] as a little-endian 32-bit value.
 *
 * The result is a signed [Int], so a value with the top bit set comes back negative.
 */
private fun littleEndianInt(bytes: ByteArray, offset: Int): Int =
    (0..3).fold(0) { acc, index ->
        acc or ((bytes[offset + index].toInt() and 0xff) shl (8 * index))
    }
|
|
1321
|
+
|
|
1322
|
+
/** Returns the first four bytes of [bytes] decoded as an ASCII chunk tag. */
private fun chunkId(bytes: ByteArray): String {
    val tagLength = 4
    return String(bytes, 0, tagLength, Charsets.US_ASCII)
}
|
|
1325
|
+
|
|
1326
|
+
/**
 * Parses a RIFF/WAVE header from [input] and returns the stream's PCM format.
 *
 * Chunks are walked in order: `fmt ` is decoded, `data` ends the parse, and any
 * other chunk is skipped (including the pad byte after an odd-sized chunk).
 * On return the stream is positioned at the first byte of the `data` payload
 * and the result's `dataBytes` holds that chunk's declared size.
 *
 * Only integer PCM (format tag 1), 16 bits per sample, with one or two channels
 * is accepted.
 *
 * @throws IllegalStateException if the stream is not RIFF/WAVE, a chunk size is
 *   negative, the `fmt ` chunk is too short or describes an unsupported format,
 *   `data` appears before `fmt `, or the stream ends early.
 */
private fun readWavPcmFormat(input: BufferedInputStream): PcmStreamFormat {
    val riff = readExactly(input, 12)
    if (
        String(riff, 0, 4, Charsets.US_ASCII) != "RIFF" ||
        String(riff, 8, 4, Charsets.US_ASCII) != "WAVE"
    ) {
        throw IllegalStateException("Local inference TTS returned non-WAV audio")
    }

    var format: PcmStreamFormat? = null
    while (true) {
        val header = readExactly(input, 8)
        val id = chunkId(header)
        val size = littleEndianInt(header, 4)
        if (size < 0) {
            throw IllegalStateException("Invalid WAV chunk size for $id")
        }

        when (id) {
            "fmt " -> {
                // Check the declared size before reading: allocating `size` bytes up front
                // would let a corrupt header request a buffer of up to 2 GiB.
                if (size < 16) {
                    throw IllegalStateException("Invalid WAV fmt chunk")
                }
                // Only the 16 fixed fields are used.
                val fmt = readExactly(input, 16)
                val audioFormat = littleEndianShort(fmt, 0)
                val channels = littleEndianShort(fmt, 2)
                val sampleRate = littleEndianInt(fmt, 4)
                val bitsPerSample = littleEndianShort(fmt, 14)
                if (audioFormat != 1) {
                    throw IllegalStateException("Only PCM WAV is supported, got format=$audioFormat")
                }
                if (bitsPerSample != 16) {
                    throw IllegalStateException("Only 16-bit PCM WAV is supported, got bits=$bitsPerSample")
                }
                if (channels !in 1..2 || sampleRate <= 0) {
                    throw IllegalStateException("Invalid WAV format sampleRate=$sampleRate channels=$channels")
                }
                // dataBytes is not known yet; it is filled in when the data chunk is reached.
                format = PcmStreamFormat(sampleRate, channels, bitsPerSample, 0)
                // Discard any extension bytes past the fixed fields, then the pad byte if the size is odd.
                skipFully(input, size - 16)
                if (size % 2 == 1) skipFully(input, 1)
            }
            "data" -> {
                val parsed = format ?: throw IllegalStateException("WAV data arrived before fmt chunk")
                return parsed.copy(dataBytes = size)
            }
            else -> {
                // Unrecognized chunk: skip its payload and the pad byte if the size is odd.
                skipFully(input, size)
                if (size % 2 == 1) skipFully(input, 1)
            }
        }
    }
}
|
|
1376
|
+
|
|
1377
|
+
/**
 * Builds a streaming-mode 16-bit PCM [AudioTrack] for [format].
 *
 * The buffer is twice the platform minimum for the format, and never less than 8 KiB.
 *
 * @throws IllegalStateException if the channel count is not 1 or 2, the platform
 *   reports a non-positive minimum buffer size, or the track fails to initialize.
 */
private fun createPcmAudioTrack(format: PcmStreamFormat): AudioTrack {
    val outputMask = when (format.channels) {
        1 -> AudioFormat.CHANNEL_OUT_MONO
        2 -> AudioFormat.CHANNEL_OUT_STEREO
        else -> error("Unsupported PCM channel count ${format.channels}")
    }

    val platformMinimum = AudioTrack.getMinBufferSize(
        format.sampleRate,
        outputMask,
        AudioFormat.ENCODING_PCM_16BIT
    )
    check(platformMinimum > 0) { "AudioTrack buffer size invalid: $platformMinimum" }
    val trackBufferBytes = (platformMinimum * 2).coerceAtLeast(8 * 1024)

    val pcmAudioFormat = AudioFormat.Builder()
        .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
        .setSampleRate(format.sampleRate)
        .setChannelMask(outputMask)
        .build()

    val created = AudioTrack.Builder()
        .setAudioAttributes(voiceAudioAttributes())
        .setAudioFormat(pcmAudioFormat)
        .setBufferSizeInBytes(trackBufferBytes)
        .setTransferMode(AudioTrack.MODE_STREAM)
        .build()

    if (created.state == AudioTrack.STATE_INITIALIZED) return created

    // Release the half-built track before reporting the failure.
    created.release()
    error("AudioTrack init failed")
}
|
|
1411
|
+
|
|
1412
|
+
/**
 * Copies up to `format.dataBytes` bytes of PCM from [input] into [track].
 *
 * Copying stops when the declared data size has been consumed, when the stream
 * ends, or when `pcmStopRequested` is set.
 *
 * @return the number of whole frames handed to [track].
 * @throws IllegalStateException if [track] rejects a write while no stop is requested.
 */
private fun writePcmStreamToTrack(
    input: BufferedInputStream,
    track: AudioTrack,
    format: PcmStreamFormat
): Long {
    val bytesPerFrame = format.channels * (format.bitsPerSample / 8)
    var bytesWrittenTotal = 0L
    var remainingBytes = format.dataBytes
    val buffer = ByteArray(8 * 1024)
    while (remainingBytes > 0) {
        if (pcmStopRequested.get()) break
        val requestBytes = minOf(remainingBytes, buffer.size)
        val bytesRead = input.read(buffer, 0, requestBytes)
        // The stream ended before the declared data size; play what arrived.
        if (bytesRead <= 0) break
        remainingBytes -= bytesRead

        // write() may accept fewer bytes than offered, so keep offering the rest.
        var offset = 0
        while (offset < bytesRead) {
            if (pcmStopRequested.get()) break
            val wrote = track.write(buffer, offset, bytesRead - offset)
            if (wrote <= 0) {
                // stopSpeakingInternal() sets the stop flag before calling cleanupPcmTrack(),
                // which presumably tears the track down. A rejected write with the flag set
                // is therefore treated as a requested stop, not a playback failure.
                if (pcmStopRequested.get()) break
                throw IllegalStateException("AudioTrack write failed: $wrote")
            }
            offset += wrote
            bytesWrittenTotal += wrote.toLong()
        }
    }
    return if (bytesPerFrame > 0) bytesWrittenTotal / bytesPerFrame else 0L
}
|
|
1441
|
+
|
|
1442
|
+
/**
 * Blocks until [track] has played [framesWritten] frames, a stop is requested,
 * or a time budget runs out.
 *
 * The budget is the audio's duration at [sampleRate], capped at 30 seconds,
 * plus one second. The playback position is polled every 20 ms. Returns at once
 * when [framesWritten] or [sampleRate] is not positive.
 */
private fun drainPcmTrack(track: AudioTrack, framesWritten: Long, sampleRate: Int) {
    if (framesWritten <= 0L || sampleRate <= 0) return

    val playbackMs = framesWritten * 1000L / sampleRate
    val budgetMs = minOf(playbackMs, 30_000L) + 1_000L
    val giveUpAt = SystemClock.elapsedRealtime() + budgetMs

    while (true) {
        // The stop flag is checked first so the track is not queried after a stop was requested.
        if (pcmStopRequested.get()) return
        if (track.playbackHeadPosition.toLong() >= framesWritten) return
        if (SystemClock.elapsedRealtime() >= giveUpAt) return
        SystemClock.sleep(20)
    }
}
|
|
1454
|
+
|
|
756
1455
|
/**
|
|
757
1456
|
* Stream PCM audio from ElevenLabs and play via AudioTrack.
|
|
758
1457
|
* Ported from classic TalkModeManager with proper offset-based writes.
|
|
@@ -776,12 +1475,7 @@ class TalkModePlugin : Plugin() {
|
|
|
776
1475
|
|
|
777
1476
|
val bufferSize = max(minBuffer * 2, 8 * 1024)
|
|
778
1477
|
val track = AudioTrack.Builder()
|
|
779
|
-
.setAudioAttributes(
|
|
780
|
-
AudioAttributes.Builder()
|
|
781
|
-
.setUsage(AudioAttributes.USAGE_ASSISTANT)
|
|
782
|
-
.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
|
|
783
|
-
.build()
|
|
784
|
-
)
|
|
1478
|
+
.setAudioAttributes(voiceAudioAttributes())
|
|
785
1479
|
.setAudioFormat(
|
|
786
1480
|
AudioFormat.Builder()
|
|
787
1481
|
.setEncoding(AudioFormat.ENCODING_PCM_16BIT)
|
|
@@ -802,6 +1496,7 @@ class TalkModePlugin : Plugin() {
|
|
|
802
1496
|
|
|
803
1497
|
Log.d(TAG, "PCM play start sampleRate=$sampleRate bufferSize=$bufferSize")
|
|
804
1498
|
val conn = openTtsConnection(voiceId, apiKey, request)
|
|
1499
|
+
activePcmConnection = conn
|
|
805
1500
|
try {
|
|
806
1501
|
val payload = buildRequestPayload(request)
|
|
807
1502
|
conn.outputStream.use { it.write(payload.toByteArray()) }
|
|
@@ -845,6 +1540,9 @@ class TalkModePlugin : Plugin() {
|
|
|
845
1540
|
Log.d(TAG, "PCM play done")
|
|
846
1541
|
} finally {
|
|
847
1542
|
cleanupPcmTrack()
|
|
1543
|
+
if (activePcmConnection === conn) {
|
|
1544
|
+
activePcmConnection = null
|
|
1545
|
+
}
|
|
848
1546
|
conn.disconnect()
|
|
849
1547
|
}
|
|
850
1548
|
}
|
|
@@ -970,43 +1668,125 @@ class TalkModePlugin : Plugin() {
|
|
|
970
1668
|
}
|
|
971
1669
|
}
|
|
972
1670
|
|
|
973
|
-
// ──
|
|
1671
|
+
// ── Voice audio session ─────────────────────────────────────────────
|
|
1672
|
+
//
|
|
1673
|
+
// The Android analog of the iOS `.playAndRecord` / `.voiceChat` /
|
|
1674
|
+
// `.defaultToSpeaker` session. Putting the device in MODE_IN_COMMUNICATION
|
|
1675
|
+
// for the whole conversation routes capture + playback through the
|
|
1676
|
+
// telephony path, which engages the platform hardware AEC so TTS coming out
|
|
1677
|
+
// the speaker is cancelled from the mic (the core fix for the mic+speaker
|
|
1678
|
+
// echo loop in hands-free mode). We also hold voice-communication audio
|
|
1679
|
+
// focus and route to the loudspeaker (unless a headset is connected) so
|
|
1680
|
+
// hands-free playback is audible.
|
|
1681
|
+
|
|
1682
|
+
/** Audio attributes for speech played with the voice-communication usage. */
private fun voiceAudioAttributes(): AudioAttributes {
    val builder = AudioAttributes.Builder()
    builder.setUsage(AudioAttributes.USAGE_VOICE_COMMUNICATION)
    builder.setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
    return builder.build()
}
|
|
974
1687
|
|
|
975
|
-
private fun
|
|
1688
|
+
/**
 * Puts the device into the voice session. Does nothing if the session is
 * already active or no audio manager is available.
 *
 * Steps, in order: remember the current audio mode and speakerphone state,
 * request transient-exclusive audio focus, switch to `MODE_IN_COMMUNICATION`,
 * route the output, mute the earcon streams, and mark the session active.
 */
private fun configureVoiceAudioSession() {
    if (audioSessionActive) return
    val am = audioManager ?: return

    // Captured so releaseVoiceAudioSession() can restore them.
    savedAudioMode = am.mode
    @Suppress("DEPRECATION")
    savedSpeakerphoneOn = am.isSpeakerphoneOn

    val request = AudioFocusRequest.Builder(AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_EXCLUSIVE)
        .setAudioAttributes(voiceAudioAttributes())
        .setOnAudioFocusChangeListener { focusChange ->
            if (
                focusChange == AudioManager.AUDIOFOCUS_LOSS ||
                focusChange == AudioManager.AUDIOFOCUS_LOSS_TRANSIENT
            ) {
                // Another app took audio; stop speaking if we are.
                if (isSpeaking) stopSpeakingInternal()
            }
        }
        .build()
    audioFocusRequest = request
    val focusResult = am.requestAudioFocus(request)
    if (focusResult != AudioManager.AUDIOFOCUS_REQUEST_GRANTED) {
        // A denied request is logged rather than ignored; the session still proceeds as before.
        Log.w(TAG, "Voice audio focus not granted (result=$focusResult)")
    }

    am.mode = AudioManager.MODE_IN_COMMUNICATION
    routeVoiceOutput(am)
    muteEarconStreams(am)
    audioSessionActive = true
    Log.d(TAG, "Voice audio session active (communication mode)")
}
|
|
1717
|
+
|
|
1718
|
+
/**
 * Mutes each stream in [earconStreams] for the session.
 *
 * Idempotent: does nothing when the streams are already flagged as muted.
 * A failure on one stream is ignored and the flag is still set.
 */
private fun muteEarconStreams(am: AudioManager) {
    if (earconStreamsMuted) return
    for (earcon in earconStreams) {
        // Some OEMs disallow muting certain streams without DND access.
        runCatching { am.adjustStreamVolume(earcon, AudioManager.ADJUST_MUTE, 0) }
    }
    earconStreamsMuted = true
}
|
|
989
1730
|
|
|
1731
|
+
/**
 * Unmutes each stream in [earconStreams].
 *
 * Does nothing unless the streams are flagged as muted. A failure on one
 * stream is ignored and the flag is still cleared.
 */
private fun unmuteEarconStreams(am: AudioManager) {
    if (!earconStreamsMuted) return
    for (earcon in earconStreams) {
        runCatching { am.adjustStreamVolume(earcon, AudioManager.ADJUST_UNMUTE, 0) }
    }
    earconStreamsMuted = false
}
|
|
1740
|
+
|
|
1741
|
+
/**
 * Chooses where voice playback goes: the loudspeaker by default, unless a
 * wired, USB or Bluetooth headset is among the output devices.
 *
 * With a headset present, any communication-device override is cleared
 * (Android S and later) and the speakerphone is turned off. Without one, the
 * built-in speaker is selected as the communication device on Android S and
 * later; if that is unavailable or refused, the speakerphone flag is set.
 */
private fun routeVoiceOutput(am: AudioManager) {
    val headsetPresent = am.getDevices(AudioManager.GET_DEVICES_OUTPUTS).any { output ->
        when (output.type) {
            AudioDeviceInfo.TYPE_WIRED_HEADSET,
            AudioDeviceInfo.TYPE_WIRED_HEADPHONES,
            AudioDeviceInfo.TYPE_USB_HEADSET,
            AudioDeviceInfo.TYPE_BLUETOOTH_SCO,
            AudioDeviceInfo.TYPE_BLUETOOTH_A2DP -> true
            else -> false
        }
    }

    if (headsetPresent) {
        if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) am.clearCommunicationDevice()
        @Suppress("DEPRECATION")
        am.isSpeakerphoneOn = false
        return
    }

    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) {
        val builtInSpeaker = am.availableCommunicationDevices.firstOrNull { candidate ->
            candidate.type == AudioDeviceInfo.TYPE_BUILTIN_SPEAKER
        }
        if (builtInSpeaker != null && am.setCommunicationDevice(builtInSpeaker)) return
    }

    // Reached on older Android versions, or when the built-in speaker could not be selected above.
    @Suppress("DEPRECATION")
    am.isSpeakerphoneOn = true
}
|
|
997
1768
|
|
|
998
|
-
private fun
|
|
1769
|
+
/**
 * Ends the voice session. Does nothing if the session is not active or no
 * audio manager is available.
 *
 * Steps, in order: unmute the earcon streams, abandon audio focus, clear the
 * communication-device override (Android S and later), restore the saved
 * speakerphone state and audio mode, and mark the session inactive.
 */
private fun releaseVoiceAudioSession() {
    if (!audioSessionActive) return
    val am = audioManager ?: return

    unmuteEarconStreams(am)

    val heldFocus = audioFocusRequest
    if (heldFocus != null) {
        am.abandonAudioFocusRequest(heldFocus)
    }
    audioFocusRequest = null

    if (Build.VERSION.SDK_INT >= Build.VERSION_CODES.S) am.clearCommunicationDevice()
    // Put back the values remembered in savedSpeakerphoneOn and savedAudioMode.
    @Suppress("DEPRECATION")
    am.isSpeakerphoneOn = savedSpeakerphoneOn
    am.mode = savedAudioMode

    audioSessionActive = false
    Log.d(TAG, "Voice audio session released")
}
|
|
1005
1782
|
|
|
1006
1783
|
// ── Cleanup helpers ─────────────────────────────────────────────────
|
|
1007
1784
|
|
|
1008
1785
|
private fun stopSpeakingInternal() {
|
|
1009
1786
|
pcmStopRequested.set(true)
|
|
1787
|
+
val conn = activePcmConnection
|
|
1788
|
+
activePcmConnection = null
|
|
1789
|
+
conn?.disconnect()
|
|
1010
1790
|
cleanupPcmTrack()
|
|
1011
1791
|
systemTts?.stop()
|
|
1012
1792
|
systemTtsPending?.cancel()
|
|
@@ -1162,6 +1942,9 @@ class TalkModePlugin : Plugin() {
|
|
|
1162
1942
|
}
|
|
1163
1943
|
|
|
1164
1944
|
/**
 * Reports whether [permission] is granted.
 *
 * `RECORD_AUDIO` is checked directly with `checkSelfPermission` on the context.
 * Every other permission is checked through the plugin's `getPermissionState`.
 */
private fun isPermissionGranted(permission: String): Boolean =
    if (permission == Manifest.permission.RECORD_AUDIO) {
        context.checkSelfPermission(permission) == PackageManager.PERMISSION_GRANTED
    } else {
        getPermissionState(permission) == com.getcapacitor.PermissionState.GRANTED
    }
|
|
1167
1950
|
|
|
@@ -1176,10 +1959,13 @@ class TalkModePlugin : Plugin() {
|
|
|
1176
1959
|
systemTts?.shutdown()
|
|
1177
1960
|
systemTts = null
|
|
1178
1961
|
cleanupPcmTrack()
|
|
1962
|
+
audioFrameRunning.set(false)
|
|
1963
|
+
audioFrameJob?.cancel()
|
|
1964
|
+
releaseAudioRecord()
|
|
1179
1965
|
silenceJob?.cancel()
|
|
1180
1966
|
restartJob?.cancel()
|
|
1181
1967
|
speakingJob?.cancel()
|
|
1182
|
-
|
|
1968
|
+
releaseVoiceAudioSession()
|
|
1183
1969
|
scope.cancel()
|
|
1184
1970
|
}
|
|
1185
1971
|
|