@elizaos/capacitor-talkmode 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ElizaosCapacitorTalkmode.podspec +18 -0
- package/android/build.gradle +46 -0
- package/android/src/main/AndroidManifest.xml +7 -0
- package/android/src/main/java/ai/eliza/plugins/talkmode/TalkModePlugin.kt +1202 -0
- package/dist/esm/definitions.d.ts +277 -0
- package/dist/esm/definitions.d.ts.map +1 -0
- package/dist/esm/definitions.js +1 -0
- package/dist/esm/index.d.ts +4 -0
- package/dist/esm/index.d.ts.map +1 -0
- package/dist/esm/index.js +6 -0
- package/dist/esm/web.d.ts +46 -0
- package/dist/esm/web.d.ts.map +1 -0
- package/dist/esm/web.js +201 -0
- package/dist/plugin.cjs.js +214 -0
- package/dist/plugin.cjs.js.map +1 -0
- package/dist/plugin.js +217 -0
- package/dist/plugin.js.map +1 -0
- package/ios/Sources/TalkModePlugin/TalkModePlugin.swift +1121 -0
- package/package.json +83 -0
|
@@ -0,0 +1,1202 @@
|
|
|
1
|
+
package ai.eliza.plugins.talkmode
|
|
2
|
+
|
|
3
|
+
import android.Manifest
|
|
4
|
+
import android.content.Intent
|
|
5
|
+
import android.media.AudioAttributes
|
|
6
|
+
import android.media.AudioFormat
|
|
7
|
+
import android.media.AudioManager
|
|
8
|
+
import android.media.AudioTrack
|
|
9
|
+
import android.os.Bundle
|
|
10
|
+
import android.os.Handler
|
|
11
|
+
import android.os.Looper
|
|
12
|
+
import android.os.SystemClock
|
|
13
|
+
import android.speech.RecognitionListener
|
|
14
|
+
import android.speech.RecognizerIntent
|
|
15
|
+
import android.speech.SpeechRecognizer
|
|
16
|
+
import android.speech.tts.TextToSpeech
|
|
17
|
+
import android.speech.tts.UtteranceProgressListener
|
|
18
|
+
import android.util.Log
|
|
19
|
+
import com.getcapacitor.JSObject
|
|
20
|
+
import com.getcapacitor.PermissionState
|
|
21
|
+
import com.getcapacitor.Plugin
|
|
22
|
+
import com.getcapacitor.PluginCall
|
|
23
|
+
import com.getcapacitor.PluginMethod
|
|
24
|
+
import com.getcapacitor.annotation.CapacitorPlugin
|
|
25
|
+
import com.getcapacitor.annotation.Permission
|
|
26
|
+
import com.getcapacitor.annotation.PermissionCallback
|
|
27
|
+
import kotlinx.coroutines.*
|
|
28
|
+
import java.io.BufferedInputStream
|
|
29
|
+
import java.net.HttpURLConnection
|
|
30
|
+
import java.net.URL
|
|
31
|
+
import java.util.Locale
|
|
32
|
+
import java.util.UUID
|
|
33
|
+
import java.util.concurrent.atomic.AtomicBoolean
|
|
34
|
+
import kotlin.math.max
|
|
35
|
+
import org.json.JSONObject
|
|
36
|
+
|
|
37
|
+
@CapacitorPlugin(
|
|
38
|
+
name = "TalkMode",
|
|
39
|
+
permissions = [
|
|
40
|
+
Permission(alias = "microphone", strings = [Manifest.permission.RECORD_AUDIO])
|
|
41
|
+
]
|
|
42
|
+
)
|
|
43
|
+
class TalkModePlugin : Plugin() {
|
|
44
|
+
companion object {
    /** Tag passed to every `Log` call made by this plugin. */
    private const val TAG = "TalkMode"

    /** Model id used for ElevenLabs requests when neither the directive nor the config names one. */
    private const val DEFAULT_MODEL_ID = "eleven_flash_v2_5"

    /** Output format used when the requested/configured format does not validate. */
    private const val DEFAULT_OUTPUT_FORMAT = "pcm_24000"
}
|
|
49
|
+
|
|
50
|
+
/** Handler bound to the main looper; recognizer calls in this class are posted through it. */
private val mainHandler: Handler = Handler(Looper.getMainLooper())

/** Scope for the plugin's coroutines (restart, silence monitor, speaking job). */
private val scope: CoroutineScope = CoroutineScope(Dispatchers.Main + SupervisorJob())

// State
private var enabled: Boolean = false
private var state: String = "idle"
private var statusText: String = "Off"

// Speech recognition
private var recognizer: SpeechRecognizer? = null
private var isListening: Boolean = false
private var listeningMode: Boolean = false
private var stopRequested: Boolean = false
private var restartJob: Job? = null

/** Most recent transcript that has not yet been finalized by the silence monitor. */
private var lastTranscript: String = ""

/** `SystemClock.elapsedRealtime()` value recorded when [lastTranscript] was stored. */
private var lastHeardAtMs: Long? = null
private var silenceJob: Job? = null

/** Quiet time after the last transcript before `checkSilence` emits it as final. */
private val silenceWindowMs: Long = 700L

// TTS
private var systemTts: TextToSpeech? = null
private var systemTtsReady: Boolean = false

/** Utterance id the system TTS progress listener is waiting for, if any. */
private var systemTtsPendingId: String? = null

/** Completed (or failed) by the progress listener when [systemTtsPendingId] finishes. */
private var systemTtsPending: CompletableDeferred<Unit>? = null
private var pcmTrack: AudioTrack? = null

/** Polled by the PCM streaming loop; when set, streaming returns early. */
private val pcmStopRequested: AtomicBoolean = AtomicBoolean(false)
private var speakingJob: Job? = null
private var isSpeaking: Boolean = false
private var usedSystemTts: Boolean = false

/** Text of the utterance being spoken; used to filter speaker echo out of interrupts. */
private var lastSpokenText: String? = null
private var speakStartTimeMs: Long = 0
private var lastInterruptedAtSeconds: Double? = null

// Audio focus
private var audioManager: AudioManager? = null
private var audioFocusRequest: AudioManager.OnAudioFocusChangeListener? = null

// Config
private var apiKey: String? = null
private var voiceId: String? = null
private var modelId: String? = DEFAULT_MODEL_ID
private var outputFormat: String? = DEFAULT_OUTPUT_FORMAT

/** Alias name (stored trimmed and lowercased) mapped to a voice id. */
private var voiceAliases: Map<String, String> = emptyMap()
private var interruptOnSpeech: Boolean = true
private var sessionKey: String = "main"
private var sttLanguage: String? = null
|
|
96
|
+
|
|
97
|
+
// ── Recognition listener ────────────────────────────────────────────
|
|
98
|
+
|
|
99
|
+
/**
 * Listener attached to every [SpeechRecognizer] this plugin creates.
 *
 * Results are forwarded to [handleTranscript]; end-of-speech, final results and
 * errors (other than missing permission) schedule a recognizer restart.
 */
private val recognitionListener = object : RecognitionListener {
    override fun onReadyForSpeech(params: Bundle?) {
        Log.d(TAG, "Ready for speech")
        if (!enabled || !isListening) return
        setState("listening", "Listening")
    }

    override fun onBeginningOfSpeech() {
        Log.d(TAG, "Beginning of speech")
    }

    override fun onRmsChanged(rmsdB: Float) = Unit

    override fun onBufferReceived(buffer: ByteArray?) = Unit

    override fun onEndOfSpeech() {
        Log.d(TAG, "End of speech")
        scheduleRestart()
    }

    override fun onError(error: Int) {
        // Errors raised while shutting down are expected and ignored.
        if (stopRequested) return

        val description = describeError(error)
        Log.d(TAG, "Recognition error: $description ($error)")

        when (error) {
            SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS -> {
                // Not recoverable by retrying, so no restart is scheduled.
                emitRecognitionError("Microphone permission required", recoverable = false)
                return
            }
            // No-match / speech-timeout are routine: restart without notifying.
            SpeechRecognizer.ERROR_NO_MATCH,
            SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> Unit
            else -> emitRecognitionError(description, recoverable = true)
        }

        scheduleRestart(delayMs = 600)
    }

    override fun onResults(results: Bundle?) {
        val text = bestTranscript(results)
        if (text.isNotEmpty()) {
            handleTranscript(text, isFinal = true)
        }
        scheduleRestart()
    }

    override fun onPartialResults(partialResults: Bundle?) {
        val text = bestTranscript(partialResults)
        if (text.isNotEmpty()) {
            handleTranscript(text, isFinal = false)
        }
    }

    override fun onEvent(eventType: Int, params: Bundle?) = Unit

    /** Human-readable label for a [SpeechRecognizer] error code. */
    private fun describeError(code: Int): String = when (code) {
        SpeechRecognizer.ERROR_AUDIO -> "Audio recording error"
        SpeechRecognizer.ERROR_CLIENT -> "Client error"
        SpeechRecognizer.ERROR_INSUFFICIENT_PERMISSIONS -> "Insufficient permissions"
        SpeechRecognizer.ERROR_NETWORK -> "Network error"
        SpeechRecognizer.ERROR_NETWORK_TIMEOUT -> "Network timeout"
        SpeechRecognizer.ERROR_NO_MATCH -> "No match"
        SpeechRecognizer.ERROR_RECOGNIZER_BUSY -> "Recognizer busy"
        SpeechRecognizer.ERROR_SERVER -> "Server error"
        SpeechRecognizer.ERROR_SPEECH_TIMEOUT -> "Speech timeout"
        else -> "Unknown error"
    }

    /** Notifies "error" listeners with the shared `recognition_error` code. */
    private fun emitRecognitionError(message: String, recoverable: Boolean) {
        val event = JSObject()
        event.put("code", "recognition_error")
        event.put("message", message)
        event.put("recoverable", recoverable)
        notifyListeners("error", event)
    }

    /** First recognition candidate in [bundle], trimmed; empty when absent. */
    private fun bestTranscript(bundle: Bundle?): String {
        val candidates = bundle?.getStringArrayList(SpeechRecognizer.RESULTS_RECOGNITION)
        return candidates?.firstOrNull()?.trim() ?: ""
    }
}
|
|
178
|
+
|
|
179
|
+
// ── Lifecycle ────────────────────────────────────────────────────────
|
|
180
|
+
|
|
181
|
+
/** Plugin load hook: looks up the audio service and starts system TTS initialization. */
override fun load() {
    super.load()
    val audioService = context.getSystemService(android.content.Context.AUDIO_SERVICE)
    audioManager = audioService as? AudioManager
    initSystemTts()
}
|
|
186
|
+
|
|
187
|
+
/**
 * Creates the platform [TextToSpeech] engine. Once it reports success, sets the
 * default locale and installs a progress listener that settles
 * [systemTtsPending] when the utterance identified by [systemTtsPendingId]
 * finishes or fails.
 */
private fun initSystemTts() {
    val progressListener = object : UtteranceProgressListener() {
        override fun onStart(id: String?) = Unit

        override fun onDone(id: String?) = settle(id, failureMessage = null)

        @Deprecated("Deprecated in Java")
        override fun onError(id: String?) = settle(id, failureMessage = "System TTS error")

        override fun onError(id: String?, errorCode: Int) =
            settle(id, failureMessage = "System TTS error $errorCode")

        /**
         * Completes the pending deferred (normally, or exceptionally when
         * [failureMessage] is non-null) and clears the pending slot. Callbacks
         * for any other utterance id are ignored.
         */
        private fun settle(id: String?, failureMessage: String?) {
            if (id == null || id != systemTtsPendingId) return
            if (failureMessage == null) {
                systemTtsPending?.complete(Unit)
            } else {
                systemTtsPending?.completeExceptionally(IllegalStateException(failureMessage))
            }
            systemTtsPending = null
            systemTtsPendingId = null
        }
    }

    systemTts = TextToSpeech(context) { status ->
        val ready = status == TextToSpeech.SUCCESS
        systemTtsReady = ready
        if (!ready) {
            Log.w(TAG, "System TTS init failed")
        } else {
            systemTts?.language = Locale.getDefault()
            systemTts?.setOnUtteranceProgressListener(progressListener)
            Log.d(TAG, "System TTS initialized")
        }
    }
}
|
|
230
|
+
|
|
231
|
+
// ── Plugin methods ──────────────────────────────────────────────────
|
|
232
|
+
|
|
233
|
+
/**
 * Plugin entry point for enabling talk mode.
 *
 * Resolves with `started=false` when speech recognition is unavailable,
 * requests the microphone permission when it is not granted (continuing in
 * `handleStartPermission`), and otherwise starts immediately.
 */
@PluginMethod
fun start(call: PluginCall) {
    when {
        !SpeechRecognizer.isRecognitionAvailable(context) -> {
            val unavailable = JSObject()
            unavailable.put("started", false)
            unavailable.put("error", "Speech recognition not available")
            call.resolve(unavailable)
        }

        getPermissionState("microphone") != PermissionState.GRANTED ->
            requestPermissionForAlias("microphone", call, "handleStartPermission")

        else -> startInternal(call)
    }
}
|
|
250
|
+
|
|
251
|
+
/**
 * Permission callback for [start]: starts talk mode when the microphone
 * permission is granted, otherwise resolves the call with `started=false`.
 */
@PermissionCallback
private fun handleStartPermission(call: PluginCall) {
    if (getPermissionState("microphone") != PermissionState.GRANTED) {
        val denied = JSObject()
        denied.put("started", false)
        denied.put("error", "Microphone permission denied")
        call.resolve(denied)
        return
    }
    startInternal(call)
}
|
|
262
|
+
|
|
263
|
+
/**
 * Applies any `config` object from [call], marks talk mode enabled, and then
 * (via the main handler) replaces the recognizer, starts listening and the
 * silence monitor, and resolves [call] with the outcome.
 */
private fun startInternal(call: PluginCall) {
    // Parse config
    call.getObject("config")?.let { applyConfig(it) }

    enabled = true
    stopRequested = false
    listeningMode = true
    setState("listening", "Listening")

    mainHandler.post {
        try {
            // Drop any previous recognizer before creating a fresh one.
            recognizer?.destroy()
            val fresh = SpeechRecognizer.createSpeechRecognizer(context)
            fresh.setRecognitionListener(recognitionListener)
            recognizer = fresh

            startListeningInternal(markListening = true)
            startSilenceMonitor()

            val success = JSObject()
            success.put("started", true)
            call.resolve(success)
        } catch (e: Exception) {
            Log.e(TAG, "Failed to start", e)
            val failure = JSObject()
            failure.put("started", false)
            failure.put("error", e.message ?: "Failed to start")
            call.resolve(failure)
        }
    }
}
|
|
296
|
+
|
|
297
|
+
/**
 * Plugin entry point for disabling talk mode: clears listening state, cancels
 * the restart and silence jobs, tears down the recognizer via the main handler,
 * stops any speech, and resolves [call].
 */
@PluginMethod
fun stop(call: PluginCall) {
    // Flags first, so callbacks that fire during teardown bail out early.
    enabled = false
    stopRequested = true
    listeningMode = false
    isListening = false

    restartJob?.cancel()
    restartJob = null
    silenceJob?.cancel()
    silenceJob = null

    lastTranscript = ""
    lastHeardAtMs = null

    mainHandler.post {
        recognizer?.run {
            cancel()
            destroy()
        }
        recognizer = null
    }

    stopSpeakingInternal()
    setState("idle", "Off")
    call.resolve()
}
|
|
320
|
+
|
|
321
|
+
/** Resolves [call] with `{ enabled }` reflecting the current enabled flag. */
@PluginMethod
fun isEnabled(call: PluginCall) {
    val payload = JSObject()
    payload.put("enabled", enabled)
    call.resolve(payload)
}
|
|
327
|
+
|
|
328
|
+
/** Resolves [call] with the current `state` and `statusText`. */
@PluginMethod
fun getState(call: PluginCall) {
    val snapshot = JSObject()
    snapshot.put("state", state)
    snapshot.put("statusText", statusText)
    call.resolve(snapshot)
}
|
|
335
|
+
|
|
336
|
+
/** Applies the `config` object from [call], if present, then resolves [call]. */
@PluginMethod
fun updateConfig(call: PluginCall) {
    val config = call.getObject("config")
    if (config != null) {
        applyConfig(config)
    }
    call.resolve()
}
|
|
345
|
+
|
|
346
|
+
/**
 * Plugin entry point for speaking `text`.
 *
 * Missing or blank text resolves immediately as a completed no-op. Otherwise
 * a coroutine is launched on [scope] that runs `speakInternal`, which is
 * responsible for resolving [call].
 */
@PluginMethod
fun speak(call: PluginCall) {
    val text = call.getString("text")?.trim().orEmpty()
    if (text.isEmpty()) {
        // Nothing to say: report success without touching TTS.
        val nothingSpoken = JSObject()
        nothingSpoken.put("completed", true)
        nothingSpoken.put("interrupted", false)
        nothingSpoken.put("usedSystemTts", false)
        call.resolve(nothingSpoken)
        return
    }

    val useSystemTts = call.getBoolean("useSystemTts", false) ?: false
    val directive = call.getObject("directive")

    speakingJob = scope.launch {
        speakInternal(text, useSystemTts, directive, call)
    }
}
|
|
373
|
+
|
|
374
|
+
/**
 * Stops any current speech and resolves [call]; the result carries
 * `interruptedAt` when `computeInterruptedAt` (evaluated before stopping)
 * returned a value.
 */
@PluginMethod
fun stopSpeaking(call: PluginCall) {
    val interruptedAt = computeInterruptedAt()
    stopSpeakingInternal()

    val payload = JSObject()
    if (interruptedAt != null) {
        payload.put("interruptedAt", interruptedAt)
    }
    call.resolve(payload)
}
|
|
384
|
+
|
|
385
|
+
/** Resolves [call] with `{ speaking }` reflecting the current speaking flag. */
@PluginMethod
fun isSpeaking(call: PluginCall) {
    val payload = JSObject()
    payload.put("speaking", isSpeaking)
    call.resolve(payload)
}
|
|
391
|
+
|
|
392
|
+
/** Resolves [call] with the result of `buildPermissionResult()`. */
@PluginMethod
override fun checkPermissions(call: PluginCall) {
    val permissions = buildPermissionResult()
    call.resolve(permissions)
}
|
|
396
|
+
|
|
397
|
+
/**
 * Requests the microphone permission when `RECORD_AUDIO` is not yet granted
 * (continuing in `handlePermissionResult`); otherwise resolves [call] with the
 * current permission result right away.
 */
@PluginMethod
override fun requestPermissions(call: PluginCall) {
    if (isPermissionGranted(Manifest.permission.RECORD_AUDIO)) {
        call.resolve(buildPermissionResult())
        return
    }
    requestPermissionForAlias("microphone", call, "handlePermissionResult")
}
|
|
405
|
+
|
|
406
|
+
/** Permission callback for [requestPermissions]: resolves with the updated permission result. */
@PermissionCallback
private fun handlePermissionResult(call: PluginCall) {
    val permissions = buildPermissionResult()
    call.resolve(permissions)
}
|
|
410
|
+
|
|
411
|
+
// ── Config ──────────────────────────────────────────────────────────
|
|
412
|
+
|
|
413
|
+
/**
 * Merges [config] into the plugin's settings.
 *
 * Only keys that are present with a non-empty value overwrite the current
 * setting. Reads the optional `tts` and `stt` sections plus the top-level
 * `sessionKey`; `silenceWindowMs` is acknowledged in the log but not applied.
 */
private fun applyConfig(config: JSObject) {
    /** Copies TTS credentials, voice/model/format, interrupt flag and voice aliases. */
    fun applyTts(section: JSONObject) {
        fun nonEmpty(name: String): String? =
            section.stringOrNull(name)?.takeIf { it.isNotEmpty() }

        nonEmpty("apiKey")?.let { apiKey = it }
        nonEmpty("voiceId")?.let { voiceId = it }
        nonEmpty("modelId")?.let { modelId = it }
        nonEmpty("outputFormat")?.let { requested ->
            // Keep the current format when the requested one does not validate.
            outputFormat = validatedOutputFormat(requested) ?: outputFormat
        }
        if (section.has("interruptOnSpeech")) {
            interruptOnSpeech = section.optBoolean("interruptOnSpeech", true)
        }

        val aliasJson = section.optJSONObject("voiceAliases") ?: return
        val resolved = mutableMapOf<String, String>()
        for (alias in aliasJson.keys()) {
            val target = aliasJson.stringOrNull(alias)?.trim()
            if (target.isNullOrEmpty()) continue
            // Alias names are stored trimmed and lowercased.
            resolved[alias.trim().lowercase()] = target
        }
        voiceAliases = resolved
    }

    /** Copies the speech-to-text language, after validation. */
    fun applyStt(section: JSONObject) {
        val requested = section.stringOrNull("language")?.takeIf { it.isNotEmpty() } ?: return
        sttLanguage = validatedLanguage(requested)
    }

    config.optJSONObject("tts")?.let { applyTts(it) }
    config.optJSONObject("stt")?.let { applyStt(it) }

    config.stringOrNull("sessionKey")?.takeIf { it.isNotEmpty() }?.let { sessionKey = it }

    if (config.has("silenceWindowMs")) {
        // silenceWindowMs is final for stability; log but don't change
        Log.d(TAG, "silenceWindowMs config ignored on Android (fixed at ${silenceWindowMs}ms)")
    }
}
|
|
453
|
+
|
|
454
|
+
// ── STT internals ───────────────────────────────────────────────────
|
|
455
|
+
|
|
456
|
+
/**
 * Starts a recognition session on the current recognizer.
 *
 * Does nothing when a stop was requested or no recognizer exists. A failure
 * from `startListening` is logged rather than thrown.
 *
 * @param markListening when true, also flags the plugin as listening and
 *   publishes the "listening" state; when false the state is left untouched.
 */
private fun startListeningInternal(markListening: Boolean) {
    if (stopRequested) return
    val activeRecognizer = recognizer ?: return

    val recognizeIntent = Intent(RecognizerIntent.ACTION_RECOGNIZE_SPEECH)
    recognizeIntent.putExtra(
        RecognizerIntent.EXTRA_LANGUAGE_MODEL,
        RecognizerIntent.LANGUAGE_MODEL_FREE_FORM
    )
    recognizeIntent.putExtra(RecognizerIntent.EXTRA_PARTIAL_RESULTS, true)
    recognizeIntent.putExtra(RecognizerIntent.EXTRA_MAX_RESULTS, 3)
    recognizeIntent.putExtra(RecognizerIntent.EXTRA_CALLING_PACKAGE, context.packageName)
    val language = sttLanguage
    if (language != null) {
        recognizeIntent.putExtra(RecognizerIntent.EXTRA_LANGUAGE, language)
    }

    if (markListening) {
        isListening = true
        setState("listening", "Listening")
    }

    try {
        activeRecognizer.startListening(recognizeIntent)
    } catch (e: Exception) {
        Log.e(TAG, "Failed to start listening", e)
    }
}
|
|
479
|
+
|
|
480
|
+
/**
 * Schedules a recognizer restart after [delayMs], replacing any restart that is
 * already pending. No-op once a stop has been requested.
 *
 * When the delay elapses the recognizer is cancelled and listening is started
 * again only if listening mode is on or speech is playing with
 * interrupt-on-speech enabled.
 */
private fun scheduleRestart(delayMs: Long = 350) {
    if (stopRequested) return

    val restartTask = Runnable {
        // Re-check: stop() may have been called while the delay was pending.
        if (!stopRequested) {
            try {
                recognizer?.cancel()
                val resumeListening = listeningMode
                val watchForInterrupt = isSpeaking && interruptOnSpeech
                if (resumeListening || watchForInterrupt) {
                    startListeningInternal(markListening = resumeListening)
                }
            } catch (_: Throwable) {
                // Will be picked up by onError and retry again
            }
        }
    }

    restartJob?.cancel()
    restartJob = scope.launch {
        delay(delayMs)
        mainHandler.post(restartTask)
    }
}
|
|
499
|
+
|
|
500
|
+
/**
 * (Re)starts the coroutine that calls [checkSilence] every 200 ms for as long
 * as talk mode stays enabled. Any previous monitor job is cancelled first.
 */
private fun startSilenceMonitor() {
    val pollIntervalMs = 200L
    silenceJob?.cancel()
    silenceJob = scope.launch {
        while (enabled) {
            delay(pollIntervalMs)
            checkSilence()
        }
    }
}
|
|
509
|
+
|
|
510
|
+
/**
 * Emits the pending transcript as final once nothing new has been heard for
 * [silenceWindowMs], then clears the pending transcript. Does nothing while
 * not listening or when no transcript is pending.
 */
private fun checkSilence() {
    if (!isListening) return

    val pending = lastTranscript.trim()
    if (pending.isEmpty()) return
    val heardAt = lastHeardAtMs ?: return

    val quietForMs = SystemClock.elapsedRealtime() - heardAt
    if (quietForMs < silenceWindowMs) return

    // Finalize: emit a final transcript event
    val event = JSObject()
    event.put("transcript", pending)
    event.put("isFinal", true)
    notifyListeners("transcript", event)

    lastTranscript = ""
    lastHeardAtMs = null
}
|
|
526
|
+
|
|
527
|
+
/**
 * Routes a recognizer transcript.
 *
 * While speech is playing with interrupt-on-speech enabled, the transcript is
 * only used to decide whether to interrupt playback and is never forwarded.
 * Otherwise, when listening, it is forwarded as a "transcript" event.
 *
 * @param transcript recognized text; empty input is ignored.
 * @param isFinal true for final recognizer results, false for partials.
 */
private fun handleTranscript(transcript: String, isFinal: Boolean) {
    if (transcript.isEmpty()) return

    // If speaking and interrupt enabled, check for interruption
    if (isSpeaking && interruptOnSpeech) {
        if (shouldInterrupt(transcript)) {
            // Capture the playback position before stopping resets anything.
            val interruptedAt = computeInterruptedAt()
            stopSpeakingInternal()
            lastInterruptedAtSeconds = interruptedAt
        }
        return
    }

    if (!isListening) return

    if (isFinal) {
        // The recognizer already finalized this utterance. Clear the pending
        // transcript so the silence monitor does not emit the same text as a
        // second final event once the silence window elapses.
        lastTranscript = ""
        lastHeardAtMs = null
    } else {
        // Partial result: remember it so the silence monitor can finalize it
        // if the recognizer never delivers a final result.
        lastTranscript = transcript
        lastHeardAtMs = SystemClock.elapsedRealtime()
    }

    notifyListeners("transcript", JSObject().apply {
        put("transcript", transcript)
        put("isFinal", isFinal)
    })
}
|
|
552
|
+
|
|
553
|
+
/**
 * Decides whether heard text should interrupt playback.
 *
 * Guards against false interrupts: text shorter than three characters is
 * ignored, and so is text that appears (case-insensitively) inside what is
 * currently being spoken, since that is likely echo from the speaker.
 */
private fun shouldInterrupt(transcript: String): Boolean {
    val heard = transcript.trim()
    if (heard.length < 3) return false

    // With nothing recorded as spoken there is no echo to filter out.
    val speaking = lastSpokenText?.lowercase() ?: return true
    return !speaking.contains(heard.lowercase())
}
|
|
564
|
+
|
|
565
|
+
/**
 * Ensure the recognizer is active during speech so we can detect
 * interruption from the user speaking over TTS playback.
 *
 * No-op when interrupt-on-speech is off or talk mode is disabled. The work is
 * posted to the main handler; a recognizer is created if none exists, and
 * listening starts without marking the plugin as "listening". Failures are
 * logged and otherwise ignored, because interruption support is best-effort
 * and must not break playback.
 */
private fun ensureInterruptListener() {
    if (!interruptOnSpeech || !enabled) return
    mainHandler.post {
        if (stopRequested) return@post
        if (!SpeechRecognizer.isRecognitionAvailable(context)) return@post
        try {
            if (recognizer == null) {
                recognizer = SpeechRecognizer.createSpeechRecognizer(context).apply {
                    setRecognitionListener(recognitionListener)
                }
            }
            recognizer?.cancel()
            startListeningInternal(markListening = false)
        } catch (t: Throwable) {
            // Previously swallowed silently; keep going but leave a trace.
            Log.w(TAG, "Failed to start interrupt listener", t)
        }
    }
}
|
|
585
|
+
|
|
586
|
+
// ── TTS internals ───────────────────────────────────────────────────
|
|
587
|
+
|
|
588
|
+
/**
 * Speaks [text] and resolves [call] with the outcome.
 *
 * ElevenLabs streaming is used when it is not overridden by [forceSystemTts]
 * and both an API key and a voice id are available; otherwise, or when the
 * ElevenLabs attempt fails without having been stopped, system TTS is used.
 * A "speaking" event is sent before playback and a "speakComplete" event
 * afterwards; listening is resumed afterwards when talk mode is still enabled.
 *
 * @param text text to speak (callers pass non-empty text).
 * @param forceSystemTts when true, ElevenLabs is skipped.
 * @param directive optional per-utterance overrides (e.g. `voiceId`).
 * @param call plugin call resolved with `completed` / `interrupted` /
 *   `usedSystemTts` (plus `interruptedAt` or `error` where applicable).
 */
private suspend fun speakInternal(
    text: String,
    forceSystemTts: Boolean,
    directive: JSObject?,
    call: PluginCall
) {
    isSpeaking = true
    usedSystemTts = false
    lastSpokenText = text
    speakStartTimeMs = SystemClock.elapsedRealtime()
    pcmStopRequested.set(false)
    // Forget the interruption point of any previous utterance so it cannot be
    // reported as belonging to this one.
    lastInterruptedAtSeconds = null
    setState("speaking", "Speaking")

    val effectiveVoiceId = directive.stringOrNull("voiceId")?.let(::resolveVoiceAlias) ?: voiceId
    val effectiveApiKey = apiKey
    // Non-null only when usable, so the ElevenLabs branch needs no `!!`.
    val elevenLabsVoice = effectiveVoiceId?.takeIf { it.isNotEmpty() }
    val elevenLabsKey = effectiveApiKey?.takeIf { it.isNotEmpty() }

    /** Result payload for the ElevenLabs path (never system TTS). */
    fun elevenLabsResult(interrupted: Boolean): JSObject = JSObject().apply {
        put("completed", !interrupted)
        put("interrupted", interrupted)
        put("usedSystemTts", false)
        if (interrupted) {
            lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
        }
    }

    notifyListeners("speaking", JSObject().apply {
        put("text", text)
        put("isSystemTts", forceSystemTts || elevenLabsKey == null || elevenLabsVoice == null)
    })

    // Stop listening during speech (we keep recognizer for interrupt detection)
    mainHandler.post { recognizer?.stopListening() }
    ensureInterruptListener()

    // Request audio focus
    requestAudioFocus()

    // Set when the outer handler reports an error, so the completion event
    // agrees with the call result.
    var failed = false

    try {
        if (!forceSystemTts && elevenLabsKey != null && elevenLabsVoice != null) {
            try {
                val request = buildElevenLabsRequest(text, directive)
                streamAndPlayPcm(
                    voiceId = elevenLabsVoice,
                    apiKey = elevenLabsKey,
                    request = request
                )
                call.resolve(elevenLabsResult(interrupted = pcmStopRequested.get()))
            } catch (e: Exception) {
                if (pcmStopRequested.get()) {
                    // The failure is a consequence of the stop request.
                    call.resolve(elevenLabsResult(interrupted = true))
                } else {
                    Log.w(TAG, "ElevenLabs TTS failed, falling back to system", e)
                    speakWithSystemTts(text, call)
                }
            }
        } else {
            speakWithSystemTts(text, call)
        }
    } catch (e: Exception) {
        failed = true
        Log.e(TAG, "Speak failed", e)
        call.resolve(JSObject().apply {
            put("completed", false)
            put("interrupted", false)
            put("usedSystemTts", usedSystemTts)
            put("error", e.message ?: "Speak failed")
        })
    } finally {
        // Read the stop flag BEFORE resetting it; reading it afterwards would
        // always report the utterance as completed.
        val wasInterrupted = pcmStopRequested.get()
        isSpeaking = false
        pcmStopRequested.set(false)
        abandonAudioFocus()

        notifyListeners("speakComplete", JSObject().apply {
            put("completed", !wasInterrupted && !failed)
            lastInterruptedAtSeconds?.let { put("interruptedAt", it) }
        })

        if (enabled) {
            listeningMode = true
            setState("listening", "Listening")
            mainHandler.post { startListeningInternal(markListening = true) }
        } else {
            setState("idle", "Off")
        }
    }
}
|
|
686
|
+
|
|
687
|
+
/**
 * Assembles the ElevenLabs request for [text].
 *
 * Each setting comes from [directive] when present, falling back to the
 * plugin's configured values and then the built-in defaults. Numeric
 * directive values that are missing (or below their accepted minimum) are
 * treated as absent before being handed to the matching validator.
 */
private fun buildElevenLabsRequest(text: String, directive: JSObject?): ElevenLabsRequest {
    /** Directive double that must be >= 0, or null when absent/negative. */
    fun nonNegativeDouble(key: String): Double? =
        directive?.optDouble(key, -1.0)?.takeIf { it >= 0 }

    val chosenModel = directive.stringOrNull("modelId")?.takeIf { it.isNotEmpty() }
        ?: modelId
        ?: DEFAULT_MODEL_ID

    val requestedFormat = directive.stringOrNull("outputFormat") ?: outputFormat
    val chosenFormat = validatedOutputFormat(requestedFormat) ?: DEFAULT_OUTPUT_FORMAT

    // Speed and rate must be strictly positive to count as provided.
    val requestedSpeed = directive?.optDouble("speed", -1.0)?.takeIf { it > 0 }
    val requestedRateWpm = directive?.optInt("rateWpm", -1)?.takeIf { it > 0 }

    // Null when the directive does not mention speakerBoost at all.
    val requestedBoost = directive
        ?.takeIf { it.has("speakerBoost") }
        ?.optBoolean("speakerBoost", false)

    return ElevenLabsRequest(
        text = text,
        modelId = chosenModel,
        outputFormat = chosenFormat,
        speed = resolveSpeed(requestedSpeed, requestedRateWpm),
        stability = validatedStability(nonNegativeDouble("stability"), chosenModel),
        similarity = validatedUnit(nonNegativeDouble("similarity")),
        style = validatedUnit(nonNegativeDouble("style")),
        speakerBoost = requestedBoost,
        seed = validatedSeed(directive?.optLong("seed", -1)?.takeIf { it >= 0 }),
        normalize = validatedNormalize(directive.stringOrNull("normalize")),
        language = validatedLanguage(directive.stringOrNull("language")),
        latencyTier = validatedLatencyTier(
            directive?.optInt("latencyTier", -1)?.takeIf { it >= 0 }
        )
    )
}
|
|
743
|
+
|
|
744
|
+
/**
 * String form of the value stored under [key], or null when the receiver is
 * null, the key is absent, or the value is JSON null.
 */
private fun JSObject?.stringOrNull(key: String): String? =
    this?.opt(key)
        ?.takeUnless { it === JSONObject.NULL }
        ?.toString()
|
|
749
|
+
|
|
750
|
+
/**
 * String form of the value stored under [key], or null when the receiver is
 * null, the key is absent, or the value is JSON null.
 */
private fun JSONObject?.stringOrNull(key: String): String? =
    this?.opt(key)
        ?.takeUnless { it === JSONObject.NULL }
        ?.toString()
|
|
755
|
+
|
|
756
|
+
/**
 * Stream PCM audio from ElevenLabs and play via AudioTrack.
 *
 * Runs on `Dispatchers.IO`. Builds a mono 16-bit streaming [AudioTrack] at the
 * sample rate parsed from the request's output format (24000 when it cannot be
 * parsed), posts the request, and copies the response into the track, looping
 * on partial writes. The loop returns early as soon as [pcmStopRequested] is
 * set. The track and the connection are always cleaned up, including when
 * starting playback or opening the connection fails.
 *
 * @throws IllegalStateException when the AudioTrack cannot be set up, a write
 *   fails without a stop request, or the server answers with status >= 400.
 */
private suspend fun streamAndPlayPcm(
    voiceId: String,
    apiKey: String,
    request: ElevenLabsRequest
) = withContext(Dispatchers.IO) {
    pcmStopRequested.set(false)

    val sampleRate = parsePcmSampleRate(request.outputFormat) ?: 24000
    val minBuffer = AudioTrack.getMinBufferSize(
        sampleRate,
        AudioFormat.CHANNEL_OUT_MONO,
        AudioFormat.ENCODING_PCM_16BIT
    )
    if (minBuffer <= 0) {
        throw IllegalStateException("AudioTrack buffer size invalid: $minBuffer")
    }

    val bufferSize = max(minBuffer * 2, 8 * 1024)
    val track = AudioTrack.Builder()
        .setAudioAttributes(
            AudioAttributes.Builder()
                .setUsage(AudioAttributes.USAGE_ASSISTANT)
                .setContentType(AudioAttributes.CONTENT_TYPE_SPEECH)
                .build()
        )
        .setAudioFormat(
            AudioFormat.Builder()
                .setEncoding(AudioFormat.ENCODING_PCM_16BIT)
                .setSampleRate(sampleRate)
                .setChannelMask(AudioFormat.CHANNEL_OUT_MONO)
                .build()
        )
        .setBufferSizeInBytes(bufferSize)
        .setTransferMode(AudioTrack.MODE_STREAM)
        .build()

    if (track.state != AudioTrack.STATE_INITIALIZED) {
        track.release()
        throw IllegalStateException("AudioTrack init failed")
    }
    pcmTrack = track

    // Everything from here on runs inside the try so that a failure to start
    // playback or to open the connection still reaches cleanupPcmTrack().
    var openedConnection: HttpURLConnection? = null
    try {
        track.play()

        Log.d(TAG, "PCM play start sampleRate=$sampleRate bufferSize=$bufferSize")
        val conn = openTtsConnection(voiceId, apiKey, request)
        openedConnection = conn

        val payload = buildRequestPayload(request)
        conn.outputStream.use { it.write(payload.toByteArray()) }

        val code = conn.responseCode
        if (code >= 400) {
            val errBody = conn.errorStream
                ?.use { it.readBytes().toString(Charsets.UTF_8) }
                ?: ""
            throw IllegalStateException("ElevenLabs API error: $code $errBody")
        }

        BufferedInputStream(conn.inputStream).use { input ->
            val buffer = ByteArray(8 * 1024)
            while (true) {
                if (pcmStopRequested.get()) return@withContext
                val bytesRead = input.read(buffer)
                if (bytesRead <= 0) break

                // Write all bytes, handling partial writes
                var offset = 0
                while (offset < bytesRead) {
                    if (pcmStopRequested.get()) return@withContext
                    val wrote = try {
                        track.write(buffer, offset, bytesRead - offset)
                    } catch (e: Throwable) {
                        // A write failing after a stop request is expected.
                        if (pcmStopRequested.get()) return@withContext
                        throw e
                    }
                    if (wrote <= 0) {
                        if (pcmStopRequested.get()) return@withContext
                        throw IllegalStateException("AudioTrack write failed: $wrote")
                    }
                    offset += wrote
                }
            }
        }

        // All data has been written; stop the track unless a stop was requested.
        if (!pcmStopRequested.get()) {
            track.stop()
        }
        Log.d(TAG, "PCM play done")
    } finally {
        cleanupPcmTrack()
        openedConnection?.disconnect()
    }
}
|
|
851
|
+
|
|
852
|
+
/**
 * Creates and configures the HTTP request used for ElevenLabs streaming
 * text-to-speech.
 *
 * The `optimize_streaming_latency` query parameter is appended only when
 * [request] carries a latency tier, and the `Accept` header is derived from
 * the requested output format via [resolveAcceptHeader].
 *
 * @param voiceId voice identifier inserted into the endpoint path
 * @param apiKey value sent in the `xi-api-key` header
 * @param request supplies the optional latency tier and the output format
 * @return a POST connection with output enabled and 30 s connect/read timeouts
 */
private fun openTtsConnection(
    voiceId: String,
    apiKey: String,
    request: ElevenLabsRequest
): HttpURLConnection {
    val endpoint = buildString {
        append("https://api.elevenlabs.io/v1/text-to-speech/$voiceId/stream")
        request.latencyTier?.let { tier -> append("?optimize_streaming_latency=$tier") }
    }

    return (URL(endpoint).openConnection() as HttpURLConnection).apply {
        requestMethod = "POST"
        connectTimeout = 30_000
        readTimeout = 30_000
        setRequestProperty("Content-Type", "application/json")
        setRequestProperty("Accept", resolveAcceptHeader(request.outputFormat))
        setRequestProperty("xi-api-key", apiKey)
        doOutput = true
    }
}
|
|
878
|
+
|
|
879
|
+
/**
 * Chooses the `Accept` header value for a requested output format.
 *
 * A format whose trimmed, lower-cased name starts with `pcm_` maps to
 * `audio/pcm`; anything else, including `null`, maps to `audio/mpeg`.
 */
private fun resolveAcceptHeader(outputFormat: String?): String {
    val wantsPcm = outputFormat
        ?.trim()
        ?.lowercase()
        ?.startsWith("pcm_")
        ?: false
    return when {
        wantsPcm -> "audio/pcm"
        else -> "audio/mpeg"
    }
}
|
|
883
|
+
|
|
884
|
+
/**
 * Serializes [request] into the JSON body posted to ElevenLabs.
 *
 * `text` is always written. `model_id` and `output_format` are written only
 * when non-empty; `seed`, `apply_text_normalization` and `language_code` only
 * when non-null. A `voice_settings` object is added only if at least one of
 * speed, stability, similarity, style or speaker boost is set.
 *
 * @return the JSON document as a string
 */
private fun buildRequestPayload(request: ElevenLabsRequest): String {
    // Each present setting is pre-rendered as a `"key":value` member.
    val voiceSettings = listOfNotNull(
        request.speed?.let { "\"speed\":$it" },
        request.stability?.let { "\"stability\":$it" },
        request.similarity?.let { "\"similarity_boost\":$it" },
        request.style?.let { "\"style\":$it" },
        request.speakerBoost?.let { "\"use_speaker_boost\":$it" }
    )

    return buildString {
        append("{")
        append("\"text\":").append(jsonString(request.text))

        val modelId = request.modelId
        if (!modelId.isNullOrEmpty()) {
            append(",\"model_id\":").append(jsonString(modelId))
        }
        val outputFormat = request.outputFormat
        if (!outputFormat.isNullOrEmpty()) {
            append(",\"output_format\":").append(jsonString(outputFormat))
        }
        request.seed?.let { seed -> append(",\"seed\":$seed") }
        request.normalize?.let { mode ->
            append(",\"apply_text_normalization\":").append(jsonString(mode))
        }
        request.language?.let { code ->
            append(",\"language_code\":").append(jsonString(code))
        }

        if (voiceSettings.isNotEmpty()) {
            append(",\"voice_settings\":{")
            append(voiceSettings.joinToString(","))
            append("}")
        }

        append("}")
    }
}
|
|
917
|
+
|
|
918
|
+
/**
 * Encodes [value] as a JSON string literal, including the surrounding quotes.
 *
 * Backslash, double quote, newline, carriage return, tab, backspace and form
 * feed use their short escapes. Every other character below U+0020 is written
 * as a six-character hexadecimal escape, because JSON does not allow raw
 * control characters inside a string.
 *
 * @param value text to encode; may be empty
 * @return the quoted and escaped literal
 */
private fun jsonString(value: String): String = buildString(value.length + 2) {
    append('"')
    for (ch in value) {
        when (ch) {
            '\\' -> append("\\\\")
            '"' -> append("\\\"")
            '\n' -> append("\\n")
            '\r' -> append("\\r")
            '\t' -> append("\\t")
            '\b' -> append("\\b")
            '\u000C' -> append("\\f")
            else ->
                if (ch < ' ') {
                    // Remaining control characters have no short form.
                    append("\\u").append(ch.code.toString(16).padStart(4, '0'))
                } else {
                    append(ch)
                }
        }
    }
    append('"')
}
|
|
928
|
+
|
|
929
|
+
/**
 * Speaks [text] with the platform [TextToSpeech] engine and resolves [call]
 * with the outcome.
 *
 * [call] is always resolved (never rejected) with `completed`, `interrupted`
 * and `usedSystemTts`, plus `error` on failure. The function waits up to
 * 180 s for the pending deferred registered under a fresh utterance id.
 *
 * If the engine does not accept the utterance (`speak` returns anything other
 * than [TextToSpeech.SUCCESS]), the call is resolved with an error at once
 * instead of waiting for the timeout.
 *
 * @param text text to speak
 * @param call plugin call to resolve with the result object
 */
private suspend fun speakWithSystemTts(text: String, call: PluginCall) {
    // Builds the result object; keys are added in a fixed order.
    fun outcome(completed: Boolean, error: String? = null): JSObject = JSObject().apply {
        put("completed", completed)
        put("interrupted", false)
        put("usedSystemTts", true)
        if (error != null) put("error", error)
    }

    usedSystemTts = true
    setState("speaking", "Speaking (System)")

    if (!systemTtsReady || systemTts == null) {
        call.resolve(outcome(completed = false, error = "System TTS not available"))
        return
    }

    val utteranceId = "talkmode-${UUID.randomUUID()}"
    val deferred = CompletableDeferred<Unit>()
    // Only one utterance is tracked at a time; a previous waiter is cancelled.
    systemTtsPending?.cancel()
    systemTtsPending = deferred
    systemTtsPendingId = utteranceId

    val queued = withContext(Dispatchers.Main) {
        systemTts?.speak(text, TextToSpeech.QUEUE_FLUSH, Bundle(), utteranceId)
    }
    if (queued != TextToSpeech.SUCCESS) {
        // The utterance was not queued (or the engine went away), so nothing
        // is expected to complete `deferred`; fail fast rather than time out.
        if (systemTtsPendingId == utteranceId) {
            systemTtsPending = null
            systemTtsPendingId = null
        }
        deferred.cancel()
        call.resolve(outcome(completed = false, error = "System TTS failed to start"))
        return
    }

    try {
        withContext(Dispatchers.IO) {
            kotlinx.coroutines.withTimeout(180_000) { deferred.await() }
        }
        call.resolve(outcome(completed = true))
    } catch (e: Exception) {
        // Also reached on timeout and when the pending deferred is cancelled
        // (for example by stopSpeakingInternal); the call is still resolved.
        call.resolve(outcome(completed = false, error = e.message ?: "System TTS error"))
    }
}
|
|
972
|
+
|
|
973
|
+
// ── Audio focus ─────────────────────────────────────────────────────
|
|
974
|
+
|
|
975
|
+
/**
 * Requests transient, may-duck audio focus on the music stream.
 *
 * The listener is stored in `audioFocusRequest` so it can later be handed to
 * [abandonAudioFocus]. On `AUDIOFOCUS_LOSS` or `AUDIOFOCUS_LOSS_TRANSIENT`
 * it stops speech if the plugin is currently speaking. Does nothing when no
 * audio manager is available.
 */
private fun requestAudioFocus() {
    val manager = audioManager ?: return

    val onFocusChange = AudioManager.OnAudioFocusChangeListener { change ->
        val focusLost = change == AudioManager.AUDIOFOCUS_LOSS ||
            change == AudioManager.AUDIOFOCUS_LOSS_TRANSIENT
        // Another app took audio; only act if we are mid-utterance.
        if (focusLost && isSpeaking) {
            stopSpeakingInternal()
        }
    }
    audioFocusRequest = onFocusChange

    @Suppress("DEPRECATION")
    manager.requestAudioFocus(
        onFocusChange,
        AudioManager.STREAM_MUSIC,
        AudioManager.AUDIOFOCUS_GAIN_TRANSIENT_MAY_DUCK
    )
}
|
|
997
|
+
|
|
998
|
+
/**
 * Gives back audio focus using the listener stored by [requestAudioFocus] and
 * clears the stored reference. Does nothing when there is no audio manager or
 * no stored listener.
 */
private fun abandonAudioFocus() {
    val manager = audioManager ?: return
    audioFocusRequest?.let { registered ->
        @Suppress("DEPRECATION")
        manager.abandonAudioFocus(registered)
        audioFocusRequest = null
    }
}
|
|
1005
|
+
|
|
1006
|
+
// ── Cleanup helpers ─────────────────────────────────────────────────
|
|
1007
|
+
|
|
1008
|
+
/**
 * Stops any speech in progress, on both the PCM and the system TTS paths, and
 * marks the plugin as no longer speaking.
 */
private fun stopSpeakingInternal() {
    // PCM path: raise the stop flag before the track is torn down.
    pcmStopRequested.set(true)
    cleanupPcmTrack()

    // System TTS path: silence the engine and drop the pending waiter.
    systemTts?.run { stop() }
    systemTtsPending?.also { pending -> pending.cancel() }
    systemTtsPending = null
    systemTtsPendingId = null

    speakingJob?.also { job -> job.cancel() }
    isSpeaking = false
}
|
|
1018
|
+
|
|
1019
|
+
/**
 * Stops and releases the current PCM [AudioTrack], if any.
 *
 * This is called both from the streaming code's `finally` block and from
 * [stopSpeakingInternal]. The field is therefore detached under a lock
 * before teardown, so at most one caller tears down a given track and the
 * field never keeps pointing at a released one.
 *
 * Errors from pause/flush/stop are ignored on purpose; `release()` always runs.
 */
private fun cleanupPcmTrack() {
    val track = synchronized(this) {
        val current = pcmTrack
        pcmTrack = null
        current
    } ?: return

    try {
        track.pause()
        track.flush()
        track.stop()
    } catch (_: Throwable) {
        // ignore cleanup errors
    } finally {
        track.release()
    }
}
|
|
1032
|
+
|
|
1033
|
+
/**
 * Returns how long the current utterance has been playing, as
 * `(elapsedRealtime - speakStartTimeMs) / 1000`, or `null` when nothing is
 * being spoken.
 */
private fun computeInterruptedAt(): Double? =
    if (isSpeaking) {
        val elapsedMs = SystemClock.elapsedRealtime() - speakStartTimeMs
        elapsedMs.toDouble() / 1000.0
    } else {
        null
    }
|
|
1038
|
+
|
|
1039
|
+
// ── Voice alias resolution ──────────────────────────────────────────
|
|
1040
|
+
|
|
1041
|
+
/**
 * Maps a user-supplied voice name or id to the voice id to use.
 *
 * Resolution order after trimming:
 * 1. a `voiceAliases` entry keyed by the lower-cased input,
 * 2. the input itself if it equals (ignoring case) any alias target,
 * 3. the input itself if [isLikelyVoiceId] accepts it.
 *
 * @return the resolved id, or `null` for null/blank input or no match
 */
private fun resolveVoiceAlias(value: String?): String? {
    val candidate = value?.trim()?.takeIf { it.isNotEmpty() } ?: return null

    val aliased = voiceAliases[candidate.lowercase()]
    return when {
        aliased != null -> aliased
        voiceAliases.values.any { known -> known.equals(candidate, ignoreCase = true) } -> candidate
        isLikelyVoiceId(candidate) -> candidate
        else -> null
    }
}
|
|
1058
|
+
|
|
1059
|
+
/**
 * Heuristic for a raw voice id: at least 10 characters, each of which is a
 * letter, a digit, `-` or `_`.
 */
private fun isLikelyVoiceId(value: String): Boolean =
    value.length >= 10 &&
        value.none { ch -> !ch.isLetterOrDigit() && ch != '-' && ch != '_' }
|
|
1063
|
+
|
|
1064
|
+
// ── Validation helpers (from classic TalkModeRuntime) ───────────────
|
|
1065
|
+
|
|
1066
|
+
/**
 * Resolves the playback speed multiplier from either a words-per-minute rate
 * or an explicit speed.
 *
 * A positive [rateWpm] takes precedence and is converted as `rateWpm / 175`.
 * If that result is out of range, `null` is returned and [speed] is not used
 * as a fallback. Otherwise [speed] is used as given.
 *
 * The accepted range is exclusive on both ends: `0.5 < x < 2.0`. The check is
 * written as a positive range test so NaN is rejected too.
 *
 * @return the validated multiplier, or `null` if none is set or it is out of range
 */
private fun resolveSpeed(speed: Double?, rateWpm: Int?): Double? {
    val candidate = when {
        rateWpm != null && rateWpm > 0 -> rateWpm.toDouble() / 175.0
        else -> speed
    }
    return candidate?.takeIf { it > 0.5 && it < 2.0 }
}
|
|
1078
|
+
|
|
1079
|
+
/**
 * Accepts [value] only when it lies in the closed interval `[0, 1]`.
 *
 * The check is a positive range test, so NaN (which compares false against
 * every bound) is rejected rather than passed through.
 *
 * @return [value] when valid, otherwise `null` (also for `null` input)
 */
private fun validatedUnit(value: Double?): Double? =
    value?.takeIf { it >= 0.0 && it <= 1.0 }
|
|
1084
|
+
|
|
1085
|
+
/**
 * Validates a stability setting for the given model.
 *
 * For model `eleven_v3` (compared after trimming and lower-casing) only the
 * discrete values 0.0, 0.5 and 1.0 are accepted. For every other model the
 * value is checked by [validatedUnit].
 *
 * @return [value] when valid, otherwise `null` (also for `null` input)
 */
private fun validatedStability(value: Double?, modelId: String?): Double? {
    if (value == null) return null

    val isV3 = modelId?.trim()?.lowercase() == "eleven_v3"
    if (!isV3) return validatedUnit(value)

    // v3 only supports discrete stability values
    val isDiscreteStep = value == 0.0 || value == 0.5 || value == 1.0
    return if (isDiscreteStep) value else null
}
|
|
1094
|
+
|
|
1095
|
+
/**
 * Accepts a seed only when it lies in `0..4294967295`.
 *
 * @return [value] when valid, otherwise `null` (also for `null` input)
 */
private fun validatedSeed(value: Long?): Long? =
    value?.takeIf { seed -> seed in 0L..4294967295L }
|
|
1100
|
+
|
|
1101
|
+
/**
 * Normalizes a text-normalization mode: trims and lower-cases [value] and
 * accepts only `auto`, `on` or `off`.
 *
 * @return the normalized mode, or `null` when missing or unrecognized
 */
private fun validatedNormalize(value: String?): String? =
    when (val mode = value?.trim()?.lowercase()) {
        "auto", "on", "off" -> mode
        else -> null
    }
|
|
1105
|
+
|
|
1106
|
+
/**
 * Normalizes a language code: trims and lower-cases [value] and accepts it
 * only when it is exactly two characters, both in `a..z`.
 *
 * @return the normalized code, or `null` when missing or malformed
 */
private fun validatedLanguage(value: String?): String? =
    value?.trim()?.lowercase()?.takeIf { code ->
        code.length == 2 && code.all { ch -> ch in 'a'..'z' }
    }
|
|
1112
|
+
|
|
1113
|
+
/**
 * Normalizes an output format name: trims and lower-cases [value], then
 * accepts anything starting with `mp3_`, or a PCM format for which
 * [parsePcmSampleRate] yields a sample rate.
 *
 * @return the normalized format, or `null` when missing, blank or unsupported
 */
private fun validatedOutputFormat(value: String?): String? {
    val format = value?.trim()?.lowercase()?.takeIf { it.isNotEmpty() } ?: return null
    return when {
        format.startsWith("mp3_") -> format
        parsePcmSampleRate(format) != null -> format
        else -> null
    }
}
|
|
1119
|
+
|
|
1120
|
+
/**
 * Accepts a latency tier only when it lies in `0..4`.
 *
 * @return [value] when valid, otherwise `null` (also for `null` input)
 */
private fun validatedLatencyTier(value: Int?): Int? =
    value?.takeIf { tier -> tier in 0..4 }
|
|
1125
|
+
|
|
1126
|
+
/**
 * Extracts the sample rate from a `pcm_<rate>` format name.
 *
 * The input is trimmed and lower-cased; the leading run of digits after the
 * `pcm_` prefix is parsed, and anything following those digits is ignored.
 * Only 16000, 22050, 24000 and 44100 are accepted.
 *
 * @return the sample rate, or `null` for non-PCM, malformed or unsupported input
 */
private fun parsePcmSampleRate(value: String?): Int? {
    val format = value?.trim()?.lowercase() ?: return null
    if (!format.startsWith("pcm_")) return null

    return format
        .substring("pcm_".length)
        .takeWhile { ch -> ch.isDigit() }
        .toIntOrNull()
        ?.takeIf { rate -> rate == 16000 || rate == 22050 || rate == 24000 || rate == 44100 }
}
|
|
1134
|
+
|
|
1135
|
+
// ── State management ────────────────────────────────────────────────
|
|
1136
|
+
|
|
1137
|
+
/**
 * Records the new state and status text, then emits a `stateChange` event
 * carrying the new state, the previous state, the status text and whether
 * system TTS is in use.
 */
private fun setState(newState: String, newStatusText: String) {
    // Capture the outgoing state before it is overwritten.
    val previous = state
    state = newState
    statusText = newStatusText

    val event = JSObject()
    event.put("state", newState)
    event.put("previousState", previous)
    event.put("statusText", newStatusText)
    event.put("usingSystemTts", usedSystemTts)
    notifyListeners("stateChange", event)
}
|
|
1149
|
+
|
|
1150
|
+
/**
 * Builds the permission status object returned to JavaScript.
 *
 * `microphone` is `granted` or `denied`. `speechRecognition` is
 * `not_supported` when recognition is unavailable on the device, otherwise
 * `granted` or `prompt` depending on the microphone permission.
 */
private fun buildPermissionResult(): JSObject {
    val micGranted = isPermissionGranted(Manifest.permission.RECORD_AUDIO)
    val speechStatus = when {
        !SpeechRecognizer.isRecognitionAvailable(context) -> "not_supported"
        micGranted -> "granted"
        else -> "prompt"
    }

    val result = JSObject()
    result.put("microphone", if (micGranted) "granted" else "denied")
    result.put("speechRecognition", speechStatus)
    return result
}
|
|
1163
|
+
|
|
1164
|
+
/** Returns `true` when Capacitor reports [permission] as granted. */
private fun isPermissionGranted(permission: String): Boolean =
    PermissionState.GRANTED == getPermissionState(permission)
|
|
1167
|
+
|
|
1168
|
+
// ── Cleanup ─────────────────────────────────────────────────────────
|
|
1169
|
+
|
|
1170
|
+
/**
 * Tears the plugin down: disables listening, destroys the recognizer, shuts
 * down system TTS, releases the PCM track, cancels outstanding jobs, abandons
 * audio focus and cancels the coroutine scope.
 */
override fun handleOnDestroy() {
    super.handleOnDestroy()
    enabled = false
    stopRequested = true

    recognizer?.destroy()
    recognizer = null
    systemTts?.shutdown()
    systemTts = null

    // Raise the stop flag before releasing the track, as stopSpeakingInternal
    // does, so a streaming loop that is still writing exits quietly instead
    // of reporting "AudioTrack write failed".
    pcmStopRequested.set(true)
    cleanupPcmTrack()

    silenceJob?.cancel()
    restartJob?.cancel()
    speakingJob?.cancel()
    abandonAudioFocus()
    scope.cancel()
}
|
|
1185
|
+
|
|
1186
|
+
// ── Data class ──────────────────────────────────────────────────────
|
|
1187
|
+
|
|
1188
|
+
/**
 * Parameters of a single ElevenLabs text-to-speech request.
 *
 * @property text text to synthesize; always sent
 * @property modelId sent as `model_id` when non-empty
 * @property outputFormat sent as `output_format` when non-empty; also selects the `Accept` header
 * @property speed `voice_settings.speed`
 * @property stability `voice_settings.stability`
 * @property similarity `voice_settings.similarity_boost`
 * @property style `voice_settings.style`
 * @property speakerBoost `voice_settings.use_speaker_boost`
 * @property seed sent as `seed`
 * @property normalize sent as `apply_text_normalization`
 * @property language sent as `language_code`
 * @property latencyTier sent as the `optimize_streaming_latency` query parameter
 */
private data class ElevenLabsRequest(
    val text: String,
    val modelId: String?,
    val outputFormat: String?,
    val speed: Double?,
    val stability: Double?,
    val similarity: Double?,
    val style: Double?,
    val speakerBoost: Boolean?,
    val seed: Long?,
    val normalize: String?,
    val language: String?,
    val latencyTier: Int?,
)
|
|
1202
|
+
}
|