@elizaos/capacitor-swabble 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/ElizaosCapacitorSwabble.podspec +18 -0
- package/android/build.gradle +50 -0
- package/android/src/main/AndroidManifest.xml +4 -0
- package/android/src/main/java/ai/eliza/plugins/swabble/SwabblePlugin.kt +840 -0
- package/dist/esm/definitions.d.ts +218 -0
- package/dist/esm/definitions.d.ts.map +1 -0
- package/dist/esm/definitions.js +1 -0
- package/dist/esm/index.d.ts +4 -0
- package/dist/esm/index.d.ts.map +1 -0
- package/dist/esm/index.js +6 -0
- package/dist/esm/web.d.ts +54 -0
- package/dist/esm/web.d.ts.map +1 -0
- package/dist/esm/web.js +461 -0
- package/dist/plugin.cjs.js +477 -0
- package/dist/plugin.cjs.js.map +1 -0
- package/dist/plugin.js +480 -0
- package/dist/plugin.js.map +1 -0
- package/electrobun/src/global.d.ts +1 -0
- package/electrobun/src/index.ts +786 -0
- package/electrobun/tsconfig.json +16 -0
- package/ios/Sources/SwabblePlugin/SwabblePlugin.swift +1156 -0
- package/package.json +84 -0
|
@@ -0,0 +1,1156 @@
|
|
|
1
|
+
import Foundation
|
|
2
|
+
import Capacitor
|
|
3
|
+
import Speech
|
|
4
|
+
import AVFoundation
|
|
5
|
+
|
|
6
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
7
|
+
// MARK: - Thread-safe Audio Buffer Queue
|
|
8
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
9
|
+
|
|
10
|
+
/// Lock-protected FIFO of PCM buffers shared between the audio tap and its consumer.
///
/// Audio tap callbacks fire on a realtime audio thread. Each incoming buffer is
/// deep-copied into this queue, and the queue is drained on a main-thread timer that
/// feeds the speech recognition request, so the tap callback never talks to the
/// recognizer directly.
private final class AudioBufferQueue: @unchecked Sendable {
    private let lock = NSLock()
    private var buffers: [AVAudioPCMBuffer] = []

    /// Deep-copies `buffer` and appends the copy. The buffer is dropped silently
    /// when the copy cannot be made.
    func enqueue(_ buffer: AVAudioPCMBuffer) {
        // Copy before taking the lock so the critical section is only the append.
        guard let copy = buffer.deepCopy() else { return }
        lock.lock()
        defer { lock.unlock() }
        buffers.append(copy)
    }

    /// Returns every queued buffer in arrival order and leaves the queue empty.
    /// The backing storage is kept so the next burst of buffers can reuse it.
    func drain() -> [AVAudioPCMBuffer] {
        lock.lock()
        defer { lock.unlock() }
        let pending = buffers
        buffers.removeAll(keepingCapacity: true)
        return pending
    }

    /// Discards every queued buffer and releases the backing storage.
    func clear() {
        lock.lock()
        defer { lock.unlock() }
        buffers.removeAll(keepingCapacity: false)
    }
}
|
|
39
|
+
|
|
40
|
+
private extension AVAudioPCMBuffer {
    /// Returns an independent copy of this buffer (same format, same `frameLength`,
    /// same samples), or `nil` when the copy cannot be allocated or the sample type
    /// is not one of Float32 / Int16 / Int32.
    func deepCopy() -> AVAudioPCMBuffer? {
        let fmt = format
        let frames = frameLength
        guard let copy = AVAudioPCMBuffer(pcmFormat: fmt, frameCapacity: frames) else { return nil }
        copy.frameLength = frames

        // Deinterleaved: one plane of `frames` samples per channel.
        // Interleaved: the channel pointers all alias a single run of
        // `frames * channelCount` samples, so copy that run once from plane 0.
        // Copying `frames` samples per channel would only cover part of it.
        let channels = Int(fmt.channelCount)
        let planeCount = fmt.isInterleaved ? 1 : channels
        let samplesPerPlane = fmt.isInterleaved ? Int(frames) * channels : Int(frames)

        /// Copies `samplesPerPlane` samples for each plane from `src` to `dst`.
        func copyPlanes<Sample>(
            from src: UnsafePointer<UnsafeMutablePointer<Sample>>,
            to dst: UnsafePointer<UnsafeMutablePointer<Sample>>
        ) {
            for plane in 0..<planeCount {
                dst[plane].update(from: src[plane], count: samplesPerPlane)
            }
        }

        // Exactly one of the typed accessors is non-nil for a given sample format.
        if let src = floatChannelData, let dst = copy.floatChannelData {
            copyPlanes(from: src, to: dst)
            return copy
        }
        if let src = int16ChannelData, let dst = copy.int16ChannelData {
            copyPlanes(from: src, to: dst)
            return copy
        }
        if let src = int32ChannelData, let dst = copy.int32ChannelData {
            copyPlanes(from: src, to: dst)
            return copy
        }
        return nil
    }
}
|
|
61
|
+
|
|
62
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
63
|
+
// MARK: - Wake Word Gate (inlined from SwabbleKit)
|
|
64
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
65
|
+
|
|
66
|
+
/// Speech segment with timing data from SFSpeechRecognizer.
private struct WakeSegment {
    /// Recognized text of this segment.
    let text: String
    /// Time at which the segment starts.
    let start: TimeInterval
    /// Length of the segment.
    let duration: TimeInterval
    /// Location of `text` inside the full transcript, when it could be resolved.
    let range: Range<String.Index>?
    /// Time at which the segment ends (`start + duration`).
    var end: TimeInterval { start + duration }
}

/// Tunables for `WakeGate.match`.
private struct GateConfig {
    /// Trigger phrases; a phrase may contain several whitespace-separated words.
    var triggers: [String]
    /// Minimum pause required between the end of the trigger and the next token.
    var minPostTriggerGap: TimeInterval
    /// Minimum number of characters the extracted command must have.
    var minCommandLength: Int

    init(triggers: [String], minPostTriggerGap: TimeInterval = 0.45, minCommandLength: Int = 1) {
        self.triggers = triggers
        self.minPostTriggerGap = minPostTriggerGap
        self.minCommandLength = minCommandLength
    }
}

/// Result of a successful timing-based trigger match.
private struct GateMatch {
    /// The trigger phrase as originally configured.
    let triggerWord: String
    /// End time of the last trigger token.
    let triggerEndTime: TimeInterval
    /// Measured pause between the trigger and the token that follows it.
    let postGap: TimeInterval
    /// Command text spoken after the trigger.
    let command: String
}

/// Wake word detection engine. Matches trigger words against speech segments using timing
/// data to confirm a deliberate pause after the trigger, then extracts the command text.
/// Supports fuzzy matching via Levenshtein edit distance so that slightly misrecognized
/// triggers (e.g. "elisa" for trigger "eliza") still fire.
private enum WakeGate {

    // MARK: Token types

    /// A speech segment whose text has been normalized for comparison.
    private struct Token {
        let normalized: String
        let start: TimeInterval
        let end: TimeInterval
        let range: Range<String.Index>?
    }

    /// A trigger phrase split into normalized words, plus the phrase as configured.
    private struct TriggerTokens {
        let original: String
        let tokens: [String]
    }

    // MARK: Primary timing-based match

    /// Match trigger words against speech segments using timing data.
    ///
    /// A trigger counts only when it is followed by at least one more token and the
    /// pause before that token is at least `config.minPostTriggerGap`. When several
    /// positions qualify, the one latest in the utterance wins.
    ///
    /// - Returns: The match, or `nil` when no trigger qualifies or the extracted
    ///   command is shorter than `config.minCommandLength` characters.
    static func match(transcript: String, segments: [WakeSegment], config: GateConfig) -> GateMatch? {
        let triggers = normalizeTriggers(config.triggers)
        guard !triggers.isEmpty else { return nil }
        let tokens = normalizeSegments(segments)
        guard !tokens.isEmpty else { return nil }

        struct Candidate {
            let trigger: String
            let index: Int
            let triggerEnd: TimeInterval
            let gap: TimeInterval
        }

        var best: Candidate?
        for trig in triggers {
            let count = trig.tokens.count
            // A token must follow the trigger, otherwise there is no gap to measure.
            guard count > 0, tokens.count > count else { continue }
            for i in 0..<(tokens.count - count) {
                // `fuzzyTokenMatch` accepts exact equality too, so one pass covers both.
                let matched = (0..<count).allSatisfy {
                    fuzzyTokenMatch(tokens[i + $0].normalized, trig.tokens[$0])
                }
                guard matched else { continue }
                let trigEnd = tokens[i + count - 1].end
                let gap = tokens[i + count].start - trigEnd
                guard gap >= config.minPostTriggerGap else { continue }
                // Keep the candidate that starts latest in the utterance.
                if let b = best, i <= b.index { continue }
                best = Candidate(trigger: trig.original, index: i, triggerEnd: trigEnd, gap: gap)
            }
        }

        guard let best else { return nil }
        let cmd = commandText(transcript: transcript, segments: segments, triggerEndTime: best.triggerEnd)
            .trimmingCharacters(in: wsPunct)
        guard cmd.count >= config.minCommandLength else { return nil }
        return GateMatch(triggerWord: best.trigger, triggerEndTime: best.triggerEnd,
                         postGap: best.gap, command: cmd)
    }

    // MARK: Command text extraction

    /// Extract command text from segments appearing after the trigger end time.
    ///
    /// When the first non-empty segment after the trigger carries a transcript range,
    /// the command is the transcript from that position to the end, which preserves
    /// the original spacing and punctuation. Otherwise the texts of the later segments
    /// are joined with single spaces.
    static func commandText(transcript: String, segments: [WakeSegment], triggerEndTime: TimeInterval) -> String {
        // Small epsilon so a segment ending exactly at the trigger end is not included.
        let threshold = triggerEndTime + 0.001
        for seg in segments where seg.start >= threshold {
            if normalizeToken(seg.text).isEmpty { continue }
            if let range = seg.range {
                return String(transcript[range.lowerBound...]).trimmingCharacters(in: wsPunct)
            }
            break
        }
        return segments
            .filter { $0.start >= threshold && !normalizeToken($0.text).isEmpty }
            .map(\.text).joined(separator: " ")
            .trimmingCharacters(in: wsPunct)
    }

    /// Find the earliest trigger occurrence in `text` and return everything after it.
    ///
    /// Words are compared with `fuzzyTokenMatch`, so a slightly misrecognized trigger
    /// still counts. Positions are scanned left to right; when several triggers match
    /// at the same position, the one listed first in `triggers` is used.
    ///
    /// - Returns: The trimmed text after the trigger (`""` when nothing follows it or
    ///   `text` has no words), or `text` unchanged when no trigger is found.
    static func textAfterTrigger(_ text: String, triggers: [String]) -> String {
        let words = text.split(whereSeparator: \.isWhitespace).map(String.init)
        guard !words.isEmpty else { return "" }
        let normalizedWords = words.map(normalizeToken)
        let phrases = normalizeTriggers(triggers).map(\.tokens)

        // Position is the outer loop so the first trigger *in the text* wins, not the
        // first trigger in the configured list.
        for start in words.indices {
            for phrase in phrases where start + phrase.count <= words.count {
                let matched = phrase.indices.allSatisfy {
                    fuzzyTokenMatch(normalizedWords[start + $0], phrase[$0])
                }
                guard matched else { continue }
                return words[(start + phrase.count)...]
                    .joined(separator: " ")
                    .trimmingCharacters(in: wsPunct)
            }
        }
        return text
    }

    // MARK: Text-only helpers (no timing data required)

    /// Quick text-only check for trigger presence.
    ///
    /// A trigger is present when the lowercased text contains it as a substring, or
    /// when any single whitespace-separated word fuzzily matches the whole trigger.
    static func matchesTextOnly(text: String, triggers: [String]) -> Bool {
        guard !text.isEmpty else { return false }
        let lower = text.lowercased()
        // Split once; the word list does not depend on the trigger being tested.
        let words = lower.split(whereSeparator: \.isWhitespace).map(String.init)
        return triggers.contains { trigger in
            let token = trigger.trimmingCharacters(in: wsPunct).lowercased()
            guard !token.isEmpty else { return false }
            return lower.contains(token) || words.contains { fuzzyTokenMatch($0, token) }
        }
    }

    /// Check if transcript begins with a trigger word (exactly or fuzzily).
    static func startsWithTrigger(transcript: String, triggers: [String]) -> Bool {
        let words = transcript.split(whereSeparator: \.isWhitespace)
            .map { normalizeToken(String($0)) }.filter { !$0.isEmpty }
        guard !words.isEmpty else { return false }
        return normalizeTriggers(triggers).contains { trig in
            let tw = trig.tokens
            guard words.count >= tw.count else { return false }
            return zip(tw, words.prefix(tw.count)).allSatisfy { fuzzyTokenMatch($0, $1) }
        }
    }

    /// Text-only command extraction fallback (when timing data is absent or unreliable).
    ///
    /// - Returns: The text after the trigger when the transcript contains a trigger,
    ///   starts with one, and the remainder has at least `minCommandLength` characters;
    ///   otherwise `nil`.
    static func textOnlyCommand(transcript: String, triggers: [String], minCommandLength: Int) -> String? {
        guard matchesTextOnly(text: transcript, triggers: triggers),
              startsWithTrigger(transcript: transcript, triggers: triggers) else { return nil }
        let after = textAfterTrigger(transcript, triggers: triggers)
        return after.count >= minCommandLength ? after : nil
    }

    // MARK: Fuzzy matching via Levenshtein distance

    /// Returns true if two normalized tokens are "close enough" to be considered a match.
    ///
    /// Identical tokens always match. When the longer token has at most two characters,
    /// only identical tokens match. Otherwise the edit distance must not exceed
    /// `max(1, (maxLen + 1) / 3)` using integer division: 1 edit for a longest length
    /// of 3–4, 2 edits for 5–7, 3 edits for 8–10.
    /// E.g. "elisa" ↔ "eliza" has distance 1 (threshold 2) → match;
    /// "melody" ↔ "eliza" has distance 4 (threshold 2) → no match.
    static func fuzzyTokenMatch(_ a: String, _ b: String) -> Bool {
        if a == b { return true }
        let maxLen = max(a.count, b.count)
        guard maxLen > 2 else { return false } // Very short words → exact only
        let threshold = max(1, (maxLen + 1) / 3)
        return editDistance(a, b) <= threshold
    }

    /// Levenshtein distance between `a` and `b`, counted in `Character`s, computed
    /// with two rolling rows.
    private static func editDistance(_ a: String, _ b: String) -> Int {
        let ac = Array(a), bc = Array(b)
        let m = ac.count, n = bc.count
        if m == 0 { return n }
        if n == 0 { return m }
        var prev = Array(0...n), curr = Array(repeating: 0, count: n + 1)
        for i in 1...m {
            curr[0] = i
            for j in 1...n {
                curr[j] = ac[i - 1] == bc[j - 1]
                    ? prev[j - 1]
                    : 1 + min(prev[j], curr[j - 1], prev[j - 1])
            }
            swap(&prev, &curr)
        }
        return prev[n]
    }

    // MARK: Normalization helpers

    /// Splits each trigger phrase into normalized words, dropping phrases that end up empty.
    private static func normalizeTriggers(_ triggers: [String]) -> [TriggerTokens] {
        triggers.compactMap { trig in
            let t = trig.split(whereSeparator: \.isWhitespace)
                .map { normalizeToken(String($0)) }.filter { !$0.isEmpty }
            return t.isEmpty ? nil : TriggerTokens(original: trig, tokens: t)
        }
    }

    /// Normalizes each segment's text, dropping segments that end up empty.
    private static func normalizeSegments(_ segments: [WakeSegment]) -> [Token] {
        segments.compactMap { seg in
            let n = normalizeToken(seg.text)
            return n.isEmpty ? nil : Token(normalized: n, start: seg.start, end: seg.end, range: seg.range)
        }
    }

    /// Strips leading/trailing whitespace and punctuation, then lowercases.
    static func normalizeToken(_ t: String) -> String {
        t.trimmingCharacters(in: wsPunct).lowercased()
    }

    private static let wsPunct = CharacterSet.whitespacesAndNewlines.union(.punctuationCharacters)
}
|
|
291
|
+
|
|
292
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
293
|
+
// MARK: - SFTranscription → WakeSegment
|
|
294
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
295
|
+
|
|
296
|
+
private extension SFTranscription {
    /// Converts the recognizer's segments into `WakeSegment` values.
    ///
    /// - Parameter transcript: The string each segment's `substringRange` is resolved
    ///   against; a segment whose range cannot be resolved gets a `nil` range.
    func toWakeSegments(transcript: String) -> [WakeSegment] {
        var converted: [WakeSegment] = []
        converted.reserveCapacity(segments.count)
        for segment in segments {
            let textRange = Range(segment.substringRange, in: transcript)
            converted.append(
                WakeSegment(text: segment.substring,
                            start: segment.timestamp,
                            duration: segment.duration,
                            range: textRange)
            )
        }
        return converted
    }
}
|
|
304
|
+
|
|
305
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
306
|
+
// MARK: - Swabble Plugin
|
|
307
|
+
// ═══════════════════════════════════════════════════════════════════════════════
|
|
308
|
+
|
|
309
|
+
/**
|
|
310
|
+
* Swabble Plugin for Capacitor
|
|
311
|
+
*
|
|
312
|
+
* Provides voice wake word detection and continuous speech-to-text using Apple's
|
|
313
|
+
* Speech framework.
|
|
314
|
+
*
|
|
315
|
+
* State machine: idle → listening → triggered → capturing → listening
|
|
316
|
+
*/
|
|
317
|
+
@objc(SwabblePlugin)
|
|
318
|
+
public class SwabblePlugin: CAPPlugin, CAPBridgedPlugin {
|
|
319
|
+
public let identifier = "SwabblePlugin"
|
|
320
|
+
public let jsName = "Swabble"
|
|
321
|
+
public let pluginMethods: [CAPPluginMethod] = [
|
|
322
|
+
CAPPluginMethod(name: "start", returnType: CAPPluginReturnPromise),
|
|
323
|
+
CAPPluginMethod(name: "stop", returnType: CAPPluginReturnPromise),
|
|
324
|
+
CAPPluginMethod(name: "isListening", returnType: CAPPluginReturnPromise),
|
|
325
|
+
CAPPluginMethod(name: "getConfig", returnType: CAPPluginReturnPromise),
|
|
326
|
+
CAPPluginMethod(name: "updateConfig", returnType: CAPPluginReturnPromise),
|
|
327
|
+
CAPPluginMethod(name: "checkPermissions", returnType: CAPPluginReturnPromise),
|
|
328
|
+
CAPPluginMethod(name: "requestPermissions", returnType: CAPPluginReturnPromise),
|
|
329
|
+
CAPPluginMethod(name: "getAudioDevices", returnType: CAPPluginReturnPromise),
|
|
330
|
+
CAPPluginMethod(name: "setAudioDevice", returnType: CAPPluginReturnPromise),
|
|
331
|
+
]
|
|
332
|
+
|
|
333
|
+
// ── State Machine ──────────────────────────────────────────────────────
|
|
334
|
+
|
|
335
|
+
/// States of the wake-word pipeline. The raw value of each case is the string
/// sent to listeners in "stateChange" events.
private enum State: String {
    case idle
    case listening
    case triggered
    case capturing
}
|
|
338
|
+
|
|
339
|
+
/// Current pipeline state. Listeners receive a "stateChange" event carrying the new
/// state's raw value whenever the value actually changes; re-assigning the same
/// state emits nothing.
private var state: State = .idle {
    didSet {
        if state != oldValue {
            notifyListeners("stateChange", data: ["state": state.rawValue])
        }
    }
}
|
|
345
|
+
|
|
346
|
+
// ── Audio & Speech ─────────────────────────────────────────────────────
|
|
347
|
+
|
|
348
|
+
private var audioEngine: AVAudioEngine?
|
|
349
|
+
private var speechRecognizer: SFSpeechRecognizer?
|
|
350
|
+
private var recognitionRequest: SFSpeechAudioBufferRecognitionRequest?
|
|
351
|
+
private var recognitionTask: SFSpeechRecognitionTask?
|
|
352
|
+
private let bufferQueue = AudioBufferQueue()
|
|
353
|
+
private var drainTimer: Timer?
|
|
354
|
+
private var captureTimer: Timer?
|
|
355
|
+
private var recognitionGeneration: Int = 0
|
|
356
|
+
|
|
357
|
+
// ── Audio Level Tracking (written from audio thread, read on main) ─────
|
|
358
|
+
|
|
359
|
+
private let audioLevelLock = NSLock()
|
|
360
|
+
private var _pendingRMS: Double = 0
|
|
361
|
+
private var _pendingPeak: Double = 0
|
|
362
|
+
private var noiseFloorRMS: Double = 1e-4
|
|
363
|
+
private var lastSpeechTime: Date?
|
|
364
|
+
private var lastAudioLevelEmitTime: Date?
|
|
365
|
+
|
|
366
|
+
// Voice-activity detection tunables (from classic VoiceWakeRuntime)
|
|
367
|
+
private let minSpeechRMS: Double = 1e-3
|
|
368
|
+
private let speechBoostFactor: Double = 6.0
|
|
369
|
+
private let audioLevelEmitInterval: TimeInterval = 0.066 // ~15 Hz
|
|
370
|
+
|
|
371
|
+
// ── Capture State ──────────────────────────────────────────────────────
|
|
372
|
+
|
|
373
|
+
private var captureStartTime: Date?
|
|
374
|
+
private var capturedTranscript: String = ""
|
|
375
|
+
private var activeTriggerEndTime: TimeInterval?
|
|
376
|
+
private var heardBeyondTrigger: Bool = false
|
|
377
|
+
private var lastTranscript: String = ""
|
|
378
|
+
private var lastTranscriptTime: Date?
|
|
379
|
+
private var cooldownUntil: Date?
|
|
380
|
+
|
|
381
|
+
// Capture tunables (from classic VoiceWakeRuntime)
|
|
382
|
+
private let silenceWindow: TimeInterval = 2.0
|
|
383
|
+
private let triggerOnlySilenceWindow: TimeInterval = 5.0
|
|
384
|
+
private let captureHardStop: TimeInterval = 120.0
|
|
385
|
+
private let debounceAfterSend: TimeInterval = 0.35
|
|
386
|
+
private let triggerPauseWindow: TimeInterval = 0.55
|
|
387
|
+
private let restartDelay: TimeInterval = 0.5
|
|
388
|
+
|
|
389
|
+
// ── Configuration ──────────────────────────────────────────────────────
|
|
390
|
+
|
|
391
|
+
private var config: PluginConfig?
|
|
392
|
+
|
|
393
|
+
/// Plugin settings as supplied from JavaScript.
struct PluginConfig {
    /// Wake phrases to listen for.
    var triggers: [String]
    /// Minimum pause required after the trigger before the command.
    var minPostTriggerGap: TimeInterval
    /// Minimum number of characters a command must have.
    var minCommandLength: Int
    /// Locale identifier used to create the speech recognizer.
    var locale: String
    /// Requested sample rate.
    var sampleRate: Double

    /// Reads each setting from `obj`, substituting a default for any key that is
    /// missing or has the wrong type.
    init(from obj: JSObject) {
        triggers = obj["triggers"] as? [String] ?? ["eliza"]
        minPostTriggerGap = obj["minPostTriggerGap"] as? Double ?? 0.45
        minCommandLength = obj["minCommandLength"] as? Int ?? 1
        locale = obj["locale"] as? String ?? Locale.current.identifier
        sampleRate = obj["sampleRate"] as? Double ?? 16000
    }

    /// Serializes the settings back into the key layout `init(from:)` reads.
    func toJSObject() -> JSObject {
        let serialized: JSObject = [
            "triggers": triggers,
            "minPostTriggerGap": minPostTriggerGap,
            "minCommandLength": minCommandLength,
            "locale": locale,
            "sampleRate": sampleRate,
        ]
        return serialized
    }

    /// The subset of settings consumed by `WakeGate`.
    fileprivate var gateConfig: GateConfig {
        GateConfig(
            triggers: triggers,
            minPostTriggerGap: minPostTriggerGap,
            minCommandLength: minCommandLength
        )
    }
}
|
|
423
|
+
|
|
424
|
+
// ── Notification Observers ─────────────────────────────────────────────
|
|
425
|
+
|
|
426
|
+
private var interruptionObserver: NSObjectProtocol?
|
|
427
|
+
private var routeChangeObserver: NSObjectProtocol?
|
|
428
|
+
private var mediaResetObserver: NSObjectProtocol?
|
|
429
|
+
|
|
430
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
431
|
+
// MARK: - Lifecycle
|
|
432
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
433
|
+
|
|
434
|
+
/// Plugin load hook: runs the superclass implementation, then registers the
/// audio-session observers.
override public func load() {
    super.load()
    setupNotificationObservers()
}

/// Subscribes to the three `AVAudioSession` notifications the plugin reacts to
/// (interruption, route change, media services reset). Handlers are delivered on
/// the main queue and hold `self` weakly; the returned tokens are stored so they
/// can be removed later.
private func setupNotificationObservers() {
    let center = NotificationCenter.default

    interruptionObserver = center.addObserver(
        forName: AVAudioSession.interruptionNotification,
        object: nil,
        queue: .main
    ) { [weak self] notification in
        self?.handleAudioInterruption(notification)
    }

    routeChangeObserver = center.addObserver(
        forName: AVAudioSession.routeChangeNotification,
        object: nil,
        queue: .main
    ) { [weak self] _ in
        self?.handleRouteChange()
    }

    mediaResetObserver = center.addObserver(
        forName: AVAudioSession.mediaServicesWereResetNotification,
        object: nil,
        queue: .main
    ) { [weak self] _ in
        self?.handleMediaServicesReset()
    }
}
|
|
455
|
+
|
|
456
|
+
/// Removes whichever notification observers were registered.
deinit {
    let center = NotificationCenter.default
    for case let token? in [interruptionObserver, routeChangeObserver, mediaResetObserver] {
        center.removeObserver(token)
    }
}
|
|
461
|
+
|
|
462
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
463
|
+
// MARK: - Plugin Methods
|
|
464
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
465
|
+
|
|
466
|
+
/// Plugin method `start`: stores the supplied config, requests speech-recognition
/// authorization, and begins listening when it is granted.
///
/// Rejects when the `config` parameter is missing. For any authorization outcome
/// other than `.authorized` the call resolves with `started: false` and an `error`
/// message rather than rejecting.
@objc func start(_ call: CAPPluginCall) {
    guard let configObj = call.getObject("config") else {
        call.reject("Missing config parameter")
        return
    }
    let cfg = PluginConfig(from: configObj)
    config = cfg

    SFSpeechRecognizer.requestAuthorization { [weak self] status in
        // Continue on the main queue before touching plugin state.
        DispatchQueue.main.async {
            guard let self else { return }

            let failure: String?
            switch status {
            case .authorized:
                failure = nil
            case .denied, .restricted:
                failure = "Speech recognition not authorized"
            case .notDetermined:
                failure = "Speech recognition authorization pending"
            @unknown default:
                failure = "Unknown authorization status"
            }

            if let failure {
                call.resolve(["started": false, "error": failure])
            } else {
                self.beginListening(config: cfg, call: call)
            }
        }
    }
}
|
|
490
|
+
|
|
491
|
+
/// Plugin method `stop`: runs the internal stop routine and resolves.
@objc func stop(_ call: CAPPluginCall) {
    stopInternal()
    call.resolve()
}

/// Plugin method `isListening`: resolves with `listening: true` for every state
/// except `.idle`.
@objc func isListening(_ call: CAPPluginCall) {
    let active = state != .idle
    call.resolve(["listening": active])
}

/// Plugin method `getConfig`: resolves with the stored configuration, or with a
/// null `config` when none has been stored yet.
@objc func getConfig(_ call: CAPPluginCall) {
    guard let config else {
        call.resolve(["config": NSNull()])
        return
    }
    call.resolve(["config": config.toJSObject()])
}
|
|
507
|
+
|
|
508
|
+
/// Plugin method `updateConfig`: overwrites the stored configuration with whichever
/// recognized keys are present (and correctly typed) in the `config` parameter.
///
/// Rejects when `config` is missing. When no configuration has been stored yet, the
/// patch is ignored and the call still resolves.
@objc func updateConfig(_ call: CAPPluginCall) {
    guard let patch = call.getObject("config") else {
        call.reject("Missing config parameter")
        return
    }
    guard var updated = config else {
        call.resolve()
        return
    }

    if let triggers = patch["triggers"] as? [String] {
        updated.triggers = triggers
    }
    if let gap = patch["minPostTriggerGap"] as? Double {
        updated.minPostTriggerGap = gap
    }
    if let length = patch["minCommandLength"] as? Int {
        updated.minCommandLength = length
    }
    if let locale = patch["locale"] as? String {
        updated.locale = locale
    }
    if let rate = patch["sampleRate"] as? Double {
        updated.sampleRate = rate
    }

    config = updated
    call.resolve()
}
|
|
523
|
+
|
|
524
|
+
/// Plugin method `checkPermissions`: resolves with the current speech-recognition
/// and microphone permission states, without prompting the user.
@objc public override func checkPermissions(_ call: CAPPluginCall) {
    let speechStatus = SFSpeechRecognizer.authorizationStatus()
    let micStatus = AVAudioSession.sharedInstance().recordPermission

    let microphone = micPermissionString(micStatus)
    let speechRecognition = speechPermissionString(speechStatus)
    call.resolve([
        "microphone": microphone,
        "speechRecognition": speechRecognition,
    ])
}
|
|
532
|
+
|
|
533
|
+
/// Plugin method `requestPermissions`: requests speech-recognition authorization,
/// then microphone access, and resolves on the main queue with both outcomes.
@objc public override func requestPermissions(_ call: CAPPluginCall) {
    SFSpeechRecognizer.requestAuthorization { [weak self] speechStatus in
        // The microphone request is chained so both answers are known before resolving.
        AVAudioSession.sharedInstance().requestRecordPermission { micGranted in
            DispatchQueue.main.async {
                guard let self else { return }
                let microphone = self.micPermissionString(micGranted ? .granted : .denied)
                let speechRecognition = self.speechPermissionString(speechStatus)
                call.resolve([
                    "microphone": microphone,
                    "speechRecognition": speechRecognition,
                ])
            }
        }
    }
}
|
|
546
|
+
|
|
547
|
+
/// Plugin method `getAudioDevices`: resolves with one `{id, name, isDefault}` entry
/// per available audio input. `isDefault` marks the input whose UID equals that of
/// the first input on the current route.
@objc func getAudioDevices(_ call: CAPPluginCall) {
    let session = AVAudioSession.sharedInstance()
    let available = session.availableInputs ?? []
    let activeUID = session.currentRoute.inputs.first?.uid

    var devices: [[String: Any]] = []
    devices.reserveCapacity(available.count)
    for port in available {
        devices.append([
            "id": port.uid,
            "name": port.portName,
            "isDefault": port.uid == activeUID,
        ])
    }
    call.resolve(["devices": devices])
}
|
|
556
|
+
|
|
557
|
+
/// Plugin method `setAudioDevice`: makes the input whose UID equals `deviceId` the
/// session's preferred input.
///
/// Rejects when `deviceId` is missing, when no available input has that UID, or
/// when the session refuses the change.
@objc func setAudioDevice(_ call: CAPPluginCall) {
    guard let deviceId = call.getString("deviceId") else {
        call.reject("Missing deviceId")
        return
    }

    let session = AVAudioSession.sharedInstance()
    let candidate = session.availableInputs?.first { $0.uid == deviceId }
    guard let preferred = candidate else {
        call.reject("Audio device not found")
        return
    }

    do {
        try session.setPreferredInput(preferred)
    } catch {
        call.reject("Failed to set audio device: \(error.localizedDescription)")
        return
    }
    call.resolve()
}
|
|
575
|
+
|
|
576
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
577
|
+
// MARK: - Recognition Lifecycle
|
|
578
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
579
|
+
|
|
580
|
+
/// Creates the recognizer for `config.locale`, configures the audio session, starts
/// the recognition pipeline, and resolves `call` with the outcome.
///
/// Failures never reject: the call resolves with `started: false` and an `error`
/// message. On success the state becomes `.listening` and the call resolves with
/// `started: true`.
private func beginListening(config: PluginConfig, call: CAPPluginCall) {
    // Clean up any prior session without emitting idle stateChange
    stopInternal(emitIdle: false)

    speechRecognizer = SFSpeechRecognizer(locale: Locale(identifier: config.locale))
    guard let speechRecognizer, speechRecognizer.isAvailable else {
        call.resolve(["started": false,
                      "error": "Speech recognizer unavailable for locale \(config.locale)"])
        return
    }

    do {
        try configureAudioSession()
        try startRecognitionPipeline()
        state = .listening
        call.resolve(["started": true])
    } catch {
        // `startRecognitionPipeline` can throw after it has already stored a request,
        // created the engine and installed the tap. Tear that partial pipeline down so
        // a failed start does not leave resources behind.
        haltRecognitionPipeline()
        call.resolve(["started": false, "error": error.localizedDescription])
    }
}
|
|
600
|
+
|
|
601
|
+
/// Start audio engine + recognition task + drain timer.
///
/// Steps, in order: bump the recognition generation, create and store a new
/// recognition request, install a tap on the engine's input node, start the engine
/// and the drain timer, then start a recognition task whose callbacks are forwarded
/// to `handleRecognitionResult` on the main queue, tagged with this generation.
///
/// - Throws: `SwabbleError.noAudioInput` when the input format has no channels or a
///   zero sample rate; any error thrown by `AVAudioEngine.start()`.
private func startRecognitionPipeline() throws {
    recognitionGeneration &+= 1
    let generation = recognitionGeneration

    let request = SFSpeechAudioBufferRecognitionRequest()
    request.shouldReportPartialResults = true
    request.taskHint = .dictation
    if #available(iOS 16, *) {
        request.addsPunctuation = true
    }
    recognitionRequest = request

    // Reuse the existing engine when there is one; otherwise create and keep a new one.
    let engine = audioEngine ?? AVAudioEngine()
    audioEngine = engine

    let input = engine.inputNode
    let tapFormat = input.outputFormat(forBus: 0)
    guard tapFormat.channelCount > 0, tapFormat.sampleRate > 0 else {
        throw SwabbleError.noAudioInput
    }
    // Drop any tap left over from a previous run before installing a new one.
    input.removeTap(onBus: 0)

    // Audio tap: copy buffer (thread-safe) and store RMS for main thread to read.
    input.installTap(onBus: 0, bufferSize: 1024, format: tapFormat) { [weak self] buffer, _ in
        guard let self else { return }
        self.bufferQueue.enqueue(buffer)
        guard let (rms, peak) = Self.calculateRMS(buffer: buffer) else { return }
        self.audioLevelLock.lock()
        defer { self.audioLevelLock.unlock() }
        self._pendingRMS = rms
        self._pendingPeak = peak
    }

    engine.prepare()
    try engine.start()
    startDrainTimer()

    recognitionTask = speechRecognizer?.recognitionTask(with: request) { [weak self] result, error in
        DispatchQueue.main.async {
            self?.handleRecognitionResult(result: result, error: error, generation: generation)
        }
    }
}
|
|
645
|
+
|
|
646
|
+
/// Soft restart: keep audio engine running, just restart recognition request + task.
/// Used when the recognizer hits its ~1-minute limit or encounters a transient error.
///
/// The current task and request are ended immediately. A replacement is created after
/// `restartDelay`, but only if the plugin is still active and no other restart or
/// halt has happened in the meantime.
private func softRestartRecognition() {
    recognitionGeneration &+= 1
    let generation = recognitionGeneration

    recognitionTask?.cancel()
    recognitionTask = nil
    recognitionRequest?.endAudio()
    recognitionRequest = nil

    guard state != .idle, let speechRecognizer, speechRecognizer.isAvailable else { return }

    DispatchQueue.main.asyncAfter(deadline: .now() + restartDelay) { [weak self] in
        guard let self, self.state != .idle else { return }
        // Every restart/halt path bumps the generation. If it moved while this block was
        // waiting, a newer restart owns the pipeline. Creating another request/task here
        // would overwrite it without cancelling it.
        guard self.recognitionGeneration == generation else { return }

        let request = SFSpeechAudioBufferRecognitionRequest()
        request.shouldReportPartialResults = true
        request.taskHint = .dictation
        if #available(iOS 16, *) { request.addsPunctuation = true }
        self.recognitionRequest = request

        self.recognitionTask = self.speechRecognizer?.recognitionTask(with: request) {
            [weak self] result, error in
            DispatchQueue.main.async {
                self?.handleRecognitionResult(result: result, error: error, generation: generation)
            }
        }
    }
}
|
|
677
|
+
|
|
678
|
+
/// Hard restart: tear everything down and rebuild.
/// Used after audio interruptions or media services reset.
///
/// The pipeline is halted immediately and rebuilt after `restartDelay`. When the
/// rebuild fails, a recoverable "restart_failed" error is emitted and the whole
/// restart is retried after a fixed 3-second delay, for as long as the plugin is
/// not idle.
private func hardRestartRecognition() {
    haltRecognitionPipeline()
    guard let config, state != .idle else { return }

    DispatchQueue.main.asyncAfter(deadline: .now() + restartDelay) { [weak self] in
        guard let self, self.state != .idle else { return }
        // Halting cleared the engine. If one is running again, an overlapping restart
        // (e.g. interruption and route change arriving together) already rebuilt the
        // pipeline. Rebuilding again would orphan its request and task.
        guard self.audioEngine?.isRunning != true else { return }

        self.speechRecognizer = SFSpeechRecognizer(locale: Locale(identifier: config.locale))
        do {
            try self.configureAudioSession()
            try self.startRecognitionPipeline()
        } catch {
            // Do not leave a half-built pipeline around while waiting for the retry.
            self.haltRecognitionPipeline()
            self.emitError(code: "restart_failed", message: error.localizedDescription,
                           recoverable: true)
            // Fixed-delay retry (3 s); the retry re-checks that the plugin is not idle.
            DispatchQueue.main.asyncAfter(deadline: .now() + 3.0) { [weak self] in
                self?.hardRestartRecognition()
            }
        }
    }
}
|
|
700
|
+
|
|
701
|
+
/// Tear down the recognition pipeline: recognition task and request, drain timer,
/// queued buffers, input tap, and the audio engine itself.
private func haltRecognitionPipeline() {
    // Bump the generation first so callbacks from the session being torn down are
    // treated as stale by `handleRecognitionResult`.
    recognitionGeneration &+= 1

    // Recognition side.
    recognitionTask?.cancel()
    recognitionTask = nil
    recognitionRequest?.endAudio()
    recognitionRequest = nil

    // Buffer hand-off side.
    stopDrainTimer()
    bufferQueue.clear()

    // Audio capture side.
    if let engine = audioEngine {
        engine.inputNode.removeTap(onBus: 0)
        engine.stop()
    }
    audioEngine = nil
}
|
|
714
|
+
|
|
715
|
+
/// Full stop: halt the pipeline, clear all capture/transcript bookkeeping, and
/// deactivate the shared audio session.
///
/// - Parameter emitIdle: When `true` (the default) `state` is set to `.idle` at the end;
///   when `false` the current `state` value is left untouched.
private func stopInternal(emitIdle: Bool = true) {
    // Recognition + audio teardown, then the capture poller.
    haltRecognitionPipeline()
    stopCaptureTimer()
    speechRecognizer = nil

    // Capture-session bookkeeping.
    capturedTranscript = ""
    captureStartTime = nil
    activeTriggerEndTime = nil
    heardBeyondTrigger = false

    // Transcript / speech timing and level tracking.
    lastTranscript = ""
    lastTranscriptTime = nil
    lastSpeechTime = nil
    cooldownUntil = nil
    noiseFloorRMS = 1e-4

    // Best effort: a failure to deactivate the session is deliberately ignored.
    let session = AVAudioSession.sharedInstance()
    try? session.setActive(false, options: .notifyOthersOnDeactivation)

    guard emitIdle else { return }
    state = .idle
}
|
|
732
|
+
|
|
733
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
734
|
+
// MARK: - Recognition Result Handling
|
|
735
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
736
|
+
|
|
737
|
+
/// Processes one recognition callback (already dispatched to the main queue by the
/// recognition task handlers).
///
/// - Parameters:
///   - result: Recognition result, if any.
///   - error: Recognition error, if any. An error takes precedence over `result`.
///   - generation: Generation the callback's session was created with; callbacks whose
///     generation differs from `recognitionGeneration` are ignored.
private func handleRecognitionResult(result: SFSpeechRecognitionResult?,
                                     error: Error?, generation: Int) {
    // Ignore callbacks that belong to a superseded recognition session.
    if generation != recognitionGeneration { return }

    if let failure = error {
        // Errors while idle are swallowed; otherwise report and soft-restart.
        guard state != .idle else { return }
        emitError(code: "recognition_error", message: failure.localizedDescription,
                  recoverable: true)
        softRestartRecognition()
        return
    }

    guard let result else { return }
    let best = result.bestTranscription
    let transcript = best.formattedString
    let isFinal = result.isFinal
    let segments = best.toWakeSegments(transcript: transcript)
    let confidence = best.segments.last?.confidence ?? 0

    // Forward the raw transcript (with per-segment timing) to JS listeners.
    var jsSegments: [[String: Any]] = []
    jsSegments.reserveCapacity(segments.count)
    for seg in segments {
        jsSegments.append([
            "text": seg.text, "start": seg.start, "duration": seg.duration, "isFinal": isFinal,
        ])
    }
    let payload: [String: Any] = [
        "transcript": transcript,
        "segments": jsSegments,
        "isFinal": isFinal,
        "confidence": Double(confidence),
    ]
    notifyListeners("transcript", data: payload)

    // Remember the latest non-empty transcript and when it arrived.
    if !transcript.isEmpty {
        lastTranscript = transcript
        lastTranscriptTime = Date()
    }

    // Route to the handler for the current state.
    switch state {
    case .listening:
        handleListeningResult(transcript: transcript, segments: segments,
                              isFinal: isFinal, confidence: confidence)
    case .capturing:
        handleCapturingResult(transcript: transcript, segments: segments, isFinal: isFinal)
    case .triggered, .idle:
        break
    }

    // A final result ends the recognition session; start a new one to keep listening.
    guard isFinal, state != .idle else { return }
    softRestartRecognition()
}
|
|
786
|
+
|
|
787
|
+
// ── Listening state: look for wake word ────────────────────────────────
|
|
788
|
+
|
|
789
|
+
/// Looks for a wake word in a transcript received while in the `.listening` state.
///
/// Tries, in order: (1) `WakeGate.match` using segment timing, (2) on final results only,
/// `WakeGate.textOnlyCommand`, reporting the first configured trigger with zeroed timing,
/// (3) a deferred trigger-only check when the transcript is just the trigger.
/// Nothing happens without a config or while `cooldownUntil` lies in the future.
private func handleListeningResult(transcript: String, segments: [WakeSegment],
                                   isFinal: Bool, confidence: Float) {
    guard let config else { return }
    if let until = cooldownUntil, Date() < until { return }

    // 1) Timing-based match.
    var gateMatch = WakeGate.match(transcript: transcript, segments: segments,
                                   config: config.gateConfig)

    // 2) Text-only fallback, considered only for final results.
    if gateMatch == nil, isFinal,
       let command = WakeGate.textOnlyCommand(transcript: transcript,
                                              triggers: config.triggers,
                                              minCommandLength: config.minCommandLength) {
        gateMatch = GateMatch(triggerWord: config.triggers.first ?? "", triggerEndTime: 0,
                              postGap: 0, command: command)
    }

    if let gateMatch {
        triggerWakeWord(match: gateMatch, transcript: transcript, confidence: confidence)
        return
    }

    // 3) Only the wake word so far: re-check after the pause window.
    if isTriggerOnly(transcript: transcript) {
        scheduleTriggerOnlyCheck(transcript: transcript)
    }
}
|
|
818
|
+
|
|
819
|
+
// ── Capturing state: accumulate post-trigger speech ────────────────────
|
|
820
|
+
|
|
821
|
+
/// Updates the captured command from a transcript received while in the `.capturing`
/// state.
///
/// Prefers `WakeGate.commandText` when a trigger end time and segments are available,
/// and falls back to `WakeGate.textAfterTrigger` when they are not or when the timed
/// extraction is empty. An empty command leaves the capture state unchanged.
private func handleCapturingResult(transcript: String, segments: [WakeSegment],
                                   isFinal: Bool) {
    guard let config else { return }

    var command = ""
    if let trigEnd = activeTriggerEndTime, !segments.isEmpty {
        command = WakeGate.commandText(transcript: transcript, segments: segments,
                                       triggerEndTime: trigEnd)
    }
    if command.isEmpty {
        // No usable timing data (or it yielded nothing): extract by text instead.
        command = WakeGate.textAfterTrigger(transcript, triggers: config.triggers)
    }

    guard !command.isEmpty else { return }
    capturedTranscript = command
    if !heardBeyondTrigger { heardBeyondTrigger = true }
}
|
|
842
|
+
|
|
843
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
844
|
+
// MARK: - Wake Word Trigger & Capture
|
|
845
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
846
|
+
|
|
847
|
+
/// Moves to `.triggered`, emits the `wakeWord` event for `match`, and begins capture
/// seeded with the match's command and trigger end time.
private func triggerWakeWord(match: GateMatch, transcript: String, confidence: Float) {
    state = .triggered

    let payload: [String: Any] = [
        "wakeWord": match.triggerWord,
        "command": match.command,
        "transcript": transcript,
        "postGap": match.postGap,
        "confidence": Double(confidence),
    ]
    notifyListeners("wakeWord", data: payload)

    beginCapture(initialCommand: match.command, triggerEndTime: match.triggerEndTime)
}
|
|
858
|
+
|
|
859
|
+
/// Enters the `.capturing` state and starts the capture timeout poller.
///
/// - Parameters:
///   - initialCommand: Command text already heard after the trigger (may be empty).
///   - triggerEndTime: Trigger end time stored in `activeTriggerEndTime` for later
///     timing-based command extraction.
private func beginCapture(initialCommand: String, triggerEndTime: TimeInterval) {
    state = .capturing

    // Seed the capture with whatever followed the trigger.
    capturedTranscript = initialCommand
    captureStartTime = Date()
    activeTriggerEndTime = triggerEndTime
    // An empty initial command means only the trigger has been heard so far.
    heardBeyondTrigger = !initialCommand.isEmpty
    lastSpeechTime = Date()
    cooldownUntil = nil

    startCaptureTimer()
}
|
|
869
|
+
|
|
870
|
+
/// (Re)starts the repeating 0.2 s timer that polls `checkCaptureTimeout()`.
private func startCaptureTimer() {
    // Never run two pollers at once.
    stopCaptureTimer()
    let poller = Timer.scheduledTimer(withTimeInterval: 0.2, repeats: true) { [weak self] _ in
        guard let self else { return }
        self.checkCaptureTimeout()
    }
    captureTimer = poller
}
|
|
876
|
+
|
|
877
|
+
/// Invalidates and releases the capture timeout timer, if one is running.
private func stopCaptureTimer() {
    if let poller = captureTimer {
        poller.invalidate()
    }
    captureTimer = nil
}
|
|
881
|
+
|
|
882
|
+
/// Timer callback that decides whether the current capture should end.
///
/// Finalizes when the capture has lasted at least `captureHardStop`, or when no speech
/// has been marked for `silenceWindow` (`triggerOnlySilenceWindow` if nothing beyond the
/// trigger has been heard). Stops the timer if the plugin is no longer capturing.
private func checkCaptureTimeout() {
    if state != .capturing {
        stopCaptureTimer()
        return
    }
    let now = Date()

    // Maximum capture duration reached?
    let hitHardStop = captureStartTime.map { now.timeIntervalSince($0) >= captureHardStop } ?? false
    if hitHardStop {
        finalizeCapture()
        return
    }

    // Silence detection: the allowed gap depends on whether post-trigger speech was heard.
    let silenceLimit = heardBeyondTrigger ? silenceWindow : triggerOnlySilenceWindow
    guard let lastSpeech = lastSpeechTime,
          now.timeIntervalSince(lastSpeech) >= silenceLimit else { return }
    finalizeCapture()
}
|
|
898
|
+
|
|
899
|
+
/// Ends the current capture: emits the captured command (if non-empty) as a final
/// `transcript` event, starts the post-send cooldown, clears capture bookkeeping, and
/// returns to `.listening` with a fresh recognition session.
private func finalizeCapture() {
    if state != .capturing { return }
    stopCaptureTimer()
    cooldownUntil = Date().addingTimeInterval(debounceAfterSend)

    let command = capturedTranscript.trimmingCharacters(in: .whitespacesAndNewlines)
    if !command.isEmpty {
        let noSegments: [[String: Any]] = []
        let payload: [String: Any] = [
            "transcript": command,
            "segments": noSegments,
            "isFinal": true,
            "confidence": 1.0,
        ]
        notifyListeners("transcript", data: payload)
    }

    // Clear per-capture bookkeeping.
    capturedTranscript = ""
    captureStartTime = nil
    activeTriggerEndTime = nil
    heardBeyondTrigger = false
    lastSpeechTime = nil

    // Back to wake-word listening on a new recognition session.
    state = .listening
    softRestartRecognition()
}
|
|
923
|
+
|
|
924
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
925
|
+
// MARK: - Trigger-Only Detection
|
|
926
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
927
|
+
|
|
928
|
+
/// Returns `true` when the transcript passes `WakeGate.matchesTextOnly` and
/// `WakeGate.startsWithTrigger` for the configured triggers and
/// `WakeGate.textAfterTrigger` yields nothing, i.e. there is no command after the
/// trigger. Returns `false` when there is no config.
private func isTriggerOnly(transcript: String) -> Bool {
    guard let config else { return false }
    let triggers = config.triggers

    if !WakeGate.matchesTextOnly(text: transcript, triggers: triggers) { return false }
    if !WakeGate.startsWithTrigger(transcript: transcript, triggers: triggers) { return false }

    let remainder = WakeGate.textAfterTrigger(transcript, triggers: triggers)
    return remainder.isEmpty
}
|
|
937
|
+
|
|
938
|
+
/// Schedules a check after `triggerPauseWindow`: if the plugin is still listening, no
/// newer transcript has arrived, the transcript is still trigger-only, and no cooldown is
/// active, emit `wakeWord` with an empty command and begin capture.
private func scheduleTriggerOnlyCheck(transcript: String) {
    // Snapshot used to detect whether a newer transcript arrived during the pause.
    let scheduledAt = lastTranscriptTime

    DispatchQueue.main.asyncAfter(deadline: .now() + triggerPauseWindow) { [weak self] in
        guard let self,
              self.state == .listening,
              self.lastTranscriptTime == scheduledAt,
              self.lastTranscript == transcript,
              self.isTriggerOnly(transcript: transcript)
        else { return }
        if let until = self.cooldownUntil, Date() < until { return }

        // The first configured trigger is reported as the wake word.
        let word = self.config?.triggers.first ?? ""
        self.state = .triggered
        let payload: [String: Any] = [
            "wakeWord": word,
            "command": "",
            "transcript": transcript,
            "postGap": 0.0,
            "confidence": 0.0,
        ]
        self.notifyListeners("wakeWord", data: payload)
        self.beginCapture(initialCommand: "", triggerEndTime: 0)
    }
}
|
|
956
|
+
|
|
957
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
958
|
+
// MARK: - Audio Level Processing
|
|
959
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
960
|
+
|
|
961
|
+
/// Computes the RMS level and absolute peak of the first channel of `buffer`.
///
/// - Returns: `(rms, peak)` as doubles, or `nil` when the buffer has no float channel
///   data or contains zero frames.
private static func calculateRMS(buffer: AVAudioPCMBuffer) -> (rms: Double, peak: Double)? {
    let frameCount = Int(buffer.frameLength)
    guard let firstChannel = buffer.floatChannelData?[0], frameCount > 0 else { return nil }

    let samples = UnsafeMutableBufferPointer(start: firstChannel, count: frameCount)
    var sumOfSquares: Double = 0
    var peak: Float = 0
    for raw in samples {
        let magnitude = abs(raw)
        // Accumulate in Double, as the squares of many samples are summed.
        sumOfSquares += Double(magnitude) * Double(magnitude)
        peak = magnitude > peak ? magnitude : peak
    }

    let rms = sqrt(sumOfSquares / Double(frameCount))
    return (rms, Double(peak))
}
|
|
974
|
+
|
|
975
|
+
/// Invoked from the drain timer callback. Reads the pending RMS/peak under
/// `audioLevelLock`, updates the adaptive noise floor, marks speech activity, and emits
/// throttled `audioLevel` events.
private func processAudioLevel() {
    audioLevelLock.lock()
    let (rms, peak) = (_pendingRMS, _pendingPeak)
    audioLevelLock.unlock()

    // A non-positive RMS is ignored entirely.
    if !(rms > 0) { return }

    // Adaptive noise floor: follow quickly when the level drops below the floor,
    // slowly when it rises above it.
    let smoothing: Double = rms < noiseFloorRMS ? 0.08 : 0.01
    noiseFloorRMS = max(1e-7, noiseFloorRMS + (rms - noiseFloorRMS) * smoothing)

    // Levels at or above the adaptive threshold count as speech.
    let threshold = max(minSpeechRMS, noiseFloorRMS * speechBoostFactor)
    if rms >= threshold {
        lastSpeechTime = Date()
    }

    // Emit at most one audioLevel event per `audioLevelEmitInterval`.
    let now = Date()
    if let lastEmit = lastAudioLevelEmitTime,
       now.timeIntervalSince(lastEmit) < audioLevelEmitInterval {
        return
    }
    lastAudioLevelEmitTime = now

    // Level is reported relative to the speech threshold, clamped to 0...1.
    let reference = max(minSpeechRMS, threshold)
    let normalized = min(1.0, max(0.0, rms / reference))
    notifyListeners("audioLevel", data: ["level": normalized, "peak": min(1.0, peak)])
}
|
|
1004
|
+
|
|
1005
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
1006
|
+
// MARK: - Drain Timer
|
|
1007
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
1008
|
+
|
|
1009
|
+
/// (Re)starts the repeating 40 ms timer that drains `bufferQueue` into the current
/// recognition request and then runs `processAudioLevel()`.
private func startDrainTimer() {
    stopDrainTimer()
    let timer = Timer.scheduledTimer(withTimeInterval: 0.04, repeats: true) { [weak self] _ in
        guard let self else { return }
        // The queue is always drained; buffers are discarded when no request exists.
        let pending = self.bufferQueue.drain()
        if let request = self.recognitionRequest {
            pending.forEach { request.append($0) }
        }
        self.processAudioLevel()
    }
    drainTimer = timer
}
|
|
1021
|
+
|
|
1022
|
+
/// Invalidates and releases the drain timer, if one is running.
private func stopDrainTimer() {
    if let timer = drainTimer {
        timer.invalidate()
    }
    drainTimer = nil
}
|
|
1026
|
+
|
|
1027
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
1028
|
+
// MARK: - Audio Session Configuration
|
|
1029
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
1030
|
+
|
|
1031
|
+
/// Configures the shared audio session for `.playAndRecord` in `.measurement` mode and
/// activates it.
///
/// - Throws: Any error raised by `setCategory` or `setActive`.
private func configureAudioSession() throws {
    let categoryOptions: AVAudioSession.CategoryOptions = [
        .duckOthers, .mixWithOthers, .allowBluetooth, .defaultToSpeaker,
    ]
    let audioSession = AVAudioSession.sharedInstance()
    try audioSession.setCategory(.playAndRecord, mode: .measurement, options: categoryOptions)
    try audioSession.setActive(true, options: [])
}
|
|
1038
|
+
|
|
1039
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
1040
|
+
// MARK: - Audio Interruption Handling
|
|
1041
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
1042
|
+
|
|
1043
|
+
/// Handles an audio session interruption notification.
///
/// - `.began`: if not idle, halts the pipeline, clears capture state, goes `.idle`, and
///   emits a recoverable `audio_interrupted` error.
/// - `.ended`: if the options contain `.shouldResume` and a config exists, schedules
///   `autoRestart(config:)` after 0.5 s.
///
/// Notifications without a parsable interruption type are ignored.
private func handleAudioInterruption(_ notification: Notification) {
    guard let info = notification.userInfo,
          let rawType = info[AVAudioSessionInterruptionTypeKey] as? UInt,
          let kind = AVAudioSession.InterruptionType(rawValue: rawType) else { return }

    switch kind {
    case .began:
        if state == .idle { return }
        // Something else took over audio: stop capturing and drop the partial capture.
        haltRecognitionPipeline()
        stopCaptureTimer()
        capturedTranscript = ""
        captureStartTime = nil
        activeTriggerEndTime = nil
        heardBeyondTrigger = false
        state = .idle
        emitError(code: "audio_interrupted",
                  message: "Audio session interrupted by another app", recoverable: true)

    case .ended:
        // A missing options entry is treated as "no options".
        let options = (info[AVAudioSessionInterruptionOptionKey] as? UInt)
            .map { AVAudioSession.InterruptionOptions(rawValue: $0) } ?? []
        if options.contains(.shouldResume), let config {
            DispatchQueue.main.asyncAfter(deadline: .now() + 0.5) { [weak self] in
                self?.autoRestart(config: config)
            }
        }

    @unknown default:
        break
    }
}
|
|
1079
|
+
|
|
1080
|
+
/// Responds to an audio route change by hard-restarting the pipeline so it rebuilds
/// against the current route. Does nothing while idle.
private func handleRouteChange() {
    if state == .idle { return }
    hardRestartRecognition()
}
|
|
1085
|
+
|
|
1086
|
+
/// Responds to a media services reset: stops everything, emits a recoverable
/// `media_reset` error, and, if a config was set, schedules `autoRestart(config:)`
/// after 1 s.
private func handleMediaServicesReset() {
    // Capture the config before tearing down so the restart can reuse it.
    let previousConfig = config
    stopInternal()
    emitError(code: "media_reset", message: "Media services were reset", recoverable: true)

    guard let previousConfig else { return }
    DispatchQueue.main.asyncAfter(deadline: .now() + 1.0) { [weak self] in
        guard let self else { return }
        self.autoRestart(config: previousConfig)
    }
}
|
|
1096
|
+
|
|
1097
|
+
/// Restart from idle after an interruption or media services reset.
///
/// Does nothing unless `state` is `.idle`. On success `state` becomes `.listening`.
/// On failure a non-recoverable `restart_failed` error is emitted, `state` stays
/// `.idle`, and whatever the failed attempt set up is released again.
///
/// - Parameter config: Configuration to listen with; stored in `self.config`.
private func autoRestart(config: PluginConfig) {
    guard state == .idle else { return }
    self.config = config
    speechRecognizer = SFSpeechRecognizer(locale: Locale(identifier: config.locale))
    guard let speechRecognizer, speechRecognizer.isAvailable else {
        emitError(code: "restart_failed", message: "Speech recognizer unavailable",
                  recoverable: false)
        return
    }
    do {
        try configureAudioSession()
        try startRecognitionPipeline()
        state = .listening
    } catch {
        // configureAudioSession() may already have activated the shared session, and a
        // pipeline start that threw part-way may have left resources behind. Release
        // them and deactivate the session; `emitIdle: false` because state is already
        // `.idle` and must not be re-assigned.
        stopInternal(emitIdle: false)
        emitError(code: "restart_failed", message: error.localizedDescription,
                  recoverable: false)
    }
}
|
|
1116
|
+
|
|
1117
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
1118
|
+
// MARK: - Helpers
|
|
1119
|
+
// ════════════════════════════════════════════════════════════════════════
|
|
1120
|
+
|
|
1121
|
+
/// Emits an `error` event to JS listeners.
///
/// - Parameters:
///   - code: Error code string.
///   - message: Error description.
///   - recoverable: Value sent under the `recoverable` key.
private func emitError(code: String, message: String, recoverable: Bool) {
    let payload: [String: Any] = [
        "code": code,
        "message": message,
        "recoverable": recoverable,
    ]
    notifyListeners("error", data: payload)
}
|
|
1126
|
+
|
|
1127
|
+
/// Maps a microphone record permission to the string `"granted"`, `"denied"`, or
/// `"prompt"`. Undetermined and unknown future values map to `"prompt"`.
private func micPermissionString(_ status: AVAudioSession.RecordPermission) -> String {
    let label: String
    switch status {
    case .granted:
        label = "granted"
    case .denied:
        label = "denied"
    case .undetermined:
        label = "prompt"
    @unknown default:
        label = "prompt"
    }
    return label
}
|
|
1135
|
+
|
|
1136
|
+
/// Maps a speech recognition authorization status to the string `"granted"`,
/// `"denied"`, or `"prompt"`. Restricted counts as denied; not-determined and unknown
/// future values map to `"prompt"`.
private func speechPermissionString(_ status: SFSpeechRecognizerAuthorizationStatus) -> String {
    let label: String
    switch status {
    case .authorized:
        label = "granted"
    case .denied:
        label = "denied"
    case .restricted:
        label = "denied"
    case .notDetermined:
        label = "prompt"
    @unknown default:
        label = "prompt"
    }
    return label
}
|
|
1144
|
+
|
|
1145
|
+
/// Errors with human-readable descriptions for audio setup failures.
private enum SwabbleError: LocalizedError {
    /// The audio engine could not be created.
    case audioEngineUnavailable
    /// No audio input is available.
    case noAudioInput

    /// Message surfaced through `localizedDescription`.
    var errorDescription: String? {
        let message: String
        switch self {
        case .audioEngineUnavailable:
            message = "Unable to create audio engine"
        case .noAudioInput:
            message = "No audio input available"
        }
        return message
    }
}
|
|
1156
|
+
}
|