@lumiastream/wakeword 1.1.7 → 1.1.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/voice.js +110 -31
- package/package.json +1 -1
package/lib/voice.js
CHANGED
|
@@ -51,6 +51,30 @@ const UNKNOWN_TOKEN = "[unk]";
|
|
|
51
51
|
const normalizePhrase = (phrase = "") => phrase.trim().toLowerCase();
|
|
52
52
|
const toBool = (v = "") =>
|
|
53
53
|
["1", "true", "yes", "y"].includes(`${v}`.trim().toLowerCase());
|
|
54
|
+
/**
 * Split a phrase into lowercase word tokens used for command matching.
 *
 * Every run of characters that is not a letter, combining mark, or digit is
 * treated as a separator, so punctuation, underscores, and repeated
 * whitespace never produce tokens.
 *
 * Letters and digits are detected with Unicode property escapes instead of
 * `[a-z0-9]`. The ASCII-only class stripped accented and non-Latin
 * characters, so such phrases were truncated or reduced to an empty token
 * list and could never match a command (relevant when a different model is
 * supplied through LUMIA_VOICE_MODEL_PATH).
 *
 * @param {*} [phrase=""] - Phrase to tokenize. `null`/`undefined` yield an
 *   empty list; any other value is stringified first.
 * @returns {string[]} Lowercase tokens in their original order.
 */
const tokenize = (phrase = "") =>
  (phrase ?? "")
    .toString()
    .toLowerCase()
    // \p{M} keeps combining marks attached to their base letter.
    .replace(/[^\p{L}\p{M}\p{N}]+/gu, " ")
    .trim()
    .split(/\s+/)
    // "".split(/\s+/) returns [""]; drop that empty entry.
    .filter(Boolean);
|
|
62
|
+
/**
 * Report whether two token lists hold the same tokens in the same order.
 *
 * @param {string[]} [a=[]] - First token list.
 * @param {string[]} [b=[]] - Second token list.
 * @returns {boolean} `true` when both lists have equal length and every
 *   position compares strictly equal.
 */
const tokensEqual = (a = [], b = []) => {
  if (a.length !== b.length) return false;
  // Equal unless some position disagrees.
  return !a.some((token, idx) => token !== b[idx]);
};
|
|
64
|
+
/**
 * Check whether `phraseTokens` occurs inside `tokens` as a contiguous,
 * in-order run.
 *
 * @param {string[]} [tokens=[]] - Tokens of the recognized text.
 * @param {string[]} [phraseTokens=[]] - Tokens of the phrase to look for.
 * @returns {boolean} `true` when the phrase appears as a consecutive
 *   sub-sequence; `false` for an empty phrase or one longer than `tokens`.
 */
const tokensContainSequence = (tokens = [], phraseTokens = []) => {
  const phraseLength = phraseTokens.length;
  if (!phraseLength || tokens.length < phraseLength) return false;

  const lastStart = tokens.length - phraseLength;
  scan: for (let start = 0; start <= lastStart; start += 1) {
    for (let offset = 0; offset < phraseLength; offset += 1) {
      // First mismatch rules out this window; slide to the next start.
      if (tokens[start + offset] !== phraseTokens[offset]) continue scan;
    }
    return true;
  }
  return false;
};
|
|
54
78
|
|
|
55
79
|
/* ------------------------------------------------------------------ */
|
|
56
80
|
/* 1. Resolve SoX binary and audio device */
|
|
@@ -93,7 +117,8 @@ try {
|
|
|
93
117
|
/* 2. Resolve Vosk model */
|
|
94
118
|
/* ------------------------------------------------------------------ */
|
|
95
119
|
const envModelPath = (process.env.LUMIA_VOICE_MODEL_PATH || "").trim();
|
|
96
|
-
let modelPath =
|
|
120
|
+
let modelPath =
|
|
121
|
+
envModelPath || join(here, "..", "models", "vosk-model-small-en-us-0.15");
|
|
97
122
|
modelPath = unpacked(modelPath);
|
|
98
123
|
|
|
99
124
|
if (!existsSync(modelPath))
|
|
@@ -107,12 +132,14 @@ setLogLevel(0);
|
|
|
107
132
|
const SAMPLE_RATE = Number(process.env.SAMPLE_RATE || 16_000);
|
|
108
133
|
let GRAMMAR = [UNKNOWN_TOKEN]; // seed; always keep [unk]
|
|
109
134
|
let COMMANDS = [];
|
|
135
|
+
let EXTRA_GRAMMAR = [];
|
|
110
136
|
|
|
111
137
|
const model = new Model(modelPath);
|
|
112
138
|
/**
 * Create a recognizer for the loaded model using the current settings.
 *
 * When MATCH_SENTENCE or DISABLE_GRAMMAR is set, the recognizer is built
 * without a grammar; otherwise it is given the current GRAMMAR list.
 * Word-level output is always enabled via `setWords(true)`.
 *
 * @returns {Recognizer} A newly constructed recognizer.
 */
const buildRecognizer = () => {
  const grammarDisabled = MATCH_SENTENCE || DISABLE_GRAMMAR;

  let recognizer;
  if (grammarDisabled) {
    recognizer = new Recognizer({ model, sampleRate: SAMPLE_RATE });
  } else {
    recognizer = new Recognizer({
      model,
      sampleRate: SAMPLE_RATE,
      grammar: GRAMMAR,
    });
  }

  recognizer.setWords(true);
  return recognizer;
};
|
|
@@ -136,7 +163,7 @@ if (audioDevice !== null) {
|
|
|
136
163
|
recArgs.device = "default";
|
|
137
164
|
console.error("Using default Windows audio device: default");
|
|
138
165
|
console.error(
|
|
139
|
-
"To specify a different device, use: AUDIO_DEVICE=<device_id> or pass as 3rd argument"
|
|
166
|
+
"To specify a different device, use: AUDIO_DEVICE=<device_id> or pass as 3rd argument",
|
|
140
167
|
);
|
|
141
168
|
}
|
|
142
169
|
|
|
@@ -151,16 +178,17 @@ mic.on("error", (err) => {
|
|
|
151
178
|
// You might need to adjust this value based on your specific use case.
|
|
152
179
|
let WORD_CONFIDENCE_THRESHOLD = 0.7;
|
|
153
180
|
const DEBUG_AUDIO = ["1", "true", "yes"].includes(
|
|
154
|
-
(process.env.WAKEWORD_DEBUG || "").toLowerCase()
|
|
181
|
+
(process.env.WAKEWORD_DEBUG || "").toLowerCase(),
|
|
155
182
|
);
|
|
156
183
|
const LOG_PARTIAL =
|
|
157
184
|
DEBUG_AUDIO ||
|
|
158
185
|
["1", "true", "yes"].includes(
|
|
159
|
-
(process.env.WAKEWORD_LOG_PARTIAL || "").toLowerCase()
|
|
186
|
+
(process.env.WAKEWORD_LOG_PARTIAL || "").toLowerCase(),
|
|
160
187
|
);
|
|
161
188
|
let LOG_FINAL = ["1", "true", "yes"].includes(
|
|
162
|
-
(process.env.WAKEWORD_LOG_FINAL || "").toLowerCase()
|
|
189
|
+
(process.env.WAKEWORD_LOG_FINAL || "").toLowerCase(),
|
|
163
190
|
);
|
|
191
|
+
let emittedMatchesInUtterance = new Set();
|
|
164
192
|
let lastLevelLog = 0;
|
|
165
193
|
|
|
166
194
|
function logAudioLevel(buf) {
|
|
@@ -206,62 +234,79 @@ mic.on("data", (buf) => {
|
|
|
206
234
|
console.log(
|
|
207
235
|
`Discarding low-confidence word: "${
|
|
208
236
|
wordDetail.word
|
|
209
|
-
}" (Conf: ${wordDetail.conf.toFixed(2)})
|
|
237
|
+
}" (Conf: ${wordDetail.conf.toFixed(2)})`,
|
|
210
238
|
);
|
|
211
239
|
}
|
|
212
240
|
}
|
|
213
241
|
|
|
214
242
|
const finalRecognizedText = recognizedWords.join(" ").trim();
|
|
215
243
|
const averageConfidenceAll =
|
|
216
|
-
totalConfidenceCount > 0
|
|
244
|
+
totalConfidenceCount > 0
|
|
245
|
+
? totalConfidenceAll / totalConfidenceCount
|
|
246
|
+
: 0;
|
|
217
247
|
const averageConfidence =
|
|
218
248
|
recognizedWords.length > 0
|
|
219
249
|
? totalConfidence / recognizedWords.length
|
|
220
250
|
: averageConfidenceAll;
|
|
221
251
|
|
|
222
|
-
handle(finalRecognizedText, averageConfidence, fullResult.text
|
|
252
|
+
handle(finalRecognizedText, averageConfidence, fullResult.text, {
|
|
253
|
+
isPartial: false,
|
|
254
|
+
}); // Pass both the filtered text and an average confidence
|
|
223
255
|
} else if (fullResult && fullResult.text) {
|
|
224
|
-
// Fallback for cases where setWords(true) might not fully apply
|
|
225
|
-
handle(fullResult.text.trim(), 1.0, fullResult.text
|
|
256
|
+
// Fallback for cases where setWords(true) might not fully apply
|
|
257
|
+
handle(fullResult.text.trim(), 1.0, fullResult.text, {
|
|
258
|
+
isPartial: false,
|
|
259
|
+
}); // Assume high confidence if no word-level details
|
|
226
260
|
}
|
|
227
|
-
} else
|
|
261
|
+
} else {
|
|
228
262
|
const partial = rec.partialResult();
|
|
229
|
-
if (partial?.partial) {
|
|
263
|
+
if (partial?.partial && LOG_PARTIAL) {
|
|
230
264
|
console.error(`[wakeword] partial: "${partial.partial}"`);
|
|
231
265
|
}
|
|
266
|
+
if (partial?.partial && !MATCH_SENTENCE) {
|
|
267
|
+
handle(partial.partial.trim(), 1.0, partial.partial, { isPartial: true });
|
|
268
|
+
}
|
|
232
269
|
}
|
|
233
270
|
});
|
|
234
271
|
|
|
235
|
-
function handle(processedWord, averageConfidence, originalText) {
|
|
272
|
+
function handle(processedWord, averageConfidence, originalText, options = {}) {
|
|
273
|
+
const { isPartial = false } = options;
|
|
236
274
|
if (!processedWord && !originalText) return;
|
|
237
275
|
|
|
238
276
|
const finalSentence =
|
|
239
277
|
typeof originalText === "string" && originalText.trim()
|
|
240
278
|
? originalText.trim()
|
|
241
279
|
: (processedWord ?? "").toString().trim();
|
|
242
|
-
if (LOG_FINAL && finalSentence) {
|
|
280
|
+
if (!isPartial && LOG_FINAL && finalSentence) {
|
|
243
281
|
process.stdout?.write(`final|${finalSentence}\n`);
|
|
244
282
|
}
|
|
245
283
|
|
|
246
284
|
const normalizedProcessed = normalizePhrase(processedWord);
|
|
247
285
|
const normalizedOriginal = normalizePhrase(originalText);
|
|
286
|
+
const processedTokens = tokenize(normalizedProcessed);
|
|
287
|
+
const originalTokens = tokenize(normalizedOriginal);
|
|
248
288
|
const matches = new Set();
|
|
249
289
|
const confidentCommands = new Set();
|
|
250
290
|
|
|
251
|
-
const findMatches = (
|
|
252
|
-
if (!
|
|
291
|
+
const findMatches = (tokens, allowedCommands = COMMANDS) => {
|
|
292
|
+
if (!tokens?.length) return;
|
|
253
293
|
const hits = MATCH_SENTENCE
|
|
254
|
-
? allowedCommands.filter((command) =>
|
|
255
|
-
|
|
294
|
+
? allowedCommands.filter((command) =>
|
|
295
|
+
tokensContainSequence(tokens, tokenize(command)),
|
|
296
|
+
)
|
|
297
|
+
: allowedCommands.filter((command) =>
|
|
298
|
+
tokensEqual(tokens, tokenize(command)),
|
|
299
|
+
);
|
|
256
300
|
hits.forEach((hit) => matches.add(hit));
|
|
257
301
|
};
|
|
258
302
|
|
|
259
303
|
// Only allow sentence matches for commands that were confidently recognized.
|
|
260
304
|
if (normalizedProcessed) {
|
|
261
305
|
COMMANDS.forEach((command) => {
|
|
306
|
+
const commandTokens = tokenize(command);
|
|
262
307
|
const isMatch = MATCH_SENTENCE
|
|
263
|
-
?
|
|
264
|
-
:
|
|
308
|
+
? tokensContainSequence(processedTokens, commandTokens)
|
|
309
|
+
: tokensEqual(processedTokens, commandTokens);
|
|
265
310
|
if (isMatch) {
|
|
266
311
|
confidentCommands.add(command);
|
|
267
312
|
}
|
|
@@ -269,24 +314,47 @@ function handle(processedWord, averageConfidence, originalText) {
|
|
|
269
314
|
}
|
|
270
315
|
|
|
271
316
|
// Try the filtered text first, then fall back to the raw sentence only for confident commands.
|
|
272
|
-
findMatches(
|
|
273
|
-
findMatches(
|
|
317
|
+
findMatches(processedTokens);
|
|
318
|
+
findMatches(originalTokens, [...confidentCommands]);
|
|
274
319
|
|
|
275
320
|
// If word-level confidence filtering removed all words, fall back to the
|
|
276
321
|
// original text when overall confidence is still acceptable.
|
|
277
|
-
if (
|
|
278
|
-
|
|
322
|
+
if (
|
|
323
|
+
!matches.size &&
|
|
324
|
+
normalizedOriginal &&
|
|
325
|
+
averageConfidence >= WORD_CONFIDENCE_THRESHOLD
|
|
326
|
+
) {
|
|
327
|
+
findMatches(originalTokens);
|
|
279
328
|
}
|
|
280
329
|
|
|
281
|
-
if (!matches.size)
|
|
330
|
+
if (!matches.size) {
|
|
331
|
+
if (!isPartial) {
|
|
332
|
+
emittedMatchesInUtterance.clear();
|
|
333
|
+
}
|
|
334
|
+
return;
|
|
335
|
+
}
|
|
282
336
|
|
|
283
|
-
matches.
|
|
337
|
+
const uniqueMatches = [...matches].filter(
|
|
338
|
+
(match) => !emittedMatchesInUtterance.has(match),
|
|
339
|
+
);
|
|
340
|
+
if (!uniqueMatches.length) {
|
|
341
|
+
if (!isPartial) {
|
|
342
|
+
emittedMatchesInUtterance.clear();
|
|
343
|
+
}
|
|
344
|
+
return;
|
|
345
|
+
}
|
|
346
|
+
|
|
347
|
+
uniqueMatches.forEach((match) => {
|
|
284
348
|
if (finalSentence) {
|
|
285
349
|
process.stdout?.write(`sentence|${finalSentence}\n`);
|
|
286
350
|
}
|
|
287
351
|
process.stdout?.write(`voice|${match}\n`);
|
|
288
352
|
process.stdout?.write(`confidence|${averageConfidence}\n`);
|
|
353
|
+
emittedMatchesInUtterance.add(match);
|
|
289
354
|
});
|
|
355
|
+
if (!isPartial) {
|
|
356
|
+
emittedMatchesInUtterance.clear();
|
|
357
|
+
}
|
|
290
358
|
}
|
|
291
359
|
/* ------------------------------------------------------------------ */
|
|
292
360
|
/* 6. Hot-reload grammar via stdin */
|
|
@@ -297,6 +365,7 @@ rl.on("line", (line) => {
|
|
|
297
365
|
const trimmed = line.trim();
|
|
298
366
|
if (
|
|
299
367
|
!trimmed.startsWith("update,") &&
|
|
368
|
+
!trimmed.startsWith("extras,") &&
|
|
300
369
|
!trimmed.startsWith("confidence,") &&
|
|
301
370
|
!trimmed.startsWith("debug,")
|
|
302
371
|
)
|
|
@@ -317,12 +386,22 @@ rl.on("line", (line) => {
|
|
|
317
386
|
.map((s) => normalizePhrase(s))
|
|
318
387
|
.filter(Boolean);
|
|
319
388
|
|
|
389
|
+
if (trimmed.startsWith("extras,")) {
|
|
390
|
+
EXTRA_GRAMMAR = phrases;
|
|
391
|
+
GRAMMAR = [...COMMANDS, ...EXTRA_GRAMMAR, UNKNOWN_TOKEN];
|
|
392
|
+
console.error(
|
|
393
|
+
`[wakeword] extra grammar updated (${phrases.length}): ${phrases.join(", ")}`,
|
|
394
|
+
);
|
|
395
|
+
rec = buildRecognizer();
|
|
396
|
+
return;
|
|
397
|
+
}
|
|
398
|
+
|
|
320
399
|
if (!phrases.length) return;
|
|
321
400
|
|
|
322
401
|
COMMANDS = phrases;
|
|
323
|
-
GRAMMAR = [...
|
|
402
|
+
GRAMMAR = [...COMMANDS, ...EXTRA_GRAMMAR, UNKNOWN_TOKEN];
|
|
324
403
|
console.error(
|
|
325
|
-
`[wakeword] grammar updated (${phrases.length}): ${phrases.join(", ")}
|
|
404
|
+
`[wakeword] grammar updated (${phrases.length}): ${phrases.join(", ")}`,
|
|
326
405
|
);
|
|
327
406
|
rec = buildRecognizer();
|
|
328
407
|
});
|