@lumiastream/wakeword 1.1.8 → 1.2.0
This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/lib/voice.js +93 -29
- package/package.json +1 -1
package/lib/voice.js
CHANGED
|
@@ -48,6 +48,7 @@ function unpacked(p) {
|
|
|
48
48
|
}
|
|
49
49
|
|
|
50
50
|
const UNKNOWN_TOKEN = "[unk]";
|
|
51
|
+
const UNKNOWN_TOKEN_NORMALIZED = "unk";
|
|
51
52
|
const normalizePhrase = (phrase = "") => phrase.trim().toLowerCase();
|
|
52
53
|
const toBool = (v = "") =>
|
|
53
54
|
["1", "true", "yes", "y"].includes(`${v}`.trim().toLowerCase());
|
|
@@ -61,6 +62,23 @@ const tokenize = (phrase = "") =>
|
|
|
61
62
|
.filter(Boolean);
|
|
62
63
|
const tokensEqual = (a = [], b = []) =>
|
|
63
64
|
a.length === b.length && a.every((token, idx) => token === b[idx]);
|
|
65
|
+
const trimUnknownBoundaryTokens = (tokens = []) => {
|
|
66
|
+
let start = 0;
|
|
67
|
+
let end = tokens.length;
|
|
68
|
+
while (
|
|
69
|
+
start < end &&
|
|
70
|
+
(tokens[start] === UNKNOWN_TOKEN_NORMALIZED || tokens[start] === UNKNOWN_TOKEN)
|
|
71
|
+
) {
|
|
72
|
+
start += 1;
|
|
73
|
+
}
|
|
74
|
+
while (
|
|
75
|
+
end > start &&
|
|
76
|
+
(tokens[end - 1] === UNKNOWN_TOKEN_NORMALIZED || tokens[end - 1] === UNKNOWN_TOKEN)
|
|
77
|
+
) {
|
|
78
|
+
end -= 1;
|
|
79
|
+
}
|
|
80
|
+
return tokens.slice(start, end);
|
|
81
|
+
};
|
|
64
82
|
const tokensContainSequence = (tokens = [], phraseTokens = []) => {
|
|
65
83
|
if (!phraseTokens.length || tokens.length < phraseTokens.length) return false;
|
|
66
84
|
for (let i = 0; i <= tokens.length - phraseTokens.length; i += 1) {
|
|
@@ -117,7 +135,8 @@ try {
|
|
|
117
135
|
/* 2. Resolve Vosk model */
|
|
118
136
|
/* ------------------------------------------------------------------ */
|
|
119
137
|
const envModelPath = (process.env.LUMIA_VOICE_MODEL_PATH || "").trim();
|
|
120
|
-
let modelPath =
|
|
138
|
+
let modelPath =
|
|
139
|
+
envModelPath || join(here, "..", "models", "vosk-model-small-en-us-0.15");
|
|
121
140
|
modelPath = unpacked(modelPath);
|
|
122
141
|
|
|
123
142
|
if (!existsSync(modelPath))
|
|
@@ -135,9 +154,10 @@ let EXTRA_GRAMMAR = [];
|
|
|
135
154
|
|
|
136
155
|
const model = new Model(modelPath);
|
|
137
156
|
const buildRecognizer = () => {
|
|
138
|
-
const recognizer =
|
|
139
|
-
|
|
140
|
-
|
|
157
|
+
const recognizer =
|
|
158
|
+
MATCH_SENTENCE || DISABLE_GRAMMAR
|
|
159
|
+
? new Recognizer({ model, sampleRate: SAMPLE_RATE })
|
|
160
|
+
: new Recognizer({ model, sampleRate: SAMPLE_RATE, grammar: GRAMMAR });
|
|
141
161
|
recognizer.setWords(true);
|
|
142
162
|
return recognizer;
|
|
143
163
|
};
|
|
@@ -161,7 +181,7 @@ if (audioDevice !== null) {
|
|
|
161
181
|
recArgs.device = "default";
|
|
162
182
|
console.error("Using default Windows audio device: default");
|
|
163
183
|
console.error(
|
|
164
|
-
"To specify a different device, use: AUDIO_DEVICE=<device_id> or pass as 3rd argument"
|
|
184
|
+
"To specify a different device, use: AUDIO_DEVICE=<device_id> or pass as 3rd argument",
|
|
165
185
|
);
|
|
166
186
|
}
|
|
167
187
|
|
|
@@ -176,16 +196,17 @@ mic.on("error", (err) => {
|
|
|
176
196
|
// You might need to adjust this value based on your specific use case.
|
|
177
197
|
let WORD_CONFIDENCE_THRESHOLD = 0.7;
|
|
178
198
|
const DEBUG_AUDIO = ["1", "true", "yes"].includes(
|
|
179
|
-
(process.env.WAKEWORD_DEBUG || "").toLowerCase()
|
|
199
|
+
(process.env.WAKEWORD_DEBUG || "").toLowerCase(),
|
|
180
200
|
);
|
|
181
201
|
const LOG_PARTIAL =
|
|
182
202
|
DEBUG_AUDIO ||
|
|
183
203
|
["1", "true", "yes"].includes(
|
|
184
|
-
(process.env.WAKEWORD_LOG_PARTIAL || "").toLowerCase()
|
|
204
|
+
(process.env.WAKEWORD_LOG_PARTIAL || "").toLowerCase(),
|
|
185
205
|
);
|
|
186
206
|
let LOG_FINAL = ["1", "true", "yes"].includes(
|
|
187
|
-
(process.env.WAKEWORD_LOG_FINAL || "").toLowerCase()
|
|
207
|
+
(process.env.WAKEWORD_LOG_FINAL || "").toLowerCase(),
|
|
188
208
|
);
|
|
209
|
+
let emittedMatchesInUtterance = new Set();
|
|
189
210
|
let lastLevelLog = 0;
|
|
190
211
|
|
|
191
212
|
function logAudioLevel(buf) {
|
|
@@ -231,47 +252,61 @@ mic.on("data", (buf) => {
|
|
|
231
252
|
console.log(
|
|
232
253
|
`Discarding low-confidence word: "${
|
|
233
254
|
wordDetail.word
|
|
234
|
-
}" (Conf: ${wordDetail.conf.toFixed(2)})
|
|
255
|
+
}" (Conf: ${wordDetail.conf.toFixed(2)})`,
|
|
235
256
|
);
|
|
236
257
|
}
|
|
237
258
|
}
|
|
238
259
|
|
|
239
260
|
const finalRecognizedText = recognizedWords.join(" ").trim();
|
|
240
261
|
const averageConfidenceAll =
|
|
241
|
-
totalConfidenceCount > 0
|
|
262
|
+
totalConfidenceCount > 0
|
|
263
|
+
? totalConfidenceAll / totalConfidenceCount
|
|
264
|
+
: 0;
|
|
242
265
|
const averageConfidence =
|
|
243
266
|
recognizedWords.length > 0
|
|
244
267
|
? totalConfidence / recognizedWords.length
|
|
245
268
|
: averageConfidenceAll;
|
|
246
269
|
|
|
247
|
-
handle(finalRecognizedText, averageConfidence, fullResult.text
|
|
270
|
+
handle(finalRecognizedText, averageConfidence, fullResult.text, {
|
|
271
|
+
isPartial: false,
|
|
272
|
+
}); // Pass both the filtered text and an average confidence
|
|
248
273
|
} else if (fullResult && fullResult.text) {
|
|
249
|
-
// Fallback for cases where setWords(true) might not fully apply
|
|
250
|
-
handle(fullResult.text.trim(), 1.0, fullResult.text
|
|
274
|
+
// Fallback for cases where setWords(true) might not fully apply
|
|
275
|
+
handle(fullResult.text.trim(), 1.0, fullResult.text, {
|
|
276
|
+
isPartial: false,
|
|
277
|
+
}); // Assume high confidence if no word-level details
|
|
251
278
|
}
|
|
252
|
-
} else
|
|
279
|
+
} else {
|
|
253
280
|
const partial = rec.partialResult();
|
|
254
|
-
if (partial?.partial) {
|
|
281
|
+
if (partial?.partial && LOG_PARTIAL) {
|
|
255
282
|
console.error(`[wakeword] partial: "${partial.partial}"`);
|
|
256
283
|
}
|
|
284
|
+
if (partial?.partial && !MATCH_SENTENCE) {
|
|
285
|
+
handle(partial.partial.trim(), 1.0, partial.partial, { isPartial: true });
|
|
286
|
+
}
|
|
257
287
|
}
|
|
258
288
|
});
|
|
259
289
|
|
|
260
|
-
function handle(processedWord, averageConfidence, originalText) {
|
|
290
|
+
function handle(processedWord, averageConfidence, originalText, options = {}) {
|
|
291
|
+
const { isPartial = false } = options;
|
|
261
292
|
if (!processedWord && !originalText) return;
|
|
262
293
|
|
|
263
294
|
const finalSentence =
|
|
264
295
|
typeof originalText === "string" && originalText.trim()
|
|
265
296
|
? originalText.trim()
|
|
266
297
|
: (processedWord ?? "").toString().trim();
|
|
267
|
-
if (LOG_FINAL && finalSentence) {
|
|
298
|
+
if (!isPartial && LOG_FINAL && finalSentence) {
|
|
268
299
|
process.stdout?.write(`final|${finalSentence}\n`);
|
|
269
300
|
}
|
|
270
301
|
|
|
271
302
|
const normalizedProcessed = normalizePhrase(processedWord);
|
|
272
303
|
const normalizedOriginal = normalizePhrase(originalText);
|
|
273
|
-
const processedTokens =
|
|
274
|
-
|
|
304
|
+
const processedTokens = trimUnknownBoundaryTokens(
|
|
305
|
+
tokenize(normalizedProcessed),
|
|
306
|
+
);
|
|
307
|
+
const originalTokens = trimUnknownBoundaryTokens(
|
|
308
|
+
tokenize(normalizedOriginal),
|
|
309
|
+
);
|
|
275
310
|
const matches = new Set();
|
|
276
311
|
const confidentCommands = new Set();
|
|
277
312
|
|
|
@@ -279,18 +314,24 @@ function handle(processedWord, averageConfidence, originalText) {
|
|
|
279
314
|
if (!tokens?.length) return;
|
|
280
315
|
const hits = MATCH_SENTENCE
|
|
281
316
|
? allowedCommands.filter((command) =>
|
|
282
|
-
tokensContainSequence(
|
|
283
|
-
|
|
317
|
+
tokensContainSequence(
|
|
318
|
+
tokens,
|
|
319
|
+
trimUnknownBoundaryTokens(tokenize(command)),
|
|
320
|
+
),
|
|
321
|
+
)
|
|
284
322
|
: allowedCommands.filter((command) =>
|
|
285
|
-
tokensEqual(
|
|
286
|
-
|
|
323
|
+
tokensEqual(
|
|
324
|
+
tokens,
|
|
325
|
+
trimUnknownBoundaryTokens(tokenize(command)),
|
|
326
|
+
),
|
|
327
|
+
);
|
|
287
328
|
hits.forEach((hit) => matches.add(hit));
|
|
288
329
|
};
|
|
289
330
|
|
|
290
331
|
// Only allow sentence matches for commands that were confidently recognized.
|
|
291
332
|
if (normalizedProcessed) {
|
|
292
333
|
COMMANDS.forEach((command) => {
|
|
293
|
-
const commandTokens = tokenize(command);
|
|
334
|
+
const commandTokens = trimUnknownBoundaryTokens(tokenize(command));
|
|
294
335
|
const isMatch = MATCH_SENTENCE
|
|
295
336
|
? tokensContainSequence(processedTokens, commandTokens)
|
|
296
337
|
: tokensEqual(processedTokens, commandTokens);
|
|
@@ -306,19 +347,42 @@ function handle(processedWord, averageConfidence, originalText) {
|
|
|
306
347
|
|
|
307
348
|
// If word-level confidence filtering removed all words, fall back to the
|
|
308
349
|
// original text when overall confidence is still acceptable.
|
|
309
|
-
if (
|
|
350
|
+
if (
|
|
351
|
+
!matches.size &&
|
|
352
|
+
normalizedOriginal &&
|
|
353
|
+
averageConfidence >= WORD_CONFIDENCE_THRESHOLD
|
|
354
|
+
) {
|
|
310
355
|
findMatches(originalTokens);
|
|
311
356
|
}
|
|
312
357
|
|
|
313
|
-
if (!matches.size)
|
|
358
|
+
if (!matches.size) {
|
|
359
|
+
if (!isPartial) {
|
|
360
|
+
emittedMatchesInUtterance.clear();
|
|
361
|
+
}
|
|
362
|
+
return;
|
|
363
|
+
}
|
|
364
|
+
|
|
365
|
+
const uniqueMatches = [...matches].filter(
|
|
366
|
+
(match) => !emittedMatchesInUtterance.has(match),
|
|
367
|
+
);
|
|
368
|
+
if (!uniqueMatches.length) {
|
|
369
|
+
if (!isPartial) {
|
|
370
|
+
emittedMatchesInUtterance.clear();
|
|
371
|
+
}
|
|
372
|
+
return;
|
|
373
|
+
}
|
|
314
374
|
|
|
315
|
-
|
|
375
|
+
uniqueMatches.forEach((match) => {
|
|
316
376
|
if (finalSentence) {
|
|
317
377
|
process.stdout?.write(`sentence|${finalSentence}\n`);
|
|
318
378
|
}
|
|
319
379
|
process.stdout?.write(`voice|${match}\n`);
|
|
320
380
|
process.stdout?.write(`confidence|${averageConfidence}\n`);
|
|
381
|
+
emittedMatchesInUtterance.add(match);
|
|
321
382
|
});
|
|
383
|
+
if (!isPartial) {
|
|
384
|
+
emittedMatchesInUtterance.clear();
|
|
385
|
+
}
|
|
322
386
|
}
|
|
323
387
|
/* ------------------------------------------------------------------ */
|
|
324
388
|
/* 6. Hot-reload grammar via stdin */
|
|
@@ -354,7 +418,7 @@ rl.on("line", (line) => {
|
|
|
354
418
|
EXTRA_GRAMMAR = phrases;
|
|
355
419
|
GRAMMAR = [...COMMANDS, ...EXTRA_GRAMMAR, UNKNOWN_TOKEN];
|
|
356
420
|
console.error(
|
|
357
|
-
`[wakeword] extra grammar updated (${phrases.length}): ${phrases.join(", ")}
|
|
421
|
+
`[wakeword] extra grammar updated (${phrases.length}): ${phrases.join(", ")}`,
|
|
358
422
|
);
|
|
359
423
|
rec = buildRecognizer();
|
|
360
424
|
return;
|
|
@@ -365,7 +429,7 @@ rl.on("line", (line) => {
|
|
|
365
429
|
COMMANDS = phrases;
|
|
366
430
|
GRAMMAR = [...COMMANDS, ...EXTRA_GRAMMAR, UNKNOWN_TOKEN];
|
|
367
431
|
console.error(
|
|
368
|
-
`[wakeword] grammar updated (${phrases.length}): ${phrases.join(", ")}
|
|
432
|
+
`[wakeword] grammar updated (${phrases.length}): ${phrases.join(", ")}`,
|
|
369
433
|
);
|
|
370
434
|
rec = buildRecognizer();
|
|
371
435
|
});
|