@lumiastream/wakeword 1.1.7 → 1.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/lib/voice.js +110 -31
  2. package/package.json +1 -1
package/lib/voice.js CHANGED
@@ -51,6 +51,30 @@ const UNKNOWN_TOKEN = "[unk]";
51
51
  const normalizePhrase = (phrase = "") => phrase.trim().toLowerCase();
52
52
  const toBool = (v = "") =>
53
53
  ["1", "true", "yes", "y"].includes(`${v}`.trim().toLowerCase());
54
+ const tokenize = (phrase = "") =>
55
+ (phrase ?? "")
56
+ .toString()
57
+ .toLowerCase()
58
+ .replace(/[^a-z0-9]+/gi, " ")
59
+ .trim()
60
+ .split(/\s+/)
61
+ .filter(Boolean);
62
+ const tokensEqual = (a = [], b = []) =>
63
+ a.length === b.length && a.every((token, idx) => token === b[idx]);
64
+ const tokensContainSequence = (tokens = [], phraseTokens = []) => {
65
+ if (!phraseTokens.length || tokens.length < phraseTokens.length) return false;
66
+ for (let i = 0; i <= tokens.length - phraseTokens.length; i += 1) {
67
+ let matches = true;
68
+ for (let j = 0; j < phraseTokens.length; j += 1) {
69
+ if (tokens[i + j] !== phraseTokens[j]) {
70
+ matches = false;
71
+ break;
72
+ }
73
+ }
74
+ if (matches) return true;
75
+ }
76
+ return false;
77
+ };
54
78
 
55
79
  /* ------------------------------------------------------------------ */
56
80
  /* 1. Resolve SoX binary and audio device */
@@ -93,7 +117,8 @@ try {
93
117
  /* 2. Resolve Vosk model */
94
118
  /* ------------------------------------------------------------------ */
95
119
  const envModelPath = (process.env.LUMIA_VOICE_MODEL_PATH || "").trim();
96
- let modelPath = envModelPath || join(here, "..", "models", "vosk-model-small-en-us-0.15");
120
+ let modelPath =
121
+ envModelPath || join(here, "..", "models", "vosk-model-small-en-us-0.15");
97
122
  modelPath = unpacked(modelPath);
98
123
 
99
124
  if (!existsSync(modelPath))
@@ -107,12 +132,14 @@ setLogLevel(0);
107
132
  const SAMPLE_RATE = Number(process.env.SAMPLE_RATE || 16_000);
108
133
  let GRAMMAR = [UNKNOWN_TOKEN]; // seed; always keep [unk]
109
134
  let COMMANDS = [];
135
+ let EXTRA_GRAMMAR = [];
110
136
 
111
137
  const model = new Model(modelPath);
112
138
  const buildRecognizer = () => {
113
- const recognizer = MATCH_SENTENCE || DISABLE_GRAMMAR
114
- ? new Recognizer({ model, sampleRate: SAMPLE_RATE })
115
- : new Recognizer({ model, sampleRate: SAMPLE_RATE, grammar: GRAMMAR });
139
+ const recognizer =
140
+ MATCH_SENTENCE || DISABLE_GRAMMAR
141
+ ? new Recognizer({ model, sampleRate: SAMPLE_RATE })
142
+ : new Recognizer({ model, sampleRate: SAMPLE_RATE, grammar: GRAMMAR });
116
143
  recognizer.setWords(true);
117
144
  return recognizer;
118
145
  };
@@ -136,7 +163,7 @@ if (audioDevice !== null) {
136
163
  recArgs.device = "default";
137
164
  console.error("Using default Windows audio device: default");
138
165
  console.error(
139
- "To specify a different device, use: AUDIO_DEVICE=<device_id> or pass as 3rd argument"
166
+ "To specify a different device, use: AUDIO_DEVICE=<device_id> or pass as 3rd argument",
140
167
  );
141
168
  }
142
169
 
@@ -151,16 +178,17 @@ mic.on("error", (err) => {
151
178
  // You might need to adjust this value based on your specific use case.
152
179
  let WORD_CONFIDENCE_THRESHOLD = 0.7;
153
180
  const DEBUG_AUDIO = ["1", "true", "yes"].includes(
154
- (process.env.WAKEWORD_DEBUG || "").toLowerCase()
181
+ (process.env.WAKEWORD_DEBUG || "").toLowerCase(),
155
182
  );
156
183
  const LOG_PARTIAL =
157
184
  DEBUG_AUDIO ||
158
185
  ["1", "true", "yes"].includes(
159
- (process.env.WAKEWORD_LOG_PARTIAL || "").toLowerCase()
186
+ (process.env.WAKEWORD_LOG_PARTIAL || "").toLowerCase(),
160
187
  );
161
188
  let LOG_FINAL = ["1", "true", "yes"].includes(
162
- (process.env.WAKEWORD_LOG_FINAL || "").toLowerCase()
189
+ (process.env.WAKEWORD_LOG_FINAL || "").toLowerCase(),
163
190
  );
191
+ let emittedMatchesInUtterance = new Set();
164
192
  let lastLevelLog = 0;
165
193
 
166
194
  function logAudioLevel(buf) {
@@ -206,62 +234,79 @@ mic.on("data", (buf) => {
206
234
  console.log(
207
235
  `Discarding low-confidence word: "${
208
236
  wordDetail.word
209
- }" (Conf: ${wordDetail.conf.toFixed(2)})`
237
+ }" (Conf: ${wordDetail.conf.toFixed(2)})`,
210
238
  );
211
239
  }
212
240
  }
213
241
 
214
242
  const finalRecognizedText = recognizedWords.join(" ").trim();
215
243
  const averageConfidenceAll =
216
- totalConfidenceCount > 0 ? totalConfidenceAll / totalConfidenceCount : 0;
244
+ totalConfidenceCount > 0
245
+ ? totalConfidenceAll / totalConfidenceCount
246
+ : 0;
217
247
  const averageConfidence =
218
248
  recognizedWords.length > 0
219
249
  ? totalConfidence / recognizedWords.length
220
250
  : averageConfidenceAll;
221
251
 
222
- handle(finalRecognizedText, averageConfidence, fullResult.text); // Pass both the filtered text and an average confidence
252
+ handle(finalRecognizedText, averageConfidence, fullResult.text, {
253
+ isPartial: false,
254
+ }); // Pass both the filtered text and an average confidence
223
255
  } else if (fullResult && fullResult.text) {
224
- // Fallback for cases where setWords(true) might not fully apply or for partial results
225
- handle(fullResult.text.trim(), 1.0, fullResult.text); // Assume high confidence if no word-level details
256
+ // Fallback for cases where setWords(true) might not fully apply
257
+ handle(fullResult.text.trim(), 1.0, fullResult.text, {
258
+ isPartial: false,
259
+ }); // Assume high confidence if no word-level details
226
260
  }
227
- } else if (LOG_PARTIAL) {
261
+ } else {
228
262
  const partial = rec.partialResult();
229
- if (partial?.partial) {
263
+ if (partial?.partial && LOG_PARTIAL) {
230
264
  console.error(`[wakeword] partial: "${partial.partial}"`);
231
265
  }
266
+ if (partial?.partial && !MATCH_SENTENCE) {
267
+ handle(partial.partial.trim(), 1.0, partial.partial, { isPartial: true });
268
+ }
232
269
  }
233
270
  });
234
271
 
235
- function handle(processedWord, averageConfidence, originalText) {
272
+ function handle(processedWord, averageConfidence, originalText, options = {}) {
273
+ const { isPartial = false } = options;
236
274
  if (!processedWord && !originalText) return;
237
275
 
238
276
  const finalSentence =
239
277
  typeof originalText === "string" && originalText.trim()
240
278
  ? originalText.trim()
241
279
  : (processedWord ?? "").toString().trim();
242
- if (LOG_FINAL && finalSentence) {
280
+ if (!isPartial && LOG_FINAL && finalSentence) {
243
281
  process.stdout?.write(`final|${finalSentence}\n`);
244
282
  }
245
283
 
246
284
  const normalizedProcessed = normalizePhrase(processedWord);
247
285
  const normalizedOriginal = normalizePhrase(originalText);
286
+ const processedTokens = tokenize(normalizedProcessed);
287
+ const originalTokens = tokenize(normalizedOriginal);
248
288
  const matches = new Set();
249
289
  const confidentCommands = new Set();
250
290
 
251
- const findMatches = (text, allowedCommands = COMMANDS) => {
252
- if (!text || text.includes(UNKNOWN_TOKEN)) return;
291
+ const findMatches = (tokens, allowedCommands = COMMANDS) => {
292
+ if (!tokens?.length) return;
253
293
  const hits = MATCH_SENTENCE
254
- ? allowedCommands.filter((command) => text.includes(command))
255
- : allowedCommands.filter((command) => text === command);
294
+ ? allowedCommands.filter((command) =>
295
+ tokensContainSequence(tokens, tokenize(command)),
296
+ )
297
+ : allowedCommands.filter((command) =>
298
+ tokensEqual(tokens, tokenize(command)),
299
+ );
256
300
  hits.forEach((hit) => matches.add(hit));
257
301
  };
258
302
 
259
303
  // Only allow sentence matches for commands that were confidently recognized.
260
304
  if (normalizedProcessed) {
261
305
  COMMANDS.forEach((command) => {
306
+ const commandTokens = tokenize(command);
262
307
  const isMatch = MATCH_SENTENCE
263
- ? normalizedProcessed.includes(command)
264
- : normalizedProcessed === command;
308
+ ? tokensContainSequence(processedTokens, commandTokens)
309
+ : tokensEqual(processedTokens, commandTokens);
265
310
  if (isMatch) {
266
311
  confidentCommands.add(command);
267
312
  }
@@ -269,24 +314,47 @@ function handle(processedWord, averageConfidence, originalText) {
269
314
  }
270
315
 
271
316
  // Try the filtered text first, then fall back to the raw sentence only for confident commands.
272
- findMatches(normalizedProcessed);
273
- findMatches(normalizedOriginal, [...confidentCommands]);
317
+ findMatches(processedTokens);
318
+ findMatches(originalTokens, [...confidentCommands]);
274
319
 
275
320
  // If word-level confidence filtering removed all words, fall back to the
276
321
  // original text when overall confidence is still acceptable.
277
- if (!matches.size && normalizedOriginal && averageConfidence >= WORD_CONFIDENCE_THRESHOLD) {
278
- findMatches(normalizedOriginal);
322
+ if (
323
+ !matches.size &&
324
+ normalizedOriginal &&
325
+ averageConfidence >= WORD_CONFIDENCE_THRESHOLD
326
+ ) {
327
+ findMatches(originalTokens);
279
328
  }
280
329
 
281
- if (!matches.size) return;
330
+ if (!matches.size) {
331
+ if (!isPartial) {
332
+ emittedMatchesInUtterance.clear();
333
+ }
334
+ return;
335
+ }
282
336
 
283
- matches.forEach((match) => {
337
+ const uniqueMatches = [...matches].filter(
338
+ (match) => !emittedMatchesInUtterance.has(match),
339
+ );
340
+ if (!uniqueMatches.length) {
341
+ if (!isPartial) {
342
+ emittedMatchesInUtterance.clear();
343
+ }
344
+ return;
345
+ }
346
+
347
+ uniqueMatches.forEach((match) => {
284
348
  if (finalSentence) {
285
349
  process.stdout?.write(`sentence|${finalSentence}\n`);
286
350
  }
287
351
  process.stdout?.write(`voice|${match}\n`);
288
352
  process.stdout?.write(`confidence|${averageConfidence}\n`);
353
+ emittedMatchesInUtterance.add(match);
289
354
  });
355
+ if (!isPartial) {
356
+ emittedMatchesInUtterance.clear();
357
+ }
290
358
  }
291
359
  /* ------------------------------------------------------------------ */
292
360
  /* 6. Hot-reload grammar via stdin */
@@ -297,6 +365,7 @@ rl.on("line", (line) => {
297
365
  const trimmed = line.trim();
298
366
  if (
299
367
  !trimmed.startsWith("update,") &&
368
+ !trimmed.startsWith("extras,") &&
300
369
  !trimmed.startsWith("confidence,") &&
301
370
  !trimmed.startsWith("debug,")
302
371
  )
@@ -317,12 +386,22 @@ rl.on("line", (line) => {
317
386
  .map((s) => normalizePhrase(s))
318
387
  .filter(Boolean);
319
388
 
389
+ if (trimmed.startsWith("extras,")) {
390
+ EXTRA_GRAMMAR = phrases;
391
+ GRAMMAR = [...COMMANDS, ...EXTRA_GRAMMAR, UNKNOWN_TOKEN];
392
+ console.error(
393
+ `[wakeword] extra grammar updated (${phrases.length}): ${phrases.join(", ")}`,
394
+ );
395
+ rec = buildRecognizer();
396
+ return;
397
+ }
398
+
320
399
  if (!phrases.length) return;
321
400
 
322
401
  COMMANDS = phrases;
323
- GRAMMAR = [...phrases, UNKNOWN_TOKEN];
402
+ GRAMMAR = [...COMMANDS, ...EXTRA_GRAMMAR, UNKNOWN_TOKEN];
324
403
  console.error(
325
- `[wakeword] grammar updated (${phrases.length}): ${phrases.join(", ")}`
404
+ `[wakeword] grammar updated (${phrases.length}): ${phrases.join(", ")}`,
326
405
  );
327
406
  rec = buildRecognizer();
328
407
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lumiastream/wakeword",
3
- "version": "1.1.7",
3
+ "version": "1.1.9",
4
4
  "type": "module",
5
5
  "main": "lib/index.js",
6
6
  "files": [