@lumiastream/wakeword 1.1.8 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/lib/voice.js +93 -29
  2. package/package.json +1 -1
package/lib/voice.js CHANGED
@@ -48,6 +48,7 @@ function unpacked(p) {
48
48
  }
49
49
 
50
50
  const UNKNOWN_TOKEN = "[unk]";
51
+ const UNKNOWN_TOKEN_NORMALIZED = "unk";
51
52
  const normalizePhrase = (phrase = "") => phrase.trim().toLowerCase();
52
53
  const toBool = (v = "") =>
53
54
  ["1", "true", "yes", "y"].includes(`${v}`.trim().toLowerCase());
@@ -61,6 +62,23 @@ const tokenize = (phrase = "") =>
61
62
  .filter(Boolean);
62
63
  const tokensEqual = (a = [], b = []) =>
63
64
  a.length === b.length && a.every((token, idx) => token === b[idx]);
65
+ const trimUnknownBoundaryTokens = (tokens = []) => {
66
+ let start = 0;
67
+ let end = tokens.length;
68
+ while (
69
+ start < end &&
70
+ (tokens[start] === UNKNOWN_TOKEN_NORMALIZED || tokens[start] === UNKNOWN_TOKEN)
71
+ ) {
72
+ start += 1;
73
+ }
74
+ while (
75
+ end > start &&
76
+ (tokens[end - 1] === UNKNOWN_TOKEN_NORMALIZED || tokens[end - 1] === UNKNOWN_TOKEN)
77
+ ) {
78
+ end -= 1;
79
+ }
80
+ return tokens.slice(start, end);
81
+ };
64
82
  const tokensContainSequence = (tokens = [], phraseTokens = []) => {
65
83
  if (!phraseTokens.length || tokens.length < phraseTokens.length) return false;
66
84
  for (let i = 0; i <= tokens.length - phraseTokens.length; i += 1) {
@@ -117,7 +135,8 @@ try {
117
135
  /* 2. Resolve Vosk model */
118
136
  /* ------------------------------------------------------------------ */
119
137
  const envModelPath = (process.env.LUMIA_VOICE_MODEL_PATH || "").trim();
120
- let modelPath = envModelPath || join(here, "..", "models", "vosk-model-small-en-us-0.15");
138
+ let modelPath =
139
+ envModelPath || join(here, "..", "models", "vosk-model-small-en-us-0.15");
121
140
  modelPath = unpacked(modelPath);
122
141
 
123
142
  if (!existsSync(modelPath))
@@ -135,9 +154,10 @@ let EXTRA_GRAMMAR = [];
135
154
 
136
155
  const model = new Model(modelPath);
137
156
  const buildRecognizer = () => {
138
- const recognizer = MATCH_SENTENCE || DISABLE_GRAMMAR
139
- ? new Recognizer({ model, sampleRate: SAMPLE_RATE })
140
- : new Recognizer({ model, sampleRate: SAMPLE_RATE, grammar: GRAMMAR });
157
+ const recognizer =
158
+ MATCH_SENTENCE || DISABLE_GRAMMAR
159
+ ? new Recognizer({ model, sampleRate: SAMPLE_RATE })
160
+ : new Recognizer({ model, sampleRate: SAMPLE_RATE, grammar: GRAMMAR });
141
161
  recognizer.setWords(true);
142
162
  return recognizer;
143
163
  };
@@ -161,7 +181,7 @@ if (audioDevice !== null) {
161
181
  recArgs.device = "default";
162
182
  console.error("Using default Windows audio device: default");
163
183
  console.error(
164
- "To specify a different device, use: AUDIO_DEVICE=<device_id> or pass as 3rd argument"
184
+ "To specify a different device, use: AUDIO_DEVICE=<device_id> or pass as 3rd argument",
165
185
  );
166
186
  }
167
187
 
@@ -176,16 +196,17 @@ mic.on("error", (err) => {
176
196
  // You might need to adjust this value based on your specific use case.
177
197
  let WORD_CONFIDENCE_THRESHOLD = 0.7;
178
198
  const DEBUG_AUDIO = ["1", "true", "yes"].includes(
179
- (process.env.WAKEWORD_DEBUG || "").toLowerCase()
199
+ (process.env.WAKEWORD_DEBUG || "").toLowerCase(),
180
200
  );
181
201
  const LOG_PARTIAL =
182
202
  DEBUG_AUDIO ||
183
203
  ["1", "true", "yes"].includes(
184
- (process.env.WAKEWORD_LOG_PARTIAL || "").toLowerCase()
204
+ (process.env.WAKEWORD_LOG_PARTIAL || "").toLowerCase(),
185
205
  );
186
206
  let LOG_FINAL = ["1", "true", "yes"].includes(
187
- (process.env.WAKEWORD_LOG_FINAL || "").toLowerCase()
207
+ (process.env.WAKEWORD_LOG_FINAL || "").toLowerCase(),
188
208
  );
209
+ let emittedMatchesInUtterance = new Set();
189
210
  let lastLevelLog = 0;
190
211
 
191
212
  function logAudioLevel(buf) {
@@ -231,47 +252,61 @@ mic.on("data", (buf) => {
231
252
  console.log(
232
253
  `Discarding low-confidence word: "${
233
254
  wordDetail.word
234
- }" (Conf: ${wordDetail.conf.toFixed(2)})`
255
+ }" (Conf: ${wordDetail.conf.toFixed(2)})`,
235
256
  );
236
257
  }
237
258
  }
238
259
 
239
260
  const finalRecognizedText = recognizedWords.join(" ").trim();
240
261
  const averageConfidenceAll =
241
- totalConfidenceCount > 0 ? totalConfidenceAll / totalConfidenceCount : 0;
262
+ totalConfidenceCount > 0
263
+ ? totalConfidenceAll / totalConfidenceCount
264
+ : 0;
242
265
  const averageConfidence =
243
266
  recognizedWords.length > 0
244
267
  ? totalConfidence / recognizedWords.length
245
268
  : averageConfidenceAll;
246
269
 
247
- handle(finalRecognizedText, averageConfidence, fullResult.text); // Pass both the filtered text and an average confidence
270
+ handle(finalRecognizedText, averageConfidence, fullResult.text, {
271
+ isPartial: false,
272
+ }); // Pass both the filtered text and an average confidence
248
273
  } else if (fullResult && fullResult.text) {
249
- // Fallback for cases where setWords(true) might not fully apply or for partial results
250
- handle(fullResult.text.trim(), 1.0, fullResult.text); // Assume high confidence if no word-level details
274
+ // Fallback for cases where setWords(true) might not fully apply
275
+ handle(fullResult.text.trim(), 1.0, fullResult.text, {
276
+ isPartial: false,
277
+ }); // Assume high confidence if no word-level details
251
278
  }
252
- } else if (LOG_PARTIAL) {
279
+ } else {
253
280
  const partial = rec.partialResult();
254
- if (partial?.partial) {
281
+ if (partial?.partial && LOG_PARTIAL) {
255
282
  console.error(`[wakeword] partial: "${partial.partial}"`);
256
283
  }
284
+ if (partial?.partial && !MATCH_SENTENCE) {
285
+ handle(partial.partial.trim(), 1.0, partial.partial, { isPartial: true });
286
+ }
257
287
  }
258
288
  });
259
289
 
260
- function handle(processedWord, averageConfidence, originalText) {
290
+ function handle(processedWord, averageConfidence, originalText, options = {}) {
291
+ const { isPartial = false } = options;
261
292
  if (!processedWord && !originalText) return;
262
293
 
263
294
  const finalSentence =
264
295
  typeof originalText === "string" && originalText.trim()
265
296
  ? originalText.trim()
266
297
  : (processedWord ?? "").toString().trim();
267
- if (LOG_FINAL && finalSentence) {
298
+ if (!isPartial && LOG_FINAL && finalSentence) {
268
299
  process.stdout?.write(`final|${finalSentence}\n`);
269
300
  }
270
301
 
271
302
  const normalizedProcessed = normalizePhrase(processedWord);
272
303
  const normalizedOriginal = normalizePhrase(originalText);
273
- const processedTokens = tokenize(normalizedProcessed);
274
- const originalTokens = tokenize(normalizedOriginal);
304
+ const processedTokens = trimUnknownBoundaryTokens(
305
+ tokenize(normalizedProcessed),
306
+ );
307
+ const originalTokens = trimUnknownBoundaryTokens(
308
+ tokenize(normalizedOriginal),
309
+ );
275
310
  const matches = new Set();
276
311
  const confidentCommands = new Set();
277
312
 
@@ -279,18 +314,24 @@ function handle(processedWord, averageConfidence, originalText) {
279
314
  if (!tokens?.length) return;
280
315
  const hits = MATCH_SENTENCE
281
316
  ? allowedCommands.filter((command) =>
282
- tokensContainSequence(tokens, tokenize(command))
283
- )
317
+ tokensContainSequence(
318
+ tokens,
319
+ trimUnknownBoundaryTokens(tokenize(command)),
320
+ ),
321
+ )
284
322
  : allowedCommands.filter((command) =>
285
- tokensEqual(tokens, tokenize(command))
286
- );
323
+ tokensEqual(
324
+ tokens,
325
+ trimUnknownBoundaryTokens(tokenize(command)),
326
+ ),
327
+ );
287
328
  hits.forEach((hit) => matches.add(hit));
288
329
  };
289
330
 
290
331
  // Only allow sentence matches for commands that were confidently recognized.
291
332
  if (normalizedProcessed) {
292
333
  COMMANDS.forEach((command) => {
293
- const commandTokens = tokenize(command);
334
+ const commandTokens = trimUnknownBoundaryTokens(tokenize(command));
294
335
  const isMatch = MATCH_SENTENCE
295
336
  ? tokensContainSequence(processedTokens, commandTokens)
296
337
  : tokensEqual(processedTokens, commandTokens);
@@ -306,19 +347,42 @@ function handle(processedWord, averageConfidence, originalText) {
306
347
 
307
348
  // If word-level confidence filtering removed all words, fall back to the
308
349
  // original text when overall confidence is still acceptable.
309
- if (!matches.size && normalizedOriginal && averageConfidence >= WORD_CONFIDENCE_THRESHOLD) {
350
+ if (
351
+ !matches.size &&
352
+ normalizedOriginal &&
353
+ averageConfidence >= WORD_CONFIDENCE_THRESHOLD
354
+ ) {
310
355
  findMatches(originalTokens);
311
356
  }
312
357
 
313
- if (!matches.size) return;
358
+ if (!matches.size) {
359
+ if (!isPartial) {
360
+ emittedMatchesInUtterance.clear();
361
+ }
362
+ return;
363
+ }
364
+
365
+ const uniqueMatches = [...matches].filter(
366
+ (match) => !emittedMatchesInUtterance.has(match),
367
+ );
368
+ if (!uniqueMatches.length) {
369
+ if (!isPartial) {
370
+ emittedMatchesInUtterance.clear();
371
+ }
372
+ return;
373
+ }
314
374
 
315
- matches.forEach((match) => {
375
+ uniqueMatches.forEach((match) => {
316
376
  if (finalSentence) {
317
377
  process.stdout?.write(`sentence|${finalSentence}\n`);
318
378
  }
319
379
  process.stdout?.write(`voice|${match}\n`);
320
380
  process.stdout?.write(`confidence|${averageConfidence}\n`);
381
+ emittedMatchesInUtterance.add(match);
321
382
  });
383
+ if (!isPartial) {
384
+ emittedMatchesInUtterance.clear();
385
+ }
322
386
  }
323
387
  /* ------------------------------------------------------------------ */
324
388
  /* 6. Hot-reload grammar via stdin */
@@ -354,7 +418,7 @@ rl.on("line", (line) => {
354
418
  EXTRA_GRAMMAR = phrases;
355
419
  GRAMMAR = [...COMMANDS, ...EXTRA_GRAMMAR, UNKNOWN_TOKEN];
356
420
  console.error(
357
- `[wakeword] extra grammar updated (${phrases.length}): ${phrases.join(", ")}`
421
+ `[wakeword] extra grammar updated (${phrases.length}): ${phrases.join(", ")}`,
358
422
  );
359
423
  rec = buildRecognizer();
360
424
  return;
@@ -365,7 +429,7 @@ rl.on("line", (line) => {
365
429
  COMMANDS = phrases;
366
430
  GRAMMAR = [...COMMANDS, ...EXTRA_GRAMMAR, UNKNOWN_TOKEN];
367
431
  console.error(
368
- `[wakeword] grammar updated (${phrases.length}): ${phrases.join(", ")}`
432
+ `[wakeword] grammar updated (${phrases.length}): ${phrases.join(", ")}`,
369
433
  );
370
434
  rec = buildRecognizer();
371
435
  });
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@lumiastream/wakeword",
3
- "version": "1.1.8",
3
+ "version": "1.2.0",
4
4
  "type": "module",
5
5
  "main": "lib/index.js",
6
6
  "files": [