npm - @lumiastream/wakeword - Versions diffs - 1.1.8 → 1.2.0 - Mend

@lumiastream/wakeword 1.1.8 → 1.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (2)

package/lib/voice.js +93 -29
package/package.json +1 -1

package/lib/voice.js (changed)

@@ -48,6 +48,7 @@ function unpacked(p) {
 }
 const UNKNOWN_TOKEN = "[unk]";
+const UNKNOWN_TOKEN_NORMALIZED = "unk";
 const normalizePhrase = (phrase = "") => phrase.trim().toLowerCase();
 const toBool = (v = "") =>
 	["1", "true", "yes", "y"].includes(`${v}`.trim().toLowerCase());
@@ -61,6 +62,23 @@ const tokenize = (phrase = "") =>
 		.filter(Boolean);
 const tokensEqual = (a = [], b = []) =>
 	a.length === b.length && a.every((token, idx) => token === b[idx]);
+const trimUnknownBoundaryTokens = (tokens = []) => {
+	let start = 0;
+	let end = tokens.length;
+	while (
+		start < end &&
+		(tokens[start] === UNKNOWN_TOKEN_NORMALIZED || tokens[start] === UNKNOWN_TOKEN)
+	) {
+		start += 1;
+	}
+	while (
+		end > start &&
+		(tokens[end - 1] === UNKNOWN_TOKEN_NORMALIZED || tokens[end - 1] === UNKNOWN_TOKEN)
+	) {
+		end -= 1;
+	}
+	return tokens.slice(start, end);
+};
 const tokensContainSequence = (tokens = [], phraseTokens = []) => {
 	if (!phraseTokens.length || tokens.length < phraseTokens.length) return false;
 	for (let i = 0; i <= tokens.length - phraseTokens.length; i += 1) {
@@ -117,7 +135,8 @@ try {
 /* 2. Resolve Vosk model                                              */
 /* ------------------------------------------------------------------ */
 const envModelPath = (process.env.LUMIA_VOICE_MODEL_PATH || "").trim();
-let modelPath = envModelPath || join(here, "..", "models", "vosk-model-small-en-us-0.15");
+let modelPath =
+	envModelPath || join(here, "..", "models", "vosk-model-small-en-us-0.15");
 modelPath = unpacked(modelPath);
 if (!existsSync(modelPath))
@@ -135,9 +154,10 @@ let EXTRA_GRAMMAR = [];
 const model = new Model(modelPath);
 const buildRecognizer = () => {
-	const recognizer = MATCH_SENTENCE || DISABLE_GRAMMAR
-		? new Recognizer({ model, sampleRate: SAMPLE_RATE })
-		: new Recognizer({ model, sampleRate: SAMPLE_RATE, grammar: GRAMMAR });
+	const recognizer =
+		MATCH_SENTENCE || DISABLE_GRAMMAR
+			? new Recognizer({ model, sampleRate: SAMPLE_RATE })
+			: new Recognizer({ model, sampleRate: SAMPLE_RATE, grammar: GRAMMAR });
 	recognizer.setWords(true);
 	return recognizer;
 };
@@ -161,7 +181,7 @@ if (audioDevice !== null) {
 	recArgs.device = "default";
 	console.error("Using default Windows audio device: default");
 	console.error(
-		"To specify a different device, use: AUDIO_DEVICE=<device_id> or pass as 3rd argument"
+		"To specify a different device, use: AUDIO_DEVICE=<device_id> or pass as 3rd argument",
 	);
 }
@@ -176,16 +196,17 @@ mic.on("error", (err) => {
 // You might need to adjust this value based on your specific use case.
 let WORD_CONFIDENCE_THRESHOLD = 0.7;
 const DEBUG_AUDIO = ["1", "true", "yes"].includes(
-	(process.env.WAKEWORD_DEBUG || "").toLowerCase()
+	(process.env.WAKEWORD_DEBUG || "").toLowerCase(),
 );
 const LOG_PARTIAL =
 	DEBUG_AUDIO ||
 	["1", "true", "yes"].includes(
-		(process.env.WAKEWORD_LOG_PARTIAL || "").toLowerCase()
+		(process.env.WAKEWORD_LOG_PARTIAL || "").toLowerCase(),
 	);
 let LOG_FINAL = ["1", "true", "yes"].includes(
-	(process.env.WAKEWORD_LOG_FINAL || "").toLowerCase()
+	(process.env.WAKEWORD_LOG_FINAL || "").toLowerCase(),
 );
+let emittedMatchesInUtterance = new Set();
 let lastLevelLog = 0;
 function logAudioLevel(buf) {
@@ -231,47 +252,61 @@ mic.on("data", (buf) => {
 					console.log(
 						`Discarding low-confidence word: "${
 							wordDetail.word
-						}" (Conf: ${wordDetail.conf.toFixed(2)})`
+						}" (Conf: ${wordDetail.conf.toFixed(2)})`,
 					);
 				}
 			}
 			const finalRecognizedText = recognizedWords.join(" ").trim();
 			const averageConfidenceAll =
-				totalConfidenceCount > 0 ? totalConfidenceAll / totalConfidenceCount : 0;
+				totalConfidenceCount > 0
+					? totalConfidenceAll / totalConfidenceCount
+					: 0;
 			const averageConfidence =
 				recognizedWords.length > 0
 					? totalConfidence / recognizedWords.length
 					: averageConfidenceAll;
-			handle(finalRecognizedText, averageConfidence, fullResult.text); // Pass both the filtered text and an average confidence
+			handle(finalRecognizedText, averageConfidence, fullResult.text, {
+				isPartial: false,
+			}); // Pass both the filtered text and an average confidence
 		} else if (fullResult && fullResult.text) {
-			// Fallback for cases where setWords(true) might not fully apply or for partial results
-			handle(fullResult.text.trim(), 1.0, fullResult.text); // Assume high confidence if no word-level details
+			// Fallback for cases where setWords(true) might not fully apply
+			handle(fullResult.text.trim(), 1.0, fullResult.text, {
+				isPartial: false,
+			}); // Assume high confidence if no word-level details
 		}
-	} else if (LOG_PARTIAL) {
+	} else {
 		const partial = rec.partialResult();
-		if (partial?.partial) {
+		if (partial?.partial && LOG_PARTIAL) {
 			console.error(`[wakeword] partial: "${partial.partial}"`);
 		}
+		if (partial?.partial && !MATCH_SENTENCE) {
+			handle(partial.partial.trim(), 1.0, partial.partial, { isPartial: true });
+		}
 	}
 });
-function handle(processedWord, averageConfidence, originalText) {
+function handle(processedWord, averageConfidence, originalText, options = {}) {
+	const { isPartial = false } = options;
 	if (!processedWord && !originalText) return;
 	const finalSentence =
 		typeof originalText === "string" && originalText.trim()
 			? originalText.trim()
 			: (processedWord ?? "").toString().trim();
-	if (LOG_FINAL && finalSentence) {
+	if (!isPartial && LOG_FINAL && finalSentence) {
 		process.stdout?.write(`final|${finalSentence}\n`);
 	}
 	const normalizedProcessed = normalizePhrase(processedWord);
 	const normalizedOriginal = normalizePhrase(originalText);
-	const processedTokens = tokenize(normalizedProcessed);
-	const originalTokens = tokenize(normalizedOriginal);
+	const processedTokens = trimUnknownBoundaryTokens(
+		tokenize(normalizedProcessed),
+	);
+	const originalTokens = trimUnknownBoundaryTokens(
+		tokenize(normalizedOriginal),
+	);
 	const matches = new Set();
 	const confidentCommands = new Set();
@@ -279,18 +314,24 @@ function handle(processedWord, averageConfidence, originalText) {
 		if (!tokens?.length) return;
 		const hits = MATCH_SENTENCE
 			? allowedCommands.filter((command) =>
-					tokensContainSequence(tokens, tokenize(command))
-			  )
+					tokensContainSequence(
+						tokens,
+						trimUnknownBoundaryTokens(tokenize(command)),
+					),
+				)
 			: allowedCommands.filter((command) =>
-					tokensEqual(tokens, tokenize(command))
-			  );
+					tokensEqual(
+						tokens,
+						trimUnknownBoundaryTokens(tokenize(command)),
+					),
+				);
 		hits.forEach((hit) => matches.add(hit));
 	};
 	// Only allow sentence matches for commands that were confidently recognized.
 	if (normalizedProcessed) {
 		COMMANDS.forEach((command) => {
-			const commandTokens = tokenize(command);
+			const commandTokens = trimUnknownBoundaryTokens(tokenize(command));
 			const isMatch = MATCH_SENTENCE
 				? tokensContainSequence(processedTokens, commandTokens)
 				: tokensEqual(processedTokens, commandTokens);
@@ -306,19 +347,42 @@ function handle(processedWord, averageConfidence, originalText) {
 	// If word-level confidence filtering removed all words, fall back to the
 	// original text when overall confidence is still acceptable.
-	if (!matches.size && normalizedOriginal && averageConfidence >= WORD_CONFIDENCE_THRESHOLD) {
+	if (
+		!matches.size &&
+		normalizedOriginal &&
+		averageConfidence >= WORD_CONFIDENCE_THRESHOLD
+	) {
 		findMatches(originalTokens);
 	}
-	if (!matches.size) return;
+	if (!matches.size) {
+		if (!isPartial) {
+			emittedMatchesInUtterance.clear();
+		}
+		return;
+	}
+	const uniqueMatches = [...matches].filter(
+		(match) => !emittedMatchesInUtterance.has(match),
+	);
+	if (!uniqueMatches.length) {
+		if (!isPartial) {
+			emittedMatchesInUtterance.clear();
+		}
+		return;
+	}
-	matches.forEach((match) => {
+	uniqueMatches.forEach((match) => {
 		if (finalSentence) {
 			process.stdout?.write(`sentence|${finalSentence}\n`);
 		}
 		process.stdout?.write(`voice|${match}\n`);
 		process.stdout?.write(`confidence|${averageConfidence}\n`);
+		emittedMatchesInUtterance.add(match);
 	});
+	if (!isPartial) {
+		emittedMatchesInUtterance.clear();
+	}
 }
 /* ------------------------------------------------------------------ */
 /* 6. Hot-reload grammar via stdin                                    */
@@ -354,7 +418,7 @@ rl.on("line", (line) => {
 		EXTRA_GRAMMAR = phrases;
 		GRAMMAR = [...COMMANDS, ...EXTRA_GRAMMAR, UNKNOWN_TOKEN];
 		console.error(
-			`[wakeword] extra grammar updated (${phrases.length}): ${phrases.join(", ")}`
+			`[wakeword] extra grammar updated (${phrases.length}): ${phrases.join(", ")}`,
 		);
 		rec = buildRecognizer();
 		return;
@@ -365,7 +429,7 @@ rl.on("line", (line) => {
 	COMMANDS = phrases;
 	GRAMMAR = [...COMMANDS, ...EXTRA_GRAMMAR, UNKNOWN_TOKEN];
 	console.error(
-		`[wakeword] grammar updated (${phrases.length}): ${phrases.join(", ")}`
+		`[wakeword] grammar updated (${phrases.length}): ${phrases.join(", ")}`,
 	);
 	rec = buildRecognizer();
 });

package/package.json (changed)

@@ -1,6 +1,6 @@
 {
 	"name": "@lumiastream/wakeword",
-	"version": "1.1.8",
+	"version": "1.2.0",
 	"type": "module",
 	"main": "lib/index.js",
 	"files": [