@juicesharp/rpiv-voice 1.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +45 -0
- package/LICENSE +21 -0
- package/README.md +116 -0
- package/audio/error-log.ts +37 -0
- package/audio/hallucination-filter.ts +71 -0
- package/audio/mic-source.ts +38 -0
- package/audio/model-download.ts +268 -0
- package/audio/pcm.ts +45 -0
- package/audio/sherpa-onnx-node.d.ts +55 -0
- package/audio/stt-engine.ts +117 -0
- package/command/pipeline-runner.ts +238 -0
- package/command/splash-runner.ts +72 -0
- package/command/voice-command.ts +251 -0
- package/config/voice-config.ts +80 -0
- package/docs/cover.png +0 -0
- package/docs/cover.svg +173 -0
- package/docs/equalizer.svg +86 -0
- package/docs/overlay.jpg +0 -0
- package/docs/overlay.png +0 -0
- package/docs/vertical-cover.png +0 -0
- package/docs/vertical-cover.svg +239 -0
- package/index.ts +66 -0
- package/locales/de.json +39 -0
- package/locales/en.json +42 -0
- package/locales/es.json +39 -0
- package/locales/fr.json +39 -0
- package/locales/pt-BR.json +39 -0
- package/locales/pt.json +39 -0
- package/locales/ru.json +39 -0
- package/locales/uk.json +39 -0
- package/package.json +94 -0
- package/state/i18n-bridge.ts +51 -0
- package/state/key-router.ts +46 -0
- package/state/screen-intent.ts +27 -0
- package/state/selectors/contract.ts +13 -0
- package/state/selectors/derivations.ts +9 -0
- package/state/selectors/focus.ts +6 -0
- package/state/selectors/projections.ts +112 -0
- package/state/state-reducer.ts +197 -0
- package/state/state.ts +48 -0
- package/state/status-intent.ts +23 -0
- package/state/voice-session.ts +176 -0
- package/view/component-binding.ts +24 -0
- package/view/components/equalizer-view.ts +237 -0
- package/view/components/settings-field-view.ts +77 -0
- package/view/components/settings-form-view.ts +26 -0
- package/view/components/splash-view.ts +98 -0
- package/view/components/status-bar-view.ts +112 -0
- package/view/components/transcript-view.ts +50 -0
- package/view/overlay-view.ts +82 -0
- package/view/props-adapter.ts +29 -0
- package/view/screen-content-strategy.ts +58 -0
- package/view/stateful-view.ts +7 -0
package/audio/sherpa-onnx-node.d.ts
@@ -0,0 +1,55 @@
+// Ambient type declarations for sherpa-onnx-node (no .d.ts shipped upstream).
+// Mirrors `nodejs-addon-examples/test_asr_non_streaming_whisper.js` from
+// k2-fsa/sherpa-onnx — top-level keys are camelCase; binding converts to
+// snake_case C struct internally.
+
+declare module "sherpa-onnx-node" {
+  export interface Samples {
+    samples: Float32Array;
+    sampleRate: number;
+  }
+  export interface Result {
+    text: string;
+    tokens: string[];
+    timestamps: number[];
+  }
+  export interface Stream {
+    acceptWaveform(input: Samples): void;
+  }
+  // Note: OfflineRecognizer has no release/destroy/free method in
+  // sherpa-onnx-node@1.13.0 — the native handle is GC-managed.
+  // We use the synchronous `decode` + `getResult` pair (the canonical
+  // upstream example uses sync exclusively).
+  export interface Recognizer {
+    createStream(): Stream;
+    decode(stream: Stream): void;
+    getResult(stream: Stream): Result;
+  }
+  // Whisper config: `language` and `task` are optional (and meaningless for
+  // the *.en monolingual variants — the upstream example omits them
+  // entirely). `tailPaddings` defaults to 0.
+  export interface WhisperModelConfig {
+    encoder: string;
+    decoder: string;
+    language?: string;
+    task?: string;
+    tailPaddings?: number;
+  }
+  export interface Config {
+    featConfig: { sampleRate: number; featureDim: number };
+    modelConfig: {
+      whisper: WhisperModelConfig;
+      tokens: string;
+      numThreads?: number;
+      provider?: string;
+    };
+  }
+  // The binding exposes both a sync constructor and an async factory.
+  // The canonical examples use the sync constructor; we keep both signatures
+  // here so consumers can pick.
+  export interface OfflineRecognizerCtor {
+    new (config: Config): Recognizer;
+    createAsync(config: Config): Promise<Recognizer>;
+  }
+  export const OfflineRecognizer: OfflineRecognizerCtor;
+}
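
Taken together, the declared surface mirrors the upstream non-streaming Whisper example. Below is a minimal usage sketch under stated assumptions: the model paths are hypothetical, and the `.default` unwrap anticipates the CJS/ESM interop quirk that stt-engine.ts (the next file) works around.

import type { Config, OfflineRecognizerCtor } from "sherpa-onnx-node";

// Hypothetical model paths, for illustration only; the package resolves the
// real ones in audio/model-download.ts.
const config: Config = {
  featConfig: { sampleRate: 16000, featureDim: 80 },
  modelConfig: {
    whisper: { encoder: "models/encoder.int8.onnx", decoder: "models/decoder.int8.onnx" },
    tokens: "models/tokens.txt",
  },
};

// Under ESM dynamic import the constructor lands on the CJS `.default`
// export (see the loader note in stt-engine.ts below).
const mod = (await import("sherpa-onnx-node")) as unknown as {
  default?: { OfflineRecognizer: OfflineRecognizerCtor };
  OfflineRecognizer: OfflineRecognizerCtor;
};
const OfflineRecognizer = (mod.default ?? mod).OfflineRecognizer;

const recognizer = new OfflineRecognizer(config);
const stream = recognizer.createStream();
stream.acceptWaveform({ samples: new Float32Array(16000), sampleRate: 16000 });
recognizer.decode(stream); // synchronous, as in the upstream example
console.log(recognizer.getResult(stream).text);
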
package/audio/stt-engine.ts
@@ -0,0 +1,117 @@
+/**
+ * stt-engine — thin typed wrapper around sherpa-onnx-node.
+ *
+ * Type model: sherpa-onnx-node ships no .d.ts files; ambient types live in
+ * ./sherpa-onnx-node.d.ts. Config keys are camelCase; the binding maps to
+ * snake_case C structs internally.
+ *
+ * Model layout: Whisper base multilingual — `modelConfig.whisper.{encoder,
+ * decoder}`, matching the canonical upstream example
+ * `nodejs-addon-examples/test_asr_non_streaming_whisper.js`. We use the int8
+ * quantized variants (`base-encoder.int8.onnx`, `base-decoder.int8.onnx`) to
+ * keep CPU latency low.
+ *
+ * Language pre-set: optional `language` (ISO 639-1 like "en", "ru") biases
+ * Whisper toward that language for accuracy and skips the per-utterance
+ * auto-detect. Threaded from `getActiveLocale()` in voice-command. When
+ * undefined, the multilingual model's built-in auto-detect runs — the
+ * historical default behavior.
+ *
+ * Decode path: SYNCHRONOUS `recognizer.decode(stream)` + `getResult(stream)`,
+ * same as upstream's example.
+ */
+
+import type { Config } from "sherpa-onnx-node";
+
+// ── Whisper fixed input contract ─────────────────────────────────────────────
+// 16 kHz mono PCM. featureDim 80 matches the model's mel-spectrogram output.
+const WHISPER_SAMPLE_RATE = 16000;
+const WHISPER_FEATURE_DIM = 80;
+
+// ── Defaults ─────────────────────────────────────────────────────────────────
+// 4 threads is the sweet spot for Whisper base.en on a modern multi-core CPU
+// per upstream tuning guidance (whisper.cpp benchmarks; the sherpa-onnx ORT
+// thread pool follows the same pattern). More than 4 shows diminishing
+// returns and can starve other Pi work on smaller machines.
+const DEFAULT_NUM_THREADS = 4;
+const DEFAULT_PROVIDER = "cpu";
+// `tailPaddings` is the only decoder-adjacent knob sherpa-onnx exposes for
+// Whisper. Per maintainer guidance in k2-fsa/sherpa-onnx#2787, audio under
+// 30 s makes Whisper miss EOS and hallucinate; padding the encoder input
+// reduces the chunk-end EOT bias that produces spurious terminal punctuation.
+// 1000 frames ≈ 100 mel-frame steps of trailing silence.
+const DEFAULT_TAIL_PADDINGS = 1000;
+
+// ── Types ────────────────────────────────────────────────────────────────────
+
+export interface SttEngineConfig {
+  encoderPath: string;
+  decoderPath: string;
+  tokensPath: string;
+  /** ISO 639-1 hint (e.g. "en", "ru"). Undefined → Whisper auto-detects. */
+  language?: string;
+  numThreads?: number;
+  provider?: string;
+}
+
+export interface SttEngine {
+  recognize(samples: Float32Array, sampleRate: number): Promise<string>;
+  release(): void;
+}
+
+// ── Factory ──────────────────────────────────────────────────────────────────
+
+export async function createSttEngine(config: SttEngineConfig): Promise<SttEngine> {
+  const ns = await loadSherpaNamespace();
+  const recognizer = new ns.OfflineRecognizer(buildRecognizerConfig(config));
+
+  return {
+    async recognize(samples: Float32Array, sampleRate: number): Promise<string> {
+      if (samples.length === 0) return "";
+      const stream = recognizer.createStream();
+      stream.acceptWaveform({ samples, sampleRate });
+      recognizer.decode(stream);
+      return recognizer.getResult(stream).text.trim();
+    },
+    release(): void {
+      // sherpa-onnx-node@1.13.0 exposes no destructor; the native handle is
+      // GC-managed. Kept as a no-op so the lifecycle contract is stable for
+      // callers and tests.
+    },
+  };
+}
+
+// ── Internal ─────────────────────────────────────────────────────────────────
+
+// sherpa-onnx-node ships as CJS; under ESM dynamic import only
+// `OnlineRecognizer` is auto-detected as a named export. Everything else
+// (including `OfflineRecognizer`) lives on `.default`. We fall back to the
+// namespace itself in case a future ESM build flattens the shape.
+async function loadSherpaNamespace(): Promise<{
+  OfflineRecognizer: typeof import("sherpa-onnx-node").OfflineRecognizer;
+}> {
+  const mod = (await import("sherpa-onnx-node")) as Record<string, unknown> & {
+    default?: Record<string, unknown>;
+  };
+  return (mod.default ?? mod) as { OfflineRecognizer: typeof import("sherpa-onnx-node").OfflineRecognizer };
+}
+
+function buildRecognizerConfig(config: SttEngineConfig): Config {
+  return {
+    featConfig: {
+      sampleRate: WHISPER_SAMPLE_RATE,
+      featureDim: WHISPER_FEATURE_DIM,
+    },
+    modelConfig: {
+      whisper: {
+        encoder: config.encoderPath,
+        decoder: config.decoderPath,
+        tailPaddings: DEFAULT_TAIL_PADDINGS,
+        ...(config.language ? { language: config.language } : {}),
+      },
+      tokens: config.tokensPath,
+      numThreads: config.numThreads ?? DEFAULT_NUM_THREADS,
+      provider: config.provider ?? DEFAULT_PROVIDER,
+    },
+  };
+}
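
For orientation, a short sketch of how the factory above is consumed. The paths are hypothetical (in the package they come from audio/model-download.ts), and the zero-filled buffer is just a stand-in for real mic samples.

import { createSttEngine } from "./stt-engine.js";

// Hypothetical paths; the real ones are produced by audio/model-download.ts.
const engine = await createSttEngine({
  encoderPath: "models/base-encoder.int8.onnx",
  decoderPath: "models/base-decoder.int8.onnx",
  tokensPath: "models/base-tokens.txt",
  language: "en", // optional ISO 639-1 hint; omit to let Whisper auto-detect
});

// 16 kHz mono Float32 PCM in, trimmed text out. One second of zeros here,
// purely as a placeholder input.
const text = await engine.recognize(new Float32Array(16000), 16000);
console.log(text);

engine.release(); // a no-op today; kept so the lifecycle contract stays stable
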
package/command/pipeline-runner.ts
@@ -0,0 +1,238 @@
+import { appendErrorLog } from "../audio/error-log.js";
+import { isHallucination } from "../audio/hallucination-filter.js";
+import type { DecibriLike } from "../audio/mic-source.js";
+import { TARGET_SAMPLE_RATE } from "../audio/mic-source.js";
+import { bufferToFloat32, computeRmsFloat32, computeRmsInt16, samplesInInt16Chunk } from "../audio/pcm.js";
+import type { SttEngine } from "../audio/stt-engine.js";
+import { isHallucinationFilterEnabled } from "../config/voice-config.js";
+import type { VoiceSession } from "../state/voice-session.js";
+
+// 12 s soft cap: Whisper trains on 30 s windows and degrades on very short
+// inputs; 5 s force-flushes routinely bisect a clause mid-word. 12 s is the
+// dictation-tool consensus (LiveKit, whisper_streaming) — long enough to fit
+// most sentences, short enough to bound first-token latency.
+const MAX_SEGMENT_MS = 12000;
+const MAX_SEGMENT_SAMPLES = (TARGET_SAMPLE_RATE * MAX_SEGMENT_MS) / 1000;
+
+// When the cap fires, scan the trailing 800 ms for the chunk with the lowest
+// RMS and split there instead of at the wall-clock boundary. The "head" half
+// goes to Whisper, the "tail" carries forward as the start of the next
+// segment. Cuts mid-breath instead of mid-syllable.
+const CAP_CUT_SCAN_MS = 800;
+const CAP_CUT_SCAN_SAMPLES = (TARGET_SAMPLE_RATE * CAP_CUT_SCAN_MS) / 1000;
+
+// Whisper hallucinates filler ("Thanks for watching", "♪", "1/2 1/2…") on
+// near-silent input. sherpa-onnx-node doesn't expose the decoder thresholds
+// that would suppress this, so we gate at the input: skip segments whose mean
+// RMS is below a floor. ~-46 dBFS sits between room noise and quiet speech.
+const MIN_SEGMENT_RMS = 0.005;
+
+// Cadence of rolling partial-transcript decodes during an active utterance.
+// 1 s gives the user a continuously-refining preview without saturating the
+// CPU on Whisper-base (typical decode of a sub-12 s buffer is well under 1 s
+// on modern silicon, leaving headroom for mic + render). Single-flight: a
+// new tick is skipped if the previous decode hasn't returned.
+const PARTIAL_DECODE_INTERVAL_MS = 1000;
+
+export interface PipelineHandle {
+  finalTranscriptPromise: Promise<string>;
+  isPaused(): boolean;
+  setPaused(paused: boolean): void;
+  setHallucinationFilterEnabled(enabled: boolean): void;
+  stop(): void;
+}
+
+export interface PipelineOptions {
+  hallucinationFilterEnabled?: boolean;
+}
+
+export function startDictationPipeline(
+  mic: DecibriLike,
+  sttEngine: SttEngine,
+  session: VoiceSession,
+  signal: AbortSignal,
+  options: PipelineOptions = {},
+): PipelineHandle {
+  let speechBuffer: Buffer[] = [];
+  let speechBufferSamples = 0;
+  let transcript = "";
+  let recognizing: Promise<void> = Promise.resolve();
+  let paused = false;
+  let hallucinationFilterEnabled = isHallucinationFilterEnabled(options);
+
+  // Single-flight gate for partial decodes. Combined with the interval
+  // throttle this means at most one partial recognize() at a time and at
+  // most one per PARTIAL_DECODE_INTERVAL_MS — never queueing a backlog if
+  // the CPU stalls.
+  let partialInFlight = false;
+  let lastPartialAt = 0;
+  // Bumps every time the buffer is committed (silence flush, cap flush, or
+  // session shutdown). A partial whose snapshot was taken at an earlier
+  // epoch is dropped on dispatch — protects against a slow partial decode
+  // painting stale text after the final commit.
+  let utteranceEpoch = 0;
+
+  const recognizeFinal = async (chunks: Buffer[]): Promise<void> => {
+    if (chunks.length === 0) return;
+    const samples = bufferToFloat32(Buffer.concat(chunks));
+    if (computeRmsFloat32(samples) < MIN_SEGMENT_RMS) {
+      // No audible content — but still finalize so any in-flight partial
+      // gets cleared by the reducer's empty-append branch.
+      session.dispatchAction({ kind: "audio_transcript_appended", text: "" });
+      return;
+    }
+    try {
+      const text = await sttEngine.recognize(samples, TARGET_SAMPLE_RATE);
+      if (!text || (hallucinationFilterEnabled && isHallucination(text))) {
+        session.dispatchAction({ kind: "audio_transcript_appended", text: "" });
+        return;
+      }
+      transcript = transcript ? `${transcript} ${text}` : text;
+      session.dispatchAction({ kind: "audio_transcript_appended", text });
+    } catch (err) {
+      // We deliberately do not surface this to the TUI: writing to stderr
+      // corrupts the active render, and `notify` would churn the chat for
+      // every dropped segment. Instead, append a breadcrumb to a file the
+      // user can `cat` later when investigating transcript gaps.
+      appendErrorLog("stt.recognize", err);
+      session.dispatchAction({ kind: "audio_transcript_appended", text: "" });
+    }
+  };
+
+  const flushBuffer = (): void => {
+    if (speechBuffer.length === 0) return;
+    const chunks = speechBuffer;
+    speechBuffer = [];
+    speechBufferSamples = 0;
+    utteranceEpoch++;
+    recognizing = recognizing.then(() => recognizeFinal(chunks));
+  };
+
+  const queueCapFlush = (): void => {
+    const cutIdx = findLowestEnergyCutIndex(speechBuffer);
+    if (cutIdx <= 0 || cutIdx >= speechBuffer.length) {
+      flushBuffer();
+      return;
+    }
+    const head = speechBuffer.slice(0, cutIdx);
+    const tail = speechBuffer.slice(cutIdx);
+    speechBuffer = tail;
+    speechBufferSamples = countSamples(tail);
+    utteranceEpoch++;
+    recognizing = recognizing.then(() => recognizeFinal(head));
+  };
+
+  // Rolling partial preview. Runs *outside* the `recognizing` chain so the
+  // preview latency isn't queued behind pending finals. Best-effort: a
+  // snapshot of the current buffer is decoded, and the result is dispatched
+  // as the new partial only if the utterance epoch hasn't advanced under us.
+  const tryEmitPartial = (): void => {
+    if (partialInFlight) return;
+    if (speechBuffer.length === 0) return;
+    const now = Date.now();
+    if (now - lastPartialAt < PARTIAL_DECODE_INTERVAL_MS) return;
+    lastPartialAt = now;
+    partialInFlight = true;
+    const snapshotEpoch = utteranceEpoch;
+    const snapshot = speechBuffer.slice();
+    void (async () => {
+      try {
+        const samples = bufferToFloat32(Buffer.concat(snapshot));
+        if (computeRmsFloat32(samples) < MIN_SEGMENT_RMS) return;
+        const text = await sttEngine.recognize(samples, TARGET_SAMPLE_RATE);
+        if (snapshotEpoch !== utteranceEpoch) return;
+        if (hallucinationFilterEnabled && isHallucination(text)) return;
+        session.dispatchAction({ kind: "audio_partial_transcript_set", text });
+      } catch (err) {
+        appendErrorLog("stt.recognize.partial", err);
+      } finally {
+        partialInFlight = false;
+      }
+    })();
+  };
+
+  mic.on("data", (chunk: Buffer) => {
+    const level = computeRmsInt16(chunk);
+    session.dispatchAction({ kind: "audio_chunk", level });
+    if (paused) return;
+    speechBuffer.push(chunk);
+    speechBufferSamples += samplesInInt16Chunk(chunk);
+    if (speechBufferSamples >= MAX_SEGMENT_SAMPLES) {
+      queueCapFlush();
+    } else {
+      tryEmitPartial();
+    }
+  });
+  mic.on("silence", () => {
+    if (paused) return;
+    flushBuffer();
+  });
+
+  const finalTranscriptPromise = waitForMicShutdown(mic, signal, async () => {
+    flushBuffer();
+    await recognizing;
+  }).then(() => transcript);
+
+  return {
+    finalTranscriptPromise,
+    isPaused: () => paused,
+    setPaused: (v) => {
+      paused = v;
+    },
+    setHallucinationFilterEnabled: (v) => {
+      hallucinationFilterEnabled = v;
+    },
+    stop: () => {
+      mic.stop();
+    },
+  };
+}
+
+function countSamples(chunks: Buffer[]): number {
+  let total = 0;
+  for (const chunk of chunks) total += samplesInInt16Chunk(chunk);
+  return total;
+}
+
+// Walk chunks newest-first up to CAP_CUT_SCAN_SAMPLES of audio; return the
+// index of the lowest-RMS chunk in that window. Returns chunks.length when
+// the buffer is too short to scan, telling the caller to fall back to a full
+// flush.
+function findLowestEnergyCutIndex(chunks: Buffer[]): number {
+  if (chunks.length < 2) return chunks.length;
+  let scanned = 0;
+  let lowestRms = Number.POSITIVE_INFINITY;
+  let lowestIdx = chunks.length;
+  for (let i = chunks.length - 1; i >= 1; i--) {
+    const chunk = chunks[i];
+    if (!chunk) continue;
+    const rms = computeRmsInt16(chunk);
+    if (rms < lowestRms) {
+      lowestRms = rms;
+      lowestIdx = i;
+    }
+    scanned += samplesInInt16Chunk(chunk);
+    if (scanned >= CAP_CUT_SCAN_SAMPLES) break;
+  }
+  return lowestIdx;
+}
+
+function waitForMicShutdown(mic: DecibriLike, signal: AbortSignal, onFinish: () => Promise<void>): Promise<void> {
+  return new Promise<void>((resolve) => {
+    const onAbort = () => {
+      mic.stop();
+    };
+    const finish = async () => {
+      signal.removeEventListener("abort", onAbort);
+      await onFinish();
+      resolve();
+    };
+    mic.once("end", finish);
+    mic.once("error", finish);
+    if (signal.aborted) {
+      mic.stop();
+    } else {
+      signal.addEventListener("abort", onAbort, { once: true });
+    }
+  });
+}
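
At the fixed 16 kHz rate the constants work out to MAX_SEGMENT_SAMPLES = 16000 × 12000 / 1000 = 192,000 samples and CAP_CUT_SCAN_SAMPLES = 16000 × 800 / 1000 = 12,800 samples. Below is a wiring sketch under stated assumptions: `mic`, `engine`, and `session` come from the sibling modules and are declared here rather than constructed.

import type { DecibriLike } from "../audio/mic-source.js";
import type { SttEngine } from "../audio/stt-engine.js";
import type { VoiceSession } from "../state/voice-session.js";
import { startDictationPipeline } from "./pipeline-runner.js";

declare const mic: DecibriLike;      // from mic-source.ts
declare const engine: SttEngine;     // from createSttEngine()
declare const session: VoiceSession; // from voice-session.ts

const abort = new AbortController();
const handle = startDictationPipeline(mic, engine, session, abort.signal, {
  hallucinationFilterEnabled: true,
});

// While paused, `audio_chunk` level actions still dispatch (the level meter
// keeps moving) but chunks are not buffered for recognition.
handle.setPaused(true);
handle.setPaused(false);

// Aborting stops the mic; the pipeline flushes the remaining buffer, awaits
// the `recognizing` chain, then resolves with the stitched transcript.
abort.abort();
const transcript = await handle.finalTranscriptPromise;
console.log(transcript);
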
package/command/splash-runner.ts
@@ -0,0 +1,72 @@
+import type { ExtensionCommandContext } from "@earendil-works/pi-coding-agent";
+import {
+  SPLASH_FRAME_INTERVAL_MS,
+  SPLASH_FRAMES,
+  type SplashPhase,
+  SplashView,
+} from "../view/components/splash-view.js";
+
+export interface SplashController {
+  setPhase(phase: SplashPhase): void;
+}
+
+export interface SplashRunnerConfig {
+  initialPhase: SplashPhase;
+}
+
+export async function runWithSplash<T>(
+  ctx: ExtensionCommandContext,
+  config: SplashRunnerConfig,
+  work: (controller: SplashController) => Promise<T>,
+): Promise<T> {
+  let workResult: T | undefined;
+  let workError: unknown;
+
+  // Render inline (replace the editor) rather than as a bottom-anchored overlay.
+  // Bottom-anchored overlays force pi-tui to pad the chat buffer to the full
+  // terminal height, which pushes short chat content to the very top of the
+  // screen and leaves a large gap above the overlay. Inline mode keeps the
+  // component in the chat flow — it appears exactly where the editor was.
+  await ctx.ui.custom<void>((tui, theme, _kb, done) => {
+    const splash = new SplashView(theme);
+    let phase: SplashPhase = config.initialPhase;
+    let frame = 0;
+    splash.setProps({ phase, frame });
+
+    const tick = setInterval(() => {
+      frame = (frame + 1) % SPLASH_FRAMES.length;
+      splash.setProps({ phase, frame });
+      tui.requestRender();
+    }, SPLASH_FRAME_INTERVAL_MS);
+
+    const controller: SplashController = {
+      setPhase(next: SplashPhase) {
+        phase = next;
+        splash.setProps({ phase, frame });
+        tui.requestRender();
+      },
+    };
+
+    work(controller).then(
+      (result) => {
+        workResult = result;
+        clearInterval(tick);
+        done(undefined);
+      },
+      (err) => {
+        workError = err;
+        clearInterval(tick);
+        done(undefined);
+      },
+    );
+
+    return {
+      render: (w: number) => splash.render(w),
+      invalidate: () => splash.setProps({ phase, frame }),
+      handleInput: (_d: string) => {},
+    };
+  });
+
+  if (workError) throw workError;
+  return workResult as T;
+}
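
A minimal consumption sketch follows; the phase constants and the prepareModels helper are placeholders, since the real SplashPhase values live in view/components/splash-view.ts.

import type { ExtensionCommandContext } from "@earendil-works/pi-coding-agent";
import type { SplashPhase } from "../view/components/splash-view.js";
import { runWithSplash } from "./splash-runner.js";

declare const ctx: ExtensionCommandContext;      // handed to the command handler
declare const DOWNLOAD_PHASE: SplashPhase;       // placeholder phase values; the
declare const LOAD_PHASE: SplashPhase;           // real ones live in splash-view.ts
declare function prepareModels(): Promise<void>; // hypothetical helper

const result = await runWithSplash(ctx, { initialPhase: DOWNLOAD_PHASE }, async (splash) => {
  await prepareModels();       // long-running work behind the animated splash
  splash.setPhase(LOAD_PHASE); // repaints immediately via tui.requestRender()
  return "ready";
});
// runWithSplash resolves only after `work` settles; rejections rethrow here.
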