npm - @kpritam/grimoire-output-docusaurus - Versions diffs - 0.1.8 - Mend

@kpritam/grimoire-output-docusaurus 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

package/templates/spellbook/src/components/SpellbookChat/useSpeechRecognition.ts ADDED Viewed

@@ -0,0 +1,271 @@
+import { useCallback, useEffect, useRef, useState } from "react";
+import { transcriptDebug, voiceLog } from "./voiceDebug";
+/** Narrow surface of the Web Speech API used here (DOM lib may omit these in some TS configs). */
+interface VoiceSpeechRecognitionAlternative {
+  // Recognized text for this hypothesis; the only field this file reads.
+  readonly transcript: string;
+}
+/** Array-like list of alternatives for one recognized segment. */
+interface VoiceSpeechRecognitionResult {
+  // True once the engine will no longer revise this segment.
+  readonly isFinal: boolean;
+  readonly length: number;
+  readonly [index: number]: VoiceSpeechRecognitionAlternative;
+}
+/** Array-like list of results carried by a result event. */
+interface VoiceSpeechRecognitionResultList {
+  readonly length: number;
+  readonly [index: number]: VoiceSpeechRecognitionResult;
+}
+/** Payload handed to `onresult`. */
+interface VoiceSpeechRecognitionEvent extends Event {
+  // Index in `results` from which the hook starts reading on each event.
+  readonly resultIndex: number;
+  readonly results: VoiceSpeechRecognitionResultList;
+}
+/** Payload handed to `onerror`. */
+interface VoiceSpeechRecognitionErrorEvent extends Event {
+  // Short error code (e.g. "aborted", "no-speech", "not-allowed", "network").
+  readonly error: string;
+  // Free-form description; may be empty, in which case `error` is shown instead.
+  readonly message: string;
+}
+/** The subset of the recognizer object that this file configures and drives. */
+interface VoiceSpeechRecognition extends EventTarget {
+  continuous: boolean;
+  interimResults: boolean;
+  lang: string;
+  onresult: ((this: VoiceSpeechRecognition, ev: VoiceSpeechRecognitionEvent) => void) | null;
+  onerror:
+    | ((this: VoiceSpeechRecognition, ev: VoiceSpeechRecognitionErrorEvent) => void)
+    | null;
+  onend: ((this: VoiceSpeechRecognition, ev: Event) => void) | null;
+  start(): void;
+  stop(): void;
+  abort(): void;
+}
+/** Zero-argument constructor shape shared by the standard and `webkit`-prefixed globals. */
+type SpeechRecognitionConstructor = new () => VoiceSpeechRecognition;
+/**
+ * Looks up the speech-recognition constructor on `window`.
+ *
+ * @returns The standard `SpeechRecognition` global if present, otherwise the
+ *   `webkitSpeechRecognition` one; `undefined` when there is no `window` or
+ *   neither global exists.
+ */
+function getSpeechRecognitionCtor(): SpeechRecognitionConstructor | undefined {
+  if (typeof window !== "undefined") {
+    const host = window as Window & {
+      SpeechRecognition?: SpeechRecognitionConstructor;
+      webkitSpeechRecognition?: SpeechRecognitionConstructor;
+    };
+    const standard = host.SpeechRecognition;
+    const prefixed = host.webkitSpeechRecognition;
+    return standard ?? prefixed ?? undefined;
+  }
+  return undefined;
+}
+/**
+ * Fatal native-runtime errors. Once one of these fires the API is effectively
+ * dead for this page-load (no network path to Google's speech service, mic
+ * blocked at the OS level, etc.) — there's no point retrying it. We surface
+ * `unsupported` so the unified hook can transparently fall back to Whisper.
+ */
+// Matched against `ev.error` in the recognizer's `onerror` handler; a hit marks
+// the native API as unsupported instead of surfacing an error message.
+const FATAL_NATIVE_ERRORS = new Set([
+  // Speech service could not be reached.
+  "network",
+  // NOTE(review): presumably the browser refusing the recognition service — confirm against the Web Speech spec.
+  "service-not-allowed",
+  // NOTE(review): presumably microphone capture failing — confirm against the Web Speech spec.
+  "audio-capture",
+]);
+/** Value returned by `useSpeechRecognition`: state flags, transcripts, and controls. */
+export interface UseSpeechRecognitionResult {
+  /** True while a recognizer constructor exists and no fatal runtime error has been seen. */
+  readonly supported: boolean;
+  /** True between a successful `start()` and the next stop/abort/error/end. */
+  readonly listening: boolean;
+  /** Latest not-yet-final text; cleared when a final result arrives without new interim text. */
+  readonly interimTranscript: string;
+  /** Final segments accumulated so far, joined with single spaces. */
+  readonly finalTranscript: string;
+  /** User-facing error text, or `null` when there is nothing to report. */
+  readonly error: string | null;
+  /** Aborts any active session and starts a new one; `lang` defaults to "en-US". */
+  readonly start: (opts?: { lang?: string }) => void;
+  /** Graceful stop: the engine still emits any trailing final onresult. */
+  readonly stop: () => void;
+  /** Hard cancel: drops any pending result. Use on unmount / re-listen. */
+  readonly abort: () => void;
+  /** Aborts, then clears both transcripts and the error. */
+  readonly reset: () => void;
+}
+/**
+ * React hook around the browser's Web Speech recognizer.
+ *
+ * Every `start()` aborts whatever session is active, creates a fresh
+ * recognizer (continuous, interim results enabled) and streams its text into
+ * `interimTranscript` / `finalTranscript`.
+ *
+ * Events are only honored while their recognizer is still the active one
+ * (`recognitionRef.current`). An instance that was replaced by a newer
+ * `start()`, hard-aborted, or already failed still fires `error` / `end`
+ * asynchronously; without this check its late `end` would flip `listening`
+ * back to false underneath the session that replaced it.
+ *
+ * @returns Support and listening flags, transcripts, the last error, and the
+ *   `start` / `stop` / `abort` / `reset` controls.
+ */
+export function useSpeechRecognition(): UseSpeechRecognitionResult {
+  const [listening, setListening] = useState(false);
+  const [interimTranscript, setInterimTranscript] = useState("");
+  const [finalTranscript, setFinalTranscript] = useState("");
+  const [error, setError] = useState<string | null>(null);
+  // Sticky once the API errors out fatally — flips `supported` to false so
+  // the unified hook can fall back to Whisper for the rest of the session.
+  const [runtimeUnsupported, setRuntimeUnsupported] = useState(false);
+  // The recognizer whose events are currently allowed to update state.
+  const recognitionRef = useRef<VoiceSpeechRecognition | null>(null);
+  // Lazy init avoids ever rendering with `supported: false` on the client.
+  // SSR is irrelevant — `Root.tsx` lazy-imports the panel.
+  const [hasCtor, setHasCtor] = useState<boolean>(() =>
+    Boolean(getSpeechRecognitionCtor()),
+  );
+  useEffect(() => {
+    const next = Boolean(getSpeechRecognitionCtor());
+    setHasCtor((prev) => (prev === next ? prev : next));
+  }, []);
+  const supported = hasCtor && !runtimeUnsupported;
+  // Graceful stop — Chrome will still fire one trailing `onresult` (final)
+  // and then `onend`. The recognition reference is kept until `onend` so
+  // that final result can land in state; only the listening flag flips
+  // immediately. Use `abort` if you need to drop the pending result.
+  const stop = useCallback(() => {
+    const r = recognitionRef.current;
+    voiceLog("native.stop", { hasRecognition: Boolean(r) });
+    if (!r) {
+      setListening(false);
+      return;
+    }
+    try {
+      r.stop();
+    } catch {
+      /* already stopped */
+    }
+    setListening(false);
+  }, []);
+  // Hard cancel — the reference is dropped BEFORE calling `abort()` so every
+  // event the aborted instance still emits is treated as stale.
+  const abort = useCallback(() => {
+    const r = recognitionRef.current;
+    voiceLog("native.abort", { hasRecognition: Boolean(r) });
+    recognitionRef.current = null;
+    try {
+      r?.abort();
+    } catch {
+      /* ignore */
+    }
+    setListening(false);
+  }, []);
+  const reset = useCallback(() => {
+    voiceLog("native.reset");
+    abort();
+    setInterimTranscript("");
+    setFinalTranscript("");
+    setError(null);
+  }, [abort]);
+  // Unmount: abort without touching React state.
+  useEffect(() => {
+    return () => {
+      const r = recognitionRef.current;
+      recognitionRef.current = null;
+      try {
+        r?.abort();
+      } catch {
+        /* ignore */
+      }
+    };
+  }, []);
+  const start = useCallback((opts?: { lang?: string }) => {
+    const Ctor = getSpeechRecognitionCtor();
+    if (!Ctor) {
+      voiceLog("native.unsupported");
+      return;
+    }
+    const lang = opts?.lang ?? "en-US";
+    voiceLog("native.start", { lang });
+    setError(null);
+    // Re-listen: drop the previous session (if any) before creating a new one.
+    abort();
+    let recognition: VoiceSpeechRecognition;
+    try {
+      recognition = new Ctor();
+    } catch {
+      voiceLog("native.start.error", { message: "constructor failed" });
+      setError("Could not start speech recognition.");
+      return;
+    }
+    // True only while this instance is the one the hook is tracking.
+    const isActive = (): boolean => recognitionRef.current === recognition;
+    recognition.continuous = true;
+    recognition.interimResults = true;
+    recognition.lang = lang;
+    recognition.onresult = (event: VoiceSpeechRecognitionEvent) => {
+      if (!isActive()) {
+        // Result from an aborted/replaced session — must not leak into the
+        // transcript of the current one.
+        return;
+      }
+      let interim = "";
+      let final = "";
+      for (let i = event.resultIndex; i < event.results.length; i += 1) {
+        const result = event.results[i];
+        const piece = result[0]?.transcript ?? "";
+        if (result.isFinal) {
+          final += piece;
+        } else {
+          interim += piece;
+        }
+      }
+      if (final) {
+        voiceLog("native.result.final", transcriptDebug(final));
+        setFinalTranscript((prev) =>
+          prev ? `${prev.trimEnd()} ${final.trim()}` : final.trim(),
+        );
+      }
+      if (interim) {
+        voiceLog("native.result.interim", transcriptDebug(interim));
+        setInterimTranscript(interim);
+      } else if (final) {
+        setInterimTranscript("");
+      }
+    };
+    recognition.onerror = (ev: VoiceSpeechRecognitionErrorEvent) => {
+      voiceLog("native.error", {
+        error: ev.error,
+        message: ev.message,
+      });
+      if (!isActive()) {
+        // Late error from a session that was already replaced or torn down;
+        // acting on it would null out the ref of the session that followed.
+        return;
+      }
+      if (ev.error === "aborted" || ev.error === "no-speech") {
+        return;
+      }
+      if (ev.error === "not-allowed") {
+        setError("Microphone permission denied");
+        setListening(false);
+        recognitionRef.current = null;
+        return;
+      }
+      if (FATAL_NATIVE_ERRORS.has(ev.error)) {
+        // Web Speech API is dead for this session — flip to unsupported so
+        // the unified hook switches to Whisper. Don't surface the raw error
+        // (the user doesn't care about Google STT internals).
+        setRuntimeUnsupported(true);
+        setError(null);
+        setListening(false);
+        recognitionRef.current = null;
+        return;
+      }
+      setError(ev.message || ev.error || "Speech recognition error");
+      setListening(false);
+      recognitionRef.current = null;
+    };
+    recognition.onend = () => {
+      voiceLog("native.end");
+      if (!isActive()) {
+        // Whoever dropped the reference (abort, error path, a newer start)
+        // already settled `listening`; a stale `end` must not override it.
+        return;
+      }
+      recognitionRef.current = null;
+      setListening(false);
+    };
+    recognitionRef.current = recognition;
+    try {
+      recognition.start();
+      voiceLog("native.started");
+      setListening(true);
+      setInterimTranscript("");
+    } catch {
+      voiceLog("native.start.error", { message: "start failed" });
+      setError("Could not start speech recognition.");
+      recognitionRef.current = null;
+      setListening(false);
+    }
+  }, [abort]);
+  return {
+    supported,
+    listening,
+    interimTranscript,
+    finalTranscript,
+    error,
+    start,
+    stop,
+    abort,
+    reset,
+  };
+}

package/templates/spellbook/src/components/SpellbookChat/useSpeechSynthesis.ts ADDED Viewed

@@ -0,0 +1,229 @@
+import { useCallback, useEffect, useMemo, useRef, useState } from "react";
+/**
+ * Thin, idempotent wrapper around `window.speechSynthesis` with a FIFO queue
+ * so streamed answers can be spoken sentence-by-sentence. Guarantees:
+ *
+ *   1. `cancel()` empties the queue, aborts the live utterance, and resets
+ *      internal state. Callers that re-enqueue after `cancel()` start a
+ *      brand new utterance chain.
+ *   2. The hook cancels automatically on unmount — no leftover TTS after
+ *      the chat panel is closed, the mode is switched, or the route changes.
+ *
+ * Browser quirk: `speechSynthesis.cancel()` fires `onend`/`onerror` on the
+ * in-flight utterance which would normally try to advance our queue. We
+ * flip `processingRef` to `false` and empty the queue BEFORE calling cancel
+ * so the `finish` callback is a no-op on the cancellation path.
+ */
+export interface UseSpeechSynthesisResult {
+  // True when `window` and the global `speechSynthesis` both exist.
+  readonly supported: boolean;
+  // True while an utterance started by this hook is in progress or queued text remains.
+  readonly speaking: boolean;
+  // Voices reported by `speechSynthesis.getVoices()`, refreshed on `voiceschanged`.
+  readonly voices: readonly SpeechSynthesisVoice[];
+  // Cancels everything queued or in progress, then speaks `text` (trimmed;
+  // blank input only cancels). `opts` overrides the voice, rate and pitch.
+  readonly speak: (
+    text: string,
+    opts?: {
+      voice?: SpeechSynthesisVoice;
+      rate?: number;
+      pitch?: number;
+    },
+  ) => void;
+  // Empties the queue and stops the live utterance.
+  readonly cancel: () => void;
+  // Appends trimmed `text` to the FIFO queue (blank input is ignored) and
+  // starts speaking if nothing is in progress.
+  readonly enqueue: (text: string) => void;
+}
+/**
+ * Chooses the voice used when the caller does not supply one.
+ *
+ * English voices (language tag starting with "en", case-insensitive) are
+ * preferred; if there are none, every voice is a candidate. Among the
+ * candidates the first one with `localService === false` wins, otherwise the
+ * first candidate.
+ *
+ * @param list Voices to choose from.
+ * @returns The chosen voice, or `undefined` when `list` is empty.
+ */
+function pickDefaultVoice(
+  list: readonly SpeechSynthesisVoice[],
+): SpeechSynthesisVoice | undefined {
+  const englishVoices = list.filter((voice) =>
+    voice.lang.toLowerCase().startsWith("en"),
+  );
+  const candidates = englishVoices.length === 0 ? list : englishVoices;
+  const firstRemote = candidates.find((voice) => voice.localService === false);
+  return firstRemote ?? candidates[0];
+}
+/**
+ * React hook exposing `window.speechSynthesis` as `speak` (interrupting),
+ * `enqueue` (FIFO) and `cancel`.
+ *
+ * Only the utterance recorded in `activeUtteranceRef` may advance the queue
+ * or change `speaking`. `speechSynthesis.cancel()` fires `onend`/`onerror` on
+ * the cancelled utterance asynchronously, possibly after the caller has
+ * already started a new one; such late callbacks are ignored so they cannot
+ * reset the state of the utterance that replaced them. Holding the utterance
+ * in a ref also keeps it referenced until it finishes.
+ *
+ * @returns Support flag, speaking flag, available voices, and the controls.
+ */
+export function useSpeechSynthesis(): UseSpeechSynthesisResult {
+  const supported =
+    typeof window !== "undefined" && typeof speechSynthesis !== "undefined";
+  const [voices, setVoices] = useState<readonly SpeechSynthesisVoice[]>([]);
+  const [speaking, setSpeaking] = useState(false);
+  // Pending text, oldest first.
+  const queueRef = useRef<string[]>([]);
+  // True while an utterance started by this hook is in flight.
+  const processingRef = useRef(false);
+  const defaultVoiceRef = useRef<SpeechSynthesisVoice | undefined>(undefined);
+  // The one utterance whose end/error callback is still allowed to act.
+  const activeUtteranceRef = useRef<SpeechSynthesisUtterance | null>(null);
+  const refreshVoices = useCallback(() => {
+    if (!supported) {
+      return;
+    }
+    const v = speechSynthesis.getVoices();
+    setVoices(v);
+    defaultVoiceRef.current = pickDefaultVoice(v);
+  }, [supported]);
+  // Voices may load asynchronously: read once now and again on every
+  // `voiceschanged`, restoring the previous handler on cleanup if ours is
+  // still the one installed.
+  useEffect(() => {
+    if (!supported) {
+      return;
+    }
+    refreshVoices();
+    const previous = speechSynthesis.onvoiceschanged;
+    const handler: typeof speechSynthesis.onvoiceschanged = () => {
+      refreshVoices();
+    };
+    speechSynthesis.onvoiceschanged = handler;
+    return () => {
+      if (speechSynthesis.onvoiceschanged === handler) {
+        speechSynthesis.onvoiceschanged = previous;
+      }
+    };
+  }, [supported, refreshVoices]);
+  // Speaks the next queued item, if any, unless something is already in flight.
+  const processQueue = useCallback(() => {
+    if (!supported) {
+      return;
+    }
+    if (processingRef.current) {
+      return;
+    }
+    const next = queueRef.current.shift();
+    if (!next) {
+      setSpeaking(false);
+      return;
+    }
+    processingRef.current = true;
+    setSpeaking(true);
+    const utterance = new SpeechSynthesisUtterance(next);
+    const voice =
+      defaultVoiceRef.current ??
+      pickDefaultVoice(speechSynthesis.getVoices());
+    if (voice) {
+      utterance.voice = voice;
+    }
+    utterance.lang = voice?.lang ?? "en-US";
+    activeUtteranceRef.current = utterance;
+    const finish = (): void => {
+      if (activeUtteranceRef.current !== utterance) {
+        // Cancelled or superseded (or both end and error fired): do nothing.
+        return;
+      }
+      activeUtteranceRef.current = null;
+      processingRef.current = false;
+      // Speaks the next item, or clears `speaking` when the queue is empty.
+      processQueue();
+    };
+    utterance.onend = finish;
+    utterance.onerror = finish;
+    try {
+      speechSynthesis.speak(utterance);
+    } catch {
+      finish();
+    }
+  }, [supported]);
+  const cancel = useCallback(() => {
+    if (!supported) {
+      return;
+    }
+    // Reset our own bookkeeping BEFORE cancelling so the callbacks that
+    // `speechSynthesis.cancel()` triggers find nothing to act on.
+    queueRef.current = [];
+    processingRef.current = false;
+    activeUtteranceRef.current = null;
+    try {
+      speechSynthesis.cancel();
+    } catch {
+      // Chromium occasionally throws if cancel races with tab suspension —
+      // the subsequent state reset still leaves us in a consistent idle state.
+    }
+    setSpeaking(false);
+  }, [supported]);
+  const speak = useCallback(
+    (
+      text: string,
+      opts?: {
+        voice?: SpeechSynthesisVoice;
+        rate?: number;
+        pitch?: number;
+      },
+    ) => {
+      if (!supported) {
+        return;
+      }
+      cancel();
+      const trimmed = text.trim();
+      if (!trimmed) {
+        return;
+      }
+      processingRef.current = true;
+      setSpeaking(true);
+      const utterance = new SpeechSynthesisUtterance(trimmed);
+      const voice =
+        opts?.voice ??
+        defaultVoiceRef.current ??
+        pickDefaultVoice(speechSynthesis.getVoices());
+      if (voice) {
+        utterance.voice = voice;
+      }
+      utterance.lang = voice?.lang ?? "en-US";
+      if (opts?.rate != null) {
+        utterance.rate = opts.rate;
+      }
+      if (opts?.pitch != null) {
+        utterance.pitch = opts.pitch;
+      }
+      activeUtteranceRef.current = utterance;
+      const finish = (): void => {
+        if (activeUtteranceRef.current !== utterance) {
+          return;
+        }
+        activeUtteranceRef.current = null;
+        processingRef.current = false;
+        // Anything enqueued while this utterance was playing is spoken next;
+        // with an empty queue this just clears `speaking`.
+        processQueue();
+      };
+      utterance.onend = finish;
+      utterance.onerror = finish;
+      try {
+        speechSynthesis.speak(utterance);
+      } catch {
+        finish();
+      }
+    },
+    [supported, cancel, processQueue],
+  );
+  const enqueue = useCallback(
+    (text: string) => {
+      if (!supported) {
+        return;
+      }
+      const trimmed = text.trim();
+      if (!trimmed) {
+        return;
+      }
+      queueRef.current.push(trimmed);
+      processQueue();
+    },
+    [supported, processQueue],
+  );
+  /**
+   * Every VoiceMode mount/unmount (panel close, mode switch, route change)
+   * must flush `speechSynthesis`; otherwise a live utterance keeps talking
+   * long after the UI is gone. Stash cancel in a ref so the cleanup fires
+   * exactly once, on unmount.
+   */
+  const cancelRef = useRef(cancel);
+  cancelRef.current = cancel;
+  useEffect(() => {
+    return () => {
+      cancelRef.current();
+    };
+  }, []);
+  const stableVoices = useMemo(() => voices, [voices]);
+  return {
+    supported,
+    speaking,
+    voices: stableVoices,
+    speak,
+    cancel,
+    enqueue,
+  };
+}

package/templates/spellbook/src/components/SpellbookChat/useUnifiedSTT.ts ADDED Viewed

@@ -0,0 +1,134 @@
+import { useCallback, useEffect, useRef } from "react";
+import { useSpeechRecognition } from "./useSpeechRecognition";
+import {
+  useWhisperSTT,
+  type WhisperLoadStatus,
+} from "./useWhisperSTT";
+import { voiceLog } from "./voiceDebug";
+/**
+ * Common shape for native (Web Speech API) and Whisper-fallback STT.
+ * Native streams interim + final transcripts; Whisper is single-shot
+ * (record → stop → transcript). Whisper-only metadata is reported as
+ * `ready` / `1` / `false` in native mode so callers don't need to branch.
+ */
+export interface UnifiedSTT {
+  // Which backend the fields below currently reflect.
+  readonly mode: "native" | "whisper";
+  // Always true in native mode; in whisper mode mirrors the Whisper hook.
+  readonly supported: boolean;
+  readonly listening: boolean;
+  readonly interimTranscript: string;
+  readonly finalTranscript: string;
+  readonly error: string | null;
+  // Starts listening on the active backend.
+  readonly start: (opts?: { lang?: string }) => void;
+  /** Graceful stop — native mode still emits a trailing final transcript. */
+  readonly stop: () => void;
+  /** Hard cancel — drops any pending transcript. */
+  readonly abort: () => void;
+  // Delegates to the active backend's `reset`.
+  readonly reset: () => void;
+  // Whisper-only metadata; fixed at "ready" / 1 / false in native mode.
+  readonly modelLoadStatus: WhisperLoadStatus;
+  readonly modelLoadProgress: number;
+  readonly transcribing: boolean;
+}
+/**
+ * Prefer the native Web Speech API; fall back to the in-browser Whisper
+ * pipeline. Both hooks are always called (hooks must run unconditionally)
+ * but only the active one is exercised by `start()`.
+ *
+ * If the user calls `start()` while native is supported and native then
+ * errors fatally (typically `network` because Google's STT service is
+ * unreachable), `useSpeechRecognition` flips its `supported` flag and
+ * we transparently restart on Whisper. The caller doesn't have to know.
+ */
+/**
+ * Single STT facade over the native recognizer and the Whisper fallback.
+ *
+ * Both underlying hooks run on every render; `native.supported` decides which
+ * one the controls drive and whose state is returned.
+ *
+ * Automatic fallback is one-shot: it only fires for a listen request that was
+ * actually dispatched to the native backend and is still wanted when native
+ * flips to unsupported. A request that went straight to Whisper is never
+ * restarted by the fallback effect.
+ *
+ * @returns The active backend's state plus backend-agnostic controls.
+ */
+export function useUnifiedSTT(): UnifiedSTT {
+  const native = useSpeechRecognition();
+  const whisper = useWhisperSTT();
+  const usingNative = native.supported;
+  // True from `start()` until the caller stops, aborts or resets.
+  const wantsListenRef = useRef(false);
+  // True while the current listen request was handed to the native backend
+  // and has not yet been taken over by the Whisper fallback.
+  const nativeAttemptRef = useRef(false);
+  // Language of the current listen request, so a fallback restart uses the
+  // same language the native attempt was given.
+  const langRef = useRef<string>("en-US");
+  const start = useCallback(
+    (opts?: { lang?: string }) => {
+      wantsListenRef.current = true;
+      nativeAttemptRef.current = native.supported;
+      // Same default the native hook applies when `lang` is omitted.
+      langRef.current = opts?.lang ?? "en-US";
+      voiceLog("stt.start", {
+        mode: native.supported ? "native" : "whisper",
+        lang: langRef.current,
+      });
+      if (native.supported) {
+        native.start(opts);
+      } else {
+        whisper.start(opts);
+      }
+    },
+    [native, whisper],
+  );
+  const stop = useCallback(() => {
+    wantsListenRef.current = false;
+    nativeAttemptRef.current = false;
+    voiceLog("stt.stop", { mode: usingNative ? "native" : "whisper" });
+    if (usingNative) native.stop();
+    else whisper.stop();
+  }, [usingNative, native, whisper]);
+  const abort = useCallback(() => {
+    wantsListenRef.current = false;
+    nativeAttemptRef.current = false;
+    voiceLog("stt.abort", { mode: usingNative ? "native" : "whisper" });
+    if (usingNative) native.abort();
+    // NOTE(review): whisper mode reuses `stop()` here — confirm it really
+    // drops the pending transcript as the `abort` contract promises.
+    else whisper.stop();
+  }, [usingNative, native, whisper]);
+  const reset = useCallback(() => {
+    wantsListenRef.current = false;
+    nativeAttemptRef.current = false;
+    voiceLog("stt.reset", { mode: usingNative ? "native" : "whisper" });
+    if (usingNative) native.reset();
+    else whisper.reset();
+  }, [usingNative, native, whisper]);
+  /**
+   * If we asked native to listen and it died fatally (mode flipped to
+   * whisper), kick off whisper so the user doesn't have to tap twice.
+   */
+  useEffect(() => {
+    if (usingNative) return;
+    if (!wantsListenRef.current || !nativeAttemptRef.current) return;
+    if (whisper.listening || whisper.transcribing || whisper.error) return;
+    // Consume the pending attempt so later renders cannot start Whisper again.
+    nativeAttemptRef.current = false;
+    voiceLog("stt.fallback-to-whisper");
+    whisper.start({ lang: langRef.current });
+  }, [usingNative, whisper]);
+  if (usingNative) {
+    return {
+      mode: "native",
+      supported: true,
+      listening: native.listening,
+      interimTranscript: native.interimTranscript,
+      finalTranscript: native.finalTranscript,
+      error: native.error,
+      start,
+      stop,
+      abort,
+      reset,
+      // Whisper-only metadata is pinned to its "nothing to wait for" values.
+      modelLoadStatus: "ready",
+      modelLoadProgress: 1,
+      transcribing: false,
+    };
+  }
+  return {
+    mode: "whisper",
+    supported: whisper.supported,
+    listening: whisper.listening,
+    interimTranscript: whisper.interimTranscript,
+    finalTranscript: whisper.finalTranscript,
+    error: whisper.error,
+    start,
+    stop,
+    abort,
+    reset,
+    modelLoadStatus: whisper.modelLoadStatus,
+    modelLoadProgress: whisper.modelLoadProgress,
+    transcribing: whisper.transcribing,
+  };
+}