npm - @kpritam/grimoire-output-docusaurus - Versions diffs - 0.1.8 - Mend

@kpritam/grimoire-output-docusaurus 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

package/templates/spellbook/src/components/SpellbookChat/useWhisperSTT.ts ADDED Viewed

@@ -0,0 +1,411 @@
+import { useCallback, useEffect, useRef, useState } from "react";
+import { transcriptDebug, voiceLog } from "./voiceDebug";
+/**
+ * Whisper-based STT fallback for browsers without the native Web Speech API
+ * (Firefox, some Chromium derivatives). Single-shot: tap to record, tap
+ * again to stop, transcript arrives after a Whisper pass. Uses the same
+ * `@huggingface/transformers` runtime as the embedder. The model (~40 MB
+ * quantized) downloads once and is browser-cached thereafter.
+ */
+/** Load state of the shared Whisper pipeline as tracked by the hook. */
+export type WhisperLoadStatus = "idle" | "loading" | "ready" | "error";
+/** State and controls returned by `useWhisperSTT`. */
+export interface UseWhisperSTTResult {
+  /** True when `MediaRecorder`, `getUserMedia` and an `AudioContext` are all available. */
+  readonly supported: boolean;
+  /** True from the moment the recorder has started until it stops. */
+  readonly listening: boolean;
+  /** Always the empty string in this hook: Whisper runs once on the finished recording. */
+  readonly interimTranscript: string;
+  /** Trimmed text of the last successful transcription; cleared by `start` and `reset`. */
+  readonly finalTranscript: string;
+  /** Human-readable message for the most recent failure, or `null`. */
+  readonly error: string | null;
+  /** Loads the model if needed, asks for the microphone and begins recording. `opts` is currently ignored. */
+  readonly start: (opts?: { lang?: string }) => void;
+  /** Stops the active recording (which triggers transcription) or cancels a start that is still pending. */
+  readonly stop: () => void;
+  /** Abandons any in-flight work and clears transcript, error, listening and transcribing state. */
+  readonly reset: () => void;
+  /** Lifecycle of the on-disk Whisper pipeline. */
+  readonly modelLoadStatus: WhisperLoadStatus;
+  /** 0 → 1 progress for the initial model download; meaningless after `ready`. */
+  readonly modelLoadProgress: number;
+  /** True while Whisper is post-processing the captured audio. */
+  readonly transcribing: boolean;
+}
+/**
+ * The only part of the speech-recognition pipeline this module relies on:
+ * a function that takes PCM samples plus free-form options and resolves to
+ * an object that may carry the recognised `text`.
+ */
+type AnyTransformersPipeline = (
+  input: Float32Array,
+  options?: Record<string, unknown>,
+) => Promise<{ text?: string }>;
+/** Shared in-flight (or resolved) pipeline load; set back to `null` when a load fails so it can be retried. */
+let whisperPipelinePromise: Promise<AnyTransformersPipeline> | null = null;
+/** Progress callbacks of every caller currently awaiting the shared load. */
+const whisperProgressListeners = new Set<(progress: number) => void>();
+/**
+ * Load the Whisper speech-recognition pipeline, memoized at module scope so
+ * `start()` calls and other hook instances share one pipeline.
+ *
+ * The download is started by the first caller only, but every caller's
+ * `onProgress` is registered for as long as that caller is waiting. (If only
+ * the first callback were wired into the download, a second `start()` issued
+ * while the model is still downloading would never see progress.)
+ *
+ * @param onProgress Receives download progress as a fraction in [0, 1].
+ * @returns The shared pipeline function.
+ * @throws Whatever the dynamic import or pipeline construction rejects with;
+ *   after a failure the next call starts a fresh load.
+ */
+async function loadWhisperPipeline(
+  onProgress: (progress: number) => void,
+): Promise<AnyTransformersPipeline> {
+  if (!whisperPipelinePromise) {
+    const pending = (async () => {
+      const transformers = await import("@huggingface/transformers");
+      const pipelineFn = transformers.pipeline as unknown as (
+        task: string,
+        model: string,
+        opts?: { progress_callback?: (info: unknown) => void },
+      ) => Promise<AnyTransformersPipeline>;
+      return pipelineFn("automatic-speech-recognition", "Xenova/whisper-tiny.en", {
+        progress_callback: (info: unknown) => {
+          if (!info || typeof info !== "object") return;
+          const o = info as { status?: string; progress?: number };
+          if (
+            (o.status === "progress" || o.status === "progress_total") &&
+            typeof o.progress === "number"
+          ) {
+            // The reported value is divided by 100 and clamped to [0, 1].
+            const fraction = Math.max(0, Math.min(1, o.progress / 100));
+            for (const listener of whisperProgressListeners) {
+              listener(fraction);
+            }
+          }
+        },
+      });
+    })();
+    whisperPipelinePromise = pending;
+    pending.catch(() => {
+      // Only forget the load that actually failed, never a newer retry.
+      if (whisperPipelinePromise === pending) {
+        whisperPipelinePromise = null;
+      }
+    });
+  }
+  const shared = whisperPipelinePromise;
+  whisperProgressListeners.add(onProgress);
+  try {
+    return await shared;
+  } finally {
+    // Stop notifying this caller once its wait is over, success or failure.
+    whisperProgressListeners.delete(onProgress);
+  }
+}
+/**
+ * Turn a recorded audio blob into the mono, 16 kHz Float32 samples that
+ * Whisper expects.
+ *
+ * The blob is decoded with a temporary `AudioContext` (falling back to
+ * `webkitAudioContext`). Audio that is already mono at 16 kHz is copied out
+ * directly; anything else is rendered through a one-channel 16 kHz
+ * `OfflineAudioContext`.
+ *
+ * @param blob Encoded audio as produced by the recorder.
+ * @returns A fresh `Float32Array` of samples.
+ * @throws Error when no `AudioContext` implementation exists, or whatever
+ *   decoding/rendering rejects with.
+ */
+async function decodeAndResample(blob: Blob): Promise<Float32Array> {
+  const encoded = await blob.arrayBuffer();
+  const legacyHost = window as unknown as {
+    webkitAudioContext?: typeof AudioContext;
+  };
+  const AudioContextCtor = window.AudioContext ?? legacyHost.webkitAudioContext;
+  if (!AudioContextCtor) {
+    throw new Error("AudioContext is not supported in this browser.");
+  }
+  const decoder = new AudioContextCtor();
+  try {
+    // Decode a copy of the buffer rather than the original.
+    const decoded = await decoder.decodeAudioData(encoded.slice(0));
+    const needsResample = !(
+      decoded.sampleRate === 16000 && decoded.numberOfChannels === 1
+    );
+    if (!needsResample) {
+      return new Float32Array(decoded.getChannelData(0));
+    }
+    // At least one frame, so zero-length audio still yields a valid context.
+    const frameCount = Math.max(1, Math.ceil(decoded.duration * 16000));
+    const renderer = new OfflineAudioContext(1, frameCount, 16000);
+    const player = renderer.createBufferSource();
+    player.buffer = decoded;
+    player.connect(renderer.destination);
+    player.start(0);
+    const rendered = await renderer.startRendering();
+    return new Float32Array(rendered.getChannelData(0));
+  } finally {
+    // Best effort: a failure to close the decoder must not mask the result.
+    decoder.close().catch(() => {
+      /* ignore */
+    });
+  }
+}
+/**
+ * Report whether this environment has everything the Whisper fallback needs:
+ * a `window`, `MediaRecorder`, `navigator.mediaDevices.getUserMedia`, and an
+ * `AudioContext` (standard or `webkit`-prefixed). Returns false during SSR.
+ */
+function detectSupported(): boolean {
+  if (typeof window === "undefined" || typeof MediaRecorder === "undefined") {
+    return false;
+  }
+  const canCapture = Boolean(navigator?.mediaDevices?.getUserMedia);
+  if (!canCapture) return false;
+  const prefixedCtx = (window as unknown as { webkitAudioContext?: unknown })
+    .webkitAudioContext;
+  return (
+    typeof window.AudioContext !== "undefined" ||
+    typeof prefixedCtx !== "undefined"
+  );
+}
+/**
+ * Choose the first recording container/codec, in preference order, that
+ * `MediaRecorder.isTypeSupported` accepts.
+ *
+ * @returns The chosen MIME type, or `undefined` when `MediaRecorder` is
+ *   missing or none of the candidates is supported (callers then construct
+ *   the recorder without an explicit type).
+ */
+function pickRecorderMimeType(): string | undefined {
+  if (typeof MediaRecorder === "undefined") return undefined;
+  const preferred = [
+    "audio/webm;codecs=opus",
+    "audio/webm",
+    "audio/mp4",
+    "audio/ogg;codecs=opus",
+  ];
+  return preferred.find((mime) => {
+    try {
+      return MediaRecorder.isTypeSupported(mime);
+    } catch {
+      // A throwing probe counts as "not supported"; try the next candidate.
+      return false;
+    }
+  });
+}
+/**
+ * React hook exposing single-shot, Whisper-backed speech-to-text.
+ *
+ * `start()` loads the shared Whisper pipeline if needed, requests the
+ * microphone and begins recording. `stop()` ends the recording, after which
+ * the captured audio is decoded, transcribed and published as
+ * `finalTranscript`. `reset()` abandons whatever is in flight and clears
+ * transcript and error state.
+ *
+ * Every `start()` takes a sequence number from `startSeqRef`. Anything that
+ * cancels a session (`stop()` with no active recorder, `reset()`, a newer
+ * `start()`, unmount) bumps the counter, and each async continuation compares
+ * the counter with its own number before touching state, so stale work is
+ * dropped.
+ *
+ * @returns The state and controls described by `UseWhisperSTTResult`.
+ *   `interimTranscript` is always the empty string because Whisper runs once
+ *   on the finished recording.
+ */
+export function useWhisperSTT(): UseWhisperSTTResult {
+  // Lazy initial state — same rationale as `useSpeechRecognition`. Without
+  // this, the first render reports `supported: false` and the unified hook
+  // briefly shows the "not supported" caption before flipping to true on
+  // the next render, which the user can race past with a fast click.
+  const [supported, setSupported] = useState<boolean>(() => detectSupported());
+  const [listening, setListening] = useState(false);
+  const [finalTranscript, setFinalTranscript] = useState("");
+  const [error, setError] = useState<string | null>(null);
+  const [modelLoadStatus, setModelLoadStatus] =
+    useState<WhisperLoadStatus>("idle");
+  const [modelLoadProgress, setModelLoadProgress] = useState(0);
+  const [transcribing, setTranscribing] = useState(false);
+  const recorderRef = useRef<MediaRecorder | null>(null);
+  const streamRef = useRef<MediaStream | null>(null);
+  const chunksRef = useRef<Blob[]>([]);
+  // Monotonic session counter; see the hook documentation.
+  const startSeqRef = useRef(0);
+  // Re-check support after mount in case the lazy initial value was computed
+  // where the browser APIs were unavailable.
+  useEffect(() => {
+    const next = detectSupported();
+    setSupported((prev) => (prev === next ? prev : next));
+  }, []);
+  /** Stop every track of the current microphone stream and forget it. */
+  const cleanupTracks = useCallback(() => {
+    streamRef.current?.getTracks().forEach((t) => {
+      try {
+        t.stop();
+      } catch {
+        /* ignore */
+      }
+    });
+    streamRef.current = null;
+  }, []);
+  /** Stop the recorder if it is still running and forget it. */
+  const cleanupRecorder = useCallback(() => {
+    const recorder = recorderRef.current;
+    if (recorder && recorder.state !== "inactive") {
+      try {
+        recorder.stop();
+      } catch {
+        /* ignore */
+      }
+    }
+    recorderRef.current = null;
+  }, []);
+  useEffect(() => {
+    return () => {
+      // Invalidate in-flight work first: stopping the recorder below still
+      // fires `onstop`, which must not start a Whisper pass after unmount.
+      startSeqRef.current += 1;
+      cleanupRecorder();
+      cleanupTracks();
+      chunksRef.current = [];
+    };
+  }, [cleanupRecorder, cleanupTracks]);
+  const stop = useCallback(() => {
+    const recorder = recorderRef.current;
+    voiceLog("whisper.stop", { state: recorder?.state ?? "none" });
+    if (!recorder) {
+      // No recorder: cancel a start that is still loading/prompting, or a
+      // transcription that is still running. The cancelled transcription's
+      // `finally` will see a stale sequence number and leave `transcribing`
+      // alone, so it has to be cleared here.
+      startSeqRef.current += 1;
+      setTranscribing(false);
+    }
+    if (recorder && recorder.state !== "inactive") {
+      try {
+        recorder.stop();
+      } catch {
+        /* ignore */
+      }
+    }
+    setListening(false);
+  }, []);
+  const reset = useCallback(() => {
+    voiceLog("whisper.reset");
+    startSeqRef.current += 1;
+    cleanupRecorder();
+    cleanupTracks();
+    chunksRef.current = [];
+    setFinalTranscript("");
+    setError(null);
+    setListening(false);
+    setTranscribing(false);
+  }, [cleanupRecorder, cleanupTracks]);
+  const start = useCallback(
+    // `_opts.lang` is accepted for interface parity but unused: the model
+    // loaded by `loadWhisperPipeline` is "Xenova/whisper-tiny.en".
+    async (_opts?: { lang?: string }) => {
+      if (!supported) {
+        voiceLog("whisper.unsupported");
+        setError("Voice input is not supported in this browser.");
+        return;
+      }
+      voiceLog("whisper.start", { modelLoadStatus });
+      setError(null);
+      setFinalTranscript("");
+      chunksRef.current = [];
+      const seq = ++startSeqRef.current;
+      // Bumping the sequence orphans any transcription still running for the
+      // previous session; clear its flag so it cannot stay stuck at true.
+      setTranscribing(false);
+      if (modelLoadStatus !== "ready") {
+        voiceLog("whisper.model.loading");
+        setModelLoadStatus("loading");
+        setModelLoadProgress(0);
+        try {
+          await loadWhisperPipeline((p) => {
+            if (startSeqRef.current === seq) {
+              setModelLoadProgress(p);
+            }
+          });
+          // The pipeline is shared at module scope, so it is ready even if
+          // this particular start was superseded meanwhile. Publish that
+          // before bailing out so the status is not left at "loading".
+          voiceLog("whisper.model.ready");
+          setModelLoadStatus("ready");
+          setModelLoadProgress(1);
+          if (startSeqRef.current !== seq) return;
+        } catch (e) {
+          voiceLog("whisper.model.error", {
+            message: e instanceof Error ? e.message : String(e),
+          });
+          setModelLoadStatus("error");
+          setError(
+            `Could not load voice model: ${
+              e instanceof Error ? e.message : String(e)
+            }`,
+          );
+          return;
+        }
+      }
+      let stream: MediaStream;
+      try {
+        voiceLog("whisper.media.request");
+        stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+      } catch (e) {
+        const name = (e as DOMException)?.name;
+        voiceLog("whisper.media.error", {
+          name,
+          message: e instanceof Error ? e.message : String(e),
+        });
+        setError(
+          name === "NotAllowedError" || name === "SecurityError"
+            ? "Microphone permission denied"
+            : `Could not access microphone: ${
+                e instanceof Error ? e.message : String(e)
+              }`,
+        );
+        return;
+      }
+      if (startSeqRef.current !== seq) {
+        // Cancelled while the permission prompt was open: release the mic.
+        stream.getTracks().forEach((t) => t.stop());
+        return;
+      }
+      streamRef.current = stream;
+      const mimeType = pickRecorderMimeType();
+      let recorder: MediaRecorder;
+      try {
+        recorder = mimeType
+          ? new MediaRecorder(stream, { mimeType })
+          : new MediaRecorder(stream);
+      } catch (e) {
+        cleanupTracks();
+        voiceLog("whisper.recorder.error", {
+          message: e instanceof Error ? e.message : String(e),
+        });
+        setError(
+          `Could not start recorder: ${
+            e instanceof Error ? e.message : String(e)
+          }`,
+        );
+        return;
+      }
+      recorderRef.current = recorder;
+      recorder.ondataavailable = (ev: BlobEvent) => {
+        if (ev.data && ev.data.size > 0) {
+          voiceLog("whisper.recorder.data", { size: ev.data.size });
+          chunksRef.current.push(ev.data);
+        }
+      };
+      recorder.onstop = async () => {
+        // Take ownership of the captured chunks and release the microphone
+        // before any slow work starts.
+        const chunks = chunksRef.current;
+        chunksRef.current = [];
+        cleanupTracks();
+        recorderRef.current = null;
+        setListening(false);
+        if (chunks.length === 0 || startSeqRef.current !== seq) {
+          voiceLog("whisper.stop.empty", { chunks: chunks.length });
+          return;
+        }
+        const blobType = chunks[0]?.type || mimeType || "audio/webm";
+        const blob = new Blob(chunks, { type: blobType });
+        voiceLog("whisper.stop.blob", {
+          size: blob.size,
+          type: blob.type,
+          chunks: chunks.length,
+        });
+        // Recordings under 1 KiB are dropped without attempting a transcript.
+        if (blob.size < 1024) {
+          voiceLog("whisper.stop.too-small", { size: blob.size });
+          return;
+        }
+        setTranscribing(true);
+        try {
+          voiceLog("whisper.transcribe.start");
+          const pipe = await loadWhisperPipeline(() => {});
+          if (startSeqRef.current !== seq) return;
+          const pcm = await decodeAndResample(blob);
+          if (startSeqRef.current !== seq) return;
+          const result = await pipe(pcm, {
+            language: "english",
+            task: "transcribe",
+          });
+          if (startSeqRef.current !== seq) return;
+          const text = (result?.text ?? "").trim();
+          voiceLog("whisper.transcribe.done", transcriptDebug(text));
+          if (text) {
+            setFinalTranscript(text);
+          }
+        } catch (e) {
+          voiceLog("whisper.transcribe.error", {
+            message: e instanceof Error ? e.message : String(e),
+          });
+          setError(
+            `Could not transcribe audio: ${
+              e instanceof Error ? e.message : String(e)
+            }`,
+          );
+        } finally {
+          // Only the current session may clear the flag; whoever cancelled a
+          // stale session (`start`, `stop`, `reset`) has already cleared it.
+          if (startSeqRef.current === seq) {
+            setTranscribing(false);
+          }
+        }
+      };
+      recorder.onerror = () => {
+        voiceLog("whisper.recorder.onerror");
+        setError("Recorder error.");
+        cleanupTracks();
+        recorderRef.current = null;
+        setListening(false);
+      };
+      try {
+        recorder.start();
+        voiceLog("whisper.recorder.started", { mimeType: recorder.mimeType });
+        setListening(true);
+      } catch (e) {
+        cleanupTracks();
+        // The recorder never started; do not leave a dead instance behind,
+        // otherwise a later `stop()` would find it and skip cancellation.
+        recorderRef.current = null;
+        voiceLog("whisper.recorder.start-error", {
+          message: e instanceof Error ? e.message : String(e),
+        });
+        setError(
+          `Could not start recording: ${
+            e instanceof Error ? e.message : String(e)
+          }`,
+        );
+      }
+    },
+    [supported, modelLoadStatus, cleanupTracks],
+  );
+  return {
+    supported,
+    listening,
+    interimTranscript: "",
+    finalTranscript,
+    error,
+    start,
+    stop,
+    reset,
+    modelLoadStatus,
+    modelLoadProgress,
+    transcribing,
+  };
+}

package/templates/spellbook/src/components/SpellbookChat/vad-ssr-stub.ts ADDED Viewed

@@ -0,0 +1,25 @@
+/**
+ * SSR / server webpack target: the real `@ricky0123/vad-web` pulls
+ * `onnxruntime-web` + an AudioWorklet asset path that Webpack must not
+ * try to parse on the server. The chat only loads this module on the
+ * client (lazy import inside `useSileroVAD`).
+ */
+/** Message carried by every rejection this stub produces. */
+const CLIENT_ONLY_MESSAGE = "@ricky0123/vad-web is client-only";
+/**
+ * Stand-in for the `MicVAD` export on the server build. Creating or starting
+ * it always rejects; `pause` and `destroy` resolve without doing anything.
+ */
+export class MicVAD {
+  /** Always rejects: there is no VAD on the server. */
+  static new(): Promise<MicVAD> {
+    return Promise.reject(new Error(CLIENT_ONLY_MESSAGE));
+  }
+  /** Always rejects, for the same reason as `new`. */
+  start = (): Promise<void> => Promise.reject(new Error(CLIENT_ONLY_MESSAGE));
+  /** No-op that resolves immediately. */
+  pause = (): Promise<void> => Promise.resolve();
+  /** No-op that resolves immediately. */
+  destroy = (): Promise<void> => Promise.resolve();
+  listening = false;
+  errored: string | null = null;
+}
+/** Placeholder value for the `DEFAULT_MODEL` export. */
+export const DEFAULT_MODEL = "legacy" as const;

package/templates/spellbook/src/components/SpellbookChat/voiceDebug.ts ADDED Viewed

@@ -0,0 +1,60 @@
+/**
+ * Opt-in debug logging for the voice pipeline.
+ *
+ * The voice state machine is notoriously hard to reason about (phase
+ * transitions, STT modes, TTS queueing, abort timing), so the dev
+ * surface has a firehose of `voiceLog` calls. In production those
+ * logs would:
+ *   - spam the end user's console
+ *   - expose transcript previews to any browser extension that can
+ *     read `console.debug` output
+ *
+ * Both are undesirable, so logging is silent by default and only lights
+ * up when the consumer opts in, either persistently via the DevTools
+ * (`localStorage` is shared by every tab on the same origin):
+ *
+ *     localStorage.setItem("grimoire.chat.debug", "1")
+ *     location.reload()
+ *
+ * …or for a single page load by setting `window.__grimoireChatDebug = true`
+ * before this module is first evaluated (e.g. from an inline script).
+ * Both flags are read exactly once, at module load, so changing either
+ * one has no effect until the page is reloaded.
+ */
+/** Tag placed in front of every voice debug line. */
+const PREFIX = "[chat:voice]";
+/**
+ * Evaluate the two opt-in switches: `window.__grimoireChatDebug === true`,
+ * then `localStorage["grimoire.chat.debug"] === "1"`. Returns false when
+ * there is no `window` or when reading either switch throws.
+ */
+function readDebugFlag(): boolean {
+  if (typeof window === "undefined") return false;
+  try {
+    const host = window as unknown as { __grimoireChatDebug?: boolean };
+    return (
+      host.__grimoireChatDebug === true ||
+      window.localStorage?.getItem("grimoire.chat.debug") === "1"
+    );
+  } catch {
+    // SSR, storage-disabled iframes, or sandboxed contexts — silent.
+    return false;
+  }
+}
+/** Computed once when the module is evaluated. */
+const debugEnabled = readDebugFlag();
+/**
+ * Emit one `console.debug` line for the voice pipeline, or do nothing when
+ * debug logging is off.
+ *
+ * @param event Short event name.
+ * @param details Optional structured payload; an empty object is logged
+ *   when omitted.
+ */
+export function voiceLog(
+  event: string,
+  details?: Record<string, unknown>,
+): void {
+  if (debugEnabled) {
+    console.debug(PREFIX, event, details ?? {});
+  }
+}
+/**
+ * Collapse a transcript into `{ length, preview }` for the log sink. We
+ * never log the full transcript; at most the first 80 characters, and
+ * only when debug is on (see `voiceLog`).
+ *
+ * @param text Raw transcript; surrounding whitespace is ignored.
+ * @returns `length` of the trimmed transcript in UTF-16 code units, and a
+ *   `preview` that is the whole trimmed transcript when it fits in 80 code
+ *   units, otherwise its beginning followed by "...".
+ */
+export function transcriptDebug(text: string): {
+  readonly length: number;
+  readonly preview: string;
+} {
+  const maxPreview = 80;
+  const trimmed = text.trim();
+  if (trimmed.length <= maxPreview) {
+    return { length: trimmed.length, preview: trimmed };
+  }
+  // If the cut would land between the two halves of a surrogate pair (an
+  // emoji, for example), cut one unit earlier so the preview never ends in
+  // a lone high surrogate.
+  const lastUnit = trimmed.charCodeAt(maxPreview - 1);
+  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff;
+  const cut = endsOnHighSurrogate ? maxPreview - 1 : maxPreview;
+  return {
+    length: trimmed.length,
+    preview: `${trimmed.slice(0, cut)}...`,
+  };
+}

package/templates/spellbook/src/components/SpellbookChat/voiceFsm.ts ADDED Viewed

@@ -0,0 +1,196 @@
+/**
+ * Voice mode finite-state machine.
+ *
+ * The four UI phases (`idle`, `listening`, `thinking`, `speaking`) are the
+ * states the user can observe; the machine here makes the *transitions*
+ * between them explicit and disallows illegal jumps. Side effects
+ * (timers, AbortControllers, mic streams) are NOT modelled here — those
+ * are owned by VoiceMode and reacted to via the `effects` array each
+ * transition emits. Keeping effects out of the reducer keeps it pure and
+ * testable without a real DOM.
+ *
+ * Transition table:
+ *
+ *   idle      ──MIC_PRESS──>  listening   (start STT + VAD)
+ *   listening ──VAD_END / PHRASE_END / MIC_PRESS / NATIVE_FINAL_RECEIVED──> thinking
+ *   listening ──STT_ERROR──> idle (with error)
+ *   thinking  ──TOKEN_RECEIVED──> speaking (first sentence enqueued)
+ *   thinking  ──ASK_DONE──>  idle       (only when `hadAudibleReply` is false)
+ *   thinking  ──ASK_ERROR──> idle       (with error)
+ *   thinking  ──MIC_PRESS──> listening  (interrupt; aborts the ask)
+ *   speaking  ──TTS_DRAIN──> idle
+ *   speaking  ──ASK_ERROR──> idle       (with error)
+ *   speaking  ──MIC_PRESS──> listening  (barge-in; aborts the ask, cancels TTS)
+ *   *         ──CANCEL──>    idle       (hard reset; runs cleanup effects)
+ *
+ * Any event not listed for the current phase is ignored: the state is
+ * returned unchanged with no effects.
+ *
+ * The FSM state carries only `phase` and `error`. Anything else (assistant
+ * text, citations, mic permission state) belongs to the component.
+ */
+/** The user-observable phases of voice mode. */
+export type VoiceUiPhase = "idle" | "listening" | "thinking" | "speaking";
+/** Everything the reducer tracks: the current phase and the last error. */
+export interface VoiceFsmState {
+  readonly phase: VoiceUiPhase;
+  /** Message from the `STT_ERROR` / `ASK_ERROR` that led to this state; `null` after any other transition. */
+  readonly error: string | null;
+}
+/** Events accepted by `voiceFsmReduce`, discriminated by `type`. */
+export type VoiceFsmEvent =
+  | { readonly type: "MIC_PRESS" }
+  | { readonly type: "VAD_END" }
+  | { readonly type: "PHRASE_END" }
+  | { readonly type: "NATIVE_FINAL_RECEIVED" }
+  | { readonly type: "STT_ERROR"; readonly message: string }
+  | { readonly type: "TOKEN_RECEIVED" }
+  | { readonly type: "ASK_DONE"; readonly hadAudibleReply: boolean }
+  | { readonly type: "ASK_ERROR"; readonly message: string }
+  | { readonly type: "TTS_DRAIN" }
+  | { readonly type: "CANCEL" };
+/**
+ * Names of the side effects a transition can request. The reducer only
+ * emits the names; whoever consumes the transition performs the work.
+ */
+export type VoiceFsmEffect =
+  | "start-stt"
+  | "stop-stt-graceful"
+  | "stop-stt-hard"
+  | "start-vad"
+  | "stop-vad"
+  | "abort-ask"
+  | "cancel-tts"
+  | "clear-timers"
+  | "reset-transcripts";
+/** Result of one reducer step: the next state plus the effects to run. */
+export interface VoiceFsmTransition {
+  readonly state: VoiceFsmState;
+  readonly effects: readonly VoiceFsmEffect[];
+}
+/** Starting point of the machine: idle, with no error. */
+export const initialVoiceState: VoiceFsmState = {
+  phase: "idle",
+  error: null,
+};
+/**
+ * Build a transition into `phase`. `effects` defaults to none and `error`
+ * to `null`, so every transition built here replaces the previous error.
+ */
+function into(
+  phase: VoiceUiPhase,
+  effects: readonly VoiceFsmEffect[] = [],
+  error: string | null = null,
+): VoiceFsmTransition {
+  return { state: { phase, error }, effects };
+}
+/**
+ * Pure reducer for voice mode: given the current state and an event, return
+ * the next state together with the coarse effect names the caller should
+ * run after committing it. No timers, streams or controllers are touched
+ * here, which keeps the function testable without a DOM.
+ *
+ * Events that are not meaningful in the current phase return the very same
+ * `state` object with an empty effect list.
+ *
+ * @param state Current FSM state.
+ * @param ev Event to apply.
+ * @returns The resulting state and the effects to fire.
+ */
+export function voiceFsmReduce(
+  state: VoiceFsmState,
+  ev: VoiceFsmEvent,
+): VoiceFsmTransition {
+  // Event ignored in this phase: keep the state object as-is.
+  const unchanged = (): VoiceFsmTransition => ({ state, effects: [] });
+  // The utterance is over (by VAD, phrase timer, mic tap or a native final).
+  const endOfUtterance = (): VoiceFsmTransition =>
+    into("thinking", ["stop-stt-graceful", "stop-vad", "clear-timers"]);
+  // The user starts a new turn while a reply is pending or being spoken.
+  const restartListening = (): VoiceFsmTransition =>
+    into("listening", [
+      "abort-ask",
+      "cancel-tts",
+      "reset-transcripts",
+      "start-stt",
+      "start-vad",
+    ]);
+  // CANCEL wins in every phase and tears everything down.
+  if (ev.type === "CANCEL") {
+    return into("idle", [
+      "abort-ask",
+      "cancel-tts",
+      "stop-stt-hard",
+      "stop-vad",
+      "clear-timers",
+      "reset-transcripts",
+    ]);
+  }
+  if (state.phase === "idle") {
+    return ev.type === "MIC_PRESS"
+      ? into("listening", ["reset-transcripts", "start-stt", "start-vad"])
+      : unchanged();
+  }
+  if (state.phase === "listening") {
+    if (ev.type === "STT_ERROR") {
+      return into(
+        "idle",
+        ["stop-vad", "clear-timers", "reset-transcripts"],
+        ev.message,
+      );
+    }
+    // A native final arrives unprompted and is treated exactly like an
+    // explicit end of phrase.
+    const utteranceEnded =
+      ev.type === "VAD_END" ||
+      ev.type === "PHRASE_END" ||
+      ev.type === "MIC_PRESS" ||
+      ev.type === "NATIVE_FINAL_RECEIVED";
+    return utteranceEnded ? endOfUtterance() : unchanged();
+  }
+  if (state.phase === "thinking") {
+    if (ev.type === "TOKEN_RECEIVED") return into("speaking");
+    if (ev.type === "ASK_DONE") {
+      // With no audible reply TTS never started, so go straight to idle;
+      // otherwise keep waiting (defensive: the first token normally arrives
+      // before ASK_DONE).
+      return ev.hadAudibleReply ? unchanged() : into("idle");
+    }
+    if (ev.type === "ASK_ERROR") return into("idle", ["abort-ask"], ev.message);
+    if (ev.type === "MIC_PRESS") return restartListening();
+    return unchanged();
+  }
+  if (state.phase === "speaking") {
+    if (ev.type === "TTS_DRAIN") return into("idle");
+    if (ev.type === "ASK_ERROR") {
+      return into("idle", ["abort-ask", "cancel-tts"], ev.message);
+    }
+    if (ev.type === "MIC_PRESS") return restartListening();
+    // Further TOKEN_RECEIVED events while speaking need no transition; the
+    // component forwards the text to TTS itself.
+    return unchanged();
+  }
+  // Exhaustiveness: TS will flag if we add a phase without handling it.
+  const _exhaustive: never = state.phase;
+  return unchanged();
+}