npm - @kpritam/grimoire-output-docusaurus - Versions diffs - 0.1.8 - Mend

@kpritam/grimoire-output-docusaurus 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

package/templates/spellbook/src/components/SpellbookChat/VoiceMode.tsx ADDED Viewed

@@ -0,0 +1,867 @@
+import React, {
+  forwardRef,
+  useCallback,
+  useEffect,
+  useImperativeHandle,
+  useMemo,
+  useRef,
+  useState,
+} from "react";
+import type { ChatEngine } from "./ChatEngine";
+import { useSileroVAD } from "./useSileroVAD";
+import { useSpeechSynthesis } from "./useSpeechSynthesis";
+import { useUnifiedSTT } from "./useUnifiedSTT";
+import { transcriptDebug, voiceLog } from "./voiceDebug";
+import styles from "./voiceStyles.module.css";
+export type VoiceUiPhase = "idle" | "listening" | "thinking" | "speaking"; // UI phase of a voice turn; `VoiceMode` holds exactly one of these in state, starting at "idle".
+/**
+ * Fallback phrase-end timeout (milliseconds) used when Silero VAD is
+ * unavailable or its model is still downloading. Halved from the original
+ * (1.2 s) to 600 ms because even the timer fallback should feel snappier —
+ * and when VAD is available, this timer never fires (VAD's `onSpeechEnd`
+ * short-circuits it with a confident speech-end signal in ~250 ms).
+ */
+const PHRASE_END_MS = 600;
+const NATIVE_STOP_GRACE_MS = 2500; // Grace period (ms) after native recognition is asked to stop; once it elapses, whatever transcript is in state is submitted.
+export interface VoiceModeHandle {
+  /**
+   * Hard-stop voice mode: abort the in-flight ask, silence TTS, drop any
+   * pending STT transcript, clear timers, and reset to idle. Safe to call
+   * repeatedly — all operations are idempotent.
+   *
+   * Called by the parent chat panel on Close / Clear / mode-switch so TTS
+   * can never outlive the UI the user just dismissed.
+   *
+   * Exposed through the ref forwarded to `VoiceMode`; the handle always
+   * delegates to the component's latest `cancel` callback.
+   */
+  readonly cancel: () => void;
+}
+export interface VoiceModeProps {
+  /**
+   * The chat engine instance. MUST be the same instance the parent chat
+   * panel is using — sharing state keeps the mic availability and the
+   * panel header status in agreement.
+   */
+  readonly engine: ChatEngine;
+  readonly onTranscriptUpdate?: (entry: { // Optional observer of the voice transcript: called once per submitted question and on every assistant update.
+    role: "user" | "assistant"; // Who produced `text`.
+    text: string; // Full text so far (for the assistant: the accumulated answer, not just the latest token).
+    partial: boolean; // true while the assistant answer is still streaming; false for user questions and the final answer.
+  }) => void;
+}
+/**
+ * Splits every complete speech chunk off the front of a streaming text
+ * buffer and hands each one to `enqueue`, in order.
+ *
+ * A chunk ends at whichever comes first:
+ *  - a `.`, `!` or `?` that is followed by whitespace or the end of the
+ *    buffer, or
+ *  - a blank-line paragraph break (two or more consecutive `\n`).
+ *
+ * Both patterns may cross single newlines, and leading whitespace is
+ * discarded before matching. Without that, a buffer that begins with a
+ * newline (e.g. the `\n\n` token that arrives right after a sentence was
+ * already flushed) or whose first line has no terminal punctuation (lists,
+ * headings) could never be matched, so nothing was spoken until the whole
+ * answer had streamed in.
+ *
+ * @param buffer  Text accumulated since the last flush.
+ * @param enqueue Receives each complete chunk, trimmed of surrounding
+ *                whitespace.
+ * @returns The unfinished remainder (without leading whitespace) to keep
+ *          buffering.
+ */
+function flushStreamBuffer(
+  buffer: string,
+  enqueue: (s: string) => void,
+): string {
+  // `[\s\S]` instead of `.` so a match can span line breaks.
+  const sentenceEnd = /^([\s\S]*?[.!?])(?:\s+|$)/;
+  const paragraphBreak = /^([\s\S]*?)\n{2,}/;
+  let rest = buffer;
+  for (;;) {
+    // Inter-chunk whitespace carries no speech and would block the anchors.
+    rest = rest.trimStart();
+    const sentence = sentenceEnd.exec(rest);
+    const para = paragraphBreak.exec(rest);
+    // Prefer the boundary that occurs earliest in the buffer.
+    const sentenceFirst =
+      sentence !== null &&
+      (para === null || sentence[1].length <= para[1].length);
+    const hit = sentenceFirst ? sentence : para;
+    if (hit === null) {
+      return rest;
+    }
+    // After trimStart() the captured chunk is never empty, so every
+    // iteration both enqueues something and consumes at least one character.
+    enqueue(hit[1].trimEnd());
+    rest = rest.slice(hit[0].length);
+  }
+}
+/**
+ * Decorative microphone glyph for the mic button: a stroked 24×24 SVG that
+ * inherits the current text colour and is hidden from assistive technology.
+ */
+function MicIcon(): React.ReactElement {
+  // Path data in paint order: capsule, cradle arc, stem, base.
+  const outlines = [
+    "M12 14a3 3 0 0 0 3-3V5a3 3 0 1 0-6 0v6a3 3 0 0 0 3 3Z",
+    "M19 10v1a7 7 0 0 1-14 0v-1",
+    "M12 19v3",
+    "M8 22h8",
+  ];
+  return (
+    <svg
+      className={styles.micIcon}
+      viewBox="0 0 24 24"
+      fill="none"
+      stroke="currentColor"
+      strokeWidth="1.75"
+      strokeLinecap="round"
+      strokeLinejoin="round"
+      aria-hidden
+    >
+      {outlines.map((d) => (
+        <path key={d} d={d} />
+      ))}
+    </svg>
+  );
+}
+interface StreamingTextHandle { // Imperative handle exposed by `StreamingAssistantText` through its forwarded ref.
+  set: (text: string) => void; // Replaces the displayed assistant text; an empty string renders nothing.
+}
+/**
+ * Leaf component that owns the assistant's streamed answer as local state,
+ * so a new token re-renders only this subtree rather than all of VoiceMode.
+ * The parent pushes text in through the `set` method on the forwarded ref;
+ * while the text is empty the component renders nothing.
+ */
+const StreamingAssistantText = forwardRef<StreamingTextHandle>(
+  function StreamingAssistantText(_props, handleRef) {
+    const [content, setContent] = useState("");
+    // The state setter is stable, so the handle only needs to be built once.
+    useImperativeHandle(handleRef, () => ({ set: setContent }), []);
+    return content ? (
+      <>
+        <span className={styles.transcriptLabel} style={{ marginTop: "0.65rem" }}>
+          Assistant
+        </span>
+        {content}
+      </>
+    ) : null;
+  },
+);
+/**
+ * Voice-mode panel: a single mic button that drives one conversational
+ * turn through four UI phases — `idle` → `listening` (speech-to-text) →
+ * `thinking` (transcript settling, then `engine.ask`) → `speaking`
+ * (sentences streamed into text-to-speech) → `idle`.
+ *
+ * Most mutable inputs (engine, STT, TTS, phase) are mirrored into refs so
+ * timers and async callbacks always read the latest value without having
+ * to be re-created.
+ *
+ * Props:
+ *  - `engine`: chat engine shared with the parent panel.
+ *  - `onTranscriptUpdate`: optional observer of user / assistant text.
+ *
+ * The forwarded ref exposes `cancel()`, which hard-stops the pipeline.
+ */
+const VoiceMode = forwardRef<VoiceModeHandle, VoiceModeProps>(function VoiceMode(
+  { engine, onTranscriptUpdate },
+  ref,
+): React.ReactElement {
+  const stt = useUnifiedSTT();
+  const tts = useSpeechSynthesis();
+  const [phase, setPhase] = useState<VoiceUiPhase>("idle");
+  const [askError, setAskError] = useState<string | null>(null);
+  const [liveUserText, setLiveUserText] = useState("");
+  const assistantTextRef = useRef<StreamingTextHandle | null>(null);
+  const setAssistantText = useCallback((s: string) => {
+    assistantTextRef.current?.set(s);
+  }, []);
+  const sttRef = useRef(stt);
+  sttRef.current = stt;
+  const phraseTimerRef = useRef<number | null>(null);
+  const nativeStopTimerRef = useRef<number | null>(null);
+  // True between asking native recognition to stop and submitting its text.
+  const awaitingNativeStopRef = useRef(false);
+  // Non-null exactly while an `engine.ask` call is in flight.
+  const abortRef = useRef<AbortController | null>(null);
+  const phaseRef = useRef(phase);
+  phaseRef.current = phase;
+  const assistantAccRef = useRef("");
+  const engineRef = useRef(engine);
+  engineRef.current = engine;
+  const ttsRef = useRef(tts);
+  ttsRef.current = tts;
+  // Declared early so `cancel` (defined below) can stop it on hard-stop.
+  // The actual VAD instance is wired up further down once
+  // `stopNativeAndWaitForFinal` is in scope.
+  const vadRef = useRef<{
+    readonly stop: () => Promise<void>;
+    readonly listening: boolean;
+  } | null>(null);
+  /**
+   * Set the phase in both the ref and React state. Updating the ref
+   * synchronously lets code later in the same tick see the new phase.
+   */
+  const phaseSafeSet = useCallback((next: VoiceUiPhase) => {
+    if (phaseRef.current !== next) {
+      voiceLog("phase", { from: phaseRef.current, to: next });
+    }
+    phaseRef.current = next;
+    setPhase(next);
+  }, []);
+  const engineBlocked =
+    engine.state === "missing-key" || engine.state === "error";
+  const showEngineWait =
+    !engineBlocked &&
+    engine.state !== "ready" &&
+    (engine.state === "idle" ||
+      engine.state === "loading-bundle" ||
+      engine.state === "loading-model");
+  const micBlocked =
+    !stt.supported ||
+    engineBlocked ||
+    (engine.state !== "ready" && phase === "idle");
+  const clearPhraseTimer = useCallback(() => {
+    if (phraseTimerRef.current != null) {
+      window.clearTimeout(phraseTimerRef.current);
+      phraseTimerRef.current = null;
+    }
+  }, []);
+  const clearNativeStopTimer = useCallback(() => {
+    if (nativeStopTimerRef.current != null) {
+      window.clearTimeout(nativeStopTimerRef.current);
+      nativeStopTimerRef.current = null;
+    }
+  }, []);
+  /**
+   * Hard-stop the whole voice pipeline. Drains the ask stream, flushes TTS,
+   * drops any pending STT transcript, clears timers, and puts the UI back
+   * in idle. Shared by the Stop button, the "Interrupt" path on mic click,
+   * the parent-facing imperative handle, and the unmount cleanup.
+   */
+  const cancel = useCallback(() => {
+    voiceLog("voice.cancel", {
+      phase: phaseRef.current,
+      mode: sttRef.current.mode,
+    });
+    abortRef.current?.abort();
+    abortRef.current = null;
+    ttsRef.current.cancel();
+    sttRef.current.abort();
+    // Pause (don't destroy) so the next mic tap doesn't pay the Silero
+    // model load cost again. The unmount effect tears it down fully.
+    void vadRef.current?.stop();
+    clearPhraseTimer();
+    clearNativeStopTimer();
+    awaitingNativeStopRef.current = false;
+    assistantAccRef.current = "";
+    setAssistantText("");
+    setLiveUserText("");
+    setAskError(null);
+    phaseSafeSet("idle");
+  }, [clearPhraseTimer, clearNativeStopTimer, phaseSafeSet, setAssistantText]);
+  const cancelRef = useRef(cancel);
+  cancelRef.current = cancel;
+  useImperativeHandle(
+    ref,
+    () => ({
+      cancel: () => cancelRef.current(),
+    }),
+    [],
+  );
+  // Cancel everything on unmount (panel close doesn't unmount us, but mode
+  // switch / route change does; `useSpeechSynthesis` also cancels on its
+  // own unmount, so the live utterance gets a second belt-and-braces stop).
+  useEffect(() => {
+    return () => {
+      cancelRef.current();
+    };
+  }, []);
+  // An STT error while listening ends the turn immediately.
+  useEffect(() => {
+    if (phase === "listening" && stt.error) {
+      phaseSafeSet("idle");
+    }
+  }, [phase, stt.error, phaseSafeSet]);
+  /**
+   * If the STT service stops while we still think we're listening (tab
+   * blur, timeout, permission revoke), drop back to idle — but not while
+   * Whisper is still downloading the model or transcribing, since those
+   * are normal pre/post-listening states.
+   */
+  useEffect(() => {
+    if (phase !== "listening" || stt.listening || stt.error) {
+      return;
+    }
+    if (stt.mode === "whisper" && stt.modelLoadStatus !== "ready") {
+      return;
+    }
+    if (stt.mode === "whisper" && stt.transcribing) {
+      return;
+    }
+    // Re-check against the refs after a short delay so a brief gap in
+    // `listening` does not end the turn.
+    const id = window.setTimeout(() => {
+      const s = sttRef.current;
+      if (phaseRef.current !== "listening") return;
+      if (s.listening || s.error) return;
+      if (s.mode === "whisper" && s.modelLoadStatus !== "ready") return;
+      if (s.mode === "whisper" && s.transcribing) return;
+      phaseSafeSet("idle");
+    }, 600);
+    return () => window.clearTimeout(id);
+  }, [
+    phase,
+    stt.listening,
+    stt.error,
+    stt.mode,
+    stt.modelLoadStatus,
+    stt.transcribing,
+    phaseSafeSet,
+  ]);
+  /**
+   * Run the engine.ask → TTS pipeline against a finalized question. Caller
+   * is responsible for any STT cleanup (stopping recognition, clearing
+   * timers) before invoking. Phase transitions to `thinking` immediately
+   * (idempotent if already `thinking`), then to `speaking` once the first
+   * sentence is queued, then back to `idle` after TTS drains.
+   */
+  const runAskFlow = useCallback(
+    (question: string) => {
+      const trimmed = question.trim();
+      if (!trimmed) {
+        voiceLog("ask.skip-empty");
+        phaseSafeSet("idle");
+        return;
+      }
+      voiceLog("ask.start", transcriptDebug(trimmed));
+      setLiveUserText(trimmed);
+      onTranscriptUpdate?.({ role: "user", text: trimmed, partial: false });
+      setAskError(null);
+      assistantAccRef.current = "";
+      setAssistantText("");
+      if (phaseRef.current !== "thinking") {
+        phaseSafeSet("thinking");
+      }
+      const run = async (): Promise<void> => {
+        const eng = engineRef.current;
+        const speech = ttsRef.current;
+        eng.preload();
+        if (eng.state === "missing-key" || eng.state === "error") {
+          voiceLog("ask.engine-blocked", { state: eng.state });
+          phaseSafeSet("idle");
+          return;
+        }
+        // Everything up to the first `await` runs synchronously, so
+        // `abortRef` is already set when `runAskFlow` returns.
+        const ac = new AbortController();
+        abortRef.current = ac;
+        try {
+          let buffer = "";
+          let didEnqueue = false;
+          // Guard every enqueue path against the abort signal. The stream
+          // providers already surface buffered deltas after `abort()`; we
+          // must not let those buffered deltas re-populate the TTS queue
+          // we just cleared.
+          const safeEnqueue = (chunk: string): void => {
+            if (ac.signal.aborted) return;
+            if (!speech.supported) return;
+            speech.enqueue(chunk);
+            didEnqueue = true;
+            if (phaseRef.current === "thinking") {
+              phaseSafeSet("speaking");
+            }
+          };
+          const flush = (): void => {
+            buffer = flushStreamBuffer(buffer, safeEnqueue);
+          };
+          const result = await eng.ask(trimmed, {
+            signal: ac.signal,
+            onToken: (e) => {
+              if (ac.signal.aborted) return;
+              buffer += e.text;
+              assistantAccRef.current += e.text;
+              const full = assistantAccRef.current;
+              setAssistantText(full);
+              onTranscriptUpdate?.({
+                role: "assistant",
+                text: full,
+                partial: true,
+              });
+              flush();
+            },
+          });
+          if (ac.signal.aborted) {
+            voiceLog("ask.aborted");
+            return;
+          }
+          flush();
+          // Speak whatever is left that never reached a sentence boundary.
+          const tail = buffer.trim();
+          if (tail && speech.supported && !ac.signal.aborted) {
+            safeEnqueue(tail);
+          }
+          const finalText = result.answer.trim() || assistantAccRef.current;
+          voiceLog("ask.done", transcriptDebug(finalText));
+          assistantAccRef.current = finalText;
+          setAssistantText(finalText);
+          onTranscriptUpdate?.({
+            role: "assistant",
+            text: finalText,
+            partial: false,
+          });
+          // Nothing queued for speech: no TTS drain will return us to idle.
+          if (!speech.supported || !didEnqueue) {
+            phaseSafeSet("idle");
+          }
+        } catch (err) {
+          if (ac.signal.aborted) {
+            voiceLog("ask.aborted");
+            phaseSafeSet("idle");
+            return;
+          }
+          const message =
+            err instanceof Error ? err.message : "The assistant could not answer.";
+          setAskError(message);
+          voiceLog("ask.error", { message });
+          phaseSafeSet("idle");
+        } finally {
+          // Only clear the slot if a newer run has not replaced it.
+          if (abortRef.current === ac) {
+            abortRef.current = null;
+          }
+        }
+      };
+      void run();
+    },
+    [onTranscriptUpdate, phaseSafeSet, setAssistantText],
+  );
+  /** Capture the current STT text (trimmed). Used during native graceful stop. */
+  const transcriptFromState = useCallback(() => {
+    return `${sttRef.current.finalTranscript} ${sttRef.current.interimTranscript}`
+      .trim();
+  }, []);
+  /**
+   * End the native graceful-stop window: reset STT and either ask with the
+   * captured text or, when there is none, fall back to idle.
+   */
+  const submitTranscriptFromState = useCallback(
+    (reason: string) => {
+      const text = transcriptFromState();
+      voiceLog("stt.submit", {
+        reason,
+        mode: sttRef.current.mode,
+        ...transcriptDebug(text),
+      });
+      awaitingNativeStopRef.current = false;
+      clearNativeStopTimer();
+      sttRef.current.reset();
+      if (text) {
+        runAskFlow(text);
+      } else if (phaseRef.current !== "speaking") {
+        phaseSafeSet("idle");
+      }
+    },
+    [clearNativeStopTimer, phaseSafeSet, runAskFlow, transcriptFromState],
+  );
+  /** Arm the deadline after which a pending native stop is submitted anyway. */
+  const startNativeStopDeadline = useCallback(
+    (reason: string) => {
+      clearNativeStopTimer();
+      nativeStopTimerRef.current = window.setTimeout(() => {
+        if (!awaitingNativeStopRef.current) return;
+        voiceLog("native.stop.timeout", {
+          reason,
+          finalLength: sttRef.current.finalTranscript.trim().length,
+          interimLength: sttRef.current.interimTranscript.trim().length,
+        });
+        submitTranscriptFromState(`${reason}:timeout`);
+      }, NATIVE_STOP_GRACE_MS);
+    },
+    [clearNativeStopTimer, submitTranscriptFromState],
+  );
+  /**
+   * Ask native recognition to stop, move to `thinking`, and wait (bounded
+   * by `NATIVE_STOP_GRACE_MS`) for its last result before submitting.
+   */
+  const stopNativeAndWaitForFinal = useCallback(
+    (reason: string) => {
+      awaitingNativeStopRef.current = true;
+      voiceLog("native.stop.request", {
+        reason,
+        finalLength: sttRef.current.finalTranscript.trim().length,
+        interimLength: sttRef.current.interimTranscript.trim().length,
+      });
+      phaseSafeSet("thinking");
+      sttRef.current.stop();
+      startNativeStopDeadline(reason);
+    },
+    [phaseSafeSet, startNativeStopDeadline],
+  );
+  /**
+   * Silero VAD speech-end handler. Fires the moment the speaker pauses
+   * for the "redemption" window (~250 ms by default), which is far
+   * snappier than the legacy 600 ms trailing-silence timer. Uses the
+   * existing native-stop pipeline so any in-flight `onresult` still
+   * lands in state before we submit.
+   */
+  const onVadSpeechEnd = useCallback(() => {
+    if (phaseRef.current !== "listening") return;
+    if (awaitingNativeStopRef.current) return;
+    if (sttRef.current.mode === "native") {
+      const hasContent =
+        sttRef.current.finalTranscript.trim().length > 0 ||
+        sttRef.current.interimTranscript.trim().length > 0;
+      if (!hasContent) return;
+      voiceLog("vad.handoff", { mode: "native" });
+      clearPhraseTimer();
+      stopNativeAndWaitForFinal("vad-end");
+    } else {
+      // Whisper mode: VAD telling us the user stopped is the green light
+      // to stop recording. The Whisper hook will then transcribe the
+      // captured audio and a separate effect submits the result.
+      voiceLog("vad.handoff", { mode: "whisper" });
+      sttRef.current.stop();
+    }
+  }, [clearPhraseTimer, stopNativeAndWaitForFinal]);
+  const vad = useSileroVAD({ onSpeechEnd: onVadSpeechEnd });
+  vadRef.current = vad;
+  /**
+   * Drive VAD lifecycle off the listening phase. Starting it lazily on
+   * first listen avoids paying the ~1.6 MB Silero model download (and
+   * the AudioWorklet permission flow) for users who never use voice
+   * mode. We never tear it down between turns — keeping the worklet
+   * warm makes turn-taking feel instant.
+   */
+  useEffect(() => {
+    if (phase === "listening") {
+      void vad.start();
+    } else if (vad.listening) {
+      void vad.stop();
+    }
+  }, [phase, vad]);
+  /**
+   * Native (Web Speech API) auto-stop on phrase-end silence. Only runs in
+   * native mode — Whisper has no interim activity and is driven by mic
+   * taps. We stop recognition here, then wait for the native `onresult`
+   * to settle before submitting.
+   */
+  useEffect(() => {
+    if (stt.mode !== "native") {
+      clearPhraseTimer();
+      return;
+    }
+    if (phase !== "listening") {
+      clearPhraseTimer();
+      return;
+    }
+    const { interimTranscript, finalTranscript, listening } = sttRef.current;
+    if (!listening) {
+      return;
+    }
+    // Interim text means the user is still mid-phrase.
+    if (interimTranscript.trim()) {
+      clearPhraseTimer();
+      return;
+    }
+    // Nothing recognised yet: keep listening without a deadline.
+    if (!finalTranscript.trim()) {
+      clearPhraseTimer();
+      return;
+    }
+    clearPhraseTimer();
+    phraseTimerRef.current = window.setTimeout(() => {
+      if (phaseRef.current !== "listening") return;
+      if (sttRef.current.interimTranscript.trim()) return;
+      if (!sttRef.current.finalTranscript.trim()) return;
+      clearPhraseTimer();
+      stopNativeAndWaitForFinal("phrase-end");
+    }, PHRASE_END_MS);
+    return () => {
+      clearPhraseTimer();
+    };
+  }, [
+    stt.mode,
+    phase,
+    stt.listening,
+    stt.finalTranscript,
+    stt.interimTranscript,
+    clearPhraseTimer,
+    stopNativeAndWaitForFinal,
+  ]);
+  // Trailing native finalTranscript during the graceful-stop window.
+  useEffect(() => {
+    if (!awaitingNativeStopRef.current) return;
+    if (stt.mode !== "native") return;
+    if (phase !== "thinking") return;
+    if (!stt.finalTranscript.trim()) return;
+    submitTranscriptFromState("native-final");
+  }, [stt.mode, stt.finalTranscript, phase, submitTranscriptFromState]);
+  /**
+   * Whisper fallback: when the user manually stops recording, the hook
+   * runs Whisper asynchronously and eventually populates `finalTranscript`.
+   * We're already in `thinking` from the click handler, so when the
+   * transcript arrives (and we're not still transcribing), fire the ask
+   * flow with it.
+   */
+  useEffect(() => {
+    if (stt.mode !== "whisper") return;
+    if (phase !== "thinking") return;
+    if (stt.transcribing) return;
+    const text = stt.finalTranscript.trim();
+    if (!text) return;
+    voiceLog("whisper.final", transcriptDebug(text));
+    runAskFlow(text);
+    sttRef.current.reset();
+  }, [stt.mode, stt.transcribing, stt.finalTranscript, phase, runAskFlow]);
+  /**
+   * If Whisper transcription produces nothing — either an empty result
+   * (user said nothing intelligible) or a failure reported through
+   * `stt.error` — drop back to idle rather than hanging in `thinking`
+   * forever.
+   *
+   * The same conditions (thinking, not transcribing, empty transcript)
+   * also hold right after a transcript has been submitted and STT reset,
+   * while `engine.ask` is still working. The timer therefore checks
+   * `abortRef` and leaves the phase alone while an ask is in flight;
+   * otherwise the UI would fall to idle mid-answer and never reach
+   * `speaking`.
+   */
+  useEffect(() => {
+    if (stt.mode !== "whisper") return;
+    if (phase !== "thinking") return;
+    if (stt.transcribing) return;
+    if (stt.listening) return;
+    if (stt.finalTranscript.trim()) return;
+    const id = window.setTimeout(() => {
+      if (
+        phaseRef.current === "thinking" &&
+        abortRef.current === null &&
+        !sttRef.current.transcribing &&
+        !sttRef.current.listening &&
+        !sttRef.current.finalTranscript.trim()
+      ) {
+        voiceLog("whisper.empty");
+        phaseSafeSet("idle");
+      }
+    }, 600);
+    return () => window.clearTimeout(id);
+  }, [
+    stt.mode,
+    stt.transcribing,
+    stt.listening,
+    stt.finalTranscript,
+    phase,
+    phaseSafeSet,
+  ]);
+  // Return to idle once TTS has finished speaking the queued sentences.
+  useEffect(() => {
+    if (phase !== "speaking" || !tts.supported) {
+      return;
+    }
+    if (tts.speaking) {
+      return;
+    }
+    phaseSafeSet("idle");
+  }, [phase, tts.speaking, tts.supported, phaseSafeSet]);
+  // While listening show the live transcript; afterwards the submitted text.
+  const displayUserLine = useMemo(() => {
+    if (phase === "listening") {
+      const parts = [stt.finalTranscript, stt.interimTranscript].filter(Boolean);
+      return parts.join(" ").trim();
+    }
+    if (liveUserText) {
+      return liveUserText;
+    }
+    return "";
+  }, [phase, stt.finalTranscript, stt.interimTranscript, liveUserText]);
+  const micAriaLabel = useMemo(() => {
+    if (!stt.supported) {
+      return "Voice input not supported in this browser";
+    }
+    if (engineBlocked) {
+      return engine.statusMessage || "Assistant unavailable";
+    }
+    if (showEngineWait) {
+      return engine.statusMessage || "Preparing the assistant…";
+    }
+    if (phase === "listening") {
+      return stt.mode === "whisper"
+        ? "Stop recording and transcribe"
+        : "Stop listening";
+    }
+    if (phase === "speaking" || phase === "thinking") {
+      return "Interrupt and start listening";
+    }
+    return "Start voice input";
+  }, [
+    stt.supported,
+    stt.mode,
+    engineBlocked,
+    showEngineWait,
+    engine.statusMessage,
+    phase,
+  ]);
+  /**
+   * Mic button behaviour by phase: idle → start listening; listening →
+   * stop and submit; thinking / speaking → interrupt and listen again.
+   */
+  const handleMicClick = (): void => {
+    voiceLog("mic.click", {
+      phase,
+      mode: stt.mode,
+      listening: stt.listening,
+      engineState: engine.state,
+    });
+    setAskError(null);
+    if (!stt.supported || engineBlocked) {
+      voiceLog("mic.blocked", { supported: stt.supported, engineBlocked });
+      return;
+    }
+    if (showEngineWait) {
+      voiceLog("mic.wait-engine", { state: engine.state });
+      return;
+    }
+    if (phase === "speaking" || phase === "thinking") {
+      // Interrupt: cancel the in-flight run then start a fresh listen.
+      cancelRef.current();
+      sttRef.current.start({ lang: "en-US" });
+      phaseSafeSet("listening");
+      return;
+    }
+    if (phase === "listening") {
+      clearPhraseTimer();
+      if (stt.mode === "whisper") {
+        voiceLog("whisper.stop.request");
+        stt.stop();
+        phaseSafeSet("thinking");
+        return;
+      }
+      stopNativeAndWaitForFinal("manual-stop");
+      return;
+    }
+    engine.preload();
+    stt.reset();
+    stt.start({ lang: "en-US" });
+    setLiveUserText("");
+    setAssistantText("");
+    phaseSafeSet("listening");
+  };
+  const handleStopSpeech = (): void => {
+    cancelRef.current();
+  };
+  // Equivalent to `micBlocked && phase === "idle"`: the mic must stay
+  // clickable during a turn so the user can always stop or interrupt.
+  const micButtonDisabled =
+    micBlocked &&
+    phase !== "listening" &&
+    phase !== "speaking" &&
+    phase !== "thinking";
+  // Status line under the mic; the first matching condition wins.
+  const caption = (() => {
+    if (!stt.supported) {
+      return "Voice not supported in this browser. Use the text input.";
+    }
+    if (stt.error) {
+      return stt.error;
+    }
+    if (engineBlocked) {
+      return engine.statusMessage || engine.error || "Assistant unavailable.";
+    }
+    if (showEngineWait) {
+      return engine.statusMessage || "Preparing the assistant…";
+    }
+    if (stt.mode === "whisper" && stt.modelLoadStatus === "loading") {
+      const pct = Math.round(stt.modelLoadProgress * 100);
+      return `First-time setup: downloading voice model (${pct}%). Cached after this.`;
+    }
+    if (askError) {
+      return askError;
+    }
+    if (phase === "listening") {
+      if (stt.mode === "whisper") {
+        return "Recording… tap the mic when you're done.";
+      }
+      return "Listening… finish your thought, or tap the mic to stop.";
+    }
+    if (phase === "thinking") {
+      if (stt.mode === "whisper" && stt.transcribing) {
+        return "Transcribing your question…";
+      }
+      return "Thinking…";
+    }
+    if (phase === "speaking") {
+      if (!tts.supported) {
+        return "Answer shown below — speech is not supported in this browser.";
+      }
+      return "Tap the mic to interrupt, or stop playback.";
+    }
+    if (stt.mode === "whisper" && stt.modelLoadStatus !== "ready") {
+      return "Tap to ask. The voice model (~40 MB) will download on first use.";
+    }
+    return "Tap to ask a question.";
+  })();
+  const micClassName = [
+    styles.micButton,
+    phase === "idle" && stt.supported && !engineBlocked && !showEngineWait
+      ? styles.micButtonIdle
+      : "",
+    phase === "listening" ? styles.micButtonListening : "",
+    phase === "speaking" ? styles.micButtonSpeaking : "",
+  ]
+    .filter(Boolean)
+    .join(" ");
+  return (
+    <div className={styles.wrap}>
+      <div
+        className={styles.transcript}
+        role="region"
+        aria-label="Voice mode transcript"
+        aria-live="polite"
+      >
+        {displayUserLine ? (
+          <>
+            <span className={styles.transcriptLabel}>You said</span>
+            {displayUserLine}
+          </>
+        ) : (
+          <span className={styles.transcriptLabel}>You said</span>
+        )}
+        <StreamingAssistantText ref={assistantTextRef} />
+      </div>
+      <div className={styles.controls}>
+        {phase === "thinking" ? (
+          <div className={styles.thinking}>
+            <div className={styles.spinner} aria-hidden />
+            <p className={styles.caption}>{caption}</p>
+          </div>
+        ) : null}
+        {phase === "speaking" && tts.supported ? (
+          <div className={styles.wave} aria-hidden>
+            <span className={styles.waveBar} />
+            <span className={styles.waveBar} />
+            <span className={styles.waveBar} />
+            <span className={styles.waveBar} />
+            <span className={styles.waveBar} />
+          </div>
+        ) : null}
+        <div className={styles.micOuter}>
+          {phase === "listening" ? (
+            <span
+              className={`${styles.micRing} ${styles.micRingListening}`}
+              aria-hidden
+            />
+          ) : null}
+          {/* A native button already turns Enter / Space into a click, so no
+              key handler is attached: a second, hand-rolled activation path
+              could toggle the mic twice for a single key press. */}
+          <button
+            type="button"
+            className={micClassName}
+            aria-label={micAriaLabel}
+            aria-pressed={phase === "listening"}
+            disabled={micButtonDisabled}
+            onClick={handleMicClick}
+          >
+            <MicIcon />
+          </button>
+        </div>
+        {phase !== "thinking" ? (
+          <p className={styles.caption}>{caption}</p>
+        ) : null}
+        {phase === "speaking" && tts.supported ? (
+          <button
+            type="button"
+            className={styles.stopButton}
+            onClick={handleStopSpeech}
+          >
+            Stop
+          </button>
+        ) : null}
+      </div>
+    </div>
+  );
+});
+export default VoiceMode;