@kpritam/grimoire-output-docusaurus 0.1.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +25 -0
  3. package/dist/.tsbuildinfo +1 -0
  4. package/dist/index.d.ts +1 -0
  5. package/dist/index.js +1 -0
  6. package/dist/internal/assets.d.ts +9 -0
  7. package/dist/internal/assets.js +50 -0
  8. package/dist/internal/docusaurusConfig.d.ts +9 -0
  9. package/dist/internal/docusaurusConfig.js +259 -0
  10. package/dist/internal/spellbookAssets.d.ts +39 -0
  11. package/dist/internal/spellbookAssets.js +68 -0
  12. package/dist/layer.d.ts +3 -0
  13. package/dist/layer.js +6 -0
  14. package/dist/shared.d.ts +10 -0
  15. package/dist/shared.js +36 -0
  16. package/dist/upstream.d.ts +6 -0
  17. package/dist/upstream.js +84 -0
  18. package/package.json +59 -0
  19. package/src/index.ts +1 -0
  20. package/src/internal/assets.ts +66 -0
  21. package/src/internal/docusaurusConfig.ts +281 -0
  22. package/src/internal/spellbookAssets.ts +80 -0
  23. package/src/layer.ts +12 -0
  24. package/src/shared.ts +43 -0
  25. package/src/upstream.ts +119 -0
  26. package/templates/spellbook/spellbookPlugin.ts +156 -0
  27. package/templates/spellbook/src/components/SpellbookChat/ChatEngine.ts +79 -0
  28. package/templates/spellbook/src/components/SpellbookChat/ChatErrorBoundary.tsx +65 -0
  29. package/templates/spellbook/src/components/SpellbookChat/Markdown.tsx +259 -0
  30. package/templates/spellbook/src/components/SpellbookChat/README.md +111 -0
  31. package/templates/spellbook/src/components/SpellbookChat/SettingsPanel.tsx +376 -0
  32. package/templates/spellbook/src/components/SpellbookChat/VoiceMode.tsx +867 -0
  33. package/templates/spellbook/src/components/SpellbookChat/index.tsx +744 -0
  34. package/templates/spellbook/src/components/SpellbookChat/markdown.module.css +343 -0
  35. package/templates/spellbook/src/components/SpellbookChat/secretStore.ts +106 -0
  36. package/templates/spellbook/src/components/SpellbookChat/streamProviders/anthropic.ts +36 -0
  37. package/templates/spellbook/src/components/SpellbookChat/streamProviders/createCloudProvider.ts +112 -0
  38. package/templates/spellbook/src/components/SpellbookChat/streamProviders/google.ts +33 -0
  39. package/templates/spellbook/src/components/SpellbookChat/streamProviders/index.ts +32 -0
  40. package/templates/spellbook/src/components/SpellbookChat/streamProviders/mapFinishReason.ts +23 -0
  41. package/templates/spellbook/src/components/SpellbookChat/streamProviders/ollama.ts +44 -0
  42. package/templates/spellbook/src/components/SpellbookChat/streamProviders/openai.ts +34 -0
  43. package/templates/spellbook/src/components/SpellbookChat/streamProviders/openaiRealtime.ts +320 -0
  44. package/templates/spellbook/src/components/SpellbookChat/streamProviders/types.ts +172 -0
  45. package/templates/spellbook/src/components/SpellbookChat/streamProviders/webllm.ts +214 -0
  46. package/templates/spellbook/src/components/SpellbookChat/styles.module.css +852 -0
  47. package/templates/spellbook/src/components/SpellbookChat/systemPrompt.ts +107 -0
  48. package/templates/spellbook/src/components/SpellbookChat/transformers-ssr-stub.ts +16 -0
  49. package/templates/spellbook/src/components/SpellbookChat/types.ts +52 -0
  50. package/templates/spellbook/src/components/SpellbookChat/useBundleLoader.ts +46 -0
  51. package/templates/spellbook/src/components/SpellbookChat/useChatEngine.ts +524 -0
  52. package/templates/spellbook/src/components/SpellbookChat/useEmbeddings.ts +147 -0
  53. package/templates/spellbook/src/components/SpellbookChat/useRetrieval.ts +377 -0
  54. package/templates/spellbook/src/components/SpellbookChat/useSileroVAD.ts +236 -0
  55. package/templates/spellbook/src/components/SpellbookChat/useSpeechRecognition.ts +271 -0
  56. package/templates/spellbook/src/components/SpellbookChat/useSpeechSynthesis.ts +229 -0
  57. package/templates/spellbook/src/components/SpellbookChat/useUnifiedSTT.ts +134 -0
  58. package/templates/spellbook/src/components/SpellbookChat/useWhisperSTT.ts +411 -0
  59. package/templates/spellbook/src/components/SpellbookChat/vad-ssr-stub.ts +25 -0
  60. package/templates/spellbook/src/components/SpellbookChat/voiceDebug.ts +60 -0
  61. package/templates/spellbook/src/components/SpellbookChat/voiceFsm.ts +196 -0
  62. package/templates/spellbook/src/components/SpellbookChat/voiceStyles.module.css +334 -0
  63. package/templates/spellbook/src/components/SpellbookChat/webllm-ssr-stub.ts +8 -0
  64. package/templates/spellbook/src/components/SpellbookChatDisabled.tsx +20 -0
  65. package/templates/spellbook/src/theme/Root.tsx +29 -0
@@ -0,0 +1,867 @@
1
+ import React, {
2
+ forwardRef,
3
+ useCallback,
4
+ useEffect,
5
+ useImperativeHandle,
6
+ useMemo,
7
+ useRef,
8
+ useState,
9
+ } from "react";
10
+ import type { ChatEngine } from "./ChatEngine";
11
+ import { useSileroVAD } from "./useSileroVAD";
12
+ import { useSpeechSynthesis } from "./useSpeechSynthesis";
13
+ import { useUnifiedSTT } from "./useUnifiedSTT";
14
+ import { transcriptDebug, voiceLog } from "./voiceDebug";
15
+ import styles from "./voiceStyles.module.css";
16
+
17
+ export type VoiceUiPhase = "idle" | "listening" | "thinking" | "speaking";
18
+
19
+ /**
20
+ * Fallback phrase-end timeout used when Silero VAD is unavailable or its
21
+ * model is still downloading. Halved from the original (1.2 s) to 600 ms
22
+ * because even the timer fallback should feel snappier — and when VAD is
23
+ * available, this timer never fires (VAD's `onSpeechEnd` short-circuits it
24
+ * with a confident speech-end signal in ~250 ms).
25
+ */
26
+ const PHRASE_END_MS = 600;
27
+ const NATIVE_STOP_GRACE_MS = 2500;
28
+
29
+ export interface VoiceModeHandle {
30
+ /**
31
+ * Hard-stop voice mode: abort the in-flight ask, silence TTS, drop any
32
+ * pending STT transcript, clear timers, and reset to idle. Safe to call
33
+ * repeatedly — all operations are idempotent.
34
+ *
35
+ * Called by the parent chat panel on Close / Clear / mode-switch so TTS
36
+ * can never outlive the UI the user just dismissed.
37
+ */
38
+ readonly cancel: () => void;
39
+ }
40
+
41
+ export interface VoiceModeProps {
42
+ /**
43
+ * The chat engine instance. MUST be the same instance the parent chat
44
+ * panel is using — sharing state keeps the mic availability and the
45
+ * panel header status in agreement.
46
+ */
47
+ readonly engine: ChatEngine;
48
+ readonly onTranscriptUpdate?: (entry: {
49
+ role: "user" | "assistant";
50
+ text: string;
51
+ partial: boolean;
52
+ }) => void;
53
+ }
54
+
55
/**
 * Next position at which streamed assistant text may be cut for TTS:
 * a sentence terminator (`.`, `!`, `?`) followed by whitespace or the end of
 * the buffer, or a blank-line paragraph break. Searched across line breaks,
 * so a single `\n` inside a sentence does not hide the terminator after it.
 */
const SPEECH_CHUNK_BOUNDARY = /[.!?](?:\s+|$)|\n{2,}/;

/**
 * Splits every complete sentence or paragraph off the front of `buffer` and
 * hands each one to `enqueue`, in order.
 *
 * The previous implementation anchored its patterns with `^(.*?...)`. Because
 * `.` does not match line terminators, a buffer whose first line lacked a
 * sentence end (e.g. a markdown list, or text starting with a blank line)
 * could never be flushed, and speech only started once the whole answer had
 * streamed. This version looks for the earliest boundary anywhere in the
 * remaining text instead.
 *
 * @param buffer  Text accumulated from the token stream so far.
 * @param enqueue Receives each completed chunk, trimmed and never empty.
 * @returns The unfinished remainder, to be kept and extended by the caller.
 */
function flushStreamBuffer(
  buffer: string,
  enqueue: (s: string) => void,
): string {
  let rest = buffer;
  for (;;) {
    const boundary = SPEECH_CHUNK_BOUNDARY.exec(rest);
    if (boundary === null) {
      return rest;
    }
    const matched = boundary[0];
    // A sentence match starts with its terminator, which belongs to the
    // chunk; a paragraph match is pure separator and is dropped entirely.
    const chunkEnd = matched.startsWith("\n")
      ? boundary.index
      : boundary.index + 1;
    const chunk = rest.slice(0, chunkEnd).trim();
    if (chunk) {
      enqueue(chunk);
    }
    // Every match is at least one character long, so the loop always advances.
    rest = rest.slice(boundary.index + matched.length);
  }
}
77
+
78
/** Stroke outlines that make up the microphone glyph, in draw order. */
const MIC_ICON_PATHS = [
  "M12 14a3 3 0 0 0 3-3V5a3 3 0 1 0-6 0v6a3 3 0 0 0 3 3Z",
  "M19 10v1a7 7 0 0 1-14 0v-1",
  "M12 19v3",
  "M8 22h8",
] as const;

/**
 * Decorative microphone icon for the mic button. Hidden from assistive
 * technology (`aria-hidden`); it draws with `currentColor`, so the stroke
 * follows the surrounding text colour.
 */
function MicIcon(): React.ReactElement {
  const outlines = MIC_ICON_PATHS.map((d) => <path key={d} d={d} />);
  return (
    <svg
      className={styles.micIcon}
      viewBox="0 0 24 24"
      fill="none"
      stroke="currentColor"
      strokeWidth="1.75"
      strokeLinecap="round"
      strokeLinejoin="round"
      aria-hidden
    >
      {outlines}
    </svg>
  );
}
97
+
98
+ interface StreamingTextHandle {
99
+ set: (text: string) => void;
100
+ }
101
+
102
/**
 * Holds the assistant's in-progress reply in component-local state so that
 * per-token updates do not re-render VoiceMode. The parent pushes new text
 * through the ref's imperative `set` method; nothing is rendered while the
 * text is empty.
 */
const StreamingAssistantText = forwardRef<StreamingTextHandle>(
  function StreamingAssistantText(_props, handleRef) {
    const [shownText, setShownText] = useState("");

    // The state setter itself is exposed as `set`; it never changes identity,
    // hence the empty dependency list.
    useImperativeHandle(handleRef, () => ({ set: setShownText }), []);

    return shownText ? (
      <>
        <span className={styles.transcriptLabel} style={{ marginTop: "0.65rem" }}>
          Assistant
        </span>
        {shownText}
      </>
    ) : null;
  },
);
122
+
123
+ const VoiceMode = forwardRef<VoiceModeHandle, VoiceModeProps>(function VoiceMode(
124
+ { engine, onTranscriptUpdate },
125
+ ref,
126
+ ): React.ReactElement {
127
+ const stt = useUnifiedSTT();
128
+ const tts = useSpeechSynthesis();
129
+
130
+ const [phase, setPhase] = useState<VoiceUiPhase>("idle");
131
+ const [askError, setAskError] = useState<string | null>(null);
132
+ const [liveUserText, setLiveUserText] = useState("");
133
+
134
+ const assistantTextRef = useRef<StreamingTextHandle | null>(null);
135
+ const setAssistantText = useCallback((s: string) => {
136
+ assistantTextRef.current?.set(s);
137
+ }, []);
138
+
139
+ const sttRef = useRef(stt);
140
+ sttRef.current = stt;
141
+
142
+ const phraseTimerRef = useRef<number | null>(null);
143
+ const nativeStopTimerRef = useRef<number | null>(null);
144
+ const awaitingNativeStopRef = useRef(false);
145
+ const abortRef = useRef<AbortController | null>(null);
146
+ const phaseRef = useRef(phase);
147
+ phaseRef.current = phase;
148
+ const assistantAccRef = useRef("");
149
+ const engineRef = useRef(engine);
150
+ engineRef.current = engine;
151
+ const ttsRef = useRef(tts);
152
+ ttsRef.current = tts;
153
+ // Declared early so `cancel` (defined below) can stop it on hard-stop.
154
+ // The actual VAD instance is wired up further down once
155
+ // `stopNativeAndWaitForFinal` is in scope.
156
+ const vadRef = useRef<{
157
+ readonly stop: () => Promise<void>;
158
+ readonly listening: boolean;
159
+ } | null>(null);
160
+
161
+ const phaseSafeSet = useCallback((next: VoiceUiPhase) => {
162
+ if (phaseRef.current !== next) {
163
+ voiceLog("phase", { from: phaseRef.current, to: next });
164
+ }
165
+ phaseRef.current = next;
166
+ setPhase(next);
167
+ }, []);
168
+
169
+ const engineBlocked =
170
+ engine.state === "missing-key" || engine.state === "error";
171
+
172
+ const showEngineWait =
173
+ !engineBlocked &&
174
+ engine.state !== "ready" &&
175
+ (engine.state === "idle" ||
176
+ engine.state === "loading-bundle" ||
177
+ engine.state === "loading-model");
178
+
179
+ const micBlocked =
180
+ !stt.supported ||
181
+ engineBlocked ||
182
+ (engine.state !== "ready" && phase === "idle");
183
+
184
+ const clearPhraseTimer = useCallback(() => {
185
+ if (phraseTimerRef.current != null) {
186
+ window.clearTimeout(phraseTimerRef.current);
187
+ phraseTimerRef.current = null;
188
+ }
189
+ }, []);
190
+
191
+ const clearNativeStopTimer = useCallback(() => {
192
+ if (nativeStopTimerRef.current != null) {
193
+ window.clearTimeout(nativeStopTimerRef.current);
194
+ nativeStopTimerRef.current = null;
195
+ }
196
+ }, []);
197
+
198
+ /**
199
+ * Hard-stop the whole voice pipeline. Drains the ask stream, flushes TTS,
200
+ * drops any pending STT transcript, clears timers, and puts the UI back
201
+ * in idle. Shared by the Stop button, the "Interrupt" path on mic click,
202
+ * the parent-facing imperative handle, and the unmount cleanup.
203
+ */
204
+ const cancel = useCallback(() => {
205
+ voiceLog("voice.cancel", {
206
+ phase: phaseRef.current,
207
+ mode: sttRef.current.mode,
208
+ });
209
+ abortRef.current?.abort();
210
+ abortRef.current = null;
211
+ ttsRef.current.cancel();
212
+ sttRef.current.abort();
213
+ // Pause (don't destroy) so the next mic tap doesn't pay the Silero
214
+ // model load cost again. The unmount effect tears it down fully.
215
+ void vadRef.current?.stop();
216
+ clearPhraseTimer();
217
+ clearNativeStopTimer();
218
+ awaitingNativeStopRef.current = false;
219
+ assistantAccRef.current = "";
220
+ setAssistantText("");
221
+ setLiveUserText("");
222
+ setAskError(null);
223
+ phaseSafeSet("idle");
224
+ }, [clearPhraseTimer, clearNativeStopTimer, phaseSafeSet, setAssistantText]);
225
+
226
+ const cancelRef = useRef(cancel);
227
+ cancelRef.current = cancel;
228
+
229
+ useImperativeHandle(
230
+ ref,
231
+ () => ({
232
+ cancel: () => cancelRef.current(),
233
+ }),
234
+ [],
235
+ );
236
+
237
+ // Cancel everything on unmount (panel close doesn't unmount us, but mode
238
+ // switch / route change does; `useSpeechSynthesis` also cancels on its
239
+ // own unmount, so the live utterance gets a second belt-and-braces stop).
240
+ useEffect(() => {
241
+ return () => {
242
+ cancelRef.current();
243
+ };
244
+ }, []);
245
+
246
+ useEffect(() => {
247
+ if (phase === "listening" && stt.error) {
248
+ phaseSafeSet("idle");
249
+ }
250
+ }, [phase, stt.error, phaseSafeSet]);
251
+
252
+ /**
253
+ * If the STT service stops while we still think we're listening (tab
254
+ * blur, timeout, permission revoke), drop back to idle — but not while
255
+ * Whisper is still downloading the model or transcribing, since those
256
+ * are normal pre/post-listening states.
257
+ */
258
+ useEffect(() => {
259
+ if (phase !== "listening" || stt.listening || stt.error) {
260
+ return;
261
+ }
262
+ if (stt.mode === "whisper" && stt.modelLoadStatus !== "ready") {
263
+ return;
264
+ }
265
+ if (stt.mode === "whisper" && stt.transcribing) {
266
+ return;
267
+ }
268
+ const id = window.setTimeout(() => {
269
+ const s = sttRef.current;
270
+ if (phaseRef.current !== "listening") return;
271
+ if (s.listening || s.error) return;
272
+ if (s.mode === "whisper" && s.modelLoadStatus !== "ready") return;
273
+ if (s.mode === "whisper" && s.transcribing) return;
274
+ phaseSafeSet("idle");
275
+ }, 600);
276
+ return () => window.clearTimeout(id);
277
+ }, [
278
+ phase,
279
+ stt.listening,
280
+ stt.error,
281
+ stt.mode,
282
+ stt.modelLoadStatus,
283
+ stt.transcribing,
284
+ phaseSafeSet,
285
+ ]);
286
+
287
/**
 * Run the engine.ask → TTS pipeline against a finalized question. Caller
 * is responsible for any STT cleanup (stopping recognition, clearing
 * timers) before invoking. Phase transitions to `thinking` immediately
 * (idempotent if already `thinking`), then to `speaking` once the first
 * sentence is queued. It returns to `idle` here only when nothing was
 * queued for speech or the ask failed; otherwise the phase is left at
 * `speaking` for the rest of the component to wind down.
 *
 * Abort handling: whoever aborts the request owns the phase transition.
 * `cancel()` resets to `idle` synchronously, and the mic "interrupt" path
 * then moves on to `listening`, so this function must NOT touch the phase
 * when it later observes the abort. Doing so would overwrite `listening`
 * with `idle` while speech recognition is live.
 *
 * Starting a new ask while another is still in flight aborts the older one,
 * so two streams can never interleave into the same transcript.
 */
const runAskFlow = useCallback(
  (question: string) => {
    const trimmed = question.trim();
    if (!trimmed) {
      voiceLog("ask.skip-empty");
      phaseSafeSet("idle");
      return;
    }

    voiceLog("ask.start", transcriptDebug(trimmed));
    setLiveUserText(trimmed);
    onTranscriptUpdate?.({ role: "user", text: trimmed, partial: false });

    setAskError(null);
    assistantAccRef.current = "";
    setAssistantText("");
    if (phaseRef.current !== "thinking") {
      phaseSafeSet("thinking");
    }

    const run = async (): Promise<void> => {
      const eng = engineRef.current;
      const speech = ttsRef.current;
      eng.preload();
      if (eng.state === "missing-key" || eng.state === "error") {
        voiceLog("ask.engine-blocked", { state: eng.state });
        phaseSafeSet("idle");
        return;
      }

      // Supersede any request that is still streaming; its callbacks are
      // guarded by its own (now aborted) signal and become no-ops.
      abortRef.current?.abort();
      const ac = new AbortController();
      abortRef.current = ac;

      try {
        let buffer = "";
        let didEnqueue = false;
        // Guard every enqueue path against the abort signal. The stream
        // providers already surface buffered deltas after `abort()`; we
        // must not let those buffered deltas re-populate the TTS queue
        // we just cleared.
        const safeEnqueue = (chunk: string): void => {
          if (ac.signal.aborted) return;
          if (!speech.supported) return;
          speech.enqueue(chunk);
          didEnqueue = true;
          if (phaseRef.current === "thinking") {
            phaseSafeSet("speaking");
          }
        };

        const flush = (): void => {
          buffer = flushStreamBuffer(buffer, safeEnqueue);
        };

        const result = await eng.ask(trimmed, {
          signal: ac.signal,
          onToken: (e) => {
            if (ac.signal.aborted) return;
            buffer += e.text;
            assistantAccRef.current += e.text;
            const full = assistantAccRef.current;
            setAssistantText(full);
            onTranscriptUpdate?.({
              role: "assistant",
              text: full,
              partial: true,
            });
            flush();
          },
        });

        if (ac.signal.aborted) {
          voiceLog("ask.aborted");
          return;
        }

        // Speak whatever is left after the last complete sentence.
        flush();
        const tail = buffer.trim();
        if (tail && speech.supported && !ac.signal.aborted) {
          safeEnqueue(tail);
        }

        const finalText = result.answer.trim() || assistantAccRef.current;
        voiceLog("ask.done", transcriptDebug(finalText));
        assistantAccRef.current = finalText;
        setAssistantText(finalText);
        onTranscriptUpdate?.({
          role: "assistant",
          text: finalText,
          partial: false,
        });

        // Nothing was queued for speech, so no later step will leave
        // `thinking`/`speaking` for us.
        if (!speech.supported || !didEnqueue) {
          phaseSafeSet("idle");
        }
      } catch (err) {
        if (ac.signal.aborted) {
          // Same contract as the resolve-on-abort path above: the aborter
          // has already set the phase it wants, so leave it alone.
          voiceLog("ask.aborted");
          return;
        }
        const message =
          err instanceof Error ? err.message : "The assistant could not answer.";
        setAskError(message);
        voiceLog("ask.error", { message });
        phaseSafeSet("idle");
      } finally {
        // Only clear the slot if a newer request has not replaced us.
        if (abortRef.current === ac) {
          abortRef.current = null;
        }
      }
    };

    void run();
  },
  [onTranscriptUpdate, phaseSafeSet, setAssistantText],
);
411
+
412
+ /** Capture the current STT text (trimmed). Used during native graceful stop. */
413
+ const transcriptFromState = useCallback(() => {
414
+ return `${sttRef.current.finalTranscript} ${sttRef.current.interimTranscript}`
415
+ .trim();
416
+ }, []);
417
+
418
+ const submitTranscriptFromState = useCallback(
419
+ (reason: string) => {
420
+ const text = transcriptFromState();
421
+ voiceLog("stt.submit", {
422
+ reason,
423
+ mode: sttRef.current.mode,
424
+ ...transcriptDebug(text),
425
+ });
426
+ awaitingNativeStopRef.current = false;
427
+ clearNativeStopTimer();
428
+ sttRef.current.reset();
429
+ if (text) {
430
+ runAskFlow(text);
431
+ } else if (phaseRef.current !== "speaking") {
432
+ phaseSafeSet("idle");
433
+ }
434
+ },
435
+ [clearNativeStopTimer, phaseSafeSet, runAskFlow, transcriptFromState],
436
+ );
437
+
438
+ const startNativeStopDeadline = useCallback(
439
+ (reason: string) => {
440
+ clearNativeStopTimer();
441
+ nativeStopTimerRef.current = window.setTimeout(() => {
442
+ if (!awaitingNativeStopRef.current) return;
443
+ voiceLog("native.stop.timeout", {
444
+ reason,
445
+ finalLength: sttRef.current.finalTranscript.trim().length,
446
+ interimLength: sttRef.current.interimTranscript.trim().length,
447
+ });
448
+ submitTranscriptFromState(`${reason}:timeout`);
449
+ }, NATIVE_STOP_GRACE_MS);
450
+ },
451
+ [clearNativeStopTimer, submitTranscriptFromState],
452
+ );
453
+
454
+ const stopNativeAndWaitForFinal = useCallback(
455
+ (reason: string) => {
456
+ awaitingNativeStopRef.current = true;
457
+ voiceLog("native.stop.request", {
458
+ reason,
459
+ finalLength: sttRef.current.finalTranscript.trim().length,
460
+ interimLength: sttRef.current.interimTranscript.trim().length,
461
+ });
462
+ phaseSafeSet("thinking");
463
+ sttRef.current.stop();
464
+ startNativeStopDeadline(reason);
465
+ },
466
+ [phaseSafeSet, startNativeStopDeadline],
467
+ );
468
+
469
+ /**
470
+ * Silero VAD speech-end handler. Fires the moment the speaker pauses
471
+ * for the "redemption" window (~250 ms by default), which is far
472
+ * snappier than the legacy 600 ms trailing-silence timer. Uses the
473
+ * existing native-stop pipeline so any in-flight `onresult` still
474
+ * lands in state before we submit.
475
+ */
476
+ const onVadSpeechEnd = useCallback(() => {
477
+ if (phaseRef.current !== "listening") return;
478
+ if (awaitingNativeStopRef.current) return;
479
+ if (sttRef.current.mode === "native") {
480
+ const hasContent =
481
+ sttRef.current.finalTranscript.trim().length > 0 ||
482
+ sttRef.current.interimTranscript.trim().length > 0;
483
+ if (!hasContent) return;
484
+ voiceLog("vad.handoff", { mode: "native" });
485
+ clearPhraseTimer();
486
+ stopNativeAndWaitForFinal("vad-end");
487
+ } else {
488
+ // Whisper mode: VAD telling us the user stopped is the green light
489
+ // to stop recording. The Whisper hook will then transcribe the
490
+ // captured audio and a separate effect submits the result.
491
+ voiceLog("vad.handoff", { mode: "whisper" });
492
+ sttRef.current.stop();
493
+ }
494
+ }, [clearPhraseTimer, stopNativeAndWaitForFinal]);
495
+
496
+ const vad = useSileroVAD({ onSpeechEnd: onVadSpeechEnd });
497
+ vadRef.current = vad;
498
+
499
+ /**
500
+ * Drive VAD lifecycle off the listening phase. Starting it lazily on
501
+ * first listen avoids paying the ~1.6 MB Silero model download (and
502
+ * the AudioWorklet permission flow) for users who never use voice
503
+ * mode. We never tear it down between turns — keeping the worklet
504
+ * warm makes turn-taking feel instant.
505
+ */
506
+ useEffect(() => {
507
+ if (phase === "listening") {
508
+ void vad.start();
509
+ } else if (vad.listening) {
510
+ void vad.stop();
511
+ }
512
+ }, [phase, vad]);
513
+
514
+ /**
515
+ * Native (Web Speech API) auto-stop on phrase-end silence. Only runs in
516
+ * native mode — Whisper has no interim activity and is driven by mic
517
+ * taps. We stop recognition here, then wait for the native `onresult`
518
+ * to settle before submitting.
519
+ */
520
+ useEffect(() => {
521
+ if (stt.mode !== "native") {
522
+ clearPhraseTimer();
523
+ return;
524
+ }
525
+ if (phase !== "listening") {
526
+ clearPhraseTimer();
527
+ return;
528
+ }
529
+
530
+ const { interimTranscript, finalTranscript, listening } = sttRef.current;
531
+ if (!listening) {
532
+ return;
533
+ }
534
+ if (interimTranscript.trim()) {
535
+ clearPhraseTimer();
536
+ return;
537
+ }
538
+ if (!finalTranscript.trim()) {
539
+ clearPhraseTimer();
540
+ return;
541
+ }
542
+
543
+ clearPhraseTimer();
544
+ phraseTimerRef.current = window.setTimeout(() => {
545
+ if (phaseRef.current !== "listening") return;
546
+ if (sttRef.current.interimTranscript.trim()) return;
547
+ if (!sttRef.current.finalTranscript.trim()) return;
548
+
549
+ clearPhraseTimer();
550
+ stopNativeAndWaitForFinal("phrase-end");
551
+ }, PHRASE_END_MS);
552
+
553
+ return () => {
554
+ clearPhraseTimer();
555
+ };
556
+ }, [
557
+ stt.mode,
558
+ phase,
559
+ stt.listening,
560
+ stt.finalTranscript,
561
+ stt.interimTranscript,
562
+ clearPhraseTimer,
563
+ stopNativeAndWaitForFinal,
564
+ ]);
565
+
566
+ // Trailing native finalTranscript during the graceful-stop window.
567
+ useEffect(() => {
568
+ if (!awaitingNativeStopRef.current) return;
569
+ if (stt.mode !== "native") return;
570
+ if (phase !== "thinking") return;
571
+ if (!stt.finalTranscript.trim()) return;
572
+ submitTranscriptFromState("native-final");
573
+ }, [stt.mode, stt.finalTranscript, phase, submitTranscriptFromState]);
574
+
575
+ /**
576
+ * Whisper fallback: when the user manually stops recording, the hook
577
+ * runs Whisper asynchronously and eventually populates `finalTranscript`.
578
+ * We're already in `thinking` from the click handler, so when the
579
+ * transcript arrives (and we're not still transcribing), fire the ask
580
+ * flow with it.
581
+ */
582
+ useEffect(() => {
583
+ if (stt.mode !== "whisper") return;
584
+ if (phase !== "thinking") return;
585
+ if (stt.transcribing) return;
586
+ const text = stt.finalTranscript.trim();
587
+ if (!text) return;
588
+ voiceLog("whisper.final", transcriptDebug(text));
589
+ runAskFlow(text);
590
+ sttRef.current.reset();
591
+ }, [stt.mode, stt.transcribing, stt.finalTranscript, phase, runAskFlow]);
592
+
593
+ /**
594
+ * If Whisper transcription returns empty (user said nothing intelligible)
595
+ * drop back to idle rather than hanging in `thinking` forever.
596
+ */
597
+ useEffect(() => {
598
+ if (stt.mode !== "whisper") return;
599
+ if (phase !== "thinking") return;
600
+ if (stt.transcribing) return;
601
+ if (stt.listening) return;
602
+ if (stt.finalTranscript.trim()) return;
603
+ if (stt.error) return;
604
+ const id = window.setTimeout(() => {
605
+ if (
606
+ phaseRef.current === "thinking" &&
607
+ !sttRef.current.transcribing &&
608
+ !sttRef.current.listening &&
609
+ !sttRef.current.finalTranscript.trim()
610
+ ) {
611
+ voiceLog("whisper.empty");
612
+ phaseSafeSet("idle");
613
+ }
614
+ }, 600);
615
+ return () => window.clearTimeout(id);
616
+ }, [
617
+ stt.mode,
618
+ stt.transcribing,
619
+ stt.listening,
620
+ stt.finalTranscript,
621
+ stt.error,
622
+ phase,
623
+ phaseSafeSet,
624
+ ]);
625
+
626
+ useEffect(() => {
627
+ if (phase !== "speaking" || !tts.supported) {
628
+ return;
629
+ }
630
+ if (tts.speaking) {
631
+ return;
632
+ }
633
+ phaseSafeSet("idle");
634
+ }, [phase, tts.speaking, tts.supported, phaseSafeSet]);
635
+
636
+ const displayUserLine = useMemo(() => {
637
+ if (phase === "listening") {
638
+ const parts = [stt.finalTranscript, stt.interimTranscript].filter(Boolean);
639
+ return parts.join(" ").trim();
640
+ }
641
+ if (liveUserText) {
642
+ return liveUserText;
643
+ }
644
+ return "";
645
+ }, [phase, stt.finalTranscript, stt.interimTranscript, liveUserText]);
646
+
647
+ const micAriaLabel = useMemo(() => {
648
+ if (!stt.supported) {
649
+ return "Voice input not supported in this browser";
650
+ }
651
+ if (engineBlocked) {
652
+ return engine.statusMessage || "Assistant unavailable";
653
+ }
654
+ if (showEngineWait) {
655
+ return engine.statusMessage || "Preparing the assistant…";
656
+ }
657
+ if (phase === "listening") {
658
+ return stt.mode === "whisper"
659
+ ? "Stop recording and transcribe"
660
+ : "Stop listening";
661
+ }
662
+ if (phase === "speaking" || phase === "thinking") {
663
+ return "Interrupt and start listening";
664
+ }
665
+ return "Start voice input";
666
+ }, [
667
+ stt.supported,
668
+ stt.mode,
669
+ engineBlocked,
670
+ showEngineWait,
671
+ engine.statusMessage,
672
+ phase,
673
+ ]);
674
+
675
+ const handleMicClick = (): void => {
676
+ voiceLog("mic.click", {
677
+ phase,
678
+ mode: stt.mode,
679
+ listening: stt.listening,
680
+ engineState: engine.state,
681
+ });
682
+ setAskError(null);
683
+ if (!stt.supported || engineBlocked) {
684
+ voiceLog("mic.blocked", { supported: stt.supported, engineBlocked });
685
+ return;
686
+ }
687
+ if (showEngineWait) {
688
+ voiceLog("mic.wait-engine", { state: engine.state });
689
+ return;
690
+ }
691
+
692
+ if (phase === "speaking" || phase === "thinking") {
693
+ // Interrupt: cancel the in-flight run then start a fresh listen.
694
+ cancelRef.current();
695
+ sttRef.current.start({ lang: "en-US" });
696
+ phaseSafeSet("listening");
697
+ return;
698
+ }
699
+
700
+ if (phase === "listening") {
701
+ clearPhraseTimer();
702
+ if (stt.mode === "whisper") {
703
+ voiceLog("whisper.stop.request");
704
+ stt.stop();
705
+ phaseSafeSet("thinking");
706
+ return;
707
+ }
708
+ stopNativeAndWaitForFinal("manual-stop");
709
+ return;
710
+ }
711
+
712
+ engine.preload();
713
+ stt.reset();
714
+ stt.start({ lang: "en-US" });
715
+ setLiveUserText("");
716
+ setAssistantText("");
717
+ phaseSafeSet("listening");
718
+ };
719
+
720
/**
 * Keyboard activation for the mic button: Enter or Space behaves like a
 * click. The default action is always suppressed for these keys so the
 * browser's own key-to-click activation is not added on top of the explicit
 * call.
 *
 * Auto-repeat events (key held down) are ignored. Without that guard a held
 * Enter/Space would call `handleMicClick` on every repeat and rapidly flip
 * between starting and stopping the microphone.
 */
const handleMicKeyDown = (e: React.KeyboardEvent): void => {
  if (e.key !== "Enter" && e.key !== " ") {
    return;
  }
  e.preventDefault();
  if (e.repeat) {
    return;
  }
  handleMicClick();
};
726
+
727
+ const handleStopSpeech = (): void => {
728
+ cancelRef.current();
729
+ };
730
+
731
+ const micButtonDisabled =
732
+ micBlocked &&
733
+ phase !== "listening" &&
734
+ phase !== "speaking" &&
735
+ phase !== "thinking";
736
+
737
+ const caption = (() => {
738
+ if (!stt.supported) {
739
+ return "Voice not supported in this browser. Use the text input.";
740
+ }
741
+ if (stt.error) {
742
+ return stt.error;
743
+ }
744
+ if (engineBlocked) {
745
+ return engine.statusMessage || engine.error || "Assistant unavailable.";
746
+ }
747
+ if (showEngineWait) {
748
+ return engine.statusMessage || "Preparing the assistant…";
749
+ }
750
+ if (stt.mode === "whisper" && stt.modelLoadStatus === "loading") {
751
+ const pct = Math.round(stt.modelLoadProgress * 100);
752
+ return `First-time setup: downloading voice model (${pct}%). Cached after this.`;
753
+ }
754
+ if (askError) {
755
+ return askError;
756
+ }
757
+ if (phase === "listening") {
758
+ if (stt.mode === "whisper") {
759
+ return "Recording… tap the mic when you're done.";
760
+ }
761
+ return "Listening… finish your thought, or tap the mic to stop.";
762
+ }
763
+ if (phase === "thinking") {
764
+ if (stt.mode === "whisper" && stt.transcribing) {
765
+ return "Transcribing your question…";
766
+ }
767
+ return "Thinking…";
768
+ }
769
+ if (phase === "speaking") {
770
+ if (!tts.supported) {
771
+ return "Answer shown below — speech is not supported in this browser.";
772
+ }
773
+ return "Tap the mic to interrupt, or stop playback.";
774
+ }
775
+ if (stt.mode === "whisper" && stt.modelLoadStatus !== "ready") {
776
+ return "Tap to ask. The voice model (~40 MB) will download on first use.";
777
+ }
778
+ return "Tap to ask a question.";
779
+ })();
780
+
781
+ const micClassName = [
782
+ styles.micButton,
783
+ phase === "idle" && stt.supported && !engineBlocked && !showEngineWait
784
+ ? styles.micButtonIdle
785
+ : "",
786
+ phase === "listening" ? styles.micButtonListening : "",
787
+ phase === "speaking" ? styles.micButtonSpeaking : "",
788
+ ]
789
+ .filter(Boolean)
790
+ .join(" ");
791
+
792
+ return (
793
+ <div className={styles.wrap}>
794
+ <div
795
+ className={styles.transcript}
796
+ role="region"
797
+ aria-label="Voice mode transcript"
798
+ aria-live="polite"
799
+ >
800
+ {displayUserLine ? (
801
+ <>
802
+ <span className={styles.transcriptLabel}>You said</span>
803
+ {displayUserLine}
804
+ </>
805
+ ) : (
806
+ <span className={styles.transcriptLabel}>You said</span>
807
+ )}
808
+ <StreamingAssistantText ref={assistantTextRef} />
809
+ </div>
810
+
811
+ <div className={styles.controls}>
812
+ {phase === "thinking" ? (
813
+ <div className={styles.thinking}>
814
+ <div className={styles.spinner} aria-hidden />
815
+ <p className={styles.caption}>{caption}</p>
816
+ </div>
817
+ ) : null}
818
+
819
+ {phase === "speaking" && tts.supported ? (
820
+ <div className={styles.wave} aria-hidden>
821
+ <span className={styles.waveBar} />
822
+ <span className={styles.waveBar} />
823
+ <span className={styles.waveBar} />
824
+ <span className={styles.waveBar} />
825
+ <span className={styles.waveBar} />
826
+ </div>
827
+ ) : null}
828
+
829
+ <div className={styles.micOuter}>
830
+ {phase === "listening" ? (
831
+ <span
832
+ className={`${styles.micRing} ${styles.micRingListening}`}
833
+ aria-hidden
834
+ />
835
+ ) : null}
836
+ <button
837
+ type="button"
838
+ className={micClassName}
839
+ aria-label={micAriaLabel}
840
+ aria-pressed={phase === "listening"}
841
+ disabled={micButtonDisabled}
842
+ onClick={handleMicClick}
843
+ onKeyDown={handleMicKeyDown}
844
+ >
845
+ <MicIcon />
846
+ </button>
847
+ </div>
848
+
849
+ {phase !== "thinking" ? (
850
+ <p className={styles.caption}>{caption}</p>
851
+ ) : null}
852
+
853
+ {phase === "speaking" && tts.supported ? (
854
+ <button
855
+ type="button"
856
+ className={styles.stopButton}
857
+ onClick={handleStopSpeech}
858
+ >
859
+ Stop
860
+ </button>
861
+ ) : null}
862
+ </div>
863
+ </div>
864
+ );
865
+ });
866
+
867
+ export default VoiceMode;