@usecrow/ui 0.1.56 → 0.1.58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -1743,6 +1743,186 @@ function usePreviewCopilotStyles(previewStyles) {
1743
1743
  styles: mergeCopilotStyles(void 0, previewStyles)
1744
1744
  };
1745
1745
  }
1746
+ function useTTSOutput({
1747
+ backendUrl,
1748
+ voiceId = "YTpq7expH9539ERJ"
1749
+ }) {
1750
+ const [isSpeaking, setIsSpeaking] = useState(false);
1751
+ const [error, setError] = useState(null);
1752
+ const wsRef = useRef(null);
1753
+ const audioContextRef = useRef(null);
1754
+ const nextTimeRef = useRef(0);
1755
+ const streamCompleteRef = useRef(false);
1756
+ const completionCheckIntervalRef = useRef(null);
1757
+ const cleanupAudioContext = useCallback(() => {
1758
+ setIsSpeaking(false);
1759
+ if (audioContextRef.current && audioContextRef.current.state !== "closed") {
1760
+ audioContextRef.current.close();
1761
+ audioContextRef.current = null;
1762
+ }
1763
+ if (completionCheckIntervalRef.current) {
1764
+ clearInterval(completionCheckIntervalRef.current);
1765
+ completionCheckIntervalRef.current = null;
1766
+ }
1767
+ }, []);
1768
+ const closeWebSocket = useCallback(() => {
1769
+ if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
1770
+ try {
1771
+ wsRef.current.send(JSON.stringify({ type: "stop" }));
1772
+ wsRef.current.close();
1773
+ } catch (e) {
1774
+ }
1775
+ }
1776
+ wsRef.current = null;
1777
+ }, []);
1778
+ const cleanupTTS = useCallback(() => {
1779
+ setIsSpeaking(false);
1780
+ setError(null);
1781
+ closeWebSocket();
1782
+ cleanupAudioContext();
1783
+ }, [closeWebSocket, cleanupAudioContext]);
1784
+ const waitForAudioComplete = useCallback(() => {
1785
+ if (completionCheckIntervalRef.current) {
1786
+ clearInterval(completionCheckIntervalRef.current);
1787
+ }
1788
+ completionCheckIntervalRef.current = setInterval(() => {
1789
+ if (!audioContextRef.current) {
1790
+ if (completionCheckIntervalRef.current) {
1791
+ clearInterval(completionCheckIntervalRef.current);
1792
+ completionCheckIntervalRef.current = null;
1793
+ }
1794
+ return;
1795
+ }
1796
+ const now = audioContextRef.current.currentTime;
1797
+ if (now >= nextTimeRef.current) {
1798
+ if (completionCheckIntervalRef.current) {
1799
+ clearInterval(completionCheckIntervalRef.current);
1800
+ completionCheckIntervalRef.current = null;
1801
+ }
1802
+ cleanupAudioContext();
1803
+ }
1804
+ }, 100);
1805
+ }, [cleanupAudioContext]);
1806
+ const playAudioChunk = useCallback((base64Audio) => {
1807
+ if (!audioContextRef.current || audioContextRef.current.state === "closed") {
1808
+ console.error("TTS: AudioContext not available");
1809
+ return;
1810
+ }
1811
+ try {
1812
+ const binary = atob(base64Audio);
1813
+ const bytes = new Uint8Array(binary.length);
1814
+ for (let i = 0; i < binary.length; i++) {
1815
+ bytes[i] = binary.charCodeAt(i);
1816
+ }
1817
+ const pcm16 = new Int16Array(bytes.buffer);
1818
+ const float32 = new Float32Array(pcm16.length);
1819
+ for (let i = 0; i < pcm16.length; i++) {
1820
+ float32[i] = pcm16[i] / 32768;
1821
+ }
1822
+ const buffer = audioContextRef.current.createBuffer(1, float32.length, 48e3);
1823
+ buffer.getChannelData(0).set(float32);
1824
+ const source = audioContextRef.current.createBufferSource();
1825
+ source.buffer = buffer;
1826
+ source.connect(audioContextRef.current.destination);
1827
+ const now = audioContextRef.current.currentTime;
1828
+ if (nextTimeRef.current < now) {
1829
+ nextTimeRef.current = now;
1830
+ }
1831
+ source.start(nextTimeRef.current);
1832
+ nextTimeRef.current += buffer.duration;
1833
+ } catch (err) {
1834
+ console.error("TTS: Error playing audio chunk:", err);
1835
+ setError(err instanceof Error ? err.message : "Failed to play audio chunk");
1836
+ }
1837
+ }, []);
1838
+ const speak = useCallback(
1839
+ (text) => {
1840
+ console.log("[TTS Hook] speak called with:", text.substring(0, 50), "backendUrl:", backendUrl);
1841
+ if (!text.trim()) {
1842
+ console.log("[TTS Hook] No text to speak");
1843
+ setError("No text to speak");
1844
+ return;
1845
+ }
1846
+ if (isSpeaking || wsRef.current) {
1847
+ console.log("[TTS Hook] Already playing");
1848
+ setError("Already playing, stop first");
1849
+ return;
1850
+ }
1851
+ setError(null);
1852
+ nextTimeRef.current = 0;
1853
+ streamCompleteRef.current = false;
1854
+ try {
1855
+ audioContextRef.current = new (window.AudioContext || window.webkitAudioContext)({
1856
+ sampleRate: 48e3
1857
+ });
1858
+ const url = backendUrl.startsWith("http") ? backendUrl.replace(/^http/, "ws") : backendUrl;
1859
+ const wsUrl = `${url}/api/tts/stream`;
1860
+ console.log("[TTS Hook] Connecting to:", wsUrl);
1861
+ const ws = new WebSocket(wsUrl);
1862
+ wsRef.current = ws;
1863
+ ws.onopen = () => {
1864
+ ws.send(
1865
+ JSON.stringify({
1866
+ type: "setup",
1867
+ voice_id: voiceId,
1868
+ output_format: "pcm"
1869
+ })
1870
+ );
1871
+ };
1872
+ ws.onmessage = (event) => {
1873
+ const msg = JSON.parse(event.data);
1874
+ if (msg.type === "ready") {
1875
+ ws.send(JSON.stringify({ type: "text", text }));
1876
+ ws.send(JSON.stringify({ type: "end_of_stream" }));
1877
+ } else if (msg.type === "audio") {
1878
+ playAudioChunk(msg.audio);
1879
+ } else if (msg.type === "done") {
1880
+ streamCompleteRef.current = true;
1881
+ closeWebSocket();
1882
+ waitForAudioComplete();
1883
+ } else if (msg.type === "error") {
1884
+ setError(msg.message || "TTS error");
1885
+ cleanupTTS();
1886
+ }
1887
+ };
1888
+ ws.onerror = () => {
1889
+ setError("WebSocket error");
1890
+ cleanupTTS();
1891
+ };
1892
+ ws.onclose = () => {
1893
+ wsRef.current = null;
1894
+ };
1895
+ setIsSpeaking(true);
1896
+ } catch (err) {
1897
+ setError(err instanceof Error ? err.message : "Failed to start TTS");
1898
+ cleanupTTS();
1899
+ }
1900
+ },
1901
+ [
1902
+ isSpeaking,
1903
+ backendUrl,
1904
+ voiceId,
1905
+ playAudioChunk,
1906
+ closeWebSocket,
1907
+ waitForAudioComplete,
1908
+ cleanupTTS
1909
+ ]
1910
+ );
1911
+ const stop = useCallback(() => {
1912
+ cleanupTTS();
1913
+ }, [cleanupTTS]);
1914
+ useEffect(() => {
1915
+ return () => {
1916
+ cleanupTTS();
1917
+ };
1918
+ }, [cleanupTTS]);
1919
+ return {
1920
+ speak,
1921
+ stop,
1922
+ isSpeaking,
1923
+ error
1924
+ };
1925
+ }
1746
1926
  var WidgetStyleContext = createContext(null);
1747
1927
  function WidgetStyleProvider({
1748
1928
  children,
@@ -2730,80 +2910,176 @@ var ModelSelector = ({
2730
2910
  ] }, provider)) })
2731
2911
  ] });
2732
2912
  };
2733
- var getSpeechRecognition = () => {
2734
- if (typeof window === "undefined") return null;
2735
- return window.SpeechRecognition || window.webkitSpeechRecognition || null;
2913
+ var isMediaRecorderSupported = () => {
2914
+ if (typeof window === "undefined") return false;
2915
+ return !!(navigator.mediaDevices && typeof navigator.mediaDevices.getUserMedia === "function" && (window.AudioContext || window.webkitAudioContext));
2736
2916
  };
2737
- function useVoiceInput(options = {}) {
2738
- const { lang, silenceTimeoutMs } = options;
2739
- const [supported] = useState(() => getSpeechRecognition() !== null);
2917
+ function useVoiceInput(options) {
2918
+ const { backendUrl, silenceTimeoutMs } = options;
2919
+ const [supported] = useState(() => isMediaRecorderSupported());
2740
2920
  const [isRecording, setIsRecording] = useState(false);
2741
2921
  const [transcript, setTranscript] = useState("");
2742
- const recognitionRef = useRef(null);
2922
+ const [error, setError] = useState(null);
2923
+ const wsRef = useRef(null);
2924
+ const streamRef = useRef(null);
2925
+ const audioContextRef = useRef(null);
2926
+ const processorRef = useRef(null);
2743
2927
  const silenceTimerRef = useRef(null);
2744
- const finalTranscriptRef = useRef("");
2928
+ const transcriptRef = useRef("");
2929
+ const interimRef = useRef("");
2930
+ const isRecordingRef = useRef(false);
2745
2931
  const clearSilenceTimer = useCallback(() => {
2746
2932
  if (silenceTimerRef.current) {
2747
2933
  clearTimeout(silenceTimerRef.current);
2748
2934
  silenceTimerRef.current = null;
2749
2935
  }
2750
2936
  }, []);
2751
- const stop = useCallback(() => {
2937
+ const cleanup = useCallback(() => {
2752
2938
  clearSilenceTimer();
2753
- if (recognitionRef.current) {
2754
- recognitionRef.current.stop();
2939
+ isRecordingRef.current = false;
2940
+ if (interimRef.current) {
2941
+ transcriptRef.current += interimRef.current + " ";
2942
+ setTranscript(transcriptRef.current.trim());
2943
+ interimRef.current = "";
2755
2944
  }
2945
+ if (wsRef.current) {
2946
+ try {
2947
+ if (wsRef.current.readyState === WebSocket.OPEN) {
2948
+ wsRef.current.send(JSON.stringify({ type: "stop" }));
2949
+ }
2950
+ wsRef.current.close();
2951
+ } catch (e) {
2952
+ }
2953
+ wsRef.current = null;
2954
+ }
2955
+ if (processorRef.current) {
2956
+ processorRef.current.disconnect();
2957
+ processorRef.current = null;
2958
+ }
2959
+ if (audioContextRef.current) {
2960
+ audioContextRef.current.close();
2961
+ audioContextRef.current = null;
2962
+ }
2963
+ if (streamRef.current) {
2964
+ streamRef.current.getTracks().forEach((track) => track.stop());
2965
+ streamRef.current = null;
2966
+ }
2967
+ setIsRecording(false);
2756
2968
  }, [clearSilenceTimer]);
2969
+ const stop = useCallback(() => {
2970
+ cleanup();
2971
+ }, [cleanup]);
2757
2972
  const clear = useCallback(() => {
2758
2973
  setTranscript("");
2759
- finalTranscriptRef.current = "";
2974
+ transcriptRef.current = "";
2975
+ setError(null);
2760
2976
  }, []);
2761
- const start = useCallback(() => {
2762
- const SpeechRecognition = getSpeechRecognition();
2763
- if (!SpeechRecognition) return;
2764
- if (recognitionRef.current) {
2765
- recognitionRef.current.abort();
2766
- }
2767
- finalTranscriptRef.current = "";
2768
- setTranscript("");
2769
- const recognition = new SpeechRecognition();
2770
- recognition.continuous = true;
2771
- recognition.interimResults = true;
2772
- recognition.lang = lang || navigator.language || "en-US";
2773
- recognition.onresult = (event) => {
2774
- let interim = "";
2775
- let final = "";
2776
- for (let i = 0; i < event.results.length; i++) {
2777
- const result = event.results[i];
2778
- if (result.isFinal) {
2779
- final += result[0].transcript;
2780
- } else {
2781
- interim += result[0].transcript;
2782
- }
2977
+ const startAudioCapture = useCallback(() => {
2978
+ if (!streamRef.current || !wsRef.current) return;
2979
+ audioContextRef.current = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 24e3 });
2980
+ const source = audioContextRef.current.createMediaStreamSource(
2981
+ streamRef.current
2982
+ );
2983
+ processorRef.current = audioContextRef.current.createScriptProcessor(
2984
+ 4096,
2985
+ 1,
2986
+ 1
2987
+ );
2988
+ processorRef.current.onaudioprocess = (event) => {
2989
+ if (!isRecordingRef.current || !wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) {
2990
+ return;
2783
2991
  }
2784
- finalTranscriptRef.current = final;
2785
- setTranscript(final + interim);
2786
- if (silenceTimeoutMs) {
2787
- clearSilenceTimer();
2788
- silenceTimerRef.current = setTimeout(() => {
2789
- stop();
2790
- }, silenceTimeoutMs);
2992
+ const inputData = event.inputBuffer.getChannelData(0);
2993
+ const pcm16 = new Int16Array(inputData.length);
2994
+ for (let i = 0; i < inputData.length; i++) {
2995
+ const s = Math.max(-1, Math.min(1, inputData[i]));
2996
+ pcm16[i] = s < 0 ? s * 32768 : s * 32767;
2791
2997
  }
2792
- };
2793
- recognition.onerror = (event) => {
2794
- if (event.error !== "aborted") {
2795
- console.warn("[Crow Voice] Speech recognition error:", event.error);
2998
+ const bytes = new Uint8Array(pcm16.buffer);
2999
+ let binary = "";
3000
+ for (let i = 0; i < bytes.length; i++) {
3001
+ binary += String.fromCharCode(bytes[i]);
2796
3002
  }
2797
- setIsRecording(false);
2798
- };
2799
- recognition.onend = () => {
2800
- setIsRecording(false);
2801
- recognitionRef.current = null;
3003
+ wsRef.current.send(
3004
+ JSON.stringify({ type: "audio", data: btoa(binary) })
3005
+ );
2802
3006
  };
2803
- recognitionRef.current = recognition;
2804
- recognition.start();
2805
- setIsRecording(true);
2806
- }, [lang, silenceTimeoutMs, clearSilenceTimer, stop]);
3007
+ source.connect(processorRef.current);
3008
+ processorRef.current.connect(audioContextRef.current.destination);
3009
+ }, []);
3010
+ const start = useCallback(async () => {
3011
+ if (!supported) {
3012
+ setError("Audio recording not supported in this browser");
3013
+ return;
3014
+ }
3015
+ setError(null);
3016
+ transcriptRef.current = "";
3017
+ setTranscript("");
3018
+ try {
3019
+ streamRef.current = await navigator.mediaDevices.getUserMedia({
3020
+ audio: {
3021
+ echoCancellation: true,
3022
+ noiseSuppression: true,
3023
+ sampleRate: 24e3
3024
+ }
3025
+ });
3026
+ const wsProtocol = backendUrl.startsWith("https") ? "wss" : "ws";
3027
+ const wsHost = backendUrl.replace(/^https?:\/\//, "");
3028
+ const wsUrl = `${wsProtocol}://${wsHost}/api/stt/stream`;
3029
+ wsRef.current = new WebSocket(wsUrl);
3030
+ wsRef.current.onopen = () => {
3031
+ wsRef.current?.send(JSON.stringify({ type: "setup" }));
3032
+ };
3033
+ wsRef.current.onmessage = (event) => {
3034
+ const msg = JSON.parse(event.data);
3035
+ if (msg.type === "ready") {
3036
+ startAudioCapture();
3037
+ isRecordingRef.current = true;
3038
+ setIsRecording(true);
3039
+ } else if (msg.type === "transcript") {
3040
+ if (msg.is_final && msg.text) {
3041
+ transcriptRef.current += msg.text + " ";
3042
+ interimRef.current = "";
3043
+ setTranscript(transcriptRef.current.trim());
3044
+ if (silenceTimeoutMs) {
3045
+ clearSilenceTimer();
3046
+ silenceTimerRef.current = setTimeout(() => {
3047
+ stop();
3048
+ }, silenceTimeoutMs);
3049
+ }
3050
+ } else if (!msg.is_final && msg.text) {
3051
+ interimRef.current = msg.text;
3052
+ setTranscript((transcriptRef.current + msg.text).trim());
3053
+ }
3054
+ } else if (msg.type === "error") {
3055
+ setError(msg.message || "STT error");
3056
+ cleanup();
3057
+ }
3058
+ };
3059
+ wsRef.current.onerror = () => {
3060
+ setError("WebSocket connection error");
3061
+ cleanup();
3062
+ };
3063
+ wsRef.current.onclose = () => {
3064
+ if (isRecordingRef.current) {
3065
+ cleanup();
3066
+ }
3067
+ };
3068
+ } catch (err) {
3069
+ setError(
3070
+ err instanceof Error ? err.message : "Failed to start recording"
3071
+ );
3072
+ cleanup();
3073
+ }
3074
+ }, [
3075
+ supported,
3076
+ backendUrl,
3077
+ startAudioCapture,
3078
+ silenceTimeoutMs,
3079
+ clearSilenceTimer,
3080
+ stop,
3081
+ cleanup
3082
+ ]);
2807
3083
  const toggle = useCallback(() => {
2808
3084
  if (isRecording) {
2809
3085
  stop();
@@ -2813,13 +3089,19 @@ function useVoiceInput(options = {}) {
2813
3089
  }, [isRecording, start, stop]);
2814
3090
  useEffect(() => {
2815
3091
  return () => {
2816
- clearSilenceTimer();
2817
- if (recognitionRef.current) {
2818
- recognitionRef.current.abort();
2819
- }
3092
+ cleanup();
2820
3093
  };
2821
- }, [clearSilenceTimer]);
2822
- return { supported, isRecording, transcript, start, stop, toggle, clear };
3094
+ }, [cleanup]);
3095
+ return {
3096
+ supported,
3097
+ isRecording,
3098
+ transcript,
3099
+ error,
3100
+ start,
3101
+ stop,
3102
+ toggle,
3103
+ clear
3104
+ };
2823
3105
  }
2824
3106
  var Textarea = React3.forwardRef(
2825
3107
  ({ className, ...props }, ref) => /* @__PURE__ */ jsx(
@@ -3013,11 +3295,23 @@ var PromptInputBox = React3.forwardRef(
3013
3295
  selectedModel = "gpt-4o",
3014
3296
  onModelChange,
3015
3297
  availableModels = [],
3016
- highlighted = false
3298
+ highlighted = false,
3299
+ backendUrl = "",
3300
+ triggerVoiceRecording = 0
3017
3301
  }, ref) => {
3018
3302
  const [input, setInput] = React3.useState("");
3019
3303
  const promptBoxRef = React3.useRef(null);
3020
- const voice = useVoiceInput();
3304
+ const voice = useVoiceInput({ backendUrl, silenceTimeoutMs: 1500 });
3305
+ const lastTriggerRef = React3.useRef(0);
3306
+ const voiceRef = React3.useRef(voice);
3307
+ voiceRef.current = voice;
3308
+ React3.useEffect(() => {
3309
+ if (triggerVoiceRecording > 0 && triggerVoiceRecording !== lastTriggerRef.current) {
3310
+ console.log("[Voice] Auto-starting recording from trigger");
3311
+ voiceRef.current.start();
3312
+ }
3313
+ lastTriggerRef.current = triggerVoiceRecording;
3314
+ }, [triggerVoiceRecording]);
3021
3315
  React3.useEffect(() => {
3022
3316
  if (voice.isRecording && voice.transcript) {
3023
3317
  setInput(voice.transcript);
@@ -3026,11 +3320,16 @@ var PromptInputBox = React3.forwardRef(
3026
3320
  const wasRecordingRef = React3.useRef(false);
3027
3321
  React3.useEffect(() => {
3028
3322
  if (wasRecordingRef.current && !voice.isRecording && voice.transcript) {
3029
- setInput(voice.transcript);
3323
+ const messageToSend = voice.transcript.trim();
3324
+ if (messageToSend) {
3325
+ console.log("[Voice] Auto-sending:", messageToSend);
3326
+ onSend(messageToSend);
3327
+ setInput("");
3328
+ }
3030
3329
  voice.clear();
3031
3330
  }
3032
3331
  wasRecordingRef.current = voice.isRecording;
3033
- }, [voice.isRecording, voice.transcript, voice.clear]);
3332
+ }, [voice.isRecording, voice.transcript, voice.clear, onSend]);
3034
3333
  const handleSubmit = () => {
3035
3334
  if (input.trim()) {
3036
3335
  if (voice.isRecording) {
@@ -3513,7 +3812,8 @@ function CrowWidget({
3513
3812
  getIdentityToken,
3514
3813
  context,
3515
3814
  toolRenderers,
3516
- language
3815
+ language,
3816
+ customCss
3517
3817
  }) {
3518
3818
  const effectiveGetIdentityToken = getIdentityToken || window.__crow_identity_token_fetcher;
3519
3819
  const effectiveOnToolResult = onToolResult || window.__crow_on_tool_result;
@@ -3640,6 +3940,25 @@ function CrowWidget({
3640
3940
  setShouldRestoreHistory(true);
3641
3941
  }
3642
3942
  });
3943
+ const tts = useTTSOutput({ backendUrl: apiUrl });
3944
+ const ttsRef = useRef(tts);
3945
+ ttsRef.current = tts;
3946
+ const wasLoadingRef = useRef(false);
3947
+ useEffect(() => {
3948
+ console.log("[Crow TTS] isLoading changed:", chat.isLoading, "wasLoading:", wasLoadingRef.current);
3949
+ if (wasLoadingRef.current && !chat.isLoading) {
3950
+ const lastMessage = [...chat.messages].reverse().find((m) => m.isBot);
3951
+ console.log("[Crow TTS] Last bot message:", lastMessage?.content?.substring(0, 50));
3952
+ if (lastMessage?.content) {
3953
+ const textToSpeak = lastMessage.content.replace(/\*\*/g, "").replace(/\*/g, "").replace(/`[^`]+`/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").trim();
3954
+ if (textToSpeak) {
3955
+ console.log("[Crow TTS] Speaking:", textToSpeak.substring(0, 50));
3956
+ ttsRef.current.speak(textToSpeak);
3957
+ }
3958
+ }
3959
+ }
3960
+ wasLoadingRef.current = chat.isLoading;
3961
+ }, [chat.isLoading, chat.messages]);
3643
3962
  useEffect(() => {
3644
3963
  if (initialSuggestions.length > 0 && chat.suggestedActions.length === 0) {
3645
3964
  chat.setSuggestedActions(initialSuggestions);
@@ -4034,12 +4353,20 @@ function CrowWidget({
4034
4353
  isLoading: chat.isLoading,
4035
4354
  showStopButton: isBrowserUseActive || !!askUserResolver || !!pendingConfirmation,
4036
4355
  highlighted: !!askUserResolver,
4037
- className: "crow-backdrop-blur-md"
4356
+ className: "crow-backdrop-blur-md",
4357
+ backendUrl: apiUrl
4038
4358
  }
4039
4359
  )
4040
4360
  ] })
4041
4361
  ] });
4042
- return /* @__PURE__ */ jsx(ShadowContainer, { styles: WIDGET_CSS, children: /* @__PURE__ */ jsx("div", { className: "crow-widget-root", style: cssVars, children: /* @__PURE__ */ jsxs(
4362
+ const combinedStyles = useMemo(
4363
+ () => customCss ? `${WIDGET_CSS}
4364
+
4365
+ /* Custom CSS */
4366
+ ${customCss}` : WIDGET_CSS,
4367
+ [customCss]
4368
+ );
4369
+ return /* @__PURE__ */ jsx(ShadowContainer, { styles: combinedStyles, children: /* @__PURE__ */ jsx("div", { className: "crow-widget-root", style: cssVars, children: /* @__PURE__ */ jsxs(
4043
4370
  WidgetStyleProvider,
4044
4371
  {
4045
4372
  styles,