@usecrow/ui 0.1.57 → 0.1.58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -1769,6 +1769,186 @@ function usePreviewCopilotStyles(previewStyles) {
1769
1769
  styles: mergeCopilotStyles(void 0, previewStyles)
1770
1770
  };
1771
1771
  }
1772
// Streams text-to-speech audio from the backend over a WebSocket and plays
// the returned 48 kHz mono PCM16 chunks gaplessly via the Web Audio API.
//
// Params:
//   backendUrl - http(s) or ws(s) base URL of the TTS backend; an http(s)
//                prefix is rewritten to ws(s) before connecting.
//   voiceId    - backend voice identifier (server-defined default voice).
// Returns { speak(text), stop(), isSpeaking, error }.
function useTTSOutput({
  backendUrl,
  voiceId = "YTpq7expH9539ERJ"
}) {
  const [isSpeaking, setIsSpeaking] = React3.useState(false);
  const [error, setError] = React3.useState(null);
  const wsRef = React3.useRef(null);
  const audioContextRef = React3.useRef(null);
  // AudioContext timestamp at which the next chunk should start, so chunks
  // are scheduled back-to-back without gaps.
  const nextTimeRef = React3.useRef(0);
  // Poller that detects when all scheduled audio has finished playing.
  const completionCheckIntervalRef = React3.useRef(null);
  const cleanupAudioContext = React3.useCallback(() => {
    setIsSpeaking(false);
    if (audioContextRef.current) {
      // Close only if not already closed, but always drop the reference so
      // the completion poller never inspects a dead context.
      if (audioContextRef.current.state !== "closed") {
        audioContextRef.current.close();
      }
      audioContextRef.current = null;
    }
    if (completionCheckIntervalRef.current) {
      clearInterval(completionCheckIntervalRef.current);
      completionCheckIntervalRef.current = null;
    }
  }, []);
  const closeWebSocket = React3.useCallback(() => {
    if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
      try {
        wsRef.current.send(JSON.stringify({ type: "stop" }));
        wsRef.current.close();
      } catch (e) {
        // Best-effort shutdown: the socket may already be closing.
      }
    }
    wsRef.current = null;
  }, []);
  // Full teardown: stops playback, closes the socket, clears any error.
  const cleanupTTS = React3.useCallback(() => {
    setIsSpeaking(false);
    setError(null);
    closeWebSocket();
    cleanupAudioContext();
  }, [closeWebSocket, cleanupAudioContext]);
  // After the stream is done, poll until the playback cursor passes the last
  // scheduled chunk, then tear down the audio context.
  const waitForAudioComplete = React3.useCallback(() => {
    if (completionCheckIntervalRef.current) {
      clearInterval(completionCheckIntervalRef.current);
    }
    completionCheckIntervalRef.current = setInterval(() => {
      if (!audioContextRef.current) {
        if (completionCheckIntervalRef.current) {
          clearInterval(completionCheckIntervalRef.current);
          completionCheckIntervalRef.current = null;
        }
        return;
      }
      const now = audioContextRef.current.currentTime;
      if (now >= nextTimeRef.current) {
        if (completionCheckIntervalRef.current) {
          clearInterval(completionCheckIntervalRef.current);
          completionCheckIntervalRef.current = null;
        }
        cleanupAudioContext();
      }
    }, 100);
  }, [cleanupAudioContext]);
  // Decodes a base64 PCM16 chunk and schedules it immediately after the
  // previously queued chunk.
  const playAudioChunk = React3.useCallback((base64Audio) => {
    if (!audioContextRef.current || audioContextRef.current.state === "closed") {
      console.error("TTS: AudioContext not available");
      return;
    }
    try {
      const binary = atob(base64Audio);
      const bytes = new Uint8Array(binary.length);
      for (let i = 0; i < binary.length; i++) {
        bytes[i] = binary.charCodeAt(i);
      }
      // Reinterpret the bytes as little-endian 16-bit samples, then scale
      // to the [-1, 1) float range the Web Audio API expects.
      const pcm16 = new Int16Array(bytes.buffer);
      const float32 = new Float32Array(pcm16.length);
      for (let i = 0; i < pcm16.length; i++) {
        float32[i] = pcm16[i] / 32768;
      }
      const buffer = audioContextRef.current.createBuffer(1, float32.length, 48e3);
      buffer.getChannelData(0).set(float32);
      const source = audioContextRef.current.createBufferSource();
      source.buffer = buffer;
      source.connect(audioContextRef.current.destination);
      const now = audioContextRef.current.currentTime;
      // If playback has caught up (or this is the first chunk), start now.
      if (nextTimeRef.current < now) {
        nextTimeRef.current = now;
      }
      source.start(nextTimeRef.current);
      nextTimeRef.current += buffer.duration;
    } catch (err) {
      console.error("TTS: Error playing audio chunk:", err);
      setError(err instanceof Error ? err.message : "Failed to play audio chunk");
    }
  }, []);
  const speak = React3.useCallback(
    (text) => {
      console.log("[TTS Hook] speak called with:", text.substring(0, 50), "backendUrl:", backendUrl);
      if (!text.trim()) {
        console.log("[TTS Hook] No text to speak");
        setError("No text to speak");
        return;
      }
      if (isSpeaking || wsRef.current) {
        console.log("[TTS Hook] Already playing");
        setError("Already playing, stop first");
        return;
      }
      setError(null);
      nextTimeRef.current = 0;
      try {
        audioContextRef.current = new (window.AudioContext || window.webkitAudioContext)({
          sampleRate: 48e3
        });
        // http -> ws / https -> wss; already-ws(s) URLs pass through as-is.
        const url = backendUrl.startsWith("http") ? backendUrl.replace(/^http/, "ws") : backendUrl;
        const wsUrl = `${url}/api/tts/stream`;
        console.log("[TTS Hook] Connecting to:", wsUrl);
        const ws = new WebSocket(wsUrl);
        wsRef.current = ws;
        ws.onopen = () => {
          ws.send(
            JSON.stringify({
              type: "setup",
              voice_id: voiceId,
              output_format: "pcm"
            })
          );
        };
        ws.onmessage = (event) => {
          const msg = JSON.parse(event.data);
          if (msg.type === "ready") {
            ws.send(JSON.stringify({ type: "text", text }));
            ws.send(JSON.stringify({ type: "end_of_stream" }));
          } else if (msg.type === "audio") {
            playAudioChunk(msg.audio);
          } else if (msg.type === "done") {
            closeWebSocket();
            waitForAudioComplete();
          } else if (msg.type === "error") {
            // Clean up first: cleanupTTS() resets error to null, so setting
            // the error afterwards keeps it visible to the caller.
            cleanupTTS();
            setError(msg.message || "TTS error");
          }
        };
        ws.onerror = () => {
          cleanupTTS();
          setError("WebSocket error");
        };
        ws.onclose = () => {
          wsRef.current = null;
        };
        setIsSpeaking(true);
      } catch (err) {
        cleanupTTS();
        setError(err instanceof Error ? err.message : "Failed to start TTS");
      }
    },
    [
      isSpeaking,
      backendUrl,
      voiceId,
      playAudioChunk,
      closeWebSocket,
      waitForAudioComplete,
      cleanupTTS
    ]
  );
  const stop = React3.useCallback(() => {
    cleanupTTS();
  }, [cleanupTTS]);
  // Tear everything down on unmount.
  React3.useEffect(() => {
    return () => {
      cleanupTTS();
    };
  }, [cleanupTTS]);
  return {
    speak,
    stop,
    isSpeaking,
    error
  };
}
1772
1952
  var WidgetStyleContext = React3.createContext(null);
1773
1953
  function WidgetStyleProvider({
1774
1954
  children,
@@ -2756,80 +2936,176 @@ var ModelSelector = ({
2756
2936
  ] }, provider)) })
2757
2937
  ] });
2758
2938
  };
2759
// Feature-detect everything the voice-input hook needs: a browser
// environment, getUserMedia for microphone capture, and the Web Audio API
// (standard or webkit-prefixed).
var isMediaRecorderSupported = () => {
  if (typeof window === "undefined") return false;
  const mediaDevices = navigator.mediaDevices;
  const canCapture = !!mediaDevices && typeof mediaDevices.getUserMedia === "function";
  const hasAudioContext = !!(window.AudioContext || window.webkitAudioContext);
  return canCapture && hasAudioContext;
};
2763
- function useVoiceInput(options = {}) {
2764
- const { lang, silenceTimeoutMs } = options;
2765
- const [supported] = React3.useState(() => getSpeechRecognition() !== null);
2943
+ function useVoiceInput(options) {
2944
+ const { backendUrl, silenceTimeoutMs } = options;
2945
+ const [supported] = React3.useState(() => isMediaRecorderSupported());
2766
2946
  const [isRecording, setIsRecording] = React3.useState(false);
2767
2947
  const [transcript, setTranscript] = React3.useState("");
2768
- const recognitionRef = React3.useRef(null);
2948
+ const [error, setError] = React3.useState(null);
2949
+ const wsRef = React3.useRef(null);
2950
+ const streamRef = React3.useRef(null);
2951
+ const audioContextRef = React3.useRef(null);
2952
+ const processorRef = React3.useRef(null);
2769
2953
  const silenceTimerRef = React3.useRef(null);
2770
- const finalTranscriptRef = React3.useRef("");
2954
+ const transcriptRef = React3.useRef("");
2955
+ const interimRef = React3.useRef("");
2956
+ const isRecordingRef = React3.useRef(false);
2771
2957
  const clearSilenceTimer = React3.useCallback(() => {
2772
2958
  if (silenceTimerRef.current) {
2773
2959
  clearTimeout(silenceTimerRef.current);
2774
2960
  silenceTimerRef.current = null;
2775
2961
  }
2776
2962
  }, []);
2777
- const stop = React3.useCallback(() => {
2963
+ const cleanup = React3.useCallback(() => {
2778
2964
  clearSilenceTimer();
2779
- if (recognitionRef.current) {
2780
- recognitionRef.current.stop();
2965
+ isRecordingRef.current = false;
2966
+ if (interimRef.current) {
2967
+ transcriptRef.current += interimRef.current + " ";
2968
+ setTranscript(transcriptRef.current.trim());
2969
+ interimRef.current = "";
2781
2970
  }
2971
+ if (wsRef.current) {
2972
+ try {
2973
+ if (wsRef.current.readyState === WebSocket.OPEN) {
2974
+ wsRef.current.send(JSON.stringify({ type: "stop" }));
2975
+ }
2976
+ wsRef.current.close();
2977
+ } catch (e) {
2978
+ }
2979
+ wsRef.current = null;
2980
+ }
2981
+ if (processorRef.current) {
2982
+ processorRef.current.disconnect();
2983
+ processorRef.current = null;
2984
+ }
2985
+ if (audioContextRef.current) {
2986
+ audioContextRef.current.close();
2987
+ audioContextRef.current = null;
2988
+ }
2989
+ if (streamRef.current) {
2990
+ streamRef.current.getTracks().forEach((track) => track.stop());
2991
+ streamRef.current = null;
2992
+ }
2993
+ setIsRecording(false);
2782
2994
  }, [clearSilenceTimer]);
2995
+ const stop = React3.useCallback(() => {
2996
+ cleanup();
2997
+ }, [cleanup]);
2783
2998
  const clear = React3.useCallback(() => {
2784
2999
  setTranscript("");
2785
- finalTranscriptRef.current = "";
3000
+ transcriptRef.current = "";
3001
+ setError(null);
2786
3002
  }, []);
2787
- const start = React3.useCallback(() => {
2788
- const SpeechRecognition = getSpeechRecognition();
2789
- if (!SpeechRecognition) return;
2790
- if (recognitionRef.current) {
2791
- recognitionRef.current.abort();
2792
- }
2793
- finalTranscriptRef.current = "";
2794
- setTranscript("");
2795
- const recognition = new SpeechRecognition();
2796
- recognition.continuous = true;
2797
- recognition.interimResults = true;
2798
- recognition.lang = lang || navigator.language || "en-US";
2799
- recognition.onresult = (event) => {
2800
- let interim = "";
2801
- let final = "";
2802
- for (let i = 0; i < event.results.length; i++) {
2803
- const result = event.results[i];
2804
- if (result.isFinal) {
2805
- final += result[0].transcript;
2806
- } else {
2807
- interim += result[0].transcript;
2808
- }
3003
+ const startAudioCapture = React3.useCallback(() => {
3004
+ if (!streamRef.current || !wsRef.current) return;
3005
+ audioContextRef.current = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 24e3 });
3006
+ const source = audioContextRef.current.createMediaStreamSource(
3007
+ streamRef.current
3008
+ );
3009
+ processorRef.current = audioContextRef.current.createScriptProcessor(
3010
+ 4096,
3011
+ 1,
3012
+ 1
3013
+ );
3014
+ processorRef.current.onaudioprocess = (event) => {
3015
+ if (!isRecordingRef.current || !wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) {
3016
+ return;
2809
3017
  }
2810
- finalTranscriptRef.current = final;
2811
- setTranscript(final + interim);
2812
- if (silenceTimeoutMs) {
2813
- clearSilenceTimer();
2814
- silenceTimerRef.current = setTimeout(() => {
2815
- stop();
2816
- }, silenceTimeoutMs);
3018
+ const inputData = event.inputBuffer.getChannelData(0);
3019
+ const pcm16 = new Int16Array(inputData.length);
3020
+ for (let i = 0; i < inputData.length; i++) {
3021
+ const s = Math.max(-1, Math.min(1, inputData[i]));
3022
+ pcm16[i] = s < 0 ? s * 32768 : s * 32767;
2817
3023
  }
2818
- };
2819
- recognition.onerror = (event) => {
2820
- if (event.error !== "aborted") {
2821
- console.warn("[Crow Voice] Speech recognition error:", event.error);
3024
+ const bytes = new Uint8Array(pcm16.buffer);
3025
+ let binary = "";
3026
+ for (let i = 0; i < bytes.length; i++) {
3027
+ binary += String.fromCharCode(bytes[i]);
2822
3028
  }
2823
- setIsRecording(false);
2824
- };
2825
- recognition.onend = () => {
2826
- setIsRecording(false);
2827
- recognitionRef.current = null;
3029
+ wsRef.current.send(
3030
+ JSON.stringify({ type: "audio", data: btoa(binary) })
3031
+ );
2828
3032
  };
2829
- recognitionRef.current = recognition;
2830
- recognition.start();
2831
- setIsRecording(true);
2832
- }, [lang, silenceTimeoutMs, clearSilenceTimer, stop]);
3033
+ source.connect(processorRef.current);
3034
+ processorRef.current.connect(audioContextRef.current.destination);
3035
+ }, []);
3036
+ const start = React3.useCallback(async () => {
3037
+ if (!supported) {
3038
+ setError("Audio recording not supported in this browser");
3039
+ return;
3040
+ }
3041
+ setError(null);
3042
+ transcriptRef.current = "";
3043
+ setTranscript("");
3044
+ try {
3045
+ streamRef.current = await navigator.mediaDevices.getUserMedia({
3046
+ audio: {
3047
+ echoCancellation: true,
3048
+ noiseSuppression: true,
3049
+ sampleRate: 24e3
3050
+ }
3051
+ });
3052
+ const wsProtocol = backendUrl.startsWith("https") ? "wss" : "ws";
3053
+ const wsHost = backendUrl.replace(/^https?:\/\//, "");
3054
+ const wsUrl = `${wsProtocol}://${wsHost}/api/stt/stream`;
3055
+ wsRef.current = new WebSocket(wsUrl);
3056
+ wsRef.current.onopen = () => {
3057
+ wsRef.current?.send(JSON.stringify({ type: "setup" }));
3058
+ };
3059
+ wsRef.current.onmessage = (event) => {
3060
+ const msg = JSON.parse(event.data);
3061
+ if (msg.type === "ready") {
3062
+ startAudioCapture();
3063
+ isRecordingRef.current = true;
3064
+ setIsRecording(true);
3065
+ } else if (msg.type === "transcript") {
3066
+ if (msg.is_final && msg.text) {
3067
+ transcriptRef.current += msg.text + " ";
3068
+ interimRef.current = "";
3069
+ setTranscript(transcriptRef.current.trim());
3070
+ if (silenceTimeoutMs) {
3071
+ clearSilenceTimer();
3072
+ silenceTimerRef.current = setTimeout(() => {
3073
+ stop();
3074
+ }, silenceTimeoutMs);
3075
+ }
3076
+ } else if (!msg.is_final && msg.text) {
3077
+ interimRef.current = msg.text;
3078
+ setTranscript((transcriptRef.current + msg.text).trim());
3079
+ }
3080
+ } else if (msg.type === "error") {
3081
+ setError(msg.message || "STT error");
3082
+ cleanup();
3083
+ }
3084
+ };
3085
+ wsRef.current.onerror = () => {
3086
+ setError("WebSocket connection error");
3087
+ cleanup();
3088
+ };
3089
+ wsRef.current.onclose = () => {
3090
+ if (isRecordingRef.current) {
3091
+ cleanup();
3092
+ }
3093
+ };
3094
+ } catch (err) {
3095
+ setError(
3096
+ err instanceof Error ? err.message : "Failed to start recording"
3097
+ );
3098
+ cleanup();
3099
+ }
3100
+ }, [
3101
+ supported,
3102
+ backendUrl,
3103
+ startAudioCapture,
3104
+ silenceTimeoutMs,
3105
+ clearSilenceTimer,
3106
+ stop,
3107
+ cleanup
3108
+ ]);
2833
3109
  const toggle = React3.useCallback(() => {
2834
3110
  if (isRecording) {
2835
3111
  stop();
@@ -2839,13 +3115,19 @@ function useVoiceInput(options = {}) {
2839
3115
  }, [isRecording, start, stop]);
2840
3116
  React3.useEffect(() => {
2841
3117
  return () => {
2842
- clearSilenceTimer();
2843
- if (recognitionRef.current) {
2844
- recognitionRef.current.abort();
2845
- }
3118
+ cleanup();
2846
3119
  };
2847
- }, [clearSilenceTimer]);
2848
- return { supported, isRecording, transcript, start, stop, toggle, clear };
3120
+ }, [cleanup]);
3121
+ return {
3122
+ supported,
3123
+ isRecording,
3124
+ transcript,
3125
+ error,
3126
+ start,
3127
+ stop,
3128
+ toggle,
3129
+ clear
3130
+ };
2849
3131
  }
2850
3132
  var Textarea = React3__default.default.forwardRef(
2851
3133
  ({ className, ...props }, ref) => /* @__PURE__ */ jsxRuntime.jsx(
@@ -3039,11 +3321,23 @@ var PromptInputBox = React3__default.default.forwardRef(
3039
3321
  selectedModel = "gpt-4o",
3040
3322
  onModelChange,
3041
3323
  availableModels = [],
3042
- highlighted = false
3324
+ highlighted = false,
3325
+ backendUrl = "",
3326
+ triggerVoiceRecording = 0
3043
3327
  }, ref) => {
3044
3328
  const [input, setInput] = React3__default.default.useState("");
3045
3329
  const promptBoxRef = React3__default.default.useRef(null);
3046
- const voice = useVoiceInput();
3330
+ const voice = useVoiceInput({ backendUrl, silenceTimeoutMs: 1500 });
3331
+ const lastTriggerRef = React3__default.default.useRef(0);
3332
+ const voiceRef = React3__default.default.useRef(voice);
3333
+ voiceRef.current = voice;
3334
+ React3__default.default.useEffect(() => {
3335
+ if (triggerVoiceRecording > 0 && triggerVoiceRecording !== lastTriggerRef.current) {
3336
+ console.log("[Voice] Auto-starting recording from trigger");
3337
+ voiceRef.current.start();
3338
+ }
3339
+ lastTriggerRef.current = triggerVoiceRecording;
3340
+ }, [triggerVoiceRecording]);
3047
3341
  React3__default.default.useEffect(() => {
3048
3342
  if (voice.isRecording && voice.transcript) {
3049
3343
  setInput(voice.transcript);
@@ -3052,11 +3346,16 @@ var PromptInputBox = React3__default.default.forwardRef(
3052
3346
  const wasRecordingRef = React3__default.default.useRef(false);
3053
3347
  React3__default.default.useEffect(() => {
3054
3348
  if (wasRecordingRef.current && !voice.isRecording && voice.transcript) {
3055
- setInput(voice.transcript);
3349
+ const messageToSend = voice.transcript.trim();
3350
+ if (messageToSend) {
3351
+ console.log("[Voice] Auto-sending:", messageToSend);
3352
+ onSend(messageToSend);
3353
+ setInput("");
3354
+ }
3056
3355
  voice.clear();
3057
3356
  }
3058
3357
  wasRecordingRef.current = voice.isRecording;
3059
- }, [voice.isRecording, voice.transcript, voice.clear]);
3358
+ }, [voice.isRecording, voice.transcript, voice.clear, onSend]);
3060
3359
  const handleSubmit = () => {
3061
3360
  if (input.trim()) {
3062
3361
  if (voice.isRecording) {
@@ -3667,6 +3966,25 @@ function CrowWidget({
3667
3966
  setShouldRestoreHistory(true);
3668
3967
  }
3669
3968
  });
3969
+ const tts = useTTSOutput({ backendUrl: apiUrl });
3970
+ const ttsRef = React3.useRef(tts);
3971
+ ttsRef.current = tts;
3972
+ const wasLoadingRef = React3.useRef(false);
3973
+ React3.useEffect(() => {
3974
+ console.log("[Crow TTS] isLoading changed:", chat.isLoading, "wasLoading:", wasLoadingRef.current);
3975
+ if (wasLoadingRef.current && !chat.isLoading) {
3976
+ const lastMessage = [...chat.messages].reverse().find((m) => m.isBot);
3977
+ console.log("[Crow TTS] Last bot message:", lastMessage?.content?.substring(0, 50));
3978
+ if (lastMessage?.content) {
3979
+ const textToSpeak = lastMessage.content.replace(/\*\*/g, "").replace(/\*/g, "").replace(/`[^`]+`/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").trim();
3980
+ if (textToSpeak) {
3981
+ console.log("[Crow TTS] Speaking:", textToSpeak.substring(0, 50));
3982
+ ttsRef.current.speak(textToSpeak);
3983
+ }
3984
+ }
3985
+ }
3986
+ wasLoadingRef.current = chat.isLoading;
3987
+ }, [chat.isLoading, chat.messages]);
3670
3988
  React3.useEffect(() => {
3671
3989
  if (initialSuggestions.length > 0 && chat.suggestedActions.length === 0) {
3672
3990
  chat.setSuggestedActions(initialSuggestions);
@@ -4061,7 +4379,8 @@ function CrowWidget({
4061
4379
  isLoading: chat.isLoading,
4062
4380
  showStopButton: isBrowserUseActive || !!askUserResolver || !!pendingConfirmation,
4063
4381
  highlighted: !!askUserResolver,
4064
- className: "crow-backdrop-blur-md"
4382
+ className: "crow-backdrop-blur-md",
4383
+ backendUrl: apiUrl
4065
4384
  }
4066
4385
  )
4067
4386
  ] })