@usecrow/ui 0.1.57 → 0.1.58
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +384 -65
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +15 -35
- package/dist/index.d.ts +15 -35
- package/dist/index.js +384 -65
- package/dist/index.js.map +1 -1
- package/package.json +1 -1
package/dist/index.d.cts
CHANGED
@@ -761,50 +761,26 @@ declare function useCrowAPI({ onIdentified, onReset }?: UseCrowAPIOptions): {
 };
 
 /**
- * useVoiceInput -
+ * useVoiceInput - Speech-to-text via Gradium STT WebSocket
  *
- *
- *
+ * Uses MediaRecorder API (works on all browsers including mobile Safari)
+ * and streams audio to backend WebSocket proxy for Gradium STT.
  */
-interface SpeechRecognitionEvent {
-    results: SpeechRecognitionResultList;
-    resultIndex: number;
-}
-interface SpeechRecognitionErrorEvent {
-    error: string;
-    message?: string;
-}
-interface SpeechRecognitionInstance extends EventTarget {
-    continuous: boolean;
-    interimResults: boolean;
-    lang: string;
-    start(): void;
-    stop(): void;
-    abort(): void;
-    onresult: ((event: SpeechRecognitionEvent) => void) | null;
-    onerror: ((event: SpeechRecognitionErrorEvent) => void) | null;
-    onend: (() => void) | null;
-    onspeechend: (() => void) | null;
-}
-declare global {
-    interface Window {
-        SpeechRecognition?: new () => SpeechRecognitionInstance;
-        webkitSpeechRecognition?: new () => SpeechRecognitionInstance;
-    }
-}
 interface UseVoiceInputOptions {
-    /**
-
-    /** Auto-submit after silence. If set,
+    /** Backend URL for WebSocket connection (e.g., "ws://localhost:8000" or "wss://api.example.com") */
+    backendUrl: string;
+    /** Auto-submit after silence. If set, stops recording after this many ms of silence. */
     silenceTimeoutMs?: number;
 }
 interface UseVoiceInputReturn {
-    /** Whether the browser supports
+    /** Whether the browser supports audio recording (MediaRecorder API) */
    supported: boolean;
     /** Whether currently recording */
     isRecording: boolean;
-    /** Current transcript (
+    /** Current transcript (accumulated final results) */
     transcript: string;
+    /** Error message if any */
+    error: string | null;
     /** Start recording */
     start: () => void;
     /** Stop recording and finalize transcript */
@@ -814,7 +790,7 @@ interface UseVoiceInputReturn {
     /** Clear the transcript */
     clear: () => void;
 }
-declare function useVoiceInput(options
+declare function useVoiceInput(options: UseVoiceInputOptions): UseVoiceInputReturn;
 
 /**
  * useWidgetStyles Hook
@@ -1144,6 +1120,10 @@ interface PromptInputBoxProps {
     availableModels?: Model[];
     /** When true, adds a pulsing highlight effect to indicate user input is needed */
     highlighted?: boolean;
+    /** Backend URL for voice input WebSocket (required for voice input to work) */
+    backendUrl?: string;
+    /** When this value changes (and is > 0), start voice recording */
+    triggerVoiceRecording?: number;
 }
 declare const PromptInputBox: react__default.ForwardRefExoticComponent<PromptInputBoxProps & react__default.RefAttributes<HTMLDivElement>>;
 
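The typings above replace the old in-browser speech recognition surface (the removed SpeechRecognition declarations) with a backend-driven one: useVoiceInput now requires a backendUrl and reports errors. A minimal consumer sketch against the new declarations, assuming the hook is exported from the package root; the component name and URL are placeholders:

import { useVoiceInput } from "@usecrow/ui";

// Illustrative component; "wss://api.example.com" is a placeholder backend.
function VoiceNote() {
  const { supported, isRecording, transcript, error, start, stop, clear } =
    useVoiceInput({ backendUrl: "wss://api.example.com", silenceTimeoutMs: 1500 });
  if (!supported) return <p>Audio recording is not supported in this browser.</p>;
  return (
    <div>
      <button onClick={isRecording ? stop : start}>
        {isRecording ? "Stop" : "Record"}
      </button>
      <button onClick={clear}>Clear</button>
      <p>{transcript}</p>
      {error && <p role="alert">{error}</p>}
    </div>
  );
}
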
package/dist/index.d.ts
CHANGED

Same changes as package/dist/index.d.cts above.

package/dist/index.js
CHANGED
@@ -1743,6 +1743,186 @@ function usePreviewCopilotStyles(previewStyles) {
     styles: mergeCopilotStyles(void 0, previewStyles)
   };
 }
+function useTTSOutput({
+  backendUrl,
+  voiceId = "YTpq7expH9539ERJ"
+}) {
+  const [isSpeaking, setIsSpeaking] = useState(false);
+  const [error, setError] = useState(null);
+  const wsRef = useRef(null);
+  const audioContextRef = useRef(null);
+  const nextTimeRef = useRef(0);
+  const streamCompleteRef = useRef(false);
+  const completionCheckIntervalRef = useRef(null);
+  const cleanupAudioContext = useCallback(() => {
+    setIsSpeaking(false);
+    if (audioContextRef.current && audioContextRef.current.state !== "closed") {
+      audioContextRef.current.close();
+      audioContextRef.current = null;
+    }
+    if (completionCheckIntervalRef.current) {
+      clearInterval(completionCheckIntervalRef.current);
+      completionCheckIntervalRef.current = null;
+    }
+  }, []);
+  const closeWebSocket = useCallback(() => {
+    if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
+      try {
+        wsRef.current.send(JSON.stringify({ type: "stop" }));
+        wsRef.current.close();
+      } catch (e) {
+      }
+    }
+    wsRef.current = null;
+  }, []);
+  const cleanupTTS = useCallback(() => {
+    setIsSpeaking(false);
+    setError(null);
+    closeWebSocket();
+    cleanupAudioContext();
+  }, [closeWebSocket, cleanupAudioContext]);
+  const waitForAudioComplete = useCallback(() => {
+    if (completionCheckIntervalRef.current) {
+      clearInterval(completionCheckIntervalRef.current);
+    }
+    completionCheckIntervalRef.current = setInterval(() => {
+      if (!audioContextRef.current) {
+        if (completionCheckIntervalRef.current) {
+          clearInterval(completionCheckIntervalRef.current);
+          completionCheckIntervalRef.current = null;
+        }
+        return;
+      }
+      const now = audioContextRef.current.currentTime;
+      if (now >= nextTimeRef.current) {
+        if (completionCheckIntervalRef.current) {
+          clearInterval(completionCheckIntervalRef.current);
+          completionCheckIntervalRef.current = null;
+        }
+        cleanupAudioContext();
+      }
+    }, 100);
+  }, [cleanupAudioContext]);
+  const playAudioChunk = useCallback((base64Audio) => {
+    if (!audioContextRef.current || audioContextRef.current.state === "closed") {
+      console.error("TTS: AudioContext not available");
+      return;
+    }
+    try {
+      const binary = atob(base64Audio);
+      const bytes = new Uint8Array(binary.length);
+      for (let i = 0; i < binary.length; i++) {
+        bytes[i] = binary.charCodeAt(i);
+      }
+      const pcm16 = new Int16Array(bytes.buffer);
+      const float32 = new Float32Array(pcm16.length);
+      for (let i = 0; i < pcm16.length; i++) {
+        float32[i] = pcm16[i] / 32768;
+      }
+      const buffer = audioContextRef.current.createBuffer(1, float32.length, 48e3);
+      buffer.getChannelData(0).set(float32);
+      const source = audioContextRef.current.createBufferSource();
+      source.buffer = buffer;
+      source.connect(audioContextRef.current.destination);
+      const now = audioContextRef.current.currentTime;
+      if (nextTimeRef.current < now) {
+        nextTimeRef.current = now;
+      }
+      source.start(nextTimeRef.current);
+      nextTimeRef.current += buffer.duration;
+    } catch (err) {
+      console.error("TTS: Error playing audio chunk:", err);
+      setError(err instanceof Error ? err.message : "Failed to play audio chunk");
+    }
+  }, []);
+  const speak = useCallback(
+    (text) => {
+      console.log("[TTS Hook] speak called with:", text.substring(0, 50), "backendUrl:", backendUrl);
+      if (!text.trim()) {
+        console.log("[TTS Hook] No text to speak");
+        setError("No text to speak");
+        return;
+      }
+      if (isSpeaking || wsRef.current) {
+        console.log("[TTS Hook] Already playing");
+        setError("Already playing, stop first");
+        return;
+      }
+      setError(null);
+      nextTimeRef.current = 0;
+      streamCompleteRef.current = false;
+      try {
+        audioContextRef.current = new (window.AudioContext || window.webkitAudioContext)({
+          sampleRate: 48e3
+        });
+        const url = backendUrl.startsWith("http") ? backendUrl.replace(/^http/, "ws") : backendUrl;
+        const wsUrl = `${url}/api/tts/stream`;
+        console.log("[TTS Hook] Connecting to:", wsUrl);
+        const ws = new WebSocket(wsUrl);
+        wsRef.current = ws;
+        ws.onopen = () => {
+          ws.send(
+            JSON.stringify({
+              type: "setup",
+              voice_id: voiceId,
+              output_format: "pcm"
+            })
+          );
+        };
+        ws.onmessage = (event) => {
+          const msg = JSON.parse(event.data);
+          if (msg.type === "ready") {
+            ws.send(JSON.stringify({ type: "text", text }));
+            ws.send(JSON.stringify({ type: "end_of_stream" }));
+          } else if (msg.type === "audio") {
+            playAudioChunk(msg.audio);
+          } else if (msg.type === "done") {
+            streamCompleteRef.current = true;
+            closeWebSocket();
+            waitForAudioComplete();
+          } else if (msg.type === "error") {
+            setError(msg.message || "TTS error");
+            cleanupTTS();
+          }
+        };
+        ws.onerror = () => {
+          setError("WebSocket error");
+          cleanupTTS();
+        };
+        ws.onclose = () => {
+          wsRef.current = null;
+        };
+        setIsSpeaking(true);
+      } catch (err) {
+        setError(err instanceof Error ? err.message : "Failed to start TTS");
+        cleanupTTS();
+      }
+    },
+    [
+      isSpeaking,
+      backendUrl,
+      voiceId,
+      playAudioChunk,
+      closeWebSocket,
+      waitForAudioComplete,
+      cleanupTTS
+    ]
+  );
+  const stop = useCallback(() => {
+    cleanupTTS();
+  }, [cleanupTTS]);
+  useEffect(() => {
+    return () => {
+      cleanupTTS();
+    };
+  }, [cleanupTTS]);
+  return {
+    speak,
+    stop,
+    isSpeaking,
+    error
+  };
+}
 var WidgetStyleContext = createContext(null);
 function WidgetStyleProvider({
   children,
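
The hook added above takes a backendUrl plus an optional voiceId and returns { speak, stop, isSpeaking, error }. In this diff it is only used internally by CrowWidget; a sketch of the call shape, assuming it were reachable directly (the URL is a placeholder):

// speak() rewrites http(s) to ws(s) and opens <backend>/api/tts/stream.
const tts = useTTSOutput({ backendUrl: "https://api.example.com" });
tts.speak("Hello from Gradium TTS"); // schedules 48 kHz PCM chunks as they arrive
if (tts.isSpeaking) tts.stop();      // closes the socket and the AudioContext
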
@@ -2730,80 +2910,176 @@ var ModelSelector = ({
     ] }, provider)) })
   ] });
 };
-var
-  if (typeof window === "undefined") return
-  return
+var isMediaRecorderSupported = () => {
+  if (typeof window === "undefined") return false;
+  return !!(navigator.mediaDevices && typeof navigator.mediaDevices.getUserMedia === "function" && (window.AudioContext || window.webkitAudioContext));
 };
-function useVoiceInput(options
-  const {
-  const [supported] = useState(() =>
+function useVoiceInput(options) {
+  const { backendUrl, silenceTimeoutMs } = options;
+  const [supported] = useState(() => isMediaRecorderSupported());
   const [isRecording, setIsRecording] = useState(false);
   const [transcript, setTranscript] = useState("");
-  const
+  const [error, setError] = useState(null);
+  const wsRef = useRef(null);
+  const streamRef = useRef(null);
+  const audioContextRef = useRef(null);
+  const processorRef = useRef(null);
   const silenceTimerRef = useRef(null);
-  const
+  const transcriptRef = useRef("");
+  const interimRef = useRef("");
+  const isRecordingRef = useRef(false);
   const clearSilenceTimer = useCallback(() => {
     if (silenceTimerRef.current) {
       clearTimeout(silenceTimerRef.current);
       silenceTimerRef.current = null;
     }
   }, []);
-  const
+  const cleanup = useCallback(() => {
     clearSilenceTimer();
-
-
+    isRecordingRef.current = false;
+    if (interimRef.current) {
+      transcriptRef.current += interimRef.current + " ";
+      setTranscript(transcriptRef.current.trim());
+      interimRef.current = "";
     }
+    if (wsRef.current) {
+      try {
+        if (wsRef.current.readyState === WebSocket.OPEN) {
+          wsRef.current.send(JSON.stringify({ type: "stop" }));
+        }
+        wsRef.current.close();
+      } catch (e) {
+      }
+      wsRef.current = null;
+    }
+    if (processorRef.current) {
+      processorRef.current.disconnect();
+      processorRef.current = null;
+    }
+    if (audioContextRef.current) {
+      audioContextRef.current.close();
+      audioContextRef.current = null;
+    }
+    if (streamRef.current) {
+      streamRef.current.getTracks().forEach((track) => track.stop());
+      streamRef.current = null;
+    }
+    setIsRecording(false);
   }, [clearSilenceTimer]);
+  const stop = useCallback(() => {
+    cleanup();
+  }, [cleanup]);
   const clear = useCallback(() => {
     setTranscript("");
-
+    transcriptRef.current = "";
+    setError(null);
   }, []);
-  const
-
-
-
-
-
-
-
-
-
-
-
-
-
-      let final = "";
-      for (let i = 0; i < event.results.length; i++) {
-        const result = event.results[i];
-        if (result.isFinal) {
-          final += result[0].transcript;
-        } else {
-          interim += result[0].transcript;
-        }
+  const startAudioCapture = useCallback(() => {
+    if (!streamRef.current || !wsRef.current) return;
+    audioContextRef.current = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 24e3 });
+    const source = audioContextRef.current.createMediaStreamSource(
+      streamRef.current
+    );
+    processorRef.current = audioContextRef.current.createScriptProcessor(
+      4096,
+      1,
+      1
+    );
+    processorRef.current.onaudioprocess = (event) => {
+      if (!isRecordingRef.current || !wsRef.current || wsRef.current.readyState !== WebSocket.OPEN) {
+        return;
       }
-
-
-
-
-
-        stop();
-      }, silenceTimeoutMs);
+      const inputData = event.inputBuffer.getChannelData(0);
+      const pcm16 = new Int16Array(inputData.length);
+      for (let i = 0; i < inputData.length; i++) {
+        const s = Math.max(-1, Math.min(1, inputData[i]));
+        pcm16[i] = s < 0 ? s * 32768 : s * 32767;
       }
-
-
-
-
+      const bytes = new Uint8Array(pcm16.buffer);
+      let binary = "";
+      for (let i = 0; i < bytes.length; i++) {
+        binary += String.fromCharCode(bytes[i]);
       }
-
-
-
-    setIsRecording(false);
-    recognitionRef.current = null;
+      wsRef.current.send(
+        JSON.stringify({ type: "audio", data: btoa(binary) })
+      );
     };
-
-
-
-
+    source.connect(processorRef.current);
+    processorRef.current.connect(audioContextRef.current.destination);
+  }, []);
+  const start = useCallback(async () => {
+    if (!supported) {
+      setError("Audio recording not supported in this browser");
+      return;
+    }
+    setError(null);
+    transcriptRef.current = "";
+    setTranscript("");
+    try {
+      streamRef.current = await navigator.mediaDevices.getUserMedia({
+        audio: {
+          echoCancellation: true,
+          noiseSuppression: true,
+          sampleRate: 24e3
+        }
+      });
+      const wsProtocol = backendUrl.startsWith("https") ? "wss" : "ws";
+      const wsHost = backendUrl.replace(/^https?:\/\//, "");
+      const wsUrl = `${wsProtocol}://${wsHost}/api/stt/stream`;
+      wsRef.current = new WebSocket(wsUrl);
+      wsRef.current.onopen = () => {
+        wsRef.current?.send(JSON.stringify({ type: "setup" }));
+      };
+      wsRef.current.onmessage = (event) => {
+        const msg = JSON.parse(event.data);
+        if (msg.type === "ready") {
+          startAudioCapture();
+          isRecordingRef.current = true;
+          setIsRecording(true);
+        } else if (msg.type === "transcript") {
+          if (msg.is_final && msg.text) {
+            transcriptRef.current += msg.text + " ";
+            interimRef.current = "";
+            setTranscript(transcriptRef.current.trim());
+            if (silenceTimeoutMs) {
+              clearSilenceTimer();
+              silenceTimerRef.current = setTimeout(() => {
+                stop();
+              }, silenceTimeoutMs);
+            }
+          } else if (!msg.is_final && msg.text) {
+            interimRef.current = msg.text;
+            setTranscript((transcriptRef.current + msg.text).trim());
+          }
+        } else if (msg.type === "error") {
+          setError(msg.message || "STT error");
+          cleanup();
+        }
+      };
+      wsRef.current.onerror = () => {
+        setError("WebSocket connection error");
+        cleanup();
+      };
+      wsRef.current.onclose = () => {
+        if (isRecordingRef.current) {
+          cleanup();
+        }
+      };
+    } catch (err) {
+      setError(
+        err instanceof Error ? err.message : "Failed to start recording"
+      );
+      cleanup();
+    }
+  }, [
+    supported,
+    backendUrl,
+    startAudioCapture,
+    silenceTimeoutMs,
+    clearSilenceTimer,
+    stop,
+    cleanup
+  ]);
   const toggle = useCallback(() => {
     if (isRecording) {
       stop();
@@ -2813,13 +3089,19 @@ function useVoiceInput(options = {}) {
   }, [isRecording, start, stop]);
   useEffect(() => {
     return () => {
-
-      if (recognitionRef.current) {
-        recognitionRef.current.abort();
-      }
+      cleanup();
     };
-  }, [
-  return {
+  }, [cleanup]);
+  return {
+    supported,
+    isRecording,
+    transcript,
+    error,
+    start,
+    stop,
+    toggle,
+    clear
+  };
 }
 var Textarea = React3.forwardRef(
   ({ className, ...props }, ref) => /* @__PURE__ */ jsx(
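
The rewritten hook above implies a small JSON message protocol on the STT socket, mirroring the TTS one earlier in this diff. Summarized as TypeScript types inferred from the client code (field names as observed; no published schema is implied):

// STT socket (/api/stt/stream), from useVoiceInput above.
type SttClientMsg =
  | { type: "setup" }
  | { type: "audio"; data: string } // base64-encoded PCM16, mono, 24 kHz
  | { type: "stop" };
type SttServerMsg =
  | { type: "ready" }
  | { type: "transcript"; text?: string; is_final?: boolean }
  | { type: "error"; message?: string };

// TTS socket (/api/tts/stream), from useTTSOutput earlier in this diff.
type TtsClientMsg =
  | { type: "setup"; voice_id: string; output_format: "pcm" }
  | { type: "text"; text: string }
  | { type: "end_of_stream" }
  | { type: "stop" };
type TtsServerMsg =
  | { type: "ready" }
  | { type: "audio"; audio: string } // base64-encoded PCM16, mono, 48 kHz
  | { type: "done" }
  | { type: "error"; message?: string };
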
@@ -3013,11 +3295,23 @@ var PromptInputBox = React3.forwardRef(
     selectedModel = "gpt-4o",
     onModelChange,
     availableModels = [],
-    highlighted = false
+    highlighted = false,
+    backendUrl = "",
+    triggerVoiceRecording = 0
   }, ref) => {
     const [input, setInput] = React3.useState("");
     const promptBoxRef = React3.useRef(null);
-    const voice = useVoiceInput();
+    const voice = useVoiceInput({ backendUrl, silenceTimeoutMs: 1500 });
+    const lastTriggerRef = React3.useRef(0);
+    const voiceRef = React3.useRef(voice);
+    voiceRef.current = voice;
+    React3.useEffect(() => {
+      if (triggerVoiceRecording > 0 && triggerVoiceRecording !== lastTriggerRef.current) {
+        console.log("[Voice] Auto-starting recording from trigger");
+        voiceRef.current.start();
+      }
+      lastTriggerRef.current = triggerVoiceRecording;
+    }, [triggerVoiceRecording]);
     React3.useEffect(() => {
       if (voice.isRecording && voice.transcript) {
         setInput(voice.transcript);
@@ -3026,11 +3320,16 @@ var PromptInputBox = React3.forwardRef(
     const wasRecordingRef = React3.useRef(false);
     React3.useEffect(() => {
       if (wasRecordingRef.current && !voice.isRecording && voice.transcript) {
-
+        const messageToSend = voice.transcript.trim();
+        if (messageToSend) {
+          console.log("[Voice] Auto-sending:", messageToSend);
+          onSend(messageToSend);
+          setInput("");
+        }
         voice.clear();
       }
       wasRecordingRef.current = voice.isRecording;
-    }, [voice.isRecording, voice.transcript, voice.clear]);
+    }, [voice.isRecording, voice.transcript, voice.clear, onSend]);
     const handleSubmit = () => {
       if (input.trim()) {
         if (voice.isRecording) {
@@ -3641,6 +3940,25 @@ function CrowWidget({
       setShouldRestoreHistory(true);
     }
   });
+  const tts = useTTSOutput({ backendUrl: apiUrl });
+  const ttsRef = useRef(tts);
+  ttsRef.current = tts;
+  const wasLoadingRef = useRef(false);
+  useEffect(() => {
+    console.log("[Crow TTS] isLoading changed:", chat.isLoading, "wasLoading:", wasLoadingRef.current);
+    if (wasLoadingRef.current && !chat.isLoading) {
+      const lastMessage = [...chat.messages].reverse().find((m) => m.isBot);
+      console.log("[Crow TTS] Last bot message:", lastMessage?.content?.substring(0, 50));
+      if (lastMessage?.content) {
+        const textToSpeak = lastMessage.content.replace(/\*\*/g, "").replace(/\*/g, "").replace(/`[^`]+`/g, "").replace(/\[([^\]]+)\]\([^)]+\)/g, "$1").trim();
+        if (textToSpeak) {
+          console.log("[Crow TTS] Speaking:", textToSpeak.substring(0, 50));
+          ttsRef.current.speak(textToSpeak);
+        }
+      }
+    }
+    wasLoadingRef.current = chat.isLoading;
+  }, [chat.isLoading, chat.messages]);
   useEffect(() => {
     if (initialSuggestions.length > 0 && chat.suggestedActions.length === 0) {
       chat.setSuggestedActions(initialSuggestions);
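
The auto-speak effect above strips basic markdown before handing text to TTS. A worked example of what that replace chain does to a sample bot message:

const input = "**Done!** Run `npm i` and see [docs](https://example.com)";
// bold/italic markers dropped, inline code removed entirely, links reduced to their text:
// => "Done! Run  and see docs"
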
@@ -4035,7 +4353,8 @@ function CrowWidget({
           isLoading: chat.isLoading,
           showStopButton: isBrowserUseActive || !!askUserResolver || !!pendingConfirmation,
           highlighted: !!askUserResolver,
-          className: "crow-backdrop-blur-md"
+          className: "crow-backdrop-blur-md",
+          backendUrl: apiUrl
         }
       )
     ] })
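
Net effect for consumers: CrowWidget forwards its apiUrl into both the STT and TTS paths, and PromptInputBox exposes the same plumbing directly. A sketch of driving voice capture from outside the box via the new props (handler and state names are illustrative):

const [voiceTrigger, setVoiceTrigger] = useState(0);

<PromptInputBox
  onSend={handleSend}                  // illustrative handler
  backendUrl="https://api.example.com" // required for voice input to work
  triggerVoiceRecording={voiceTrigger} // bump the counter to auto-start recording
/>;
// e.g. <button onClick={() => setVoiceTrigger((n) => n + 1)}>Speak</button>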
|