@arcote.tech/arc-ai-voice 0.7.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +23 -0
- package/src/adapters/whisper.ts +74 -0
- package/src/arc.d.ts +6 -0
- package/src/index.ts +31 -0
- package/src/react/use-voice-recorder.ts +255 -0
- package/src/react/voice-button.tsx +154 -0
- package/src/react/voice-content-editable.tsx +130 -0
- package/src/react/voice-text-input.tsx +87 -0
- package/src/react/voice-textarea.tsx +98 -0
- package/src/routes/transcribe-route.ts +94 -0
- package/src/types.ts +23 -0
- package/src/voice-builder.ts +36 -0
- package/tsconfig.json +4 -0
package/package.json
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@arcote.tech/arc-ai-voice",
|
|
3
|
+
"type": "module",
|
|
4
|
+
"version": "0.7.9",
|
|
5
|
+
"private": false,
|
|
6
|
+
"description": "Voice input + transcription standard for Arc — provider abstraction (Whisper, ...) + React VoiceTextInput/Textarea/ContentEditable components",
|
|
7
|
+
"main": "./src/index.ts",
|
|
8
|
+
"types": "./src/index.ts",
|
|
9
|
+
"scripts": {
|
|
10
|
+
"type-check": "tsc --noEmit"
|
|
11
|
+
},
|
|
12
|
+
"peerDependencies": {
|
|
13
|
+
"@arcote.tech/arc": "^0.7.9",
|
|
14
|
+
"@arcote.tech/arc-ds": "^0.7.9",
|
|
15
|
+
"@arcote.tech/platform": "^0.7.9",
|
|
16
|
+
"react": "^18.0.0 || ^19.0.0",
|
|
17
|
+
"lucide-react": ">=0.400.0",
|
|
18
|
+
"typescript": "^5.0.0"
|
|
19
|
+
},
|
|
20
|
+
"devDependencies": {
|
|
21
|
+
"@types/bun": "latest"
|
|
22
|
+
}
|
|
23
|
+
}
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
import type { TranscriptionOptions, TranscriptionProvider } from "../types";
|
|
2
|
+
|
|
3
|
+
/** Configuration accepted by the `whisper()` adapter factory. */
export interface WhisperConfig {
  /** OpenAI API key. */
  apiKey: string;

  /**
   * Model name. Defaults to `"whisper-1"`; a newer model such as
   * `gpt-4o-transcribe` may be passed instead.
   */
  model?: string;

  /** Custom base URL (proxy / Azure OpenAI / self-hosted). Defaults to the OpenAI API. */
  baseUrl?: string;
}
|
|
11
|
+
|
|
12
|
+
/**
 * OpenAI Whisper adapter. `whisper-1` is proven, economical and supports
 * Polish plus most other languages. The `/audio/transcriptions` endpoint
 * takes a multipart body with a `file` field (webm/opus/mp4/wav/mp3).
 *
 * @param config API key plus optional model and base URL overrides.
 * @returns A provider whose `transcribe` resolves with the trimmed transcript
 *   and rejects with `Error("Whisper API <status>: <detail>")` on a non-2xx
 *   response.
 */
export function whisper(config: WhisperConfig): TranscriptionProvider {
  // Strip trailing slashes so a configured `https://host/v1/` does not produce
  // `https://host/v1//audio/transcriptions`.
  const baseUrl = (config.baseUrl ?? "https://api.openai.com/v1").replace(
    /\/+$/,
    "",
  );
  const model = config.model ?? "whisper-1";
  const endpoint = `${baseUrl}/audio/transcriptions`;

  return {
    name: "whisper",
    async transcribe(audio: Blob, options: TranscriptionOptions = {}) {
      const formData = new FormData();
      // Whisper needs a file name with an extension to recognise the format —
      // the Blob's Content-Type alone is not enough.
      const ext = mimeToExt(audio.type);
      formData.append("file", audio, `audio.${ext}`);
      formData.append("model", model);
      formData.append("response_format", "text");
      if (options.language) formData.append("language", options.language);

      const response = await fetch(endpoint, {
        method: "POST",
        headers: { Authorization: `Bearer ${config.apiKey}` },
        body: formData,
      });

      if (!response.ok) {
        // Best effort: include the response body when it can be read.
        const detail = await response.text().catch(() => "");
        throw new Error(
          `Whisper API ${response.status}: ${detail || response.statusText}`,
        );
      }

      // response_format=text → plain string body, not JSON.
      return (await response.text()).trim();
    },
  };
}
|
|
51
|
+
|
|
52
|
+
/**
 * Maps an audio MIME type to the file extension used in the upload file name.
 * Whisper validates by file extension, e.g. `audio/webm;codecs=opus` → "webm".
 *
 * @param mime MIME type, optionally with parameters; matched case-insensitively.
 * @returns The extension without a leading dot; "webm" for unknown or empty input.
 */
function mimeToExt(mime: string): string {
  // Drop parameters (`;codecs=…`) and normalise case — MIME types are
  // case-insensitive, so `Audio/MP4` must match like `audio/mp4`.
  const essence = (mime.split(";")[0] ?? "").trim().toLowerCase();
  switch (essence) {
    case "audio/webm":
      return "webm";
    case "audio/mp4":
    case "audio/m4a":
    case "audio/x-m4a":
      return "m4a";
    case "audio/mpeg":
    case "audio/mp3":
      return "mp3";
    case "audio/wav":
    case "audio/wave":
    case "audio/x-wav":
      return "wav";
    case "audio/ogg":
      return "ogg";
    case "audio/flac":
    case "audio/x-flac":
      return "flac";
    default:
      // Safe fallback — Whisper accepts webm broadly.
      return "webm";
  }
}
|
package/src/arc.d.ts
ADDED
package/src/index.ts
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
// Provider abstrakcja + adaptery
|
|
2
|
+
export type { TranscriptionOptions, TranscriptionProvider } from "./types";
|
|
3
|
+
export { whisper, type WhisperConfig } from "./adapters/whisper";
|
|
4
|
+
|
|
5
|
+
// Server-side: builder modułu + route
|
|
6
|
+
export { voice, type VoiceConfig } from "./voice-builder";
|
|
7
|
+
export {
|
|
8
|
+
createTranscribeRoute,
|
|
9
|
+
type TranscribeRouteConfig,
|
|
10
|
+
} from "./routes/transcribe-route";
|
|
11
|
+
|
|
12
|
+
// React: hook + komponenty UI
|
|
13
|
+
export {
|
|
14
|
+
useVoiceRecorder,
|
|
15
|
+
type UseVoiceRecorderOptions,
|
|
16
|
+
type UseVoiceRecorderResult,
|
|
17
|
+
type VoiceRecorderState,
|
|
18
|
+
} from "./react/use-voice-recorder";
|
|
19
|
+
export { VoiceButton, type VoiceButtonProps } from "./react/voice-button";
|
|
20
|
+
export {
|
|
21
|
+
VoiceTextInput,
|
|
22
|
+
type VoiceTextInputProps,
|
|
23
|
+
} from "./react/voice-text-input";
|
|
24
|
+
export {
|
|
25
|
+
VoiceTextarea,
|
|
26
|
+
type VoiceTextareaProps,
|
|
27
|
+
} from "./react/voice-textarea";
|
|
28
|
+
export {
|
|
29
|
+
VoiceContentEditable,
|
|
30
|
+
type VoiceContentEditableProps,
|
|
31
|
+
} from "./react/voice-content-editable";
|
|
@@ -0,0 +1,255 @@
|
|
|
1
|
+
import { useCallback, useEffect, useRef, useState } from "react";
|
|
2
|
+
|
|
3
|
+
/**
 * Low-level audio recorder built on `getUserMedia` + `MediaRecorder`. Kept as
 * a plain class (not a hook) because MediaRecorder has its own lifecycle,
 * orthogonal to React. Modelled on
 * legacy/packages/platform/src/utils/voice-recorder.ts.
 *
 * Every failure path releases the microphone stream, so callers may drop
 * their reference after a rejected `start()` / `stop()`.
 */
class Recorder {
  private mediaRecorder: MediaRecorder | null = null;
  private stream: MediaStream | null = null;
  private chunks: Blob[] = [];

  /**
   * Requests microphone access and starts recording.
   * Rejects when access is denied or the recorder cannot be created or
   * started; the acquired stream is released before rejecting.
   */
  async start(): Promise<void> {
    this.stream = await navigator.mediaDevices.getUserMedia({
      audio: { echoCancellation: true, noiseSuppression: true, sampleRate: 44100 },
    });
    try {
      // Fallback chain: webm/opus → mp4 → browser default. Whisper accepts all.
      let mimeType: string | undefined = "audio/webm;codecs=opus";
      if (!MediaRecorder.isTypeSupported(mimeType)) mimeType = "audio/mp4";
      if (!MediaRecorder.isTypeSupported(mimeType)) mimeType = undefined;

      const recorder = new MediaRecorder(
        this.stream,
        mimeType ? { mimeType } : undefined,
      );
      this.mediaRecorder = recorder;
      this.chunks = [];
      recorder.ondataavailable = (e) => {
        if (e.data.size > 0) this.chunks.push(e.data);
      };
      // start(100) — collect data every 100ms for better responsiveness on cancel.
      recorder.start(100);
    } catch (e) {
      // The stream is already live here — without this the mic stays on.
      this.cleanup();
      throw e;
    }
  }

  /**
   * Stops recording and resolves with the recorded audio as a single Blob.
   * Rejects with "not recording" when there is no active recorder, with
   * "recording error" when the recorder reports an error, or with whatever
   * `MediaRecorder.stop()` throws. Resources are released in every case.
   */
  stop(): Promise<Blob> {
    return new Promise((resolve, reject) => {
      const mr = this.mediaRecorder;
      if (!mr) return reject(new Error("not recording"));
      mr.onstop = () => {
        const mime = mr.mimeType || "audio/webm";
        const blob = new Blob(this.chunks, { type: mime });
        this.cleanup();
        resolve(blob);
      };
      mr.onerror = () => {
        this.cleanup();
        reject(new Error("recording error"));
      };
      try {
        mr.stop();
      } catch (e) {
        // e.g. the recorder is already inactive.
        this.cleanup();
        reject(e);
      }
    });
  }

  /** Aborts the recording (if any) and discards everything captured so far. */
  cancel(): void {
    if (this.mediaRecorder?.state === "recording") {
      // Best effort: a failure to stop must not prevent releasing the stream.
      try { this.mediaRecorder.stop(); } catch {}
    }
    this.cleanup();
  }

  /** Stops all stream tracks and resets internal state. Safe to call repeatedly. */
  private cleanup(): void {
    if (this.stream) {
      this.stream.getTracks().forEach((t) => t.stop());
      this.stream = null;
    }
    this.mediaRecorder = null;
    this.chunks = [];
  }

  /** True when both `getUserMedia` and `MediaRecorder` are available. */
  static isSupported(): boolean {
    return (
      typeof navigator !== "undefined" &&
      typeof navigator.mediaDevices?.getUserMedia === "function" &&
      typeof MediaRecorder !== "undefined"
    );
  }
}
|
|
74
|
+
|
|
75
|
+
/** Phases reported by `useVoiceRecorder` as its `state`. */
export type VoiceRecorderState = "idle" | "recording" | "processing" | "error";
|
|
80
|
+
|
|
81
|
+
/** Options accepted by `useVoiceRecorder`. */
export interface UseVoiceRecorderOptions {
  /**
   * Endpoint the recorded audio is POSTed to. Defaults to
   * `/route/voice/transcribe` (arc-host `/route` prefix + voice path).
   */
  apiUrl?: string;

  /** ISO 639-1 language code (e.g. "pl"). Sent as a form field. */
  language?: string;

  /** Maximum recording length (ms) — recording auto-stops once exceeded. Default 60_000. */
  maxDurationMs?: number;

  /** Called when the transcription is ready. */
  onTranscript: (text: string) => void;

  /** Called on every error (permission, network, API). */
  onError?: (err: Error) => void;
}
|
|
93
|
+
|
|
94
|
+
/** Value returned by `useVoiceRecorder`. */
export interface UseVoiceRecorderResult {
  state: VoiceRecorderState;

  /** Recording time in ms — grows while `state === "recording"`. */
  elapsedMs: number;

  /** Last error (cleared on the next `start()`). */
  error: Error | null;

  start: () => Promise<void>;
  stop: () => Promise<void>;
  cancel: () => void;

  /** Whether MediaRecorder + getUserMedia are available. */
  isSupported: boolean;
}
|
|
106
|
+
|
|
107
|
+
/** Transcription endpoint used when `apiUrl` is not provided. */
const DEFAULT_API_URL = "/route/voice/transcribe";

/** Recording cap (ms) used when `maxDurationMs` is not provided. */
const DEFAULT_MAX_MS = 60 * 1_000;

/**
 * Elapsed-time refresh period (ms). 100ms is enough for a smooth
 * progress-ring animation without overloading React with renders.
 */
const TICK_INTERVAL_MS = 100;
|
|
112
|
+
|
|
113
|
+
/**
 * React hook driving one record → upload → transcribe cycle at a time.
 *
 * `start()` acquires the microphone and begins recording, `stop()` ends the
 * recording and POSTs the audio to `apiUrl`, `cancel()` discards the current
 * cycle. Recording auto-stops after `maxDurationMs`.
 *
 * Concurrency rules:
 * - `start()` is ignored while a cycle is in flight (acquiring the mic,
 *   recording or processing). The guard is a ref, so rapid double clicks
 *   cannot start two recorders before React re-renders.
 * - `cancel()` and unmount invalidate the current cycle: a recorder that
 *   finishes acquiring the mic afterwards is released immediately, and a
 *   late transcript or error from that cycle is dropped.
 *
 * @param options See `UseVoiceRecorderOptions`.
 * @returns Current state, elapsed time, last error and the control functions.
 */
export function useVoiceRecorder(
  options: UseVoiceRecorderOptions,
): UseVoiceRecorderResult {
  const {
    apiUrl = DEFAULT_API_URL,
    language,
    maxDurationMs = DEFAULT_MAX_MS,
    onTranscript,
    onError,
  } = options;

  const [state, setState] = useState<VoiceRecorderState>("idle");
  const [elapsedMs, setElapsedMs] = useState(0);
  const [error, setError] = useState<Error | null>(null);

  const recorderRef = useRef<Recorder | null>(null);
  const startTimeRef = useRef<number>(0);
  const tickRef = useRef<ReturnType<typeof setInterval> | null>(null);
  const autoStopRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  // Synchronous "a cycle is in flight" flag. React state cannot be used for
  // this: it is only visible to callbacks after the next render.
  const busyRef = useRef(false);
  // Bumped on every start / cancel / unmount; async continuations compare
  // their captured value to detect that their cycle was superseded.
  const sessionRef = useRef(0);
  // Callbacks live in refs so start/stop need not depend on them and are not
  // re-created on every consumer render.
  const onTranscriptRef = useRef(onTranscript);
  const onErrorRef = useRef(onError);
  onTranscriptRef.current = onTranscript;
  onErrorRef.current = onError;

  const clearTimers = useCallback(() => {
    if (tickRef.current !== null) {
      clearInterval(tickRef.current);
      tickRef.current = null;
    }
    if (autoStopRef.current !== null) {
      clearTimeout(autoStopRef.current);
      autoStopRef.current = null;
    }
  }, []);

  const handleError = useCallback(
    (e: unknown) => {
      const err = e instanceof Error ? e : new Error(String(e));
      busyRef.current = false;
      setError(err);
      setState("error");
      onErrorRef.current?.(err);
    },
    [],
  );

  const transcribe = useCallback(
    async (audio: Blob, session: number) => {
      setState("processing");
      try {
        const form = new FormData();
        form.append("audio", audio);
        if (language) form.append("language", language);
        const res = await fetch(apiUrl, { method: "POST", body: form });
        if (!res.ok) {
          throw new Error(`transcribe failed: ${res.status} ${await res.text().catch(() => "")}`);
        }
        const text = await res.text();
        // Cancelled or unmounted meanwhile — do not touch newer state.
        if (sessionRef.current !== session) return;
        busyRef.current = false;
        setState("idle");
        setElapsedMs(0);
        onTranscriptRef.current(text.trim());
      } catch (e) {
        if (sessionRef.current === session) handleError(e);
      }
    },
    [apiUrl, language, handleError],
  );

  const stop = useCallback(async () => {
    clearTimers();
    const rec = recorderRef.current;
    if (!rec) return;
    recorderRef.current = null;
    const session = sessionRef.current;
    try {
      const blob = await rec.stop();
      if (sessionRef.current !== session) return;
      if (blob.size === 0) {
        // Nothing was captured — skip the upload.
        busyRef.current = false;
        setState("idle");
        setElapsedMs(0);
        return;
      }
      await transcribe(blob, session);
    } catch (e) {
      if (sessionRef.current === session) handleError(e);
    }
  }, [clearTimers, transcribe, handleError]);

  // The auto-stop timer outlives the render that armed it; going through a
  // ref makes it use the latest `stop` (and thus the latest apiUrl/language).
  const stopRef = useRef(stop);
  stopRef.current = stop;

  const start = useCallback(async () => {
    if (busyRef.current) return;
    setError(null);
    setElapsedMs(0);
    if (!Recorder.isSupported()) {
      handleError(new Error("Voice recording not supported in this browser"));
      return;
    }
    busyRef.current = true;
    const session = ++sessionRef.current;
    const rec = new Recorder();
    try {
      await rec.start();
    } catch (e) {
      if (sessionRef.current === session) handleError(e);
      return;
    }
    if (sessionRef.current !== session) {
      // Cancelled or unmounted while waiting for the microphone.
      rec.cancel();
      return;
    }
    clearTimers();
    recorderRef.current = rec;
    startTimeRef.current = Date.now();
    setState("recording");
    tickRef.current = setInterval(() => {
      setElapsedMs(Date.now() - startTimeRef.current);
    }, TICK_INTERVAL_MS);
    autoStopRef.current = setTimeout(() => {
      // Max time exceeded — auto-stop + transcribe.
      void stopRef.current();
    }, maxDurationMs);
  }, [maxDurationMs, clearTimers, handleError]);

  const cancel = useCallback(() => {
    clearTimers();
    sessionRef.current += 1;
    busyRef.current = false;
    const rec = recorderRef.current;
    recorderRef.current = null;
    rec?.cancel();
    setState("idle");
    setElapsedMs(0);
    setError(null);
  }, [clearTimers]);

  // Cleanup when the component disappears mid-cycle (e.g. the user navigates).
  useEffect(() => {
    return () => {
      clearTimers();
      sessionRef.current += 1;
      busyRef.current = false;
      recorderRef.current?.cancel();
      recorderRef.current = null;
    };
  }, [clearTimers]);

  return {
    state,
    elapsedMs,
    error,
    start,
    stop,
    cancel,
    isSupported: Recorder.isSupported(),
  };
}
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
import { Mic, MicOff, Loader2, Square } from "lucide-react";
|
|
2
|
+
import type { VoiceRecorderState } from "./use-voice-recorder";
|
|
3
|
+
|
|
4
|
+
/** Props of the presentational `VoiceButton`. */
export interface VoiceButtonProps {
  state: VoiceRecorderState;

  /** Recording time in ms — drives the progress ring and the M:SS label. */
  elapsedMs: number;

  /** Maximum (ms) — reference for the ring fill (0% → 100%). */
  maxDurationMs: number;

  /** Click while idle = start, while recording = stop. */
  onClick: () => void;

  /** Cancel click — rendered as a separate button next to the ring while recording. */
  onCancel?: () => void;

  /** Error whose message becomes the tooltip of the MicOff icon. */
  error?: Error | null;

  className?: string;
}
|
|
18
|
+
|
|
19
|
+
// Progress-ring geometry. The ring is drawn in a SIZE×SIZE SVG; the radius is
// reduced by half the stroke width so the stroke stays inside the box.
const SIZE = 32;
const STROKE = 2.5;
const RADIUS = (SIZE - STROKE) / 2;
// Full ring length — used as the dash array that the dash offset is cut from.
const CIRCUMFERENCE = 2 * Math.PI * RADIUS;
|
|
25
|
+
|
|
26
|
+
/**
 * Formats a duration as `M:SS` (minutes are not zero-padded).
 *
 * @param ms Duration in milliseconds. Negative and non-finite values
 *   (NaN, ±Infinity) are rendered as `0:00`.
 */
function formatMmSs(ms: number): string {
  const totalSeconds = Number.isFinite(ms)
    ? Math.max(0, Math.floor(ms / 1000))
    : 0;
  const minutes = Math.floor(totalSeconds / 60);
  const seconds = totalSeconds % 60;
  return `${minutes}:${String(seconds).padStart(2, "0")}`;
}
|
|
32
|
+
|
|
33
|
+
/**
 * Presentational microphone button with four looks, selected by `state`:
 * recording (progress ring + stop square + M:SS timer, optional cancel
 * button), processing (disabled spinner), error (MicOff with the error
 * message as tooltip) and idle (Mic).
 *
 * The ring fill is `elapsedMs / maxDurationMs`, clamped to 0–1; a ratio that
 * is not a number (e.g. 0 / 0) renders as an empty ring.
 */
export function VoiceButton({
  state,
  elapsedMs,
  maxDurationMs,
  onClick,
  onCancel,
  error,
  className,
}: VoiceButtonProps) {
  const recording = state === "recording";
  const processing = state === "processing";
  const errored = state === "error";

  if (recording) {
    // Clamp so a zero/negative max or a negative elapsed time can never
    // yield a NaN or out-of-range dash offset.
    const ratio = elapsedMs / maxDurationMs;
    const progress = Number.isNaN(ratio) ? 0 : Math.min(1, Math.max(0, ratio));
    const dashOffset = CIRCUMFERENCE * (1 - progress);
    return (
      <div className={`flex items-center gap-1 ${className ?? ""}`}>
        {onCancel && (
          <button
            type="button"
            onClick={onCancel}
            aria-label="Anuluj nagrywanie"
            title="Anuluj"
            className="flex h-7 w-7 items-center justify-center rounded-full text-muted-foreground/70 transition-colors hover:bg-muted hover:text-foreground"
          >
            <span className="text-xs">✕</span>
          </button>
        )}
        <button
          type="button"
          onClick={onClick}
          aria-label="Zakończ nagrywanie"
          className="relative inline-flex items-center justify-center"
          style={{ width: SIZE, height: SIZE }}
        >
          <svg
            width={SIZE}
            height={SIZE}
            className="absolute inset-0 -rotate-90"
            aria-hidden
          >
            {/* Background — full ring, low opacity */}
            <circle
              cx={SIZE / 2}
              cy={SIZE / 2}
              r={RADIUS}
              fill="none"
              stroke="currentColor"
              strokeWidth={STROKE}
              className="text-muted-foreground/20"
            />
            {/* Progress — fills up as elapsedMs grows */}
            <circle
              cx={SIZE / 2}
              cy={SIZE / 2}
              r={RADIUS}
              fill="none"
              stroke="currentColor"
              strokeWidth={STROKE}
              strokeLinecap="round"
              strokeDasharray={CIRCUMFERENCE}
              strokeDashoffset={dashOffset}
              className="text-primary transition-[stroke-dashoffset] duration-100 ease-linear"
            />
          </svg>
          {/* Pulsing halo — signals that recording is active */}
          <span
            className="absolute inset-0 rounded-full bg-primary/20 animate-ping"
            style={{ animationDuration: "1.5s" }}
            aria-hidden
          />
          {/* Center — stop square; the timer is rendered next to the button */}
          <Square className="relative h-3 w-3 fill-primary text-primary" />
        </button>
        <span className="text-xs font-medium tabular-nums text-muted-foreground min-w-[2.5rem]">
          {formatMmSs(elapsedMs)}
        </span>
      </div>
    );
  }

  if (processing) {
    return (
      <button
        type="button"
        disabled
        aria-label="Transkrybuję…"
        className={`inline-flex h-7 w-7 items-center justify-center rounded-full text-primary ${className ?? ""}`}
      >
        <Loader2 className="h-4 w-4 animate-spin" />
      </button>
    );
  }

  if (errored) {
    return (
      <button
        type="button"
        onClick={onClick}
        aria-label="Spróbuj ponownie nagrać"
        title={error?.message ?? "Błąd nagrywania"}
        className={`inline-flex h-7 w-7 items-center justify-center rounded-full text-destructive transition-colors hover:bg-destructive/10 ${className ?? ""}`}
      >
        <MicOff className="h-4 w-4" />
      </button>
    );
  }

  // idle
  return (
    <button
      type="button"
      onClick={onClick}
      aria-label="Nagraj głosowo"
      title="Nagraj głosowo"
      className={`inline-flex h-7 w-7 items-center justify-center rounded-full text-muted-foreground/70 transition-colors hover:bg-muted hover:text-foreground ${className ?? ""}`}
    >
      <Mic className="h-4 w-4" />
    </button>
  );
}
|
|
@@ -0,0 +1,130 @@
|
|
|
1
|
+
import { useEffect, useRef } from "react";
|
|
2
|
+
import { useI18n } from "@arcote.tech/platform";
|
|
3
|
+
import { useVoiceRecorder } from "./use-voice-recorder";
|
|
4
|
+
import { VoiceButton } from "./voice-button";
|
|
5
|
+
|
|
6
|
+
/** Props of `VoiceContentEditable`. */
export interface VoiceContentEditableProps {
  value: string;
  onChange: (value: string) => void;
  placeholder?: string;
  className?: string;

  /** ISO 639-1 code (e.g. "pl"). Defaults to the language of `useI18n().locale`. */
  language?: string;

  /** Maximum recording length in ms. Default 60_000. */
  maxDurationMs?: number;

  /** Overrides the transcription endpoint. Default `/route/voice/transcribe`. */
  transcribeUrl?: string;
}
|
|
18
|
+
|
|
19
|
+
const DEFAULT_MAX_MS = 60_000;

/**
 * Auto-growing contentEditable field — no inner scroll, the height grows with
 * the content. Meant for "describe a longer thought" fields where the whole
 * text should stay visible without a max height. The voice button sits in
 * the top-right corner (absolute); the text gets right padding so it does
 * not run under the icon.
 *
 * Implementation is similar to `TextareaField`
 * (`/ds/form/fields/textarea-field.tsx`) but without `maxHeight` / scroll —
 * the div grows on its own.
 *
 * A transcript is appended to the current `value` (separated by a space when
 * `value` is non-empty). The editable div is exposed to assistive technology
 * as a multi-line textbox.
 */
export function VoiceContentEditable({
  value,
  onChange,
  placeholder,
  className,
  language,
  maxDurationMs = DEFAULT_MAX_MS,
  transcribeUrl,
}: VoiceContentEditableProps) {
  const ref = useRef<HTMLDivElement>(null);
  const composingRef = useRef(false);
  const locale = useLocaleLanguage();

  const { state, elapsedMs, error, start, stop, cancel } = useVoiceRecorder({
    apiUrl: transcribeUrl,
    language: language ?? locale,
    maxDurationMs,
    onTranscript: (text) => {
      const trimmed = text.trim();
      if (!trimmed) return;
      onChange(value ? `${value.trimEnd()} ${trimmed}` : trimmed);
    },
  });

  // Sync external value → DOM. Written through innerText (plain text) to
  // avoid HTML escaping and caret inconsistencies; skipped when the DOM
  // already matches so typing does not reset the caret.
  useEffect(() => {
    if (!ref.current) return;
    if (ref.current.innerText !== (value ?? "")) {
      ref.current.innerText = value ?? "";
    }
  }, [value]);

  const handleInput = () => {
    // During IME composition the text is provisional; report it on
    // compositionend instead.
    if (composingRef.current) return;
    onChange(ref.current?.innerText ?? "");
  };

  // Paste as plain text — without styles from another app.
  const handlePaste = (e: React.ClipboardEvent) => {
    e.preventDefault();
    const text = e.clipboardData.getData("text/plain");
    document.execCommand("insertText", false, text);
  };

  const recording = state === "recording";
  const isEmpty = !value;

  return (
    <div className={`relative ${className ?? ""}`}>
      <div
        ref={ref}
        contentEditable
        // A bare contentEditable div has no implicit role; declare it so
        // screen readers announce a multi-line text field.
        role="textbox"
        aria-multiline="true"
        aria-placeholder={placeholder}
        onInput={handleInput}
        onPaste={handlePaste}
        onCompositionStart={() => (composingRef.current = true)}
        onCompositionEnd={() => {
          composingRef.current = false;
          handleInput();
        }}
        suppressContentEditableWarning
        className={
          // `whitespace-pre-wrap` + `break-words` (overflow-wrap: break-word)
          // breaks long words that do not fit on a line. The arbitrary
          // `[overflow-wrap:anywhere]` is more aggressive and catches edge
          // cases such as long URLs / glued words without spaces.
          "min-h-[6rem] w-full rounded-md border border-input bg-background px-3 py-2 text-sm whitespace-pre-wrap break-words [overflow-wrap:anywhere] focus:outline-none focus:ring-2 focus:ring-ring " +
          (recording ? "pr-28" : "pr-10")
        }
        data-placeholder={placeholder}
      />
      {isEmpty && placeholder && (
        <span
          aria-hidden
          className="pointer-events-none absolute left-3 top-2 text-sm text-muted-foreground"
        >
          {placeholder}
        </span>
      )}
      <div className="absolute top-1.5 right-1.5">
        <VoiceButton
          state={state}
          elapsedMs={elapsedMs}
          maxDurationMs={maxDurationMs}
          error={error}
          onClick={recording ? stop : start}
          onCancel={recording ? cancel : undefined}
        />
      </div>
    </div>
  );
}
|
|
122
|
+
|
|
123
|
+
/**
 * Derives the primary language subtag from `useI18n().locale`, lower-cased
 * (`pl-PL` → `pl`, `en_US` → `en`, `DE` → `de`).
 *
 * @returns The language code, or `undefined` when the locale is empty or
 *   `useI18n()` throws.
 */
function useLocaleLanguage(): string | undefined {
  try {
    const { locale } = useI18n();
    // Accept both BCP-47 (`-`) and POSIX-style (`_`) separators.
    const primary = locale?.split(/[-_]/)[0]?.trim().toLowerCase();
    return primary ? primary : undefined;
  } catch {
    return undefined;
  }
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import { Input } from "@arcote.tech/arc-ds";
|
|
2
|
+
import { useI18n } from "@arcote.tech/platform";
|
|
3
|
+
import type { ComponentType } from "react";
|
|
4
|
+
import { useVoiceRecorder } from "./use-voice-recorder";
|
|
5
|
+
import { VoiceButton } from "./voice-button";
|
|
6
|
+
|
|
7
|
+
/** Props of `VoiceTextInput`. */
export interface VoiceTextInputProps {
  value: string;
  onChange: (value: string) => void;
  placeholder?: string;
  icon?: ComponentType<{ className?: string }>;
  size?: "default" | "sm" | "xs" | "lg";
  className?: string;

  /**
   * ISO 639-1 code (e.g. "pl"). Defaults to the language of
   * `useI18n().locale` (e.g. `pl-PL` → `pl`).
   */
  language?: string;

  /** Maximum recording length in ms. Default 60_000. */
  maxDurationMs?: number;

  /** Overrides the transcription endpoint. Default `/route/voice/transcribe`. */
  transcribeUrl?: string;
}
|
|
21
|
+
|
|
22
|
+
const DEFAULT_MAX_MS = 60_000;

/**
 * Single-line `Input` with a voice-recording button on its right side
 * (vertically centered). The transcript is **appended** to the current
 * `value` (with a space when `value` is non-empty), which allows dictating
 * in chunks.
 */
export function VoiceTextInput({
  value,
  onChange,
  placeholder,
  icon,
  size = "default",
  className,
  language,
  maxDurationMs = DEFAULT_MAX_MS,
  transcribeUrl,
}: VoiceTextInputProps) {
  const fallbackLanguage = useLocaleLanguage();

  // Appends dictated text to whatever the field currently holds.
  const appendTranscript = (text: string) => {
    const addition = text.trim();
    if (addition === "") return;
    onChange(value ? `${value.trimEnd()} ${addition}` : addition);
  };

  const voice = useVoiceRecorder({
    apiUrl: transcribeUrl,
    language: language ?? fallbackLanguage,
    maxDurationMs,
    onTranscript: appendTranscript,
  });

  const isRecording = voice.state === "recording";
  // Right padding leaves room for the button plus, while recording, the timer.
  const inputPadding = isRecording ? "pr-24" : "pr-10";

  return (
    <div className={`relative ${className ?? ""}`}>
      <Input
        value={value}
        onChange={(e) => onChange(e.target.value)}
        placeholder={placeholder}
        icon={icon}
        size={size}
        className={inputPadding}
      />
      <div className="absolute right-2 top-1/2 -translate-y-1/2">
        <VoiceButton
          state={voice.state}
          elapsedMs={voice.elapsedMs}
          maxDurationMs={maxDurationMs}
          error={voice.error}
          onClick={isRecording ? voice.stop : voice.start}
          onCancel={isRecording ? voice.cancel : undefined}
        />
      </div>
    </div>
  );
}
|
|
78
|
+
|
|
79
|
+
/**
 * Derives the primary language subtag from `useI18n().locale`, lower-cased
 * (`pl-PL` → `pl`, `en_US` → `en`, `DE` → `de`). Whisper accepts BCP-47 but
 * ISO 639-1 is the safer choice.
 *
 * @returns The language code, or `undefined` when the locale is empty or
 *   `useI18n()` throws.
 */
function useLocaleLanguage(): string | undefined {
  try {
    const { locale } = useI18n();
    // Accept both BCP-47 (`-`) and POSIX-style (`_`) separators.
    const primary = locale?.split(/[-_]/)[0]?.trim().toLowerCase();
    return primary ? primary : undefined;
  } catch {
    return undefined;
  }
}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
import { TextareaField } from "@arcote.tech/arc-ds";
|
|
2
|
+
import { useI18n } from "@arcote.tech/platform";
|
|
3
|
+
import type { ReactNode } from "react";
|
|
4
|
+
import { useVoiceRecorder } from "./use-voice-recorder";
|
|
5
|
+
import { VoiceButton } from "./voice-button";
|
|
6
|
+
|
|
7
|
+
/** Props of `VoiceTextarea`. */
export interface VoiceTextareaProps {
  value: string;
  onChange: (value: string) => void;
  placeholder?: string;
  label?: ReactNode;
  rows?: number;
  maxHeight?: number;
  className?: string;

  /** ISO 639-1 code (e.g. "pl"). Defaults to the language of `useI18n().locale`. */
  language?: string;

  /** Maximum recording length in ms. Default 60_000. */
  maxDurationMs?: number;

  /** Overrides the transcription endpoint. Default `/route/voice/transcribe`. */
  transcribeUrl?: string;
}
|
|
22
|
+
|
|
23
|
+
const DEFAULT_MAX_MS = 60_000;

/**
 * Multi-line `TextareaField` editor with a voice-recording button in the
 * TOP-right corner. The transcript is **appended** to the current `value`.
 * For chat usage keep `rows=1` — TextareaField auto-grows with its content.
 */
export function VoiceTextarea({
  value,
  onChange,
  placeholder,
  label,
  rows = 4,
  maxHeight,
  className,
  language,
  maxDurationMs = DEFAULT_MAX_MS,
  transcribeUrl,
}: VoiceTextareaProps) {
  const fallbackLanguage = useLocaleLanguage();

  // Appends dictated text to whatever the field currently holds.
  const appendTranscript = (text: string) => {
    const addition = text.trim();
    if (addition === "") return;
    onChange(value ? `${value.trimEnd()} ${addition}` : addition);
  };

  const voice = useVoiceRecorder({
    apiUrl: transcribeUrl,
    language: language ?? fallbackLanguage,
    maxDurationMs,
    onTranscript: appendTranscript,
  });

  const isRecording = voice.state === "recording";

  // Single line (e.g. chat input with rows=1) — button vertically centered.
  // Multi-line — button anchored top-right, next to the first line.
  const buttonAnchor =
    (rows ?? 1) <= 1
      ? "top-1/2 -translate-y-1/2 right-1.5"
      : "top-1.5 right-1.5";

  // Right padding on the editable itself keeps text from running under the
  // absolutely positioned VoiceButton. While recording the widget shows the
  // mic + M:SS timer (wider), hence the larger padding.
  const editablePadding = isRecording ? "pr-28" : "pr-10";

  return (
    <div className={`relative ${className ?? ""}`}>
      <TextareaField
        value={value}
        onChange={(val) => onChange(val ?? "")}
        placeholder={placeholder}
        label={label}
        rows={rows}
        maxHeight={maxHeight}
        inputClassName={editablePadding}
      />
      <div className={`absolute ${buttonAnchor}`}>
        <VoiceButton
          state={voice.state}
          elapsedMs={voice.elapsedMs}
          maxDurationMs={maxDurationMs}
          error={voice.error}
          onClick={isRecording ? voice.stop : voice.start}
          onCancel={isRecording ? voice.cancel : undefined}
        />
      </div>
    </div>
  );
}
|
|
90
|
+
|
|
91
|
+
/**
 * Derives a language code from the active i18n locale by taking the part
 * before the first "-" (e.g. "pl-PL" -> "pl").
 *
 * @returns The language part, or `undefined` when the locale is empty or when
 * reading it throws.
 */
function useLocaleLanguage(): string | undefined {
  // NOTE(review): React's Rules of Hooks advise against calling hooks inside
  // try/catch. The guard is kept because removing it would change behaviour
  // when `useI18n()` throws — confirm whether that case is still needed.
  try {
    const activeLocale = useI18n().locale;
    if (!activeLocale) {
      return undefined;
    }
    const [languagePart] = activeLocale.split("-");
    return languagePart;
  } catch {
    return undefined;
  }
}
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
/// <reference path="../arc.d.ts" />
|
|
2
|
+
import { route, type ArcTokenAny } from "@arcote.tech/arc";
|
|
3
|
+
import type { TranscriptionProvider } from "../types";
|
|
4
|
+
|
|
5
|
+
/** Configuration for `createTranscribeRoute`. */
export interface TranscribeRouteConfig {
  /** Transcription backend that turns the uploaded audio into text. */
  provider: TranscriptionProvider;
  /** Language used when the client sends no `language` in the form data. */
  defaultLanguage?: string;
  /**
   * Optional gating of the endpoint. The consumer passes its own token (e.g.
   * `userToken` from arc-auth) and a `check` function returning `true` or a
   * `{ ...whereClause }` object. Without it the endpoint is `.public()`, which
   * exposes the API costs to anyone — so the recommendation is to always pass
   * a token.
   */
  protectBy?: {
    /** Token the route is protected by. */
    token: ArcTokenAny;
    /** Access check; when omitted, a check that always returns `true` is used. */
    check?: (params: any) => boolean | object;
  };
}
|
|
21
|
+
|
|
22
|
+
/** Largest accepted upload in bytes (25 MiB), matching the Whisper limit of 25MB. */
const MAX_AUDIO_BYTES = 25 * 1024 ** 2;
|
|
23
|
+
|
|
24
|
+
/**
 * Builds the voice transcription route (`POST /route/voice/transcribe`).
 *
 * Multipart body:
 *  - `audio` (Blob, required) — webm/opus, mp4, wav, mp3, m4a, ogg
 *  - `language` (string, optional) — ISO 639-1 code (e.g. "pl"); overrides
 *    `config.defaultLanguage`. Blank or non-string values are ignored and the
 *    default is used instead.
 *
 * Responses:
 *  - 200 `text/plain` with the transcript
 *  - 400 JSON `{ error }` for an unparsable body, a missing `audio` field or
 *    empty audio
 *  - 413 JSON `{ error }` when the audio exceeds `MAX_AUDIO_BYTES`
 *  - 502 JSON `{ error }` when the provider throws
 *  - 500 plain text when `ONLY_SERVER` is falsy
 *
 * @param config Provider, optional default language and optional gating.
 * @returns The route produced by `.handle(...)`.
 */
export function createTranscribeRoute(config: TranscribeRouteConfig) {
  const base = route("voiceTranscribe").path("/voice/transcribe");
  // `.public()` and `.protectBy()` return incompatible types (`isPublic: true`
  // vs. protections in the type), so both branches live in one expression
  // instead of reassigning a `let`.
  const gated = config.protectBy
    ? base.protectBy(
        config.protectBy.token,
        (config.protectBy.check ?? (() => true)) as any,
      )
    : base.public();

  return gated.handle({
    POST: async (_ctx, req: Request) => {
      if (!ONLY_SERVER) {
        return new Response("server only", { status: 500 });
      }

      let form: FormData;
      try {
        form = await req.formData();
      } catch {
        return jsonError(400, "invalid multipart body");
      }

      const audio = form.get("audio");
      if (!(audio instanceof Blob)) {
        return jsonError(400, "missing 'audio' field");
      }
      if (audio.size === 0) {
        return jsonError(400, "empty audio");
      }
      if (audio.size > MAX_AUDIO_BYTES) {
        return jsonError(413, `audio exceeds ${MAX_AUDIO_BYTES} bytes`);
      }

      // `FormData.get` may return a File as well as a string, so only a real,
      // non-blank string counts as a client-supplied language. `||` is
      // intentional: an empty string must fall back to the configured default.
      const requestedLanguage = form.get("language");
      const language =
        (typeof requestedLanguage === "string"
          ? requestedLanguage.trim()
          : "") || config.defaultLanguage;

      try {
        const text = await config.provider.transcribe(
          audio,
          language ? { language } : undefined,
        );
        return new Response(text, {
          status: 200,
          headers: { "Content-Type": "text/plain; charset=utf-8" },
        });
      } catch (e) {
        const msg = e instanceof Error ? e.message : "transcription failed";
        console.error("[voice:transcribe]", msg);
        // NOTE(review): the provider's message is returned to the client
        // verbatim — confirm it cannot contain sensitive upstream details.
        return jsonError(502, msg);
      }
    },
  });
}
|
|
88
|
+
|
|
89
|
+
/**
 * Creates a JSON error response whose body is `{ "error": <message> }`.
 *
 * @param status HTTP status code of the response.
 * @param error Message placed under the `error` key.
 * @returns A `Response` with `Content-Type: application/json`.
 */
function jsonError(status: number, error: string): Response {
  const payload = JSON.stringify({ error });
  const headers = { "Content-Type": "application/json" };
  return new Response(payload, { status, headers });
}
|
package/src/types.ts
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
// Abstrakcja providera transkrypcji audio → tekst. Konkretne adaptery
|
|
2
|
+
// (whisper, google-speech itp.) eksportują fabrykę zwracającą instancję
|
|
3
|
+
// tej struktury. `voice({ provider })` wstrzykuje wybranego providera do
|
|
4
|
+
// route handlera `/voice/transcribe`.
|
|
5
|
+
|
|
6
|
+
/** Per-request options passed to a `TranscriptionProvider`. */
export interface TranscriptionOptions {
  /**
   * Language of the recording as an ISO 639-1 code (e.g. "pl", "en"). Most
   * providers can auto-detect it, but stating the language significantly
   * improves accuracy for short recordings.
   */
  language?: string;
}
|
|
14
|
+
|
|
15
|
+
/** Contract implemented by every audio-to-text backend. */
export interface TranscriptionProvider {
  /** Provider identifier — used in logs and diagnostics. */
  name: string;
  /**
   * Transcribes audio (webm/opus, mp4, wav, mp3...) into text.
   * Implementations should throw on an invalid format or an API failure.
   *
   * @param audio Recorded audio to transcribe.
   * @param options Optional hints such as the recording language.
   * @returns A promise resolving to the transcript text.
   */
  transcribe(audio: Blob, options?: TranscriptionOptions): Promise<string>;
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import { contextElement, module } from "@arcote.tech/platform";
|
|
2
|
+
import {
|
|
3
|
+
createTranscribeRoute,
|
|
4
|
+
type TranscribeRouteConfig,
|
|
5
|
+
} from "./routes/transcribe-route";
|
|
6
|
+
|
|
7
|
+
/** Configuration accepted by `voice()`; it adds no members to `TranscribeRouteConfig`. */
export interface VoiceConfig extends TranscribeRouteConfig {}
|
|
8
|
+
|
|
9
|
+
/**
 * Creates the `voice` module definition around the `/voice/transcribe` route.
 * The consumer calls it once during application init:
 *
 * ```ts
 * voice({
 *   provider: whisper({ apiKey: process.env.OPENAI_API_KEY! }),
 *   defaultLanguage: "pl",
 *   protectBy: { token: userToken },
 * }).build();
 * ```
 *
 * The maximum recording time (`maxDurationMs`) is controlled by the
 * `useVoiceRecorder` hook — `VoiceTextarea`/`VoiceTextInput`/
 * `VoiceContentEditable` take it as a prop, so the consumer sets it per
 * component.
 *
 * @param config Route configuration (provider, default language, gating).
 * @returns The transcribe route plus a `build()` function that registers it.
 */
export function voice(config: VoiceConfig) {
  const transcribeRoute = createTranscribeRoute(config);

  /** Registers the "voice" module with the transcribe route as its single fragment. */
  const build = () => {
    const voiceModule = module("voice");
    const withRoute = voiceModule.public([contextElement(transcribeRoute)]);
    return withRoute.build();
  };

  return { route: transcribeRoute, build };
}
|
package/tsconfig.json
ADDED