@oh-my-pi/pi-web-ui 3.15.1 → 3.20.0
This diff shows the changes between publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- package/CHANGELOG.md +10 -1
- package/README.md +2 -1
- package/example/src/main.ts +3 -3
- package/example/tsconfig.json +1 -1
- package/package.json +4 -4
- package/src/components/AgentInterface.ts +326 -3
- package/src/utils/voice.ts +272 -0
package/CHANGELOG.md
CHANGED
@@ -2,6 +2,15 @@
 
 ## [Unreleased]
 
+## [3.20.0] - 2026-01-06
+### Added
+
+- Added voice input and output support using OpenAI Whisper for transcription and TTS for speech synthesis
+- Added `enableVoice` property to toggle voice features in the agent interface
+- Added Caps Lock key as push-to-talk trigger for voice capture
+- Added automatic voice summarization and playback of assistant responses
+- Added silence detection to automatically stop voice recording after speech ends
+
 ## [3.15.1] - 2026-01-05
 
 ## [3.15.0] - 2026-01-05
@@ -148,4 +157,4 @@ declare module "@oh-my-pi/pi-agent-core" {
     "my-message": MyMessage;
   }
 }
-```
+```
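The entries above describe the consumer-facing surface: voice is on by default and gated by a single `enableVoice` property on the agent interface component. A minimal sketch of opting out from a Lit template; the `<agent-interface>` tag name and the registering entry-point import are assumptions, since the diff shows the `AgentInterface` class but not its registered tag:

```ts
import { html, render } from "lit";
import "@oh-my-pi/pi-web-ui"; // assumed entry point that registers the element

// enableVoice defaults to true, so it is bound explicitly to opt out.
render(
  html`<agent-interface .enableVoice=${false}></agent-interface>`,
  document.body,
);
```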
package/README.md
CHANGED
package/example/src/main.ts
CHANGED
@@ -343,11 +343,11 @@ const renderApp = () => {
       size: "sm",
       children: icon(Bell, "sm"),
       onClick: () => {
-        // Demo:
+        // Demo: Queue custom message for the next turn
         if (agent) {
-          agent.
+          agent.followUp(
             createSystemNotification(
-              "This is a custom message! It appears in the UI
+              "This is a custom message! It appears in the UI and is sent to the LLM.",
             ),
           );
         }
package/example/tsconfig.json
CHANGED
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@oh-my-pi/pi-web-ui",
-  "version": "3.15.1",
+  "version": "3.20.0",
   "description": "Reusable web UI components for AI chat interfaces powered by @oh-my-pi/pi-ai",
   "type": "module",
   "main": "src/index.ts",
@@ -19,9 +19,9 @@
   },
   "dependencies": {
     "@lmstudio/sdk": "^1.5.0",
-    "@oh-my-pi/pi-agent-core": "3.
-    "@oh-my-pi/pi-ai": "3.
-    "@oh-my-pi/pi-tui": "3.
+    "@oh-my-pi/pi-agent-core": "3.20.0",
+    "@oh-my-pi/pi-ai": "3.20.0",
+    "@oh-my-pi/pi-tui": "3.20.0",
     "docx-preview": "^0.3.7",
     "highlight.js": "^11.11.1",
     "jszip": "^3.10.1",
package/src/components/AgentInterface.ts
CHANGED
@@ -1,6 +1,15 @@
-import {
+import {
+  type AssistantMessage,
+  completeSimple,
+  getModels,
+  type Model,
+  streamSimple,
+  type TextContent,
+  type ToolResultMessage,
+  type Usage,
+} from "@oh-my-pi/pi-ai";
 import { html, LitElement } from "lit";
-import { customElement, property, query } from "lit/decorators.js";
+import { customElement, property, query, state } from "lit/decorators.js";
 import { ModelSelector } from "../dialogs/ModelSelector";
 import type { MessageEditor } from "./MessageEditor";
 import "./MessageEditor.js";
@@ -13,6 +22,7 @@ import type { Attachment } from "../utils/attachment-utils";
 import { formatUsage } from "../utils/format";
 import { i18n } from "../utils/i18n";
 import { createStreamFn } from "../utils/proxy-utils";
+import { synthesizeOpenAI, transcribeOpenAI, VoiceCapture, type VoiceCaptureResult } from "../utils/voice";
 import type { UserMessageWithAttachments } from "./Messages";
 import type { StreamingMessageContainer } from "./StreamingMessageContainer";
 
@@ -24,6 +34,7 @@ export class AgentInterface extends LitElement {
   @property({ type: Boolean }) enableModelSelector = true;
   @property({ type: Boolean }) enableThinkingSelector = true;
   @property({ type: Boolean }) showThemeToggle = false;
+  @property({ type: Boolean }) enableVoice = true;
   // Optional custom API key prompt handler - if not provided, uses default dialog
   @property({ attribute: false }) onApiKeyRequired?: (provider: string) => Promise<boolean>;
   // Optional callback called before sending a message
@@ -37,12 +48,21 @@
   @query("message-editor") private _messageEditor!: MessageEditor;
   @query("streaming-message-container") private _streamingContainer!: StreamingMessageContainer;
 
+  @state() private voiceStatus: "idle" | "listening" | "transcribing" | "speaking" | "error" = "idle";
+
   private _autoScroll = true;
   private _lastScrollTop = 0;
   private _lastClientHeight = 0;
   private _scrollContainer?: HTMLElement;
   private _resizeObserver?: ResizeObserver;
   private _unsubscribeSession?: () => void;
+  private voiceCapture?: VoiceCapture;
+  private voiceAudio?: HTMLAudioElement;
+  private voiceSummaryAbort?: AbortController;
+  private voiceLastSpokenTimestamp = 0;
+  private voiceKeyListener = (event: KeyboardEvent) => {
+    void this.handleVoiceKey(event);
+  };
 
   public setInput(text: string, attachments?: Attachment[]) {
     const update = () => {
@@ -104,6 +124,8 @@
 
     // Subscribe to external session if provided
    this.setupSessionSubscription();
+
+    window.addEventListener("keydown", this.voiceKeyListener);
  }
 
  override disconnectedCallback() {
@@ -123,6 +145,11 @@
      this._unsubscribeSession();
      this._unsubscribeSession = undefined;
    }
+
+    window.removeEventListener("keydown", this.voiceKeyListener);
+    void this.stopVoiceCapture("manual");
+    this.stopVoicePlayback();
+    this.voiceSummaryAbort?.abort();
  }
 
  private setupSessionSubscription() {
@@ -164,6 +191,7 @@
          this._streamingContainer.setMessage(null, true);
        }
        this.requestUpdate();
+        void this.handleAgentEnd();
        break;
      case "message_update":
        if (this._streamingContainer) {
@@ -203,6 +231,287 @@
    this._lastClientHeight = clientHeight;
  };
 
+  private async handleVoiceKey(event: KeyboardEvent): Promise<void> {
+    if (!this.enableVoice) return;
+    if (event.code !== "CapsLock" && event.key !== "CapsLock") return;
+    if (event.repeat) return;
+    event.preventDefault();
+
+    if (this.voiceStatus === "transcribing" || this.voiceStatus === "speaking") {
+      return;
+    }
+
+    if (this.voiceCapture?.isRecording) {
+      await this.stopVoiceCapture("manual");
+    } else {
+      await this.startVoiceCapture();
+    }
+  }
+
+  private async startVoiceCapture(): Promise<void> {
+    if (!this.enableVoice || this.voiceCapture?.isRecording) return;
+    const apiKey = await this.resolveApiKey("openai");
+    if (!apiKey) {
+      this.voiceStatus = "error";
+      console.error("Voice capture requires an OpenAI API key.");
+      return;
+    }
+
+    this.voiceStatus = "listening";
+    this.voiceCapture = new VoiceCapture();
+    const resultPromise = this.voiceCapture.start();
+    void resultPromise
+      .then((result) => this.handleVoiceCaptureResult(result))
+      .catch((error) => {
+        this.voiceStatus = "error";
+        this.voiceCapture = undefined;
+        console.error("Failed to start voice capture:", error);
+      });
+  }
+
+  private async stopVoiceCapture(reason: "manual" | "silence" | "max" | "error"): Promise<void> {
+    if (!this.voiceCapture) return;
+    await this.voiceCapture.stop(reason);
+  }
+
+  private async handleVoiceCaptureResult(result: VoiceCaptureResult): Promise<void> {
+    if (this.voiceStatus === "transcribing" || this.voiceStatus === "speaking") return;
+    if (!result.blob) {
+      this.voiceStatus = "idle";
+      this.voiceCapture = undefined;
+      return;
+    }
+    this.voiceStatus = "transcribing";
+    try {
+      const transcript = await this.transcribeAudio(result.blob);
+      if (transcript.trim()) {
+        await this.sendVoiceMessage(transcript);
+      }
+      this.voiceStatus = "idle";
+      this.voiceCapture = undefined;
+    } catch (error) {
+      this.voiceStatus = "error";
+      this.voiceCapture = undefined;
+      console.error("Voice transcription failed:", error);
+    }
+  }
+
+  private async transcribeAudio(blob: Blob): Promise<string> {
+    const apiKey = await this.resolveApiKey("openai");
+    if (!apiKey) {
+      throw new Error("OpenAI API key required for transcription.");
+    }
+    return transcribeOpenAI(blob, {
+      apiKey,
+      model: "whisper-1",
+      prompt: "Short voice command or question.",
+    });
+  }
+
+  private async sendVoiceMessage(input: string): Promise<void> {
+    const session = this.session;
+    if (!session) return;
+
+    // Design choice: keep the agent's full response for the UI, and generate short voice summaries separately.
+    const text = input.trim();
+    if (!text) return;
+
+    if (!(await this.ensureApiKeyForProvider(session.state.model?.provider))) {
+      return;
+    }
+
+    if (session.state.isStreaming) {
+      if (this.onBeforeSend) {
+        await this.onBeforeSend();
+      }
+      session.steer({ role: "user", content: text, timestamp: Date.now() });
+      return;
+    }
+
+    await this.sendMessage(text);
+  }
+
+  private async handleAgentEnd(): Promise<void> {
+    if (!this.enableVoice) return;
+    const session = this.session;
+    if (!session) return;
+
+    const lastAssistant = this.getLastAssistantMessage(session.state.messages);
+    if (!lastAssistant) return;
+    if (lastAssistant.timestamp <= this.voiceLastSpokenTimestamp) return;
+
+    const assistantText = this.extractAssistantText(lastAssistant);
+    if (!assistantText.trim()) return;
+
+    this.voiceLastSpokenTimestamp = lastAssistant.timestamp;
+
+    try {
+      const summary = await this.summarizeForVoice(assistantText);
+      if (summary.trim()) {
+        await this.speak(summary);
+      }
+    } catch (error) {
+      this.voiceStatus = "error";
+      console.error("Voice summary failed:", error);
+    }
+  }
+
+  private async summarizeForVoice(text: string): Promise<string> {
+    const model = this.resolveFastModel() || this.session?.state.model;
+    if (!model) {
+      return this.fallbackSummary(text);
+    }
+
+    const apiKey = await this.resolveApiKey(model.provider);
+    if (!apiKey) {
+      return this.fallbackSummary(text);
+    }
+
+    this.voiceSummaryAbort?.abort();
+    this.voiceSummaryAbort = new AbortController();
+
+    const prompt = [
+      "Summarize the assistant response for voice playback.",
+      "Keep it to 1-3 short sentences, conversational tone.",
+      "Preserve any question the assistant asked.",
+      'Keep uncertainty if present (e.g. "hmm... maybe...").',
+      "Do not use bullet points.",
+      "",
+      "Assistant response:",
+      text,
+    ].join("\n");
+
+    const result = await completeSimple(
+      model,
+      {
+        messages: [
+          {
+            role: "user",
+            content: prompt,
+            timestamp: Date.now(),
+          },
+        ],
+      },
+      {
+        apiKey,
+        maxTokens: 200,
+        temperature: 0.2,
+        signal: this.voiceSummaryAbort.signal,
+      },
+    );
+
+    const summary = this.extractAssistantText(result);
+    return summary.trim() || this.fallbackSummary(text);
+  }
+
+  private async speak(text: string): Promise<void> {
+    const apiKey = await this.resolveApiKey("openai");
+    if (!apiKey) {
+      console.error("OpenAI API key required for speech synthesis.");
+      return;
+    }
+
+    this.stopVoicePlayback();
+    this.voiceStatus = "speaking";
+
+    const audioBlob = await synthesizeOpenAI(text, {
+      apiKey,
+      model: "tts-1",
+      voice: "alloy",
+      responseFormat: "mp3",
+    });
+
+    const audioUrl = URL.createObjectURL(audioBlob);
+    const audio = new Audio(audioUrl);
+    this.voiceAudio = audio;
+
+    try {
+      await audio.play();
+      await new Promise<void>((resolve, reject) => {
+        audio.onended = () => resolve();
+        audio.onerror = () => reject(new Error("Audio playback failed"));
+      });
+    } finally {
+      URL.revokeObjectURL(audioUrl);
+      if (this.voiceAudio === audio) {
+        this.voiceAudio = undefined;
+      }
+      this.voiceStatus = "idle";
+    }
+  }
+
+  private stopVoicePlayback(): void {
+    if (this.voiceAudio) {
+      this.voiceAudio.pause();
+      this.voiceAudio.currentTime = 0;
+      this.voiceAudio = undefined;
+    }
+  }
+
+  private resolveFastModel(): Model<any> | undefined {
+    const openAiModels = getModels("openai");
+    const preferred = ["gpt-5-mini", "gpt-4o-mini", "gpt-4.1-mini"];
+    for (const id of preferred) {
+      const found = openAiModels.find((m) => m.id === id);
+      if (found) return found;
+    }
+    return openAiModels.find((m) => m.id.includes("mini")) || undefined;
+  }
+
+  private async resolveApiKey(provider: string): Promise<string | undefined> {
+    if (this.session?.getApiKey) {
+      const key = await this.session.getApiKey(provider);
+      if (key) return key;
+    }
+    const stored = await getAppStorage().providerKeys.get(provider);
+    return stored ?? undefined;
+  }
+
+  private async ensureApiKeyForProvider(provider?: string): Promise<boolean> {
+    if (!provider) return false;
+    const apiKey = await getAppStorage().providerKeys.get(provider);
+    if (apiKey) return true;
+
+    if (!this.onApiKeyRequired) {
+      console.error("No API key configured and no onApiKeyRequired handler set");
+      return false;
+    }
+
+    const success = await this.onApiKeyRequired(provider);
+    return success;
+  }
+
+  private getLastAssistantMessage(messages: readonly unknown[]): AssistantMessage | undefined {
+    for (let i = messages.length - 1; i >= 0; i -= 1) {
+      const message = messages[i];
+      if (this.isAssistantMessage(message)) {
+        return message;
+      }
+    }
+    return undefined;
+  }
+
+  private isAssistantMessage(message: unknown): message is AssistantMessage {
+    if (!message || typeof message !== "object") return false;
+    return (message as AssistantMessage).role === "assistant";
+  }
+
+  private extractAssistantText(message: AssistantMessage): string {
+    return message.content
+      .filter((part): part is TextContent => part.type === "text")
+      .map((part) => part.text)
+      .join("\n");
+  }
+
+  private fallbackSummary(text: string): string {
+    const cleaned = text.replace(/\s+/g, " ").trim();
+    const matches = cleaned.match(/[^.!?]+[.!?]+/g);
+    if (matches && matches.length > 0) {
+      return matches.slice(0, 2).join(" ").trim();
+    }
+    return cleaned.slice(0, 240);
+  }
+
   public async sendMessage(input: string, attachments?: Attachment[]) {
     if ((!input.trim() && attachments?.length === 0) || this.session?.state.isStreaming) return;
     const session = this.session;
@@ -320,8 +629,9 @@ export class AgentInterface extends LitElement {
 
     return html`
       <div class="text-xs text-muted-foreground flex justify-between items-center h-5">
-        <div class="flex items-center gap-
+        <div class="flex items-center gap-2">
           ${this.showThemeToggle ? html`<theme-toggle></theme-toggle>` : html``}
+          ${this.renderVoiceStatus()}
         </div>
         <div class="flex ml-auto items-center gap-3">
           ${
@@ -338,6 +648,19 @@
     `;
   }
 
+  private renderVoiceStatus() {
+    if (!this.enableVoice || this.voiceStatus === "idle") return html``;
+    const statusText =
+      this.voiceStatus === "listening"
+        ? "Listening... (Caps Lock to stop)"
+        : this.voiceStatus === "transcribing"
+          ? "Transcribing..."
+          : this.voiceStatus === "speaking"
+            ? "Speaking..."
+            : "Voice error";
+    return html`<span>${statusText}</span>`;
+  }
+
   override render() {
     if (!this.session)
       return html`<div class="p-4 text-center text-muted-foreground">${i18n("No session set")}</div>`;
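All voice paths above resolve an OpenAI key through `resolveApiKey("openai")` with a fallback to stored provider keys, while `sendVoiceMessage` additionally gates on `ensureApiKeyForProvider`, which defers to the optional `onApiKeyRequired` callback when no key is stored. A sketch of wiring that callback so a missing key prompts the user instead of logging an error; the tag name, the root export, and the `saveProviderKey` helper are assumptions, not part of this diff:

```ts
import type { AgentInterface } from "@oh-my-pi/pi-web-ui"; // import path assumed

declare function saveProviderKey(provider: string, key: string): Promise<void>; // hypothetical storage helper

const ui = document.querySelector("agent-interface") as AgentInterface; // tag name assumed
ui.onApiKeyRequired = async (provider) => {
  // Ask the user for a key and persist it so ensureApiKeyForProvider succeeds on retry.
  const key = window.prompt(`Enter an API key for ${provider}:`);
  if (!key) return false;
  await saveProviderKey(provider, key);
  return true;
};
```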
package/src/utils/voice.ts
ADDED
@@ -0,0 +1,272 @@
+export type VoiceStopReason = "manual" | "silence" | "max" | "error";
+
+export interface VoiceCaptureOptions {
+  silenceMs: number;
+  maxDurationMs: number;
+  minSpeechMs: number;
+  levelThreshold: number;
+  chunkMs: number;
+  preferredMimeType?: string;
+}
+
+export interface VoiceCaptureResult {
+  blob: Blob | null;
+  reason: VoiceStopReason;
+  durationMs: number;
+  mimeType: string | null;
+}
+
+const DEFAULT_CAPTURE_OPTIONS: VoiceCaptureOptions = {
+  silenceMs: 1200,
+  maxDurationMs: 20000,
+  minSpeechMs: 500,
+  levelThreshold: 0.02,
+  chunkMs: 250,
+  preferredMimeType: "audio/webm;codecs=opus",
+};
+
+export class VoiceCapture {
+  private options: VoiceCaptureOptions;
+  private stream?: MediaStream;
+  private recorder?: MediaRecorder;
+  private audioContext?: AudioContext;
+  private analyser?: AnalyserNode;
+  private data?: Float32Array<ArrayBuffer>;
+  private chunks: Blob[] = [];
+  private startedAt = 0;
+  private speechDetectedAt?: number;
+  private silenceStartedAt?: number;
+  private stopResolver?: (result: VoiceCaptureResult) => void;
+  private stopPromise?: Promise<VoiceCaptureResult>;
+  private stopReason: VoiceStopReason = "manual";
+  private monitorTimer?: number;
+  private isStopping = false;
+  private _isRecording = false;
+
+  public get isRecording(): boolean {
+    return this._isRecording;
+  }
+
+  constructor(options?: Partial<VoiceCaptureOptions>) {
+    this.options = { ...DEFAULT_CAPTURE_OPTIONS, ...options };
+  }
+
+  async start(): Promise<VoiceCaptureResult> {
+    if (this._isRecording && this.stopPromise) return this.stopPromise;
+    if (!navigator.mediaDevices?.getUserMedia) {
+      throw new Error("Media devices not available.");
+    }
+    if (typeof MediaRecorder === "undefined") {
+      throw new Error("MediaRecorder not supported.");
+    }
+    this.stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+
+    const mimeType = pickSupportedMimeType(this.options.preferredMimeType);
+    this.recorder = mimeType ? new MediaRecorder(this.stream, { mimeType }) : new MediaRecorder(this.stream);
+
+    this.chunks = [];
+    this.startedAt = performance.now();
+    this.speechDetectedAt = undefined;
+    this.silenceStartedAt = undefined;
+    this.stopReason = "manual";
+    this.isStopping = false;
+    this._isRecording = true;
+    this.stopPromise = new Promise((resolve) => {
+      this.stopResolver = resolve;
+    });
+
+    this.recorder.ondataavailable = (event: BlobEvent) => {
+      if (event.data && event.data.size > 0) {
+        this.chunks.push(event.data);
+      }
+    };
+    this.recorder.onstop = () => {
+      this.finish();
+    };
+
+    this.recorder.start(this.options.chunkMs);
+    this.setupAnalyzer();
+    this.startMonitor();
+
+    return this.stopPromise;
+  }
+
+  async stop(reason: VoiceStopReason = "manual"): Promise<VoiceCaptureResult> {
+    if (!this.recorder || !this._isRecording) {
+      return { blob: null, reason, durationMs: 0, mimeType: null };
+    }
+    if (this.isStopping && this.stopPromise) {
+      return this.stopPromise;
+    }
+    this.isStopping = true;
+    this.stopReason = reason;
+    this.recorder.stop();
+    return this.stopPromise!;
+  }
+
+  private finish(): void {
+    this._isRecording = false;
+    const durationMs = Math.max(0, performance.now() - this.startedAt);
+    const mimeType = this.recorder?.mimeType || null;
+    const hasSpeech = this.speechDetectedAt !== undefined;
+    const blob =
+      hasSpeech && this.chunks.length > 0 ? new Blob(this.chunks, { type: mimeType || "audio/webm" }) : null;
+    const result: VoiceCaptureResult = {
+      blob,
+      reason: this.stopReason,
+      durationMs,
+      mimeType,
+    };
+    this.cleanup();
+    this.stopResolver?.(result);
+  }
+
+  private startMonitor(): void {
+    this.monitorTimer = window.setInterval(() => {
+      if (!this.analyser || !this.data || this.isStopping) return;
+
+      this.analyser.getFloatTimeDomainData(this.data);
+      const rms = getRms(this.data);
+      const now = performance.now();
+
+      if (rms >= this.options.levelThreshold) {
+        if (!this.speechDetectedAt) {
+          this.speechDetectedAt = now;
+        }
+        this.silenceStartedAt = undefined;
+      } else if (this.speechDetectedAt) {
+        if (!this.silenceStartedAt) {
+          this.silenceStartedAt = now;
+        } else if (
+          now - this.silenceStartedAt >= this.options.silenceMs &&
+          now - this.speechDetectedAt >= this.options.minSpeechMs
+        ) {
+          void this.stop("silence");
+          return;
+        }
+      }
+
+      if (now - this.startedAt >= this.options.maxDurationMs) {
+        void this.stop("max");
+      }
+    }, 100);
+  }
+
+  private setupAnalyzer(): void {
+    if (!this.stream) return;
+    this.audioContext = new AudioContext();
+    this.analyser = this.audioContext.createAnalyser();
+    this.analyser.fftSize = 2048;
+    this.data = new Float32Array(this.analyser.fftSize);
+
+    const source = this.audioContext.createMediaStreamSource(this.stream);
+    source.connect(this.analyser);
+  }
+
+  private cleanup(): void {
+    if (this.monitorTimer !== undefined) {
+      clearInterval(this.monitorTimer);
+      this.monitorTimer = undefined;
+    }
+    if (this.stream) {
+      for (const track of this.stream.getTracks()) {
+        track.stop();
+      }
+      this.stream = undefined;
+    }
+    if (this.audioContext) {
+      void this.audioContext.close();
+      this.audioContext = undefined;
+    }
+    this.analyser = undefined;
+    this.data = undefined;
+    this.recorder = undefined;
+    this.chunks = [];
+    this.isStopping = false;
+  }
+}
+
+export interface TranscribeOptions {
+  apiKey: string;
+  model?: string;
+  language?: string;
+  prompt?: string;
+}
+
+export async function transcribeOpenAI(blob: Blob, options: TranscribeOptions): Promise<string> {
+  const formData = new FormData();
+  const filename = blob.type.includes("ogg") ? "voice.ogg" : "voice.webm";
+  const file = new File([blob], filename, { type: blob.type || "audio/webm" });
+  formData.append("file", file);
+  formData.append("model", options.model || "whisper-1");
+  if (options.language) formData.append("language", options.language);
+  if (options.prompt) formData.append("prompt", options.prompt);
+
+  const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${options.apiKey}`,
+    },
+    body: formData,
+  });
+
+  if (!response.ok) {
+    throw new Error(`Transcription failed (${response.status})`);
+  }
+
+  const data = (await response.json()) as { text?: string };
+  return data.text?.trim() || "";
+}
+
+export interface SpeechOptions {
+  apiKey: string;
+  model?: string;
+  voice?: string;
+  responseFormat?: "mp3" | "wav" | "opus";
+  speed?: number;
+}
+
+export async function synthesizeOpenAI(text: string, options: SpeechOptions): Promise<Blob> {
+  const response = await fetch("https://api.openai.com/v1/audio/speech", {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${options.apiKey}`,
+      "Content-Type": "application/json",
+    },
+    body: JSON.stringify({
+      model: options.model || "tts-1",
+      voice: options.voice || "alloy",
+      input: text,
+      response_format: options.responseFormat || "mp3",
+      speed: options.speed,
+    }),
+  });
+
+  if (!response.ok) {
+    throw new Error(`Speech synthesis failed (${response.status})`);
+  }
+
+  return response.blob();
+}
+
+function pickSupportedMimeType(preferred?: string): string | undefined {
+  const candidates = [preferred, "audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/ogg"].filter(
+    (value): value is string => typeof value === "string",
+  );
+
+  for (const candidate of candidates) {
+    if (MediaRecorder.isTypeSupported(candidate)) {
+      return candidate;
+    }
+  }
+  return undefined;
+}
+
+function getRms(data: Float32Array): number {
+  let sum = 0;
+  for (let i = 0; i < data.length; i += 1) {
+    const sample = data[i] ?? 0;
+    sum += sample * sample;
+  }
+  return Math.sqrt(sum / data.length);
+}
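Because `voice.ts` has no component dependencies, the capture/transcribe/synthesize pipeline also works standalone. A minimal round-trip sketch under the same assumptions the component makes (browser context, OpenAI API key available); the deep import path follows the package's shipped-TypeScript layout and is an assumption:

```ts
import { VoiceCapture, synthesizeOpenAI, transcribeOpenAI } from "@oh-my-pi/pi-web-ui/src/utils/voice";

declare const OPENAI_KEY: string; // hypothetical key source; supply your own

async function voiceRoundTrip(): Promise<void> {
  // start() resolves when recording ends: via stop(), after post-speech
  // silence (silenceMs), or at the maxDurationMs cap.
  const capture = new VoiceCapture({ silenceMs: 1500 });
  const result = await capture.start();
  if (!result.blob) return; // nothing above levelThreshold was heard

  const text = await transcribeOpenAI(result.blob, { apiKey: OPENAI_KEY });
  console.log(`Heard ${Math.round(result.durationMs)}ms of audio:`, text);

  // Echo the transcript back through OpenAI TTS and play it.
  const speech = await synthesizeOpenAI(text, { apiKey: OPENAI_KEY, voice: "alloy" });
  await new Audio(URL.createObjectURL(speech)).play();
}
```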