@oh-my-pi/pi-web-ui 3.15.1 → 3.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -2,6 +2,15 @@
2
2
 
3
3
  ## [Unreleased]
4
4
 
5
+ ## [3.20.0] - 2026-01-06
6
+ ### Added
7
+
8
+ - Added voice input and output support using OpenAI Whisper for transcription and TTS for speech synthesis
9
+ - Added `enableVoice` property to toggle voice features in the agent interface
10
+ - Added Caps Lock key as push-to-talk trigger for voice capture
11
+ - Added automatic voice summarization and playback of assistant responses
12
+ - Added silence detection to automatically stop voice recording after speech ends
13
+
5
14
  ## [3.15.1] - 2026-01-05
6
15
 
7
16
  ## [3.15.0] - 2026-01-05
@@ -148,4 +157,4 @@ declare module "@oh-my-pi/pi-agent-core" {
148
157
  "my-message": MyMessage;
149
158
  }
150
159
  }
151
- ```
160
+ ```
package/README.md CHANGED
@@ -209,7 +209,8 @@ agent.abort();
209
209
  agent.setModel(newModel);
210
210
  agent.setThinkingLevel('medium');
211
211
  agent.setTools([...]);
212
- agent.queueMessage(customMessage);
212
+ agent.steer(customMessage);
213
+ agent.followUp(customMessage);
213
214
  ```
214
215
 
215
216
  ## Message Types
@@ -343,11 +343,11 @@ const renderApp = () => {
343
343
  size: "sm",
344
344
  children: icon(Bell, "sm"),
345
345
  onClick: () => {
346
- // Demo: Inject custom message (will appear on next agent run)
346
+ // Demo: Queue custom message for the next turn
347
347
  if (agent) {
348
- agent.queueMessage(
348
+ agent.followUp(
349
349
  createSystemNotification(
350
- "This is a custom message! It appears in the UI but is never sent to the LLM.",
350
+ "This is a custom message! It appears in the UI and is sent to the LLM.",
351
351
  ),
352
352
  );
353
353
  }
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "compilerOptions": {
3
3
  "target": "ES2022",
4
- "module": "ES2022",
4
+ "module": "ESNext",
5
5
  "lib": ["ES2022", "DOM", "DOM.Iterable"],
6
6
  "moduleResolution": "bundler",
7
7
  "paths": {
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@oh-my-pi/pi-web-ui",
3
- "version": "3.15.1",
3
+ "version": "3.20.0",
4
4
  "description": "Reusable web UI components for AI chat interfaces powered by @oh-my-pi/pi-ai",
5
5
  "type": "module",
6
6
  "main": "src/index.ts",
@@ -19,9 +19,9 @@
19
19
  },
20
20
  "dependencies": {
21
21
  "@lmstudio/sdk": "^1.5.0",
22
- "@oh-my-pi/pi-agent-core": "3.15.1",
23
- "@oh-my-pi/pi-ai": "3.15.1",
24
- "@oh-my-pi/pi-tui": "3.15.1",
22
+ "@oh-my-pi/pi-agent-core": "3.20.0",
23
+ "@oh-my-pi/pi-ai": "3.20.0",
24
+ "@oh-my-pi/pi-tui": "3.20.0",
25
25
  "docx-preview": "^0.3.7",
26
26
  "highlight.js": "^11.11.1",
27
27
  "jszip": "^3.10.1",
@@ -1,6 +1,15 @@
1
- import { streamSimple, type ToolResultMessage, type Usage } from "@oh-my-pi/pi-ai";
1
+ import {
2
+ type AssistantMessage,
3
+ completeSimple,
4
+ getModels,
5
+ type Model,
6
+ streamSimple,
7
+ type TextContent,
8
+ type ToolResultMessage,
9
+ type Usage,
10
+ } from "@oh-my-pi/pi-ai";
2
11
  import { html, LitElement } from "lit";
3
- import { customElement, property, query } from "lit/decorators.js";
12
+ import { customElement, property, query, state } from "lit/decorators.js";
4
13
  import { ModelSelector } from "../dialogs/ModelSelector";
5
14
  import type { MessageEditor } from "./MessageEditor";
6
15
  import "./MessageEditor.js";
@@ -13,6 +22,7 @@ import type { Attachment } from "../utils/attachment-utils";
13
22
  import { formatUsage } from "../utils/format";
14
23
  import { i18n } from "../utils/i18n";
15
24
  import { createStreamFn } from "../utils/proxy-utils";
25
+ import { synthesizeOpenAI, transcribeOpenAI, VoiceCapture, type VoiceCaptureResult } from "../utils/voice";
16
26
  import type { UserMessageWithAttachments } from "./Messages";
17
27
  import type { StreamingMessageContainer } from "./StreamingMessageContainer";
18
28
 
@@ -24,6 +34,7 @@ export class AgentInterface extends LitElement {
24
34
  @property({ type: Boolean }) enableModelSelector = true;
25
35
  @property({ type: Boolean }) enableThinkingSelector = true;
26
36
  @property({ type: Boolean }) showThemeToggle = false;
37
+ @property({ type: Boolean }) enableVoice = true;
27
38
  // Optional custom API key prompt handler - if not provided, uses default dialog
28
39
  @property({ attribute: false }) onApiKeyRequired?: (provider: string) => Promise<boolean>;
29
40
  // Optional callback called before sending a message
@@ -37,12 +48,21 @@ export class AgentInterface extends LitElement {
37
48
  @query("message-editor") private _messageEditor!: MessageEditor;
38
49
  @query("streaming-message-container") private _streamingContainer!: StreamingMessageContainer;
39
50
 
51
+ @state() private voiceStatus: "idle" | "listening" | "transcribing" | "speaking" | "error" = "idle";
52
+
40
53
  private _autoScroll = true;
41
54
  private _lastScrollTop = 0;
42
55
  private _lastClientHeight = 0;
43
56
  private _scrollContainer?: HTMLElement;
44
57
  private _resizeObserver?: ResizeObserver;
45
58
  private _unsubscribeSession?: () => void;
59
+ private voiceCapture?: VoiceCapture;
60
+ private voiceAudio?: HTMLAudioElement;
61
+ private voiceSummaryAbort?: AbortController;
62
+ private voiceLastSpokenTimestamp = 0;
63
+ private voiceKeyListener = (event: KeyboardEvent) => {
64
+ void this.handleVoiceKey(event);
65
+ };
46
66
 
47
67
  public setInput(text: string, attachments?: Attachment[]) {
48
68
  const update = () => {
@@ -104,6 +124,8 @@ export class AgentInterface extends LitElement {
104
124
 
105
125
  // Subscribe to external session if provided
106
126
  this.setupSessionSubscription();
127
+
128
+ window.addEventListener("keydown", this.voiceKeyListener);
107
129
  }
108
130
 
109
131
  override disconnectedCallback() {
@@ -123,6 +145,11 @@ export class AgentInterface extends LitElement {
123
145
  this._unsubscribeSession();
124
146
  this._unsubscribeSession = undefined;
125
147
  }
148
+
149
+ window.removeEventListener("keydown", this.voiceKeyListener);
150
+ void this.stopVoiceCapture("manual");
151
+ this.stopVoicePlayback();
152
+ this.voiceSummaryAbort?.abort();
126
153
  }
127
154
 
128
155
  private setupSessionSubscription() {
@@ -164,6 +191,7 @@ export class AgentInterface extends LitElement {
164
191
  this._streamingContainer.setMessage(null, true);
165
192
  }
166
193
  this.requestUpdate();
194
+ void this.handleAgentEnd();
167
195
  break;
168
196
  case "message_update":
169
197
  if (this._streamingContainer) {
@@ -203,6 +231,287 @@ export class AgentInterface extends LitElement {
203
231
  this._lastClientHeight = clientHeight;
204
232
  };
205
233
 
/**
 * Keyboard handler that toggles voice capture with the Caps Lock key.
 *
 * Does nothing when voice is disabled, when the key is not Caps Lock, or when
 * the event is an auto-repeat. While a transcription or playback is in
 * progress the key press is consumed but otherwise ignored.
 *
 * @param event - The `keydown` event to inspect.
 */
private async handleVoiceKey(event: KeyboardEvent): Promise<void> {
	const isCapsLock = event.code === "CapsLock" || event.key === "CapsLock";
	if (!this.enableVoice || !isCapsLock || event.repeat) return;
	event.preventDefault();

	const busy = this.voiceStatus === "transcribing" || this.voiceStatus === "speaking";
	if (busy) return;

	// Same key starts and stops: an active recording is stopped, otherwise one is started.
	const recording = this.voiceCapture?.isRecording;
	await (recording ? this.stopVoiceCapture("manual") : this.startVoiceCapture());
}
250
+
/**
 * Starts a new microphone capture and routes its eventual result to
 * `handleVoiceCaptureResult`.
 *
 * Requires an OpenAI API key (resolved through `resolveApiKey`); without one
 * the status becomes "error" and nothing is recorded. The returned promise
 * settles once the capture has been kicked off, not when recording ends.
 */
private async startVoiceCapture(): Promise<void> {
	// `isRecording` only turns true after the capture has obtained the microphone,
	// so "listening" is also treated as "a capture is already on its way".
	// Without this, a second key press during start-up spawned a second capture
	// and orphaned the first one.
	const captureActive = (): boolean =>
		this.voiceCapture?.isRecording === true || this.voiceStatus === "listening";

	if (!this.enableVoice || captureActive()) return;

	const apiKey = await this.resolveApiKey("openai");
	if (!apiKey) {
		this.voiceStatus = "error";
		console.error("Voice capture requires an OpenAI API key.");
		return;
	}

	// Re-check after the await: another call may have started a capture meanwhile.
	if (captureActive()) return;

	this.voiceStatus = "listening";
	const capture = new VoiceCapture();
	this.voiceCapture = capture;

	void capture
		.start()
		.then((result) => this.handleVoiceCaptureResult(result))
		.catch((error) => {
			this.voiceStatus = "error";
			// Only drop the reference if it still points at the capture that failed.
			if (this.voiceCapture === capture) {
				this.voiceCapture = undefined;
			}
			console.error("Failed to start voice capture:", error);
		});
}
271
+
/**
 * Asks the current capture, if there is one, to stop recording.
 *
 * @param reason - Why the capture is being stopped; forwarded to `VoiceCapture.stop`.
 */
private async stopVoiceCapture(reason: "manual" | "silence" | "max" | "error"): Promise<void> {
	await this.voiceCapture?.stop(reason);
}
276
+
/**
 * Consumes the outcome of a finished capture: transcribes the audio and, when
 * the transcript is non-empty, sends it as a user message.
 *
 * The result is ignored while another transcription or playback is in
 * progress. A result without audio simply resets the status to "idle".
 *
 * @param result - The capture result delivered by `VoiceCapture`.
 */
private async handleVoiceCaptureResult(result: VoiceCaptureResult): Promise<void> {
	const busy = this.voiceStatus === "transcribing" || this.voiceStatus === "speaking";
	if (busy) return;

	const recording = result.blob;
	if (!recording) {
		this.voiceStatus = "idle";
		this.voiceCapture = undefined;
		return;
	}

	this.voiceStatus = "transcribing";
	try {
		const transcript = await this.transcribeAudio(recording);
		if (transcript.trim()) {
			await this.sendVoiceMessage(transcript);
		}
		this.voiceStatus = "idle";
	} catch (error) {
		this.voiceStatus = "error";
		console.error("Voice transcription failed:", error);
	} finally {
		// The capture is finished either way; release it.
		this.voiceCapture = undefined;
	}
}
298
+
/**
 * Transcribes recorded audio with the OpenAI `whisper-1` model.
 *
 * @param blob - The recorded audio.
 * @returns The transcript text as returned by `transcribeOpenAI`.
 * @throws Error when no OpenAI API key can be resolved.
 */
private async transcribeAudio(blob: Blob): Promise<string> {
	const apiKey = await this.resolveApiKey("openai");
	if (apiKey) {
		return transcribeOpenAI(blob, {
			apiKey,
			model: "whisper-1",
			prompt: "Short voice command or question.",
		});
	}
	throw new Error("OpenAI API key required for transcription.");
}
310
+
/**
 * Delivers a voice transcript to the session as a user message.
 *
 * Design choice: the agent's full response is kept for the UI; short voice
 * summaries are generated separately.
 *
 * When the session is currently streaming, the text is injected with
 * `session.steer`; otherwise it goes through the regular `sendMessage` path.
 *
 * @param input - The transcript; surrounding whitespace is removed before sending.
 */
private async sendVoiceMessage(input: string): Promise<void> {
	const activeSession = this.session;
	const spoken = input.trim();
	if (!activeSession || !spoken) return;

	const hasKey = await this.ensureApiKeyForProvider(activeSession.state.model?.provider);
	if (!hasKey) return;

	if (!activeSession.state.isStreaming) {
		await this.sendMessage(spoken);
		return;
	}

	// Streaming: run the pre-send hook ourselves, then steer the running turn.
	if (this.onBeforeSend) {
		await this.onBeforeSend();
	}
	activeSession.steer({ role: "user", content: spoken, timestamp: Date.now() });
}
333
+
/**
 * Speaks a short summary of the most recent assistant message.
 *
 * Skips messages that were already handled, using the message timestamp as
 * the marker, and messages without any text content. Failures while
 * summarizing or speaking set the voice status to "error" and are logged.
 */
private async handleAgentEnd(): Promise<void> {
	const session = this.session;
	if (!this.enableVoice || !session) return;

	const latest = this.getLastAssistantMessage(session.state.messages);
	if (!latest || latest.timestamp <= this.voiceLastSpokenTimestamp) return;

	const spokenSource = this.extractAssistantText(latest);
	if (!spokenSource.trim()) return;

	// Record the timestamp before the async work so the same message is not picked up twice.
	this.voiceLastSpokenTimestamp = latest.timestamp;

	try {
		const summary = await this.summarizeForVoice(spokenSource);
		if (!summary.trim()) return;
		await this.speak(summary);
	} catch (error) {
		this.voiceStatus = "error";
		console.error("Voice summary failed:", error);
	}
}
358
+
/**
 * Produces a short, speakable summary of an assistant response.
 *
 * Uses the model from `resolveFastModel()` when available, otherwise the
 * session's current model. If no model or no API key for it can be found, or
 * the model returns no text, `fallbackSummary` is used instead. Any summary
 * request still in flight is aborted before a new one starts.
 *
 * @param text - The assistant response to summarize.
 * @returns The summary text.
 */
private async summarizeForVoice(text: string): Promise<string> {
	const model = this.resolveFastModel() || this.session?.state.model;
	const apiKey = model ? await this.resolveApiKey(model.provider) : undefined;
	if (!model || !apiKey) {
		return this.fallbackSummary(text);
	}

	this.voiceSummaryAbort?.abort();
	const controller = new AbortController();
	this.voiceSummaryAbort = controller;

	const instructions = [
		"Summarize the assistant response for voice playback.",
		"Keep it to 1-3 short sentences, conversational tone.",
		"Preserve any question the assistant asked.",
		'Keep uncertainty if present (e.g. "hmm... maybe...").',
		"Do not use bullet points.",
	];
	const prompt = `${instructions.join("\n")}\n\nAssistant response:\n${text}`;

	const context = {
		messages: [{ role: "user" as const, content: prompt, timestamp: Date.now() }],
	};
	const requestOptions = {
		apiKey,
		maxTokens: 200,
		temperature: 0.2,
		signal: controller.signal,
	};
	const result = await completeSimple(model, context, requestOptions);

	const summary = this.extractAssistantText(result).trim();
	return summary || this.fallbackSummary(text);
}
406
+
/**
 * Synthesizes `text` with OpenAI TTS (`tts-1`, voice "alloy", mp3) and plays it.
 *
 * Resolves when playback ends, is paused (for example by
 * `stopVoicePlayback`), or is never started because no OpenAI API key is
 * available. Rejects when synthesis or playback fails.
 *
 * @param text - The text to speak.
 */
private async speak(text: string): Promise<void> {
	const apiKey = await this.resolveApiKey("openai");
	if (!apiKey) {
		console.error("OpenAI API key required for speech synthesis.");
		return;
	}

	this.stopVoicePlayback();
	this.voiceStatus = "speaking";

	let audioUrl: string | undefined;
	let audio: HTMLAudioElement | undefined;
	try {
		const audioBlob = await synthesizeOpenAI(text, {
			apiKey,
			model: "tts-1",
			voice: "alloy",
			responseFormat: "mp3",
		});

		audioUrl = URL.createObjectURL(audioBlob);
		const player = new Audio(audioUrl);
		audio = player;

		// Attach the listeners BEFORE play() so an early "error"/"ended" is not missed.
		// "pause" also settles the wait: stopVoicePlayback() pauses the element, and
		// without this the promise would never settle and the object URL would leak.
		const finished = new Promise<void>((resolve, reject) => {
			player.addEventListener("ended", () => resolve(), { once: true });
			player.addEventListener("pause", () => resolve(), { once: true });
			player.addEventListener("error", () => reject(new Error("Audio playback failed")), { once: true });
		});

		// Another speak() may have started playing while this one was synthesizing.
		this.stopVoicePlayback();
		this.voiceAudio = player;
		this.voiceStatus = "speaking";

		// Promise.all observes both promises, so a late rejection of either is never unhandled.
		await Promise.all([player.play(), finished]);
	} finally {
		if (audioUrl !== undefined) {
			URL.revokeObjectURL(audioUrl);
		}
		if (audio && this.voiceAudio === audio) {
			this.voiceAudio = undefined;
		}
		// Leave the status alone when a newer clip has taken over playback.
		if (!this.voiceAudio) {
			this.voiceStatus = "idle";
		}
	}
}
442
+
/** Pauses and rewinds the clip that is currently tracked as playing, then forgets it. */
private stopVoicePlayback(): void {
	const playing = this.voiceAudio;
	if (!playing) return;

	playing.pause();
	playing.currentTime = 0;
	this.voiceAudio = undefined;
}
450
+
/**
 * Picks an OpenAI model to use for voice summaries.
 *
 * Tries a fixed preference list first, then falls back to the first OpenAI
 * model whose id contains "mini".
 *
 * @returns The chosen model, or `undefined` when none matches.
 */
private resolveFastModel(): Model<any> | undefined {
	const candidates = getModels("openai");
	const preferredIds = ["gpt-5-mini", "gpt-4o-mini", "gpt-4.1-mini"];

	for (const preferredId of preferredIds) {
		const exact = candidates.find((candidate) => candidate.id === preferredId);
		if (exact) return exact;
	}

	const anyMini = candidates.find((candidate) => candidate.id.includes("mini"));
	return anyMini || undefined;
}
460
+
/**
 * Looks up an API key for `provider`.
 *
 * The session's own `getApiKey` hook is consulted first when present; if it
 * yields nothing, the key stored in app storage is used.
 *
 * @param provider - Provider identifier, e.g. "openai".
 * @returns The key, or `undefined` when none is available.
 */
private async resolveApiKey(provider: string): Promise<string | undefined> {
	const fromSession = this.session?.getApiKey ? await this.session.getApiKey(provider) : undefined;
	if (fromSession) return fromSession;

	const fromStorage = await getAppStorage().providerKeys.get(provider);
	return fromStorage ?? undefined;
}
469
+
/**
 * Checks that an API key is stored for `provider`, asking the host
 * application for one through `onApiKeyRequired` when it is missing.
 *
 * @param provider - Provider identifier; a missing value yields `false`.
 * @returns `true` when a key is stored or the handler reports success.
 */
private async ensureApiKeyForProvider(provider?: string): Promise<boolean> {
	if (!provider) return false;

	const storedKey = await getAppStorage().providerKeys.get(provider);
	if (storedKey) return true;

	if (this.onApiKeyRequired) {
		return this.onApiKeyRequired(provider);
	}

	console.error("No API key configured and no onApiKeyRequired handler set");
	return false;
}
483
+
/**
 * Finds the most recent assistant message in a message list.
 *
 * @param messages - Messages in chronological order (oldest first).
 * @returns The last entry whose role is "assistant", or `undefined`.
 */
private getLastAssistantMessage(messages: readonly unknown[]): AssistantMessage | undefined {
	let index = messages.length;
	while (index-- > 0) {
		const candidate = messages[index];
		if (this.isAssistantMessage(candidate)) {
			return candidate;
		}
	}
	return undefined;
}
493
+
/**
 * Type guard: true for non-null objects whose `role` is "assistant".
 * Only the `role` field is inspected.
 */
private isAssistantMessage(message: unknown): message is AssistantMessage {
	return (
		typeof message === "object" &&
		message !== null &&
		(message as AssistantMessage).role === "assistant"
	);
}
498
+
/**
 * Collects the text parts of an assistant message, one per line.
 * Parts of any other type are skipped.
 */
private extractAssistantText(message: AssistantMessage): string {
	const pieces: string[] = [];
	for (const part of message.content) {
		if (part.type === "text") {
			pieces.push(part.text);
		}
	}
	return pieces.join("\n");
}
505
+
/**
 * Local summary used when no model is available: the first two sentences of
 * the text, or its first 240 characters when it contains no sentence-ending
 * punctuation. Whitespace runs are collapsed to single spaces first.
 */
private fallbackSummary(text: string): string {
	const normalized = text.replace(/\s+/g, " ").trim();
	const sentences = normalized.match(/[^.!?]+[.!?]+/g) ?? [];
	if (sentences.length === 0) {
		return normalized.slice(0, 240);
	}
	return sentences.slice(0, 2).join(" ").trim();
}
514
+
206
515
  public async sendMessage(input: string, attachments?: Attachment[]) {
207
516
  if ((!input.trim() && attachments?.length === 0) || this.session?.state.isStreaming) return;
208
517
  const session = this.session;
@@ -320,8 +629,9 @@ export class AgentInterface extends LitElement {
320
629
 
321
630
  return html`
322
631
  <div class="text-xs text-muted-foreground flex justify-between items-center h-5">
323
- <div class="flex items-center gap-1">
632
+ <div class="flex items-center gap-2">
324
633
  ${this.showThemeToggle ? html`<theme-toggle></theme-toggle>` : html``}
634
+ ${this.renderVoiceStatus()}
325
635
  </div>
326
636
  <div class="flex ml-auto items-center gap-3">
327
637
  ${
@@ -338,6 +648,19 @@ export class AgentInterface extends LitElement {
338
648
  `;
339
649
  }
340
650
 
/**
 * Renders the voice status label; renders nothing when voice is disabled or idle.
 */
private renderVoiceStatus() {
	const status = this.voiceStatus;
	if (!this.enableVoice || status === "idle") return html``;

	let statusText: string;
	switch (status) {
		case "listening":
			statusText = "Listening... (Caps Lock to stop)";
			break;
		case "transcribing":
			statusText = "Transcribing...";
			break;
		case "speaking":
			statusText = "Speaking...";
			break;
		default:
			statusText = "Voice error";
			break;
	}
	return html`<span>${statusText}</span>`;
}
663
+
341
664
  override render() {
342
665
  if (!this.session)
343
666
  return html`<div class="p-4 text-center text-muted-foreground">${i18n("No session set")}</div>`;
@@ -0,0 +1,272 @@
/**
 * Why a voice capture ended:
 * - "manual": stopped explicitly (also the default reason),
 * - "silence": stopped by the silence monitor after speech,
 * - "max": stopped because the maximum duration was reached,
 * - "error": stopped because of a failure.
 */
export type VoiceStopReason =
	| "manual"
	| "silence"
	| "max"
	| "error";
2
+
/** Tuning parameters for `VoiceCapture`. */
export interface VoiceCaptureOptions {
	/** How long (ms) the level must stay below the threshold, after speech, before the capture stops. */
	silenceMs: number;
	/** Hard limit (ms) on the length of one recording. */
	maxDurationMs: number;
	/** Minimum time (ms) since speech was first detected before a silence stop is allowed. */
	minSpeechMs: number;
	/** RMS level at or above which the input counts as speech. */
	levelThreshold: number;
	/** Timeslice (ms) passed to `MediaRecorder.start`. */
	chunkMs: number;
	/** MIME type tried first when choosing a recording format. */
	preferredMimeType?: string;
}
11
+
/** Outcome of one voice capture. */
export interface VoiceCaptureResult {
	/** The recorded audio, or `null` when there is nothing usable (e.g. no speech was detected). */
	blob: Blob | null;
	/** Why the capture ended. */
	reason: VoiceStopReason;
	/** Elapsed recording time in milliseconds. */
	durationMs: number;
	/** MIME type reported by the recorder, or `null` when unknown. */
	mimeType: string | null;
}
18
+
/** Capture settings used for every option the caller does not override. */
const DEFAULT_CAPTURE_OPTIONS: VoiceCaptureOptions = {
	// Stop after 1.2 s of quiet, but never record longer than 20 s.
	silenceMs: 1_200,
	maxDurationMs: 20_000,
	// At least 0.5 s must have passed since speech began before silence can end the capture.
	minSpeechMs: 500,
	levelThreshold: 0.02,
	chunkMs: 250,
	preferredMimeType: "audio/webm;codecs=opus",
};
27
+
/**
 * Records microphone audio with `MediaRecorder` and ends the recording
 * automatically after a pause in speech or once a maximum duration is reached.
 *
 * `start()` returns a promise that resolves with the capture result when the
 * recording has ended; `stop()` ends it early. Input level is measured with a
 * Web Audio `AnalyserNode` polled every 100 ms.
 */
export class VoiceCapture {
	private options: VoiceCaptureOptions;
	private stream?: MediaStream;
	private recorder?: MediaRecorder;
	private audioContext?: AudioContext;
	private analyser?: AnalyserNode;
	private data?: Float32Array<ArrayBuffer>;
	private chunks: Blob[] = [];
	private startedAt = 0;
	private speechDetectedAt?: number;
	private silenceStartedAt?: number;
	private stopResolver?: (result: VoiceCaptureResult) => void;
	private stopPromise?: Promise<VoiceCaptureResult>;
	private stopReason: VoiceStopReason = "manual";
	private monitorTimer?: number;
	private isStopping = false;
	private _isRecording = false;

	/** True from the moment the recorder has started until its result is delivered. */
	public get isRecording(): boolean {
		return this._isRecording;
	}

	/**
	 * @param options - Overrides for individual defaults in `DEFAULT_CAPTURE_OPTIONS`.
	 */
	constructor(options?: Partial<VoiceCaptureOptions>) {
		this.options = { ...DEFAULT_CAPTURE_OPTIONS, ...options };
	}

	/**
	 * Requests the microphone and starts recording.
	 *
	 * @returns A promise for the capture result, resolved when recording ends.
	 *   Calling `start()` while already recording returns the pending result.
	 * @throws Error when media devices or `MediaRecorder` are unavailable, and
	 *   whatever `getUserMedia` or the recorder setup throws.
	 */
	async start(): Promise<VoiceCaptureResult> {
		if (this._isRecording && this.stopPromise) return this.stopPromise;
		if (!navigator.mediaDevices?.getUserMedia) {
			throw new Error("Media devices not available.");
		}
		if (typeof MediaRecorder === "undefined") {
			throw new Error("MediaRecorder not supported.");
		}
		const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
		this.stream = stream;

		try {
			const mimeType = pickSupportedMimeType(this.options.preferredMimeType);
			const recorder = mimeType ? new MediaRecorder(stream, { mimeType }) : new MediaRecorder(stream);
			this.recorder = recorder;

			this.chunks = [];
			this.startedAt = performance.now();
			this.speechDetectedAt = undefined;
			this.silenceStartedAt = undefined;
			this.stopReason = "manual";
			this.isStopping = false;
			const pending = new Promise<VoiceCaptureResult>((resolve) => {
				this.stopResolver = resolve;
			});
			this.stopPromise = pending;

			recorder.ondataavailable = (event: BlobEvent) => {
				if (event.data && event.data.size > 0) {
					this.chunks.push(event.data);
				}
			};
			// Record the failure as the stop reason; the result itself is delivered from
			// `onstop`, which the recorder is expected to fire after an error.
			recorder.onerror = () => {
				this.stopReason = "error";
			};
			recorder.onstop = () => {
				this.finish();
			};

			recorder.start(this.options.chunkMs);
			this._isRecording = true;
			this.setupAnalyzer();
			this.startMonitor();

			return pending;
		} catch (error) {
			// Setup failed after the microphone was acquired: release it instead of
			// leaving the tracks (and the browser's recording indicator) active.
			this.discardRecorder();
			this._isRecording = false;
			this.stopPromise = undefined;
			this.stopResolver = undefined;
			this.cleanup();
			throw error;
		}
	}

	/**
	 * Ends the recording.
	 *
	 * @param reason - Reason reported in the result.
	 * @returns The capture result; a result with a `null` blob when nothing is being recorded.
	 */
	async stop(reason: VoiceStopReason = "manual"): Promise<VoiceCaptureResult> {
		const pending = this.stopPromise;
		if (!this.recorder || !this._isRecording || !pending) {
			return { blob: null, reason, durationMs: 0, mimeType: null };
		}
		if (this.isStopping) {
			return pending;
		}
		this.isStopping = true;
		// `MediaRecorder.stop()` throws on an inactive recorder. If the recorder has
		// already stopped on its own, its `onstop` is still to come and will deliver
		// the result, so there is nothing to do but wait.
		if (this.recorder.state !== "inactive") {
			this.stopReason = reason;
			this.recorder.stop();
		}
		return pending;
	}

	/** Builds the result from the collected chunks, releases resources and resolves the pending promise. */
	private finish(): void {
		this._isRecording = false;
		const durationMs = Math.max(0, performance.now() - this.startedAt);
		const mimeType = this.recorder?.mimeType || null;
		// Audio in which the monitor never detected speech is discarded.
		const hasSpeech = this.speechDetectedAt !== undefined;
		const blob =
			hasSpeech && this.chunks.length > 0 ? new Blob(this.chunks, { type: mimeType || "audio/webm" }) : null;
		const result: VoiceCaptureResult = {
			blob,
			reason: this.stopReason,
			durationMs,
			mimeType,
		};
		this.cleanup();
		this.stopResolver?.(result);
	}

	/** Polls the input level every 100 ms and stops the capture on silence or when the maximum duration is hit. */
	private startMonitor(): void {
		this.monitorTimer = window.setInterval(() => {
			if (!this.analyser || !this.data || this.isStopping) return;

			this.analyser.getFloatTimeDomainData(this.data);
			const rms = getRms(this.data);
			const now = performance.now();

			if (rms >= this.options.levelThreshold) {
				if (this.speechDetectedAt === undefined) {
					this.speechDetectedAt = now;
				}
				this.silenceStartedAt = undefined;
			} else if (this.speechDetectedAt !== undefined) {
				if (this.silenceStartedAt === undefined) {
					this.silenceStartedAt = now;
				} else if (
					now - this.silenceStartedAt >= this.options.silenceMs &&
					now - this.speechDetectedAt >= this.options.minSpeechMs
				) {
					void this.stop("silence");
					return;
				}
			}

			if (now - this.startedAt >= this.options.maxDurationMs) {
				void this.stop("max");
			}
		}, 100);
	}

	/** Connects the microphone stream to an analyser used for level measurement. */
	private setupAnalyzer(): void {
		if (!this.stream) return;
		const audioContext = new AudioContext();
		this.audioContext = audioContext;
		// A context that starts out suspended produces no samples, so speech would
		// never be detected. Ask it to run; a refusal is not fatal.
		if (audioContext.state === "suspended") {
			void audioContext.resume().catch(() => undefined);
		}
		this.analyser = audioContext.createAnalyser();
		this.analyser.fftSize = 2048;
		this.data = new Float32Array(this.analyser.fftSize);

		const source = audioContext.createMediaStreamSource(this.stream);
		source.connect(this.analyser);
	}

	/** Detaches the recorder's handlers and stops it if it is still running (used when setup fails). */
	private discardRecorder(): void {
		const recorder = this.recorder;
		if (!recorder) return;
		recorder.ondataavailable = null;
		recorder.onerror = null;
		recorder.onstop = null;
		if (recorder.state !== "inactive") {
			recorder.stop();
		}
	}

	/** Stops the monitor, the microphone tracks and the audio context, and clears per-recording state. */
	private cleanup(): void {
		if (this.monitorTimer !== undefined) {
			clearInterval(this.monitorTimer);
			this.monitorTimer = undefined;
		}
		if (this.stream) {
			for (const track of this.stream.getTracks()) {
				track.stop();
			}
			this.stream = undefined;
		}
		if (this.audioContext) {
			void this.audioContext.close();
			this.audioContext = undefined;
		}
		this.analyser = undefined;
		this.data = undefined;
		this.recorder = undefined;
		this.chunks = [];
		this.isStopping = false;
	}
}
188
+
/** Options for `transcribeOpenAI`. */
export interface TranscribeOptions {
	/** OpenAI API key, sent as a bearer token. */
	apiKey: string;
	/** Transcription model; "whisper-1" when omitted. */
	model?: string;
	/** Optional `language` form field; omitted from the request when not set. */
	language?: string;
	/** Optional `prompt` form field; omitted from the request when not set. */
	prompt?: string;
}
195
+
/**
 * Sends recorded audio to the OpenAI transcription endpoint.
 *
 * The upload's filename extension is derived from the blob's MIME type, so a
 * recording that is not webm or ogg (for example `audio/mp4`) is not
 * mislabelled as `voice.webm`.
 *
 * @param blob - The recorded audio.
 * @param options - API key plus optional model, language and prompt.
 * @returns The transcript with surrounding whitespace removed ("" when the response has no text).
 * @throws Error when the HTTP response is not OK.
 */
export async function transcribeOpenAI(blob: Blob, options: TranscribeOptions): Promise<string> {
	const mimeType = blob.type || "audio/webm";
	const file = new File([blob], `voice.${audioFileExtension(mimeType)}`, { type: mimeType });

	const formData = new FormData();
	formData.append("file", file);
	formData.append("model", options.model || "whisper-1");
	if (options.language) formData.append("language", options.language);
	if (options.prompt) formData.append("prompt", options.prompt);

	const response = await fetch("https://api.openai.com/v1/audio/transcriptions", {
		method: "POST",
		headers: {
			Authorization: `Bearer ${options.apiKey}`,
		},
		body: formData,
	});

	if (!response.ok) {
		throw new Error(`Transcription failed (${response.status})`);
	}

	const data = (await response.json()) as { text?: string };
	return data.text?.trim() || "";
}

/** Maps an audio MIME type (parameters such as `;codecs=` are ignored) to a filename extension; "webm" when unknown. */
function audioFileExtension(mimeType: string): string {
	const baseType = (mimeType.split(";")[0] ?? "").trim().toLowerCase();
	const knownExtensions: ReadonlyArray<readonly [fragment: string, extension: string]> = [
		["ogg", "ogg"],
		["mp4", "mp4"],
		["m4a", "m4a"],
		["mpeg", "mp3"],
		["mp3", "mp3"],
		["wav", "wav"],
		["flac", "flac"],
	];
	for (const [fragment, extension] of knownExtensions) {
		if (baseType.includes(fragment)) return extension;
	}
	return "webm";
}
220
+
/** Options for `synthesizeOpenAI`. */
export interface SpeechOptions {
	/** OpenAI API key, sent as a bearer token. */
	apiKey: string;
	/** Speech model; "tts-1" when omitted. */
	model?: string;
	/** Voice name; "alloy" when omitted. */
	voice?: string;
	/** Audio format of the response; "mp3" when omitted. */
	responseFormat?: "mp3" | "wav" | "opus";
	/** Sent as the request's `speed` field; left out of the request when not set. */
	speed?: number;
}
228
+
/**
 * Requests synthesized speech for `text` from the OpenAI speech endpoint.
 *
 * @param text - The text to speak.
 * @param options - API key plus optional model, voice, format and speed.
 * @returns The audio returned by the service.
 * @throws Error when the HTTP response is not OK.
 */
export async function synthesizeOpenAI(text: string, options: SpeechOptions): Promise<Blob> {
	const endpoint = "https://api.openai.com/v1/audio/speech";
	const payload = {
		model: options.model || "tts-1",
		voice: options.voice || "alloy",
		input: text,
		response_format: options.responseFormat || "mp3",
		// An undefined speed is dropped by JSON.stringify.
		speed: options.speed,
	};
	const requestHeaders = {
		Authorization: `Bearer ${options.apiKey}`,
		"Content-Type": "application/json",
	};

	const response = await fetch(endpoint, {
		method: "POST",
		headers: requestHeaders,
		body: JSON.stringify(payload),
	});

	if (response.ok) {
		return response.blob();
	}
	throw new Error(`Speech synthesis failed (${response.status})`);
}
251
+
/**
 * Chooses the first recording MIME type that `MediaRecorder` reports as supported.
 *
 * @param preferred - Type to try before the built-in webm/ogg candidates.
 * @returns The supported type, or `undefined` when none of the candidates is supported.
 */
function pickSupportedMimeType(preferred?: string): string | undefined {
	const builtIn = ["audio/webm;codecs=opus", "audio/webm", "audio/ogg;codecs=opus", "audio/ogg"];
	const ordered = typeof preferred === "string" ? [preferred, ...builtIn] : builtIn;
	return ordered.find((mimeType) => MediaRecorder.isTypeSupported(mimeType));
}
264
+
/**
 * Root-mean-square level of a block of audio samples.
 *
 * @param data - The samples to measure.
 * @returns The RMS of the samples; 0 for an empty buffer (instead of NaN from 0/0).
 */
function getRms(data: Float32Array): number {
	if (data.length === 0) return 0;

	let sumOfSquares = 0;
	for (const sample of data) {
		sumOfSquares += sample * sample;
	}
	return Math.sqrt(sumOfSquares / data.length);
}