opencode-voice 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/index.ts ADDED
@@ -0,0 +1,177 @@
1
+ /* biome-ignore-all assist/source/organizeImports: keep runtime and type imports grouped for clarity */
2
+ import { createSignal } from "solid-js";
3
+ import type { PluginOptions } from "@opencode-ai/plugin";
4
+ import type {
5
+ TuiPlugin,
6
+ TuiPluginApi,
7
+ TuiPluginMeta,
8
+ TuiPluginModule,
9
+ TuiSlotPlugin,
10
+ } from "@opencode-ai/plugin/tui";
11
+
12
+ import { loadVoiceConfig, resolveApiKey } from "./config";
13
+ import { detectRecordingTool, getInstallInstructions } from "./audio/detector";
14
+ import { Recorder } from "./audio/recorder";
15
+ import { createProvider } from "./providers/factory";
16
+ import { VoiceSession } from "./session";
17
+ import { RecordingIndicator } from "./ui/recording-indicator";
18
+ import type { TranscriptionEvent, VoiceSessionState } from "./types";
19
+
20
/** Key chord bound to the voice toggle command. */
const VOICE_COMMAND_KEYBIND = "ctrl+shift+v";

/** Value under which the voice toggle command is registered. */
const VOICE_COMMAND_VALUE = "voice-toggle";

/**
 * Everything the plugin pulls in from the rest of the package, gathered in
 * one injectable bag so `createTuiPlugin` can be handed substitutes.
 */
interface PluginDeps {
  // Configuration
  loadVoiceConfig: typeof loadVoiceConfig;
  resolveApiKey: typeof resolveApiKey;
  // Audio capture
  detectRecordingTool: typeof detectRecordingTool;
  getInstallInstructions: typeof getInstallInstructions;
  Recorder: typeof Recorder;
  // Transcription
  createProvider: typeof createProvider;
  VoiceSession: typeof VoiceSession;
  // UI
  RecordingIndicator: typeof RecordingIndicator;
}

/** The real implementations, used when `createTuiPlugin` is called without arguments. */
const defaultDeps: PluginDeps = {
  // Configuration
  loadVoiceConfig,
  resolveApiKey,
  // Audio capture
  detectRecordingTool,
  getInstallInstructions,
  Recorder,
  // Transcription
  createProvider,
  VoiceSession,
  // UI
  RecordingIndicator,
};
44
+
45
/**
 * Sends a transcript to the TUI prompt via `api.client.tui.appendPrompt`.
 *
 * A single space is added after the text so that consecutive transcripts do
 * not run together.
 *
 * @param api - Plugin API whose client performs the append.
 * @param text - Transcribed text to append.
 * @returns A promise that settles once the append call has completed.
 */
async function appendTranscript(
  api: TuiPluginApi,
  text: string,
): Promise<void> {
  const withTrailingSpace = `${text} `;
  await api.client.tui.appendPrompt({ text: withTrailingSpace });
}
51
+
52
/**
 * Builds the TUI plugin that adds toggleable voice-to-text input.
 *
 * When the returned plugin runs it:
 * 1. loads the voice configuration from the plugin options,
 * 2. looks for a recording tool and stops with an error toast if none exists,
 * 3. creates the recorder, transcription provider and voice session,
 * 4. registers the hidden "Toggle Voice Recording" command and the
 *    `home_bottom` recording indicator slot,
 * 5. registers a dispose hook that unregisters the command and disposes the
 *    session.
 *
 * Every failure that happens in a fire-and-forget context (inserting a
 * transcript, toggling the session) is reported through an error toast
 * instead of being left as an unhandled promise rejection.
 *
 * @param deps - Collaborators to use; defaults to the real implementations.
 * @returns The plugin entry point expected by the TUI host.
 */
export function createTuiPlugin(deps: PluginDeps = defaultDeps): TuiPlugin {
  return async (
    api: TuiPluginApi,
    options: PluginOptions | undefined,
    _meta: TuiPluginMeta,
  ): Promise<void> => {
    // Shows an error toast; values that are not Error instances use `fallback`.
    const reportError = (
      title: string,
      error: unknown,
      fallback: string,
    ): void => {
      api.ui.toast({
        variant: "error",
        title,
        message: error instanceof Error ? error.message : fallback,
      });
    };

    let config: ReturnType<typeof loadVoiceConfig>;
    try {
      config = deps.loadVoiceConfig(options);
    } catch (error) {
      reportError("opencode-voice", error, "Configuration error");
      return;
    }

    const detected = await deps.detectRecordingTool();
    if (!detected) {
      api.ui.toast({
        variant: "error",
        title: "opencode-voice",
        message:
          "No recording tool found. Install sox to use voice input.\n" +
          deps.getInstallInstructions(process.platform),
      });
      return;
    }

    const recorder = new deps.Recorder(detected.tool, detected.path);

    // createProvider throws for an unknown provider; report it the same way
    // as any other configuration problem rather than failing plugin start-up.
    let provider: ReturnType<typeof createProvider>;
    let apiKey: ReturnType<typeof resolveApiKey>;
    try {
      provider = deps.createProvider(config);
      apiKey = deps.resolveApiKey(config.provider);
    } catch (error) {
      reportError("opencode-voice", error, "Configuration error");
      return;
    }

    const [isRecording, setIsRecording] = createSignal(false);
    const [interimText, setInterimText] = createSignal("");

    const session = new deps.VoiceSession({
      recorder,
      provider,
      providerConfig: {
        apiKey,
        language: config.language,
      },
      onTranscript: (event: TranscriptionEvent) => {
        if (event.isFinal) {
          // Final text moves into the prompt, so the interim preview is cleared.
          setInterimText("");
          void appendTranscript(api, event.text).catch((error: unknown) => {
            reportError("Voice Error", error, "Failed to insert transcript");
          });
          return;
        }

        setInterimText(event.text);
      },
      onStateChange: (state: VoiceSessionState) => {
        setIsRecording(state === "recording");
      },
      onError: (error: Error) => {
        api.ui.toast({
          variant: "error",
          title: "Voice Error",
          message: error.message,
        });
      },
    });

    // Toggles the session and announces the transition, if there was one.
    const toggleRecording = async (): Promise<void> => {
      const before = session.getState();
      await session.toggle();
      const after = session.getState();

      if (before === after) {
        return;
      }

      if (after === "recording") {
        api.ui.toast({
          variant: "info",
          title: "Voice",
          message: "Recording started... Ctrl+Shift+V to stop",
        });
        return;
      }

      if (before === "recording") {
        api.ui.toast({
          variant: "info",
          title: "Voice",
          message: "Recording stopped",
        });
      }
    };

    const unregisterCommand = api.command.register(() => [
      {
        title: "Toggle Voice Recording",
        value: VOICE_COMMAND_VALUE,
        description: "Start/stop voice-to-text recording (Ctrl+Shift+V)",
        keybind: VOICE_COMMAND_KEYBIND,
        hidden: true,
        onSelect: () => {
          // onSelect is synchronous, so the rejection has to be handled here.
          void toggleRecording().catch((error: unknown) => {
            reportError(
              "Voice Error",
              error,
              "Failed to toggle voice recording",
            );
          });
        },
      },
    ]);

    const slotPlugin: TuiSlotPlugin = {
      slots: {
        home_bottom: () =>
          deps.RecordingIndicator({
            isRecording,
            interimText,
          }),
      },
    };

    api.slots.register(slotPlugin);

    api.lifecycle.onDispose(async () => {
      unregisterCommand();
      await session.dispose();
    });
  };
}
172
+
173
/** The voice plugin built with the package's default dependencies. */
export const tui: TuiPlugin = createTuiPlugin();

/** Plugin module object exposing the TUI entry point; this is the default export. */
const voicePluginModule: TuiPluginModule = { tui };

export default voicePluginModule;
@@ -0,0 +1,143 @@
1
+ import type {
2
+ ProviderConfig,
3
+ TranscriptionEvent,
4
+ TranscriptionProvider,
5
+ VoiceProvider,
6
+ } from "../types";
7
+
8
/** Tuning options accepted by the `ChunkedProvider` constructor. */
type ChunkedConfig = {
  /** Milliseconds between transcription requests; defaults to 5000. */
  chunkDurationMs?: number;
  /** Milliseconds of trailing audio carried into the next chunk; defaults to 500. */
  overlapMs?: number;
};

// Bytes of buffered audio that correspond to one millisecond.
// NOTE(review): 32 equals 16 kHz x 16-bit x mono / 1000 — confirm that this is
// the format the recorder actually emits.
const BYTES_PER_MS = 32;

/** Number of times one chunk is submitted before its failure is reported. */
const MAX_TRANSCRIBE_ATTEMPTS = 3;

/** First retry delay in milliseconds; it doubles on each further attempt. */
const RETRY_BASE_DELAY_MS = 500;

/**
 * Base class for providers that transcribe audio in periodic batches.
 *
 * Audio passed to `sendAudio` is buffered. Every `chunkDurationMs` the buffer
 * is concatenated and handed to `transcribeChunk`; the last `overlapMs` of
 * audio is kept as the start of the next chunk. Each non-blank result is
 * emitted as a final transcription event.
 *
 * Chunks are transcribed strictly one at a time: a timer tick that arrives
 * while a chunk is still being transcribed (including its retries) is
 * skipped and the audio stays buffered for the next tick, so transcripts are
 * always delivered in recording order. `disconnect` waits for the chunk in
 * flight and then transcribes whatever audio is still pending.
 */
export abstract class ChunkedProvider implements TranscriptionProvider {
  abstract readonly name: VoiceProvider;

  private transcriptCallback: ((event: TranscriptionEvent) => void) | null =
    null;
  private errorCallback: ((err: Error) => void) | null = null;
  private audioBuffer: Uint8Array[] = [];
  private chunkTimer: ReturnType<typeof setInterval> | null = null;
  private previousTranscript = "";
  private connected = false;
  private hasUnprocessedAudio = false;
  // Tail of the chain of chunk jobs; each flush is appended to it so that
  // jobs can never overlap.
  private workQueue: Promise<void> = Promise.resolve();
  // True while a chunk is being transcribed.
  private busy = false;
  protected providerConfig: ProviderConfig = {};

  private readonly chunkDurationMs: number;
  private readonly overlapMs: number;

  constructor(config?: ChunkedConfig) {
    this.chunkDurationMs = config?.chunkDurationMs ?? 5000;
    this.overlapMs = config?.overlapMs ?? 500;
  }

  /**
   * Resets all buffered state, stores `config` for subclasses and starts the
   * periodic chunk timer.
   */
  async connect(config: ProviderConfig): Promise<void> {
    this.stopChunkTimer();
    this.providerConfig = config;
    this.audioBuffer = [];
    this.previousTranscript = "";
    this.connected = true;
    this.hasUnprocessedAudio = false;
    this.startChunkTimer();
  }

  /** Buffers a piece of audio; ignored unless the provider is connected. */
  sendAudio(chunk: Uint8Array): void {
    if (!this.connected) {
      return;
    }

    this.audioBuffer.push(chunk);
    this.hasUnprocessedAudio = true;
  }

  /** Registers the callback that receives transcription events. */
  onTranscript(callback: (event: TranscriptionEvent) => void): void {
    this.transcriptCallback = callback;
  }

  /** Registers the callback that receives transcription failures. */
  onError(callback: (err: Error) => void): void {
    this.errorCallback = callback;
  }

  /**
   * Stops accepting audio and stops the timer, then resolves once the chunk
   * in flight (if any) and the remaining buffered audio have been handled.
   */
  async disconnect(): Promise<void> {
    this.connected = false;
    this.stopChunkTimer();
    await this.flush();
  }

  private startChunkTimer(): void {
    this.chunkTimer = setInterval(() => {
      // Never start a second request while one is still running; the audio
      // simply accumulates until a later tick.
      if (!this.busy) {
        void this.flush();
      }
    }, this.chunkDurationMs);
  }

  private stopChunkTimer(): void {
    if (this.chunkTimer !== null) {
      clearInterval(this.chunkTimer);
      this.chunkTimer = null;
    }
  }

  /** Queues one processing pass behind any pass that is already queued. */
  private flush(): Promise<void> {
    const pass = this.workQueue.then(() => this.processPending());
    this.workQueue = pass;
    return pass;
  }

  /** Transcribes the buffered audio if any new audio arrived. Never rejects. */
  private async processPending(): Promise<void> {
    if (!this.hasUnprocessedAudio || this.audioBuffer.length === 0) {
      return;
    }

    this.busy = true;
    try {
      await this.processChunk();
    } catch (error) {
      this.emitError(this.toError(error));
    } finally {
      this.busy = false;
    }
  }

  private async processChunk(): Promise<void> {
    if (this.audioBuffer.length === 0) {
      return;
    }

    const totalLength = this.audioBuffer.reduce(
      (sum, chunk) => sum + chunk.length,
      0,
    );
    const audio = new Uint8Array(totalLength);
    let offset = 0;

    for (const chunk of this.audioBuffer) {
      audio.set(chunk, offset);
      offset += chunk.length;
    }

    // Keep the tail of this chunk as the head of the next one. It is not
    // flagged as unprocessed, so overlap alone never triggers a request.
    const overlapBytes = this.overlapMs * BYTES_PER_MS;
    this.audioBuffer =
      overlapBytes > 0 && audio.length > overlapBytes
        ? [audio.slice(audio.length - overlapBytes)]
        : [];
    this.hasUnprocessedAudio = false;

    let text: string;
    try {
      text = await this.transcribeWithRetry(audio);
    } catch (error) {
      this.emitError(this.toError(error));
      return;
    }

    if (!text.trim()) {
      return;
    }

    this.previousTranscript = text;

    // Delivered outside the retry loop: a consumer that throws must not
    // cause the same audio to be submitted again.
    try {
      this.transcriptCallback?.({ text, isFinal: true });
    } catch (error) {
      this.emitError(this.toError(error));
    }
  }

  /**
   * Calls `transcribeChunk`, retrying with exponential backoff.
   *
   * @throws The error of the last failed attempt.
   */
  private async transcribeWithRetry(audio: Uint8Array): Promise<string> {
    let lastError: Error | null = null;

    for (let attempt = 0; attempt < MAX_TRANSCRIBE_ATTEMPTS; attempt++) {
      try {
        return await this.transcribeChunk(audio, this.previousTranscript);
      } catch (error) {
        lastError = this.toError(error);

        if (attempt < MAX_TRANSCRIBE_ATTEMPTS - 1) {
          await new Promise((resolve) =>
            setTimeout(resolve, RETRY_BASE_DELAY_MS * 2 ** attempt),
          );
        }
      }
    }

    throw lastError ?? new Error("Transcription failed after 3 attempts");
  }

  /** Invokes the error callback; an exception thrown by it is discarded. */
  private emitError(error: Error): void {
    try {
      this.errorCallback?.(error);
    } catch {
      // There is nowhere left to report a failure of the error handler, and
      // it must not break the processing queue.
    }
  }

  private toError(value: unknown): Error {
    return value instanceof Error ? value : new Error(String(value));
  }

  /**
   * Transcribes one chunk of audio.
   *
   * @param audio - Concatenated audio bytes of the chunk.
   * @param previousTranscript - Text of the last non-blank transcript, or an
   *   empty string when there is none yet.
   * @returns The transcribed text; blank results are not emitted.
   */
  protected abstract transcribeChunk(
    audio: Uint8Array,
    previousTranscript: string,
  ): Promise<string>;
}
@@ -0,0 +1,74 @@
1
+ import { StreamingProvider } from "./streaming";
2
+ import type {
3
+ VoiceProvider,
4
+ ProviderConfig,
5
+ TranscriptionEvent,
6
+ } from "../types";
7
+ import { AuthError } from "../types";
8
+
9
/**
 * Streaming provider for Deepgram's live transcription WebSocket endpoint.
 *
 * The base class is configured with an 8000 ms keep-alive interval and at
 * most 3 reconnect attempts.
 */
export class DeepgramProvider extends StreamingProvider {
  readonly name: VoiceProvider = "deepgram";

  constructor() {
    super({ keepaliveIntervalMs: 8000, reconnectMaxRetries: 3 });
  }

  /**
   * Opens the WebSocket to `wss://api.deepgram.com/v1/listen`.
   *
   * The request asks for interim results, punctuation and VAD events on
   * 16 kHz, mono, linear16 audio. `config.model` defaults to `nova-2`;
   * `language` is only sent when one is configured.
   *
   * @throws AuthError when `config.apiKey` is missing.
   */
  protected createWebSocket(config: ProviderConfig): WebSocket {
    const apiKey = config.apiKey;
    if (!apiKey)
      throw new AuthError(
        "Deepgram API key not configured. Set DEEPGRAM_API_KEY env var.",
      );

    const language = (config.language as string) ?? "";
    const model = (config.model as string) ?? "nova-2";

    const params = new URLSearchParams({
      model,
      interim_results: "true",
      punctuate: "true",
      vad_events: "true",
      encoding: "linear16",
      sample_rate: "16000",
      channels: "1",
    });
    if (language) params.set("language", language);

    const url = `wss://api.deepgram.com/v1/listen?${params.toString()}`;

    // Bun's WebSocket supports headers in the constructor options. The
    // standard typings have no such overload, hence the cast.
    return new WebSocket(url, {
      headers: { Authorization: `Token ${apiKey}` },
    } as any);
  }

  /**
   * Converts a raw socket message into a transcription event.
   *
   * Returns `null` for anything that is not a `Results` message carrying a
   * non-blank string transcript: non-string frames, invalid JSON, JSON that
   * is not an object (for example `null`), other message types, and
   * transcripts that are missing, empty or not strings.
   */
  protected parseMessage(data: unknown): TranscriptionEvent | null {
    if (typeof data !== "string") return null;

    let parsed: unknown;
    try {
      parsed = JSON.parse(data);
    } catch {
      return null;
    }

    // JSON.parse also yields null, numbers, strings and booleans; reading a
    // property of null would throw.
    if (typeof parsed !== "object" || parsed === null) return null;

    const message = parsed as {
      type?: unknown;
      is_final?: unknown;
      speech_final?: unknown;
      channel?: {
        alternatives?: Array<{
          transcript?: unknown;
          confidence?: unknown;
        } | null>;
      } | null;
    };

    // Only handle Results messages with non-empty transcripts
    if (message.type !== "Results") return null;

    const best = message.channel?.alternatives?.[0];
    const transcript = best?.transcript;
    if (typeof transcript !== "string" || transcript.trim() === "") return null;

    const confidence = best?.confidence;
    const isFinal = message.is_final === true || message.speech_final === true;

    return {
      text: transcript,
      isFinal,
      confidence: typeof confidence === "number" ? confidence : undefined,
    };
  }

  /** Builds the JSON text frame sent to keep an idle connection open. */
  protected buildKeepAliveFrame(): string {
    return JSON.stringify({ type: "KeepAlive" });
  }
}
@@ -0,0 +1,20 @@
1
+ import type { TranscriptionProvider, VoiceConfig } from "../types.ts";
2
+ import { ConfigError } from "../types.ts";
3
+ import { GroqProvider } from "./groq.ts";
4
+ import { OpenAIWhisperProvider } from "./openai.ts";
5
+ import { DeepgramProvider } from "./deepgram.ts";
6
+
7
/**
 * Instantiates the transcription provider selected by `config.provider`.
 *
 * The Groq and OpenAI Whisper providers receive `config.chunkDurationMs`;
 * the Deepgram provider takes no arguments.
 *
 * @param config - Voice configuration naming the provider.
 * @returns A new provider instance.
 * @throws ConfigError when the provider name is not one of the three
 *   supported values.
 */
export function createProvider(config: VoiceConfig): TranscriptionProvider {
  const selected = config.provider;

  if (selected === "groq") {
    return new GroqProvider(config.chunkDurationMs);
  }

  if (selected === "openai-whisper") {
    return new OpenAIWhisperProvider(config.chunkDurationMs);
  }

  if (selected === "deepgram") {
    return new DeepgramProvider();
  }

  // Reached only when the value does not match the declared provider names.
  throw new ConfigError(
    `Unknown provider: "${selected as string}". Valid providers: groq, openai-whisper, deepgram`,
  );
}
@@ -0,0 +1,57 @@
1
+ import { ChunkedProvider } from "./chunked.ts";
2
+ import { AuthError, RateLimitError, ProviderError } from "../types.ts";
3
+ import type { VoiceProvider } from "../types.ts";
4
+ import { wrapInWav } from "./wav-utils.ts";
5
+
6
/**
 * Chunked provider that sends audio to Groq's OpenAI-compatible
 * transcription endpoint.
 */
export class GroqProvider extends ChunkedProvider {
  readonly name: VoiceProvider = "groq";

  /**
   * @param chunkDurationMs - Milliseconds between requests (default 5000).
   *   The overlap between chunks is fixed at 500 ms.
   */
  constructor(chunkDurationMs = 5000) {
    super({ chunkDurationMs, overlapMs: 500 });
  }

  /**
   * Wraps the audio in a WAV container and posts it as multipart form data.
   *
   * The model defaults to `whisper-large-v3`. The previous transcript, when
   * there is one, is sent as the `prompt` field, and the configured
   * language, when there is one, as the `language` field.
   *
   * @returns The trimmed response body.
   * @throws AuthError when no API key is configured or the API answers 401.
   * @throws RateLimitError when the API answers 429.
   * @throws ProviderError for any other non-OK response.
   */
  protected async transcribeChunk(
    audio: Uint8Array,
    previousTranscript: string,
  ): Promise<string> {
    const apiKey = this.providerConfig.apiKey;
    if (!apiKey)
      throw new AuthError(
        "Groq API key not configured. Set GROQ_API_KEY env var.",
      );

    const model = (this.providerConfig.model as string) ?? "whisper-large-v3";
    const language = this.providerConfig.language as string | undefined;

    const wav = wrapInWav(audio);
    const formData = new FormData();
    formData.append(
      "file",
      // Pass the typed array itself, as the OpenAI provider does. Its
      // `.buffer` is the whole backing buffer and would include bytes
      // outside the view if `wav` were ever a sub-range of a larger buffer.
      new Blob([wav], { type: "audio/wav" }),
      "audio.wav",
    );
    formData.append("model", model);
    formData.append("response_format", "text");
    if (previousTranscript) formData.append("prompt", previousTranscript);
    if (language) formData.append("language", language);

    const response = await fetch(
      "https://api.groq.com/openai/v1/audio/transcriptions",
      {
        method: "POST",
        headers: { Authorization: `Bearer ${apiKey}` },
        body: formData,
      },
    );

    if (response.status === 401) throw new AuthError("Invalid Groq API key.");
    if (response.status === 429)
      throw new RateLimitError("Groq rate limit exceeded.");
    if (!response.ok)
      throw new ProviderError(
        `Groq API error: ${response.status} ${response.statusText}`,
      );

    return (await response.text()).trim();
  }
}
@@ -0,0 +1,57 @@
1
+ import { ChunkedProvider } from "./chunked.ts";
2
+ import { AuthError, RateLimitError, ProviderError } from "../types.ts";
3
+ import type { VoiceProvider } from "../types.ts";
4
+ import { wrapInWav } from "./wav-utils.ts";
5
+
6
/**
 * Chunked provider that sends audio to OpenAI's transcription endpoint.
 */
export class OpenAIWhisperProvider extends ChunkedProvider {
  readonly name: VoiceProvider = "openai-whisper";

  /**
   * @param chunkDurationMs - Milliseconds between requests (default 10000).
   *   The overlap between chunks is fixed at 500 ms.
   */
  constructor(chunkDurationMs = 10000) {
    super({ chunkDurationMs, overlapMs: 500 });
  }

  /**
   * Uploads one chunk as a WAV file and returns the trimmed text response.
   *
   * The model defaults to `whisper-1`. A non-empty previous transcript is
   * sent as `prompt`, and a configured language as `language`.
   *
   * @throws AuthError when no API key is configured or the API answers 401.
   * @throws RateLimitError when the API answers 429.
   * @throws ProviderError for any other non-OK response.
   */
  protected async transcribeChunk(
    audio: Uint8Array,
    previousTranscript: string,
  ): Promise<string> {
    const { apiKey } = this.providerConfig;
    if (!apiKey) {
      throw new AuthError(
        "OpenAI API key not configured. Set OPENAI_API_KEY env var.",
      );
    }

    const configuredModel = this.providerConfig.model as string;
    const model = configuredModel ?? "whisper-1";
    const language = this.providerConfig.language as string | undefined;

    // Assemble the multipart body.
    const wavFile = new Blob([wrapInWav(audio)], { type: "audio/wav" });
    const body = new FormData();
    body.append("file", wavFile, "audio.wav");
    body.append("model", model);
    body.append("response_format", "text");
    if (previousTranscript) {
      body.append("prompt", previousTranscript);
    }
    if (language) {
      body.append("language", language);
    }

    const endpoint = "https://api.openai.com/v1/audio/transcriptions";
    const response = await fetch(endpoint, {
      method: "POST",
      headers: { Authorization: `Bearer ${apiKey}` },
      body,
    });

    // Map well-known statuses to specific errors before the generic check.
    switch (response.status) {
      case 401:
        throw new AuthError("Invalid OpenAI API key.");
      case 429:
        throw new RateLimitError("OpenAI rate limit exceeded.");
      default:
        if (!response.ok) {
          throw new ProviderError(
            `OpenAI API error: ${response.status} ${response.statusText}`,
          );
        }
    }

    const text = await response.text();
    return text.trim();
  }
}