opencode-voice2text 0.1.11 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/package.json +3 -2
  2. package/src/index.tsx +918 -0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "opencode-voice2text",
3
- "version": "0.1.11",
3
+ "version": "0.1.12",
4
4
  "description": "Streaming Volcengine speech-to-text plugin for the OpenCode TUI",
5
5
  "type": "module",
6
6
  "license": "MIT",
@@ -22,6 +22,7 @@
22
22
  ],
23
23
  "files": [
24
24
  "dist",
25
+ "src",
25
26
  "README.md",
26
27
  "LICENSE"
27
28
  ],
@@ -31,7 +32,7 @@
31
32
  "import": "./dist/index.js"
32
33
  },
33
34
  "./tui": {
34
- "import": "./dist/index.js",
35
+ "import": "./src/index.tsx",
35
36
  "config": {
36
37
  "commandKeybind": "ctrl+g"
37
38
  }
package/src/index.tsx ADDED
@@ -0,0 +1,918 @@
1
+ import "@opentui/solid/runtime-plugin-support"
2
+ /** @jsxImportSource @opentui/solid */
3
+ import { randomBytes, randomUUID } from "node:crypto"
4
+ import { spawn } from "node:child_process"
5
+ import os from "node:os"
6
+ import path from "node:path"
7
+ import tls from "node:tls"
8
+ import zlib from "node:zlib"
9
+ import { promises as fs } from "node:fs"
10
+ import { Show, createMemo } from "solid-js"
11
+ import type { PluginOptions } from "@opencode-ai/plugin"
12
+ import type { TuiPlugin, TuiPluginApi, TuiPluginModule } from "@opencode-ai/plugin/tui"
13
+
14
// WebSocket frame opcodes handled by the hand-rolled client in this file.
const WS_OPCODE_BINARY = 0x02
const WS_OPCODE_CLOSE = 0x08
const WS_OPCODE_PING = 0x09
const WS_OPCODE_PONG = 0x0a

// Fields of the 4-byte Volcengine message header (each value occupies one nibble).
const HEADER_VERSION = 0x01
const HEADER_SIZE = 0x01
const MESSAGE_TYPE_FULL_CLIENT_REQUEST = 0x01
const MESSAGE_TYPE_AUDIO_ONLY_REQUEST = 0x02
const MESSAGE_TYPE_ERROR = 0x0f
const SERIALIZATION_NONE = 0x00
const SERIALIZATION_JSON = 0x01
const COMPRESSION_GZIP = 0x01

// Defaults applied when neither the plugin options, the local config file nor the environment set a value.
const DEFAULT_CONFIG_PATH = path.join(os.homedir(), ".config", "opencode", "voice2text.local.json")
const DEFAULT_ENDPOINT = "wss://openspeech.bytedance.com/api/v3/sauc/bigmodel_async"
const DEFAULT_RESOURCE_ID = "volc.seedasr.sauc.duration"
const DEFAULT_CHUNK_MS = 200
const DEFAULT_RATE = 16_000
const DEFAULT_BITS = 16
const DEFAULT_CHANNELS = 1
const DEFAULT_END_WINDOW_SIZE = 800

// Key/value store entries that drive the status indicator.
const STATUS_KEY = "opencode_voice2text.status"
const STATUS_MESSAGE_KEY = "opencode_voice2text.status_message"
39
+
40
/** Raw, all-optional settings as they arrive from the plugin options. */
type Voice2TextOptions = PluginOptions & {
  commandKeybind?: string
  provider?: string
  endpoint?: string
  appId?: string
  accessToken?: string
  resourceId?: string
  language?: string
  chunkMs?: number
  endWindowSize?: number
  maxDurationSeconds?: number
  appendTrailingSpace?: boolean
  rate?: number
  bits?: number
  channels?: number
}

/** Connection settings and credentials handed to the speech provider. */
type VoiceProviderConfig = {
  endpoint: string
  appId: string
  accessToken: string
  resourceId: string
}

/** Fully resolved configuration: every field has a value after defaults are applied. */
type Voice2TextConfig = {
  commandKeybind: string
  provider: string
  language: string
  chunkMs: number
  endWindowSize: number
  maxDurationSeconds: number
  appendTrailingSpace: boolean
  rate: number
  bits: number
  channels: number
  providerConfig: VoiceProviderConfig
}

/** Hooks a recognition session may invoke while it is running. */
type RecognitionCallbacks = {
  onStableText?: (text: string) => Promise<void>
}

/** A pluggable speech-to-text backend. */
type VoiceProvider = {
  id: string
  displayName: string
  configFileFields: string[]
  /** Returns a user-facing message when the config is unusable, otherwise `undefined`. */
  validateConfig: (config: Voice2TextConfig) => string | undefined
  createRecognition: (config: Voice2TextConfig, callbacks: RecognitionCallbacks) => Promise<RecognitionSession>
}

/** Outcome of a finished recognition session. */
type TranscriptResult = {
  text: string
  stableText: string
  logId: string
}

/** Handle on a running microphone capture. */
type RecorderSession = {
  done: Promise<void>
  stop: () => void
}

/** Handle on a streaming recognition session. */
type RecognitionSession = {
  write: (chunk: Buffer) => void
  finish: (finalChunk?: Buffer) => Promise<TranscriptResult>
  abort: () => Promise<void>
}

/** A decoded server message: the header flags nibble plus the decoded payload. */
type VolcengineResponse = {
  flags: number
  data: any
}
108
+
109
/**
 * Normalizes a loosely typed setting to a trimmed string.
 *
 * @param value - Candidate value of any type.
 * @param fallback - Returned when `value` is not a string or is blank.
 * @returns The trimmed string, or `fallback`.
 */
function str(value: unknown, fallback = "") {
  if (typeof value !== "string") return fallback
  const trimmed = value.trim()
  return trimmed ? trimmed : fallback
}
112
+
113
/**
 * Coerces a loosely typed setting to a positive, finite number.
 *
 * @param value - Candidate value; converted with `Number(...)`.
 * @param fallback - Returned when the conversion is not finite or not greater than zero.
 * @returns The converted number, or `fallback`.
 */
function num(value: unknown, fallback: number) {
  const candidate = Number(value)
  if (!Number.isFinite(candidate) || candidate <= 0) return fallback
  return candidate
}
117
+
118
/**
 * Coerces a loosely typed setting to a boolean.
 *
 * Accepts real booleans and the exact strings "true" / "false"; anything else yields `fallback`.
 */
function bool(value: unknown, fallback: boolean) {
  switch (value) {
    case true:
    case "true":
      return true
    case false:
    case "false":
      return false
    default:
      return fallback
  }
}
124
+
125
/** Returns `text` trimmed when it is a string; every other input becomes the empty string. */
function appendableText(text: unknown) {
  if (typeof text !== "string") return ""
  return text.trim()
}
128
+
129
/**
 * Computes the text that `next` adds on top of `previous`.
 *
 * @returns `next` unchanged when there is no previous text, the trimmed remainder when `next`
 * extends `previous`, and the empty string when `next` is empty or does not start with `previous`.
 */
function diffSuffix(previous: string, next: string) {
  if (!next) return ""
  if (!previous) return next
  return next.startsWith(previous) ? next.substring(previous.length).trim() : ""
}
135
+
136
/** Human-readable name for the current platform; platforms without a label map to `process.platform`. */
function platformLabel() {
  const labels: Record<string, string> = { darwin: "macOS", linux: "Linux" }
  const current: string = process.platform
  return Object.prototype.hasOwnProperty.call(labels, current) ? labels[current] : current
}
141
+
142
/** Message telling the user how to install the `rec` recorder (part of Sox) on the current platform. */
function installHint() {
  switch (process.platform) {
    case "darwin":
      return "Missing recorder 'rec'. Install Sox with: brew install sox"
    case "linux":
      return "Missing recorder 'rec'. Install Sox with: sudo apt install sox"
    default:
      return "Missing recorder 'rec'. Install Sox before using voice input."
  }
}
147
+
148
/** Synchronously gzip-compresses `buffer` and returns the compressed bytes. */
function gzip(buffer: Buffer) {
  const compressed = zlib.gzipSync(buffer)
  return compressed
}
151
+
152
/**
 * Builds the 4-byte message header.
 *
 * Byte 0 packs version and header size, byte 1 packs message type and flags, byte 2 packs
 * serialization and compression (high nibble first in each case); byte 3 is always zero.
 */
function buildProtocolHeader(messageType: number, flags: number, serialization: number, compression: number) {
  const header = Buffer.alloc(4)
  header[0] = (HEADER_VERSION << 4) | HEADER_SIZE
  header[1] = (messageType << 4) | flags
  header[2] = (serialization << 4) | compression
  // header[3] keeps the 0x00 written by Buffer.alloc.
  return header
}
160
+
161
/**
 * Encodes one client message: header, 4-byte big-endian payload size, then the gzip-compressed payload.
 *
 * @param messageType - Message type nibble for the header.
 * @param flags - Flags nibble for the header.
 * @param payload - Uncompressed payload; it is always gzip-compressed before sending.
 * @param serialization - Serialization nibble describing `payload`.
 */
function buildClientMessage(messageType: number, flags: number, payload: Buffer, serialization: number) {
  const body = gzip(payload)
  const prefix = buildProtocolHeader(messageType, flags, serialization, COMPRESSION_GZIP)

  const message = Buffer.alloc(prefix.length + 4 + body.length)
  prefix.copy(message, 0)
  message.writeUInt32BE(body.length, prefix.length)
  body.copy(message, prefix.length + 4)
  return message
}
168
+
169
/** Encodes an audio-only message with an empty payload and flags 0x2, sent once no more audio follows. */
function buildEmptyLastAudioMessage() {
  return buildClientMessage(MESSAGE_TYPE_AUDIO_ONLY_REQUEST, 0x2, Buffer.alloc(0), SERIALIZATION_NONE)
}
176
+
177
/**
 * Decodes one binary message received from the server.
 *
 * Layout read here: a header whose size (in 4-byte words) is the low nibble of byte 0, an optional
 * 4-byte field when flags are 0x1 or 0x3, a 4-byte big-endian payload size and the payload itself.
 * Error messages carry a 4-byte error code in front of the payload size.
 *
 * @param message - The complete payload of one WebSocket binary message.
 * @returns The flags nibble and the decoded payload (parsed JSON, or the raw bytes when the
 * serialization is not JSON).
 * @throws Error when the frame is truncated or malformed, or when the server reports an error.
 */
function parseServerMessage(message: Buffer): VolcengineResponse {
  if (message.length < 8) throw new Error("Invalid Volcengine response frame")

  const headerSize = (message[0] & 0x0f) * 4
  const messageType = message[1] >> 4
  const flags = message[1] & 0x0f
  const serialization = message[2] >> 4
  const compression = message[2] & 0x0f
  let offset = headerSize

  // Every read is bounds-checked so a short frame yields a protocol error rather than a RangeError.
  const readUInt32 = (field: string) => {
    if (offset + 4 > message.length) {
      throw new Error(`Invalid Volcengine response frame: truncated ${field}`)
    }
    const value = message.readUInt32BE(offset)
    offset += 4
    return value
  }

  const readPayload = () => {
    const size = readUInt32("payload size")
    if (offset + size > message.length) {
      throw new Error("Invalid Volcengine response frame: truncated payload")
    }
    const raw = message.subarray(offset, offset + size)
    offset += size
    return compression === COMPRESSION_GZIP ? zlib.gunzipSync(raw) : raw
  }

  if (messageType === MESSAGE_TYPE_ERROR) {
    const code = readUInt32("error code")
    let text: string
    try {
      text = readPayload().toString("utf8")
    } catch {
      // Keep the server's error code visible even when its description cannot be decoded.
      text = "(unreadable error payload)"
    }
    throw new Error(`Volcengine ASR error ${code}: ${text}`)
  }

  // Flags 0x1 and 0x3 announce an extra 4-byte field ahead of the payload size; its value is not used.
  if (flags === 0x1 || flags === 0x3) readUInt32("sequence number")
  const body = readPayload()

  return {
    flags,
    data: serialization === SERIALIZATION_JSON ? JSON.parse(body.toString("utf8")) : body,
  }
}
208
+
209
/**
 * Encodes a single, final (FIN set) WebSocket frame with a random 4-byte mask.
 *
 * @param opcode - Frame opcode placed in the low nibble of the first byte.
 * @param payload - Unmasked payload bytes.
 * @returns First byte, masked length field (1, 3 or 9 bytes), mask key, then the masked payload.
 */
function createMaskedFrame(opcode: number, payload: Buffer) {
  const size = payload.length
  const maskKey = randomBytes(4)

  // The MASK bit (0x80) is set on the length byte in every encoding.
  let lengthField: Buffer
  if (size < 126) {
    lengthField = Buffer.from([0x80 | size])
  } else if (size <= 0xffff) {
    lengthField = Buffer.alloc(3)
    lengthField[0] = 0x80 | 126
    lengthField.writeUInt16BE(size, 1)
  } else {
    lengthField = Buffer.alloc(9)
    lengthField[0] = 0x80 | 127
    lengthField.writeBigUInt64BE(BigInt(size), 1)
  }

  const maskedPayload = Buffer.alloc(size)
  payload.forEach((byte, position) => {
    maskedPayload[position] = byte ^ maskKey[position % 4]
  })

  return Buffer.concat([Buffer.from([0x80 | opcode]), lengthField, maskKey, maskedPayload])
}
236
+
237
/**
 * Minimal `wss:` WebSocket client built directly on a TLS socket.
 *
 * It performs the HTTP upgrade handshake, sends masked frames and exposes received binary
 * messages through `receiveBinary`. Pings are answered with pongs; text frames are ignored.
 */
class WebSocketBinaryClient {
  private readonly url: URL
  private readonly headers: Record<string, string>
  private socket: tls.TLSSocket | undefined
  private buffer = Buffer.alloc(0)
  private pendingFrames: Buffer[] = []
  private waiters: Array<{ resolve: (value: Buffer) => void; reject: (error: Error) => void }> = []
  /** Payloads of a fragmented binary message whose final (FIN) frame has not arrived yet. */
  private fragments: Buffer[] = []
  /** Set once the connection has ended so later receives fail immediately instead of timing out. */
  private closedError: Error | undefined

  /**
   * @param url - Endpoint URL; only the `wss:` scheme is accepted by `connect`.
   * @param headers - Extra request headers sent with the upgrade request.
   */
  constructor(url: string, headers: Record<string, string>) {
    this.url = new URL(url)
    this.headers = headers
  }

  /**
   * Opens the TLS connection and performs the WebSocket upgrade.
   *
   * @returns The response headers of the upgrade response, keyed by lower-cased name.
   * @throws Error when the scheme is not `wss:`, the connection fails or closes during the
   * handshake, or the server does not answer with status 101.
   */
  async connect() {
    if (this.url.protocol !== "wss:") {
      throw new Error(`Unsupported websocket protocol: ${this.url.protocol}`)
    }

    const key = randomBytes(16).toString("base64")
    const headerLines = [
      `GET ${this.url.pathname}${this.url.search} HTTP/1.1`,
      `Host: ${this.url.host}`,
      "Upgrade: websocket",
      "Connection: Upgrade",
      `Sec-WebSocket-Key: ${key}`,
      "Sec-WebSocket-Version: 13",
    ]

    for (const [name, value] of Object.entries(this.headers)) {
      headerLines.push(`${name}: ${value}`)
    }
    // Joining with CRLF turns this trailing element into the blank line that ends the request head.
    headerLines.push("\r\n")

    const socket = await new Promise<tls.TLSSocket>((resolve, reject) => {
      const next = tls.connect({
        host: this.url.hostname,
        port: Number(this.url.port || 443),
        servername: this.url.hostname,
      })
      // Each outcome removes the other listener so nothing from the connect phase lingers.
      const onSecure = () => {
        next.off("error", onError)
        resolve(next)
      }
      const onError = (error: Error) => {
        next.off("secureConnect", onSecure)
        reject(error)
      }
      next.once("secureConnect", onSecure)
      next.once("error", onError)
    })

    this.socket = socket
    this.closedError = undefined
    socket.setNoDelay(true)
    socket.write(headerLines.join("\r\n"))

    let handshake: { headerPart: string; rest: Buffer }
    try {
      handshake = await new Promise<{ headerPart: string; rest: Buffer }>((resolve, reject) => {
        let chunkBuffer = Buffer.alloc(0)
        const cleanup = () => {
          socket.off("data", onData)
          socket.off("error", onError)
          socket.off("close", onClose)
        }
        const onData = (chunk: Buffer) => {
          chunkBuffer = Buffer.concat([chunkBuffer, chunk])
          const separator = chunkBuffer.indexOf("\r\n\r\n")
          if (separator === -1) return
          cleanup()
          resolve({
            headerPart: chunkBuffer.subarray(0, separator).toString("utf8"),
            rest: chunkBuffer.subarray(separator + 4),
          })
        }
        const onError = (error: Error) => {
          cleanup()
          reject(error)
        }
        // Without this the promise would never settle if the peer hangs up mid-handshake.
        const onClose = () => {
          cleanup()
          reject(new Error("WebSocket closed during handshake"))
        }
        socket.on("data", onData)
        socket.once("error", onError)
        socket.once("close", onClose)
      })
    } catch (error) {
      this.socket = undefined
      socket.destroy()
      throw error
    }

    // Long-lived listeners are attached before validation so the socket is never left without
    // an "error" listener.
    socket.on("data", (chunk) => this.onData(chunk))
    socket.on("close", () => this.fail(new Error("WebSocket closed")))
    socket.on("error", (error) => this.fail(error))

    const lines = handshake.headerPart.split("\r\n")
    const statusLine = lines[0] || ""
    if (!/^HTTP\/1\.[01] 101(\s|$)/.test(statusLine)) {
      this.socket = undefined
      socket.destroy()
      throw new Error(`WebSocket handshake failed: ${statusLine || "unknown"}`)
    }

    const responseHeaders: Record<string, string> = {}
    for (const line of lines.slice(1)) {
      const index = line.indexOf(":")
      if (index === -1) continue
      responseHeaders[line.slice(0, index).trim().toLowerCase()] = line.slice(index + 1).trim()
    }

    // Bytes that arrived in the same chunk as the response head already belong to the frame stream.
    if (handshake.rest.length > 0) this.onData(handshake.rest)
    return responseHeaders
  }

  /** Appends received bytes to the buffer and processes every complete frame it now contains. */
  private onData(chunk: Buffer) {
    this.buffer = Buffer.concat([this.buffer, chunk])

    while (this.buffer.length >= 2) {
      const first = this.buffer[0]
      const second = this.buffer[1]
      const fin = (first & 0x80) !== 0
      const opcode = first & 0x0f
      const masked = (second & 0x80) !== 0
      let offset = 2
      let payloadLength = second & 0x7f

      // Lengths 126 and 127 announce a 16-bit or 64-bit extended length field.
      if (payloadLength === 126) {
        if (this.buffer.length < offset + 2) return
        payloadLength = this.buffer.readUInt16BE(offset)
        offset += 2
      } else if (payloadLength === 127) {
        if (this.buffer.length < offset + 8) return
        payloadLength = Number(this.buffer.readBigUInt64BE(offset))
        offset += 8
      }

      let mask: Buffer | undefined
      if (masked) {
        if (this.buffer.length < offset + 4) return
        mask = this.buffer.subarray(offset, offset + 4)
        offset += 4
      }

      // Wait for more data until the whole frame is buffered.
      if (this.buffer.length < offset + payloadLength) return

      let payload = this.buffer.subarray(offset, offset + payloadLength)
      this.buffer = this.buffer.subarray(offset + payloadLength)

      if (masked && mask) {
        const unmasked = Buffer.alloc(payload.length)
        for (let index = 0; index < payload.length; index += 1) {
          unmasked[index] = payload[index] ^ mask[index % 4]
        }
        payload = unmasked
      }

      if (opcode === WS_OPCODE_PING) {
        try {
          this.sendRaw(WS_OPCODE_PONG, payload)
        } catch {
          // The socket is already detached; there is nobody left to answer.
        }
        continue
      }

      if (opcode === WS_OPCODE_CLOSE) {
        this.fail(new Error("WebSocket closed by server"))
        return
      }

      if (opcode === WS_OPCODE_BINARY) {
        this.fragments = [payload]
      } else if (opcode === 0x0 && this.fragments.length > 0) {
        // Continuation frame of a binary message that is still being assembled.
        this.fragments.push(payload)
      } else {
        // Text frames, pongs and continuations of ignored messages carry nothing we deliver.
        continue
      }

      if (!fin) continue

      const message = this.fragments.length > 1 ? Buffer.concat(this.fragments) : payload
      this.fragments = []

      const waiter = this.waiters.shift()
      if (waiter) waiter.resolve(message)
      else this.pendingFrames.push(message)
    }
  }

  /** Records that the connection has ended and rejects every pending receive with `error`. */
  private fail(error: Error) {
    if (!this.closedError) this.closedError = error
    this.flushWaiters(error)
  }

  /** Rejects and removes every pending receive. */
  private flushWaiters(error: Error) {
    while (this.waiters.length > 0) {
      this.waiters.shift()?.reject(error)
    }
  }

  /** Writes one masked frame. Throws when there is no connected socket. */
  private sendRaw(opcode: number, payload: Buffer) {
    if (!this.socket) throw new Error("WebSocket is not connected")
    this.socket.write(createMaskedFrame(opcode, payload))
  }

  /** Sends `payload` as a single binary frame. Throws when there is no connected socket. */
  sendBinary(payload: Buffer) {
    this.sendRaw(WS_OPCODE_BINARY, payload)
  }

  /**
   * Resolves with the next binary message, taking already buffered messages first.
   *
   * @param timeoutMs - How long to wait for a message that has not arrived yet.
   * @throws Error when the connection has ended or no message arrives within `timeoutMs`.
   */
  async receiveBinary(timeoutMs = 30000) {
    const queued = this.pendingFrames.shift()
    if (queued) return queued

    // Nothing more can arrive on a dead connection, so fail now rather than after the timeout.
    if (this.closedError) throw this.closedError

    return new Promise<Buffer>((resolve, reject) => {
      const waiter = {
        resolve: (value: Buffer) => {
          clearTimeout(timer)
          resolve(value)
        },
        reject: (error: Error) => {
          clearTimeout(timer)
          reject(error)
        },
      }

      const timer = setTimeout(() => {
        this.waiters = this.waiters.filter((item) => item !== waiter)
        reject(new Error("Timed out waiting for Volcengine ASR response"))
      }, timeoutMs)

      this.waiters.push(waiter)
    })
  }

  /**
   * Sends a close frame (best effort) and ends the socket.
   *
   * Resolves once the socket has flushed, or after 200 ms, in which case the socket is destroyed.
   */
  async close() {
    const socket = this.socket
    if (!socket) return

    try {
      this.sendRaw(WS_OPCODE_CLOSE, Buffer.alloc(0))
    } catch {
      // Ignore close send failure.
    }

    // Detach first so a concurrent close() or send cannot touch a socket that is shutting down.
    this.socket = undefined
    await new Promise<void>((resolve) => {
      const timer = setTimeout(() => {
        socket.destroy()
        resolve()
      }, 200)
      socket.end(() => {
        clearTimeout(timer)
        resolve()
      })
    })
  }
}
436
+
437
/**
 * Checks whether `command` can be located by running `which`.
 *
 * @returns `true` when `which` exits with code 0; `false` for any other exit code or when
 * `which` cannot be started.
 */
async function commandExists(command: string) {
  return new Promise<boolean>((resolve) => {
    const probe = spawn("which", [command], { stdio: "ignore" })
    probe.on("error", () => resolve(false))
    probe.on("close", (exitCode) => resolve(exitCode === 0))
  })
}
444
+
445
/**
 * Verifies that voice input can run here.
 *
 * @throws Error when the platform is neither macOS nor Linux, or when the `rec` command is missing.
 */
async function ensureRuntimeSupport() {
  const supportedPlatform = process.platform === "darwin" || process.platform === "linux"
  if (!supportedPlatform) {
    throw new Error(`opencode-voice2text currently supports macOS and Linux. Current platform: ${platformLabel()}`)
  }

  const hasRecorder = await commandExists("rec")
  if (!hasRecorder) {
    throw new Error(installHint())
  }
}
454
+
455
/**
 * Reads the local JSON config file.
 *
 * The path comes from `OPENCODE_VOICE2TEXT_LOCAL_CONFIG` when set, otherwise from the default path.
 *
 * @returns The parsed object, or an empty object when the file does not exist.
 * @throws Error when the file cannot be read for another reason, is not valid JSON, or does not
 * contain a JSON object (arrays, strings, numbers and `null` are rejected instead of being
 * silently spread into the configuration).
 */
async function readLocalConfig(): Promise<Record<string, unknown>> {
  const configPath = process.env.OPENCODE_VOICE2TEXT_LOCAL_CONFIG || DEFAULT_CONFIG_PATH

  let raw: string
  try {
    raw = await fs.readFile(configPath, "utf8")
  } catch (error: unknown) {
    // A missing file simply means "no local overrides".
    if ((error as NodeJS.ErrnoException | null | undefined)?.code === "ENOENT") return {}
    throw error
  }

  let parsed: unknown
  try {
    parsed = JSON.parse(raw)
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error)
    throw new Error(`Invalid JSON in ${configPath}: ${reason}`)
  }

  if (parsed === null || typeof parsed !== "object" || Array.isArray(parsed)) {
    throw new Error(`Invalid config in ${configPath}: expected a JSON object`)
  }
  return parsed as Record<string, unknown>
}
464
+
465
/** Path of the local config file shown to the user: the environment override when set, else the default. */
function configPathLabel() {
  const override = process.env.OPENCODE_VOICE2TEXT_LOCAL_CONFIG
  return override ? override : DEFAULT_CONFIG_PATH
}
468
+
469
/** Display name for a provider id; ids without a known display name are returned unchanged. */
function providerName(provider: string) {
  return provider === "volcengine" ? "Volcengine ASR" : provider
}
473
+
474
/**
 * Resolves the effective configuration.
 *
 * Precedence per setting: plugin options, then the local config file, then the matching
 * environment variable, then the built-in default. `commandKeybind` has no environment variable.
 *
 * @param options - Options passed to the plugin entry point.
 */
async function loadConfig(options: Voice2TextOptions = {}): Promise<Voice2TextConfig> {
  const fileSettings = await readLocalConfig()
  const merged: Record<string, unknown> = { ...fileSettings, ...options }
  const env = process.env

  // A merged value wins unless it is null or undefined, in which case the environment value is used.
  const pick = (key: string, envValue: string | undefined): unknown => merged[key] ?? envValue

  return {
    commandKeybind: str(merged["commandKeybind"], "ctrl+g"),
    provider: str(pick("provider", env.OPENCODE_VOICE2TEXT_PROVIDER), "volcengine"),
    language: str(pick("language", env.OPENCODE_VOICE2TEXT_LANGUAGE)),
    chunkMs: num(pick("chunkMs", env.OPENCODE_VOICE2TEXT_CHUNK_MS), DEFAULT_CHUNK_MS),
    endWindowSize: num(pick("endWindowSize", env.OPENCODE_VOICE2TEXT_END_WINDOW_SIZE), DEFAULT_END_WINDOW_SIZE),
    maxDurationSeconds: num(pick("maxDurationSeconds", env.OPENCODE_VOICE2TEXT_MAX_DURATION_SECONDS), 180),
    appendTrailingSpace: bool(pick("appendTrailingSpace", env.OPENCODE_VOICE2TEXT_APPEND_TRAILING_SPACE), true),
    rate: num(pick("rate", env.OPENCODE_VOICE2TEXT_SAMPLE_RATE), DEFAULT_RATE),
    bits: num(pick("bits", env.OPENCODE_VOICE2TEXT_BITS), DEFAULT_BITS),
    channels: num(pick("channels", env.OPENCODE_VOICE2TEXT_CHANNELS), DEFAULT_CHANNELS),
    providerConfig: {
      endpoint: str(pick("endpoint", env.OPENCODE_VOICE2TEXT_ENDPOINT), DEFAULT_ENDPOINT),
      appId: str(pick("appId", env.OPENCODE_VOICE2TEXT_APP_ID)),
      accessToken: str(pick("accessToken", env.OPENCODE_VOICE2TEXT_ACCESS_TOKEN)),
      resourceId: str(pick("resourceId", env.OPENCODE_VOICE2TEXT_RESOURCE_ID), DEFAULT_RESOURCE_ID),
    },
  }
}
500
+
501
/** Provider definition for Volcengine ASR. */
const volcengineProvider: VoiceProvider = {
  id: "volcengine",
  displayName: "Volcengine ASR",
  configFileFields: ["provider", "appId", "accessToken", "resourceId", "endpoint"],

  /** Reports a message naming the config file when app id, access token or resource id is missing. */
  validateConfig(config) {
    const { appId, accessToken, resourceId } = config.providerConfig
    const hasCredentials = Boolean(appId && accessToken && resourceId)
    if (hasCredentials) return undefined
    return `Missing ${this.displayName} config. Fill ${configPathLabel()} with ${this.configFileFields.join(", ")}.`
  },

  /** Opens a streaming recognition session against the configured endpoint. */
  createRecognition(config, callbacks) {
    return createVolcengineRecognition(config, callbacks)
  },
}
515
+
516
/** Registry of available providers, keyed by provider id. */
const providers: Record<string, VoiceProvider> = Object.fromEntries(
  [volcengineProvider].map((entry) => [entry.id, entry] as const),
)
519
+
520
/**
 * Looks up the provider selected by `config.provider`.
 *
 * Only keys registered on the registry itself match, so names inherited from `Object.prototype`
 * (for example "toString" or "constructor") are reported as unsupported instead of being
 * returned as if they were providers.
 *
 * @throws Error listing the available provider ids when the provider is not registered.
 */
function getProvider(config: Voice2TextConfig): VoiceProvider {
  const isRegistered = Object.prototype.hasOwnProperty.call(providers, config.provider)
  const provider = isRegistered ? providers[config.provider] : undefined
  if (!provider) {
    throw new Error(
      `Unsupported provider '${config.provider}'. Available providers: ${Object.keys(providers).join(", ")}.`,
    )
  }
  return provider
}
529
+
530
/**
 * Builds the JSON body of the initial full client request.
 *
 * @param config - Resolved configuration; supplies the audio format, the optional language and
 * the end window size.
 * @returns An object with `user`, `audio` and `request` sections.
 */
function buildVolcengineRequest(config: Voice2TextConfig) {
  const audio: Record<string, unknown> = {
    format: "pcm",
    codec: "raw",
    rate: config.rate,
    bits: config.bits,
    channel: config.channels,
  }

  // The language is only sent when one is configured.
  if (config.language) {
    audio.language = config.language
  }

  // os.userInfo() throws when the current user has no username or home directory (for example
  // some container setups); a missing user name must not prevent recognition from starting.
  let uid: string
  try {
    uid = os.userInfo().username
  } catch {
    uid = process.env.USER || process.env.USERNAME || "unknown"
  }

  return {
    user: {
      uid,
      did: os.hostname(),
      platform: process.platform === "darwin" ? "macOS" : process.platform,
      sdk_version: "opencode-plugin",
      app_version: "opencode-voice2text",
    },
    audio,
    request: {
      model_name: "bigmodel",
      enable_itn: true,
      enable_punc: true,
      enable_ddc: false,
      result_type: "full",
      show_utterances: true,
      end_window_size: config.endWindowSize,
    },
  }
}
563
+
564
/**
 * Concatenates the text of every utterance in `data.result.utterances` that is flagged
 * `definite` and has non-blank string text.
 *
 * @returns The joined text, trimmed; the empty string when there are no such utterances.
 */
function getStableText(data: any) {
  const candidates = data?.result?.utterances
  const utterances: any[] = Array.isArray(candidates) ? candidates : []

  let combined = ""
  for (const item of utterances) {
    if (!item || !item.definite) continue
    if (typeof item.text !== "string" || !item.text.trim()) continue
    combined += item.text
  }
  return combined.trim()
}
572
+
573
/**
 * Starts `rec` and streams its raw signed-integer PCM output to `onChunk`.
 *
 * Chunks are handed to `onChunk` one at a time, in arrival order. If `onChunk` fails, the
 * recorder is interrupted and `done` rejects with that failure. The recorder is also interrupted
 * after `config.maxDurationSeconds`.
 *
 * @param config - Supplies sample rate, channel count, bit depth and the maximum duration.
 * @param onChunk - Receives each chunk read from the recorder's stdout.
 * @returns `done`, which settles after the process has closed and all chunks were handled, and
 * `stop`, which interrupts the recorder with SIGINT.
 */
function createRecorder(config: Voice2TextConfig, onChunk: (chunk: Buffer) => Promise<void> | void): RecorderSession {
  const recArgs = [
    "-q",
    "-t",
    "raw",
    "-r",
    String(config.rate),
    "-c",
    String(config.channels),
    "-b",
    String(config.bits),
    "-e",
    "signed-integer",
    "-",
  ]
  const child = spawn("rec", recArgs, { stdio: ["ignore", "pipe", "pipe"] })

  let stderr = ""
  let stopRequested = false
  let finished = false
  let streamError: Error | undefined
  let writeChain = Promise.resolve()

  // Marks the stop as intentional before signalling, so the close handler treats it as success.
  const interrupt = () => {
    stopRequested = true
    child.kill("SIGINT")
  }

  const deliver = async (chunk: Buffer) => {
    // After the first failure the remaining chunks are dropped.
    if (streamError) return
    try {
      await onChunk(chunk)
    } catch (error) {
      streamError = error instanceof Error ? error : new Error(String(error))
      interrupt()
    }
  }

  child.stdout?.on("data", (chunk: Buffer) => {
    // Chaining keeps chunk handling sequential even when onChunk is asynchronous.
    writeChain = writeChain.then(() => deliver(chunk))
  })

  child.stderr?.on("data", (chunk) => {
    stderr += chunk.toString()
  })

  const timer = setTimeout(interrupt, config.maxDurationSeconds * 1000)

  const done = new Promise<void>((resolve, reject) => {
    const markFinished = () => {
      clearTimeout(timer)
      finished = true
    }

    child.on("error", (error: NodeJS.ErrnoException) => {
      markFinished()
      reject(error?.code === "ENOENT" ? new Error(installHint()) : error)
    })

    child.on("close", async (code, signal) => {
      markFinished()
      await writeChain

      if (streamError) {
        reject(streamError)
        return
      }

      const endedAsExpected = code === 0 || signal === "SIGINT" || stopRequested
      if (endedAsExpected) {
        resolve()
        return
      }

      reject(new Error(stderr.trim() || `Recording failed with code ${code ?? "unknown"}`))
    })
  })

  return {
    done,
    stop() {
      if (finished || child.killed) return
      interrupt()
    },
  }
}
656
+
657
/**
 * Opens a streaming recognition session.
 *
 * Connects, sends the full client request, waits for the first server response and then keeps
 * reading responses in the background until one arrives with flags 0x3.
 *
 * @param config - Resolved configuration (endpoint, credentials, audio format).
 * @param callbacks - `onStableText` receives each newly appended piece of definite text.
 * @returns A session whose `write` sends audio, `finish` sends the last-audio marker and resolves
 * with the final transcript, and `abort` closes the connection without a result.
 * @throws Error when connecting or the initial exchange fails; the connection is closed first.
 */
async function createVolcengineRecognition(
  config: Voice2TextConfig,
  callbacks: { onStableText?: (text: string) => Promise<void> },
): Promise<RecognitionSession> {
  const client = new WebSocketBinaryClient(config.providerConfig.endpoint, {
    "X-Api-App-Key": config.providerConfig.appId,
    "X-Api-Access-Key": config.providerConfig.accessToken,
    "X-Api-Resource-Id": config.providerConfig.resourceId,
    "X-Api-Connect-Id": randomUUID(),
  })

  let lastText = ""
  let stableText = ""
  let closed = false
  let sendError: Error | undefined
  let responseHeaders: Record<string, string> = {}

  try {
    responseHeaders = await client.connect()
    const requestPayload = Buffer.from(JSON.stringify(buildVolcengineRequest(config)), "utf8")
    client.sendBinary(buildClientMessage(MESSAGE_TYPE_FULL_CLIENT_REQUEST, 0x0, requestPayload, SERIALIZATION_JSON))
    parseServerMessage(await client.receiveBinary())
  } catch (error) {
    // Do not leave a half-initialized connection open when the session cannot be established.
    await client.close().catch(() => undefined)
    throw error
  }

  const receiveLoop = (async (): Promise<TranscriptResult> => {
    while (true) {
      const response = parseServerMessage(await client.receiveBinary(30000))
      const nextText = appendableText(response.data?.result?.text)
      if (nextText) lastText = nextText

      // Only text that extends what was already reported is forwarded.
      const nextStableText = appendableText(getStableText(response.data))
      const stableDelta = diffSuffix(stableText, nextStableText)
      if (stableDelta) {
        stableText = nextStableText
        await callbacks.onStableText?.(stableDelta)
      }

      if (response.flags === 0x3) {
        return {
          text: lastText,
          stableText,
          logId: responseHeaders["x-tt-logid"] || "",
        }
      }
    }
  })()

  // The loop may fail while nobody awaits it (during recording, or after abort). Attaching a
  // handler here prevents an unhandled rejection; finish() still observes the failure because
  // it awaits the original promise.
  receiveLoop.catch(() => undefined)

  // Sends synchronously and remembers the first failure; it is reported from finish().
  const sendAudio = (chunk: Buffer) => {
    if (sendError) return
    try {
      client.sendBinary(buildClientMessage(MESSAGE_TYPE_AUDIO_ONLY_REQUEST, 0x0, chunk, SERIALIZATION_NONE))
    } catch (error) {
      sendError = error instanceof Error ? error : new Error(String(error))
    }
  }

  return {
    write(chunk: Buffer) {
      if (closed || chunk.length === 0) return
      sendAudio(chunk)
    },
    async finish(finalChunk?: Buffer) {
      if (closed) return receiveLoop
      closed = true

      // Everything after this point runs under finally so the connection is closed on every path.
      try {
        if (finalChunk && finalChunk.length > 0) sendAudio(finalChunk)
        if (sendError) throw sendError

        client.sendBinary(buildEmptyLastAudioMessage())
        return await receiveLoop
      } finally {
        await client.close()
      }
    },
    async abort() {
      if (closed) return
      closed = true
      await client.close()
    },
  }
}
735
+
736
/** Appends `text` to the prompt, followed by one space when `config.appendTrailingSpace` is set. */
async function appendTranscript(api: TuiPluginApi, config: Voice2TextConfig, text: string) {
  const suffix = config.appendTrailingSpace ? " " : ""
  await api.client.tui.appendPrompt({ text: `${text}${suffix}` })
}
740
+
741
/** Stores the status and its message in the key/value store, status first. */
function setStatus(api: TuiPluginApi, status: string, message: string) {
  const entries: Array<[string, string]> = [
    [STATUS_KEY, status],
    [STATUS_MESSAGE_KEY, message],
  ]
  for (const [key, value] of entries) {
    api.kv.set(key, value)
  }
}
745
+
746
/**
 * Creates the component that renders the voice status indicator.
 *
 * Nothing is rendered while the status is "idle". Otherwise a bold "REC" (recording) or "ASR"
 * (any other status) label is shown, followed by the status message when there is one.
 */
function statusView(api: TuiPluginApi) {
  return () => {
    const status = createMemo(() => api.kv.get<string>(STATUS_KEY, "idle"))
    const message = createMemo(() => api.kv.get<string>(STATUS_MESSAGE_KEY, ""))

    // Label colour follows the status: warning while recording, accent while transcribing.
    const tone = createMemo(() => {
      switch (status()) {
        case "recording":
          return api.theme.current.warning
        case "transcribing":
          return api.theme.current.accent
        default:
          return api.theme.current.textMuted
      }
    })

    const label = createMemo(() => {
      if (status() === "recording") return "REC"
      return "ASR"
    })

    return (
      <Show when={status() !== "idle"}>
        <box flexDirection="row" gap={1}>
          <text fg={tone()}>
            <b>{label()}</b>
          </text>
          <Show when={message()}>
            <text fg={api.theme.current.textMuted}>{message()}</text>
          </Show>
        </box>
      </Show>
    )
  }
}
771
+
772
/**
 * Plugin entry point: registers the status indicator and the "Toggle voice input" command.
 *
 * The command starts a recording when idle and stops it when recording. Recognized text is
 * appended to the prompt while recording and once more when the session finishes. A recording
 * that ends on its own (maximum duration reached, or the recorder failed) is finalized
 * automatically, so the indicator never stays on "recording" without a running recorder.
 */
const tui: TuiPlugin = async (api, options) => {
  const config = await loadConfig((options ?? {}) as Voice2TextOptions)
  const provider = getProvider(config)

  let phase: "idle" | "recording" | "transcribing" = "idle"
  let disposed = false
  let active:
    | {
        recorder: RecorderSession
        stream: RecognitionSession
        pending: Buffer
        chunkBytes: number
      }
    | undefined

  setStatus(api, "idle", "")

  api.slots.register({
    order: 50,
    slots: {
      home_prompt_right: statusView(api),
      session_prompt_right: statusView(api),
    },
  })

  const toast = (message: string, variant: "info" | "warning" | "error" = "info") => {
    api.ui.toast({ title: "Voice2Text", message, variant, duration: 2500 })
  }

  const startRecording = async () => {
    if (phase !== "idle") return

    phase = "recording"
    setStatus(api, "recording", `listening... press ${config.commandKeybind} to stop`)

    let stream: RecognitionSession | undefined
    try {
      await ensureRuntimeSupport()

      const configError = provider.validateConfig(config)
      if (configError) {
        phase = "idle"
        setStatus(api, "idle", "")
        toast(configError, "warning")
        return
      }

      stream = await provider.createRecognition(config, {
        onStableText: async (text) => {
          const next = appendableText(text)
          if (!next) return
          await appendTranscript(api, config, next)
        },
      })

      // The plugin may have been disposed while the connection was being established.
      if (disposed) {
        await stream.abort().catch(() => undefined)
        phase = "idle"
        return
      }

      const session = {
        stream,
        pending: Buffer.alloc(0),
        // Bytes of PCM audio per chunk of `chunkMs` milliseconds.
        chunkBytes: Math.max(1, Math.floor((config.rate * config.channels * (config.bits / 8) * config.chunkMs) / 1000)),
        recorder: undefined as unknown as RecorderSession,
      }

      // Forwards every complete chunk; a shorter remainder stays in `pending`.
      const flushPending = async () => {
        while (session.pending.length >= session.chunkBytes) {
          const chunk = session.pending.subarray(0, session.chunkBytes)
          session.pending = session.pending.subarray(session.chunkBytes)
          session.stream.write(chunk)
        }
      }

      session.recorder = createRecorder(config, async (chunk) => {
        session.pending = Buffer.concat([session.pending, chunk])
        await flushPending()
      })

      active = session

      // Observe the recorder while it runs: this handles a rejection of `done` (no unhandled
      // rejection) and finalizes the session when the recorder stops without a user request.
      // After a user-initiated stop `active` is no longer this session, so this is a no-op.
      const finalizeIfStillActive = () => {
        if (active === session) void stopRecording()
      }
      void session.recorder.done.then(finalizeIfStillActive, finalizeIfStillActive)
    } catch (error) {
      // A stream opened before the failure would otherwise keep its connection open.
      await stream?.abort().catch(() => undefined)
      phase = "idle"
      setStatus(api, "idle", "")
      toast(error instanceof Error ? error.message : String(error), "error")
    }
  }

  const stopRecording = async () => {
    if (phase !== "recording" || !active) return

    const current = active
    active = undefined
    phase = "transcribing"
    setStatus(api, "transcribing", "stopping...")

    try {
      current.recorder.stop()
      await current.recorder.done
      const finalChunk = current.pending.length > 0 ? current.pending : undefined
      const result = await current.stream.finish(finalChunk)
      // Append whatever the final text adds beyond the stable text that was already streamed.
      const tail = diffSuffix(result.stableText, appendableText(result.text))

      if (tail) {
        await appendTranscript(api, config, tail)
      }
    } catch (error) {
      await current.stream.abort().catch(() => undefined)
      toast(error instanceof Error ? error.message : String(error), "error")
    } finally {
      phase = "idle"
      setStatus(api, "idle", "")
    }
  }

  api.command.register(() => [
    {
      title: "Toggle voice input",
      value: "voice2text.toggle",
      description: `Stream microphone audio to ${providerName(config.provider)} and append recognized text to the prompt`,
      keybind: config.commandKeybind,
      slash: { name: "voice2text", aliases: ["voice"] },
      hidden: false,
      onSelect: () => {
        if (phase === "transcribing") {
          toast("Still transcribing the previous recording.", "warning")
          return
        }

        if (phase === "recording") {
          void stopRecording()
          return
        }

        void startRecording()
      },
    },
  ])

  api.lifecycle.onDispose(() => {
    disposed = true
    const current = active
    // Clearing `active` first keeps the recorder observer from finalizing a disposed session.
    active = undefined
    current?.recorder.stop()
    void current?.stream.abort().catch(() => undefined)
    setStatus(api, "idle", "")
  })
}
912
+
913
/** Module descriptor exported as the package's default export. */
const plugin: TuiPluginModule = {
  tui,
  id: "opencode.voice2text",
}

export default plugin