clideck 1.22.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +77 -0
  3. package/activity.js +56 -0
  4. package/agent-presets.json +93 -0
  5. package/assets/clideck-themes.jpg +0 -0
  6. package/bin/clideck.js +2 -0
  7. package/config.js +96 -0
  8. package/handlers.js +297 -0
  9. package/opencode-bridge.js +148 -0
  10. package/opencode-plugin/clideck-bridge.js +24 -0
  11. package/package.json +47 -0
  12. package/paths.js +41 -0
  13. package/plugin-loader.js +285 -0
  14. package/plugins/trim-clip/clideck-plugin.json +13 -0
  15. package/plugins/trim-clip/client.js +31 -0
  16. package/plugins/trim-clip/index.js +10 -0
  17. package/plugins/voice-input/clideck-plugin.json +49 -0
  18. package/plugins/voice-input/client.js +196 -0
  19. package/plugins/voice-input/index.js +342 -0
  20. package/plugins/voice-input/python/mel_filters.npz +0 -0
  21. package/plugins/voice-input/python/whisper_turbo.py +416 -0
  22. package/plugins/voice-input/python/worker.py +135 -0
  23. package/public/fx/bold-beep-idle.mp3 +0 -0
  24. package/public/fx/default-beep.mp3 +0 -0
  25. package/public/fx/echo-beep-idle.mp3 +0 -0
  26. package/public/fx/musical-beep-idle.mp3 +0 -0
  27. package/public/fx/small-bleep-idle.mp3 +0 -0
  28. package/public/fx/soft-beep.mp3 +0 -0
  29. package/public/fx/space-idle.mp3 +0 -0
  30. package/public/img/claude-code.png +0 -0
  31. package/public/img/clideck-logo-icon.png +0 -0
  32. package/public/img/clideck-logo-terminal-panel.png +0 -0
  33. package/public/img/codex.png +0 -0
  34. package/public/img/gemini.png +0 -0
  35. package/public/img/opencode.png +0 -0
  36. package/public/index.html +243 -0
  37. package/public/js/app.js +794 -0
  38. package/public/js/color-mode.js +51 -0
  39. package/public/js/confirm.js +27 -0
  40. package/public/js/creator.js +201 -0
  41. package/public/js/drag.js +134 -0
  42. package/public/js/folder-picker.js +81 -0
  43. package/public/js/hotkeys.js +90 -0
  44. package/public/js/nav.js +56 -0
  45. package/public/js/profiles.js +22 -0
  46. package/public/js/prompts.js +325 -0
  47. package/public/js/settings.js +489 -0
  48. package/public/js/state.js +15 -0
  49. package/public/js/terminals.js +905 -0
  50. package/public/js/toast.js +62 -0
  51. package/public/js/utils.js +27 -0
  52. package/public/tailwind.css +1 -0
  53. package/server.js +126 -0
  54. package/sessions.js +375 -0
  55. package/telemetry-receiver.js +129 -0
  56. package/themes.js +247 -0
  57. package/transcript.js +90 -0
  58. package/utils.js +66 -0
@@ -0,0 +1,416 @@
1
+ import base64
2
+ import glob
3
+ import json
4
+ import math
5
+ import os
6
+ import time
7
+ from functools import lru_cache
8
+ from subprocess import CalledProcessError, run
9
+
10
+ import mlx.core as mx
11
+ import mlx.nn as nn
12
+ import numpy as np
13
+ import tiktoken
14
+ from huggingface_hub import hf_hub_download, snapshot_download
15
+
16
class Tokenizer:
    """Whisper multilingual BPE tokenizer built on tiktoken.

    Loads the rank file from next to this script, falling back to a download
    from the JosefAlbers/whisper HF repo, then appends Whisper's special
    tokens (SOT, 100 language slots, task, timestamp tokens) after the base
    vocabulary.
    """

    def __init__(self):
        base_path = os.path.dirname(os.path.abspath(__file__))
        path_tok = os.path.join(base_path, 'multilingual.tiktoken')
        if not os.path.exists(path_tok):
            path_tok = hf_hub_download(repo_id='JosefAlbers/whisper', filename='multilingual.tiktoken', cache_dir=base_path)
        with open(path_tok) as f:
            # Skip blank lines: a bare "\n" is truthy, so the original
            # `if line` filter let blank/trailing lines through and the
            # two-field unpack crashed on them.
            ranks = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in f if line.strip())}
        n_vocab = len(ranks)
        specials = ["<|endoftext|>", "<|startoftranscript|>", *[f"<|_{lang}|>" for lang in range(100)], "<|translate|>", "<|transcribe|>", "<|startoflm|>", "<|startofprev|>", "<|nospeech|>", "<|notimestamps|>", *[f"<|{i * 0.02:.2f}|>" for i in range(1501)]]
        special_tokens = {k: (n_vocab + i) for i, k in enumerate(specials)}
        self.encoding = tiktoken.Encoding(name='jj', explicit_n_vocab=n_vocab + len(special_tokens), pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", mergeable_ranks=ranks, special_tokens=special_tokens)

    def encode(self, lot):
        """Encode a string (or list of strings) to a list of token-id lists."""
        if isinstance(lot, str):
            lot = [lot]
        return [self.encoding.encode(t, allowed_special='all') for t in lot]

    def decode(self, lol):
        """Decode a token-id list (or list of such lists) to a list of strings.

        An empty input now returns [] instead of raising IndexError on lol[0].
        """
        if lol and isinstance(lol[0], int):
            lol = [lol]
        return [self.encoding.decode(l) for l in lol]
36
+
37
# Whisper language codes in token-id order: the language token for
# LANGUAGES_KEYS[i] is 50259 + i (these fill the 100 "<|_n|>" special slots
# declared in Tokenizer). Do not reorder — indices are baked into the model.
LANGUAGES_KEYS = [
    "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr",
    "pl", "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi",
    "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no",
    "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk",
    "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk",
    "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw",
    "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc",
    "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo",
    "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl",
    "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su", "yue"
]
49
+
50
def load_audio(file, sr=16000):
    """Decode an audio file to mono float32 samples in [-1, 1) at `sr` Hz.

    Shells out to ffmpeg for decoding; raises RuntimeError (with ffmpeg's
    stderr) when the file cannot be read.
    """
    cmd = [
        "ffmpeg", "-nostdin", "-threads", "0", "-i", file,
        "-f", "s16le", "-ac", "1", "-acodec", "pcm_s16le", "-ar", str(sr), "-",
    ]
    try:
        proc = run(cmd, capture_output=True, check=True)
    except CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
    samples = np.frombuffer(proc.stdout, np.int16)
    # int16 -> float32 normalized by 2**15.
    return mx.array(samples).flatten().astype(mx.float32) / 32768.0
56
+
57
@lru_cache(maxsize=None)
def mel_filters(n_mels):
    """Return the mel filterbank matrix for `n_mels` bins.

    Loaded from the mel_filters.npz shipped next to this script; cached per
    n_mels since the matrix never changes.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    npz_path = os.path.join(here, "mel_filters.npz")
    if not os.path.exists(npz_path):
        raise RuntimeError(
            "Missing mel_filters.npz. This runtime expects mel_filters.npz to be packaged alongside whisper_turbo.py."
        )
    return mx.load(npz_path)[f"mel_{n_mels}"]
66
+
67
@lru_cache(maxsize=None)
def hanning(n_fft):
    """Periodic Hann window of length `n_fft` (numpy's symmetric window minus its last sample)."""
    window = np.hanning(n_fft + 1)
    return mx.array(window[:-1])
70
+
71
def stft(x, window, nperseg=400, noverlap=160, nfft=None, axis=-1, pad_mode="reflect"):
    """Short-time Fourier transform of a 1-D signal.

    Fix: removed the original @lru_cache(maxsize=None). The cache was keyed
    on the entire audio array, so every waveform processed by the process was
    pinned in memory forever (an unbounded leak), and lru_cache requires
    hashable arguments in the first place. Results per call are unchanged.

    Args:
        x: 1-D mx.array signal.
        window: analysis window of length `nperseg`.
        nperseg: frame length in samples.
        noverlap: hop between consecutive frames in samples
            (despite the name it is used as the hop, not the overlap).
        nfft: FFT size; defaults to `nperseg`.
        axis: unused; kept for signature compatibility.
        pad_mode: "reflect" (default) or "constant" edge padding.

    Returns:
        Complex rfft frames, shape (num_frames, nfft // 2 + 1).
    """
    if nfft is None:
        nfft = nperseg
    if noverlap is None:
        noverlap = nfft // 4

    def _pad(x, padding, pad_mode="constant"):
        # Symmetric edge padding so frames are centered on samples.
        if pad_mode == "constant":
            return mx.pad(x, [(padding, padding)])
        elif pad_mode == "reflect":
            prefix = x[1 : padding + 1][::-1]
            suffix = x[-(padding + 1) : -1][::-1]
            return mx.concatenate([prefix, x, suffix])
        else:
            raise ValueError(f"Invalid pad_mode {pad_mode}")

    padding = nperseg // 2
    x = _pad(x, padding, pad_mode)
    strides = [noverlap, 1]
    t = (x.size - nperseg + noverlap) // noverlap
    shape = [t, nfft]
    # Overlapping frame view without copying.
    x = mx.as_strided(x, shape=shape, strides=strides)
    return mx.fft.rfft(x * window)
93
+
94
def log_mel_spectrogram(audio, n_mels=128, padding=480000):
    """Compute Whisper's normalized log-mel spectrogram.

    `audio` may be a file path, an mx.array, or anything mx.array() accepts.
    `padding` appends silence (480000 samples = 30 s at 16 kHz by default).
    """
    if isinstance(audio, str):
        audio = load_audio(audio)
    elif not isinstance(audio, mx.array):
        audio = mx.array(audio)
    if padding > 0:
        audio = mx.pad(audio, (0, padding))
    frames = stft(audio, hanning(400), nperseg=400, noverlap=160)
    power = frames[:-1, :].abs().square()
    mel = power @ mel_filters(n_mels).T
    # Clamp, move to log10, then compress to roughly [-1, ...] as Whisper expects.
    log_spec = mx.maximum(mel, 1e-10).log10()
    log_spec = mx.maximum(log_spec, log_spec.max() - 8.0)
    return (log_spec + 4.0) / 4.0
110
+
111
def sinusoids(length, channels, max_timescale=10000):
    """Fixed sinusoidal positional embeddings of shape (length, channels)."""
    assert channels % 2 == 0
    half = channels // 2
    increment = math.log(max_timescale) / (half - 1)
    timescales = mx.exp(-increment * mx.arange(half))
    angles = mx.arange(length)[:, None] * timescales[None, :]
    return mx.concatenate([mx.sin(angles), mx.cos(angles)], axis=1)
117
+
118
class MultiHeadAttention(nn.Module):
    """Multi-head attention supporting self- and cross-attention with KV caching."""

    def __init__(self, d_model, n_head):
        super().__init__()
        self.n_head = n_head
        self.q_proj = nn.Linear(d_model, d_model)
        self.k_proj = nn.Linear(d_model, d_model, bias=False)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)

    def __call__(self, x, xa=None, mask=None, kv_cache=None):
        q = self.q_proj(x)
        if xa is None:
            # Self-attention: project the new tokens and append to the cache.
            k, v = self.k_proj(x), self.v_proj(x)
            if kv_cache is not None:
                k = mx.concatenate([kv_cache[0], k], axis=1)
                v = mx.concatenate([kv_cache[1], v], axis=1)
        elif kv_cache is None:
            # Cross-attention, first call: project the encoder output once.
            k, v = self.k_proj(xa), self.v_proj(xa)
        else:
            # Cross-attention, later calls: encoder K/V never change; reuse them.
            k, v = kv_cache
        wv, qk = self.qkv_attention(q, k, v, mask)
        return self.out_proj(wv), (k, v), qk

    def qkv_attention(self, q, k, v, mask=None):
        n_batch, n_ctx, n_state = q.shape
        # Split the scale across q and k so neither overflows in fp16.
        scale = (n_state // self.n_head) ** -0.25
        q = q.reshape(n_batch, n_ctx, self.n_head, -1).transpose(0, 2, 1, 3) * scale
        k = k.reshape(*k.shape[:2], self.n_head, -1).transpose(0, 2, 3, 1) * scale
        v = v.reshape(*v.shape[:2], self.n_head, -1).transpose(0, 2, 1, 3)
        scores = q @ k
        if mask is not None:
            scores = scores + mask[:n_ctx, :n_ctx]
        weights = mx.softmax(scores, axis=-1)
        out = (weights @ v).transpose(0, 2, 1, 3).reshape(n_batch, n_ctx, n_state)
        # Second return value is the pre-softmax (masked) score matrix.
        return out, scores
155
+
156
class ResidualAttentionBlock(nn.Module):
    """Transformer block: self-attention, optional cross-attention, then MLP."""

    def __init__(self, d_model, n_head, cross_attention=False):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.self_attn_layer_norm = nn.LayerNorm(d_model)
        self.encoder_attn = MultiHeadAttention(d_model, n_head) if cross_attention else None
        self.encoder_attn_layer_norm = nn.LayerNorm(d_model) if cross_attention else None
        hidden = d_model * 4
        self.fc1 = nn.Linear(d_model, hidden)
        self.fc2 = nn.Linear(hidden, d_model)
        self.final_layer_norm = nn.LayerNorm(d_model)

    def __call__(self, x, xa=None, mask=None, kv_cache=None):
        # kv_cache bundles (self-attn cache, cross-attn cache).
        kv, cross_kv = kv_cache if kv_cache else (None, None)
        y, kv, _ = self.self_attn(self.self_attn_layer_norm(x), mask=mask, kv_cache=kv)
        x = x + y
        cross_qk = None
        if self.encoder_attn:
            y, cross_kv, cross_qk = self.encoder_attn(self.encoder_attn_layer_norm(x), xa, kv_cache=cross_kv)
            x = x + y
        x = x + self.fc2(nn.gelu(self.fc1(self.final_layer_norm(x))))
        return x, (kv, cross_kv), cross_qk
177
+
178
class AudioEncoder(nn.Module):
    """Mel-spectrogram encoder: two convolutions then a transformer stack."""

    def __init__(self, cfg):
        super().__init__()
        d_model = cfg['d_model']
        self.conv1 = nn.Conv1d(cfg['num_mel_bins'], d_model, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(d_model, d_model, kernel_size=3, stride=2, padding=1)
        # Fixed sinusoidal positional table, not loaded from the checkpoint.
        self._positional_embedding = sinusoids(cfg['max_source_positions'], d_model).astype(mx.float16)
        self.layers = [ResidualAttentionBlock(d_model, cfg['encoder_attention_heads']) for _ in range(cfg['encoder_layers'])]
        self.layer_norm = nn.LayerNorm(d_model)

    def __call__(self, x):
        x = nn.gelu(self.conv1(x))
        x = nn.gelu(self.conv2(x))
        x = x + self._positional_embedding
        for layer in self.layers:
            x, _, _ = layer(x)
        return self.layer_norm(x)
194
+
195
class TextDecoder(nn.Module):
    """Token decoder with cross-attention over encoded audio and KV caching."""

    def __init__(self, cfg):
        super().__init__()
        d_model = cfg['d_model']
        self.embed_tokens = nn.Embedding(cfg['vocab_size'], d_model)
        # Learned positional table; zeros here, real values come from the checkpoint.
        self.positional_embedding = mx.zeros((cfg['max_target_positions'], d_model))
        self.layers = [ResidualAttentionBlock(d_model, cfg['decoder_attention_heads'], cross_attention=True) for _ in range(cfg['decoder_layers'])]
        self.layer_norm = nn.LayerNorm(d_model)
        self._mask = nn.MultiHeadAttention.create_additive_causal_mask(cfg['max_target_positions']).astype(mx.float16)

    def __call__(self, x, xa, kv_cache=None):
        # Positional offset = number of tokens already held in the cache.
        offset = kv_cache[0][0][0].shape[1] if kv_cache else 0
        x = self.embed_tokens(x) + self.positional_embedding[offset : offset + x.shape[-1]]
        if kv_cache is None:
            kv_cache = [None] * len(self.layers)
        cross_qk = [None] * len(self.layers)
        for idx, layer in enumerate(self.layers):
            x, kv_cache[idx], cross_qk[idx] = layer(x, xa, mask=self._mask, kv_cache=kv_cache[idx])
        x = self.layer_norm(x)
        # Tied-embedding projection back to vocabulary logits.
        return self.embed_tokens.as_linear(x), kv_cache, cross_qk
213
+
214
class Whisper(nn.Module):
    """Encoder-decoder Whisper model wrapper."""

    def __init__(self, cfg):
        # Fix: super().__init__() was missing; every other nn.Module subclass
        # in this file initializes the base class, and skipping it leaves the
        # Module machinery un-initialized.
        super().__init__()
        self.encoder = AudioEncoder(cfg)
        self.decoder = TextDecoder(cfg)

    def __call__(self, mel, txt):
        """Full forward pass: encode `mel`, return decoder logits for `txt`."""
        return self.decoder(txt, self.encoder(mel))[0]

    def encode(self, mel):
        """Encode a mel spectrogram to audio features."""
        return self.encoder(mel)

    def decode(self, txt, mel, kv_cache):
        """One decoder call; returns (logits, kv_cache, cross_qk)."""
        return self.decoder(txt, mel, kv_cache)
224
+
225
class Transcriber(nn.Module):
    """End-to-end transcription: audio -> log-mel -> Whisper -> token ids -> text.

    Two decoding strategies: `recurrent` walks the audio chunk by chunk using
    predicted timestamp tokens to choose the hop (more accurate); `parallel`
    decodes all 30-second chunks in one batch (faster, used when quick=True).
    """
    # NOTE(review): unlike the other nn.Module subclasses in this file there is
    # no super().__init__() call here — confirm MLX tolerates this.
    def __init__(self, cfg):
        self.model = Whisper(cfg)
        self.tokenizer = Tokenizer()
        # Length of the start-of-transcript prompt; set per call in __call__.
        self.len_sot = 0
    def __call__(self, path_audio, lang="auto", any_lang=None, quick=False):
        """Transcribe `path_audio`.

        Args:
            path_audio: audio file path (or anything log_mel_spectrogram accepts).
            lang: ISO language code, or "auto" to detect from the audio.
            any_lang: deprecated boolean; True -> lang="auto", False -> lang="en".
            quick: use batched `parallel` decoding instead of `recurrent`.

        Returns:
            dict with "text", "avg_logprob", and the resolved "language".
        """
        raw = log_mel_spectrogram(path_audio).astype(mx.float16)

        # Backward compatibility for any_lang boolean
        if any_lang is not None:
            if any_lang:
                lang = "auto"
            else:
                lang = "en"

        # NOTE(review): detected_lang is assigned but never used afterward.
        detected_lang = None
        if lang == "auto" or lang is None:
            lang = self.detect_language(raw)
            detected_lang = lang

        if lang not in LANGUAGES_KEYS:
            print(f"Warning: Language '{lang}' not found, defaulting to English.")
            lang = "en"

        lang_idx = LANGUAGES_KEYS.index(lang)
        # Language token ids start at 50259 in LANGUAGES_KEYS order.
        lang_token = 50259 + lang_idx
        # Prompt: <|startoftranscript|>, language, <|transcribe|>, <|0.00|>.
        sot = mx.array([[50258, lang_token, 50360, 50365]])

        self.len_sot = sot.shape[-1]
        txt, avg_logprob = self.parallel(raw, sot) if quick else self.recurrent(raw, sot)
        return {"text": txt, "avg_logprob": avg_logprob, "language": lang}

    def detect_language(self, raw):
        """Detect the spoken language from the first 30 s of mel frames.

        Decodes a single <|startoftranscript|> token and picks the
        highest-scoring language token from the logits.
        """
        # Take first 30s (3000 frames) or less
        length = min(len(raw), 3000)
        segment = raw[:length][None]  # (1, T, 128)

        # Encode
        audio_features = self.model.encode(segment)

        # Decode [SOT]
        sot = mx.array([[50258]])
        logits, _, _ = self.model.decode(txt=sot, mel=audio_features, kv_cache=None)

        # logits: (1, 1, vocab) -> Take last token logits
        last_logits = logits[0, -1, :]

        # Languages are 50259 to 50358 (100 tokens)
        # Slice to get only language tokens
        lang_logits = last_logits[50259:50359]
        best_lang_idx = mx.argmax(lang_logits).item()

        return LANGUAGES_KEYS[best_lang_idx]

    def recurrent(self, raw, sot):
        """Decode sequentially, advancing by the last predicted timestamp.

        Each 3000-frame (30 s) window is decoded; the largest predicted token
        is assumed to be the final timestamp, which both truncates the kept
        tokens and determines the hop to the next window.
        """
        new_tok, i = mx.zeros((1,0), dtype=mx.int32), 0
        total_logprob = 0.0
        total_tokens = 0

        while i+3000 < len(raw):
            piece, logprob = self.step(raw[i:i+3000][None], sot)

            # Accumulate logprobs (simplified for single segment)
            total_logprob += logprob
            total_tokens += piece.shape[1]

            arg_hop = mx.argmax(piece).item()
            # Timestamp tokens start at 50365 and tick every 0.02 s = 2 frames.
            hop = (piece[:,arg_hop].astype(mx.int32).item()-50365)*2
            new_tok = mx.concatenate([new_tok, piece[:,:arg_hop]], axis=-1)
            i += hop if hop > 0 else 3000

        # Drop special tokens (ids >= 50257) before detokenizing.
        new_tok = [i for i in new_tok.astype(mx.int32).tolist()[0] if i < 50257]
        avg_logprob = total_logprob / max(1, total_tokens)
        return self.tokenizer.decode(new_tok)[0], avg_logprob

    def parallel(self, raw, sot):
        """Decode all full 30 s chunks as one batch (quick mode).

        Trailing frames that do not fill a whole 3000-frame chunk are dropped.
        """
        raw = raw[:(raw.shape[0]//3000)*3000].reshape(-1, 3000, 128)
        # Batch bound; assert is a guard against pathological inputs.
        assert raw.shape[0] < 360
        sot = mx.repeat(sot, raw.shape[0], 0)
        new_tok, avg_logprob = self.step(raw, sot)

        # Per row, cut at the largest token (assumed final timestamp).
        arg_hop = mx.argmax(new_tok, axis=-1).tolist()
        new_tok = [i[:a] for i,a in zip(new_tok.astype(mx.int32).tolist(),arg_hop)]
        new_tok = [i for i in sum(new_tok, []) if i < 50257]
        return self.tokenizer.decode(new_tok)[0], avg_logprob

    def step(self, mel, txt):
        """Greedy-decode one batch of mel windows starting from prompt `txt`.

        Returns (token matrix of shape (B, steps), average logprob).
        """
        mel = self.model.encode(mel)
        kv_cache = None
        B = mel.shape[0]
        new_tok = mx.zeros((B,0), dtype=mx.int32)
        # Per-row "still decoding" flag; zeroed once <|endoftext|> is emitted.
        goon = mx.ones((B,1), dtype=mx.bool_)

        accumulated_logprob = 0.0
        token_count = 0

        # 449 = max target positions budget minus safety; minus the prompt length.
        for i in range(449-self.len_sot):
            logits, kv_cache, _ = self.model.decode(txt=txt, mel=mel, kv_cache=kv_cache)

            # Calculate logprobs
            logprobs = nn.log_softmax(logits[:,-1,:], axis=-1)

            txt = mx.argmax(logits[:,-1,:], axis=-1, keepdims=True) * goon
            mx.eval(txt)

            # Get logprob of selected token
            # We need to gather the logprob corresponding to the selected index
            # MLX doesn't have gather easily in this context, but we can do it via indexing if batch size is small (it is)
            # Simplified: just take max logprob since we are doing argmax
            # NOTE(review): .item() requires a scalar; with quick=True and more
            # than one 30 s chunk B > 1, so this looks like it would fail — confirm.
            selected_logprob = mx.max(logprobs, axis=-1)
            accumulated_logprob += selected_logprob.item() # Taking item() assumes B=1 mostly
            token_count += 1

            # 50257 is <|endoftext|>; finished rows emit zeros from then on.
            goon *= (txt != 50257)
            new_tok = mx.concatenate([new_tok, txt], axis=-1)
            if goon.sum() <= 0:
                break

        avg = accumulated_logprob / max(1, token_count)
        return new_tok, avg
345
+
346
# Process-wide singleton so repeated transcribe() calls reuse the loaded weights.
MODEL_CACHE = None

def load_model():
    """Download (if needed) and build whisper-large-v3-turbo, cached globally."""
    global MODEL_CACHE
    if MODEL_CACHE is not None:
        return MODEL_CACHE

    path_hf = snapshot_download(repo_id='openai/whisper-large-v3-turbo', allow_patterns=["config.json", "model.safetensors"])
    with open(f'{path_hf}/config.json', 'r') as fp:
        cfg = json.load(fp)

    # Rename positional embeddings and put conv weights into MLX channel order.
    weights = []
    for key, val in mx.load(f'{path_hf}/model.safetensors').items():
        if 'conv' in key and val.ndim == 3:
            val = val.swapaxes(1, 2)
        weights.append((key.replace("embed_positions.weight", "positional_embedding"), val))

    model = Transcriber(cfg)
    model.load_weights(weights, strict=False)
    model.eval()
    mx.eval(model)
    MODEL_CACHE = model
    return model
363
+
364
def transcribe(path_audio=None, lang="auto", any_lang=None, quick=False):
    """Transcribe an audio file; with no path, run the benchmark suite instead."""
    if path_audio is None:
        return benchmark()
    model = load_model()
    return model(path_audio=path_audio, lang=lang, any_lang=any_lang, quick=quick)
369
+
370
def benchmark():
    """Transcribe the exurb1a sample set under every mode combination.

    Returns a dict mapping a run description to its wall time in seconds
    (formatted as a string), printing progress along the way.
    """
    path_hf = snapshot_download(repo_id='JosefAlbers/exurb1a', allow_patterns=["*.mp3"])
    tics = {}
    for path_audio in sorted(glob.glob(f"{path_hf}/*.mp3")):
        stem = path_audio.split("/")[-1]
        for any_lang in [True, False]:
            for quick in [True, False]:
                tic = time.perf_counter()
                arg = f'{stem} {any_lang=} {quick=}'
                print(f'--- {arg=}')
                result = transcribe(path_audio=path_audio, any_lang=any_lang, quick=quick)
                print(result["text"])
                tic = f'{(time.perf_counter() - tic):.2f}'
                print(f'{tic=}')
                tics[arg] = tic
    return tics
385
+
386
def fire_main():
    """CLI entry point exposing transcribe() via python-fire.

    `fire` is imported lazily so the module stays importable (e.g. by the
    voice-input worker) even when the optional CLI dependency is absent.
    """
    try:
        import fire
    except ImportError as e:
        raise RuntimeError("fire package is required for whisper_turbo CLI usage") from e
    fire.Fire(transcribe)

if __name__ == '__main__':
    fire_main()
395
+
396
+ # benchmarks:
397
+ # 0_test.mp3 any_lang=True quick=True: 0.85
398
+ # 0_test.mp3 any_lang=True quick=False: 0.75
399
+ # 0_test.mp3 any_lang=False quick=True: 0.78
400
+ # 0_test.mp3 any_lang=False quick=False: 0.77
401
+ # 1_alive.mp3 any_lang=True quick=True: 7.10
402
+ # 1_alive.mp3 any_lang=True quick=False: 7.98
403
+ # 1_alive.mp3 any_lang=False quick=True: 6.57
404
+ # 1_alive.mp3 any_lang=False quick=False: 7.98
405
+ # 2_make.mp3 any_lang=True quick=True: 7.30
406
+ # 2_make.mp3 any_lang=True quick=False: 13.30
407
+ # 2_make.mp3 any_lang=False quick=True: 6.26
408
+ # 2_make.mp3 any_lang=False quick=False: 11.10
409
+ # 3_try.mp3 any_lang=True quick=True: 8.62
410
+ # 3_try.mp3 any_lang=True quick=False: 14.79
411
+ # 3_try.mp3 any_lang=False quick=True: 7.87
412
+ # 3_try.mp3 any_lang=False quick=False: 15.21
413
+ # 4_never.mp3 any_lang=True quick=True: 11.70
414
+ # 4_never.mp3 any_lang=True quick=False: 17.70
415
+ # 4_never.mp3 any_lang=False quick=True: 10.67
416
+ # 4_never.mp3 any_lang=False quick=False: 19.48
@@ -0,0 +1,135 @@
1
+ #!/usr/bin/env python3
2
+ """Voice Input plugin - Python worker for local ASR transcription.
3
+
4
+ Long-running process. Reads JSON commands from stdin, writes JSON responses to stdout.
5
+ Audio arrives as base64-encoded float32 PCM (16kHz mono) — no ffmpeg needed.
6
+ """
7
+ import base64
8
+ import json
9
+ import os
10
+ import struct
11
+ import sys
12
+ import tempfile
13
+ import time
14
+
15
+ import numpy as np
16
+
17
# Directory containing this worker; inserted on sys.path so whisper_turbo.py
# (shipped alongside) can be imported on the MLX path.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Lazy ASR state, populated by load_model() on first warmup/transcribe.
_model = None  # loaded model object for whichever backend is active
_backend = None  # 'mlx' or 'faster_whisper'
_fw_model = None  # WhisperModel instance when the faster_whisper backend is active
21
+
22
+
23
def respond(msg_id, data):
    """Emit one newline-delimited JSON response on stdout and flush immediately."""
    payload = {"id": msg_id, **data}
    sys.stdout.write(json.dumps(payload) + "\n")
    sys.stdout.flush()
26
+
27
+
28
def load_model():
    """Load the ASR backend once: MLX whisper-turbo on macOS, faster-whisper elsewhere.

    Idempotent — subsequent calls return immediately once `_model` is set.
    """
    global _model, _backend, _fw_model
    if _model is not None:
        return

    if sys.platform == "darwin":
        # whisper_turbo.py ships next to this worker.
        sys.path.insert(0, SCRIPT_DIR)
        from whisper_turbo import load_model as _load
        _model = _load()
        _backend = "mlx"
        return

    from faster_whisper import WhisperModel
    _fw_model = WhisperModel("small", device="auto", compute_type="int8")
    _model = _fw_model
    _backend = "faster_whisper"
43
+
44
+
45
def write_wav(pcm_f32):
    """Write float32 PCM to a temp WAV file (16-bit, 16kHz, mono). For faster-whisper.

    Samples are clipped to [-1, 1] then scaled to int16. Returns the file path;
    the caller is responsible for deleting it.
    """
    clipped = np.clip(pcm_f32, -1.0, 1.0)
    samples = (clipped * 32767).astype(np.int16).tobytes()
    n = len(samples)
    # Canonical 44-byte RIFF/WAVE header: PCM (1), mono, 16 kHz,
    # byte rate 32000, block align 2, 16 bits per sample.
    header = struct.pack(
        '<4sI4s4sIHHIIHH4sI',
        b'RIFF', 36 + n, b'WAVE',
        b'fmt ', 16, 1, 1, 16000, 32000, 2, 16,
        b'data', n,
    )
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.write(fd, header + samples)
    os.close(fd)
    return path
60
+
61
+
62
def transcribe(pcm_f32, lang="auto"):
    """Run ASR on float32 PCM; returns a dict with text, avg_logprob, language."""
    if _backend == "mlx":
        from whisper_turbo import transcribe as _transcribe
        # Pass numpy array directly — no file, no ffmpeg
        return _transcribe(path_audio=pcm_f32, lang=lang)

    # faster-whisper needs a file
    path = write_wav(pcm_f32)
    try:
        opts = {"task": "transcribe"}
        if lang and lang != "auto":
            opts["language"] = lang
        segments, info = _fw_model.transcribe(path, **opts)
        segments = list(segments)
        text = "".join(seg.text for seg in segments).strip()
        logprobs = [seg.avg_logprob for seg in segments if seg.avg_logprob is not None]
        return {
            "text": text,
            "avg_logprob": sum(logprobs) / len(logprobs) if logprobs else None,
            "language": getattr(info, "language", "unknown"),
        }
    finally:
        # Best-effort cleanup of the temp WAV.
        try:
            os.remove(path)
        except OSError:
            pass
88
+
89
+
90
def handle(cmd):
    """Dispatch one JSON command from stdin.

    Supported actions: "warmup" (load the model), "transcribe" (base64
    float32 PCM -> text), "status". Every command now gets a reply keyed by
    its id — previously an unknown action was silently dropped, leaving the
    Node side blocked waiting for a response that never came.
    """
    cid = cmd.get("id", "")
    action = cmd.get("action")

    if action == "warmup":
        try:
            load_model()
            respond(cid, {"status": "ready"})
        except Exception as e:
            respond(cid, {"status": "error", "error": str(e)})

    elif action == "transcribe":
        try:
            # Audio arrives as base64-encoded float32 PCM (16 kHz mono).
            raw = base64.b64decode(cmd.get("audio", ""))
            pcm_f32 = np.frombuffer(raw, dtype=np.float32)
            lang = cmd.get("lang", "auto")
            t0 = time.perf_counter()
            result = transcribe(pcm_f32, lang)
            elapsed = time.perf_counter() - t0
            respond(cid, {
                "text": result.get("text", ""),
                "avg_logprob": result.get("avg_logprob"),
                "language": result.get("language", "unknown"),
                "inference_time": round(elapsed, 2),
            })
        except Exception as e:
            respond(cid, {"error": str(e)})

    elif action == "status":
        respond(cid, {"loaded": _model is not None, "backend": _backend})

    else:
        # Fix: reply instead of silently ignoring, so the caller never hangs.
        respond(cid, {"error": f"unknown action: {action}"})
120
+
121
+
122
def main():
    """Worker event loop: announce startup, then process one JSON command per stdin line until EOF."""
    respond("init", {"status": "started"})
    for raw_line in sys.stdin:
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        try:
            handle(json.loads(raw_line))
        except Exception as e:
            # Parse/dispatch failures are reported but never kill the loop.
            respond("error", {"error": str(e)})


if __name__ == "__main__":
    main()
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file
Binary file