clideck 1.22.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +77 -0
- package/activity.js +56 -0
- package/agent-presets.json +93 -0
- package/assets/clideck-themes.jpg +0 -0
- package/bin/clideck.js +2 -0
- package/config.js +96 -0
- package/handlers.js +297 -0
- package/opencode-bridge.js +148 -0
- package/opencode-plugin/clideck-bridge.js +24 -0
- package/package.json +47 -0
- package/paths.js +41 -0
- package/plugin-loader.js +285 -0
- package/plugins/trim-clip/clideck-plugin.json +13 -0
- package/plugins/trim-clip/client.js +31 -0
- package/plugins/trim-clip/index.js +10 -0
- package/plugins/voice-input/clideck-plugin.json +49 -0
- package/plugins/voice-input/client.js +196 -0
- package/plugins/voice-input/index.js +342 -0
- package/plugins/voice-input/python/mel_filters.npz +0 -0
- package/plugins/voice-input/python/whisper_turbo.py +416 -0
- package/plugins/voice-input/python/worker.py +135 -0
- package/public/fx/bold-beep-idle.mp3 +0 -0
- package/public/fx/default-beep.mp3 +0 -0
- package/public/fx/echo-beep-idle.mp3 +0 -0
- package/public/fx/musical-beep-idle.mp3 +0 -0
- package/public/fx/small-bleep-idle.mp3 +0 -0
- package/public/fx/soft-beep.mp3 +0 -0
- package/public/fx/space-idle.mp3 +0 -0
- package/public/img/claude-code.png +0 -0
- package/public/img/clideck-logo-icon.png +0 -0
- package/public/img/clideck-logo-terminal-panel.png +0 -0
- package/public/img/codex.png +0 -0
- package/public/img/gemini.png +0 -0
- package/public/img/opencode.png +0 -0
- package/public/index.html +243 -0
- package/public/js/app.js +794 -0
- package/public/js/color-mode.js +51 -0
- package/public/js/confirm.js +27 -0
- package/public/js/creator.js +201 -0
- package/public/js/drag.js +134 -0
- package/public/js/folder-picker.js +81 -0
- package/public/js/hotkeys.js +90 -0
- package/public/js/nav.js +56 -0
- package/public/js/profiles.js +22 -0
- package/public/js/prompts.js +325 -0
- package/public/js/settings.js +489 -0
- package/public/js/state.js +15 -0
- package/public/js/terminals.js +905 -0
- package/public/js/toast.js +62 -0
- package/public/js/utils.js +27 -0
- package/public/tailwind.css +1 -0
- package/server.js +126 -0
- package/sessions.js +375 -0
- package/telemetry-receiver.js +129 -0
- package/themes.js +247 -0
- package/transcript.js +90 -0
- package/utils.js +66 -0
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
import base64
|
|
2
|
+
import glob
|
|
3
|
+
import json
|
|
4
|
+
import math
|
|
5
|
+
import os
|
|
6
|
+
import time
|
|
7
|
+
from functools import lru_cache
|
|
8
|
+
from subprocess import CalledProcessError, run
|
|
9
|
+
|
|
10
|
+
import mlx.core as mx
|
|
11
|
+
import mlx.nn as nn
|
|
12
|
+
import numpy as np
|
|
13
|
+
import tiktoken
|
|
14
|
+
from huggingface_hub import hf_hub_download, snapshot_download
|
|
15
|
+
|
|
16
|
+
class Tokenizer:
    """Whisper multilingual BPE tokenizer built on tiktoken.

    Loads the rank table packaged next to this file (downloading it from the
    Hugging Face hub on first use if missing), then appends Whisper's special
    tokens after the base vocabulary so their ids line up with the hard-coded
    constants used elsewhere (50257 <|endoftext|>, 50258 <|startoftranscript|>,
    50259+ language tags, 50365+ timestamps).
    """
    def __init__(self):
        base_path = os.path.dirname(os.path.abspath(__file__))
        path_tok = os.path.join(base_path, 'multilingual.tiktoken')
        if not os.path.exists(path_tok):
            path_tok = hf_hub_download(repo_id='JosefAlbers/whisper', filename='multilingual.tiktoken', cache_dir=base_path)
        with open(path_tok) as f:
            # Each line is "<base64 token> <rank>".
            ranks = {base64.b64decode(token): int(rank) for token, rank in (line.split() for line in f if line)}
        n_vocab = len(ranks)
        # Order matters: special token ids are n_vocab + position in this list.
        specials = ["<|endoftext|>", "<|startoftranscript|>", *[f"<|_{lang}|>" for lang in range(100)], "<|translate|>", "<|transcribe|>", "<|startoflm|>", "<|startofprev|>", "<|nospeech|>", "<|notimestamps|>", *[f"<|{i * 0.02:.2f}|>" for i in range(1501)]]
        special_tokens = {k:(n_vocab+i) for i,k in enumerate(specials)}
        self.encoding = tiktoken.Encoding(name='jj', explicit_n_vocab=n_vocab + len(special_tokens), pat_str=r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""", mergeable_ranks=ranks, special_tokens=special_tokens)
    def encode(self, lot):
        """Encode a string or list of strings; always returns a list of token lists."""
        if isinstance(lot, str):
            lot = [lot]
        return [self.encoding.encode(t, allowed_special='all') for t in lot]
    def decode(self, lol):
        """Decode a token list or list of token lists; always returns a list of strings.

        An empty input is treated as one empty token list and decodes to [""]
        (previously `lol[0]` raised IndexError when every token had been
        filtered out upstream).
        """
        if not lol or isinstance(lol[0], int):
            lol = [lol]
        return [self.encoding.decode(l) for l in lol]
|
|
36
|
+
|
|
37
|
+
# ISO language codes for Whisper's 100 supported languages, in token-id order:
# index i corresponds to special token id 50259 + i (see Transcriber.__call__
# and Transcriber.detect_language below). Do not reorder.
LANGUAGES_KEYS = [
    "en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr",
    "pl", "ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi",
    "he", "uk", "el", "ms", "cs", "ro", "da", "hu", "ta", "no",
    "th", "ur", "hr", "bg", "lt", "la", "mi", "ml", "cy", "sk",
    "te", "fa", "lv", "bn", "sr", "az", "sl", "kn", "et", "mk",
    "br", "eu", "is", "hy", "ne", "mn", "bs", "kk", "sq", "sw",
    "gl", "mr", "pa", "si", "km", "sn", "yo", "so", "af", "oc",
    "ka", "be", "tg", "sd", "gu", "am", "yi", "lo", "uz", "fo",
    "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my", "bo", "tl",
    "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su", "yue"
]
|
|
49
|
+
|
|
50
|
+
def load_audio(file, sr=16000):
    """Decode `file` with ffmpeg into mono float32 samples in [-1, 1) at `sr` Hz."""
    cmd = ["ffmpeg", "-nostdin", "-threads", "0", "-i", file, "-f", "s16le", "-ac", "1", "-acodec", "pcm_s16le", "-ar", str(sr), "-"]
    try:
        pcm_bytes = run(cmd, capture_output=True, check=True).stdout
    except CalledProcessError as e:
        raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
    samples = np.frombuffer(pcm_bytes, np.int16)
    # int16 -> float32, normalized by the int16 range.
    return mx.array(samples).flatten().astype(mx.float32) / 32768.0
|
|
56
|
+
|
|
57
|
+
@lru_cache(maxsize=None)
def mel_filters(n_mels):
    """Load the precomputed mel filterbank (`mel_{n_mels}`) shipped next to this file.

    Cached per `n_mels` so the .npz is read at most once per bank size.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    filt_path = os.path.join(here, "mel_filters.npz")
    if not os.path.exists(filt_path):
        raise RuntimeError(
            "Missing mel_filters.npz. This runtime expects mel_filters.npz to be packaged alongside whisper_turbo.py."
        )
    return mx.load(filt_path)[f"mel_{n_mels}"]
|
|
66
|
+
|
|
67
|
+
@lru_cache(maxsize=None)
def hanning(n_fft):
    """Periodic Hann window of length `n_fft` (cached per size)."""
    # np.hanning is symmetric; dropping the last sample makes it periodic.
    full = np.hanning(n_fft + 1)
    return mx.array(full[:-1])
|
|
70
|
+
|
|
71
|
+
def stft(x, window, nperseg=400, noverlap=160, nfft=None, axis=-1, pad_mode="reflect"):
    """Short-time Fourier transform of a 1-D signal.

    The previous `@lru_cache(maxsize=None)` decorator was removed: it keyed the
    cache on the raw audio array, so every distinct recording (and its spectrum)
    stayed alive forever — an unbounded memory leak in a long-running worker —
    and caching on array arguments is hash-fragile to begin with.

    Args:
        x: 1-D mx.array of samples.
        window: analysis window of length `nperseg`.
        nperseg: samples per frame.
        noverlap: hop between frame starts (despite the name, this is used as
            the hop size, not the overlap).
        nfft: FFT size, defaults to `nperseg`.
        axis: unused; kept for signature compatibility.
        pad_mode: "reflect" (default) or "constant" edge padding.

    Returns:
        Complex spectrum of shape (frames, nfft // 2 + 1).
    """
    if nfft is None:
        nfft = nperseg
    if noverlap is None:
        noverlap = nfft // 4
    def _pad(x, padding, pad_mode="constant"):
        # Reflect padding mirrors interior samples, excluding the edge sample itself.
        if pad_mode == "constant":
            return mx.pad(x, [(padding, padding)])
        elif pad_mode == "reflect":
            prefix = x[1 : padding + 1][::-1]
            suffix = x[-(padding + 1) : -1][::-1]
            return mx.concatenate([prefix, x, suffix])
        else:
            raise ValueError(f"Invalid pad_mode {pad_mode}")
    padding = nperseg // 2
    x = _pad(x, padding, pad_mode)
    # Frame the padded signal as overlapping windows via a strided view.
    strides = [noverlap, 1]
    t = (x.size - nperseg + noverlap) // noverlap
    shape = [t, nfft]
    x = mx.as_strided(x, shape=shape, strides=strides)
    return mx.fft.rfft(x * window)
|
|
93
|
+
|
|
94
|
+
def log_mel_spectrogram(audio, n_mels=128, padding=480000):
    """Whisper-style normalized log-Mel spectrogram.

    `audio` may be a file path, an mx.array, or anything mx.array accepts
    (e.g. a numpy float array). Returns an array of shape (frames, n_mels).
    """
    if isinstance(audio, str):
        audio = load_audio(audio)
    elif not isinstance(audio, mx.array):
        audio = mx.array(audio)
    if padding > 0:
        # Pad with silence (30 s at 16 kHz by default) so the tail gets a full window.
        audio = mx.pad(audio, (0, padding))
    spectrum = stft(audio, hanning(400), nperseg=400, noverlap=160)
    power = spectrum[:-1, :].abs().square()
    mel_spec = power @ mel_filters(n_mels).T
    # Log compression with a floor, then Whisper's dynamic-range clamp and scaling.
    log_spec = mx.maximum(mel_spec, 1e-10).log10()
    log_spec = mx.maximum(log_spec, log_spec.max() - 8.0)
    return (log_spec + 4.0) / 4.0
|
|
110
|
+
|
|
111
|
+
def sinusoids(length, channels, max_timescale=10000):
    """Fixed sinusoidal positional embeddings, shape (length, channels)."""
    assert channels % 2 == 0
    half = channels // 2
    # Geometric progression of timescales from 1 to max_timescale.
    increment = math.log(max_timescale) / (half - 1)
    inv_timescales = mx.exp(-increment * mx.arange(half))
    angles = mx.arange(length)[:, None] * inv_timescales[None, :]
    return mx.concatenate([mx.sin(angles), mx.cos(angles)], axis=1)
|
|
117
|
+
|
|
118
|
+
class MultiHeadAttention(nn.Module):
    """Multi-head attention supporting self- and cross-attention with a KV cache."""
    def __init__(self, d_model, n_head):
        super().__init__()
        self.n_head = n_head
        self.q_proj = nn.Linear(d_model, d_model)
        # No bias on the key projection — matches the Whisper checkpoint layout.
        self.k_proj = nn.Linear(d_model, d_model, bias=False)
        self.v_proj = nn.Linear(d_model, d_model)
        self.out_proj = nn.Linear(d_model, d_model)
    def __call__(self, x, xa=None, mask=None, kv_cache=None):
        """Attend over `x` (self-attention) or over encoder output `xa` (cross-attention).

        Returns (output, (k, v) cache for the next step, raw attention scores).
        """
        q = self.q_proj(x)
        if xa is None:
            # Self-attention: project from x and append to any cached K/V.
            k = self.k_proj(x)
            v = self.v_proj(x)
            if kv_cache is not None:
                k = mx.concatenate([kv_cache[0], k], axis=1)
                v = mx.concatenate([kv_cache[1], v], axis=1)
        elif kv_cache is None:
            # Cross-attention, first step: project the encoder output.
            k = self.k_proj(xa)
            v = self.v_proj(xa)
        else:
            # Cross-attention, later steps: encoder output is static, reuse the cache.
            k, v = kv_cache
        wv, qk = self.qkv_attention(q, k, v, mask)
        return self.out_proj(wv), (k, v), qk

    def qkv_attention(self, q, k, v, mask=None):
        """Scaled dot-product attention split across n_head heads."""
        n_batch, n_ctx, n_state = q.shape
        # Scale applied to both q and k: (d_head ** -0.25) twice == d_head ** -0.5.
        scale = (n_state // self.n_head) ** -0.25
        q = q.reshape(*q.shape[:2], self.n_head, -1).transpose(0, 2, 1, 3) * scale
        k = k.reshape(*k.shape[:2], self.n_head, -1).transpose(0, 2, 3, 1) * scale
        v = v.reshape(*v.shape[:2], self.n_head, -1).transpose(0, 2, 1, 3)
        qk = q @ k
        if mask is not None:
            # Additive causal mask, cropped to the current query length.
            qk = qk + mask[:n_ctx, :n_ctx]
        w = mx.softmax(qk, axis=-1)
        out = (w @ v).transpose(0, 2, 1, 3)
        out = out.reshape(n_batch, n_ctx, n_state)
        return out, qk
|
|
155
|
+
|
|
156
|
+
class ResidualAttentionBlock(nn.Module):
    """Transformer block: self-attention, optional cross-attention, MLP — all residual."""
    def __init__(self, d_model, n_head, cross_attention=False):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, n_head)
        self.self_attn_layer_norm = nn.LayerNorm(d_model)
        # Cross-attention modules exist only on decoder blocks.
        self.encoder_attn = MultiHeadAttention(d_model, n_head) if cross_attention else None
        self.encoder_attn_layer_norm = nn.LayerNorm(d_model) if cross_attention else None
        n_mlp = d_model * 4
        self.fc1 = nn.Linear(d_model, n_mlp)
        self.fc2 = nn.Linear(n_mlp, d_model)
        self.final_layer_norm = nn.LayerNorm(d_model)
    def __call__(self, x, xa=None, mask=None, kv_cache=None):
        """Run the block; `kv_cache` is a (self_kv, cross_kv) pair or None.

        Returns (output, (self_kv, cross_kv), cross-attention scores or None).
        """
        kv, cross_kv = kv_cache if kv_cache else (None, None)
        # Pre-norm self-attention with residual add.
        y, kv, _ = self.self_attn(self.self_attn_layer_norm(x), mask=mask, kv_cache=kv)
        x += y
        cross_qk = None
        if self.encoder_attn:
            # Pre-norm cross-attention against the encoder output `xa`.
            y, cross_kv, cross_qk = self.encoder_attn(self.encoder_attn_layer_norm(x), xa, kv_cache=cross_kv)
            x += y
        # Pre-norm MLP with residual add.
        x = x + self.fc2(nn.gelu(self.fc1(self.final_layer_norm(x))))
        return x, (kv, cross_kv), cross_qk
|
|
177
|
+
|
|
178
|
+
class AudioEncoder(nn.Module):
    """Whisper audio encoder: two convs over the mel spectrogram, then transformer layers."""
    def __init__(self, cfg):
        super().__init__()
        self.conv1 = nn.Conv1d(cfg['num_mel_bins'], cfg['d_model'], kernel_size=3, padding=1)
        # stride=2 halves the time axis before the transformer stack.
        self.conv2 = nn.Conv1d(cfg['d_model'], cfg['d_model'], kernel_size=3, stride=2, padding=1)
        # Fixed sinusoidal positions, computed here rather than loaded from the
        # checkpoint; cast to float16 to match the activations.
        self._positional_embedding = sinusoids(cfg['max_source_positions'], cfg['d_model']).astype(mx.float16)
        self.layers = [ResidualAttentionBlock(cfg['d_model'], cfg['encoder_attention_heads']) for _ in range(cfg['encoder_layers'])]
        self.layer_norm = nn.LayerNorm(cfg['d_model'])
    def __call__(self, x):
        """Encode mel frames `x` into audio features for the decoder's cross-attention."""
        x = nn.gelu(self.conv1(x))
        x = nn.gelu(self.conv2(x))
        # NOTE(review): this add assumes the post-conv sequence length equals
        # max_source_positions (i.e. inputs are full 3000-frame windows) — confirm
        # that shorter inputs are never passed here.
        x = x + self._positional_embedding
        for block in self.layers:
            x, _, _ = block(x)
        x = self.layer_norm(x)
        return x
|
|
194
|
+
|
|
195
|
+
class TextDecoder(nn.Module):
    """Whisper text decoder with per-layer KV caching and an additive causal mask."""
    def __init__(self, cfg):
        super().__init__()
        self.embed_tokens = nn.Embedding(cfg['vocab_size'], cfg['d_model'])
        # Learned positions; the zeros here are placeholders overwritten by
        # load_weights (load_model renames "embed_positions.weight" to this name).
        self.positional_embedding = mx.zeros((cfg['max_target_positions'], cfg['d_model']))
        self.layers = [ResidualAttentionBlock(cfg['d_model'], cfg['decoder_attention_heads'], cross_attention=True) for _ in range(cfg['decoder_layers'])]
        self.layer_norm = nn.LayerNorm(cfg['d_model'])
        # Underscore prefix keeps the constant mask out of the loadable parameters.
        self._mask = nn.MultiHeadAttention.create_additive_causal_mask(cfg['max_target_positions']).astype(mx.float16)
    def __call__(self, x, xa, kv_cache=None):
        """Decode tokens `x` against encoder features `xa`.

        Returns (vocab logits, updated per-layer kv_cache, per-layer cross-attention scores).
        """
        # Position offset = number of tokens already decoded, read from the
        # first layer's cached self-attention keys (axis 1 is sequence length).
        offset = kv_cache[0][0][0].shape[1] if kv_cache else 0
        x = self.embed_tokens(x) + self.positional_embedding[offset : offset + x.shape[-1]]
        if kv_cache is None:
            kv_cache = [None] * len(self.layers)
        cross_qk = [None] * len(self.layers)
        for e, block in enumerate(self.layers):
            x, kv_cache[e], cross_qk[e] = block(x, xa, mask=self._mask, kv_cache=kv_cache[e])
        x = self.layer_norm(x)
        # Tied embeddings: project hidden states back to vocabulary logits.
        return self.embed_tokens.as_linear(x), kv_cache, cross_qk
|
|
213
|
+
|
|
214
|
+
class Whisper(nn.Module):
    """Encoder-decoder pair with thin convenience wrappers around each half."""
    def __init__(self, cfg):
        # Was missing: every other nn.Module subclass in this file initializes
        # the base class; do the same here so module bookkeeping is consistent.
        super().__init__()
        self.encoder = AudioEncoder(cfg)
        self.decoder = TextDecoder(cfg)
    def __call__(self, mel, txt):
        """Teacher-forced forward pass: logits for `txt` conditioned on `mel`."""
        return self.decoder(txt, self.encoder(mel))[0]
    def encode(self, mel):
        """Run only the audio encoder."""
        return self.encoder(mel)
    def decode(self, txt, mel, kv_cache):
        """Run only the text decoder against precomputed encoder features `mel`."""
        return self.decoder(txt, mel, kv_cache)
|
|
224
|
+
|
|
225
|
+
class Transcriber(nn.Module):
    """High-level pipeline: mel extraction, language handling, greedy decoding.

    `__call__` returns {"text": str, "avg_logprob": float, "language": str}.
    """
    def __init__(self, cfg):
        # Was missing: initialize the nn.Module base like the other subclasses here.
        super().__init__()
        self.model = Whisper(cfg)
        self.tokenizer = Tokenizer()
        self.len_sot = 0  # length of the start-of-transcript prompt, set per call
    def __call__(self, path_audio, lang="auto", any_lang=None, quick=False):
        """Transcribe an audio file (or raw samples accepted by log_mel_spectrogram).

        Args:
            path_audio: file path or raw sample array.
            lang: ISO language code, or "auto" to detect from the audio.
            any_lang: deprecated boolean alias (True -> "auto", False -> "en").
            quick: decode all 30 s windows in one batch instead of sequentially.
        """
        raw = log_mel_spectrogram(path_audio).astype(mx.float16)

        # Backward compatibility for the deprecated any_lang boolean.
        if any_lang is not None:
            lang = "auto" if any_lang else "en"

        if lang == "auto" or lang is None:
            lang = self.detect_language(raw)

        if lang not in LANGUAGES_KEYS:
            print(f"Warning: Language '{lang}' not found, defaulting to English.")
            lang = "en"

        lang_idx = LANGUAGES_KEYS.index(lang)
        lang_token = 50259 + lang_idx
        # <|startoftranscript|>, <|lang|>, <|transcribe|>, <|0.00|>
        sot = mx.array([[50258, lang_token, 50360, 50365]])

        self.len_sot = sot.shape[-1]
        txt, avg_logprob = self.parallel(raw, sot) if quick else self.recurrent(raw, sot)
        return {"text": txt, "avg_logprob": avg_logprob, "language": lang}

    def detect_language(self, raw):
        """Greedy language ID: the language token the model ranks highest after <|sot|>."""
        # Use at most the first 30 s (3000 mel frames).
        length = min(len(raw), 3000)
        segment = raw[:length][None]  # (1, T, 128)
        audio_features = self.model.encode(segment)
        # Decode a single <|startoftranscript|> token.
        sot = mx.array([[50258]])
        logits, _, _ = self.model.decode(txt=sot, mel=audio_features, kv_cache=None)
        last_logits = logits[0, -1, :]
        # Language tokens occupy ids 50259..50358 (100 languages).
        lang_logits = last_logits[50259:50359]
        best_lang_idx = mx.argmax(lang_logits).item()
        return LANGUAGES_KEYS[best_lang_idx]

    def recurrent(self, raw, sot):
        """Decode 30 s windows sequentially, advancing by the last predicted timestamp."""
        new_tok, i = mx.zeros((1,0), dtype=mx.int32), 0
        total_logprob = 0.0
        total_tokens = 0
        while i+3000 < len(raw):
            piece, logprob = self.step(raw[i:i+3000][None], sot)
            total_logprob += logprob
            total_tokens += piece.shape[1]
            # The largest token id is the final timestamp token; convert it to a
            # frame hop (each timestamp unit spans 2 mel frames).
            arg_hop = mx.argmax(piece).item()
            hop = (piece[:,arg_hop].astype(mx.int32).item()-50365)*2
            new_tok = mx.concatenate([new_tok, piece[:,:arg_hop]], axis=-1)
            i += hop if hop > 0 else 3000
        # Drop special tokens (ids >= 50257) before decoding to text.
        new_tok = [i for i in new_tok.astype(mx.int32).tolist()[0] if i < 50257]
        avg_logprob = total_logprob / max(1, total_tokens)
        return self.tokenizer.decode(new_tok)[0], avg_logprob

    def parallel(self, raw, sot):
        """Decode all whole 30 s windows in one batch (faster; drops any partial tail)."""
        raw = raw[:(raw.shape[0]//3000)*3000].reshape(-1, 3000, 128)
        assert raw.shape[0] < 360
        sot = mx.repeat(sot, raw.shape[0], 0)
        new_tok, avg_logprob = self.step(raw, sot)
        # Truncate each row at its final timestamp token, then strip specials.
        arg_hop = mx.argmax(new_tok, axis=-1).tolist()
        new_tok = [i[:a] for i,a in zip(new_tok.astype(mx.int32).tolist(),arg_hop)]
        new_tok = [i for i in sum(new_tok, []) if i < 50257]
        return self.tokenizer.decode(new_tok)[0], avg_logprob

    def step(self, mel, txt):
        """Greedy autoregressive decode of one (possibly batched) mel segment.

        Returns (token id array of shape (B, steps), average logprob of the
        chosen tokens).
        """
        mel = self.model.encode(mel)
        kv_cache = None
        B = mel.shape[0]
        new_tok = mx.zeros((B,0), dtype=mx.int32)
        goon = mx.ones((B,1), dtype=mx.bool_)  # per-row "still decoding" flag
        accumulated_logprob = 0.0
        token_count = 0
        # 448 is the decoder context limit; leave room for the prompt tokens.
        for i in range(449-self.len_sot):
            logits, kv_cache, _ = self.model.decode(txt=txt, mel=mel, kv_cache=kv_cache)
            logprobs = nn.log_softmax(logits[:,-1,:], axis=-1)
            # Greedy pick; finished rows are forced to token 0 by `goon`.
            txt = mx.argmax(logits[:,-1,:], axis=-1, keepdims=True) * goon
            mx.eval(txt)
            # Since the pick is argmax, max(logprobs) is the chosen token's logprob.
            # Mean over the batch keeps this valid for B > 1 (quick mode with long
            # audio); the previous bare `.item()` assumed B == 1 and broke on
            # multi-row batches. For B == 1 the mean is the same value.
            selected_logprob = mx.max(logprobs, axis=-1)
            accumulated_logprob += selected_logprob.mean().item()
            token_count += 1
            goon *= (txt != 50257)  # 50257 == <|endoftext|>
            new_tok = mx.concatenate([new_tok, txt], axis=-1)
            if goon.sum() <= 0:
                break
        avg = accumulated_logprob / max(1, token_count)
        return new_tok, avg
|
|
345
|
+
|
|
346
|
+
MODEL_CACHE = None

def load_model():
    """Build and cache the whisper-large-v3-turbo Transcriber (process-wide singleton)."""
    global MODEL_CACHE
    if MODEL_CACHE is not None:
        return MODEL_CACHE

    repo_dir = snapshot_download(repo_id='openai/whisper-large-v3-turbo', allow_patterns=["config.json", "model.safetensors"])
    with open(f'{repo_dir}/config.json', 'r') as cfg_file:
        cfg = json.load(cfg_file)
    # Adapt checkpoint names/layout: decoder positions get the local attribute
    # name, and conv kernels are transposed to MLX's channel-last convention.
    weights = []
    for key, value in mx.load(f'{repo_dir}/model.safetensors').items():
        if 'conv' in key and value.ndim == 3:
            value = value.swapaxes(1, 2)
        weights.append((key.replace("embed_positions.weight", "positional_embedding"), value))
    model = Transcriber(cfg)
    model.load_weights(weights, strict=False)
    model.eval()
    mx.eval(model)
    MODEL_CACHE = model
    return model
|
|
363
|
+
|
|
364
|
+
def transcribe(path_audio=None, lang="auto", any_lang=None, quick=False):
    """Transcribe `path_audio`; with no path, run the benchmark suite instead."""
    if path_audio is None:
        return benchmark()
    return load_model()(path_audio=path_audio, lang=lang, any_lang=any_lang, quick=quick)
|
|
369
|
+
|
|
370
|
+
def benchmark():
    """Time transcribe() over sample clips for every (any_lang, quick) combination.

    Downloads a small mp3 set from the Hugging Face hub, prints each transcript,
    and returns a dict mapping the run description to its elapsed-seconds string.
    """
    path_hf = snapshot_download(repo_id='JosefAlbers/exurb1a', allow_patterns=["*.mp3"])
    tics = {}
    for path_audio in sorted(glob.glob(f"{path_hf}/*.mp3")):
        for any_lang in [True, False]:
            for quick in [True, False]:
                tic = time.perf_counter()
                # f"{any_lang=}" embeds the variable *name* in the output, so these
                # locals' names are part of the printed/returned keys.
                arg = f'{path_audio.split("/")[-1]} {any_lang=} {quick=}'
                print(f'--- {arg=}')
                result = transcribe(path_audio=path_audio, any_lang=any_lang, quick=quick)
                print(result["text"])
                # `tic` is rebound from a timestamp to the formatted duration string.
                tic = f'{(time.perf_counter() - tic):.2f}'
                print(f'{tic=}')
                tics[arg] = tic
    return tics
|
|
385
|
+
|
|
386
|
+
def fire_main():
    """CLI entry point: expose transcribe() through python-fire, if installed."""
    try:
        import fire
    except ImportError as err:
        raise RuntimeError("fire package is required for whisper_turbo CLI usage") from err
    fire.Fire(transcribe)
|
|
392
|
+
|
|
393
|
+
# CLI entry: `python whisper_turbo.py <audio>` (no args runs the benchmark).
if __name__ == '__main__':
    fire_main()
|
|
395
|
+
|
|
396
|
+
# benchmarks:
|
|
397
|
+
# 0_test.mp3 any_lang=True quick=True: 0.85
|
|
398
|
+
# 0_test.mp3 any_lang=True quick=False: 0.75
|
|
399
|
+
# 0_test.mp3 any_lang=False quick=True: 0.78
|
|
400
|
+
# 0_test.mp3 any_lang=False quick=False: 0.77
|
|
401
|
+
# 1_alive.mp3 any_lang=True quick=True: 7.10
|
|
402
|
+
# 1_alive.mp3 any_lang=True quick=False: 7.98
|
|
403
|
+
# 1_alive.mp3 any_lang=False quick=True: 6.57
|
|
404
|
+
# 1_alive.mp3 any_lang=False quick=False: 7.98
|
|
405
|
+
# 2_make.mp3 any_lang=True quick=True: 7.30
|
|
406
|
+
# 2_make.mp3 any_lang=True quick=False: 13.30
|
|
407
|
+
# 2_make.mp3 any_lang=False quick=True: 6.26
|
|
408
|
+
# 2_make.mp3 any_lang=False quick=False: 11.10
|
|
409
|
+
# 3_try.mp3 any_lang=True quick=True: 8.62
|
|
410
|
+
# 3_try.mp3 any_lang=True quick=False: 14.79
|
|
411
|
+
# 3_try.mp3 any_lang=False quick=True: 7.87
|
|
412
|
+
# 3_try.mp3 any_lang=False quick=False: 15.21
|
|
413
|
+
# 4_never.mp3 any_lang=True quick=True: 11.70
|
|
414
|
+
# 4_never.mp3 any_lang=True quick=False: 17.70
|
|
415
|
+
# 4_never.mp3 any_lang=False quick=True: 10.67
|
|
416
|
+
# 4_never.mp3 any_lang=False quick=False: 19.48
|
|
@@ -0,0 +1,135 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Voice Input plugin - Python worker for local ASR transcription.
|
|
3
|
+
|
|
4
|
+
Long-running process. Reads JSON commands from stdin, writes JSON responses to stdout.
|
|
5
|
+
Audio arrives as base64-encoded float32 PCM (16kHz mono) — no ffmpeg needed.
|
|
6
|
+
"""
|
|
7
|
+
import base64
|
|
8
|
+
import json
|
|
9
|
+
import os
|
|
10
|
+
import struct
|
|
11
|
+
import sys
|
|
12
|
+
import tempfile
|
|
13
|
+
import time
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
|
|
17
|
+
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
|
18
|
+
_model = None
|
|
19
|
+
_backend = None # 'mlx' or 'faster_whisper'
|
|
20
|
+
_fw_model = None
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def respond(msg_id, data):
    """Emit one newline-delimited JSON response on stdout and flush immediately."""
    payload = {"id": msg_id, **data}
    stream = sys.stdout
    stream.write(json.dumps(payload) + "\n")
    stream.flush()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def load_model():
    """Load the ASR model once: MLX Whisper turbo on macOS, faster-whisper elsewhere.

    Populates the module-level _model/_backend/_fw_model globals; returns
    immediately on repeat calls.
    """
    global _model, _backend, _fw_model
    if _model is not None:
        return

    if sys.platform == "darwin":
        # macOS path: import the bundled MLX whisper_turbo module from this directory.
        sys.path.insert(0, SCRIPT_DIR)
        from whisper_turbo import load_model as _load
        _model = _load()
        _backend = "mlx"
    else:
        # Portable fallback. NOTE(review): assumes faster-whisper is installed on
        # non-macOS hosts — confirm the plugin's install step provides it.
        from faster_whisper import WhisperModel
        _fw_model = WhisperModel("small", device="auto", compute_type="int8")
        _model = _fw_model
        _backend = "faster_whisper"
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def write_wav(pcm_f32):
    """Persist float32 PCM samples as a 16-bit, 16 kHz, mono WAV temp file.

    Returns the temp file path; the caller is responsible for deleting it.
    Used only by the faster-whisper backend, which requires a file on disk.
    """
    fd, path = tempfile.mkstemp(suffix=".wav")
    clipped = np.clip(pcm_f32, -1.0, 1.0)
    samples = (clipped * 32767).astype(np.int16).tobytes()
    size = len(samples)
    # Canonical 44-byte RIFF/WAVE header: PCM (1), mono, 16000 Hz,
    # byte rate 32000, block align 2, 16 bits per sample.
    header = struct.pack(
        '<4sI4s4sIHHIIHH4sI',
        b'RIFF', 36 + size, b'WAVE',
        b'fmt ', 16, 1, 1, 16000, 32000, 2, 16,
        b'data', size,
    )
    os.write(fd, header + samples)
    os.close(fd)
    return path
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def transcribe(pcm_f32, lang="auto"):
    """Transcribe float32 PCM (16 kHz mono) using whichever backend load_model picked.

    Returns a dict with "text", "avg_logprob", and "language".
    """
    if _backend == "mlx":
        from whisper_turbo import transcribe as _transcribe
        # The MLX path accepts raw samples directly — no temp file, no ffmpeg.
        return _transcribe(path_audio=pcm_f32, lang=lang)

    # faster-whisper only consumes files, so round-trip through a temp WAV.
    wav_path = write_wav(pcm_f32)
    try:
        options = {"task": "transcribe"}
        if lang and lang != "auto":
            options["language"] = lang
        segments, info = _fw_model.transcribe(wav_path, **options)
        segments = list(segments)
        text = "".join(seg.text for seg in segments).strip()
        logprobs = [seg.avg_logprob for seg in segments if seg.avg_logprob is not None]
        avg = sum(logprobs) / len(logprobs) if logprobs else None
        return {
            "text": text,
            "avg_logprob": avg,
            "language": getattr(info, "language", "unknown"),
        }
    finally:
        # Best-effort cleanup of the temp file.
        try:
            os.remove(wav_path)
        except OSError:
            pass
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def handle(cmd):
    """Dispatch one parsed JSON command: "warmup", "transcribe", or "status".

    Unknown actions are ignored silently (protocol behavior preserved).
    """
    cid = cmd.get("id", "")
    action = cmd.get("action")

    if action == "warmup":
        try:
            load_model()
            respond(cid, {"status": "ready"})
        except Exception as exc:
            respond(cid, {"status": "error", "error": str(exc)})

    elif action == "transcribe":
        try:
            # Audio arrives as base64-encoded raw float32 PCM.
            pcm = np.frombuffer(base64.b64decode(cmd.get("audio", "")), dtype=np.float32)
            started = time.perf_counter()
            result = transcribe(pcm, cmd.get("lang", "auto"))
            took = time.perf_counter() - started
            respond(cid, {
                "text": result.get("text", ""),
                "avg_logprob": result.get("avg_logprob"),
                "language": result.get("language", "unknown"),
                "inference_time": round(took, 2),
            })
        except Exception as exc:
            respond(cid, {"error": str(exc)})

    elif action == "status":
        respond(cid, {"loaded": _model is not None, "backend": _backend})
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def main():
    """Command loop: announce startup, then process one JSON command per stdin line."""
    respond("init", {"status": "started"})
    for raw_line in sys.stdin:
        text = raw_line.strip()
        if not text:
            continue
        try:
            handle(json.loads(text))
        except Exception as exc:
            # Malformed JSON or an unexpected handler failure: report and keep serving.
            respond("error", {"error": str(exc)})
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Run the stdin/stdout command loop when invoked directly.
if __name__ == "__main__":
    main()
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|