ai-or-die 0.1.71 → 0.1.72

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ai-or-die",
3
- "version": "0.1.71",
3
+ "version": "0.1.72",
4
4
  "description": "Universal AI coding terminal — Claude, Copilot, Gemini & more in your browser",
5
5
  "main": "src/server.js",
6
6
  "bin": {
package/src/public/app.js CHANGED
@@ -1386,8 +1386,19 @@ class ClaudeCodeWebInterface {
1386
1386
 
1387
1387
  this.voiceController = new window.VoiceHandler.VoiceInputController({
1388
1388
  mode: this.voiceMode,
1389
+ // Refuse a new recording while a previous transcription is still
1390
+ // pending (single timeout slot + no correlation id — overlapping
1391
+ // uploads would clobber each other's spinner/timeout).
1392
+ canStart: function () {
1393
+ return !self._voiceTranscriptionTimeout;
1394
+ },
1389
1395
  onRecordingStart: function () {
1390
1396
  self._playMicChime('on');
1397
+ // Suspend the heartbeat pong-timeout while capturing: the main
1398
+ // thread can be busy enough (esp. the ScriptProcessor fallback)
1399
+ // to miss a pong, which would otherwise force a spurious reconnect.
1400
+ self._voiceRecordingActive = true;
1401
+ if (self._heartbeat) self._heartbeat.pause();
1391
1402
  btn.classList.add('recording');
1392
1403
  btn.classList.remove('processing');
1393
1404
  btn.setAttribute('aria-pressed', 'true');
@@ -1415,6 +1426,8 @@ class ClaudeCodeWebInterface {
1415
1426
  },
1416
1427
  onRecordingStop: function (result) {
1417
1428
  self._playMicChime('off');
1429
+ self._voiceRecordingActive = false;
1430
+ if (self._heartbeat) self._heartbeat.resume();
1418
1431
  btn.classList.remove('recording');
1419
1432
  btn.setAttribute('aria-pressed', 'false');
1420
1433
  btn.title = 'Voice Input (Ctrl+Shift+M)';
@@ -1425,21 +1438,33 @@ class ClaudeCodeWebInterface {
1425
1438
  }
1426
1439
 
1427
1440
  if (self.voiceMode === 'local' && result && result.samples) {
1441
+ // Guard against a zero-sample recording (would send a
1442
+ // header-only frame the server rejects as "too short").
1443
+ if (!result.samples.byteLength || result.samples.byteLength < 2) {
1444
+ btn.classList.remove('processing');
1445
+ if (window.feedback) window.feedback.error('No audio captured');
1446
+ return;
1447
+ }
1448
+
1428
1449
  btn.classList.add('processing');
1429
- // Convert Int16 PCM to base64 efficiently (chunked to avoid call stack overflow)
1430
- var pcmBytes = new Uint8Array(result.samples.buffer);
1431
- var CHUNK_SIZE = 8192;
1432
- var parts = [];
1433
- for (var i = 0; i < pcmBytes.length; i += CHUNK_SIZE) {
1434
- var chunk = pcmBytes.subarray(i, Math.min(i + CHUNK_SIZE, pcmBytes.length));
1435
- parts.push(String.fromCharCode.apply(null, chunk));
1450
+
1451
+ // Send raw Int16 PCM as a tagged binary WS frame (no base64 —
1452
+ // base64's 33% inflation is what pushed long clips past the
1453
+ // 1 MiB frame guard and crashed the page).
1454
+ var frame = window.VoiceFrame.buildVoiceFrame(result.samples);
1455
+ var sent = self.sendBinary(frame);
1456
+
1457
+ if (!sent) {
1458
+ // Socket not OPEN (e.g. mid-reconnect): fail fast instead of
1459
+ // silently dropping the frame and hanging the spinner 90 s.
1460
+ btn.classList.remove('processing');
1461
+ var notSentMsg = 'Connection not ready — recording not sent';
1462
+ if (window.feedback) window.feedback.error(notSentMsg);
1463
+ if (self.terminal) {
1464
+ self.terminal.write('\r\n\x1b[31m[Voice error] ' + notSentMsg + '\x1b[0m\r\n');
1465
+ }
1466
+ return;
1436
1467
  }
1437
- var base64Audio = btoa(parts.join(''));
1438
- self.send({
1439
- type: 'voice_upload',
1440
- audio: base64Audio,
1441
- durationMs: result.durationMs
1442
- });
1443
1468
 
1444
1469
  // Client-side timeout for transcription processing (90 seconds)
1445
1470
  self._voiceTranscriptionTimeout = setTimeout(function () {
@@ -1467,6 +1492,8 @@ class ClaudeCodeWebInterface {
1467
1492
  self._deliverVoiceTranscription(text);
1468
1493
  },
1469
1494
  onError: function (err) {
1495
+ self._voiceRecordingActive = false;
1496
+ if (self._heartbeat) self._heartbeat.resume();
1470
1497
  btn.classList.remove('recording', 'processing');
1471
1498
  btn.setAttribute('aria-pressed', 'false');
1472
1499
  btn.title = 'Voice Input (Ctrl+Shift+M)';
@@ -1497,6 +1524,8 @@ class ClaudeCodeWebInterface {
1497
1524
  }
1498
1525
  },
1499
1526
  onCancel: function () {
1527
+ self._voiceRecordingActive = false;
1528
+ if (self._heartbeat) self._heartbeat.resume();
1500
1529
  btn.classList.remove('recording', 'processing');
1501
1530
  btn.setAttribute('aria-pressed', 'false');
1502
1531
  btn.title = 'Voice Input (Ctrl+Shift+M)';
@@ -2043,6 +2072,32 @@ class ClaudeCodeWebInterface {
2043
2072
  if (this._heartbeat) { this._heartbeat.stop(); this._heartbeat = null; }
2044
2073
  if (this._heartbeatTimer) { clearInterval(this._heartbeatTimer); this._heartbeatTimer = null; }
2045
2074
  if (this._pongTimer) { clearTimeout(this._pongTimer); this._pongTimer = null; }
2075
+
2076
+ // A close mid-transcription must not leave the mic spinner + its
2077
+ // 90 s timeout hanging.
2078
+ if (this._voiceTranscriptionTimeout) {
2079
+ clearTimeout(this._voiceTranscriptionTimeout);
2080
+ this._voiceTranscriptionTimeout = null;
2081
+ }
2082
+ this._voiceRecordingActive = false;
2083
+ const voiceBtn = document.getElementById('voiceInputBtn');
2084
+ if (voiceBtn) voiceBtn.classList.remove('processing');
2085
+
2086
+ // Log the close code so field reports can tell a server frame
2087
+ // rejection (1009/1003, at stop) from a heartbeat pong-timeout
2088
+ // (4000, mid-recording).
2089
+ console.warn('[ws] closed', event.code, event.reason || '');
2090
+
2091
+ // 1009/1003 are server-initiated CLEAN closes (wasClean=true): the
2092
+ // server rejected our frame. Surface a specific message and still
2093
+ // reconnect below, instead of dead-ending on "refresh the page".
2094
+ const voiceClose = (window.VoiceFrame && window.VoiceFrame.classifyVoiceClose)
2095
+ ? window.VoiceFrame.classifyVoiceClose(event.code)
2096
+ : { rejected: false, message: null };
2097
+ if (voiceClose.rejected && window.feedback) {
2098
+ window.feedback.error(voiceClose.message);
2099
+ }
2100
+
2046
2101
  // During server restart, don't count failures against reconnect budget
2047
2102
  // but still use backoff to avoid thundering herd
2048
2103
  if (this._serverRestarting) {
@@ -2056,7 +2111,7 @@ class ClaudeCodeWebInterface {
2056
2111
  if (restartGen !== this._socketGeneration) return;
2057
2112
  this.reconnect();
2058
2113
  }, restartBackoff);
2059
- } else if (!event.wasClean && this.reconnectAttempts < this.maxReconnectAttempts) {
2114
+ } else if ((!event.wasClean || voiceClose.rejected) && this.reconnectAttempts < this.maxReconnectAttempts) {
2060
2115
  this.updateStatus('Reconnecting (' + (this.reconnectAttempts + 1) + '/' + this.maxReconnectAttempts + ')...');
2061
2116
  // First attempt is fast (250ms covers a server-process restart window);
2062
2117
  // subsequent attempts use exponential backoff with jitter.
@@ -2174,6 +2229,17 @@ class ClaudeCodeWebInterface {
2174
2229
  }
2175
2230
  }
2176
2231
 
2232
+ // Send a binary WS frame (e.g. a voice PCM frame). Returns true if it was
2233
+ // handed to an OPEN socket, false otherwise so the caller can react to a
2234
+ // closed/closing socket instead of silently dropping the frame.
2235
+ sendBinary(view) {
2236
+ if (this.socket && this.socket.readyState === WebSocket.OPEN) {
2237
+ this.socket.send(view);
2238
+ return true;
2239
+ }
2240
+ return false;
2241
+ }
2242
+
2177
2243
  _handleStickyNoteUpdate(message) {
2178
2244
  if (!message || !message.sessionId) return;
2179
2245
  const sm = this.sessionTabManager;
@@ -4368,6 +4434,9 @@ class ClaudeCodeWebInterface {
4368
4434
  log: (m) => console.warn('[heartbeat]', m),
4369
4435
  });
4370
4436
  this._heartbeat.start();
4437
+ // If a recording is in progress (e.g. this heartbeat was re-created after
4438
+ // a reconnect mid-recording), keep pong-timeout enforcement suspended.
4439
+ if (this._voiceRecordingActive) this._heartbeat.pause();
4371
4440
  // Keep _heartbeatTimer/_pongTimer references in sync for legacy code
4372
4441
  // (disconnect() still nulls them defensively); the watchdog owns the
4373
4442
  // real timer lifecycle via stop().
@@ -59,6 +59,10 @@
59
59
  this._clearTimeout = t.clearTimeout || ((id) => clearTimeout(id));
60
60
  this._heartbeatTimer = null;
61
61
  this._pongTimer = null;
62
+ // When paused (e.g. during mic recording), pings still go out but a
63
+ // missed pong does NOT force a reconnect — the client main thread can
64
+ // be busy capturing audio and briefly stop servicing the pong.
65
+ this._paused = false;
62
66
  }
63
67
 
64
68
  _isStale() {
@@ -75,6 +79,9 @@
75
79
  } catch (_) {
76
80
  return;
77
81
  }
82
+ // Paused: keep liveness pings flowing but do NOT arm the pong-timeout
83
+ // (a missed pong while recording must not force-close the socket).
84
+ if (this._paused) return;
78
85
  if (this._pongTimer) this._clearTimeout(this._pongTimer);
79
86
  this._pongTimer = this._setTimeout(() => {
80
87
  if (this._isStale()) return;
@@ -118,6 +125,24 @@
118
125
  this._pongTimer = null;
119
126
  }
120
127
  }
128
+
129
+ /**
130
+ * Suspend pong-timeout enforcement (pings continue). Use while the client
131
+ * main thread may be busy enough to miss a pong — e.g. mic recording —
132
+ * so a transient stall doesn't trigger a spurious reconnect.
133
+ */
134
+ pause() {
135
+ this._paused = true;
136
+ if (this._pongTimer) {
137
+ this._clearTimeout(this._pongTimer);
138
+ this._pongTimer = null;
139
+ }
140
+ }
141
+
142
+ /** Resume normal pong-timeout enforcement (next ping re-arms it). */
143
+ resume() {
144
+ this._paused = false;
145
+ }
121
146
  }
122
147
 
123
148
  return HeartbeatWatchdog;
@@ -801,6 +801,7 @@
801
801
  <script src="vscode-tunnel.js"></script>
802
802
  <script src="app-tunnel.js"></script>
803
803
  <script src="voice-handler.js"></script>
804
+ <script src="voice-frame.js"></script>
804
805
  <script src="command-palette.js"></script>
805
806
  <script src="extra-keys.js"></script>
806
807
  <script src="input-overlay.js"></script>
@@ -0,0 +1,73 @@
/**
 * VoiceFrame
 *
 * Pure helpers for the client->server binary voice path, factored out of app.js
 * so they can be unit-tested in Node. Mirrors the UMD shape of
 * heartbeat-watchdog.js (CommonJS in tests, `window.VoiceFrame` in the browser).
 */
(function (global, factory) {
  if (typeof module === 'object' && module.exports) {
    module.exports = factory();
  } else {
    global.VoiceFrame = factory();
  }
}(
  // Prefer globalThis: in an ES module (or any strict context without `self`)
  // the old `this` fallback is undefined and the assignment above would throw.
  typeof globalThis !== 'undefined'
    ? globalThis
    : (typeof self !== 'undefined' ? self : this),
  function () {

    // Wire header: [ "VUP1" (4) ][ version (1) ][ type (1) ] then raw 16-bit PCM.
    var MAGIC_V = 0x56; // 'V'
    var MAGIC_U = 0x55; // 'U'
    var MAGIC_P = 0x50; // 'P'
    var MAGIC_1 = 0x31; // '1'
    var PROTO_VERSION = 0x01;
    var FRAME_TYPE_PCM = 0x01;
    var HEADER_BYTES = 6;

    /**
     * View the PCM payload as bytes without copying.
     *
     * A typed array / DataView is windowed with its own byteOffset/byteLength
     * so a subarray does not drag in the whole underlying buffer. A bare
     * ArrayBuffer is accepted too: previously it fell through to
     * `new Uint8Array(undefined, ...)`, which is empty, and silently produced
     * a header-only frame.
     *
     * @param {Int16Array|ArrayBufferView|ArrayBuffer} samples
     * @returns {Uint8Array}
     * @throws {TypeError} if `samples` is neither an ArrayBuffer nor a view.
     */
    function toByteView(samples) {
      if (samples instanceof ArrayBuffer) {
        return new Uint8Array(samples);
      }
      if (ArrayBuffer.isView(samples)) {
        return new Uint8Array(samples.buffer, samples.byteOffset, samples.byteLength);
      }
      throw new TypeError(
        'buildVoiceFrame: samples must be an Int16Array, an ArrayBuffer view, or an ArrayBuffer'
      );
    }

    /**
     * Build a binary voice frame: the 6-byte header followed by the PCM bytes of
     * `samples`. The payload bytes are copied into a fresh Uint8Array, so the
     * returned frame does not alias the caller's buffer.
     *
     * @param {Int16Array|ArrayBufferView|ArrayBuffer} samples raw 16-bit PCM
     * @returns {Uint8Array} header + payload
     * @throws {TypeError} if `samples` is not binary data.
     */
    function buildVoiceFrame(samples) {
      var pcm = toByteView(samples);
      var frame = new Uint8Array(HEADER_BYTES + pcm.length);
      frame[0] = MAGIC_V;
      frame[1] = MAGIC_U;
      frame[2] = MAGIC_P;
      frame[3] = MAGIC_1;
      frame[4] = PROTO_VERSION;
      frame[5] = FRAME_TYPE_PCM;
      frame.set(pcm, HEADER_BYTES);
      return frame;
    }

    /**
     * Classify a WebSocket close code for the voice path.
     *
     * 1009 (server rejected an oversized frame) and 1003 (unsupported/garbage
     * binary) are server-initiated CLEAN closes, so `event.wasClean` is true and
     * the default onclose path would SKIP reconnect and dead-end on
     * "refresh the page". Treat them as recoverable: show a specific message and
     * still reconnect (bounded by the normal attempt budget).
     *
     * @param {number} code
     * @returns {{rejected: boolean, message: (string|null)}}
     */
    function classifyVoiceClose(code) {
      if (code === 1009 || code === 1003) {
        return {
          rejected: true,
          message: 'A voice message was rejected by the server. Reconnecting…'
        };
      }
      return { rejected: false, message: null };
    }

    return {
      HEADER_BYTES: HEADER_BYTES,
      buildVoiceFrame: buildVoiceFrame,
      classifyVoiceClose: classifyVoiceClose
    };
  }
));
@@ -625,6 +625,9 @@ function VoiceInputController(options) {
625
625
  this._onTranscription = options.onTranscription || null;
626
626
  this._onError = options.onError || null;
627
627
  this._onCancel = options.onCancel || null;
628
+ // Optional predicate: if it returns false, a start request is ignored (e.g.
629
+ // a previous transcription is still pending). Gates both button + keyboard.
630
+ this._canStart = options.canStart || null;
628
631
 
629
632
  this._recorder = null;
630
633
  this._starting = false;
@@ -670,6 +673,7 @@ VoiceInputController.prototype.startRecording = function () {
670
673
  var self = this;
671
674
  if (self._starting) return;
672
675
  if (self._recorder && self._recorder.isRecording) return;
676
+ if (self._canStart && !self._canStart()) return;
673
677
 
674
678
  self._starting = true;
675
679
  self._recorder = self._createRecorder();
package/src/server.js CHANGED
@@ -1,6 +1,7 @@
1
1
  const express = require('express');
2
2
  const http = require('http');
3
3
  const https = require('https');
4
+ const net = require('net');
4
5
  const fs = require('fs');
5
6
  const path = require('path');
6
7
  const os = require('os');
@@ -46,6 +47,18 @@ const RestartManager = require('./restart-manager');
46
47
  // See docs/audits/hot-03-ws-frame-size.md.
47
48
  const MAX_WS_MESSAGE_BYTES = 1 * 1024 * 1024;
48
49
 
50
+ // Inbound binary voice frames (client mic -> server STT) bypass the JSON guard
51
+ // above. Framing + validation (incl. the Buffer[] fragmented-frame normalize)
52
+ // lives in utils/ws-voice-frame so it can be unit-tested without a live socket.
53
+ // A frame is bounded by MAX_VOICE_BINARY_FRAME_BYTES (oversize -> 1009 close,
54
+ // like the text guard); a bad/short header -> 1003 close.
55
+ const {
56
+ MAX_VOICE_PCM_BYTES,
57
+ MAX_VOICE_BINARY_FRAME_BYTES,
58
+ normalizeBinaryMessage,
59
+ classifyVoiceFrame,
60
+ } = require('./utils/ws-voice-frame');
61
+
49
62
  // Pre-built PWA screenshot SVG buffers (served at /screenshot-wide.png and /screenshot-narrow.png)
50
63
  const SCREENSHOT_WIDE_BUF = Buffer.from(`
51
64
  <svg width="1280" height="720" viewBox="0 0 1280 720" xmlns="http://www.w3.org/2000/svg">
@@ -148,8 +161,6 @@ class ClaudeCodeWebServer {
148
161
  modelsDir: options.sttModelDir,
149
162
  numThreads: options.sttThreads ? parseInt(options.sttThreads, 10) : undefined,
150
163
  });
151
- this._voiceUploadCounts = new Map();
152
-
153
164
  // Per-tab local-LLM "sticky note" summariser. ON by default for AI-agent
154
165
  // tabs; disable globally with --no-sticky-notes / AIORDIE_DISABLE_STICKY_NOTES=1
155
166
  // (sticky-notes only — does NOT affect STT). The engine lazily downloads its
@@ -1319,11 +1330,6 @@ class ClaudeCodeWebServer {
1319
1330
  // weeks of uptime. See _cleanupFsWatchSession.
1320
1331
  this._cleanupFsWatchSession(sessionId, 'session_deleted');
1321
1332
 
1322
- // Drop the voice-upload rate-limit history for this session. Map
1323
- // grew unbounded across session-create/delete churn on long-lived
1324
- // servers (smaller cousin of the _fsWatchSessions leak).
1325
- this._voiceUploadCounts.delete(sessionId);
1326
-
1327
1333
  // Stop + tear down the summariser so an in-flight inference is discarded.
1328
1334
  this.stickyNoteSummarizer.cancel(sessionId);
1329
1335
  this._stickyJsonl.delete(sessionId);
@@ -3151,6 +3157,7 @@ class ClaudeCodeWebServer {
3151
3157
  this._ensureStickyNoteEngine();
3152
3158
 
3153
3159
  let server;
3160
+ let wsHost; // the server the WebSocket server attaches to (TLS server in HTTPS mode)
3154
3161
 
3155
3162
  if (this.useHttps) {
3156
3163
  let cert, key;
@@ -3177,13 +3184,90 @@ class ClaudeCodeWebServer {
3177
3184
  console.log(' Browsers will show a security warning on first visit.');
3178
3185
  console.log(' For a trusted, installable origin use \x1b[1m--tunnel\x1b[0m.');
3179
3186
  }
3180
- server = https.createServer({ cert, key }, this.app);
3187
+
3188
+ // The real TLS app server. The WebSocket server attaches HERE so a wss://
3189
+ // upgrade arrives over an encrypted TLSSocket (req.socket.encrypted stays
3190
+ // true for the secure-context / voice checks).
3191
+ const tlsServer = https.createServer({ cert, key }, this.app);
3192
+
// Build the https redirect target from the SAME host:port the client
// reached. The Host header is client-controlled, so accept ONLY a bare
// hostname[:port] (or [ipv6][:port]) — reject userinfo (`@`), paths, and
// control chars to prevent an open redirect to an external origin
// (e.g. Host: user:pass@evil.com). Fall back to localhost otherwise.
//
// The request target is client-controlled as well and is not always
// origin-form: an absolute-form target ("http://host/path") or the
// asterisk-form ("*") would be appended straight after the port and yield a
// malformed Location. Only a target that starts with "/" is appended; anything
// else redirects to the site root.
const redirectLocation = (req) => {
  const raw = String(req.headers.host || '');
  const validHost = /^[A-Za-z0-9.-]+(?::\d+)?$/.test(raw)
    || /^\[[0-9a-fA-F:]+\](?::\d+)?$/.test(raw);
  // Drop any client-supplied port; the port we actually listen on is used.
  const hostname = validHost ? raw.replace(/:\d+$/, '') : 'localhost';
  const port = req.socket.localPort || this.port;
  const target = (typeof req.url === 'string' && req.url.startsWith('/')) ? req.url : '/';
  // Defensive: strip CR/LF so the value can never split the response header.
  return `https://${hostname}:${port}${target}`.replace(/[\r\n]/g, '');
};
3207
+
3208
+ // Plaintext HTTP on the SAME port -> redirect to https. A user who reaches
3209
+ // http://host:PORT is auto-upgraded instead of getting an opaque
3210
+ // TLS-handshake error. 307 keeps the method and is not cached as permanent
3211
+ // (switching the port back to http mode later isn't poisoned by a stale 301).
3212
+ const httpRedirectServer = http.createServer((req, res) => {
3213
+ const location = redirectLocation(req);
3214
+ res.writeHead(307, { Location: location, 'Content-Type': 'text/plain' });
3215
+ res.end(`Redirecting to ${location}\n`);
3216
+ });
3217
+ // A plaintext ws:// upgrade to the TLS port: answer with the same redirect
3218
+ // (written raw — an upgrade has no res object) instead of an abrupt RST.
3219
+ httpRedirectServer.on('upgrade', (req, socket) => {
3220
+ const location = redirectLocation(req);
3221
+ try {
3222
+ socket.end(
3223
+ 'HTTP/1.1 307 Temporary Redirect\r\n' +
3224
+ `Location: ${location}\r\n` +
3225
+ 'Connection: close\r\n\r\n'
3226
+ );
3227
+ } catch (_) { try { socket.destroy(); } catch (__) { /* ignore */ } }
3228
+ });
3229
+
3230
+ // Front both with a 1-byte sniffer: a TLS ClientHello starts with 0x16
3231
+ // (handshake record); anything else is plaintext HTTP. One listening port
3232
+ // therefore serves both — http:// and https:// to PORT both work.
3233
+ this._proxySockets = new Set();
3234
+ server = net.createServer((socket) => {
3235
+ this._proxySockets.add(socket);
3236
+ socket.once('close', () => this._proxySockets.delete(socket));
3237
+ // Pre-handoff guards: drop a connection that errors or sends no data
3238
+ // (port scanner / slowloris) before we know which server owns it. Both
3239
+ // are cleared the moment we route, so the target server's own lifecycle
3240
+ // and timeouts take over cleanly.
3241
+ const sniffTimer = setTimeout(() => { try { socket.destroy(); } catch (_) { /* ignore */ } }, 10000);
3242
+ const onSniffError = () => {
3243
+ clearTimeout(sniffTimer);
3244
+ try { socket.destroy(); } catch (_) { /* ignore */ }
3245
+ };
3246
+ socket.on('error', onSniffError);
3247
+ socket.once('readable', () => {
3248
+ clearTimeout(sniffTimer);
3249
+ socket.removeListener('error', onSniffError);
3250
+ const chunk = socket.read(1);
3251
+ if (!chunk) { socket.destroy(); return; }
3252
+ socket.unshift(chunk);
3253
+ const target = chunk[0] === 0x16 ? tlsServer : httpRedirectServer;
3254
+ target.emit('connection', socket);
3255
+ });
3256
+ });
3257
+
3258
+ this._tlsServer = tlsServer;
3259
+ this._httpRedirectServer = httpRedirectServer;
3260
+ wsHost = tlsServer;
3261
+ console.log(' http:// requests on this port auto-upgrade to https.');
3181
3262
  } else {
3182
3263
  server = http.createServer(this.app);
3264
+ this._tlsServer = null;
3265
+ this._httpRedirectServer = null;
3266
+ wsHost = server;
3183
3267
  }
3184
3268
 
3185
3269
  this.wss = new WebSocket.Server({
3186
- server,
3270
+ server: wsHost,
3187
3271
  maxPayload: 8 * 1024 * 1024,
3188
3272
  // Compression disabled — binary frames already send with compress:false,
3189
3273
  // and JSON control messages are small/infrequent. Saves ~300KB per connection
@@ -3237,7 +3321,39 @@ class ClaudeCodeWebServer {
3237
3321
  };
3238
3322
  this.webSocketConnections.set(wsId, wsInfo);
3239
3323
 
3240
- ws.on('message', (message) => {
3324
+ ws.on('message', (message, isBinary) => {
3325
+ // Inbound BINARY frames are voice audio (client mic). Handle them BEFORE
3326
+ // the JSON guard below: they legitimately exceed 1 MiB (up to 3.84 MB of
3327
+ // 120 s PCM) and must not be killed by the text-frame guard. They are
3328
+ // still bounded (oversize -> 1009; bad/short header -> 1003) so this does
3329
+ // not reopen the event-loop-DoS hole the JSON guard closes.
3330
+ if (isBinary) {
3331
+ // ws delivers a Buffer when un-fragmented and a Buffer[] when the frame
3332
+ // arrived in multiple WS continuation fragments. Normalize first, then
3333
+ // classify on the normalized buffer (never on `message.length`, which is
3334
+ // the fragment COUNT for an array).
3335
+ const buf = normalizeBinaryMessage(message);
3336
+ const verdict = classifyVoiceFrame(buf);
3337
+ if (verdict.action === 'oversize') {
3338
+ try {
3339
+ this.sendToWebSocket(ws, {
3340
+ type: 'error',
3341
+ code: 'message_too_large',
3342
+ message: `Binary voice frame exceeds ${MAX_VOICE_BINARY_FRAME_BYTES} bytes`,
3343
+ received_bytes: buf.length,
3344
+ limit_bytes: MAX_VOICE_BINARY_FRAME_BYTES,
3345
+ });
3346
+ } catch (_) { /* socket may be half-closed */ }
3347
+ try { ws.close(1009, 'message_too_large'); } catch (_) {}
3348
+ return;
3349
+ }
3350
+ if (verdict.action === 'unsupported') {
3351
+ try { ws.close(1003, 'unsupported binary'); } catch (_) {}
3352
+ return;
3353
+ }
3354
+ this.handleVoiceBinary(wsId, verdict.pcm);
3355
+ return;
3356
+ }
3241
3357
  // HOT-08: application-layer size guard, runs BEFORE JSON.parse.
3242
3358
  // Buffer.byteLength handles both string and Buffer message types.
3243
3359
  // On oversize, send a marker error frame and close with WS-standard
@@ -4677,11 +4793,11 @@ class ClaudeCodeWebServer {
4677
4793
  }
4678
4794
 
4679
4795
  // Same per-session cleanup contract as the DELETE handler:
4680
- // tear down any orphan fs-watch SSE + voice-upload history
4681
- // BEFORE removing the parent session entry, otherwise the
4682
- // chokidar watcher leaks (PR #99 regression).
4796
+ // tear down any orphan fs-watch SSE BEFORE removing the parent session
4797
+ // entry, otherwise the chokidar watcher leaks (PR #99 regression). The
4798
+ // voice-upload rate-limit history lives on the session object and is
4799
+ // dropped with it below.
4683
4800
  try { this._cleanupFsWatchSession(top.id, 'session_evicted'); } catch (_) { /* ignore */ }
4684
- try { this._voiceUploadCounts.delete(top.id); } catch (_) { /* ignore */ }
4685
4801
  try { this.stickyNoteSummarizer.cancel(top.id); } catch (_) { /* ignore */ }
4686
4802
  try { this._stickyJsonl.delete(top.id); } catch (_) { /* ignore */ }
4687
4803
  if (this._foregroundSessionId === top.id) this._foregroundSessionId = null;
@@ -4888,7 +5004,8 @@ class ClaudeCodeWebServer {
4888
5004
  total: this.claudeSessions.size,
4889
5005
  ws_connections: this.webSocketConnections.size,
4890
5006
  fs_watch_sessions: (this._fsWatchSessions && this._fsWatchSessions.size) || 0,
4891
- voice_upload_counts: (this._voiceUploadCounts && this._voiceUploadCounts.size) || 0,
5007
+ voice_upload_counts: Array.from(this.claudeSessions.values())
5008
+ .filter(s => s._voiceUploadTimestamps && s._voiceUploadTimestamps.length).length,
4892
5009
  activity_broadcast_timestamps: (this.activityBroadcastTimestamps && this.activityBroadcastTimestamps.size) || 0,
4893
5010
  },
4894
5011
  // DISK-02/03: cached disk usage sample (60 s TTL, never blocks the
@@ -5225,6 +5342,23 @@ class ClaudeCodeWebServer {
5225
5342
  if (this.server) {
5226
5343
  this.server.close();
5227
5344
  }
5345
+ // In HTTPS mode `this.server` is the TLS-sniffing proxy that owns the
5346
+ // listening port; the TLS app server and the http->https redirect server sit
5347
+ // behind it. Sockets are handed to them via emit('connection'), bypassing
5348
+ // their internal connection tracking, so destroy the proxied sockets here
5349
+ // (and close the inner servers) to avoid keep-alive connections lingering.
5350
+ if (this._proxySockets) {
5351
+ for (const s of this._proxySockets) {
5352
+ try { s.destroy(); } catch (_) { /* ignore */ }
5353
+ }
5354
+ this._proxySockets.clear();
5355
+ }
5356
+ if (this._tlsServer) {
5357
+ try { this._tlsServer.close(); } catch (_) { /* ignore */ }
5358
+ }
5359
+ if (this._httpRedirectServer) {
5360
+ try { this._httpRedirectServer.close(); } catch (_) { /* ignore */ }
5361
+ }
5228
5362
 
5229
5363
  // Flush pending output and stop all sessions with a 5-second timeout
5230
5364
  const stopPromises = [];
@@ -5456,10 +5590,40 @@ class ClaudeCodeWebServer {
5456
5590
  }
5457
5591
  }
5458
5592
 
5593
+ // Thin shim for the legacy base64-JSON voice_upload path. The 'Missing audio
5594
+ // data' guard must live HERE (the binary path has no data.audio); after the
5595
+ // binary-frame switch no live client emits this, but it is kept for
5596
+ // back-compat and shares the validation/transcribe core below.
5459
5597
  async handleVoiceUpload(wsId, data) {
5460
5598
  const wsInfo = this.webSocketConnections.get(wsId);
5461
5599
  if (!wsInfo) return;
5462
5600
 
5601
+ if (!data.audio || typeof data.audio !== 'string') {
5602
+ this.sendToWebSocket(wsInfo.ws, {
5603
+ type: 'voice_transcription_error',
5604
+ message: 'Missing audio data'
5605
+ });
5606
+ return;
5607
+ }
5608
+
5609
+ await this._processVoicePcm(wsId, Buffer.from(data.audio, 'base64'));
5610
+ }
5611
+
5612
+ // Binary voice frame path. The ws dispatcher has already validated the 6-byte
5613
+ // header and sliced it off, so `pcmBuffer` is raw 16-bit PCM.
5614
+ async handleVoiceBinary(wsId, pcmBuffer) {
5615
+ await this._processVoicePcm(wsId, pcmBuffer);
5616
+ }
5617
+
5618
+ // Shared voice core for both the base64 shim and the binary path. Check order
5619
+ // is cheapest/most-restrictive first; the rate limit stays BEFORE the isReady
5620
+ // gate (so it is enforced even when STT is unavailable), and the int16->float32
5621
+ // conversion is deferred to the STT worker (transcribePcm16) rather than run on
5622
+ // the event loop here.
5623
+ async _processVoicePcm(wsId, pcmBuffer) {
5624
+ const wsInfo = this.webSocketConnections.get(wsId);
5625
+ if (!wsInfo) return;
5626
+
5463
5627
  // Reject voice uploads over HTTP from non-localhost origins (defense-in-depth)
5464
5628
  if (!wsInfo.secure && !this._isLocalhostConnection(wsInfo.ws)) {
5465
5629
  this.sendToWebSocket(wsInfo.ws, {
@@ -5494,23 +5658,21 @@ class ClaudeCodeWebServer {
5494
5658
  return;
5495
5659
  }
5496
5660
 
5497
- // Rate limit: max 10 voice uploads per minute per session (check early to prevent abuse)
5498
- const sessionId = wsInfo.claudeSessionId;
5499
- if (!this._voiceUploadCounts.has(sessionId)) {
5500
- this._voiceUploadCounts.set(sessionId, []);
5501
- }
5502
- const timestamps = this._voiceUploadCounts.get(sessionId);
5661
+ // Rate limit: max 10 voice uploads per minute per session. State lives on the
5662
+ // session object (mirrors image uploads at saveImageToTemp) so it shares the
5663
+ // session's lifetime — GC'd on session delete/evict, and correctly survives a
5664
+ // WS reconnect (the budget must NOT reset when the socket drops).
5503
5665
  const now = Date.now();
5504
- const recent = timestamps.filter(ts => now - ts < 60000);
5505
- this._voiceUploadCounts.set(sessionId, recent);
5506
- if (recent.length >= 10) {
5666
+ if (!session._voiceUploadTimestamps) session._voiceUploadTimestamps = [];
5667
+ session._voiceUploadTimestamps = session._voiceUploadTimestamps.filter(ts => now - ts < 60000);
5668
+ if (session._voiceUploadTimestamps.length >= 10) {
5507
5669
  this.sendToWebSocket(wsInfo.ws, {
5508
5670
  type: 'voice_transcription_error',
5509
5671
  message: 'Rate limit exceeded: maximum 10 voice uploads per minute.'
5510
5672
  });
5511
5673
  return;
5512
5674
  }
5513
- recent.push(now);
5675
+ session._voiceUploadTimestamps.push(now);
5514
5676
 
5515
5677
  if (!this.sttEngine.isReady()) {
5516
5678
  this.sendToWebSocket(wsInfo.ws, {
@@ -5521,19 +5683,8 @@ class ClaudeCodeWebServer {
5521
5683
  }
5522
5684
 
5523
5685
  try {
5524
- // Validate audio data
5525
- if (!data.audio || typeof data.audio !== 'string') {
5526
- this.sendToWebSocket(wsInfo.ws, {
5527
- type: 'voice_transcription_error',
5528
- message: 'Missing audio data'
5529
- });
5530
- return;
5531
- }
5532
-
5533
- const audioBuffer = Buffer.from(data.audio, 'base64');
5534
-
5535
5686
  // Max 120s of 16kHz 16-bit mono PCM = 3,840,000 bytes
5536
- if (audioBuffer.length > 3840000) {
5687
+ if (pcmBuffer.length > MAX_VOICE_PCM_BYTES) {
5537
5688
  this.sendToWebSocket(wsInfo.ws, {
5538
5689
  type: 'voice_transcription_error',
5539
5690
  message: 'Audio too long (max 120 seconds)'
@@ -5541,7 +5692,7 @@ class ClaudeCodeWebServer {
5541
5692
  return;
5542
5693
  }
5543
5694
 
5544
- if (audioBuffer.length < 2) {
5695
+ if (pcmBuffer.length < 2) {
5545
5696
  this.sendToWebSocket(wsInfo.ws, {
5546
5697
  type: 'voice_transcription_error',
5547
5698
  message: 'Audio too short'
@@ -5549,7 +5700,7 @@ class ClaudeCodeWebServer {
5549
5700
  return;
5550
5701
  }
5551
5702
 
5552
- if (audioBuffer.length % 2 !== 0) {
5703
+ if (pcmBuffer.length % 2 !== 0) {
5553
5704
  this.sendToWebSocket(wsInfo.ws, {
5554
5705
  type: 'voice_transcription_error',
5555
5706
  message: 'Invalid audio data: buffer length must be even (16-bit PCM samples)'
@@ -5557,11 +5708,8 @@ class ClaudeCodeWebServer {
5557
5708
  return;
5558
5709
  }
5559
5710
 
5560
-
5561
- // Convert Int16 PCM buffer to Float32Array for sherpa-onnx
5562
- const float32 = this._int16ToFloat32(audioBuffer);
5563
-
5564
- const text = await this.sttEngine.transcribe(float32);
5711
+ // Raw int16 PCM -> the worker converts to Float32 off the event loop.
5712
+ const text = await this.sttEngine.transcribePcm16(pcmBuffer);
5565
5713
 
5566
5714
  this.sendToWebSocket(wsInfo.ws, {
5567
5715
  type: 'voice_transcription',
@@ -5652,17 +5800,6 @@ class ClaudeCodeWebServer {
5652
5800
  }
5653
5801
  }
5654
5802
 
5655
- _int16ToFloat32(int16Buffer) {
5656
- // Copy to ensure 2-byte alignment (Node.js Buffers may have odd byteOffset)
5657
- const aligned = new Uint8Array(int16Buffer).buffer;
5658
- const int16 = new Int16Array(aligned);
5659
- const float32 = new Float32Array(int16.length);
5660
- for (let i = 0; i < int16.length; i++) {
5661
- float32[i] = int16[i] / 32768.0;
5662
- }
5663
- return float32;
5664
- }
5665
-
5666
5803
  async saveImageToTemp(session, data) {
5667
5804
  // Primary temp dir: .claude-images inside the session working directory
5668
5805
  let tempDir = path.join(session.workingDir, '.claude-images');
package/src/stt-engine.js CHANGED
@@ -93,6 +93,71 @@ class SttEngine {
93
93
  return promise;
94
94
  }
95
95
 
96
+ /**
97
+ * Transcribe raw 16-bit PCM. The int16->float32 conversion is deferred to the
98
+ * worker thread (see stt-worker.js) so the server event loop never runs the
99
+ * per-sample loop. Accepts an Int16Array, an ArrayBuffer, or any ArrayBuffer
100
+ * view (e.g. a Node Buffer) of raw little-endian 16-bit samples.
101
+ *
102
+ * @param {Int16Array|ArrayBuffer|ArrayBufferView} int16
103
+ * @returns {Promise<string>}
104
+ */
105
+ transcribePcm16(int16) {
106
+ const int16arr = this._toInt16Array(int16);
107
+
108
+ if (this._sttEndpoint) {
109
+ // External endpoint has no worker — convert here and reuse the float32 path.
110
+ const float32 = new Float32Array(int16arr.length);
111
+ for (let i = 0; i < int16arr.length; i++) {
112
+ float32[i] = int16arr[i] / 32768.0;
113
+ }
114
+ return this._transcribeExternal(float32);
115
+ }
116
+
117
+ if (this._status !== 'ready') {
118
+ throw new Error(`STT engine not ready (status: ${this._status})`);
119
+ }
120
+
121
+ if (this._queue.length >= MAX_QUEUE_SIZE) {
122
+ throw new Error('STT busy, try again later');
123
+ }
124
+
125
+ const id = ++this._requestIdCounter;
126
+
127
+ const promise = new Promise((resolve, reject) => {
128
+ const timer = setTimeout(() => {
129
+ this._removeFromQueue(id);
130
+ reject(new Error('Transcription timed out'));
131
+ }, TRANSCRIPTION_TIMEOUT_MS);
132
+
133
+ this._queue.push({ id, pcm16: int16arr, resolve, reject, timer });
134
+ });
135
+
136
+ this._processQueue();
137
+ return promise;
138
+ }
139
+
140
+ // Copy an int16 input into a fresh, offset-0, even-length Int16Array. Always
141
+ // copies (even an Int16Array input) so the queued buffer is solely owned and
142
+ // can be safely TRANSFERRED to the worker. A Node Buffer slice can have an odd
143
+ // byteOffset (a direct `new Int16Array(buf.buffer, off)` would throw); an odd
144
+ // byteLength is floored to whole 16-bit samples — callers already reject odd
145
+ // lengths, this is defense-in-depth so the method never throws RangeError.
146
+ _toInt16Array(int16) {
147
+ let bytes;
148
+ if (int16 instanceof Int16Array || ArrayBuffer.isView(int16)) {
149
+ bytes = new Uint8Array(int16.buffer, int16.byteOffset, int16.byteLength);
150
+ } else if (int16 instanceof ArrayBuffer) {
151
+ bytes = new Uint8Array(int16);
152
+ } else {
153
+ throw new Error('transcribePcm16 expects an Int16Array, ArrayBuffer, or typed-array view');
154
+ }
155
+ const evenLen = bytes.byteLength - (bytes.byteLength % 2);
156
+ const copy = new Uint8Array(evenLen);
157
+ copy.set(bytes.subarray(0, evenLen));
158
+ return new Int16Array(copy.buffer);
159
+ }
160
+
96
161
  _processQueue() {
97
162
  if (this._currentRequest || this._queue.length === 0 || !this._worker) {
98
163
  return;
@@ -101,11 +166,23 @@ class SttEngine {
101
166
  const request = this._queue[0];
102
167
  this._currentRequest = request;
103
168
 
104
- this._worker.postMessage({
105
- type: 'transcribe',
106
- id: request.id,
107
- samples: request.samples
108
- });
169
+ // pcm16 path: TRANSFER the (solely-owned, freshly-copied by _toInt16Array)
170
+ // buffer to the worker — avoids a multi-MB structured-clone copy on the event
171
+ // loop. Safe because each request is posted exactly once (on worker crash the
172
+ // queue is rejected + cleared, so a posted/detached buffer is never requeued).
173
+ if (request.pcm16 !== undefined) {
174
+ this._worker.postMessage({
175
+ type: 'transcribe',
176
+ id: request.id,
177
+ pcm16: request.pcm16
178
+ }, [request.pcm16.buffer]);
179
+ } else {
180
+ this._worker.postMessage({
181
+ type: 'transcribe',
182
+ id: request.id,
183
+ samples: request.samples
184
+ });
185
+ }
109
186
  }
110
187
 
111
188
  _onWorkerMessage(msg) {
package/src/stt-worker.js CHANGED
@@ -3,6 +3,7 @@
3
3
  const { parentPort, workerData } = require('worker_threads');
4
4
  const path = require('path');
5
5
  const os = require('os');
6
+ const { pcm16ToFloat32 } = require('./utils/pcm.js');
6
7
 
7
8
  // Set platform-specific library paths BEFORE requiring sherpa-onnx-node.
8
9
  // The native .node addon dynamically loads shared libraries (onnxruntime.dll,
@@ -74,10 +75,22 @@ try {
74
75
  parentPort.on('message', (msg) => {
75
76
  if (msg.type === 'transcribe') {
76
77
  try {
77
- // msg.samples is a Float32Array (transferred or copied from main thread)
78
- const samples = msg.samples instanceof Float32Array
79
- ? msg.samples
80
- : new Float32Array(msg.samples);
78
+ // Two input shapes:
79
+ // - msg.pcm16: raw 16-bit PCM (Int16Array). Conversion to Float32 runs
80
+ // HERE, in the worker thread, so the server event loop never does the
81
+ // per-sample loop (HOL-blocking input/ping for long clips).
82
+ // - msg.samples: a Float32Array (legacy / external-endpoint callers).
83
+ let samples;
84
+ if (msg.pcm16 !== undefined && msg.pcm16 !== null) {
85
+ const int16 = msg.pcm16 instanceof Int16Array
86
+ ? msg.pcm16
87
+ : new Int16Array(msg.pcm16);
88
+ samples = pcm16ToFloat32(int16);
89
+ } else {
90
+ samples = msg.samples instanceof Float32Array
91
+ ? msg.samples
92
+ : new Float32Array(msg.samples);
93
+ }
81
94
 
82
95
  const stream = recognizer.createStream();
83
96
  stream.acceptWaveform({ samples, sampleRate: 16000 });
@@ -0,0 +1,22 @@
1
+ 'use strict';
2
+
3
/**
 * Scale signed 16-bit PCM samples into normalized Float32 values.
 *
 * Every sample is divided by 32768, so the output lies in [-1, 1):
 * -32768 becomes exactly -1.0 and 32767 becomes roughly 0.99997.
 *
 * @param {Int16Array} int16 Source samples.
 * @returns {Float32Array} New array of the same length.
 */
function pcm16ToFloat32(int16) {
  // 1 / 32768 is an exact power of two, so multiplying by it gives the
  // same bits as dividing by 32768.0.
  const scale = 1 / 32768.0;
  const sampleCount = int16.length;
  const floats = new Float32Array(sampleCount);
  let idx = 0;
  while (idx < sampleCount) {
    floats[idx] = int16[idx] * scale;
    idx += 1;
  }
  return floats;
}
21
+
22
+ module.exports = { pcm16ToFloat32 };
@@ -0,0 +1,73 @@
1
+ 'use strict';
2
+
3
+ /**
4
+ * Inbound binary voice-frame framing (client mic -> server STT).
5
+ *
6
+ * Wire format:
7
+ * [4 bytes ASCII "VUP1"][1 byte version][1 byte type][raw 16-bit PCM @16kHz mono]
8
+ *
9
+ * Pure (no I/O) so the dispatcher logic in server.js can be unit-tested without
10
+ * a live socket — in particular the Buffer[] (fragmented frame) normalization,
11
+ * which is the one genuinely platform-dependent receive path.
12
+ */
13
+
14
// Frame header layout: 4-byte ASCII magic, 1 version byte, 1 frame-type byte.
const VOICE_MAGIC = Buffer.from('VUP1', 'ascii');
const VOICE_PROTO_VERSION = 1;
const VOICE_FRAME_TYPE_PCM = 0x01;
const VOICE_HEADER_BYTES = VOICE_MAGIC.length + 2; // magic(4) + version(1) + type(1) = 6
// Longest accepted clip: 120 s of 16 kHz, 16-bit (2-byte), mono PCM = 3,840,000 bytes.
const MAX_VOICE_PCM_BYTES = 120 * 16000 * 2;
const MAX_VOICE_BINARY_FRAME_BYTES = VOICE_HEADER_BYTES + MAX_VOICE_PCM_BYTES;
20
+
21
/**
 * Normalize a binary WebSocket payload to a single Buffer.
 *
 * Accepted shapes:
 *   - Buffer              -> returned unchanged (same object).
 *   - Buffer[]            -> concatenated into one Buffer (a message that
 *                            arrived as several fragments).
 *   - ArrayBufferView     -> a Buffer over the view's exact byte range
 *                            (byteOffset/byteLength), sharing its memory.
 *   - ArrayBuffer / other -> handed to Buffer.from() as-is.
 *
 * Views are wrapped by byte range because Buffer.from(view) copies
 * *elements*, truncating each to one byte: an Int16Array of N samples would
 * come back as N bytes instead of 2N, and a DataView as an empty Buffer.
 *
 * Always size-check the RESULT of this function, never the raw message: for
 * an array input `.length` is the fragment count, not the byte count.
 *
 * @param {Buffer|Buffer[]|ArrayBuffer|ArrayBufferView} message
 * @returns {Buffer}
 */
function normalizeBinaryMessage(message) {
  if (Buffer.isBuffer(message)) return message;
  if (Array.isArray(message)) return Buffer.concat(message);
  if (ArrayBuffer.isView(message)) {
    // Wrap the underlying bytes of the view, not its element values.
    return Buffer.from(message.buffer, message.byteOffset, message.byteLength);
  }
  return Buffer.from(message);
}
41
+
42
/**
 * Decide what to do with a normalized binary voice frame.
 *
 * Possible results:
 *   - { action: 'oversize' }    frame exceeds MAX_VOICE_BINARY_FRAME_BYTES
 *                               (caller closes with 1009, message too big)
 *   - { action: 'unsupported' } frame is shorter than the header, or its
 *                               magic / version / type byte does not match
 *                               (caller closes with 1003)
 *   - { action: 'pcm', pcm }    valid header; `pcm` is the payload after it,
 *                               as a subarray view of `buf` (no copy)
 *
 * @param {Buffer} buf Normalized frame (a single Buffer).
 * @returns {{action: string, pcm?: Buffer}}
 */
function classifyVoiceFrame(buf) {
  const frameBytes = buf.length;
  if (frameBytes > MAX_VOICE_BINARY_FRAME_BYTES) {
    return { action: 'oversize' };
  }

  // Short-circuit order matters: the length test runs first so the header
  // bytes are only inspected when the frame is long enough to contain them.
  const headerOk = !(frameBytes < VOICE_HEADER_BYTES)
    && buf.subarray(0, 4).equals(VOICE_MAGIC)
    && buf[4] === VOICE_PROTO_VERSION
    && buf[5] === VOICE_FRAME_TYPE_PCM;
  if (!headerOk) {
    return { action: 'unsupported' };
  }

  const payload = buf.subarray(VOICE_HEADER_BYTES);
  return { action: 'pcm', pcm: payload };
}
63
+
64
+ module.exports = {
65
+ VOICE_MAGIC,
66
+ VOICE_PROTO_VERSION,
67
+ VOICE_FRAME_TYPE_PCM,
68
+ VOICE_HEADER_BYTES,
69
+ MAX_VOICE_PCM_BYTES,
70
+ MAX_VOICE_BINARY_FRAME_BYTES,
71
+ normalizeBinaryMessage,
72
+ classifyVoiceFrame,
73
+ };