ai-or-die 0.1.71 → 0.1.72
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/public/app.js +83 -14
- package/src/public/heartbeat-watchdog.js +25 -0
- package/src/public/index.html +1 -0
- package/src/public/voice-frame.js +73 -0
- package/src/public/voice-handler.js +4 -0
- package/src/server.js +192 -55
- package/src/stt-engine.js +82 -5
- package/src/stt-worker.js +17 -4
- package/src/utils/pcm.js +22 -0
- package/src/utils/ws-voice-frame.js +73 -0
package/package.json
CHANGED
package/src/public/app.js
CHANGED
|
@@ -1386,8 +1386,19 @@ class ClaudeCodeWebInterface {
|
|
|
1386
1386
|
|
|
1387
1387
|
this.voiceController = new window.VoiceHandler.VoiceInputController({
|
|
1388
1388
|
mode: this.voiceMode,
|
|
1389
|
+
// Refuse a new recording while a previous transcription is still
|
|
1390
|
+
// pending (single timeout slot + no correlation id — overlapping
|
|
1391
|
+
// uploads would clobber each other's spinner/timeout).
|
|
1392
|
+
canStart: function () {
|
|
1393
|
+
return !self._voiceTranscriptionTimeout;
|
|
1394
|
+
},
|
|
1389
1395
|
onRecordingStart: function () {
|
|
1390
1396
|
self._playMicChime('on');
|
|
1397
|
+
// Suspend the heartbeat pong-timeout while capturing: the main
|
|
1398
|
+
// thread can be busy enough (esp. the ScriptProcessor fallback)
|
|
1399
|
+
// to miss a pong, which would otherwise force a spurious reconnect.
|
|
1400
|
+
self._voiceRecordingActive = true;
|
|
1401
|
+
if (self._heartbeat) self._heartbeat.pause();
|
|
1391
1402
|
btn.classList.add('recording');
|
|
1392
1403
|
btn.classList.remove('processing');
|
|
1393
1404
|
btn.setAttribute('aria-pressed', 'true');
|
|
@@ -1415,6 +1426,8 @@ class ClaudeCodeWebInterface {
|
|
|
1415
1426
|
},
|
|
1416
1427
|
onRecordingStop: function (result) {
|
|
1417
1428
|
self._playMicChime('off');
|
|
1429
|
+
self._voiceRecordingActive = false;
|
|
1430
|
+
if (self._heartbeat) self._heartbeat.resume();
|
|
1418
1431
|
btn.classList.remove('recording');
|
|
1419
1432
|
btn.setAttribute('aria-pressed', 'false');
|
|
1420
1433
|
btn.title = 'Voice Input (Ctrl+Shift+M)';
|
|
@@ -1425,21 +1438,33 @@ class ClaudeCodeWebInterface {
|
|
|
1425
1438
|
}
|
|
1426
1439
|
|
|
1427
1440
|
if (self.voiceMode === 'local' && result && result.samples) {
|
|
1441
|
+
// Guard against a zero-sample recording (would send a
|
|
1442
|
+
// header-only frame the server rejects as "too short").
|
|
1443
|
+
if (!result.samples.byteLength || result.samples.byteLength < 2) {
|
|
1444
|
+
btn.classList.remove('processing');
|
|
1445
|
+
if (window.feedback) window.feedback.error('No audio captured');
|
|
1446
|
+
return;
|
|
1447
|
+
}
|
|
1448
|
+
|
|
1428
1449
|
btn.classList.add('processing');
|
|
1429
|
-
|
|
1430
|
-
|
|
1431
|
-
|
|
1432
|
-
|
|
1433
|
-
|
|
1434
|
-
|
|
1435
|
-
|
|
1450
|
+
|
|
1451
|
+
// Send raw Int16 PCM as a tagged binary WS frame (no base64 —
|
|
1452
|
+
// base64's 33% inflation is what pushed long clips past the
|
|
1453
|
+
// 1 MiB frame guard and crashed the page).
|
|
1454
|
+
var frame = window.VoiceFrame.buildVoiceFrame(result.samples);
|
|
1455
|
+
var sent = self.sendBinary(frame);
|
|
1456
|
+
|
|
1457
|
+
if (!sent) {
|
|
1458
|
+
// Socket not OPEN (e.g. mid-reconnect): fail fast instead of
|
|
1459
|
+
// silently dropping the frame and hanging the spinner 90 s.
|
|
1460
|
+
btn.classList.remove('processing');
|
|
1461
|
+
var notSentMsg = 'Connection not ready — recording not sent';
|
|
1462
|
+
if (window.feedback) window.feedback.error(notSentMsg);
|
|
1463
|
+
if (self.terminal) {
|
|
1464
|
+
self.terminal.write('\r\n\x1b[31m[Voice error] ' + notSentMsg + '\x1b[0m\r\n');
|
|
1465
|
+
}
|
|
1466
|
+
return;
|
|
1436
1467
|
}
|
|
1437
|
-
var base64Audio = btoa(parts.join(''));
|
|
1438
|
-
self.send({
|
|
1439
|
-
type: 'voice_upload',
|
|
1440
|
-
audio: base64Audio,
|
|
1441
|
-
durationMs: result.durationMs
|
|
1442
|
-
});
|
|
1443
1468
|
|
|
1444
1469
|
// Client-side timeout for transcription processing (90 seconds)
|
|
1445
1470
|
self._voiceTranscriptionTimeout = setTimeout(function () {
|
|
@@ -1467,6 +1492,8 @@ class ClaudeCodeWebInterface {
|
|
|
1467
1492
|
self._deliverVoiceTranscription(text);
|
|
1468
1493
|
},
|
|
1469
1494
|
onError: function (err) {
|
|
1495
|
+
self._voiceRecordingActive = false;
|
|
1496
|
+
if (self._heartbeat) self._heartbeat.resume();
|
|
1470
1497
|
btn.classList.remove('recording', 'processing');
|
|
1471
1498
|
btn.setAttribute('aria-pressed', 'false');
|
|
1472
1499
|
btn.title = 'Voice Input (Ctrl+Shift+M)';
|
|
@@ -1497,6 +1524,8 @@ class ClaudeCodeWebInterface {
|
|
|
1497
1524
|
}
|
|
1498
1525
|
},
|
|
1499
1526
|
onCancel: function () {
|
|
1527
|
+
self._voiceRecordingActive = false;
|
|
1528
|
+
if (self._heartbeat) self._heartbeat.resume();
|
|
1500
1529
|
btn.classList.remove('recording', 'processing');
|
|
1501
1530
|
btn.setAttribute('aria-pressed', 'false');
|
|
1502
1531
|
btn.title = 'Voice Input (Ctrl+Shift+M)';
|
|
@@ -2043,6 +2072,32 @@ class ClaudeCodeWebInterface {
|
|
|
2043
2072
|
if (this._heartbeat) { this._heartbeat.stop(); this._heartbeat = null; }
|
|
2044
2073
|
if (this._heartbeatTimer) { clearInterval(this._heartbeatTimer); this._heartbeatTimer = null; }
|
|
2045
2074
|
if (this._pongTimer) { clearTimeout(this._pongTimer); this._pongTimer = null; }
|
|
2075
|
+
|
|
2076
|
+
// A close mid-transcription must not leave the mic spinner + its
|
|
2077
|
+
// 90 s timeout hanging.
|
|
2078
|
+
if (this._voiceTranscriptionTimeout) {
|
|
2079
|
+
clearTimeout(this._voiceTranscriptionTimeout);
|
|
2080
|
+
this._voiceTranscriptionTimeout = null;
|
|
2081
|
+
}
|
|
2082
|
+
this._voiceRecordingActive = false;
|
|
2083
|
+
const voiceBtn = document.getElementById('voiceInputBtn');
|
|
2084
|
+
if (voiceBtn) voiceBtn.classList.remove('processing');
|
|
2085
|
+
|
|
2086
|
+
// Log the close code so field reports can tell a server frame
|
|
2087
|
+
// rejection (1009/1003, at stop) from a heartbeat pong-timeout
|
|
2088
|
+
// (4000, mid-recording).
|
|
2089
|
+
console.warn('[ws] closed', event.code, event.reason || '');
|
|
2090
|
+
|
|
2091
|
+
// 1009/1003 are server-initiated CLEAN closes (wasClean=true): the
|
|
2092
|
+
// server rejected our frame. Surface a specific message and still
|
|
2093
|
+
// reconnect below, instead of dead-ending on "refresh the page".
|
|
2094
|
+
const voiceClose = (window.VoiceFrame && window.VoiceFrame.classifyVoiceClose)
|
|
2095
|
+
? window.VoiceFrame.classifyVoiceClose(event.code)
|
|
2096
|
+
: { rejected: false, message: null };
|
|
2097
|
+
if (voiceClose.rejected && window.feedback) {
|
|
2098
|
+
window.feedback.error(voiceClose.message);
|
|
2099
|
+
}
|
|
2100
|
+
|
|
2046
2101
|
// During server restart, don't count failures against reconnect budget
|
|
2047
2102
|
// but still use backoff to avoid thundering herd
|
|
2048
2103
|
if (this._serverRestarting) {
|
|
@@ -2056,7 +2111,7 @@ class ClaudeCodeWebInterface {
|
|
|
2056
2111
|
if (restartGen !== this._socketGeneration) return;
|
|
2057
2112
|
this.reconnect();
|
|
2058
2113
|
}, restartBackoff);
|
|
2059
|
-
} else if (!event.wasClean && this.reconnectAttempts < this.maxReconnectAttempts) {
|
|
2114
|
+
} else if ((!event.wasClean || voiceClose.rejected) && this.reconnectAttempts < this.maxReconnectAttempts) {
|
|
2060
2115
|
this.updateStatus('Reconnecting (' + (this.reconnectAttempts + 1) + '/' + this.maxReconnectAttempts + ')...');
|
|
2061
2116
|
// First attempt is fast (250ms covers a server-process restart window);
|
|
2062
2117
|
// subsequent attempts use exponential backoff with jitter.
|
|
@@ -2174,6 +2229,17 @@ class ClaudeCodeWebInterface {
|
|
|
2174
2229
|
}
|
|
2175
2230
|
}
|
|
2176
2231
|
|
|
2232
|
+
// Send a binary WS frame (e.g. a voice PCM frame). Returns true if it was
|
|
2233
|
+
// handed to an OPEN socket, false otherwise so the caller can react to a
|
|
2234
|
+
// closed/closing socket instead of silently dropping the frame.
|
|
2235
|
+
sendBinary(view) {
|
|
2236
|
+
if (this.socket && this.socket.readyState === WebSocket.OPEN) {
|
|
2237
|
+
this.socket.send(view);
|
|
2238
|
+
return true;
|
|
2239
|
+
}
|
|
2240
|
+
return false;
|
|
2241
|
+
}
|
|
2242
|
+
|
|
2177
2243
|
_handleStickyNoteUpdate(message) {
|
|
2178
2244
|
if (!message || !message.sessionId) return;
|
|
2179
2245
|
const sm = this.sessionTabManager;
|
|
@@ -4368,6 +4434,9 @@ class ClaudeCodeWebInterface {
|
|
|
4368
4434
|
log: (m) => console.warn('[heartbeat]', m),
|
|
4369
4435
|
});
|
|
4370
4436
|
this._heartbeat.start();
|
|
4437
|
+
// If a recording is in progress (e.g. this heartbeat was re-created after
|
|
4438
|
+
// a reconnect mid-recording), keep pong-timeout enforcement suspended.
|
|
4439
|
+
if (this._voiceRecordingActive) this._heartbeat.pause();
|
|
4371
4440
|
// Keep _heartbeatTimer/_pongTimer references in sync for legacy code
|
|
4372
4441
|
// (disconnect() still nulls them defensively); the watchdog owns the
|
|
4373
4442
|
// real timer lifecycle via stop().
|
|
@@ -59,6 +59,10 @@
|
|
|
59
59
|
this._clearTimeout = t.clearTimeout || ((id) => clearTimeout(id));
|
|
60
60
|
this._heartbeatTimer = null;
|
|
61
61
|
this._pongTimer = null;
|
|
62
|
+
// When paused (e.g. during mic recording), pings still go out but a
|
|
63
|
+
// missed pong does NOT force a reconnect — the client main thread can
|
|
64
|
+
// be busy capturing audio and briefly stop servicing the pong.
|
|
65
|
+
this._paused = false;
|
|
62
66
|
}
|
|
63
67
|
|
|
64
68
|
_isStale() {
|
|
@@ -75,6 +79,9 @@
|
|
|
75
79
|
} catch (_) {
|
|
76
80
|
return;
|
|
77
81
|
}
|
|
82
|
+
// Paused: keep liveness pings flowing but do NOT arm the pong-timeout
|
|
83
|
+
// (a missed pong while recording must not force-close the socket).
|
|
84
|
+
if (this._paused) return;
|
|
78
85
|
if (this._pongTimer) this._clearTimeout(this._pongTimer);
|
|
79
86
|
this._pongTimer = this._setTimeout(() => {
|
|
80
87
|
if (this._isStale()) return;
|
|
@@ -118,6 +125,24 @@
|
|
|
118
125
|
this._pongTimer = null;
|
|
119
126
|
}
|
|
120
127
|
}
|
|
128
|
+
|
|
129
|
+
/**
|
|
130
|
+
* Suspend pong-timeout enforcement (pings continue). Use while the client
|
|
131
|
+
* main thread may be busy enough to miss a pong — e.g. mic recording —
|
|
132
|
+
* so a transient stall doesn't trigger a spurious reconnect.
|
|
133
|
+
*/
|
|
134
|
+
pause() {
|
|
135
|
+
this._paused = true;
|
|
136
|
+
if (this._pongTimer) {
|
|
137
|
+
this._clearTimeout(this._pongTimer);
|
|
138
|
+
this._pongTimer = null;
|
|
139
|
+
}
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/** Resume normal pong-timeout enforcement (next ping re-arms it). */
|
|
143
|
+
resume() {
|
|
144
|
+
this._paused = false;
|
|
145
|
+
}
|
|
121
146
|
}
|
|
122
147
|
|
|
123
148
|
return HeartbeatWatchdog;
|
package/src/public/index.html
CHANGED
|
@@ -801,6 +801,7 @@
|
|
|
801
801
|
<script src="vscode-tunnel.js"></script>
|
|
802
802
|
<script src="app-tunnel.js"></script>
|
|
803
803
|
<script src="voice-handler.js"></script>
|
|
804
|
+
<script src="voice-frame.js"></script>
|
|
804
805
|
<script src="command-palette.js"></script>
|
|
805
806
|
<script src="extra-keys.js"></script>
|
|
806
807
|
<script src="input-overlay.js"></script>
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
/**
 * VoiceFrame
 *
 * Pure helpers for the client->server binary voice path, factored out of app.js
 * so they can be unit-tested in Node. UMD-shaped: exported through CommonJS
 * `module.exports` when that exists, otherwise attached as `VoiceFrame` to the
 * global object (`window.VoiceFrame` in the browser).
 */
(function (global, factory) {
  if (typeof module === 'object' && module.exports) {
    module.exports = factory();
  } else {
    global.VoiceFrame = factory();
  }
}(
  // Resolve the global object defensively: `self` covers browsers and workers;
  // `globalThis` covers hosts without `self`, where a bare `this` is undefined
  // in strict / ES-module scope and assigning onto it would throw.
  typeof self !== 'undefined' ? self
    : (typeof globalThis !== 'undefined' ? globalThis : this),
  function () {

    // Wire header: [ "VUP1" (4) ][ version (1) ][ type (1) ] then raw 16-bit PCM.
    var MAGIC_V = 0x56; // 'V'
    var MAGIC_U = 0x55; // 'U'
    var MAGIC_P = 0x50; // 'P'
    var MAGIC_1 = 0x31; // '1'
    var PROTO_VERSION = 0x01;
    var FRAME_TYPE_PCM = 0x01;
    var HEADER_BYTES = 6;

    /**
     * Build a binary voice frame: the 6-byte header followed by the PCM bytes
     * of `samples`.
     *
     * For a typed-array / DataView input, byteOffset/byteLength are honoured so
     * a subarray-backed Int16Array contributes only its own window, not the
     * whole underlying buffer. A plain ArrayBuffer is accepted as-is. Anything
     * else is rejected instead of silently producing a header-only frame.
     *
     * @param {Int16Array|ArrayBufferView|ArrayBuffer} samples
     * @returns {Uint8Array} header + PCM bytes (a fresh copy)
     * @throws {TypeError} if `samples` is neither an ArrayBuffer view nor an ArrayBuffer
     */
    function buildVoiceFrame(samples) {
      var pcm;
      if (ArrayBuffer.isView(samples)) {
        pcm = new Uint8Array(samples.buffer, samples.byteOffset, samples.byteLength);
      } else if (samples instanceof ArrayBuffer) {
        pcm = new Uint8Array(samples);
      } else {
        throw new TypeError('buildVoiceFrame expects an Int16Array, another ArrayBuffer view, or an ArrayBuffer');
      }

      var frame = new Uint8Array(HEADER_BYTES + pcm.length);
      frame[0] = MAGIC_V;
      frame[1] = MAGIC_U;
      frame[2] = MAGIC_P;
      frame[3] = MAGIC_1;
      frame[4] = PROTO_VERSION;
      frame[5] = FRAME_TYPE_PCM;
      frame.set(pcm, HEADER_BYTES);
      return frame;
    }

    /**
     * Classify a WebSocket close code for the voice path.
     *
     * 1009 (server rejected an oversized frame) and 1003 (unsupported/garbage
     * binary) are server-initiated CLEAN closes, so `event.wasClean` is true and
     * a reconnect path keyed only on `!wasClean` would skip them. They are
     * reported as `rejected` so the caller can show a specific message and
     * still reconnect.
     *
     * @param {number} code WebSocket close code
     * @returns {{rejected: boolean, message: (string|null)}}
     */
    function classifyVoiceClose(code) {
      if (code === 1009 || code === 1003) {
        return {
          rejected: true,
          message: 'A voice message was rejected by the server. Reconnecting…'
        };
      }
      return { rejected: false, message: null };
    }

    return {
      HEADER_BYTES: HEADER_BYTES,
      buildVoiceFrame: buildVoiceFrame,
      classifyVoiceClose: classifyVoiceClose
    };
  }
));
|
|
@@ -625,6 +625,9 @@ function VoiceInputController(options) {
|
|
|
625
625
|
this._onTranscription = options.onTranscription || null;
|
|
626
626
|
this._onError = options.onError || null;
|
|
627
627
|
this._onCancel = options.onCancel || null;
|
|
628
|
+
// Optional predicate: if it returns false, a start request is ignored (e.g.
|
|
629
|
+
// a previous transcription is still pending). Gates both button + keyboard.
|
|
630
|
+
this._canStart = options.canStart || null;
|
|
628
631
|
|
|
629
632
|
this._recorder = null;
|
|
630
633
|
this._starting = false;
|
|
@@ -670,6 +673,7 @@ VoiceInputController.prototype.startRecording = function () {
|
|
|
670
673
|
var self = this;
|
|
671
674
|
if (self._starting) return;
|
|
672
675
|
if (self._recorder && self._recorder.isRecording) return;
|
|
676
|
+
if (self._canStart && !self._canStart()) return;
|
|
673
677
|
|
|
674
678
|
self._starting = true;
|
|
675
679
|
self._recorder = self._createRecorder();
|
package/src/server.js
CHANGED
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
const express = require('express');
|
|
2
2
|
const http = require('http');
|
|
3
3
|
const https = require('https');
|
|
4
|
+
const net = require('net');
|
|
4
5
|
const fs = require('fs');
|
|
5
6
|
const path = require('path');
|
|
6
7
|
const os = require('os');
|
|
@@ -46,6 +47,18 @@ const RestartManager = require('./restart-manager');
|
|
|
46
47
|
// See docs/audits/hot-03-ws-frame-size.md.
|
|
47
48
|
const MAX_WS_MESSAGE_BYTES = 1 * 1024 * 1024;
|
|
48
49
|
|
|
50
|
+
// Inbound binary voice frames (client mic -> server STT) bypass the JSON guard
|
|
51
|
+
// above. Framing + validation (incl. the Buffer[] fragmented-frame normalize)
|
|
52
|
+
// lives in utils/ws-voice-frame so it can be unit-tested without a live socket.
|
|
53
|
+
// A frame is bounded by MAX_VOICE_BINARY_FRAME_BYTES (oversize -> 1009 close,
|
|
54
|
+
// like the text guard); a bad/short header -> 1003 close.
|
|
55
|
+
const {
|
|
56
|
+
MAX_VOICE_PCM_BYTES,
|
|
57
|
+
MAX_VOICE_BINARY_FRAME_BYTES,
|
|
58
|
+
normalizeBinaryMessage,
|
|
59
|
+
classifyVoiceFrame,
|
|
60
|
+
} = require('./utils/ws-voice-frame');
|
|
61
|
+
|
|
49
62
|
// Pre-built PWA screenshot SVG buffers (served at /screenshot-wide.png and /screenshot-narrow.png)
|
|
50
63
|
const SCREENSHOT_WIDE_BUF = Buffer.from(`
|
|
51
64
|
<svg width="1280" height="720" viewBox="0 0 1280 720" xmlns="http://www.w3.org/2000/svg">
|
|
@@ -148,8 +161,6 @@ class ClaudeCodeWebServer {
|
|
|
148
161
|
modelsDir: options.sttModelDir,
|
|
149
162
|
numThreads: options.sttThreads ? parseInt(options.sttThreads, 10) : undefined,
|
|
150
163
|
});
|
|
151
|
-
this._voiceUploadCounts = new Map();
|
|
152
|
-
|
|
153
164
|
// Per-tab local-LLM "sticky note" summariser. ON by default for AI-agent
|
|
154
165
|
// tabs; disable globally with --no-sticky-notes / AIORDIE_DISABLE_STICKY_NOTES=1
|
|
155
166
|
// (sticky-notes only — does NOT affect STT). The engine lazily downloads its
|
|
@@ -1319,11 +1330,6 @@ class ClaudeCodeWebServer {
|
|
|
1319
1330
|
// weeks of uptime. See _cleanupFsWatchSession.
|
|
1320
1331
|
this._cleanupFsWatchSession(sessionId, 'session_deleted');
|
|
1321
1332
|
|
|
1322
|
-
// Drop the voice-upload rate-limit history for this session. Map
|
|
1323
|
-
// grew unbounded across session-create/delete churn on long-lived
|
|
1324
|
-
// servers (smaller cousin of the _fsWatchSessions leak).
|
|
1325
|
-
this._voiceUploadCounts.delete(sessionId);
|
|
1326
|
-
|
|
1327
1333
|
// Stop + tear down the summariser so an in-flight inference is discarded.
|
|
1328
1334
|
this.stickyNoteSummarizer.cancel(sessionId);
|
|
1329
1335
|
this._stickyJsonl.delete(sessionId);
|
|
@@ -3151,6 +3157,7 @@ class ClaudeCodeWebServer {
|
|
|
3151
3157
|
this._ensureStickyNoteEngine();
|
|
3152
3158
|
|
|
3153
3159
|
let server;
|
|
3160
|
+
let wsHost; // the server the WebSocket server attaches to (TLS server in HTTPS mode)
|
|
3154
3161
|
|
|
3155
3162
|
if (this.useHttps) {
|
|
3156
3163
|
let cert, key;
|
|
@@ -3177,13 +3184,90 @@ class ClaudeCodeWebServer {
|
|
|
3177
3184
|
console.log(' Browsers will show a security warning on first visit.');
|
|
3178
3185
|
console.log(' For a trusted, installable origin use \x1b[1m--tunnel\x1b[0m.');
|
|
3179
3186
|
}
|
|
3180
|
-
|
|
3187
|
+
|
|
3188
|
+
// The real TLS app server. The WebSocket server attaches HERE so a wss://
|
|
3189
|
+
// upgrade arrives over an encrypted TLSSocket (req.socket.encrypted stays
|
|
3190
|
+
// true for the secure-context / voice checks).
|
|
3191
|
+
const tlsServer = https.createServer({ cert, key }, this.app);
|
|
3192
|
+
|
|
3193
|
+
// Build the https redirect target for a plaintext request.
//  - Host: the Host header is client-controlled, so accept ONLY a bare
//    hostname[:port] (or [ipv6][:port]) — reject userinfo (`@`), paths, and
//    control chars to prevent an open redirect to an external origin
//    (e.g. Host: user:pass@evil.com). Fall back to localhost otherwise.
//  - Port: any port in Host is stripped; the port of the local socket the
//    request arrived on is used instead (this.port if that is unavailable).
//  - Path: the request target is appended only when it is origin-form
//    (starts with '/'). An absolute-form ('http://other/...') or
//    asterisk-form ('*') target would otherwise be glued directly onto the
//    port and yield a malformed Location, so those redirect to '/'.
const redirectLocation = (req) => {
  const raw = String(req.headers.host || '');
  const validHost = /^[A-Za-z0-9.-]+(?::\d+)?$/.test(raw)
    || /^\[[0-9a-fA-F:]+\](?::\d+)?$/.test(raw);
  const hostname = validHost ? raw.replace(/:\d+$/, '') : 'localhost';
  const port = req.socket.localPort || this.port;
  const target = (typeof req.url === 'string' && req.url.startsWith('/')) ? req.url : '/';
  // Belt and braces: never let CR/LF reach a header value.
  return `https://${hostname}:${port}${target}`.replace(/[\r\n]/g, '');
};
|
|
3207
|
+
|
|
3208
|
+
// Plaintext HTTP on the SAME port -> redirect to https. A user who reaches
|
|
3209
|
+
// http://host:PORT is auto-upgraded instead of getting an opaque
|
|
3210
|
+
// TLS-handshake error. 307 keeps the method and is not cached as permanent
|
|
3211
|
+
// (switching the port back to http mode later isn't poisoned by a stale 301).
|
|
3212
|
+
const httpRedirectServer = http.createServer((req, res) => {
|
|
3213
|
+
const location = redirectLocation(req);
|
|
3214
|
+
res.writeHead(307, { Location: location, 'Content-Type': 'text/plain' });
|
|
3215
|
+
res.end(`Redirecting to ${location}\n`);
|
|
3216
|
+
});
|
|
3217
|
+
// A plaintext ws:// upgrade to the TLS port: answer with the same redirect
|
|
3218
|
+
// (written raw — an upgrade has no res object) instead of an abrupt RST.
|
|
3219
|
+
httpRedirectServer.on('upgrade', (req, socket) => {
|
|
3220
|
+
const location = redirectLocation(req);
|
|
3221
|
+
try {
|
|
3222
|
+
socket.end(
|
|
3223
|
+
'HTTP/1.1 307 Temporary Redirect\r\n' +
|
|
3224
|
+
`Location: ${location}\r\n` +
|
|
3225
|
+
'Connection: close\r\n\r\n'
|
|
3226
|
+
);
|
|
3227
|
+
} catch (_) { try { socket.destroy(); } catch (__) { /* ignore */ } }
|
|
3228
|
+
});
|
|
3229
|
+
|
|
3230
|
+
// Front both with a 1-byte sniffer: a TLS ClientHello starts with 0x16
|
|
3231
|
+
// (handshake record); anything else is plaintext HTTP. One listening port
|
|
3232
|
+
// therefore serves both — http:// and https:// to PORT both work.
|
|
3233
|
+
this._proxySockets = new Set();
|
|
3234
|
+
server = net.createServer((socket) => {
|
|
3235
|
+
this._proxySockets.add(socket);
|
|
3236
|
+
socket.once('close', () => this._proxySockets.delete(socket));
|
|
3237
|
+
// Pre-handoff guards: drop a connection that errors or sends no data
|
|
3238
|
+
// (port scanner / slowloris) before we know which server owns it. Both
|
|
3239
|
+
// are cleared the moment we route, so the target server's own lifecycle
|
|
3240
|
+
// and timeouts take over cleanly.
|
|
3241
|
+
const sniffTimer = setTimeout(() => { try { socket.destroy(); } catch (_) { /* ignore */ } }, 10000);
|
|
3242
|
+
const onSniffError = () => {
|
|
3243
|
+
clearTimeout(sniffTimer);
|
|
3244
|
+
try { socket.destroy(); } catch (_) { /* ignore */ }
|
|
3245
|
+
};
|
|
3246
|
+
socket.on('error', onSniffError);
|
|
3247
|
+
socket.once('readable', () => {
|
|
3248
|
+
clearTimeout(sniffTimer);
|
|
3249
|
+
socket.removeListener('error', onSniffError);
|
|
3250
|
+
const chunk = socket.read(1);
|
|
3251
|
+
if (!chunk) { socket.destroy(); return; }
|
|
3252
|
+
socket.unshift(chunk);
|
|
3253
|
+
const target = chunk[0] === 0x16 ? tlsServer : httpRedirectServer;
|
|
3254
|
+
target.emit('connection', socket);
|
|
3255
|
+
});
|
|
3256
|
+
});
|
|
3257
|
+
|
|
3258
|
+
this._tlsServer = tlsServer;
|
|
3259
|
+
this._httpRedirectServer = httpRedirectServer;
|
|
3260
|
+
wsHost = tlsServer;
|
|
3261
|
+
console.log(' http:// requests on this port auto-upgrade to https.');
|
|
3181
3262
|
} else {
|
|
3182
3263
|
server = http.createServer(this.app);
|
|
3264
|
+
this._tlsServer = null;
|
|
3265
|
+
this._httpRedirectServer = null;
|
|
3266
|
+
wsHost = server;
|
|
3183
3267
|
}
|
|
3184
3268
|
|
|
3185
3269
|
this.wss = new WebSocket.Server({
|
|
3186
|
-
server,
|
|
3270
|
+
server: wsHost,
|
|
3187
3271
|
maxPayload: 8 * 1024 * 1024,
|
|
3188
3272
|
// Compression disabled — binary frames already send with compress:false,
|
|
3189
3273
|
// and JSON control messages are small/infrequent. Saves ~300KB per connection
|
|
@@ -3237,7 +3321,39 @@ class ClaudeCodeWebServer {
|
|
|
3237
3321
|
};
|
|
3238
3322
|
this.webSocketConnections.set(wsId, wsInfo);
|
|
3239
3323
|
|
|
3240
|
-
ws.on('message', (message) => {
|
|
3324
|
+
ws.on('message', (message, isBinary) => {
|
|
3325
|
+
// Inbound BINARY frames are voice audio (client mic). Handle them BEFORE
|
|
3326
|
+
// the JSON guard below: they legitimately exceed 1 MiB (up to 3.84 MB of
|
|
3327
|
+
// 120 s PCM) and must not be killed by the text-frame guard. They are
|
|
3328
|
+
// still bounded (oversize -> 1009; bad/short header -> 1003) so this does
|
|
3329
|
+
// not reopen the event-loop-DoS hole the JSON guard closes.
|
|
3330
|
+
if (isBinary) {
|
|
3331
|
+
// ws delivers a Buffer when un-fragmented and a Buffer[] when the frame
|
|
3332
|
+
// arrived in multiple WS continuation fragments. Normalize first, then
|
|
3333
|
+
// classify on the normalized buffer (never on `message.length`, which is
|
|
3334
|
+
// the fragment COUNT for an array).
|
|
3335
|
+
const buf = normalizeBinaryMessage(message);
|
|
3336
|
+
const verdict = classifyVoiceFrame(buf);
|
|
3337
|
+
if (verdict.action === 'oversize') {
|
|
3338
|
+
try {
|
|
3339
|
+
this.sendToWebSocket(ws, {
|
|
3340
|
+
type: 'error',
|
|
3341
|
+
code: 'message_too_large',
|
|
3342
|
+
message: `Binary voice frame exceeds ${MAX_VOICE_BINARY_FRAME_BYTES} bytes`,
|
|
3343
|
+
received_bytes: buf.length,
|
|
3344
|
+
limit_bytes: MAX_VOICE_BINARY_FRAME_BYTES,
|
|
3345
|
+
});
|
|
3346
|
+
} catch (_) { /* socket may be half-closed */ }
|
|
3347
|
+
try { ws.close(1009, 'message_too_large'); } catch (_) {}
|
|
3348
|
+
return;
|
|
3349
|
+
}
|
|
3350
|
+
if (verdict.action === 'unsupported') {
|
|
3351
|
+
try { ws.close(1003, 'unsupported binary'); } catch (_) {}
|
|
3352
|
+
return;
|
|
3353
|
+
}
|
|
3354
|
+
this.handleVoiceBinary(wsId, verdict.pcm);
|
|
3355
|
+
return;
|
|
3356
|
+
}
|
|
3241
3357
|
// HOT-08: application-layer size guard, runs BEFORE JSON.parse.
|
|
3242
3358
|
// Buffer.byteLength handles both string and Buffer message types.
|
|
3243
3359
|
// On oversize, send a marker error frame and close with WS-standard
|
|
@@ -4677,11 +4793,11 @@ class ClaudeCodeWebServer {
|
|
|
4677
4793
|
}
|
|
4678
4794
|
|
|
4679
4795
|
// Same per-session cleanup contract as the DELETE handler:
|
|
4680
|
-
// tear down any orphan fs-watch SSE
|
|
4681
|
-
//
|
|
4682
|
-
//
|
|
4796
|
+
// tear down any orphan fs-watch SSE BEFORE removing the parent session
|
|
4797
|
+
// entry, otherwise the chokidar watcher leaks (PR #99 regression). The
|
|
4798
|
+
// voice-upload rate-limit history lives on the session object and is
|
|
4799
|
+
// dropped with it below.
|
|
4683
4800
|
try { this._cleanupFsWatchSession(top.id, 'session_evicted'); } catch (_) { /* ignore */ }
|
|
4684
|
-
try { this._voiceUploadCounts.delete(top.id); } catch (_) { /* ignore */ }
|
|
4685
4801
|
try { this.stickyNoteSummarizer.cancel(top.id); } catch (_) { /* ignore */ }
|
|
4686
4802
|
try { this._stickyJsonl.delete(top.id); } catch (_) { /* ignore */ }
|
|
4687
4803
|
if (this._foregroundSessionId === top.id) this._foregroundSessionId = null;
|
|
@@ -4888,7 +5004,8 @@ class ClaudeCodeWebServer {
|
|
|
4888
5004
|
total: this.claudeSessions.size,
|
|
4889
5005
|
ws_connections: this.webSocketConnections.size,
|
|
4890
5006
|
fs_watch_sessions: (this._fsWatchSessions && this._fsWatchSessions.size) || 0,
|
|
4891
|
-
voice_upload_counts: (this.
|
|
5007
|
+
voice_upload_counts: Array.from(this.claudeSessions.values())
|
|
5008
|
+
.filter(s => s._voiceUploadTimestamps && s._voiceUploadTimestamps.length).length,
|
|
4892
5009
|
activity_broadcast_timestamps: (this.activityBroadcastTimestamps && this.activityBroadcastTimestamps.size) || 0,
|
|
4893
5010
|
},
|
|
4894
5011
|
// DISK-02/03: cached disk usage sample (60 s TTL, never blocks the
|
|
@@ -5225,6 +5342,23 @@ class ClaudeCodeWebServer {
|
|
|
5225
5342
|
if (this.server) {
|
|
5226
5343
|
this.server.close();
|
|
5227
5344
|
}
|
|
5345
|
+
// In HTTPS mode `this.server` is the TLS-sniffing proxy that owns the
|
|
5346
|
+
// listening port; the TLS app server and the http->https redirect server sit
|
|
5347
|
+
// behind it. Sockets are handed to them via emit('connection'), bypassing
|
|
5348
|
+
// their internal connection tracking, so destroy the proxied sockets here
|
|
5349
|
+
// (and close the inner servers) to avoid keep-alive connections lingering.
|
|
5350
|
+
if (this._proxySockets) {
|
|
5351
|
+
for (const s of this._proxySockets) {
|
|
5352
|
+
try { s.destroy(); } catch (_) { /* ignore */ }
|
|
5353
|
+
}
|
|
5354
|
+
this._proxySockets.clear();
|
|
5355
|
+
}
|
|
5356
|
+
if (this._tlsServer) {
|
|
5357
|
+
try { this._tlsServer.close(); } catch (_) { /* ignore */ }
|
|
5358
|
+
}
|
|
5359
|
+
if (this._httpRedirectServer) {
|
|
5360
|
+
try { this._httpRedirectServer.close(); } catch (_) { /* ignore */ }
|
|
5361
|
+
}
|
|
5228
5362
|
|
|
5229
5363
|
// Flush pending output and stop all sessions with a 5-second timeout
|
|
5230
5364
|
const stopPromises = [];
|
|
@@ -5456,10 +5590,40 @@ class ClaudeCodeWebServer {
|
|
|
5456
5590
|
}
|
|
5457
5591
|
}
|
|
5458
5592
|
|
|
5593
|
+
// Thin shim for the legacy base64-JSON voice_upload path. The 'Missing audio
|
|
5594
|
+
// data' guard must live HERE (the binary path has no data.audio); after the
|
|
5595
|
+
// binary-frame switch no live client emits this, but it is kept for
|
|
5596
|
+
// back-compat and shares the validation/transcribe core below.
|
|
5459
5597
|
async handleVoiceUpload(wsId, data) {
|
|
5460
5598
|
const wsInfo = this.webSocketConnections.get(wsId);
|
|
5461
5599
|
if (!wsInfo) return;
|
|
5462
5600
|
|
|
5601
|
+
if (!data.audio || typeof data.audio !== 'string') {
|
|
5602
|
+
this.sendToWebSocket(wsInfo.ws, {
|
|
5603
|
+
type: 'voice_transcription_error',
|
|
5604
|
+
message: 'Missing audio data'
|
|
5605
|
+
});
|
|
5606
|
+
return;
|
|
5607
|
+
}
|
|
5608
|
+
|
|
5609
|
+
await this._processVoicePcm(wsId, Buffer.from(data.audio, 'base64'));
|
|
5610
|
+
}
|
|
5611
|
+
|
|
5612
|
+
// Binary voice frame path. The ws dispatcher has already validated the 6-byte
|
|
5613
|
+
// header and sliced it off, so `pcmBuffer` is raw 16-bit PCM.
|
|
5614
|
+
async handleVoiceBinary(wsId, pcmBuffer) {
|
|
5615
|
+
await this._processVoicePcm(wsId, pcmBuffer);
|
|
5616
|
+
}
|
|
5617
|
+
|
|
5618
|
+
// Shared voice core for both the base64 shim and the binary path. Check order
|
|
5619
|
+
// is cheapest/most-restrictive first; the rate limit stays BEFORE the isReady
|
|
5620
|
+
// gate (so it is enforced even when STT is unavailable), and the int16->float32
|
|
5621
|
+
// conversion is deferred to the STT worker (transcribePcm16) rather than run on
|
|
5622
|
+
// the event loop here.
|
|
5623
|
+
async _processVoicePcm(wsId, pcmBuffer) {
|
|
5624
|
+
const wsInfo = this.webSocketConnections.get(wsId);
|
|
5625
|
+
if (!wsInfo) return;
|
|
5626
|
+
|
|
5463
5627
|
// Reject voice uploads over HTTP from non-localhost origins (defense-in-depth)
|
|
5464
5628
|
if (!wsInfo.secure && !this._isLocalhostConnection(wsInfo.ws)) {
|
|
5465
5629
|
this.sendToWebSocket(wsInfo.ws, {
|
|
@@ -5494,23 +5658,21 @@ class ClaudeCodeWebServer {
|
|
|
5494
5658
|
return;
|
|
5495
5659
|
}
|
|
5496
5660
|
|
|
5497
|
-
// Rate limit: max 10 voice uploads per minute per session
|
|
5498
|
-
|
|
5499
|
-
|
|
5500
|
-
|
|
5501
|
-
}
|
|
5502
|
-
const timestamps = this._voiceUploadCounts.get(sessionId);
|
|
5661
|
+
// Rate limit: max 10 voice uploads per minute per session. State lives on the
|
|
5662
|
+
// session object (mirrors image uploads at saveImageToTemp) so it shares the
|
|
5663
|
+
// session's lifetime — GC'd on session delete/evict, and correctly survives a
|
|
5664
|
+
// WS reconnect (the budget must NOT reset when the socket drops).
|
|
5503
5665
|
const now = Date.now();
|
|
5504
|
-
|
|
5505
|
-
|
|
5506
|
-
if (
|
|
5666
|
+
if (!session._voiceUploadTimestamps) session._voiceUploadTimestamps = [];
|
|
5667
|
+
session._voiceUploadTimestamps = session._voiceUploadTimestamps.filter(ts => now - ts < 60000);
|
|
5668
|
+
if (session._voiceUploadTimestamps.length >= 10) {
|
|
5507
5669
|
this.sendToWebSocket(wsInfo.ws, {
|
|
5508
5670
|
type: 'voice_transcription_error',
|
|
5509
5671
|
message: 'Rate limit exceeded: maximum 10 voice uploads per minute.'
|
|
5510
5672
|
});
|
|
5511
5673
|
return;
|
|
5512
5674
|
}
|
|
5513
|
-
|
|
5675
|
+
session._voiceUploadTimestamps.push(now);
|
|
5514
5676
|
|
|
5515
5677
|
if (!this.sttEngine.isReady()) {
|
|
5516
5678
|
this.sendToWebSocket(wsInfo.ws, {
|
|
@@ -5521,19 +5683,8 @@ class ClaudeCodeWebServer {
|
|
|
5521
5683
|
}
|
|
5522
5684
|
|
|
5523
5685
|
try {
|
|
5524
|
-
// Validate audio data
|
|
5525
|
-
if (!data.audio || typeof data.audio !== 'string') {
|
|
5526
|
-
this.sendToWebSocket(wsInfo.ws, {
|
|
5527
|
-
type: 'voice_transcription_error',
|
|
5528
|
-
message: 'Missing audio data'
|
|
5529
|
-
});
|
|
5530
|
-
return;
|
|
5531
|
-
}
|
|
5532
|
-
|
|
5533
|
-
const audioBuffer = Buffer.from(data.audio, 'base64');
|
|
5534
|
-
|
|
5535
5686
|
// Max 120s of 16kHz 16-bit mono PCM = 3,840,000 bytes
|
|
5536
|
-
if (
|
|
5687
|
+
if (pcmBuffer.length > MAX_VOICE_PCM_BYTES) {
|
|
5537
5688
|
this.sendToWebSocket(wsInfo.ws, {
|
|
5538
5689
|
type: 'voice_transcription_error',
|
|
5539
5690
|
message: 'Audio too long (max 120 seconds)'
|
|
@@ -5541,7 +5692,7 @@ class ClaudeCodeWebServer {
|
|
|
5541
5692
|
return;
|
|
5542
5693
|
}
|
|
5543
5694
|
|
|
5544
|
-
if (
|
|
5695
|
+
if (pcmBuffer.length < 2) {
|
|
5545
5696
|
this.sendToWebSocket(wsInfo.ws, {
|
|
5546
5697
|
type: 'voice_transcription_error',
|
|
5547
5698
|
message: 'Audio too short'
|
|
@@ -5549,7 +5700,7 @@ class ClaudeCodeWebServer {
|
|
|
5549
5700
|
return;
|
|
5550
5701
|
}
|
|
5551
5702
|
|
|
5552
|
-
if (
|
|
5703
|
+
if (pcmBuffer.length % 2 !== 0) {
|
|
5553
5704
|
this.sendToWebSocket(wsInfo.ws, {
|
|
5554
5705
|
type: 'voice_transcription_error',
|
|
5555
5706
|
message: 'Invalid audio data: buffer length must be even (16-bit PCM samples)'
|
|
@@ -5557,11 +5708,8 @@ class ClaudeCodeWebServer {
|
|
|
5557
5708
|
return;
|
|
5558
5709
|
}
|
|
5559
5710
|
|
|
5560
|
-
|
|
5561
|
-
|
|
5562
|
-
const float32 = this._int16ToFloat32(audioBuffer);
|
|
5563
|
-
|
|
5564
|
-
const text = await this.sttEngine.transcribe(float32);
|
|
5711
|
+
// Raw int16 PCM -> the worker converts to Float32 off the event loop.
|
|
5712
|
+
const text = await this.sttEngine.transcribePcm16(pcmBuffer);
|
|
5565
5713
|
|
|
5566
5714
|
this.sendToWebSocket(wsInfo.ws, {
|
|
5567
5715
|
type: 'voice_transcription',
|
|
@@ -5652,17 +5800,6 @@ class ClaudeCodeWebServer {
|
|
|
5652
5800
|
}
|
|
5653
5801
|
}
|
|
5654
5802
|
|
|
5655
|
-
_int16ToFloat32(int16Buffer) {
|
|
5656
|
-
// Copy to ensure 2-byte alignment (Node.js Buffers may have odd byteOffset)
|
|
5657
|
-
const aligned = new Uint8Array(int16Buffer).buffer;
|
|
5658
|
-
const int16 = new Int16Array(aligned);
|
|
5659
|
-
const float32 = new Float32Array(int16.length);
|
|
5660
|
-
for (let i = 0; i < int16.length; i++) {
|
|
5661
|
-
float32[i] = int16[i] / 32768.0;
|
|
5662
|
-
}
|
|
5663
|
-
return float32;
|
|
5664
|
-
}
|
|
5665
|
-
|
|
5666
5803
|
async saveImageToTemp(session, data) {
|
|
5667
5804
|
// Primary temp dir: .claude-images inside the session working directory
|
|
5668
5805
|
let tempDir = path.join(session.workingDir, '.claude-images');
|
package/src/stt-engine.js
CHANGED
|
@@ -93,6 +93,71 @@ class SttEngine {
|
|
|
93
93
|
return promise;
|
|
94
94
|
}
|
|
95
95
|
|
|
96
|
+
/**
|
|
97
|
+
* Transcribe raw 16-bit PCM. The int16->float32 conversion is deferred to the
|
|
98
|
+
* worker thread (see stt-worker.js) so the server event loop never runs the
|
|
99
|
+
* per-sample loop. Accepts an Int16Array, an ArrayBuffer, or any ArrayBuffer
|
|
100
|
+
* view (e.g. a Node Buffer) of raw little-endian 16-bit samples.
|
|
101
|
+
*
|
|
102
|
+
* @param {Int16Array|ArrayBuffer|ArrayBufferView} int16
|
|
103
|
+
* @returns {Promise<string>}
|
|
104
|
+
*/
|
|
105
|
+
transcribePcm16(int16) {
|
|
106
|
+
const int16arr = this._toInt16Array(int16);
|
|
107
|
+
|
|
108
|
+
if (this._sttEndpoint) {
|
|
109
|
+
// External endpoint has no worker — convert here and reuse the float32 path.
|
|
110
|
+
const float32 = new Float32Array(int16arr.length);
|
|
111
|
+
for (let i = 0; i < int16arr.length; i++) {
|
|
112
|
+
float32[i] = int16arr[i] / 32768.0;
|
|
113
|
+
}
|
|
114
|
+
return this._transcribeExternal(float32);
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
if (this._status !== 'ready') {
|
|
118
|
+
throw new Error(`STT engine not ready (status: ${this._status})`);
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
if (this._queue.length >= MAX_QUEUE_SIZE) {
|
|
122
|
+
throw new Error('STT busy, try again later');
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
const id = ++this._requestIdCounter;
|
|
126
|
+
|
|
127
|
+
const promise = new Promise((resolve, reject) => {
|
|
128
|
+
const timer = setTimeout(() => {
|
|
129
|
+
this._removeFromQueue(id);
|
|
130
|
+
reject(new Error('Transcription timed out'));
|
|
131
|
+
}, TRANSCRIPTION_TIMEOUT_MS);
|
|
132
|
+
|
|
133
|
+
this._queue.push({ id, pcm16: int16arr, resolve, reject, timer });
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
this._processQueue();
|
|
137
|
+
return promise;
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
// Copy an int16 input into a fresh, offset-0, even-length Int16Array. Always
|
|
141
|
+
// copies (even an Int16Array input) so the queued buffer is solely owned and
|
|
142
|
+
// can be safely TRANSFERRED to the worker. A Node Buffer slice can have an odd
|
|
143
|
+
// byteOffset (a direct `new Int16Array(buf.buffer, off)` would throw); an odd
|
|
144
|
+
// byteLength is floored to whole 16-bit samples — callers already reject odd
|
|
145
|
+
// lengths, this is defense-in-depth so the method never throws RangeError.
|
|
146
|
+
_toInt16Array(int16) {
|
|
147
|
+
let bytes;
|
|
148
|
+
if (int16 instanceof Int16Array || ArrayBuffer.isView(int16)) {
|
|
149
|
+
bytes = new Uint8Array(int16.buffer, int16.byteOffset, int16.byteLength);
|
|
150
|
+
} else if (int16 instanceof ArrayBuffer) {
|
|
151
|
+
bytes = new Uint8Array(int16);
|
|
152
|
+
} else {
|
|
153
|
+
throw new Error('transcribePcm16 expects an Int16Array, ArrayBuffer, or typed-array view');
|
|
154
|
+
}
|
|
155
|
+
const evenLen = bytes.byteLength - (bytes.byteLength % 2);
|
|
156
|
+
const copy = new Uint8Array(evenLen);
|
|
157
|
+
copy.set(bytes.subarray(0, evenLen));
|
|
158
|
+
return new Int16Array(copy.buffer);
|
|
159
|
+
}
|
|
160
|
+
|
|
96
161
|
_processQueue() {
|
|
97
162
|
if (this._currentRequest || this._queue.length === 0 || !this._worker) {
|
|
98
163
|
return;
|
|
@@ -101,11 +166,23 @@ class SttEngine {
|
|
|
101
166
|
const request = this._queue[0];
|
|
102
167
|
this._currentRequest = request;
|
|
103
168
|
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
169
|
+
// pcm16 path: TRANSFER the (solely-owned, freshly-copied by _toInt16Array)
|
|
170
|
+
// buffer to the worker — avoids a multi-MB structured-clone copy on the event
|
|
171
|
+
// loop. Safe because each request is posted exactly once (on worker crash the
|
|
172
|
+
// queue is rejected + cleared, so a posted/detached buffer is never requeued).
|
|
173
|
+
if (request.pcm16 !== undefined) {
|
|
174
|
+
this._worker.postMessage({
|
|
175
|
+
type: 'transcribe',
|
|
176
|
+
id: request.id,
|
|
177
|
+
pcm16: request.pcm16
|
|
178
|
+
}, [request.pcm16.buffer]);
|
|
179
|
+
} else {
|
|
180
|
+
this._worker.postMessage({
|
|
181
|
+
type: 'transcribe',
|
|
182
|
+
id: request.id,
|
|
183
|
+
samples: request.samples
|
|
184
|
+
});
|
|
185
|
+
}
|
|
109
186
|
}
|
|
110
187
|
|
|
111
188
|
_onWorkerMessage(msg) {
|
package/src/stt-worker.js
CHANGED
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
const { parentPort, workerData } = require('worker_threads');
|
|
4
4
|
const path = require('path');
|
|
5
5
|
const os = require('os');
|
|
6
|
+
const { pcm16ToFloat32 } = require('./utils/pcm.js');
|
|
6
7
|
|
|
7
8
|
// Set platform-specific library paths BEFORE requiring sherpa-onnx-node.
|
|
8
9
|
// The native .node addon dynamically loads shared libraries (onnxruntime.dll,
|
|
@@ -74,10 +75,22 @@ try {
|
|
|
74
75
|
parentPort.on('message', (msg) => {
|
|
75
76
|
if (msg.type === 'transcribe') {
|
|
76
77
|
try {
|
|
77
|
-
//
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
78
|
+
// Two input shapes:
|
|
79
|
+
// - msg.pcm16: raw 16-bit PCM (Int16Array). Conversion to Float32 runs
|
|
80
|
+
// HERE, in the worker thread, so the server event loop never does the
|
|
81
|
+
// per-sample loop (HOL-blocking input/ping for long clips).
|
|
82
|
+
// - msg.samples: a Float32Array (legacy / external-endpoint callers).
|
|
83
|
+
let samples;
|
|
84
|
+
if (msg.pcm16 !== undefined && msg.pcm16 !== null) {
|
|
85
|
+
const int16 = msg.pcm16 instanceof Int16Array
|
|
86
|
+
? msg.pcm16
|
|
87
|
+
: new Int16Array(msg.pcm16);
|
|
88
|
+
samples = pcm16ToFloat32(int16);
|
|
89
|
+
} else {
|
|
90
|
+
samples = msg.samples instanceof Float32Array
|
|
91
|
+
? msg.samples
|
|
92
|
+
: new Float32Array(msg.samples);
|
|
93
|
+
}
|
|
81
94
|
|
|
82
95
|
const stream = recognizer.createStream();
|
|
83
96
|
stream.acceptWaveform({ samples, sampleRate: 16000 });
|
package/src/utils/pcm.js
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Convert 16-bit PCM samples to normalized Float32 [-1, 1).
|
|
5
|
+
*
|
|
6
|
+
* Divisor is 32768.0 for every sample (matching the original server-side
|
|
7
|
+
* conversion): positive full-scale 32767 maps to ~0.99997, negative full-scale
|
|
8
|
+
* -32768 maps to exactly -1.0. Used by the STT worker (off the server event
|
|
9
|
+
* loop) and exercised directly in unit tests.
|
|
10
|
+
*
|
|
11
|
+
* @param {Int16Array} int16
|
|
12
|
+
* @returns {Float32Array}
|
|
13
|
+
*/
|
|
14
|
+
function pcm16ToFloat32(int16) {
  // Every sample is divided by the same full-scale value, so -32768 maps to
  // exactly -1.0 and 32767 lands just below +1.0.
  const FULL_SCALE = 32768.0;
  const sampleCount = int16.length;
  const normalized = new Float32Array(sampleCount);

  let index = 0;
  while (index < sampleCount) {
    normalized[index] = int16[index] / FULL_SCALE;
    index += 1;
  }

  return normalized;
}
|
|
21
|
+
|
|
22
|
+
module.exports = { pcm16ToFloat32 };
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* Inbound binary voice-frame framing (client mic -> server STT).
|
|
5
|
+
*
|
|
6
|
+
* Wire format:
|
|
7
|
+
* [4 bytes ASCII "VUP1"][1 byte version][1 byte type][raw 16-bit PCM @16kHz mono]
|
|
8
|
+
*
|
|
9
|
+
* Pure (no I/O) so the dispatcher logic in server.js can be unit-tested without
|
|
10
|
+
* a live socket — in particular the Buffer[] (fragmented frame) normalization,
|
|
11
|
+
* which is the one genuinely platform-dependent receive path.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
// Frame header layout: 4-byte ASCII magic, then one version byte and one type byte.
const VOICE_MAGIC = Buffer.from('VUP1', 'ascii');
const VOICE_PROTO_VERSION = 0x01;
const VOICE_FRAME_TYPE_PCM = 0x01;
const VOICE_HEADER_BYTES = VOICE_MAGIC.length + 2; // magic(4) + version(1) + type(1) = 6

// Payload cap: 120 s * 16000 samples/s * 2 bytes/sample (16-bit mono) = 3,840,000 bytes.
const MAX_VOICE_PCM_BYTES = 120 * 16000 * 2;
// Whole-frame cap: the header plus the largest allowed PCM payload.
const MAX_VOICE_BINARY_FRAME_BYTES = MAX_VOICE_PCM_BYTES + VOICE_HEADER_BYTES;
|
|
20
|
+
|
|
21
|
+
/**
|
|
22
|
+
* Normalize ws RawData to a single Buffer. ws delivers a Buffer when the frame
|
|
23
|
+
* is un-fragmented, a Buffer[] when it arrived in multiple WS continuation
|
|
24
|
+
* fragments (1-4 MB voice frames fragment variably across browsers/proxies/
|
|
25
|
+
* tunnels), or an ArrayBuffer under non-default options. Always size-check the
|
|
26
|
+
* RESULT of this, never the raw message (whose `.length` is the fragment count
|
|
27
|
+
* for an array).
|
|
28
|
+
*
|
|
29
|
+
* The concatenation is bounded: the ws server's `maxPayload` (8 MiB) caps the
|
|
30
|
+
* total message length during fragment reassembly and closes 1009 before the
|
|
31
|
+
* 'message' event fires, so this never concatenates more than 8 MiB.
|
|
32
|
+
*
|
|
33
|
+
* @param {Buffer|Buffer[]|ArrayBuffer|ArrayBufferView} message
|
|
34
|
+
* @returns {Buffer}
|
|
35
|
+
*/
|
|
36
|
+
function normalizeBinaryMessage(message) {
  // Un-fragmented frame: already a Buffer, return it untouched.
  if (Buffer.isBuffer(message)) return message;

  // Fragmented frame: join the fragments into one contiguous Buffer.
  if (Array.isArray(message)) return Buffer.concat(message);

  // Any other ArrayBuffer view (typed array, DataView): wrap exactly the bytes
  // the view covers. Passing the view straight to Buffer.from() would copy
  // ELEMENT values truncated to one byte each (an Int16Array would lose half
  // its data) and would ignore byteOffset/byteLength for a DataView.
  if (ArrayBuffer.isView(message)) {
    return Buffer.from(message.buffer, message.byteOffset, message.byteLength);
  }

  // Plain ArrayBuffer (or anything else Buffer.from() accepts).
  return Buffer.from(message);
}
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* Classify a normalized binary frame.
|
|
44
|
+
* - { action: 'oversize' } -> caller closes 1009 (message too big)
|
|
45
|
+
* - { action: 'unsupported' } -> caller closes 1003 (bad/short/unknown header)
|
|
46
|
+
* - { action: 'pcm', pcm } -> caller hands `pcm` (Buffer) to the STT core
|
|
47
|
+
*
|
|
48
|
+
* @param {Buffer} buf Normalized frame (see normalizeBinaryMessage).
|
|
49
|
+
* @returns {{action: string, pcm?: Buffer}}
|
|
50
|
+
*/
|
|
51
|
+
function classifyVoiceFrame(buf) {
  const frameBytes = buf.length;

  // Size is checked first so an over-long frame is reported as 'oversize'
  // regardless of what its header contains.
  if (frameBytes > MAX_VOICE_BINARY_FRAME_BYTES) {
    return { action: 'oversize' };
  }

  // The length test short-circuits the rest, so header bytes are only read
  // when a full header is actually present.
  const headerIsValid = frameBytes >= VOICE_HEADER_BYTES
    && buf.subarray(0, 4).equals(VOICE_MAGIC)
    && buf[4] === VOICE_PROTO_VERSION
    && buf[5] === VOICE_FRAME_TYPE_PCM;

  if (!headerIsValid) {
    return { action: 'unsupported' };
  }

  // Everything after the header is the PCM payload (a view, not a copy).
  const pcm = buf.subarray(VOICE_HEADER_BYTES);
  return { action: 'pcm', pcm };
}
|
|
63
|
+
|
|
64
|
+
module.exports = {
|
|
65
|
+
VOICE_MAGIC,
|
|
66
|
+
VOICE_PROTO_VERSION,
|
|
67
|
+
VOICE_FRAME_TYPE_PCM,
|
|
68
|
+
VOICE_HEADER_BYTES,
|
|
69
|
+
MAX_VOICE_PCM_BYTES,
|
|
70
|
+
MAX_VOICE_BINARY_FRAME_BYTES,
|
|
71
|
+
normalizeBinaryMessage,
|
|
72
|
+
classifyVoiceFrame,
|
|
73
|
+
};
|