@decentchat/decentchat-plugin 0.1.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,174 @@
1
+ import OpusScript from 'opusscript';
2
+
/**
 * Construction options for {@link AudioPipeline}. Every field is optional;
 * the pipeline substitutes the documented default for anything omitted.
 */
export interface AudioPipelineOptions {
  /** Decoder sample rate in Hz. Default: 48000. */
  sampleRate?: number;
  /** Number of audio channels. Default: 1 (mono). */
  channels?: number;
  /** Frame duration in milliseconds. Default: 20. */
  frameDuration?: number;
  /** RMS level (samples normalized to -1..1) at or above which a frame counts as speech. Default: 0.02. */
  vadThreshold?: number;
  /** Milliseconds of continuous silence that end an utterance. Default: 500. */
  vadSilenceMs?: number;
  /** Called when the first speech frame of an utterance is detected. */
  onSpeechStart?: () => void;
  /** Called when an utterance ends, with all PCM accumulated for it concatenated into one buffer. */
  onSpeechEnd?: (pcmBuffer: Buffer) => void;
  /** Optional sink for informational log lines. */
  log?: { info(s: string): void };
}
13
+
/**
 * AudioPipeline — decodes incoming Opus RTP packets to 16-bit PCM and detects
 * when the human stops speaking via energy-based (RMS) voice-activity detection.
 *
 * Flow per packet: strip the RTP header → decode the Opus payload → compute the
 * frame's RMS → update the speaking/silence state. While an utterance is in
 * progress every decoded frame (speech and silence) is buffered; once silence
 * has lasted `vadSilenceMs` the buffered PCM is concatenated and handed to
 * `onSpeechEnd`.
 *
 * NOTE(review): silence is measured with `Date.now()` sampled only when a packet
 * arrives. If the sender stops transmitting during silence, `onSpeechEnd` will
 * not fire until packets resume — confirm whether callers need a timer.
 */
export class AudioPipeline {
  private readonly decoder: OpusScript;
  private readonly sampleRate: number;
  private readonly channels: number;
  private readonly frameDuration: number;
  private readonly vadThreshold: number;
  private readonly vadSilenceMs: number;
  private readonly onSpeechStart?: () => void;
  private readonly onSpeechEnd?: (pcmBuffer: Buffer) => void;
  private readonly log?: { info: (s: string) => void };

  // VAD state
  private isSpeaking = false;
  private pcmChunks: Buffer[] = [];
  private silenceStart: number | null = null;

  // Set by destroy(); guards against using or deleting the decoder twice.
  private destroyed = false;

  /**
   * @param opts Pipeline configuration; omitted fields fall back to
   *   48000 Hz, 1 channel, 20 ms frames, 0.02 RMS threshold, 500 ms silence.
   */
  constructor(opts: AudioPipelineOptions = {}) {
    this.sampleRate = opts.sampleRate ?? 48000;
    this.channels = opts.channels ?? 1;
    this.frameDuration = opts.frameDuration ?? 20;
    this.vadThreshold = opts.vadThreshold ?? 0.02;
    this.vadSilenceMs = opts.vadSilenceMs ?? 500;
    this.onSpeechStart = opts.onSpeechStart;
    this.onSpeechEnd = opts.onSpeechEnd;
    this.log = opts.log;

    this.decoder = new OpusScript(this.sampleRate, this.channels, OpusScript.Application.AUDIO);
  }

  /**
   * Main entry point — called with a raw RTP packet from the WebRTC track.
   *
   * Packets that cannot be parsed or decoded are dropped; this method does not
   * throw for malformed input. It is a no-op after {@link destroy}.
   *
   * @param buf Complete RTP packet (header + Opus payload).
   */
  feedRtpPacket(buf: Buffer): void {
    // The decoder has been released; touching it again would fail.
    if (this.destroyed) return;

    const opusPayload = this.stripRtpHeader(buf);
    if (!opusPayload || opusPayload.length === 0) return;

    // Decode Opus → PCM Int16. The payload comes off the network, so a decode
    // failure must cost one frame, not crash the caller's receive handler.
    let pcm: Buffer;
    try {
      pcm = this.decoder.decode(opusPayload);
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      this.log?.info(`[AudioPipeline] Dropping undecodable Opus frame: ${reason}`);
      return;
    }
    if (!pcm || pcm.length === 0) return;

    const pcmBuf = Buffer.from(pcm.buffer, pcm.byteOffset, pcm.byteLength);
    const rms = this.computeRMS(this.toInt16Samples(pcm));
    const now = Date.now();

    if (rms >= this.vadThreshold) {
      // Speech detected
      if (!this.isSpeaking) {
        this.isSpeaking = true;
        this.log?.info(`[AudioPipeline] Speech started (RMS=${rms.toFixed(4)})`);
        this.onSpeechStart?.();
      }
      this.silenceStart = null;
      this.pcmChunks.push(pcmBuf);
      return;
    }

    // Silence outside an utterance is ignored.
    if (!this.isSpeaking) return;

    // Keep accumulating PCM during the silence gap (for continuity).
    this.pcmChunks.push(pcmBuf);

    if (this.silenceStart === null) {
      this.silenceStart = now;
    } else if (now - this.silenceStart >= this.vadSilenceMs) {
      // Silence threshold reached — emit the utterance. State is cleared
      // before the callback so a throwing callback cannot leave stale audio.
      this.log?.info(`[AudioPipeline] Speech ended after ${this.vadSilenceMs}ms silence`);
      const fullPcm = Buffer.concat(this.pcmChunks);
      this.isSpeaking = false;
      this.pcmChunks = [];
      this.silenceStart = null;
      this.onSpeechEnd?.(fullPcm);
    }
  }

  /**
   * Parse the RTP header and return the Opus payload that follows it.
   *
   * RTP header format:
   *   Byte 0:     V(2)|P(1)|X(1)|CC(4)
   *   Byte 1:     M(1)|PT(7)
   *   Bytes 2-3:  sequence number
   *   Bytes 4-7:  timestamp
   *   Bytes 8-11: SSRC
   *   Then CC*4 bytes of CSRC
   *   If X bit set: 4 bytes extension header + extension data
   *
   * @param buf Complete RTP packet.
   * @returns A view of `buf` covering only the payload (no copy), or `null`
   *   when the packet is truncated or carries no payload bytes.
   */
  stripRtpHeader(buf: Buffer): Buffer | null {
    if (buf.length < 12) return null;

    const byte0 = buf.readUInt8(0);
    const cc = byte0 & 0x0f; // CSRC count
    const hasExtension = ((byte0 >> 4) & 0x01) === 1; // X bit
    const hasPadding = ((byte0 >> 5) & 0x01) === 1; // P bit

    let offset = 12 + cc * 4; // Fixed header + CSRC list
    if (offset > buf.length) return null;

    if (hasExtension) {
      if (offset + 4 > buf.length) return null;
      // Extension header: 2 bytes profile-specific, 2 bytes length (in 32-bit words)
      const extLength = buf.readUInt16BE(offset + 2);
      offset += 4 + extLength * 4;
    }
    if (offset > buf.length) return null;

    let payloadEnd = buf.length;
    if (hasPadding && buf.length > offset) {
      // The last byte of a padded packet holds the padding length.
      payloadEnd -= buf.readUInt8(buf.length - 1);
    }

    // Also rejects a padding length that would reach back into the header.
    if (payloadEnd <= offset) return null;

    return buf.subarray(offset, payloadEnd);
  }

  /**
   * Compute root-mean-square energy level.
   * Normalizes Int16 samples to the -1..1 range first.
   *
   * @param pcm Signed 16-bit samples.
   * @returns RMS of the normalized samples; 0 for an empty input.
   */
  computeRMS(pcm: Int16Array): number {
    if (pcm.length === 0) return 0;
    let sumSquares = 0;
    for (const sample of pcm) {
      const normalized = sample / 32768;
      sumSquares += normalized * normalized;
    }
    return Math.sqrt(sumSquares / pcm.length);
  }

  /**
   * Reset VAD state (e.g. when peer disconnects).
   * Any PCM buffered for an in-progress utterance is discarded without
   * calling `onSpeechEnd`.
   */
  reset(): void {
    this.isSpeaking = false;
    this.pcmChunks = [];
    this.silenceStart = null;
  }

  /**
   * Clean up decoder resources. Safe to call more than once; only the first
   * call releases the decoder.
   */
  destroy(): void {
    if (this.destroyed) return;
    this.destroyed = true;
    this.reset();
    this.decoder.delete();
  }

  /**
   * View decoded PCM bytes as Int16 samples (platform byte order).
   * An Int16Array view requires an even byte offset, so a misaligned buffer is
   * copied into a fresh, aligned array instead of throwing a RangeError.
   */
  private toInt16Samples(pcm: Buffer): Int16Array {
    const sampleCount = Math.floor(pcm.byteLength / 2);
    if (pcm.byteOffset % 2 === 0) {
      return new Int16Array(pcm.buffer, pcm.byteOffset, sampleCount);
    }
    const aligned = new Int16Array(sampleCount);
    new Uint8Array(aligned.buffer).set(pcm.subarray(0, sampleCount * 2));
    return aligned;
  }
}