@decentchat/decentclaw 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +88 -0
- package/index.ts +17 -0
- package/openclaw.plugin.json +31 -0
- package/package.json +48 -0
- package/src/channel.ts +789 -0
- package/src/huddle/AudioPipeline.ts +174 -0
- package/src/huddle/BotHuddleManager.ts +882 -0
- package/src/huddle/SpeechToText.ts +223 -0
- package/src/huddle/TextToSpeech.ts +260 -0
- package/src/huddle/index.ts +8 -0
- package/src/monitor.ts +1266 -0
- package/src/peer/DecentChatNodePeer.ts +4570 -0
- package/src/peer/FileStore.ts +59 -0
- package/src/peer/NodeMessageProtocol.ts +1057 -0
- package/src/peer/SyncProtocol.ts +701 -0
- package/src/peer/polyfill.ts +43 -0
- package/src/peer-registry.ts +32 -0
- package/src/runtime.ts +63 -0
- package/src/types.ts +136 -0
|
@@ -0,0 +1,174 @@
|
|
|
1
|
+
import OpusScript from 'opusscript';
|
|
2
|
+
|
|
3
|
+
/**
 * Construction options for {@link AudioPipeline}. Every field is optional;
 * the defaults listed below are the ones applied by the constructor.
 */
export interface AudioPipelineOptions {
  /** Sample rate passed to the Opus decoder, in Hz. Defaults to 48000. */
  sampleRate?: number;

  /** Channel count passed to the Opus decoder. Defaults to 1 (mono). */
  channels?: number;

  /** Frame duration in milliseconds. Defaults to 20. */
  frameDuration?: number;

  /**
   * RMS level (samples normalized to the -1..1 range) at or above which a
   * decoded frame counts as speech. Defaults to 0.02.
   */
  vadThreshold?: number;

  /**
   * How long silence must last, in milliseconds, before the current
   * utterance is considered finished. Defaults to 500.
   */
  vadSilenceMs?: number;

  /** Invoked when a speech frame arrives while no utterance is in progress. */
  onSpeechStart?: () => void;

  /** Invoked with the concatenated PCM of an utterance once it has ended. */
  onSpeechEnd?: (pcmBuffer: Buffer) => void;

  /** Optional sink for informational log lines. */
  log?: { info: (s: string) => void };
}
|
|
13
|
+
|
|
14
|
+
/**
 * AudioPipeline — turns a stream of Opus-over-RTP packets into PCM utterances.
 *
 * Each packet is stripped of its RTP framing, decoded to 16-bit PCM and
 * classified as speech or silence by comparing its RMS energy against
 * `vadThreshold`. PCM is buffered from the first speech frame until silence
 * has lasted `vadSilenceMs`, at which point the whole utterance is handed to
 * `onSpeechEnd`.
 *
 * Note: the silence window is only evaluated when a packet arrives. If the
 * sender stops transmitting altogether, the buffered utterance is held until
 * the next packet arrives or `reset()` discards it.
 */
export class AudioPipeline {
  private decoder: OpusScript;
  private sampleRate: number;
  private channels: number;
  private frameDuration: number;
  private vadThreshold: number;
  private vadSilenceMs: number;
  private onSpeechStart?: () => void;
  private onSpeechEnd?: (pcmBuffer: Buffer) => void;
  private log?: { info: (s: string) => void };

  // VAD state
  private isSpeaking = false;
  private pcmChunks: Buffer[] = [];
  private silenceStart: number | null = null;

  // Set once destroy() has released the decoder; guards against reuse.
  private destroyed = false;

  /**
   * @param opts Optional tuning and callbacks; omitted fields fall back to
   *   48 kHz, mono, 20 ms frames, 0.02 RMS threshold and 500 ms of silence.
   */
  constructor(opts: AudioPipelineOptions = {}) {
    this.sampleRate = opts.sampleRate ?? 48000;
    this.channels = opts.channels ?? 1;
    this.frameDuration = opts.frameDuration ?? 20;
    this.vadThreshold = opts.vadThreshold ?? 0.02;
    this.vadSilenceMs = opts.vadSilenceMs ?? 500;
    this.onSpeechStart = opts.onSpeechStart;
    this.onSpeechEnd = opts.onSpeechEnd;
    this.log = opts.log;

    this.decoder = new OpusScript(this.sampleRate, this.channels, OpusScript.Application.AUDIO);
  }

  /**
   * Main entry point — called with a raw RTP packet from the WebRTC track.
   *
   * Packets that cannot be parsed or decoded are dropped; they never throw
   * out of this method. Calls made after `destroy()` are ignored.
   *
   * @param buf One complete RTP packet (header plus Opus payload).
   */
  feedRtpPacket(buf: Buffer): void {
    if (this.destroyed) return;

    const opusPayload = this.stripRtpHeader(buf);
    if (!opusPayload || opusPayload.length === 0) return;

    // Decode Opus → PCM Int16. The decoder can throw on a corrupt or
    // truncated payload; one bad packet must not take down the caller.
    let pcm: Buffer;
    try {
      pcm = this.decoder.decode(opusPayload);
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      this.log?.info(`[AudioPipeline] Dropping undecodable packet: ${reason}`);
      return;
    }
    if (!pcm || pcm.length === 0) return;

    const pcmBuf = Buffer.from(pcm.buffer, pcm.byteOffset, pcm.byteLength);
    const rms = this.computeRMS(this.toSamples(pcm));
    const now = Date.now();

    if (rms >= this.vadThreshold) {
      // Speech detected
      if (!this.isSpeaking) {
        this.isSpeaking = true;
        this.log?.info(`[AudioPipeline] Speech started (RMS=${rms.toFixed(4)})`);
        this.onSpeechStart?.();
      }
      this.silenceStart = null;
      this.pcmChunks.push(pcmBuf);
      return;
    }

    // Silence while idle carries no information; ignore it.
    if (!this.isSpeaking) return;

    // Keep silent frames inside an utterance so short pauses stay continuous.
    this.pcmChunks.push(pcmBuf);

    if (this.silenceStart === null) {
      this.silenceStart = now;
      return;
    }

    if (now - this.silenceStart >= this.vadSilenceMs) {
      // Silence threshold reached — emit speech
      this.log?.info(`[AudioPipeline] Speech ended after ${this.vadSilenceMs}ms silence`);
      const fullPcm = Buffer.concat(this.pcmChunks);
      // Reset before invoking the callback so a throwing or re-entrant
      // handler cannot leave stale utterance state behind.
      this.isSpeaking = false;
      this.pcmChunks = [];
      this.silenceStart = null;
      this.onSpeechEnd?.(fullPcm);
    }
  }

  /**
   * Parse the RTP header and return the payload that follows it.
   *
   * RTP header format:
   *   Byte 0:     V(2)|P(1)|X(1)|CC(4)
   *   Byte 1:     M(1)|PT(7)
   *   Bytes 2-3:  sequence number
   *   Bytes 4-7:  timestamp
   *   Bytes 8-11: SSRC
   *   Then CC*4 bytes of CSRC
   *   If X bit set: 4 bytes extension header + extension data
   *
   * @param buf Raw RTP packet.
   * @returns A view (not a copy) of the payload bytes, or `null` when the
   *   packet is shorter than its header claims or carries no payload.
   */
  stripRtpHeader(buf: Buffer): Buffer | null {
    if (buf.length < 12) return null;

    const byte0 = buf.readUInt8(0);
    const cc = byte0 & 0x0f; // CSRC count
    const hasExtension = ((byte0 >> 4) & 0x01) === 1; // X bit
    const hasPadding = ((byte0 >> 5) & 0x01) === 1; // P bit

    let offset = 12 + cc * 4; // Fixed header + CSRC list
    if (offset > buf.length) return null;

    if (hasExtension) {
      if (offset + 4 > buf.length) return null;
      // Extension header: 2 bytes profile-specific, 2 bytes length (in 32-bit words)
      const extLength = buf.readUInt16BE(offset + 2);
      offset += 4 + extLength * 4;
    }

    if (offset > buf.length) return null;

    let payloadEnd = buf.length;
    if (hasPadding && buf.length > offset) {
      // The final byte of a padded packet holds the number of padding bytes.
      payloadEnd -= buf.readUInt8(buf.length - 1);
    }

    if (payloadEnd <= offset) return null;

    return buf.subarray(offset, payloadEnd);
  }

  /**
   * Compute root-mean-square energy level.
   * Int16 samples are normalized to the -1..1 range first.
   *
   * @param pcm Signed 16-bit samples.
   * @returns RMS in the range 0..1; 0 for an empty input.
   */
  computeRMS(pcm: Int16Array): number {
    if (pcm.length === 0) return 0;
    let sumSquares = 0;
    for (let i = 0; i < pcm.length; i++) {
      const normalized = pcm[i] / 32768;
      sumSquares += normalized * normalized;
    }
    return Math.sqrt(sumSquares / pcm.length);
  }

  /**
   * Reset VAD state (e.g. when the peer disconnects). Any partially
   * buffered utterance is discarded without invoking `onSpeechEnd`.
   */
  reset(): void {
    this.isSpeaking = false;
    this.pcmChunks = [];
    this.silenceStart = null;
  }

  /**
   * Clean up decoder resources. Safe to call more than once; only the first
   * call releases the decoder.
   */
  destroy(): void {
    this.reset();
    if (this.destroyed) return;
    this.destroyed = true;
    this.decoder.delete();
  }

  /**
   * View decoded PCM bytes as Int16 samples.
   *
   * A typed-array view requires a 2-byte-aligned offset, so a misaligned
   * buffer is copied into fresh storage instead. A trailing odd byte, if
   * any, is ignored.
   */
  private toSamples(pcm: Buffer): Int16Array {
    const sampleCount = Math.floor(pcm.byteLength / 2);
    if (pcm.byteOffset % 2 === 0) {
      return new Int16Array(pcm.buffer, pcm.byteOffset, sampleCount);
    }
    const copy = new Int16Array(sampleCount);
    new Uint8Array(copy.buffer).set(pcm.subarray(0, sampleCount * 2));
    return copy;
  }
}
|