openwakeword-js 0.1.9 → 0.1.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/index.html +2 -2
- package/openwakeword.mjs +242 -0
- package/package.json +4 -3
- package/scripts/download_models.js +8 -0
package/index.html
CHANGED
|
@@ -295,9 +295,9 @@
|
|
|
295
295
|
</div>
|
|
296
296
|
|
|
297
297
|
<!-- Scripts -->
|
|
298
|
-
<script src="
|
|
298
|
+
<script src="./models/ort.min.js"></script>
|
|
299
299
|
<script type="module">
|
|
300
|
-
import { Model } from '
|
|
300
|
+
import { Model } from './openwakeword.mjs';
|
|
301
301
|
|
|
302
302
|
const state = {
|
|
303
303
|
isListening: false,
|
package/openwakeword.mjs
ADDED
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
// src/index.ts
|
|
2
|
+
import * as ort from "onnxruntime-web";
|
|
3
|
+
/**
 * Streaming wake-word detector built on ONNX Runtime inference sessions.
 *
 * Audio is consumed in fixed chunks of `CHUNK_SIZE` samples. Each chunk is
 * turned into mel-spectrogram frames, the most recent `MEL_WINDOW_SIZE` frames
 * are turned into an embedding, and every wake-word classifier scores the last
 * `EMBEDDING_WINDOW_SIZE` embeddings.
 *
 * Options read by this class:
 * - `melspectrogramModelPath`, `embeddingModelPath`: passed to `InferenceSession.create`.
 * - `wakewordModels`: iterable of classifier model paths.
 * - `vadModelPath`, `vadThreshold`: optional; VAD is only loaded when the threshold is > 0.
 * - `thresholds`, `patience`: optional per-model maps keyed by the extracted model name.
 * - `debounceTime`: optional; divided by the chunk duration (0.08), so presumably seconds.
 * - `wasmPaths`: optional; forwarded to `env.wasm.wasmPaths` of the runtime.
 *
 * NOTE(review): `predict()` mutates shared buffers across `await` points, so
 * overlapping calls on one instance will interleave; callers should await each call.
 */
class Model {
  /**
   * @param {object} options - Configuration described on the class.
   */
  constructor(options) {
    this.options = options;
    // Prefer the imported namespace; fall back to a script-tag global build.
    const runtime = ort.env ? ort : globalThis.ort;
    if (runtime && options.wasmPaths) {
      runtime.env.wasm.wasmPaths = options.wasmPaths;
    }
    // Every session and tensor below goes through the same resolved runtime,
    // so the fallback chosen here is actually honoured.
    this.#runtime = runtime ?? ort;
    this.melContextBuffer = new Float32Array(this.MEL_CONTEXT);
  }

  /** ONNX Runtime namespace resolved in the constructor. */
  #runtime = null;

  melSession = null;
  embeddingSession = null;
  vadSession = null;
  customSessions = /* @__PURE__ */ new Map();

  // Buffers
  melBuffer = [];
  embeddingBuffers = /* @__PURE__ */ new Map();
  predictionBuffers = /* @__PURE__ */ new Map();
  vadBuffer = [];
  rawAudioRemainder = new Float32Array(0);
  melContextBuffer;

  // Seeding history
  noiseSeededEmbeddings = /* @__PURE__ */ new Map();

  // Constants
  CHUNK_SIZE = 1280;
  MEL_CONTEXT = 480;
  SAMPLE_RATE = 16e3;
  MEL_BINS = 32;
  FRAMES_PER_CHUNK = 8;
  MEL_WINDOW_SIZE = 76;
  EMBEDDING_WINDOW_SIZE = 24;
  MAX_MEL_FRAMES = 970;
  INITIAL_FRAMES_SUPPRESS = 5;
  PREDICTION_BUFFER_MAX = 30;
  EMBEDDING_DIM = 96;
  VAD_BUFFER_MAX = 30;

  // VAD State (Silero VAD)
  vadStateH = new Float32Array(2 * 1 * 64).fill(0);
  vadStateC = new Float32Array(2 * 1 * 64).fill(0);
  isLoaded = false;

  /**
   * Loads all sessions and seeds the buffers by pushing four seconds of
   * random noise through the mel and embedding models.
   *
   * @returns {Promise<void>}
   * @throws Rethrows any error raised while creating or running a session.
   */
  async init() {
    // predict() must not run against half-replaced sessions during a (re)load.
    this.isLoaded = false;
    try {
      const { InferenceSession } = this.#runtime;
      this.melSession = await InferenceSession.create(this.options.melspectrogramModelPath);
      this.embeddingSession = await InferenceSession.create(this.options.embeddingModelPath);
      if (this.options.vadModelPath && this.options.vadThreshold > 0) {
        this.vadSession = await InferenceSession.create(this.options.vadModelPath);
      }

      this.melBuffer = this.#createPrimedMelBuffer();
      const warmNoise = new Float32Array(this.SAMPLE_RATE * 4);
      for (let i = 0; i < warmNoise.length; i++) warmNoise[i] = Math.random() * 2e3 - 1e3;

      // The warm-up uses its own context so the live melContextBuffer stays untouched.
      const warmContext = new Float32Array(this.MEL_CONTEXT);
      const generatedEmbeddings = [];
      for (let i = 0; i + this.CHUNK_SIZE <= warmNoise.length; i += this.CHUNK_SIZE) {
        const chunk = warmNoise.subarray(i, i + this.CHUNK_SIZE);
        const melInput = this.#buildMelInput(warmContext, chunk);
        this.#appendMelFrames(await this.runMelSpectrogram(melInput));
        generatedEmbeddings.push(await this.runEmbeddingModel());
      }

      for (const modelPath of this.options.wakewordModels) {
        const session = await InferenceSession.create(modelPath);
        const name = this.extractModelName(modelPath);
        this.customSessions.set(name, session);
        // Independent copies: one kept pristine for reset(), one used live.
        const history = generatedEmbeddings
          .slice(-this.EMBEDDING_WINDOW_SIZE)
          .map((e) => new Float32Array(e));
        this.noiseSeededEmbeddings.set(name, history);
        this.embeddingBuffers.set(name, history.map((e) => new Float32Array(e)));
        this.predictionBuffers.set(name, []);
      }

      this.isLoaded = true;
      console.log("OpenWakeWord models loaded and bit-perfectly aligned");
    } catch (error) {
      console.error("Failed to initialize OpenWakeWord models:", error);
      throw error;
    }
  }

  /**
   * Feeds audio into the detector and returns, per model name, the highest
   * post-processed score among the chunks completed by this call. Samples
   * that do not fill a whole chunk are kept for the next call.
   *
   * @param {Int16Array|Float32Array|ArrayLike<number>} audio - Input samples.
   * @returns {Promise<Object<string, number>>} Score per model name (0 when no chunk completed).
   * @throws {Error} "Model not initialized" when called before a successful init().
   */
  async predict(audio) {
    if (!this.isLoaded) throw new Error("Model not initialized");

    const pcmAudio = this.#toPcmScale(audio);
    const combinedAudio = new Float32Array(this.rawAudioRemainder.length + pcmAudio.length);
    combinedAudio.set(this.rawAudioRemainder);
    combinedAudio.set(pcmAudio, this.rawAudioRemainder.length);

    const scores = {};
    for (const name of this.customSessions.keys()) scores[name] = 0;

    const vadEnabled = Boolean(this.vadSession && this.options.vadThreshold);
    let offset = 0;
    while (offset + this.CHUNK_SIZE <= combinedAudio.length) {
      const chunk = combinedAudio.subarray(offset, offset + this.CHUNK_SIZE);
      offset += this.CHUNK_SIZE;
      const melInput = this.#buildMelInput(this.melContextBuffer, chunk);

      if (vadEnabled) {
        this.vadBuffer.push(await this.runVAD(chunk));
        while (this.vadBuffer.length > this.VAD_BUFFER_MAX) this.vadBuffer.shift();
      }

      this.#appendMelFrames(await this.runMelSpectrogram(melInput));
      const embedding = await this.runEmbeddingModel();

      for (const [name, session] of this.customSessions) {
        const embBuf = this.embeddingBuffers.get(name);
        embBuf.shift();
        embBuf.push(embedding);

        let score = await this.runClassifier(name, session);
        if (vadEnabled) {
          // Gate on VAD scores from a few chunks back rather than the newest ones.
          const recentVad = this.vadBuffer.slice(-7, -4);
          const maxVAD = recentVad.length > 0 ? Math.max(...recentVad) : 0;
          if (maxVAD < this.options.vadThreshold) score = 0;
        }

        // The buffer records the VAD-gated score, before patience/debounce.
        const predBuf = this.predictionBuffers.get(name);
        predBuf.push(score);
        while (predBuf.length > this.PREDICTION_BUFFER_MAX) predBuf.shift();

        score = this.#applyActivationRules(name, score, predBuf);
        scores[name] = Math.max(scores[name], score);
      }
    }

    this.rawAudioRemainder = combinedAudio.slice(offset);
    return scores;
  }

  /**
   * Runs the mel-spectrogram session on one input window.
   *
   * @param {Float32Array} input - Context samples followed by chunk samples.
   * @returns {Promise<ArrayLike<number>>} Raw data of the session's first output.
   */
  async runMelSpectrogram(input) {
    const inputTensor = new this.#runtime.Tensor("float32", input, [1, input.length]);
    const results = await this.melSession.run({ [this.melSession.inputNames[0]]: inputTensor });
    return results[this.melSession.outputNames[0]].data;
  }

  /**
   * Runs the embedding session on the newest MEL_WINDOW_SIZE mel frames.
   *
   * @returns {Promise<Float32Array>} EMBEDDING_DIM values; missing or non-finite outputs become 0.
   */
  async runEmbeddingModel() {
    const windowData = new Float32Array(this.MEL_WINDOW_SIZE * this.MEL_BINS);
    const startIdx = this.melBuffer.length - this.MEL_WINDOW_SIZE;
    for (let t = 0; t < this.MEL_WINDOW_SIZE; t++) {
      windowData.set(this.melBuffer[startIdx + t], t * this.MEL_BINS);
    }
    const windowTensor = new this.#runtime.Tensor(
      "float32",
      windowData,
      [1, this.MEL_WINDOW_SIZE, this.MEL_BINS, 1],
    );
    const results = await this.embeddingSession.run({
      [this.embeddingSession.inputNames[0]]: windowTensor,
    });
    const output = results[this.embeddingSession.outputNames[0]].data;

    const embedding = new Float32Array(this.EMBEDDING_DIM);
    for (let i = 0; i < this.EMBEDDING_DIM; i++) {
      const v = output[i] ?? 0;
      // Number.isFinite rejects NaN and +/-Infinity without coercion.
      embedding[i] = Number.isFinite(v) ? v : 0;
    }
    return embedding;
  }

  /**
   * Scores the current embedding history of one wake-word model.
   *
   * @param {string} name - Model name used as key into embeddingBuffers.
   * @param {object} session - Classifier inference session.
   * @returns {Promise<number>} First value of the session's first output.
   */
  async runClassifier(name, session) {
    const embBuf = this.embeddingBuffers.get(name);
    const predData = new Float32Array(this.EMBEDDING_WINDOW_SIZE * this.EMBEDDING_DIM);
    for (let t = 0; t < this.EMBEDDING_WINDOW_SIZE; t++) {
      predData.set(embBuf[t], t * this.EMBEDDING_DIM);
    }
    const predTensor = new this.#runtime.Tensor(
      "float32",
      predData,
      [1, this.EMBEDDING_WINDOW_SIZE, this.EMBEDDING_DIM],
    );
    const results = await session.run({ [session.inputNames[0]]: predTensor });
    return results[session.outputNames[0]].data[0];
  }

  /**
   * Runs the VAD session on one chunk and carries its recurrent state forward.
   *
   * @param {Float32Array} chunk - Samples on the 16-bit scale; divided by 32768 before inference.
   * @returns {Promise<number>} First value of the session's first output.
   */
  async runVAD(chunk) {
    const { Tensor } = this.#runtime;
    const normalized = new Float32Array(chunk.length);
    for (let i = 0; i < chunk.length; i++) normalized[i] = chunk[i] / 32768;

    const srTensor = new Tensor("int64", BigInt64Array.from([BigInt(this.SAMPLE_RATE)]), [1]);
    const hTensor = new Tensor("float32", this.vadStateH, [2, 1, 64]);
    const cTensor = new Tensor("float32", this.vadStateC, [2, 1, 64]);
    const inputTensor = new Tensor("float32", normalized, [1, chunk.length]);

    // Inputs are bound positionally: audio, sample rate, h state, c state.
    const { inputNames, outputNames } = this.vadSession;
    const feeds = {
      [inputNames[0]]: inputTensor,
      [inputNames[1]]: srTensor,
      [inputNames[2]]: hTensor,
      [inputNames[3]]: cTensor,
    };
    const results = await this.vadSession.run(feeds);
    this.vadStateH = results[outputNames[1]].data;
    this.vadStateC = results[outputNames[2]].data;
    return results[outputNames[0]].data[0];
  }

  /**
   * Derives the model name (the key used in `thresholds`, `patience` and the
   * returned scores) from a model path or URL.
   *
   * Takes the last `/`- or `\`-separated segment, ignoring any `?query` or
   * `#fragment`, and strips one trailing `.onnx` or `.tflite` extension.
   *
   * @param {string} path - Model path or URL.
   * @returns {string} Model name.
   */
  extractModelName(path) {
    const withoutQuery = path.replace(/[?#].*$/, "");
    const base = withoutQuery.split(/[\\/]/).pop() || path;
    // Anchor at the end so an extension-like substring inside the name survives.
    return base.replace(/\.(?:onnx|tflite)$/, "");
  }

  /**
   * Clears all streaming state and restores each model's embedding history to
   * the noise-seeded copy captured during init() (zeros if none exists).
   */
  reset() {
    this.melBuffer = this.#createPrimedMelBuffer();
    this.rawAudioRemainder = new Float32Array(0);
    this.melContextBuffer.fill(0);
    this.vadBuffer = [];
    this.vadStateH.fill(0);
    this.vadStateC.fill(0);
    for (const name of this.embeddingBuffers.keys()) {
      this.predictionBuffers.set(name, []);
      const seeded = this.noiseSeededEmbeddings.get(name);
      if (seeded) {
        this.embeddingBuffers.set(name, seeded.map((e) => new Float32Array(e)));
      } else {
        this.embeddingBuffers.set(
          name,
          Array.from({ length: this.EMBEDDING_WINDOW_SIZE }, () => new Float32Array(this.EMBEDDING_DIM)),
        );
      }
    }
  }

  /** Builds a mel buffer of MEL_WINDOW_SIZE frames, every bin set to 1. */
  #createPrimedMelBuffer() {
    return Array.from({ length: this.MEL_WINDOW_SIZE }, () => new Float32Array(this.MEL_BINS).fill(1));
  }

  /**
   * Concatenates `context` and `chunk` into one mel input, then overwrites
   * `context` in place with the last MEL_CONTEXT samples of `chunk`.
   */
  #buildMelInput(context, chunk) {
    const melInput = new Float32Array(this.CHUNK_SIZE + this.MEL_CONTEXT);
    melInput.set(context);
    melInput.set(chunk, this.MEL_CONTEXT);
    context.set(chunk.subarray(this.CHUNK_SIZE - this.MEL_CONTEXT));
    return melInput;
  }

  /**
   * Appends FRAMES_PER_CHUNK rescaled frames (value / 10 + 2) from a mel
   * output to melBuffer and trims the buffer to MAX_MEL_FRAMES.
   */
  #appendMelFrames(melOutput) {
    for (let f = 0; f < this.FRAMES_PER_CHUNK; f++) {
      const frame = new Float32Array(this.MEL_BINS);
      for (let b = 0; b < this.MEL_BINS; b++) {
        frame[b] = melOutput[f * this.MEL_BINS + b] / 10 + 2;
      }
      this.melBuffer.push(frame);
    }
    while (this.melBuffer.length > this.MAX_MEL_FRAMES) this.melBuffer.shift();
  }

  /**
   * Converts input audio to float samples on the 16-bit scale.
   *
   * Int16Array input is copied as-is. Other input is multiplied by 32768 when
   * the peak of its first 1000 samples is <= 1, otherwise returned unchanged.
   * NOTE(review): this is a heuristic on the leading samples only; audio
   * already on the 16-bit scale that starts with near-silence gets rescaled.
   */
  #toPcmScale(audio) {
    if (audio instanceof Int16Array) return Float32Array.from(audio);

    const probeLength = Math.min(audio.length, 1e3);
    let peak = 0;
    for (let i = 0; i < probeLength; i++) {
      const magnitude = Math.abs(audio[i]);
      if (magnitude > peak) peak = magnitude;
    }
    if (peak > 1) return audio;

    const scaled = new Float32Array(audio.length);
    for (let i = 0; i < audio.length; i++) scaled[i] = audio[i] * 32768;
    return scaled;
  }

  /**
   * Applies start-up suppression, then patience or (if no patience is set for
   * this model) debounce, to a score already pushed onto `predBuf`.
   *
   * @returns {number} The score, or 0 when a rule suppresses it.
   */
  #applyActivationRules(name, score, predBuf) {
    if (predBuf.length < this.INITIAL_FRAMES_SUPPRESS) return 0;

    const patience = this.options.patience?.[name];
    const debounceTime = this.options.debounceTime;
    const threshold = this.options.thresholds?.[name] ?? 0.5;

    if (patience) {
      // Require `patience` consecutive buffered scores at or above threshold.
      const hits = predBuf.slice(-patience).filter((s) => s >= threshold).length;
      return hits < patience ? 0 : score;
    }

    if (debounceTime > 0) {
      // One prediction is produced per chunk, i.e. every CHUNK_SIZE / SAMPLE_RATE = 0.08.
      const framesToWait = Math.ceil(debounceTime / (this.CHUNK_SIZE / this.SAMPLE_RATE));
      const earlier = predBuf.slice(-framesToWait - 1, -1);
      if (score >= threshold && earlier.some((s) => s >= threshold)) return 0;
    }
    return score;
  }
}
|
|
240
|
+
export {
|
|
241
|
+
Model
|
|
242
|
+
};
|
package/package.json
CHANGED
|
@@ -1,18 +1,18 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "openwakeword-js",
|
|
3
|
-
"version": "0.1.
|
|
3
|
+
"version": "0.1.10",
|
|
4
4
|
"description": "Port of openWakeWord to JavaScript/TypeScript using ONNX Runtime",
|
|
5
5
|
"bin": {
|
|
6
6
|
"openwakeword-js-setup": "scripts/download_models.js"
|
|
7
7
|
},
|
|
8
8
|
"main": "dist/index.js",
|
|
9
|
-
"module": "dist/index.
|
|
9
|
+
"module": "dist/index.js",
|
|
10
10
|
"types": "dist/index.d.ts",
|
|
11
11
|
"type": "module",
|
|
12
12
|
"exports": {
|
|
13
13
|
".": {
|
|
14
14
|
"types": "./dist/index.d.ts",
|
|
15
|
-
"import": "./dist/index.
|
|
15
|
+
"import": "./dist/index.js",
|
|
16
16
|
"require": "./dist/index.cjs"
|
|
17
17
|
}
|
|
18
18
|
},
|
|
@@ -21,6 +21,7 @@
|
|
|
21
21
|
"src",
|
|
22
22
|
"scripts",
|
|
23
23
|
"index.html",
|
|
24
|
+
"openwakeword.mjs",
|
|
24
25
|
"models/hello_deepa.onnx",
|
|
25
26
|
"models/namaste_deepa.onnx",
|
|
26
27
|
"README.md",
|
|
package/scripts/download_models.js
CHANGED
|
@@ -72,10 +72,14 @@ async function main() {
|
|
|
72
72
|
const nodeModulesPath = path.join(process.cwd(), 'node_modules', 'onnxruntime-web', 'dist');
|
|
73
73
|
|
|
74
74
|
if (fs.existsSync(nodeModulesPath)) {
|
|
75
|
+
// Copy WASM files
|
|
75
76
|
const wasmFiles = fs.readdirSync(nodeModulesPath).filter(f => f.endsWith('.wasm'));
|
|
76
77
|
for (const file of wasmFiles) {
|
|
77
78
|
copyIfExists(path.join(nodeModulesPath, file), path.join(MODELS_DIR, file), 'WASM');
|
|
78
79
|
}
|
|
80
|
+
// Copy ORT Script
|
|
81
|
+
const ortPath = path.join(nodeModulesPath, 'ort.min.js');
|
|
82
|
+
copyIfExists(ortPath, path.join(MODELS_DIR, 'ort.min.js'), 'ORT Script');
|
|
79
83
|
} else {
|
|
80
84
|
console.log('Warning: onnxruntime-web not found. Ensuring high-speed browser execution requires "npm install".');
|
|
81
85
|
}
|
|
@@ -85,6 +89,10 @@ async function main() {
|
|
|
85
89
|
const destHtml = path.join(process.cwd(), 'index.html');
|
|
86
90
|
copyIfExists(exampleHtml, destHtml, 'UI');
|
|
87
91
|
|
|
92
|
+
const libSrc = path.join(packageRoot, 'dist', 'index.js');
|
|
93
|
+
const libDest = path.join(process.cwd(), 'openwakeword.mjs');
|
|
94
|
+
copyIfExists(libSrc, libDest, 'Library');
|
|
95
|
+
|
|
88
96
|
console.log('\n----------------------------------------------------');
|
|
89
97
|
console.log('SETUP COMPLETE');
|
|
90
98
|
console.log('----------------------------------------------------');
|