@utterance/core 0.0.1
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/LICENSE +21 -0
- package/README.md +92 -0
- package/dist/index.cjs +381 -0
- package/dist/index.d.cts +92 -0
- package/dist/index.d.ts +92 -0
- package/dist/index.js +354 -0
- package/models/.gitkeep +0 -0
- package/package.json +93 -0
package/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2026 Nizh

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
package/README.md
ADDED
@@ -0,0 +1,92 @@
<p align="center">
  <h1 align="center">Utterance</h1>
  <p align="center"><strong>Client-side semantic endpointing. Know when they're done talking.</strong></p>
  <p align="center">
    <a href="https://utterance.dev">Documentation</a> •
    <a href="https://utterance.dev/demo">Live Demo</a> •
    <a href="https://discord.gg/kb4zMHNtEV">Discord</a> •
    <a href="https://github.com/nizh0/Utterance">GitHub</a>
  </p>
</p>

---

## The Problem

Every voice app faces the same annoying problem: **it can't tell when you're done talking.**

You pause to think, and it cuts you off. You take a breath, and it responds too soon. You want to interrupt, and it keeps going.

The current solutions either:

- **Detect silence** (Silero VAD, ricky0123/vad): They know when sound stops, but they can't tell if you're thinking or finished.
- **Use server-side AI** (OpenAI Realtime, AssemblyAI): They are smart, but they add delay, costs, and privacy issues.

**Utterance is different.** It uses a lightweight ML model entirely on the client side. It recognizes the difference between a thinking pause and a completed turn. No cloud. No delay. No per-minute fees.

## Quick Start

```bash
npm install @utterance/core
```

```javascript
import { Utterance } from "@utterance/core";

const detector = new Utterance();

detector.on("turnEnd", (result) => {
  console.log("User is done speaking", result.confidence);
});

detector.on("pause", (result) => {
  console.log("User is thinking...", result.duration);
});

detector.on("interrupt", () => {
  console.log("User wants to speak — stop AI response");
});

await detector.start();
```

See the [full documentation](https://utterance.dev/docs/quick-start) for detailed usage, API reference, and integration examples.
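The constructor also accepts tuning options. The sketch below is inferred from the option names, defaults, and methods declared in the shipped `dist/index.d.ts` (`sensitivity`, `pauseTolerance`, `sampleRate`, `stop()`, `isListening()`); treat it as illustrative rather than separately documented behavior.

```javascript
import { Utterance } from "@utterance/core";

// Values shown are the declared defaults; all fields are optional.
const detector = new Utterance({
  sensitivity: 0.5,      // 0-1, higher = more sensitive to pauses
  pauseTolerance: 1500,  // ms a "thinking" pause may last before turnEnd fires
  sampleRate: 16000,     // Hz
});

detector.on("turnEnd", (result) => {
  console.log("Turn finished", result.confidence, result.duration);
});

await detector.start();

// Later: release the microphone, model, and detector state.
detector.stop();
console.log(detector.isListening()); // false
```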

## Comparison

| Feature | Silero VAD | ricky0123/vad | Picovoice Cobra | OpenAI Realtime | **Utterance** |
| --- | --- | --- | --- | --- | --- |
| Detects speech vs. silence | ✅ | ✅ | ✅ | ✅ | ✅ |
| Semantic pause detection | ❌ | ❌ | ❌ | ✅ | ✅ |
| Interrupt detection | ❌ | ❌ | ❌ | ✅ | ✅ |
| Runs client-side | ✅ | ✅ | ✅ | ❌ | ✅ |
| No API costs | ✅ | ✅ | ❌ | ❌ | ✅ |
| Privacy (audio stays local) | ✅ | ✅ | ✅ | ❌ | ✅ |

## Contributing

We're building Utterance in the open, and contributions are welcome.

```bash
git clone https://github.com/nizh0/Utterance.git
cd Utterance
npm install
npm start
```

See the [contributing guide](https://utterance.dev/docs/contributing) for development workflow, project structure, and areas where we need help.

## Community

- [Discord](https://discord.gg/kb4zMHNtEV): Chat with contributors
- [GitHub Issues](https://github.com/nizh0/Utterance/issues): Bug reports & feature requests

## License

MIT © [Utterance](https://utterance.dev)

---

<p align="center">
  <strong>"Five pharmacies on one road. But this one actually knows when you're done talking."</strong>
</p>
package/dist/index.cjs
ADDED
@@ -0,0 +1,381 @@
"use strict";
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __export = (target, all) => {
  for (var name in all)
    __defProp(target, name, { get: all[name], enumerable: true });
};
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);

// src/index.ts
var index_exports = {};
__export(index_exports, {
  Utterance: () => Utterance
});
module.exports = __toCommonJS(index_exports);

// src/audio/capture.ts
var AudioCapture = class {
  context = null;
  stream = null;
  processor = null;
  callback = null;
  sampleRate;
  constructor(sampleRate = 16e3) {
    this.sampleRate = sampleRate;
  }
  onAudioData(callback) {
    this.callback = callback;
  }
  async start() {
    this.stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        sampleRate: this.sampleRate,
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true
      }
    });
    this.context = new AudioContext({ sampleRate: this.sampleRate });
    const source = this.context.createMediaStreamSource(this.stream);
    const bufferSize = 4096;
    this.processor = this.context.createScriptProcessor(bufferSize, 1, 1);
    this.processor.onaudioprocess = (event) => {
      const input = event.inputBuffer.getChannelData(0);
      this.callback?.(new Float32Array(input));
    };
    source.connect(this.processor);
    this.processor.connect(this.context.destination);
  }
  stop() {
    this.processor?.disconnect();
    this.stream?.getTracks().forEach((track) => track.stop());
    void this.context?.close();
    this.processor = null;
    this.stream = null;
    this.context = null;
  }
};

// src/features/extractor.ts
var FeatureExtractor = class {
  sampleRate;
  constructor(sampleRate = 16e3) {
    this.sampleRate = sampleRate;
  }
  /**
   * Extract all features from a single audio frame.
   */
  extract(frame) {
    return {
      mfcc: this.computeMFCC(frame),
      energy: this.computeEnergy(frame),
      pitch: this.estimatePitch(frame),
      speechRate: this.estimateSpeechRate(frame),
      pauseDuration: 0
      // tracked by the detector over time
    };
  }
  /**
   * Compute Mel-Frequency Cepstral Coefficients.
   *
   * TODO: Implement full MFCC pipeline:
   * 1. Pre-emphasis filter
   * 2. Windowing (Hamming)
   * 3. FFT
   * 4. Mel filterbank
   * 5. Log energy
   * 6. DCT
   */
  computeMFCC(_frame) {
    return new Float32Array(13);
  }
  /**
   * Compute RMS energy of the frame.
   */
  computeEnergy(frame) {
    let sum = 0;
    for (let i = 0; i < frame.length; i++) {
      sum += frame[i] * frame[i];
    }
    return Math.sqrt(sum / frame.length);
  }
  /**
   * Estimate fundamental frequency (pitch) using autocorrelation.
   *
   * TODO: Implement YIN or autocorrelation-based pitch detection.
   */
  estimatePitch(_frame) {
    void this.sampleRate;
    return 0;
  }
  /**
   * Estimate speech rate (syllables per second).
   *
   * TODO: Implement energy-envelope peak counting.
   */
  estimateSpeechRate(_frame) {
    return 0;
  }
};

// src/model/energy-vad.ts
var EnergyVAD = class {
  speechThreshold;
  silenceThreshold;
  isSpeaking = false;
  silenceStart = 0;
  pauseHintMs;
  constructor(sensitivity = 0.5) {
    this.speechThreshold = 0.015 * (1 - sensitivity * 0.8);
    this.silenceThreshold = this.speechThreshold * 0.6;
    this.pauseHintMs = 800;
  }
  classify(features) {
    const { energy } = features;
    const now = Date.now();
    if (!this.isSpeaking && energy >= this.speechThreshold) {
      this.isSpeaking = true;
      this.silenceStart = 0;
      return { label: "speaking", confidence: this.energyToConfidence(energy), timestamp: now };
    }
    if (this.isSpeaking && energy >= this.silenceThreshold) {
      this.silenceStart = 0;
      return { label: "speaking", confidence: this.energyToConfidence(energy), timestamp: now };
    }
    if (this.isSpeaking && energy < this.silenceThreshold) {
      if (this.silenceStart === 0) {
        this.silenceStart = now;
      }
      const silenceDuration = now - this.silenceStart;
      if (silenceDuration >= this.pauseHintMs) {
        this.isSpeaking = false;
        const confidence = Math.min(silenceDuration / (this.pauseHintMs * 2), 1);
        return { label: "turn_complete", confidence, timestamp: now };
      }
      return { label: "thinking_pause", confidence: 0.6, timestamp: now };
    }
    return { label: "thinking_pause", confidence: 0.3, timestamp: now };
  }
  reset() {
    this.isSpeaking = false;
    this.silenceStart = 0;
  }
  energyToConfidence(energy) {
    return Math.min(energy / (this.speechThreshold * 4), 1);
  }
};

// src/model/onnx.ts
var ONNXModel = class {
  session = null;
  fallback;
  constructor(sensitivity = 0.5) {
    this.fallback = new EnergyVAD(sensitivity);
  }
  /**
   * Load the ONNX model from a given path or URL.
   *
   * TODO:
   * 1. Import onnxruntime-web InferenceSession
   * 2. Load model bytes
   * 3. Create session with appropriate execution providers
   */
  async load(_path) {
    this.session = null;
  }
  /**
   * Run inference on a set of extracted features.
   *
   * TODO:
   * 1. Build input tensor from AudioFeatures
   * 2. Run session.run()
   * 3. Parse output into ClassificationResult
   */
  async predict(features) {
    if (!this.session) {
      return this.fallback.classify(features);
    }
    return this.fallback.classify(features);
  }
  /**
   * Release model resources.
   */
  dispose() {
    this.session = null;
    this.fallback.reset();
  }
};

// src/detector/turn-detector.ts
var TurnDetector = class {
  listeners = /* @__PURE__ */ new Map();
  state = "idle";
  pauseStart = 0;
  speakStart = 0;
  sensitivity;
  pauseTolerance;
  constructor(sensitivity = 0.5, pauseTolerance = 1500) {
    this.sensitivity = sensitivity;
    this.pauseTolerance = pauseTolerance;
  }
  /**
   * Register an event listener.
   */
  on(event, listener) {
    if (!this.listeners.has(event)) {
      this.listeners.set(event, /* @__PURE__ */ new Set());
    }
    this.listeners.get(event).add(listener);
  }
  /**
   * Remove an event listener.
   */
  off(event, listener) {
    this.listeners.get(event)?.delete(listener);
  }
  /**
   * Process a classification result from the model and emit events.
   */
  process(result) {
    const { label, confidence, timestamp } = result;
    const threshold = this.sensitivity;
    switch (label) {
      case "speaking":
        if (this.state !== "speaking") {
          this.state = "speaking";
          this.speakStart = timestamp;
          this.emit("speechStart", { timestamp });
        }
        break;
      case "thinking_pause":
        if (this.state === "speaking" && confidence >= threshold) {
          this.state = "paused";
          this.pauseStart = timestamp;
          this.emit("pause", {
            duration: 0,
            confidence
          });
        } else if (this.state === "paused") {
          const duration = timestamp - this.pauseStart;
          if (duration >= this.pauseTolerance) {
            this.state = "idle";
            this.emit("turnEnd", {
              confidence,
              duration: timestamp - this.speakStart
            });
          }
        }
        break;
      case "turn_complete":
        if ((this.state === "speaking" || this.state === "paused") && confidence >= threshold) {
          this.state = "idle";
          this.emit("turnEnd", {
            confidence,
            duration: timestamp - this.speakStart
          });
        }
        break;
      case "interrupt_intent":
        if (confidence >= threshold) {
          this.emit("interrupt", { timestamp });
        }
        break;
    }
  }
  /**
   * Reset internal state.
   */
  reset() {
    this.state = "idle";
    this.pauseStart = 0;
    this.speakStart = 0;
  }
  emit(event, payload) {
    this.listeners.get(event)?.forEach((fn) => fn(payload));
  }
};

// src/types.ts
var DEFAULT_OPTIONS = {
  sensitivity: 0.5,
  pauseTolerance: 1500,
  modelPath: "bundled",
  sampleRate: 16e3
};

// src/utterance.ts
var Utterance = class {
  options;
  audio;
  features;
  model;
  detector;
  listening = false;
  constructor(options = {}) {
    this.options = { ...DEFAULT_OPTIONS, ...options };
    this.audio = new AudioCapture(this.options.sampleRate);
    this.features = new FeatureExtractor(this.options.sampleRate);
    this.model = new ONNXModel(this.options.sensitivity);
    this.detector = new TurnDetector(
      this.options.sensitivity,
      this.options.pauseTolerance
    );
  }
  /**
   * Register an event listener.
   */
  on(event, listener) {
    this.detector.on(event, listener);
  }
  /**
   * Remove an event listener.
   */
  off(event, listener) {
    this.detector.off(event, listener);
  }
  /**
   * Start listening to the microphone and detecting turns.
   */
  async start() {
    if (this.listening) return;
    await this.model.load(this.options.modelPath);
    this.audio.onAudioData(async (frame) => {
      const extracted = this.features.extract(frame);
      const result = await this.model.predict(extracted);
      this.detector.process(result);
    });
    await this.audio.start();
    this.listening = true;
  }
  /**
   * Stop listening and release all resources.
   */
  stop() {
    if (!this.listening) return;
    this.audio.stop();
    this.model.dispose();
    this.detector.reset();
    this.listening = false;
  }
  /**
   * Returns whether the detector is currently listening.
   */
  isListening() {
    return this.listening;
  }
};
// Annotate the CommonJS export names for ESM import in node:
0 && (module.exports = {
  Utterance
});
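The ONNXModel TODOs above outline the intended inference path but this release ships no model and always falls back to EnergyVAD. For illustration only, here is one way those steps could look using the onnxruntime-web dependency already declared in package.json; the model URL, input/output tensor names, shapes, and label order are assumptions, not part of the published package.

```javascript
import * as ort from "onnxruntime-web";

// Hypothetical fill-in for ONNXModel.load()/predict(). The input name
// "features", output name "probs", and the label order below are assumed.
async function loadSession(modelUrl) {
  // Creates a WASM-backed inference session from a model URL or path.
  return ort.InferenceSession.create(modelUrl, { executionProviders: ["wasm"] });
}

async function predictWithSession(session, features) {
  // Flatten the extracted AudioFeatures into a single float vector.
  const input = Float32Array.from([
    ...features.mfcc,
    features.energy,
    features.pitch,
    features.speechRate,
    features.pauseDuration
  ]);
  const feeds = { features: new ort.Tensor("float32", input, [1, input.length]) };
  const outputs = await session.run(feeds);
  const probs = outputs.probs.data; // one probability per label, order assumed
  const labels = ["speaking", "thinking_pause", "turn_complete", "interrupt_intent"];
  let best = 0;
  for (let i = 1; i < probs.length; i++) {
    if (probs[i] > probs[best]) best = i;
  }
  return { label: labels[best], confidence: probs[best], timestamp: Date.now() };
}
```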
package/dist/index.d.cts
ADDED
@@ -0,0 +1,92 @@
/**
 * Core types for the Utterance SDK.
 *
 * All shared interfaces and type definitions live here to keep
 * the codebase scalable and avoid circular dependencies.
 */
interface UtteranceOptions {
  /** Detection sensitivity (0-1). Higher = more sensitive to pauses. Default: 0.5 */
  sensitivity?: number;
  /** Max thinking pause duration (ms) before triggering turnEnd. Default: 1500 */
  pauseTolerance?: number;
  /** Path to a custom ONNX model. Default: bundled model */
  modelPath?: string;
  /** Audio sample rate in Hz. Default: 16000 */
  sampleRate?: number;
}
type ClassificationLabel = "speaking" | "thinking_pause" | "turn_complete" | "interrupt_intent";
interface ClassificationResult {
  label: ClassificationLabel;
  confidence: number;
  timestamp: number;
}
interface SpeechStartEvent {
  timestamp: number;
}
interface PauseEvent {
  duration: number;
  confidence: number;
}
interface TurnEndEvent {
  confidence: number;
  duration: number;
}
interface InterruptEvent {
  timestamp: number;
}
interface UtteranceEventMap {
  speechStart: SpeechStartEvent;
  pause: PauseEvent;
  turnEnd: TurnEndEvent;
  interrupt: InterruptEvent;
}
type UtteranceEvent = keyof UtteranceEventMap;
interface AudioFeatures {
  mfcc: Float32Array;
  energy: number;
  pitch: number;
  speechRate: number;
  pauseDuration: number;
}

/**
 * Main entry point for the Utterance SDK.
 *
 * Usage:
 * ```ts
 * const detector = new Utterance({ sensitivity: 0.6 });
 * detector.on("turnEnd", (e) => console.log("Done!", e.confidence));
 * await detector.start();
 * ```
 */
declare class Utterance {
  private readonly options;
  private readonly audio;
  private readonly features;
  private readonly model;
  private readonly detector;
  private listening;
  constructor(options?: UtteranceOptions);
  /**
   * Register an event listener.
   */
  on<E extends UtteranceEvent>(event: E, listener: (payload: UtteranceEventMap[E]) => void): void;
  /**
   * Remove an event listener.
   */
  off<E extends UtteranceEvent>(event: E, listener: (payload: UtteranceEventMap[E]) => void): void;
  /**
   * Start listening to the microphone and detecting turns.
   */
  start(): Promise<void>;
  /**
   * Stop listening and release all resources.
   */
  stop(): void;
  /**
   * Returns whether the detector is currently listening.
   */
  isListening(): boolean;
}

export { type AudioFeatures, type ClassificationLabel, type ClassificationResult, type InterruptEvent, type PauseEvent, type SpeechStartEvent, type TurnEndEvent, Utterance, type UtteranceEvent, type UtteranceEventMap, type UtteranceOptions };
package/dist/index.d.ts
ADDED
@@ -0,0 +1,92 @@
/**
 * Core types for the Utterance SDK.
 *
 * All shared interfaces and type definitions live here to keep
 * the codebase scalable and avoid circular dependencies.
 */
interface UtteranceOptions {
  /** Detection sensitivity (0-1). Higher = more sensitive to pauses. Default: 0.5 */
  sensitivity?: number;
  /** Max thinking pause duration (ms) before triggering turnEnd. Default: 1500 */
  pauseTolerance?: number;
  /** Path to a custom ONNX model. Default: bundled model */
  modelPath?: string;
  /** Audio sample rate in Hz. Default: 16000 */
  sampleRate?: number;
}
type ClassificationLabel = "speaking" | "thinking_pause" | "turn_complete" | "interrupt_intent";
interface ClassificationResult {
  label: ClassificationLabel;
  confidence: number;
  timestamp: number;
}
interface SpeechStartEvent {
  timestamp: number;
}
interface PauseEvent {
  duration: number;
  confidence: number;
}
interface TurnEndEvent {
  confidence: number;
  duration: number;
}
interface InterruptEvent {
  timestamp: number;
}
interface UtteranceEventMap {
  speechStart: SpeechStartEvent;
  pause: PauseEvent;
  turnEnd: TurnEndEvent;
  interrupt: InterruptEvent;
}
type UtteranceEvent = keyof UtteranceEventMap;
interface AudioFeatures {
  mfcc: Float32Array;
  energy: number;
  pitch: number;
  speechRate: number;
  pauseDuration: number;
}

/**
 * Main entry point for the Utterance SDK.
 *
 * Usage:
 * ```ts
 * const detector = new Utterance({ sensitivity: 0.6 });
 * detector.on("turnEnd", (e) => console.log("Done!", e.confidence));
 * await detector.start();
 * ```
 */
declare class Utterance {
  private readonly options;
  private readonly audio;
  private readonly features;
  private readonly model;
  private readonly detector;
  private listening;
  constructor(options?: UtteranceOptions);
  /**
   * Register an event listener.
   */
  on<E extends UtteranceEvent>(event: E, listener: (payload: UtteranceEventMap[E]) => void): void;
  /**
   * Remove an event listener.
   */
  off<E extends UtteranceEvent>(event: E, listener: (payload: UtteranceEventMap[E]) => void): void;
  /**
   * Start listening to the microphone and detecting turns.
   */
  start(): Promise<void>;
  /**
   * Stop listening and release all resources.
   */
  stop(): void;
  /**
   * Returns whether the detector is currently listening.
   */
  isListening(): boolean;
}

export { type AudioFeatures, type ClassificationLabel, type ClassificationResult, type InterruptEvent, type PauseEvent, type SpeechStartEvent, type TurnEndEvent, Utterance, type UtteranceEvent, type UtteranceEventMap, type UtteranceOptions };
package/dist/index.js
ADDED
@@ -0,0 +1,354 @@
// src/audio/capture.ts
var AudioCapture = class {
  context = null;
  stream = null;
  processor = null;
  callback = null;
  sampleRate;
  constructor(sampleRate = 16e3) {
    this.sampleRate = sampleRate;
  }
  onAudioData(callback) {
    this.callback = callback;
  }
  async start() {
    this.stream = await navigator.mediaDevices.getUserMedia({
      audio: {
        sampleRate: this.sampleRate,
        channelCount: 1,
        echoCancellation: true,
        noiseSuppression: true
      }
    });
    this.context = new AudioContext({ sampleRate: this.sampleRate });
    const source = this.context.createMediaStreamSource(this.stream);
    const bufferSize = 4096;
    this.processor = this.context.createScriptProcessor(bufferSize, 1, 1);
    this.processor.onaudioprocess = (event) => {
      const input = event.inputBuffer.getChannelData(0);
      this.callback?.(new Float32Array(input));
    };
    source.connect(this.processor);
    this.processor.connect(this.context.destination);
  }
  stop() {
    this.processor?.disconnect();
    this.stream?.getTracks().forEach((track) => track.stop());
    void this.context?.close();
    this.processor = null;
    this.stream = null;
    this.context = null;
  }
};

// src/features/extractor.ts
var FeatureExtractor = class {
  sampleRate;
  constructor(sampleRate = 16e3) {
    this.sampleRate = sampleRate;
  }
  /**
   * Extract all features from a single audio frame.
   */
  extract(frame) {
    return {
      mfcc: this.computeMFCC(frame),
      energy: this.computeEnergy(frame),
      pitch: this.estimatePitch(frame),
      speechRate: this.estimateSpeechRate(frame),
      pauseDuration: 0
      // tracked by the detector over time
    };
  }
  /**
   * Compute Mel-Frequency Cepstral Coefficients.
   *
   * TODO: Implement full MFCC pipeline:
   * 1. Pre-emphasis filter
   * 2. Windowing (Hamming)
   * 3. FFT
   * 4. Mel filterbank
   * 5. Log energy
   * 6. DCT
   */
  computeMFCC(_frame) {
    return new Float32Array(13);
  }
  /**
   * Compute RMS energy of the frame.
   */
  computeEnergy(frame) {
    let sum = 0;
    for (let i = 0; i < frame.length; i++) {
      sum += frame[i] * frame[i];
    }
    return Math.sqrt(sum / frame.length);
  }
  /**
   * Estimate fundamental frequency (pitch) using autocorrelation.
   *
   * TODO: Implement YIN or autocorrelation-based pitch detection.
   */
  estimatePitch(_frame) {
    void this.sampleRate;
    return 0;
  }
  /**
   * Estimate speech rate (syllables per second).
   *
   * TODO: Implement energy-envelope peak counting.
   */
  estimateSpeechRate(_frame) {
    return 0;
  }
};

// src/model/energy-vad.ts
var EnergyVAD = class {
  speechThreshold;
  silenceThreshold;
  isSpeaking = false;
  silenceStart = 0;
  pauseHintMs;
  constructor(sensitivity = 0.5) {
    this.speechThreshold = 0.015 * (1 - sensitivity * 0.8);
    this.silenceThreshold = this.speechThreshold * 0.6;
    this.pauseHintMs = 800;
  }
  classify(features) {
    const { energy } = features;
    const now = Date.now();
    if (!this.isSpeaking && energy >= this.speechThreshold) {
      this.isSpeaking = true;
      this.silenceStart = 0;
      return { label: "speaking", confidence: this.energyToConfidence(energy), timestamp: now };
    }
    if (this.isSpeaking && energy >= this.silenceThreshold) {
      this.silenceStart = 0;
      return { label: "speaking", confidence: this.energyToConfidence(energy), timestamp: now };
    }
    if (this.isSpeaking && energy < this.silenceThreshold) {
      if (this.silenceStart === 0) {
        this.silenceStart = now;
      }
      const silenceDuration = now - this.silenceStart;
      if (silenceDuration >= this.pauseHintMs) {
        this.isSpeaking = false;
        const confidence = Math.min(silenceDuration / (this.pauseHintMs * 2), 1);
        return { label: "turn_complete", confidence, timestamp: now };
      }
      return { label: "thinking_pause", confidence: 0.6, timestamp: now };
    }
    return { label: "thinking_pause", confidence: 0.3, timestamp: now };
  }
  reset() {
    this.isSpeaking = false;
    this.silenceStart = 0;
  }
  energyToConfidence(energy) {
    return Math.min(energy / (this.speechThreshold * 4), 1);
  }
};

// src/model/onnx.ts
var ONNXModel = class {
  session = null;
  fallback;
  constructor(sensitivity = 0.5) {
    this.fallback = new EnergyVAD(sensitivity);
  }
  /**
   * Load the ONNX model from a given path or URL.
   *
   * TODO:
   * 1. Import onnxruntime-web InferenceSession
   * 2. Load model bytes
   * 3. Create session with appropriate execution providers
   */
  async load(_path) {
    this.session = null;
  }
  /**
   * Run inference on a set of extracted features.
   *
   * TODO:
   * 1. Build input tensor from AudioFeatures
   * 2. Run session.run()
   * 3. Parse output into ClassificationResult
   */
  async predict(features) {
    if (!this.session) {
      return this.fallback.classify(features);
    }
    return this.fallback.classify(features);
  }
  /**
   * Release model resources.
   */
  dispose() {
    this.session = null;
    this.fallback.reset();
  }
};

// src/detector/turn-detector.ts
var TurnDetector = class {
  listeners = /* @__PURE__ */ new Map();
  state = "idle";
  pauseStart = 0;
  speakStart = 0;
  sensitivity;
  pauseTolerance;
  constructor(sensitivity = 0.5, pauseTolerance = 1500) {
    this.sensitivity = sensitivity;
    this.pauseTolerance = pauseTolerance;
  }
  /**
   * Register an event listener.
   */
  on(event, listener) {
    if (!this.listeners.has(event)) {
      this.listeners.set(event, /* @__PURE__ */ new Set());
    }
    this.listeners.get(event).add(listener);
  }
  /**
   * Remove an event listener.
   */
  off(event, listener) {
    this.listeners.get(event)?.delete(listener);
  }
  /**
   * Process a classification result from the model and emit events.
   */
  process(result) {
    const { label, confidence, timestamp } = result;
    const threshold = this.sensitivity;
    switch (label) {
      case "speaking":
        if (this.state !== "speaking") {
          this.state = "speaking";
          this.speakStart = timestamp;
          this.emit("speechStart", { timestamp });
        }
        break;
      case "thinking_pause":
        if (this.state === "speaking" && confidence >= threshold) {
          this.state = "paused";
          this.pauseStart = timestamp;
          this.emit("pause", {
            duration: 0,
            confidence
          });
        } else if (this.state === "paused") {
          const duration = timestamp - this.pauseStart;
          if (duration >= this.pauseTolerance) {
            this.state = "idle";
            this.emit("turnEnd", {
              confidence,
              duration: timestamp - this.speakStart
            });
          }
        }
        break;
      case "turn_complete":
        if ((this.state === "speaking" || this.state === "paused") && confidence >= threshold) {
          this.state = "idle";
          this.emit("turnEnd", {
            confidence,
            duration: timestamp - this.speakStart
          });
        }
        break;
      case "interrupt_intent":
        if (confidence >= threshold) {
          this.emit("interrupt", { timestamp });
        }
        break;
    }
  }
  /**
   * Reset internal state.
   */
  reset() {
    this.state = "idle";
    this.pauseStart = 0;
    this.speakStart = 0;
  }
  emit(event, payload) {
    this.listeners.get(event)?.forEach((fn) => fn(payload));
  }
};

// src/types.ts
var DEFAULT_OPTIONS = {
  sensitivity: 0.5,
  pauseTolerance: 1500,
  modelPath: "bundled",
  sampleRate: 16e3
};

// src/utterance.ts
var Utterance = class {
  options;
  audio;
  features;
  model;
  detector;
  listening = false;
  constructor(options = {}) {
    this.options = { ...DEFAULT_OPTIONS, ...options };
    this.audio = new AudioCapture(this.options.sampleRate);
    this.features = new FeatureExtractor(this.options.sampleRate);
    this.model = new ONNXModel(this.options.sensitivity);
    this.detector = new TurnDetector(
      this.options.sensitivity,
      this.options.pauseTolerance
    );
  }
  /**
   * Register an event listener.
   */
  on(event, listener) {
    this.detector.on(event, listener);
  }
  /**
   * Remove an event listener.
   */
  off(event, listener) {
    this.detector.off(event, listener);
  }
  /**
   * Start listening to the microphone and detecting turns.
   */
  async start() {
    if (this.listening) return;
    await this.model.load(this.options.modelPath);
    this.audio.onAudioData(async (frame) => {
      const extracted = this.features.extract(frame);
      const result = await this.model.predict(extracted);
      this.detector.process(result);
    });
    await this.audio.start();
    this.listening = true;
  }
  /**
   * Stop listening and release all resources.
   */
  stop() {
    if (!this.listening) return;
    this.audio.stop();
    this.model.dispose();
    this.detector.reset();
    this.listening = false;
  }
  /**
   * Returns whether the detector is currently listening.
   */
  isListening() {
    return this.listening;
  }
};
export {
  Utterance
};
package/models/.gitkeep
ADDED
File without changes
package/package.json
ADDED
@@ -0,0 +1,93 @@
{
  "name": "@utterance/core",
  "version": "0.0.1",
  "description": "Client-side semantic endpointing. Know when they're done talking.",
  "type": "module",
  "main": "dist/index.cjs",
  "module": "dist/index.js",
  "types": "dist/index.d.ts",
  "exports": {
    ".": {
      "types": "./dist/index.d.ts",
      "import": "./dist/index.js",
      "require": "./dist/index.cjs"
    }
  },
  "files": [
    "dist",
    "models"
  ],
  "scripts": {
    "start": "npm run build:sdk && run-p build:sdk:watch dev test:watch",
    "dev": "next dev",
    "build": "npm run build:sdk && next build",
    "build:sdk": "tsup src/index.ts --format esm,cjs --dts --clean --tsconfig tsconfig.sdk.json",
    "build:sdk:watch": "tsup src/index.ts --watch --format esm,cjs --dts --tsconfig tsconfig.sdk.json",
    "test": "vitest run",
    "test:watch": "vitest",
    "lint": "eslint src/ tests/",
    "lint:fix": "eslint src/ tests/ --fix",
    "format": "prettier --write \"src/**/*.ts\" \"tests/**/*.ts\"",
    "format:check": "prettier --check \"src/**/*.ts\" \"tests/**/*.ts\"",
    "typecheck": "tsc --noEmit"
  },
  "keywords": [
    "voice",
    "speech",
    "endpointing",
    "vad",
    "turn-detection",
    "audio",
    "ml",
    "onnx",
    "web-audio",
    "real-time"
  ],
  "author": "Utterance Contributors",
  "license": "MIT",
  "repository": {
    "type": "git",
    "url": "https://github.com/nizh0/Utterance.git"
  },
  "homepage": "https://utterance.dev",
  "bugs": {
    "url": "https://github.com/nizh0/Utterance/issues"
  },
  "devDependencies": {
    "@eslint/js": "^9.0.0",
    "@tailwindcss/postcss": "^4.2.0",
    "@types/mdx": "^2.0.13",
    "@types/node": "^22.0.0",
    "@types/react": "^19.2.14",
    "@types/react-dom": "^19.2.3",
    "@types/three": "^0.182.0",
    "eslint": "^9.0.0",
    "npm-run-all2": "^8.0.4",
    "postcss": "^8.5.6",
    "prettier": "^3.4.0",
    "shadcn": "^3.8.5",
    "tailwindcss": "^4.2.0",
    "tsup": "^8.0.0",
    "tw-animate-css": "^1.4.0",
    "typescript": "^5.7.0",
    "typescript-eslint": "^8.0.0",
    "vitest": "^3.0.0"
  },
  "dependencies": {
    "@next/third-parties": "^16.1.6",
    "@react-three/fiber": "^9.5.0",
    "class-variance-authority": "^0.7.1",
    "clsx": "^2.1.1",
    "fumadocs-core": "^16.6.3",
    "fumadocs-mdx": "^14.2.7",
    "fumadocs-ui": "^16.6.3",
    "lucide-react": "^0.574.0",
    "next": "^16.1.6",
    "onnxruntime-web": "^1.20.0",
    "radix-ui": "^1.4.3",
    "react": "^19.2.4",
    "react-dom": "^19.2.4",
    "tailwind-merge": "^3.4.1",
    "three": "^0.183.0"
  }
}