@utterance/core 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2026 Nizh
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
package/README.md ADDED
@@ -0,0 +1,92 @@
+ <p align="center">
+   <h1 align="center">Utterance</h1>
+   <p align="center"><strong>Client-side semantic endpointing. Know when they're done talking.</strong></p>
+   <p align="center">
+     <a href="https://utterance.dev">Documentation</a> •
+     <a href="https://utterance.dev/demo">Live Demo</a> •
+     <a href="https://discord.gg/kb4zMHNtEV">Discord</a> •
+     <a href="https://github.com/nizh0/Utterance">GitHub</a>
+   </p>
+ </p>
+
+ ---
+
+ ## The Problem
+
+ Every voice app faces the same annoying problem: **it can't tell when you're done talking.**
+
+ You pause to think, and it cuts you off. You take a breath, and it responds too soon. You want to interrupt, and it keeps going.
+
+ The current solutions either:
+
+ - **Detect silence** (Silero VAD, ricky0123/vad): They know when sound stops, but they can't tell if you're thinking or finished.
+ - **Use server-side AI** (OpenAI Realtime, AssemblyAI): They are smart, but they add delay, costs, and privacy issues.
+
+ **Utterance is different.** It uses a lightweight ML model entirely on the client side. It recognizes the difference between a thinking pause and a completed turn. No cloud. No delay. No per-minute fees.
+
+ ## Quick Start
+
+ ```bash
+ npm install @utterance/core
+ ```
+
+ ```javascript
+ import { Utterance } from "@utterance/core";
+
+ const detector = new Utterance();
+
+ detector.on("turnEnd", (result) => {
+   console.log("User is done speaking", result.confidence);
+ });
+
+ detector.on("pause", (result) => {
+   console.log("User is thinking...", result.duration);
+ });
+
+ detector.on("interrupt", () => {
+   console.log("User wants to speak — stop AI response");
+ });
+
+ await detector.start();
+ ```
+
+ See the [full documentation](https://utterance.dev/docs/quick-start) for detailed usage, API reference, and integration examples.
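A minimal configuration sketch, based only on the `UtteranceOptions` interface and event typings shipped in `dist/index.d.ts` further down this diff; the specific option values here are illustrative, not recommendations:

```ts
import { Utterance } from "@utterance/core";

// All options are optional; defaults come from the bundled DEFAULT_OPTIONS
// (sensitivity 0.5, pauseTolerance 1500 ms, sampleRate 16000 Hz).
const detector = new Utterance({
  sensitivity: 0.6,     // 0-1; higher = more sensitive to pauses
  pauseTolerance: 1200, // ms of thinking pause tolerated before turnEnd
  sampleRate: 16000,    // audio sample rate in Hz
});

detector.on("speechStart", (e) => console.log("speech started at", e.timestamp));
detector.on("turnEnd", (e) => console.log("turn ended", e.confidence, e.duration));

await detector.start();
// ...later, release the microphone and model resources:
detector.stop();
```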
+
+ ## Comparison
+
+ | Feature | Silero VAD | ricky0123/vad | Picovoice Cobra | OpenAI Realtime | **Utterance** |
+ | --- | --- | --- | --- | --- | --- |
+ | Detects speech vs. silence | ✅ | ✅ | ✅ | ✅ | ✅ |
+ | Semantic pause detection | ❌ | ❌ | ❌ | ✅ | ✅ |
+ | Interrupt detection | ❌ | ❌ | ❌ | ✅ | ✅ |
+ | Runs client-side | ✅ | ✅ | ✅ | ❌ | ✅ |
+ | No API costs | ✅ | ✅ | ❌ | ❌ | ✅ |
+ | Privacy (audio stays local) | ✅ | ✅ | ✅ | ❌ | ✅ |
+
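Building on the `interrupt` and `turnEnd` events from the Quick Start, here is a barge-in sketch for a voice assistant; the `player` object standing in for TTS playback is a hypothetical placeholder, not part of this package:

```ts
import { Utterance } from "@utterance/core";

// Hypothetical stand-in for whatever plays the assistant's TTS audio.
const player = {
  isPlaying: () => true,
  stop: () => console.log("TTS playback stopped"),
};

const detector = new Utterance();

detector.on("interrupt", () => {
  // Barge-in: the user started talking over the assistant.
  if (player.isPlaying()) player.stop();
});

detector.on("turnEnd", ({ confidence, duration }) => {
  // Hand the completed turn to the rest of the pipeline here.
  console.log("turn complete", { confidence, duration });
});

await detector.start();
```

Note that in this 0.0.1 build the bundled energy-based fallback never produces the `interrupt_intent` label (see `dist/index.cjs` below), so the `interrupt` handler is wired to the documented event but will not fire until a classifier that emits that label ships.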
+ ## Contributing
+
+ We're building Utterance in the open, and contributions are welcome.
+
+ ```bash
+ git clone https://github.com/nizh0/Utterance.git
+ cd Utterance
+ npm install
+ npm start
+ ```
+
+ See the [contributing guide](https://utterance.dev/docs/contributing) for development workflow, project structure, and areas where we need help.
+
+ ## Community
+
+ - [Discord](https://discord.gg/kb4zMHNtEV): Chat with contributors
+ - [GitHub Issues](https://github.com/nizh0/Utterance/issues): Bug reports & feature requests
+
+ ## License
+
+ MIT © [Utterance](https://utterance.dev)
+
+ ---
+
+ <p align="center">
+   <strong>"Five pharmacies on one road. But this one actually knows when you're done talking."</strong>
+ </p>
package/dist/index.cjs ADDED
@@ -0,0 +1,381 @@
+ "use strict";
+ var __defProp = Object.defineProperty;
+ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
+ var __getOwnPropNames = Object.getOwnPropertyNames;
+ var __hasOwnProp = Object.prototype.hasOwnProperty;
+ var __export = (target, all) => {
+   for (var name in all)
+     __defProp(target, name, { get: all[name], enumerable: true });
+ };
+ var __copyProps = (to, from, except, desc) => {
+   if (from && typeof from === "object" || typeof from === "function") {
+     for (let key of __getOwnPropNames(from))
+       if (!__hasOwnProp.call(to, key) && key !== except)
+         __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
+   }
+   return to;
+ };
+ var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
+
+ // src/index.ts
+ var index_exports = {};
+ __export(index_exports, {
+   Utterance: () => Utterance
+ });
+ module.exports = __toCommonJS(index_exports);
+
+ // src/audio/capture.ts
+ var AudioCapture = class {
+   context = null;
+   stream = null;
+   processor = null;
+   callback = null;
+   sampleRate;
+   constructor(sampleRate = 16e3) {
+     this.sampleRate = sampleRate;
+   }
+   onAudioData(callback) {
+     this.callback = callback;
+   }
+   async start() {
+     this.stream = await navigator.mediaDevices.getUserMedia({
+       audio: {
+         sampleRate: this.sampleRate,
+         channelCount: 1,
+         echoCancellation: true,
+         noiseSuppression: true
+       }
+     });
+     this.context = new AudioContext({ sampleRate: this.sampleRate });
+     const source = this.context.createMediaStreamSource(this.stream);
+     const bufferSize = 4096;
+     this.processor = this.context.createScriptProcessor(bufferSize, 1, 1);
+     this.processor.onaudioprocess = (event) => {
+       const input = event.inputBuffer.getChannelData(0);
+       this.callback?.(new Float32Array(input));
+     };
+     source.connect(this.processor);
+     this.processor.connect(this.context.destination);
+   }
+   stop() {
+     this.processor?.disconnect();
+     this.stream?.getTracks().forEach((track) => track.stop());
+     void this.context?.close();
+     this.processor = null;
+     this.stream = null;
+     this.context = null;
+   }
+ };
+
+ // src/features/extractor.ts
+ var FeatureExtractor = class {
+   sampleRate;
+   constructor(sampleRate = 16e3) {
+     this.sampleRate = sampleRate;
+   }
+   /**
+    * Extract all features from a single audio frame.
+    */
+   extract(frame) {
+     return {
+       mfcc: this.computeMFCC(frame),
+       energy: this.computeEnergy(frame),
+       pitch: this.estimatePitch(frame),
+       speechRate: this.estimateSpeechRate(frame),
+       pauseDuration: 0
+       // tracked by the detector over time
+     };
+   }
+   /**
+    * Compute Mel-Frequency Cepstral Coefficients.
+    *
+    * TODO: Implement full MFCC pipeline:
+    * 1. Pre-emphasis filter
+    * 2. Windowing (Hamming)
+    * 3. FFT
+    * 4. Mel filterbank
+    * 5. Log energy
+    * 6. DCT
+    */
+   computeMFCC(_frame) {
+     return new Float32Array(13);
+   }
+   /**
+    * Compute RMS energy of the frame.
+    */
+   computeEnergy(frame) {
+     let sum = 0;
+     for (let i = 0; i < frame.length; i++) {
+       sum += frame[i] * frame[i];
+     }
+     return Math.sqrt(sum / frame.length);
+   }
+   /**
+    * Estimate fundamental frequency (pitch) using autocorrelation.
+    *
+    * TODO: Implement YIN or autocorrelation-based pitch detection.
+    */
+   estimatePitch(_frame) {
+     void this.sampleRate;
+     return 0;
+   }
+   /**
+    * Estimate speech rate (syllables per second).
+    *
+    * TODO: Implement energy-envelope peak counting.
+    */
+   estimateSpeechRate(_frame) {
+     return 0;
+   }
+ };
+
+ // src/model/energy-vad.ts
+ var EnergyVAD = class {
+   speechThreshold;
+   silenceThreshold;
+   isSpeaking = false;
+   silenceStart = 0;
+   pauseHintMs;
+   constructor(sensitivity = 0.5) {
+     this.speechThreshold = 0.015 * (1 - sensitivity * 0.8);
+     this.silenceThreshold = this.speechThreshold * 0.6;
+     this.pauseHintMs = 800;
+   }
+   classify(features) {
+     const { energy } = features;
+     const now = Date.now();
+     if (!this.isSpeaking && energy >= this.speechThreshold) {
+       this.isSpeaking = true;
+       this.silenceStart = 0;
+       return { label: "speaking", confidence: this.energyToConfidence(energy), timestamp: now };
+     }
+     if (this.isSpeaking && energy >= this.silenceThreshold) {
+       this.silenceStart = 0;
+       return { label: "speaking", confidence: this.energyToConfidence(energy), timestamp: now };
+     }
+     if (this.isSpeaking && energy < this.silenceThreshold) {
+       if (this.silenceStart === 0) {
+         this.silenceStart = now;
+       }
+       const silenceDuration = now - this.silenceStart;
+       if (silenceDuration >= this.pauseHintMs) {
+         this.isSpeaking = false;
+         const confidence = Math.min(silenceDuration / (this.pauseHintMs * 2), 1);
+         return { label: "turn_complete", confidence, timestamp: now };
+       }
+       return { label: "thinking_pause", confidence: 0.6, timestamp: now };
+     }
+     return { label: "thinking_pause", confidence: 0.3, timestamp: now };
+   }
+   reset() {
+     this.isSpeaking = false;
+     this.silenceStart = 0;
+   }
+   energyToConfidence(energy) {
+     return Math.min(energy / (this.speechThreshold * 4), 1);
+   }
+ };
+
+ // src/model/onnx.ts
+ var ONNXModel = class {
+   session = null;
+   fallback;
+   constructor(sensitivity = 0.5) {
+     this.fallback = new EnergyVAD(sensitivity);
+   }
+   /**
+    * Load the ONNX model from a given path or URL.
+    *
+    * TODO:
+    * 1. Import onnxruntime-web InferenceSession
+    * 2. Load model bytes
+    * 3. Create session with appropriate execution providers
+    */
+   async load(_path) {
+     this.session = null;
+   }
+   /**
+    * Run inference on a set of extracted features.
+    *
+    * TODO:
+    * 1. Build input tensor from AudioFeatures
+    * 2. Run session.run()
+    * 3. Parse output into ClassificationResult
+    */
+   async predict(features) {
+     if (!this.session) {
+       return this.fallback.classify(features);
+     }
+     return this.fallback.classify(features);
+   }
+   /**
+    * Release model resources.
+    */
+   dispose() {
+     this.session = null;
+     this.fallback.reset();
+   }
+ };
+
+ // src/detector/turn-detector.ts
+ var TurnDetector = class {
+   listeners = /* @__PURE__ */ new Map();
+   state = "idle";
+   pauseStart = 0;
+   speakStart = 0;
+   sensitivity;
+   pauseTolerance;
+   constructor(sensitivity = 0.5, pauseTolerance = 1500) {
+     this.sensitivity = sensitivity;
+     this.pauseTolerance = pauseTolerance;
+   }
+   /**
+    * Register an event listener.
+    */
+   on(event, listener) {
+     if (!this.listeners.has(event)) {
+       this.listeners.set(event, /* @__PURE__ */ new Set());
+     }
+     this.listeners.get(event).add(listener);
+   }
+   /**
+    * Remove an event listener.
+    */
+   off(event, listener) {
+     this.listeners.get(event)?.delete(listener);
+   }
+   /**
+    * Process a classification result from the model and emit events.
+    */
+   process(result) {
+     const { label, confidence, timestamp } = result;
+     const threshold = this.sensitivity;
+     switch (label) {
+       case "speaking":
+         if (this.state !== "speaking") {
+           this.state = "speaking";
+           this.speakStart = timestamp;
+           this.emit("speechStart", { timestamp });
+         }
+         break;
+       case "thinking_pause":
+         if (this.state === "speaking" && confidence >= threshold) {
+           this.state = "paused";
+           this.pauseStart = timestamp;
+           this.emit("pause", {
+             duration: 0,
+             confidence
+           });
+         } else if (this.state === "paused") {
+           const duration = timestamp - this.pauseStart;
+           if (duration >= this.pauseTolerance) {
+             this.state = "idle";
+             this.emit("turnEnd", {
+               confidence,
+               duration: timestamp - this.speakStart
+             });
+           }
+         }
+         break;
+       case "turn_complete":
+         if ((this.state === "speaking" || this.state === "paused") && confidence >= threshold) {
+           this.state = "idle";
+           this.emit("turnEnd", {
+             confidence,
+             duration: timestamp - this.speakStart
+           });
+         }
+         break;
+       case "interrupt_intent":
+         if (confidence >= threshold) {
+           this.emit("interrupt", { timestamp });
+         }
+         break;
+     }
+   }
+   /**
+    * Reset internal state.
+    */
+   reset() {
+     this.state = "idle";
+     this.pauseStart = 0;
+     this.speakStart = 0;
+   }
+   emit(event, payload) {
+     this.listeners.get(event)?.forEach((fn) => fn(payload));
+   }
+ };
+
+ // src/types.ts
+ var DEFAULT_OPTIONS = {
+   sensitivity: 0.5,
+   pauseTolerance: 1500,
+   modelPath: "bundled",
+   sampleRate: 16e3
+ };
+
+ // src/utterance.ts
+ var Utterance = class {
+   options;
+   audio;
+   features;
+   model;
+   detector;
+   listening = false;
+   constructor(options = {}) {
+     this.options = { ...DEFAULT_OPTIONS, ...options };
+     this.audio = new AudioCapture(this.options.sampleRate);
+     this.features = new FeatureExtractor(this.options.sampleRate);
+     this.model = new ONNXModel(this.options.sensitivity);
+     this.detector = new TurnDetector(
+       this.options.sensitivity,
+       this.options.pauseTolerance
+     );
+   }
+   /**
+    * Register an event listener.
+    */
+   on(event, listener) {
+     this.detector.on(event, listener);
+   }
+   /**
+    * Remove an event listener.
+    */
+   off(event, listener) {
+     this.detector.off(event, listener);
+   }
+   /**
+    * Start listening to the microphone and detecting turns.
+    */
+   async start() {
+     if (this.listening) return;
+     await this.model.load(this.options.modelPath);
+     this.audio.onAudioData(async (frame) => {
+       const extracted = this.features.extract(frame);
+       const result = await this.model.predict(extracted);
+       this.detector.process(result);
+     });
+     await this.audio.start();
+     this.listening = true;
+   }
+   /**
+    * Stop listening and release all resources.
+    */
+   stop() {
+     if (!this.listening) return;
+     this.audio.stop();
+     this.model.dispose();
+     this.detector.reset();
+     this.listening = false;
+   }
+   /**
+    * Returns whether the detector is currently listening.
+    */
+   isListening() {
+     return this.listening;
+   }
+ };
+ // Annotate the CommonJS export names for ESM import in node:
+ 0 && (module.exports = {
+   Utterance
+ });
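Since the ONNX path above is still a stub, every classification in this build comes from the `EnergyVAD` fallback. A worked sketch of how its constructor maps `sensitivity` to RMS thresholds; the numbers follow directly from the code above:

```ts
// Threshold derivation mirrored from EnergyVAD's constructor, at the default sensitivity.
const sensitivity = 0.5;
const speechThreshold = 0.015 * (1 - sensitivity * 0.8); // 0.015 * 0.6 = 0.009 RMS
const silenceThreshold = speechThreshold * 0.6;          // 0.0054 RMS
const pauseHintMs = 800;                                 // fixed in this build

// A frame with RMS energy >= 0.009 flips the VAD to "speaking"; once speaking,
// energy must stay below 0.0054 for 800 ms before it reports "turn_complete".
console.log({ speechThreshold, silenceThreshold, pauseHintMs });
```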
@@ -0,0 +1,92 @@
+ /**
+  * Core types for the Utterance SDK.
+  *
+  * All shared interfaces and type definitions live here to keep
+  * the codebase scalable and avoid circular dependencies.
+  */
+ interface UtteranceOptions {
+   /** Detection sensitivity (0-1). Higher = more sensitive to pauses. Default: 0.5 */
+   sensitivity?: number;
+   /** Max thinking pause duration (ms) before triggering turnEnd. Default: 1500 */
+   pauseTolerance?: number;
+   /** Path to a custom ONNX model. Default: bundled model */
+   modelPath?: string;
+   /** Audio sample rate in Hz. Default: 16000 */
+   sampleRate?: number;
+ }
+ type ClassificationLabel = "speaking" | "thinking_pause" | "turn_complete" | "interrupt_intent";
+ interface ClassificationResult {
+   label: ClassificationLabel;
+   confidence: number;
+   timestamp: number;
+ }
+ interface SpeechStartEvent {
+   timestamp: number;
+ }
+ interface PauseEvent {
+   duration: number;
+   confidence: number;
+ }
+ interface TurnEndEvent {
+   confidence: number;
+   duration: number;
+ }
+ interface InterruptEvent {
+   timestamp: number;
+ }
+ interface UtteranceEventMap {
+   speechStart: SpeechStartEvent;
+   pause: PauseEvent;
+   turnEnd: TurnEndEvent;
+   interrupt: InterruptEvent;
+ }
+ type UtteranceEvent = keyof UtteranceEventMap;
+ interface AudioFeatures {
+   mfcc: Float32Array;
+   energy: number;
+   pitch: number;
+   speechRate: number;
+   pauseDuration: number;
+ }
+
+ /**
+  * Main entry point for the Utterance SDK.
+  *
+  * Usage:
+  * ```ts
+  * const detector = new Utterance({ sensitivity: 0.6 });
+  * detector.on("turnEnd", (e) => console.log("Done!", e.confidence));
+  * await detector.start();
+  * ```
+  */
+ declare class Utterance {
+   private readonly options;
+   private readonly audio;
+   private readonly features;
+   private readonly model;
+   private readonly detector;
+   private listening;
+   constructor(options?: UtteranceOptions);
+   /**
+    * Register an event listener.
+    */
+   on<E extends UtteranceEvent>(event: E, listener: (payload: UtteranceEventMap[E]) => void): void;
+   /**
+    * Remove an event listener.
+    */
+   off<E extends UtteranceEvent>(event: E, listener: (payload: UtteranceEventMap[E]) => void): void;
+   /**
+    * Start listening to the microphone and detecting turns.
+    */
+   start(): Promise<void>;
+   /**
+    * Stop listening and release all resources.
+    */
+   stop(): void;
+   /**
+    * Returns whether the detector is currently listening.
+    */
+   isListening(): boolean;
+ }
+
+ export { type AudioFeatures, type ClassificationLabel, type ClassificationResult, type InterruptEvent, type PauseEvent, type SpeechStartEvent, type TurnEndEvent, Utterance, type UtteranceEvent, type UtteranceEventMap, type UtteranceOptions };
@@ -0,0 +1,92 @@
+ /**
+  * Core types for the Utterance SDK.
+  *
+  * All shared interfaces and type definitions live here to keep
+  * the codebase scalable and avoid circular dependencies.
+  */
+ interface UtteranceOptions {
+   /** Detection sensitivity (0-1). Higher = more sensitive to pauses. Default: 0.5 */
+   sensitivity?: number;
+   /** Max thinking pause duration (ms) before triggering turnEnd. Default: 1500 */
+   pauseTolerance?: number;
+   /** Path to a custom ONNX model. Default: bundled model */
+   modelPath?: string;
+   /** Audio sample rate in Hz. Default: 16000 */
+   sampleRate?: number;
+ }
+ type ClassificationLabel = "speaking" | "thinking_pause" | "turn_complete" | "interrupt_intent";
+ interface ClassificationResult {
+   label: ClassificationLabel;
+   confidence: number;
+   timestamp: number;
+ }
+ interface SpeechStartEvent {
+   timestamp: number;
+ }
+ interface PauseEvent {
+   duration: number;
+   confidence: number;
+ }
+ interface TurnEndEvent {
+   confidence: number;
+   duration: number;
+ }
+ interface InterruptEvent {
+   timestamp: number;
+ }
+ interface UtteranceEventMap {
+   speechStart: SpeechStartEvent;
+   pause: PauseEvent;
+   turnEnd: TurnEndEvent;
+   interrupt: InterruptEvent;
+ }
+ type UtteranceEvent = keyof UtteranceEventMap;
+ interface AudioFeatures {
+   mfcc: Float32Array;
+   energy: number;
+   pitch: number;
+   speechRate: number;
+   pauseDuration: number;
+ }
+
+ /**
+  * Main entry point for the Utterance SDK.
+  *
+  * Usage:
+  * ```ts
+  * const detector = new Utterance({ sensitivity: 0.6 });
+  * detector.on("turnEnd", (e) => console.log("Done!", e.confidence));
+  * await detector.start();
+  * ```
+  */
+ declare class Utterance {
+   private readonly options;
+   private readonly audio;
+   private readonly features;
+   private readonly model;
+   private readonly detector;
+   private listening;
+   constructor(options?: UtteranceOptions);
+   /**
+    * Register an event listener.
+    */
+   on<E extends UtteranceEvent>(event: E, listener: (payload: UtteranceEventMap[E]) => void): void;
+   /**
+    * Remove an event listener.
+    */
+   off<E extends UtteranceEvent>(event: E, listener: (payload: UtteranceEventMap[E]) => void): void;
+   /**
+    * Start listening to the microphone and detecting turns.
+    */
+   start(): Promise<void>;
+   /**
+    * Stop listening and release all resources.
+    */
+   stop(): void;
+   /**
+    * Returns whether the detector is currently listening.
+    */
+   isListening(): boolean;
+ }
+
+ export { type AudioFeatures, type ClassificationLabel, type ClassificationResult, type InterruptEvent, type PauseEvent, type SpeechStartEvent, type TurnEndEvent, Utterance, type UtteranceEvent, type UtteranceEventMap, type UtteranceOptions };
package/dist/index.js ADDED
@@ -0,0 +1,354 @@
+ // src/audio/capture.ts
+ var AudioCapture = class {
+   context = null;
+   stream = null;
+   processor = null;
+   callback = null;
+   sampleRate;
+   constructor(sampleRate = 16e3) {
+     this.sampleRate = sampleRate;
+   }
+   onAudioData(callback) {
+     this.callback = callback;
+   }
+   async start() {
+     this.stream = await navigator.mediaDevices.getUserMedia({
+       audio: {
+         sampleRate: this.sampleRate,
+         channelCount: 1,
+         echoCancellation: true,
+         noiseSuppression: true
+       }
+     });
+     this.context = new AudioContext({ sampleRate: this.sampleRate });
+     const source = this.context.createMediaStreamSource(this.stream);
+     const bufferSize = 4096;
+     this.processor = this.context.createScriptProcessor(bufferSize, 1, 1);
+     this.processor.onaudioprocess = (event) => {
+       const input = event.inputBuffer.getChannelData(0);
+       this.callback?.(new Float32Array(input));
+     };
+     source.connect(this.processor);
+     this.processor.connect(this.context.destination);
+   }
+   stop() {
+     this.processor?.disconnect();
+     this.stream?.getTracks().forEach((track) => track.stop());
+     void this.context?.close();
+     this.processor = null;
+     this.stream = null;
+     this.context = null;
+   }
+ };
+
+ // src/features/extractor.ts
+ var FeatureExtractor = class {
+   sampleRate;
+   constructor(sampleRate = 16e3) {
+     this.sampleRate = sampleRate;
+   }
+   /**
+    * Extract all features from a single audio frame.
+    */
+   extract(frame) {
+     return {
+       mfcc: this.computeMFCC(frame),
+       energy: this.computeEnergy(frame),
+       pitch: this.estimatePitch(frame),
+       speechRate: this.estimateSpeechRate(frame),
+       pauseDuration: 0
+       // tracked by the detector over time
+     };
+   }
+   /**
+    * Compute Mel-Frequency Cepstral Coefficients.
+    *
+    * TODO: Implement full MFCC pipeline:
+    * 1. Pre-emphasis filter
+    * 2. Windowing (Hamming)
+    * 3. FFT
+    * 4. Mel filterbank
+    * 5. Log energy
+    * 6. DCT
+    */
+   computeMFCC(_frame) {
+     return new Float32Array(13);
+   }
+   /**
+    * Compute RMS energy of the frame.
+    */
+   computeEnergy(frame) {
+     let sum = 0;
+     for (let i = 0; i < frame.length; i++) {
+       sum += frame[i] * frame[i];
+     }
+     return Math.sqrt(sum / frame.length);
+   }
+   /**
+    * Estimate fundamental frequency (pitch) using autocorrelation.
+    *
+    * TODO: Implement YIN or autocorrelation-based pitch detection.
+    */
+   estimatePitch(_frame) {
+     void this.sampleRate;
+     return 0;
+   }
+   /**
+    * Estimate speech rate (syllables per second).
+    *
+    * TODO: Implement energy-envelope peak counting.
+    */
+   estimateSpeechRate(_frame) {
+     return 0;
+   }
+ };
+
+ // src/model/energy-vad.ts
+ var EnergyVAD = class {
+   speechThreshold;
+   silenceThreshold;
+   isSpeaking = false;
+   silenceStart = 0;
+   pauseHintMs;
+   constructor(sensitivity = 0.5) {
+     this.speechThreshold = 0.015 * (1 - sensitivity * 0.8);
+     this.silenceThreshold = this.speechThreshold * 0.6;
+     this.pauseHintMs = 800;
+   }
+   classify(features) {
+     const { energy } = features;
+     const now = Date.now();
+     if (!this.isSpeaking && energy >= this.speechThreshold) {
+       this.isSpeaking = true;
+       this.silenceStart = 0;
+       return { label: "speaking", confidence: this.energyToConfidence(energy), timestamp: now };
+     }
+     if (this.isSpeaking && energy >= this.silenceThreshold) {
+       this.silenceStart = 0;
+       return { label: "speaking", confidence: this.energyToConfidence(energy), timestamp: now };
+     }
+     if (this.isSpeaking && energy < this.silenceThreshold) {
+       if (this.silenceStart === 0) {
+         this.silenceStart = now;
+       }
+       const silenceDuration = now - this.silenceStart;
+       if (silenceDuration >= this.pauseHintMs) {
+         this.isSpeaking = false;
+         const confidence = Math.min(silenceDuration / (this.pauseHintMs * 2), 1);
+         return { label: "turn_complete", confidence, timestamp: now };
+       }
+       return { label: "thinking_pause", confidence: 0.6, timestamp: now };
+     }
+     return { label: "thinking_pause", confidence: 0.3, timestamp: now };
+   }
+   reset() {
+     this.isSpeaking = false;
+     this.silenceStart = 0;
+   }
+   energyToConfidence(energy) {
+     return Math.min(energy / (this.speechThreshold * 4), 1);
+   }
+ };
+
+ // src/model/onnx.ts
+ var ONNXModel = class {
+   session = null;
+   fallback;
+   constructor(sensitivity = 0.5) {
+     this.fallback = new EnergyVAD(sensitivity);
+   }
+   /**
+    * Load the ONNX model from a given path or URL.
+    *
+    * TODO:
+    * 1. Import onnxruntime-web InferenceSession
+    * 2. Load model bytes
+    * 3. Create session with appropriate execution providers
+    */
+   async load(_path) {
+     this.session = null;
+   }
+   /**
+    * Run inference on a set of extracted features.
+    *
+    * TODO:
+    * 1. Build input tensor from AudioFeatures
+    * 2. Run session.run()
+    * 3. Parse output into ClassificationResult
+    */
+   async predict(features) {
+     if (!this.session) {
+       return this.fallback.classify(features);
+     }
+     return this.fallback.classify(features);
+   }
+   /**
+    * Release model resources.
+    */
+   dispose() {
+     this.session = null;
+     this.fallback.reset();
+   }
+ };
+
+ // src/detector/turn-detector.ts
+ var TurnDetector = class {
+   listeners = /* @__PURE__ */ new Map();
+   state = "idle";
+   pauseStart = 0;
+   speakStart = 0;
+   sensitivity;
+   pauseTolerance;
+   constructor(sensitivity = 0.5, pauseTolerance = 1500) {
+     this.sensitivity = sensitivity;
+     this.pauseTolerance = pauseTolerance;
+   }
+   /**
+    * Register an event listener.
+    */
+   on(event, listener) {
+     if (!this.listeners.has(event)) {
+       this.listeners.set(event, /* @__PURE__ */ new Set());
+     }
+     this.listeners.get(event).add(listener);
+   }
+   /**
+    * Remove an event listener.
+    */
+   off(event, listener) {
+     this.listeners.get(event)?.delete(listener);
+   }
+   /**
+    * Process a classification result from the model and emit events.
+    */
+   process(result) {
+     const { label, confidence, timestamp } = result;
+     const threshold = this.sensitivity;
+     switch (label) {
+       case "speaking":
+         if (this.state !== "speaking") {
+           this.state = "speaking";
+           this.speakStart = timestamp;
+           this.emit("speechStart", { timestamp });
+         }
+         break;
+       case "thinking_pause":
+         if (this.state === "speaking" && confidence >= threshold) {
+           this.state = "paused";
+           this.pauseStart = timestamp;
+           this.emit("pause", {
+             duration: 0,
+             confidence
+           });
+         } else if (this.state === "paused") {
+           const duration = timestamp - this.pauseStart;
+           if (duration >= this.pauseTolerance) {
+             this.state = "idle";
+             this.emit("turnEnd", {
+               confidence,
+               duration: timestamp - this.speakStart
+             });
+           }
+         }
+         break;
+       case "turn_complete":
+         if ((this.state === "speaking" || this.state === "paused") && confidence >= threshold) {
+           this.state = "idle";
+           this.emit("turnEnd", {
+             confidence,
+             duration: timestamp - this.speakStart
+           });
+         }
+         break;
+       case "interrupt_intent":
+         if (confidence >= threshold) {
+           this.emit("interrupt", { timestamp });
+         }
+         break;
+     }
+   }
+   /**
+    * Reset internal state.
+    */
+   reset() {
+     this.state = "idle";
+     this.pauseStart = 0;
+     this.speakStart = 0;
+   }
+   emit(event, payload) {
+     this.listeners.get(event)?.forEach((fn) => fn(payload));
+   }
+ };
+
+ // src/types.ts
+ var DEFAULT_OPTIONS = {
+   sensitivity: 0.5,
+   pauseTolerance: 1500,
+   modelPath: "bundled",
+   sampleRate: 16e3
+ };
+
+ // src/utterance.ts
+ var Utterance = class {
+   options;
+   audio;
+   features;
+   model;
+   detector;
+   listening = false;
+   constructor(options = {}) {
+     this.options = { ...DEFAULT_OPTIONS, ...options };
+     this.audio = new AudioCapture(this.options.sampleRate);
+     this.features = new FeatureExtractor(this.options.sampleRate);
+     this.model = new ONNXModel(this.options.sensitivity);
+     this.detector = new TurnDetector(
+       this.options.sensitivity,
+       this.options.pauseTolerance
+     );
+   }
+   /**
+    * Register an event listener.
+    */
+   on(event, listener) {
+     this.detector.on(event, listener);
+   }
+   /**
+    * Remove an event listener.
+    */
+   off(event, listener) {
+     this.detector.off(event, listener);
+   }
+   /**
+    * Start listening to the microphone and detecting turns.
+    */
+   async start() {
+     if (this.listening) return;
+     await this.model.load(this.options.modelPath);
+     this.audio.onAudioData(async (frame) => {
+       const extracted = this.features.extract(frame);
+       const result = await this.model.predict(extracted);
+       this.detector.process(result);
+     });
+     await this.audio.start();
+     this.listening = true;
+   }
+   /**
+    * Stop listening and release all resources.
+    */
+   stop() {
+     if (!this.listening) return;
+     this.audio.stop();
+     this.model.dispose();
+     this.detector.reset();
+     this.listening = false;
+   }
+   /**
+    * Returns whether the detector is currently listening.
+    */
+   isListening() {
+     return this.listening;
+   }
+ };
+ export {
+   Utterance
+ };
package/package.json ADDED
@@ -0,0 +1,93 @@
+ {
+   "name": "@utterance/core",
+   "version": "0.0.1",
+   "description": "Client-side semantic endpointing. Know when they're done talking.",
+   "type": "module",
+   "main": "dist/index.cjs",
+   "module": "dist/index.js",
+   "types": "dist/index.d.ts",
+   "exports": {
+     ".": {
+       "types": "./dist/index.d.ts",
+       "import": "./dist/index.js",
+       "require": "./dist/index.cjs"
+     }
+   },
+   "files": [
+     "dist",
+     "models"
+   ],
+   "scripts": {
+     "start": "npm run build:sdk && run-p build:sdk:watch dev test:watch",
+     "dev": "next dev",
+     "build": "npm run build:sdk && next build",
+     "build:sdk": "tsup src/index.ts --format esm,cjs --dts --clean --tsconfig tsconfig.sdk.json",
+     "build:sdk:watch": "tsup src/index.ts --watch --format esm,cjs --dts --tsconfig tsconfig.sdk.json",
+     "test": "vitest run",
+     "test:watch": "vitest",
+     "lint": "eslint src/ tests/",
+     "lint:fix": "eslint src/ tests/ --fix",
+     "format": "prettier --write \"src/**/*.ts\" \"tests/**/*.ts\"",
+     "format:check": "prettier --check \"src/**/*.ts\" \"tests/**/*.ts\"",
+     "typecheck": "tsc --noEmit"
+   },
+   "keywords": [
+     "voice",
+     "speech",
+     "endpointing",
+     "vad",
+     "turn-detection",
+     "audio",
+     "ml",
+     "onnx",
+     "web-audio",
+     "real-time"
+   ],
+   "author": "Utterance Contributors",
+   "license": "MIT",
+   "repository": {
+     "type": "git",
+     "url": "https://github.com/nizh0/Utterance.git"
+   },
+   "homepage": "https://utterance.dev",
+   "bugs": {
+     "url": "https://github.com/nizh0/Utterance/issues"
+   },
+   "devDependencies": {
+     "@eslint/js": "^9.0.0",
+     "@tailwindcss/postcss": "^4.2.0",
+     "@types/mdx": "^2.0.13",
+     "@types/node": "^22.0.0",
+     "@types/react": "^19.2.14",
+     "@types/react-dom": "^19.2.3",
+     "@types/three": "^0.182.0",
+     "eslint": "^9.0.0",
+     "npm-run-all2": "^8.0.4",
+     "postcss": "^8.5.6",
+     "prettier": "^3.4.0",
+     "shadcn": "^3.8.5",
+     "tailwindcss": "^4.2.0",
+     "tsup": "^8.0.0",
+     "tw-animate-css": "^1.4.0",
+     "typescript": "^5.7.0",
+     "typescript-eslint": "^8.0.0",
+     "vitest": "^3.0.0"
+   },
+   "dependencies": {
+     "@next/third-parties": "^16.1.6",
+     "@react-three/fiber": "^9.5.0",
+     "class-variance-authority": "^0.7.1",
+     "clsx": "^2.1.1",
+     "fumadocs-core": "^16.6.3",
+     "fumadocs-mdx": "^14.2.7",
+     "fumadocs-ui": "^16.6.3",
+     "lucide-react": "^0.574.0",
+     "next": "^16.1.6",
+     "onnxruntime-web": "^1.20.0",
+     "radix-ui": "^1.4.3",
+     "react": "^19.2.4",
+     "react-dom": "^19.2.4",
+     "tailwind-merge": "^3.4.1",
+     "three": "^0.183.0"
+   }
+ }