mellon 0.0.3 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -9
- package/dist/assets/audio-processor.js +37 -0
- package/dist/assets/manifest.json +16 -0
- package/dist/assets/sw.js +76 -0
- package/dist/index.d.ts +295 -0
- package/dist/mellon.cjs +38 -0
- package/dist/mellon.mjs +611 -0
- package/package.json +3 -10
- /package/dist/{models → assets}/model.onnx +0 -0
- /package/dist/{wasm → assets}/ort-wasm-simd-threaded.asyncify.mjs +0 -0
- /package/dist/{wasm → assets}/ort-wasm-simd-threaded.asyncify.wasm +0 -0
- /package/dist/{wasm → assets}/ort-wasm-simd-threaded.jsep.mjs +0 -0
- /package/dist/{wasm → assets}/ort-wasm-simd-threaded.jsep.wasm +0 -0
- /package/dist/{wasm → assets}/ort-wasm-simd-threaded.jspi.mjs +0 -0
- /package/dist/{wasm → assets}/ort-wasm-simd-threaded.jspi.wasm +0 -0
- /package/dist/{wasm → assets}/ort-wasm-simd-threaded.mjs +0 -0
- /package/dist/{wasm → assets}/ort-wasm-simd-threaded.wasm +0 -0
- /package/dist/{wasm → assets}/ort.all.bundle.min.mjs +0 -0
- /package/dist/{wasm → assets}/ort.all.min.mjs +0 -0
- /package/dist/{wasm → assets}/ort.all.mjs +0 -0
- /package/dist/{wasm → assets}/ort.bundle.min.mjs +0 -0
- /package/dist/{wasm → assets}/ort.jspi.bundle.min.mjs +0 -0
- /package/dist/{wasm → assets}/ort.jspi.min.mjs +0 -0
- /package/dist/{wasm → assets}/ort.jspi.mjs +0 -0
- /package/dist/{wasm → assets}/ort.min.mjs +0 -0
- /package/dist/{wasm → assets}/ort.mjs +0 -0
- /package/dist/{wasm → assets}/ort.node.min.mjs +0 -0
- /package/dist/{wasm → assets}/ort.wasm.bundle.min.mjs +0 -0
- /package/dist/{wasm → assets}/ort.wasm.min.mjs +0 -0
- /package/dist/{wasm → assets}/ort.wasm.mjs +0 -0
- /package/dist/{wasm → assets}/ort.webgl.min.mjs +0 -0
- /package/dist/{wasm → assets}/ort.webgl.mjs +0 -0
- /package/dist/{wasm → assets}/ort.webgpu.bundle.min.mjs +0 -0
- /package/dist/{wasm → assets}/ort.webgpu.min.mjs +0 -0
- /package/dist/{wasm → assets}/ort.webgpu.mjs +0 -0
package/README.md
CHANGED
|
@@ -1,12 +1,11 @@
|
|
|
1
1
|
# mellon
|
|
2
2
|
|
|
3
|
-
Offline, fully in-browser **hotword / wake-word detection** powered by [EfficientWord-Net](https://github.com/Ant-Brain/EfficientWord-Net) (ResNet-50 ArcFace).
|
|
3
|
+
Offline, fully in-browser **hotword / wake-word detection** powered by [EfficientWord-Net](https://github.com/Ant-Brain/EfficientWord-Net) (ResNet-50 ArcFace).
|
|
4
4
|
|
|
5
5
|
- **100% offline** — ONNX inference runs in the browser via WebAssembly; no server, no cloud.
|
|
6
6
|
- **Speaker-independent** — the model generalises across voices out of the box.
|
|
7
|
-
- **Custom words** — enroll any phrase with ≥ 3 audio samples
|
|
7
|
+
- **Custom words** — enroll any phrase with ≥ 3 audio samples.
|
|
8
8
|
- **TypeScript-ready** — ships with full `.d.ts` declarations.
|
|
9
|
-
- **Tiny API surface** — one class, zero config.
|
|
10
9
|
|
|
11
10
|
---
|
|
12
11
|
|
|
@@ -81,16 +80,14 @@ Refs are fetched automatically during `start()`. You can enroll your own words
|
|
|
81
80
|
By default, the WASM runtime and model load from the jsDelivr CDN — no setup needed. For air-gapped or private-network deployments, copy the assets locally and tell the library where to find them:
|
|
82
81
|
|
|
83
82
|
```bash
|
|
84
|
-
cp -r node_modules/mellon/dist/
|
|
85
|
-
cp node_modules/mellon/dist/models/model.onnx public/mellon-assets/model.onnx
|
|
83
|
+
cp -r node_modules/mellon/dist/assets public/mellon-assets/
|
|
86
84
|
```
|
|
87
85
|
|
|
88
86
|
Then pass the paths to the constructor:
|
|
89
87
|
|
|
90
88
|
```js
|
|
91
89
|
new Mellon({
|
|
92
|
-
|
|
93
|
-
modelUrl: '/mellon-assets/model.onnx',
|
|
90
|
+
assetsPath: '/mellon-assets', // trailing slash required
|
|
94
91
|
})
|
|
95
92
|
```
|
|
96
93
|
|
|
@@ -104,8 +101,7 @@ export default {
|
|
|
104
101
|
plugins: [
|
|
105
102
|
viteStaticCopy({
|
|
106
103
|
targets: [
|
|
107
|
-
{ src: 'node_modules/mellon/dist/
|
|
108
|
-
{ src: 'node_modules/mellon/dist/models/model.onnx', dest: 'mellon-assets' },
|
|
104
|
+
{ src: 'node_modules/mellon/dist/assets/*', dest: 'mellon-assets' }
|
|
109
105
|
],
|
|
110
106
|
}),
|
|
111
107
|
],
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* public/audio-processor.js
|
|
3
|
+
* AudioWorklet that runs at 16 kHz and continuously emits the last
|
|
4
|
+
* 1.5-second window (24 000 samples) via a circular buffer.
|
|
5
|
+
*
|
|
6
|
+
* The main thread receives a fresh Float32Array on every
|
|
7
|
+
* AudioWorklet quantum (128 samples ≈ every 8 ms at 16 kHz).
|
|
8
|
+
* The inference loop in engine.js rate-limits to avoid excessive work.
|
|
9
|
+
*/
|
|
10
|
+
class AudioProcessor extends AudioWorkletProcessor {
|
|
11
|
+
constructor() {
|
|
12
|
+
super()
|
|
13
|
+
this._size = 24000 // 1.5 s × 16 000 Hz
|
|
14
|
+
this._buf = new Float32Array(this._size)
|
|
15
|
+
this._ptr = 0
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
process(inputs) {
|
|
19
|
+
const ch = inputs[0]?.[0]
|
|
20
|
+
if (!ch) return true
|
|
21
|
+
|
|
22
|
+
for (let i = 0; i < ch.length; i++) {
|
|
23
|
+
this._buf[this._ptr] = ch[i]
|
|
24
|
+
this._ptr = (this._ptr + 1) % this._size
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
// Send an ordered copy of the ring buffer
|
|
28
|
+
const out = new Float32Array(this._size)
|
|
29
|
+
for (let i = 0; i < this._size; i++) {
|
|
30
|
+
out[i] = this._buf[(this._ptr + i) % this._size]
|
|
31
|
+
}
|
|
32
|
+
this.port.postMessage(out)
|
|
33
|
+
return true
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
registerProcessor('audio-processor', AudioProcessor)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "Mellon STT",
|
|
3
|
+
"short_name": "Mellon",
|
|
4
|
+
"description": "Offline hotword detection demo — EfficientWord-Net in the browser",
|
|
5
|
+
"start_url": "./",
|
|
6
|
+
"display": "standalone",
|
|
7
|
+
"background_color": "#080810",
|
|
8
|
+
"theme_color": "#080810",
|
|
9
|
+
"icons": [
|
|
10
|
+
{
|
|
11
|
+
"src": "data:image/svg+xml,<svg xmlns='http://www.w3.org/2000/svg' viewBox='0 0 100 100'><text y='.9em' font-size='90'>🎙</text></svg>",
|
|
12
|
+
"sizes": "any",
|
|
13
|
+
"type": "image/svg+xml"
|
|
14
|
+
}
|
|
15
|
+
]
|
|
16
|
+
}
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* public/sw.js — Service Worker for offline-first caching.
|
|
3
|
+
*
|
|
4
|
+
* Strategy:
|
|
5
|
+
* • model.onnx, *.wasm, *_ref.json → Cache-first (immutable large assets)
|
|
6
|
+
* • JS / CSS / HTML → Stale-while-revalidate
|
|
7
|
+
* • Everything else → Network-first with cache fallback
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
const STATIC_CACHE = 'mellon-static-v1'
|
|
11
|
+
const MODEL_CACHE = 'mellon-model-v1'
|
|
12
|
+
|
|
13
|
+
// ─── Lifecycle ───────────────────────────────────────────────────────────────
|
|
14
|
+
|
|
15
|
+
self.addEventListener('install', () => self.skipWaiting())
|
|
16
|
+
|
|
17
|
+
self.addEventListener('activate', e => {
|
|
18
|
+
e.waitUntil(
|
|
19
|
+
caches.keys().then(keys =>
|
|
20
|
+
Promise.all(
|
|
21
|
+
keys
|
|
22
|
+
.filter(k => k !== STATIC_CACHE && k !== MODEL_CACHE)
|
|
23
|
+
.map(k => caches.delete(k))
|
|
24
|
+
)
|
|
25
|
+
).then(() => self.clients.claim())
|
|
26
|
+
)
|
|
27
|
+
})
|
|
28
|
+
|
|
29
|
+
// ─── Fetch ───────────────────────────────────────────────────────────────────
|
|
30
|
+
|
|
31
|
+
self.addEventListener('fetch', e => {
|
|
32
|
+
const { request } = e
|
|
33
|
+
const url = new URL(request.url)
|
|
34
|
+
const p = url.pathname
|
|
35
|
+
|
|
36
|
+
// Large immutable assets: model, wasm files, reference JSONs
|
|
37
|
+
if (p.endsWith('.onnx') || p.endsWith('.wasm') || p.includes('_ref.json') || p.endsWith('.mjs')) {
|
|
38
|
+
e.respondWith(cacheFirst(request, MODEL_CACHE))
|
|
39
|
+
return
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
// App shell: stale-while-revalidate
|
|
43
|
+
if (request.mode === 'navigate' || p.endsWith('.js') || p.endsWith('.css')) {
|
|
44
|
+
e.respondWith(staleWhileRevalidate(request, STATIC_CACHE))
|
|
45
|
+
return
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
// Default: network first
|
|
49
|
+
e.respondWith(
|
|
50
|
+
fetch(request).catch(() => caches.match(request))
|
|
51
|
+
)
|
|
52
|
+
})
|
|
53
|
+
|
|
54
|
+
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
|
55
|
+
|
|
56
|
+
async function cacheFirst(request, cacheName) {
|
|
57
|
+
const cache = await caches.open(cacheName)
|
|
58
|
+
const cached = await cache.match(request)
|
|
59
|
+
if (cached) return cached
|
|
60
|
+
|
|
61
|
+
const response = await fetch(request)
|
|
62
|
+
if (response.ok) cache.put(request, response.clone())
|
|
63
|
+
return response
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
async function staleWhileRevalidate(request, cacheName) {
|
|
67
|
+
const cache = await caches.open(cacheName)
|
|
68
|
+
const cached = await cache.match(request)
|
|
69
|
+
|
|
70
|
+
const fetchPromise = fetch(request).then(r => {
|
|
71
|
+
if (r.ok) cache.put(request, r.clone())
|
|
72
|
+
return r
|
|
73
|
+
}).catch(() => null)
|
|
74
|
+
|
|
75
|
+
return cached || fetchPromise
|
|
76
|
+
}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,295 @@
|
|
|
1
|
+
// mellon type declarations
|
|
2
|
+
|
|
3
|
+
// ─── Shared data types ───────────────────────────────────────────────────────
|
|
4
|
+
|
|
5
|
+
export interface RefData {
|
|
6
|
+
word_name: string
|
|
7
|
+
model_type: 'resnet_50_arc'
|
|
8
|
+
embeddings: number[][]
|
|
9
|
+
}
|
|
10
|
+
|
|
11
|
+
export interface MatchEventDetail {
|
|
12
|
+
/** The detected word name. */
|
|
13
|
+
name: string
|
|
14
|
+
/** Similarity score that triggered detection (0–1). */
|
|
15
|
+
confidence: number
|
|
16
|
+
/** Unix timestamp (ms) of the detection. */
|
|
17
|
+
timestamp: number
|
|
18
|
+
}
|
|
19
|
+
|
|
20
|
+
export interface SampleInfo {
|
|
21
|
+
audioBuffer: Float32Array
|
|
22
|
+
name: string
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
// ─── Engine ──────────────────────────────────────────────────────────────────
|
|
26
|
+
|
|
27
|
+
export interface EngineConfig {
|
|
28
|
+
/**
|
|
29
|
+
* Base URL where ORT WASM files are served (trailing slash required).
|
|
30
|
+
* Defaults to the jsDelivr CDN. Override for offline / intranet use.
|
|
31
|
+
* @example '/mellon-assets/wasm/'
|
|
32
|
+
*/
|
|
33
|
+
wasmBasePath?: string
|
|
34
|
+
/**
|
|
35
|
+
* Full URL to model.onnx.
|
|
36
|
+
* Defaults to the jsDelivr CDN. Override for offline / intranet use.
|
|
37
|
+
* @example '/mellon-assets/model.onnx'
|
|
38
|
+
*/
|
|
39
|
+
modelUrl?: string
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
/**
|
|
43
|
+
* Override asset paths. Optional — by default assets load from the jsDelivr CDN.
|
|
44
|
+
* Call this before loadModel() when deploying offline or on a private network.
|
|
45
|
+
*/
|
|
46
|
+
export function configure(config: EngineConfig): void
|
|
47
|
+
|
|
48
|
+
/**
|
|
49
|
+
* Load (or return the already-loaded) ONNX inference session.
|
|
50
|
+
* Idempotent — safe to call multiple times.
|
|
51
|
+
*
|
|
52
|
+
* @param onProgress Called with values 0.0 → 1.0 as the model downloads.
|
|
53
|
+
*/
|
|
54
|
+
export function loadModel(onProgress?: (progress: number) => void): Promise<void>
|
|
55
|
+
|
|
56
|
+
/**
|
|
57
|
+
* Compute a 256-dim L2-normalised embedding from a log-mel spectrogram.
|
|
58
|
+
* Requires loadModel() to have completed first.
|
|
59
|
+
*
|
|
60
|
+
* @param spectrogram Flat Float32Array of shape [149 × 64] from logfbank().
|
|
61
|
+
*/
|
|
62
|
+
export function embed(spectrogram: Float32Array): Promise<Float32Array>
|
|
63
|
+
|
|
64
|
+
// ─── Mel feature extraction ───────────────────────────────────────────────────
|
|
65
|
+
|
|
66
|
+
/**
|
|
67
|
+
* Compute a log-mel spectrogram from a 1.5-second 16 kHz audio buffer.
|
|
68
|
+
*
|
|
69
|
+
* @param signal 24 000 samples at 16 kHz (1.5 seconds).
|
|
70
|
+
* @returns Float32Array of shape [149 × 64] (frames × mel-bins).
|
|
71
|
+
*/
|
|
72
|
+
export function logfbank(signal: Float32Array): Float32Array
|
|
73
|
+
|
|
74
|
+
// ─── Similarity helpers ───────────────────────────────────────────────────────
|
|
75
|
+
|
|
76
|
+
/**
|
|
77
|
+
* Cosine similarity normalised to [0, 1].
|
|
78
|
+
* Assumes both vectors are L2-normalised (as the ArcFace model guarantees).
|
|
79
|
+
*/
|
|
80
|
+
export function cosineSim(a: Float32Array | number[], b: Float32Array | number[]): number
|
|
81
|
+
|
|
82
|
+
/**
|
|
83
|
+
* Maximum cosine similarity between `embedding` and any of `refs`.
|
|
84
|
+
*/
|
|
85
|
+
export function maxSimilarity(
|
|
86
|
+
embedding: Float32Array,
|
|
87
|
+
refs: number[][] | Float32Array[],
|
|
88
|
+
): number
|
|
89
|
+
|
|
90
|
+
// ─── HotwordDetector ─────────────────────────────────────────────────────────
|
|
91
|
+
|
|
92
|
+
export interface DetectorOptions {
|
|
93
|
+
/** Human-readable label for this word. */
|
|
94
|
+
name: string
|
|
95
|
+
/** Reference embeddings (N × 256), e.g. from a RefData.embeddings array. */
|
|
96
|
+
refEmbeddings: number[][] | Float32Array[]
|
|
97
|
+
/** Detection threshold in [0, 1]. Default: 0.65 */
|
|
98
|
+
threshold?: number
|
|
99
|
+
/** Minimum milliseconds between successive 'match' events. Default: 2000 */
|
|
100
|
+
relaxationMs?: number
|
|
101
|
+
/** Minimum milliseconds between consecutive inference runs. Default: 300 */
|
|
102
|
+
inferenceGapMs?: number
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
/**
|
|
106
|
+
* Stateful detector for a single hotword.
|
|
107
|
+
*
|
|
108
|
+
* @example
|
|
109
|
+
* const myRef = await importRefFile(file) // or load from your source
|
|
110
|
+
* const d = new HotwordDetector({ name: myRef.word_name, refEmbeddings: myRef.embeddings })
|
|
111
|
+
* d.addEventListener('match', e => console.log(e.detail.name, e.detail.confidence))
|
|
112
|
+
* // In AudioWorklet onmessage handler:
|
|
113
|
+
* const score = await d.scoreFrame(audioBuffer)
|
|
114
|
+
*/
|
|
115
|
+
export class HotwordDetector extends EventTarget {
|
|
116
|
+
constructor(opts: DetectorOptions)
|
|
117
|
+
|
|
118
|
+
readonly name: string
|
|
119
|
+
/** Most recent similarity score (0–1). */
|
|
120
|
+
readonly lastScore: number
|
|
121
|
+
threshold: number
|
|
122
|
+
relaxationMs: number
|
|
123
|
+
inferenceGapMs: number
|
|
124
|
+
refEmbeddings: number[][] | Float32Array[]
|
|
125
|
+
|
|
126
|
+
/**
|
|
127
|
+
* Score a 1.5-second audio frame. Rate-limited to inferenceGapMs.
|
|
128
|
+
*
|
|
129
|
+
* @param audioBuffer 24 000 samples at 16 kHz.
|
|
130
|
+
* @returns Similarity score, or null when rate-limited.
|
|
131
|
+
*/
|
|
132
|
+
scoreFrame(audioBuffer: Float32Array): Promise<number | null>
|
|
133
|
+
|
|
134
|
+
addEventListener(
|
|
135
|
+
type: 'match',
|
|
136
|
+
listener: (event: CustomEvent<MatchEventDetail>) => void,
|
|
137
|
+
options?: boolean | AddEventListenerOptions,
|
|
138
|
+
): void
|
|
139
|
+
addEventListener(
|
|
140
|
+
type: string,
|
|
141
|
+
listener: EventListenerOrEventListenerObject,
|
|
142
|
+
options?: boolean | AddEventListenerOptions,
|
|
143
|
+
): void
|
|
144
|
+
}
|
|
145
|
+
|
|
146
|
+
// ─── EnrollmentSession ────────────────────────────────────────────────────────
|
|
147
|
+
|
|
148
|
+
/**
|
|
149
|
+
* Manages recording and embedding generation for a custom wake word.
|
|
150
|
+
*
|
|
151
|
+
* @example
|
|
152
|
+
* const session = new EnrollmentSession('hello')
|
|
153
|
+
* await session.recordSample() // record 1.5 s from mic (repeat 3+ times)
|
|
154
|
+
* const ref = await session.generateRef()
|
|
155
|
+
* saveCustomRef(ref) // persist to localStorage
|
|
156
|
+
*/
|
|
157
|
+
export class EnrollmentSession extends EventTarget {
|
|
158
|
+
constructor(wordName: string)
|
|
159
|
+
|
|
160
|
+
readonly wordName: string
|
|
161
|
+
readonly sampleCount: number
|
|
162
|
+
readonly samples: SampleInfo[]
|
|
163
|
+
|
|
164
|
+
/** Record exactly 1.5 seconds from the microphone. Returns 1-based sample index. */
|
|
165
|
+
recordSample(): Promise<number>
|
|
166
|
+
|
|
167
|
+
/** Decode an uploaded audio File and add it as a sample. Returns 1-based index. */
|
|
168
|
+
addAudioFile(file: File): Promise<number>
|
|
169
|
+
|
|
170
|
+
/** Remove a sample by 0-based index. */
|
|
171
|
+
removeSample(idx: number): void
|
|
172
|
+
|
|
173
|
+
/** Remove all samples. */
|
|
174
|
+
clearSamples(): void
|
|
175
|
+
|
|
176
|
+
/**
|
|
177
|
+
* Generate reference embeddings from the accumulated samples (minimum 3 required).
|
|
178
|
+
* Returns a RefData object ready to pass to saveCustomRef() or Mellon.addCustomWord().
|
|
179
|
+
*/
|
|
180
|
+
generateRef(): Promise<RefData>
|
|
181
|
+
|
|
182
|
+
addEventListener(type: 'recording-start', listener: (event: CustomEvent) => void, options?: boolean | AddEventListenerOptions): void
|
|
183
|
+
addEventListener(type: 'sample-added', listener: (event: CustomEvent<{ count: number; name: string }>) => void, options?: boolean | AddEventListenerOptions): void
|
|
184
|
+
addEventListener(type: 'samples-changed', listener: (event: CustomEvent<{ count: number }>) => void, options?: boolean | AddEventListenerOptions): void
|
|
185
|
+
addEventListener(type: 'generating', listener: (event: CustomEvent<{ total: number }>) => void, options?: boolean | AddEventListenerOptions): void
|
|
186
|
+
addEventListener(type: 'progress', listener: (event: CustomEvent<{ done: number; total: number }>) => void, options?: boolean | AddEventListenerOptions): void
|
|
187
|
+
addEventListener(type: string, listener: EventListenerOrEventListenerObject, options?: boolean | AddEventListenerOptions): void
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
// ─── Mellon (high-level API) ─────────────────────────────────────────────────
|
|
191
|
+
|
|
192
|
+
export interface MellonOptions {
|
|
193
|
+
/** Words to detect. Refs must be registered via `addCustomWord()` or `refs` before `start()`. */
|
|
194
|
+
words?: string[]
|
|
195
|
+
/**
|
|
196
|
+
* Reference data to preload during `init()`. Each entry is either:
|
|
197
|
+
* - a URL string pointing to a hosted `_ref.json` file, or
|
|
198
|
+
* - an inline `RefData` object.
|
|
199
|
+
*
|
|
200
|
+
* @example
|
|
201
|
+
* refs: [
|
|
202
|
+
* 'https://example.com/hello_ref.json',
|
|
203
|
+
* 'https://example.com/stop_ref.json',
|
|
204
|
+
* ]
|
|
205
|
+
*/
|
|
206
|
+
refs?: (string | RefData)[]
|
|
207
|
+
/** Detection threshold [0, 1]. Default: 0.65 */
|
|
208
|
+
threshold?: number
|
|
209
|
+
/** Minimum milliseconds between successive match events per word. Default: 2000 */
|
|
210
|
+
relaxationMs?: number
|
|
211
|
+
/** Minimum milliseconds between consecutive inference runs. Default: 300 */
|
|
212
|
+
inferenceGapMs?: number
|
|
213
|
+
/**
|
|
214
|
+
* Override the ORT WASM base URL. Defaults to the jsDelivr CDN.
|
|
215
|
+
* Only needed for offline / intranet deployments (trailing slash required).
|
|
216
|
+
* @example '/mellon-assets/wasm/'
|
|
217
|
+
*/
|
|
218
|
+
wasmBasePath?: string
|
|
219
|
+
/**
|
|
220
|
+
* Override the model.onnx URL. Defaults to the jsDelivr CDN.
|
|
221
|
+
* Only needed for offline / intranet deployments.
|
|
222
|
+
* @example '/mellon-assets/model.onnx'
|
|
223
|
+
*/
|
|
224
|
+
modelUrl?: string
|
|
225
|
+
}
|
|
226
|
+
|
|
227
|
+
/**
|
|
228
|
+
* High-level, all-in-one hotword detector.
|
|
229
|
+
*
|
|
230
|
+
* @example
|
|
231
|
+
* const stt = new Mellon({
|
|
232
|
+
* wasmBasePath: '/assets/wasm/',
|
|
233
|
+
* modelUrl: '/assets/model.onnx',
|
|
234
|
+
* })
|
|
235
|
+
* await stt.init(pct => progressBar.style.width = pct * 100 + '%')
|
|
236
|
+
* await stt.start()
|
|
237
|
+
* stt.addEventListener('match', e => console.log(e.detail.name, e.detail.confidence))
|
|
238
|
+
*/
|
|
239
|
+
export class Mellon extends EventTarget {
|
|
240
|
+
constructor(opts?: MellonOptions)
|
|
241
|
+
|
|
242
|
+
/** True after init() has completed successfully. */
|
|
243
|
+
readonly isInitialized: boolean
|
|
244
|
+
|
|
245
|
+
/** True while start() is active (microphone is open). */
|
|
246
|
+
readonly isRunning: boolean
|
|
247
|
+
|
|
248
|
+
/**
|
|
249
|
+
* Load the ONNX model and cache built-in reference embeddings.
|
|
250
|
+
* Optional — start() auto-calls init() when needed.
|
|
251
|
+
*
|
|
252
|
+
* @param onProgress Progress callback, 0.0 → 1.0.
|
|
253
|
+
*/
|
|
254
|
+
init(onProgress?: (progress: number) => void): Promise<void>
|
|
255
|
+
|
|
256
|
+
/**
|
|
257
|
+
* Request microphone access and start hotword detection.
|
|
258
|
+
* Resolves once audio pipeline is running.
|
|
259
|
+
*
|
|
260
|
+
* @param words Optional subset of words to activate (must have refs loaded).
|
|
261
|
+
*/
|
|
262
|
+
start(words?: string[]): Promise<void>
|
|
263
|
+
|
|
264
|
+
/** Stop detection and release the microphone + AudioContext. */
|
|
265
|
+
stop(): void
|
|
266
|
+
|
|
267
|
+
/**
|
|
268
|
+
* Register reference embeddings for a word.
|
|
269
|
+
* Can be called before or after start().
|
|
270
|
+
*/
|
|
271
|
+
addCustomWord(refData: RefData): void
|
|
272
|
+
|
|
273
|
+
/**
|
|
274
|
+
* Create an EnrollmentSession for recording a new custom word.
|
|
275
|
+
* Call addCustomWord() with the result of session.generateRef().
|
|
276
|
+
*/
|
|
277
|
+
enrollWord(wordName: string): EnrollmentSession
|
|
278
|
+
|
|
279
|
+
/** Return all custom word refs stored in localStorage. */
|
|
280
|
+
static loadWords(): RefData[]
|
|
281
|
+
/** Persist a word ref to localStorage (replaces any existing entry with the same name). */
|
|
282
|
+
static saveWord(refData: RefData): void
|
|
283
|
+
/** Delete a word ref from localStorage by name. */
|
|
284
|
+
static deleteWord(wordName: string): void
|
|
285
|
+
/** Parse an uploaded ref JSON file. */
|
|
286
|
+
static importWordFile(file: File): Promise<RefData>
|
|
287
|
+
/** Trigger a browser download of a ref as a JSON file. */
|
|
288
|
+
static exportWord(refData: RefData): void
|
|
289
|
+
|
|
290
|
+
addEventListener(type: 'match', listener: (event: CustomEvent<MatchEventDetail>) => void, options?: boolean | AddEventListenerOptions): void
|
|
291
|
+
addEventListener(type: 'ready', listener: (event: CustomEvent) => void, options?: boolean | AddEventListenerOptions): void
|
|
292
|
+
addEventListener(type: 'error', listener: (event: CustomEvent<{ error: Error }>) => void, options?: boolean | AddEventListenerOptions): void
|
|
293
|
+
addEventListener(type: string, listener: EventListenerOrEventListenerObject, options?: boolean | AddEventListenerOptions): void
|
|
294
|
+
}
|
|
295
|
+
|
package/dist/mellon.cjs
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});const Jt="0.0.5",Kt=[1,1,149,64],Yt=`https://cdn.jsdelivr.net/npm/mellon@${Jt}/dist/assets`,st={assetsPath:`${Yt}`};let I=null,q=null,tt=null;function Qt({assetsPath:i}={}){i!==void 0&&(st.assetsPath=i),I=null,q=null,tt=null}async function Vt(i){return I?(i==null||i(1),I):q||(q=(async()=>{const n=st.assetsPath.endsWith("/")?st.assetsPath:st.assetsPath+"/",t=n+"ort.all.min.mjs",e=n+"model.onnx";tt=await new Function("url","return import(url)")(t),tt.env.wasm.wasmPaths=n;const s=await fetch(e);if(!s.ok)throw new Error(`Failed to fetch model: ${s.status}`);const a=parseInt(s.headers.get("content-length")||"0",10),r=s.body.getReader(),c=[];let l=0;for(;;){const{done:m,value:_}=await r.read();if(m)break;c.push(_),l+=_.byteLength,a>0&&(i==null||i(l/a))}const h=new Uint8Array(l);let d=0;for(const m of c)h.set(m,d),d+=m.byteLength;return I=await tt.InferenceSession.create(h.buffer,{executionProviders:["wasm"],graphOptimizationLevel:"all"}),i==null||i(1),I})(),q)}async function St(i){if(!I)throw new Error("Model not loaded — call loadModel() first");const n=new tt.Tensor("float32",i,Kt),t=await I.run({input:n}),e=Object.keys(t)[0];return t[e].data}function Xt(i){return i&&i.__esModule&&Object.prototype.hasOwnProperty.call(i,"default")?i.default:i}var mt,Et;function Zt(){if(Et)return mt;Et=1;function i(n){if(this.size=n|0,this.size<=1||(this.size&this.size-1)!==0)throw new Error("FFT size must be a power of two and bigger than 1");this._csize=n<<1;for(var t=new Array(this.size*2),e=0;e<t.length;e+=2){const l=Math.PI*e/this.size;t[e]=Math.cos(l),t[e+1]=-Math.sin(l)}this.table=t;for(var o=0,s=1;this.size>s;s<<=1)o++;this._width=o%2===0?o-1:o,this._bitrev=new Array(1<<this._width);for(var a=0;a<this._bitrev.length;a++){this._bitrev[a]=0;for(var r=0;r<this._width;r+=2){var c=this._width-r-2;this._bitrev[a]|=(a>>>r&3)<<c}}this._out=null,this._data=null,this._inv=0}return 
mt=i,i.prototype.fromComplexArray=function(t,e){for(var o=e||new Array(t.length>>>1),s=0;s<t.length;s+=2)o[s>>>1]=t[s];return o},i.prototype.createComplexArray=function(){const t=new Array(this._csize);for(var e=0;e<t.length;e++)t[e]=0;return t},i.prototype.toComplexArray=function(t,e){for(var o=e||this.createComplexArray(),s=0;s<o.length;s+=2)o[s]=t[s>>>1],o[s+1]=0;return o},i.prototype.completeSpectrum=function(t){for(var e=this._csize,o=e>>>1,s=2;s<o;s+=2)t[e-s]=t[s],t[e-s+1]=-t[s+1]},i.prototype.transform=function(t,e){if(t===e)throw new Error("Input and output buffers must be different");this._out=t,this._data=e,this._inv=0,this._transform4(),this._out=null,this._data=null},i.prototype.realTransform=function(t,e){if(t===e)throw new Error("Input and output buffers must be different");this._out=t,this._data=e,this._inv=0,this._realTransform4(),this._out=null,this._data=null},i.prototype.inverseTransform=function(t,e){if(t===e)throw new Error("Input and output buffers must be different");this._out=t,this._data=e,this._inv=1,this._transform4();for(var o=0;o<t.length;o++)t[o]/=this.size;this._out=null,this._data=null},i.prototype._transform4=function(){var t=this._out,e=this._csize,o=this._width,s=1<<o,a=e/s<<1,r,c,l=this._bitrev;if(a===4)for(r=0,c=0;r<e;r+=a,c++){const u=l[c];this._singleTransform2(r,u,s)}else for(r=0,c=0;r<e;r+=a,c++){const u=l[c];this._singleTransform4(r,u,s)}var h=this._inv?-1:1,d=this.table;for(s>>=2;s>=2;s>>=2){a=e/s<<1;var m=a>>>2;for(r=0;r<e;r+=a)for(var _=r+m,g=r,f=0;g<_;g+=2,f+=s){const u=g,p=u+m,v=p+m,w=v+m,b=t[u],A=t[u+1],E=t[p],y=t[p+1],F=t[v],M=t[v+1],C=t[w],T=t[w+1],x=b,R=A,z=d[f],S=h*d[f+1],N=E*z-y*S,k=E*S+y*z,P=d[2*f],L=h*d[2*f+1],G=F*P-M*L,H=F*L+M*P,J=d[3*f],K=h*d[3*f+1],Y=C*J-T*K,Q=C*K+T*J,V=x+G,W=R+H,B=x-G,X=R-H,Z=N+Y,U=k+Q,$=h*(N-Y),O=h*(k-Q),et=V+Z,ot=W+U,at=V-Z,it=W-U,ct=B+O,lt=X-$,ht=B-O,dt=X+$;t[u]=et,t[u+1]=ot,t[p]=ct,t[p+1]=lt,t[v]=at,t[v+1]=it,t[w]=ht,t[w+1]=dt}}},i.prototype._singleTransform2=function(t,e,o){const 
s=this._out,a=this._data,r=a[e],c=a[e+1],l=a[e+o],h=a[e+o+1],d=r+l,m=c+h,_=r-l,g=c-h;s[t]=d,s[t+1]=m,s[t+2]=_,s[t+3]=g},i.prototype._singleTransform4=function(t,e,o){const s=this._out,a=this._data,r=this._inv?-1:1,c=o*2,l=o*3,h=a[e],d=a[e+1],m=a[e+o],_=a[e+o+1],g=a[e+c],f=a[e+c+1],u=a[e+l],p=a[e+l+1],v=h+g,w=d+f,b=h-g,A=d-f,E=m+u,y=_+p,F=r*(m-u),M=r*(_-p),C=v+E,T=w+y,x=b+M,R=A-F,z=v-E,S=w-y,N=b-M,k=A+F;s[t]=C,s[t+1]=T,s[t+2]=x,s[t+3]=R,s[t+4]=z,s[t+5]=S,s[t+6]=N,s[t+7]=k},i.prototype._realTransform4=function(){var t=this._out,e=this._csize,o=this._width,s=1<<o,a=e/s<<1,r,c,l=this._bitrev;if(a===4)for(r=0,c=0;r<e;r+=a,c++){const ut=l[c];this._singleRealTransform2(r,ut>>>1,s>>>1)}else for(r=0,c=0;r<e;r+=a,c++){const ut=l[c];this._singleRealTransform4(r,ut>>>1,s>>>1)}var h=this._inv?-1:1,d=this.table;for(s>>=2;s>=2;s>>=2){a=e/s<<1;var m=a>>>1,_=m>>>1,g=_>>>1;for(r=0;r<e;r+=a)for(var f=0,u=0;f<=g;f+=2,u+=s){var p=r+f,v=p+_,w=v+_,b=w+_,A=t[p],E=t[p+1],y=t[v],F=t[v+1],M=t[w],C=t[w+1],T=t[b],x=t[b+1],R=A,z=E,S=d[u],N=h*d[u+1],k=y*S-F*N,P=y*N+F*S,L=d[2*u],G=h*d[2*u+1],H=M*L-C*G,J=M*G+C*L,K=d[3*u],Y=h*d[3*u+1],Q=T*K-x*Y,V=T*Y+x*K,W=R+H,B=z+J,X=R-H,Z=z-J,U=k+Q,$=P+V,O=h*(k-Q),et=h*(P-V),ot=W+U,at=B+$,it=X+et,ct=Z-O;if(t[p]=ot,t[p+1]=at,t[v]=it,t[v+1]=ct,f===0){var lt=W-U,ht=B-$;t[w]=lt,t[w+1]=ht;continue}if(f!==g){var dt=X,It=-Z,Dt=W,jt=-B,Wt=-h*et,Bt=-h*O,Ut=-h*$,$t=-h*U,Pt=dt+Wt,Lt=It+Bt,Gt=Dt+$t,Ht=jt-Ut,yt=r+_-f,bt=r+m-f;t[yt]=Pt,t[yt+1]=Lt,t[bt]=Gt,t[bt+1]=Ht}}}},i.prototype._singleRealTransform2=function(t,e,o){const s=this._out,a=this._data,r=a[e],c=a[e+o],l=r+c,h=r-c;s[t]=l,s[t+1]=0,s[t+2]=h,s[t+3]=0},i.prototype._singleRealTransform4=function(t,e,o){const s=this._out,a=this._data,r=this._inv?-1:1,c=o*2,l=o*3,h=a[e],d=a[e+o],m=a[e+c],_=a[e+l],g=h+m,f=h-m,u=d+_,p=r*(d-_),v=g+u,w=f,b=-p,A=g-u,E=f,y=p;s[t]=v,s[t+1]=0,s[t+2]=w,s[t+3]=b,s[t+4]=A,s[t+5]=0,s[t+6]=E,s[t+7]=y},mt}var Ot=Zt();const 
qt=Xt(Ot),nt=16e3,j=512,D=64,At=Math.floor(.025*nt),Ft=Math.floor(.01*nt);function Mt(i){return 2595*Math.log10(1+i/700)}function te(i){return 700*(10**(i/2595)-1)}function ee(){const i=Mt(0),n=Mt(nt/2),t=new Float64Array(D+2);for(let r=0;r<D+2;r++)t[r]=i+r*(n-i)/(D+1);const o=t.map(r=>te(r)).map(r=>Math.floor((j+1)*r/nt)),s=[],a=Math.floor(j/2)+1;for(let r=0;r<D;r++){const c=new Float32Array(a);for(let l=o[r];l<o[r+1];l++)c[l]=(l-o[r])/(o[r+1]-o[r]);for(let l=o[r+1];l<o[r+2];l++)c[l]=(o[r+2]-l)/(o[r+2]-o[r+1]);s.push(c)}return s}const se=ee(),rt=new qt(j),ft=new Float32Array(j),Ct=rt.createComplexArray(),_t=rt.createComplexArray(),Tt=new Float32Array(Math.floor(j/2)+1);function Nt(i){const n=1+Math.ceil((i.length-At)/Ft),t=new Float32Array(n*D),e=Math.floor(j/2)+1;for(let o=0;o<n;o++){const s=o*Ft;ft.fill(0);for(let a=0;a<At&&s+a<i.length;a++)ft[a]=i[s+a];rt.toComplexArray(ft,Ct),rt.transform(_t,Ct);for(let a=0;a<e;a++){const r=_t[2*a],c=_t[2*a+1],l=(r*r+c*c)/j;Tt[a]=l===0?1e-30:l}for(let a=0;a<D;a++){const r=se[a];let c=0;for(let l=0;l<e;l++)c+=Tt[l]*r[l];t[o*D+a]=Math.log(c===0?1e-30:c)}}return t}function ne(i,n){let t=0;for(let e=0;e<i.length;e++)t+=i[e]*n[e];return(t+1)/2}function re(i,n){let t=0;for(const e of n){const o=ne(i,e);o>t&&(t=o)}return t}class xt extends EventTarget{constructor({name:n,refEmbeddings:t,threshold:e=.65,relaxationMs:o=2e3,inferenceGapMs:s=300}){super(),this.name=n,this.refEmbeddings=t,this.threshold=e,this.relaxationMs=o,this.inferenceGapMs=s,this._lastDetectionAt=0,this._lastInferenceAt=0,this._lastScore=0}get lastScore(){return this._lastScore}async scoreFrame(n){const t=Date.now();if(t-this._lastInferenceAt<this.inferenceGapMs)return null;this._lastInferenceAt=t;const e=Nt(n),o=await St(e),s=re(o,this.refEmbeddings);return this._lastScore=s,s>=this.threshold&&t-this._lastDetectionAt>=this.relaxationMs&&(this._lastDetectionAt=t,this.dispatchEvent(new CustomEvent("match",{detail:{name:this.name,confidence:s,timestamp:t}}))),s}}const 
Rt=16e3,oe=1500,pt=24e3;function zt(i){if(i.length===pt)return i;const n=new Float32Array(pt);return n.set(i.subarray(0,pt)),n}class kt extends EventTarget{constructor(n){super(),this.wordName=n.trim().toLowerCase(),this.samples=[]}get sampleCount(){return this.samples.length}async recordSample(){const n=await navigator.mediaDevices.getUserMedia({audio:!0});return new Promise((t,e)=>{const o=new AudioContext({sampleRate:Rt}),s=new MediaRecorder(n),a=[];this.dispatchEvent(new CustomEvent("recording-start")),s.ondataavailable=r=>{r.data.size>0&&a.push(r.data)},s.onstop=async()=>{n.getTracks().forEach(r=>r.stop());try{const c=await new Blob(a,{type:"audio/webm"}).arrayBuffer(),l=await o.decodeAudioData(c);await o.close();const h=l.getChannelData(0),d=zt(new Float32Array(h)),m=this._push(d,`Recorded #${this.samples.length}`);t(m)}catch(r){await o.close().catch(()=>{}),e(r)}},s.start(),setTimeout(()=>s.stop(),oe)})}async addAudioFile(n){const t=await n.arrayBuffer(),e=new AudioContext({sampleRate:Rt}),o=await e.decodeAudioData(t);await e.close();const s=o.getChannelData(0),a=zt(new Float32Array(s));return this._push(a,n.name)}removeSample(n){this.samples.splice(n,1),this.dispatchEvent(new CustomEvent("samples-changed",{detail:{count:this.samples.length}}))}clearSamples(){this.samples=[],this.dispatchEvent(new CustomEvent("samples-changed",{detail:{count:0}}))}async generateRef(){if(this.samples.length<3)throw new Error(`Need at least 3 samples (currently have ${this.samples.length})`);this.dispatchEvent(new CustomEvent("generating",{detail:{total:this.samples.length}}));const n=[];for(let t=0;t<this.samples.length;t++){const e=Nt(this.samples[t].audioBuffer),o=await St(e);n.push(Array.from(o)),this.dispatchEvent(new CustomEvent("progress",{detail:{done:t+1,total:this.samples.length}}))}return{word_name:this.wordName,model_type:"resnet_50_arc",embeddings:n}}_push(n,t){this.samples.push({audioBuffer:n,name:t});const e=this.samples.length;return this.dispatchEvent(new 
CustomEvent("sample-added",{detail:{count:e,name:t}})),e}}const ae=`/**
|
|
2
|
+
* public/audio-processor.js
|
|
3
|
+
* AudioWorklet that runs at 16 kHz and continuously emits the last
|
|
4
|
+
* 1.5-second window (24 000 samples) via a circular buffer.
|
|
5
|
+
*
|
|
6
|
+
* The main thread receives a fresh Float32Array on every
|
|
7
|
+
* AudioWorklet quantum (128 samples ≈ every 8 ms at 16 kHz).
|
|
8
|
+
* The inference loop in engine.js rate-limits to avoid excessive work.
|
|
9
|
+
*/
|
|
10
|
+
class AudioProcessor extends AudioWorkletProcessor {
|
|
11
|
+
constructor() {
|
|
12
|
+
super()
|
|
13
|
+
this._size = 24000 // 1.5 s × 16 000 Hz
|
|
14
|
+
this._buf = new Float32Array(this._size)
|
|
15
|
+
this._ptr = 0
|
|
16
|
+
}
|
|
17
|
+
|
|
18
|
+
process(inputs) {
|
|
19
|
+
const ch = inputs[0]?.[0]
|
|
20
|
+
if (!ch) return true
|
|
21
|
+
|
|
22
|
+
for (let i = 0; i < ch.length; i++) {
|
|
23
|
+
this._buf[this._ptr] = ch[i]
|
|
24
|
+
this._ptr = (this._ptr + 1) % this._size
|
|
25
|
+
}
|
|
26
|
+
|
|
27
|
+
// Send an ordered copy of the ring buffer
|
|
28
|
+
const out = new Float32Array(this._size)
|
|
29
|
+
for (let i = 0; i < this._size; i++) {
|
|
30
|
+
out[i] = this._buf[(this._ptr + i) % this._size]
|
|
31
|
+
}
|
|
32
|
+
this.port.postMessage(out)
|
|
33
|
+
return true
|
|
34
|
+
}
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
registerProcessor('audio-processor', AudioProcessor)
|
|
38
|
+
`;let vt=null;function ie(){if(!vt){const i=new Blob([ae],{type:"application/javascript"});vt=URL.createObjectURL(i)}return vt}const wt="mellon_custom_refs";function gt(){try{const i=localStorage.getItem(wt);return i?JSON.parse(i):[]}catch{return[]}}function ce(i){const n=gt().filter(t=>t.word_name!==i.word_name);n.push(i),localStorage.setItem(wt,JSON.stringify(n))}function le(i){const n=gt().filter(t=>t.word_name!==i);localStorage.setItem(wt,JSON.stringify(n))}function he(i){const n=JSON.stringify(i,null,2),t=new Blob([n],{type:"application/json"}),e=URL.createObjectURL(t),o=Object.assign(document.createElement("a"),{href:e,download:`${i.word_name}_ref.json`});document.body.appendChild(o),o.click(),document.body.removeChild(o),URL.revokeObjectURL(e)}async function de(i){const n=await i.text();let t;try{t=JSON.parse(n)}catch{throw new Error("Invalid JSON")}if(!t.embeddings||!Array.isArray(t.embeddings)||!t.embeddings.length)throw new Error('Missing or empty "embeddings" array');if(!Array.isArray(t.embeddings[0]))throw new Error('"embeddings" must be a 2D array');return t.word_name||(t.word_name=i.name.replace(/_ref\.json$/i,"").replace(/\.json$/i,"")),t}class ue extends EventTarget{constructor(n={}){super(),this._opts={words:n.words??[],refs:n.refs??[],threshold:n.threshold??.65,relaxationMs:n.relaxationMs??2e3,inferenceGapMs:n.inferenceGapMs??300,assetsPath:n.assetsPath},this._refs=new Map,this._detectors=new Map,this._audioCtx=null,this._workletNode=null,this._stream=null,this._initialized=!1,this._running=!1}get isInitialized(){return this._initialized}get isRunning(){return this._running}async init(n){if(this._initialized){n==null||n(1);return}this._opts.assetsPath&&Qt({assetsPath:this._opts.assetsPath});try{await Vt(n)}catch(t){throw this.dispatchEvent(new CustomEvent("error",{detail:{error:t}})),t}for(const t of this._opts.refs)try{let e;if(typeof t=="string"){const o=await fetch(t);if(!o.ok)throw new Error(`HTTP ${o.status}`);e=await o.json()}else 
e=t;this.addCustomWord(e)}catch(e){const o=typeof t=="string"?t:t.word_name;console.warn(`[Mellon] Failed to load ref "${o}": ${e.message}`)}this._initialized=!0,this.dispatchEvent(new CustomEvent("ready"))}async start(n){this._initialized||await this.init();const t=n??this._opts.words;try{this._stream=await navigator.mediaDevices.getUserMedia({audio:!0})}catch(s){const a=new Error(`Microphone access denied: ${s.message}`);throw this.dispatchEvent(new CustomEvent("error",{detail:{error:a}})),a}this._audioCtx=new AudioContext({sampleRate:16e3});const e=ie();await this._audioCtx.audioWorklet.addModule(e);const o=this._audioCtx.createMediaStreamSource(this._stream);this._workletNode=new AudioWorkletNode(this._audioCtx,"audio-processor"),o.connect(this._workletNode),this._workletNode.connect(this._audioCtx.destination);for(const s of t){const a=this._refs.get(s);if(!a){console.warn(`[Mellon] No reference embeddings for "${s}" — skipping. Call addCustomWord() to register custom words before start().`);continue}const r=new xt({name:s,refEmbeddings:a.embeddings,threshold:this._opts.threshold,relaxationMs:this._opts.relaxationMs,inferenceGapMs:this._opts.inferenceGapMs});r.addEventListener("match",c=>{this.dispatchEvent(new CustomEvent("match",{detail:c.detail}))}),this._detectors.set(s,r)}this._workletNode.port.onmessage=async s=>{const a=[];for(const r of this._detectors.values())a.push(r.scoreFrame(s.data));await Promise.allSettled(a)},this._running=!0}stop(){this._workletNode&&(this._workletNode.port.onmessage=null,this._workletNode.disconnect(),this._workletNode=null),this._stream&&(this._stream.getTracks().forEach(n=>n.stop()),this._stream=null),this._audioCtx&&(this._audioCtx.close(),this._audioCtx=null),this._detectors.clear(),this._running=!1}addCustomWord(n){if(this._refs.set(n.word_name,n),this._running&&this._workletNode){const t=new 
xt({name:n.word_name,refEmbeddings:n.embeddings,threshold:this._opts.threshold,relaxationMs:this._opts.relaxationMs,inferenceGapMs:this._opts.inferenceGapMs});t.addEventListener("match",e=>{this.dispatchEvent(new CustomEvent("match",{detail:e.detail}))}),this._detectors.set(n.word_name,t)}}enrollWord(n){return new kt(n)}static loadWords(){return gt()}static saveWord(n){ce(n)}static deleteWord(n){le(n)}static importWordFile(n){return de(n)}static exportWord(n){he(n)}}exports.EnrollmentSession=kt;exports.Mellon=ue;
|
package/dist/mellon.mjs
ADDED
|
@@ -0,0 +1,611 @@
|
|
|
1
|
+
const Ht = "0.0.5", Jt = [1, 1, 149, 64], Kt = `https://cdn.jsdelivr.net/npm/mellon@${Ht}/dist/assets`, st = {
|
|
2
|
+
assetsPath: `${Kt}`
|
|
3
|
+
};
|
|
4
|
+
let I = null, q = null, tt = null;
|
|
5
|
+
function Yt({ assetsPath: i } = {}) {
|
|
6
|
+
i !== void 0 && (st.assetsPath = i), I = null, q = null, tt = null;
|
|
7
|
+
}
|
|
8
|
+
async function Qt(i) {
|
|
9
|
+
return I ? (i == null || i(1), I) : q || (q = (async () => {
|
|
10
|
+
const n = st.assetsPath.endsWith("/") ? st.assetsPath : st.assetsPath + "/", t = n + "ort.all.min.mjs", e = n + "model.onnx";
|
|
11
|
+
tt = await new Function("url", "return import(url)")(t), tt.env.wasm.wasmPaths = n;
|
|
12
|
+
const s = await fetch(e);
|
|
13
|
+
if (!s.ok) throw new Error(`Failed to fetch model: ${s.status}`);
|
|
14
|
+
const a = parseInt(s.headers.get("content-length") || "0", 10), r = s.body.getReader(), c = [];
|
|
15
|
+
let l = 0;
|
|
16
|
+
for (; ; ) {
|
|
17
|
+
const { done: m, value: _ } = await r.read();
|
|
18
|
+
if (m) break;
|
|
19
|
+
c.push(_), l += _.byteLength, a > 0 && (i == null || i(l / a));
|
|
20
|
+
}
|
|
21
|
+
const h = new Uint8Array(l);
|
|
22
|
+
let d = 0;
|
|
23
|
+
for (const m of c)
|
|
24
|
+
h.set(m, d), d += m.byteLength;
|
|
25
|
+
return I = await tt.InferenceSession.create(h.buffer, {
|
|
26
|
+
executionProviders: ["wasm"],
|
|
27
|
+
graphOptimizationLevel: "all"
|
|
28
|
+
}), i == null || i(1), I;
|
|
29
|
+
})(), q);
|
|
30
|
+
}
|
|
31
|
+
async function St(i) {
|
|
32
|
+
if (!I) throw new Error("Model not loaded — call loadModel() first");
|
|
33
|
+
const n = new tt.Tensor("float32", i, Jt), t = await I.run({ input: n }), e = Object.keys(t)[0];
|
|
34
|
+
return t[e].data;
|
|
35
|
+
}
|
|
36
|
+
function Vt(i) {
|
|
37
|
+
return i && i.__esModule && Object.prototype.hasOwnProperty.call(i, "default") ? i.default : i;
|
|
38
|
+
}
|
|
39
|
+
var mt, Et;
|
|
40
|
+
function Xt() {
|
|
41
|
+
if (Et) return mt;
|
|
42
|
+
Et = 1;
|
|
43
|
+
function i(n) {
|
|
44
|
+
if (this.size = n | 0, this.size <= 1 || (this.size & this.size - 1) !== 0)
|
|
45
|
+
throw new Error("FFT size must be a power of two and bigger than 1");
|
|
46
|
+
this._csize = n << 1;
|
|
47
|
+
for (var t = new Array(this.size * 2), e = 0; e < t.length; e += 2) {
|
|
48
|
+
const l = Math.PI * e / this.size;
|
|
49
|
+
t[e] = Math.cos(l), t[e + 1] = -Math.sin(l);
|
|
50
|
+
}
|
|
51
|
+
this.table = t;
|
|
52
|
+
for (var o = 0, s = 1; this.size > s; s <<= 1)
|
|
53
|
+
o++;
|
|
54
|
+
this._width = o % 2 === 0 ? o - 1 : o, this._bitrev = new Array(1 << this._width);
|
|
55
|
+
for (var a = 0; a < this._bitrev.length; a++) {
|
|
56
|
+
this._bitrev[a] = 0;
|
|
57
|
+
for (var r = 0; r < this._width; r += 2) {
|
|
58
|
+
var c = this._width - r - 2;
|
|
59
|
+
this._bitrev[a] |= (a >>> r & 3) << c;
|
|
60
|
+
}
|
|
61
|
+
}
|
|
62
|
+
this._out = null, this._data = null, this._inv = 0;
|
|
63
|
+
}
|
|
64
|
+
return mt = i, i.prototype.fromComplexArray = function(t, e) {
|
|
65
|
+
for (var o = e || new Array(t.length >>> 1), s = 0; s < t.length; s += 2)
|
|
66
|
+
o[s >>> 1] = t[s];
|
|
67
|
+
return o;
|
|
68
|
+
}, i.prototype.createComplexArray = function() {
|
|
69
|
+
const t = new Array(this._csize);
|
|
70
|
+
for (var e = 0; e < t.length; e++)
|
|
71
|
+
t[e] = 0;
|
|
72
|
+
return t;
|
|
73
|
+
}, i.prototype.toComplexArray = function(t, e) {
|
|
74
|
+
for (var o = e || this.createComplexArray(), s = 0; s < o.length; s += 2)
|
|
75
|
+
o[s] = t[s >>> 1], o[s + 1] = 0;
|
|
76
|
+
return o;
|
|
77
|
+
}, i.prototype.completeSpectrum = function(t) {
|
|
78
|
+
for (var e = this._csize, o = e >>> 1, s = 2; s < o; s += 2)
|
|
79
|
+
t[e - s] = t[s], t[e - s + 1] = -t[s + 1];
|
|
80
|
+
}, i.prototype.transform = function(t, e) {
|
|
81
|
+
if (t === e)
|
|
82
|
+
throw new Error("Input and output buffers must be different");
|
|
83
|
+
this._out = t, this._data = e, this._inv = 0, this._transform4(), this._out = null, this._data = null;
|
|
84
|
+
}, i.prototype.realTransform = function(t, e) {
|
|
85
|
+
if (t === e)
|
|
86
|
+
throw new Error("Input and output buffers must be different");
|
|
87
|
+
this._out = t, this._data = e, this._inv = 0, this._realTransform4(), this._out = null, this._data = null;
|
|
88
|
+
}, i.prototype.inverseTransform = function(t, e) {
|
|
89
|
+
if (t === e)
|
|
90
|
+
throw new Error("Input and output buffers must be different");
|
|
91
|
+
this._out = t, this._data = e, this._inv = 1, this._transform4();
|
|
92
|
+
for (var o = 0; o < t.length; o++)
|
|
93
|
+
t[o] /= this.size;
|
|
94
|
+
this._out = null, this._data = null;
|
|
95
|
+
}, i.prototype._transform4 = function() {
|
|
96
|
+
var t = this._out, e = this._csize, o = this._width, s = 1 << o, a = e / s << 1, r, c, l = this._bitrev;
|
|
97
|
+
if (a === 4)
|
|
98
|
+
for (r = 0, c = 0; r < e; r += a, c++) {
|
|
99
|
+
const u = l[c];
|
|
100
|
+
this._singleTransform2(r, u, s);
|
|
101
|
+
}
|
|
102
|
+
else
|
|
103
|
+
for (r = 0, c = 0; r < e; r += a, c++) {
|
|
104
|
+
const u = l[c];
|
|
105
|
+
this._singleTransform4(r, u, s);
|
|
106
|
+
}
|
|
107
|
+
var h = this._inv ? -1 : 1, d = this.table;
|
|
108
|
+
for (s >>= 2; s >= 2; s >>= 2) {
|
|
109
|
+
a = e / s << 1;
|
|
110
|
+
var m = a >>> 2;
|
|
111
|
+
for (r = 0; r < e; r += a)
|
|
112
|
+
for (var _ = r + m, g = r, f = 0; g < _; g += 2, f += s) {
|
|
113
|
+
const u = g, p = u + m, v = p + m, w = v + m, b = t[u], A = t[u + 1], E = t[p], y = t[p + 1], F = t[v], C = t[v + 1], M = t[w], T = t[w + 1], x = b, R = A, z = d[f], S = h * d[f + 1], N = E * z - y * S, k = E * S + y * z, L = d[2 * f], P = h * d[2 * f + 1], G = F * L - C * P, H = F * P + C * L, J = d[3 * f], K = h * d[3 * f + 1], Y = M * J - T * K, Q = M * K + T * J, V = x + G, j = R + H, B = x - G, X = R - H, Z = N + Y, U = k + Q, $ = h * (N - Y), O = h * (k - Q), et = V + Z, ot = j + U, at = V - Z, it = j - U, ct = B + O, lt = X - $, ht = B - O, dt = X + $;
|
|
114
|
+
t[u] = et, t[u + 1] = ot, t[p] = ct, t[p + 1] = lt, t[v] = at, t[v + 1] = it, t[w] = ht, t[w + 1] = dt;
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
}, i.prototype._singleTransform2 = function(t, e, o) {
|
|
118
|
+
const s = this._out, a = this._data, r = a[e], c = a[e + 1], l = a[e + o], h = a[e + o + 1], d = r + l, m = c + h, _ = r - l, g = c - h;
|
|
119
|
+
s[t] = d, s[t + 1] = m, s[t + 2] = _, s[t + 3] = g;
|
|
120
|
+
}, i.prototype._singleTransform4 = function(t, e, o) {
|
|
121
|
+
const s = this._out, a = this._data, r = this._inv ? -1 : 1, c = o * 2, l = o * 3, h = a[e], d = a[e + 1], m = a[e + o], _ = a[e + o + 1], g = a[e + c], f = a[e + c + 1], u = a[e + l], p = a[e + l + 1], v = h + g, w = d + f, b = h - g, A = d - f, E = m + u, y = _ + p, F = r * (m - u), C = r * (_ - p), M = v + E, T = w + y, x = b + C, R = A - F, z = v - E, S = w - y, N = b - C, k = A + F;
|
|
122
|
+
s[t] = M, s[t + 1] = T, s[t + 2] = x, s[t + 3] = R, s[t + 4] = z, s[t + 5] = S, s[t + 6] = N, s[t + 7] = k;
|
|
123
|
+
}, i.prototype._realTransform4 = function() {
|
|
124
|
+
var t = this._out, e = this._csize, o = this._width, s = 1 << o, a = e / s << 1, r, c, l = this._bitrev;
|
|
125
|
+
if (a === 4)
|
|
126
|
+
for (r = 0, c = 0; r < e; r += a, c++) {
|
|
127
|
+
const ut = l[c];
|
|
128
|
+
this._singleRealTransform2(r, ut >>> 1, s >>> 1);
|
|
129
|
+
}
|
|
130
|
+
else
|
|
131
|
+
for (r = 0, c = 0; r < e; r += a, c++) {
|
|
132
|
+
const ut = l[c];
|
|
133
|
+
this._singleRealTransform4(r, ut >>> 1, s >>> 1);
|
|
134
|
+
}
|
|
135
|
+
var h = this._inv ? -1 : 1, d = this.table;
|
|
136
|
+
for (s >>= 2; s >= 2; s >>= 2) {
|
|
137
|
+
a = e / s << 1;
|
|
138
|
+
var m = a >>> 1, _ = m >>> 1, g = _ >>> 1;
|
|
139
|
+
for (r = 0; r < e; r += a)
|
|
140
|
+
for (var f = 0, u = 0; f <= g; f += 2, u += s) {
|
|
141
|
+
var p = r + f, v = p + _, w = v + _, b = w + _, A = t[p], E = t[p + 1], y = t[v], F = t[v + 1], C = t[w], M = t[w + 1], T = t[b], x = t[b + 1], R = A, z = E, S = d[u], N = h * d[u + 1], k = y * S - F * N, L = y * N + F * S, P = d[2 * u], G = h * d[2 * u + 1], H = C * P - M * G, J = C * G + M * P, K = d[3 * u], Y = h * d[3 * u + 1], Q = T * K - x * Y, V = T * Y + x * K, j = R + H, B = z + J, X = R - H, Z = z - J, U = k + Q, $ = L + V, O = h * (k - Q), et = h * (L - V), ot = j + U, at = B + $, it = X + et, ct = Z - O;
|
|
142
|
+
if (t[p] = ot, t[p + 1] = at, t[v] = it, t[v + 1] = ct, f === 0) {
|
|
143
|
+
var lt = j - U, ht = B - $;
|
|
144
|
+
t[w] = lt, t[w + 1] = ht;
|
|
145
|
+
continue;
|
|
146
|
+
}
|
|
147
|
+
if (f !== g) {
|
|
148
|
+
var dt = X, kt = -Z, It = j, Dt = -B, Wt = -h * et, jt = -h * O, Bt = -h * $, Ut = -h * U, $t = dt + Wt, Lt = kt + jt, Pt = It + Ut, Gt = Dt - Bt, yt = r + _ - f, bt = r + m - f;
|
|
149
|
+
t[yt] = $t, t[yt + 1] = Lt, t[bt] = Pt, t[bt + 1] = Gt;
|
|
150
|
+
}
|
|
151
|
+
}
|
|
152
|
+
}
|
|
153
|
+
}, i.prototype._singleRealTransform2 = function(t, e, o) {
|
|
154
|
+
const s = this._out, a = this._data, r = a[e], c = a[e + o], l = r + c, h = r - c;
|
|
155
|
+
s[t] = l, s[t + 1] = 0, s[t + 2] = h, s[t + 3] = 0;
|
|
156
|
+
}, i.prototype._singleRealTransform4 = function(t, e, o) {
|
|
157
|
+
const s = this._out, a = this._data, r = this._inv ? -1 : 1, c = o * 2, l = o * 3, h = a[e], d = a[e + o], m = a[e + c], _ = a[e + l], g = h + m, f = h - m, u = d + _, p = r * (d - _), v = g + u, w = f, b = -p, A = g - u, E = f, y = p;
|
|
158
|
+
s[t] = v, s[t + 1] = 0, s[t + 2] = w, s[t + 3] = b, s[t + 4] = A, s[t + 5] = 0, s[t + 6] = E, s[t + 7] = y;
|
|
159
|
+
}, mt;
|
|
160
|
+
}
|
|
161
|
+
var Zt = Xt();
|
|
162
|
+
const Ot = /* @__PURE__ */ Vt(Zt), nt = 16e3, W = 512, D = 64, At = Math.floor(0.025 * nt), Ft = Math.floor(0.01 * nt);
|
|
163
|
+
function Ct(i) {
|
|
164
|
+
return 2595 * Math.log10(1 + i / 700);
|
|
165
|
+
}
|
|
166
|
+
function qt(i) {
|
|
167
|
+
return 700 * (10 ** (i / 2595) - 1);
|
|
168
|
+
}
|
|
169
|
+
function te() {
|
|
170
|
+
const i = Ct(0), n = Ct(nt / 2), t = new Float64Array(D + 2);
|
|
171
|
+
for (let r = 0; r < D + 2; r++)
|
|
172
|
+
t[r] = i + r * (n - i) / (D + 1);
|
|
173
|
+
const o = t.map((r) => qt(r)).map((r) => Math.floor((W + 1) * r / nt)), s = [], a = Math.floor(W / 2) + 1;
|
|
174
|
+
for (let r = 0; r < D; r++) {
|
|
175
|
+
const c = new Float32Array(a);
|
|
176
|
+
for (let l = o[r]; l < o[r + 1]; l++) c[l] = (l - o[r]) / (o[r + 1] - o[r]);
|
|
177
|
+
for (let l = o[r + 1]; l < o[r + 2]; l++) c[l] = (o[r + 2] - l) / (o[r + 2] - o[r + 1]);
|
|
178
|
+
s.push(c);
|
|
179
|
+
}
|
|
180
|
+
return s;
|
|
181
|
+
}
|
|
182
|
+
const ee = te(), rt = new Ot(W), ft = new Float32Array(W), Mt = rt.createComplexArray(), _t = rt.createComplexArray(), Tt = new Float32Array(Math.floor(W / 2) + 1);
|
|
183
|
+
function Nt(i) {
|
|
184
|
+
const n = 1 + Math.ceil((i.length - At) / Ft), t = new Float32Array(n * D), e = Math.floor(W / 2) + 1;
|
|
185
|
+
for (let o = 0; o < n; o++) {
|
|
186
|
+
const s = o * Ft;
|
|
187
|
+
ft.fill(0);
|
|
188
|
+
for (let a = 0; a < At && s + a < i.length; a++)
|
|
189
|
+
ft[a] = i[s + a];
|
|
190
|
+
rt.toComplexArray(ft, Mt), rt.transform(_t, Mt);
|
|
191
|
+
for (let a = 0; a < e; a++) {
|
|
192
|
+
const r = _t[2 * a], c = _t[2 * a + 1], l = (r * r + c * c) / W;
|
|
193
|
+
Tt[a] = l === 0 ? 1e-30 : l;
|
|
194
|
+
}
|
|
195
|
+
for (let a = 0; a < D; a++) {
|
|
196
|
+
const r = ee[a];
|
|
197
|
+
let c = 0;
|
|
198
|
+
for (let l = 0; l < e; l++) c += Tt[l] * r[l];
|
|
199
|
+
t[o * D + a] = Math.log(c === 0 ? 1e-30 : c);
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
return t;
|
|
203
|
+
}
|
|
204
|
+
function se(i, n) {
|
|
205
|
+
let t = 0;
|
|
206
|
+
for (let e = 0; e < i.length; e++) t += i[e] * n[e];
|
|
207
|
+
return (t + 1) / 2;
|
|
208
|
+
}
|
|
209
|
+
function ne(i, n) {
|
|
210
|
+
let t = 0;
|
|
211
|
+
for (const e of n) {
|
|
212
|
+
const o = se(i, e);
|
|
213
|
+
o > t && (t = o);
|
|
214
|
+
}
|
|
215
|
+
return t;
|
|
216
|
+
}
|
|
217
|
+
class xt extends EventTarget {
|
|
218
|
+
/**
|
|
219
|
+
* @param {object} opts
|
|
220
|
+
* @param {string} opts.name Human-readable label
|
|
221
|
+
* @param {number[][]|Float32Array[]} opts.refEmbeddings Reference embeddings
|
|
222
|
+
* @param {number} [opts.threshold=0.65] Detection threshold
|
|
223
|
+
* @param {number} [opts.relaxationMs=2000] Min ms between events
|
|
224
|
+
* @param {number} [opts.inferenceGapMs=300] Min ms between inferences
|
|
225
|
+
*/
|
|
226
|
+
constructor({ name: n, refEmbeddings: t, threshold: e = 0.65, relaxationMs: o = 2e3, inferenceGapMs: s = 300 }) {
|
|
227
|
+
super(), this.name = n, this.refEmbeddings = t, this.threshold = e, this.relaxationMs = o, this.inferenceGapMs = s, this._lastDetectionAt = 0, this._lastInferenceAt = 0, this._lastScore = 0;
|
|
228
|
+
}
|
|
229
|
+
get lastScore() {
|
|
230
|
+
return this._lastScore;
|
|
231
|
+
}
|
|
232
|
+
/**
|
|
233
|
+
* Score one 1.5-second audio frame. Rate-limited to `inferenceGapMs`.
|
|
234
|
+
*
|
|
235
|
+
* @param {Float32Array} audioBuffer 24 000 samples at 16 kHz
|
|
236
|
+
* @returns {Promise<number|null>} Similarity score, or null if rate-limited
|
|
237
|
+
*/
|
|
238
|
+
async scoreFrame(n) {
|
|
239
|
+
const t = Date.now();
|
|
240
|
+
if (t - this._lastInferenceAt < this.inferenceGapMs) return null;
|
|
241
|
+
this._lastInferenceAt = t;
|
|
242
|
+
const e = Nt(n), o = await St(e), s = ne(o, this.refEmbeddings);
|
|
243
|
+
return this._lastScore = s, s >= this.threshold && t - this._lastDetectionAt >= this.relaxationMs && (this._lastDetectionAt = t, this.dispatchEvent(new CustomEvent("match", {
|
|
244
|
+
detail: { name: this.name, confidence: s, timestamp: t }
|
|
245
|
+
}))), s;
|
|
246
|
+
}
|
|
247
|
+
}
|
|
248
|
+
const Rt = 16e3, re = 1500, pt = 24e3;
|
|
249
|
+
function zt(i) {
|
|
250
|
+
if (i.length === pt) return i;
|
|
251
|
+
const n = new Float32Array(pt);
|
|
252
|
+
return n.set(i.subarray(0, pt)), n;
|
|
253
|
+
}
|
|
254
|
+
class oe extends EventTarget {
|
|
255
|
+
/** @param {string} wordName — the wake word label */
|
|
256
|
+
constructor(n) {
|
|
257
|
+
super(), this.wordName = n.trim().toLowerCase(), this.samples = [];
|
|
258
|
+
}
|
|
259
|
+
get sampleCount() {
|
|
260
|
+
return this.samples.length;
|
|
261
|
+
}
|
|
262
|
+
// ─── Recording ─────────────────────────────────────────────────────────────
|
|
263
|
+
/**
|
|
264
|
+
* Record exactly `RECORD_MS` milliseconds from the microphone.
|
|
265
|
+
* Dispatches 'recording-start' and 'sample-added' events.
|
|
266
|
+
*
|
|
267
|
+
* @returns {Promise<number>} Index (1-based) of the new sample
|
|
268
|
+
*/
|
|
269
|
+
async recordSample() {
|
|
270
|
+
const n = await navigator.mediaDevices.getUserMedia({ audio: !0 });
|
|
271
|
+
return new Promise((t, e) => {
|
|
272
|
+
const o = new AudioContext({ sampleRate: Rt }), s = new MediaRecorder(n), a = [];
|
|
273
|
+
this.dispatchEvent(new CustomEvent("recording-start")), s.ondataavailable = (r) => {
|
|
274
|
+
r.data.size > 0 && a.push(r.data);
|
|
275
|
+
}, s.onstop = async () => {
|
|
276
|
+
n.getTracks().forEach((r) => r.stop());
|
|
277
|
+
try {
|
|
278
|
+
const c = await new Blob(a, { type: "audio/webm" }).arrayBuffer(), l = await o.decodeAudioData(c);
|
|
279
|
+
await o.close();
|
|
280
|
+
const h = l.getChannelData(0), d = zt(new Float32Array(h)), m = this._push(d, `Recorded #${this.samples.length}`);
|
|
281
|
+
t(m);
|
|
282
|
+
} catch (r) {
|
|
283
|
+
await o.close().catch(() => {
|
|
284
|
+
}), e(r);
|
|
285
|
+
}
|
|
286
|
+
}, s.start(), setTimeout(() => s.stop(), re);
|
|
287
|
+
});
|
|
288
|
+
}
|
|
289
|
+
// ─── Upload ────────────────────────────────────────────────────────────────
|
|
290
|
+
/**
|
|
291
|
+
* Decode an uploaded audio File and add it as a sample.
|
|
292
|
+
*
|
|
293
|
+
* @param {File} file
|
|
294
|
+
* @returns {Promise<number>} Index (1-based) of the new sample
|
|
295
|
+
*/
|
|
296
|
+
async addAudioFile(n) {
|
|
297
|
+
const t = await n.arrayBuffer(), e = new AudioContext({ sampleRate: Rt }), o = await e.decodeAudioData(t);
|
|
298
|
+
await e.close();
|
|
299
|
+
const s = o.getChannelData(0), a = zt(new Float32Array(s));
|
|
300
|
+
return this._push(a, n.name);
|
|
301
|
+
}
|
|
302
|
+
// ─── Manage ────────────────────────────────────────────────────────────────
|
|
303
|
+
/**
|
|
304
|
+
* Remove a sample by 0-based index.
|
|
305
|
+
* @param {number} idx
|
|
306
|
+
*/
|
|
307
|
+
removeSample(n) {
|
|
308
|
+
this.samples.splice(n, 1), this.dispatchEvent(new CustomEvent("samples-changed", { detail: { count: this.samples.length } }));
|
|
309
|
+
}
|
|
310
|
+
clearSamples() {
|
|
311
|
+
this.samples = [], this.dispatchEvent(new CustomEvent("samples-changed", { detail: { count: 0 } }));
|
|
312
|
+
}
|
|
313
|
+
// ─── Generate ──────────────────────────────────────────────────────────────
|
|
314
|
+
/**
|
|
315
|
+
* Compute embeddings for all collected samples and return a ref object.
|
|
316
|
+
* Dispatches 'progress' events during embedding.
|
|
317
|
+
*
|
|
318
|
+
* @returns {Promise<{ word_name:string, model_type:string, embeddings:number[][] }>}
|
|
319
|
+
*/
|
|
320
|
+
async generateRef() {
|
|
321
|
+
if (this.samples.length < 3)
|
|
322
|
+
throw new Error(`Need at least 3 samples (currently have ${this.samples.length})`);
|
|
323
|
+
this.dispatchEvent(new CustomEvent("generating", { detail: { total: this.samples.length } }));
|
|
324
|
+
const n = [];
|
|
325
|
+
for (let t = 0; t < this.samples.length; t++) {
|
|
326
|
+
const e = Nt(this.samples[t].audioBuffer), o = await St(e);
|
|
327
|
+
n.push(Array.from(o)), this.dispatchEvent(new CustomEvent("progress", {
|
|
328
|
+
detail: { done: t + 1, total: this.samples.length }
|
|
329
|
+
}));
|
|
330
|
+
}
|
|
331
|
+
return {
|
|
332
|
+
word_name: this.wordName,
|
|
333
|
+
model_type: "resnet_50_arc",
|
|
334
|
+
embeddings: n
|
|
335
|
+
};
|
|
336
|
+
}
|
|
337
|
+
// ─── Private ───────────────────────────────────────────────────────────────
|
|
338
|
+
_push(n, t) {
|
|
339
|
+
this.samples.push({ audioBuffer: n, name: t });
|
|
340
|
+
const e = this.samples.length;
|
|
341
|
+
return this.dispatchEvent(new CustomEvent("sample-added", { detail: { count: e, name: t } })), e;
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
const ae = `/**
|
|
345
|
+
* public/audio-processor.js
|
|
346
|
+
* AudioWorklet that runs at 16 kHz and continuously emits the last
|
|
347
|
+
* 1.5-second window (24 000 samples) via a circular buffer.
|
|
348
|
+
*
|
|
349
|
+
* The main thread receives a fresh Float32Array on every
|
|
350
|
+
* AudioWorklet quantum (128 samples ≈ every 8 ms at 16 kHz).
|
|
351
|
+
* The inference loop in engine.js rate-limits to avoid excessive work.
|
|
352
|
+
*/
|
|
353
|
+
class AudioProcessor extends AudioWorkletProcessor {
|
|
354
|
+
constructor() {
|
|
355
|
+
super()
|
|
356
|
+
this._size = 24000 // 1.5 s × 16 000 Hz
|
|
357
|
+
this._buf = new Float32Array(this._size)
|
|
358
|
+
this._ptr = 0
|
|
359
|
+
}
|
|
360
|
+
|
|
361
|
+
process(inputs) {
|
|
362
|
+
const ch = inputs[0]?.[0]
|
|
363
|
+
if (!ch) return true
|
|
364
|
+
|
|
365
|
+
for (let i = 0; i < ch.length; i++) {
|
|
366
|
+
this._buf[this._ptr] = ch[i]
|
|
367
|
+
this._ptr = (this._ptr + 1) % this._size
|
|
368
|
+
}
|
|
369
|
+
|
|
370
|
+
// Send an ordered copy of the ring buffer
|
|
371
|
+
const out = new Float32Array(this._size)
|
|
372
|
+
for (let i = 0; i < this._size; i++) {
|
|
373
|
+
out[i] = this._buf[(this._ptr + i) % this._size]
|
|
374
|
+
}
|
|
375
|
+
this.port.postMessage(out)
|
|
376
|
+
return true
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
|
|
380
|
+
registerProcessor('audio-processor', AudioProcessor)
|
|
381
|
+
`;
|
|
382
|
+
// Lazily-created Blob URL for the inlined AudioWorklet module source (ae).
let vt = null;

/**
 * Return a cached object URL serving the AudioWorklet processor source,
 * creating it on first use.
 * @returns {string} Blob URL suitable for audioWorklet.addModule()
 */
function ie() {
  if (vt) return vt;
  const moduleBlob = new Blob([ae], { type: "application/javascript" });
  vt = URL.createObjectURL(moduleBlob);
  return vt;
}
|
|
390
|
+
// localStorage key under which custom word refs are persisted.
const wt = "mellon_custom_refs";

/**
 * Read all persisted custom word refs from localStorage.
 * @returns {Array<object>} Stored refs, or [] when absent or unreadable
 */
function gt() {
  try {
    const raw = localStorage.getItem(wt);
    if (!raw) return [];
    return JSON.parse(raw);
  } catch {
    // Corrupt JSON or storage access denied — treat as "nothing stored".
    return [];
  }
}
|
|
399
|
+
/**
 * Persist a ref to localStorage, replacing any entry with the same word_name.
 * @param {{word_name:string}} ref
 */
function ce(i) {
  const kept = gt().filter((t) => t.word_name !== i.word_name);
  kept.push(i);
  localStorage.setItem(wt, JSON.stringify(kept));
}
|
|
403
|
+
/**
 * Delete a persisted ref from localStorage by word name.
 * @param {string} wordName
 */
function le(i) {
  const remaining = gt().filter((t) => t.word_name !== i);
  localStorage.setItem(wt, JSON.stringify(remaining));
}
|
|
407
|
+
/**
 * Trigger a browser download of a ref object as pretty-printed JSON.
 * @param {{word_name:string}} ref
 */
function he(i) {
  const json = JSON.stringify(i, null, 2);
  const blob = new Blob([json], { type: "application/json" });
  const url = URL.createObjectURL(blob);
  const anchor = document.createElement("a");
  anchor.href = url;
  anchor.download = `${i.word_name}_ref.json`;
  // The anchor must be in the DOM for click() to start a download in all browsers.
  document.body.appendChild(anchor);
  anchor.click();
  document.body.removeChild(anchor);
  URL.revokeObjectURL(url);
}
|
|
414
|
+
/**
 * Parse an uploaded ref JSON file into a RefData object.
 *
 * Validates that `embeddings` is a non-empty 2D array; when `word_name` is
 * missing, derives it from the filename (stripping `_ref.json` / `.json`).
 *
 * @param {File} file
 * @returns {Promise<object>} Parsed ref with a guaranteed word_name
 * @throws {Error} On invalid JSON or a missing/malformed embeddings array
 */
async function de(i) {
  const raw = await i.text();
  let parsed;
  try {
    parsed = JSON.parse(raw);
  } catch {
    throw new Error("Invalid JSON");
  }
  if (!parsed.embeddings || !Array.isArray(parsed.embeddings) || !parsed.embeddings.length) {
    throw new Error('Missing or empty "embeddings" array');
  }
  if (!Array.isArray(parsed.embeddings[0])) {
    throw new Error('"embeddings" must be a 2D array');
  }
  if (!parsed.word_name) {
    parsed.word_name = i.name.replace(/_ref\.json$/i, "").replace(/\.json$/i, "");
  }
  return parsed;
}
|
|
428
|
+
// ue = Mellon: the public hotword-detection facade. Wires the mic through an
// AudioWorklet into per-word detectors (xt) and re-emits their 'match' events.
class ue extends EventTarget {
  /**
   * @param {object} [opts]
   * @param {string[]} [opts.words] Words to detect (must have refs loaded via addCustomWord())
   * @param {Array<string|{word_name:string,embeddings:number[][]}>} [opts.refs]
   *   Reference data to preload. Each entry is either a URL string pointing to a
   *   hosted `_ref.json` file, or an inline RefData object.
   *   Refs are fetched/loaded during init() before detection starts.
   * @param {number} [opts.threshold=0.65] Detection threshold [0, 1]
   * @param {number} [opts.relaxationMs=2000] Min ms between successive match events
   * @param {number} [opts.inferenceGapMs=300] Min ms between inference runs
   * @param {string} [opts.assetsPath]
   */
  constructor(n = {}) {
    super(), this._opts = {
      words: n.words ?? [],
      refs: n.refs ?? [],
      threshold: n.threshold ?? 0.65,
      relaxationMs: n.relaxationMs ?? 2e3,
      inferenceGapMs: n.inferenceGapMs ?? 300,
      assetsPath: n.assetsPath
    }, this._refs = /* @__PURE__ */ new Map(), this._detectors = /* @__PURE__ */ new Map(), this._audioCtx = null, this._workletNode = null, this._stream = null, this._initialized = !1, this._running = !1;
  }
  /** Whether init() has completed successfully. */
  get isInitialized() {
    return this._initialized;
  }
  /** Whether start() is active (mic is open). */
  get isRunning() {
    return this._running;
  }
  // ─── Lifecycle ───────────────────────────────────────────────────────────
  /**
   * Load the ONNX model and pre-cache built-in reference embeddings.
   * Calling this explicitly is optional — start() will auto-init if needed.
   * Call it early to show a loading progress bar.
   *
   * @param {(progress: number) => void} [onProgress] 0.0 → 1.0
   */
  async init(n) {
    if (this._initialized) {
      // Already done: report completion to the progress callback and bail.
      n == null || n(1);
      return;
    }
    // Yt presumably configures asset base paths; Qt loads the model and drives
    // the progress callback — confirm in their definitions.
    this._opts.assetsPath && Yt({ assetsPath: this._opts.assetsPath });
    try {
      await Qt(n);
    } catch (t) {
      throw this.dispatchEvent(new CustomEvent("error", { detail: { error: t } })), t;
    }
    // Preload refs sequentially; a single bad ref only warns, it does not abort init.
    for (const t of this._opts.refs)
      try {
        let e;
        if (typeof t == "string") {
          const o = await fetch(t);
          if (!o.ok) throw new Error(`HTTP ${o.status}`);
          e = await o.json();
        } else
          e = t;
        this.addCustomWord(e);
      } catch (e) {
        const o = typeof t == "string" ? t : t.word_name;
        console.warn(`[Mellon] Failed to load ref "${o}": ${e.message}`);
      }
    this._initialized = !0, this.dispatchEvent(new CustomEvent("ready"));
  }
  /**
   * Request microphone access and start hotword detection.
   * Emits 'match' CustomEvents when a word is detected.
   *
   * @param {string[]} [words] Subset of words to listen for; defaults to opts.words
   */
  async start(n) {
    this._initialized || await this.init();
    const t = n ?? this._opts.words;
    try {
      this._stream = await navigator.mediaDevices.getUserMedia({ audio: !0 });
    } catch (s) {
      const a = new Error(`Microphone access denied: ${s.message}`);
      throw this.dispatchEvent(new CustomEvent("error", { detail: { error: a } })), a;
    }
    // 16 kHz matches the worklet's assumed rate (see the inlined processor source).
    this._audioCtx = new AudioContext({ sampleRate: 16e3 });
    const e = ie();
    await this._audioCtx.audioWorklet.addModule(e);
    const o = this._audioCtx.createMediaStreamSource(this._stream);
    // NOTE(review): the worklet is connected to destination — this may route mic
    // audio to the speakers (the processor emits no output samples, so it is
    // probably silent, but confirm this connection is required to keep the
    // graph running rather than an accidental passthrough).
    this._workletNode = new AudioWorkletNode(this._audioCtx, "audio-processor"), o.connect(this._workletNode), this._workletNode.connect(this._audioCtx.destination);
    // Build one detector (xt) per requested word that has registered embeddings.
    for (const s of t) {
      const a = this._refs.get(s);
      if (!a) {
        console.warn(`[Mellon] No reference embeddings for "${s}" — skipping. Call addCustomWord() to register custom words before start().`);
        continue;
      }
      const r = new xt({
        name: s,
        refEmbeddings: a.embeddings,
        threshold: this._opts.threshold,
        relaxationMs: this._opts.relaxationMs,
        inferenceGapMs: this._opts.inferenceGapMs
      });
      r.addEventListener("match", (c) => {
        this.dispatchEvent(new CustomEvent("match", { detail: c.detail }));
      }), this._detectors.set(s, r);
    }
    // Every audio frame is scored by all detectors; allSettled keeps one failing
    // detector from breaking the others.
    this._workletNode.port.onmessage = async (s) => {
      const a = [];
      for (const r of this._detectors.values())
        a.push(r.scoreFrame(s.data));
      await Promise.allSettled(a);
    }, this._running = !0;
  }
  /**
   * Stop detection and release the microphone and AudioContext.
   */
  stop() {
    // NOTE(review): _audioCtx.close() returns a promise that is intentionally
    // not awaited — stop() is synchronous and close errors are unobservable.
    this._workletNode && (this._workletNode.port.onmessage = null, this._workletNode.disconnect(), this._workletNode = null), this._stream && (this._stream.getTracks().forEach((n) => n.stop()), this._stream = null), this._audioCtx && (this._audioCtx.close(), this._audioCtx = null), this._detectors.clear(), this._running = !1;
  }
  // ─── Custom words ────────────────────────────────────────────────────────
  /**
   * Register reference embeddings for a custom (or overridden) word.
   * Can be called before or after start(). If called while running, the new
   * detector is added to the active pipeline without restarting.
   *
   * @param {{ word_name: string, model_type: string, embeddings: number[][] }} refData
   */
  addCustomWord(n) {
    if (this._refs.set(n.word_name, n), this._running && this._workletNode) {
      // Live pipeline: spin up a detector immediately, mirroring start()'s wiring.
      const t = new xt({
        name: n.word_name,
        refEmbeddings: n.embeddings,
        threshold: this._opts.threshold,
        relaxationMs: this._opts.relaxationMs,
        inferenceGapMs: this._opts.inferenceGapMs
      });
      t.addEventListener("match", (e) => {
        this.dispatchEvent(new CustomEvent("match", { detail: e.detail }));
      }), this._detectors.set(n.word_name, t);
    }
  }
  /**
   * Create an EnrollmentSession for recording a new custom word.
   * After generating the ref via session.generateRef(), pass the result
   * to addCustomWord().
   *
   * @param {string} wordName
   * @returns {EnrollmentSession}
   *
   * @example
   * const session = stt.enrollWord('hello')
   * await session.recordSample() // record 1.5 s × 3+ times
   * const ref = await session.generateRef()
   * stt.addCustomWord(ref)
   */
  enrollWord(n) {
    return new oe(n);
  }
  // ─── Persistence (static) ────────────────────────────────────────────────
  /** Return all custom word refs stored in localStorage. */
  static loadWords() {
    return gt();
  }
  /** Persist a word ref to localStorage (replaces any existing entry with the same name). */
  static saveWord(n) {
    ce(n);
  }
  /** Delete a word ref from localStorage by name. */
  static deleteWord(n) {
    le(n);
  }
  /**
   * Parse an uploaded ref JSON file and return a RefData object.
   * @param {File} file
   */
  static importWordFile(n) {
    return de(n);
  }
  /** Trigger a browser download of a ref as a JSON file. */
  static exportWord(n) {
    he(n);
  }
}
|
|
608
|
+
// Public API surface: the detector facade and the custom-word recorder.
export {
  oe as EnrollmentSession,
  ue as Mellon
};
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "mellon",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.5",
|
|
4
4
|
"description": "Offline, in-browser hotword detection powered by EfficientWord-Net (ResNet-50 ArcFace). Works as a standalone app or npm library.",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./dist/mellon.cjs",
|
|
@@ -17,8 +17,7 @@
|
|
|
17
17
|
"dist/mellon.mjs",
|
|
18
18
|
"dist/mellon.cjs",
|
|
19
19
|
"dist/index.d.ts",
|
|
20
|
-
"dist/
|
|
21
|
-
"dist/models",
|
|
20
|
+
"dist/assets",
|
|
22
21
|
"README.md"
|
|
23
22
|
],
|
|
24
23
|
"keywords": [
|
|
@@ -33,13 +32,7 @@
|
|
|
33
32
|
"voice"
|
|
34
33
|
],
|
|
35
34
|
"scripts": {
|
|
36
|
-
"
|
|
37
|
-
"build": "npm run build:app",
|
|
38
|
-
"build:app": "vite build && npm run copy-wasm",
|
|
39
|
-
"build:lib": "vite build --config vite.lib.config.js",
|
|
40
|
-
"copy-wasm": "node scripts/copy-wasm.js",
|
|
41
|
-
"preview": "vite preview",
|
|
42
|
-
"prepare": "node scripts/prepare_check.js"
|
|
35
|
+
"build": "vite build"
|
|
43
36
|
},
|
|
44
37
|
"dependencies": {
|
|
45
38
|
"fft.js": "^4.0.4",
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|