mellon 0.0.18 → 0.0.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -9
- package/dist/index.d.ts +6 -2
- package/dist/mellon.cjs +1 -1
- package/dist/mellon.mjs +130 -127
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -17,7 +17,7 @@ Offline, fully in-browser **hotword / wake-word detection** powered by [Efficien
|
|
|
17
17
|
2. [Quick start](#quick-start)
|
|
18
18
|
3. [Enrolling words](#enrolling-custom-words)
|
|
19
19
|
4. [API reference](#api-reference)
|
|
20
|
-
- [Mellon](#mellon)
|
|
20
|
+
- [Detector](#detector)
|
|
21
21
|
- [EnrollmentSession](#enrollmentsession)
|
|
22
22
|
5. [Science behind the lib](#science-behind-the-lib)
|
|
23
23
|
---
|
|
@@ -31,9 +31,9 @@ npm install mellon
|
|
|
31
31
|
## Quick start
|
|
32
32
|
|
|
33
33
|
```js
|
|
34
|
-
import { Mellon } from 'mellon'
|
|
34
|
+
import { Detector } from 'mellon'
|
|
35
35
|
|
|
36
|
-
const hotWordDetection = new Mellon([
|
|
36
|
+
const hotWordDetection = new Detector([
|
|
37
37
|
{
|
|
38
38
|
name: 'openDoors',
|
|
39
39
|
triggers: [{ name: 'mellon', defaultRefPath: '/mellon-assets/mellon_ref.json' }],
|
|
@@ -71,9 +71,9 @@ await hotWordDetection.start() // opens the mic and listens for all registered t
|
|
|
71
71
|
## Enrolling custom words
|
|
72
72
|
|
|
73
73
|
```js
|
|
74
|
-
import { Mellon, EnrollmentSession } from 'mellon'
|
|
74
|
+
import { Detector, EnrollmentSession, Storage } from 'mellon'
|
|
75
75
|
|
|
76
|
-
const hotwordDetection = new Mellon([{
|
|
76
|
+
const hotwordDetection = new Detector([{
|
|
77
77
|
name: 'startEngine',
|
|
78
78
|
triggers: [{ name: 'start' }],
|
|
79
79
|
onMatch: (triggerNameMatched, confidence) => { console.log('starting engine...') }
|
|
@@ -99,19 +99,19 @@ hotwordDetection.addCustomWord(ref)
|
|
|
99
99
|
await hotwordDetection.start()
|
|
100
100
|
|
|
101
101
|
// 4b. Persist for future sessions
|
|
102
|
-
|
|
102
|
+
Storage.saveWord(ref)
|
|
103
103
|
```
|
|
104
104
|
|
|
105
105
|
---
|
|
106
106
|
|
|
107
107
|
## API reference
|
|
108
108
|
|
|
109
|
-
### `Mellon`
|
|
109
|
+
### `Detector`
|
|
110
110
|
|
|
111
111
|
The easiest way to use the library. Wraps mic access, AudioWorklet wiring, and detector management into a single class.
|
|
112
112
|
|
|
113
113
|
```ts
|
|
114
|
-
class Mellon {
|
|
114
|
+
class Detector {
|
|
115
115
|
constructor(commands: Command[], config?: MellonConfig)
|
|
116
116
|
readonly threshold: number // read/write; persisted in localStorage
|
|
117
117
|
readonly listening: boolean
|
|
@@ -121,7 +121,19 @@ class Mellon {
|
|
|
121
121
|
stop(): Promise<void>
|
|
122
122
|
addCustomWord(ref: WordRef): void
|
|
123
123
|
|
|
124
|
-
// Storage helpers — static, work without a Mellon instance
|
|
124
|
+
// Storage helpers — static, work without a Detector instance
|
|
125
|
+
static loadWords(storageKey?: string): WordRef[]
|
|
126
|
+
static saveWord(ref: WordRef, storageKey?: string): void
|
|
127
|
+
static deleteWord(wordName: string, storageKey?: string): void
|
|
128
|
+
}
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
### `Storage`
|
|
132
|
+
|
|
133
|
+
Static helpers for persisting enrolled word references in `localStorage`.
|
|
134
|
+
|
|
135
|
+
```ts
|
|
136
|
+
class Storage {
|
|
125
137
|
static loadWords(storageKey?: string): WordRef[]
|
|
126
138
|
static saveWord(ref: WordRef, storageKey?: string): void
|
|
127
139
|
static deleteWord(wordName: string, storageKey?: string): void
|
package/dist/index.d.ts
CHANGED
|
@@ -1,5 +1,7 @@
|
|
|
1
|
-
import { Mellon } from './Mellon';
|
|
1
|
+
import { Detector } from './Mellon';
|
|
2
2
|
import { EnrollmentSession } from './EnrollmentSession';
|
|
3
|
+
import { Storage } from './Storage';
|
|
4
|
+
import { AudioUtils } from './AudioUtils';
|
|
3
5
|
export type TriggerName = string;
|
|
4
6
|
export interface Trigger {
|
|
5
7
|
name: TriggerName;
|
|
@@ -17,11 +19,13 @@ export interface MellonConfig {
|
|
|
17
19
|
modelPath?: string;
|
|
18
20
|
audioProcessorPath?: string;
|
|
19
21
|
ortCdnUrl?: string;
|
|
22
|
+
audioUtils?: AudioUtils;
|
|
20
23
|
}
|
|
21
24
|
export interface EnrollmentSessionConfig {
|
|
22
25
|
wasmPaths?: string;
|
|
23
26
|
modelPath?: string;
|
|
24
27
|
ortCdnUrl?: string;
|
|
28
|
+
audioUtils?: AudioUtils;
|
|
25
29
|
}
|
|
26
30
|
export interface WordRef {
|
|
27
31
|
word_name: TriggerName;
|
|
@@ -34,4 +38,4 @@ export declare const DEFAULT_MODEL_PATH = "https://huggingface.co/ComicScrip/mel
|
|
|
34
38
|
export declare const DEFAULT_AUDIO_PROCESSOR_PATH = "https://cdn.jsdelivr.net/npm/mellon@0.0.14/dist/assets/audio-processor.js";
|
|
35
39
|
export declare const DEFAULT_REFS_STORAGE_KEY = "mellon-refs";
|
|
36
40
|
export declare const DEFAULT_THRESHOLD_STORAGE_KEY = "mellon-threshold";
|
|
37
|
-
export { Mellon, EnrollmentSession };
|
|
41
|
+
export { Detector, EnrollmentSession, Storage, AudioUtils };
|
package/dist/mellon.cjs
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});function Dt(m){return m&&m.__esModule&&Object.prototype.hasOwnProperty.call(m,"default")?m.default:m}var lt,ut;function Ut(){if(ut)return lt;ut=1;function m(o){if(this.size=o|0,this.size<=1||(this.size&this.size-1)!==0)throw new Error("FFT size must be a power of two and bigger than 1");this._csize=o<<1;for(var t=new Array(this.size*2),s=0;s<t.length;s+=2){const i=Math.PI*s/this.size;t[s]=Math.cos(i),t[s+1]=-Math.sin(i)}this.table=t;for(var n=0,r=1;this.size>r;r<<=1)n++;this._width=n%2===0?n-1:n,this._bitrev=new Array(1<<this._width);for(var a=0;a<this._bitrev.length;a++){this._bitrev[a]=0;for(var e=0;e<this._width;e+=2){var h=this._width-e-2;this._bitrev[a]|=(a>>>e&3)<<h}}this._out=null,this._data=null,this._inv=0}return lt=m,m.prototype.fromComplexArray=function(t,s){for(var n=s||new Array(t.length>>>1),r=0;r<t.length;r+=2)n[r>>>1]=t[r];return n},m.prototype.createComplexArray=function(){const t=new Array(this._csize);for(var s=0;s<t.length;s++)t[s]=0;return t},m.prototype.toComplexArray=function(t,s){for(var n=s||this.createComplexArray(),r=0;r<n.length;r+=2)n[r]=t[r>>>1],n[r+1]=0;return n},m.prototype.completeSpectrum=function(t){for(var s=this._csize,n=s>>>1,r=2;r<n;r+=2)t[s-r]=t[r],t[s-r+1]=-t[r+1]},m.prototype.transform=function(t,s){if(t===s)throw new Error("Input and output buffers must be different");this._out=t,this._data=s,this._inv=0,this._transform4(),this._out=null,this._data=null},m.prototype.realTransform=function(t,s){if(t===s)throw new Error("Input and output buffers must be different");this._out=t,this._data=s,this._inv=0,this._realTransform4(),this._out=null,this._data=null},m.prototype.inverseTransform=function(t,s){if(t===s)throw new Error("Input and output buffers must be different");this._out=t,this._data=s,this._inv=1,this._transform4();for(var n=0;n<t.length;n++)t[n]/=this.size;this._out=null,this._data=null},m.prototype._transform4=function(){var 
t=this._out,s=this._csize,n=this._width,r=1<<n,a=s/r<<1,e,h,i=this._bitrev;if(a===4)for(e=0,h=0;e<s;e+=a,h++){const v=i[h];this._singleTransform2(e,v,r)}else for(e=0,h=0;e<s;e+=a,h++){const v=i[h];this._singleTransform4(e,v,r)}var l=this._inv?-1:1,c=this.table;for(r>>=2;r>=2;r>>=2){a=s/r<<1;var d=a>>>2;for(e=0;e<s;e+=a)for(var _=e+d,u=e,f=0;u<_;u+=2,f+=r){const v=u,p=v+d,g=p+d,w=g+d,T=t[v],F=t[v+1],A=t[p],y=t[p+1],b=t[g],S=t[g+1],P=t[w],C=t[w+1],E=T,R=F,M=c[f],D=l*c[f+1],U=A*M-y*D,z=A*D+y*M,j=c[2*f],L=l*c[2*f+1],B=b*j-S*L,K=b*L+S*j,N=c[3*f],O=l*c[3*f+1],k=P*N-C*O,G=P*O+C*N,$=E+B,x=R+K,I=E-B,Y=R-K,J=U+k,H=z+G,W=l*(U-k),Q=l*(z-G),tt=$+J,rt=x+H,et=$-J,ot=x-H,nt=I+Q,at=Y-W,it=I-Q,ct=Y+W;t[v]=tt,t[v+1]=rt,t[p]=nt,t[p+1]=at,t[g]=et,t[g+1]=ot,t[w]=it,t[w+1]=ct}}},m.prototype._singleTransform2=function(t,s,n){const r=this._out,a=this._data,e=a[s],h=a[s+1],i=a[s+n],l=a[s+n+1],c=e+i,d=h+l,_=e-i,u=h-l;r[t]=c,r[t+1]=d,r[t+2]=_,r[t+3]=u},m.prototype._singleTransform4=function(t,s,n){const r=this._out,a=this._data,e=this._inv?-1:1,h=n*2,i=n*3,l=a[s],c=a[s+1],d=a[s+n],_=a[s+n+1],u=a[s+h],f=a[s+h+1],v=a[s+i],p=a[s+i+1],g=l+u,w=c+f,T=l-u,F=c-f,A=d+v,y=_+p,b=e*(d-v),S=e*(_-p),P=g+A,C=w+y,E=T+S,R=F-b,M=g-A,D=w-y,U=T-S,z=F+b;r[t]=P,r[t+1]=C,r[t+2]=E,r[t+3]=R,r[t+4]=M,r[t+5]=D,r[t+6]=U,r[t+7]=z},m.prototype._realTransform4=function(){var t=this._out,s=this._csize,n=this._width,r=1<<n,a=s/r<<1,e,h,i=this._bitrev;if(a===4)for(e=0,h=0;e<s;e+=a,h++){const ht=i[h];this._singleRealTransform2(e,ht>>>1,r>>>1)}else for(e=0,h=0;e<s;e+=a,h++){const ht=i[h];this._singleRealTransform4(e,ht>>>1,r>>>1)}var l=this._inv?-1:1,c=this.table;for(r>>=2;r>=2;r>>=2){a=s/r<<1;var d=a>>>1,_=d>>>1,u=_>>>1;for(e=0;e<s;e+=a)for(var f=0,v=0;f<=u;f+=2,v+=r){var 
p=e+f,g=p+_,w=g+_,T=w+_,F=t[p],A=t[p+1],y=t[g],b=t[g+1],S=t[w],P=t[w+1],C=t[T],E=t[T+1],R=F,M=A,D=c[v],U=l*c[v+1],z=y*D-b*U,j=y*U+b*D,L=c[2*v],B=l*c[2*v+1],K=S*L-P*B,N=S*B+P*L,O=c[3*v],k=l*c[3*v+1],G=C*O-E*k,$=C*k+E*O,x=R+K,I=M+N,Y=R-K,J=M-N,H=z+G,W=j+$,Q=l*(z-G),tt=l*(j-$),rt=x+H,et=I+W,ot=Y+tt,nt=J-Q;if(t[p]=rt,t[p+1]=et,t[g]=ot,t[g+1]=nt,f===0){var at=x-H,it=I-W;t[w]=at,t[w+1]=it;continue}if(f!==u){var ct=Y,yt=-J,Tt=x,At=-I,Ft=-l*tt,bt=-l*Q,St=-l*W,Pt=-l*H,Ct=ct+Ft,Et=yt+bt,Rt=Tt+Pt,Mt=At-St,ft=e+_-f,vt=e+d-f;t[ft]=Ct,t[ft+1]=Et,t[vt]=Rt,t[vt+1]=Mt}}}},m.prototype._singleRealTransform2=function(t,s,n){const r=this._out,a=this._data,e=a[s],h=a[s+n],i=e+h,l=e-h;r[t]=i,r[t+1]=0,r[t+2]=l,r[t+3]=0},m.prototype._singleRealTransform4=function(t,s,n){const r=this._out,a=this._data,e=this._inv?-1:1,h=n*2,i=n*3,l=a[s],c=a[s+n],d=a[s+h],_=a[s+i],u=l+d,f=l-d,v=c+_,p=e*(c-_),g=u+v,w=f,T=-p,F=u-v,A=f,y=p;r[t]=g,r[t+1]=0,r[t+2]=w,r[t+3]=T,r[t+4]=F,r[t+5]=0,r[t+6]=A,r[t+7]=y},lt}var zt=Ut();const xt=Dt(zt);class pt{constructor(o=16e3,t=512,s=64){this.sampleRate=o,this.nfft=t,this.nfilt=s,this.fft=new xt(t),this.melFilters=this.createMelFilterbank()}hzToMel(o){return 2595*Math.log10(1+o/700)}melToHz(o){return 700*(10**(o/2595)-1)}createMelFilterbank(){const t=this.sampleRate/2,s=this.hzToMel(0),n=this.hzToMel(t),r=new Float32Array(this.nfilt+2);for(let i=0;i<this.nfilt+2;i++)r[i]=s+i*(n-s)/(this.nfilt+1);const e=r.map(i=>this.melToHz(i)).map(i=>Math.floor((this.nfft+1)*i/this.sampleRate)),h=[];for(let i=0;i<this.nfilt;i++){const l=new Float32Array(Math.floor(this.nfft/2)+1);for(let c=e[i];c<e[i+1];c++)l[c]=(c-e[i])/(e[i+1]-e[i]);for(let c=e[i+1];c<e[i+2];c++)l[c]=(e[i+2]-c)/(e[i+2]-e[i+1]);h.push(l)}return h}logfbank(o){const t=Math.floor(.025*this.sampleRate),s=Math.floor(.01*this.sampleRate),n=1+Math.ceil((o.length-t)/s),r=new Float32Array(n*this.nfilt),a=new Float32Array(this.nfft),e=this.fft.createComplexArray();for(let h=0;h<n;h++){const i=h*s;a.fill(0);for(let 
d=0;d<t&&i+d<o.length;d++)a[d]=o[i+d];const l=this.fft.toComplexArray(a,null);this.fft.transform(e,l);const c=new Float32Array(Math.floor(this.nfft/2)+1);for(let d=0;d<c.length;d++){const _=e[2*d],u=e[2*d+1];c[d]=1/this.nfft*(_*_+u*u),c[d]===0&&(c[d]=1e-30)}for(let d=0;d<this.nfilt;d++){let _=0;const u=this.melFilters[d];for(let f=0;f<c.length;f++)_+=c[f]*u[f];_===0&&(_=1e-30),r[h*this.nfilt+d]=Math.log(_)}}return r}}async function mt(m=Z,o=q){const t=await import(o);return t.env.wasm.wasmPaths=m,t.env.wasm.numThreads=1,t}let dt=null;async function _t(m=st,o=Z,t=q){return dt||(dt=mt(o,t).then(s=>s.InferenceSession.create(m,{executionProviders:["wasm"],graphOptimizationLevel:"all"}))),dt}const It=new pt;class X{constructor(o,t){this._started=!1,this._inferring=!1,this._audioCtx=null,this._stream=null,this._refEmbeddings=new Map,this._lastMatchAt=0,this._lastInferenceAt=0;const{refsStorageKey:s=V,thresholdStorageKey:n=wt,wasmPaths:r=Z,modelPath:a=st,audioProcessorPath:e=gt,ortCdnUrl:h=q}=t||{};this._commands=o,this._refsStorageKey=s,this._thresholdStorageKey=n,this._audioProcessorPath=e,this._wasmPaths=r,this._modelPath=a,this._ortCdnUrl=h;try{const i=localStorage.getItem(this._thresholdStorageKey);this._threshold=i!==null?Math.max(0,Math.min(1,Number(i))):.65}catch{this._threshold=.65}this._initPromise=this._init()}get threshold(){return this._threshold}set threshold(o){this._threshold=Math.max(0,Math.min(1,o));try{localStorage.setItem(this._thresholdStorageKey,String(this._threshold))}catch{}}get listening(){return this._started}async _init(){await _t(this._modelPath,this._wasmPaths,this._ortCdnUrl);const o=new Set;for(const t of this._commands)for(const s of t.triggers)if(!o.has(s.name)&&(o.add(s.name),s.defaultRefPath)){const n=await fetch(s.defaultRefPath);if(n.ok){const r=await n.json();this.addCustomWord(r)}}for(const t of X.loadWords(this._refsStorageKey))this._refEmbeddings.set(t.word_name,t.embeddings);console.info("[Mellon] init complete, loaded 
refs:",[...this._refEmbeddings.keys()])}async init(){await this._initPromise}addCustomWord(o){if(!(Array.isArray(o.embeddings)&&o.embeddings.length>0))throw new Error("invalid ref file for : "+o.word_name);this._refEmbeddings.set(o.word_name,o.embeddings)}async start(){if(this._started)return;await this._initPromise;let o;try{o=await navigator.mediaDevices.getUserMedia({audio:{noiseSuppression:!1,echoCancellation:!1,autoGainControl:!1,channelCount:1}})}catch{o=await navigator.mediaDevices.getUserMedia({audio:!0})}this._stream=o;const t=new AudioContext({sampleRate:16e3});this._audioCtx=t,await t.audioWorklet.addModule(this._audioProcessorPath);const s=t.createMediaStreamSource(o),n=new AudioWorkletNode(t,"audio-processor");n.port.onmessage=r=>{this._handleBuffer(r.data)},s.connect(n),n.connect(t.destination),this._started=!0}async stop(){if(this._started=!1,this._audioCtx&&(await this._audioCtx.close(),this._audioCtx=null),this._stream){for(const o of this._stream.getTracks())o.stop();this._stream=null}}async _handleBuffer(o){if(this._inferring)return;const t=Date.now();if(!(t-this._lastInferenceAt<300)){this._lastInferenceAt=t,this._inferring=!0;try{const[s,n]=await Promise.all([mt(this._wasmPaths,this._ortCdnUrl),_t(this._modelPath,this._wasmPaths,this._ortCdnUrl)]),r=It.logfbank(o),a=new s.Tensor("float32",r,[1,1,149,64]),e=await n.run({input:a}),h=e[Object.keys(e)[0]].data;let i=!1;for(const l of this._commands){if(i)break;for(const c of l.triggers){const d=this._refEmbeddings.get(c.name);if(!d)continue;const _=this._maxCosineSim(h,d);if(_>=this._threshold&&t-this._lastMatchAt>2e3){this._lastMatchAt=t,console.info(`[Mellon] match: "${c}" sim=${_.toFixed(3)}`),typeof l.onMatch=="function"&&l.onMatch(c.name,_),i=!0;break}}}}catch(s){console.error("[Mellon] inference error:",s)}finally{this._inferring=!1}}}_maxCosineSim(o,t){let s=0;for(const n of t){let r=0;for(let e=0;e<n.length;e++)r+=o[e]*n[e];const a=(r+1)/2;a>s&&(s=a)}return s}static loadWords(o=V){try{const 
t=localStorage.getItem(o);return t?JSON.parse(t):[]}catch{return[]}}static saveWord(o,t=V){const s=X.loadWords(t).filter(n=>n.word_name!==o.word_name);localStorage.setItem(t,JSON.stringify([...s,o]))}static deleteWord(o,t=V){try{const s=X.loadWords(t).filter(n=>n.word_name!==o);localStorage.setItem(t,JSON.stringify(s))}catch{}}}const Ht=new pt;class Wt{constructor(o,t){this._config={},this._samples=[],this._wordName=o,this._config.modelPath=(t==null?void 0:t.modelPath)||st,this._config.wasmPaths=(t==null?void 0:t.wasmPaths)||Z,this._config.ortCdnUrl=(t==null?void 0:t.ortCdnUrl)||q}async recordSample(){const o=await navigator.mediaDevices.getUserMedia({audio:!0}),t=new AudioContext({sampleRate:16e3}),s=await new Promise((a,e)=>{const h=new MediaRecorder(o),i=[];h.ondataavailable=l=>{l.data.size>0&&i.push(l.data)},h.onstop=async()=>{var l;for(const c of o.getTracks())c.stop();try{const d=await new Blob(i,{type:((l=i[0])==null?void 0:l.type)||"audio/webm"}).arrayBuffer(),_=await t.decodeAudioData(d);await t.close(),a(_.getChannelData(0).slice())}catch(c){e(c)}},h.start(),setTimeout(()=>{try{h.stop()}catch{}},1500)}),n=24e3,r=new Float32Array(n);return r.set(s.slice(0,n)),this._samples.push(r),this._samples.length}deleteSample(o){if(o<0||o>=this._samples.length)throw new RangeError(`index ${o} out of bounds (${this._samples.length} samples)`);return this._samples.splice(o,1),this._samples.length}async generateRef(){const[o,t]=await Promise.all([mt(this._config.wasmPaths,this._config.ortCdnUrl),_t(this._config.modelPath,this._config.wasmPaths,this._config.ortCdnUrl)]),s=[];for(const n of this._samples){const r=Ht.logfbank(n),a=new o.Tensor("float32",r,[1,1,149,64]),e=await t.run({input:a}),h=Array.from(e[Object.keys(e)[0]].data);s.push(h)}return{word_name:this._wordName,model_type:"resnet_50_arc",embeddings:s}}}const 
Z="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/",q="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/ort.wasm.min.mjs",st="https://huggingface.co/ComicScrip/mellon/resolve/main/model.onnx",gt="https://cdn.jsdelivr.net/npm/mellon@0.0.14/dist/assets/audio-processor.js",V="mellon-refs",wt="mellon-threshold";exports.DEFAULT_AUDIO_PROCESSOR_PATH=gt;exports.DEFAULT_MODEL_PATH=st;exports.DEFAULT_ORT_CDN_URL=q;exports.DEFAULT_REFS_STORAGE_KEY=V;exports.DEFAULT_THRESHOLD_STORAGE_KEY=wt;exports.DEFAULT_WASM_PATHS=Z;exports.EnrollmentSession=Wt;exports.Mellon=X;
|
|
1
|
+
"use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});function Dt(m){return m&&m.__esModule&&Object.prototype.hasOwnProperty.call(m,"default")?m.default:m}var ht,pt;function Ut(){if(pt)return ht;pt=1;function m(o){if(this.size=o|0,this.size<=1||(this.size&this.size-1)!==0)throw new Error("FFT size must be a power of two and bigger than 1");this._csize=o<<1;for(var t=new Array(this.size*2),s=0;s<t.length;s+=2){const i=Math.PI*s/this.size;t[s]=Math.cos(i),t[s+1]=-Math.sin(i)}this.table=t;for(var n=0,r=1;this.size>r;r<<=1)n++;this._width=n%2===0?n-1:n,this._bitrev=new Array(1<<this._width);for(var a=0;a<this._bitrev.length;a++){this._bitrev[a]=0;for(var e=0;e<this._width;e+=2){var l=this._width-e-2;this._bitrev[a]|=(a>>>e&3)<<l}}this._out=null,this._data=null,this._inv=0}return ht=m,m.prototype.fromComplexArray=function(t,s){for(var n=s||new Array(t.length>>>1),r=0;r<t.length;r+=2)n[r>>>1]=t[r];return n},m.prototype.createComplexArray=function(){const t=new Array(this._csize);for(var s=0;s<t.length;s++)t[s]=0;return t},m.prototype.toComplexArray=function(t,s){for(var n=s||this.createComplexArray(),r=0;r<n.length;r+=2)n[r]=t[r>>>1],n[r+1]=0;return n},m.prototype.completeSpectrum=function(t){for(var s=this._csize,n=s>>>1,r=2;r<n;r+=2)t[s-r]=t[r],t[s-r+1]=-t[r+1]},m.prototype.transform=function(t,s){if(t===s)throw new Error("Input and output buffers must be different");this._out=t,this._data=s,this._inv=0,this._transform4(),this._out=null,this._data=null},m.prototype.realTransform=function(t,s){if(t===s)throw new Error("Input and output buffers must be different");this._out=t,this._data=s,this._inv=0,this._realTransform4(),this._out=null,this._data=null},m.prototype.inverseTransform=function(t,s){if(t===s)throw new Error("Input and output buffers must be different");this._out=t,this._data=s,this._inv=1,this._transform4();for(var n=0;n<t.length;n++)t[n]/=this.size;this._out=null,this._data=null},m.prototype._transform4=function(){var 
t=this._out,s=this._csize,n=this._width,r=1<<n,a=s/r<<1,e,l,i=this._bitrev;if(a===4)for(e=0,l=0;e<s;e+=a,l++){const u=i[l];this._singleTransform2(e,u,r)}else for(e=0,l=0;e<s;e+=a,l++){const u=i[l];this._singleTransform4(e,u,r)}var h=this._inv?-1:1,c=this.table;for(r>>=2;r>=2;r>>=2){a=s/r<<1;var _=a>>>2;for(e=0;e<s;e+=a)for(var d=e+_,v=e,f=0;v<d;v+=2,f+=r){const u=v,p=u+_,w=p+_,g=w+_,T=t[u],F=t[u+1],A=t[p],y=t[p+1],b=t[w],S=t[w+1],P=t[g],C=t[g+1],E=T,M=F,R=c[f],D=h*c[f+1],U=A*R-y*D,z=A*D+y*R,j=c[2*f],L=h*c[2*f+1],B=b*j-S*L,K=b*L+S*j,N=c[3*f],O=h*c[3*f+1],k=P*N-C*O,G=P*O+C*N,Y=E+B,x=M+K,I=E-B,$=M-K,J=U+k,H=z+G,W=h*(U-k),Q=h*(z-G),tt=Y+J,rt=x+H,et=Y-J,ot=x-H,nt=I+Q,at=$-W,it=I-Q,ct=$+W;t[u]=tt,t[u+1]=rt,t[p]=nt,t[p+1]=at,t[w]=et,t[w+1]=ot,t[g]=it,t[g+1]=ct}}},m.prototype._singleTransform2=function(t,s,n){const r=this._out,a=this._data,e=a[s],l=a[s+1],i=a[s+n],h=a[s+n+1],c=e+i,_=l+h,d=e-i,v=l-h;r[t]=c,r[t+1]=_,r[t+2]=d,r[t+3]=v},m.prototype._singleTransform4=function(t,s,n){const r=this._out,a=this._data,e=this._inv?-1:1,l=n*2,i=n*3,h=a[s],c=a[s+1],_=a[s+n],d=a[s+n+1],v=a[s+l],f=a[s+l+1],u=a[s+i],p=a[s+i+1],w=h+v,g=c+f,T=h-v,F=c-f,A=_+u,y=d+p,b=e*(_-u),S=e*(d-p),P=w+A,C=g+y,E=T+S,M=F-b,R=w-A,D=g-y,U=T-S,z=F+b;r[t]=P,r[t+1]=C,r[t+2]=E,r[t+3]=M,r[t+4]=R,r[t+5]=D,r[t+6]=U,r[t+7]=z},m.prototype._realTransform4=function(){var t=this._out,s=this._csize,n=this._width,r=1<<n,a=s/r<<1,e,l,i=this._bitrev;if(a===4)for(e=0,l=0;e<s;e+=a,l++){const lt=i[l];this._singleRealTransform2(e,lt>>>1,r>>>1)}else for(e=0,l=0;e<s;e+=a,l++){const lt=i[l];this._singleRealTransform4(e,lt>>>1,r>>>1)}var h=this._inv?-1:1,c=this.table;for(r>>=2;r>=2;r>>=2){a=s/r<<1;var _=a>>>1,d=_>>>1,v=d>>>1;for(e=0;e<s;e+=a)for(var f=0,u=0;f<=v;f+=2,u+=r){var 
p=e+f,w=p+d,g=w+d,T=g+d,F=t[p],A=t[p+1],y=t[w],b=t[w+1],S=t[g],P=t[g+1],C=t[T],E=t[T+1],M=F,R=A,D=c[u],U=h*c[u+1],z=y*D-b*U,j=y*U+b*D,L=c[2*u],B=h*c[2*u+1],K=S*L-P*B,N=S*B+P*L,O=c[3*u],k=h*c[3*u+1],G=C*O-E*k,Y=C*k+E*O,x=M+K,I=R+N,$=M-K,J=R-N,H=z+G,W=j+Y,Q=h*(z-G),tt=h*(j-Y),rt=x+H,et=I+W,ot=$+tt,nt=J-Q;if(t[p]=rt,t[p+1]=et,t[w]=ot,t[w+1]=nt,f===0){var at=x-H,it=I-W;t[g]=at,t[g+1]=it;continue}if(f!==v){var ct=$,yt=-J,Tt=x,At=-I,Ft=-h*tt,bt=-h*Q,St=-h*W,Pt=-h*H,Ct=ct+Ft,Et=yt+bt,Mt=Tt+Pt,Rt=At-St,ut=e+d-f,vt=e+_-f;t[ut]=Ct,t[ut+1]=Et,t[vt]=Mt,t[vt+1]=Rt}}}},m.prototype._singleRealTransform2=function(t,s,n){const r=this._out,a=this._data,e=a[s],l=a[s+n],i=e+l,h=e-l;r[t]=i,r[t+1]=0,r[t+2]=h,r[t+3]=0},m.prototype._singleRealTransform4=function(t,s,n){const r=this._out,a=this._data,e=this._inv?-1:1,l=n*2,i=n*3,h=a[s],c=a[s+n],_=a[s+l],d=a[s+i],v=h+_,f=h-_,u=c+d,p=e*(c-d),w=v+u,g=f,T=-p,F=v-u,A=f,y=p;r[t]=w,r[t+1]=0,r[t+2]=g,r[t+3]=T,r[t+4]=F,r[t+5]=0,r[t+6]=A,r[t+7]=y},ht}var zt=Ut();const xt=Dt(zt);class mt{constructor(o=16e3,t=512,s=64){this._sampleRate=o,this._nfft=t,this._nfilt=s,this._fft=new xt(t),this._melFilters=this._createMelFilterbank()}_hzToMel(o){return 2595*Math.log10(1+o/700)}_melToHz(o){return 700*(10**(o/2595)-1)}_createMelFilterbank(){const t=this._sampleRate/2,s=this._hzToMel(0),n=this._hzToMel(t),r=new Float32Array(this._nfilt+2);for(let i=0;i<this._nfilt+2;i++)r[i]=s+i*(n-s)/(this._nfilt+1);const e=r.map(i=>this._melToHz(i)).map(i=>Math.floor((this._nfft+1)*i/this._sampleRate)),l=[];for(let i=0;i<this._nfilt;i++){const h=new Float32Array(Math.floor(this._nfft/2)+1);for(let c=e[i];c<e[i+1];c++)h[c]=(c-e[i])/(e[i+1]-e[i]);for(let c=e[i+1];c<e[i+2];c++)h[c]=(e[i+2]-c)/(e[i+2]-e[i+1]);l.push(h)}return l}logfbank(o){const t=Math.floor(.025*this._sampleRate),s=Math.floor(.01*this._sampleRate),n=1+Math.ceil((o.length-t)/s),r=new Float32Array(n*this._nfilt),a=new Float32Array(this._nfft),e=this._fft.createComplexArray();for(let l=0;l<n;l++){const 
i=l*s;a.fill(0);for(let _=0;_<t&&i+_<o.length;_++)a[_]=o[i+_];const h=this._fft.toComplexArray(a,null);this._fft.transform(e,h);const c=new Float32Array(Math.floor(this._nfft/2)+1);for(let _=0;_<c.length;_++){const d=e[2*_],v=e[2*_+1];c[_]=1/this._nfft*(d*d+v*v),c[_]===0&&(c[_]=1e-30)}for(let _=0;_<this._nfilt;_++){let d=0;const v=this._melFilters[_];for(let f=0;f<c.length;f++)d+=c[f]*v[f];d===0&&(d=1e-30),r[l*this._nfilt+_]=Math.log(d)}}return r}maxCosineSim(o,t){let s=0;for(const n of t){let r=0;for(let e=0;e<n.length;e++)r+=o[e]*n[e];const a=(r+1)/2;a>s&&(s=a)}return s}}async function ft(m=Z,o=q){const t=await import(o);return t.env.wasm.wasmPaths=m,t.env.wasm.numThreads=1,t}let _t=null;async function dt(m=st,o=Z,t=q){return _t||(_t=ft(o,t).then(s=>s.InferenceSession.create(m,{executionProviders:["wasm"],graphOptimizationLevel:"all"}))),_t}class X{static loadWords(o=V){try{const t=localStorage.getItem(o);return t?JSON.parse(t):[]}catch{return[]}}static saveWord(o,t=V){const s=X.loadWords(t).filter(n=>n.word_name!==o.word_name);localStorage.setItem(t,JSON.stringify([...s,o]))}static deleteWord(o,t=V){try{const s=X.loadWords(t).filter(n=>n.word_name!==o);localStorage.setItem(t,JSON.stringify(s))}catch{}}}class It{constructor(o,t){this._started=!1,this._inferring=!1,this._audioCtx=null,this._stream=null,this._refEmbeddings=new Map,this._lastMatchAt=0,this._lastInferenceAt=0;const{refsStorageKey:s=V,thresholdStorageKey:n=gt,wasmPaths:r=Z,modelPath:a=st,audioProcessorPath:e=wt,ortCdnUrl:l=q,audioUtils:i=new mt}=t||{};this._audioUtils=i,this._commands=o,this._refsStorageKey=s,this._thresholdStorageKey=n,this._audioProcessorPath=e,this._wasmPaths=r,this._modelPath=a,this._ortCdnUrl=l;try{const h=localStorage.getItem(this._thresholdStorageKey);this._threshold=h!==null?Math.max(0,Math.min(1,Number(h))):.65}catch{this._threshold=.65}this._initPromise=this._init()}get threshold(){return this._threshold}set 
threshold(o){this._threshold=Math.max(0,Math.min(1,o));try{localStorage.setItem(this._thresholdStorageKey,String(this._threshold))}catch{}}get listening(){return this._started}async _init(){await dt(this._modelPath,this._wasmPaths,this._ortCdnUrl);const o=new Set;for(const t of this._commands)for(const s of t.triggers)if(!o.has(s.name)&&(o.add(s.name),s.defaultRefPath)){const n=await fetch(s.defaultRefPath);if(n.ok){const r=await n.json();this.addCustomWord(r)}}for(const t of X.loadWords(this._refsStorageKey))this._refEmbeddings.set(t.word_name,t.embeddings);console.info("[Mellon] init complete, loaded refs:",[...this._refEmbeddings.keys()])}async init(){await this._initPromise}addCustomWord(o){if(!(Array.isArray(o.embeddings)&&o.embeddings.length>0))throw new Error("invalid ref file for : "+o.word_name);this._refEmbeddings.set(o.word_name,o.embeddings)}async start(){if(this._started)return;await this._initPromise;let o;try{o=await navigator.mediaDevices.getUserMedia({audio:{noiseSuppression:!1,echoCancellation:!1,autoGainControl:!1,channelCount:1}})}catch{o=await navigator.mediaDevices.getUserMedia({audio:!0})}this._stream=o;const t=new AudioContext({sampleRate:16e3});this._audioCtx=t,await t.audioWorklet.addModule(this._audioProcessorPath);const s=t.createMediaStreamSource(o),n=new AudioWorkletNode(t,"audio-processor");n.port.onmessage=r=>{this._handleBuffer(r.data)},s.connect(n),n.connect(t.destination),this._started=!0}async stop(){if(this._started=!1,this._audioCtx&&(await this._audioCtx.close(),this._audioCtx=null),this._stream){for(const o of this._stream.getTracks())o.stop();this._stream=null}}async _handleBuffer(o){if(this._inferring)return;const t=Date.now();if(!(t-this._lastInferenceAt<300)){this._lastInferenceAt=t,this._inferring=!0;try{const[s,n]=await Promise.all([ft(this._wasmPaths,this._ortCdnUrl),dt(this._modelPath,this._wasmPaths,this._ortCdnUrl)]),r=this._audioUtils.logfbank(o),a=new s.Tensor("float32",r,[1,1,149,64]),e=await 
n.run({input:a}),l=e[Object.keys(e)[0]].data;let i=!1;for(const h of this._commands){if(i)break;for(const c of h.triggers){const _=this._refEmbeddings.get(c.name);if(!_)continue;const d=this._audioUtils.maxCosineSim(l,_);if(d>=this._threshold&&t-this._lastMatchAt>2e3){this._lastMatchAt=t,console.info(`[Mellon] match: "${c}" sim=${d.toFixed(3)}`),typeof h.onMatch=="function"&&h.onMatch(c.name,d),i=!0;break}}}}catch(s){console.error("[Mellon] inference error:",s)}finally{this._inferring=!1}}}}class Ht{constructor(o,t){this._config={},this._samples=[],this._wordName=o,this._config.modelPath=(t==null?void 0:t.modelPath)||st,this._config.wasmPaths=(t==null?void 0:t.wasmPaths)||Z,this._config.ortCdnUrl=(t==null?void 0:t.ortCdnUrl)||q,this._audioUtils=(t==null?void 0:t.audioUtils)??new mt}async recordSample(){const o=await navigator.mediaDevices.getUserMedia({audio:!0}),t=new AudioContext({sampleRate:16e3}),s=await new Promise((a,e)=>{const l=new MediaRecorder(o),i=[];l.ondataavailable=h=>{h.data.size>0&&i.push(h.data)},l.onstop=async()=>{var h;for(const c of o.getTracks())c.stop();try{const _=await new Blob(i,{type:((h=i[0])==null?void 0:h.type)||"audio/webm"}).arrayBuffer(),d=await t.decodeAudioData(_);await t.close(),a(d.getChannelData(0).slice())}catch(c){e(c)}},l.start(),setTimeout(()=>{try{l.stop()}catch{}},1500)}),n=24e3,r=new Float32Array(n);return r.set(s.slice(0,n)),this._samples.push(r),this._samples.length}deleteSample(o){if(o<0||o>=this._samples.length)throw new RangeError(`index ${o} out of bounds (${this._samples.length} samples)`);return this._samples.splice(o,1),this._samples.length}async generateRef(){const[o,t]=await Promise.all([ft(this._config.wasmPaths,this._config.ortCdnUrl),dt(this._config.modelPath,this._config.wasmPaths,this._config.ortCdnUrl)]),s=[];for(const n of this._samples){const r=this._audioUtils.logfbank(n),a=new o.Tensor("float32",r,[1,1,149,64]),e=await 
t.run({input:a}),l=Array.from(e[Object.keys(e)[0]].data);s.push(l)}return{word_name:this._wordName,model_type:"resnet_50_arc",embeddings:s}}}const Z="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/",q="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/ort.wasm.min.mjs",st="https://huggingface.co/ComicScrip/mellon/resolve/main/model.onnx",wt="https://cdn.jsdelivr.net/npm/mellon@0.0.14/dist/assets/audio-processor.js",V="mellon-refs",gt="mellon-threshold";exports.AudioUtils=mt;exports.DEFAULT_AUDIO_PROCESSOR_PATH=wt;exports.DEFAULT_MODEL_PATH=st;exports.DEFAULT_ORT_CDN_URL=q;exports.DEFAULT_REFS_STORAGE_KEY=V;exports.DEFAULT_THRESHOLD_STORAGE_KEY=gt;exports.DEFAULT_WASM_PATHS=Z;exports.Detector=It;exports.EnrollmentSession=Ht;exports.Storage=X;
|
package/dist/mellon.mjs
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
function Rt(
|
|
2
|
-
return
|
|
1
|
+
function Rt(m) {
|
|
2
|
+
return m && m.__esModule && Object.prototype.hasOwnProperty.call(m, "default") ? m.default : m;
|
|
3
3
|
}
|
|
4
|
-
var ht,
|
|
4
|
+
var ht, vt;
|
|
5
5
|
function Et() {
|
|
6
|
-
if (
|
|
7
|
-
|
|
8
|
-
function
|
|
6
|
+
if (vt) return ht;
|
|
7
|
+
vt = 1;
|
|
8
|
+
function m(o) {
|
|
9
9
|
if (this.size = o | 0, this.size <= 1 || (this.size & this.size - 1) !== 0)
|
|
10
10
|
throw new Error("FFT size must be a power of two and bigger than 1");
|
|
11
11
|
this._csize = o << 1;
|
|
@@ -26,66 +26,66 @@ function Et() {
|
|
|
26
26
|
}
|
|
27
27
|
this._out = null, this._data = null, this._inv = 0;
|
|
28
28
|
}
|
|
29
|
-
return ht =
|
|
29
|
+
return ht = m, m.prototype.fromComplexArray = function(t, s) {
|
|
30
30
|
for (var n = s || new Array(t.length >>> 1), r = 0; r < t.length; r += 2)
|
|
31
31
|
n[r >>> 1] = t[r];
|
|
32
32
|
return n;
|
|
33
|
-
},
|
|
33
|
+
}, m.prototype.createComplexArray = function() {
|
|
34
34
|
const t = new Array(this._csize);
|
|
35
35
|
for (var s = 0; s < t.length; s++)
|
|
36
36
|
t[s] = 0;
|
|
37
37
|
return t;
|
|
38
|
-
},
|
|
38
|
+
}, m.prototype.toComplexArray = function(t, s) {
|
|
39
39
|
for (var n = s || this.createComplexArray(), r = 0; r < n.length; r += 2)
|
|
40
40
|
n[r] = t[r >>> 1], n[r + 1] = 0;
|
|
41
41
|
return n;
|
|
42
|
-
},
|
|
42
|
+
}, m.prototype.completeSpectrum = function(t) {
|
|
43
43
|
for (var s = this._csize, n = s >>> 1, r = 2; r < n; r += 2)
|
|
44
44
|
t[s - r] = t[r], t[s - r + 1] = -t[r + 1];
|
|
45
|
-
},
|
|
45
|
+
}, m.prototype.transform = function(t, s) {
|
|
46
46
|
if (t === s)
|
|
47
47
|
throw new Error("Input and output buffers must be different");
|
|
48
48
|
this._out = t, this._data = s, this._inv = 0, this._transform4(), this._out = null, this._data = null;
|
|
49
|
-
},
|
|
49
|
+
}, m.prototype.realTransform = function(t, s) {
|
|
50
50
|
if (t === s)
|
|
51
51
|
throw new Error("Input and output buffers must be different");
|
|
52
52
|
this._out = t, this._data = s, this._inv = 0, this._realTransform4(), this._out = null, this._data = null;
|
|
53
|
-
},
|
|
53
|
+
}, m.prototype.inverseTransform = function(t, s) {
|
|
54
54
|
if (t === s)
|
|
55
55
|
throw new Error("Input and output buffers must be different");
|
|
56
56
|
this._out = t, this._data = s, this._inv = 1, this._transform4();
|
|
57
57
|
for (var n = 0; n < t.length; n++)
|
|
58
58
|
t[n] /= this.size;
|
|
59
59
|
this._out = null, this._data = null;
|
|
60
|
-
},
|
|
60
|
+
}, m.prototype._transform4 = function() {
|
|
61
61
|
var t = this._out, s = this._csize, n = this._width, r = 1 << n, a = s / r << 1, e, h, i = this._bitrev;
|
|
62
62
|
if (a === 4)
|
|
63
63
|
for (e = 0, h = 0; e < s; e += a, h++) {
|
|
64
|
-
const
|
|
65
|
-
this._singleTransform2(e,
|
|
64
|
+
const u = i[h];
|
|
65
|
+
this._singleTransform2(e, u, r);
|
|
66
66
|
}
|
|
67
67
|
else
|
|
68
68
|
for (e = 0, h = 0; e < s; e += a, h++) {
|
|
69
|
-
const
|
|
70
|
-
this._singleTransform4(e,
|
|
69
|
+
const u = i[h];
|
|
70
|
+
this._singleTransform4(e, u, r);
|
|
71
71
|
}
|
|
72
72
|
var l = this._inv ? -1 : 1, c = this.table;
|
|
73
73
|
for (r >>= 2; r >= 2; r >>= 2) {
|
|
74
74
|
a = s / r << 1;
|
|
75
|
-
var
|
|
75
|
+
var _ = a >>> 2;
|
|
76
76
|
for (e = 0; e < s; e += a)
|
|
77
|
-
for (var
|
|
78
|
-
const
|
|
79
|
-
t[
|
|
77
|
+
for (var d = e + _, v = e, f = 0; v < d; v += 2, f += r) {
|
|
78
|
+
const u = v, p = u + _, w = p + _, g = w + _, b = t[u], T = t[u + 1], F = t[p], y = t[p + 1], A = t[w], C = t[w + 1], P = t[g], M = t[g + 1], S = b, R = T, E = c[f], U = l * c[f + 1], D = F * E - y * U, z = F * U + y * E, B = c[2 * f], N = l * c[2 * f + 1], H = A * B - C * N, K = A * N + C * B, k = c[3 * f], $ = l * c[3 * f + 1], G = P * k - M * $, J = P * $ + M * k, L = S + H, x = R + K, I = S - H, Y = R - K, Q = D + G, W = z + J, j = l * (D - G), V = l * (z - J), X = L + Q, st = x + W, rt = L - Q, et = x - W, ot = I + V, nt = Y - j, at = I - V, it = Y + j;
|
|
79
|
+
t[u] = X, t[u + 1] = st, t[p] = ot, t[p + 1] = nt, t[w] = rt, t[w + 1] = et, t[g] = at, t[g + 1] = it;
|
|
80
80
|
}
|
|
81
81
|
}
|
|
82
|
-
},
|
|
83
|
-
const r = this._out, a = this._data, e = a[s], h = a[s + 1], i = a[s + n], l = a[s + n + 1], c = e + i,
|
|
84
|
-
r[t] = c, r[t + 1] =
|
|
85
|
-
},
|
|
86
|
-
const r = this._out, a = this._data, e = this._inv ? -1 : 1, h = n * 2, i = n * 3, l = a[s], c = a[s + 1],
|
|
87
|
-
r[t] = P, r[t + 1] = M, r[t + 2] = S, r[t + 3] = R, r[t + 4] = E, r[t + 5] =
|
|
88
|
-
},
|
|
82
|
+
}, m.prototype._singleTransform2 = function(t, s, n) {
|
|
83
|
+
const r = this._out, a = this._data, e = a[s], h = a[s + 1], i = a[s + n], l = a[s + n + 1], c = e + i, _ = h + l, d = e - i, v = h - l;
|
|
84
|
+
r[t] = c, r[t + 1] = _, r[t + 2] = d, r[t + 3] = v;
|
|
85
|
+
}, m.prototype._singleTransform4 = function(t, s, n) {
|
|
86
|
+
const r = this._out, a = this._data, e = this._inv ? -1 : 1, h = n * 2, i = n * 3, l = a[s], c = a[s + 1], _ = a[s + n], d = a[s + n + 1], v = a[s + h], f = a[s + h + 1], u = a[s + i], p = a[s + i + 1], w = l + v, g = c + f, b = l - v, T = c - f, F = _ + u, y = d + p, A = e * (_ - u), C = e * (d - p), P = w + F, M = g + y, S = b + C, R = T - A, E = w - F, U = g - y, D = b - C, z = T + A;
|
|
87
|
+
r[t] = P, r[t + 1] = M, r[t + 2] = S, r[t + 3] = R, r[t + 4] = E, r[t + 5] = U, r[t + 6] = D, r[t + 7] = z;
|
|
88
|
+
}, m.prototype._realTransform4 = function() {
|
|
89
89
|
var t = this._out, s = this._csize, n = this._width, r = 1 << n, a = s / r << 1, e, h, i = this._bitrev;
|
|
90
90
|
if (a === 4)
|
|
91
91
|
for (e = 0, h = 0; e < s; e += a, h++) {
|
|
@@ -100,48 +100,48 @@ function Et() {
|
|
|
100
100
|
var l = this._inv ? -1 : 1, c = this.table;
|
|
101
101
|
for (r >>= 2; r >= 2; r >>= 2) {
|
|
102
102
|
a = s / r << 1;
|
|
103
|
-
var
|
|
103
|
+
var _ = a >>> 1, d = _ >>> 1, v = d >>> 1;
|
|
104
104
|
for (e = 0; e < s; e += a)
|
|
105
|
-
for (var f = 0,
|
|
106
|
-
var p = e + f, w = p +
|
|
105
|
+
for (var f = 0, u = 0; f <= v; f += 2, u += r) {
|
|
106
|
+
var p = e + f, w = p + d, g = w + d, b = g + d, T = t[p], F = t[p + 1], y = t[w], A = t[w + 1], C = t[g], P = t[g + 1], M = t[b], S = t[b + 1], R = T, E = F, U = c[u], D = l * c[u + 1], z = y * U - A * D, B = y * D + A * U, N = c[2 * u], H = l * c[2 * u + 1], K = C * N - P * H, k = C * H + P * N, $ = c[3 * u], G = l * c[3 * u + 1], J = M * $ - S * G, L = M * G + S * $, x = R + K, I = E + k, Y = R - K, Q = E - k, W = z + J, j = B + L, V = l * (z - J), X = l * (B - L), st = x + W, rt = I + j, et = Y + X, ot = Q - V;
|
|
107
107
|
if (t[p] = st, t[p + 1] = rt, t[w] = et, t[w + 1] = ot, f === 0) {
|
|
108
|
-
var nt =
|
|
108
|
+
var nt = x - W, at = I - j;
|
|
109
109
|
t[g] = nt, t[g + 1] = at;
|
|
110
110
|
continue;
|
|
111
111
|
}
|
|
112
|
-
if (f !==
|
|
113
|
-
var it = Y, wt = -Q, gt =
|
|
114
|
-
t[ft] = Ct, t[ft + 1] = Pt, t[
|
|
112
|
+
if (f !== v) {
|
|
113
|
+
var it = Y, wt = -Q, gt = x, yt = -I, bt = -l * X, Ft = -l * V, Tt = -l * j, At = -l * W, Ct = it + bt, Pt = wt + Ft, Mt = gt + At, St = yt - Tt, ft = e + d - f, ut = e + _ - f;
|
|
114
|
+
t[ft] = Ct, t[ft + 1] = Pt, t[ut] = Mt, t[ut + 1] = St;
|
|
115
115
|
}
|
|
116
116
|
}
|
|
117
117
|
}
|
|
118
|
-
},
|
|
118
|
+
}, m.prototype._singleRealTransform2 = function(t, s, n) {
|
|
119
119
|
const r = this._out, a = this._data, e = a[s], h = a[s + n], i = e + h, l = e - h;
|
|
120
120
|
r[t] = i, r[t + 1] = 0, r[t + 2] = l, r[t + 3] = 0;
|
|
121
|
-
},
|
|
122
|
-
const r = this._out, a = this._data, e = this._inv ? -1 : 1, h = n * 2, i = n * 3, l = a[s], c = a[s + n],
|
|
121
|
+
}, m.prototype._singleRealTransform4 = function(t, s, n) {
|
|
122
|
+
const r = this._out, a = this._data, e = this._inv ? -1 : 1, h = n * 2, i = n * 3, l = a[s], c = a[s + n], _ = a[s + h], d = a[s + i], v = l + _, f = l - _, u = c + d, p = e * (c - d), w = v + u, g = f, b = -p, T = v - u, F = f, y = p;
|
|
123
123
|
r[t] = w, r[t + 1] = 0, r[t + 2] = g, r[t + 3] = b, r[t + 4] = T, r[t + 5] = 0, r[t + 6] = F, r[t + 7] = y;
|
|
124
124
|
}, ht;
|
|
125
125
|
}
|
|
126
|
-
var
|
|
127
|
-
const
|
|
126
|
+
var Ut = Et();
|
|
127
|
+
const Dt = /* @__PURE__ */ Rt(Ut);
|
|
128
128
|
class pt {
|
|
129
129
|
constructor(o = 16e3, t = 512, s = 64) {
|
|
130
|
-
this.
|
|
130
|
+
this._sampleRate = o, this._nfft = t, this._nfilt = s, this._fft = new Dt(t), this._melFilters = this._createMelFilterbank();
|
|
131
131
|
}
|
|
132
|
-
|
|
132
|
+
_hzToMel(o) {
|
|
133
133
|
return 2595 * Math.log10(1 + o / 700);
|
|
134
134
|
}
|
|
135
|
-
|
|
135
|
+
_melToHz(o) {
|
|
136
136
|
return 700 * (10 ** (o / 2595) - 1);
|
|
137
137
|
}
|
|
138
|
-
|
|
139
|
-
const t = this.
|
|
140
|
-
for (let i = 0; i < this.
|
|
141
|
-
r[i] = s + i * (n - s) / (this.
|
|
142
|
-
const e = r.map((i) => this.
|
|
143
|
-
for (let i = 0; i < this.
|
|
144
|
-
const l = new Float32Array(Math.floor(this.
|
|
138
|
+
_createMelFilterbank() {
|
|
139
|
+
const t = this._sampleRate / 2, s = this._hzToMel(0), n = this._hzToMel(t), r = new Float32Array(this._nfilt + 2);
|
|
140
|
+
for (let i = 0; i < this._nfilt + 2; i++)
|
|
141
|
+
r[i] = s + i * (n - s) / (this._nfilt + 1);
|
|
142
|
+
const e = r.map((i) => this._melToHz(i)).map((i) => Math.floor((this._nfft + 1) * i / this._sampleRate)), h = [];
|
|
143
|
+
for (let i = 0; i < this._nfilt; i++) {
|
|
144
|
+
const l = new Float32Array(Math.floor(this._nfft / 2) + 1);
|
|
145
145
|
for (let c = e[i]; c < e[i + 1]; c++)
|
|
146
146
|
l[c] = (c - e[i]) / (e[i + 1] - e[i]);
|
|
147
147
|
for (let c = e[i + 1]; c < e[i + 2]; c++)
|
|
@@ -152,62 +152,93 @@ class pt {
|
|
|
152
152
|
}
|
|
153
153
|
/** Returns a flat Float32Array of shape [numFrames × nfilt]. */
|
|
154
154
|
logfbank(o) {
|
|
155
|
-
const t = Math.floor(0.025 * this.
|
|
155
|
+
const t = Math.floor(0.025 * this._sampleRate), s = Math.floor(0.01 * this._sampleRate), n = 1 + Math.ceil((o.length - t) / s), r = new Float32Array(n * this._nfilt), a = new Float32Array(this._nfft), e = this._fft.createComplexArray();
|
|
156
156
|
for (let h = 0; h < n; h++) {
|
|
157
157
|
const i = h * s;
|
|
158
158
|
a.fill(0);
|
|
159
|
-
for (let
|
|
160
|
-
a[
|
|
161
|
-
const l = this.
|
|
162
|
-
this.
|
|
163
|
-
const c = new Float32Array(Math.floor(this.
|
|
164
|
-
for (let
|
|
165
|
-
const
|
|
166
|
-
c[
|
|
159
|
+
for (let _ = 0; _ < t && i + _ < o.length; _++)
|
|
160
|
+
a[_] = o[i + _];
|
|
161
|
+
const l = this._fft.toComplexArray(a, null);
|
|
162
|
+
this._fft.transform(e, l);
|
|
163
|
+
const c = new Float32Array(Math.floor(this._nfft / 2) + 1);
|
|
164
|
+
for (let _ = 0; _ < c.length; _++) {
|
|
165
|
+
const d = e[2 * _], v = e[2 * _ + 1];
|
|
166
|
+
c[_] = 1 / this._nfft * (d * d + v * v), c[_] === 0 && (c[_] = 1e-30);
|
|
167
167
|
}
|
|
168
|
-
for (let
|
|
169
|
-
let
|
|
170
|
-
const
|
|
168
|
+
for (let _ = 0; _ < this._nfilt; _++) {
|
|
169
|
+
let d = 0;
|
|
170
|
+
const v = this._melFilters[_];
|
|
171
171
|
for (let f = 0; f < c.length; f++)
|
|
172
|
-
|
|
173
|
-
|
|
172
|
+
d += c[f] * v[f];
|
|
173
|
+
d === 0 && (d = 1e-30), r[h * this._nfilt + _] = Math.log(d);
|
|
174
174
|
}
|
|
175
175
|
}
|
|
176
176
|
return r;
|
|
177
177
|
}
|
|
178
|
+
maxCosineSim(o, t) {
|
|
179
|
+
let s = 0;
|
|
180
|
+
for (const n of t) {
|
|
181
|
+
let r = 0;
|
|
182
|
+
for (let e = 0; e < n.length; e++) r += o[e] * n[e];
|
|
183
|
+
const a = (r + 1) / 2;
|
|
184
|
+
a > s && (s = a);
|
|
185
|
+
}
|
|
186
|
+
return s;
|
|
187
|
+
}
|
|
178
188
|
}
|
|
179
|
-
async function
|
|
189
|
+
async function dt(m = q, o = tt) {
|
|
180
190
|
const t = await import(
|
|
181
191
|
/* @vite-ignore */
|
|
182
192
|
o
|
|
183
193
|
);
|
|
184
|
-
return t.env.wasm.wasmPaths =
|
|
194
|
+
return t.env.wasm.wasmPaths = m, t.env.wasm.numThreads = 1, t;
|
|
185
195
|
}
|
|
186
196
|
let lt = null;
|
|
187
|
-
async function
|
|
188
|
-
return lt || (lt =
|
|
189
|
-
(s) => s.InferenceSession.create(
|
|
197
|
+
async function _t(m = mt, o = q, t = tt) {
|
|
198
|
+
return lt || (lt = dt(o, t).then(
|
|
199
|
+
(s) => s.InferenceSession.create(m, {
|
|
190
200
|
executionProviders: ["wasm"],
|
|
191
201
|
graphOptimizationLevel: "all"
|
|
192
202
|
})
|
|
193
203
|
)), lt;
|
|
194
204
|
}
|
|
195
|
-
const xt = new pt();
|
|
196
205
|
class O {
|
|
206
|
+
static loadWords(o = Z) {
|
|
207
|
+
try {
|
|
208
|
+
const t = localStorage.getItem(o);
|
|
209
|
+
return t ? JSON.parse(t) : [];
|
|
210
|
+
} catch {
|
|
211
|
+
return [];
|
|
212
|
+
}
|
|
213
|
+
}
|
|
214
|
+
static saveWord(o, t = Z) {
|
|
215
|
+
const s = O.loadWords(t).filter((n) => n.word_name !== o.word_name);
|
|
216
|
+
localStorage.setItem(t, JSON.stringify([...s, o]));
|
|
217
|
+
}
|
|
218
|
+
static deleteWord(o, t = Z) {
|
|
219
|
+
try {
|
|
220
|
+
const s = O.loadWords(t).filter((n) => n.word_name !== o);
|
|
221
|
+
localStorage.setItem(t, JSON.stringify(s));
|
|
222
|
+
} catch {
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
}
|
|
226
|
+
class It {
|
|
197
227
|
constructor(o, t) {
|
|
198
228
|
this._started = !1, this._inferring = !1, this._audioCtx = null, this._stream = null, this._refEmbeddings = /* @__PURE__ */ new Map(), this._lastMatchAt = 0, this._lastInferenceAt = 0;
|
|
199
229
|
const {
|
|
200
230
|
refsStorageKey: s = Z,
|
|
201
|
-
thresholdStorageKey: n =
|
|
231
|
+
thresholdStorageKey: n = xt,
|
|
202
232
|
wasmPaths: r = q,
|
|
203
|
-
modelPath: a =
|
|
204
|
-
audioProcessorPath: e =
|
|
205
|
-
ortCdnUrl: h = tt
|
|
233
|
+
modelPath: a = mt,
|
|
234
|
+
audioProcessorPath: e = zt,
|
|
235
|
+
ortCdnUrl: h = tt,
|
|
236
|
+
audioUtils: i = new pt()
|
|
206
237
|
} = t || {};
|
|
207
|
-
this._commands = o, this._refsStorageKey = s, this._thresholdStorageKey = n, this._audioProcessorPath = e, this._wasmPaths = r, this._modelPath = a, this._ortCdnUrl = h;
|
|
238
|
+
this._audioUtils = i, this._commands = o, this._refsStorageKey = s, this._thresholdStorageKey = n, this._audioProcessorPath = e, this._wasmPaths = r, this._modelPath = a, this._ortCdnUrl = h;
|
|
208
239
|
try {
|
|
209
|
-
const
|
|
210
|
-
this._threshold =
|
|
240
|
+
const l = localStorage.getItem(this._thresholdStorageKey);
|
|
241
|
+
this._threshold = l !== null ? Math.max(0, Math.min(1, Number(l))) : 0.65;
|
|
211
242
|
} catch {
|
|
212
243
|
this._threshold = 0.65;
|
|
213
244
|
}
|
|
@@ -227,7 +258,7 @@ class O {
|
|
|
227
258
|
return this._started;
|
|
228
259
|
}
|
|
229
260
|
async _init() {
|
|
230
|
-
await
|
|
261
|
+
await _t(this._modelPath, this._wasmPaths, this._ortCdnUrl);
|
|
231
262
|
const o = /* @__PURE__ */ new Set();
|
|
232
263
|
for (const t of this._commands)
|
|
233
264
|
for (const s of t.triggers)
|
|
@@ -288,16 +319,16 @@ class O {
|
|
|
288
319
|
if (!(t - this._lastInferenceAt < 300)) {
|
|
289
320
|
this._lastInferenceAt = t, this._inferring = !0;
|
|
290
321
|
try {
|
|
291
|
-
const [s, n] = await Promise.all([
|
|
322
|
+
const [s, n] = await Promise.all([dt(this._wasmPaths, this._ortCdnUrl), _t(this._modelPath, this._wasmPaths, this._ortCdnUrl)]), r = this._audioUtils.logfbank(o), a = new s.Tensor("float32", r, [1, 1, 149, 64]), e = await n.run({ input: a }), h = e[Object.keys(e)[0]].data;
|
|
292
323
|
let i = !1;
|
|
293
324
|
for (const l of this._commands) {
|
|
294
325
|
if (i) break;
|
|
295
326
|
for (const c of l.triggers) {
|
|
296
|
-
const
|
|
297
|
-
if (!
|
|
298
|
-
const
|
|
299
|
-
if (
|
|
300
|
-
this._lastMatchAt = t, console.info(`[Mellon] match: "${c}" sim=${
|
|
327
|
+
const _ = this._refEmbeddings.get(c.name);
|
|
328
|
+
if (!_) continue;
|
|
329
|
+
const d = this._audioUtils.maxCosineSim(h, _);
|
|
330
|
+
if (d >= this._threshold && t - this._lastMatchAt > 2e3) {
|
|
331
|
+
this._lastMatchAt = t, console.info(`[Mellon] match: "${c}" sim=${d.toFixed(3)}`), typeof l.onMatch == "function" && l.onMatch(c.name, d), i = !0;
|
|
301
332
|
break;
|
|
302
333
|
}
|
|
303
334
|
}
|
|
@@ -309,40 +340,10 @@ class O {
|
|
|
309
340
|
}
|
|
310
341
|
}
|
|
311
342
|
}
|
|
312
|
-
_maxCosineSim(o, t) {
|
|
313
|
-
let s = 0;
|
|
314
|
-
for (const n of t) {
|
|
315
|
-
let r = 0;
|
|
316
|
-
for (let e = 0; e < n.length; e++) r += o[e] * n[e];
|
|
317
|
-
const a = (r + 1) / 2;
|
|
318
|
-
a > s && (s = a);
|
|
319
|
-
}
|
|
320
|
-
return s;
|
|
321
|
-
}
|
|
322
|
-
static loadWords(o = Z) {
|
|
323
|
-
try {
|
|
324
|
-
const t = localStorage.getItem(o);
|
|
325
|
-
return t ? JSON.parse(t) : [];
|
|
326
|
-
} catch {
|
|
327
|
-
return [];
|
|
328
|
-
}
|
|
329
|
-
}
|
|
330
|
-
static saveWord(o, t = Z) {
|
|
331
|
-
const s = O.loadWords(t).filter((n) => n.word_name !== o.word_name);
|
|
332
|
-
localStorage.setItem(t, JSON.stringify([...s, o]));
|
|
333
|
-
}
|
|
334
|
-
static deleteWord(o, t = Z) {
|
|
335
|
-
try {
|
|
336
|
-
const s = O.loadWords(t).filter((n) => n.word_name !== o);
|
|
337
|
-
localStorage.setItem(t, JSON.stringify(s));
|
|
338
|
-
} catch {
|
|
339
|
-
}
|
|
340
|
-
}
|
|
341
343
|
}
|
|
342
|
-
|
|
343
|
-
class jt {
|
|
344
|
+
class Wt {
|
|
344
345
|
constructor(o, t) {
|
|
345
|
-
this._config = {}, this._samples = [], this._wordName = o, this._config.modelPath = (t == null ? void 0 : t.modelPath) ||
|
|
346
|
+
this._config = {}, this._samples = [], this._wordName = o, this._config.modelPath = (t == null ? void 0 : t.modelPath) || mt, this._config.wasmPaths = (t == null ? void 0 : t.wasmPaths) || q, this._config.ortCdnUrl = (t == null ? void 0 : t.ortCdnUrl) || tt, this._audioUtils = (t == null ? void 0 : t.audioUtils) ?? new pt();
|
|
346
347
|
}
|
|
347
348
|
/** Records 1.5 s of audio, stores the decoded PCM, returns new sample count. */
|
|
348
349
|
async recordSample() {
|
|
@@ -354,8 +355,8 @@ class jt {
|
|
|
354
355
|
var l;
|
|
355
356
|
for (const c of o.getTracks()) c.stop();
|
|
356
357
|
try {
|
|
357
|
-
const
|
|
358
|
-
await t.close(), a(
|
|
358
|
+
const _ = await new Blob(i, { type: ((l = i[0]) == null ? void 0 : l.type) || "audio/webm" }).arrayBuffer(), d = await t.decodeAudioData(_);
|
|
359
|
+
await t.close(), a(d.getChannelData(0).slice());
|
|
359
360
|
} catch (c) {
|
|
360
361
|
e(c);
|
|
361
362
|
}
|
|
@@ -376,22 +377,24 @@ class jt {
|
|
|
376
377
|
}
|
|
377
378
|
/** Runs ONNX inference on every recorded sample to produce reference embeddings. */
|
|
378
379
|
async generateRef() {
|
|
379
|
-
const [o, t] = await Promise.all([
|
|
380
|
+
const [o, t] = await Promise.all([dt(this._config.wasmPaths, this._config.ortCdnUrl), _t(this._config.modelPath, this._config.wasmPaths, this._config.ortCdnUrl)]), s = [];
|
|
380
381
|
for (const n of this._samples) {
|
|
381
|
-
const r =
|
|
382
|
+
const r = this._audioUtils.logfbank(n), a = new o.Tensor("float32", r, [1, 1, 149, 64]), e = await t.run({ input: a }), h = Array.from(e[Object.keys(e)[0]].data);
|
|
382
383
|
s.push(h);
|
|
383
384
|
}
|
|
384
385
|
return { word_name: this._wordName, model_type: "resnet_50_arc", embeddings: s };
|
|
385
386
|
}
|
|
386
387
|
}
|
|
387
|
-
const q = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/", tt = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/ort.wasm.min.mjs",
|
|
388
|
+
const q = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/", tt = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/ort.wasm.min.mjs", mt = "https://huggingface.co/ComicScrip/mellon/resolve/main/model.onnx", zt = "https://cdn.jsdelivr.net/npm/mellon@0.0.14/dist/assets/audio-processor.js", Z = "mellon-refs", xt = "mellon-threshold";
|
|
388
389
|
export {
|
|
389
|
-
|
|
390
|
-
|
|
390
|
+
pt as AudioUtils,
|
|
391
|
+
zt as DEFAULT_AUDIO_PROCESSOR_PATH,
|
|
392
|
+
mt as DEFAULT_MODEL_PATH,
|
|
391
393
|
tt as DEFAULT_ORT_CDN_URL,
|
|
392
394
|
Z as DEFAULT_REFS_STORAGE_KEY,
|
|
393
|
-
|
|
395
|
+
xt as DEFAULT_THRESHOLD_STORAGE_KEY,
|
|
394
396
|
q as DEFAULT_WASM_PATHS,
|
|
395
|
-
|
|
396
|
-
|
|
397
|
+
It as Detector,
|
|
398
|
+
Wt as EnrollmentSession,
|
|
399
|
+
O as Storage
|
|
397
400
|
};
|