mellon 0.0.18 → 0.0.20

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -17,7 +17,7 @@ Offline, fully in-browser **hotword / wake-word detection** powered by [Efficien
17
17
  2. [Quick start](#quick-start)
18
18
  3. [Enrolling words](#enrolling-custom-words)
19
19
  4. [API reference](#api-reference)
20
- - [Mellon](#mellon)
20
+ - [Detector](#detector)
21
21
  - [EnrollmentSession](#enrollmentsession)
22
22
  5. [Science behind the lib](#science-behind-the-lib)
23
23
  ---
@@ -31,9 +31,9 @@ npm install mellon
31
31
  ## Quick start
32
32
 
33
33
  ```js
34
- import { Mellon } from 'mellon'
34
+ import { Detector } from 'mellon'
35
35
 
36
- const hotWordDetection = new Mellon([
36
+ const hotWordDetection = new Detector([
37
37
  {
38
38
  name: 'openDoors',
39
39
  triggers: [{ name: 'mellon', defaultRefPath: '/mellon-assets/mellon_ref.json' }],
@@ -71,9 +71,9 @@ await hotWordDetection.start() // opens the mic and listens for all registered t
71
71
  ## Enrolling custom words
72
72
 
73
73
  ```js
74
- import { Mellon, EnrollmentSession } from 'mellon'
74
+ import { Detector, EnrollmentSession, Storage } from 'mellon'
75
75
 
76
- const hotwordDetection = new Mellon([{
76
+ const hotwordDetection = new Detector([{
77
77
  name: 'startEngine',
78
78
  triggers: [{ name: 'start' }],
79
79
  onMatch: (triggerNameMatched, confidence) => { console.log('starting engine...') }
@@ -99,19 +99,19 @@ hotwordDetection.addCustomWord(ref)
99
99
  await hotwordDetection.start()
100
100
 
101
101
  // 4b. Persist for future sessions
102
- Mellon.saveWord(ref)
102
+ Storage.saveWord(ref)
103
103
  ```
104
104
 
105
105
  ---
106
106
 
107
107
  ## API reference
108
108
 
109
- ### `Mellon`
109
+ ### `Detector`
110
110
 
111
111
  The easiest way to use the library. Wraps mic access, AudioWorklet wiring, and detector management into a single class.
112
112
 
113
113
  ```ts
114
- class Mellon {
114
+ class Detector {
115
115
  constructor(commands: Command[], config?: DetectorConfig)
116
116
  threshold: number // read/write; persisted in localStorage
117
117
  readonly listening: boolean
@@ -121,7 +121,19 @@ class Mellon {
121
121
  stop(): Promise<void>
122
122
  addCustomWord(ref: WordRef): void
123
123
 
124
- // Storage helpers — static, work without a Mellon instance
124
+ // Storage helpers — static, work without a Detector instance
125
+ static loadWords(storageKey?: string): WordRef[]
126
+ static saveWord(ref: WordRef, storageKey?: string): void
127
+ static deleteWord(wordName: string, storageKey?: string): void
128
+ }
129
+ ```
130
+
131
+ ### `Storage`
132
+
133
+ Static helpers for persisting enrolled word references in `localStorage`.
134
+
135
+ ```ts
136
+ class Storage {
125
137
  static loadWords(storageKey?: string): WordRef[]
126
138
  static saveWord(ref: WordRef, storageKey?: string): void
127
139
  static deleteWord(wordName: string, storageKey?: string): void
package/dist/index.d.ts CHANGED
@@ -1,5 +1,7 @@
1
- import { Mellon } from './Mellon';
1
+ import { Detector } from './Detector';
2
2
  import { EnrollmentSession } from './EnrollmentSession';
3
+ import { Storage } from './Storage';
4
+ import { AudioUtils } from './AudioUtils';
3
5
  export type TriggerName = string;
4
6
  export interface Trigger {
5
7
  name: TriggerName;
@@ -10,28 +12,37 @@ export interface Command {
10
12
  triggers: Trigger[];
11
13
  onMatch?: (trigger: TriggerName, confidence: number) => any;
12
14
  }
13
- export interface MellonConfig {
15
+ export interface DetectorConfig {
14
16
  refsStorageKey?: string;
15
17
  thresholdStorageKey?: string;
16
18
  wasmPaths?: string;
17
19
  modelPath?: string;
18
20
  audioProcessorPath?: string;
19
21
  ortCdnUrl?: string;
22
+ audioUtils?: AudioUtils;
20
23
  }
21
24
  export interface EnrollmentSessionConfig {
22
25
  wasmPaths?: string;
23
26
  modelPath?: string;
24
27
  ortCdnUrl?: string;
28
+ audioUtils?: AudioUtils;
25
29
  }
26
30
  export interface WordRef {
27
31
  word_name: TriggerName;
28
32
  model_type?: string;
29
33
  embeddings: number[][];
30
34
  }
35
+ /**
36
+ * Called during {@link Detector.init} to report real download progress.
37
+ * @param downloaded - total bytes received so far across all assets
38
+ * @param total - sum of known Content-Length values for all assets;
39
+ * may still be 0 early on (before first header is received)
40
+ */
41
+ export type ProgressCallback = (downloaded: number, total: number) => void;
31
42
  export declare const DEFAULT_WASM_PATHS = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/";
32
43
  export declare const DEFAULT_ORT_CDN_URL = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/ort.wasm.min.mjs";
33
44
  export declare const DEFAULT_MODEL_PATH = "https://huggingface.co/ComicScrip/mellon/resolve/main/model.onnx";
34
45
  export declare const DEFAULT_AUDIO_PROCESSOR_PATH = "https://cdn.jsdelivr.net/npm/mellon@0.0.14/dist/assets/audio-processor.js";
35
46
  export declare const DEFAULT_REFS_STORAGE_KEY = "mellon-refs";
36
47
  export declare const DEFAULT_THRESHOLD_STORAGE_KEY = "mellon-threshold";
37
- export { Mellon, EnrollmentSession };
48
+ export { Detector, EnrollmentSession, Storage, AudioUtils };
package/dist/mellon.cjs CHANGED
@@ -1 +1 @@
1
- "use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});function Dt(m){return m&&m.__esModule&&Object.prototype.hasOwnProperty.call(m,"default")?m.default:m}var lt,ut;function Ut(){if(ut)return lt;ut=1;function m(o){if(this.size=o|0,this.size<=1||(this.size&this.size-1)!==0)throw new Error("FFT size must be a power of two and bigger than 1");this._csize=o<<1;for(var t=new Array(this.size*2),s=0;s<t.length;s+=2){const i=Math.PI*s/this.size;t[s]=Math.cos(i),t[s+1]=-Math.sin(i)}this.table=t;for(var n=0,r=1;this.size>r;r<<=1)n++;this._width=n%2===0?n-1:n,this._bitrev=new Array(1<<this._width);for(var a=0;a<this._bitrev.length;a++){this._bitrev[a]=0;for(var e=0;e<this._width;e+=2){var h=this._width-e-2;this._bitrev[a]|=(a>>>e&3)<<h}}this._out=null,this._data=null,this._inv=0}return lt=m,m.prototype.fromComplexArray=function(t,s){for(var n=s||new Array(t.length>>>1),r=0;r<t.length;r+=2)n[r>>>1]=t[r];return n},m.prototype.createComplexArray=function(){const t=new Array(this._csize);for(var s=0;s<t.length;s++)t[s]=0;return t},m.prototype.toComplexArray=function(t,s){for(var n=s||this.createComplexArray(),r=0;r<n.length;r+=2)n[r]=t[r>>>1],n[r+1]=0;return n},m.prototype.completeSpectrum=function(t){for(var s=this._csize,n=s>>>1,r=2;r<n;r+=2)t[s-r]=t[r],t[s-r+1]=-t[r+1]},m.prototype.transform=function(t,s){if(t===s)throw new Error("Input and output buffers must be different");this._out=t,this._data=s,this._inv=0,this._transform4(),this._out=null,this._data=null},m.prototype.realTransform=function(t,s){if(t===s)throw new Error("Input and output buffers must be different");this._out=t,this._data=s,this._inv=0,this._realTransform4(),this._out=null,this._data=null},m.prototype.inverseTransform=function(t,s){if(t===s)throw new Error("Input and output buffers must be different");this._out=t,this._data=s,this._inv=1,this._transform4();for(var n=0;n<t.length;n++)t[n]/=this.size;this._out=null,this._data=null},m.prototype._transform4=function(){var 
t=this._out,s=this._csize,n=this._width,r=1<<n,a=s/r<<1,e,h,i=this._bitrev;if(a===4)for(e=0,h=0;e<s;e+=a,h++){const v=i[h];this._singleTransform2(e,v,r)}else for(e=0,h=0;e<s;e+=a,h++){const v=i[h];this._singleTransform4(e,v,r)}var l=this._inv?-1:1,c=this.table;for(r>>=2;r>=2;r>>=2){a=s/r<<1;var d=a>>>2;for(e=0;e<s;e+=a)for(var _=e+d,u=e,f=0;u<_;u+=2,f+=r){const v=u,p=v+d,g=p+d,w=g+d,T=t[v],F=t[v+1],A=t[p],y=t[p+1],b=t[g],S=t[g+1],P=t[w],C=t[w+1],E=T,R=F,M=c[f],D=l*c[f+1],U=A*M-y*D,z=A*D+y*M,j=c[2*f],L=l*c[2*f+1],B=b*j-S*L,K=b*L+S*j,N=c[3*f],O=l*c[3*f+1],k=P*N-C*O,G=P*O+C*N,$=E+B,x=R+K,I=E-B,Y=R-K,J=U+k,H=z+G,W=l*(U-k),Q=l*(z-G),tt=$+J,rt=x+H,et=$-J,ot=x-H,nt=I+Q,at=Y-W,it=I-Q,ct=Y+W;t[v]=tt,t[v+1]=rt,t[p]=nt,t[p+1]=at,t[g]=et,t[g+1]=ot,t[w]=it,t[w+1]=ct}}},m.prototype._singleTransform2=function(t,s,n){const r=this._out,a=this._data,e=a[s],h=a[s+1],i=a[s+n],l=a[s+n+1],c=e+i,d=h+l,_=e-i,u=h-l;r[t]=c,r[t+1]=d,r[t+2]=_,r[t+3]=u},m.prototype._singleTransform4=function(t,s,n){const r=this._out,a=this._data,e=this._inv?-1:1,h=n*2,i=n*3,l=a[s],c=a[s+1],d=a[s+n],_=a[s+n+1],u=a[s+h],f=a[s+h+1],v=a[s+i],p=a[s+i+1],g=l+u,w=c+f,T=l-u,F=c-f,A=d+v,y=_+p,b=e*(d-v),S=e*(_-p),P=g+A,C=w+y,E=T+S,R=F-b,M=g-A,D=w-y,U=T-S,z=F+b;r[t]=P,r[t+1]=C,r[t+2]=E,r[t+3]=R,r[t+4]=M,r[t+5]=D,r[t+6]=U,r[t+7]=z},m.prototype._realTransform4=function(){var t=this._out,s=this._csize,n=this._width,r=1<<n,a=s/r<<1,e,h,i=this._bitrev;if(a===4)for(e=0,h=0;e<s;e+=a,h++){const ht=i[h];this._singleRealTransform2(e,ht>>>1,r>>>1)}else for(e=0,h=0;e<s;e+=a,h++){const ht=i[h];this._singleRealTransform4(e,ht>>>1,r>>>1)}var l=this._inv?-1:1,c=this.table;for(r>>=2;r>=2;r>>=2){a=s/r<<1;var d=a>>>1,_=d>>>1,u=_>>>1;for(e=0;e<s;e+=a)for(var f=0,v=0;f<=u;f+=2,v+=r){var 
p=e+f,g=p+_,w=g+_,T=w+_,F=t[p],A=t[p+1],y=t[g],b=t[g+1],S=t[w],P=t[w+1],C=t[T],E=t[T+1],R=F,M=A,D=c[v],U=l*c[v+1],z=y*D-b*U,j=y*U+b*D,L=c[2*v],B=l*c[2*v+1],K=S*L-P*B,N=S*B+P*L,O=c[3*v],k=l*c[3*v+1],G=C*O-E*k,$=C*k+E*O,x=R+K,I=M+N,Y=R-K,J=M-N,H=z+G,W=j+$,Q=l*(z-G),tt=l*(j-$),rt=x+H,et=I+W,ot=Y+tt,nt=J-Q;if(t[p]=rt,t[p+1]=et,t[g]=ot,t[g+1]=nt,f===0){var at=x-H,it=I-W;t[w]=at,t[w+1]=it;continue}if(f!==u){var ct=Y,yt=-J,Tt=x,At=-I,Ft=-l*tt,bt=-l*Q,St=-l*W,Pt=-l*H,Ct=ct+Ft,Et=yt+bt,Rt=Tt+Pt,Mt=At-St,ft=e+_-f,vt=e+d-f;t[ft]=Ct,t[ft+1]=Et,t[vt]=Rt,t[vt+1]=Mt}}}},m.prototype._singleRealTransform2=function(t,s,n){const r=this._out,a=this._data,e=a[s],h=a[s+n],i=e+h,l=e-h;r[t]=i,r[t+1]=0,r[t+2]=l,r[t+3]=0},m.prototype._singleRealTransform4=function(t,s,n){const r=this._out,a=this._data,e=this._inv?-1:1,h=n*2,i=n*3,l=a[s],c=a[s+n],d=a[s+h],_=a[s+i],u=l+d,f=l-d,v=c+_,p=e*(c-_),g=u+v,w=f,T=-p,F=u-v,A=f,y=p;r[t]=g,r[t+1]=0,r[t+2]=w,r[t+3]=T,r[t+4]=F,r[t+5]=0,r[t+6]=A,r[t+7]=y},lt}var zt=Ut();const xt=Dt(zt);class pt{constructor(o=16e3,t=512,s=64){this.sampleRate=o,this.nfft=t,this.nfilt=s,this.fft=new xt(t),this.melFilters=this.createMelFilterbank()}hzToMel(o){return 2595*Math.log10(1+o/700)}melToHz(o){return 700*(10**(o/2595)-1)}createMelFilterbank(){const t=this.sampleRate/2,s=this.hzToMel(0),n=this.hzToMel(t),r=new Float32Array(this.nfilt+2);for(let i=0;i<this.nfilt+2;i++)r[i]=s+i*(n-s)/(this.nfilt+1);const e=r.map(i=>this.melToHz(i)).map(i=>Math.floor((this.nfft+1)*i/this.sampleRate)),h=[];for(let i=0;i<this.nfilt;i++){const l=new Float32Array(Math.floor(this.nfft/2)+1);for(let c=e[i];c<e[i+1];c++)l[c]=(c-e[i])/(e[i+1]-e[i]);for(let c=e[i+1];c<e[i+2];c++)l[c]=(e[i+2]-c)/(e[i+2]-e[i+1]);h.push(l)}return h}logfbank(o){const t=Math.floor(.025*this.sampleRate),s=Math.floor(.01*this.sampleRate),n=1+Math.ceil((o.length-t)/s),r=new Float32Array(n*this.nfilt),a=new Float32Array(this.nfft),e=this.fft.createComplexArray();for(let h=0;h<n;h++){const i=h*s;a.fill(0);for(let 
d=0;d<t&&i+d<o.length;d++)a[d]=o[i+d];const l=this.fft.toComplexArray(a,null);this.fft.transform(e,l);const c=new Float32Array(Math.floor(this.nfft/2)+1);for(let d=0;d<c.length;d++){const _=e[2*d],u=e[2*d+1];c[d]=1/this.nfft*(_*_+u*u),c[d]===0&&(c[d]=1e-30)}for(let d=0;d<this.nfilt;d++){let _=0;const u=this.melFilters[d];for(let f=0;f<c.length;f++)_+=c[f]*u[f];_===0&&(_=1e-30),r[h*this.nfilt+d]=Math.log(_)}}return r}}async function mt(m=Z,o=q){const t=await import(o);return t.env.wasm.wasmPaths=m,t.env.wasm.numThreads=1,t}let dt=null;async function _t(m=st,o=Z,t=q){return dt||(dt=mt(o,t).then(s=>s.InferenceSession.create(m,{executionProviders:["wasm"],graphOptimizationLevel:"all"}))),dt}const It=new pt;class X{constructor(o,t){this._started=!1,this._inferring=!1,this._audioCtx=null,this._stream=null,this._refEmbeddings=new Map,this._lastMatchAt=0,this._lastInferenceAt=0;const{refsStorageKey:s=V,thresholdStorageKey:n=wt,wasmPaths:r=Z,modelPath:a=st,audioProcessorPath:e=gt,ortCdnUrl:h=q}=t||{};this._commands=o,this._refsStorageKey=s,this._thresholdStorageKey=n,this._audioProcessorPath=e,this._wasmPaths=r,this._modelPath=a,this._ortCdnUrl=h;try{const i=localStorage.getItem(this._thresholdStorageKey);this._threshold=i!==null?Math.max(0,Math.min(1,Number(i))):.65}catch{this._threshold=.65}this._initPromise=this._init()}get threshold(){return this._threshold}set threshold(o){this._threshold=Math.max(0,Math.min(1,o));try{localStorage.setItem(this._thresholdStorageKey,String(this._threshold))}catch{}}get listening(){return this._started}async _init(){await _t(this._modelPath,this._wasmPaths,this._ortCdnUrl);const o=new Set;for(const t of this._commands)for(const s of t.triggers)if(!o.has(s.name)&&(o.add(s.name),s.defaultRefPath)){const n=await fetch(s.defaultRefPath);if(n.ok){const r=await n.json();this.addCustomWord(r)}}for(const t of X.loadWords(this._refsStorageKey))this._refEmbeddings.set(t.word_name,t.embeddings);console.info("[Mellon] init complete, loaded 
refs:",[...this._refEmbeddings.keys()])}async init(){await this._initPromise}addCustomWord(o){if(!(Array.isArray(o.embeddings)&&o.embeddings.length>0))throw new Error("invalid ref file for : "+o.word_name);this._refEmbeddings.set(o.word_name,o.embeddings)}async start(){if(this._started)return;await this._initPromise;let o;try{o=await navigator.mediaDevices.getUserMedia({audio:{noiseSuppression:!1,echoCancellation:!1,autoGainControl:!1,channelCount:1}})}catch{o=await navigator.mediaDevices.getUserMedia({audio:!0})}this._stream=o;const t=new AudioContext({sampleRate:16e3});this._audioCtx=t,await t.audioWorklet.addModule(this._audioProcessorPath);const s=t.createMediaStreamSource(o),n=new AudioWorkletNode(t,"audio-processor");n.port.onmessage=r=>{this._handleBuffer(r.data)},s.connect(n),n.connect(t.destination),this._started=!0}async stop(){if(this._started=!1,this._audioCtx&&(await this._audioCtx.close(),this._audioCtx=null),this._stream){for(const o of this._stream.getTracks())o.stop();this._stream=null}}async _handleBuffer(o){if(this._inferring)return;const t=Date.now();if(!(t-this._lastInferenceAt<300)){this._lastInferenceAt=t,this._inferring=!0;try{const[s,n]=await Promise.all([mt(this._wasmPaths,this._ortCdnUrl),_t(this._modelPath,this._wasmPaths,this._ortCdnUrl)]),r=It.logfbank(o),a=new s.Tensor("float32",r,[1,1,149,64]),e=await n.run({input:a}),h=e[Object.keys(e)[0]].data;let i=!1;for(const l of this._commands){if(i)break;for(const c of l.triggers){const d=this._refEmbeddings.get(c.name);if(!d)continue;const _=this._maxCosineSim(h,d);if(_>=this._threshold&&t-this._lastMatchAt>2e3){this._lastMatchAt=t,console.info(`[Mellon] match: "${c}" sim=${_.toFixed(3)}`),typeof l.onMatch=="function"&&l.onMatch(c.name,_),i=!0;break}}}}catch(s){console.error("[Mellon] inference error:",s)}finally{this._inferring=!1}}}_maxCosineSim(o,t){let s=0;for(const n of t){let r=0;for(let e=0;e<n.length;e++)r+=o[e]*n[e];const a=(r+1)/2;a>s&&(s=a)}return s}static loadWords(o=V){try{const 
t=localStorage.getItem(o);return t?JSON.parse(t):[]}catch{return[]}}static saveWord(o,t=V){const s=X.loadWords(t).filter(n=>n.word_name!==o.word_name);localStorage.setItem(t,JSON.stringify([...s,o]))}static deleteWord(o,t=V){try{const s=X.loadWords(t).filter(n=>n.word_name!==o);localStorage.setItem(t,JSON.stringify(s))}catch{}}}const Ht=new pt;class Wt{constructor(o,t){this._config={},this._samples=[],this._wordName=o,this._config.modelPath=(t==null?void 0:t.modelPath)||st,this._config.wasmPaths=(t==null?void 0:t.wasmPaths)||Z,this._config.ortCdnUrl=(t==null?void 0:t.ortCdnUrl)||q}async recordSample(){const o=await navigator.mediaDevices.getUserMedia({audio:!0}),t=new AudioContext({sampleRate:16e3}),s=await new Promise((a,e)=>{const h=new MediaRecorder(o),i=[];h.ondataavailable=l=>{l.data.size>0&&i.push(l.data)},h.onstop=async()=>{var l;for(const c of o.getTracks())c.stop();try{const d=await new Blob(i,{type:((l=i[0])==null?void 0:l.type)||"audio/webm"}).arrayBuffer(),_=await t.decodeAudioData(d);await t.close(),a(_.getChannelData(0).slice())}catch(c){e(c)}},h.start(),setTimeout(()=>{try{h.stop()}catch{}},1500)}),n=24e3,r=new Float32Array(n);return r.set(s.slice(0,n)),this._samples.push(r),this._samples.length}deleteSample(o){if(o<0||o>=this._samples.length)throw new RangeError(`index ${o} out of bounds (${this._samples.length} samples)`);return this._samples.splice(o,1),this._samples.length}async generateRef(){const[o,t]=await Promise.all([mt(this._config.wasmPaths,this._config.ortCdnUrl),_t(this._config.modelPath,this._config.wasmPaths,this._config.ortCdnUrl)]),s=[];for(const n of this._samples){const r=Ht.logfbank(n),a=new o.Tensor("float32",r,[1,1,149,64]),e=await t.run({input:a}),h=Array.from(e[Object.keys(e)[0]].data);s.push(h)}return{word_name:this._wordName,model_type:"resnet_50_arc",embeddings:s}}}const 
Z="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/",q="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/ort.wasm.min.mjs",st="https://huggingface.co/ComicScrip/mellon/resolve/main/model.onnx",gt="https://cdn.jsdelivr.net/npm/mellon@0.0.14/dist/assets/audio-processor.js",V="mellon-refs",wt="mellon-threshold";exports.DEFAULT_AUDIO_PROCESSOR_PATH=gt;exports.DEFAULT_MODEL_PATH=st;exports.DEFAULT_ORT_CDN_URL=q;exports.DEFAULT_REFS_STORAGE_KEY=V;exports.DEFAULT_THRESHOLD_STORAGE_KEY=wt;exports.DEFAULT_WASM_PATHS=Z;exports.EnrollmentSession=Wt;exports.Mellon=X;
1
+ "use strict";Object.defineProperty(exports,Symbol.toStringTag,{value:"Module"});function Ut(m){return m&&m.__esModule&&Object.prototype.hasOwnProperty.call(m,"default")?m.default:m}var dt,pt;function Dt(){if(pt)return dt;pt=1;function m(o){if(this.size=o|0,this.size<=1||(this.size&this.size-1)!==0)throw new Error("FFT size must be a power of two and bigger than 1");this._csize=o<<1;for(var t=new Array(this.size*2),s=0;s<t.length;s+=2){const l=Math.PI*s/this.size;t[s]=Math.cos(l),t[s+1]=-Math.sin(l)}this.table=t;for(var n=0,e=1;this.size>e;e<<=1)n++;this._width=n%2===0?n-1:n,this._bitrev=new Array(1<<this._width);for(var c=0;c<this._bitrev.length;c++){this._bitrev[c]=0;for(var r=0;r<this._width;r+=2){var h=this._width-r-2;this._bitrev[c]|=(c>>>r&3)<<h}}this._out=null,this._data=null,this._inv=0}return dt=m,m.prototype.fromComplexArray=function(t,s){for(var n=s||new Array(t.length>>>1),e=0;e<t.length;e+=2)n[e>>>1]=t[e];return n},m.prototype.createComplexArray=function(){const t=new Array(this._csize);for(var s=0;s<t.length;s++)t[s]=0;return t},m.prototype.toComplexArray=function(t,s){for(var n=s||this.createComplexArray(),e=0;e<n.length;e+=2)n[e]=t[e>>>1],n[e+1]=0;return n},m.prototype.completeSpectrum=function(t){for(var s=this._csize,n=s>>>1,e=2;e<n;e+=2)t[s-e]=t[e],t[s-e+1]=-t[e+1]},m.prototype.transform=function(t,s){if(t===s)throw new Error("Input and output buffers must be different");this._out=t,this._data=s,this._inv=0,this._transform4(),this._out=null,this._data=null},m.prototype.realTransform=function(t,s){if(t===s)throw new Error("Input and output buffers must be different");this._out=t,this._data=s,this._inv=0,this._realTransform4(),this._out=null,this._data=null},m.prototype.inverseTransform=function(t,s){if(t===s)throw new Error("Input and output buffers must be different");this._out=t,this._data=s,this._inv=1,this._transform4();for(var n=0;n<t.length;n++)t[n]/=this.size;this._out=null,this._data=null},m.prototype._transform4=function(){var 
t=this._out,s=this._csize,n=this._width,e=1<<n,c=s/e<<1,r,h,l=this._bitrev;if(c===4)for(r=0,h=0;r<s;r+=c,h++){const u=l[h];this._singleTransform2(r,u,e)}else for(r=0,h=0;r<s;r+=c,h++){const u=l[h];this._singleTransform4(r,u,e)}var i=this._inv?-1:1,a=this.table;for(e>>=2;e>=2;e>>=2){c=s/e<<1;var d=c>>>2;for(r=0;r<s;r+=c)for(var _=r+d,v=r,f=0;v<_;v+=2,f+=e){const u=v,p=u+d,w=p+d,g=w+d,T=t[u],A=t[u+1],b=t[p],y=t[p+1],F=t[w],S=t[w+1],C=t[g],E=t[g+1],M=T,P=A,R=a[f],U=i*a[f+1],D=b*R-y*U,z=b*U+y*R,k=a[2*f],N=i*a[2*f+1],B=F*k-S*N,j=F*N+S*k,K=a[3*f],O=i*a[3*f+1],$=C*K-E*O,G=C*O+E*K,J=M+B,x=P+j,I=M-B,Y=P-j,Q=D+$,L=z+G,W=i*(D-$),V=i*(z-G),tt=J+Q,rt=x+L,ot=J-Q,nt=x-L,at=I+V,it=Y-W,ct=I-V,lt=Y+W;t[u]=tt,t[u+1]=rt,t[p]=at,t[p+1]=it,t[w]=ot,t[w+1]=nt,t[g]=ct,t[g+1]=lt}}},m.prototype._singleTransform2=function(t,s,n){const e=this._out,c=this._data,r=c[s],h=c[s+1],l=c[s+n],i=c[s+n+1],a=r+l,d=h+i,_=r-l,v=h-i;e[t]=a,e[t+1]=d,e[t+2]=_,e[t+3]=v},m.prototype._singleTransform4=function(t,s,n){const e=this._out,c=this._data,r=this._inv?-1:1,h=n*2,l=n*3,i=c[s],a=c[s+1],d=c[s+n],_=c[s+n+1],v=c[s+h],f=c[s+h+1],u=c[s+l],p=c[s+l+1],w=i+v,g=a+f,T=i-v,A=a-f,b=d+u,y=_+p,F=r*(d-u),S=r*(_-p),C=w+b,E=g+y,M=T+S,P=A-F,R=w-b,U=g-y,D=T-S,z=A+F;e[t]=C,e[t+1]=E,e[t+2]=M,e[t+3]=P,e[t+4]=R,e[t+5]=U,e[t+6]=D,e[t+7]=z},m.prototype._realTransform4=function(){var t=this._out,s=this._csize,n=this._width,e=1<<n,c=s/e<<1,r,h,l=this._bitrev;if(c===4)for(r=0,h=0;r<s;r+=c,h++){const ht=l[h];this._singleRealTransform2(r,ht>>>1,e>>>1)}else for(r=0,h=0;r<s;r+=c,h++){const ht=l[h];this._singleRealTransform4(r,ht>>>1,e>>>1)}var i=this._inv?-1:1,a=this.table;for(e>>=2;e>=2;e>>=2){c=s/e<<1;var d=c>>>1,_=d>>>1,v=_>>>1;for(r=0;r<s;r+=c)for(var f=0,u=0;f<=v;f+=2,u+=e){var 
p=r+f,w=p+_,g=w+_,T=g+_,A=t[p],b=t[p+1],y=t[w],F=t[w+1],S=t[g],C=t[g+1],E=t[T],M=t[T+1],P=A,R=b,U=a[u],D=i*a[u+1],z=y*U-F*D,k=y*D+F*U,N=a[2*u],B=i*a[2*u+1],j=S*N-C*B,K=S*B+C*N,O=a[3*u],$=i*a[3*u+1],G=E*O-M*$,J=E*$+M*O,x=P+j,I=R+K,Y=P-j,Q=R-K,L=z+G,W=k+J,V=i*(z-G),tt=i*(k-J),rt=x+L,ot=I+W,nt=Y+tt,at=Q-V;if(t[p]=rt,t[p+1]=ot,t[w]=nt,t[w+1]=at,f===0){var it=x-L,ct=I-W;t[g]=it,t[g+1]=ct;continue}if(f!==v){var lt=Y,yt=-Q,Tt=x,bt=-I,At=-i*tt,Ft=-i*V,St=-i*W,Ct=-i*L,Et=lt+At,Mt=yt+Ft,Pt=Tt+Ct,Rt=bt-St,ut=r+_-f,vt=r+d-f;t[ut]=Et,t[ut+1]=Mt,t[vt]=Pt,t[vt+1]=Rt}}}},m.prototype._singleRealTransform2=function(t,s,n){const e=this._out,c=this._data,r=c[s],h=c[s+n],l=r+h,i=r-h;e[t]=l,e[t+1]=0,e[t+2]=i,e[t+3]=0},m.prototype._singleRealTransform4=function(t,s,n){const e=this._out,c=this._data,r=this._inv?-1:1,h=n*2,l=n*3,i=c[s],a=c[s+n],d=c[s+h],_=c[s+l],v=i+d,f=i-d,u=a+_,p=r*(a-_),w=v+u,g=f,T=-p,A=v-u,b=f,y=p;e[t]=w,e[t+1]=0,e[t+2]=g,e[t+3]=T,e[t+4]=A,e[t+5]=0,e[t+6]=b,e[t+7]=y},dt}var zt=Dt();const xt=Ut(zt);class ft{constructor(o=16e3,t=512,s=64){this._sampleRate=o,this._nfft=t,this._nfilt=s,this._fft=new xt(t),this._melFilters=this._createMelFilterbank()}_hzToMel(o){return 2595*Math.log10(1+o/700)}_melToHz(o){return 700*(10**(o/2595)-1)}_createMelFilterbank(){const t=this._sampleRate/2,s=this._hzToMel(0),n=this._hzToMel(t),e=new Float32Array(this._nfilt+2);for(let l=0;l<this._nfilt+2;l++)e[l]=s+l*(n-s)/(this._nfilt+1);const r=e.map(l=>this._melToHz(l)).map(l=>Math.floor((this._nfft+1)*l/this._sampleRate)),h=[];for(let l=0;l<this._nfilt;l++){const i=new Float32Array(Math.floor(this._nfft/2)+1);for(let a=r[l];a<r[l+1];a++)i[a]=(a-r[l])/(r[l+1]-r[l]);for(let a=r[l+1];a<r[l+2];a++)i[a]=(r[l+2]-a)/(r[l+2]-r[l+1]);h.push(i)}return h}logfbank(o){const t=Math.floor(.025*this._sampleRate),s=Math.floor(.01*this._sampleRate),n=1+Math.ceil((o.length-t)/s),e=new Float32Array(n*this._nfilt),c=new Float32Array(this._nfft),r=this._fft.createComplexArray();for(let h=0;h<n;h++){const 
l=h*s;c.fill(0);for(let d=0;d<t&&l+d<o.length;d++)c[d]=o[l+d];const i=this._fft.toComplexArray(c,null);this._fft.transform(r,i);const a=new Float32Array(Math.floor(this._nfft/2)+1);for(let d=0;d<a.length;d++){const _=r[2*d],v=r[2*d+1];a[d]=1/this._nfft*(_*_+v*v),a[d]===0&&(a[d]=1e-30)}for(let d=0;d<this._nfilt;d++){let _=0;const v=this._melFilters[d];for(let f=0;f<a.length;f++)_+=a[f]*v[f];_===0&&(_=1e-30),e[h*this._nfilt+d]=Math.log(_)}}return e}maxCosineSim(o,t){let s=0;for(const n of t){let e=0;for(let r=0;r<n.length;r++)e+=o[r]*n[r];const c=(e+1)/2;c>s&&(s=c)}return s}}async function st(m=Z,o=q){const t=await import(o);return t.env.wasm.wasmPaths=m,t.env.wasm.numThreads=1,t}let _t=null;async function mt(m=et,o=Z,t=q,s){return _t||(_t=st(o,t).then(n=>s?n.InferenceSession.create(new Uint8Array(s),{executionProviders:["wasm"],graphOptimizationLevel:"all"}):n.InferenceSession.create(m,{executionProviders:["wasm"],graphOptimizationLevel:"all"}))),_t}class H{static loadWords(o=X){try{const t=localStorage.getItem(o);return t?JSON.parse(t):[]}catch{return[]}}static saveWord(o,t=X){const s=H.loadWords(t).filter(n=>n.word_name!==o.word_name);localStorage.setItem(t,JSON.stringify([...s,o]))}static deleteWord(o,t=X){try{const s=H.loadWords(t).filter(n=>n.word_name!==o);localStorage.setItem(t,JSON.stringify(s))}catch{}}}class It{constructor(o,t){this._started=!1,this._inferring=!1,this._audioCtx=null,this._stream=null,this._refEmbeddings=new Map,this._lastMatchAt=0,this._lastInferenceAt=0,this._initPromise=null;const{refsStorageKey:s=X,thresholdStorageKey:n=gt,wasmPaths:e=Z,modelPath:c=et,audioProcessorPath:r=wt,ortCdnUrl:h=q,audioUtils:l=new ft}=t||{};this._audioUtils=l,this._commands=o,this._refsStorageKey=s,this._thresholdStorageKey=n,this._audioProcessorPath=r,this._wasmPaths=e,this._modelPath=c,this._ortCdnUrl=h;try{const i=localStorage.getItem(this._thresholdStorageKey);this._threshold=i!==null?Math.max(0,Math.min(1,Number(i))):.65}catch{this._threshold=.65}}get 
threshold(){return this._threshold}set threshold(o){this._threshold=Math.max(0,Math.min(1,o));try{localStorage.setItem(this._thresholdStorageKey,String(this._threshold))}catch{}}get listening(){return this._started}async _trackFetch(o,t,s){const n=await fetch(o);if(!n.ok)throw new Error(`HTTP ${n.status} fetching ${o}`);const e=Number(n.headers.get("content-length")??"0");if(e>0&&(s.total+=e),!n.body){const a=await n.arrayBuffer();return s.downloaded+=a.byteLength,e||(s.total+=a.byteLength),t==null||t(s.downloaded,s.total),a}const c=n.body.getReader(),r=[];let h=0;for(;;){const{done:a,value:d}=await c.read();if(a)break;r.push(d),h+=d.length,s.downloaded+=d.length,t==null||t(s.downloaded,s.total)}e||(s.total+=h);const l=new Uint8Array(h);let i=0;for(const a of r)l.set(a,i),i+=a.length;return l.buffer}async _init(o){const t={downloaded:0,total:0},s=new Set,n=[];for(const i of this._commands)for(const a of i.triggers)!s.has(a.name)&&a.defaultRefPath&&(s.add(a.name),n.push({name:a.name,path:a.defaultRefPath}));const e=st(this._wasmPaths,this._ortCdnUrl),[c,...r]=await Promise.all([this._trackFetch(this._modelPath,o,t),...n.map(({path:i})=>this._trackFetch(i,o,t))]);await e,await mt(this._modelPath,this._wasmPaths,this._ortCdnUrl,c);const h=H.loadWords(this._refsStorageKey),l=new Set(h.map(i=>i.word_name));for(let i=0;i<n.length;i++)try{const a=JSON.parse(new TextDecoder().decode(r[i]));this.addCustomWord(a),l.has(a.word_name)||H.saveWord(a,this._refsStorageKey)}catch{console.warn(`[Mellon] failed to parse ref file: ${n[i].path}`)}for(const i of h)this._refEmbeddings.set(i.word_name,i.embeddings);console.info("[Mellon] init complete, loaded refs:",[...this._refEmbeddings.keys()])}async init(o){this._initPromise||(this._initPromise=this._init(o)),await this._initPromise}addCustomWord(o){if(!(Array.isArray(o.embeddings)&&o.embeddings.length>0))throw new Error("invalid ref file for : "+o.word_name);this._refEmbeddings.set(o.word_name,o.embeddings)}async 
start(){if(this._started)return;await this.init();let o;try{o=await navigator.mediaDevices.getUserMedia({audio:{noiseSuppression:!1,echoCancellation:!1,autoGainControl:!1,channelCount:1}})}catch{o=await navigator.mediaDevices.getUserMedia({audio:!0})}this._stream=o;const t=new AudioContext({sampleRate:16e3});this._audioCtx=t,await t.audioWorklet.addModule(this._audioProcessorPath);const s=t.createMediaStreamSource(o),n=new AudioWorkletNode(t,"audio-processor");n.port.onmessage=e=>{this._handleBuffer(e.data)},s.connect(n),n.connect(t.destination),this._started=!0}async stop(){if(this._started=!1,this._audioCtx&&(await this._audioCtx.close(),this._audioCtx=null),this._stream){for(const o of this._stream.getTracks())o.stop();this._stream=null}}async _handleBuffer(o){if(this._inferring)return;const t=Date.now();if(!(t-this._lastInferenceAt<300)){this._lastInferenceAt=t,this._inferring=!0;try{const[s,n]=await Promise.all([st(this._wasmPaths,this._ortCdnUrl),mt(this._modelPath,this._wasmPaths,this._ortCdnUrl)]),e=this._audioUtils.logfbank(o),c=new s.Tensor("float32",e,[1,1,149,64]),r=await n.run({input:c}),h=r[Object.keys(r)[0]].data;let l=!1;for(const i of this._commands){if(l)break;for(const a of i.triggers){const d=this._refEmbeddings.get(a.name);if(!d)continue;const _=this._audioUtils.maxCosineSim(h,d);if(_>=this._threshold&&t-this._lastMatchAt>2e3){this._lastMatchAt=t,console.info(`[Mellon] match: "${a}" sim=${_.toFixed(3)}`),typeof i.onMatch=="function"&&i.onMatch(a.name,_),l=!0;break}}}}catch(s){console.error("[Mellon] inference error:",s)}finally{this._inferring=!1}}}}class Lt{constructor(o,t){this._config={},this._samples=[],this._wordName=o,this._config.modelPath=(t==null?void 0:t.modelPath)||et,this._config.wasmPaths=(t==null?void 0:t.wasmPaths)||Z,this._config.ortCdnUrl=(t==null?void 0:t.ortCdnUrl)||q,this._audioUtils=(t==null?void 0:t.audioUtils)??new ft}async recordSample(){const o=await navigator.mediaDevices.getUserMedia({audio:!0}),t=new 
AudioContext({sampleRate:16e3}),s=await new Promise((c,r)=>{const h=new MediaRecorder(o),l=[];h.ondataavailable=i=>{i.data.size>0&&l.push(i.data)},h.onstop=async()=>{var i;for(const a of o.getTracks())a.stop();try{const d=await new Blob(l,{type:((i=l[0])==null?void 0:i.type)||"audio/webm"}).arrayBuffer(),_=await t.decodeAudioData(d);await t.close(),c(_.getChannelData(0).slice())}catch(a){r(a)}},h.start(),setTimeout(()=>{try{h.stop()}catch{}},1500)}),n=24e3,e=new Float32Array(n);return e.set(s.slice(0,n)),this._samples.push(e),this._samples.length}deleteSample(o){if(o<0||o>=this._samples.length)throw new RangeError(`index ${o} out of bounds (${this._samples.length} samples)`);return this._samples.splice(o,1),this._samples.length}async generateRef(){const[o,t]=await Promise.all([st(this._config.wasmPaths,this._config.ortCdnUrl),mt(this._config.modelPath,this._config.wasmPaths,this._config.ortCdnUrl)]),s=[];for(const n of this._samples){const e=this._audioUtils.logfbank(n),c=new o.Tensor("float32",e,[1,1,149,64]),r=await t.run({input:c}),h=Array.from(r[Object.keys(r)[0]].data);s.push(h)}return{word_name:this._wordName,model_type:"resnet_50_arc",embeddings:s}}}const Z="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/",q="https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/ort.wasm.min.mjs",et="https://huggingface.co/ComicScrip/mellon/resolve/main/model.onnx",wt="https://cdn.jsdelivr.net/npm/mellon@0.0.14/dist/assets/audio-processor.js",X="mellon-refs",gt="mellon-threshold";exports.AudioUtils=ft;exports.DEFAULT_AUDIO_PROCESSOR_PATH=wt;exports.DEFAULT_MODEL_PATH=et;exports.DEFAULT_ORT_CDN_URL=q;exports.DEFAULT_REFS_STORAGE_KEY=X;exports.DEFAULT_THRESHOLD_STORAGE_KEY=gt;exports.DEFAULT_WASM_PATHS=Z;exports.Detector=It;exports.EnrollmentSession=Lt;exports.Storage=H;
package/dist/mellon.mjs CHANGED
@@ -1,217 +1,250 @@
1
- function Rt(_) {
2
- return _ && _.__esModule && Object.prototype.hasOwnProperty.call(_, "default") ? _.default : _;
1
+ function Rt(m) {
2
+ return m && m.__esModule && Object.prototype.hasOwnProperty.call(m, "default") ? m.default : m;
3
3
  }
4
- var ht, ut;
5
- function Et() {
6
- if (ut) return ht;
7
- ut = 1;
8
- function _(o) {
4
+ var lt, vt;
5
+ function Ut() {
6
+ if (vt) return lt;
7
+ vt = 1;
8
+ function m(o) {
9
9
  if (this.size = o | 0, this.size <= 1 || (this.size & this.size - 1) !== 0)
10
10
  throw new Error("FFT size must be a power of two and bigger than 1");
11
11
  this._csize = o << 1;
12
12
  for (var t = new Array(this.size * 2), s = 0; s < t.length; s += 2) {
13
- const i = Math.PI * s / this.size;
14
- t[s] = Math.cos(i), t[s + 1] = -Math.sin(i);
13
+ const h = Math.PI * s / this.size;
14
+ t[s] = Math.cos(h), t[s + 1] = -Math.sin(h);
15
15
  }
16
16
  this.table = t;
17
- for (var n = 0, r = 1; this.size > r; r <<= 1)
17
+ for (var n = 0, e = 1; this.size > e; e <<= 1)
18
18
  n++;
19
19
  this._width = n % 2 === 0 ? n - 1 : n, this._bitrev = new Array(1 << this._width);
20
- for (var a = 0; a < this._bitrev.length; a++) {
21
- this._bitrev[a] = 0;
22
- for (var e = 0; e < this._width; e += 2) {
23
- var h = this._width - e - 2;
24
- this._bitrev[a] |= (a >>> e & 3) << h;
20
+ for (var c = 0; c < this._bitrev.length; c++) {
21
+ this._bitrev[c] = 0;
22
+ for (var r = 0; r < this._width; r += 2) {
23
+ var l = this._width - r - 2;
24
+ this._bitrev[c] |= (c >>> r & 3) << l;
25
25
  }
26
26
  }
27
27
  this._out = null, this._data = null, this._inv = 0;
28
28
  }
29
- return ht = _, _.prototype.fromComplexArray = function(t, s) {
30
- for (var n = s || new Array(t.length >>> 1), r = 0; r < t.length; r += 2)
31
- n[r >>> 1] = t[r];
29
+ return lt = m, m.prototype.fromComplexArray = function(t, s) {
30
+ for (var n = s || new Array(t.length >>> 1), e = 0; e < t.length; e += 2)
31
+ n[e >>> 1] = t[e];
32
32
  return n;
33
- }, _.prototype.createComplexArray = function() {
33
+ }, m.prototype.createComplexArray = function() {
34
34
  const t = new Array(this._csize);
35
35
  for (var s = 0; s < t.length; s++)
36
36
  t[s] = 0;
37
37
  return t;
38
- }, _.prototype.toComplexArray = function(t, s) {
39
- for (var n = s || this.createComplexArray(), r = 0; r < n.length; r += 2)
40
- n[r] = t[r >>> 1], n[r + 1] = 0;
38
+ }, m.prototype.toComplexArray = function(t, s) {
39
+ for (var n = s || this.createComplexArray(), e = 0; e < n.length; e += 2)
40
+ n[e] = t[e >>> 1], n[e + 1] = 0;
41
41
  return n;
42
- }, _.prototype.completeSpectrum = function(t) {
43
- for (var s = this._csize, n = s >>> 1, r = 2; r < n; r += 2)
44
- t[s - r] = t[r], t[s - r + 1] = -t[r + 1];
45
- }, _.prototype.transform = function(t, s) {
42
+ }, m.prototype.completeSpectrum = function(t) {
43
+ for (var s = this._csize, n = s >>> 1, e = 2; e < n; e += 2)
44
+ t[s - e] = t[e], t[s - e + 1] = -t[e + 1];
45
+ }, m.prototype.transform = function(t, s) {
46
46
  if (t === s)
47
47
  throw new Error("Input and output buffers must be different");
48
48
  this._out = t, this._data = s, this._inv = 0, this._transform4(), this._out = null, this._data = null;
49
- }, _.prototype.realTransform = function(t, s) {
49
+ }, m.prototype.realTransform = function(t, s) {
50
50
  if (t === s)
51
51
  throw new Error("Input and output buffers must be different");
52
52
  this._out = t, this._data = s, this._inv = 0, this._realTransform4(), this._out = null, this._data = null;
53
- }, _.prototype.inverseTransform = function(t, s) {
53
+ }, m.prototype.inverseTransform = function(t, s) {
54
54
  if (t === s)
55
55
  throw new Error("Input and output buffers must be different");
56
56
  this._out = t, this._data = s, this._inv = 1, this._transform4();
57
57
  for (var n = 0; n < t.length; n++)
58
58
  t[n] /= this.size;
59
59
  this._out = null, this._data = null;
60
- }, _.prototype._transform4 = function() {
61
- var t = this._out, s = this._csize, n = this._width, r = 1 << n, a = s / r << 1, e, h, i = this._bitrev;
62
- if (a === 4)
63
- for (e = 0, h = 0; e < s; e += a, h++) {
64
- const v = i[h];
65
- this._singleTransform2(e, v, r);
60
+ }, m.prototype._transform4 = function() {
61
+ var t = this._out, s = this._csize, n = this._width, e = 1 << n, c = s / e << 1, r, l, h = this._bitrev;
62
+ if (c === 4)
63
+ for (r = 0, l = 0; r < s; r += c, l++) {
64
+ const u = h[l];
65
+ this._singleTransform2(r, u, e);
66
66
  }
67
67
  else
68
- for (e = 0, h = 0; e < s; e += a, h++) {
69
- const v = i[h];
70
- this._singleTransform4(e, v, r);
68
+ for (r = 0, l = 0; r < s; r += c, l++) {
69
+ const u = h[l];
70
+ this._singleTransform4(r, u, e);
71
71
  }
72
- var l = this._inv ? -1 : 1, c = this.table;
73
- for (r >>= 2; r >= 2; r >>= 2) {
74
- a = s / r << 1;
75
- var d = a >>> 2;
76
- for (e = 0; e < s; e += a)
77
- for (var m = e + d, u = e, f = 0; u < m; u += 2, f += r) {
78
- const v = u, p = v + d, w = p + d, g = w + d, b = t[v], T = t[v + 1], F = t[p], y = t[p + 1], A = t[w], C = t[w + 1], P = t[g], M = t[g + 1], S = b, R = T, E = c[f], D = l * c[f + 1], z = F * E - y * D, x = F * D + y * E, B = c[2 * f], N = l * c[2 * f + 1], H = A * B - C * N, K = A * N + C * B, k = c[3 * f], $ = l * c[3 * f + 1], G = P * k - M * $, J = P * $ + M * k, L = S + H, U = R + K, I = S - H, Y = R - K, Q = z + G, W = x + J, j = l * (z - G), V = l * (x - J), X = L + Q, st = U + W, rt = L - Q, et = U - W, ot = I + V, nt = Y - j, at = I - V, it = Y + j;
79
- t[v] = X, t[v + 1] = st, t[p] = ot, t[p + 1] = nt, t[w] = rt, t[w + 1] = et, t[g] = at, t[g + 1] = it;
72
+ var i = this._inv ? -1 : 1, a = this.table;
73
+ for (e >>= 2; e >= 2; e >>= 2) {
74
+ c = s / e << 1;
75
+ var d = c >>> 2;
76
+ for (r = 0; r < s; r += c)
77
+ for (var _ = r + d, v = r, f = 0; v < _; v += 2, f += e) {
78
+ const u = v, p = u + d, w = p + d, g = w + d, b = t[u], T = t[u + 1], F = t[p], y = t[p + 1], A = t[w], C = t[w + 1], M = t[g], S = t[g + 1], P = b, R = T, U = a[f], E = i * a[f + 1], D = F * U - y * E, x = F * E + y * U, B = a[2 * f], N = i * a[2 * f + 1], j = A * B - C * N, H = A * N + C * B, K = a[3 * f], $ = i * a[3 * f + 1], L = M * K - S * $, J = M * $ + S * K, G = P + j, z = R + H, I = P - j, Y = R - H, O = D + L, W = x + J, k = i * (D - L), Q = i * (x - J), X = G + O, et = z + W, rt = G - O, ot = z - W, nt = I + Q, at = Y - k, it = I - Q, ct = Y + k;
79
+ t[u] = X, t[u + 1] = et, t[p] = nt, t[p + 1] = at, t[w] = rt, t[w + 1] = ot, t[g] = it, t[g + 1] = ct;
80
80
  }
81
81
  }
82
- }, _.prototype._singleTransform2 = function(t, s, n) {
83
- const r = this._out, a = this._data, e = a[s], h = a[s + 1], i = a[s + n], l = a[s + n + 1], c = e + i, d = h + l, m = e - i, u = h - l;
84
- r[t] = c, r[t + 1] = d, r[t + 2] = m, r[t + 3] = u;
85
- }, _.prototype._singleTransform4 = function(t, s, n) {
86
- const r = this._out, a = this._data, e = this._inv ? -1 : 1, h = n * 2, i = n * 3, l = a[s], c = a[s + 1], d = a[s + n], m = a[s + n + 1], u = a[s + h], f = a[s + h + 1], v = a[s + i], p = a[s + i + 1], w = l + u, g = c + f, b = l - u, T = c - f, F = d + v, y = m + p, A = e * (d - v), C = e * (m - p), P = w + F, M = g + y, S = b + C, R = T - A, E = w - F, D = g - y, z = b - C, x = T + A;
87
- r[t] = P, r[t + 1] = M, r[t + 2] = S, r[t + 3] = R, r[t + 4] = E, r[t + 5] = D, r[t + 6] = z, r[t + 7] = x;
88
- }, _.prototype._realTransform4 = function() {
89
- var t = this._out, s = this._csize, n = this._width, r = 1 << n, a = s / r << 1, e, h, i = this._bitrev;
90
- if (a === 4)
91
- for (e = 0, h = 0; e < s; e += a, h++) {
92
- const ct = i[h];
93
- this._singleRealTransform2(e, ct >>> 1, r >>> 1);
82
+ }, m.prototype._singleTransform2 = function(t, s, n) {
83
+ const e = this._out, c = this._data, r = c[s], l = c[s + 1], h = c[s + n], i = c[s + n + 1], a = r + h, d = l + i, _ = r - h, v = l - i;
84
+ e[t] = a, e[t + 1] = d, e[t + 2] = _, e[t + 3] = v;
85
+ }, m.prototype._singleTransform4 = function(t, s, n) {
86
+ const e = this._out, c = this._data, r = this._inv ? -1 : 1, l = n * 2, h = n * 3, i = c[s], a = c[s + 1], d = c[s + n], _ = c[s + n + 1], v = c[s + l], f = c[s + l + 1], u = c[s + h], p = c[s + h + 1], w = i + v, g = a + f, b = i - v, T = a - f, F = d + u, y = _ + p, A = r * (d - u), C = r * (_ - p), M = w + F, S = g + y, P = b + C, R = T - A, U = w - F, E = g - y, D = b - C, x = T + A;
87
+ e[t] = M, e[t + 1] = S, e[t + 2] = P, e[t + 3] = R, e[t + 4] = U, e[t + 5] = E, e[t + 6] = D, e[t + 7] = x;
88
+ }, m.prototype._realTransform4 = function() {
89
+ var t = this._out, s = this._csize, n = this._width, e = 1 << n, c = s / e << 1, r, l, h = this._bitrev;
90
+ if (c === 4)
91
+ for (r = 0, l = 0; r < s; r += c, l++) {
92
+ const ht = h[l];
93
+ this._singleRealTransform2(r, ht >>> 1, e >>> 1);
94
94
  }
95
95
  else
96
- for (e = 0, h = 0; e < s; e += a, h++) {
97
- const ct = i[h];
98
- this._singleRealTransform4(e, ct >>> 1, r >>> 1);
96
+ for (r = 0, l = 0; r < s; r += c, l++) {
97
+ const ht = h[l];
98
+ this._singleRealTransform4(r, ht >>> 1, e >>> 1);
99
99
  }
100
- var l = this._inv ? -1 : 1, c = this.table;
101
- for (r >>= 2; r >= 2; r >>= 2) {
102
- a = s / r << 1;
103
- var d = a >>> 1, m = d >>> 1, u = m >>> 1;
104
- for (e = 0; e < s; e += a)
105
- for (var f = 0, v = 0; f <= u; f += 2, v += r) {
106
- var p = e + f, w = p + m, g = w + m, b = g + m, T = t[p], F = t[p + 1], y = t[w], A = t[w + 1], C = t[g], P = t[g + 1], M = t[b], S = t[b + 1], R = T, E = F, D = c[v], z = l * c[v + 1], x = y * D - A * z, B = y * z + A * D, N = c[2 * v], H = l * c[2 * v + 1], K = C * N - P * H, k = C * H + P * N, $ = c[3 * v], G = l * c[3 * v + 1], J = M * $ - S * G, L = M * G + S * $, U = R + K, I = E + k, Y = R - K, Q = E - k, W = x + J, j = B + L, V = l * (x - J), X = l * (B - L), st = U + W, rt = I + j, et = Y + X, ot = Q - V;
107
- if (t[p] = st, t[p + 1] = rt, t[w] = et, t[w + 1] = ot, f === 0) {
108
- var nt = U - W, at = I - j;
109
- t[g] = nt, t[g + 1] = at;
100
+ var i = this._inv ? -1 : 1, a = this.table;
101
+ for (e >>= 2; e >= 2; e >>= 2) {
102
+ c = s / e << 1;
103
+ var d = c >>> 1, _ = d >>> 1, v = _ >>> 1;
104
+ for (r = 0; r < s; r += c)
105
+ for (var f = 0, u = 0; f <= v; f += 2, u += e) {
106
+ var p = r + f, w = p + _, g = w + _, b = g + _, T = t[p], F = t[p + 1], y = t[w], A = t[w + 1], C = t[g], M = t[g + 1], S = t[b], P = t[b + 1], R = T, U = F, E = a[u], D = i * a[u + 1], x = y * E - A * D, B = y * D + A * E, N = a[2 * u], j = i * a[2 * u + 1], H = C * N - M * j, K = C * j + M * N, $ = a[3 * u], L = i * a[3 * u + 1], J = S * $ - P * L, G = S * L + P * $, z = R + H, I = U + K, Y = R - H, O = U - K, W = x + J, k = B + G, Q = i * (x - J), X = i * (B - G), et = z + W, rt = I + k, ot = Y + X, nt = O - Q;
107
+ if (t[p] = et, t[p + 1] = rt, t[w] = ot, t[w + 1] = nt, f === 0) {
108
+ var at = z - W, it = I - k;
109
+ t[g] = at, t[g + 1] = it;
110
110
  continue;
111
111
  }
112
- if (f !== u) {
113
- var it = Y, wt = -Q, gt = U, yt = -I, bt = -l * X, Ft = -l * V, Tt = -l * j, At = -l * W, Ct = it + bt, Pt = wt + Ft, Mt = gt + At, St = yt - Tt, ft = e + m - f, vt = e + d - f;
114
- t[ft] = Ct, t[ft + 1] = Pt, t[vt] = Mt, t[vt + 1] = St;
112
+ if (f !== v) {
113
+ var ct = Y, wt = -O, gt = z, yt = -I, bt = -i * X, Ft = -i * Q, Tt = -i * k, At = -i * W, Ct = ct + bt, Mt = wt + Ft, St = gt + At, Pt = yt - Tt, ft = r + _ - f, ut = r + d - f;
114
+ t[ft] = Ct, t[ft + 1] = Mt, t[ut] = St, t[ut + 1] = Pt;
115
115
  }
116
116
  }
117
117
  }
118
- }, _.prototype._singleRealTransform2 = function(t, s, n) {
119
- const r = this._out, a = this._data, e = a[s], h = a[s + n], i = e + h, l = e - h;
120
- r[t] = i, r[t + 1] = 0, r[t + 2] = l, r[t + 3] = 0;
121
- }, _.prototype._singleRealTransform4 = function(t, s, n) {
122
- const r = this._out, a = this._data, e = this._inv ? -1 : 1, h = n * 2, i = n * 3, l = a[s], c = a[s + n], d = a[s + h], m = a[s + i], u = l + d, f = l - d, v = c + m, p = e * (c - m), w = u + v, g = f, b = -p, T = u - v, F = f, y = p;
123
- r[t] = w, r[t + 1] = 0, r[t + 2] = g, r[t + 3] = b, r[t + 4] = T, r[t + 5] = 0, r[t + 6] = F, r[t + 7] = y;
124
- }, ht;
118
+ }, m.prototype._singleRealTransform2 = function(t, s, n) {
119
+ const e = this._out, c = this._data, r = c[s], l = c[s + n], h = r + l, i = r - l;
120
+ e[t] = h, e[t + 1] = 0, e[t + 2] = i, e[t + 3] = 0;
121
+ }, m.prototype._singleRealTransform4 = function(t, s, n) {
122
+ const e = this._out, c = this._data, r = this._inv ? -1 : 1, l = n * 2, h = n * 3, i = c[s], a = c[s + n], d = c[s + l], _ = c[s + h], v = i + d, f = i - d, u = a + _, p = r * (a - _), w = v + u, g = f, b = -p, T = v - u, F = f, y = p;
123
+ e[t] = w, e[t + 1] = 0, e[t + 2] = g, e[t + 3] = b, e[t + 4] = T, e[t + 5] = 0, e[t + 6] = F, e[t + 7] = y;
124
+ }, lt;
125
125
  }
126
- var Dt = Et();
127
- const zt = /* @__PURE__ */ Rt(Dt);
126
+ var Et = Ut();
127
+ const Dt = /* @__PURE__ */ Rt(Et);
128
128
  class pt {
129
129
  constructor(o = 16e3, t = 512, s = 64) {
130
- this.sampleRate = o, this.nfft = t, this.nfilt = s, this.fft = new zt(t), this.melFilters = this.createMelFilterbank();
130
+ this._sampleRate = o, this._nfft = t, this._nfilt = s, this._fft = new Dt(t), this._melFilters = this._createMelFilterbank();
131
131
  }
132
- hzToMel(o) {
132
+ _hzToMel(o) {
133
133
  return 2595 * Math.log10(1 + o / 700);
134
134
  }
135
- melToHz(o) {
135
+ _melToHz(o) {
136
136
  return 700 * (10 ** (o / 2595) - 1);
137
137
  }
138
- createMelFilterbank() {
139
- const t = this.sampleRate / 2, s = this.hzToMel(0), n = this.hzToMel(t), r = new Float32Array(this.nfilt + 2);
140
- for (let i = 0; i < this.nfilt + 2; i++)
141
- r[i] = s + i * (n - s) / (this.nfilt + 1);
142
- const e = r.map((i) => this.melToHz(i)).map((i) => Math.floor((this.nfft + 1) * i / this.sampleRate)), h = [];
143
- for (let i = 0; i < this.nfilt; i++) {
144
- const l = new Float32Array(Math.floor(this.nfft / 2) + 1);
145
- for (let c = e[i]; c < e[i + 1]; c++)
146
- l[c] = (c - e[i]) / (e[i + 1] - e[i]);
147
- for (let c = e[i + 1]; c < e[i + 2]; c++)
148
- l[c] = (e[i + 2] - c) / (e[i + 2] - e[i + 1]);
149
- h.push(l);
138
+ _createMelFilterbank() {
139
+ const t = this._sampleRate / 2, s = this._hzToMel(0), n = this._hzToMel(t), e = new Float32Array(this._nfilt + 2);
140
+ for (let h = 0; h < this._nfilt + 2; h++)
141
+ e[h] = s + h * (n - s) / (this._nfilt + 1);
142
+ const r = e.map((h) => this._melToHz(h)).map((h) => Math.floor((this._nfft + 1) * h / this._sampleRate)), l = [];
143
+ for (let h = 0; h < this._nfilt; h++) {
144
+ const i = new Float32Array(Math.floor(this._nfft / 2) + 1);
145
+ for (let a = r[h]; a < r[h + 1]; a++)
146
+ i[a] = (a - r[h]) / (r[h + 1] - r[h]);
147
+ for (let a = r[h + 1]; a < r[h + 2]; a++)
148
+ i[a] = (r[h + 2] - a) / (r[h + 2] - r[h + 1]);
149
+ l.push(i);
150
150
  }
151
- return h;
151
+ return l;
152
152
  }
153
153
  /** Returns a flat Float32Array of shape [numFrames × nfilt]. */
154
154
  logfbank(o) {
155
- const t = Math.floor(0.025 * this.sampleRate), s = Math.floor(0.01 * this.sampleRate), n = 1 + Math.ceil((o.length - t) / s), r = new Float32Array(n * this.nfilt), a = new Float32Array(this.nfft), e = this.fft.createComplexArray();
156
- for (let h = 0; h < n; h++) {
157
- const i = h * s;
158
- a.fill(0);
159
- for (let d = 0; d < t && i + d < o.length; d++)
160
- a[d] = o[i + d];
161
- const l = this.fft.toComplexArray(a, null);
162
- this.fft.transform(e, l);
163
- const c = new Float32Array(Math.floor(this.nfft / 2) + 1);
164
- for (let d = 0; d < c.length; d++) {
165
- const m = e[2 * d], u = e[2 * d + 1];
166
- c[d] = 1 / this.nfft * (m * m + u * u), c[d] === 0 && (c[d] = 1e-30);
155
+ const t = Math.floor(0.025 * this._sampleRate), s = Math.floor(0.01 * this._sampleRate), n = 1 + Math.ceil((o.length - t) / s), e = new Float32Array(n * this._nfilt), c = new Float32Array(this._nfft), r = this._fft.createComplexArray();
156
+ for (let l = 0; l < n; l++) {
157
+ const h = l * s;
158
+ c.fill(0);
159
+ for (let d = 0; d < t && h + d < o.length; d++)
160
+ c[d] = o[h + d];
161
+ const i = this._fft.toComplexArray(c, null);
162
+ this._fft.transform(r, i);
163
+ const a = new Float32Array(Math.floor(this._nfft / 2) + 1);
164
+ for (let d = 0; d < a.length; d++) {
165
+ const _ = r[2 * d], v = r[2 * d + 1];
166
+ a[d] = 1 / this._nfft * (_ * _ + v * v), a[d] === 0 && (a[d] = 1e-30);
167
167
  }
168
- for (let d = 0; d < this.nfilt; d++) {
169
- let m = 0;
170
- const u = this.melFilters[d];
171
- for (let f = 0; f < c.length; f++)
172
- m += c[f] * u[f];
173
- m === 0 && (m = 1e-30), r[h * this.nfilt + d] = Math.log(m);
168
+ for (let d = 0; d < this._nfilt; d++) {
169
+ let _ = 0;
170
+ const v = this._melFilters[d];
171
+ for (let f = 0; f < a.length; f++)
172
+ _ += a[f] * v[f];
173
+ _ === 0 && (_ = 1e-30), e[l * this._nfilt + d] = Math.log(_);
174
174
  }
175
175
  }
176
- return r;
176
+ return e;
177
+ }
178
+ maxCosineSim(o, t) {
179
+ let s = 0;
180
+ for (const n of t) {
181
+ let e = 0;
182
+ for (let r = 0; r < n.length; r++) e += o[r] * n[r];
183
+ const c = (e + 1) / 2;
184
+ c > s && (s = c);
185
+ }
186
+ return s;
177
187
  }
178
188
  }
179
- async function mt(_ = q, o = tt) {
189
+ async function q(m = tt, o = st) {
180
190
  const t = await import(
181
191
  /* @vite-ignore */
182
192
  o
183
193
  );
184
- return t.env.wasm.wasmPaths = _, t.env.wasm.numThreads = 1, t;
194
+ return t.env.wasm.wasmPaths = m, t.env.wasm.numThreads = 1, t;
185
195
  }
186
- let lt = null;
187
- async function dt(_ = _t, o = q, t = tt) {
188
- return lt || (lt = mt(o, t).then(
189
- (s) => s.InferenceSession.create(_, {
196
+ let dt = null;
197
+ async function _t(m = mt, o = tt, t = st, s) {
198
+ return dt || (dt = q(o, t).then(
199
+ (n) => s ? n.InferenceSession.create(new Uint8Array(s), {
200
+ executionProviders: ["wasm"],
201
+ graphOptimizationLevel: "all"
202
+ }) : n.InferenceSession.create(m, {
190
203
  executionProviders: ["wasm"],
191
204
  graphOptimizationLevel: "all"
192
205
  })
193
- )), lt;
206
+ )), dt;
194
207
  }
195
- const xt = new pt();
196
- class O {
208
+ class V {
209
+ static loadWords(o = Z) {
210
+ try {
211
+ const t = localStorage.getItem(o);
212
+ return t ? JSON.parse(t) : [];
213
+ } catch {
214
+ return [];
215
+ }
216
+ }
217
+ static saveWord(o, t = Z) {
218
+ const s = V.loadWords(t).filter((n) => n.word_name !== o.word_name);
219
+ localStorage.setItem(t, JSON.stringify([...s, o]));
220
+ }
221
+ static deleteWord(o, t = Z) {
222
+ try {
223
+ const s = V.loadWords(t).filter((n) => n.word_name !== o);
224
+ localStorage.setItem(t, JSON.stringify(s));
225
+ } catch {
226
+ }
227
+ }
228
+ }
229
+ class It {
197
230
  constructor(o, t) {
198
- this._started = !1, this._inferring = !1, this._audioCtx = null, this._stream = null, this._refEmbeddings = /* @__PURE__ */ new Map(), this._lastMatchAt = 0, this._lastInferenceAt = 0;
231
+ this._started = !1, this._inferring = !1, this._audioCtx = null, this._stream = null, this._refEmbeddings = /* @__PURE__ */ new Map(), this._lastMatchAt = 0, this._lastInferenceAt = 0, this._initPromise = null;
199
232
  const {
200
233
  refsStorageKey: s = Z,
201
- thresholdStorageKey: n = Wt,
202
- wasmPaths: r = q,
203
- modelPath: a = _t,
204
- audioProcessorPath: e = It,
205
- ortCdnUrl: h = tt
234
+ thresholdStorageKey: n = zt,
235
+ wasmPaths: e = tt,
236
+ modelPath: c = mt,
237
+ audioProcessorPath: r = xt,
238
+ ortCdnUrl: l = st,
239
+ audioUtils: h = new pt()
206
240
  } = t || {};
207
- this._commands = o, this._refsStorageKey = s, this._thresholdStorageKey = n, this._audioProcessorPath = e, this._wasmPaths = r, this._modelPath = a, this._ortCdnUrl = h;
241
+ this._audioUtils = h, this._commands = o, this._refsStorageKey = s, this._thresholdStorageKey = n, this._audioProcessorPath = r, this._wasmPaths = e, this._modelPath = c, this._ortCdnUrl = l;
208
242
  try {
209
243
  const i = localStorage.getItem(this._thresholdStorageKey);
210
244
  this._threshold = i !== null ? Math.max(0, Math.min(1, Number(i))) : 0.65;
211
245
  } catch {
212
246
  this._threshold = 0.65;
213
247
  }
214
- this._initPromise = this._init();
215
248
  }
216
249
  get threshold() {
217
250
  return this._threshold;
@@ -226,25 +259,63 @@ class O {
226
259
  get listening() {
227
260
  return this._started;
228
261
  }
229
- async _init() {
230
- await dt(this._modelPath, this._wasmPaths, this._ortCdnUrl);
231
- const o = /* @__PURE__ */ new Set();
232
- for (const t of this._commands)
233
- for (const s of t.triggers)
234
- if (!o.has(s.name) && (o.add(s.name), s.defaultRefPath)) {
235
- const n = await fetch(s.defaultRefPath);
236
- if (n.ok) {
237
- const r = await n.json();
238
- this.addCustomWord(r);
239
- }
240
- }
241
- for (const t of O.loadWords(this._refsStorageKey))
242
- this._refEmbeddings.set(t.word_name, t.embeddings);
262
+ /**
263
+ * Streams `url`, calling `onProgress(downloaded, total)` after each chunk.
264
+ * Falls back to a single-shot fetch when the body stream is unavailable.
265
+ */
266
+ async _trackFetch(o, t, s) {
267
+ const n = await fetch(o);
268
+ if (!n.ok) throw new Error(`HTTP ${n.status} fetching ${o}`);
269
+ const e = Number(n.headers.get("content-length") ?? "0");
270
+ if (e > 0 && (s.total += e), !n.body) {
271
+ const a = await n.arrayBuffer();
272
+ return s.downloaded += a.byteLength, e || (s.total += a.byteLength), t == null || t(s.downloaded, s.total), a;
273
+ }
274
+ const c = n.body.getReader(), r = [];
275
+ let l = 0;
276
+ for (; ; ) {
277
+ const { done: a, value: d } = await c.read();
278
+ if (a) break;
279
+ r.push(d), l += d.length, s.downloaded += d.length, t == null || t(s.downloaded, s.total);
280
+ }
281
+ e || (s.total += l);
282
+ const h = new Uint8Array(l);
283
+ let i = 0;
284
+ for (const a of r)
285
+ h.set(a, i), i += a.length;
286
+ return h.buffer;
287
+ }
288
+ async _init(o) {
289
+ const t = { downloaded: 0, total: 0 }, s = /* @__PURE__ */ new Set(), n = [];
290
+ for (const i of this._commands)
291
+ for (const a of i.triggers)
292
+ !s.has(a.name) && a.defaultRefPath && (s.add(a.name), n.push({ name: a.name, path: a.defaultRefPath }));
293
+ const e = q(this._wasmPaths, this._ortCdnUrl), [c, ...r] = await Promise.all([
294
+ this._trackFetch(this._modelPath, o, t),
295
+ ...n.map(({ path: i }) => this._trackFetch(i, o, t))
296
+ ]);
297
+ await e, await _t(this._modelPath, this._wasmPaths, this._ortCdnUrl, c);
298
+ const l = V.loadWords(this._refsStorageKey), h = new Set(l.map((i) => i.word_name));
299
+ for (let i = 0; i < n.length; i++)
300
+ try {
301
+ const a = JSON.parse(new TextDecoder().decode(r[i]));
302
+ this.addCustomWord(a), h.has(a.word_name) || V.saveWord(a, this._refsStorageKey);
303
+ } catch {
304
+ console.warn(`[Mellon] failed to parse ref file: ${n[i].path}`);
305
+ }
306
+ for (const i of l)
307
+ this._refEmbeddings.set(i.word_name, i.embeddings);
243
308
  console.info("[Mellon] init complete, loaded refs:", [...this._refEmbeddings.keys()]);
244
309
  }
245
- /** Ensures the ONNX model is loaded — call before generateRef() in enrollment. */
246
- async init() {
247
- await this._initPromise;
310
+ /**
311
+ * Loads the ONNX model and all reference embeddings.
312
+ * Must be called before {@link start}.
313
+ * Safe to call multiple times — the work is only done once.
314
+ *
315
+ * @param onProgress - optional callback invoked as each asset is loaded
316
+ */
317
+ async init(o) {
318
+ this._initPromise || (this._initPromise = this._init(o)), await this._initPromise;
248
319
  }
249
320
  /** Adds (or replaces) the reference embeddings for a word without restarting. */
250
321
  addCustomWord(o) {
@@ -254,7 +325,7 @@ class O {
254
325
  }
255
326
  async start() {
256
327
  if (this._started) return;
257
- await this._initPromise;
328
+ await this.init();
258
329
  let o;
259
330
  try {
260
331
  o = await navigator.mediaDevices.getUserMedia({
@@ -272,8 +343,8 @@ class O {
272
343
  const t = new AudioContext({ sampleRate: 16e3 });
273
344
  this._audioCtx = t, await t.audioWorklet.addModule(this._audioProcessorPath);
274
345
  const s = t.createMediaStreamSource(o), n = new AudioWorkletNode(t, "audio-processor");
275
- n.port.onmessage = (r) => {
276
- this._handleBuffer(r.data);
346
+ n.port.onmessage = (e) => {
347
+ this._handleBuffer(e.data);
277
348
  }, s.connect(n), n.connect(t.destination), this._started = !0;
278
349
  }
279
350
  async stop() {
@@ -288,16 +359,16 @@ class O {
288
359
  if (!(t - this._lastInferenceAt < 300)) {
289
360
  this._lastInferenceAt = t, this._inferring = !0;
290
361
  try {
291
- const [s, n] = await Promise.all([mt(this._wasmPaths, this._ortCdnUrl), dt(this._modelPath, this._wasmPaths, this._ortCdnUrl)]), r = xt.logfbank(o), a = new s.Tensor("float32", r, [1, 1, 149, 64]), e = await n.run({ input: a }), h = e[Object.keys(e)[0]].data;
292
- let i = !1;
293
- for (const l of this._commands) {
294
- if (i) break;
295
- for (const c of l.triggers) {
296
- const d = this._refEmbeddings.get(c.name);
362
+ const [s, n] = await Promise.all([q(this._wasmPaths, this._ortCdnUrl), _t(this._modelPath, this._wasmPaths, this._ortCdnUrl)]), e = this._audioUtils.logfbank(o), c = new s.Tensor("float32", e, [1, 1, 149, 64]), r = await n.run({ input: c }), l = r[Object.keys(r)[0]].data;
363
+ let h = !1;
364
+ for (const i of this._commands) {
365
+ if (h) break;
366
+ for (const a of i.triggers) {
367
+ const d = this._refEmbeddings.get(a.name);
297
368
  if (!d) continue;
298
- const m = this._maxCosineSim(h, d);
299
- if (m >= this._threshold && t - this._lastMatchAt > 2e3) {
300
- this._lastMatchAt = t, console.info(`[Mellon] match: "${c}" sim=${m.toFixed(3)}`), typeof l.onMatch == "function" && l.onMatch(c.name, m), i = !0;
369
+ const _ = this._audioUtils.maxCosineSim(l, d);
370
+ if (_ >= this._threshold && t - this._lastMatchAt > 2e3) {
371
+ this._lastMatchAt = t, console.info(`[Mellon] match: "${a}" sim=${_.toFixed(3)}`), typeof i.onMatch == "function" && i.onMatch(a.name, _), h = !0;
301
372
  break;
302
373
  }
303
374
  }
@@ -309,64 +380,34 @@ class O {
309
380
  }
310
381
  }
311
382
  }
312
- _maxCosineSim(o, t) {
313
- let s = 0;
314
- for (const n of t) {
315
- let r = 0;
316
- for (let e = 0; e < n.length; e++) r += o[e] * n[e];
317
- const a = (r + 1) / 2;
318
- a > s && (s = a);
319
- }
320
- return s;
321
- }
322
- static loadWords(o = Z) {
323
- try {
324
- const t = localStorage.getItem(o);
325
- return t ? JSON.parse(t) : [];
326
- } catch {
327
- return [];
328
- }
329
- }
330
- static saveWord(o, t = Z) {
331
- const s = O.loadWords(t).filter((n) => n.word_name !== o.word_name);
332
- localStorage.setItem(t, JSON.stringify([...s, o]));
333
- }
334
- static deleteWord(o, t = Z) {
335
- try {
336
- const s = O.loadWords(t).filter((n) => n.word_name !== o);
337
- localStorage.setItem(t, JSON.stringify(s));
338
- } catch {
339
- }
340
- }
341
383
  }
342
- const Ut = new pt();
343
- class jt {
384
+ class Wt {
344
385
  constructor(o, t) {
345
- this._config = {}, this._samples = [], this._wordName = o, this._config.modelPath = (t == null ? void 0 : t.modelPath) || _t, this._config.wasmPaths = (t == null ? void 0 : t.wasmPaths) || q, this._config.ortCdnUrl = (t == null ? void 0 : t.ortCdnUrl) || tt;
386
+ this._config = {}, this._samples = [], this._wordName = o, this._config.modelPath = (t == null ? void 0 : t.modelPath) || mt, this._config.wasmPaths = (t == null ? void 0 : t.wasmPaths) || tt, this._config.ortCdnUrl = (t == null ? void 0 : t.ortCdnUrl) || st, this._audioUtils = (t == null ? void 0 : t.audioUtils) ?? new pt();
346
387
  }
347
388
  /** Records 1.5 s of audio, stores the decoded PCM, returns new sample count. */
348
389
  async recordSample() {
349
- const o = await navigator.mediaDevices.getUserMedia({ audio: !0 }), t = new AudioContext({ sampleRate: 16e3 }), s = await new Promise((a, e) => {
350
- const h = new MediaRecorder(o), i = [];
351
- h.ondataavailable = (l) => {
352
- l.data.size > 0 && i.push(l.data);
353
- }, h.onstop = async () => {
354
- var l;
355
- for (const c of o.getTracks()) c.stop();
390
+ const o = await navigator.mediaDevices.getUserMedia({ audio: !0 }), t = new AudioContext({ sampleRate: 16e3 }), s = await new Promise((c, r) => {
391
+ const l = new MediaRecorder(o), h = [];
392
+ l.ondataavailable = (i) => {
393
+ i.data.size > 0 && h.push(i.data);
394
+ }, l.onstop = async () => {
395
+ var i;
396
+ for (const a of o.getTracks()) a.stop();
356
397
  try {
357
- const d = await new Blob(i, { type: ((l = i[0]) == null ? void 0 : l.type) || "audio/webm" }).arrayBuffer(), m = await t.decodeAudioData(d);
358
- await t.close(), a(m.getChannelData(0).slice());
359
- } catch (c) {
360
- e(c);
398
+ const d = await new Blob(h, { type: ((i = h[0]) == null ? void 0 : i.type) || "audio/webm" }).arrayBuffer(), _ = await t.decodeAudioData(d);
399
+ await t.close(), c(_.getChannelData(0).slice());
400
+ } catch (a) {
401
+ r(a);
361
402
  }
362
- }, h.start(), setTimeout(() => {
403
+ }, l.start(), setTimeout(() => {
363
404
  try {
364
- h.stop();
405
+ l.stop();
365
406
  } catch {
366
407
  }
367
408
  }, 1500);
368
- }), n = 24e3, r = new Float32Array(n);
369
- return r.set(s.slice(0, n)), this._samples.push(r), this._samples.length;
409
+ }), n = 24e3, e = new Float32Array(n);
410
+ return e.set(s.slice(0, n)), this._samples.push(e), this._samples.length;
370
411
  }
371
412
  /** Removes the sample at the given index. Returns the new sample count. */
372
413
  deleteSample(o) {
@@ -376,22 +417,24 @@ class jt {
376
417
  }
377
418
  /** Runs ONNX inference on every recorded sample to produce reference embeddings. */
378
419
  async generateRef() {
379
- const [o, t] = await Promise.all([mt(this._config.wasmPaths, this._config.ortCdnUrl), dt(this._config.modelPath, this._config.wasmPaths, this._config.ortCdnUrl)]), s = [];
420
+ const [o, t] = await Promise.all([q(this._config.wasmPaths, this._config.ortCdnUrl), _t(this._config.modelPath, this._config.wasmPaths, this._config.ortCdnUrl)]), s = [];
380
421
  for (const n of this._samples) {
381
- const r = Ut.logfbank(n), a = new o.Tensor("float32", r, [1, 1, 149, 64]), e = await t.run({ input: a }), h = Array.from(e[Object.keys(e)[0]].data);
382
- s.push(h);
422
+ const e = this._audioUtils.logfbank(n), c = new o.Tensor("float32", e, [1, 1, 149, 64]), r = await t.run({ input: c }), l = Array.from(r[Object.keys(r)[0]].data);
423
+ s.push(l);
383
424
  }
384
425
  return { word_name: this._wordName, model_type: "resnet_50_arc", embeddings: s };
385
426
  }
386
427
  }
387
- const q = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/", tt = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/ort.wasm.min.mjs", _t = "https://huggingface.co/ComicScrip/mellon/resolve/main/model.onnx", It = "https://cdn.jsdelivr.net/npm/mellon@0.0.14/dist/assets/audio-processor.js", Z = "mellon-refs", Wt = "mellon-threshold";
428
+ const tt = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/", st = "https://cdn.jsdelivr.net/npm/onnxruntime-web@1.24.3/dist/ort.wasm.min.mjs", mt = "https://huggingface.co/ComicScrip/mellon/resolve/main/model.onnx", xt = "https://cdn.jsdelivr.net/npm/mellon@0.0.14/dist/assets/audio-processor.js", Z = "mellon-refs", zt = "mellon-threshold";
388
429
  export {
389
- It as DEFAULT_AUDIO_PROCESSOR_PATH,
390
- _t as DEFAULT_MODEL_PATH,
391
- tt as DEFAULT_ORT_CDN_URL,
430
+ pt as AudioUtils,
431
+ xt as DEFAULT_AUDIO_PROCESSOR_PATH,
432
+ mt as DEFAULT_MODEL_PATH,
433
+ st as DEFAULT_ORT_CDN_URL,
392
434
  Z as DEFAULT_REFS_STORAGE_KEY,
393
- Wt as DEFAULT_THRESHOLD_STORAGE_KEY,
394
- q as DEFAULT_WASM_PATHS,
395
- jt as EnrollmentSession,
396
- O as Mellon
435
+ zt as DEFAULT_THRESHOLD_STORAGE_KEY,
436
+ tt as DEFAULT_WASM_PATHS,
437
+ It as Detector,
438
+ Wt as EnrollmentSession,
439
+ V as Storage
397
440
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "mellon",
3
- "version": "0.0.18",
3
+ "version": "0.0.20",
4
4
  "description": "Offline, in-browser voice commands powered by EfficientWord-Net (ResNet-50 ArcFace).",
5
5
  "type": "module",
6
6
  "main": "./dist/mellon.cjs",