npm - claude-code-session-manager - Versions diffs - 0.2.4 → 0.2.5 - Mend

claude-code-session-manager 0.2.4 → 0.2.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (27) hide show

package/dist/assets/{whisperWorker-BhZpQ_3S.js → whisperWorker-CgzXb5fW.js} RENAMED Viewed

@@ -38,4 +38,4 @@ ${this.boa_token}${this.audio_token.repeat(this._compute_audio_num_tokens(o[l++]
 `}return o+=`
 ${s}${a}`+n.repeat(e)+`${s}`,o}function _T(e,t,r,s){return`${t}${s}`+r.repeat(e)+`${t}`}function pT(e,t,r,s,n,a){return e===0&&t===0?_T(r,s,n,a):fT(r,e,t,s,n,a)}var V_=(xn=class extends Ee{constructor(){super(...arguments);k(this,"fake_image_token","<fake_token_around_image>");k(this,"image_token","<image>");k(this,"global_img_token","<global-img>")}async _call(t,r=null,s={}){s.return_row_col_info??(s.return_row_col_info=!0);let n;r&&(n=await this.image_processor(r,s)),Array.isArray(t)||(t=[t]);const a=n.rows??[new Array(t.length).fill(0)],o=n.cols??[new Array(t.length).fill(0)],i=this.config.image_seq_len,l=[],c=[];for(let h=0;h<t.length;++h){const _=t[h],p=a[h],w=o[h];l.push(Oy(_,this.image_token));const v=p.map((T,A)=>pT(T,w[A],i,this.fake_image_token,this.image_token,this.global_img_token)),y=_.split(this.image_token);if(y.length===0)throw new Error("The image token should be present in the text.");let M=y[0];for(let T=0;T<v.length;++T)M+=v[T]+y[T+1];c.push(M)}return{...this.tokenizer(c),...n}}},k(xn,"image_processor_class",dt),k(xn,"tokenizer_class",Te),k(xn,"uses_processor_config",!0),xn),mT=(kn=class extends Ee{constructor(e,t,r){super(e,t,r),this.image_tag=this.config.image_tag,this.image_start_tag=this.config.image_start_tag,this.image_end_tag=this.config.image_end_tag,this.num_image_tokens=this.config.num_image_tokens}async _call(e,{images:t=null,chat_template:r="default"}={}){t?Array.isArray(t)||(t=[t]):t=await Promise.all(e.filter(v=>v.images).flatMap(v=>v.images).map(v=>jt.read(v)));const s=this.tokenizer,n=s.apply_chat_template(e,{tokenize:!1,add_generation_prompt:!0,chat_template:r}),a=v=>s.encode(v,{add_special_tokens:!1}),o=n.split(this.image_tag),i=o.length-1;if(t.length!==i)throw new Error(`Number of images provided (${t.length}) does not match number of "${this.image_tag}" image tags (${i})`);const[l,c,d]=s.convert_tokens_to_ids([this.image_tag,this.image_start_tag,this.image_end_tag]);let h=a(o[0]),_=new Array(h.length).fill(!1);for(let 
v=1;v<o.length;++v){const y=new Array(this.num_image_tokens).fill(l),M=a(o[v]);h=Yt(h,[c],y,[d],M);const T=new Array(this.num_image_tokens).fill(!0);_=Yt(_,[!1],T,[!1],new Array(M.length).fill(!1))}const p=[1,h.length],w={input_ids:new U("int64",h,p),attention_mask:new U("int64",new Array(h.length).fill(1),p),images_seq_mask:new U("bool",_,p),images_emb_mask:new U("bool",new Array(i*this.num_image_tokens).fill(!0),[1,i,this.num_image_tokens])};if(t&&t.length>0){const v=await this.image_processor(t);return v.pixel_values.unsqueeze_(0),{...w,...v}}return w}},k(kn,"image_processor_class",dt),k(kn,"tokenizer_class",Te),k(kn,"uses_processor_config",!0),kn),gT=(Xa=class extends Ee{async _call(e=null,t=null,r={}){if(!e&&!t)throw new Error("Either text or images must be provided");const s=e?this.tokenizer(e,r):{},n=t?await this.image_processor(t,r):{};return{...s,...n}}},k(Xa,"tokenizer_class",Te),k(Xa,"image_processor_class",dt),Xa),wT=(Ya=class extends Ee{async _call(e,t=null,r={}){const{image_rows:s,image_cols:n,image_sizes:a,...o}=await this.image_processor(e,{...r,return_row_col_info:!0});if(t){const i=this.config.image_token??"<image>",{tile_size:l=512,downsample_factor:c=2,encoder_patch_size:d=16,use_thumbnail:h=!0}=this.image_processor.config,_=T=>Math.ceil(Math.floor(T/d)/c),p=_(l)**2,w=this.config.image_start_token??"<|image_start|>",v=this.config.image_end_token??"<|image_end|>",y=this.config.image_thumbnail??"<|img_thumbnail|>";Array.isArray(t)||(t=[t]);let M=0;t=t.map(T=>{const A=T.split(i);return A[0]+A.slice(1).map(C=>{const S=M++,[N,x]=a[S],R=s[S],z=n[S],$=_(N)*_(x);let Q=w;if(R>1||z>1){const H=i.repeat(p);for(let D=0;D<R;++D)for(let I=0;I<z;++I)Q+=`<|img_row_${D+1}_col_${I+1}|>`+H;h&&(Q+=y+i.repeat($))}else Q+=i.repeat($);return Q+v+C}).join("")})}return{...o,...t?this.tokenizer(t,r):{}}}},k(Ya,"tokenizer_class",Te),k(Ya,"image_processor_class",dt),Ya),vT=(Tn=class extends Ee{async _call(e,t=null,r={}){const s=await 
this.image_processor(e,r);if(t){const[a,o]=s.pixel_values.dims.slice(-2),{image_token:i,patch_size:l,num_additional_image_tokens:c}=this.config,d=Math.floor(a/l)*Math.floor(o/l)+c;t=structuredClone(t),Array.isArray(t)||(t=[t]);for(let h=0;h<t.length;++h)t[h]=t[h].replace(i,i.repeat(d))}const n=t?this.tokenizer(t,r):{};return{...s,...n}}},k(Tn,"tokenizer_class",Te),k(Tn,"image_processor_class",dt),k(Tn,"uses_processor_config",!0),Tn),U_={char:["char_decode",1],bpe:["bpe_decode",2],wp:["wp_decode",102]},yT=(Ja=class extends Ee{get char_tokenizer(){return this.components.char_tokenizer}get bpe_tokenizer(){return this.components.bpe_tokenizer}get wp_tokenizer(){return this.components.wp_tokenizer}_decode_helper(e,t){if(!U_.hasOwnProperty(t))throw new Error(`Format ${t} is not supported.`);const[r,s]=U_[t],n=this[r].bind(this),[a,o]=e.dims,i=[],l=[],c=e.tolist();for(let h=0;h<a;++h){const _=c[h],p=[],w=[];for(let y=1;y<o;++y){const[M,T]=je(nt(_[y]));if(w.push(M),T==s)break;p.push(T)}const v=w.length>0?w.reduce((y,M)=>y*M,1):0;l.push(p),i.push(v)}return[n(l),i]}char_decode(e){return this.char_tokenizer.batch_decode(e).map(t=>t.replaceAll(" ",""))}bpe_decode(e){return this.bpe_tokenizer.batch_decode(e)}wp_decode(e){return this.wp_tokenizer.batch_decode(e).map(t=>t.replaceAll(" ",""))}batch_decode([e,t,r]){const[s,n]=this._decode_helper(e,"char"),[a,o]=this._decode_helper(t,"bpe"),[i,l]=this._decode_helper(r,"wp"),c=[],d=[];for(let h=0;h<s.length;++h){const[_,p]=je([n[h],o[h],l[h]]);c.push([s[h],a[h],i[h]][p]),d.push(_)}return{generated_text:c,scores:d,char_preds:s,bpe_preds:a,wp_preds:i}}static async from_pretrained(...e){const t=await super.from_pretrained(...e),r=await Te.from_pretrained("Xenova/gpt2"),s=await Te.from_pretrained("Xenova/bert-base-uncased");return t.components={image_processor:t.image_processor,char_tokenizer:t.tokenizer,bpe_tokenizer:r,wp_tokenizer:s},t}async _call(e,t=null){const r=await this.image_processor(e);return 
t&&(r.labels=this.tokenizer(t).input_ids),r}},k(Ja,"tokenizer_class",Te),k(Ja,"image_processor_class",dt),Ja),bT=(Ka=class extends Ee{async _call(e){return await this.feature_extractor(e)}},k(Ka,"tokenizer_class",Te),k(Ka,"feature_extractor_class",Ft),Ka),MT=(Za=class extends Ee{},k(Za,"tokenizer_class",Te),k(Za,"image_processor_class",dt),Za),_s="<image>";function xT(e,t,r,s,n){return`${s.repeat(r*n)}${t}${e}
 `}var kT=(En=class extends Ee{async _call(e,t=null,r={}){t||(ue.warn("You are using PaliGemma without a text prefix. It will perform as a picture-captioning model."),t=""),Array.isArray(e)||(e=[e]),Array.isArray(t)||(t=[t]);const s=this.tokenizer.bos_token,n=this.image_processor.config.image_seq_length;let a;t.some(l=>l.includes(_s))?a=t.map(l=>{const c=l.replaceAll(_s,_s.repeat(n)),d=c.lastIndexOf(_s),h=d===-1?0:d+_s.length;return c.slice(0,h)+s+c.slice(h)+`
-`}):(ue.warn("You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens."),a=t.map(l=>xT(l,s,n,_s,e.length)));const o=this.tokenizer(a,r);return{...await this.image_processor(e,r),...o}}},k(En,"tokenizer_class",Te),k(En,"image_processor_class",dt),k(En,"uses_processor_config",!1),En),j_="<|image|>",TT=/<\|image_\d+\|>/g,ET=(eo=class extends Ee{async _call(e,t=null,{padding:r=!0,truncation:s=!0,num_crops:n=null}={}){Array.isArray(e)||(e=[e]);let a,o;if(t){o=await this.image_processor(t,{num_crops:n});const{num_img_tokens:i}=o,l=e.map((d,h)=>d.split(TT).join(j_.repeat(i[h])));a=this.tokenizer(l,{padding:r,truncation:s});const c=this.tokenizer._tokenizer.token_to_id(j_);a.input_ids.map_(d=>d==c?-d:d)}else a=this.tokenizer(e);return{...a,...o}}},k(eo,"image_processor_class",dt),k(eo,"tokenizer_class",Te),eo),AT=(An=class extends Ee{async _call(e,t=null,r={}){const s=await this.image_processor(e,r);if(t){const[a,o]=s.pixel_values.dims.slice(-2),{image_token:i,image_break_token:l,image_end_token:c,patch_size:d,spatial_merge_size:h}=this.config,_=d*h,p=Math.floor(a/_),w=Math.floor(o/_);t=structuredClone(t),Array.isArray(t)||(t=[t]);for(let v=0;v<t.length;++v){const y=i.repeat(w),M=y+l,T=y+c,A=M.repeat(p-1)+T;t[v]=t[v].replace(i,A)}}const n=t?this.tokenizer(t,r):{};return{...s,...n}}},k(An,"tokenizer_class",Te),k(An,"image_processor_class",dt),k(An,"uses_processor_config",!0),An),CT=(du=class extends Ee{async _call(e){return await this.feature_extractor(e)}post_process_speaker_diarization(...e){return this.feature_extractor.post_process_speaker_diarization(...e)}get sampling_rate(){return this.feature_extractor.config.sampling_rate}},k(du,"feature_extractor_class",m_),du),q_=class extends Wi{},ST=class 
extends q_{},W_=(hu=class extends Ee{async _call(...e){return await this.image_processor(...e)}post_process_masks(...e){return this.image_processor.post_process_masks(...e)}reshape_input_points(...e){return this.image_processor.reshape_input_points(...e)}},k(hu,"image_processor_class",dt),hu),H_=class extends W_{},PT=class extends H_{},FT=(to=class extends Ee{async _call(e){return await this.feature_extractor(e)}},k(to,"tokenizer_class",Te),k(to,"feature_extractor_class",Ft),to),LT=(Cn=class extends Ee{async _call(e,t=null,r={}){if(Array.isArray(e))throw new Error("Batched inputs are not supported yet.");let s={};if(t){const a=t.length,{input_features:o}=await this.feature_extractor(t,{...r,max_length:a}),i=Math.round(a/this.config.encoder_ds_factor+1e-4),l=1+Math.ceil(i/this.config.stack_factor);s.audio_token_len=[l],s.audio_values=o;const c=this.config.audio_placeholder;if(!e.includes(c))throw new Error(`The input text does not contain the image token ${c}.`);e=e.replaceAll(c,c.repeat(l))}return{...this.tokenizer(e,{add_special_tokens:!1,...r}),...s}}},k(Cn,"tokenizer_class",Te),k(Cn,"feature_extractor_class",Ft),k(Cn,"uses_processor_config",!0),Cn),ga="[AUDIO]",IT="[BEGIN_AUDIO]",OT=375;function NT(e,t){const r=[];for(let s=0;s<e.length;s+=t)r.push(e.subarray(s,Math.min(s+t,e.length)));return r}var DT=(Sn=class extends Ee{async _call(e,t=null,r={}){if(Array.isArray(e))throw new Error("Batched inputs are not supported yet.");const s={};if(t){if(!e.includes(ga))throw new Error(`The input text does not contain the audio token ${ga}.`);Array.isArray(t)||(t=[t]);const a=e.split(ga),o=a.length-1;if(o!==t.length)throw new Error(`The number of audio inputs (${t.length}) does not match the number of audio tokens in the text (${o}).`);const i=this.feature_extractor.config.n_samples,l=t.map(p=>NT(p,i)),c=l.map(p=>p.length),d=l.flat(),h=(await Promise.all(d.map(p=>this.feature_extractor(p,r)))).map(p=>p.input_features);s.audio_values=h.length>1?ze(h,0):h[0];let 
_=a[0];for(let p=0;p<c.length;++p){_+=IT;for(let w=0;w<c[p];++w)_+=ga.repeat(OT);_+=a[p+1]}e=_}return{...this.tokenizer(e,{add_special_tokens:!1,...r}),...s}}},k(Sn,"tokenizer_class",Te),k(Sn,"feature_extractor_class",Ft),k(Sn,"uses_processor_config",!1),Sn),Q_=32,Hi=6,wa=8,zT=10,BT=32,RT=(Pn=class extends Ee{get num_mel_frames_first_audio_chunk(){return(Hi+1)*wa}get num_samples_first_audio_chunk(){const{hop_length:e,n_fft:t}=this.feature_extractor.config;return(this.num_mel_frames_first_audio_chunk-1)*e+Math.floor(t/2)}get num_samples_per_audio_chunk(){const{hop_length:e,n_fft:t}=this.feature_extractor.config;return wa*e+t}get num_right_pad_tokens(){return Hi+1+zT}get audio_length_per_tok(){return wa}get raw_audio_length_per_tok(){return wa*this.feature_extractor.config.hop_length}async _call(e,{is_streaming:t=!1,is_first_audio_chunk:r=!0}={}){if(at(e,"VoxtralRealtimeProcessor"),!t&&!r)throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");if(r)if(t){const s=Q_*this.raw_audio_length_per_tok,n=new Float32Array(s+e.length);n.set(e,s);const a=await this.feature_extractor(n,{center:!0}),i=1+(Q_+Hi),l=new BigInt64Array(i).fill(BigInt(BT));return l[0]=1n,{input_ids:new U("int64",l,[1,i]),...a}}else{const s=this.num_right_pad_tokens*this.raw_audio_length_per_tok,n=new Float32Array(e.length+s);return n.set(e),await this.feature_extractor(n,{center:!0})}else return await this.feature_extractor(e,{center:!1})}},k(Pn,"tokenizer_class",Te),k(Pn,"feature_extractor_class",Ft),k(Pn,"uses_processor_config",!1),Pn),GT=(ro=class extends Ee{async _call(e){return await this.feature_extractor(e)}},k(ro,"tokenizer_class",Te),k(ro,"feature_extractor_class",Ft),ro),$T=(so=class extends Ee{async _call(e){return await this.feature_extractor(e)}},k(so,"tokenizer_class",Te),k(so,"feature_extractor_class",Ft),so),VT=(no=class extends Ee{async _call(e){return await 
this.feature_extractor(e)}},k(no,"tokenizer_class",Te),k(no,"feature_extractor_class",Ft),no),UT=class{static async from_pretrained(e,t={}){const r=await Zt(e,sn,!0,t),{image_processor_type:s,feature_extractor_type:n,processor_class:a}=r;if(a&&Ni[a])return Ni[a].from_pretrained(e,t);if(!s&&!n)throw new Error("No `image_processor_type` or `feature_extractor_type` found in the config.");const o={};if(s){const l=ma[s.replace(/Fast$/,"")];if(!l)throw new Error(`Unknown image_processor_type: '${s}'.`);o.image_processor=new l(r)}if(n){const l=ma[n];if(l)o.image_processor=new l(r);else{const c=Di[n];if(!c)throw new Error(`Unknown feature_extractor_type: '${n}'.`);o.feature_extractor=new c(r)}}const i={};return new Ee(i,o,null)}};async function jT(e,t){return await Zt(e,"config.json",!0,t)}function ps(e){const t={};let r={};switch(e.model_type){case"llava":case"paligemma":case"gemma3":case"florence2":case"llava_onevision":case"idefics3":case"granite_speech":case"ultravox":case"voxtral":case"voxtral_realtime":case"smolvlm":case"gemma3n":case"gemma4":case"lfm2_vl":case"chatterbox":case"lighton_ocr":case"glm_ocr":case"mistral3":case"qwen2_5_vl":case"qwen3_vl":case"qwen3_vl_moe":r=ps(e.text_config);break;case"moondream1":r=ps(e.phi_config);break;case"musicgen":r=ps(e.decoder);break;case"multi_modality":r=ps(e.language_config);break;case"gpt2":case"gptj":case"jais":case"codegen":case"gpt_bigcode":t.num_heads="n_head",t.num_layers="n_layer",t.hidden_size="n_embd";break;case"gpt_neox":case"stablelm":case"opt":case"falcon":case"modernbert-decoder":t.num_heads="num_attention_heads",t.num_layers="num_hidden_layers",t.hidden_size="hidden_size";break;case"gpt_oss":case"llama":case"llama4_text":case"nanochat":case"apertus":case"arcee":case"afmoe":case"lfm2":case"lfm2_moe":case"smollm3":case"olmo":case"olmo2":case"olmo3":case"mobilellm":case"granite":case"granitemoehybrid":case"cohere":case"cohere2":case"mistral":case"voxtral_realtime_text":case"voxtral_realtime_encoder":case"starcoder2"
:case"qwen2":case"qwen2_moe":case"qwen2_vl":case"qwen2_vl_text":case"qwen2_5_vl_text":case"qwen3_moe":case"qwen3_vl_text":case"qwen3_vl_moe_text":case"phi":case"phi3":case"phi3_v":case"llava_qwen2":t.num_heads="num_key_value_heads",t.num_layers="num_hidden_layers",t.hidden_size="hidden_size",t.num_attention_heads="num_attention_heads",t.dim_kv="head_dim";break;case"qwen3":case"solar_open":case"glm_ocr_text":case"gemma":case"gemma2":case"vaultgemma":case"gemma3_text":case"gemma3n_text":case"gemma4_text":case"glm":case"helium":case"ernie4_5":case"hunyuan_v1_dense":case"falcon_h1":case"nemotron_h":case"ministral":case"ministral3":t.num_heads="num_key_value_heads",t.num_layers="num_hidden_layers",t.dim_kv="head_dim";break;case"openelm":t.num_heads="num_kv_heads",t.num_layers="num_transformer_layers",t.dim_kv="head_dim";break;case"gpt_neo":case"donut-swin":t.num_heads="num_heads",t.num_layers="num_layers",t.hidden_size="hidden_size";break;case"bloom":t.num_heads="n_head",t.num_layers="n_layer",t.hidden_size="hidden_size";break;case"mpt":t.num_heads="n_heads",t.num_layers="n_layers",t.hidden_size="d_model";break;case"exaone":t.num_heads="num_key_value_heads",t.num_layers="num_layers",t.dim_kv="head_dim",t.num_attention_heads="num_attention_heads";break;case"youtu":case"deepseek_v3":case"glm_moe_dsa":case"mistral4":t.num_heads="num_key_value_heads",t.num_layers="num_hidden_layers",t.dim_kv="qk_head_dim",t.num_attention_heads="num_attention_heads";break;case"t5":case"mt5":case"longt5":t.num_decoder_layers="num_decoder_layers",t.num_decoder_heads="num_heads",t.decoder_dim_kv="d_kv",t.num_encoder_layers="num_layers",t.num_encoder_heads="num_heads",t.encoder_dim_kv="d_kv";break;case"bart":case"mbart":case"marian":case"whisper":case"lite-whisper":case"m2m_100":case"blenderbot":case"blenderbot-small":case"florence2_language":t.num_decoder_layers="decoder_layers",t.num_decoder_heads="decoder_attention_heads",t.decoder_hidden_size="d_model",t.num_encoder_layers="encoder_layers",t.
num_encoder_heads="encoder_attention_heads",t.encoder_hidden_size="d_model";break;case"speecht5":t.num_decoder_layers="decoder_layers",t.num_decoder_heads="decoder_attention_heads",t.decoder_hidden_size="hidden_size",t.num_encoder_layers="encoder_layers",t.num_encoder_heads="encoder_attention_heads",t.encoder_hidden_size="hidden_size";break;case"trocr":t.num_encoder_layers=t.num_decoder_layers="decoder_layers",t.num_encoder_heads=t.num_decoder_heads="decoder_attention_heads",t.encoder_hidden_size=t.decoder_hidden_size="d_model";break;case"musicgen_decoder":t.num_encoder_layers=t.num_decoder_layers="num_hidden_layers",t.num_encoder_heads=t.num_decoder_heads="num_attention_heads",t.encoder_hidden_size=t.decoder_hidden_size="hidden_size";break;case"moonshine":t.num_decoder_layers="decoder_num_hidden_layers",t.num_decoder_heads="decoder_num_key_value_heads",t.num_encoder_layers="encoder_num_hidden_layers",t.num_encoder_heads="encoder_num_key_value_heads",t.encoder_hidden_size=t.decoder_hidden_size="hidden_size";break;case"cohere_asr":t.num_decoder_layers="num_hidden_layers",t.num_decoder_heads="num_key_value_heads",t.decoder_hidden_size="hidden_size",t.decoder_dim_kv="head_dim";const{num_hidden_layers:n,num_attention_heads:a,hidden_size:o}=e.encoder_config;r={num_encoder_layers:n,num_encoder_heads:a,encoder_hidden_size:o,encoder_dim_kv:e.head_dim};break;case"vision-encoder-decoder":const i=ps(e.decoder),l="num_decoder_layers"in i,c=rt(e,["model_type","is_encoder_decoder"]);return l?(c.num_decoder_layers=i.num_decoder_layers,c.num_decoder_heads=i.num_decoder_heads,c.decoder_hidden_size=i.decoder_hidden_size,c.num_encoder_layers=i.num_encoder_layers,c.num_encoder_heads=i.num_encoder_heads,c.encoder_hidden_size=i.encoder_hidden_size):(c.num_layers=i.num_layers,c.num_heads=i.num_heads,c.hidden_size=i.hidden_size),c}const s={...r,...rt(e,["model_type","multi_query","is_encoder_decoder"])};for(const n in t)s[n]=e[t[n]];return s}function va(e,t){e instanceof Qi||(e=new 
Qi(e));const r=(t==null?void 0:t.batch_size)??1;if(["lfm2","lfm2_moe"].includes(e.model_type)){const s=(t==null?void 0:t.prefix)??"past_key_values",n=s==="present"?"present":"past",a={},{layer_types:o,num_attention_heads:i,num_key_value_heads:l,hidden_size:c,conv_L_cache:d}=e,h=c/i;for(let _=0;_<o.length;++_)if(o[_]==="full_attention")for(const p of["key","value"])a[`${s}.${_}.${p}`]=[r,l,0,h];else if(o[_]==="conv")a[`${n}_conv.${_}`]=[r,c,d];else throw new Error(`Unsupported layer type: ${o[_]}`);return a}else if(["granitemoehybrid","falcon_h1","nemotron_h"].includes(e.model_type)){const s=(t==null?void 0:t.prefix)??"past_key_values",n=s==="present"?"present":"past",a=e,o=a.layer_types??a.layers_block_type,i=a.num_hidden_layers??(o==null?void 0:o.length),l=a.num_key_value_heads,c=a.head_dim??a.hidden_size/a.num_attention_heads,d=a.mamba_n_heads??a.mamba_num_heads,h=a.mamba_d_head??a.mamba_head_dim,_=a.mamba_d_state??a.ssm_state_size,p=a.mamba_n_groups??a.n_groups,w=a.mamba_d_conv??a.conv_kernel,y=(a.mamba_d_ssm??(a.mamba_expand?a.mamba_expand*a.hidden_size:d*h))+2*p*_,M={};for(let T=0;T<i;++T)if((!o||o[T]==="mamba")&&(M[`${n}_conv.${T}`]=[r,y,w],M[`${n}_ssm.${T}`]=[r,d,h,_]),!o||o[T]==="attention")for(const A of["key","value"])M[`${s}.${T}.${A}`]=[r,l,0,c];return M}else if(["qwen3_next","qwen3_5_text","qwen3_5_moe_text","olmo_hybrid"].includes(e.model_type)){const s=(t==null?void 0:t.prefix)??"past_key_values",n=s==="present"?"present":"past",a={},{head_dim:o,layer_types:i,num_attention_heads:l,num_key_value_heads:c,hidden_size:d,linear_num_value_heads:h,linear_num_key_heads:_,linear_key_head_dim:p,linear_value_head_dim:w,linear_conv_kernel_dim:v}=e,y=p*_,M=w*h,T=o??d/l;for(let A=0;A<i.length;++A)if(i[A]==="full_attention")for(const C of["key","value"])a[`${s}.${A}.${C}`]=[r,c,0,T];else if(i[A]==="linear_attention"){if(e.model_type==="olmo_hybrid")a[`${n}_conv.${A}.key`]=[r,y,v],a[`${n}_conv.${A}.value`]=[r,M,v],a[`${n}_conv.${A}.query`]=[r,y,v];else{const 
C=y*2+M;a[`${n}_conv.${A}`]=[r,C,v]}a[`${n}_recurrent.${A}`]=[r,h,p,w]}else throw new Error(`Unsupported layer type: ${i[A]}`);return a}else if(["gemma4","gemma4_text"].includes(e.model_type)){const s=e.model_type==="gemma4"?e.text_config:e,n=(t==null?void 0:t.prefix)??"past_key_values",a={},o=s.num_hidden_layers,i=s.num_kv_shared_layers??0,l=o-i,c=s.num_key_value_heads,d=s.head_dim,h=s.global_head_dim??d,_=s.layer_types??[];for(let p=0;p<l;++p){const w=_[p]==="full_attention"?h:d;for(const v of["key","value"])a[`${n}.${p}.${v}`]=[r,c,0,w]}return a}else if(["lfm2_vl","qwen3_5","qwen3_5_moe","voxtral_realtime"].includes(e.model_type)){let s;return e.model_type==="voxtral_realtime"&&(t==null?void 0:t.session_name)==="audio_encoder"?s=e.audio_config:s=e.text_config,va(s,t)}return qT(e,t)}function qT(e,{prefix:t="past_key_values",batch_size:r=1}={}){const s={},n=e.normalized_config;if(n.is_encoder_decoder&&"num_encoder_heads"in n&&"num_decoder_heads"in n){const a=n.encoder_dim_kv??n.encoder_hidden_size/n.num_encoder_heads,o=n.decoder_dim_kv??n.decoder_hidden_size/n.num_decoder_heads,i=[r,n.num_encoder_heads,0,a],l=[r,n.num_decoder_heads,0,o];for(let c=0;c<n.num_decoder_layers;++c)s[`${t}.${c}.encoder.key`]=i,s[`${t}.${c}.encoder.value`]=i,s[`${t}.${c}.decoder.key`]=l,s[`${t}.${c}.decoder.value`]=l}else{const a=n.num_heads,o=n.num_layers,i=n.dim_kv??n.hidden_size/(n.num_attention_heads??a);if(n.model_type==="falcon"){const l=[r*a,0,i];for(let c=0;c<o;++c)s[`${t}.${c}.key`]=l,s[`${t}.${c}.value`]=l}else if(n.multi_query){const l=[r*a,0,2*i];for(let c=0;c<o;++c)s[`${t}.${c}.key_value`]=l}else if(n.model_type==="bloom"){const l=[r*a,i,0],c=[r*a,0,i];for(let d=0;d<o;++d)s[`${t}.${d}.key`]=l,s[`${t}.${d}.value`]=c}else if(n.model_type==="openelm")for(let l=0;l<o;++l){const c=[r,a[l],0,i];s[`${t}.${l}.key`]=c,s[`${t}.${l}.value`]=c}else{const l=[r,a,0,i];for(let c=0;c<o;++c)s[`${t}.${c}.key`]=l,s[`${t}.${c}.value`]=l}}return s}var Qi=class 
wd{constructor(t){k(this,"model_type",null);k(this,"is_encoder_decoder",!1);k(this,"max_position_embeddings");k(this,"transformers.js_config");Object.assign(this,t),this.normalized_config=ps(this)}static async from_pretrained(t,{progress_callback:r=null,config:s=null,cache_dir:n=null,local_files_only:a=!1,revision:o="main"}={}){s&&!(s instanceof wd)&&(s=new wd(s));const i=s??await jT(t,{progress_callback:r,config:s,cache_dir:n,local_files_only:a,revision:o});return new this(i)}},nn=class{static async from_pretrained(...e){return Qi.from_pretrained(...e)}};function X_(e,t,r){return e?typeof e=="object"&&e!==null?e.hasOwnProperty(t)?+e[t]:e.hasOwnProperty(r)?+e[r]:0:+e:0}function Y_(e,t){const r=[];for(let s=0;s<t;++s)r.push(`${e}_data${s===0?"":"_"+s}`);return r}async function WT(e,t,r,s){const n=`${t}${s}.onnx`,a=`${r.subfolder??""}/${n}`;return await ua(e,a,!0,r,fe.IS_NODE_ENV)}async function HT(e,t,r,s,n,a={}){const o=`${t}${r}.onnx`,i=fe.IS_NODE_ENV;let l=[];const c=X_(n,o,t);if(c>0){if(c>yf)throw new Error(`The number of external data chunks (${c}) exceeds the maximum allowed value (${yf}).`);const d=Y_(o,c);for(const h of d){const _=`${s.subfolder??""}/${h}`;l.push(new Promise(async(p,w)=>{const v=await ua(e,_,!0,s,i);p(v instanceof Uint8Array?{path:h,data:v}:h)}))}}else a.externalData!==void 0&&(l=a.externalData.map(async d=>{if(typeof d.data=="string"){const h=await ua(e,d.data,!0,s);return{...d,data:h}}return d}));return Promise.all(l)}async function QT(e,t,r,s=!1,n=void 0){var C;let a=((C=r.config)==null?void 0:C["transformers.js_config"])??{};const o=$f(r.device??a.device,t,{warn:S=>ue.info(S)}),i=wx(o),l=a.device_config??{};l.hasOwnProperty(o)&&(a={...a,...l[o]});const c=jf(r.dtype??a.dtype,t,o,{configDtype:a.dtype,warn:S=>ue.info(S)});if(Ai.hasOwnProperty(c)){if(o==="webgpu"&&!fe.IS_NODE_ENV&&c===We.fp16&&!await bx())throw new Error(`The device (${o}) does not support fp16.`)}else throw new Error(`Invalid dtype: ${c}. 
Should be one of: ${Object.keys(We).join(", ")}`);const d=a.kv_cache_dtype,h=d?typeof d=="string"?d:d[c]??"float32":void 0;if(h&&!["float32","float16"].includes(h))throw new Error(`Invalid kv_cache_dtype: ${h}. Should be one of: float32, float16`);const _=Ai[c],p={...r.session_options};p.executionProviders??(p.executionProviders=i);const w=a.free_dimension_overrides;w?p.freeDimensionOverrides??(p.freeDimensionOverrides=w):o.startsWith("webnn")&&!p.freeDimensionOverrides&&ue.warn(`WebNN does not currently support dynamic shapes and requires 'free_dimension_overrides' to be set in config.json, preferably as a field within config["transformers.js_config"]["device_config"]["${o}"]. When 'free_dimension_overrides' is not set, you may experience significant performance degradation.`);const v=WT(e,t,r,_),y=r.use_external_data_format??a.use_external_data_format,M=await HT(e,t,_,r,y,p);if(M.length>0&&(!fe.IS_NODE_ENV||M.some(S=>typeof S!="string"))&&(p.externalData=M),s&&o==="webgpu"&&d!==!1){const S=va(r.config,{prefix:"present",session_name:n});if(Object.keys(S).length>0&&!Ti()){const N={};for(const x in S)N[x]="gpu-buffer";p.preferredOutputLocation=N}}return{buffer_or_path:await v,session_options:p,session_config:{dtype:c,kv_cache_dtype:h,device:o}}}async function XT(e,t,r,s=void 0){return Object.fromEntries(await Promise.all(Object.keys(t).map(async n=>{const a=(s==null?void 0:s[n])??!1,{buffer_or_path:o,session_options:i,session_config:l}=await QT(e,t[n],r,a,n),c=await zf(o,i,l);return[n,c]})))}function J_(e){for(let t in e)Gf(e[t])?e[t]=new U(e[t]):typeof e[t]=="object"&&J_(e[t]);return e}async function xe(e,t){const r=YT(e,t);try{const s=Object.fromEntries(Object.entries(r).map(([a,o])=>{const i=o.ort_tensor;return fe.IS_NODE_ENV&&typeof Float16Array<"u"&&i.cpuData instanceof Float16Array&&(i.cpuData=new Uint16Array(i.cpuData.buffer)),[a,i]})),n=await Rf(e,s);return J_(n)}catch(s){const n=Object.fromEntries(Object.entries(r).map(([a,o])=>{const 
i={type:o.type,dims:o.dims,location:o.location};return i.location!=="gpu-buffer"&&(i.data=o.data),[a,i]}));throw ue.error(`An error occurred during model execution: "${s}".`),ue.error("Inputs given to model:",n),s}}function YT(e,t){const r=Object.create(null),s=[];for(const o of e.inputNames){const i=t[o];if(!(i instanceof U)){s.push(o);continue}r[o]=Ti()?i.clone():i}if(s.length>0)throw new Error(`An error occurred during model execution: "Missing the following inputs: ${s.join(", ")}.`);const n=Object.keys(t).length,a=e.inputNames.length;if(n>a){let o=Object.keys(t).filter(i=>!e.inputNames.includes(i));ue.warn(`WARNING: Too many inputs were provided (${n} > ${a}). The following inputs will be ignored: "${o.join(", ")}".`)}return r}var Ye=class{},ie=class extends Ye{constructor({logits:e,...t}){super(),this.logits=e;const r=Object.values(t);r.length>0&&(this.attentions=r)}},He=class extends Ye{constructor({logits:e}){super(),this.logits=e}},Je=class extends Ye{constructor({logits:e}){super(),this.logits=e}},ht=class extends Ye{constructor({start_logits:e,end_logits:t}){super(),this.start_logits=e,this.end_logits=t}},Xr=class extends Ye{constructor({logits:e}){super(),this.logits=e}},JT=class extends Ye{constructor({alphas:e}){super(),this.alphas=e}},qt=class extends vt{_call(e,t){throw Error("`_call` should be implemented in a subclass")}},KT=class extends vt{_call(e,t){throw Error("`_call` should be implemented in a subclass")}},Xi=class extends vt{constructor(){super(),this.processors=[]}push(e){this.processors.push(e)}extend(e){this.processors.push(...e)}_call(e,t){let r=t;for(const s of this.processors)r=s(e,r);return r}[Symbol.iterator](){return this.processors.values()}},ZT=class extends qt{constructor(e){super(),this.bos_token_id=e}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length===1){const s=t[r].data;s.fill(-1/0),s[this.bos_token_id]=0}return t}},eE=class extends 
qt{constructor(e,t){super(),this.max_length=e,this.eos_token_id=Array.isArray(t)?t:[t]}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length===this.max_length-1){const s=t[r].data;s.fill(-1/0);for(const n of this.eos_token_id)s[n]=0}return t}},tE=class extends qt{constructor(e){super(),this.suppress_tokens=e}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data;for(const n of this.suppress_tokens)s[n]=-1/0}return t}},K_=class extends qt{constructor(e,t){super(),this.begin_suppress_tokens=e,this.begin_index=t}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length===this.begin_index){const s=t[r].data;for(const n of this.begin_suppress_tokens)s[n]=-1/0}return t}},rE=class extends qt{constructor(e,t){super(),this.eos_token_id=Array.isArray(e.eos_token_id)?e.eos_token_id[0]:e.eos_token_id,this.no_timestamps_token_id=e.no_timestamps_token_id,this.timestamp_begin=this.no_timestamps_token_id+1,this.begin_index=t.length,t.at(-1)===this.no_timestamps_token_id&&(this.begin_index-=1),this.max_initial_timestamp_index=e.max_initial_timestamp_index}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data;if(s[this.no_timestamps_token_id]=-1/0,e[r].length===this.begin_index){s.subarray(0,this.timestamp_begin).fill(-1/0);continue}const n=e[r].slice(this.begin_index),a=n.length>=1&&n[n.length-1]>=this.timestamp_begin,o=n.length<2||n[n.length-2]>=this.timestamp_begin;if(a&&(o?s.subarray(this.timestamp_begin).fill(-1/0):s.subarray(0,this.eos_token_id).fill(-1/0)),e[r].length===this.begin_index&&this.max_initial_timestamp_index!==null){const d=this.timestamp_begin+this.max_initial_timestamp_index;s.subarray(d+1).fill(-1/0)}const i=ax(s),l=Math.log(i.subarray(this.timestamp_begin).map(Math.exp).reduce((d,h)=>d+h)),c=je(i.subarray(0,this.timestamp_begin))[0];l>c&&s.subarray(0,this.timestamp_begin).fill(-1/0)}return t}},sE=class extends qt{constructor(e){super(),this.no_repeat_ngram_size=e}getNgrams(e){const t=e.length,r=[];for(let n=0;n<t+1-this.no_repeat_ngram_size;++n){const 
a=[];for(let o=0;o<this.no_repeat_ngram_size;++o)a.push(e[n+o]);r.push(a.map(Number))}const s=new Map;for(const n of r){const a=n.slice(0,n.length-1),o=JSON.stringify(a),i=s.get(o)??[];i.push(n[n.length-1]),s.set(o,i)}return s}getGeneratedNgrams(e,t){const r=t.slice(t.length+1-this.no_repeat_ngram_size,t.length);return e.get(JSON.stringify(r.map(Number)))??[]}calcBannedNgramTokens(e){const t=[];if(e.length+1<this.no_repeat_ngram_size)return t;{const r=this.getNgrams(e);return this.getGeneratedNgrams(r,e)}}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data,n=this.calcBannedNgramTokens(e[r]);for(const a of n)s[a]=-1/0}return t}},nE=class extends qt{constructor(e){super(),this.penalty=e}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data;for(const n of new Set(e[r])){const a=Number(n);s[a]<0?s[a]*=this.penalty:s[a]/=this.penalty}}return t}},aE=class extends qt{constructor(e,t){super(),this.min_length=e,this.eos_token_id=Array.isArray(t)?t:[t]}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length<this.min_length){const s=t[r].data;for(const n of this.eos_token_id)s[n]=-1/0}return t}},oE=class extends qt{constructor(e,t,r){super(),this.prompt_length_to_skip=e,this.min_new_tokens=t,this.eos_token_id=Array.isArray(r)?r:[r]}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length-this.prompt_length_to_skip<this.min_new_tokens){const n=t[r].data;for(const a of this.eos_token_id)n[a]=-1/0}return t}},iE=class extends qt{constructor(e,t){super(),this.bad_words_ids=e,this.eos_token_id=Array.isArray(t)?t:[t]}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data,n=e[r];for(const a of this.bad_words_ids){if(n.length<a.length-1)continue;let o=!0;for(let i=1;i<=a.length-1;++i)if(a.at(-i-1)!=n.at(-i)){o=!1;break}o&&(s[a.at(-1)]=-1/0)}}return t}},lE=class extends qt{constructor(e){if(super(),e<=1)throw new Error(`Require guidance scale >1 to use the classifier free guidance processor, got guidance scale ${e}.`);this.guidance_scale=e}_call(e,t){if(t.dims[0]!==2*e.length)throw 
new Error(`Logits should have twice the batch size of the input ids, the first half of batches corresponding to the conditional inputs, and the second half of batches corresponding to the unconditional inputs. Got batch size ${t.dims[0]} for the logits and ${e.length} for the input ids.`);const r=e.length,s=t.slice([0,r],null),n=t.slice([r,t.dims[0]],null);for(let a=0;a<n.data.length;++a)n.data[a]+=(s.data[a]-n.data[a])*this.guidance_scale;return n}},cE=class extends KT{constructor(e){super(),this.temperature=e}_call(e,t){const r=t.data;for(let s=0;s<r.length;++s)r[s]/=this.temperature;return t}},Z_=class{constructor(e){k(this,"max_length",20);k(this,"max_new_tokens",null);k(this,"min_length",0);k(this,"min_new_tokens",null);k(this,"early_stopping",!1);k(this,"max_time",null);k(this,"do_sample",!1);k(this,"num_beams",1);k(this,"num_beam_groups",1);k(this,"penalty_alpha",null);k(this,"use_cache",!0);k(this,"temperature",1);k(this,"top_k",50);k(this,"top_p",1);k(this,"typical_p",1);k(this,"epsilon_cutoff",0);k(this,"eta_cutoff",0);k(this,"diversity_penalty",0);k(this,"repetition_penalty",1);k(this,"encoder_repetition_penalty",1);k(this,"length_penalty",1);k(this,"no_repeat_ngram_size",0);k(this,"bad_words_ids",null);k(this,"force_words_ids",null);k(this,"renormalize_logits",!1);k(this,"constraints",null);k(this,"forced_bos_token_id",null);k(this,"forced_eos_token_id",null);k(this,"remove_invalid_values",!1);k(this,"exponential_decay_length_penalty",null);k(this,"suppress_tokens",null);k(this,"streamer",null);k(this,"begin_suppress_tokens",null);k(this,"forced_decoder_ids",null);k(this,"guidance_scale",null);k(this,"num_return_sequences",1);k(this,"output_attentions",!1);k(this,"output_hidden_states",!1);k(this,"output_scores",!1);k(this,"return_dict_in_generate",!1);k(this,"pad_token_id",null);k(this,"bos_token_id",null);k(this,"eos_token_id",null);k(this,"encoder_no_repeat_ngram_size",0);k(this,"decoder_start_token_id",null);k(this,"generation_kwargs",{});Object.assi
gn(this,rt(e,Object.getOwnPropertyNames(this)))}},ya=class extends vt{_call(e,t){throw Error("StoppingCriteria needs to be subclassed")}},ep=class Lv extends vt{constructor(){super(),this.criteria=[]}push(t){this.criteria.push(t)}extend(t){t instanceof Lv?t=t.criteria:t instanceof ya&&(t=[t]),this.criteria.push(...t)}_call(t,r){const s=new Array(t.length).fill(!1);for(const n of this.criteria){const a=n(t,r);for(let o=0;o<s.length;++o)s[o]||(s[o]=a[o])}return s}[Symbol.iterator](){return this.criteria.values()}},uE=class extends ya{constructor(e,t=null){super(),this.max_length=e,this.max_position_embeddings=t}_call(e){return e.map(t=>t.length>=this.max_length)}},dE=class extends ya{constructor(e){super(),Array.isArray(e)||(e=[e]),this.eos_token_id=e}_call(e,t){return e.map(r=>{const s=r.at(-1);return this.eos_token_id.some(n=>s==n)})}},ba=class extends vt{constructor(e){super(),this.generation_config=e}async _call(e){return this.sample(e)}async sample(e){throw Error("sample should be implemented in subclasses.")}getLogits(e,t){let r=e.dims.at(-1),s=e.data;if(t===-1)s=s.slice(-r);else{let n=t*r;s=s.slice(n,n+r)}return s}randomSelect(e){return VM(e)}static getSampler(e){if(e.do_sample)return new fE(e);if(e.num_beams>1)return new _E(e);if(e.num_return_sequences>1)throw Error(`num_return_sequences has to be 1 when doing greedy search, but is ${e.num_return_sequences}.`);return new hE(e)}},hE=class extends ba{async sample(e){const t=je(e.data)[1];return[[BigInt(t),0]]}},fE=class extends ba{async sample(e){let t=e.dims.at(-1);this.generation_config.top_k>0&&(t=Math.min(this.generation_config.top_k,t));const[r,s]=await hs(e,t),n=nt(r.data);return Array.from({length:this.generation_config.num_beams},()=>{const a=this.randomSelect(n);return[s.data[a],Math.log(n[a])]})}},_E=class extends ba{async sample(e){let t=e.dims.at(-1);this.generation_config.top_k>0&&(t=Math.min(this.generation_config.top_k,t));const[r,s]=await hs(e,t),n=nt(r.data);return 
Array.from({length:this.generation_config.num_beams},(a,o)=>[s.data[o],Math.log(n[o])])}},pE=class{constructor(e){if(e)for(const t in e){if(t in this)throw new TypeError(`Key "${t}" conflicts with an existing property on DynamicCache`);const r=e[t];if(!(r instanceof U))throw new TypeError(`Expected a Tensor for key "${t}", got ${typeof r}`);this[t]=r}}get_seq_length(){const e=this;if(Object.keys(e).length===0)return 0;for(const t in e)if(t.startsWith("past_key_values."))return e[t].dims.at(-2);throw new Error("Unable to determine sequence length from the cache.")}update(e){for(const t in e){const r=this[t],s=e[t];r&&r!==s&&r.location==="gpu-buffer"&&r.dispose(),this[t]=s}}async dispose(){const e=[];for(const t of Object.values(this))t.location==="gpu-buffer"&&e.push(t.dispose());await Promise.all(e)}},Yi=pE,q={EncoderOnly:0,EncoderDecoder:1,Seq2Seq:2,Vision2Seq:3,DecoderOnly:4,DecoderOnlyWithoutHead:5,MaskGeneration:6,ImageTextToText:7,Musicgen:8,MultiModality:9,Phi3V:10,AudioTextToText:11,AutoEncoder:12,ImageAudioTextToText:13,Supertonic:14,Chatterbox:15,VoxtralRealtime:16},hr={[q.DecoderOnly]:{sessions:(e,t)=>({model:t.model_file_name??"model"}),cache_sessions:{model:!0},optional_configs:{generation_config:"generation_config.json"}},[q.DecoderOnlyWithoutHead]:{sessions:(e,t)=>({model:t.model_file_name??"model"})},[q.Seq2Seq]:{sessions:()=>({model:"encoder_model",decoder_model_merged:"decoder_model_merged"}),cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.Vision2Seq]:{sessions:()=>({model:"encoder_model",decoder_model_merged:"decoder_model_merged"}),cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.Musicgen]:{sessions:()=>({model:"text_encoder",decoder_model_merged:"decoder_model_merged",encodec_decode:"encodec_decode"}),cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.EncoderDecoder]:{sessions
:()=>({model:"encoder_model",decoder_model_merged:"decoder_model_merged"}),cache_sessions:{decoder_model_merged:!0}},[q.MaskGeneration]:{sessions:()=>({model:"vision_encoder",prompt_encoder_mask_decoder:"prompt_encoder_mask_decoder"})},[q.ImageTextToText]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...hr[q.ImageTextToText].text_only_sessions};return r||(s.vision_encoder="vision_encoder"),e.is_encoder_decoder&&(s.model="encoder_model"),s},cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.AudioTextToText]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...hr[q.AudioTextToText].text_only_sessions};return r||(s.audio_encoder="audio_encoder"),s},cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.ImageAudioTextToText]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...hr[q.ImageAudioTextToText].text_only_sessions};return 
r||(s.audio_encoder="audio_encoder",s.vision_encoder="vision_encoder"),s},optional_configs:{generation_config:"generation_config.json"}},[q.Phi3V]:{sessions:()=>({prepare_inputs_embeds:"prepare_inputs_embeds",model:"model",vision_encoder:"vision_encoder"}),cache_sessions:{model:!0},optional_configs:{generation_config:"generation_config.json"}},[q.MultiModality]:{sessions:()=>({prepare_inputs_embeds:"prepare_inputs_embeds",model:"language_model",lm_head:"lm_head",gen_head:"gen_head",gen_img_embeds:"gen_img_embeds",image_decode:"image_decode"}),cache_sessions:{model:!0},optional_configs:{generation_config:"generation_config.json"}},[q.AutoEncoder]:{sessions:()=>({encoder_model:"encoder_model",decoder_model:"decoder_model"})},[q.Supertonic]:{sessions:()=>({text_encoder:"text_encoder",latent_denoiser:"latent_denoiser",voice_decoder:"voice_decoder"})},[q.Chatterbox]:{sessions:()=>({embed_tokens:"embed_tokens",speech_encoder:"speech_encoder",model:"language_model",conditional_decoder:"conditional_decoder"}),cache_sessions:{model:!0},optional_configs:{generation_config:"generation_config.json"}},[q.VoxtralRealtime]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...hr[q.VoxtralRealtime].text_only_sessions};return r||(s.audio_encoder="audio_encoder"),s},cache_sessions:{decoder_model_merged:!0,audio_encoder:!0},optional_configs:{generation_config:"generation_config.json"}},default:{sessions:(e,t)=>({model:t.model_file_name??"model"})}};function mE(e){const t=hr[e];return(t==null?void 0:t.text_only_sessions)??null}function gE(e,t,r={}){const s=hr[e]??hr.default;return{sessions:s.sessions(t,r,r.textOnly??!1),cache_sessions:s.cache_sessions,optional_configs:s.optional_configs}}function tp(e,{warn:t=!0}={}){const r=e.architectures||[];for(const s of r){const n=fr.get(s);if(n!==void 0)return n}if(e.model_type){const s=fr.get(e.model_type);if(s!==void 0)return s;for(const n of 
Object.values(ms))if(n.has(e.model_type)){const a=fr.get(n.get(e.model_type));if(a!==void 0)return a}}if(t){const s=r.length>0?r.join(", "):"(none)";ue.warn(`[resolve_model_type] Architecture(s) not found in MODEL_TYPE_MAPPING: [${s}] for model type '${e.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${fa}`)}return q.EncoderOnly}function rp(e,{config:t=null,cache_dir:r=null,local_files_only:s=!1,revision:n="main"}={}){if(t!==null)return nn.from_pretrained(e,{config:t,cache_dir:r,local_files_only:s,revision:n});const a=JSON.stringify([e,r,s,n]);return Ef(a,()=>nn.from_pretrained(e,{config:t,cache_dir:r,local_files_only:s,revision:n}))}async function sp(e,{config:t=null,dtype:r=null,device:s=null,model_file_name:n=null}={}){t=await rp(e,{config:t});const a=["config.json"],o=t["transformers.js_config"]??{},i=o.use_external_data_format,l="onnx",c=s??o.device;let d=r??o.dtype;const h=tp(t),_=(v,y=null)=>{y=y??v;const M=$f(c,v),T=jf(d,v,M),A=Ai[T]??"",C=`${y}${A}.onnx`,S=`${l}/${C}`;a.push(S);const N=X_(i,C,v);for(const x of Y_(C,N)){const R=`${l}/${x}`;a.push(R)}},{sessions:p,optional_configs:w}=gE(h,t,{model_file_name:n});for(const[v,y]of Object.entries(p))_(v,y);if(w)for(const v of Object.values(w))a.push(v);return a}var ms=null;function wE(e){ms=e}function Ji(e){if(e instanceof U)return e;if(e.length===0)throw Error("items must be non-empty");if(Array.isArray(e[0])){if(e.some(t=>t.length!==e[0].length))throw Error("Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.");return new U("int64",BigInt64Array.from(e.flat().map(t=>BigInt(t))),[e.length,e[0].length])}else return new U("int64",BigInt64Array.from(e.map(t=>BigInt(t))),[1,e.length])}function np(e){return new U("bool",[e],[1])}var 
ap={[q.DecoderOnly]:{can_generate:!0,forward:_r,prepare_inputs:an},[q.DecoderOnlyWithoutHead]:{can_generate:!1,forward:_r,prepare_inputs:an},[q.Seq2Seq]:{can_generate:!0,forward:Ma,prepare_inputs:xa},[q.Vision2Seq]:{can_generate:!0,forward:Ma,prepare_inputs:xa},[q.Musicgen]:{can_generate:!0,forward:Ma},[q.EncoderDecoder]:{can_generate:!1,forward:Ma},[q.ImageTextToText]:{can_generate:!0,forward:ME,prepare_inputs:ka},[q.AudioTextToText]:{can_generate:!0,forward:bE,prepare_inputs:ka},[q.ImageAudioTextToText]:{can_generate:!0,prepare_inputs:ka},[q.Phi3V]:{can_generate:!0,prepare_inputs:ka},[q.MultiModality]:{can_generate:!0},[q.AutoEncoder]:{can_generate:!1,forward:vE},[q.Chatterbox]:{can_generate:!0,forward:Lr},[q.VoxtralRealtime]:{can_generate:!0,prepare_inputs:an},default:{can_generate:!1,forward:Lr}};function op(e,t){var i;let r=fr.get(e),s=!1;const n=(i=t==null?void 0:t.architectures)==null?void 0:i[0];if(n&&n!==e&&(e!=null&&e.endsWith("ForCausalLM"))&&n.endsWith("ForConditionalGeneration")){const l=fr.get(n);l!==void 0&&(r=l,s=!0)}const a=ap[r]??ap.default,o=hr[r]??hr.default;return{typeConfig:{...a,...o},textOnly:s,modelType:r}}var fr=new Map,Ki=new Map,gs=new Map,P=class extends vt{constructor(t,r,s){super();k(this,"main_input_name","input_ids");k(this,"forward_params",["input_ids","attention_mask"]);k(this,"_return_dict_in_generate_keys",null);this.config=t,this.sessions=r,this.configs=s;const n=gs.get(this.constructor),{typeConfig:a}=op(n,t);this.can_generate=a.can_generate,this._forward=a.forward,this._prepare_inputs_for_generation=a.prepare_inputs,this.can_generate&&this.forward_params.push("past_key_values"),this.custom_config=this.config["transformers.js_config"]??{}}async dispose(){var r;const t=[];for(const s of Object.values(this.sessions))t.push((r=s.release)==null?void 0:r.call(s));return await Promise.all(t)}static async 
from_pretrained(t,{progress_callback:r=null,config:s=null,cache_dir:n=null,local_files_only:a=!1,revision:o="main",model_file_name:i=null,subfolder:l="onnx",device:c=null,dtype:d=null,use_external_data_format:h=null,session_options:_={}}={}){const p={progress_callback:r,config:s,cache_dir:n,local_files_only:a,revision:o,model_file_name:i,subfolder:l,device:c,dtype:d,use_external_data_format:h,session_options:_},w=gs.get(this);s=p.config=await nn.from_pretrained(t,p);const{typeConfig:v,textOnly:y,modelType:M}=op(w,s);if(M===void 0){const S=w??(s==null?void 0:s.model_type);S!=="custom"&&ue.warn(`Model type for '${S}' not found, assuming encoder-only architecture. Please report this at ${fa}.`)}if(r&&!(r instanceof _i)){const S={};try{const N=await sp(t,{config:s,dtype:d,device:c,model_file_name:i});(await Promise.all(N.map(R=>Zs(t,R,p)))).forEach((R,z)=>{if(R.exists){const $=N[z]==="config.json";S[N[z]]={loaded:$?R.size??0:0,total:R.size??0}}})}catch(N){ue.warn(`Unable to fetch model file metadata for total progress tracking: ${N}`)}Object.keys(S).length>0&&(p.progress_callback=new _i(r,S))}const T=v.sessions(s,p,y),A=[XT(t,T,p,v.cache_sessions)];v.optional_configs&&A.push(kE(t,v.optional_configs,p));const C=await Promise.all(A);return new this(s,...C)}async _call(t){return await this.forward(t)}async forward(t){return await this._forward(this,t)}get generation_config(){var t;return((t=this.configs)==null?void 0:t.generation_config)??null}_get_logits_processor(t,r,s=null){const n=new Xi;if(t.repetition_penalty!==null&&t.repetition_penalty!==1&&n.push(new nE(t.repetition_penalty)),t.no_repeat_ngram_size!==null&&t.no_repeat_ngram_size>0&&n.push(new sE(t.no_repeat_ngram_size)),t.bad_words_ids!==null&&n.push(new iE(t.bad_words_ids,t.eos_token_id)),t.min_length!==null&&t.eos_token_id!==null&&t.min_length>0&&n.push(new aE(t.min_length,t.eos_token_id)),t.min_new_tokens!==null&&t.eos_token_id!==null&&t.min_new_tokens>0&&n.push(new 
oE(r,t.min_new_tokens,t.eos_token_id)),t.forced_bos_token_id!==null&&n.push(new ZT(t.forced_bos_token_id)),t.forced_eos_token_id!==null&&n.push(new eE(t.max_length,t.forced_eos_token_id)),t.suppress_tokens!==null&&n.push(new tE(t.suppress_tokens)),t.begin_suppress_tokens!==null){const a=r>1||t.forced_bos_token_id===null?r:r+1;n.push(new K_(t.begin_suppress_tokens,a))}return t.guidance_scale!==null&&t.guidance_scale>1&&n.push(new lE(t.guidance_scale)),t.temperature===0&&t.do_sample&&(ue.warn("`do_sample` changed to false because `temperature: 0` implies greedy sampling (always selecting the most likely token), which is incompatible with `do_sample: true`."),t.do_sample=!1),t.do_sample&&t.temperature!==null&&t.temperature!==1&&n.push(new cE(t.temperature)),s!==null&&n.extend(s),n}_prepare_generation_config(t,r,s=Z_){const n={...this.config};for(const o of["decoder","generator","text_config"])o in n&&Object.assign(n,n[o]);const a=new s(n);return Object.assign(a,this.generation_config??{}),t&&Object.assign(a,t),r&&Object.assign(a,rt(r,Object.getOwnPropertyNames(a))),a}_get_stopping_criteria(t,r=null){const s=new ep;return t.max_length!==null&&s.push(new uE(t.max_length,this.config.max_position_embeddings??null)),t.eos_token_id!==null&&s.push(new dE(t.eos_token_id)),r&&s.extend(r),s}_validate_model_class(){if(!this.can_generate){const t=[ms.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,ms.MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES,ms.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,ms.MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES].filter(Boolean),r=gs.get(this.constructor),s=new Set,n=this.config.model_type;for(const o of t){const i=o==null?void 0:o.get(n);i&&s.add(i)}let a=`The current model class (${r}) is not compatible with \`.generate()\`, as it doesn't have a language model head.`;throw s.size>0&&(a+=` Please use the following class instead: ${[...s].join(", ")}`),Error(a)}}prepare_inputs_for_generation(...t){if(!this._prepare_inputs_for_generation)throw new 
Error("prepare_inputs_for_generation is not implemented for this model.");return this._prepare_inputs_for_generation(this,...t)}_update_model_kwargs_for_generation({generated_input_ids:t,outputs:r,model_inputs:s,is_encoder_decoder:n}){return s.past_key_values=Zi(r,s.past_key_values),s.input_ids=new U("int64",t.flat(),[t.length,1]),n?"decoder_attention_mask"in s&&(s.decoder_attention_mask=ze([s.decoder_attention_mask,yt([s.decoder_attention_mask.dims[0],1])],1)):s.attention_mask=ze([s.attention_mask,yt([s.attention_mask.dims[0],1])],1),s.position_ids=null,s}_prepare_model_inputs({inputs:t,bos_token_id:r,model_kwargs:s}){const n=rt(s,this.forward_params),a=this.main_input_name;if(a in n){if(t)throw new Error("`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. Make sure to either pass {inputs} or {input_name}=...")}else n[a]=t;return{inputs_tensor:n[a],model_inputs:n,model_input_name:a}}async _prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:t,model_inputs:r,model_input_name:s,generation_config:n}){if(this.sessions.model.inputNames.includes("inputs_embeds")&&!r.inputs_embeds&&"_prepare_inputs_embeds"in this){const{input_ids:o,pixel_values:i,attention_mask:l,...c}=r,d=await this._prepare_inputs_embeds(r);r={...c,...rt(d,["inputs_embeds","attention_mask"])}}let{last_hidden_state:a}=await Lr(this,r);if(n.guidance_scale!==null&&n.guidance_scale>1)a=ze([a,Fi(a,0)],0),"attention_mask"in r&&(r.attention_mask=ze([r.attention_mask,Jf(r.attention_mask)],0));else if(r.decoder_input_ids){const o=Ji(r.decoder_input_ids).dims[0];if(o!==a.dims[0]){if(a.dims[0]!==1)throw new Error(`The encoder outputs have a different batch size (${a.dims[0]}) than the decoder inputs (${o}).`);a=ze(Array.from({length:o},()=>a),0)}}return r.encoder_outputs=a,r}_prepare_decoder_input_ids_for_generation({batch_size:t,model_input_name:r,model_kwargs:s,decoder_start_token_id:n,bos_token_id:a,generation_config:o}){let{decoder_input_ids:i,...l}=s;if(!(i instanceof 
U)){if(i)Array.isArray(i[0])||(i=Array.from({length:t},()=>i));else if(n??(n=a),this.config.model_type==="musicgen")i=Array.from({length:t*this.config.decoder.num_codebooks},()=>[n]);else if(Array.isArray(n)){if(n.length!==t)throw new Error(`\`decoder_start_token_id\` expcted to have length ${t} but got ${n.length}`);i=n}else i=Array.from({length:t},()=>[n]);i=Ji(i)}return l.decoder_attention_mask=Xf(i),{input_ids:i,model_inputs:l}}async generate({inputs:t=null,generation_config:r=null,logits_processor:s=null,stopping_criteria:n=null,streamer:a=null,...o}){this._validate_model_class(),r=this._prepare_generation_config(r,o);let{inputs_tensor:i,model_inputs:l,model_input_name:c}=this._prepare_model_inputs({inputs:t,model_kwargs:o});const d=this.config.is_encoder_decoder;d&&("encoder_outputs"in l||(l=await this._prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:i,model_inputs:l,model_input_name:c,generation_config:r})));let h;d?{input_ids:h,model_inputs:l}=this._prepare_decoder_input_ids_for_generation({batch_size:l[c].dims.at(0),model_input_name:c,model_kwargs:l,decoder_start_token_id:r.decoder_start_token_id,bos_token_id:r.bos_token_id,generation_config:r}):h=l[c];let _=h.dims.at(-1);r.max_new_tokens!==null&&(r.max_length=_+r.max_new_tokens);const p=this._get_logits_processor(r,_,s),w=this._get_stopping_criteria(r,n),v=l[c].dims.at(0),y=ba.getSampler(r),M=new Array(v).fill(0),T=h.tolist();a&&a.put(T);let A,C={},S={};for(;;){if(l=this.prepare_inputs_for_generation(T,l,r),A=await this.forward(l),r.return_dict_in_generate)if(r.output_attentions){const I=yE(A);for(const te in I)te in C||(C[te]=[]),C[te].push(I[te])}else this._return_dict_in_generate_keys&&Object.assign(S,rt(A,this._return_dict_in_generate_keys));const $=A.logits.slice(null,-1,null).to("float32"),Q=p(T,$),H=[];for(let I=0;I<Q.dims.at(0);++I){const te=Q[I],W=await y(te);for(const[ee,G]of W){const 
L=BigInt(ee);M[I]+=G,T[I].push(L),H.push([L]);break}}if(a&&a.put(H),w(T).every(I=>I))break;l=this._update_model_kwargs_for_generation({generated_input_ids:H,outputs:A,model_inputs:l,is_encoder_decoder:d})}a&&a.end();const N=new U("int64",T.flat(),[T.length,T[0].length]),x=Zi(A,l.past_key_values),R=new Set(Object.values(x));for(const $ of Object.values(A))$.location==="gpu-buffer"&&!R.has($)&&$.dispose();return"past_key_values"in o||r.return_dict_in_generate||await x.dispose(),r.return_dict_in_generate?{sequences:N,past_key_values:x,...C,...S}:N}async _encode_input(t,r,s){if(!Object.hasOwn(this.sessions,t))throw new Error(`Model does not have a ${t} session.`);const n=this.sessions[t];return(await xe(n,rt(r,n.inputNames)))[s]}async encode_image(t){return this._encode_input("vision_encoder",t,"image_features")}async encode_text(t){return this._encode_input("embed_tokens",t,"inputs_embeds")}async encode_audio(t){return this._encode_input("audio_encoder",t,"audio_features")}};async function Ma(e,t){let{encoder_outputs:r,input_ids:s,decoder_input_ids:n,decoder_attention_mask:a,...o}=t;if(!r){const i=rt(t,e.sessions.model.inputNames);r=(await Lr(e,i)).last_hidden_state}return o.input_ids=n,o.encoder_hidden_states=r,e.sessions.decoder_model_merged.inputNames.includes("encoder_attention_mask")&&(o.encoder_attention_mask=t.attention_mask),a&&!o.attention_mask&&(o.attention_mask=a),await _r(e,o,!0)}async function Lr(e,t){const r=e.sessions.model,s=rt(t,r.inputNames);if(r.inputNames.includes("inputs_embeds")&&!s.inputs_embeds){if(!t.input_ids)throw new Error("Both `input_ids` and `inputs_embeds` are missing in the model inputs.");s.inputs_embeds=await e.encode_text({input_ids:t.input_ids})}if(r.inputNames.includes("token_type_ids")&&!s.token_type_ids){if(!s.input_ids)throw new Error("Both `input_ids` and `token_type_ids` are missing in the model inputs.");s.token_type_ids=Jf(s.input_ids)}if(r.inputNames.includes("pixel_mask")&&!s.pixel_mask){if(!s.pixel_values)throw new 
Error("Both `pixel_values` and `pixel_mask` are missing in the model inputs.");const n=s.pixel_values.dims;s.pixel_mask=yt([n[0],n[2],n[3]])}return await xe(r,s)}async function vE(e,t){const r=await e.encode(t);return await e.decode(r)}function Zi(e,t){const r=Object.create(null);for(const s in e)if(s.startsWith("present")){const n=s.replace("present_ssm","past_ssm").replace("present_conv","past_conv").replace("present_recurrent","past_recurrent").replace("present","past_key_values");s.includes("encoder")&&t?r[n]=t[n]:r[n]=e[s]}return t?(t.update(r),t):new Yi(r)}function yE(e){const t={};for(const r of["cross_attentions","encoder_attentions","decoder_attentions"])for(const s in e)s.startsWith(r)&&(r in t||(t[r]=[]),t[r].push(e[s]));return t}function el(e,t,r){var c,d,h;if(r&&Object.keys(r).length>0)return Object.assign(t,r),r;const s=e.sessions.decoder_model_merged??e.sessions.model,n=((d=(c=t[e.main_input_name]??t.attention_mask)==null?void 0:c.dims)==null?void 0:d[0])??1,a=((h=s==null?void 0:s.config)==null?void 0:h.kv_cache_dtype)??"float32",o=a==="float16"?ds.float16:ds.float32,i=va(e.config,{batch_size:n}),l=Object.create(null);for(const _ in i){const p=i[_].reduce((v,y)=>v*y,1),w=new U(a,new o(p),i[_]);t[_]=w,l[_]=w}return r?(r.update(l),r):new Yi(l)}async function _r(e,t,r=!1){const s=e.sessions[r?"decoder_model_merged":"model"],{past_key_values:n,...a}=t;if(s.inputNames.includes("use_cache_branch")&&(a.use_cache_branch=np(n!=null&&Object.keys(n).length>0)),s.inputNames.includes("position_ids")&&a.attention_mask&&!a.position_ids){const i=["paligemma","gemma3_text","gemma3"].includes(e.config.model_type)?1:0;a.position_ids=xE(a,n,i)}s.inputNames.includes("num_logits_to_keep")&&!a.num_logits_to_keep&&(a.num_logits_to_keep=new U("int64",[0n],[])),el(e,a,n);const o=rt(a,s.inputNames);return await xe(s,o)}async function 
ip(e,{encode_function:t,merge_function:r,modality_input_names:s,modality_output_name:n,input_ids:a=null,attention_mask:o=null,position_ids:i=null,inputs_embeds:l=null,past_key_values:c=null,generation_config:d=null,logits_processor:h=null,..._}){if(!l){l=await e.encode_text({input_ids:a,..._});const w=rt(_,s);if(Object.keys(w).length>0){if(a.dims[1]!==1){const v=await t({...w,..._});({inputs_embeds:l,attention_mask:o}=r({[n]:v,inputs_embeds:l,input_ids:a,attention_mask:o}))}else if(c&&a.dims[1]===1){const v=a.dims[1],y=c.get_seq_length();o=ze([yt([a.dims[0],y]),o.slice(null,[o.dims[1]-v,o.dims[1]])],1)}}}if(!i&&["qwen2_vl","qwen2_vl_text","qwen2_5_vl","qwen2_5_vl_text","qwen3_vl","qwen3_vl_text","qwen3_vl_moe","qwen3_vl_moe_text","qwen3_5","qwen3_5_text","qwen3_5_moe","qwen3_5_moe_text","glm_ocr","glm_ocr_text"].includes(e.config.model_type)){const{image_grid_thw:w,video_grid_thw:v}=_;[i]=e.get_rope_index(a,w,v,o)}return await _r(e,{inputs_embeds:l,past_key_values:c,attention_mask:o,position_ids:i,generation_config:d,logits_processor:h},!0)}async function bE(e,t){return await ip(e,{...t,modality_input_names:["audio_values","input_features"],modality_output_name:"audio_features",encode_function:e.encode_audio.bind(e),merge_function:e._merge_input_ids_with_audio_features.bind(e)})}async function ME(e,t){return await ip(e,{...t,modality_input_names:["pixel_values"],modality_output_name:"image_features",encode_function:e.encode_image.bind(e),merge_function:e._merge_input_ids_with_image_features.bind(e)})}function lp(e,t=0){const[r,s]=e.dims,n=e.data,a=new BigInt64Array(n.length);for(let o=0;o<r;++o){const i=o*s;let l=BigInt(t);for(let c=0;c<s;++c){const d=i+c;n[d]===0n?a[d]=BigInt(1):(a[d]=l,l+=n[d])}}return{data:a,dims:e.dims}}function xE(e,t=null,r=0){const{input_ids:s,inputs_embeds:n,attention_mask:a}=e,{data:o,dims:i}=lp(a,r);let l=new U("int64",o,i);if(t){const c=-(s??n).dims.at(1);l=l.slice(null,[c,null])}return l}function an(e,t,r,s){const 
n=r.past_key_values?r.past_key_values.get_seq_length():0,a=e.sessions.decoder_model_merged??e.sessions.model;if(a!=null&&a.inputNames.includes("num_logits_to_keep")&&!r.num_logits_to_keep&&(r.num_logits_to_keep=new U("int64",[1n],[])),!r.attention_mask){let o;for(const i of["input_ids","inputs_embeds","position_ids"])if(r[i]){o=r[i].dims;break}if(!o)throw new Error("attention_mask is not provided, and unable to infer its shape from model inputs.");r.attention_mask=yt([o[0],n+o[1]])}if(r.past_key_values){const{input_ids:o,attention_mask:i}=r;i&&i.dims[1]>o.dims[1]||n<o.dims[1]&&(r.input_ids=o.slice(null,[n,null]))}return r}function xa(e,t,r,s){return r.past_key_values&&(t=t.map(n=>[n.at(-1)])),{...r,decoder_input_ids:Ji(t)}}function ka(e,...t){return e.config.is_encoder_decoder?xa(e,...t):an(e,...t)}function cp({modality_token_id:e,inputs_embeds:t,modality_features:r,input_ids:s,attention_mask:n}){const a=s.tolist().map(c=>c.reduce((d,h,_)=>(h==e&&d.push(_),d),[])),o=a.reduce((c,d)=>c+d.length,0),i=r.dims[0];if(o!==i)throw new Error(`Number of tokens and features do not match: tokens: ${o}, features ${i}`);let l=0;for(let c=0;c<a.length;++c){const d=a[c],h=t[c];for(let _=0;_<d.length;++_)h[d[_]].data.set(r[l++].data)}return{inputs_embeds:t,attention_mask:n}}function tl({image_token_id:e,inputs_embeds:t,image_features:r,input_ids:s,attention_mask:n}){return cp({modality_token_id:e,inputs_embeds:t,modality_features:r,input_ids:s,attention_mask:n})}function up({audio_token_id:e,inputs_embeds:t,audio_features:r,input_ids:s,attention_mask:n}){return cp({modality_token_id:e,inputs_embeds:t,modality_features:r,input_ids:s,attention_mask:n})}async function kE(e,t,r){return Object.fromEntries(await Promise.all(Object.keys(t).map(async s=>{const n=await Zt(e,t[s],!1,r);return[s,n]})))}var 
rl={};ns(rl,{ASTForAudioClassification:()=>DE,ASTModel:()=>NE,ASTPreTrainedModel:()=>ol,AfmoeForCausalLM:()=>LE,AfmoeModel:()=>FE,AfmoePreTrainedModel:()=>nl,AlbertForMaskedLM:()=>CE,AlbertForQuestionAnswering:()=>AE,AlbertForSequenceClassification:()=>EE,AlbertModel:()=>TE,AlbertPreTrainedModel:()=>on,ApertusForCausalLM:()=>PE,ApertusModel:()=>SE,ApertusPreTrainedModel:()=>sl,ArceeForCausalLM:()=>OE,ArceeModel:()=>IE,ArceePreTrainedModel:()=>al,BartForConditionalGeneration:()=>BE,BartForSequenceClassification:()=>RE,BartModel:()=>zE,BartPretrainedModel:()=>Ta,BeitForImageClassification:()=>$E,BeitModel:()=>GE,BeitPreTrainedModel:()=>il,BertForMaskedLM:()=>UE,BertForQuestionAnswering:()=>WE,BertForSequenceClassification:()=>jE,BertForTokenClassification:()=>qE,BertModel:()=>VE,BertPreTrainedModel:()=>ws,BlenderbotForConditionalGeneration:()=>QE,BlenderbotModel:()=>HE,BlenderbotPreTrainedModel:()=>ll,BlenderbotSmallForConditionalGeneration:()=>YE,BlenderbotSmallModel:()=>XE,BlenderbotSmallPreTrainedModel:()=>cl,BloomForCausalLM:()=>KE,BloomModel:()=>JE,BloomPreTrainedModel:()=>ul,CHMv2ForDepthEstimation:()=>o2,CHMv2PreTrainedModel:()=>pp,CLIPModel:()=>l2,CLIPPreTrainedModel:()=>Yr,CLIPSegForImageSegmentation:()=>f2,CLIPSegModel:()=>h2,CLIPSegPreTrainedModel:()=>dl,CLIPTextModel:()=>c2,CLIPTextModelWithProjection:()=>wp,CLIPVisionModel:()=>u2,CLIPVisionModelWithProjection:()=>d2,CamembertForMaskedLM:()=>e2,CamembertForQuestionAnswering:()=>s2,CamembertForSequenceClassification:()=>t2,CamembertForTokenClassification:()=>r2,CamembertModel:()=>ZE,CamembertPreTrainedModel:()=>vs,ChatterboxModel:()=>fp,ChatterboxPreTrainedModel:()=>hp,ChineseCLIPModel:()=>a2,ChineseCLIPPreTrainedModel:()=>_p,ClapAudioModelWithProjection:()=>gp,ClapModel:()=>i2,ClapPreTrainedModel:()=>Ea,ClapTextModelWithProjection:()=>mp,CodeGenForCausalLM:()=>p2,CodeGenModel:()=>_2,CodeGenPreTrainedModel:()=>hl,Cohere2ForCausalLM:()=>v2,Cohere2Model:()=>w2,Cohere2PreTrainedModel:()=>_l,CohereAsrForConditi
onalGeneration:()=>b2,CohereAsrModel:()=>y2,CohereAsrPreTrainedModel:()=>pl,CohereForCausalLM:()=>g2,CohereModel:()=>m2,CoherePreTrainedModel:()=>fl,ConvBertForMaskedLM:()=>x2,ConvBertForQuestionAnswering:()=>E2,ConvBertForSequenceClassification:()=>k2,ConvBertForTokenClassification:()=>T2,ConvBertModel:()=>M2,ConvBertPreTrainedModel:()=>ys,ConvNextForImageClassification:()=>C2,ConvNextModel:()=>A2,ConvNextPreTrainedModel:()=>ml,ConvNextV2ForImageClassification:()=>P2,ConvNextV2Model:()=>S2,ConvNextV2PreTrainedModel:()=>gl,DFineForObjectDetection:()=>O2,DFineModel:()=>I2,DFinePreTrainedModel:()=>vl,DINOv3ConvNextModel:()=>oA,DINOv3ConvNextPreTrainedModel:()=>Ap,DINOv3ViTModel:()=>iA,DINOv3ViTPreTrainedModel:()=>Cp,DPTForDepthEstimation:()=>pA,DPTModel:()=>_A,DPTPreTrainedModel:()=>Tl,DacDecoderModel:()=>Mp,DacDecoderOutput:()=>yp,DacEncoderModel:()=>bp,DacEncoderOutput:()=>vp,DacModel:()=>N2,DacPreTrainedModel:()=>Aa,DebertaForMaskedLM:()=>z2,DebertaForQuestionAnswering:()=>G2,DebertaForSequenceClassification:()=>B2,DebertaForTokenClassification:()=>R2,DebertaModel:()=>D2,DebertaPreTrainedModel:()=>bs,DebertaV2ForMaskedLM:()=>j2,DebertaV2ForQuestionAnswering:()=>H2,DebertaV2ForSequenceClassification:()=>q2,DebertaV2ForTokenClassification:()=>W2,DebertaV2Model:()=>U2,DebertaV2PreTrainedModel:()=>Ms,DecisionTransformerModel:()=>Q2,DecisionTransformerPreTrainedModel:()=>xp,DeepseekV3ForCausalLM:()=>V2,DeepseekV3Model:()=>$2,DeepseekV3PreTrainedModel:()=>yl,DeiTForImageClassification:()=>Y2,DeiTModel:()=>X2,DeiTPreTrainedModel:()=>bl,DepthAnythingForDepthEstimation:()=>J2,DepthAnythingPreTrainedModel:()=>kp,DepthProForDepthEstimation:()=>K2,DepthProPreTrainedModel:()=>Tp,DetrForObjectDetection:()=>eA,DetrForSegmentation:()=>tA,DetrModel:()=>Z2,DetrObjectDetectionOutput:()=>Ml,DetrPreTrainedModel:()=>Ca,DetrSegmentationOutput:()=>Ep,Dinov2ForImageClassification:()=>sA,Dinov2Model:()=>rA,Dinov2PreTrainedModel:()=>xl,Dinov2WithRegistersForImageClassification:()=>aA,Dinov2W
ithRegistersModel:()=>nA,Dinov2WithRegistersPreTrainedModel:()=>kl,DistilBertForMaskedLM:()=>hA,DistilBertForQuestionAnswering:()=>dA,DistilBertForSequenceClassification:()=>cA,DistilBertForTokenClassification:()=>uA,DistilBertModel:()=>lA,DistilBertPreTrainedModel:()=>xs,DonutSwinModel:()=>fA,DonutSwinPreTrainedModel:()=>Sp,EdgeTamModel:()=>kF,EfficientNetForImageClassification:()=>gA,EfficientNetModel:()=>mA,EfficientNetPreTrainedModel:()=>El,ElectraForMaskedLM:()=>vA,ElectraForQuestionAnswering:()=>MA,ElectraForSequenceClassification:()=>yA,ElectraForTokenClassification:()=>bA,ElectraModel:()=>wA,ElectraPreTrainedModel:()=>ks,Ernie4_5ForCausalLM:()=>kA,Ernie4_5Model:()=>xA,Ernie4_5PretrainedModel:()=>Al,EsmForMaskedLM:()=>EA,EsmForSequenceClassification:()=>AA,EsmForTokenClassification:()=>CA,EsmModel:()=>TA,EsmPreTrainedModel:()=>cn,EuroBertForMaskedLM:()=>PA,EuroBertForSequenceClassification:()=>FA,EuroBertForTokenClassification:()=>LA,EuroBertModel:()=>SA,EuroBertPreTrainedModel:()=>un,ExaoneForCausalLM:()=>OA,ExaoneModel:()=>IA,ExaonePreTrainedModel:()=>Cl,FalconForCausalLM:()=>DA,FalconH1ForCausalLM:()=>BA,FalconH1Model:()=>zA,FalconH1PreTrainedModel:()=>Pl,FalconModel:()=>NA,FalconPreTrainedModel:()=>Sl,FastViTForImageClassification:()=>GA,FastViTModel:()=>RA,FastViTPreTrainedModel:()=>Fl,Florence2ForConditionalGeneration:()=>$A,Florence2PreTrainedModel:()=>Pp,GLPNForDepthEstimation:()=>nC,GLPNModel:()=>sC,GLPNPreTrainedModel:()=>Rl,GPT2LMHeadModel:()=>_C,GPT2Model:()=>fC,GPT2PreTrainedModel:()=>jl,GPTBigCodeForCausalLM:()=>oC,GPTBigCodeModel:()=>aC,GPTBigCodePreTrainedModel:()=>Gl,GPTJForCausalLM:()=>mC,GPTJModel:()=>pC,GPTJPreTrainedModel:()=>ql,GPTNeoForCausalLM:()=>lC,GPTNeoModel:()=>iC,GPTNeoPreTrainedModel:()=>$l,GPTNeoXForCausalLM:()=>uC,GPTNeoXModel:()=>cC,GPTNeoXPreTrainedModel:()=>Vl,Gemma2ForCausalLM:()=>qA,Gemma2Model:()=>jA,Gemma2PreTrainedModel:()=>Il,Gemma3ForCausalLM:()=>XA,Gemma3ForConditionalGeneration:()=>Ip,Gemma3Model:()=>QA,Gemma3PreTr
ainedModel:()=>Lp,Gemma3nForCausalLM:()=>YA,Gemma3nForConditionalGeneration:()=>Sa,Gemma3nPreTrainedModel:()=>Op,Gemma4ForCausalLM:()=>JA,Gemma4ForConditionalGeneration:()=>Ol,GemmaForCausalLM:()=>UA,GemmaModel:()=>VA,GemmaPreTrainedModel:()=>Ll,GlmForCausalLM:()=>ZA,GlmModel:()=>KA,GlmMoeDsaForCausalLM:()=>tC,GlmMoeDsaModel:()=>eC,GlmMoeDsaPreTrainedModel:()=>Dl,GlmOcrForConditionalGeneration:()=>rC,GlmPreTrainedModel:()=>Nl,GptOssForCausalLM:()=>hC,GptOssModel:()=>dC,GptOssPreTrainedModel:()=>Ul,GraniteForCausalLM:()=>wC,GraniteModel:()=>gC,GraniteMoeHybridForCausalLM:()=>yC,GraniteMoeHybridModel:()=>vC,GraniteMoeHybridPreTrainedModel:()=>Hl,GranitePreTrainedModel:()=>Wl,GraniteSpeechForConditionalGeneration:()=>bC,GroundingDinoForObjectDetection:()=>MC,GroundingDinoPreTrainedModel:()=>Rp,GroupViTModel:()=>xC,GroupViTPreTrainedModel:()=>Gp,HeliumForCausalLM:()=>TC,HeliumModel:()=>kC,HeliumPreTrainedModel:()=>Xl,HieraForImageClassification:()=>AC,HieraModel:()=>EC,HieraPreTrainedModel:()=>Yl,HubertForCTC:()=>OC,HubertForSequenceClassification:()=>NC,HubertModel:()=>IC,HubertPreTrainedModel:()=>LC,HunYuanDenseV1ForCausalLM:()=>zC,HunYuanDenseV1Model:()=>DC,HunYuanDenseV1PreTrainedModel:()=>Jl,IJepaForImageClassification:()=>RC,IJepaModel:()=>BC,IJepaPreTrainedModel:()=>Kl,Idefics3ForConditionalGeneration:()=>$p,JAISLMHeadModel:()=>$C,JAISModel:()=>GC,JAISPreTrainedModel:()=>Zl,JinaCLIPModel:()=>VC,JinaCLIPPreTrainedModel:()=>Pa,JinaCLIPTextModel:()=>Vp,JinaCLIPVisionModel:()=>UC,Lfm2ForCausalLM:()=>qC,Lfm2Model:()=>jC,Lfm2MoeForCausalLM:()=>QC,Lfm2MoeModel:()=>HC,Lfm2MoePreTrainedModel:()=>tc,Lfm2PreTrainedModel:()=>ec,Lfm2VlForConditionalGeneration:()=>XC,LightOnOcrForConditionalGeneration:()=>WC,LiteWhisperForConditionalGeneration:()=>qL,Llama4ForCausalLM:()=>KC,Llama4PreTrainedModel:()=>Up,LlamaForCausalLM:()=>JC,LlamaModel:()=>YC,LlamaPreTrainedModel:()=>rc,LlavaForConditionalGeneration:()=>pr,LlavaOnevisionForConditionalGeneration:()=>pr,LlavaPreTrainedModel:()
=>Fp,LlavaQwen2ForCausalLM:()=>HA,LongT5ForConditionalGeneration:()=>eS,LongT5Model:()=>ZC,LongT5PreTrainedModel:()=>sc,M2M100ForConditionalGeneration:()=>rS,M2M100Model:()=>tS,M2M100PreTrainedModel:()=>nc,MBartForCausalLM:()=>uS,MBartForConditionalGeneration:()=>lS,MBartForSequenceClassification:()=>cS,MBartModel:()=>iS,MBartPreTrainedModel:()=>dn,MPNetForMaskedLM:()=>YS,MPNetForQuestionAnswering:()=>ZS,MPNetForSequenceClassification:()=>JS,MPNetForTokenClassification:()=>KS,MPNetModel:()=>XS,MPNetPreTrainedModel:()=>Ts,MT5ForConditionalGeneration:()=>sP,MT5Model:()=>rP,MT5PreTrainedModel:()=>pc,MarianMTModel:()=>nS,MarianModel:()=>sS,MarianPreTrainedModel:()=>ac,MaskFormerForInstanceSegmentation:()=>oS,MaskFormerModel:()=>aS,MaskFormerPreTrainedModel:()=>oc,Metric3DForDepthEstimation:()=>dS,Metric3DPreTrainedModel:()=>jp,Metric3Dv2ForDepthEstimation:()=>hS,Metric3Dv2PreTrainedModel:()=>qp,MgpstrForSceneTextRecognition:()=>fS,MgpstrModelOutput:()=>Wp,MgpstrPreTrainedModel:()=>Hp,MimiDecoderModel:()=>Jp,MimiDecoderOutput:()=>Xp,MimiEncoderModel:()=>Yp,MimiEncoderOutput:()=>Qp,MimiModel:()=>_S,MimiPreTrainedModel:()=>Fa,Mistral4ForCausalLM:()=>wS,Mistral4Model:()=>gS,Mistral4PreTrainedModel:()=>lc,MistralForCausalLM:()=>mS,MistralModel:()=>pS,MistralPreTrainedModel:()=>ic,MobileBertForMaskedLM:()=>yS,MobileBertForQuestionAnswering:()=>MS,MobileBertForSequenceClassification:()=>bS,MobileBertModel:()=>vS,MobileBertPreTrainedModel:()=>hn,MobileLLMForCausalLM:()=>kS,MobileLLMModel:()=>xS,MobileLLMPreTrainedModel:()=>cc,MobileNetV1ForImageClassification:()=>ES,MobileNetV1ForSemanticSegmentation:()=>AS,MobileNetV1Model:()=>TS,MobileNetV1PreTrainedModel:()=>La,MobileNetV2ForImageClassification:()=>SS,MobileNetV2ForSemanticSegmentation:()=>PS,MobileNetV2Model:()=>CS,MobileNetV2PreTrainedModel:()=>Ia,MobileNetV3ForImageClassification:()=>LS,MobileNetV3ForSemanticSegmentation:()=>IS,MobileNetV3Model:()=>FS,MobileNetV3PreTrainedModel:()=>Oa,MobileNetV4ForImageClassification:()=
>NS,MobileNetV4ForSemanticSegmentation:()=>DS,MobileNetV4Model:()=>OS,MobileNetV4PreTrainedModel:()=>Na,MobileViTForImageClassification:()=>BS,MobileViTModel:()=>zS,MobileViTPreTrainedModel:()=>uc,MobileViTV2ForImageClassification:()=>GS,MobileViTV2Model:()=>RS,MobileViTV2PreTrainedModel:()=>dc,ModernBertDecoderForCausalLM:()=>WS,ModernBertDecoderModel:()=>qS,ModernBertDecoderPreTrainedModel:()=>hc,ModernBertForMaskedLM:()=>VS,ModernBertForSequenceClassification:()=>US,ModernBertForTokenClassification:()=>jS,ModernBertModel:()=>$S,ModernBertPreTrainedModel:()=>fn,Moondream1ForConditionalGeneration:()=>WA,MoonshineForConditionalGeneration:()=>QS,MoonshineModel:()=>HS,MoonshinePreTrainedModel:()=>fc,MptForCausalLM:()=>tP,MptModel:()=>eP,MptPreTrainedModel:()=>_c,MultiModalityCausalLM:()=>nP,MultiModalityPreTrainedModel:()=>Kp,MusicgenForCausalLM:()=>oP,MusicgenForConditionalGeneration:()=>Zp,MusicgenModel:()=>aP,MusicgenPreTrainedModel:()=>mc,NanoChatForCausalLM:()=>lP,NanoChatModel:()=>iP,NanoChatPreTrainedModel:()=>gc,NemotronHForCausalLM:()=>uP,NemotronHModel:()=>cP,NemotronHPreTrainedModel:()=>wc,NeoBertForMaskedLM:()=>hP,NeoBertForQuestionAnswering:()=>pP,NeoBertForSequenceClassification:()=>fP,NeoBertForTokenClassification:()=>_P,NeoBertModel:()=>dP,NeoBertPreTrainedModel:()=>Es,NomicBertModel:()=>mP,NomicBertPreTrainedModel:()=>em,OPTForCausalLM:()=>CP,OPTModel:()=>AP,OPTPreTrainedModel:()=>kc,Olmo2ForCausalLM:()=>yP,Olmo2Model:()=>vP,Olmo2PreTrainedModel:()=>yc,Olmo3ForCausalLM:()=>MP,Olmo3Model:()=>bP,Olmo3PreTrainedModel:()=>bc,OlmoForCausalLM:()=>wP,OlmoHybridForCausalLM:()=>kP,OlmoHybridModel:()=>xP,OlmoHybridPreTrainedModel:()=>Mc,OlmoModel:()=>gP,OlmoPreTrainedModel:()=>vc,OpenELMForCausalLM:()=>EP,OpenELMModel:()=>TP,OpenELMPreTrainedModel:()=>xc,OwlViTForObjectDetection:()=>LP,OwlViTModel:()=>FP,OwlViTPreTrainedModel:()=>Ec,Owlv2ForObjectDetection:()=>PP,Owlv2Model:()=>SP,Owlv2PreTrainedModel:()=>Tc,PaliGemmaForConditionalGeneration:()=>IP,ParakeetForC
TC:()=>OP,ParakeetPreTrainedModel:()=>tm,PatchTSMixerForPrediction:()=>DP,PatchTSMixerModel:()=>NP,PatchTSMixerPreTrainedModel:()=>Ac,PatchTSTForPrediction:()=>BP,PatchTSTModel:()=>zP,PatchTSTPreTrainedModel:()=>Cc,Phi3ForCausalLM:()=>VP,Phi3Model:()=>$P,Phi3PreTrainedModel:()=>Pc,Phi3VForCausalLM:()=>sm,Phi3VPreTrainedModel:()=>rm,PhiForCausalLM:()=>GP,PhiModel:()=>RP,PhiPreTrainedModel:()=>Sc,PreTrainedModel:()=>P,PvtForImageClassification:()=>jP,PvtModel:()=>UP,PvtPreTrainedModel:()=>Fc,PyAnnoteForAudioFrameClassification:()=>WP,PyAnnoteModel:()=>qP,PyAnnotePreTrainedModel:()=>Lc,Qwen2ForCausalLM:()=>QP,Qwen2Model:()=>HP,Qwen2MoeForCausalLM:()=>YP,Qwen2MoeModel:()=>XP,Qwen2MoePreTrainedModel:()=>Oc,Qwen2PreTrainedModel:()=>Ic,Qwen2VLForCausalLM:()=>Dp,Qwen2VLForConditionalGeneration:()=>zl,Qwen2VLPreTrainedModel:()=>Np,Qwen2_5_VLForCausalLM:()=>zp,Qwen2_5_VLForConditionalGeneration:()=>Bl,Qwen3ForCausalLM:()=>KP,Qwen3Model:()=>JP,Qwen3MoeForCausalLM:()=>eF,Qwen3MoeModel:()=>ZP,Qwen3MoePreTrainedModel:()=>Dc,Qwen3NextForCausalLM:()=>rF,Qwen3NextModel:()=>tF,Qwen3NextPreTrainedModel:()=>zc,Qwen3PreTrainedModel:()=>Nc,Qwen3VLForCausalLM:()=>nm,Qwen3VLForConditionalGeneration:()=>Bc,Qwen3VLMoeForCausalLM:()=>nF,Qwen3VLMoeForConditionalGeneration:()=>sF,Qwen3_5ForCausalLM:()=>am,Qwen3_5ForConditionalGeneration:()=>Rc,Qwen3_5MoeForCausalLM:()=>oF,Qwen3_5MoeForConditionalGeneration:()=>aF,RFDetrForObjectDetection:()=>uF,RFDetrModel:()=>cF,RFDetrObjectDetectionOutput:()=>om,RFDetrPreTrainedModel:()=>$c,RTDetrForObjectDetection:()=>L2,RTDetrModel:()=>F2,RTDetrObjectDetectionOutput:()=>ln,RTDetrPreTrainedModel:()=>wl,RTDetrV2ForObjectDetection:()=>MF,RTDetrV2Model:()=>bF,RTDetrV2ObjectDetectionOutput:()=>im,RTDetrV2PreTrainedModel:()=>Vc,ResNetForImageClassification:()=>lF,ResNetModel:()=>iF,ResNetPreTrainedModel:()=>Gc,RoFormerForMaskedLM:()=>gF,RoFormerForQuestionAnswering:()=>yF,RoFormerForSequenceClassification:()=>wF,RoFormerForTokenClassification:()=>vF,RoFormerModel
:()=>mF,RoFormerPreTrainedModel:()=>Cs,RobertaForMaskedLM:()=>hF,RobertaForQuestionAnswering:()=>pF,RobertaForSequenceClassification:()=>fF,RobertaForTokenClassification:()=>_F,RobertaModel:()=>dF,RobertaPreTrainedModel:()=>As,Sam2ImageSegmentationOutput:()=>um,Sam2Model:()=>Uc,Sam2PreTrainedModel:()=>dm,Sam3TrackerModel:()=>TF,SamImageSegmentationOutput:()=>lm,SamModel:()=>xF,SamPreTrainedModel:()=>cm,SapiensForDepthEstimation:()=>AF,SapiensForNormalEstimation:()=>CF,SapiensForSemanticSegmentation:()=>EF,SapiensPreTrainedModel:()=>Da,SegformerForImageClassification:()=>PF,SegformerForSemanticSegmentation:()=>FF,SegformerModel:()=>SF,SegformerPreTrainedModel:()=>za,SiglipModel:()=>LF,SiglipPreTrainedModel:()=>jc,SiglipTextModel:()=>hm,SiglipVisionModel:()=>IF,SmolLM3ForCausalLM:()=>NF,SmolLM3Model:()=>OF,SmolLM3PreTrainedModel:()=>qc,SmolVLMForConditionalGeneration:()=>DF,SnacDecoderModel:()=>_m,SnacEncoderModel:()=>fm,SnacModel:()=>zF,SnacPreTrainedModel:()=>Ba,SolarOpenForCausalLM:()=>RF,SolarOpenModel:()=>BF,SolarOpenPreTrainedModel:()=>Wc,SpeechT5ForSpeechToText:()=>$F,SpeechT5ForTextToSpeech:()=>VF,SpeechT5HifiGan:()=>UF,SpeechT5Model:()=>GF,SpeechT5PreTrainedModel:()=>Ra,SqueezeBertForMaskedLM:()=>qF,SqueezeBertForQuestionAnswering:()=>HF,SqueezeBertForSequenceClassification:()=>WF,SqueezeBertModel:()=>jF,SqueezeBertPreTrainedModel:()=>_n,StableLmForCausalLM:()=>XF,StableLmModel:()=>QF,StableLmPreTrainedModel:()=>Hc,Starcoder2ForCausalLM:()=>JF,Starcoder2Model:()=>YF,Starcoder2PreTrainedModel:()=>Qc,StyleTextToSpeech2Model:()=>KF,StyleTextToSpeech2PreTrainedModel:()=>pm,SupertonicForConditionalGeneration:()=>gm,SupertonicPreTrainedModel:()=>mm,Swin2SRForImageSuperResolution:()=>sL,Swin2SRModel:()=>rL,Swin2SRPreTrainedModel:()=>Xc,SwinForImageClassification:()=>eL,SwinForSemanticSegmentation:()=>tL,SwinModel:()=>ZF,SwinPreTrainedModel:()=>Ga,T5ForConditionalGeneration:()=>aL,T5Model:()=>nL,T5PreTrainedModel:()=>Yc,TableTransformerForObjectDetection:()=>iL,Table
TransformerModel:()=>oL,TableTransformerObjectDetectionOutput:()=>wm,TableTransformerPreTrainedModel:()=>Jc,TrOCRForCausalLM:()=>lL,TrOCRPreTrainedModel:()=>vm,UltravoxModel:()=>Ql,UltravoxPreTrainedModel:()=>Bp,UniSpeechForCTC:()=>uL,UniSpeechForSequenceClassification:()=>dL,UniSpeechModel:()=>cL,UniSpeechPreTrainedModel:()=>$a,UniSpeechSatForAudioFrameClassification:()=>pL,UniSpeechSatForCTC:()=>fL,UniSpeechSatForSequenceClassification:()=>_L,UniSpeechSatModel:()=>hL,UniSpeechSatPreTrainedModel:()=>pn,VaultGemmaForCausalLM:()=>gL,VaultGemmaModel:()=>mL,VaultGemmaPreTrainedModel:()=>Kc,ViTForImageClassification:()=>yL,ViTMAEModel:()=>bL,ViTMAEPreTrainedModel:()=>ym,ViTMSNForImageClassification:()=>xL,ViTMSNModel:()=>ML,ViTMSNPreTrainedModel:()=>eu,ViTModel:()=>vL,ViTPreTrainedModel:()=>Zc,VisionEncoderDecoderModel:()=>wL,VitMatteForImageMatting:()=>kL,VitMattePreTrainedModel:()=>bm,VitPoseForPoseEstimation:()=>TL,VitPosePreTrainedModel:()=>Mm,VitsModel:()=>EL,VitsModelOutput:()=>xm,VitsPreTrainedModel:()=>km,VoxtralForConditionalGeneration:()=>AL,VoxtralRealtimeForConditionalGeneration:()=>Am,VoxtralRealtimePreTrainedModel:()=>Em,Wav2Vec2BertForCTC:()=>NL,Wav2Vec2BertForSequenceClassification:()=>DL,Wav2Vec2BertModel:()=>OL,Wav2Vec2BertPreTrainedModel:()=>Va,Wav2Vec2ForAudioFrameClassification:()=>FC,Wav2Vec2ForCTC:()=>SC,Wav2Vec2ForSequenceClassification:()=>PC,Wav2Vec2Model:()=>CC,Wav2Vec2PreTrainedModel:()=>Ir,WavLMForAudioFrameClassification:()=>$L,WavLMForCTC:()=>BL,WavLMForSequenceClassification:()=>RL,WavLMForXVector:()=>GL,WavLMModel:()=>zL,WavLMPreTrainedModel:()=>Ss,WeSpeakerResNetModel:()=>VL,WeSpeakerResNetPreTrainedModel:()=>Sm,WhisperForConditionalGeneration:()=>Pm,WhisperModel:()=>jL,WhisperPreTrainedModel:()=>ru,XLMForQuestionAnswering:()=>YL,XLMForSequenceClassification:()=>QL,XLMForTokenClassification:()=>XL,XLMModel:()=>WL,XLMPreTrainedModel:()=>Ps,XLMRobertaForMaskedLM:()=>KL,XLMRobertaForQuestionAnswering:()=>tI,XLMRobertaForSequenceClassificat
ion:()=>ZL,XLMRobertaForTokenClassification:()=>eI,XLMRobertaModel:()=>JL,XLMRobertaPreTrainedModel:()=>Fs,XLMWithLMHeadModel:()=>HL,XVectorOutput:()=>Cm,YolosForObjectDetection:()=>sI,YolosModel:()=>rI,YolosObjectDetectionOutput:()=>Fm,YolosPreTrainedModel:()=>su,YoutuForCausalLM:()=>aI,YoutuModel:()=>nI,YoutuPreTrainedModel:()=>nu});var on=class extends P{},TE=class extends on{},EE=class extends on{async _call(e){return new ie(await super._call(e))}},AE=class extends on{async _call(e){return new ht(await super._call(e))}},CE=class extends on{async _call(e){return new Je(await super._call(e))}},sl=class extends P{},SE=class extends sl{},PE=class extends sl{},nl=class extends P{},FE=class extends nl{},LE=class extends nl{},al=class extends P{},IE=class extends al{},OE=class extends al{},ol=class extends P{},NE=class extends ol{},DE=class extends ol{},Ta=class extends P{},zE=class extends Ta{},BE=class extends Ta{},RE=class extends Ta{async _call(e){return new ie(await super._call(e))}},il=class extends P{},GE=class extends il{},$E=class extends il{async _call(e){return new ie(await super._call(e))}},ws=class extends P{},VE=class extends ws{},UE=class extends ws{async _call(e){return new Je(await super._call(e))}},jE=class extends ws{async _call(e){return new ie(await super._call(e))}},qE=class extends ws{async _call(e){return new He(await super._call(e))}},WE=class extends ws{async _call(e){return new ht(await super._call(e))}},ll=class extends P{},HE=class extends ll{},QE=class extends ll{},cl=class extends P{},XE=class extends cl{},YE=class extends cl{},ul=class extends P{},JE=class extends ul{},KE=class extends ul{},vs=class extends P{},ZE=class extends vs{},e2=class extends vs{async _call(e){return new Je(await super._call(e))}},t2=class extends vs{async _call(e){return new ie(await super._call(e))}},r2=class extends vs{async _call(e){return new He(await super._call(e))}},s2=class extends vs{async _call(e){return new ht(await 
super._call(e))}},/* n2 and dp are BigInt token-id constants. generate() builds a [rows, 3] tensor from n2 via ct(...) and joins it after the generated tokens; prepare_inputs_for_generation() compares token ids against dp when deriving position_ids. */n2=4299n,dp=6561n,/* Base class: registers the forward() parameter names, the main input name, and the extra keys requested back from generate(). */hp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","inputs_embeds","attention_mask","position_ids","audio_values","exaggeration","audio_features","audio_tokens","speaker_embeddings","speaker_features","past_key_values"]);k(this,"main_input_name","input_ids");k(this,"_return_dict_in_generate_keys",["audio_tokens","speaker_embeddings","speaker_features"])}},fp=class extends hp{/* Runs the speech_encoder session on the given audio values. */async encode_speech(e){return xe(this.sessions.speech_encoder,{audio_values:e})}/* forward(): when inputs_embeds is missing, calls the embed_tokens session with input_ids, adding `exaggeration` and `position_ids` only if that session lists them among its input names. A non-tensor exaggeration is normalised using the first input_ids dim: null/undefined becomes ct([n], .5), a number becomes ct([n], value), an array becomes a float32 tensor, anything else throws. If all four speech-conditioning tensors are supplied, or audio_values is given (encode_speech then runs), the conditioning audio_features are placed in front of the token embeddings with ze(..., 1) and the attention mask is rebuilt with yt over the first two dims of the result (ze / yt are presumably concat / ones helpers - confirm). Otherwise exactly one new token together with past_key_values is required, else it throws. The conditioning object is spread into the returned result. */async forward({input_ids:e=null,attention_mask:t=null,audio_values:r=null,exaggeration:s=null,position_ids:n=null,inputs_embeds:a=null,past_key_values:o=null,generation_config:i=null,logits_processor:l=null,audio_features:c=null,audio_tokens:d=null,speaker_embeddings:h=null,speaker_features:_=null,...p}){let w;if(!a){const y=this.sessions.embed_tokens.inputNames,M={input_ids:e};if(y.includes("exaggeration")){if(!(s instanceof U)){const T=e.dims[0];if(s==null)s=ct([T],.5);else if(typeof s=="number")s=ct([T],s);else if(Array.isArray(s))s=new U("float32",s,[T]);else throw new Error("Unsupported type for `exaggeration` input")}M.exaggeration=s}if(y.includes("position_ids")&&(M.position_ids=n),{inputs_embeds:a}=await xe(this.sessions.embed_tokens,M),c&&d&&h&&_&&(w={audio_features:c,audio_tokens:d,speaker_embeddings:h,speaker_features:_}),w||r)w??(w=await this.encode_speech(r)),a=ze([w.audio_features,a],1),t=yt([a.dims[0],a.dims[1]]);else{const T=a.dims[1];if(!o||T!==1)throw new Error("Incorrect state encountered during generation.");const A=o.get_seq_length();t=yt([a.dims[0],A+T])}}return{...await _r(this,{inputs_embeds:a,past_key_values:o,attention_mask:t,generation_config:i,logits_processor:l},!1),...w}}/* Fills in t.position_ids only when the caller has not set them and the embed_tokens session accepts a position_ids input; e is indexed per row (e[a]) and each row is searched with findLastIndex. */prepare_inputs_for_generation(e,t,r){if(!t.position_ids&&this.sessions.embed_tokens.inputNames.includes("position_ids"))if(t.input_ids.dims[1]===1){const 
s=Array.from({length:e.length},(n,a)=>e[a].length-e[a].findLastIndex(o=>o==dp)-1);/* single-token step: each row's position is the count of ids that follow its last dp id */t.position_ids=new U("int64",s,[e.length,1])}else{/* multi-token step: ids >= dp get position 0 and do not advance the counter; every other id receives the running count */const n=t.input_ids.tolist().map(a=>{let o=0;return a.map(i=>i>=dp?0:o++)});t.position_ids=new U("int64",n.flat(),t.input_ids.dims)}/* on single-token steps the speech-conditioning inputs are removed from t before delegating to an(this, e, t) */return t.input_ids.dims[1]===1&&(delete t.audio_values,delete t.audio_features,delete t.audio_tokens,delete t.speaker_embeddings,delete t.speaker_features),an(this,e,t)}/* generate(): calls the inherited generate() with return_dict_in_generate forced to true, slices `sequences` on its second dim with [input_ids.dims[1], -1], joins [audio_tokens, that slice, a ct([rows, 3], n2) tensor] with ze(..., 1) (ct / ze are presumably fill / concat helpers - confirm), and returns the `waveform` output of the conditional_decoder session. */async generate(e){const{sequences:t,audio_tokens:r,speaker_embeddings:s,speaker_features:n}=await super.generate({...e,return_dict_in_generate:!0}),a=t.slice(null,[e.input_ids.dims[1],-1]),o=ct([a.dims[0],3],n2),i=ze([r,a,o],1),{waveform:l}=await xe(this.sessions.conditional_decoder,{speech_tokens:i,speaker_features:n,speaker_embeddings:s});return l}},_p=class extends P{},a2=class extends _p{},pp=class extends P{},o2=class extends pp{},Ea=class extends P{},i2=class extends Ea{},/* The from_pretrained overrides below only supply a default model_file_name; a caller-provided t.model_file_name wins because of the ?? fallback. */mp=class extends Ea{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},gp=class extends Ea{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"audio_model"})}},Yr=class extends P{},l2=class extends Yr{},c2=class extends Yr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},wp=class extends Yr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},u2=class extends Yr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"vision_model"})}},d2=class extends Yr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"vision_model"})}},dl=class extends P{},h2=class extends dl{},f2=class extends dl{},hl=class extends P{},_2=class extends hl{},p2=class extends hl{},/* fl is exported as CoherePreTrainedModel in the export map */fl=class extends P{},m2=class 
extends fl{},g2=class extends fl{},_l=class extends P{},w2=class extends _l{},v2=class extends _l{},pl=class extends P{constructor(){super(...arguments);k(this,"requires_attention_mask",!1);k(this,"main_input_name","input_features");k(this,"forward_params",["input_features","decoder_input_ids","decoder_attention_mask","past_key_values"])}},y2=class extends pl{},b2=class extends pl{},ys=class extends P{},M2=class extends ys{},x2=class extends ys{async _call(e){return new Je(await super._call(e))}},k2=class extends ys{async _call(e){return new ie(await super._call(e))}},T2=class extends ys{async _call(e){return new He(await super._call(e))}},E2=class extends ys{async _call(e){return new ht(await super._call(e))}},ml=class extends P{},A2=class extends ml{},C2=class extends ml{async _call(e){return new ie(await super._call(e))}},gl=class extends P{},S2=class extends gl{},P2=class extends gl{async _call(e){return new ie(await super._call(e))}},wl=class extends P{},F2=class extends wl{},L2=class extends wl{async _call(e){return new ln(await super._call(e))}},ln=class extends Ye{constructor({logits:e,pred_boxes:t}){super(),this.logits=e,this.pred_boxes=t}},vl=class extends P{},I2=class extends vl{},O2=class extends vl{async _call(e){return new ln(await super._call(e))}},vp=class extends Ye{constructor({audio_codes:e}){super(),this.audio_codes=e}},yp=class extends Ye{constructor({audio_values:e}){super(),this.audio_values=e}},Aa=class extends P{constructor(){super(...arguments);k(this,"main_input_name","input_values");k(this,"forward_params",["input_values"])}},N2=class extends Aa{async encode(e){return new vp(await xe(this.sessions.encoder_model,e))}async decode(e){return new yp(await xe(this.sessions.decoder_model,e))}},bp=class extends Aa{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"encoder_model"})}},Mp=class extends Aa{static async from_pretrained(e,t={}){return 
super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"decoder_model"})}},bs=class extends P{},D2=class extends bs{},z2=class extends bs{async _call(e){return new Je(await super._call(e))}},B2=class extends bs{async _call(e){return new ie(await super._call(e))}},R2=class extends bs{async _call(e){return new He(await super._call(e))}},G2=class extends bs{async _call(e){return new ht(await super._call(e))}},yl=class extends P{},$2=class extends yl{},V2=class extends yl{},Ms=class extends P{},U2=class extends Ms{},j2=class extends Ms{async _call(e){return new Je(await super._call(e))}},q2=class extends Ms{async _call(e){return new ie(await super._call(e))}},W2=class extends Ms{async _call(e){return new He(await super._call(e))}},H2=class extends Ms{async _call(e){return new ht(await super._call(e))}},xp=class extends P{},Q2=class extends xp{},bl=class extends P{},X2=class extends bl{},Y2=class extends bl{async _call(e){return new ie(await super._call(e))}},kp=class extends P{},J2=class extends kp{},Tp=class extends P{},K2=class extends Tp{},Ca=class extends P{},Z2=class extends Ca{},eA=class extends Ca{async _call(e){return new Ml(await super._call(e))}},tA=class extends Ca{async _call(e){return new Ep(await super._call(e))}},Ml=class extends Ye{constructor({logits:e,pred_boxes:t}){super(),this.logits=e,this.pred_boxes=t}},Ep=class extends Ye{constructor({logits:e,pred_boxes:t,pred_masks:r}){super(),this.logits=e,this.pred_boxes=t,this.pred_masks=r}},xl=class extends P{},rA=class extends xl{},sA=class extends xl{async _call(e){return new ie(await super._call(e))}},kl=class extends P{},nA=class extends kl{},aA=class extends kl{async _call(e){return new ie(await super._call(e))}},Ap=class extends P{},oA=class extends Ap{},Cp=class extends P{},iA=class extends Cp{},xs=class extends P{},lA=class extends xs{},cA=class extends xs{async _call(e){return new ie(await super._call(e))}},uA=class extends xs{async _call(e){return new He(await super._call(e))}},dA=class 
extends xs{async _call(e){return new ht(await super._call(e))}},hA=class extends xs{async _call(e){return new Je(await super._call(e))}},Sp=class extends P{},fA=class extends Sp{},Tl=class extends P{},_A=class extends Tl{},pA=class extends Tl{},El=class extends P{},mA=class extends El{},gA=class extends El{async _call(e){return new ie(await super._call(e))}},ks=class extends P{},wA=class extends ks{},vA=class extends ks{async _call(e){return new Je(await super._call(e))}},yA=class extends ks{async _call(e){return new ie(await super._call(e))}},bA=class extends ks{async _call(e){return new He(await super._call(e))}},MA=class extends ks{async _call(e){return new ht(await super._call(e))}},Al=class extends P{},xA=class extends Al{},kA=class extends Al{},cn=class extends P{},TA=class extends cn{},EA=class extends cn{async _call(e){return new Je(await super._call(e))}},AA=class extends cn{async _call(e){return new ie(await super._call(e))}},CA=class extends cn{async _call(e){return new He(await super._call(e))}},un=class extends P{},SA=class extends un{},PA=class extends un{async _call(e){return new Je(await super._call(e))}},FA=class extends un{async _call(e){return new ie(await super._call(e))}},LA=class extends un{async _call(e){return new He(await super._call(e))}},Cl=class extends P{},IA=class extends Cl{},OA=class extends Cl{},Sl=class extends P{},NA=class extends Sl{},DA=class extends Sl{},Pl=class extends P{},zA=class extends Pl{},BA=class extends Pl{},Fl=class extends P{},RA=class extends Fl{},GA=class extends Fl{async _call(e){return new ie(await super._call(e))}},Pp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","inputs_embeds","attention_mask","pixel_values","encoder_outputs","decoder_input_ids","decoder_inputs_embeds","decoder_attention_mask","past_key_values"]);k(this,"main_input_name","inputs_embeds")}},$A=class extends 
Pp{/* This class is exported as Florence2ForConditionalGeneration. The merge puts image features in front of the text embeddings via ze(..., 1) and prefixes the attention mask with yt(first two image-feature dims) (ze / yt are presumably concat / ones helpers - confirm). The input_ids argument is destructured but not used here. */_merge_input_ids_with_image_features({inputs_embeds:e,image_features:t,input_ids:r,attention_mask:s}){return{inputs_embeds:ze([t,e],1),attention_mask:ze([yt(t.dims.slice(0,2)),s],1)}}/* Encodes whichever of input_ids / pixel_values is present (throws when neither is given). With both, the two embeddings are merged and the mask is replaced by the merged mask; with one, that single embedding is used and the mask is passed through unchanged. */async _prepare_inputs_embeds({input_ids:e,pixel_values:t,inputs_embeds:r,attention_mask:s}){if(!e&&!t)throw new Error("Either `input_ids` or `pixel_values` should be provided.");let n,a;return e&&(n=await this.encode_text({input_ids:e})),t&&(a=await this.encode_image({pixel_values:t})),n&&a?{inputs_embeds:r,attention_mask:s}=this._merge_input_ids_with_image_features({inputs_embeds:n,image_features:a,input_ids:e,attention_mask:s}):r=n||a,{inputs_embeds:r,attention_mask:s}}/* forward(): computes inputs_embeds (and the matching mask) only when not supplied; calls Lr(...) for last_hidden_state unless encoder_outputs is given; embeds decoder_input_ids when decoder_inputs_embeds is missing (throws if both are missing); then calls _r(...) with the decoder embeddings, decoder mask, encoder mask and encoder hidden states. Lr / _r are presumably the shared encoder / decoder forward helpers - confirm. */async forward({input_ids:e,pixel_values:t,attention_mask:r,decoder_input_ids:s,decoder_attention_mask:n,encoder_outputs:a,past_key_values:o,inputs_embeds:i,decoder_inputs_embeds:l}){if(i||({inputs_embeds:i,attention_mask:r}=await this._prepare_inputs_embeds({input_ids:e,pixel_values:t,inputs_embeds:i,attention_mask:r})),!a){let{last_hidden_state:d}=await Lr(this,{inputs_embeds:i,attention_mask:r});a=d}if(!l){if(!s)throw new Error("Either `decoder_input_ids` or `decoder_inputs_embeds` should be provided.");l=await this.encode_text({input_ids:s})}return await _r(this,{inputs_embeds:l,attention_mask:n,encoder_attention_mask:r,encoder_hidden_states:a,past_key_values:o},!0)}},Ll=class extends P{},VA=class extends Ll{},UA=class extends Ll{},Il=class extends P{},jA=class extends Il{},qA=class extends Il{},/* Fp is exported as LlavaPreTrainedModel */Fp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","pixel_values","position_ids","past_key_values"])}},/* pr is exported as both LlavaForConditionalGeneration and LlavaOnevisionForConditionalGeneration. It reshapes image_features with view(-1, lastDim) and hands them to tl(...) together with the configured image token id (config.image_token_index is preferred, config.image_token_id is the fallback). */pr=class extends Fp{_merge_input_ids_with_image_features(e){const t=e.image_features.dims.at(-1),r=e.image_features.view(-1,t);return tl({image_token_id:this.config.image_token_index??this.config.image_token_id,...e,image_features:r})}},WA=class extends pr{},HA=class extends pr{},Lp=class extends P{},QA=class extends Lp{},Ip=class extends pr{},XA=class extends 
Ip{},/* Op is exported as Gemma3nPreTrainedModel; it only registers the forward() parameter names. */Op=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","inputs_embeds","per_layer_inputs","position_ids","pixel_values","input_features","input_features_mask","past_key_values"])}},/* Sa is exported as Gemma3nForConditionalGeneration. */Sa=class extends Op{/* forward(): when inputs_embeds or per_layer_inputs is missing, both are recomputed from input_ids through the embed_tokens session. Only if that happened and the input holds more than one token (dims[1] !== 1) are image features (from _encode_vision, which also receives the extra ...h inputs) and audio features (from the audio_encoder session) merged into the embeddings and attention mask. The result of _r(...) is returned. */async forward({input_ids:e=null,attention_mask:t=null,pixel_values:r=null,input_features:s=null,input_features_mask:n=null,position_ids:a=null,inputs_embeds:o=null,per_layer_inputs:i=null,past_key_values:l=null,generation_config:c=null,logits_processor:d=null,...h}){if((!o||!i)&&({inputs_embeds:o,per_layer_inputs:i}=await xe(this.sessions.embed_tokens,{input_ids:e}),e.dims[1]!==1)){if(r){const{image_features:p}=await this._encode_vision({pixel_values:r,...h});({inputs_embeds:o,attention_mask:t}=this._merge_input_ids_with_image_features({image_features:p,inputs_embeds:o,input_ids:e,attention_mask:t}))}if(s){const{audio_features:p}=await xe(this.sessions.audio_encoder,{input_features:s,input_features_mask:n});({inputs_embeds:o,attention_mask:t}=this._merge_input_ids_with_audio_features({audio_features:p,inputs_embeds:o,input_ids:e,attention_mask:t}))}}return await _r(this,{inputs_embeds:o,per_layer_inputs:i,past_key_values:l,attention_mask:t,position_ids:a,generation_config:c,logits_processor:d},!0)}/* Passes only pixel_values to the vision_encoder session; any other keys on e are ignored here. */_encode_vision(e){return xe(this.sessions.vision_encoder,{pixel_values:e.pixel_values})}/* Both merge helpers reshape the features with view(-1, lastDim) and delegate to tl(...) / up(...) with the token id read from this.config. */_merge_input_ids_with_image_features(e){const t=e.image_features.dims.at(-1),r=e.image_features.view(-1,t);return tl({image_token_id:this.config.image_token_id,...e,image_features:r})}_merge_input_ids_with_audio_features(e){const t=e.audio_features.dims.at(-1),r=e.audio_features.view(-1,t);return up({audio_token_id:this.config.audio_token_id,...e,audio_features:r})}},YA=class extends Sa{},Ol=class extends 
Sa{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","inputs_embeds","per_layer_inputs","position_ids","pixel_values","image_position_ids","input_features","input_features_mask","past_key_values"])}_encode_vision(t){return xe(this.sessions.vision_encoder,{pixel_values:t.pixel_values,pixel_position_ids:t.image_position_ids})}},JA=class extends Ol{},Nl=class extends P{},KA=class extends Nl{},ZA=class extends Nl{},Dl=class extends P{},eC=class extends Dl{},tC=class extends Dl{},Np=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","position_ids","past_key_values","pixel_values","image_grid_thw"])}},zl=class extends Np{constructor(){super(...arguments);k(this,"image_grid_thw_name","grid_thw")}_get_text_only_rope_index(t,r){if(r){const{data:s,dims:n}=lp(r),a=BigInt64Array.from({length:3*s.length},(i,l)=>s[l%s.length]),o=Array.from({length:n[0]},(i,l)=>je(s.subarray(n[1]*l,n[1]*(l+1)))[0]+1n+BigInt(n[1]));return[new U("int64",a,[3,...n]),new U("int64",o,[o.length,1])]}else{const[s,n]=t.dims,a=BigInt64Array.from({length:3*s*n},(o,i)=>BigInt(Math.floor(i%n/s)));return[new U("int64",a,[3,...t.dims]),Yf([s,1])]}}_reorder_and_write_positions(t,r,s,n){const a=t.reduce((c,d)=>c+d.length,0),o=new Array(a);let i=0;for(let c=0;c<3;++c)for(const d of t){const h=d.length/3;for(let _=c*h;_<(c+1)*h;++_)o[i++]=d[_]}let l=0;for(let c=0;c<r.length;++c)if(r[c]==1){for(let d=0;d<3;++d)s[d][n][c]=o[d*a/3+l];++l}return o}_get_multimodal_rope_positions({filtered_ids:t,image_grid_thw_list:r,video_grid_thw_list:s,spatial_merge_size:n,state:a}){const{image_token_id:o,video_token_id:i,vision_start_token_id:l}=this.config,c=t,h=c.reduce((T,A,C)=>(A==l&&T.push(C),T),[]).map(T=>c[T+1]),_=h.filter(T=>T==o).length,p=h.filter(T=>T==i).length,w=[];let v=0,y=_,M=p;for(let T=0;T<h.length;++T){const 
A=c.findIndex((X,J)=>J>v&&X==o),C=c.findIndex((X,J)=>J>v&&X==i),S=y>0&&A!==-1?A:c.length+1,N=M>0&&C!==-1?C:c.length+1;/* S / N are the next image / video token indices after cursor v, or c.length+1 when none remain; whichever is smaller decides whether the next grid entry is taken from the image list or the video list (advancing the shared state counters). */let x,R,z,$;S<N?([R,z,$]=r[a.image_index],++a.image_index,--y,x=S):([R,z,$]=s[a.video_index],++a.video_index,--M,x=N);/* Q, H, D: the three grid values as numbers, the last two divided by the spatial merge size n and floored. I is the number of ids between the cursor and the vision token; te is the next free position (max of the previous chunk + 1, or 0). */const[Q,H,D]=[Number(R),Math.floor(Number(z)/n),Math.floor(Number($)/n)],I=x-v,te=w.length>0?je(w.at(-1))[0]+1:0;w.push(Array.from({length:3*I},(X,J)=>te+J%I));/* vision chunk: three axis arrays of length Q*H*D, offset by W, stored back to back */const W=I+te,ee=Q*H*D,G=Array.from({length:ee},(X,J)=>W+Math.floor(J/(H*D))),L=Array.from({length:ee},(X,J)=>W+Math.floor(J/D)%H),V=Array.from({length:ee},(X,J)=>W+J%D);w.push([G,L,V].flat()),v=x+ee}/* trailing ids after the last vision chunk get sequential positions repeated for the 3 axes */if(v<c.length){const T=w.length>0?je(w.at(-1))[0]+1:0,A=c.length-v;w.push(Array.from({length:3*A},(C,S)=>T+S%A))}return w}/* get_rope_index(t, r, s, n): t is input_ids, r / s are the image / video grid tensors, n the attention mask. With at least one grid present, each row keeps only ids whose mask entry == 1, positions come from _get_multimodal_rope_positions and are written into a zero-initialised [3][rows][cols] array, and the per-row delta is max position + 1 - row length. A missing mask is replaced by Xf(t) (presumably an all-ones mask - confirm). spatial_merge_size defaults to 2 when the vision config does not set it. Without grids it defers to _get_text_only_rope_index. */get_rope_index(t,r,s,n){const{vision_config:a}=this.config,o=a.spatial_merge_size??2;if(r||s){const i=t.tolist();n||(n=Xf(t));const l=n.tolist(),c=Array.from({length:3},()=>Array.from({length:t.dims[0]},()=>Array.from({length:t.dims[1]},()=>0))),d=r?r.tolist():[],h=s?s.tolist():[],_={image_index:0,video_index:0},p=[];for(let w=0;w<i.length;++w){const v=i[w].filter((T,A)=>l[w][A]==1),y=this._get_multimodal_rope_positions({filtered_ids:v,image_grid_thw_list:d,video_grid_thw_list:h,spatial_merge_size:o,state:_}),M=this._reorder_and_write_positions(y,l[w],c,w);p.push(je(M)[0]+1-i[w].length)}return[new U("int64",c.flat(1/0),[3,t.dims[0],t.dims[1]]),new U("int64",p,[p.length,1])]}else return this._get_text_only_rope_index(t,n)}/* The grid tensor is passed to the vision_encoder session under the input name stored in this.image_grid_thw_name, which subclasses override. */async encode_image({pixel_values:t,image_grid_thw:r}){return(await xe(this.sessions.vision_encoder,{pixel_values:t,[this.image_grid_thw_name]:r})).image_features}_merge_input_ids_with_image_features(t){return tl({image_token_id:this.config.image_token_id,...t})}/* Returns r untouched when there is no attention mask, position_ids are already set, or the decoder session (decoder_model_merged, else model) has no position_ids input. */prepare_inputs_for_generation(t,r,s){if(!r.attention_mask||r.position_ids||!(this.sessions.decoder_model_merged??this.sessions.model).inputNames.includes("position_ids"))return 
r;if(!r.past_key_values)[r.position_ids,r.rope_deltas]=this.get_rope_index(r.input_ids,r.image_grid_thw,r.video_grid_thw,r.attention_mask);else{r.pixel_values=null;const a=r.past_key_values.get_seq_length();if(a<r.input_ids.dims[1]){const[o,i]=this.get_rope_index(r.input_ids,r.image_grid_thw,r.video_grid_thw,r.attention_mask);r.rope_deltas=i,r.position_ids=o.slice(null,null,[a,null]),r.input_ids=r.input_ids.slice(null,[a,null])}else{r.rope_deltas||([,r.rope_deltas]=this.get_rope_index(r.input_ids,r.image_grid_thw,r.video_grid_thw,r.attention_mask));const o=BigInt(a),i=r.rope_deltas.map(l=>o+l);r.position_ids=ur([i,i,i],0)}}return r}},Dp=class extends zl{},Bl=class extends zl{constructor(){super(...arguments);k(this,"image_grid_thw_name","image_grid_thw")}},zp=class extends Dp{constructor(){super(...arguments);k(this,"image_grid_thw_name","image_grid_thw")}},rC=class extends Bl{get_vision_position_ids(e,t,r,s){const n=Math.floor(t[0]/r),a=Math.floor(t[1]/s),o=Math.floor(t[2]/s),i=a*o*n,l=Array.from({length:i},()=>e),c=Array.from({length:i},(h,_)=>e+Math.floor(_/(o*n))),d=Array.from({length:i},(h,_)=>e+_%o);return[...l,...c,...d]}_get_multimodal_rope_positions({filtered_ids:e,image_grid_thw_list:t,video_grid_thw_list:r,spatial_merge_size:s,state:n}){const{image_token_id:a}=this.config,o=[];let i=0,l=e[0]==a?1:0;for(let h=1;h<=e.length;++h){const _=h<e.length?e[h]==a?1:0:-1;_!==l&&(o.push([l,i,h]),i=h,l=_)}let c=0;const d=[];for(const[h,_,p]of o)if(h===0){const w=p-_;d.push(Array.from({length:3*w},(v,y)=>c+y%w)),c+=w}else{const w=t[n.image_index++].map(Number),v=w[0];d.push(this.get_vision_position_ids(c,w,v,s)),c+=Math.max(w[1],w[2])/s}return d}},Rl=class extends P{},sC=class extends Rl{},nC=class extends Rl{},Gl=class extends P{},aC=class extends Gl{},oC=class extends Gl{},$l=class extends P{},iC=class extends $l{},lC=class extends $l{},Vl=class extends P{},cC=class extends Vl{},uC=class extends Vl{},Ul=class extends P{},dC=class extends Ul{},hC=class extends 
/**
 * Model variant that merges audio features into the token sequence.
 */
Ql = class extends Bp {
  /**
   * Flattens `audio_features` to two dimensions (keeping the last one) and
   * forwards everything to `up`.
   *
   * The audio token id defaults to the first defined of `config.ignore_index`,
   * `config.audio_token_id`, `config.audio_token_index`; a value supplied in
   * `e` overrides it because `e` is spread afterwards.
   */
  _merge_input_ids_with_audio_features(e) {
    const featureSize = e.audio_features.dims.at(-1);
    const flattened = e.audio_features.view(-1, featureSize);
    const { config } = this;
    const defaultTokenId =
      config.ignore_index ?? config.audio_token_id ?? config.audio_token_index;
    return up({
      audio_token_id: defaultTokenId,
      ...e,
      audio_features: flattened,
    });
  }
}
/**
 * Dual text/image embedding model whose `forward` accepts either modality alone.
 */
VC = class extends Pa {
  /**
   * Runs the parent `forward`, substituting a placeholder for whichever of
   * `input_ids` / `pixel_values` is missing, and returns only the embeddings
   * of the modalities that were actually provided.
   *
   * Note: the placeholder is written onto the caller's `e` object.
   * NOTE(review): `yt(dims)` / `ct(dims, value)` look like ones / filled
   * tensor factories -- confirm.
   *
   * @throws {Error} When neither `input_ids` nor `pixel_values` is given.
   */
  async forward(e) {
    const textMissing = !e.input_ids;
    const imageMissing = !e.pixel_values;
    if (textMissing && imageMissing) {
      throw new Error("Either `input_ids` or `pixel_values` should be provided.");
    }

    if (textMissing) {
      e.input_ids = yt([e.pixel_values.dims[0], 1]);
    }
    if (imageMissing) {
      const { image_size: size } = this.config.vision_config;
      // Zero-length batch of images.
      e.pixel_values = ct([0, 3, size, size], 0);
    }

    const outputs = await super.forward(e);
    const {
      text_embeddings,
      image_embeddings,
      l2norm_text_embeddings,
      l2norm_image_embeddings,
    } = outputs;

    const result = {};
    if (!textMissing) {
      result.text_embeddings = text_embeddings;
      result.l2norm_text_embeddings = l2norm_text_embeddings;
    }
    if (!imageMissing) {
      result.image_embeddings = image_embeddings;
      result.l2norm_image_embeddings = l2norm_image_embeddings;
    }
    return result;
  }
}
/**
 * Codec-style model exposing separate encode and decode sessions.
 */
_S = class extends Fa {
  /** Runs `sessions.encoder_model` on `e` and wraps the result in `Qp`. */
  async encode(e) {
    const encoded = await xe(this.sessions.encoder_model, e);
    return new Qp(encoded);
  }

  /** Runs `sessions.decoder_model` on `e` and wraps the result in `Xp`. */
  async decode(e) {
    const decoded = await xe(this.sessions.decoder_model, e);
    return new Xp(decoded);
  }
}
/**
 * Multimodal causal LM that can generate either text (via `lm_head`) or image
 * tokens (via `gen_head`), selected by `this._generation_mode`.
 */
nP = class extends Kp {
  constructor(...args) {
    super(...args);
    k(this, "forward_params", [
      "input_ids",
      "pixel_values",
      "images_seq_mask",
      "images_emb_mask",
      "attention_mask",
      "position_ids",
      "past_key_values",
    ]);
    this._generation_mode = "text";
  }

  /**
   * Embeds the inputs, runs the decoder (`_r`) and then the head matching the
   * current generation mode.
   *
   * @returns {Promise<object>} Embedding, decoder and head outputs merged into one object.
   * @throws {Error} When the head session for the current mode is not loaded.
   */
  async forward(t) {
    const mode = this._generation_mode ?? "text";

    let embedOutputs;
    if (mode === "text" || !t.past_key_values) {
      const session = this.sessions.prepare_inputs_embeds;
      embedOutputs = await xe(session, rt(t, session.inputNames));
    } else {
      // Image mode with a cache: previously generated ids are embedded as image ids.
      const session = this.sessions.gen_img_embeds;
      embedOutputs = await xe(session, rt({ image_ids: t.input_ids }, session.inputNames));
    }

    const decoderOutputs = await _r(this, { ...t, ...embedOutputs });

    const headName = mode === "text" ? "lm_head" : "gen_head";
    const head = this.sessions[headName];
    if (!head) {
      // FIX: report the missing head's name; the old message interpolated the
      // (undefined) session itself and always read `Unable to find "undefined"`.
      throw new Error(`Unable to find "${headName}" generation head`);
    }
    const headOutputs = await xe(head, rt(decoderOutputs, head.inputNames));

    return { ...embedOutputs, ...decoderOutputs, ...headOutputs };
  }

  /**
   * Adjusts `r` (mutated and returned) before each generation step: doubles
   * the batch when `guidance_scale > 1`, and substitutes empty image inputs
   * when no pixels are given or a cache is already present.
   */
  prepare_inputs_for_generation(t, r, s) {
    const hasPast = !!r.past_key_values;

    if (s.guidance_scale !== null && s.guidance_scale > 1) {
      if (hasPast) {
        r.input_ids = ze([r.input_ids, r.input_ids], 0);
      } else {
        // Second half of the batch: pad-token ids with an all-zero attention mask.
        r.input_ids = ze([r.input_ids, Fi(r.input_ids, BigInt(s.pad_token_id))], 0);
        r.attention_mask = ze([r.attention_mask, Fi(r.attention_mask, 0n)], 0);
      }
    }

    if (hasPast || !r.pixel_values) {
      r.pixel_values = ct([0, 0, 3, 384, 384], 1);
    }

    if (hasPast) {
      r.images_seq_mask = new U("bool", [false], [1, 1]);
      r.images_emb_mask = new U("bool", [], [1, 1, 0]);
    }
    return r;
  }

  /** Text generation: forces text mode, then defers to the parent `generate`. */
  async generate(t) {
    this._generation_mode = "text";
    return super.generate(t);
  }

  /**
   * Image generation: generates tokens in image mode, strips the prompt,
   * decodes them with `sessions.image_decode` and converts each decoded
   * tensor to an image via `jt.fromTensor`.
   *
   * @returns {Promise<Array>} One image per decoded tensor.
   */
  async generate_images(t) {
    this._generation_mode = "image";

    const promptLength = (t.inputs ?? t[this.main_input_name]).dims[1];
    const allTokens = await super.generate(t);
    const generated_tokens = allTokens.slice(null, [promptLength, null]);

    const { decoded_image } = await xe(this.sessions.image_decode, { generated_tokens });
    // Maps values via (x + 1) * 127.5, clamped to [0, 255], as uint8 (in place).
    const pixels = decoded_image.add_(1).mul_(255 / 2).clamp_(0, 255).to("uint8");

    const images = [];
    for (const imageTensor of pixels) {
      images.push(jt.fromTensor(imageTensor));
    }
    return images;
  }
}
/**
 * Vision-language model whose `forward` builds `inputs_embeds` from token ids
 * plus (optional) image features before running the decoder.
 */
sm = class extends rm {
  /**
   * When `inputs_embeds` is not supplied, image features are computed with the
   * vision encoder (only if `pixel_values` is given and more than one token is
   * being processed); otherwise an empty `[0, hidden_size]` feature tensor is
   * used. The features and ids are then combined by `prepare_inputs_embeds`.
   *
   * @throws {Error} When `pixel_values` is used without `image_sizes`.
   */
  async forward({
    input_ids = null,
    attention_mask = null,
    pixel_values = null,
    image_sizes = null,
    position_ids = null,
    inputs_embeds = null,
    past_key_values = null,
    generation_config = null,
    logits_processor = null,
    ...rest
  }) {
    let embeds = inputs_embeds;

    if (!embeds) {
      let image_features;
      const useVision = pixel_values && input_ids.dims[1] !== 1;
      if (useVision) {
        if (!image_sizes) {
          throw new Error("`image_sizes` must be provided when `pixel_values` is provided.");
        }
        const visionOut = await xe(this.sessions.vision_encoder, { pixel_values, image_sizes });
        image_features = visionOut.image_features;
      } else {
        const hiddenSize = this.config.normalized_config.hidden_size;
        image_features = new U("float32", [], [0, hiddenSize]);
      }

      const embedOut = await xe(this.sessions.prepare_inputs_embeds, {
        input_ids,
        image_features,
      });
      embeds = embedOut.inputs_embeds;
    }

    const decoderInputs = {
      inputs_embeds: embeds,
      past_key_values,
      attention_mask,
      position_ids,
      generation_config,
      logits_processor,
    };
    return await _r(this, decoderInputs, false);
  }
}
/**
 * Promptable segmentation model: image encoder plus a combined prompt-encoder /
 * mask-decoder session.
 */
xF = class extends cm {
  /** Computes image embeddings for `pixel_values` via `Lr`. */
  async get_image_embeddings({ pixel_values }) {
    return await Lr(this, { pixel_values });
  }

  /**
   * Runs the prompt encoder / mask decoder, computing image embeddings first
   * when they are not supplied. Works on a shallow copy of `e`.
   *
   * @param {object} e Inputs: `pixel_values` or precomputed
   *   `image_embeddings` + `image_positional_embeddings`, plus any of
   *   `input_points`, `input_labels`, `input_boxes`.
   */
  async forward(e) {
    const hasEmbeddings = e.image_embeddings && e.image_positional_embeddings;
    const inputs = hasEmbeddings
      ? { ...e }
      : { ...e, ...(await this.get_image_embeddings(e)) };

    // FIX: only derive default labels when points exist. The previous code read
    // `input_points.dims` unconditionally and threw a TypeError for box-only
    // prompts, even though `input_points` is treated as optional below.
    if (inputs.input_points) {
      inputs.input_labels = inputs.input_labels ?? yt(inputs.input_points.dims.slice(0, -1));
    }

    const decoderInputs = {
      image_embeddings: inputs.image_embeddings,
      image_positional_embeddings: inputs.image_positional_embeddings,
    };
    // Optional prompts are forwarded only when present.
    for (const key of ["input_points", "input_labels", "input_boxes"]) {
      if (inputs[key]) {
        decoderInputs[key] = inputs[key];
      }
    }

    return await xe(this.sessions.prompt_encoder_mask_decoder, decoderInputs);
  }

  /** Wraps the parent `_call` result in `lm` (`iou_scores`, `pred_masks`). */
  async _call(e) {
    return new lm(await super._call(e));
  }
}
/**
 * Text-to-speech model: autoregressively generates a spectrogram and, when a
 * vocoder is supplied, converts it to a waveform.
 */
VF = class extends Ra {
  /**
   * @param e Input token ids.
   * @param t Speaker embeddings passed to every decoder step.
   * @param {object} [options]
   * @param {number} [options.threshold=0.5] Stop once any `prob` value reaches this.
   * @param {number} [options.minlenratio=0] Minimum steps, as a ratio of the scaled encoder length.
   * @param {number} [options.maxlenratio=20] Maximum steps, as a ratio of the scaled encoder length.
   * @param {object|null} [options.vocoder=null] Model whose `sessions.model` maps
   *   `spectrogram` to `waveform`.
   * @returns {Promise<{spectrogram: *, waveform: *}>} `waveform` is `null` when
   *   no vocoder is given.
   */
  async generate_speech(
    e,
    t,
    { threshold = 0.5, minlenratio = 0, maxlenratio = 20, vocoder = null } = {},
  ) {
    const { encoder_outputs, encoder_attention_mask } = await Lr(this, { input_ids: e });

    const scaledLength = encoder_outputs.dims[1] / this.config.reduction_factor;
    const maxSteps = Math.floor(scaledLength * maxlenratio);
    const minSteps = Math.floor(scaledLength * minlenratio);
    const melBins = this.config.num_mel_bins;

    const spectrumParts = [];
    let pastKeyValues = null;
    let lastOutputs = null;
    let step = 0;

    for (;;) {
      ++step;

      const use_cache_branch = np(!!lastOutputs);
      // First step starts from an all-zero frame; later steps feed back the previous output.
      const output_sequence = lastOutputs
        ? lastOutputs.output_sequence_out
        : new U("float32", new Float32Array(melBins), [1, 1, melBins]);

      const decoderInputs = {
        use_cache_branch,
        output_sequence,
        encoder_attention_mask,
        speaker_embeddings: t,
        encoder_hidden_states: encoder_outputs,
      };
      el(this, decoderInputs, pastKeyValues);
      lastOutputs = await xe(this.sessions.decoder_model_merged, decoderInputs);
      pastKeyValues = Zi(lastOutputs, pastKeyValues);

      const { prob, spectrum } = lastOutputs;
      spectrumParts.push(spectrum);

      const shouldStop =
        Array.from(prob.data).some((value) => value >= threshold) || step >= maxSteps;
      if (step >= minSteps && shouldStop) {
        break;
      }
    }

    const spectrogram = ze(spectrumParts);

    // FIX: `vocoder` defaults to null but was dereferenced unconditionally,
    // throwing a TypeError only after the whole decode loop had run. Without a
    // vocoder, return the spectrogram alone.
    if (vocoder == null) {
      return { spectrogram, waveform: null };
    }

    const { waveform } = await xe(vocoder.sessions.model, { spectrogram });
    return { spectrogram, waveform };
  }
}
/**
 * Creates the per-generation streaming state for the audio encoder.
 *
 * @param e Model exposing `config.text_config`, `config.audio_config` and
 *   `sessions.audio_encoder`.
 * @param t Sync or async iterable that yields input-feature chunks.
 * @returns {object} Fresh state: encoder session, zero-filled caches, an empty
 *   embedding queue with its counters, and the chunk iterator.
 * @throws {Error} When `t` is neither iterable nor async iterable.
 */
function SL(e, t) {
  const { text_config: textConfig, audio_config: audioConfig } = e.config;
  const encoderSession = e.sessions.audio_encoder;
  const { num_mel_bins: melBins, hidden_size: audioHidden } = audioConfig;
  const paddingChannels = melBins + audioHidden;
  const kvCache = new Yi();

  // The cache dtype comes from the session's config when present, else float32.
  const sessionConfig = encoderSession == null ? undefined : encoderSession.config;
  const cacheDtype = (sessionConfig == null ? undefined : sessionConfig.kv_cache_dtype) ?? "float32";
  const CacheArray = cacheDtype === "float16" ? ds.float16 : ds.float32;

  const cacheShapes = va(audioConfig, { batch_size: 1 });
  for (const name in cacheShapes) {
    const shape = cacheShapes[name];
    const elementCount = shape.reduce((product, dim) => product * dim, 1);
    kvCache[name] = new U(cacheDtype, new CacheArray(elementCount), shape);
  }

  const paddingCache = new U(cacheDtype, new CacheArray(paddingChannels * Tm), [
    1,
    paddingChannels,
    Tm,
  ]);

  // Prefer the async iterator protocol, fall back to the sync one.
  const asyncFactory = t[Symbol.asyncIterator];
  let chunksIter = asyncFactory == null ? undefined : asyncFactory.call(t);
  if (chunksIter == null) {
    const syncFactory = t[Symbol.iterator];
    chunksIter = syncFactory == null ? undefined : syncFactory.call(t);
  }
  if (!chunksIter) {
    throw new Error("input_features must be iterable or async iterable");
  }

  return {
    encoder_session: encoderSession,
    enc_kv_cache: kvCache,
    enc_padding_cache: paddingCache,
    enc_past_seq_len: 0,
    audio_embed_queue: [],
    audio_embed_total_tokens: 0,
    audio_queue_offset: 0,
    audio_consumed: 0,
    stream_exhausted: false,
    chunks_iter: chunksIter,
    text_hidden_size: textConfig.hidden_size,
  };
}

/**
 * Encodes one feature chunk with the streaming encoder and rolls the caches
 * stored on `e` forward.
 *
 * @param e Streaming state created by `SL` (mutated).
 * @param t Feature chunk; its third dimension is the frame count.
 * @returns The `audio_embeds` output of the encoder session.
 */
async function PL(e, t) {
  const frameCount = t.dims[2];
  const newTokens = Math.floor((CL + frameCount - 3) / 2) + 1;
  const firstPosition = e.enc_past_seq_len;

  const positionData = BigInt64Array.from({ length: newTokens }, (_, offset) =>
    BigInt(e.enc_past_seq_len + offset),
  );
  const position_ids = new U("int64", positionData, [1, newTokens]);
  const totalLength = firstPosition + newTokens;
  const attention_mask = yt([1, totalLength]);

  const outputs = await xe(e.encoder_session, {
    input_features: t,
    attention_mask,
    position_ids,
    past_padding_cache: e.enc_padding_cache,
    ...e.enc_kv_cache,
  });
  const { audio_embeds: audioEmbeds, present_padding_cache: nextPadding, ...others } = outputs;

  // Replaced GPU-resident tensors are disposed explicitly before being dropped.
  if (e.enc_padding_cache.location === "gpu-buffer") {
    e.enc_padding_cache.dispose();
  }
  e.enc_padding_cache = nextPadding;

  for (const outputName in others) {
    if (!outputName.startsWith("present.")) {
      continue;
    }
    const cacheName = outputName.replace("present", "past_key_values");
    const previous = e.enc_kv_cache[cacheName];
    if ((previous == null ? undefined : previous.location) === "gpu-buffer") {
      previous.dispose();
    }
    e.enc_kv_cache[cacheName] = others[outputName];
  }

  e.enc_past_seq_len = totalLength;
  return audioEmbeds;
}

/**
 * Pulls chunks from `e.chunks_iter` and encodes them until at least `t` audio
 * tokens have been queued in total, or the stream ends (which sets
 * `e.stream_exhausted`).
 */
async function FL(e, t) {
  while (e.audio_embed_total_tokens < t && !e.stream_exhausted) {
    const step = await e.chunks_iter.next();
    if (step.done) {
      e.stream_exhausted = true;
      break;
    }
    const embeds = await PL(e, step.value);
    const tokenCount = embeds.dims[1];
    e.audio_embed_queue.push({ data: embeds.data, tokens: tokenCount });
    e.audio_embed_total_tokens += tokenCount;
  }
}

/**
 * Adds queued audio embeddings element-wise into `t.data`, one
 * `text_hidden_size`-wide row per token, for up to `r` tokens. Fully consumed
 * queue entries are removed; `e.audio_consumed` grows by the number of tokens
 * actually applied.
 */
function LL(e, t, r) {
  if (e.audio_embed_queue.length === 0) {
    return;
  }

  const target = t.data;
  const hidden = e.text_hidden_size;
  let tokensWritten = 0;
  let tokensLeft = r;

  while (tokensLeft > 0 && e.audio_embed_queue.length > 0) {
    const head = e.audio_embed_queue[0];
    const available = head.tokens - e.audio_queue_offset;
    const take = Math.min(tokensLeft, available);
    const sourceBase = e.audio_queue_offset * hidden;
    const targetBase = tokensWritten * hidden;
    const valueCount = take * hidden;

    for (let i = 0; i < valueCount; ++i) {
      target[targetBase + i] += head.data[sourceBase + i];
    }

    tokensWritten += take;
    tokensLeft -= take;
    e.audio_queue_offset += take;

    if (e.audio_queue_offset >= head.tokens) {
      e.audio_embed_queue.shift();
      e.audio_queue_offset = 0;
    }
  }

  e.audio_consumed += r - tokensLeft;
}
extends Z_{constructor(){super(...arguments);k(this,"return_timestamps",null);k(this,"return_token_timestamps",null);k(this,"num_frames",null);k(this,"alignment_heads",null);k(this,"task",null);k(this,"language",null);k(this,"no_timestamps_token_id",null);k(this,"prompt_ids",null);k(this,"is_multilingual",null);k(this,"lang_to_id",null);k(this,"task_to_id",null);k(this,"max_initial_timestamp_index",1)}},ru=class extends P{constructor(){super(...arguments);k(this,"requires_attention_mask",!1);k(this,"main_input_name","input_features");k(this,"forward_params",["input_features","attention_mask","decoder_input_ids","decoder_attention_mask","past_key_values"])}},jL=class extends ru{},Pm=class extends ru{_prepare_generation_config(e,t){return super._prepare_generation_config(e,t,UL)}_retrieve_init_tokens(e){const t=[e.decoder_start_token_id];let r=e.language;const s=e.task;if(e.is_multilingual){r||(ue.warn("No language specified - defaulting to English (en)."),r="en");const a=`<|${M1(r)}|>`;t.push(e.lang_to_id[a]),t.push(e.task_to_id[s??"transcribe"])}else if(r||s)throw new Error("Cannot specify `task` or `language` for an English-only model. 
If the model is intended to be multilingual, pass `is_multilingual=true` to generate, or update the generation config.");return!e.return_timestamps&&e.no_timestamps_token_id&&t.at(-1)!==e.no_timestamps_token_id?t.push(e.no_timestamps_token_id):e.return_timestamps&&t.at(-1)===e.no_timestamps_token_id&&(ue.warn("<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`."),t.pop()),t.filter(n=>n!=null)}async generate({inputs:e=null,generation_config:t=null,logits_processor:r=null,stopping_criteria:s=null,...n}){t=this._prepare_generation_config(t,n);const a=n.decoder_input_ids instanceof U?Li(n.decoder_input_ids):n.decoder_input_ids??this._retrieve_init_tokens(t);if(t.return_timestamps&&(r??(r=new Xi),r.push(new rE(t,a))),t.begin_suppress_tokens&&(r??(r=new Xi),r.push(new K_(t.begin_suppress_tokens,a.length))),t.return_token_timestamps){if(!t.alignment_heads)throw new Error("Model generation config has no `alignment_heads`, token-level timestamps not available. 
See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config.");t.task==="translate"&&ue.warn("Token-level timestamps may not be reliable for task 'translate'."),t.output_attentions=!0,t.return_dict_in_generate=!0}if(t.return_timestamps&&!n.max_new_tokens)return this._generate_with_seek({inputs:e,generation_config:t,logits_processor:r,init_tokens:a,kwargs:n});const o=await super.generate({inputs:e,generation_config:t,logits_processor:r,decoder_input_ids:a,...n});return t.return_token_timestamps&&(o.token_timestamps=this._extract_token_timestamps(o,t.alignment_heads,t.num_frames,.02,a.length)),o}async _generate_with_seek({inputs:e,generation_config:t,logits_processor:r,init_tokens:s,kwargs:n}){const a=t.no_timestamps_token_id+1,o=Array.isArray(t.eos_token_id)?t.eos_token_id[0]:t.eos_token_id,i=t.return_token_timestamps,l=e,c=l.dims[2],d=2,h=this.config.max_source_positions,_=d*h;let p=0;const w=[],v=[];for(;p<c;){const M=Math.min(p+_,c),T=l.slice(null,null,[p,M]);let A;const C=T.dims[2];if(C<_){const W=l.dims[1],ee=new Float32Array(W*_),G=T.data;for(let L=0;L<W;++L)ee.set(G.subarray(L*C,(L+1)*C),L*_);A=new U("float32",ee,[1,W,_])}else A=T;if(r)for(const W of r)"begin_index"in W&&(W.begin_index=s.length);const S=await super.generate({inputs:A,generation_config:t,logits_processor:r,decoder_input_ids:s,...n}),x=(i?S.sequences:S)[0].tolist().map(Number).slice(s.length);let R;if(i){S.token_timestamps=this._extract_token_timestamps(S,t.alignment_heads,Math.floor((M-p)/d),.02,s.length);const W=p/d*.02;R=S.token_timestamps[0].tolist().slice(s.length).map(ee=>ee+W)}if(x.length>0&&x.at(-1)===o&&x.pop(),x.length===0)break;const z=x.map(W=>W>=a),$=x.length>=2&&z[x.length-1]&&!z[x.length-2],Q=[];for(let W=0;W<x.length-1;++W)z[W]&&z[W+1]&&Q.push(W+1);let H,D=x.length;if(Q.length>0)if($)H=M-p;else{const W=Q.at(-1);H=(x[W-1]-a)*d,D=W}else H=M-p;const I=Math.floor(p/d),te=a+1500;for(let 
W=0;W<D;++W)x[W]>=a&&(x[W]=Math.min(x[W]+I,te));w.push(...x.slice(0,D)),R&&v.push(...R.slice(0,D)),p+=H}w.push(o);const y=[...s,...w];if(i){const M=new U("int64",y.map(BigInt),[1,y.length]),T=[...new Array(s.length).fill(0),...v,0],A=new U("float32",new Float32Array(T),[1,T.length]);return{sequences:M,token_timestamps:A}}return new U("int64",y.map(BigInt),[1,y.length])}_extract_token_timestamps(e,t,r=null,s=.02,n=0){if(!e.cross_attentions)throw new Error("Model outputs must contain cross attentions to extract timestamps. This is most likely because the model was not exported with `output_attentions=True`.");r==null&&ue.warn("`num_frames` has not been set, meaning the entire audio will be analyzed. This may lead to inaccurate token-level timestamps for short audios (< 30 seconds).");let a=this.config.median_filter_width;a===void 0&&(ue.warn("Model config has no `median_filter_width`, using default value of 7."),a=7);const o=e.cross_attentions,i=Array.from({length:this.config.decoder_layers},(y,M)=>ze(o.map(T=>T[M]),2)),l=ur(t.map(([y,M])=>{if(y>=i.length)throw new Error(`Layer index ${y} is out of bounds for cross attentions (length ${i.length}).`);return r?i[y].slice(null,M,null,[0,r]):i[y].slice(null,M)})).transpose(1,0,2,3),[c,d]=Ex(l,-2,0,!0),h=l.clone();for(let y=0;y<h.dims[0];++y){const M=h[y];for(let T=0;T<M.dims[0];++T){const A=M[T],C=c[y][T][0].data,S=d[y][T][0].data;for(let N=0;N<A.dims[0];++N){let x=A[N].data;for(let R=0;R<x.length;++R)x[R]=(x[R]-S[R])/C[R];x.set(lx(x,a))}}}const _=n>0?h.slice(null,null,[n,h.dims[2]],null):h,p=[Ci(_,1)],w=e.sequences.dims,v=new U("float32",new Float32Array(w[0]*w[1]),w);for(let y=0;y<w[0];++y){const M=p[y].neg().squeeze_(0),[T,A]=ux(M.tolist()),C=Array.from({length:T.length-1},(R,z)=>T[z+1]-T[z]),S=Yt([1],C).map(R=>!!R),N=[];for(let R=0;R<S.length;++R)S[R]&&N.push(A[R]*s);const x=new Array(n).fill(0);x.push(...N),N.length>0&&x.push(N.at(-1)),v[y].data.set(x)}return v}},qL=class extends Pm{},Ps=class extends P{},WL=class 
extends Ps{},HL=class extends Ps{async _call(e){return new Je(await super._call(e))}},QL=class extends Ps{async _call(e){return new ie(await super._call(e))}},XL=class extends Ps{async _call(e){return new He(await super._call(e))}},YL=class extends Ps{async _call(e){return new ht(await super._call(e))}},Fs=class extends P{},JL=class extends Fs{},KL=class extends Fs{async _call(e){return new Je(await super._call(e))}},ZL=class extends Fs{async _call(e){return new ie(await super._call(e))}},eI=class extends Fs{async _call(e){return new He(await super._call(e))}},tI=class extends Fs{async _call(e){return new ht(await super._call(e))}},su=class extends P{},rI=class extends su{},sI=class extends su{async _call(e){return new Fm(await super._call(e))}},Fm=class extends Ye{constructor({logits:e,pred_boxes:t}){super(),this.logits=e,this.pred_boxes=t}},nu=class extends P{},nI=class extends nu{},aI=class extends nu{},oI=new Map([["bert","BertModel"],["eurobert","EuroBertModel"],["neobert","NeoBertModel"],["modernbert","ModernBertModel"],["nomic_bert","NomicBertModel"],["roformer","RoFormerModel"],["electra","ElectraModel"],["esm","EsmModel"],["convbert","ConvBertModel"],["camembert","CamembertModel"],["deberta","DebertaModel"],["deberta-v2","DebertaV2Model"],["mpnet","MPNetModel"],["albert","AlbertModel"],["distilbert","DistilBertModel"],["roberta","RobertaModel"],["xlm","XLMModel"],["xlm-roberta","XLMRobertaModel"],["clap","ClapModel"],["clip","CLIPModel"],["clipseg","CLIPSegModel"],["chinese_clip","ChineseCLIPModel"],["siglip","SiglipModel"],["jina_clip","JinaCLIPModel"],["mobilebert","MobileBertModel"],["squeezebert","SqueezeBertModel"],["wav2vec2","Wav2Vec2Model"],["wav2vec2-bert","Wav2Vec2BertModel"],["unispeech","UniSpeechModel"],["unispeech-sat","UniSpeechSatModel"],["hubert","HubertModel"],["wavlm","WavLMModel"],["audio-spectrogram-transformer","ASTModel"],["vits","VitsModel"],["pyannote","PyAnnoteModel"],["wespeaker-resnet","WeSpeakerResNetModel"],["detr","DetrModel"]
,["rt_detr","RTDetrModel"],["rt_detr_v2","RTDetrV2Model"],["rf_detr","RFDetrModel"],["d_fine","DFineModel"],["table-transformer","TableTransformerModel"],["vit","ViTModel"],["ijepa","IJepaModel"],["pvt","PvtModel"],["vit_msn","ViTMSNModel"],["vit_mae","ViTMAEModel"],["groupvit","GroupViTModel"],["fastvit","FastViTModel"],["mobilevit","MobileViTModel"],["mobilevitv2","MobileViTV2Model"],["owlvit","OwlViTModel"],["owlv2","Owlv2Model"],["beit","BeitModel"],["deit","DeiTModel"],["hiera","HieraModel"],["convnext","ConvNextModel"],["convnextv2","ConvNextV2Model"],["dinov2","Dinov2Model"],["dinov2_with_registers","Dinov2WithRegistersModel"],["dinov3_vit","DINOv3ViTModel"],["dinov3_convnext","DINOv3ConvNextModel"],["resnet","ResNetModel"],["swin","SwinModel"],["swin2sr","Swin2SRModel"],["donut-swin","DonutSwinModel"],["yolos","YolosModel"],["dpt","DPTModel"],["glpn","GLPNModel"],["hifigan","SpeechT5HifiGan"],["efficientnet","EfficientNetModel"],["decision_transformer","DecisionTransformerModel"],["patchtst","PatchTSTModel"],["patchtsmixer","PatchTSMixerModel"],["mobilenet_v1","MobileNetV1Model"],["mobilenet_v2","MobileNetV2Model"],["mobilenet_v3","MobileNetV3Model"],["mobilenet_v4","MobileNetV4Model"],["maskformer","MaskFormerModel"],["mgp-str","MgpstrForSceneTextRecognition"],["style_text_to_speech_2","StyleTextToSpeech2Model"]]),iI=new Map([["t5","T5Model"],["longt5","LongT5Model"],["mt5","MT5Model"],["bart","BartModel"],["mbart","MBartModel"],["marian","MarianModel"],["whisper","WhisperModel"],["cohere_asr","CohereAsrModel"],["m2m_100","M2M100Model"],["blenderbot","BlenderbotModel"],["blenderbot-small","BlenderbotSmallModel"]]),lI=new Map([["mimi","MimiModel"],["dac","DacModel"],["snac","SnacModel"]]),cI=new 
Map([["bloom","BloomModel"],["jais","JAISModel"],["gpt2","GPT2Model"],["gpt_oss","GptOssModel"],["gptj","GPTJModel"],["gpt_bigcode","GPTBigCodeModel"],["gpt_neo","GPTNeoModel"],["gpt_neox","GPTNeoXModel"],["codegen","CodeGenModel"],["llama","LlamaModel"],["apertus","ApertusModel"],["nanochat","NanoChatModel"],["arcee","ArceeModel"],["afmoe","AfmoeModel"],["lfm2","Lfm2Model"],["lfm2_moe","Lfm2MoeModel"],["smollm3","SmolLM3Model"],["exaone","ExaoneModel"],["olmo","OlmoModel"],["olmo2","Olmo2Model"],["olmo3","Olmo3Model"],["olmo_hybrid","OlmoHybridModel"],["mobilellm","MobileLLMModel"],["granite","GraniteModel"],["granitemoehybrid","GraniteMoeHybridModel"],["cohere","CohereModel"],["cohere2","Cohere2Model"],["gemma","GemmaModel"],["gemma2","Gemma2Model"],["vaultgemma","VaultGemmaModel"],["gemma3_text","Gemma3Model"],["helium","HeliumModel"],["glm","GlmModel"],["glm_moe_dsa","GlmMoeDsaModel"],["openelm","OpenELMModel"],["qwen2","Qwen2Model"],["qwen2_moe","Qwen2MoeModel"],["qwen3","Qwen3Model"],["qwen3_moe","Qwen3MoeModel"],["qwen3_next","Qwen3NextModel"],["phi","PhiModel"],["phi3","Phi3Model"],["mpt","MptModel"],["opt","OPTModel"],["mistral","MistralModel"],["mistral4","Mistral4Model"],["ministral","MinistralModel"],["ministral3","Ministral3Model"],["ernie4_5","Ernie4_5ForCausalLM"],["starcoder2","Starcoder2Model"],["deepseek_v3","DeepseekV3Model"],["falcon","FalconModel"],["falcon_h1","FalconH1Model"],["nemotron_h","NemotronHModel"],["solar_open","SolarOpenModel"],["stablelm","StableLmModel"],["modernbert-decoder","ModernBertDecoderModel"],["hunyuan_v1_dense","HunYuanDenseV1Model"],["youtu","YoutuModel"]]),Lm=new Map([["speecht5","SpeechT5ForSpeechToText"],["whisper","WhisperForConditionalGeneration"],["lite-whisper","LiteWhisperForConditionalGeneration"],["moonshine","MoonshineForConditionalGeneration"],["cohere_asr","CohereAsrForConditionalGeneration"]]),Im=new Map([["speecht5","SpeechT5ForTextToSpeech"]]),Om=new 
Map([["vits","VitsModel"],["musicgen","MusicgenForConditionalGeneration"],["supertonic","SupertonicForConditionalGeneration"]]),Nm=new Map([["bert","BertForSequenceClassification"],["eurobert","EuroBertForSequenceClassification"],["neobert","NeoBertForSequenceClassification"],["modernbert","ModernBertForSequenceClassification"],["roformer","RoFormerForSequenceClassification"],["electra","ElectraForSequenceClassification"],["esm","EsmForSequenceClassification"],["convbert","ConvBertForSequenceClassification"],["camembert","CamembertForSequenceClassification"],["deberta","DebertaForSequenceClassification"],["deberta-v2","DebertaV2ForSequenceClassification"],["mpnet","MPNetForSequenceClassification"],["albert","AlbertForSequenceClassification"],["distilbert","DistilBertForSequenceClassification"],["roberta","RobertaForSequenceClassification"],["xlm","XLMForSequenceClassification"],["xlm-roberta","XLMRobertaForSequenceClassification"],["bart","BartForSequenceClassification"],["mbart","MBartForSequenceClassification"],["mobilebert","MobileBertForSequenceClassification"],["squeezebert","SqueezeBertForSequenceClassification"]]),Dm=new Map([["bert","BertForTokenClassification"],["eurobert","EuroBertForTokenClassification"],["neobert","NeoBertForTokenClassification"],["modernbert","ModernBertForTokenClassification"],["roformer","RoFormerForTokenClassification"],["electra","ElectraForTokenClassification"],["esm","EsmForTokenClassification"],["convbert","ConvBertForTokenClassification"],["camembert","CamembertForTokenClassification"],["deberta","DebertaForTokenClassification"],["deberta-v2","DebertaV2ForTokenClassification"],["mpnet","MPNetForTokenClassification"],["distilbert","DistilBertForTokenClassification"],["roberta","RobertaForTokenClassification"],["xlm","XLMForTokenClassification"],["xlm-roberta","XLMRobertaForTokenClassification"]]),zm=new 
Map([["t5","T5ForConditionalGeneration"],["longt5","LongT5ForConditionalGeneration"],["mt5","MT5ForConditionalGeneration"],["bart","BartForConditionalGeneration"],["mbart","MBartForConditionalGeneration"],["marian","MarianMTModel"],["m2m_100","M2M100ForConditionalGeneration"],["blenderbot","BlenderbotForConditionalGeneration"],["blenderbot-small","BlenderbotSmallForConditionalGeneration"]]),Bm=new Map([["bloom","BloomForCausalLM"],["gpt2","GPT2LMHeadModel"],["gpt_oss","GptOssForCausalLM"],["jais","JAISLMHeadModel"],["gptj","GPTJForCausalLM"],["gpt_bigcode","GPTBigCodeForCausalLM"],["gpt_neo","GPTNeoForCausalLM"],["gpt_neox","GPTNeoXForCausalLM"],["codegen","CodeGenForCausalLM"],["llama","LlamaForCausalLM"],["nanochat","NanoChatForCausalLM"],["apertus","ApertusForCausalLM"],["llama4_text","Llama4ForCausalLM"],["arcee","ArceeForCausalLM"],["afmoe","AfmoeForCausalLM"],["lfm2","Lfm2ForCausalLM"],["lfm2_moe","Lfm2MoeForCausalLM"],["smollm3","SmolLM3ForCausalLM"],["exaone","ExaoneForCausalLM"],["olmo","OlmoForCausalLM"],["olmo2","Olmo2ForCausalLM"],["olmo3","Olmo3ForCausalLM"],["olmo_hybrid","OlmoHybridForCausalLM"],["mobilellm","MobileLLMForCausalLM"],["granite","GraniteForCausalLM"],["granitemoehybrid","GraniteMoeHybridForCausalLM"],["cohere","CohereForCausalLM"],["cohere2","Cohere2ForCausalLM"],["gemma","GemmaForCausalLM"],["gemma2","Gemma2ForCausalLM"],["vaultgemma","VaultGemmaForCausalLM"],["gemma3_text","Gemma3ForCausalLM"],["gemma3","Gemma3ForCausalLM"],["helium","HeliumForCausalLM"],["glm","GlmForCausalLM"],["glm_moe_dsa","GlmMoeDsaForCausalLM"],["openelm","OpenELMForCausalLM"],["qwen2","Qwen2ForCausalLM"],["qwen2_moe","Qwen2MoeForCausalLM"],["qwen3","Qwen3ForCausalLM"],["qwen3_moe","Qwen3MoeForCausalLM"],["qwen3_next","Qwen3NextForCausalLM"],["qwen2_vl","Qwen2VLForCausalLM"],["qwen2_5_vl","Qwen2_5_VLForCausalLM"],["qwen3_vl","Qwen3VLForCausalLM"],["qwen3_vl_moe","Qwen3VLMoeForCausalLM"],["qwen3_5","Qwen3_5ForCausalLM"],["qwen3_5_text","Qwen3_5ForCausalLM"],["qwen
3_5_moe","Qwen3_5MoeForCausalLM"],["gemma3n","Gemma3nForCausalLM"],["gemma4","Gemma4ForCausalLM"],["phi","PhiForCausalLM"],["phi3","Phi3ForCausalLM"],["mpt","MptForCausalLM"],["opt","OPTForCausalLM"],["mbart","MBartForCausalLM"],["mistral","MistralForCausalLM"],["mistral4","Mistral4ForCausalLM"],["ministral","MinistralForCausalLM"],["ministral3","Ministral3ForCausalLM"],["ernie4_5","Ernie4_5ForCausalLM"],["starcoder2","Starcoder2ForCausalLM"],["deepseek_v3","DeepseekV3ForCausalLM"],["falcon","FalconForCausalLM"],["falcon_h1","FalconH1ForCausalLM"],["nemotron_h","NemotronHForCausalLM"],["trocr","TrOCRForCausalLM"],["solar_open","SolarOpenForCausalLM"],["stablelm","StableLmForCausalLM"],["modernbert-decoder","ModernBertDecoderForCausalLM"],["hunyuan_v1_dense","HunYuanDenseV1ForCausalLM"],["youtu","YoutuForCausalLM"],["phi3_v","Phi3VForCausalLM"]]),uI=new Map([["multi_modality","MultiModalityCausalLM"]]),Rm=new Map([["bert","BertForMaskedLM"],["eurobert","EuroBertForMaskedLM"],["neobert","NeoBertForMaskedLM"],["modernbert","ModernBertForMaskedLM"],["roformer","RoFormerForMaskedLM"],["electra","ElectraForMaskedLM"],["esm","EsmForMaskedLM"],["convbert","ConvBertForMaskedLM"],["camembert","CamembertForMaskedLM"],["deberta","DebertaForMaskedLM"],["deberta-v2","DebertaV2ForMaskedLM"],["mpnet","MPNetForMaskedLM"],["albert","AlbertForMaskedLM"],["distilbert","DistilBertForMaskedLM"],["roberta","RobertaForMaskedLM"],["xlm","XLMWithLMHeadModel"],["xlm-roberta","XLMRobertaForMaskedLM"],["mobilebert","MobileBertForMaskedLM"],["squeezebert","SqueezeBertForMaskedLM"]]),Gm=new 
Map([["bert","BertForQuestionAnswering"],["neobert","NeoBertForQuestionAnswering"],["roformer","RoFormerForQuestionAnswering"],["electra","ElectraForQuestionAnswering"],["convbert","ConvBertForQuestionAnswering"],["camembert","CamembertForQuestionAnswering"],["deberta","DebertaForQuestionAnswering"],["deberta-v2","DebertaV2ForQuestionAnswering"],["mpnet","MPNetForQuestionAnswering"],["albert","AlbertForQuestionAnswering"],["distilbert","DistilBertForQuestionAnswering"],["roberta","RobertaForQuestionAnswering"],["xlm","XLMForQuestionAnswering"],["xlm-roberta","XLMRobertaForQuestionAnswering"],["mobilebert","MobileBertForQuestionAnswering"],["squeezebert","SqueezeBertForQuestionAnswering"]]),$m=new Map([["vision-encoder-decoder","VisionEncoderDecoderModel"],["idefics3","Idefics3ForConditionalGeneration"],["smolvlm","SmolVLMForConditionalGeneration"]]),Vm=new Map([["llava","LlavaForConditionalGeneration"],["llava_onevision","LlavaOnevisionForConditionalGeneration"],["moondream1","Moondream1ForConditionalGeneration"],["florence2","Florence2ForConditionalGeneration"],["qwen2_vl","Qwen2VLForConditionalGeneration"],["qwen2_5_vl","Qwen2_5_VLForConditionalGeneration"],["qwen3_vl","Qwen3VLForConditionalGeneration"],["qwen3_vl_moe","Qwen3VLMoeForConditionalGeneration"],["qwen3_5","Qwen3_5ForConditionalGeneration"],["qwen3_5_moe","Qwen3_5MoeForConditionalGeneration"],["lfm2_vl","Lfm2VlForConditionalGeneration"],["idefics3","Idefics3ForConditionalGeneration"],["smolvlm","SmolVLMForConditionalGeneration"],["paligemma","PaliGemmaForConditionalGeneration"],["llava_qwen2","LlavaQwen2ForCausalLM"],["gemma3","Gemma3ForConditionalGeneration"],["gemma3n","Gemma3nForConditionalGeneration"],["gemma4","Gemma4ForConditionalGeneration"],["mistral3","Mistral3ForConditionalGeneration"],["lighton_ocr","LightOnOcrForConditionalGeneration"],["glm_ocr","GlmOcrForConditionalGeneration"]]),Um=new 
Map([["granite_speech","GraniteSpeechForConditionalGeneration"],["ultravox","UltravoxModel"],["voxtral","VoxtralForConditionalGeneration"],["voxtral_realtime","VoxtralRealtimeForConditionalGeneration"]]),dI=new Map([["vision-encoder-decoder","VisionEncoderDecoderModel"]]),jm=new Map([["vit","ViTForImageClassification"],["ijepa","IJepaForImageClassification"],["pvt","PvtForImageClassification"],["vit_msn","ViTMSNForImageClassification"],["fastvit","FastViTForImageClassification"],["mobilevit","MobileViTForImageClassification"],["mobilevitv2","MobileViTV2ForImageClassification"],["beit","BeitForImageClassification"],["deit","DeiTForImageClassification"],["hiera","HieraForImageClassification"],["convnext","ConvNextForImageClassification"],["convnextv2","ConvNextV2ForImageClassification"],["dinov2","Dinov2ForImageClassification"],["dinov2_with_registers","Dinov2WithRegistersForImageClassification"],["resnet","ResNetForImageClassification"],["swin","SwinForImageClassification"],["segformer","SegformerForImageClassification"],["efficientnet","EfficientNetForImageClassification"],["mobilenet_v1","MobileNetV1ForImageClassification"],["mobilenet_v2","MobileNetV2ForImageClassification"],["mobilenet_v3","MobileNetV3ForImageClassification"],["mobilenet_v4","MobileNetV4ForImageClassification"]]),qm=new Map([["detr","DetrForObjectDetection"],["rt_detr","RTDetrForObjectDetection"],["rt_detr_v2","RTDetrV2ForObjectDetection"],["rf_detr","RFDetrForObjectDetection"],["d_fine","DFineForObjectDetection"],["table-transformer","TableTransformerForObjectDetection"],["yolos","YolosForObjectDetection"]]),Wm=new Map([["owlvit","OwlViTForObjectDetection"],["owlv2","Owlv2ForObjectDetection"],["grounding-dino","GroundingDinoForObjectDetection"]]),Ls=new Map([["detr","DetrForSegmentation"],["clipseg","CLIPSegForImageSegmentation"]]),Hm=new 
Map([["segformer","SegformerForSemanticSegmentation"],["sapiens","SapiensForSemanticSegmentation"],["swin","SwinForSemanticSegmentation"],["mobilenet_v1","MobileNetV1ForSemanticSegmentation"],["mobilenet_v2","MobileNetV2ForSemanticSegmentation"],["mobilenet_v3","MobileNetV3ForSemanticSegmentation"],["mobilenet_v4","MobileNetV4ForSemanticSegmentation"]]),Qm=new Map([["detr","DetrForSegmentation"],["maskformer","MaskFormerForInstanceSegmentation"]]),Xm=new Map([["sam","SamModel"],["sam2","Sam2Model"],["edgetam","EdgeTamModel"],["sam3_tracker","Sam3TrackerModel"]]),Ym=new Map([["wav2vec2","Wav2Vec2ForCTC"],["wav2vec2-bert","Wav2Vec2BertForCTC"],["unispeech","UniSpeechForCTC"],["unispeech-sat","UniSpeechSatForCTC"],["wavlm","WavLMForCTC"],["hubert","HubertForCTC"],["parakeet_ctc","ParakeetForCTC"]]),Jm=new Map([["wav2vec2","Wav2Vec2ForSequenceClassification"],["wav2vec2-bert","Wav2Vec2BertForSequenceClassification"],["unispeech","UniSpeechForSequenceClassification"],["unispeech-sat","UniSpeechSatForSequenceClassification"],["wavlm","WavLMForSequenceClassification"],["hubert","HubertForSequenceClassification"],["audio-spectrogram-transformer","ASTForAudioClassification"]]),Km=new Map([["wavlm","WavLMForXVector"]]),Zm=new Map([["unispeech-sat","UniSpeechSatForAudioFrameClassification"],["wavlm","WavLMForAudioFrameClassification"],["wav2vec2","Wav2Vec2ForAudioFrameClassification"],["pyannote","PyAnnoteForAudioFrameClassification"]]),eg=new Map([["vitmatte","VitMatteForImageMatting"]]),hI=new Map([["patchtst","PatchTSTForPrediction"],["patchtsmixer","PatchTSMixerForPrediction"]]),tg=new Map([["swin2sr","Swin2SRForImageSuperResolution"]]),rg=new 
Map([["chmv2","CHMv2ForDepthEstimation"],["dpt","DPTForDepthEstimation"],["depth_anything","DepthAnythingForDepthEstimation"],["glpn","GLPNForDepthEstimation"],["sapiens","SapiensForDepthEstimation"],["depth_pro","DepthProForDepthEstimation"],["metric3d","Metric3DForDepthEstimation"],["metric3dv2","Metric3Dv2ForDepthEstimation"]]),sg=new Map([["sapiens","SapiensForNormalEstimation"]]),ng=new Map([["vitpose","VitPoseForPoseEstimation"]]),ag=new Map([["clip","CLIPVisionModelWithProjection"],["siglip","SiglipVisionModel"],["jina_clip","JinaCLIPVisionModel"]]),og=[[oI,q.EncoderOnly],[iI,q.EncoderDecoder],[cI,q.DecoderOnlyWithoutHead],[lI,q.AutoEncoder],[Nm,q.EncoderOnly],[Dm,q.EncoderOnly],[zm,q.Seq2Seq],[Lm,q.Seq2Seq],[Bm,q.DecoderOnly],[uI,q.MultiModality],[Rm,q.EncoderOnly],[Gm,q.EncoderOnly],[$m,q.Vision2Seq],[Vm,q.ImageTextToText],[Um,q.AudioTextToText],[jm,q.EncoderOnly],[Ls,q.EncoderOnly],[Qm,q.EncoderOnly],[Hm,q.EncoderOnly],[eg,q.EncoderOnly],[hI,q.EncoderOnly],[tg,q.EncoderOnly],[rg,q.EncoderOnly],[sg,q.EncoderOnly],[ng,q.EncoderOnly],[qm,q.EncoderOnly],[Wm,q.EncoderOnly],[Xm,q.MaskGeneration],[Ym,q.EncoderOnly],[Jm,q.EncoderOnly],[Im,q.Seq2Seq],[Om,q.EncoderOnly],[Km,q.EncoderOnly],[Zm,q.EncoderOnly],[ag,q.EncoderOnly]];for(const[e,t]of og)for(const r of e.values()){fr.set(r,t);const s=rl[r];gs.set(s,r),Ki.set(r,s)}var 
fI=[["MusicgenForConditionalGeneration",Zp,q.Musicgen],["Phi3VForCausalLM",sm,q.Phi3V],["CLIPTextModelWithProjection",wp,q.EncoderOnly],["SiglipTextModel",hm,q.EncoderOnly],["JinaCLIPTextModel",Vp,q.EncoderOnly],["ClapTextModelWithProjection",mp,q.EncoderOnly],["ClapAudioModelWithProjection",gp,q.EncoderOnly],["DacEncoderModel",bp,q.EncoderOnly],["DacDecoderModel",Mp,q.EncoderOnly],["MimiEncoderModel",Yp,q.EncoderOnly],["MimiDecoderModel",Jp,q.EncoderOnly],["SnacEncoderModel",fm,q.EncoderOnly],["SnacDecoderModel",_m,q.EncoderOnly],["Gemma3nForConditionalGeneration",Sa,q.ImageAudioTextToText],["Gemma4ForConditionalGeneration",Ol,q.ImageAudioTextToText],["SupertonicForConditionalGeneration",gm,q.Supertonic],["ChatterboxModel",fp,q.Chatterbox],["VoxtralRealtimeForConditionalGeneration",Am,q.VoxtralRealtime]];for(const[e,t,r]of fI)fr.set(e,r),gs.set(t,e),Ki.set(e,t);var ig=new Map([["modnet",Ls],["birefnet",Ls],["isnet",Ls],["ben",Ls]]);for(const[e,t]of ig.entries())t.set(e,"PreTrainedModel"),fr.set(e,q.EncoderOnly),Ki.set(e,P);var _I=new Set(ig.keys());fr.set("PreTrainedModel",q.EncoderOnly),gs.set(P,"PreTrainedModel");var 
Ae={MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES:Nm,MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES:Dm,MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES:Im,MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES:Om,MODEL_FOR_MASKED_LM_MAPPING_NAMES:Rm,MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES:Gm,MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES:jm,MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES:Ls,MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES:Hm,MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES:Qm,MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES:qm,MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES:Wm,MODEL_FOR_MASK_GENERATION_MAPPING_NAMES:Xm,MODEL_FOR_CTC_MAPPING_NAMES:Ym,MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES:Jm,MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES:Km,MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES:Zm,MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES:dI,MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES:eg,MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES:tg,MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES:rg,MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES:sg,MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES:ng,MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES:ag,MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES:Vm,MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES:Um,MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:zm,MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES:Lm,MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:Bm,MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES:$m};wE(Ae);var Ce=(ao=class{static supports(e){if(!this.MODEL_CLASS_MAPPINGS)return!1;for(const t of this.MODEL_CLASS_MAPPINGS)if(t.has(e))return!0;return this.BASE_IF_FAIL}static async from_pretrained(e,{progress_callback:t=null,config:r=null,cache_dir:s=null,local_files_only:n=!1,revision:a="main",model_file_name:o=null,subfolder:i="onnx",device:l=null,dtype:c=null,use_external_data_format:d=null,session_options:h={}}={}){const _={progress_callback:t,config:r,cache_dir:s,local_files_only:n,revision:a,model_file_name:o,subfolder:i,device:l,dtype:c,use_external_data_format:d,session_options:h};if(_.config=await 
nn.from_pretrained(e,_),!this.MODEL_CLASS_MAPPINGS)throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: "+this.name);const{model_type:p}=_.config;for(const w of this.MODEL_CLASS_MAPPINGS){let v=w.get(p);if(!v){for(const y of w.values())if(y[0]===p){v=y;break}if(!v)continue}return await rl[v].from_pretrained(e,_)}if(this.BASE_IF_FAIL)return _I.has(p)||ue.warn(`Unknown model class "${p}", attempting to construct from base class.`),await P.from_pretrained(e,_);throw Error(`Unsupported model type: ${p}`)}},k(ao,"MODEL_CLASS_MAPPINGS",null),k(ao,"BASE_IF_FAIL",!1),ao),mn=(oo=class extends Ce{},k(oo,"MODEL_CLASS_MAPPINGS",og.map(e=>e[0])),k(oo,"BASE_IF_FAIL",!0),oo),lg=(fu=class extends Ce{},k(fu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES]),fu),pI=(_u=class extends Ce{},k(_u,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES]),_u),au=(pu=class extends Ce{},k(pu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES]),pu),mI=(mu=class extends Ce{},k(mu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES]),mu),gI=(gu=class extends Ce{},k(gu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES]),gu),wI=(wu=class extends Ce{},k(wu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES]),wu),vI=(vu=class extends Ce{},k(vu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES]),vu),yI=(yu=class extends Ce{},k(yu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_MASKED_LM_MAPPING_NAMES]),yu),bI=(bu=class extends Ce{},k(bu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES]),bu),MI=(Mu=class extends Ce{},k(Mu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES]),Mu),xI=(xu=class extends Ce{},k(xu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES]),xu),cg=(ku=class extends Ce{},k(ku,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES]),ku),ug=(Tu=class extends 
Ce{},k(Tu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES]),Tu),dg=(Eu=class extends Ce{},k(Eu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES]),Eu),kI=(Au=class extends Ce{},k(Au,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES]),Au),TI=(Cu=class extends Ce{},k(Cu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES]),Cu);Su=class extends Ce{},k(Su,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_MASK_GENERATION_MAPPING_NAMES]);var EI=(Pu=class extends Ce{},k(Pu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_CTC_MAPPING_NAMES]),Pu),AI=(Fu=class extends Ce{},k(Fu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES]),Fu);Lu=class extends Ce{},k(Lu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES]),Iu=class extends Ce{},k(Iu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES]);var CI=(Ou=class extends Ce{},k(Ou,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES]),Ou);Nu=class extends Ce{},k(Nu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES]);var SI=(Du=class extends Ce{},k(Du,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES]),Du),PI=(zu=class extends Ce{},k(zu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES]),zu);Bu=class extends Ce{},k(Bu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES]),Ru=class extends Ce{},k(Ru,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES]);var FI=(Gu=class extends Ce{},k(Gu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES]),Gu);$u=class extends Ce{},k($u,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES]),Vu=class extends Ce{},k(Vu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES]);async function Wt(e){return Array.isArray(e)||(e=[e]),await Promise.all(e.map(t=>jt.read(t)))}async function Is(e,t){return 
Array.isArray(e)||(e=[e]),await Promise.all(e.map(r=>typeof r=="string"||r instanceof URL?F1(r,t):r instanceof Float64Array?new Float32Array(r):r))}function ou(e,t){t&&(e=e.map(o=>o|0));const[r,s,n,a]=e;return{xmin:r,ymin:s,xmax:n,ymax:a}}var Ve=class extends vt{constructor({task:e,model:t,tokenizer:r=null,processor:s=null}){super(),this.task=e,this.model=t,this.tokenizer=r,this.processor=s}async dispose(){await this.model.dispose()}},LI=class extends Ve{async _call(e,{top_k:t=1}={}){const r=this.tokenizer(e,{padding:!0,truncation:!0}),s=await this.model(r),{problem_type:n,id2label:a}=this.model.config,o=n==="multi_label_classification"?l=>l.sigmoid():l=>new U("float32",nt(l.data),l.dims),i=[];for(const l of s.logits){const c=o(l),d=await hs(c,t),h=d[0].tolist(),p=d[1].tolist().map((w,v)=>({label:a?a[w]:`LABEL_${w}`,score:h[v]}));t===1?i.push(...p):i.push(p)}return Array.isArray(e)||t===1?i:i[0]}},II=class extends Ve{async _call(e,{ignore_labels:t=["O"]}={}){const r=Array.isArray(e),s=this.tokenizer(r?e:[e],{padding:!0,truncation:!0}),a=(await this.model(s)).logits,o=this.model.config.id2label,i=[];for(let l=0;l<a.dims[0];++l){const c=s.input_ids[l],d=a[l],h=[];for(let _=0;_<d.dims[0];++_){const p=d[_],w=je(p.data)[1],v=o?o[w]:`LABEL_${w}`;if(t.includes(v))continue;const y=this.tokenizer.decode([c[_].item()],{skip_special_tokens:!0});if(y==="")continue;const M=nt(p.data);h.push({entity:v,score:M[w],index:_,word:y})}i.push(h)}return r?i:i[0]}},OI=class extends Ve{async _call(e,t,{top_k:r=1}={}){const s=this.tokenizer(e,{text_pair:t,padding:!0,truncation:!0}),n=Array.isArray(e),{start_logits:a,end_logits:o}=await this.model(s),i=s.input_ids.tolist(),l=s.attention_mask.tolist(),{all_special_ids:c,sep_token_id:d}=this.tokenizer,h=[];for(let _=0;_<a.dims[0];++_){const p=i[_],w=p.findIndex(S=>S==d),v=a[_].tolist(),y=o[_].tolist();for(let S=1;S<v.length;++S)(l[_]==0||S<=w||c.findIndex(N=>N==p[S])!==-1)&&(v[S]=-1/0,y[S]=-1/0);const 
M=nt(v).map((S,N)=>[S,N]),T=nt(y).map((S,N)=>[S,N]);M[0][0]=0,T[0][0]=0;const A=Iy(M,T).filter(S=>S[0][1]<=S[1][1]).map(S=>[S[0][1],S[1][1],S[0][0]*S[1][0]]).sort((S,N)=>N[2]-S[2]),C=[];for(let S=0;S<Math.min(A.length,r);++S){const[N,x,R]=A[S],z=p.slice(N,x+1),$=this.tokenizer.decode(z,{skip_special_tokens:!0});C.push({answer:$,score:R})}r===1?h.push(...C):h.push(C)}return n?h:h[0]}},NI=class extends Ve{async _call(e,{top_k:t=5}={}){const{mask_token_id:r,mask_token:s}=this.tokenizer,n=this.tokenizer(e,{padding:!0,truncation:!0}),{logits:a}=await this.model(n),o=[],i=n.input_ids.tolist();for(let l=0;l<i.length;++l){const c=i[l],d=c.findIndex(v=>v==r);if(d===-1)throw Error(`Mask token (${s}) not found in text.`);const h=a[l][d],_=await hs(new U("float32",nt(h.data),h.dims),t),p=_[0].tolist(),w=_[1].tolist();o.push(w.map((v,y)=>{const M=c.slice();return M[d]=v,{score:p[y],token:Number(v),token_str:this.tokenizer.decode([v]),sequence:this.tokenizer.decode(M,{skip_special_tokens:!0})}}))}return Array.isArray(e)?o:o[0]}},iu=class extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{max_new_tokens:256});k(this,"_key","generated_text")}async _call(t,r={}){Array.isArray(t)||(t=[t]),this.model.config.prefix&&(t=t.map(l=>this.model.config.prefix+l));const s=this.model.config.task_specific_params;s&&s[this.task]&&s[this.task].prefix&&(t=t.map(l=>s[this.task].prefix+l));const n=this.tokenizer,a={padding:!0,truncation:!0};let o;this.task==="translation"&&"_build_translation_inputs"in n?o=n._build_translation_inputs(t,a,r):o=n(t,a);const i=await this.model.generate({...o,...this._default_generation_config,...r});return n.batch_decode(i,{skip_special_tokens:!0}).map(l=>({[this._key]:l}))}},DI=class extends iu{constructor(){super(...arguments);k(this,"_key","summary_text")}},zI=class extends iu{constructor(){super(...arguments);k(this,"_key","translation_text")}};function hg(e){return Array.isArray(e)&&e.every(t=>"role"in t&&"content"in t)}var BI=class 
extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{max_new_tokens:256})}async _call(t,r={}){let s=!1,n=!1,a=r.add_special_tokens??(this.tokenizer.add_bos_token||this.tokenizer.add_eos_token)??!1,o=r.tokenizer_encode_kwargs,i;if(typeof t=="string")i=t=[t];else if(Array.isArray(t)&&t.every(w=>typeof w=="string"))s=!0,i=t;else{if(hg(t))t=[t];else if(Array.isArray(t)&&t.every(hg))s=!0;else throw new Error("Input must be a string, an array of strings, a Chat, or an array of Chats");n=!0,i=t.map(w=>this.tokenizer.apply_chat_template(w,{tokenize:!1,add_generation_prompt:!0,...o})),a=!1,o=void 0}const l=n?!1:r.return_full_text??!0;this.tokenizer.padding_side="left";const c=this.tokenizer(i,{add_special_tokens:a,padding:!0,truncation:!0,...o}),d=await this.model.generate({...c,...this._default_generation_config,...r}),h=this.tokenizer.batch_decode(d,{skip_special_tokens:!0});let _;!l&&c.input_ids.dims.at(-1)>0&&(_=this.tokenizer.batch_decode(c.input_ids,{skip_special_tokens:!0}).map(w=>w.length));const p=Array.from({length:t.length},w=>[]);for(let w=0;w<h.length;++w){const v=Math.floor(w/d.dims[0]*t.length);_&&(h[w]=h[w].slice(_[v])),p[v].push({generated_text:n?[...t[v],{role:"assistant",content:h[w]}]:h[w]})}return!s&&p.length===1?p[0]:p}},RI=class extends Ve{constructor(e){super(e),this.label2id=Object.fromEntries(Object.entries(this.model.config.label2id).map(([t,r])=>[t.toLowerCase(),r])),this.entailment_id=this.label2id.entailment,this.entailment_id===void 0&&(ue.warn("Could not find 'entailment' in label2id mapping. Using 2 as entailment_id."),this.entailment_id=2),this.contradiction_id=this.label2id.contradiction??this.label2id.not_entailment,this.contradiction_id===void 0&&(ue.warn("Could not find 'contradiction' in label2id mapping. 
Using 0 as contradiction_id."),this.contradiction_id=0)}async _call(e,t,{hypothesis_template:r="This example is {}.",multi_label:s=!1}={}){const n=Array.isArray(e);n||(e=[e]),Array.isArray(t)||(t=[t]);const a=t.map(l=>r.replace("{}",l)),o=s||t.length===1,i=[];for(const l of e){const c=[];for(const _ of a){const p=this.tokenizer(l,{text_pair:_,padding:!0,truncation:!0}),w=await this.model(p);o?c.push([w.logits.data[this.contradiction_id],w.logits.data[this.entailment_id]]):c.push(w.logits.data[this.entailment_id])}const h=(o?c.map(_=>nt(_)[1]):nt(c)).map((_,p)=>[_,p]).sort((_,p)=>p[0]-_[0]);i.push({sequence:l,labels:h.map(_=>t[_[1]]),scores:h.map(_=>_[0])})}return n?i:i[0]}},GI=class extends Ve{async _call(e,{top_k:t=5}={}){const r=this.processor.feature_extractor.config.sampling_rate,s=await Is(e,r),n=this.model.config.id2label,a=[];for(const o of s){const i=await this.processor(o),c=(await this.model(i)).logits[0],d=await hs(new U("float32",nt(c.data),c.dims),t),h=d[0].tolist(),p=d[1].tolist().map((w,v)=>({label:n?n[w]:`LABEL_${w}`,score:h[v]}));a.push(p)}return Array.isArray(e)?a:a[0]}},$I=class extends Ve{async _call(e,t,{hypothesis_template:r="This is a sound of {}."}={}){const s=!Array.isArray(e);s&&(e=[e]);const n=t.map(c=>r.replace("{}",c)),a=this.tokenizer(n,{padding:!0,truncation:!0}),o=this.processor.feature_extractor.config.sampling_rate,i=await Is(e,o),l=[];for(const c of i){const d=await this.processor(c),h=await this.model({...a,...d}),_=nt(h.logits_per_audio.data);l.push([..._].map((p,w)=>({score:p,label:t[w]})))}return s?l[0]:l}},VI=class extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{})}async _call(t,r={}){switch(r={...this._default_generation_config,...r},this.model.config.model_type){case"whisper":case"lite-whisper":return this._call_whisper(t,r);case"wav2vec2":case"wav2vec2-bert":case"unispeech":case"unispeech-sat":case"hubert":case"parakeet_ctc":return this._call_wav2vec2(t,r);case"moonshine":return 
this._call_moonshine(t,r);case"cohere_asr":return this._call_cohere_asr(t,r);default:throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)}}async _call_wav2vec2(t,r){r.language&&ue.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".'),r.task&&ue.warn('`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".');const s=!Array.isArray(t),n=s?[t]:t,a=this.processor.feature_extractor.config.sampling_rate,o=await Is(n,a),i=[];for(const l of o){const c=await this.processor(l),h=(await this.model(c)).logits[0],_=[];for(const w of h)_.push(je(w.data)[1]);const p=this.tokenizer.decode(_,{skip_special_tokens:!0}).trim();i.push({text:p})}return s?i[0]:i}async _call_whisper(t,r){const s=r.return_timestamps??!1,n=r.chunk_length_s??0,a=r.force_full_sequences??!1;let o=r.stride_length_s??null;const i={...r};s==="word"&&(i.return_token_timestamps=!0,i.return_timestamps=!0);const l=!Array.isArray(t),c=l?[t]:t,d=this.processor.feature_extractor.config,h=d.chunk_length/this.model.config.max_source_positions,_=d.hop_length,p=d.sampling_rate,w=await Is(c,p),v=[];for(const y of w){let M=[];if(n>0){if(o===null)o=n/6;else if(n<=o)throw Error("`chunk_length_s` must be larger than `stride_length_s`.");const C=p*n,S=p*o,N=C-2*S;let x=0;for(;;){const R=x+C,z=y.subarray(x,R),$=await this.processor(z),Q=x===0,H=R>=y.length;if(M.push({stride:[z.length,Q?0:S,H?0:S],input_features:$.input_features,is_last:H}),H)break;x+=N}}else M=[{stride:[y.length,0,0],input_features:(await this.processor(y)).input_features,is_last:!0}];for(const C of M){i.num_frames=Math.floor(C.stride[0]/_);const S=await this.model.generate({inputs:C.input_features,...i});if(s==="word"){const 
N=S.sequences.tolist()[0],x=S.token_timestamps.tolist()[0],R=this.tokenizer.timestamp_begin,z=Math.max(N.findIndex($=>Number($)>=R),0);C.tokens=N.slice(z),C.token_timestamps=x.slice(z).map($=>tn($,2))}else C.tokens=S[0].tolist();C.stride=C.stride.map(N=>N/p)}const[T,A]=this.tokenizer._decode_asr(M,{time_precision:h,return_timestamps:s,force_full_sequences:a});v.push({text:T,...A})}return l?v[0]:v}async _call_moonshine(t,r){const s=!Array.isArray(t),n=s?[t]:t,a=this.processor.feature_extractor.config.sampling_rate,o=await Is(n,a),i=[];for(const l of o){const c=await this.processor(l),d=Math.floor(l.length/a)*6,h=await this.model.generate({max_new_tokens:d,...r,...c}),_=this.processor.batch_decode(h,{skip_special_tokens:!0})[0];i.push({text:_})}return s?i[0]:i}async _call_cohere_asr(t,r){const s=!Array.isArray(t),n=s?[t]:t,a=this.processor.feature_extractor,o=a.config.sampling_rate,i=await Is(n,o),l=r.language??"en",c=this.processor.get_decoder_prompt_ids(l),d=[];for(const h of i){const _=a.split_audio(h),p=[];for(const v of _){const y=await this.processor(v),M=await this.model.generate({...y,decoder_input_ids:c,...r}),T=this.tokenizer.decode(M[0].tolist(),{skip_special_tokens:!0}).trim();p.push(T)}const w=this.processor.constructor.join_chunks(p,l);d.push({text:w})}return s?d[0]:d}},UI=class extends Ve{constructor(t){super(t);k(this,"DEFAULT_VOCODER_ID","Xenova/speecht5_hifigan");this.vocoder=t.vocoder??null}async _prepare_speaker_embeddings(t,r){if((typeof t=="string"||t instanceof URL)&&(t=new Float32Array(await(await be.fetch(t)).arrayBuffer())),t instanceof Float32Array)t=new U("float32",t,[t.length]);else if(!(t instanceof U))throw new Error("Speaker embeddings must be a `Tensor`, `Float32Array`, `string`, or `URL`.");if(r>1){if(t.dims[0]===1)t=t.repeat(r,1);else if(t.dims[0]!==r)throw new Error(`Expected speaker embeddings batch size to be 1 or ${r}, but got ${t.dims[0]}.`)}return t}_postprocess_waveform(t,r,s,n=null){const 
a=r.data,[o,i]=r.dims,l=n?n.data:null,c=[];for(let d=0;d<o;++d){const h=l?Math.min(Math.ceil(l[d]),i):i,_=d*i;c.push(new $1(a.slice(_,_+h),s))}return Array.isArray(t)?c:c[0]}async _call(t,r){return this.processor?this._call_text_to_spectrogram(t,r):this.model.config.model_type==="supertonic"?this._call_supertonic(t,r):this._call_text_to_waveform(t)}async _call_supertonic(t,{speaker_embeddings:r,num_inference_steps:s,speed:n}){if(!r)throw new Error("Speaker embeddings must be provided for Supertonic models.");const{sampling_rate:a,style_dim:o}=this.model.config,i=this.tokenizer(t,{padding:!0,truncation:!0}),l=i.input_ids.dims[0];r=await this._prepare_speaker_embeddings(r,l),r=r.view(l,-1,o);const{waveform:c,durations:d}=await this.model.generate_speech({...i,style:r,num_inference_steps:s,speed:n});return this._postprocess_waveform(t,c,a,d)}async _call_text_to_waveform(t){const r=this.tokenizer(t,{padding:!0,truncation:!0}),{waveform:s}=await this.model(r),n=this.model.config.sampling_rate;return this._postprocess_waveform(t,s,n)}async _call_text_to_spectrogram(t,{speaker_embeddings:r}){this.vocoder||(ue.info("No vocoder specified, using default HifiGan vocoder."),this.vocoder=await mn.from_pretrained(this.DEFAULT_VOCODER_ID,{dtype:"fp32"}));const{input_ids:s}=this.tokenizer(t,{padding:!0,truncation:!0}),n=s.dims[0];r=await this._prepare_speaker_embeddings(r,n),r=r.view(n,-1);const{waveform:a}=await this.model.generate_speech(s,r,{vocoder:this.vocoder}),o=this.processor.feature_extractor.config.sampling_rate;return this._postprocess_waveform(t,a,o)}},jI=class extends Ve{async _call(e,t={}){const r=Array.isArray(e),s=await Wt(e),{pixel_values:n}=await this.processor(s),a=[];for(const o of n){o.dims=[1,...o.dims];const i=await this.model.generate({inputs:o,...t}),l=this.tokenizer.batch_decode(i,{skip_special_tokens:!0}).map(c=>({generated_text:c.trim()}));a.push(l)}return r?a:a[0]}},qI=class extends Ve{async _call(e,{top_k:t=5}={}){const r=await 
Wt(e),{pixel_values:s}=await this.processor(r),n=await this.model({pixel_values:s}),{id2label:a}=this.model.config,o=[];for(const i of n.logits){const l=await hs(new U("float32",nt(i.data),i.dims),t),c=l[0].tolist(),h=l[1].tolist().map((_,p)=>({label:a?a[_]:`LABEL_${_}`,score:c[p]}));o.push(h)}return Array.isArray(e)?o:o[0]}},fg={panoptic:"post_process_panoptic_segmentation",instance:"post_process_instance_segmentation",semantic:"post_process_semantic_segmentation"},_g=class extends Ve{async _call(e,{threshold:t=.5,mask_threshold:r=.5,overlap_mask_area_threshold:s=.8,label_ids_to_fuse:n=null,target_sizes:a=null,subtask:o=null}={}){if(Array.isArray(e)&&e.length!==1)throw Error("Image segmentation pipeline currently only supports a batch size of 1.");const l=await Wt(e),c=l.map(M=>[M.height,M.width]),d=await this.processor(l),{inputNames:h,outputNames:_}=this.model.sessions.model;if(!h.includes("pixel_values")){if(h.length!==1)throw Error(`Expected a single input name, but got ${h.length} inputs: ${h}.`);const M=h[0];if(M in d)throw Error(`Input name ${M} already exists in the inputs.`);d[M]=d.pixel_values}const p=await this.model(d);let w=null;if(o!==null)w=fg[o];else if(this.processor.image_processor){for(const[M,T]of Object.entries(fg))if(T in this.processor.image_processor){w=this.processor.image_processor[T].bind(this.processor.image_processor),o=M;break}}const v=this.model.config.id2label,y=[];if(o)if(o==="panoptic"||o==="instance"){const M=w(p,t,r,s,n,a??c)[0],T=M.segmentation;for(const A of M.segments_info){const C=new Uint8ClampedArray(T.data.length);for(let N=0;N<T.data.length;++N)T.data[N]===A.id&&(C[N]=255);const S=new jt(C,T.dims[1],T.dims[0],1);y.push({score:A.score,label:v[A.label_id],mask:S})}}else if(o==="semantic"){const{segmentation:M,labels:T}=w(p,a??c)[0];for(const A of T){const C=new Uint8ClampedArray(M.data.length);for(let N=0;N<M.data.length;++N)M.data[N]===A&&(C[N]=255);const S=new 
jt(C,M.dims[1],M.dims[0],1);y.push({score:null,label:v[A],mask:S})}}else throw Error(`Subtask ${o} not supported.`);else{const T=p[_[0]];for(let A=0;A<c.length;++A){const C=c[A],S=T[A];S.data.some(x=>x<-1e-5||x>1+1e-5)&&S.sigmoid_();const N=await jt.fromTensor(S.mul_(255).to("uint8")).resize(C[1],C[0]);y.push({label:null,score:null,mask:N})}}return y}},WI=class extends _g{async _call(e,t={}){const r=await Wt(e),s=await super._call(e,t),n=r.map((a,o)=>{const i=a.clone();return i.putAlpha(s[o].mask),i});return Array.isArray(e)?n:n[0]}},HI=class extends Ve{async _call(e,t,{hypothesis_template:r="This is a photo of {}"}={}){const s=Array.isArray(e),n=await Wt(e),a=t.map(h=>r.replace("{}",h)),o=this.tokenizer(a,{padding:this.model.config.model_type==="siglip"?"max_length":!0,truncation:!0}),{pixel_values:i}=await this.processor(n),l=await this.model({...o,pixel_values:i}),c=this.model.config.model_type==="siglip"?h=>h.sigmoid().data:h=>nt(h.data),d=[];for(const h of l.logits_per_image){const p=[...c(h)].map((w,v)=>({score:w,label:t[v]}));p.sort((w,v)=>v.score-w.score),d.push(p)}return s?d:d[0]}},QI=class extends Ve{async _call(e,{threshold:t=.9,percentage:r=!1}={}){const s=Array.isArray(e);if(s&&e.length!==1)throw Error("Object detection pipeline currently only supports a batch size of 1.");const n=await Wt(e),a=r?null:n.map(_=>[_.height,_.width]),{pixel_values:o,pixel_mask:i}=await this.processor(n),l=await this.model({pixel_values:o,pixel_mask:i}),c=this.processor.image_processor.post_process_object_detection(l,t,a),{id2label:d}=this.model.config,h=c.map(_=>_.boxes.map((p,w)=>({score:_.scores[w],label:d[_.classes[w]],box:ou(p,!r)})));return s?h:h[0]}},XI=class extends Ve{async _call(e,t,{threshold:r=.1,top_k:s=null,percentage:n=!1}={}){const a=Array.isArray(e),o=await Wt(e),i=this.tokenizer(t,{padding:!0,truncation:!0}),l=await this.processor(o),c=[];for(let d=0;d<o.length;++d){const h=o[d],_=n?null:[[h.height,h.width]],p=l.pixel_values[d].unsqueeze_(0),w=await 
this.model({...i,pixel_values:p});let v;if("post_process_grounded_object_detection"in this.processor){const y=this.processor.post_process_grounded_object_detection(w,i.input_ids,{box_threshold:r,text_threshold:r,target_sizes:_})[0];v=y.boxes.map((M,T)=>({score:y.scores[T],label:y.labels[T],box:ou(M,!n)}))}else{const y=this.processor.image_processor.post_process_object_detection(w,r,_,!0)[0];v=y.boxes.map((M,T)=>({score:y.scores[T],label:t[y.classes[T]],box:ou(M,!n)}))}v.sort((y,M)=>M.score-y.score),s!==null&&(v=v.slice(0,s)),c.push(v)}return a?c:c[0]}},YI=class extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{max_new_tokens:256})}async _call(t,r,s={}){if(Array.isArray(t)){if(t.length!==1)throw Error("Document Question Answering pipeline currently only supports a batch size of 1.");t=t[0]}const n=(await Wt(t))[0],{pixel_values:a}=await this.processor(n),o=`<s_docvqa><s_question>${r}</s_question><s_answer>`,i=this.tokenizer(o,{add_special_tokens:!1,padding:!0,truncation:!0}).input_ids,l=await this.model.generate({inputs:a,max_length:this.model.config.decoder.max_position_embeddings,decoder_input_ids:i,...this._default_generation_config,...s}),d=this.tokenizer.batch_decode(l)[0].match(/<s_answer>(.*?)<\/s_answer>/);let h=null;return d&&d.length>=2&&(h=d[1].trim()),[{answer:h}]}},JI=class extends Ve{async _call(e){const t=await Wt(e),r=await this.processor(t),s=await this.model(r),n=[];for(const a of s.reconstruction){const o=a.squeeze().clamp_(0,1).mul_(255).round_().to("uint8");n.push(jt.fromTensor(o))}return Array.isArray(e)?n:n[0]}},KI=class extends Ve{async _call(e){const t=await Wt(e),r=await this.processor(t),{predicted_depth:s}=await this.model(r),n=[];for(let a=0;a<t.length;++a){const o=s[a],[i,l]=o.dims.slice(-2),[c,d]=t[a].size,h=(await 
Ut(o.view(1,1,i,l),{size:[d,c],mode:"bilinear"})).view(d,c),_=h.min().item(),p=h.max().item(),w=h.sub(_).div_(p-_).mul_(255).to("uint8").unsqueeze(0),v=jt.fromTensor(w);n.push({predicted_depth:h,depth:v})}return Array.isArray(e)?n:n[0]}},ZI=class extends Ve{async _call(e,{pooling:t="none",normalize:r=!1,quantize:s=!1,precision:n="binary"}={}){const a=this.tokenizer(e,{padding:!0,truncation:!0}),o=await this.model(a);let i=o.last_hidden_state??o.logits??o.token_embeddings;switch(t){case"none":break;case"mean":i=Tx(i,a.attention_mask);break;case"first_token":case"cls":i=i.slice(null,0);break;case"last_token":case"eos":i=i.slice(null,-1);break;default:throw Error(`Pooling method '${t}' not supported.`)}return r&&(i=i.normalize(2,-1)),s&&(i=Cx(i,n)),i}},eO=class extends Ve{async _call(e,{pool:t=null}={}){const r=await Wt(e),{pixel_values:s}=await this.processor(r),n=await this.model({pixel_values:s});let a;if(t){if(!("pooler_output"in n))throw Error("No pooled output was returned. Make sure the model has a 'pooler' layer when using the 'pool' option.");a=n.pooler_output}else a=n.last_hidden_state??n.logits??n.image_embeds;return 
a}},Ua=Object.freeze({"text-classification":{pipeline:LI,model:lg,default:{model:"Xenova/distilbert-base-uncased-finetuned-sst-2-english"},type:"text"},"token-classification":{pipeline:II,model:pI,default:{model:"Xenova/bert-base-multilingual-cased-ner-hrl"},type:"text"},"question-answering":{pipeline:OI,model:bI,default:{model:"Xenova/distilbert-base-cased-distilled-squad"},type:"text"},"fill-mask":{pipeline:NI,model:yI,default:{model:"onnx-community/ettin-encoder-32m-ONNX",dtype:"fp32"},type:"text"},summarization:{pipeline:DI,model:au,default:{model:"Xenova/distilbart-cnn-6-6"},type:"text"},translation:{pipeline:zI,model:au,default:{model:"Xenova/t5-small"},type:"text"},"text2text-generation":{pipeline:iu,model:au,default:{model:"Xenova/flan-t5-small"},type:"text"},"text-generation":{pipeline:BI,model:vI,default:{model:"onnx-community/Qwen3-0.6B-ONNX",dtype:"q4"},type:"text"},"zero-shot-classification":{pipeline:RI,model:lg,default:{model:"Xenova/distilbert-base-uncased-mnli"},type:"text"},"audio-classification":{pipeline:GI,model:AI,default:{model:"Xenova/wav2vec2-base-superb-ks"},type:"audio"},"zero-shot-audio-classification":{pipeline:$I,model:mn,default:{model:"Xenova/clap-htsat-unfused"},type:"multimodal"},"automatic-speech-recognition":{pipeline:VI,model:[mI,EI],default:{model:"Xenova/whisper-tiny.en"},type:"multimodal"},"text-to-audio":{pipeline:UI,model:[wI,gI],default:{model:"onnx-community/Supertonic-TTS-ONNX",dtype:"fp32"},type:"text"},"image-to-text":{pipeline:jI,model:MI,default:{model:"Xenova/vit-gpt2-image-captioning"},type:"multimodal"},"image-classification":{pipeline:qI,model:xI,default:{model:"Xenova/vit-base-patch16-224"},type:"multimodal"},"image-segmentation":{pipeline:_g,model:[cg,ug,dg],default:{model:"Xenova/detr-resnet-50-panoptic"},type:"multimodal"},"background-removal":{pipeline:WI,model:[cg,ug,dg],default:{model:"Xenova/modnet"},type:"image"},"zero-shot-image-classification":{pipeline:HI,model:mn,default:{model:"Xenova/clip-vit-base-p
atch32"},type:"multimodal"},"object-detection":{pipeline:QI,model:kI,default:{model:"Xenova/detr-resnet-50"},type:"multimodal"},"zero-shot-object-detection":{pipeline:XI,model:TI,default:{model:"Xenova/owlvit-base-patch32"},type:"multimodal"},"document-question-answering":{pipeline:YI,model:CI,default:{model:"Xenova/donut-base-finetuned-docvqa"},type:"multimodal"},"image-to-image":{pipeline:JI,model:SI,default:{model:"Xenova/swin2SR-classical-sr-x2-64"},type:"image"},"depth-estimation":{pipeline:KI,model:PI,default:{model:"onnx-community/depth-anything-v2-small"},type:"image"},"feature-extraction":{pipeline:ZI,model:mn,default:{model:"onnx-community/all-MiniLM-L6-v2-ONNX",dtype:"fp32"},type:"text"},"image-feature-extraction":{pipeline:eO,model:[FI,mn],default:{model:"onnx-community/dinov3-vits16-pretrain-lvd1689m-ONNX",dtype:"fp32"},type:"image"}}),pg=Object.freeze({"sentiment-analysis":"text-classification",ner:"token-classification",asr:"automatic-speech-recognition","text-to-speech":"text-to-audio",embeddings:"feature-extraction"});async function tO(e){if(!e)throw new Error("modelId is required");return(await Zs(e,sn,{})).exists?[sn]:[]}async function rO(e,{config:t=null,dtype:r=null,device:s=null,model_file_name:n=null,include_tokenizer:a=!0,include_processor:o=!0}={}){const i=await sp(e,{config:t,dtype:r,device:s,model_file_name:n});if(a){const l=await Kf(e);i.push(...l)}if(o){const l=await tO(e);i.push(...l)}return i}async function sO(e,t,r={}){e=pg[e]??e;const s=Ua[e];if(!s)throw new Error(`Unsupported pipeline task: ${e}. 
Must be one of [${Object.keys(Ua).join(", ")}]`);const{type:n}=s,i=await rO(t,{...r,include_tokenizer:n!=="audio"&&n!=="image",include_processor:n!=="text"});if(e==="text-generation"){const l=await rp(t,r),c=tp(l),d=mE(c);if(d){const h=Object.values(d).map(_=>`onnx/${_}`);return i.filter(_=>!_.startsWith("onnx/")||h.some(p=>_.startsWith(p)))}}return i}async function nO(e,t=null,{progress_callback:r=null,config:s=null,cache_dir:n=null,local_files_only:a=!1,revision:o="main",device:i=null,dtype:l=null,subfolder:c="onnx",use_external_data_format:d=null,model_file_name:h=null,session_options:_={}}={}){e=pg[e]??e;const p=Ua[e.split("_",1)[0]];if(!p)throw Error(`Unsupported pipeline: ${e}. Must be one of [${Object.keys(Ua)}]`);t||(t=p.default.model,ue.info(`No model specified. Using default model: "${t}".`),!l&&p.default.dtype&&(l=p.default.dtype));const w=await sO(e,t,{device:i,dtype:l});let v={};r&&(await Promise.all(w.map(async Q=>Zs(t,Q)))).forEach((Q,H)=>{Q.exists&&(v[w[H]]={loaded:0,total:Q.size??0})});const y={progress_callback:r?new _i(r,v):void 0,config:s,cache_dir:n,local_files_only:a,revision:o,device:i,dtype:l,subfolder:c,use_external_data_format:d,model_file_name:h,session_options:_},M=w.includes("tokenizer.json"),T=w.includes("preprocessor_config.json"),A=p.model;let C;if(Array.isArray(A)){const $=s??await nn.from_pretrained(t,y),{model_type:Q}=$,H=A.find(D=>D.supports(Q));if(!H)throw Error(`Unsupported model type "${Q}" for task "${e}". 
None of the candidate model classes support this type.`);C=H.from_pretrained(t,{...y,config:$})}else C=A.from_pretrained(t,y);const[S,N,x]=await Promise.all([M?Te.from_pretrained(t,y):null,T?UT.from_pretrained(t,y):null,C]),R={task:e,model:x};S&&(R.tokenizer=S),N&&(R.processor=N),Cr(r,{status:"ready",task:e,model:t});const z=p.pipeline;return new z(R)}fe.IS_PROCESS_AVAILABLE;let ja=null;const aO="Xenova/whisper-base.en";async function oO(){ja||(self.postMessage({type:"status",status:"loading",message:"Downloading speech model..."}),ja=await nO("automatic-speech-recognition",aO,{dtype:"q8",device:"wasm"}),self.postMessage({type:"status",status:"ready",message:"Speech model ready"}))}self.onmessage=async e=>{var r;const{type:t}=e.data;if(t==="load"){try{await oO()}catch(s){const n=s instanceof Error?s.message:"Model load failed";self.postMessage({type:"error",error:n})}return}if(t==="transcribe"){const s=e.data.audio;if(!ja){self.postMessage({type:"error",error:"Model not loaded"});return}try{const a=((r=(await ja(s,{language:"en",task:"transcribe",chunk_length_s:30,stride_length_s:5})).text)==null?void 0:r.trim())??"";self.postMessage({type:"result",text:a})}catch(n){const a=n instanceof Error?n.message:"Transcription failed";self.postMessage({type:"error",error:a})}}}})();
+`}):(ue.warn("You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens."),a=t.map(l=>xT(l,s,n,_s,e.length)));const o=this.tokenizer(a,r);return{...await this.image_processor(e,r),...o}}},k(En,"tokenizer_class",Te),k(En,"image_processor_class",dt),k(En,"uses_processor_config",!1),En),j_="<|image|>",TT=/<\|image_\d+\|>/g,ET=(eo=class extends Ee{async _call(e,t=null,{padding:r=!0,truncation:s=!0,num_crops:n=null}={}){Array.isArray(e)||(e=[e]);let a,o;if(t){o=await this.image_processor(t,{num_crops:n});const{num_img_tokens:i}=o,l=e.map((d,h)=>d.split(TT).join(j_.repeat(i[h])));a=this.tokenizer(l,{padding:r,truncation:s});const c=this.tokenizer._tokenizer.token_to_id(j_);a.input_ids.map_(d=>d==c?-d:d)}else a=this.tokenizer(e);return{...a,...o}}},k(eo,"image_processor_class",dt),k(eo,"tokenizer_class",Te),eo),AT=(An=class extends Ee{async _call(e,t=null,r={}){const s=await this.image_processor(e,r);if(t){const[a,o]=s.pixel_values.dims.slice(-2),{image_token:i,image_break_token:l,image_end_token:c,patch_size:d,spatial_merge_size:h}=this.config,_=d*h,p=Math.floor(a/_),w=Math.floor(o/_);t=structuredClone(t),Array.isArray(t)||(t=[t]);for(let v=0;v<t.length;++v){const y=i.repeat(w),M=y+l,T=y+c,A=M.repeat(p-1)+T;t[v]=t[v].replace(i,A)}}const n=t?this.tokenizer(t,r):{};return{...s,...n}}},k(An,"tokenizer_class",Te),k(An,"image_processor_class",dt),k(An,"uses_processor_config",!0),An),CT=(du=class extends Ee{async _call(e){return await this.feature_extractor(e)}post_process_speaker_diarization(...e){return this.feature_extractor.post_process_speaker_diarization(...e)}get sampling_rate(){return this.feature_extractor.config.sampling_rate}},k(du,"feature_extractor_class",m_),du),q_=class extends Wi{},ST=class 
extends q_{},W_=(hu=class extends Ee{async _call(...e){return await this.image_processor(...e)}post_process_masks(...e){return this.image_processor.post_process_masks(...e)}reshape_input_points(...e){return this.image_processor.reshape_input_points(...e)}},k(hu,"image_processor_class",dt),hu),H_=class extends W_{},PT=class extends H_{},FT=(to=class extends Ee{async _call(e){return await this.feature_extractor(e)}},k(to,"tokenizer_class",Te),k(to,"feature_extractor_class",Ft),to),LT=(Cn=class extends Ee{async _call(e,t=null,r={}){if(Array.isArray(e))throw new Error("Batched inputs are not supported yet.");let s={};if(t){const a=t.length,{input_features:o}=await this.feature_extractor(t,{...r,max_length:a}),i=Math.round(a/this.config.encoder_ds_factor+1e-4),l=1+Math.ceil(i/this.config.stack_factor);s.audio_token_len=[l],s.audio_values=o;const c=this.config.audio_placeholder;if(!e.includes(c))throw new Error(`The input text does not contain the image token ${c}.`);e=e.replaceAll(c,c.repeat(l))}return{...this.tokenizer(e,{add_special_tokens:!1,...r}),...s}}},k(Cn,"tokenizer_class",Te),k(Cn,"feature_extractor_class",Ft),k(Cn,"uses_processor_config",!0),Cn),ga="[AUDIO]",IT="[BEGIN_AUDIO]",OT=375;function NT(e,t){const r=[];for(let s=0;s<e.length;s+=t)r.push(e.subarray(s,Math.min(s+t,e.length)));return r}var DT=(Sn=class extends Ee{async _call(e,t=null,r={}){if(Array.isArray(e))throw new Error("Batched inputs are not supported yet.");const s={};if(t){if(!e.includes(ga))throw new Error(`The input text does not contain the audio token ${ga}.`);Array.isArray(t)||(t=[t]);const a=e.split(ga),o=a.length-1;if(o!==t.length)throw new Error(`The number of audio inputs (${t.length}) does not match the number of audio tokens in the text (${o}).`);const i=this.feature_extractor.config.n_samples,l=t.map(p=>NT(p,i)),c=l.map(p=>p.length),d=l.flat(),h=(await Promise.all(d.map(p=>this.feature_extractor(p,r)))).map(p=>p.input_features);s.audio_values=h.length>1?ze(h,0):h[0];let 
_=a[0];for(let p=0;p<c.length;++p){_+=IT;for(let w=0;w<c[p];++w)_+=ga.repeat(OT);_+=a[p+1]}e=_}return{...this.tokenizer(e,{add_special_tokens:!1,...r}),...s}}},k(Sn,"tokenizer_class",Te),k(Sn,"feature_extractor_class",Ft),k(Sn,"uses_processor_config",!1),Sn),Q_=32,Hi=6,wa=8,zT=10,BT=32,RT=(Pn=class extends Ee{get num_mel_frames_first_audio_chunk(){return(Hi+1)*wa}get num_samples_first_audio_chunk(){const{hop_length:e,n_fft:t}=this.feature_extractor.config;return(this.num_mel_frames_first_audio_chunk-1)*e+Math.floor(t/2)}get num_samples_per_audio_chunk(){const{hop_length:e,n_fft:t}=this.feature_extractor.config;return wa*e+t}get num_right_pad_tokens(){return Hi+1+zT}get audio_length_per_tok(){return wa}get raw_audio_length_per_tok(){return wa*this.feature_extractor.config.hop_length}async _call(e,{is_streaming:t=!1,is_first_audio_chunk:r=!0}={}){if(at(e,"VoxtralRealtimeProcessor"),!t&&!r)throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");if(r)if(t){const s=Q_*this.raw_audio_length_per_tok,n=new Float32Array(s+e.length);n.set(e,s);const a=await this.feature_extractor(n,{center:!0}),i=1+(Q_+Hi),l=new BigInt64Array(i).fill(BigInt(BT));return l[0]=1n,{input_ids:new U("int64",l,[1,i]),...a}}else{const s=this.num_right_pad_tokens*this.raw_audio_length_per_tok,n=new Float32Array(e.length+s);return n.set(e),await this.feature_extractor(n,{center:!0})}else return await this.feature_extractor(e,{center:!1})}},k(Pn,"tokenizer_class",Te),k(Pn,"feature_extractor_class",Ft),k(Pn,"uses_processor_config",!1),Pn),GT=(ro=class extends Ee{async _call(e){return await this.feature_extractor(e)}},k(ro,"tokenizer_class",Te),k(ro,"feature_extractor_class",Ft),ro),$T=(so=class extends Ee{async _call(e){return await this.feature_extractor(e)}},k(so,"tokenizer_class",Te),k(so,"feature_extractor_class",Ft),so),VT=(no=class extends Ee{async _call(e){return await 
this.feature_extractor(e)}},k(no,"tokenizer_class",Te),k(no,"feature_extractor_class",Ft),no),UT=class{static async from_pretrained(e,t={}){const r=await Zt(e,sn,!0,t),{image_processor_type:s,feature_extractor_type:n,processor_class:a}=r;if(a&&Ni[a])return Ni[a].from_pretrained(e,t);if(!s&&!n)throw new Error("No `image_processor_type` or `feature_extractor_type` found in the config.");const o={};if(s){const l=ma[s.replace(/Fast$/,"")];if(!l)throw new Error(`Unknown image_processor_type: '${s}'.`);o.image_processor=new l(r)}if(n){const l=ma[n];if(l)o.image_processor=new l(r);else{const c=Di[n];if(!c)throw new Error(`Unknown feature_extractor_type: '${n}'.`);o.feature_extractor=new c(r)}}const i={};return new Ee(i,o,null)}};async function jT(e,t){return await Zt(e,"config.json",!0,t)}function ps(e){const t={};let r={};switch(e.model_type){case"llava":case"paligemma":case"gemma3":case"florence2":case"llava_onevision":case"idefics3":case"granite_speech":case"ultravox":case"voxtral":case"voxtral_realtime":case"smolvlm":case"gemma3n":case"gemma4":case"lfm2_vl":case"chatterbox":case"lighton_ocr":case"glm_ocr":case"mistral3":case"qwen2_5_vl":case"qwen3_vl":case"qwen3_vl_moe":r=ps(e.text_config);break;case"moondream1":r=ps(e.phi_config);break;case"musicgen":r=ps(e.decoder);break;case"multi_modality":r=ps(e.language_config);break;case"gpt2":case"gptj":case"jais":case"codegen":case"gpt_bigcode":t.num_heads="n_head",t.num_layers="n_layer",t.hidden_size="n_embd";break;case"gpt_neox":case"stablelm":case"opt":case"falcon":case"modernbert-decoder":t.num_heads="num_attention_heads",t.num_layers="num_hidden_layers",t.hidden_size="hidden_size";break;case"gpt_oss":case"llama":case"llama4_text":case"nanochat":case"apertus":case"arcee":case"afmoe":case"lfm2":case"lfm2_moe":case"smollm3":case"olmo":case"olmo2":case"olmo3":case"mobilellm":case"granite":case"granitemoehybrid":case"cohere":case"cohere2":case"mistral":case"voxtral_realtime_text":case"voxtral_realtime_encoder":case"starcoder2"
:case"qwen2":case"qwen2_moe":case"qwen2_vl":case"qwen2_vl_text":case"qwen2_5_vl_text":case"qwen3_moe":case"qwen3_vl_text":case"qwen3_vl_moe_text":case"phi":case"phi3":case"phi3_v":case"llava_qwen2":t.num_heads="num_key_value_heads",t.num_layers="num_hidden_layers",t.hidden_size="hidden_size",t.num_attention_heads="num_attention_heads",t.dim_kv="head_dim";break;case"qwen3":case"solar_open":case"glm_ocr_text":case"gemma":case"gemma2":case"vaultgemma":case"gemma3_text":case"gemma3n_text":case"gemma4_text":case"glm":case"helium":case"ernie4_5":case"hunyuan_v1_dense":case"falcon_h1":case"nemotron_h":case"ministral":case"ministral3":t.num_heads="num_key_value_heads",t.num_layers="num_hidden_layers",t.dim_kv="head_dim";break;case"openelm":t.num_heads="num_kv_heads",t.num_layers="num_transformer_layers",t.dim_kv="head_dim";break;case"gpt_neo":case"donut-swin":t.num_heads="num_heads",t.num_layers="num_layers",t.hidden_size="hidden_size";break;case"bloom":t.num_heads="n_head",t.num_layers="n_layer",t.hidden_size="hidden_size";break;case"mpt":t.num_heads="n_heads",t.num_layers="n_layers",t.hidden_size="d_model";break;case"exaone":t.num_heads="num_key_value_heads",t.num_layers="num_layers",t.dim_kv="head_dim",t.num_attention_heads="num_attention_heads";break;case"youtu":case"deepseek_v3":case"glm_moe_dsa":case"mistral4":t.num_heads="num_key_value_heads",t.num_layers="num_hidden_layers",t.dim_kv="qk_head_dim",t.num_attention_heads="num_attention_heads";break;case"t5":case"mt5":case"longt5":t.num_decoder_layers="num_decoder_layers",t.num_decoder_heads="num_heads",t.decoder_dim_kv="d_kv",t.num_encoder_layers="num_layers",t.num_encoder_heads="num_heads",t.encoder_dim_kv="d_kv";break;case"bart":case"mbart":case"marian":case"whisper":case"lite-whisper":case"m2m_100":case"blenderbot":case"blenderbot-small":case"florence2_language":t.num_decoder_layers="decoder_layers",t.num_decoder_heads="decoder_attention_heads",t.decoder_hidden_size="d_model",t.num_encoder_layers="encoder_layers",t.
/**
 * Compute the dims of every (initially empty) cache tensor a model expects.
 *
 * Hybrid architectures are handled per `model_type`; multimodal wrappers recurse
 * into their nested text (or audio) config; everything else is delegated to `qT`.
 *
 * @param {Object} e Model config; wrapped in `Qi` when it is not already an instance.
 * @param {Object} [t] Options: `prefix` (default "past_key_values"),
 *   `batch_size` (default 1) and `session_name`.
 * @returns {Record<string, number[]>} Map from cache entry name to tensor dims.
 * @throws {Error} When a layer type is not supported for the model type.
 */
function va(e, t) {
  const config = e instanceof Qi ? e : new Qi(e);
  const batch = t?.batch_size ?? 1;
  const prefix = t?.prefix ?? "past_key_values";
  // State entries use the "present" stem only when the prefix itself is "present".
  const stateStem = prefix === "present" ? "present" : "past";
  const shapes = {};

  // Registers the key/value pair of one attention layer (a fresh dims array per entry).
  const addAttention = (layer, heads, headDim) => {
    for (const part of ["key", "value"]) {
      shapes[`${prefix}.${layer}.${part}`] = [batch, heads, 0, headDim];
    }
  };

  switch (config.model_type) {
    case "lfm2":
    case "lfm2_moe": {
      const {
        layer_types: layerTypes,
        num_attention_heads: attentionHeads,
        num_key_value_heads: kvHeads,
        hidden_size: hiddenSize,
        conv_L_cache: convCache,
      } = config;
      const headDim = hiddenSize / attentionHeads;
      for (let layer = 0; layer < layerTypes.length; ++layer) {
        const kind = layerTypes[layer];
        if (kind === "full_attention") {
          addAttention(layer, kvHeads, headDim);
        } else if (kind === "conv") {
          shapes[`${stateStem}_conv.${layer}`] = [batch, hiddenSize, convCache];
        } else {
          throw new Error(`Unsupported layer type: ${kind}`);
        }
      }
      return shapes;
    }

    case "granitemoehybrid":
    case "falcon_h1":
    case "nemotron_h": {
      const layerTypes = config.layer_types ?? config.layers_block_type;
      const layerCount = config.num_hidden_layers ?? layerTypes?.length;
      const kvHeads = config.num_key_value_heads;
      const headDim = config.head_dim ?? config.hidden_size / config.num_attention_heads;
      const mambaHeads = config.mamba_n_heads ?? config.mamba_num_heads;
      const mambaHeadDim = config.mamba_d_head ?? config.mamba_head_dim;
      const stateSize = config.mamba_d_state ?? config.ssm_state_size;
      const groups = config.mamba_n_groups ?? config.n_groups;
      const convKernel = config.mamba_d_conv ?? config.conv_kernel;
      const innerDim =
        config.mamba_d_ssm ??
        (config.mamba_expand ? config.mamba_expand * config.hidden_size : mambaHeads * mambaHeadDim);
      const convDim = innerDim + 2 * groups * stateSize;
      for (let layer = 0; layer < layerCount; ++layer) {
        // Without a layer-type list every layer gets both mamba and attention entries.
        if (!layerTypes || layerTypes[layer] === "mamba") {
          shapes[`${stateStem}_conv.${layer}`] = [batch, convDim, convKernel];
          shapes[`${stateStem}_ssm.${layer}`] = [batch, mambaHeads, mambaHeadDim, stateSize];
        }
        if (!layerTypes || layerTypes[layer] === "attention") {
          addAttention(layer, kvHeads, headDim);
        }
      }
      return shapes;
    }

    case "qwen3_next":
    case "qwen3_5_text":
    case "qwen3_5_moe_text":
    case "olmo_hybrid": {
      const {
        head_dim: explicitHeadDim,
        layer_types: layerTypes,
        num_attention_heads: attentionHeads,
        num_key_value_heads: kvHeads,
        hidden_size: hiddenSize,
        linear_num_value_heads: linearValueHeads,
        linear_num_key_heads: linearKeyHeads,
        linear_key_head_dim: linearKeyDim,
        linear_value_head_dim: linearValueDim,
        linear_conv_kernel_dim: convKernel,
      } = config;
      const keyWidth = linearKeyDim * linearKeyHeads;
      const valueWidth = linearValueDim * linearValueHeads;
      const headDim = explicitHeadDim ?? hiddenSize / attentionHeads;
      for (let layer = 0; layer < layerTypes.length; ++layer) {
        const kind = layerTypes[layer];
        if (kind === "full_attention") {
          addAttention(layer, kvHeads, headDim);
          continue;
        }
        if (kind !== "linear_attention") {
          throw new Error(`Unsupported layer type: ${kind}`);
        }
        if (config.model_type === "olmo_hybrid") {
          // olmo_hybrid keeps one conv state per projection.
          shapes[`${stateStem}_conv.${layer}.key`] = [batch, keyWidth, convKernel];
          shapes[`${stateStem}_conv.${layer}.value`] = [batch, valueWidth, convKernel];
          shapes[`${stateStem}_conv.${layer}.query`] = [batch, keyWidth, convKernel];
        } else {
          shapes[`${stateStem}_conv.${layer}`] = [batch, keyWidth * 2 + valueWidth, convKernel];
        }
        shapes[`${stateStem}_recurrent.${layer}`] = [batch, linearValueHeads, linearKeyDim, linearValueDim];
      }
      return shapes;
    }

    case "gemma4":
    case "gemma4_text": {
      const text = config.model_type === "gemma4" ? config.text_config : config;
      // Layers that share KV with earlier layers get no cache entry of their own.
      const cachedLayers = text.num_hidden_layers - (text.num_kv_shared_layers ?? 0);
      const localDim = text.head_dim;
      const globalDim = text.global_head_dim ?? localDim;
      const layerTypes = text.layer_types ?? [];
      for (let layer = 0; layer < cachedLayers; ++layer) {
        const headDim = layerTypes[layer] === "full_attention" ? globalDim : localDim;
        addAttention(layer, text.num_key_value_heads, headDim);
      }
      return shapes;
    }

    case "lfm2_vl":
    case "qwen3_5":
    case "qwen3_5_moe":
    case "voxtral_realtime": {
      const wantsAudio = config.model_type === "voxtral_realtime" && t?.session_name === "audio_encoder";
      return va(wantsAudio ? config.audio_config : config.text_config, t);
    }

    default:
      return qT(config, t);
  }
}

/**
 * Generic cache-shape builder driven by `e.normalized_config`.
 *
 * @param {Object} e Config object exposing `normalized_config`.
 * @param {Object} [options]
 * @param {string} [options.prefix="past_key_values"] Name prefix of every entry.
 * @param {number} [options.batch_size=1] Batch dimension.
 * @returns {Record<string, number[]>} Map from cache entry name to tensor dims.
 */
function qT(e, { prefix: t = "past_key_values", batch_size: r = 1 } = {}) {
  const shapes = {};
  const cfg = e.normalized_config;
  const put = (layer, suffix, dims) => {
    shapes[`${t}.${layer}.${suffix}`] = dims;
  };

  if (cfg.is_encoder_decoder && "num_encoder_heads" in cfg && "num_decoder_heads" in cfg) {
    const encoderDim = cfg.encoder_dim_kv ?? cfg.encoder_hidden_size / cfg.num_encoder_heads;
    const decoderDim = cfg.decoder_dim_kv ?? cfg.decoder_hidden_size / cfg.num_decoder_heads;
    // The same dims array instance is shared by all layers, as in the original.
    const encoderDims = [r, cfg.num_encoder_heads, 0, encoderDim];
    const decoderDims = [r, cfg.num_decoder_heads, 0, decoderDim];
    for (let layer = 0; layer < cfg.num_decoder_layers; ++layer) {
      put(layer, "encoder.key", encoderDims);
      put(layer, "encoder.value", encoderDims);
      put(layer, "decoder.key", decoderDims);
      put(layer, "decoder.value", decoderDims);
    }
    return shapes;
  }

  const heads = cfg.num_heads;
  const layerCount = cfg.num_layers;
  const dimKv = cfg.dim_kv ?? cfg.hidden_size / (cfg.num_attention_heads ?? heads);

  if (cfg.model_type === "falcon") {
    const dims = [r * heads, 0, dimKv];
    for (let layer = 0; layer < layerCount; ++layer) {
      put(layer, "key", dims);
      put(layer, "value", dims);
    }
  } else if (cfg.multi_query) {
    const dims = [r * heads, 0, 2 * dimKv];
    for (let layer = 0; layer < layerCount; ++layer) {
      put(layer, "key_value", dims);
    }
  } else if (cfg.model_type === "bloom") {
    // bloom stores keys transposed relative to values.
    const keyDims = [r * heads, dimKv, 0];
    const valueDims = [r * heads, 0, dimKv];
    for (let layer = 0; layer < layerCount; ++layer) {
      put(layer, "key", keyDims);
      put(layer, "value", valueDims);
    }
  } else if (cfg.model_type === "openelm") {
    // openelm indexes `num_heads` per layer.
    for (let layer = 0; layer < layerCount; ++layer) {
      const dims = [r, heads[layer], 0, dimKv];
      put(layer, "key", dims);
      put(layer, "value", dims);
    }
  } else {
    const dims = [r, heads, 0, dimKv];
    for (let layer = 0; layer < layerCount; ++layer) {
      put(layer, "key", dims);
      put(layer, "value", dims);
    }
  }
  return shapes;
}
/**
 * Resolve how many external-data chunks a model file uses.
 *
 * @param {boolean|number|Object|null|undefined} e The `use_external_data_format`
 *   setting: a flag/count, or a map keyed by file name or base name.
 * @param {string} t Full file name (looked up first when `e` is a map).
 * @param {string} r Base name (looked up second when `e` is a map).
 * @returns {number} The chunk count, `0` when nothing applies.
 */
function X_(e, t, r) {
  if (!e) return 0;
  if (typeof e !== "object") return +e;
  // Object.hasOwn also works for null-prototype maps, where `.hasOwnProperty` would throw.
  if (Object.hasOwn(e, t)) return +e[t];
  if (Object.hasOwn(e, r)) return +e[r];
  return 0;
}

/**
 * List the external-data file names for a model file.
 *
 * @param {string} e Model file name.
 * @param {number} t Number of chunks.
 * @returns {string[]} `${e}_data`, `${e}_data_1`, … (`t` entries).
 */
function Y_(e, t) {
  const names = [];
  for (let index = 0; index < t; ++index) {
    names.push(`${e}_data${index === 0 ? "" : "_" + index}`);
  }
  return names;
}

/**
 * Fetch the `.onnx` file of one session through `ua`.
 *
 * @param {string} e Model id or path.
 * @param {string} t File base name.
 * @param {Object} r Load options (`subfolder` is read here; the object is forwarded to `ua`).
 * @param {string} s Suffix inserted before `.onnx`.
 * @returns {Promise<*>} Whatever `ua` resolves to.
 */
async function WT(e, t, r, s) {
  const fileName = `${t}${s}.onnx`;
  const filePath = `${r.subfolder ?? ""}/${fileName}`;
  return await ua(e, filePath, true, r, fe.IS_NODE_ENV);
}

/**
 * Load the external-data entries belonging to a session's model file.
 *
 * Each chunk is fetched inside an ordinary async callback, so a failed download
 * rejects the returned promise. The previous `new Promise(async …)` wrapper left
 * the promise pending forever on failure.
 *
 * @param {string} e Model id or path.
 * @param {string} t File base name.
 * @param {string} r Suffix inserted before `.onnx`.
 * @param {Object} s Load options (`subfolder` is read here; the object is forwarded to `ua`).
 * @param {boolean|number|Object|null|undefined} n External-data setting (see `X_`).
 * @param {Object} [a={}] Session options; `externalData` is used when no chunks are configured.
 * @returns {Promise<Array>} Loaded entries: `{path, data}` objects or bare paths.
 * @throws {Error} When the chunk count exceeds `yf`.
 */
async function HT(e, t, r, s, n, a = {}) {
  const baseName = `${t}${r}.onnx`;
  const inNode = fe.IS_NODE_ENV;
  const chunkCount = X_(n, baseName, t);

  if (chunkCount > 0) {
    if (chunkCount > yf) {
      throw new Error(`The number of external data chunks (${chunkCount}) exceeds the maximum allowed value (${yf}).`);
    }
    const chunkLoads = Y_(baseName, chunkCount).map(async (chunkName) => {
      const loaded = await ua(e, `${s.subfolder ?? ""}/${chunkName}`, true, s, inNode);
      // Bytes become an in-memory entry; anything else is passed by name.
      return loaded instanceof Uint8Array ? { path: chunkName, data: loaded } : chunkName;
    });
    return Promise.all(chunkLoads);
  }

  if (a.externalData === undefined) return [];

  // User-supplied entries: string `data` values are locations to fetch.
  const userLoads = a.externalData.map(async (entry) => {
    if (typeof entry.data === "string") {
      const fetched = await ua(e, entry.data, true, s);
      return { ...entry, data: fetched };
    }
    return entry;
  });
  return Promise.all(userLoads);
}
/**
 * Create one inference session per entry of `t`, all in parallel.
 *
 * @param {string} e Model id or path.
 * @param {Record<string, string>} t Map from session name to model file base name.
 * @param {Object} r Load options forwarded to `QT`.
 * @param {Record<string, boolean>} [s] Per-session flag forwarded to `QT` (defaults to `false`).
 * @returns {Promise<Record<string, *>>} Map from session name to the value produced by `zf`.
 */
async function XT(e, t, r, s = undefined) {
  const sessionNames = Object.keys(t);
  const pairs = await Promise.all(
    sessionNames.map(async (sessionName) => {
      const cacheFlag = s?.[sessionName] ?? false;
      const prepared = await QT(e, t[sessionName], r, cacheFlag, sessionName);
      const session = await zf(prepared.buffer_or_path, prepared.session_options, prepared.session_config);
      return [sessionName, session];
    })
  );
  return Object.fromEntries(pairs);
}

/**
 * Recursively replace, in place, every value that `Gf` accepts with `new U(value)`.
 *
 * @param {Object} e Object to convert (mutated).
 * @returns {Object} The same object `e`.
 */
function J_(e) {
  for (const key in e) {
    const value = e[key];
    if (Gf(value)) {
      e[key] = new U(value);
    } else if (typeof value === "object") {
      J_(value);
    }
  }
  return e;
}
/**
 * Select and validate the feeds for a session.
 *
 * @param {Object} e Session exposing `inputNames`.
 * @param {Object} t Candidate inputs keyed by name.
 * @returns {Object} Null-prototype map holding exactly the session's inputs
 *   (each cloned when `Ti()` is truthy).
 * @throws {Error} When a required input is missing or is not a `U` instance.
 */
function YT(e, t) {
  const feeds = Object.create(null);
  const missing = [];
  for (const inputName of e.inputNames) {
    const tensor = t[inputName];
    if (!(tensor instanceof U)) {
      missing.push(inputName);
      continue;
    }
    feeds[inputName] = Ti() ? tensor.clone() : tensor;
  }
  if (missing.length > 0) {
    // The closing quote was missing from this message.
    throw new Error(`An error occurred during model execution: "Missing the following inputs: ${missing.join(", ")}."`);
  }

  const providedCount = Object.keys(t).length;
  const expectedCount = e.inputNames.length;
  if (providedCount > expectedCount) {
    const expected = new Set(e.inputNames);
    const ignored = Object.keys(t).filter((name) => !expected.has(name));
    ue.warn(`WARNING: Too many inputs were provided (${providedCount} > ${expectedCount}). The following inputs will be ignored: "${ignored.join(", ")}".`);
  }
  return feeds;
}

/** Empty base class shared by all model-output containers below. */
var Ye = class {};

/** Output carrying `logits` plus, when other fields are given, their values as `attentions`. */
var ie = class extends Ye {
  constructor({ logits: e, ...t }) {
    super();
    this.logits = e;
    const extras = Object.values(t);
    if (extras.length > 0) {
      this.attentions = extras;
    }
  }
};

/** Output carrying only `logits`. */
var He = class extends Ye {
  constructor({ logits: e }) {
    super();
    this.logits = e;
  }
};

/** Output carrying only `logits`. */
var Je = class extends Ye {
  constructor({ logits: e }) {
    super();
    this.logits = e;
  }
};

/** Output carrying `start_logits` and `end_logits`. */
var ht = class extends Ye {
  constructor({ start_logits: e, end_logits: t }) {
    super();
    this.start_logits = e;
    this.end_logits = t;
  }
};

/** Output carrying only `logits`. */
var Xr = class extends Ye {
  constructor({ logits: e }) {
    super();
    this.logits = e;
  }
};

/** Output carrying only `alphas`. */
var JT = class extends Ye {
  constructor({ alphas: e }) {
    super();
    this.alphas = e;
  }
};

/** Abstract logits processor; subclasses implement `_call(input_ids, logits)`. */
var qt = class extends vt {
  _call(e, t) {
    throw Error("`_call` should be implemented in a subclass");
  }
};

/** Abstract logits warper; subclasses implement `_call(input_ids, logits)`. */
var KT = class extends vt {
  _call(e, t) {
    throw Error("`_call` should be implemented in a subclass");
  }
};

/** Ordered list of processors; calling it pipes the logits through each one in turn. */
var Xi = class extends vt {
  constructor() {
    super();
    this.processors = [];
  }

  /** Append a single processor. */
  push(e) {
    this.processors.push(e);
  }

  /** Append every processor of an iterable. */
  extend(e) {
    this.processors.push(...e);
  }

  /**
   * @param {Array} e Generated ids per batch row.
   * @param {*} t Logits to transform.
   * @returns {*} The logits after every processor has been applied.
   */
  _call(e, t) {
    let current = t;
    for (const processor of this.processors) {
      current = processor(e, current);
    }
    return current;
  }

  [Symbol.iterator]() {
    return this.processors.values();
  }
};

/** Forces `bos_token_id` for every row that holds exactly one token so far. */
var ZT = class extends qt {
  constructor(e) {
    super();
    this.bos_token_id = e;
  }

  _call(e, t) {
    for (let row = 0; row < e.length; ++row) {
      if (e[row].length !== 1) continue;
      const scores = t[row].data;
      scores.fill(-Infinity);
      scores[this.bos_token_id] = 0;
    }
    return t;
  }
};
/** Sets the logits of every token in `suppress_tokens` to -Infinity on each step. */
var tE = class extends qt {
  constructor(e) {
    super();
    this.suppress_tokens = e;
  }

  _call(e, t) {
    for (let row = 0; row < e.length; ++row) {
      const scores = t[row].data;
      for (const tokenId of this.suppress_tokens) {
        scores[tokenId] = -Infinity;
      }
    }
    return t;
  }
};

/** Suppresses `begin_suppress_tokens` only for rows whose length equals `begin_index`. */
var K_ = class extends qt {
  constructor(e, t) {
    super();
    this.begin_suppress_tokens = e;
    this.begin_index = t;
  }

  _call(e, t) {
    for (let row = 0; row < e.length; ++row) {
      if (e[row].length !== this.begin_index) continue;
      const scores = t[row].data;
      for (const tokenId of this.begin_suppress_tokens) {
        scores[tokenId] = -Infinity;
      }
    }
    return t;
  }
};

/**
 * Timestamp-aware logits processor.
 *
 * @param {Object} e Generation config (`eos_token_id`, `no_timestamps_token_id`,
 *   `max_initial_timestamp_index` are read).
 * @param {Array} t Initial tokens; their count defines `begin_index`.
 */
var rE = class extends qt {
  constructor(e, t) {
    super();
    const eos = e.eos_token_id;
    this.eos_token_id = Array.isArray(eos) ? eos[0] : eos;
    this.no_timestamps_token_id = e.no_timestamps_token_id;
    // Every id above the no-timestamps token is treated as a timestamp token.
    this.timestamp_begin = this.no_timestamps_token_id + 1;
    // A trailing no-timestamps token does not count towards the prompt length.
    this.begin_index = t.at(-1) === this.no_timestamps_token_id ? t.length - 1 : t.length;
    this.max_initial_timestamp_index = e.max_initial_timestamp_index;
  }

  _call(e, t) {
    for (let row = 0; row < e.length; ++row) {
      const ids = e[row];
      const scores = t[row].data;
      scores[this.no_timestamps_token_id] = -Infinity;

      // First generated position: only timestamp tokens stay allowed.
      if (ids.length === this.begin_index) {
        scores.subarray(0, this.timestamp_begin).fill(-Infinity);
        continue;
      }

      const generated = ids.slice(this.begin_index);
      const lastIsTimestamp =
        generated.length >= 1 && generated[generated.length - 1] >= this.timestamp_begin;
      const penultimateIsTimestamp =
        generated.length < 2 || generated[generated.length - 2] >= this.timestamp_begin;
      if (lastIsTimestamp) {
        if (penultimateIsTimestamp) {
          scores.subarray(this.timestamp_begin).fill(-Infinity);
        } else {
          scores.subarray(0, this.eos_token_id).fill(-Infinity);
        }
      }

      // NOTE(review): unreachable — rows with `length === begin_index` already hit
      // `continue` above, so `max_initial_timestamp_index` is never applied. Kept
      // as-is to preserve behaviour; confirm the intended semantics upstream.
      if (ids.length === this.begin_index && this.max_initial_timestamp_index !== null) {
        const lastAllowed = this.timestamp_begin + this.max_initial_timestamp_index;
        scores.subarray(lastAllowed + 1).fill(-Infinity);
      }

      // Presumably `ax` is log-softmax and `je` returns [max, argmax] — TODO confirm.
      const logProbs = ax(scores);
      const timestampLogProb = Math.log(
        logProbs.subarray(this.timestamp_begin).map(Math.exp).reduce((d, h) => d + h)
      );
      const bestOtherLogProb = je(logProbs.subarray(0, this.timestamp_begin))[0];
      if (timestampLogProb > bestOtherLogProb) {
        scores.subarray(0, this.timestamp_begin).fill(-Infinity);
      }
    }
    return t;
  }
};
/** Repetition penalty: rescales the logit of every token already present in the row. */
var nE = class extends qt {
  constructor(e) {
    super();
    this.penalty = e;
  }

  _call(e, t) {
    for (let row = 0; row < e.length; ++row) {
      const scores = t[row].data;
      // A Set makes each distinct token be penalised exactly once.
      for (const seen of new Set(e[row])) {
        const tokenId = Number(seen);
        if (scores[tokenId] < 0) {
          scores[tokenId] *= this.penalty;
        } else {
          scores[tokenId] /= this.penalty;
        }
      }
    }
    return t;
  }
};

/** Blocks the EOS token(s) while the row is shorter than `min_length`. */
var aE = class extends qt {
  constructor(e, t) {
    super();
    this.min_length = e;
    this.eos_token_id = Array.isArray(t) ? t : [t];
  }

  _call(e, t) {
    for (let row = 0; row < e.length; ++row) {
      if (e[row].length >= this.min_length) continue;
      const scores = t[row].data;
      for (const eosId of this.eos_token_id) {
        scores[eosId] = -Infinity;
      }
    }
    return t;
  }
};

/** Blocks the EOS token(s) until `min_new_tokens` tokens follow the prompt. */
var oE = class extends qt {
  constructor(e, t, r) {
    super();
    this.prompt_length_to_skip = e;
    this.min_new_tokens = t;
    this.eos_token_id = Array.isArray(r) ? r : [r];
  }

  _call(e, t) {
    for (let row = 0; row < e.length; ++row) {
      const newTokens = e[row].length - this.prompt_length_to_skip;
      if (newTokens >= this.min_new_tokens) continue;
      const scores = t[row].data;
      for (const eosId of this.eos_token_id) {
        scores[eosId] = -Infinity;
      }
    }
    return t;
  }
};

/** Bans the last token of a bad word whenever the row ends with that word's prefix. */
var iE = class extends qt {
  constructor(e, t) {
    super();
    this.bad_words_ids = e;
    this.eos_token_id = Array.isArray(t) ? t : [t];
  }

  _call(e, t) {
    for (let row = 0; row < e.length; ++row) {
      const scores = t[row].data;
      const ids = e[row];
      for (const word of this.bad_words_ids) {
        const prefixLength = word.length - 1;
        if (ids.length < prefixLength) continue;

        let prefixMatches = true;
        for (let back = 1; back <= prefixLength; ++back) {
          // Loose comparison on purpose: the two sides may mix number and bigint.
          if (word.at(-back - 1) != ids.at(-back)) {
            prefixMatches = false;
            break;
          }
        }
        if (prefixMatches) {
          scores[word.at(-1)] = -Infinity;
        }
      }
    }
    return t;
  }
};
/** Temperature warper: divides every logit by `temperature` in place. */
var cE = class extends KT {
  constructor(e) {
    super();
    this.temperature = e;
  }

  _call(e, t) {
    const scores = t.data;
    for (let index = 0; index < scores.length; ++index) {
      scores[index] /= this.temperature;
    }
    return t;
  }
};

/**
 * Generation settings: every field gets its default, then the matching fields of
 * `e` (and only those) override the defaults.
 */
var Z_ = class {
  constructor(e) {
    // Built per instance so the object-valued default is never shared.
    const defaults = [
      ["max_length", 20],
      ["max_new_tokens", null],
      ["min_length", 0],
      ["min_new_tokens", null],
      ["early_stopping", false],
      ["max_time", null],
      ["do_sample", false],
      ["num_beams", 1],
      ["num_beam_groups", 1],
      ["penalty_alpha", null],
      ["use_cache", true],
      ["temperature", 1],
      ["top_k", 50],
      ["top_p", 1],
      ["typical_p", 1],
      ["epsilon_cutoff", 0],
      ["eta_cutoff", 0],
      ["diversity_penalty", 0],
      ["repetition_penalty", 1],
      ["encoder_repetition_penalty", 1],
      ["length_penalty", 1],
      ["no_repeat_ngram_size", 0],
      ["bad_words_ids", null],
      ["force_words_ids", null],
      ["renormalize_logits", false],
      ["constraints", null],
      ["forced_bos_token_id", null],
      ["forced_eos_token_id", null],
      ["remove_invalid_values", false],
      ["exponential_decay_length_penalty", null],
      ["suppress_tokens", null],
      ["streamer", null],
      ["begin_suppress_tokens", null],
      ["forced_decoder_ids", null],
      ["guidance_scale", null],
      ["num_return_sequences", 1],
      ["output_attentions", false],
      ["output_hidden_states", false],
      ["output_scores", false],
      ["return_dict_in_generate", false],
      ["pad_token_id", null],
      ["bos_token_id", null],
      ["eos_token_id", null],
      ["encoder_no_repeat_ngram_size", 0],
      ["decoder_start_token_id", null],
      ["generation_kwargs", {}],
    ];
    for (const [field, value] of defaults) {
      k(this, field, value);
    }
    Object.assign(this, rt(e, Object.getOwnPropertyNames(this)));
  }
};

/** Abstract stopping criterion; subclasses implement `_call`. */
var ya = class extends vt {
  _call(e, t) {
    throw Error("StoppingCriteria needs to be subclassed");
  }
};

/** List of stopping criteria; a row is done as soon as any criterion reports it. */
var ep = class Lv extends vt {
  constructor() {
    super();
    this.criteria = [];
  }

  /** Append a single criterion. */
  push(t) {
    this.criteria.push(t);
  }

  /** Append another list, a single criterion, or any iterable of criteria. */
  extend(t) {
    let incoming = t;
    if (incoming instanceof Lv) {
      incoming = incoming.criteria;
    } else if (incoming instanceof ya) {
      incoming = [incoming];
    }
    this.criteria.push(...incoming);
  }

  /**
   * @param {Array} t Generated ids per batch row.
   * @param {*} r Scores forwarded to each criterion.
   * @returns {Array} One flag per row; once set it is not overwritten.
   */
  _call(t, r) {
    const done = new Array(t.length).fill(false);
    for (const criterion of this.criteria) {
      const verdicts = criterion(t, r);
      for (let row = 0; row < done.length; ++row) {
        if (!done[row]) {
          done[row] = verdicts[row];
        }
      }
    }
    return done;
  }

  [Symbol.iterator]() {
    return this.criteria.values();
  }
};

/** Stops a row once it holds at least `max_length` tokens. */
var uE = class extends ya {
  constructor(e, t = null) {
    super();
    this.max_length = e;
    this.max_position_embeddings = t;
  }

  _call(e) {
    return e.map((ids) => ids.length >= this.max_length);
  }
};

/** Stops a row once its last token is one of the EOS ids. */
var dE = class extends ya {
  constructor(e) {
    super();
    this.eos_token_id = Array.isArray(e) ? e : [e];
  }

  _call(e, t) {
    return e.map((ids) => {
      const last = ids.at(-1);
      // Loose comparison on purpose: the two sides may mix number and bigint.
      return this.eos_token_id.some((eosId) => last == eosId);
    });
  }
};

/** Base sampler; `getSampler` picks the concrete strategy from a generation config. */
var ba = class extends vt {
  constructor(e) {
    super();
    this.generation_config = e;
  }

  async _call(e) {
    return this.sample(e);
  }

  async sample(e) {
    throw Error("sample should be implemented in subclasses.");
  }

  /**
   * Slice one row of size `dims.at(-1)` out of the flat logits data.
   * @param {Object} e Object with `dims` and flat `data`.
   * @param {number} t Row index, or `-1` for the last row.
   */
  getLogits(e, t) {
    const rowSize = e.dims.at(-1);
    if (t === -1) {
      return e.data.slice(-rowSize);
    }
    const start = t * rowSize;
    return e.data.slice(start, start + rowSize);
  }

  randomSelect(e) {
    return VM(e);
  }

  /**
   * @param {Object} e Generation config.
   * @returns {Object} A sampler instance matching `do_sample` / `num_beams`.
   * @throws {Error} When greedy search is asked for several return sequences.
   */
  static getSampler(e) {
    if (e.do_sample) {
      return new fE(e);
    }
    if (e.num_beams > 1) {
      return new _E(e);
    }
    if (e.num_return_sequences > 1) {
      throw Error(`num_return_sequences has to be 1 when doing greedy search, but is ${e.num_return_sequences}.`);
    }
    return new hE(e);
  }
};

/** Greedy sampler: returns the index at position 1 of `je(data)` with score 0. */
var hE = class extends ba {
  async sample(e) {
    const best = je(e.data)[1];
    return [[BigInt(best), 0]];
  }
};

/** Random sampler over the (optionally top-k limited) candidates; one draw per beam. */
var fE = class extends ba {
  async sample(e) {
    let candidateCount = e.dims.at(-1);
    if (this.generation_config.top_k > 0) {
      candidateCount = Math.min(this.generation_config.top_k, candidateCount);
    }
    const [values, indices] = await hs(e, candidateCount);
    const probabilities = nt(values.data);
    return Array.from({ length: this.generation_config.num_beams }, () => {
      const pick = this.randomSelect(probabilities);
      return [indices.data[pick], Math.log(probabilities[pick])];
    });
  }
};
/**
 * Cache container whose own enumerable properties are the cache tensors,
 * keyed by entry name.
 */
var pE = class {
  /**
   * @param {Object} [e] Initial entries; every value must be a `U` instance.
   * @throws {TypeError} When a key collides with an existing member or a value is not a `U`.
   */
  constructor(e) {
    if (!e) return;
    for (const name in e) {
      if (name in this) {
        throw new TypeError(`Key "${name}" conflicts with an existing property on DynamicCache`);
      }
      const tensor = e[name];
      if (!(tensor instanceof U)) {
        throw new TypeError(`Expected a Tensor for key "${name}", got ${typeof tensor}`);
      }
      this[name] = tensor;
    }
  }

  /**
   * @returns {number} `0` for an empty cache, otherwise the second-to-last dim of
   *   the first "past_key_values." entry.
   * @throws {Error} When the cache is non-empty but has no such entry.
   */
  get_seq_length() {
    const self = this;
    if (Object.keys(self).length === 0) {
      return 0;
    }
    for (const name in self) {
      if (name.startsWith("past_key_values.")) {
        return self[name].dims.at(-2);
      }
    }
    throw new Error("Unable to determine sequence length from the cache.");
  }

  /**
   * Store new entries, disposing any replaced entry located in a "gpu-buffer".
   * @param {Object} e Entries to store.
   */
  update(e) {
    for (const name in e) {
      const previous = this[name];
      const next = e[name];
      if (previous && previous !== next && previous.location === "gpu-buffer") {
        previous.dispose();
      }
      this[name] = next;
    }
  }

  /** Dispose every entry located in a "gpu-buffer" and wait for all of them. */
  async dispose() {
    const pending = [];
    for (const tensor of Object.values(this)) {
      if (tensor.location === "gpu-buffer") {
        pending.push(tensor.dispose());
      }
    }
    await Promise.all(pending);
  }
};

var Yi = pE;

/** Numeric ids of the supported model layouts. */
var q = {
  EncoderOnly: 0,
  EncoderDecoder: 1,
  Seq2Seq: 2,
  Vision2Seq: 3,
  DecoderOnly: 4,
  DecoderOnlyWithoutHead: 5,
  MaskGeneration: 6,
  ImageTextToText: 7,
  Musicgen: 8,
  MultiModality: 9,
  Phi3V: 10,
  AudioTextToText: 11,
  AutoEncoder: 12,
  ImageAudioTextToText: 13,
  Supertonic: 14,
  Chatterbox: 15,
  VoxtralRealtime: 16,
};
:()=>({model:"encoder_model",decoder_model_merged:"decoder_model_merged"}),cache_sessions:{decoder_model_merged:!0}},[q.MaskGeneration]:{sessions:()=>({model:"vision_encoder",prompt_encoder_mask_decoder:"prompt_encoder_mask_decoder"})},[q.ImageTextToText]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...hr[q.ImageTextToText].text_only_sessions};return r||(s.vision_encoder="vision_encoder"),e.is_encoder_decoder&&(s.model="encoder_model"),s},cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.AudioTextToText]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...hr[q.AudioTextToText].text_only_sessions};return r||(s.audio_encoder="audio_encoder"),s},cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.ImageAudioTextToText]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...hr[q.ImageAudioTextToText].text_only_sessions};return 
/**
 * @param {number} e Model layout id (a key of `hr`).
 * @returns {Object|null} The layout's `text_only_sessions`, or `null` when it has none.
 */
function mE(e) {
  return hr[e]?.text_only_sessions ?? null;
}

/**
 * Resolve the session layout for a model layout id.
 *
 * @param {number} e Model layout id; unknown ids use `hr.default`.
 * @param {Object} t Model config, passed to the layout's `sessions` factory.
 * @param {Object} [r={}] Options (`textOnly`, `model_file_name`, …).
 * @returns {{sessions: Object, cache_sessions: (Object|undefined), optional_configs: (Object|undefined)}}
 */
function gE(e, t, r = {}) {
  const layout = hr[e] ?? hr.default;
  return {
    sessions: layout.sessions(t, r, r.textOnly ?? false),
    cache_sessions: layout.cache_sessions,
    optional_configs: layout.optional_configs,
  };
}

/**
 * Resolve the model layout id for a config.
 *
 * Lookup order: each entry of `architectures`, then `model_type` directly, then
 * `model_type` mapped through the registered name mappings (`ms`).
 *
 * @param {Object} e Model config (`architectures`, `model_type`).
 * @param {Object} [options]
 * @param {boolean} [options.warn=true] Log a warning when falling back.
 * @returns {number} The layout id, `q.EncoderOnly` when nothing matches.
 */
function tp(e, { warn: t = true } = {}) {
  const architectures = e.architectures || [];
  for (const architecture of architectures) {
    const found = fr.get(architecture);
    if (found !== undefined) return found;
  }

  if (e.model_type) {
    const direct = fr.get(e.model_type);
    if (direct !== undefined) return direct;
    // `ms` stays null until `wE` registers the mappings; treat that as "no mappings"
    // so an unknown model type falls back below instead of throwing.
    for (const mapping of Object.values(ms ?? {})) {
      if (!mapping.has(e.model_type)) continue;
      const mapped = fr.get(mapping.get(e.model_type));
      if (mapped !== undefined) return mapped;
    }
  }

  if (t) {
    const listed = architectures.length > 0 ? architectures.join(", ") : "(none)";
    ue.warn(`[resolve_model_type] Architecture(s) not found in MODEL_TYPE_MAPPING: [${listed}] for model type '${e.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${fa}`);
  }
  return q.EncoderOnly;
}

/**
 * Load a model config, memoised through `Ef` unless an explicit config is given.
 *
 * @param {string} e Model id or path.
 * @param {Object} [options] `config`, `cache_dir`, `local_files_only`, `revision`.
 * @returns {*} Whatever `nn.from_pretrained` (or `Ef`) returns.
 */
function rp(e, { config: t = null, cache_dir: r = null, local_files_only: s = false, revision: n = "main" } = {}) {
  const load = () => nn.from_pretrained(e, { config: t, cache_dir: r, local_files_only: s, revision: n });
  if (t !== null) {
    return load();
  }
  const memoKey = JSON.stringify([e, r, s, n]);
  return Ef(memoKey, load);
}

/**
 * List every file a model needs: `config.json`, each session's `.onnx` file with
 * its external-data chunks, and the layout's optional config files.
 *
 * @param {string} e Model id or path.
 * @param {Object} [options] `config`, `dtype`, `device`, `model_file_name`.
 * @returns {Promise<string[]>} File paths.
 */
async function sp(e, { config: t = null, dtype: r = null, device: s = null, model_file_name: n = null } = {}) {
  t = await rp(e, { config: t });
  const files = ["config.json"];
  const custom = t["transformers.js_config"] ?? {};
  const externalData = custom.use_external_data_format;
  const folder = "onnx";
  const device = s ?? custom.device;
  const dtype = r ?? custom.dtype;
  const layoutId = tp(t);

  // Adds the .onnx file of one session plus its external-data chunks.
  const addSession = (sessionName, baseName = null) => {
    const base = baseName ?? sessionName;
    const resolvedDevice = $f(device, sessionName);
    const resolvedDtype = jf(dtype, sessionName, resolvedDevice);
    const suffix = Ai[resolvedDtype] ?? "";
    const fileName = `${base}${suffix}.onnx`;
    files.push(`${folder}/${fileName}`);
    const chunkCount = X_(externalData, fileName, sessionName);
    for (const chunkName of Y_(fileName, chunkCount)) {
      files.push(`${folder}/${chunkName}`);
    }
  };

  const { sessions, optional_configs: optionalConfigs } = gE(layoutId, t, { model_file_name: n });
  for (const [sessionName, baseName] of Object.entries(sessions)) {
    addSession(sessionName, baseName);
  }
  if (optionalConfigs) {
    for (const configFile of Object.values(optionalConfigs)) {
      files.push(configFile);
    }
  }
  return files;
}

// Registry of model-name mappings; stays null until `wE` is called.
var ms = null;

/** Register the model-name mappings consulted by `tp`. */
function wE(e) {
  ms = e;
}

/**
 * Convert ids into an int64 `U`.
 *
 * @param {U|Array} e A `U` (returned unchanged), a flat list, or equally long rows.
 * @returns {U} Dims `[1, n]` for a flat list, `[rows, cols]` for nested rows.
 * @throws {Error} On empty input or rows of different lengths.
 */
function Ji(e) {
  if (e instanceof U) return e;
  if (e.length === 0) {
    throw Error("items must be non-empty");
  }
  const toBigInts = (values) => BigInt64Array.from(values.map((value) => BigInt(value)));
  if (!Array.isArray(e[0])) {
    return new U("int64", toBigInts(e), [1, e.length]);
  }
  const width = e[0].length;
  if (e.some((row) => row.length !== width)) {
    throw Error("Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.");
  }
  return new U("int64", toBigInts(e.flat()), [e.length, width]);
}

/** Wrap a boolean in a one-element "bool" `U` with dims `[1]`. */
function np(e) {
  return new U("bool", [e], [1]);
}
/**
 * Resolve the behaviour and session layout of a model class.
 *
 * When the class name ends in "ForCausalLM" but the config's first architecture
 * is a different "...ForConditionalGeneration" class known to `fr`, that
 * architecture's layout is used and the model is flagged as text-only.
 *
 * @param {string|undefined} e Registered model class name.
 * @param {Object} [t] Model config (`architectures[0]` is read).
 * @returns {{typeConfig: Object, textOnly: boolean, modelType: (number|undefined)}}
 *   `typeConfig` merges the `ap` entry with the `hr` entry (the latter wins).
 */
function op(e, t) {
  let modelType = fr.get(e);
  let textOnly = false;

  const primaryArchitecture = t?.architectures?.[0];
  const isWrappedCausalLM =
    primaryArchitecture &&
    primaryArchitecture !== e &&
    e != null &&
    e.endsWith("ForCausalLM") &&
    primaryArchitecture.endsWith("ForConditionalGeneration");
  if (isWrappedCausalLM) {
    const wrapperType = fr.get(primaryArchitecture);
    if (wrapperType !== undefined) {
      modelType = wrapperType;
      textOnly = true;
    }
  }

  const behaviour = ap[modelType] ?? ap.default;
  const layout = hr[modelType] ?? hr.default;
  return { typeConfig: { ...behaviour, ...layout }, textOnly, modelType };
}
from_pretrained(t,{progress_callback:r=null,config:s=null,cache_dir:n=null,local_files_only:a=!1,revision:o="main",model_file_name:i=null,subfolder:l="onnx",device:c=null,dtype:d=null,use_external_data_format:h=null,session_options:_={}}={}){const p={progress_callback:r,config:s,cache_dir:n,local_files_only:a,revision:o,model_file_name:i,subfolder:l,device:c,dtype:d,use_external_data_format:h,session_options:_},w=gs.get(this);s=p.config=await nn.from_pretrained(t,p);const{typeConfig:v,textOnly:y,modelType:M}=op(w,s);if(M===void 0){const S=w??(s==null?void 0:s.model_type);S!=="custom"&&ue.warn(`Model type for '${S}' not found, assuming encoder-only architecture. Please report this at ${fa}.`)}if(r&&!(r instanceof _i)){const S={};try{const N=await sp(t,{config:s,dtype:d,device:c,model_file_name:i});(await Promise.all(N.map(R=>Zs(t,R,p)))).forEach((R,z)=>{if(R.exists){const $=N[z]==="config.json";S[N[z]]={loaded:$?R.size??0:0,total:R.size??0}}})}catch(N){ue.warn(`Unable to fetch model file metadata for total progress tracking: ${N}`)}Object.keys(S).length>0&&(p.progress_callback=new _i(r,S))}const T=v.sessions(s,p,y),A=[XT(t,T,p,v.cache_sessions)];v.optional_configs&&A.push(kE(t,v.optional_configs,p));const C=await Promise.all(A);return new this(s,...C)}async _call(t){return await this.forward(t)}async forward(t){return await this._forward(this,t)}get generation_config(){var t;return((t=this.configs)==null?void 0:t.generation_config)??null}_get_logits_processor(t,r,s=null){const n=new Xi;if(t.repetition_penalty!==null&&t.repetition_penalty!==1&&n.push(new nE(t.repetition_penalty)),t.no_repeat_ngram_size!==null&&t.no_repeat_ngram_size>0&&n.push(new sE(t.no_repeat_ngram_size)),t.bad_words_ids!==null&&n.push(new iE(t.bad_words_ids,t.eos_token_id)),t.min_length!==null&&t.eos_token_id!==null&&t.min_length>0&&n.push(new aE(t.min_length,t.eos_token_id)),t.min_new_tokens!==null&&t.eos_token_id!==null&&t.min_new_tokens>0&&n.push(new 
oE(r,t.min_new_tokens,t.eos_token_id)),t.forced_bos_token_id!==null&&n.push(new ZT(t.forced_bos_token_id)),t.forced_eos_token_id!==null&&n.push(new eE(t.max_length,t.forced_eos_token_id)),t.suppress_tokens!==null&&n.push(new tE(t.suppress_tokens)),t.begin_suppress_tokens!==null){const a=r>1||t.forced_bos_token_id===null?r:r+1;n.push(new K_(t.begin_suppress_tokens,a))}return t.guidance_scale!==null&&t.guidance_scale>1&&n.push(new lE(t.guidance_scale)),t.temperature===0&&t.do_sample&&(ue.warn("`do_sample` changed to false because `temperature: 0` implies greedy sampling (always selecting the most likely token), which is incompatible with `do_sample: true`."),t.do_sample=!1),t.do_sample&&t.temperature!==null&&t.temperature!==1&&n.push(new cE(t.temperature)),s!==null&&n.extend(s),n}_prepare_generation_config(t,r,s=Z_){const n={...this.config};for(const o of["decoder","generator","text_config"])o in n&&Object.assign(n,n[o]);const a=new s(n);return Object.assign(a,this.generation_config??{}),t&&Object.assign(a,t),r&&Object.assign(a,rt(r,Object.getOwnPropertyNames(a))),a}_get_stopping_criteria(t,r=null){const s=new ep;return t.max_length!==null&&s.push(new uE(t.max_length,this.config.max_position_embeddings??null)),t.eos_token_id!==null&&s.push(new dE(t.eos_token_id)),r&&s.extend(r),s}_validate_model_class(){if(!this.can_generate){const t=[ms.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,ms.MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES,ms.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,ms.MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES].filter(Boolean),r=gs.get(this.constructor),s=new Set,n=this.config.model_type;for(const o of t){const i=o==null?void 0:o.get(n);i&&s.add(i)}let a=`The current model class (${r}) is not compatible with \`.generate()\`, as it doesn't have a language model head.`;throw s.size>0&&(a+=` Please use the following class instead: ${[...s].join(", ")}`),Error(a)}}prepare_inputs_for_generation(...t){if(!this._prepare_inputs_for_generation)throw new 
Error("prepare_inputs_for_generation is not implemented for this model.");return this._prepare_inputs_for_generation(this,...t)}_update_model_kwargs_for_generation({generated_input_ids:t,outputs:r,model_inputs:s,is_encoder_decoder:n}){return s.past_key_values=Zi(r,s.past_key_values),s.input_ids=new U("int64",t.flat(),[t.length,1]),n?"decoder_attention_mask"in s&&(s.decoder_attention_mask=ze([s.decoder_attention_mask,yt([s.decoder_attention_mask.dims[0],1])],1)):s.attention_mask=ze([s.attention_mask,yt([s.attention_mask.dims[0],1])],1),s.position_ids=null,s}_prepare_model_inputs({inputs:t,bos_token_id:r,model_kwargs:s}){const n=rt(s,this.forward_params),a=this.main_input_name;if(a in n){if(t)throw new Error("`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. Make sure to either pass {inputs} or {input_name}=...")}else n[a]=t;return{inputs_tensor:n[a],model_inputs:n,model_input_name:a}}async _prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:t,model_inputs:r,model_input_name:s,generation_config:n}){if(this.sessions.model.inputNames.includes("inputs_embeds")&&!r.inputs_embeds&&"_prepare_inputs_embeds"in this){const{input_ids:o,pixel_values:i,attention_mask:l,...c}=r,d=await this._prepare_inputs_embeds(r);r={...c,...rt(d,["inputs_embeds","attention_mask"])}}let{last_hidden_state:a}=await Lr(this,r);if(n.guidance_scale!==null&&n.guidance_scale>1)a=ze([a,Fi(a,0)],0),"attention_mask"in r&&(r.attention_mask=ze([r.attention_mask,Jf(r.attention_mask)],0));else if(r.decoder_input_ids){const o=Ji(r.decoder_input_ids).dims[0];if(o!==a.dims[0]){if(a.dims[0]!==1)throw new Error(`The encoder outputs have a different batch size (${a.dims[0]}) than the decoder inputs (${o}).`);a=ze(Array.from({length:o},()=>a),0)}}return r.encoder_outputs=a,r}_prepare_decoder_input_ids_for_generation({batch_size:t,model_input_name:r,model_kwargs:s,decoder_start_token_id:n,bos_token_id:a,generation_config:o}){let{decoder_input_ids:i,...l}=s;if(!(i instanceof 
U)){if(i)Array.isArray(i[0])||(i=Array.from({length:t},()=>i));else if(n??(n=a),this.config.model_type==="musicgen")i=Array.from({length:t*this.config.decoder.num_codebooks},()=>[n]);else if(Array.isArray(n)){if(n.length!==t)throw new Error(`\`decoder_start_token_id\` expcted to have length ${t} but got ${n.length}`);i=n}else i=Array.from({length:t},()=>[n]);i=Ji(i)}return l.decoder_attention_mask=Xf(i),{input_ids:i,model_inputs:l}}async generate({inputs:t=null,generation_config:r=null,logits_processor:s=null,stopping_criteria:n=null,streamer:a=null,...o}){this._validate_model_class(),r=this._prepare_generation_config(r,o);let{inputs_tensor:i,model_inputs:l,model_input_name:c}=this._prepare_model_inputs({inputs:t,model_kwargs:o});const d=this.config.is_encoder_decoder;d&&("encoder_outputs"in l||(l=await this._prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:i,model_inputs:l,model_input_name:c,generation_config:r})));let h;d?{input_ids:h,model_inputs:l}=this._prepare_decoder_input_ids_for_generation({batch_size:l[c].dims.at(0),model_input_name:c,model_kwargs:l,decoder_start_token_id:r.decoder_start_token_id,bos_token_id:r.bos_token_id,generation_config:r}):h=l[c];let _=h.dims.at(-1);r.max_new_tokens!==null&&(r.max_length=_+r.max_new_tokens);const p=this._get_logits_processor(r,_,s),w=this._get_stopping_criteria(r,n),v=l[c].dims.at(0),y=ba.getSampler(r),M=new Array(v).fill(0),T=h.tolist();a&&a.put(T);let A,C={},S={};for(;;){if(l=this.prepare_inputs_for_generation(T,l,r),A=await this.forward(l),r.return_dict_in_generate)if(r.output_attentions){const I=yE(A);for(const te in I)te in C||(C[te]=[]),C[te].push(I[te])}else this._return_dict_in_generate_keys&&Object.assign(S,rt(A,this._return_dict_in_generate_keys));const $=A.logits.slice(null,-1,null).to("float32"),Q=p(T,$),H=[];for(let I=0;I<Q.dims.at(0);++I){const te=Q[I],W=await y(te);for(const[ee,G]of W){const 
/**
 * Forward pass for an encoder-decoder model.
 *
 * Runs the encoder (via `Lr`) only when `encoder_outputs` is not already present in
 * the inputs, then hands the remaining inputs to the merged decoder session (via `_r`).
 *
 * @param {Object} e Model instance exposing `sessions.model` and `sessions.decoder_model_merged`.
 * @param {Object} t Model inputs; `encoder_outputs`, `input_ids`, `decoder_input_ids`
 *   and `decoder_attention_mask` are consumed here, everything else is forwarded.
 * @returns {Promise<*>} Whatever `_r` resolves to for the decoder session.
 */
async function Ma(e, t) {
  const {
    encoder_outputs: suppliedEncoderOutputs,
    // Pulled out only so the encoder's `input_ids` is not forwarded to the decoder.
    input_ids: _encoderInputIds,
    decoder_input_ids: decoderInputIds,
    decoder_attention_mask: decoderAttentionMask,
    ...decoderInputs
  } = t;

  let encoderHiddenStates = suppliedEncoderOutputs;
  if (!encoderHiddenStates) {
    const encoderInputs = rt(t, e.sessions.model.inputNames);
    const encoderResult = await Lr(e, encoderInputs);
    encoderHiddenStates = encoderResult.last_hidden_state;
  }

  // The decoder session receives the decoder ids under the name `input_ids`.
  decoderInputs.input_ids = decoderInputIds;
  decoderInputs.encoder_hidden_states = encoderHiddenStates;

  const decoderInputNames = e.sessions.decoder_model_merged.inputNames;
  if (decoderInputNames.includes("encoder_attention_mask")) {
    decoderInputs.encoder_attention_mask = t.attention_mask;
  }

  // Fall back to the decoder mask only when no `attention_mask` was forwarded.
  if (decoderAttentionMask && !decoderInputs.attention_mask) {
    decoderInputs.attention_mask = decoderAttentionMask;
  }

  return await _r(e, decoderInputs, true);
}
/**
 * Collect attention outputs from a model result, grouped by kind.
 *
 * Every enumerable key of `e` whose name starts with `cross_attentions`,
 * `encoder_attentions` or `decoder_attentions` has its value appended to the list
 * for that prefix. Prefixes without any matching key are absent from the result.
 *
 * @param {Object} e Model outputs keyed by output name.
 * @returns {Object<string, Array>} Map from prefix to the matching values, in key order.
 */
function yE(e) {
  const attentionPrefixes = ["cross_attentions", "encoder_attentions", "decoder_attentions"];
  const grouped = {};

  for (const prefix of attentionPrefixes) {
    for (const outputName in e) {
      if (!outputName.startsWith(prefix)) {
        continue;
      }
      if (!(prefix in grouped)) {
        grouped[prefix] = [];
      }
      grouped[prefix].push(e[outputName]);
    }
  }

  return grouped;
}
/**
 * Derive per-row position ids from a 2-D attention mask by running summation.
 *
 * For each row, a counter starts at `t`. A position whose mask value is `0n`
 * receives `1n` and leaves the counter untouched; any other position receives the
 * current counter, which is then advanced by that mask value.
 *
 * @param {{dims: number[], data: ArrayLike<bigint>}} e Mask with `dims = [rows, cols]`
 *   and BigInt entries stored row by row.
 * @param {number|bigint} [t=0] Starting position for every row.
 * @returns {{data: BigInt64Array, dims: number[]}} Position ids with the mask's `dims`.
 */
function lp(e, t = 0) {
  const [rowCount, colCount] = e.dims;
  const mask = e.data;
  const positions = new BigInt64Array(mask.length);

  for (let row = 0; row < rowCount; row += 1) {
    const rowOffset = row * colCount;
    let nextPosition = BigInt(t);

    for (let col = 0; col < colCount; col += 1) {
      const flatIndex = rowOffset + col;
      if (mask[flatIndex] === 0n) {
        // Masked slots get a fixed filler value and do not consume a position.
        positions[flatIndex] = 1n;
        continue;
      }
      positions[flatIndex] = nextPosition;
      nextPosition += mask[flatIndex];
    }
  }

  return { data: positions, dims: e.dims };
}
/**
 * Overwrite the embeddings at placeholder-token positions with modality features.
 *
 * Positions in `input_ids` equal to `modality_token_id` are located per batch row;
 * the features are then copied into `inputs_embeds` at those positions, in row-major
 * order. `inputs_embeds` is modified in place.
 *
 * @param {Object} options
 * @param {number|bigint} options.modality_token_id Id of the placeholder token.
 * @param {*} options.inputs_embeds Indexable as `[batch][position]`, each item exposing a typed-array `data`.
 * @param {*} options.modality_features Indexable by feature number, with `dims[0]` giving the feature count.
 * @param {*} options.input_ids Object exposing `tolist()` that yields one id array per batch row.
 * @param {*} options.attention_mask Returned unchanged.
 * @returns {{inputs_embeds: *, attention_mask: *}}
 * @throws {Error} If the number of placeholder tokens differs from the number of features.
 */
function cp({ modality_token_id, inputs_embeds, modality_features, input_ids, attention_mask }) {
  const placeholderPositions = input_ids.tolist().map((rowIds) => {
    const hits = [];
    rowIds.forEach((tokenId, position) => {
      // Loose equality is deliberate: it lets a BigInt id match a Number id.
      if (tokenId == modality_token_id) {
        hits.push(position);
      }
    });
    return hits;
  });

  let placeholderCount = 0;
  for (const hits of placeholderPositions) {
    placeholderCount += hits.length;
  }

  const featureCount = modality_features.dims[0];
  if (placeholderCount !== featureCount) {
    throw new Error(`Number of tokens and features do not match: tokens: ${placeholderCount}, features ${featureCount}`);
  }

  let featureCursor = 0;
  placeholderPositions.forEach((hits, batchIndex) => {
    const rowEmbeds = inputs_embeds[batchIndex];
    for (const position of hits) {
      rowEmbeds[position].data.set(modality_features[featureCursor].data);
      featureCursor += 1;
    }
  });

  return { inputs_embeds, attention_mask };
}
rl={};ns(rl,{ASTForAudioClassification:()=>DE,ASTModel:()=>NE,ASTPreTrainedModel:()=>ol,AfmoeForCausalLM:()=>LE,AfmoeModel:()=>FE,AfmoePreTrainedModel:()=>nl,AlbertForMaskedLM:()=>CE,AlbertForQuestionAnswering:()=>AE,AlbertForSequenceClassification:()=>EE,AlbertModel:()=>TE,AlbertPreTrainedModel:()=>on,ApertusForCausalLM:()=>PE,ApertusModel:()=>SE,ApertusPreTrainedModel:()=>sl,ArceeForCausalLM:()=>OE,ArceeModel:()=>IE,ArceePreTrainedModel:()=>al,BartForConditionalGeneration:()=>BE,BartForSequenceClassification:()=>RE,BartModel:()=>zE,BartPretrainedModel:()=>Ta,BeitForImageClassification:()=>$E,BeitModel:()=>GE,BeitPreTrainedModel:()=>il,BertForMaskedLM:()=>UE,BertForQuestionAnswering:()=>WE,BertForSequenceClassification:()=>jE,BertForTokenClassification:()=>qE,BertModel:()=>VE,BertPreTrainedModel:()=>ws,BlenderbotForConditionalGeneration:()=>QE,BlenderbotModel:()=>HE,BlenderbotPreTrainedModel:()=>ll,BlenderbotSmallForConditionalGeneration:()=>YE,BlenderbotSmallModel:()=>XE,BlenderbotSmallPreTrainedModel:()=>cl,BloomForCausalLM:()=>KE,BloomModel:()=>JE,BloomPreTrainedModel:()=>ul,CHMv2ForDepthEstimation:()=>o2,CHMv2PreTrainedModel:()=>pp,CLIPModel:()=>l2,CLIPPreTrainedModel:()=>Yr,CLIPSegForImageSegmentation:()=>f2,CLIPSegModel:()=>h2,CLIPSegPreTrainedModel:()=>dl,CLIPTextModel:()=>c2,CLIPTextModelWithProjection:()=>wp,CLIPVisionModel:()=>u2,CLIPVisionModelWithProjection:()=>d2,CamembertForMaskedLM:()=>e2,CamembertForQuestionAnswering:()=>s2,CamembertForSequenceClassification:()=>t2,CamembertForTokenClassification:()=>r2,CamembertModel:()=>ZE,CamembertPreTrainedModel:()=>vs,ChatterboxModel:()=>fp,ChatterboxPreTrainedModel:()=>hp,ChineseCLIPModel:()=>a2,ChineseCLIPPreTrainedModel:()=>_p,ClapAudioModelWithProjection:()=>gp,ClapModel:()=>i2,ClapPreTrainedModel:()=>Ea,ClapTextModelWithProjection:()=>mp,CodeGenForCausalLM:()=>p2,CodeGenModel:()=>_2,CodeGenPreTrainedModel:()=>hl,Cohere2ForCausalLM:()=>v2,Cohere2Model:()=>w2,Cohere2PreTrainedModel:()=>_l,CohereAsrForConditi
onalGeneration:()=>b2,CohereAsrModel:()=>y2,CohereAsrPreTrainedModel:()=>pl,CohereForCausalLM:()=>g2,CohereModel:()=>m2,CoherePreTrainedModel:()=>fl,ConvBertForMaskedLM:()=>x2,ConvBertForQuestionAnswering:()=>E2,ConvBertForSequenceClassification:()=>k2,ConvBertForTokenClassification:()=>T2,ConvBertModel:()=>M2,ConvBertPreTrainedModel:()=>ys,ConvNextForImageClassification:()=>C2,ConvNextModel:()=>A2,ConvNextPreTrainedModel:()=>ml,ConvNextV2ForImageClassification:()=>P2,ConvNextV2Model:()=>S2,ConvNextV2PreTrainedModel:()=>gl,DFineForObjectDetection:()=>O2,DFineModel:()=>I2,DFinePreTrainedModel:()=>vl,DINOv3ConvNextModel:()=>oA,DINOv3ConvNextPreTrainedModel:()=>Ap,DINOv3ViTModel:()=>iA,DINOv3ViTPreTrainedModel:()=>Cp,DPTForDepthEstimation:()=>pA,DPTModel:()=>_A,DPTPreTrainedModel:()=>Tl,DacDecoderModel:()=>Mp,DacDecoderOutput:()=>yp,DacEncoderModel:()=>bp,DacEncoderOutput:()=>vp,DacModel:()=>N2,DacPreTrainedModel:()=>Aa,DebertaForMaskedLM:()=>z2,DebertaForQuestionAnswering:()=>G2,DebertaForSequenceClassification:()=>B2,DebertaForTokenClassification:()=>R2,DebertaModel:()=>D2,DebertaPreTrainedModel:()=>bs,DebertaV2ForMaskedLM:()=>j2,DebertaV2ForQuestionAnswering:()=>H2,DebertaV2ForSequenceClassification:()=>q2,DebertaV2ForTokenClassification:()=>W2,DebertaV2Model:()=>U2,DebertaV2PreTrainedModel:()=>Ms,DecisionTransformerModel:()=>Q2,DecisionTransformerPreTrainedModel:()=>xp,DeepseekV3ForCausalLM:()=>V2,DeepseekV3Model:()=>$2,DeepseekV3PreTrainedModel:()=>yl,DeiTForImageClassification:()=>Y2,DeiTModel:()=>X2,DeiTPreTrainedModel:()=>bl,DepthAnythingForDepthEstimation:()=>J2,DepthAnythingPreTrainedModel:()=>kp,DepthProForDepthEstimation:()=>K2,DepthProPreTrainedModel:()=>Tp,DetrForObjectDetection:()=>eA,DetrForSegmentation:()=>tA,DetrModel:()=>Z2,DetrObjectDetectionOutput:()=>Ml,DetrPreTrainedModel:()=>Ca,DetrSegmentationOutput:()=>Ep,Dinov2ForImageClassification:()=>sA,Dinov2Model:()=>rA,Dinov2PreTrainedModel:()=>xl,Dinov2WithRegistersForImageClassification:()=>aA,Dinov2W
ithRegistersModel:()=>nA,Dinov2WithRegistersPreTrainedModel:()=>kl,DistilBertForMaskedLM:()=>hA,DistilBertForQuestionAnswering:()=>dA,DistilBertForSequenceClassification:()=>cA,DistilBertForTokenClassification:()=>uA,DistilBertModel:()=>lA,DistilBertPreTrainedModel:()=>xs,DonutSwinModel:()=>fA,DonutSwinPreTrainedModel:()=>Sp,EdgeTamModel:()=>kF,EfficientNetForImageClassification:()=>gA,EfficientNetModel:()=>mA,EfficientNetPreTrainedModel:()=>El,ElectraForMaskedLM:()=>vA,ElectraForQuestionAnswering:()=>MA,ElectraForSequenceClassification:()=>yA,ElectraForTokenClassification:()=>bA,ElectraModel:()=>wA,ElectraPreTrainedModel:()=>ks,Ernie4_5ForCausalLM:()=>kA,Ernie4_5Model:()=>xA,Ernie4_5PretrainedModel:()=>Al,EsmForMaskedLM:()=>EA,EsmForSequenceClassification:()=>AA,EsmForTokenClassification:()=>CA,EsmModel:()=>TA,EsmPreTrainedModel:()=>cn,EuroBertForMaskedLM:()=>PA,EuroBertForSequenceClassification:()=>FA,EuroBertForTokenClassification:()=>LA,EuroBertModel:()=>SA,EuroBertPreTrainedModel:()=>un,ExaoneForCausalLM:()=>OA,ExaoneModel:()=>IA,ExaonePreTrainedModel:()=>Cl,FalconForCausalLM:()=>DA,FalconH1ForCausalLM:()=>BA,FalconH1Model:()=>zA,FalconH1PreTrainedModel:()=>Pl,FalconModel:()=>NA,FalconPreTrainedModel:()=>Sl,FastViTForImageClassification:()=>GA,FastViTModel:()=>RA,FastViTPreTrainedModel:()=>Fl,Florence2ForConditionalGeneration:()=>$A,Florence2PreTrainedModel:()=>Pp,GLPNForDepthEstimation:()=>nC,GLPNModel:()=>sC,GLPNPreTrainedModel:()=>Rl,GPT2LMHeadModel:()=>_C,GPT2Model:()=>fC,GPT2PreTrainedModel:()=>jl,GPTBigCodeForCausalLM:()=>oC,GPTBigCodeModel:()=>aC,GPTBigCodePreTrainedModel:()=>Gl,GPTJForCausalLM:()=>mC,GPTJModel:()=>pC,GPTJPreTrainedModel:()=>ql,GPTNeoForCausalLM:()=>lC,GPTNeoModel:()=>iC,GPTNeoPreTrainedModel:()=>$l,GPTNeoXForCausalLM:()=>uC,GPTNeoXModel:()=>cC,GPTNeoXPreTrainedModel:()=>Vl,Gemma2ForCausalLM:()=>qA,Gemma2Model:()=>jA,Gemma2PreTrainedModel:()=>Il,Gemma3ForCausalLM:()=>XA,Gemma3ForConditionalGeneration:()=>Ip,Gemma3Model:()=>QA,Gemma3PreTr
ainedModel:()=>Lp,Gemma3nForCausalLM:()=>YA,Gemma3nForConditionalGeneration:()=>Sa,Gemma3nPreTrainedModel:()=>Op,Gemma4ForCausalLM:()=>JA,Gemma4ForConditionalGeneration:()=>Ol,GemmaForCausalLM:()=>UA,GemmaModel:()=>VA,GemmaPreTrainedModel:()=>Ll,GlmForCausalLM:()=>ZA,GlmModel:()=>KA,GlmMoeDsaForCausalLM:()=>tC,GlmMoeDsaModel:()=>eC,GlmMoeDsaPreTrainedModel:()=>Dl,GlmOcrForConditionalGeneration:()=>rC,GlmPreTrainedModel:()=>Nl,GptOssForCausalLM:()=>hC,GptOssModel:()=>dC,GptOssPreTrainedModel:()=>Ul,GraniteForCausalLM:()=>wC,GraniteModel:()=>gC,GraniteMoeHybridForCausalLM:()=>yC,GraniteMoeHybridModel:()=>vC,GraniteMoeHybridPreTrainedModel:()=>Hl,GranitePreTrainedModel:()=>Wl,GraniteSpeechForConditionalGeneration:()=>bC,GroundingDinoForObjectDetection:()=>MC,GroundingDinoPreTrainedModel:()=>Rp,GroupViTModel:()=>xC,GroupViTPreTrainedModel:()=>Gp,HeliumForCausalLM:()=>TC,HeliumModel:()=>kC,HeliumPreTrainedModel:()=>Xl,HieraForImageClassification:()=>AC,HieraModel:()=>EC,HieraPreTrainedModel:()=>Yl,HubertForCTC:()=>OC,HubertForSequenceClassification:()=>NC,HubertModel:()=>IC,HubertPreTrainedModel:()=>LC,HunYuanDenseV1ForCausalLM:()=>zC,HunYuanDenseV1Model:()=>DC,HunYuanDenseV1PreTrainedModel:()=>Jl,IJepaForImageClassification:()=>RC,IJepaModel:()=>BC,IJepaPreTrainedModel:()=>Kl,Idefics3ForConditionalGeneration:()=>$p,JAISLMHeadModel:()=>$C,JAISModel:()=>GC,JAISPreTrainedModel:()=>Zl,JinaCLIPModel:()=>VC,JinaCLIPPreTrainedModel:()=>Pa,JinaCLIPTextModel:()=>Vp,JinaCLIPVisionModel:()=>UC,Lfm2ForCausalLM:()=>qC,Lfm2Model:()=>jC,Lfm2MoeForCausalLM:()=>QC,Lfm2MoeModel:()=>HC,Lfm2MoePreTrainedModel:()=>tc,Lfm2PreTrainedModel:()=>ec,Lfm2VlForConditionalGeneration:()=>XC,LightOnOcrForConditionalGeneration:()=>WC,LiteWhisperForConditionalGeneration:()=>qL,Llama4ForCausalLM:()=>KC,Llama4PreTrainedModel:()=>Up,LlamaForCausalLM:()=>JC,LlamaModel:()=>YC,LlamaPreTrainedModel:()=>rc,LlavaForConditionalGeneration:()=>pr,LlavaOnevisionForConditionalGeneration:()=>pr,LlavaPreTrainedModel:()
=>Fp,LlavaQwen2ForCausalLM:()=>HA,LongT5ForConditionalGeneration:()=>eS,LongT5Model:()=>ZC,LongT5PreTrainedModel:()=>sc,M2M100ForConditionalGeneration:()=>rS,M2M100Model:()=>tS,M2M100PreTrainedModel:()=>nc,MBartForCausalLM:()=>uS,MBartForConditionalGeneration:()=>lS,MBartForSequenceClassification:()=>cS,MBartModel:()=>iS,MBartPreTrainedModel:()=>dn,MPNetForMaskedLM:()=>YS,MPNetForQuestionAnswering:()=>ZS,MPNetForSequenceClassification:()=>JS,MPNetForTokenClassification:()=>KS,MPNetModel:()=>XS,MPNetPreTrainedModel:()=>Ts,MT5ForConditionalGeneration:()=>sP,MT5Model:()=>rP,MT5PreTrainedModel:()=>pc,MarianMTModel:()=>nS,MarianModel:()=>sS,MarianPreTrainedModel:()=>ac,MaskFormerForInstanceSegmentation:()=>oS,MaskFormerModel:()=>aS,MaskFormerPreTrainedModel:()=>oc,Metric3DForDepthEstimation:()=>dS,Metric3DPreTrainedModel:()=>jp,Metric3Dv2ForDepthEstimation:()=>hS,Metric3Dv2PreTrainedModel:()=>qp,MgpstrForSceneTextRecognition:()=>fS,MgpstrModelOutput:()=>Wp,MgpstrPreTrainedModel:()=>Hp,MimiDecoderModel:()=>Jp,MimiDecoderOutput:()=>Xp,MimiEncoderModel:()=>Yp,MimiEncoderOutput:()=>Qp,MimiModel:()=>_S,MimiPreTrainedModel:()=>Fa,Mistral4ForCausalLM:()=>wS,Mistral4Model:()=>gS,Mistral4PreTrainedModel:()=>lc,MistralForCausalLM:()=>mS,MistralModel:()=>pS,MistralPreTrainedModel:()=>ic,MobileBertForMaskedLM:()=>yS,MobileBertForQuestionAnswering:()=>MS,MobileBertForSequenceClassification:()=>bS,MobileBertModel:()=>vS,MobileBertPreTrainedModel:()=>hn,MobileLLMForCausalLM:()=>kS,MobileLLMModel:()=>xS,MobileLLMPreTrainedModel:()=>cc,MobileNetV1ForImageClassification:()=>ES,MobileNetV1ForSemanticSegmentation:()=>AS,MobileNetV1Model:()=>TS,MobileNetV1PreTrainedModel:()=>La,MobileNetV2ForImageClassification:()=>SS,MobileNetV2ForSemanticSegmentation:()=>PS,MobileNetV2Model:()=>CS,MobileNetV2PreTrainedModel:()=>Ia,MobileNetV3ForImageClassification:()=>LS,MobileNetV3ForSemanticSegmentation:()=>IS,MobileNetV3Model:()=>FS,MobileNetV3PreTrainedModel:()=>Oa,MobileNetV4ForImageClassification:()=
>NS,MobileNetV4ForSemanticSegmentation:()=>DS,MobileNetV4Model:()=>OS,MobileNetV4PreTrainedModel:()=>Na,MobileViTForImageClassification:()=>BS,MobileViTModel:()=>zS,MobileViTPreTrainedModel:()=>uc,MobileViTV2ForImageClassification:()=>GS,MobileViTV2Model:()=>RS,MobileViTV2PreTrainedModel:()=>dc,ModernBertDecoderForCausalLM:()=>WS,ModernBertDecoderModel:()=>qS,ModernBertDecoderPreTrainedModel:()=>hc,ModernBertForMaskedLM:()=>VS,ModernBertForSequenceClassification:()=>US,ModernBertForTokenClassification:()=>jS,ModernBertModel:()=>$S,ModernBertPreTrainedModel:()=>fn,Moondream1ForConditionalGeneration:()=>WA,MoonshineForConditionalGeneration:()=>QS,MoonshineModel:()=>HS,MoonshinePreTrainedModel:()=>fc,MptForCausalLM:()=>tP,MptModel:()=>eP,MptPreTrainedModel:()=>_c,MultiModalityCausalLM:()=>nP,MultiModalityPreTrainedModel:()=>Kp,MusicgenForCausalLM:()=>oP,MusicgenForConditionalGeneration:()=>Zp,MusicgenModel:()=>aP,MusicgenPreTrainedModel:()=>mc,NanoChatForCausalLM:()=>lP,NanoChatModel:()=>iP,NanoChatPreTrainedModel:()=>gc,NemotronHForCausalLM:()=>uP,NemotronHModel:()=>cP,NemotronHPreTrainedModel:()=>wc,NeoBertForMaskedLM:()=>hP,NeoBertForQuestionAnswering:()=>pP,NeoBertForSequenceClassification:()=>fP,NeoBertForTokenClassification:()=>_P,NeoBertModel:()=>dP,NeoBertPreTrainedModel:()=>Es,NomicBertModel:()=>mP,NomicBertPreTrainedModel:()=>em,OPTForCausalLM:()=>CP,OPTModel:()=>AP,OPTPreTrainedModel:()=>kc,Olmo2ForCausalLM:()=>yP,Olmo2Model:()=>vP,Olmo2PreTrainedModel:()=>yc,Olmo3ForCausalLM:()=>MP,Olmo3Model:()=>bP,Olmo3PreTrainedModel:()=>bc,OlmoForCausalLM:()=>wP,OlmoHybridForCausalLM:()=>kP,OlmoHybridModel:()=>xP,OlmoHybridPreTrainedModel:()=>Mc,OlmoModel:()=>gP,OlmoPreTrainedModel:()=>vc,OpenELMForCausalLM:()=>EP,OpenELMModel:()=>TP,OpenELMPreTrainedModel:()=>xc,OwlViTForObjectDetection:()=>LP,OwlViTModel:()=>FP,OwlViTPreTrainedModel:()=>Ec,Owlv2ForObjectDetection:()=>PP,Owlv2Model:()=>SP,Owlv2PreTrainedModel:()=>Tc,PaliGemmaForConditionalGeneration:()=>IP,ParakeetForC
TC:()=>OP,ParakeetPreTrainedModel:()=>tm,PatchTSMixerForPrediction:()=>DP,PatchTSMixerModel:()=>NP,PatchTSMixerPreTrainedModel:()=>Ac,PatchTSTForPrediction:()=>BP,PatchTSTModel:()=>zP,PatchTSTPreTrainedModel:()=>Cc,Phi3ForCausalLM:()=>VP,Phi3Model:()=>$P,Phi3PreTrainedModel:()=>Pc,Phi3VForCausalLM:()=>sm,Phi3VPreTrainedModel:()=>rm,PhiForCausalLM:()=>GP,PhiModel:()=>RP,PhiPreTrainedModel:()=>Sc,PreTrainedModel:()=>P,PvtForImageClassification:()=>jP,PvtModel:()=>UP,PvtPreTrainedModel:()=>Fc,PyAnnoteForAudioFrameClassification:()=>WP,PyAnnoteModel:()=>qP,PyAnnotePreTrainedModel:()=>Lc,Qwen2ForCausalLM:()=>QP,Qwen2Model:()=>HP,Qwen2MoeForCausalLM:()=>YP,Qwen2MoeModel:()=>XP,Qwen2MoePreTrainedModel:()=>Oc,Qwen2PreTrainedModel:()=>Ic,Qwen2VLForCausalLM:()=>Dp,Qwen2VLForConditionalGeneration:()=>zl,Qwen2VLPreTrainedModel:()=>Np,Qwen2_5_VLForCausalLM:()=>zp,Qwen2_5_VLForConditionalGeneration:()=>Bl,Qwen3ForCausalLM:()=>KP,Qwen3Model:()=>JP,Qwen3MoeForCausalLM:()=>eF,Qwen3MoeModel:()=>ZP,Qwen3MoePreTrainedModel:()=>Dc,Qwen3NextForCausalLM:()=>rF,Qwen3NextModel:()=>tF,Qwen3NextPreTrainedModel:()=>zc,Qwen3PreTrainedModel:()=>Nc,Qwen3VLForCausalLM:()=>nm,Qwen3VLForConditionalGeneration:()=>Bc,Qwen3VLMoeForCausalLM:()=>nF,Qwen3VLMoeForConditionalGeneration:()=>sF,Qwen3_5ForCausalLM:()=>am,Qwen3_5ForConditionalGeneration:()=>Rc,Qwen3_5MoeForCausalLM:()=>oF,Qwen3_5MoeForConditionalGeneration:()=>aF,RFDetrForObjectDetection:()=>uF,RFDetrModel:()=>cF,RFDetrObjectDetectionOutput:()=>om,RFDetrPreTrainedModel:()=>$c,RTDetrForObjectDetection:()=>L2,RTDetrModel:()=>F2,RTDetrObjectDetectionOutput:()=>ln,RTDetrPreTrainedModel:()=>wl,RTDetrV2ForObjectDetection:()=>MF,RTDetrV2Model:()=>bF,RTDetrV2ObjectDetectionOutput:()=>im,RTDetrV2PreTrainedModel:()=>Vc,ResNetForImageClassification:()=>lF,ResNetModel:()=>iF,ResNetPreTrainedModel:()=>Gc,RoFormerForMaskedLM:()=>gF,RoFormerForQuestionAnswering:()=>yF,RoFormerForSequenceClassification:()=>wF,RoFormerForTokenClassification:()=>vF,RoFormerModel
:()=>mF,RoFormerPreTrainedModel:()=>Cs,RobertaForMaskedLM:()=>hF,RobertaForQuestionAnswering:()=>pF,RobertaForSequenceClassification:()=>fF,RobertaForTokenClassification:()=>_F,RobertaModel:()=>dF,RobertaPreTrainedModel:()=>As,Sam2ImageSegmentationOutput:()=>um,Sam2Model:()=>Uc,Sam2PreTrainedModel:()=>dm,Sam3TrackerModel:()=>TF,SamImageSegmentationOutput:()=>lm,SamModel:()=>xF,SamPreTrainedModel:()=>cm,SapiensForDepthEstimation:()=>AF,SapiensForNormalEstimation:()=>CF,SapiensForSemanticSegmentation:()=>EF,SapiensPreTrainedModel:()=>Da,SegformerForImageClassification:()=>PF,SegformerForSemanticSegmentation:()=>FF,SegformerModel:()=>SF,SegformerPreTrainedModel:()=>za,SiglipModel:()=>LF,SiglipPreTrainedModel:()=>jc,SiglipTextModel:()=>hm,SiglipVisionModel:()=>IF,SmolLM3ForCausalLM:()=>NF,SmolLM3Model:()=>OF,SmolLM3PreTrainedModel:()=>qc,SmolVLMForConditionalGeneration:()=>DF,SnacDecoderModel:()=>_m,SnacEncoderModel:()=>fm,SnacModel:()=>zF,SnacPreTrainedModel:()=>Ba,SolarOpenForCausalLM:()=>RF,SolarOpenModel:()=>BF,SolarOpenPreTrainedModel:()=>Wc,SpeechT5ForSpeechToText:()=>$F,SpeechT5ForTextToSpeech:()=>VF,SpeechT5HifiGan:()=>UF,SpeechT5Model:()=>GF,SpeechT5PreTrainedModel:()=>Ra,SqueezeBertForMaskedLM:()=>qF,SqueezeBertForQuestionAnswering:()=>HF,SqueezeBertForSequenceClassification:()=>WF,SqueezeBertModel:()=>jF,SqueezeBertPreTrainedModel:()=>_n,StableLmForCausalLM:()=>XF,StableLmModel:()=>QF,StableLmPreTrainedModel:()=>Hc,Starcoder2ForCausalLM:()=>JF,Starcoder2Model:()=>YF,Starcoder2PreTrainedModel:()=>Qc,StyleTextToSpeech2Model:()=>KF,StyleTextToSpeech2PreTrainedModel:()=>pm,SupertonicForConditionalGeneration:()=>gm,SupertonicPreTrainedModel:()=>mm,Swin2SRForImageSuperResolution:()=>sL,Swin2SRModel:()=>rL,Swin2SRPreTrainedModel:()=>Xc,SwinForImageClassification:()=>eL,SwinForSemanticSegmentation:()=>tL,SwinModel:()=>ZF,SwinPreTrainedModel:()=>Ga,T5ForConditionalGeneration:()=>aL,T5Model:()=>nL,T5PreTrainedModel:()=>Yc,TableTransformerForObjectDetection:()=>iL,Table
TransformerModel:()=>oL,TableTransformerObjectDetectionOutput:()=>wm,TableTransformerPreTrainedModel:()=>Jc,TrOCRForCausalLM:()=>lL,TrOCRPreTrainedModel:()=>vm,UltravoxModel:()=>Ql,UltravoxPreTrainedModel:()=>Bp,UniSpeechForCTC:()=>uL,UniSpeechForSequenceClassification:()=>dL,UniSpeechModel:()=>cL,UniSpeechPreTrainedModel:()=>$a,UniSpeechSatForAudioFrameClassification:()=>pL,UniSpeechSatForCTC:()=>fL,UniSpeechSatForSequenceClassification:()=>_L,UniSpeechSatModel:()=>hL,UniSpeechSatPreTrainedModel:()=>pn,VaultGemmaForCausalLM:()=>gL,VaultGemmaModel:()=>mL,VaultGemmaPreTrainedModel:()=>Kc,ViTForImageClassification:()=>yL,ViTMAEModel:()=>bL,ViTMAEPreTrainedModel:()=>ym,ViTMSNForImageClassification:()=>xL,ViTMSNModel:()=>ML,ViTMSNPreTrainedModel:()=>eu,ViTModel:()=>vL,ViTPreTrainedModel:()=>Zc,VisionEncoderDecoderModel:()=>wL,VitMatteForImageMatting:()=>kL,VitMattePreTrainedModel:()=>bm,VitPoseForPoseEstimation:()=>TL,VitPosePreTrainedModel:()=>Mm,VitsModel:()=>EL,VitsModelOutput:()=>xm,VitsPreTrainedModel:()=>km,VoxtralForConditionalGeneration:()=>AL,VoxtralRealtimeForConditionalGeneration:()=>Am,VoxtralRealtimePreTrainedModel:()=>Em,Wav2Vec2BertForCTC:()=>NL,Wav2Vec2BertForSequenceClassification:()=>DL,Wav2Vec2BertModel:()=>OL,Wav2Vec2BertPreTrainedModel:()=>Va,Wav2Vec2ForAudioFrameClassification:()=>FC,Wav2Vec2ForCTC:()=>SC,Wav2Vec2ForSequenceClassification:()=>PC,Wav2Vec2Model:()=>CC,Wav2Vec2PreTrainedModel:()=>Ir,WavLMForAudioFrameClassification:()=>$L,WavLMForCTC:()=>BL,WavLMForSequenceClassification:()=>RL,WavLMForXVector:()=>GL,WavLMModel:()=>zL,WavLMPreTrainedModel:()=>Ss,WeSpeakerResNetModel:()=>VL,WeSpeakerResNetPreTrainedModel:()=>Sm,WhisperForConditionalGeneration:()=>Pm,WhisperModel:()=>jL,WhisperPreTrainedModel:()=>ru,XLMForQuestionAnswering:()=>YL,XLMForSequenceClassification:()=>QL,XLMForTokenClassification:()=>XL,XLMModel:()=>WL,XLMPreTrainedModel:()=>Ps,XLMRobertaForMaskedLM:()=>KL,XLMRobertaForQuestionAnswering:()=>tI,XLMRobertaForSequenceClassificat
/**
 * Subclass of `on` whose `_call` wraps the parent's raw result in an `ie` instance.
 */
EE = class extends on {
  /**
   * @param {Object} e Inputs forwarded unchanged to the parent `_call`.
   * @returns {Promise<*>} An `ie` built from the parent's resolved output.
   */
  async _call(e) {
    const rawOutputs = await super._call(e);
    return new ie(rawOutputs);
  }
}
/**
 * Speech-generation model built on `hp`. It uses four sessions: `embed_tokens`,
 * `speech_encoder`, the language model (run through `_r`) and `conditional_decoder`.
 */
fp = class extends hp {
  /**
   * Run the `speech_encoder` session on reference audio.
   * @param {*} e Audio values passed as `audio_values`.
   * @returns {Promise<Object>} The session outputs.
   */
  async encode_speech(e) {
    return xe(this.sessions.speech_encoder, { audio_values: e });
  }

  /**
   * Embed the tokens (unless `inputs_embeds` is given), optionally prepend speech
   * conditioning features, build an all-ones attention mask, and run the language model.
   *
   * @param {Object} inputs Named model inputs; unknown keys are accepted and ignored.
   * @returns {Promise<Object>} Language-model outputs merged with the speech-encoder
   *   outputs when those were used in this call.
   * @throws {Error} On an unsupported `exaggeration` type, or when neither conditioning
   *   inputs nor a single-token step with a cache are available.
   */
  async forward({
    input_ids = null,
    attention_mask = null,
    audio_values = null,
    exaggeration = null,
    position_ids = null,
    inputs_embeds = null,
    past_key_values = null,
    generation_config = null,
    logits_processor = null,
    audio_features = null,
    audio_tokens = null,
    speaker_embeddings = null,
    speaker_features = null,
    ...unusedInputs
  }) {
    let speechOutputs;

    if (!inputs_embeds) {
      const embedInputNames = this.sessions.embed_tokens.inputNames;
      const embedInputs = { input_ids };

      if (embedInputNames.includes("exaggeration")) {
        if (!(exaggeration instanceof U)) {
          // Normalise scalar / array / missing values into one entry per batch row.
          const batchSize = input_ids.dims[0];
          if (exaggeration == null) {
            exaggeration = ct([batchSize], 0.5);
          } else if (typeof exaggeration === "number") {
            exaggeration = ct([batchSize], exaggeration);
          } else if (Array.isArray(exaggeration)) {
            exaggeration = new U("float32", exaggeration, [batchSize]);
          } else {
            throw new Error("Unsupported type for `exaggeration` input");
          }
        }
        embedInputs.exaggeration = exaggeration;
      }

      if (embedInputNames.includes("position_ids")) {
        embedInputs.position_ids = position_ids;
      }

      ({ inputs_embeds } = await xe(this.sessions.embed_tokens, embedInputs));

      // Reuse previously computed conditioning only when all four parts are present.
      if (audio_features && audio_tokens && speaker_embeddings && speaker_features) {
        speechOutputs = { audio_features, audio_tokens, speaker_embeddings, speaker_features };
      }

      if (speechOutputs || audio_values) {
        if (speechOutputs == null) {
          speechOutputs = await this.encode_speech(audio_values);
        }
        inputs_embeds = ze([speechOutputs.audio_features, inputs_embeds], 1);
        attention_mask = yt([inputs_embeds.dims[0], inputs_embeds.dims[1]]);
      } else {
        const stepLength = inputs_embeds.dims[1];
        if (!past_key_values || stepLength !== 1) {
          throw new Error("Incorrect state encountered during generation.");
        }
        const cachedLength = past_key_values.get_seq_length();
        attention_mask = yt([inputs_embeds.dims[0], cachedLength + stepLength]);
      }
    }

    const languageModelOutputs = await _r(
      this,
      { inputs_embeds, past_key_values, attention_mask, generation_config, logits_processor },
      false,
    );
    return { ...languageModelOutputs, ...speechOutputs };
  }

  /**
   * Fill in `position_ids` when the embedding session expects them, drop the
   * conditioning inputs on single-token steps, then defer to `an`.
   *
   * @param {Array<Array<bigint>>} e Token ids generated so far, one array per sequence.
   * @param {Object} t Model inputs; mutated in place.
   * @param {Object} r Generation config (not read here).
   * @returns {Object} Result of `an(this, e, t)`.
   */
  prepare_inputs_for_generation(e, t, r) {
    const needsPositionIds =
      !t.position_ids && this.sessions.embed_tokens.inputNames.includes("position_ids");

    if (needsPositionIds) {
      if (t.input_ids.dims[1] === 1) {
        // Single-token step: position = number of tokens after the last `dp` token, minus one.
        // NOTE(review): `dp` (6561n) looks like a start-of-speech marker — confirm.
        const stepPositions = [];
        for (let seq = 0; seq < e.length; seq += 1) {
          const lastMarker = e[seq].findLastIndex((tokenId) => tokenId == dp);
          stepPositions.push(e[seq].length - lastMarker - 1);
        }
        t.position_ids = new U("int64", stepPositions, [e.length, 1]);
      } else {
        // Full prompt: ids >= `dp` get position 0 and do not advance the counter.
        const promptPositions = t.input_ids.tolist().map((rowIds) => {
          let counter = 0;
          return rowIds.map((tokenId) => (tokenId >= dp ? 0 : counter++));
        });
        t.position_ids = new U("int64", promptPositions.flat(), t.input_ids.dims);
      }
    }

    if (t.input_ids.dims[1] === 1) {
      delete t.audio_values;
      delete t.audio_features;
      delete t.audio_tokens;
      delete t.speaker_embeddings;
      delete t.speaker_features;
    }

    return an(this, e, t);
  }

  /**
   * Generate speech tokens with the parent `generate`, then decode them to a waveform.
   *
   * @param {Object} e Generation arguments; `e.input_ids` must be present.
   * @returns {Promise<*>} The `waveform` output of the `conditional_decoder` session.
   */
  async generate(e) {
    const generated = await super.generate({ ...e, return_dict_in_generate: true });
    const { sequences, audio_tokens, speaker_embeddings, speaker_features } = generated;

    // Keep columns from the prompt length up to index -1 (presumably dropping the final token — confirm).
    const promptLength = e.input_ids.dims[1];
    const generatedTokens = sequences.slice(null, [promptLength, -1]);

    // Three trailing columns filled with `n2` (4299n) are appended before decoding.
    const trailingTokens = ct([generatedTokens.dims[0], 3], n2);
    const speechTokens = ze([audio_tokens, generatedTokens, trailingTokens], 1);

    const { waveform } = await xe(this.sessions.conditional_decoder, {
      speech_tokens: speechTokens,
      speaker_features,
      speaker_embeddings,
    });
    return waveform;
  }
}
/**
 * Subclass of `Aa` exposing separate `encode` and `decode` entry points, each backed
 * by its own session.
 */
N2 = class extends Aa {
  /**
   * Run the `encoder_model` session.
   * @param {Object} e Inputs passed straight to the session.
   * @returns {Promise<*>} A `vp` built from the session outputs.
   */
  async encode(e) {
    const encoderOutputs = await xe(this.sessions.encoder_model, e);
    return new vp(encoderOutputs);
  }

  /**
   * Run the `decoder_model` session.
   * @param {Object} e Inputs passed straight to the session.
   * @returns {Promise<*>} A `yp` built from the session outputs.
   */
  async decode(e) {
    const decoderOutputs = await xe(this.sessions.decoder_model, e);
    return new yp(decoderOutputs);
  }
}
super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"decoder_model"})}},bs=class extends P{},D2=class extends bs{},z2=class extends bs{async _call(e){return new Je(await super._call(e))}},B2=class extends bs{async _call(e){return new ie(await super._call(e))}},R2=class extends bs{async _call(e){return new He(await super._call(e))}},G2=class extends bs{async _call(e){return new ht(await super._call(e))}},yl=class extends P{},$2=class extends yl{},V2=class extends yl{},Ms=class extends P{},U2=class extends Ms{},j2=class extends Ms{async _call(e){return new Je(await super._call(e))}},q2=class extends Ms{async _call(e){return new ie(await super._call(e))}},W2=class extends Ms{async _call(e){return new He(await super._call(e))}},H2=class extends Ms{async _call(e){return new ht(await super._call(e))}},xp=class extends P{},Q2=class extends xp{},bl=class extends P{},X2=class extends bl{},Y2=class extends bl{async _call(e){return new ie(await super._call(e))}},kp=class extends P{},J2=class extends kp{},Tp=class extends P{},K2=class extends Tp{},Ca=class extends P{},Z2=class extends Ca{},eA=class extends Ca{async _call(e){return new Ml(await super._call(e))}},tA=class extends Ca{async _call(e){return new Ep(await super._call(e))}},Ml=class extends Ye{constructor({logits:e,pred_boxes:t}){super(),this.logits=e,this.pred_boxes=t}},Ep=class extends Ye{constructor({logits:e,pred_boxes:t,pred_masks:r}){super(),this.logits=e,this.pred_boxes=t,this.pred_masks=r}},xl=class extends P{},rA=class extends xl{},sA=class extends xl{async _call(e){return new ie(await super._call(e))}},kl=class extends P{},nA=class extends kl{},aA=class extends kl{async _call(e){return new ie(await super._call(e))}},Ap=class extends P{},oA=class extends Ap{},Cp=class extends P{},iA=class extends Cp{},xs=class extends P{},lA=class extends xs{},cA=class extends xs{async _call(e){return new ie(await super._call(e))}},uA=class extends xs{async _call(e){return new He(await super._call(e))}},dA=class 
extends xs{async _call(e){return new ht(await super._call(e))}},hA=class extends xs{async _call(e){return new Je(await super._call(e))}},Sp=class extends P{},fA=class extends Sp{},Tl=class extends P{},_A=class extends Tl{},pA=class extends Tl{},El=class extends P{},mA=class extends El{},gA=class extends El{async _call(e){return new ie(await super._call(e))}},ks=class extends P{},wA=class extends ks{},vA=class extends ks{async _call(e){return new Je(await super._call(e))}},yA=class extends ks{async _call(e){return new ie(await super._call(e))}},bA=class extends ks{async _call(e){return new He(await super._call(e))}},MA=class extends ks{async _call(e){return new ht(await super._call(e))}},Al=class extends P{},xA=class extends Al{},kA=class extends Al{},cn=class extends P{},TA=class extends cn{},EA=class extends cn{async _call(e){return new Je(await super._call(e))}},AA=class extends cn{async _call(e){return new ie(await super._call(e))}},CA=class extends cn{async _call(e){return new He(await super._call(e))}},un=class extends P{},SA=class extends un{},PA=class extends un{async _call(e){return new Je(await super._call(e))}},FA=class extends un{async _call(e){return new ie(await super._call(e))}},LA=class extends un{async _call(e){return new He(await super._call(e))}},Cl=class extends P{},IA=class extends Cl{},OA=class extends Cl{},Sl=class extends P{},NA=class extends Sl{},DA=class extends Sl{},Pl=class extends P{},zA=class extends Pl{},BA=class extends Pl{},Fl=class extends P{},RA=class extends Fl{},GA=class extends Fl{async _call(e){return new ie(await super._call(e))}},Pp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","inputs_embeds","attention_mask","pixel_values","encoder_outputs","decoder_input_ids","decoder_inputs_embeds","decoder_attention_mask","past_key_values"]);k(this,"main_input_name","inputs_embeds")}},$A=class extends 
Pp{_merge_input_ids_with_image_features({inputs_embeds:e,image_features:t,input_ids:r,attention_mask:s}){return{inputs_embeds:ze([t,e],1),attention_mask:ze([yt(t.dims.slice(0,2)),s],1)}}async _prepare_inputs_embeds({input_ids:e,pixel_values:t,inputs_embeds:r,attention_mask:s}){if(!e&&!t)throw new Error("Either `input_ids` or `pixel_values` should be provided.");let n,a;return e&&(n=await this.encode_text({input_ids:e})),t&&(a=await this.encode_image({pixel_values:t})),n&&a?{inputs_embeds:r,attention_mask:s}=this._merge_input_ids_with_image_features({inputs_embeds:n,image_features:a,input_ids:e,attention_mask:s}):r=n||a,{inputs_embeds:r,attention_mask:s}}async forward({input_ids:e,pixel_values:t,attention_mask:r,decoder_input_ids:s,decoder_attention_mask:n,encoder_outputs:a,past_key_values:o,inputs_embeds:i,decoder_inputs_embeds:l}){if(i||({inputs_embeds:i,attention_mask:r}=await this._prepare_inputs_embeds({input_ids:e,pixel_values:t,inputs_embeds:i,attention_mask:r})),!a){let{last_hidden_state:d}=await Lr(this,{inputs_embeds:i,attention_mask:r});a=d}if(!l){if(!s)throw new Error("Either `decoder_input_ids` or `decoder_inputs_embeds` should be provided.");l=await this.encode_text({input_ids:s})}return await _r(this,{inputs_embeds:l,attention_mask:n,encoder_attention_mask:r,encoder_hidden_states:a,past_key_values:o},!0)}},Ll=class extends P{},VA=class extends Ll{},UA=class extends Ll{},Il=class extends P{},jA=class extends Il{},qA=class extends Il{},Fp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","pixel_values","position_ids","past_key_values"])}},pr=class extends Fp{_merge_input_ids_with_image_features(e){const t=e.image_features.dims.at(-1),r=e.image_features.view(-1,t);return tl({image_token_id:this.config.image_token_index??this.config.image_token_id,...e,image_features:r})}},WA=class extends pr{},HA=class extends pr{},Lp=class extends P{},QA=class extends Lp{},Ip=class extends pr{},XA=class extends 
Ip{},Op=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","inputs_embeds","per_layer_inputs","position_ids","pixel_values","input_features","input_features_mask","past_key_values"])}},Sa=class extends Op{async forward({input_ids:e=null,attention_mask:t=null,pixel_values:r=null,input_features:s=null,input_features_mask:n=null,position_ids:a=null,inputs_embeds:o=null,per_layer_inputs:i=null,past_key_values:l=null,generation_config:c=null,logits_processor:d=null,...h}){if((!o||!i)&&({inputs_embeds:o,per_layer_inputs:i}=await xe(this.sessions.embed_tokens,{input_ids:e}),e.dims[1]!==1)){if(r){const{image_features:p}=await this._encode_vision({pixel_values:r,...h});({inputs_embeds:o,attention_mask:t}=this._merge_input_ids_with_image_features({image_features:p,inputs_embeds:o,input_ids:e,attention_mask:t}))}if(s){const{audio_features:p}=await xe(this.sessions.audio_encoder,{input_features:s,input_features_mask:n});({inputs_embeds:o,attention_mask:t}=this._merge_input_ids_with_audio_features({audio_features:p,inputs_embeds:o,input_ids:e,attention_mask:t}))}}return await _r(this,{inputs_embeds:o,per_layer_inputs:i,past_key_values:l,attention_mask:t,position_ids:a,generation_config:c,logits_processor:d},!0)}_encode_vision(e){return xe(this.sessions.vision_encoder,{pixel_values:e.pixel_values})}_merge_input_ids_with_image_features(e){const t=e.image_features.dims.at(-1),r=e.image_features.view(-1,t);return tl({image_token_id:this.config.image_token_id,...e,image_features:r})}_merge_input_ids_with_audio_features(e){const t=e.audio_features.dims.at(-1),r=e.audio_features.view(-1,t);return up({audio_token_id:this.config.audio_token_id,...e,audio_features:r})}},YA=class extends Sa{},Ol=class extends 
Sa{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","inputs_embeds","per_layer_inputs","position_ids","pixel_values","image_position_ids","input_features","input_features_mask","past_key_values"])}_encode_vision(t){return xe(this.sessions.vision_encoder,{pixel_values:t.pixel_values,pixel_position_ids:t.image_position_ids})}},JA=class extends Ol{},Nl=class extends P{},KA=class extends Nl{},ZA=class extends Nl{},Dl=class extends P{},eC=class extends Dl{},tC=class extends Dl{},Np=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","position_ids","past_key_values","pixel_values","image_grid_thw"])}},zl=class extends Np{constructor(){super(...arguments);k(this,"image_grid_thw_name","grid_thw")}_get_text_only_rope_index(t,r){if(r){const{data:s,dims:n}=lp(r),a=BigInt64Array.from({length:3*s.length},(i,l)=>s[l%s.length]),o=Array.from({length:n[0]},(i,l)=>je(s.subarray(n[1]*l,n[1]*(l+1)))[0]+1n+BigInt(n[1]));return[new U("int64",a,[3,...n]),new U("int64",o,[o.length,1])]}else{const[s,n]=t.dims,a=BigInt64Array.from({length:3*s*n},(o,i)=>BigInt(Math.floor(i%n/s)));return[new U("int64",a,[3,...t.dims]),Yf([s,1])]}}_reorder_and_write_positions(t,r,s,n){const a=t.reduce((c,d)=>c+d.length,0),o=new Array(a);let i=0;for(let c=0;c<3;++c)for(const d of t){const h=d.length/3;for(let _=c*h;_<(c+1)*h;++_)o[i++]=d[_]}let l=0;for(let c=0;c<r.length;++c)if(r[c]==1){for(let d=0;d<3;++d)s[d][n][c]=o[d*a/3+l];++l}return o}_get_multimodal_rope_positions({filtered_ids:t,image_grid_thw_list:r,video_grid_thw_list:s,spatial_merge_size:n,state:a}){const{image_token_id:o,video_token_id:i,vision_start_token_id:l}=this.config,c=t,h=c.reduce((T,A,C)=>(A==l&&T.push(C),T),[]).map(T=>c[T+1]),_=h.filter(T=>T==o).length,p=h.filter(T=>T==i).length,w=[];let v=0,y=_,M=p;for(let T=0;T<h.length;++T){const 
A=c.findIndex((X,J)=>J>v&&X==o),C=c.findIndex((X,J)=>J>v&&X==i),S=y>0&&A!==-1?A:c.length+1,N=M>0&&C!==-1?C:c.length+1;let x,R,z,$;S<N?([R,z,$]=r[a.image_index],++a.image_index,--y,x=S):([R,z,$]=s[a.video_index],++a.video_index,--M,x=N);const[Q,H,D]=[Number(R),Math.floor(Number(z)/n),Math.floor(Number($)/n)],I=x-v,te=w.length>0?je(w.at(-1))[0]+1:0;w.push(Array.from({length:3*I},(X,J)=>te+J%I));const W=I+te,ee=Q*H*D,G=Array.from({length:ee},(X,J)=>W+Math.floor(J/(H*D))),L=Array.from({length:ee},(X,J)=>W+Math.floor(J/D)%H),V=Array.from({length:ee},(X,J)=>W+J%D);w.push([G,L,V].flat()),v=x+ee}if(v<c.length){const T=w.length>0?je(w.at(-1))[0]+1:0,A=c.length-v;w.push(Array.from({length:3*A},(C,S)=>T+S%A))}return w}get_rope_index(t,r,s,n){const{vision_config:a}=this.config,o=a.spatial_merge_size??2;if(r||s){const i=t.tolist();n||(n=Xf(t));const l=n.tolist(),c=Array.from({length:3},()=>Array.from({length:t.dims[0]},()=>Array.from({length:t.dims[1]},()=>0))),d=r?r.tolist():[],h=s?s.tolist():[],_={image_index:0,video_index:0},p=[];for(let w=0;w<i.length;++w){const v=i[w].filter((T,A)=>l[w][A]==1),y=this._get_multimodal_rope_positions({filtered_ids:v,image_grid_thw_list:d,video_grid_thw_list:h,spatial_merge_size:o,state:_}),M=this._reorder_and_write_positions(y,l[w],c,w);p.push(je(M)[0]+1-i[w].length)}return[new U("int64",c.flat(1/0),[3,t.dims[0],t.dims[1]]),new U("int64",p,[p.length,1])]}else return this._get_text_only_rope_index(t,n)}async encode_image({pixel_values:t,image_grid_thw:r}){return(await xe(this.sessions.vision_encoder,{pixel_values:t,[this.image_grid_thw_name]:r})).image_features}_merge_input_ids_with_image_features(t){return tl({image_token_id:this.config.image_token_id,...t})}prepare_inputs_for_generation(t,r,s){if(!r.attention_mask||r.position_ids||!(this.sessions.decoder_model_merged??this.sessions.model).inputNames.includes("position_ids"))return 
r;if(!r.past_key_values)[r.position_ids,r.rope_deltas]=this.get_rope_index(r.input_ids,r.image_grid_thw,r.video_grid_thw,r.attention_mask);else{r.pixel_values=null;const a=r.past_key_values.get_seq_length();if(a<r.input_ids.dims[1]){const[o,i]=this.get_rope_index(r.input_ids,r.image_grid_thw,r.video_grid_thw,r.attention_mask);r.rope_deltas=i,r.position_ids=o.slice(null,null,[a,null]),r.input_ids=r.input_ids.slice(null,[a,null])}else{r.rope_deltas||([,r.rope_deltas]=this.get_rope_index(r.input_ids,r.image_grid_thw,r.video_grid_thw,r.attention_mask));const o=BigInt(a),i=r.rope_deltas.map(l=>o+l);r.position_ids=ur([i,i,i],0)}}return r}},Dp=class extends zl{},Bl=class extends zl{constructor(){super(...arguments);k(this,"image_grid_thw_name","image_grid_thw")}},zp=class extends Dp{constructor(){super(...arguments);k(this,"image_grid_thw_name","image_grid_thw")}},rC=class extends Bl{get_vision_position_ids(e,t,r,s){const n=Math.floor(t[0]/r),a=Math.floor(t[1]/s),o=Math.floor(t[2]/s),i=a*o*n,l=Array.from({length:i},()=>e),c=Array.from({length:i},(h,_)=>e+Math.floor(_/(o*n))),d=Array.from({length:i},(h,_)=>e+_%o);return[...l,...c,...d]}_get_multimodal_rope_positions({filtered_ids:e,image_grid_thw_list:t,video_grid_thw_list:r,spatial_merge_size:s,state:n}){const{image_token_id:a}=this.config,o=[];let i=0,l=e[0]==a?1:0;for(let h=1;h<=e.length;++h){const _=h<e.length?e[h]==a?1:0:-1;_!==l&&(o.push([l,i,h]),i=h,l=_)}let c=0;const d=[];for(const[h,_,p]of o)if(h===0){const w=p-_;d.push(Array.from({length:3*w},(v,y)=>c+y%w)),c+=w}else{const w=t[n.image_index++].map(Number),v=w[0];d.push(this.get_vision_position_ids(c,w,v,s)),c+=Math.max(w[1],w[2])/s}return d}},Rl=class extends P{},sC=class extends Rl{},nC=class extends Rl{},Gl=class extends P{},aC=class extends Gl{},oC=class extends Gl{},$l=class extends P{},iC=class extends $l{},lC=class extends $l{},Vl=class extends P{},cC=class extends Vl{},uC=class extends Vl{},Ul=class extends P{},dC=class extends Ul{},hC=class extends 
Ul{},jl=class extends P{},fC=class extends jl{},_C=class extends jl{},ql=class extends P{},pC=class extends ql{},mC=class extends ql{},Wl=class extends P{},gC=class extends Wl{},wC=class extends Wl{},Hl=class extends P{},vC=class extends Hl{},yC=class extends Hl{},Bp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","position_ids","audio_values","past_key_values"])}},Ql=class extends Bp{_merge_input_ids_with_audio_features(e){const t=e.audio_features.dims.at(-1),r=e.audio_features.view(-1,t);return up({audio_token_id:this.config.ignore_index??this.config.audio_token_id??this.config.audio_token_index,...e,audio_features:r})}},bC=class extends Ql{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","input_features","past_key_values"])}},Rp=class extends P{},MC=class extends Rp{},Gp=class extends P{},xC=class extends Gp{},Xl=class extends P{},kC=class extends Xl{},TC=class extends Xl{},Yl=class extends P{},EC=class extends Yl{},AC=class extends Yl{async _call(e){return new ie(await super._call(e))}},Ir=class extends P{},CC=class extends Ir{},SC=class extends Ir{async _call(e){return new Xr(await super._call(e))}},PC=class extends Ir{async _call(e){return new ie(await super._call(e))}},FC=class extends Ir{async _call(e){return new He(await super._call(e))}},LC=class extends P{},IC=class extends Ir{},OC=class extends Ir{async _call(e){return new Xr(await super._call(e))}},NC=class extends Ir{async _call(e){return new ie(await super._call(e))}},Jl=class extends P{},DC=class extends Jl{},zC=class extends Jl{},$p=class extends pr{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","pixel_values","pixel_attention_mask","position_ids","past_key_values"])}},Kl=class extends P{},BC=class extends Kl{},RC=class extends Kl{async _call(e){return new ie(await super._call(e))}},Zl=class extends P{},GC=class extends Zl{},$C=class extends Zl{},Pa=class extends 
P{},VC=class extends Pa{async forward(e){const t=!e.input_ids,r=!e.pixel_values;if(t&&r)throw new Error("Either `input_ids` or `pixel_values` should be provided.");if(t&&(e.input_ids=yt([e.pixel_values.dims[0],1])),r){const{image_size:l}=this.config.vision_config;e.pixel_values=ct([0,3,l,l],0)}const{text_embeddings:s,image_embeddings:n,l2norm_text_embeddings:a,l2norm_image_embeddings:o}=await super.forward(e),i={};return t||(i.text_embeddings=s,i.l2norm_text_embeddings=a),r||(i.image_embeddings=n,i.l2norm_image_embeddings=o),i}},Vp=class extends Pa{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},UC=class extends Pa{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"vision_model"})}},ec=class extends P{},jC=class extends ec{},qC=class extends ec{},WC=class extends pr{},tc=class extends P{},HC=class extends tc{},QC=class extends tc{},XC=class extends pr{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","pixel_values","pixel_attention_mask","spatial_shapes","position_ids","past_key_values"])}},rc=class extends P{},YC=class extends rc{},JC=class extends rc{},Up=class extends P{},KC=class extends Up{},sc=class extends P{},ZC=class extends sc{},eS=class extends sc{},nc=class extends P{},tS=class extends nc{},rS=class extends nc{},ac=class extends P{},sS=class extends ac{},nS=class extends ac{},oc=class extends P{},aS=class extends oc{},oS=class extends oc{},dn=class extends P{},iS=class extends dn{},lS=class extends dn{},cS=class extends dn{async _call(e){return new ie(await super._call(e))}},uS=class extends dn{},jp=class extends P{},dS=class extends jp{},qp=class extends P{},hS=class extends qp{},Wp=class extends Ye{constructor({char_logits:e,bpe_logits:t,wp_logits:r}){super(),this.char_logits=e,this.bpe_logits=t,this.wp_logits=r}get 
logits(){return[this.char_logits,this.bpe_logits,this.wp_logits]}},Hp=class extends P{},fS=class extends Hp{async _call(e){return new Wp(await super._call(e))}},Qp=class extends Ye{constructor({audio_codes:e}){super(),this.audio_codes=e}},Xp=class extends Ye{constructor({audio_values:e}){super(),this.audio_values=e}},Fa=class extends P{constructor(){super(...arguments);k(this,"main_input_name","input_values");k(this,"forward_params",["input_values"])}},_S=class extends Fa{async encode(e){return new Qp(await xe(this.sessions.encoder_model,e))}async decode(e){return new Xp(await xe(this.sessions.decoder_model,e))}},Yp=class extends Fa{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"encoder_model"})}},Jp=class extends Fa{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"decoder_model"})}},ic=class extends P{},pS=class extends ic{},mS=class extends ic{},lc=class extends P{},gS=class extends lc{},wS=class extends lc{},hn=class extends P{},vS=class extends hn{},yS=class extends hn{async _call(e){return new Je(await super._call(e))}},bS=class extends hn{async _call(e){return new ie(await super._call(e))}},MS=class extends hn{async _call(e){return new ht(await super._call(e))}},cc=class extends P{},xS=class extends cc{},kS=class extends cc{},La=class extends P{},TS=class extends La{},ES=class extends La{async _call(e){return new ie(await super._call(e))}},AS=class extends La{},Ia=class extends P{},CS=class extends Ia{},SS=class extends Ia{async _call(e){return new ie(await super._call(e))}},PS=class extends Ia{},Oa=class extends P{},FS=class extends Oa{},LS=class extends Oa{async _call(e){return new ie(await super._call(e))}},IS=class extends Oa{},Na=class extends P{},OS=class extends Na{},NS=class extends Na{async _call(e){return new ie(await super._call(e))}},DS=class extends Na{},uc=class extends P{},zS=class extends uc{},BS=class extends uc{async 
_call(e){return new ie(await super._call(e))}},dc=class extends P{},RS=class extends dc{},GS=class extends dc{async _call(e){return new ie(await super._call(e))}},fn=class extends P{},$S=class extends fn{},VS=class extends fn{async _call(e){return new Je(await super._call(e))}},US=class extends fn{async _call(e){return new ie(await super._call(e))}},jS=class extends fn{async _call(e){return new He(await super._call(e))}},hc=class extends P{},qS=class extends hc{},WS=class extends hc{},fc=class extends P{constructor(){super(...arguments);k(this,"requires_attention_mask",!1);k(this,"main_input_name","input_values");k(this,"forward_params",["input_values","decoder_input_ids","past_key_values"])}},HS=class extends fc{},QS=class extends fc{},Ts=class extends P{},XS=class extends Ts{},YS=class extends Ts{async _call(e){return new Je(await super._call(e))}},JS=class extends Ts{async _call(e){return new ie(await super._call(e))}},KS=class extends Ts{async _call(e){return new He(await super._call(e))}},ZS=class extends Ts{async _call(e){return new ht(await super._call(e))}},_c=class extends P{},eP=class extends _c{},tP=class extends _c{},pc=class extends P{},rP=class extends pc{},sP=class extends pc{},Kp=class extends P{},nP=class extends Kp{constructor(...t){super(...t);k(this,"forward_params",["input_ids","pixel_values","images_seq_mask","images_emb_mask","attention_mask","position_ids","past_key_values"]);this._generation_mode="text"}async forward(t){const r=this._generation_mode??"text";let s;if(r==="text"||!t.past_key_values){const l=this.sessions.prepare_inputs_embeds,c=rt(t,l.inputNames);s=await xe(l,c)}else{const l=this.sessions.gen_img_embeds,c=rt({image_ids:t.input_ids},l.inputNames);s=await xe(l,c)}const n={...t,...s},a=await _r(this,n),o=this.sessions[r==="text"?"lm_head":"gen_head"];if(!o)throw new Error(`Unable to find "${o}" generation head`);const i=await xe(o,rt(a,o.inputNames));return{...s,...a,...i}}prepare_inputs_for_generation(t,r,s){const 
/* nP.prepare_inputs_for_generation (continued): n is true once a KV cache exists. With guidance_scale > 1 the batch is doubled: on cached steps input_ids is concatenated with itself; on the first step a second copy built by Fi(input_ids, pad_token_id) is appended and the attention mask gets a Fi(mask, 0n) counterpart. On cached steps, or when no pixel_values were supplied, pixel_values is replaced by ct([0,0,3,384,384],1); on cached steps images_seq_mask becomes a single false of dims [1,1] and images_emb_mask an empty tensor of dims [1,1,0]. Returns the mutated r. */n=!!r.past_key_values;return s.guidance_scale!==null&&s.guidance_scale>1&&(n?r.input_ids=ze([r.input_ids,r.input_ids],0):(r.input_ids=ze([r.input_ids,Fi(r.input_ids,BigInt(s.pad_token_id))],0),r.attention_mask=ze([r.attention_mask,Fi(r.attention_mask,0n)],0))),(n||!r.pixel_values)&&(r.pixel_values=ct([0,0,3,384,384],1)),n&&(r.images_seq_mask=new U("bool",new Array(1).fill(!0).fill(!1,0,1),[1,1]),r.images_emb_mask=new U("bool",new Array(0).fill(!1),[1,1,0])),r}/* generate: forces text mode, then defers to the parent generate. */async generate(t){return this._generation_mode="text",super.generate(t)}/* generate_images: switches to image mode (the mode is not reset afterwards), runs the parent generate, drops the first r positions (r = dims[1] of t.inputs or t[main_input_name]) with slice(null,[r,null]), decodes the rest through the image_decode session, maps decoded_image in place via add_(1).mul_(255/2).clamp_(0,255), casts to uint8 and returns one jt.fromTensor(...) per item. */async generate_images(t){this._generation_mode="image";const r=(t.inputs??t[this.main_input_name]).dims[1],n=(await super.generate(t)).slice(null,[r,null]),a=this.sessions.image_decode,{decoded_image:o}=await xe(a,{generated_tokens:n}),i=o.add_(1).mul_(255/2).clamp_(0,255).to("uint8"),l=[];for(const c of i){const d=jt.fromTensor(c);l.push(d)}return l}},mc=class extends P{},aP=class extends mc{},oP=class extends mc{},/* Zp: model whose generate() filters the generated codes and decodes them to audio through the encodec_decode session. */Zp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"])}/* Compacts t.data IN PLACE: entries equal to config.decoder.pad_token_id are skipped, and an entry is kept only when (column - row % num_codebooks) lies in (0, s - num_codebooks], where [r, s] = t.dims. Returns a tensor of t.type over the kept prefix with dims [floor(r/n), n, kept/(floor(r/n)*n)], n = num_codebooks. */_apply_and_filter_by_delay_pattern_mask(t){const[r,s]=t.dims,n=this.config.decoder.num_codebooks,a=s-n;let o=0;for(let c=0;c<t.size;++c){if(t.data[c]==this.config.decoder.pad_token_id)continue;const d=c%s,h=Math.floor(c/s)%n,_=d-h;_>0&&_<=a&&(t.data[o++]=t.data[c])}const i=Math.floor(r/n),l=o/(i*n);return new U(t.type,t.data.slice(0,o),[i,n,l])}/* Works on a structuredClone of t so the caller's rows are untouched: in row o every position i <= o % num_codebooks is overwritten with the pad token id (as BigInt); with guidance_scale > 1 the rows are duplicated; the result is handed to xa(this, rows, r). */prepare_inputs_for_generation(t,r,s){const n=BigInt(this.config.decoder.pad_token_id);let a=structuredClone(t);for(let o=0;o<a.length;++o)for(let i=0;i<a[o].length;++i)o%this.config.decoder.num_codebooks>=i&&(a[o][i]=n);return s.guidance_scale!==null&&s.guidance_scale>1&&(a=a.concat(a)),xa(this,a,r)}/* generate: parent generate, then delay-pattern filtering, a leading axis added with unsqueeze_(0), and the audio_values output of the encodec_decode session is returned. */async generate(t){const r=await super.generate(t),s=this._apply_and_filter_by_delay_pattern_mask(r).unsqueeze_(0),{audio_values:n}=await xe(this.sessions.encodec_decode,{audio_codes:s});return n}},gc=class 
extends P{},iP=class extends gc{},lP=class extends gc{},wc=class extends P{},cP=class extends wc{},uP=class extends wc{},Es=class extends P{},dP=class extends Es{},hP=class extends Es{async _call(e){return new Je(await super._call(e))}},fP=class extends Es{async _call(e){return new ie(await super._call(e))}},_P=class extends Es{async _call(e){return new He(await super._call(e))}},pP=class extends Es{async _call(e){return new ht(await super._call(e))}},em=class extends P{},mP=class extends em{},vc=class extends P{},gP=class extends vc{},wP=class extends vc{},yc=class extends P{},vP=class extends yc{},yP=class extends yc{},bc=class extends P{},bP=class extends bc{},MP=class extends bc{},Mc=class extends P{},xP=class extends Mc{},kP=class extends Mc{},xc=class extends P{},TP=class extends xc{},EP=class extends xc{},kc=class extends P{},AP=class extends kc{},CP=class extends kc{},Tc=class extends P{},SP=class extends Tc{},PP=class extends Tc{},Ec=class extends P{},FP=class extends Ec{},LP=class extends Ec{},IP=class extends pr{},tm=class extends P{},OP=class extends tm{async _call(e){return new Xr(await super._call(e))}},Ac=class extends P{},NP=class extends Ac{},DP=class extends Ac{},Cc=class extends P{},zP=class extends Cc{},BP=class extends Cc{},Sc=class extends P{},RP=class extends Sc{},GP=class extends Sc{},Pc=class extends P{},$P=class extends Pc{},VP=class extends Pc{},rm=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","inputs_embeds","attention_mask","position_ids","pixel_values","image_sizes","past_key_values"])}},sm=class extends rm{async forward({input_ids:e=null,attention_mask:t=null,pixel_values:r=null,image_sizes:s=null,position_ids:n=null,inputs_embeds:a=null,past_key_values:o=null,generation_config:i=null,logits_processor:l=null,...c}){if(!a){let h;if(r&&e.dims[1]!==1){if(!s)throw new Error("`image_sizes` must be provided when `pixel_values` is provided.");({image_features:h}=await 
xe(this.sessions.vision_encoder,{pixel_values:r,image_sizes:s}))}else{const _=this.config.normalized_config.hidden_size;h=new U("float32",[],[0,_])}({inputs_embeds:a}=await xe(this.sessions.prepare_inputs_embeds,{input_ids:e,image_features:h}))}return await _r(this,{inputs_embeds:a,past_key_values:o,attention_mask:t,position_ids:n,generation_config:i,logits_processor:l},!1)}},Fc=class extends P{},UP=class extends Fc{},jP=class extends Fc{async _call(e){return new ie(await super._call(e))}},Lc=class extends P{},qP=class extends Lc{},WP=class extends Lc{async _call(e){return new He(await super._call(e))}},Ic=class extends P{},HP=class extends Ic{},QP=class extends Ic{},Oc=class extends P{},XP=class extends Oc{},YP=class extends Oc{},Nc=class extends P{},JP=class extends Nc{},KP=class extends Nc{},Dc=class extends P{},ZP=class extends Dc{},eF=class extends Dc{},zc=class extends P{},tF=class extends zc{},rF=class extends zc{},Bc=class extends Bl{},nm=class extends zp{},sF=class extends Bc{},nF=class extends nm{},Rc=class extends Bc{},am=class extends Rc{},aF=class extends Rc{},oF=class extends am{},Gc=class extends P{},iF=class extends Gc{},lF=class extends Gc{async _call(e){return new ie(await super._call(e))}},$c=class extends P{},cF=class extends $c{},uF=class extends $c{async _call(e){return new om(await super._call(e))}},om=class extends ln{},As=class extends P{},dF=class extends As{},hF=class extends As{async _call(e){return new Je(await super._call(e))}},fF=class extends As{async _call(e){return new ie(await super._call(e))}},_F=class extends As{async _call(e){return new He(await super._call(e))}},pF=class extends As{async _call(e){return new ht(await super._call(e))}},Cs=class extends P{},mF=class extends Cs{},gF=class extends Cs{async _call(e){return new Je(await super._call(e))}},wF=class extends Cs{async _call(e){return new ie(await super._call(e))}},vF=class extends Cs{async _call(e){return new He(await super._call(e))}},yF=class extends Cs{async 
_call(e){return new ht(await super._call(e))}},Vc=class extends P{},bF=class extends Vc{},MF=class extends Vc{async _call(e){return new im(await super._call(e))}},im=class extends ln{},lm=class extends Ye{constructor({iou_scores:e,pred_masks:t}){super(),this.iou_scores=e,this.pred_masks=t}},cm=class extends P{},xF=class extends cm{async get_image_embeddings({pixel_values:e}){return await Lr(this,{pixel_values:e})}async forward(e){!e.image_embeddings||!e.image_positional_embeddings?e={...e,...await this.get_image_embeddings(e)}:e={...e},e.input_labels??(e.input_labels=yt(e.input_points.dims.slice(0,-1)));const t={image_embeddings:e.image_embeddings,image_positional_embeddings:e.image_positional_embeddings};return e.input_points&&(t.input_points=e.input_points),e.input_labels&&(t.input_labels=e.input_labels),e.input_boxes&&(t.input_boxes=e.input_boxes),await xe(this.sessions.prompt_encoder_mask_decoder,t)}async _call(e){return new lm(await super._call(e))}},um=class extends Ye{constructor({iou_scores:e,pred_masks:t,object_score_logits:r}){super(),this.iou_scores=e,this.pred_masks=t,this.object_score_logits=r}},dm=class extends P{},Uc=class extends dm{async get_image_embeddings({pixel_values:e}){return await Lr(this,{pixel_values:e})}async forward(e){const{num_feature_levels:t}=this.config.vision_config;if(Array.from({length:t},(a,o)=>`image_embeddings.${o}`).some(a=>!e[a])?e={...e,...await this.get_image_embeddings(e)}:e={...e},e.input_points){if(e.input_boxes&&e.input_boxes.dims[1]!==1)throw new Error("When both `input_points` and `input_boxes` are provided, the number of boxes per image must be 1.");const a=e.input_points.dims;e.input_labels??(e.input_labels=yt(a.slice(0,-1))),e.input_boxes??(e.input_boxes=ct([a[0],0,4],0))}else if(e.input_boxes){const a=e.input_boxes.dims;e.input_labels=ct([a[0],a[1],0],-1n),e.input_points=ct([a[0],1,0,2],0)}else throw new Error("At least one of `input_points` or `input_boxes` must be provided.");const 
s=this.sessions.prompt_encoder_mask_decoder,n=rt(e,s.inputNames);return await xe(s,n)}async _call(e){return new um(await super._call(e))}},kF=class extends Uc{},TF=class extends Uc{},Da=class extends P{},EF=class extends Da{},AF=class extends Da{},CF=class extends Da{},za=class extends P{},SF=class extends za{},PF=class extends za{},FF=class extends za{},jc=class extends P{},LF=class extends jc{},hm=class extends jc{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},IF=class extends Yr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"vision_model"})}},qc=class extends P{},OF=class extends qc{},NF=class extends qc{},DF=class extends $p{},Ba=class extends P{constructor(){super(...arguments);k(this,"main_input_name","input_values");k(this,"forward_params",["input_values"])}},zF=class extends Ba{async encode(e){return await xe(this.sessions.encoder_model,e)}async decode(e){return await xe(this.sessions.decoder_model,e)}},fm=class extends Ba{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"encoder_model"})}},_m=class extends Ba{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"decoder_model"})}},Wc=class extends P{},BF=class extends Wc{},RF=class extends Wc{},Ra=class extends P{},GF=class extends Ra{},$F=class extends Ra{},VF=class extends Ra{async generate_speech(e,t,{threshold:r=.5,minlenratio:s=0,maxlenratio:n=20,vocoder:a=null}={}){const o={input_ids:e},{encoder_outputs:i,encoder_attention_mask:l}=await Lr(this,o),c=i.dims[1]/this.config.reduction_factor,d=Math.floor(c*n),h=Math.floor(c*s),_=this.config.num_mel_bins;let p=[],w=null,v=null,y=0;for(;;){++y;const A=np(!!v);let C;v?C=v.output_sequence_out:C=new U("float32",new Float32Array(_),[1,1,_]);let 
S={use_cache_branch:A,output_sequence:C,encoder_attention_mask:l,speaker_embeddings:t,encoder_hidden_states:i};el(this,S,w),v=await xe(this.sessions.decoder_model_merged,S),w=Zi(v,w);const{prob:N,spectrum:x}=v;if(p.push(x),y>=h&&(Array.from(N.data).filter(R=>R>=r).length>0||y>=d))break}const M=ze(p),{waveform:T}=await xe(a.sessions.model,{spectrogram:M});return{spectrogram:M,waveform:T}}},UF=class extends P{constructor(){super(...arguments);k(this,"main_input_name","spectrogram")}},_n=class extends P{},jF=class extends _n{},qF=class extends _n{async _call(e){return new Je(await super._call(e))}},WF=class extends _n{async _call(e){return new ie(await super._call(e))}},HF=class extends _n{async _call(e){return new ht(await super._call(e))}},Hc=class extends P{},QF=class extends Hc{},XF=class extends Hc{},Qc=class extends P{},YF=class extends Qc{},JF=class extends Qc{},pm=class extends P{},KF=class extends pm{},mm=class extends P{},gm=class extends mm{async generate_speech({input_ids:e,attention_mask:t,style:r,num_inference_steps:s=5,speed:n=1.05}){const{sampling_rate:a,chunk_compress_factor:o,base_chunk_size:i,latent_dim:l}=this.config,{last_hidden_state:c,durations:d}=await xe(this.sessions.text_encoder,{input_ids:e,attention_mask:t,style:r}),h=d.div(n).mul_(a),_=i*o,p=h.data,w=Int32Array.from(p,z=>Math.ceil(z/_)),v=Math.max(...w),y=e.dims[0],M=new BigInt64Array(y*v);for(let z=0;z<y;++z)M.fill(1n,z*v,z*v+w[z]);const T=new U("int64",M,[y,v]),A=l*o,C=A*v;let S=Ax([y,A,v]);const N=S.data;for(let z=0;z<y;++z)if(w[z]!==v)for(let $=0;$<A;++$)N.fill(0,z*C+$*v+w[z],z*C+($+1)*v);const x=ct([y],s);for(let z=0;z<s;++z){const $=ct([y],z);({denoised_latents:S}=await xe(this.sessions.latent_denoiser,{style:r,noisy_latents:S,latent_mask:T,encoder_outputs:c,attention_mask:t,timestep:$,num_inference_steps:x}))}const{waveform:R}=await xe(this.sessions.voice_decoder,{latents:S});return{waveform:R,durations:h}}},Ga=class extends P{},ZF=class extends Ga{},eL=class extends Ga{async 
_call(e){return new ie(await super._call(e))}},tL=class extends Ga{},Xc=class extends P{},rL=class extends Xc{},sL=class extends Xc{},Yc=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"])}},nL=class extends Yc{},aL=class extends Yc{},Jc=class extends P{},oL=class extends Jc{},iL=class extends Jc{async _call(e){return new wm(await super._call(e))}},wm=class extends Ml{},vm=class extends P{},lL=class extends vm{},$a=class extends P{},cL=class extends $a{},uL=class extends $a{async _call(e){return new Xr(await super._call(e))}},dL=class extends $a{async _call(e){return new ie(await super._call(e))}},pn=class extends P{},hL=class extends pn{},fL=class extends pn{async _call(e){return new Xr(await super._call(e))}},_L=class extends pn{async _call(e){return new ie(await super._call(e))}},pL=class extends pn{async _call(e){return new He(await super._call(e))}},Kc=class extends P{},mL=class extends Kc{},gL=class extends Kc{},wL=class extends P{constructor(){super(...arguments);k(this,"main_input_name","pixel_values");k(this,"forward_params",["pixel_values","decoder_input_ids","encoder_hidden_states","past_key_values"])}},Zc=class extends P{},vL=class extends Zc{},yL=class extends Zc{async _call(e){return new ie(await super._call(e))}},ym=class extends P{},bL=class extends ym{},eu=class extends P{},ML=class extends eu{},xL=class extends eu{async _call(e){return new ie(await super._call(e))}},bm=class extends P{},kL=class extends bm{async _call(e){return new JT(await super._call(e))}},Mm=class extends P{},TL=class extends Mm{},xm=class extends Ye{constructor({waveform:e,spectrogram:t}){super(),this.waveform=e,this.spectrogram=t}},km=class extends P{},EL=class extends km{async _call(e){return new xm(await super._call(e))}},AL=class extends Ql{},Tm=2,CL=1,tu=new WeakMap;function SL(e,t){var 
w,v,y;const{text_config:r,audio_config:s}=e.config,n=e.sessions.audio_encoder,{num_mel_bins:a,hidden_size:o}=s,i=a+o,l=new Yi,c=((w=n==null?void 0:n.config)==null?void 0:w.kv_cache_dtype)??"float32",d=c==="float16"?ds.float16:ds.float32,h=va(s,{batch_size:1});for(const M in h){const T=h[M].reduce((A,C)=>A*C,1);l[M]=new U(c,new d(T),h[M])}const _=new U(c,new d(i*Tm),[1,i,Tm]),p=((v=t[Symbol.asyncIterator])==null?void 0:v.call(t))??((y=t[Symbol.iterator])==null?void 0:y.call(t));if(!p)throw new Error("input_features must be iterable or async iterable");return{encoder_session:n,enc_kv_cache:l,enc_padding_cache:_,enc_past_seq_len:0,audio_embed_queue:[],audio_embed_total_tokens:0,audio_queue_offset:0,audio_consumed:0,stream_exhausted:!1,chunks_iter:p,text_hidden_size:r.hidden_size}}async function PL(e,t){const r=t.dims[2],s=Math.floor((CL+r-3)/2)+1,n=new U("int64",BigInt64Array.from({length:s},(d,h)=>BigInt(e.enc_past_seq_len+h)),[1,s]),a=e.enc_past_seq_len+s,o=yt([1,a]),{audio_embeds:i,present_padding_cache:l,...c}=await xe(e.encoder_session,{input_features:t,attention_mask:o,position_ids:n,past_padding_cache:e.enc_padding_cache,...e.enc_kv_cache});e.enc_padding_cache.location==="gpu-buffer"&&e.enc_padding_cache.dispose(),e.enc_padding_cache=l;for(const d in c)if(d.startsWith("present.")){const h=d.replace("present","past_key_values"),_=e.enc_kv_cache[h];(_==null?void 0:_.location)==="gpu-buffer"&&_.dispose(),e.enc_kv_cache[h]=c[d]}return e.enc_past_seq_len=a,i}async function FL(e,t){for(;e.audio_embed_total_tokens<t&&!e.stream_exhausted;){const r=await e.chunks_iter.next();if(r.done){e.stream_exhausted=!0;break}const s=await PL(e,r.value);e.audio_embed_queue.push({data:s.data,tokens:s.dims[1]}),e.audio_embed_total_tokens+=s.dims[1]}}function LL(e,t,r){if(e.audio_embed_queue.length===0)return;const s=t.data;let n=0,a=r;for(;a>0&&e.audio_embed_queue.length>0;){const 
o=e.audio_embed_queue[0],i=o.tokens-e.audio_queue_offset,l=Math.min(a,i),c=e.audio_queue_offset*e.text_hidden_size;for(let d=0;d<l*e.text_hidden_size;++d)s[n*e.text_hidden_size+d]+=o.data[c+d];n+=l,a-=l,e.audio_queue_offset+=l,e.audio_queue_offset>=o.tokens&&(e.audio_embed_queue.shift(),e.audio_queue_offset=0)}e.audio_consumed+=r-a}var IL=class extends ya{constructor(e){super(),this._s=e}_call(e){const t=this._s.stream_exhausted&&this._s.audio_embed_queue.length===0;return e.map(()=>t)}},Em=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","position_ids","past_key_values"])}},Am=class extends Em{async forward({input_ids:e,past_key_values:t,...r}){const s=e.dims[1],n=tu.get(this);n&&await FL(n,n.audio_consumed+s);const{inputs_embeds:a}=await xe(this.sessions.embed_tokens,{input_ids:e});n&&LL(n,a,s);const o={inputs_embeds:a,...r};el(this,o,t);const i=this.sessions.decoder_model_merged,l=rt(o,i.inputNames);return await xe(i,l)}async generate({input_features:e,stopping_criteria:t,...r}){if(!e)throw new Error("input_features (generator/iterable) must be provided");const s=SL(this,e);tu.set(this,s);const n=new ep;n.push(new IL(s)),t&&n.extend(t);try{return await super.generate({...r,stopping_criteria:n})}finally{s.enc_kv_cache.dispose(),tu.delete(this)}}},Va=class extends P{},OL=class extends Va{},NL=class extends Va{async _call(e){return new Xr(await super._call(e))}},DL=class extends Va{async _call(e){return new ie(await super._call(e))}},Cm=class extends Ye{constructor({logits:e,embeddings:t}){super(),this.logits=e,this.embeddings=t}},Ss=class extends P{},zL=class extends Ss{},BL=class extends Ss{async _call(e){return new Xr(await super._call(e))}},RL=class extends Ss{async _call(e){return new ie(await super._call(e))}},GL=class extends Ss{async _call(e){return new Cm(await super._call(e))}},$L=class extends Ss{async _call(e){return new He(await super._call(e))}},Sm=class extends P{},VL=class extends Sm{},UL=class 
extends Z_{constructor(){super(...arguments);k(this,"return_timestamps",null);k(this,"return_token_timestamps",null);k(this,"num_frames",null);k(this,"alignment_heads",null);k(this,"task",null);k(this,"language",null);k(this,"no_timestamps_token_id",null);k(this,"prompt_ids",null);k(this,"is_multilingual",null);k(this,"lang_to_id",null);k(this,"task_to_id",null);k(this,"max_initial_timestamp_index",1)}},ru=class extends P{constructor(){super(...arguments);k(this,"requires_attention_mask",!1);k(this,"main_input_name","input_features");k(this,"forward_params",["input_features","attention_mask","decoder_input_ids","decoder_attention_mask","past_key_values"])}},jL=class extends ru{},Pm=class extends ru{_prepare_generation_config(e,t){return super._prepare_generation_config(e,t,UL)}_retrieve_init_tokens(e){const t=[e.decoder_start_token_id];let r=e.language;const s=e.task;if(e.is_multilingual){r||(ue.warn("No language specified - defaulting to English (en)."),r="en");const a=`<|${M1(r)}|>`;t.push(e.lang_to_id[a]),t.push(e.task_to_id[s??"transcribe"])}else if(r||s)throw new Error("Cannot specify `task` or `language` for an English-only model. 
If the model is intended to be multilingual, pass `is_multilingual=true` to generate, or update the generation config.");return!e.return_timestamps&&e.no_timestamps_token_id&&t.at(-1)!==e.no_timestamps_token_id?t.push(e.no_timestamps_token_id):e.return_timestamps&&t.at(-1)===e.no_timestamps_token_id&&(ue.warn("<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`."),t.pop()),t.filter(n=>n!=null)}async generate({inputs:e=null,generation_config:t=null,logits_processor:r=null,stopping_criteria:s=null,...n}){t=this._prepare_generation_config(t,n);const a=n.decoder_input_ids instanceof U?Li(n.decoder_input_ids):n.decoder_input_ids??this._retrieve_init_tokens(t);if(t.return_timestamps&&(r??(r=new Xi),r.push(new rE(t,a))),t.begin_suppress_tokens&&(r??(r=new Xi),r.push(new K_(t.begin_suppress_tokens,a.length))),t.return_token_timestamps){if(!t.alignment_heads)throw new Error("Model generation config has no `alignment_heads`, token-level timestamps not available. 
See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config.");t.task==="translate"&&ue.warn("Token-level timestamps may not be reliable for task 'translate'."),t.output_attentions=!0,t.return_dict_in_generate=!0}if(t.return_timestamps&&!n.max_new_tokens)return this._generate_with_seek({inputs:e,generation_config:t,logits_processor:r,init_tokens:a,kwargs:n});const o=await super.generate({inputs:e,generation_config:t,logits_processor:r,decoder_input_ids:a,...n});return t.return_token_timestamps&&(o.token_timestamps=this._extract_token_timestamps(o,t.alignment_heads,t.num_frames,.02,a.length)),o}async _generate_with_seek({inputs:e,generation_config:t,logits_processor:r,init_tokens:s,kwargs:n}){const a=t.no_timestamps_token_id+1,o=Array.isArray(t.eos_token_id)?t.eos_token_id[0]:t.eos_token_id,i=t.return_token_timestamps,l=e,c=l.dims[2],d=2,h=this.config.max_source_positions,_=d*h;let p=0;const w=[],v=[];for(;p<c;){const M=Math.min(p+_,c),T=l.slice(null,null,[p,M]);let A;const C=T.dims[2];if(C<_){const W=l.dims[1],ee=new Float32Array(W*_),G=T.data;for(let L=0;L<W;++L)ee.set(G.subarray(L*C,(L+1)*C),L*_);A=new U("float32",ee,[1,W,_])}else A=T;if(r)for(const W of r)"begin_index"in W&&(W.begin_index=s.length);const S=await super.generate({inputs:A,generation_config:t,logits_processor:r,decoder_input_ids:s,...n}),x=(i?S.sequences:S)[0].tolist().map(Number).slice(s.length);let R;if(i){S.token_timestamps=this._extract_token_timestamps(S,t.alignment_heads,Math.floor((M-p)/d),.02,s.length);const W=p/d*.02;R=S.token_timestamps[0].tolist().slice(s.length).map(ee=>ee+W)}if(x.length>0&&x.at(-1)===o&&x.pop(),x.length===0)break;const z=x.map(W=>W>=a),$=x.length>=2&&z[x.length-1]&&!z[x.length-2],Q=[];for(let W=0;W<x.length-1;++W)z[W]&&z[W+1]&&Q.push(W+1);let H,D=x.length;if(Q.length>0)if($)H=M-p;else{const W=Q.at(-1);H=(x[W-1]-a)*d,D=W}else H=M-p;const I=Math.floor(p/d),te=a+1500;for(let 
W=0;W<D;++W)x[W]>=a&&(x[W]=Math.min(x[W]+I,te));w.push(...x.slice(0,D)),R&&v.push(...R.slice(0,D)),p+=H}w.push(o);const y=[...s,...w];if(i){const M=new U("int64",y.map(BigInt),[1,y.length]),T=[...new Array(s.length).fill(0),...v,0],A=new U("float32",new Float32Array(T),[1,T.length]);return{sequences:M,token_timestamps:A}}return new U("int64",y.map(BigInt),[1,y.length])}_extract_token_timestamps(e,t,r=null,s=.02,n=0){if(!e.cross_attentions)throw new Error("Model outputs must contain cross attentions to extract timestamps. This is most likely because the model was not exported with `output_attentions=True`.");r==null&&ue.warn("`num_frames` has not been set, meaning the entire audio will be analyzed. This may lead to inaccurate token-level timestamps for short audios (< 30 seconds).");let a=this.config.median_filter_width;a===void 0&&(ue.warn("Model config has no `median_filter_width`, using default value of 7."),a=7);const o=e.cross_attentions,i=Array.from({length:this.config.decoder_layers},(y,M)=>ze(o.map(T=>T[M]),2)),l=ur(t.map(([y,M])=>{if(y>=i.length)throw new Error(`Layer index ${y} is out of bounds for cross attentions (length ${i.length}).`);return r?i[y].slice(null,M,null,[0,r]):i[y].slice(null,M)})).transpose(1,0,2,3),[c,d]=Ex(l,-2,0,!0),h=l.clone();for(let y=0;y<h.dims[0];++y){const M=h[y];for(let T=0;T<M.dims[0];++T){const A=M[T],C=c[y][T][0].data,S=d[y][T][0].data;for(let N=0;N<A.dims[0];++N){let x=A[N].data;for(let R=0;R<x.length;++R)x[R]=(x[R]-S[R])/C[R];x.set(lx(x,a))}}}const _=n>0?h.slice(null,null,[n,h.dims[2]],null):h,p=[Ci(_,1)],w=e.sequences.dims,v=new U("float32",new Float32Array(w[0]*w[1]),w);for(let y=0;y<w[0];++y){const M=p[y].neg().squeeze_(0),[T,A]=ux(M.tolist()),C=Array.from({length:T.length-1},(R,z)=>T[z+1]-T[z]),S=Yt([1],C).map(R=>!!R),N=[];for(let R=0;R<S.length;++R)S[R]&&N.push(A[R]*s);const x=new Array(n).fill(0);x.push(...N),N.length>0&&x.push(N.at(-1)),v[y].data.set(x)}return v}},qL=class extends Pm{},Ps=class extends P{},WL=class 
extends Ps{},HL=class extends Ps{async _call(e){return new Je(await super._call(e))}},QL=class extends Ps{async _call(e){return new ie(await super._call(e))}},XL=class extends Ps{async _call(e){return new He(await super._call(e))}},YL=class extends Ps{async _call(e){return new ht(await super._call(e))}},Fs=class extends P{},JL=class extends Fs{},KL=class extends Fs{async _call(e){return new Je(await super._call(e))}},ZL=class extends Fs{async _call(e){return new ie(await super._call(e))}},eI=class extends Fs{async _call(e){return new He(await super._call(e))}},tI=class extends Fs{async _call(e){return new ht(await super._call(e))}},su=class extends P{},rI=class extends su{},sI=class extends su{async _call(e){return new Fm(await super._call(e))}},Fm=class extends Ye{constructor({logits:e,pred_boxes:t}){super(),this.logits=e,this.pred_boxes=t}},nu=class extends P{},nI=class extends nu{},aI=class extends nu{},oI=new Map([["bert","BertModel"],["eurobert","EuroBertModel"],["neobert","NeoBertModel"],["modernbert","ModernBertModel"],["nomic_bert","NomicBertModel"],["roformer","RoFormerModel"],["electra","ElectraModel"],["esm","EsmModel"],["convbert","ConvBertModel"],["camembert","CamembertModel"],["deberta","DebertaModel"],["deberta-v2","DebertaV2Model"],["mpnet","MPNetModel"],["albert","AlbertModel"],["distilbert","DistilBertModel"],["roberta","RobertaModel"],["xlm","XLMModel"],["xlm-roberta","XLMRobertaModel"],["clap","ClapModel"],["clip","CLIPModel"],["clipseg","CLIPSegModel"],["chinese_clip","ChineseCLIPModel"],["siglip","SiglipModel"],["jina_clip","JinaCLIPModel"],["mobilebert","MobileBertModel"],["squeezebert","SqueezeBertModel"],["wav2vec2","Wav2Vec2Model"],["wav2vec2-bert","Wav2Vec2BertModel"],["unispeech","UniSpeechModel"],["unispeech-sat","UniSpeechSatModel"],["hubert","HubertModel"],["wavlm","WavLMModel"],["audio-spectrogram-transformer","ASTModel"],["vits","VitsModel"],["pyannote","PyAnnoteModel"],["wespeaker-resnet","WeSpeakerResNetModel"],["detr","DetrModel"]
,["rt_detr","RTDetrModel"],["rt_detr_v2","RTDetrV2Model"],["rf_detr","RFDetrModel"],["d_fine","DFineModel"],["table-transformer","TableTransformerModel"],["vit","ViTModel"],["ijepa","IJepaModel"],["pvt","PvtModel"],["vit_msn","ViTMSNModel"],["vit_mae","ViTMAEModel"],["groupvit","GroupViTModel"],["fastvit","FastViTModel"],["mobilevit","MobileViTModel"],["mobilevitv2","MobileViTV2Model"],["owlvit","OwlViTModel"],["owlv2","Owlv2Model"],["beit","BeitModel"],["deit","DeiTModel"],["hiera","HieraModel"],["convnext","ConvNextModel"],["convnextv2","ConvNextV2Model"],["dinov2","Dinov2Model"],["dinov2_with_registers","Dinov2WithRegistersModel"],["dinov3_vit","DINOv3ViTModel"],["dinov3_convnext","DINOv3ConvNextModel"],["resnet","ResNetModel"],["swin","SwinModel"],["swin2sr","Swin2SRModel"],["donut-swin","DonutSwinModel"],["yolos","YolosModel"],["dpt","DPTModel"],["glpn","GLPNModel"],["hifigan","SpeechT5HifiGan"],["efficientnet","EfficientNetModel"],["decision_transformer","DecisionTransformerModel"],["patchtst","PatchTSTModel"],["patchtsmixer","PatchTSMixerModel"],["mobilenet_v1","MobileNetV1Model"],["mobilenet_v2","MobileNetV2Model"],["mobilenet_v3","MobileNetV3Model"],["mobilenet_v4","MobileNetV4Model"],["maskformer","MaskFormerModel"],["mgp-str","MgpstrForSceneTextRecognition"],["style_text_to_speech_2","StyleTextToSpeech2Model"]]),iI=new Map([["t5","T5Model"],["longt5","LongT5Model"],["mt5","MT5Model"],["bart","BartModel"],["mbart","MBartModel"],["marian","MarianModel"],["whisper","WhisperModel"],["cohere_asr","CohereAsrModel"],["m2m_100","M2M100Model"],["blenderbot","BlenderbotModel"],["blenderbot-small","BlenderbotSmallModel"]]),lI=new Map([["mimi","MimiModel"],["dac","DacModel"],["snac","SnacModel"]]),cI=new 
Map([["bloom","BloomModel"],["jais","JAISModel"],["gpt2","GPT2Model"],["gpt_oss","GptOssModel"],["gptj","GPTJModel"],["gpt_bigcode","GPTBigCodeModel"],["gpt_neo","GPTNeoModel"],["gpt_neox","GPTNeoXModel"],["codegen","CodeGenModel"],["llama","LlamaModel"],["apertus","ApertusModel"],["nanochat","NanoChatModel"],["arcee","ArceeModel"],["afmoe","AfmoeModel"],["lfm2","Lfm2Model"],["lfm2_moe","Lfm2MoeModel"],["smollm3","SmolLM3Model"],["exaone","ExaoneModel"],["olmo","OlmoModel"],["olmo2","Olmo2Model"],["olmo3","Olmo3Model"],["olmo_hybrid","OlmoHybridModel"],["mobilellm","MobileLLMModel"],["granite","GraniteModel"],["granitemoehybrid","GraniteMoeHybridModel"],["cohere","CohereModel"],["cohere2","Cohere2Model"],["gemma","GemmaModel"],["gemma2","Gemma2Model"],["vaultgemma","VaultGemmaModel"],["gemma3_text","Gemma3Model"],["helium","HeliumModel"],["glm","GlmModel"],["glm_moe_dsa","GlmMoeDsaModel"],["openelm","OpenELMModel"],["qwen2","Qwen2Model"],["qwen2_moe","Qwen2MoeModel"],["qwen3","Qwen3Model"],["qwen3_moe","Qwen3MoeModel"],["qwen3_next","Qwen3NextModel"],["phi","PhiModel"],["phi3","Phi3Model"],["mpt","MptModel"],["opt","OPTModel"],["mistral","MistralModel"],["mistral4","Mistral4Model"],["ministral","MinistralModel"],["ministral3","Ministral3Model"],["ernie4_5","Ernie4_5ForCausalLM"],["starcoder2","Starcoder2Model"],["deepseek_v3","DeepseekV3Model"],["falcon","FalconModel"],["falcon_h1","FalconH1Model"],["nemotron_h","NemotronHModel"],["solar_open","SolarOpenModel"],["stablelm","StableLmModel"],["modernbert-decoder","ModernBertDecoderModel"],["hunyuan_v1_dense","HunYuanDenseV1Model"],["youtu","YoutuModel"]]),Lm=new Map([["speecht5","SpeechT5ForSpeechToText"],["whisper","WhisperForConditionalGeneration"],["lite-whisper","LiteWhisperForConditionalGeneration"],["moonshine","MoonshineForConditionalGeneration"],["cohere_asr","CohereAsrForConditionalGeneration"]]),Im=new Map([["speecht5","SpeechT5ForTextToSpeech"]]),Om=new 
Map([["vits","VitsModel"],["musicgen","MusicgenForConditionalGeneration"],["supertonic","SupertonicForConditionalGeneration"]]),Nm=new Map([["bert","BertForSequenceClassification"],["eurobert","EuroBertForSequenceClassification"],["neobert","NeoBertForSequenceClassification"],["modernbert","ModernBertForSequenceClassification"],["roformer","RoFormerForSequenceClassification"],["electra","ElectraForSequenceClassification"],["esm","EsmForSequenceClassification"],["convbert","ConvBertForSequenceClassification"],["camembert","CamembertForSequenceClassification"],["deberta","DebertaForSequenceClassification"],["deberta-v2","DebertaV2ForSequenceClassification"],["mpnet","MPNetForSequenceClassification"],["albert","AlbertForSequenceClassification"],["distilbert","DistilBertForSequenceClassification"],["roberta","RobertaForSequenceClassification"],["xlm","XLMForSequenceClassification"],["xlm-roberta","XLMRobertaForSequenceClassification"],["bart","BartForSequenceClassification"],["mbart","MBartForSequenceClassification"],["mobilebert","MobileBertForSequenceClassification"],["squeezebert","SqueezeBertForSequenceClassification"]]),Dm=new Map([["bert","BertForTokenClassification"],["eurobert","EuroBertForTokenClassification"],["neobert","NeoBertForTokenClassification"],["modernbert","ModernBertForTokenClassification"],["roformer","RoFormerForTokenClassification"],["electra","ElectraForTokenClassification"],["esm","EsmForTokenClassification"],["convbert","ConvBertForTokenClassification"],["camembert","CamembertForTokenClassification"],["deberta","DebertaForTokenClassification"],["deberta-v2","DebertaV2ForTokenClassification"],["mpnet","MPNetForTokenClassification"],["distilbert","DistilBertForTokenClassification"],["roberta","RobertaForTokenClassification"],["xlm","XLMForTokenClassification"],["xlm-roberta","XLMRobertaForTokenClassification"]]),zm=new 
Map([["t5","T5ForConditionalGeneration"],["longt5","LongT5ForConditionalGeneration"],["mt5","MT5ForConditionalGeneration"],["bart","BartForConditionalGeneration"],["mbart","MBartForConditionalGeneration"],["marian","MarianMTModel"],["m2m_100","M2M100ForConditionalGeneration"],["blenderbot","BlenderbotForConditionalGeneration"],["blenderbot-small","BlenderbotSmallForConditionalGeneration"]]),Bm=new Map([["bloom","BloomForCausalLM"],["gpt2","GPT2LMHeadModel"],["gpt_oss","GptOssForCausalLM"],["jais","JAISLMHeadModel"],["gptj","GPTJForCausalLM"],["gpt_bigcode","GPTBigCodeForCausalLM"],["gpt_neo","GPTNeoForCausalLM"],["gpt_neox","GPTNeoXForCausalLM"],["codegen","CodeGenForCausalLM"],["llama","LlamaForCausalLM"],["nanochat","NanoChatForCausalLM"],["apertus","ApertusForCausalLM"],["llama4_text","Llama4ForCausalLM"],["arcee","ArceeForCausalLM"],["afmoe","AfmoeForCausalLM"],["lfm2","Lfm2ForCausalLM"],["lfm2_moe","Lfm2MoeForCausalLM"],["smollm3","SmolLM3ForCausalLM"],["exaone","ExaoneForCausalLM"],["olmo","OlmoForCausalLM"],["olmo2","Olmo2ForCausalLM"],["olmo3","Olmo3ForCausalLM"],["olmo_hybrid","OlmoHybridForCausalLM"],["mobilellm","MobileLLMForCausalLM"],["granite","GraniteForCausalLM"],["granitemoehybrid","GraniteMoeHybridForCausalLM"],["cohere","CohereForCausalLM"],["cohere2","Cohere2ForCausalLM"],["gemma","GemmaForCausalLM"],["gemma2","Gemma2ForCausalLM"],["vaultgemma","VaultGemmaForCausalLM"],["gemma3_text","Gemma3ForCausalLM"],["gemma3","Gemma3ForCausalLM"],["helium","HeliumForCausalLM"],["glm","GlmForCausalLM"],["glm_moe_dsa","GlmMoeDsaForCausalLM"],["openelm","OpenELMForCausalLM"],["qwen2","Qwen2ForCausalLM"],["qwen2_moe","Qwen2MoeForCausalLM"],["qwen3","Qwen3ForCausalLM"],["qwen3_moe","Qwen3MoeForCausalLM"],["qwen3_next","Qwen3NextForCausalLM"],["qwen2_vl","Qwen2VLForCausalLM"],["qwen2_5_vl","Qwen2_5_VLForCausalLM"],["qwen3_vl","Qwen3VLForCausalLM"],["qwen3_vl_moe","Qwen3VLMoeForCausalLM"],["qwen3_5","Qwen3_5ForCausalLM"],["qwen3_5_text","Qwen3_5ForCausalLM"],["qwen
3_5_moe","Qwen3_5MoeForCausalLM"],["gemma3n","Gemma3nForCausalLM"],["gemma4","Gemma4ForCausalLM"],["phi","PhiForCausalLM"],["phi3","Phi3ForCausalLM"],["mpt","MptForCausalLM"],["opt","OPTForCausalLM"],["mbart","MBartForCausalLM"],["mistral","MistralForCausalLM"],["mistral4","Mistral4ForCausalLM"],["ministral","MinistralForCausalLM"],["ministral3","Ministral3ForCausalLM"],["ernie4_5","Ernie4_5ForCausalLM"],["starcoder2","Starcoder2ForCausalLM"],["deepseek_v3","DeepseekV3ForCausalLM"],["falcon","FalconForCausalLM"],["falcon_h1","FalconH1ForCausalLM"],["nemotron_h","NemotronHForCausalLM"],["trocr","TrOCRForCausalLM"],["solar_open","SolarOpenForCausalLM"],["stablelm","StableLmForCausalLM"],["modernbert-decoder","ModernBertDecoderForCausalLM"],["hunyuan_v1_dense","HunYuanDenseV1ForCausalLM"],["youtu","YoutuForCausalLM"],["phi3_v","Phi3VForCausalLM"]]),uI=new Map([["multi_modality","MultiModalityCausalLM"]]),Rm=new Map([["bert","BertForMaskedLM"],["eurobert","EuroBertForMaskedLM"],["neobert","NeoBertForMaskedLM"],["modernbert","ModernBertForMaskedLM"],["roformer","RoFormerForMaskedLM"],["electra","ElectraForMaskedLM"],["esm","EsmForMaskedLM"],["convbert","ConvBertForMaskedLM"],["camembert","CamembertForMaskedLM"],["deberta","DebertaForMaskedLM"],["deberta-v2","DebertaV2ForMaskedLM"],["mpnet","MPNetForMaskedLM"],["albert","AlbertForMaskedLM"],["distilbert","DistilBertForMaskedLM"],["roberta","RobertaForMaskedLM"],["xlm","XLMWithLMHeadModel"],["xlm-roberta","XLMRobertaForMaskedLM"],["mobilebert","MobileBertForMaskedLM"],["squeezebert","SqueezeBertForMaskedLM"]]),Gm=new 
Map([["bert","BertForQuestionAnswering"],["neobert","NeoBertForQuestionAnswering"],["roformer","RoFormerForQuestionAnswering"],["electra","ElectraForQuestionAnswering"],["convbert","ConvBertForQuestionAnswering"],["camembert","CamembertForQuestionAnswering"],["deberta","DebertaForQuestionAnswering"],["deberta-v2","DebertaV2ForQuestionAnswering"],["mpnet","MPNetForQuestionAnswering"],["albert","AlbertForQuestionAnswering"],["distilbert","DistilBertForQuestionAnswering"],["roberta","RobertaForQuestionAnswering"],["xlm","XLMForQuestionAnswering"],["xlm-roberta","XLMRobertaForQuestionAnswering"],["mobilebert","MobileBertForQuestionAnswering"],["squeezebert","SqueezeBertForQuestionAnswering"]]),$m=new Map([["vision-encoder-decoder","VisionEncoderDecoderModel"],["idefics3","Idefics3ForConditionalGeneration"],["smolvlm","SmolVLMForConditionalGeneration"]]),Vm=new Map([["llava","LlavaForConditionalGeneration"],["llava_onevision","LlavaOnevisionForConditionalGeneration"],["moondream1","Moondream1ForConditionalGeneration"],["florence2","Florence2ForConditionalGeneration"],["qwen2_vl","Qwen2VLForConditionalGeneration"],["qwen2_5_vl","Qwen2_5_VLForConditionalGeneration"],["qwen3_vl","Qwen3VLForConditionalGeneration"],["qwen3_vl_moe","Qwen3VLMoeForConditionalGeneration"],["qwen3_5","Qwen3_5ForConditionalGeneration"],["qwen3_5_moe","Qwen3_5MoeForConditionalGeneration"],["lfm2_vl","Lfm2VlForConditionalGeneration"],["idefics3","Idefics3ForConditionalGeneration"],["smolvlm","SmolVLMForConditionalGeneration"],["paligemma","PaliGemmaForConditionalGeneration"],["llava_qwen2","LlavaQwen2ForCausalLM"],["gemma3","Gemma3ForConditionalGeneration"],["gemma3n","Gemma3nForConditionalGeneration"],["gemma4","Gemma4ForConditionalGeneration"],["mistral3","Mistral3ForConditionalGeneration"],["lighton_ocr","LightOnOcrForConditionalGeneration"],["glm_ocr","GlmOcrForConditionalGeneration"]]),Um=new 
Map([["granite_speech","GraniteSpeechForConditionalGeneration"],["ultravox","UltravoxModel"],["voxtral","VoxtralForConditionalGeneration"],["voxtral_realtime","VoxtralRealtimeForConditionalGeneration"]]),dI=new Map([["vision-encoder-decoder","VisionEncoderDecoderModel"]]),jm=new Map([["vit","ViTForImageClassification"],["ijepa","IJepaForImageClassification"],["pvt","PvtForImageClassification"],["vit_msn","ViTMSNForImageClassification"],["fastvit","FastViTForImageClassification"],["mobilevit","MobileViTForImageClassification"],["mobilevitv2","MobileViTV2ForImageClassification"],["beit","BeitForImageClassification"],["deit","DeiTForImageClassification"],["hiera","HieraForImageClassification"],["convnext","ConvNextForImageClassification"],["convnextv2","ConvNextV2ForImageClassification"],["dinov2","Dinov2ForImageClassification"],["dinov2_with_registers","Dinov2WithRegistersForImageClassification"],["resnet","ResNetForImageClassification"],["swin","SwinForImageClassification"],["segformer","SegformerForImageClassification"],["efficientnet","EfficientNetForImageClassification"],["mobilenet_v1","MobileNetV1ForImageClassification"],["mobilenet_v2","MobileNetV2ForImageClassification"],["mobilenet_v3","MobileNetV3ForImageClassification"],["mobilenet_v4","MobileNetV4ForImageClassification"]]),qm=new Map([["detr","DetrForObjectDetection"],["rt_detr","RTDetrForObjectDetection"],["rt_detr_v2","RTDetrV2ForObjectDetection"],["rf_detr","RFDetrForObjectDetection"],["d_fine","DFineForObjectDetection"],["table-transformer","TableTransformerForObjectDetection"],["yolos","YolosForObjectDetection"]]),Wm=new Map([["owlvit","OwlViTForObjectDetection"],["owlv2","Owlv2ForObjectDetection"],["grounding-dino","GroundingDinoForObjectDetection"]]),Ls=new Map([["detr","DetrForSegmentation"],["clipseg","CLIPSegForImageSegmentation"]]),Hm=new 
Map([["segformer","SegformerForSemanticSegmentation"],["sapiens","SapiensForSemanticSegmentation"],["swin","SwinForSemanticSegmentation"],["mobilenet_v1","MobileNetV1ForSemanticSegmentation"],["mobilenet_v2","MobileNetV2ForSemanticSegmentation"],["mobilenet_v3","MobileNetV3ForSemanticSegmentation"],["mobilenet_v4","MobileNetV4ForSemanticSegmentation"]]),Qm=new Map([["detr","DetrForSegmentation"],["maskformer","MaskFormerForInstanceSegmentation"]]),Xm=new Map([["sam","SamModel"],["sam2","Sam2Model"],["edgetam","EdgeTamModel"],["sam3_tracker","Sam3TrackerModel"]]),Ym=new Map([["wav2vec2","Wav2Vec2ForCTC"],["wav2vec2-bert","Wav2Vec2BertForCTC"],["unispeech","UniSpeechForCTC"],["unispeech-sat","UniSpeechSatForCTC"],["wavlm","WavLMForCTC"],["hubert","HubertForCTC"],["parakeet_ctc","ParakeetForCTC"]]),Jm=new Map([["wav2vec2","Wav2Vec2ForSequenceClassification"],["wav2vec2-bert","Wav2Vec2BertForSequenceClassification"],["unispeech","UniSpeechForSequenceClassification"],["unispeech-sat","UniSpeechSatForSequenceClassification"],["wavlm","WavLMForSequenceClassification"],["hubert","HubertForSequenceClassification"],["audio-spectrogram-transformer","ASTForAudioClassification"]]),Km=new Map([["wavlm","WavLMForXVector"]]),Zm=new Map([["unispeech-sat","UniSpeechSatForAudioFrameClassification"],["wavlm","WavLMForAudioFrameClassification"],["wav2vec2","Wav2Vec2ForAudioFrameClassification"],["pyannote","PyAnnoteForAudioFrameClassification"]]),eg=new Map([["vitmatte","VitMatteForImageMatting"]]),hI=new Map([["patchtst","PatchTSTForPrediction"],["patchtsmixer","PatchTSMixerForPrediction"]]),tg=new Map([["swin2sr","Swin2SRForImageSuperResolution"]]),rg=new 
Map([["chmv2","CHMv2ForDepthEstimation"],["dpt","DPTForDepthEstimation"],["depth_anything","DepthAnythingForDepthEstimation"],["glpn","GLPNForDepthEstimation"],["sapiens","SapiensForDepthEstimation"],["depth_pro","DepthProForDepthEstimation"],["metric3d","Metric3DForDepthEstimation"],["metric3dv2","Metric3Dv2ForDepthEstimation"]]),sg=new Map([["sapiens","SapiensForNormalEstimation"]]),ng=new Map([["vitpose","VitPoseForPoseEstimation"]]),ag=new Map([["clip","CLIPVisionModelWithProjection"],["siglip","SiglipVisionModel"],["jina_clip","JinaCLIPVisionModel"]]),og=[[oI,q.EncoderOnly],[iI,q.EncoderDecoder],[cI,q.DecoderOnlyWithoutHead],[lI,q.AutoEncoder],[Nm,q.EncoderOnly],[Dm,q.EncoderOnly],[zm,q.Seq2Seq],[Lm,q.Seq2Seq],[Bm,q.DecoderOnly],[uI,q.MultiModality],[Rm,q.EncoderOnly],[Gm,q.EncoderOnly],[$m,q.Vision2Seq],[Vm,q.ImageTextToText],[Um,q.AudioTextToText],[jm,q.EncoderOnly],[Ls,q.EncoderOnly],[Qm,q.EncoderOnly],[Hm,q.EncoderOnly],[eg,q.EncoderOnly],[hI,q.EncoderOnly],[tg,q.EncoderOnly],[rg,q.EncoderOnly],[sg,q.EncoderOnly],[ng,q.EncoderOnly],[qm,q.EncoderOnly],[Wm,q.EncoderOnly],[Xm,q.MaskGeneration],[Ym,q.EncoderOnly],[Jm,q.EncoderOnly],[Im,q.Seq2Seq],[Om,q.EncoderOnly],[Km,q.EncoderOnly],[Zm,q.EncoderOnly],[ag,q.EncoderOnly]];for(const[e,t]of og)for(const r of e.values()){fr.set(r,t);const s=rl[r];gs.set(s,r),Ki.set(r,s)}var 
fI=[["MusicgenForConditionalGeneration",Zp,q.Musicgen],["Phi3VForCausalLM",sm,q.Phi3V],["CLIPTextModelWithProjection",wp,q.EncoderOnly],["SiglipTextModel",hm,q.EncoderOnly],["JinaCLIPTextModel",Vp,q.EncoderOnly],["ClapTextModelWithProjection",mp,q.EncoderOnly],["ClapAudioModelWithProjection",gp,q.EncoderOnly],["DacEncoderModel",bp,q.EncoderOnly],["DacDecoderModel",Mp,q.EncoderOnly],["MimiEncoderModel",Yp,q.EncoderOnly],["MimiDecoderModel",Jp,q.EncoderOnly],["SnacEncoderModel",fm,q.EncoderOnly],["SnacDecoderModel",_m,q.EncoderOnly],["Gemma3nForConditionalGeneration",Sa,q.ImageAudioTextToText],["Gemma4ForConditionalGeneration",Ol,q.ImageAudioTextToText],["SupertonicForConditionalGeneration",gm,q.Supertonic],["ChatterboxModel",fp,q.Chatterbox],["VoxtralRealtimeForConditionalGeneration",Am,q.VoxtralRealtime]];for(const[e,t,r]of fI)fr.set(e,r),gs.set(t,e),Ki.set(e,t);var ig=new Map([["modnet",Ls],["birefnet",Ls],["isnet",Ls],["ben",Ls]]);for(const[e,t]of ig.entries())t.set(e,"PreTrainedModel"),fr.set(e,q.EncoderOnly),Ki.set(e,P);var _I=new Set(ig.keys());fr.set("PreTrainedModel",q.EncoderOnly),gs.set(P,"PreTrainedModel");var 
Ae={MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES:Nm,MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES:Dm,MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES:Im,MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES:Om,MODEL_FOR_MASKED_LM_MAPPING_NAMES:Rm,MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES:Gm,MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES:jm,MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES:Ls,MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES:Hm,MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES:Qm,MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES:qm,MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES:Wm,MODEL_FOR_MASK_GENERATION_MAPPING_NAMES:Xm,MODEL_FOR_CTC_MAPPING_NAMES:Ym,MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES:Jm,MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES:Km,MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES:Zm,MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES:dI,MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES:eg,MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES:tg,MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES:rg,MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES:sg,MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES:ng,MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES:ag,MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES:Vm,MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES:Um,MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:zm,MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES:Lm,MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:Bm,MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES:$m};wE(Ae);var Ce=(ao=class{static supports(e){if(!this.MODEL_CLASS_MAPPINGS)return!1;for(const t of this.MODEL_CLASS_MAPPINGS)if(t.has(e))return!0;return this.BASE_IF_FAIL}static async from_pretrained(e,{progress_callback:t=null,config:r=null,cache_dir:s=null,local_files_only:n=!1,revision:a="main",model_file_name:o=null,subfolder:i="onnx",device:l=null,dtype:c=null,use_external_data_format:d=null,session_options:h={}}={}){const _={progress_callback:t,config:r,cache_dir:s,local_files_only:n,revision:a,model_file_name:o,subfolder:i,device:l,dtype:c,use_external_data_format:d,session_options:h};if(_.config=await 
nn.from_pretrained(e,_),!this.MODEL_CLASS_MAPPINGS)throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: "+this.name);const{model_type:p}=_.config;for(const w of this.MODEL_CLASS_MAPPINGS){let v=w.get(p);if(!v){for(const y of w.values())if(y[0]===p){v=y;break}if(!v)continue}return await rl[v].from_pretrained(e,_)}if(this.BASE_IF_FAIL)return _I.has(p)||ue.warn(`Unknown model class "${p}", attempting to construct from base class.`),await P.from_pretrained(e,_);throw Error(`Unsupported model type: ${p}`)}},k(ao,"MODEL_CLASS_MAPPINGS",null),k(ao,"BASE_IF_FAIL",!1),ao),mn=(oo=class extends Ce{},k(oo,"MODEL_CLASS_MAPPINGS",og.map(e=>e[0])),k(oo,"BASE_IF_FAIL",!0),oo),lg=(fu=class extends Ce{},k(fu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES]),fu),pI=(_u=class extends Ce{},k(_u,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES]),_u),au=(pu=class extends Ce{},k(pu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES]),pu),mI=(mu=class extends Ce{},k(mu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES]),mu),gI=(gu=class extends Ce{},k(gu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES]),gu),wI=(wu=class extends Ce{},k(wu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES]),wu),vI=(vu=class extends Ce{},k(vu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES]),vu),yI=(yu=class extends Ce{},k(yu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_MASKED_LM_MAPPING_NAMES]),yu),bI=(bu=class extends Ce{},k(bu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES]),bu),MI=(Mu=class extends Ce{},k(Mu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES]),Mu),xI=(xu=class extends Ce{},k(xu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES]),xu),cg=(ku=class extends Ce{},k(ku,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES]),ku),ug=(Tu=class extends 
/* More task-specific auto-classes: each extends Ce and sets MODEL_CLASS_MAPPINGS to the one Ae entry named in its k(...) call. */Ce{},k(Tu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES]),Tu),dg=(Eu=class extends Ce{},k(Eu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES]),Eu),kI=(Au=class extends Ce{},k(Au,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES]),Au),TI=(Cu=class extends Ce{},k(Cu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES]),Cu);Su=class extends Ce{},k(Su,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_MASK_GENERATION_MAPPING_NAMES]);var EI=(Pu=class extends Ce{},k(Pu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_CTC_MAPPING_NAMES]),Pu),AI=(Fu=class extends Ce{},k(Fu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES]),Fu);Lu=class extends Ce{},k(Lu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES]),Iu=class extends Ce{},k(Iu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES]);var CI=(Ou=class extends Ce{},k(Ou,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES]),Ou);Nu=class extends Ce{},k(Nu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES]);var SI=(Du=class extends Ce{},k(Du,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES]),Du),PI=(zu=class extends Ce{},k(zu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES]),zu);Bu=class extends Ce{},k(Bu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES]),Ru=class extends Ce{},k(Ru,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES]);var FI=(Gu=class extends Ce{},k(Gu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES]),Gu);$u=class extends Ce{},k($u,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES]),Vu=class extends Ce{},k(Vu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES]);/* Wt(e): wraps a non-array input in an array, then resolves jt.read for every element in parallel; always returns an array. */async function Wt(e){return Array.isArray(e)||(e=[e]),await Promise.all(e.map(t=>jt.read(t)))}/* Is(e,t): second input normaliser taking an extra argument t; its body follows. */async function Is(e,t){return 
Array.isArray(e)||(e=[e]),await Promise.all(e.map(r=>typeof r=="string"||r instanceof URL?F1(r,t):r instanceof Float64Array?new Float32Array(r):r))}function ou(e,t){t&&(e=e.map(o=>o|0));const[r,s,n,a]=e;return{xmin:r,ymin:s,xmax:n,ymax:a}}var Ve=class extends vt{constructor({task:e,model:t,tokenizer:r=null,processor:s=null}){super(),this.task=e,this.model=t,this.tokenizer=r,this.processor=s}async dispose(){await this.model.dispose()}},LI=class extends Ve{async _call(e,{top_k:t=1}={}){const r=this.tokenizer(e,{padding:!0,truncation:!0}),s=await this.model(r),{problem_type:n,id2label:a}=this.model.config,o=n==="multi_label_classification"?l=>l.sigmoid():l=>new U("float32",nt(l.data),l.dims),i=[];for(const l of s.logits){const c=o(l),d=await hs(c,t),h=d[0].tolist(),p=d[1].tolist().map((w,v)=>({label:a?a[w]:`LABEL_${w}`,score:h[v]}));t===1?i.push(...p):i.push(p)}return Array.isArray(e)||t===1?i:i[0]}},II=class extends Ve{async _call(e,{ignore_labels:t=["O"]}={}){const r=Array.isArray(e),s=this.tokenizer(r?e:[e],{padding:!0,truncation:!0}),a=(await this.model(s)).logits,o=this.model.config.id2label,i=[];for(let l=0;l<a.dims[0];++l){const c=s.input_ids[l],d=a[l],h=[];for(let _=0;_<d.dims[0];++_){const p=d[_],w=je(p.data)[1],v=o?o[w]:`LABEL_${w}`;if(t.includes(v))continue;const y=this.tokenizer.decode([c[_].item()],{skip_special_tokens:!0});if(y==="")continue;const M=nt(p.data);h.push({entity:v,score:M[w],index:_,word:y})}i.push(h)}return r?i:i[0]}},OI=class extends Ve{async _call(e,t,{top_k:r=1}={}){const s=this.tokenizer(e,{text_pair:t,padding:!0,truncation:!0}),n=Array.isArray(e),{start_logits:a,end_logits:o}=await this.model(s),i=s.input_ids.tolist(),l=s.attention_mask.tolist(),{all_special_ids:c,sep_token_id:d}=this.tokenizer,h=[];for(let _=0;_<a.dims[0];++_){const p=i[_],w=p.findIndex(S=>S==d),v=a[_].tolist(),y=o[_].tolist();for(let S=1;S<v.length;++S)(l[_]==0||S<=w||c.findIndex(N=>N==p[S])!==-1)&&(v[S]=-1/0,y[S]=-1/0);const 
/* Question-answering, continued: nt is applied to the masked start and end logits (presumably softmax - confirm) and each value is paired with its index; the value at index 0 is forced to 0. Iy(M,T) supplies start/end combinations (presumably the cartesian product - confirm); only pairs with start index <= end index are kept, scored by the product of both values and sorted by descending score. */M=nt(v).map((S,N)=>[S,N]),T=nt(y).map((S,N)=>[S,N]);M[0][0]=0,T[0][0]=0;const A=Iy(M,T).filter(S=>S[0][1]<=S[1][1]).map(S=>[S[0][1],S[1][1],S[0][0]*S[1][0]]).sort((S,N)=>N[2]-S[2]),C=[];/* Decode the best r spans (token range is inclusive of the end index). */for(let S=0;S<Math.min(A.length,r);++S){const[N,x,R]=A[S],z=p.slice(N,x+1),$=this.tokenizer.decode(z,{skip_special_tokens:!0});C.push({answer:$,score:R})}r===1?h.push(...C):h.push(C)}return n?h:h[0]}},/* NI (registered as fill-mask): finds the first mask token in each input (throws when there is none), ranks candidates for that position with hs (presumably top-k) over nt of the logits, and returns score, token id, decoded token and the full sequence with the mask replaced. */NI=class extends Ve{async _call(e,{top_k:t=5}={}){const{mask_token_id:r,mask_token:s}=this.tokenizer,n=this.tokenizer(e,{padding:!0,truncation:!0}),{logits:a}=await this.model(n),o=[],i=n.input_ids.tolist();for(let l=0;l<i.length;++l){const c=i[l],d=c.findIndex(v=>v==r);if(d===-1)throw Error(`Mask token (${s}) not found in text.`);const h=a[l][d],_=await hs(new U("float32",nt(h.data),h.dims),t),p=_[0].tolist(),w=_[1].tolist();o.push(w.map((v,y)=>{const M=c.slice();return M[d]=v,{score:p[y],token:Number(v),token_str:this.tokenizer.decode([v]),sequence:this.tokenizer.decode(M,{skip_special_tokens:!0})}}))}return Array.isArray(e)?o:o[0]}},/* iu (registered as text2text-generation): defaults to max_new_tokens 256 and emits results under the key held in _key (generated_text). */iu=class extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{max_new_tokens:256});k(this,"_key","generated_text")}/* _call: always works on an array of inputs; prepends config.prefix and then any task_specific_params prefix for this.task. When the task is exactly translation and the tokenizer has _build_translation_inputs, that method builds the inputs. Caller options override the default generation config. */async _call(t,r={}){Array.isArray(t)||(t=[t]),this.model.config.prefix&&(t=t.map(l=>this.model.config.prefix+l));const s=this.model.config.task_specific_params;s&&s[this.task]&&s[this.task].prefix&&(t=t.map(l=>s[this.task].prefix+l));const n=this.tokenizer,a={padding:!0,truncation:!0};let o;this.task==="translation"&&"_build_translation_inputs"in n?o=n._build_translation_inputs(t,a,r):o=n(t,a);const i=await this.model.generate({...o,...this._default_generation_config,...r});return n.batch_decode(i,{skip_special_tokens:!0}).map(l=>({[this._key]:l}))}},/* DI and zI reuse iu and only change the output key to summary_text and translation_text. */DI=class extends iu{constructor(){super(...arguments);k(this,"_key","summary_text")}},zI=class extends iu{constructor(){super(...arguments);k(this,"_key","translation_text")}};/* hg(e): true when e is an array whose every element has both a role and a content property (a chat). */function hg(e){return Array.isArray(e)&&e.every(t=>"role"in t&&"content"in t)}var BI=class 
/* BI (registered as text-generation): defaults to max_new_tokens 256. */extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{max_new_tokens:256})}/* _call(t,r): s records a batched input, n records chat input. Accepts a string, an array of strings, one chat or an array of chats; chats are rendered with apply_chat_template, after which special tokens are not added again and tokenizer_encode_kwargs are dropped. */async _call(t,r={}){let s=!1,n=!1,a=r.add_special_tokens??(this.tokenizer.add_bos_token||this.tokenizer.add_eos_token)??!1,o=r.tokenizer_encode_kwargs,i;if(typeof t=="string")i=t=[t];else if(Array.isArray(t)&&t.every(w=>typeof w=="string"))s=!0,i=t;else{if(hg(t))t=[t];else if(Array.isArray(t)&&t.every(hg))s=!0;else throw new Error("Input must be a string, an array of strings, a Chat, or an array of Chats");n=!0,i=t.map(w=>this.tokenizer.apply_chat_template(w,{tokenize:!1,add_generation_prompt:!0,...o})),a=!1,o=void 0}/* return_full_text is forced off for chats and defaults to true otherwise. The tokenizer is switched to left padding before encoding. */const l=n?!1:r.return_full_text??!0;this.tokenizer.padding_side="left";const c=this.tokenizer(i,{add_special_tokens:a,padding:!0,truncation:!0,...o}),d=await this.model.generate({...c,...this._default_generation_config,...r}),h=this.tokenizer.batch_decode(d,{skip_special_tokens:!0});/* When the prompt must be stripped, the decoded prompt length in characters is computed per input and sliced off each output. */let _;!l&&c.input_ids.dims.at(-1)>0&&(_=this.tokenizer.batch_decode(c.input_ids,{skip_special_tokens:!0}).map(w=>w.length));const p=Array.from({length:t.length},w=>[]);/* Output w is assigned to input index floor(w / number of outputs * number of inputs). For chats the generated text is appended to the conversation as an assistant message. */for(let w=0;w<h.length;++w){const v=Math.floor(w/d.dims[0]*t.length);_&&(h[w]=h[w].slice(_[v])),p[v].push({generated_text:n?[...t[v],{role:"assistant",content:h[w]}]:h[w]})}return!s&&p.length===1?p[0]:p}},/* RI (registered as zero-shot-classification): the constructor lower-cases the keys of config.label2id and resolves the entailment id (fallback 2 with a warning) and the contradiction id (contradiction, else not_entailment, else 0 with a warning). */RI=class extends Ve{constructor(e){super(e),this.label2id=Object.fromEntries(Object.entries(this.model.config.label2id).map(([t,r])=>[t.toLowerCase(),r])),this.entailment_id=this.label2id.entailment,this.entailment_id===void 0&&(ue.warn("Could not find 'entailment' in label2id mapping. Using 2 as entailment_id."),this.entailment_id=2),this.contradiction_id=this.label2id.contradiction??this.label2id.not_entailment,this.contradiction_id===void 0&&(ue.warn("Could not find 'contradiction' in label2id mapping. 
Using 0 as contradiction_id."),this.contradiction_id=0)}/* _call(e,t,options): builds one hypothesis per candidate label by replacing the first {} in hypothesis_template, and runs the model once per (sequence, hypothesis) pair. With multi_label or a single label each label is scored independently via nt over [contradiction, entailment] (index 1 is kept); otherwise nt runs over the entailment logits of all labels. nt is presumably softmax - confirm. Labels are returned sorted by descending score. */async _call(e,t,{hypothesis_template:r="This example is {}.",multi_label:s=!1}={}){const n=Array.isArray(e);n||(e=[e]),Array.isArray(t)||(t=[t]);const a=t.map(l=>r.replace("{}",l)),o=s||t.length===1,i=[];for(const l of e){const c=[];for(const _ of a){const p=this.tokenizer(l,{text_pair:_,padding:!0,truncation:!0}),w=await this.model(p);o?c.push([w.logits.data[this.contradiction_id],w.logits.data[this.entailment_id]]):c.push(w.logits.data[this.entailment_id])}const h=(o?c.map(_=>nt(_)[1]):nt(c)).map((_,p)=>[_,p]).sort((_,p)=>p[0]-_[0]);i.push({sequence:l,labels:h.map(_=>t[_[1]]),scores:h.map(_=>_[0])})}return n?i:i[0]}},/* GI (registered as audio-classification): loads inputs through Is with the feature extractor sampling_rate, runs processor and model one input at a time, and returns the top_k label/score pairs from the first logits row. */GI=class extends Ve{async _call(e,{top_k:t=5}={}){const r=this.processor.feature_extractor.config.sampling_rate,s=await Is(e,r),n=this.model.config.id2label,a=[];for(const o of s){const i=await this.processor(o),c=(await this.model(i)).logits[0],d=await hs(new U("float32",nt(c.data),c.dims),t),h=d[0].tolist(),p=d[1].tolist().map((w,v)=>({label:n?n[w]:`LABEL_${w}`,score:h[v]}));a.push(p)}return Array.isArray(e)?a:a[0]}},/* $I (registered as zero-shot-audio-classification): tokenizes one hypothesis per candidate label once, then for each audio input merges text and audio features, and scores labels with nt over logits_per_audio. Scores are returned in candidate order (no sorting here). t must be an array since t.map is called directly. */$I=class extends Ve{async _call(e,t,{hypothesis_template:r="This is a sound of {}."}={}){const s=!Array.isArray(e);s&&(e=[e]);const n=t.map(c=>r.replace("{}",c)),a=this.tokenizer(n,{padding:!0,truncation:!0}),o=this.processor.feature_extractor.config.sampling_rate,i=await Is(e,o),l=[];for(const c of i){const d=await this.processor(c),h=await this.model({...a,...d}),_=nt(h.logits_per_audio.data);l.push([..._].map((p,w)=>({score:p,label:t[w]})))}return s?l[0]:l}},/* VI (registered as automatic-speech-recognition): _call merges caller options over an empty default config and dispatches on config.model_type. */VI=class extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{})}async _call(t,r={}){switch(r={...this._default_generation_config,...r},this.model.config.model_type){case"whisper":case"lite-whisper":return this._call_whisper(t,r);case"wav2vec2":case"wav2vec2-bert":case"unispeech":case"unispeech-sat":case"hubert":case"parakeet_ctc":return this._call_wav2vec2(t,r);case"moonshine":return 
/* Remaining speech-recognition dispatch cases; any other model_type throws. */this._call_moonshine(t,r);case"cohere_asr":return this._call_cohere_asr(t,r);default:throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)}}/* _call_wav2vec2(t,r): warns that language and task options are ignored, then for each audio input takes index 1 of je(...) per frame of the first logits row (presumably the argmax id - confirm) and decodes the id sequence to trimmed text. */async _call_wav2vec2(t,r){r.language&&ue.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".'),r.task&&ue.warn('`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".');const s=!Array.isArray(t),n=s?[t]:t,a=this.processor.feature_extractor.config.sampling_rate,o=await Is(n,a),i=[];for(const l of o){const c=await this.processor(l),h=(await this.model(c)).logits[0],_=[];for(const w of h)_.push(je(w.data)[1]);const p=this.tokenizer.decode(_,{skip_special_tokens:!0}).trim();i.push({text:p})}return s?i[0]:i}/* _call_whisper(t,r): options read are return_timestamps (false), chunk_length_s (0 means no chunking), force_full_sequences (false) and stride_length_s (null). Requesting word timestamps also turns on return_token_timestamps and return_timestamps in the generation options. */async _call_whisper(t,r){const s=r.return_timestamps??!1,n=r.chunk_length_s??0,a=r.force_full_sequences??!1;let o=r.stride_length_s??null;const i={...r};s==="word"&&(i.return_token_timestamps=!0,i.return_timestamps=!0);const l=!Array.isArray(t),c=l?[t]:t,d=this.processor.feature_extractor.config,h=d.chunk_length/this.model.config.max_source_positions,_=d.hop_length,p=d.sampling_rate,w=await Is(c,p),v=[];for(const y of w){let M=[];/* Chunking: stride defaults to chunk_length_s / 6 and must be smaller than the chunk. C is the window length and S the stride, both in samples; the window advances by C - 2*S. Each chunk stores stride as [chunk length, left stride, right stride], with no left stride on the first chunk and no right stride on the last. */if(n>0){if(o===null)o=n/6;else if(n<=o)throw Error("`chunk_length_s` must be larger than `stride_length_s`.");const C=p*n,S=p*o,N=C-2*S;let x=0;for(;;){const R=x+C,z=y.subarray(x,R),$=await this.processor(z),Q=x===0,H=R>=y.length;if(M.push({stride:[z.length,Q?0:S,H?0:S],input_features:$.input_features,is_last:H}),H)break;x+=N}}else M=[{stride:[y.length,0,0],input_features:(await this.processor(y)).input_features,is_last:!0}];/* Generate per chunk; num_frames is the chunk length in samples divided by hop_length, rounded down. */for(const C of M){i.num_frames=Math.floor(C.stride[0]/_);const S=await this.model.generate({inputs:C.input_features,...i});if(s==="word"){const 
/* Word-timestamp branch of the whisper chunk loop: tokens and their timestamps are cut to start at the first token id >= tokenizer.timestamp_begin (or at 0 when none is found); timestamps go through tn(x,2) (presumably rounding to 2 decimals - confirm). */N=S.sequences.tolist()[0],x=S.token_timestamps.tolist()[0],R=this.tokenizer.timestamp_begin,z=Math.max(N.findIndex($=>Number($)>=R),0);C.tokens=N.slice(z),C.token_timestamps=x.slice(z).map($=>tn($,2))}else C.tokens=S[0].tolist();/* Strides are converted from samples to seconds by dividing by the sampling rate. */C.stride=C.stride.map(N=>N/p)}const[T,A]=this.tokenizer._decode_asr(M,{time_precision:h,return_timestamps:s,force_full_sequences:a});v.push({text:T,...A})}return l?v[0]:v}/* _call_moonshine(t,r): caps generation at 6 new tokens per whole second of audio unless the caller overrides max_new_tokens, and decodes with processor.batch_decode. */async _call_moonshine(t,r){const s=!Array.isArray(t),n=s?[t]:t,a=this.processor.feature_extractor.config.sampling_rate,o=await Is(n,a),i=[];for(const l of o){const c=await this.processor(l),d=Math.floor(l.length/a)*6,h=await this.model.generate({max_new_tokens:d,...r,...c}),_=this.processor.batch_decode(h,{skip_special_tokens:!0})[0];i.push({text:_})}return s?i[0]:i}/* _call_cohere_asr(t,r): language defaults to en and selects the decoder prompt ids; each input is split by feature_extractor.split_audio, every piece is transcribed separately, and the pieces are joined by the static join_chunks of the processor class. */async _call_cohere_asr(t,r){const s=!Array.isArray(t),n=s?[t]:t,a=this.processor.feature_extractor,o=a.config.sampling_rate,i=await Is(n,o),l=r.language??"en",c=this.processor.get_decoder_prompt_ids(l),d=[];for(const h of i){const _=a.split_audio(h),p=[];for(const v of _){const y=await this.processor(v),M=await this.model.generate({...y,decoder_input_ids:c,...r}),T=this.tokenizer.decode(M[0].tolist(),{skip_special_tokens:!0}).trim();p.push(T)}const w=this.processor.constructor.join_chunks(p,l);d.push({text:w})}return s?d[0]:d}},/* UI (registered as text-to-audio): keeps an optional vocoder passed at construction and a default vocoder id. */UI=class extends Ve{constructor(t){super(t);k(this,"DEFAULT_VOCODER_ID","Xenova/speecht5_hifigan");this.vocoder=t.vocoder??null}/* _prepare_speaker_embeddings(t,r): a string or URL is fetched through be.fetch and read as Float32Array; a Float32Array becomes a 1-D tensor U; anything that is not a tensor throws. For batch size r > 1 a first dimension of 1 is repeated r times and any other mismatch throws. */async _prepare_speaker_embeddings(t,r){if((typeof t=="string"||t instanceof URL)&&(t=new Float32Array(await(await be.fetch(t)).arrayBuffer())),t instanceof Float32Array)t=new U("float32",t,[t.length]);else if(!(t instanceof U))throw new Error("Speaker embeddings must be a `Tensor`, `Float32Array`, `string`, or `URL`.");if(r>1){if(t.dims[0]===1)t=t.repeat(r,1);else if(t.dims[0]!==r)throw new Error(`Expected speaker embeddings batch size to be 1 or ${r}, but got ${t.dims[0]}.`)}return t}/* _postprocess_waveform(t,r,s,n): splits waveform tensor r into one audio object per row; body follows. */_postprocess_waveform(t,r,s,n=null){const 
a=r.data,[o,i]=r.dims,l=n?n.data:null,c=[];for(let d=0;d<o;++d){const h=l?Math.min(Math.ceil(l[d]),i):i,_=d*i;c.push(new $1(a.slice(_,_+h),s))}return Array.isArray(t)?c:c[0]}async _call(t,r){return this.processor?this._call_text_to_spectrogram(t,r):this.model.config.model_type==="supertonic"?this._call_supertonic(t,r):this._call_text_to_waveform(t)}async _call_supertonic(t,{speaker_embeddings:r,num_inference_steps:s,speed:n}){if(!r)throw new Error("Speaker embeddings must be provided for Supertonic models.");const{sampling_rate:a,style_dim:o}=this.model.config,i=this.tokenizer(t,{padding:!0,truncation:!0}),l=i.input_ids.dims[0];r=await this._prepare_speaker_embeddings(r,l),r=r.view(l,-1,o);const{waveform:c,durations:d}=await this.model.generate_speech({...i,style:r,num_inference_steps:s,speed:n});return this._postprocess_waveform(t,c,a,d)}async _call_text_to_waveform(t){const r=this.tokenizer(t,{padding:!0,truncation:!0}),{waveform:s}=await this.model(r),n=this.model.config.sampling_rate;return this._postprocess_waveform(t,s,n)}async _call_text_to_spectrogram(t,{speaker_embeddings:r}){this.vocoder||(ue.info("No vocoder specified, using default HifiGan vocoder."),this.vocoder=await mn.from_pretrained(this.DEFAULT_VOCODER_ID,{dtype:"fp32"}));const{input_ids:s}=this.tokenizer(t,{padding:!0,truncation:!0}),n=s.dims[0];r=await this._prepare_speaker_embeddings(r,n),r=r.view(n,-1);const{waveform:a}=await this.model.generate_speech(s,r,{vocoder:this.vocoder}),o=this.processor.feature_extractor.config.sampling_rate;return this._postprocess_waveform(t,a,o)}},jI=class extends Ve{async _call(e,t={}){const r=Array.isArray(e),s=await Wt(e),{pixel_values:n}=await this.processor(s),a=[];for(const o of n){o.dims=[1,...o.dims];const i=await this.model.generate({inputs:o,...t}),l=this.tokenizer.batch_decode(i,{skip_special_tokens:!0}).map(c=>({generated_text:c.trim()}));a.push(l)}return r?a:a[0]}},qI=class extends Ve{async _call(e,{top_k:t=5}={}){const r=await 
Wt(e),{pixel_values:s}=await this.processor(r),n=await this.model({pixel_values:s}),{id2label:a}=this.model.config,o=[];for(const i of n.logits){const l=await hs(new U("float32",nt(i.data),i.dims),t),c=l[0].tolist(),h=l[1].tolist().map((_,p)=>({label:a?a[_]:`LABEL_${_}`,score:c[p]}));o.push(h)}return Array.isArray(e)?o:o[0]}},fg={panoptic:"post_process_panoptic_segmentation",instance:"post_process_instance_segmentation",semantic:"post_process_semantic_segmentation"},_g=class extends Ve{async _call(e,{threshold:t=.5,mask_threshold:r=.5,overlap_mask_area_threshold:s=.8,label_ids_to_fuse:n=null,target_sizes:a=null,subtask:o=null}={}){if(Array.isArray(e)&&e.length!==1)throw Error("Image segmentation pipeline currently only supports a batch size of 1.");const l=await Wt(e),c=l.map(M=>[M.height,M.width]),d=await this.processor(l),{inputNames:h,outputNames:_}=this.model.sessions.model;if(!h.includes("pixel_values")){if(h.length!==1)throw Error(`Expected a single input name, but got ${h.length} inputs: ${h}.`);const M=h[0];if(M in d)throw Error(`Input name ${M} already exists in the inputs.`);d[M]=d.pixel_values}const p=await this.model(d);let w=null;if(o!==null)w=fg[o];else if(this.processor.image_processor){for(const[M,T]of Object.entries(fg))if(T in this.processor.image_processor){w=this.processor.image_processor[T].bind(this.processor.image_processor),o=M;break}}const v=this.model.config.id2label,y=[];if(o)if(o==="panoptic"||o==="instance"){const M=w(p,t,r,s,n,a??c)[0],T=M.segmentation;for(const A of M.segments_info){const C=new Uint8ClampedArray(T.data.length);for(let N=0;N<T.data.length;++N)T.data[N]===A.id&&(C[N]=255);const S=new jt(C,T.dims[1],T.dims[0],1);y.push({score:A.score,label:v[A.label_id],mask:S})}}else if(o==="semantic"){const{segmentation:M,labels:T}=w(p,a??c)[0];for(const A of T){const C=new Uint8ClampedArray(M.data.length);for(let N=0;N<M.data.length;++N)M.data[N]===A&&(C[N]=255);const S=new 
/* Tail of the image-segmentation call: semantic masks carry a null score; an unknown subtask throws. Without any subtask the first model output is used directly: a sigmoid is applied in place when any value lies outside [0,1] (with 1e-5 tolerance), then the map is scaled to uint8 and resized to the original image size, with null label and score. */jt(C,M.dims[1],M.dims[0],1);y.push({score:null,label:v[A],mask:S})}}else throw Error(`Subtask ${o} not supported.`);else{const T=p[_[0]];for(let A=0;A<c.length;++A){const C=c[A],S=T[A];S.data.some(x=>x<-1e-5||x>1+1e-5)&&S.sigmoid_();const N=await jt.fromTensor(S.mul_(255).to("uint8")).resize(C[1],C[0]);y.push({label:null,score:null,mask:N})}}return y}},/* WI (registered as background-removal): runs the segmentation pipeline above and applies result o's mask as the alpha channel of a clone of image o. */WI=class extends _g{async _call(e,t={}){const r=await Wt(e),s=await super._call(e,t),n=r.map((a,o)=>{const i=a.clone();return i.putAlpha(s[o].mask),i});return Array.isArray(e)?n:n[0]}},/* HI (registered as zero-shot-image-classification): one hypothesis per candidate label; siglip models pad text to max_length and score with sigmoid, all others use nt (presumably softmax - confirm). Results are sorted by descending score. */HI=class extends Ve{async _call(e,t,{hypothesis_template:r="This is a photo of {}"}={}){const s=Array.isArray(e),n=await Wt(e),a=t.map(h=>r.replace("{}",h)),o=this.tokenizer(a,{padding:this.model.config.model_type==="siglip"?"max_length":!0,truncation:!0}),{pixel_values:i}=await this.processor(n),l=await this.model({...o,pixel_values:i}),c=this.model.config.model_type==="siglip"?h=>h.sigmoid().data:h=>nt(h.data),d=[];for(const h of l.logits_per_image){const p=[...c(h)].map((w,v)=>({score:w,label:t[v]}));p.sort((w,v)=>v.score-w.score),d.push(p)}return s?d:d[0]}},/* QI (registered as object-detection): batch size 1 only. With percentage false the image sizes are passed to post-processing and box coordinates are truncated to integers by ou; with percentage true no sizes are passed and coordinates are left as returned. */QI=class extends Ve{async _call(e,{threshold:t=.9,percentage:r=!1}={}){const s=Array.isArray(e);if(s&&e.length!==1)throw Error("Object detection pipeline currently only supports a batch size of 1.");const n=await Wt(e),a=r?null:n.map(_=>[_.height,_.width]),{pixel_values:o,pixel_mask:i}=await this.processor(n),l=await this.model({pixel_values:o,pixel_mask:i}),c=this.processor.image_processor.post_process_object_detection(l,t,a),{id2label:d}=this.model.config,h=c.map(_=>_.boxes.map((p,w)=>({score:_.scores[w],label:d[_.classes[w]],box:ou(p,!r)})));return s?h:h[0]}},/* XI (registered as zero-shot-object-detection): candidate labels t are tokenized once; every image is run through the model separately after adding a leading dimension with unsqueeze_(0). */XI=class extends Ve{async _call(e,t,{threshold:r=.1,top_k:s=null,percentage:n=!1}={}){const a=Array.isArray(e),o=await Wt(e),i=this.tokenizer(t,{padding:!0,truncation:!0}),l=await this.processor(o),c=[];for(let d=0;d<o.length;++d){const h=o[d],_=n?null:[[h.height,h.width]],p=l.pixel_values[d].unsqueeze_(0),w=await 
/* Zero-shot object detection, continued: processors exposing post_process_grounded_object_detection use it (the same threshold serves as box and text threshold, labels come from the post-processor); otherwise the image processor post_process_object_detection is used and class indices are mapped onto the candidate labels. Detections are sorted by descending score and optionally cut to top_k. */this.model({...i,pixel_values:p});let v;if("post_process_grounded_object_detection"in this.processor){const y=this.processor.post_process_grounded_object_detection(w,i.input_ids,{box_threshold:r,text_threshold:r,target_sizes:_})[0];v=y.boxes.map((M,T)=>({score:y.scores[T],label:y.labels[T],box:ou(M,!n)}))}else{const y=this.processor.image_processor.post_process_object_detection(w,r,_,!0)[0];v=y.boxes.map((M,T)=>({score:y.scores[T],label:t[y.classes[T]],box:ou(M,!n)}))}v.sort((y,M)=>M.score-y.score),s!==null&&(v=v.slice(0,s)),c.push(v)}return a?c:c[0]}},/* YI (registered as document-question-answering): batch size 1 only. The question is wrapped in the s_docvqa / s_question / s_answer prompt and used as decoder_input_ids; max_length comes from the decoder config. The answer is the trimmed text between the s_answer tags of the decoded output, or null when the pattern does not match. Always returns a one-element array. */YI=class extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{max_new_tokens:256})}async _call(t,r,s={}){if(Array.isArray(t)){if(t.length!==1)throw Error("Document Question Answering pipeline currently only supports a batch size of 1.");t=t[0]}const n=(await Wt(t))[0],{pixel_values:a}=await this.processor(n),o=`<s_docvqa><s_question>${r}</s_question><s_answer>`,i=this.tokenizer(o,{add_special_tokens:!1,padding:!0,truncation:!0}).input_ids,l=await this.model.generate({inputs:a,max_length:this.model.config.decoder.max_position_embeddings,decoder_input_ids:i,...this._default_generation_config,...s}),d=this.tokenizer.batch_decode(l)[0].match(/<s_answer>(.*?)<\/s_answer>/);let h=null;return d&&d.length>=2&&(h=d[1].trim()),[{answer:h}]}},/* JI (registered as image-to-image): every reconstruction tensor is squeezed, clamped to [0,1], scaled by 255, rounded, cast to uint8 and turned into an image. */JI=class extends Ve{async _call(e){const t=await Wt(e),r=await this.processor(t),s=await this.model(r),n=[];for(const a of s.reconstruction){const o=a.squeeze().clamp_(0,1).mul_(255).round_().to("uint8");n.push(jt.fromTensor(o))}return Array.isArray(e)?n:n[0]}},/* KI (registered as depth-estimation): for each image the last two dims of predicted_depth and the image size are read; the interpolation follows. */KI=class extends Ve{async _call(e){const t=await Wt(e),r=await this.processor(t),{predicted_depth:s}=await this.model(r),n=[];for(let a=0;a<t.length;++a){const o=s[a],[i,l]=o.dims.slice(-2),[c,d]=t[a].size,h=(await 
/* Depth estimation, continued: the prediction is resized with Ut in bilinear mode to size [d,c] taken from the image size, then min-max scaled to 0..255 as uint8 for the depth image; the resized float tensor is returned as predicted_depth. NOTE(review): a constant depth map makes p-_ zero in the division - confirm whether that can occur. */Ut(o.view(1,1,i,l),{size:[d,c],mode:"bilinear"})).view(d,c),_=h.min().item(),p=h.max().item(),w=h.sub(_).div_(p-_).mul_(255).to("uint8").unsqueeze(0),v=jt.fromTensor(w);n.push({predicted_depth:h,depth:v})}return Array.isArray(e)?n:n[0]}},/* ZI (registered as feature-extraction): uses last_hidden_state, else logits, else token_embeddings. Pooling is none, mean (Tx with the attention mask), first_token/cls (position 0) or last_token/eos (position -1); any other value throws. Optional L2 normalisation over the last axis and optional quantisation through Cx with the given precision. */ZI=class extends Ve{async _call(e,{pooling:t="none",normalize:r=!1,quantize:s=!1,precision:n="binary"}={}){const a=this.tokenizer(e,{padding:!0,truncation:!0}),o=await this.model(a);let i=o.last_hidden_state??o.logits??o.token_embeddings;switch(t){case"none":break;case"mean":i=Tx(i,a.attention_mask);break;case"first_token":case"cls":i=i.slice(null,0);break;case"last_token":case"eos":i=i.slice(null,-1);break;default:throw Error(`Pooling method '${t}' not supported.`)}return r&&(i=i.normalize(2,-1)),s&&(i=Cx(i,n)),i}},/* eO (registered as image-feature-extraction): with pool set, returns pooler_output and throws when the model did not produce one; otherwise returns last_hidden_state, else logits, else image_embeds. */eO=class extends Ve{async _call(e,{pool:t=null}={}){const r=await Wt(e),{pixel_values:s}=await this.processor(r),n=await this.model({pixel_values:s});let a;if(t){if(!("pooler_output"in n))throw Error("No pooled output was returned. Make sure the model has a 'pooler' layer when using the 'pool' option.");a=n.pooler_output}else a=n.last_hidden_state??n.logits??n.image_embeds;return 
a}},/* Ua: frozen task registry. Each entry names the pipeline class, the auto-class (or an ordered array of candidate auto-classes) used to load the model, the default model id with an optional default dtype, and a type of text, audio, image or multimodal that the file-listing helper further down uses to decide whether tokenizer and processor files are needed. */Ua=Object.freeze({"text-classification":{pipeline:LI,model:lg,default:{model:"Xenova/distilbert-base-uncased-finetuned-sst-2-english"},type:"text"},"token-classification":{pipeline:II,model:pI,default:{model:"Xenova/bert-base-multilingual-cased-ner-hrl"},type:"text"},"question-answering":{pipeline:OI,model:bI,default:{model:"Xenova/distilbert-base-cased-distilled-squad"},type:"text"},"fill-mask":{pipeline:NI,model:yI,default:{model:"onnx-community/ettin-encoder-32m-ONNX",dtype:"fp32"},type:"text"},summarization:{pipeline:DI,model:au,default:{model:"Xenova/distilbart-cnn-6-6"},type:"text"},translation:{pipeline:zI,model:au,default:{model:"Xenova/t5-small"},type:"text"},"text2text-generation":{pipeline:iu,model:au,default:{model:"Xenova/flan-t5-small"},type:"text"},"text-generation":{pipeline:BI,model:vI,default:{model:"onnx-community/Qwen3-0.6B-ONNX",dtype:"q4"},type:"text"},"zero-shot-classification":{pipeline:RI,model:lg,default:{model:"Xenova/distilbert-base-uncased-mnli"},type:"text"},"audio-classification":{pipeline:GI,model:AI,default:{model:"Xenova/wav2vec2-base-superb-ks"},type:"audio"},"zero-shot-audio-classification":{pipeline:$I,model:mn,default:{model:"Xenova/clap-htsat-unfused"},type:"multimodal"},"automatic-speech-recognition":{pipeline:VI,model:[mI,EI],default:{model:"Xenova/whisper-tiny.en"},type:"multimodal"},"text-to-audio":{pipeline:UI,model:[wI,gI],default:{model:"onnx-community/Supertonic-TTS-ONNX",dtype:"fp32"},type:"text"},"image-to-text":{pipeline:jI,model:MI,default:{model:"Xenova/vit-gpt2-image-captioning"},type:"multimodal"},"image-classification":{pipeline:qI,model:xI,default:{model:"Xenova/vit-base-patch16-224"},type:"multimodal"},"image-segmentation":{pipeline:_g,model:[cg,ug,dg],default:{model:"Xenova/detr-resnet-50-panoptic"},type:"multimodal"},"background-removal":{pipeline:WI,model:[cg,ug,dg],default:{model:"Xenova/modnet"},type:"image"},"zero-shot-image-classification":{pipeline:HI,model:mn,default:{model:"Xenova/clip-vit-base-p
atch32"},type:"multimodal"},"object-detection":{pipeline:QI,model:kI,default:{model:"Xenova/detr-resnet-50"},type:"multimodal"},"zero-shot-object-detection":{pipeline:XI,model:TI,default:{model:"Xenova/owlvit-base-patch32"},type:"multimodal"},"document-question-answering":{pipeline:YI,model:CI,default:{model:"Xenova/donut-base-finetuned-docvqa"},type:"multimodal"},"image-to-image":{pipeline:JI,model:SI,default:{model:"Xenova/swin2SR-classical-sr-x2-64"},type:"image"},"depth-estimation":{pipeline:KI,model:PI,default:{model:"onnx-community/depth-anything-v2-small"},type:"image"},"feature-extraction":{pipeline:ZI,model:mn,default:{model:"onnx-community/all-MiniLM-L6-v2-ONNX",dtype:"fp32"},type:"text"},"image-feature-extraction":{pipeline:eO,model:[FI,mn],default:{model:"onnx-community/dinov3-vits16-pretrain-lvd1689m-ONNX",dtype:"fp32"},type:"image"}}),/* pg: frozen alias table mapping short task names onto Ua keys. */pg=Object.freeze({"sentiment-analysis":"text-classification",ner:"token-classification",asr:"automatic-speech-recognition","text-to-speech":"text-to-audio",embeddings:"feature-extraction"});/* tO(e): requires a model id; returns [sn] when Zs reports that file sn exists for the model, otherwise an empty array (sn and Zs are defined outside this chunk). */async function tO(e){if(!e)throw new Error("modelId is required");return(await Zs(e,sn,{})).exists?[sn]:[]}/* rO(e, options): starts from the file list returned by sp and appends the lists from Kf (when include_tokenizer) and tO (when include_processor); both flags default to true. */async function rO(e,{config:t=null,dtype:r=null,device:s=null,model_file_name:n=null,include_tokenizer:a=!0,include_processor:o=!0}={}){const i=await sp(e,{config:t,dtype:r,device:s,model_file_name:n});if(a){const l=await Kf(e);i.push(...l)}if(o){const l=await tO(e);i.push(...l)}return i}/* sO(e,t,r): resolves aliases through pg and looks the task up in Ua, throwing for unknown tasks. NOTE(review): this lookup uses the whole task string, while the pipeline factory further down looks Ua up by the part before the first underscore - confirm the difference is intended. */async function sO(e,t,r={}){e=pg[e]??e;const s=Ua[e];if(!s)throw new Error(`Unsupported pipeline task: ${e}. 
Must be one of [${Object.keys(Ua).join(", ")}]`);const{type:n}=s,i=await rO(t,{...r,include_tokenizer:n!=="audio"&&n!=="image",include_processor:n!=="text"});if(e==="text-generation"){const l=await rp(t,r),c=tp(l),d=mE(c);if(d){const h=Object.values(d).map(_=>`onnx/${_}`);return i.filter(_=>!_.startsWith("onnx/")||h.some(p=>_.startsWith(p)))}}return i}async function nO(e,t=null,{progress_callback:r=null,config:s=null,cache_dir:n=null,local_files_only:a=!1,revision:o="main",device:i=null,dtype:l=null,subfolder:c="onnx",use_external_data_format:d=null,model_file_name:h=null,session_options:_={}}={}){e=pg[e]??e;const p=Ua[e.split("_",1)[0]];if(!p)throw Error(`Unsupported pipeline: ${e}. Must be one of [${Object.keys(Ua)}]`);t||(t=p.default.model,ue.info(`No model specified. Using default model: "${t}".`),!l&&p.default.dtype&&(l=p.default.dtype));const w=await sO(e,t,{device:i,dtype:l});let v={};r&&(await Promise.all(w.map(async Q=>Zs(t,Q)))).forEach((Q,H)=>{Q.exists&&(v[w[H]]={loaded:0,total:Q.size??0})});const y={progress_callback:r?new _i(r,v):void 0,config:s,cache_dir:n,local_files_only:a,revision:o,device:i,dtype:l,subfolder:c,use_external_data_format:d,model_file_name:h,session_options:_},M=w.includes("tokenizer.json"),T=w.includes("preprocessor_config.json"),A=p.model;let C;if(Array.isArray(A)){const $=s??await nn.from_pretrained(t,y),{model_type:Q}=$,H=A.find(D=>D.supports(Q));if(!H)throw Error(`Unsupported model type "${Q}" for task "${e}". 
None of the candidate model classes support this type.`);C=H.from_pretrained(t,{...y,config:$})}else C=A.from_pretrained(t,y);const[S,N,x]=await Promise.all([M?Te.from_pretrained(t,y):null,T?UT.from_pretrained(t,y):null,C]),R={task:e,model:x};S&&(R.tokenizer=S),N&&(R.processor=N),Cr(r,{status:"ready",task:e,model:t});const z=p.pipeline;return new z(R)}fe.IS_PROCESS_AVAILABLE;let ja=null;const aO="Xenova/whisper-base.en";async function oO(){ja||(self.postMessage({type:"status",status:"loading",message:"Downloading speech model..."}),ja=await nO("automatic-speech-recognition",aO,{dtype:"fp32",device:"wasm"}),self.postMessage({type:"status",status:"ready",message:"Speech model ready"}))}self.onmessage=async e=>{var r;const{type:t}=e.data;if(t==="load"){try{await oO()}catch(s){const n=s instanceof Error?s.message:"Model load failed",a=s instanceof Error?s.stack:"";console.error("[whisperWorker] load failed:",n,a),self.postMessage({type:"error",error:n})}return}if(t==="transcribe"){const s=e.data.audio;if(!ja){self.postMessage({type:"error",error:"Model not loaded"});return}try{const a=((r=(await ja(s,{chunk_length_s:30,stride_length_s:5})).text)==null?void 0:r.trim())??"";self.postMessage({type:"result",text:a})}catch(n){const a=n instanceof Error?n.message:"Transcription failed";self.postMessage({type:"error",error:a})}}}})();