npm - claude-code-session-manager - Versions diffs - 0.3.1 → 0.4.0 - Mend

claude-code-session-manager 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (29) hide show

package/dist/assets/{whisperWorker-HvcbMQn6.js → whisperWorker-ivwFFLMj.js} RENAMED Viewed

@@ -38,4 +38,4 @@ ${this.boa_token}${this.audio_token.repeat(this._compute_audio_num_tokens(o[l++]
 `}return o+=`
 ${s}${a}`+n.repeat(e)+`${s}`,o}function mT(e,t,r,s){return`${t}${s}`+r.repeat(e)+`${t}`}function gT(e,t,r,s,n,a){return e===0&&t===0?mT(r,s,n,a):pT(r,e,t,s,n,a)}var U_=(Tn=class extends Ee{constructor(){super(...arguments);k(this,"fake_image_token","<fake_token_around_image>");k(this,"image_token","<image>");k(this,"global_img_token","<global-img>")}async _call(t,r=null,s={}){s.return_row_col_info??(s.return_row_col_info=!0);let n;r&&(n=await this.image_processor(r,s)),Array.isArray(t)||(t=[t]);const a=n.rows??[new Array(t.length).fill(0)],o=n.cols??[new Array(t.length).fill(0)],i=this.config.image_seq_len,l=[],c=[];for(let h=0;h<t.length;++h){const _=t[h],p=a[h],w=o[h];l.push(Dy(_,this.image_token));const v=p.map((T,A)=>gT(T,w[A],i,this.fake_image_token,this.image_token,this.global_img_token)),y=_.split(this.image_token);if(y.length===0)throw new Error("The image token should be present in the text.");let M=y[0];for(let T=0;T<v.length;++T)M+=v[T]+y[T+1];c.push(M)}return{...this.tokenizer(c),...n}}},k(Tn,"image_processor_class",dt),k(Tn,"tokenizer_class",Te),k(Tn,"uses_processor_config",!0),Tn),wT=(En=class extends Ee{constructor(e,t,r){super(e,t,r),this.image_tag=this.config.image_tag,this.image_start_tag=this.config.image_start_tag,this.image_end_tag=this.config.image_end_tag,this.num_image_tokens=this.config.num_image_tokens}async _call(e,{images:t=null,chat_template:r="default"}={}){t?Array.isArray(t)||(t=[t]):t=await Promise.all(e.filter(v=>v.images).flatMap(v=>v.images).map(v=>qt.read(v)));const s=this.tokenizer,n=s.apply_chat_template(e,{tokenize:!1,add_generation_prompt:!0,chat_template:r}),a=v=>s.encode(v,{add_special_tokens:!1}),o=n.split(this.image_tag),i=o.length-1;if(t.length!==i)throw new Error(`Number of images provided (${t.length}) does not match number of "${this.image_tag}" image tags (${i})`);const[l,c,d]=s.convert_tokens_to_ids([this.image_tag,this.image_start_tag,this.image_end_tag]);let h=a(o[0]),_=new Array(h.length).fill(!1);for(let 
v=1;v<o.length;++v){const y=new Array(this.num_image_tokens).fill(l),M=a(o[v]);h=Jt(h,[c],y,[d],M);const T=new Array(this.num_image_tokens).fill(!0);_=Jt(_,[!1],T,[!1],new Array(M.length).fill(!1))}const p=[1,h.length],w={input_ids:new U("int64",h,p),attention_mask:new U("int64",new Array(h.length).fill(1),p),images_seq_mask:new U("bool",_,p),images_emb_mask:new U("bool",new Array(i*this.num_image_tokens).fill(!0),[1,i,this.num_image_tokens])};if(t&&t.length>0){const v=await this.image_processor(t);return v.pixel_values.unsqueeze_(0),{...w,...v}}return w}},k(En,"image_processor_class",dt),k(En,"tokenizer_class",Te),k(En,"uses_processor_config",!0),En),vT=(Ya=class extends Ee{async _call(e=null,t=null,r={}){if(!e&&!t)throw new Error("Either text or images must be provided");const s=e?this.tokenizer(e,r):{},n=t?await this.image_processor(t,r):{};return{...s,...n}}},k(Ya,"tokenizer_class",Te),k(Ya,"image_processor_class",dt),Ya),yT=(Ja=class extends Ee{async _call(e,t=null,r={}){const{image_rows:s,image_cols:n,image_sizes:a,...o}=await this.image_processor(e,{...r,return_row_col_info:!0});if(t){const i=this.config.image_token??"<image>",{tile_size:l=512,downsample_factor:c=2,encoder_patch_size:d=16,use_thumbnail:h=!0}=this.image_processor.config,_=T=>Math.ceil(Math.floor(T/d)/c),p=_(l)**2,w=this.config.image_start_token??"<|image_start|>",v=this.config.image_end_token??"<|image_end|>",y=this.config.image_thumbnail??"<|img_thumbnail|>";Array.isArray(t)||(t=[t]);let M=0;t=t.map(T=>{const A=T.split(i);return A[0]+A.slice(1).map(C=>{const S=M++,[N,x]=a[S],R=s[S],z=n[S],$=_(N)*_(x);let Q=w;if(R>1||z>1){const H=i.repeat(p);for(let D=0;D<R;++D)for(let I=0;I<z;++I)Q+=`<|img_row_${D+1}_col_${I+1}|>`+H;h&&(Q+=y+i.repeat($))}else Q+=i.repeat($);return Q+v+C}).join("")})}return{...o,...t?this.tokenizer(t,r):{}}}},k(Ja,"tokenizer_class",Te),k(Ja,"image_processor_class",dt),Ja),bT=(An=class extends Ee{async _call(e,t=null,r={}){const s=await 
this.image_processor(e,r);if(t){const[a,o]=s.pixel_values.dims.slice(-2),{image_token:i,patch_size:l,num_additional_image_tokens:c}=this.config,d=Math.floor(a/l)*Math.floor(o/l)+c;t=structuredClone(t),Array.isArray(t)||(t=[t]);for(let h=0;h<t.length;++h)t[h]=t[h].replace(i,i.repeat(d))}const n=t?this.tokenizer(t,r):{};return{...s,...n}}},k(An,"tokenizer_class",Te),k(An,"image_processor_class",dt),k(An,"uses_processor_config",!0),An),j_={char:["char_decode",1],bpe:["bpe_decode",2],wp:["wp_decode",102]},MT=(Ka=class extends Ee{get char_tokenizer(){return this.components.char_tokenizer}get bpe_tokenizer(){return this.components.bpe_tokenizer}get wp_tokenizer(){return this.components.wp_tokenizer}_decode_helper(e,t){if(!j_.hasOwnProperty(t))throw new Error(`Format ${t} is not supported.`);const[r,s]=j_[t],n=this[r].bind(this),[a,o]=e.dims,i=[],l=[],c=e.tolist();for(let h=0;h<a;++h){const _=c[h],p=[],w=[];for(let y=1;y<o;++y){const[M,T]=je(nt(_[y]));if(w.push(M),T==s)break;p.push(T)}const v=w.length>0?w.reduce((y,M)=>y*M,1):0;l.push(p),i.push(v)}return[n(l),i]}char_decode(e){return this.char_tokenizer.batch_decode(e).map(t=>t.replaceAll(" ",""))}bpe_decode(e){return this.bpe_tokenizer.batch_decode(e)}wp_decode(e){return this.wp_tokenizer.batch_decode(e).map(t=>t.replaceAll(" ",""))}batch_decode([e,t,r]){const[s,n]=this._decode_helper(e,"char"),[a,o]=this._decode_helper(t,"bpe"),[i,l]=this._decode_helper(r,"wp"),c=[],d=[];for(let h=0;h<s.length;++h){const[_,p]=je([n[h],o[h],l[h]]);c.push([s[h],a[h],i[h]][p]),d.push(_)}return{generated_text:c,scores:d,char_preds:s,bpe_preds:a,wp_preds:i}}static async from_pretrained(...e){const t=await super.from_pretrained(...e),r=await Te.from_pretrained("Xenova/gpt2"),s=await Te.from_pretrained("Xenova/bert-base-uncased");return t.components={image_processor:t.image_processor,char_tokenizer:t.tokenizer,bpe_tokenizer:r,wp_tokenizer:s},t}async _call(e,t=null){const r=await this.image_processor(e);return 
t&&(r.labels=this.tokenizer(t).input_ids),r}},k(Ka,"tokenizer_class",Te),k(Ka,"image_processor_class",dt),Ka),xT=(Za=class extends Ee{async _call(e){return await this.feature_extractor(e)}},k(Za,"tokenizer_class",Te),k(Za,"feature_extractor_class",Lt),Za),kT=(eo=class extends Ee{},k(eo,"tokenizer_class",Te),k(eo,"image_processor_class",dt),eo),ms="<image>";function TT(e,t,r,s,n){return`${s.repeat(r*n)}${t}${e}
 `}var ET=(Cn=class extends Ee{async _call(e,t=null,r={}){t||(ue.warn("You are using PaliGemma without a text prefix. It will perform as a picture-captioning model."),t=""),Array.isArray(e)||(e=[e]),Array.isArray(t)||(t=[t]);const s=this.tokenizer.bos_token,n=this.image_processor.config.image_seq_length;let a;t.some(l=>l.includes(ms))?a=t.map(l=>{const c=l.replaceAll(ms,ms.repeat(n)),d=c.lastIndexOf(ms),h=d===-1?0:d+ms.length;return c.slice(0,h)+s+c.slice(h)+`
-`}):(ue.warn("You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special image tokens in the text, as many tokens as there are images per each text. It is recommended to add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images each text has and add special tokens."),a=t.map(l=>TT(l,s,n,ms,e.length)));const o=this.tokenizer(a,r);return{...await this.image_processor(e,r),...o}}},k(Cn,"tokenizer_class",Te),k(Cn,"image_processor_class",dt),k(Cn,"uses_processor_config",!1),Cn),q_="<|image|>",AT=/<\|image_\d+\|>/g,CT=(to=class extends Ee{async _call(e,t=null,{padding:r=!0,truncation:s=!0,num_crops:n=null}={}){Array.isArray(e)||(e=[e]);let a,o;if(t){o=await this.image_processor(t,{num_crops:n});const{num_img_tokens:i}=o,l=e.map((d,h)=>d.split(AT).join(q_.repeat(i[h])));a=this.tokenizer(l,{padding:r,truncation:s});const c=this.tokenizer._tokenizer.token_to_id(q_);a.input_ids.map_(d=>d==c?-d:d)}else a=this.tokenizer(e);return{...a,...o}}},k(to,"image_processor_class",dt),k(to,"tokenizer_class",Te),to),ST=(Sn=class extends Ee{async _call(e,t=null,r={}){const s=await this.image_processor(e,r);if(t){const[a,o]=s.pixel_values.dims.slice(-2),{image_token:i,image_break_token:l,image_end_token:c,patch_size:d,spatial_merge_size:h}=this.config,_=d*h,p=Math.floor(a/_),w=Math.floor(o/_);t=structuredClone(t),Array.isArray(t)||(t=[t]);for(let v=0;v<t.length;++v){const y=i.repeat(w),M=y+l,T=y+c,A=M.repeat(p-1)+T;t[v]=t[v].replace(i,A)}}const n=t?this.tokenizer(t,r):{};return{...s,...n}}},k(Sn,"tokenizer_class",Te),k(Sn,"image_processor_class",dt),k(Sn,"uses_processor_config",!0),Sn),PT=(hu=class extends Ee{async _call(e){return await this.feature_extractor(e)}post_process_speaker_diarization(...e){return this.feature_extractor.post_process_speaker_diarization(...e)}get sampling_rate(){return this.feature_extractor.config.sampling_rate}},k(hu,"feature_extractor_class",g_),hu),W_=class extends Hi{},FT=class 
extends W_{},H_=(fu=class extends Ee{async _call(...e){return await this.image_processor(...e)}post_process_masks(...e){return this.image_processor.post_process_masks(...e)}reshape_input_points(...e){return this.image_processor.reshape_input_points(...e)}},k(fu,"image_processor_class",dt),fu),Q_=class extends H_{},LT=class extends Q_{},IT=(ro=class extends Ee{async _call(e){return await this.feature_extractor(e)}},k(ro,"tokenizer_class",Te),k(ro,"feature_extractor_class",Lt),ro),OT=(Pn=class extends Ee{async _call(e,t=null,r={}){if(Array.isArray(e))throw new Error("Batched inputs are not supported yet.");let s={};if(t){const a=t.length,{input_features:o}=await this.feature_extractor(t,{...r,max_length:a}),i=Math.round(a/this.config.encoder_ds_factor+1e-4),l=1+Math.ceil(i/this.config.stack_factor);s.audio_token_len=[l],s.audio_values=o;const c=this.config.audio_placeholder;if(!e.includes(c))throw new Error(`The input text does not contain the image token ${c}.`);e=e.replaceAll(c,c.repeat(l))}return{...this.tokenizer(e,{add_special_tokens:!1,...r}),...s}}},k(Pn,"tokenizer_class",Te),k(Pn,"feature_extractor_class",Lt),k(Pn,"uses_processor_config",!0),Pn),va="[AUDIO]",NT="[BEGIN_AUDIO]",DT=375;function zT(e,t){const r=[];for(let s=0;s<e.length;s+=t)r.push(e.subarray(s,Math.min(s+t,e.length)));return r}var BT=(Fn=class extends Ee{async _call(e,t=null,r={}){if(Array.isArray(e))throw new Error("Batched inputs are not supported yet.");const s={};if(t){if(!e.includes(va))throw new Error(`The input text does not contain the audio token ${va}.`);Array.isArray(t)||(t=[t]);const a=e.split(va),o=a.length-1;if(o!==t.length)throw new Error(`The number of audio inputs (${t.length}) does not match the number of audio tokens in the text (${o}).`);const i=this.feature_extractor.config.n_samples,l=t.map(p=>zT(p,i)),c=l.map(p=>p.length),d=l.flat(),h=(await Promise.all(d.map(p=>this.feature_extractor(p,r)))).map(p=>p.input_features);s.audio_values=h.length>1?ze(h,0):h[0];let 
_=a[0];for(let p=0;p<c.length;++p){_+=NT;for(let w=0;w<c[p];++w)_+=va.repeat(DT);_+=a[p+1]}e=_}return{...this.tokenizer(e,{add_special_tokens:!1,...r}),...s}}},k(Fn,"tokenizer_class",Te),k(Fn,"feature_extractor_class",Lt),k(Fn,"uses_processor_config",!1),Fn),X_=32,Qi=6,ya=8,RT=10,GT=32,$T=(Ln=class extends Ee{get num_mel_frames_first_audio_chunk(){return(Qi+1)*ya}get num_samples_first_audio_chunk(){const{hop_length:e,n_fft:t}=this.feature_extractor.config;return(this.num_mel_frames_first_audio_chunk-1)*e+Math.floor(t/2)}get num_samples_per_audio_chunk(){const{hop_length:e,n_fft:t}=this.feature_extractor.config;return ya*e+t}get num_right_pad_tokens(){return Qi+1+RT}get audio_length_per_tok(){return ya}get raw_audio_length_per_tok(){return ya*this.feature_extractor.config.hop_length}async _call(e,{is_streaming:t=!1,is_first_audio_chunk:r=!0}={}){if(at(e,"VoxtralRealtimeProcessor"),!t&&!r)throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");if(r)if(t){const s=X_*this.raw_audio_length_per_tok,n=new Float32Array(s+e.length);n.set(e,s);const a=await this.feature_extractor(n,{center:!0}),i=1+(X_+Qi),l=new BigInt64Array(i).fill(BigInt(GT));return l[0]=1n,{input_ids:new U("int64",l,[1,i]),...a}}else{const s=this.num_right_pad_tokens*this.raw_audio_length_per_tok,n=new Float32Array(e.length+s);return n.set(e),await this.feature_extractor(n,{center:!0})}else return await this.feature_extractor(e,{center:!1})}},k(Ln,"tokenizer_class",Te),k(Ln,"feature_extractor_class",Lt),k(Ln,"uses_processor_config",!1),Ln),VT=(so=class extends Ee{async _call(e){return await this.feature_extractor(e)}},k(so,"tokenizer_class",Te),k(so,"feature_extractor_class",Lt),so),UT=(no=class extends Ee{async _call(e){return await this.feature_extractor(e)}},k(no,"tokenizer_class",Te),k(no,"feature_extractor_class",Lt),no),jT=(ao=class extends Ee{async _call(e){return await 
this.feature_extractor(e)}},k(ao,"tokenizer_class",Te),k(ao,"feature_extractor_class",Lt),ao),qT=class{static async from_pretrained(e,t={}){const r=await er(e,an,!0,t),{image_processor_type:s,feature_extractor_type:n,processor_class:a}=r;if(a&&Di[a])return Di[a].from_pretrained(e,t);if(!s&&!n)throw new Error("No `image_processor_type` or `feature_extractor_type` found in the config.");const o={};if(s){const l=wa[s.replace(/Fast$/,"")];if(!l)throw new Error(`Unknown image_processor_type: '${s}'.`);o.image_processor=new l(r)}if(n){const l=wa[n];if(l)o.image_processor=new l(r);else{const c=zi[n];if(!c)throw new Error(`Unknown feature_extractor_type: '${n}'.`);o.feature_extractor=new c(r)}}const i={};return new Ee(i,o,null)}};async function WT(e,t){return await er(e,"config.json",!0,t)}function gs(e){const t={};let r={};switch(e.model_type){case"llava":case"paligemma":case"gemma3":case"florence2":case"llava_onevision":case"idefics3":case"granite_speech":case"ultravox":case"voxtral":case"voxtral_realtime":case"smolvlm":case"gemma3n":case"gemma4":case"lfm2_vl":case"chatterbox":case"lighton_ocr":case"glm_ocr":case"mistral3":case"qwen2_5_vl":case"qwen3_vl":case"qwen3_vl_moe":r=gs(e.text_config);break;case"moondream1":r=gs(e.phi_config);break;case"musicgen":r=gs(e.decoder);break;case"multi_modality":r=gs(e.language_config);break;case"gpt2":case"gptj":case"jais":case"codegen":case"gpt_bigcode":t.num_heads="n_head",t.num_layers="n_layer",t.hidden_size="n_embd";break;case"gpt_neox":case"stablelm":case"opt":case"falcon":case"modernbert-decoder":t.num_heads="num_attention_heads",t.num_layers="num_hidden_layers",t.hidden_size="hidden_size";break;case"gpt_oss":case"llama":case"llama4_text":case"nanochat":case"apertus":case"arcee":case"afmoe":case"lfm2":case"lfm2_moe":case"smollm3":case"olmo":case"olmo2":case"olmo3":case"mobilellm":case"granite":case"granitemoehybrid":case"cohere":case"cohere2":case"mistral":case"voxtral_realtime_text":case"voxtral_realtime_encoder":case"starcoder2"
:case"qwen2":case"qwen2_moe":case"qwen2_vl":case"qwen2_vl_text":case"qwen2_5_vl_text":case"qwen3_moe":case"qwen3_vl_text":case"qwen3_vl_moe_text":case"phi":case"phi3":case"phi3_v":case"llava_qwen2":t.num_heads="num_key_value_heads",t.num_layers="num_hidden_layers",t.hidden_size="hidden_size",t.num_attention_heads="num_attention_heads",t.dim_kv="head_dim";break;case"qwen3":case"solar_open":case"glm_ocr_text":case"gemma":case"gemma2":case"vaultgemma":case"gemma3_text":case"gemma3n_text":case"gemma4_text":case"glm":case"helium":case"ernie4_5":case"hunyuan_v1_dense":case"falcon_h1":case"nemotron_h":case"ministral":case"ministral3":t.num_heads="num_key_value_heads",t.num_layers="num_hidden_layers",t.dim_kv="head_dim";break;case"openelm":t.num_heads="num_kv_heads",t.num_layers="num_transformer_layers",t.dim_kv="head_dim";break;case"gpt_neo":case"donut-swin":t.num_heads="num_heads",t.num_layers="num_layers",t.hidden_size="hidden_size";break;case"bloom":t.num_heads="n_head",t.num_layers="n_layer",t.hidden_size="hidden_size";break;case"mpt":t.num_heads="n_heads",t.num_layers="n_layers",t.hidden_size="d_model";break;case"exaone":t.num_heads="num_key_value_heads",t.num_layers="num_layers",t.dim_kv="head_dim",t.num_attention_heads="num_attention_heads";break;case"youtu":case"deepseek_v3":case"glm_moe_dsa":case"mistral4":t.num_heads="num_key_value_heads",t.num_layers="num_hidden_layers",t.dim_kv="qk_head_dim",t.num_attention_heads="num_attention_heads";break;case"t5":case"mt5":case"longt5":t.num_decoder_layers="num_decoder_layers",t.num_decoder_heads="num_heads",t.decoder_dim_kv="d_kv",t.num_encoder_layers="num_layers",t.num_encoder_heads="num_heads",t.encoder_dim_kv="d_kv";break;case"bart":case"mbart":case"marian":case"whisper":case"lite-whisper":case"m2m_100":case"blenderbot":case"blenderbot-small":case"florence2_language":t.num_decoder_layers="decoder_layers",t.num_decoder_heads="decoder_attention_heads",t.decoder_hidden_size="d_model",t.num_encoder_layers="encoder_layers",t.
num_encoder_heads="encoder_attention_heads",t.encoder_hidden_size="d_model";break;case"speecht5":t.num_decoder_layers="decoder_layers",t.num_decoder_heads="decoder_attention_heads",t.decoder_hidden_size="hidden_size",t.num_encoder_layers="encoder_layers",t.num_encoder_heads="encoder_attention_heads",t.encoder_hidden_size="hidden_size";break;case"trocr":t.num_encoder_layers=t.num_decoder_layers="decoder_layers",t.num_encoder_heads=t.num_decoder_heads="decoder_attention_heads",t.encoder_hidden_size=t.decoder_hidden_size="d_model";break;case"musicgen_decoder":t.num_encoder_layers=t.num_decoder_layers="num_hidden_layers",t.num_encoder_heads=t.num_decoder_heads="num_attention_heads",t.encoder_hidden_size=t.decoder_hidden_size="hidden_size";break;case"moonshine":t.num_decoder_layers="decoder_num_hidden_layers",t.num_decoder_heads="decoder_num_key_value_heads",t.num_encoder_layers="encoder_num_hidden_layers",t.num_encoder_heads="encoder_num_key_value_heads",t.encoder_hidden_size=t.decoder_hidden_size="hidden_size";break;case"cohere_asr":t.num_decoder_layers="num_hidden_layers",t.num_decoder_heads="num_key_value_heads",t.decoder_hidden_size="hidden_size",t.decoder_dim_kv="head_dim";const{num_hidden_layers:n,num_attention_heads:a,hidden_size:o}=e.encoder_config;r={num_encoder_layers:n,num_encoder_heads:a,encoder_hidden_size:o,encoder_dim_kv:e.head_dim};break;case"vision-encoder-decoder":const i=gs(e.decoder),l="num_decoder_layers"in i,c=rt(e,["model_type","is_encoder_decoder"]);return l?(c.num_decoder_layers=i.num_decoder_layers,c.num_decoder_heads=i.num_decoder_heads,c.decoder_hidden_size=i.decoder_hidden_size,c.num_encoder_layers=i.num_encoder_layers,c.num_encoder_heads=i.num_encoder_heads,c.encoder_hidden_size=i.encoder_hidden_size):(c.num_layers=i.num_layers,c.num_heads=i.num_heads,c.hidden_size=i.hidden_size),c}const s={...r,...rt(e,["model_type","multi_query","is_encoder_decoder"])};for(const n in t)s[n]=e[t[n]];return s}function ba(e,t){e instanceof Xi||(e=new 
Xi(e));const r=(t==null?void 0:t.batch_size)??1;if(["lfm2","lfm2_moe"].includes(e.model_type)){const s=(t==null?void 0:t.prefix)??"past_key_values",n=s==="present"?"present":"past",a={},{layer_types:o,num_attention_heads:i,num_key_value_heads:l,hidden_size:c,conv_L_cache:d}=e,h=c/i;for(let _=0;_<o.length;++_)if(o[_]==="full_attention")for(const p of["key","value"])a[`${s}.${_}.${p}`]=[r,l,0,h];else if(o[_]==="conv")a[`${n}_conv.${_}`]=[r,c,d];else throw new Error(`Unsupported layer type: ${o[_]}`);return a}else if(["granitemoehybrid","falcon_h1","nemotron_h"].includes(e.model_type)){const s=(t==null?void 0:t.prefix)??"past_key_values",n=s==="present"?"present":"past",a=e,o=a.layer_types??a.layers_block_type,i=a.num_hidden_layers??(o==null?void 0:o.length),l=a.num_key_value_heads,c=a.head_dim??a.hidden_size/a.num_attention_heads,d=a.mamba_n_heads??a.mamba_num_heads,h=a.mamba_d_head??a.mamba_head_dim,_=a.mamba_d_state??a.ssm_state_size,p=a.mamba_n_groups??a.n_groups,w=a.mamba_d_conv??a.conv_kernel,y=(a.mamba_d_ssm??(a.mamba_expand?a.mamba_expand*a.hidden_size:d*h))+2*p*_,M={};for(let T=0;T<i;++T)if((!o||o[T]==="mamba")&&(M[`${n}_conv.${T}`]=[r,y,w],M[`${n}_ssm.${T}`]=[r,d,h,_]),!o||o[T]==="attention")for(const A of["key","value"])M[`${s}.${T}.${A}`]=[r,l,0,c];return M}else if(["qwen3_next","qwen3_5_text","qwen3_5_moe_text","olmo_hybrid"].includes(e.model_type)){const s=(t==null?void 0:t.prefix)??"past_key_values",n=s==="present"?"present":"past",a={},{head_dim:o,layer_types:i,num_attention_heads:l,num_key_value_heads:c,hidden_size:d,linear_num_value_heads:h,linear_num_key_heads:_,linear_key_head_dim:p,linear_value_head_dim:w,linear_conv_kernel_dim:v}=e,y=p*_,M=w*h,T=o??d/l;for(let A=0;A<i.length;++A)if(i[A]==="full_attention")for(const C of["key","value"])a[`${s}.${A}.${C}`]=[r,c,0,T];else if(i[A]==="linear_attention"){if(e.model_type==="olmo_hybrid")a[`${n}_conv.${A}.key`]=[r,y,v],a[`${n}_conv.${A}.value`]=[r,M,v],a[`${n}_conv.${A}.query`]=[r,y,v];else{const 
C=y*2+M;a[`${n}_conv.${A}`]=[r,C,v]}a[`${n}_recurrent.${A}`]=[r,h,p,w]}else throw new Error(`Unsupported layer type: ${i[A]}`);return a}else if(["gemma4","gemma4_text"].includes(e.model_type)){const s=e.model_type==="gemma4"?e.text_config:e,n=(t==null?void 0:t.prefix)??"past_key_values",a={},o=s.num_hidden_layers,i=s.num_kv_shared_layers??0,l=o-i,c=s.num_key_value_heads,d=s.head_dim,h=s.global_head_dim??d,_=s.layer_types??[];for(let p=0;p<l;++p){const w=_[p]==="full_attention"?h:d;for(const v of["key","value"])a[`${n}.${p}.${v}`]=[r,c,0,w]}return a}else if(["lfm2_vl","qwen3_5","qwen3_5_moe","voxtral_realtime"].includes(e.model_type)){let s;return e.model_type==="voxtral_realtime"&&(t==null?void 0:t.session_name)==="audio_encoder"?s=e.audio_config:s=e.text_config,ba(s,t)}return HT(e,t)}function HT(e,{prefix:t="past_key_values",batch_size:r=1}={}){const s={},n=e.normalized_config;if(n.is_encoder_decoder&&"num_encoder_heads"in n&&"num_decoder_heads"in n){const a=n.encoder_dim_kv??n.encoder_hidden_size/n.num_encoder_heads,o=n.decoder_dim_kv??n.decoder_hidden_size/n.num_decoder_heads,i=[r,n.num_encoder_heads,0,a],l=[r,n.num_decoder_heads,0,o];for(let c=0;c<n.num_decoder_layers;++c)s[`${t}.${c}.encoder.key`]=i,s[`${t}.${c}.encoder.value`]=i,s[`${t}.${c}.decoder.key`]=l,s[`${t}.${c}.decoder.value`]=l}else{const a=n.num_heads,o=n.num_layers,i=n.dim_kv??n.hidden_size/(n.num_attention_heads??a);if(n.model_type==="falcon"){const l=[r*a,0,i];for(let c=0;c<o;++c)s[`${t}.${c}.key`]=l,s[`${t}.${c}.value`]=l}else if(n.multi_query){const l=[r*a,0,2*i];for(let c=0;c<o;++c)s[`${t}.${c}.key_value`]=l}else if(n.model_type==="bloom"){const l=[r*a,i,0],c=[r*a,0,i];for(let d=0;d<o;++d)s[`${t}.${d}.key`]=l,s[`${t}.${d}.value`]=c}else if(n.model_type==="openelm")for(let l=0;l<o;++l){const c=[r,a[l],0,i];s[`${t}.${l}.key`]=c,s[`${t}.${l}.value`]=c}else{const l=[r,a,0,i];for(let c=0;c<o;++c)s[`${t}.${c}.key`]=l,s[`${t}.${c}.value`]=l}}return s}var Xi=class 
vd{constructor(t){k(this,"model_type",null);k(this,"is_encoder_decoder",!1);k(this,"max_position_embeddings");k(this,"transformers.js_config");Object.assign(this,t),this.normalized_config=gs(this)}static async from_pretrained(t,{progress_callback:r=null,config:s=null,cache_dir:n=null,local_files_only:a=!1,revision:o="main"}={}){s&&!(s instanceof vd)&&(s=new vd(s));const i=s??await WT(t,{progress_callback:r,config:s,cache_dir:n,local_files_only:a,revision:o});return new this(i)}},on=class{static async from_pretrained(...e){return Xi.from_pretrained(...e)}};function Y_(e,t,r){return e?typeof e=="object"&&e!==null?e.hasOwnProperty(t)?+e[t]:e.hasOwnProperty(r)?+e[r]:0:+e:0}function J_(e,t){const r=[];for(let s=0;s<t;++s)r.push(`${e}_data${s===0?"":"_"+s}`);return r}async function QT(e,t,r,s){const n=`${t}${s}.onnx`,a=`${r.subfolder??""}/${n}`;return await ha(e,a,!0,r,fe.IS_NODE_ENV)}async function XT(e,t,r,s,n,a={}){const o=`${t}${r}.onnx`,i=fe.IS_NODE_ENV;let l=[];const c=Y_(n,o,t);if(c>0){if(c>bf)throw new Error(`The number of external data chunks (${c}) exceeds the maximum allowed value (${bf}).`);const d=J_(o,c);for(const h of d){const _=`${s.subfolder??""}/${h}`;l.push(new Promise(async(p,w)=>{const v=await ha(e,_,!0,s,i);p(v instanceof Uint8Array?{path:h,data:v}:h)}))}}else a.externalData!==void 0&&(l=a.externalData.map(async d=>{if(typeof d.data=="string"){const h=await ha(e,d.data,!0,s);return{...d,data:h}}return d}));return Promise.all(l)}async function YT(e,t,r,s=!1,n=void 0){var C;let a=((C=r.config)==null?void 0:C["transformers.js_config"])??{};const o=Vf(r.device??a.device,t,{warn:S=>ue.info(S)}),i=yx(o),l=a.device_config??{};l.hasOwnProperty(o)&&(a={...a,...l[o]});const c=qf(r.dtype??a.dtype,t,o,{configDtype:a.dtype,warn:S=>ue.info(S)});if(Ci.hasOwnProperty(c)){if(o==="webgpu"&&!fe.IS_NODE_ENV&&c===We.fp16&&!await xx())throw new Error(`The device (${o}) does not support fp16.`)}else throw new Error(`Invalid dtype: ${c}. 
Should be one of: ${Object.keys(We).join(", ")}`);const d=a.kv_cache_dtype,h=d?typeof d=="string"?d:d[c]??"float32":void 0;if(h&&!["float32","float16"].includes(h))throw new Error(`Invalid kv_cache_dtype: ${h}. Should be one of: float32, float16`);const _=Ci[c],p={...r.session_options};p.executionProviders??(p.executionProviders=i);const w=a.free_dimension_overrides;w?p.freeDimensionOverrides??(p.freeDimensionOverrides=w):o.startsWith("webnn")&&!p.freeDimensionOverrides&&ue.warn(`WebNN does not currently support dynamic shapes and requires 'free_dimension_overrides' to be set in config.json, preferably as a field within config["transformers.js_config"]["device_config"]["${o}"]. When 'free_dimension_overrides' is not set, you may experience significant performance degradation.`);const v=QT(e,t,r,_),y=r.use_external_data_format??a.use_external_data_format,M=await XT(e,t,_,r,y,p);if(M.length>0&&(!fe.IS_NODE_ENV||M.some(S=>typeof S!="string"))&&(p.externalData=M),s&&o==="webgpu"&&d!==!1){const S=ba(r.config,{prefix:"present",session_name:n});if(Object.keys(S).length>0&&!Ei()){const N={};for(const x in S)N[x]="gpu-buffer";p.preferredOutputLocation=N}}return{buffer_or_path:await v,session_options:p,session_config:{dtype:c,kv_cache_dtype:h,device:o}}}async function JT(e,t,r,s=void 0){return Object.fromEntries(await Promise.all(Object.keys(t).map(async n=>{const a=(s==null?void 0:s[n])??!1,{buffer_or_path:o,session_options:i,session_config:l}=await YT(e,t[n],r,a,n),c=await Bf(o,i,l);return[n,c]})))}function K_(e){for(let t in e)$f(e[t])?e[t]=new U(e[t]):typeof e[t]=="object"&&K_(e[t]);return e}async function xe(e,t){const r=KT(e,t);try{const s=Object.fromEntries(Object.entries(r).map(([a,o])=>{const i=o.ort_tensor;return fe.IS_NODE_ENV&&typeof Float16Array<"u"&&i.cpuData instanceof Float16Array&&(i.cpuData=new Uint16Array(i.cpuData.buffer)),[a,i]})),n=await Gf(e,s);return K_(n)}catch(s){const n=Object.fromEntries(Object.entries(r).map(([a,o])=>{const 
i={type:o.type,dims:o.dims,location:o.location};return i.location!=="gpu-buffer"&&(i.data=o.data),[a,i]}));throw ue.error(`An error occurred during model execution: "${s}".`),ue.error("Inputs given to model:",n),s}}function KT(e,t){const r=Object.create(null),s=[];for(const o of e.inputNames){const i=t[o];if(!(i instanceof U)){s.push(o);continue}r[o]=Ei()?i.clone():i}if(s.length>0)throw new Error(`An error occurred during model execution: "Missing the following inputs: ${s.join(", ")}.`);const n=Object.keys(t).length,a=e.inputNames.length;if(n>a){let o=Object.keys(t).filter(i=>!e.inputNames.includes(i));ue.warn(`WARNING: Too many inputs were provided (${n} > ${a}). The following inputs will be ignored: "${o.join(", ")}".`)}return r}var Ye=class{},ie=class extends Ye{constructor({logits:e,...t}){super(),this.logits=e;const r=Object.values(t);r.length>0&&(this.attentions=r)}},He=class extends Ye{constructor({logits:e}){super(),this.logits=e}},Je=class extends Ye{constructor({logits:e}){super(),this.logits=e}},ht=class extends Ye{constructor({start_logits:e,end_logits:t}){super(),this.start_logits=e,this.end_logits=t}},Jr=class extends Ye{constructor({logits:e}){super(),this.logits=e}},ZT=class extends Ye{constructor({alphas:e}){super(),this.alphas=e}},Wt=class extends vt{_call(e,t){throw Error("`_call` should be implemented in a subclass")}},eE=class extends vt{_call(e,t){throw Error("`_call` should be implemented in a subclass")}},Yi=class extends vt{constructor(){super(),this.processors=[]}push(e){this.processors.push(e)}extend(e){this.processors.push(...e)}_call(e,t){let r=t;for(const s of this.processors)r=s(e,r);return r}[Symbol.iterator](){return this.processors.values()}},tE=class extends Wt{constructor(e){super(),this.bos_token_id=e}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length===1){const s=t[r].data;s.fill(-1/0),s[this.bos_token_id]=0}return t}},rE=class extends 
Wt{constructor(e,t){super(),this.max_length=e,this.eos_token_id=Array.isArray(t)?t:[t]}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length===this.max_length-1){const s=t[r].data;s.fill(-1/0);for(const n of this.eos_token_id)s[n]=0}return t}},sE=class extends Wt{constructor(e){super(),this.suppress_tokens=e}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data;for(const n of this.suppress_tokens)s[n]=-1/0}return t}},Z_=class extends Wt{constructor(e,t){super(),this.begin_suppress_tokens=e,this.begin_index=t}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length===this.begin_index){const s=t[r].data;for(const n of this.begin_suppress_tokens)s[n]=-1/0}return t}},nE=class extends Wt{constructor(e,t){super(),this.eos_token_id=Array.isArray(e.eos_token_id)?e.eos_token_id[0]:e.eos_token_id,this.no_timestamps_token_id=e.no_timestamps_token_id,this.timestamp_begin=this.no_timestamps_token_id+1,this.begin_index=t.length,t.at(-1)===this.no_timestamps_token_id&&(this.begin_index-=1),this.max_initial_timestamp_index=e.max_initial_timestamp_index}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data;if(s[this.no_timestamps_token_id]=-1/0,e[r].length===this.begin_index){s.subarray(0,this.timestamp_begin).fill(-1/0);continue}const n=e[r].slice(this.begin_index),a=n.length>=1&&n[n.length-1]>=this.timestamp_begin,o=n.length<2||n[n.length-2]>=this.timestamp_begin;if(a&&(o?s.subarray(this.timestamp_begin).fill(-1/0):s.subarray(0,this.eos_token_id).fill(-1/0)),e[r].length===this.begin_index&&this.max_initial_timestamp_index!==null){const d=this.timestamp_begin+this.max_initial_timestamp_index;s.subarray(d+1).fill(-1/0)}const i=ix(s),l=Math.log(i.subarray(this.timestamp_begin).map(Math.exp).reduce((d,h)=>d+h)),c=je(i.subarray(0,this.timestamp_begin))[0];l>c&&s.subarray(0,this.timestamp_begin).fill(-1/0)}return t}},aE=class extends Wt{constructor(e){super(),this.no_repeat_ngram_size=e}getNgrams(e){const t=e.length,r=[];for(let n=0;n<t+1-this.no_repeat_ngram_size;++n){const 
a=[];for(let o=0;o<this.no_repeat_ngram_size;++o)a.push(e[n+o]);r.push(a.map(Number))}const s=new Map;for(const n of r){const a=n.slice(0,n.length-1),o=JSON.stringify(a),i=s.get(o)??[];i.push(n[n.length-1]),s.set(o,i)}return s}getGeneratedNgrams(e,t){const r=t.slice(t.length+1-this.no_repeat_ngram_size,t.length);return e.get(JSON.stringify(r.map(Number)))??[]}calcBannedNgramTokens(e){const t=[];if(e.length+1<this.no_repeat_ngram_size)return t;{const r=this.getNgrams(e);return this.getGeneratedNgrams(r,e)}}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data,n=this.calcBannedNgramTokens(e[r]);for(const a of n)s[a]=-1/0}return t}},oE=class extends Wt{constructor(e){super(),this.penalty=e}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data;for(const n of new Set(e[r])){const a=Number(n);s[a]<0?s[a]*=this.penalty:s[a]/=this.penalty}}return t}},iE=class extends Wt{constructor(e,t){super(),this.min_length=e,this.eos_token_id=Array.isArray(t)?t:[t]}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length<this.min_length){const s=t[r].data;for(const n of this.eos_token_id)s[n]=-1/0}return t}},lE=class extends Wt{constructor(e,t,r){super(),this.prompt_length_to_skip=e,this.min_new_tokens=t,this.eos_token_id=Array.isArray(r)?r:[r]}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length-this.prompt_length_to_skip<this.min_new_tokens){const n=t[r].data;for(const a of this.eos_token_id)n[a]=-1/0}return t}},cE=class extends Wt{constructor(e,t){super(),this.bad_words_ids=e,this.eos_token_id=Array.isArray(t)?t:[t]}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data,n=e[r];for(const a of this.bad_words_ids){if(n.length<a.length-1)continue;let o=!0;for(let i=1;i<=a.length-1;++i)if(a.at(-i-1)!=n.at(-i)){o=!1;break}o&&(s[a.at(-1)]=-1/0)}}return t}},uE=class extends Wt{constructor(e){if(super(),e<=1)throw new Error(`Require guidance scale >1 to use the classifier free guidance processor, got guidance scale ${e}.`);this.guidance_scale=e}_call(e,t){if(t.dims[0]!==2*e.length)throw 
/**
 * Logits warper that rescales logits by a temperature.
 *
 * Every element of `t.data` is divided by `this.temperature`, in place.
 */
dE = class extends eE {
  /**
   * @param {number} e Temperature used as the divisor.
   */
  constructor(e) {
    super();
    this.temperature = e;
  }

  /**
   * @param {*} e Generated ids (not read by this warper).
   * @param {*} t Logits container whose `data` is mutated in place.
   * @returns {*} The same `t` that was passed in.
   */
  _call(e, t) {
    const values = t.data;
    let index = 0;
    while (index < values.length) {
      values[index] = values[index] / this.temperature;
      index += 1;
    }
    return t;
  }
}
/**
 * Ordered collection of stopping criteria.
 *
 * Calling the list evaluates every stored criterion and ORs the per-row
 * results together: a row is reported as finished as soon as any criterion
 * returns a truthy value for it.
 */
tp = class Ov extends vt {
  constructor() {
    super();
    this.criteria = [];
  }

  /**
   * Append a single criterion.
   * @param {*} t Callable criterion.
   */
  push(t) {
    this.criteria.push(t);
  }

  /**
   * Append several criteria at once.
   * @param {*} t Another list of this class (its `criteria` are copied), a
   *   single `Ma` instance, or any iterable of criteria.
   */
  extend(t) {
    let additions = t;
    if (additions instanceof Ov) {
      additions = additions.criteria;
    } else if (additions instanceof Ma) {
      additions = [additions];
    }
    this.criteria.push(...additions);
  }

  /**
   * @param {Array} t Generated ids, one entry per batch row.
   * @param {*} r Extra argument forwarded unchanged to every criterion.
   * @returns {Array} One entry per row; starts as `false` and takes the first
   *   truthy value any criterion yields for that row.
   */
  _call(t, r) {
    const finished = new Array(t.length).fill(false);
    for (const criterion of this.criteria) {
      const verdicts = criterion(t, r);
      for (let row = 0; row < finished.length; ++row) {
        // Once a row is marked finished it is never overwritten.
        if (!finished[row]) {
          finished[row] = verdicts[row];
        }
      }
    }
    return finished;
  }

  /** Iterate over the stored criteria in insertion order. */
  [Symbol.iterator]() {
    return this.criteria.values();
  }
}
Array.from({length:this.generation_config.num_beams},(a,o)=>[s.data[o],Math.log(n[o])])}},gE=class{constructor(e){if(e)for(const t in e){if(t in this)throw new TypeError(`Key "${t}" conflicts with an existing property on DynamicCache`);const r=e[t];if(!(r instanceof U))throw new TypeError(`Expected a Tensor for key "${t}", got ${typeof r}`);this[t]=r}}get_seq_length(){const e=this;if(Object.keys(e).length===0)return 0;for(const t in e)if(t.startsWith("past_key_values."))return e[t].dims.at(-2);throw new Error("Unable to determine sequence length from the cache.")}update(e){for(const t in e){const r=this[t],s=e[t];r&&r!==s&&r.location==="gpu-buffer"&&r.dispose(),this[t]=s}}async dispose(){const e=[];for(const t of Object.values(this))t.location==="gpu-buffer"&&e.push(t.dispose());await Promise.all(e)}},Ji=gE,q={EncoderOnly:0,EncoderDecoder:1,Seq2Seq:2,Vision2Seq:3,DecoderOnly:4,DecoderOnlyWithoutHead:5,MaskGeneration:6,ImageTextToText:7,Musicgen:8,MultiModality:9,Phi3V:10,AudioTextToText:11,AutoEncoder:12,ImageAudioTextToText:13,Supertonic:14,Chatterbox:15,VoxtralRealtime:16},fr={[q.DecoderOnly]:{sessions:(e,t)=>({model:t.model_file_name??"model"}),cache_sessions:{model:!0},optional_configs:{generation_config:"generation_config.json"}},[q.DecoderOnlyWithoutHead]:{sessions:(e,t)=>({model:t.model_file_name??"model"})},[q.Seq2Seq]:{sessions:()=>({model:"encoder_model",decoder_model_merged:"decoder_model_merged"}),cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.Vision2Seq]:{sessions:()=>({model:"encoder_model",decoder_model_merged:"decoder_model_merged"}),cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.Musicgen]:{sessions:()=>({model:"text_encoder",decoder_model_merged:"decoder_model_merged",encodec_decode:"encodec_decode"}),cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.EncoderDecoder]:{sessions
:()=>({model:"encoder_model",decoder_model_merged:"decoder_model_merged"}),cache_sessions:{decoder_model_merged:!0}},[q.MaskGeneration]:{sessions:()=>({model:"vision_encoder",prompt_encoder_mask_decoder:"prompt_encoder_mask_decoder"})},[q.ImageTextToText]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...fr[q.ImageTextToText].text_only_sessions};return r||(s.vision_encoder="vision_encoder"),e.is_encoder_decoder&&(s.model="encoder_model"),s},cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.AudioTextToText]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...fr[q.AudioTextToText].text_only_sessions};return r||(s.audio_encoder="audio_encoder"),s},cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.ImageAudioTextToText]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...fr[q.ImageAudioTextToText].text_only_sessions};return 
/**
 * Look up the session layout for a model type.
 *
 * Falls back to the `default` entry of the `fr` table when the type has no
 * entry of its own.
 *
 * @param {*} e Model type key into the `fr` table.
 * @param {Object} t Model config, forwarded to the entry's `sessions` factory.
 * @param {Object} [r={}] Options, forwarded to the factory; `r.textOnly`
 *   (default `false`) is passed as the factory's third argument.
 * @returns {{sessions: *, cache_sessions: *, optional_configs: *}}
 */
function vE(e, t, r = {}) {
  const entry = fr[e] ?? fr.default;
  const textOnly = r.textOnly ?? false;
  // Call the factory before reading the other fields, as the original does.
  const sessions = entry.sessions(t, r, textOnly);
  return {
    sessions,
    cache_sessions: entry.cache_sessions,
    optional_configs: entry.optional_configs,
  };
}
/**
 * Convert token ids into an "int64" `U` tensor.
 *
 * @param {*} e An existing `U` (returned untouched), a flat list of ids
 *   (becomes shape `[1, n]`), or a list of equally long id lists (becomes
 *   shape `[rows, cols]`).
 * @returns {*} A `U` instance.
 * @throws {Error} If `e` is empty, or if nested rows differ in length.
 */
function Ki(e) {
  if (e instanceof U) {
    return e;
  }
  if (e.length === 0) {
    throw new Error("items must be non-empty");
  }

  const toBigInt = (value) => BigInt(value);

  if (!Array.isArray(e[0])) {
    // Flat input is treated as a single batch row.
    const flatIds = e.map(toBigInt);
    return new U("int64", BigInt64Array.from(flatIds), [1, e.length]);
  }

  const hasRaggedRow = e.some((row) => row.length !== e[0].length);
  if (hasRaggedRow) {
    throw new Error("Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.");
  }
  const batchedIds = e.flat().map(toBigInt);
  return new U("int64", BigInt64Array.from(batchedIds), [e.length, e[0].length]);
}
/**
 * Resolve the model type and merged configuration for a model class name.
 *
 * The type is looked up in `_r` by class name. If the class name ends in
 * "ForCausalLM" while the config's first architecture is a different name
 * ending in "ForConditionalGeneration" that is also registered in `_r`, the
 * architecture's type is used instead and the result is flagged text-only.
 *
 * @param {string|undefined|null} e Registered class name.
 * @param {Object|undefined|null} t Model config (only `architectures[0]` is read).
 * @returns {{typeConfig: Object, textOnly: boolean, modelType: *}} `typeConfig`
 *   merges the `op` entry with the `fr` entry (the `fr` fields win on
 *   conflict); each falls back to its `default` entry.
 */
function ip(e, t) {
  let modelType = _r.get(e);
  let textOnly = false;

  const primaryArchitecture = t?.architectures?.[0];
  if (
    primaryArchitecture &&
    primaryArchitecture !== e &&
    e != null &&
    e.endsWith("ForCausalLM") &&
    primaryArchitecture.endsWith("ForConditionalGeneration")
  ) {
    const architectureType = _r.get(primaryArchitecture);
    if (architectureType !== undefined) {
      modelType = architectureType;
      textOnly = true;
    }
  }

  const behaviour = op[modelType] ?? op.default;
  const sessionLayout = fr[modelType] ?? fr.default;
  return {
    typeConfig: { ...behaviour, ...sessionLayout },
    textOnly,
    modelType,
  };
}
from_pretrained(t,{progress_callback:r=null,config:s=null,cache_dir:n=null,local_files_only:a=!1,revision:o="main",model_file_name:i=null,subfolder:l="onnx",device:c=null,dtype:d=null,use_external_data_format:h=null,session_options:_={}}={}){const p={progress_callback:r,config:s,cache_dir:n,local_files_only:a,revision:o,model_file_name:i,subfolder:l,device:c,dtype:d,use_external_data_format:h,session_options:_},w=vs.get(this);s=p.config=await on.from_pretrained(t,p);const{typeConfig:v,textOnly:y,modelType:M}=ip(w,s);if(M===void 0){const S=w??(s==null?void 0:s.model_type);S!=="custom"&&ue.warn(`Model type for '${S}' not found, assuming encoder-only architecture. Please report this at ${pa}.`)}if(r&&!(r instanceof pi)){const S={};try{const N=await np(t,{config:s,dtype:d,device:c,model_file_name:i});(await Promise.all(N.map(R=>tn(t,R,p)))).forEach((R,z)=>{if(R.exists){const $=N[z]==="config.json";S[N[z]]={loaded:$?R.size??0:0,total:R.size??0}}})}catch(N){ue.warn(`Unable to fetch model file metadata for total progress tracking: ${N}`)}Object.keys(S).length>0&&(p.progress_callback=new pi(r,S))}const T=v.sessions(s,p,y),A=[JT(t,T,p,v.cache_sessions)];v.optional_configs&&A.push(EE(t,v.optional_configs,p));const C=await Promise.all(A);return new this(s,...C)}async _call(t){return await this.forward(t)}async forward(t){return await this._forward(this,t)}get generation_config(){var t;return((t=this.configs)==null?void 0:t.generation_config)??null}_get_logits_processor(t,r,s=null){const n=new Yi;if(t.repetition_penalty!==null&&t.repetition_penalty!==1&&n.push(new oE(t.repetition_penalty)),t.no_repeat_ngram_size!==null&&t.no_repeat_ngram_size>0&&n.push(new aE(t.no_repeat_ngram_size)),t.bad_words_ids!==null&&n.push(new cE(t.bad_words_ids,t.eos_token_id)),t.min_length!==null&&t.eos_token_id!==null&&t.min_length>0&&n.push(new iE(t.min_length,t.eos_token_id)),t.min_new_tokens!==null&&t.eos_token_id!==null&&t.min_new_tokens>0&&n.push(new 
lE(r,t.min_new_tokens,t.eos_token_id)),t.forced_bos_token_id!==null&&n.push(new tE(t.forced_bos_token_id)),t.forced_eos_token_id!==null&&n.push(new rE(t.max_length,t.forced_eos_token_id)),t.suppress_tokens!==null&&n.push(new sE(t.suppress_tokens)),t.begin_suppress_tokens!==null){const a=r>1||t.forced_bos_token_id===null?r:r+1;n.push(new Z_(t.begin_suppress_tokens,a))}return t.guidance_scale!==null&&t.guidance_scale>1&&n.push(new uE(t.guidance_scale)),t.temperature===0&&t.do_sample&&(ue.warn("`do_sample` changed to false because `temperature: 0` implies greedy sampling (always selecting the most likely token), which is incompatible with `do_sample: true`."),t.do_sample=!1),t.do_sample&&t.temperature!==null&&t.temperature!==1&&n.push(new dE(t.temperature)),s!==null&&n.extend(s),n}_prepare_generation_config(t,r,s=ep){const n={...this.config};for(const o of["decoder","generator","text_config"])o in n&&Object.assign(n,n[o]);const a=new s(n);return Object.assign(a,this.generation_config??{}),t&&Object.assign(a,t),r&&Object.assign(a,rt(r,Object.getOwnPropertyNames(a))),a}_get_stopping_criteria(t,r=null){const s=new tp;return t.max_length!==null&&s.push(new hE(t.max_length,this.config.max_position_embeddings??null)),t.eos_token_id!==null&&s.push(new fE(t.eos_token_id)),r&&s.extend(r),s}_validate_model_class(){if(!this.can_generate){const t=[ws.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,ws.MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES,ws.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,ws.MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES].filter(Boolean),r=vs.get(this.constructor),s=new Set,n=this.config.model_type;for(const o of t){const i=o==null?void 0:o.get(n);i&&s.add(i)}let a=`The current model class (${r}) is not compatible with \`.generate()\`, as it doesn't have a language model head.`;throw s.size>0&&(a+=` Please use the following class instead: ${[...s].join(", ")}`),Error(a)}}prepare_inputs_for_generation(...t){if(!this._prepare_inputs_for_generation)throw new 
Error("prepare_inputs_for_generation is not implemented for this model.");return this._prepare_inputs_for_generation(this,...t)}_update_model_kwargs_for_generation({generated_input_ids:t,outputs:r,model_inputs:s,is_encoder_decoder:n}){return s.past_key_values=el(r,s.past_key_values),s.input_ids=new U("int64",t.flat(),[t.length,1]),n?"decoder_attention_mask"in s&&(s.decoder_attention_mask=ze([s.decoder_attention_mask,yt([s.decoder_attention_mask.dims[0],1])],1)):s.attention_mask=ze([s.attention_mask,yt([s.attention_mask.dims[0],1])],1),s.position_ids=null,s}_prepare_model_inputs({inputs:t,bos_token_id:r,model_kwargs:s}){const n=rt(s,this.forward_params),a=this.main_input_name;if(a in n){if(t)throw new Error("`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. Make sure to either pass {inputs} or {input_name}=...")}else n[a]=t;return{inputs_tensor:n[a],model_inputs:n,model_input_name:a}}async _prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:t,model_inputs:r,model_input_name:s,generation_config:n}){if(this.sessions.model.inputNames.includes("inputs_embeds")&&!r.inputs_embeds&&"_prepare_inputs_embeds"in this){const{input_ids:o,pixel_values:i,attention_mask:l,...c}=r,d=await this._prepare_inputs_embeds(r);r={...c,...rt(d,["inputs_embeds","attention_mask"])}}let{last_hidden_state:a}=await Ir(this,r);if(n.guidance_scale!==null&&n.guidance_scale>1)a=ze([a,Li(a,0)],0),"attention_mask"in r&&(r.attention_mask=ze([r.attention_mask,Kf(r.attention_mask)],0));else if(r.decoder_input_ids){const o=Ki(r.decoder_input_ids).dims[0];if(o!==a.dims[0]){if(a.dims[0]!==1)throw new Error(`The encoder outputs have a different batch size (${a.dims[0]}) than the decoder inputs (${o}).`);a=ze(Array.from({length:o},()=>a),0)}}return r.encoder_outputs=a,r}_prepare_decoder_input_ids_for_generation({batch_size:t,model_input_name:r,model_kwargs:s,decoder_start_token_id:n,bos_token_id:a,generation_config:o}){let{decoder_input_ids:i,...l}=s;if(!(i instanceof 
U)){if(i)Array.isArray(i[0])||(i=Array.from({length:t},()=>i));else if(n??(n=a),this.config.model_type==="musicgen")i=Array.from({length:t*this.config.decoder.num_codebooks},()=>[n]);else if(Array.isArray(n)){if(n.length!==t)throw new Error(`\`decoder_start_token_id\` expcted to have length ${t} but got ${n.length}`);i=n}else i=Array.from({length:t},()=>[n]);i=Ki(i)}return l.decoder_attention_mask=Yf(i),{input_ids:i,model_inputs:l}}async generate({inputs:t=null,generation_config:r=null,logits_processor:s=null,stopping_criteria:n=null,streamer:a=null,...o}){this._validate_model_class(),r=this._prepare_generation_config(r,o);let{inputs_tensor:i,model_inputs:l,model_input_name:c}=this._prepare_model_inputs({inputs:t,model_kwargs:o});const d=this.config.is_encoder_decoder;d&&("encoder_outputs"in l||(l=await this._prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:i,model_inputs:l,model_input_name:c,generation_config:r})));let h;d?{input_ids:h,model_inputs:l}=this._prepare_decoder_input_ids_for_generation({batch_size:l[c].dims.at(0),model_input_name:c,model_kwargs:l,decoder_start_token_id:r.decoder_start_token_id,bos_token_id:r.bos_token_id,generation_config:r}):h=l[c];let _=h.dims.at(-1);r.max_new_tokens!==null&&(r.max_length=_+r.max_new_tokens);const p=this._get_logits_processor(r,_,s),w=this._get_stopping_criteria(r,n),v=l[c].dims.at(0),y=xa.getSampler(r),M=new Array(v).fill(0),T=h.tolist();a&&a.put(T);let A,C={},S={};for(;;){if(l=this.prepare_inputs_for_generation(T,l,r),A=await this.forward(l),r.return_dict_in_generate)if(r.output_attentions){const I=ME(A);for(const te in I)te in C||(C[te]=[]),C[te].push(I[te])}else this._return_dict_in_generate_keys&&Object.assign(S,rt(A,this._return_dict_in_generate_keys));const $=A.logits.slice(null,-1,null).to("float32"),Q=p(T,$),H=[];for(let I=0;I<Q.dims.at(0);++I){const te=Q[I],W=await y(te);for(const[ee,G]of W){const 
/**
 * Forward pass for encoder-decoder models.
 *
 * Runs the encoder through `Ir` unless `encoder_outputs` is already supplied,
 * then calls `pr` with the merged-decoder flag set to `true`.
 *
 * @param {Object} e Model; reads `e.sessions.model.inputNames` and
 *   `e.sessions.decoder_model_merged.inputNames`.
 * @param {Object} t Model inputs. `encoder_outputs`, `input_ids`,
 *   `decoder_input_ids` and `decoder_attention_mask` are consumed here; every
 *   other key is forwarded to the decoder.
 * @returns {Promise<*>} Whatever `pr` resolves to.
 */
async function ka(e, t) {
  const {
    encoder_outputs: suppliedEncoderOutputs,
    input_ids: encoderInputIds, // pulled out only so it is not forwarded to the decoder
    decoder_input_ids: decoderInputIds,
    decoder_attention_mask: decoderAttentionMask,
    ...decoderFeeds
  } = t;

  let encoderHiddenStates = suppliedEncoderOutputs;
  if (!encoderHiddenStates) {
    const encoderFeeds = rt(t, e.sessions.model.inputNames);
    const encoderResult = await Ir(e, encoderFeeds);
    encoderHiddenStates = encoderResult.last_hidden_state;
  }

  decoderFeeds.input_ids = decoderInputIds;
  decoderFeeds.encoder_hidden_states = encoderHiddenStates;

  const decoderInputNames = e.sessions.decoder_model_merged.inputNames;
  if (decoderInputNames.includes("encoder_attention_mask")) {
    decoderFeeds.encoder_attention_mask = t.attention_mask;
  }

  // Use the decoder mask only when no attention mask was forwarded.
  if (decoderAttentionMask && !decoderFeeds.attention_mask) {
    decoderFeeds.attention_mask = decoderAttentionMask;
  }

  return await pr(e, decoderFeeds, true);
}
/**
 * Collect the `present*` outputs of a forward pass as cache entries.
 *
 * Output names are rewritten to their cache names: `present_ssm`,
 * `present_conv` and `present_recurrent` become `past_ssm`, `past_conv` and
 * `past_recurrent`; any remaining `present` becomes `past_key_values`.
 * For outputs whose name contains "encoder", the value already held by the
 * existing cache is kept whenever a cache is supplied.
 *
 * @param {Object} e Model outputs.
 * @param {*} t Existing cache or a falsy value. When given, it is updated in
 *   place via `t.update(...)` and returned.
 * @returns {*} `t` if it was supplied, otherwise a new `Ji` holding the entries.
 */
function el(e, t) {
  // Applied in order; the generic "present" rule must stay last.
  const renames = [
    ["present_ssm", "past_ssm"],
    ["present_conv", "past_conv"],
    ["present_recurrent", "past_recurrent"],
    ["present", "past_key_values"],
  ];

  const entries = Object.create(null);
  for (const outputName in e) {
    if (!outputName.startsWith("present")) {
      continue;
    }

    let cacheName = outputName;
    for (const [from, to] of renames) {
      cacheName = cacheName.replace(from, to);
    }

    if (outputName.includes("encoder") && t) {
      entries[cacheName] = t[cacheName];
    } else {
      entries[cacheName] = e[outputName];
    }
  }

  if (t) {
    t.update(entries);
    return t;
  }
  return new Ji(entries);
}
/**
 * Row-wise running sum over a 2-D mask, with masked-out positions filled
 * with 1.
 *
 * For each row the running total starts at `t`. A position whose mask value
 * is `0n` receives `1n`; any other position receives the current total, after
 * which the mask value is added to the total.
 *
 * @param {{dims: number[], data: ArrayLike<bigint>}} e 2-D mask with bigint data.
 * @param {number} [t=0] Starting value of the running total for every row.
 * @returns {{data: BigInt64Array, dims: number[]}} New data, same `dims` reference.
 */
function cp(e, t = 0) {
  const [rowCount, colCount] = e.dims;
  const mask = e.data;
  const positions = new BigInt64Array(mask.length);

  let flat = 0;
  for (let row = 0; row < rowCount; ++row) {
    let running = BigInt(t);
    for (let col = 0; col < colCount; ++col, ++flat) {
      if (mask[flat] === 0n) {
        positions[flat] = 1n;
      } else {
        positions[flat] = running;
        running += mask[flat];
      }
    }
  }

  return { data: positions, dims: e.dims };
}
/**
 * Overwrite the embeddings at modality-token positions with modality features.
 *
 * Positions are visited row by row, left to right, and the features are
 * consumed in that same order. The embeddings are modified in place.
 *
 * @param {Object} args
 * @param {*} args.modality_token_id Id marking a placeholder position (compared loosely).
 * @param {*} args.inputs_embeds Indexable as `[row][position]`, each with a settable `data`.
 * @param {*} args.modality_features Indexable features with `dims[0]` equal to their count.
 * @param {*} args.input_ids Object whose `tolist()` yields one id list per row.
 * @param {*} args.attention_mask Returned unchanged.
 * @returns {{inputs_embeds: *, attention_mask: *}}
 * @throws {Error} If the number of placeholder positions differs from `modality_features.dims[0]`.
 */
function up({ modality_token_id: e, inputs_embeds: t, modality_features: r, input_ids: s, attention_mask: n }) {
  // For every row, the positions holding the modality token.
  const positionsPerRow = s.tolist().map((rowIds) => {
    const hits = [];
    rowIds.forEach((id, position) => {
      // Loose equality on purpose: ids may be bigint while the token id is a number.
      if (id == e) {
        hits.push(position);
      }
    });
    return hits;
  });

  let tokenCount = 0;
  for (const hits of positionsPerRow) {
    tokenCount += hits.length;
  }
  const featureCount = r.dims[0];
  if (tokenCount !== featureCount) {
    throw new Error(`Number of tokens and features do not match: tokens: ${tokenCount}, features ${featureCount}`);
  }

  let nextFeature = 0;
  for (let row = 0; row < positionsPerRow.length; ++row) {
    const hits = positionsPerRow[row];
    const rowEmbeds = t[row];
    for (const position of hits) {
      rowEmbeds[position].data.set(r[nextFeature].data);
      nextFeature += 1;
    }
  }

  return { inputs_embeds: t, attention_mask: n };
}
sl={};os(sl,{ASTForAudioClassification:()=>BE,ASTModel:()=>zE,ASTPreTrainedModel:()=>il,AfmoeForCausalLM:()=>OE,AfmoeModel:()=>IE,AfmoePreTrainedModel:()=>al,AlbertForMaskedLM:()=>PE,AlbertForQuestionAnswering:()=>SE,AlbertForSequenceClassification:()=>CE,AlbertModel:()=>AE,AlbertPreTrainedModel:()=>cn,ApertusForCausalLM:()=>LE,ApertusModel:()=>FE,ApertusPreTrainedModel:()=>nl,ArceeForCausalLM:()=>DE,ArceeModel:()=>NE,ArceePreTrainedModel:()=>ol,BartForConditionalGeneration:()=>GE,BartForSequenceClassification:()=>$E,BartModel:()=>RE,BartPretrainedModel:()=>Aa,BeitForImageClassification:()=>UE,BeitModel:()=>VE,BeitPreTrainedModel:()=>ll,BertForMaskedLM:()=>qE,BertForQuestionAnswering:()=>QE,BertForSequenceClassification:()=>WE,BertForTokenClassification:()=>HE,BertModel:()=>jE,BertPreTrainedModel:()=>ys,BlenderbotForConditionalGeneration:()=>YE,BlenderbotModel:()=>XE,BlenderbotPreTrainedModel:()=>cl,BlenderbotSmallForConditionalGeneration:()=>KE,BlenderbotSmallModel:()=>JE,BlenderbotSmallPreTrainedModel:()=>ul,BloomForCausalLM:()=>e2,BloomModel:()=>ZE,BloomPreTrainedModel:()=>dl,CHMv2ForDepthEstimation:()=>l2,CHMv2PreTrainedModel:()=>mp,CLIPModel:()=>u2,CLIPPreTrainedModel:()=>Kr,CLIPSegForImageSegmentation:()=>p2,CLIPSegModel:()=>_2,CLIPSegPreTrainedModel:()=>hl,CLIPTextModel:()=>d2,CLIPTextModelWithProjection:()=>vp,CLIPVisionModel:()=>h2,CLIPVisionModelWithProjection:()=>f2,CamembertForMaskedLM:()=>r2,CamembertForQuestionAnswering:()=>a2,CamembertForSequenceClassification:()=>s2,CamembertForTokenClassification:()=>n2,CamembertModel:()=>t2,CamembertPreTrainedModel:()=>bs,ChatterboxModel:()=>_p,ChatterboxPreTrainedModel:()=>fp,ChineseCLIPModel:()=>i2,ChineseCLIPPreTrainedModel:()=>pp,ClapAudioModelWithProjection:()=>wp,ClapModel:()=>c2,ClapPreTrainedModel:()=>Ca,ClapTextModelWithProjection:()=>gp,CodeGenForCausalLM:()=>g2,CodeGenModel:()=>m2,CodeGenPreTrainedModel:()=>fl,Cohere2ForCausalLM:()=>b2,Cohere2Model:()=>y2,Cohere2PreTrainedModel:()=>pl,CohereAsrForConditi
onalGeneration:()=>x2,CohereAsrModel:()=>M2,CohereAsrPreTrainedModel:()=>ml,CohereForCausalLM:()=>v2,CohereModel:()=>w2,CoherePreTrainedModel:()=>_l,ConvBertForMaskedLM:()=>T2,ConvBertForQuestionAnswering:()=>C2,ConvBertForSequenceClassification:()=>E2,ConvBertForTokenClassification:()=>A2,ConvBertModel:()=>k2,ConvBertPreTrainedModel:()=>Ms,ConvNextForImageClassification:()=>P2,ConvNextModel:()=>S2,ConvNextPreTrainedModel:()=>gl,ConvNextV2ForImageClassification:()=>L2,ConvNextV2Model:()=>F2,ConvNextV2PreTrainedModel:()=>wl,DFineForObjectDetection:()=>D2,DFineModel:()=>N2,DFinePreTrainedModel:()=>yl,DINOv3ConvNextModel:()=>lA,DINOv3ConvNextPreTrainedModel:()=>Cp,DINOv3ViTModel:()=>cA,DINOv3ViTPreTrainedModel:()=>Sp,DPTForDepthEstimation:()=>gA,DPTModel:()=>mA,DPTPreTrainedModel:()=>El,DacDecoderModel:()=>xp,DacDecoderOutput:()=>bp,DacEncoderModel:()=>Mp,DacEncoderOutput:()=>yp,DacModel:()=>z2,DacPreTrainedModel:()=>Sa,DebertaForMaskedLM:()=>R2,DebertaForQuestionAnswering:()=>V2,DebertaForSequenceClassification:()=>G2,DebertaForTokenClassification:()=>$2,DebertaModel:()=>B2,DebertaPreTrainedModel:()=>xs,DebertaV2ForMaskedLM:()=>W2,DebertaV2ForQuestionAnswering:()=>X2,DebertaV2ForSequenceClassification:()=>H2,DebertaV2ForTokenClassification:()=>Q2,DebertaV2Model:()=>q2,DebertaV2PreTrainedModel:()=>ks,DecisionTransformerModel:()=>Y2,DecisionTransformerPreTrainedModel:()=>kp,DeepseekV3ForCausalLM:()=>j2,DeepseekV3Model:()=>U2,DeepseekV3PreTrainedModel:()=>bl,DeiTForImageClassification:()=>K2,DeiTModel:()=>J2,DeiTPreTrainedModel:()=>Ml,DepthAnythingForDepthEstimation:()=>Z2,DepthAnythingPreTrainedModel:()=>Tp,DepthProForDepthEstimation:()=>eA,DepthProPreTrainedModel:()=>Ep,DetrForObjectDetection:()=>rA,DetrForSegmentation:()=>sA,DetrModel:()=>tA,DetrObjectDetectionOutput:()=>xl,DetrPreTrainedModel:()=>Pa,DetrSegmentationOutput:()=>Ap,Dinov2ForImageClassification:()=>aA,Dinov2Model:()=>nA,Dinov2PreTrainedModel:()=>kl,Dinov2WithRegistersForImageClassification:()=>iA,Dinov2W
ithRegistersModel:()=>oA,Dinov2WithRegistersPreTrainedModel:()=>Tl,DistilBertForMaskedLM:()=>_A,DistilBertForQuestionAnswering:()=>fA,DistilBertForSequenceClassification:()=>dA,DistilBertForTokenClassification:()=>hA,DistilBertModel:()=>uA,DistilBertPreTrainedModel:()=>Ts,DonutSwinModel:()=>pA,DonutSwinPreTrainedModel:()=>Pp,EdgeTamModel:()=>EF,EfficientNetForImageClassification:()=>vA,EfficientNetModel:()=>wA,EfficientNetPreTrainedModel:()=>Al,ElectraForMaskedLM:()=>bA,ElectraForQuestionAnswering:()=>kA,ElectraForSequenceClassification:()=>MA,ElectraForTokenClassification:()=>xA,ElectraModel:()=>yA,ElectraPreTrainedModel:()=>Es,Ernie4_5ForCausalLM:()=>EA,Ernie4_5Model:()=>TA,Ernie4_5PretrainedModel:()=>Cl,EsmForMaskedLM:()=>CA,EsmForSequenceClassification:()=>SA,EsmForTokenClassification:()=>PA,EsmModel:()=>AA,EsmPreTrainedModel:()=>dn,EuroBertForMaskedLM:()=>LA,EuroBertForSequenceClassification:()=>IA,EuroBertForTokenClassification:()=>OA,EuroBertModel:()=>FA,EuroBertPreTrainedModel:()=>hn,ExaoneForCausalLM:()=>DA,ExaoneModel:()=>NA,ExaonePreTrainedModel:()=>Sl,FalconForCausalLM:()=>BA,FalconH1ForCausalLM:()=>GA,FalconH1Model:()=>RA,FalconH1PreTrainedModel:()=>Fl,FalconModel:()=>zA,FalconPreTrainedModel:()=>Pl,FastViTForImageClassification:()=>VA,FastViTModel:()=>$A,FastViTPreTrainedModel:()=>Ll,Florence2ForConditionalGeneration:()=>UA,Florence2PreTrainedModel:()=>Fp,GLPNForDepthEstimation:()=>oC,GLPNModel:()=>aC,GLPNPreTrainedModel:()=>Gl,GPT2LMHeadModel:()=>mC,GPT2Model:()=>pC,GPT2PreTrainedModel:()=>ql,GPTBigCodeForCausalLM:()=>lC,GPTBigCodeModel:()=>iC,GPTBigCodePreTrainedModel:()=>$l,GPTJForCausalLM:()=>wC,GPTJModel:()=>gC,GPTJPreTrainedModel:()=>Wl,GPTNeoForCausalLM:()=>uC,GPTNeoModel:()=>cC,GPTNeoPreTrainedModel:()=>Vl,GPTNeoXForCausalLM:()=>hC,GPTNeoXModel:()=>dC,GPTNeoXPreTrainedModel:()=>Ul,Gemma2ForCausalLM:()=>HA,Gemma2Model:()=>WA,Gemma2PreTrainedModel:()=>Ol,Gemma3ForCausalLM:()=>JA,Gemma3ForConditionalGeneration:()=>Op,Gemma3Model:()=>YA,Gemma3PreTr
ainedModel:()=>Ip,Gemma3nForCausalLM:()=>KA,Gemma3nForConditionalGeneration:()=>Fa,Gemma3nPreTrainedModel:()=>Np,Gemma4ForCausalLM:()=>ZA,Gemma4ForConditionalGeneration:()=>Nl,GemmaForCausalLM:()=>qA,GemmaModel:()=>jA,GemmaPreTrainedModel:()=>Il,GlmForCausalLM:()=>tC,GlmModel:()=>eC,GlmMoeDsaForCausalLM:()=>sC,GlmMoeDsaModel:()=>rC,GlmMoeDsaPreTrainedModel:()=>zl,GlmOcrForConditionalGeneration:()=>nC,GlmPreTrainedModel:()=>Dl,GptOssForCausalLM:()=>_C,GptOssModel:()=>fC,GptOssPreTrainedModel:()=>jl,GraniteForCausalLM:()=>yC,GraniteModel:()=>vC,GraniteMoeHybridForCausalLM:()=>MC,GraniteMoeHybridModel:()=>bC,GraniteMoeHybridPreTrainedModel:()=>Ql,GranitePreTrainedModel:()=>Hl,GraniteSpeechForConditionalGeneration:()=>xC,GroundingDinoForObjectDetection:()=>kC,GroundingDinoPreTrainedModel:()=>Gp,GroupViTModel:()=>TC,GroupViTPreTrainedModel:()=>$p,HeliumForCausalLM:()=>AC,HeliumModel:()=>EC,HeliumPreTrainedModel:()=>Yl,HieraForImageClassification:()=>SC,HieraModel:()=>CC,HieraPreTrainedModel:()=>Jl,HubertForCTC:()=>DC,HubertForSequenceClassification:()=>zC,HubertModel:()=>NC,HubertPreTrainedModel:()=>OC,HunYuanDenseV1ForCausalLM:()=>RC,HunYuanDenseV1Model:()=>BC,HunYuanDenseV1PreTrainedModel:()=>Kl,IJepaForImageClassification:()=>$C,IJepaModel:()=>GC,IJepaPreTrainedModel:()=>Zl,Idefics3ForConditionalGeneration:()=>Vp,JAISLMHeadModel:()=>UC,JAISModel:()=>VC,JAISPreTrainedModel:()=>ec,JinaCLIPModel:()=>jC,JinaCLIPPreTrainedModel:()=>La,JinaCLIPTextModel:()=>Up,JinaCLIPVisionModel:()=>qC,Lfm2ForCausalLM:()=>HC,Lfm2Model:()=>WC,Lfm2MoeForCausalLM:()=>YC,Lfm2MoeModel:()=>XC,Lfm2MoePreTrainedModel:()=>rc,Lfm2PreTrainedModel:()=>tc,Lfm2VlForConditionalGeneration:()=>JC,LightOnOcrForConditionalGeneration:()=>QC,LiteWhisperForConditionalGeneration:()=>HL,Llama4ForCausalLM:()=>eS,Llama4PreTrainedModel:()=>jp,LlamaForCausalLM:()=>ZC,LlamaModel:()=>KC,LlamaPreTrainedModel:()=>sc,LlavaForConditionalGeneration:()=>mr,LlavaOnevisionForConditionalGeneration:()=>mr,LlavaPreTrainedModel:()
=>Lp,LlavaQwen2ForCausalLM:()=>XA,LongT5ForConditionalGeneration:()=>rS,LongT5Model:()=>tS,LongT5PreTrainedModel:()=>nc,M2M100ForConditionalGeneration:()=>nS,M2M100Model:()=>sS,M2M100PreTrainedModel:()=>ac,MBartForCausalLM:()=>hS,MBartForConditionalGeneration:()=>uS,MBartForSequenceClassification:()=>dS,MBartModel:()=>cS,MBartPreTrainedModel:()=>fn,MPNetForMaskedLM:()=>KS,MPNetForQuestionAnswering:()=>tP,MPNetForSequenceClassification:()=>ZS,MPNetForTokenClassification:()=>eP,MPNetModel:()=>JS,MPNetPreTrainedModel:()=>As,MT5ForConditionalGeneration:()=>aP,MT5Model:()=>nP,MT5PreTrainedModel:()=>mc,MarianMTModel:()=>oS,MarianModel:()=>aS,MarianPreTrainedModel:()=>oc,MaskFormerForInstanceSegmentation:()=>lS,MaskFormerModel:()=>iS,MaskFormerPreTrainedModel:()=>ic,Metric3DForDepthEstimation:()=>fS,Metric3DPreTrainedModel:()=>qp,Metric3Dv2ForDepthEstimation:()=>_S,Metric3Dv2PreTrainedModel:()=>Wp,MgpstrForSceneTextRecognition:()=>pS,MgpstrModelOutput:()=>Hp,MgpstrPreTrainedModel:()=>Qp,MimiDecoderModel:()=>Kp,MimiDecoderOutput:()=>Yp,MimiEncoderModel:()=>Jp,MimiEncoderOutput:()=>Xp,MimiModel:()=>mS,MimiPreTrainedModel:()=>Ia,Mistral4ForCausalLM:()=>yS,Mistral4Model:()=>vS,Mistral4PreTrainedModel:()=>cc,MistralForCausalLM:()=>wS,MistralModel:()=>gS,MistralPreTrainedModel:()=>lc,MobileBertForMaskedLM:()=>MS,MobileBertForQuestionAnswering:()=>kS,MobileBertForSequenceClassification:()=>xS,MobileBertModel:()=>bS,MobileBertPreTrainedModel:()=>_n,MobileLLMForCausalLM:()=>ES,MobileLLMModel:()=>TS,MobileLLMPreTrainedModel:()=>uc,MobileNetV1ForImageClassification:()=>CS,MobileNetV1ForSemanticSegmentation:()=>SS,MobileNetV1Model:()=>AS,MobileNetV1PreTrainedModel:()=>Oa,MobileNetV2ForImageClassification:()=>FS,MobileNetV2ForSemanticSegmentation:()=>LS,MobileNetV2Model:()=>PS,MobileNetV2PreTrainedModel:()=>Na,MobileNetV3ForImageClassification:()=>OS,MobileNetV3ForSemanticSegmentation:()=>NS,MobileNetV3Model:()=>IS,MobileNetV3PreTrainedModel:()=>Da,MobileNetV4ForImageClassification:()=
>zS,MobileNetV4ForSemanticSegmentation:()=>BS,MobileNetV4Model:()=>DS,MobileNetV4PreTrainedModel:()=>za,MobileViTForImageClassification:()=>GS,MobileViTModel:()=>RS,MobileViTPreTrainedModel:()=>dc,MobileViTV2ForImageClassification:()=>VS,MobileViTV2Model:()=>$S,MobileViTV2PreTrainedModel:()=>hc,ModernBertDecoderForCausalLM:()=>QS,ModernBertDecoderModel:()=>HS,ModernBertDecoderPreTrainedModel:()=>fc,ModernBertForMaskedLM:()=>jS,ModernBertForSequenceClassification:()=>qS,ModernBertForTokenClassification:()=>WS,ModernBertModel:()=>US,ModernBertPreTrainedModel:()=>pn,Moondream1ForConditionalGeneration:()=>QA,MoonshineForConditionalGeneration:()=>YS,MoonshineModel:()=>XS,MoonshinePreTrainedModel:()=>_c,MptForCausalLM:()=>sP,MptModel:()=>rP,MptPreTrainedModel:()=>pc,MultiModalityCausalLM:()=>oP,MultiModalityPreTrainedModel:()=>Zp,MusicgenForCausalLM:()=>lP,MusicgenForConditionalGeneration:()=>em,MusicgenModel:()=>iP,MusicgenPreTrainedModel:()=>gc,NanoChatForCausalLM:()=>uP,NanoChatModel:()=>cP,NanoChatPreTrainedModel:()=>wc,NemotronHForCausalLM:()=>hP,NemotronHModel:()=>dP,NemotronHPreTrainedModel:()=>vc,NeoBertForMaskedLM:()=>_P,NeoBertForQuestionAnswering:()=>gP,NeoBertForSequenceClassification:()=>pP,NeoBertForTokenClassification:()=>mP,NeoBertModel:()=>fP,NeoBertPreTrainedModel:()=>Cs,NomicBertModel:()=>wP,NomicBertPreTrainedModel:()=>tm,OPTForCausalLM:()=>PP,OPTModel:()=>SP,OPTPreTrainedModel:()=>Tc,Olmo2ForCausalLM:()=>MP,Olmo2Model:()=>bP,Olmo2PreTrainedModel:()=>bc,Olmo3ForCausalLM:()=>kP,Olmo3Model:()=>xP,Olmo3PreTrainedModel:()=>Mc,OlmoForCausalLM:()=>yP,OlmoHybridForCausalLM:()=>EP,OlmoHybridModel:()=>TP,OlmoHybridPreTrainedModel:()=>xc,OlmoModel:()=>vP,OlmoPreTrainedModel:()=>yc,OpenELMForCausalLM:()=>CP,OpenELMModel:()=>AP,OpenELMPreTrainedModel:()=>kc,OwlViTForObjectDetection:()=>OP,OwlViTModel:()=>IP,OwlViTPreTrainedModel:()=>Ac,Owlv2ForObjectDetection:()=>LP,Owlv2Model:()=>FP,Owlv2PreTrainedModel:()=>Ec,PaliGemmaForConditionalGeneration:()=>NP,ParakeetForC
TC:()=>DP,ParakeetPreTrainedModel:()=>rm,PatchTSMixerForPrediction:()=>BP,PatchTSMixerModel:()=>zP,PatchTSMixerPreTrainedModel:()=>Cc,PatchTSTForPrediction:()=>GP,PatchTSTModel:()=>RP,PatchTSTPreTrainedModel:()=>Sc,Phi3ForCausalLM:()=>jP,Phi3Model:()=>UP,Phi3PreTrainedModel:()=>Fc,Phi3VForCausalLM:()=>nm,Phi3VPreTrainedModel:()=>sm,PhiForCausalLM:()=>VP,PhiModel:()=>$P,PhiPreTrainedModel:()=>Pc,PreTrainedModel:()=>P,PvtForImageClassification:()=>WP,PvtModel:()=>qP,PvtPreTrainedModel:()=>Lc,PyAnnoteForAudioFrameClassification:()=>QP,PyAnnoteModel:()=>HP,PyAnnotePreTrainedModel:()=>Ic,Qwen2ForCausalLM:()=>YP,Qwen2Model:()=>XP,Qwen2MoeForCausalLM:()=>KP,Qwen2MoeModel:()=>JP,Qwen2MoePreTrainedModel:()=>Nc,Qwen2PreTrainedModel:()=>Oc,Qwen2VLForCausalLM:()=>zp,Qwen2VLForConditionalGeneration:()=>Bl,Qwen2VLPreTrainedModel:()=>Dp,Qwen2_5_VLForCausalLM:()=>Bp,Qwen2_5_VLForConditionalGeneration:()=>Rl,Qwen3ForCausalLM:()=>eF,Qwen3Model:()=>ZP,Qwen3MoeForCausalLM:()=>rF,Qwen3MoeModel:()=>tF,Qwen3MoePreTrainedModel:()=>zc,Qwen3NextForCausalLM:()=>nF,Qwen3NextModel:()=>sF,Qwen3NextPreTrainedModel:()=>Bc,Qwen3PreTrainedModel:()=>Dc,Qwen3VLForCausalLM:()=>am,Qwen3VLForConditionalGeneration:()=>Rc,Qwen3VLMoeForCausalLM:()=>oF,Qwen3VLMoeForConditionalGeneration:()=>aF,Qwen3_5ForCausalLM:()=>om,Qwen3_5ForConditionalGeneration:()=>Gc,Qwen3_5MoeForCausalLM:()=>lF,Qwen3_5MoeForConditionalGeneration:()=>iF,RFDetrForObjectDetection:()=>hF,RFDetrModel:()=>dF,RFDetrObjectDetectionOutput:()=>im,RFDetrPreTrainedModel:()=>Vc,RTDetrForObjectDetection:()=>O2,RTDetrModel:()=>I2,RTDetrObjectDetectionOutput:()=>un,RTDetrPreTrainedModel:()=>vl,RTDetrV2ForObjectDetection:()=>kF,RTDetrV2Model:()=>xF,RTDetrV2ObjectDetectionOutput:()=>lm,RTDetrV2PreTrainedModel:()=>Uc,ResNetForImageClassification:()=>uF,ResNetModel:()=>cF,ResNetPreTrainedModel:()=>$c,RoFormerForMaskedLM:()=>vF,RoFormerForQuestionAnswering:()=>MF,RoFormerForSequenceClassification:()=>yF,RoFormerForTokenClassification:()=>bF,RoFormerModel
:()=>wF,RoFormerPreTrainedModel:()=>Ps,RobertaForMaskedLM:()=>_F,RobertaForQuestionAnswering:()=>gF,RobertaForSequenceClassification:()=>pF,RobertaForTokenClassification:()=>mF,RobertaModel:()=>fF,RobertaPreTrainedModel:()=>Ss,Sam2ImageSegmentationOutput:()=>dm,Sam2Model:()=>jc,Sam2PreTrainedModel:()=>hm,Sam3TrackerModel:()=>AF,SamImageSegmentationOutput:()=>cm,SamModel:()=>TF,SamPreTrainedModel:()=>um,SapiensForDepthEstimation:()=>SF,SapiensForNormalEstimation:()=>PF,SapiensForSemanticSegmentation:()=>CF,SapiensPreTrainedModel:()=>Ba,SegformerForImageClassification:()=>LF,SegformerForSemanticSegmentation:()=>IF,SegformerModel:()=>FF,SegformerPreTrainedModel:()=>Ra,SiglipModel:()=>OF,SiglipPreTrainedModel:()=>qc,SiglipTextModel:()=>fm,SiglipVisionModel:()=>NF,SmolLM3ForCausalLM:()=>zF,SmolLM3Model:()=>DF,SmolLM3PreTrainedModel:()=>Wc,SmolVLMForConditionalGeneration:()=>BF,SnacDecoderModel:()=>pm,SnacEncoderModel:()=>_m,SnacModel:()=>RF,SnacPreTrainedModel:()=>Ga,SolarOpenForCausalLM:()=>$F,SolarOpenModel:()=>GF,SolarOpenPreTrainedModel:()=>Hc,SpeechT5ForSpeechToText:()=>UF,SpeechT5ForTextToSpeech:()=>jF,SpeechT5HifiGan:()=>qF,SpeechT5Model:()=>VF,SpeechT5PreTrainedModel:()=>$a,SqueezeBertForMaskedLM:()=>HF,SqueezeBertForQuestionAnswering:()=>XF,SqueezeBertForSequenceClassification:()=>QF,SqueezeBertModel:()=>WF,SqueezeBertPreTrainedModel:()=>mn,StableLmForCausalLM:()=>JF,StableLmModel:()=>YF,StableLmPreTrainedModel:()=>Qc,Starcoder2ForCausalLM:()=>ZF,Starcoder2Model:()=>KF,Starcoder2PreTrainedModel:()=>Xc,StyleTextToSpeech2Model:()=>eL,StyleTextToSpeech2PreTrainedModel:()=>mm,SupertonicForConditionalGeneration:()=>wm,SupertonicPreTrainedModel:()=>gm,Swin2SRForImageSuperResolution:()=>aL,Swin2SRModel:()=>nL,Swin2SRPreTrainedModel:()=>Yc,SwinForImageClassification:()=>rL,SwinForSemanticSegmentation:()=>sL,SwinModel:()=>tL,SwinPreTrainedModel:()=>Va,T5ForConditionalGeneration:()=>iL,T5Model:()=>oL,T5PreTrainedModel:()=>Jc,TableTransformerForObjectDetection:()=>cL,Table
TransformerModel:()=>lL,TableTransformerObjectDetectionOutput:()=>vm,TableTransformerPreTrainedModel:()=>Kc,TrOCRForCausalLM:()=>uL,TrOCRPreTrainedModel:()=>ym,UltravoxModel:()=>Xl,UltravoxPreTrainedModel:()=>Rp,UniSpeechForCTC:()=>hL,UniSpeechForSequenceClassification:()=>fL,UniSpeechModel:()=>dL,UniSpeechPreTrainedModel:()=>Ua,UniSpeechSatForAudioFrameClassification:()=>gL,UniSpeechSatForCTC:()=>pL,UniSpeechSatForSequenceClassification:()=>mL,UniSpeechSatModel:()=>_L,UniSpeechSatPreTrainedModel:()=>gn,VaultGemmaForCausalLM:()=>vL,VaultGemmaModel:()=>wL,VaultGemmaPreTrainedModel:()=>Zc,ViTForImageClassification:()=>ML,ViTMAEModel:()=>xL,ViTMAEPreTrainedModel:()=>bm,ViTMSNForImageClassification:()=>TL,ViTMSNModel:()=>kL,ViTMSNPreTrainedModel:()=>tu,ViTModel:()=>bL,ViTPreTrainedModel:()=>eu,VisionEncoderDecoderModel:()=>yL,VitMatteForImageMatting:()=>EL,VitMattePreTrainedModel:()=>Mm,VitPoseForPoseEstimation:()=>AL,VitPosePreTrainedModel:()=>xm,VitsModel:()=>CL,VitsModelOutput:()=>km,VitsPreTrainedModel:()=>Tm,VoxtralForConditionalGeneration:()=>SL,VoxtralRealtimeForConditionalGeneration:()=>Cm,VoxtralRealtimePreTrainedModel:()=>Am,Wav2Vec2BertForCTC:()=>zL,Wav2Vec2BertForSequenceClassification:()=>BL,Wav2Vec2BertModel:()=>DL,Wav2Vec2BertPreTrainedModel:()=>ja,Wav2Vec2ForAudioFrameClassification:()=>IC,Wav2Vec2ForCTC:()=>FC,Wav2Vec2ForSequenceClassification:()=>LC,Wav2Vec2Model:()=>PC,Wav2Vec2PreTrainedModel:()=>Or,WavLMForAudioFrameClassification:()=>UL,WavLMForCTC:()=>GL,WavLMForSequenceClassification:()=>$L,WavLMForXVector:()=>VL,WavLMModel:()=>RL,WavLMPreTrainedModel:()=>Fs,WeSpeakerResNetModel:()=>jL,WeSpeakerResNetPreTrainedModel:()=>Pm,WhisperForConditionalGeneration:()=>Fm,WhisperModel:()=>WL,WhisperPreTrainedModel:()=>su,XLMForQuestionAnswering:()=>KL,XLMForSequenceClassification:()=>YL,XLMForTokenClassification:()=>JL,XLMModel:()=>QL,XLMPreTrainedModel:()=>Ls,XLMRobertaForMaskedLM:()=>eI,XLMRobertaForQuestionAnswering:()=>sI,XLMRobertaForSequenceClassificat
ion:()=>tI,XLMRobertaForTokenClassification:()=>rI,XLMRobertaModel:()=>ZL,XLMRobertaPreTrainedModel:()=>Is,XLMWithLMHeadModel:()=>XL,XVectorOutput:()=>Sm,YolosForObjectDetection:()=>aI,YolosModel:()=>nI,YolosObjectDetectionOutput:()=>Lm,YolosPreTrainedModel:()=>nu,YoutuForCausalLM:()=>iI,YoutuModel:()=>oI,YoutuPreTrainedModel:()=>au});/* Start of a single comma-chained var declaration that defines the model classes. Each family has a base class extending P (exported in the map above as PreTrainedModel); the task-specific subclasses either add nothing or override _call to wrap the awaited parent result in an output wrapper (ie, ht, Je, He - all defined outside this chunk). */var cn=class extends P{},AE=class extends cn{},CE=class extends cn{async _call(e){return new ie(await super._call(e))}},SE=class extends cn{async _call(e){return new ht(await super._call(e))}},PE=class extends cn{async _call(e){return new Je(await super._call(e))}},nl=class extends P{},FE=class extends nl{},LE=class extends nl{},al=class extends P{},IE=class extends al{},OE=class extends al{},ol=class extends P{},NE=class extends ol{},DE=class extends ol{},il=class extends P{},zE=class extends il{},BE=class extends il{},Aa=class extends P{},RE=class extends Aa{},GE=class extends Aa{},$E=class extends Aa{async _call(e){return new ie(await super._call(e))}},ll=class extends P{},VE=class extends ll{},UE=class extends ll{async _call(e){return new ie(await super._call(e))}},/* Family with four wrapped task heads (Je, ie, He, ht) on top of the shared base ys. */ys=class extends P{},jE=class extends ys{},qE=class extends ys{async _call(e){return new Je(await super._call(e))}},WE=class extends ys{async _call(e){return new ie(await super._call(e))}},HE=class extends ys{async _call(e){return new He(await super._call(e))}},QE=class extends ys{async _call(e){return new ht(await super._call(e))}},cl=class extends P{},XE=class extends cl{},YE=class extends cl{},ul=class extends P{},JE=class extends ul{},KE=class extends ul{},dl=class extends P{},ZE=class extends dl{},e2=class extends dl{},bs=class extends P{},t2=class extends bs{},r2=class extends bs{async _call(e){return new Je(await super._call(e))}},s2=class extends bs{async _call(e){return new ie(await super._call(e))}},n2=class extends bs{async _call(e){return new He(await super._call(e))}},a2=class extends bs{async _call(e){return new ht(await 
super._call(e))}},/* o2: BigInt id used by generate() below to fill a [batch,3] tensor appended after the generated tokens. hp: BigInt id that prepare_inputs_for_generation compares token ids against. What the two numeric values stand for is not visible in this chunk. */o2=4299n,hp=6561n,/* fp: base class that only declares forward_params, main_input_name and the extra keys (_return_dict_in_generate_keys) for the speech model _p. */fp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","inputs_embeds","attention_mask","position_ids","audio_values","exaggeration","audio_features","audio_tokens","speaker_embeddings","speaker_features","past_key_values"]);k(this,"main_input_name","input_ids");k(this,"_return_dict_in_generate_keys",["audio_tokens","speaker_embeddings","speaker_features"])}},/* _p: speech generation model driven by the sessions embed_tokens, speech_encoder and conditional_decoder. */_p=class extends fp{/* Runs the speech_encoder session on the given audio_values. */async encode_speech(e){return xe(this.sessions.speech_encoder,{audio_values:e})}/* forward: when inputs_embeds is not supplied, input_ids are embedded via the embed_tokens session; "exaggeration" and "position_ids" are only passed if that session lists them as inputs. A missing exaggeration defaults to 0.5 per batch row; numbers and arrays are converted to a tensor, any other non-tensor type throws. If conditioning is available (all four cached audio/speaker tensors, or raw audio_values to encode), audio_features are joined in front of the text embeddings along dim 1 and the attention mask is rebuilt for the joined length; otherwise a cache and exactly one new token are required and the mask covers cache length + 1. The conditioning object (if any) is spread into the returned result. */async forward({input_ids:e=null,attention_mask:t=null,audio_values:r=null,exaggeration:s=null,position_ids:n=null,inputs_embeds:a=null,past_key_values:o=null,generation_config:i=null,logits_processor:l=null,audio_features:c=null,audio_tokens:d=null,speaker_embeddings:h=null,speaker_features:_=null,...p}){let w;if(!a){const y=this.sessions.embed_tokens.inputNames,M={input_ids:e};if(y.includes("exaggeration")){if(!(s instanceof U)){const T=e.dims[0];if(s==null)s=ct([T],.5);else if(typeof s=="number")s=ct([T],s);else if(Array.isArray(s))s=new U("float32",s,[T]);else throw new Error("Unsupported type for `exaggeration` input")}M.exaggeration=s}/* The if below uses a comma expression: the earlier operands run for their side effects and only the last one (w||r) is the actual condition. */if(y.includes("position_ids")&&(M.position_ids=n),{inputs_embeds:a}=await xe(this.sessions.embed_tokens,M),c&&d&&h&&_&&(w={audio_features:c,audio_tokens:d,speaker_embeddings:h,speaker_features:_}),w||r)w??(w=await this.encode_speech(r)),a=ze([w.audio_features,a],1),t=yt([a.dims[0],a.dims[1]]);else{const T=a.dims[1];if(!o||T!==1)throw new Error("Incorrect state encountered during generation.");const A=o.get_seq_length();t=yt([a.dims[0],A+T])}}return{...await pr(this,{inputs_embeds:a,past_key_values:o,attention_mask:t,generation_config:i,logits_processor:l},!1),...w}}/* prepare_inputs_for_generation: fills position_ids when they are missing and the embed_tokens session accepts them. Single-token step: each row's position is (row length - index of the last token equal to hp - 1). Otherwise: a per-row counter that yields 0 without advancing for ids >= hp. On single-token steps the audio/speaker inputs are deleted from the model inputs before delegating to ln. */prepare_inputs_for_generation(e,t,r){if(!t.position_ids&&this.sessions.embed_tokens.inputNames.includes("position_ids"))if(t.input_ids.dims[1]===1){const 
s=Array.from({length:e.length},(n,a)=>e[a].length-e[a].findLastIndex(o=>o==hp)-1);t.position_ids=new U("int64",s,[e.length,1])}else{const n=t.input_ids.tolist().map(a=>{let o=0;return a.map(i=>i>=hp?0:o++)});t.position_ids=new U("int64",n.flat(),t.input_ids.dims)}return t.input_ids.dims[1]===1&&(delete t.audio_values,delete t.audio_features,delete t.audio_tokens,delete t.speaker_embeddings,delete t.speaker_features),ln(this,e,t)}/* generate: calls the parent generate with return_dict_in_generate forced on, slices the sequences on dim 1 from the prompt length to -1 (presumably dropping the prompt and the final token - confirm against the tensor slice semantics), joins audio_tokens + generated tokens + three o2 ids along dim 1, and returns the waveform output of the conditional_decoder session. */async generate(e){const{sequences:t,audio_tokens:r,speaker_embeddings:s,speaker_features:n}=await super.generate({...e,return_dict_in_generate:!0}),a=t.slice(null,[e.input_ids.dims[1],-1]),o=ct([a.dims[0],3],o2),i=ze([r,a,o],1),{waveform:l}=await xe(this.sessions.conditional_decoder,{speech_tokens:i,speaker_features:n,speaker_embeddings:s});return l}},pp=class extends P{},i2=class extends pp{},mp=class extends P{},l2=class extends mp{},Ca=class extends P{},c2=class extends Ca{},/* The subclasses below that override the static from_pretrained do so only to default model_file_name ("text_model", "audio_model" or "vision_model") when the caller did not pass one. */gp=class extends Ca{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},wp=class extends Ca{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"audio_model"})}},Kr=class extends P{},u2=class extends Kr{},d2=class extends Kr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},vp=class extends Kr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},h2=class extends Kr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"vision_model"})}},f2=class extends Kr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"vision_model"})}},hl=class extends P{},_2=class extends hl{},p2=class extends hl{},fl=class extends P{},m2=class extends fl{},g2=class extends fl{},/* _l is exported as CoherePreTrainedModel. */_l=class extends P{},w2=class 
extends _l{},v2=class extends _l{},pl=class extends P{},y2=class extends pl{},b2=class extends pl{},/* ml (exported as CohereAsrPreTrainedModel): sets requires_attention_mask to false, uses input_features as the main input and declares the decoder-side forward params. */ml=class extends P{constructor(){super(...arguments);k(this,"requires_attention_mask",!1);k(this,"main_input_name","input_features");k(this,"forward_params",["input_features","decoder_input_ids","decoder_attention_mask","past_key_values"])}},M2=class extends ml{},x2=class extends ml{},/* Ms family (exported as the ConvBert* classes): four heads wrapping the parent result in Je, ie, He and ht respectively. */Ms=class extends P{},k2=class extends Ms{},T2=class extends Ms{async _call(e){return new Je(await super._call(e))}},E2=class extends Ms{async _call(e){return new ie(await super._call(e))}},A2=class extends Ms{async _call(e){return new He(await super._call(e))}},C2=class extends Ms{async _call(e){return new ht(await super._call(e))}},gl=class extends P{},S2=class extends gl{},P2=class extends gl{async _call(e){return new ie(await super._call(e))}},wl=class extends P{},F2=class extends wl{},L2=class extends wl{async _call(e){return new ie(await super._call(e))}},vl=class extends P{},I2=class extends vl{},/* O2 refers to un, whose declarator comes later in this same var list; that is safe because un is only read when _call runs. */O2=class extends vl{async _call(e){return new un(await super._call(e))}},/* un (exported as RTDetrObjectDetectionOutput): plain holder for logits and pred_boxes; D2 below returns the same wrapper. */un=class extends Ye{constructor({logits:e,pred_boxes:t}){super(),this.logits=e,this.pred_boxes=t}},yl=class extends P{},N2=class extends yl{},D2=class extends yl{async _call(e){return new un(await super._call(e))}},/* yp / bp (exported as DacEncoderOutput / DacDecoderOutput): holders for audio_codes and audio_values. */yp=class extends Ye{constructor({audio_codes:e}){super(),this.audio_codes=e}},bp=class extends Ye{constructor({audio_values:e}){super(),this.audio_values=e}},Sa=class extends P{constructor(){super(...arguments);k(this,"main_input_name","input_values");k(this,"forward_params",["input_values"])}},/* z2 (exported as DacModel): encode/decode run the encoder_model / decoder_model sessions and wrap the result. Mp and xp load only one half by defaulting model_file_name. */z2=class extends Sa{async encode(e){return new yp(await xe(this.sessions.encoder_model,e))}async decode(e){return new bp(await xe(this.sessions.decoder_model,e))}},Mp=class extends Sa{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"encoder_model"})}},xp=class extends Sa{static async from_pretrained(e,t={}){return 
super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"decoder_model"})}},xs=class extends P{},B2=class extends xs{},R2=class extends xs{async _call(e){return new Je(await super._call(e))}},G2=class extends xs{async _call(e){return new ie(await super._call(e))}},$2=class extends xs{async _call(e){return new He(await super._call(e))}},V2=class extends xs{async _call(e){return new ht(await super._call(e))}},bl=class extends P{},U2=class extends bl{},j2=class extends bl{},ks=class extends P{},q2=class extends ks{},W2=class extends ks{async _call(e){return new Je(await super._call(e))}},H2=class extends ks{async _call(e){return new ie(await super._call(e))}},Q2=class extends ks{async _call(e){return new He(await super._call(e))}},X2=class extends ks{async _call(e){return new ht(await super._call(e))}},kp=class extends P{},Y2=class extends kp{},Ml=class extends P{},J2=class extends Ml{},K2=class extends Ml{async _call(e){return new ie(await super._call(e))}},Tp=class extends P{},Z2=class extends Tp{},Ep=class extends P{},eA=class extends Ep{},Pa=class extends P{},tA=class extends Pa{},rA=class extends Pa{async _call(e){return new xl(await super._call(e))}},sA=class extends Pa{async _call(e){return new Ap(await super._call(e))}},/* xl / Ap (exported as DetrObjectDetectionOutput / DetrSegmentationOutput): Ap additionally carries pred_masks. */xl=class extends Ye{constructor({logits:e,pred_boxes:t}){super(),this.logits=e,this.pred_boxes=t}},Ap=class extends Ye{constructor({logits:e,pred_boxes:t,pred_masks:r}){super(),this.logits=e,this.pred_boxes=t,this.pred_masks=r}},kl=class extends P{},nA=class extends kl{},aA=class extends kl{async _call(e){return new ie(await super._call(e))}},Tl=class extends P{},oA=class extends Tl{},iA=class extends Tl{async _call(e){return new ie(await super._call(e))}},Cp=class extends P{},lA=class extends Cp{},Sp=class extends P{},cA=class extends Sp{},Ts=class extends P{},uA=class extends Ts{},dA=class extends Ts{async _call(e){return new ie(await super._call(e))}},hA=class extends Ts{async _call(e){return new He(await super._call(e))}},fA=class 
extends Ts{async _call(e){return new ht(await super._call(e))}},_A=class extends Ts{async _call(e){return new Je(await super._call(e))}},Pp=class extends P{},pA=class extends Pp{},El=class extends P{},mA=class extends El{},gA=class extends El{},Al=class extends P{},wA=class extends Al{},vA=class extends Al{async _call(e){return new ie(await super._call(e))}},Es=class extends P{},yA=class extends Es{},bA=class extends Es{async _call(e){return new Je(await super._call(e))}},MA=class extends Es{async _call(e){return new ie(await super._call(e))}},xA=class extends Es{async _call(e){return new He(await super._call(e))}},kA=class extends Es{async _call(e){return new ht(await super._call(e))}},Cl=class extends P{},TA=class extends Cl{},EA=class extends Cl{},dn=class extends P{},AA=class extends dn{},CA=class extends dn{async _call(e){return new Je(await super._call(e))}},SA=class extends dn{async _call(e){return new ie(await super._call(e))}},PA=class extends dn{async _call(e){return new He(await super._call(e))}},hn=class extends P{},FA=class extends hn{},LA=class extends hn{async _call(e){return new Je(await super._call(e))}},IA=class extends hn{async _call(e){return new ie(await super._call(e))}},OA=class extends hn{async _call(e){return new He(await super._call(e))}},Sl=class extends P{},NA=class extends Sl{},DA=class extends Sl{},Pl=class extends P{},zA=class extends Pl{},BA=class extends Pl{},Fl=class extends P{},RA=class extends Fl{},GA=class extends Fl{},Ll=class extends P{},$A=class extends Ll{},VA=class extends Ll{async _call(e){return new ie(await super._call(e))}},/* Fp (exported as Florence2PreTrainedModel): declares the encoder/decoder forward params and uses inputs_embeds as the main input. */Fp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","inputs_embeds","attention_mask","pixel_values","encoder_outputs","decoder_input_ids","decoder_inputs_embeds","decoder_attention_mask","past_key_values"]);k(this,"main_input_name","inputs_embeds")}},UA=class extends 
Fp{_merge_input_ids_with_image_features({inputs_embeds:e,image_features:t,input_ids:r,attention_mask:s}){/* Image features are placed in front of the text embeddings along dim 1; the mask is extended with yt(first two dims of image_features) - presumably an all-ones mask, confirm against the yt helper. input_ids is accepted but not used here. */return{inputs_embeds:ze([t,e],1),attention_mask:ze([yt(t.dims.slice(0,2)),s],1)}}/* _prepare_inputs_embeds: requires input_ids or pixel_values; encodes whichever is present and merges them when both exist, otherwise returns the single available embedding with the mask unchanged. */async _prepare_inputs_embeds({input_ids:e,pixel_values:t,inputs_embeds:r,attention_mask:s}){if(!e&&!t)throw new Error("Either `input_ids` or `pixel_values` should be provided.");let n,a;return e&&(n=await this.encode_text({input_ids:e})),t&&(a=await this.encode_image({pixel_values:t})),n&&a?{inputs_embeds:r,attention_mask:s}=this._merge_input_ids_with_image_features({inputs_embeds:n,image_features:a,input_ids:e,attention_mask:s}):r=n||a,{inputs_embeds:r,attention_mask:s}}/* forward: builds encoder inputs_embeds if none were given, runs the encoder helper Ir only when encoder_outputs is missing, embeds decoder_input_ids when decoder_inputs_embeds is missing (throwing if neither is provided), then calls the decoder helper pr with the encoder mask passed as encoder_attention_mask. */async forward({input_ids:e,pixel_values:t,attention_mask:r,decoder_input_ids:s,decoder_attention_mask:n,encoder_outputs:a,past_key_values:o,inputs_embeds:i,decoder_inputs_embeds:l}){if(i||({inputs_embeds:i,attention_mask:r}=await this._prepare_inputs_embeds({input_ids:e,pixel_values:t,inputs_embeds:i,attention_mask:r})),!a){let{last_hidden_state:d}=await Ir(this,{inputs_embeds:i,attention_mask:r});a=d}if(!l){if(!s)throw new Error("Either `decoder_input_ids` or `decoder_inputs_embeds` should be provided.");l=await this.encode_text({input_ids:s})}return await pr(this,{inputs_embeds:l,attention_mask:n,encoder_attention_mask:r,encoder_hidden_states:a,past_key_values:o},!0)}},Il=class extends P{},jA=class extends Il{},qA=class extends Il{},Ol=class extends P{},WA=class extends Ol{},HA=class extends Ol{},Lp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","pixel_values","position_ids","past_key_values"])}},/* mr (exported as both LlavaForConditionalGeneration and LlavaOnevisionForConditionalGeneration): reshapes image_features to (-1, last dim) and delegates the merge to rl, taking the image token id from config.image_token_index and falling back to config.image_token_id. */mr=class extends Lp{_merge_input_ids_with_image_features(e){const t=e.image_features.dims.at(-1),r=e.image_features.view(-1,t);return rl({image_token_id:this.config.image_token_index??this.config.image_token_id,...e,image_features:r})}},QA=class extends mr{},XA=class extends mr{},Ip=class extends P{},YA=class extends Ip{},Op=class extends mr{},JA=class extends 
Op{},Np=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","inputs_embeds","per_layer_inputs","position_ids","pixel_values","input_features","input_features_mask","past_key_values"])}},/* Fa (exported as Gemma3nForConditionalGeneration). forward: when inputs_embeds or per_layer_inputs is missing, both are produced by the embed_tokens session; image and audio features are merged in only when the input has more than one token (e.dims[1]!==1). Extra unnamed arguments (h) are forwarded to _encode_vision. */Fa=class extends Np{async forward({input_ids:e=null,attention_mask:t=null,pixel_values:r=null,input_features:s=null,input_features_mask:n=null,position_ids:a=null,inputs_embeds:o=null,per_layer_inputs:i=null,past_key_values:l=null,generation_config:c=null,logits_processor:d=null,...h}){if((!o||!i)&&({inputs_embeds:o,per_layer_inputs:i}=await xe(this.sessions.embed_tokens,{input_ids:e}),e.dims[1]!==1)){if(r){const{image_features:p}=await this._encode_vision({pixel_values:r,...h});({inputs_embeds:o,attention_mask:t}=this._merge_input_ids_with_image_features({image_features:p,inputs_embeds:o,input_ids:e,attention_mask:t}))}if(s){const{audio_features:p}=await xe(this.sessions.audio_encoder,{input_features:s,input_features_mask:n});({inputs_embeds:o,attention_mask:t}=this._merge_input_ids_with_audio_features({audio_features:p,inputs_embeds:o,input_ids:e,attention_mask:t}))}}return await pr(this,{inputs_embeds:o,per_layer_inputs:i,past_key_values:l,attention_mask:t,position_ids:a,generation_config:c,logits_processor:d},!0)}/* Overridable hook: this base version passes only pixel_values to the vision_encoder session. */_encode_vision(e){return xe(this.sessions.vision_encoder,{pixel_values:e.pixel_values})}/* Both merge helpers flatten the features to (-1, last dim) and delegate to rl / dp with the token id read from the config. */_merge_input_ids_with_image_features(e){const t=e.image_features.dims.at(-1),r=e.image_features.view(-1,t);return rl({image_token_id:this.config.image_token_id,...e,image_features:r})}_merge_input_ids_with_audio_features(e){const t=e.audio_features.dims.at(-1),r=e.audio_features.view(-1,t);return dp({audio_token_id:this.config.audio_token_id,...e,audio_features:r})}},KA=class extends Fa{},Nl=class extends 
Fa{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","inputs_embeds","per_layer_inputs","position_ids","pixel_values","image_position_ids","input_features","input_features_mask","past_key_values"])}/* Override: also forwards image_position_ids to the vision_encoder session under the input name pixel_position_ids. */_encode_vision(t){return xe(this.sessions.vision_encoder,{pixel_values:t.pixel_values,pixel_position_ids:t.image_position_ids})}},ZA=class extends Nl{},Dl=class extends P{},eC=class extends Dl{},tC=class extends Dl{},zl=class extends P{},rC=class extends zl{},sC=class extends zl{},Dp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","position_ids","past_key_values","pixel_values","image_grid_thw"])}},/* Bl (exported as Qwen2VLForConditionalGeneration): computes three-axis position ids and per-row deltas for generation. image_grid_thw_name is the vision_encoder input name used for the grid tensor ("grid_thw" here; subclasses override it). */Bl=class extends Dp{constructor(){super(...arguments);k(this,"image_grid_thw_name","grid_thw")}/* _get_text_only_rope_index(t=input_ids, r=attention_mask): returns [positions of shape [3,...dims], deltas]. With a mask, the values produced by cp(mask) are repeated for the three axes and each row delta is max(row)+1+row length. NOTE(review): the delta adds BigInt(n[1]) - confirm the sign against the reference implementation. Without a mask, each position is floor((i % n) / s) with [s,n]=t.dims and the deltas come from Jf([s,1]). NOTE(review): that equals i % n only when s is 1 - confirm the intended behaviour when the first dimension is larger. */_get_text_only_rope_index(t,r){if(r){const{data:s,dims:n}=cp(r),a=BigInt64Array.from({length:3*s.length},(i,l)=>s[l%s.length]),o=Array.from({length:n[0]},(i,l)=>je(s.subarray(n[1]*l,n[1]*(l+1)))[0]+1n+BigInt(n[1]));return[new U("int64",a,[3,...n]),new U("int64",o,[o.length,1])]}else{const[s,n]=t.dims,a=BigInt64Array.from({length:3*s*n},(o,i)=>BigInt(Math.floor(i%n/s)));return[new U("int64",a,[3,...t.dims]),Jf([s,1])]}}/* _reorder_and_write_positions(t=list of flat chunks each holding three equal thirds, r=mask row, s=3 x batch x seq output array, n=row index): regroups the chunks so that all first thirds come first, then all second thirds, then all third thirds; writes the values into s[axis][n][col] for every column where the mask equals 1 (in order); returns the regrouped flat array. */_reorder_and_write_positions(t,r,s,n){const a=t.reduce((c,d)=>c+d.length,0),o=new Array(a);let i=0;for(let c=0;c<3;++c)for(const d of t){const h=d.length/3;for(let _=c*h;_<(c+1)*h;++_)o[i++]=d[_]}let l=0;for(let c=0;c<r.length;++c)if(r[c]==1){for(let d=0;d<3;++d)s[d][n][c]=o[d*a/3+l];++l}return o}/* _get_multimodal_rope_positions: walks the ids of one row. For every token that directly follows a vision_start_token_id it emits (1) a text chunk whose three thirds are identical running positions and (2) a vision chunk built from the next image or video grid entry (whichever token comes first), with the last two grid values divided by spatial_merge_size. A trailing text chunk covers the remaining ids. state.image_index / state.video_index are advanced in place so that grid entries are consumed across rows. */_get_multimodal_rope_positions({filtered_ids:t,image_grid_thw_list:r,video_grid_thw_list:s,spatial_merge_size:n,state:a}){const{image_token_id:o,video_token_id:i,vision_start_token_id:l}=this.config,c=t,h=c.reduce((T,A,C)=>(A==l&&T.push(C),T),[]).map(T=>c[T+1]),_=h.filter(T=>T==o).length,p=h.filter(T=>T==i).length,w=[];let v=0,y=_,M=p;for(let T=0;T<h.length;++T){const 
A=c.findIndex((X,J)=>J>v&&X==o),C=c.findIndex((X,J)=>J>v&&X==i),S=y>0&&A!==-1?A:c.length+1,N=M>0&&C!==-1?C:c.length+1;let x,R,z,$;S<N?([R,z,$]=r[a.image_index],++a.image_index,--y,x=S):([R,z,$]=s[a.video_index],++a.video_index,--M,x=N);const[Q,H,D]=[Number(R),Math.floor(Number(z)/n),Math.floor(Number($)/n)],I=x-v,te=w.length>0?je(w.at(-1))[0]+1:0;w.push(Array.from({length:3*I},(X,J)=>te+J%I));const W=I+te,ee=Q*H*D,G=Array.from({length:ee},(X,J)=>W+Math.floor(J/(H*D))),L=Array.from({length:ee},(X,J)=>W+Math.floor(J/D)%H),V=Array.from({length:ee},(X,J)=>W+J%D);w.push([G,L,V].flat()),v=x+ee}if(v<c.length){const T=w.length>0?je(w.at(-1))[0]+1:0,A=c.length-v;w.push(Array.from({length:3*A},(C,S)=>T+S%A))}return w}/* get_rope_index(t=input_ids, r=image grid, s=video grid, n=attention_mask): spatial_merge_size defaults to 2. With any grid present it processes each row using only the ids whose mask is 1 (a missing mask is replaced by Yf(t)) and returns [int64 positions of shape [3,batch,seq], int64 deltas of shape [batch,1]], where delta = max position + 1 - row length. Without grids it falls back to _get_text_only_rope_index. */get_rope_index(t,r,s,n){const{vision_config:a}=this.config,o=a.spatial_merge_size??2;if(r||s){const i=t.tolist();n||(n=Yf(t));const l=n.tolist(),c=Array.from({length:3},()=>Array.from({length:t.dims[0]},()=>Array.from({length:t.dims[1]},()=>0))),d=r?r.tolist():[],h=s?s.tolist():[],_={image_index:0,video_index:0},p=[];for(let w=0;w<i.length;++w){const v=i[w].filter((T,A)=>l[w][A]==1),y=this._get_multimodal_rope_positions({filtered_ids:v,image_grid_thw_list:d,video_grid_thw_list:h,spatial_merge_size:o,state:_}),M=this._reorder_and_write_positions(y,l[w],c,w);p.push(je(M)[0]+1-i[w].length)}return[new U("int64",c.flat(1/0),[3,t.dims[0],t.dims[1]]),new U("int64",p,[p.length,1])]}else return this._get_text_only_rope_index(t,n)}/* encode_image: runs the vision_encoder session, passing the grid tensor under the input name stored in image_grid_thw_name. */async encode_image({pixel_values:t,image_grid_thw:r}){return(await xe(this.sessions.vision_encoder,{pixel_values:t,[this.image_grid_thw_name]:r})).image_features}_merge_input_ids_with_image_features(t){return rl({image_token_id:this.config.image_token_id,...t})}/* prepare_inputs_for_generation: leaves the inputs untouched when there is no attention_mask, position_ids are already set, or the decoder session has no position_ids input. Without a cache it computes position_ids and rope_deltas for the full input. With a cache it clears pixel_values; if the cache is shorter than the input it recomputes and keeps only the uncached tail of position_ids and input_ids, otherwise each position is cache length + rope_delta, repeated for the three axes via dr. */prepare_inputs_for_generation(t,r,s){if(!r.attention_mask||r.position_ids||!(this.sessions.decoder_model_merged??this.sessions.model).inputNames.includes("position_ids"))return 
r;if(!r.past_key_values)[r.position_ids,r.rope_deltas]=this.get_rope_index(r.input_ids,r.image_grid_thw,r.video_grid_thw,r.attention_mask);else{r.pixel_values=null;const a=r.past_key_values.get_seq_length();if(a<r.input_ids.dims[1]){const[o,i]=this.get_rope_index(r.input_ids,r.image_grid_thw,r.video_grid_thw,r.attention_mask);r.rope_deltas=i,r.position_ids=o.slice(null,null,[a,null]),r.input_ids=r.input_ids.slice(null,[a,null])}else{r.rope_deltas||([,r.rope_deltas]=this.get_rope_index(r.input_ids,r.image_grid_thw,r.video_grid_thw,r.attention_mask));const o=BigInt(a),i=r.rope_deltas.map(l=>o+l);r.position_ids=dr([i,i,i],0)}}return r}},zp=class extends Bl{},/* Rl and Bp only change the vision_encoder grid input name to "image_grid_thw". */Rl=class extends Bl{constructor(){super(...arguments);k(this,"image_grid_thw_name","image_grid_thw")}},Bp=class extends zp{constructor(){super(...arguments);k(this,"image_grid_thw_name","image_grid_thw")}},/* nC (exported as GlmOcrForConditionalGeneration): replaces the per-row position computation. get_vision_position_ids(e=start position, t=grid triple, r=divisor for t[0], s=divisor for t[1] and t[2]) returns three concatenated index lists: constant e, e+floor(i/(o*n)), and e+i%o. */nC=class extends Rl{get_vision_position_ids(e,t,r,s){const n=Math.floor(t[0]/r),a=Math.floor(t[1]/s),o=Math.floor(t[2]/s),i=a*o*n,l=Array.from({length:i},()=>e),c=Array.from({length:i},(h,_)=>e+Math.floor(_/(o*n))),d=Array.from({length:i},(h,_)=>e+_%o);return[...l,...c,...d]}/* Splits the row into maximal runs of image / non-image tokens ([isImage,start,end] triples). Text runs advance the running position by their length; image runs consume the next image grid entry and advance the running position by max(grid[1],grid[2])/spatial_merge_size. video_grid_thw_list is accepted but not used in this override. */_get_multimodal_rope_positions({filtered_ids:e,image_grid_thw_list:t,video_grid_thw_list:r,spatial_merge_size:s,state:n}){const{image_token_id:a}=this.config,o=[];let i=0,l=e[0]==a?1:0;for(let h=1;h<=e.length;++h){const _=h<e.length?e[h]==a?1:0:-1;_!==l&&(o.push([l,i,h]),i=h,l=_)}let c=0;const d=[];for(const[h,_,p]of o)if(h===0){const w=p-_;d.push(Array.from({length:3*w},(v,y)=>c+y%w)),c+=w}else{const w=t[n.image_index++].map(Number),v=w[0];d.push(this.get_vision_position_ids(c,w,v,s)),c+=Math.max(w[1],w[2])/s}return d}},Gl=class extends P{},aC=class extends Gl{},oC=class extends Gl{},$l=class extends P{},iC=class extends $l{},lC=class extends $l{},Vl=class extends P{},cC=class extends Vl{},uC=class extends Vl{},Ul=class extends P{},dC=class extends Ul{},hC=class extends Ul{},jl=class extends P{},fC=class extends jl{},_C=class extends 
jl{},ql=class extends P{},pC=class extends ql{},mC=class extends ql{},Wl=class extends P{},gC=class extends Wl{},wC=class extends Wl{},Hl=class extends P{},vC=class extends Hl{},yC=class extends Hl{},Ql=class extends P{},bC=class extends Ql{},MC=class extends Ql{},Rp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","position_ids","audio_values","past_key_values"])}},Xl=class extends Rp{_merge_input_ids_with_audio_features(e){const t=e.audio_features.dims.at(-1),r=e.audio_features.view(-1,t);return dp({audio_token_id:this.config.ignore_index??this.config.audio_token_id??this.config.audio_token_index,...e,audio_features:r})}},xC=class extends Xl{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","input_features","past_key_values"])}},Gp=class extends P{},kC=class extends Gp{},$p=class extends P{},TC=class extends $p{},Yl=class extends P{},EC=class extends Yl{},AC=class extends Yl{},Jl=class extends P{},CC=class extends Jl{},SC=class extends Jl{async _call(e){return new ie(await super._call(e))}},Or=class extends P{},PC=class extends Or{},FC=class extends Or{async _call(e){return new Jr(await super._call(e))}},LC=class extends Or{async _call(e){return new ie(await super._call(e))}},IC=class extends Or{async _call(e){return new He(await super._call(e))}},OC=class extends P{},NC=class extends Or{},DC=class extends Or{async _call(e){return new Jr(await super._call(e))}},zC=class extends Or{async _call(e){return new ie(await super._call(e))}},Kl=class extends P{},BC=class extends Kl{},RC=class extends Kl{},Vp=class extends mr{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","pixel_values","pixel_attention_mask","position_ids","past_key_values"])}},Zl=class extends P{},GC=class extends Zl{},$C=class extends Zl{async _call(e){return new ie(await super._call(e))}},ec=class extends P{},VC=class extends ec{},UC=class extends ec{},La=class extends 
P{},jC=class extends La{async forward(e){const t=!e.input_ids,r=!e.pixel_values;if(t&&r)throw new Error("Either `input_ids` or `pixel_values` should be provided.");if(t&&(e.input_ids=yt([e.pixel_values.dims[0],1])),r){const{image_size:l}=this.config.vision_config;e.pixel_values=ct([0,3,l,l],0)}const{text_embeddings:s,image_embeddings:n,l2norm_text_embeddings:a,l2norm_image_embeddings:o}=await super.forward(e),i={};return t||(i.text_embeddings=s,i.l2norm_text_embeddings=a),r||(i.image_embeddings=n,i.l2norm_image_embeddings=o),i}},Up=class extends La{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},qC=class extends La{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"vision_model"})}},tc=class extends P{},WC=class extends tc{},HC=class extends tc{},QC=class extends mr{},rc=class extends P{},XC=class extends rc{},YC=class extends rc{},JC=class extends mr{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","pixel_values","pixel_attention_mask","spatial_shapes","position_ids","past_key_values"])}},sc=class extends P{},KC=class extends sc{},ZC=class extends sc{},jp=class extends P{},eS=class extends jp{},nc=class extends P{},tS=class extends nc{},rS=class extends nc{},ac=class extends P{},sS=class extends ac{},nS=class extends ac{},oc=class extends P{},aS=class extends oc{},oS=class extends oc{},ic=class extends P{},iS=class extends ic{},lS=class extends ic{},fn=class extends P{},cS=class extends fn{},uS=class extends fn{},dS=class extends fn{async _call(e){return new ie(await super._call(e))}},hS=class extends fn{},qp=class extends P{},fS=class extends qp{},Wp=class extends P{},_S=class extends Wp{},Hp=class extends Ye{constructor({char_logits:e,bpe_logits:t,wp_logits:r}){super(),this.char_logits=e,this.bpe_logits=t,this.wp_logits=r}get 
logits(){return[this.char_logits,this.bpe_logits,this.wp_logits]}},Qp=class extends P{},pS=class extends Qp{async _call(e){return new Hp(await super._call(e))}},Xp=class extends Ye{constructor({audio_codes:e}){super(),this.audio_codes=e}},Yp=class extends Ye{constructor({audio_values:e}){super(),this.audio_values=e}},Ia=class extends P{constructor(){super(...arguments);k(this,"main_input_name","input_values");k(this,"forward_params",["input_values"])}},mS=class extends Ia{async encode(e){return new Xp(await xe(this.sessions.encoder_model,e))}async decode(e){return new Yp(await xe(this.sessions.decoder_model,e))}},Jp=class extends Ia{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"encoder_model"})}},Kp=class extends Ia{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"decoder_model"})}},lc=class extends P{},gS=class extends lc{},wS=class extends lc{},cc=class extends P{},vS=class extends cc{},yS=class extends cc{},_n=class extends P{},bS=class extends _n{},MS=class extends _n{async _call(e){return new Je(await super._call(e))}},xS=class extends _n{async _call(e){return new ie(await super._call(e))}},kS=class extends _n{async _call(e){return new ht(await super._call(e))}},uc=class extends P{},TS=class extends uc{},ES=class extends uc{},Oa=class extends P{},AS=class extends Oa{},CS=class extends Oa{async _call(e){return new ie(await super._call(e))}},SS=class extends Oa{},Na=class extends P{},PS=class extends Na{},FS=class extends Na{async _call(e){return new ie(await super._call(e))}},LS=class extends Na{},Da=class extends P{},IS=class extends Da{},OS=class extends Da{async _call(e){return new ie(await super._call(e))}},NS=class extends Da{},za=class extends P{},DS=class extends za{},zS=class extends za{async _call(e){return new ie(await super._call(e))}},BS=class extends za{},dc=class extends P{},RS=class extends dc{},GS=class extends dc{async 
_call(e){return new ie(await super._call(e))}},hc=class extends P{},$S=class extends hc{},VS=class extends hc{async _call(e){return new ie(await super._call(e))}},pn=class extends P{},US=class extends pn{},jS=class extends pn{async _call(e){return new Je(await super._call(e))}},qS=class extends pn{async _call(e){return new ie(await super._call(e))}},WS=class extends pn{async _call(e){return new He(await super._call(e))}},fc=class extends P{},HS=class extends fc{},QS=class extends fc{},_c=class extends P{constructor(){super(...arguments);k(this,"requires_attention_mask",!1);k(this,"main_input_name","input_values");k(this,"forward_params",["input_values","decoder_input_ids","past_key_values"])}},XS=class extends _c{},YS=class extends _c{},As=class extends P{},JS=class extends As{},KS=class extends As{async _call(e){return new Je(await super._call(e))}},ZS=class extends As{async _call(e){return new ie(await super._call(e))}},eP=class extends As{async _call(e){return new He(await super._call(e))}},tP=class extends As{async _call(e){return new ht(await super._call(e))}},pc=class extends P{},rP=class extends pc{},sP=class extends pc{},mc=class extends P{},nP=class extends mc{},aP=class extends mc{},Zp=class extends P{},oP=class extends Zp{constructor(...t){super(...t);k(this,"forward_params",["input_ids","pixel_values","images_seq_mask","images_emb_mask","attention_mask","position_ids","past_key_values"]);this._generation_mode="text"}async forward(t){const r=this._generation_mode??"text";let s;if(r==="text"||!t.past_key_values){const l=this.sessions.prepare_inputs_embeds,c=rt(t,l.inputNames);s=await xe(l,c)}else{const l=this.sessions.gen_img_embeds,c=rt({image_ids:t.input_ids},l.inputNames);s=await xe(l,c)}const n={...t,...s},a=await pr(this,n),o=this.sessions[r==="text"?"lm_head":"gen_head"];if(!o)throw new Error(`Unable to find "${o}" generation head`);const i=await xe(o,rt(a,o.inputNames));return{...s,...a,...i}}prepare_inputs_for_generation(t,r,s){const 
n=!!r.past_key_values;return s.guidance_scale!==null&&s.guidance_scale>1&&(n?r.input_ids=ze([r.input_ids,r.input_ids],0):(r.input_ids=ze([r.input_ids,Li(r.input_ids,BigInt(s.pad_token_id))],0),r.attention_mask=ze([r.attention_mask,Li(r.attention_mask,0n)],0))),(n||!r.pixel_values)&&(r.pixel_values=ct([0,0,3,384,384],1)),n&&(r.images_seq_mask=new U("bool",new Array(1).fill(!0).fill(!1,0,1),[1,1]),r.images_emb_mask=new U("bool",new Array(0).fill(!1),[1,1,0])),r}async generate(t){return this._generation_mode="text",super.generate(t)}async generate_images(t){this._generation_mode="image";const r=(t.inputs??t[this.main_input_name]).dims[1],n=(await super.generate(t)).slice(null,[r,null]),a=this.sessions.image_decode,{decoded_image:o}=await xe(a,{generated_tokens:n}),i=o.add_(1).mul_(255/2).clamp_(0,255).to("uint8"),l=[];for(const c of i){const d=qt.fromTensor(c);l.push(d)}return l}},gc=class extends P{},iP=class extends gc{},lP=class extends gc{},em=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"])}_apply_and_filter_by_delay_pattern_mask(t){const[r,s]=t.dims,n=this.config.decoder.num_codebooks,a=s-n;let o=0;for(let c=0;c<t.size;++c){if(t.data[c]==this.config.decoder.pad_token_id)continue;const d=c%s,h=Math.floor(c/s)%n,_=d-h;_>0&&_<=a&&(t.data[o++]=t.data[c])}const i=Math.floor(r/n),l=o/(i*n);return new U(t.type,t.data.slice(0,o),[i,n,l])}prepare_inputs_for_generation(t,r,s){const n=BigInt(this.config.decoder.pad_token_id);let a=structuredClone(t);for(let o=0;o<a.length;++o)for(let i=0;i<a[o].length;++i)o%this.config.decoder.num_codebooks>=i&&(a[o][i]=n);return s.guidance_scale!==null&&s.guidance_scale>1&&(a=a.concat(a)),Ta(this,a,r)}async generate(t){const r=await super.generate(t),s=this._apply_and_filter_by_delay_pattern_mask(r).unsqueeze_(0),{audio_values:n}=await xe(this.sessions.encodec_decode,{audio_codes:s});return n}},wc=class 
extends P{},cP=class extends wc{},uP=class extends wc{},vc=class extends P{},dP=class extends vc{},hP=class extends vc{},Cs=class extends P{},fP=class extends Cs{},_P=class extends Cs{async _call(e){return new Je(await super._call(e))}},pP=class extends Cs{async _call(e){return new ie(await super._call(e))}},mP=class extends Cs{async _call(e){return new He(await super._call(e))}},gP=class extends Cs{async _call(e){return new ht(await super._call(e))}},tm=class extends P{},wP=class extends tm{},yc=class extends P{},vP=class extends yc{},yP=class extends yc{},bc=class extends P{},bP=class extends bc{},MP=class extends bc{},Mc=class extends P{},xP=class extends Mc{},kP=class extends Mc{},xc=class extends P{},TP=class extends xc{},EP=class extends xc{},kc=class extends P{},AP=class extends kc{},CP=class extends kc{},Tc=class extends P{},SP=class extends Tc{},PP=class extends Tc{},Ec=class extends P{},FP=class extends Ec{},LP=class extends Ec{},Ac=class extends P{},IP=class extends Ac{},OP=class extends Ac{},NP=class extends mr{},rm=class extends P{},DP=class extends rm{async _call(e){return new Jr(await super._call(e))}},Cc=class extends P{},zP=class extends Cc{},BP=class extends Cc{},Sc=class extends P{},RP=class extends Sc{},GP=class extends Sc{},Pc=class extends P{},$P=class extends Pc{},VP=class extends Pc{},Fc=class extends P{},UP=class extends Fc{},jP=class extends Fc{},sm=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","inputs_embeds","attention_mask","position_ids","pixel_values","image_sizes","past_key_values"])}},nm=class extends sm{async forward({input_ids:e=null,attention_mask:t=null,pixel_values:r=null,image_sizes:s=null,position_ids:n=null,inputs_embeds:a=null,past_key_values:o=null,generation_config:i=null,logits_processor:l=null,...c}){if(!a){let h;if(r&&e.dims[1]!==1){if(!s)throw new Error("`image_sizes` must be provided when `pixel_values` is provided.");({image_features:h}=await 
xe(this.sessions.vision_encoder,{pixel_values:r,image_sizes:s}))}else{const _=this.config.normalized_config.hidden_size;h=new U("float32",[],[0,_])}({inputs_embeds:a}=await xe(this.sessions.prepare_inputs_embeds,{input_ids:e,image_features:h}))}return await pr(this,{inputs_embeds:a,past_key_values:o,attention_mask:t,position_ids:n,generation_config:i,logits_processor:l},!1)}},Lc=class extends P{},qP=class extends Lc{},WP=class extends Lc{async _call(e){return new ie(await super._call(e))}},Ic=class extends P{},HP=class extends Ic{},QP=class extends Ic{async _call(e){return new He(await super._call(e))}},Oc=class extends P{},XP=class extends Oc{},YP=class extends Oc{},Nc=class extends P{},JP=class extends Nc{},KP=class extends Nc{},Dc=class extends P{},ZP=class extends Dc{},eF=class extends Dc{},zc=class extends P{},tF=class extends zc{},rF=class extends zc{},Bc=class extends P{},sF=class extends Bc{},nF=class extends Bc{},Rc=class extends Rl{},am=class extends Bp{},aF=class extends Rc{},oF=class extends am{},Gc=class extends Rc{},om=class extends Gc{},iF=class extends Gc{},lF=class extends om{},$c=class extends P{},cF=class extends $c{},uF=class extends $c{async _call(e){return new ie(await super._call(e))}},Vc=class extends P{},dF=class extends Vc{},hF=class extends Vc{async _call(e){return new im(await super._call(e))}},im=class extends un{},Ss=class extends P{},fF=class extends Ss{},_F=class extends Ss{async _call(e){return new Je(await super._call(e))}},pF=class extends Ss{async _call(e){return new ie(await super._call(e))}},mF=class extends Ss{async _call(e){return new He(await super._call(e))}},gF=class extends Ss{async _call(e){return new ht(await super._call(e))}},Ps=class extends P{},wF=class extends Ps{},vF=class extends Ps{async _call(e){return new Je(await super._call(e))}},yF=class extends Ps{async _call(e){return new ie(await super._call(e))}},bF=class extends Ps{async _call(e){return new He(await super._call(e))}},MF=class extends Ps{async 
_call(e){return new ht(await super._call(e))}},Uc=class extends P{},xF=class extends Uc{},kF=class extends Uc{async _call(e){return new lm(await super._call(e))}},lm=class extends un{},cm=class extends Ye{constructor({iou_scores:e,pred_masks:t}){super(),this.iou_scores=e,this.pred_masks=t}},um=class extends P{},TF=class extends um{async get_image_embeddings({pixel_values:e}){return await Ir(this,{pixel_values:e})}async forward(e){!e.image_embeddings||!e.image_positional_embeddings?e={...e,...await this.get_image_embeddings(e)}:e={...e},e.input_labels??(e.input_labels=yt(e.input_points.dims.slice(0,-1)));const t={image_embeddings:e.image_embeddings,image_positional_embeddings:e.image_positional_embeddings};return e.input_points&&(t.input_points=e.input_points),e.input_labels&&(t.input_labels=e.input_labels),e.input_boxes&&(t.input_boxes=e.input_boxes),await xe(this.sessions.prompt_encoder_mask_decoder,t)}async _call(e){return new cm(await super._call(e))}},dm=class extends Ye{constructor({iou_scores:e,pred_masks:t,object_score_logits:r}){super(),this.iou_scores=e,this.pred_masks=t,this.object_score_logits=r}},hm=class extends P{},jc=class extends hm{async get_image_embeddings({pixel_values:e}){return await Ir(this,{pixel_values:e})}async forward(e){const{num_feature_levels:t}=this.config.vision_config;if(Array.from({length:t},(a,o)=>`image_embeddings.${o}`).some(a=>!e[a])?e={...e,...await this.get_image_embeddings(e)}:e={...e},e.input_points){if(e.input_boxes&&e.input_boxes.dims[1]!==1)throw new Error("When both `input_points` and `input_boxes` are provided, the number of boxes per image must be 1.");const a=e.input_points.dims;e.input_labels??(e.input_labels=yt(a.slice(0,-1))),e.input_boxes??(e.input_boxes=ct([a[0],0,4],0))}else if(e.input_boxes){const a=e.input_boxes.dims;e.input_labels=ct([a[0],a[1],0],-1n),e.input_points=ct([a[0],1,0,2],0)}else throw new Error("At least one of `input_points` or `input_boxes` must be provided.");const 
s=this.sessions.prompt_encoder_mask_decoder,n=rt(e,s.inputNames);return await xe(s,n)}async _call(e){return new dm(await super._call(e))}},EF=class extends jc{},AF=class extends jc{},Ba=class extends P{},CF=class extends Ba{},SF=class extends Ba{},PF=class extends Ba{},Ra=class extends P{},FF=class extends Ra{},LF=class extends Ra{},IF=class extends Ra{},qc=class extends P{},OF=class extends qc{},fm=class extends qc{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},NF=class extends Kr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"vision_model"})}},Wc=class extends P{},DF=class extends Wc{},zF=class extends Wc{},BF=class extends Vp{},Ga=class extends P{constructor(){super(...arguments);k(this,"main_input_name","input_values");k(this,"forward_params",["input_values"])}},RF=class extends Ga{async encode(e){return await xe(this.sessions.encoder_model,e)}async decode(e){return await xe(this.sessions.decoder_model,e)}},_m=class extends Ga{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"encoder_model"})}},pm=class extends Ga{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"decoder_model"})}},Hc=class extends P{},GF=class extends Hc{},$F=class extends Hc{},$a=class extends P{},VF=class extends $a{},UF=class extends $a{},jF=class extends $a{async generate_speech(e,t,{threshold:r=.5,minlenratio:s=0,maxlenratio:n=20,vocoder:a=null}={}){const o={input_ids:e},{encoder_outputs:i,encoder_attention_mask:l}=await Ir(this,o),c=i.dims[1]/this.config.reduction_factor,d=Math.floor(c*n),h=Math.floor(c*s),_=this.config.num_mel_bins;let p=[],w=null,v=null,y=0;for(;;){++y;const A=ap(!!v);let C;v?C=v.output_sequence_out:C=new U("float32",new Float32Array(_),[1,1,_]);let 
S={use_cache_branch:A,output_sequence:C,encoder_attention_mask:l,speaker_embeddings:t,encoder_hidden_states:i};tl(this,S,w),v=await xe(this.sessions.decoder_model_merged,S),w=el(v,w);const{prob:N,spectrum:x}=v;if(p.push(x),y>=h&&(Array.from(N.data).filter(R=>R>=r).length>0||y>=d))break}const M=ze(p),{waveform:T}=await xe(a.sessions.model,{spectrogram:M});return{spectrogram:M,waveform:T}}},qF=class extends P{constructor(){super(...arguments);k(this,"main_input_name","spectrogram")}},mn=class extends P{},WF=class extends mn{},HF=class extends mn{async _call(e){return new Je(await super._call(e))}},QF=class extends mn{async _call(e){return new ie(await super._call(e))}},XF=class extends mn{async _call(e){return new ht(await super._call(e))}},Qc=class extends P{},YF=class extends Qc{},JF=class extends Qc{},Xc=class extends P{},KF=class extends Xc{},ZF=class extends Xc{},mm=class extends P{},eL=class extends mm{},gm=class extends P{},wm=class extends gm{async generate_speech({input_ids:e,attention_mask:t,style:r,num_inference_steps:s=5,speed:n=1.05}){const{sampling_rate:a,chunk_compress_factor:o,base_chunk_size:i,latent_dim:l}=this.config,{last_hidden_state:c,durations:d}=await xe(this.sessions.text_encoder,{input_ids:e,attention_mask:t,style:r}),h=d.div(n).mul_(a),_=i*o,p=h.data,w=Int32Array.from(p,z=>Math.ceil(z/_)),v=Math.max(...w),y=e.dims[0],M=new BigInt64Array(y*v);for(let z=0;z<y;++z)M.fill(1n,z*v,z*v+w[z]);const T=new U("int64",M,[y,v]),A=l*o,C=A*v;let S=Sx([y,A,v]);const N=S.data;for(let z=0;z<y;++z)if(w[z]!==v)for(let $=0;$<A;++$)N.fill(0,z*C+$*v+w[z],z*C+($+1)*v);const x=ct([y],s);for(let z=0;z<s;++z){const $=ct([y],z);({denoised_latents:S}=await xe(this.sessions.latent_denoiser,{style:r,noisy_latents:S,latent_mask:T,encoder_outputs:c,attention_mask:t,timestep:$,num_inference_steps:x}))}const{waveform:R}=await xe(this.sessions.voice_decoder,{latents:S});return{waveform:R,durations:h}}},Va=class extends P{},tL=class extends Va{},rL=class extends Va{async 
_call(e){return new ie(await super._call(e))}},sL=class extends Va{},Yc=class extends P{},nL=class extends Yc{},aL=class extends Yc{},Jc=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"])}},oL=class extends Jc{},iL=class extends Jc{},Kc=class extends P{},lL=class extends Kc{},cL=class extends Kc{async _call(e){return new vm(await super._call(e))}},vm=class extends xl{},ym=class extends P{},uL=class extends ym{},Ua=class extends P{},dL=class extends Ua{},hL=class extends Ua{async _call(e){return new Jr(await super._call(e))}},fL=class extends Ua{async _call(e){return new ie(await super._call(e))}},gn=class extends P{},_L=class extends gn{},pL=class extends gn{async _call(e){return new Jr(await super._call(e))}},mL=class extends gn{async _call(e){return new ie(await super._call(e))}},gL=class extends gn{async _call(e){return new He(await super._call(e))}},Zc=class extends P{},wL=class extends Zc{},vL=class extends Zc{},yL=class extends P{constructor(){super(...arguments);k(this,"main_input_name","pixel_values");k(this,"forward_params",["pixel_values","decoder_input_ids","encoder_hidden_states","past_key_values"])}},eu=class extends P{},bL=class extends eu{},ML=class extends eu{async _call(e){return new ie(await super._call(e))}},bm=class extends P{},xL=class extends bm{},tu=class extends P{},kL=class extends tu{},TL=class extends tu{async _call(e){return new ie(await super._call(e))}},Mm=class extends P{},EL=class extends Mm{async _call(e){return new ZT(await super._call(e))}},xm=class extends P{},AL=class extends xm{},km=class extends Ye{constructor({waveform:e,spectrogram:t}){super(),this.waveform=e,this.spectrogram=t}},Tm=class extends P{},CL=class extends Tm{async _call(e){return new km(await super._call(e))}},SL=class extends Xl{},Em=2,PL=1,ru=new WeakMap;function FL(e,t){var 
w,v,y;const{text_config:r,audio_config:s}=e.config,n=e.sessions.audio_encoder,{num_mel_bins:a,hidden_size:o}=s,i=a+o,l=new Ji,c=((w=n==null?void 0:n.config)==null?void 0:w.kv_cache_dtype)??"float32",d=c==="float16"?fs.float16:fs.float32,h=ba(s,{batch_size:1});for(const M in h){const T=h[M].reduce((A,C)=>A*C,1);l[M]=new U(c,new d(T),h[M])}const _=new U(c,new d(i*Em),[1,i,Em]),p=((v=t[Symbol.asyncIterator])==null?void 0:v.call(t))??((y=t[Symbol.iterator])==null?void 0:y.call(t));if(!p)throw new Error("input_features must be iterable or async iterable");return{encoder_session:n,enc_kv_cache:l,enc_padding_cache:_,enc_past_seq_len:0,audio_embed_queue:[],audio_embed_total_tokens:0,audio_queue_offset:0,audio_consumed:0,stream_exhausted:!1,chunks_iter:p,text_hidden_size:r.hidden_size}}async function LL(e,t){const r=t.dims[2],s=Math.floor((PL+r-3)/2)+1,n=new U("int64",BigInt64Array.from({length:s},(d,h)=>BigInt(e.enc_past_seq_len+h)),[1,s]),a=e.enc_past_seq_len+s,o=yt([1,a]),{audio_embeds:i,present_padding_cache:l,...c}=await xe(e.encoder_session,{input_features:t,attention_mask:o,position_ids:n,past_padding_cache:e.enc_padding_cache,...e.enc_kv_cache});e.enc_padding_cache.location==="gpu-buffer"&&e.enc_padding_cache.dispose(),e.enc_padding_cache=l;for(const d in c)if(d.startsWith("present.")){const h=d.replace("present","past_key_values"),_=e.enc_kv_cache[h];(_==null?void 0:_.location)==="gpu-buffer"&&_.dispose(),e.enc_kv_cache[h]=c[d]}return e.enc_past_seq_len=a,i}async function IL(e,t){for(;e.audio_embed_total_tokens<t&&!e.stream_exhausted;){const r=await e.chunks_iter.next();if(r.done){e.stream_exhausted=!0;break}const s=await LL(e,r.value);e.audio_embed_queue.push({data:s.data,tokens:s.dims[1]}),e.audio_embed_total_tokens+=s.dims[1]}}function OL(e,t,r){if(e.audio_embed_queue.length===0)return;const s=t.data;let n=0,a=r;for(;a>0&&e.audio_embed_queue.length>0;){const 
o=e.audio_embed_queue[0],i=o.tokens-e.audio_queue_offset,l=Math.min(a,i),c=e.audio_queue_offset*e.text_hidden_size;for(let d=0;d<l*e.text_hidden_size;++d)s[n*e.text_hidden_size+d]+=o.data[c+d];n+=l,a-=l,e.audio_queue_offset+=l,e.audio_queue_offset>=o.tokens&&(e.audio_embed_queue.shift(),e.audio_queue_offset=0)}e.audio_consumed+=r-a}var NL=class extends Ma{constructor(e){super(),this._s=e}_call(e){const t=this._s.stream_exhausted&&this._s.audio_embed_queue.length===0;return e.map(()=>t)}},Am=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","position_ids","past_key_values"])}},Cm=class extends Am{async forward({input_ids:e,past_key_values:t,...r}){const s=e.dims[1],n=ru.get(this);n&&await IL(n,n.audio_consumed+s);const{inputs_embeds:a}=await xe(this.sessions.embed_tokens,{input_ids:e});n&&OL(n,a,s);const o={inputs_embeds:a,...r};tl(this,o,t);const i=this.sessions.decoder_model_merged,l=rt(o,i.inputNames);return await xe(i,l)}async generate({input_features:e,stopping_criteria:t,...r}){if(!e)throw new Error("input_features (generator/iterable) must be provided");const s=FL(this,e);ru.set(this,s);const n=new tp;n.push(new NL(s)),t&&n.extend(t);try{return await super.generate({...r,stopping_criteria:n})}finally{s.enc_kv_cache.dispose(),ru.delete(this)}}},ja=class extends P{},DL=class extends ja{},zL=class extends ja{async _call(e){return new Jr(await super._call(e))}},BL=class extends ja{async _call(e){return new ie(await super._call(e))}},Sm=class extends Ye{constructor({logits:e,embeddings:t}){super(),this.logits=e,this.embeddings=t}},Fs=class extends P{},RL=class extends Fs{},GL=class extends Fs{async _call(e){return new Jr(await super._call(e))}},$L=class extends Fs{async _call(e){return new ie(await super._call(e))}},VL=class extends Fs{async _call(e){return new Sm(await super._call(e))}},UL=class extends Fs{async _call(e){return new He(await super._call(e))}},Pm=class extends P{},jL=class extends Pm{},qL=class 
extends ep{constructor(){super(...arguments);k(this,"return_timestamps",null);k(this,"return_token_timestamps",null);k(this,"num_frames",null);k(this,"alignment_heads",null);k(this,"task",null);k(this,"language",null);k(this,"no_timestamps_token_id",null);k(this,"prompt_ids",null);k(this,"is_multilingual",null);k(this,"lang_to_id",null);k(this,"task_to_id",null);k(this,"max_initial_timestamp_index",1)}},su=class extends P{constructor(){super(...arguments);k(this,"requires_attention_mask",!1);k(this,"main_input_name","input_features");k(this,"forward_params",["input_features","attention_mask","decoder_input_ids","decoder_attention_mask","past_key_values"])}},WL=class extends su{},Fm=class extends su{_prepare_generation_config(e,t){return super._prepare_generation_config(e,t,qL)}_retrieve_init_tokens(e){const t=[e.decoder_start_token_id];let r=e.language;const s=e.task;if(e.is_multilingual){r||(ue.warn("No language specified - defaulting to English (en)."),r="en");const a=`<|${k1(r)}|>`;t.push(e.lang_to_id[a]),t.push(e.task_to_id[s??"transcribe"])}else if(r||s)throw new Error("Cannot specify `task` or `language` for an English-only model. 
If the model is intended to be multilingual, pass `is_multilingual=true` to generate, or update the generation config.");return!e.return_timestamps&&e.no_timestamps_token_id&&t.at(-1)!==e.no_timestamps_token_id?t.push(e.no_timestamps_token_id):e.return_timestamps&&t.at(-1)===e.no_timestamps_token_id&&(ue.warn("<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`."),t.pop()),t.filter(n=>n!=null)}async generate({inputs:e=null,generation_config:t=null,logits_processor:r=null,stopping_criteria:s=null,...n}){t=this._prepare_generation_config(t,n);const a=n.decoder_input_ids instanceof U?Ii(n.decoder_input_ids):n.decoder_input_ids??this._retrieve_init_tokens(t);if(t.return_timestamps&&(r??(r=new Yi),r.push(new nE(t,a))),t.begin_suppress_tokens&&(r??(r=new Yi),r.push(new Z_(t.begin_suppress_tokens,a.length))),t.return_token_timestamps){if(!t.alignment_heads)throw new Error("Model generation config has no `alignment_heads`, token-level timestamps not available. 
See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config.");t.task==="translate"&&ue.warn("Token-level timestamps may not be reliable for task 'translate'."),t.output_attentions=!0,t.return_dict_in_generate=!0}if(t.return_timestamps&&!n.max_new_tokens)return this._generate_with_seek({inputs:e,generation_config:t,logits_processor:r,init_tokens:a,kwargs:n});const o=await super.generate({inputs:e,generation_config:t,logits_processor:r,decoder_input_ids:a,...n});return t.return_token_timestamps&&(o.token_timestamps=this._extract_token_timestamps(o,t.alignment_heads,t.num_frames,.02,a.length)),o}async _generate_with_seek({inputs:e,generation_config:t,logits_processor:r,init_tokens:s,kwargs:n}){const a=t.no_timestamps_token_id+1,o=Array.isArray(t.eos_token_id)?t.eos_token_id[0]:t.eos_token_id,i=t.return_token_timestamps,l=e,c=l.dims[2],d=2,h=this.config.max_source_positions,_=d*h;let p=0;const w=[],v=[];for(;p<c;){const M=Math.min(p+_,c),T=l.slice(null,null,[p,M]);let A;const C=T.dims[2];if(C<_){const W=l.dims[1],ee=new Float32Array(W*_),G=T.data;for(let L=0;L<W;++L)ee.set(G.subarray(L*C,(L+1)*C),L*_);A=new U("float32",ee,[1,W,_])}else A=T;if(r)for(const W of r)"begin_index"in W&&(W.begin_index=s.length);const S=await super.generate({inputs:A,generation_config:t,logits_processor:r,decoder_input_ids:s,...n}),x=(i?S.sequences:S)[0].tolist().map(Number).slice(s.length);let R;if(i){S.token_timestamps=this._extract_token_timestamps(S,t.alignment_heads,Math.floor((M-p)/d),.02,s.length);const W=p/d*.02;R=S.token_timestamps[0].tolist().slice(s.length).map(ee=>ee+W)}if(x.length>0&&x.at(-1)===o&&x.pop(),x.length===0)break;const z=x.map(W=>W>=a),$=x.length>=2&&z[x.length-1]&&!z[x.length-2],Q=[];for(let W=0;W<x.length-1;++W)z[W]&&z[W+1]&&Q.push(W+1);let H,D=x.length;if(Q.length>0)if($)H=M-p;else{const W=Q.at(-1);H=(x[W-1]-a)*d,D=W}else H=M-p;const I=Math.floor(p/d),te=a+1500;for(let 
W=0;W<D;++W)x[W]>=a&&(x[W]=Math.min(x[W]+I,te));w.push(...x.slice(0,D)),R&&v.push(...R.slice(0,D)),p+=H}w.push(o);const y=[...s,...w];if(i){const M=new U("int64",y.map(BigInt),[1,y.length]),T=[...new Array(s.length).fill(0),...v,0],A=new U("float32",new Float32Array(T),[1,T.length]);return{sequences:M,token_timestamps:A}}return new U("int64",y.map(BigInt),[1,y.length])}_extract_token_timestamps(e,t,r=null,s=.02,n=0){if(!e.cross_attentions)throw new Error("Model outputs must contain cross attentions to extract timestamps. This is most likely because the model was not exported with `output_attentions=True`.");r==null&&ue.warn("`num_frames` has not been set, meaning the entire audio will be analyzed. This may lead to inaccurate token-level timestamps for short audios (< 30 seconds).");let a=this.config.median_filter_width;a===void 0&&(ue.warn("Model config has no `median_filter_width`, using default value of 7."),a=7);const o=e.cross_attentions,i=Array.from({length:this.config.decoder_layers},(y,M)=>ze(o.map(T=>T[M]),2)),l=dr(t.map(([y,M])=>{if(y>=i.length)throw new Error(`Layer index ${y} is out of bounds for cross attentions (length ${i.length}).`);return r?i[y].slice(null,M,null,[0,r]):i[y].slice(null,M)})).transpose(1,0,2,3),[c,d]=Cx(l,-2,0,!0),h=l.clone();for(let y=0;y<h.dims[0];++y){const M=h[y];for(let T=0;T<M.dims[0];++T){const A=M[T],C=c[y][T][0].data,S=d[y][T][0].data;for(let N=0;N<A.dims[0];++N){let x=A[N].data;for(let R=0;R<x.length;++R)x[R]=(x[R]-S[R])/C[R];x.set(ux(x,a))}}}const _=n>0?h.slice(null,null,[n,h.dims[2]],null):h,p=[Si(_,1)],w=e.sequences.dims,v=new U("float32",new Float32Array(w[0]*w[1]),w);for(let y=0;y<w[0];++y){const M=p[y].neg().squeeze_(0),[T,A]=hx(M.tolist()),C=Array.from({length:T.length-1},(R,z)=>T[z+1]-T[z]),S=Jt([1],C).map(R=>!!R),N=[];for(let R=0;R<S.length;++R)S[R]&&N.push(A[R]*s);const x=new Array(n).fill(0);x.push(...N),N.length>0&&x.push(N.at(-1)),v[y].data.set(x)}return v}},HL=class extends Fm{},Ls=class extends P{},QL=class 
extends Ls{},XL=class extends Ls{async _call(e){return new Je(await super._call(e))}},YL=class extends Ls{async _call(e){return new ie(await super._call(e))}},JL=class extends Ls{async _call(e){return new He(await super._call(e))}},KL=class extends Ls{async _call(e){return new ht(await super._call(e))}},Is=class extends P{},ZL=class extends Is{},eI=class extends Is{async _call(e){return new Je(await super._call(e))}},tI=class extends Is{async _call(e){return new ie(await super._call(e))}},rI=class extends Is{async _call(e){return new He(await super._call(e))}},sI=class extends Is{async _call(e){return new ht(await super._call(e))}},nu=class extends P{},nI=class extends nu{},aI=class extends nu{async _call(e){return new Lm(await super._call(e))}},Lm=class extends Ye{constructor({logits:e,pred_boxes:t}){super(),this.logits=e,this.pred_boxes=t}},au=class extends P{},oI=class extends au{},iI=class extends au{},lI=new Map([["bert","BertModel"],["eurobert","EuroBertModel"],["neobert","NeoBertModel"],["modernbert","ModernBertModel"],["nomic_bert","NomicBertModel"],["roformer","RoFormerModel"],["electra","ElectraModel"],["esm","EsmModel"],["convbert","ConvBertModel"],["camembert","CamembertModel"],["deberta","DebertaModel"],["deberta-v2","DebertaV2Model"],["mpnet","MPNetModel"],["albert","AlbertModel"],["distilbert","DistilBertModel"],["roberta","RobertaModel"],["xlm","XLMModel"],["xlm-roberta","XLMRobertaModel"],["clap","ClapModel"],["clip","CLIPModel"],["clipseg","CLIPSegModel"],["chinese_clip","ChineseCLIPModel"],["siglip","SiglipModel"],["jina_clip","JinaCLIPModel"],["mobilebert","MobileBertModel"],["squeezebert","SqueezeBertModel"],["wav2vec2","Wav2Vec2Model"],["wav2vec2-bert","Wav2Vec2BertModel"],["unispeech","UniSpeechModel"],["unispeech-sat","UniSpeechSatModel"],["hubert","HubertModel"],["wavlm","WavLMModel"],["audio-spectrogram-transformer","ASTModel"],["vits","VitsModel"],["pyannote","PyAnnoteModel"],["wespeaker-resnet","WeSpeakerResNetModel"],["detr","DetrModel"]
,["rt_detr","RTDetrModel"],["rt_detr_v2","RTDetrV2Model"],["rf_detr","RFDetrModel"],["d_fine","DFineModel"],["table-transformer","TableTransformerModel"],["vit","ViTModel"],["ijepa","IJepaModel"],["pvt","PvtModel"],["vit_msn","ViTMSNModel"],["vit_mae","ViTMAEModel"],["groupvit","GroupViTModel"],["fastvit","FastViTModel"],["mobilevit","MobileViTModel"],["mobilevitv2","MobileViTV2Model"],["owlvit","OwlViTModel"],["owlv2","Owlv2Model"],["beit","BeitModel"],["deit","DeiTModel"],["hiera","HieraModel"],["convnext","ConvNextModel"],["convnextv2","ConvNextV2Model"],["dinov2","Dinov2Model"],["dinov2_with_registers","Dinov2WithRegistersModel"],["dinov3_vit","DINOv3ViTModel"],["dinov3_convnext","DINOv3ConvNextModel"],["resnet","ResNetModel"],["swin","SwinModel"],["swin2sr","Swin2SRModel"],["donut-swin","DonutSwinModel"],["yolos","YolosModel"],["dpt","DPTModel"],["glpn","GLPNModel"],["hifigan","SpeechT5HifiGan"],["efficientnet","EfficientNetModel"],["decision_transformer","DecisionTransformerModel"],["patchtst","PatchTSTModel"],["patchtsmixer","PatchTSMixerModel"],["mobilenet_v1","MobileNetV1Model"],["mobilenet_v2","MobileNetV2Model"],["mobilenet_v3","MobileNetV3Model"],["mobilenet_v4","MobileNetV4Model"],["maskformer","MaskFormerModel"],["mgp-str","MgpstrForSceneTextRecognition"],["style_text_to_speech_2","StyleTextToSpeech2Model"]]),cI=new Map([["t5","T5Model"],["longt5","LongT5Model"],["mt5","MT5Model"],["bart","BartModel"],["mbart","MBartModel"],["marian","MarianModel"],["whisper","WhisperModel"],["cohere_asr","CohereAsrModel"],["m2m_100","M2M100Model"],["blenderbot","BlenderbotModel"],["blenderbot-small","BlenderbotSmallModel"]]),uI=new Map([["mimi","MimiModel"],["dac","DacModel"],["snac","SnacModel"]]),dI=new 
Map([["bloom","BloomModel"],["jais","JAISModel"],["gpt2","GPT2Model"],["gpt_oss","GptOssModel"],["gptj","GPTJModel"],["gpt_bigcode","GPTBigCodeModel"],["gpt_neo","GPTNeoModel"],["gpt_neox","GPTNeoXModel"],["codegen","CodeGenModel"],["llama","LlamaModel"],["apertus","ApertusModel"],["nanochat","NanoChatModel"],["arcee","ArceeModel"],["afmoe","AfmoeModel"],["lfm2","Lfm2Model"],["lfm2_moe","Lfm2MoeModel"],["smollm3","SmolLM3Model"],["exaone","ExaoneModel"],["olmo","OlmoModel"],["olmo2","Olmo2Model"],["olmo3","Olmo3Model"],["olmo_hybrid","OlmoHybridModel"],["mobilellm","MobileLLMModel"],["granite","GraniteModel"],["granitemoehybrid","GraniteMoeHybridModel"],["cohere","CohereModel"],["cohere2","Cohere2Model"],["gemma","GemmaModel"],["gemma2","Gemma2Model"],["vaultgemma","VaultGemmaModel"],["gemma3_text","Gemma3Model"],["helium","HeliumModel"],["glm","GlmModel"],["glm_moe_dsa","GlmMoeDsaModel"],["openelm","OpenELMModel"],["qwen2","Qwen2Model"],["qwen2_moe","Qwen2MoeModel"],["qwen3","Qwen3Model"],["qwen3_moe","Qwen3MoeModel"],["qwen3_next","Qwen3NextModel"],["phi","PhiModel"],["phi3","Phi3Model"],["mpt","MptModel"],["opt","OPTModel"],["mistral","MistralModel"],["mistral4","Mistral4Model"],["ministral","MinistralModel"],["ministral3","Ministral3Model"],["ernie4_5","Ernie4_5ForCausalLM"],["starcoder2","Starcoder2Model"],["deepseek_v3","DeepseekV3Model"],["falcon","FalconModel"],["falcon_h1","FalconH1Model"],["nemotron_h","NemotronHModel"],["solar_open","SolarOpenModel"],["stablelm","StableLmModel"],["modernbert-decoder","ModernBertDecoderModel"],["hunyuan_v1_dense","HunYuanDenseV1Model"],["youtu","YoutuModel"]]),Im=new Map([["speecht5","SpeechT5ForSpeechToText"],["whisper","WhisperForConditionalGeneration"],["lite-whisper","LiteWhisperForConditionalGeneration"],["moonshine","MoonshineForConditionalGeneration"],["cohere_asr","CohereAsrForConditionalGeneration"]]),Om=new Map([["speecht5","SpeechT5ForTextToSpeech"]]),Nm=new 
Map([["vits","VitsModel"],["musicgen","MusicgenForConditionalGeneration"],["supertonic","SupertonicForConditionalGeneration"]]),Dm=new Map([["bert","BertForSequenceClassification"],["eurobert","EuroBertForSequenceClassification"],["neobert","NeoBertForSequenceClassification"],["modernbert","ModernBertForSequenceClassification"],["roformer","RoFormerForSequenceClassification"],["electra","ElectraForSequenceClassification"],["esm","EsmForSequenceClassification"],["convbert","ConvBertForSequenceClassification"],["camembert","CamembertForSequenceClassification"],["deberta","DebertaForSequenceClassification"],["deberta-v2","DebertaV2ForSequenceClassification"],["mpnet","MPNetForSequenceClassification"],["albert","AlbertForSequenceClassification"],["distilbert","DistilBertForSequenceClassification"],["roberta","RobertaForSequenceClassification"],["xlm","XLMForSequenceClassification"],["xlm-roberta","XLMRobertaForSequenceClassification"],["bart","BartForSequenceClassification"],["mbart","MBartForSequenceClassification"],["mobilebert","MobileBertForSequenceClassification"],["squeezebert","SqueezeBertForSequenceClassification"]]),zm=new Map([["bert","BertForTokenClassification"],["eurobert","EuroBertForTokenClassification"],["neobert","NeoBertForTokenClassification"],["modernbert","ModernBertForTokenClassification"],["roformer","RoFormerForTokenClassification"],["electra","ElectraForTokenClassification"],["esm","EsmForTokenClassification"],["convbert","ConvBertForTokenClassification"],["camembert","CamembertForTokenClassification"],["deberta","DebertaForTokenClassification"],["deberta-v2","DebertaV2ForTokenClassification"],["mpnet","MPNetForTokenClassification"],["distilbert","DistilBertForTokenClassification"],["roberta","RobertaForTokenClassification"],["xlm","XLMForTokenClassification"],["xlm-roberta","XLMRobertaForTokenClassification"]]),Bm=new 
Map([["t5","T5ForConditionalGeneration"],["longt5","LongT5ForConditionalGeneration"],["mt5","MT5ForConditionalGeneration"],["bart","BartForConditionalGeneration"],["mbart","MBartForConditionalGeneration"],["marian","MarianMTModel"],["m2m_100","M2M100ForConditionalGeneration"],["blenderbot","BlenderbotForConditionalGeneration"],["blenderbot-small","BlenderbotSmallForConditionalGeneration"]]),Rm=new Map([["bloom","BloomForCausalLM"],["gpt2","GPT2LMHeadModel"],["gpt_oss","GptOssForCausalLM"],["jais","JAISLMHeadModel"],["gptj","GPTJForCausalLM"],["gpt_bigcode","GPTBigCodeForCausalLM"],["gpt_neo","GPTNeoForCausalLM"],["gpt_neox","GPTNeoXForCausalLM"],["codegen","CodeGenForCausalLM"],["llama","LlamaForCausalLM"],["nanochat","NanoChatForCausalLM"],["apertus","ApertusForCausalLM"],["llama4_text","Llama4ForCausalLM"],["arcee","ArceeForCausalLM"],["afmoe","AfmoeForCausalLM"],["lfm2","Lfm2ForCausalLM"],["lfm2_moe","Lfm2MoeForCausalLM"],["smollm3","SmolLM3ForCausalLM"],["exaone","ExaoneForCausalLM"],["olmo","OlmoForCausalLM"],["olmo2","Olmo2ForCausalLM"],["olmo3","Olmo3ForCausalLM"],["olmo_hybrid","OlmoHybridForCausalLM"],["mobilellm","MobileLLMForCausalLM"],["granite","GraniteForCausalLM"],["granitemoehybrid","GraniteMoeHybridForCausalLM"],["cohere","CohereForCausalLM"],["cohere2","Cohere2ForCausalLM"],["gemma","GemmaForCausalLM"],["gemma2","Gemma2ForCausalLM"],["vaultgemma","VaultGemmaForCausalLM"],["gemma3_text","Gemma3ForCausalLM"],["gemma3","Gemma3ForCausalLM"],["helium","HeliumForCausalLM"],["glm","GlmForCausalLM"],["glm_moe_dsa","GlmMoeDsaForCausalLM"],["openelm","OpenELMForCausalLM"],["qwen2","Qwen2ForCausalLM"],["qwen2_moe","Qwen2MoeForCausalLM"],["qwen3","Qwen3ForCausalLM"],["qwen3_moe","Qwen3MoeForCausalLM"],["qwen3_next","Qwen3NextForCausalLM"],["qwen2_vl","Qwen2VLForCausalLM"],["qwen2_5_vl","Qwen2_5_VLForCausalLM"],["qwen3_vl","Qwen3VLForCausalLM"],["qwen3_vl_moe","Qwen3VLMoeForCausalLM"],["qwen3_5","Qwen3_5ForCausalLM"],["qwen3_5_text","Qwen3_5ForCausalLM"],["qwen
3_5_moe","Qwen3_5MoeForCausalLM"],["gemma3n","Gemma3nForCausalLM"],["gemma4","Gemma4ForCausalLM"],["phi","PhiForCausalLM"],["phi3","Phi3ForCausalLM"],["mpt","MptForCausalLM"],["opt","OPTForCausalLM"],["mbart","MBartForCausalLM"],["mistral","MistralForCausalLM"],["mistral4","Mistral4ForCausalLM"],["ministral","MinistralForCausalLM"],["ministral3","Ministral3ForCausalLM"],["ernie4_5","Ernie4_5ForCausalLM"],["starcoder2","Starcoder2ForCausalLM"],["deepseek_v3","DeepseekV3ForCausalLM"],["falcon","FalconForCausalLM"],["falcon_h1","FalconH1ForCausalLM"],["nemotron_h","NemotronHForCausalLM"],["trocr","TrOCRForCausalLM"],["solar_open","SolarOpenForCausalLM"],["stablelm","StableLmForCausalLM"],["modernbert-decoder","ModernBertDecoderForCausalLM"],["hunyuan_v1_dense","HunYuanDenseV1ForCausalLM"],["youtu","YoutuForCausalLM"],["phi3_v","Phi3VForCausalLM"]]),hI=new Map([["multi_modality","MultiModalityCausalLM"]]),Gm=new Map([["bert","BertForMaskedLM"],["eurobert","EuroBertForMaskedLM"],["neobert","NeoBertForMaskedLM"],["modernbert","ModernBertForMaskedLM"],["roformer","RoFormerForMaskedLM"],["electra","ElectraForMaskedLM"],["esm","EsmForMaskedLM"],["convbert","ConvBertForMaskedLM"],["camembert","CamembertForMaskedLM"],["deberta","DebertaForMaskedLM"],["deberta-v2","DebertaV2ForMaskedLM"],["mpnet","MPNetForMaskedLM"],["albert","AlbertForMaskedLM"],["distilbert","DistilBertForMaskedLM"],["roberta","RobertaForMaskedLM"],["xlm","XLMWithLMHeadModel"],["xlm-roberta","XLMRobertaForMaskedLM"],["mobilebert","MobileBertForMaskedLM"],["squeezebert","SqueezeBertForMaskedLM"]]),$m=new 
Map([["bert","BertForQuestionAnswering"],["neobert","NeoBertForQuestionAnswering"],["roformer","RoFormerForQuestionAnswering"],["electra","ElectraForQuestionAnswering"],["convbert","ConvBertForQuestionAnswering"],["camembert","CamembertForQuestionAnswering"],["deberta","DebertaForQuestionAnswering"],["deberta-v2","DebertaV2ForQuestionAnswering"],["mpnet","MPNetForQuestionAnswering"],["albert","AlbertForQuestionAnswering"],["distilbert","DistilBertForQuestionAnswering"],["roberta","RobertaForQuestionAnswering"],["xlm","XLMForQuestionAnswering"],["xlm-roberta","XLMRobertaForQuestionAnswering"],["mobilebert","MobileBertForQuestionAnswering"],["squeezebert","SqueezeBertForQuestionAnswering"]]),Vm=new Map([["vision-encoder-decoder","VisionEncoderDecoderModel"],["idefics3","Idefics3ForConditionalGeneration"],["smolvlm","SmolVLMForConditionalGeneration"]]),Um=new Map([["llava","LlavaForConditionalGeneration"],["llava_onevision","LlavaOnevisionForConditionalGeneration"],["moondream1","Moondream1ForConditionalGeneration"],["florence2","Florence2ForConditionalGeneration"],["qwen2_vl","Qwen2VLForConditionalGeneration"],["qwen2_5_vl","Qwen2_5_VLForConditionalGeneration"],["qwen3_vl","Qwen3VLForConditionalGeneration"],["qwen3_vl_moe","Qwen3VLMoeForConditionalGeneration"],["qwen3_5","Qwen3_5ForConditionalGeneration"],["qwen3_5_moe","Qwen3_5MoeForConditionalGeneration"],["lfm2_vl","Lfm2VlForConditionalGeneration"],["idefics3","Idefics3ForConditionalGeneration"],["smolvlm","SmolVLMForConditionalGeneration"],["paligemma","PaliGemmaForConditionalGeneration"],["llava_qwen2","LlavaQwen2ForCausalLM"],["gemma3","Gemma3ForConditionalGeneration"],["gemma3n","Gemma3nForConditionalGeneration"],["gemma4","Gemma4ForConditionalGeneration"],["mistral3","Mistral3ForConditionalGeneration"],["lighton_ocr","LightOnOcrForConditionalGeneration"],["glm_ocr","GlmOcrForConditionalGeneration"]]),jm=new 
Map([["granite_speech","GraniteSpeechForConditionalGeneration"],["ultravox","UltravoxModel"],["voxtral","VoxtralForConditionalGeneration"],["voxtral_realtime","VoxtralRealtimeForConditionalGeneration"]]),fI=new Map([["vision-encoder-decoder","VisionEncoderDecoderModel"]]),qm=new Map([["vit","ViTForImageClassification"],["ijepa","IJepaForImageClassification"],["pvt","PvtForImageClassification"],["vit_msn","ViTMSNForImageClassification"],["fastvit","FastViTForImageClassification"],["mobilevit","MobileViTForImageClassification"],["mobilevitv2","MobileViTV2ForImageClassification"],["beit","BeitForImageClassification"],["deit","DeiTForImageClassification"],["hiera","HieraForImageClassification"],["convnext","ConvNextForImageClassification"],["convnextv2","ConvNextV2ForImageClassification"],["dinov2","Dinov2ForImageClassification"],["dinov2_with_registers","Dinov2WithRegistersForImageClassification"],["resnet","ResNetForImageClassification"],["swin","SwinForImageClassification"],["segformer","SegformerForImageClassification"],["efficientnet","EfficientNetForImageClassification"],["mobilenet_v1","MobileNetV1ForImageClassification"],["mobilenet_v2","MobileNetV2ForImageClassification"],["mobilenet_v3","MobileNetV3ForImageClassification"],["mobilenet_v4","MobileNetV4ForImageClassification"]]),Wm=new Map([["detr","DetrForObjectDetection"],["rt_detr","RTDetrForObjectDetection"],["rt_detr_v2","RTDetrV2ForObjectDetection"],["rf_detr","RFDetrForObjectDetection"],["d_fine","DFineForObjectDetection"],["table-transformer","TableTransformerForObjectDetection"],["yolos","YolosForObjectDetection"]]),Hm=new Map([["owlvit","OwlViTForObjectDetection"],["owlv2","Owlv2ForObjectDetection"],["grounding-dino","GroundingDinoForObjectDetection"]]),Os=new Map([["detr","DetrForSegmentation"],["clipseg","CLIPSegForImageSegmentation"]]),Qm=new 
Map([["segformer","SegformerForSemanticSegmentation"],["sapiens","SapiensForSemanticSegmentation"],["swin","SwinForSemanticSegmentation"],["mobilenet_v1","MobileNetV1ForSemanticSegmentation"],["mobilenet_v2","MobileNetV2ForSemanticSegmentation"],["mobilenet_v3","MobileNetV3ForSemanticSegmentation"],["mobilenet_v4","MobileNetV4ForSemanticSegmentation"]]),Xm=new Map([["detr","DetrForSegmentation"],["maskformer","MaskFormerForInstanceSegmentation"]]),Ym=new Map([["sam","SamModel"],["sam2","Sam2Model"],["edgetam","EdgeTamModel"],["sam3_tracker","Sam3TrackerModel"]]),Jm=new Map([["wav2vec2","Wav2Vec2ForCTC"],["wav2vec2-bert","Wav2Vec2BertForCTC"],["unispeech","UniSpeechForCTC"],["unispeech-sat","UniSpeechSatForCTC"],["wavlm","WavLMForCTC"],["hubert","HubertForCTC"],["parakeet_ctc","ParakeetForCTC"]]),Km=new Map([["wav2vec2","Wav2Vec2ForSequenceClassification"],["wav2vec2-bert","Wav2Vec2BertForSequenceClassification"],["unispeech","UniSpeechForSequenceClassification"],["unispeech-sat","UniSpeechSatForSequenceClassification"],["wavlm","WavLMForSequenceClassification"],["hubert","HubertForSequenceClassification"],["audio-spectrogram-transformer","ASTForAudioClassification"]]),Zm=new Map([["wavlm","WavLMForXVector"]]),eg=new Map([["unispeech-sat","UniSpeechSatForAudioFrameClassification"],["wavlm","WavLMForAudioFrameClassification"],["wav2vec2","Wav2Vec2ForAudioFrameClassification"],["pyannote","PyAnnoteForAudioFrameClassification"]]),tg=new Map([["vitmatte","VitMatteForImageMatting"]]),_I=new Map([["patchtst","PatchTSTForPrediction"],["patchtsmixer","PatchTSMixerForPrediction"]]),rg=new Map([["swin2sr","Swin2SRForImageSuperResolution"]]),sg=new 
Map([["chmv2","CHMv2ForDepthEstimation"],["dpt","DPTForDepthEstimation"],["depth_anything","DepthAnythingForDepthEstimation"],["glpn","GLPNForDepthEstimation"],["sapiens","SapiensForDepthEstimation"],["depth_pro","DepthProForDepthEstimation"],["metric3d","Metric3DForDepthEstimation"],["metric3dv2","Metric3Dv2ForDepthEstimation"]]),ng=new Map([["sapiens","SapiensForNormalEstimation"]]),ag=new Map([["vitpose","VitPoseForPoseEstimation"]]),og=new Map([["clip","CLIPVisionModelWithProjection"],["siglip","SiglipVisionModel"],["jina_clip","JinaCLIPVisionModel"]]),ig=[[lI,q.EncoderOnly],[cI,q.EncoderDecoder],[dI,q.DecoderOnlyWithoutHead],[uI,q.AutoEncoder],[Dm,q.EncoderOnly],[zm,q.EncoderOnly],[Bm,q.Seq2Seq],[Im,q.Seq2Seq],[Rm,q.DecoderOnly],[hI,q.MultiModality],[Gm,q.EncoderOnly],[$m,q.EncoderOnly],[Vm,q.Vision2Seq],[Um,q.ImageTextToText],[jm,q.AudioTextToText],[qm,q.EncoderOnly],[Os,q.EncoderOnly],[Xm,q.EncoderOnly],[Qm,q.EncoderOnly],[tg,q.EncoderOnly],[_I,q.EncoderOnly],[rg,q.EncoderOnly],[sg,q.EncoderOnly],[ng,q.EncoderOnly],[ag,q.EncoderOnly],[Wm,q.EncoderOnly],[Hm,q.EncoderOnly],[Ym,q.MaskGeneration],[Jm,q.EncoderOnly],[Km,q.EncoderOnly],[Om,q.Seq2Seq],[Nm,q.EncoderOnly],[Zm,q.EncoderOnly],[eg,q.EncoderOnly],[og,q.EncoderOnly]];for(const[e,t]of ig)for(const r of e.values()){_r.set(r,t);const s=sl[r];vs.set(s,r),Zi.set(r,s)}var 
pI=[["MusicgenForConditionalGeneration",em,q.Musicgen],["Phi3VForCausalLM",nm,q.Phi3V],["CLIPTextModelWithProjection",vp,q.EncoderOnly],["SiglipTextModel",fm,q.EncoderOnly],["JinaCLIPTextModel",Up,q.EncoderOnly],["ClapTextModelWithProjection",gp,q.EncoderOnly],["ClapAudioModelWithProjection",wp,q.EncoderOnly],["DacEncoderModel",Mp,q.EncoderOnly],["DacDecoderModel",xp,q.EncoderOnly],["MimiEncoderModel",Jp,q.EncoderOnly],["MimiDecoderModel",Kp,q.EncoderOnly],["SnacEncoderModel",_m,q.EncoderOnly],["SnacDecoderModel",pm,q.EncoderOnly],["Gemma3nForConditionalGeneration",Fa,q.ImageAudioTextToText],["Gemma4ForConditionalGeneration",Nl,q.ImageAudioTextToText],["SupertonicForConditionalGeneration",wm,q.Supertonic],["ChatterboxModel",_p,q.Chatterbox],["VoxtralRealtimeForConditionalGeneration",Cm,q.VoxtralRealtime]];for(const[e,t,r]of pI)_r.set(e,r),vs.set(t,e),Zi.set(e,t);var lg=new Map([["modnet",Os],["birefnet",Os],["isnet",Os],["ben",Os]]);for(const[e,t]of lg.entries())t.set(e,"PreTrainedModel"),_r.set(e,q.EncoderOnly),Zi.set(e,P);var mI=new Set(lg.keys());_r.set("PreTrainedModel",q.EncoderOnly),vs.set(P,"PreTrainedModel");var 
Ae={MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES:Dm,MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES:zm,MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES:Om,MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES:Nm,MODEL_FOR_MASKED_LM_MAPPING_NAMES:Gm,MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES:$m,MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES:qm,MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES:Os,MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES:Qm,MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES:Xm,MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES:Wm,MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES:Hm,MODEL_FOR_MASK_GENERATION_MAPPING_NAMES:Ym,MODEL_FOR_CTC_MAPPING_NAMES:Jm,MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES:Km,MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES:Zm,MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES:eg,MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES:fI,MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES:tg,MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES:rg,MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES:sg,MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES:ng,MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES:ag,MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES:og,MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES:Um,MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES:jm,MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:Bm,MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES:Im,MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:Rm,MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES:Vm};yE(Ae);var Ce=(oo=class{static supports(e){if(!this.MODEL_CLASS_MAPPINGS)return!1;for(const t of this.MODEL_CLASS_MAPPINGS)if(t.has(e))return!0;return this.BASE_IF_FAIL}static async from_pretrained(e,{progress_callback:t=null,config:r=null,cache_dir:s=null,local_files_only:n=!1,revision:a="main",model_file_name:o=null,subfolder:i="onnx",device:l=null,dtype:c=null,use_external_data_format:d=null,session_options:h={}}={}){const _={progress_callback:t,config:r,cache_dir:s,local_files_only:n,revision:a,model_file_name:o,subfolder:i,device:l,dtype:c,use_external_data_format:d,session_options:h};if(_.config=await 
on.from_pretrained(e,_),!this.MODEL_CLASS_MAPPINGS)throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: "+this.name);const{model_type:p}=_.config;for(const w of this.MODEL_CLASS_MAPPINGS){let v=w.get(p);if(!v){for(const y of w.values())if(y[0]===p){v=y;break}if(!v)continue}return await sl[v].from_pretrained(e,_)}if(this.BASE_IF_FAIL)return mI.has(p)||ue.warn(`Unknown model class "${p}", attempting to construct from base class.`),await P.from_pretrained(e,_);throw Error(`Unsupported model type: ${p}`)}},k(oo,"MODEL_CLASS_MAPPINGS",null),k(oo,"BASE_IF_FAIL",!1),oo),wn=(io=class extends Ce{},k(io,"MODEL_CLASS_MAPPINGS",ig.map(e=>e[0])),k(io,"BASE_IF_FAIL",!0),io),cg=(_u=class extends Ce{},k(_u,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES]),_u),gI=(pu=class extends Ce{},k(pu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES]),pu),ou=(mu=class extends Ce{},k(mu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES]),mu),wI=(gu=class extends Ce{},k(gu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES]),gu),vI=(wu=class extends Ce{},k(wu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES]),wu),yI=(vu=class extends Ce{},k(vu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES]),vu),bI=(yu=class extends Ce{},k(yu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES]),yu),MI=(bu=class extends Ce{},k(bu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_MASKED_LM_MAPPING_NAMES]),bu),xI=(Mu=class extends Ce{},k(Mu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES]),Mu),kI=(xu=class extends Ce{},k(xu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES]),xu),TI=(ku=class extends Ce{},k(ku,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES]),ku),ug=(Tu=class extends Ce{},k(Tu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES]),Tu),dg=(Eu=class extends 
Ce{},k(Eu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES]),Eu),hg=(Au=class extends Ce{},k(Au,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES]),Au),EI=(Cu=class extends Ce{},k(Cu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES]),Cu),AI=(Su=class extends Ce{},k(Su,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES]),Su);Pu=class extends Ce{},k(Pu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_MASK_GENERATION_MAPPING_NAMES]);var CI=(Fu=class extends Ce{},k(Fu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_CTC_MAPPING_NAMES]),Fu),SI=(Lu=class extends Ce{},k(Lu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES]),Lu);Iu=class extends Ce{},k(Iu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES]),Ou=class extends Ce{},k(Ou,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES]);var PI=(Nu=class extends Ce{},k(Nu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES]),Nu);Du=class extends Ce{},k(Du,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES]);var FI=(zu=class extends Ce{},k(zu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES]),zu),LI=(Bu=class extends Ce{},k(Bu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES]),Bu);Ru=class extends Ce{},k(Ru,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES]),Gu=class extends Ce{},k(Gu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES]);var II=($u=class extends Ce{},k($u,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES]),$u);Vu=class extends Ce{},k(Vu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES]),Uu=class extends Ce{},k(Uu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES]);async function Ht(e){return Array.isArray(e)||(e=[e]),await Promise.all(e.map(t=>qt.read(t)))}async function Ns(e,t){return 
Array.isArray(e)||(e=[e]),await Promise.all(e.map(r=>typeof r=="string"||r instanceof URL?I1(r,t):r instanceof Float64Array?new Float32Array(r):r))}function iu(e,t){t&&(e=e.map(o=>o|0));const[r,s,n,a]=e;return{xmin:r,ymin:s,xmax:n,ymax:a}}var Ve=class extends vt{constructor({task:e,model:t,tokenizer:r=null,processor:s=null}){super(),this.task=e,this.model=t,this.tokenizer=r,this.processor=s}async dispose(){await this.model.dispose()}},OI=class extends Ve{async _call(e,{top_k:t=1}={}){const r=this.tokenizer(e,{padding:!0,truncation:!0}),s=await this.model(r),{problem_type:n,id2label:a}=this.model.config,o=n==="multi_label_classification"?l=>l.sigmoid():l=>new U("float32",nt(l.data),l.dims),i=[];for(const l of s.logits){const c=o(l),d=await _s(c,t),h=d[0].tolist(),p=d[1].tolist().map((w,v)=>({label:a?a[w]:`LABEL_${w}`,score:h[v]}));t===1?i.push(...p):i.push(p)}return Array.isArray(e)||t===1?i:i[0]}},NI=class extends Ve{async _call(e,{ignore_labels:t=["O"]}={}){const r=Array.isArray(e),s=this.tokenizer(r?e:[e],{padding:!0,truncation:!0}),a=(await this.model(s)).logits,o=this.model.config.id2label,i=[];for(let l=0;l<a.dims[0];++l){const c=s.input_ids[l],d=a[l],h=[];for(let _=0;_<d.dims[0];++_){const p=d[_],w=je(p.data)[1],v=o?o[w]:`LABEL_${w}`;if(t.includes(v))continue;const y=this.tokenizer.decode([c[_].item()],{skip_special_tokens:!0});if(y==="")continue;const M=nt(p.data);h.push({entity:v,score:M[w],index:_,word:y})}i.push(h)}return r?i:i[0]}},DI=class extends Ve{async _call(e,t,{top_k:r=1}={}){const s=this.tokenizer(e,{text_pair:t,padding:!0,truncation:!0}),n=Array.isArray(e),{start_logits:a,end_logits:o}=await this.model(s),i=s.input_ids.tolist(),l=s.attention_mask.tolist(),{all_special_ids:c,sep_token_id:d}=this.tokenizer,h=[];for(let _=0;_<a.dims[0];++_){const p=i[_],w=p.findIndex(S=>S==d),v=a[_].tolist(),y=o[_].tolist();for(let S=1;S<v.length;++S)(l[_]==0||S<=w||c.findIndex(N=>N==p[S])!==-1)&&(v[S]=-1/0,y[S]=-1/0);const 
M=nt(v).map((S,N)=>[S,N]),T=nt(y).map((S,N)=>[S,N]);M[0][0]=0,T[0][0]=0;const A=Ny(M,T).filter(S=>S[0][1]<=S[1][1]).map(S=>[S[0][1],S[1][1],S[0][0]*S[1][0]]).sort((S,N)=>N[2]-S[2]),C=[];for(let S=0;S<Math.min(A.length,r);++S){const[N,x,R]=A[S],z=p.slice(N,x+1),$=this.tokenizer.decode(z,{skip_special_tokens:!0});C.push({answer:$,score:R})}r===1?h.push(...C):h.push(C)}return n?h:h[0]}},zI=class extends Ve{async _call(e,{top_k:t=5}={}){const{mask_token_id:r,mask_token:s}=this.tokenizer,n=this.tokenizer(e,{padding:!0,truncation:!0}),{logits:a}=await this.model(n),o=[],i=n.input_ids.tolist();for(let l=0;l<i.length;++l){const c=i[l],d=c.findIndex(v=>v==r);if(d===-1)throw Error(`Mask token (${s}) not found in text.`);const h=a[l][d],_=await _s(new U("float32",nt(h.data),h.dims),t),p=_[0].tolist(),w=_[1].tolist();o.push(w.map((v,y)=>{const M=c.slice();return M[d]=v,{score:p[y],token:Number(v),token_str:this.tokenizer.decode([v]),sequence:this.tokenizer.decode(M,{skip_special_tokens:!0})}}))}return Array.isArray(e)?o:o[0]}},lu=class extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{max_new_tokens:256});k(this,"_key","generated_text")}async _call(t,r={}){Array.isArray(t)||(t=[t]),this.model.config.prefix&&(t=t.map(l=>this.model.config.prefix+l));const s=this.model.config.task_specific_params;s&&s[this.task]&&s[this.task].prefix&&(t=t.map(l=>s[this.task].prefix+l));const n=this.tokenizer,a={padding:!0,truncation:!0};let o;this.task==="translation"&&"_build_translation_inputs"in n?o=n._build_translation_inputs(t,a,r):o=n(t,a);const i=await this.model.generate({...o,...this._default_generation_config,...r});return n.batch_decode(i,{skip_special_tokens:!0}).map(l=>({[this._key]:l}))}},BI=class extends lu{constructor(){super(...arguments);k(this,"_key","summary_text")}},RI=class extends lu{constructor(){super(...arguments);k(this,"_key","translation_text")}};function fg(e){return Array.isArray(e)&&e.every(t=>"role"in t&&"content"in t)}var GI=class 
extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{max_new_tokens:256})}async _call(t,r={}){let s=!1,n=!1,a=r.add_special_tokens??(this.tokenizer.add_bos_token||this.tokenizer.add_eos_token)??!1,o=r.tokenizer_encode_kwargs,i;if(typeof t=="string")i=t=[t];else if(Array.isArray(t)&&t.every(w=>typeof w=="string"))s=!0,i=t;else{if(fg(t))t=[t];else if(Array.isArray(t)&&t.every(fg))s=!0;else throw new Error("Input must be a string, an array of strings, a Chat, or an array of Chats");n=!0,i=t.map(w=>this.tokenizer.apply_chat_template(w,{tokenize:!1,add_generation_prompt:!0,...o})),a=!1,o=void 0}const l=n?!1:r.return_full_text??!0;this.tokenizer.padding_side="left";const c=this.tokenizer(i,{add_special_tokens:a,padding:!0,truncation:!0,...o}),d=await this.model.generate({...c,...this._default_generation_config,...r}),h=this.tokenizer.batch_decode(d,{skip_special_tokens:!0});let _;!l&&c.input_ids.dims.at(-1)>0&&(_=this.tokenizer.batch_decode(c.input_ids,{skip_special_tokens:!0}).map(w=>w.length));const p=Array.from({length:t.length},w=>[]);for(let w=0;w<h.length;++w){const v=Math.floor(w/d.dims[0]*t.length);_&&(h[w]=h[w].slice(_[v])),p[v].push({generated_text:n?[...t[v],{role:"assistant",content:h[w]}]:h[w]})}return!s&&p.length===1?p[0]:p}},$I=class extends Ve{constructor(e){super(e),this.label2id=Object.fromEntries(Object.entries(this.model.config.label2id).map(([t,r])=>[t.toLowerCase(),r])),this.entailment_id=this.label2id.entailment,this.entailment_id===void 0&&(ue.warn("Could not find 'entailment' in label2id mapping. Using 2 as entailment_id."),this.entailment_id=2),this.contradiction_id=this.label2id.contradiction??this.label2id.not_entailment,this.contradiction_id===void 0&&(ue.warn("Could not find 'contradiction' in label2id mapping. 
/**
 * Automatic speech recognition pipeline.
 *
 * `_call` merges the caller's options over `_default_generation_config` and
 * dispatches on `this.model.config.model_type` to one of the family-specific
 * implementations below. Each implementation accepts either one audio input
 * or an array of inputs and mirrors that shape in what it returns.
 */
var jI = class extends Ve {
  constructor() {
    super(...arguments);
    // This pipeline adds no generation defaults of its own.
    k(this, "_default_generation_config", {});
  }

  /**
   * @param {*} audio One audio input or an array of them.
   * @param {Object} [kwargs] Options forwarded to the selected implementation.
   * @returns {Promise<Object|Object[]>} `{ text, ... }` per input.
   * @throws {Error} When the model type has no implementation here.
   */
  async _call(audio, kwargs = {}) {
    kwargs = { ...this._default_generation_config, ...kwargs };
    switch (this.model.config.model_type) {
      case "whisper":
      case "lite-whisper":
        return this._call_whisper(audio, kwargs);
      case "wav2vec2":
      case "wav2vec2-bert":
      case "unispeech":
      case "unispeech-sat":
      case "hubert":
      case "parakeet_ctc":
        return this._call_wav2vec2(audio, kwargs);
      case "moonshine":
        return this._call_moonshine(audio, kwargs);
      case "cohere_asr":
        return this._call_cohere_asr(audio, kwargs);
      default:
        throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`);
    }
  }

  /**
   * CTC-style decoding: one token id per logits row, then a single decode.
   */
  async _call_wav2vec2(audio, kwargs) {
    if (kwargs.language) {
      ue.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".');
    }
    if (kwargs.task) {
      ue.warn('`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".');
    }

    const single = !Array.isArray(audio);
    const batch = single ? [audio] : audio;
    const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
    const prepared = await Ns(batch, sampling_rate);

    const results = [];
    for (const samples of prepared) {
      const inputs = await this.processor(samples);
      const logits = (await this.model(inputs)).logits[0];
      const ids = [];
      for (const row of logits) {
        // NOTE(review): `je` presumably returns [max, argmax]; index 1 is used as the token id.
        ids.push(je(row.data)[1]);
      }
      const text = this.tokenizer.decode(ids, { skip_special_tokens: true }).trim();
      results.push({ text });
    }
    return single ? results[0] : results;
  }

  /**
   * Whisper decoding with optional chunking.
   *
   * With `chunk_length_s > 0` the audio is cut into overlapping windows of
   * `chunk_length_s` seconds; each window advances by
   * `chunk_length_s - 2 * stride_length_s` seconds (`stride_length_s`
   * defaults to `chunk_length_s / 6`).
   *
   * @throws {Error} When `chunk_length_s <= stride_length_s`, or when more than
   *   one window is needed but the advance is not positive.
   */
  async _call_whisper(audio, kwargs) {
    const return_timestamps = kwargs.return_timestamps ?? false;
    const chunk_length_s = kwargs.chunk_length_s ?? 0;
    const force_full_sequences = kwargs.force_full_sequences ?? false;
    let stride_length_s = kwargs.stride_length_s ?? null;

    const generateKwargs = { ...kwargs };
    if (return_timestamps === "word") {
      generateKwargs.return_token_timestamps = true;
      generateKwargs.return_timestamps = true;
    }

    const single = !Array.isArray(audio);
    const batch = single ? [audio] : audio;
    const extractorConfig = this.processor.feature_extractor.config;
    const time_precision = extractorConfig.chunk_length / this.model.config.max_source_positions;
    const hop_length = extractorConfig.hop_length;
    const sampling_rate = extractorConfig.sampling_rate;
    const prepared = await Ns(batch, sampling_rate);

    const results = [];
    for (const samples of prepared) {
      let chunks = [];
      if (chunk_length_s > 0) {
        if (stride_length_s === null) {
          stride_length_s = chunk_length_s / 6;
        } else if (chunk_length_s <= stride_length_s) {
          throw Error("`chunk_length_s` must be larger than `stride_length_s`.");
        }

        const windowSize = sampling_rate * chunk_length_s;
        const strideSize = sampling_rate * stride_length_s;
        const advance = windowSize - 2 * strideSize;

        let offset = 0;
        for (;;) {
          const end = offset + windowSize;
          const window = samples.subarray(offset, end);
          const features = await this.processor(window);
          const isFirst = offset === 0;
          const isLast = end >= samples.length;
          chunks.push({
            stride: [window.length, isFirst ? 0 : strideSize, isLast ? 0 : strideSize],
            input_features: features.input_features,
            is_last: isLast,
          });
          if (isLast) break;
          // Without this check a zero or negative advance never reaches the
          // end of the audio and the loop would grow `chunks` forever.
          if (!(advance > 0)) {
            throw Error("`chunk_length_s` must be larger than twice `stride_length_s` when the audio spans multiple chunks.");
          }
          offset += advance;
        }
      } else {
        chunks = [{
          stride: [samples.length, 0, 0],
          input_features: (await this.processor(samples)).input_features,
          is_last: true,
        }];
      }

      for (const chunk of chunks) {
        generateKwargs.num_frames = Math.floor(chunk.stride[0] / hop_length);
        const output = await this.model.generate({ inputs: chunk.input_features, ...generateKwargs });
        if (return_timestamps === "word") {
          const sequence = output.sequences.tolist()[0];
          const tokenTimestamps = output.token_timestamps.tolist()[0];
          const timestampBegin = this.tokenizer.timestamp_begin;
          // Drop everything before the first timestamp token (if there is one).
          const start = Math.max(sequence.findIndex((id) => Number(id) >= timestampBegin), 0);
          chunk.tokens = sequence.slice(start);
          chunk.token_timestamps = tokenTimestamps.slice(start).map((value) => sn(value, 2));
        } else {
          chunk.tokens = output[0].tolist();
        }
        // Strides were tracked in samples; the tokenizer receives seconds.
        chunk.stride = chunk.stride.map((count) => count / sampling_rate);
      }

      const [text, extra] = this.tokenizer._decode_asr(chunks, {
        time_precision,
        return_timestamps,
        force_full_sequences,
      });
      results.push({ text, ...extra });
    }
    return single ? results[0] : results;
  }

  /**
   * Moonshine decoding. The default token budget is 6 tokens per second of
   * audio, computed from the fractional duration and never below 1, so clips
   * shorter than one second still get a budget. A caller-supplied
   * `max_new_tokens` overrides it.
   */
  async _call_moonshine(audio, kwargs) {
    const single = !Array.isArray(audio);
    const batch = single ? [audio] : audio;
    const sampling_rate = this.processor.feature_extractor.config.sampling_rate;
    const prepared = await Ns(batch, sampling_rate);

    const results = [];
    for (const samples of prepared) {
      const inputs = await this.processor(samples);
      const max_new_tokens = Math.max(1, Math.floor((samples.length * 6) / sampling_rate));
      const generated = await this.model.generate({ max_new_tokens, ...kwargs, ...inputs });
      const text = this.processor.batch_decode(generated, { skip_special_tokens: true })[0];
      results.push({ text });
    }
    return single ? results[0] : results;
  }

  /**
   * Cohere ASR decoding: the feature extractor splits each audio into pieces,
   * every piece is transcribed with the language prompt (default "en"), and
   * the processor class joins the pieces back together.
   */
  async _call_cohere_asr(audio, kwargs) {
    const single = !Array.isArray(audio);
    const batch = single ? [audio] : audio;
    const feature_extractor = this.processor.feature_extractor;
    const sampling_rate = feature_extractor.config.sampling_rate;
    const prepared = await Ns(batch, sampling_rate);
    const language = kwargs.language ?? "en";
    const decoder_input_ids = this.processor.get_decoder_prompt_ids(language);

    const results = [];
    for (const samples of prepared) {
      const pieces = feature_extractor.split_audio(samples);
      const texts = [];
      for (const piece of pieces) {
        const inputs = await this.processor(piece);
        const generated = await this.model.generate({ ...inputs, decoder_input_ids, ...kwargs });
        const text = this.tokenizer.decode(generated[0].tolist(), { skip_special_tokens: true }).trim();
        texts.push(text);
      }
      const joined = this.processor.constructor.join_chunks(texts, language);
      results.push({ text: joined });
    }
    return single ? results[0] : results;
  }
};
/**
 * Image-to-text pipeline: runs generation once per processed image and
 * returns the decoded, trimmed captions.
 */
var WI = class extends Ve {
  /**
   * @param {*} images One image input or an array of them.
   * @param {Object} [generate_kwargs] Extra options spread into `model.generate`.
   * @returns {Promise<Object[]|Object[][]>} For an array input, one list of
   *   `{ generated_text }` per image; otherwise the list for the single image.
   */
  async _call(images, generate_kwargs = {}) {
    const prepared = await Ht(images);
    const { pixel_values } = await this.processor(prepared);

    const captions = [];
    for (const pixels of pixel_values) {
      // Prepend a leading dimension of 1 before handing the item to generate().
      pixels.dims = [1, ...pixels.dims];
      const generated = await this.model.generate({ inputs: pixels, ...generate_kwargs });
      const decoded = this.tokenizer.batch_decode(generated, { skip_special_tokens: true });
      captions.push(decoded.map((text) => ({ generated_text: text.trim() })));
    }

    if (Array.isArray(images)) {
      return captions;
    }
    return captions[0];
  }
};
// Maps each supported subtask to the name of the image-processor method that
// post-processes the model output for it.
var _g = {
  panoptic: "post_process_panoptic_segmentation",
  instance: "post_process_instance_segmentation",
  semantic: "post_process_semantic_segmentation",
};

/**
 * Image segmentation pipeline (batch size 1 only).
 *
 * When `subtask` is given, the matching post-processing method is looked up
 * on `this.processor.image_processor`; when it is omitted, the first subtask
 * whose method exists on the image processor is used. If no subtask applies,
 * the first model output is converted into one mask per image.
 */
var pg = class extends Ve {
  /**
   * @param {*} images One image input, or an array holding exactly one.
   * @param {Object} [options]
   * @param {number} [options.threshold=0.5]
   * @param {number} [options.mask_threshold=0.5]
   * @param {number} [options.overlap_mask_area_threshold=0.8]
   * @param {*} [options.label_ids_to_fuse=null]
   * @param {*} [options.target_sizes=null] Defaults to each image's [height, width].
   * @param {string|null} [options.subtask=null] "panoptic", "instance" or "semantic".
   * @returns {Promise<Array<{label: *, score: *, mask: *}>>}
   * @throws {Error} On batch sizes other than 1, unexpected model input names,
   *   or a subtask that is unknown or unsupported by the image processor.
   */
  async _call(images, {
    threshold = 0.5,
    mask_threshold = 0.5,
    overlap_mask_area_threshold = 0.8,
    label_ids_to_fuse = null,
    target_sizes = null,
    subtask = null,
  } = {}) {
    if (Array.isArray(images) && images.length !== 1) {
      throw Error("Image segmentation pipeline currently only supports a batch size of 1.");
    }

    const prepared = await Ht(images);
    const imageSizes = prepared.map((image) => [image.height, image.width]);
    const inputs = await this.processor(prepared);

    // Models that name their single input differently get the pixel values under that name.
    const { inputNames, outputNames } = this.model.sessions.model;
    if (!inputNames.includes("pixel_values")) {
      if (inputNames.length !== 1) {
        throw Error(`Expected a single input name, but got ${inputNames.length} inputs: ${inputNames}.`);
      }
      const inputName = inputNames[0];
      if (inputName in inputs) {
        throw Error(`Input name ${inputName} already exists in the inputs.`);
      }
      inputs[inputName] = inputs.pixel_values;
    }

    const outputs = await this.model(inputs);

    // Resolve the post-processing function. An explicit subtask must resolve
    // to a bound method, not to the method's name.
    let postprocess = null;
    const imageProcessor = this.processor.image_processor;
    if (subtask !== null) {
      const methodName = _g[subtask];
      if (typeof methodName === "string" && imageProcessor && typeof imageProcessor[methodName] === "function") {
        postprocess = imageProcessor[methodName].bind(imageProcessor);
      }
    } else if (imageProcessor) {
      for (const [candidate, methodName] of Object.entries(_g)) {
        if (methodName in imageProcessor) {
          postprocess = imageProcessor[methodName].bind(imageProcessor);
          subtask = candidate;
          break;
        }
      }
    }

    // Builds a single-channel mask: 255 where the segmentation equals `id`, else 0.
    const buildMask = (segmentation, id) => {
      const pixels = new Uint8ClampedArray(segmentation.data.length);
      for (let i = 0; i < segmentation.data.length; ++i) {
        if (segmentation.data[i] === id) pixels[i] = 255;
      }
      return new qt(pixels, segmentation.dims[1], segmentation.dims[0], 1);
    };
    const requirePostprocess = () => {
      if (typeof postprocess !== "function") {
        throw Error(`Subtask ${subtask} is not supported by this model's image processor.`);
      }
    };

    const id2label = this.model.config.id2label;
    const results = [];

    if (subtask) {
      if (subtask === "panoptic" || subtask === "instance") {
        requirePostprocess();
        const processed = postprocess(
          outputs,
          threshold,
          mask_threshold,
          overlap_mask_area_threshold,
          label_ids_to_fuse,
          target_sizes ?? imageSizes,
        )[0];
        const segmentation = processed.segmentation;
        for (const segment of processed.segments_info) {
          results.push({
            score: segment.score,
            label: id2label[segment.label_id],
            mask: buildMask(segmentation, segment.id),
          });
        }
      } else if (subtask === "semantic") {
        requirePostprocess();
        const { segmentation, labels } = postprocess(outputs, target_sizes ?? imageSizes)[0];
        for (const labelId of labels) {
          results.push({
            score: null,
            label: id2label[labelId],
            mask: buildMask(segmentation, labelId),
          });
        }
      } else {
        throw Error(`Subtask ${subtask} not supported.`);
      }
    } else {
      // No post-processor: treat the first output as one mask per image.
      const rawOutput = outputs[outputNames[0]];
      for (let i = 0; i < imageSizes.length; ++i) {
        const size = imageSizes[i];
        const item = rawOutput[i];
        // Values outside [0, 1] (with a small tolerance) get a sigmoid applied first.
        if (item.data.some((value) => value < -1e-5 || value > 1 + 1e-5)) {
          item.sigmoid_();
        }
        const mask = await qt.fromTensor(item.mul_(255).to("uint8")).resize(size[1], size[0]);
        results.push({ label: null, score: null, mask });
      }
    }

    return results;
  }
};
/**
 * Document question answering pipeline (batch size 1 only).
 *
 * Builds a `<s_docvqa>` prompt around the question, generates with the image
 * as input, and extracts the text between `<s_answer>` and `</s_answer>`
 * from the decoded output.
 */
var KI = class extends Ve {
  constructor() {
    super(...arguments);
    k(this, "_default_generation_config", { max_new_tokens: 256 });
  }

  /**
   * @param {*} image One image input, or an array holding exactly one.
   * @param {string} question Question inserted into the prompt.
   * @param {Object} [generate_kwargs] Options that override the generation defaults.
   * @returns {Promise<Array<{answer: string|null}>>} `answer` is null when no
   *   answer tag pair is found in the decoded text.
   * @throws {Error} When an array with a length other than 1 is passed.
   */
  async _call(image, question, generate_kwargs = {}) {
    let document = image;
    if (Array.isArray(document)) {
      if (document.length !== 1) {
        throw Error("Document Question Answering pipeline currently only supports a batch size of 1.");
      }
      document = document[0];
    }

    const prepared = (await Ht(document))[0];
    const { pixel_values } = await this.processor(prepared);

    const prompt = `<s_docvqa><s_question>${question}</s_question><s_answer>`;
    const encodedPrompt = this.tokenizer(prompt, {
      add_special_tokens: false,
      padding: true,
      truncation: true,
    });

    const generated = await this.model.generate({
      inputs: pixel_values,
      max_length: this.model.config.decoder.max_position_embeddings,
      decoder_input_ids: encodedPrompt.input_ids,
      ...this._default_generation_config,
      ...generate_kwargs,
    });

    const decoded = this.tokenizer.batch_decode(generated)[0];
    const match = decoded.match(/<s_answer>(.*?)<\/s_answer>/);
    const answer = match && match.length >= 2 ? match[1].trim() : null;
    return [{ answer }];
  }
};
/**
 * Feature extraction pipeline: tokenizes the input, runs the model, and
 * optionally pools, normalizes and quantizes the resulting tensor.
 */
var tO = class extends Ve {
  /**
   * @param {*} texts Input passed to the tokenizer (padding and truncation enabled).
   * @param {Object} [options]
   * @param {string} [options.pooling="none"] One of "none", "mean",
   *   "first_token"/"cls" or "last_token"/"eos".
   * @param {boolean} [options.normalize=false] Apply `normalize(2, -1)` after pooling.
   * @param {boolean} [options.quantize=false] Quantize the result with `precision`.
   * @param {string} [options.precision="binary"]
   * @returns {Promise<*>} The (possibly transformed) output tensor.
   * @throws {Error} For an unknown pooling method.
   */
  async _call(texts, { pooling = "none", normalize = false, quantize = false, precision = "binary" } = {}) {
    const encoded = this.tokenizer(texts, { padding: true, truncation: true });
    const outputs = await this.model(encoded);

    // Use the first of these outputs that the model provides.
    let features = outputs.last_hidden_state ?? outputs.logits ?? outputs.token_embeddings;

    if (pooling === "mean") {
      features = Ax(features, encoded.attention_mask);
    } else if (pooling === "first_token" || pooling === "cls") {
      features = features.slice(null, 0);
    } else if (pooling === "last_token" || pooling === "eos") {
      features = features.slice(null, -1);
    } else if (pooling !== "none") {
      throw Error(`Pooling method '${pooling}' not supported.`);
    }

    if (normalize) {
      features = features.normalize(2, -1);
    }
    if (quantize) {
      features = Px(features, precision);
    }
    return features;
  }
};
a}},qa=Object.freeze({"text-classification":{pipeline:OI,model:cg,default:{model:"Xenova/distilbert-base-uncased-finetuned-sst-2-english"},type:"text"},"token-classification":{pipeline:NI,model:gI,default:{model:"Xenova/bert-base-multilingual-cased-ner-hrl"},type:"text"},"question-answering":{pipeline:DI,model:xI,default:{model:"Xenova/distilbert-base-cased-distilled-squad"},type:"text"},"fill-mask":{pipeline:zI,model:MI,default:{model:"onnx-community/ettin-encoder-32m-ONNX",dtype:"fp32"},type:"text"},summarization:{pipeline:BI,model:ou,default:{model:"Xenova/distilbart-cnn-6-6"},type:"text"},translation:{pipeline:RI,model:ou,default:{model:"Xenova/t5-small"},type:"text"},"text2text-generation":{pipeline:lu,model:ou,default:{model:"Xenova/flan-t5-small"},type:"text"},"text-generation":{pipeline:GI,model:bI,default:{model:"onnx-community/Qwen3-0.6B-ONNX",dtype:"q4"},type:"text"},"zero-shot-classification":{pipeline:$I,model:cg,default:{model:"Xenova/distilbert-base-uncased-mnli"},type:"text"},"audio-classification":{pipeline:VI,model:SI,default:{model:"Xenova/wav2vec2-base-superb-ks"},type:"audio"},"zero-shot-audio-classification":{pipeline:UI,model:wn,default:{model:"Xenova/clap-htsat-unfused"},type:"multimodal"},"automatic-speech-recognition":{pipeline:jI,model:[wI,CI],default:{model:"Xenova/whisper-tiny.en"},type:"multimodal"},"text-to-audio":{pipeline:qI,model:[yI,vI],default:{model:"onnx-community/Supertonic-TTS-ONNX",dtype:"fp32"},type:"text"},"image-to-text":{pipeline:WI,model:kI,default:{model:"Xenova/vit-gpt2-image-captioning"},type:"multimodal"},"image-classification":{pipeline:HI,model:TI,default:{model:"Xenova/vit-base-patch16-224"},type:"multimodal"},"image-segmentation":{pipeline:pg,model:[ug,dg,hg],default:{model:"Xenova/detr-resnet-50-panoptic"},type:"multimodal"},"background-removal":{pipeline:QI,model:[ug,dg,hg],default:{model:"Xenova/modnet"},type:"image"},"zero-shot-image-classification":{pipeline:XI,model:wn,default:{model:"Xenova/clip-vit-base-p
/**
 * Lists the files a pipeline for `task` needs from `modelId`.
 *
 * The task is first mapped through the alias table and then resolved by the
 * part before the first underscore, matching how the pipeline factory
 * resolves tasks; suffixed tasks such as `translation_en_to_fr` therefore
 * resolve to their base entry.
 *
 * Tokenizer files are skipped for "audio" and "image" tasks and processor
 * files for "text" tasks. For "text-generation", `onnx/` files are narrowed
 * to those selected for the model when such a selection is available.
 *
 * @param {string} task Task name or alias.
 * @param {string} modelId Model identifier.
 * @param {Object} [options] Forwarded to the underlying file lookups.
 * @returns {Promise<string[]>} File names.
 * @throws {Error} When the task is not a known pipeline task.
 */
async function aO(task, modelId, options = {}) {
  task = mg[task] ?? task;

  const entry = qa[task.split("_", 1)[0]];
  if (!entry) {
    throw new Error(`Unsupported pipeline task: ${task}. Must be one of [${Object.keys(qa).join(", ")}]`);
  }

  const { type } = entry;
  const files = await nO(modelId, {
    ...options,
    include_tokenizer: type !== "audio" && type !== "image",
    include_processor: type !== "text",
  });

  if (task === "text-generation") {
    // NOTE(review): sp/rp/wE are defined elsewhere; from their use here they
    // appear to yield the set of ONNX file names selected for this model.
    const loaded = await sp(modelId, options);
    const selected = wE(rp(loaded));
    if (selected) {
      const prefixes = Object.values(selected).map((name) => `onnx/${name}`);
      return files.filter(
        (file) => !file.startsWith("onnx/") || prefixes.some((prefix) => file.startsWith(prefix)),
      );
    }
  }
  return files;
}
// ---------------------------------------------------------------------------
// Speech worker.
//
// Incoming messages (`event.data.type`):
//   "load"               -> load the model; posts "status"/"progress", or "error"
//   "transcribe"         -> transcribe `audio`; posts "result" or "error"
//   "transcribe-partial" -> transcribe `audio`; posts "partial" or "partial-error",
//                           echoing `seq`
// Diagnostics are posted as { type: "log", level, message, meta }.
// ---------------------------------------------------------------------------

// The loaded pipeline; null until loading has produced one.
let Nr = null;
// Promise of the load currently in progress, shared by overlapping "load" requests.
let speechModelLoad = null;
const gg = "onnx-community/moonshine-base-ONNX";

/** Posts a structured log entry to the owning thread. */
function Ct(level, message, meta) {
  self.postMessage({ type: "log", level, message, meta });
}

/**
 * Performs one load: creates the pipeline, runs a best-effort warm-up, and
 * posts status updates along the way.
 */
async function loadSpeechModel() {
  Ct("info", "load: begin", { model: gg });
  self.postMessage({ type: "status", status: "loading", message: "Downloading speech model…" });

  // If no download activity shows up within 1.5s, report a cache load instead.
  let sawDownloadActivity = false;
  const cacheHintTimer = setTimeout(() => {
    if (!sawDownloadActivity && !Nr) {
      self.postMessage({ type: "status", status: "loading", message: "Loading speech model from cache…" });
      Ct("info", "load: no download progress within 1.5s, assuming cache hit");
    }
  }, 1500);

  const loadStartedAt = Date.now();
  try {
    Nr = await oO("automatic-speech-recognition", gg, {
      dtype: "fp32",
      device: "wasm",
      progress_callback: (info) => {
        if (info.status === "progress_total" && typeof info.progress === "number") {
          sawDownloadActivity = true;
          self.postMessage({ type: "progress", pct: info.progress });
        } else if (info.status === "download" || info.status === "initiate") {
          sawDownloadActivity = true;
          Ct("debug", "pipeline: " + info.status, { file: info.file });
        }
      },
    });
  } finally {
    clearTimeout(cacheHintTimer);
  }
  Ct("info", "load: pipeline ready", { ms: Date.now() - loadStartedAt });

  // Warm-up on 16000 zero samples, capped at 10s. Failure is logged and ignored.
  const warmupStartedAt = Date.now();
  let warmupTimer;
  try {
    await Promise.race([
      Nr(new Float32Array(16e3)),
      new Promise((_resolve, reject) => {
        warmupTimer = setTimeout(() => reject(new Error("warmup timeout")), 1e4);
      }),
    ]);
    Ct("info", "load: warmup done", { ms: Date.now() - warmupStartedAt });
  } catch (err) {
    const error = err instanceof Error ? err.message : String(err);
    Ct("warn", "load: warmup failed (best-effort, ignored)", { error, ms: Date.now() - warmupStartedAt });
  } finally {
    // Do not leave the timeout pending once the race has settled.
    clearTimeout(warmupTimer);
  }

  self.postMessage({ type: "status", status: "ready", message: "Speech model ready" });
}

/**
 * Ensures the speech model is loaded. Returns immediately when a pipeline
 * already exists; requests that overlap share one load instead of each
 * starting their own.
 */
async function iO() {
  if (Nr) return;
  if (!speechModelLoad) {
    speechModelLoad = loadSpeechModel().finally(() => {
      speechModelLoad = null;
    });
  }
  await speechModelLoad;
}

self.onmessage = async (event) => {
  const { type } = event.data;

  if (type === "load") {
    try {
      await iO();
    } catch (err) {
      const error = err instanceof Error ? err.message : "Model load failed";
      const stack = err instanceof Error ? err.stack : "";
      console.error("[speechWorker] load failed:", error, stack);
      Ct("error", "load: failed", { error, stack });
      self.postMessage({ type: "error", error });
    }
    return;
  }

  if (type === "transcribe") {
    const audio = event.data.audio;
    if (!Nr) {
      Ct("warn", "transcribe: model not loaded, dropping segment", { samples: audio?.length });
      self.postMessage({ type: "error", error: "Model not loaded" });
      return;
    }
    const startedAt = Date.now();
    try {
      const text = (await Nr(audio)).text?.trim() ?? "";
      Ct("debug", "transcribe: ok", { ms: Date.now() - startedAt, samples: audio.length, chars: text.length });
      self.postMessage({ type: "result", text });
    } catch (err) {
      const error = err instanceof Error ? err.message : "Transcription failed";
      Ct("error", "transcribe: failed", { error, ms: Date.now() - startedAt });
      self.postMessage({ type: "error", error });
    }
    return;
  }

  if (type === "transcribe-partial") {
    const audio = event.data.audio;
    const seq = event.data.seq ?? 0;
    if (!Nr) {
      Ct("warn", "transcribe-partial: model not loaded, dropping", { samples: audio?.length, seq });
      self.postMessage({ type: "partial-error", seq, error: "Model not loaded" });
      return;
    }
    const startedAt = Date.now();
    try {
      const text = (await Nr(audio)).text?.trim() ?? "";
      Ct("debug", "transcribe-partial: ok", { ms: Date.now() - startedAt, samples: audio.length, chars: text.length, seq });
      self.postMessage({ type: "partial", text, seq });
    } catch (err) {
      const error = err instanceof Error ? err.message : "Partial transcription failed";
      Ct("warn", "transcribe-partial: failed", { error, ms: Date.now() - startedAt, seq });
      self.postMessage({ type: "partial-error", seq, error });
    }
    return;
  }
};
/**
 * Processor that runs the image processor and, when text is supplied,
 * replaces the first image token of each prompt with a grid of image tokens:
 * every row is `cols` image tokens followed by the break token, except the
 * last row, which ends with the end token.
 *
 * `rows` and `cols` are the last two dims of `pixel_values` divided by
 * `patch_size * spatial_merge_size`, rounded down.
 */
Sn = class extends Ee {
  /**
   * @param {*} images Input for the image processor.
   * @param {string|string[]|null} [text=null] Prompt(s) containing the image token.
   * @param {Object} [kwargs] Passed to both the image processor and the tokenizer.
   * @returns {Promise<Object>} Image-processor outputs merged with tokenizer outputs.
   */
  async _call(images, text = null, kwargs = {}) {
    const imageInputs = await this.image_processor(images, kwargs);

    let prompts = text;
    if (prompts) {
      const [height, width] = imageInputs.pixel_values.dims.slice(-2);
      const {
        image_token,
        image_break_token,
        image_end_token,
        patch_size,
        spatial_merge_size,
      } = this.config;
      const cellSize = patch_size * spatial_merge_size;
      const rows = Math.floor(height / cellSize);
      const cols = Math.floor(width / cellSize);

      // Work on a copy so the caller's prompts are left untouched.
      prompts = structuredClone(prompts);
      if (!Array.isArray(prompts)) {
        prompts = [prompts];
      }
      for (const [index, prompt] of prompts.entries()) {
        const rowTokens = image_token.repeat(cols);
        const innerRow = rowTokens + image_break_token;
        const lastRow = rowTokens + image_end_token;
        const grid = innerRow.repeat(rows - 1) + lastRow;
        prompts[index] = prompt.replace(image_token, grid);
      }
    }

    const textInputs = prompts ? this.tokenizer(prompts, kwargs) : {};
    return { ...imageInputs, ...textInputs };
  }
};
k(Sn, "tokenizer_class", Te);
k(Sn, "image_processor_class", dt);
k(Sn, "uses_processor_config", true);
var ST = Sn;
/**
 * Splits a typed array into consecutive views of at most `t` elements.
 * The views come from `subarray`, so they share memory with the input.
 *
 * @param {{length: number, subarray: Function}} e Typed array to split.
 * @param {number} t Maximum chunk length; must be greater than zero.
 * @returns {Array} Chunks in order; empty when the input is empty.
 * @throws {RangeError} When `t` is not a positive number. A chunk length that
 *   is zero, negative or NaN would otherwise loop forever or return one empty
 *   chunk.
 */
function zT(e, t) {
  if (!(t > 0)) {
    throw new RangeError(`Chunk length must be a positive number, but got ${t}.`);
  }
  const chunks = [];
  for (let start = 0; start < e.length; start += t) {
    chunks.push(e.subarray(start, Math.min(start + t, e.length)));
  }
  return chunks;
}
_=a[0];for(let p=0;p<c.length;++p){_+=NT;for(let w=0;w<c[p];++w)_+=va.repeat(DT);_+=a[p+1]}e=_}return{...this.tokenizer(e,{add_special_tokens:!1,...r}),...s}}},k(Fn,"tokenizer_class",Te),k(Fn,"feature_extractor_class",Lt),k(Fn,"uses_processor_config",!1),Fn),X_=32,Qi=6,ya=8,RT=10,GT=32,$T=(Ln=class extends Ee{get num_mel_frames_first_audio_chunk(){return(Qi+1)*ya}get num_samples_first_audio_chunk(){const{hop_length:e,n_fft:t}=this.feature_extractor.config;return(this.num_mel_frames_first_audio_chunk-1)*e+Math.floor(t/2)}get num_samples_per_audio_chunk(){const{hop_length:e,n_fft:t}=this.feature_extractor.config;return ya*e+t}get num_right_pad_tokens(){return Qi+1+RT}get audio_length_per_tok(){return ya}get raw_audio_length_per_tok(){return ya*this.feature_extractor.config.hop_length}async _call(e,{is_streaming:t=!1,is_first_audio_chunk:r=!0}={}){if(at(e,"VoxtralRealtimeProcessor"),!t&&!r)throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");if(r)if(t){const s=X_*this.raw_audio_length_per_tok,n=new Float32Array(s+e.length);n.set(e,s);const a=await this.feature_extractor(n,{center:!0}),i=1+(X_+Qi),l=new BigInt64Array(i).fill(BigInt(GT));return l[0]=1n,{input_ids:new U("int64",l,[1,i]),...a}}else{const s=this.num_right_pad_tokens*this.raw_audio_length_per_tok,n=new Float32Array(e.length+s);return n.set(e),await this.feature_extractor(n,{center:!0})}else return await this.feature_extractor(e,{center:!1})}},k(Ln,"tokenizer_class",Te),k(Ln,"feature_extractor_class",Lt),k(Ln,"uses_processor_config",!1),Ln),VT=(so=class extends Ee{async _call(e){return await this.feature_extractor(e)}},k(so,"tokenizer_class",Te),k(so,"feature_extractor_class",Lt),so),UT=(no=class extends Ee{async _call(e){return await this.feature_extractor(e)}},k(no,"tokenizer_class",Te),k(no,"feature_extractor_class",Lt),no),jT=(ao=class extends Ee{async _call(e){return await 
/**
 * Builds a normalized view of a model config.
 *
 * Each model family stores head counts, layer counts and hidden sizes under
 * different attribute names. This function picks an attribute mapping by
 * `model_type` and returns an object with common keys (`num_heads`,
 * `num_layers`, `hidden_size`, `dim_kv`, or their encoder/decoder variants),
 * plus `model_type`, `multi_query` and `is_encoder_decoder` when present.
 * Wrapper configs are normalized from their nested text/decoder config.
 *
 * @param {Object} config Model config.
 * @returns {Object} Normalized config.
 */
function gs(config) {
  // normalized key -> attribute name on `config`
  let mapping = {};
  // values taken from a nested config (or computed) that seed the result
  let inherited = {};

  switch (config.model_type) {
    // Wrappers around a text model.
    case "llava": case "paligemma": case "gemma3": case "florence2":
    case "llava_onevision": case "idefics3": case "granite_speech":
    case "ultravox": case "voxtral": case "voxtral_realtime": case "smolvlm":
    case "gemma3n": case "gemma4": case "lfm2_vl": case "chatterbox":
    case "lighton_ocr": case "glm_ocr": case "mistral3": case "qwen2_5_vl":
    case "qwen3_vl": case "qwen3_vl_moe":
      inherited = gs(config.text_config);
      break;
    case "moondream1":
      inherited = gs(config.phi_config);
      break;
    case "musicgen":
      inherited = gs(config.decoder);
      break;
    case "multi_modality":
      inherited = gs(config.language_config);
      break;

    // Decoder-only families.
    case "gpt2": case "gptj": case "jais": case "codegen": case "gpt_bigcode":
      mapping = { num_heads: "n_head", num_layers: "n_layer", hidden_size: "n_embd" };
      break;
    case "gpt_neox": case "stablelm": case "opt": case "falcon":
    case "modernbert-decoder":
      mapping = {
        num_heads: "num_attention_heads",
        num_layers: "num_hidden_layers",
        hidden_size: "hidden_size",
      };
      break;
    case "gpt_oss": case "llama": case "llama4_text": case "nanochat":
    case "apertus": case "arcee": case "afmoe": case "lfm2": case "lfm2_moe":
    case "smollm3": case "olmo": case "olmo2": case "olmo3": case "mobilellm":
    case "granite": case "granitemoehybrid": case "cohere": case "cohere2":
    case "mistral": case "voxtral_realtime_text":
    case "voxtral_realtime_encoder": case "starcoder2": case "qwen2":
    case "qwen2_moe": case "qwen2_vl": case "qwen2_vl_text":
    case "qwen2_5_vl_text": case "qwen3_moe": case "qwen3_vl_text":
    case "qwen3_vl_moe_text": case "phi": case "phi3": case "phi3_v":
    case "llava_qwen2":
      mapping = {
        num_heads: "num_key_value_heads",
        num_layers: "num_hidden_layers",
        hidden_size: "hidden_size",
        num_attention_heads: "num_attention_heads",
        dim_kv: "head_dim",
      };
      break;
    case "qwen3": case "solar_open": case "glm_ocr_text": case "gemma":
    case "gemma2": case "vaultgemma": case "gemma3_text": case "gemma3n_text":
    case "gemma4_text": case "glm": case "helium": case "ernie4_5":
    case "hunyuan_v1_dense": case "falcon_h1": case "nemotron_h":
    case "ministral": case "ministral3":
      mapping = {
        num_heads: "num_key_value_heads",
        num_layers: "num_hidden_layers",
        dim_kv: "head_dim",
      };
      break;
    case "openelm":
      mapping = {
        num_heads: "num_kv_heads",
        num_layers: "num_transformer_layers",
        dim_kv: "head_dim",
      };
      break;
    case "gpt_neo": case "donut-swin":
      mapping = { num_heads: "num_heads", num_layers: "num_layers", hidden_size: "hidden_size" };
      break;
    case "bloom":
      mapping = { num_heads: "n_head", num_layers: "n_layer", hidden_size: "hidden_size" };
      break;
    case "mpt":
      mapping = { num_heads: "n_heads", num_layers: "n_layers", hidden_size: "d_model" };
      break;
    case "exaone":
      mapping = {
        num_heads: "num_key_value_heads",
        num_layers: "num_layers",
        dim_kv: "head_dim",
        num_attention_heads: "num_attention_heads",
      };
      break;
    case "youtu": case "deepseek_v3": case "glm_moe_dsa": case "mistral4":
      mapping = {
        num_heads: "num_key_value_heads",
        num_layers: "num_hidden_layers",
        dim_kv: "qk_head_dim",
        num_attention_heads: "num_attention_heads",
      };
      break;

    // Encoder-decoder families.
    case "t5": case "mt5": case "longt5":
      mapping = {
        num_decoder_layers: "num_decoder_layers",
        num_decoder_heads: "num_heads",
        decoder_dim_kv: "d_kv",
        num_encoder_layers: "num_layers",
        num_encoder_heads: "num_heads",
        encoder_dim_kv: "d_kv",
      };
      break;
    case "bart": case "mbart": case "marian": case "whisper":
    case "lite-whisper": case "m2m_100": case "blenderbot":
    case "blenderbot-small": case "florence2_language":
      mapping = {
        num_decoder_layers: "decoder_layers",
        num_decoder_heads: "decoder_attention_heads",
        decoder_hidden_size: "d_model",
        num_encoder_layers: "encoder_layers",
        num_encoder_heads: "encoder_attention_heads",
        encoder_hidden_size: "d_model",
      };
      break;
    case "speecht5":
      mapping = {
        num_decoder_layers: "decoder_layers",
        num_decoder_heads: "decoder_attention_heads",
        decoder_hidden_size: "hidden_size",
        num_encoder_layers: "encoder_layers",
        num_encoder_heads: "encoder_attention_heads",
        encoder_hidden_size: "hidden_size",
      };
      break;
    // The next two reuse the decoder attributes for the encoder side.
    case "trocr":
      mapping = {
        num_decoder_layers: "decoder_layers",
        num_encoder_layers: "decoder_layers",
        num_decoder_heads: "decoder_attention_heads",
        num_encoder_heads: "decoder_attention_heads",
        decoder_hidden_size: "d_model",
        encoder_hidden_size: "d_model",
      };
      break;
    case "musicgen_decoder":
      mapping = {
        num_decoder_layers: "num_hidden_layers",
        num_encoder_layers: "num_hidden_layers",
        num_decoder_heads: "num_attention_heads",
        num_encoder_heads: "num_attention_heads",
        decoder_hidden_size: "hidden_size",
        encoder_hidden_size: "hidden_size",
      };
      break;
    case "moonshine":
      mapping = {
        num_decoder_layers: "decoder_num_hidden_layers",
        num_decoder_heads: "decoder_num_key_value_heads",
        num_encoder_layers: "encoder_num_hidden_layers",
        num_encoder_heads: "encoder_num_key_value_heads",
        decoder_hidden_size: "hidden_size",
        encoder_hidden_size: "hidden_size",
      };
      break;
    case "cohere_asr": {
      mapping = {
        num_decoder_layers: "num_hidden_layers",
        num_decoder_heads: "num_key_value_heads",
        decoder_hidden_size: "hidden_size",
        decoder_dim_kv: "head_dim",
      };
      // Encoder values come from the nested encoder config; the kv dim is
      // taken from the top-level `head_dim`.
      const { num_hidden_layers, num_attention_heads, hidden_size } = config.encoder_config;
      inherited = {
        num_encoder_layers: num_hidden_layers,
        num_encoder_heads: num_attention_heads,
        encoder_hidden_size: hidden_size,
        encoder_dim_kv: config.head_dim,
      };
      break;
    }
    case "vision-encoder-decoder": {
      // Copy the decoder's normalized values; which keys exist depends on
      // whether the decoder is itself described in encoder/decoder terms.
      const decoder = gs(config.decoder);
      const decoderIsSeq2Seq = "num_decoder_layers" in decoder;
      const result = rt(config, ["model_type", "is_encoder_decoder"]);
      const copied = decoderIsSeq2Seq
        ? [
            "num_decoder_layers",
            "num_decoder_heads",
            "decoder_hidden_size",
            "num_encoder_layers",
            "num_encoder_heads",
            "encoder_hidden_size",
          ]
        : ["num_layers", "num_heads", "hidden_size"];
      for (const key of copied) {
        result[key] = decoder[key];
      }
      return result;
    }
  }

  const normalized = {
    ...inherited,
    ...rt(config, ["model_type", "multi_query", "is_encoder_decoder"]),
  };
  for (const key in mapping) {
    normalized[key] = config[mapping[key]];
  }
  return normalized;
}
Xi(e));const r=(t==null?void 0:t.batch_size)??1;if(["lfm2","lfm2_moe"].includes(e.model_type)){const s=(t==null?void 0:t.prefix)??"past_key_values",n=s==="present"?"present":"past",a={},{layer_types:o,num_attention_heads:i,num_key_value_heads:l,hidden_size:c,conv_L_cache:d}=e,h=c/i;for(let _=0;_<o.length;++_)if(o[_]==="full_attention")for(const p of["key","value"])a[`${s}.${_}.${p}`]=[r,l,0,h];else if(o[_]==="conv")a[`${n}_conv.${_}`]=[r,c,d];else throw new Error(`Unsupported layer type: ${o[_]}`);return a}else if(["granitemoehybrid","falcon_h1","nemotron_h"].includes(e.model_type)){const s=(t==null?void 0:t.prefix)??"past_key_values",n=s==="present"?"present":"past",a=e,o=a.layer_types??a.layers_block_type,i=a.num_hidden_layers??(o==null?void 0:o.length),l=a.num_key_value_heads,c=a.head_dim??a.hidden_size/a.num_attention_heads,d=a.mamba_n_heads??a.mamba_num_heads,h=a.mamba_d_head??a.mamba_head_dim,_=a.mamba_d_state??a.ssm_state_size,p=a.mamba_n_groups??a.n_groups,w=a.mamba_d_conv??a.conv_kernel,y=(a.mamba_d_ssm??(a.mamba_expand?a.mamba_expand*a.hidden_size:d*h))+2*p*_,M={};for(let T=0;T<i;++T)if((!o||o[T]==="mamba")&&(M[`${n}_conv.${T}`]=[r,y,w],M[`${n}_ssm.${T}`]=[r,d,h,_]),!o||o[T]==="attention")for(const A of["key","value"])M[`${s}.${T}.${A}`]=[r,l,0,c];return M}else if(["qwen3_next","qwen3_5_text","qwen3_5_moe_text","olmo_hybrid"].includes(e.model_type)){const s=(t==null?void 0:t.prefix)??"past_key_values",n=s==="present"?"present":"past",a={},{head_dim:o,layer_types:i,num_attention_heads:l,num_key_value_heads:c,hidden_size:d,linear_num_value_heads:h,linear_num_key_heads:_,linear_key_head_dim:p,linear_value_head_dim:w,linear_conv_kernel_dim:v}=e,y=p*_,M=w*h,T=o??d/l;for(let A=0;A<i.length;++A)if(i[A]==="full_attention")for(const C of["key","value"])a[`${s}.${A}.${C}`]=[r,c,0,T];else if(i[A]==="linear_attention"){if(e.model_type==="olmo_hybrid")a[`${n}_conv.${A}.key`]=[r,y,v],a[`${n}_conv.${A}.value`]=[r,M,v],a[`${n}_conv.${A}.query`]=[r,y,v];else{const 
C=y*2+M;a[`${n}_conv.${A}`]=[r,C,v]}a[`${n}_recurrent.${A}`]=[r,h,p,w]}else throw new Error(`Unsupported layer type: ${i[A]}`);return a}else if(["gemma4","gemma4_text"].includes(e.model_type)){const s=e.model_type==="gemma4"?e.text_config:e,n=(t==null?void 0:t.prefix)??"past_key_values",a={},o=s.num_hidden_layers,i=s.num_kv_shared_layers??0,l=o-i,c=s.num_key_value_heads,d=s.head_dim,h=s.global_head_dim??d,_=s.layer_types??[];for(let p=0;p<l;++p){const w=_[p]==="full_attention"?h:d;for(const v of["key","value"])a[`${n}.${p}.${v}`]=[r,c,0,w]}return a}else if(["lfm2_vl","qwen3_5","qwen3_5_moe","voxtral_realtime"].includes(e.model_type)){let s;return e.model_type==="voxtral_realtime"&&(t==null?void 0:t.session_name)==="audio_encoder"?s=e.audio_config:s=e.text_config,ba(s,t)}return HT(e,t)}function HT(e,{prefix:t="past_key_values",batch_size:r=1}={}){const s={},n=e.normalized_config;if(n.is_encoder_decoder&&"num_encoder_heads"in n&&"num_decoder_heads"in n){const a=n.encoder_dim_kv??n.encoder_hidden_size/n.num_encoder_heads,o=n.decoder_dim_kv??n.decoder_hidden_size/n.num_decoder_heads,i=[r,n.num_encoder_heads,0,a],l=[r,n.num_decoder_heads,0,o];for(let c=0;c<n.num_decoder_layers;++c)s[`${t}.${c}.encoder.key`]=i,s[`${t}.${c}.encoder.value`]=i,s[`${t}.${c}.decoder.key`]=l,s[`${t}.${c}.decoder.value`]=l}else{const a=n.num_heads,o=n.num_layers,i=n.dim_kv??n.hidden_size/(n.num_attention_heads??a);if(n.model_type==="falcon"){const l=[r*a,0,i];for(let c=0;c<o;++c)s[`${t}.${c}.key`]=l,s[`${t}.${c}.value`]=l}else if(n.multi_query){const l=[r*a,0,2*i];for(let c=0;c<o;++c)s[`${t}.${c}.key_value`]=l}else if(n.model_type==="bloom"){const l=[r*a,i,0],c=[r*a,0,i];for(let d=0;d<o;++d)s[`${t}.${d}.key`]=l,s[`${t}.${d}.value`]=c}else if(n.model_type==="openelm")for(let l=0;l<o;++l){const c=[r,a[l],0,i];s[`${t}.${l}.key`]=c,s[`${t}.${l}.value`]=c}else{const l=[r,a,0,i];for(let c=0;c<o;++c)s[`${t}.${c}.key`]=l,s[`${t}.${c}.value`]=l}}return s}var Xi=class 
vd{constructor(t){k(this,"model_type",null);k(this,"is_encoder_decoder",!1);k(this,"max_position_embeddings");k(this,"transformers.js_config");Object.assign(this,t),this.normalized_config=gs(this)}static async from_pretrained(t,{progress_callback:r=null,config:s=null,cache_dir:n=null,local_files_only:a=!1,revision:o="main"}={}){s&&!(s instanceof vd)&&(s=new vd(s));const i=s??await WT(t,{progress_callback:r,config:s,cache_dir:n,local_files_only:a,revision:o});return new this(i)}},on=class{static async from_pretrained(...e){return Xi.from_pretrained(...e)}};function Y_(e,t,r){return e?typeof e=="object"&&e!==null?e.hasOwnProperty(t)?+e[t]:e.hasOwnProperty(r)?+e[r]:0:+e:0}function J_(e,t){const r=[];for(let s=0;s<t;++s)r.push(`${e}_data${s===0?"":"_"+s}`);return r}async function QT(e,t,r,s){const n=`${t}${s}.onnx`,a=`${r.subfolder??""}/${n}`;return await ha(e,a,!0,r,fe.IS_NODE_ENV)}async function XT(e,t,r,s,n,a={}){const o=`${t}${r}.onnx`,i=fe.IS_NODE_ENV;let l=[];const c=Y_(n,o,t);if(c>0){if(c>bf)throw new Error(`The number of external data chunks (${c}) exceeds the maximum allowed value (${bf}).`);const d=J_(o,c);for(const h of d){const _=`${s.subfolder??""}/${h}`;l.push(new Promise(async(p,w)=>{const v=await ha(e,_,!0,s,i);p(v instanceof Uint8Array?{path:h,data:v}:h)}))}}else a.externalData!==void 0&&(l=a.externalData.map(async d=>{if(typeof d.data=="string"){const h=await ha(e,d.data,!0,s);return{...d,data:h}}return d}));return Promise.all(l)}async function YT(e,t,r,s=!1,n=void 0){var C;let a=((C=r.config)==null?void 0:C["transformers.js_config"])??{};const o=Vf(r.device??a.device,t,{warn:S=>ue.info(S)}),i=yx(o),l=a.device_config??{};l.hasOwnProperty(o)&&(a={...a,...l[o]});const c=qf(r.dtype??a.dtype,t,o,{configDtype:a.dtype,warn:S=>ue.info(S)});if(Ci.hasOwnProperty(c)){if(o==="webgpu"&&!fe.IS_NODE_ENV&&c===We.fp16&&!await xx())throw new Error(`The device (${o}) does not support fp16.`)}else throw new Error(`Invalid dtype: ${c}. 
Should be one of: ${Object.keys(We).join(", ")}`);const d=a.kv_cache_dtype,h=d?typeof d=="string"?d:d[c]??"float32":void 0;if(h&&!["float32","float16"].includes(h))throw new Error(`Invalid kv_cache_dtype: ${h}. Should be one of: float32, float16`);const _=Ci[c],p={...r.session_options};p.executionProviders??(p.executionProviders=i);const w=a.free_dimension_overrides;w?p.freeDimensionOverrides??(p.freeDimensionOverrides=w):o.startsWith("webnn")&&!p.freeDimensionOverrides&&ue.warn(`WebNN does not currently support dynamic shapes and requires 'free_dimension_overrides' to be set in config.json, preferably as a field within config["transformers.js_config"]["device_config"]["${o}"]. When 'free_dimension_overrides' is not set, you may experience significant performance degradation.`);const v=QT(e,t,r,_),y=r.use_external_data_format??a.use_external_data_format,M=await XT(e,t,_,r,y,p);if(M.length>0&&(!fe.IS_NODE_ENV||M.some(S=>typeof S!="string"))&&(p.externalData=M),s&&o==="webgpu"&&d!==!1){const S=ba(r.config,{prefix:"present",session_name:n});if(Object.keys(S).length>0&&!Ei()){const N={};for(const x in S)N[x]="gpu-buffer";p.preferredOutputLocation=N}}return{buffer_or_path:await v,session_options:p,session_config:{dtype:c,kv_cache_dtype:h,device:o}}}async function JT(e,t,r,s=void 0){return Object.fromEntries(await Promise.all(Object.keys(t).map(async n=>{const a=(s==null?void 0:s[n])??!1,{buffer_or_path:o,session_options:i,session_config:l}=await YT(e,t[n],r,a,n),c=await Bf(o,i,l);return[n,c]})))}function K_(e){for(let t in e)$f(e[t])?e[t]=new U(e[t]):typeof e[t]=="object"&&K_(e[t]);return e}async function xe(e,t){const r=KT(e,t);try{const s=Object.fromEntries(Object.entries(r).map(([a,o])=>{const i=o.ort_tensor;return fe.IS_NODE_ENV&&typeof Float16Array<"u"&&i.cpuData instanceof Float16Array&&(i.cpuData=new Uint16Array(i.cpuData.buffer)),[a,i]})),n=await Gf(e,s);return K_(n)}catch(s){const n=Object.fromEntries(Object.entries(r).map(([a,o])=>{const 
i={type:o.type,dims:o.dims,location:o.location};return i.location!=="gpu-buffer"&&(i.data=o.data),[a,i]}));throw ue.error(`An error occurred during model execution: "${s}".`),ue.error("Inputs given to model:",n),s}}function KT(e,t){const r=Object.create(null),s=[];for(const o of e.inputNames){const i=t[o];if(!(i instanceof U)){s.push(o);continue}r[o]=Ei()?i.clone():i}if(s.length>0)throw new Error(`An error occurred during model execution: "Missing the following inputs: ${s.join(", ")}.`);const n=Object.keys(t).length,a=e.inputNames.length;if(n>a){let o=Object.keys(t).filter(i=>!e.inputNames.includes(i));ue.warn(`WARNING: Too many inputs were provided (${n} > ${a}). The following inputs will be ignored: "${o.join(", ")}".`)}return r}var Ye=class{},ie=class extends Ye{constructor({logits:e,...t}){super(),this.logits=e;const r=Object.values(t);r.length>0&&(this.attentions=r)}},He=class extends Ye{constructor({logits:e}){super(),this.logits=e}},Je=class extends Ye{constructor({logits:e}){super(),this.logits=e}},ht=class extends Ye{constructor({start_logits:e,end_logits:t}){super(),this.start_logits=e,this.end_logits=t}},Jr=class extends Ye{constructor({logits:e}){super(),this.logits=e}},ZT=class extends Ye{constructor({alphas:e}){super(),this.alphas=e}},Wt=class extends vt{_call(e,t){throw Error("`_call` should be implemented in a subclass")}},eE=class extends vt{_call(e,t){throw Error("`_call` should be implemented in a subclass")}},Yi=class extends vt{constructor(){super(),this.processors=[]}push(e){this.processors.push(e)}extend(e){this.processors.push(...e)}_call(e,t){let r=t;for(const s of this.processors)r=s(e,r);return r}[Symbol.iterator](){return this.processors.values()}},tE=class extends Wt{constructor(e){super(),this.bos_token_id=e}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length===1){const s=t[r].data;s.fill(-1/0),s[this.bos_token_id]=0}return t}},rE=class extends 
Wt{/* Forced-EOS processor: when a sequence is exactly one token short of max_length, every logit is set to -Infinity and the EOS id(s) to 0. A single EOS id is wrapped in an array. */constructor(e,t){super(),this.max_length=e,this.eos_token_id=Array.isArray(t)?t:[t]}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length===this.max_length-1){const s=t[r].data;s.fill(-1/0);for(const n of this.eos_token_id)s[n]=0}return t}},/* Sets the logit of every id in suppress_tokens to -Infinity on every call. */sE=class extends Wt{constructor(e){super(),this.suppress_tokens=e}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data;for(const n of this.suppress_tokens)s[n]=-1/0}return t}},/* Sets the logits of begin_suppress_tokens to -Infinity only for sequences whose length equals begin_index. */Z_=class extends Wt{constructor(e,t){super(),this.begin_suppress_tokens=e,this.begin_index=t}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length===this.begin_index){const s=t[r].data;for(const n of this.begin_suppress_tokens)s[n]=-1/0}return t}},/* Timestamp-rule processor. The constructor takes a generation config (e) and the initial token list (t): timestamp ids start right after no_timestamps_token_id, and begin_index is the initial length, reduced by one when the last initial token is no_timestamps_token_id. */nE=class extends Wt{constructor(e,t){super(),this.eos_token_id=Array.isArray(e.eos_token_id)?e.eos_token_id[0]:e.eos_token_id,this.no_timestamps_token_id=e.no_timestamps_token_id,this.timestamp_begin=this.no_timestamps_token_id+1,this.begin_index=t.length,t.at(-1)===this.no_timestamps_token_id&&(this.begin_index-=1),this.max_initial_timestamp_index=e.max_initial_timestamp_index}/* Per sequence: always bans no_timestamps_token_id; at begin_index bans every id below timestamp_begin and moves on; otherwise applies the pairing rule on the last two generated tokens, then bans all ids below timestamp_begin when the summed timestamp probability (log space, via ix) exceeds the best value je reports for the other ids. */_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data;if(s[this.no_timestamps_token_id]=-1/0,e[r].length===this.begin_index){s.subarray(0,this.timestamp_begin).fill(-1/0);continue}const n=e[r].slice(this.begin_index),a=n.length>=1&&n[n.length-1]>=this.timestamp_begin,o=n.length<2||n[n.length-2]>=this.timestamp_begin;/* NOTE(review): the condition is a comma expression; the pairing rule runs first as a side effect. The guarded block repeats the length check that already hit `continue` above, so it looks unreachable — confirm. */if(a&&(o?s.subarray(this.timestamp_begin).fill(-1/0):s.subarray(0,this.eos_token_id).fill(-1/0)),e[r].length===this.begin_index&&this.max_initial_timestamp_index!==null){const d=this.timestamp_begin+this.max_initial_timestamp_index;s.subarray(d+1).fill(-1/0)}const i=ix(s),l=Math.log(i.subarray(this.timestamp_begin).map(Math.exp).reduce((d,h)=>d+h)),c=je(i.subarray(0,this.timestamp_begin))[0];l>c&&s.subarray(0,this.timestamp_begin).fill(-1/0)}return t}},/* N-gram repetition processor configured with no_repeat_ngram_size; getNgrams slides a window of that size over the ids. */aE=class extends Wt{constructor(e){super(),this.no_repeat_ngram_size=e}getNgrams(e){const t=e.length,r=[];for(let n=0;n<t+1-this.no_repeat_ngram_size;++n){const 
a=[];for(let o=0;o<this.no_repeat_ngram_size;++o)a.push(e[n+o]);r.push(a.map(Number))}const s=new Map;for(const n of r){const a=n.slice(0,n.length-1),o=JSON.stringify(a),i=s.get(o)??[];i.push(n[n.length-1]),s.set(o,i)}return s}getGeneratedNgrams(e,t){const r=t.slice(t.length+1-this.no_repeat_ngram_size,t.length);return e.get(JSON.stringify(r.map(Number)))??[]}calcBannedNgramTokens(e){const t=[];if(e.length+1<this.no_repeat_ngram_size)return t;{const r=this.getNgrams(e);return this.getGeneratedNgrams(r,e)}}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data,n=this.calcBannedNgramTokens(e[r]);for(const a of n)s[a]=-1/0}return t}},oE=class extends Wt{constructor(e){super(),this.penalty=e}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data;for(const n of new Set(e[r])){const a=Number(n);s[a]<0?s[a]*=this.penalty:s[a]/=this.penalty}}return t}},iE=class extends Wt{constructor(e,t){super(),this.min_length=e,this.eos_token_id=Array.isArray(t)?t:[t]}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length<this.min_length){const s=t[r].data;for(const n of this.eos_token_id)s[n]=-1/0}return t}},lE=class extends Wt{constructor(e,t,r){super(),this.prompt_length_to_skip=e,this.min_new_tokens=t,this.eos_token_id=Array.isArray(r)?r:[r]}_call(e,t){for(let r=0;r<e.length;++r)if(e[r].length-this.prompt_length_to_skip<this.min_new_tokens){const n=t[r].data;for(const a of this.eos_token_id)n[a]=-1/0}return t}},cE=class extends Wt{constructor(e,t){super(),this.bad_words_ids=e,this.eos_token_id=Array.isArray(t)?t:[t]}_call(e,t){for(let r=0;r<e.length;++r){const s=t[r].data,n=e[r];for(const a of this.bad_words_ids){if(n.length<a.length-1)continue;let o=!0;for(let i=1;i<=a.length-1;++i)if(a.at(-i-1)!=n.at(-i)){o=!1;break}o&&(s[a.at(-1)]=-1/0)}}return t}},uE=class extends Wt{constructor(e){if(super(),e<=1)throw new Error(`Require guidance scale >1 to use the classifier free guidance processor, got guidance scale ${e}.`);this.guidance_scale=e}_call(e,t){if(t.dims[0]!==2*e.length)throw 
new Error(`Logits should have twice the batch size of the input ids, the first half of batches corresponding to the conditional inputs, and the second half of batches corresponding to the unconditional inputs. Got batch size ${t.dims[0]} for the logits and ${e.length} for the input ids.`);const r=e.length,s=t.slice([0,r],null),n=t.slice([r,t.dims[0]],null);for(let a=0;a<n.data.length;++a)n.data[a]+=(s.data[a]-n.data[a])*this.guidance_scale;return n}},dE=class extends eE{constructor(e){super(),this.temperature=e}_call(e,t){const r=t.data;for(let s=0;s<r.length;++s)r[s]/=this.temperature;return t}},ep=class{constructor(e){k(this,"max_length",20);k(this,"max_new_tokens",null);k(this,"min_length",0);k(this,"min_new_tokens",null);k(this,"early_stopping",!1);k(this,"max_time",null);k(this,"do_sample",!1);k(this,"num_beams",1);k(this,"num_beam_groups",1);k(this,"penalty_alpha",null);k(this,"use_cache",!0);k(this,"temperature",1);k(this,"top_k",50);k(this,"top_p",1);k(this,"typical_p",1);k(this,"epsilon_cutoff",0);k(this,"eta_cutoff",0);k(this,"diversity_penalty",0);k(this,"repetition_penalty",1);k(this,"encoder_repetition_penalty",1);k(this,"length_penalty",1);k(this,"no_repeat_ngram_size",0);k(this,"bad_words_ids",null);k(this,"force_words_ids",null);k(this,"renormalize_logits",!1);k(this,"constraints",null);k(this,"forced_bos_token_id",null);k(this,"forced_eos_token_id",null);k(this,"remove_invalid_values",!1);k(this,"exponential_decay_length_penalty",null);k(this,"suppress_tokens",null);k(this,"streamer",null);k(this,"begin_suppress_tokens",null);k(this,"forced_decoder_ids",null);k(this,"guidance_scale",null);k(this,"num_return_sequences",1);k(this,"output_attentions",!1);k(this,"output_hidden_states",!1);k(this,"output_scores",!1);k(this,"return_dict_in_generate",!1);k(this,"pad_token_id",null);k(this,"bos_token_id",null);k(this,"eos_token_id",null);k(this,"encoder_no_repeat_ngram_size",0);k(this,"decoder_start_token_id",null);k(this,"generation_kwargs",{});Object.assi
gn(this,rt(e,Object.getOwnPropertyNames(this)))}},/* Abstract stopping criterion; subclasses must override _call. */Ma=class extends vt{_call(e,t){throw Error("StoppingCriteria needs to be subclassed")}},/* List of stopping criteria. extend() accepts another list, a single criterion or an iterable. _call returns one boolean per sequence that is true once any criterion reported true for it; criteria are invoked as functions (presumably vt makes instances callable — TODO confirm). */tp=class Ov extends vt{constructor(){super(),this.criteria=[]}push(t){this.criteria.push(t)}extend(t){t instanceof Ov?t=t.criteria:t instanceof Ma&&(t=[t]),this.criteria.push(...t)}_call(t,r){const s=new Array(t.length).fill(!1);for(const n of this.criteria){const a=n(t,r);for(let o=0;o<s.length;++o)s[o]||(s[o]=a[o])}return s}[Symbol.iterator](){return this.criteria.values()}},/* Stops a sequence once its length reaches max_length; max_position_embeddings is stored but not read in _call. */hE=class extends Ma{constructor(e,t=null){super(),this.max_length=e,this.max_position_embeddings=t}_call(e){return e.map(t=>t.length>=this.max_length)}},/* Stops a sequence whose last token loosely equals (==) one of the EOS ids; a single id is wrapped in an array. */fE=class extends Ma{constructor(e){super(),Array.isArray(e)||(e=[e]),this.eos_token_id=e}_call(e,t){return e.map(r=>{const s=r.at(-1);return this.eos_token_id.some(n=>s==n)})}},/* Sampler base class. _call delegates to sample(), which subclasses must implement. getLogits slices one row of width dims.at(-1) out of the flat data (the last row when the index is -1). getSampler picks the subclass: do_sample first, then num_beams > 1, otherwise the greedy sampler (which rejects num_return_sequences > 1). */xa=class extends vt{constructor(e){super(),this.generation_config=e}async _call(e){return this.sample(e)}async sample(e){throw Error("sample should be implemented in subclasses.")}getLogits(e,t){let r=e.dims.at(-1),s=e.data;if(t===-1)s=s.slice(-r);else{let n=t*r;s=s.slice(n,n+r)}return s}randomSelect(e){return jM(e)}static getSampler(e){if(e.do_sample)return new pE(e);if(e.num_beams>1)return new mE(e);if(e.num_return_sequences>1)throw Error(`num_return_sequences has to be 1 when doing greedy search, but is ${e.num_return_sequences}.`);return new _E(e)}},/* Greedy sampler: returns a single [BigInt(index), 0] pair, taking the index from the second element of je's result (presumably the arg-max — TODO confirm). */_E=class extends xa{async sample(e){const t=je(e.data)[1];return[[BigInt(t),0]]}},/* Random sampler: keeps at most top_k candidates (when top_k > 0) via _s, converts their values with nt (presumably a softmax — TODO confirm) and draws num_beams [token, log-probability] pairs with randomSelect. */pE=class extends xa{async sample(e){let t=e.dims.at(-1);this.generation_config.top_k>0&&(t=Math.min(this.generation_config.top_k,t));const[r,s]=await _s(e,t),n=nt(r.data);return Array.from({length:this.generation_config.num_beams},()=>{const a=this.randomSelect(n);return[s.data[a],Math.log(n[a])]})}},/* Beam sampler: same candidate selection as above; the returned pairs are built on the following line. */mE=class extends xa{async sample(e){let t=e.dims.at(-1);this.generation_config.top_k>0&&(t=Math.min(this.generation_config.top_k,t));const[r,s]=await _s(e,t),n=nt(r.data);return 
Array.from({length:this.generation_config.num_beams},(a,o)=>[s.data[o],Math.log(n[o])])}},gE=class{constructor(e){if(e)for(const t in e){if(t in this)throw new TypeError(`Key "${t}" conflicts with an existing property on DynamicCache`);const r=e[t];if(!(r instanceof U))throw new TypeError(`Expected a Tensor for key "${t}", got ${typeof r}`);this[t]=r}}get_seq_length(){const e=this;if(Object.keys(e).length===0)return 0;for(const t in e)if(t.startsWith("past_key_values."))return e[t].dims.at(-2);throw new Error("Unable to determine sequence length from the cache.")}update(e){for(const t in e){const r=this[t],s=e[t];r&&r!==s&&r.location==="gpu-buffer"&&r.dispose(),this[t]=s}}async dispose(){const e=[];for(const t of Object.values(this))t.location==="gpu-buffer"&&e.push(t.dispose());await Promise.all(e)}},Ji=gE,q={EncoderOnly:0,EncoderDecoder:1,Seq2Seq:2,Vision2Seq:3,DecoderOnly:4,DecoderOnlyWithoutHead:5,MaskGeneration:6,ImageTextToText:7,Musicgen:8,MultiModality:9,Phi3V:10,AudioTextToText:11,AutoEncoder:12,ImageAudioTextToText:13,Supertonic:14,Chatterbox:15,VoxtralRealtime:16},fr={[q.DecoderOnly]:{sessions:(e,t)=>({model:t.model_file_name??"model"}),cache_sessions:{model:!0},optional_configs:{generation_config:"generation_config.json"}},[q.DecoderOnlyWithoutHead]:{sessions:(e,t)=>({model:t.model_file_name??"model"})},[q.Seq2Seq]:{sessions:()=>({model:"encoder_model",decoder_model_merged:"decoder_model_merged"}),cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.Vision2Seq]:{sessions:()=>({model:"encoder_model",decoder_model_merged:"decoder_model_merged"}),cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.Musicgen]:{sessions:()=>({model:"text_encoder",decoder_model_merged:"decoder_model_merged",encodec_decode:"encodec_decode"}),cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.EncoderDecoder]:{sessions
:()=>({model:"encoder_model",decoder_model_merged:"decoder_model_merged"}),cache_sessions:{decoder_model_merged:!0}},[q.MaskGeneration]:{sessions:()=>({model:"vision_encoder",prompt_encoder_mask_decoder:"prompt_encoder_mask_decoder"})},[q.ImageTextToText]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...fr[q.ImageTextToText].text_only_sessions};return r||(s.vision_encoder="vision_encoder"),e.is_encoder_decoder&&(s.model="encoder_model"),s},cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.AudioTextToText]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...fr[q.AudioTextToText].text_only_sessions};return r||(s.audio_encoder="audio_encoder"),s},cache_sessions:{decoder_model_merged:!0},optional_configs:{generation_config:"generation_config.json"}},[q.ImageAudioTextToText]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...fr[q.ImageAudioTextToText].text_only_sessions};return 
r||(s.audio_encoder="audio_encoder",s.vision_encoder="vision_encoder"),s},optional_configs:{generation_config:"generation_config.json"}},[q.Phi3V]:{sessions:()=>({prepare_inputs_embeds:"prepare_inputs_embeds",model:"model",vision_encoder:"vision_encoder"}),cache_sessions:{model:!0},optional_configs:{generation_config:"generation_config.json"}},[q.MultiModality]:{sessions:()=>({prepare_inputs_embeds:"prepare_inputs_embeds",model:"language_model",lm_head:"lm_head",gen_head:"gen_head",gen_img_embeds:"gen_img_embeds",image_decode:"image_decode"}),cache_sessions:{model:!0},optional_configs:{generation_config:"generation_config.json"}},[q.AutoEncoder]:{sessions:()=>({encoder_model:"encoder_model",decoder_model:"decoder_model"})},[q.Supertonic]:{sessions:()=>({text_encoder:"text_encoder",latent_denoiser:"latent_denoiser",voice_decoder:"voice_decoder"})},[q.Chatterbox]:{sessions:()=>({embed_tokens:"embed_tokens",speech_encoder:"speech_encoder",model:"language_model",conditional_decoder:"conditional_decoder"}),cache_sessions:{model:!0},optional_configs:{generation_config:"generation_config.json"}},[q.VoxtralRealtime]:{text_only_sessions:{embed_tokens:"embed_tokens",decoder_model_merged:"decoder_model_merged"},sessions:(e,t,r)=>{const s={...fr[q.VoxtralRealtime].text_only_sessions};return r||(s.audio_encoder="audio_encoder"),s},cache_sessions:{decoder_model_merged:!0,audio_encoder:!0},optional_configs:{generation_config:"generation_config.json"}},default:{sessions:(e,t)=>({model:t.model_file_name??"model"})}};function wE(e){const t=fr[e];return(t==null?void 0:t.text_only_sessions)??null}function vE(e,t,r={}){const s=fr[e]??fr.default;return{sessions:s.sessions(t,r,r.textOnly??!1),cache_sessions:s.cache_sessions,optional_configs:s.optional_configs}}function rp(e,{warn:t=!0}={}){const r=e.architectures||[];for(const s of r){const n=_r.get(s);if(n!==void 0)return n}if(e.model_type){const s=_r.get(e.model_type);if(s!==void 0)return s;for(const n of 
Object.values(ws))if(n.has(e.model_type)){const a=_r.get(n.get(e.model_type));if(a!==void 0)return a}}if(t){const s=r.length>0?r.join(", "):"(none)";ue.warn(`[resolve_model_type] Architecture(s) not found in MODEL_TYPE_MAPPING: [${s}] for model type '${e.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${pa}`)}return q.EncoderOnly}/* Loads a model config. An explicit config bypasses Af; otherwise the load is routed through Af keyed by the JSON of [id, cache_dir, local_files_only, revision] (presumably a memoizing helper — TODO confirm). */function sp(e,{config:t=null,cache_dir:r=null,local_files_only:s=!1,revision:n="main"}={}){if(t!==null)return on.from_pretrained(e,{config:t,cache_dir:r,local_files_only:s,revision:n});const a=JSON.stringify([e,r,s,n]);return Af(a,()=>on.from_pretrained(e,{config:t,cache_dir:r,local_files_only:s,revision:n}))}/* Lists the files a model needs: "config.json", then for each session its "onnx/<file><dtype suffix>.onnx" plus any external-data chunks reported by Y_/J_, then the optional config files. Device and dtype come from the arguments, falling back to the "transformers.js_config" section. */async function np(e,{config:t=null,dtype:r=null,device:s=null,model_file_name:n=null}={}){t=await sp(e,{config:t});const a=["config.json"],o=t["transformers.js_config"]??{},i=o.use_external_data_format,l="onnx",c=s??o.device;let d=r??o.dtype;const h=rp(t),_=(v,y=null)=>{y=y??v;const M=Vf(c,v),T=qf(d,v,M),A=Ci[T]??"",C=`${y}${A}.onnx`,S=`${l}/${C}`;a.push(S);const N=Y_(i,C,v);for(const x of J_(C,N)){const R=`${l}/${x}`;a.push(R)}},{sessions:p,optional_configs:w}=vE(h,t,{model_file_name:n});for(const[v,y]of Object.entries(p))_(v,y);if(w)for(const v of Object.values(w))a.push(v);return a}/* Module-level registry that stays null until yE installs a value; readers in this file call Object.values(ws) and read ws.MODEL_FOR_* properties. */var ws=null;/* Installs the registry object read through ws. */function yE(e){ws=e}/* Converts token ids to an int64 tensor: a U instance is returned unchanged, a nested array becomes [rows, cols] (all rows must have equal length), a flat array becomes [1, length]. Throws on an empty input. */function Ki(e){if(e instanceof U)return e;if(e.length===0)throw Error("items must be non-empty");if(Array.isArray(e[0])){if(e.some(t=>t.length!==e[0].length))throw Error("Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' and/or 'truncation=True' to have batched tensors with the same length.");return new U("int64",BigInt64Array.from(e.flat().map(t=>BigInt(t))),[e.length,e[0].length])}else return new U("int64",BigInt64Array.from(e.map(t=>BigInt(t))),[1,e.length])}/* Wraps a single value in a "bool" tensor with dims [1]. */function ap(e){return new U("bool",[e],[1])}var 
op={[q.DecoderOnly]:{can_generate:!0,forward:pr,prepare_inputs:ln},[q.DecoderOnlyWithoutHead]:{can_generate:!1,forward:pr,prepare_inputs:ln},[q.Seq2Seq]:{can_generate:!0,forward:ka,prepare_inputs:Ta},[q.Vision2Seq]:{can_generate:!0,forward:ka,prepare_inputs:Ta},[q.Musicgen]:{can_generate:!0,forward:ka},[q.EncoderDecoder]:{can_generate:!1,forward:ka},[q.ImageTextToText]:{can_generate:!0,forward:kE,prepare_inputs:Ea},[q.AudioTextToText]:{can_generate:!0,forward:xE,prepare_inputs:Ea},[q.ImageAudioTextToText]:{can_generate:!0,prepare_inputs:Ea},[q.Phi3V]:{can_generate:!0,prepare_inputs:Ea},[q.MultiModality]:{can_generate:!0},[q.AutoEncoder]:{can_generate:!1,forward:bE},[q.Chatterbox]:{can_generate:!0,forward:Ir},[q.VoxtralRealtime]:{can_generate:!0,prepare_inputs:ln},default:{can_generate:!1,forward:Ir}};function ip(e,t){var i;let r=_r.get(e),s=!1;const n=(i=t==null?void 0:t.architectures)==null?void 0:i[0];if(n&&n!==e&&(e!=null&&e.endsWith("ForCausalLM"))&&n.endsWith("ForConditionalGeneration")){const l=_r.get(n);l!==void 0&&(r=l,s=!0)}const a=op[r]??op.default,o=fr[r]??fr.default;return{typeConfig:{...a,...o},textOnly:s,modelType:r}}var _r=new Map,Zi=new Map,vs=new Map,P=class extends vt{constructor(t,r,s){super();k(this,"main_input_name","input_ids");k(this,"forward_params",["input_ids","attention_mask"]);k(this,"_return_dict_in_generate_keys",null);this.config=t,this.sessions=r,this.configs=s;const n=vs.get(this.constructor),{typeConfig:a}=ip(n,t);this.can_generate=a.can_generate,this._forward=a.forward,this._prepare_inputs_for_generation=a.prepare_inputs,this.can_generate&&this.forward_params.push("past_key_values"),this.custom_config=this.config["transformers.js_config"]??{}}async dispose(){var r;const t=[];for(const s of Object.values(this.sessions))t.push((r=s.release)==null?void 0:r.call(s));return await Promise.all(t)}static async 
from_pretrained(t,{progress_callback:r=null,config:s=null,cache_dir:n=null,local_files_only:a=!1,revision:o="main",model_file_name:i=null,subfolder:l="onnx",device:c=null,dtype:d=null,use_external_data_format:h=null,session_options:_={}}={}){const p={progress_callback:r,config:s,cache_dir:n,local_files_only:a,revision:o,model_file_name:i,subfolder:l,device:c,dtype:d,use_external_data_format:h,session_options:_},w=vs.get(this);s=p.config=await on.from_pretrained(t,p);const{typeConfig:v,textOnly:y,modelType:M}=ip(w,s);if(M===void 0){const S=w??(s==null?void 0:s.model_type);S!=="custom"&&ue.warn(`Model type for '${S}' not found, assuming encoder-only architecture. Please report this at ${pa}.`)}if(r&&!(r instanceof pi)){const S={};try{const N=await np(t,{config:s,dtype:d,device:c,model_file_name:i});(await Promise.all(N.map(R=>tn(t,R,p)))).forEach((R,z)=>{if(R.exists){const $=N[z]==="config.json";S[N[z]]={loaded:$?R.size??0:0,total:R.size??0}}})}catch(N){ue.warn(`Unable to fetch model file metadata for total progress tracking: ${N}`)}Object.keys(S).length>0&&(p.progress_callback=new pi(r,S))}const T=v.sessions(s,p,y),A=[JT(t,T,p,v.cache_sessions)];v.optional_configs&&A.push(EE(t,v.optional_configs,p));const C=await Promise.all(A);return new this(s,...C)}async _call(t){return await this.forward(t)}async forward(t){return await this._forward(this,t)}get generation_config(){var t;return((t=this.configs)==null?void 0:t.generation_config)??null}_get_logits_processor(t,r,s=null){const n=new Yi;if(t.repetition_penalty!==null&&t.repetition_penalty!==1&&n.push(new oE(t.repetition_penalty)),t.no_repeat_ngram_size!==null&&t.no_repeat_ngram_size>0&&n.push(new aE(t.no_repeat_ngram_size)),t.bad_words_ids!==null&&n.push(new cE(t.bad_words_ids,t.eos_token_id)),t.min_length!==null&&t.eos_token_id!==null&&t.min_length>0&&n.push(new iE(t.min_length,t.eos_token_id)),t.min_new_tokens!==null&&t.eos_token_id!==null&&t.min_new_tokens>0&&n.push(new 
lE(r,t.min_new_tokens,t.eos_token_id)),t.forced_bos_token_id!==null&&n.push(new tE(t.forced_bos_token_id)),t.forced_eos_token_id!==null&&n.push(new rE(t.max_length,t.forced_eos_token_id)),t.suppress_tokens!==null&&n.push(new sE(t.suppress_tokens)),t.begin_suppress_tokens!==null){const a=r>1||t.forced_bos_token_id===null?r:r+1;n.push(new Z_(t.begin_suppress_tokens,a))}return t.guidance_scale!==null&&t.guidance_scale>1&&n.push(new uE(t.guidance_scale)),t.temperature===0&&t.do_sample&&(ue.warn("`do_sample` changed to false because `temperature: 0` implies greedy sampling (always selecting the most likely token), which is incompatible with `do_sample: true`."),t.do_sample=!1),t.do_sample&&t.temperature!==null&&t.temperature!==1&&n.push(new dE(t.temperature)),s!==null&&n.extend(s),n}_prepare_generation_config(t,r,s=ep){const n={...this.config};for(const o of["decoder","generator","text_config"])o in n&&Object.assign(n,n[o]);const a=new s(n);return Object.assign(a,this.generation_config??{}),t&&Object.assign(a,t),r&&Object.assign(a,rt(r,Object.getOwnPropertyNames(a))),a}_get_stopping_criteria(t,r=null){const s=new tp;return t.max_length!==null&&s.push(new hE(t.max_length,this.config.max_position_embeddings??null)),t.eos_token_id!==null&&s.push(new fE(t.eos_token_id)),r&&s.extend(r),s}_validate_model_class(){if(!this.can_generate){const t=[ws.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES,ws.MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES,ws.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,ws.MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES].filter(Boolean),r=vs.get(this.constructor),s=new Set,n=this.config.model_type;for(const o of t){const i=o==null?void 0:o.get(n);i&&s.add(i)}let a=`The current model class (${r}) is not compatible with \`.generate()\`, as it doesn't have a language model head.`;throw s.size>0&&(a+=` Please use the following class instead: ${[...s].join(", ")}`),Error(a)}}prepare_inputs_for_generation(...t){if(!this._prepare_inputs_for_generation)throw new 
Error("prepare_inputs_for_generation is not implemented for this model.");return this._prepare_inputs_for_generation(this,...t)}_update_model_kwargs_for_generation({generated_input_ids:t,outputs:r,model_inputs:s,is_encoder_decoder:n}){return s.past_key_values=el(r,s.past_key_values),s.input_ids=new U("int64",t.flat(),[t.length,1]),n?"decoder_attention_mask"in s&&(s.decoder_attention_mask=ze([s.decoder_attention_mask,yt([s.decoder_attention_mask.dims[0],1])],1)):s.attention_mask=ze([s.attention_mask,yt([s.attention_mask.dims[0],1])],1),s.position_ids=null,s}_prepare_model_inputs({inputs:t,bos_token_id:r,model_kwargs:s}){const n=rt(s,this.forward_params),a=this.main_input_name;if(a in n){if(t)throw new Error("`inputs`: {inputs}` were passed alongside {input_name} which is not allowed. Make sure to either pass {inputs} or {input_name}=...")}else n[a]=t;return{inputs_tensor:n[a],model_inputs:n,model_input_name:a}}async _prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:t,model_inputs:r,model_input_name:s,generation_config:n}){if(this.sessions.model.inputNames.includes("inputs_embeds")&&!r.inputs_embeds&&"_prepare_inputs_embeds"in this){const{input_ids:o,pixel_values:i,attention_mask:l,...c}=r,d=await this._prepare_inputs_embeds(r);r={...c,...rt(d,["inputs_embeds","attention_mask"])}}let{last_hidden_state:a}=await Ir(this,r);if(n.guidance_scale!==null&&n.guidance_scale>1)a=ze([a,Li(a,0)],0),"attention_mask"in r&&(r.attention_mask=ze([r.attention_mask,Kf(r.attention_mask)],0));else if(r.decoder_input_ids){const o=Ki(r.decoder_input_ids).dims[0];if(o!==a.dims[0]){if(a.dims[0]!==1)throw new Error(`The encoder outputs have a different batch size (${a.dims[0]}) than the decoder inputs (${o}).`);a=ze(Array.from({length:o},()=>a),0)}}return r.encoder_outputs=a,r}_prepare_decoder_input_ids_for_generation({batch_size:t,model_input_name:r,model_kwargs:s,decoder_start_token_id:n,bos_token_id:a,generation_config:o}){let{decoder_input_ids:i,...l}=s;if(!(i instanceof 
U)){if(i)Array.isArray(i[0])||(i=Array.from({length:t},()=>i));else if(n??(n=a),this.config.model_type==="musicgen")i=Array.from({length:t*this.config.decoder.num_codebooks},()=>[n]);else if(Array.isArray(n)){if(n.length!==t)throw new Error(`\`decoder_start_token_id\` expcted to have length ${t} but got ${n.length}`);i=n}else i=Array.from({length:t},()=>[n]);i=Ki(i)}return l.decoder_attention_mask=Yf(i),{input_ids:i,model_inputs:l}}async generate({inputs:t=null,generation_config:r=null,logits_processor:s=null,stopping_criteria:n=null,streamer:a=null,...o}){this._validate_model_class(),r=this._prepare_generation_config(r,o);let{inputs_tensor:i,model_inputs:l,model_input_name:c}=this._prepare_model_inputs({inputs:t,model_kwargs:o});const d=this.config.is_encoder_decoder;d&&("encoder_outputs"in l||(l=await this._prepare_encoder_decoder_kwargs_for_generation({inputs_tensor:i,model_inputs:l,model_input_name:c,generation_config:r})));let h;d?{input_ids:h,model_inputs:l}=this._prepare_decoder_input_ids_for_generation({batch_size:l[c].dims.at(0),model_input_name:c,model_kwargs:l,decoder_start_token_id:r.decoder_start_token_id,bos_token_id:r.bos_token_id,generation_config:r}):h=l[c];let _=h.dims.at(-1);r.max_new_tokens!==null&&(r.max_length=_+r.max_new_tokens);const p=this._get_logits_processor(r,_,s),w=this._get_stopping_criteria(r,n),v=l[c].dims.at(0),y=xa.getSampler(r),M=new Array(v).fill(0),T=h.tolist();a&&a.put(T);let A,C={},S={};for(;;){if(l=this.prepare_inputs_for_generation(T,l,r),A=await this.forward(l),r.return_dict_in_generate)if(r.output_attentions){const I=ME(A);for(const te in I)te in C||(C[te]=[]),C[te].push(I[te])}else this._return_dict_in_generate_keys&&Object.assign(S,rt(A,this._return_dict_in_generate_keys));const $=A.logits.slice(null,-1,null).to("float32"),Q=p(T,$),H=[];for(let I=0;I<Q.dims.at(0);++I){const te=Q[I],W=await y(te);for(const[ee,G]of W){const 
L=BigInt(ee);M[I]+=G,T[I].push(L),H.push([L]);break}}if(a&&a.put(H),w(T).every(I=>I))break;l=this._update_model_kwargs_for_generation({generated_input_ids:H,outputs:A,model_inputs:l,is_encoder_decoder:d})}a&&a.end();const N=new U("int64",T.flat(),[T.length,T[0].length]),x=el(A,l.past_key_values),R=new Set(Object.values(x));for(const $ of Object.values(A))$.location==="gpu-buffer"&&!R.has($)&&$.dispose();return"past_key_values"in o||r.return_dict_in_generate||await x.dispose(),r.return_dict_in_generate?{sequences:N,past_key_values:x,...C,...S}:N}async _encode_input(t,r,s){if(!Object.hasOwn(this.sessions,t))throw new Error(`Model does not have a ${t} session.`);const n=this.sessions[t];return(await xe(n,rt(r,n.inputNames)))[s]}async encode_image(t){return this._encode_input("vision_encoder",t,"image_features")}async encode_text(t){return this._encode_input("embed_tokens",t,"inputs_embeds")}async encode_audio(t){return this._encode_input("audio_encoder",t,"audio_features")}};async function ka(e,t){let{encoder_outputs:r,input_ids:s,decoder_input_ids:n,decoder_attention_mask:a,...o}=t;if(!r){const i=rt(t,e.sessions.model.inputNames);r=(await Ir(e,i)).last_hidden_state}return o.input_ids=n,o.encoder_hidden_states=r,e.sessions.decoder_model_merged.inputNames.includes("encoder_attention_mask")&&(o.encoder_attention_mask=t.attention_mask),a&&!o.attention_mask&&(o.attention_mask=a),await pr(e,o,!0)}async function Ir(e,t){const r=e.sessions.model,s=rt(t,r.inputNames);if(r.inputNames.includes("inputs_embeds")&&!s.inputs_embeds){if(!t.input_ids)throw new Error("Both `input_ids` and `inputs_embeds` are missing in the model inputs.");s.inputs_embeds=await e.encode_text({input_ids:t.input_ids})}if(r.inputNames.includes("token_type_ids")&&!s.token_type_ids){if(!s.input_ids)throw new Error("Both `input_ids` and `token_type_ids` are missing in the model inputs.");s.token_type_ids=Kf(s.input_ids)}if(r.inputNames.includes("pixel_mask")&&!s.pixel_mask){if(!s.pixel_values)throw new 
Error("Both `pixel_values` and `pixel_mask` are missing in the model inputs.");const n=s.pixel_values.dims;s.pixel_mask=yt([n[0],n[2],n[3]])}return await xe(r,s)}async function bE(e,t){const r=await e.encode(t);return await e.decode(r)}function el(e,t){const r=Object.create(null);for(const s in e)if(s.startsWith("present")){const n=s.replace("present_ssm","past_ssm").replace("present_conv","past_conv").replace("present_recurrent","past_recurrent").replace("present","past_key_values");s.includes("encoder")&&t?r[n]=t[n]:r[n]=e[s]}return t?(t.update(r),t):new Ji(r)}function ME(e){const t={};for(const r of["cross_attentions","encoder_attentions","decoder_attentions"])for(const s in e)s.startsWith(r)&&(r in t||(t[r]=[]),t[r].push(e[s]));return t}function tl(e,t,r){var c,d,h;if(r&&Object.keys(r).length>0)return Object.assign(t,r),r;const s=e.sessions.decoder_model_merged??e.sessions.model,n=((d=(c=t[e.main_input_name]??t.attention_mask)==null?void 0:c.dims)==null?void 0:d[0])??1,a=((h=s==null?void 0:s.config)==null?void 0:h.kv_cache_dtype)??"float32",o=a==="float16"?fs.float16:fs.float32,i=ba(e.config,{batch_size:n}),l=Object.create(null);for(const _ in i){const p=i[_].reduce((v,y)=>v*y,1),w=new U(a,new o(p),i[_]);t[_]=w,l[_]=w}return r?(r.update(l),r):new Ji(l)}async function pr(e,t,r=!1){const s=e.sessions[r?"decoder_model_merged":"model"],{past_key_values:n,...a}=t;if(s.inputNames.includes("use_cache_branch")&&(a.use_cache_branch=ap(n!=null&&Object.keys(n).length>0)),s.inputNames.includes("position_ids")&&a.attention_mask&&!a.position_ids){const i=["paligemma","gemma3_text","gemma3"].includes(e.config.model_type)?1:0;a.position_ids=TE(a,n,i)}s.inputNames.includes("num_logits_to_keep")&&!a.num_logits_to_keep&&(a.num_logits_to_keep=new U("int64",[0n],[])),tl(e,a,n);const o=rt(a,s.inputNames);return await xe(s,o)}async function 
lp(e,{encode_function:t,merge_function:r,modality_input_names:s,modality_output_name:n,input_ids:a=null,attention_mask:o=null,position_ids:i=null,inputs_embeds:l=null,past_key_values:c=null,generation_config:d=null,logits_processor:h=null,..._}){if(!l){l=await e.encode_text({input_ids:a,..._});const w=rt(_,s);if(Object.keys(w).length>0){if(a.dims[1]!==1){const v=await t({...w,..._});({inputs_embeds:l,attention_mask:o}=r({[n]:v,inputs_embeds:l,input_ids:a,attention_mask:o}))}else if(c&&a.dims[1]===1){const v=a.dims[1],y=c.get_seq_length();o=ze([yt([a.dims[0],y]),o.slice(null,[o.dims[1]-v,o.dims[1]])],1)}}}if(!i&&["qwen2_vl","qwen2_vl_text","qwen2_5_vl","qwen2_5_vl_text","qwen3_vl","qwen3_vl_text","qwen3_vl_moe","qwen3_vl_moe_text","qwen3_5","qwen3_5_text","qwen3_5_moe","qwen3_5_moe_text","glm_ocr","glm_ocr_text"].includes(e.config.model_type)){const{image_grid_thw:w,video_grid_thw:v}=_;[i]=e.get_rope_index(a,w,v,o)}return await pr(e,{inputs_embeds:l,past_key_values:c,attention_mask:o,position_ids:i,generation_config:d,logits_processor:h},!0)}async function xE(e,t){return await lp(e,{...t,modality_input_names:["audio_values","input_features"],modality_output_name:"audio_features",encode_function:e.encode_audio.bind(e),merge_function:e._merge_input_ids_with_audio_features.bind(e)})}async function kE(e,t){return await lp(e,{...t,modality_input_names:["pixel_values"],modality_output_name:"image_features",encode_function:e.encode_image.bind(e),merge_function:e._merge_input_ids_with_image_features.bind(e)})}function cp(e,t=0){const[r,s]=e.dims,n=e.data,a=new BigInt64Array(n.length);for(let o=0;o<r;++o){const i=o*s;let l=BigInt(t);for(let c=0;c<s;++c){const d=i+c;n[d]===0n?a[d]=BigInt(1):(a[d]=l,l+=n[d])}}return{data:a,dims:e.dims}}function TE(e,t=null,r=0){const{input_ids:s,inputs_embeds:n,attention_mask:a}=e,{data:o,dims:i}=cp(a,r);let l=new U("int64",o,i);if(t){const c=-(s??n).dims.at(1);l=l.slice(null,[c,null])}return l}function ln(e,t,r,s){const 
n=r.past_key_values?r.past_key_values.get_seq_length():0,a=e.sessions.decoder_model_merged??e.sessions.model;if(a!=null&&a.inputNames.includes("num_logits_to_keep")&&!r.num_logits_to_keep&&(r.num_logits_to_keep=new U("int64",[1n],[])),!r.attention_mask){let o;for(const i of["input_ids","inputs_embeds","position_ids"])if(r[i]){o=r[i].dims;break}if(!o)throw new Error("attention_mask is not provided, and unable to infer its shape from model inputs.");r.attention_mask=yt([o[0],n+o[1]])}if(r.past_key_values){const{input_ids:o,attention_mask:i}=r;i&&i.dims[1]>o.dims[1]||n<o.dims[1]&&(r.input_ids=o.slice(null,[n,null]))}return r}function Ta(e,t,r,s){return r.past_key_values&&(t=t.map(n=>[n.at(-1)])),{...r,decoder_input_ids:Ki(t)}}function Ea(e,...t){return e.config.is_encoder_decoder?Ta(e,...t):ln(e,...t)}function up({modality_token_id:e,inputs_embeds:t,modality_features:r,input_ids:s,attention_mask:n}){const a=s.tolist().map(c=>c.reduce((d,h,_)=>(h==e&&d.push(_),d),[])),o=a.reduce((c,d)=>c+d.length,0),i=r.dims[0];if(o!==i)throw new Error(`Number of tokens and features do not match: tokens: ${o}, features ${i}`);let l=0;for(let c=0;c<a.length;++c){const d=a[c],h=t[c];for(let _=0;_<d.length;++_)h[d[_]].data.set(r[l++].data)}return{inputs_embeds:t,attention_mask:n}}function rl({image_token_id:e,inputs_embeds:t,image_features:r,input_ids:s,attention_mask:n}){return up({modality_token_id:e,inputs_embeds:t,modality_features:r,input_ids:s,attention_mask:n})}function dp({audio_token_id:e,inputs_embeds:t,audio_features:r,input_ids:s,attention_mask:n}){return up({modality_token_id:e,inputs_embeds:t,modality_features:r,input_ids:s,attention_mask:n})}async function EE(e,t,r){return Object.fromEntries(await Promise.all(Object.keys(t).map(async s=>{const n=await er(e,t[s],!1,r);return[s,n]})))}var 
sl={};os(sl,{ASTForAudioClassification:()=>BE,ASTModel:()=>zE,ASTPreTrainedModel:()=>il,AfmoeForCausalLM:()=>OE,AfmoeModel:()=>IE,AfmoePreTrainedModel:()=>al,AlbertForMaskedLM:()=>PE,AlbertForQuestionAnswering:()=>SE,AlbertForSequenceClassification:()=>CE,AlbertModel:()=>AE,AlbertPreTrainedModel:()=>cn,ApertusForCausalLM:()=>LE,ApertusModel:()=>FE,ApertusPreTrainedModel:()=>nl,ArceeForCausalLM:()=>DE,ArceeModel:()=>NE,ArceePreTrainedModel:()=>ol,BartForConditionalGeneration:()=>GE,BartForSequenceClassification:()=>$E,BartModel:()=>RE,BartPretrainedModel:()=>Aa,BeitForImageClassification:()=>UE,BeitModel:()=>VE,BeitPreTrainedModel:()=>ll,BertForMaskedLM:()=>qE,BertForQuestionAnswering:()=>QE,BertForSequenceClassification:()=>WE,BertForTokenClassification:()=>HE,BertModel:()=>jE,BertPreTrainedModel:()=>ys,BlenderbotForConditionalGeneration:()=>YE,BlenderbotModel:()=>XE,BlenderbotPreTrainedModel:()=>cl,BlenderbotSmallForConditionalGeneration:()=>KE,BlenderbotSmallModel:()=>JE,BlenderbotSmallPreTrainedModel:()=>ul,BloomForCausalLM:()=>e2,BloomModel:()=>ZE,BloomPreTrainedModel:()=>dl,CHMv2ForDepthEstimation:()=>l2,CHMv2PreTrainedModel:()=>mp,CLIPModel:()=>u2,CLIPPreTrainedModel:()=>Kr,CLIPSegForImageSegmentation:()=>p2,CLIPSegModel:()=>_2,CLIPSegPreTrainedModel:()=>hl,CLIPTextModel:()=>d2,CLIPTextModelWithProjection:()=>vp,CLIPVisionModel:()=>h2,CLIPVisionModelWithProjection:()=>f2,CamembertForMaskedLM:()=>r2,CamembertForQuestionAnswering:()=>a2,CamembertForSequenceClassification:()=>s2,CamembertForTokenClassification:()=>n2,CamembertModel:()=>t2,CamembertPreTrainedModel:()=>bs,ChatterboxModel:()=>_p,ChatterboxPreTrainedModel:()=>fp,ChineseCLIPModel:()=>i2,ChineseCLIPPreTrainedModel:()=>pp,ClapAudioModelWithProjection:()=>wp,ClapModel:()=>c2,ClapPreTrainedModel:()=>Ca,ClapTextModelWithProjection:()=>gp,CodeGenForCausalLM:()=>g2,CodeGenModel:()=>m2,CodeGenPreTrainedModel:()=>fl,Cohere2ForCausalLM:()=>b2,Cohere2Model:()=>y2,Cohere2PreTrainedModel:()=>pl,CohereAsrForConditi
onalGeneration:()=>x2,CohereAsrModel:()=>M2,CohereAsrPreTrainedModel:()=>ml,CohereForCausalLM:()=>v2,CohereModel:()=>w2,CoherePreTrainedModel:()=>_l,ConvBertForMaskedLM:()=>T2,ConvBertForQuestionAnswering:()=>C2,ConvBertForSequenceClassification:()=>E2,ConvBertForTokenClassification:()=>A2,ConvBertModel:()=>k2,ConvBertPreTrainedModel:()=>Ms,ConvNextForImageClassification:()=>P2,ConvNextModel:()=>S2,ConvNextPreTrainedModel:()=>gl,ConvNextV2ForImageClassification:()=>L2,ConvNextV2Model:()=>F2,ConvNextV2PreTrainedModel:()=>wl,DFineForObjectDetection:()=>D2,DFineModel:()=>N2,DFinePreTrainedModel:()=>yl,DINOv3ConvNextModel:()=>lA,DINOv3ConvNextPreTrainedModel:()=>Cp,DINOv3ViTModel:()=>cA,DINOv3ViTPreTrainedModel:()=>Sp,DPTForDepthEstimation:()=>gA,DPTModel:()=>mA,DPTPreTrainedModel:()=>El,DacDecoderModel:()=>xp,DacDecoderOutput:()=>bp,DacEncoderModel:()=>Mp,DacEncoderOutput:()=>yp,DacModel:()=>z2,DacPreTrainedModel:()=>Sa,DebertaForMaskedLM:()=>R2,DebertaForQuestionAnswering:()=>V2,DebertaForSequenceClassification:()=>G2,DebertaForTokenClassification:()=>$2,DebertaModel:()=>B2,DebertaPreTrainedModel:()=>xs,DebertaV2ForMaskedLM:()=>W2,DebertaV2ForQuestionAnswering:()=>X2,DebertaV2ForSequenceClassification:()=>H2,DebertaV2ForTokenClassification:()=>Q2,DebertaV2Model:()=>q2,DebertaV2PreTrainedModel:()=>ks,DecisionTransformerModel:()=>Y2,DecisionTransformerPreTrainedModel:()=>kp,DeepseekV3ForCausalLM:()=>j2,DeepseekV3Model:()=>U2,DeepseekV3PreTrainedModel:()=>bl,DeiTForImageClassification:()=>K2,DeiTModel:()=>J2,DeiTPreTrainedModel:()=>Ml,DepthAnythingForDepthEstimation:()=>Z2,DepthAnythingPreTrainedModel:()=>Tp,DepthProForDepthEstimation:()=>eA,DepthProPreTrainedModel:()=>Ep,DetrForObjectDetection:()=>rA,DetrForSegmentation:()=>sA,DetrModel:()=>tA,DetrObjectDetectionOutput:()=>xl,DetrPreTrainedModel:()=>Pa,DetrSegmentationOutput:()=>Ap,Dinov2ForImageClassification:()=>aA,Dinov2Model:()=>nA,Dinov2PreTrainedModel:()=>kl,Dinov2WithRegistersForImageClassification:()=>iA,Dinov2W
ithRegistersModel:()=>oA,Dinov2WithRegistersPreTrainedModel:()=>Tl,DistilBertForMaskedLM:()=>_A,DistilBertForQuestionAnswering:()=>fA,DistilBertForSequenceClassification:()=>dA,DistilBertForTokenClassification:()=>hA,DistilBertModel:()=>uA,DistilBertPreTrainedModel:()=>Ts,DonutSwinModel:()=>pA,DonutSwinPreTrainedModel:()=>Pp,EdgeTamModel:()=>EF,EfficientNetForImageClassification:()=>vA,EfficientNetModel:()=>wA,EfficientNetPreTrainedModel:()=>Al,ElectraForMaskedLM:()=>bA,ElectraForQuestionAnswering:()=>kA,ElectraForSequenceClassification:()=>MA,ElectraForTokenClassification:()=>xA,ElectraModel:()=>yA,ElectraPreTrainedModel:()=>Es,Ernie4_5ForCausalLM:()=>EA,Ernie4_5Model:()=>TA,Ernie4_5PretrainedModel:()=>Cl,EsmForMaskedLM:()=>CA,EsmForSequenceClassification:()=>SA,EsmForTokenClassification:()=>PA,EsmModel:()=>AA,EsmPreTrainedModel:()=>dn,EuroBertForMaskedLM:()=>LA,EuroBertForSequenceClassification:()=>IA,EuroBertForTokenClassification:()=>OA,EuroBertModel:()=>FA,EuroBertPreTrainedModel:()=>hn,ExaoneForCausalLM:()=>DA,ExaoneModel:()=>NA,ExaonePreTrainedModel:()=>Sl,FalconForCausalLM:()=>BA,FalconH1ForCausalLM:()=>GA,FalconH1Model:()=>RA,FalconH1PreTrainedModel:()=>Fl,FalconModel:()=>zA,FalconPreTrainedModel:()=>Pl,FastViTForImageClassification:()=>VA,FastViTModel:()=>$A,FastViTPreTrainedModel:()=>Ll,Florence2ForConditionalGeneration:()=>UA,Florence2PreTrainedModel:()=>Fp,GLPNForDepthEstimation:()=>oC,GLPNModel:()=>aC,GLPNPreTrainedModel:()=>Gl,GPT2LMHeadModel:()=>mC,GPT2Model:()=>pC,GPT2PreTrainedModel:()=>ql,GPTBigCodeForCausalLM:()=>lC,GPTBigCodeModel:()=>iC,GPTBigCodePreTrainedModel:()=>$l,GPTJForCausalLM:()=>wC,GPTJModel:()=>gC,GPTJPreTrainedModel:()=>Wl,GPTNeoForCausalLM:()=>uC,GPTNeoModel:()=>cC,GPTNeoPreTrainedModel:()=>Vl,GPTNeoXForCausalLM:()=>hC,GPTNeoXModel:()=>dC,GPTNeoXPreTrainedModel:()=>Ul,Gemma2ForCausalLM:()=>HA,Gemma2Model:()=>WA,Gemma2PreTrainedModel:()=>Ol,Gemma3ForCausalLM:()=>JA,Gemma3ForConditionalGeneration:()=>Op,Gemma3Model:()=>YA,Gemma3PreTr
ainedModel:()=>Ip,Gemma3nForCausalLM:()=>KA,Gemma3nForConditionalGeneration:()=>Fa,Gemma3nPreTrainedModel:()=>Np,Gemma4ForCausalLM:()=>ZA,Gemma4ForConditionalGeneration:()=>Nl,GemmaForCausalLM:()=>qA,GemmaModel:()=>jA,GemmaPreTrainedModel:()=>Il,GlmForCausalLM:()=>tC,GlmModel:()=>eC,GlmMoeDsaForCausalLM:()=>sC,GlmMoeDsaModel:()=>rC,GlmMoeDsaPreTrainedModel:()=>zl,GlmOcrForConditionalGeneration:()=>nC,GlmPreTrainedModel:()=>Dl,GptOssForCausalLM:()=>_C,GptOssModel:()=>fC,GptOssPreTrainedModel:()=>jl,GraniteForCausalLM:()=>yC,GraniteModel:()=>vC,GraniteMoeHybridForCausalLM:()=>MC,GraniteMoeHybridModel:()=>bC,GraniteMoeHybridPreTrainedModel:()=>Ql,GranitePreTrainedModel:()=>Hl,GraniteSpeechForConditionalGeneration:()=>xC,GroundingDinoForObjectDetection:()=>kC,GroundingDinoPreTrainedModel:()=>Gp,GroupViTModel:()=>TC,GroupViTPreTrainedModel:()=>$p,HeliumForCausalLM:()=>AC,HeliumModel:()=>EC,HeliumPreTrainedModel:()=>Yl,HieraForImageClassification:()=>SC,HieraModel:()=>CC,HieraPreTrainedModel:()=>Jl,HubertForCTC:()=>DC,HubertForSequenceClassification:()=>zC,HubertModel:()=>NC,HubertPreTrainedModel:()=>OC,HunYuanDenseV1ForCausalLM:()=>RC,HunYuanDenseV1Model:()=>BC,HunYuanDenseV1PreTrainedModel:()=>Kl,IJepaForImageClassification:()=>$C,IJepaModel:()=>GC,IJepaPreTrainedModel:()=>Zl,Idefics3ForConditionalGeneration:()=>Vp,JAISLMHeadModel:()=>UC,JAISModel:()=>VC,JAISPreTrainedModel:()=>ec,JinaCLIPModel:()=>jC,JinaCLIPPreTrainedModel:()=>La,JinaCLIPTextModel:()=>Up,JinaCLIPVisionModel:()=>qC,Lfm2ForCausalLM:()=>HC,Lfm2Model:()=>WC,Lfm2MoeForCausalLM:()=>YC,Lfm2MoeModel:()=>XC,Lfm2MoePreTrainedModel:()=>rc,Lfm2PreTrainedModel:()=>tc,Lfm2VlForConditionalGeneration:()=>JC,LightOnOcrForConditionalGeneration:()=>QC,LiteWhisperForConditionalGeneration:()=>HL,Llama4ForCausalLM:()=>eS,Llama4PreTrainedModel:()=>jp,LlamaForCausalLM:()=>ZC,LlamaModel:()=>KC,LlamaPreTrainedModel:()=>sc,LlavaForConditionalGeneration:()=>mr,LlavaOnevisionForConditionalGeneration:()=>mr,LlavaPreTrainedModel:()
=>Lp,LlavaQwen2ForCausalLM:()=>XA,LongT5ForConditionalGeneration:()=>rS,LongT5Model:()=>tS,LongT5PreTrainedModel:()=>nc,M2M100ForConditionalGeneration:()=>nS,M2M100Model:()=>sS,M2M100PreTrainedModel:()=>ac,MBartForCausalLM:()=>hS,MBartForConditionalGeneration:()=>uS,MBartForSequenceClassification:()=>dS,MBartModel:()=>cS,MBartPreTrainedModel:()=>fn,MPNetForMaskedLM:()=>KS,MPNetForQuestionAnswering:()=>tP,MPNetForSequenceClassification:()=>ZS,MPNetForTokenClassification:()=>eP,MPNetModel:()=>JS,MPNetPreTrainedModel:()=>As,MT5ForConditionalGeneration:()=>aP,MT5Model:()=>nP,MT5PreTrainedModel:()=>mc,MarianMTModel:()=>oS,MarianModel:()=>aS,MarianPreTrainedModel:()=>oc,MaskFormerForInstanceSegmentation:()=>lS,MaskFormerModel:()=>iS,MaskFormerPreTrainedModel:()=>ic,Metric3DForDepthEstimation:()=>fS,Metric3DPreTrainedModel:()=>qp,Metric3Dv2ForDepthEstimation:()=>_S,Metric3Dv2PreTrainedModel:()=>Wp,MgpstrForSceneTextRecognition:()=>pS,MgpstrModelOutput:()=>Hp,MgpstrPreTrainedModel:()=>Qp,MimiDecoderModel:()=>Kp,MimiDecoderOutput:()=>Yp,MimiEncoderModel:()=>Jp,MimiEncoderOutput:()=>Xp,MimiModel:()=>mS,MimiPreTrainedModel:()=>Ia,Mistral4ForCausalLM:()=>yS,Mistral4Model:()=>vS,Mistral4PreTrainedModel:()=>cc,MistralForCausalLM:()=>wS,MistralModel:()=>gS,MistralPreTrainedModel:()=>lc,MobileBertForMaskedLM:()=>MS,MobileBertForQuestionAnswering:()=>kS,MobileBertForSequenceClassification:()=>xS,MobileBertModel:()=>bS,MobileBertPreTrainedModel:()=>_n,MobileLLMForCausalLM:()=>ES,MobileLLMModel:()=>TS,MobileLLMPreTrainedModel:()=>uc,MobileNetV1ForImageClassification:()=>CS,MobileNetV1ForSemanticSegmentation:()=>SS,MobileNetV1Model:()=>AS,MobileNetV1PreTrainedModel:()=>Oa,MobileNetV2ForImageClassification:()=>FS,MobileNetV2ForSemanticSegmentation:()=>LS,MobileNetV2Model:()=>PS,MobileNetV2PreTrainedModel:()=>Na,MobileNetV3ForImageClassification:()=>OS,MobileNetV3ForSemanticSegmentation:()=>NS,MobileNetV3Model:()=>IS,MobileNetV3PreTrainedModel:()=>Da,MobileNetV4ForImageClassification:()=
>zS,MobileNetV4ForSemanticSegmentation:()=>BS,MobileNetV4Model:()=>DS,MobileNetV4PreTrainedModel:()=>za,MobileViTForImageClassification:()=>GS,MobileViTModel:()=>RS,MobileViTPreTrainedModel:()=>dc,MobileViTV2ForImageClassification:()=>VS,MobileViTV2Model:()=>$S,MobileViTV2PreTrainedModel:()=>hc,ModernBertDecoderForCausalLM:()=>QS,ModernBertDecoderModel:()=>HS,ModernBertDecoderPreTrainedModel:()=>fc,ModernBertForMaskedLM:()=>jS,ModernBertForSequenceClassification:()=>qS,ModernBertForTokenClassification:()=>WS,ModernBertModel:()=>US,ModernBertPreTrainedModel:()=>pn,Moondream1ForConditionalGeneration:()=>QA,MoonshineForConditionalGeneration:()=>YS,MoonshineModel:()=>XS,MoonshinePreTrainedModel:()=>_c,MptForCausalLM:()=>sP,MptModel:()=>rP,MptPreTrainedModel:()=>pc,MultiModalityCausalLM:()=>oP,MultiModalityPreTrainedModel:()=>Zp,MusicgenForCausalLM:()=>lP,MusicgenForConditionalGeneration:()=>em,MusicgenModel:()=>iP,MusicgenPreTrainedModel:()=>gc,NanoChatForCausalLM:()=>uP,NanoChatModel:()=>cP,NanoChatPreTrainedModel:()=>wc,NemotronHForCausalLM:()=>hP,NemotronHModel:()=>dP,NemotronHPreTrainedModel:()=>vc,NeoBertForMaskedLM:()=>_P,NeoBertForQuestionAnswering:()=>gP,NeoBertForSequenceClassification:()=>pP,NeoBertForTokenClassification:()=>mP,NeoBertModel:()=>fP,NeoBertPreTrainedModel:()=>Cs,NomicBertModel:()=>wP,NomicBertPreTrainedModel:()=>tm,OPTForCausalLM:()=>PP,OPTModel:()=>SP,OPTPreTrainedModel:()=>Tc,Olmo2ForCausalLM:()=>MP,Olmo2Model:()=>bP,Olmo2PreTrainedModel:()=>bc,Olmo3ForCausalLM:()=>kP,Olmo3Model:()=>xP,Olmo3PreTrainedModel:()=>Mc,OlmoForCausalLM:()=>yP,OlmoHybridForCausalLM:()=>EP,OlmoHybridModel:()=>TP,OlmoHybridPreTrainedModel:()=>xc,OlmoModel:()=>vP,OlmoPreTrainedModel:()=>yc,OpenELMForCausalLM:()=>CP,OpenELMModel:()=>AP,OpenELMPreTrainedModel:()=>kc,OwlViTForObjectDetection:()=>OP,OwlViTModel:()=>IP,OwlViTPreTrainedModel:()=>Ac,Owlv2ForObjectDetection:()=>LP,Owlv2Model:()=>FP,Owlv2PreTrainedModel:()=>Ec,PaliGemmaForConditionalGeneration:()=>NP,ParakeetForC
TC:()=>DP,ParakeetPreTrainedModel:()=>rm,PatchTSMixerForPrediction:()=>BP,PatchTSMixerModel:()=>zP,PatchTSMixerPreTrainedModel:()=>Cc,PatchTSTForPrediction:()=>GP,PatchTSTModel:()=>RP,PatchTSTPreTrainedModel:()=>Sc,Phi3ForCausalLM:()=>jP,Phi3Model:()=>UP,Phi3PreTrainedModel:()=>Fc,Phi3VForCausalLM:()=>nm,Phi3VPreTrainedModel:()=>sm,PhiForCausalLM:()=>VP,PhiModel:()=>$P,PhiPreTrainedModel:()=>Pc,PreTrainedModel:()=>P,PvtForImageClassification:()=>WP,PvtModel:()=>qP,PvtPreTrainedModel:()=>Lc,PyAnnoteForAudioFrameClassification:()=>QP,PyAnnoteModel:()=>HP,PyAnnotePreTrainedModel:()=>Ic,Qwen2ForCausalLM:()=>YP,Qwen2Model:()=>XP,Qwen2MoeForCausalLM:()=>KP,Qwen2MoeModel:()=>JP,Qwen2MoePreTrainedModel:()=>Nc,Qwen2PreTrainedModel:()=>Oc,Qwen2VLForCausalLM:()=>zp,Qwen2VLForConditionalGeneration:()=>Bl,Qwen2VLPreTrainedModel:()=>Dp,Qwen2_5_VLForCausalLM:()=>Bp,Qwen2_5_VLForConditionalGeneration:()=>Rl,Qwen3ForCausalLM:()=>eF,Qwen3Model:()=>ZP,Qwen3MoeForCausalLM:()=>rF,Qwen3MoeModel:()=>tF,Qwen3MoePreTrainedModel:()=>zc,Qwen3NextForCausalLM:()=>nF,Qwen3NextModel:()=>sF,Qwen3NextPreTrainedModel:()=>Bc,Qwen3PreTrainedModel:()=>Dc,Qwen3VLForCausalLM:()=>am,Qwen3VLForConditionalGeneration:()=>Rc,Qwen3VLMoeForCausalLM:()=>oF,Qwen3VLMoeForConditionalGeneration:()=>aF,Qwen3_5ForCausalLM:()=>om,Qwen3_5ForConditionalGeneration:()=>Gc,Qwen3_5MoeForCausalLM:()=>lF,Qwen3_5MoeForConditionalGeneration:()=>iF,RFDetrForObjectDetection:()=>hF,RFDetrModel:()=>dF,RFDetrObjectDetectionOutput:()=>im,RFDetrPreTrainedModel:()=>Vc,RTDetrForObjectDetection:()=>O2,RTDetrModel:()=>I2,RTDetrObjectDetectionOutput:()=>un,RTDetrPreTrainedModel:()=>vl,RTDetrV2ForObjectDetection:()=>kF,RTDetrV2Model:()=>xF,RTDetrV2ObjectDetectionOutput:()=>lm,RTDetrV2PreTrainedModel:()=>Uc,ResNetForImageClassification:()=>uF,ResNetModel:()=>cF,ResNetPreTrainedModel:()=>$c,RoFormerForMaskedLM:()=>vF,RoFormerForQuestionAnswering:()=>MF,RoFormerForSequenceClassification:()=>yF,RoFormerForTokenClassification:()=>bF,RoFormerModel
:()=>wF,RoFormerPreTrainedModel:()=>Ps,RobertaForMaskedLM:()=>_F,RobertaForQuestionAnswering:()=>gF,RobertaForSequenceClassification:()=>pF,RobertaForTokenClassification:()=>mF,RobertaModel:()=>fF,RobertaPreTrainedModel:()=>Ss,Sam2ImageSegmentationOutput:()=>dm,Sam2Model:()=>jc,Sam2PreTrainedModel:()=>hm,Sam3TrackerModel:()=>AF,SamImageSegmentationOutput:()=>cm,SamModel:()=>TF,SamPreTrainedModel:()=>um,SapiensForDepthEstimation:()=>SF,SapiensForNormalEstimation:()=>PF,SapiensForSemanticSegmentation:()=>CF,SapiensPreTrainedModel:()=>Ba,SegformerForImageClassification:()=>LF,SegformerForSemanticSegmentation:()=>IF,SegformerModel:()=>FF,SegformerPreTrainedModel:()=>Ra,SiglipModel:()=>OF,SiglipPreTrainedModel:()=>qc,SiglipTextModel:()=>fm,SiglipVisionModel:()=>NF,SmolLM3ForCausalLM:()=>zF,SmolLM3Model:()=>DF,SmolLM3PreTrainedModel:()=>Wc,SmolVLMForConditionalGeneration:()=>BF,SnacDecoderModel:()=>pm,SnacEncoderModel:()=>_m,SnacModel:()=>RF,SnacPreTrainedModel:()=>Ga,SolarOpenForCausalLM:()=>$F,SolarOpenModel:()=>GF,SolarOpenPreTrainedModel:()=>Hc,SpeechT5ForSpeechToText:()=>UF,SpeechT5ForTextToSpeech:()=>jF,SpeechT5HifiGan:()=>qF,SpeechT5Model:()=>VF,SpeechT5PreTrainedModel:()=>$a,SqueezeBertForMaskedLM:()=>HF,SqueezeBertForQuestionAnswering:()=>XF,SqueezeBertForSequenceClassification:()=>QF,SqueezeBertModel:()=>WF,SqueezeBertPreTrainedModel:()=>mn,StableLmForCausalLM:()=>JF,StableLmModel:()=>YF,StableLmPreTrainedModel:()=>Qc,Starcoder2ForCausalLM:()=>ZF,Starcoder2Model:()=>KF,Starcoder2PreTrainedModel:()=>Xc,StyleTextToSpeech2Model:()=>eL,StyleTextToSpeech2PreTrainedModel:()=>mm,SupertonicForConditionalGeneration:()=>wm,SupertonicPreTrainedModel:()=>gm,Swin2SRForImageSuperResolution:()=>aL,Swin2SRModel:()=>nL,Swin2SRPreTrainedModel:()=>Yc,SwinForImageClassification:()=>rL,SwinForSemanticSegmentation:()=>sL,SwinModel:()=>tL,SwinPreTrainedModel:()=>Va,T5ForConditionalGeneration:()=>iL,T5Model:()=>oL,T5PreTrainedModel:()=>Jc,TableTransformerForObjectDetection:()=>cL,Table
TransformerModel:()=>lL,TableTransformerObjectDetectionOutput:()=>vm,TableTransformerPreTrainedModel:()=>Kc,TrOCRForCausalLM:()=>uL,TrOCRPreTrainedModel:()=>ym,UltravoxModel:()=>Xl,UltravoxPreTrainedModel:()=>Rp,UniSpeechForCTC:()=>hL,UniSpeechForSequenceClassification:()=>fL,UniSpeechModel:()=>dL,UniSpeechPreTrainedModel:()=>Ua,UniSpeechSatForAudioFrameClassification:()=>gL,UniSpeechSatForCTC:()=>pL,UniSpeechSatForSequenceClassification:()=>mL,UniSpeechSatModel:()=>_L,UniSpeechSatPreTrainedModel:()=>gn,VaultGemmaForCausalLM:()=>vL,VaultGemmaModel:()=>wL,VaultGemmaPreTrainedModel:()=>Zc,ViTForImageClassification:()=>ML,ViTMAEModel:()=>xL,ViTMAEPreTrainedModel:()=>bm,ViTMSNForImageClassification:()=>TL,ViTMSNModel:()=>kL,ViTMSNPreTrainedModel:()=>tu,ViTModel:()=>bL,ViTPreTrainedModel:()=>eu,VisionEncoderDecoderModel:()=>yL,VitMatteForImageMatting:()=>EL,VitMattePreTrainedModel:()=>Mm,VitPoseForPoseEstimation:()=>AL,VitPosePreTrainedModel:()=>xm,VitsModel:()=>CL,VitsModelOutput:()=>km,VitsPreTrainedModel:()=>Tm,VoxtralForConditionalGeneration:()=>SL,VoxtralRealtimeForConditionalGeneration:()=>Cm,VoxtralRealtimePreTrainedModel:()=>Am,Wav2Vec2BertForCTC:()=>zL,Wav2Vec2BertForSequenceClassification:()=>BL,Wav2Vec2BertModel:()=>DL,Wav2Vec2BertPreTrainedModel:()=>ja,Wav2Vec2ForAudioFrameClassification:()=>IC,Wav2Vec2ForCTC:()=>FC,Wav2Vec2ForSequenceClassification:()=>LC,Wav2Vec2Model:()=>PC,Wav2Vec2PreTrainedModel:()=>Or,WavLMForAudioFrameClassification:()=>UL,WavLMForCTC:()=>GL,WavLMForSequenceClassification:()=>$L,WavLMForXVector:()=>VL,WavLMModel:()=>RL,WavLMPreTrainedModel:()=>Fs,WeSpeakerResNetModel:()=>jL,WeSpeakerResNetPreTrainedModel:()=>Pm,WhisperForConditionalGeneration:()=>Fm,WhisperModel:()=>WL,WhisperPreTrainedModel:()=>su,XLMForQuestionAnswering:()=>KL,XLMForSequenceClassification:()=>YL,XLMForTokenClassification:()=>JL,XLMModel:()=>QL,XLMPreTrainedModel:()=>Ls,XLMRobertaForMaskedLM:()=>eI,XLMRobertaForQuestionAnswering:()=>sI,XLMRobertaForSequenceClassificat
ion:()=>tI,XLMRobertaForTokenClassification:()=>rI,XLMRobertaModel:()=>ZL,XLMRobertaPreTrainedModel:()=>Is,XLMWithLMHeadModel:()=>XL,XVectorOutput:()=>Sm,YolosForObjectDetection:()=>aI,YolosModel:()=>nI,YolosObjectDetectionOutput:()=>Lm,YolosPreTrainedModel:()=>nu,YoutuForCausalLM:()=>iI,YoutuModel:()=>oI,YoutuPreTrainedModel:()=>au});var cn=class extends P{},AE=class extends cn{},CE=class extends cn{async _call(e){return new ie(await super._call(e))}},SE=class extends cn{async _call(e){return new ht(await super._call(e))}},PE=class extends cn{async _call(e){return new Je(await super._call(e))}},nl=class extends P{},FE=class extends nl{},LE=class extends nl{},al=class extends P{},IE=class extends al{},OE=class extends al{},ol=class extends P{},NE=class extends ol{},DE=class extends ol{},il=class extends P{},zE=class extends il{},BE=class extends il{},Aa=class extends P{},RE=class extends Aa{},GE=class extends Aa{},$E=class extends Aa{async _call(e){return new ie(await super._call(e))}},ll=class extends P{},VE=class extends ll{},UE=class extends ll{async _call(e){return new ie(await super._call(e))}},ys=class extends P{},jE=class extends ys{},qE=class extends ys{async _call(e){return new Je(await super._call(e))}},WE=class extends ys{async _call(e){return new ie(await super._call(e))}},HE=class extends ys{async _call(e){return new He(await super._call(e))}},QE=class extends ys{async _call(e){return new ht(await super._call(e))}},cl=class extends P{},XE=class extends cl{},YE=class extends cl{},ul=class extends P{},JE=class extends ul{},KE=class extends ul{},dl=class extends P{},ZE=class extends dl{},e2=class extends dl{},bs=class extends P{},t2=class extends bs{},r2=class extends bs{async _call(e){return new Je(await super._call(e))}},s2=class extends bs{async _call(e){return new ie(await super._call(e))}},n2=class extends bs{async _call(e){return new He(await super._call(e))}},a2=class extends bs{async _call(e){return new ht(await 
super._call(e))}},o2=4299n,hp=6561n,fp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","inputs_embeds","attention_mask","position_ids","audio_values","exaggeration","audio_features","audio_tokens","speaker_embeddings","speaker_features","past_key_values"]);k(this,"main_input_name","input_ids");k(this,"_return_dict_in_generate_keys",["audio_tokens","speaker_embeddings","speaker_features"])}},_p=class extends fp{async encode_speech(e){return xe(this.sessions.speech_encoder,{audio_values:e})}async forward({input_ids:e=null,attention_mask:t=null,audio_values:r=null,exaggeration:s=null,position_ids:n=null,inputs_embeds:a=null,past_key_values:o=null,generation_config:i=null,logits_processor:l=null,audio_features:c=null,audio_tokens:d=null,speaker_embeddings:h=null,speaker_features:_=null,...p}){let w;if(!a){const y=this.sessions.embed_tokens.inputNames,M={input_ids:e};if(y.includes("exaggeration")){if(!(s instanceof U)){const T=e.dims[0];if(s==null)s=ct([T],.5);else if(typeof s=="number")s=ct([T],s);else if(Array.isArray(s))s=new U("float32",s,[T]);else throw new Error("Unsupported type for `exaggeration` input")}M.exaggeration=s}if(y.includes("position_ids")&&(M.position_ids=n),{inputs_embeds:a}=await xe(this.sessions.embed_tokens,M),c&&d&&h&&_&&(w={audio_features:c,audio_tokens:d,speaker_embeddings:h,speaker_features:_}),w||r)w??(w=await this.encode_speech(r)),a=ze([w.audio_features,a],1),t=yt([a.dims[0],a.dims[1]]);else{const T=a.dims[1];if(!o||T!==1)throw new Error("Incorrect state encountered during generation.");const A=o.get_seq_length();t=yt([a.dims[0],A+T])}}return{...await pr(this,{inputs_embeds:a,past_key_values:o,attention_mask:t,generation_config:i,logits_processor:l},!1),...w}}prepare_inputs_for_generation(e,t,r){if(!t.position_ids&&this.sessions.embed_tokens.inputNames.includes("position_ids"))if(t.input_ids.dims[1]===1){const 
s=Array.from({length:e.length},(n,a)=>e[a].length-e[a].findLastIndex(o=>o==hp)-1);t.position_ids=new U("int64",s,[e.length,1])}else{const n=t.input_ids.tolist().map(a=>{let o=0;return a.map(i=>i>=hp?0:o++)});t.position_ids=new U("int64",n.flat(),t.input_ids.dims)}return t.input_ids.dims[1]===1&&(delete t.audio_values,delete t.audio_features,delete t.audio_tokens,delete t.speaker_embeddings,delete t.speaker_features),ln(this,e,t)}async generate(e){const{sequences:t,audio_tokens:r,speaker_embeddings:s,speaker_features:n}=await super.generate({...e,return_dict_in_generate:!0}),a=t.slice(null,[e.input_ids.dims[1],-1]),o=ct([a.dims[0],3],o2),i=ze([r,a,o],1),{waveform:l}=await xe(this.sessions.conditional_decoder,{speech_tokens:i,speaker_features:n,speaker_embeddings:s});return l}},pp=class extends P{},i2=class extends pp{},mp=class extends P{},l2=class extends mp{},Ca=class extends P{},c2=class extends Ca{},gp=class extends Ca{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},wp=class extends Ca{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"audio_model"})}},Kr=class extends P{},u2=class extends Kr{},d2=class extends Kr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},vp=class extends Kr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},h2=class extends Kr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"vision_model"})}},f2=class extends Kr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"vision_model"})}},hl=class extends P{},_2=class extends hl{},p2=class extends hl{},fl=class extends P{},m2=class extends fl{},g2=class extends fl{},_l=class extends P{},w2=class 
extends _l{},v2=class extends _l{},pl=class extends P{},y2=class extends pl{},b2=class extends pl{},ml=class extends P{constructor(){super(...arguments);k(this,"requires_attention_mask",!1);k(this,"main_input_name","input_features");k(this,"forward_params",["input_features","decoder_input_ids","decoder_attention_mask","past_key_values"])}},M2=class extends ml{},x2=class extends ml{},Ms=class extends P{},k2=class extends Ms{},T2=class extends Ms{async _call(e){return new Je(await super._call(e))}},E2=class extends Ms{async _call(e){return new ie(await super._call(e))}},A2=class extends Ms{async _call(e){return new He(await super._call(e))}},C2=class extends Ms{async _call(e){return new ht(await super._call(e))}},gl=class extends P{},S2=class extends gl{},P2=class extends gl{async _call(e){return new ie(await super._call(e))}},wl=class extends P{},F2=class extends wl{},L2=class extends wl{async _call(e){return new ie(await super._call(e))}},vl=class extends P{},I2=class extends vl{},O2=class extends vl{async _call(e){return new un(await super._call(e))}},un=class extends Ye{constructor({logits:e,pred_boxes:t}){super(),this.logits=e,this.pred_boxes=t}},yl=class extends P{},N2=class extends yl{},D2=class extends yl{async _call(e){return new un(await super._call(e))}},yp=class extends Ye{constructor({audio_codes:e}){super(),this.audio_codes=e}},bp=class extends Ye{constructor({audio_values:e}){super(),this.audio_values=e}},Sa=class extends P{constructor(){super(...arguments);k(this,"main_input_name","input_values");k(this,"forward_params",["input_values"])}},z2=class extends Sa{async encode(e){return new yp(await xe(this.sessions.encoder_model,e))}async decode(e){return new bp(await xe(this.sessions.decoder_model,e))}},Mp=class extends Sa{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"encoder_model"})}},xp=class extends Sa{static async from_pretrained(e,t={}){return 
super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"decoder_model"})}},xs=class extends P{},B2=class extends xs{},R2=class extends xs{async _call(e){return new Je(await super._call(e))}},G2=class extends xs{async _call(e){return new ie(await super._call(e))}},$2=class extends xs{async _call(e){return new He(await super._call(e))}},V2=class extends xs{async _call(e){return new ht(await super._call(e))}},bl=class extends P{},U2=class extends bl{},j2=class extends bl{},ks=class extends P{},q2=class extends ks{},W2=class extends ks{async _call(e){return new Je(await super._call(e))}},H2=class extends ks{async _call(e){return new ie(await super._call(e))}},Q2=class extends ks{async _call(e){return new He(await super._call(e))}},X2=class extends ks{async _call(e){return new ht(await super._call(e))}},kp=class extends P{},Y2=class extends kp{},Ml=class extends P{},J2=class extends Ml{},K2=class extends Ml{async _call(e){return new ie(await super._call(e))}},Tp=class extends P{},Z2=class extends Tp{},Ep=class extends P{},eA=class extends Ep{},Pa=class extends P{},tA=class extends Pa{},rA=class extends Pa{async _call(e){return new xl(await super._call(e))}},sA=class extends Pa{async _call(e){return new Ap(await super._call(e))}},xl=class extends Ye{constructor({logits:e,pred_boxes:t}){super(),this.logits=e,this.pred_boxes=t}},Ap=class extends Ye{constructor({logits:e,pred_boxes:t,pred_masks:r}){super(),this.logits=e,this.pred_boxes=t,this.pred_masks=r}},kl=class extends P{},nA=class extends kl{},aA=class extends kl{async _call(e){return new ie(await super._call(e))}},Tl=class extends P{},oA=class extends Tl{},iA=class extends Tl{async _call(e){return new ie(await super._call(e))}},Cp=class extends P{},lA=class extends Cp{},Sp=class extends P{},cA=class extends Sp{},Ts=class extends P{},uA=class extends Ts{},dA=class extends Ts{async _call(e){return new ie(await super._call(e))}},hA=class extends Ts{async _call(e){return new He(await super._call(e))}},fA=class 
extends Ts{async _call(e){return new ht(await super._call(e))}},_A=class extends Ts{async _call(e){return new Je(await super._call(e))}},Pp=class extends P{},pA=class extends Pp{},El=class extends P{},mA=class extends El{},gA=class extends El{},Al=class extends P{},wA=class extends Al{},vA=class extends Al{async _call(e){return new ie(await super._call(e))}},Es=class extends P{},yA=class extends Es{},bA=class extends Es{async _call(e){return new Je(await super._call(e))}},MA=class extends Es{async _call(e){return new ie(await super._call(e))}},xA=class extends Es{async _call(e){return new He(await super._call(e))}},kA=class extends Es{async _call(e){return new ht(await super._call(e))}},Cl=class extends P{},TA=class extends Cl{},EA=class extends Cl{},dn=class extends P{},AA=class extends dn{},CA=class extends dn{async _call(e){return new Je(await super._call(e))}},SA=class extends dn{async _call(e){return new ie(await super._call(e))}},PA=class extends dn{async _call(e){return new He(await super._call(e))}},hn=class extends P{},FA=class extends hn{},LA=class extends hn{async _call(e){return new Je(await super._call(e))}},IA=class extends hn{async _call(e){return new ie(await super._call(e))}},OA=class extends hn{async _call(e){return new He(await super._call(e))}},Sl=class extends P{},NA=class extends Sl{},DA=class extends Sl{},Pl=class extends P{},zA=class extends Pl{},BA=class extends Pl{},Fl=class extends P{},RA=class extends Fl{},GA=class extends Fl{},Ll=class extends P{},$A=class extends Ll{},VA=class extends Ll{async _call(e){return new ie(await super._call(e))}},Fp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","inputs_embeds","attention_mask","pixel_values","encoder_outputs","decoder_input_ids","decoder_inputs_embeds","decoder_attention_mask","past_key_values"]);k(this,"main_input_name","inputs_embeds")}},UA=class extends 
/*
 * Minified bundle output. This line continues the class expression whose header
 * (`UA=class extends`) ends the previous line, and it ends inside the header of `JA`.
 *
 * UA (extends Fp) — methods visible on this line:
 *  - _merge_input_ids_with_image_features({inputs_embeds, image_features, input_ids, attention_mask}):
 *      returns `inputs_embeds` = ze([image_features, inputs_embeds], 1), i.e. image features are
 *      placed before the text embeddings, and `attention_mask` = ze([yt(first two dims of
 *      image_features), attention_mask], 1). `ze` is presumably a concat along the given axis and
 *      `yt` presumably builds a ones tensor — TODO confirm against their definitions.
 *  - _prepare_inputs_embeds({input_ids, pixel_values, inputs_embeds, attention_mask}):
 *      throws when both `input_ids` and `pixel_values` are falsy; calls `encode_text` and/or
 *      `encode_image`; merges the two results when both exist, otherwise uses whichever exists.
 *      Returns `{inputs_embeds, attention_mask}`.
 *  - forward({...}): only calls `_prepare_inputs_embeds` when `inputs_embeds` was not supplied;
 *      when `encoder_outputs` is falsy it calls `Ir(this, ...)` and uses its `last_hidden_state`;
 *      when `decoder_inputs_embeds` is falsy it requires `decoder_input_ids` (throws otherwise)
 *      and embeds them with `encode_text`. The final `pr(...)` call receives the encoder-side
 *      `attention_mask` as `encoder_attention_mask`.
 *
 * Il/jA/qA and Ol/WA/HA: empty subclasses (no members declared here).
 * Lp: sets `forward_params` through the `k(this, name, value)` helper.
 * mr (extends Lp): _merge_input_ids_with_image_features reshapes `image_features` with
 *      `view(-1, lastDim)` and forwards to `rl`, taking `image_token_id` from
 *      `config.image_token_index ?? config.image_token_id`.
 * QA, XA, Op: empty subclasses of mr. Ip/YA: empty subclasses of P.
 */
Fp{_merge_input_ids_with_image_features({inputs_embeds:e,image_features:t,input_ids:r,attention_mask:s}){return{inputs_embeds:ze([t,e],1),attention_mask:ze([yt(t.dims.slice(0,2)),s],1)}}async _prepare_inputs_embeds({input_ids:e,pixel_values:t,inputs_embeds:r,attention_mask:s}){if(!e&&!t)throw new Error("Either `input_ids` or `pixel_values` should be provided.");let n,a;return e&&(n=await this.encode_text({input_ids:e})),t&&(a=await this.encode_image({pixel_values:t})),n&&a?{inputs_embeds:r,attention_mask:s}=this._merge_input_ids_with_image_features({inputs_embeds:n,image_features:a,input_ids:e,attention_mask:s}):r=n||a,{inputs_embeds:r,attention_mask:s}}async forward({input_ids:e,pixel_values:t,attention_mask:r,decoder_input_ids:s,decoder_attention_mask:n,encoder_outputs:a,past_key_values:o,inputs_embeds:i,decoder_inputs_embeds:l}){if(i||({inputs_embeds:i,attention_mask:r}=await this._prepare_inputs_embeds({input_ids:e,pixel_values:t,inputs_embeds:i,attention_mask:r})),!a){let{last_hidden_state:d}=await Ir(this,{inputs_embeds:i,attention_mask:r});a=d}if(!l){if(!s)throw new Error("Either `decoder_input_ids` or `decoder_inputs_embeds` should be provided.");l=await this.encode_text({input_ids:s})}return await pr(this,{inputs_embeds:l,attention_mask:n,encoder_attention_mask:r,encoder_hidden_states:a,past_key_values:o},!0)}},Il=class extends P{},jA=class extends Il{},qA=class extends Il{},Ol=class extends P{},WA=class extends Ol{},HA=class extends Ol{},Lp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","pixel_values","position_ids","past_key_values"])}},mr=class extends Lp{_merge_input_ids_with_image_features(e){const t=e.image_features.dims.at(-1),r=e.image_features.view(-1,t);return rl({image_token_id:this.config.image_token_index??this.config.image_token_id,...e,image_features:r})}},QA=class extends mr{},XA=class extends mr{},Ip=class extends P{},YA=class extends Ip{},Op=class extends mr{},JA=class extends 
Op{},Np=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","inputs_embeds","per_layer_inputs","position_ids","pixel_values","input_features","input_features_mask","past_key_values"])}},Fa=class extends Np{async forward({input_ids:e=null,attention_mask:t=null,pixel_values:r=null,input_features:s=null,input_features_mask:n=null,position_ids:a=null,inputs_embeds:o=null,per_layer_inputs:i=null,past_key_values:l=null,generation_config:c=null,logits_processor:d=null,...h}){if((!o||!i)&&({inputs_embeds:o,per_layer_inputs:i}=await xe(this.sessions.embed_tokens,{input_ids:e}),e.dims[1]!==1)){if(r){const{image_features:p}=await this._encode_vision({pixel_values:r,...h});({inputs_embeds:o,attention_mask:t}=this._merge_input_ids_with_image_features({image_features:p,inputs_embeds:o,input_ids:e,attention_mask:t}))}if(s){const{audio_features:p}=await xe(this.sessions.audio_encoder,{input_features:s,input_features_mask:n});({inputs_embeds:o,attention_mask:t}=this._merge_input_ids_with_audio_features({audio_features:p,inputs_embeds:o,input_ids:e,attention_mask:t}))}}return await pr(this,{inputs_embeds:o,per_layer_inputs:i,past_key_values:l,attention_mask:t,position_ids:a,generation_config:c,logits_processor:d},!0)}_encode_vision(e){return xe(this.sessions.vision_encoder,{pixel_values:e.pixel_values})}_merge_input_ids_with_image_features(e){const t=e.image_features.dims.at(-1),r=e.image_features.view(-1,t);return rl({image_token_id:this.config.image_token_id,...e,image_features:r})}_merge_input_ids_with_audio_features(e){const t=e.audio_features.dims.at(-1),r=e.audio_features.view(-1,t);return dp({audio_token_id:this.config.audio_token_id,...e,audio_features:r})}},KA=class extends Fa{},Nl=class extends 
Fa{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","inputs_embeds","per_layer_inputs","position_ids","pixel_values","image_position_ids","input_features","input_features_mask","past_key_values"])}_encode_vision(t){return xe(this.sessions.vision_encoder,{pixel_values:t.pixel_values,pixel_position_ids:t.image_position_ids})}},ZA=class extends Nl{},Dl=class extends P{},eC=class extends Dl{},tC=class extends Dl{},zl=class extends P{},rC=class extends zl{},sC=class extends zl{},Dp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","position_ids","past_key_values","pixel_values","image_grid_thw"])}},Bl=class extends Dp{constructor(){super(...arguments);k(this,"image_grid_thw_name","grid_thw")}_get_text_only_rope_index(t,r){if(r){const{data:s,dims:n}=cp(r),a=BigInt64Array.from({length:3*s.length},(i,l)=>s[l%s.length]),o=Array.from({length:n[0]},(i,l)=>je(s.subarray(n[1]*l,n[1]*(l+1)))[0]+1n+BigInt(n[1]));return[new U("int64",a,[3,...n]),new U("int64",o,[o.length,1])]}else{const[s,n]=t.dims,a=BigInt64Array.from({length:3*s*n},(o,i)=>BigInt(Math.floor(i%n/s)));return[new U("int64",a,[3,...t.dims]),Jf([s,1])]}}_reorder_and_write_positions(t,r,s,n){const a=t.reduce((c,d)=>c+d.length,0),o=new Array(a);let i=0;for(let c=0;c<3;++c)for(const d of t){const h=d.length/3;for(let _=c*h;_<(c+1)*h;++_)o[i++]=d[_]}let l=0;for(let c=0;c<r.length;++c)if(r[c]==1){for(let d=0;d<3;++d)s[d][n][c]=o[d*a/3+l];++l}return o}_get_multimodal_rope_positions({filtered_ids:t,image_grid_thw_list:r,video_grid_thw_list:s,spatial_merge_size:n,state:a}){const{image_token_id:o,video_token_id:i,vision_start_token_id:l}=this.config,c=t,h=c.reduce((T,A,C)=>(A==l&&T.push(C),T),[]).map(T=>c[T+1]),_=h.filter(T=>T==o).length,p=h.filter(T=>T==i).length,w=[];let v=0,y=_,M=p;for(let T=0;T<h.length;++T){const 
A=c.findIndex((X,J)=>J>v&&X==o),C=c.findIndex((X,J)=>J>v&&X==i),S=y>0&&A!==-1?A:c.length+1,N=M>0&&C!==-1?C:c.length+1;let x,R,z,$;S<N?([R,z,$]=r[a.image_index],++a.image_index,--y,x=S):([R,z,$]=s[a.video_index],++a.video_index,--M,x=N);const[Q,H,D]=[Number(R),Math.floor(Number(z)/n),Math.floor(Number($)/n)],I=x-v,te=w.length>0?je(w.at(-1))[0]+1:0;w.push(Array.from({length:3*I},(X,J)=>te+J%I));const W=I+te,ee=Q*H*D,G=Array.from({length:ee},(X,J)=>W+Math.floor(J/(H*D))),L=Array.from({length:ee},(X,J)=>W+Math.floor(J/D)%H),V=Array.from({length:ee},(X,J)=>W+J%D);w.push([G,L,V].flat()),v=x+ee}if(v<c.length){const T=w.length>0?je(w.at(-1))[0]+1:0,A=c.length-v;w.push(Array.from({length:3*A},(C,S)=>T+S%A))}return w}get_rope_index(t,r,s,n){const{vision_config:a}=this.config,o=a.spatial_merge_size??2;if(r||s){const i=t.tolist();n||(n=Yf(t));const l=n.tolist(),c=Array.from({length:3},()=>Array.from({length:t.dims[0]},()=>Array.from({length:t.dims[1]},()=>0))),d=r?r.tolist():[],h=s?s.tolist():[],_={image_index:0,video_index:0},p=[];for(let w=0;w<i.length;++w){const v=i[w].filter((T,A)=>l[w][A]==1),y=this._get_multimodal_rope_positions({filtered_ids:v,image_grid_thw_list:d,video_grid_thw_list:h,spatial_merge_size:o,state:_}),M=this._reorder_and_write_positions(y,l[w],c,w);p.push(je(M)[0]+1-i[w].length)}return[new U("int64",c.flat(1/0),[3,t.dims[0],t.dims[1]]),new U("int64",p,[p.length,1])]}else return this._get_text_only_rope_index(t,n)}async encode_image({pixel_values:t,image_grid_thw:r}){return(await xe(this.sessions.vision_encoder,{pixel_values:t,[this.image_grid_thw_name]:r})).image_features}_merge_input_ids_with_image_features(t){return rl({image_token_id:this.config.image_token_id,...t})}prepare_inputs_for_generation(t,r,s){if(!r.attention_mask||r.position_ids||!(this.sessions.decoder_model_merged??this.sessions.model).inputNames.includes("position_ids"))return 
r;if(!r.past_key_values)[r.position_ids,r.rope_deltas]=this.get_rope_index(r.input_ids,r.image_grid_thw,r.video_grid_thw,r.attention_mask);else{r.pixel_values=null;const a=r.past_key_values.get_seq_length();if(a<r.input_ids.dims[1]){const[o,i]=this.get_rope_index(r.input_ids,r.image_grid_thw,r.video_grid_thw,r.attention_mask);r.rope_deltas=i,r.position_ids=o.slice(null,null,[a,null]),r.input_ids=r.input_ids.slice(null,[a,null])}else{r.rope_deltas||([,r.rope_deltas]=this.get_rope_index(r.input_ids,r.image_grid_thw,r.video_grid_thw,r.attention_mask));const o=BigInt(a),i=r.rope_deltas.map(l=>o+l);r.position_ids=dr([i,i,i],0)}}return r}},zp=class extends Bl{},Rl=class extends Bl{constructor(){super(...arguments);k(this,"image_grid_thw_name","image_grid_thw")}},Bp=class extends zp{constructor(){super(...arguments);k(this,"image_grid_thw_name","image_grid_thw")}},nC=class extends Rl{get_vision_position_ids(e,t,r,s){const n=Math.floor(t[0]/r),a=Math.floor(t[1]/s),o=Math.floor(t[2]/s),i=a*o*n,l=Array.from({length:i},()=>e),c=Array.from({length:i},(h,_)=>e+Math.floor(_/(o*n))),d=Array.from({length:i},(h,_)=>e+_%o);return[...l,...c,...d]}_get_multimodal_rope_positions({filtered_ids:e,image_grid_thw_list:t,video_grid_thw_list:r,spatial_merge_size:s,state:n}){const{image_token_id:a}=this.config,o=[];let i=0,l=e[0]==a?1:0;for(let h=1;h<=e.length;++h){const _=h<e.length?e[h]==a?1:0:-1;_!==l&&(o.push([l,i,h]),i=h,l=_)}let c=0;const d=[];for(const[h,_,p]of o)if(h===0){const w=p-_;d.push(Array.from({length:3*w},(v,y)=>c+y%w)),c+=w}else{const w=t[n.image_index++].map(Number),v=w[0];d.push(this.get_vision_position_ids(c,w,v,s)),c+=Math.max(w[1],w[2])/s}return d}},Gl=class extends P{},aC=class extends Gl{},oC=class extends Gl{},$l=class extends P{},iC=class extends $l{},lC=class extends $l{},Vl=class extends P{},cC=class extends Vl{},uC=class extends Vl{},Ul=class extends P{},dC=class extends Ul{},hC=class extends Ul{},jl=class extends P{},fC=class extends jl{},_C=class extends 
jl{},ql=class extends P{},pC=class extends ql{},mC=class extends ql{},Wl=class extends P{},gC=class extends Wl{},wC=class extends Wl{},Hl=class extends P{},vC=class extends Hl{},yC=class extends Hl{},Ql=class extends P{},bC=class extends Ql{},MC=class extends Ql{},Rp=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","position_ids","audio_values","past_key_values"])}},Xl=class extends Rp{_merge_input_ids_with_audio_features(e){const t=e.audio_features.dims.at(-1),r=e.audio_features.view(-1,t);return dp({audio_token_id:this.config.ignore_index??this.config.audio_token_id??this.config.audio_token_index,...e,audio_features:r})}},xC=class extends Xl{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","input_features","past_key_values"])}},Gp=class extends P{},kC=class extends Gp{},$p=class extends P{},TC=class extends $p{},Yl=class extends P{},EC=class extends Yl{},AC=class extends Yl{},Jl=class extends P{},CC=class extends Jl{},SC=class extends Jl{async _call(e){return new ie(await super._call(e))}},Or=class extends P{},PC=class extends Or{},FC=class extends Or{async _call(e){return new Jr(await super._call(e))}},LC=class extends Or{async _call(e){return new ie(await super._call(e))}},IC=class extends Or{async _call(e){return new He(await super._call(e))}},OC=class extends P{},NC=class extends Or{},DC=class extends Or{async _call(e){return new Jr(await super._call(e))}},zC=class extends Or{async _call(e){return new ie(await super._call(e))}},Kl=class extends P{},BC=class extends Kl{},RC=class extends Kl{},Vp=class extends mr{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","pixel_values","pixel_attention_mask","position_ids","past_key_values"])}},Zl=class extends P{},GC=class extends Zl{},$C=class extends Zl{async _call(e){return new ie(await super._call(e))}},ec=class extends P{},VC=class extends ec{},UC=class extends ec{},La=class extends 
P{},jC=class extends La{async forward(e){const t=!e.input_ids,r=!e.pixel_values;if(t&&r)throw new Error("Either `input_ids` or `pixel_values` should be provided.");if(t&&(e.input_ids=yt([e.pixel_values.dims[0],1])),r){const{image_size:l}=this.config.vision_config;e.pixel_values=ct([0,3,l,l],0)}const{text_embeddings:s,image_embeddings:n,l2norm_text_embeddings:a,l2norm_image_embeddings:o}=await super.forward(e),i={};return t||(i.text_embeddings=s,i.l2norm_text_embeddings=a),r||(i.image_embeddings=n,i.l2norm_image_embeddings=o),i}},Up=class extends La{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},qC=class extends La{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"vision_model"})}},tc=class extends P{},WC=class extends tc{},HC=class extends tc{},QC=class extends mr{},rc=class extends P{},XC=class extends rc{},YC=class extends rc{},JC=class extends mr{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","pixel_values","pixel_attention_mask","spatial_shapes","position_ids","past_key_values"])}},sc=class extends P{},KC=class extends sc{},ZC=class extends sc{},jp=class extends P{},eS=class extends jp{},nc=class extends P{},tS=class extends nc{},rS=class extends nc{},ac=class extends P{},sS=class extends ac{},nS=class extends ac{},oc=class extends P{},aS=class extends oc{},oS=class extends oc{},ic=class extends P{},iS=class extends ic{},lS=class extends ic{},fn=class extends P{},cS=class extends fn{},uS=class extends fn{},dS=class extends fn{async _call(e){return new ie(await super._call(e))}},hS=class extends fn{},qp=class extends P{},fS=class extends qp{},Wp=class extends P{},_S=class extends Wp{},Hp=class extends Ye{constructor({char_logits:e,bpe_logits:t,wp_logits:r}){super(),this.char_logits=e,this.bpe_logits=t,this.wp_logits=r}get 
logits(){return[this.char_logits,this.bpe_logits,this.wp_logits]}},Qp=class extends P{},pS=class extends Qp{async _call(e){return new Hp(await super._call(e))}},Xp=class extends Ye{constructor({audio_codes:e}){super(),this.audio_codes=e}},Yp=class extends Ye{constructor({audio_values:e}){super(),this.audio_values=e}},Ia=class extends P{constructor(){super(...arguments);k(this,"main_input_name","input_values");k(this,"forward_params",["input_values"])}},mS=class extends Ia{async encode(e){return new Xp(await xe(this.sessions.encoder_model,e))}async decode(e){return new Yp(await xe(this.sessions.decoder_model,e))}},Jp=class extends Ia{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"encoder_model"})}},Kp=class extends Ia{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"decoder_model"})}},lc=class extends P{},gS=class extends lc{},wS=class extends lc{},cc=class extends P{},vS=class extends cc{},yS=class extends cc{},_n=class extends P{},bS=class extends _n{},MS=class extends _n{async _call(e){return new Je(await super._call(e))}},xS=class extends _n{async _call(e){return new ie(await super._call(e))}},kS=class extends _n{async _call(e){return new ht(await super._call(e))}},uc=class extends P{},TS=class extends uc{},ES=class extends uc{},Oa=class extends P{},AS=class extends Oa{},CS=class extends Oa{async _call(e){return new ie(await super._call(e))}},SS=class extends Oa{},Na=class extends P{},PS=class extends Na{},FS=class extends Na{async _call(e){return new ie(await super._call(e))}},LS=class extends Na{},Da=class extends P{},IS=class extends Da{},OS=class extends Da{async _call(e){return new ie(await super._call(e))}},NS=class extends Da{},za=class extends P{},DS=class extends za{},zS=class extends za{async _call(e){return new ie(await super._call(e))}},BS=class extends za{},dc=class extends P{},RS=class extends dc{},GS=class extends dc{async 
_call(e){return new ie(await super._call(e))}},hc=class extends P{},$S=class extends hc{},VS=class extends hc{async _call(e){return new ie(await super._call(e))}},pn=class extends P{},US=class extends pn{},jS=class extends pn{async _call(e){return new Je(await super._call(e))}},qS=class extends pn{async _call(e){return new ie(await super._call(e))}},WS=class extends pn{async _call(e){return new He(await super._call(e))}},fc=class extends P{},HS=class extends fc{},QS=class extends fc{},_c=class extends P{constructor(){super(...arguments);k(this,"requires_attention_mask",!1);k(this,"main_input_name","input_values");k(this,"forward_params",["input_values","decoder_input_ids","past_key_values"])}},XS=class extends _c{},YS=class extends _c{},As=class extends P{},JS=class extends As{},KS=class extends As{async _call(e){return new Je(await super._call(e))}},ZS=class extends As{async _call(e){return new ie(await super._call(e))}},eP=class extends As{async _call(e){return new He(await super._call(e))}},tP=class extends As{async _call(e){return new ht(await super._call(e))}},pc=class extends P{},rP=class extends pc{},sP=class extends pc{},mc=class extends P{},nP=class extends mc{},aP=class extends mc{},Zp=class extends P{},oP=class extends Zp{constructor(...t){super(...t);k(this,"forward_params",["input_ids","pixel_values","images_seq_mask","images_emb_mask","attention_mask","position_ids","past_key_values"]);this._generation_mode="text"}async forward(t){const r=this._generation_mode??"text";let s;if(r==="text"||!t.past_key_values){const l=this.sessions.prepare_inputs_embeds,c=rt(t,l.inputNames);s=await xe(l,c)}else{const l=this.sessions.gen_img_embeds,c=rt({image_ids:t.input_ids},l.inputNames);s=await xe(l,c)}const n={...t,...s},a=await pr(this,n),o=this.sessions[r==="text"?"lm_head":"gen_head"];if(!o)throw new Error(`Unable to find "${o}" generation head`);const i=await xe(o,rt(a,o.inputNames));return{...s,...a,...i}}prepare_inputs_for_generation(t,r,s){const 
n=!!r.past_key_values;return s.guidance_scale!==null&&s.guidance_scale>1&&(n?r.input_ids=ze([r.input_ids,r.input_ids],0):(r.input_ids=ze([r.input_ids,Li(r.input_ids,BigInt(s.pad_token_id))],0),r.attention_mask=ze([r.attention_mask,Li(r.attention_mask,0n)],0))),(n||!r.pixel_values)&&(r.pixel_values=ct([0,0,3,384,384],1)),n&&(r.images_seq_mask=new U("bool",new Array(1).fill(!0).fill(!1,0,1),[1,1]),r.images_emb_mask=new U("bool",new Array(0).fill(!1),[1,1,0])),r}async generate(t){return this._generation_mode="text",super.generate(t)}async generate_images(t){this._generation_mode="image";const r=(t.inputs??t[this.main_input_name]).dims[1],n=(await super.generate(t)).slice(null,[r,null]),a=this.sessions.image_decode,{decoded_image:o}=await xe(a,{generated_tokens:n}),i=o.add_(1).mul_(255/2).clamp_(0,255).to("uint8"),l=[];for(const c of i){const d=qt.fromTensor(c);l.push(d)}return l}},gc=class extends P{},iP=class extends gc{},lP=class extends gc{},em=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"])}_apply_and_filter_by_delay_pattern_mask(t){const[r,s]=t.dims,n=this.config.decoder.num_codebooks,a=s-n;let o=0;for(let c=0;c<t.size;++c){if(t.data[c]==this.config.decoder.pad_token_id)continue;const d=c%s,h=Math.floor(c/s)%n,_=d-h;_>0&&_<=a&&(t.data[o++]=t.data[c])}const i=Math.floor(r/n),l=o/(i*n);return new U(t.type,t.data.slice(0,o),[i,n,l])}prepare_inputs_for_generation(t,r,s){const n=BigInt(this.config.decoder.pad_token_id);let a=structuredClone(t);for(let o=0;o<a.length;++o)for(let i=0;i<a[o].length;++i)o%this.config.decoder.num_codebooks>=i&&(a[o][i]=n);return s.guidance_scale!==null&&s.guidance_scale>1&&(a=a.concat(a)),Ta(this,a,r)}async generate(t){const r=await super.generate(t),s=this._apply_and_filter_by_delay_pattern_mask(r).unsqueeze_(0),{audio_values:n}=await xe(this.sessions.encodec_decode,{audio_codes:s});return n}},wc=class 
extends P{},cP=class extends wc{},uP=class extends wc{},vc=class extends P{},dP=class extends vc{},hP=class extends vc{},Cs=class extends P{},fP=class extends Cs{},_P=class extends Cs{async _call(e){return new Je(await super._call(e))}},pP=class extends Cs{async _call(e){return new ie(await super._call(e))}},mP=class extends Cs{async _call(e){return new He(await super._call(e))}},gP=class extends Cs{async _call(e){return new ht(await super._call(e))}},tm=class extends P{},wP=class extends tm{},yc=class extends P{},vP=class extends yc{},yP=class extends yc{},bc=class extends P{},bP=class extends bc{},MP=class extends bc{},Mc=class extends P{},xP=class extends Mc{},kP=class extends Mc{},xc=class extends P{},TP=class extends xc{},EP=class extends xc{},kc=class extends P{},AP=class extends kc{},CP=class extends kc{},Tc=class extends P{},SP=class extends Tc{},PP=class extends Tc{},Ec=class extends P{},FP=class extends Ec{},LP=class extends Ec{},Ac=class extends P{},IP=class extends Ac{},OP=class extends Ac{},NP=class extends mr{},rm=class extends P{},DP=class extends rm{async _call(e){return new Jr(await super._call(e))}},Cc=class extends P{},zP=class extends Cc{},BP=class extends Cc{},Sc=class extends P{},RP=class extends Sc{},GP=class extends Sc{},Pc=class extends P{},$P=class extends Pc{},VP=class extends Pc{},Fc=class extends P{},UP=class extends Fc{},jP=class extends Fc{},sm=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","inputs_embeds","attention_mask","position_ids","pixel_values","image_sizes","past_key_values"])}},nm=class extends sm{async forward({input_ids:e=null,attention_mask:t=null,pixel_values:r=null,image_sizes:s=null,position_ids:n=null,inputs_embeds:a=null,past_key_values:o=null,generation_config:i=null,logits_processor:l=null,...c}){if(!a){let h;if(r&&e.dims[1]!==1){if(!s)throw new Error("`image_sizes` must be provided when `pixel_values` is provided.");({image_features:h}=await 
xe(this.sessions.vision_encoder,{pixel_values:r,image_sizes:s}))}else{const _=this.config.normalized_config.hidden_size;h=new U("float32",[],[0,_])}({inputs_embeds:a}=await xe(this.sessions.prepare_inputs_embeds,{input_ids:e,image_features:h}))}return await pr(this,{inputs_embeds:a,past_key_values:o,attention_mask:t,position_ids:n,generation_config:i,logits_processor:l},!1)}},Lc=class extends P{},qP=class extends Lc{},WP=class extends Lc{async _call(e){return new ie(await super._call(e))}},Ic=class extends P{},HP=class extends Ic{},QP=class extends Ic{async _call(e){return new He(await super._call(e))}},Oc=class extends P{},XP=class extends Oc{},YP=class extends Oc{},Nc=class extends P{},JP=class extends Nc{},KP=class extends Nc{},Dc=class extends P{},ZP=class extends Dc{},eF=class extends Dc{},zc=class extends P{},tF=class extends zc{},rF=class extends zc{},Bc=class extends P{},sF=class extends Bc{},nF=class extends Bc{},Rc=class extends Rl{},am=class extends Bp{},aF=class extends Rc{},oF=class extends am{},Gc=class extends Rc{},om=class extends Gc{},iF=class extends Gc{},lF=class extends om{},$c=class extends P{},cF=class extends $c{},uF=class extends $c{async _call(e){return new ie(await super._call(e))}},Vc=class extends P{},dF=class extends Vc{},hF=class extends Vc{async _call(e){return new im(await super._call(e))}},im=class extends un{},Ss=class extends P{},fF=class extends Ss{},_F=class extends Ss{async _call(e){return new Je(await super._call(e))}},pF=class extends Ss{async _call(e){return new ie(await super._call(e))}},mF=class extends Ss{async _call(e){return new He(await super._call(e))}},gF=class extends Ss{async _call(e){return new ht(await super._call(e))}},Ps=class extends P{},wF=class extends Ps{},vF=class extends Ps{async _call(e){return new Je(await super._call(e))}},yF=class extends Ps{async _call(e){return new ie(await super._call(e))}},bF=class extends Ps{async _call(e){return new He(await super._call(e))}},MF=class extends Ps{async 
_call(e){return new ht(await super._call(e))}},
/*
 * Minified bundle output; the method above belongs to the class whose header ends the
 * previous line, and this line ends inside `jc.forward` (continued on the next line).
 *
 * Uc / xF / kF: `kF._call` wraps the parent result in `lm`; `lm` is an empty subclass of `un`.
 * cm: output holder storing `iou_scores` and `pred_masks`.
 * um / TF:
 *  - get_image_embeddings({pixel_values}) delegates to `Ir(this, {pixel_values})`.
 *  - forward(e): when `image_embeddings` or `image_positional_embeddings` is falsy, the result
 *    of `get_image_embeddings(e)` is merged into a copy of `e`; otherwise `e` is shallow-copied.
 *    `input_labels` defaults to `yt(input_points.dims.slice(0,-1))` (`yt` presumably builds a
 *    ones tensor — TODO confirm). The `prompt_encoder_mask_decoder` session then receives the
 *    two embeddings plus whichever of `input_points`, `input_labels`, `input_boxes` are truthy.
 *    NOTE(review): the default for `input_labels` reads `e.input_points.dims` unconditionally,
 *    so a call with neither `input_labels` nor `input_points` throws a TypeError even though
 *    the code below treats `input_points` as optional — confirm whether that is intended.
 *  - _call wraps the result in `cm`.
 * dm: output holder storing `iou_scores`, `pred_masks` and `object_score_logits`.
 * hm / jc:
 *  - forward(e): reads `num_feature_levels` from `config.vision_config` and computes the image
 *    embeddings when any key `image_embeddings.<i>` for i < num_feature_levels is missing from `e`.
 *    With `input_points`: throws if `input_boxes` is also given and its dims[1] is not 1, then
 *    defaults `input_labels` via `yt` and `input_boxes` via `ct([batch, 0, 4], 0)`.
 *    With only `input_boxes`: sets `input_labels` to `ct([a0, a1, 0], -1n)` and `input_points`
 *    to `ct([a0, 1, 0, 2], 0)`. With neither: throws.
 *    (`ct(shape, value)` presumably creates a filled tensor — TODO confirm.)
 */
Uc=class extends P{},xF=class extends Uc{},kF=class extends Uc{async _call(e){return new lm(await super._call(e))}},lm=class extends un{},cm=class extends Ye{constructor({iou_scores:e,pred_masks:t}){super(),this.iou_scores=e,this.pred_masks=t}},um=class extends P{},TF=class extends um{async get_image_embeddings({pixel_values:e}){return await Ir(this,{pixel_values:e})}async forward(e){!e.image_embeddings||!e.image_positional_embeddings?e={...e,...await this.get_image_embeddings(e)}:e={...e},e.input_labels??(e.input_labels=yt(e.input_points.dims.slice(0,-1)));const t={image_embeddings:e.image_embeddings,image_positional_embeddings:e.image_positional_embeddings};return e.input_points&&(t.input_points=e.input_points),e.input_labels&&(t.input_labels=e.input_labels),e.input_boxes&&(t.input_boxes=e.input_boxes),await xe(this.sessions.prompt_encoder_mask_decoder,t)}async _call(e){return new cm(await super._call(e))}},dm=class extends Ye{constructor({iou_scores:e,pred_masks:t,object_score_logits:r}){super(),this.iou_scores=e,this.pred_masks=t,this.object_score_logits=r}},hm=class extends P{},jc=class extends hm{async get_image_embeddings({pixel_values:e}){return await Ir(this,{pixel_values:e})}async forward(e){const{num_feature_levels:t}=this.config.vision_config;if(Array.from({length:t},(a,o)=>`image_embeddings.${o}`).some(a=>!e[a])?e={...e,...await this.get_image_embeddings(e)}:e={...e},e.input_points){if(e.input_boxes&&e.input_boxes.dims[1]!==1)throw new Error("When both `input_points` and `input_boxes` are provided, the number of boxes per image must be 1.");const a=e.input_points.dims;e.input_labels??(e.input_labels=yt(a.slice(0,-1))),e.input_boxes??(e.input_boxes=ct([a[0],0,4],0))}else if(e.input_boxes){const a=e.input_boxes.dims;e.input_labels=ct([a[0],a[1],0],-1n),e.input_points=ct([a[0],1,0,2],0)}else throw new Error("At least one of `input_points` or `input_boxes` must be provided.");const 
s=this.sessions.prompt_encoder_mask_decoder,n=rt(e,s.inputNames);return await xe(s,n)}async _call(e){return new dm(await super._call(e))}},EF=class extends jc{},AF=class extends jc{},Ba=class extends P{},CF=class extends Ba{},SF=class extends Ba{},PF=class extends Ba{},Ra=class extends P{},FF=class extends Ra{},LF=class extends Ra{},IF=class extends Ra{},qc=class extends P{},OF=class extends qc{},fm=class extends qc{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"text_model"})}},NF=class extends Kr{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"vision_model"})}},Wc=class extends P{},DF=class extends Wc{},zF=class extends Wc{},BF=class extends Vp{},Ga=class extends P{constructor(){super(...arguments);k(this,"main_input_name","input_values");k(this,"forward_params",["input_values"])}},RF=class extends Ga{async encode(e){return await xe(this.sessions.encoder_model,e)}async decode(e){return await xe(this.sessions.decoder_model,e)}},_m=class extends Ga{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"encoder_model"})}},pm=class extends Ga{static async from_pretrained(e,t={}){return super.from_pretrained(e,{...t,model_file_name:t.model_file_name??"decoder_model"})}},Hc=class extends P{},GF=class extends Hc{},$F=class extends Hc{},$a=class extends P{},VF=class extends $a{},UF=class extends $a{},jF=class extends $a{async generate_speech(e,t,{threshold:r=.5,minlenratio:s=0,maxlenratio:n=20,vocoder:a=null}={}){const o={input_ids:e},{encoder_outputs:i,encoder_attention_mask:l}=await Ir(this,o),c=i.dims[1]/this.config.reduction_factor,d=Math.floor(c*n),h=Math.floor(c*s),_=this.config.num_mel_bins;let p=[],w=null,v=null,y=0;for(;;){++y;const A=ap(!!v);let C;v?C=v.output_sequence_out:C=new U("float32",new Float32Array(_),[1,1,_]);let 
S={use_cache_branch:A,output_sequence:C,encoder_attention_mask:l,speaker_embeddings:t,encoder_hidden_states:i};tl(this,S,w),v=await xe(this.sessions.decoder_model_merged,S),w=el(v,w);const{prob:N,spectrum:x}=v;if(p.push(x),y>=h&&(Array.from(N.data).filter(R=>R>=r).length>0||y>=d))break}const M=ze(p),{waveform:T}=await xe(a.sessions.model,{spectrogram:M});return{spectrogram:M,waveform:T}}},qF=class extends P{constructor(){super(...arguments);k(this,"main_input_name","spectrogram")}},mn=class extends P{},WF=class extends mn{},HF=class extends mn{async _call(e){return new Je(await super._call(e))}},QF=class extends mn{async _call(e){return new ie(await super._call(e))}},XF=class extends mn{async _call(e){return new ht(await super._call(e))}},Qc=class extends P{},YF=class extends Qc{},JF=class extends Qc{},Xc=class extends P{},KF=class extends Xc{},ZF=class extends Xc{},mm=class extends P{},eL=class extends mm{},gm=class extends P{},wm=class extends gm{async generate_speech({input_ids:e,attention_mask:t,style:r,num_inference_steps:s=5,speed:n=1.05}){const{sampling_rate:a,chunk_compress_factor:o,base_chunk_size:i,latent_dim:l}=this.config,{last_hidden_state:c,durations:d}=await xe(this.sessions.text_encoder,{input_ids:e,attention_mask:t,style:r}),h=d.div(n).mul_(a),_=i*o,p=h.data,w=Int32Array.from(p,z=>Math.ceil(z/_)),v=Math.max(...w),y=e.dims[0],M=new BigInt64Array(y*v);for(let z=0;z<y;++z)M.fill(1n,z*v,z*v+w[z]);const T=new U("int64",M,[y,v]),A=l*o,C=A*v;let S=Sx([y,A,v]);const N=S.data;for(let z=0;z<y;++z)if(w[z]!==v)for(let $=0;$<A;++$)N.fill(0,z*C+$*v+w[z],z*C+($+1)*v);const x=ct([y],s);for(let z=0;z<s;++z){const $=ct([y],z);({denoised_latents:S}=await xe(this.sessions.latent_denoiser,{style:r,noisy_latents:S,latent_mask:T,encoder_outputs:c,attention_mask:t,timestep:$,num_inference_steps:x}))}const{waveform:R}=await xe(this.sessions.voice_decoder,{latents:S});return{waveform:R,durations:h}}},Va=class extends P{},tL=class extends Va{},rL=class extends Va{async 
_call(e){return new ie(await super._call(e))}},sL=class extends Va{},Yc=class extends P{},nL=class extends Yc{},aL=class extends Yc{},Jc=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","encoder_outputs","decoder_input_ids","decoder_attention_mask","past_key_values"])}},oL=class extends Jc{},iL=class extends Jc{},Kc=class extends P{},lL=class extends Kc{},cL=class extends Kc{async _call(e){return new vm(await super._call(e))}},vm=class extends xl{},ym=class extends P{},uL=class extends ym{},Ua=class extends P{},dL=class extends Ua{},hL=class extends Ua{async _call(e){return new Jr(await super._call(e))}},fL=class extends Ua{async _call(e){return new ie(await super._call(e))}},gn=class extends P{},_L=class extends gn{},pL=class extends gn{async _call(e){return new Jr(await super._call(e))}},mL=class extends gn{async _call(e){return new ie(await super._call(e))}},gL=class extends gn{async _call(e){return new He(await super._call(e))}},Zc=class extends P{},wL=class extends Zc{},vL=class extends Zc{},yL=class extends P{constructor(){super(...arguments);k(this,"main_input_name","pixel_values");k(this,"forward_params",["pixel_values","decoder_input_ids","encoder_hidden_states","past_key_values"])}},eu=class extends P{},bL=class extends eu{},ML=class extends eu{async _call(e){return new ie(await super._call(e))}},bm=class extends P{},xL=class extends bm{},tu=class extends P{},kL=class extends tu{},TL=class extends tu{async _call(e){return new ie(await super._call(e))}},Mm=class extends P{},EL=class extends Mm{async _call(e){return new ZT(await super._call(e))}},xm=class extends P{},AL=class extends xm{},km=class extends Ye{constructor({waveform:e,spectrogram:t}){super(),this.waveform=e,this.spectrogram=t}},Tm=class extends P{},CL=class extends Tm{async _call(e){return new km(await super._call(e))}},SL=class extends Xl{},Em=2,PL=1,ru=new WeakMap;function FL(e,t){var 
w,v,y;const{text_config:r,audio_config:s}=e.config,n=e.sessions.audio_encoder,{num_mel_bins:a,hidden_size:o}=s,i=a+o,l=new Ji,c=((w=n==null?void 0:n.config)==null?void 0:w.kv_cache_dtype)??"float32",d=c==="float16"?fs.float16:fs.float32,h=ba(s,{batch_size:1});for(const M in h){const T=h[M].reduce((A,C)=>A*C,1);l[M]=new U(c,new d(T),h[M])}const _=new U(c,new d(i*Em),[1,i,Em]),p=((v=t[Symbol.asyncIterator])==null?void 0:v.call(t))??((y=t[Symbol.iterator])==null?void 0:y.call(t));if(!p)throw new Error("input_features must be iterable or async iterable");return{encoder_session:n,enc_kv_cache:l,enc_padding_cache:_,enc_past_seq_len:0,audio_embed_queue:[],audio_embed_total_tokens:0,audio_queue_offset:0,audio_consumed:0,stream_exhausted:!1,chunks_iter:p,text_hidden_size:r.hidden_size}}async function LL(e,t){const r=t.dims[2],s=Math.floor((PL+r-3)/2)+1,n=new U("int64",BigInt64Array.from({length:s},(d,h)=>BigInt(e.enc_past_seq_len+h)),[1,s]),a=e.enc_past_seq_len+s,o=yt([1,a]),{audio_embeds:i,present_padding_cache:l,...c}=await xe(e.encoder_session,{input_features:t,attention_mask:o,position_ids:n,past_padding_cache:e.enc_padding_cache,...e.enc_kv_cache});e.enc_padding_cache.location==="gpu-buffer"&&e.enc_padding_cache.dispose(),e.enc_padding_cache=l;for(const d in c)if(d.startsWith("present.")){const h=d.replace("present","past_key_values"),_=e.enc_kv_cache[h];(_==null?void 0:_.location)==="gpu-buffer"&&_.dispose(),e.enc_kv_cache[h]=c[d]}return e.enc_past_seq_len=a,i}async function IL(e,t){for(;e.audio_embed_total_tokens<t&&!e.stream_exhausted;){const r=await e.chunks_iter.next();if(r.done){e.stream_exhausted=!0;break}const s=await LL(e,r.value);e.audio_embed_queue.push({data:s.data,tokens:s.dims[1]}),e.audio_embed_total_tokens+=s.dims[1]}}function OL(e,t,r){if(e.audio_embed_queue.length===0)return;const s=t.data;let n=0,a=r;for(;a>0&&e.audio_embed_queue.length>0;){const 
o=e.audio_embed_queue[0],i=o.tokens-e.audio_queue_offset,l=Math.min(a,i),c=e.audio_queue_offset*e.text_hidden_size;for(let d=0;d<l*e.text_hidden_size;++d)s[n*e.text_hidden_size+d]+=o.data[c+d];n+=l,a-=l,e.audio_queue_offset+=l,e.audio_queue_offset>=o.tokens&&(e.audio_embed_queue.shift(),e.audio_queue_offset=0)}e.audio_consumed+=r-a}var NL=class extends Ma{constructor(e){super(),this._s=e}_call(e){const t=this._s.stream_exhausted&&this._s.audio_embed_queue.length===0;return e.map(()=>t)}},Am=class extends P{constructor(){super(...arguments);k(this,"forward_params",["input_ids","attention_mask","position_ids","past_key_values"])}},Cm=class extends Am{async forward({input_ids:e,past_key_values:t,...r}){const s=e.dims[1],n=ru.get(this);n&&await IL(n,n.audio_consumed+s);const{inputs_embeds:a}=await xe(this.sessions.embed_tokens,{input_ids:e});n&&OL(n,a,s);const o={inputs_embeds:a,...r};tl(this,o,t);const i=this.sessions.decoder_model_merged,l=rt(o,i.inputNames);return await xe(i,l)}async generate({input_features:e,stopping_criteria:t,...r}){if(!e)throw new Error("input_features (generator/iterable) must be provided");const s=FL(this,e);ru.set(this,s);const n=new tp;n.push(new NL(s)),t&&n.extend(t);try{return await super.generate({...r,stopping_criteria:n})}finally{s.enc_kv_cache.dispose(),ru.delete(this)}}},ja=class extends P{},DL=class extends ja{},zL=class extends ja{async _call(e){return new Jr(await super._call(e))}},BL=class extends ja{async _call(e){return new ie(await super._call(e))}},Sm=class extends Ye{constructor({logits:e,embeddings:t}){super(),this.logits=e,this.embeddings=t}},Fs=class extends P{},RL=class extends Fs{},GL=class extends Fs{async _call(e){return new Jr(await super._call(e))}},$L=class extends Fs{async _call(e){return new ie(await super._call(e))}},VL=class extends Fs{async _call(e){return new Sm(await super._call(e))}},UL=class extends Fs{async _call(e){return new He(await super._call(e))}},Pm=class extends P{},jL=class extends Pm{},qL=class 
/* Continues the `qL` class expression begun on the previous line: a subclass of `ep` whose constructor assigns, through helper `k`, the fields named below - all null except max_initial_timestamp_index, which is 1. */
extends ep{constructor(){super(...arguments);k(this,"return_timestamps",null);k(this,"return_token_timestamps",null);k(this,"num_frames",null);k(this,"alignment_heads",null);k(this,"task",null);k(this,"language",null);k(this,"no_timestamps_token_id",null);k(this,"prompt_ids",null);k(this,"is_multilingual",null);k(this,"lang_to_id",null);k(this,"task_to_id",null);k(this,"max_initial_timestamp_index",1)}},/* `su`: base class setting requires_attention_mask=false, main_input_name="input_features" and the forward_params list. */su=class extends P{constructor(){super(...arguments);k(this,"requires_attention_mask",!1);k(this,"main_input_name","input_features");k(this,"forward_params",["input_features","attention_mask","decoder_input_ids","decoder_attention_mask","past_key_values"])}},WL=class extends su{},/* `Fm`: adds initial-token selection, timestamp options and windowed generation on top of `su`. */Fm=class extends su{/* Delegates to the parent, passing `qL` as the third argument. */_prepare_generation_config(e,t){return super._prepare_generation_config(e,t,qL)}/* Builds the initial decoder token list: the start token, then - only when e.is_multilingual - the language and task ids (language falls back to "en" with a warning, task to "transcribe"). Throws if language or task is set on a non-multilingual config. */_retrieve_init_tokens(e){const t=[e.decoder_start_token_id];let r=e.language;const s=e.task;if(e.is_multilingual){r||(ue.warn("No language specified - defaulting to English (en)."),r="en");const a=`<|${k1(r)}|>`;t.push(e.lang_to_id[a]),t.push(e.task_to_id[s??"transcribe"])}else if(r||s)throw new Error("Cannot specify `task` or `language` for an English-only model. 
If the model is intended to be multilingual, pass `is_multilingual=true` to generate, or update the generation config.");/* Append the no-timestamps token when timestamps are off, or pop it (with a warning) when they are on; then drop null/undefined ids. */return!e.return_timestamps&&e.no_timestamps_token_id&&t.at(-1)!==e.no_timestamps_token_id?t.push(e.no_timestamps_token_id):e.return_timestamps&&t.at(-1)===e.no_timestamps_token_id&&(ue.warn("<|notimestamps|> prompt token is removed from generation_config since `return_timestamps` is set to `true`."),t.pop()),t.filter(n=>n!=null)}/* Entry point: prepares the config, picks the initial tokens (caller's decoder_input_ids win over _retrieve_init_tokens), registers logits processors, then either delegates to _generate_with_seek or calls the parent generate once. Throws if token timestamps are requested without alignment_heads. */async generate({inputs:e=null,generation_config:t=null,logits_processor:r=null,stopping_criteria:s=null,...n}){t=this._prepare_generation_config(t,n);const a=n.decoder_input_ids instanceof U?Ii(n.decoder_input_ids):n.decoder_input_ids??this._retrieve_init_tokens(t);if(t.return_timestamps&&(r??(r=new Yi),r.push(new nE(t,a))),t.begin_suppress_tokens&&(r??(r=new Yi),r.push(new Z_(t.begin_suppress_tokens,a.length))),t.return_token_timestamps){if(!t.alignment_heads)throw new Error("Model generation config has no `alignment_heads`, token-level timestamps not available. 
See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config.");t.task==="translate"&&ue.warn("Token-level timestamps may not be reliable for task 'translate'."),t.output_attentions=!0,t.return_dict_in_generate=!0}/* Timestamps requested and no max_new_tokens given: take the windowed path. */if(t.return_timestamps&&!n.max_new_tokens)return this._generate_with_seek({inputs:e,generation_config:t,logits_processor:r,init_tokens:a,kwargs:n});const o=await super.generate({inputs:e,generation_config:t,logits_processor:r,decoder_input_ids:a,...n});return t.return_token_timestamps&&(o.token_timestamps=this._extract_token_timestamps(o,t.alignment_heads,t.num_frames,.02,a.length)),o}/* Generates window by window along axis 2 of `inputs`; the window width _ is 2 * config.max_source_positions. Each round advances the offset p by H and appends the kept tokens to w (and their times to v when token timestamps are on). */async _generate_with_seek({inputs:e,generation_config:t,logits_processor:r,init_tokens:s,kwargs:n}){const a=t.no_timestamps_token_id+1,o=Array.isArray(t.eos_token_id)?t.eos_token_id[0]:t.eos_token_id,i=t.return_token_timestamps,l=e,c=l.dims[2],d=2,h=this.config.max_source_positions,_=d*h;let p=0;const w=[],v=[];for(;p<c;){const M=Math.min(p+_,c),T=l.slice(null,null,[p,M]);let A;const C=T.dims[2];/* Window narrower than _: copy each row into a zero-initialised buffer of full width. */if(C<_){const W=l.dims[1],ee=new Float32Array(W*_),G=T.data;for(let L=0;L<W;++L)ee.set(G.subarray(L*C,(L+1)*C),L*_);A=new U("float32",ee,[1,W,_])}else A=T;if(r)for(const W of r)"begin_index"in W&&(W.begin_index=s.length);const S=await super.generate({inputs:A,generation_config:t,logits_processor:r,decoder_input_ids:s,...n}),x=(i?S.sequences:S)[0].tolist().map(Number).slice(s.length);let R;if(i){S.token_timestamps=this._extract_token_timestamps(S,t.alignment_heads,Math.floor((M-p)/d),.02,s.length);const W=p/d*.02;R=S.token_timestamps[0].tolist().slice(s.length).map(ee=>ee+W)}/* Strip a trailing EOS; leave the loop when the window produced nothing else. */if(x.length>0&&x.at(-1)===o&&x.pop(),x.length===0)break;/* z flags ids >= a (presumably timestamp tokens - TODO confirm); Q holds the index of the second id of every adjacent flagged pair. With such a pair and no lone trailing flagged id, only the tokens before the last pair are kept and H is derived from the id preceding it; otherwise the whole window is consumed. */const z=x.map(W=>W>=a),$=x.length>=2&&z[x.length-1]&&!z[x.length-2],Q=[];for(let W=0;W<x.length-1;++W)z[W]&&z[W+1]&&Q.push(W+1);let H,D=x.length;if(Q.length>0)if($)H=M-p;else{const W=Q.at(-1);H=(x[W-1]-a)*d,D=W}else H=M-p;const I=Math.floor(p/d),te=a+1500;for(let 
/* Flagged ids among the kept tokens are shifted by the window offset I and capped at te. */
W=0;W<D;++W)x[W]>=a&&(x[W]=Math.min(x[W]+I,te));w.push(...x.slice(0,D)),R&&v.push(...R.slice(0,D)),p+=H}/* After the last window: append EOS and prepend the initial tokens; timestamps get 0 for the initial tokens and for EOS. */w.push(o);const y=[...s,...w];if(i){const M=new U("int64",y.map(BigInt),[1,y.length]),T=[...new Array(s.length).fill(0),...v,0],A=new U("float32",new Float32Array(T),[1,T.length]);return{sequences:M,token_timestamps:A}}return new U("int64",y.map(BigInt),[1,y.length])}/* Derives one time value per token from e.cross_attentions for the [layer, head] pairs in t. r optionally limits the last axis, s multiplies the indices returned by hx, and n is the number of leading positions reported as 0. Throws when cross attentions are missing or a layer index is out of range. */_extract_token_timestamps(e,t,r=null,s=.02,n=0){if(!e.cross_attentions)throw new Error("Model outputs must contain cross attentions to extract timestamps. This is most likely because the model was not exported with `output_attentions=True`.");r==null&&ue.warn("`num_frames` has not been set, meaning the entire audio will be analyzed. This may lead to inaccurate token-level timestamps for short audios (< 30 seconds).");let a=this.config.median_filter_width;a===void 0&&(ue.warn("Model config has no `median_filter_width`, using default value of 7."),a=7);const o=e.cross_attentions,i=Array.from({length:this.config.decoder_layers},(y,M)=>ze(o.map(T=>T[M]),2)),l=dr(t.map(([y,M])=>{if(y>=i.length)throw new Error(`Layer index ${y} is out of bounds for cross attentions (length ${i.length}).`);return r?i[y].slice(null,M,null,[0,r]):i[y].slice(null,M)})).transpose(1,0,2,3),[c,d]=Cx(l,-2,0,!0),h=l.clone();/* Normalise each row in place as (x - S) / C using the two outputs of Cx, then overwrite it with ux(x, a). */for(let y=0;y<h.dims[0];++y){const M=h[y];for(let T=0;T<M.dims[0];++T){const A=M[T],C=c[y][T][0].data,S=d[y][T][0].data;for(let N=0;N<A.dims[0];++N){let x=A[N].data;for(let R=0;R<x.length;++R)x[R]=(x[R]-S[R])/C[R];x.set(ux(x,a))}}}/* NOTE(review): p below holds a single element, yet the loop that follows reads p[y] for every y < w[0]; confirm the batch dimension is always 1 here. */const _=n>0?h.slice(null,null,[n,h.dims[2]],null):h,p=[Si(_,1)],w=e.sequences.dims,v=new U("float32",new Float32Array(w[0]*w[1]),w);for(let y=0;y<w[0];++y){const M=p[y].neg().squeeze_(0),[T,A]=hx(M.tolist()),C=Array.from({length:T.length-1},(R,z)=>T[z+1]-T[z]),S=Jt([1],C).map(R=>!!R),N=[];for(let R=0;R<S.length;++R)S[R]&&N.push(A[R]*s);const x=new Array(n).fill(0);x.push(...N),N.length>0&&x.push(N.at(-1)),v[y].data.set(x)}return v}},HL=class extends Fm{},Ls=class extends P{},QL=class 
extends Ls{},/* The `Ls`, `Is`, `nu` and `au` families below are empty bases with subclasses that wrap the inherited call's result in an output class. */XL=class extends Ls{async _call(e){return new Je(await super._call(e))}},YL=class extends Ls{async _call(e){return new ie(await super._call(e))}},JL=class extends Ls{async _call(e){return new He(await super._call(e))}},KL=class extends Ls{async _call(e){return new ht(await super._call(e))}},Is=class extends P{},ZL=class extends Is{},eI=class extends Is{async _call(e){return new Je(await super._call(e))}},tI=class extends Is{async _call(e){return new ie(await super._call(e))}},rI=class extends Is{async _call(e){return new He(await super._call(e))}},sI=class extends Is{async _call(e){return new ht(await super._call(e))}},nu=class extends P{},nI=class extends nu{},aI=class extends nu{async _call(e){return new Lm(await super._call(e))}},/* Result holder with `logits` and `pred_boxes`. */Lm=class extends Ye{constructor({logits:e,pred_boxes:t}){super(),this.logits=e,this.pred_boxes=t}},au=class extends P{},oI=class extends au{},iI=class extends au{},/* `lI`: lookup table from model-type string to class-name string; the literal continues on the following lines. */lI=new Map([["bert","BertModel"],["eurobert","EuroBertModel"],["neobert","NeoBertModel"],["modernbert","ModernBertModel"],["nomic_bert","NomicBertModel"],["roformer","RoFormerModel"],["electra","ElectraModel"],["esm","EsmModel"],["convbert","ConvBertModel"],["camembert","CamembertModel"],["deberta","DebertaModel"],["deberta-v2","DebertaV2Model"],["mpnet","MPNetModel"],["albert","AlbertModel"],["distilbert","DistilBertModel"],["roberta","RobertaModel"],["xlm","XLMModel"],["xlm-roberta","XLMRobertaModel"],["clap","ClapModel"],["clip","CLIPModel"],["clipseg","CLIPSegModel"],["chinese_clip","ChineseCLIPModel"],["siglip","SiglipModel"],["jina_clip","JinaCLIPModel"],["mobilebert","MobileBertModel"],["squeezebert","SqueezeBertModel"],["wav2vec2","Wav2Vec2Model"],["wav2vec2-bert","Wav2Vec2BertModel"],["unispeech","UniSpeechModel"],["unispeech-sat","UniSpeechSatModel"],["hubert","HubertModel"],["wavlm","WavLMModel"],["audio-spectrogram-transformer","ASTModel"],["vits","VitsModel"],["pyannote","PyAnnoteModel"],["wespeaker-resnet","WeSpeakerResNetModel"],["detr","DetrModel"]
,["rt_detr","RTDetrModel"],["rt_detr_v2","RTDetrV2Model"],["rf_detr","RFDetrModel"],["d_fine","DFineModel"],["table-transformer","TableTransformerModel"],["vit","ViTModel"],["ijepa","IJepaModel"],["pvt","PvtModel"],["vit_msn","ViTMSNModel"],["vit_mae","ViTMAEModel"],["groupvit","GroupViTModel"],["fastvit","FastViTModel"],["mobilevit","MobileViTModel"],["mobilevitv2","MobileViTV2Model"],["owlvit","OwlViTModel"],["owlv2","Owlv2Model"],["beit","BeitModel"],["deit","DeiTModel"],["hiera","HieraModel"],["convnext","ConvNextModel"],["convnextv2","ConvNextV2Model"],["dinov2","Dinov2Model"],["dinov2_with_registers","Dinov2WithRegistersModel"],["dinov3_vit","DINOv3ViTModel"],["dinov3_convnext","DINOv3ConvNextModel"],["resnet","ResNetModel"],["swin","SwinModel"],["swin2sr","Swin2SRModel"],["donut-swin","DonutSwinModel"],["yolos","YolosModel"],["dpt","DPTModel"],["glpn","GLPNModel"],["hifigan","SpeechT5HifiGan"],["efficientnet","EfficientNetModel"],["decision_transformer","DecisionTransformerModel"],["patchtst","PatchTSTModel"],["patchtsmixer","PatchTSMixerModel"],["mobilenet_v1","MobileNetV1Model"],["mobilenet_v2","MobileNetV2Model"],["mobilenet_v3","MobileNetV3Model"],["mobilenet_v4","MobileNetV4Model"],["maskformer","MaskFormerModel"],["mgp-str","MgpstrForSceneTextRecognition"],["style_text_to_speech_2","StyleTextToSpeech2Model"]]),cI=new Map([["t5","T5Model"],["longt5","LongT5Model"],["mt5","MT5Model"],["bart","BartModel"],["mbart","MBartModel"],["marian","MarianModel"],["whisper","WhisperModel"],["cohere_asr","CohereAsrModel"],["m2m_100","M2M100Model"],["blenderbot","BlenderbotModel"],["blenderbot-small","BlenderbotSmallModel"]]),uI=new Map([["mimi","MimiModel"],["dac","DacModel"],["snac","SnacModel"]]),dI=new 
Map([["bloom","BloomModel"],["jais","JAISModel"],["gpt2","GPT2Model"],["gpt_oss","GptOssModel"],["gptj","GPTJModel"],["gpt_bigcode","GPTBigCodeModel"],["gpt_neo","GPTNeoModel"],["gpt_neox","GPTNeoXModel"],["codegen","CodeGenModel"],["llama","LlamaModel"],["apertus","ApertusModel"],["nanochat","NanoChatModel"],["arcee","ArceeModel"],["afmoe","AfmoeModel"],["lfm2","Lfm2Model"],["lfm2_moe","Lfm2MoeModel"],["smollm3","SmolLM3Model"],["exaone","ExaoneModel"],["olmo","OlmoModel"],["olmo2","Olmo2Model"],["olmo3","Olmo3Model"],["olmo_hybrid","OlmoHybridModel"],["mobilellm","MobileLLMModel"],["granite","GraniteModel"],["granitemoehybrid","GraniteMoeHybridModel"],["cohere","CohereModel"],["cohere2","Cohere2Model"],["gemma","GemmaModel"],["gemma2","Gemma2Model"],["vaultgemma","VaultGemmaModel"],["gemma3_text","Gemma3Model"],["helium","HeliumModel"],["glm","GlmModel"],["glm_moe_dsa","GlmMoeDsaModel"],["openelm","OpenELMModel"],["qwen2","Qwen2Model"],["qwen2_moe","Qwen2MoeModel"],["qwen3","Qwen3Model"],["qwen3_moe","Qwen3MoeModel"],["qwen3_next","Qwen3NextModel"],["phi","PhiModel"],["phi3","Phi3Model"],["mpt","MptModel"],["opt","OPTModel"],["mistral","MistralModel"],["mistral4","Mistral4Model"],["ministral","MinistralModel"],["ministral3","Ministral3Model"],["ernie4_5","Ernie4_5ForCausalLM"],["starcoder2","Starcoder2Model"],["deepseek_v3","DeepseekV3Model"],["falcon","FalconModel"],["falcon_h1","FalconH1Model"],["nemotron_h","NemotronHModel"],["solar_open","SolarOpenModel"],["stablelm","StableLmModel"],["modernbert-decoder","ModernBertDecoderModel"],["hunyuan_v1_dense","HunYuanDenseV1Model"],["youtu","YoutuModel"]]),Im=new Map([["speecht5","SpeechT5ForSpeechToText"],["whisper","WhisperForConditionalGeneration"],["lite-whisper","LiteWhisperForConditionalGeneration"],["moonshine","MoonshineForConditionalGeneration"],["cohere_asr","CohereAsrForConditionalGeneration"]]),Om=new Map([["speecht5","SpeechT5ForTextToSpeech"]]),Nm=new 
Map([["vits","VitsModel"],["musicgen","MusicgenForConditionalGeneration"],["supertonic","SupertonicForConditionalGeneration"]]),Dm=new Map([["bert","BertForSequenceClassification"],["eurobert","EuroBertForSequenceClassification"],["neobert","NeoBertForSequenceClassification"],["modernbert","ModernBertForSequenceClassification"],["roformer","RoFormerForSequenceClassification"],["electra","ElectraForSequenceClassification"],["esm","EsmForSequenceClassification"],["convbert","ConvBertForSequenceClassification"],["camembert","CamembertForSequenceClassification"],["deberta","DebertaForSequenceClassification"],["deberta-v2","DebertaV2ForSequenceClassification"],["mpnet","MPNetForSequenceClassification"],["albert","AlbertForSequenceClassification"],["distilbert","DistilBertForSequenceClassification"],["roberta","RobertaForSequenceClassification"],["xlm","XLMForSequenceClassification"],["xlm-roberta","XLMRobertaForSequenceClassification"],["bart","BartForSequenceClassification"],["mbart","MBartForSequenceClassification"],["mobilebert","MobileBertForSequenceClassification"],["squeezebert","SqueezeBertForSequenceClassification"]]),zm=new Map([["bert","BertForTokenClassification"],["eurobert","EuroBertForTokenClassification"],["neobert","NeoBertForTokenClassification"],["modernbert","ModernBertForTokenClassification"],["roformer","RoFormerForTokenClassification"],["electra","ElectraForTokenClassification"],["esm","EsmForTokenClassification"],["convbert","ConvBertForTokenClassification"],["camembert","CamembertForTokenClassification"],["deberta","DebertaForTokenClassification"],["deberta-v2","DebertaV2ForTokenClassification"],["mpnet","MPNetForTokenClassification"],["distilbert","DistilBertForTokenClassification"],["roberta","RobertaForTokenClassification"],["xlm","XLMForTokenClassification"],["xlm-roberta","XLMRobertaForTokenClassification"]]),Bm=new 
Map([["t5","T5ForConditionalGeneration"],["longt5","LongT5ForConditionalGeneration"],["mt5","MT5ForConditionalGeneration"],["bart","BartForConditionalGeneration"],["mbart","MBartForConditionalGeneration"],["marian","MarianMTModel"],["m2m_100","M2M100ForConditionalGeneration"],["blenderbot","BlenderbotForConditionalGeneration"],["blenderbot-small","BlenderbotSmallForConditionalGeneration"]]),Rm=new Map([["bloom","BloomForCausalLM"],["gpt2","GPT2LMHeadModel"],["gpt_oss","GptOssForCausalLM"],["jais","JAISLMHeadModel"],["gptj","GPTJForCausalLM"],["gpt_bigcode","GPTBigCodeForCausalLM"],["gpt_neo","GPTNeoForCausalLM"],["gpt_neox","GPTNeoXForCausalLM"],["codegen","CodeGenForCausalLM"],["llama","LlamaForCausalLM"],["nanochat","NanoChatForCausalLM"],["apertus","ApertusForCausalLM"],["llama4_text","Llama4ForCausalLM"],["arcee","ArceeForCausalLM"],["afmoe","AfmoeForCausalLM"],["lfm2","Lfm2ForCausalLM"],["lfm2_moe","Lfm2MoeForCausalLM"],["smollm3","SmolLM3ForCausalLM"],["exaone","ExaoneForCausalLM"],["olmo","OlmoForCausalLM"],["olmo2","Olmo2ForCausalLM"],["olmo3","Olmo3ForCausalLM"],["olmo_hybrid","OlmoHybridForCausalLM"],["mobilellm","MobileLLMForCausalLM"],["granite","GraniteForCausalLM"],["granitemoehybrid","GraniteMoeHybridForCausalLM"],["cohere","CohereForCausalLM"],["cohere2","Cohere2ForCausalLM"],["gemma","GemmaForCausalLM"],["gemma2","Gemma2ForCausalLM"],["vaultgemma","VaultGemmaForCausalLM"],["gemma3_text","Gemma3ForCausalLM"],["gemma3","Gemma3ForCausalLM"],["helium","HeliumForCausalLM"],["glm","GlmForCausalLM"],["glm_moe_dsa","GlmMoeDsaForCausalLM"],["openelm","OpenELMForCausalLM"],["qwen2","Qwen2ForCausalLM"],["qwen2_moe","Qwen2MoeForCausalLM"],["qwen3","Qwen3ForCausalLM"],["qwen3_moe","Qwen3MoeForCausalLM"],["qwen3_next","Qwen3NextForCausalLM"],["qwen2_vl","Qwen2VLForCausalLM"],["qwen2_5_vl","Qwen2_5_VLForCausalLM"],["qwen3_vl","Qwen3VLForCausalLM"],["qwen3_vl_moe","Qwen3VLMoeForCausalLM"],["qwen3_5","Qwen3_5ForCausalLM"],["qwen3_5_text","Qwen3_5ForCausalLM"],["qwen
3_5_moe","Qwen3_5MoeForCausalLM"],["gemma3n","Gemma3nForCausalLM"],["gemma4","Gemma4ForCausalLM"],["phi","PhiForCausalLM"],["phi3","Phi3ForCausalLM"],["mpt","MptForCausalLM"],["opt","OPTForCausalLM"],["mbart","MBartForCausalLM"],["mistral","MistralForCausalLM"],["mistral4","Mistral4ForCausalLM"],["ministral","MinistralForCausalLM"],["ministral3","Ministral3ForCausalLM"],["ernie4_5","Ernie4_5ForCausalLM"],["starcoder2","Starcoder2ForCausalLM"],["deepseek_v3","DeepseekV3ForCausalLM"],["falcon","FalconForCausalLM"],["falcon_h1","FalconH1ForCausalLM"],["nemotron_h","NemotronHForCausalLM"],["trocr","TrOCRForCausalLM"],["solar_open","SolarOpenForCausalLM"],["stablelm","StableLmForCausalLM"],["modernbert-decoder","ModernBertDecoderForCausalLM"],["hunyuan_v1_dense","HunYuanDenseV1ForCausalLM"],["youtu","YoutuForCausalLM"],["phi3_v","Phi3VForCausalLM"]]),hI=new Map([["multi_modality","MultiModalityCausalLM"]]),Gm=new Map([["bert","BertForMaskedLM"],["eurobert","EuroBertForMaskedLM"],["neobert","NeoBertForMaskedLM"],["modernbert","ModernBertForMaskedLM"],["roformer","RoFormerForMaskedLM"],["electra","ElectraForMaskedLM"],["esm","EsmForMaskedLM"],["convbert","ConvBertForMaskedLM"],["camembert","CamembertForMaskedLM"],["deberta","DebertaForMaskedLM"],["deberta-v2","DebertaV2ForMaskedLM"],["mpnet","MPNetForMaskedLM"],["albert","AlbertForMaskedLM"],["distilbert","DistilBertForMaskedLM"],["roberta","RobertaForMaskedLM"],["xlm","XLMWithLMHeadModel"],["xlm-roberta","XLMRobertaForMaskedLM"],["mobilebert","MobileBertForMaskedLM"],["squeezebert","SqueezeBertForMaskedLM"]]),$m=new 
Map([["bert","BertForQuestionAnswering"],["neobert","NeoBertForQuestionAnswering"],["roformer","RoFormerForQuestionAnswering"],["electra","ElectraForQuestionAnswering"],["convbert","ConvBertForQuestionAnswering"],["camembert","CamembertForQuestionAnswering"],["deberta","DebertaForQuestionAnswering"],["deberta-v2","DebertaV2ForQuestionAnswering"],["mpnet","MPNetForQuestionAnswering"],["albert","AlbertForQuestionAnswering"],["distilbert","DistilBertForQuestionAnswering"],["roberta","RobertaForQuestionAnswering"],["xlm","XLMForQuestionAnswering"],["xlm-roberta","XLMRobertaForQuestionAnswering"],["mobilebert","MobileBertForQuestionAnswering"],["squeezebert","SqueezeBertForQuestionAnswering"]]),Vm=new Map([["vision-encoder-decoder","VisionEncoderDecoderModel"],["idefics3","Idefics3ForConditionalGeneration"],["smolvlm","SmolVLMForConditionalGeneration"]]),Um=new Map([["llava","LlavaForConditionalGeneration"],["llava_onevision","LlavaOnevisionForConditionalGeneration"],["moondream1","Moondream1ForConditionalGeneration"],["florence2","Florence2ForConditionalGeneration"],["qwen2_vl","Qwen2VLForConditionalGeneration"],["qwen2_5_vl","Qwen2_5_VLForConditionalGeneration"],["qwen3_vl","Qwen3VLForConditionalGeneration"],["qwen3_vl_moe","Qwen3VLMoeForConditionalGeneration"],["qwen3_5","Qwen3_5ForConditionalGeneration"],["qwen3_5_moe","Qwen3_5MoeForConditionalGeneration"],["lfm2_vl","Lfm2VlForConditionalGeneration"],["idefics3","Idefics3ForConditionalGeneration"],["smolvlm","SmolVLMForConditionalGeneration"],["paligemma","PaliGemmaForConditionalGeneration"],["llava_qwen2","LlavaQwen2ForCausalLM"],["gemma3","Gemma3ForConditionalGeneration"],["gemma3n","Gemma3nForConditionalGeneration"],["gemma4","Gemma4ForConditionalGeneration"],["mistral3","Mistral3ForConditionalGeneration"],["lighton_ocr","LightOnOcrForConditionalGeneration"],["glm_ocr","GlmOcrForConditionalGeneration"]]),jm=new 
Map([["granite_speech","GraniteSpeechForConditionalGeneration"],["ultravox","UltravoxModel"],["voxtral","VoxtralForConditionalGeneration"],["voxtral_realtime","VoxtralRealtimeForConditionalGeneration"]]),fI=new Map([["vision-encoder-decoder","VisionEncoderDecoderModel"]]),qm=new Map([["vit","ViTForImageClassification"],["ijepa","IJepaForImageClassification"],["pvt","PvtForImageClassification"],["vit_msn","ViTMSNForImageClassification"],["fastvit","FastViTForImageClassification"],["mobilevit","MobileViTForImageClassification"],["mobilevitv2","MobileViTV2ForImageClassification"],["beit","BeitForImageClassification"],["deit","DeiTForImageClassification"],["hiera","HieraForImageClassification"],["convnext","ConvNextForImageClassification"],["convnextv2","ConvNextV2ForImageClassification"],["dinov2","Dinov2ForImageClassification"],["dinov2_with_registers","Dinov2WithRegistersForImageClassification"],["resnet","ResNetForImageClassification"],["swin","SwinForImageClassification"],["segformer","SegformerForImageClassification"],["efficientnet","EfficientNetForImageClassification"],["mobilenet_v1","MobileNetV1ForImageClassification"],["mobilenet_v2","MobileNetV2ForImageClassification"],["mobilenet_v3","MobileNetV3ForImageClassification"],["mobilenet_v4","MobileNetV4ForImageClassification"]]),Wm=new Map([["detr","DetrForObjectDetection"],["rt_detr","RTDetrForObjectDetection"],["rt_detr_v2","RTDetrV2ForObjectDetection"],["rf_detr","RFDetrForObjectDetection"],["d_fine","DFineForObjectDetection"],["table-transformer","TableTransformerForObjectDetection"],["yolos","YolosForObjectDetection"]]),Hm=new Map([["owlvit","OwlViTForObjectDetection"],["owlv2","Owlv2ForObjectDetection"],["grounding-dino","GroundingDinoForObjectDetection"]]),Os=new Map([["detr","DetrForSegmentation"],["clipseg","CLIPSegForImageSegmentation"]]),Qm=new 
Map([["segformer","SegformerForSemanticSegmentation"],["sapiens","SapiensForSemanticSegmentation"],["swin","SwinForSemanticSegmentation"],["mobilenet_v1","MobileNetV1ForSemanticSegmentation"],["mobilenet_v2","MobileNetV2ForSemanticSegmentation"],["mobilenet_v3","MobileNetV3ForSemanticSegmentation"],["mobilenet_v4","MobileNetV4ForSemanticSegmentation"]]),Xm=new Map([["detr","DetrForSegmentation"],["maskformer","MaskFormerForInstanceSegmentation"]]),Ym=new Map([["sam","SamModel"],["sam2","Sam2Model"],["edgetam","EdgeTamModel"],["sam3_tracker","Sam3TrackerModel"]]),Jm=new Map([["wav2vec2","Wav2Vec2ForCTC"],["wav2vec2-bert","Wav2Vec2BertForCTC"],["unispeech","UniSpeechForCTC"],["unispeech-sat","UniSpeechSatForCTC"],["wavlm","WavLMForCTC"],["hubert","HubertForCTC"],["parakeet_ctc","ParakeetForCTC"]]),Km=new Map([["wav2vec2","Wav2Vec2ForSequenceClassification"],["wav2vec2-bert","Wav2Vec2BertForSequenceClassification"],["unispeech","UniSpeechForSequenceClassification"],["unispeech-sat","UniSpeechSatForSequenceClassification"],["wavlm","WavLMForSequenceClassification"],["hubert","HubertForSequenceClassification"],["audio-spectrogram-transformer","ASTForAudioClassification"]]),Zm=new Map([["wavlm","WavLMForXVector"]]),eg=new Map([["unispeech-sat","UniSpeechSatForAudioFrameClassification"],["wavlm","WavLMForAudioFrameClassification"],["wav2vec2","Wav2Vec2ForAudioFrameClassification"],["pyannote","PyAnnoteForAudioFrameClassification"]]),tg=new Map([["vitmatte","VitMatteForImageMatting"]]),_I=new Map([["patchtst","PatchTSTForPrediction"],["patchtsmixer","PatchTSMixerForPrediction"]]),rg=new Map([["swin2sr","Swin2SRForImageSuperResolution"]]),sg=new 
Map([["chmv2","CHMv2ForDepthEstimation"],["dpt","DPTForDepthEstimation"],["depth_anything","DepthAnythingForDepthEstimation"],["glpn","GLPNForDepthEstimation"],["sapiens","SapiensForDepthEstimation"],["depth_pro","DepthProForDepthEstimation"],["metric3d","Metric3DForDepthEstimation"],["metric3dv2","Metric3Dv2ForDepthEstimation"]]),ng=new Map([["sapiens","SapiensForNormalEstimation"]]),ag=new Map([["vitpose","VitPoseForPoseEstimation"]]),og=new Map([["clip","CLIPVisionModelWithProjection"],["siglip","SiglipVisionModel"],["jina_clip","JinaCLIPVisionModel"]]),ig=[[lI,q.EncoderOnly],[cI,q.EncoderDecoder],[dI,q.DecoderOnlyWithoutHead],[uI,q.AutoEncoder],[Dm,q.EncoderOnly],[zm,q.EncoderOnly],[Bm,q.Seq2Seq],[Im,q.Seq2Seq],[Rm,q.DecoderOnly],[hI,q.MultiModality],[Gm,q.EncoderOnly],[$m,q.EncoderOnly],[Vm,q.Vision2Seq],[Um,q.ImageTextToText],[jm,q.AudioTextToText],[qm,q.EncoderOnly],[Os,q.EncoderOnly],[Xm,q.EncoderOnly],[Qm,q.EncoderOnly],[tg,q.EncoderOnly],[_I,q.EncoderOnly],[rg,q.EncoderOnly],[sg,q.EncoderOnly],[ng,q.EncoderOnly],[ag,q.EncoderOnly],[Wm,q.EncoderOnly],[Hm,q.EncoderOnly],[Ym,q.MaskGeneration],[Jm,q.EncoderOnly],[Km,q.EncoderOnly],[Om,q.Seq2Seq],[Nm,q.EncoderOnly],[Zm,q.EncoderOnly],[eg,q.EncoderOnly],[og,q.EncoderOnly]];for(const[e,t]of ig)for(const r of e.values()){_r.set(r,t);const s=sl[r];vs.set(s,r),Zi.set(r,s)}var 
pI=[["MusicgenForConditionalGeneration",em,q.Musicgen],["Phi3VForCausalLM",nm,q.Phi3V],["CLIPTextModelWithProjection",vp,q.EncoderOnly],["SiglipTextModel",fm,q.EncoderOnly],["JinaCLIPTextModel",Up,q.EncoderOnly],["ClapTextModelWithProjection",gp,q.EncoderOnly],["ClapAudioModelWithProjection",wp,q.EncoderOnly],["DacEncoderModel",Mp,q.EncoderOnly],["DacDecoderModel",xp,q.EncoderOnly],["MimiEncoderModel",Jp,q.EncoderOnly],["MimiDecoderModel",Kp,q.EncoderOnly],["SnacEncoderModel",_m,q.EncoderOnly],["SnacDecoderModel",pm,q.EncoderOnly],["Gemma3nForConditionalGeneration",Fa,q.ImageAudioTextToText],["Gemma4ForConditionalGeneration",Nl,q.ImageAudioTextToText],["SupertonicForConditionalGeneration",wm,q.Supertonic],["ChatterboxModel",_p,q.Chatterbox],["VoxtralRealtimeForConditionalGeneration",Cm,q.VoxtralRealtime]];for(const[e,t,r]of pI)_r.set(e,r),vs.set(t,e),Zi.set(e,t);var lg=new Map([["modnet",Os],["birefnet",Os],["isnet",Os],["ben",Os]]);for(const[e,t]of lg.entries())t.set(e,"PreTrainedModel"),_r.set(e,q.EncoderOnly),Zi.set(e,P);var mI=new Set(lg.keys());_r.set("PreTrainedModel",q.EncoderOnly),vs.set(P,"PreTrainedModel");var 
Ae={MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES:Dm,MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES:zm,MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES:Om,MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES:Nm,MODEL_FOR_MASKED_LM_MAPPING_NAMES:Gm,MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES:$m,MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES:qm,MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES:Os,MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES:Qm,MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES:Xm,MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES:Wm,MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES:Hm,MODEL_FOR_MASK_GENERATION_MAPPING_NAMES:Ym,MODEL_FOR_CTC_MAPPING_NAMES:Jm,MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES:Km,MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES:Zm,MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES:eg,MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES:fI,MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES:tg,MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES:rg,MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES:sg,MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES:ng,MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES:ag,MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES:og,MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES:Um,MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES:jm,MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES:Bm,MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES:Im,MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:Rm,MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES:Vm};yE(Ae);var Ce=(oo=class{static supports(e){if(!this.MODEL_CLASS_MAPPINGS)return!1;for(const t of this.MODEL_CLASS_MAPPINGS)if(t.has(e))return!0;return this.BASE_IF_FAIL}static async from_pretrained(e,{progress_callback:t=null,config:r=null,cache_dir:s=null,local_files_only:n=!1,revision:a="main",model_file_name:o=null,subfolder:i="onnx",device:l=null,dtype:c=null,use_external_data_format:d=null,session_options:h={}}={}){const _={progress_callback:t,config:r,cache_dir:s,local_files_only:n,revision:a,model_file_name:o,subfolder:i,device:l,dtype:c,use_external_data_format:d,session_options:h};if(_.config=await 
on.from_pretrained(e,_),!this.MODEL_CLASS_MAPPINGS)throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: "+this.name);const{model_type:p}=_.config;for(const w of this.MODEL_CLASS_MAPPINGS){let v=w.get(p);if(!v){for(const y of w.values())if(y[0]===p){v=y;break}if(!v)continue}return await sl[v].from_pretrained(e,_)}if(this.BASE_IF_FAIL)return mI.has(p)||ue.warn(`Unknown model class "${p}", attempting to construct from base class.`),await P.from_pretrained(e,_);throw Error(`Unsupported model type: ${p}`)}},k(oo,"MODEL_CLASS_MAPPINGS",null),k(oo,"BASE_IF_FAIL",!1),oo),wn=(io=class extends Ce{},k(io,"MODEL_CLASS_MAPPINGS",ig.map(e=>e[0])),k(io,"BASE_IF_FAIL",!0),io),cg=(_u=class extends Ce{},k(_u,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES]),_u),gI=(pu=class extends Ce{},k(pu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES]),pu),ou=(mu=class extends Ce{},k(mu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES]),mu),wI=(gu=class extends Ce{},k(gu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES]),gu),vI=(wu=class extends Ce{},k(wu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES]),wu),yI=(vu=class extends Ce{},k(vu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES]),vu),bI=(yu=class extends Ce{},k(yu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_CAUSAL_LM_MAPPING_NAMES]),yu),MI=(bu=class extends Ce{},k(bu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_MASKED_LM_MAPPING_NAMES]),bu),xI=(Mu=class extends Ce{},k(Mu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES]),Mu),kI=(xu=class extends Ce{},k(xu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES]),xu),TI=(ku=class extends Ce{},k(ku,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES]),ku),ug=(Tu=class extends Ce{},k(Tu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES]),Tu),dg=(Eu=class extends 
Ce{},k(Eu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES]),Eu),hg=(Au=class extends Ce{},k(Au,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES]),Au),EI=(Cu=class extends Ce{},k(Cu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES]),Cu),AI=(Su=class extends Ce{},k(Su,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES]),Su);Pu=class extends Ce{},k(Pu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_MASK_GENERATION_MAPPING_NAMES]);var CI=(Fu=class extends Ce{},k(Fu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_CTC_MAPPING_NAMES]),Fu),SI=(Lu=class extends Ce{},k(Lu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES]),Lu);Iu=class extends Ce{},k(Iu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES]),Ou=class extends Ce{},k(Ou,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES]);var PI=(Nu=class extends Ce{},k(Nu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES]),Nu);Du=class extends Ce{},k(Du,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES]);var FI=(zu=class extends Ce{},k(zu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES]),zu),LI=(Bu=class extends Ce{},k(Bu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES]),Bu);Ru=class extends Ce{},k(Ru,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_NORMAL_ESTIMATION_MAPPING_NAMES]),Gu=class extends Ce{},k(Gu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_POSE_ESTIMATION_MAPPING_NAMES]);var II=($u=class extends Ce{},k($u,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_FEATURE_EXTRACTION_MAPPING_NAMES]),$u);Vu=class extends Ce{},k(Vu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES]),Uu=class extends Ce{},k(Uu,"MODEL_CLASS_MAPPINGS",[Ae.MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES]);async function Ht(e){return Array.isArray(e)||(e=[e]),await Promise.all(e.map(t=>qt.read(t)))}async function Ns(e,t){return 
Array.isArray(e)||(e=[e]),await Promise.all(e.map(r=>typeof r=="string"||r instanceof URL?I1(r,t):r instanceof Float64Array?new Float32Array(r):r))}function iu(e,t){t&&(e=e.map(o=>o|0));const[r,s,n,a]=e;return{xmin:r,ymin:s,xmax:n,ymax:a}}var Ve=class extends vt{constructor({task:e,model:t,tokenizer:r=null,processor:s=null}){super(),this.task=e,this.model=t,this.tokenizer=r,this.processor=s}async dispose(){await this.model.dispose()}},OI=class extends Ve{async _call(e,{top_k:t=1}={}){const r=this.tokenizer(e,{padding:!0,truncation:!0}),s=await this.model(r),{problem_type:n,id2label:a}=this.model.config,o=n==="multi_label_classification"?l=>l.sigmoid():l=>new U("float32",nt(l.data),l.dims),i=[];for(const l of s.logits){const c=o(l),d=await _s(c,t),h=d[0].tolist(),p=d[1].tolist().map((w,v)=>({label:a?a[w]:`LABEL_${w}`,score:h[v]}));t===1?i.push(...p):i.push(p)}return Array.isArray(e)||t===1?i:i[0]}},NI=class extends Ve{async _call(e,{ignore_labels:t=["O"]}={}){const r=Array.isArray(e),s=this.tokenizer(r?e:[e],{padding:!0,truncation:!0}),a=(await this.model(s)).logits,o=this.model.config.id2label,i=[];for(let l=0;l<a.dims[0];++l){const c=s.input_ids[l],d=a[l],h=[];for(let _=0;_<d.dims[0];++_){const p=d[_],w=je(p.data)[1],v=o?o[w]:`LABEL_${w}`;if(t.includes(v))continue;const y=this.tokenizer.decode([c[_].item()],{skip_special_tokens:!0});if(y==="")continue;const M=nt(p.data);h.push({entity:v,score:M[w],index:_,word:y})}i.push(h)}return r?i:i[0]}},DI=class extends Ve{async _call(e,t,{top_k:r=1}={}){const s=this.tokenizer(e,{text_pair:t,padding:!0,truncation:!0}),n=Array.isArray(e),{start_logits:a,end_logits:o}=await this.model(s),i=s.input_ids.tolist(),l=s.attention_mask.tolist(),{all_special_ids:c,sep_token_id:d}=this.tokenizer,h=[];for(let _=0;_<a.dims[0];++_){const p=i[_],w=p.findIndex(S=>S==d),v=a[_].tolist(),y=o[_].tolist();for(let S=1;S<v.length;++S)(l[_]==0||S<=w||c.findIndex(N=>N==p[S])!==-1)&&(v[S]=-1/0,y[S]=-1/0);const 
M=nt(v).map((S,N)=>[S,N]),T=nt(y).map((S,N)=>[S,N]);M[0][0]=0,T[0][0]=0;const A=Ny(M,T).filter(S=>S[0][1]<=S[1][1]).map(S=>[S[0][1],S[1][1],S[0][0]*S[1][0]]).sort((S,N)=>N[2]-S[2]),C=[];for(let S=0;S<Math.min(A.length,r);++S){const[N,x,R]=A[S],z=p.slice(N,x+1),$=this.tokenizer.decode(z,{skip_special_tokens:!0});C.push({answer:$,score:R})}r===1?h.push(...C):h.push(C)}return n?h:h[0]}},zI=class extends Ve{async _call(e,{top_k:t=5}={}){const{mask_token_id:r,mask_token:s}=this.tokenizer,n=this.tokenizer(e,{padding:!0,truncation:!0}),{logits:a}=await this.model(n),o=[],i=n.input_ids.tolist();for(let l=0;l<i.length;++l){const c=i[l],d=c.findIndex(v=>v==r);if(d===-1)throw Error(`Mask token (${s}) not found in text.`);const h=a[l][d],_=await _s(new U("float32",nt(h.data),h.dims),t),p=_[0].tolist(),w=_[1].tolist();o.push(w.map((v,y)=>{const M=c.slice();return M[d]=v,{score:p[y],token:Number(v),token_str:this.tokenizer.decode([v]),sequence:this.tokenizer.decode(M,{skip_special_tokens:!0})}}))}return Array.isArray(e)?o:o[0]}},lu=class extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{max_new_tokens:256});k(this,"_key","generated_text")}async _call(t,r={}){Array.isArray(t)||(t=[t]),this.model.config.prefix&&(t=t.map(l=>this.model.config.prefix+l));const s=this.model.config.task_specific_params;s&&s[this.task]&&s[this.task].prefix&&(t=t.map(l=>s[this.task].prefix+l));const n=this.tokenizer,a={padding:!0,truncation:!0};let o;this.task==="translation"&&"_build_translation_inputs"in n?o=n._build_translation_inputs(t,a,r):o=n(t,a);const i=await this.model.generate({...o,...this._default_generation_config,...r});return n.batch_decode(i,{skip_special_tokens:!0}).map(l=>({[this._key]:l}))}},BI=class extends lu{constructor(){super(...arguments);k(this,"_key","summary_text")}},RI=class extends lu{constructor(){super(...arguments);k(this,"_key","translation_text")}};function fg(e){return Array.isArray(e)&&e.every(t=>"role"in t&&"content"in t)}var GI=class 
extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{max_new_tokens:256})}async _call(t,r={}){let s=!1,n=!1,a=r.add_special_tokens??(this.tokenizer.add_bos_token||this.tokenizer.add_eos_token)??!1,o=r.tokenizer_encode_kwargs,i;if(typeof t=="string")i=t=[t];else if(Array.isArray(t)&&t.every(w=>typeof w=="string"))s=!0,i=t;else{if(fg(t))t=[t];else if(Array.isArray(t)&&t.every(fg))s=!0;else throw new Error("Input must be a string, an array of strings, a Chat, or an array of Chats");n=!0,i=t.map(w=>this.tokenizer.apply_chat_template(w,{tokenize:!1,add_generation_prompt:!0,...o})),a=!1,o=void 0}const l=n?!1:r.return_full_text??!0;this.tokenizer.padding_side="left";const c=this.tokenizer(i,{add_special_tokens:a,padding:!0,truncation:!0,...o}),d=await this.model.generate({...c,...this._default_generation_config,...r}),h=this.tokenizer.batch_decode(d,{skip_special_tokens:!0});let _;!l&&c.input_ids.dims.at(-1)>0&&(_=this.tokenizer.batch_decode(c.input_ids,{skip_special_tokens:!0}).map(w=>w.length));const p=Array.from({length:t.length},w=>[]);for(let w=0;w<h.length;++w){const v=Math.floor(w/d.dims[0]*t.length);_&&(h[w]=h[w].slice(_[v])),p[v].push({generated_text:n?[...t[v],{role:"assistant",content:h[w]}]:h[w]})}return!s&&p.length===1?p[0]:p}},$I=class extends Ve{constructor(e){super(e),this.label2id=Object.fromEntries(Object.entries(this.model.config.label2id).map(([t,r])=>[t.toLowerCase(),r])),this.entailment_id=this.label2id.entailment,this.entailment_id===void 0&&(ue.warn("Could not find 'entailment' in label2id mapping. Using 2 as entailment_id."),this.entailment_id=2),this.contradiction_id=this.label2id.contradiction??this.label2id.not_entailment,this.contradiction_id===void 0&&(ue.warn("Could not find 'contradiction' in label2id mapping. 
Using 0 as contradiction_id."),this.contradiction_id=0)}async _call(e,t,{hypothesis_template:r="This example is {}.",multi_label:s=!1}={}){const n=Array.isArray(e);n||(e=[e]),Array.isArray(t)||(t=[t]);const a=t.map(l=>r.replace("{}",l)),o=s||t.length===1,i=[];for(const l of e){const c=[];for(const _ of a){const p=this.tokenizer(l,{text_pair:_,padding:!0,truncation:!0}),w=await this.model(p);o?c.push([w.logits.data[this.contradiction_id],w.logits.data[this.entailment_id]]):c.push(w.logits.data[this.entailment_id])}const h=(o?c.map(_=>nt(_)[1]):nt(c)).map((_,p)=>[_,p]).sort((_,p)=>p[0]-_[0]);i.push({sequence:l,labels:h.map(_=>t[_[1]]),scores:h.map(_=>_[0])})}return n?i:i[0]}},VI=class extends Ve{async _call(e,{top_k:t=5}={}){const r=this.processor.feature_extractor.config.sampling_rate,s=await Ns(e,r),n=this.model.config.id2label,a=[];for(const o of s){const i=await this.processor(o),c=(await this.model(i)).logits[0],d=await _s(new U("float32",nt(c.data),c.dims),t),h=d[0].tolist(),p=d[1].tolist().map((w,v)=>({label:n?n[w]:`LABEL_${w}`,score:h[v]}));a.push(p)}return Array.isArray(e)?a:a[0]}},UI=class extends Ve{async _call(e,t,{hypothesis_template:r="This is a sound of {}."}={}){const s=!Array.isArray(e);s&&(e=[e]);const n=t.map(c=>r.replace("{}",c)),a=this.tokenizer(n,{padding:!0,truncation:!0}),o=this.processor.feature_extractor.config.sampling_rate,i=await Ns(e,o),l=[];for(const c of i){const d=await this.processor(c),h=await this.model({...a,...d}),_=nt(h.logits_per_audio.data);l.push([..._].map((p,w)=>({score:p,label:t[w]})))}return s?l[0]:l}},jI=class extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{})}async _call(t,r={}){switch(r={...this._default_generation_config,...r},this.model.config.model_type){case"whisper":case"lite-whisper":return this._call_whisper(t,r);case"wav2vec2":case"wav2vec2-bert":case"unispeech":case"unispeech-sat":case"hubert":case"parakeet_ctc":return this._call_wav2vec2(t,r);case"moonshine":return 
this._call_moonshine(t,r);case"cohere_asr":return this._call_cohere_asr(t,r);default:throw new Error(`AutomaticSpeechRecognitionPipeline does not support model type '${this.model.config.model_type}'.`)}}async _call_wav2vec2(t,r){r.language&&ue.warn('`language` parameter is not yet supported for `wav2vec2` models, defaulting to "English".'),r.task&&ue.warn('`task` parameter is not yet supported for `wav2vec2` models, defaulting to "transcribe".');const s=!Array.isArray(t),n=s?[t]:t,a=this.processor.feature_extractor.config.sampling_rate,o=await Ns(n,a),i=[];for(const l of o){const c=await this.processor(l),h=(await this.model(c)).logits[0],_=[];for(const w of h)_.push(je(w.data)[1]);const p=this.tokenizer.decode(_,{skip_special_tokens:!0}).trim();i.push({text:p})}return s?i[0]:i}async _call_whisper(t,r){const s=r.return_timestamps??!1,n=r.chunk_length_s??0,a=r.force_full_sequences??!1;let o=r.stride_length_s??null;const i={...r};s==="word"&&(i.return_token_timestamps=!0,i.return_timestamps=!0);const l=!Array.isArray(t),c=l?[t]:t,d=this.processor.feature_extractor.config,h=d.chunk_length/this.model.config.max_source_positions,_=d.hop_length,p=d.sampling_rate,w=await Ns(c,p),v=[];for(const y of w){let M=[];if(n>0){if(o===null)o=n/6;else if(n<=o)throw Error("`chunk_length_s` must be larger than `stride_length_s`.");const C=p*n,S=p*o,N=C-2*S;let x=0;for(;;){const R=x+C,z=y.subarray(x,R),$=await this.processor(z),Q=x===0,H=R>=y.length;if(M.push({stride:[z.length,Q?0:S,H?0:S],input_features:$.input_features,is_last:H}),H)break;x+=N}}else M=[{stride:[y.length,0,0],input_features:(await this.processor(y)).input_features,is_last:!0}];for(const C of M){i.num_frames=Math.floor(C.stride[0]/_);const S=await this.model.generate({inputs:C.input_features,...i});if(s==="word"){const 
N=S.sequences.tolist()[0],x=S.token_timestamps.tolist()[0],R=this.tokenizer.timestamp_begin,z=Math.max(N.findIndex($=>Number($)>=R),0);C.tokens=N.slice(z),C.token_timestamps=x.slice(z).map($=>sn($,2))}else C.tokens=S[0].tolist();C.stride=C.stride.map(N=>N/p)}const[T,A]=this.tokenizer._decode_asr(M,{time_precision:h,return_timestamps:s,force_full_sequences:a});v.push({text:T,...A})}return l?v[0]:v}async _call_moonshine(t,r){const s=!Array.isArray(t),n=s?[t]:t,a=this.processor.feature_extractor.config.sampling_rate,o=await Ns(n,a),i=[];for(const l of o){const c=await this.processor(l),d=Math.floor(l.length/a)*6,h=await this.model.generate({max_new_tokens:d,...r,...c}),_=this.processor.batch_decode(h,{skip_special_tokens:!0})[0];i.push({text:_})}return s?i[0]:i}async _call_cohere_asr(t,r){const s=!Array.isArray(t),n=s?[t]:t,a=this.processor.feature_extractor,o=a.config.sampling_rate,i=await Ns(n,o),l=r.language??"en",c=this.processor.get_decoder_prompt_ids(l),d=[];for(const h of i){const _=a.split_audio(h),p=[];for(const v of _){const y=await this.processor(v),M=await this.model.generate({...y,decoder_input_ids:c,...r}),T=this.tokenizer.decode(M[0].tolist(),{skip_special_tokens:!0}).trim();p.push(T)}const w=this.processor.constructor.join_chunks(p,l);d.push({text:w})}return s?d[0]:d}},qI=class extends Ve{constructor(t){super(t);k(this,"DEFAULT_VOCODER_ID","Xenova/speecht5_hifigan");this.vocoder=t.vocoder??null}async _prepare_speaker_embeddings(t,r){if((typeof t=="string"||t instanceof URL)&&(t=new Float32Array(await(await be.fetch(t)).arrayBuffer())),t instanceof Float32Array)t=new U("float32",t,[t.length]);else if(!(t instanceof U))throw new Error("Speaker embeddings must be a `Tensor`, `Float32Array`, `string`, or `URL`.");if(r>1){if(t.dims[0]===1)t=t.repeat(r,1);else if(t.dims[0]!==r)throw new Error(`Expected speaker embeddings batch size to be 1 or ${r}, but got ${t.dims[0]}.`)}return t}_postprocess_waveform(t,r,s,n=null){const 
a=r.data,[o,i]=r.dims,l=n?n.data:null,c=[];for(let d=0;d<o;++d){const h=l?Math.min(Math.ceil(l[d]),i):i,_=d*i;c.push(new U1(a.slice(_,_+h),s))}return Array.isArray(t)?c:c[0]}async _call(t,r){return this.processor?this._call_text_to_spectrogram(t,r):this.model.config.model_type==="supertonic"?this._call_supertonic(t,r):this._call_text_to_waveform(t)}async _call_supertonic(t,{speaker_embeddings:r,num_inference_steps:s,speed:n}){if(!r)throw new Error("Speaker embeddings must be provided for Supertonic models.");const{sampling_rate:a,style_dim:o}=this.model.config,i=this.tokenizer(t,{padding:!0,truncation:!0}),l=i.input_ids.dims[0];r=await this._prepare_speaker_embeddings(r,l),r=r.view(l,-1,o);const{waveform:c,durations:d}=await this.model.generate_speech({...i,style:r,num_inference_steps:s,speed:n});return this._postprocess_waveform(t,c,a,d)}async _call_text_to_waveform(t){const r=this.tokenizer(t,{padding:!0,truncation:!0}),{waveform:s}=await this.model(r),n=this.model.config.sampling_rate;return this._postprocess_waveform(t,s,n)}async _call_text_to_spectrogram(t,{speaker_embeddings:r}){this.vocoder||(ue.info("No vocoder specified, using default HifiGan vocoder."),this.vocoder=await wn.from_pretrained(this.DEFAULT_VOCODER_ID,{dtype:"fp32"}));const{input_ids:s}=this.tokenizer(t,{padding:!0,truncation:!0}),n=s.dims[0];r=await this._prepare_speaker_embeddings(r,n),r=r.view(n,-1);const{waveform:a}=await this.model.generate_speech(s,r,{vocoder:this.vocoder}),o=this.processor.feature_extractor.config.sampling_rate;return this._postprocess_waveform(t,a,o)}},WI=class extends Ve{async _call(e,t={}){const r=Array.isArray(e),s=await Ht(e),{pixel_values:n}=await this.processor(s),a=[];for(const o of n){o.dims=[1,...o.dims];const i=await this.model.generate({inputs:o,...t}),l=this.tokenizer.batch_decode(i,{skip_special_tokens:!0}).map(c=>({generated_text:c.trim()}));a.push(l)}return r?a:a[0]}},HI=class extends Ve{async _call(e,{top_k:t=5}={}){const r=await 
Ht(e),{pixel_values:s}=await this.processor(r),n=await this.model({pixel_values:s}),{id2label:a}=this.model.config,o=[];for(const i of n.logits){const l=await _s(new U("float32",nt(i.data),i.dims),t),c=l[0].tolist(),h=l[1].tolist().map((_,p)=>({label:a?a[_]:`LABEL_${_}`,score:c[p]}));o.push(h)}return Array.isArray(e)?o:o[0]}},_g={panoptic:"post_process_panoptic_segmentation",instance:"post_process_instance_segmentation",semantic:"post_process_semantic_segmentation"},pg=class extends Ve{async _call(e,{threshold:t=.5,mask_threshold:r=.5,overlap_mask_area_threshold:s=.8,label_ids_to_fuse:n=null,target_sizes:a=null,subtask:o=null}={}){if(Array.isArray(e)&&e.length!==1)throw Error("Image segmentation pipeline currently only supports a batch size of 1.");const l=await Ht(e),c=l.map(M=>[M.height,M.width]),d=await this.processor(l),{inputNames:h,outputNames:_}=this.model.sessions.model;if(!h.includes("pixel_values")){if(h.length!==1)throw Error(`Expected a single input name, but got ${h.length} inputs: ${h}.`);const M=h[0];if(M in d)throw Error(`Input name ${M} already exists in the inputs.`);d[M]=d.pixel_values}const p=await this.model(d);let w=null;if(o!==null)w=_g[o];else if(this.processor.image_processor){for(const[M,T]of Object.entries(_g))if(T in this.processor.image_processor){w=this.processor.image_processor[T].bind(this.processor.image_processor),o=M;break}}const v=this.model.config.id2label,y=[];if(o)if(o==="panoptic"||o==="instance"){const M=w(p,t,r,s,n,a??c)[0],T=M.segmentation;for(const A of M.segments_info){const C=new Uint8ClampedArray(T.data.length);for(let N=0;N<T.data.length;++N)T.data[N]===A.id&&(C[N]=255);const S=new qt(C,T.dims[1],T.dims[0],1);y.push({score:A.score,label:v[A.label_id],mask:S})}}else if(o==="semantic"){const{segmentation:M,labels:T}=w(p,a??c)[0];for(const A of T){const C=new Uint8ClampedArray(M.data.length);for(let N=0;N<M.data.length;++N)M.data[N]===A&&(C[N]=255);const S=new 
qt(C,M.dims[1],M.dims[0],1);y.push({score:null,label:v[A],mask:S})}}else throw Error(`Subtask ${o} not supported.`);else{const T=p[_[0]];for(let A=0;A<c.length;++A){const C=c[A],S=T[A];S.data.some(x=>x<-1e-5||x>1+1e-5)&&S.sigmoid_();const N=await qt.fromTensor(S.mul_(255).to("uint8")).resize(C[1],C[0]);y.push({label:null,score:null,mask:N})}}return y}},QI=class extends pg{async _call(e,t={}){const r=await Ht(e),s=await super._call(e,t),n=r.map((a,o)=>{const i=a.clone();return i.putAlpha(s[o].mask),i});return Array.isArray(e)?n:n[0]}},XI=class extends Ve{async _call(e,t,{hypothesis_template:r="This is a photo of {}"}={}){const s=Array.isArray(e),n=await Ht(e),a=t.map(h=>r.replace("{}",h)),o=this.tokenizer(a,{padding:this.model.config.model_type==="siglip"?"max_length":!0,truncation:!0}),{pixel_values:i}=await this.processor(n),l=await this.model({...o,pixel_values:i}),c=this.model.config.model_type==="siglip"?h=>h.sigmoid().data:h=>nt(h.data),d=[];for(const h of l.logits_per_image){const p=[...c(h)].map((w,v)=>({score:w,label:t[v]}));p.sort((w,v)=>v.score-w.score),d.push(p)}return s?d:d[0]}},YI=class extends Ve{async _call(e,{threshold:t=.9,percentage:r=!1}={}){const s=Array.isArray(e);if(s&&e.length!==1)throw Error("Object detection pipeline currently only supports a batch size of 1.");const n=await Ht(e),a=r?null:n.map(_=>[_.height,_.width]),{pixel_values:o,pixel_mask:i}=await this.processor(n),l=await this.model({pixel_values:o,pixel_mask:i}),c=this.processor.image_processor.post_process_object_detection(l,t,a),{id2label:d}=this.model.config,h=c.map(_=>_.boxes.map((p,w)=>({score:_.scores[w],label:d[_.classes[w]],box:iu(p,!r)})));return s?h:h[0]}},JI=class extends Ve{async _call(e,t,{threshold:r=.1,top_k:s=null,percentage:n=!1}={}){const a=Array.isArray(e),o=await Ht(e),i=this.tokenizer(t,{padding:!0,truncation:!0}),l=await this.processor(o),c=[];for(let d=0;d<o.length;++d){const h=o[d],_=n?null:[[h.height,h.width]],p=l.pixel_values[d].unsqueeze_(0),w=await 
this.model({...i,pixel_values:p});let v;if("post_process_grounded_object_detection"in this.processor){const y=this.processor.post_process_grounded_object_detection(w,i.input_ids,{box_threshold:r,text_threshold:r,target_sizes:_})[0];v=y.boxes.map((M,T)=>({score:y.scores[T],label:y.labels[T],box:iu(M,!n)}))}else{const y=this.processor.image_processor.post_process_object_detection(w,r,_,!0)[0];v=y.boxes.map((M,T)=>({score:y.scores[T],label:t[y.classes[T]],box:iu(M,!n)}))}v.sort((y,M)=>M.score-y.score),s!==null&&(v=v.slice(0,s)),c.push(v)}return a?c:c[0]}},KI=class extends Ve{constructor(){super(...arguments);k(this,"_default_generation_config",{max_new_tokens:256})}async _call(t,r,s={}){if(Array.isArray(t)){if(t.length!==1)throw Error("Document Question Answering pipeline currently only supports a batch size of 1.");t=t[0]}const n=(await Ht(t))[0],{pixel_values:a}=await this.processor(n),o=`<s_docvqa><s_question>${r}</s_question><s_answer>`,i=this.tokenizer(o,{add_special_tokens:!1,padding:!0,truncation:!0}).input_ids,l=await this.model.generate({inputs:a,max_length:this.model.config.decoder.max_position_embeddings,decoder_input_ids:i,...this._default_generation_config,...s}),d=this.tokenizer.batch_decode(l)[0].match(/<s_answer>(.*?)<\/s_answer>/);let h=null;return d&&d.length>=2&&(h=d[1].trim()),[{answer:h}]}},ZI=class extends Ve{async _call(e){const t=await Ht(e),r=await this.processor(t),s=await this.model(r),n=[];for(const a of s.reconstruction){const o=a.squeeze().clamp_(0,1).mul_(255).round_().to("uint8");n.push(qt.fromTensor(o))}return Array.isArray(e)?n:n[0]}},eO=class extends Ve{async _call(e){const t=await Ht(e),r=await this.processor(t),{predicted_depth:s}=await this.model(r),n=[];for(let a=0;a<t.length;++a){const o=s[a],[i,l]=o.dims.slice(-2),[c,d]=t[a].size,h=(await 
jt(o.view(1,1,i,l),{size:[d,c],mode:"bilinear"})).view(d,c),_=h.min().item(),p=h.max().item(),w=h.sub(_).div_(p-_).mul_(255).to("uint8").unsqueeze(0),v=qt.fromTensor(w);n.push({predicted_depth:h,depth:v})}return Array.isArray(e)?n:n[0]}},tO=class extends Ve{async _call(e,{pooling:t="none",normalize:r=!1,quantize:s=!1,precision:n="binary"}={}){const a=this.tokenizer(e,{padding:!0,truncation:!0}),o=await this.model(a);let i=o.last_hidden_state??o.logits??o.token_embeddings;switch(t){case"none":break;case"mean":i=Ax(i,a.attention_mask);break;case"first_token":case"cls":i=i.slice(null,0);break;case"last_token":case"eos":i=i.slice(null,-1);break;default:throw Error(`Pooling method '${t}' not supported.`)}return r&&(i=i.normalize(2,-1)),s&&(i=Px(i,n)),i}},rO=class extends Ve{async _call(e,{pool:t=null}={}){const r=await Ht(e),{pixel_values:s}=await this.processor(r),n=await this.model({pixel_values:s});let a;if(t){if(!("pooler_output"in n))throw Error("No pooled output was returned. Make sure the model has a 'pooler' layer when using the 'pool' option.");a=n.pooler_output}else a=n.last_hidden_state??n.logits??n.image_embeds;return 
a}},qa=Object.freeze({"text-classification":{pipeline:OI,model:cg,default:{model:"Xenova/distilbert-base-uncased-finetuned-sst-2-english"},type:"text"},"token-classification":{pipeline:NI,model:gI,default:{model:"Xenova/bert-base-multilingual-cased-ner-hrl"},type:"text"},"question-answering":{pipeline:DI,model:xI,default:{model:"Xenova/distilbert-base-cased-distilled-squad"},type:"text"},"fill-mask":{pipeline:zI,model:MI,default:{model:"onnx-community/ettin-encoder-32m-ONNX",dtype:"fp32"},type:"text"},summarization:{pipeline:BI,model:ou,default:{model:"Xenova/distilbart-cnn-6-6"},type:"text"},translation:{pipeline:RI,model:ou,default:{model:"Xenova/t5-small"},type:"text"},"text2text-generation":{pipeline:lu,model:ou,default:{model:"Xenova/flan-t5-small"},type:"text"},"text-generation":{pipeline:GI,model:bI,default:{model:"onnx-community/Qwen3-0.6B-ONNX",dtype:"q4"},type:"text"},"zero-shot-classification":{pipeline:$I,model:cg,default:{model:"Xenova/distilbert-base-uncased-mnli"},type:"text"},"audio-classification":{pipeline:VI,model:SI,default:{model:"Xenova/wav2vec2-base-superb-ks"},type:"audio"},"zero-shot-audio-classification":{pipeline:UI,model:wn,default:{model:"Xenova/clap-htsat-unfused"},type:"multimodal"},"automatic-speech-recognition":{pipeline:jI,model:[wI,CI],default:{model:"Xenova/whisper-tiny.en"},type:"multimodal"},"text-to-audio":{pipeline:qI,model:[yI,vI],default:{model:"onnx-community/Supertonic-TTS-ONNX",dtype:"fp32"},type:"text"},"image-to-text":{pipeline:WI,model:kI,default:{model:"Xenova/vit-gpt2-image-captioning"},type:"multimodal"},"image-classification":{pipeline:HI,model:TI,default:{model:"Xenova/vit-base-patch16-224"},type:"multimodal"},"image-segmentation":{pipeline:pg,model:[ug,dg,hg],default:{model:"Xenova/detr-resnet-50-panoptic"},type:"multimodal"},"background-removal":{pipeline:QI,model:[ug,dg,hg],default:{model:"Xenova/modnet"},type:"image"},"zero-shot-image-classification":{pipeline:XI,model:wn,default:{model:"Xenova/clip-vit-base-p
atch32"},type:"multimodal"},"object-detection":{pipeline:YI,model:EI,default:{model:"Xenova/detr-resnet-50"},type:"multimodal"},"zero-shot-object-detection":{pipeline:JI,model:AI,default:{model:"Xenova/owlvit-base-patch32"},type:"multimodal"},"document-question-answering":{pipeline:KI,model:PI,default:{model:"Xenova/donut-base-finetuned-docvqa"},type:"multimodal"},"image-to-image":{pipeline:ZI,model:FI,default:{model:"Xenova/swin2SR-classical-sr-x2-64"},type:"image"},"depth-estimation":{pipeline:eO,model:LI,default:{model:"onnx-community/depth-anything-v2-small"},type:"image"},"feature-extraction":{pipeline:tO,model:wn,default:{model:"onnx-community/all-MiniLM-L6-v2-ONNX",dtype:"fp32"},type:"text"},"image-feature-extraction":{pipeline:rO,model:[II,wn],default:{model:"onnx-community/dinov3-vits16-pretrain-lvd1689m-ONNX",dtype:"fp32"},type:"image"}}),mg=Object.freeze({"sentiment-analysis":"text-classification",ner:"token-classification",asr:"automatic-speech-recognition","text-to-speech":"text-to-audio",embeddings:"feature-extraction"});async function sO(e){if(!e)throw new Error("modelId is required");return(await tn(e,an,{})).exists?[an]:[]}async function nO(e,{config:t=null,dtype:r=null,device:s=null,model_file_name:n=null,include_tokenizer:a=!0,include_processor:o=!0}={}){const i=await np(e,{config:t,dtype:r,device:s,model_file_name:n});if(a){const l=await Zf(e);i.push(...l)}if(o){const l=await sO(e);i.push(...l)}return i}async function aO(e,t,r={}){e=mg[e]??e;const s=qa[e];if(!s)throw new Error(`Unsupported pipeline task: ${e}. 
Must be one of [${Object.keys(qa).join(", ")}]`);const{type:n}=s,i=await nO(t,{...r,include_tokenizer:n!=="audio"&&n!=="image",include_processor:n!=="text"});if(e==="text-generation"){const l=await sp(t,r),c=rp(l),d=wE(c);if(d){const h=Object.values(d).map(_=>`onnx/${_}`);return i.filter(_=>!_.startsWith("onnx/")||h.some(p=>_.startsWith(p)))}}return i}async function oO(e,t=null,{progress_callback:r=null,config:s=null,cache_dir:n=null,local_files_only:a=!1,revision:o="main",device:i=null,dtype:l=null,subfolder:c="onnx",use_external_data_format:d=null,model_file_name:h=null,session_options:_={}}={}){e=mg[e]??e;const p=qa[e.split("_",1)[0]];if(!p)throw Error(`Unsupported pipeline: ${e}. Must be one of [${Object.keys(qa)}]`);t||(t=p.default.model,ue.info(`No model specified. Using default model: "${t}".`),!l&&p.default.dtype&&(l=p.default.dtype));const w=await aO(e,t,{device:i,dtype:l});let v={};r&&(await Promise.all(w.map(async Q=>tn(t,Q)))).forEach((Q,H)=>{Q.exists&&(v[w[H]]={loaded:0,total:Q.size??0})});const y={progress_callback:r?new pi(r,v):void 0,config:s,cache_dir:n,local_files_only:a,revision:o,device:i,dtype:l,subfolder:c,use_external_data_format:d,model_file_name:h,session_options:_},M=w.includes("tokenizer.json"),T=w.includes("preprocessor_config.json"),A=p.model;let C;if(Array.isArray(A)){const $=s??await on.from_pretrained(t,y),{model_type:Q}=$,H=A.find(D=>D.supports(Q));if(!H)throw Error(`Unsupported model type "${Q}" for task "${e}". 
None of the candidate model classes support this type.`);C=H.from_pretrained(t,{...y,config:$})}else C=A.from_pretrained(t,y);const[S,N,x]=await Promise.all([M?Te.from_pretrained(t,y):null,T?qT.from_pretrained(t,y):null,C]),R={task:e,model:x};S&&(R.tokenizer=S),N&&(R.processor=N),Sr(r,{status:"ready",task:e,model:t});const z=p.pipeline;return new z(R)}fe.IS_PROCESS_AVAILABLE;let Nr=null;const gg="onnx-community/moonshine-base-ONNX";function Ct(e,t,r){self.postMessage({type:"log",level:e,message:t,meta:r})}async function iO(){if(Nr)return;Ct("info","load: begin",{model:gg}),self.postMessage({type:"status",status:"loading",message:"Downloading speech model…"});let e=!1;const t=setTimeout(()=>{!e&&!Nr&&(self.postMessage({type:"status",status:"loading",message:"Loading speech model from cache…"}),Ct("info","load: no download progress within 1.5s, assuming cache hit"))},1500),r=Date.now();try{Nr=await oO("automatic-speech-recognition",gg,{dtype:"fp32",device:"wasm",progress_callback:n=>{n.status==="progress_total"&&typeof n.progress=="number"?(e=!0,self.postMessage({type:"progress",pct:n.progress})):(n.status==="download"||n.status==="initiate")&&(e=!0,Ct("debug","pipeline: "+n.status,{file:n.file}))}})}finally{clearTimeout(t)}Ct("info","load: pipeline ready",{ms:Date.now()-r});const s=Date.now();try{await Promise.race([Nr(new Float32Array(16e3)),new Promise((n,a)=>setTimeout(()=>a(new Error("warmup timeout")),1e4))]),Ct("info","load: warmup done",{ms:Date.now()-s})}catch(n){const a=n instanceof Error?n.message:String(n);Ct("warn","load: warmup failed (best-effort, ignored)",{error:a,ms:Date.now()-s})}self.postMessage({type:"status",status:"ready",message:"Speech model ready"})}self.onmessage=async e=>{var r,s;const{type:t}=e.data;if(t==="load"){try{await iO()}catch(n){const a=n instanceof Error?n.message:"Model load failed",o=n instanceof Error?n.stack:"";console.error("[speechWorker] load failed:",a,o),Ct("error","load: 
failed",{error:a,stack:o}),self.postMessage({type:"error",error:a})}return}if(t==="transcribe"){const n=e.data.audio,a=e.data.seq??0;if(!Nr){Ct("warn","transcribe: model not loaded, dropping segment",{samples:n==null?void 0:n.length,seq:a}),self.postMessage({type:"error",error:"Model not loaded",seq:a});return}const o=Date.now();try{const l=((r=(await Nr(n)).text)==null?void 0:r.trim())??"";Ct("debug","transcribe: ok",{ms:Date.now()-o,samples:n.length,chars:l.length,seq:a}),self.postMessage({type:"result",text:l,seq:a})}catch(i){const l=i instanceof Error?i.message:"Transcription failed";Ct("error","transcribe: failed",{error:l,ms:Date.now()-o,seq:a}),self.postMessage({type:"error",error:l,seq:a})}return}if(t==="transcribe-partial"){const n=e.data.audio,a=e.data.seq??0;if(!Nr){Ct("warn","transcribe-partial: model not loaded, dropping",{samples:n==null?void 0:n.length,seq:a}),self.postMessage({type:"partial-error",seq:a,error:"Model not loaded"});return}const o=Date.now();try{const l=((s=(await Nr(n)).text)==null?void 0:s.trim())??"";Ct("debug","transcribe-partial: ok",{ms:Date.now()-o,samples:n.length,chars:l.length,seq:a}),self.postMessage({type:"partial",text:l,seq:a})}catch(i){const l=i instanceof Error?i.message:"Partial transcription failed";Ct("warn","transcribe-partial: failed",{error:l,ms:Date.now()-o,seq:a}),self.postMessage({type:"partial-error",seq:a,error:l})}return}}})();