@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/README.md +13 -2
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +26 -26
  3. package/dist/transformers.js +1002 -587
  4. package/dist/transformers.min.js +23 -19
  5. package/dist/transformers.node.cjs +1030 -585
  6. package/dist/transformers.node.min.cjs +21 -17
  7. package/dist/transformers.node.min.mjs +21 -17
  8. package/dist/transformers.node.mjs +1000 -585
  9. package/dist/transformers.web.js +887 -472
  10. package/dist/transformers.web.min.js +21 -17
  11. package/package.json +3 -3
  12. package/src/configs.js +28 -22
  13. package/src/env.js +1 -1
  14. package/src/image_processors_utils.js +25 -15
  15. package/src/models/chmv2/image_processing_chmv2.js +3 -0
  16. package/src/models/chmv2/modeling_chmv2.js +4 -0
  17. package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
  18. package/src/models/eurobert/modeling_eurobert.js +41 -0
  19. package/src/models/gemma3/image_processing_gemma3.js +3 -0
  20. package/src/models/gemma3/modeling_gemma3.js +4 -1
  21. package/src/models/gemma3/processing_gemma3.js +45 -0
  22. package/src/models/glm46v/image_processing_glm46v.js +12 -0
  23. package/src/models/glm46v/processing_glm46v.js +5 -0
  24. package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
  25. package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
  26. package/src/models/image_processors.js +3 -0
  27. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
  28. package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
  29. package/src/models/mistral4/modeling_mistral4.js +5 -0
  30. package/src/models/modeling_utils.js +48 -25
  31. package/src/models/models.js +10 -1
  32. package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
  33. package/src/models/processors.js +2 -0
  34. package/src/models/qwen2_vl/modeling_qwen2_vl.js +226 -168
  35. package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
  36. package/src/models/registry.js +19 -8
  37. package/src/models/solar_open/modeling_solar_open.js +5 -0
  38. package/src/pipelines.js +1 -0
  39. package/src/utils/hub.js +4 -1
  40. package/src/utils/model_registry/ModelRegistry.js +36 -0
  41. package/src/utils/model_registry/get_available_dtypes.js +68 -0
  42. package/src/utils/model_registry/get_file_metadata.js +1 -0
  43. package/src/utils/model_registry/get_model_files.js +7 -60
  44. package/src/utils/model_registry/resolve_model_type.js +66 -0
  45. package/types/configs.d.ts.map +1 -1
  46. package/types/image_processors_utils.d.ts +3 -2
  47. package/types/image_processors_utils.d.ts.map +1 -1
  48. package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
  49. package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
  50. package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
  51. package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
  52. package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
  53. package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
  54. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
  55. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
  56. package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
  57. package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
  58. package/types/models/gemma3/image_processing_gemma3.d.ts +4 -0
  59. package/types/models/gemma3/image_processing_gemma3.d.ts.map +1 -0
  60. package/types/models/gemma3/modeling_gemma3.d.ts +4 -1
  61. package/types/models/gemma3/modeling_gemma3.d.ts.map +1 -1
  62. package/types/models/gemma3/processing_gemma3.d.ts +20 -0
  63. package/types/models/gemma3/processing_gemma3.d.ts.map +1 -0
  64. package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
  65. package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
  66. package/types/models/glm46v/processing_glm46v.d.ts +4 -0
  67. package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
  68. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
  69. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
  70. package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
  71. package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
  72. package/types/models/image_processors.d.ts +3 -0
  73. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
  74. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
  75. package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
  76. package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
  77. package/types/models/modeling_utils.d.ts +2 -3
  78. package/types/models/modeling_utils.d.ts.map +1 -1
  79. package/types/models/models.d.ts +10 -1
  80. package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
  81. package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
  82. package/types/models/processors.d.ts +2 -0
  83. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
  84. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  85. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
  86. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  87. package/types/models/registry.d.ts.map +1 -1
  88. package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
  89. package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
  90. package/types/pipelines.d.ts +1 -0
  91. package/types/pipelines.d.ts.map +1 -1
  92. package/types/utils/hub.d.ts.map +1 -1
  93. package/types/utils/model_registry/ModelRegistry.d.ts +27 -0
  94. package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -1
  95. package/types/utils/model_registry/get_available_dtypes.d.ts +26 -0
  96. package/types/utils/model_registry/get_available_dtypes.d.ts.map +1 -0
  97. package/types/utils/model_registry/get_model_files.d.ts +25 -0
  98. package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
  99. package/types/utils/model_registry/resolve_model_type.d.ts +24 -0
  100. package/types/utils/model_registry/resolve_model_type.d.ts.map +1 -0
  101. package/types/models/ast/modeling_ast.d.ts.map +0 -1
  102. /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
@@ -117,6 +117,9 @@ __export(transformers_exports, {
117
117
  BloomModel: () => BloomModel,
118
118
  BloomPreTrainedModel: () => BloomPreTrainedModel,
119
119
  BloomTokenizer: () => BloomTokenizer,
120
+ CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
121
+ CHMv2ImageProcessor: () => CHMv2ImageProcessor,
122
+ CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
120
123
  CLIPFeatureExtractor: () => CLIPFeatureExtractor,
121
124
  CLIPImageProcessor: () => CLIPImageProcessor,
122
125
  CLIPModel: () => CLIPModel,
@@ -212,6 +215,9 @@ __export(transformers_exports, {
212
215
  DebertaV2Tokenizer: () => DebertaV2Tokenizer,
213
216
  DecisionTransformerModel: () => DecisionTransformerModel,
214
217
  DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
218
+ DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
219
+ DeepseekV3Model: () => DeepseekV3Model,
220
+ DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
215
221
  DeiTFeatureExtractor: () => DeiTFeatureExtractor,
216
222
  DeiTForImageClassification: () => DeiTForImageClassification,
217
223
  DeiTImageProcessor: () => DeiTImageProcessor,
@@ -272,6 +278,11 @@ __export(transformers_exports, {
272
278
  EsmModel: () => EsmModel,
273
279
  EsmPreTrainedModel: () => EsmPreTrainedModel,
274
280
  EsmTokenizer: () => EsmTokenizer,
281
+ EuroBertForMaskedLM: () => EuroBertForMaskedLM,
282
+ EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
283
+ EuroBertForTokenClassification: () => EuroBertForTokenClassification,
284
+ EuroBertModel: () => EuroBertModel,
285
+ EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
275
286
  ExaoneForCausalLM: () => ExaoneForCausalLM,
276
287
  ExaoneModel: () => ExaoneModel,
277
288
  ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -318,8 +329,11 @@ __export(transformers_exports, {
318
329
  Gemma2Model: () => Gemma2Model,
319
330
  Gemma2PreTrainedModel: () => Gemma2PreTrainedModel,
320
331
  Gemma3ForCausalLM: () => Gemma3ForCausalLM,
332
+ Gemma3ForConditionalGeneration: () => Gemma3ForConditionalGeneration,
333
+ Gemma3ImageProcessor: () => Gemma3ImageProcessor,
321
334
  Gemma3Model: () => Gemma3Model,
322
335
  Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
336
+ Gemma3Processor: () => Gemma3Processor,
323
337
  Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
324
338
  Gemma3nForCausalLM: () => Gemma3nForCausalLM,
325
339
  Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
@@ -329,8 +343,14 @@ __export(transformers_exports, {
329
343
  GemmaModel: () => GemmaModel,
330
344
  GemmaPreTrainedModel: () => GemmaPreTrainedModel,
331
345
  GemmaTokenizer: () => GemmaTokenizer,
346
+ Glm46VImageProcessor: () => Glm46VImageProcessor,
347
+ Glm46VProcessor: () => Glm46VProcessor,
332
348
  GlmForCausalLM: () => GlmForCausalLM,
333
349
  GlmModel: () => GlmModel,
350
+ GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
351
+ GlmMoeDsaModel: () => GlmMoeDsaModel,
352
+ GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
353
+ GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
334
354
  GlmPreTrainedModel: () => GlmPreTrainedModel,
335
355
  GptOssForCausalLM: () => GptOssForCausalLM,
336
356
  GptOssModel: () => GptOssModel,
@@ -396,6 +416,7 @@ __export(transformers_exports, {
396
416
  Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
397
417
  Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
398
418
  Lfm2VlProcessor: () => Lfm2VlProcessor,
419
+ LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
399
420
  LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
400
421
  Llama4ForCausalLM: () => Llama4ForCausalLM,
401
422
  Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -465,6 +486,9 @@ __export(transformers_exports, {
465
486
  MimiPreTrainedModel: () => MimiPreTrainedModel,
466
487
  MinLengthLogitsProcessor: () => MinLengthLogitsProcessor,
467
488
  MinNewTokensLengthLogitsProcessor: () => MinNewTokensLengthLogitsProcessor,
489
+ Mistral4ForCausalLM: () => Mistral4ForCausalLM,
490
+ Mistral4Model: () => Mistral4Model,
491
+ Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
468
492
  MistralForCausalLM: () => MistralForCausalLM,
469
493
  MistralModel: () => MistralModel,
470
494
  MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -536,6 +560,9 @@ __export(transformers_exports, {
536
560
  NanoChatForCausalLM: () => NanoChatForCausalLM,
537
561
  NanoChatModel: () => NanoChatModel,
538
562
  NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
563
+ NemotronHForCausalLM: () => NemotronHForCausalLM,
564
+ NemotronHModel: () => NemotronHModel,
565
+ NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
539
566
  NeoBertForMaskedLM: () => NeoBertForMaskedLM,
540
567
  NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
541
568
  NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -725,6 +752,9 @@ __export(transformers_exports, {
725
752
  SnacFeatureExtractor: () => SnacFeatureExtractor,
726
753
  SnacModel: () => SnacModel,
727
754
  SnacPreTrainedModel: () => SnacPreTrainedModel,
755
+ SolarOpenForCausalLM: () => SolarOpenForCausalLM,
756
+ SolarOpenModel: () => SolarOpenModel,
757
+ SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
728
758
  SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
729
759
  SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
730
760
  SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
@@ -925,7 +955,7 @@ var import_node_fs = __toESM(require("fs"), 1);
925
955
  var import_node_path = __toESM(require("path"), 1);
926
956
  var import_node_url = __toESM(require("url"), 1);
927
957
  var import_meta = {};
928
- var VERSION = "4.0.0-next.7";
958
+ var VERSION = "4.0.0-next.9";
929
959
  var HAS_SELF = typeof self !== "undefined";
930
960
  var IS_FS_AVAILABLE = !isEmpty(import_node_fs.default);
931
961
  var IS_PATH_AVAILABLE = !isEmpty(import_node_path.default);
@@ -1155,7 +1185,7 @@ var logger = {
1155
1185
  }
1156
1186
  };
1157
1187
 
1158
- // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.2/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
1188
+ // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
1159
1189
  var DictionarySplitter = class {
1160
1190
  /**
1161
1191
  * @param dictionary The dictionary of words to use for splitting.
@@ -2811,10 +2841,10 @@ var BPE = class extends TokenizerModel_default {
2811
2841
  );
2812
2842
  if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
2813
2843
  output_tokens.push(...byte_tokens);
2814
- } else {
2844
+ } else if (this.unk_token != null) {
2815
2845
  output_tokens.push(this.unk_token);
2816
2846
  }
2817
- } else {
2847
+ } else if (this.unk_token != null) {
2818
2848
  output_tokens.push(this.unk_token);
2819
2849
  }
2820
2850
  }
@@ -6664,14 +6694,14 @@ var Random = class {
6664
6694
  * @returns {number} A normally distributed random value.
6665
6695
  */
6666
6696
  gauss(mu = 0, sigma = 1) {
6667
- let z = this._gauss_next;
6697
+ let z2 = this._gauss_next;
6668
6698
  this._gauss_next = null;
6669
- if (z === null) {
6699
+ if (z2 === null) {
6670
6700
  const x2pi = this.random() * 2 * Math.PI, g2rad = Math.sqrt(-2 * Math.log(1 - this.random()));
6671
- z = Math.cos(x2pi) * g2rad;
6701
+ z2 = Math.cos(x2pi) * g2rad;
6672
6702
  this._gauss_next = Math.sin(x2pi) * g2rad;
6673
6703
  }
6674
- return mu + z * sigma;
6704
+ return mu + z2 * sigma;
6675
6705
  }
6676
6706
  /**
6677
6707
  * Shuffles an array in-place using the Fisher-Yates algorithm.
@@ -7426,13 +7456,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
7426
7456
  wrapped_progress
7427
7457
  );
7428
7458
  } else if (typeof response !== "string") {
7459
+ const headers = new Headers(response.headers);
7460
+ headers.set("content-length", result.byteLength.toString());
7429
7461
  await cache2.put(
7430
7462
  cacheKey,
7431
7463
  new Response(
7432
7464
  /** @type {any} */
7433
7465
  result,
7434
7466
  {
7435
- headers: response.headers
7467
+ headers
7436
7468
  }
7437
7469
  )
7438
7470
  ).catch((err) => {
@@ -8390,7 +8422,7 @@ var uint16_to_float32 = /* @__PURE__ */ (function() {
8390
8422
  // src/backends/onnx.js
8391
8423
  var ONNX_NODE = __toESM(require("onnxruntime-node"), 1);
8392
8424
 
8393
- // ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260307-d626b568e0/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
8425
+ // ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260323-a99aad9d36/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
8394
8426
  var ort_webgpu_bundle_min_exports = {};
8395
8427
  __export(ort_webgpu_bundle_min_exports, {
8396
8428
  InferenceSession: () => Jf,
@@ -9159,7 +9191,7 @@ async function ts(a = {}) {
9159
9191
  throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
9160
9192
  }
9161
9193
  function Ye() {
9162
- return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
9194
+ return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, q: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, s: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: lf, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: uf, A: df, r: cf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
9163
9195
  }
9164
9196
  async function bt() {
9165
9197
  function e(o, u) {
@@ -9222,14 +9254,14 @@ async function ts(a = {}) {
9222
9254
  gt.push(t), Je[e.Nc] = t, t.Nc = e.Nc;
9223
9255
  var n = { Oc: "run", he: e.ge, Wc: e.Wc, Nc: e.Nc };
9224
9256
  return t.postMessage(n, e.Yc), 0;
9225
- }, z = 0, V = (e, t, ...n) => {
9257
+ }, G = 0, V = (e, t, ...n) => {
9226
9258
  var o, u = 16 * n.length, c = P(), h = Ft(u), b = h >>> 3;
9227
9259
  for (o of n) typeof o == "bigint" ? ((p(), pe)[b++ >>> 0] = 1n, (p(), pe)[b++ >>> 0] = o) : ((p(), pe)[b++ >>> 0] = 0n, (p(), ae)[b++ >>> 0] = o);
9228
9260
  return e = Lo(e, 0, u, h, t), D(c), e;
9229
9261
  };
9230
9262
  function qe(e) {
9231
9263
  if (i) return V(0, 1, e);
9232
- if (S = e, !(0 < z)) {
9264
+ if (S = e, !(0 < G)) {
9233
9265
  for (var t of gt) Se(t);
9234
9266
  for (t of We) Se(t);
9235
9267
  We = [], gt = [], Je = {}, W = true;
@@ -9274,7 +9306,7 @@ async function ts(a = {}) {
9274
9306
  We.push(e);
9275
9307
  }
9276
9308
  var Fe, zs = (e, t) => {
9277
- z = 0, e = zr(e, t), 0 < z ? S = e : Fr(e);
9309
+ G = 0, e = zr(e, t), 0 < G ? S = e : Fr(e);
9278
9310
  }, Ct = [], Ut = 0, me = (e) => -9007199254740992 > e || 9007199254740992 < e ? NaN : Number(e);
9279
9311
  function Vs(e) {
9280
9312
  var t = new wr(e >>>= 0);
@@ -9626,7 +9658,7 @@ async function ts(a = {}) {
9626
9658
  }
9627
9659
  var he = (e) => {
9628
9660
  if (!W) try {
9629
- if (e(), !(0 < z)) try {
9661
+ if (e(), !(0 < G)) try {
9630
9662
  i ? Wt() && Fr(S) : br(S);
9631
9663
  } catch (t) {
9632
9664
  t instanceof wt || t == "unwind" || y(0, t);
@@ -9654,7 +9686,7 @@ async function ts(a = {}) {
9654
9686
  return (t ? Vr[t] : of[e])(...Ir);
9655
9687
  }
9656
9688
  var Ei = () => {
9657
- z = 0;
9689
+ G = 0;
9658
9690
  };
9659
9691
  function Si(e) {
9660
9692
  e >>>= 0, i ? postMessage({ Oc: "cleanupThread", ie: e }) : yn(Je[e]);
@@ -9674,7 +9706,7 @@ async function ts(a = {}) {
9674
9706
  try {
9675
9707
  return e(...n);
9676
9708
  } finally {
9677
- W || (_t.pop(), Me && Ge === 1 && _t.length === 0 && (Ge = 0, z += 1, Pt(wa), typeof Fibers < "u" && Fibers.De()));
9709
+ W || (_t.pop(), Me && Ge === 1 && _t.length === 0 && (Ge = 0, G += 1, Pt(wa), typeof Fibers < "u" && Fibers.De()));
9678
9710
  }
9679
9711
  };
9680
9712
  return jn.set(e, t), t;
@@ -9689,7 +9721,7 @@ async function ts(a = {}) {
9689
9721
  try {
9690
9722
  var c = (function() {
9691
9723
  var E = (p(), x)[Me + 8 >>> 2 >>> 0];
9692
- return E = Vn.get(E), E = jn.get(E), --z, E();
9724
+ return E = Vn.get(E), E = jn.get(E), --G, E();
9693
9725
  })();
9694
9726
  } catch (E) {
9695
9727
  c = E, u = true;
@@ -9880,7 +9912,7 @@ async function ts(a = {}) {
9880
9912
  return L(ct(e >>> 0, t >>> 0));
9881
9913
  }
9882
9914
  var ou = () => {
9883
- throw z += 1, "unwind";
9915
+ throw G += 1, "unwind";
9884
9916
  };
9885
9917
  function au() {
9886
9918
  return 4294901760;
@@ -9973,15 +10005,15 @@ async function ts(a = {}) {
9973
10005
  }
9974
10006
  (b = (p(), A)[c + 24 >>> 2 >>> 0]) && (b = { label: Ne(b + 4) }, e.defaultQueue = b), e.label = Ne(c + 4);
9975
10007
  }
9976
- z += 1, lt(t, h.requestDevice(e).then((B) => {
9977
- --z, he(() => {
9978
- ce[u >>> 0] = B.queue, ce[o >>> 0] = B, lt(n, B.lost.then((ue) => {
10008
+ G += 1, lt(t, h.requestDevice(e).then((B) => {
10009
+ --G, he(() => {
10010
+ ce[u >>> 0] = B.queue, ce[o >>> 0] = B, G += 1, lt(n, B.lost.then((ue) => {
9979
10011
  he(() => {
9980
10012
  B.onuncapturederror = () => {
9981
10013
  };
9982
10014
  var ye = P(), fe = Ce(ue.message);
9983
10015
  _r(n, yu[ue.reason], fe), D(ye);
9984
- });
10016
+ }), --G;
9985
10017
  })), B.onuncapturederror = (ue) => {
9986
10018
  var ye = 5;
9987
10019
  ue.error instanceof GPUValidationError ? ye = 2 : ue.error instanceof GPUOutOfMemoryError ? ye = 3 : ue.error instanceof GPUInternalError && (ye = 4);
@@ -9990,7 +10022,7 @@ async function ts(a = {}) {
9990
10022
  }, "adapterInfo" in B || (B.adapterInfo = h.info), kr(t, 1, o, 0);
9991
10023
  });
9992
10024
  }, (B) => {
9993
- --z, he(() => {
10025
+ --G, he(() => {
9994
10026
  var ue = P(), ye = Ce(B.message);
9995
10027
  kr(t, 3, o, ye), n && _r(n, 4, ye), D(ue);
9996
10028
  });
@@ -10033,12 +10065,12 @@ async function ts(a = {}) {
10033
10065
  function vu(e, t, n, o, u) {
10034
10066
  e >>>= 0, t = me(t), n = me(n), u >>>= 0;
10035
10067
  var c = O(e);
10036
- Re[e] = [], u == 4294967295 && (u = void 0), z += 1, lt(t, c.mapAsync(n, o >>> 0, u).then(() => {
10037
- --z, he(() => {
10068
+ Re[e] = [], u == 4294967295 && (u = void 0), G += 1, lt(t, c.mapAsync(n, o >>> 0, u).then(() => {
10069
+ --G, he(() => {
10038
10070
  Rr(t, 1, 0);
10039
10071
  });
10040
10072
  }, (h) => {
10041
- --z, he(() => {
10073
+ --G, he(() => {
10042
10074
  P();
10043
10075
  var b = Ce(h.message);
10044
10076
  Rr(t, h.name === "AbortError" ? 4 : h.name === "OperationError" ? 3 : 0, b), delete Re[e];
@@ -10067,12 +10099,12 @@ async function ts(a = {}) {
10067
10099
  return ce[n >>> 0] = u, o && (Re[n] = []), true;
10068
10100
  }
10069
10101
  function Iu(e, t, n, o) {
10070
- e >>>= 0, t = me(t), o >>>= 0, n = du(n >>> 0), e = O(e), z += 1, lt(t, e.createComputePipelineAsync(n).then((u) => {
10071
- --z, he(() => {
10102
+ e >>>= 0, t = me(t), o >>>= 0, n = du(n >>> 0), e = O(e), G += 1, lt(t, e.createComputePipelineAsync(n).then((u) => {
10103
+ --G, he(() => {
10072
10104
  ce[o >>> 0] = u, Pr(t, 1, o, 0);
10073
10105
  });
10074
10106
  }, (u) => {
10075
- --z, he(() => {
10107
+ --G, he(() => {
10076
10108
  var c = P(), h = Ce(u.message);
10077
10109
  Pr(t, u.reason === "validation" ? 3 : u.reason === "internal" ? 4 : 0, o, h), D(c);
10078
10110
  });
@@ -10087,15 +10119,15 @@ async function ts(a = {}) {
10087
10119
  (e = O(e)).onuncapturederror = null, e.destroy();
10088
10120
  };
10089
10121
  function Ou(e, t) {
10090
- t = me(t), e = O(e >>> 0), z += 1, lt(t, e.popErrorScope().then((n) => {
10091
- --z, he(() => {
10122
+ t = me(t), e = O(e >>> 0), G += 1, lt(t, e.popErrorScope().then((n) => {
10123
+ --G, he(() => {
10092
10124
  var o = 5;
10093
10125
  n ? n instanceof GPUValidationError ? o = 2 : n instanceof GPUOutOfMemoryError ? o = 3 : n instanceof GPUInternalError && (o = 4) : o = 1;
10094
10126
  var u = P(), c = n ? Ce(n.message) : 0;
10095
10127
  Nr(t, 1, o, c), D(u);
10096
10128
  });
10097
10129
  }, (n) => {
10098
- --z, he(() => {
10130
+ --G, he(() => {
10099
10131
  var o = P(), u = Ce(n.message);
10100
10132
  Nr(t, 1, 5, u), D(o);
10101
10133
  });
@@ -10106,8 +10138,8 @@ async function ts(a = {}) {
10106
10138
  var u = { featureLevel: pu[(p(), x)[n + 4 >>> 2 >>> 0]], powerPreference: mu[(p(), x)[n + 8 >>> 2 >>> 0]], forceFallbackAdapter: !!(p(), A)[n + 12 >>> 2 >>> 0] };
10107
10139
  (e = (p(), A)[n >>> 2 >>> 0]) !== 0 && (p(), u.Fe = !!(p(), A)[e + 8 >>> 2 >>> 0]);
10108
10140
  }
10109
- "gpu" in navigator ? (z += 1, lt(t, navigator.gpu.requestAdapter(u).then((c) => {
10110
- --z, he(() => {
10141
+ "gpu" in navigator ? (G += 1, lt(t, navigator.gpu.requestAdapter(u).then((c) => {
10142
+ --G, he(() => {
10111
10143
  if (c) ce[o >>> 0] = c, Et(t, 1, o, 0);
10112
10144
  else {
10113
10145
  var h = P(), b = Ce("WebGPU not available on this browser (requestAdapter returned null)");
@@ -10115,7 +10147,7 @@ async function ts(a = {}) {
10115
10147
  }
10116
10148
  });
10117
10149
  }, (c) => {
10118
- --z, he(() => {
10150
+ --G, he(() => {
10119
10151
  var h = P(), b = Ce(c.message);
10120
10152
  Et(t, 4, o, b), D(h);
10121
10153
  });
@@ -10346,7 +10378,7 @@ async function ts(a = {}) {
10346
10378
  Te(`invalid type for getValue: ${t}`);
10347
10379
  }
10348
10380
  }, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
10349
- var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 925676: (e, t, n, o, u) => {
10381
+ var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 937012: (e, t, n, o, u) => {
10350
10382
  if (r === void 0 || !r.Uc) return 1;
10351
10383
  if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
10352
10384
  if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
@@ -10366,11 +10398,11 @@ async function ts(a = {}) {
10366
10398
  } catch {
10367
10399
  return 4;
10368
10400
  }
10369
- }, 926500: (e, t, n) => {
10401
+ }, 937836: (e, t, n) => {
10370
10402
  r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
10371
- }, 926564: () => r.me(), 926606: (e) => {
10403
+ }, 937900: () => r.me(), 937942: (e) => {
10372
10404
  r.jd(e);
10373
- }, 926643: () => typeof wasmOffsetConverter < "u" };
10405
+ }, 937979: () => typeof wasmOffsetConverter < "u" };
10374
10406
  function af(e, t, n, o) {
10375
10407
  var u = P();
10376
10408
  try {
@@ -10389,12 +10421,12 @@ async function ts(a = {}) {
10389
10421
  N(1, 0);
10390
10422
  }
10391
10423
  }
10392
- function uf(e, t, n) {
10393
- var o = P();
10424
+ function uf(e) {
10425
+ var t = P();
10394
10426
  try {
10395
- _o(e, t, n);
10396
- } catch (u) {
10397
- if (D(o), u !== u + 0) throw u;
10427
+ Ro(e);
10428
+ } catch (n) {
10429
+ if (D(t), n !== n + 0) throw n;
10398
10430
  N(1, 0);
10399
10431
  }
10400
10432
  }
@@ -10407,25 +10439,16 @@ async function ts(a = {}) {
10407
10439
  N(1, 0);
10408
10440
  }
10409
10441
  }
10410
- function cf(e) {
10411
- var t = P();
10412
- try {
10413
- Ro(e);
10414
- } catch (n) {
10415
- if (D(t), n !== n + 0) throw n;
10416
- N(1, 0);
10417
- }
10418
- }
10419
- function df(e, t, n, o, u, c, h) {
10420
- var b = P();
10442
+ function cf(e, t, n) {
10443
+ var o = P();
10421
10444
  try {
10422
- return Wo(e, t, n, o, u, c, h);
10423
- } catch (E) {
10424
- if (D(b), E !== E + 0) throw E;
10445
+ _o(e, t, n);
10446
+ } catch (u) {
10447
+ if (D(o), u !== u + 0) throw u;
10425
10448
  N(1, 0);
10426
10449
  }
10427
10450
  }
10428
- function lf(e, t) {
10451
+ function df(e, t) {
10429
10452
  var n = P();
10430
10453
  try {
10431
10454
  Vo(e, t);
@@ -10434,6 +10457,15 @@ async function ts(a = {}) {
10434
10457
  N(1, 0);
10435
10458
  }
10436
10459
  }
10460
+ function lf(e, t, n, o, u, c, h) {
10461
+ var b = P();
10462
+ try {
10463
+ return Wo(e, t, n, o, u, c, h);
10464
+ } catch (E) {
10465
+ if (D(b), E !== E + 0) throw E;
10466
+ N(1, 0);
10467
+ }
10468
+ }
10437
10469
  function pf(e, t, n, o, u, c) {
10438
10470
  var h = P();
10439
10471
  try {
@@ -10863,7 +10895,7 @@ var nc;
10863
10895
  var oc;
10864
10896
  var ac;
10865
10897
  var qt;
10866
- var $;
10898
+ var z;
10867
10899
  var je = k(() => {
10868
10900
  "use strict";
10869
10901
  Yt();
@@ -10919,19 +10951,19 @@ var je = k(() => {
10919
10951
  rr = false, ds = true, H(M);
10920
10952
  });
10921
10953
  })), await Promise.race(C), S) throw new Error(`WebAssembly backend initializing failed due to timeout: ${r}ms`);
10922
- }, $ = () => {
10954
+ }, z = () => {
10923
10955
  if (nn && rn) return rn;
10924
10956
  throw new Error("WebAssembly is not initialized yet.");
10925
10957
  };
10926
10958
  });
10927
10959
  var be;
10928
10960
  var Lt;
10929
- var G;
10961
+ var $;
10930
10962
  var nr = k(() => {
10931
10963
  "use strict";
10932
10964
  je();
10933
10965
  be = (a, r) => {
10934
- let s = $(), f = s.lengthBytesUTF8(a) + 1, i = s._malloc(f);
10966
+ let s = z(), f = s.lengthBytesUTF8(a) + 1, i = s._malloc(f);
10935
10967
  return s.stringToUTF8(a, i, f), r.push(i), i;
10936
10968
  }, Lt = (a, r, s, f) => {
10937
10969
  if (typeof a == "object" && a !== null) {
@@ -10945,8 +10977,8 @@ var nr = k(() => {
10945
10977
  else if (typeof d == "boolean") f(l, d ? "1" : "0");
10946
10978
  else throw new Error(`Can't handle extra config type: ${typeof d}`);
10947
10979
  });
10948
- }, G = (a) => {
10949
- let r = $(), s = r.stackSave();
10980
+ }, $ = (a) => {
10981
+ let r = z(), s = r.stackSave();
10950
10982
  try {
10951
10983
  let f = r.PTR_SIZE, i = r.stackAlloc(2 * f);
10952
10984
  r._OrtGetLastError(i, i + f);
@@ -10963,7 +10995,7 @@ var ps = k(() => {
10963
10995
  je();
10964
10996
  nr();
10965
10997
  ls = (a) => {
10966
- let r = $(), s = 0, f = [], i = a || {};
10998
+ let r = z(), s = 0, f = [], i = a || {};
10967
10999
  try {
10968
11000
  if (a?.logSeverityLevel === void 0) i.logSeverityLevel = 2;
10969
11001
  else if (typeof a.logSeverityLevel != "number" || !Number.isInteger(a.logSeverityLevel) || a.logSeverityLevel < 0 || a.logSeverityLevel > 4) throw new Error(`log severity level is not valid: ${a.logSeverityLevel}`);
@@ -10971,9 +11003,9 @@ var ps = k(() => {
10971
11003
  else if (typeof a.logVerbosityLevel != "number" || !Number.isInteger(a.logVerbosityLevel)) throw new Error(`log verbosity level is not valid: ${a.logVerbosityLevel}`);
10972
11004
  a?.terminate === void 0 && (i.terminate = false);
10973
11005
  let d = 0;
10974
- return a?.tag !== void 0 && (d = be(a.tag, f)), s = r._OrtCreateRunOptions(i.logSeverityLevel, i.logVerbosityLevel, !!i.terminate, d), s === 0 && G("Can't create run options."), a?.extra !== void 0 && Lt(a.extra, "", /* @__PURE__ */ new WeakSet(), (l, m) => {
11006
+ return a?.tag !== void 0 && (d = be(a.tag, f)), s = r._OrtCreateRunOptions(i.logSeverityLevel, i.logVerbosityLevel, !!i.terminate, d), s === 0 && $("Can't create run options."), a?.extra !== void 0 && Lt(a.extra, "", /* @__PURE__ */ new WeakSet(), (l, m) => {
10975
11007
  let y = be(l, f), w = be(m, f);
10976
- r._OrtAddRunConfigEntry(s, y, w) !== 0 && G(`Can't set a run config entry: ${l} - ${m}.`);
11008
+ r._OrtAddRunConfigEntry(s, y, w) !== 0 && $(`Can't set a run config entry: ${l} - ${m}.`);
10977
11009
  }), [s, f];
10978
11010
  } catch (d) {
10979
11011
  throw s !== 0 && r._OrtReleaseRunOptions(s), f.forEach((l) => r._free(l)), d;
@@ -11021,7 +11053,7 @@ var hs = k(() => {
11021
11053
  r.use_ort_model_bytes_directly || (r.use_ort_model_bytes_directly = "1"), a.executionProviders && a.executionProviders.some((s) => (typeof s == "string" ? s : s.name) === "webgpu") && (a.enableMemPattern = false);
11022
11054
  }, on = (a, r, s, f) => {
11023
11055
  let i = be(r, f), d = be(s, f);
11024
- $()._OrtAddSessionConfigEntry(a, i, d) !== 0 && G(`Can't set a session config entry: ${r} - ${s}.`);
11056
+ z()._OrtAddSessionConfigEntry(a, i, d) !== 0 && $(`Can't set a session config entry: ${r} - ${s}.`);
11025
11057
  }, ot = (a, r, s, f) => {
11026
11058
  let i = be(r, f), d = be(s, f);
11027
11059
  a.push([i, d]);
@@ -11052,7 +11084,7 @@ var hs = k(() => {
11052
11084
  }
11053
11085
  S.validationMode && ot(l, "validationMode", S.validationMode, s);
11054
11086
  }
11055
- let v = $().webgpuRegisterDevice(g);
11087
+ let v = z().webgpuRegisterDevice(g);
11056
11088
  if (v) {
11057
11089
  let [S, C, R] = v;
11058
11090
  ot(l, "deviceId", S.toString(), s), ot(l, "webgpuInstance", C.toString(), s), ot(l, "webgpuDevice", R.toString(), s);
@@ -11067,13 +11099,13 @@ var hs = k(() => {
11067
11099
  }
11068
11100
  let m = be(d, s), y = l.length, w = 0, T = 0;
11069
11101
  if (y > 0) {
11070
- w = $()._malloc(y * $().PTR_SIZE), s.push(w), T = $()._malloc(y * $().PTR_SIZE), s.push(T);
11071
- for (let g = 0; g < y; g++) $().setValue(w + g * $().PTR_SIZE, l[g][0], "*"), $().setValue(T + g * $().PTR_SIZE, l[g][1], "*");
11102
+ w = z()._malloc(y * z().PTR_SIZE), s.push(w), T = z()._malloc(y * z().PTR_SIZE), s.push(T);
11103
+ for (let g = 0; g < y; g++) z().setValue(w + g * z().PTR_SIZE, l[g][0], "*"), z().setValue(T + g * z().PTR_SIZE, l[g][1], "*");
11072
11104
  }
11073
- await $()._OrtAppendExecutionProvider(a, m, w, T, y) !== 0 && G(`Can't append execution provider: ${d}.`);
11105
+ await z()._OrtAppendExecutionProvider(a, m, w, T, y) !== 0 && $(`Can't append execution provider: ${d}.`);
11074
11106
  }
11075
11107
  }, ms = async (a) => {
11076
- let r = $(), s = 0, f = [], i = a || {};
11108
+ let r = z(), s = 0, f = [], i = a || {};
11077
11109
  uc(i);
11078
11110
  try {
11079
11111
  let d = sc(i.graphOptimizationLevel ?? "all"), l = ic(i.executionMode ?? "sequential"), m = typeof i.logId == "string" ? be(i.logId, f) : 0, y = i.logSeverityLevel ?? 2;
@@ -11081,7 +11113,7 @@ var hs = k(() => {
11081
11113
  let w = i.logVerbosityLevel ?? 0;
11082
11114
  if (!Number.isInteger(w) || w < 0 || w > 4) throw new Error(`log verbosity level is not valid: ${w}`);
11083
11115
  let T = typeof i.optimizedModelFilePath == "string" ? be(i.optimizedModelFilePath, f) : 0;
11084
- if (s = r._OrtCreateSessionOptions(d, !!i.enableCpuMemArena, !!i.enableMemPattern, l, !!i.enableProfiling, 0, m, y, w, T), s === 0 && G("Can't create session options."), i.executionProviders && await fc(s, i, f), i.enableGraphCapture !== void 0) {
11116
+ if (s = r._OrtCreateSessionOptions(d, !!i.enableCpuMemArena, !!i.enableMemPattern, l, !!i.enableProfiling, 0, m, y, w, T), s === 0 && $("Can't create session options."), i.executionProviders && await fc(s, i, f), i.enableGraphCapture !== void 0) {
11085
11117
  if (typeof i.enableGraphCapture != "boolean") throw new Error(`enableGraphCapture must be a boolean value: ${i.enableGraphCapture}`);
11086
11118
  on(s, "enableGraphCapture", i.enableGraphCapture.toString(), f);
11087
11119
  }
@@ -11089,13 +11121,13 @@ var hs = k(() => {
11089
11121
  if (typeof g != "string") throw new Error(`free dimension override name must be a string: ${g}`);
11090
11122
  if (typeof v != "number" || !Number.isInteger(v) || v < 0) throw new Error(`free dimension override value must be a non-negative integer: ${v}`);
11091
11123
  let S = be(g, f);
11092
- r._OrtAddFreeDimensionOverride(s, S, v) !== 0 && G(`Can't set a free dimension override: ${g} - ${v}.`);
11124
+ r._OrtAddFreeDimensionOverride(s, S, v) !== 0 && $(`Can't set a free dimension override: ${g} - ${v}.`);
11093
11125
  }
11094
11126
  return i.extra !== void 0 && Lt(i.extra, "", /* @__PURE__ */ new WeakSet(), (g, v) => {
11095
11127
  on(s, g, v, f);
11096
11128
  }), [s, f];
11097
11129
  } catch (d) {
11098
- throw s !== 0 && r._OrtReleaseSessionOptions(s) !== 0 && G("Can't release session options."), f.forEach((l) => r._free(l)), d;
11130
+ throw s !== 0 && r._OrtReleaseSessionOptions(s) !== 0 && $("Can't release session options."), f.forEach((l) => r._free(l)), d;
11099
11131
  }
11100
11132
  };
11101
11133
  });
@@ -11665,7 +11697,7 @@ var Os = k(() => {
11665
11697
  return l ? l.push(d) : this.temporarySessionTensorIds.set(r, [d]), d;
11666
11698
  }
11667
11699
  uploadTensor(r, s) {
11668
- if (!$().shouldTransferToMLTensor) throw new Error("Trying to upload to a MLTensor while shouldTransferToMLTensor is false");
11700
+ if (!z().shouldTransferToMLTensor) throw new Error("Trying to upload to a MLTensor while shouldTransferToMLTensor is false");
11669
11701
  le("verbose", () => `[WebNN] uploadTensor {tensorId: ${r}, data: ${s.byteLength}}`), this.tensorManager.upload(r, s);
11670
11702
  }
11671
11703
  async downloadTensor(r, s) {
@@ -11771,11 +11803,11 @@ var Kr = k(() => {
11771
11803
  nr();
11772
11804
  sn();
11773
11805
  yc = (a, r) => {
11774
- $()._OrtInit(a, r) !== 0 && G("Can't initialize onnxruntime.");
11806
+ z()._OrtInit(a, r) !== 0 && $("Can't initialize onnxruntime.");
11775
11807
  }, Jt = async (a) => {
11776
11808
  yc(a.wasm.numThreads, Ot(a.logLevel));
11777
11809
  }, Xt = async (a, r) => {
11778
- $().asyncInit?.();
11810
+ z().asyncInit?.();
11779
11811
  let s = a.webgpu.adapter;
11780
11812
  if (r === "webgpu") {
11781
11813
  if (typeof navigator > "u" || !navigator.gpu) throw new Error("WebGPU is not supported in current environment");
@@ -11790,29 +11822,29 @@ var Kr = k(() => {
11790
11822
  }
11791
11823
  }
11792
11824
  if (r === "webnn" && (typeof navigator > "u" || !navigator.ml)) throw new Error("WebNN is not supported in current environment");
11793
- if (r === "webgpu" && $().webgpuInit((f) => {
11825
+ if (r === "webgpu" && z().webgpuInit((f) => {
11794
11826
  a.webgpu.device = f;
11795
11827
  }), r === "webnn") {
11796
11828
  let f = new (Os(), $t(Ls)).WebNNBackend(a);
11797
- $().webnnInit([f, () => f.reserveTensorId(), (i) => f.releaseTensorId(i), async (i, d, l, m, y) => f.ensureTensor(i, d, l, m, y), (i, d) => {
11829
+ z().webnnInit([f, () => f.reserveTensorId(), (i) => f.releaseTensorId(i), async (i, d, l, m, y) => f.ensureTensor(i, d, l, m, y), (i, d) => {
11798
11830
  f.uploadTensor(i, d);
11799
11831
  }, async (i, d) => f.downloadTensor(i, d), (i, d) => f.registerMLContext(i, d), !!a.trace]);
11800
11832
  }
11801
11833
  }, it = /* @__PURE__ */ new Map(), bc = (a) => {
11802
- let r = $(), s = r.stackSave();
11834
+ let r = z(), s = r.stackSave();
11803
11835
  try {
11804
11836
  let f = r.PTR_SIZE, i = r.stackAlloc(2 * f);
11805
- r._OrtGetInputOutputCount(a, i, i + f) !== 0 && G("Can't get session input/output count.");
11837
+ r._OrtGetInputOutputCount(a, i, i + f) !== 0 && $("Can't get session input/output count.");
11806
11838
  let l = f === 4 ? "i32" : "i64";
11807
11839
  return [Number(r.getValue(i, l)), Number(r.getValue(i + f, l))];
11808
11840
  } finally {
11809
11841
  r.stackRestore(s);
11810
11842
  }
11811
11843
  }, Bs = (a, r) => {
11812
- let s = $(), f = s.stackSave(), i = 0;
11844
+ let s = z(), f = s.stackSave(), i = 0;
11813
11845
  try {
11814
11846
  let d = s.PTR_SIZE, l = s.stackAlloc(2 * d);
11815
- s._OrtGetInputOutputMetadata(a, r, l, l + d) !== 0 && G("Can't get session input/output metadata.");
11847
+ s._OrtGetInputOutputMetadata(a, r, l, l + d) !== 0 && $("Can't get session input/output metadata.");
11816
11848
  let y = Number(s.getValue(l, "*"));
11817
11849
  i = Number(s.getValue(l + d, "*"));
11818
11850
  let w = s.HEAP32[i / 4];
@@ -11827,11 +11859,11 @@ var Kr = k(() => {
11827
11859
  s.stackRestore(f), i !== 0 && s._OrtFree(i);
11828
11860
  }
11829
11861
  }, xt = (a) => {
11830
- let r = $(), s = r._malloc(a.byteLength);
11862
+ let r = z(), s = r._malloc(a.byteLength);
11831
11863
  if (s === 0) throw new Error(`Can't create a session. failed to allocate a buffer of size ${a.byteLength}.`);
11832
11864
  return r.HEAPU8.set(a, s), [s, a.byteLength];
11833
11865
  }, Qt = async (a, r) => {
11834
- let s, f, i = $();
11866
+ let s, f, i = z();
11835
11867
  Array.isArray(a) ? [s, f] = a : a.buffer === i.HEAPU8.buffer ? [s, f] = [a.byteOffset, a.byteLength] : [s, f] = xt(a);
11836
11868
  let d = 0, l = 0, m = 0, y = [], w = [], T = [];
11837
11869
  try {
@@ -11852,17 +11884,17 @@ var Kr = k(() => {
11852
11884
  } else i.currentContext = await i.webnnCreateMLContext();
11853
11885
  break;
11854
11886
  }
11855
- d = await i._OrtCreateSession(s, f, l), i.webgpuOnCreateSession?.(d), d === 0 && G("Can't create a session."), i.jsepOnCreateSession?.(), i.currentContext && (i.webnnRegisterMLContext(d, i.currentContext), i.currentContext = void 0, i.shouldTransferToMLTensor = true);
11887
+ d = await i._OrtCreateSession(s, f, l), i.webgpuOnCreateSession?.(d), d === 0 && $("Can't create a session."), i.jsepOnCreateSession?.(), i.currentContext && (i.webnnRegisterMLContext(d, i.currentContext), i.currentContext = void 0, i.shouldTransferToMLTensor = true);
11856
11888
  let [g, v] = bc(d), S = !!r?.enableGraphCapture, C = [], R = [], H = [], U = [], M = [];
11857
11889
  for (let L = 0; L < g; L++) {
11858
11890
  let [W, oe, p] = Bs(d, L);
11859
- W === 0 && G("Can't get an input name."), w.push(W);
11891
+ W === 0 && $("Can't get an input name."), w.push(W);
11860
11892
  let ne = i.UTF8ToString(W);
11861
11893
  C.push(ne), H.push(oe === 0 ? { name: ne, isTensor: false } : { name: ne, isTensor: true, type: or(oe), shape: p });
11862
11894
  }
11863
11895
  for (let L = 0; L < v; L++) {
11864
11896
  let [W, oe, p] = Bs(d, L + g);
11865
- W === 0 && G("Can't get an output name."), T.push(W);
11897
+ W === 0 && $("Can't get an output name."), T.push(W);
11866
11898
  let ne = i.UTF8ToString(W);
11867
11899
  R.push(ne), U.push(oe === 0 ? { name: ne, isTensor: false } : { name: ne, isTensor: true, type: or(oe), shape: p });
11868
11900
  {
@@ -11881,23 +11913,23 @@ var Kr = k(() => {
11881
11913
  }
11882
11914
  }
11883
11915
  let Y = null;
11884
- return M.some((L) => L === "gpu-buffer" || L === "ml-tensor" || L === "ml-tensor-cpu-output") && (m = i._OrtCreateBinding(d), m === 0 && G("Can't create IO binding."), Y = { handle: m, outputPreferredLocations: M, outputPreferredLocationsEncoded: M.map((L) => L === "ml-tensor-cpu-output" ? "ml-tensor" : L).map((L) => an(L)) }), it.set(d, [d, w, T, Y, S, false]), [d, C, R, H, U];
11916
+ return M.some((L) => L === "gpu-buffer" || L === "ml-tensor" || L === "ml-tensor-cpu-output") && (m = i._OrtCreateBinding(d), m === 0 && $("Can't create IO binding."), Y = { handle: m, outputPreferredLocations: M, outputPreferredLocationsEncoded: M.map((L) => L === "ml-tensor-cpu-output" ? "ml-tensor" : L).map((L) => an(L)) }), it.set(d, [d, w, T, Y, S, false]), [d, C, R, H, U];
11885
11917
  } catch (g) {
11886
- throw w.forEach((v) => i._OrtFree(v)), T.forEach((v) => i._OrtFree(v)), m !== 0 && i._OrtReleaseBinding(m) !== 0 && G("Can't release IO binding."), d !== 0 && i._OrtReleaseSession(d) !== 0 && G("Can't release session."), g;
11918
+ throw w.forEach((v) => i._OrtFree(v)), T.forEach((v) => i._OrtFree(v)), m !== 0 && i._OrtReleaseBinding(m) !== 0 && $("Can't release IO binding."), d !== 0 && i._OrtReleaseSession(d) !== 0 && $("Can't release session."), g;
11887
11919
  } finally {
11888
- i._free(s), l !== 0 && i._OrtReleaseSessionOptions(l) !== 0 && G("Can't release session options."), y.forEach((g) => i._free(g)), i.unmountExternalData?.();
11920
+ i._free(s), l !== 0 && i._OrtReleaseSessionOptions(l) !== 0 && $("Can't release session options."), y.forEach((g) => i._free(g)), i.unmountExternalData?.();
11889
11921
  }
11890
11922
  }, Zt = (a) => {
11891
- let r = $(), s = it.get(a);
11923
+ let r = z(), s = it.get(a);
11892
11924
  if (!s) throw new Error(`cannot release session. invalid session id: ${a}`);
11893
11925
  let [f, i, d, l, m] = s;
11894
- l && (m && r._OrtClearBoundOutputs(l.handle) !== 0 && G("Can't clear bound outputs."), r._OrtReleaseBinding(l.handle) !== 0 && G("Can't release IO binding.")), r.jsepOnReleaseSession?.(a), r.webnnOnReleaseSession?.(a), r.webgpuOnReleaseSession?.(a), i.forEach((y) => r._OrtFree(y)), d.forEach((y) => r._OrtFree(y)), r._OrtReleaseSession(f) !== 0 && G("Can't release session."), it.delete(a);
11926
+ l && (m && r._OrtClearBoundOutputs(l.handle) !== 0 && $("Can't clear bound outputs."), r._OrtReleaseBinding(l.handle) !== 0 && $("Can't release IO binding.")), r.jsepOnReleaseSession?.(a), r.webnnOnReleaseSession?.(a), r.webgpuOnReleaseSession?.(a), i.forEach((y) => r._OrtFree(y)), d.forEach((y) => r._OrtFree(y)), r._OrtReleaseSession(f) !== 0 && $("Can't release session."), it.delete(a);
11895
11927
  }, Ms = async (a, r, s, f, i, d, l = false) => {
11896
11928
  if (!a) {
11897
11929
  r.push(0);
11898
11930
  return;
11899
11931
  }
11900
- let m = $(), y = m.PTR_SIZE, w = a[0], T = a[1], g = a[3], v = g, S, C;
11932
+ let m = z(), y = m.PTR_SIZE, w = a[0], T = a[1], g = a[3], v = g, S, C;
11901
11933
  if (w === "string" && (g === "gpu-buffer" || g === "ml-tensor")) throw new Error("String tensor is not supported on GPU.");
11902
11934
  if (l && g !== "gpu-buffer") throw new Error(`External buffer must be provided for input/output index ${d} when enableGraphCapture is true.`);
11903
11935
  if (g === "gpu-buffer") {
@@ -11941,12 +11973,12 @@ var Kr = k(() => {
11941
11973
  try {
11942
11974
  T.forEach((M, Y) => m.setValue(H + Y * y, M, y === 4 ? "i32" : "i64"));
11943
11975
  let U = m._OrtCreateTensor(He(w), S, C, H, T.length, an(v));
11944
- U === 0 && G(`Can't create tensor for input/output. session=${f}, index=${d}.`), r.push(U);
11976
+ U === 0 && $(`Can't create tensor for input/output. session=${f}, index=${d}.`), r.push(U);
11945
11977
  } finally {
11946
11978
  m.stackRestore(R);
11947
11979
  }
11948
11980
  }, Kt = async (a, r, s, f, i, d) => {
11949
- let l = $(), m = l.PTR_SIZE, y = it.get(a);
11981
+ let l = z(), m = l.PTR_SIZE, y = it.get(a);
11950
11982
  if (!y) throw new Error(`cannot run inference. invalid session id: ${a}`);
11951
11983
  let w = y[0], T = y[1], g = y[2], v = y[3], S = y[4], C = y[5], R = r.length, H = f.length, U = 0, M = [], Y = [], L = [], W = [], oe = [], p = l.stackSave(), ne = l.stackAlloc(R * m), X = l.stackAlloc(R * m), J = l.stackAlloc(H * m), Ue = l.stackAlloc(H * m);
11952
11984
  try {
@@ -11962,33 +11994,33 @@ var Kr = k(() => {
11962
11994
  $e("wasm bindInputsOutputs");
11963
11995
  for (let q = 0; q < R; q++) {
11964
11996
  let we = r[q];
11965
- await l._OrtBindInput(_, T[we], Y[q]) !== 0 && G(`Can't bind input[${q}] for session=${a}.`);
11997
+ await l._OrtBindInput(_, T[we], Y[q]) !== 0 && $(`Can't bind input[${q}] for session=${a}.`);
11966
11998
  }
11967
11999
  for (let q = 0; q < H; q++) {
11968
12000
  let we = f[q];
11969
- i[q]?.[3] ? (oe.push(L[q]), l._OrtBindOutput(_, g[we], L[q], 0) !== 0 && G(`Can't bind pre-allocated output[${q}] for session=${a}.`)) : l._OrtBindOutput(_, g[we], 0, pe[we]) !== 0 && G(`Can't bind output[${q}] to ${ae[q]} for session=${a}.`);
12001
+ i[q]?.[3] ? (oe.push(L[q]), l._OrtBindOutput(_, g[we], L[q], 0) !== 0 && $(`Can't bind pre-allocated output[${q}] for session=${a}.`)) : l._OrtBindOutput(_, g[we], 0, pe[we]) !== 0 && $(`Can't bind output[${q}] to ${ae[q]} for session=${a}.`);
11970
12002
  }
11971
12003
  ze("wasm bindInputsOutputs"), it.set(a, [w, T, g, v, S, true]);
11972
12004
  }
11973
12005
  l.jsepOnRunStart?.(w), l.webnnOnRunStart?.(w);
11974
12006
  let Q;
11975
- v ? Q = await l._OrtRunWithBinding(w, v.handle, H, J, U) : Q = await l._OrtRun(w, X, ne, R, Ue, H, J, U), Q !== 0 && G("failed to call OrtRun().");
12007
+ v ? Q = await l._OrtRunWithBinding(w, v.handle, H, J, U) : Q = await l._OrtRun(w, X, ne, R, Ue, H, J, U), Q !== 0 && $("failed to call OrtRun().");
11976
12008
  let x = [], A = [];
11977
12009
  $e("wasm ProcessOutputTensor");
11978
12010
  for (let _ = 0; _ < H; _++) {
11979
12011
  let ae = Number(l.getValue(J + _ * m, "*"));
11980
12012
  if (ae === L[_] || oe.includes(L[_])) {
11981
- x.push(i[_]), ae !== L[_] && l._OrtReleaseTensor(ae) !== 0 && G("Can't release tensor.");
12013
+ x.push(i[_]), ae !== L[_] && l._OrtReleaseTensor(ae) !== 0 && $("Can't release tensor.");
11982
12014
  continue;
11983
12015
  }
11984
12016
  let pe = l.stackSave(), q = l.stackAlloc(4 * m), we = false, re, se = 0;
11985
12017
  try {
11986
- l._OrtGetTensorData(ae, q, q + m, q + 2 * m, q + 3 * m) !== 0 && G(`Can't access output tensor data on index ${_}.`);
12018
+ l._OrtGetTensorData(ae, q, q + m, q + 2 * m, q + 3 * m) !== 0 && $(`Can't access output tensor data on index ${_}.`);
11987
12019
  let Te = m === 4 ? "i32" : "i64", Ye = Number(l.getValue(q, Te));
11988
12020
  se = l.getValue(q + m, "*");
11989
12021
  let bt = l.getValue(q + m * 2, "*"), wt = Number(l.getValue(q + m * 3, Te)), Se = [];
11990
12022
  for (let ee = 0; ee < wt; ee++) Se.push(Number(l.getValue(bt + ee * m, Te)));
11991
- l._OrtFree(bt) !== 0 && G("Can't free memory for tensor dims.");
12023
+ l._OrtFree(bt) !== 0 && $("Can't free memory for tensor dims.");
11992
12024
  let Ae = Se.reduce((ee, Z) => ee * Z, 1);
11993
12025
  re = or(Ye);
11994
12026
  let Oe = v?.outputPreferredLocations[f[_]];
@@ -11996,24 +12028,24 @@ var Kr = k(() => {
11996
12028
  if (Oe === "gpu-buffer" || Oe === "ml-tensor") throw new Error("String tensor is not supported on GPU.");
11997
12029
  let ee = [];
11998
12030
  for (let Z = 0; Z < Ae; Z++) {
11999
- let z = l.getValue(se + Z * m, "*"), V = l.getValue(se + (Z + 1) * m, "*"), qe = Z === Ae - 1 ? void 0 : V - z;
12000
- ee.push(l.UTF8ToString(z, qe));
12031
+ let G = l.getValue(se + Z * m, "*"), V = l.getValue(se + (Z + 1) * m, "*"), qe = Z === Ae - 1 ? void 0 : V - G;
12032
+ ee.push(l.UTF8ToString(G, qe));
12001
12033
  }
12002
12034
  x.push([re, Se, ee, "cpu"]);
12003
12035
  } else if (Oe === "gpu-buffer" && Ae > 0) {
12004
12036
  let ee = l.webgpuGetBuffer;
12005
12037
  if (!ee) throw new Error('preferredLocation "gpu-buffer" is not supported without using WebGPU.');
12006
- let Z = ee(se), z = mt(Ye, Ae);
12007
- if (z === void 0 || !ar(re)) throw new Error(`Unsupported data type: ${re}`);
12038
+ let Z = ee(se), G = mt(Ye, Ae);
12039
+ if (G === void 0 || !ar(re)) throw new Error(`Unsupported data type: ${re}`);
12008
12040
  we = true;
12009
12041
  {
12010
12042
  l.webgpuRegisterBuffer(Z, a, se);
12011
- let V = l.webgpuCreateDownloader(Z, z, a);
12043
+ let V = l.webgpuCreateDownloader(Z, G, a);
12012
12044
  x.push([re, Se, { gpuBuffer: Z, download: async () => {
12013
12045
  let qe = await V();
12014
12046
  return new (at(re))(qe);
12015
12047
  }, dispose: () => {
12016
- l._OrtReleaseTensor(ae) !== 0 && G("Can't release tensor.");
12048
+ l._OrtReleaseTensor(ae) !== 0 && $("Can't release tensor.");
12017
12049
  } }, "gpu-buffer"]);
12018
12050
  }
12019
12051
  } else if (Oe === "ml-tensor" && Ae > 0) {
@@ -12028,8 +12060,8 @@ var Kr = k(() => {
12028
12060
  } else if (Oe === "ml-tensor-cpu-output" && Ae > 0) {
12029
12061
  let ee = l.webnnCreateMLTensorDownloader(se, re)(), Z = x.length;
12030
12062
  we = true, A.push((async () => {
12031
- let z = [Z, await ee];
12032
- return l.webnnReleaseTensorId(se), l._OrtReleaseTensor(ae), z;
12063
+ let G = [Z, await ee];
12064
+ return l.webnnReleaseTensorId(se), l._OrtReleaseTensor(ae), G;
12033
12065
  })()), x.push([re, Se, [], "cpu"]);
12034
12066
  } else {
12035
12067
  let ee = at(re), Z = new ee(Ae);
@@ -12039,7 +12071,7 @@ var Kr = k(() => {
12039
12071
  l.stackRestore(pe), re === "string" && se && l._free(se), we || l._OrtReleaseTensor(ae);
12040
12072
  }
12041
12073
  }
12042
- v && !S && (l._OrtClearBoundOutputs(v.handle) !== 0 && G("Can't clear bound outputs."), it.set(a, [w, T, g, v, S, false]));
12074
+ v && !S && (l._OrtClearBoundOutputs(v.handle) !== 0 && $("Can't clear bound outputs."), it.set(a, [w, T, g, v, S, false]));
12043
12075
  for (let [_, ae] of await Promise.all(A)) x[_][2] = ae;
12044
12076
  return ze("wasm ProcessOutputTensor"), x;
12045
12077
  } finally {
@@ -12050,10 +12082,10 @@ var Kr = k(() => {
12050
12082
  }), Y.forEach((Q) => l._OrtReleaseTensor(Q)), L.forEach((Q) => l._OrtReleaseTensor(Q)), W.forEach((Q) => l._free(Q)), U !== 0 && l._OrtReleaseRunOptions(U), M.forEach((Q) => l._free(Q));
12051
12083
  }
12052
12084
  }, er = (a) => {
12053
- let r = $(), s = it.get(a);
12085
+ let r = z(), s = it.get(a);
12054
12086
  if (!s) throw new Error("invalid session id");
12055
12087
  let f = s[0], i = r._OrtEndProfiling(f);
12056
- i === 0 && G("Can't get an profile file name."), r._OrtFree(i);
12088
+ i === 0 && $("Can't get an profile file name."), r._OrtFree(i);
12057
12089
  }, tr = (a) => {
12058
12090
  let r = [];
12059
12091
  for (let s of a) {
@@ -12286,7 +12318,7 @@ var $s = k(() => {
12286
12318
  Ve();
12287
12319
  Ve();
12288
12320
  Ve();
12289
- var Xa = "1.25.0-dev.20260307-d626b568e0";
12321
+ var Xa = "1.25.0-dev.20260323-a99aad9d36";
12290
12322
  var Tl = Zr;
12291
12323
  {
12292
12324
  let a = ($s(), $t(Gs)).wasmBackend;
@@ -16642,7 +16674,9 @@ var processors_exports = {};
16642
16674
  __export(processors_exports, {
16643
16675
  ChatterboxProcessor: () => ChatterboxProcessor,
16644
16676
  Florence2Processor: () => Florence2Processor,
16677
+ Gemma3Processor: () => Gemma3Processor,
16645
16678
  Gemma3nProcessor: () => Gemma3nProcessor,
16679
+ Glm46VProcessor: () => Glm46VProcessor,
16646
16680
  GraniteSpeechProcessor: () => GraniteSpeechProcessor,
16647
16681
  GroundingDinoProcessor: () => GroundingDinoProcessor,
16648
16682
  Idefics3Processor: () => Idefics3Processor,
@@ -19147,26 +19181,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
19147
19181
  }
19148
19182
  return [segmentation, segments];
19149
19183
  }
19150
- function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
19184
+ function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
19151
19185
  if (height < factor || width < factor) {
19152
- throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
19153
- } else if (Math.max(height, width) / Math.min(height, width) > 200) {
19186
+ const scale = Math.max(factor / height, factor / width);
19187
+ height = Math.round(height * scale);
19188
+ width = Math.round(width * scale);
19189
+ }
19190
+ if (Math.max(height, width) / Math.min(height, width) > 200) {
19154
19191
  throw new Error(
19155
19192
  `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
19156
19193
  );
19157
19194
  }
19158
19195
  let h_bar = Math.round(height / factor) * factor;
19159
19196
  let w_bar = Math.round(width / factor) * factor;
19160
- if (h_bar * w_bar > max_pixels) {
19161
- const beta = Math.sqrt(height * width / max_pixels);
19162
- h_bar = Math.floor(height / beta / factor) * factor;
19163
- w_bar = Math.floor(width / beta / factor) * factor;
19164
- } else if (h_bar * w_bar < min_pixels) {
19165
- const beta = Math.sqrt(min_pixels / (height * width));
19197
+ if (temporal_factor * h_bar * w_bar > max_pixels) {
19198
+ const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
19199
+ h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
19200
+ w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
19201
+ } else if (temporal_factor * h_bar * w_bar < min_pixels) {
19202
+ const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
19166
19203
  h_bar = Math.ceil(height * beta / factor) * factor;
19167
19204
  w_bar = Math.ceil(width * beta / factor) * factor;
19168
19205
  }
19169
- return [h_bar, w_bar];
19206
+ return [w_bar, h_bar];
19170
19207
  }
19171
19208
  function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
19172
19209
  if (label_ids_to_fuse === null) {
@@ -19245,7 +19282,7 @@ var ImageProcessor = class extends Callable2 {
19245
19282
  this.do_pad = config.do_pad;
19246
19283
  this.min_pixels = config.min_pixels;
19247
19284
  this.max_pixels = config.max_pixels;
19248
- if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
19285
+ if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
19249
19286
  this.pad_size = this.size;
19250
19287
  }
19251
19288
  this.do_flip_channel_order = config.do_flip_channel_order ?? false;
@@ -19533,10 +19570,8 @@ var ImageProcessor = class extends Callable2 {
19533
19570
  const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
19534
19571
  [pixelData, imgDims] = padded;
19535
19572
  } else if (this.size_divisibility) {
19536
- const [paddedWidth, paddedHeight] = enforce_size_divisibility(
19537
- [imgDims[1], imgDims[0]],
19538
- this.size_divisibility
19539
- );
19573
+ const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
19574
+ const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
19540
19575
  [pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
19541
19576
  }
19542
19577
  }
@@ -19613,6 +19648,7 @@ var image_processors_exports = {};
19613
19648
  __export(image_processors_exports, {
19614
19649
  BeitFeatureExtractor: () => BeitFeatureExtractor,
19615
19650
  BitImageProcessor: () => BitImageProcessor,
19651
+ CHMv2ImageProcessor: () => CHMv2ImageProcessor,
19616
19652
  CLIPFeatureExtractor: () => CLIPFeatureExtractor,
19617
19653
  CLIPImageProcessor: () => CLIPImageProcessor,
19618
19654
  ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
@@ -19629,6 +19665,8 @@ __export(image_processors_exports, {
19629
19665
  DonutImageProcessor: () => DonutImageProcessor,
19630
19666
  EfficientNetImageProcessor: () => EfficientNetImageProcessor,
19631
19667
  GLPNFeatureExtractor: () => GLPNFeatureExtractor,
19668
+ Gemma3ImageProcessor: () => Gemma3ImageProcessor,
19669
+ Glm46VImageProcessor: () => Glm46VImageProcessor,
19632
19670
  GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
19633
19671
  Idefics3ImageProcessor: () => Idefics3ImageProcessor,
19634
19672
  ImageFeatureExtractor: () => ImageProcessor,
@@ -19689,6 +19727,10 @@ var BitImageProcessor = class extends ImageProcessor {
19689
19727
  var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
19690
19728
  };
19691
19729
 
19730
+ // src/models/chmv2/image_processing_chmv2.js
19731
+ var CHMv2ImageProcessor = class extends ImageProcessor {
19732
+ };
19733
+
19692
19734
  // src/models/clip/image_processing_clip.js
19693
19735
  var CLIPImageProcessor = class extends ImageProcessor {
19694
19736
  };
@@ -19808,6 +19850,69 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
19808
19850
  }
19809
19851
  };
19810
19852
 
19853
+ // src/models/gemma3/image_processing_gemma3.js
19854
+ var Gemma3ImageProcessor = class extends ImageProcessor {
19855
+ };
19856
+
19857
+ // src/models/qwen2_vl/image_processing_qwen2_vl.js
19858
+ var Qwen2VLImageProcessor = class extends ImageProcessor {
19859
+ constructor(config) {
19860
+ super(config);
19861
+ this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
19862
+ this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
19863
+ this.patch_size = config.patch_size;
19864
+ this.merge_size = config.merge_size;
19865
+ }
19866
+ /** @type {ImageProcessor['get_resize_output_image_size']} */
19867
+ get_resize_output_image_size(image, size) {
19868
+ const factor = this.patch_size * this.merge_size;
19869
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
19870
+ }
19871
+ async _call(images, ...args) {
19872
+ const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
19873
+ let patches = pixel_values;
19874
+ const { temporal_patch_size, merge_size, patch_size } = this.config;
19875
+ if (patches.dims[0] === 1) {
19876
+ patches = cat(
19877
+ Array.from({ length: temporal_patch_size }, () => patches),
19878
+ 0
19879
+ );
19880
+ }
19881
+ const grid_t = patches.dims[0] / temporal_patch_size;
19882
+ const channel = patches.dims[1];
19883
+ const grid_h = Math.floor(patches.dims[2] / patch_size);
19884
+ const grid_w = Math.floor(patches.dims[3] / patch_size);
19885
+ const flatten_patches = patches.view(
19886
+ grid_t,
19887
+ temporal_patch_size,
19888
+ channel,
19889
+ Math.floor(grid_h / merge_size),
19890
+ merge_size,
19891
+ patch_size,
19892
+ Math.floor(grid_w / merge_size),
19893
+ merge_size,
19894
+ patch_size
19895
+ ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
19896
+ const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
19897
+ return {
19898
+ pixel_values: flatten_patches,
19899
+ image_grid_thw,
19900
+ original_sizes,
19901
+ reshaped_input_sizes
19902
+ };
19903
+ }
19904
+ };
19905
+
19906
+ // src/models/glm46v/image_processing_glm46v.js
19907
+ var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
19908
+ /** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
19909
+ get_resize_output_image_size(image, size) {
19910
+ const factor = this.patch_size * this.merge_size;
19911
+ const temporal_factor = this.config.temporal_patch_size ?? 2;
19912
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
19913
+ }
19914
+ };
19915
+
19811
19916
  // src/models/glpn/image_processing_glpn.js
19812
19917
  var GLPNFeatureExtractor = class extends ImageProcessor {
19813
19918
  };
@@ -20201,7 +20306,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
20201
20306
  const img = pixel_values.unsqueeze_(0);
20202
20307
  const total_factor = this.encoder_patch_size * this.downsample_factor;
20203
20308
  const f2 = total_factor ** 2;
20204
- const [new_height, new_width] = smart_resize(
20309
+ const [new_width, new_height] = smart_resize(
20205
20310
  Math.max(total_factor, height),
20206
20311
  Math.max(total_factor, width),
20207
20312
  total_factor,
@@ -20491,55 +20596,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
20491
20596
  var PvtImageProcessor = class extends ImageProcessor {
20492
20597
  };
20493
20598
 
20494
- // src/models/qwen2_vl/image_processing_qwen2_vl.js
20495
- var Qwen2VLImageProcessor = class extends ImageProcessor {
20496
- constructor(config) {
20497
- super(config);
20498
- this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
20499
- this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
20500
- this.patch_size = config.patch_size;
20501
- this.merge_size = config.merge_size;
20502
- }
20503
- /** @type {ImageProcessor['get_resize_output_image_size']} */
20504
- get_resize_output_image_size(image, size) {
20505
- const factor = this.patch_size * this.merge_size;
20506
- return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
20507
- }
20508
- async _call(images, ...args) {
20509
- const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
20510
- let patches = pixel_values;
20511
- const { temporal_patch_size, merge_size, patch_size } = this.config;
20512
- if (patches.dims[0] === 1) {
20513
- patches = cat(
20514
- Array.from({ length: temporal_patch_size }, () => patches),
20515
- 0
20516
- );
20517
- }
20518
- const grid_t = patches.dims[0] / temporal_patch_size;
20519
- const channel = patches.dims[1];
20520
- const grid_h = Math.floor(patches.dims[2] / patch_size);
20521
- const grid_w = Math.floor(patches.dims[3] / patch_size);
20522
- const flatten_patches = patches.view(
20523
- grid_t,
20524
- temporal_patch_size,
20525
- channel,
20526
- Math.floor(grid_h / merge_size),
20527
- merge_size,
20528
- patch_size,
20529
- Math.floor(grid_w / merge_size),
20530
- merge_size,
20531
- patch_size
20532
- ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
20533
- const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
20534
- return {
20535
- pixel_values: flatten_patches,
20536
- image_grid_thw,
20537
- original_sizes,
20538
- reshaped_input_sizes
20539
- };
20540
- }
20541
- };
20542
-
20543
20599
  // src/models/rt_detr/image_processing_rt_detr.js
20544
20600
  var RTDetrImageProcessor = class extends ImageProcessor {
20545
20601
  /** @type {typeof post_process_object_detection} */
@@ -21021,6 +21077,48 @@ var Florence2Processor = class extends Processor {
21021
21077
  }
21022
21078
  };
21023
21079
 
21080
+ // src/models/gemma3/processing_gemma3.js
21081
+ var Gemma3Processor = class extends Processor {
21082
+ static tokenizer_class = AutoTokenizer;
21083
+ static image_processor_class = AutoImageProcessor;
21084
+ static uses_processor_config = true;
21085
+ static uses_chat_template_file = true;
21086
+ constructor(config, components, chat_template) {
21087
+ super(config, components, chat_template);
21088
+ this.image_seq_length = this.config.image_seq_length;
21089
+ const { boi_token, image_token, eoi_token } = this.tokenizer.config;
21090
+ this.boi_token = boi_token;
21091
+ this.image_token = image_token;
21092
+ this.eoi_token = eoi_token;
21093
+ const image_tokens_expanded = image_token.repeat(this.image_seq_length);
21094
+ this.full_image_sequence = `
21095
+
21096
+ ${boi_token}${image_tokens_expanded}${eoi_token}
21097
+
21098
+ `;
21099
+ }
21100
+ /**
21101
+ * @param {string|string[]} text
21102
+ * @param {import('../../utils/image.js').RawImage|import('../../utils/image.js').RawImage[]} [images]
21103
+ * @param {Object} [options]
21104
+ */
21105
+ async _call(text, images = null, options = {}) {
21106
+ if (typeof text === "string") {
21107
+ text = [text];
21108
+ }
21109
+ let image_inputs;
21110
+ if (images) {
21111
+ image_inputs = await this.image_processor(images, options);
21112
+ text = text.map((prompt) => prompt.replaceAll(this.boi_token, this.full_image_sequence));
21113
+ }
21114
+ const text_inputs = this.tokenizer(text, options);
21115
+ return {
21116
+ ...text_inputs,
21117
+ ...image_inputs
21118
+ };
21119
+ }
21120
+ };
21121
+
21024
21122
  // src/models/gemma3n/processing_gemma3n.js
21025
21123
  var Gemma3nProcessor = class extends Processor {
21026
21124
  static image_processor_class = AutoImageProcessor;
@@ -21093,6 +21191,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
21093
21191
  }
21094
21192
  };
21095
21193
 
21194
+ // src/models/qwen2_vl/processing_qwen2_vl.js
21195
+ var Qwen2VLProcessor = class extends Processor {
21196
+ static image_processor_class = AutoImageProcessor;
21197
+ static tokenizer_class = AutoTokenizer;
21198
+ static image_token = "<|image_pad|>";
21199
+ /**
21200
+ *
21201
+ * @param {string|string[]} text
21202
+ * @param {RawImage|RawImage[]} images
21203
+ * @param {...any} args
21204
+ * @returns {Promise<any>}
21205
+ */
21206
+ async _call(text, images = null, ...args) {
21207
+ if (!Array.isArray(text)) {
21208
+ text = [text];
21209
+ }
21210
+ let image_inputs, image_grid_thw;
21211
+ if (images) {
21212
+ image_inputs = await this.image_processor(images);
21213
+ image_grid_thw = image_inputs.image_grid_thw;
21214
+ }
21215
+ if (image_grid_thw) {
21216
+ let merge_length = this.image_processor.config.merge_size ** 2;
21217
+ let index = 0;
21218
+ const image_token = (
21219
+ /** @type {typeof Qwen2VLProcessor} */
21220
+ this.constructor.image_token
21221
+ );
21222
+ const image_grid_thw_list = image_grid_thw.tolist();
21223
+ text = text.map((t) => {
21224
+ while (t.includes(image_token)) {
21225
+ const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
21226
+ t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
21227
+ }
21228
+ return t.replaceAll("<|placeholder|>", image_token);
21229
+ });
21230
+ }
21231
+ const text_inputs = this.tokenizer(text);
21232
+ return {
21233
+ ...text_inputs,
21234
+ ...image_inputs
21235
+ };
21236
+ }
21237
+ };
21238
+
21239
+ // src/models/glm46v/processing_glm46v.js
21240
+ var Glm46VProcessor = class extends Qwen2VLProcessor {
21241
+ static image_token = "<|image|>";
21242
+ };
21243
+
21096
21244
  // src/models/granite_speech/processing_granite_speech.js
21097
21245
  var GraniteSpeechProcessor = class extends Processor {
21098
21246
  static tokenizer_class = AutoTokenizer;
@@ -21823,47 +21971,6 @@ var PyAnnoteProcessor = class extends Processor {
21823
21971
  }
21824
21972
  };
21825
21973
 
21826
- // src/models/qwen2_vl/processing_qwen2_vl.js
21827
- var Qwen2VLProcessor = class extends Processor {
21828
- static image_processor_class = AutoImageProcessor;
21829
- static tokenizer_class = AutoTokenizer;
21830
- /**
21831
- *
21832
- * @param {string|string[]} text
21833
- * @param {RawImage|RawImage[]} images
21834
- * @param {...any} args
21835
- * @returns {Promise<any>}
21836
- */
21837
- async _call(text, images = null, ...args) {
21838
- if (!Array.isArray(text)) {
21839
- text = [text];
21840
- }
21841
- let image_inputs, image_grid_thw;
21842
- if (images) {
21843
- image_inputs = await this.image_processor(images);
21844
- image_grid_thw = image_inputs.image_grid_thw;
21845
- }
21846
- if (image_grid_thw) {
21847
- let merge_length = this.image_processor.config.merge_size ** 2;
21848
- let index = 0;
21849
- const image_grid_thw_list = image_grid_thw.tolist();
21850
- text = text.map((t) => {
21851
- while (t.includes("<|image_pad|>")) {
21852
- const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
21853
- t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
21854
- }
21855
- return t.replaceAll("<|placeholder|>", "<|image_pad|>");
21856
- });
21857
- }
21858
- const text_inputs = this.tokenizer(text);
21859
- return {
21860
- ...text_inputs,
21861
- ...image_inputs
21862
- // TODO: ...videos_inputs,
21863
- };
21864
- }
21865
- };
21866
-
21867
21974
  // src/models/qwen2_5_vl/processing_qwen2_5_vl.js
21868
21975
  var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
21869
21976
  };
@@ -22207,6 +22314,8 @@ function getNormalizedConfig(config) {
22207
22314
  case "gemma3n":
22208
22315
  case "lfm2_vl":
22209
22316
  case "chatterbox":
22317
+ case "lighton_ocr":
22318
+ case "glm_ocr":
22210
22319
  case "mistral3":
22211
22320
  case "qwen2_5_vl":
22212
22321
  case "qwen3_vl":
@@ -22282,6 +22391,8 @@ function getNormalizedConfig(config) {
22282
22391
  mapping["dim_kv"] = "head_dim";
22283
22392
  break;
22284
22393
  case "qwen3":
22394
+ case "solar_open":
22395
+ case "glm_ocr_text":
22285
22396
  case "gemma":
22286
22397
  case "gemma2":
22287
22398
  case "vaultgemma":
@@ -22292,6 +22403,7 @@ function getNormalizedConfig(config) {
22292
22403
  case "ernie4_5":
22293
22404
  case "hunyuan_v1_dense":
22294
22405
  case "falcon_h1":
22406
+ case "nemotron_h":
22295
22407
  case "ministral":
22296
22408
  case "ministral3":
22297
22409
  mapping["num_heads"] = "num_key_value_heads";
@@ -22326,6 +22438,9 @@ function getNormalizedConfig(config) {
22326
22438
  mapping["num_attention_heads"] = "num_attention_heads";
22327
22439
  break;
22328
22440
  case "youtu":
22441
+ case "deepseek_v3":
22442
+ case "glm_moe_dsa":
22443
+ case "mistral4":
22329
22444
  mapping["num_heads"] = "num_key_value_heads";
22330
22445
  mapping["num_layers"] = "num_hidden_layers";
22331
22446
  mapping["dim_kv"] = "qk_head_dim";
@@ -22414,6 +22529,7 @@ function getCacheShapes(config, options) {
22414
22529
  if (!(config instanceof PretrainedConfig)) {
22415
22530
  config = new PretrainedConfig(config);
22416
22531
  }
22532
+ const batch_size = options?.batch_size ?? 1;
22417
22533
  if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
22418
22534
  const pkv_prefix = options?.prefix ?? "past_key_values";
22419
22535
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
@@ -22423,7 +22539,6 @@ function getCacheShapes(config, options) {
22423
22539
  config
22424
22540
  );
22425
22541
  const head_dim = hidden_size / num_attention_heads;
22426
- const batch_size = options?.batch_size ?? 1;
22427
22542
  for (let i = 0; i < layer_types.length; ++i) {
22428
22543
  if (layer_types[i] === "full_attention") {
22429
22544
  for (const kv of ["key", "value"]) {
@@ -22436,31 +22551,26 @@ function getCacheShapes(config, options) {
22436
22551
  }
22437
22552
  }
22438
22553
  return cache_values;
22439
- } else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
22554
+ } else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
22440
22555
  const pkv_prefix = options?.prefix ?? "past_key_values";
22441
22556
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
22442
- const cache_values = {};
22443
- const {
22444
- layer_types,
22445
- num_hidden_layers,
22446
- num_attention_heads,
22447
- num_key_value_heads,
22448
- hidden_size,
22449
- mamba_d_conv,
22450
- mamba_n_heads,
22451
- mamba_d_head,
22452
- mamba_d_state,
22453
- mamba_n_groups,
22454
- mamba_expand,
22455
- mamba_d_ssm
22456
- } = (
22557
+ const c = (
22457
22558
  /** @type {any} */
22458
22559
  config
22459
22560
  );
22460
- const head_dim = hidden_size / num_attention_heads;
22461
- const batch_size = options?.batch_size ?? 1;
22462
- const conv_d_inner = (mamba_d_ssm ?? mamba_expand * hidden_size) + 2 * mamba_n_groups * mamba_d_state;
22463
- for (let i = 0; i < num_hidden_layers; ++i) {
22561
+ const layer_types = c.layer_types ?? c.layers_block_type;
22562
+ const num_layers = c.num_hidden_layers ?? layer_types?.length;
22563
+ const num_key_value_heads = c.num_key_value_heads;
22564
+ const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
22565
+ const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
22566
+ const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
22567
+ const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
22568
+ const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
22569
+ const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
22570
+ const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
22571
+ const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
22572
+ const cache_values = {};
22573
+ for (let i = 0; i < num_layers; ++i) {
22464
22574
  if (!layer_types || layer_types[i] === "mamba") {
22465
22575
  cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
22466
22576
  cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
@@ -22494,7 +22604,6 @@ function getCacheShapes(config, options) {
22494
22604
  const key_dim = linear_key_head_dim * linear_num_key_heads;
22495
22605
  const value_dim = linear_value_head_dim * linear_num_value_heads;
22496
22606
  const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
22497
- const batch_size = options?.batch_size ?? 1;
22498
22607
  for (let i = 0; i < layer_types.length; ++i) {
22499
22608
  if (layer_types[i] === "full_attention") {
22500
22609
  for (const kv of ["key", "value"]) {
@@ -24190,8 +24299,7 @@ var MODEL_TYPES = {
24190
24299
  ImageAudioTextToText: 13,
24191
24300
  Supertonic: 14,
24192
24301
  Chatterbox: 15,
24193
- MultimodalLanguageModelOnly: 16,
24194
- VoxtralRealtime: 17
24302
+ VoxtralRealtime: 16
24195
24303
  };
24196
24304
  var MODEL_TYPE_CONFIG = {
24197
24305
  [MODEL_TYPES.DecoderOnly]: {
@@ -24248,12 +24356,12 @@ var MODEL_TYPE_CONFIG = {
24248
24356
  can_generate: true,
24249
24357
  forward: image_text_to_text_forward,
24250
24358
  prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
24251
- sessions: (config) => {
24359
+ sessions: (config, options, textOnly) => {
24252
24360
  const s = {
24253
24361
  embed_tokens: "embed_tokens",
24254
- vision_encoder: "vision_encoder",
24255
24362
  decoder_model_merged: "decoder_model_merged"
24256
24363
  };
24364
+ if (!textOnly) s["vision_encoder"] = "vision_encoder";
24257
24365
  if (config.is_encoder_decoder) s["model"] = "encoder_model";
24258
24366
  return s;
24259
24367
  },
@@ -24275,12 +24383,17 @@ var MODEL_TYPE_CONFIG = {
24275
24383
  [MODEL_TYPES.ImageAudioTextToText]: {
24276
24384
  can_generate: true,
24277
24385
  prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
24278
- sessions: () => ({
24279
- embed_tokens: "embed_tokens",
24280
- audio_encoder: "audio_encoder",
24281
- vision_encoder: "vision_encoder",
24282
- decoder_model_merged: "decoder_model_merged"
24283
- }),
24386
+ sessions: (config, options, textOnly) => {
24387
+ const s = {
24388
+ embed_tokens: "embed_tokens",
24389
+ decoder_model_merged: "decoder_model_merged"
24390
+ };
24391
+ if (!textOnly) {
24392
+ s["audio_encoder"] = "audio_encoder";
24393
+ s["vision_encoder"] = "vision_encoder";
24394
+ }
24395
+ return s;
24396
+ },
24284
24397
  optional_configs: { generation_config: "generation_config.json" }
24285
24398
  },
24286
24399
  [MODEL_TYPES.Phi3V]: {
@@ -24331,14 +24444,6 @@ var MODEL_TYPE_CONFIG = {
24331
24444
  cache_sessions: { model: true },
24332
24445
  optional_configs: { generation_config: "generation_config.json" }
24333
24446
  },
24334
- [MODEL_TYPES.MultimodalLanguageModelOnly]: {
24335
- can_generate: true,
24336
- forward: image_text_to_text_forward,
24337
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
24338
- sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
24339
- cache_sessions: { decoder_model_merged: true },
24340
- optional_configs: { generation_config: "generation_config.json" }
24341
- },
24342
24447
  [MODEL_TYPES.VoxtralRealtime]: {
24343
24448
  can_generate: true,
24344
24449
  prepare_inputs: decoder_prepare_inputs_for_generation,
@@ -24364,6 +24469,19 @@ function getSessionsConfig(modelType, config, options = {}) {
24364
24469
  optional_configs: typeConfig.optional_configs
24365
24470
  };
24366
24471
  }
24472
+ function resolveTypeConfig(modelName, config) {
24473
+ let modelType = MODEL_TYPE_MAPPING.get(modelName);
24474
+ let textOnly = false;
24475
+ const nativeArch = config?.architectures?.[0];
24476
+ if (nativeArch && nativeArch !== modelName && modelName?.endsWith("ForCausalLM") && nativeArch.endsWith("ForConditionalGeneration")) {
24477
+ const nativeType = MODEL_TYPE_MAPPING.get(nativeArch);
24478
+ if (nativeType !== void 0) {
24479
+ modelType = nativeType;
24480
+ textOnly = true;
24481
+ }
24482
+ }
24483
+ return { typeConfig: MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default, textOnly, modelType };
24484
+ }
24367
24485
  var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
24368
24486
  var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
24369
24487
  var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
@@ -24383,8 +24501,7 @@ var PreTrainedModel = class extends Callable2 {
24383
24501
  this.sessions = sessions;
24384
24502
  this.configs = configs;
24385
24503
  const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor);
24386
- const modelType = MODEL_TYPE_MAPPING.get(modelName);
24387
- const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
24504
+ const { typeConfig } = resolveTypeConfig(modelName, config);
24388
24505
  this.can_generate = typeConfig.can_generate;
24389
24506
  this._forward = typeConfig.forward;
24390
24507
  this._prepare_inputs_for_generation = typeConfig.prepare_inputs;
@@ -24447,9 +24564,8 @@ var PreTrainedModel = class extends Callable2 {
24447
24564
  session_options
24448
24565
  };
24449
24566
  const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
24450
- const modelType = MODEL_TYPE_MAPPING.get(modelName);
24451
24567
  config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
24452
- const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
24568
+ const { typeConfig, textOnly, modelType } = resolveTypeConfig(modelName, config);
24453
24569
  if (modelType === void 0) {
24454
24570
  const type = modelName ?? config?.model_type;
24455
24571
  if (type !== "custom") {
@@ -24458,7 +24574,7 @@ var PreTrainedModel = class extends Callable2 {
24458
24574
  );
24459
24575
  }
24460
24576
  }
24461
- const sessions = typeConfig.sessions(config, options);
24577
+ const sessions = typeConfig.sessions(config, options, textOnly);
24462
24578
  const promises = [
24463
24579
  constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
24464
24580
  ];
@@ -25122,7 +25238,9 @@ async function generic_text_to_text_forward(self2, {
25122
25238
  "qwen3_5",
25123
25239
  "qwen3_5_text",
25124
25240
  "qwen3_5_moe",
25125
- "qwen3_5_moe_text"
25241
+ "qwen3_5_moe_text",
25242
+ "glm_ocr",
25243
+ "glm_ocr_text"
25126
25244
  ].includes(self2.config.model_type)
25127
25245
  ) {
25128
25246
  const { image_grid_thw, video_grid_thw } = kwargs;
@@ -25346,6 +25464,8 @@ __export(models_exports, {
25346
25464
  BloomForCausalLM: () => BloomForCausalLM,
25347
25465
  BloomModel: () => BloomModel,
25348
25466
  BloomPreTrainedModel: () => BloomPreTrainedModel,
25467
+ CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
25468
+ CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
25349
25469
  CLIPModel: () => CLIPModel,
25350
25470
  CLIPPreTrainedModel: () => CLIPPreTrainedModel,
25351
25471
  CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
@@ -25420,6 +25540,9 @@ __export(models_exports, {
25420
25540
  DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
25421
25541
  DecisionTransformerModel: () => DecisionTransformerModel,
25422
25542
  DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
25543
+ DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
25544
+ DeepseekV3Model: () => DeepseekV3Model,
25545
+ DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
25423
25546
  DeiTForImageClassification: () => DeiTForImageClassification,
25424
25547
  DeiTModel: () => DeiTModel,
25425
25548
  DeiTPreTrainedModel: () => DeiTPreTrainedModel,
@@ -25465,6 +25588,11 @@ __export(models_exports, {
25465
25588
  EsmForTokenClassification: () => EsmForTokenClassification,
25466
25589
  EsmModel: () => EsmModel,
25467
25590
  EsmPreTrainedModel: () => EsmPreTrainedModel,
25591
+ EuroBertForMaskedLM: () => EuroBertForMaskedLM,
25592
+ EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
25593
+ EuroBertForTokenClassification: () => EuroBertForTokenClassification,
25594
+ EuroBertModel: () => EuroBertModel,
25595
+ EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
25468
25596
  ExaoneForCausalLM: () => ExaoneForCausalLM,
25469
25597
  ExaoneModel: () => ExaoneModel,
25470
25598
  ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -25501,6 +25629,7 @@ __export(models_exports, {
25501
25629
  Gemma2Model: () => Gemma2Model,
25502
25630
  Gemma2PreTrainedModel: () => Gemma2PreTrainedModel,
25503
25631
  Gemma3ForCausalLM: () => Gemma3ForCausalLM,
25632
+ Gemma3ForConditionalGeneration: () => Gemma3ForConditionalGeneration,
25504
25633
  Gemma3Model: () => Gemma3Model,
25505
25634
  Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
25506
25635
  Gemma3nForCausalLM: () => Gemma3nForCausalLM,
@@ -25511,6 +25640,10 @@ __export(models_exports, {
25511
25640
  GemmaPreTrainedModel: () => GemmaPreTrainedModel,
25512
25641
  GlmForCausalLM: () => GlmForCausalLM,
25513
25642
  GlmModel: () => GlmModel,
25643
+ GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
25644
+ GlmMoeDsaModel: () => GlmMoeDsaModel,
25645
+ GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
25646
+ GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
25514
25647
  GlmPreTrainedModel: () => GlmPreTrainedModel,
25515
25648
  GptOssForCausalLM: () => GptOssForCausalLM,
25516
25649
  GptOssModel: () => GptOssModel,
@@ -25557,6 +25690,7 @@ __export(models_exports, {
25557
25690
  Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
25558
25691
  Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
25559
25692
  Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
25693
+ LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
25560
25694
  LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
25561
25695
  Llama4ForCausalLM: () => Llama4ForCausalLM,
25562
25696
  Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -25606,6 +25740,9 @@ __export(models_exports, {
25606
25740
  MimiEncoderOutput: () => MimiEncoderOutput,
25607
25741
  MimiModel: () => MimiModel,
25608
25742
  MimiPreTrainedModel: () => MimiPreTrainedModel,
25743
+ Mistral4ForCausalLM: () => Mistral4ForCausalLM,
25744
+ Mistral4Model: () => Mistral4Model,
25745
+ Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
25609
25746
  MistralForCausalLM: () => MistralForCausalLM,
25610
25747
  MistralModel: () => MistralModel,
25611
25748
  MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -25663,6 +25800,9 @@ __export(models_exports, {
25663
25800
  NanoChatForCausalLM: () => NanoChatForCausalLM,
25664
25801
  NanoChatModel: () => NanoChatModel,
25665
25802
  NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
25803
+ NemotronHForCausalLM: () => NemotronHForCausalLM,
25804
+ NemotronHModel: () => NemotronHModel,
25805
+ NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
25666
25806
  NeoBertForMaskedLM: () => NeoBertForMaskedLM,
25667
25807
  NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
25668
25808
  NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -25800,6 +25940,9 @@ __export(models_exports, {
25800
25940
  SnacEncoderModel: () => SnacEncoderModel,
25801
25941
  SnacModel: () => SnacModel,
25802
25942
  SnacPreTrainedModel: () => SnacPreTrainedModel,
25943
+ SolarOpenForCausalLM: () => SolarOpenForCausalLM,
25944
+ SolarOpenModel: () => SolarOpenModel,
25945
+ SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
25803
25946
  SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
25804
25947
  SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
25805
25948
  SpeechT5HifiGan: () => SpeechT5HifiGan,
@@ -25974,7 +26117,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
25974
26117
  var ArceeForCausalLM = class extends ArceePreTrainedModel {
25975
26118
  };
25976
26119
 
25977
- // src/models/ast/modeling_ast.js
26120
+ // src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
25978
26121
  var ASTPreTrainedModel = class extends PreTrainedModel {
25979
26122
  };
25980
26123
  var ASTModel = class extends ASTPreTrainedModel {
@@ -26309,6 +26452,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
26309
26452
  var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
26310
26453
  };
26311
26454
 
26455
+ // src/models/chmv2/modeling_chmv2.js
26456
+ var CHMv2PreTrainedModel = class extends PreTrainedModel {
26457
+ };
26458
+ var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
26459
+ };
26460
+
26312
26461
  // src/models/clap/modeling_clap.js
26313
26462
  var ClapPreTrainedModel = class extends PreTrainedModel {
26314
26463
  };
@@ -26647,6 +26796,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
26647
26796
  }
26648
26797
  };
26649
26798
 
26799
+ // src/models/deepseek_v3/modeling_deepseek_v3.js
26800
+ var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
26801
+ };
26802
+ var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
26803
+ };
26804
+ var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
26805
+ };
26806
+
26650
26807
  // src/models/deberta_v2/modeling_deberta_v2.js
26651
26808
  var DebertaV2PreTrainedModel = class extends PreTrainedModel {
26652
26809
  };
@@ -26995,6 +27152,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
26995
27152
  }
26996
27153
  };
26997
27154
 
27155
+ // src/models/eurobert/modeling_eurobert.js
27156
+ var EuroBertPreTrainedModel = class extends PreTrainedModel {
27157
+ };
27158
+ var EuroBertModel = class extends EuroBertPreTrainedModel {
27159
+ };
27160
+ var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
27161
+ /**
27162
+ * Calls the model on new inputs.
27163
+ *
27164
+ * @param {Object} model_inputs The inputs to the model.
27165
+ * @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
27166
+ */
27167
+ async _call(model_inputs) {
27168
+ return new MaskedLMOutput(await super._call(model_inputs));
27169
+ }
27170
+ };
27171
+ var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
27172
+ /**
27173
+ * Calls the model on new inputs.
27174
+ *
27175
+ * @param {Object} model_inputs The inputs to the model.
27176
+ * @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
27177
+ */
27178
+ async _call(model_inputs) {
27179
+ return new SequenceClassifierOutput(await super._call(model_inputs));
27180
+ }
27181
+ };
27182
+ var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
27183
+ /**
27184
+ * Calls the model on new inputs.
27185
+ *
27186
+ * @param {Object} model_inputs The inputs to the model.
27187
+ * @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
27188
+ */
27189
+ async _call(model_inputs) {
27190
+ return new TokenClassifierOutput(await super._call(model_inputs));
27191
+ }
27192
+ };
27193
+
26998
27194
  // src/models/exaone/modeling_exaone.js
26999
27195
  var ExaonePreTrainedModel = class extends PreTrainedModel {
27000
27196
  };
@@ -27152,12 +27348,35 @@ var Gemma2Model = class extends Gemma2PreTrainedModel {
27152
27348
  var Gemma2ForCausalLM = class extends Gemma2PreTrainedModel {
27153
27349
  };
27154
27350
 
27351
+ // src/models/llava/modeling_llava.js
27352
+ var LlavaPreTrainedModel = class extends PreTrainedModel {
27353
+ forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
27354
+ };
27355
+ var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
27356
+ _merge_input_ids_with_image_features(kwargs) {
27357
+ const vision_hidden_size = kwargs.image_features.dims.at(-1);
27358
+ const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
27359
+ return default_merge_input_ids_with_image_features({
27360
+ // @ts-ignore
27361
+ image_token_id: this.config.image_token_index ?? this.config.image_token_id,
27362
+ ...kwargs,
27363
+ image_features: reshaped_image_hidden_states
27364
+ });
27365
+ }
27366
+ };
27367
+ var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
27368
+ };
27369
+ var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
27370
+ };
27371
+
27155
27372
  // src/models/gemma3/modeling_gemma3.js
27156
27373
  var Gemma3PreTrainedModel = class extends PreTrainedModel {
27157
27374
  };
27158
27375
  var Gemma3Model = class extends Gemma3PreTrainedModel {
27159
27376
  };
27160
- var Gemma3ForCausalLM = class extends Gemma3PreTrainedModel {
27377
+ var Gemma3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
27378
+ };
27379
+ var Gemma3ForCausalLM = class extends Gemma3ForConditionalGeneration {
27161
27380
  };
27162
27381
 
27163
27382
  // src/models/gemma3n/modeling_gemma3n.js
@@ -27270,6 +27489,382 @@ var GlmModel = class extends GlmPreTrainedModel {
27270
27489
  var GlmForCausalLM = class extends GlmPreTrainedModel {
27271
27490
  };
27272
27491
 
27492
+ // src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
27493
+ var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
27494
+ };
27495
+ var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
27496
+ };
27497
+ var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
27498
+ };
27499
+
27500
+ // src/models/qwen2_vl/modeling_qwen2_vl.js
27501
+ var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
27502
+ forward_params = [
27503
+ // Text inputs
27504
+ "input_ids",
27505
+ "attention_mask",
27506
+ "position_ids",
27507
+ "past_key_values",
27508
+ // Vision inputs
27509
+ "pixel_values",
27510
+ "image_grid_thw"
27511
+ ];
27512
+ };
27513
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
  // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
  // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
  // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.

  // Name of the vision-encoder input that receives `image_grid_thw` (see `encode_image`).
  // Subclasses override this when their exported encoder uses a different input name.
  image_grid_thw_name = "grid_thw";

  /**
   * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
   *
   * With an attention mask, positions come from `cumsum_masked_fill(attention_mask)` and the
   * per-row delta is `max(position) + 1 - sequence_length`, i.e. the same formula the
   * multimodal path in `get_rope_index` uses. Without a mask, every row is `0..seq_length-1`
   * and the deltas are zero.
   *
   * @param {Tensor} input_ids Tensor of shape `(batch_size, sequence_length)`; only `dims` is read.
   * @param {Tensor} attention_mask (Optional) Tensor of shape `(batch_size, sequence_length)`.
   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
   */
  _get_text_only_rope_index(input_ids, attention_mask) {
    if (attention_mask) {
      const { data, dims } = cumsum_masked_fill(attention_mask);
      // Replicate the flat (batch, seq) positions three times: temporal, height, width.
      const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
      const seq_length = BigInt(dims[1]);
      // FIX: the sequence length must be subtracted (it was previously added), so that a fully
      // attended prompt yields a delta of 0 and decoding continues at `past_length`.
      const mrope_position_deltas = Array.from(
        { length: dims[0] },
        (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n - seq_length
      );
      return [
        new Tensor2("int64", position_ids, [3, ...dims]),
        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
      ];
    } else {
      const [batch_size, seq_length] = input_ids.dims;
      // FIX: the position of flat element `i` in a (3, batch, seq) layout is `i % seq_length`.
      // The previous expression additionally divided by `batch_size`, which was only correct
      // for a batch of one.
      const position_ids = BigInt64Array.from(
        { length: 3 * batch_size * seq_length },
        (_, i) => BigInt(i % seq_length)
      );
      return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
    }
  }

  /**
   * Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
   * global [all_t, all_h, all_w] order, then write back into the position_ids array
   * respecting attention mask.
   * @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
   * @param {number[]} attn_mask Attention mask for this batch element
   * @param {number[][][]} position_ids_list [3][batch][seq] output array to write into (mutated in place)
   * @param {number} batch_idx Current batch index
   * @returns {number[]} Flat reordered positions of length total_len
   */
  _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
    const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
    const llm_positions = new Array(total_len);
    let index = 0;
    // For each of the three axes, concatenate that axis' slice of every segment.
    for (let x = 0; x < 3; ++x) {
      for (const val of llm_pos_ids_list) {
        const seg_len = val.length / 3;
        for (let z2 = x * seg_len; z2 < (x + 1) * seg_len; ++z2) {
          llm_positions[index++] = val[z2];
        }
      }
    }
    // Scatter the positions into the attended slots only; masked slots keep their prior value.
    let count2 = 0;
    for (let y = 0; y < attn_mask.length; ++y) {
      // Loose equality on purpose: mask entries may be BigInt (e.g. 1n) rather than Number.
      if (attn_mask[y] == 1) {
        for (let x = 0; x < 3; ++x) {
          position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
        }
        ++count2;
      }
    }
    return llm_positions;
  }

  /**
   * Build per-batch position ID segments for multimodal rope.
   * Override this in subclasses to change how vision/text segments are identified and positioned.
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id, video_token_id, vision_start_token_id } = this.config;
    const ids = filtered_ids;
    // Loose equality (`==`) is used for all token comparisons below because the ids may be
    // BigInt while the config values are Numbers.
    const vision_start_indices = ids.reduce((acc, x, idx) => {
      if (x == vision_start_token_id) acc.push(idx);
      return acc;
    }, []);
    // The token right after each vision-start marker tells whether it opens an image or a video.
    const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
    const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
    const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
    const llm_pos_ids_list = [];
    let st2 = 0;
    let remain_images = image_nums;
    let remain_videos = video_nums;
    for (let j = 0; j < vision_tokens.length; ++j) {
      const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
      const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
      // `ids.length + 1` acts as "not present" so the other modality wins the comparison.
      const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
      const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
      let ed;
      let t, h, w;
      if (ed_image < ed_video) {
        [t, h, w] = image_grid_thw_list[state.image_index];
        ++state.image_index;
        --remain_images;
        ed = ed_image;
      } else {
        [t, h, w] = video_grid_thw_list[state.video_index];
        ++state.video_index;
        --remain_videos;
        ed = ed_video;
      }
      const [llm_grid_t, llm_grid_h, llm_grid_w] = [
        Number(t),
        Math.floor(Number(h) / spatial_merge_size),
        Math.floor(Number(w) / spatial_merge_size)
      ];
      // Text preceding this vision segment: identical positions on all three axes.
      const text_len = ed - st2;
      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
      // Vision segment: row-major (t, h, w) grid indices offset past the preceding text.
      const offset = text_len + st_idx;
      const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
      const t_index = Array.from(
        { length: grid_size },
        (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
      );
      const h_index = Array.from(
        { length: grid_size },
        (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
      );
      const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
      llm_pos_ids_list.push([t_index, h_index, w_index].flat());
      st2 = ed + grid_size;
    }
    // Trailing text after the last vision segment (or the whole sequence if there was none).
    if (st2 < ids.length) {
      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
      const text_len = ids.length - st2;
      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
    }
    return llm_pos_ids_list;
  }

  /**
   * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
   *
   * Explanation:
   * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
   *
   * For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs.
   * Examples:
   * input_ids: [T T T T T], here T is for text.
   * temporal position_ids: [0, 1, 2, 3, 4]
   * height position_ids: [0, 1, 2, 3, 4]
   * width position_ids: [0, 1, 2, 3, 4]
   *
   * For a vision and text embedding sequence, we calculate 3D rotary position embedding for the vision part
   * and 1D rotary position embedding for the text part.
   * Examples:
   * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
   * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
   * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
   * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
   * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
   * text temporal position_ids: [3, 4, 5, 6, 7]
   * text height position_ids: [3, 4, 5, 6, 7]
   * text width position_ids: [3, 4, 5, 6, 7]
   * Here we calculate the text start position_ids as the max vision position_ids plus 1.
   *
   * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
   * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
   * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
   * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
   */
  get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
    const { vision_config } = this.config;
    const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
    if (image_grid_thw || video_grid_thw) {
      const total_input_ids = input_ids.tolist();
      if (!attention_mask) {
        attention_mask = ones_like(input_ids);
      }
      const attention_mask_list = attention_mask.tolist();
      // [3][batch][seq] scratch buffer; masked positions keep the initial 0.
      const position_ids_list = Array.from(
        { length: 3 },
        () => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
      );
      const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
      const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
      // Grid lists are shared by the whole batch, so the read cursors persist across rows.
      const state = { image_index: 0, video_index: 0 };
      const mrope_position_deltas = [];
      for (let i = 0; i < total_input_ids.length; ++i) {
        const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
        const llm_pos_ids_list = this._get_multimodal_rope_positions({
          filtered_ids,
          image_grid_thw_list,
          video_grid_thw_list,
          spatial_merge_size,
          state
        });
        const llm_positions = this._reorder_and_write_positions(
          llm_pos_ids_list,
          attention_mask_list[i],
          position_ids_list,
          i
        );
        mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
      }
      return [
        new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
      ];
    } else {
      return this._get_text_only_rope_index(input_ids, attention_mask);
    }
  }

  /**
   * Run the `vision_encoder` session and return its `image_features` output.
   * The grid tensor is passed under the input name stored in `image_grid_thw_name`.
   * @param {object} inputs
   * @param {Tensor} inputs.pixel_values
   * @param {Tensor} inputs.image_grid_thw
   * @returns {Promise<Tensor>} The `image_features` output of the vision encoder session.
   */
  async encode_image({ pixel_values, image_grid_thw }) {
    const features = (await sessionRun(this.sessions["vision_encoder"], {
      pixel_values,
      [this.image_grid_thw_name]: image_grid_thw
    })).image_features;
    return features;
  }

  /**
   * Delegate to `default_merge_input_ids_with_image_features`, supplying this model's
   * `image_token_id` from the config (a value already present in `kwargs` takes precedence).
   * @param {object} kwargs Arguments forwarded to the default merge helper.
   */
  _merge_input_ids_with_image_features(kwargs) {
    return default_merge_input_ids_with_image_features({
      // @ts-ignore
      image_token_id: this.config.image_token_id,
      ...kwargs
    });
  }

  /**
   * Fill in `position_ids` (and `rope_deltas`) on `model_inputs` before a generation step.
   *
   * Nothing is changed when there is no attention mask, when `position_ids` are already
   * provided, or when the decoder session does not accept a `position_ids` input.
   *
   * @param {any} input_ids Unused here; kept for signature compatibility with callers.
   * @param {object} model_inputs Inputs for the next forward pass (mutated in place).
   * @param {object} generation_config Unused here; kept for signature compatibility with callers.
   * @returns {object} The same `model_inputs` object.
   */
  prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
    if (!model_inputs.attention_mask || model_inputs.position_ids) {
      return model_inputs;
    }
    const session = this.sessions["decoder_model_merged"] ?? this.sessions["model"];
    if (!session.inputNames.includes("position_ids")) {
      return model_inputs;
    }
    if (!model_inputs.past_key_values) {
      // First pass: compute positions for the whole prompt.
      [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
        model_inputs.input_ids,
        model_inputs.image_grid_thw,
        model_inputs.video_grid_thw,
        model_inputs.attention_mask
      );
    } else {
      // A cache exists, so pixel values are dropped for this step.
      model_inputs.pixel_values = null;
      const past_length = model_inputs.past_key_values.get_seq_length();
      if (past_length < model_inputs.input_ids.dims[1]) {
        // `input_ids` still holds tokens beyond the cache: compute positions for the full
        // sequence, then keep only the part that is not cached yet.
        const [full_position_ids, rope_deltas] = this.get_rope_index(
          model_inputs.input_ids,
          model_inputs.image_grid_thw,
          model_inputs.video_grid_thw,
          model_inputs.attention_mask
        );
        model_inputs.rope_deltas = rope_deltas;
        model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
        model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
      } else {
        if (!model_inputs.rope_deltas) {
          [, model_inputs.rope_deltas] = this.get_rope_index(
            model_inputs.input_ids,
            model_inputs.image_grid_thw,
            model_inputs.video_grid_thw,
            model_inputs.attention_mask
          );
        }
        // Next position = cached length + per-row delta, repeated for the three rope axes.
        const delta = BigInt(past_length);
        const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
        model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
      }
    }
    return model_inputs;
  }
};
27789
+ var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
27790
+ };
27791
+
27792
+ // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
27793
+ var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
27794
+ image_grid_thw_name = "image_grid_thw";
27795
+ };
27796
+ var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
27797
+ image_grid_thw_name = "image_grid_thw";
27798
+ };
27799
+
27800
+ // src/models/glm_ocr/modeling_glm_ocr.js
27801
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
  /**
   * Compute 3D positional indices for vision tokens.
   * Temporal is constant, height is repeat-interleaved, width tiles.
   * @param {number} start_position Position assigned to the first vision token on every axis.
   * @param {number[]} grid_thw [T, H, W]
   * @param {number} temp_merge_size Divisor applied (with flooring) to T.
   * @param {number} spatial_merge_size Divisor applied (with flooring) to H and W.
   * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
   */
  get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
    const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
    const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
    const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
    const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
    const t_pos = Array.from({ length: seq_len }, () => start_position);
    const h_pos = Array.from(
      { length: seq_len },
      (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
    );
    const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
    return [...t_pos, ...h_pos, ...w_pos];
  }

  /**
   * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
   * instead of vision_start_token_id scanning used by Qwen2VL.
   * After a vision segment, position advances by floor(max(h, w) / spatial_merge_size).
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - unused by this override
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters; `image_index` is advanced per image run
   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id } = this.config;
    // Split the sequence into maximal runs of same-type tokens: 1 = image, 0 = text.
    // Loose equality (`==`) is deliberate: ids may be BigInt while the config id is a Number.
    const groups = [];
    let group_start = 0;
    let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
    for (let j = 1; j <= filtered_ids.length; ++j) {
      // A sentinel type of -1 past the end flushes the final run.
      const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
      if (t !== current_type) {
        groups.push([current_type, group_start, j]);
        group_start = j;
        current_type = t;
      }
    }
    let current_pos = 0;
    const llm_pos_ids_list = [];
    for (const [modality_type, start_idx, end_idx] of groups) {
      if (modality_type === 0) {
        const text_len = end_idx - start_idx;
        llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
        current_pos += text_len;
      } else {
        const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
        // Dividing T by itself collapses the temporal axis to a single step.
        const temp_merge_size = grid_thw[0];
        llm_pos_ids_list.push(
          this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
        );
        // FIX: floor the advance so positions stay integral when the grid is not an exact
        // multiple of the merge size (consistent with the flooring in get_vision_position_ids).
        current_pos += Math.floor(Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size);
      }
    }
    return llm_pos_ids_list;
  }
};
27867
+
27273
27868
  // src/models/glpn/modeling_glpn.js
27274
27869
  var GLPNPreTrainedModel = class extends PreTrainedModel {
27275
27870
  };
@@ -27468,27 +28063,6 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
27468
28063
  var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
27469
28064
  };
27470
28065
 
27471
- // src/models/llava/modeling_llava.js
27472
- var LlavaPreTrainedModel = class extends PreTrainedModel {
27473
- forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
27474
- };
27475
- var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
27476
- _merge_input_ids_with_image_features(kwargs) {
27477
- const vision_hidden_size = kwargs.image_features.dims.at(-1);
27478
- const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
27479
- return default_merge_input_ids_with_image_features({
27480
- // @ts-ignore
27481
- image_token_id: this.config.image_token_index ?? this.config.image_token_id,
27482
- ...kwargs,
27483
- image_features: reshaped_image_hidden_states
27484
- });
27485
- }
27486
- };
27487
- var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
27488
- };
27489
- var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
27490
- };
27491
-
27492
28066
  // src/models/idefics3/modeling_idefics3.js
27493
28067
  var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
27494
28068
  forward_params = [
@@ -27582,6 +28156,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
27582
28156
  var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
27583
28157
  };
27584
28158
 
28159
+ // src/models/lighton_ocr/modeling_lighton_ocr.js
28160
+ var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
28161
+ };
28162
+
27585
28163
  // src/models/lfm2_moe/modeling_lfm2_moe.js
27586
28164
  var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
27587
28165
  };
@@ -27778,6 +28356,14 @@ var MistralModel = class extends MistralPreTrainedModel {
27778
28356
  var MistralForCausalLM = class extends MistralPreTrainedModel {
27779
28357
  };
27780
28358
 
28359
+ // src/models/mistral4/modeling_mistral4.js
28360
+ var Mistral4PreTrainedModel = class extends PreTrainedModel {
28361
+ };
28362
+ var Mistral4Model = class extends Mistral4PreTrainedModel {
28363
+ };
28364
+ var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
28365
+ };
28366
+
27781
28367
  // src/models/mobilebert/modeling_mobilebert.js
27782
28368
  var MobileBertPreTrainedModel = class extends PreTrainedModel {
27783
28369
  };
@@ -28246,6 +28832,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
28246
28832
  var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
28247
28833
  };
28248
28834
 
28835
+ // src/models/nemotron_h/modeling_nemotron_h.js
28836
+ var NemotronHPreTrainedModel = class extends PreTrainedModel {
28837
+ };
28838
+ var NemotronHModel = class extends NemotronHPreTrainedModel {
28839
+ };
28840
+ var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
28841
+ };
28842
+
28249
28843
  // src/models/neobert/modeling_neobert.js
28250
28844
  var NeoBertPreTrainedModel = class extends PreTrainedModel {
28251
28845
  };
@@ -28526,252 +29120,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
28526
29120
  var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
28527
29121
  };
28528
29122
 
28529
- // src/models/qwen2_vl/modeling_qwen2_vl.js
28530
- var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
28531
- forward_params = [
28532
- // Text inputs
28533
- "input_ids",
28534
- "attention_mask",
28535
- "position_ids",
28536
- "past_key_values",
28537
- // Vision inputs
28538
- "pixel_values",
28539
- "image_grid_thw"
28540
- ];
28541
- };
28542
- var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
28543
- // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
28544
- // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
28545
- // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
28546
- image_grid_thw_name = "grid_thw";
28547
- /**
28548
- * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
28549
- *
28550
- * Explanation:
28551
- * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
28552
- *
28553
- * For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs.
28554
- * Examples:
28555
- * input_ids: [T T T T T], here T is for text.
28556
- * temporal position_ids: [0, 1, 2, 3, 4]
28557
- * height position_ids: [0, 1, 2, 3, 4]
28558
- * width position_ids: [0, 1, 2, 3, 4]
28559
- *
28560
- * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
28561
- * and 1D rotary position embeddin for text part.
28562
- * Examples:
28563
- * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
28564
- * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
28565
- * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
28566
- * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
28567
- * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
28568
- * text temporal position_ids: [3, 4, 5, 6, 7]
28569
- * text height position_ids: [3, 4, 5, 6, 7]
28570
- * text width position_ids: [3, 4, 5, 6, 7]
28571
- * Here we calculate the text start position_ids as the max vision position_ids plus 1.
28572
- *
28573
- * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
28574
- * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
28575
- * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
28576
- * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
28577
- * - 1 for tokens that are **not masked**,
28578
- * - 0 for tokens that are **masked**.
28579
- * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
28580
- * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
28581
- * - mrope_position_deltas: Tensor of shape `(batch_size)`.
28582
- */
28583
- get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
28584
- const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
28585
- const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
28586
- const mrope_position_deltas = [];
28587
- if (image_grid_thw || video_grid_thw) {
28588
- let total_input_ids = input_ids.tolist();
28589
- if (!attention_mask) {
28590
- attention_mask = ones_like(input_ids);
28591
- }
28592
- const attention_mask_list = attention_mask.tolist();
28593
- const position_ids_list = Array.from(
28594
- { length: 3 },
28595
- (_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
28596
- );
28597
- const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
28598
- const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
28599
- let image_index = 0;
28600
- let video_index = 0;
28601
- for (let i = 0; i < total_input_ids.length; ++i) {
28602
- const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
28603
- const vision_start_indices = ids.reduce((acc, x, idx) => {
28604
- if (x == vision_start_token_id) acc.push(idx);
28605
- return acc;
28606
- }, []);
28607
- const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
28608
- const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
28609
- const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
28610
- let llm_pos_ids_list = [];
28611
- let st2 = 0;
28612
- let remain_images = image_nums;
28613
- let remain_videos = video_nums;
28614
- for (let j = 0; j < vision_tokens.length; ++j) {
28615
- const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
28616
- const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
28617
- const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
28618
- const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
28619
- let ed;
28620
- let t, h, w;
28621
- if (ed_image < ed_video) {
28622
- [t, h, w] = image_grid_thw_list[image_index];
28623
- ++image_index;
28624
- --remain_images;
28625
- ed = ed_image;
28626
- } else {
28627
- [t, h, w] = video_grid_thw_list[video_index];
28628
- ++video_index;
28629
- --remain_videos;
28630
- ed = ed_video;
28631
- }
28632
- const [llm_grid_t, llm_grid_h, llm_grid_w] = [
28633
- Number(t),
28634
- Math.floor(Number(h) / spatial_merge_size),
28635
- Math.floor(Number(w) / spatial_merge_size)
28636
- ];
28637
- const text_len = ed - st2;
28638
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
28639
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
28640
- const offset = text_len + st_idx;
28641
- const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
28642
- const t_index = Array.from(
28643
- { length: grid_size },
28644
- (_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
28645
- );
28646
- const h_index = Array.from(
28647
- { length: grid_size },
28648
- (_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
28649
- );
28650
- const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
28651
- llm_pos_ids_list.push([t_index, h_index, w_index].flat());
28652
- st2 = ed + grid_size;
28653
- }
28654
- if (st2 < ids.length) {
28655
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
28656
- const text_len = ids.length - st2;
28657
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
28658
- }
28659
- const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
28660
- const llm_positions = new Array(num_items);
28661
- let index = 0;
28662
- for (let x = 0; x < 3; ++x) {
28663
- for (let y = 0; y < llm_pos_ids_list.length; ++y) {
28664
- const val = llm_pos_ids_list[y];
28665
- const text_len = val.length / 3;
28666
- for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
28667
- llm_positions[index++] = val[z];
28668
- }
28669
- }
28670
- }
28671
- let count2 = 0;
28672
- const attn_mask = attention_mask_list[i];
28673
- for (let y = 0; y < attn_mask.length; ++y) {
28674
- if (attn_mask[y] == 1) {
28675
- for (let x = 0; x < 3; ++x) {
28676
- position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
28677
- }
28678
- ++count2;
28679
- }
28680
- }
28681
- const max_llm_positions = max(llm_positions)[0];
28682
- mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
28683
- }
28684
- return [
28685
- new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
28686
- new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
28687
- ];
28688
- } else {
28689
- if (attention_mask) {
28690
- const { data, dims } = cumsum_masked_fill(attention_mask);
28691
- const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
28692
- const mrope_position_deltas2 = Array.from(
28693
- { length: dims[0] },
28694
- (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
28695
- );
28696
- return [
28697
- new Tensor2("int64", position_ids, [3, ...dims]),
28698
- new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
28699
- ];
28700
- } else {
28701
- const [batch_size, seq_length] = input_ids.dims;
28702
- const position_ids = BigInt64Array.from(
28703
- { length: 3 * batch_size * seq_length },
28704
- (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
28705
- );
28706
- return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
28707
- }
28708
- }
28709
- }
28710
- async encode_image({ pixel_values, image_grid_thw }) {
28711
- const features = (await sessionRun(this.sessions["vision_encoder"], {
28712
- pixel_values,
28713
- [this.image_grid_thw_name]: image_grid_thw
28714
- })).image_features;
28715
- return features;
28716
- }
28717
- _merge_input_ids_with_image_features(kwargs) {
28718
- return default_merge_input_ids_with_image_features({
28719
- // @ts-ignore
28720
- image_token_id: this.config.image_token_id,
28721
- ...kwargs
28722
- });
28723
- }
28724
- prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
28725
- if (model_inputs.attention_mask && !model_inputs.position_ids) {
28726
- if (!model_inputs.past_key_values) {
28727
- [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
28728
- model_inputs.input_ids,
28729
- model_inputs.image_grid_thw,
28730
- model_inputs.video_grid_thw,
28731
- model_inputs.attention_mask
28732
- );
28733
- } else {
28734
- model_inputs.pixel_values = null;
28735
- const past_length = model_inputs.past_key_values.get_seq_length();
28736
- if (past_length < model_inputs.input_ids.dims[1]) {
28737
- const [full_position_ids, rope_deltas] = this.get_rope_index(
28738
- model_inputs.input_ids,
28739
- model_inputs.image_grid_thw,
28740
- model_inputs.video_grid_thw,
28741
- model_inputs.attention_mask
28742
- );
28743
- model_inputs.rope_deltas = rope_deltas;
28744
- model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
28745
- model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
28746
- } else {
28747
- if (!model_inputs.rope_deltas) {
28748
- [, model_inputs.rope_deltas] = this.get_rope_index(
28749
- model_inputs.input_ids,
28750
- model_inputs.image_grid_thw,
28751
- model_inputs.video_grid_thw,
28752
- model_inputs.attention_mask
28753
- );
28754
- }
28755
- const delta = BigInt(past_length);
28756
- const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
28757
- model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
28758
- }
28759
- }
28760
- }
28761
- return model_inputs;
28762
- }
28763
- };
28764
- var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
28765
- };
28766
-
28767
- // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
28768
- var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
28769
- image_grid_thw_name = "image_grid_thw";
28770
- };
28771
- var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
28772
- image_grid_thw_name = "image_grid_thw";
28773
- };
28774
-
28775
29123
  // src/models/qwen3/modeling_qwen3.js
28776
29124
  var Qwen3PreTrainedModel = class extends PreTrainedModel {
28777
29125
  };
@@ -29217,6 +29565,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
29217
29565
  }
29218
29566
  };
29219
29567
 
29568
+ // src/models/solar_open/modeling_solar_open.js
29569
+ var SolarOpenPreTrainedModel = class extends PreTrainedModel {
29570
+ };
29571
+ var SolarOpenModel = class extends SolarOpenPreTrainedModel {
29572
+ };
29573
+ var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
29574
+ };
29575
+
29220
29576
  // src/models/speecht5/modeling_speecht5.js
29221
29577
  var SpeechT5PreTrainedModel = class extends PreTrainedModel {
29222
29578
  };
@@ -30333,6 +30689,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
30333
30689
  // src/models/registry.js
30334
30690
  var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
30335
30691
  ["bert", "BertModel"],
30692
+ ["eurobert", "EuroBertModel"],
30336
30693
  ["neobert", "NeoBertModel"],
30337
30694
  ["modernbert", "ModernBertModel"],
30338
30695
  ["nomic_bert", "NomicBertModel"],
@@ -30464,6 +30821,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
30464
30821
  ["gemma3_text", "Gemma3Model"],
30465
30822
  ["helium", "HeliumModel"],
30466
30823
  ["glm", "GlmModel"],
30824
+ ["glm_moe_dsa", "GlmMoeDsaModel"],
30467
30825
  ["openelm", "OpenELMModel"],
30468
30826
  ["qwen2", "Qwen2Model"],
30469
30827
  ["qwen2_moe", "Qwen2MoeModel"],
@@ -30475,12 +30833,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
30475
30833
  ["mpt", "MptModel"],
30476
30834
  ["opt", "OPTModel"],
30477
30835
  ["mistral", "MistralModel"],
30836
+ ["mistral4", "Mistral4Model"],
30478
30837
  ["ministral", "MinistralModel"],
30479
30838
  ["ministral3", "Ministral3Model"],
30480
30839
  ["ernie4_5", "Ernie4_5ForCausalLM"],
30481
30840
  ["starcoder2", "Starcoder2Model"],
30841
+ ["deepseek_v3", "DeepseekV3Model"],
30482
30842
  ["falcon", "FalconModel"],
30483
30843
  ["falcon_h1", "FalconH1Model"],
30844
+ ["nemotron_h", "NemotronHModel"],
30845
+ ["solar_open", "SolarOpenModel"],
30484
30846
  ["stablelm", "StableLmModel"],
30485
30847
  ["modernbert-decoder", "ModernBertDecoderModel"],
30486
30848
  ["hunyuan_v1_dense", "HunYuanDenseV1Model"],
@@ -30500,6 +30862,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30500
30862
  ]);
30501
30863
  var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30502
30864
  ["bert", "BertForSequenceClassification"],
30865
+ ["eurobert", "EuroBertForSequenceClassification"],
30503
30866
  ["neobert", "NeoBertForSequenceClassification"],
30504
30867
  ["modernbert", "ModernBertForSequenceClassification"],
30505
30868
  ["roformer", "RoFormerForSequenceClassification"],
@@ -30522,6 +30885,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30522
30885
  ]);
30523
30886
  var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30524
30887
  ["bert", "BertForTokenClassification"],
30888
+ ["eurobert", "EuroBertForTokenClassification"],
30525
30889
  ["neobert", "NeoBertForTokenClassification"],
30526
30890
  ["modernbert", "ModernBertForTokenClassification"],
30527
30891
  ["roformer", "RoFormerForTokenClassification"],
@@ -30584,6 +30948,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30584
30948
  ["gemma3", "Gemma3ForCausalLM"],
30585
30949
  ["helium", "HeliumForCausalLM"],
30586
30950
  ["glm", "GlmForCausalLM"],
30951
+ ["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
30587
30952
  ["openelm", "OpenELMForCausalLM"],
30588
30953
  ["qwen2", "Qwen2ForCausalLM"],
30589
30954
  ["qwen2_moe", "Qwen2MoeForCausalLM"],
@@ -30595,6 +30960,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30595
30960
  ["qwen3_vl", "Qwen3VLForCausalLM"],
30596
30961
  ["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
30597
30962
  ["qwen3_5", "Qwen3_5ForCausalLM"],
30963
+ ["qwen3_5_text", "Qwen3_5ForCausalLM"],
30598
30964
  ["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
30599
30965
  ["gemma3n", "Gemma3nForCausalLM"],
30600
30966
  ["phi", "PhiForCausalLM"],
@@ -30603,13 +30969,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30603
30969
  ["opt", "OPTForCausalLM"],
30604
30970
  ["mbart", "MBartForCausalLM"],
30605
30971
  ["mistral", "MistralForCausalLM"],
30972
+ ["mistral4", "Mistral4ForCausalLM"],
30606
30973
  ["ministral", "MinistralForCausalLM"],
30607
30974
  ["ministral3", "Ministral3ForCausalLM"],
30608
30975
  ["ernie4_5", "Ernie4_5ForCausalLM"],
30609
30976
  ["starcoder2", "Starcoder2ForCausalLM"],
30977
+ ["deepseek_v3", "DeepseekV3ForCausalLM"],
30610
30978
  ["falcon", "FalconForCausalLM"],
30611
30979
  ["falcon_h1", "FalconH1ForCausalLM"],
30980
+ ["nemotron_h", "NemotronHForCausalLM"],
30612
30981
  ["trocr", "TrOCRForCausalLM"],
30982
+ ["solar_open", "SolarOpenForCausalLM"],
30613
30983
  ["stablelm", "StableLmForCausalLM"],
30614
30984
  ["modernbert-decoder", "ModernBertDecoderForCausalLM"],
30615
30985
  ["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
@@ -30620,6 +30990,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30620
30990
  var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
30621
30991
  var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30622
30992
  ["bert", "BertForMaskedLM"],
30993
+ ["eurobert", "EuroBertForMaskedLM"],
30623
30994
  ["neobert", "NeoBertForMaskedLM"],
30624
30995
  ["modernbert", "ModernBertForMaskedLM"],
30625
30996
  ["roformer", "RoFormerForMaskedLM"],
@@ -30677,8 +31048,11 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
30677
31048
  ["smolvlm", "SmolVLMForConditionalGeneration"],
30678
31049
  ["paligemma", "PaliGemmaForConditionalGeneration"],
30679
31050
  ["llava_qwen2", "LlavaQwen2ForCausalLM"],
31051
+ ["gemma3", "Gemma3ForConditionalGeneration"],
30680
31052
  ["gemma3n", "Gemma3nForConditionalGeneration"],
30681
- ["mistral3", "Mistral3ForConditionalGeneration"]
31053
+ ["mistral3", "Mistral3ForConditionalGeneration"],
31054
+ ["lighton_ocr", "LightOnOcrForConditionalGeneration"],
31055
+ ["glm_ocr", "GlmOcrForConditionalGeneration"]
30682
31056
  ]);
30683
31057
  var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
30684
31058
  ["granite_speech", "GraniteSpeechForConditionalGeneration"],
@@ -30783,6 +31157,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30783
31157
  ]);
30784
31158
  var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
30785
31159
  var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
31160
+ ["chmv2", "CHMv2ForDepthEstimation"],
30786
31161
  ["dpt", "DPTForDepthEstimation"],
30787
31162
  ["depth_anything", "DepthAnythingForDepthEstimation"],
30788
31163
  ["glpn", "GLPNForDepthEstimation"],
@@ -30868,13 +31243,6 @@ var CUSTOM_MAPPING = [
30868
31243
  ],
30869
31244
  ["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
30870
31245
  ["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
30871
- ["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30872
- ["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30873
- ["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30874
- ["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30875
- ["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30876
- ["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30877
- ["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30878
31246
  [
30879
31247
  "VoxtralRealtimeForConditionalGeneration",
30880
31248
  VoxtralRealtimeForConditionalGeneration,
@@ -32556,6 +32924,41 @@ var TASK_ALIASES = Object.freeze({
32556
32924
  embeddings: "feature-extraction"
32557
32925
  });
32558
32926
 
32927
+ // src/utils/model_registry/resolve_model_type.js
32928
+ function resolve_model_type(config, { warn = true } = {}) {
32929
+ const architectures = (
32930
+ /** @type {string[]} */
32931
+ config.architectures || []
32932
+ );
32933
+ for (const arch of architectures) {
32934
+ const mappedType = MODEL_TYPE_MAPPING.get(arch);
32935
+ if (mappedType !== void 0) {
32936
+ return mappedType;
32937
+ }
32938
+ }
32939
+ if (config.model_type) {
32940
+ const mappedType = MODEL_TYPE_MAPPING.get(config.model_type);
32941
+ if (mappedType !== void 0) {
32942
+ return mappedType;
32943
+ }
32944
+ for (const mapping of Object.values(MODEL_MAPPING_NAMES)) {
32945
+ if (mapping.has(config.model_type)) {
32946
+ const resolved = MODEL_TYPE_MAPPING.get(mapping.get(config.model_type));
32947
+ if (resolved !== void 0) {
32948
+ return resolved;
32949
+ }
32950
+ }
32951
+ }
32952
+ }
32953
+ if (warn) {
32954
+ const archList = architectures.length > 0 ? architectures.join(", ") : "(none)";
32955
+ logger.warn(
32956
+ `[resolve_model_type] Architecture(s) not found in MODEL_TYPE_MAPPING: [${archList}] for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`
32957
+ );
32958
+ }
32959
+ return MODEL_TYPES.EncoderOnly;
32960
+ }
32961
+
32559
32962
  // src/utils/model_registry/get_model_files.js
32560
32963
  function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
32561
32964
  if (config !== null) {
@@ -32578,43 +32981,7 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
32578
32981
  const subfolder = "onnx";
32579
32982
  const rawDevice = overrideDevice ?? custom_config.device;
32580
32983
  let dtype = overrideDtype ?? custom_config.dtype;
32581
- let modelType;
32582
- const architectures = (
32583
- /** @type {string[]} */
32584
- config.architectures || []
32585
- );
32586
- let foundInMapping = false;
32587
- for (const arch of architectures) {
32588
- const mappedType = MODEL_TYPE_MAPPING.get(arch);
32589
- if (mappedType !== void 0) {
32590
- modelType = mappedType;
32591
- foundInMapping = true;
32592
- break;
32593
- }
32594
- }
32595
- if (!foundInMapping && config.model_type) {
32596
- const mappedType = MODEL_TYPE_MAPPING.get(config.model_type);
32597
- if (mappedType !== void 0) {
32598
- modelType = mappedType;
32599
- foundInMapping = true;
32600
- }
32601
- if (!foundInMapping) {
32602
- for (const mapping of Object.values(MODEL_MAPPING_NAMES)) {
32603
- if (mapping.has(config.model_type)) {
32604
- modelType = MODEL_TYPE_MAPPING.get(mapping.get(config.model_type));
32605
- foundInMapping = true;
32606
- break;
32607
- }
32608
- }
32609
- }
32610
- }
32611
- if (!foundInMapping) {
32612
- const archList = architectures.length > 0 ? architectures.join(", ") : "(none)";
32613
- logger.warn(
32614
- `[get_model_files] Architecture(s) not found in MODEL_TYPE_MAPPING: [${archList}] for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`
32615
- );
32616
- modelType = MODEL_TYPES.EncoderOnly;
32617
- }
32984
+ const modelType = resolve_model_type(config);
32618
32985
  const add_model_file = (fileName, baseName = null) => {
32619
32986
  baseName = baseName ?? fileName;
32620
32987
  const selectedDevice = selectDevice(rawDevice, fileName);
@@ -33201,6 +33568,31 @@ async function clear_pipeline_cache(task, modelId, options = {}) {
33201
33568
  return await clear_files_from_cache(modelId, files, options);
33202
33569
  }
33203
33570
 
33571
+ // src/utils/model_registry/get_available_dtypes.js
33572
+ var CONCRETE_DTYPES = Object.keys(DEFAULT_DTYPE_SUFFIX_MAPPING);
33573
+ async function get_available_dtypes(modelId, { config = null, model_file_name = null, revision = "main", cache_dir = null, local_files_only = false } = {}) {
33574
+ config = await get_config(modelId, { config, cache_dir, local_files_only, revision });
33575
+ const subfolder = "onnx";
33576
+ const modelType = resolve_model_type(config);
33577
+ const { sessions } = getSessionsConfig(modelType, config, { model_file_name });
33578
+ const baseNames = Object.values(sessions);
33579
+ const metadataOptions = { revision, cache_dir, local_files_only };
33580
+ const probeResults = await Promise.all(
33581
+ CONCRETE_DTYPES.map(async (dtype) => {
33582
+ const suffix = DEFAULT_DTYPE_SUFFIX_MAPPING[dtype] ?? "";
33583
+ const allExist = await Promise.all(
33584
+ baseNames.map(async (baseName) => {
33585
+ const filename = `${subfolder}/${baseName}${suffix}.onnx`;
33586
+ const metadata = await get_file_metadata(modelId, filename, metadataOptions);
33587
+ return metadata.exists;
33588
+ })
33589
+ );
33590
+ return { dtype, available: allExist.every(Boolean) };
33591
+ })
33592
+ );
33593
+ return probeResults.filter((r) => r.available).map((r) => r.dtype);
33594
+ }
33595
+
33204
33596
  // src/utils/model_registry/ModelRegistry.js
33205
33597
  var ModelRegistry = class {
33206
33598
  /**
@@ -33287,6 +33679,29 @@ var ModelRegistry = class {
33287
33679
  static async get_processor_files(modelId) {
33288
33680
  return get_processor_files(modelId);
33289
33681
  }
33682
+ /**
33683
+ * Detects which quantization levels (dtypes) are available for a model
33684
+ * by checking which ONNX files exist on the hub or locally.
33685
+ *
33686
+ * A dtype is considered available if all required model session files
33687
+ * exist for that dtype.
33688
+ *
33689
+ * @param {string} modelId - The model id (e.g., "onnx-community/all-MiniLM-L6-v2-ONNX")
33690
+ * @param {Object} [options] - Optional parameters
33691
+ * @param {import('../../configs.js').PretrainedConfig} [options.config=null] - Pre-loaded config
33692
+ * @param {string} [options.model_file_name=null] - Override the model file name (excluding .onnx suffix)
33693
+ * @param {string} [options.revision='main'] - Model revision
33694
+ * @param {string} [options.cache_dir=null] - Custom cache directory
33695
+ * @param {boolean} [options.local_files_only=false] - Only check local files
33696
+ * @returns {Promise<string[]>} Array of available dtype strings (e.g., ['fp32', 'fp16', 'q4', 'q8'])
33697
+ *
33698
+ * @example
33699
+ * const dtypes = await ModelRegistry.get_available_dtypes('onnx-community/all-MiniLM-L6-v2-ONNX');
33700
+ * console.log(dtypes); // ['fp32', 'fp16', 'int8', 'uint8', 'q8', 'q4']
33701
+ */
33702
+ static async get_available_dtypes(modelId, options = {}) {
33703
+ return get_available_dtypes(modelId, options);
33704
+ }
33290
33705
  /**
33291
33706
  * Quickly checks if a model is fully cached by verifying `config.json` is present,
33292
33707
  * then confirming all required files are cached.
@@ -33522,6 +33937,9 @@ var ModelRegistry = class {
33522
33937
  BloomModel,
33523
33938
  BloomPreTrainedModel,
33524
33939
  BloomTokenizer,
33940
+ CHMv2ForDepthEstimation,
33941
+ CHMv2ImageProcessor,
33942
+ CHMv2PreTrainedModel,
33525
33943
  CLIPFeatureExtractor,
33526
33944
  CLIPImageProcessor,
33527
33945
  CLIPModel,
@@ -33617,6 +34035,9 @@ var ModelRegistry = class {
33617
34035
  DebertaV2Tokenizer,
33618
34036
  DecisionTransformerModel,
33619
34037
  DecisionTransformerPreTrainedModel,
34038
+ DeepseekV3ForCausalLM,
34039
+ DeepseekV3Model,
34040
+ DeepseekV3PreTrainedModel,
33620
34041
  DeiTFeatureExtractor,
33621
34042
  DeiTForImageClassification,
33622
34043
  DeiTImageProcessor,
@@ -33677,6 +34098,11 @@ var ModelRegistry = class {
33677
34098
  EsmModel,
33678
34099
  EsmPreTrainedModel,
33679
34100
  EsmTokenizer,
34101
+ EuroBertForMaskedLM,
34102
+ EuroBertForSequenceClassification,
34103
+ EuroBertForTokenClassification,
34104
+ EuroBertModel,
34105
+ EuroBertPreTrainedModel,
33680
34106
  ExaoneForCausalLM,
33681
34107
  ExaoneModel,
33682
34108
  ExaonePreTrainedModel,
@@ -33723,8 +34149,11 @@ var ModelRegistry = class {
33723
34149
  Gemma2Model,
33724
34150
  Gemma2PreTrainedModel,
33725
34151
  Gemma3ForCausalLM,
34152
+ Gemma3ForConditionalGeneration,
34153
+ Gemma3ImageProcessor,
33726
34154
  Gemma3Model,
33727
34155
  Gemma3PreTrainedModel,
34156
+ Gemma3Processor,
33728
34157
  Gemma3nAudioFeatureExtractor,
33729
34158
  Gemma3nForCausalLM,
33730
34159
  Gemma3nForConditionalGeneration,
@@ -33734,8 +34163,14 @@ var ModelRegistry = class {
33734
34163
  GemmaModel,
33735
34164
  GemmaPreTrainedModel,
33736
34165
  GemmaTokenizer,
34166
+ Glm46VImageProcessor,
34167
+ Glm46VProcessor,
33737
34168
  GlmForCausalLM,
33738
34169
  GlmModel,
34170
+ GlmMoeDsaForCausalLM,
34171
+ GlmMoeDsaModel,
34172
+ GlmMoeDsaPreTrainedModel,
34173
+ GlmOcrForConditionalGeneration,
33739
34174
  GlmPreTrainedModel,
33740
34175
  GptOssForCausalLM,
33741
34176
  GptOssModel,
@@ -33801,6 +34236,7 @@ var ModelRegistry = class {
33801
34236
  Lfm2VlForConditionalGeneration,
33802
34237
  Lfm2VlImageProcessor,
33803
34238
  Lfm2VlProcessor,
34239
+ LightOnOcrForConditionalGeneration,
33804
34240
  LiteWhisperForConditionalGeneration,
33805
34241
  Llama4ForCausalLM,
33806
34242
  Llama4PreTrainedModel,
@@ -33870,6 +34306,9 @@ var ModelRegistry = class {
33870
34306
  MimiPreTrainedModel,
33871
34307
  MinLengthLogitsProcessor,
33872
34308
  MinNewTokensLengthLogitsProcessor,
34309
+ Mistral4ForCausalLM,
34310
+ Mistral4Model,
34311
+ Mistral4PreTrainedModel,
33873
34312
  MistralForCausalLM,
33874
34313
  MistralModel,
33875
34314
  MistralPreTrainedModel,
@@ -33941,6 +34380,9 @@ var ModelRegistry = class {
33941
34380
  NanoChatForCausalLM,
33942
34381
  NanoChatModel,
33943
34382
  NanoChatPreTrainedModel,
34383
+ NemotronHForCausalLM,
34384
+ NemotronHModel,
34385
+ NemotronHPreTrainedModel,
33944
34386
  NeoBertForMaskedLM,
33945
34387
  NeoBertForQuestionAnswering,
33946
34388
  NeoBertForSequenceClassification,
@@ -34130,6 +34572,9 @@ var ModelRegistry = class {
34130
34572
  SnacFeatureExtractor,
34131
34573
  SnacModel,
34132
34574
  SnacPreTrainedModel,
34575
+ SolarOpenForCausalLM,
34576
+ SolarOpenModel,
34577
+ SolarOpenPreTrainedModel,
34133
34578
  SpeechT5FeatureExtractor,
34134
34579
  SpeechT5ForSpeechToText,
34135
34580
  SpeechT5ForTextToSpeech,
@@ -34327,7 +34772,7 @@ var ModelRegistry = class {
34327
34772
 
34328
34773
  onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
34329
34774
  (*!
34330
- * ONNX Runtime Web v1.25.0-dev.20260307-d626b568e0
34775
+ * ONNX Runtime Web v1.25.0-dev.20260323-a99aad9d36
34331
34776
  * Copyright (c) Microsoft Corporation. All rights reserved.
34332
34777
  * Licensed under the MIT License.
34333
34778
  *)