@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102) hide show
  1. package/README.md +13 -2
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +26 -26
  3. package/dist/transformers.js +1002 -587
  4. package/dist/transformers.min.js +23 -19
  5. package/dist/transformers.node.cjs +1030 -585
  6. package/dist/transformers.node.min.cjs +21 -17
  7. package/dist/transformers.node.min.mjs +21 -17
  8. package/dist/transformers.node.mjs +1000 -585
  9. package/dist/transformers.web.js +887 -472
  10. package/dist/transformers.web.min.js +21 -17
  11. package/package.json +3 -3
  12. package/src/configs.js +28 -22
  13. package/src/env.js +1 -1
  14. package/src/image_processors_utils.js +25 -15
  15. package/src/models/chmv2/image_processing_chmv2.js +3 -0
  16. package/src/models/chmv2/modeling_chmv2.js +4 -0
  17. package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
  18. package/src/models/eurobert/modeling_eurobert.js +41 -0
  19. package/src/models/gemma3/image_processing_gemma3.js +3 -0
  20. package/src/models/gemma3/modeling_gemma3.js +4 -1
  21. package/src/models/gemma3/processing_gemma3.js +45 -0
  22. package/src/models/glm46v/image_processing_glm46v.js +12 -0
  23. package/src/models/glm46v/processing_glm46v.js +5 -0
  24. package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
  25. package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
  26. package/src/models/image_processors.js +3 -0
  27. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
  28. package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
  29. package/src/models/mistral4/modeling_mistral4.js +5 -0
  30. package/src/models/modeling_utils.js +48 -25
  31. package/src/models/models.js +10 -1
  32. package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
  33. package/src/models/processors.js +2 -0
  34. package/src/models/qwen2_vl/modeling_qwen2_vl.js +226 -168
  35. package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
  36. package/src/models/registry.js +19 -8
  37. package/src/models/solar_open/modeling_solar_open.js +5 -0
  38. package/src/pipelines.js +1 -0
  39. package/src/utils/hub.js +4 -1
  40. package/src/utils/model_registry/ModelRegistry.js +36 -0
  41. package/src/utils/model_registry/get_available_dtypes.js +68 -0
  42. package/src/utils/model_registry/get_file_metadata.js +1 -0
  43. package/src/utils/model_registry/get_model_files.js +7 -60
  44. package/src/utils/model_registry/resolve_model_type.js +66 -0
  45. package/types/configs.d.ts.map +1 -1
  46. package/types/image_processors_utils.d.ts +3 -2
  47. package/types/image_processors_utils.d.ts.map +1 -1
  48. package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
  49. package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
  50. package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
  51. package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
  52. package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
  53. package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
  54. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
  55. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
  56. package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
  57. package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
  58. package/types/models/gemma3/image_processing_gemma3.d.ts +4 -0
  59. package/types/models/gemma3/image_processing_gemma3.d.ts.map +1 -0
  60. package/types/models/gemma3/modeling_gemma3.d.ts +4 -1
  61. package/types/models/gemma3/modeling_gemma3.d.ts.map +1 -1
  62. package/types/models/gemma3/processing_gemma3.d.ts +20 -0
  63. package/types/models/gemma3/processing_gemma3.d.ts.map +1 -0
  64. package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
  65. package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
  66. package/types/models/glm46v/processing_glm46v.d.ts +4 -0
  67. package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
  68. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
  69. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
  70. package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
  71. package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
  72. package/types/models/image_processors.d.ts +3 -0
  73. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
  74. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
  75. package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
  76. package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
  77. package/types/models/modeling_utils.d.ts +2 -3
  78. package/types/models/modeling_utils.d.ts.map +1 -1
  79. package/types/models/models.d.ts +10 -1
  80. package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
  81. package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
  82. package/types/models/processors.d.ts +2 -0
  83. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
  84. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  85. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
  86. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  87. package/types/models/registry.d.ts.map +1 -1
  88. package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
  89. package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
  90. package/types/pipelines.d.ts +1 -0
  91. package/types/pipelines.d.ts.map +1 -1
  92. package/types/utils/hub.d.ts.map +1 -1
  93. package/types/utils/model_registry/ModelRegistry.d.ts +27 -0
  94. package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -1
  95. package/types/utils/model_registry/get_available_dtypes.d.ts +26 -0
  96. package/types/utils/model_registry/get_available_dtypes.d.ts.map +1 -0
  97. package/types/utils/model_registry/get_model_files.d.ts +25 -0
  98. package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
  99. package/types/utils/model_registry/resolve_model_type.d.ts +24 -0
  100. package/types/utils/model_registry/resolve_model_type.d.ts.map +1 -0
  101. package/types/models/ast/modeling_ast.d.ts.map +0 -1
  102. /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
@@ -14,7 +14,7 @@ var __export = (target, all) => {
14
14
  import fs from "fs";
15
15
  import path from "path";
16
16
  import url from "url";
17
- var VERSION = "4.0.0-next.7";
17
+ var VERSION = "4.0.0-next.9";
18
18
  var HAS_SELF = typeof self !== "undefined";
19
19
  var IS_FS_AVAILABLE = !isEmpty(fs);
20
20
  var IS_PATH_AVAILABLE = !isEmpty(path);
@@ -244,7 +244,7 @@ var logger = {
244
244
  }
245
245
  };
246
246
 
247
- // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.2/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
247
+ // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
248
248
  var DictionarySplitter = class {
249
249
  /**
250
250
  * @param dictionary The dictionary of words to use for splitting.
@@ -1900,10 +1900,10 @@ var BPE = class extends TokenizerModel_default {
1900
1900
  );
1901
1901
  if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
1902
1902
  output_tokens.push(...byte_tokens);
1903
- } else {
1903
+ } else if (this.unk_token != null) {
1904
1904
  output_tokens.push(this.unk_token);
1905
1905
  }
1906
- } else {
1906
+ } else if (this.unk_token != null) {
1907
1907
  output_tokens.push(this.unk_token);
1908
1908
  }
1909
1909
  }
@@ -5753,14 +5753,14 @@ var Random = class {
5753
5753
  * @returns {number} A normally distributed random value.
5754
5754
  */
5755
5755
  gauss(mu = 0, sigma = 1) {
5756
- let z = this._gauss_next;
5756
+ let z2 = this._gauss_next;
5757
5757
  this._gauss_next = null;
5758
- if (z === null) {
5758
+ if (z2 === null) {
5759
5759
  const x2pi = this.random() * 2 * Math.PI, g2rad = Math.sqrt(-2 * Math.log(1 - this.random()));
5760
- z = Math.cos(x2pi) * g2rad;
5760
+ z2 = Math.cos(x2pi) * g2rad;
5761
5761
  this._gauss_next = Math.sin(x2pi) * g2rad;
5762
5762
  }
5763
- return mu + z * sigma;
5763
+ return mu + z2 * sigma;
5764
5764
  }
5765
5765
  /**
5766
5766
  * Shuffles an array in-place using the Fisher-Yates algorithm.
@@ -6514,13 +6514,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
6514
6514
  wrapped_progress
6515
6515
  );
6516
6516
  } else if (typeof response !== "string") {
6517
+ const headers = new Headers(response.headers);
6518
+ headers.set("content-length", result.byteLength.toString());
6517
6519
  await cache2.put(
6518
6520
  cacheKey,
6519
6521
  new Response(
6520
6522
  /** @type {any} */
6521
6523
  result,
6522
6524
  {
6523
- headers: response.headers
6525
+ headers
6524
6526
  }
6525
6527
  )
6526
6528
  ).catch((err) => {
@@ -7478,7 +7480,7 @@ var uint16_to_float32 = /* @__PURE__ */ (function() {
7478
7480
  // src/backends/onnx.js
7479
7481
  import * as ONNX_NODE from "onnxruntime-node";
7480
7482
 
7481
- // ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260307-d626b568e0/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
7483
+ // ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260323-a99aad9d36/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
7482
7484
  var ort_webgpu_bundle_min_exports = {};
7483
7485
  __export(ort_webgpu_bundle_min_exports, {
7484
7486
  InferenceSession: () => Jf,
@@ -8246,7 +8248,7 @@ async function ts(a = {}) {
8246
8248
  throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
8247
8249
  }
8248
8250
  function Ye() {
8249
- return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
8251
+ return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, q: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, s: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: lf, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: uf, A: df, r: cf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
8250
8252
  }
8251
8253
  async function bt() {
8252
8254
  function e(o, u) {
@@ -8309,14 +8311,14 @@ async function ts(a = {}) {
8309
8311
  gt.push(t), Je[e.Nc] = t, t.Nc = e.Nc;
8310
8312
  var n = { Oc: "run", he: e.ge, Wc: e.Wc, Nc: e.Nc };
8311
8313
  return t.postMessage(n, e.Yc), 0;
8312
- }, z = 0, V = (e, t, ...n) => {
8314
+ }, G = 0, V = (e, t, ...n) => {
8313
8315
  var o, u = 16 * n.length, c = P(), h = Ft(u), b = h >>> 3;
8314
8316
  for (o of n) typeof o == "bigint" ? ((p(), pe)[b++ >>> 0] = 1n, (p(), pe)[b++ >>> 0] = o) : ((p(), pe)[b++ >>> 0] = 0n, (p(), ae)[b++ >>> 0] = o);
8315
8317
  return e = Lo(e, 0, u, h, t), D(c), e;
8316
8318
  };
8317
8319
  function qe(e) {
8318
8320
  if (i) return V(0, 1, e);
8319
- if (S = e, !(0 < z)) {
8321
+ if (S = e, !(0 < G)) {
8320
8322
  for (var t of gt) Se(t);
8321
8323
  for (t of We) Se(t);
8322
8324
  We = [], gt = [], Je = {}, W = true;
@@ -8361,7 +8363,7 @@ async function ts(a = {}) {
8361
8363
  We.push(e);
8362
8364
  }
8363
8365
  var Fe, zs = (e, t) => {
8364
- z = 0, e = zr(e, t), 0 < z ? S = e : Fr(e);
8366
+ G = 0, e = zr(e, t), 0 < G ? S = e : Fr(e);
8365
8367
  }, Ct = [], Ut = 0, me = (e) => -9007199254740992 > e || 9007199254740992 < e ? NaN : Number(e);
8366
8368
  function Vs(e) {
8367
8369
  var t = new wr(e >>>= 0);
@@ -8713,7 +8715,7 @@ async function ts(a = {}) {
8713
8715
  }
8714
8716
  var he = (e) => {
8715
8717
  if (!W) try {
8716
- if (e(), !(0 < z)) try {
8718
+ if (e(), !(0 < G)) try {
8717
8719
  i ? Wt() && Fr(S) : br(S);
8718
8720
  } catch (t) {
8719
8721
  t instanceof wt || t == "unwind" || y(0, t);
@@ -8741,7 +8743,7 @@ async function ts(a = {}) {
8741
8743
  return (t ? Vr[t] : of[e])(...Ir);
8742
8744
  }
8743
8745
  var Ei = () => {
8744
- z = 0;
8746
+ G = 0;
8745
8747
  };
8746
8748
  function Si(e) {
8747
8749
  e >>>= 0, i ? postMessage({ Oc: "cleanupThread", ie: e }) : yn(Je[e]);
@@ -8761,7 +8763,7 @@ async function ts(a = {}) {
8761
8763
  try {
8762
8764
  return e(...n);
8763
8765
  } finally {
8764
- W || (_t.pop(), Me && Ge === 1 && _t.length === 0 && (Ge = 0, z += 1, Pt(wa), typeof Fibers < "u" && Fibers.De()));
8766
+ W || (_t.pop(), Me && Ge === 1 && _t.length === 0 && (Ge = 0, G += 1, Pt(wa), typeof Fibers < "u" && Fibers.De()));
8765
8767
  }
8766
8768
  };
8767
8769
  return jn.set(e, t), t;
@@ -8776,7 +8778,7 @@ async function ts(a = {}) {
8776
8778
  try {
8777
8779
  var c = (function() {
8778
8780
  var E = (p(), x)[Me + 8 >>> 2 >>> 0];
8779
- return E = Vn.get(E), E = jn.get(E), --z, E();
8781
+ return E = Vn.get(E), E = jn.get(E), --G, E();
8780
8782
  })();
8781
8783
  } catch (E) {
8782
8784
  c = E, u = true;
@@ -8967,7 +8969,7 @@ async function ts(a = {}) {
8967
8969
  return L(ct(e >>> 0, t >>> 0));
8968
8970
  }
8969
8971
  var ou = () => {
8970
- throw z += 1, "unwind";
8972
+ throw G += 1, "unwind";
8971
8973
  };
8972
8974
  function au() {
8973
8975
  return 4294901760;
@@ -9060,15 +9062,15 @@ async function ts(a = {}) {
9060
9062
  }
9061
9063
  (b = (p(), A)[c + 24 >>> 2 >>> 0]) && (b = { label: Ne(b + 4) }, e.defaultQueue = b), e.label = Ne(c + 4);
9062
9064
  }
9063
- z += 1, lt(t, h.requestDevice(e).then((B) => {
9064
- --z, he(() => {
9065
- ce[u >>> 0] = B.queue, ce[o >>> 0] = B, lt(n, B.lost.then((ue) => {
9065
+ G += 1, lt(t, h.requestDevice(e).then((B) => {
9066
+ --G, he(() => {
9067
+ ce[u >>> 0] = B.queue, ce[o >>> 0] = B, G += 1, lt(n, B.lost.then((ue) => {
9066
9068
  he(() => {
9067
9069
  B.onuncapturederror = () => {
9068
9070
  };
9069
9071
  var ye = P(), fe = Ce(ue.message);
9070
9072
  _r(n, yu[ue.reason], fe), D(ye);
9071
- });
9073
+ }), --G;
9072
9074
  })), B.onuncapturederror = (ue) => {
9073
9075
  var ye = 5;
9074
9076
  ue.error instanceof GPUValidationError ? ye = 2 : ue.error instanceof GPUOutOfMemoryError ? ye = 3 : ue.error instanceof GPUInternalError && (ye = 4);
@@ -9077,7 +9079,7 @@ async function ts(a = {}) {
9077
9079
  }, "adapterInfo" in B || (B.adapterInfo = h.info), kr(t, 1, o, 0);
9078
9080
  });
9079
9081
  }, (B) => {
9080
- --z, he(() => {
9082
+ --G, he(() => {
9081
9083
  var ue = P(), ye = Ce(B.message);
9082
9084
  kr(t, 3, o, ye), n && _r(n, 4, ye), D(ue);
9083
9085
  });
@@ -9120,12 +9122,12 @@ async function ts(a = {}) {
9120
9122
  function vu(e, t, n, o, u) {
9121
9123
  e >>>= 0, t = me(t), n = me(n), u >>>= 0;
9122
9124
  var c = O(e);
9123
- Re[e] = [], u == 4294967295 && (u = void 0), z += 1, lt(t, c.mapAsync(n, o >>> 0, u).then(() => {
9124
- --z, he(() => {
9125
+ Re[e] = [], u == 4294967295 && (u = void 0), G += 1, lt(t, c.mapAsync(n, o >>> 0, u).then(() => {
9126
+ --G, he(() => {
9125
9127
  Rr(t, 1, 0);
9126
9128
  });
9127
9129
  }, (h) => {
9128
- --z, he(() => {
9130
+ --G, he(() => {
9129
9131
  P();
9130
9132
  var b = Ce(h.message);
9131
9133
  Rr(t, h.name === "AbortError" ? 4 : h.name === "OperationError" ? 3 : 0, b), delete Re[e];
@@ -9154,12 +9156,12 @@ async function ts(a = {}) {
9154
9156
  return ce[n >>> 0] = u, o && (Re[n] = []), true;
9155
9157
  }
9156
9158
  function Iu(e, t, n, o) {
9157
- e >>>= 0, t = me(t), o >>>= 0, n = du(n >>> 0), e = O(e), z += 1, lt(t, e.createComputePipelineAsync(n).then((u) => {
9158
- --z, he(() => {
9159
+ e >>>= 0, t = me(t), o >>>= 0, n = du(n >>> 0), e = O(e), G += 1, lt(t, e.createComputePipelineAsync(n).then((u) => {
9160
+ --G, he(() => {
9159
9161
  ce[o >>> 0] = u, Pr(t, 1, o, 0);
9160
9162
  });
9161
9163
  }, (u) => {
9162
- --z, he(() => {
9164
+ --G, he(() => {
9163
9165
  var c = P(), h = Ce(u.message);
9164
9166
  Pr(t, u.reason === "validation" ? 3 : u.reason === "internal" ? 4 : 0, o, h), D(c);
9165
9167
  });
@@ -9174,15 +9176,15 @@ async function ts(a = {}) {
9174
9176
  (e = O(e)).onuncapturederror = null, e.destroy();
9175
9177
  };
9176
9178
  function Ou(e, t) {
9177
- t = me(t), e = O(e >>> 0), z += 1, lt(t, e.popErrorScope().then((n) => {
9178
- --z, he(() => {
9179
+ t = me(t), e = O(e >>> 0), G += 1, lt(t, e.popErrorScope().then((n) => {
9180
+ --G, he(() => {
9179
9181
  var o = 5;
9180
9182
  n ? n instanceof GPUValidationError ? o = 2 : n instanceof GPUOutOfMemoryError ? o = 3 : n instanceof GPUInternalError && (o = 4) : o = 1;
9181
9183
  var u = P(), c = n ? Ce(n.message) : 0;
9182
9184
  Nr(t, 1, o, c), D(u);
9183
9185
  });
9184
9186
  }, (n) => {
9185
- --z, he(() => {
9187
+ --G, he(() => {
9186
9188
  var o = P(), u = Ce(n.message);
9187
9189
  Nr(t, 1, 5, u), D(o);
9188
9190
  });
@@ -9193,8 +9195,8 @@ async function ts(a = {}) {
9193
9195
  var u = { featureLevel: pu[(p(), x)[n + 4 >>> 2 >>> 0]], powerPreference: mu[(p(), x)[n + 8 >>> 2 >>> 0]], forceFallbackAdapter: !!(p(), A)[n + 12 >>> 2 >>> 0] };
9194
9196
  (e = (p(), A)[n >>> 2 >>> 0]) !== 0 && (p(), u.Fe = !!(p(), A)[e + 8 >>> 2 >>> 0]);
9195
9197
  }
9196
- "gpu" in navigator ? (z += 1, lt(t, navigator.gpu.requestAdapter(u).then((c) => {
9197
- --z, he(() => {
9198
+ "gpu" in navigator ? (G += 1, lt(t, navigator.gpu.requestAdapter(u).then((c) => {
9199
+ --G, he(() => {
9198
9200
  if (c) ce[o >>> 0] = c, Et(t, 1, o, 0);
9199
9201
  else {
9200
9202
  var h = P(), b = Ce("WebGPU not available on this browser (requestAdapter returned null)");
@@ -9202,7 +9204,7 @@ async function ts(a = {}) {
9202
9204
  }
9203
9205
  });
9204
9206
  }, (c) => {
9205
- --z, he(() => {
9207
+ --G, he(() => {
9206
9208
  var h = P(), b = Ce(c.message);
9207
9209
  Et(t, 4, o, b), D(h);
9208
9210
  });
@@ -9433,7 +9435,7 @@ async function ts(a = {}) {
9433
9435
  Te(`invalid type for getValue: ${t}`);
9434
9436
  }
9435
9437
  }, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
9436
- var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 925676: (e, t, n, o, u) => {
9438
+ var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 937012: (e, t, n, o, u) => {
9437
9439
  if (r === void 0 || !r.Uc) return 1;
9438
9440
  if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
9439
9441
  if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
@@ -9453,11 +9455,11 @@ async function ts(a = {}) {
9453
9455
  } catch {
9454
9456
  return 4;
9455
9457
  }
9456
- }, 926500: (e, t, n) => {
9458
+ }, 937836: (e, t, n) => {
9457
9459
  r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
9458
- }, 926564: () => r.me(), 926606: (e) => {
9460
+ }, 937900: () => r.me(), 937942: (e) => {
9459
9461
  r.jd(e);
9460
- }, 926643: () => typeof wasmOffsetConverter < "u" };
9462
+ }, 937979: () => typeof wasmOffsetConverter < "u" };
9461
9463
  function af(e, t, n, o) {
9462
9464
  var u = P();
9463
9465
  try {
@@ -9476,12 +9478,12 @@ async function ts(a = {}) {
9476
9478
  N(1, 0);
9477
9479
  }
9478
9480
  }
9479
- function uf(e, t, n) {
9480
- var o = P();
9481
+ function uf(e) {
9482
+ var t = P();
9481
9483
  try {
9482
- _o(e, t, n);
9483
- } catch (u) {
9484
- if (D(o), u !== u + 0) throw u;
9484
+ Ro(e);
9485
+ } catch (n) {
9486
+ if (D(t), n !== n + 0) throw n;
9485
9487
  N(1, 0);
9486
9488
  }
9487
9489
  }
@@ -9494,25 +9496,16 @@ async function ts(a = {}) {
9494
9496
  N(1, 0);
9495
9497
  }
9496
9498
  }
9497
- function cf(e) {
9498
- var t = P();
9499
- try {
9500
- Ro(e);
9501
- } catch (n) {
9502
- if (D(t), n !== n + 0) throw n;
9503
- N(1, 0);
9504
- }
9505
- }
9506
- function df(e, t, n, o, u, c, h) {
9507
- var b = P();
9499
+ function cf(e, t, n) {
9500
+ var o = P();
9508
9501
  try {
9509
- return Wo(e, t, n, o, u, c, h);
9510
- } catch (E) {
9511
- if (D(b), E !== E + 0) throw E;
9502
+ _o(e, t, n);
9503
+ } catch (u) {
9504
+ if (D(o), u !== u + 0) throw u;
9512
9505
  N(1, 0);
9513
9506
  }
9514
9507
  }
9515
- function lf(e, t) {
9508
+ function df(e, t) {
9516
9509
  var n = P();
9517
9510
  try {
9518
9511
  Vo(e, t);
@@ -9521,6 +9514,15 @@ async function ts(a = {}) {
9521
9514
  N(1, 0);
9522
9515
  }
9523
9516
  }
9517
+ function lf(e, t, n, o, u, c, h) {
9518
+ var b = P();
9519
+ try {
9520
+ return Wo(e, t, n, o, u, c, h);
9521
+ } catch (E) {
9522
+ if (D(b), E !== E + 0) throw E;
9523
+ N(1, 0);
9524
+ }
9525
+ }
9524
9526
  function pf(e, t, n, o, u, c) {
9525
9527
  var h = P();
9526
9528
  try {
@@ -9950,7 +9952,7 @@ var nc;
9950
9952
  var oc;
9951
9953
  var ac;
9952
9954
  var qt;
9953
- var $;
9955
+ var z;
9954
9956
  var je = k(() => {
9955
9957
  "use strict";
9956
9958
  Yt();
@@ -10006,19 +10008,19 @@ var je = k(() => {
10006
10008
  rr = false, ds = true, H(M);
10007
10009
  });
10008
10010
  })), await Promise.race(C), S) throw new Error(`WebAssembly backend initializing failed due to timeout: ${r}ms`);
10009
- }, $ = () => {
10011
+ }, z = () => {
10010
10012
  if (nn && rn) return rn;
10011
10013
  throw new Error("WebAssembly is not initialized yet.");
10012
10014
  };
10013
10015
  });
10014
10016
  var be;
10015
10017
  var Lt;
10016
- var G;
10018
+ var $;
10017
10019
  var nr = k(() => {
10018
10020
  "use strict";
10019
10021
  je();
10020
10022
  be = (a, r) => {
10021
- let s = $(), f = s.lengthBytesUTF8(a) + 1, i = s._malloc(f);
10023
+ let s = z(), f = s.lengthBytesUTF8(a) + 1, i = s._malloc(f);
10022
10024
  return s.stringToUTF8(a, i, f), r.push(i), i;
10023
10025
  }, Lt = (a, r, s, f) => {
10024
10026
  if (typeof a == "object" && a !== null) {
@@ -10032,8 +10034,8 @@ var nr = k(() => {
10032
10034
  else if (typeof d == "boolean") f(l, d ? "1" : "0");
10033
10035
  else throw new Error(`Can't handle extra config type: ${typeof d}`);
10034
10036
  });
10035
- }, G = (a) => {
10036
- let r = $(), s = r.stackSave();
10037
+ }, $ = (a) => {
10038
+ let r = z(), s = r.stackSave();
10037
10039
  try {
10038
10040
  let f = r.PTR_SIZE, i = r.stackAlloc(2 * f);
10039
10041
  r._OrtGetLastError(i, i + f);
@@ -10050,7 +10052,7 @@ var ps = k(() => {
10050
10052
  je();
10051
10053
  nr();
10052
10054
  ls = (a) => {
10053
- let r = $(), s = 0, f = [], i = a || {};
10055
+ let r = z(), s = 0, f = [], i = a || {};
10054
10056
  try {
10055
10057
  if (a?.logSeverityLevel === void 0) i.logSeverityLevel = 2;
10056
10058
  else if (typeof a.logSeverityLevel != "number" || !Number.isInteger(a.logSeverityLevel) || a.logSeverityLevel < 0 || a.logSeverityLevel > 4) throw new Error(`log severity level is not valid: ${a.logSeverityLevel}`);
@@ -10058,9 +10060,9 @@ var ps = k(() => {
10058
10060
  else if (typeof a.logVerbosityLevel != "number" || !Number.isInteger(a.logVerbosityLevel)) throw new Error(`log verbosity level is not valid: ${a.logVerbosityLevel}`);
10059
10061
  a?.terminate === void 0 && (i.terminate = false);
10060
10062
  let d = 0;
10061
- return a?.tag !== void 0 && (d = be(a.tag, f)), s = r._OrtCreateRunOptions(i.logSeverityLevel, i.logVerbosityLevel, !!i.terminate, d), s === 0 && G("Can't create run options."), a?.extra !== void 0 && Lt(a.extra, "", /* @__PURE__ */ new WeakSet(), (l, m) => {
10063
+ return a?.tag !== void 0 && (d = be(a.tag, f)), s = r._OrtCreateRunOptions(i.logSeverityLevel, i.logVerbosityLevel, !!i.terminate, d), s === 0 && $("Can't create run options."), a?.extra !== void 0 && Lt(a.extra, "", /* @__PURE__ */ new WeakSet(), (l, m) => {
10062
10064
  let y = be(l, f), w = be(m, f);
10063
- r._OrtAddRunConfigEntry(s, y, w) !== 0 && G(`Can't set a run config entry: ${l} - ${m}.`);
10065
+ r._OrtAddRunConfigEntry(s, y, w) !== 0 && $(`Can't set a run config entry: ${l} - ${m}.`);
10064
10066
  }), [s, f];
10065
10067
  } catch (d) {
10066
10068
  throw s !== 0 && r._OrtReleaseRunOptions(s), f.forEach((l) => r._free(l)), d;
@@ -10108,7 +10110,7 @@ var hs = k(() => {
10108
10110
  r.use_ort_model_bytes_directly || (r.use_ort_model_bytes_directly = "1"), a.executionProviders && a.executionProviders.some((s) => (typeof s == "string" ? s : s.name) === "webgpu") && (a.enableMemPattern = false);
10109
10111
  }, on = (a, r, s, f) => {
10110
10112
  let i = be(r, f), d = be(s, f);
10111
- $()._OrtAddSessionConfigEntry(a, i, d) !== 0 && G(`Can't set a session config entry: ${r} - ${s}.`);
10113
+ z()._OrtAddSessionConfigEntry(a, i, d) !== 0 && $(`Can't set a session config entry: ${r} - ${s}.`);
10112
10114
  }, ot = (a, r, s, f) => {
10113
10115
  let i = be(r, f), d = be(s, f);
10114
10116
  a.push([i, d]);
@@ -10139,7 +10141,7 @@ var hs = k(() => {
10139
10141
  }
10140
10142
  S.validationMode && ot(l, "validationMode", S.validationMode, s);
10141
10143
  }
10142
- let v = $().webgpuRegisterDevice(g);
10144
+ let v = z().webgpuRegisterDevice(g);
10143
10145
  if (v) {
10144
10146
  let [S, C, R] = v;
10145
10147
  ot(l, "deviceId", S.toString(), s), ot(l, "webgpuInstance", C.toString(), s), ot(l, "webgpuDevice", R.toString(), s);
@@ -10154,13 +10156,13 @@ var hs = k(() => {
10154
10156
  }
10155
10157
  let m = be(d, s), y = l.length, w = 0, T = 0;
10156
10158
  if (y > 0) {
10157
- w = $()._malloc(y * $().PTR_SIZE), s.push(w), T = $()._malloc(y * $().PTR_SIZE), s.push(T);
10158
- for (let g = 0; g < y; g++) $().setValue(w + g * $().PTR_SIZE, l[g][0], "*"), $().setValue(T + g * $().PTR_SIZE, l[g][1], "*");
10159
+ w = z()._malloc(y * z().PTR_SIZE), s.push(w), T = z()._malloc(y * z().PTR_SIZE), s.push(T);
10160
+ for (let g = 0; g < y; g++) z().setValue(w + g * z().PTR_SIZE, l[g][0], "*"), z().setValue(T + g * z().PTR_SIZE, l[g][1], "*");
10159
10161
  }
10160
- await $()._OrtAppendExecutionProvider(a, m, w, T, y) !== 0 && G(`Can't append execution provider: ${d}.`);
10162
+ await z()._OrtAppendExecutionProvider(a, m, w, T, y) !== 0 && $(`Can't append execution provider: ${d}.`);
10161
10163
  }
10162
10164
  }, ms = async (a) => {
10163
- let r = $(), s = 0, f = [], i = a || {};
10165
+ let r = z(), s = 0, f = [], i = a || {};
10164
10166
  uc(i);
10165
10167
  try {
10166
10168
  let d = sc(i.graphOptimizationLevel ?? "all"), l = ic(i.executionMode ?? "sequential"), m = typeof i.logId == "string" ? be(i.logId, f) : 0, y = i.logSeverityLevel ?? 2;
@@ -10168,7 +10170,7 @@ var hs = k(() => {
10168
10170
  let w = i.logVerbosityLevel ?? 0;
10169
10171
  if (!Number.isInteger(w) || w < 0 || w > 4) throw new Error(`log verbosity level is not valid: ${w}`);
10170
10172
  let T = typeof i.optimizedModelFilePath == "string" ? be(i.optimizedModelFilePath, f) : 0;
10171
- if (s = r._OrtCreateSessionOptions(d, !!i.enableCpuMemArena, !!i.enableMemPattern, l, !!i.enableProfiling, 0, m, y, w, T), s === 0 && G("Can't create session options."), i.executionProviders && await fc(s, i, f), i.enableGraphCapture !== void 0) {
10173
+ if (s = r._OrtCreateSessionOptions(d, !!i.enableCpuMemArena, !!i.enableMemPattern, l, !!i.enableProfiling, 0, m, y, w, T), s === 0 && $("Can't create session options."), i.executionProviders && await fc(s, i, f), i.enableGraphCapture !== void 0) {
10172
10174
  if (typeof i.enableGraphCapture != "boolean") throw new Error(`enableGraphCapture must be a boolean value: ${i.enableGraphCapture}`);
10173
10175
  on(s, "enableGraphCapture", i.enableGraphCapture.toString(), f);
10174
10176
  }
@@ -10176,13 +10178,13 @@ var hs = k(() => {
10176
10178
  if (typeof g != "string") throw new Error(`free dimension override name must be a string: ${g}`);
10177
10179
  if (typeof v != "number" || !Number.isInteger(v) || v < 0) throw new Error(`free dimension override value must be a non-negative integer: ${v}`);
10178
10180
  let S = be(g, f);
10179
- r._OrtAddFreeDimensionOverride(s, S, v) !== 0 && G(`Can't set a free dimension override: ${g} - ${v}.`);
10181
+ r._OrtAddFreeDimensionOverride(s, S, v) !== 0 && $(`Can't set a free dimension override: ${g} - ${v}.`);
10180
10182
  }
10181
10183
  return i.extra !== void 0 && Lt(i.extra, "", /* @__PURE__ */ new WeakSet(), (g, v) => {
10182
10184
  on(s, g, v, f);
10183
10185
  }), [s, f];
10184
10186
  } catch (d) {
10185
- throw s !== 0 && r._OrtReleaseSessionOptions(s) !== 0 && G("Can't release session options."), f.forEach((l) => r._free(l)), d;
10187
+ throw s !== 0 && r._OrtReleaseSessionOptions(s) !== 0 && $("Can't release session options."), f.forEach((l) => r._free(l)), d;
10186
10188
  }
10187
10189
  };
10188
10190
  });
@@ -10752,7 +10754,7 @@ var Os = k(() => {
10752
10754
  return l ? l.push(d) : this.temporarySessionTensorIds.set(r, [d]), d;
10753
10755
  }
10754
10756
  uploadTensor(r, s) {
10755
- if (!$().shouldTransferToMLTensor) throw new Error("Trying to upload to a MLTensor while shouldTransferToMLTensor is false");
10757
+ if (!z().shouldTransferToMLTensor) throw new Error("Trying to upload to a MLTensor while shouldTransferToMLTensor is false");
10756
10758
  le("verbose", () => `[WebNN] uploadTensor {tensorId: ${r}, data: ${s.byteLength}}`), this.tensorManager.upload(r, s);
10757
10759
  }
10758
10760
  async downloadTensor(r, s) {
@@ -10858,11 +10860,11 @@ var Kr = k(() => {
10858
10860
  nr();
10859
10861
  sn();
10860
10862
  yc = (a, r) => {
10861
- $()._OrtInit(a, r) !== 0 && G("Can't initialize onnxruntime.");
10863
+ z()._OrtInit(a, r) !== 0 && $("Can't initialize onnxruntime.");
10862
10864
  }, Jt = async (a) => {
10863
10865
  yc(a.wasm.numThreads, Ot(a.logLevel));
10864
10866
  }, Xt = async (a, r) => {
10865
- $().asyncInit?.();
10867
+ z().asyncInit?.();
10866
10868
  let s = a.webgpu.adapter;
10867
10869
  if (r === "webgpu") {
10868
10870
  if (typeof navigator > "u" || !navigator.gpu) throw new Error("WebGPU is not supported in current environment");
@@ -10877,29 +10879,29 @@ var Kr = k(() => {
10877
10879
  }
10878
10880
  }
10879
10881
  if (r === "webnn" && (typeof navigator > "u" || !navigator.ml)) throw new Error("WebNN is not supported in current environment");
10880
- if (r === "webgpu" && $().webgpuInit((f) => {
10882
+ if (r === "webgpu" && z().webgpuInit((f) => {
10881
10883
  a.webgpu.device = f;
10882
10884
  }), r === "webnn") {
10883
10885
  let f = new (Os(), $t(Ls)).WebNNBackend(a);
10884
- $().webnnInit([f, () => f.reserveTensorId(), (i) => f.releaseTensorId(i), async (i, d, l, m, y) => f.ensureTensor(i, d, l, m, y), (i, d) => {
10886
+ z().webnnInit([f, () => f.reserveTensorId(), (i) => f.releaseTensorId(i), async (i, d, l, m, y) => f.ensureTensor(i, d, l, m, y), (i, d) => {
10885
10887
  f.uploadTensor(i, d);
10886
10888
  }, async (i, d) => f.downloadTensor(i, d), (i, d) => f.registerMLContext(i, d), !!a.trace]);
10887
10889
  }
10888
10890
  }, it = /* @__PURE__ */ new Map(), bc = (a) => {
10889
- let r = $(), s = r.stackSave();
10891
+ let r = z(), s = r.stackSave();
10890
10892
  try {
10891
10893
  let f = r.PTR_SIZE, i = r.stackAlloc(2 * f);
10892
- r._OrtGetInputOutputCount(a, i, i + f) !== 0 && G("Can't get session input/output count.");
10894
+ r._OrtGetInputOutputCount(a, i, i + f) !== 0 && $("Can't get session input/output count.");
10893
10895
  let l = f === 4 ? "i32" : "i64";
10894
10896
  return [Number(r.getValue(i, l)), Number(r.getValue(i + f, l))];
10895
10897
  } finally {
10896
10898
  r.stackRestore(s);
10897
10899
  }
10898
10900
  }, Bs = (a, r) => {
10899
- let s = $(), f = s.stackSave(), i = 0;
10901
+ let s = z(), f = s.stackSave(), i = 0;
10900
10902
  try {
10901
10903
  let d = s.PTR_SIZE, l = s.stackAlloc(2 * d);
10902
- s._OrtGetInputOutputMetadata(a, r, l, l + d) !== 0 && G("Can't get session input/output metadata.");
10904
+ s._OrtGetInputOutputMetadata(a, r, l, l + d) !== 0 && $("Can't get session input/output metadata.");
10903
10905
  let y = Number(s.getValue(l, "*"));
10904
10906
  i = Number(s.getValue(l + d, "*"));
10905
10907
  let w = s.HEAP32[i / 4];
@@ -10914,11 +10916,11 @@ var Kr = k(() => {
10914
10916
  s.stackRestore(f), i !== 0 && s._OrtFree(i);
10915
10917
  }
10916
10918
  }, xt = (a) => {
10917
- let r = $(), s = r._malloc(a.byteLength);
10919
+ let r = z(), s = r._malloc(a.byteLength);
10918
10920
  if (s === 0) throw new Error(`Can't create a session. failed to allocate a buffer of size ${a.byteLength}.`);
10919
10921
  return r.HEAPU8.set(a, s), [s, a.byteLength];
10920
10922
  }, Qt = async (a, r) => {
10921
- let s, f, i = $();
10923
+ let s, f, i = z();
10922
10924
  Array.isArray(a) ? [s, f] = a : a.buffer === i.HEAPU8.buffer ? [s, f] = [a.byteOffset, a.byteLength] : [s, f] = xt(a);
10923
10925
  let d = 0, l = 0, m = 0, y = [], w = [], T = [];
10924
10926
  try {
@@ -10939,17 +10941,17 @@ var Kr = k(() => {
10939
10941
  } else i.currentContext = await i.webnnCreateMLContext();
10940
10942
  break;
10941
10943
  }
10942
- d = await i._OrtCreateSession(s, f, l), i.webgpuOnCreateSession?.(d), d === 0 && G("Can't create a session."), i.jsepOnCreateSession?.(), i.currentContext && (i.webnnRegisterMLContext(d, i.currentContext), i.currentContext = void 0, i.shouldTransferToMLTensor = true);
10944
+ d = await i._OrtCreateSession(s, f, l), i.webgpuOnCreateSession?.(d), d === 0 && $("Can't create a session."), i.jsepOnCreateSession?.(), i.currentContext && (i.webnnRegisterMLContext(d, i.currentContext), i.currentContext = void 0, i.shouldTransferToMLTensor = true);
10943
10945
  let [g, v] = bc(d), S = !!r?.enableGraphCapture, C = [], R = [], H = [], U = [], M = [];
10944
10946
  for (let L = 0; L < g; L++) {
10945
10947
  let [W, oe, p] = Bs(d, L);
10946
- W === 0 && G("Can't get an input name."), w.push(W);
10948
+ W === 0 && $("Can't get an input name."), w.push(W);
10947
10949
  let ne = i.UTF8ToString(W);
10948
10950
  C.push(ne), H.push(oe === 0 ? { name: ne, isTensor: false } : { name: ne, isTensor: true, type: or(oe), shape: p });
10949
10951
  }
10950
10952
  for (let L = 0; L < v; L++) {
10951
10953
  let [W, oe, p] = Bs(d, L + g);
10952
- W === 0 && G("Can't get an output name."), T.push(W);
10954
+ W === 0 && $("Can't get an output name."), T.push(W);
10953
10955
  let ne = i.UTF8ToString(W);
10954
10956
  R.push(ne), U.push(oe === 0 ? { name: ne, isTensor: false } : { name: ne, isTensor: true, type: or(oe), shape: p });
10955
10957
  {
@@ -10968,23 +10970,23 @@ var Kr = k(() => {
10968
10970
  }
10969
10971
  }
10970
10972
  let Y = null;
10971
- return M.some((L) => L === "gpu-buffer" || L === "ml-tensor" || L === "ml-tensor-cpu-output") && (m = i._OrtCreateBinding(d), m === 0 && G("Can't create IO binding."), Y = { handle: m, outputPreferredLocations: M, outputPreferredLocationsEncoded: M.map((L) => L === "ml-tensor-cpu-output" ? "ml-tensor" : L).map((L) => an(L)) }), it.set(d, [d, w, T, Y, S, false]), [d, C, R, H, U];
10973
+ return M.some((L) => L === "gpu-buffer" || L === "ml-tensor" || L === "ml-tensor-cpu-output") && (m = i._OrtCreateBinding(d), m === 0 && $("Can't create IO binding."), Y = { handle: m, outputPreferredLocations: M, outputPreferredLocationsEncoded: M.map((L) => L === "ml-tensor-cpu-output" ? "ml-tensor" : L).map((L) => an(L)) }), it.set(d, [d, w, T, Y, S, false]), [d, C, R, H, U];
10972
10974
  } catch (g) {
10973
- throw w.forEach((v) => i._OrtFree(v)), T.forEach((v) => i._OrtFree(v)), m !== 0 && i._OrtReleaseBinding(m) !== 0 && G("Can't release IO binding."), d !== 0 && i._OrtReleaseSession(d) !== 0 && G("Can't release session."), g;
10975
+ throw w.forEach((v) => i._OrtFree(v)), T.forEach((v) => i._OrtFree(v)), m !== 0 && i._OrtReleaseBinding(m) !== 0 && $("Can't release IO binding."), d !== 0 && i._OrtReleaseSession(d) !== 0 && $("Can't release session."), g;
10974
10976
  } finally {
10975
- i._free(s), l !== 0 && i._OrtReleaseSessionOptions(l) !== 0 && G("Can't release session options."), y.forEach((g) => i._free(g)), i.unmountExternalData?.();
10977
+ i._free(s), l !== 0 && i._OrtReleaseSessionOptions(l) !== 0 && $("Can't release session options."), y.forEach((g) => i._free(g)), i.unmountExternalData?.();
10976
10978
  }
10977
10979
  }, Zt = (a) => {
10978
- let r = $(), s = it.get(a);
10980
+ let r = z(), s = it.get(a);
10979
10981
  if (!s) throw new Error(`cannot release session. invalid session id: ${a}`);
10980
10982
  let [f, i, d, l, m] = s;
10981
- l && (m && r._OrtClearBoundOutputs(l.handle) !== 0 && G("Can't clear bound outputs."), r._OrtReleaseBinding(l.handle) !== 0 && G("Can't release IO binding.")), r.jsepOnReleaseSession?.(a), r.webnnOnReleaseSession?.(a), r.webgpuOnReleaseSession?.(a), i.forEach((y) => r._OrtFree(y)), d.forEach((y) => r._OrtFree(y)), r._OrtReleaseSession(f) !== 0 && G("Can't release session."), it.delete(a);
10983
+ l && (m && r._OrtClearBoundOutputs(l.handle) !== 0 && $("Can't clear bound outputs."), r._OrtReleaseBinding(l.handle) !== 0 && $("Can't release IO binding.")), r.jsepOnReleaseSession?.(a), r.webnnOnReleaseSession?.(a), r.webgpuOnReleaseSession?.(a), i.forEach((y) => r._OrtFree(y)), d.forEach((y) => r._OrtFree(y)), r._OrtReleaseSession(f) !== 0 && $("Can't release session."), it.delete(a);
10982
10984
  }, Ms = async (a, r, s, f, i, d, l = false) => {
10983
10985
  if (!a) {
10984
10986
  r.push(0);
10985
10987
  return;
10986
10988
  }
10987
- let m = $(), y = m.PTR_SIZE, w = a[0], T = a[1], g = a[3], v = g, S, C;
10989
+ let m = z(), y = m.PTR_SIZE, w = a[0], T = a[1], g = a[3], v = g, S, C;
10988
10990
  if (w === "string" && (g === "gpu-buffer" || g === "ml-tensor")) throw new Error("String tensor is not supported on GPU.");
10989
10991
  if (l && g !== "gpu-buffer") throw new Error(`External buffer must be provided for input/output index ${d} when enableGraphCapture is true.`);
10990
10992
  if (g === "gpu-buffer") {
@@ -11028,12 +11030,12 @@ var Kr = k(() => {
11028
11030
  try {
11029
11031
  T.forEach((M, Y) => m.setValue(H + Y * y, M, y === 4 ? "i32" : "i64"));
11030
11032
  let U = m._OrtCreateTensor(He(w), S, C, H, T.length, an(v));
11031
- U === 0 && G(`Can't create tensor for input/output. session=${f}, index=${d}.`), r.push(U);
11033
+ U === 0 && $(`Can't create tensor for input/output. session=${f}, index=${d}.`), r.push(U);
11032
11034
  } finally {
11033
11035
  m.stackRestore(R);
11034
11036
  }
11035
11037
  }, Kt = async (a, r, s, f, i, d) => {
11036
- let l = $(), m = l.PTR_SIZE, y = it.get(a);
11038
+ let l = z(), m = l.PTR_SIZE, y = it.get(a);
11037
11039
  if (!y) throw new Error(`cannot run inference. invalid session id: ${a}`);
11038
11040
  let w = y[0], T = y[1], g = y[2], v = y[3], S = y[4], C = y[5], R = r.length, H = f.length, U = 0, M = [], Y = [], L = [], W = [], oe = [], p = l.stackSave(), ne = l.stackAlloc(R * m), X = l.stackAlloc(R * m), J = l.stackAlloc(H * m), Ue = l.stackAlloc(H * m);
11039
11041
  try {
@@ -11049,33 +11051,33 @@ var Kr = k(() => {
11049
11051
  $e("wasm bindInputsOutputs");
11050
11052
  for (let q = 0; q < R; q++) {
11051
11053
  let we = r[q];
11052
- await l._OrtBindInput(_, T[we], Y[q]) !== 0 && G(`Can't bind input[${q}] for session=${a}.`);
11054
+ await l._OrtBindInput(_, T[we], Y[q]) !== 0 && $(`Can't bind input[${q}] for session=${a}.`);
11053
11055
  }
11054
11056
  for (let q = 0; q < H; q++) {
11055
11057
  let we = f[q];
11056
- i[q]?.[3] ? (oe.push(L[q]), l._OrtBindOutput(_, g[we], L[q], 0) !== 0 && G(`Can't bind pre-allocated output[${q}] for session=${a}.`)) : l._OrtBindOutput(_, g[we], 0, pe[we]) !== 0 && G(`Can't bind output[${q}] to ${ae[q]} for session=${a}.`);
11058
+ i[q]?.[3] ? (oe.push(L[q]), l._OrtBindOutput(_, g[we], L[q], 0) !== 0 && $(`Can't bind pre-allocated output[${q}] for session=${a}.`)) : l._OrtBindOutput(_, g[we], 0, pe[we]) !== 0 && $(`Can't bind output[${q}] to ${ae[q]} for session=${a}.`);
11057
11059
  }
11058
11060
  ze("wasm bindInputsOutputs"), it.set(a, [w, T, g, v, S, true]);
11059
11061
  }
11060
11062
  l.jsepOnRunStart?.(w), l.webnnOnRunStart?.(w);
11061
11063
  let Q;
11062
- v ? Q = await l._OrtRunWithBinding(w, v.handle, H, J, U) : Q = await l._OrtRun(w, X, ne, R, Ue, H, J, U), Q !== 0 && G("failed to call OrtRun().");
11064
+ v ? Q = await l._OrtRunWithBinding(w, v.handle, H, J, U) : Q = await l._OrtRun(w, X, ne, R, Ue, H, J, U), Q !== 0 && $("failed to call OrtRun().");
11063
11065
  let x = [], A = [];
11064
11066
  $e("wasm ProcessOutputTensor");
11065
11067
  for (let _ = 0; _ < H; _++) {
11066
11068
  let ae = Number(l.getValue(J + _ * m, "*"));
11067
11069
  if (ae === L[_] || oe.includes(L[_])) {
11068
- x.push(i[_]), ae !== L[_] && l._OrtReleaseTensor(ae) !== 0 && G("Can't release tensor.");
11070
+ x.push(i[_]), ae !== L[_] && l._OrtReleaseTensor(ae) !== 0 && $("Can't release tensor.");
11069
11071
  continue;
11070
11072
  }
11071
11073
  let pe = l.stackSave(), q = l.stackAlloc(4 * m), we = false, re, se = 0;
11072
11074
  try {
11073
- l._OrtGetTensorData(ae, q, q + m, q + 2 * m, q + 3 * m) !== 0 && G(`Can't access output tensor data on index ${_}.`);
11075
+ l._OrtGetTensorData(ae, q, q + m, q + 2 * m, q + 3 * m) !== 0 && $(`Can't access output tensor data on index ${_}.`);
11074
11076
  let Te = m === 4 ? "i32" : "i64", Ye = Number(l.getValue(q, Te));
11075
11077
  se = l.getValue(q + m, "*");
11076
11078
  let bt = l.getValue(q + m * 2, "*"), wt = Number(l.getValue(q + m * 3, Te)), Se = [];
11077
11079
  for (let ee = 0; ee < wt; ee++) Se.push(Number(l.getValue(bt + ee * m, Te)));
11078
- l._OrtFree(bt) !== 0 && G("Can't free memory for tensor dims.");
11080
+ l._OrtFree(bt) !== 0 && $("Can't free memory for tensor dims.");
11079
11081
  let Ae = Se.reduce((ee, Z) => ee * Z, 1);
11080
11082
  re = or(Ye);
11081
11083
  let Oe = v?.outputPreferredLocations[f[_]];
@@ -11083,24 +11085,24 @@ var Kr = k(() => {
11083
11085
  if (Oe === "gpu-buffer" || Oe === "ml-tensor") throw new Error("String tensor is not supported on GPU.");
11084
11086
  let ee = [];
11085
11087
  for (let Z = 0; Z < Ae; Z++) {
11086
- let z = l.getValue(se + Z * m, "*"), V = l.getValue(se + (Z + 1) * m, "*"), qe = Z === Ae - 1 ? void 0 : V - z;
11087
- ee.push(l.UTF8ToString(z, qe));
11088
+ let G = l.getValue(se + Z * m, "*"), V = l.getValue(se + (Z + 1) * m, "*"), qe = Z === Ae - 1 ? void 0 : V - G;
11089
+ ee.push(l.UTF8ToString(G, qe));
11088
11090
  }
11089
11091
  x.push([re, Se, ee, "cpu"]);
11090
11092
  } else if (Oe === "gpu-buffer" && Ae > 0) {
11091
11093
  let ee = l.webgpuGetBuffer;
11092
11094
  if (!ee) throw new Error('preferredLocation "gpu-buffer" is not supported without using WebGPU.');
11093
- let Z = ee(se), z = mt(Ye, Ae);
11094
- if (z === void 0 || !ar(re)) throw new Error(`Unsupported data type: ${re}`);
11095
+ let Z = ee(se), G = mt(Ye, Ae);
11096
+ if (G === void 0 || !ar(re)) throw new Error(`Unsupported data type: ${re}`);
11095
11097
  we = true;
11096
11098
  {
11097
11099
  l.webgpuRegisterBuffer(Z, a, se);
11098
- let V = l.webgpuCreateDownloader(Z, z, a);
11100
+ let V = l.webgpuCreateDownloader(Z, G, a);
11099
11101
  x.push([re, Se, { gpuBuffer: Z, download: async () => {
11100
11102
  let qe = await V();
11101
11103
  return new (at(re))(qe);
11102
11104
  }, dispose: () => {
11103
- l._OrtReleaseTensor(ae) !== 0 && G("Can't release tensor.");
11105
+ l._OrtReleaseTensor(ae) !== 0 && $("Can't release tensor.");
11104
11106
  } }, "gpu-buffer"]);
11105
11107
  }
11106
11108
  } else if (Oe === "ml-tensor" && Ae > 0) {
@@ -11115,8 +11117,8 @@ var Kr = k(() => {
11115
11117
  } else if (Oe === "ml-tensor-cpu-output" && Ae > 0) {
11116
11118
  let ee = l.webnnCreateMLTensorDownloader(se, re)(), Z = x.length;
11117
11119
  we = true, A.push((async () => {
11118
- let z = [Z, await ee];
11119
- return l.webnnReleaseTensorId(se), l._OrtReleaseTensor(ae), z;
11120
+ let G = [Z, await ee];
11121
+ return l.webnnReleaseTensorId(se), l._OrtReleaseTensor(ae), G;
11120
11122
  })()), x.push([re, Se, [], "cpu"]);
11121
11123
  } else {
11122
11124
  let ee = at(re), Z = new ee(Ae);
@@ -11126,7 +11128,7 @@ var Kr = k(() => {
11126
11128
  l.stackRestore(pe), re === "string" && se && l._free(se), we || l._OrtReleaseTensor(ae);
11127
11129
  }
11128
11130
  }
11129
- v && !S && (l._OrtClearBoundOutputs(v.handle) !== 0 && G("Can't clear bound outputs."), it.set(a, [w, T, g, v, S, false]));
11131
+ v && !S && (l._OrtClearBoundOutputs(v.handle) !== 0 && $("Can't clear bound outputs."), it.set(a, [w, T, g, v, S, false]));
11130
11132
  for (let [_, ae] of await Promise.all(A)) x[_][2] = ae;
11131
11133
  return ze("wasm ProcessOutputTensor"), x;
11132
11134
  } finally {
@@ -11137,10 +11139,10 @@ var Kr = k(() => {
11137
11139
  }), Y.forEach((Q) => l._OrtReleaseTensor(Q)), L.forEach((Q) => l._OrtReleaseTensor(Q)), W.forEach((Q) => l._free(Q)), U !== 0 && l._OrtReleaseRunOptions(U), M.forEach((Q) => l._free(Q));
11138
11140
  }
11139
11141
  }, er = (a) => {
11140
- let r = $(), s = it.get(a);
11142
+ let r = z(), s = it.get(a);
11141
11143
  if (!s) throw new Error("invalid session id");
11142
11144
  let f = s[0], i = r._OrtEndProfiling(f);
11143
- i === 0 && G("Can't get an profile file name."), r._OrtFree(i);
11145
+ i === 0 && $("Can't get an profile file name."), r._OrtFree(i);
11144
11146
  }, tr = (a) => {
11145
11147
  let r = [];
11146
11148
  for (let s of a) {
@@ -11373,7 +11375,7 @@ var $s = k(() => {
11373
11375
  Ve();
11374
11376
  Ve();
11375
11377
  Ve();
11376
- var Xa = "1.25.0-dev.20260307-d626b568e0";
11378
+ var Xa = "1.25.0-dev.20260323-a99aad9d36";
11377
11379
  var Tl = Zr;
11378
11380
  {
11379
11381
  let a = ($s(), $t(Gs)).wasmBackend;
@@ -15729,7 +15731,9 @@ var processors_exports = {};
15729
15731
  __export(processors_exports, {
15730
15732
  ChatterboxProcessor: () => ChatterboxProcessor,
15731
15733
  Florence2Processor: () => Florence2Processor,
15734
+ Gemma3Processor: () => Gemma3Processor,
15732
15735
  Gemma3nProcessor: () => Gemma3nProcessor,
15736
+ Glm46VProcessor: () => Glm46VProcessor,
15733
15737
  GraniteSpeechProcessor: () => GraniteSpeechProcessor,
15734
15738
  GroundingDinoProcessor: () => GroundingDinoProcessor,
15735
15739
  Idefics3Processor: () => Idefics3Processor,
@@ -18234,26 +18238,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
18234
18238
  }
18235
18239
  return [segmentation, segments];
18236
18240
  }
18237
- function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
18241
+ function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
18238
18242
  if (height < factor || width < factor) {
18239
- throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
18240
- } else if (Math.max(height, width) / Math.min(height, width) > 200) {
18243
+ const scale = Math.max(factor / height, factor / width);
18244
+ height = Math.round(height * scale);
18245
+ width = Math.round(width * scale);
18246
+ }
18247
+ if (Math.max(height, width) / Math.min(height, width) > 200) {
18241
18248
  throw new Error(
18242
18249
  `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
18243
18250
  );
18244
18251
  }
18245
18252
  let h_bar = Math.round(height / factor) * factor;
18246
18253
  let w_bar = Math.round(width / factor) * factor;
18247
- if (h_bar * w_bar > max_pixels) {
18248
- const beta = Math.sqrt(height * width / max_pixels);
18249
- h_bar = Math.floor(height / beta / factor) * factor;
18250
- w_bar = Math.floor(width / beta / factor) * factor;
18251
- } else if (h_bar * w_bar < min_pixels) {
18252
- const beta = Math.sqrt(min_pixels / (height * width));
18254
+ if (temporal_factor * h_bar * w_bar > max_pixels) {
18255
+ const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
18256
+ h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
18257
+ w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
18258
+ } else if (temporal_factor * h_bar * w_bar < min_pixels) {
18259
+ const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
18253
18260
  h_bar = Math.ceil(height * beta / factor) * factor;
18254
18261
  w_bar = Math.ceil(width * beta / factor) * factor;
18255
18262
  }
18256
- return [h_bar, w_bar];
18263
+ return [w_bar, h_bar];
18257
18264
  }
18258
18265
  function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
18259
18266
  if (label_ids_to_fuse === null) {
@@ -18332,7 +18339,7 @@ var ImageProcessor = class extends Callable2 {
18332
18339
  this.do_pad = config.do_pad;
18333
18340
  this.min_pixels = config.min_pixels;
18334
18341
  this.max_pixels = config.max_pixels;
18335
- if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
18342
+ if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
18336
18343
  this.pad_size = this.size;
18337
18344
  }
18338
18345
  this.do_flip_channel_order = config.do_flip_channel_order ?? false;
@@ -18620,10 +18627,8 @@ var ImageProcessor = class extends Callable2 {
18620
18627
  const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
18621
18628
  [pixelData, imgDims] = padded;
18622
18629
  } else if (this.size_divisibility) {
18623
- const [paddedWidth, paddedHeight] = enforce_size_divisibility(
18624
- [imgDims[1], imgDims[0]],
18625
- this.size_divisibility
18626
- );
18630
+ const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
18631
+ const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
18627
18632
  [pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
18628
18633
  }
18629
18634
  }
@@ -18700,6 +18705,7 @@ var image_processors_exports = {};
18700
18705
  __export(image_processors_exports, {
18701
18706
  BeitFeatureExtractor: () => BeitFeatureExtractor,
18702
18707
  BitImageProcessor: () => BitImageProcessor,
18708
+ CHMv2ImageProcessor: () => CHMv2ImageProcessor,
18703
18709
  CLIPFeatureExtractor: () => CLIPFeatureExtractor,
18704
18710
  CLIPImageProcessor: () => CLIPImageProcessor,
18705
18711
  ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
@@ -18716,6 +18722,8 @@ __export(image_processors_exports, {
18716
18722
  DonutImageProcessor: () => DonutImageProcessor,
18717
18723
  EfficientNetImageProcessor: () => EfficientNetImageProcessor,
18718
18724
  GLPNFeatureExtractor: () => GLPNFeatureExtractor,
18725
+ Gemma3ImageProcessor: () => Gemma3ImageProcessor,
18726
+ Glm46VImageProcessor: () => Glm46VImageProcessor,
18719
18727
  GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
18720
18728
  Idefics3ImageProcessor: () => Idefics3ImageProcessor,
18721
18729
  ImageFeatureExtractor: () => ImageProcessor,
@@ -18776,6 +18784,10 @@ var BitImageProcessor = class extends ImageProcessor {
18776
18784
  var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
18777
18785
  };
18778
18786
 
18787
+ // src/models/chmv2/image_processing_chmv2.js
18788
+ var CHMv2ImageProcessor = class extends ImageProcessor {
18789
+ };
18790
+
18779
18791
  // src/models/clip/image_processing_clip.js
18780
18792
  var CLIPImageProcessor = class extends ImageProcessor {
18781
18793
  };
@@ -18895,6 +18907,69 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
18895
18907
  }
18896
18908
  };
18897
18909
 
18910
+ // src/models/gemma3/image_processing_gemma3.js
18911
+ var Gemma3ImageProcessor = class extends ImageProcessor {
18912
+ };
18913
+
18914
+ // src/models/qwen2_vl/image_processing_qwen2_vl.js
18915
+ var Qwen2VLImageProcessor = class extends ImageProcessor {
18916
+ constructor(config) {
18917
+ super(config);
18918
+ this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
18919
+ this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
18920
+ this.patch_size = config.patch_size;
18921
+ this.merge_size = config.merge_size;
18922
+ }
18923
+ /** @type {ImageProcessor['get_resize_output_image_size']} */
18924
+ get_resize_output_image_size(image, size) {
18925
+ const factor = this.patch_size * this.merge_size;
18926
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
18927
+ }
18928
+ async _call(images, ...args) {
18929
+ const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
18930
+ let patches = pixel_values;
18931
+ const { temporal_patch_size, merge_size, patch_size } = this.config;
18932
+ if (patches.dims[0] === 1) {
18933
+ patches = cat(
18934
+ Array.from({ length: temporal_patch_size }, () => patches),
18935
+ 0
18936
+ );
18937
+ }
18938
+ const grid_t = patches.dims[0] / temporal_patch_size;
18939
+ const channel = patches.dims[1];
18940
+ const grid_h = Math.floor(patches.dims[2] / patch_size);
18941
+ const grid_w = Math.floor(patches.dims[3] / patch_size);
18942
+ const flatten_patches = patches.view(
18943
+ grid_t,
18944
+ temporal_patch_size,
18945
+ channel,
18946
+ Math.floor(grid_h / merge_size),
18947
+ merge_size,
18948
+ patch_size,
18949
+ Math.floor(grid_w / merge_size),
18950
+ merge_size,
18951
+ patch_size
18952
+ ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
18953
+ const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
18954
+ return {
18955
+ pixel_values: flatten_patches,
18956
+ image_grid_thw,
18957
+ original_sizes,
18958
+ reshaped_input_sizes
18959
+ };
18960
+ }
18961
+ };
18962
+
18963
+ // src/models/glm46v/image_processing_glm46v.js
18964
+ var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
18965
+ /** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
18966
+ get_resize_output_image_size(image, size) {
18967
+ const factor = this.patch_size * this.merge_size;
18968
+ const temporal_factor = this.config.temporal_patch_size ?? 2;
18969
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
18970
+ }
18971
+ };
18972
+
18898
18973
  // src/models/glpn/image_processing_glpn.js
18899
18974
  var GLPNFeatureExtractor = class extends ImageProcessor {
18900
18975
  };
@@ -19288,7 +19363,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
19288
19363
  const img = pixel_values.unsqueeze_(0);
19289
19364
  const total_factor = this.encoder_patch_size * this.downsample_factor;
19290
19365
  const f2 = total_factor ** 2;
19291
- const [new_height, new_width] = smart_resize(
19366
+ const [new_width, new_height] = smart_resize(
19292
19367
  Math.max(total_factor, height),
19293
19368
  Math.max(total_factor, width),
19294
19369
  total_factor,
@@ -19578,55 +19653,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
19578
19653
  var PvtImageProcessor = class extends ImageProcessor {
19579
19654
  };
19580
19655
 
19581
- // src/models/qwen2_vl/image_processing_qwen2_vl.js
19582
- var Qwen2VLImageProcessor = class extends ImageProcessor {
19583
- constructor(config) {
19584
- super(config);
19585
- this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
19586
- this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
19587
- this.patch_size = config.patch_size;
19588
- this.merge_size = config.merge_size;
19589
- }
19590
- /** @type {ImageProcessor['get_resize_output_image_size']} */
19591
- get_resize_output_image_size(image, size) {
19592
- const factor = this.patch_size * this.merge_size;
19593
- return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
19594
- }
19595
- async _call(images, ...args) {
19596
- const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
19597
- let patches = pixel_values;
19598
- const { temporal_patch_size, merge_size, patch_size } = this.config;
19599
- if (patches.dims[0] === 1) {
19600
- patches = cat(
19601
- Array.from({ length: temporal_patch_size }, () => patches),
19602
- 0
19603
- );
19604
- }
19605
- const grid_t = patches.dims[0] / temporal_patch_size;
19606
- const channel = patches.dims[1];
19607
- const grid_h = Math.floor(patches.dims[2] / patch_size);
19608
- const grid_w = Math.floor(patches.dims[3] / patch_size);
19609
- const flatten_patches = patches.view(
19610
- grid_t,
19611
- temporal_patch_size,
19612
- channel,
19613
- Math.floor(grid_h / merge_size),
19614
- merge_size,
19615
- patch_size,
19616
- Math.floor(grid_w / merge_size),
19617
- merge_size,
19618
- patch_size
19619
- ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
19620
- const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
19621
- return {
19622
- pixel_values: flatten_patches,
19623
- image_grid_thw,
19624
- original_sizes,
19625
- reshaped_input_sizes
19626
- };
19627
- }
19628
- };
19629
-
19630
19656
  // src/models/rt_detr/image_processing_rt_detr.js
19631
19657
  var RTDetrImageProcessor = class extends ImageProcessor {
19632
19658
  /** @type {typeof post_process_object_detection} */
@@ -20108,6 +20134,48 @@ var Florence2Processor = class extends Processor {
20108
20134
  }
20109
20135
  };
20110
20136
 
20137
+ // src/models/gemma3/processing_gemma3.js
20138
+ var Gemma3Processor = class extends Processor {
20139
+ static tokenizer_class = AutoTokenizer;
20140
+ static image_processor_class = AutoImageProcessor;
20141
+ static uses_processor_config = true;
20142
+ static uses_chat_template_file = true;
20143
+ constructor(config, components, chat_template) {
20144
+ super(config, components, chat_template);
20145
+ this.image_seq_length = this.config.image_seq_length;
20146
+ const { boi_token, image_token, eoi_token } = this.tokenizer.config;
20147
+ this.boi_token = boi_token;
20148
+ this.image_token = image_token;
20149
+ this.eoi_token = eoi_token;
20150
+ const image_tokens_expanded = image_token.repeat(this.image_seq_length);
20151
+ this.full_image_sequence = `
20152
+
20153
+ ${boi_token}${image_tokens_expanded}${eoi_token}
20154
+
20155
+ `;
20156
+ }
20157
+ /**
20158
+ * @param {string|string[]} text
20159
+ * @param {import('../../utils/image.js').RawImage|import('../../utils/image.js').RawImage[]} [images]
20160
+ * @param {Object} [options]
20161
+ */
20162
+ async _call(text, images = null, options = {}) {
20163
+ if (typeof text === "string") {
20164
+ text = [text];
20165
+ }
20166
+ let image_inputs;
20167
+ if (images) {
20168
+ image_inputs = await this.image_processor(images, options);
20169
+ text = text.map((prompt) => prompt.replaceAll(this.boi_token, this.full_image_sequence));
20170
+ }
20171
+ const text_inputs = this.tokenizer(text, options);
20172
+ return {
20173
+ ...text_inputs,
20174
+ ...image_inputs
20175
+ };
20176
+ }
20177
+ };
20178
+
20111
20179
  // src/models/gemma3n/processing_gemma3n.js
20112
20180
  var Gemma3nProcessor = class extends Processor {
20113
20181
  static image_processor_class = AutoImageProcessor;
@@ -20180,6 +20248,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
20180
20248
  }
20181
20249
  };
20182
20250
 
20251
+ // src/models/qwen2_vl/processing_qwen2_vl.js
20252
+ var Qwen2VLProcessor = class extends Processor {
20253
+ static image_processor_class = AutoImageProcessor;
20254
+ static tokenizer_class = AutoTokenizer;
20255
+ static image_token = "<|image_pad|>";
20256
+ /**
20257
+ *
20258
+ * @param {string|string[]} text
20259
+ * @param {RawImage|RawImage[]} images
20260
+ * @param {...any} args
20261
+ * @returns {Promise<any>}
20262
+ */
20263
+ async _call(text, images = null, ...args) {
20264
+ if (!Array.isArray(text)) {
20265
+ text = [text];
20266
+ }
20267
+ let image_inputs, image_grid_thw;
20268
+ if (images) {
20269
+ image_inputs = await this.image_processor(images);
20270
+ image_grid_thw = image_inputs.image_grid_thw;
20271
+ }
20272
+ if (image_grid_thw) {
20273
+ let merge_length = this.image_processor.config.merge_size ** 2;
20274
+ let index = 0;
20275
+ const image_token = (
20276
+ /** @type {typeof Qwen2VLProcessor} */
20277
+ this.constructor.image_token
20278
+ );
20279
+ const image_grid_thw_list = image_grid_thw.tolist();
20280
+ text = text.map((t) => {
20281
+ while (t.includes(image_token)) {
20282
+ const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
20283
+ t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
20284
+ }
20285
+ return t.replaceAll("<|placeholder|>", image_token);
20286
+ });
20287
+ }
20288
+ const text_inputs = this.tokenizer(text);
20289
+ return {
20290
+ ...text_inputs,
20291
+ ...image_inputs
20292
+ };
20293
+ }
20294
+ };
20295
+
20296
+ // src/models/glm46v/processing_glm46v.js
20297
+ var Glm46VProcessor = class extends Qwen2VLProcessor {
20298
+ static image_token = "<|image|>";
20299
+ };
20300
+
20183
20301
  // src/models/granite_speech/processing_granite_speech.js
20184
20302
  var GraniteSpeechProcessor = class extends Processor {
20185
20303
  static tokenizer_class = AutoTokenizer;
@@ -20910,47 +21028,6 @@ var PyAnnoteProcessor = class extends Processor {
20910
21028
  }
20911
21029
  };
20912
21030
 
20913
- // src/models/qwen2_vl/processing_qwen2_vl.js
20914
- var Qwen2VLProcessor = class extends Processor {
20915
- static image_processor_class = AutoImageProcessor;
20916
- static tokenizer_class = AutoTokenizer;
20917
- /**
20918
- *
20919
- * @param {string|string[]} text
20920
- * @param {RawImage|RawImage[]} images
20921
- * @param {...any} args
20922
- * @returns {Promise<any>}
20923
- */
20924
- async _call(text, images = null, ...args) {
20925
- if (!Array.isArray(text)) {
20926
- text = [text];
20927
- }
20928
- let image_inputs, image_grid_thw;
20929
- if (images) {
20930
- image_inputs = await this.image_processor(images);
20931
- image_grid_thw = image_inputs.image_grid_thw;
20932
- }
20933
- if (image_grid_thw) {
20934
- let merge_length = this.image_processor.config.merge_size ** 2;
20935
- let index = 0;
20936
- const image_grid_thw_list = image_grid_thw.tolist();
20937
- text = text.map((t) => {
20938
- while (t.includes("<|image_pad|>")) {
20939
- const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
20940
- t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
20941
- }
20942
- return t.replaceAll("<|placeholder|>", "<|image_pad|>");
20943
- });
20944
- }
20945
- const text_inputs = this.tokenizer(text);
20946
- return {
20947
- ...text_inputs,
20948
- ...image_inputs
20949
- // TODO: ...videos_inputs,
20950
- };
20951
- }
20952
- };
20953
-
20954
21031
  // src/models/qwen2_5_vl/processing_qwen2_5_vl.js
20955
21032
  var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
20956
21033
  };
@@ -21294,6 +21371,8 @@ function getNormalizedConfig(config) {
21294
21371
  case "gemma3n":
21295
21372
  case "lfm2_vl":
21296
21373
  case "chatterbox":
21374
+ case "lighton_ocr":
21375
+ case "glm_ocr":
21297
21376
  case "mistral3":
21298
21377
  case "qwen2_5_vl":
21299
21378
  case "qwen3_vl":
@@ -21369,6 +21448,8 @@ function getNormalizedConfig(config) {
21369
21448
  mapping["dim_kv"] = "head_dim";
21370
21449
  break;
21371
21450
  case "qwen3":
21451
+ case "solar_open":
21452
+ case "glm_ocr_text":
21372
21453
  case "gemma":
21373
21454
  case "gemma2":
21374
21455
  case "vaultgemma":
@@ -21379,6 +21460,7 @@ function getNormalizedConfig(config) {
21379
21460
  case "ernie4_5":
21380
21461
  case "hunyuan_v1_dense":
21381
21462
  case "falcon_h1":
21463
+ case "nemotron_h":
21382
21464
  case "ministral":
21383
21465
  case "ministral3":
21384
21466
  mapping["num_heads"] = "num_key_value_heads";
@@ -21413,6 +21495,9 @@ function getNormalizedConfig(config) {
21413
21495
  mapping["num_attention_heads"] = "num_attention_heads";
21414
21496
  break;
21415
21497
  case "youtu":
21498
+ case "deepseek_v3":
21499
+ case "glm_moe_dsa":
21500
+ case "mistral4":
21416
21501
  mapping["num_heads"] = "num_key_value_heads";
21417
21502
  mapping["num_layers"] = "num_hidden_layers";
21418
21503
  mapping["dim_kv"] = "qk_head_dim";
@@ -21501,6 +21586,7 @@ function getCacheShapes(config, options) {
21501
21586
  if (!(config instanceof PretrainedConfig)) {
21502
21587
  config = new PretrainedConfig(config);
21503
21588
  }
21589
+ const batch_size = options?.batch_size ?? 1;
21504
21590
  if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
21505
21591
  const pkv_prefix = options?.prefix ?? "past_key_values";
21506
21592
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
@@ -21510,7 +21596,6 @@ function getCacheShapes(config, options) {
21510
21596
  config
21511
21597
  );
21512
21598
  const head_dim = hidden_size / num_attention_heads;
21513
- const batch_size = options?.batch_size ?? 1;
21514
21599
  for (let i = 0; i < layer_types.length; ++i) {
21515
21600
  if (layer_types[i] === "full_attention") {
21516
21601
  for (const kv of ["key", "value"]) {
@@ -21523,31 +21608,26 @@ function getCacheShapes(config, options) {
21523
21608
  }
21524
21609
  }
21525
21610
  return cache_values;
21526
- } else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
21611
+ } else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
21527
21612
  const pkv_prefix = options?.prefix ?? "past_key_values";
21528
21613
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
21529
- const cache_values = {};
21530
- const {
21531
- layer_types,
21532
- num_hidden_layers,
21533
- num_attention_heads,
21534
- num_key_value_heads,
21535
- hidden_size,
21536
- mamba_d_conv,
21537
- mamba_n_heads,
21538
- mamba_d_head,
21539
- mamba_d_state,
21540
- mamba_n_groups,
21541
- mamba_expand,
21542
- mamba_d_ssm
21543
- } = (
21614
+ const c = (
21544
21615
  /** @type {any} */
21545
21616
  config
21546
21617
  );
21547
- const head_dim = hidden_size / num_attention_heads;
21548
- const batch_size = options?.batch_size ?? 1;
21549
- const conv_d_inner = (mamba_d_ssm ?? mamba_expand * hidden_size) + 2 * mamba_n_groups * mamba_d_state;
21550
- for (let i = 0; i < num_hidden_layers; ++i) {
21618
+ const layer_types = c.layer_types ?? c.layers_block_type;
21619
+ const num_layers = c.num_hidden_layers ?? layer_types?.length;
21620
+ const num_key_value_heads = c.num_key_value_heads;
21621
+ const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
21622
+ const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
21623
+ const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
21624
+ const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
21625
+ const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
21626
+ const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
21627
+ const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
21628
+ const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
21629
+ const cache_values = {};
21630
+ for (let i = 0; i < num_layers; ++i) {
21551
21631
  if (!layer_types || layer_types[i] === "mamba") {
21552
21632
  cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
21553
21633
  cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
@@ -21581,7 +21661,6 @@ function getCacheShapes(config, options) {
21581
21661
  const key_dim = linear_key_head_dim * linear_num_key_heads;
21582
21662
  const value_dim = linear_value_head_dim * linear_num_value_heads;
21583
21663
  const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
21584
- const batch_size = options?.batch_size ?? 1;
21585
21664
  for (let i = 0; i < layer_types.length; ++i) {
21586
21665
  if (layer_types[i] === "full_attention") {
21587
21666
  for (const kv of ["key", "value"]) {
@@ -23277,8 +23356,7 @@ var MODEL_TYPES = {
23277
23356
  ImageAudioTextToText: 13,
23278
23357
  Supertonic: 14,
23279
23358
  Chatterbox: 15,
23280
- MultimodalLanguageModelOnly: 16,
23281
- VoxtralRealtime: 17
23359
+ VoxtralRealtime: 16
23282
23360
  };
23283
23361
  var MODEL_TYPE_CONFIG = {
23284
23362
  [MODEL_TYPES.DecoderOnly]: {
@@ -23335,12 +23413,12 @@ var MODEL_TYPE_CONFIG = {
23335
23413
  can_generate: true,
23336
23414
  forward: image_text_to_text_forward,
23337
23415
  prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
23338
- sessions: (config) => {
23416
+ sessions: (config, options, textOnly) => {
23339
23417
  const s = {
23340
23418
  embed_tokens: "embed_tokens",
23341
- vision_encoder: "vision_encoder",
23342
23419
  decoder_model_merged: "decoder_model_merged"
23343
23420
  };
23421
+ if (!textOnly) s["vision_encoder"] = "vision_encoder";
23344
23422
  if (config.is_encoder_decoder) s["model"] = "encoder_model";
23345
23423
  return s;
23346
23424
  },
@@ -23362,12 +23440,17 @@ var MODEL_TYPE_CONFIG = {
23362
23440
  [MODEL_TYPES.ImageAudioTextToText]: {
23363
23441
  can_generate: true,
23364
23442
  prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
23365
- sessions: () => ({
23366
- embed_tokens: "embed_tokens",
23367
- audio_encoder: "audio_encoder",
23368
- vision_encoder: "vision_encoder",
23369
- decoder_model_merged: "decoder_model_merged"
23370
- }),
23443
+ sessions: (config, options, textOnly) => {
23444
+ const s = {
23445
+ embed_tokens: "embed_tokens",
23446
+ decoder_model_merged: "decoder_model_merged"
23447
+ };
23448
+ if (!textOnly) {
23449
+ s["audio_encoder"] = "audio_encoder";
23450
+ s["vision_encoder"] = "vision_encoder";
23451
+ }
23452
+ return s;
23453
+ },
23371
23454
  optional_configs: { generation_config: "generation_config.json" }
23372
23455
  },
23373
23456
  [MODEL_TYPES.Phi3V]: {
@@ -23418,14 +23501,6 @@ var MODEL_TYPE_CONFIG = {
23418
23501
  cache_sessions: { model: true },
23419
23502
  optional_configs: { generation_config: "generation_config.json" }
23420
23503
  },
23421
- [MODEL_TYPES.MultimodalLanguageModelOnly]: {
23422
- can_generate: true,
23423
- forward: image_text_to_text_forward,
23424
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
23425
- sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
23426
- cache_sessions: { decoder_model_merged: true },
23427
- optional_configs: { generation_config: "generation_config.json" }
23428
- },
23429
23504
  [MODEL_TYPES.VoxtralRealtime]: {
23430
23505
  can_generate: true,
23431
23506
  prepare_inputs: decoder_prepare_inputs_for_generation,
@@ -23451,6 +23526,19 @@ function getSessionsConfig(modelType, config, options = {}) {
23451
23526
  optional_configs: typeConfig.optional_configs
23452
23527
  };
23453
23528
  }
23529
+ function resolveTypeConfig(modelName, config) {
23530
+ let modelType = MODEL_TYPE_MAPPING.get(modelName);
23531
+ let textOnly = false;
23532
+ const nativeArch = config?.architectures?.[0];
23533
+ if (nativeArch && nativeArch !== modelName && modelName?.endsWith("ForCausalLM") && nativeArch.endsWith("ForConditionalGeneration")) {
23534
+ const nativeType = MODEL_TYPE_MAPPING.get(nativeArch);
23535
+ if (nativeType !== void 0) {
23536
+ modelType = nativeType;
23537
+ textOnly = true;
23538
+ }
23539
+ }
23540
+ return { typeConfig: MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default, textOnly, modelType };
23541
+ }
23454
23542
  var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
23455
23543
  var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
23456
23544
  var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
@@ -23470,8 +23558,7 @@ var PreTrainedModel = class extends Callable2 {
23470
23558
  this.sessions = sessions;
23471
23559
  this.configs = configs;
23472
23560
  const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor);
23473
- const modelType = MODEL_TYPE_MAPPING.get(modelName);
23474
- const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
23561
+ const { typeConfig } = resolveTypeConfig(modelName, config);
23475
23562
  this.can_generate = typeConfig.can_generate;
23476
23563
  this._forward = typeConfig.forward;
23477
23564
  this._prepare_inputs_for_generation = typeConfig.prepare_inputs;
@@ -23534,9 +23621,8 @@ var PreTrainedModel = class extends Callable2 {
23534
23621
  session_options
23535
23622
  };
23536
23623
  const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
23537
- const modelType = MODEL_TYPE_MAPPING.get(modelName);
23538
23624
  config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
23539
- const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
23625
+ const { typeConfig, textOnly, modelType } = resolveTypeConfig(modelName, config);
23540
23626
  if (modelType === void 0) {
23541
23627
  const type = modelName ?? config?.model_type;
23542
23628
  if (type !== "custom") {
@@ -23545,7 +23631,7 @@ var PreTrainedModel = class extends Callable2 {
23545
23631
  );
23546
23632
  }
23547
23633
  }
23548
- const sessions = typeConfig.sessions(config, options);
23634
+ const sessions = typeConfig.sessions(config, options, textOnly);
23549
23635
  const promises = [
23550
23636
  constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
23551
23637
  ];
@@ -24209,7 +24295,9 @@ async function generic_text_to_text_forward(self2, {
24209
24295
  "qwen3_5",
24210
24296
  "qwen3_5_text",
24211
24297
  "qwen3_5_moe",
24212
- "qwen3_5_moe_text"
24298
+ "qwen3_5_moe_text",
24299
+ "glm_ocr",
24300
+ "glm_ocr_text"
24213
24301
  ].includes(self2.config.model_type)
24214
24302
  ) {
24215
24303
  const { image_grid_thw, video_grid_thw } = kwargs;
@@ -24433,6 +24521,8 @@ __export(models_exports, {
24433
24521
  BloomForCausalLM: () => BloomForCausalLM,
24434
24522
  BloomModel: () => BloomModel,
24435
24523
  BloomPreTrainedModel: () => BloomPreTrainedModel,
24524
+ CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
24525
+ CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
24436
24526
  CLIPModel: () => CLIPModel,
24437
24527
  CLIPPreTrainedModel: () => CLIPPreTrainedModel,
24438
24528
  CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
@@ -24507,6 +24597,9 @@ __export(models_exports, {
24507
24597
  DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
24508
24598
  DecisionTransformerModel: () => DecisionTransformerModel,
24509
24599
  DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
24600
+ DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
24601
+ DeepseekV3Model: () => DeepseekV3Model,
24602
+ DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
24510
24603
  DeiTForImageClassification: () => DeiTForImageClassification,
24511
24604
  DeiTModel: () => DeiTModel,
24512
24605
  DeiTPreTrainedModel: () => DeiTPreTrainedModel,
@@ -24552,6 +24645,11 @@ __export(models_exports, {
24552
24645
  EsmForTokenClassification: () => EsmForTokenClassification,
24553
24646
  EsmModel: () => EsmModel,
24554
24647
  EsmPreTrainedModel: () => EsmPreTrainedModel,
24648
+ EuroBertForMaskedLM: () => EuroBertForMaskedLM,
24649
+ EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
24650
+ EuroBertForTokenClassification: () => EuroBertForTokenClassification,
24651
+ EuroBertModel: () => EuroBertModel,
24652
+ EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
24555
24653
  ExaoneForCausalLM: () => ExaoneForCausalLM,
24556
24654
  ExaoneModel: () => ExaoneModel,
24557
24655
  ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -24588,6 +24686,7 @@ __export(models_exports, {
24588
24686
  Gemma2Model: () => Gemma2Model,
24589
24687
  Gemma2PreTrainedModel: () => Gemma2PreTrainedModel,
24590
24688
  Gemma3ForCausalLM: () => Gemma3ForCausalLM,
24689
+ Gemma3ForConditionalGeneration: () => Gemma3ForConditionalGeneration,
24591
24690
  Gemma3Model: () => Gemma3Model,
24592
24691
  Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
24593
24692
  Gemma3nForCausalLM: () => Gemma3nForCausalLM,
@@ -24598,6 +24697,10 @@ __export(models_exports, {
24598
24697
  GemmaPreTrainedModel: () => GemmaPreTrainedModel,
24599
24698
  GlmForCausalLM: () => GlmForCausalLM,
24600
24699
  GlmModel: () => GlmModel,
24700
+ GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
24701
+ GlmMoeDsaModel: () => GlmMoeDsaModel,
24702
+ GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
24703
+ GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
24601
24704
  GlmPreTrainedModel: () => GlmPreTrainedModel,
24602
24705
  GptOssForCausalLM: () => GptOssForCausalLM,
24603
24706
  GptOssModel: () => GptOssModel,
@@ -24644,6 +24747,7 @@ __export(models_exports, {
24644
24747
  Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
24645
24748
  Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
24646
24749
  Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
24750
+ LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
24647
24751
  LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
24648
24752
  Llama4ForCausalLM: () => Llama4ForCausalLM,
24649
24753
  Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -24693,6 +24797,9 @@ __export(models_exports, {
24693
24797
  MimiEncoderOutput: () => MimiEncoderOutput,
24694
24798
  MimiModel: () => MimiModel,
24695
24799
  MimiPreTrainedModel: () => MimiPreTrainedModel,
24800
+ Mistral4ForCausalLM: () => Mistral4ForCausalLM,
24801
+ Mistral4Model: () => Mistral4Model,
24802
+ Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
24696
24803
  MistralForCausalLM: () => MistralForCausalLM,
24697
24804
  MistralModel: () => MistralModel,
24698
24805
  MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -24750,6 +24857,9 @@ __export(models_exports, {
24750
24857
  NanoChatForCausalLM: () => NanoChatForCausalLM,
24751
24858
  NanoChatModel: () => NanoChatModel,
24752
24859
  NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
24860
+ NemotronHForCausalLM: () => NemotronHForCausalLM,
24861
+ NemotronHModel: () => NemotronHModel,
24862
+ NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
24753
24863
  NeoBertForMaskedLM: () => NeoBertForMaskedLM,
24754
24864
  NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
24755
24865
  NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -24887,6 +24997,9 @@ __export(models_exports, {
24887
24997
  SnacEncoderModel: () => SnacEncoderModel,
24888
24998
  SnacModel: () => SnacModel,
24889
24999
  SnacPreTrainedModel: () => SnacPreTrainedModel,
25000
+ SolarOpenForCausalLM: () => SolarOpenForCausalLM,
25001
+ SolarOpenModel: () => SolarOpenModel,
25002
+ SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
24890
25003
  SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
24891
25004
  SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
24892
25005
  SpeechT5HifiGan: () => SpeechT5HifiGan,
@@ -25061,7 +25174,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
25061
25174
  var ArceeForCausalLM = class extends ArceePreTrainedModel {
25062
25175
  };
25063
25176
 
25064
- // src/models/ast/modeling_ast.js
25177
+ // src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
25065
25178
  var ASTPreTrainedModel = class extends PreTrainedModel {
25066
25179
  };
25067
25180
  var ASTModel = class extends ASTPreTrainedModel {
@@ -25396,6 +25509,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
25396
25509
  var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
25397
25510
  };
25398
25511
 
25512
+ // src/models/chmv2/modeling_chmv2.js
25513
+ var CHMv2PreTrainedModel = class extends PreTrainedModel {
25514
+ };
25515
+ var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
25516
+ };
25517
+
25399
25518
  // src/models/clap/modeling_clap.js
25400
25519
  var ClapPreTrainedModel = class extends PreTrainedModel {
25401
25520
  };
@@ -25734,6 +25853,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
25734
25853
  }
25735
25854
  };
25736
25855
 
25856
+ // src/models/deepseek_v3/modeling_deepseek_v3.js
25857
+ var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
25858
+ };
25859
+ var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
25860
+ };
25861
+ var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
25862
+ };
25863
+
25737
25864
  // src/models/deberta_v2/modeling_deberta_v2.js
25738
25865
  var DebertaV2PreTrainedModel = class extends PreTrainedModel {
25739
25866
  };
@@ -26082,6 +26209,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
26082
26209
  }
26083
26210
  };
26084
26211
 
26212
+ // src/models/eurobert/modeling_eurobert.js
26213
+ var EuroBertPreTrainedModel = class extends PreTrainedModel {
26214
+ };
26215
+ var EuroBertModel = class extends EuroBertPreTrainedModel {
26216
+ };
26217
+ var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
26218
+ /**
26219
+ * Calls the model on new inputs.
26220
+ *
26221
+ * @param {Object} model_inputs The inputs to the model.
26222
+ * @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
26223
+ */
26224
+ async _call(model_inputs) {
26225
+ return new MaskedLMOutput(await super._call(model_inputs));
26226
+ }
26227
+ };
26228
+ var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
26229
+ /**
26230
+ * Calls the model on new inputs.
26231
+ *
26232
+ * @param {Object} model_inputs The inputs to the model.
26233
+ * @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
26234
+ */
26235
+ async _call(model_inputs) {
26236
+ return new SequenceClassifierOutput(await super._call(model_inputs));
26237
+ }
26238
+ };
26239
+ var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
26240
+ /**
26241
+ * Calls the model on new inputs.
26242
+ *
26243
+ * @param {Object} model_inputs The inputs to the model.
26244
+ * @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
26245
+ */
26246
+ async _call(model_inputs) {
26247
+ return new TokenClassifierOutput(await super._call(model_inputs));
26248
+ }
26249
+ };
26250
+
26085
26251
  // src/models/exaone/modeling_exaone.js
26086
26252
  var ExaonePreTrainedModel = class extends PreTrainedModel {
26087
26253
  };
@@ -26239,12 +26405,35 @@ var Gemma2Model = class extends Gemma2PreTrainedModel {
26239
26405
  var Gemma2ForCausalLM = class extends Gemma2PreTrainedModel {
26240
26406
  };
26241
26407
 
26408
+ // src/models/llava/modeling_llava.js
26409
+ var LlavaPreTrainedModel = class extends PreTrainedModel {
26410
+ forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
26411
+ };
26412
+ var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
26413
+ _merge_input_ids_with_image_features(kwargs) {
26414
+ const vision_hidden_size = kwargs.image_features.dims.at(-1);
26415
+ const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
26416
+ return default_merge_input_ids_with_image_features({
26417
+ // @ts-ignore
26418
+ image_token_id: this.config.image_token_index ?? this.config.image_token_id,
26419
+ ...kwargs,
26420
+ image_features: reshaped_image_hidden_states
26421
+ });
26422
+ }
26423
+ };
26424
+ var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
26425
+ };
26426
+ var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
26427
+ };
26428
+
26242
26429
  // src/models/gemma3/modeling_gemma3.js
26243
26430
  var Gemma3PreTrainedModel = class extends PreTrainedModel {
26244
26431
  };
26245
26432
  var Gemma3Model = class extends Gemma3PreTrainedModel {
26246
26433
  };
26247
- var Gemma3ForCausalLM = class extends Gemma3PreTrainedModel {
26434
+ var Gemma3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
26435
+ };
26436
+ var Gemma3ForCausalLM = class extends Gemma3ForConditionalGeneration {
26248
26437
  };
26249
26438
 
26250
26439
  // src/models/gemma3n/modeling_gemma3n.js
@@ -26357,6 +26546,382 @@ var GlmModel = class extends GlmPreTrainedModel {
26357
26546
  var GlmForCausalLM = class extends GlmPreTrainedModel {
26358
26547
  };
26359
26548
 
26549
+ // src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
26550
+ var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
26551
+ };
26552
+ var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
26553
+ };
26554
+ var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
26555
+ };
26556
+
26557
+ // src/models/qwen2_vl/modeling_qwen2_vl.js
26558
+ var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
26559
+ forward_params = [
26560
+ // Text inputs
26561
+ "input_ids",
26562
+ "attention_mask",
26563
+ "position_ids",
26564
+ "past_key_values",
26565
+ // Vision inputs
26566
+ "pixel_values",
26567
+ "image_grid_thw"
26568
+ ];
26569
+ };
26570
+ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
26571
+ // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
26572
+ // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
26573
+ // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
26574
+ image_grid_thw_name = "grid_thw";
26575
+ /**
26576
+ * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
26577
+ * @param {Tensor} input_ids
26578
+ * @param {Tensor} attention_mask
26579
+ * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
26580
+ */
26581
+ _get_text_only_rope_index(input_ids, attention_mask) {
26582
+ if (attention_mask) {
26583
+ const { data, dims } = cumsum_masked_fill(attention_mask);
26584
+ const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
26585
+ const mrope_position_deltas = Array.from(
26586
+ { length: dims[0] },
26587
+ (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
26588
+ );
26589
+ return [
26590
+ new Tensor2("int64", position_ids, [3, ...dims]),
26591
+ new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
26592
+ ];
26593
+ } else {
26594
+ const [batch_size, seq_length] = input_ids.dims;
26595
+ const position_ids = BigInt64Array.from(
26596
+ { length: 3 * batch_size * seq_length },
26597
+ (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
26598
+ );
26599
+ return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
26600
+ }
26601
+ }
26602
+ /**
26603
+ * Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
26604
+ * global [all_t, all_h, all_w] order, then write back into the position_ids array
26605
+ * respecting attention mask.
26606
+ * @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
26607
+ * @param {number[]} attn_mask Attention mask for this batch element
26608
+ * @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
26609
+ * @param {number} batch_idx Current batch index
26610
+ * @returns {number[]} Flat reordered positions of length total_len
26611
+ */
26612
+ _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
26613
+ const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
26614
+ const llm_positions = new Array(total_len);
26615
+ let index = 0;
26616
+ for (let x = 0; x < 3; ++x) {
26617
+ for (const val of llm_pos_ids_list) {
26618
+ const seg_len = val.length / 3;
26619
+ for (let z2 = x * seg_len; z2 < (x + 1) * seg_len; ++z2) {
26620
+ llm_positions[index++] = val[z2];
26621
+ }
26622
+ }
26623
+ }
26624
+ let count2 = 0;
26625
+ for (let y = 0; y < attn_mask.length; ++y) {
26626
+ if (attn_mask[y] == 1) {
26627
+ for (let x = 0; x < 3; ++x) {
26628
+ position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
26629
+ }
26630
+ ++count2;
26631
+ }
26632
+ }
26633
+ return llm_positions;
26634
+ }
26635
+ /**
26636
+ * Build per-batch position ID segments for multimodal rope.
26637
+ * Override this in subclasses to change how vision/text segments are identified and positioned.
26638
+ * @param {object} params
26639
+ * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
26640
+ * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
26641
+ * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
26642
+ * @param {number} params.spatial_merge_size
26643
+ * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
26644
+ * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
26645
+ */
26646
+ _get_multimodal_rope_positions({
26647
+ filtered_ids,
26648
+ image_grid_thw_list,
26649
+ video_grid_thw_list,
26650
+ spatial_merge_size,
26651
+ state
26652
+ }) {
26653
+ const { image_token_id, video_token_id, vision_start_token_id } = this.config;
26654
+ const ids = filtered_ids;
26655
+ const vision_start_indices = ids.reduce((acc, x, idx) => {
26656
+ if (x == vision_start_token_id) acc.push(idx);
26657
+ return acc;
26658
+ }, []);
26659
+ const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
26660
+ const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
26661
+ const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
26662
+ const llm_pos_ids_list = [];
26663
+ let st2 = 0;
26664
+ let remain_images = image_nums;
26665
+ let remain_videos = video_nums;
26666
+ for (let j = 0; j < vision_tokens.length; ++j) {
26667
+ const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
26668
+ const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
26669
+ const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
26670
+ const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
26671
+ let ed;
26672
+ let t, h, w;
26673
+ if (ed_image < ed_video) {
26674
+ [t, h, w] = image_grid_thw_list[state.image_index];
26675
+ ++state.image_index;
26676
+ --remain_images;
26677
+ ed = ed_image;
26678
+ } else {
26679
+ [t, h, w] = video_grid_thw_list[state.video_index];
26680
+ ++state.video_index;
26681
+ --remain_videos;
26682
+ ed = ed_video;
26683
+ }
26684
+ const [llm_grid_t, llm_grid_h, llm_grid_w] = [
26685
+ Number(t),
26686
+ Math.floor(Number(h) / spatial_merge_size),
26687
+ Math.floor(Number(w) / spatial_merge_size)
26688
+ ];
26689
+ const text_len = ed - st2;
26690
+ const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
26691
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
26692
+ const offset = text_len + st_idx;
26693
+ const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
26694
+ const t_index = Array.from(
26695
+ { length: grid_size },
26696
+ (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
26697
+ );
26698
+ const h_index = Array.from(
26699
+ { length: grid_size },
26700
+ (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
26701
+ );
26702
+ const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
26703
+ llm_pos_ids_list.push([t_index, h_index, w_index].flat());
26704
+ st2 = ed + grid_size;
26705
+ }
26706
+ if (st2 < ids.length) {
26707
+ const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
26708
+ const text_len = ids.length - st2;
26709
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
26710
+ }
26711
+ return llm_pos_ids_list;
26712
+ }
26713
+ /**
26714
+ * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
26715
+ *
26716
+ * Explanation:
26717
+ * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
26718
+ *
26719
+ * For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
26720
+ * Examples:
26721
+ * input_ids: [T T T T T], here T is for text.
26722
+ * temporal position_ids: [0, 1, 2, 3, 4]
26723
+ * height position_ids: [0, 1, 2, 3, 4]
26724
+ * width position_ids: [0, 1, 2, 3, 4]
26725
+ *
26726
+ * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
26727
+ * and 1D rotary position embeddin for text part.
26728
+ * Examples:
26729
+ * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
26730
+ * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
26731
+ * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
26732
+ * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
26733
+ * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
26734
+ * text temporal position_ids: [3, 4, 5, 6, 7]
26735
+ * text height position_ids: [3, 4, 5, 6, 7]
26736
+ * text width position_ids: [3, 4, 5, 6, 7]
26737
+ * Here we calculate the text start position_ids as the max vision position_ids plus 1.
26738
+ *
26739
+ * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
26740
+ * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
26741
+ * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
26742
+ * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
26743
+ * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
26744
+ */
26745
+ get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
26746
+ const { vision_config } = this.config;
26747
+ const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
26748
+ if (image_grid_thw || video_grid_thw) {
26749
+ const total_input_ids = input_ids.tolist();
26750
+ if (!attention_mask) {
26751
+ attention_mask = ones_like(input_ids);
26752
+ }
26753
+ const attention_mask_list = attention_mask.tolist();
26754
+ const position_ids_list = Array.from(
26755
+ { length: 3 },
26756
+ () => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
26757
+ );
26758
+ const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
26759
+ const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
26760
+ const state = { image_index: 0, video_index: 0 };
26761
+ const mrope_position_deltas = [];
26762
+ for (let i = 0; i < total_input_ids.length; ++i) {
26763
+ const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
26764
+ const llm_pos_ids_list = this._get_multimodal_rope_positions({
26765
+ filtered_ids,
26766
+ image_grid_thw_list,
26767
+ video_grid_thw_list,
26768
+ spatial_merge_size,
26769
+ state
26770
+ });
26771
+ const llm_positions = this._reorder_and_write_positions(
26772
+ llm_pos_ids_list,
26773
+ attention_mask_list[i],
26774
+ position_ids_list,
26775
+ i
26776
+ );
26777
+ mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
26778
+ }
26779
+ return [
26780
+ new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
26781
+ new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
26782
+ ];
26783
+ } else {
26784
+ return this._get_text_only_rope_index(input_ids, attention_mask);
26785
+ }
26786
+ }
26787
+ async encode_image({ pixel_values, image_grid_thw }) {
26788
+ const features = (await sessionRun(this.sessions["vision_encoder"], {
26789
+ pixel_values,
26790
+ [this.image_grid_thw_name]: image_grid_thw
26791
+ })).image_features;
26792
+ return features;
26793
+ }
26794
+ _merge_input_ids_with_image_features(kwargs) {
26795
+ return default_merge_input_ids_with_image_features({
26796
+ // @ts-ignore
26797
+ image_token_id: this.config.image_token_id,
26798
+ ...kwargs
26799
+ });
26800
+ }
26801
+ prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
26802
+ if (!model_inputs.attention_mask || model_inputs.position_ids) {
26803
+ return model_inputs;
26804
+ }
26805
+ const session = this.sessions["decoder_model_merged"] ?? this.sessions["model"];
26806
+ if (!session.inputNames.includes("position_ids")) {
26807
+ return model_inputs;
26808
+ }
26809
+ if (!model_inputs.past_key_values) {
26810
+ [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
26811
+ model_inputs.input_ids,
26812
+ model_inputs.image_grid_thw,
26813
+ model_inputs.video_grid_thw,
26814
+ model_inputs.attention_mask
26815
+ );
26816
+ } else {
26817
+ model_inputs.pixel_values = null;
26818
+ const past_length = model_inputs.past_key_values.get_seq_length();
26819
+ if (past_length < model_inputs.input_ids.dims[1]) {
26820
+ const [full_position_ids, rope_deltas] = this.get_rope_index(
26821
+ model_inputs.input_ids,
26822
+ model_inputs.image_grid_thw,
26823
+ model_inputs.video_grid_thw,
26824
+ model_inputs.attention_mask
26825
+ );
26826
+ model_inputs.rope_deltas = rope_deltas;
26827
+ model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
26828
+ model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
26829
+ } else {
26830
+ if (!model_inputs.rope_deltas) {
26831
+ [, model_inputs.rope_deltas] = this.get_rope_index(
26832
+ model_inputs.input_ids,
26833
+ model_inputs.image_grid_thw,
26834
+ model_inputs.video_grid_thw,
26835
+ model_inputs.attention_mask
26836
+ );
26837
+ }
26838
+ const delta = BigInt(past_length);
26839
+ const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
26840
+ model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
26841
+ }
26842
+ }
26843
+ return model_inputs;
26844
+ }
26845
+ };
26846
+ var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
26847
+ };
26848
+
26849
+ // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
26850
+ var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
26851
+ image_grid_thw_name = "image_grid_thw";
26852
+ };
26853
+ var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
26854
+ image_grid_thw_name = "image_grid_thw";
26855
+ };
26856
+
26857
+ // src/models/glm_ocr/modeling_glm_ocr.js
26858
+ var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
26859
+ /**
26860
+ * Compute 3D positional indices for vision tokens.
26861
+ * Temporal is constant, height is repeat-interleaved, width tiles.
26862
+ * @param {number} start_position
26863
+ * @param {number[]} grid_thw [T, H, W]
26864
+ * @param {number} temp_merge_size
26865
+ * @param {number} spatial_merge_size
26866
+ * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
26867
+ */
26868
+ get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
26869
+ const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
26870
+ const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
26871
+ const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
26872
+ const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
26873
+ const t_pos = Array.from({ length: seq_len }, () => start_position);
26874
+ const h_pos = Array.from(
26875
+ { length: seq_len },
26876
+ (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
26877
+ );
26878
+ const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
26879
+ return [...t_pos, ...h_pos, ...w_pos];
26880
+ }
26881
+ /**
26882
+ * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
26883
+ * instead of vision_start_token_id scanning used by Qwen2VL.
26884
+ * After a vision segment, position advances by max(h, w) / spatial_merge_size.
26885
+ */
26886
+ _get_multimodal_rope_positions({
26887
+ filtered_ids,
26888
+ image_grid_thw_list,
26889
+ video_grid_thw_list,
26890
+ spatial_merge_size,
26891
+ state
26892
+ }) {
26893
+ const { image_token_id } = this.config;
26894
+ const groups = [];
26895
+ let group_start = 0;
26896
+ let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
26897
+ for (let j = 1; j <= filtered_ids.length; ++j) {
26898
+ const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
26899
+ if (t !== current_type) {
26900
+ groups.push([current_type, group_start, j]);
26901
+ group_start = j;
26902
+ current_type = t;
26903
+ }
26904
+ }
26905
+ let current_pos = 0;
26906
+ const llm_pos_ids_list = [];
26907
+ for (const [modality_type, start_idx, end_idx] of groups) {
26908
+ if (modality_type === 0) {
26909
+ const text_len = end_idx - start_idx;
26910
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
26911
+ current_pos += text_len;
26912
+ } else {
26913
+ const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
26914
+ const temp_merge_size = grid_thw[0];
26915
+ llm_pos_ids_list.push(
26916
+ this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
26917
+ );
26918
+ current_pos += Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size;
26919
+ }
26920
+ }
26921
+ return llm_pos_ids_list;
26922
+ }
26923
+ };
26924
+
26360
26925
  // src/models/glpn/modeling_glpn.js
26361
26926
  var GLPNPreTrainedModel = class extends PreTrainedModel {
26362
26927
  };
@@ -26555,27 +27120,6 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
26555
27120
  var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
26556
27121
  };
26557
27122
 
26558
- // src/models/llava/modeling_llava.js
26559
- var LlavaPreTrainedModel = class extends PreTrainedModel {
26560
- forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
26561
- };
26562
- var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
26563
- _merge_input_ids_with_image_features(kwargs) {
26564
- const vision_hidden_size = kwargs.image_features.dims.at(-1);
26565
- const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
26566
- return default_merge_input_ids_with_image_features({
26567
- // @ts-ignore
26568
- image_token_id: this.config.image_token_index ?? this.config.image_token_id,
26569
- ...kwargs,
26570
- image_features: reshaped_image_hidden_states
26571
- });
26572
- }
26573
- };
26574
- var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
26575
- };
26576
- var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
26577
- };
26578
-
26579
27123
  // src/models/idefics3/modeling_idefics3.js
26580
27124
  var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
26581
27125
  forward_params = [
@@ -26669,6 +27213,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
26669
27213
  var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
26670
27214
  };
26671
27215
 
27216
+ // src/models/lighton_ocr/modeling_lighton_ocr.js
27217
+ var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
27218
+ };
27219
+
26672
27220
  // src/models/lfm2_moe/modeling_lfm2_moe.js
26673
27221
  var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
26674
27222
  };
@@ -26865,6 +27413,14 @@ var MistralModel = class extends MistralPreTrainedModel {
26865
27413
  var MistralForCausalLM = class extends MistralPreTrainedModel {
26866
27414
  };
26867
27415
 
27416
+ // src/models/mistral4/modeling_mistral4.js
27417
+ var Mistral4PreTrainedModel = class extends PreTrainedModel {
27418
+ };
27419
+ var Mistral4Model = class extends Mistral4PreTrainedModel {
27420
+ };
27421
+ var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
27422
+ };
27423
+
26868
27424
  // src/models/mobilebert/modeling_mobilebert.js
26869
27425
  var MobileBertPreTrainedModel = class extends PreTrainedModel {
26870
27426
  };
@@ -27333,6 +27889,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
27333
27889
  var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
27334
27890
  };
27335
27891
 
27892
+ // src/models/nemotron_h/modeling_nemotron_h.js
27893
+ var NemotronHPreTrainedModel = class extends PreTrainedModel {
27894
+ };
27895
+ var NemotronHModel = class extends NemotronHPreTrainedModel {
27896
+ };
27897
+ var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
27898
+ };
27899
+
27336
27900
  // src/models/neobert/modeling_neobert.js
27337
27901
  var NeoBertPreTrainedModel = class extends PreTrainedModel {
27338
27902
  };
@@ -27613,252 +28177,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
27613
28177
  var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
27614
28178
  };
27615
28179
 
27616
- // src/models/qwen2_vl/modeling_qwen2_vl.js
27617
- var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
27618
- forward_params = [
27619
- // Text inputs
27620
- "input_ids",
27621
- "attention_mask",
27622
- "position_ids",
27623
- "past_key_values",
27624
- // Vision inputs
27625
- "pixel_values",
27626
- "image_grid_thw"
27627
- ];
27628
- };
27629
- var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
27630
- // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
27631
- // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
27632
- // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
27633
- image_grid_thw_name = "grid_thw";
27634
- /**
27635
- * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
27636
- *
27637
- * Explanation:
27638
- * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
27639
- *
27640
- * For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
27641
- * Examples:
27642
- * input_ids: [T T T T T], here T is for text.
27643
- * temporal position_ids: [0, 1, 2, 3, 4]
27644
- * height position_ids: [0, 1, 2, 3, 4]
27645
- * width position_ids: [0, 1, 2, 3, 4]
27646
- *
27647
- * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
27648
- * and 1D rotary position embeddin for text part.
27649
- * Examples:
27650
- * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
27651
- * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
27652
- * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
27653
- * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
27654
- * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
27655
- * text temporal position_ids: [3, 4, 5, 6, 7]
27656
- * text height position_ids: [3, 4, 5, 6, 7]
27657
- * text width position_ids: [3, 4, 5, 6, 7]
27658
- * Here we calculate the text start position_ids as the max vision position_ids plus 1.
27659
- *
27660
- * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
27661
- * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
27662
- * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
27663
- * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
27664
- * - 1 for tokens that are **not masked**,
27665
- * - 0 for tokens that are **masked**.
27666
- * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
27667
- * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
27668
- * - mrope_position_deltas: Tensor of shape `(batch_size)`.
27669
- */
27670
- get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
27671
- const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
27672
- const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
27673
- const mrope_position_deltas = [];
27674
- if (image_grid_thw || video_grid_thw) {
27675
- let total_input_ids = input_ids.tolist();
27676
- if (!attention_mask) {
27677
- attention_mask = ones_like(input_ids);
27678
- }
27679
- const attention_mask_list = attention_mask.tolist();
27680
- const position_ids_list = Array.from(
27681
- { length: 3 },
27682
- (_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
27683
- );
27684
- const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
27685
- const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
27686
- let image_index = 0;
27687
- let video_index = 0;
27688
- for (let i = 0; i < total_input_ids.length; ++i) {
27689
- const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
27690
- const vision_start_indices = ids.reduce((acc, x, idx) => {
27691
- if (x == vision_start_token_id) acc.push(idx);
27692
- return acc;
27693
- }, []);
27694
- const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
27695
- const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
27696
- const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
27697
- let llm_pos_ids_list = [];
27698
- let st2 = 0;
27699
- let remain_images = image_nums;
27700
- let remain_videos = video_nums;
27701
- for (let j = 0; j < vision_tokens.length; ++j) {
27702
- const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
27703
- const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
27704
- const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
27705
- const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
27706
- let ed;
27707
- let t, h, w;
27708
- if (ed_image < ed_video) {
27709
- [t, h, w] = image_grid_thw_list[image_index];
27710
- ++image_index;
27711
- --remain_images;
27712
- ed = ed_image;
27713
- } else {
27714
- [t, h, w] = video_grid_thw_list[video_index];
27715
- ++video_index;
27716
- --remain_videos;
27717
- ed = ed_video;
27718
- }
27719
- const [llm_grid_t, llm_grid_h, llm_grid_w] = [
27720
- Number(t),
27721
- Math.floor(Number(h) / spatial_merge_size),
27722
- Math.floor(Number(w) / spatial_merge_size)
27723
- ];
27724
- const text_len = ed - st2;
27725
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
27726
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
27727
- const offset = text_len + st_idx;
27728
- const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
27729
- const t_index = Array.from(
27730
- { length: grid_size },
27731
- (_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
27732
- );
27733
- const h_index = Array.from(
27734
- { length: grid_size },
27735
- (_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
27736
- );
27737
- const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
27738
- llm_pos_ids_list.push([t_index, h_index, w_index].flat());
27739
- st2 = ed + grid_size;
27740
- }
27741
- if (st2 < ids.length) {
27742
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
27743
- const text_len = ids.length - st2;
27744
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
27745
- }
27746
- const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
27747
- const llm_positions = new Array(num_items);
27748
- let index = 0;
27749
- for (let x = 0; x < 3; ++x) {
27750
- for (let y = 0; y < llm_pos_ids_list.length; ++y) {
27751
- const val = llm_pos_ids_list[y];
27752
- const text_len = val.length / 3;
27753
- for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
27754
- llm_positions[index++] = val[z];
27755
- }
27756
- }
27757
- }
27758
- let count2 = 0;
27759
- const attn_mask = attention_mask_list[i];
27760
- for (let y = 0; y < attn_mask.length; ++y) {
27761
- if (attn_mask[y] == 1) {
27762
- for (let x = 0; x < 3; ++x) {
27763
- position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
27764
- }
27765
- ++count2;
27766
- }
27767
- }
27768
- const max_llm_positions = max(llm_positions)[0];
27769
- mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
27770
- }
27771
- return [
27772
- new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
27773
- new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
27774
- ];
27775
- } else {
27776
- if (attention_mask) {
27777
- const { data, dims } = cumsum_masked_fill(attention_mask);
27778
- const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
27779
- const mrope_position_deltas2 = Array.from(
27780
- { length: dims[0] },
27781
- (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
27782
- );
27783
- return [
27784
- new Tensor2("int64", position_ids, [3, ...dims]),
27785
- new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
27786
- ];
27787
- } else {
27788
- const [batch_size, seq_length] = input_ids.dims;
27789
- const position_ids = BigInt64Array.from(
27790
- { length: 3 * batch_size * seq_length },
27791
- (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
27792
- );
27793
- return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
27794
- }
27795
- }
27796
- }
27797
- async encode_image({ pixel_values, image_grid_thw }) {
27798
- const features = (await sessionRun(this.sessions["vision_encoder"], {
27799
- pixel_values,
27800
- [this.image_grid_thw_name]: image_grid_thw
27801
- })).image_features;
27802
- return features;
27803
- }
27804
- _merge_input_ids_with_image_features(kwargs) {
27805
- return default_merge_input_ids_with_image_features({
27806
- // @ts-ignore
27807
- image_token_id: this.config.image_token_id,
27808
- ...kwargs
27809
- });
27810
- }
27811
- prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
27812
- if (model_inputs.attention_mask && !model_inputs.position_ids) {
27813
- if (!model_inputs.past_key_values) {
27814
- [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
27815
- model_inputs.input_ids,
27816
- model_inputs.image_grid_thw,
27817
- model_inputs.video_grid_thw,
27818
- model_inputs.attention_mask
27819
- );
27820
- } else {
27821
- model_inputs.pixel_values = null;
27822
- const past_length = model_inputs.past_key_values.get_seq_length();
27823
- if (past_length < model_inputs.input_ids.dims[1]) {
27824
- const [full_position_ids, rope_deltas] = this.get_rope_index(
27825
- model_inputs.input_ids,
27826
- model_inputs.image_grid_thw,
27827
- model_inputs.video_grid_thw,
27828
- model_inputs.attention_mask
27829
- );
27830
- model_inputs.rope_deltas = rope_deltas;
27831
- model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
27832
- model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
27833
- } else {
27834
- if (!model_inputs.rope_deltas) {
27835
- [, model_inputs.rope_deltas] = this.get_rope_index(
27836
- model_inputs.input_ids,
27837
- model_inputs.image_grid_thw,
27838
- model_inputs.video_grid_thw,
27839
- model_inputs.attention_mask
27840
- );
27841
- }
27842
- const delta = BigInt(past_length);
27843
- const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
27844
- model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
27845
- }
27846
- }
27847
- }
27848
- return model_inputs;
27849
- }
27850
- };
27851
- var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
27852
- };
27853
-
27854
- // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
27855
- var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
27856
- image_grid_thw_name = "image_grid_thw";
27857
- };
27858
- var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
27859
- image_grid_thw_name = "image_grid_thw";
27860
- };
27861
-
27862
28180
  // src/models/qwen3/modeling_qwen3.js
27863
28181
  var Qwen3PreTrainedModel = class extends PreTrainedModel {
27864
28182
  };
@@ -28304,6 +28622,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
28304
28622
  }
28305
28623
  };
28306
28624
 
28625
+ // src/models/solar_open/modeling_solar_open.js
28626
+ var SolarOpenPreTrainedModel = class extends PreTrainedModel {
28627
+ };
28628
+ var SolarOpenModel = class extends SolarOpenPreTrainedModel {
28629
+ };
28630
+ var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
28631
+ };
28632
+
28307
28633
  // src/models/speecht5/modeling_speecht5.js
28308
28634
  var SpeechT5PreTrainedModel = class extends PreTrainedModel {
28309
28635
  };
@@ -29420,6 +29746,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
29420
29746
  // src/models/registry.js
29421
29747
  var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
29422
29748
  ["bert", "BertModel"],
29749
+ ["eurobert", "EuroBertModel"],
29423
29750
  ["neobert", "NeoBertModel"],
29424
29751
  ["modernbert", "ModernBertModel"],
29425
29752
  ["nomic_bert", "NomicBertModel"],
@@ -29551,6 +29878,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
29551
29878
  ["gemma3_text", "Gemma3Model"],
29552
29879
  ["helium", "HeliumModel"],
29553
29880
  ["glm", "GlmModel"],
29881
+ ["glm_moe_dsa", "GlmMoeDsaModel"],
29554
29882
  ["openelm", "OpenELMModel"],
29555
29883
  ["qwen2", "Qwen2Model"],
29556
29884
  ["qwen2_moe", "Qwen2MoeModel"],
@@ -29562,12 +29890,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
29562
29890
  ["mpt", "MptModel"],
29563
29891
  ["opt", "OPTModel"],
29564
29892
  ["mistral", "MistralModel"],
29893
+ ["mistral4", "Mistral4Model"],
29565
29894
  ["ministral", "MinistralModel"],
29566
29895
  ["ministral3", "Ministral3Model"],
29567
29896
  ["ernie4_5", "Ernie4_5ForCausalLM"],
29568
29897
  ["starcoder2", "Starcoder2Model"],
29898
+ ["deepseek_v3", "DeepseekV3Model"],
29569
29899
  ["falcon", "FalconModel"],
29570
29900
  ["falcon_h1", "FalconH1Model"],
29901
+ ["nemotron_h", "NemotronHModel"],
29902
+ ["solar_open", "SolarOpenModel"],
29571
29903
  ["stablelm", "StableLmModel"],
29572
29904
  ["modernbert-decoder", "ModernBertDecoderModel"],
29573
29905
  ["hunyuan_v1_dense", "HunYuanDenseV1Model"],
@@ -29587,6 +29919,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29587
29919
  ]);
29588
29920
  var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
29589
29921
  ["bert", "BertForSequenceClassification"],
29922
+ ["eurobert", "EuroBertForSequenceClassification"],
29590
29923
  ["neobert", "NeoBertForSequenceClassification"],
29591
29924
  ["modernbert", "ModernBertForSequenceClassification"],
29592
29925
  ["roformer", "RoFormerForSequenceClassification"],
@@ -29609,6 +29942,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
29609
29942
  ]);
29610
29943
  var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
29611
29944
  ["bert", "BertForTokenClassification"],
29945
+ ["eurobert", "EuroBertForTokenClassification"],
29612
29946
  ["neobert", "NeoBertForTokenClassification"],
29613
29947
  ["modernbert", "ModernBertForTokenClassification"],
29614
29948
  ["roformer", "RoFormerForTokenClassification"],
@@ -29671,6 +30005,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29671
30005
  ["gemma3", "Gemma3ForCausalLM"],
29672
30006
  ["helium", "HeliumForCausalLM"],
29673
30007
  ["glm", "GlmForCausalLM"],
30008
+ ["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
29674
30009
  ["openelm", "OpenELMForCausalLM"],
29675
30010
  ["qwen2", "Qwen2ForCausalLM"],
29676
30011
  ["qwen2_moe", "Qwen2MoeForCausalLM"],
@@ -29682,6 +30017,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29682
30017
  ["qwen3_vl", "Qwen3VLForCausalLM"],
29683
30018
  ["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
29684
30019
  ["qwen3_5", "Qwen3_5ForCausalLM"],
30020
+ ["qwen3_5_text", "Qwen3_5ForCausalLM"],
29685
30021
  ["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
29686
30022
  ["gemma3n", "Gemma3nForCausalLM"],
29687
30023
  ["phi", "PhiForCausalLM"],
@@ -29690,13 +30026,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29690
30026
  ["opt", "OPTForCausalLM"],
29691
30027
  ["mbart", "MBartForCausalLM"],
29692
30028
  ["mistral", "MistralForCausalLM"],
30029
+ ["mistral4", "Mistral4ForCausalLM"],
29693
30030
  ["ministral", "MinistralForCausalLM"],
29694
30031
  ["ministral3", "Ministral3ForCausalLM"],
29695
30032
  ["ernie4_5", "Ernie4_5ForCausalLM"],
29696
30033
  ["starcoder2", "Starcoder2ForCausalLM"],
30034
+ ["deepseek_v3", "DeepseekV3ForCausalLM"],
29697
30035
  ["falcon", "FalconForCausalLM"],
29698
30036
  ["falcon_h1", "FalconH1ForCausalLM"],
30037
+ ["nemotron_h", "NemotronHForCausalLM"],
29699
30038
  ["trocr", "TrOCRForCausalLM"],
30039
+ ["solar_open", "SolarOpenForCausalLM"],
29700
30040
  ["stablelm", "StableLmForCausalLM"],
29701
30041
  ["modernbert-decoder", "ModernBertDecoderForCausalLM"],
29702
30042
  ["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
@@ -29707,6 +30047,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29707
30047
  var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
29708
30048
  var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29709
30049
  ["bert", "BertForMaskedLM"],
30050
+ ["eurobert", "EuroBertForMaskedLM"],
29710
30051
  ["neobert", "NeoBertForMaskedLM"],
29711
30052
  ["modernbert", "ModernBertForMaskedLM"],
29712
30053
  ["roformer", "RoFormerForMaskedLM"],
@@ -29764,8 +30105,11 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
29764
30105
  ["smolvlm", "SmolVLMForConditionalGeneration"],
29765
30106
  ["paligemma", "PaliGemmaForConditionalGeneration"],
29766
30107
  ["llava_qwen2", "LlavaQwen2ForCausalLM"],
30108
+ ["gemma3", "Gemma3ForConditionalGeneration"],
29767
30109
  ["gemma3n", "Gemma3nForConditionalGeneration"],
29768
- ["mistral3", "Mistral3ForConditionalGeneration"]
30110
+ ["mistral3", "Mistral3ForConditionalGeneration"],
30111
+ ["lighton_ocr", "LightOnOcrForConditionalGeneration"],
30112
+ ["glm_ocr", "GlmOcrForConditionalGeneration"]
29769
30113
  ]);
29770
30114
  var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
29771
30115
  ["granite_speech", "GraniteSpeechForConditionalGeneration"],
@@ -29870,6 +30214,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
29870
30214
  ]);
29871
30215
  var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
29872
30216
  var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30217
+ ["chmv2", "CHMv2ForDepthEstimation"],
29873
30218
  ["dpt", "DPTForDepthEstimation"],
29874
30219
  ["depth_anything", "DepthAnythingForDepthEstimation"],
29875
30220
  ["glpn", "GLPNForDepthEstimation"],
@@ -29955,13 +30300,6 @@ var CUSTOM_MAPPING = [
29955
30300
  ],
29956
30301
  ["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
29957
30302
  ["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
29958
- ["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29959
- ["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29960
- ["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29961
- ["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29962
- ["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29963
- ["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29964
- ["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29965
30303
  [
29966
30304
  "VoxtralRealtimeForConditionalGeneration",
29967
30305
  VoxtralRealtimeForConditionalGeneration,
@@ -31643,6 +31981,41 @@ var TASK_ALIASES = Object.freeze({
31643
31981
  embeddings: "feature-extraction"
31644
31982
  });
31645
31983
 
31984
// src/utils/model_registry/resolve_model_type.js
/**
 * Resolves the MODEL_TYPES category for a model config.
 *
 * Lookup order:
 *   1. Each entry of `config.architectures` directly in MODEL_TYPE_MAPPING.
 *   2. `config.model_type` directly in MODEL_TYPE_MAPPING.
 *   3. `config.model_type` via every table in MODEL_MAPPING_NAMES, then the
 *      mapped architecture name in MODEL_TYPE_MAPPING.
 *
 * Falls back to MODEL_TYPES.EncoderOnly (single model.onnx file) when no
 * mapping matches, optionally logging a warning.
 *
 * @param {Object} config - The model configuration (expects `architectures` and `model_type`).
 * @param {Object} [options] - Optional parameters.
 * @param {boolean} [options.warn=true] - Whether to warn when falling back to EncoderOnly.
 * @returns {*} The resolved MODEL_TYPES value.
 */
function resolve_model_type(config, { warn = true } = {}) {
  const archs = /** @type {string[]} */ (config.architectures || []);

  // 1) Direct architecture-name lookup; first hit wins.
  for (const name of archs) {
    const type = MODEL_TYPE_MAPPING.get(name);
    if (type !== void 0) {
      return type;
    }
  }

  if (config.model_type) {
    // 2) Direct model_type lookup.
    const direct = MODEL_TYPE_MAPPING.get(config.model_type);
    if (direct !== void 0) {
      return direct;
    }

    // 3) Indirect lookup: model_type -> architecture name -> model type.
    for (const names of Object.values(MODEL_MAPPING_NAMES)) {
      if (!names.has(config.model_type)) {
        continue;
      }
      const viaMapping = MODEL_TYPE_MAPPING.get(names.get(config.model_type));
      if (viaMapping !== void 0) {
        return viaMapping;
      }
    }
  }

  if (warn) {
    const shown = archs.length > 0 ? archs.join(", ") : "(none)";
    logger.warn(
      `[resolve_model_type] Architecture(s) not found in MODEL_TYPE_MAPPING: [${shown}] for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`
    );
  }
  return MODEL_TYPES.EncoderOnly;
}
32018
+
31646
32019
  // src/utils/model_registry/get_model_files.js
31647
32020
  function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
31648
32021
  if (config !== null) {
@@ -31665,43 +32038,7 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
31665
32038
  const subfolder = "onnx";
31666
32039
  const rawDevice = overrideDevice ?? custom_config.device;
31667
32040
  let dtype = overrideDtype ?? custom_config.dtype;
31668
- let modelType;
31669
- const architectures = (
31670
- /** @type {string[]} */
31671
- config.architectures || []
31672
- );
31673
- let foundInMapping = false;
31674
- for (const arch of architectures) {
31675
- const mappedType = MODEL_TYPE_MAPPING.get(arch);
31676
- if (mappedType !== void 0) {
31677
- modelType = mappedType;
31678
- foundInMapping = true;
31679
- break;
31680
- }
31681
- }
31682
- if (!foundInMapping && config.model_type) {
31683
- const mappedType = MODEL_TYPE_MAPPING.get(config.model_type);
31684
- if (mappedType !== void 0) {
31685
- modelType = mappedType;
31686
- foundInMapping = true;
31687
- }
31688
- if (!foundInMapping) {
31689
- for (const mapping of Object.values(MODEL_MAPPING_NAMES)) {
31690
- if (mapping.has(config.model_type)) {
31691
- modelType = MODEL_TYPE_MAPPING.get(mapping.get(config.model_type));
31692
- foundInMapping = true;
31693
- break;
31694
- }
31695
- }
31696
- }
31697
- }
31698
- if (!foundInMapping) {
31699
- const archList = architectures.length > 0 ? architectures.join(", ") : "(none)";
31700
- logger.warn(
31701
- `[get_model_files] Architecture(s) not found in MODEL_TYPE_MAPPING: [${archList}] for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`
31702
- );
31703
- modelType = MODEL_TYPES.EncoderOnly;
31704
- }
32041
+ const modelType = resolve_model_type(config);
31705
32042
  const add_model_file = (fileName, baseName = null) => {
31706
32043
  baseName = baseName ?? fileName;
31707
32044
  const selectedDevice = selectDevice(rawDevice, fileName);
@@ -32288,6 +32625,31 @@ async function clear_pipeline_cache(task, modelId, options = {}) {
32288
32625
  return await clear_files_from_cache(modelId, files, options);
32289
32626
  }
32290
32627
 
32628
// src/utils/model_registry/get_available_dtypes.js
// All dtype keys that map to a concrete file suffix (e.g. fp16 -> "_fp16").
var CONCRETE_DTYPES = Object.keys(DEFAULT_DTYPE_SUFFIX_MAPPING);

/**
 * Determines which quantization levels (dtypes) are available for a model by
 * probing which ONNX session files exist (remotely or locally).
 *
 * A dtype counts as available only when EVERY required session file for the
 * resolved model type exists with that dtype's suffix.
 *
 * @param {string} modelId - The model id to probe.
 * @param {Object} [options] - Optional parameters.
 * @param {Object} [options.config=null] - Pre-loaded config (fetched when null).
 * @param {string} [options.model_file_name=null] - Override for the base model file name.
 * @param {string} [options.revision='main'] - Model revision.
 * @param {string} [options.cache_dir=null] - Custom cache directory.
 * @param {boolean} [options.local_files_only=false] - Only consult local files.
 * @returns {Promise<string[]>} The subset of CONCRETE_DTYPES whose files all exist.
 */
async function get_available_dtypes(modelId, { config = null, model_file_name = null, revision = "main", cache_dir = null, local_files_only = false } = {}) {
  const resolvedConfig = await get_config(modelId, { config, cache_dir, local_files_only, revision });
  const modelType = resolve_model_type(resolvedConfig);
  const { sessions } = getSessionsConfig(modelType, resolvedConfig, { model_file_name });
  const sessionBaseNames = Object.values(sessions);
  const probeOptions = { revision, cache_dir, local_files_only };

  // Checks a single "onnx/<name>" file for existence.
  const fileExists = async (name) => {
    const metadata = await get_file_metadata(modelId, `onnx/${name}`, probeOptions);
    return metadata.exists;
  };

  // A dtype is usable only if every session file exists with its suffix.
  const dtypeAvailable = async (dtype) => {
    const suffix = DEFAULT_DTYPE_SUFFIX_MAPPING[dtype] ?? "";
    const checks = await Promise.all(
      sessionBaseNames.map((base) => fileExists(`${base}${suffix}.onnx`))
    );
    return checks.every(Boolean);
  };

  // Probe all dtypes in parallel, then keep those whose probe succeeded.
  const flags = await Promise.all(CONCRETE_DTYPES.map(dtypeAvailable));
  return CONCRETE_DTYPES.filter((_, i) => flags[i]);
}
32652
+
32291
32653
  // src/utils/model_registry/ModelRegistry.js
32292
32654
  var ModelRegistry = class {
32293
32655
  /**
@@ -32374,6 +32736,29 @@ var ModelRegistry = class {
32374
32736
  static async get_processor_files(modelId) {
32375
32737
  return get_processor_files(modelId);
32376
32738
  }
32739
+ /**
32740
+ * Detects which quantization levels (dtypes) are available for a model
32741
+ * by checking which ONNX files exist on the hub or locally.
32742
+ *
32743
+ * A dtype is considered available if all required model session files
32744
+ * exist for that dtype.
32745
+ *
32746
+ * @param {string} modelId - The model id (e.g., "onnx-community/all-MiniLM-L6-v2-ONNX")
32747
+ * @param {Object} [options] - Optional parameters
32748
+ * @param {import('../../configs.js').PretrainedConfig} [options.config=null] - Pre-loaded config
32749
+ * @param {string} [options.model_file_name=null] - Override the model file name (excluding .onnx suffix)
32750
+ * @param {string} [options.revision='main'] - Model revision
32751
+ * @param {string} [options.cache_dir=null] - Custom cache directory
32752
+ * @param {boolean} [options.local_files_only=false] - Only check local files
32753
+ * @returns {Promise<string[]>} Array of available dtype strings (e.g., ['fp32', 'fp16', 'q4', 'q8'])
32754
+ *
32755
+ * @example
32756
+ * const dtypes = await ModelRegistry.get_available_dtypes('onnx-community/all-MiniLM-L6-v2-ONNX');
32757
+ * console.log(dtypes); // ['fp32', 'fp16', 'int8', 'uint8', 'q8', 'q4']
32758
+ */
32759
+ static async get_available_dtypes(modelId, options = {}) {
32760
+ return get_available_dtypes(modelId, options);
32761
+ }
32377
32762
  /**
32378
32763
  * Quickly checks if a model is fully cached by verifying `config.json` is present,
32379
32764
  * then confirming all required files are cached.
@@ -32608,6 +32993,9 @@ export {
32608
32993
  BloomModel,
32609
32994
  BloomPreTrainedModel,
32610
32995
  BloomTokenizer,
32996
+ CHMv2ForDepthEstimation,
32997
+ CHMv2ImageProcessor,
32998
+ CHMv2PreTrainedModel,
32611
32999
  CLIPFeatureExtractor,
32612
33000
  CLIPImageProcessor,
32613
33001
  CLIPModel,
@@ -32703,6 +33091,9 @@ export {
32703
33091
  DebertaV2Tokenizer,
32704
33092
  DecisionTransformerModel,
32705
33093
  DecisionTransformerPreTrainedModel,
33094
+ DeepseekV3ForCausalLM,
33095
+ DeepseekV3Model,
33096
+ DeepseekV3PreTrainedModel,
32706
33097
  DeiTFeatureExtractor,
32707
33098
  DeiTForImageClassification,
32708
33099
  DeiTImageProcessor,
@@ -32763,6 +33154,11 @@ export {
32763
33154
  EsmModel,
32764
33155
  EsmPreTrainedModel,
32765
33156
  EsmTokenizer,
33157
+ EuroBertForMaskedLM,
33158
+ EuroBertForSequenceClassification,
33159
+ EuroBertForTokenClassification,
33160
+ EuroBertModel,
33161
+ EuroBertPreTrainedModel,
32766
33162
  ExaoneForCausalLM,
32767
33163
  ExaoneModel,
32768
33164
  ExaonePreTrainedModel,
@@ -32809,8 +33205,11 @@ export {
32809
33205
  Gemma2Model,
32810
33206
  Gemma2PreTrainedModel,
32811
33207
  Gemma3ForCausalLM,
33208
+ Gemma3ForConditionalGeneration,
33209
+ Gemma3ImageProcessor,
32812
33210
  Gemma3Model,
32813
33211
  Gemma3PreTrainedModel,
33212
+ Gemma3Processor,
32814
33213
  Gemma3nAudioFeatureExtractor,
32815
33214
  Gemma3nForCausalLM,
32816
33215
  Gemma3nForConditionalGeneration,
@@ -32820,8 +33219,14 @@ export {
32820
33219
  GemmaModel,
32821
33220
  GemmaPreTrainedModel,
32822
33221
  GemmaTokenizer,
33222
+ Glm46VImageProcessor,
33223
+ Glm46VProcessor,
32823
33224
  GlmForCausalLM,
32824
33225
  GlmModel,
33226
+ GlmMoeDsaForCausalLM,
33227
+ GlmMoeDsaModel,
33228
+ GlmMoeDsaPreTrainedModel,
33229
+ GlmOcrForConditionalGeneration,
32825
33230
  GlmPreTrainedModel,
32826
33231
  GptOssForCausalLM,
32827
33232
  GptOssModel,
@@ -32887,6 +33292,7 @@ export {
32887
33292
  Lfm2VlForConditionalGeneration,
32888
33293
  Lfm2VlImageProcessor,
32889
33294
  Lfm2VlProcessor,
33295
+ LightOnOcrForConditionalGeneration,
32890
33296
  LiteWhisperForConditionalGeneration,
32891
33297
  Llama4ForCausalLM,
32892
33298
  Llama4PreTrainedModel,
@@ -32956,6 +33362,9 @@ export {
32956
33362
  MimiPreTrainedModel,
32957
33363
  MinLengthLogitsProcessor,
32958
33364
  MinNewTokensLengthLogitsProcessor,
33365
+ Mistral4ForCausalLM,
33366
+ Mistral4Model,
33367
+ Mistral4PreTrainedModel,
32959
33368
  MistralForCausalLM,
32960
33369
  MistralModel,
32961
33370
  MistralPreTrainedModel,
@@ -33027,6 +33436,9 @@ export {
33027
33436
  NanoChatForCausalLM,
33028
33437
  NanoChatModel,
33029
33438
  NanoChatPreTrainedModel,
33439
+ NemotronHForCausalLM,
33440
+ NemotronHModel,
33441
+ NemotronHPreTrainedModel,
33030
33442
  NeoBertForMaskedLM,
33031
33443
  NeoBertForQuestionAnswering,
33032
33444
  NeoBertForSequenceClassification,
@@ -33216,6 +33628,9 @@ export {
33216
33628
  SnacFeatureExtractor,
33217
33629
  SnacModel,
33218
33630
  SnacPreTrainedModel,
33631
+ SolarOpenForCausalLM,
33632
+ SolarOpenModel,
33633
+ SolarOpenPreTrainedModel,
33219
33634
  SpeechT5FeatureExtractor,
33220
33635
  SpeechT5ForSpeechToText,
33221
33636
  SpeechT5ForTextToSpeech,
@@ -33413,7 +33828,7 @@ export {
33413
33828
 
33414
33829
  onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
33415
33830
  (*!
33416
- * ONNX Runtime Web v1.25.0-dev.20260307-d626b568e0
33831
+ * ONNX Runtime Web v1.25.0-dev.20260323-a99aad9d36
33417
33832
  * Copyright (c) Microsoft Corporation. All rights reserved.
33418
33833
  * Licensed under the MIT License.
33419
33834
  *)