@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +26 -26
- package/dist/transformers.js +1002 -587
- package/dist/transformers.min.js +23 -19
- package/dist/transformers.node.cjs +1030 -585
- package/dist/transformers.node.min.cjs +21 -17
- package/dist/transformers.node.min.mjs +21 -17
- package/dist/transformers.node.mjs +1000 -585
- package/dist/transformers.web.js +887 -472
- package/dist/transformers.web.min.js +21 -17
- package/package.json +3 -3
- package/src/configs.js +28 -22
- package/src/env.js +1 -1
- package/src/image_processors_utils.js +25 -15
- package/src/models/chmv2/image_processing_chmv2.js +3 -0
- package/src/models/chmv2/modeling_chmv2.js +4 -0
- package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
- package/src/models/eurobert/modeling_eurobert.js +41 -0
- package/src/models/gemma3/image_processing_gemma3.js +3 -0
- package/src/models/gemma3/modeling_gemma3.js +4 -1
- package/src/models/gemma3/processing_gemma3.js +45 -0
- package/src/models/glm46v/image_processing_glm46v.js +12 -0
- package/src/models/glm46v/processing_glm46v.js +5 -0
- package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
- package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
- package/src/models/image_processors.js +3 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
- package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
- package/src/models/mistral4/modeling_mistral4.js +5 -0
- package/src/models/modeling_utils.js +48 -25
- package/src/models/models.js +10 -1
- package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
- package/src/models/processors.js +2 -0
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +226 -168
- package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
- package/src/models/registry.js +19 -8
- package/src/models/solar_open/modeling_solar_open.js +5 -0
- package/src/pipelines.js +1 -0
- package/src/utils/hub.js +4 -1
- package/src/utils/model_registry/ModelRegistry.js +36 -0
- package/src/utils/model_registry/get_available_dtypes.js +68 -0
- package/src/utils/model_registry/get_file_metadata.js +1 -0
- package/src/utils/model_registry/get_model_files.js +7 -60
- package/src/utils/model_registry/resolve_model_type.js +66 -0
- package/types/configs.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +3 -2
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
- package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
- package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
- package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
- package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
- package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
- package/types/models/gemma3/image_processing_gemma3.d.ts +4 -0
- package/types/models/gemma3/image_processing_gemma3.d.ts.map +1 -0
- package/types/models/gemma3/modeling_gemma3.d.ts +4 -1
- package/types/models/gemma3/modeling_gemma3.d.ts.map +1 -1
- package/types/models/gemma3/processing_gemma3.d.ts +20 -0
- package/types/models/gemma3/processing_gemma3.d.ts.map +1 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
- package/types/models/glm46v/processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
- package/types/models/image_processors.d.ts +3 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
- package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
- package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
- package/types/models/modeling_utils.d.ts +2 -3
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +10 -1
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
- package/types/models/processors.d.ts +2 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
- package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
- package/types/pipelines.d.ts +1 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/model_registry/ModelRegistry.d.ts +27 -0
- package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -1
- package/types/utils/model_registry/get_available_dtypes.d.ts +26 -0
- package/types/utils/model_registry/get_available_dtypes.d.ts.map +1 -0
- package/types/utils/model_registry/get_model_files.d.ts +25 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/model_registry/resolve_model_type.d.ts +24 -0
- package/types/utils/model_registry/resolve_model_type.d.ts.map +1 -0
- package/types/models/ast/modeling_ast.d.ts.map +0 -1
- /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
|
@@ -14,7 +14,7 @@ var __export = (target, all) => {
|
|
|
14
14
|
import fs from "fs";
|
|
15
15
|
import path from "path";
|
|
16
16
|
import url from "url";
|
|
17
|
-
var VERSION = "4.0.0-next.
|
|
17
|
+
var VERSION = "4.0.0-next.9";
|
|
18
18
|
var HAS_SELF = typeof self !== "undefined";
|
|
19
19
|
var IS_FS_AVAILABLE = !isEmpty(fs);
|
|
20
20
|
var IS_PATH_AVAILABLE = !isEmpty(path);
|
|
@@ -244,7 +244,7 @@ var logger = {
|
|
|
244
244
|
}
|
|
245
245
|
};
|
|
246
246
|
|
|
247
|
-
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.
|
|
247
|
+
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
|
|
248
248
|
var DictionarySplitter = class {
|
|
249
249
|
/**
|
|
250
250
|
* @param dictionary The dictionary of words to use for splitting.
|
|
@@ -1900,10 +1900,10 @@ var BPE = class extends TokenizerModel_default {
|
|
|
1900
1900
|
);
|
|
1901
1901
|
if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
|
|
1902
1902
|
output_tokens.push(...byte_tokens);
|
|
1903
|
-
} else {
|
|
1903
|
+
} else if (this.unk_token != null) {
|
|
1904
1904
|
output_tokens.push(this.unk_token);
|
|
1905
1905
|
}
|
|
1906
|
-
} else {
|
|
1906
|
+
} else if (this.unk_token != null) {
|
|
1907
1907
|
output_tokens.push(this.unk_token);
|
|
1908
1908
|
}
|
|
1909
1909
|
}
|
|
@@ -5753,14 +5753,14 @@ var Random = class {
|
|
|
5753
5753
|
* @returns {number} A normally distributed random value.
|
|
5754
5754
|
*/
|
|
5755
5755
|
gauss(mu = 0, sigma = 1) {
|
|
5756
|
-
let
|
|
5756
|
+
let z2 = this._gauss_next;
|
|
5757
5757
|
this._gauss_next = null;
|
|
5758
|
-
if (
|
|
5758
|
+
if (z2 === null) {
|
|
5759
5759
|
const x2pi = this.random() * 2 * Math.PI, g2rad = Math.sqrt(-2 * Math.log(1 - this.random()));
|
|
5760
|
-
|
|
5760
|
+
z2 = Math.cos(x2pi) * g2rad;
|
|
5761
5761
|
this._gauss_next = Math.sin(x2pi) * g2rad;
|
|
5762
5762
|
}
|
|
5763
|
-
return mu +
|
|
5763
|
+
return mu + z2 * sigma;
|
|
5764
5764
|
}
|
|
5765
5765
|
/**
|
|
5766
5766
|
* Shuffles an array in-place using the Fisher-Yates algorithm.
|
|
@@ -6514,13 +6514,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
|
|
|
6514
6514
|
wrapped_progress
|
|
6515
6515
|
);
|
|
6516
6516
|
} else if (typeof response !== "string") {
|
|
6517
|
+
const headers = new Headers(response.headers);
|
|
6518
|
+
headers.set("content-length", result.byteLength.toString());
|
|
6517
6519
|
await cache2.put(
|
|
6518
6520
|
cacheKey,
|
|
6519
6521
|
new Response(
|
|
6520
6522
|
/** @type {any} */
|
|
6521
6523
|
result,
|
|
6522
6524
|
{
|
|
6523
|
-
headers
|
|
6525
|
+
headers
|
|
6524
6526
|
}
|
|
6525
6527
|
)
|
|
6526
6528
|
).catch((err) => {
|
|
@@ -7478,7 +7480,7 @@ var uint16_to_float32 = /* @__PURE__ */ (function() {
|
|
|
7478
7480
|
// src/backends/onnx.js
|
|
7479
7481
|
import * as ONNX_NODE from "onnxruntime-node";
|
|
7480
7482
|
|
|
7481
|
-
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.
|
|
7483
|
+
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260323-a99aad9d36/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
|
|
7482
7484
|
var ort_webgpu_bundle_min_exports = {};
|
|
7483
7485
|
__export(ort_webgpu_bundle_min_exports, {
|
|
7484
7486
|
InferenceSession: () => Jf,
|
|
@@ -8246,7 +8248,7 @@ async function ts(a = {}) {
|
|
|
8246
8248
|
throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
|
|
8247
8249
|
}
|
|
8248
8250
|
function Ye() {
|
|
8249
|
-
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn,
|
|
8251
|
+
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, q: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, s: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: lf, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: uf, A: df, r: cf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
|
|
8250
8252
|
}
|
|
8251
8253
|
async function bt() {
|
|
8252
8254
|
function e(o, u) {
|
|
@@ -8309,14 +8311,14 @@ async function ts(a = {}) {
|
|
|
8309
8311
|
gt.push(t), Je[e.Nc] = t, t.Nc = e.Nc;
|
|
8310
8312
|
var n = { Oc: "run", he: e.ge, Wc: e.Wc, Nc: e.Nc };
|
|
8311
8313
|
return t.postMessage(n, e.Yc), 0;
|
|
8312
|
-
},
|
|
8314
|
+
}, G = 0, V = (e, t, ...n) => {
|
|
8313
8315
|
var o, u = 16 * n.length, c = P(), h = Ft(u), b = h >>> 3;
|
|
8314
8316
|
for (o of n) typeof o == "bigint" ? ((p(), pe)[b++ >>> 0] = 1n, (p(), pe)[b++ >>> 0] = o) : ((p(), pe)[b++ >>> 0] = 0n, (p(), ae)[b++ >>> 0] = o);
|
|
8315
8317
|
return e = Lo(e, 0, u, h, t), D(c), e;
|
|
8316
8318
|
};
|
|
8317
8319
|
function qe(e) {
|
|
8318
8320
|
if (i) return V(0, 1, e);
|
|
8319
|
-
if (S = e, !(0 <
|
|
8321
|
+
if (S = e, !(0 < G)) {
|
|
8320
8322
|
for (var t of gt) Se(t);
|
|
8321
8323
|
for (t of We) Se(t);
|
|
8322
8324
|
We = [], gt = [], Je = {}, W = true;
|
|
@@ -8361,7 +8363,7 @@ async function ts(a = {}) {
|
|
|
8361
8363
|
We.push(e);
|
|
8362
8364
|
}
|
|
8363
8365
|
var Fe, zs = (e, t) => {
|
|
8364
|
-
|
|
8366
|
+
G = 0, e = zr(e, t), 0 < G ? S = e : Fr(e);
|
|
8365
8367
|
}, Ct = [], Ut = 0, me = (e) => -9007199254740992 > e || 9007199254740992 < e ? NaN : Number(e);
|
|
8366
8368
|
function Vs(e) {
|
|
8367
8369
|
var t = new wr(e >>>= 0);
|
|
@@ -8713,7 +8715,7 @@ async function ts(a = {}) {
|
|
|
8713
8715
|
}
|
|
8714
8716
|
var he = (e) => {
|
|
8715
8717
|
if (!W) try {
|
|
8716
|
-
if (e(), !(0 <
|
|
8718
|
+
if (e(), !(0 < G)) try {
|
|
8717
8719
|
i ? Wt() && Fr(S) : br(S);
|
|
8718
8720
|
} catch (t) {
|
|
8719
8721
|
t instanceof wt || t == "unwind" || y(0, t);
|
|
@@ -8741,7 +8743,7 @@ async function ts(a = {}) {
|
|
|
8741
8743
|
return (t ? Vr[t] : of[e])(...Ir);
|
|
8742
8744
|
}
|
|
8743
8745
|
var Ei = () => {
|
|
8744
|
-
|
|
8746
|
+
G = 0;
|
|
8745
8747
|
};
|
|
8746
8748
|
function Si(e) {
|
|
8747
8749
|
e >>>= 0, i ? postMessage({ Oc: "cleanupThread", ie: e }) : yn(Je[e]);
|
|
@@ -8761,7 +8763,7 @@ async function ts(a = {}) {
|
|
|
8761
8763
|
try {
|
|
8762
8764
|
return e(...n);
|
|
8763
8765
|
} finally {
|
|
8764
|
-
W || (_t.pop(), Me && Ge === 1 && _t.length === 0 && (Ge = 0,
|
|
8766
|
+
W || (_t.pop(), Me && Ge === 1 && _t.length === 0 && (Ge = 0, G += 1, Pt(wa), typeof Fibers < "u" && Fibers.De()));
|
|
8765
8767
|
}
|
|
8766
8768
|
};
|
|
8767
8769
|
return jn.set(e, t), t;
|
|
@@ -8776,7 +8778,7 @@ async function ts(a = {}) {
|
|
|
8776
8778
|
try {
|
|
8777
8779
|
var c = (function() {
|
|
8778
8780
|
var E = (p(), x)[Me + 8 >>> 2 >>> 0];
|
|
8779
|
-
return E = Vn.get(E), E = jn.get(E), --
|
|
8781
|
+
return E = Vn.get(E), E = jn.get(E), --G, E();
|
|
8780
8782
|
})();
|
|
8781
8783
|
} catch (E) {
|
|
8782
8784
|
c = E, u = true;
|
|
@@ -8967,7 +8969,7 @@ async function ts(a = {}) {
|
|
|
8967
8969
|
return L(ct(e >>> 0, t >>> 0));
|
|
8968
8970
|
}
|
|
8969
8971
|
var ou = () => {
|
|
8970
|
-
throw
|
|
8972
|
+
throw G += 1, "unwind";
|
|
8971
8973
|
};
|
|
8972
8974
|
function au() {
|
|
8973
8975
|
return 4294901760;
|
|
@@ -9060,15 +9062,15 @@ async function ts(a = {}) {
|
|
|
9060
9062
|
}
|
|
9061
9063
|
(b = (p(), A)[c + 24 >>> 2 >>> 0]) && (b = { label: Ne(b + 4) }, e.defaultQueue = b), e.label = Ne(c + 4);
|
|
9062
9064
|
}
|
|
9063
|
-
|
|
9064
|
-
--
|
|
9065
|
-
ce[u >>> 0] = B.queue, ce[o >>> 0] = B, lt(n, B.lost.then((ue) => {
|
|
9065
|
+
G += 1, lt(t, h.requestDevice(e).then((B) => {
|
|
9066
|
+
--G, he(() => {
|
|
9067
|
+
ce[u >>> 0] = B.queue, ce[o >>> 0] = B, G += 1, lt(n, B.lost.then((ue) => {
|
|
9066
9068
|
he(() => {
|
|
9067
9069
|
B.onuncapturederror = () => {
|
|
9068
9070
|
};
|
|
9069
9071
|
var ye = P(), fe = Ce(ue.message);
|
|
9070
9072
|
_r(n, yu[ue.reason], fe), D(ye);
|
|
9071
|
-
});
|
|
9073
|
+
}), --G;
|
|
9072
9074
|
})), B.onuncapturederror = (ue) => {
|
|
9073
9075
|
var ye = 5;
|
|
9074
9076
|
ue.error instanceof GPUValidationError ? ye = 2 : ue.error instanceof GPUOutOfMemoryError ? ye = 3 : ue.error instanceof GPUInternalError && (ye = 4);
|
|
@@ -9077,7 +9079,7 @@ async function ts(a = {}) {
|
|
|
9077
9079
|
}, "adapterInfo" in B || (B.adapterInfo = h.info), kr(t, 1, o, 0);
|
|
9078
9080
|
});
|
|
9079
9081
|
}, (B) => {
|
|
9080
|
-
--
|
|
9082
|
+
--G, he(() => {
|
|
9081
9083
|
var ue = P(), ye = Ce(B.message);
|
|
9082
9084
|
kr(t, 3, o, ye), n && _r(n, 4, ye), D(ue);
|
|
9083
9085
|
});
|
|
@@ -9120,12 +9122,12 @@ async function ts(a = {}) {
|
|
|
9120
9122
|
function vu(e, t, n, o, u) {
|
|
9121
9123
|
e >>>= 0, t = me(t), n = me(n), u >>>= 0;
|
|
9122
9124
|
var c = O(e);
|
|
9123
|
-
Re[e] = [], u == 4294967295 && (u = void 0),
|
|
9124
|
-
--
|
|
9125
|
+
Re[e] = [], u == 4294967295 && (u = void 0), G += 1, lt(t, c.mapAsync(n, o >>> 0, u).then(() => {
|
|
9126
|
+
--G, he(() => {
|
|
9125
9127
|
Rr(t, 1, 0);
|
|
9126
9128
|
});
|
|
9127
9129
|
}, (h) => {
|
|
9128
|
-
--
|
|
9130
|
+
--G, he(() => {
|
|
9129
9131
|
P();
|
|
9130
9132
|
var b = Ce(h.message);
|
|
9131
9133
|
Rr(t, h.name === "AbortError" ? 4 : h.name === "OperationError" ? 3 : 0, b), delete Re[e];
|
|
@@ -9154,12 +9156,12 @@ async function ts(a = {}) {
|
|
|
9154
9156
|
return ce[n >>> 0] = u, o && (Re[n] = []), true;
|
|
9155
9157
|
}
|
|
9156
9158
|
function Iu(e, t, n, o) {
|
|
9157
|
-
e >>>= 0, t = me(t), o >>>= 0, n = du(n >>> 0), e = O(e),
|
|
9158
|
-
--
|
|
9159
|
+
e >>>= 0, t = me(t), o >>>= 0, n = du(n >>> 0), e = O(e), G += 1, lt(t, e.createComputePipelineAsync(n).then((u) => {
|
|
9160
|
+
--G, he(() => {
|
|
9159
9161
|
ce[o >>> 0] = u, Pr(t, 1, o, 0);
|
|
9160
9162
|
});
|
|
9161
9163
|
}, (u) => {
|
|
9162
|
-
--
|
|
9164
|
+
--G, he(() => {
|
|
9163
9165
|
var c = P(), h = Ce(u.message);
|
|
9164
9166
|
Pr(t, u.reason === "validation" ? 3 : u.reason === "internal" ? 4 : 0, o, h), D(c);
|
|
9165
9167
|
});
|
|
@@ -9174,15 +9176,15 @@ async function ts(a = {}) {
|
|
|
9174
9176
|
(e = O(e)).onuncapturederror = null, e.destroy();
|
|
9175
9177
|
};
|
|
9176
9178
|
function Ou(e, t) {
|
|
9177
|
-
t = me(t), e = O(e >>> 0),
|
|
9178
|
-
--
|
|
9179
|
+
t = me(t), e = O(e >>> 0), G += 1, lt(t, e.popErrorScope().then((n) => {
|
|
9180
|
+
--G, he(() => {
|
|
9179
9181
|
var o = 5;
|
|
9180
9182
|
n ? n instanceof GPUValidationError ? o = 2 : n instanceof GPUOutOfMemoryError ? o = 3 : n instanceof GPUInternalError && (o = 4) : o = 1;
|
|
9181
9183
|
var u = P(), c = n ? Ce(n.message) : 0;
|
|
9182
9184
|
Nr(t, 1, o, c), D(u);
|
|
9183
9185
|
});
|
|
9184
9186
|
}, (n) => {
|
|
9185
|
-
--
|
|
9187
|
+
--G, he(() => {
|
|
9186
9188
|
var o = P(), u = Ce(n.message);
|
|
9187
9189
|
Nr(t, 1, 5, u), D(o);
|
|
9188
9190
|
});
|
|
@@ -9193,8 +9195,8 @@ async function ts(a = {}) {
|
|
|
9193
9195
|
var u = { featureLevel: pu[(p(), x)[n + 4 >>> 2 >>> 0]], powerPreference: mu[(p(), x)[n + 8 >>> 2 >>> 0]], forceFallbackAdapter: !!(p(), A)[n + 12 >>> 2 >>> 0] };
|
|
9194
9196
|
(e = (p(), A)[n >>> 2 >>> 0]) !== 0 && (p(), u.Fe = !!(p(), A)[e + 8 >>> 2 >>> 0]);
|
|
9195
9197
|
}
|
|
9196
|
-
"gpu" in navigator ? (
|
|
9197
|
-
--
|
|
9198
|
+
"gpu" in navigator ? (G += 1, lt(t, navigator.gpu.requestAdapter(u).then((c) => {
|
|
9199
|
+
--G, he(() => {
|
|
9198
9200
|
if (c) ce[o >>> 0] = c, Et(t, 1, o, 0);
|
|
9199
9201
|
else {
|
|
9200
9202
|
var h = P(), b = Ce("WebGPU not available on this browser (requestAdapter returned null)");
|
|
@@ -9202,7 +9204,7 @@ async function ts(a = {}) {
|
|
|
9202
9204
|
}
|
|
9203
9205
|
});
|
|
9204
9206
|
}, (c) => {
|
|
9205
|
-
--
|
|
9207
|
+
--G, he(() => {
|
|
9206
9208
|
var h = P(), b = Ce(c.message);
|
|
9207
9209
|
Et(t, 4, o, b), D(h);
|
|
9208
9210
|
});
|
|
@@ -9433,7 +9435,7 @@ async function ts(a = {}) {
|
|
|
9433
9435
|
Te(`invalid type for getValue: ${t}`);
|
|
9434
9436
|
}
|
|
9435
9437
|
}, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
|
|
9436
|
-
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = {
|
|
9438
|
+
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 937012: (e, t, n, o, u) => {
|
|
9437
9439
|
if (r === void 0 || !r.Uc) return 1;
|
|
9438
9440
|
if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
|
|
9439
9441
|
if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
|
|
@@ -9453,11 +9455,11 @@ async function ts(a = {}) {
|
|
|
9453
9455
|
} catch {
|
|
9454
9456
|
return 4;
|
|
9455
9457
|
}
|
|
9456
|
-
},
|
|
9458
|
+
}, 937836: (e, t, n) => {
|
|
9457
9459
|
r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
|
|
9458
|
-
},
|
|
9460
|
+
}, 937900: () => r.me(), 937942: (e) => {
|
|
9459
9461
|
r.jd(e);
|
|
9460
|
-
},
|
|
9462
|
+
}, 937979: () => typeof wasmOffsetConverter < "u" };
|
|
9461
9463
|
function af(e, t, n, o) {
|
|
9462
9464
|
var u = P();
|
|
9463
9465
|
try {
|
|
@@ -9476,12 +9478,12 @@ async function ts(a = {}) {
|
|
|
9476
9478
|
N(1, 0);
|
|
9477
9479
|
}
|
|
9478
9480
|
}
|
|
9479
|
-
function uf(e
|
|
9480
|
-
var
|
|
9481
|
+
function uf(e) {
|
|
9482
|
+
var t = P();
|
|
9481
9483
|
try {
|
|
9482
|
-
|
|
9483
|
-
} catch (
|
|
9484
|
-
if (D(
|
|
9484
|
+
Ro(e);
|
|
9485
|
+
} catch (n) {
|
|
9486
|
+
if (D(t), n !== n + 0) throw n;
|
|
9485
9487
|
N(1, 0);
|
|
9486
9488
|
}
|
|
9487
9489
|
}
|
|
@@ -9494,25 +9496,16 @@ async function ts(a = {}) {
|
|
|
9494
9496
|
N(1, 0);
|
|
9495
9497
|
}
|
|
9496
9498
|
}
|
|
9497
|
-
function cf(e) {
|
|
9498
|
-
var
|
|
9499
|
-
try {
|
|
9500
|
-
Ro(e);
|
|
9501
|
-
} catch (n) {
|
|
9502
|
-
if (D(t), n !== n + 0) throw n;
|
|
9503
|
-
N(1, 0);
|
|
9504
|
-
}
|
|
9505
|
-
}
|
|
9506
|
-
function df(e, t, n, o, u, c, h) {
|
|
9507
|
-
var b = P();
|
|
9499
|
+
function cf(e, t, n) {
|
|
9500
|
+
var o = P();
|
|
9508
9501
|
try {
|
|
9509
|
-
|
|
9510
|
-
} catch (
|
|
9511
|
-
if (D(
|
|
9502
|
+
_o(e, t, n);
|
|
9503
|
+
} catch (u) {
|
|
9504
|
+
if (D(o), u !== u + 0) throw u;
|
|
9512
9505
|
N(1, 0);
|
|
9513
9506
|
}
|
|
9514
9507
|
}
|
|
9515
|
-
function
|
|
9508
|
+
function df(e, t) {
|
|
9516
9509
|
var n = P();
|
|
9517
9510
|
try {
|
|
9518
9511
|
Vo(e, t);
|
|
@@ -9521,6 +9514,15 @@ async function ts(a = {}) {
|
|
|
9521
9514
|
N(1, 0);
|
|
9522
9515
|
}
|
|
9523
9516
|
}
|
|
9517
|
+
function lf(e, t, n, o, u, c, h) {
|
|
9518
|
+
var b = P();
|
|
9519
|
+
try {
|
|
9520
|
+
return Wo(e, t, n, o, u, c, h);
|
|
9521
|
+
} catch (E) {
|
|
9522
|
+
if (D(b), E !== E + 0) throw E;
|
|
9523
|
+
N(1, 0);
|
|
9524
|
+
}
|
|
9525
|
+
}
|
|
9524
9526
|
function pf(e, t, n, o, u, c) {
|
|
9525
9527
|
var h = P();
|
|
9526
9528
|
try {
|
|
@@ -9950,7 +9952,7 @@ var nc;
|
|
|
9950
9952
|
var oc;
|
|
9951
9953
|
var ac;
|
|
9952
9954
|
var qt;
|
|
9953
|
-
var
|
|
9955
|
+
var z;
|
|
9954
9956
|
var je = k(() => {
|
|
9955
9957
|
"use strict";
|
|
9956
9958
|
Yt();
|
|
@@ -10006,19 +10008,19 @@ var je = k(() => {
|
|
|
10006
10008
|
rr = false, ds = true, H(M);
|
|
10007
10009
|
});
|
|
10008
10010
|
})), await Promise.race(C), S) throw new Error(`WebAssembly backend initializing failed due to timeout: ${r}ms`);
|
|
10009
|
-
},
|
|
10011
|
+
}, z = () => {
|
|
10010
10012
|
if (nn && rn) return rn;
|
|
10011
10013
|
throw new Error("WebAssembly is not initialized yet.");
|
|
10012
10014
|
};
|
|
10013
10015
|
});
|
|
10014
10016
|
var be;
|
|
10015
10017
|
var Lt;
|
|
10016
|
-
var
|
|
10018
|
+
var $;
|
|
10017
10019
|
var nr = k(() => {
|
|
10018
10020
|
"use strict";
|
|
10019
10021
|
je();
|
|
10020
10022
|
be = (a, r) => {
|
|
10021
|
-
let s =
|
|
10023
|
+
let s = z(), f = s.lengthBytesUTF8(a) + 1, i = s._malloc(f);
|
|
10022
10024
|
return s.stringToUTF8(a, i, f), r.push(i), i;
|
|
10023
10025
|
}, Lt = (a, r, s, f) => {
|
|
10024
10026
|
if (typeof a == "object" && a !== null) {
|
|
@@ -10032,8 +10034,8 @@ var nr = k(() => {
|
|
|
10032
10034
|
else if (typeof d == "boolean") f(l, d ? "1" : "0");
|
|
10033
10035
|
else throw new Error(`Can't handle extra config type: ${typeof d}`);
|
|
10034
10036
|
});
|
|
10035
|
-
},
|
|
10036
|
-
let r =
|
|
10037
|
+
}, $ = (a) => {
|
|
10038
|
+
let r = z(), s = r.stackSave();
|
|
10037
10039
|
try {
|
|
10038
10040
|
let f = r.PTR_SIZE, i = r.stackAlloc(2 * f);
|
|
10039
10041
|
r._OrtGetLastError(i, i + f);
|
|
@@ -10050,7 +10052,7 @@ var ps = k(() => {
|
|
|
10050
10052
|
je();
|
|
10051
10053
|
nr();
|
|
10052
10054
|
ls = (a) => {
|
|
10053
|
-
let r =
|
|
10055
|
+
let r = z(), s = 0, f = [], i = a || {};
|
|
10054
10056
|
try {
|
|
10055
10057
|
if (a?.logSeverityLevel === void 0) i.logSeverityLevel = 2;
|
|
10056
10058
|
else if (typeof a.logSeverityLevel != "number" || !Number.isInteger(a.logSeverityLevel) || a.logSeverityLevel < 0 || a.logSeverityLevel > 4) throw new Error(`log severity level is not valid: ${a.logSeverityLevel}`);
|
|
@@ -10058,9 +10060,9 @@ var ps = k(() => {
|
|
|
10058
10060
|
else if (typeof a.logVerbosityLevel != "number" || !Number.isInteger(a.logVerbosityLevel)) throw new Error(`log verbosity level is not valid: ${a.logVerbosityLevel}`);
|
|
10059
10061
|
a?.terminate === void 0 && (i.terminate = false);
|
|
10060
10062
|
let d = 0;
|
|
10061
|
-
return a?.tag !== void 0 && (d = be(a.tag, f)), s = r._OrtCreateRunOptions(i.logSeverityLevel, i.logVerbosityLevel, !!i.terminate, d), s === 0 &&
|
|
10063
|
+
return a?.tag !== void 0 && (d = be(a.tag, f)), s = r._OrtCreateRunOptions(i.logSeverityLevel, i.logVerbosityLevel, !!i.terminate, d), s === 0 && $("Can't create run options."), a?.extra !== void 0 && Lt(a.extra, "", /* @__PURE__ */ new WeakSet(), (l, m) => {
|
|
10062
10064
|
let y = be(l, f), w = be(m, f);
|
|
10063
|
-
r._OrtAddRunConfigEntry(s, y, w) !== 0 &&
|
|
10065
|
+
r._OrtAddRunConfigEntry(s, y, w) !== 0 && $(`Can't set a run config entry: ${l} - ${m}.`);
|
|
10064
10066
|
}), [s, f];
|
|
10065
10067
|
} catch (d) {
|
|
10066
10068
|
throw s !== 0 && r._OrtReleaseRunOptions(s), f.forEach((l) => r._free(l)), d;
|
|
@@ -10108,7 +10110,7 @@ var hs = k(() => {
|
|
|
10108
10110
|
r.use_ort_model_bytes_directly || (r.use_ort_model_bytes_directly = "1"), a.executionProviders && a.executionProviders.some((s) => (typeof s == "string" ? s : s.name) === "webgpu") && (a.enableMemPattern = false);
|
|
10109
10111
|
}, on = (a, r, s, f) => {
|
|
10110
10112
|
let i = be(r, f), d = be(s, f);
|
|
10111
|
-
|
|
10113
|
+
z()._OrtAddSessionConfigEntry(a, i, d) !== 0 && $(`Can't set a session config entry: ${r} - ${s}.`);
|
|
10112
10114
|
}, ot = (a, r, s, f) => {
|
|
10113
10115
|
let i = be(r, f), d = be(s, f);
|
|
10114
10116
|
a.push([i, d]);
|
|
@@ -10139,7 +10141,7 @@ var hs = k(() => {
|
|
|
10139
10141
|
}
|
|
10140
10142
|
S.validationMode && ot(l, "validationMode", S.validationMode, s);
|
|
10141
10143
|
}
|
|
10142
|
-
let v =
|
|
10144
|
+
let v = z().webgpuRegisterDevice(g);
|
|
10143
10145
|
if (v) {
|
|
10144
10146
|
let [S, C, R] = v;
|
|
10145
10147
|
ot(l, "deviceId", S.toString(), s), ot(l, "webgpuInstance", C.toString(), s), ot(l, "webgpuDevice", R.toString(), s);
|
|
@@ -10154,13 +10156,13 @@ var hs = k(() => {
|
|
|
10154
10156
|
}
|
|
10155
10157
|
let m = be(d, s), y = l.length, w = 0, T = 0;
|
|
10156
10158
|
if (y > 0) {
|
|
10157
|
-
w =
|
|
10158
|
-
for (let g = 0; g < y; g++)
|
|
10159
|
+
w = z()._malloc(y * z().PTR_SIZE), s.push(w), T = z()._malloc(y * z().PTR_SIZE), s.push(T);
|
|
10160
|
+
for (let g = 0; g < y; g++) z().setValue(w + g * z().PTR_SIZE, l[g][0], "*"), z().setValue(T + g * z().PTR_SIZE, l[g][1], "*");
|
|
10159
10161
|
}
|
|
10160
|
-
await
|
|
10162
|
+
await z()._OrtAppendExecutionProvider(a, m, w, T, y) !== 0 && $(`Can't append execution provider: ${d}.`);
|
|
10161
10163
|
}
|
|
10162
10164
|
}, ms = async (a) => {
|
|
10163
|
-
let r =
|
|
10165
|
+
let r = z(), s = 0, f = [], i = a || {};
|
|
10164
10166
|
uc(i);
|
|
10165
10167
|
try {
|
|
10166
10168
|
let d = sc(i.graphOptimizationLevel ?? "all"), l = ic(i.executionMode ?? "sequential"), m = typeof i.logId == "string" ? be(i.logId, f) : 0, y = i.logSeverityLevel ?? 2;
|
|
@@ -10168,7 +10170,7 @@ var hs = k(() => {
|
|
|
10168
10170
|
let w = i.logVerbosityLevel ?? 0;
|
|
10169
10171
|
if (!Number.isInteger(w) || w < 0 || w > 4) throw new Error(`log verbosity level is not valid: ${w}`);
|
|
10170
10172
|
let T = typeof i.optimizedModelFilePath == "string" ? be(i.optimizedModelFilePath, f) : 0;
|
|
10171
|
-
if (s = r._OrtCreateSessionOptions(d, !!i.enableCpuMemArena, !!i.enableMemPattern, l, !!i.enableProfiling, 0, m, y, w, T), s === 0 &&
|
|
10173
|
+
if (s = r._OrtCreateSessionOptions(d, !!i.enableCpuMemArena, !!i.enableMemPattern, l, !!i.enableProfiling, 0, m, y, w, T), s === 0 && $("Can't create session options."), i.executionProviders && await fc(s, i, f), i.enableGraphCapture !== void 0) {
|
|
10172
10174
|
if (typeof i.enableGraphCapture != "boolean") throw new Error(`enableGraphCapture must be a boolean value: ${i.enableGraphCapture}`);
|
|
10173
10175
|
on(s, "enableGraphCapture", i.enableGraphCapture.toString(), f);
|
|
10174
10176
|
}
|
|
@@ -10176,13 +10178,13 @@ var hs = k(() => {
|
|
|
10176
10178
|
if (typeof g != "string") throw new Error(`free dimension override name must be a string: ${g}`);
|
|
10177
10179
|
if (typeof v != "number" || !Number.isInteger(v) || v < 0) throw new Error(`free dimension override value must be a non-negative integer: ${v}`);
|
|
10178
10180
|
let S = be(g, f);
|
|
10179
|
-
r._OrtAddFreeDimensionOverride(s, S, v) !== 0 &&
|
|
10181
|
+
r._OrtAddFreeDimensionOverride(s, S, v) !== 0 && $(`Can't set a free dimension override: ${g} - ${v}.`);
|
|
10180
10182
|
}
|
|
10181
10183
|
return i.extra !== void 0 && Lt(i.extra, "", /* @__PURE__ */ new WeakSet(), (g, v) => {
|
|
10182
10184
|
on(s, g, v, f);
|
|
10183
10185
|
}), [s, f];
|
|
10184
10186
|
} catch (d) {
|
|
10185
|
-
throw s !== 0 && r._OrtReleaseSessionOptions(s) !== 0 &&
|
|
10187
|
+
throw s !== 0 && r._OrtReleaseSessionOptions(s) !== 0 && $("Can't release session options."), f.forEach((l) => r._free(l)), d;
|
|
10186
10188
|
}
|
|
10187
10189
|
};
|
|
10188
10190
|
});
|
|
@@ -10752,7 +10754,7 @@ var Os = k(() => {
|
|
|
10752
10754
|
return l ? l.push(d) : this.temporarySessionTensorIds.set(r, [d]), d;
|
|
10753
10755
|
}
|
|
10754
10756
|
uploadTensor(r, s) {
|
|
10755
|
-
if (
|
|
10757
|
+
if (!z().shouldTransferToMLTensor) throw new Error("Trying to upload to a MLTensor while shouldTransferToMLTensor is false");
|
|
10756
10758
|
le("verbose", () => `[WebNN] uploadTensor {tensorId: ${r}, data: ${s.byteLength}}`), this.tensorManager.upload(r, s);
|
|
10757
10759
|
}
|
|
10758
10760
|
async downloadTensor(r, s) {
|
|
@@ -10858,11 +10860,11 @@ var Kr = k(() => {
|
|
|
10858
10860
|
nr();
|
|
10859
10861
|
sn();
|
|
10860
10862
|
yc = (a, r) => {
|
|
10861
|
-
|
|
10863
|
+
z()._OrtInit(a, r) !== 0 && $("Can't initialize onnxruntime.");
|
|
10862
10864
|
}, Jt = async (a) => {
|
|
10863
10865
|
yc(a.wasm.numThreads, Ot(a.logLevel));
|
|
10864
10866
|
}, Xt = async (a, r) => {
|
|
10865
|
-
|
|
10867
|
+
z().asyncInit?.();
|
|
10866
10868
|
let s = a.webgpu.adapter;
|
|
10867
10869
|
if (r === "webgpu") {
|
|
10868
10870
|
if (typeof navigator > "u" || !navigator.gpu) throw new Error("WebGPU is not supported in current environment");
|
|
@@ -10877,29 +10879,29 @@ var Kr = k(() => {
|
|
|
10877
10879
|
}
|
|
10878
10880
|
}
|
|
10879
10881
|
if (r === "webnn" && (typeof navigator > "u" || !navigator.ml)) throw new Error("WebNN is not supported in current environment");
|
|
10880
|
-
if (r === "webgpu" &&
|
|
10882
|
+
if (r === "webgpu" && z().webgpuInit((f) => {
|
|
10881
10883
|
a.webgpu.device = f;
|
|
10882
10884
|
}), r === "webnn") {
|
|
10883
10885
|
let f = new (Os(), $t(Ls)).WebNNBackend(a);
|
|
10884
|
-
|
|
10886
|
+
z().webnnInit([f, () => f.reserveTensorId(), (i) => f.releaseTensorId(i), async (i, d, l, m, y) => f.ensureTensor(i, d, l, m, y), (i, d) => {
|
|
10885
10887
|
f.uploadTensor(i, d);
|
|
10886
10888
|
}, async (i, d) => f.downloadTensor(i, d), (i, d) => f.registerMLContext(i, d), !!a.trace]);
|
|
10887
10889
|
}
|
|
10888
10890
|
}, it = /* @__PURE__ */ new Map(), bc = (a) => {
|
|
10889
|
-
let r =
|
|
10891
|
+
let r = z(), s = r.stackSave();
|
|
10890
10892
|
try {
|
|
10891
10893
|
let f = r.PTR_SIZE, i = r.stackAlloc(2 * f);
|
|
10892
|
-
r._OrtGetInputOutputCount(a, i, i + f) !== 0 &&
|
|
10894
|
+
r._OrtGetInputOutputCount(a, i, i + f) !== 0 && $("Can't get session input/output count.");
|
|
10893
10895
|
let l = f === 4 ? "i32" : "i64";
|
|
10894
10896
|
return [Number(r.getValue(i, l)), Number(r.getValue(i + f, l))];
|
|
10895
10897
|
} finally {
|
|
10896
10898
|
r.stackRestore(s);
|
|
10897
10899
|
}
|
|
10898
10900
|
}, Bs = (a, r) => {
|
|
10899
|
-
let s =
|
|
10901
|
+
let s = z(), f = s.stackSave(), i = 0;
|
|
10900
10902
|
try {
|
|
10901
10903
|
let d = s.PTR_SIZE, l = s.stackAlloc(2 * d);
|
|
10902
|
-
s._OrtGetInputOutputMetadata(a, r, l, l + d) !== 0 &&
|
|
10904
|
+
s._OrtGetInputOutputMetadata(a, r, l, l + d) !== 0 && $("Can't get session input/output metadata.");
|
|
10903
10905
|
let y = Number(s.getValue(l, "*"));
|
|
10904
10906
|
i = Number(s.getValue(l + d, "*"));
|
|
10905
10907
|
let w = s.HEAP32[i / 4];
|
|
@@ -10914,11 +10916,11 @@ var Kr = k(() => {
|
|
|
10914
10916
|
s.stackRestore(f), i !== 0 && s._OrtFree(i);
|
|
10915
10917
|
}
|
|
10916
10918
|
}, xt = (a) => {
|
|
10917
|
-
let r =
|
|
10919
|
+
let r = z(), s = r._malloc(a.byteLength);
|
|
10918
10920
|
if (s === 0) throw new Error(`Can't create a session. failed to allocate a buffer of size ${a.byteLength}.`);
|
|
10919
10921
|
return r.HEAPU8.set(a, s), [s, a.byteLength];
|
|
10920
10922
|
}, Qt = async (a, r) => {
|
|
10921
|
-
let s, f, i =
|
|
10923
|
+
let s, f, i = z();
|
|
10922
10924
|
Array.isArray(a) ? [s, f] = a : a.buffer === i.HEAPU8.buffer ? [s, f] = [a.byteOffset, a.byteLength] : [s, f] = xt(a);
|
|
10923
10925
|
let d = 0, l = 0, m = 0, y = [], w = [], T = [];
|
|
10924
10926
|
try {
|
|
@@ -10939,17 +10941,17 @@ var Kr = k(() => {
|
|
|
10939
10941
|
} else i.currentContext = await i.webnnCreateMLContext();
|
|
10940
10942
|
break;
|
|
10941
10943
|
}
|
|
10942
|
-
d = await i._OrtCreateSession(s, f, l), i.webgpuOnCreateSession?.(d), d === 0 &&
|
|
10944
|
+
d = await i._OrtCreateSession(s, f, l), i.webgpuOnCreateSession?.(d), d === 0 && $("Can't create a session."), i.jsepOnCreateSession?.(), i.currentContext && (i.webnnRegisterMLContext(d, i.currentContext), i.currentContext = void 0, i.shouldTransferToMLTensor = true);
|
|
10943
10945
|
let [g, v] = bc(d), S = !!r?.enableGraphCapture, C = [], R = [], H = [], U = [], M = [];
|
|
10944
10946
|
for (let L = 0; L < g; L++) {
|
|
10945
10947
|
let [W, oe, p] = Bs(d, L);
|
|
10946
|
-
W === 0 &&
|
|
10948
|
+
W === 0 && $("Can't get an input name."), w.push(W);
|
|
10947
10949
|
let ne = i.UTF8ToString(W);
|
|
10948
10950
|
C.push(ne), H.push(oe === 0 ? { name: ne, isTensor: false } : { name: ne, isTensor: true, type: or(oe), shape: p });
|
|
10949
10951
|
}
|
|
10950
10952
|
for (let L = 0; L < v; L++) {
|
|
10951
10953
|
let [W, oe, p] = Bs(d, L + g);
|
|
10952
|
-
W === 0 &&
|
|
10954
|
+
W === 0 && $("Can't get an output name."), T.push(W);
|
|
10953
10955
|
let ne = i.UTF8ToString(W);
|
|
10954
10956
|
R.push(ne), U.push(oe === 0 ? { name: ne, isTensor: false } : { name: ne, isTensor: true, type: or(oe), shape: p });
|
|
10955
10957
|
{
|
|
@@ -10968,23 +10970,23 @@ var Kr = k(() => {
|
|
|
10968
10970
|
}
|
|
10969
10971
|
}
|
|
10970
10972
|
let Y = null;
|
|
10971
|
-
return M.some((L) => L === "gpu-buffer" || L === "ml-tensor" || L === "ml-tensor-cpu-output") && (m = i._OrtCreateBinding(d), m === 0 &&
|
|
10973
|
+
return M.some((L) => L === "gpu-buffer" || L === "ml-tensor" || L === "ml-tensor-cpu-output") && (m = i._OrtCreateBinding(d), m === 0 && $("Can't create IO binding."), Y = { handle: m, outputPreferredLocations: M, outputPreferredLocationsEncoded: M.map((L) => L === "ml-tensor-cpu-output" ? "ml-tensor" : L).map((L) => an(L)) }), it.set(d, [d, w, T, Y, S, false]), [d, C, R, H, U];
|
|
10972
10974
|
} catch (g) {
|
|
10973
|
-
throw w.forEach((v) => i._OrtFree(v)), T.forEach((v) => i._OrtFree(v)), m !== 0 && i._OrtReleaseBinding(m) !== 0 &&
|
|
10975
|
+
throw w.forEach((v) => i._OrtFree(v)), T.forEach((v) => i._OrtFree(v)), m !== 0 && i._OrtReleaseBinding(m) !== 0 && $("Can't release IO binding."), d !== 0 && i._OrtReleaseSession(d) !== 0 && $("Can't release session."), g;
|
|
10974
10976
|
} finally {
|
|
10975
|
-
i._free(s), l !== 0 && i._OrtReleaseSessionOptions(l) !== 0 &&
|
|
10977
|
+
i._free(s), l !== 0 && i._OrtReleaseSessionOptions(l) !== 0 && $("Can't release session options."), y.forEach((g) => i._free(g)), i.unmountExternalData?.();
|
|
10976
10978
|
}
|
|
10977
10979
|
}, Zt = (a) => {
|
|
10978
|
-
let r =
|
|
10980
|
+
let r = z(), s = it.get(a);
|
|
10979
10981
|
if (!s) throw new Error(`cannot release session. invalid session id: ${a}`);
|
|
10980
10982
|
let [f, i, d, l, m] = s;
|
|
10981
|
-
l && (m && r._OrtClearBoundOutputs(l.handle) !== 0 &&
|
|
10983
|
+
l && (m && r._OrtClearBoundOutputs(l.handle) !== 0 && $("Can't clear bound outputs."), r._OrtReleaseBinding(l.handle) !== 0 && $("Can't release IO binding.")), r.jsepOnReleaseSession?.(a), r.webnnOnReleaseSession?.(a), r.webgpuOnReleaseSession?.(a), i.forEach((y) => r._OrtFree(y)), d.forEach((y) => r._OrtFree(y)), r._OrtReleaseSession(f) !== 0 && $("Can't release session."), it.delete(a);
|
|
10982
10984
|
}, Ms = async (a, r, s, f, i, d, l = false) => {
|
|
10983
10985
|
if (!a) {
|
|
10984
10986
|
r.push(0);
|
|
10985
10987
|
return;
|
|
10986
10988
|
}
|
|
10987
|
-
let m =
|
|
10989
|
+
let m = z(), y = m.PTR_SIZE, w = a[0], T = a[1], g = a[3], v = g, S, C;
|
|
10988
10990
|
if (w === "string" && (g === "gpu-buffer" || g === "ml-tensor")) throw new Error("String tensor is not supported on GPU.");
|
|
10989
10991
|
if (l && g !== "gpu-buffer") throw new Error(`External buffer must be provided for input/output index ${d} when enableGraphCapture is true.`);
|
|
10990
10992
|
if (g === "gpu-buffer") {
|
|
@@ -11028,12 +11030,12 @@ var Kr = k(() => {
|
|
|
11028
11030
|
try {
|
|
11029
11031
|
T.forEach((M, Y) => m.setValue(H + Y * y, M, y === 4 ? "i32" : "i64"));
|
|
11030
11032
|
let U = m._OrtCreateTensor(He(w), S, C, H, T.length, an(v));
|
|
11031
|
-
U === 0 &&
|
|
11033
|
+
U === 0 && $(`Can't create tensor for input/output. session=${f}, index=${d}.`), r.push(U);
|
|
11032
11034
|
} finally {
|
|
11033
11035
|
m.stackRestore(R);
|
|
11034
11036
|
}
|
|
11035
11037
|
}, Kt = async (a, r, s, f, i, d) => {
|
|
11036
|
-
let l =
|
|
11038
|
+
let l = z(), m = l.PTR_SIZE, y = it.get(a);
|
|
11037
11039
|
if (!y) throw new Error(`cannot run inference. invalid session id: ${a}`);
|
|
11038
11040
|
let w = y[0], T = y[1], g = y[2], v = y[3], S = y[4], C = y[5], R = r.length, H = f.length, U = 0, M = [], Y = [], L = [], W = [], oe = [], p = l.stackSave(), ne = l.stackAlloc(R * m), X = l.stackAlloc(R * m), J = l.stackAlloc(H * m), Ue = l.stackAlloc(H * m);
|
|
11039
11041
|
try {
|
|
@@ -11049,33 +11051,33 @@ var Kr = k(() => {
|
|
|
11049
11051
|
$e("wasm bindInputsOutputs");
|
|
11050
11052
|
for (let q = 0; q < R; q++) {
|
|
11051
11053
|
let we = r[q];
|
|
11052
|
-
await l._OrtBindInput(_, T[we], Y[q]) !== 0 &&
|
|
11054
|
+
await l._OrtBindInput(_, T[we], Y[q]) !== 0 && $(`Can't bind input[${q}] for session=${a}.`);
|
|
11053
11055
|
}
|
|
11054
11056
|
for (let q = 0; q < H; q++) {
|
|
11055
11057
|
let we = f[q];
|
|
11056
|
-
i[q]?.[3] ? (oe.push(L[q]), l._OrtBindOutput(_, g[we], L[q], 0) !== 0 &&
|
|
11058
|
+
i[q]?.[3] ? (oe.push(L[q]), l._OrtBindOutput(_, g[we], L[q], 0) !== 0 && $(`Can't bind pre-allocated output[${q}] for session=${a}.`)) : l._OrtBindOutput(_, g[we], 0, pe[we]) !== 0 && $(`Can't bind output[${q}] to ${ae[q]} for session=${a}.`);
|
|
11057
11059
|
}
|
|
11058
11060
|
ze("wasm bindInputsOutputs"), it.set(a, [w, T, g, v, S, true]);
|
|
11059
11061
|
}
|
|
11060
11062
|
l.jsepOnRunStart?.(w), l.webnnOnRunStart?.(w);
|
|
11061
11063
|
let Q;
|
|
11062
|
-
v ? Q = await l._OrtRunWithBinding(w, v.handle, H, J, U) : Q = await l._OrtRun(w, X, ne, R, Ue, H, J, U), Q !== 0 &&
|
|
11064
|
+
v ? Q = await l._OrtRunWithBinding(w, v.handle, H, J, U) : Q = await l._OrtRun(w, X, ne, R, Ue, H, J, U), Q !== 0 && $("failed to call OrtRun().");
|
|
11063
11065
|
let x = [], A = [];
|
|
11064
11066
|
$e("wasm ProcessOutputTensor");
|
|
11065
11067
|
for (let _ = 0; _ < H; _++) {
|
|
11066
11068
|
let ae = Number(l.getValue(J + _ * m, "*"));
|
|
11067
11069
|
if (ae === L[_] || oe.includes(L[_])) {
|
|
11068
|
-
x.push(i[_]), ae !== L[_] && l._OrtReleaseTensor(ae) !== 0 &&
|
|
11070
|
+
x.push(i[_]), ae !== L[_] && l._OrtReleaseTensor(ae) !== 0 && $("Can't release tensor.");
|
|
11069
11071
|
continue;
|
|
11070
11072
|
}
|
|
11071
11073
|
let pe = l.stackSave(), q = l.stackAlloc(4 * m), we = false, re, se = 0;
|
|
11072
11074
|
try {
|
|
11073
|
-
l._OrtGetTensorData(ae, q, q + m, q + 2 * m, q + 3 * m) !== 0 &&
|
|
11075
|
+
l._OrtGetTensorData(ae, q, q + m, q + 2 * m, q + 3 * m) !== 0 && $(`Can't access output tensor data on index ${_}.`);
|
|
11074
11076
|
let Te = m === 4 ? "i32" : "i64", Ye = Number(l.getValue(q, Te));
|
|
11075
11077
|
se = l.getValue(q + m, "*");
|
|
11076
11078
|
let bt = l.getValue(q + m * 2, "*"), wt = Number(l.getValue(q + m * 3, Te)), Se = [];
|
|
11077
11079
|
for (let ee = 0; ee < wt; ee++) Se.push(Number(l.getValue(bt + ee * m, Te)));
|
|
11078
|
-
l._OrtFree(bt) !== 0 &&
|
|
11080
|
+
l._OrtFree(bt) !== 0 && $("Can't free memory for tensor dims.");
|
|
11079
11081
|
let Ae = Se.reduce((ee, Z) => ee * Z, 1);
|
|
11080
11082
|
re = or(Ye);
|
|
11081
11083
|
let Oe = v?.outputPreferredLocations[f[_]];
|
|
@@ -11083,24 +11085,24 @@ var Kr = k(() => {
|
|
|
11083
11085
|
if (Oe === "gpu-buffer" || Oe === "ml-tensor") throw new Error("String tensor is not supported on GPU.");
|
|
11084
11086
|
let ee = [];
|
|
11085
11087
|
for (let Z = 0; Z < Ae; Z++) {
|
|
11086
|
-
let
|
|
11087
|
-
ee.push(l.UTF8ToString(
|
|
11088
|
+
let G = l.getValue(se + Z * m, "*"), V = l.getValue(se + (Z + 1) * m, "*"), qe = Z === Ae - 1 ? void 0 : V - G;
|
|
11089
|
+
ee.push(l.UTF8ToString(G, qe));
|
|
11088
11090
|
}
|
|
11089
11091
|
x.push([re, Se, ee, "cpu"]);
|
|
11090
11092
|
} else if (Oe === "gpu-buffer" && Ae > 0) {
|
|
11091
11093
|
let ee = l.webgpuGetBuffer;
|
|
11092
11094
|
if (!ee) throw new Error('preferredLocation "gpu-buffer" is not supported without using WebGPU.');
|
|
11093
|
-
let Z = ee(se),
|
|
11094
|
-
if (
|
|
11095
|
+
let Z = ee(se), G = mt(Ye, Ae);
|
|
11096
|
+
if (G === void 0 || !ar(re)) throw new Error(`Unsupported data type: ${re}`);
|
|
11095
11097
|
we = true;
|
|
11096
11098
|
{
|
|
11097
11099
|
l.webgpuRegisterBuffer(Z, a, se);
|
|
11098
|
-
let V = l.webgpuCreateDownloader(Z,
|
|
11100
|
+
let V = l.webgpuCreateDownloader(Z, G, a);
|
|
11099
11101
|
x.push([re, Se, { gpuBuffer: Z, download: async () => {
|
|
11100
11102
|
let qe = await V();
|
|
11101
11103
|
return new (at(re))(qe);
|
|
11102
11104
|
}, dispose: () => {
|
|
11103
|
-
l._OrtReleaseTensor(ae) !== 0 &&
|
|
11105
|
+
l._OrtReleaseTensor(ae) !== 0 && $("Can't release tensor.");
|
|
11104
11106
|
} }, "gpu-buffer"]);
|
|
11105
11107
|
}
|
|
11106
11108
|
} else if (Oe === "ml-tensor" && Ae > 0) {
|
|
@@ -11115,8 +11117,8 @@ var Kr = k(() => {
|
|
|
11115
11117
|
} else if (Oe === "ml-tensor-cpu-output" && Ae > 0) {
|
|
11116
11118
|
let ee = l.webnnCreateMLTensorDownloader(se, re)(), Z = x.length;
|
|
11117
11119
|
we = true, A.push((async () => {
|
|
11118
|
-
let
|
|
11119
|
-
return l.webnnReleaseTensorId(se), l._OrtReleaseTensor(ae),
|
|
11120
|
+
let G = [Z, await ee];
|
|
11121
|
+
return l.webnnReleaseTensorId(se), l._OrtReleaseTensor(ae), G;
|
|
11120
11122
|
})()), x.push([re, Se, [], "cpu"]);
|
|
11121
11123
|
} else {
|
|
11122
11124
|
let ee = at(re), Z = new ee(Ae);
|
|
@@ -11126,7 +11128,7 @@ var Kr = k(() => {
|
|
|
11126
11128
|
l.stackRestore(pe), re === "string" && se && l._free(se), we || l._OrtReleaseTensor(ae);
|
|
11127
11129
|
}
|
|
11128
11130
|
}
|
|
11129
|
-
v && !S && (l._OrtClearBoundOutputs(v.handle) !== 0 &&
|
|
11131
|
+
v && !S && (l._OrtClearBoundOutputs(v.handle) !== 0 && $("Can't clear bound outputs."), it.set(a, [w, T, g, v, S, false]));
|
|
11130
11132
|
for (let [_, ae] of await Promise.all(A)) x[_][2] = ae;
|
|
11131
11133
|
return ze("wasm ProcessOutputTensor"), x;
|
|
11132
11134
|
} finally {
|
|
@@ -11137,10 +11139,10 @@ var Kr = k(() => {
|
|
|
11137
11139
|
}), Y.forEach((Q) => l._OrtReleaseTensor(Q)), L.forEach((Q) => l._OrtReleaseTensor(Q)), W.forEach((Q) => l._free(Q)), U !== 0 && l._OrtReleaseRunOptions(U), M.forEach((Q) => l._free(Q));
|
|
11138
11140
|
}
|
|
11139
11141
|
}, er = (a) => {
|
|
11140
|
-
let r =
|
|
11142
|
+
let r = z(), s = it.get(a);
|
|
11141
11143
|
if (!s) throw new Error("invalid session id");
|
|
11142
11144
|
let f = s[0], i = r._OrtEndProfiling(f);
|
|
11143
|
-
i === 0 &&
|
|
11145
|
+
i === 0 && $("Can't get an profile file name."), r._OrtFree(i);
|
|
11144
11146
|
}, tr = (a) => {
|
|
11145
11147
|
let r = [];
|
|
11146
11148
|
for (let s of a) {
|
|
@@ -11373,7 +11375,7 @@ var $s = k(() => {
|
|
|
11373
11375
|
Ve();
|
|
11374
11376
|
Ve();
|
|
11375
11377
|
Ve();
|
|
11376
|
-
var Xa = "1.25.0-dev.
|
|
11378
|
+
var Xa = "1.25.0-dev.20260323-a99aad9d36";
|
|
11377
11379
|
var Tl = Zr;
|
|
11378
11380
|
{
|
|
11379
11381
|
let a = ($s(), $t(Gs)).wasmBackend;
|
|
@@ -15729,7 +15731,9 @@ var processors_exports = {};
|
|
|
15729
15731
|
__export(processors_exports, {
|
|
15730
15732
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
15731
15733
|
Florence2Processor: () => Florence2Processor,
|
|
15734
|
+
Gemma3Processor: () => Gemma3Processor,
|
|
15732
15735
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
15736
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
15733
15737
|
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
15734
15738
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
15735
15739
|
Idefics3Processor: () => Idefics3Processor,
|
|
@@ -18234,26 +18238,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
18234
18238
|
}
|
|
18235
18239
|
return [segmentation, segments];
|
|
18236
18240
|
}
|
|
18237
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
18241
|
+
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
|
|
18238
18242
|
if (height < factor || width < factor) {
|
|
18239
|
-
|
|
18240
|
-
|
|
18243
|
+
const scale = Math.max(factor / height, factor / width);
|
|
18244
|
+
height = Math.round(height * scale);
|
|
18245
|
+
width = Math.round(width * scale);
|
|
18246
|
+
}
|
|
18247
|
+
if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
18241
18248
|
throw new Error(
|
|
18242
18249
|
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
18243
18250
|
);
|
|
18244
18251
|
}
|
|
18245
18252
|
let h_bar = Math.round(height / factor) * factor;
|
|
18246
18253
|
let w_bar = Math.round(width / factor) * factor;
|
|
18247
|
-
if (h_bar * w_bar > max_pixels) {
|
|
18248
|
-
const beta = Math.sqrt(height * width / max_pixels);
|
|
18249
|
-
h_bar = Math.floor(height / beta / factor) * factor;
|
|
18250
|
-
w_bar = Math.floor(width / beta / factor) * factor;
|
|
18251
|
-
} else if (h_bar * w_bar < min_pixels) {
|
|
18252
|
-
const beta = Math.sqrt(min_pixels / (height * width));
|
|
18254
|
+
if (temporal_factor * h_bar * w_bar > max_pixels) {
|
|
18255
|
+
const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
|
|
18256
|
+
h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
|
|
18257
|
+
w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
|
|
18258
|
+
} else if (temporal_factor * h_bar * w_bar < min_pixels) {
|
|
18259
|
+
const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
|
|
18253
18260
|
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
18254
18261
|
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
18255
18262
|
}
|
|
18256
|
-
return [
|
|
18263
|
+
return [w_bar, h_bar];
|
|
18257
18264
|
}
|
|
18258
18265
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
18259
18266
|
if (label_ids_to_fuse === null) {
|
|
@@ -18332,7 +18339,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18332
18339
|
this.do_pad = config.do_pad;
|
|
18333
18340
|
this.min_pixels = config.min_pixels;
|
|
18334
18341
|
this.max_pixels = config.max_pixels;
|
|
18335
|
-
if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
18342
|
+
if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
18336
18343
|
this.pad_size = this.size;
|
|
18337
18344
|
}
|
|
18338
18345
|
this.do_flip_channel_order = config.do_flip_channel_order ?? false;
|
|
@@ -18620,10 +18627,8 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18620
18627
|
const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
|
|
18621
18628
|
[pixelData, imgDims] = padded;
|
|
18622
18629
|
} else if (this.size_divisibility) {
|
|
18623
|
-
const
|
|
18624
|
-
|
|
18625
|
-
this.size_divisibility
|
|
18626
|
-
);
|
|
18630
|
+
const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
|
|
18631
|
+
const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
|
|
18627
18632
|
[pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
|
|
18628
18633
|
}
|
|
18629
18634
|
}
|
|
@@ -18700,6 +18705,7 @@ var image_processors_exports = {};
|
|
|
18700
18705
|
__export(image_processors_exports, {
|
|
18701
18706
|
BeitFeatureExtractor: () => BeitFeatureExtractor,
|
|
18702
18707
|
BitImageProcessor: () => BitImageProcessor,
|
|
18708
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
18703
18709
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
18704
18710
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
18705
18711
|
ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
|
|
@@ -18716,6 +18722,8 @@ __export(image_processors_exports, {
|
|
|
18716
18722
|
DonutImageProcessor: () => DonutImageProcessor,
|
|
18717
18723
|
EfficientNetImageProcessor: () => EfficientNetImageProcessor,
|
|
18718
18724
|
GLPNFeatureExtractor: () => GLPNFeatureExtractor,
|
|
18725
|
+
Gemma3ImageProcessor: () => Gemma3ImageProcessor,
|
|
18726
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
18719
18727
|
GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
|
|
18720
18728
|
Idefics3ImageProcessor: () => Idefics3ImageProcessor,
|
|
18721
18729
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
@@ -18776,6 +18784,10 @@ var BitImageProcessor = class extends ImageProcessor {
|
|
|
18776
18784
|
var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
|
|
18777
18785
|
};
|
|
18778
18786
|
|
|
18787
|
+
// src/models/chmv2/image_processing_chmv2.js
|
|
18788
|
+
var CHMv2ImageProcessor = class extends ImageProcessor {
|
|
18789
|
+
};
|
|
18790
|
+
|
|
18779
18791
|
// src/models/clip/image_processing_clip.js
|
|
18780
18792
|
var CLIPImageProcessor = class extends ImageProcessor {
|
|
18781
18793
|
};
|
|
@@ -18895,6 +18907,69 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
|
|
|
18895
18907
|
}
|
|
18896
18908
|
};
|
|
18897
18909
|
|
|
18910
|
+
// src/models/gemma3/image_processing_gemma3.js
|
|
18911
|
+
var Gemma3ImageProcessor = class extends ImageProcessor {
|
|
18912
|
+
};
|
|
18913
|
+
|
|
18914
|
+
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
18915
|
+
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
18916
|
+
constructor(config) {
|
|
18917
|
+
super(config);
|
|
18918
|
+
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
18919
|
+
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
18920
|
+
this.patch_size = config.patch_size;
|
|
18921
|
+
this.merge_size = config.merge_size;
|
|
18922
|
+
}
|
|
18923
|
+
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
18924
|
+
get_resize_output_image_size(image, size) {
|
|
18925
|
+
const factor = this.patch_size * this.merge_size;
|
|
18926
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
18927
|
+
}
|
|
18928
|
+
async _call(images, ...args) {
|
|
18929
|
+
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
18930
|
+
let patches = pixel_values;
|
|
18931
|
+
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
18932
|
+
if (patches.dims[0] === 1) {
|
|
18933
|
+
patches = cat(
|
|
18934
|
+
Array.from({ length: temporal_patch_size }, () => patches),
|
|
18935
|
+
0
|
|
18936
|
+
);
|
|
18937
|
+
}
|
|
18938
|
+
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
18939
|
+
const channel = patches.dims[1];
|
|
18940
|
+
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
18941
|
+
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
18942
|
+
const flatten_patches = patches.view(
|
|
18943
|
+
grid_t,
|
|
18944
|
+
temporal_patch_size,
|
|
18945
|
+
channel,
|
|
18946
|
+
Math.floor(grid_h / merge_size),
|
|
18947
|
+
merge_size,
|
|
18948
|
+
patch_size,
|
|
18949
|
+
Math.floor(grid_w / merge_size),
|
|
18950
|
+
merge_size,
|
|
18951
|
+
patch_size
|
|
18952
|
+
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
18953
|
+
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
18954
|
+
return {
|
|
18955
|
+
pixel_values: flatten_patches,
|
|
18956
|
+
image_grid_thw,
|
|
18957
|
+
original_sizes,
|
|
18958
|
+
reshaped_input_sizes
|
|
18959
|
+
};
|
|
18960
|
+
}
|
|
18961
|
+
};
|
|
18962
|
+
|
|
18963
|
+
// src/models/glm46v/image_processing_glm46v.js
|
|
18964
|
+
var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
|
|
18965
|
+
/** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
|
|
18966
|
+
get_resize_output_image_size(image, size) {
|
|
18967
|
+
const factor = this.patch_size * this.merge_size;
|
|
18968
|
+
const temporal_factor = this.config.temporal_patch_size ?? 2;
|
|
18969
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
|
|
18970
|
+
}
|
|
18971
|
+
};
|
|
18972
|
+
|
|
18898
18973
|
// src/models/glpn/image_processing_glpn.js
|
|
18899
18974
|
var GLPNFeatureExtractor = class extends ImageProcessor {
|
|
18900
18975
|
};
|
|
@@ -19288,7 +19363,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
|
|
|
19288
19363
|
const img = pixel_values.unsqueeze_(0);
|
|
19289
19364
|
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
19290
19365
|
const f2 = total_factor ** 2;
|
|
19291
|
-
const [
|
|
19366
|
+
const [new_width, new_height] = smart_resize(
|
|
19292
19367
|
Math.max(total_factor, height),
|
|
19293
19368
|
Math.max(total_factor, width),
|
|
19294
19369
|
total_factor,
|
|
@@ -19578,55 +19653,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
|
|
|
19578
19653
|
var PvtImageProcessor = class extends ImageProcessor {
|
|
19579
19654
|
};
|
|
19580
19655
|
|
|
19581
|
-
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
19582
|
-
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
19583
|
-
constructor(config) {
|
|
19584
|
-
super(config);
|
|
19585
|
-
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
19586
|
-
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
19587
|
-
this.patch_size = config.patch_size;
|
|
19588
|
-
this.merge_size = config.merge_size;
|
|
19589
|
-
}
|
|
19590
|
-
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
19591
|
-
get_resize_output_image_size(image, size) {
|
|
19592
|
-
const factor = this.patch_size * this.merge_size;
|
|
19593
|
-
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
19594
|
-
}
|
|
19595
|
-
async _call(images, ...args) {
|
|
19596
|
-
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
19597
|
-
let patches = pixel_values;
|
|
19598
|
-
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
19599
|
-
if (patches.dims[0] === 1) {
|
|
19600
|
-
patches = cat(
|
|
19601
|
-
Array.from({ length: temporal_patch_size }, () => patches),
|
|
19602
|
-
0
|
|
19603
|
-
);
|
|
19604
|
-
}
|
|
19605
|
-
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
19606
|
-
const channel = patches.dims[1];
|
|
19607
|
-
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
19608
|
-
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
19609
|
-
const flatten_patches = patches.view(
|
|
19610
|
-
grid_t,
|
|
19611
|
-
temporal_patch_size,
|
|
19612
|
-
channel,
|
|
19613
|
-
Math.floor(grid_h / merge_size),
|
|
19614
|
-
merge_size,
|
|
19615
|
-
patch_size,
|
|
19616
|
-
Math.floor(grid_w / merge_size),
|
|
19617
|
-
merge_size,
|
|
19618
|
-
patch_size
|
|
19619
|
-
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
19620
|
-
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
19621
|
-
return {
|
|
19622
|
-
pixel_values: flatten_patches,
|
|
19623
|
-
image_grid_thw,
|
|
19624
|
-
original_sizes,
|
|
19625
|
-
reshaped_input_sizes
|
|
19626
|
-
};
|
|
19627
|
-
}
|
|
19628
|
-
};
|
|
19629
|
-
|
|
19630
19656
|
// src/models/rt_detr/image_processing_rt_detr.js
|
|
19631
19657
|
var RTDetrImageProcessor = class extends ImageProcessor {
|
|
19632
19658
|
/** @type {typeof post_process_object_detection} */
|
|
@@ -20108,6 +20134,48 @@ var Florence2Processor = class extends Processor {
|
|
|
20108
20134
|
}
|
|
20109
20135
|
};
|
|
20110
20136
|
|
|
20137
|
+
// src/models/gemma3/processing_gemma3.js
|
|
20138
|
+
var Gemma3Processor = class extends Processor {
|
|
20139
|
+
static tokenizer_class = AutoTokenizer;
|
|
20140
|
+
static image_processor_class = AutoImageProcessor;
|
|
20141
|
+
static uses_processor_config = true;
|
|
20142
|
+
static uses_chat_template_file = true;
|
|
20143
|
+
constructor(config, components, chat_template) {
|
|
20144
|
+
super(config, components, chat_template);
|
|
20145
|
+
this.image_seq_length = this.config.image_seq_length;
|
|
20146
|
+
const { boi_token, image_token, eoi_token } = this.tokenizer.config;
|
|
20147
|
+
this.boi_token = boi_token;
|
|
20148
|
+
this.image_token = image_token;
|
|
20149
|
+
this.eoi_token = eoi_token;
|
|
20150
|
+
const image_tokens_expanded = image_token.repeat(this.image_seq_length);
|
|
20151
|
+
this.full_image_sequence = `
|
|
20152
|
+
|
|
20153
|
+
${boi_token}${image_tokens_expanded}${eoi_token}
|
|
20154
|
+
|
|
20155
|
+
`;
|
|
20156
|
+
}
|
|
20157
|
+
/**
|
|
20158
|
+
* @param {string|string[]} text
|
|
20159
|
+
* @param {import('../../utils/image.js').RawImage|import('../../utils/image.js').RawImage[]} [images]
|
|
20160
|
+
* @param {Object} [options]
|
|
20161
|
+
*/
|
|
20162
|
+
async _call(text, images = null, options = {}) {
|
|
20163
|
+
if (typeof text === "string") {
|
|
20164
|
+
text = [text];
|
|
20165
|
+
}
|
|
20166
|
+
let image_inputs;
|
|
20167
|
+
if (images) {
|
|
20168
|
+
image_inputs = await this.image_processor(images, options);
|
|
20169
|
+
text = text.map((prompt) => prompt.replaceAll(this.boi_token, this.full_image_sequence));
|
|
20170
|
+
}
|
|
20171
|
+
const text_inputs = this.tokenizer(text, options);
|
|
20172
|
+
return {
|
|
20173
|
+
...text_inputs,
|
|
20174
|
+
...image_inputs
|
|
20175
|
+
};
|
|
20176
|
+
}
|
|
20177
|
+
};
|
|
20178
|
+
|
|
20111
20179
|
// src/models/gemma3n/processing_gemma3n.js
|
|
20112
20180
|
var Gemma3nProcessor = class extends Processor {
|
|
20113
20181
|
static image_processor_class = AutoImageProcessor;
|
|
@@ -20180,6 +20248,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
20180
20248
|
}
|
|
20181
20249
|
};
|
|
20182
20250
|
|
|
20251
|
+
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
20252
|
+
var Qwen2VLProcessor = class extends Processor {
|
|
20253
|
+
static image_processor_class = AutoImageProcessor;
|
|
20254
|
+
static tokenizer_class = AutoTokenizer;
|
|
20255
|
+
static image_token = "<|image_pad|>";
|
|
20256
|
+
/**
|
|
20257
|
+
*
|
|
20258
|
+
* @param {string|string[]} text
|
|
20259
|
+
* @param {RawImage|RawImage[]} images
|
|
20260
|
+
* @param {...any} args
|
|
20261
|
+
* @returns {Promise<any>}
|
|
20262
|
+
*/
|
|
20263
|
+
async _call(text, images = null, ...args) {
|
|
20264
|
+
if (!Array.isArray(text)) {
|
|
20265
|
+
text = [text];
|
|
20266
|
+
}
|
|
20267
|
+
let image_inputs, image_grid_thw;
|
|
20268
|
+
if (images) {
|
|
20269
|
+
image_inputs = await this.image_processor(images);
|
|
20270
|
+
image_grid_thw = image_inputs.image_grid_thw;
|
|
20271
|
+
}
|
|
20272
|
+
if (image_grid_thw) {
|
|
20273
|
+
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
20274
|
+
let index = 0;
|
|
20275
|
+
const image_token = (
|
|
20276
|
+
/** @type {typeof Qwen2VLProcessor} */
|
|
20277
|
+
this.constructor.image_token
|
|
20278
|
+
);
|
|
20279
|
+
const image_grid_thw_list = image_grid_thw.tolist();
|
|
20280
|
+
text = text.map((t) => {
|
|
20281
|
+
while (t.includes(image_token)) {
|
|
20282
|
+
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
20283
|
+
t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
20284
|
+
}
|
|
20285
|
+
return t.replaceAll("<|placeholder|>", image_token);
|
|
20286
|
+
});
|
|
20287
|
+
}
|
|
20288
|
+
const text_inputs = this.tokenizer(text);
|
|
20289
|
+
return {
|
|
20290
|
+
...text_inputs,
|
|
20291
|
+
...image_inputs
|
|
20292
|
+
};
|
|
20293
|
+
}
|
|
20294
|
+
};
|
|
20295
|
+
|
|
20296
|
+
// src/models/glm46v/processing_glm46v.js
|
|
20297
|
+
var Glm46VProcessor = class extends Qwen2VLProcessor {
|
|
20298
|
+
static image_token = "<|image|>";
|
|
20299
|
+
};
|
|
20300
|
+
|
|
20183
20301
|
// src/models/granite_speech/processing_granite_speech.js
|
|
20184
20302
|
var GraniteSpeechProcessor = class extends Processor {
|
|
20185
20303
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -20910,47 +21028,6 @@ var PyAnnoteProcessor = class extends Processor {
|
|
|
20910
21028
|
}
|
|
20911
21029
|
};
|
|
20912
21030
|
|
|
20913
|
-
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
20914
|
-
var Qwen2VLProcessor = class extends Processor {
|
|
20915
|
-
static image_processor_class = AutoImageProcessor;
|
|
20916
|
-
static tokenizer_class = AutoTokenizer;
|
|
20917
|
-
/**
|
|
20918
|
-
*
|
|
20919
|
-
* @param {string|string[]} text
|
|
20920
|
-
* @param {RawImage|RawImage[]} images
|
|
20921
|
-
* @param {...any} args
|
|
20922
|
-
* @returns {Promise<any>}
|
|
20923
|
-
*/
|
|
20924
|
-
async _call(text, images = null, ...args) {
|
|
20925
|
-
if (!Array.isArray(text)) {
|
|
20926
|
-
text = [text];
|
|
20927
|
-
}
|
|
20928
|
-
let image_inputs, image_grid_thw;
|
|
20929
|
-
if (images) {
|
|
20930
|
-
image_inputs = await this.image_processor(images);
|
|
20931
|
-
image_grid_thw = image_inputs.image_grid_thw;
|
|
20932
|
-
}
|
|
20933
|
-
if (image_grid_thw) {
|
|
20934
|
-
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
20935
|
-
let index = 0;
|
|
20936
|
-
const image_grid_thw_list = image_grid_thw.tolist();
|
|
20937
|
-
text = text.map((t) => {
|
|
20938
|
-
while (t.includes("<|image_pad|>")) {
|
|
20939
|
-
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
20940
|
-
t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
20941
|
-
}
|
|
20942
|
-
return t.replaceAll("<|placeholder|>", "<|image_pad|>");
|
|
20943
|
-
});
|
|
20944
|
-
}
|
|
20945
|
-
const text_inputs = this.tokenizer(text);
|
|
20946
|
-
return {
|
|
20947
|
-
...text_inputs,
|
|
20948
|
-
...image_inputs
|
|
20949
|
-
// TODO: ...videos_inputs,
|
|
20950
|
-
};
|
|
20951
|
-
}
|
|
20952
|
-
};
|
|
20953
|
-
|
|
20954
21031
|
// src/models/qwen2_5_vl/processing_qwen2_5_vl.js
|
|
20955
21032
|
var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
|
|
20956
21033
|
};
|
|
@@ -21294,6 +21371,8 @@ function getNormalizedConfig(config) {
|
|
|
21294
21371
|
case "gemma3n":
|
|
21295
21372
|
case "lfm2_vl":
|
|
21296
21373
|
case "chatterbox":
|
|
21374
|
+
case "lighton_ocr":
|
|
21375
|
+
case "glm_ocr":
|
|
21297
21376
|
case "mistral3":
|
|
21298
21377
|
case "qwen2_5_vl":
|
|
21299
21378
|
case "qwen3_vl":
|
|
@@ -21369,6 +21448,8 @@ function getNormalizedConfig(config) {
|
|
|
21369
21448
|
mapping["dim_kv"] = "head_dim";
|
|
21370
21449
|
break;
|
|
21371
21450
|
case "qwen3":
|
|
21451
|
+
case "solar_open":
|
|
21452
|
+
case "glm_ocr_text":
|
|
21372
21453
|
case "gemma":
|
|
21373
21454
|
case "gemma2":
|
|
21374
21455
|
case "vaultgemma":
|
|
@@ -21379,6 +21460,7 @@ function getNormalizedConfig(config) {
|
|
|
21379
21460
|
case "ernie4_5":
|
|
21380
21461
|
case "hunyuan_v1_dense":
|
|
21381
21462
|
case "falcon_h1":
|
|
21463
|
+
case "nemotron_h":
|
|
21382
21464
|
case "ministral":
|
|
21383
21465
|
case "ministral3":
|
|
21384
21466
|
mapping["num_heads"] = "num_key_value_heads";
|
|
@@ -21413,6 +21495,9 @@ function getNormalizedConfig(config) {
|
|
|
21413
21495
|
mapping["num_attention_heads"] = "num_attention_heads";
|
|
21414
21496
|
break;
|
|
21415
21497
|
case "youtu":
|
|
21498
|
+
case "deepseek_v3":
|
|
21499
|
+
case "glm_moe_dsa":
|
|
21500
|
+
case "mistral4":
|
|
21416
21501
|
mapping["num_heads"] = "num_key_value_heads";
|
|
21417
21502
|
mapping["num_layers"] = "num_hidden_layers";
|
|
21418
21503
|
mapping["dim_kv"] = "qk_head_dim";
|
|
@@ -21501,6 +21586,7 @@ function getCacheShapes(config, options) {
|
|
|
21501
21586
|
if (!(config instanceof PretrainedConfig)) {
|
|
21502
21587
|
config = new PretrainedConfig(config);
|
|
21503
21588
|
}
|
|
21589
|
+
const batch_size = options?.batch_size ?? 1;
|
|
21504
21590
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
21505
21591
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
21506
21592
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -21510,7 +21596,6 @@ function getCacheShapes(config, options) {
|
|
|
21510
21596
|
config
|
|
21511
21597
|
);
|
|
21512
21598
|
const head_dim = hidden_size / num_attention_heads;
|
|
21513
|
-
const batch_size = options?.batch_size ?? 1;
|
|
21514
21599
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
21515
21600
|
if (layer_types[i] === "full_attention") {
|
|
21516
21601
|
for (const kv of ["key", "value"]) {
|
|
@@ -21523,31 +21608,26 @@ function getCacheShapes(config, options) {
|
|
|
21523
21608
|
}
|
|
21524
21609
|
}
|
|
21525
21610
|
return cache_values;
|
|
21526
|
-
} else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
|
|
21611
|
+
} else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
|
|
21527
21612
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
21528
21613
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
21529
|
-
const
|
|
21530
|
-
const {
|
|
21531
|
-
layer_types,
|
|
21532
|
-
num_hidden_layers,
|
|
21533
|
-
num_attention_heads,
|
|
21534
|
-
num_key_value_heads,
|
|
21535
|
-
hidden_size,
|
|
21536
|
-
mamba_d_conv,
|
|
21537
|
-
mamba_n_heads,
|
|
21538
|
-
mamba_d_head,
|
|
21539
|
-
mamba_d_state,
|
|
21540
|
-
mamba_n_groups,
|
|
21541
|
-
mamba_expand,
|
|
21542
|
-
mamba_d_ssm
|
|
21543
|
-
} = (
|
|
21614
|
+
const c = (
|
|
21544
21615
|
/** @type {any} */
|
|
21545
21616
|
config
|
|
21546
21617
|
);
|
|
21547
|
-
const
|
|
21548
|
-
const
|
|
21549
|
-
const
|
|
21550
|
-
|
|
21618
|
+
const layer_types = c.layer_types ?? c.layers_block_type;
|
|
21619
|
+
const num_layers = c.num_hidden_layers ?? layer_types?.length;
|
|
21620
|
+
const num_key_value_heads = c.num_key_value_heads;
|
|
21621
|
+
const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
|
|
21622
|
+
const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
|
|
21623
|
+
const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
|
|
21624
|
+
const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
|
|
21625
|
+
const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
|
|
21626
|
+
const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
|
|
21627
|
+
const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
|
|
21628
|
+
const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
|
|
21629
|
+
const cache_values = {};
|
|
21630
|
+
for (let i = 0; i < num_layers; ++i) {
|
|
21551
21631
|
if (!layer_types || layer_types[i] === "mamba") {
|
|
21552
21632
|
cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
|
|
21553
21633
|
cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
|
|
@@ -21581,7 +21661,6 @@ function getCacheShapes(config, options) {
|
|
|
21581
21661
|
const key_dim = linear_key_head_dim * linear_num_key_heads;
|
|
21582
21662
|
const value_dim = linear_value_head_dim * linear_num_value_heads;
|
|
21583
21663
|
const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
|
|
21584
|
-
const batch_size = options?.batch_size ?? 1;
|
|
21585
21664
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
21586
21665
|
if (layer_types[i] === "full_attention") {
|
|
21587
21666
|
for (const kv of ["key", "value"]) {
|
|
@@ -23277,8 +23356,7 @@ var MODEL_TYPES = {
|
|
|
23277
23356
|
ImageAudioTextToText: 13,
|
|
23278
23357
|
Supertonic: 14,
|
|
23279
23358
|
Chatterbox: 15,
|
|
23280
|
-
|
|
23281
|
-
VoxtralRealtime: 17
|
|
23359
|
+
VoxtralRealtime: 16
|
|
23282
23360
|
};
|
|
23283
23361
|
var MODEL_TYPE_CONFIG = {
|
|
23284
23362
|
[MODEL_TYPES.DecoderOnly]: {
|
|
@@ -23335,12 +23413,12 @@ var MODEL_TYPE_CONFIG = {
|
|
|
23335
23413
|
can_generate: true,
|
|
23336
23414
|
forward: image_text_to_text_forward,
|
|
23337
23415
|
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
23338
|
-
sessions: (config) => {
|
|
23416
|
+
sessions: (config, options, textOnly) => {
|
|
23339
23417
|
const s = {
|
|
23340
23418
|
embed_tokens: "embed_tokens",
|
|
23341
|
-
vision_encoder: "vision_encoder",
|
|
23342
23419
|
decoder_model_merged: "decoder_model_merged"
|
|
23343
23420
|
};
|
|
23421
|
+
if (!textOnly) s["vision_encoder"] = "vision_encoder";
|
|
23344
23422
|
if (config.is_encoder_decoder) s["model"] = "encoder_model";
|
|
23345
23423
|
return s;
|
|
23346
23424
|
},
|
|
@@ -23362,12 +23440,17 @@ var MODEL_TYPE_CONFIG = {
|
|
|
23362
23440
|
[MODEL_TYPES.ImageAudioTextToText]: {
|
|
23363
23441
|
can_generate: true,
|
|
23364
23442
|
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
23365
|
-
sessions: () =>
|
|
23366
|
-
|
|
23367
|
-
|
|
23368
|
-
|
|
23369
|
-
|
|
23370
|
-
|
|
23443
|
+
sessions: (config, options, textOnly) => {
|
|
23444
|
+
const s = {
|
|
23445
|
+
embed_tokens: "embed_tokens",
|
|
23446
|
+
decoder_model_merged: "decoder_model_merged"
|
|
23447
|
+
};
|
|
23448
|
+
if (!textOnly) {
|
|
23449
|
+
s["audio_encoder"] = "audio_encoder";
|
|
23450
|
+
s["vision_encoder"] = "vision_encoder";
|
|
23451
|
+
}
|
|
23452
|
+
return s;
|
|
23453
|
+
},
|
|
23371
23454
|
optional_configs: { generation_config: "generation_config.json" }
|
|
23372
23455
|
},
|
|
23373
23456
|
[MODEL_TYPES.Phi3V]: {
|
|
@@ -23418,14 +23501,6 @@ var MODEL_TYPE_CONFIG = {
|
|
|
23418
23501
|
cache_sessions: { model: true },
|
|
23419
23502
|
optional_configs: { generation_config: "generation_config.json" }
|
|
23420
23503
|
},
|
|
23421
|
-
[MODEL_TYPES.MultimodalLanguageModelOnly]: {
|
|
23422
|
-
can_generate: true,
|
|
23423
|
-
forward: image_text_to_text_forward,
|
|
23424
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
23425
|
-
sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
|
|
23426
|
-
cache_sessions: { decoder_model_merged: true },
|
|
23427
|
-
optional_configs: { generation_config: "generation_config.json" }
|
|
23428
|
-
},
|
|
23429
23504
|
[MODEL_TYPES.VoxtralRealtime]: {
|
|
23430
23505
|
can_generate: true,
|
|
23431
23506
|
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
@@ -23451,6 +23526,19 @@ function getSessionsConfig(modelType, config, options = {}) {
|
|
|
23451
23526
|
optional_configs: typeConfig.optional_configs
|
|
23452
23527
|
};
|
|
23453
23528
|
}
|
|
23529
|
+
function resolveTypeConfig(modelName, config) {
|
|
23530
|
+
let modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
23531
|
+
let textOnly = false;
|
|
23532
|
+
const nativeArch = config?.architectures?.[0];
|
|
23533
|
+
if (nativeArch && nativeArch !== modelName && modelName?.endsWith("ForCausalLM") && nativeArch.endsWith("ForConditionalGeneration")) {
|
|
23534
|
+
const nativeType = MODEL_TYPE_MAPPING.get(nativeArch);
|
|
23535
|
+
if (nativeType !== void 0) {
|
|
23536
|
+
modelType = nativeType;
|
|
23537
|
+
textOnly = true;
|
|
23538
|
+
}
|
|
23539
|
+
}
|
|
23540
|
+
return { typeConfig: MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default, textOnly, modelType };
|
|
23541
|
+
}
|
|
23454
23542
|
var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
|
|
23455
23543
|
var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
|
|
23456
23544
|
var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
|
|
@@ -23470,8 +23558,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23470
23558
|
this.sessions = sessions;
|
|
23471
23559
|
this.configs = configs;
|
|
23472
23560
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor);
|
|
23473
|
-
const
|
|
23474
|
-
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
23561
|
+
const { typeConfig } = resolveTypeConfig(modelName, config);
|
|
23475
23562
|
this.can_generate = typeConfig.can_generate;
|
|
23476
23563
|
this._forward = typeConfig.forward;
|
|
23477
23564
|
this._prepare_inputs_for_generation = typeConfig.prepare_inputs;
|
|
@@ -23534,9 +23621,8 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23534
23621
|
session_options
|
|
23535
23622
|
};
|
|
23536
23623
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
|
|
23537
|
-
const modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
23538
23624
|
config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
|
|
23539
|
-
const typeConfig
|
|
23625
|
+
const { typeConfig, textOnly, modelType } = resolveTypeConfig(modelName, config);
|
|
23540
23626
|
if (modelType === void 0) {
|
|
23541
23627
|
const type = modelName ?? config?.model_type;
|
|
23542
23628
|
if (type !== "custom") {
|
|
@@ -23545,7 +23631,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23545
23631
|
);
|
|
23546
23632
|
}
|
|
23547
23633
|
}
|
|
23548
|
-
const sessions = typeConfig.sessions(config, options);
|
|
23634
|
+
const sessions = typeConfig.sessions(config, options, textOnly);
|
|
23549
23635
|
const promises = [
|
|
23550
23636
|
constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
|
|
23551
23637
|
];
|
|
@@ -24209,7 +24295,9 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24209
24295
|
"qwen3_5",
|
|
24210
24296
|
"qwen3_5_text",
|
|
24211
24297
|
"qwen3_5_moe",
|
|
24212
|
-
"qwen3_5_moe_text"
|
|
24298
|
+
"qwen3_5_moe_text",
|
|
24299
|
+
"glm_ocr",
|
|
24300
|
+
"glm_ocr_text"
|
|
24213
24301
|
].includes(self2.config.model_type)
|
|
24214
24302
|
) {
|
|
24215
24303
|
const { image_grid_thw, video_grid_thw } = kwargs;
|
|
@@ -24433,6 +24521,8 @@ __export(models_exports, {
|
|
|
24433
24521
|
BloomForCausalLM: () => BloomForCausalLM,
|
|
24434
24522
|
BloomModel: () => BloomModel,
|
|
24435
24523
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
24524
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
24525
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
24436
24526
|
CLIPModel: () => CLIPModel,
|
|
24437
24527
|
CLIPPreTrainedModel: () => CLIPPreTrainedModel,
|
|
24438
24528
|
CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
|
|
@@ -24507,6 +24597,9 @@ __export(models_exports, {
|
|
|
24507
24597
|
DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
|
|
24508
24598
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
24509
24599
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
24600
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
24601
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
24602
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
24510
24603
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
24511
24604
|
DeiTModel: () => DeiTModel,
|
|
24512
24605
|
DeiTPreTrainedModel: () => DeiTPreTrainedModel,
|
|
@@ -24552,6 +24645,11 @@ __export(models_exports, {
|
|
|
24552
24645
|
EsmForTokenClassification: () => EsmForTokenClassification,
|
|
24553
24646
|
EsmModel: () => EsmModel,
|
|
24554
24647
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
24648
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
24649
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
24650
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
24651
|
+
EuroBertModel: () => EuroBertModel,
|
|
24652
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
24555
24653
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
24556
24654
|
ExaoneModel: () => ExaoneModel,
|
|
24557
24655
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -24588,6 +24686,7 @@ __export(models_exports, {
|
|
|
24588
24686
|
Gemma2Model: () => Gemma2Model,
|
|
24589
24687
|
Gemma2PreTrainedModel: () => Gemma2PreTrainedModel,
|
|
24590
24688
|
Gemma3ForCausalLM: () => Gemma3ForCausalLM,
|
|
24689
|
+
Gemma3ForConditionalGeneration: () => Gemma3ForConditionalGeneration,
|
|
24591
24690
|
Gemma3Model: () => Gemma3Model,
|
|
24592
24691
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
24593
24692
|
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
@@ -24598,6 +24697,10 @@ __export(models_exports, {
|
|
|
24598
24697
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
24599
24698
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
24600
24699
|
GlmModel: () => GlmModel,
|
|
24700
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
24701
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
24702
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
24703
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
24601
24704
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
24602
24705
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
24603
24706
|
GptOssModel: () => GptOssModel,
|
|
@@ -24644,6 +24747,7 @@ __export(models_exports, {
|
|
|
24644
24747
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
24645
24748
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
24646
24749
|
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
24750
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
24647
24751
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
24648
24752
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
24649
24753
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -24693,6 +24797,9 @@ __export(models_exports, {
|
|
|
24693
24797
|
MimiEncoderOutput: () => MimiEncoderOutput,
|
|
24694
24798
|
MimiModel: () => MimiModel,
|
|
24695
24799
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
24800
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
24801
|
+
Mistral4Model: () => Mistral4Model,
|
|
24802
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
24696
24803
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
24697
24804
|
MistralModel: () => MistralModel,
|
|
24698
24805
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -24750,6 +24857,9 @@ __export(models_exports, {
|
|
|
24750
24857
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
24751
24858
|
NanoChatModel: () => NanoChatModel,
|
|
24752
24859
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
24860
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
24861
|
+
NemotronHModel: () => NemotronHModel,
|
|
24862
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
24753
24863
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
24754
24864
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
24755
24865
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -24887,6 +24997,9 @@ __export(models_exports, {
|
|
|
24887
24997
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
24888
24998
|
SnacModel: () => SnacModel,
|
|
24889
24999
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
25000
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
25001
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
25002
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
24890
25003
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
24891
25004
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
24892
25005
|
SpeechT5HifiGan: () => SpeechT5HifiGan,
|
|
@@ -25061,7 +25174,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
|
|
|
25061
25174
|
var ArceeForCausalLM = class extends ArceePreTrainedModel {
|
|
25062
25175
|
};
|
|
25063
25176
|
|
|
25064
|
-
// src/models/
|
|
25177
|
+
// src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
|
|
25065
25178
|
var ASTPreTrainedModel = class extends PreTrainedModel {
|
|
25066
25179
|
};
|
|
25067
25180
|
var ASTModel = class extends ASTPreTrainedModel {
|
|
@@ -25396,6 +25509,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
|
|
|
25396
25509
|
var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
|
|
25397
25510
|
};
|
|
25398
25511
|
|
|
25512
|
+
// src/models/chmv2/modeling_chmv2.js
|
|
25513
|
+
var CHMv2PreTrainedModel = class extends PreTrainedModel {
|
|
25514
|
+
};
|
|
25515
|
+
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
|
|
25516
|
+
};
|
|
25517
|
+
|
|
25399
25518
|
// src/models/clap/modeling_clap.js
|
|
25400
25519
|
var ClapPreTrainedModel = class extends PreTrainedModel {
|
|
25401
25520
|
};
|
|
@@ -25734,6 +25853,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
|
|
|
25734
25853
|
}
|
|
25735
25854
|
};
|
|
25736
25855
|
|
|
25856
|
+
// src/models/deepseek_v3/modeling_deepseek_v3.js
|
|
25857
|
+
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
|
|
25858
|
+
};
|
|
25859
|
+
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
|
|
25860
|
+
};
|
|
25861
|
+
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
|
|
25862
|
+
};
|
|
25863
|
+
|
|
25737
25864
|
// src/models/deberta_v2/modeling_deberta_v2.js
|
|
25738
25865
|
var DebertaV2PreTrainedModel = class extends PreTrainedModel {
|
|
25739
25866
|
};
|
|
@@ -26082,6 +26209,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
|
|
|
26082
26209
|
}
|
|
26083
26210
|
};
|
|
26084
26211
|
|
|
26212
|
+
// src/models/eurobert/modeling_eurobert.js
|
|
26213
|
+
var EuroBertPreTrainedModel = class extends PreTrainedModel {
|
|
26214
|
+
};
|
|
26215
|
+
var EuroBertModel = class extends EuroBertPreTrainedModel {
|
|
26216
|
+
};
|
|
26217
|
+
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
|
|
26218
|
+
/**
|
|
26219
|
+
* Calls the model on new inputs.
|
|
26220
|
+
*
|
|
26221
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
26222
|
+
* @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
|
|
26223
|
+
*/
|
|
26224
|
+
async _call(model_inputs) {
|
|
26225
|
+
return new MaskedLMOutput(await super._call(model_inputs));
|
|
26226
|
+
}
|
|
26227
|
+
};
|
|
26228
|
+
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
|
|
26229
|
+
/**
|
|
26230
|
+
* Calls the model on new inputs.
|
|
26231
|
+
*
|
|
26232
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
26233
|
+
* @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
|
|
26234
|
+
*/
|
|
26235
|
+
async _call(model_inputs) {
|
|
26236
|
+
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
26237
|
+
}
|
|
26238
|
+
};
|
|
26239
|
+
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
|
|
26240
|
+
/**
|
|
26241
|
+
* Calls the model on new inputs.
|
|
26242
|
+
*
|
|
26243
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
26244
|
+
* @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
|
|
26245
|
+
*/
|
|
26246
|
+
async _call(model_inputs) {
|
|
26247
|
+
return new TokenClassifierOutput(await super._call(model_inputs));
|
|
26248
|
+
}
|
|
26249
|
+
};
|
|
26250
|
+
|
|
26085
26251
|
// src/models/exaone/modeling_exaone.js
|
|
26086
26252
|
var ExaonePreTrainedModel = class extends PreTrainedModel {
|
|
26087
26253
|
};
|
|
@@ -26239,12 +26405,35 @@ var Gemma2Model = class extends Gemma2PreTrainedModel {
|
|
|
26239
26405
|
var Gemma2ForCausalLM = class extends Gemma2PreTrainedModel {
|
|
26240
26406
|
};
|
|
26241
26407
|
|
|
26408
|
+
// src/models/llava/modeling_llava.js
|
|
26409
|
+
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
26410
|
+
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
26411
|
+
};
|
|
26412
|
+
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
26413
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
26414
|
+
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
26415
|
+
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
26416
|
+
return default_merge_input_ids_with_image_features({
|
|
26417
|
+
// @ts-ignore
|
|
26418
|
+
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
26419
|
+
...kwargs,
|
|
26420
|
+
image_features: reshaped_image_hidden_states
|
|
26421
|
+
});
|
|
26422
|
+
}
|
|
26423
|
+
};
|
|
26424
|
+
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
26425
|
+
};
|
|
26426
|
+
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
26427
|
+
};
|
|
26428
|
+
|
|
26242
26429
|
// src/models/gemma3/modeling_gemma3.js
|
|
26243
26430
|
var Gemma3PreTrainedModel = class extends PreTrainedModel {
|
|
26244
26431
|
};
|
|
26245
26432
|
var Gemma3Model = class extends Gemma3PreTrainedModel {
|
|
26246
26433
|
};
|
|
26247
|
-
var
|
|
26434
|
+
var Gemma3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
26435
|
+
};
|
|
26436
|
+
var Gemma3ForCausalLM = class extends Gemma3ForConditionalGeneration {
|
|
26248
26437
|
};
|
|
26249
26438
|
|
|
26250
26439
|
// src/models/gemma3n/modeling_gemma3n.js
|
|
@@ -26357,6 +26546,382 @@ var GlmModel = class extends GlmPreTrainedModel {
|
|
|
26357
26546
|
var GlmForCausalLM = class extends GlmPreTrainedModel {
|
|
26358
26547
|
};
|
|
26359
26548
|
|
|
26549
|
+
// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
|
|
26550
|
+
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
|
|
26551
|
+
};
|
|
26552
|
+
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
|
|
26553
|
+
};
|
|
26554
|
+
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
|
|
26555
|
+
};
|
|
26556
|
+
|
|
26557
|
+
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
26558
|
+
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
26559
|
+
forward_params = [
|
|
26560
|
+
// Text inputs
|
|
26561
|
+
"input_ids",
|
|
26562
|
+
"attention_mask",
|
|
26563
|
+
"position_ids",
|
|
26564
|
+
"past_key_values",
|
|
26565
|
+
// Vision inputs
|
|
26566
|
+
"pixel_values",
|
|
26567
|
+
"image_grid_thw"
|
|
26568
|
+
];
|
|
26569
|
+
};
|
|
26570
|
+
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
26571
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
26572
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
26573
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
26574
|
+
image_grid_thw_name = "grid_thw";
|
|
26575
|
+
/**
|
|
26576
|
+
* Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
|
|
26577
|
+
* @param {Tensor} input_ids
|
|
26578
|
+
* @param {Tensor} attention_mask
|
|
26579
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
26580
|
+
*/
|
|
26581
|
+
_get_text_only_rope_index(input_ids, attention_mask) {
|
|
26582
|
+
if (attention_mask) {
|
|
26583
|
+
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
26584
|
+
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
26585
|
+
const mrope_position_deltas = Array.from(
|
|
26586
|
+
{ length: dims[0] },
|
|
26587
|
+
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
26588
|
+
);
|
|
26589
|
+
return [
|
|
26590
|
+
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
26591
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
26592
|
+
];
|
|
26593
|
+
} else {
|
|
26594
|
+
const [batch_size, seq_length] = input_ids.dims;
|
|
26595
|
+
const position_ids = BigInt64Array.from(
|
|
26596
|
+
{ length: 3 * batch_size * seq_length },
|
|
26597
|
+
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
26598
|
+
);
|
|
26599
|
+
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
26600
|
+
}
|
|
26601
|
+
}
|
|
26602
|
+
/**
|
|
26603
|
+
* Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
|
|
26604
|
+
* global [all_t, all_h, all_w] order, then write back into the position_ids array
|
|
26605
|
+
* respecting attention mask.
|
|
26606
|
+
* @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
|
|
26607
|
+
* @param {number[]} attn_mask Attention mask for this batch element
|
|
26608
|
+
* @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
|
|
26609
|
+
* @param {number} batch_idx Current batch index
|
|
26610
|
+
* @returns {number[]} Flat reordered positions of length total_len
|
|
26611
|
+
*/
|
|
26612
|
+
_reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
|
|
26613
|
+
const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
26614
|
+
const llm_positions = new Array(total_len);
|
|
26615
|
+
let index = 0;
|
|
26616
|
+
for (let x = 0; x < 3; ++x) {
|
|
26617
|
+
for (const val of llm_pos_ids_list) {
|
|
26618
|
+
const seg_len = val.length / 3;
|
|
26619
|
+
for (let z2 = x * seg_len; z2 < (x + 1) * seg_len; ++z2) {
|
|
26620
|
+
llm_positions[index++] = val[z2];
|
|
26621
|
+
}
|
|
26622
|
+
}
|
|
26623
|
+
}
|
|
26624
|
+
let count2 = 0;
|
|
26625
|
+
for (let y = 0; y < attn_mask.length; ++y) {
|
|
26626
|
+
if (attn_mask[y] == 1) {
|
|
26627
|
+
for (let x = 0; x < 3; ++x) {
|
|
26628
|
+
position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
|
|
26629
|
+
}
|
|
26630
|
+
++count2;
|
|
26631
|
+
}
|
|
26632
|
+
}
|
|
26633
|
+
return llm_positions;
|
|
26634
|
+
}
|
|
26635
|
+
/**
|
|
26636
|
+
* Build per-batch position ID segments for multimodal rope.
|
|
26637
|
+
* Override this in subclasses to change how vision/text segments are identified and positioned.
|
|
26638
|
+
* @param {object} params
|
|
26639
|
+
* @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
|
|
26640
|
+
* @param {any[][]} params.image_grid_thw_list - all image grid dimensions
|
|
26641
|
+
* @param {any[][]} params.video_grid_thw_list - all video grid dimensions
|
|
26642
|
+
* @param {number} params.spatial_merge_size
|
|
26643
|
+
* @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
|
|
26644
|
+
* @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
|
|
26645
|
+
*/
|
|
26646
|
+
_get_multimodal_rope_positions({
|
|
26647
|
+
filtered_ids,
|
|
26648
|
+
image_grid_thw_list,
|
|
26649
|
+
video_grid_thw_list,
|
|
26650
|
+
spatial_merge_size,
|
|
26651
|
+
state
|
|
26652
|
+
}) {
|
|
26653
|
+
const { image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
26654
|
+
const ids = filtered_ids;
|
|
26655
|
+
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
26656
|
+
if (x == vision_start_token_id) acc.push(idx);
|
|
26657
|
+
return acc;
|
|
26658
|
+
}, []);
|
|
26659
|
+
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
26660
|
+
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
26661
|
+
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
26662
|
+
const llm_pos_ids_list = [];
|
|
26663
|
+
let st2 = 0;
|
|
26664
|
+
let remain_images = image_nums;
|
|
26665
|
+
let remain_videos = video_nums;
|
|
26666
|
+
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
26667
|
+
const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
|
|
26668
|
+
const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
|
|
26669
|
+
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
26670
|
+
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
26671
|
+
let ed;
|
|
26672
|
+
let t, h, w;
|
|
26673
|
+
if (ed_image < ed_video) {
|
|
26674
|
+
[t, h, w] = image_grid_thw_list[state.image_index];
|
|
26675
|
+
++state.image_index;
|
|
26676
|
+
--remain_images;
|
|
26677
|
+
ed = ed_image;
|
|
26678
|
+
} else {
|
|
26679
|
+
[t, h, w] = video_grid_thw_list[state.video_index];
|
|
26680
|
+
++state.video_index;
|
|
26681
|
+
--remain_videos;
|
|
26682
|
+
ed = ed_video;
|
|
26683
|
+
}
|
|
26684
|
+
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
26685
|
+
Number(t),
|
|
26686
|
+
Math.floor(Number(h) / spatial_merge_size),
|
|
26687
|
+
Math.floor(Number(w) / spatial_merge_size)
|
|
26688
|
+
];
|
|
26689
|
+
const text_len = ed - st2;
|
|
26690
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
26691
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
26692
|
+
const offset = text_len + st_idx;
|
|
26693
|
+
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
26694
|
+
const t_index = Array.from(
|
|
26695
|
+
{ length: grid_size },
|
|
26696
|
+
(_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
|
|
26697
|
+
);
|
|
26698
|
+
const h_index = Array.from(
|
|
26699
|
+
{ length: grid_size },
|
|
26700
|
+
(_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
|
|
26701
|
+
);
|
|
26702
|
+
const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
|
|
26703
|
+
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
26704
|
+
st2 = ed + grid_size;
|
|
26705
|
+
}
|
|
26706
|
+
if (st2 < ids.length) {
|
|
26707
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
26708
|
+
const text_len = ids.length - st2;
|
|
26709
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
26710
|
+
}
|
|
26711
|
+
return llm_pos_ids_list;
|
|
26712
|
+
}
|
|
26713
|
+
/**
|
|
26714
|
+
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
26715
|
+
*
|
|
26716
|
+
* Explanation:
|
|
26717
|
+
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
26718
|
+
*
|
|
26719
|
+
* For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
|
|
26720
|
+
* Examples:
|
|
26721
|
+
* input_ids: [T T T T T], here T is for text.
|
|
26722
|
+
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
26723
|
+
* height position_ids: [0, 1, 2, 3, 4]
|
|
26724
|
+
* width position_ids: [0, 1, 2, 3, 4]
|
|
26725
|
+
*
|
|
26726
|
+
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
26727
|
+
* and 1D rotary position embedding for text part.
|
|
26728
|
+
* Examples:
|
|
26729
|
+
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
26730
|
+
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
26731
|
+
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
26732
|
+
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
26733
|
+
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
26734
|
+
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
26735
|
+
* text height position_ids: [3, 4, 5, 6, 7]
|
|
26736
|
+
* text width position_ids: [3, 4, 5, 6, 7]
|
|
26737
|
+
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
26738
|
+
*
|
|
26739
|
+
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
26740
|
+
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
26741
|
+
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
26742
|
+
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
|
|
26743
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
26744
|
+
*/
|
|
26745
|
+
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
26746
|
+
const { vision_config } = this.config;
|
|
26747
|
+
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
26748
|
+
if (image_grid_thw || video_grid_thw) {
|
|
26749
|
+
const total_input_ids = input_ids.tolist();
|
|
26750
|
+
if (!attention_mask) {
|
|
26751
|
+
attention_mask = ones_like(input_ids);
|
|
26752
|
+
}
|
|
26753
|
+
const attention_mask_list = attention_mask.tolist();
|
|
26754
|
+
const position_ids_list = Array.from(
|
|
26755
|
+
{ length: 3 },
|
|
26756
|
+
() => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
|
|
26757
|
+
);
|
|
26758
|
+
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
26759
|
+
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
26760
|
+
const state = { image_index: 0, video_index: 0 };
|
|
26761
|
+
const mrope_position_deltas = [];
|
|
26762
|
+
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
26763
|
+
const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
26764
|
+
const llm_pos_ids_list = this._get_multimodal_rope_positions({
|
|
26765
|
+
filtered_ids,
|
|
26766
|
+
image_grid_thw_list,
|
|
26767
|
+
video_grid_thw_list,
|
|
26768
|
+
spatial_merge_size,
|
|
26769
|
+
state
|
|
26770
|
+
});
|
|
26771
|
+
const llm_positions = this._reorder_and_write_positions(
|
|
26772
|
+
llm_pos_ids_list,
|
|
26773
|
+
attention_mask_list[i],
|
|
26774
|
+
position_ids_list,
|
|
26775
|
+
i
|
|
26776
|
+
);
|
|
26777
|
+
mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
|
|
26778
|
+
}
|
|
26779
|
+
return [
|
|
26780
|
+
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
26781
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
26782
|
+
];
|
|
26783
|
+
} else {
|
|
26784
|
+
return this._get_text_only_rope_index(input_ids, attention_mask);
|
|
26785
|
+
}
|
|
26786
|
+
}
|
|
26787
|
+
async encode_image({ pixel_values, image_grid_thw }) {
|
|
26788
|
+
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
26789
|
+
pixel_values,
|
|
26790
|
+
[this.image_grid_thw_name]: image_grid_thw
|
|
26791
|
+
})).image_features;
|
|
26792
|
+
return features;
|
|
26793
|
+
}
|
|
26794
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
26795
|
+
return default_merge_input_ids_with_image_features({
|
|
26796
|
+
// @ts-ignore
|
|
26797
|
+
image_token_id: this.config.image_token_id,
|
|
26798
|
+
...kwargs
|
|
26799
|
+
});
|
|
26800
|
+
}
|
|
26801
|
+
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
26802
|
+
if (!model_inputs.attention_mask || model_inputs.position_ids) {
|
|
26803
|
+
return model_inputs;
|
|
26804
|
+
}
|
|
26805
|
+
const session = this.sessions["decoder_model_merged"] ?? this.sessions["model"];
|
|
26806
|
+
if (!session.inputNames.includes("position_ids")) {
|
|
26807
|
+
return model_inputs;
|
|
26808
|
+
}
|
|
26809
|
+
if (!model_inputs.past_key_values) {
|
|
26810
|
+
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
26811
|
+
model_inputs.input_ids,
|
|
26812
|
+
model_inputs.image_grid_thw,
|
|
26813
|
+
model_inputs.video_grid_thw,
|
|
26814
|
+
model_inputs.attention_mask
|
|
26815
|
+
);
|
|
26816
|
+
} else {
|
|
26817
|
+
model_inputs.pixel_values = null;
|
|
26818
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
26819
|
+
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
26820
|
+
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
26821
|
+
model_inputs.input_ids,
|
|
26822
|
+
model_inputs.image_grid_thw,
|
|
26823
|
+
model_inputs.video_grid_thw,
|
|
26824
|
+
model_inputs.attention_mask
|
|
26825
|
+
);
|
|
26826
|
+
model_inputs.rope_deltas = rope_deltas;
|
|
26827
|
+
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
26828
|
+
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
26829
|
+
} else {
|
|
26830
|
+
if (!model_inputs.rope_deltas) {
|
|
26831
|
+
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
26832
|
+
model_inputs.input_ids,
|
|
26833
|
+
model_inputs.image_grid_thw,
|
|
26834
|
+
model_inputs.video_grid_thw,
|
|
26835
|
+
model_inputs.attention_mask
|
|
26836
|
+
);
|
|
26837
|
+
}
|
|
26838
|
+
const delta = BigInt(past_length);
|
|
26839
|
+
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
26840
|
+
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
26841
|
+
}
|
|
26842
|
+
}
|
|
26843
|
+
return model_inputs;
|
|
26844
|
+
}
|
|
26845
|
+
};
|
|
26846
|
+
// Text-only (CausalLM) variant: inherits all behavior from the conditional-generation
// class; per the NOTE on the base class it is loaded without the vision encoder.
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
};
|
|
26848
|
+
|
|
26849
|
+
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
// Qwen2.5-VL reuses the Qwen2-VL implementation wholesale; only the name of the
// vision-encoder grid input differs ("image_grid_thw" instead of "grid_thw").
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
  image_grid_thw_name = "image_grid_thw";
};
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
  image_grid_thw_name = "image_grid_thw";
};
|
|
26856
|
+
|
|
26857
|
+
// src/models/glm_ocr/modeling_glm_ocr.js
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
  /**
   * Compute 3D positional indices for vision tokens.
   * Temporal is constant, height is repeat-interleaved, width tiles.
   * @param {number} start_position
   * @param {number[]} grid_thw [T, H, W]
   * @param {number} temp_merge_size
   * @param {number} spatial_merge_size
   * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
   */
  get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
    const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
    const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
    const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
    const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
    // Temporal axis: same value for the entire segment.
    const t_pos = Array.from({ length: seq_len }, () => start_position);
    // Height axis: each row index repeated (llm_grid_w * llm_grid_t) times.
    const h_pos = Array.from(
      { length: seq_len },
      (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
    );
    // Width axis: column index tiling with period llm_grid_w.
    const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
    return [...t_pos, ...h_pos, ...w_pos];
  }
  /**
   * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
   * instead of vision_start_token_id scanning used by Qwen2VL.
   * After a vision segment, position advances by max(h, w) / spatial_merge_size.
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id } = this.config;
    // Partition the sequence into maximal runs [type, start, end) where type
    // is 1 for image tokens and 0 for everything else.
    const groups = [];
    let group_start = 0;
    let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
    for (let j = 1; j <= filtered_ids.length; ++j) {
      // At j == length, the sentinel type -1 forces the final group to flush.
      const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
      if (t !== current_type) {
        groups.push([current_type, group_start, j]);
        group_start = j;
        current_type = t;
      }
    }
    let current_pos = 0;
    const llm_pos_ids_list = [];
    for (const [modality_type, start_idx, end_idx] of groups) {
      if (modality_type === 0) {
        // Text run: identical 1D positions replicated across the 3 rope axes.
        const text_len = end_idx - start_idx;
        llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
        current_pos += text_len;
      } else {
        const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
        // Temporal merge size is taken to be T itself, so llm_grid_t is always 1 here.
        const temp_merge_size = grid_thw[0];
        llm_pos_ids_list.push(
          this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
        );
        // NOTE(review): this advance can be fractional when max(h, w) is not a
        // multiple of spatial_merge_size — presumably inputs are pre-padded; verify.
        current_pos += Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size;
      }
    }
    return llm_pos_ids_list;
  }
};
|
|
26924
|
+
|
|
26360
26925
|
// src/models/glpn/modeling_glpn.js
|
|
26361
26926
|
var GLPNPreTrainedModel = class extends PreTrainedModel {
|
|
26362
26927
|
};
|
|
@@ -26555,27 +27120,6 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
|
|
|
26555
27120
|
var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
|
|
26556
27121
|
};
|
|
26557
27122
|
|
|
26558
|
-
// src/models/llava/modeling_llava.js
|
|
26559
|
-
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
26560
|
-
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
26561
|
-
};
|
|
26562
|
-
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
26563
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
26564
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
26565
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
26566
|
-
return default_merge_input_ids_with_image_features({
|
|
26567
|
-
// @ts-ignore
|
|
26568
|
-
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
26569
|
-
...kwargs,
|
|
26570
|
-
image_features: reshaped_image_hidden_states
|
|
26571
|
-
});
|
|
26572
|
-
}
|
|
26573
|
-
};
|
|
26574
|
-
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
26575
|
-
};
|
|
26576
|
-
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
26577
|
-
};
|
|
26578
|
-
|
|
26579
27123
|
// src/models/idefics3/modeling_idefics3.js
|
|
26580
27124
|
var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
26581
27125
|
forward_params = [
|
|
@@ -26669,6 +27213,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
|
|
|
26669
27213
|
var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
|
|
26670
27214
|
};
|
|
26671
27215
|
|
|
27216
|
+
// src/models/lighton_ocr/modeling_lighton_ocr.js
// LightOnOCR reuses the LLaVA conditional-generation implementation unchanged.
var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
};
|
|
27219
|
+
|
|
26672
27220
|
// src/models/lfm2_moe/modeling_lfm2_moe.js
|
|
26673
27221
|
var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
|
|
26674
27222
|
};
|
|
@@ -26865,6 +27413,14 @@ var MistralModel = class extends MistralPreTrainedModel {
|
|
|
26865
27413
|
var MistralForCausalLM = class extends MistralPreTrainedModel {
|
|
26866
27414
|
};
|
|
26867
27415
|
|
|
27416
|
+
// src/models/mistral4/modeling_mistral4.js
// Thin registrations: Mistral4 reuses the generic PreTrainedModel machinery.
var Mistral4PreTrainedModel = class extends PreTrainedModel {
};
var Mistral4Model = class extends Mistral4PreTrainedModel {
};
var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
};
|
|
27423
|
+
|
|
26868
27424
|
// src/models/mobilebert/modeling_mobilebert.js
|
|
26869
27425
|
var MobileBertPreTrainedModel = class extends PreTrainedModel {
|
|
26870
27426
|
};
|
|
@@ -27333,6 +27889,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
|
|
|
27333
27889
|
var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
|
|
27334
27890
|
};
|
|
27335
27891
|
|
|
27892
|
+
// src/models/nemotron_h/modeling_nemotron_h.js
// Thin registrations: NemotronH reuses the generic PreTrainedModel machinery.
var NemotronHPreTrainedModel = class extends PreTrainedModel {
};
var NemotronHModel = class extends NemotronHPreTrainedModel {
};
var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
};
|
|
27899
|
+
|
|
27336
27900
|
// src/models/neobert/modeling_neobert.js
|
|
27337
27901
|
var NeoBertPreTrainedModel = class extends PreTrainedModel {
|
|
27338
27902
|
};
|
|
@@ -27613,252 +28177,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
|
|
|
27613
28177
|
var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
|
|
27614
28178
|
};
|
|
27615
28179
|
|
|
27616
|
-
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
27617
|
-
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
27618
|
-
forward_params = [
|
|
27619
|
-
// Text inputs
|
|
27620
|
-
"input_ids",
|
|
27621
|
-
"attention_mask",
|
|
27622
|
-
"position_ids",
|
|
27623
|
-
"past_key_values",
|
|
27624
|
-
// Vision inputs
|
|
27625
|
-
"pixel_values",
|
|
27626
|
-
"image_grid_thw"
|
|
27627
|
-
];
|
|
27628
|
-
};
|
|
27629
|
-
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
27630
|
-
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
27631
|
-
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
27632
|
-
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
27633
|
-
image_grid_thw_name = "grid_thw";
|
|
27634
|
-
/**
|
|
27635
|
-
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
27636
|
-
*
|
|
27637
|
-
* Explanation:
|
|
27638
|
-
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
27639
|
-
*
|
|
27640
|
-
* For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
|
|
27641
|
-
* Examples:
|
|
27642
|
-
* input_ids: [T T T T T], here T is for text.
|
|
27643
|
-
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
27644
|
-
* height position_ids: [0, 1, 2, 3, 4]
|
|
27645
|
-
* width position_ids: [0, 1, 2, 3, 4]
|
|
27646
|
-
*
|
|
27647
|
-
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
27648
|
-
* and 1D rotary position embeddin for text part.
|
|
27649
|
-
* Examples:
|
|
27650
|
-
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
27651
|
-
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
27652
|
-
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
27653
|
-
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
27654
|
-
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
27655
|
-
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
27656
|
-
* text height position_ids: [3, 4, 5, 6, 7]
|
|
27657
|
-
* text width position_ids: [3, 4, 5, 6, 7]
|
|
27658
|
-
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
27659
|
-
*
|
|
27660
|
-
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
27661
|
-
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
27662
|
-
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
27663
|
-
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
|
|
27664
|
-
* - 1 for tokens that are **not masked**,
|
|
27665
|
-
* - 0 for tokens that are **masked**.
|
|
27666
|
-
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
|
|
27667
|
-
* - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
|
|
27668
|
-
* - mrope_position_deltas: Tensor of shape `(batch_size)`.
|
|
27669
|
-
*/
|
|
27670
|
-
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
27671
|
-
const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
27672
|
-
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
27673
|
-
const mrope_position_deltas = [];
|
|
27674
|
-
if (image_grid_thw || video_grid_thw) {
|
|
27675
|
-
let total_input_ids = input_ids.tolist();
|
|
27676
|
-
if (!attention_mask) {
|
|
27677
|
-
attention_mask = ones_like(input_ids);
|
|
27678
|
-
}
|
|
27679
|
-
const attention_mask_list = attention_mask.tolist();
|
|
27680
|
-
const position_ids_list = Array.from(
|
|
27681
|
-
{ length: 3 },
|
|
27682
|
-
(_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
|
|
27683
|
-
);
|
|
27684
|
-
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
27685
|
-
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
27686
|
-
let image_index = 0;
|
|
27687
|
-
let video_index = 0;
|
|
27688
|
-
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
27689
|
-
const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
27690
|
-
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
27691
|
-
if (x == vision_start_token_id) acc.push(idx);
|
|
27692
|
-
return acc;
|
|
27693
|
-
}, []);
|
|
27694
|
-
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
27695
|
-
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
27696
|
-
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
27697
|
-
let llm_pos_ids_list = [];
|
|
27698
|
-
let st2 = 0;
|
|
27699
|
-
let remain_images = image_nums;
|
|
27700
|
-
let remain_videos = video_nums;
|
|
27701
|
-
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
27702
|
-
const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
|
|
27703
|
-
const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
|
|
27704
|
-
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
27705
|
-
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
27706
|
-
let ed;
|
|
27707
|
-
let t, h, w;
|
|
27708
|
-
if (ed_image < ed_video) {
|
|
27709
|
-
[t, h, w] = image_grid_thw_list[image_index];
|
|
27710
|
-
++image_index;
|
|
27711
|
-
--remain_images;
|
|
27712
|
-
ed = ed_image;
|
|
27713
|
-
} else {
|
|
27714
|
-
[t, h, w] = video_grid_thw_list[video_index];
|
|
27715
|
-
++video_index;
|
|
27716
|
-
--remain_videos;
|
|
27717
|
-
ed = ed_video;
|
|
27718
|
-
}
|
|
27719
|
-
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
27720
|
-
Number(t),
|
|
27721
|
-
Math.floor(Number(h) / spatial_merge_size),
|
|
27722
|
-
Math.floor(Number(w) / spatial_merge_size)
|
|
27723
|
-
];
|
|
27724
|
-
const text_len = ed - st2;
|
|
27725
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27726
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
27727
|
-
const offset = text_len + st_idx;
|
|
27728
|
-
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
27729
|
-
const t_index = Array.from(
|
|
27730
|
-
{ length: grid_size },
|
|
27731
|
-
(_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
|
|
27732
|
-
);
|
|
27733
|
-
const h_index = Array.from(
|
|
27734
|
-
{ length: grid_size },
|
|
27735
|
-
(_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
|
|
27736
|
-
);
|
|
27737
|
-
const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
|
|
27738
|
-
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
27739
|
-
st2 = ed + grid_size;
|
|
27740
|
-
}
|
|
27741
|
-
if (st2 < ids.length) {
|
|
27742
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27743
|
-
const text_len = ids.length - st2;
|
|
27744
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
27745
|
-
}
|
|
27746
|
-
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
27747
|
-
const llm_positions = new Array(num_items);
|
|
27748
|
-
let index = 0;
|
|
27749
|
-
for (let x = 0; x < 3; ++x) {
|
|
27750
|
-
for (let y = 0; y < llm_pos_ids_list.length; ++y) {
|
|
27751
|
-
const val = llm_pos_ids_list[y];
|
|
27752
|
-
const text_len = val.length / 3;
|
|
27753
|
-
for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
|
|
27754
|
-
llm_positions[index++] = val[z];
|
|
27755
|
-
}
|
|
27756
|
-
}
|
|
27757
|
-
}
|
|
27758
|
-
let count2 = 0;
|
|
27759
|
-
const attn_mask = attention_mask_list[i];
|
|
27760
|
-
for (let y = 0; y < attn_mask.length; ++y) {
|
|
27761
|
-
if (attn_mask[y] == 1) {
|
|
27762
|
-
for (let x = 0; x < 3; ++x) {
|
|
27763
|
-
position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
|
|
27764
|
-
}
|
|
27765
|
-
++count2;
|
|
27766
|
-
}
|
|
27767
|
-
}
|
|
27768
|
-
const max_llm_positions = max(llm_positions)[0];
|
|
27769
|
-
mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
|
|
27770
|
-
}
|
|
27771
|
-
return [
|
|
27772
|
-
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
27773
|
-
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
27774
|
-
];
|
|
27775
|
-
} else {
|
|
27776
|
-
if (attention_mask) {
|
|
27777
|
-
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
27778
|
-
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
27779
|
-
const mrope_position_deltas2 = Array.from(
|
|
27780
|
-
{ length: dims[0] },
|
|
27781
|
-
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
27782
|
-
);
|
|
27783
|
-
return [
|
|
27784
|
-
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
27785
|
-
new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
|
|
27786
|
-
];
|
|
27787
|
-
} else {
|
|
27788
|
-
const [batch_size, seq_length] = input_ids.dims;
|
|
27789
|
-
const position_ids = BigInt64Array.from(
|
|
27790
|
-
{ length: 3 * batch_size * seq_length },
|
|
27791
|
-
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
27792
|
-
);
|
|
27793
|
-
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
27794
|
-
}
|
|
27795
|
-
}
|
|
27796
|
-
}
|
|
27797
|
-
async encode_image({ pixel_values, image_grid_thw }) {
|
|
27798
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
27799
|
-
pixel_values,
|
|
27800
|
-
[this.image_grid_thw_name]: image_grid_thw
|
|
27801
|
-
})).image_features;
|
|
27802
|
-
return features;
|
|
27803
|
-
}
|
|
27804
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
27805
|
-
return default_merge_input_ids_with_image_features({
|
|
27806
|
-
// @ts-ignore
|
|
27807
|
-
image_token_id: this.config.image_token_id,
|
|
27808
|
-
...kwargs
|
|
27809
|
-
});
|
|
27810
|
-
}
|
|
27811
|
-
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
27812
|
-
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
27813
|
-
if (!model_inputs.past_key_values) {
|
|
27814
|
-
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27815
|
-
model_inputs.input_ids,
|
|
27816
|
-
model_inputs.image_grid_thw,
|
|
27817
|
-
model_inputs.video_grid_thw,
|
|
27818
|
-
model_inputs.attention_mask
|
|
27819
|
-
);
|
|
27820
|
-
} else {
|
|
27821
|
-
model_inputs.pixel_values = null;
|
|
27822
|
-
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
27823
|
-
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
27824
|
-
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
27825
|
-
model_inputs.input_ids,
|
|
27826
|
-
model_inputs.image_grid_thw,
|
|
27827
|
-
model_inputs.video_grid_thw,
|
|
27828
|
-
model_inputs.attention_mask
|
|
27829
|
-
);
|
|
27830
|
-
model_inputs.rope_deltas = rope_deltas;
|
|
27831
|
-
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
27832
|
-
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
27833
|
-
} else {
|
|
27834
|
-
if (!model_inputs.rope_deltas) {
|
|
27835
|
-
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27836
|
-
model_inputs.input_ids,
|
|
27837
|
-
model_inputs.image_grid_thw,
|
|
27838
|
-
model_inputs.video_grid_thw,
|
|
27839
|
-
model_inputs.attention_mask
|
|
27840
|
-
);
|
|
27841
|
-
}
|
|
27842
|
-
const delta = BigInt(past_length);
|
|
27843
|
-
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
27844
|
-
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
27845
|
-
}
|
|
27846
|
-
}
|
|
27847
|
-
}
|
|
27848
|
-
return model_inputs;
|
|
27849
|
-
}
|
|
27850
|
-
};
|
|
27851
|
-
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
27852
|
-
};
|
|
27853
|
-
|
|
27854
|
-
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
27855
|
-
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
27856
|
-
image_grid_thw_name = "image_grid_thw";
|
|
27857
|
-
};
|
|
27858
|
-
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
27859
|
-
image_grid_thw_name = "image_grid_thw";
|
|
27860
|
-
};
|
|
27861
|
-
|
|
27862
28180
|
// src/models/qwen3/modeling_qwen3.js
|
|
27863
28181
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
27864
28182
|
};
|
|
@@ -28304,6 +28622,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
|
|
|
28304
28622
|
}
|
|
28305
28623
|
};
|
|
28306
28624
|
|
|
28625
|
+
// src/models/solar_open/modeling_solar_open.js
// Thin registrations: SolarOpen reuses the generic PreTrainedModel machinery.
var SolarOpenPreTrainedModel = class extends PreTrainedModel {
};
var SolarOpenModel = class extends SolarOpenPreTrainedModel {
};
var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
};
|
|
28632
|
+
|
|
28307
28633
|
// src/models/speecht5/modeling_speecht5.js
|
|
28308
28634
|
var SpeechT5PreTrainedModel = class extends PreTrainedModel {
|
|
28309
28635
|
};
|
|
@@ -29420,6 +29746,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
|
|
|
29420
29746
|
// src/models/registry.js
|
|
29421
29747
|
var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
|
|
29422
29748
|
["bert", "BertModel"],
|
|
29749
|
+
["eurobert", "EuroBertModel"],
|
|
29423
29750
|
["neobert", "NeoBertModel"],
|
|
29424
29751
|
["modernbert", "ModernBertModel"],
|
|
29425
29752
|
["nomic_bert", "NomicBertModel"],
|
|
@@ -29551,6 +29878,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
29551
29878
|
["gemma3_text", "Gemma3Model"],
|
|
29552
29879
|
["helium", "HeliumModel"],
|
|
29553
29880
|
["glm", "GlmModel"],
|
|
29881
|
+
["glm_moe_dsa", "GlmMoeDsaModel"],
|
|
29554
29882
|
["openelm", "OpenELMModel"],
|
|
29555
29883
|
["qwen2", "Qwen2Model"],
|
|
29556
29884
|
["qwen2_moe", "Qwen2MoeModel"],
|
|
@@ -29562,12 +29890,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
29562
29890
|
["mpt", "MptModel"],
|
|
29563
29891
|
["opt", "OPTModel"],
|
|
29564
29892
|
["mistral", "MistralModel"],
|
|
29893
|
+
["mistral4", "Mistral4Model"],
|
|
29565
29894
|
["ministral", "MinistralModel"],
|
|
29566
29895
|
["ministral3", "Ministral3Model"],
|
|
29567
29896
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
29568
29897
|
["starcoder2", "Starcoder2Model"],
|
|
29898
|
+
["deepseek_v3", "DeepseekV3Model"],
|
|
29569
29899
|
["falcon", "FalconModel"],
|
|
29570
29900
|
["falcon_h1", "FalconH1Model"],
|
|
29901
|
+
["nemotron_h", "NemotronHModel"],
|
|
29902
|
+
["solar_open", "SolarOpenModel"],
|
|
29571
29903
|
["stablelm", "StableLmModel"],
|
|
29572
29904
|
["modernbert-decoder", "ModernBertDecoderModel"],
|
|
29573
29905
|
["hunyuan_v1_dense", "HunYuanDenseV1Model"],
|
|
@@ -29587,6 +29919,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29587
29919
|
]);
|
|
29588
29920
|
var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29589
29921
|
["bert", "BertForSequenceClassification"],
|
|
29922
|
+
["eurobert", "EuroBertForSequenceClassification"],
|
|
29590
29923
|
["neobert", "NeoBertForSequenceClassification"],
|
|
29591
29924
|
["modernbert", "ModernBertForSequenceClassification"],
|
|
29592
29925
|
["roformer", "RoFormerForSequenceClassification"],
|
|
@@ -29609,6 +29942,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29609
29942
|
]);
|
|
29610
29943
|
var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29611
29944
|
["bert", "BertForTokenClassification"],
|
|
29945
|
+
["eurobert", "EuroBertForTokenClassification"],
|
|
29612
29946
|
["neobert", "NeoBertForTokenClassification"],
|
|
29613
29947
|
["modernbert", "ModernBertForTokenClassification"],
|
|
29614
29948
|
["roformer", "RoFormerForTokenClassification"],
|
|
@@ -29671,6 +30005,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29671
30005
|
["gemma3", "Gemma3ForCausalLM"],
|
|
29672
30006
|
["helium", "HeliumForCausalLM"],
|
|
29673
30007
|
["glm", "GlmForCausalLM"],
|
|
30008
|
+
["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
|
|
29674
30009
|
["openelm", "OpenELMForCausalLM"],
|
|
29675
30010
|
["qwen2", "Qwen2ForCausalLM"],
|
|
29676
30011
|
["qwen2_moe", "Qwen2MoeForCausalLM"],
|
|
@@ -29682,6 +30017,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29682
30017
|
["qwen3_vl", "Qwen3VLForCausalLM"],
|
|
29683
30018
|
["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
|
|
29684
30019
|
["qwen3_5", "Qwen3_5ForCausalLM"],
|
|
30020
|
+
["qwen3_5_text", "Qwen3_5ForCausalLM"],
|
|
29685
30021
|
["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
|
|
29686
30022
|
["gemma3n", "Gemma3nForCausalLM"],
|
|
29687
30023
|
["phi", "PhiForCausalLM"],
|
|
@@ -29690,13 +30026,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29690
30026
|
["opt", "OPTForCausalLM"],
|
|
29691
30027
|
["mbart", "MBartForCausalLM"],
|
|
29692
30028
|
["mistral", "MistralForCausalLM"],
|
|
30029
|
+
["mistral4", "Mistral4ForCausalLM"],
|
|
29693
30030
|
["ministral", "MinistralForCausalLM"],
|
|
29694
30031
|
["ministral3", "Ministral3ForCausalLM"],
|
|
29695
30032
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
29696
30033
|
["starcoder2", "Starcoder2ForCausalLM"],
|
|
30034
|
+
["deepseek_v3", "DeepseekV3ForCausalLM"],
|
|
29697
30035
|
["falcon", "FalconForCausalLM"],
|
|
29698
30036
|
["falcon_h1", "FalconH1ForCausalLM"],
|
|
30037
|
+
["nemotron_h", "NemotronHForCausalLM"],
|
|
29699
30038
|
["trocr", "TrOCRForCausalLM"],
|
|
30039
|
+
["solar_open", "SolarOpenForCausalLM"],
|
|
29700
30040
|
["stablelm", "StableLmForCausalLM"],
|
|
29701
30041
|
["modernbert-decoder", "ModernBertDecoderForCausalLM"],
|
|
29702
30042
|
["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
|
|
@@ -29707,6 +30047,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29707
30047
|
var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
|
|
29708
30048
|
var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29709
30049
|
["bert", "BertForMaskedLM"],
|
|
30050
|
+
["eurobert", "EuroBertForMaskedLM"],
|
|
29710
30051
|
["neobert", "NeoBertForMaskedLM"],
|
|
29711
30052
|
["modernbert", "ModernBertForMaskedLM"],
|
|
29712
30053
|
["roformer", "RoFormerForMaskedLM"],
|
|
@@ -29764,8 +30105,11 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29764
30105
|
["smolvlm", "SmolVLMForConditionalGeneration"],
|
|
29765
30106
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
29766
30107
|
["llava_qwen2", "LlavaQwen2ForCausalLM"],
|
|
30108
|
+
["gemma3", "Gemma3ForConditionalGeneration"],
|
|
29767
30109
|
["gemma3n", "Gemma3nForConditionalGeneration"],
|
|
29768
|
-
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
30110
|
+
["mistral3", "Mistral3ForConditionalGeneration"],
|
|
30111
|
+
["lighton_ocr", "LightOnOcrForConditionalGeneration"],
|
|
30112
|
+
["glm_ocr", "GlmOcrForConditionalGeneration"]
|
|
29769
30113
|
]);
|
|
29770
30114
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29771
30115
|
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
@@ -29870,6 +30214,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29870
30214
|
]);
|
|
29871
30215
|
var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
|
|
29872
30216
|
var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30217
|
+
["chmv2", "CHMv2ForDepthEstimation"],
|
|
29873
30218
|
["dpt", "DPTForDepthEstimation"],
|
|
29874
30219
|
["depth_anything", "DepthAnythingForDepthEstimation"],
|
|
29875
30220
|
["glpn", "GLPNForDepthEstimation"],
|
|
@@ -29955,13 +30300,6 @@ var CUSTOM_MAPPING = [
|
|
|
29955
30300
|
],
|
|
29956
30301
|
["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
|
|
29957
30302
|
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
|
|
29958
|
-
["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29959
|
-
["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29960
|
-
["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29961
|
-
["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29962
|
-
["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29963
|
-
["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29964
|
-
["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29965
30303
|
[
|
|
29966
30304
|
"VoxtralRealtimeForConditionalGeneration",
|
|
29967
30305
|
VoxtralRealtimeForConditionalGeneration,
|
|
@@ -31643,6 +31981,41 @@ var TASK_ALIASES = Object.freeze({
|
|
|
31643
31981
|
embeddings: "feature-extraction"
|
|
31644
31982
|
});
|
|
31645
31983
|
|
|
31984
|
+
// src/utils/model_registry/resolve_model_type.js
|
|
31985
|
+
function resolve_model_type(config, { warn = true } = {}) {
|
|
31986
|
+
const architectures = (
|
|
31987
|
+
/** @type {string[]} */
|
|
31988
|
+
config.architectures || []
|
|
31989
|
+
);
|
|
31990
|
+
for (const arch of architectures) {
|
|
31991
|
+
const mappedType = MODEL_TYPE_MAPPING.get(arch);
|
|
31992
|
+
if (mappedType !== void 0) {
|
|
31993
|
+
return mappedType;
|
|
31994
|
+
}
|
|
31995
|
+
}
|
|
31996
|
+
if (config.model_type) {
|
|
31997
|
+
const mappedType = MODEL_TYPE_MAPPING.get(config.model_type);
|
|
31998
|
+
if (mappedType !== void 0) {
|
|
31999
|
+
return mappedType;
|
|
32000
|
+
}
|
|
32001
|
+
for (const mapping of Object.values(MODEL_MAPPING_NAMES)) {
|
|
32002
|
+
if (mapping.has(config.model_type)) {
|
|
32003
|
+
const resolved = MODEL_TYPE_MAPPING.get(mapping.get(config.model_type));
|
|
32004
|
+
if (resolved !== void 0) {
|
|
32005
|
+
return resolved;
|
|
32006
|
+
}
|
|
32007
|
+
}
|
|
32008
|
+
}
|
|
32009
|
+
}
|
|
32010
|
+
if (warn) {
|
|
32011
|
+
const archList = architectures.length > 0 ? architectures.join(", ") : "(none)";
|
|
32012
|
+
logger.warn(
|
|
32013
|
+
`[resolve_model_type] Architecture(s) not found in MODEL_TYPE_MAPPING: [${archList}] for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`
|
|
32014
|
+
);
|
|
32015
|
+
}
|
|
32016
|
+
return MODEL_TYPES.EncoderOnly;
|
|
32017
|
+
}
|
|
32018
|
+
|
|
31646
32019
|
// src/utils/model_registry/get_model_files.js
|
|
31647
32020
|
function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
|
|
31648
32021
|
if (config !== null) {
|
|
@@ -31665,43 +32038,7 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
|
|
|
31665
32038
|
const subfolder = "onnx";
|
|
31666
32039
|
const rawDevice = overrideDevice ?? custom_config.device;
|
|
31667
32040
|
let dtype = overrideDtype ?? custom_config.dtype;
|
|
31668
|
-
|
|
31669
|
-
const architectures = (
|
|
31670
|
-
/** @type {string[]} */
|
|
31671
|
-
config.architectures || []
|
|
31672
|
-
);
|
|
31673
|
-
let foundInMapping = false;
|
|
31674
|
-
for (const arch of architectures) {
|
|
31675
|
-
const mappedType = MODEL_TYPE_MAPPING.get(arch);
|
|
31676
|
-
if (mappedType !== void 0) {
|
|
31677
|
-
modelType = mappedType;
|
|
31678
|
-
foundInMapping = true;
|
|
31679
|
-
break;
|
|
31680
|
-
}
|
|
31681
|
-
}
|
|
31682
|
-
if (!foundInMapping && config.model_type) {
|
|
31683
|
-
const mappedType = MODEL_TYPE_MAPPING.get(config.model_type);
|
|
31684
|
-
if (mappedType !== void 0) {
|
|
31685
|
-
modelType = mappedType;
|
|
31686
|
-
foundInMapping = true;
|
|
31687
|
-
}
|
|
31688
|
-
if (!foundInMapping) {
|
|
31689
|
-
for (const mapping of Object.values(MODEL_MAPPING_NAMES)) {
|
|
31690
|
-
if (mapping.has(config.model_type)) {
|
|
31691
|
-
modelType = MODEL_TYPE_MAPPING.get(mapping.get(config.model_type));
|
|
31692
|
-
foundInMapping = true;
|
|
31693
|
-
break;
|
|
31694
|
-
}
|
|
31695
|
-
}
|
|
31696
|
-
}
|
|
31697
|
-
}
|
|
31698
|
-
if (!foundInMapping) {
|
|
31699
|
-
const archList = architectures.length > 0 ? architectures.join(", ") : "(none)";
|
|
31700
|
-
logger.warn(
|
|
31701
|
-
`[get_model_files] Architecture(s) not found in MODEL_TYPE_MAPPING: [${archList}] for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`
|
|
31702
|
-
);
|
|
31703
|
-
modelType = MODEL_TYPES.EncoderOnly;
|
|
31704
|
-
}
|
|
32041
|
+
const modelType = resolve_model_type(config);
|
|
31705
32042
|
const add_model_file = (fileName, baseName = null) => {
|
|
31706
32043
|
baseName = baseName ?? fileName;
|
|
31707
32044
|
const selectedDevice = selectDevice(rawDevice, fileName);
|
|
@@ -32288,6 +32625,31 @@ async function clear_pipeline_cache(task, modelId, options = {}) {
|
|
|
32288
32625
|
return await clear_files_from_cache(modelId, files, options);
|
|
32289
32626
|
}
|
|
32290
32627
|
|
|
32628
|
+
// src/utils/model_registry/get_available_dtypes.js
|
|
32629
|
+
var CONCRETE_DTYPES = Object.keys(DEFAULT_DTYPE_SUFFIX_MAPPING);
|
|
32630
|
+
async function get_available_dtypes(modelId, { config = null, model_file_name = null, revision = "main", cache_dir = null, local_files_only = false } = {}) {
|
|
32631
|
+
config = await get_config(modelId, { config, cache_dir, local_files_only, revision });
|
|
32632
|
+
const subfolder = "onnx";
|
|
32633
|
+
const modelType = resolve_model_type(config);
|
|
32634
|
+
const { sessions } = getSessionsConfig(modelType, config, { model_file_name });
|
|
32635
|
+
const baseNames = Object.values(sessions);
|
|
32636
|
+
const metadataOptions = { revision, cache_dir, local_files_only };
|
|
32637
|
+
const probeResults = await Promise.all(
|
|
32638
|
+
CONCRETE_DTYPES.map(async (dtype) => {
|
|
32639
|
+
const suffix = DEFAULT_DTYPE_SUFFIX_MAPPING[dtype] ?? "";
|
|
32640
|
+
const allExist = await Promise.all(
|
|
32641
|
+
baseNames.map(async (baseName) => {
|
|
32642
|
+
const filename = `${subfolder}/${baseName}${suffix}.onnx`;
|
|
32643
|
+
const metadata = await get_file_metadata(modelId, filename, metadataOptions);
|
|
32644
|
+
return metadata.exists;
|
|
32645
|
+
})
|
|
32646
|
+
);
|
|
32647
|
+
return { dtype, available: allExist.every(Boolean) };
|
|
32648
|
+
})
|
|
32649
|
+
);
|
|
32650
|
+
return probeResults.filter((r) => r.available).map((r) => r.dtype);
|
|
32651
|
+
}
|
|
32652
|
+
|
|
32291
32653
|
// src/utils/model_registry/ModelRegistry.js
|
|
32292
32654
|
var ModelRegistry = class {
|
|
32293
32655
|
/**
|
|
@@ -32374,6 +32736,29 @@ var ModelRegistry = class {
|
|
|
32374
32736
|
static async get_processor_files(modelId) {
|
|
32375
32737
|
return get_processor_files(modelId);
|
|
32376
32738
|
}
|
|
32739
|
+
/**
|
|
32740
|
+
* Detects which quantization levels (dtypes) are available for a model
|
|
32741
|
+
* by checking which ONNX files exist on the hub or locally.
|
|
32742
|
+
*
|
|
32743
|
+
* A dtype is considered available if all required model session files
|
|
32744
|
+
* exist for that dtype.
|
|
32745
|
+
*
|
|
32746
|
+
* @param {string} modelId - The model id (e.g., "onnx-community/all-MiniLM-L6-v2-ONNX")
|
|
32747
|
+
* @param {Object} [options] - Optional parameters
|
|
32748
|
+
* @param {import('../../configs.js').PretrainedConfig} [options.config=null] - Pre-loaded config
|
|
32749
|
+
* @param {string} [options.model_file_name=null] - Override the model file name (excluding .onnx suffix)
|
|
32750
|
+
* @param {string} [options.revision='main'] - Model revision
|
|
32751
|
+
* @param {string} [options.cache_dir=null] - Custom cache directory
|
|
32752
|
+
* @param {boolean} [options.local_files_only=false] - Only check local files
|
|
32753
|
+
* @returns {Promise<string[]>} Array of available dtype strings (e.g., ['fp32', 'fp16', 'q4', 'q8'])
|
|
32754
|
+
*
|
|
32755
|
+
* @example
|
|
32756
|
+
* const dtypes = await ModelRegistry.get_available_dtypes('onnx-community/all-MiniLM-L6-v2-ONNX');
|
|
32757
|
+
* console.log(dtypes); // ['fp32', 'fp16', 'int8', 'uint8', 'q8', 'q4']
|
|
32758
|
+
*/
|
|
32759
|
+
static async get_available_dtypes(modelId, options = {}) {
|
|
32760
|
+
return get_available_dtypes(modelId, options);
|
|
32761
|
+
}
|
|
32377
32762
|
/**
|
|
32378
32763
|
* Quickly checks if a model is fully cached by verifying `config.json` is present,
|
|
32379
32764
|
* then confirming all required files are cached.
|
|
@@ -32608,6 +32993,9 @@ export {
|
|
|
32608
32993
|
BloomModel,
|
|
32609
32994
|
BloomPreTrainedModel,
|
|
32610
32995
|
BloomTokenizer,
|
|
32996
|
+
CHMv2ForDepthEstimation,
|
|
32997
|
+
CHMv2ImageProcessor,
|
|
32998
|
+
CHMv2PreTrainedModel,
|
|
32611
32999
|
CLIPFeatureExtractor,
|
|
32612
33000
|
CLIPImageProcessor,
|
|
32613
33001
|
CLIPModel,
|
|
@@ -32703,6 +33091,9 @@ export {
|
|
|
32703
33091
|
DebertaV2Tokenizer,
|
|
32704
33092
|
DecisionTransformerModel,
|
|
32705
33093
|
DecisionTransformerPreTrainedModel,
|
|
33094
|
+
DeepseekV3ForCausalLM,
|
|
33095
|
+
DeepseekV3Model,
|
|
33096
|
+
DeepseekV3PreTrainedModel,
|
|
32706
33097
|
DeiTFeatureExtractor,
|
|
32707
33098
|
DeiTForImageClassification,
|
|
32708
33099
|
DeiTImageProcessor,
|
|
@@ -32763,6 +33154,11 @@ export {
|
|
|
32763
33154
|
EsmModel,
|
|
32764
33155
|
EsmPreTrainedModel,
|
|
32765
33156
|
EsmTokenizer,
|
|
33157
|
+
EuroBertForMaskedLM,
|
|
33158
|
+
EuroBertForSequenceClassification,
|
|
33159
|
+
EuroBertForTokenClassification,
|
|
33160
|
+
EuroBertModel,
|
|
33161
|
+
EuroBertPreTrainedModel,
|
|
32766
33162
|
ExaoneForCausalLM,
|
|
32767
33163
|
ExaoneModel,
|
|
32768
33164
|
ExaonePreTrainedModel,
|
|
@@ -32809,8 +33205,11 @@ export {
|
|
|
32809
33205
|
Gemma2Model,
|
|
32810
33206
|
Gemma2PreTrainedModel,
|
|
32811
33207
|
Gemma3ForCausalLM,
|
|
33208
|
+
Gemma3ForConditionalGeneration,
|
|
33209
|
+
Gemma3ImageProcessor,
|
|
32812
33210
|
Gemma3Model,
|
|
32813
33211
|
Gemma3PreTrainedModel,
|
|
33212
|
+
Gemma3Processor,
|
|
32814
33213
|
Gemma3nAudioFeatureExtractor,
|
|
32815
33214
|
Gemma3nForCausalLM,
|
|
32816
33215
|
Gemma3nForConditionalGeneration,
|
|
@@ -32820,8 +33219,14 @@ export {
|
|
|
32820
33219
|
GemmaModel,
|
|
32821
33220
|
GemmaPreTrainedModel,
|
|
32822
33221
|
GemmaTokenizer,
|
|
33222
|
+
Glm46VImageProcessor,
|
|
33223
|
+
Glm46VProcessor,
|
|
32823
33224
|
GlmForCausalLM,
|
|
32824
33225
|
GlmModel,
|
|
33226
|
+
GlmMoeDsaForCausalLM,
|
|
33227
|
+
GlmMoeDsaModel,
|
|
33228
|
+
GlmMoeDsaPreTrainedModel,
|
|
33229
|
+
GlmOcrForConditionalGeneration,
|
|
32825
33230
|
GlmPreTrainedModel,
|
|
32826
33231
|
GptOssForCausalLM,
|
|
32827
33232
|
GptOssModel,
|
|
@@ -32887,6 +33292,7 @@ export {
|
|
|
32887
33292
|
Lfm2VlForConditionalGeneration,
|
|
32888
33293
|
Lfm2VlImageProcessor,
|
|
32889
33294
|
Lfm2VlProcessor,
|
|
33295
|
+
LightOnOcrForConditionalGeneration,
|
|
32890
33296
|
LiteWhisperForConditionalGeneration,
|
|
32891
33297
|
Llama4ForCausalLM,
|
|
32892
33298
|
Llama4PreTrainedModel,
|
|
@@ -32956,6 +33362,9 @@ export {
|
|
|
32956
33362
|
MimiPreTrainedModel,
|
|
32957
33363
|
MinLengthLogitsProcessor,
|
|
32958
33364
|
MinNewTokensLengthLogitsProcessor,
|
|
33365
|
+
Mistral4ForCausalLM,
|
|
33366
|
+
Mistral4Model,
|
|
33367
|
+
Mistral4PreTrainedModel,
|
|
32959
33368
|
MistralForCausalLM,
|
|
32960
33369
|
MistralModel,
|
|
32961
33370
|
MistralPreTrainedModel,
|
|
@@ -33027,6 +33436,9 @@ export {
|
|
|
33027
33436
|
NanoChatForCausalLM,
|
|
33028
33437
|
NanoChatModel,
|
|
33029
33438
|
NanoChatPreTrainedModel,
|
|
33439
|
+
NemotronHForCausalLM,
|
|
33440
|
+
NemotronHModel,
|
|
33441
|
+
NemotronHPreTrainedModel,
|
|
33030
33442
|
NeoBertForMaskedLM,
|
|
33031
33443
|
NeoBertForQuestionAnswering,
|
|
33032
33444
|
NeoBertForSequenceClassification,
|
|
@@ -33216,6 +33628,9 @@ export {
|
|
|
33216
33628
|
SnacFeatureExtractor,
|
|
33217
33629
|
SnacModel,
|
|
33218
33630
|
SnacPreTrainedModel,
|
|
33631
|
+
SolarOpenForCausalLM,
|
|
33632
|
+
SolarOpenModel,
|
|
33633
|
+
SolarOpenPreTrainedModel,
|
|
33219
33634
|
SpeechT5FeatureExtractor,
|
|
33220
33635
|
SpeechT5ForSpeechToText,
|
|
33221
33636
|
SpeechT5ForTextToSpeech,
|
|
@@ -33413,7 +33828,7 @@ export {
|
|
|
33413
33828
|
|
|
33414
33829
|
onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
|
|
33415
33830
|
(*!
|
|
33416
|
-
* ONNX Runtime Web v1.25.0-dev.
|
|
33831
|
+
* ONNX Runtime Web v1.25.0-dev.20260323-a99aad9d36
|
|
33417
33832
|
* Copyright (c) Microsoft Corporation. All rights reserved.
|
|
33418
33833
|
* Licensed under the MIT License.
|
|
33419
33834
|
*)
|