@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +26 -26
- package/dist/transformers.js +1002 -587
- package/dist/transformers.min.js +23 -19
- package/dist/transformers.node.cjs +1030 -585
- package/dist/transformers.node.min.cjs +21 -17
- package/dist/transformers.node.min.mjs +21 -17
- package/dist/transformers.node.mjs +1000 -585
- package/dist/transformers.web.js +887 -472
- package/dist/transformers.web.min.js +21 -17
- package/package.json +3 -3
- package/src/configs.js +28 -22
- package/src/env.js +1 -1
- package/src/image_processors_utils.js +25 -15
- package/src/models/chmv2/image_processing_chmv2.js +3 -0
- package/src/models/chmv2/modeling_chmv2.js +4 -0
- package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
- package/src/models/eurobert/modeling_eurobert.js +41 -0
- package/src/models/gemma3/image_processing_gemma3.js +3 -0
- package/src/models/gemma3/modeling_gemma3.js +4 -1
- package/src/models/gemma3/processing_gemma3.js +45 -0
- package/src/models/glm46v/image_processing_glm46v.js +12 -0
- package/src/models/glm46v/processing_glm46v.js +5 -0
- package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
- package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
- package/src/models/image_processors.js +3 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
- package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
- package/src/models/mistral4/modeling_mistral4.js +5 -0
- package/src/models/modeling_utils.js +48 -25
- package/src/models/models.js +10 -1
- package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
- package/src/models/processors.js +2 -0
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +226 -168
- package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
- package/src/models/registry.js +19 -8
- package/src/models/solar_open/modeling_solar_open.js +5 -0
- package/src/pipelines.js +1 -0
- package/src/utils/hub.js +4 -1
- package/src/utils/model_registry/ModelRegistry.js +36 -0
- package/src/utils/model_registry/get_available_dtypes.js +68 -0
- package/src/utils/model_registry/get_file_metadata.js +1 -0
- package/src/utils/model_registry/get_model_files.js +7 -60
- package/src/utils/model_registry/resolve_model_type.js +66 -0
- package/types/configs.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +3 -2
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
- package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
- package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
- package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
- package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
- package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
- package/types/models/gemma3/image_processing_gemma3.d.ts +4 -0
- package/types/models/gemma3/image_processing_gemma3.d.ts.map +1 -0
- package/types/models/gemma3/modeling_gemma3.d.ts +4 -1
- package/types/models/gemma3/modeling_gemma3.d.ts.map +1 -1
- package/types/models/gemma3/processing_gemma3.d.ts +20 -0
- package/types/models/gemma3/processing_gemma3.d.ts.map +1 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
- package/types/models/glm46v/processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
- package/types/models/image_processors.d.ts +3 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
- package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
- package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
- package/types/models/modeling_utils.d.ts +2 -3
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +10 -1
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
- package/types/models/processors.d.ts +2 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
- package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
- package/types/pipelines.d.ts +1 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/model_registry/ModelRegistry.d.ts +27 -0
- package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -1
- package/types/utils/model_registry/get_available_dtypes.d.ts +26 -0
- package/types/utils/model_registry/get_available_dtypes.d.ts.map +1 -0
- package/types/utils/model_registry/get_model_files.d.ts +25 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/model_registry/resolve_model_type.d.ts +24 -0
- package/types/utils/model_registry/resolve_model_type.d.ts.map +1 -0
- package/types/models/ast/modeling_ast.d.ts.map +0 -1
- package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
|
@@ -117,6 +117,9 @@ __export(transformers_exports, {
|
|
|
117
117
|
BloomModel: () => BloomModel,
|
|
118
118
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
119
119
|
BloomTokenizer: () => BloomTokenizer,
|
|
120
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
121
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
122
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
120
123
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
121
124
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
122
125
|
CLIPModel: () => CLIPModel,
|
|
@@ -212,6 +215,9 @@ __export(transformers_exports, {
|
|
|
212
215
|
DebertaV2Tokenizer: () => DebertaV2Tokenizer,
|
|
213
216
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
214
217
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
218
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
219
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
220
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
215
221
|
DeiTFeatureExtractor: () => DeiTFeatureExtractor,
|
|
216
222
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
217
223
|
DeiTImageProcessor: () => DeiTImageProcessor,
|
|
@@ -272,6 +278,11 @@ __export(transformers_exports, {
|
|
|
272
278
|
EsmModel: () => EsmModel,
|
|
273
279
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
274
280
|
EsmTokenizer: () => EsmTokenizer,
|
|
281
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
282
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
283
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
284
|
+
EuroBertModel: () => EuroBertModel,
|
|
285
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
275
286
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
276
287
|
ExaoneModel: () => ExaoneModel,
|
|
277
288
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -318,8 +329,11 @@ __export(transformers_exports, {
|
|
|
318
329
|
Gemma2Model: () => Gemma2Model,
|
|
319
330
|
Gemma2PreTrainedModel: () => Gemma2PreTrainedModel,
|
|
320
331
|
Gemma3ForCausalLM: () => Gemma3ForCausalLM,
|
|
332
|
+
Gemma3ForConditionalGeneration: () => Gemma3ForConditionalGeneration,
|
|
333
|
+
Gemma3ImageProcessor: () => Gemma3ImageProcessor,
|
|
321
334
|
Gemma3Model: () => Gemma3Model,
|
|
322
335
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
336
|
+
Gemma3Processor: () => Gemma3Processor,
|
|
323
337
|
Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
|
|
324
338
|
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
325
339
|
Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
|
|
@@ -329,8 +343,14 @@ __export(transformers_exports, {
|
|
|
329
343
|
GemmaModel: () => GemmaModel,
|
|
330
344
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
331
345
|
GemmaTokenizer: () => GemmaTokenizer,
|
|
346
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
347
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
332
348
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
333
349
|
GlmModel: () => GlmModel,
|
|
350
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
351
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
352
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
353
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
334
354
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
335
355
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
336
356
|
GptOssModel: () => GptOssModel,
|
|
@@ -396,6 +416,7 @@ __export(transformers_exports, {
|
|
|
396
416
|
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
397
417
|
Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
|
|
398
418
|
Lfm2VlProcessor: () => Lfm2VlProcessor,
|
|
419
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
399
420
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
400
421
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
401
422
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -465,6 +486,9 @@ __export(transformers_exports, {
|
|
|
465
486
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
466
487
|
MinLengthLogitsProcessor: () => MinLengthLogitsProcessor,
|
|
467
488
|
MinNewTokensLengthLogitsProcessor: () => MinNewTokensLengthLogitsProcessor,
|
|
489
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
490
|
+
Mistral4Model: () => Mistral4Model,
|
|
491
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
468
492
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
469
493
|
MistralModel: () => MistralModel,
|
|
470
494
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -536,6 +560,9 @@ __export(transformers_exports, {
|
|
|
536
560
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
537
561
|
NanoChatModel: () => NanoChatModel,
|
|
538
562
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
563
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
564
|
+
NemotronHModel: () => NemotronHModel,
|
|
565
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
539
566
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
540
567
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
541
568
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -725,6 +752,9 @@ __export(transformers_exports, {
|
|
|
725
752
|
SnacFeatureExtractor: () => SnacFeatureExtractor,
|
|
726
753
|
SnacModel: () => SnacModel,
|
|
727
754
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
755
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
756
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
757
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
728
758
|
SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
|
|
729
759
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
730
760
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
@@ -925,7 +955,7 @@ var import_node_fs = __toESM(require("fs"), 1);
|
|
|
925
955
|
var import_node_path = __toESM(require("path"), 1);
|
|
926
956
|
var import_node_url = __toESM(require("url"), 1);
|
|
927
957
|
var import_meta = {};
|
|
928
|
-
var VERSION = "4.0.0-next.7";
|
|
958
|
+
var VERSION = "4.0.0-next.9";
|
|
929
959
|
var HAS_SELF = typeof self !== "undefined";
|
|
930
960
|
var IS_FS_AVAILABLE = !isEmpty(import_node_fs.default);
|
|
931
961
|
var IS_PATH_AVAILABLE = !isEmpty(import_node_path.default);
|
|
@@ -1155,7 +1185,7 @@ var logger = {
|
|
|
1155
1185
|
}
|
|
1156
1186
|
};
|
|
1157
1187
|
|
|
1158
|
-
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.
|
|
1188
|
+
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
|
|
1159
1189
|
var DictionarySplitter = class {
|
|
1160
1190
|
/**
|
|
1161
1191
|
* @param dictionary The dictionary of words to use for splitting.
|
|
@@ -2811,10 +2841,10 @@ var BPE = class extends TokenizerModel_default {
|
|
|
2811
2841
|
);
|
|
2812
2842
|
if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
|
|
2813
2843
|
output_tokens.push(...byte_tokens);
|
|
2814
|
-
} else {
|
|
2844
|
+
} else if (this.unk_token != null) {
|
|
2815
2845
|
output_tokens.push(this.unk_token);
|
|
2816
2846
|
}
|
|
2817
|
-
} else {
|
|
2847
|
+
} else if (this.unk_token != null) {
|
|
2818
2848
|
output_tokens.push(this.unk_token);
|
|
2819
2849
|
}
|
|
2820
2850
|
}
|
|
@@ -6664,14 +6694,14 @@ var Random = class {
|
|
|
6664
6694
|
* @returns {number} A normally distributed random value.
|
|
6665
6695
|
*/
|
|
6666
6696
|
gauss(mu = 0, sigma = 1) {
|
|
6667
|
-
let
|
|
6697
|
+
let z2 = this._gauss_next;
|
|
6668
6698
|
this._gauss_next = null;
|
|
6669
|
-
if (
|
|
6699
|
+
if (z2 === null) {
|
|
6670
6700
|
const x2pi = this.random() * 2 * Math.PI, g2rad = Math.sqrt(-2 * Math.log(1 - this.random()));
|
|
6671
|
-
|
|
6701
|
+
z2 = Math.cos(x2pi) * g2rad;
|
|
6672
6702
|
this._gauss_next = Math.sin(x2pi) * g2rad;
|
|
6673
6703
|
}
|
|
6674
|
-
return mu +
|
|
6704
|
+
return mu + z2 * sigma;
|
|
6675
6705
|
}
|
|
6676
6706
|
/**
|
|
6677
6707
|
* Shuffles an array in-place using the Fisher-Yates algorithm.
|
|
@@ -7426,13 +7456,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
|
|
|
7426
7456
|
wrapped_progress
|
|
7427
7457
|
);
|
|
7428
7458
|
} else if (typeof response !== "string") {
|
|
7459
|
+
const headers = new Headers(response.headers);
|
|
7460
|
+
headers.set("content-length", result.byteLength.toString());
|
|
7429
7461
|
await cache2.put(
|
|
7430
7462
|
cacheKey,
|
|
7431
7463
|
new Response(
|
|
7432
7464
|
/** @type {any} */
|
|
7433
7465
|
result,
|
|
7434
7466
|
{
|
|
7435
|
-
headers: response.headers
|
|
7467
|
+
headers
|
|
7436
7468
|
}
|
|
7437
7469
|
)
|
|
7438
7470
|
).catch((err) => {
|
|
@@ -8390,7 +8422,7 @@ var uint16_to_float32 = /* @__PURE__ */ (function() {
|
|
|
8390
8422
|
// src/backends/onnx.js
|
|
8391
8423
|
var ONNX_NODE = __toESM(require("onnxruntime-node"), 1);
|
|
8392
8424
|
|
|
8393
|
-
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.
|
|
8425
|
+
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260323-a99aad9d36/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
|
|
8394
8426
|
var ort_webgpu_bundle_min_exports = {};
|
|
8395
8427
|
__export(ort_webgpu_bundle_min_exports, {
|
|
8396
8428
|
InferenceSession: () => Jf,
|
|
@@ -9159,7 +9191,7 @@ async function ts(a = {}) {
|
|
|
9159
9191
|
throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
|
|
9160
9192
|
}
|
|
9161
9193
|
function Ye() {
|
|
9162
|
-
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn,
|
|
9194
|
+
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, q: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, s: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: lf, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: uf, A: df, r: cf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
|
|
9163
9195
|
}
|
|
9164
9196
|
async function bt() {
|
|
9165
9197
|
function e(o, u) {
|
|
@@ -9222,14 +9254,14 @@ async function ts(a = {}) {
|
|
|
9222
9254
|
gt.push(t), Je[e.Nc] = t, t.Nc = e.Nc;
|
|
9223
9255
|
var n = { Oc: "run", he: e.ge, Wc: e.Wc, Nc: e.Nc };
|
|
9224
9256
|
return t.postMessage(n, e.Yc), 0;
|
|
9225
|
-
},
|
|
9257
|
+
}, G = 0, V = (e, t, ...n) => {
|
|
9226
9258
|
var o, u = 16 * n.length, c = P(), h = Ft(u), b = h >>> 3;
|
|
9227
9259
|
for (o of n) typeof o == "bigint" ? ((p(), pe)[b++ >>> 0] = 1n, (p(), pe)[b++ >>> 0] = o) : ((p(), pe)[b++ >>> 0] = 0n, (p(), ae)[b++ >>> 0] = o);
|
|
9228
9260
|
return e = Lo(e, 0, u, h, t), D(c), e;
|
|
9229
9261
|
};
|
|
9230
9262
|
function qe(e) {
|
|
9231
9263
|
if (i) return V(0, 1, e);
|
|
9232
|
-
if (S = e, !(0 <
|
|
9264
|
+
if (S = e, !(0 < G)) {
|
|
9233
9265
|
for (var t of gt) Se(t);
|
|
9234
9266
|
for (t of We) Se(t);
|
|
9235
9267
|
We = [], gt = [], Je = {}, W = true;
|
|
@@ -9274,7 +9306,7 @@ async function ts(a = {}) {
|
|
|
9274
9306
|
We.push(e);
|
|
9275
9307
|
}
|
|
9276
9308
|
var Fe, zs = (e, t) => {
|
|
9277
|
-
|
|
9309
|
+
G = 0, e = zr(e, t), 0 < G ? S = e : Fr(e);
|
|
9278
9310
|
}, Ct = [], Ut = 0, me = (e) => -9007199254740992 > e || 9007199254740992 < e ? NaN : Number(e);
|
|
9279
9311
|
function Vs(e) {
|
|
9280
9312
|
var t = new wr(e >>>= 0);
|
|
@@ -9626,7 +9658,7 @@ async function ts(a = {}) {
|
|
|
9626
9658
|
}
|
|
9627
9659
|
var he = (e) => {
|
|
9628
9660
|
if (!W) try {
|
|
9629
|
-
if (e(), !(0 <
|
|
9661
|
+
if (e(), !(0 < G)) try {
|
|
9630
9662
|
i ? Wt() && Fr(S) : br(S);
|
|
9631
9663
|
} catch (t) {
|
|
9632
9664
|
t instanceof wt || t == "unwind" || y(0, t);
|
|
@@ -9654,7 +9686,7 @@ async function ts(a = {}) {
|
|
|
9654
9686
|
return (t ? Vr[t] : of[e])(...Ir);
|
|
9655
9687
|
}
|
|
9656
9688
|
var Ei = () => {
|
|
9657
|
-
|
|
9689
|
+
G = 0;
|
|
9658
9690
|
};
|
|
9659
9691
|
function Si(e) {
|
|
9660
9692
|
e >>>= 0, i ? postMessage({ Oc: "cleanupThread", ie: e }) : yn(Je[e]);
|
|
@@ -9674,7 +9706,7 @@ async function ts(a = {}) {
|
|
|
9674
9706
|
try {
|
|
9675
9707
|
return e(...n);
|
|
9676
9708
|
} finally {
|
|
9677
|
-
W || (_t.pop(), Me && Ge === 1 && _t.length === 0 && (Ge = 0,
|
|
9709
|
+
W || (_t.pop(), Me && Ge === 1 && _t.length === 0 && (Ge = 0, G += 1, Pt(wa), typeof Fibers < "u" && Fibers.De()));
|
|
9678
9710
|
}
|
|
9679
9711
|
};
|
|
9680
9712
|
return jn.set(e, t), t;
|
|
@@ -9689,7 +9721,7 @@ async function ts(a = {}) {
|
|
|
9689
9721
|
try {
|
|
9690
9722
|
var c = (function() {
|
|
9691
9723
|
var E = (p(), x)[Me + 8 >>> 2 >>> 0];
|
|
9692
|
-
return E = Vn.get(E), E = jn.get(E), --
|
|
9724
|
+
return E = Vn.get(E), E = jn.get(E), --G, E();
|
|
9693
9725
|
})();
|
|
9694
9726
|
} catch (E) {
|
|
9695
9727
|
c = E, u = true;
|
|
@@ -9880,7 +9912,7 @@ async function ts(a = {}) {
|
|
|
9880
9912
|
return L(ct(e >>> 0, t >>> 0));
|
|
9881
9913
|
}
|
|
9882
9914
|
var ou = () => {
|
|
9883
|
-
throw
|
|
9915
|
+
throw G += 1, "unwind";
|
|
9884
9916
|
};
|
|
9885
9917
|
function au() {
|
|
9886
9918
|
return 4294901760;
|
|
@@ -9973,15 +10005,15 @@ async function ts(a = {}) {
|
|
|
9973
10005
|
}
|
|
9974
10006
|
(b = (p(), A)[c + 24 >>> 2 >>> 0]) && (b = { label: Ne(b + 4) }, e.defaultQueue = b), e.label = Ne(c + 4);
|
|
9975
10007
|
}
|
|
9976
|
-
|
|
9977
|
-
--
|
|
9978
|
-
ce[u >>> 0] = B.queue, ce[o >>> 0] = B, lt(n, B.lost.then((ue) => {
|
|
10008
|
+
G += 1, lt(t, h.requestDevice(e).then((B) => {
|
|
10009
|
+
--G, he(() => {
|
|
10010
|
+
ce[u >>> 0] = B.queue, ce[o >>> 0] = B, G += 1, lt(n, B.lost.then((ue) => {
|
|
9979
10011
|
he(() => {
|
|
9980
10012
|
B.onuncapturederror = () => {
|
|
9981
10013
|
};
|
|
9982
10014
|
var ye = P(), fe = Ce(ue.message);
|
|
9983
10015
|
_r(n, yu[ue.reason], fe), D(ye);
|
|
9984
|
-
});
|
|
10016
|
+
}), --G;
|
|
9985
10017
|
})), B.onuncapturederror = (ue) => {
|
|
9986
10018
|
var ye = 5;
|
|
9987
10019
|
ue.error instanceof GPUValidationError ? ye = 2 : ue.error instanceof GPUOutOfMemoryError ? ye = 3 : ue.error instanceof GPUInternalError && (ye = 4);
|
|
@@ -9990,7 +10022,7 @@ async function ts(a = {}) {
|
|
|
9990
10022
|
}, "adapterInfo" in B || (B.adapterInfo = h.info), kr(t, 1, o, 0);
|
|
9991
10023
|
});
|
|
9992
10024
|
}, (B) => {
|
|
9993
|
-
--
|
|
10025
|
+
--G, he(() => {
|
|
9994
10026
|
var ue = P(), ye = Ce(B.message);
|
|
9995
10027
|
kr(t, 3, o, ye), n && _r(n, 4, ye), D(ue);
|
|
9996
10028
|
});
|
|
@@ -10033,12 +10065,12 @@ async function ts(a = {}) {
|
|
|
10033
10065
|
function vu(e, t, n, o, u) {
|
|
10034
10066
|
e >>>= 0, t = me(t), n = me(n), u >>>= 0;
|
|
10035
10067
|
var c = O(e);
|
|
10036
|
-
Re[e] = [], u == 4294967295 && (u = void 0),
|
|
10037
|
-
--
|
|
10068
|
+
Re[e] = [], u == 4294967295 && (u = void 0), G += 1, lt(t, c.mapAsync(n, o >>> 0, u).then(() => {
|
|
10069
|
+
--G, he(() => {
|
|
10038
10070
|
Rr(t, 1, 0);
|
|
10039
10071
|
});
|
|
10040
10072
|
}, (h) => {
|
|
10041
|
-
--
|
|
10073
|
+
--G, he(() => {
|
|
10042
10074
|
P();
|
|
10043
10075
|
var b = Ce(h.message);
|
|
10044
10076
|
Rr(t, h.name === "AbortError" ? 4 : h.name === "OperationError" ? 3 : 0, b), delete Re[e];
|
|
@@ -10067,12 +10099,12 @@ async function ts(a = {}) {
|
|
|
10067
10099
|
return ce[n >>> 0] = u, o && (Re[n] = []), true;
|
|
10068
10100
|
}
|
|
10069
10101
|
function Iu(e, t, n, o) {
|
|
10070
|
-
e >>>= 0, t = me(t), o >>>= 0, n = du(n >>> 0), e = O(e),
|
|
10071
|
-
--
|
|
10102
|
+
e >>>= 0, t = me(t), o >>>= 0, n = du(n >>> 0), e = O(e), G += 1, lt(t, e.createComputePipelineAsync(n).then((u) => {
|
|
10103
|
+
--G, he(() => {
|
|
10072
10104
|
ce[o >>> 0] = u, Pr(t, 1, o, 0);
|
|
10073
10105
|
});
|
|
10074
10106
|
}, (u) => {
|
|
10075
|
-
--
|
|
10107
|
+
--G, he(() => {
|
|
10076
10108
|
var c = P(), h = Ce(u.message);
|
|
10077
10109
|
Pr(t, u.reason === "validation" ? 3 : u.reason === "internal" ? 4 : 0, o, h), D(c);
|
|
10078
10110
|
});
|
|
@@ -10087,15 +10119,15 @@ async function ts(a = {}) {
|
|
|
10087
10119
|
(e = O(e)).onuncapturederror = null, e.destroy();
|
|
10088
10120
|
};
|
|
10089
10121
|
function Ou(e, t) {
|
|
10090
|
-
t = me(t), e = O(e >>> 0),
|
|
10091
|
-
--
|
|
10122
|
+
t = me(t), e = O(e >>> 0), G += 1, lt(t, e.popErrorScope().then((n) => {
|
|
10123
|
+
--G, he(() => {
|
|
10092
10124
|
var o = 5;
|
|
10093
10125
|
n ? n instanceof GPUValidationError ? o = 2 : n instanceof GPUOutOfMemoryError ? o = 3 : n instanceof GPUInternalError && (o = 4) : o = 1;
|
|
10094
10126
|
var u = P(), c = n ? Ce(n.message) : 0;
|
|
10095
10127
|
Nr(t, 1, o, c), D(u);
|
|
10096
10128
|
});
|
|
10097
10129
|
}, (n) => {
|
|
10098
|
-
--
|
|
10130
|
+
--G, he(() => {
|
|
10099
10131
|
var o = P(), u = Ce(n.message);
|
|
10100
10132
|
Nr(t, 1, 5, u), D(o);
|
|
10101
10133
|
});
|
|
@@ -10106,8 +10138,8 @@ async function ts(a = {}) {
|
|
|
10106
10138
|
var u = { featureLevel: pu[(p(), x)[n + 4 >>> 2 >>> 0]], powerPreference: mu[(p(), x)[n + 8 >>> 2 >>> 0]], forceFallbackAdapter: !!(p(), A)[n + 12 >>> 2 >>> 0] };
|
|
10107
10139
|
(e = (p(), A)[n >>> 2 >>> 0]) !== 0 && (p(), u.Fe = !!(p(), A)[e + 8 >>> 2 >>> 0]);
|
|
10108
10140
|
}
|
|
10109
|
-
"gpu" in navigator ? (
|
|
10110
|
-
--
|
|
10141
|
+
"gpu" in navigator ? (G += 1, lt(t, navigator.gpu.requestAdapter(u).then((c) => {
|
|
10142
|
+
--G, he(() => {
|
|
10111
10143
|
if (c) ce[o >>> 0] = c, Et(t, 1, o, 0);
|
|
10112
10144
|
else {
|
|
10113
10145
|
var h = P(), b = Ce("WebGPU not available on this browser (requestAdapter returned null)");
|
|
@@ -10115,7 +10147,7 @@ async function ts(a = {}) {
|
|
|
10115
10147
|
}
|
|
10116
10148
|
});
|
|
10117
10149
|
}, (c) => {
|
|
10118
|
-
--
|
|
10150
|
+
--G, he(() => {
|
|
10119
10151
|
var h = P(), b = Ce(c.message);
|
|
10120
10152
|
Et(t, 4, o, b), D(h);
|
|
10121
10153
|
});
|
|
@@ -10346,7 +10378,7 @@ async function ts(a = {}) {
|
|
|
10346
10378
|
Te(`invalid type for getValue: ${t}`);
|
|
10347
10379
|
}
|
|
10348
10380
|
}, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
|
|
10349
|
-
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = {
|
|
10381
|
+
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 937012: (e, t, n, o, u) => {
|
|
10350
10382
|
if (r === void 0 || !r.Uc) return 1;
|
|
10351
10383
|
if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
|
|
10352
10384
|
if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
|
|
@@ -10366,11 +10398,11 @@ async function ts(a = {}) {
|
|
|
10366
10398
|
} catch {
|
|
10367
10399
|
return 4;
|
|
10368
10400
|
}
|
|
10369
|
-
},
|
|
10401
|
+
}, 937836: (e, t, n) => {
|
|
10370
10402
|
r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
|
|
10371
|
-
},
|
|
10403
|
+
}, 937900: () => r.me(), 937942: (e) => {
|
|
10372
10404
|
r.jd(e);
|
|
10373
|
-
},
|
|
10405
|
+
}, 937979: () => typeof wasmOffsetConverter < "u" };
|
|
10374
10406
|
function af(e, t, n, o) {
|
|
10375
10407
|
var u = P();
|
|
10376
10408
|
try {
|
|
@@ -10389,12 +10421,12 @@ async function ts(a = {}) {
|
|
|
10389
10421
|
N(1, 0);
|
|
10390
10422
|
}
|
|
10391
10423
|
}
|
|
10392
|
-
function uf(e
|
|
10393
|
-
var
|
|
10424
|
+
function uf(e) {
|
|
10425
|
+
var t = P();
|
|
10394
10426
|
try {
|
|
10395
|
-
|
|
10396
|
-
} catch (
|
|
10397
|
-
if (D(
|
|
10427
|
+
Ro(e);
|
|
10428
|
+
} catch (n) {
|
|
10429
|
+
if (D(t), n !== n + 0) throw n;
|
|
10398
10430
|
N(1, 0);
|
|
10399
10431
|
}
|
|
10400
10432
|
}
|
|
@@ -10407,25 +10439,16 @@ async function ts(a = {}) {
|
|
|
10407
10439
|
N(1, 0);
|
|
10408
10440
|
}
|
|
10409
10441
|
}
|
|
10410
|
-
function cf(e) {
|
|
10411
|
-
var
|
|
10412
|
-
try {
|
|
10413
|
-
Ro(e);
|
|
10414
|
-
} catch (n) {
|
|
10415
|
-
if (D(t), n !== n + 0) throw n;
|
|
10416
|
-
N(1, 0);
|
|
10417
|
-
}
|
|
10418
|
-
}
|
|
10419
|
-
function df(e, t, n, o, u, c, h) {
|
|
10420
|
-
var b = P();
|
|
10442
|
+
function cf(e, t, n) {
|
|
10443
|
+
var o = P();
|
|
10421
10444
|
try {
|
|
10422
|
-
|
|
10423
|
-
} catch (
|
|
10424
|
-
if (D(
|
|
10445
|
+
_o(e, t, n);
|
|
10446
|
+
} catch (u) {
|
|
10447
|
+
if (D(o), u !== u + 0) throw u;
|
|
10425
10448
|
N(1, 0);
|
|
10426
10449
|
}
|
|
10427
10450
|
}
|
|
10428
|
-
function
|
|
10451
|
+
function df(e, t) {
|
|
10429
10452
|
var n = P();
|
|
10430
10453
|
try {
|
|
10431
10454
|
Vo(e, t);
|
|
@@ -10434,6 +10457,15 @@ async function ts(a = {}) {
|
|
|
10434
10457
|
N(1, 0);
|
|
10435
10458
|
}
|
|
10436
10459
|
}
|
|
10460
|
+
function lf(e, t, n, o, u, c, h) {
|
|
10461
|
+
var b = P();
|
|
10462
|
+
try {
|
|
10463
|
+
return Wo(e, t, n, o, u, c, h);
|
|
10464
|
+
} catch (E) {
|
|
10465
|
+
if (D(b), E !== E + 0) throw E;
|
|
10466
|
+
N(1, 0);
|
|
10467
|
+
}
|
|
10468
|
+
}
|
|
10437
10469
|
function pf(e, t, n, o, u, c) {
|
|
10438
10470
|
var h = P();
|
|
10439
10471
|
try {
|
|
@@ -10863,7 +10895,7 @@ var nc;
|
|
|
10863
10895
|
var oc;
|
|
10864
10896
|
var ac;
|
|
10865
10897
|
var qt;
|
|
10866
|
-
var
|
|
10898
|
+
var z;
|
|
10867
10899
|
var je = k(() => {
|
|
10868
10900
|
"use strict";
|
|
10869
10901
|
Yt();
|
|
@@ -10919,19 +10951,19 @@ var je = k(() => {
|
|
|
10919
10951
|
rr = false, ds = true, H(M);
|
|
10920
10952
|
});
|
|
10921
10953
|
})), await Promise.race(C), S) throw new Error(`WebAssembly backend initializing failed due to timeout: ${r}ms`);
|
|
10922
|
-
},
|
|
10954
|
+
}, z = () => {
|
|
10923
10955
|
if (nn && rn) return rn;
|
|
10924
10956
|
throw new Error("WebAssembly is not initialized yet.");
|
|
10925
10957
|
};
|
|
10926
10958
|
});
|
|
10927
10959
|
var be;
|
|
10928
10960
|
var Lt;
|
|
10929
|
-
var
|
|
10961
|
+
var $;
|
|
10930
10962
|
var nr = k(() => {
|
|
10931
10963
|
"use strict";
|
|
10932
10964
|
je();
|
|
10933
10965
|
be = (a, r) => {
|
|
10934
|
-
let s =
|
|
10966
|
+
let s = z(), f = s.lengthBytesUTF8(a) + 1, i = s._malloc(f);
|
|
10935
10967
|
return s.stringToUTF8(a, i, f), r.push(i), i;
|
|
10936
10968
|
}, Lt = (a, r, s, f) => {
|
|
10937
10969
|
if (typeof a == "object" && a !== null) {
|
|
@@ -10945,8 +10977,8 @@ var nr = k(() => {
|
|
|
10945
10977
|
else if (typeof d == "boolean") f(l, d ? "1" : "0");
|
|
10946
10978
|
else throw new Error(`Can't handle extra config type: ${typeof d}`);
|
|
10947
10979
|
});
|
|
10948
|
-
},
|
|
10949
|
-
let r =
|
|
10980
|
+
}, $ = (a) => {
|
|
10981
|
+
let r = z(), s = r.stackSave();
|
|
10950
10982
|
try {
|
|
10951
10983
|
let f = r.PTR_SIZE, i = r.stackAlloc(2 * f);
|
|
10952
10984
|
r._OrtGetLastError(i, i + f);
|
|
@@ -10963,7 +10995,7 @@ var ps = k(() => {
|
|
|
10963
10995
|
je();
|
|
10964
10996
|
nr();
|
|
10965
10997
|
ls = (a) => {
|
|
10966
|
-
let r =
|
|
10998
|
+
let r = z(), s = 0, f = [], i = a || {};
|
|
10967
10999
|
try {
|
|
10968
11000
|
if (a?.logSeverityLevel === void 0) i.logSeverityLevel = 2;
|
|
10969
11001
|
else if (typeof a.logSeverityLevel != "number" || !Number.isInteger(a.logSeverityLevel) || a.logSeverityLevel < 0 || a.logSeverityLevel > 4) throw new Error(`log severity level is not valid: ${a.logSeverityLevel}`);
|
|
@@ -10971,9 +11003,9 @@ var ps = k(() => {
|
|
|
10971
11003
|
else if (typeof a.logVerbosityLevel != "number" || !Number.isInteger(a.logVerbosityLevel)) throw new Error(`log verbosity level is not valid: ${a.logVerbosityLevel}`);
|
|
10972
11004
|
a?.terminate === void 0 && (i.terminate = false);
|
|
10973
11005
|
let d = 0;
|
|
10974
|
-
return a?.tag !== void 0 && (d = be(a.tag, f)), s = r._OrtCreateRunOptions(i.logSeverityLevel, i.logVerbosityLevel, !!i.terminate, d), s === 0 &&
|
|
11006
|
+
return a?.tag !== void 0 && (d = be(a.tag, f)), s = r._OrtCreateRunOptions(i.logSeverityLevel, i.logVerbosityLevel, !!i.terminate, d), s === 0 && $("Can't create run options."), a?.extra !== void 0 && Lt(a.extra, "", /* @__PURE__ */ new WeakSet(), (l, m) => {
|
|
10975
11007
|
let y = be(l, f), w = be(m, f);
|
|
10976
|
-
r._OrtAddRunConfigEntry(s, y, w) !== 0 &&
|
|
11008
|
+
r._OrtAddRunConfigEntry(s, y, w) !== 0 && $(`Can't set a run config entry: ${l} - ${m}.`);
|
|
10977
11009
|
}), [s, f];
|
|
10978
11010
|
} catch (d) {
|
|
10979
11011
|
throw s !== 0 && r._OrtReleaseRunOptions(s), f.forEach((l) => r._free(l)), d;
|
|
@@ -11021,7 +11053,7 @@ var hs = k(() => {
|
|
|
11021
11053
|
r.use_ort_model_bytes_directly || (r.use_ort_model_bytes_directly = "1"), a.executionProviders && a.executionProviders.some((s) => (typeof s == "string" ? s : s.name) === "webgpu") && (a.enableMemPattern = false);
|
|
11022
11054
|
}, on = (a, r, s, f) => {
|
|
11023
11055
|
let i = be(r, f), d = be(s, f);
|
|
11024
|
-
|
|
11056
|
+
z()._OrtAddSessionConfigEntry(a, i, d) !== 0 && $(`Can't set a session config entry: ${r} - ${s}.`);
|
|
11025
11057
|
}, ot = (a, r, s, f) => {
|
|
11026
11058
|
let i = be(r, f), d = be(s, f);
|
|
11027
11059
|
a.push([i, d]);
|
|
@@ -11052,7 +11084,7 @@ var hs = k(() => {
|
|
|
11052
11084
|
}
|
|
11053
11085
|
S.validationMode && ot(l, "validationMode", S.validationMode, s);
|
|
11054
11086
|
}
|
|
11055
|
-
let v =
|
|
11087
|
+
let v = z().webgpuRegisterDevice(g);
|
|
11056
11088
|
if (v) {
|
|
11057
11089
|
let [S, C, R] = v;
|
|
11058
11090
|
ot(l, "deviceId", S.toString(), s), ot(l, "webgpuInstance", C.toString(), s), ot(l, "webgpuDevice", R.toString(), s);
|
|
@@ -11067,13 +11099,13 @@ var hs = k(() => {
|
|
|
11067
11099
|
}
|
|
11068
11100
|
let m = be(d, s), y = l.length, w = 0, T = 0;
|
|
11069
11101
|
if (y > 0) {
|
|
11070
|
-
w =
|
|
11071
|
-
for (let g = 0; g < y; g++)
|
|
11102
|
+
w = z()._malloc(y * z().PTR_SIZE), s.push(w), T = z()._malloc(y * z().PTR_SIZE), s.push(T);
|
|
11103
|
+
for (let g = 0; g < y; g++) z().setValue(w + g * z().PTR_SIZE, l[g][0], "*"), z().setValue(T + g * z().PTR_SIZE, l[g][1], "*");
|
|
11072
11104
|
}
|
|
11073
|
-
await
|
|
11105
|
+
await z()._OrtAppendExecutionProvider(a, m, w, T, y) !== 0 && $(`Can't append execution provider: ${d}.`);
|
|
11074
11106
|
}
|
|
11075
11107
|
}, ms = async (a) => {
|
|
11076
|
-
let r =
|
|
11108
|
+
let r = z(), s = 0, f = [], i = a || {};
|
|
11077
11109
|
uc(i);
|
|
11078
11110
|
try {
|
|
11079
11111
|
let d = sc(i.graphOptimizationLevel ?? "all"), l = ic(i.executionMode ?? "sequential"), m = typeof i.logId == "string" ? be(i.logId, f) : 0, y = i.logSeverityLevel ?? 2;
|
|
@@ -11081,7 +11113,7 @@ var hs = k(() => {
|
|
|
11081
11113
|
let w = i.logVerbosityLevel ?? 0;
|
|
11082
11114
|
if (!Number.isInteger(w) || w < 0 || w > 4) throw new Error(`log verbosity level is not valid: ${w}`);
|
|
11083
11115
|
let T = typeof i.optimizedModelFilePath == "string" ? be(i.optimizedModelFilePath, f) : 0;
|
|
11084
|
-
if (s = r._OrtCreateSessionOptions(d, !!i.enableCpuMemArena, !!i.enableMemPattern, l, !!i.enableProfiling, 0, m, y, w, T), s === 0 &&
|
|
11116
|
+
if (s = r._OrtCreateSessionOptions(d, !!i.enableCpuMemArena, !!i.enableMemPattern, l, !!i.enableProfiling, 0, m, y, w, T), s === 0 && $("Can't create session options."), i.executionProviders && await fc(s, i, f), i.enableGraphCapture !== void 0) {
|
|
11085
11117
|
if (typeof i.enableGraphCapture != "boolean") throw new Error(`enableGraphCapture must be a boolean value: ${i.enableGraphCapture}`);
|
|
11086
11118
|
on(s, "enableGraphCapture", i.enableGraphCapture.toString(), f);
|
|
11087
11119
|
}
|
|
@@ -11089,13 +11121,13 @@ var hs = k(() => {
|
|
|
11089
11121
|
if (typeof g != "string") throw new Error(`free dimension override name must be a string: ${g}`);
|
|
11090
11122
|
if (typeof v != "number" || !Number.isInteger(v) || v < 0) throw new Error(`free dimension override value must be a non-negative integer: ${v}`);
|
|
11091
11123
|
let S = be(g, f);
|
|
11092
|
-
r._OrtAddFreeDimensionOverride(s, S, v) !== 0 &&
|
|
11124
|
+
r._OrtAddFreeDimensionOverride(s, S, v) !== 0 && $(`Can't set a free dimension override: ${g} - ${v}.`);
|
|
11093
11125
|
}
|
|
11094
11126
|
return i.extra !== void 0 && Lt(i.extra, "", /* @__PURE__ */ new WeakSet(), (g, v) => {
|
|
11095
11127
|
on(s, g, v, f);
|
|
11096
11128
|
}), [s, f];
|
|
11097
11129
|
} catch (d) {
|
|
11098
|
-
throw s !== 0 && r._OrtReleaseSessionOptions(s) !== 0 &&
|
|
11130
|
+
throw s !== 0 && r._OrtReleaseSessionOptions(s) !== 0 && $("Can't release session options."), f.forEach((l) => r._free(l)), d;
|
|
11099
11131
|
}
|
|
11100
11132
|
};
|
|
11101
11133
|
});
|
|
@@ -11665,7 +11697,7 @@ var Os = k(() => {
|
|
|
11665
11697
|
return l ? l.push(d) : this.temporarySessionTensorIds.set(r, [d]), d;
|
|
11666
11698
|
}
|
|
11667
11699
|
uploadTensor(r, s) {
|
|
11668
|
-
if (
|
|
11700
|
+
if (!z().shouldTransferToMLTensor) throw new Error("Trying to upload to a MLTensor while shouldTransferToMLTensor is false");
|
|
11669
11701
|
le("verbose", () => `[WebNN] uploadTensor {tensorId: ${r}, data: ${s.byteLength}}`), this.tensorManager.upload(r, s);
|
|
11670
11702
|
}
|
|
11671
11703
|
async downloadTensor(r, s) {
|
|
@@ -11771,11 +11803,11 @@ var Kr = k(() => {
|
|
|
11771
11803
|
nr();
|
|
11772
11804
|
sn();
|
|
11773
11805
|
yc = (a, r) => {
|
|
11774
|
-
|
|
11806
|
+
z()._OrtInit(a, r) !== 0 && $("Can't initialize onnxruntime.");
|
|
11775
11807
|
}, Jt = async (a) => {
|
|
11776
11808
|
yc(a.wasm.numThreads, Ot(a.logLevel));
|
|
11777
11809
|
}, Xt = async (a, r) => {
|
|
11778
|
-
|
|
11810
|
+
z().asyncInit?.();
|
|
11779
11811
|
let s = a.webgpu.adapter;
|
|
11780
11812
|
if (r === "webgpu") {
|
|
11781
11813
|
if (typeof navigator > "u" || !navigator.gpu) throw new Error("WebGPU is not supported in current environment");
|
|
@@ -11790,29 +11822,29 @@ var Kr = k(() => {
|
|
|
11790
11822
|
}
|
|
11791
11823
|
}
|
|
11792
11824
|
if (r === "webnn" && (typeof navigator > "u" || !navigator.ml)) throw new Error("WebNN is not supported in current environment");
|
|
11793
|
-
if (r === "webgpu" &&
|
|
11825
|
+
if (r === "webgpu" && z().webgpuInit((f) => {
|
|
11794
11826
|
a.webgpu.device = f;
|
|
11795
11827
|
}), r === "webnn") {
|
|
11796
11828
|
let f = new (Os(), $t(Ls)).WebNNBackend(a);
|
|
11797
|
-
|
|
11829
|
+
z().webnnInit([f, () => f.reserveTensorId(), (i) => f.releaseTensorId(i), async (i, d, l, m, y) => f.ensureTensor(i, d, l, m, y), (i, d) => {
|
|
11798
11830
|
f.uploadTensor(i, d);
|
|
11799
11831
|
}, async (i, d) => f.downloadTensor(i, d), (i, d) => f.registerMLContext(i, d), !!a.trace]);
|
|
11800
11832
|
}
|
|
11801
11833
|
}, it = /* @__PURE__ */ new Map(), bc = (a) => {
|
|
11802
|
-
let r =
|
|
11834
|
+
let r = z(), s = r.stackSave();
|
|
11803
11835
|
try {
|
|
11804
11836
|
let f = r.PTR_SIZE, i = r.stackAlloc(2 * f);
|
|
11805
|
-
r._OrtGetInputOutputCount(a, i, i + f) !== 0 &&
|
|
11837
|
+
r._OrtGetInputOutputCount(a, i, i + f) !== 0 && $("Can't get session input/output count.");
|
|
11806
11838
|
let l = f === 4 ? "i32" : "i64";
|
|
11807
11839
|
return [Number(r.getValue(i, l)), Number(r.getValue(i + f, l))];
|
|
11808
11840
|
} finally {
|
|
11809
11841
|
r.stackRestore(s);
|
|
11810
11842
|
}
|
|
11811
11843
|
}, Bs = (a, r) => {
|
|
11812
|
-
let s =
|
|
11844
|
+
let s = z(), f = s.stackSave(), i = 0;
|
|
11813
11845
|
try {
|
|
11814
11846
|
let d = s.PTR_SIZE, l = s.stackAlloc(2 * d);
|
|
11815
|
-
s._OrtGetInputOutputMetadata(a, r, l, l + d) !== 0 &&
|
|
11847
|
+
s._OrtGetInputOutputMetadata(a, r, l, l + d) !== 0 && $("Can't get session input/output metadata.");
|
|
11816
11848
|
let y = Number(s.getValue(l, "*"));
|
|
11817
11849
|
i = Number(s.getValue(l + d, "*"));
|
|
11818
11850
|
let w = s.HEAP32[i / 4];
|
|
@@ -11827,11 +11859,11 @@ var Kr = k(() => {
|
|
|
11827
11859
|
s.stackRestore(f), i !== 0 && s._OrtFree(i);
|
|
11828
11860
|
}
|
|
11829
11861
|
}, xt = (a) => {
|
|
11830
|
-
let r =
|
|
11862
|
+
let r = z(), s = r._malloc(a.byteLength);
|
|
11831
11863
|
if (s === 0) throw new Error(`Can't create a session. failed to allocate a buffer of size ${a.byteLength}.`);
|
|
11832
11864
|
return r.HEAPU8.set(a, s), [s, a.byteLength];
|
|
11833
11865
|
}, Qt = async (a, r) => {
|
|
11834
|
-
let s, f, i =
|
|
11866
|
+
let s, f, i = z();
|
|
11835
11867
|
Array.isArray(a) ? [s, f] = a : a.buffer === i.HEAPU8.buffer ? [s, f] = [a.byteOffset, a.byteLength] : [s, f] = xt(a);
|
|
11836
11868
|
let d = 0, l = 0, m = 0, y = [], w = [], T = [];
|
|
11837
11869
|
try {
|
|
@@ -11852,17 +11884,17 @@ var Kr = k(() => {
|
|
|
11852
11884
|
} else i.currentContext = await i.webnnCreateMLContext();
|
|
11853
11885
|
break;
|
|
11854
11886
|
}
|
|
11855
|
-
d = await i._OrtCreateSession(s, f, l), i.webgpuOnCreateSession?.(d), d === 0 &&
|
|
11887
|
+
d = await i._OrtCreateSession(s, f, l), i.webgpuOnCreateSession?.(d), d === 0 && $("Can't create a session."), i.jsepOnCreateSession?.(), i.currentContext && (i.webnnRegisterMLContext(d, i.currentContext), i.currentContext = void 0, i.shouldTransferToMLTensor = true);
|
|
11856
11888
|
let [g, v] = bc(d), S = !!r?.enableGraphCapture, C = [], R = [], H = [], U = [], M = [];
|
|
11857
11889
|
for (let L = 0; L < g; L++) {
|
|
11858
11890
|
let [W, oe, p] = Bs(d, L);
|
|
11859
|
-
W === 0 &&
|
|
11891
|
+
W === 0 && $("Can't get an input name."), w.push(W);
|
|
11860
11892
|
let ne = i.UTF8ToString(W);
|
|
11861
11893
|
C.push(ne), H.push(oe === 0 ? { name: ne, isTensor: false } : { name: ne, isTensor: true, type: or(oe), shape: p });
|
|
11862
11894
|
}
|
|
11863
11895
|
for (let L = 0; L < v; L++) {
|
|
11864
11896
|
let [W, oe, p] = Bs(d, L + g);
|
|
11865
|
-
W === 0 &&
|
|
11897
|
+
W === 0 && $("Can't get an output name."), T.push(W);
|
|
11866
11898
|
let ne = i.UTF8ToString(W);
|
|
11867
11899
|
R.push(ne), U.push(oe === 0 ? { name: ne, isTensor: false } : { name: ne, isTensor: true, type: or(oe), shape: p });
|
|
11868
11900
|
{
|
|
@@ -11881,23 +11913,23 @@ var Kr = k(() => {
|
|
|
11881
11913
|
}
|
|
11882
11914
|
}
|
|
11883
11915
|
let Y = null;
|
|
11884
|
-
return M.some((L) => L === "gpu-buffer" || L === "ml-tensor" || L === "ml-tensor-cpu-output") && (m = i._OrtCreateBinding(d), m === 0 &&
|
|
11916
|
+
return M.some((L) => L === "gpu-buffer" || L === "ml-tensor" || L === "ml-tensor-cpu-output") && (m = i._OrtCreateBinding(d), m === 0 && $("Can't create IO binding."), Y = { handle: m, outputPreferredLocations: M, outputPreferredLocationsEncoded: M.map((L) => L === "ml-tensor-cpu-output" ? "ml-tensor" : L).map((L) => an(L)) }), it.set(d, [d, w, T, Y, S, false]), [d, C, R, H, U];
|
|
11885
11917
|
} catch (g) {
|
|
11886
|
-
throw w.forEach((v) => i._OrtFree(v)), T.forEach((v) => i._OrtFree(v)), m !== 0 && i._OrtReleaseBinding(m) !== 0 &&
|
|
11918
|
+
throw w.forEach((v) => i._OrtFree(v)), T.forEach((v) => i._OrtFree(v)), m !== 0 && i._OrtReleaseBinding(m) !== 0 && $("Can't release IO binding."), d !== 0 && i._OrtReleaseSession(d) !== 0 && $("Can't release session."), g;
|
|
11887
11919
|
} finally {
|
|
11888
|
-
i._free(s), l !== 0 && i._OrtReleaseSessionOptions(l) !== 0 &&
|
|
11920
|
+
i._free(s), l !== 0 && i._OrtReleaseSessionOptions(l) !== 0 && $("Can't release session options."), y.forEach((g) => i._free(g)), i.unmountExternalData?.();
|
|
11889
11921
|
}
|
|
11890
11922
|
}, Zt = (a) => {
|
|
11891
|
-
let r =
|
|
11923
|
+
let r = z(), s = it.get(a);
|
|
11892
11924
|
if (!s) throw new Error(`cannot release session. invalid session id: ${a}`);
|
|
11893
11925
|
let [f, i, d, l, m] = s;
|
|
11894
|
-
l && (m && r._OrtClearBoundOutputs(l.handle) !== 0 &&
|
|
11926
|
+
l && (m && r._OrtClearBoundOutputs(l.handle) !== 0 && $("Can't clear bound outputs."), r._OrtReleaseBinding(l.handle) !== 0 && $("Can't release IO binding.")), r.jsepOnReleaseSession?.(a), r.webnnOnReleaseSession?.(a), r.webgpuOnReleaseSession?.(a), i.forEach((y) => r._OrtFree(y)), d.forEach((y) => r._OrtFree(y)), r._OrtReleaseSession(f) !== 0 && $("Can't release session."), it.delete(a);
|
|
11895
11927
|
}, Ms = async (a, r, s, f, i, d, l = false) => {
|
|
11896
11928
|
if (!a) {
|
|
11897
11929
|
r.push(0);
|
|
11898
11930
|
return;
|
|
11899
11931
|
}
|
|
11900
|
-
let m =
|
|
11932
|
+
let m = z(), y = m.PTR_SIZE, w = a[0], T = a[1], g = a[3], v = g, S, C;
|
|
11901
11933
|
if (w === "string" && (g === "gpu-buffer" || g === "ml-tensor")) throw new Error("String tensor is not supported on GPU.");
|
|
11902
11934
|
if (l && g !== "gpu-buffer") throw new Error(`External buffer must be provided for input/output index ${d} when enableGraphCapture is true.`);
|
|
11903
11935
|
if (g === "gpu-buffer") {
|
|
@@ -11941,12 +11973,12 @@ var Kr = k(() => {
|
|
|
11941
11973
|
try {
|
|
11942
11974
|
T.forEach((M, Y) => m.setValue(H + Y * y, M, y === 4 ? "i32" : "i64"));
|
|
11943
11975
|
let U = m._OrtCreateTensor(He(w), S, C, H, T.length, an(v));
|
|
11944
|
-
U === 0 &&
|
|
11976
|
+
U === 0 && $(`Can't create tensor for input/output. session=${f}, index=${d}.`), r.push(U);
|
|
11945
11977
|
} finally {
|
|
11946
11978
|
m.stackRestore(R);
|
|
11947
11979
|
}
|
|
11948
11980
|
}, Kt = async (a, r, s, f, i, d) => {
|
|
11949
|
-
let l =
|
|
11981
|
+
let l = z(), m = l.PTR_SIZE, y = it.get(a);
|
|
11950
11982
|
if (!y) throw new Error(`cannot run inference. invalid session id: ${a}`);
|
|
11951
11983
|
let w = y[0], T = y[1], g = y[2], v = y[3], S = y[4], C = y[5], R = r.length, H = f.length, U = 0, M = [], Y = [], L = [], W = [], oe = [], p = l.stackSave(), ne = l.stackAlloc(R * m), X = l.stackAlloc(R * m), J = l.stackAlloc(H * m), Ue = l.stackAlloc(H * m);
|
|
11952
11984
|
try {
|
|
@@ -11962,33 +11994,33 @@ var Kr = k(() => {
|
|
|
11962
11994
|
$e("wasm bindInputsOutputs");
|
|
11963
11995
|
for (let q = 0; q < R; q++) {
|
|
11964
11996
|
let we = r[q];
|
|
11965
|
-
await l._OrtBindInput(_, T[we], Y[q]) !== 0 &&
|
|
11997
|
+
await l._OrtBindInput(_, T[we], Y[q]) !== 0 && $(`Can't bind input[${q}] for session=${a}.`);
|
|
11966
11998
|
}
|
|
11967
11999
|
for (let q = 0; q < H; q++) {
|
|
11968
12000
|
let we = f[q];
|
|
11969
|
-
i[q]?.[3] ? (oe.push(L[q]), l._OrtBindOutput(_, g[we], L[q], 0) !== 0 &&
|
|
12001
|
+
i[q]?.[3] ? (oe.push(L[q]), l._OrtBindOutput(_, g[we], L[q], 0) !== 0 && $(`Can't bind pre-allocated output[${q}] for session=${a}.`)) : l._OrtBindOutput(_, g[we], 0, pe[we]) !== 0 && $(`Can't bind output[${q}] to ${ae[q]} for session=${a}.`);
|
|
11970
12002
|
}
|
|
11971
12003
|
ze("wasm bindInputsOutputs"), it.set(a, [w, T, g, v, S, true]);
|
|
11972
12004
|
}
|
|
11973
12005
|
l.jsepOnRunStart?.(w), l.webnnOnRunStart?.(w);
|
|
11974
12006
|
let Q;
|
|
11975
|
-
v ? Q = await l._OrtRunWithBinding(w, v.handle, H, J, U) : Q = await l._OrtRun(w, X, ne, R, Ue, H, J, U), Q !== 0 &&
|
|
12007
|
+
v ? Q = await l._OrtRunWithBinding(w, v.handle, H, J, U) : Q = await l._OrtRun(w, X, ne, R, Ue, H, J, U), Q !== 0 && $("failed to call OrtRun().");
|
|
11976
12008
|
let x = [], A = [];
|
|
11977
12009
|
$e("wasm ProcessOutputTensor");
|
|
11978
12010
|
for (let _ = 0; _ < H; _++) {
|
|
11979
12011
|
let ae = Number(l.getValue(J + _ * m, "*"));
|
|
11980
12012
|
if (ae === L[_] || oe.includes(L[_])) {
|
|
11981
|
-
x.push(i[_]), ae !== L[_] && l._OrtReleaseTensor(ae) !== 0 &&
|
|
12013
|
+
x.push(i[_]), ae !== L[_] && l._OrtReleaseTensor(ae) !== 0 && $("Can't release tensor.");
|
|
11982
12014
|
continue;
|
|
11983
12015
|
}
|
|
11984
12016
|
let pe = l.stackSave(), q = l.stackAlloc(4 * m), we = false, re, se = 0;
|
|
11985
12017
|
try {
|
|
11986
|
-
l._OrtGetTensorData(ae, q, q + m, q + 2 * m, q + 3 * m) !== 0 &&
|
|
12018
|
+
l._OrtGetTensorData(ae, q, q + m, q + 2 * m, q + 3 * m) !== 0 && $(`Can't access output tensor data on index ${_}.`);
|
|
11987
12019
|
let Te = m === 4 ? "i32" : "i64", Ye = Number(l.getValue(q, Te));
|
|
11988
12020
|
se = l.getValue(q + m, "*");
|
|
11989
12021
|
let bt = l.getValue(q + m * 2, "*"), wt = Number(l.getValue(q + m * 3, Te)), Se = [];
|
|
11990
12022
|
for (let ee = 0; ee < wt; ee++) Se.push(Number(l.getValue(bt + ee * m, Te)));
|
|
11991
|
-
l._OrtFree(bt) !== 0 &&
|
|
12023
|
+
l._OrtFree(bt) !== 0 && $("Can't free memory for tensor dims.");
|
|
11992
12024
|
let Ae = Se.reduce((ee, Z) => ee * Z, 1);
|
|
11993
12025
|
re = or(Ye);
|
|
11994
12026
|
let Oe = v?.outputPreferredLocations[f[_]];
|
|
@@ -11996,24 +12028,24 @@ var Kr = k(() => {
|
|
|
11996
12028
|
if (Oe === "gpu-buffer" || Oe === "ml-tensor") throw new Error("String tensor is not supported on GPU.");
|
|
11997
12029
|
let ee = [];
|
|
11998
12030
|
for (let Z = 0; Z < Ae; Z++) {
|
|
11999
|
-
let
|
|
12000
|
-
ee.push(l.UTF8ToString(
|
|
12031
|
+
let G = l.getValue(se + Z * m, "*"), V = l.getValue(se + (Z + 1) * m, "*"), qe = Z === Ae - 1 ? void 0 : V - G;
|
|
12032
|
+
ee.push(l.UTF8ToString(G, qe));
|
|
12001
12033
|
}
|
|
12002
12034
|
x.push([re, Se, ee, "cpu"]);
|
|
12003
12035
|
} else if (Oe === "gpu-buffer" && Ae > 0) {
|
|
12004
12036
|
let ee = l.webgpuGetBuffer;
|
|
12005
12037
|
if (!ee) throw new Error('preferredLocation "gpu-buffer" is not supported without using WebGPU.');
|
|
12006
|
-
let Z = ee(se),
|
|
12007
|
-
if (
|
|
12038
|
+
let Z = ee(se), G = mt(Ye, Ae);
|
|
12039
|
+
if (G === void 0 || !ar(re)) throw new Error(`Unsupported data type: ${re}`);
|
|
12008
12040
|
we = true;
|
|
12009
12041
|
{
|
|
12010
12042
|
l.webgpuRegisterBuffer(Z, a, se);
|
|
12011
|
-
let V = l.webgpuCreateDownloader(Z,
|
|
12043
|
+
let V = l.webgpuCreateDownloader(Z, G, a);
|
|
12012
12044
|
x.push([re, Se, { gpuBuffer: Z, download: async () => {
|
|
12013
12045
|
let qe = await V();
|
|
12014
12046
|
return new (at(re))(qe);
|
|
12015
12047
|
}, dispose: () => {
|
|
12016
|
-
l._OrtReleaseTensor(ae) !== 0 &&
|
|
12048
|
+
l._OrtReleaseTensor(ae) !== 0 && $("Can't release tensor.");
|
|
12017
12049
|
} }, "gpu-buffer"]);
|
|
12018
12050
|
}
|
|
12019
12051
|
} else if (Oe === "ml-tensor" && Ae > 0) {
|
|
@@ -12028,8 +12060,8 @@ var Kr = k(() => {
|
|
|
12028
12060
|
} else if (Oe === "ml-tensor-cpu-output" && Ae > 0) {
|
|
12029
12061
|
let ee = l.webnnCreateMLTensorDownloader(se, re)(), Z = x.length;
|
|
12030
12062
|
we = true, A.push((async () => {
|
|
12031
|
-
let
|
|
12032
|
-
return l.webnnReleaseTensorId(se), l._OrtReleaseTensor(ae),
|
|
12063
|
+
let G = [Z, await ee];
|
|
12064
|
+
return l.webnnReleaseTensorId(se), l._OrtReleaseTensor(ae), G;
|
|
12033
12065
|
})()), x.push([re, Se, [], "cpu"]);
|
|
12034
12066
|
} else {
|
|
12035
12067
|
let ee = at(re), Z = new ee(Ae);
|
|
@@ -12039,7 +12071,7 @@ var Kr = k(() => {
|
|
|
12039
12071
|
l.stackRestore(pe), re === "string" && se && l._free(se), we || l._OrtReleaseTensor(ae);
|
|
12040
12072
|
}
|
|
12041
12073
|
}
|
|
12042
|
-
v && !S && (l._OrtClearBoundOutputs(v.handle) !== 0 &&
|
|
12074
|
+
v && !S && (l._OrtClearBoundOutputs(v.handle) !== 0 && $("Can't clear bound outputs."), it.set(a, [w, T, g, v, S, false]));
|
|
12043
12075
|
for (let [_, ae] of await Promise.all(A)) x[_][2] = ae;
|
|
12044
12076
|
return ze("wasm ProcessOutputTensor"), x;
|
|
12045
12077
|
} finally {
|
|
@@ -12050,10 +12082,10 @@ var Kr = k(() => {
|
|
|
12050
12082
|
}), Y.forEach((Q) => l._OrtReleaseTensor(Q)), L.forEach((Q) => l._OrtReleaseTensor(Q)), W.forEach((Q) => l._free(Q)), U !== 0 && l._OrtReleaseRunOptions(U), M.forEach((Q) => l._free(Q));
|
|
12051
12083
|
}
|
|
12052
12084
|
}, er = (a) => {
|
|
12053
|
-
let r =
|
|
12085
|
+
let r = z(), s = it.get(a);
|
|
12054
12086
|
if (!s) throw new Error("invalid session id");
|
|
12055
12087
|
let f = s[0], i = r._OrtEndProfiling(f);
|
|
12056
|
-
i === 0 &&
|
|
12088
|
+
i === 0 && $("Can't get an profile file name."), r._OrtFree(i);
|
|
12057
12089
|
}, tr = (a) => {
|
|
12058
12090
|
let r = [];
|
|
12059
12091
|
for (let s of a) {
|
|
@@ -12286,7 +12318,7 @@ var $s = k(() => {
|
|
|
12286
12318
|
Ve();
|
|
12287
12319
|
Ve();
|
|
12288
12320
|
Ve();
|
|
12289
|
-
var Xa = "1.25.0-dev.
|
|
12321
|
+
var Xa = "1.25.0-dev.20260323-a99aad9d36";
|
|
12290
12322
|
var Tl = Zr;
|
|
12291
12323
|
{
|
|
12292
12324
|
let a = ($s(), $t(Gs)).wasmBackend;
|
|
@@ -16642,7 +16674,9 @@ var processors_exports = {};
|
|
|
16642
16674
|
__export(processors_exports, {
|
|
16643
16675
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
16644
16676
|
Florence2Processor: () => Florence2Processor,
|
|
16677
|
+
Gemma3Processor: () => Gemma3Processor,
|
|
16645
16678
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
16679
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
16646
16680
|
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
16647
16681
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
16648
16682
|
Idefics3Processor: () => Idefics3Processor,
|
|
@@ -19147,26 +19181,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
19147
19181
|
}
|
|
19148
19182
|
return [segmentation, segments];
|
|
19149
19183
|
}
|
|
19150
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
19184
|
+
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
|
|
19151
19185
|
if (height < factor || width < factor) {
|
|
19152
|
-
|
|
19153
|
-
|
|
19186
|
+
const scale = Math.max(factor / height, factor / width);
|
|
19187
|
+
height = Math.round(height * scale);
|
|
19188
|
+
width = Math.round(width * scale);
|
|
19189
|
+
}
|
|
19190
|
+
if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
19154
19191
|
throw new Error(
|
|
19155
19192
|
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
19156
19193
|
);
|
|
19157
19194
|
}
|
|
19158
19195
|
let h_bar = Math.round(height / factor) * factor;
|
|
19159
19196
|
let w_bar = Math.round(width / factor) * factor;
|
|
19160
|
-
if (h_bar * w_bar > max_pixels) {
|
|
19161
|
-
const beta = Math.sqrt(height * width / max_pixels);
|
|
19162
|
-
h_bar = Math.floor(height / beta / factor) * factor;
|
|
19163
|
-
w_bar = Math.floor(width / beta / factor) * factor;
|
|
19164
|
-
} else if (h_bar * w_bar < min_pixels) {
|
|
19165
|
-
const beta = Math.sqrt(min_pixels / (height * width));
|
|
19197
|
+
if (temporal_factor * h_bar * w_bar > max_pixels) {
|
|
19198
|
+
const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
|
|
19199
|
+
h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
|
|
19200
|
+
w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
|
|
19201
|
+
} else if (temporal_factor * h_bar * w_bar < min_pixels) {
|
|
19202
|
+
const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
|
|
19166
19203
|
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
19167
19204
|
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
19168
19205
|
}
|
|
19169
|
-
return [
|
|
19206
|
+
return [w_bar, h_bar];
|
|
19170
19207
|
}
|
|
19171
19208
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
19172
19209
|
if (label_ids_to_fuse === null) {
|
|
@@ -19245,7 +19282,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
19245
19282
|
this.do_pad = config.do_pad;
|
|
19246
19283
|
this.min_pixels = config.min_pixels;
|
|
19247
19284
|
this.max_pixels = config.max_pixels;
|
|
19248
|
-
if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
19285
|
+
if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
19249
19286
|
this.pad_size = this.size;
|
|
19250
19287
|
}
|
|
19251
19288
|
this.do_flip_channel_order = config.do_flip_channel_order ?? false;
|
|
@@ -19533,10 +19570,8 @@ var ImageProcessor = class extends Callable2 {
|
|
|
19533
19570
|
const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
|
|
19534
19571
|
[pixelData, imgDims] = padded;
|
|
19535
19572
|
} else if (this.size_divisibility) {
|
|
19536
|
-
const
|
|
19537
|
-
|
|
19538
|
-
this.size_divisibility
|
|
19539
|
-
);
|
|
19573
|
+
const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
|
|
19574
|
+
const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
|
|
19540
19575
|
[pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
|
|
19541
19576
|
}
|
|
19542
19577
|
}
|
|
@@ -19613,6 +19648,7 @@ var image_processors_exports = {};
|
|
|
19613
19648
|
__export(image_processors_exports, {
|
|
19614
19649
|
BeitFeatureExtractor: () => BeitFeatureExtractor,
|
|
19615
19650
|
BitImageProcessor: () => BitImageProcessor,
|
|
19651
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
19616
19652
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
19617
19653
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
19618
19654
|
ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
|
|
@@ -19629,6 +19665,8 @@ __export(image_processors_exports, {
|
|
|
19629
19665
|
DonutImageProcessor: () => DonutImageProcessor,
|
|
19630
19666
|
EfficientNetImageProcessor: () => EfficientNetImageProcessor,
|
|
19631
19667
|
GLPNFeatureExtractor: () => GLPNFeatureExtractor,
|
|
19668
|
+
Gemma3ImageProcessor: () => Gemma3ImageProcessor,
|
|
19669
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
19632
19670
|
GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
|
|
19633
19671
|
Idefics3ImageProcessor: () => Idefics3ImageProcessor,
|
|
19634
19672
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
@@ -19689,6 +19727,10 @@ var BitImageProcessor = class extends ImageProcessor {
|
|
|
19689
19727
|
var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
|
|
19690
19728
|
};
|
|
19691
19729
|
|
|
19730
|
+
// src/models/chmv2/image_processing_chmv2.js
|
|
19731
|
+
var CHMv2ImageProcessor = class extends ImageProcessor {
|
|
19732
|
+
};
|
|
19733
|
+
|
|
19692
19734
|
// src/models/clip/image_processing_clip.js
|
|
19693
19735
|
var CLIPImageProcessor = class extends ImageProcessor {
|
|
19694
19736
|
};
|
|
@@ -19808,6 +19850,69 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
|
|
|
19808
19850
|
}
|
|
19809
19851
|
};
|
|
19810
19852
|
|
|
19853
|
+
// src/models/gemma3/image_processing_gemma3.js
|
|
19854
|
+
var Gemma3ImageProcessor = class extends ImageProcessor {
|
|
19855
|
+
};
|
|
19856
|
+
|
|
19857
|
+
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
19858
|
+
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
19859
|
+
constructor(config) {
|
|
19860
|
+
super(config);
|
|
19861
|
+
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
19862
|
+
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
19863
|
+
this.patch_size = config.patch_size;
|
|
19864
|
+
this.merge_size = config.merge_size;
|
|
19865
|
+
}
|
|
19866
|
+
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
19867
|
+
get_resize_output_image_size(image, size) {
|
|
19868
|
+
const factor = this.patch_size * this.merge_size;
|
|
19869
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
19870
|
+
}
|
|
19871
|
+
async _call(images, ...args) {
|
|
19872
|
+
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
19873
|
+
let patches = pixel_values;
|
|
19874
|
+
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
19875
|
+
if (patches.dims[0] === 1) {
|
|
19876
|
+
patches = cat(
|
|
19877
|
+
Array.from({ length: temporal_patch_size }, () => patches),
|
|
19878
|
+
0
|
|
19879
|
+
);
|
|
19880
|
+
}
|
|
19881
|
+
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
19882
|
+
const channel = patches.dims[1];
|
|
19883
|
+
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
19884
|
+
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
19885
|
+
const flatten_patches = patches.view(
|
|
19886
|
+
grid_t,
|
|
19887
|
+
temporal_patch_size,
|
|
19888
|
+
channel,
|
|
19889
|
+
Math.floor(grid_h / merge_size),
|
|
19890
|
+
merge_size,
|
|
19891
|
+
patch_size,
|
|
19892
|
+
Math.floor(grid_w / merge_size),
|
|
19893
|
+
merge_size,
|
|
19894
|
+
patch_size
|
|
19895
|
+
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
19896
|
+
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
19897
|
+
return {
|
|
19898
|
+
pixel_values: flatten_patches,
|
|
19899
|
+
image_grid_thw,
|
|
19900
|
+
original_sizes,
|
|
19901
|
+
reshaped_input_sizes
|
|
19902
|
+
};
|
|
19903
|
+
}
|
|
19904
|
+
};
|
|
19905
|
+
|
|
19906
|
+
// src/models/glm46v/image_processing_glm46v.js
|
|
19907
|
+
var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
|
|
19908
|
+
/** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
|
|
19909
|
+
get_resize_output_image_size(image, size) {
|
|
19910
|
+
const factor = this.patch_size * this.merge_size;
|
|
19911
|
+
const temporal_factor = this.config.temporal_patch_size ?? 2;
|
|
19912
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
|
|
19913
|
+
}
|
|
19914
|
+
};
|
|
19915
|
+
|
|
19811
19916
|
// src/models/glpn/image_processing_glpn.js
|
|
19812
19917
|
var GLPNFeatureExtractor = class extends ImageProcessor {
|
|
19813
19918
|
};
|
|
@@ -20201,7 +20306,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
|
|
|
20201
20306
|
const img = pixel_values.unsqueeze_(0);
|
|
20202
20307
|
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
20203
20308
|
const f2 = total_factor ** 2;
|
|
20204
|
-
const [
|
|
20309
|
+
const [new_width, new_height] = smart_resize(
|
|
20205
20310
|
Math.max(total_factor, height),
|
|
20206
20311
|
Math.max(total_factor, width),
|
|
20207
20312
|
total_factor,
|
|
@@ -20491,55 +20596,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
|
|
|
20491
20596
|
var PvtImageProcessor = class extends ImageProcessor {
|
|
20492
20597
|
};
|
|
20493
20598
|
|
|
20494
|
-
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
20495
|
-
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
20496
|
-
constructor(config) {
|
|
20497
|
-
super(config);
|
|
20498
|
-
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
20499
|
-
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
20500
|
-
this.patch_size = config.patch_size;
|
|
20501
|
-
this.merge_size = config.merge_size;
|
|
20502
|
-
}
|
|
20503
|
-
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
20504
|
-
get_resize_output_image_size(image, size) {
|
|
20505
|
-
const factor = this.patch_size * this.merge_size;
|
|
20506
|
-
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
20507
|
-
}
|
|
20508
|
-
async _call(images, ...args) {
|
|
20509
|
-
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
20510
|
-
let patches = pixel_values;
|
|
20511
|
-
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
20512
|
-
if (patches.dims[0] === 1) {
|
|
20513
|
-
patches = cat(
|
|
20514
|
-
Array.from({ length: temporal_patch_size }, () => patches),
|
|
20515
|
-
0
|
|
20516
|
-
);
|
|
20517
|
-
}
|
|
20518
|
-
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
20519
|
-
const channel = patches.dims[1];
|
|
20520
|
-
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
20521
|
-
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
20522
|
-
const flatten_patches = patches.view(
|
|
20523
|
-
grid_t,
|
|
20524
|
-
temporal_patch_size,
|
|
20525
|
-
channel,
|
|
20526
|
-
Math.floor(grid_h / merge_size),
|
|
20527
|
-
merge_size,
|
|
20528
|
-
patch_size,
|
|
20529
|
-
Math.floor(grid_w / merge_size),
|
|
20530
|
-
merge_size,
|
|
20531
|
-
patch_size
|
|
20532
|
-
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
20533
|
-
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
20534
|
-
return {
|
|
20535
|
-
pixel_values: flatten_patches,
|
|
20536
|
-
image_grid_thw,
|
|
20537
|
-
original_sizes,
|
|
20538
|
-
reshaped_input_sizes
|
|
20539
|
-
};
|
|
20540
|
-
}
|
|
20541
|
-
};
|
|
20542
|
-
|
|
20543
20599
|
// src/models/rt_detr/image_processing_rt_detr.js
|
|
20544
20600
|
var RTDetrImageProcessor = class extends ImageProcessor {
|
|
20545
20601
|
/** @type {typeof post_process_object_detection} */
|
|
@@ -21021,6 +21077,48 @@ var Florence2Processor = class extends Processor {
|
|
|
21021
21077
|
}
|
|
21022
21078
|
};
|
|
21023
21079
|
|
|
21080
|
+
// src/models/gemma3/processing_gemma3.js
|
|
21081
|
+
var Gemma3Processor = class extends Processor {
|
|
21082
|
+
static tokenizer_class = AutoTokenizer;
|
|
21083
|
+
static image_processor_class = AutoImageProcessor;
|
|
21084
|
+
static uses_processor_config = true;
|
|
21085
|
+
static uses_chat_template_file = true;
|
|
21086
|
+
constructor(config, components, chat_template) {
|
|
21087
|
+
super(config, components, chat_template);
|
|
21088
|
+
this.image_seq_length = this.config.image_seq_length;
|
|
21089
|
+
const { boi_token, image_token, eoi_token } = this.tokenizer.config;
|
|
21090
|
+
this.boi_token = boi_token;
|
|
21091
|
+
this.image_token = image_token;
|
|
21092
|
+
this.eoi_token = eoi_token;
|
|
21093
|
+
const image_tokens_expanded = image_token.repeat(this.image_seq_length);
|
|
21094
|
+
this.full_image_sequence = `
|
|
21095
|
+
|
|
21096
|
+
${boi_token}${image_tokens_expanded}${eoi_token}
|
|
21097
|
+
|
|
21098
|
+
`;
|
|
21099
|
+
}
|
|
21100
|
+
/**
|
|
21101
|
+
* @param {string|string[]} text
|
|
21102
|
+
* @param {import('../../utils/image.js').RawImage|import('../../utils/image.js').RawImage[]} [images]
|
|
21103
|
+
* @param {Object} [options]
|
|
21104
|
+
*/
|
|
21105
|
+
async _call(text, images = null, options = {}) {
|
|
21106
|
+
if (typeof text === "string") {
|
|
21107
|
+
text = [text];
|
|
21108
|
+
}
|
|
21109
|
+
let image_inputs;
|
|
21110
|
+
if (images) {
|
|
21111
|
+
image_inputs = await this.image_processor(images, options);
|
|
21112
|
+
text = text.map((prompt) => prompt.replaceAll(this.boi_token, this.full_image_sequence));
|
|
21113
|
+
}
|
|
21114
|
+
const text_inputs = this.tokenizer(text, options);
|
|
21115
|
+
return {
|
|
21116
|
+
...text_inputs,
|
|
21117
|
+
...image_inputs
|
|
21118
|
+
};
|
|
21119
|
+
}
|
|
21120
|
+
};
|
|
21121
|
+
|
|
21024
21122
|
// src/models/gemma3n/processing_gemma3n.js
|
|
21025
21123
|
var Gemma3nProcessor = class extends Processor {
|
|
21026
21124
|
static image_processor_class = AutoImageProcessor;
|
|
@@ -21093,6 +21191,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
21093
21191
|
}
|
|
21094
21192
|
};
|
|
21095
21193
|
|
|
21194
|
+
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
21195
|
+
var Qwen2VLProcessor = class extends Processor {
|
|
21196
|
+
static image_processor_class = AutoImageProcessor;
|
|
21197
|
+
static tokenizer_class = AutoTokenizer;
|
|
21198
|
+
static image_token = "<|image_pad|>";
|
|
21199
|
+
/**
|
|
21200
|
+
*
|
|
21201
|
+
* @param {string|string[]} text
|
|
21202
|
+
* @param {RawImage|RawImage[]} images
|
|
21203
|
+
* @param {...any} args
|
|
21204
|
+
* @returns {Promise<any>}
|
|
21205
|
+
*/
|
|
21206
|
+
async _call(text, images = null, ...args) {
|
|
21207
|
+
if (!Array.isArray(text)) {
|
|
21208
|
+
text = [text];
|
|
21209
|
+
}
|
|
21210
|
+
let image_inputs, image_grid_thw;
|
|
21211
|
+
if (images) {
|
|
21212
|
+
image_inputs = await this.image_processor(images);
|
|
21213
|
+
image_grid_thw = image_inputs.image_grid_thw;
|
|
21214
|
+
}
|
|
21215
|
+
if (image_grid_thw) {
|
|
21216
|
+
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
21217
|
+
let index = 0;
|
|
21218
|
+
const image_token = (
|
|
21219
|
+
/** @type {typeof Qwen2VLProcessor} */
|
|
21220
|
+
this.constructor.image_token
|
|
21221
|
+
);
|
|
21222
|
+
const image_grid_thw_list = image_grid_thw.tolist();
|
|
21223
|
+
text = text.map((t) => {
|
|
21224
|
+
while (t.includes(image_token)) {
|
|
21225
|
+
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
21226
|
+
t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
21227
|
+
}
|
|
21228
|
+
return t.replaceAll("<|placeholder|>", image_token);
|
|
21229
|
+
});
|
|
21230
|
+
}
|
|
21231
|
+
const text_inputs = this.tokenizer(text);
|
|
21232
|
+
return {
|
|
21233
|
+
...text_inputs,
|
|
21234
|
+
...image_inputs
|
|
21235
|
+
};
|
|
21236
|
+
}
|
|
21237
|
+
};
|
|
21238
|
+
|
|
21239
|
+
// src/models/glm46v/processing_glm46v.js
|
|
21240
|
+
var Glm46VProcessor = class extends Qwen2VLProcessor {
|
|
21241
|
+
static image_token = "<|image|>";
|
|
21242
|
+
};
|
|
21243
|
+
|
|
21096
21244
|
// src/models/granite_speech/processing_granite_speech.js
|
|
21097
21245
|
var GraniteSpeechProcessor = class extends Processor {
|
|
21098
21246
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -21823,47 +21971,6 @@ var PyAnnoteProcessor = class extends Processor {
|
|
|
21823
21971
|
}
|
|
21824
21972
|
};
|
|
21825
21973
|
|
|
21826
|
-
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
21827
|
-
var Qwen2VLProcessor = class extends Processor {
|
|
21828
|
-
static image_processor_class = AutoImageProcessor;
|
|
21829
|
-
static tokenizer_class = AutoTokenizer;
|
|
21830
|
-
/**
|
|
21831
|
-
*
|
|
21832
|
-
* @param {string|string[]} text
|
|
21833
|
-
* @param {RawImage|RawImage[]} images
|
|
21834
|
-
* @param {...any} args
|
|
21835
|
-
* @returns {Promise<any>}
|
|
21836
|
-
*/
|
|
21837
|
-
async _call(text, images = null, ...args) {
|
|
21838
|
-
if (!Array.isArray(text)) {
|
|
21839
|
-
text = [text];
|
|
21840
|
-
}
|
|
21841
|
-
let image_inputs, image_grid_thw;
|
|
21842
|
-
if (images) {
|
|
21843
|
-
image_inputs = await this.image_processor(images);
|
|
21844
|
-
image_grid_thw = image_inputs.image_grid_thw;
|
|
21845
|
-
}
|
|
21846
|
-
if (image_grid_thw) {
|
|
21847
|
-
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
21848
|
-
let index = 0;
|
|
21849
|
-
const image_grid_thw_list = image_grid_thw.tolist();
|
|
21850
|
-
text = text.map((t) => {
|
|
21851
|
-
while (t.includes("<|image_pad|>")) {
|
|
21852
|
-
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
21853
|
-
t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
21854
|
-
}
|
|
21855
|
-
return t.replaceAll("<|placeholder|>", "<|image_pad|>");
|
|
21856
|
-
});
|
|
21857
|
-
}
|
|
21858
|
-
const text_inputs = this.tokenizer(text);
|
|
21859
|
-
return {
|
|
21860
|
-
...text_inputs,
|
|
21861
|
-
...image_inputs
|
|
21862
|
-
// TODO: ...videos_inputs,
|
|
21863
|
-
};
|
|
21864
|
-
}
|
|
21865
|
-
};
|
|
21866
|
-
|
|
21867
21974
|
// src/models/qwen2_5_vl/processing_qwen2_5_vl.js
|
|
21868
21975
|
var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
|
|
21869
21976
|
};
|
|
@@ -22207,6 +22314,8 @@ function getNormalizedConfig(config) {
|
|
|
22207
22314
|
case "gemma3n":
|
|
22208
22315
|
case "lfm2_vl":
|
|
22209
22316
|
case "chatterbox":
|
|
22317
|
+
case "lighton_ocr":
|
|
22318
|
+
case "glm_ocr":
|
|
22210
22319
|
case "mistral3":
|
|
22211
22320
|
case "qwen2_5_vl":
|
|
22212
22321
|
case "qwen3_vl":
|
|
@@ -22282,6 +22391,8 @@ function getNormalizedConfig(config) {
|
|
|
22282
22391
|
mapping["dim_kv"] = "head_dim";
|
|
22283
22392
|
break;
|
|
22284
22393
|
case "qwen3":
|
|
22394
|
+
case "solar_open":
|
|
22395
|
+
case "glm_ocr_text":
|
|
22285
22396
|
case "gemma":
|
|
22286
22397
|
case "gemma2":
|
|
22287
22398
|
case "vaultgemma":
|
|
@@ -22292,6 +22403,7 @@ function getNormalizedConfig(config) {
|
|
|
22292
22403
|
case "ernie4_5":
|
|
22293
22404
|
case "hunyuan_v1_dense":
|
|
22294
22405
|
case "falcon_h1":
|
|
22406
|
+
case "nemotron_h":
|
|
22295
22407
|
case "ministral":
|
|
22296
22408
|
case "ministral3":
|
|
22297
22409
|
mapping["num_heads"] = "num_key_value_heads";
|
|
@@ -22326,6 +22438,9 @@ function getNormalizedConfig(config) {
|
|
|
22326
22438
|
mapping["num_attention_heads"] = "num_attention_heads";
|
|
22327
22439
|
break;
|
|
22328
22440
|
case "youtu":
|
|
22441
|
+
case "deepseek_v3":
|
|
22442
|
+
case "glm_moe_dsa":
|
|
22443
|
+
case "mistral4":
|
|
22329
22444
|
mapping["num_heads"] = "num_key_value_heads";
|
|
22330
22445
|
mapping["num_layers"] = "num_hidden_layers";
|
|
22331
22446
|
mapping["dim_kv"] = "qk_head_dim";
|
|
@@ -22414,6 +22529,7 @@ function getCacheShapes(config, options) {
|
|
|
22414
22529
|
if (!(config instanceof PretrainedConfig)) {
|
|
22415
22530
|
config = new PretrainedConfig(config);
|
|
22416
22531
|
}
|
|
22532
|
+
const batch_size = options?.batch_size ?? 1;
|
|
22417
22533
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
22418
22534
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
22419
22535
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -22423,7 +22539,6 @@ function getCacheShapes(config, options) {
|
|
|
22423
22539
|
config
|
|
22424
22540
|
);
|
|
22425
22541
|
const head_dim = hidden_size / num_attention_heads;
|
|
22426
|
-
const batch_size = options?.batch_size ?? 1;
|
|
22427
22542
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
22428
22543
|
if (layer_types[i] === "full_attention") {
|
|
22429
22544
|
for (const kv of ["key", "value"]) {
|
|
@@ -22436,31 +22551,26 @@ function getCacheShapes(config, options) {
|
|
|
22436
22551
|
}
|
|
22437
22552
|
}
|
|
22438
22553
|
return cache_values;
|
|
22439
|
-
} else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
|
|
22554
|
+
} else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
|
|
22440
22555
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
22441
22556
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
22442
|
-
const
|
|
22443
|
-
const {
|
|
22444
|
-
layer_types,
|
|
22445
|
-
num_hidden_layers,
|
|
22446
|
-
num_attention_heads,
|
|
22447
|
-
num_key_value_heads,
|
|
22448
|
-
hidden_size,
|
|
22449
|
-
mamba_d_conv,
|
|
22450
|
-
mamba_n_heads,
|
|
22451
|
-
mamba_d_head,
|
|
22452
|
-
mamba_d_state,
|
|
22453
|
-
mamba_n_groups,
|
|
22454
|
-
mamba_expand,
|
|
22455
|
-
mamba_d_ssm
|
|
22456
|
-
} = (
|
|
22557
|
+
const c = (
|
|
22457
22558
|
/** @type {any} */
|
|
22458
22559
|
config
|
|
22459
22560
|
);
|
|
22460
|
-
const
|
|
22461
|
-
const
|
|
22462
|
-
const
|
|
22463
|
-
|
|
22561
|
+
const layer_types = c.layer_types ?? c.layers_block_type;
|
|
22562
|
+
const num_layers = c.num_hidden_layers ?? layer_types?.length;
|
|
22563
|
+
const num_key_value_heads = c.num_key_value_heads;
|
|
22564
|
+
const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
|
|
22565
|
+
const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
|
|
22566
|
+
const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
|
|
22567
|
+
const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
|
|
22568
|
+
const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
|
|
22569
|
+
const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
|
|
22570
|
+
const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
|
|
22571
|
+
const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
|
|
22572
|
+
const cache_values = {};
|
|
22573
|
+
for (let i = 0; i < num_layers; ++i) {
|
|
22464
22574
|
if (!layer_types || layer_types[i] === "mamba") {
|
|
22465
22575
|
cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
|
|
22466
22576
|
cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
|
|
@@ -22494,7 +22604,6 @@ function getCacheShapes(config, options) {
|
|
|
22494
22604
|
const key_dim = linear_key_head_dim * linear_num_key_heads;
|
|
22495
22605
|
const value_dim = linear_value_head_dim * linear_num_value_heads;
|
|
22496
22606
|
const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
|
|
22497
|
-
const batch_size = options?.batch_size ?? 1;
|
|
22498
22607
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
22499
22608
|
if (layer_types[i] === "full_attention") {
|
|
22500
22609
|
for (const kv of ["key", "value"]) {
|
|
@@ -24190,8 +24299,7 @@ var MODEL_TYPES = {
|
|
|
24190
24299
|
ImageAudioTextToText: 13,
|
|
24191
24300
|
Supertonic: 14,
|
|
24192
24301
|
Chatterbox: 15,
|
|
24193
|
-
|
|
24194
|
-
VoxtralRealtime: 17
|
|
24302
|
+
VoxtralRealtime: 16
|
|
24195
24303
|
};
|
|
24196
24304
|
var MODEL_TYPE_CONFIG = {
|
|
24197
24305
|
[MODEL_TYPES.DecoderOnly]: {
|
|
@@ -24248,12 +24356,12 @@ var MODEL_TYPE_CONFIG = {
|
|
|
24248
24356
|
can_generate: true,
|
|
24249
24357
|
forward: image_text_to_text_forward,
|
|
24250
24358
|
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24251
|
-
sessions: (config) => {
|
|
24359
|
+
sessions: (config, options, textOnly) => {
|
|
24252
24360
|
const s = {
|
|
24253
24361
|
embed_tokens: "embed_tokens",
|
|
24254
|
-
vision_encoder: "vision_encoder",
|
|
24255
24362
|
decoder_model_merged: "decoder_model_merged"
|
|
24256
24363
|
};
|
|
24364
|
+
if (!textOnly) s["vision_encoder"] = "vision_encoder";
|
|
24257
24365
|
if (config.is_encoder_decoder) s["model"] = "encoder_model";
|
|
24258
24366
|
return s;
|
|
24259
24367
|
},
|
|
@@ -24275,12 +24383,17 @@ var MODEL_TYPE_CONFIG = {
|
|
|
24275
24383
|
[MODEL_TYPES.ImageAudioTextToText]: {
|
|
24276
24384
|
can_generate: true,
|
|
24277
24385
|
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24278
|
-
sessions: () =>
|
|
24279
|
-
|
|
24280
|
-
|
|
24281
|
-
|
|
24282
|
-
|
|
24283
|
-
|
|
24386
|
+
sessions: (config, options, textOnly) => {
|
|
24387
|
+
const s = {
|
|
24388
|
+
embed_tokens: "embed_tokens",
|
|
24389
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24390
|
+
};
|
|
24391
|
+
if (!textOnly) {
|
|
24392
|
+
s["audio_encoder"] = "audio_encoder";
|
|
24393
|
+
s["vision_encoder"] = "vision_encoder";
|
|
24394
|
+
}
|
|
24395
|
+
return s;
|
|
24396
|
+
},
|
|
24284
24397
|
optional_configs: { generation_config: "generation_config.json" }
|
|
24285
24398
|
},
|
|
24286
24399
|
[MODEL_TYPES.Phi3V]: {
|
|
@@ -24331,14 +24444,6 @@ var MODEL_TYPE_CONFIG = {
|
|
|
24331
24444
|
cache_sessions: { model: true },
|
|
24332
24445
|
optional_configs: { generation_config: "generation_config.json" }
|
|
24333
24446
|
},
|
|
24334
|
-
[MODEL_TYPES.MultimodalLanguageModelOnly]: {
|
|
24335
|
-
can_generate: true,
|
|
24336
|
-
forward: image_text_to_text_forward,
|
|
24337
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24338
|
-
sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
|
|
24339
|
-
cache_sessions: { decoder_model_merged: true },
|
|
24340
|
-
optional_configs: { generation_config: "generation_config.json" }
|
|
24341
|
-
},
|
|
24342
24447
|
[MODEL_TYPES.VoxtralRealtime]: {
|
|
24343
24448
|
can_generate: true,
|
|
24344
24449
|
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
@@ -24364,6 +24469,19 @@ function getSessionsConfig(modelType, config, options = {}) {
|
|
|
24364
24469
|
optional_configs: typeConfig.optional_configs
|
|
24365
24470
|
};
|
|
24366
24471
|
}
|
|
24472
|
+
function resolveTypeConfig(modelName, config) {
|
|
24473
|
+
let modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
24474
|
+
let textOnly = false;
|
|
24475
|
+
const nativeArch = config?.architectures?.[0];
|
|
24476
|
+
if (nativeArch && nativeArch !== modelName && modelName?.endsWith("ForCausalLM") && nativeArch.endsWith("ForConditionalGeneration")) {
|
|
24477
|
+
const nativeType = MODEL_TYPE_MAPPING.get(nativeArch);
|
|
24478
|
+
if (nativeType !== void 0) {
|
|
24479
|
+
modelType = nativeType;
|
|
24480
|
+
textOnly = true;
|
|
24481
|
+
}
|
|
24482
|
+
}
|
|
24483
|
+
return { typeConfig: MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default, textOnly, modelType };
|
|
24484
|
+
}
|
|
24367
24485
|
var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
|
|
24368
24486
|
var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
|
|
24369
24487
|
var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
|
|
@@ -24383,8 +24501,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
24383
24501
|
this.sessions = sessions;
|
|
24384
24502
|
this.configs = configs;
|
|
24385
24503
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor);
|
|
24386
|
-
const
|
|
24387
|
-
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
24504
|
+
const { typeConfig } = resolveTypeConfig(modelName, config);
|
|
24388
24505
|
this.can_generate = typeConfig.can_generate;
|
|
24389
24506
|
this._forward = typeConfig.forward;
|
|
24390
24507
|
this._prepare_inputs_for_generation = typeConfig.prepare_inputs;
|
|
@@ -24447,9 +24564,8 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
24447
24564
|
session_options
|
|
24448
24565
|
};
|
|
24449
24566
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
|
|
24450
|
-
const modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
24451
24567
|
config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
|
|
24452
|
-
const typeConfig
|
|
24568
|
+
const { typeConfig, textOnly, modelType } = resolveTypeConfig(modelName, config);
|
|
24453
24569
|
if (modelType === void 0) {
|
|
24454
24570
|
const type = modelName ?? config?.model_type;
|
|
24455
24571
|
if (type !== "custom") {
|
|
@@ -24458,7 +24574,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
24458
24574
|
);
|
|
24459
24575
|
}
|
|
24460
24576
|
}
|
|
24461
|
-
const sessions = typeConfig.sessions(config, options);
|
|
24577
|
+
const sessions = typeConfig.sessions(config, options, textOnly);
|
|
24462
24578
|
const promises = [
|
|
24463
24579
|
constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
|
|
24464
24580
|
];
|
|
@@ -25122,7 +25238,9 @@ async function generic_text_to_text_forward(self2, {
|
|
|
25122
25238
|
"qwen3_5",
|
|
25123
25239
|
"qwen3_5_text",
|
|
25124
25240
|
"qwen3_5_moe",
|
|
25125
|
-
"qwen3_5_moe_text"
|
|
25241
|
+
"qwen3_5_moe_text",
|
|
25242
|
+
"glm_ocr",
|
|
25243
|
+
"glm_ocr_text"
|
|
25126
25244
|
].includes(self2.config.model_type)
|
|
25127
25245
|
) {
|
|
25128
25246
|
const { image_grid_thw, video_grid_thw } = kwargs;
|
|
@@ -25346,6 +25464,8 @@ __export(models_exports, {
|
|
|
25346
25464
|
BloomForCausalLM: () => BloomForCausalLM,
|
|
25347
25465
|
BloomModel: () => BloomModel,
|
|
25348
25466
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
25467
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
25468
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
25349
25469
|
CLIPModel: () => CLIPModel,
|
|
25350
25470
|
CLIPPreTrainedModel: () => CLIPPreTrainedModel,
|
|
25351
25471
|
CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
|
|
@@ -25420,6 +25540,9 @@ __export(models_exports, {
|
|
|
25420
25540
|
DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
|
|
25421
25541
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
25422
25542
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
25543
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
25544
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
25545
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
25423
25546
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
25424
25547
|
DeiTModel: () => DeiTModel,
|
|
25425
25548
|
DeiTPreTrainedModel: () => DeiTPreTrainedModel,
|
|
@@ -25465,6 +25588,11 @@ __export(models_exports, {
|
|
|
25465
25588
|
EsmForTokenClassification: () => EsmForTokenClassification,
|
|
25466
25589
|
EsmModel: () => EsmModel,
|
|
25467
25590
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
25591
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
25592
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
25593
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
25594
|
+
EuroBertModel: () => EuroBertModel,
|
|
25595
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
25468
25596
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
25469
25597
|
ExaoneModel: () => ExaoneModel,
|
|
25470
25598
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -25501,6 +25629,7 @@ __export(models_exports, {
|
|
|
25501
25629
|
Gemma2Model: () => Gemma2Model,
|
|
25502
25630
|
Gemma2PreTrainedModel: () => Gemma2PreTrainedModel,
|
|
25503
25631
|
Gemma3ForCausalLM: () => Gemma3ForCausalLM,
|
|
25632
|
+
Gemma3ForConditionalGeneration: () => Gemma3ForConditionalGeneration,
|
|
25504
25633
|
Gemma3Model: () => Gemma3Model,
|
|
25505
25634
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
25506
25635
|
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
@@ -25511,6 +25640,10 @@ __export(models_exports, {
|
|
|
25511
25640
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
25512
25641
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
25513
25642
|
GlmModel: () => GlmModel,
|
|
25643
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
25644
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
25645
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
25646
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
25514
25647
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
25515
25648
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
25516
25649
|
GptOssModel: () => GptOssModel,
|
|
@@ -25557,6 +25690,7 @@ __export(models_exports, {
|
|
|
25557
25690
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
25558
25691
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
25559
25692
|
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
25693
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
25560
25694
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
25561
25695
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
25562
25696
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -25606,6 +25740,9 @@ __export(models_exports, {
|
|
|
25606
25740
|
MimiEncoderOutput: () => MimiEncoderOutput,
|
|
25607
25741
|
MimiModel: () => MimiModel,
|
|
25608
25742
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
25743
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
25744
|
+
Mistral4Model: () => Mistral4Model,
|
|
25745
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
25609
25746
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
25610
25747
|
MistralModel: () => MistralModel,
|
|
25611
25748
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -25663,6 +25800,9 @@ __export(models_exports, {
|
|
|
25663
25800
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
25664
25801
|
NanoChatModel: () => NanoChatModel,
|
|
25665
25802
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
25803
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
25804
|
+
NemotronHModel: () => NemotronHModel,
|
|
25805
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
25666
25806
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
25667
25807
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
25668
25808
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -25800,6 +25940,9 @@ __export(models_exports, {
|
|
|
25800
25940
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
25801
25941
|
SnacModel: () => SnacModel,
|
|
25802
25942
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
25943
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
25944
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
25945
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
25803
25946
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
25804
25947
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
25805
25948
|
SpeechT5HifiGan: () => SpeechT5HifiGan,
|
|
@@ -25974,7 +26117,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
|
|
|
25974
26117
|
var ArceeForCausalLM = class extends ArceePreTrainedModel {
|
|
25975
26118
|
};
|
|
25976
26119
|
|
|
25977
|
-
// src/models/
|
|
26120
|
+
// src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
|
|
25978
26121
|
var ASTPreTrainedModel = class extends PreTrainedModel {
|
|
25979
26122
|
};
|
|
25980
26123
|
var ASTModel = class extends ASTPreTrainedModel {
|
|
@@ -26309,6 +26452,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
|
|
|
26309
26452
|
var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
|
|
26310
26453
|
};
|
|
26311
26454
|
|
|
26455
|
+
// src/models/chmv2/modeling_chmv2.js
|
|
26456
|
+
var CHMv2PreTrainedModel = class extends PreTrainedModel {
|
|
26457
|
+
};
|
|
26458
|
+
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
|
|
26459
|
+
};
|
|
26460
|
+
|
|
26312
26461
|
// src/models/clap/modeling_clap.js
|
|
26313
26462
|
var ClapPreTrainedModel = class extends PreTrainedModel {
|
|
26314
26463
|
};
|
|
@@ -26647,6 +26796,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
|
|
|
26647
26796
|
}
|
|
26648
26797
|
};
|
|
26649
26798
|
|
|
26799
|
+
// src/models/deepseek_v3/modeling_deepseek_v3.js
|
|
26800
|
+
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
|
|
26801
|
+
};
|
|
26802
|
+
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
|
|
26803
|
+
};
|
|
26804
|
+
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
|
|
26805
|
+
};
|
|
26806
|
+
|
|
26650
26807
|
// src/models/deberta_v2/modeling_deberta_v2.js
|
|
26651
26808
|
var DebertaV2PreTrainedModel = class extends PreTrainedModel {
|
|
26652
26809
|
};
|
|
@@ -26995,6 +27152,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
|
|
|
26995
27152
|
}
|
|
26996
27153
|
};
|
|
26997
27154
|
|
|
27155
|
+
// src/models/eurobert/modeling_eurobert.js
|
|
27156
|
+
var EuroBertPreTrainedModel = class extends PreTrainedModel {
|
|
27157
|
+
};
|
|
27158
|
+
var EuroBertModel = class extends EuroBertPreTrainedModel {
|
|
27159
|
+
};
|
|
27160
|
+
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
|
|
27161
|
+
/**
|
|
27162
|
+
* Calls the model on new inputs.
|
|
27163
|
+
*
|
|
27164
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
27165
|
+
* @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
|
|
27166
|
+
*/
|
|
27167
|
+
async _call(model_inputs) {
|
|
27168
|
+
return new MaskedLMOutput(await super._call(model_inputs));
|
|
27169
|
+
}
|
|
27170
|
+
};
|
|
27171
|
+
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
|
|
27172
|
+
/**
|
|
27173
|
+
* Calls the model on new inputs.
|
|
27174
|
+
*
|
|
27175
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
27176
|
+
* @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
|
|
27177
|
+
*/
|
|
27178
|
+
async _call(model_inputs) {
|
|
27179
|
+
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
27180
|
+
}
|
|
27181
|
+
};
|
|
27182
|
+
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
|
|
27183
|
+
/**
|
|
27184
|
+
* Calls the model on new inputs.
|
|
27185
|
+
*
|
|
27186
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
27187
|
+
* @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
|
|
27188
|
+
*/
|
|
27189
|
+
async _call(model_inputs) {
|
|
27190
|
+
return new TokenClassifierOutput(await super._call(model_inputs));
|
|
27191
|
+
}
|
|
27192
|
+
};
|
|
27193
|
+
|
|
26998
27194
|
// src/models/exaone/modeling_exaone.js
|
|
26999
27195
|
var ExaonePreTrainedModel = class extends PreTrainedModel {
|
|
27000
27196
|
};
|
|
@@ -27152,12 +27348,35 @@ var Gemma2Model = class extends Gemma2PreTrainedModel {
|
|
|
27152
27348
|
var Gemma2ForCausalLM = class extends Gemma2PreTrainedModel {
|
|
27153
27349
|
};
|
|
27154
27350
|
|
|
27351
|
+
// src/models/llava/modeling_llava.js
|
|
27352
|
+
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
27353
|
+
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
27354
|
+
};
|
|
27355
|
+
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
27356
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
27357
|
+
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
27358
|
+
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
27359
|
+
return default_merge_input_ids_with_image_features({
|
|
27360
|
+
// @ts-ignore
|
|
27361
|
+
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
27362
|
+
...kwargs,
|
|
27363
|
+
image_features: reshaped_image_hidden_states
|
|
27364
|
+
});
|
|
27365
|
+
}
|
|
27366
|
+
};
|
|
27367
|
+
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27368
|
+
};
|
|
27369
|
+
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
27370
|
+
};
|
|
27371
|
+
|
|
27155
27372
|
// src/models/gemma3/modeling_gemma3.js
|
|
27156
27373
|
var Gemma3PreTrainedModel = class extends PreTrainedModel {
|
|
27157
27374
|
};
|
|
27158
27375
|
var Gemma3Model = class extends Gemma3PreTrainedModel {
|
|
27159
27376
|
};
|
|
27160
|
-
var
|
|
27377
|
+
var Gemma3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27378
|
+
};
|
|
27379
|
+
var Gemma3ForCausalLM = class extends Gemma3ForConditionalGeneration {
|
|
27161
27380
|
};
|
|
27162
27381
|
|
|
27163
27382
|
// src/models/gemma3n/modeling_gemma3n.js
|
|
@@ -27270,6 +27489,382 @@ var GlmModel = class extends GlmPreTrainedModel {
|
|
|
27270
27489
|
var GlmForCausalLM = class extends GlmPreTrainedModel {
|
|
27271
27490
|
};
|
|
27272
27491
|
|
|
27492
|
+
// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
|
|
27493
|
+
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
|
|
27494
|
+
};
|
|
27495
|
+
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
|
|
27496
|
+
};
|
|
27497
|
+
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
|
|
27498
|
+
};
|
|
27499
|
+
|
|
27500
|
+
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
27501
|
+
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
27502
|
+
forward_params = [
|
|
27503
|
+
// Text inputs
|
|
27504
|
+
"input_ids",
|
|
27505
|
+
"attention_mask",
|
|
27506
|
+
"position_ids",
|
|
27507
|
+
"past_key_values",
|
|
27508
|
+
// Vision inputs
|
|
27509
|
+
"pixel_values",
|
|
27510
|
+
"image_grid_thw"
|
|
27511
|
+
];
|
|
27512
|
+
};
|
|
27513
|
+
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
27514
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
27515
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
27516
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
27517
|
+
image_grid_thw_name = "grid_thw";
|
|
27518
|
+
/**
|
|
27519
|
+
* Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
|
|
27520
|
+
* @param {Tensor} input_ids
|
|
27521
|
+
* @param {Tensor} attention_mask
|
|
27522
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
27523
|
+
*/
|
|
27524
|
+
_get_text_only_rope_index(input_ids, attention_mask) {
|
|
27525
|
+
if (attention_mask) {
|
|
27526
|
+
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
27527
|
+
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
27528
|
+
const mrope_position_deltas = Array.from(
|
|
27529
|
+
{ length: dims[0] },
|
|
27530
|
+
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
27531
|
+
);
|
|
27532
|
+
return [
|
|
27533
|
+
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
27534
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
27535
|
+
];
|
|
27536
|
+
} else {
|
|
27537
|
+
const [batch_size, seq_length] = input_ids.dims;
|
|
27538
|
+
const position_ids = BigInt64Array.from(
|
|
27539
|
+
{ length: 3 * batch_size * seq_length },
|
|
27540
|
+
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
27541
|
+
);
|
|
27542
|
+
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
27543
|
+
}
|
|
27544
|
+
}
|
|
27545
|
+
/**
|
|
27546
|
+
* Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
|
|
27547
|
+
* global [all_t, all_h, all_w] order, then write back into the position_ids array
|
|
27548
|
+
* respecting attention mask.
|
|
27549
|
+
* @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
|
|
27550
|
+
* @param {number[]} attn_mask Attention mask for this batch element
|
|
27551
|
+
* @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
|
|
27552
|
+
* @param {number} batch_idx Current batch index
|
|
27553
|
+
* @returns {number[]} Flat reordered positions of length total_len
|
|
27554
|
+
*/
|
|
27555
|
+
_reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
|
|
27556
|
+
const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
27557
|
+
const llm_positions = new Array(total_len);
|
|
27558
|
+
let index = 0;
|
|
27559
|
+
for (let x = 0; x < 3; ++x) {
|
|
27560
|
+
for (const val of llm_pos_ids_list) {
|
|
27561
|
+
const seg_len = val.length / 3;
|
|
27562
|
+
for (let z2 = x * seg_len; z2 < (x + 1) * seg_len; ++z2) {
|
|
27563
|
+
llm_positions[index++] = val[z2];
|
|
27564
|
+
}
|
|
27565
|
+
}
|
|
27566
|
+
}
|
|
27567
|
+
let count2 = 0;
|
|
27568
|
+
for (let y = 0; y < attn_mask.length; ++y) {
|
|
27569
|
+
if (attn_mask[y] == 1) {
|
|
27570
|
+
for (let x = 0; x < 3; ++x) {
|
|
27571
|
+
position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
|
|
27572
|
+
}
|
|
27573
|
+
++count2;
|
|
27574
|
+
}
|
|
27575
|
+
}
|
|
27576
|
+
return llm_positions;
|
|
27577
|
+
}
|
|
27578
|
+
/**
|
|
27579
|
+
* Build per-batch position ID segments for multimodal rope.
|
|
27580
|
+
* Override this in subclasses to change how vision/text segments are identified and positioned.
|
|
27581
|
+
* @param {object} params
|
|
27582
|
+
* @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
|
|
27583
|
+
* @param {any[][]} params.image_grid_thw_list - all image grid dimensions
|
|
27584
|
+
* @param {any[][]} params.video_grid_thw_list - all video grid dimensions
|
|
27585
|
+
* @param {number} params.spatial_merge_size
|
|
27586
|
+
* @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
|
|
27587
|
+
* @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
|
|
27588
|
+
*/
|
|
27589
|
+
_get_multimodal_rope_positions({
|
|
27590
|
+
filtered_ids,
|
|
27591
|
+
image_grid_thw_list,
|
|
27592
|
+
video_grid_thw_list,
|
|
27593
|
+
spatial_merge_size,
|
|
27594
|
+
state
|
|
27595
|
+
}) {
|
|
27596
|
+
const { image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
27597
|
+
const ids = filtered_ids;
|
|
27598
|
+
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
27599
|
+
if (x == vision_start_token_id) acc.push(idx);
|
|
27600
|
+
return acc;
|
|
27601
|
+
}, []);
|
|
27602
|
+
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
27603
|
+
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
27604
|
+
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
27605
|
+
const llm_pos_ids_list = [];
|
|
27606
|
+
let st2 = 0;
|
|
27607
|
+
let remain_images = image_nums;
|
|
27608
|
+
let remain_videos = video_nums;
|
|
27609
|
+
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
27610
|
+
const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
|
|
27611
|
+
const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
|
|
27612
|
+
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
27613
|
+
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
27614
|
+
let ed;
|
|
27615
|
+
let t, h, w;
|
|
27616
|
+
if (ed_image < ed_video) {
|
|
27617
|
+
[t, h, w] = image_grid_thw_list[state.image_index];
|
|
27618
|
+
++state.image_index;
|
|
27619
|
+
--remain_images;
|
|
27620
|
+
ed = ed_image;
|
|
27621
|
+
} else {
|
|
27622
|
+
[t, h, w] = video_grid_thw_list[state.video_index];
|
|
27623
|
+
++state.video_index;
|
|
27624
|
+
--remain_videos;
|
|
27625
|
+
ed = ed_video;
|
|
27626
|
+
}
|
|
27627
|
+
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
27628
|
+
Number(t),
|
|
27629
|
+
Math.floor(Number(h) / spatial_merge_size),
|
|
27630
|
+
Math.floor(Number(w) / spatial_merge_size)
|
|
27631
|
+
];
|
|
27632
|
+
const text_len = ed - st2;
|
|
27633
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27634
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
27635
|
+
const offset = text_len + st_idx;
|
|
27636
|
+
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
27637
|
+
const t_index = Array.from(
|
|
27638
|
+
{ length: grid_size },
|
|
27639
|
+
(_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
|
|
27640
|
+
);
|
|
27641
|
+
const h_index = Array.from(
|
|
27642
|
+
{ length: grid_size },
|
|
27643
|
+
(_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
|
|
27644
|
+
);
|
|
27645
|
+
const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
|
|
27646
|
+
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
27647
|
+
st2 = ed + grid_size;
|
|
27648
|
+
}
|
|
27649
|
+
if (st2 < ids.length) {
|
|
27650
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27651
|
+
const text_len = ids.length - st2;
|
|
27652
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
27653
|
+
}
|
|
27654
|
+
return llm_pos_ids_list;
|
|
27655
|
+
}
|
|
27656
|
+
/**
|
|
27657
|
+
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
27658
|
+
*
|
|
27659
|
+
* Explanation:
|
|
27660
|
+
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
27661
|
+
*
|
|
27662
|
+
* For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
|
|
27663
|
+
* Examples:
|
|
27664
|
+
* input_ids: [T T T T T], here T is for text.
|
|
27665
|
+
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
27666
|
+
* height position_ids: [0, 1, 2, 3, 4]
|
|
27667
|
+
* width position_ids: [0, 1, 2, 3, 4]
|
|
27668
|
+
*
|
|
27669
|
+
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
27670
|
+
* and 1D rotary position embeddin for text part.
|
|
27671
|
+
* Examples:
|
|
27672
|
+
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
27673
|
+
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
27674
|
+
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
27675
|
+
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
27676
|
+
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
27677
|
+
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
27678
|
+
* text height position_ids: [3, 4, 5, 6, 7]
|
|
27679
|
+
* text width position_ids: [3, 4, 5, 6, 7]
|
|
27680
|
+
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
27681
|
+
*
|
|
27682
|
+
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
27683
|
+
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
27684
|
+
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
27685
|
+
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
|
|
27686
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
27687
|
+
*/
|
|
27688
|
+
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
27689
|
+
const { vision_config } = this.config;
|
|
27690
|
+
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
27691
|
+
if (image_grid_thw || video_grid_thw) {
|
|
27692
|
+
const total_input_ids = input_ids.tolist();
|
|
27693
|
+
if (!attention_mask) {
|
|
27694
|
+
attention_mask = ones_like(input_ids);
|
|
27695
|
+
}
|
|
27696
|
+
const attention_mask_list = attention_mask.tolist();
|
|
27697
|
+
const position_ids_list = Array.from(
|
|
27698
|
+
{ length: 3 },
|
|
27699
|
+
() => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
|
|
27700
|
+
);
|
|
27701
|
+
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
27702
|
+
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
27703
|
+
const state = { image_index: 0, video_index: 0 };
|
|
27704
|
+
const mrope_position_deltas = [];
|
|
27705
|
+
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
27706
|
+
const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
27707
|
+
const llm_pos_ids_list = this._get_multimodal_rope_positions({
|
|
27708
|
+
filtered_ids,
|
|
27709
|
+
image_grid_thw_list,
|
|
27710
|
+
video_grid_thw_list,
|
|
27711
|
+
spatial_merge_size,
|
|
27712
|
+
state
|
|
27713
|
+
});
|
|
27714
|
+
const llm_positions = this._reorder_and_write_positions(
|
|
27715
|
+
llm_pos_ids_list,
|
|
27716
|
+
attention_mask_list[i],
|
|
27717
|
+
position_ids_list,
|
|
27718
|
+
i
|
|
27719
|
+
);
|
|
27720
|
+
mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
|
|
27721
|
+
}
|
|
27722
|
+
return [
|
|
27723
|
+
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
27724
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
27725
|
+
];
|
|
27726
|
+
} else {
|
|
27727
|
+
return this._get_text_only_rope_index(input_ids, attention_mask);
|
|
27728
|
+
}
|
|
27729
|
+
}
|
|
27730
|
+
async encode_image({ pixel_values, image_grid_thw }) {
|
|
27731
|
+
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
27732
|
+
pixel_values,
|
|
27733
|
+
[this.image_grid_thw_name]: image_grid_thw
|
|
27734
|
+
})).image_features;
|
|
27735
|
+
return features;
|
|
27736
|
+
}
|
|
27737
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
27738
|
+
return default_merge_input_ids_with_image_features({
|
|
27739
|
+
// @ts-ignore
|
|
27740
|
+
image_token_id: this.config.image_token_id,
|
|
27741
|
+
...kwargs
|
|
27742
|
+
});
|
|
27743
|
+
}
|
|
27744
|
+
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
27745
|
+
if (!model_inputs.attention_mask || model_inputs.position_ids) {
|
|
27746
|
+
return model_inputs;
|
|
27747
|
+
}
|
|
27748
|
+
const session = this.sessions["decoder_model_merged"] ?? this.sessions["model"];
|
|
27749
|
+
if (!session.inputNames.includes("position_ids")) {
|
|
27750
|
+
return model_inputs;
|
|
27751
|
+
}
|
|
27752
|
+
if (!model_inputs.past_key_values) {
|
|
27753
|
+
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27754
|
+
model_inputs.input_ids,
|
|
27755
|
+
model_inputs.image_grid_thw,
|
|
27756
|
+
model_inputs.video_grid_thw,
|
|
27757
|
+
model_inputs.attention_mask
|
|
27758
|
+
);
|
|
27759
|
+
} else {
|
|
27760
|
+
model_inputs.pixel_values = null;
|
|
27761
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
27762
|
+
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
27763
|
+
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
27764
|
+
model_inputs.input_ids,
|
|
27765
|
+
model_inputs.image_grid_thw,
|
|
27766
|
+
model_inputs.video_grid_thw,
|
|
27767
|
+
model_inputs.attention_mask
|
|
27768
|
+
);
|
|
27769
|
+
model_inputs.rope_deltas = rope_deltas;
|
|
27770
|
+
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
27771
|
+
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
27772
|
+
} else {
|
|
27773
|
+
if (!model_inputs.rope_deltas) {
|
|
27774
|
+
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27775
|
+
model_inputs.input_ids,
|
|
27776
|
+
model_inputs.image_grid_thw,
|
|
27777
|
+
model_inputs.video_grid_thw,
|
|
27778
|
+
model_inputs.attention_mask
|
|
27779
|
+
);
|
|
27780
|
+
}
|
|
27781
|
+
const delta = BigInt(past_length);
|
|
27782
|
+
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
27783
|
+
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
27784
|
+
}
|
|
27785
|
+
}
|
|
27786
|
+
return model_inputs;
|
|
27787
|
+
}
|
|
27788
|
+
};
|
|
27789
|
+
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
27790
|
+
};
|
|
27791
|
+
|
|
27792
|
+
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
27793
|
+
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
27794
|
+
image_grid_thw_name = "image_grid_thw";
|
|
27795
|
+
};
|
|
27796
|
+
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
27797
|
+
image_grid_thw_name = "image_grid_thw";
|
|
27798
|
+
};
|
|
27799
|
+
|
|
27800
|
+
// src/models/glm_ocr/modeling_glm_ocr.js
|
|
27801
|
+
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
27802
|
+
/**
|
|
27803
|
+
* Compute 3D positional indices for vision tokens.
|
|
27804
|
+
* Temporal is constant, height is repeat-interleaved, width tiles.
|
|
27805
|
+
* @param {number} start_position
|
|
27806
|
+
* @param {number[]} grid_thw [T, H, W]
|
|
27807
|
+
* @param {number} temp_merge_size
|
|
27808
|
+
* @param {number} spatial_merge_size
|
|
27809
|
+
* @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
|
|
27810
|
+
*/
|
|
27811
|
+
get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
|
|
27812
|
+
const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
|
|
27813
|
+
const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
|
|
27814
|
+
const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
|
|
27815
|
+
const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
|
|
27816
|
+
const t_pos = Array.from({ length: seq_len }, () => start_position);
|
|
27817
|
+
const h_pos = Array.from(
|
|
27818
|
+
{ length: seq_len },
|
|
27819
|
+
(_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
|
|
27820
|
+
);
|
|
27821
|
+
const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
|
|
27822
|
+
return [...t_pos, ...h_pos, ...w_pos];
|
|
27823
|
+
}
|
|
27824
|
+
/**
|
|
27825
|
+
* GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
|
|
27826
|
+
* instead of vision_start_token_id scanning used by Qwen2VL.
|
|
27827
|
+
* After a vision segment, position advances by max(h, w) / spatial_merge_size.
|
|
27828
|
+
*/
|
|
27829
|
+
_get_multimodal_rope_positions({
|
|
27830
|
+
filtered_ids,
|
|
27831
|
+
image_grid_thw_list,
|
|
27832
|
+
video_grid_thw_list,
|
|
27833
|
+
spatial_merge_size,
|
|
27834
|
+
state
|
|
27835
|
+
}) {
|
|
27836
|
+
const { image_token_id } = this.config;
|
|
27837
|
+
const groups = [];
|
|
27838
|
+
let group_start = 0;
|
|
27839
|
+
let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
|
|
27840
|
+
for (let j = 1; j <= filtered_ids.length; ++j) {
|
|
27841
|
+
const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
|
|
27842
|
+
if (t !== current_type) {
|
|
27843
|
+
groups.push([current_type, group_start, j]);
|
|
27844
|
+
group_start = j;
|
|
27845
|
+
current_type = t;
|
|
27846
|
+
}
|
|
27847
|
+
}
|
|
27848
|
+
let current_pos = 0;
|
|
27849
|
+
const llm_pos_ids_list = [];
|
|
27850
|
+
for (const [modality_type, start_idx, end_idx] of groups) {
|
|
27851
|
+
if (modality_type === 0) {
|
|
27852
|
+
const text_len = end_idx - start_idx;
|
|
27853
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
|
|
27854
|
+
current_pos += text_len;
|
|
27855
|
+
} else {
|
|
27856
|
+
const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
|
|
27857
|
+
const temp_merge_size = grid_thw[0];
|
|
27858
|
+
llm_pos_ids_list.push(
|
|
27859
|
+
this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
|
|
27860
|
+
);
|
|
27861
|
+
current_pos += Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size;
|
|
27862
|
+
}
|
|
27863
|
+
}
|
|
27864
|
+
return llm_pos_ids_list;
|
|
27865
|
+
}
|
|
27866
|
+
};
|
|
27867
|
+
|
|
27273
27868
|
// src/models/glpn/modeling_glpn.js
|
|
27274
27869
|
var GLPNPreTrainedModel = class extends PreTrainedModel {
|
|
27275
27870
|
};
|
|
@@ -27468,27 +28063,6 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
|
|
|
27468
28063
|
var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
|
|
27469
28064
|
};
|
|
27470
28065
|
|
|
27471
|
-
// src/models/llava/modeling_llava.js
|
|
27472
|
-
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
27473
|
-
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
27474
|
-
};
|
|
27475
|
-
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
27476
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
27477
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
27478
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
27479
|
-
return default_merge_input_ids_with_image_features({
|
|
27480
|
-
// @ts-ignore
|
|
27481
|
-
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
27482
|
-
...kwargs,
|
|
27483
|
-
image_features: reshaped_image_hidden_states
|
|
27484
|
-
});
|
|
27485
|
-
}
|
|
27486
|
-
};
|
|
27487
|
-
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27488
|
-
};
|
|
27489
|
-
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
27490
|
-
};
|
|
27491
|
-
|
|
27492
28066
|
// src/models/idefics3/modeling_idefics3.js
|
|
27493
28067
|
var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27494
28068
|
forward_params = [
|
|
@@ -27582,6 +28156,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
|
|
|
27582
28156
|
var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
|
|
27583
28157
|
};
|
|
27584
28158
|
|
|
28159
|
+
// src/models/lighton_ocr/modeling_lighton_ocr.js
|
|
28160
|
+
var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
28161
|
+
};
|
|
28162
|
+
|
|
27585
28163
|
// src/models/lfm2_moe/modeling_lfm2_moe.js
|
|
27586
28164
|
var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
|
|
27587
28165
|
};
|
|
@@ -27778,6 +28356,14 @@ var MistralModel = class extends MistralPreTrainedModel {
|
|
|
27778
28356
|
var MistralForCausalLM = class extends MistralPreTrainedModel {
|
|
27779
28357
|
};
|
|
27780
28358
|
|
|
28359
|
+
// src/models/mistral4/modeling_mistral4.js
|
|
28360
|
+
var Mistral4PreTrainedModel = class extends PreTrainedModel {
|
|
28361
|
+
};
|
|
28362
|
+
var Mistral4Model = class extends Mistral4PreTrainedModel {
|
|
28363
|
+
};
|
|
28364
|
+
var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
|
|
28365
|
+
};
|
|
28366
|
+
|
|
27781
28367
|
// src/models/mobilebert/modeling_mobilebert.js
|
|
27782
28368
|
var MobileBertPreTrainedModel = class extends PreTrainedModel {
|
|
27783
28369
|
};
|
|
@@ -28246,6 +28832,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
|
|
|
28246
28832
|
var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
|
|
28247
28833
|
};
|
|
28248
28834
|
|
|
28835
|
+
// src/models/nemotron_h/modeling_nemotron_h.js
|
|
28836
|
+
var NemotronHPreTrainedModel = class extends PreTrainedModel {
|
|
28837
|
+
};
|
|
28838
|
+
var NemotronHModel = class extends NemotronHPreTrainedModel {
|
|
28839
|
+
};
|
|
28840
|
+
var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
|
|
28841
|
+
};
|
|
28842
|
+
|
|
28249
28843
|
// src/models/neobert/modeling_neobert.js
|
|
28250
28844
|
var NeoBertPreTrainedModel = class extends PreTrainedModel {
|
|
28251
28845
|
};
|
|
@@ -28526,252 +29120,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
|
|
|
28526
29120
|
var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
|
|
28527
29121
|
};
|
|
28528
29122
|
|
|
28529
|
-
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
28530
|
-
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
28531
|
-
forward_params = [
|
|
28532
|
-
// Text inputs
|
|
28533
|
-
"input_ids",
|
|
28534
|
-
"attention_mask",
|
|
28535
|
-
"position_ids",
|
|
28536
|
-
"past_key_values",
|
|
28537
|
-
// Vision inputs
|
|
28538
|
-
"pixel_values",
|
|
28539
|
-
"image_grid_thw"
|
|
28540
|
-
];
|
|
28541
|
-
};
|
|
28542
|
-
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
28543
|
-
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
28544
|
-
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
28545
|
-
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
28546
|
-
image_grid_thw_name = "grid_thw";
|
|
28547
|
-
/**
|
|
28548
|
-
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
28549
|
-
*
|
|
28550
|
-
* Explanation:
|
|
28551
|
-
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
28552
|
-
*
|
|
28553
|
-
* For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
|
|
28554
|
-
* Examples:
|
|
28555
|
-
* input_ids: [T T T T T], here T is for text.
|
|
28556
|
-
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
28557
|
-
* height position_ids: [0, 1, 2, 3, 4]
|
|
28558
|
-
* width position_ids: [0, 1, 2, 3, 4]
|
|
28559
|
-
*
|
|
28560
|
-
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
28561
|
-
* and 1D rotary position embeddin for text part.
|
|
28562
|
-
* Examples:
|
|
28563
|
-
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
28564
|
-
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
28565
|
-
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
28566
|
-
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
28567
|
-
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
28568
|
-
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
28569
|
-
* text height position_ids: [3, 4, 5, 6, 7]
|
|
28570
|
-
* text width position_ids: [3, 4, 5, 6, 7]
|
|
28571
|
-
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
28572
|
-
*
|
|
28573
|
-
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
28574
|
-
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
28575
|
-
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
28576
|
-
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
|
|
28577
|
-
* - 1 for tokens that are **not masked**,
|
|
28578
|
-
* - 0 for tokens that are **masked**.
|
|
28579
|
-
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
|
|
28580
|
-
* - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
|
|
28581
|
-
* - mrope_position_deltas: Tensor of shape `(batch_size)`.
|
|
28582
|
-
*/
|
|
28583
|
-
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
28584
|
-
const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
28585
|
-
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
28586
|
-
const mrope_position_deltas = [];
|
|
28587
|
-
if (image_grid_thw || video_grid_thw) {
|
|
28588
|
-
let total_input_ids = input_ids.tolist();
|
|
28589
|
-
if (!attention_mask) {
|
|
28590
|
-
attention_mask = ones_like(input_ids);
|
|
28591
|
-
}
|
|
28592
|
-
const attention_mask_list = attention_mask.tolist();
|
|
28593
|
-
const position_ids_list = Array.from(
|
|
28594
|
-
{ length: 3 },
|
|
28595
|
-
(_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
|
|
28596
|
-
);
|
|
28597
|
-
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
28598
|
-
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
28599
|
-
let image_index = 0;
|
|
28600
|
-
let video_index = 0;
|
|
28601
|
-
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
28602
|
-
const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
28603
|
-
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
28604
|
-
if (x == vision_start_token_id) acc.push(idx);
|
|
28605
|
-
return acc;
|
|
28606
|
-
}, []);
|
|
28607
|
-
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
28608
|
-
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
28609
|
-
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
28610
|
-
let llm_pos_ids_list = [];
|
|
28611
|
-
let st2 = 0;
|
|
28612
|
-
let remain_images = image_nums;
|
|
28613
|
-
let remain_videos = video_nums;
|
|
28614
|
-
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
28615
|
-
const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
|
|
28616
|
-
const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
|
|
28617
|
-
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
28618
|
-
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
28619
|
-
let ed;
|
|
28620
|
-
let t, h, w;
|
|
28621
|
-
if (ed_image < ed_video) {
|
|
28622
|
-
[t, h, w] = image_grid_thw_list[image_index];
|
|
28623
|
-
++image_index;
|
|
28624
|
-
--remain_images;
|
|
28625
|
-
ed = ed_image;
|
|
28626
|
-
} else {
|
|
28627
|
-
[t, h, w] = video_grid_thw_list[video_index];
|
|
28628
|
-
++video_index;
|
|
28629
|
-
--remain_videos;
|
|
28630
|
-
ed = ed_video;
|
|
28631
|
-
}
|
|
28632
|
-
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
28633
|
-
Number(t),
|
|
28634
|
-
Math.floor(Number(h) / spatial_merge_size),
|
|
28635
|
-
Math.floor(Number(w) / spatial_merge_size)
|
|
28636
|
-
];
|
|
28637
|
-
const text_len = ed - st2;
|
|
28638
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
28639
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
28640
|
-
const offset = text_len + st_idx;
|
|
28641
|
-
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
28642
|
-
const t_index = Array.from(
|
|
28643
|
-
{ length: grid_size },
|
|
28644
|
-
(_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
|
|
28645
|
-
);
|
|
28646
|
-
const h_index = Array.from(
|
|
28647
|
-
{ length: grid_size },
|
|
28648
|
-
(_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
|
|
28649
|
-
);
|
|
28650
|
-
const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
|
|
28651
|
-
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
28652
|
-
st2 = ed + grid_size;
|
|
28653
|
-
}
|
|
28654
|
-
if (st2 < ids.length) {
|
|
28655
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
28656
|
-
const text_len = ids.length - st2;
|
|
28657
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
28658
|
-
}
|
|
28659
|
-
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
28660
|
-
const llm_positions = new Array(num_items);
|
|
28661
|
-
let index = 0;
|
|
28662
|
-
for (let x = 0; x < 3; ++x) {
|
|
28663
|
-
for (let y = 0; y < llm_pos_ids_list.length; ++y) {
|
|
28664
|
-
const val = llm_pos_ids_list[y];
|
|
28665
|
-
const text_len = val.length / 3;
|
|
28666
|
-
for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
|
|
28667
|
-
llm_positions[index++] = val[z];
|
|
28668
|
-
}
|
|
28669
|
-
}
|
|
28670
|
-
}
|
|
28671
|
-
let count2 = 0;
|
|
28672
|
-
const attn_mask = attention_mask_list[i];
|
|
28673
|
-
for (let y = 0; y < attn_mask.length; ++y) {
|
|
28674
|
-
if (attn_mask[y] == 1) {
|
|
28675
|
-
for (let x = 0; x < 3; ++x) {
|
|
28676
|
-
position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
|
|
28677
|
-
}
|
|
28678
|
-
++count2;
|
|
28679
|
-
}
|
|
28680
|
-
}
|
|
28681
|
-
const max_llm_positions = max(llm_positions)[0];
|
|
28682
|
-
mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
|
|
28683
|
-
}
|
|
28684
|
-
return [
|
|
28685
|
-
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
28686
|
-
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
28687
|
-
];
|
|
28688
|
-
} else {
|
|
28689
|
-
if (attention_mask) {
|
|
28690
|
-
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
28691
|
-
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
28692
|
-
const mrope_position_deltas2 = Array.from(
|
|
28693
|
-
{ length: dims[0] },
|
|
28694
|
-
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
28695
|
-
);
|
|
28696
|
-
return [
|
|
28697
|
-
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
28698
|
-
new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
|
|
28699
|
-
];
|
|
28700
|
-
} else {
|
|
28701
|
-
const [batch_size, seq_length] = input_ids.dims;
|
|
28702
|
-
const position_ids = BigInt64Array.from(
|
|
28703
|
-
{ length: 3 * batch_size * seq_length },
|
|
28704
|
-
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
28705
|
-
);
|
|
28706
|
-
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
28707
|
-
}
|
|
28708
|
-
}
|
|
28709
|
-
}
|
|
28710
|
-
async encode_image({ pixel_values, image_grid_thw }) {
|
|
28711
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
28712
|
-
pixel_values,
|
|
28713
|
-
[this.image_grid_thw_name]: image_grid_thw
|
|
28714
|
-
})).image_features;
|
|
28715
|
-
return features;
|
|
28716
|
-
}
|
|
28717
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
28718
|
-
return default_merge_input_ids_with_image_features({
|
|
28719
|
-
// @ts-ignore
|
|
28720
|
-
image_token_id: this.config.image_token_id,
|
|
28721
|
-
...kwargs
|
|
28722
|
-
});
|
|
28723
|
-
}
|
|
28724
|
-
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
28725
|
-
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
28726
|
-
if (!model_inputs.past_key_values) {
|
|
28727
|
-
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
28728
|
-
model_inputs.input_ids,
|
|
28729
|
-
model_inputs.image_grid_thw,
|
|
28730
|
-
model_inputs.video_grid_thw,
|
|
28731
|
-
model_inputs.attention_mask
|
|
28732
|
-
);
|
|
28733
|
-
} else {
|
|
28734
|
-
model_inputs.pixel_values = null;
|
|
28735
|
-
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
28736
|
-
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
28737
|
-
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
28738
|
-
model_inputs.input_ids,
|
|
28739
|
-
model_inputs.image_grid_thw,
|
|
28740
|
-
model_inputs.video_grid_thw,
|
|
28741
|
-
model_inputs.attention_mask
|
|
28742
|
-
);
|
|
28743
|
-
model_inputs.rope_deltas = rope_deltas;
|
|
28744
|
-
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
28745
|
-
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
28746
|
-
} else {
|
|
28747
|
-
if (!model_inputs.rope_deltas) {
|
|
28748
|
-
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
28749
|
-
model_inputs.input_ids,
|
|
28750
|
-
model_inputs.image_grid_thw,
|
|
28751
|
-
model_inputs.video_grid_thw,
|
|
28752
|
-
model_inputs.attention_mask
|
|
28753
|
-
);
|
|
28754
|
-
}
|
|
28755
|
-
const delta = BigInt(past_length);
|
|
28756
|
-
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
28757
|
-
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
28758
|
-
}
|
|
28759
|
-
}
|
|
28760
|
-
}
|
|
28761
|
-
return model_inputs;
|
|
28762
|
-
}
|
|
28763
|
-
};
|
|
28764
|
-
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
28765
|
-
};
|
|
28766
|
-
|
|
28767
|
-
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
28768
|
-
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
28769
|
-
image_grid_thw_name = "image_grid_thw";
|
|
28770
|
-
};
|
|
28771
|
-
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
28772
|
-
image_grid_thw_name = "image_grid_thw";
|
|
28773
|
-
};
|
|
28774
|
-
|
|
28775
29123
|
// src/models/qwen3/modeling_qwen3.js
|
|
28776
29124
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
28777
29125
|
};
|
|
@@ -29217,6 +29565,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
|
|
|
29217
29565
|
}
|
|
29218
29566
|
};
|
|
29219
29567
|
|
|
29568
|
+
// src/models/solar_open/modeling_solar_open.js
|
|
29569
|
+
var SolarOpenPreTrainedModel = class extends PreTrainedModel {
|
|
29570
|
+
};
|
|
29571
|
+
var SolarOpenModel = class extends SolarOpenPreTrainedModel {
|
|
29572
|
+
};
|
|
29573
|
+
var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
|
|
29574
|
+
};
|
|
29575
|
+
|
|
29220
29576
|
// src/models/speecht5/modeling_speecht5.js
|
|
29221
29577
|
var SpeechT5PreTrainedModel = class extends PreTrainedModel {
|
|
29222
29578
|
};
|
|
@@ -30333,6 +30689,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
|
|
|
30333
30689
|
// src/models/registry.js
|
|
30334
30690
|
var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
|
|
30335
30691
|
["bert", "BertModel"],
|
|
30692
|
+
["eurobert", "EuroBertModel"],
|
|
30336
30693
|
["neobert", "NeoBertModel"],
|
|
30337
30694
|
["modernbert", "ModernBertModel"],
|
|
30338
30695
|
["nomic_bert", "NomicBertModel"],
|
|
@@ -30464,6 +30821,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
30464
30821
|
["gemma3_text", "Gemma3Model"],
|
|
30465
30822
|
["helium", "HeliumModel"],
|
|
30466
30823
|
["glm", "GlmModel"],
|
|
30824
|
+
["glm_moe_dsa", "GlmMoeDsaModel"],
|
|
30467
30825
|
["openelm", "OpenELMModel"],
|
|
30468
30826
|
["qwen2", "Qwen2Model"],
|
|
30469
30827
|
["qwen2_moe", "Qwen2MoeModel"],
|
|
@@ -30475,12 +30833,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
30475
30833
|
["mpt", "MptModel"],
|
|
30476
30834
|
["opt", "OPTModel"],
|
|
30477
30835
|
["mistral", "MistralModel"],
|
|
30836
|
+
["mistral4", "Mistral4Model"],
|
|
30478
30837
|
["ministral", "MinistralModel"],
|
|
30479
30838
|
["ministral3", "Ministral3Model"],
|
|
30480
30839
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
30481
30840
|
["starcoder2", "Starcoder2Model"],
|
|
30841
|
+
["deepseek_v3", "DeepseekV3Model"],
|
|
30482
30842
|
["falcon", "FalconModel"],
|
|
30483
30843
|
["falcon_h1", "FalconH1Model"],
|
|
30844
|
+
["nemotron_h", "NemotronHModel"],
|
|
30845
|
+
["solar_open", "SolarOpenModel"],
|
|
30484
30846
|
["stablelm", "StableLmModel"],
|
|
30485
30847
|
["modernbert-decoder", "ModernBertDecoderModel"],
|
|
30486
30848
|
["hunyuan_v1_dense", "HunYuanDenseV1Model"],
|
|
@@ -30500,6 +30862,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30500
30862
|
]);
|
|
30501
30863
|
var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30502
30864
|
["bert", "BertForSequenceClassification"],
|
|
30865
|
+
["eurobert", "EuroBertForSequenceClassification"],
|
|
30503
30866
|
["neobert", "NeoBertForSequenceClassification"],
|
|
30504
30867
|
["modernbert", "ModernBertForSequenceClassification"],
|
|
30505
30868
|
["roformer", "RoFormerForSequenceClassification"],
|
|
@@ -30522,6 +30885,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30522
30885
|
]);
|
|
30523
30886
|
var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30524
30887
|
["bert", "BertForTokenClassification"],
|
|
30888
|
+
["eurobert", "EuroBertForTokenClassification"],
|
|
30525
30889
|
["neobert", "NeoBertForTokenClassification"],
|
|
30526
30890
|
["modernbert", "ModernBertForTokenClassification"],
|
|
30527
30891
|
["roformer", "RoFormerForTokenClassification"],
|
|
@@ -30584,6 +30948,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30584
30948
|
["gemma3", "Gemma3ForCausalLM"],
|
|
30585
30949
|
["helium", "HeliumForCausalLM"],
|
|
30586
30950
|
["glm", "GlmForCausalLM"],
|
|
30951
|
+
["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
|
|
30587
30952
|
["openelm", "OpenELMForCausalLM"],
|
|
30588
30953
|
["qwen2", "Qwen2ForCausalLM"],
|
|
30589
30954
|
["qwen2_moe", "Qwen2MoeForCausalLM"],
|
|
@@ -30595,6 +30960,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30595
30960
|
["qwen3_vl", "Qwen3VLForCausalLM"],
|
|
30596
30961
|
["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
|
|
30597
30962
|
["qwen3_5", "Qwen3_5ForCausalLM"],
|
|
30963
|
+
["qwen3_5_text", "Qwen3_5ForCausalLM"],
|
|
30598
30964
|
["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
|
|
30599
30965
|
["gemma3n", "Gemma3nForCausalLM"],
|
|
30600
30966
|
["phi", "PhiForCausalLM"],
|
|
@@ -30603,13 +30969,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30603
30969
|
["opt", "OPTForCausalLM"],
|
|
30604
30970
|
["mbart", "MBartForCausalLM"],
|
|
30605
30971
|
["mistral", "MistralForCausalLM"],
|
|
30972
|
+
["mistral4", "Mistral4ForCausalLM"],
|
|
30606
30973
|
["ministral", "MinistralForCausalLM"],
|
|
30607
30974
|
["ministral3", "Ministral3ForCausalLM"],
|
|
30608
30975
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
30609
30976
|
["starcoder2", "Starcoder2ForCausalLM"],
|
|
30977
|
+
["deepseek_v3", "DeepseekV3ForCausalLM"],
|
|
30610
30978
|
["falcon", "FalconForCausalLM"],
|
|
30611
30979
|
["falcon_h1", "FalconH1ForCausalLM"],
|
|
30980
|
+
["nemotron_h", "NemotronHForCausalLM"],
|
|
30612
30981
|
["trocr", "TrOCRForCausalLM"],
|
|
30982
|
+
["solar_open", "SolarOpenForCausalLM"],
|
|
30613
30983
|
["stablelm", "StableLmForCausalLM"],
|
|
30614
30984
|
["modernbert-decoder", "ModernBertDecoderForCausalLM"],
|
|
30615
30985
|
["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
|
|
@@ -30620,6 +30990,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30620
30990
|
var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
|
|
30621
30991
|
var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30622
30992
|
["bert", "BertForMaskedLM"],
|
|
30993
|
+
["eurobert", "EuroBertForMaskedLM"],
|
|
30623
30994
|
["neobert", "NeoBertForMaskedLM"],
|
|
30624
30995
|
["modernbert", "ModernBertForMaskedLM"],
|
|
30625
30996
|
["roformer", "RoFormerForMaskedLM"],
|
|
@@ -30677,8 +31048,11 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30677
31048
|
["smolvlm", "SmolVLMForConditionalGeneration"],
|
|
30678
31049
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
30679
31050
|
["llava_qwen2", "LlavaQwen2ForCausalLM"],
|
|
31051
|
+
["gemma3", "Gemma3ForConditionalGeneration"],
|
|
30680
31052
|
["gemma3n", "Gemma3nForConditionalGeneration"],
|
|
30681
|
-
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
31053
|
+
["mistral3", "Mistral3ForConditionalGeneration"],
|
|
31054
|
+
["lighton_ocr", "LightOnOcrForConditionalGeneration"],
|
|
31055
|
+
["glm_ocr", "GlmOcrForConditionalGeneration"]
|
|
30682
31056
|
]);
|
|
30683
31057
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30684
31058
|
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
@@ -30783,6 +31157,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30783
31157
|
]);
|
|
30784
31158
|
var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
|
|
30785
31159
|
var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
31160
|
+
["chmv2", "CHMv2ForDepthEstimation"],
|
|
30786
31161
|
["dpt", "DPTForDepthEstimation"],
|
|
30787
31162
|
["depth_anything", "DepthAnythingForDepthEstimation"],
|
|
30788
31163
|
["glpn", "GLPNForDepthEstimation"],
|
|
@@ -30868,13 +31243,6 @@ var CUSTOM_MAPPING = [
|
|
|
30868
31243
|
],
|
|
30869
31244
|
["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
|
|
30870
31245
|
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
|
|
30871
|
-
["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30872
|
-
["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30873
|
-
["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30874
|
-
["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30875
|
-
["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30876
|
-
["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30877
|
-
["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30878
31246
|
[
|
|
30879
31247
|
"VoxtralRealtimeForConditionalGeneration",
|
|
30880
31248
|
VoxtralRealtimeForConditionalGeneration,
|
|
@@ -32556,6 +32924,41 @@ var TASK_ALIASES = Object.freeze({
|
|
|
32556
32924
|
embeddings: "feature-extraction"
|
|
32557
32925
|
});
|
|
32558
32926
|
|
|
32927
|
+
// src/utils/model_registry/resolve_model_type.js
|
|
32928
|
+
function resolve_model_type(config, { warn = true } = {}) {
|
|
32929
|
+
const architectures = (
|
|
32930
|
+
/** @type {string[]} */
|
|
32931
|
+
config.architectures || []
|
|
32932
|
+
);
|
|
32933
|
+
for (const arch of architectures) {
|
|
32934
|
+
const mappedType = MODEL_TYPE_MAPPING.get(arch);
|
|
32935
|
+
if (mappedType !== void 0) {
|
|
32936
|
+
return mappedType;
|
|
32937
|
+
}
|
|
32938
|
+
}
|
|
32939
|
+
if (config.model_type) {
|
|
32940
|
+
const mappedType = MODEL_TYPE_MAPPING.get(config.model_type);
|
|
32941
|
+
if (mappedType !== void 0) {
|
|
32942
|
+
return mappedType;
|
|
32943
|
+
}
|
|
32944
|
+
for (const mapping of Object.values(MODEL_MAPPING_NAMES)) {
|
|
32945
|
+
if (mapping.has(config.model_type)) {
|
|
32946
|
+
const resolved = MODEL_TYPE_MAPPING.get(mapping.get(config.model_type));
|
|
32947
|
+
if (resolved !== void 0) {
|
|
32948
|
+
return resolved;
|
|
32949
|
+
}
|
|
32950
|
+
}
|
|
32951
|
+
}
|
|
32952
|
+
}
|
|
32953
|
+
if (warn) {
|
|
32954
|
+
const archList = architectures.length > 0 ? architectures.join(", ") : "(none)";
|
|
32955
|
+
logger.warn(
|
|
32956
|
+
`[resolve_model_type] Architecture(s) not found in MODEL_TYPE_MAPPING: [${archList}] for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`
|
|
32957
|
+
);
|
|
32958
|
+
}
|
|
32959
|
+
return MODEL_TYPES.EncoderOnly;
|
|
32960
|
+
}
|
|
32961
|
+
|
|
32559
32962
|
// src/utils/model_registry/get_model_files.js
|
|
32560
32963
|
function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
|
|
32561
32964
|
if (config !== null) {
|
|
@@ -32578,43 +32981,7 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
|
|
|
32578
32981
|
const subfolder = "onnx";
|
|
32579
32982
|
const rawDevice = overrideDevice ?? custom_config.device;
|
|
32580
32983
|
let dtype = overrideDtype ?? custom_config.dtype;
|
|
32581
|
-
|
|
32582
|
-
const architectures = (
|
|
32583
|
-
/** @type {string[]} */
|
|
32584
|
-
config.architectures || []
|
|
32585
|
-
);
|
|
32586
|
-
let foundInMapping = false;
|
|
32587
|
-
for (const arch of architectures) {
|
|
32588
|
-
const mappedType = MODEL_TYPE_MAPPING.get(arch);
|
|
32589
|
-
if (mappedType !== void 0) {
|
|
32590
|
-
modelType = mappedType;
|
|
32591
|
-
foundInMapping = true;
|
|
32592
|
-
break;
|
|
32593
|
-
}
|
|
32594
|
-
}
|
|
32595
|
-
if (!foundInMapping && config.model_type) {
|
|
32596
|
-
const mappedType = MODEL_TYPE_MAPPING.get(config.model_type);
|
|
32597
|
-
if (mappedType !== void 0) {
|
|
32598
|
-
modelType = mappedType;
|
|
32599
|
-
foundInMapping = true;
|
|
32600
|
-
}
|
|
32601
|
-
if (!foundInMapping) {
|
|
32602
|
-
for (const mapping of Object.values(MODEL_MAPPING_NAMES)) {
|
|
32603
|
-
if (mapping.has(config.model_type)) {
|
|
32604
|
-
modelType = MODEL_TYPE_MAPPING.get(mapping.get(config.model_type));
|
|
32605
|
-
foundInMapping = true;
|
|
32606
|
-
break;
|
|
32607
|
-
}
|
|
32608
|
-
}
|
|
32609
|
-
}
|
|
32610
|
-
}
|
|
32611
|
-
if (!foundInMapping) {
|
|
32612
|
-
const archList = architectures.length > 0 ? architectures.join(", ") : "(none)";
|
|
32613
|
-
logger.warn(
|
|
32614
|
-
`[get_model_files] Architecture(s) not found in MODEL_TYPE_MAPPING: [${archList}] for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`
|
|
32615
|
-
);
|
|
32616
|
-
modelType = MODEL_TYPES.EncoderOnly;
|
|
32617
|
-
}
|
|
32984
|
+
const modelType = resolve_model_type(config);
|
|
32618
32985
|
const add_model_file = (fileName, baseName = null) => {
|
|
32619
32986
|
baseName = baseName ?? fileName;
|
|
32620
32987
|
const selectedDevice = selectDevice(rawDevice, fileName);
|
|
@@ -33201,6 +33568,31 @@ async function clear_pipeline_cache(task, modelId, options = {}) {
|
|
|
33201
33568
|
return await clear_files_from_cache(modelId, files, options);
|
|
33202
33569
|
}
|
|
33203
33570
|
|
|
33571
|
+
// src/utils/model_registry/get_available_dtypes.js
|
|
33572
|
+
var CONCRETE_DTYPES = Object.keys(DEFAULT_DTYPE_SUFFIX_MAPPING);
|
|
33573
|
+
async function get_available_dtypes(modelId, { config = null, model_file_name = null, revision = "main", cache_dir = null, local_files_only = false } = {}) {
|
|
33574
|
+
config = await get_config(modelId, { config, cache_dir, local_files_only, revision });
|
|
33575
|
+
const subfolder = "onnx";
|
|
33576
|
+
const modelType = resolve_model_type(config);
|
|
33577
|
+
const { sessions } = getSessionsConfig(modelType, config, { model_file_name });
|
|
33578
|
+
const baseNames = Object.values(sessions);
|
|
33579
|
+
const metadataOptions = { revision, cache_dir, local_files_only };
|
|
33580
|
+
const probeResults = await Promise.all(
|
|
33581
|
+
CONCRETE_DTYPES.map(async (dtype) => {
|
|
33582
|
+
const suffix = DEFAULT_DTYPE_SUFFIX_MAPPING[dtype] ?? "";
|
|
33583
|
+
const allExist = await Promise.all(
|
|
33584
|
+
baseNames.map(async (baseName) => {
|
|
33585
|
+
const filename = `${subfolder}/${baseName}${suffix}.onnx`;
|
|
33586
|
+
const metadata = await get_file_metadata(modelId, filename, metadataOptions);
|
|
33587
|
+
return metadata.exists;
|
|
33588
|
+
})
|
|
33589
|
+
);
|
|
33590
|
+
return { dtype, available: allExist.every(Boolean) };
|
|
33591
|
+
})
|
|
33592
|
+
);
|
|
33593
|
+
return probeResults.filter((r) => r.available).map((r) => r.dtype);
|
|
33594
|
+
}
|
|
33595
|
+
|
|
33204
33596
|
// src/utils/model_registry/ModelRegistry.js
|
|
33205
33597
|
var ModelRegistry = class {
|
|
33206
33598
|
/**
|
|
@@ -33287,6 +33679,29 @@ var ModelRegistry = class {
|
|
|
33287
33679
|
static async get_processor_files(modelId) {
|
|
33288
33680
|
return get_processor_files(modelId);
|
|
33289
33681
|
}
|
|
33682
|
+
/**
|
|
33683
|
+
* Detects which quantization levels (dtypes) are available for a model
|
|
33684
|
+
* by checking which ONNX files exist on the hub or locally.
|
|
33685
|
+
*
|
|
33686
|
+
* A dtype is considered available if all required model session files
|
|
33687
|
+
* exist for that dtype.
|
|
33688
|
+
*
|
|
33689
|
+
* @param {string} modelId - The model id (e.g., "onnx-community/all-MiniLM-L6-v2-ONNX")
|
|
33690
|
+
* @param {Object} [options] - Optional parameters
|
|
33691
|
+
* @param {import('../../configs.js').PretrainedConfig} [options.config=null] - Pre-loaded config
|
|
33692
|
+
* @param {string} [options.model_file_name=null] - Override the model file name (excluding .onnx suffix)
|
|
33693
|
+
* @param {string} [options.revision='main'] - Model revision
|
|
33694
|
+
* @param {string} [options.cache_dir=null] - Custom cache directory
|
|
33695
|
+
* @param {boolean} [options.local_files_only=false] - Only check local files
|
|
33696
|
+
* @returns {Promise<string[]>} Array of available dtype strings (e.g., ['fp32', 'fp16', 'q4', 'q8'])
|
|
33697
|
+
*
|
|
33698
|
+
* @example
|
|
33699
|
+
* const dtypes = await ModelRegistry.get_available_dtypes('onnx-community/all-MiniLM-L6-v2-ONNX');
|
|
33700
|
+
* console.log(dtypes); // ['fp32', 'fp16', 'int8', 'uint8', 'q8', 'q4']
|
|
33701
|
+
*/
|
|
33702
|
+
static async get_available_dtypes(modelId, options = {}) {
|
|
33703
|
+
return get_available_dtypes(modelId, options);
|
|
33704
|
+
}
|
|
33290
33705
|
/**
|
|
33291
33706
|
* Quickly checks if a model is fully cached by verifying `config.json` is present,
|
|
33292
33707
|
* then confirming all required files are cached.
|
|
@@ -33522,6 +33937,9 @@ var ModelRegistry = class {
|
|
|
33522
33937
|
BloomModel,
|
|
33523
33938
|
BloomPreTrainedModel,
|
|
33524
33939
|
BloomTokenizer,
|
|
33940
|
+
CHMv2ForDepthEstimation,
|
|
33941
|
+
CHMv2ImageProcessor,
|
|
33942
|
+
CHMv2PreTrainedModel,
|
|
33525
33943
|
CLIPFeatureExtractor,
|
|
33526
33944
|
CLIPImageProcessor,
|
|
33527
33945
|
CLIPModel,
|
|
@@ -33617,6 +34035,9 @@ var ModelRegistry = class {
|
|
|
33617
34035
|
DebertaV2Tokenizer,
|
|
33618
34036
|
DecisionTransformerModel,
|
|
33619
34037
|
DecisionTransformerPreTrainedModel,
|
|
34038
|
+
DeepseekV3ForCausalLM,
|
|
34039
|
+
DeepseekV3Model,
|
|
34040
|
+
DeepseekV3PreTrainedModel,
|
|
33620
34041
|
DeiTFeatureExtractor,
|
|
33621
34042
|
DeiTForImageClassification,
|
|
33622
34043
|
DeiTImageProcessor,
|
|
@@ -33677,6 +34098,11 @@ var ModelRegistry = class {
|
|
|
33677
34098
|
EsmModel,
|
|
33678
34099
|
EsmPreTrainedModel,
|
|
33679
34100
|
EsmTokenizer,
|
|
34101
|
+
EuroBertForMaskedLM,
|
|
34102
|
+
EuroBertForSequenceClassification,
|
|
34103
|
+
EuroBertForTokenClassification,
|
|
34104
|
+
EuroBertModel,
|
|
34105
|
+
EuroBertPreTrainedModel,
|
|
33680
34106
|
ExaoneForCausalLM,
|
|
33681
34107
|
ExaoneModel,
|
|
33682
34108
|
ExaonePreTrainedModel,
|
|
@@ -33723,8 +34149,11 @@ var ModelRegistry = class {
|
|
|
33723
34149
|
Gemma2Model,
|
|
33724
34150
|
Gemma2PreTrainedModel,
|
|
33725
34151
|
Gemma3ForCausalLM,
|
|
34152
|
+
Gemma3ForConditionalGeneration,
|
|
34153
|
+
Gemma3ImageProcessor,
|
|
33726
34154
|
Gemma3Model,
|
|
33727
34155
|
Gemma3PreTrainedModel,
|
|
34156
|
+
Gemma3Processor,
|
|
33728
34157
|
Gemma3nAudioFeatureExtractor,
|
|
33729
34158
|
Gemma3nForCausalLM,
|
|
33730
34159
|
Gemma3nForConditionalGeneration,
|
|
@@ -33734,8 +34163,14 @@ var ModelRegistry = class {
|
|
|
33734
34163
|
GemmaModel,
|
|
33735
34164
|
GemmaPreTrainedModel,
|
|
33736
34165
|
GemmaTokenizer,
|
|
34166
|
+
Glm46VImageProcessor,
|
|
34167
|
+
Glm46VProcessor,
|
|
33737
34168
|
GlmForCausalLM,
|
|
33738
34169
|
GlmModel,
|
|
34170
|
+
GlmMoeDsaForCausalLM,
|
|
34171
|
+
GlmMoeDsaModel,
|
|
34172
|
+
GlmMoeDsaPreTrainedModel,
|
|
34173
|
+
GlmOcrForConditionalGeneration,
|
|
33739
34174
|
GlmPreTrainedModel,
|
|
33740
34175
|
GptOssForCausalLM,
|
|
33741
34176
|
GptOssModel,
|
|
@@ -33801,6 +34236,7 @@ var ModelRegistry = class {
|
|
|
33801
34236
|
Lfm2VlForConditionalGeneration,
|
|
33802
34237
|
Lfm2VlImageProcessor,
|
|
33803
34238
|
Lfm2VlProcessor,
|
|
34239
|
+
LightOnOcrForConditionalGeneration,
|
|
33804
34240
|
LiteWhisperForConditionalGeneration,
|
|
33805
34241
|
Llama4ForCausalLM,
|
|
33806
34242
|
Llama4PreTrainedModel,
|
|
@@ -33870,6 +34306,9 @@ var ModelRegistry = class {
|
|
|
33870
34306
|
MimiPreTrainedModel,
|
|
33871
34307
|
MinLengthLogitsProcessor,
|
|
33872
34308
|
MinNewTokensLengthLogitsProcessor,
|
|
34309
|
+
Mistral4ForCausalLM,
|
|
34310
|
+
Mistral4Model,
|
|
34311
|
+
Mistral4PreTrainedModel,
|
|
33873
34312
|
MistralForCausalLM,
|
|
33874
34313
|
MistralModel,
|
|
33875
34314
|
MistralPreTrainedModel,
|
|
@@ -33941,6 +34380,9 @@ var ModelRegistry = class {
|
|
|
33941
34380
|
NanoChatForCausalLM,
|
|
33942
34381
|
NanoChatModel,
|
|
33943
34382
|
NanoChatPreTrainedModel,
|
|
34383
|
+
NemotronHForCausalLM,
|
|
34384
|
+
NemotronHModel,
|
|
34385
|
+
NemotronHPreTrainedModel,
|
|
33944
34386
|
NeoBertForMaskedLM,
|
|
33945
34387
|
NeoBertForQuestionAnswering,
|
|
33946
34388
|
NeoBertForSequenceClassification,
|
|
@@ -34130,6 +34572,9 @@ var ModelRegistry = class {
|
|
|
34130
34572
|
SnacFeatureExtractor,
|
|
34131
34573
|
SnacModel,
|
|
34132
34574
|
SnacPreTrainedModel,
|
|
34575
|
+
SolarOpenForCausalLM,
|
|
34576
|
+
SolarOpenModel,
|
|
34577
|
+
SolarOpenPreTrainedModel,
|
|
34133
34578
|
SpeechT5FeatureExtractor,
|
|
34134
34579
|
SpeechT5ForSpeechToText,
|
|
34135
34580
|
SpeechT5ForTextToSpeech,
|
|
@@ -34327,7 +34772,7 @@ var ModelRegistry = class {
|
|
|
34327
34772
|
|
|
34328
34773
|
onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
|
|
34329
34774
|
(*!
|
|
34330
|
-
* ONNX Runtime Web v1.25.0-dev.
|
|
34775
|
+
* ONNX Runtime Web v1.25.0-dev.20260323-a99aad9d36
|
|
34331
34776
|
* Copyright (c) Microsoft Corporation. All rights reserved.
|
|
34332
34777
|
* Licensed under the MIT License.
|
|
34333
34778
|
*)
|