@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +2255 -931
- package/dist/transformers.min.js +19 -19
- package/dist/transformers.node.cjs +2300 -934
- package/dist/transformers.node.min.cjs +20 -20
- package/dist/transformers.node.min.mjs +20 -20
- package/dist/transformers.node.mjs +2336 -1012
- package/dist/transformers.web.js +2327 -1003
- package/dist/transformers.web.min.js +17 -17
- package/package.json +4 -4
- package/src/cache_utils.js +62 -0
- package/src/configs.js +45 -24
- package/src/env.js +8 -1
- package/src/image_processors_utils.js +27 -17
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/chmv2/image_processing_chmv2.js +3 -0
- package/src/models/chmv2/modeling_chmv2.js +4 -0
- package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/eurobert/modeling_eurobert.js +41 -0
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/glm46v/image_processing_glm46v.js +12 -0
- package/src/models/glm46v/processing_glm46v.js +5 -0
- package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
- package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +3 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/mistral4/modeling_mistral4.js +5 -0
- package/src/models/modeling_utils.js +224 -308
- package/src/models/models.js +14 -1
- package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +4 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +194 -143
- package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +2 -1
- package/src/models/registry.js +42 -0
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/solar_open/modeling_solar_open.js +5 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/pipelines.js +1 -0
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +5 -0
- package/src/utils/hub.js +4 -1
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/get_file_metadata.js +15 -2
- package/src/utils/model_registry/get_model_files.js +52 -78
- package/src/utils/tensor.js +18 -2
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +8 -0
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +18 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
- package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
- package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
- package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
- package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
- package/types/models/glm46v/processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +3 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
- package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
- package/types/models/modeling_utils.d.ts +44 -35
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +14 -1
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +4 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +43 -6
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +3 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
- package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/pipelines.d.ts +1 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/models/ast/modeling_ast.d.ts.map +0 -1
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
- /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
|
@@ -14,7 +14,7 @@ var __export = (target, all) => {
|
|
|
14
14
|
import fs from "fs";
|
|
15
15
|
import path from "path";
|
|
16
16
|
import url from "url";
|
|
17
|
-
var VERSION = "4.0.0-next.6";
|
|
17
|
+
var VERSION = "4.0.0-next.8";
|
|
18
18
|
var HAS_SELF = typeof self !== "undefined";
|
|
19
19
|
var IS_FS_AVAILABLE = !isEmpty(fs);
|
|
20
20
|
var IS_PATH_AVAILABLE = !isEmpty(path);
|
|
@@ -142,6 +142,7 @@ var env = {
|
|
|
142
142
|
customCache: null,
|
|
143
143
|
useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
|
|
144
144
|
cacheKey: "transformers-cache",
|
|
145
|
+
experimental_useCrossOriginStorage: false,
|
|
145
146
|
/////////////////// Custom fetch /////////////////////
|
|
146
147
|
fetch: DEFAULT_FETCH
|
|
147
148
|
//////////////////////////////////////////////////////
|
|
@@ -243,7 +244,7 @@ var logger = {
|
|
|
243
244
|
}
|
|
244
245
|
};
|
|
245
246
|
|
|
246
|
-
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.
|
|
247
|
+
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
|
|
247
248
|
var DictionarySplitter = class {
|
|
248
249
|
/**
|
|
249
250
|
* @param dictionary The dictionary of words to use for splitting.
|
|
@@ -1899,10 +1900,10 @@ var BPE = class extends TokenizerModel_default {
|
|
|
1899
1900
|
);
|
|
1900
1901
|
if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
|
|
1901
1902
|
output_tokens.push(...byte_tokens);
|
|
1902
|
-
} else {
|
|
1903
|
+
} else if (this.unk_token != null) {
|
|
1903
1904
|
output_tokens.push(this.unk_token);
|
|
1904
1905
|
}
|
|
1905
|
-
} else {
|
|
1906
|
+
} else if (this.unk_token != null) {
|
|
1906
1907
|
output_tokens.push(this.unk_token);
|
|
1907
1908
|
}
|
|
1908
1909
|
}
|
|
@@ -2692,7 +2693,7 @@ var Tokenizer = class {
|
|
|
2692
2693
|
};
|
|
2693
2694
|
var Tokenizer_default = Tokenizer;
|
|
2694
2695
|
|
|
2695
|
-
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.
|
|
2696
|
+
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.6/node_modules/@huggingface/jinja/dist/index.js
|
|
2696
2697
|
var TOKEN_TYPES = Object.freeze({
|
|
2697
2698
|
Text: "Text",
|
|
2698
2699
|
// The text between Jinja statements or expressions
|
|
@@ -4211,7 +4212,11 @@ var Environment = class {
|
|
|
4211
4212
|
["number", (operand) => operand instanceof IntegerValue || operand instanceof FloatValue],
|
|
4212
4213
|
["integer", (operand) => operand instanceof IntegerValue],
|
|
4213
4214
|
["iterable", (operand) => operand.type === "ArrayValue" || operand.type === "StringValue"],
|
|
4214
|
-
["mapping", (operand) => operand
|
|
4215
|
+
["mapping", (operand) => operand instanceof ObjectValue],
|
|
4216
|
+
[
|
|
4217
|
+
"sequence",
|
|
4218
|
+
(operand) => operand instanceof ArrayValue || operand instanceof ObjectValue || operand instanceof StringValue
|
|
4219
|
+
],
|
|
4215
4220
|
[
|
|
4216
4221
|
"lower",
|
|
4217
4222
|
(operand) => {
|
|
@@ -4484,6 +4489,9 @@ var Interpreter = class {
|
|
|
4484
4489
|
applyFilter(operand, filterNode, environment) {
|
|
4485
4490
|
if (filterNode.type === "Identifier") {
|
|
4486
4491
|
const filter = filterNode;
|
|
4492
|
+
if (filter.value === "safe") {
|
|
4493
|
+
return operand;
|
|
4494
|
+
}
|
|
4487
4495
|
if (filter.value === "tojson") {
|
|
4488
4496
|
return new StringValue(toJSON(operand, {}));
|
|
4489
4497
|
}
|
|
@@ -4573,6 +4581,8 @@ var Interpreter = class {
|
|
|
4573
4581
|
return new IntegerValue(Math.floor(operand.value));
|
|
4574
4582
|
case "float":
|
|
4575
4583
|
return new FloatValue(operand.value);
|
|
4584
|
+
case "string":
|
|
4585
|
+
return new StringValue(operand.toString());
|
|
4576
4586
|
default:
|
|
4577
4587
|
throw new Error(`Unknown NumericValue filter: ${filter.value}`);
|
|
4578
4588
|
}
|
|
@@ -6000,9 +6010,216 @@ function toAbsoluteURL(url2) {
|
|
|
6000
6010
|
return new URL(url2, baseURL).href;
|
|
6001
6011
|
}
|
|
6002
6012
|
|
|
6013
|
+
// src/utils/cache/CrossOriginStorageCache.js
|
|
6014
|
+
// Digest algorithm used both for hashing blobs locally and in the descriptors
// handed to `navigator.crossOriginStorage`.
var HASH_ALGORITHM = "SHA-256";
// Name of the Cache Storage bucket that maps resource URLs to their content hashes.
var HASH_CACHE_NAME = "experimental_transformers-hash-cache";
// Builds the `{ algorithm, value }` descriptor expected by `requestFileHandles`.
var makeHashDescriptor = (value) => ({ algorithm: HASH_ALGORITHM, value });
/**
 * Cache backend built on the experimental `navigator.crossOriginStorage` API.
 *
 * Files are addressed by their SHA-256 content hash. Because callers address
 * resources by URL, a secondary Cache Storage bucket (`HASH_CACHE_NAME`) keeps a
 * URL -> hash mapping. Exposes `match`, `put` and `delete`.
 */
var CrossOriginStorage = class {
  /** @type {Promise<Cache> | null} */
  #hashCache = null;
  /**
   * Returns (and lazily opens) the hash cache, reusing the same promise across concurrent callers.
   * If opening fails, the memoized promise is dropped so that a later call can retry
   * instead of receiving the same rejection forever.
   * @returns {Promise<Cache>}
   */
  _getHashCache = () => {
    if (this.#hashCache === null) {
      const opening = caches.open(HASH_CACHE_NAME);
      // Side-channel handler only: callers still observe the rejection on `opening` itself.
      opening.catch(() => {
        if (this.#hashCache === opening) {
          this.#hashCache = null;
        }
      });
      this.#hashCache = opening;
    }
    return this.#hashCache;
  };
  /**
   * Returns whether the `navigator.crossOriginStorage` API is available in the current environment.
   * @returns {boolean}
   */
  static isAvailable = () => typeof navigator !== "undefined" && "crossOriginStorage" in navigator;
  /**
   * Looks up a cached response for the given URL by resolving its SHA-256 hash and requesting
   * the corresponding file handle from cross-origin storage.
   *
   * Implements `CacheInterface.match`.
   *
   * @param {string} request The URL of the resource to look up.
   * @returns {Promise<Response|undefined>} The cached `Response`, or `undefined` if not found.
   */
  match = async (request) => {
    const hashValue = await this._getFileHash(request);
    if (!hashValue) {
      return void 0;
    }
    try {
      const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashValue)]);
      const blob = await handle.getFile();
      return new Response(blob, {
        headers: {
          "Content-Length": String(blob.size)
        }
      });
    } catch {
      // Any failure to obtain or read the handle is reported as a cache miss.
      return void 0;
    }
  };
  /**
   * Stores a response in cross-origin storage, keyed by its SHA-256 hash.
   *
   * When `_getFileHash` can resolve a hash for `request` (hash cache hit, or Git LFS pointer
   * lookup), the response body is read once as a blob and written under that hash.
   *
   * Otherwise the hash is unknown upfront, and the body stream is handed to
   * `_processAndStore`, which is intentionally not awaited: hashing and storing then happen
   * in the background and never surface errors to the caller.
   *
   * Implements `CacheInterface.put`.
   *
   * @param {string} request The URL of the resource (used as the hash-cache key).
   * @param {Response} response The response whose body will be written to the cache.
   * @returns {Promise<void>}
   */
  put = async (request, response) => {
    const hashValue = await this._getFileHash(request);
    if (hashValue) {
      const blob = await response.blob();
      await this._storeBlobInCOS(blob, hashValue);
    } else {
      // Fire-and-forget by design; `_processAndStore` swallows its own errors.
      this._processAndStore(request, response.body);
    }
  };
  /**
   * Writes a blob into cross-origin storage using the given pre-computed hex hash string.
   * If the write fails, the writable stream is aborted before the error is rethrown so
   * that it is not left open.
   *
   * @param {Blob} blob
   * @param {string} hashHex Hex-encoded SHA-256 hash of `blob`.
   * @returns {Promise<void>}
   */
  _storeBlobInCOS = async (blob, hashHex) => {
    const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashHex)], {
      create: true
    });
    const writableStream = await handle.createWritable();
    try {
      await writableStream.write(blob);
    } catch (err) {
      try {
        await writableStream.abort();
      } catch {
        // Cleanup is best-effort; the original write error below is what matters.
      }
      throw err;
    }
    await writableStream.close();
  };
  /**
   * Background task for resources whose hash is not known upfront: consumes `stream`,
   * computes the SHA-256 hash of the resulting blob, stores it in cross-origin storage,
   * and persists the computed hash to `HASH_CACHE_NAME` keyed by `request` so future
   * `match` calls can resolve the file.
   *
   * Called fire-and-forget from `put` — errors are swallowed so failures never surface to
   * the caller.
   *
   * @param {string} request The original resource URL.
   * @param {ReadableStream} stream The response body stream to consume.
   * @returns {Promise<void>}
   */
  _processAndStore = async (request, stream) => {
    try {
      const chunks = [];
      for await (const chunk2 of stream) {
        chunks.push(chunk2);
      }
      const blob = new Blob(chunks);
      const hashHex = await this._getBlobHash(blob);
      await this._storeBlobInCOS(blob, hashHex);
      try {
        const hashCache = await this._getHashCache();
        await hashCache.put(request, new Response(hashHex));
      } catch {
        // Deliberately ignored: the blob is stored, only the URL -> hash mapping is missing.
      }
    } catch {
      // Deliberately ignored: caching is an optimisation and must never break the caller.
    }
  };
  /**
   * Deletes the cache entry for the given request.
   *
   * Only the URL -> hash mapping in `HASH_CACHE_NAME` is removed; this class never deletes
   * from cross-origin storage itself. After deletion, `match` can only resolve the URL
   * again if `_getLfsFileHash` is able to recover its hash.
   *
   * Implements `CacheInterface.delete`.
   *
   * @param {string} request
   * @returns {Promise<boolean>} Resolves to `true` if the hash entry was deleted, `false` otherwise.
   */
  delete = async (request) => {
    try {
      const hashCache = await this._getHashCache();
      return await hashCache.delete(request);
    } catch {
      return false;
    }
  };
  /**
   * Resolves the SHA-256 hash for a given URL.
   *
   * Returns the cached hash immediately if one has been persisted to `HASH_CACHE_NAME`.
   * Otherwise falls back to `_getLfsFileHash` and persists the result to `HASH_CACHE_NAME`
   * for future lookups. Persisting is best-effort: if the write fails, the resolved hash is
   * still returned.
   *
   * Returns `null` if the hash cannot be determined, or if the hash cache cannot be opened
   * or queried.
   *
   * @param {string} url The resource URL to resolve a hash for.
   * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
   */
  _getFileHash = async (url2) => {
    try {
      const hashCache = await this._getHashCache();
      const cached = await hashCache.match(url2);
      if (cached) {
        return await cached.text();
      }
      const hash = await this._getLfsFileHash(url2);
      if (!hash) {
        return null;
      }
      try {
        await hashCache.put(url2, new Response(hash));
      } catch {
        // A failed write to the hash cache must not discard a hash we already have.
      }
      return hash;
    } catch {
      return null;
    }
  };
  /**
   * Attempts to retrieve the SHA-256 hash for a resource URL from its raw Git LFS pointer file.
   *
   * Only applicable to URLs containing `/resolve/`. The first `/resolve/` segment is
   * rewritten to `/raw/` and the response text is searched for an `oid sha256:<hex>` line.
   * Returns `null` when the URL has no `/resolve/` segment, the request fails or is not
   * successful (non-2xx), or no such line is present.
   *
   * @see https://huggingface.co/docs/hub/en/storage-backends#xet
   * @param {string} url The resolved Hugging Face URL of the resource.
   * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
   */
  _getLfsFileHash = async (url2) => {
    if (!url2.includes("/resolve/")) {
      return null;
    }
    const rawUrl = url2.replace("/resolve/", "/raw/");
    try {
      const response = await fetch(rawUrl);
      if (!response.ok) {
        // Never interpret an error page as a pointer file.
        return null;
      }
      const text = await response.text();
      const match = text.match(/^oid sha256:([0-9a-f]+)$/m);
      return match ? match[1] : null;
    } catch {
      return null;
    }
  };
  /**
   * Computes the SHA-256 hash of a `Blob`'s contents.
   *
   * @param {Blob} blob The blob to hash.
   * @returns {Promise<string>} The lowercase hex-encoded SHA-256 hash.
   */
  _getBlobHash = async (blob) => {
    const arrayBuffer = await blob.arrayBuffer();
    const hashBuffer = await crypto.subtle.digest(HASH_ALGORITHM, arrayBuffer);
    const hashArray = Array.from(new Uint8Array(hashBuffer));
    return hashArray.map((byte) => byte.toString(16).padStart(2, "0")).join("");
  };
};
|
|
6219
|
+
|
|
6003
6220
|
// src/utils/cache.js
|
|
6004
6221
|
async function getCache(file_cache_dir = null) {
|
|
6005
|
-
let
|
|
6222
|
+
let cache2 = null;
|
|
6006
6223
|
if (env.useCustomCache) {
|
|
6007
6224
|
if (!env.customCache) {
|
|
6008
6225
|
throw Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
|
|
@@ -6012,30 +6229,33 @@ async function getCache(file_cache_dir = null) {
|
|
|
6012
6229
|
"`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
|
|
6013
6230
|
);
|
|
6014
6231
|
}
|
|
6015
|
-
|
|
6232
|
+
cache2 = env.customCache;
|
|
6233
|
+
}
|
|
6234
|
+
if (!cache2 && env.experimental_useCrossOriginStorage && CrossOriginStorage.isAvailable()) {
|
|
6235
|
+
cache2 = new CrossOriginStorage();
|
|
6016
6236
|
}
|
|
6017
|
-
if (!
|
|
6237
|
+
if (!cache2 && env.useBrowserCache) {
|
|
6018
6238
|
if (typeof caches === "undefined") {
|
|
6019
6239
|
throw Error("Browser cache is not available in this environment.");
|
|
6020
6240
|
}
|
|
6021
6241
|
try {
|
|
6022
|
-
|
|
6242
|
+
cache2 = await caches.open(env.cacheKey);
|
|
6023
6243
|
} catch (e) {
|
|
6024
6244
|
logger.warn("An error occurred while opening the browser cache:", e);
|
|
6025
6245
|
}
|
|
6026
6246
|
}
|
|
6027
|
-
if (!
|
|
6247
|
+
if (!cache2 && env.useFSCache) {
|
|
6028
6248
|
if (!apis.IS_FS_AVAILABLE) {
|
|
6029
6249
|
throw Error("File System Cache is not available in this environment.");
|
|
6030
6250
|
}
|
|
6031
|
-
|
|
6251
|
+
cache2 = new FileCache(file_cache_dir ?? env.cacheDir);
|
|
6032
6252
|
}
|
|
6033
|
-
return
|
|
6253
|
+
return cache2;
|
|
6034
6254
|
}
|
|
6035
|
-
async function tryCache(cache, ...names) {
|
|
6255
|
+
async function tryCache(cache2, ...names) {
|
|
6036
6256
|
for (let name of names) {
|
|
6037
6257
|
try {
|
|
6038
|
-
let result = await
|
|
6258
|
+
let result = await cache2.match(name);
|
|
6039
6259
|
if (result) return result;
|
|
6040
6260
|
} catch (e) {
|
|
6041
6261
|
continue;
|
|
@@ -6044,6 +6264,83 @@ async function tryCache(cache, ...names) {
|
|
|
6044
6264
|
return void 0;
|
|
6045
6265
|
}
|
|
6046
6266
|
|
|
6267
|
+
// src/utils/lru_cache.js
|
|
6268
|
+
/**
 * Least-recently-used cache with a fixed capacity.
 *
 * Recency is tracked through the insertion order of a `Map`: the first key is
 * always the least recently used one, the last key the most recently used.
 */
var LRUCache2 = class {
  /** @type {number} Maximum number of entries kept before eviction. */
  #capacity;
  /** @type {Map<any, any>} Entries ordered from least to most recently used. */
  #cache;
  /**
   * @param {number} capacity The maximum number of items the cache can hold.
   */
  constructor(capacity) {
    this.#cache = /* @__PURE__ */ new Map();
    this.#capacity = capacity;
  }
  /**
   * Looks up `key` and, on a hit, promotes it to most recently used.
   * @param {any} key The key to retrieve.
   * @returns {any} The stored value, or `undefined` when the key is absent.
   */
  get(key) {
    const entries = this.#cache;
    if (entries.has(key)) {
      const hit = entries.get(key);
      // Re-insert so the key moves to the end of the iteration order.
      entries.delete(key);
      entries.set(key, hit);
      return hit;
    }
    return void 0;
  }
  /**
   * Stores `value` under `key` as the most recently used entry, evicting the
   * least recently used entry if the capacity is exceeded.
   * @param {any} key The key to add or update.
   * @param {any} value The value to associate with the key.
   */
  put(key, value) {
    const entries = this.#cache;
    // Removing first guarantees the key ends up last, whether or not it existed.
    entries.delete(key);
    entries.set(key, value);
    if (entries.size > this.#capacity) {
      const [oldest] = entries.keys();
      entries.delete(oldest);
    }
  }
  /**
   * Removes `key` from the cache.
   * @param {any} key The key to delete.
   * @returns {boolean} `true` if an entry was removed, `false` if the key was absent.
   */
  delete(key) {
    return this.#cache.delete(key);
  }
  /**
   * Removes every entry.
   */
  clear() {
    this.#cache.clear();
  }
};
|
|
6324
|
+
|
|
6325
|
+
// src/utils/memoize_promise.js
|
|
6326
|
+
// Upper bound on the number of promises remembered at once.
var MAX_CACHE_SIZE = 100;
// Module-wide store of in-flight and fulfilled promises, keyed by caller-supplied key.
var cache = new LRUCache2(MAX_CACHE_SIZE);
/**
 * Returns the promise previously stored under `key`, or invokes `factory` to
 * create one and stores it.
 *
 * A promise that rejects is removed from the store, so a later call with the
 * same key invokes `factory` again; the rejection itself is still propagated
 * to whoever holds the returned promise.
 *
 * @param {any} key Identifier under which the promise is stored.
 * @param {() => Promise<any>} factory Called (synchronously) on a miss; its result must expose `.then`.
 * @returns {Promise<any>} The stored or newly created promise.
 */
function memoizePromise(key, factory) {
  const pending = cache.get(key);
  if (pending === void 0) {
    const passThrough = (result) => result;
    const forgetAndRethrow = (err) => {
      cache.delete(key);
      throw err;
    };
    const fresh = factory().then(passThrough, forgetAndRethrow);
    cache.put(key, fresh);
    return fresh;
  }
  return pending;
}
|
|
6343
|
+
|
|
6047
6344
|
// src/utils/model_registry/get_file_metadata.js
|
|
6048
6345
|
async function fetch_file_head(urlOrPath) {
|
|
6049
6346
|
if (!isValidUrl(urlOrPath, ["http:", "https:"])) {
|
|
@@ -6051,17 +6348,27 @@ async function fetch_file_head(urlOrPath) {
|
|
|
6051
6348
|
}
|
|
6052
6349
|
const headers = getFetchHeaders(urlOrPath);
|
|
6053
6350
|
headers.set("Range", "bytes=0-0");
|
|
6054
|
-
return env.fetch(urlOrPath, { method: "GET", headers });
|
|
6351
|
+
return env.fetch(urlOrPath, { method: "GET", headers, cache: "no-store" });
|
|
6352
|
+
}
|
|
6353
|
+
/**
 * Memoized entry point for file metadata lookups.
 *
 * The memoization key is the JSON encoding of
 * `[path_or_repo_id, filename, revision, cache_dir, local_files_only]`; no other
 * field of `options` takes part in the key. The actual lookup is delegated to
 * `_get_file_metadata`, which receives `options` unchanged.
 *
 * @param {string} path_or_repo_id First component of the lookup, forwarded as-is.
 * @param {string} filename Second component of the lookup, forwarded as-is.
 * @param {Object} [options] Lookup options; `revision`, `cache_dir` and `local_files_only` are read here.
 * @returns {Promise<any>} Whatever `memoizePromise` returns for the computed key.
 */
function get_file_metadata(path_or_repo_id, filename, options = {}) {
  const { revision, cache_dir, local_files_only } = options ?? {};
  const memoKey = JSON.stringify([path_or_repo_id, filename, revision, cache_dir, local_files_only]);
  const load = () => _get_file_metadata(path_or_repo_id, filename, options);
  return memoizePromise(memoKey, load);
}
|
|
6056
|
-
async function
|
|
6057
|
-
const
|
|
6363
|
+
async function _get_file_metadata(path_or_repo_id, filename, options) {
|
|
6364
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6058
6365
|
const { localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6059
6366
|
path_or_repo_id,
|
|
6060
6367
|
filename,
|
|
6061
6368
|
options,
|
|
6062
|
-
|
|
6369
|
+
cache2
|
|
6063
6370
|
);
|
|
6064
|
-
const cachedResponse = await checkCachedResource(
|
|
6371
|
+
const cachedResponse = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6065
6372
|
if (cachedResponse !== void 0 && typeof cachedResponse !== "string") {
|
|
6066
6373
|
const size = cachedResponse.headers.get("content-length");
|
|
6067
6374
|
const contentType = cachedResponse.headers.get("content-type");
|
|
@@ -6159,7 +6466,7 @@ function getFetchHeaders(urlOrPath) {
|
|
|
6159
6466
|
}
|
|
6160
6467
|
return headers;
|
|
6161
6468
|
}
|
|
6162
|
-
function buildResourcePaths(path_or_repo_id, filename, options = {},
|
|
6469
|
+
function buildResourcePaths(path_or_repo_id, filename, options = {}, cache2 = null) {
|
|
6163
6470
|
const revision = options.revision ?? "main";
|
|
6164
6471
|
const requestURL = pathJoin(path_or_repo_id, filename);
|
|
6165
6472
|
const validModelId = isValidHfModelId(path_or_repo_id);
|
|
@@ -6169,7 +6476,7 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
6169
6476
|
env.remotePathTemplate.replaceAll("{model}", path_or_repo_id).replaceAll("{revision}", encodeURIComponent(revision)),
|
|
6170
6477
|
filename
|
|
6171
6478
|
);
|
|
6172
|
-
const proposedCacheKey =
|
|
6479
|
+
const proposedCacheKey = cache2 instanceof FileCache ? (
|
|
6173
6480
|
// Choose cache key for filesystem cache
|
|
6174
6481
|
// When using the main revision (default), we use the request URL as the cache key.
|
|
6175
6482
|
// If a specific revision is requested, we account for this in the cache key.
|
|
@@ -6183,14 +6490,14 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
6183
6490
|
validModelId
|
|
6184
6491
|
};
|
|
6185
6492
|
}
|
|
6186
|
-
async function checkCachedResource(
|
|
6187
|
-
if (!
|
|
6493
|
+
async function checkCachedResource(cache2, localPath, proposedCacheKey) {
|
|
6494
|
+
if (!cache2) {
|
|
6188
6495
|
return void 0;
|
|
6189
6496
|
}
|
|
6190
|
-
return await tryCache(
|
|
6497
|
+
return await tryCache(cache2, localPath, proposedCacheKey);
|
|
6191
6498
|
}
|
|
6192
|
-
async function storeCachedResource(path_or_repo_id, filename,
|
|
6193
|
-
if (await
|
|
6499
|
+
async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options = {}) {
|
|
6500
|
+
if (await cache2.match(cacheKey) !== void 0) {
|
|
6194
6501
|
return;
|
|
6195
6502
|
}
|
|
6196
6503
|
if (!result) {
|
|
@@ -6200,20 +6507,22 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
6200
6507
|
file: filename,
|
|
6201
6508
|
...data
|
|
6202
6509
|
}) : void 0;
|
|
6203
|
-
await
|
|
6510
|
+
await cache2.put(
|
|
6204
6511
|
cacheKey,
|
|
6205
6512
|
/** @type {Response} */
|
|
6206
6513
|
response,
|
|
6207
6514
|
wrapped_progress
|
|
6208
6515
|
);
|
|
6209
6516
|
} else if (typeof response !== "string") {
|
|
6210
|
-
|
|
6517
|
+
const headers = new Headers(response.headers);
|
|
6518
|
+
headers.set("content-length", result.byteLength.toString());
|
|
6519
|
+
await cache2.put(
|
|
6211
6520
|
cacheKey,
|
|
6212
6521
|
new Response(
|
|
6213
6522
|
/** @type {any} */
|
|
6214
6523
|
result,
|
|
6215
6524
|
{
|
|
6216
|
-
headers
|
|
6525
|
+
headers
|
|
6217
6526
|
}
|
|
6218
6527
|
)
|
|
6219
6528
|
).catch((err) => {
|
|
@@ -6221,17 +6530,17 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
6221
6530
|
});
|
|
6222
6531
|
}
|
|
6223
6532
|
}
|
|
6224
|
-
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false,
|
|
6533
|
+
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache2 = null) {
|
|
6225
6534
|
const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6226
6535
|
path_or_repo_id,
|
|
6227
6536
|
filename,
|
|
6228
6537
|
options,
|
|
6229
|
-
|
|
6538
|
+
cache2
|
|
6230
6539
|
);
|
|
6231
6540
|
let cacheKey;
|
|
6232
6541
|
let toCacheResponse = false;
|
|
6233
6542
|
let response;
|
|
6234
|
-
response = await checkCachedResource(
|
|
6543
|
+
response = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6235
6544
|
const cacheHit = response !== void 0;
|
|
6236
6545
|
if (!cacheHit) {
|
|
6237
6546
|
if (env.allowLocalModels) {
|
|
@@ -6272,7 +6581,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6272
6581
|
}
|
|
6273
6582
|
cacheKey = proposedCacheKey;
|
|
6274
6583
|
}
|
|
6275
|
-
toCacheResponse =
|
|
6584
|
+
toCacheResponse = cache2 && // 1. A caching system is available
|
|
6276
6585
|
typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment)
|
|
6277
6586
|
response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`)
|
|
6278
6587
|
response.status === 200;
|
|
@@ -6334,7 +6643,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6334
6643
|
// i.e., do not cache FileResponses (prevents duplication)
|
|
6335
6644
|
toCacheResponse && cacheKey && typeof response !== "string"
|
|
6336
6645
|
) {
|
|
6337
|
-
await storeCachedResource(path_or_repo_id, filename,
|
|
6646
|
+
await storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options);
|
|
6338
6647
|
}
|
|
6339
6648
|
dispatchCallback(options.progress_callback, {
|
|
6340
6649
|
status: "done",
|
|
@@ -6350,7 +6659,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6350
6659
|
if (response instanceof FileResponse) {
|
|
6351
6660
|
return response.filePath;
|
|
6352
6661
|
}
|
|
6353
|
-
const cachedResponse = await
|
|
6662
|
+
const cachedResponse = await cache2?.match(cacheKey);
|
|
6354
6663
|
if (cachedResponse instanceof FileResponse) {
|
|
6355
6664
|
return cachedResponse.filePath;
|
|
6356
6665
|
} else if (cachedResponse instanceof Response) {
|
|
@@ -6377,8 +6686,8 @@ async function getModelFile(path_or_repo_id, filename, fatal = true, options = {
|
|
|
6377
6686
|
name: path_or_repo_id,
|
|
6378
6687
|
file: filename
|
|
6379
6688
|
});
|
|
6380
|
-
const
|
|
6381
|
-
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path,
|
|
6689
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6690
|
+
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache2);
|
|
6382
6691
|
}
|
|
6383
6692
|
async function getModelText(modelPath, fileName, fatal = true, options = {}) {
|
|
6384
6693
|
const buffer = await getModelFile(modelPath, fileName, fatal, options, false);
|
|
@@ -7171,7 +7480,7 @@ var uint16_to_float32 = /* @__PURE__ */ (function() {
|
|
|
7171
7480
|
// src/backends/onnx.js
|
|
7172
7481
|
import * as ONNX_NODE from "onnxruntime-node";
|
|
7173
7482
|
|
|
7174
|
-
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.
|
|
7483
|
+
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260307-d626b568e0/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
|
|
7175
7484
|
var ort_webgpu_bundle_min_exports = {};
|
|
7176
7485
|
__export(ort_webgpu_bundle_min_exports, {
|
|
7177
7486
|
InferenceSession: () => Jf,
|
|
@@ -7939,7 +8248,7 @@ async function ts(a = {}) {
|
|
|
7939
8248
|
throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
|
|
7940
8249
|
}
|
|
7941
8250
|
function Ye() {
|
|
7942
|
-
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii,
|
|
8251
|
+
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
|
|
7943
8252
|
}
|
|
7944
8253
|
async function bt() {
|
|
7945
8254
|
function e(o, u) {
|
|
@@ -9126,7 +9435,7 @@ async function ts(a = {}) {
|
|
|
9126
9435
|
Te(`invalid type for getValue: ${t}`);
|
|
9127
9436
|
}
|
|
9128
9437
|
}, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
|
|
9129
|
-
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = {
|
|
9438
|
+
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 925676: (e, t, n, o, u) => {
|
|
9130
9439
|
if (r === void 0 || !r.Uc) return 1;
|
|
9131
9440
|
if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
|
|
9132
9441
|
if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
|
|
@@ -9146,11 +9455,11 @@ async function ts(a = {}) {
|
|
|
9146
9455
|
} catch {
|
|
9147
9456
|
return 4;
|
|
9148
9457
|
}
|
|
9149
|
-
},
|
|
9458
|
+
}, 926500: (e, t, n) => {
|
|
9150
9459
|
r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
|
|
9151
|
-
},
|
|
9460
|
+
}, 926564: () => r.me(), 926606: (e) => {
|
|
9152
9461
|
r.jd(e);
|
|
9153
|
-
},
|
|
9462
|
+
}, 926643: () => typeof wasmOffsetConverter < "u" };
|
|
9154
9463
|
function af(e, t, n, o) {
|
|
9155
9464
|
var u = P();
|
|
9156
9465
|
try {
|
|
@@ -11066,7 +11375,7 @@ var $s = k(() => {
|
|
|
11066
11375
|
Ve();
|
|
11067
11376
|
Ve();
|
|
11068
11377
|
Ve();
|
|
11069
|
-
var Xa = "1.25.0-dev.
|
|
11378
|
+
var Xa = "1.25.0-dev.20260307-d626b568e0";
|
|
11070
11379
|
var Tl = Zr;
|
|
11071
11380
|
{
|
|
11072
11381
|
let a = ($s(), $t(Gs)).wasmBackend;
|
|
@@ -11077,11 +11386,11 @@ Object.defineProperty(K.versions, "web", { value: Xa, enumerable: true });
|
|
|
11077
11386
|
// src/backends/utils/cacheWasm.js
|
|
11078
11387
|
async function loadAndCacheFile(url2) {
|
|
11079
11388
|
const fileName = url2.split("/").pop();
|
|
11080
|
-
let
|
|
11389
|
+
let cache2;
|
|
11081
11390
|
try {
|
|
11082
|
-
|
|
11083
|
-
if (
|
|
11084
|
-
const result = await
|
|
11391
|
+
cache2 = await getCache();
|
|
11392
|
+
if (cache2) {
|
|
11393
|
+
const result = await cache2.match(url2);
|
|
11085
11394
|
if (result) {
|
|
11086
11395
|
return result;
|
|
11087
11396
|
}
|
|
@@ -11093,9 +11402,9 @@ async function loadAndCacheFile(url2) {
|
|
|
11093
11402
|
if (!response.ok) {
|
|
11094
11403
|
throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
|
|
11095
11404
|
}
|
|
11096
|
-
if (
|
|
11405
|
+
if (cache2) {
|
|
11097
11406
|
try {
|
|
11098
|
-
await
|
|
11407
|
+
await cache2.put(url2, response.clone());
|
|
11099
11408
|
} catch (e) {
|
|
11100
11409
|
logger.warn(`Failed to cache ${fileName}:`, e);
|
|
11101
11410
|
}
|
|
@@ -12947,9 +13256,23 @@ var Tensor2 = class _Tensor {
|
|
|
12947
13256
|
throw Error(`Unsupported norm: ${p}`);
|
|
12948
13257
|
}
|
|
12949
13258
|
const this_data = this.data;
|
|
12950
|
-
const
|
|
13259
|
+
const is_bigint = this_data instanceof BigInt64Array || this_data instanceof BigUint64Array;
|
|
13260
|
+
if (is_bigint && p !== 1) {
|
|
13261
|
+
throw Error(`Expected a floating point tensor as input. Got ${this.type}`);
|
|
13262
|
+
}
|
|
13263
|
+
let fn2, zero;
|
|
13264
|
+
if (is_bigint) {
|
|
13265
|
+
fn2 = (a, b) => a + b;
|
|
13266
|
+
zero = 0n;
|
|
13267
|
+
} else {
|
|
13268
|
+
fn2 = (a, b) => a + b ** p;
|
|
13269
|
+
zero = 0;
|
|
13270
|
+
}
|
|
12951
13271
|
if (dim === null) {
|
|
12952
|
-
|
|
13272
|
+
let val = this_data.reduce(fn2, zero);
|
|
13273
|
+
if (p !== 1) {
|
|
13274
|
+
val = val ** (1 / p);
|
|
13275
|
+
}
|
|
12953
13276
|
return new _Tensor(this.type, [val], []);
|
|
12954
13277
|
}
|
|
12955
13278
|
const [type, result, resultDims] = reduce_helper(fn2, this, dim, keepdim);
|
|
@@ -15409,9 +15732,12 @@ __export(processors_exports, {
|
|
|
15409
15732
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
15410
15733
|
Florence2Processor: () => Florence2Processor,
|
|
15411
15734
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
15735
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
15736
|
+
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
15412
15737
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
15413
15738
|
Idefics3Processor: () => Idefics3Processor,
|
|
15414
15739
|
JinaCLIPProcessor: () => JinaCLIPProcessor,
|
|
15740
|
+
Lfm2VlProcessor: () => Lfm2VlProcessor,
|
|
15415
15741
|
LlavaProcessor: () => LlavaProcessor,
|
|
15416
15742
|
MgpstrProcessor: () => MgpstrProcessor,
|
|
15417
15743
|
MoonshineProcessor: () => MoonshineProcessor,
|
|
@@ -15432,6 +15758,7 @@ __export(processors_exports, {
|
|
|
15432
15758
|
UltravoxProcessor: () => UltravoxProcessor,
|
|
15433
15759
|
VLChatProcessor: () => VLChatProcessor,
|
|
15434
15760
|
VoxtralProcessor: () => VoxtralProcessor,
|
|
15761
|
+
VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
|
|
15435
15762
|
Wav2Vec2Processor: () => Wav2Vec2Processor,
|
|
15436
15763
|
Wav2Vec2ProcessorWithLM: () => Wav2Vec2ProcessorWithLM,
|
|
15437
15764
|
WhisperProcessor: () => WhisperProcessor
|
|
@@ -15486,12 +15813,14 @@ __export(feature_extractors_exports, {
|
|
|
15486
15813
|
EncodecFeatureExtractor: () => EncodecFeatureExtractor,
|
|
15487
15814
|
FeatureExtractor: () => FeatureExtractor,
|
|
15488
15815
|
Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
|
|
15816
|
+
GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
|
|
15489
15817
|
MoonshineFeatureExtractor: () => MoonshineFeatureExtractor,
|
|
15490
15818
|
ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
|
|
15491
15819
|
PyAnnoteFeatureExtractor: () => PyAnnoteFeatureExtractor,
|
|
15492
15820
|
SeamlessM4TFeatureExtractor: () => SeamlessM4TFeatureExtractor,
|
|
15493
15821
|
SnacFeatureExtractor: () => SnacFeatureExtractor,
|
|
15494
15822
|
SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
|
|
15823
|
+
VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
|
|
15495
15824
|
Wav2Vec2FeatureExtractor: () => Wav2Vec2FeatureExtractor,
|
|
15496
15825
|
WeSpeakerFeatureExtractor: () => WeSpeakerFeatureExtractor,
|
|
15497
15826
|
WhisperFeatureExtractor: () => WhisperFeatureExtractor
|
|
@@ -15719,6 +16048,7 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
15719
16048
|
mel_filters = null,
|
|
15720
16049
|
mel_floor = 1e-10,
|
|
15721
16050
|
log_mel = null,
|
|
16051
|
+
max_log_mel = null,
|
|
15722
16052
|
reference = 1,
|
|
15723
16053
|
min_value = 1e-10,
|
|
15724
16054
|
db_range = null,
|
|
@@ -15858,6 +16188,17 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
15858
16188
|
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
15859
16189
|
}
|
|
15860
16190
|
break;
|
|
16191
|
+
case "log10_max_norm": {
|
|
16192
|
+
for (let i = 0; i < o; ++i) {
|
|
16193
|
+
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
16194
|
+
}
|
|
16195
|
+
const logMax = max_log_mel ?? max(mel_spec_data)[0];
|
|
16196
|
+
const threshold = logMax - 8;
|
|
16197
|
+
for (let i = 0; i < o; ++i) {
|
|
16198
|
+
mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4) / 4;
|
|
16199
|
+
}
|
|
16200
|
+
break;
|
|
16201
|
+
}
|
|
15861
16202
|
case "dB":
|
|
15862
16203
|
if (power === 1) {
|
|
15863
16204
|
amplitude_to_db(mel_spec_data, reference, min_value, db_range);
|
|
@@ -15868,7 +16209,9 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
15868
16209
|
}
|
|
15869
16210
|
break;
|
|
15870
16211
|
default:
|
|
15871
|
-
throw new Error(
|
|
16212
|
+
throw new Error(
|
|
16213
|
+
`log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`
|
|
16214
|
+
);
|
|
15872
16215
|
}
|
|
15873
16216
|
}
|
|
15874
16217
|
return mel_spec;
|
|
@@ -16373,6 +16716,56 @@ var Gemma3nAudioFeatureExtractor = class extends FeatureExtractor {
|
|
|
16373
16716
|
}
|
|
16374
16717
|
};
|
|
16375
16718
|
|
|
16719
|
+
// src/models/granite_speech/feature_extraction_granite_speech.js
|
|
16720
|
+
var GraniteSpeechFeatureExtractor = class extends FeatureExtractor {
  /**
   * Builds the mel filter bank and the analysis window from `config.melspec_kwargs`
   * (`n_fft`, `win_length`, `n_mels`, `sample_rate`).
   * @param {Object} config Feature extractor configuration.
   */
  constructor(config) {
    super(config);
    const { n_fft, win_length, n_mels, sample_rate } = config.melspec_kwargs;

    const num_frequency_bins = Math.floor(1 + n_fft / 2);
    const max_frequency = sample_rate / 2;
    this.mel_filters = mel_filter_bank(
      num_frequency_bins,
      n_mels,
      0, // min_frequency
      max_frequency,
      sample_rate,
      null, // norm (torchaudio default: no norm)
      "htk" // mel_scale (torchaudio default)
    );

    // The Hann window has win_length samples; it is copied into a zero-filled buffer of
    // n_fft samples starting at floor((n_fft - win_length) / 2).
    const hann = window_function(win_length, "hann");
    const padded_window = new Float64Array(n_fft);
    const offset = Math.floor((n_fft - win_length) / 2);
    padded_window.set(hann, offset);
    this.window = padded_window;
  }

  /**
   * Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
   * The frame count is capped to an even number so the spectrogram can be viewed as rows of
   * `2 * n_mels` values before a leading dimension is added.
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @returns {Promise<{input_features: Tensor}>}
   */
  async _call(audio) {
    validate_audio_inputs(audio, "GraniteSpeechFeatureExtractor");

    const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;
    const frame_count = 1 + Math.floor((audio.length - 1) / hop_length);
    const even_frame_count = frame_count - (frame_count % 2);

    const spectrogram_options = {
      power: 2,
      mel_filters: this.mel_filters,
      log_mel: "log10_max_norm",
      transpose: true, // [time, n_mels]
      max_num_frames: even_frame_count,
      do_pad: false
    };
    const mel = await spectrogram(audio, this.window, n_fft, hop_length, spectrogram_options);

    const stacked = mel.view(-1, 2 * n_mels);
    return { input_features: stacked.unsqueeze_(0) };
  }
};
|
|
16768
|
+
|
|
16376
16769
|
// src/models/moonshine/feature_extraction_moonshine.js
|
|
16377
16770
|
var MoonshineFeatureExtractor = class extends FeatureExtractor {
|
|
16378
16771
|
/**
|
|
@@ -16853,6 +17246,71 @@ var WeSpeakerFeatureExtractor = class extends FeatureExtractor {
|
|
|
16853
17246
|
}
|
|
16854
17247
|
};
|
|
16855
17248
|
|
|
17249
|
+
// src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js
|
|
17250
|
+
var VoxtralRealtimeFeatureExtractor = class extends FeatureExtractor {
  /**
   * Fills in `config.mel_filters` when the configuration does not already provide it,
   * and prepares a Hann window of `n_fft` samples.
   * @param {Object} config Feature extractor configuration.
   */
  constructor(config) {
    super(config);
    if (this.config.mel_filters === null || this.config.mel_filters === void 0) {
      const num_frequency_bins = Math.floor(1 + this.config.n_fft / 2);
      const num_mel_filters = this.config.feature_size;
      this.config.mel_filters = mel_filter_bank(
        num_frequency_bins,
        num_mel_filters,
        0, // min_frequency
        8e3, // max_frequency
        this.config.sampling_rate,
        "slaney", // norm
        "slaney" // mel_scale
      );
    }
    this.window = window_function(this.config.n_fft, "hann");
  }

  /**
   * Computes the log-Mel spectrogram of the provided audio waveform.
   * @param {Float32Array|Float64Array} waveform The audio waveform to process.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform for STFT.
   * @returns {Promise<import('../../utils/tensor.js').Tensor>} The log-Mel spectrogram tensor of shape [num_mel_bins, num_frames].
   */
  async _extract_fbank_features(waveform, { center = true } = {}) {
    const { n_fft, hop_length, mel_filters, global_log_mel_max } = this.config;

    // Without centering, a full n_fft window must fit before each counted frame.
    let max_num_frames;
    if (center) {
      max_num_frames = Math.floor(waveform.length / hop_length);
    } else {
      max_num_frames = Math.floor((waveform.length - n_fft) / hop_length);
    }

    const spectrogram_options = {
      power: 2,
      mel_filters,
      log_mel: "log10_max_norm",
      max_log_mel: global_log_mel_max,
      center,
      max_num_frames,
      do_pad: false
    };
    return await spectrogram(waveform, this.window, n_fft, hop_length, spectrogram_options);
  }

  /**
   * Extract mel spectrogram features from audio and add a leading dimension.
   * @param {Float32Array|Float64Array} audio The audio data.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform.
   * @returns {Promise<{ input_features: import('../../utils/tensor.js').Tensor }>}
   */
  async _call(audio, { center = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeFeatureExtractor");
    const fbank = await this._extract_fbank_features(audio, { center });
    const input_features = fbank.unsqueeze_(0);
    return { input_features };
  }
};
|
|
17313
|
+
|
|
16856
17314
|
// src/models/whisper/feature_extraction_whisper.js
|
|
16857
17315
|
var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
16858
17316
|
constructor(config) {
|
|
@@ -16881,7 +17339,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
16881
17339
|
* @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
|
|
16882
17340
|
*/
|
|
16883
17341
|
async _extract_fbank_features(waveform) {
|
|
16884
|
-
|
|
17342
|
+
return await spectrogram(
|
|
16885
17343
|
waveform,
|
|
16886
17344
|
this.window,
|
|
16887
17345
|
// window
|
|
@@ -16892,7 +17350,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
16892
17350
|
{
|
|
16893
17351
|
power: 2,
|
|
16894
17352
|
mel_filters: this.config.mel_filters,
|
|
16895
|
-
log_mel: "
|
|
17353
|
+
log_mel: "log10_max_norm",
|
|
16896
17354
|
// Custom
|
|
16897
17355
|
max_num_frames: Math.min(
|
|
16898
17356
|
Math.floor(waveform.length / this.config.hop_length),
|
|
@@ -16901,15 +17359,6 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
16901
17359
|
)
|
|
16902
17360
|
}
|
|
16903
17361
|
);
|
|
16904
|
-
const data = features.data;
|
|
16905
|
-
const maxValue = max(
|
|
16906
|
-
/** @type {Float32Array} */
|
|
16907
|
-
data
|
|
16908
|
-
)[0];
|
|
16909
|
-
for (let i = 0; i < data.length; ++i) {
|
|
16910
|
-
data[i] = (Math.max(data[i], maxValue - 8) + 4) / 4;
|
|
16911
|
-
}
|
|
16912
|
-
return features;
|
|
16913
17362
|
}
|
|
16914
17363
|
/**
|
|
16915
17364
|
* Asynchronously extracts features from a given audio using the provided configuration.
|
|
@@ -17788,6 +18237,30 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
17788
18237
|
}
|
|
17789
18238
|
return [segmentation, segments];
|
|
17790
18239
|
}
|
|
18240
|
+
/**
 * Computes output dimensions that are multiples of `factor`, keep the aspect ratio
 * approximately, and give a pixel budget (`temporal_factor * height * width`) between
 * `min_pixels` and `max_pixels`.
 *
 * Inputs smaller than `factor` on either side are first scaled up so that the smaller
 * side reaches `factor`.
 *
 * @param {number} height Input height in pixels; must be a positive finite number.
 * @param {number} width Input width in pixels; must be a positive finite number.
 * @param {number} [factor=28] Both returned dimensions are multiples of this value.
 * @param {number} [min_pixels=3136] Lower bound on the pixel budget.
 * @param {number} [max_pixels=1003520] Upper bound on the pixel budget.
 * @param {number} [temporal_factor=1] Multiplier applied to the pixel count before comparing with the bounds.
 * @returns {[number, number]} The resized dimensions as `[width, height]`.
 * @throws {Error} If `height` or `width` is not a positive finite number, or if the
 * aspect ratio (after the optional up-scaling) exceeds 200.
 */
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
  // Zero, negative, NaN or infinite sizes would otherwise propagate as NaN/Infinity
  // through every computation below and be returned as "valid" dimensions.
  if (!(Number.isFinite(height) && Number.isFinite(width) && height > 0 && width > 0)) {
    throw new Error(`height and width must be positive finite numbers, got height=${height}, width=${width}`);
  }
  if (height < factor || width < factor) {
    const scale = Math.max(factor / height, factor / width);
    height = Math.round(height * scale);
    width = Math.round(width * scale);
  }
  const aspect_ratio = Math.max(height, width) / Math.min(height, width);
  if (aspect_ratio > 200) {
    throw new Error(
      `absolute aspect ratio must be smaller than 200, got ${aspect_ratio}`
    );
  }
  let h_bar = Math.round(height / factor) * factor;
  let w_bar = Math.round(width / factor) * factor;
  if (temporal_factor * h_bar * w_bar > max_pixels) {
    // Too large: shrink both sides by the same ratio, rounding down to a multiple of factor.
    const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
    h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
    w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
  } else if (temporal_factor * h_bar * w_bar < min_pixels) {
    // Too small: grow both sides by the same ratio, rounding up to a multiple of factor.
    const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
    h_bar = Math.ceil(height * beta / factor) * factor;
    w_bar = Math.ceil(width * beta / factor) * factor;
  }
  return [w_bar, h_bar];
}
|
|
17791
18264
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
17792
18265
|
if (label_ids_to_fuse === null) {
|
|
17793
18266
|
logger.warn("`label_ids_to_fuse` unset. No instance will be fused.");
|
|
@@ -17865,7 +18338,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
17865
18338
|
this.do_pad = config.do_pad;
|
|
17866
18339
|
this.min_pixels = config.min_pixels;
|
|
17867
18340
|
this.max_pixels = config.max_pixels;
|
|
17868
|
-
if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
18341
|
+
if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
17869
18342
|
this.pad_size = this.size;
|
|
17870
18343
|
}
|
|
17871
18344
|
this.do_flip_channel_order = config.do_flip_channel_order ?? false;
|
|
@@ -18076,7 +18549,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18076
18549
|
});
|
|
18077
18550
|
}
|
|
18078
18551
|
/**
|
|
18079
|
-
* @typedef {
|
|
18552
|
+
* @typedef {Object} PreprocessedImage
|
|
18080
18553
|
* @property {HeightWidth} original_size The original size of the image.
|
|
18081
18554
|
* @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
|
|
18082
18555
|
* @property {Tensor} pixel_values The pixel values of the preprocessed image.
|
|
@@ -18153,10 +18626,8 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18153
18626
|
const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
|
|
18154
18627
|
[pixelData, imgDims] = padded;
|
|
18155
18628
|
} else if (this.size_divisibility) {
|
|
18156
|
-
const
|
|
18157
|
-
|
|
18158
|
-
this.size_divisibility
|
|
18159
|
-
);
|
|
18629
|
+
const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
|
|
18630
|
+
const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
|
|
18160
18631
|
[pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
|
|
18161
18632
|
}
|
|
18162
18633
|
}
|
|
@@ -18233,6 +18704,7 @@ var image_processors_exports = {};
|
|
|
18233
18704
|
__export(image_processors_exports, {
|
|
18234
18705
|
BeitFeatureExtractor: () => BeitFeatureExtractor,
|
|
18235
18706
|
BitImageProcessor: () => BitImageProcessor,
|
|
18707
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
18236
18708
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
18237
18709
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
18238
18710
|
ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
|
|
@@ -18249,11 +18721,13 @@ __export(image_processors_exports, {
|
|
|
18249
18721
|
DonutImageProcessor: () => DonutImageProcessor,
|
|
18250
18722
|
EfficientNetImageProcessor: () => EfficientNetImageProcessor,
|
|
18251
18723
|
GLPNFeatureExtractor: () => GLPNFeatureExtractor,
|
|
18724
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
18252
18725
|
GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
|
|
18253
18726
|
Idefics3ImageProcessor: () => Idefics3ImageProcessor,
|
|
18254
18727
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
18255
18728
|
ImageProcessor: () => ImageProcessor,
|
|
18256
18729
|
JinaCLIPImageProcessor: () => JinaCLIPImageProcessor,
|
|
18730
|
+
Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
|
|
18257
18731
|
LlavaOnevisionImageProcessor: () => LlavaOnevisionImageProcessor,
|
|
18258
18732
|
Mask2FormerImageProcessor: () => Mask2FormerImageProcessor,
|
|
18259
18733
|
MaskFormerFeatureExtractor: () => MaskFormerFeatureExtractor,
|
|
@@ -18308,6 +18782,10 @@ var BitImageProcessor = class extends ImageProcessor {
|
|
|
18308
18782
|
var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
|
|
18309
18783
|
};
|
|
18310
18784
|
|
|
18785
|
+
// src/models/chmv2/image_processing_chmv2.js
|
|
18786
|
+
// CHMv2 image processor: an empty subclass that adds nothing to the base ImageProcessor.
var CHMv2ImageProcessor = class extends ImageProcessor {
|
|
18787
|
+
};
|
|
18788
|
+
|
|
18311
18789
|
// src/models/clip/image_processing_clip.js
|
|
18312
18790
|
var CLIPImageProcessor = class extends ImageProcessor {
|
|
18313
18791
|
};
|
|
@@ -18427,32 +18905,91 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
|
|
|
18427
18905
|
}
|
|
18428
18906
|
};
|
|
18429
18907
|
|
|
18430
|
-
// src/models/
|
|
18431
|
-
var
|
|
18432
|
-
};
|
|
18433
|
-
|
|
18434
|
-
// src/models/grounding_dino/image_processing_grounding_dino.js
|
|
18435
|
-
var GroundingDinoImageProcessor = class extends ImageProcessor {
|
|
18436
|
-
/**
|
|
18437
|
-
* Calls the feature extraction process on an array of images, preprocesses
|
|
18438
|
-
* each image, and concatenates the resulting features into a single Tensor.
|
|
18439
|
-
* @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from.
|
|
18440
|
-
* @returns {Promise<GroundingDinoFeatureExtractorResult>} An object containing the concatenated pixel values of the preprocessed images.
|
|
18441
|
-
*/
|
|
18442
|
-
async _call(images) {
|
|
18443
|
-
const result = await super._call(images);
|
|
18444
|
-
const dims = result.pixel_values.dims;
|
|
18445
|
-
const pixel_mask = ones([dims[0], dims[2], dims[3]]);
|
|
18446
|
-
return { ...result, pixel_mask };
|
|
18447
|
-
}
|
|
18448
|
-
};
|
|
18449
|
-
|
|
18450
|
-
// src/models/idefics3/image_processing_idefics3.js
|
|
18451
|
-
var Idefics3ImageProcessor = class extends ImageProcessor {
|
|
18908
|
+
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
18909
|
+
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
18452
18910
|
constructor(config) {
|
|
18453
18911
|
super(config);
|
|
18454
|
-
this.
|
|
18455
|
-
this.
|
|
18912
|
+
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
18913
|
+
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
18914
|
+
this.patch_size = config.patch_size;
|
|
18915
|
+
this.merge_size = config.merge_size;
|
|
18916
|
+
}
|
|
18917
|
+
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
18918
|
+
get_resize_output_image_size(image, size) {
|
|
18919
|
+
const factor = this.patch_size * this.merge_size;
|
|
18920
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
18921
|
+
}
|
|
18922
|
+
async _call(images, ...args) {
|
|
18923
|
+
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
18924
|
+
let patches = pixel_values;
|
|
18925
|
+
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
18926
|
+
if (patches.dims[0] === 1) {
|
|
18927
|
+
patches = cat(
|
|
18928
|
+
Array.from({ length: temporal_patch_size }, () => patches),
|
|
18929
|
+
0
|
|
18930
|
+
);
|
|
18931
|
+
}
|
|
18932
|
+
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
18933
|
+
const channel = patches.dims[1];
|
|
18934
|
+
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
18935
|
+
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
18936
|
+
const flatten_patches = patches.view(
|
|
18937
|
+
grid_t,
|
|
18938
|
+
temporal_patch_size,
|
|
18939
|
+
channel,
|
|
18940
|
+
Math.floor(grid_h / merge_size),
|
|
18941
|
+
merge_size,
|
|
18942
|
+
patch_size,
|
|
18943
|
+
Math.floor(grid_w / merge_size),
|
|
18944
|
+
merge_size,
|
|
18945
|
+
patch_size
|
|
18946
|
+
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
18947
|
+
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
18948
|
+
return {
|
|
18949
|
+
pixel_values: flatten_patches,
|
|
18950
|
+
image_grid_thw,
|
|
18951
|
+
original_sizes,
|
|
18952
|
+
reshaped_input_sizes
|
|
18953
|
+
};
|
|
18954
|
+
}
|
|
18955
|
+
};
|
|
18956
|
+
|
|
18957
|
+
// src/models/glm46v/image_processing_glm46v.js
|
|
18958
|
+
var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
|
|
18959
|
+
/** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
|
|
18960
|
+
get_resize_output_image_size(image, size) {
|
|
18961
|
+
const factor = this.patch_size * this.merge_size;
|
|
18962
|
+
const temporal_factor = this.config.temporal_patch_size ?? 2;
|
|
18963
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
|
|
18964
|
+
}
|
|
18965
|
+
};
|
|
18966
|
+
|
|
18967
|
+
// src/models/glpn/image_processing_glpn.js
|
|
18968
|
+
var GLPNFeatureExtractor = class extends ImageProcessor {
|
|
18969
|
+
};
|
|
18970
|
+
|
|
18971
|
+
// src/models/grounding_dino/image_processing_grounding_dino.js
|
|
18972
|
+
var GroundingDinoImageProcessor = class extends ImageProcessor {
|
|
18973
|
+
/**
|
|
18974
|
+
* Calls the feature extraction process on an array of images, preprocesses
|
|
18975
|
+
* each image, and concatenates the resulting features into a single Tensor.
|
|
18976
|
+
* @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from.
|
|
18977
|
+
* @returns {Promise<GroundingDinoFeatureExtractorResult>} An object containing the concatenated pixel values of the preprocessed images.
|
|
18978
|
+
*/
|
|
18979
|
+
async _call(images) {
|
|
18980
|
+
const result = await super._call(images);
|
|
18981
|
+
const dims = result.pixel_values.dims;
|
|
18982
|
+
const pixel_mask = ones([dims[0], dims[2], dims[3]]);
|
|
18983
|
+
return { ...result, pixel_mask };
|
|
18984
|
+
}
|
|
18985
|
+
};
|
|
18986
|
+
|
|
18987
|
+
// src/models/idefics3/image_processing_idefics3.js
|
|
18988
|
+
var Idefics3ImageProcessor = class extends ImageProcessor {
|
|
18989
|
+
constructor(config) {
|
|
18990
|
+
super(config);
|
|
18991
|
+
this.do_image_splitting = config.do_image_splitting ?? true;
|
|
18992
|
+
this.max_image_size = config.max_image_size;
|
|
18456
18993
|
}
|
|
18457
18994
|
/**
|
|
18458
18995
|
* @typedef {import('../../utils/image.js').RawImage} RawImage
|
|
@@ -18657,6 +19194,237 @@ var JinaCLIPImageProcessor = class extends ImageProcessor {
|
|
|
18657
19194
|
}
|
|
18658
19195
|
};
|
|
18659
19196
|
|
|
19197
|
+
// src/models/lfm2_vl/image_processing_lfm2_vl.js
|
|
19198
|
+
function round_by_factor(number, factor) {
|
|
19199
|
+
return Math.round(number / factor) * factor;
|
|
19200
|
+
}
|
|
19201
|
+
function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
|
|
19202
|
+
let best_ratio_diff = Infinity;
|
|
19203
|
+
let best_ratio = [1, 1];
|
|
19204
|
+
const area = width * height;
|
|
19205
|
+
for (const ratio of target_ratios) {
|
|
19206
|
+
const ratio_diff = Math.abs(aspect_ratio - ratio[0] / ratio[1]);
|
|
19207
|
+
if (ratio_diff < best_ratio_diff) {
|
|
19208
|
+
best_ratio_diff = ratio_diff;
|
|
19209
|
+
best_ratio = ratio;
|
|
19210
|
+
} else if (ratio_diff === best_ratio_diff && area > 0.5 * image_size * image_size * ratio[0] * ratio[1]) {
|
|
19211
|
+
best_ratio = ratio;
|
|
19212
|
+
}
|
|
19213
|
+
}
|
|
19214
|
+
return best_ratio;
|
|
19215
|
+
}
|
|
19216
|
+
function get_target_ratios(min_tiles, max_tiles) {
|
|
19217
|
+
const ratios = [];
|
|
19218
|
+
const seen = /* @__PURE__ */ new Set();
|
|
19219
|
+
for (let n = min_tiles; n <= max_tiles; ++n) {
|
|
19220
|
+
for (let w = 1; w <= n; ++w) {
|
|
19221
|
+
for (let h = 1; h <= n; ++h) {
|
|
19222
|
+
const product2 = w * h;
|
|
19223
|
+
if (product2 >= min_tiles && product2 <= max_tiles) {
|
|
19224
|
+
const key = w << 16 | h;
|
|
19225
|
+
if (!seen.has(key)) {
|
|
19226
|
+
seen.add(key);
|
|
19227
|
+
ratios.push([w, h]);
|
|
19228
|
+
}
|
|
19229
|
+
}
|
|
19230
|
+
}
|
|
19231
|
+
}
|
|
19232
|
+
}
|
|
19233
|
+
return ratios.sort((a, b) => a[0] * a[1] - b[0] * b[1]);
|
|
19234
|
+
}
|
|
19235
|
+
function convert_image_to_patches(images, patch_size) {
|
|
19236
|
+
const [B, C, H, W] = images.dims;
|
|
19237
|
+
const ph = Math.floor(H / patch_size), pw = Math.floor(W / patch_size);
|
|
19238
|
+
const patch_dim = patch_size * patch_size * C;
|
|
19239
|
+
const data = (
|
|
19240
|
+
/** @type {Float32Array} */
|
|
19241
|
+
images.data
|
|
19242
|
+
);
|
|
19243
|
+
const result = new Float32Array(B * ph * pw * patch_dim);
|
|
19244
|
+
const ch_stride = H * W;
|
|
19245
|
+
for (let b = 0; b < B; ++b) {
|
|
19246
|
+
const b_src = b * C * ch_stride;
|
|
19247
|
+
const b_dst = b * ph * pw * patch_dim;
|
|
19248
|
+
for (let py = 0; py < ph; ++py) {
|
|
19249
|
+
for (let px = 0; px < pw; ++px) {
|
|
19250
|
+
let off = b_dst + (py * pw + px) * patch_dim;
|
|
19251
|
+
for (let dy = 0; dy < patch_size; ++dy) {
|
|
19252
|
+
const row = (py * patch_size + dy) * W + px * patch_size;
|
|
19253
|
+
for (let dx = 0; dx < patch_size; ++dx) {
|
|
19254
|
+
const pixel = row + dx;
|
|
19255
|
+
for (let c = 0; c < C; ++c) {
|
|
19256
|
+
result[off++] = data[b_src + c * ch_stride + pixel];
|
|
19257
|
+
}
|
|
19258
|
+
}
|
|
19259
|
+
}
|
|
19260
|
+
}
|
|
19261
|
+
}
|
|
19262
|
+
}
|
|
19263
|
+
return new Tensor2("float32", result, [B, ph * pw, patch_dim]);
|
|
19264
|
+
}
|
|
19265
|
+
function pad_along_first_dim(patches, target_length) {
|
|
19266
|
+
const [, len2, dim] = patches.dims;
|
|
19267
|
+
const mask_data = new BigInt64Array(target_length);
|
|
19268
|
+
mask_data.fill(1n, 0, len2);
|
|
19269
|
+
let padded = patches;
|
|
19270
|
+
if (len2 < target_length) {
|
|
19271
|
+
const padded_data = new Float32Array(target_length * dim);
|
|
19272
|
+
padded_data.set(
|
|
19273
|
+
/** @type {Float32Array} */
|
|
19274
|
+
patches.data
|
|
19275
|
+
);
|
|
19276
|
+
padded = new Tensor2("float32", padded_data, [1, target_length, dim]);
|
|
19277
|
+
}
|
|
19278
|
+
return { padded, mask: new Tensor2("int64", mask_data, [target_length]) };
|
|
19279
|
+
}
|
|
19280
|
+
var Lfm2VlImageProcessor = class extends ImageProcessor {
|
|
19281
|
+
constructor(config) {
|
|
19282
|
+
super(config);
|
|
19283
|
+
this.downsample_factor = config.downsample_factor ?? 2;
|
|
19284
|
+
this.do_image_splitting = config.do_image_splitting ?? true;
|
|
19285
|
+
this.min_tiles = config.min_tiles ?? 2;
|
|
19286
|
+
this.max_tiles = config.max_tiles ?? 10;
|
|
19287
|
+
this.use_thumbnail = config.use_thumbnail ?? true;
|
|
19288
|
+
this.min_image_tokens = config.min_image_tokens ?? 64;
|
|
19289
|
+
this.max_image_tokens = config.max_image_tokens ?? 256;
|
|
19290
|
+
this.encoder_patch_size = config.encoder_patch_size ?? config.patch_size ?? 16;
|
|
19291
|
+
this.tile_size = config.tile_size ?? 512;
|
|
19292
|
+
this.max_pixels_tolerance = config.max_pixels_tolerance ?? 2;
|
|
19293
|
+
this.return_row_col_info = config.return_row_col_info ?? false;
|
|
19294
|
+
const max_thumbnail_patches = this.max_image_tokens * this.downsample_factor ** 2;
|
|
19295
|
+
const tile_size_patches = this.do_image_splitting ? (this.tile_size / this.encoder_patch_size) ** 2 : 0;
|
|
19296
|
+
this.max_num_patches = Math.max(max_thumbnail_patches, tile_size_patches);
|
|
19297
|
+
}
|
|
19298
|
+
/**
|
|
19299
|
+
* Check if the image is too large to be processed as a single tile.
|
|
19300
|
+
* @param {number} height
|
|
19301
|
+
* @param {number} width
|
|
19302
|
+
* @returns {boolean}
|
|
19303
|
+
*/
|
|
19304
|
+
_is_image_too_large(height, width) {
|
|
19305
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
19306
|
+
const h_bar = Math.max(this.encoder_patch_size, round_by_factor(height, total_factor));
|
|
19307
|
+
const w_bar = Math.max(this.encoder_patch_size, round_by_factor(width, total_factor));
|
|
19308
|
+
return h_bar * w_bar > this.max_image_tokens * (this.encoder_patch_size * this.downsample_factor) ** 2 * this.max_pixels_tolerance;
|
|
19309
|
+
}
|
|
19310
|
+
/**
|
|
19311
|
+
* Get the grid layout for tiling a large image.
|
|
19312
|
+
* @param {number} height
|
|
19313
|
+
* @param {number} width
|
|
19314
|
+
* @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
|
|
19315
|
+
*/
|
|
19316
|
+
_get_grid_layout(height, width) {
|
|
19317
|
+
const target_ratios = get_target_ratios(this.min_tiles, this.max_tiles);
|
|
19318
|
+
const [grid_width, grid_height] = find_closest_aspect_ratio(
|
|
19319
|
+
width / height,
|
|
19320
|
+
target_ratios,
|
|
19321
|
+
width,
|
|
19322
|
+
height,
|
|
19323
|
+
this.tile_size
|
|
19324
|
+
);
|
|
19325
|
+
return {
|
|
19326
|
+
grid_width,
|
|
19327
|
+
grid_height,
|
|
19328
|
+
target_width: this.tile_size * grid_width,
|
|
19329
|
+
target_height: this.tile_size * grid_height
|
|
19330
|
+
};
|
|
19331
|
+
}
|
|
19332
|
+
/** @param {RawImage|RawImage[]|RawImage[][]} images */
|
|
19333
|
+
// @ts-expect-error
|
|
19334
|
+
async _call(images, { return_row_col_info = null } = {}) {
|
|
19335
|
+
let batched_images;
|
|
19336
|
+
if (!Array.isArray(images)) {
|
|
19337
|
+
batched_images = [[images]];
|
|
19338
|
+
} else if (!Array.isArray(images[0])) {
|
|
19339
|
+
batched_images = [
|
|
19340
|
+
/** @type {RawImage[]} */
|
|
19341
|
+
images
|
|
19342
|
+
];
|
|
19343
|
+
} else {
|
|
19344
|
+
batched_images = /** @type {RawImage[][]} */
|
|
19345
|
+
images;
|
|
19346
|
+
}
|
|
19347
|
+
const all_pixel_values = [];
|
|
19348
|
+
const all_pixel_masks = [];
|
|
19349
|
+
const all_spatial_shapes = [];
|
|
19350
|
+
const all_rows = [];
|
|
19351
|
+
const all_cols = [];
|
|
19352
|
+
const all_image_sizes = [];
|
|
19353
|
+
for (const image_batch of batched_images) {
|
|
19354
|
+
const preprocessed = await Promise.all(image_batch.map((x) => this.preprocess(x, { do_pad: false })));
|
|
19355
|
+
for (const { pixel_values } of preprocessed) {
|
|
19356
|
+
const [, height, width] = pixel_values.dims;
|
|
19357
|
+
const img = pixel_values.unsqueeze_(0);
|
|
19358
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
19359
|
+
const f2 = total_factor ** 2;
|
|
19360
|
+
const [new_width, new_height] = smart_resize(
|
|
19361
|
+
Math.max(total_factor, height),
|
|
19362
|
+
Math.max(total_factor, width),
|
|
19363
|
+
total_factor,
|
|
19364
|
+
this.min_image_tokens * f2,
|
|
19365
|
+
this.max_image_tokens * f2
|
|
19366
|
+
).map((x) => Math.max(total_factor, x));
|
|
19367
|
+
let tiles;
|
|
19368
|
+
let num_rows = 1, num_cols = 1;
|
|
19369
|
+
const is_large = this._is_image_too_large(height, width);
|
|
19370
|
+
const do_splitting = this.do_image_splitting && !(this.min_tiles === 1 && this.max_tiles === 1);
|
|
19371
|
+
if (is_large && do_splitting) {
|
|
19372
|
+
const { grid_width, grid_height, target_width, target_height } = this._get_grid_layout(
|
|
19373
|
+
height,
|
|
19374
|
+
width
|
|
19375
|
+
);
|
|
19376
|
+
num_rows = grid_height;
|
|
19377
|
+
num_cols = grid_width;
|
|
19378
|
+
const resized = await interpolate_4d(img, {
|
|
19379
|
+
size: [target_height, target_width]
|
|
19380
|
+
});
|
|
19381
|
+
tiles = [];
|
|
19382
|
+
for (let r = 0; r < grid_height; ++r) {
|
|
19383
|
+
for (let c = 0; c < grid_width; ++c) {
|
|
19384
|
+
const y = r * this.tile_size;
|
|
19385
|
+
const x = c * this.tile_size;
|
|
19386
|
+
tiles.push(resized.slice(null, null, [y, y + this.tile_size], [x, x + this.tile_size]));
|
|
19387
|
+
}
|
|
19388
|
+
}
|
|
19389
|
+
if (this.use_thumbnail && grid_width * grid_height !== 1) {
|
|
19390
|
+
tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
|
|
19391
|
+
}
|
|
19392
|
+
} else {
|
|
19393
|
+
tiles = [await interpolate_4d(img, { size: [new_height, new_width] })];
|
|
19394
|
+
}
|
|
19395
|
+
for (const tile of tiles) {
|
|
19396
|
+
const [, , th, tw] = tile.dims;
|
|
19397
|
+
const patches = convert_image_to_patches(tile, this.encoder_patch_size);
|
|
19398
|
+
const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
|
|
19399
|
+
all_pixel_values.push(padded);
|
|
19400
|
+
all_pixel_masks.push(mask);
|
|
19401
|
+
all_spatial_shapes.push([
|
|
19402
|
+
Math.floor(th / this.encoder_patch_size),
|
|
19403
|
+
Math.floor(tw / this.encoder_patch_size)
|
|
19404
|
+
]);
|
|
19405
|
+
}
|
|
19406
|
+
all_rows.push(num_rows);
|
|
19407
|
+
all_cols.push(num_cols);
|
|
19408
|
+
all_image_sizes.push([new_height, new_width]);
|
|
19409
|
+
}
|
|
19410
|
+
}
|
|
19411
|
+
const result = {
|
|
19412
|
+
pixel_values: cat(all_pixel_values, 0),
|
|
19413
|
+
pixel_attention_mask: stack(all_pixel_masks, 0),
|
|
19414
|
+
spatial_shapes: new Tensor2("int64", BigInt64Array.from(all_spatial_shapes.flat(), BigInt), [
|
|
19415
|
+
all_spatial_shapes.length,
|
|
19416
|
+
2
|
|
19417
|
+
])
|
|
19418
|
+
};
|
|
19419
|
+
if (return_row_col_info ?? this.return_row_col_info) {
|
|
19420
|
+
result.image_rows = all_rows;
|
|
19421
|
+
result.image_cols = all_cols;
|
|
19422
|
+
result.image_sizes = all_image_sizes;
|
|
19423
|
+
}
|
|
19424
|
+
return result;
|
|
19425
|
+
}
|
|
19426
|
+
};
|
|
19427
|
+
|
|
18660
19428
|
// src/models/llava_onevision/image_processing_llava_onevision.js
|
|
18661
19429
|
var LlavaOnevisionImageProcessor = class extends ImageProcessor {
|
|
18662
19430
|
};
|
|
@@ -18879,76 +19647,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
|
|
|
18879
19647
|
var PvtImageProcessor = class extends ImageProcessor {
|
|
18880
19648
|
};
|
|
18881
19649
|
|
|
18882
|
-
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
18883
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
18884
|
-
if (height < factor || width < factor) {
|
|
18885
|
-
throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
|
|
18886
|
-
} else if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
18887
|
-
throw new Error(
|
|
18888
|
-
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
18889
|
-
);
|
|
18890
|
-
}
|
|
18891
|
-
let h_bar = Math.round(height / factor) * factor;
|
|
18892
|
-
let w_bar = Math.round(width / factor) * factor;
|
|
18893
|
-
if (h_bar * w_bar > max_pixels) {
|
|
18894
|
-
const beta = Math.sqrt(height * width / max_pixels);
|
|
18895
|
-
h_bar = Math.floor(height / beta / factor) * factor;
|
|
18896
|
-
w_bar = Math.floor(width / beta / factor) * factor;
|
|
18897
|
-
} else if (h_bar * w_bar < min_pixels) {
|
|
18898
|
-
const beta = Math.sqrt(min_pixels / (height * width));
|
|
18899
|
-
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
18900
|
-
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
18901
|
-
}
|
|
18902
|
-
return [h_bar, w_bar];
|
|
18903
|
-
}
|
|
18904
|
-
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
18905
|
-
constructor(config) {
|
|
18906
|
-
super(config);
|
|
18907
|
-
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
18908
|
-
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
18909
|
-
this.patch_size = config.patch_size;
|
|
18910
|
-
this.merge_size = config.merge_size;
|
|
18911
|
-
}
|
|
18912
|
-
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
18913
|
-
get_resize_output_image_size(image, size) {
|
|
18914
|
-
const factor = this.patch_size * this.merge_size;
|
|
18915
|
-
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
18916
|
-
}
|
|
18917
|
-
async _call(images, ...args) {
|
|
18918
|
-
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
18919
|
-
let patches = pixel_values;
|
|
18920
|
-
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
18921
|
-
if (patches.dims[0] === 1) {
|
|
18922
|
-
patches = cat(
|
|
18923
|
-
Array.from({ length: temporal_patch_size }, () => patches),
|
|
18924
|
-
0
|
|
18925
|
-
);
|
|
18926
|
-
}
|
|
18927
|
-
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
18928
|
-
const channel = patches.dims[1];
|
|
18929
|
-
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
18930
|
-
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
18931
|
-
const flatten_patches = patches.view(
|
|
18932
|
-
grid_t,
|
|
18933
|
-
temporal_patch_size,
|
|
18934
|
-
channel,
|
|
18935
|
-
Math.floor(grid_h / merge_size),
|
|
18936
|
-
merge_size,
|
|
18937
|
-
patch_size,
|
|
18938
|
-
Math.floor(grid_w / merge_size),
|
|
18939
|
-
merge_size,
|
|
18940
|
-
patch_size
|
|
18941
|
-
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
18942
|
-
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
18943
|
-
return {
|
|
18944
|
-
pixel_values: flatten_patches,
|
|
18945
|
-
image_grid_thw,
|
|
18946
|
-
original_sizes,
|
|
18947
|
-
reshaped_input_sizes
|
|
18948
|
-
};
|
|
18949
|
-
}
|
|
18950
|
-
};
|
|
18951
|
-
|
|
18952
19650
|
// src/models/rt_detr/image_processing_rt_detr.js
|
|
18953
19651
|
var RTDetrImageProcessor = class extends ImageProcessor {
|
|
18954
19652
|
/** @type {typeof post_process_object_detection} */
|
|
@@ -19502,6 +20200,107 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
19502
20200
|
}
|
|
19503
20201
|
};
|
|
19504
20202
|
|
|
20203
|
+
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
20204
|
+
var Qwen2VLProcessor = class extends Processor {
|
|
20205
|
+
static image_processor_class = AutoImageProcessor;
|
|
20206
|
+
static tokenizer_class = AutoTokenizer;
|
|
20207
|
+
static image_token = "<|image_pad|>";
|
|
20208
|
+
/**
|
|
20209
|
+
*
|
|
20210
|
+
* @param {string|string[]} text
|
|
20211
|
+
* @param {RawImage|RawImage[]} images
|
|
20212
|
+
* @param {...any} args
|
|
20213
|
+
* @returns {Promise<any>}
|
|
20214
|
+
*/
|
|
20215
|
+
async _call(text, images = null, ...args) {
|
|
20216
|
+
if (!Array.isArray(text)) {
|
|
20217
|
+
text = [text];
|
|
20218
|
+
}
|
|
20219
|
+
let image_inputs, image_grid_thw;
|
|
20220
|
+
if (images) {
|
|
20221
|
+
image_inputs = await this.image_processor(images);
|
|
20222
|
+
image_grid_thw = image_inputs.image_grid_thw;
|
|
20223
|
+
}
|
|
20224
|
+
if (image_grid_thw) {
|
|
20225
|
+
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
20226
|
+
let index = 0;
|
|
20227
|
+
const image_token = (
|
|
20228
|
+
/** @type {typeof Qwen2VLProcessor} */
|
|
20229
|
+
this.constructor.image_token
|
|
20230
|
+
);
|
|
20231
|
+
const image_grid_thw_list = image_grid_thw.tolist();
|
|
20232
|
+
text = text.map((t) => {
|
|
20233
|
+
while (t.includes(image_token)) {
|
|
20234
|
+
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
20235
|
+
t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
20236
|
+
}
|
|
20237
|
+
return t.replaceAll("<|placeholder|>", image_token);
|
|
20238
|
+
});
|
|
20239
|
+
}
|
|
20240
|
+
const text_inputs = this.tokenizer(text);
|
|
20241
|
+
return {
|
|
20242
|
+
...text_inputs,
|
|
20243
|
+
...image_inputs
|
|
20244
|
+
};
|
|
20245
|
+
}
|
|
20246
|
+
};
|
|
20247
|
+
|
|
20248
|
+
// src/models/glm46v/processing_glm46v.js
|
|
20249
|
+
var Glm46VProcessor = class extends Qwen2VLProcessor {
|
|
20250
|
+
static image_token = "<|image|>";
|
|
20251
|
+
};
|
|
20252
|
+
|
|
20253
|
+
// src/models/granite_speech/processing_granite_speech.js
|
|
20254
|
+
var GraniteSpeechProcessor = class extends Processor {
|
|
20255
|
+
static tokenizer_class = AutoTokenizer;
|
|
20256
|
+
static feature_extractor_class = AutoFeatureExtractor;
|
|
20257
|
+
static uses_processor_config = true;
|
|
20258
|
+
/**
|
|
20259
|
+
* Compute the number of audio tokens for a given raw audio length.
|
|
20260
|
+
* @param {number} audioLength Raw audio sample count.
|
|
20261
|
+
* @returns {number} Number of projector output tokens.
|
|
20262
|
+
*/
|
|
20263
|
+
_get_num_audio_features(audioLength) {
|
|
20264
|
+
const { hop_length } = this.feature_extractor.config.melspec_kwargs;
|
|
20265
|
+
const { projector_window_size, projector_downsample_rate } = this.feature_extractor.config;
|
|
20266
|
+
const effective_window_size = Math.floor(projector_window_size / projector_downsample_rate);
|
|
20267
|
+
const mel_length = Math.floor(audioLength / hop_length) + 1;
|
|
20268
|
+
const encoder_length = Math.floor(mel_length / 2);
|
|
20269
|
+
const nblocks = Math.ceil(encoder_length / projector_window_size);
|
|
20270
|
+
return nblocks * effective_window_size;
|
|
20271
|
+
}
|
|
20272
|
+
/**
|
|
20273
|
+
* @param {string} text The text input to process.
|
|
20274
|
+
* @param {Float32Array} audio The audio input to process.
|
|
20275
|
+
*/
|
|
20276
|
+
async _call(text, audio = null, kwargs = {}) {
|
|
20277
|
+
if (Array.isArray(text)) {
|
|
20278
|
+
throw new Error("Batched inputs are not supported yet.");
|
|
20279
|
+
}
|
|
20280
|
+
let audio_inputs = {};
|
|
20281
|
+
if (audio) {
|
|
20282
|
+
const { input_features } = await this.feature_extractor(audio);
|
|
20283
|
+
audio_inputs["input_features"] = input_features;
|
|
20284
|
+
const audio_embed_size = this._get_num_audio_features(audio.length);
|
|
20285
|
+
const mask_data = new Uint8Array(audio_embed_size).fill(1);
|
|
20286
|
+
audio_inputs["input_features_mask"] = new Tensor2("bool", mask_data, [1, audio_embed_size]);
|
|
20287
|
+
const audio_token = this.config.audio_token ?? "<|audio|>";
|
|
20288
|
+
if (!text.includes(audio_token)) {
|
|
20289
|
+
throw new Error(`The input text does not contain the audio token ${audio_token}.`);
|
|
20290
|
+
}
|
|
20291
|
+
text = text.replaceAll(audio_token, audio_token.repeat(audio_embed_size));
|
|
20292
|
+
}
|
|
20293
|
+
const text_inputs = this.tokenizer(text, {
|
|
20294
|
+
add_special_tokens: false,
|
|
20295
|
+
...kwargs
|
|
20296
|
+
});
|
|
20297
|
+
return {
|
|
20298
|
+
...text_inputs,
|
|
20299
|
+
...audio_inputs
|
|
20300
|
+
};
|
|
20301
|
+
}
|
|
20302
|
+
};
|
|
20303
|
+
|
|
19505
20304
|
// src/models/grounding_dino/processing_grounding_dino.js
|
|
19506
20305
|
function get_phrases_from_posmap(posmaps, input_ids) {
|
|
19507
20306
|
const left_idx = 0;
|
|
@@ -19778,7 +20577,67 @@ var JinaCLIPProcessor = class extends Processor {
|
|
|
19778
20577
|
}
|
|
19779
20578
|
};
|
|
19780
20579
|
|
|
19781
|
-
// src/models/
|
|
20580
|
+
// src/models/lfm2_vl/processing_lfm2_vl.js
|
|
20581
|
+
var Lfm2VlProcessor = class extends Processor {
|
|
20582
|
+
static tokenizer_class = AutoTokenizer;
|
|
20583
|
+
static image_processor_class = AutoImageProcessor;
|
|
20584
|
+
/**
|
|
20585
|
+
* @param {RawImage|RawImage[]} images
|
|
20586
|
+
* @param {string|string[]|null} [text]
|
|
20587
|
+
* @param {Record<string, any>} [kwargs]
|
|
20588
|
+
*/
|
|
20589
|
+
async _call(images, text = null, kwargs = {}) {
|
|
20590
|
+
const { image_rows, image_cols, image_sizes, ...image_inputs } = await this.image_processor(images, {
|
|
20591
|
+
...kwargs,
|
|
20592
|
+
return_row_col_info: true
|
|
20593
|
+
});
|
|
20594
|
+
if (text) {
|
|
20595
|
+
const image_token = this.config.image_token ?? "<image>";
|
|
20596
|
+
const {
|
|
20597
|
+
tile_size = 512,
|
|
20598
|
+
downsample_factor = 2,
|
|
20599
|
+
encoder_patch_size = 16,
|
|
20600
|
+
use_thumbnail = true
|
|
20601
|
+
} = (
|
|
20602
|
+
/** @type {Record<string, any>} */
|
|
20603
|
+
this.image_processor.config
|
|
20604
|
+
);
|
|
20605
|
+
const ds2 = (s) => Math.ceil(Math.floor(s / encoder_patch_size) / downsample_factor);
|
|
20606
|
+
const tokens_per_tile = ds2(tile_size) ** 2;
|
|
20607
|
+
const image_start = this.config.image_start_token ?? "<|image_start|>";
|
|
20608
|
+
const image_end = this.config.image_end_token ?? "<|image_end|>";
|
|
20609
|
+
const thumbnail_token = this.config.image_thumbnail ?? "<|img_thumbnail|>";
|
|
20610
|
+
if (!Array.isArray(text)) text = [text];
|
|
20611
|
+
let image_idx = 0;
|
|
20612
|
+
text = text.map((sample) => {
|
|
20613
|
+
const parts = sample.split(image_token);
|
|
20614
|
+
return parts[0] + parts.slice(1).map((part) => {
|
|
20615
|
+
const idx = image_idx++;
|
|
20616
|
+
const [h, w] = image_sizes[idx];
|
|
20617
|
+
const rows = image_rows[idx], cols = image_cols[idx];
|
|
20618
|
+
const tokens_for_image = ds2(h) * ds2(w);
|
|
20619
|
+
let expanded = image_start;
|
|
20620
|
+
if (rows > 1 || cols > 1) {
|
|
20621
|
+
const tile_str = image_token.repeat(tokens_per_tile);
|
|
20622
|
+
for (let r = 0; r < rows; ++r)
|
|
20623
|
+
for (let c = 0; c < cols; ++c)
|
|
20624
|
+
expanded += `<|img_row_${r + 1}_col_${c + 1}|>` + tile_str;
|
|
20625
|
+
if (use_thumbnail) expanded += thumbnail_token + image_token.repeat(tokens_for_image);
|
|
20626
|
+
} else {
|
|
20627
|
+
expanded += image_token.repeat(tokens_for_image);
|
|
20628
|
+
}
|
|
20629
|
+
return expanded + image_end + part;
|
|
20630
|
+
}).join("");
|
|
20631
|
+
});
|
|
20632
|
+
}
|
|
20633
|
+
return {
|
|
20634
|
+
...image_inputs,
|
|
20635
|
+
...text ? this.tokenizer(text, kwargs) : {}
|
|
20636
|
+
};
|
|
20637
|
+
}
|
|
20638
|
+
};
|
|
20639
|
+
|
|
20640
|
+
// src/models/llava/processing_llava.js
|
|
19782
20641
|
var LlavaProcessor = class extends Processor {
|
|
19783
20642
|
static tokenizer_class = AutoTokenizer;
|
|
19784
20643
|
static image_processor_class = AutoImageProcessor;
|
|
@@ -20121,47 +20980,6 @@ var PyAnnoteProcessor = class extends Processor {
|
|
|
20121
20980
|
}
|
|
20122
20981
|
};
|
|
20123
20982
|
|
|
20124
|
-
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
20125
|
-
var Qwen2VLProcessor = class extends Processor {
|
|
20126
|
-
static image_processor_class = AutoImageProcessor;
|
|
20127
|
-
static tokenizer_class = AutoTokenizer;
|
|
20128
|
-
/**
|
|
20129
|
-
*
|
|
20130
|
-
* @param {string|string[]} text
|
|
20131
|
-
* @param {RawImage|RawImage[]} images
|
|
20132
|
-
* @param {...any} args
|
|
20133
|
-
* @returns {Promise<any>}
|
|
20134
|
-
*/
|
|
20135
|
-
async _call(text, images = null, ...args) {
|
|
20136
|
-
if (!Array.isArray(text)) {
|
|
20137
|
-
text = [text];
|
|
20138
|
-
}
|
|
20139
|
-
let image_inputs, image_grid_thw;
|
|
20140
|
-
if (images) {
|
|
20141
|
-
image_inputs = await this.image_processor(images);
|
|
20142
|
-
image_grid_thw = image_inputs.image_grid_thw;
|
|
20143
|
-
}
|
|
20144
|
-
if (image_grid_thw) {
|
|
20145
|
-
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
20146
|
-
let index = 0;
|
|
20147
|
-
const image_grid_thw_list = image_grid_thw.tolist();
|
|
20148
|
-
text = text.map((t) => {
|
|
20149
|
-
while (t.includes("<|image_pad|>")) {
|
|
20150
|
-
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
20151
|
-
t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
20152
|
-
}
|
|
20153
|
-
return t.replaceAll("<|placeholder|>", "<|image_pad|>");
|
|
20154
|
-
});
|
|
20155
|
-
}
|
|
20156
|
-
const text_inputs = this.tokenizer(text);
|
|
20157
|
-
return {
|
|
20158
|
-
...text_inputs,
|
|
20159
|
-
...image_inputs
|
|
20160
|
-
// TODO: ...videos_inputs,
|
|
20161
|
-
};
|
|
20162
|
-
}
|
|
20163
|
-
};
|
|
20164
|
-
|
|
20165
20983
|
// src/models/qwen2_5_vl/processing_qwen2_5_vl.js
|
|
20166
20984
|
var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
|
|
20167
20985
|
};
|
|
@@ -20310,6 +21128,94 @@ var VoxtralProcessor = class extends Processor {
|
|
|
20310
21128
|
}
|
|
20311
21129
|
};
|
|
20312
21130
|
|
|
21131
|
+
// src/models/voxtral_realtime/processing_voxtral_realtime.js
|
|
21132
|
+
var NUM_LEFT_PAD_TOKENS = 32;
|
|
21133
|
+
var NUM_DELAY_TOKENS = 6;
|
|
21134
|
+
var AUDIO_LENGTH_PER_TOK = 8;
|
|
21135
|
+
var OFFLINE_STREAMING_BUFFER_TOKENS = 10;
|
|
21136
|
+
var STREAMING_PAD_TOKEN_ID = 32;
|
|
21137
|
+
var VoxtralRealtimeProcessor = class extends Processor {
|
|
21138
|
+
static tokenizer_class = AutoTokenizer;
|
|
21139
|
+
static feature_extractor_class = AutoFeatureExtractor;
|
|
21140
|
+
static uses_processor_config = false;
|
|
21141
|
+
/** Number of mel frames in the first audio chunk. */
|
|
21142
|
+
get num_mel_frames_first_audio_chunk() {
|
|
21143
|
+
return (NUM_DELAY_TOKENS + 1) * AUDIO_LENGTH_PER_TOK;
|
|
21144
|
+
}
|
|
21145
|
+
/** Number of raw audio samples in the first audio chunk. */
|
|
21146
|
+
get num_samples_first_audio_chunk() {
|
|
21147
|
+
const { hop_length, n_fft } = this.feature_extractor.config;
|
|
21148
|
+
return (this.num_mel_frames_first_audio_chunk - 1) * hop_length + Math.floor(n_fft / 2);
|
|
21149
|
+
}
|
|
21150
|
+
/** Number of raw audio samples per subsequent audio chunk. */
|
|
21151
|
+
get num_samples_per_audio_chunk() {
|
|
21152
|
+
const { hop_length, n_fft } = this.feature_extractor.config;
|
|
21153
|
+
return AUDIO_LENGTH_PER_TOK * hop_length + n_fft;
|
|
21154
|
+
}
|
|
21155
|
+
/** Number of right-pad tokens for non-streaming mode. */
|
|
21156
|
+
get num_right_pad_tokens() {
|
|
21157
|
+
return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
|
|
21158
|
+
}
|
|
21159
|
+
/** Number of mel frames per text token. */
|
|
21160
|
+
get audio_length_per_tok() {
|
|
21161
|
+
return AUDIO_LENGTH_PER_TOK;
|
|
21162
|
+
}
|
|
21163
|
+
/** Number of raw audio samples per token. */
|
|
21164
|
+
get raw_audio_length_per_tok() {
|
|
21165
|
+
return AUDIO_LENGTH_PER_TOK * this.feature_extractor.config.hop_length;
|
|
21166
|
+
}
|
|
21167
|
+
/**
|
|
21168
|
+
* Process audio input for VoxtralRealtime.
|
|
21169
|
+
*
|
|
21170
|
+
* In streaming mode with `is_first_audio_chunk=true`, the audio is left-padded
|
|
21171
|
+
* with silence and mel features are extracted with `center=true`.
|
|
21172
|
+
* Returns `{ input_ids, input_features }`.
|
|
21173
|
+
*
|
|
21174
|
+
* In streaming mode with `is_first_audio_chunk=false`, the audio chunk is
|
|
21175
|
+
* processed with `center=false` and only `{ input_features }` is returned.
|
|
21176
|
+
*
|
|
21177
|
+
* In non-streaming mode, the audio is right-padded to ensure the model
|
|
21178
|
+
* transcribes the full audio, then processed with `center=true`.
|
|
21179
|
+
* Returns `{ input_features }`.
|
|
21180
|
+
*
|
|
21181
|
+
* @param {Float32Array|Float64Array} audio The audio waveform.
|
|
21182
|
+
* @param {Object} [options]
|
|
21183
|
+
* @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
|
|
21184
|
+
* @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
|
|
21185
|
+
* @returns {Promise<Object>}
|
|
21186
|
+
*/
|
|
21187
|
+
async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
|
|
21188
|
+
validate_audio_inputs(audio, "VoxtralRealtimeProcessor");
|
|
21189
|
+
if (!is_streaming && !is_first_audio_chunk) {
|
|
21190
|
+
throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");
|
|
21191
|
+
}
|
|
21192
|
+
if (is_first_audio_chunk) {
|
|
21193
|
+
if (is_streaming) {
|
|
21194
|
+
const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
|
|
21195
|
+
const padded_audio = new Float32Array(num_left_pad_samples + audio.length);
|
|
21196
|
+
padded_audio.set(audio, num_left_pad_samples);
|
|
21197
|
+
const audio_encoding = await this.feature_extractor(padded_audio, { center: true });
|
|
21198
|
+
const num_pad_tokens = NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
|
|
21199
|
+
const num_input_tokens = 1 + num_pad_tokens;
|
|
21200
|
+
const input_ids_data = new BigInt64Array(num_input_tokens).fill(BigInt(STREAMING_PAD_TOKEN_ID));
|
|
21201
|
+
input_ids_data[0] = 1n;
|
|
21202
|
+
const input_ids = new Tensor2("int64", input_ids_data, [1, num_input_tokens]);
|
|
21203
|
+
return {
|
|
21204
|
+
input_ids,
|
|
21205
|
+
...audio_encoding
|
|
21206
|
+
};
|
|
21207
|
+
} else {
|
|
21208
|
+
const right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
|
|
21209
|
+
const padded_audio = new Float32Array(audio.length + right_pad_samples);
|
|
21210
|
+
padded_audio.set(audio);
|
|
21211
|
+
return await this.feature_extractor(padded_audio, { center: true });
|
|
21212
|
+
}
|
|
21213
|
+
} else {
|
|
21214
|
+
return await this.feature_extractor(audio, { center: false });
|
|
21215
|
+
}
|
|
21216
|
+
}
|
|
21217
|
+
};
|
|
21218
|
+
|
|
20313
21219
|
// src/models/wav2vec2/processing_wav2vec2.js
|
|
20314
21220
|
var Wav2Vec2Processor = class extends Processor {
|
|
20315
21221
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -20409,11 +21315,16 @@ function getNormalizedConfig(config) {
|
|
|
20409
21315
|
case "florence2":
|
|
20410
21316
|
case "llava_onevision":
|
|
20411
21317
|
case "idefics3":
|
|
21318
|
+
case "granite_speech":
|
|
20412
21319
|
case "ultravox":
|
|
20413
21320
|
case "voxtral":
|
|
21321
|
+
case "voxtral_realtime":
|
|
20414
21322
|
case "smolvlm":
|
|
20415
21323
|
case "gemma3n":
|
|
21324
|
+
case "lfm2_vl":
|
|
20416
21325
|
case "chatterbox":
|
|
21326
|
+
case "lighton_ocr":
|
|
21327
|
+
case "glm_ocr":
|
|
20417
21328
|
case "mistral3":
|
|
20418
21329
|
case "qwen2_5_vl":
|
|
20419
21330
|
case "qwen3_vl":
|
|
@@ -20467,10 +21378,13 @@ function getNormalizedConfig(config) {
|
|
|
20467
21378
|
case "cohere":
|
|
20468
21379
|
case "cohere2":
|
|
20469
21380
|
case "mistral":
|
|
21381
|
+
case "voxtral_realtime_text":
|
|
21382
|
+
case "voxtral_realtime_encoder":
|
|
20470
21383
|
case "starcoder2":
|
|
20471
21384
|
case "qwen2":
|
|
20472
21385
|
case "qwen2_moe":
|
|
20473
21386
|
case "qwen2_vl":
|
|
21387
|
+
case "qwen2_vl_text":
|
|
20474
21388
|
case "qwen2_5_vl_text":
|
|
20475
21389
|
case "qwen3_moe":
|
|
20476
21390
|
case "qwen3_vl_text":
|
|
@@ -20486,6 +21400,8 @@ function getNormalizedConfig(config) {
|
|
|
20486
21400
|
mapping["dim_kv"] = "head_dim";
|
|
20487
21401
|
break;
|
|
20488
21402
|
case "qwen3":
|
|
21403
|
+
case "solar_open":
|
|
21404
|
+
case "glm_ocr_text":
|
|
20489
21405
|
case "gemma":
|
|
20490
21406
|
case "gemma2":
|
|
20491
21407
|
case "vaultgemma":
|
|
@@ -20496,6 +21412,7 @@ function getNormalizedConfig(config) {
|
|
|
20496
21412
|
case "ernie4_5":
|
|
20497
21413
|
case "hunyuan_v1_dense":
|
|
20498
21414
|
case "falcon_h1":
|
|
21415
|
+
case "nemotron_h":
|
|
20499
21416
|
case "ministral":
|
|
20500
21417
|
case "ministral3":
|
|
20501
21418
|
mapping["num_heads"] = "num_key_value_heads";
|
|
@@ -20530,6 +21447,9 @@ function getNormalizedConfig(config) {
|
|
|
20530
21447
|
mapping["num_attention_heads"] = "num_attention_heads";
|
|
20531
21448
|
break;
|
|
20532
21449
|
case "youtu":
|
|
21450
|
+
case "deepseek_v3":
|
|
21451
|
+
case "glm_moe_dsa":
|
|
21452
|
+
case "mistral4":
|
|
20533
21453
|
mapping["num_heads"] = "num_key_value_heads";
|
|
20534
21454
|
mapping["num_layers"] = "num_hidden_layers";
|
|
20535
21455
|
mapping["dim_kv"] = "qk_head_dim";
|
|
@@ -20615,6 +21535,10 @@ function getNormalizedConfig(config) {
|
|
|
20615
21535
|
return normalized_config;
|
|
20616
21536
|
}
|
|
20617
21537
|
function getCacheShapes(config, options) {
|
|
21538
|
+
if (!(config instanceof PretrainedConfig)) {
|
|
21539
|
+
config = new PretrainedConfig(config);
|
|
21540
|
+
}
|
|
21541
|
+
const batch_size = options?.batch_size ?? 1;
|
|
20618
21542
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
20619
21543
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
20620
21544
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -20624,7 +21548,6 @@ function getCacheShapes(config, options) {
|
|
|
20624
21548
|
config
|
|
20625
21549
|
);
|
|
20626
21550
|
const head_dim = hidden_size / num_attention_heads;
|
|
20627
|
-
const batch_size = options?.batch_size ?? 1;
|
|
20628
21551
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
20629
21552
|
if (layer_types[i] === "full_attention") {
|
|
20630
21553
|
for (const kv of ["key", "value"]) {
|
|
@@ -20637,31 +21560,26 @@ function getCacheShapes(config, options) {
|
|
|
20637
21560
|
}
|
|
20638
21561
|
}
|
|
20639
21562
|
return cache_values;
|
|
20640
|
-
} else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
|
|
21563
|
+
} else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
|
|
20641
21564
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
20642
21565
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
20643
|
-
const
|
|
20644
|
-
const {
|
|
20645
|
-
layer_types,
|
|
20646
|
-
num_hidden_layers,
|
|
20647
|
-
num_attention_heads,
|
|
20648
|
-
num_key_value_heads,
|
|
20649
|
-
hidden_size,
|
|
20650
|
-
mamba_d_conv,
|
|
20651
|
-
mamba_n_heads,
|
|
20652
|
-
mamba_d_head,
|
|
20653
|
-
mamba_d_state,
|
|
20654
|
-
mamba_n_groups,
|
|
20655
|
-
mamba_expand,
|
|
20656
|
-
mamba_d_ssm
|
|
20657
|
-
} = (
|
|
21566
|
+
const c = (
|
|
20658
21567
|
/** @type {any} */
|
|
20659
21568
|
config
|
|
20660
21569
|
);
|
|
20661
|
-
const
|
|
20662
|
-
const
|
|
20663
|
-
const
|
|
20664
|
-
|
|
21570
|
+
const layer_types = c.layer_types ?? c.layers_block_type;
|
|
21571
|
+
const num_layers = c.num_hidden_layers ?? layer_types?.length;
|
|
21572
|
+
const num_key_value_heads = c.num_key_value_heads;
|
|
21573
|
+
const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
|
|
21574
|
+
const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
|
|
21575
|
+
const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
|
|
21576
|
+
const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
|
|
21577
|
+
const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
|
|
21578
|
+
const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
|
|
21579
|
+
const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
|
|
21580
|
+
const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
|
|
21581
|
+
const cache_values = {};
|
|
21582
|
+
for (let i = 0; i < num_layers; ++i) {
|
|
20665
21583
|
if (!layer_types || layer_types[i] === "mamba") {
|
|
20666
21584
|
cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
|
|
20667
21585
|
cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
|
|
@@ -20695,7 +21613,6 @@ function getCacheShapes(config, options) {
|
|
|
20695
21613
|
const key_dim = linear_key_head_dim * linear_num_key_heads;
|
|
20696
21614
|
const value_dim = linear_value_head_dim * linear_num_value_heads;
|
|
20697
21615
|
const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
|
|
20698
|
-
const batch_size = options?.batch_size ?? 1;
|
|
20699
21616
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
20700
21617
|
if (layer_types[i] === "full_attention") {
|
|
20701
21618
|
for (const kv of ["key", "value"]) {
|
|
@@ -20721,12 +21638,16 @@ function getCacheShapes(config, options) {
|
|
|
20721
21638
|
}
|
|
20722
21639
|
}
|
|
20723
21640
|
return cache_values;
|
|
20724
|
-
} else if (["qwen3_5", "qwen3_5_moe"].includes(config.model_type)) {
|
|
20725
|
-
|
|
20726
|
-
|
|
20727
|
-
|
|
20728
|
-
|
|
20729
|
-
|
|
21641
|
+
} else if (["lfm2_vl", "qwen3_5", "qwen3_5_moe", "voxtral_realtime"].includes(config.model_type)) {
|
|
21642
|
+
let subConfig;
|
|
21643
|
+
if (config.model_type === "voxtral_realtime" && options?.session_name === "audio_encoder") {
|
|
21644
|
+
subConfig = /** @type {any} */
|
|
21645
|
+
config.audio_config;
|
|
21646
|
+
} else {
|
|
21647
|
+
subConfig = /** @type {any} */
|
|
21648
|
+
config.text_config;
|
|
21649
|
+
}
|
|
21650
|
+
return getCacheShapes(subConfig, options);
|
|
20730
21651
|
}
|
|
20731
21652
|
return getKeyValueShapes(config, options);
|
|
20732
21653
|
}
|
|
@@ -20892,7 +21813,7 @@ async function getModelDataFiles(pretrained_model_name_or_path, fileName, suffix
|
|
|
20892
21813
|
}
|
|
20893
21814
|
|
|
20894
21815
|
// src/models/session.js
|
|
20895
|
-
async function getSession(pretrained_model_name_or_path, fileName, options,
|
|
21816
|
+
async function getSession(pretrained_model_name_or_path, fileName, options, cache_config = false, session_name = void 0) {
|
|
20896
21817
|
let custom_config = options.config?.["transformers.js_config"] ?? {};
|
|
20897
21818
|
const selectedDevice = (
|
|
20898
21819
|
/** @type {import("../utils/devices.js").DeviceType} */
|
|
@@ -20950,9 +21871,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
20950
21871
|
if (externalData.length > 0 && !apis.IS_NODE_ENV) {
|
|
20951
21872
|
session_options.externalData = externalData;
|
|
20952
21873
|
}
|
|
20953
|
-
if (
|
|
21874
|
+
if (cache_config && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
|
|
20954
21875
|
const shapes = getCacheShapes(options.config, {
|
|
20955
|
-
prefix: "present"
|
|
21876
|
+
prefix: "present",
|
|
21877
|
+
session_name
|
|
20956
21878
|
});
|
|
20957
21879
|
if (Object.keys(shapes).length > 0 && !isONNXProxy()) {
|
|
20958
21880
|
const preferredOutputLocation = {};
|
|
@@ -20970,15 +21892,17 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
20970
21892
|
};
|
|
20971
21893
|
return { buffer_or_path, session_options, session_config };
|
|
20972
21894
|
}
|
|
20973
|
-
async function constructSessions(pretrained_model_name_or_path, names, options,
|
|
21895
|
+
async function constructSessions(pretrained_model_name_or_path, names, options, cache_sessions = void 0) {
|
|
20974
21896
|
return Object.fromEntries(
|
|
20975
21897
|
await Promise.all(
|
|
20976
21898
|
Object.keys(names).map(async (name) => {
|
|
21899
|
+
const cache_config = cache_sessions?.[name] ?? false;
|
|
20977
21900
|
const { buffer_or_path, session_options, session_config } = await getSession(
|
|
20978
21901
|
pretrained_model_name_or_path,
|
|
20979
21902
|
names[name],
|
|
20980
21903
|
options,
|
|
20981
|
-
|
|
21904
|
+
cache_config,
|
|
21905
|
+
name
|
|
20982
21906
|
);
|
|
20983
21907
|
const session = await createInferenceSession(buffer_or_path, session_options, session_config);
|
|
20984
21908
|
return [name, session];
|
|
@@ -22278,19 +23202,71 @@ var BeamSearchSampler = class extends LogitsSampler {
|
|
|
22278
23202
|
}
|
|
22279
23203
|
};
|
|
22280
23204
|
|
|
23205
|
+
// src/cache_utils.js
|
|
23206
|
+
var _DynamicCache = class {
|
|
23207
|
+
/**
|
|
23208
|
+
* Create a DynamicCache, optionally pre-populated with entries.
|
|
23209
|
+
* @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
|
|
23210
|
+
*/
|
|
23211
|
+
constructor(entries) {
|
|
23212
|
+
if (!entries) return;
|
|
23213
|
+
for (const key in entries) {
|
|
23214
|
+
if (key in this) {
|
|
23215
|
+
throw new TypeError(`Key "${key}" conflicts with an existing property on DynamicCache`);
|
|
23216
|
+
}
|
|
23217
|
+
const value = entries[key];
|
|
23218
|
+
if (!(value instanceof Tensor2)) {
|
|
23219
|
+
throw new TypeError(`Expected a Tensor for key "${key}", got ${typeof value}`);
|
|
23220
|
+
}
|
|
23221
|
+
this[key] = value;
|
|
23222
|
+
}
|
|
23223
|
+
}
|
|
23224
|
+
/**
|
|
23225
|
+
* Get the cached sequence length. This requires at least one attention cache entry to be present.
|
|
23226
|
+
* @returns {number} The past sequence length.
|
|
23227
|
+
*/
|
|
23228
|
+
get_seq_length() {
|
|
23229
|
+
const self2 = (
|
|
23230
|
+
/** @type {any} */
|
|
23231
|
+
this
|
|
23232
|
+
);
|
|
23233
|
+
for (const name in self2) {
|
|
23234
|
+
if (name.startsWith("past_key_values.")) {
|
|
23235
|
+
return self2[name].dims.at(-2);
|
|
23236
|
+
}
|
|
23237
|
+
}
|
|
23238
|
+
throw new Error("Unable to determine sequence length from the cache.");
|
|
23239
|
+
}
|
|
23240
|
+
/**
|
|
23241
|
+
* Dispose all contained tensors whose data resides on the GPU.
|
|
23242
|
+
* Returns a promise that resolves when all disposals are complete.
|
|
23243
|
+
* @returns {Promise<void>} Promise that resolves when all GPU tensors are disposed.
|
|
23244
|
+
*/
|
|
23245
|
+
async dispose() {
|
|
23246
|
+
const promises = [];
|
|
23247
|
+
for (
|
|
23248
|
+
const t of
|
|
23249
|
+
/** @type {Tensor[]} */
|
|
23250
|
+
Object.values(this)
|
|
23251
|
+
) {
|
|
23252
|
+
if (t.location === "gpu-buffer") {
|
|
23253
|
+
promises.push(t.dispose());
|
|
23254
|
+
}
|
|
23255
|
+
}
|
|
23256
|
+
await Promise.all(promises);
|
|
23257
|
+
}
|
|
23258
|
+
};
|
|
23259
|
+
var DynamicCache = (
|
|
23260
|
+
/** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */
|
|
23261
|
+
/** @type {unknown} */
|
|
23262
|
+
_DynamicCache
|
|
23263
|
+
);
|
|
23264
|
+
|
|
22281
23265
|
// src/models/modeling_utils.js
|
|
22282
23266
|
var MODEL_MAPPING_NAMES = null;
|
|
22283
23267
|
function registerTaskMappings(mappings) {
|
|
22284
23268
|
MODEL_MAPPING_NAMES = mappings;
|
|
22285
23269
|
}
|
|
22286
|
-
function getPastLength(past_key_values) {
|
|
22287
|
-
for (const name in past_key_values) {
|
|
22288
|
-
if (name.startsWith("past_key_values.")) {
|
|
22289
|
-
return past_key_values[name].dims.at(-2);
|
|
22290
|
-
}
|
|
22291
|
-
}
|
|
22292
|
-
return Object.values(past_key_values)[0].dims.at(-2);
|
|
22293
|
-
}
|
|
22294
23270
|
function toI64Tensor(items) {
|
|
22295
23271
|
if (items instanceof Tensor2) {
|
|
22296
23272
|
return items;
|
|
@@ -22331,71 +23307,181 @@ var MODEL_TYPES = {
|
|
|
22331
23307
|
AutoEncoder: 12,
|
|
22332
23308
|
ImageAudioTextToText: 13,
|
|
22333
23309
|
Supertonic: 14,
|
|
22334
|
-
Chatterbox: 15
|
|
23310
|
+
Chatterbox: 15,
|
|
23311
|
+
MultimodalLanguageModelOnly: 16,
|
|
23312
|
+
VoxtralRealtime: 17
|
|
22335
23313
|
};
|
|
22336
23314
|
var MODEL_TYPE_CONFIG = {
|
|
22337
23315
|
[MODEL_TYPES.DecoderOnly]: {
|
|
22338
23316
|
can_generate: true,
|
|
22339
23317
|
forward: decoder_forward,
|
|
22340
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
23318
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
23319
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" }),
|
|
23320
|
+
cache_sessions: { model: true },
|
|
23321
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22341
23322
|
},
|
|
22342
23323
|
[MODEL_TYPES.DecoderOnlyWithoutHead]: {
|
|
22343
23324
|
can_generate: false,
|
|
22344
23325
|
forward: decoder_forward,
|
|
22345
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
23326
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
23327
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
22346
23328
|
},
|
|
22347
23329
|
[MODEL_TYPES.Seq2Seq]: {
|
|
22348
23330
|
can_generate: true,
|
|
22349
23331
|
forward: seq2seq_forward,
|
|
22350
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
23332
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
23333
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
23334
|
+
cache_sessions: { decoder_model_merged: true },
|
|
23335
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22351
23336
|
},
|
|
22352
23337
|
[MODEL_TYPES.Vision2Seq]: {
|
|
22353
23338
|
can_generate: true,
|
|
22354
23339
|
forward: seq2seq_forward,
|
|
22355
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
23340
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
23341
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
23342
|
+
cache_sessions: { decoder_model_merged: true },
|
|
23343
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22356
23344
|
},
|
|
22357
23345
|
[MODEL_TYPES.Musicgen]: {
|
|
22358
23346
|
can_generate: true,
|
|
22359
|
-
forward: seq2seq_forward
|
|
23347
|
+
forward: seq2seq_forward,
|
|
23348
|
+
sessions: () => ({
|
|
23349
|
+
model: "text_encoder",
|
|
23350
|
+
decoder_model_merged: "decoder_model_merged",
|
|
23351
|
+
encodec_decode: "encodec_decode"
|
|
23352
|
+
}),
|
|
23353
|
+
cache_sessions: { decoder_model_merged: true },
|
|
23354
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22360
23355
|
},
|
|
22361
23356
|
[MODEL_TYPES.EncoderDecoder]: {
|
|
22362
23357
|
can_generate: false,
|
|
22363
|
-
forward: seq2seq_forward
|
|
23358
|
+
forward: seq2seq_forward,
|
|
23359
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
23360
|
+
cache_sessions: { decoder_model_merged: true }
|
|
23361
|
+
},
|
|
23362
|
+
[MODEL_TYPES.MaskGeneration]: {
|
|
23363
|
+
sessions: () => ({ model: "vision_encoder", prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder" })
|
|
22364
23364
|
},
|
|
22365
23365
|
[MODEL_TYPES.ImageTextToText]: {
|
|
22366
23366
|
can_generate: true,
|
|
22367
23367
|
forward: image_text_to_text_forward,
|
|
22368
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
23368
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
23369
|
+
sessions: (config) => {
|
|
23370
|
+
const s = {
|
|
23371
|
+
embed_tokens: "embed_tokens",
|
|
23372
|
+
vision_encoder: "vision_encoder",
|
|
23373
|
+
decoder_model_merged: "decoder_model_merged"
|
|
23374
|
+
};
|
|
23375
|
+
if (config.is_encoder_decoder) s["model"] = "encoder_model";
|
|
23376
|
+
return s;
|
|
23377
|
+
},
|
|
23378
|
+
cache_sessions: { decoder_model_merged: true },
|
|
23379
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22369
23380
|
},
|
|
22370
23381
|
[MODEL_TYPES.AudioTextToText]: {
|
|
22371
23382
|
can_generate: true,
|
|
22372
23383
|
forward: audio_text_to_text_forward,
|
|
22373
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
23384
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
23385
|
+
sessions: () => ({
|
|
23386
|
+
embed_tokens: "embed_tokens",
|
|
23387
|
+
audio_encoder: "audio_encoder",
|
|
23388
|
+
decoder_model_merged: "decoder_model_merged"
|
|
23389
|
+
}),
|
|
23390
|
+
cache_sessions: { decoder_model_merged: true },
|
|
23391
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22374
23392
|
},
|
|
22375
|
-
[MODEL_TYPES.
|
|
23393
|
+
[MODEL_TYPES.ImageAudioTextToText]: {
|
|
22376
23394
|
can_generate: true,
|
|
22377
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
23395
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
23396
|
+
sessions: () => ({
|
|
23397
|
+
embed_tokens: "embed_tokens",
|
|
23398
|
+
audio_encoder: "audio_encoder",
|
|
23399
|
+
vision_encoder: "vision_encoder",
|
|
23400
|
+
decoder_model_merged: "decoder_model_merged"
|
|
23401
|
+
}),
|
|
23402
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22378
23403
|
},
|
|
22379
|
-
[MODEL_TYPES.
|
|
23404
|
+
[MODEL_TYPES.Phi3V]: {
|
|
22380
23405
|
can_generate: true,
|
|
22381
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
23406
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
23407
|
+
sessions: () => ({
|
|
23408
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
23409
|
+
model: "model",
|
|
23410
|
+
vision_encoder: "vision_encoder"
|
|
23411
|
+
}),
|
|
23412
|
+
cache_sessions: { model: true },
|
|
23413
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22382
23414
|
},
|
|
22383
23415
|
[MODEL_TYPES.MultiModality]: {
|
|
22384
|
-
can_generate: true
|
|
23416
|
+
can_generate: true,
|
|
23417
|
+
sessions: () => ({
|
|
23418
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
23419
|
+
model: "language_model",
|
|
23420
|
+
lm_head: "lm_head",
|
|
23421
|
+
gen_head: "gen_head",
|
|
23422
|
+
gen_img_embeds: "gen_img_embeds",
|
|
23423
|
+
image_decode: "image_decode"
|
|
23424
|
+
}),
|
|
23425
|
+
cache_sessions: { model: true },
|
|
23426
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22385
23427
|
},
|
|
22386
23428
|
[MODEL_TYPES.AutoEncoder]: {
|
|
22387
23429
|
can_generate: false,
|
|
22388
|
-
forward: auto_encoder_forward
|
|
23430
|
+
forward: auto_encoder_forward,
|
|
23431
|
+
sessions: () => ({ encoder_model: "encoder_model", decoder_model: "decoder_model" })
|
|
23432
|
+
},
|
|
23433
|
+
[MODEL_TYPES.Supertonic]: {
|
|
23434
|
+
sessions: () => ({
|
|
23435
|
+
text_encoder: "text_encoder",
|
|
23436
|
+
latent_denoiser: "latent_denoiser",
|
|
23437
|
+
voice_decoder: "voice_decoder"
|
|
23438
|
+
})
|
|
22389
23439
|
},
|
|
22390
23440
|
[MODEL_TYPES.Chatterbox]: {
|
|
22391
23441
|
can_generate: true,
|
|
22392
|
-
forward: encoder_forward
|
|
23442
|
+
forward: encoder_forward,
|
|
23443
|
+
sessions: () => ({
|
|
23444
|
+
embed_tokens: "embed_tokens",
|
|
23445
|
+
speech_encoder: "speech_encoder",
|
|
23446
|
+
model: "language_model",
|
|
23447
|
+
conditional_decoder: "conditional_decoder"
|
|
23448
|
+
}),
|
|
23449
|
+
cache_sessions: { model: true },
|
|
23450
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23451
|
+
},
|
|
23452
|
+
[MODEL_TYPES.MultimodalLanguageModelOnly]: {
|
|
23453
|
+
can_generate: true,
|
|
23454
|
+
forward: image_text_to_text_forward,
|
|
23455
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
23456
|
+
sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
|
|
23457
|
+
cache_sessions: { decoder_model_merged: true },
|
|
23458
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23459
|
+
},
|
|
23460
|
+
[MODEL_TYPES.VoxtralRealtime]: {
|
|
23461
|
+
can_generate: true,
|
|
23462
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
23463
|
+
sessions: () => ({
|
|
23464
|
+
embed_tokens: "embed_tokens",
|
|
23465
|
+
audio_encoder: "audio_encoder",
|
|
23466
|
+
decoder_model_merged: "decoder_model_merged"
|
|
23467
|
+
}),
|
|
23468
|
+
cache_sessions: { decoder_model_merged: true, audio_encoder: true },
|
|
23469
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22393
23470
|
},
|
|
22394
23471
|
default: {
|
|
22395
23472
|
can_generate: false,
|
|
22396
|
-
forward: encoder_forward
|
|
23473
|
+
forward: encoder_forward,
|
|
23474
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
22397
23475
|
}
|
|
22398
23476
|
};
|
|
23477
|
+
function getSessionsConfig(modelType, config, options = {}) {
|
|
23478
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
23479
|
+
return {
|
|
23480
|
+
sessions: typeConfig.sessions(config, options),
|
|
23481
|
+
cache_sessions: typeConfig.cache_sessions,
|
|
23482
|
+
optional_configs: typeConfig.optional_configs
|
|
23483
|
+
};
|
|
23484
|
+
}
|
|
22399
23485
|
var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
|
|
22400
23486
|
var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
|
|
22401
23487
|
var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
|
|
@@ -22481,300 +23567,78 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
22481
23567
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
|
|
22482
23568
|
const modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
22483
23569
|
config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
|
|
22484
|
-
|
|
22485
|
-
if (modelType ===
|
|
22486
|
-
|
|
22487
|
-
|
|
22488
|
-
|
|
22489
|
-
{
|
|
22490
|
-
|
|
22491
|
-
},
|
|
22492
|
-
options,
|
|
22493
|
-
"model"
|
|
22494
|
-
),
|
|
22495
|
-
get_optional_configs(
|
|
22496
|
-
pretrained_model_name_or_path,
|
|
22497
|
-
{
|
|
22498
|
-
generation_config: "generation_config.json"
|
|
22499
|
-
},
|
|
22500
|
-
options
|
|
22501
|
-
)
|
|
22502
|
-
]);
|
|
22503
|
-
} else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
|
|
22504
|
-
info = await Promise.all([
|
|
22505
|
-
constructSessions(
|
|
22506
|
-
pretrained_model_name_or_path,
|
|
22507
|
-
{
|
|
22508
|
-
model: "encoder_model",
|
|
22509
|
-
decoder_model_merged: "decoder_model_merged"
|
|
22510
|
-
},
|
|
22511
|
-
options,
|
|
22512
|
-
"decoder_model_merged"
|
|
22513
|
-
),
|
|
22514
|
-
get_optional_configs(
|
|
22515
|
-
pretrained_model_name_or_path,
|
|
22516
|
-
{
|
|
22517
|
-
generation_config: "generation_config.json"
|
|
22518
|
-
},
|
|
22519
|
-
options
|
|
22520
|
-
)
|
|
22521
|
-
]);
|
|
22522
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
22523
|
-
info = await Promise.all([
|
|
22524
|
-
constructSessions(
|
|
22525
|
-
pretrained_model_name_or_path,
|
|
22526
|
-
{
|
|
22527
|
-
model: "vision_encoder",
|
|
22528
|
-
prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder"
|
|
22529
|
-
},
|
|
22530
|
-
options
|
|
22531
|
-
)
|
|
22532
|
-
]);
|
|
22533
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
22534
|
-
info = await Promise.all([
|
|
22535
|
-
constructSessions(
|
|
22536
|
-
pretrained_model_name_or_path,
|
|
22537
|
-
{
|
|
22538
|
-
model: "encoder_model",
|
|
22539
|
-
decoder_model_merged: "decoder_model_merged"
|
|
22540
|
-
},
|
|
22541
|
-
options,
|
|
22542
|
-
"decoder_model_merged"
|
|
22543
|
-
)
|
|
22544
|
-
]);
|
|
22545
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
22546
|
-
const sessions = {
|
|
22547
|
-
embed_tokens: "embed_tokens",
|
|
22548
|
-
vision_encoder: "vision_encoder",
|
|
22549
|
-
decoder_model_merged: "decoder_model_merged"
|
|
22550
|
-
};
|
|
22551
|
-
if (config.is_encoder_decoder) {
|
|
22552
|
-
sessions["model"] = "encoder_model";
|
|
23570
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
23571
|
+
if (modelType === void 0) {
|
|
23572
|
+
const type = modelName ?? config?.model_type;
|
|
23573
|
+
if (type !== "custom") {
|
|
23574
|
+
logger.warn(
|
|
23575
|
+
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
23576
|
+
);
|
|
22553
23577
|
}
|
|
22554
|
-
|
|
22555
|
-
|
|
22556
|
-
|
|
22557
|
-
|
|
22558
|
-
|
|
22559
|
-
|
|
22560
|
-
|
|
22561
|
-
|
|
22562
|
-
|
|
22563
|
-
|
|
22564
|
-
|
|
22565
|
-
|
|
22566
|
-
|
|
22567
|
-
|
|
22568
|
-
|
|
22569
|
-
|
|
22570
|
-
|
|
22571
|
-
|
|
22572
|
-
|
|
22573
|
-
|
|
22574
|
-
|
|
22575
|
-
|
|
22576
|
-
|
|
22577
|
-
|
|
22578
|
-
|
|
22579
|
-
|
|
22580
|
-
|
|
22581
|
-
|
|
22582
|
-
|
|
22583
|
-
|
|
22584
|
-
|
|
22585
|
-
|
|
22586
|
-
|
|
22587
|
-
|
|
22588
|
-
|
|
22589
|
-
|
|
22590
|
-
|
|
22591
|
-
|
|
22592
|
-
|
|
22593
|
-
|
|
22594
|
-
|
|
22595
|
-
|
|
22596
|
-
|
|
22597
|
-
|
|
22598
|
-
|
|
22599
|
-
|
|
22600
|
-
|
|
22601
|
-
|
|
22602
|
-
|
|
22603
|
-
|
|
22604
|
-
|
|
22605
|
-
|
|
22606
|
-
|
|
22607
|
-
|
|
22608
|
-
|
|
22609
|
-
|
|
22610
|
-
|
|
22611
|
-
|
|
22612
|
-
|
|
22613
|
-
|
|
22614
|
-
|
|
22615
|
-
|
|
22616
|
-
|
|
22617
|
-
|
|
22618
|
-
info = await Promise.all([
|
|
22619
|
-
constructSessions(
|
|
22620
|
-
pretrained_model_name_or_path,
|
|
22621
|
-
{
|
|
22622
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
22623
|
-
model: "language_model",
|
|
22624
|
-
lm_head: "lm_head",
|
|
22625
|
-
gen_head: "gen_head",
|
|
22626
|
-
gen_img_embeds: "gen_img_embeds",
|
|
22627
|
-
image_decode: "image_decode"
|
|
22628
|
-
},
|
|
22629
|
-
options,
|
|
22630
|
-
"model"
|
|
22631
|
-
),
|
|
22632
|
-
get_optional_configs(
|
|
22633
|
-
pretrained_model_name_or_path,
|
|
22634
|
-
{
|
|
22635
|
-
generation_config: "generation_config.json"
|
|
22636
|
-
},
|
|
22637
|
-
options
|
|
22638
|
-
)
|
|
22639
|
-
]);
|
|
22640
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
22641
|
-
info = await Promise.all([
|
|
22642
|
-
constructSessions(
|
|
22643
|
-
pretrained_model_name_or_path,
|
|
22644
|
-
{
|
|
22645
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
22646
|
-
model: "model",
|
|
22647
|
-
vision_encoder: "vision_encoder"
|
|
22648
|
-
},
|
|
22649
|
-
options,
|
|
22650
|
-
"model"
|
|
22651
|
-
),
|
|
22652
|
-
get_optional_configs(
|
|
22653
|
-
pretrained_model_name_or_path,
|
|
22654
|
-
{
|
|
22655
|
-
generation_config: "generation_config.json"
|
|
22656
|
-
},
|
|
22657
|
-
options
|
|
22658
|
-
)
|
|
22659
|
-
]);
|
|
22660
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
22661
|
-
info = await Promise.all([
|
|
22662
|
-
constructSessions(
|
|
22663
|
-
pretrained_model_name_or_path,
|
|
22664
|
-
{
|
|
22665
|
-
embed_tokens: "embed_tokens",
|
|
22666
|
-
speech_encoder: "speech_encoder",
|
|
22667
|
-
model: "language_model",
|
|
22668
|
-
conditional_decoder: "conditional_decoder"
|
|
22669
|
-
},
|
|
22670
|
-
options,
|
|
22671
|
-
"model"
|
|
22672
|
-
),
|
|
22673
|
-
get_optional_configs(
|
|
22674
|
-
pretrained_model_name_or_path,
|
|
22675
|
-
{
|
|
22676
|
-
generation_config: "generation_config.json"
|
|
22677
|
-
},
|
|
22678
|
-
options
|
|
22679
|
-
)
|
|
22680
|
-
]);
|
|
22681
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
22682
|
-
info = await Promise.all([
|
|
22683
|
-
constructSessions(
|
|
22684
|
-
pretrained_model_name_or_path,
|
|
22685
|
-
{
|
|
22686
|
-
encoder_model: "encoder_model",
|
|
22687
|
-
decoder_model: "decoder_model"
|
|
22688
|
-
},
|
|
22689
|
-
options
|
|
22690
|
-
)
|
|
22691
|
-
]);
|
|
22692
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
22693
|
-
info = await Promise.all([
|
|
22694
|
-
constructSessions(
|
|
22695
|
-
pretrained_model_name_or_path,
|
|
22696
|
-
{
|
|
22697
|
-
text_encoder: "text_encoder",
|
|
22698
|
-
latent_denoiser: "latent_denoiser",
|
|
22699
|
-
voice_decoder: "voice_decoder"
|
|
22700
|
-
},
|
|
22701
|
-
options
|
|
22702
|
-
)
|
|
22703
|
-
]);
|
|
22704
|
-
} else {
|
|
22705
|
-
if (modelType === void 0) {
|
|
22706
|
-
const type = modelName ?? config?.model_type;
|
|
22707
|
-
if (type !== "custom") {
|
|
22708
|
-
logger.warn(
|
|
22709
|
-
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
22710
|
-
);
|
|
22711
|
-
}
|
|
22712
|
-
}
|
|
22713
|
-
info = await Promise.all([
|
|
22714
|
-
constructSessions(
|
|
22715
|
-
pretrained_model_name_or_path,
|
|
22716
|
-
{
|
|
22717
|
-
model: options.model_file_name ?? "model"
|
|
22718
|
-
},
|
|
22719
|
-
options
|
|
22720
|
-
)
|
|
22721
|
-
]);
|
|
22722
|
-
}
|
|
22723
|
-
return new this(config, ...info);
|
|
22724
|
-
}
|
|
22725
|
-
/**
|
|
22726
|
-
* Runs the model with the provided inputs
|
|
22727
|
-
* @param {Object} model_inputs Object containing input tensors
|
|
22728
|
-
* @returns {Promise<Object>} Object containing output tensors
|
|
22729
|
-
*/
|
|
22730
|
-
async _call(model_inputs) {
|
|
22731
|
-
return await this.forward(model_inputs);
|
|
22732
|
-
}
|
|
22733
|
-
/**
|
|
22734
|
-
* Forward method for a pretrained model. If not overridden by a subclass, the correct forward method
|
|
22735
|
-
* will be chosen based on the model type.
|
|
22736
|
-
* @param {Object} model_inputs The input data to the model in the format specified in the ONNX model.
|
|
22737
|
-
* @returns {Promise<Object>} The output data from the model in the format specified in the ONNX model.
|
|
22738
|
-
* @throws {Error} This method must be implemented in subclasses.
|
|
22739
|
-
*/
|
|
22740
|
-
async forward(model_inputs) {
|
|
22741
|
-
return await this._forward(this, model_inputs);
|
|
22742
|
-
}
|
|
22743
|
-
/**
|
|
22744
|
-
* Get the model's generation config, if it exists.
|
|
22745
|
-
* @returns {GenerationConfig|null} The model's generation config if it exists, otherwise `null`.
|
|
22746
|
-
*/
|
|
22747
|
-
get generation_config() {
|
|
22748
|
-
return this.configs?.generation_config ?? null;
|
|
22749
|
-
}
|
|
22750
|
-
/**
|
|
22751
|
-
* @param {GenerationConfig} generation_config
|
|
22752
|
-
* @param {number} input_ids_seq_length The starting sequence length for the input ids.
|
|
22753
|
-
* @returns {LogitsProcessorList}
|
|
22754
|
-
* @private
|
|
22755
|
-
*/
|
|
22756
|
-
_get_logits_processor(generation_config, input_ids_seq_length, logits_processor = null) {
|
|
22757
|
-
const processors = new LogitsProcessorList();
|
|
22758
|
-
if (generation_config.repetition_penalty !== null && generation_config.repetition_penalty !== 1) {
|
|
22759
|
-
processors.push(new RepetitionPenaltyLogitsProcessor(generation_config.repetition_penalty));
|
|
22760
|
-
}
|
|
22761
|
-
if (generation_config.no_repeat_ngram_size !== null && generation_config.no_repeat_ngram_size > 0) {
|
|
22762
|
-
processors.push(new NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size));
|
|
22763
|
-
}
|
|
22764
|
-
if (generation_config.bad_words_ids !== null) {
|
|
22765
|
-
processors.push(
|
|
22766
|
-
new NoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id)
|
|
22767
|
-
);
|
|
22768
|
-
}
|
|
22769
|
-
if (generation_config.min_length !== null && generation_config.eos_token_id !== null && generation_config.min_length > 0) {
|
|
22770
|
-
processors.push(new MinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id));
|
|
22771
|
-
}
|
|
22772
|
-
if (generation_config.min_new_tokens !== null && generation_config.eos_token_id !== null && generation_config.min_new_tokens > 0) {
|
|
22773
|
-
processors.push(
|
|
22774
|
-
new MinNewTokensLengthLogitsProcessor(
|
|
22775
|
-
input_ids_seq_length,
|
|
22776
|
-
generation_config.min_new_tokens,
|
|
22777
|
-
generation_config.eos_token_id
|
|
23578
|
+
}
|
|
23579
|
+
const sessions = typeConfig.sessions(config, options);
|
|
23580
|
+
const promises = [
|
|
23581
|
+
constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
|
|
23582
|
+
];
|
|
23583
|
+
if (typeConfig.optional_configs) {
|
|
23584
|
+
promises.push(get_optional_configs(pretrained_model_name_or_path, typeConfig.optional_configs, options));
|
|
23585
|
+
}
|
|
23586
|
+
const info = await Promise.all(promises);
|
|
23587
|
+
return new this(config, ...info);
|
|
23588
|
+
}
|
|
23589
|
+
/**
|
|
23590
|
+
* Runs the model with the provided inputs
|
|
23591
|
+
* @param {Object} model_inputs Object containing input tensors
|
|
23592
|
+
* @returns {Promise<Object>} Object containing output tensors
|
|
23593
|
+
*/
|
|
23594
|
+
async _call(model_inputs) {
|
|
23595
|
+
return await this.forward(model_inputs);
|
|
23596
|
+
}
|
|
23597
|
+
/**
|
|
23598
|
+
* Forward method for a pretrained model. If not overridden by a subclass, the correct forward method
|
|
23599
|
+
* will be chosen based on the model type.
|
|
23600
|
+
* @param {Object} model_inputs The input data to the model in the format specified in the ONNX model.
|
|
23601
|
+
* @returns {Promise<Object>} The output data from the model in the format specified in the ONNX model.
|
|
23602
|
+
* @throws {Error} This method must be implemented in subclasses.
|
|
23603
|
+
*/
|
|
23604
|
+
async forward(model_inputs) {
|
|
23605
|
+
return await this._forward(this, model_inputs);
|
|
23606
|
+
}
|
|
23607
|
+
/**
|
|
23608
|
+
* Get the model's generation config, if it exists.
|
|
23609
|
+
* @returns {GenerationConfig|null} The model's generation config if it exists, otherwise `null`.
|
|
23610
|
+
*/
|
|
23611
|
+
get generation_config() {
|
|
23612
|
+
return this.configs?.generation_config ?? null;
|
|
23613
|
+
}
|
|
23614
|
+
/**
|
|
23615
|
+
* @param {GenerationConfig} generation_config
|
|
23616
|
+
* @param {number} input_ids_seq_length The starting sequence length for the input ids.
|
|
23617
|
+
* @returns {LogitsProcessorList}
|
|
23618
|
+
* @private
|
|
23619
|
+
*/
|
|
23620
|
+
_get_logits_processor(generation_config, input_ids_seq_length, logits_processor = null) {
|
|
23621
|
+
const processors = new LogitsProcessorList();
|
|
23622
|
+
if (generation_config.repetition_penalty !== null && generation_config.repetition_penalty !== 1) {
|
|
23623
|
+
processors.push(new RepetitionPenaltyLogitsProcessor(generation_config.repetition_penalty));
|
|
23624
|
+
}
|
|
23625
|
+
if (generation_config.no_repeat_ngram_size !== null && generation_config.no_repeat_ngram_size > 0) {
|
|
23626
|
+
processors.push(new NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size));
|
|
23627
|
+
}
|
|
23628
|
+
if (generation_config.bad_words_ids !== null) {
|
|
23629
|
+
processors.push(
|
|
23630
|
+
new NoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id)
|
|
23631
|
+
);
|
|
23632
|
+
}
|
|
23633
|
+
if (generation_config.min_length !== null && generation_config.eos_token_id !== null && generation_config.min_length > 0) {
|
|
23634
|
+
processors.push(new MinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id));
|
|
23635
|
+
}
|
|
23636
|
+
if (generation_config.min_new_tokens !== null && generation_config.eos_token_id !== null && generation_config.min_new_tokens > 0) {
|
|
23637
|
+
processors.push(
|
|
23638
|
+
new MinNewTokensLengthLogitsProcessor(
|
|
23639
|
+
input_ids_seq_length,
|
|
23640
|
+
generation_config.min_new_tokens,
|
|
23641
|
+
generation_config.eos_token_id
|
|
22778
23642
|
)
|
|
22779
23643
|
);
|
|
22780
23644
|
}
|
|
@@ -22918,7 +23782,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
22918
23782
|
* @param {Tensor} [params.inputs=null]
|
|
22919
23783
|
* @param {number} [params.bos_token_id=null]
|
|
22920
23784
|
* @param {Record<string, Tensor|number[]>} [params.model_kwargs]
|
|
22921
|
-
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor
|
|
23785
|
+
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor> & {past_key_values?: DynamicCache}, model_input_name: string}} The model-specific inputs for generation.
|
|
22922
23786
|
*/
|
|
22923
23787
|
_prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) {
|
|
22924
23788
|
const model_inputs = pick(model_kwargs, this.forward_params);
|
|
@@ -23159,11 +24023,12 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23159
24023
|
}
|
|
23160
24024
|
}
|
|
23161
24025
|
/**
|
|
23162
|
-
* Returns
|
|
24026
|
+
* Returns a DynamicCache containing past key values from the given decoder results object.
|
|
23163
24027
|
*
|
|
23164
24028
|
* @param {Object} decoderResults The decoder results object.
|
|
23165
|
-
* @param {
|
|
23166
|
-
* @
|
|
24029
|
+
* @param {DynamicCache} pastKeyValues The previous past key values.
|
|
24030
|
+
* @param {boolean} [disposeEncoderPKVs=false] Whether to dispose encoder past key values.
|
|
24031
|
+
* @returns {DynamicCache} A new DynamicCache containing the updated past key values.
|
|
23167
24032
|
*/
|
|
23168
24033
|
getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
|
|
23169
24034
|
const pkvs = /* @__PURE__ */ Object.create(null);
|
|
@@ -23184,7 +24049,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23184
24049
|
}
|
|
23185
24050
|
}
|
|
23186
24051
|
}
|
|
23187
|
-
return pkvs;
|
|
24052
|
+
return new DynamicCache(pkvs);
|
|
23188
24053
|
}
|
|
23189
24054
|
/**
|
|
23190
24055
|
* Returns an object containing attentions from the given model output object.
|
|
@@ -23209,8 +24074,8 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23209
24074
|
/**
|
|
23210
24075
|
* Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values.
|
|
23211
24076
|
*
|
|
23212
|
-
* @param {
|
|
23213
|
-
* @param {
|
|
24077
|
+
* @param {Record<string, any>} decoderFeeds The decoder feeds object to add past key values to.
|
|
24078
|
+
* @param {DynamicCache|null} pastKeyValues The cache containing past key values.
|
|
23214
24079
|
*/
|
|
23215
24080
|
addPastKeyValues(decoderFeeds, pastKeyValues) {
|
|
23216
24081
|
if (pastKeyValues) {
|
|
@@ -23227,14 +24092,29 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23227
24092
|
}
|
|
23228
24093
|
}
|
|
23229
24094
|
}
|
|
23230
|
-
|
|
23231
|
-
|
|
24095
|
+
/**
|
|
24096
|
+
* Helper function to select valid inputs and run through the appropriate encoder (vision, text, audio) based on the input type.
|
|
24097
|
+
* @param {string} sessionName
|
|
24098
|
+
* @param {Record<string, Tensor>} inputs
|
|
24099
|
+
* @param {string} outputName
|
|
24100
|
+
* @private
|
|
24101
|
+
*/
|
|
24102
|
+
async _encode_input(sessionName, inputs, outputName) {
|
|
24103
|
+
if (!Object.hasOwn(this.sessions, sessionName)) {
|
|
24104
|
+
throw new Error(`Model does not have a ${sessionName} session.`);
|
|
24105
|
+
}
|
|
24106
|
+
const session = this.sessions[sessionName];
|
|
24107
|
+
const output = await sessionRun(session, pick(inputs, session.inputNames));
|
|
24108
|
+
return output[outputName];
|
|
24109
|
+
}
|
|
24110
|
+
async encode_image(inputs) {
|
|
24111
|
+
return this._encode_input("vision_encoder", inputs, "image_features");
|
|
23232
24112
|
}
|
|
23233
|
-
async encode_text(
|
|
23234
|
-
return
|
|
24113
|
+
async encode_text(inputs) {
|
|
24114
|
+
return this._encode_input("embed_tokens", inputs, "inputs_embeds");
|
|
23235
24115
|
}
|
|
23236
|
-
async encode_audio(
|
|
23237
|
-
return
|
|
24116
|
+
async encode_audio(inputs) {
|
|
24117
|
+
return this._encode_input("audio_encoder", inputs, "audio_features");
|
|
23238
24118
|
}
|
|
23239
24119
|
};
|
|
23240
24120
|
async function seq2seq_forward(self2, model_inputs) {
|
|
@@ -23289,6 +24169,9 @@ async function decoder_forward(self2, model_inputs, is_encoder_decoder = false)
|
|
|
23289
24169
|
const start_index = ["paligemma", "gemma3_text", "gemma3"].includes(self2.config.model_type) ? 1 : 0;
|
|
23290
24170
|
new_model_inputs.position_ids = create_position_ids(new_model_inputs, past_key_values, start_index);
|
|
23291
24171
|
}
|
|
24172
|
+
if (session.inputNames.includes("num_logits_to_keep") && !new_model_inputs.num_logits_to_keep) {
|
|
24173
|
+
new_model_inputs.num_logits_to_keep = new Tensor2("int64", [0n], []);
|
|
24174
|
+
}
|
|
23292
24175
|
self2.addPastKeyValues(new_model_inputs, past_key_values);
|
|
23293
24176
|
const fixed = pick(new_model_inputs, session.inputNames);
|
|
23294
24177
|
return await sessionRun(session, fixed);
|
|
@@ -23297,7 +24180,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
23297
24180
|
// Generic parameters:
|
|
23298
24181
|
encode_function,
|
|
23299
24182
|
merge_function,
|
|
23300
|
-
|
|
24183
|
+
modality_input_names,
|
|
23301
24184
|
modality_output_name,
|
|
23302
24185
|
// Produced by the tokenizer/processor:
|
|
23303
24186
|
input_ids = null,
|
|
@@ -23312,32 +24195,34 @@ async function generic_text_to_text_forward(self2, {
|
|
|
23312
24195
|
// Additional parameters
|
|
23313
24196
|
...kwargs
|
|
23314
24197
|
}) {
|
|
23315
|
-
const modality_values = kwargs[modality_input_name];
|
|
23316
24198
|
if (!inputs_embeds) {
|
|
23317
24199
|
inputs_embeds = await self2.encode_text({ input_ids, ...kwargs });
|
|
23318
|
-
|
|
23319
|
-
|
|
23320
|
-
|
|
23321
|
-
|
|
23322
|
-
|
|
23323
|
-
|
|
23324
|
-
|
|
23325
|
-
|
|
23326
|
-
|
|
23327
|
-
inputs_embeds,
|
|
23328
|
-
|
|
23329
|
-
|
|
23330
|
-
|
|
23331
|
-
|
|
23332
|
-
|
|
23333
|
-
|
|
23334
|
-
|
|
23335
|
-
|
|
23336
|
-
|
|
23337
|
-
|
|
23338
|
-
|
|
23339
|
-
|
|
23340
|
-
|
|
24200
|
+
const modality_values = pick(kwargs, modality_input_names);
|
|
24201
|
+
if (Object.keys(modality_values).length > 0) {
|
|
24202
|
+
if (input_ids.dims[1] !== 1) {
|
|
24203
|
+
const modality_features = await encode_function({
|
|
24204
|
+
// Pass the modality values under its expected key.
|
|
24205
|
+
// The caller knows whether this is audio or image.
|
|
24206
|
+
...modality_values,
|
|
24207
|
+
...kwargs
|
|
24208
|
+
});
|
|
24209
|
+
({ inputs_embeds, attention_mask } = merge_function({
|
|
24210
|
+
[modality_output_name]: modality_features,
|
|
24211
|
+
inputs_embeds,
|
|
24212
|
+
input_ids,
|
|
24213
|
+
attention_mask
|
|
24214
|
+
}));
|
|
24215
|
+
} else if (past_key_values && input_ids.dims[1] === 1) {
|
|
24216
|
+
const target_length = input_ids.dims[1];
|
|
24217
|
+
const past_length = past_key_values.get_seq_length();
|
|
24218
|
+
attention_mask = cat(
|
|
24219
|
+
[
|
|
24220
|
+
ones([input_ids.dims[0], past_length]),
|
|
24221
|
+
attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
|
|
24222
|
+
],
|
|
24223
|
+
1
|
|
24224
|
+
);
|
|
24225
|
+
}
|
|
23341
24226
|
}
|
|
23342
24227
|
}
|
|
23343
24228
|
if (!position_ids) {
|
|
@@ -23345,14 +24230,19 @@ async function generic_text_to_text_forward(self2, {
|
|
|
23345
24230
|
// Handle special case for qwen vl models
|
|
23346
24231
|
[
|
|
23347
24232
|
"qwen2_vl",
|
|
24233
|
+
"qwen2_vl_text",
|
|
23348
24234
|
"qwen2_5_vl",
|
|
23349
24235
|
"qwen2_5_vl_text",
|
|
23350
24236
|
"qwen3_vl",
|
|
23351
24237
|
"qwen3_vl_text",
|
|
24238
|
+
"qwen3_vl_moe",
|
|
24239
|
+
"qwen3_vl_moe_text",
|
|
23352
24240
|
"qwen3_5",
|
|
23353
24241
|
"qwen3_5_text",
|
|
23354
24242
|
"qwen3_5_moe",
|
|
23355
|
-
"qwen3_5_moe_text"
|
|
24243
|
+
"qwen3_5_moe_text",
|
|
24244
|
+
"glm_ocr",
|
|
24245
|
+
"glm_ocr_text"
|
|
23356
24246
|
].includes(self2.config.model_type)
|
|
23357
24247
|
) {
|
|
23358
24248
|
const { image_grid_thw, video_grid_thw } = kwargs;
|
|
@@ -23376,7 +24266,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
23376
24266
|
async function audio_text_to_text_forward(self2, params) {
|
|
23377
24267
|
return await generic_text_to_text_forward(self2, {
|
|
23378
24268
|
...params,
|
|
23379
|
-
|
|
24269
|
+
modality_input_names: ["audio_values", "input_features"],
|
|
23380
24270
|
modality_output_name: "audio_features",
|
|
23381
24271
|
encode_function: self2.encode_audio.bind(self2),
|
|
23382
24272
|
merge_function: self2._merge_input_ids_with_audio_features.bind(self2)
|
|
@@ -23385,7 +24275,7 @@ async function audio_text_to_text_forward(self2, params) {
|
|
|
23385
24275
|
async function image_text_to_text_forward(self2, params) {
|
|
23386
24276
|
return await generic_text_to_text_forward(self2, {
|
|
23387
24277
|
...params,
|
|
23388
|
-
|
|
24278
|
+
modality_input_names: ["pixel_values"],
|
|
23389
24279
|
modality_output_name: "image_features",
|
|
23390
24280
|
encode_function: self2.encode_image.bind(self2),
|
|
23391
24281
|
merge_function: self2._merge_input_ids_with_image_features.bind(self2)
|
|
@@ -23421,7 +24311,11 @@ function create_position_ids(model_inputs, past_key_values = null, start_index =
|
|
|
23421
24311
|
return position_ids;
|
|
23422
24312
|
}
|
|
23423
24313
|
function decoder_prepare_inputs_for_generation(self2, input_ids, model_inputs, generation_config) {
|
|
23424
|
-
const past_length = model_inputs.past_key_values ?
|
|
24314
|
+
const past_length = model_inputs.past_key_values ? model_inputs.past_key_values.get_seq_length() : 0;
|
|
24315
|
+
const session = self2.sessions["decoder_model_merged"] ?? self2.sessions["model"];
|
|
24316
|
+
if (session?.inputNames.includes("num_logits_to_keep") && !model_inputs.num_logits_to_keep) {
|
|
24317
|
+
model_inputs.num_logits_to_keep = new Tensor2("int64", [1n], []);
|
|
24318
|
+
}
|
|
23425
24319
|
if (!model_inputs.attention_mask) {
|
|
23426
24320
|
let dims;
|
|
23427
24321
|
for (const key of ["input_ids", "inputs_embeds", "position_ids"]) {
|
|
@@ -23572,6 +24466,8 @@ __export(models_exports, {
|
|
|
23572
24466
|
BloomForCausalLM: () => BloomForCausalLM,
|
|
23573
24467
|
BloomModel: () => BloomModel,
|
|
23574
24468
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
24469
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
24470
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
23575
24471
|
CLIPModel: () => CLIPModel,
|
|
23576
24472
|
CLIPPreTrainedModel: () => CLIPPreTrainedModel,
|
|
23577
24473
|
CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
|
|
@@ -23646,6 +24542,9 @@ __export(models_exports, {
|
|
|
23646
24542
|
DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
|
|
23647
24543
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
23648
24544
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
24545
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
24546
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
24547
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
23649
24548
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
23650
24549
|
DeiTModel: () => DeiTModel,
|
|
23651
24550
|
DeiTPreTrainedModel: () => DeiTPreTrainedModel,
|
|
@@ -23691,6 +24590,11 @@ __export(models_exports, {
|
|
|
23691
24590
|
EsmForTokenClassification: () => EsmForTokenClassification,
|
|
23692
24591
|
EsmModel: () => EsmModel,
|
|
23693
24592
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
24593
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
24594
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
24595
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
24596
|
+
EuroBertModel: () => EuroBertModel,
|
|
24597
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
23694
24598
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
23695
24599
|
ExaoneModel: () => ExaoneModel,
|
|
23696
24600
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -23729,6 +24633,7 @@ __export(models_exports, {
|
|
|
23729
24633
|
Gemma3ForCausalLM: () => Gemma3ForCausalLM,
|
|
23730
24634
|
Gemma3Model: () => Gemma3Model,
|
|
23731
24635
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
24636
|
+
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
23732
24637
|
Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
|
|
23733
24638
|
Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
|
|
23734
24639
|
GemmaForCausalLM: () => GemmaForCausalLM,
|
|
@@ -23736,6 +24641,10 @@ __export(models_exports, {
|
|
|
23736
24641
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
23737
24642
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
23738
24643
|
GlmModel: () => GlmModel,
|
|
24644
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
24645
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
24646
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
24647
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
23739
24648
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
23740
24649
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
23741
24650
|
GptOssModel: () => GptOssModel,
|
|
@@ -23746,6 +24655,7 @@ __export(models_exports, {
|
|
|
23746
24655
|
GraniteMoeHybridModel: () => GraniteMoeHybridModel,
|
|
23747
24656
|
GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
|
|
23748
24657
|
GranitePreTrainedModel: () => GranitePreTrainedModel,
|
|
24658
|
+
GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
|
|
23749
24659
|
GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
|
|
23750
24660
|
GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
|
|
23751
24661
|
GroupViTModel: () => GroupViTModel,
|
|
@@ -23767,7 +24677,6 @@ __export(models_exports, {
|
|
|
23767
24677
|
IJepaModel: () => IJepaModel,
|
|
23768
24678
|
IJepaPreTrainedModel: () => IJepaPreTrainedModel,
|
|
23769
24679
|
Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
|
|
23770
|
-
Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
|
|
23771
24680
|
JAISLMHeadModel: () => JAISLMHeadModel,
|
|
23772
24681
|
JAISModel: () => JAISModel,
|
|
23773
24682
|
JAISPreTrainedModel: () => JAISPreTrainedModel,
|
|
@@ -23781,6 +24690,8 @@ __export(models_exports, {
|
|
|
23781
24690
|
Lfm2MoeModel: () => Lfm2MoeModel,
|
|
23782
24691
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
23783
24692
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
24693
|
+
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
24694
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
23784
24695
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
23785
24696
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
23786
24697
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -23830,6 +24741,9 @@ __export(models_exports, {
|
|
|
23830
24741
|
MimiEncoderOutput: () => MimiEncoderOutput,
|
|
23831
24742
|
MimiModel: () => MimiModel,
|
|
23832
24743
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
24744
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
24745
|
+
Mistral4Model: () => Mistral4Model,
|
|
24746
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
23833
24747
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
23834
24748
|
MistralModel: () => MistralModel,
|
|
23835
24749
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -23887,6 +24801,9 @@ __export(models_exports, {
|
|
|
23887
24801
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
23888
24802
|
NanoChatModel: () => NanoChatModel,
|
|
23889
24803
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
24804
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
24805
|
+
NemotronHModel: () => NemotronHModel,
|
|
24806
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
23890
24807
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
23891
24808
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
23892
24809
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -23920,7 +24837,6 @@ __export(models_exports, {
|
|
|
23920
24837
|
Owlv2Model: () => Owlv2Model,
|
|
23921
24838
|
Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
|
|
23922
24839
|
PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
|
|
23923
|
-
PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
|
|
23924
24840
|
ParakeetForCTC: () => ParakeetForCTC,
|
|
23925
24841
|
ParakeetPreTrainedModel: () => ParakeetPreTrainedModel,
|
|
23926
24842
|
PatchTSMixerForPrediction: () => PatchTSMixerForPrediction,
|
|
@@ -23950,8 +24866,10 @@ __export(models_exports, {
|
|
|
23950
24866
|
Qwen2MoeModel: () => Qwen2MoeModel,
|
|
23951
24867
|
Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
|
|
23952
24868
|
Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
|
|
24869
|
+
Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
|
|
23953
24870
|
Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
|
|
23954
24871
|
Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
|
|
24872
|
+
Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
|
|
23955
24873
|
Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
|
|
23956
24874
|
Qwen3ForCausalLM: () => Qwen3ForCausalLM,
|
|
23957
24875
|
Qwen3Model: () => Qwen3Model,
|
|
@@ -23962,9 +24880,13 @@ __export(models_exports, {
|
|
|
23962
24880
|
Qwen3NextModel: () => Qwen3NextModel,
|
|
23963
24881
|
Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
|
|
23964
24882
|
Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
|
|
24883
|
+
Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
|
|
23965
24884
|
Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
|
|
24885
|
+
Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
|
|
23966
24886
|
Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
|
|
24887
|
+
Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
|
|
23967
24888
|
Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
|
|
24889
|
+
Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
|
|
23968
24890
|
Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
|
|
23969
24891
|
RFDetrForObjectDetection: () => RFDetrForObjectDetection,
|
|
23970
24892
|
RFDetrModel: () => RFDetrModel,
|
|
@@ -24015,11 +24937,13 @@ __export(models_exports, {
|
|
|
24015
24937
|
SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
|
|
24016
24938
|
SmolLM3Model: () => SmolLM3Model,
|
|
24017
24939
|
SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
|
|
24018
|
-
SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
|
|
24019
24940
|
SnacDecoderModel: () => SnacDecoderModel,
|
|
24020
24941
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
24021
24942
|
SnacModel: () => SnacModel,
|
|
24022
24943
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
24944
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
24945
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
24946
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
24023
24947
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
24024
24948
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
24025
24949
|
SpeechT5HifiGan: () => SpeechT5HifiGan,
|
|
@@ -24087,6 +25011,8 @@ __export(models_exports, {
|
|
|
24087
25011
|
VitsModelOutput: () => VitsModelOutput,
|
|
24088
25012
|
VitsPreTrainedModel: () => VitsPreTrainedModel,
|
|
24089
25013
|
VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
|
|
25014
|
+
VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
|
|
25015
|
+
VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
|
|
24090
25016
|
Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
|
|
24091
25017
|
Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
|
|
24092
25018
|
Wav2Vec2BertModel: () => Wav2Vec2BertModel,
|
|
@@ -24192,7 +25118,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
|
|
|
24192
25118
|
var ArceeForCausalLM = class extends ArceePreTrainedModel {
|
|
24193
25119
|
};
|
|
24194
25120
|
|
|
24195
|
-
// src/models/
|
|
25121
|
+
// src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
|
|
24196
25122
|
var ASTPreTrainedModel = class extends PreTrainedModel {
|
|
24197
25123
|
};
|
|
24198
25124
|
var ASTModel = class extends ASTPreTrainedModel {
|
|
@@ -24447,7 +25373,7 @@ var ChatterboxModel = class extends ChatterboxPreTrainedModel {
|
|
|
24447
25373
|
if (!past_key_values || target_length !== 1) {
|
|
24448
25374
|
throw new Error("Incorrect state encountered during generation.");
|
|
24449
25375
|
}
|
|
24450
|
-
const past_length =
|
|
25376
|
+
const past_length = past_key_values.get_seq_length();
|
|
24451
25377
|
attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
|
|
24452
25378
|
}
|
|
24453
25379
|
}
|
|
@@ -24527,6 +25453,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
|
|
|
24527
25453
|
var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
|
|
24528
25454
|
};
|
|
24529
25455
|
|
|
25456
|
+
// src/models/chmv2/modeling_chmv2.js
|
|
25457
|
+
var CHMv2PreTrainedModel = class extends PreTrainedModel {
|
|
25458
|
+
};
|
|
25459
|
+
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
|
|
25460
|
+
};
|
|
25461
|
+
|
|
24530
25462
|
// src/models/clap/modeling_clap.js
|
|
24531
25463
|
var ClapPreTrainedModel = class extends PreTrainedModel {
|
|
24532
25464
|
};
|
|
@@ -24865,6 +25797,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
|
|
|
24865
25797
|
}
|
|
24866
25798
|
};
|
|
24867
25799
|
|
|
25800
|
+
// src/models/deepseek_v3/modeling_deepseek_v3.js
|
|
25801
|
+
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
|
|
25802
|
+
};
|
|
25803
|
+
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
|
|
25804
|
+
};
|
|
25805
|
+
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
|
|
25806
|
+
};
|
|
25807
|
+
|
|
24868
25808
|
// src/models/deberta_v2/modeling_deberta_v2.js
|
|
24869
25809
|
var DebertaV2PreTrainedModel = class extends PreTrainedModel {
|
|
24870
25810
|
};
|
|
@@ -25213,6 +26153,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
|
|
|
25213
26153
|
}
|
|
25214
26154
|
};
|
|
25215
26155
|
|
|
26156
|
+
// src/models/eurobert/modeling_eurobert.js
|
|
26157
|
+
var EuroBertPreTrainedModel = class extends PreTrainedModel {
|
|
26158
|
+
};
|
|
26159
|
+
var EuroBertModel = class extends EuroBertPreTrainedModel {
|
|
26160
|
+
};
|
|
26161
|
+
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
|
|
26162
|
+
/**
|
|
26163
|
+
* Calls the model on new inputs.
|
|
26164
|
+
*
|
|
26165
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
26166
|
+
* @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
|
|
26167
|
+
*/
|
|
26168
|
+
async _call(model_inputs) {
|
|
26169
|
+
return new MaskedLMOutput(await super._call(model_inputs));
|
|
26170
|
+
}
|
|
26171
|
+
};
|
|
26172
|
+
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
|
|
26173
|
+
/**
|
|
26174
|
+
* Calls the model on new inputs.
|
|
26175
|
+
*
|
|
26176
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
26177
|
+
* @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
|
|
26178
|
+
*/
|
|
26179
|
+
async _call(model_inputs) {
|
|
26180
|
+
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
26181
|
+
}
|
|
26182
|
+
};
|
|
26183
|
+
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
|
|
26184
|
+
/**
|
|
26185
|
+
* Calls the model on new inputs.
|
|
26186
|
+
*
|
|
26187
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
26188
|
+
* @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
|
|
26189
|
+
*/
|
|
26190
|
+
async _call(model_inputs) {
|
|
26191
|
+
return new TokenClassifierOutput(await super._call(model_inputs));
|
|
26192
|
+
}
|
|
26193
|
+
};
|
|
26194
|
+
|
|
25216
26195
|
// src/models/exaone/modeling_exaone.js
|
|
25217
26196
|
var ExaonePreTrainedModel = class extends PreTrainedModel {
|
|
25218
26197
|
};
|
|
@@ -25477,6 +26456,8 @@ var Gemma3nForConditionalGeneration = class extends Gemma3nPreTrainedModel {
|
|
|
25477
26456
|
});
|
|
25478
26457
|
}
|
|
25479
26458
|
};
|
|
26459
|
+
var Gemma3nForCausalLM = class extends Gemma3nForConditionalGeneration {
|
|
26460
|
+
};
|
|
25480
26461
|
|
|
25481
26462
|
// src/models/glm/modeling_glm.js
|
|
25482
26463
|
var GlmPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -25486,6 +26467,377 @@ var GlmModel = class extends GlmPreTrainedModel {
|
|
|
25486
26467
|
var GlmForCausalLM = class extends GlmPreTrainedModel {
|
|
25487
26468
|
};
|
|
25488
26469
|
|
|
26470
|
+
// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
|
|
26471
|
+
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
|
|
26472
|
+
};
|
|
26473
|
+
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
|
|
26474
|
+
};
|
|
26475
|
+
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
|
|
26476
|
+
};
|
|
26477
|
+
|
|
26478
|
+
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
26479
|
+
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
26480
|
+
forward_params = [
|
|
26481
|
+
// Text inputs
|
|
26482
|
+
"input_ids",
|
|
26483
|
+
"attention_mask",
|
|
26484
|
+
"position_ids",
|
|
26485
|
+
"past_key_values",
|
|
26486
|
+
// Vision inputs
|
|
26487
|
+
"pixel_values",
|
|
26488
|
+
"image_grid_thw"
|
|
26489
|
+
];
|
|
26490
|
+
};
|
|
26491
|
+
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
26492
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
26493
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
26494
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
26495
|
+
image_grid_thw_name = "grid_thw";
|
|
26496
|
+
/**
|
|
26497
|
+
* Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
|
|
26498
|
+
* @param {Tensor} input_ids
|
|
26499
|
+
* @param {Tensor} attention_mask
|
|
26500
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
26501
|
+
*/
|
|
26502
|
+
_get_text_only_rope_index(input_ids, attention_mask) {
|
|
26503
|
+
if (attention_mask) {
|
|
26504
|
+
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
26505
|
+
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
26506
|
+
const mrope_position_deltas = Array.from(
|
|
26507
|
+
{ length: dims[0] },
|
|
26508
|
+
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
26509
|
+
);
|
|
26510
|
+
return [
|
|
26511
|
+
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
26512
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
26513
|
+
];
|
|
26514
|
+
} else {
|
|
26515
|
+
const [batch_size, seq_length] = input_ids.dims;
|
|
26516
|
+
const position_ids = BigInt64Array.from(
|
|
26517
|
+
{ length: 3 * batch_size * seq_length },
|
|
26518
|
+
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
26519
|
+
);
|
|
26520
|
+
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
26521
|
+
}
|
|
26522
|
+
}
|
|
26523
|
+
/**
|
|
26524
|
+
* Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
|
|
26525
|
+
* global [all_t, all_h, all_w] order, then write back into the position_ids array
|
|
26526
|
+
* respecting attention mask.
|
|
26527
|
+
* @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
|
|
26528
|
+
* @param {number[]} attn_mask Attention mask for this batch element
|
|
26529
|
+
* @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
|
|
26530
|
+
* @param {number} batch_idx Current batch index
|
|
26531
|
+
* @returns {number[]} Flat reordered positions of length total_len
|
|
26532
|
+
*/
|
|
26533
|
+
_reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
|
|
26534
|
+
const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
26535
|
+
const llm_positions = new Array(total_len);
|
|
26536
|
+
let index = 0;
|
|
26537
|
+
for (let x = 0; x < 3; ++x) {
|
|
26538
|
+
for (const val of llm_pos_ids_list) {
|
|
26539
|
+
const seg_len = val.length / 3;
|
|
26540
|
+
for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
|
|
26541
|
+
llm_positions[index++] = val[z];
|
|
26542
|
+
}
|
|
26543
|
+
}
|
|
26544
|
+
}
|
|
26545
|
+
let count2 = 0;
|
|
26546
|
+
for (let y = 0; y < attn_mask.length; ++y) {
|
|
26547
|
+
if (attn_mask[y] == 1) {
|
|
26548
|
+
for (let x = 0; x < 3; ++x) {
|
|
26549
|
+
position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
|
|
26550
|
+
}
|
|
26551
|
+
++count2;
|
|
26552
|
+
}
|
|
26553
|
+
}
|
|
26554
|
+
return llm_positions;
|
|
26555
|
+
}
|
|
26556
|
+
/**
|
|
26557
|
+
* Build per-batch position ID segments for multimodal rope.
|
|
26558
|
+
* Override this in subclasses to change how vision/text segments are identified and positioned.
|
|
26559
|
+
* @param {object} params
|
|
26560
|
+
* @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
|
|
26561
|
+
* @param {any[][]} params.image_grid_thw_list - all image grid dimensions
|
|
26562
|
+
* @param {any[][]} params.video_grid_thw_list - all video grid dimensions
|
|
26563
|
+
* @param {number} params.spatial_merge_size
|
|
26564
|
+
* @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
|
|
26565
|
+
* @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
|
|
26566
|
+
*/
|
|
26567
|
+
_get_multimodal_rope_positions({
|
|
26568
|
+
filtered_ids,
|
|
26569
|
+
image_grid_thw_list,
|
|
26570
|
+
video_grid_thw_list,
|
|
26571
|
+
spatial_merge_size,
|
|
26572
|
+
state
|
|
26573
|
+
}) {
|
|
26574
|
+
const { image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
26575
|
+
const ids = filtered_ids;
|
|
26576
|
+
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
26577
|
+
if (x == vision_start_token_id) acc.push(idx);
|
|
26578
|
+
return acc;
|
|
26579
|
+
}, []);
|
|
26580
|
+
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
26581
|
+
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
26582
|
+
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
26583
|
+
const llm_pos_ids_list = [];
|
|
26584
|
+
let st2 = 0;
|
|
26585
|
+
let remain_images = image_nums;
|
|
26586
|
+
let remain_videos = video_nums;
|
|
26587
|
+
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
26588
|
+
const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
|
|
26589
|
+
const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
|
|
26590
|
+
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
26591
|
+
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
26592
|
+
let ed;
|
|
26593
|
+
let t, h, w;
|
|
26594
|
+
if (ed_image < ed_video) {
|
|
26595
|
+
[t, h, w] = image_grid_thw_list[state.image_index];
|
|
26596
|
+
++state.image_index;
|
|
26597
|
+
--remain_images;
|
|
26598
|
+
ed = ed_image;
|
|
26599
|
+
} else {
|
|
26600
|
+
[t, h, w] = video_grid_thw_list[state.video_index];
|
|
26601
|
+
++state.video_index;
|
|
26602
|
+
--remain_videos;
|
|
26603
|
+
ed = ed_video;
|
|
26604
|
+
}
|
|
26605
|
+
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
26606
|
+
Number(t),
|
|
26607
|
+
Math.floor(Number(h) / spatial_merge_size),
|
|
26608
|
+
Math.floor(Number(w) / spatial_merge_size)
|
|
26609
|
+
];
|
|
26610
|
+
const text_len = ed - st2;
|
|
26611
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
26612
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
26613
|
+
const offset = text_len + st_idx;
|
|
26614
|
+
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
26615
|
+
const t_index = Array.from(
|
|
26616
|
+
{ length: grid_size },
|
|
26617
|
+
(_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
|
|
26618
|
+
);
|
|
26619
|
+
const h_index = Array.from(
|
|
26620
|
+
{ length: grid_size },
|
|
26621
|
+
(_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
|
|
26622
|
+
);
|
|
26623
|
+
const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
|
|
26624
|
+
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
26625
|
+
st2 = ed + grid_size;
|
|
26626
|
+
}
|
|
26627
|
+
if (st2 < ids.length) {
|
|
26628
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
26629
|
+
const text_len = ids.length - st2;
|
|
26630
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
26631
|
+
}
|
|
26632
|
+
return llm_pos_ids_list;
|
|
26633
|
+
}
|
|
26634
|
+
/**
|
|
26635
|
+
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
26636
|
+
*
|
|
26637
|
+
* Explanation:
|
|
26638
|
+
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
26639
|
+
*
|
|
26640
|
+
* For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs.
|
|
26641
|
+
* Examples:
|
|
26642
|
+
* input_ids: [T T T T T], here T is for text.
|
|
26643
|
+
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
26644
|
+
* height position_ids: [0, 1, 2, 3, 4]
|
|
26645
|
+
* width position_ids: [0, 1, 2, 3, 4]
|
|
26646
|
+
*
|
|
26647
|
+
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
26648
|
+
* and 1D rotary position embedding for text part.
|
|
26649
|
+
* Examples:
|
|
26650
|
+
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
26651
|
+
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
26652
|
+
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
26653
|
+
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
26654
|
+
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
26655
|
+
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
26656
|
+
* text height position_ids: [3, 4, 5, 6, 7]
|
|
26657
|
+
* text width position_ids: [3, 4, 5, 6, 7]
|
|
26658
|
+
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
26659
|
+
*
|
|
26660
|
+
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
26661
|
+
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
26662
|
+
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
26663
|
+
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
|
|
26664
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
26665
|
+
*/
|
|
26666
|
+
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
26667
|
+
const { vision_config } = this.config;
|
|
26668
|
+
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
26669
|
+
if (image_grid_thw || video_grid_thw) {
|
|
26670
|
+
const total_input_ids = input_ids.tolist();
|
|
26671
|
+
if (!attention_mask) {
|
|
26672
|
+
attention_mask = ones_like(input_ids);
|
|
26673
|
+
}
|
|
26674
|
+
const attention_mask_list = attention_mask.tolist();
|
|
26675
|
+
const position_ids_list = Array.from(
|
|
26676
|
+
{ length: 3 },
|
|
26677
|
+
() => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
|
|
26678
|
+
);
|
|
26679
|
+
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
26680
|
+
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
26681
|
+
const state = { image_index: 0, video_index: 0 };
|
|
26682
|
+
const mrope_position_deltas = [];
|
|
26683
|
+
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
26684
|
+
const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
26685
|
+
const llm_pos_ids_list = this._get_multimodal_rope_positions({
|
|
26686
|
+
filtered_ids,
|
|
26687
|
+
image_grid_thw_list,
|
|
26688
|
+
video_grid_thw_list,
|
|
26689
|
+
spatial_merge_size,
|
|
26690
|
+
state
|
|
26691
|
+
});
|
|
26692
|
+
const llm_positions = this._reorder_and_write_positions(
|
|
26693
|
+
llm_pos_ids_list,
|
|
26694
|
+
attention_mask_list[i],
|
|
26695
|
+
position_ids_list,
|
|
26696
|
+
i
|
|
26697
|
+
);
|
|
26698
|
+
mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
|
|
26699
|
+
}
|
|
26700
|
+
return [
|
|
26701
|
+
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
26702
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
26703
|
+
];
|
|
26704
|
+
} else {
|
|
26705
|
+
return this._get_text_only_rope_index(input_ids, attention_mask);
|
|
26706
|
+
}
|
|
26707
|
+
}
|
|
26708
|
+
async encode_image({ pixel_values, image_grid_thw }) {
|
|
26709
|
+
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
26710
|
+
pixel_values,
|
|
26711
|
+
[this.image_grid_thw_name]: image_grid_thw
|
|
26712
|
+
})).image_features;
|
|
26713
|
+
return features;
|
|
26714
|
+
}
|
|
26715
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
26716
|
+
return default_merge_input_ids_with_image_features({
|
|
26717
|
+
// @ts-ignore
|
|
26718
|
+
image_token_id: this.config.image_token_id,
|
|
26719
|
+
...kwargs
|
|
26720
|
+
});
|
|
26721
|
+
}
|
|
26722
|
+
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
26723
|
+
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
26724
|
+
if (!model_inputs.past_key_values) {
|
|
26725
|
+
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
26726
|
+
model_inputs.input_ids,
|
|
26727
|
+
model_inputs.image_grid_thw,
|
|
26728
|
+
model_inputs.video_grid_thw,
|
|
26729
|
+
model_inputs.attention_mask
|
|
26730
|
+
);
|
|
26731
|
+
} else {
|
|
26732
|
+
model_inputs.pixel_values = null;
|
|
26733
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
26734
|
+
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
26735
|
+
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
26736
|
+
model_inputs.input_ids,
|
|
26737
|
+
model_inputs.image_grid_thw,
|
|
26738
|
+
model_inputs.video_grid_thw,
|
|
26739
|
+
model_inputs.attention_mask
|
|
26740
|
+
);
|
|
26741
|
+
model_inputs.rope_deltas = rope_deltas;
|
|
26742
|
+
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
26743
|
+
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
26744
|
+
} else {
|
|
26745
|
+
if (!model_inputs.rope_deltas) {
|
|
26746
|
+
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
26747
|
+
model_inputs.input_ids,
|
|
26748
|
+
model_inputs.image_grid_thw,
|
|
26749
|
+
model_inputs.video_grid_thw,
|
|
26750
|
+
model_inputs.attention_mask
|
|
26751
|
+
);
|
|
26752
|
+
}
|
|
26753
|
+
const delta = BigInt(past_length);
|
|
26754
|
+
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
26755
|
+
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
26756
|
+
}
|
|
26757
|
+
}
|
|
26758
|
+
}
|
|
26759
|
+
return model_inputs;
|
|
26760
|
+
}
|
|
26761
|
+
};
|
|
26762
|
+
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
26763
|
+
};
|
|
26764
|
+
|
|
26765
|
+
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
26766
|
+
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
26767
|
+
image_grid_thw_name = "image_grid_thw";
|
|
26768
|
+
};
|
|
26769
|
+
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
26770
|
+
image_grid_thw_name = "image_grid_thw";
|
|
26771
|
+
};
|
|
26772
|
+
|
|
26773
|
+
// src/models/glm_ocr/modeling_glm_ocr.js
|
|
26774
|
+
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
26775
|
+
/**
|
|
26776
|
+
* Compute 3D positional indices for vision tokens.
|
|
26777
|
+
* Temporal is constant, height is repeat-interleaved, width tiles.
|
|
26778
|
+
* @param {number} start_position
|
|
26779
|
+
* @param {number[]} grid_thw [T, H, W]
|
|
26780
|
+
* @param {number} temp_merge_size
|
|
26781
|
+
* @param {number} spatial_merge_size
|
|
26782
|
+
* @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
|
|
26783
|
+
*/
|
|
26784
|
+
get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
|
|
26785
|
+
const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
|
|
26786
|
+
const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
|
|
26787
|
+
const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
|
|
26788
|
+
const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
|
|
26789
|
+
const t_pos = Array.from({ length: seq_len }, () => start_position);
|
|
26790
|
+
const h_pos = Array.from(
|
|
26791
|
+
{ length: seq_len },
|
|
26792
|
+
(_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
|
|
26793
|
+
);
|
|
26794
|
+
const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
|
|
26795
|
+
return [...t_pos, ...h_pos, ...w_pos];
|
|
26796
|
+
}
|
|
26797
|
+
/**
|
|
26798
|
+
* GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
|
|
26799
|
+
* instead of vision_start_token_id scanning used by Qwen2VL.
|
|
26800
|
+
* After a vision segment, position advances by max(h, w) / spatial_merge_size.
|
|
26801
|
+
*/
|
|
26802
|
+
_get_multimodal_rope_positions({
|
|
26803
|
+
filtered_ids,
|
|
26804
|
+
image_grid_thw_list,
|
|
26805
|
+
video_grid_thw_list,
|
|
26806
|
+
spatial_merge_size,
|
|
26807
|
+
state
|
|
26808
|
+
}) {
|
|
26809
|
+
const { image_token_id } = this.config;
|
|
26810
|
+
const groups = [];
|
|
26811
|
+
let group_start = 0;
|
|
26812
|
+
let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
|
|
26813
|
+
for (let j = 1; j <= filtered_ids.length; ++j) {
|
|
26814
|
+
const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
|
|
26815
|
+
if (t !== current_type) {
|
|
26816
|
+
groups.push([current_type, group_start, j]);
|
|
26817
|
+
group_start = j;
|
|
26818
|
+
current_type = t;
|
|
26819
|
+
}
|
|
26820
|
+
}
|
|
26821
|
+
let current_pos = 0;
|
|
26822
|
+
const llm_pos_ids_list = [];
|
|
26823
|
+
for (const [modality_type, start_idx, end_idx] of groups) {
|
|
26824
|
+
if (modality_type === 0) {
|
|
26825
|
+
const text_len = end_idx - start_idx;
|
|
26826
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
|
|
26827
|
+
current_pos += text_len;
|
|
26828
|
+
} else {
|
|
26829
|
+
const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
|
|
26830
|
+
const temp_merge_size = grid_thw[0];
|
|
26831
|
+
llm_pos_ids_list.push(
|
|
26832
|
+
this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
|
|
26833
|
+
);
|
|
26834
|
+
current_pos += Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size;
|
|
26835
|
+
}
|
|
26836
|
+
}
|
|
26837
|
+
return llm_pos_ids_list;
|
|
26838
|
+
}
|
|
26839
|
+
};
|
|
26840
|
+
|
|
25489
26841
|
// src/models/glpn/modeling_glpn.js
|
|
25490
26842
|
var GLPNPreTrainedModel = class extends PreTrainedModel {
|
|
25491
26843
|
};
|
|
@@ -25558,6 +26910,28 @@ var GraniteMoeHybridModel = class extends GraniteMoeHybridPreTrainedModel {
|
|
|
25558
26910
|
var GraniteMoeHybridForCausalLM = class extends GraniteMoeHybridPreTrainedModel {
|
|
25559
26911
|
};
|
|
25560
26912
|
|
|
26913
|
+
// src/models/ultravox/modeling_ultravox.js
|
|
26914
|
+
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
26915
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
26916
|
+
};
|
|
26917
|
+
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
26918
|
+
_merge_input_ids_with_audio_features(kwargs) {
|
|
26919
|
+
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
26920
|
+
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
26921
|
+
return default_merge_input_ids_with_audio_features({
|
|
26922
|
+
// @ts-ignore
|
|
26923
|
+
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id ?? this.config.audio_token_index,
|
|
26924
|
+
...kwargs,
|
|
26925
|
+
audio_features: reshaped_audio_features
|
|
26926
|
+
});
|
|
26927
|
+
}
|
|
26928
|
+
};
|
|
26929
|
+
|
|
26930
|
+
// src/models/granite_speech/modeling_granite_speech.js
|
|
26931
|
+
var GraniteSpeechForConditionalGeneration = class extends UltravoxModel {
|
|
26932
|
+
forward_params = ["input_ids", "attention_mask", "input_features", "past_key_values"];
|
|
26933
|
+
};
|
|
26934
|
+
|
|
25561
26935
|
// src/models/grounding_dino/modeling_grounding_dino.js
|
|
25562
26936
|
var GroundingDinoPreTrainedModel = class extends PreTrainedModel {
|
|
25563
26937
|
};
|
|
@@ -25662,34 +27036,37 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
|
|
|
25662
27036
|
var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
|
|
25663
27037
|
};
|
|
25664
27038
|
|
|
25665
|
-
// src/models/
|
|
25666
|
-
var
|
|
25667
|
-
forward_params = [
|
|
25668
|
-
"input_ids",
|
|
25669
|
-
"attention_mask",
|
|
25670
|
-
"pixel_values",
|
|
25671
|
-
"pixel_attention_mask",
|
|
25672
|
-
"position_ids",
|
|
25673
|
-
"past_key_values"
|
|
25674
|
-
];
|
|
27039
|
+
// src/models/llava/modeling_llava.js
|
|
27040
|
+
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
27041
|
+
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
25675
27042
|
};
|
|
25676
|
-
var
|
|
25677
|
-
async encode_image({ pixel_values, pixel_attention_mask }) {
|
|
25678
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], { pixel_values, pixel_attention_mask })).image_features;
|
|
25679
|
-
return features;
|
|
25680
|
-
}
|
|
27043
|
+
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
25681
27044
|
_merge_input_ids_with_image_features(kwargs) {
|
|
25682
27045
|
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
25683
27046
|
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
25684
27047
|
return default_merge_input_ids_with_image_features({
|
|
25685
27048
|
// @ts-ignore
|
|
25686
|
-
image_token_id: this.config.image_token_id,
|
|
27049
|
+
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
25687
27050
|
...kwargs,
|
|
25688
27051
|
image_features: reshaped_image_hidden_states
|
|
25689
27052
|
});
|
|
25690
27053
|
}
|
|
25691
27054
|
};
|
|
25692
|
-
var
|
|
27055
|
+
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27056
|
+
};
|
|
27057
|
+
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
27058
|
+
};
|
|
27059
|
+
|
|
27060
|
+
// src/models/idefics3/modeling_idefics3.js
|
|
27061
|
+
var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27062
|
+
forward_params = [
|
|
27063
|
+
"input_ids",
|
|
27064
|
+
"attention_mask",
|
|
27065
|
+
"pixel_values",
|
|
27066
|
+
"pixel_attention_mask",
|
|
27067
|
+
"position_ids",
|
|
27068
|
+
"past_key_values"
|
|
27069
|
+
];
|
|
25693
27070
|
};
|
|
25694
27071
|
|
|
25695
27072
|
// src/models/ijepa/modeling_ijepa.js
|
|
@@ -25773,6 +27150,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
|
|
|
25773
27150
|
var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
|
|
25774
27151
|
};
|
|
25775
27152
|
|
|
27153
|
+
// src/models/lighton_ocr/modeling_lighton_ocr.js
|
|
27154
|
+
var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27155
|
+
};
|
|
27156
|
+
|
|
25776
27157
|
// src/models/lfm2_moe/modeling_lfm2_moe.js
|
|
25777
27158
|
var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
|
|
25778
27159
|
};
|
|
@@ -25781,6 +27162,19 @@ var Lfm2MoeModel = class extends Lfm2MoePreTrainedModel {
|
|
|
25781
27162
|
var Lfm2MoeForCausalLM = class extends Lfm2MoePreTrainedModel {
|
|
25782
27163
|
};
|
|
25783
27164
|
|
|
27165
|
+
// src/models/lfm2_vl/modeling_lfm2_vl.js
|
|
27166
|
+
var Lfm2VlForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27167
|
+
forward_params = [
|
|
27168
|
+
"input_ids",
|
|
27169
|
+
"attention_mask",
|
|
27170
|
+
"pixel_values",
|
|
27171
|
+
"pixel_attention_mask",
|
|
27172
|
+
"spatial_shapes",
|
|
27173
|
+
"position_ids",
|
|
27174
|
+
"past_key_values"
|
|
27175
|
+
];
|
|
27176
|
+
};
|
|
27177
|
+
|
|
25784
27178
|
// src/models/llama/modeling_llama.js
|
|
25785
27179
|
var LlamaPreTrainedModel = class extends PreTrainedModel {
|
|
25786
27180
|
};
|
|
@@ -25795,27 +27189,6 @@ var Llama4PreTrainedModel = class extends PreTrainedModel {
|
|
|
25795
27189
|
var Llama4ForCausalLM = class extends Llama4PreTrainedModel {
|
|
25796
27190
|
};
|
|
25797
27191
|
|
|
25798
|
-
// src/models/llava/modeling_llava.js
|
|
25799
|
-
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
25800
|
-
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
25801
|
-
};
|
|
25802
|
-
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
25803
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
25804
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
25805
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
25806
|
-
return default_merge_input_ids_with_image_features({
|
|
25807
|
-
// @ts-ignore
|
|
25808
|
-
image_token_id: this.config.image_token_index,
|
|
25809
|
-
...kwargs,
|
|
25810
|
-
image_features: reshaped_image_hidden_states
|
|
25811
|
-
});
|
|
25812
|
-
}
|
|
25813
|
-
};
|
|
25814
|
-
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
25815
|
-
};
|
|
25816
|
-
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
25817
|
-
};
|
|
25818
|
-
|
|
25819
27192
|
// src/models/longt5/modeling_longt5.js
|
|
25820
27193
|
var LongT5PreTrainedModel = class extends PreTrainedModel {
|
|
25821
27194
|
};
|
|
@@ -25977,6 +27350,14 @@ var MistralModel = class extends MistralPreTrainedModel {
|
|
|
25977
27350
|
var MistralForCausalLM = class extends MistralPreTrainedModel {
|
|
25978
27351
|
};
|
|
25979
27352
|
|
|
27353
|
+
// src/models/mistral4/modeling_mistral4.js
|
|
27354
|
+
var Mistral4PreTrainedModel = class extends PreTrainedModel {
|
|
27355
|
+
};
|
|
27356
|
+
var Mistral4Model = class extends Mistral4PreTrainedModel {
|
|
27357
|
+
};
|
|
27358
|
+
var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
|
|
27359
|
+
};
|
|
27360
|
+
|
|
25980
27361
|
// src/models/mobilebert/modeling_mobilebert.js
|
|
25981
27362
|
var MobileBertPreTrainedModel = class extends PreTrainedModel {
|
|
25982
27363
|
};
|
|
@@ -26445,6 +27826,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
|
|
|
26445
27826
|
var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
|
|
26446
27827
|
};
|
|
26447
27828
|
|
|
27829
|
+
// src/models/nemotron_h/modeling_nemotron_h.js
|
|
27830
|
+
var NemotronHPreTrainedModel = class extends PreTrainedModel {
|
|
27831
|
+
};
|
|
27832
|
+
var NemotronHModel = class extends NemotronHPreTrainedModel {
|
|
27833
|
+
};
|
|
27834
|
+
var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
|
|
27835
|
+
};
|
|
27836
|
+
|
|
26448
27837
|
// src/models/neobert/modeling_neobert.js
|
|
26449
27838
|
var NeoBertPreTrainedModel = class extends PreTrainedModel {
|
|
26450
27839
|
};
|
|
@@ -26566,27 +27955,7 @@ var OwlViTForObjectDetection = class extends OwlViTPreTrainedModel {
|
|
|
26566
27955
|
};
|
|
26567
27956
|
|
|
26568
27957
|
// src/models/paligemma/modeling_paligemma.js
|
|
26569
|
-
var
|
|
26570
|
-
forward_params = [
|
|
26571
|
-
"input_ids",
|
|
26572
|
-
// 'inputs_embeds',
|
|
26573
|
-
"attention_mask",
|
|
26574
|
-
"pixel_values",
|
|
26575
|
-
"position_ids",
|
|
26576
|
-
"past_key_values"
|
|
26577
|
-
];
|
|
26578
|
-
};
|
|
26579
|
-
var PaliGemmaForConditionalGeneration = class extends PaliGemmaPreTrainedModel {
|
|
26580
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
26581
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
26582
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
26583
|
-
return default_merge_input_ids_with_image_features({
|
|
26584
|
-
// @ts-ignore
|
|
26585
|
-
image_token_id: this.config.image_token_index,
|
|
26586
|
-
...kwargs,
|
|
26587
|
-
image_features: reshaped_image_hidden_states
|
|
26588
|
-
});
|
|
26589
|
-
}
|
|
27958
|
+
var PaliGemmaForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
26590
27959
|
};
|
|
26591
27960
|
|
|
26592
27961
|
// src/models/parakeet/modeling_parakeet.js
|
|
@@ -26745,244 +28114,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
|
|
|
26745
28114
|
var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
|
|
26746
28115
|
};
|
|
26747
28116
|
|
|
26748
|
-
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
26749
|
-
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
26750
|
-
forward_params = [
|
|
26751
|
-
// Text inputs
|
|
26752
|
-
"input_ids",
|
|
26753
|
-
"attention_mask",
|
|
26754
|
-
"position_ids",
|
|
26755
|
-
"past_key_values",
|
|
26756
|
-
// Vision inputs
|
|
26757
|
-
"pixel_values",
|
|
26758
|
-
"image_grid_thw"
|
|
26759
|
-
];
|
|
26760
|
-
};
|
|
26761
|
-
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
26762
|
-
image_grid_thw_name = "grid_thw";
|
|
26763
|
-
/**
|
|
26764
|
-
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
26765
|
-
*
|
|
26766
|
-
* Explanation:
|
|
26767
|
-
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
26768
|
-
*
|
|
26769
|
-
* For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
|
|
26770
|
-
* Examples:
|
|
26771
|
-
* input_ids: [T T T T T], here T is for text.
|
|
26772
|
-
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
26773
|
-
* height position_ids: [0, 1, 2, 3, 4]
|
|
26774
|
-
* width position_ids: [0, 1, 2, 3, 4]
|
|
26775
|
-
*
|
|
26776
|
-
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
26777
|
-
* and 1D rotary position embedding for text part.
|
|
26778
|
-
* Examples:
|
|
26779
|
-
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
26780
|
-
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
26781
|
-
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
26782
|
-
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
26783
|
-
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
26784
|
-
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
26785
|
-
* text height position_ids: [3, 4, 5, 6, 7]
|
|
26786
|
-
* text width position_ids: [3, 4, 5, 6, 7]
|
|
26787
|
-
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
26788
|
-
*
|
|
26789
|
-
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
26790
|
-
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
26791
|
-
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
26792
|
-
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
|
|
26793
|
-
* - 1 for tokens that are **not masked**,
|
|
26794
|
-
* - 0 for tokens that are **masked**.
|
|
26795
|
-
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
|
|
26796
|
-
* - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
|
|
26797
|
-
* - mrope_position_deltas: Tensor of shape `(batch_size)`.
|
|
26798
|
-
*/
|
|
26799
|
-
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
26800
|
-
const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
26801
|
-
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
26802
|
-
const mrope_position_deltas = [];
|
|
26803
|
-
if (image_grid_thw || video_grid_thw) {
|
|
26804
|
-
let total_input_ids = input_ids.tolist();
|
|
26805
|
-
if (!attention_mask) {
|
|
26806
|
-
attention_mask = ones_like(input_ids);
|
|
26807
|
-
}
|
|
26808
|
-
const attention_mask_list = attention_mask.tolist();
|
|
26809
|
-
const position_ids_list = Array.from(
|
|
26810
|
-
{ length: 3 },
|
|
26811
|
-
(_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
|
|
26812
|
-
);
|
|
26813
|
-
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
26814
|
-
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
26815
|
-
let image_index = 0;
|
|
26816
|
-
let video_index = 0;
|
|
26817
|
-
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
26818
|
-
const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
26819
|
-
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
26820
|
-
if (x == vision_start_token_id) acc.push(idx);
|
|
26821
|
-
return acc;
|
|
26822
|
-
}, []);
|
|
26823
|
-
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
26824
|
-
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
26825
|
-
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
26826
|
-
let llm_pos_ids_list = [];
|
|
26827
|
-
let st2 = 0;
|
|
26828
|
-
let remain_images = image_nums;
|
|
26829
|
-
let remain_videos = video_nums;
|
|
26830
|
-
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
26831
|
-
const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
|
|
26832
|
-
const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
|
|
26833
|
-
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
26834
|
-
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
26835
|
-
let ed;
|
|
26836
|
-
let t, h, w;
|
|
26837
|
-
if (ed_image < ed_video) {
|
|
26838
|
-
[t, h, w] = image_grid_thw_list[image_index];
|
|
26839
|
-
++image_index;
|
|
26840
|
-
--remain_images;
|
|
26841
|
-
ed = ed_image;
|
|
26842
|
-
} else {
|
|
26843
|
-
[t, h, w] = video_grid_thw_list[video_index];
|
|
26844
|
-
++video_index;
|
|
26845
|
-
--remain_videos;
|
|
26846
|
-
ed = ed_video;
|
|
26847
|
-
}
|
|
26848
|
-
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
26849
|
-
Number(t),
|
|
26850
|
-
Math.floor(Number(h) / spatial_merge_size),
|
|
26851
|
-
Math.floor(Number(w) / spatial_merge_size)
|
|
26852
|
-
];
|
|
26853
|
-
const text_len = ed - st2;
|
|
26854
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
26855
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
26856
|
-
const offset = text_len + st_idx;
|
|
26857
|
-
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
26858
|
-
const t_index = Array.from(
|
|
26859
|
-
{ length: grid_size },
|
|
26860
|
-
(_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
|
|
26861
|
-
);
|
|
26862
|
-
const h_index = Array.from(
|
|
26863
|
-
{ length: grid_size },
|
|
26864
|
-
(_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
|
|
26865
|
-
);
|
|
26866
|
-
const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
|
|
26867
|
-
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
26868
|
-
st2 = ed + grid_size;
|
|
26869
|
-
}
|
|
26870
|
-
if (st2 < ids.length) {
|
|
26871
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
26872
|
-
const text_len = ids.length - st2;
|
|
26873
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
26874
|
-
}
|
|
26875
|
-
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
26876
|
-
const llm_positions = new Array(num_items);
|
|
26877
|
-
let index = 0;
|
|
26878
|
-
for (let x = 0; x < 3; ++x) {
|
|
26879
|
-
for (let y = 0; y < llm_pos_ids_list.length; ++y) {
|
|
26880
|
-
const val = llm_pos_ids_list[y];
|
|
26881
|
-
const text_len = val.length / 3;
|
|
26882
|
-
for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
|
|
26883
|
-
llm_positions[index++] = val[z];
|
|
26884
|
-
}
|
|
26885
|
-
}
|
|
26886
|
-
}
|
|
26887
|
-
let count2 = 0;
|
|
26888
|
-
const attn_mask = attention_mask_list[i];
|
|
26889
|
-
for (let y = 0; y < attn_mask.length; ++y) {
|
|
26890
|
-
if (attn_mask[y] == 1) {
|
|
26891
|
-
for (let x = 0; x < 3; ++x) {
|
|
26892
|
-
position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
|
|
26893
|
-
}
|
|
26894
|
-
++count2;
|
|
26895
|
-
}
|
|
26896
|
-
}
|
|
26897
|
-
const max_llm_positions = max(llm_positions)[0];
|
|
26898
|
-
mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
|
|
26899
|
-
}
|
|
26900
|
-
return [
|
|
26901
|
-
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
26902
|
-
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
26903
|
-
];
|
|
26904
|
-
} else {
|
|
26905
|
-
if (attention_mask) {
|
|
26906
|
-
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
26907
|
-
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
26908
|
-
const mrope_position_deltas2 = Array.from(
|
|
26909
|
-
{ length: dims[0] },
|
|
26910
|
-
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
26911
|
-
);
|
|
26912
|
-
return [
|
|
26913
|
-
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
26914
|
-
new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
|
|
26915
|
-
];
|
|
26916
|
-
} else {
|
|
26917
|
-
const [batch_size, seq_length] = input_ids.dims;
|
|
26918
|
-
const position_ids = BigInt64Array.from(
|
|
26919
|
-
{ length: 3 * batch_size * seq_length },
|
|
26920
|
-
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
26921
|
-
);
|
|
26922
|
-
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
26923
|
-
}
|
|
26924
|
-
}
|
|
26925
|
-
}
|
|
26926
|
-
async encode_image({ pixel_values, image_grid_thw }) {
|
|
26927
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
26928
|
-
pixel_values,
|
|
26929
|
-
[this.image_grid_thw_name]: image_grid_thw
|
|
26930
|
-
})).image_features;
|
|
26931
|
-
return features;
|
|
26932
|
-
}
|
|
26933
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
26934
|
-
return default_merge_input_ids_with_image_features({
|
|
26935
|
-
// @ts-ignore
|
|
26936
|
-
image_token_id: this.config.image_token_id,
|
|
26937
|
-
...kwargs
|
|
26938
|
-
});
|
|
26939
|
-
}
|
|
26940
|
-
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
26941
|
-
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
26942
|
-
if (!model_inputs.past_key_values) {
|
|
26943
|
-
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
26944
|
-
model_inputs.input_ids,
|
|
26945
|
-
model_inputs.image_grid_thw,
|
|
26946
|
-
model_inputs.video_grid_thw,
|
|
26947
|
-
model_inputs.attention_mask
|
|
26948
|
-
);
|
|
26949
|
-
} else {
|
|
26950
|
-
model_inputs.pixel_values = null;
|
|
26951
|
-
const past_length = getPastLength(model_inputs.past_key_values);
|
|
26952
|
-
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
26953
|
-
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
26954
|
-
model_inputs.input_ids,
|
|
26955
|
-
model_inputs.image_grid_thw,
|
|
26956
|
-
model_inputs.video_grid_thw,
|
|
26957
|
-
model_inputs.attention_mask
|
|
26958
|
-
);
|
|
26959
|
-
model_inputs.rope_deltas = rope_deltas;
|
|
26960
|
-
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
26961
|
-
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
26962
|
-
} else {
|
|
26963
|
-
if (!model_inputs.rope_deltas) {
|
|
26964
|
-
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
26965
|
-
model_inputs.input_ids,
|
|
26966
|
-
model_inputs.image_grid_thw,
|
|
26967
|
-
model_inputs.video_grid_thw,
|
|
26968
|
-
model_inputs.attention_mask
|
|
26969
|
-
);
|
|
26970
|
-
}
|
|
26971
|
-
const delta = BigInt(past_length);
|
|
26972
|
-
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
26973
|
-
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
26974
|
-
}
|
|
26975
|
-
}
|
|
26976
|
-
}
|
|
26977
|
-
return model_inputs;
|
|
26978
|
-
}
|
|
26979
|
-
};
|
|
26980
|
-
|
|
26981
|
-
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
26982
|
-
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
26983
|
-
image_grid_thw_name = "image_grid_thw";
|
|
26984
|
-
};
|
|
26985
|
-
|
|
26986
28117
|
// src/models/qwen3/modeling_qwen3.js
|
|
26987
28118
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
26988
28119
|
};
|
|
@@ -27010,18 +28141,26 @@ var Qwen3NextForCausalLM = class extends Qwen3NextPreTrainedModel {
|
|
|
27010
28141
|
// src/models/qwen3_vl/modeling_qwen3_vl.js
|
|
27011
28142
|
var Qwen3VLForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
27012
28143
|
};
|
|
28144
|
+
var Qwen3VLForCausalLM = class extends Qwen2_5_VLForCausalLM {
|
|
28145
|
+
};
|
|
27013
28146
|
|
|
27014
28147
|
// src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js
|
|
27015
28148
|
var Qwen3VLMoeForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
27016
28149
|
};
|
|
28150
|
+
var Qwen3VLMoeForCausalLM = class extends Qwen3VLForCausalLM {
|
|
28151
|
+
};
|
|
27017
28152
|
|
|
27018
28153
|
// src/models/qwen3_5/modeling_qwen3_5.js
|
|
27019
28154
|
var Qwen3_5ForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
27020
28155
|
};
|
|
28156
|
+
var Qwen3_5ForCausalLM = class extends Qwen3_5ForConditionalGeneration {
|
|
28157
|
+
};
|
|
27021
28158
|
|
|
27022
28159
|
// src/models/qwen3_5_moe/modeling_qwen3_5_moe.js
|
|
27023
28160
|
var Qwen3_5MoeForConditionalGeneration = class extends Qwen3_5ForConditionalGeneration {
|
|
27024
28161
|
};
|
|
28162
|
+
var Qwen3_5MoeForCausalLM = class extends Qwen3_5ForCausalLM {
|
|
28163
|
+
};
|
|
27025
28164
|
|
|
27026
28165
|
// src/models/resnet/modeling_resnet.js
|
|
27027
28166
|
var ResNetPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -27420,6 +28559,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
|
|
|
27420
28559
|
}
|
|
27421
28560
|
};
|
|
27422
28561
|
|
|
28562
|
+
// src/models/solar_open/modeling_solar_open.js
|
|
28563
|
+
var SolarOpenPreTrainedModel = class extends PreTrainedModel {
|
|
28564
|
+
};
|
|
28565
|
+
var SolarOpenModel = class extends SolarOpenPreTrainedModel {
|
|
28566
|
+
};
|
|
28567
|
+
var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
|
|
28568
|
+
};
|
|
28569
|
+
|
|
27423
28570
|
// src/models/speecht5/modeling_speecht5.js
|
|
27424
28571
|
var SpeechT5PreTrainedModel = class extends PreTrainedModel {
|
|
27425
28572
|
};
|
|
@@ -27702,25 +28849,6 @@ var TrOCRPreTrainedModel = class extends PreTrainedModel {
|
|
|
27702
28849
|
var TrOCRForCausalLM = class extends TrOCRPreTrainedModel {
|
|
27703
28850
|
};
|
|
27704
28851
|
|
|
27705
|
-
// src/models/ultravox/modeling_ultravox.js
|
|
27706
|
-
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
27707
|
-
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
27708
|
-
};
|
|
27709
|
-
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
27710
|
-
_merge_input_ids_with_audio_features(kwargs) {
|
|
27711
|
-
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
27712
|
-
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
27713
|
-
return default_merge_input_ids_with_audio_features({
|
|
27714
|
-
// @ts-ignore
|
|
27715
|
-
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id,
|
|
27716
|
-
...kwargs,
|
|
27717
|
-
audio_features: reshaped_audio_features
|
|
27718
|
-
});
|
|
27719
|
-
}
|
|
27720
|
-
};
|
|
27721
|
-
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
27722
|
-
};
|
|
27723
|
-
|
|
27724
28852
|
// src/models/unispeech/modeling_unispeech.js
|
|
27725
28853
|
var UniSpeechPreTrainedModel = class extends PreTrainedModel {
|
|
27726
28854
|
};
|
|
@@ -27886,6 +29014,170 @@ var VitsModel = class extends VitsPreTrainedModel {
|
|
|
27886
29014
|
}
|
|
27887
29015
|
};
|
|
27888
29016
|
|
|
29017
|
+
// src/models/voxtral/modeling_voxtral.js
|
|
29018
|
+
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
29019
|
+
};
|
|
29020
|
+
|
|
29021
|
+
// src/models/voxtral_realtime/modeling_voxtral_realtime.js
|
|
29022
|
+
var CONV1_LEFT_PAD = 2;
|
|
29023
|
+
var CONV2_LEFT_PAD = 1;
|
|
29024
|
+
var states = /* @__PURE__ */ new WeakMap();
|
|
29025
|
+
function createEncoderState(model, input_features) {
|
|
29026
|
+
const { text_config, audio_config } = (
|
|
29027
|
+
/** @type {any} */
|
|
29028
|
+
model.config
|
|
29029
|
+
);
|
|
29030
|
+
const encoder_session = model.sessions["audio_encoder"];
|
|
29031
|
+
const { num_mel_bins, hidden_size: enc_hidden_size } = audio_config;
|
|
29032
|
+
const PADDING_CACHE_CHANNELS = num_mel_bins + enc_hidden_size;
|
|
29033
|
+
const enc_kv_cache = new DynamicCache();
|
|
29034
|
+
const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? "float32";
|
|
29035
|
+
const enc_cls = enc_dtype === "float16" ? DataTypeMap.float16 : DataTypeMap.float32;
|
|
29036
|
+
const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
|
|
29037
|
+
for (const name in enc_shapes) {
|
|
29038
|
+
const size = enc_shapes[name].reduce((a, b) => a * b, 1);
|
|
29039
|
+
enc_kv_cache[name] = new Tensor2(enc_dtype, new enc_cls(size), enc_shapes[name]);
|
|
29040
|
+
}
|
|
29041
|
+
const enc_padding_cache = new Tensor2(enc_dtype, new enc_cls(PADDING_CACHE_CHANNELS * CONV1_LEFT_PAD), [
|
|
29042
|
+
1,
|
|
29043
|
+
PADDING_CACHE_CHANNELS,
|
|
29044
|
+
CONV1_LEFT_PAD
|
|
29045
|
+
]);
|
|
29046
|
+
const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
|
|
29047
|
+
if (!chunks_iter) {
|
|
29048
|
+
throw new Error("input_features must be iterable or async iterable");
|
|
29049
|
+
}
|
|
29050
|
+
return {
|
|
29051
|
+
encoder_session,
|
|
29052
|
+
enc_kv_cache,
|
|
29053
|
+
enc_padding_cache,
|
|
29054
|
+
enc_past_seq_len: 0,
|
|
29055
|
+
audio_embed_queue: [],
|
|
29056
|
+
audio_embed_total_tokens: 0,
|
|
29057
|
+
audio_queue_offset: 0,
|
|
29058
|
+
audio_consumed: 0,
|
|
29059
|
+
stream_exhausted: false,
|
|
29060
|
+
chunks_iter,
|
|
29061
|
+
text_hidden_size: text_config.hidden_size
|
|
29062
|
+
};
|
|
29063
|
+
}
|
|
29064
|
+
async function encodeChunk(s, chunk_features) {
|
|
29065
|
+
const audio_seq_len = chunk_features.dims[2];
|
|
29066
|
+
const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1;
|
|
29067
|
+
const position_ids = new Tensor2(
|
|
29068
|
+
"int64",
|
|
29069
|
+
BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
|
|
29070
|
+
[1, conv2_output_len]
|
|
29071
|
+
);
|
|
29072
|
+
const total_seq_len = s.enc_past_seq_len + conv2_output_len;
|
|
29073
|
+
const attention_mask = ones([1, total_seq_len]);
|
|
29074
|
+
const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
|
|
29075
|
+
input_features: chunk_features,
|
|
29076
|
+
attention_mask,
|
|
29077
|
+
position_ids,
|
|
29078
|
+
past_padding_cache: s.enc_padding_cache,
|
|
29079
|
+
...s.enc_kv_cache
|
|
29080
|
+
});
|
|
29081
|
+
if (s.enc_padding_cache.location === "gpu-buffer") {
|
|
29082
|
+
s.enc_padding_cache.dispose();
|
|
29083
|
+
}
|
|
29084
|
+
s.enc_padding_cache = present_padding_cache;
|
|
29085
|
+
for (const name in present_cache) {
|
|
29086
|
+
if (name.startsWith("present.")) {
|
|
29087
|
+
const pastName = name.replace("present", "past_key_values");
|
|
29088
|
+
const prev = s.enc_kv_cache[pastName];
|
|
29089
|
+
if (prev?.location === "gpu-buffer") {
|
|
29090
|
+
prev.dispose();
|
|
29091
|
+
}
|
|
29092
|
+
s.enc_kv_cache[pastName] = present_cache[name];
|
|
29093
|
+
}
|
|
29094
|
+
}
|
|
29095
|
+
s.enc_past_seq_len = total_seq_len;
|
|
29096
|
+
return audio_embeds;
|
|
29097
|
+
}
|
|
29098
|
+
async function fillAudioBuffer(s, needed) {
|
|
29099
|
+
while (s.audio_embed_total_tokens < needed && !s.stream_exhausted) {
|
|
29100
|
+
const result = await s.chunks_iter.next();
|
|
29101
|
+
if (result.done) {
|
|
29102
|
+
s.stream_exhausted = true;
|
|
29103
|
+
break;
|
|
29104
|
+
}
|
|
29105
|
+
const new_embeds = await encodeChunk(s, result.value);
|
|
29106
|
+
s.audio_embed_queue.push({ data: new_embeds.data, tokens: new_embeds.dims[1] });
|
|
29107
|
+
s.audio_embed_total_tokens += new_embeds.dims[1];
|
|
29108
|
+
}
|
|
29109
|
+
}
|
|
29110
|
+
function addAudioEmbeddings(s, inputs_embeds, current_len) {
|
|
29111
|
+
if (s.audio_embed_queue.length === 0) return;
|
|
29112
|
+
const embed_data = inputs_embeds.data;
|
|
29113
|
+
let embed_write_pos = 0;
|
|
29114
|
+
let remaining = current_len;
|
|
29115
|
+
while (remaining > 0 && s.audio_embed_queue.length > 0) {
|
|
29116
|
+
const front = s.audio_embed_queue[0];
|
|
29117
|
+
const available = front.tokens - s.audio_queue_offset;
|
|
29118
|
+
const n = Math.min(remaining, available);
|
|
29119
|
+
const src_offset = s.audio_queue_offset * s.text_hidden_size;
|
|
29120
|
+
for (let i = 0; i < n * s.text_hidden_size; ++i) {
|
|
29121
|
+
embed_data[embed_write_pos * s.text_hidden_size + i] += front.data[src_offset + i];
|
|
29122
|
+
}
|
|
29123
|
+
embed_write_pos += n;
|
|
29124
|
+
remaining -= n;
|
|
29125
|
+
s.audio_queue_offset += n;
|
|
29126
|
+
if (s.audio_queue_offset >= front.tokens) {
|
|
29127
|
+
s.audio_embed_queue.shift();
|
|
29128
|
+
s.audio_queue_offset = 0;
|
|
29129
|
+
}
|
|
29130
|
+
}
|
|
29131
|
+
s.audio_consumed += current_len - remaining;
|
|
29132
|
+
}
|
|
29133
|
+
var AudioExhaustedCriteria = class extends StoppingCriteria {
|
|
29134
|
+
constructor(enc_state) {
|
|
29135
|
+
super();
|
|
29136
|
+
this._s = enc_state;
|
|
29137
|
+
}
|
|
29138
|
+
_call(input_ids) {
|
|
29139
|
+
const done = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
|
|
29140
|
+
return input_ids.map(() => done);
|
|
29141
|
+
}
|
|
29142
|
+
};
|
|
29143
|
+
var VoxtralRealtimePreTrainedModel = class extends PreTrainedModel {
|
|
29144
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "past_key_values"];
|
|
29145
|
+
};
|
|
29146
|
+
var VoxtralRealtimeForConditionalGeneration = class extends VoxtralRealtimePreTrainedModel {
|
|
29147
|
+
async forward({ input_ids, past_key_values, ...kwargs }) {
|
|
29148
|
+
const current_len = input_ids.dims[1];
|
|
29149
|
+
const enc = states.get(this);
|
|
29150
|
+
if (enc) {
|
|
29151
|
+
await fillAudioBuffer(enc, enc.audio_consumed + current_len);
|
|
29152
|
+
}
|
|
29153
|
+
const { inputs_embeds } = await sessionRun(this.sessions["embed_tokens"], { input_ids });
|
|
29154
|
+
if (enc) {
|
|
29155
|
+
addAudioEmbeddings(enc, inputs_embeds, current_len);
|
|
29156
|
+
}
|
|
29157
|
+
const decoder_feeds = { inputs_embeds, ...kwargs };
|
|
29158
|
+
this.addPastKeyValues(decoder_feeds, past_key_values);
|
|
29159
|
+
const session = this.sessions["decoder_model_merged"];
|
|
29160
|
+
const fixed = pick(decoder_feeds, session.inputNames);
|
|
29161
|
+
return await sessionRun(session, fixed);
|
|
29162
|
+
}
|
|
29163
|
+
async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
|
|
29164
|
+
if (!input_features) {
|
|
29165
|
+
throw new Error("input_features (generator/iterable) must be provided");
|
|
29166
|
+
}
|
|
29167
|
+
const enc_state = createEncoderState(this, input_features);
|
|
29168
|
+
states.set(this, enc_state);
|
|
29169
|
+
const stopping_criteria = new StoppingCriteriaList();
|
|
29170
|
+
stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
|
|
29171
|
+
if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);
|
|
29172
|
+
try {
|
|
29173
|
+
return await super.generate({ ...kwargs, stopping_criteria });
|
|
29174
|
+
} finally {
|
|
29175
|
+
enc_state.enc_kv_cache.dispose();
|
|
29176
|
+
states.delete(this);
|
|
29177
|
+
}
|
|
29178
|
+
}
|
|
29179
|
+
};
|
|
29180
|
+
|
|
27889
29181
|
// src/models/wav2vec2_bert/modeling_wav2vec2_bert.js
|
|
27890
29182
|
var Wav2Vec2BertPreTrainedModel = class extends PreTrainedModel {
|
|
27891
29183
|
};
|
|
@@ -28391,6 +29683,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
|
|
|
28391
29683
|
// src/models/registry.js
|
|
28392
29684
|
var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
|
|
28393
29685
|
["bert", "BertModel"],
|
|
29686
|
+
["eurobert", "EuroBertModel"],
|
|
28394
29687
|
["neobert", "NeoBertModel"],
|
|
28395
29688
|
["modernbert", "ModernBertModel"],
|
|
28396
29689
|
["nomic_bert", "NomicBertModel"],
|
|
@@ -28522,6 +29815,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
28522
29815
|
["gemma3_text", "Gemma3Model"],
|
|
28523
29816
|
["helium", "HeliumModel"],
|
|
28524
29817
|
["glm", "GlmModel"],
|
|
29818
|
+
["glm_moe_dsa", "GlmMoeDsaModel"],
|
|
28525
29819
|
["openelm", "OpenELMModel"],
|
|
28526
29820
|
["qwen2", "Qwen2Model"],
|
|
28527
29821
|
["qwen2_moe", "Qwen2MoeModel"],
|
|
@@ -28533,12 +29827,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
28533
29827
|
["mpt", "MptModel"],
|
|
28534
29828
|
["opt", "OPTModel"],
|
|
28535
29829
|
["mistral", "MistralModel"],
|
|
29830
|
+
["mistral4", "Mistral4Model"],
|
|
28536
29831
|
["ministral", "MinistralModel"],
|
|
28537
29832
|
["ministral3", "Ministral3Model"],
|
|
28538
29833
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
28539
29834
|
["starcoder2", "Starcoder2Model"],
|
|
29835
|
+
["deepseek_v3", "DeepseekV3Model"],
|
|
28540
29836
|
["falcon", "FalconModel"],
|
|
28541
29837
|
["falcon_h1", "FalconH1Model"],
|
|
29838
|
+
["nemotron_h", "NemotronHModel"],
|
|
29839
|
+
["solar_open", "SolarOpenModel"],
|
|
28542
29840
|
["stablelm", "StableLmModel"],
|
|
28543
29841
|
["modernbert-decoder", "ModernBertDecoderModel"],
|
|
28544
29842
|
["hunyuan_v1_dense", "HunYuanDenseV1Model"],
|
|
@@ -28558,6 +29856,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
28558
29856
|
]);
|
|
28559
29857
|
var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
28560
29858
|
["bert", "BertForSequenceClassification"],
|
|
29859
|
+
["eurobert", "EuroBertForSequenceClassification"],
|
|
28561
29860
|
["neobert", "NeoBertForSequenceClassification"],
|
|
28562
29861
|
["modernbert", "ModernBertForSequenceClassification"],
|
|
28563
29862
|
["roformer", "RoFormerForSequenceClassification"],
|
|
@@ -28580,6 +29879,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
28580
29879
|
]);
|
|
28581
29880
|
var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
28582
29881
|
["bert", "BertForTokenClassification"],
|
|
29882
|
+
["eurobert", "EuroBertForTokenClassification"],
|
|
28583
29883
|
["neobert", "NeoBertForTokenClassification"],
|
|
28584
29884
|
["modernbert", "ModernBertForTokenClassification"],
|
|
28585
29885
|
["roformer", "RoFormerForTokenClassification"],
|
|
@@ -28639,27 +29939,40 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
28639
29939
|
["gemma2", "Gemma2ForCausalLM"],
|
|
28640
29940
|
["vaultgemma", "VaultGemmaForCausalLM"],
|
|
28641
29941
|
["gemma3_text", "Gemma3ForCausalLM"],
|
|
29942
|
+
["gemma3", "Gemma3ForCausalLM"],
|
|
28642
29943
|
["helium", "HeliumForCausalLM"],
|
|
28643
29944
|
["glm", "GlmForCausalLM"],
|
|
29945
|
+
["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
|
|
28644
29946
|
["openelm", "OpenELMForCausalLM"],
|
|
28645
29947
|
["qwen2", "Qwen2ForCausalLM"],
|
|
28646
29948
|
["qwen2_moe", "Qwen2MoeForCausalLM"],
|
|
28647
29949
|
["qwen3", "Qwen3ForCausalLM"],
|
|
28648
29950
|
["qwen3_moe", "Qwen3MoeForCausalLM"],
|
|
28649
29951
|
["qwen3_next", "Qwen3NextForCausalLM"],
|
|
29952
|
+
["qwen2_vl", "Qwen2VLForCausalLM"],
|
|
29953
|
+
["qwen2_5_vl", "Qwen2_5_VLForCausalLM"],
|
|
29954
|
+
["qwen3_vl", "Qwen3VLForCausalLM"],
|
|
29955
|
+
["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
|
|
29956
|
+
["qwen3_5", "Qwen3_5ForCausalLM"],
|
|
29957
|
+
["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
|
|
29958
|
+
["gemma3n", "Gemma3nForCausalLM"],
|
|
28650
29959
|
["phi", "PhiForCausalLM"],
|
|
28651
29960
|
["phi3", "Phi3ForCausalLM"],
|
|
28652
29961
|
["mpt", "MptForCausalLM"],
|
|
28653
29962
|
["opt", "OPTForCausalLM"],
|
|
28654
29963
|
["mbart", "MBartForCausalLM"],
|
|
28655
29964
|
["mistral", "MistralForCausalLM"],
|
|
29965
|
+
["mistral4", "Mistral4ForCausalLM"],
|
|
28656
29966
|
["ministral", "MinistralForCausalLM"],
|
|
28657
29967
|
["ministral3", "Ministral3ForCausalLM"],
|
|
28658
29968
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
28659
29969
|
["starcoder2", "Starcoder2ForCausalLM"],
|
|
29970
|
+
["deepseek_v3", "DeepseekV3ForCausalLM"],
|
|
28660
29971
|
["falcon", "FalconForCausalLM"],
|
|
28661
29972
|
["falcon_h1", "FalconH1ForCausalLM"],
|
|
29973
|
+
["nemotron_h", "NemotronHForCausalLM"],
|
|
28662
29974
|
["trocr", "TrOCRForCausalLM"],
|
|
29975
|
+
["solar_open", "SolarOpenForCausalLM"],
|
|
28663
29976
|
["stablelm", "StableLmForCausalLM"],
|
|
28664
29977
|
["modernbert-decoder", "ModernBertDecoderForCausalLM"],
|
|
28665
29978
|
["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
|
|
@@ -28670,6 +29983,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
28670
29983
|
var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
|
|
28671
29984
|
var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
28672
29985
|
["bert", "BertForMaskedLM"],
|
|
29986
|
+
["eurobert", "EuroBertForMaskedLM"],
|
|
28673
29987
|
["neobert", "NeoBertForMaskedLM"],
|
|
28674
29988
|
["modernbert", "ModernBertForMaskedLM"],
|
|
28675
29989
|
["roformer", "RoFormerForMaskedLM"],
|
|
@@ -28722,16 +30036,21 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
28722
30036
|
["qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"],
|
|
28723
30037
|
["qwen3_5", "Qwen3_5ForConditionalGeneration"],
|
|
28724
30038
|
["qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"],
|
|
30039
|
+
["lfm2_vl", "Lfm2VlForConditionalGeneration"],
|
|
28725
30040
|
["idefics3", "Idefics3ForConditionalGeneration"],
|
|
28726
30041
|
["smolvlm", "SmolVLMForConditionalGeneration"],
|
|
28727
30042
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
28728
30043
|
["llava_qwen2", "LlavaQwen2ForCausalLM"],
|
|
28729
30044
|
["gemma3n", "Gemma3nForConditionalGeneration"],
|
|
28730
|
-
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
30045
|
+
["mistral3", "Mistral3ForConditionalGeneration"],
|
|
30046
|
+
["lighton_ocr", "LightOnOcrForConditionalGeneration"],
|
|
30047
|
+
["glm_ocr", "GlmOcrForConditionalGeneration"]
|
|
28731
30048
|
]);
|
|
28732
30049
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30050
|
+
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
28733
30051
|
["ultravox", "UltravoxModel"],
|
|
28734
|
-
["voxtral", "VoxtralForConditionalGeneration"]
|
|
30052
|
+
["voxtral", "VoxtralForConditionalGeneration"],
|
|
30053
|
+
["voxtral_realtime", "VoxtralRealtimeForConditionalGeneration"]
|
|
28735
30054
|
]);
|
|
28736
30055
|
var MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
28737
30056
|
["vision-encoder-decoder", "VisionEncoderDecoderModel"]
|
|
@@ -28830,6 +30149,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
28830
30149
|
]);
|
|
28831
30150
|
var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
|
|
28832
30151
|
var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30152
|
+
["chmv2", "CHMv2ForDepthEstimation"],
|
|
28833
30153
|
["dpt", "DPTForDepthEstimation"],
|
|
28834
30154
|
["depth_anything", "DepthAnythingForDepthEstimation"],
|
|
28835
30155
|
["glpn", "GLPNForDepthEstimation"],
|
|
@@ -28914,7 +30234,19 @@ var CUSTOM_MAPPING = [
|
|
|
28914
30234
|
MODEL_TYPES.ImageAudioTextToText
|
|
28915
30235
|
],
|
|
28916
30236
|
["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
|
|
28917
|
-
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox]
|
|
30237
|
+
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
|
|
30238
|
+
["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30239
|
+
["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30240
|
+
["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30241
|
+
["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30242
|
+
["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30243
|
+
["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30244
|
+
["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30245
|
+
[
|
|
30246
|
+
"VoxtralRealtimeForConditionalGeneration",
|
|
30247
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
30248
|
+
MODEL_TYPES.VoxtralRealtime
|
|
30249
|
+
]
|
|
28918
30250
|
];
|
|
28919
30251
|
for (const [name, model, type] of CUSTOM_MAPPING) {
|
|
28920
30252
|
MODEL_TYPE_MAPPING.set(name, type);
|
|
@@ -30592,8 +31924,18 @@ var TASK_ALIASES = Object.freeze({
|
|
|
30592
31924
|
});
|
|
30593
31925
|
|
|
30594
31926
|
// src/utils/model_registry/get_model_files.js
|
|
31927
|
+
function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
|
|
31928
|
+
if (config !== null) {
|
|
31929
|
+
return AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision });
|
|
31930
|
+
}
|
|
31931
|
+
const key = JSON.stringify([modelId, cache_dir, local_files_only, revision]);
|
|
31932
|
+
return memoizePromise(
|
|
31933
|
+
key,
|
|
31934
|
+
() => AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision })
|
|
31935
|
+
);
|
|
31936
|
+
}
|
|
30595
31937
|
async function get_model_files(modelId, { config = null, dtype: overrideDtype = null, device: overrideDevice = null, model_file_name = null } = {}) {
|
|
30596
|
-
config = await
|
|
31938
|
+
config = await get_config(modelId, { config });
|
|
30597
31939
|
const files = [
|
|
30598
31940
|
// Add config.json (always loaded)
|
|
30599
31941
|
"config.json"
|
|
@@ -30654,74 +31996,14 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
|
|
|
30654
31996
|
files.push(dataFilePath);
|
|
30655
31997
|
}
|
|
30656
31998
|
};
|
|
30657
|
-
const
|
|
30658
|
-
|
|
30659
|
-
add_model_file(
|
|
30660
|
-
|
|
30661
|
-
|
|
30662
|
-
|
|
30663
|
-
|
|
30664
|
-
|
|
30665
|
-
add_model_file("decoder_model_merged");
|
|
30666
|
-
files.push("generation_config.json");
|
|
30667
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
30668
|
-
add_model_file("model", "vision_encoder");
|
|
30669
|
-
add_model_file("prompt_encoder_mask_decoder");
|
|
30670
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
30671
|
-
add_model_file("model", "encoder_model");
|
|
30672
|
-
add_model_file("decoder_model_merged");
|
|
30673
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
30674
|
-
add_model_file("embed_tokens");
|
|
30675
|
-
add_model_file("vision_encoder");
|
|
30676
|
-
add_model_file("decoder_model_merged");
|
|
30677
|
-
if (config.is_encoder_decoder) {
|
|
30678
|
-
add_model_file("model", "encoder_model");
|
|
30679
|
-
}
|
|
30680
|
-
files.push("generation_config.json");
|
|
30681
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
30682
|
-
add_model_file("embed_tokens");
|
|
30683
|
-
add_model_file("audio_encoder");
|
|
30684
|
-
add_model_file("decoder_model_merged");
|
|
30685
|
-
files.push("generation_config.json");
|
|
30686
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
30687
|
-
add_model_file("embed_tokens");
|
|
30688
|
-
add_model_file("audio_encoder");
|
|
30689
|
-
add_model_file("vision_encoder");
|
|
30690
|
-
add_model_file("decoder_model_merged");
|
|
30691
|
-
files.push("generation_config.json");
|
|
30692
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
30693
|
-
add_model_file("model", "text_encoder");
|
|
30694
|
-
add_model_file("decoder_model_merged");
|
|
30695
|
-
add_model_file("encodec_decode");
|
|
30696
|
-
files.push("generation_config.json");
|
|
30697
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
30698
|
-
add_model_file("prepare_inputs_embeds");
|
|
30699
|
-
add_model_file("model", "language_model");
|
|
30700
|
-
add_model_file("lm_head");
|
|
30701
|
-
add_model_file("gen_head");
|
|
30702
|
-
add_model_file("gen_img_embeds");
|
|
30703
|
-
add_model_file("image_decode");
|
|
30704
|
-
files.push("generation_config.json");
|
|
30705
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
30706
|
-
add_model_file("prepare_inputs_embeds");
|
|
30707
|
-
add_model_file("model");
|
|
30708
|
-
add_model_file("vision_encoder");
|
|
30709
|
-
files.push("generation_config.json");
|
|
30710
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
30711
|
-
add_model_file("embed_tokens");
|
|
30712
|
-
add_model_file("speech_encoder");
|
|
30713
|
-
add_model_file("model", "language_model");
|
|
30714
|
-
add_model_file("conditional_decoder");
|
|
30715
|
-
files.push("generation_config.json");
|
|
30716
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
30717
|
-
add_model_file("encoder_model");
|
|
30718
|
-
add_model_file("decoder_model");
|
|
30719
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
30720
|
-
add_model_file("text_encoder");
|
|
30721
|
-
add_model_file("latent_denoiser");
|
|
30722
|
-
add_model_file("voice_decoder");
|
|
30723
|
-
} else {
|
|
30724
|
-
add_model_file("model", singleModelName);
|
|
31999
|
+
const { sessions, optional_configs } = getSessionsConfig(modelType, config, { model_file_name });
|
|
32000
|
+
for (const [sessionKey, baseName] of Object.entries(sessions)) {
|
|
32001
|
+
add_model_file(sessionKey, baseName);
|
|
32002
|
+
}
|
|
32003
|
+
if (optional_configs) {
|
|
32004
|
+
for (const configFile of Object.values(optional_configs)) {
|
|
32005
|
+
files.push(configFile);
|
|
32006
|
+
}
|
|
30725
32007
|
}
|
|
30726
32008
|
return files;
|
|
30727
32009
|
}
|
|
@@ -31172,25 +32454,25 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
|
|
|
31172
32454
|
|
|
31173
32455
|
// src/utils/model_registry/is_cached.js
|
|
31174
32456
|
async function check_files_cache(modelId, files, options = {}) {
|
|
31175
|
-
const
|
|
31176
|
-
if (!
|
|
32457
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
32458
|
+
if (!cache2) {
|
|
31177
32459
|
const fileStatuses2 = files.map((filename) => ({ file: filename, cached: false }));
|
|
31178
32460
|
return { allCached: false, files: fileStatuses2 };
|
|
31179
32461
|
}
|
|
31180
32462
|
const fileStatuses = await Promise.all(
|
|
31181
32463
|
files.map(async (filename) => {
|
|
31182
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
31183
|
-
const cached = await checkCachedResource(
|
|
32464
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
32465
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
31184
32466
|
return { file: filename, cached: !!cached };
|
|
31185
32467
|
})
|
|
31186
32468
|
);
|
|
31187
32469
|
return { allCached: fileStatuses.every((f) => f.cached), files: fileStatuses };
|
|
31188
32470
|
}
|
|
31189
32471
|
async function is_file_cached(modelId, filename, options = {}) {
|
|
31190
|
-
const
|
|
31191
|
-
if (!
|
|
31192
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
31193
|
-
return !!await checkCachedResource(
|
|
32472
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
32473
|
+
if (!cache2) return false;
|
|
32474
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
32475
|
+
return !!await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
31194
32476
|
}
|
|
31195
32477
|
async function is_cached(modelId, options = {}) {
|
|
31196
32478
|
if (!modelId) {
|
|
@@ -31237,26 +32519,26 @@ async function is_pipeline_cached_files(task, modelId, options = {}) {
|
|
|
31237
32519
|
|
|
31238
32520
|
// src/utils/model_registry/clear_cache.js
|
|
31239
32521
|
async function clear_files_from_cache(modelId, files, options = {}) {
|
|
31240
|
-
const
|
|
31241
|
-
if (!
|
|
32522
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
32523
|
+
if (!cache2) {
|
|
31242
32524
|
return {
|
|
31243
32525
|
filesDeleted: 0,
|
|
31244
32526
|
filesCached: 0,
|
|
31245
32527
|
files: files.map((filename) => ({ file: filename, deleted: false, wasCached: false }))
|
|
31246
32528
|
};
|
|
31247
32529
|
}
|
|
31248
|
-
if (!
|
|
32530
|
+
if (!cache2.delete) {
|
|
31249
32531
|
throw new Error("Cache does not support delete operation");
|
|
31250
32532
|
}
|
|
31251
32533
|
const results = await Promise.all(
|
|
31252
32534
|
files.map(async (filename) => {
|
|
31253
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
31254
|
-
const cached = await checkCachedResource(
|
|
32535
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
32536
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
31255
32537
|
const wasCached = !!cached;
|
|
31256
32538
|
let deleted = false;
|
|
31257
32539
|
if (wasCached) {
|
|
31258
|
-
const deletedWithProposed = await
|
|
31259
|
-
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await
|
|
32540
|
+
const deletedWithProposed = await cache2.delete(proposedCacheKey);
|
|
32541
|
+
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache2.delete(localPath) : false;
|
|
31260
32542
|
deleted = deletedWithProposed || deletedWithLocal;
|
|
31261
32543
|
}
|
|
31262
32544
|
return { file: filename, deleted, wasCached };
|
|
@@ -31606,6 +32888,9 @@ export {
|
|
|
31606
32888
|
BloomModel,
|
|
31607
32889
|
BloomPreTrainedModel,
|
|
31608
32890
|
BloomTokenizer,
|
|
32891
|
+
CHMv2ForDepthEstimation,
|
|
32892
|
+
CHMv2ImageProcessor,
|
|
32893
|
+
CHMv2PreTrainedModel,
|
|
31609
32894
|
CLIPFeatureExtractor,
|
|
31610
32895
|
CLIPImageProcessor,
|
|
31611
32896
|
CLIPModel,
|
|
@@ -31701,6 +32986,9 @@ export {
|
|
|
31701
32986
|
DebertaV2Tokenizer,
|
|
31702
32987
|
DecisionTransformerModel,
|
|
31703
32988
|
DecisionTransformerPreTrainedModel,
|
|
32989
|
+
DeepseekV3ForCausalLM,
|
|
32990
|
+
DeepseekV3Model,
|
|
32991
|
+
DeepseekV3PreTrainedModel,
|
|
31704
32992
|
DeiTFeatureExtractor,
|
|
31705
32993
|
DeiTForImageClassification,
|
|
31706
32994
|
DeiTImageProcessor,
|
|
@@ -31737,6 +33025,7 @@ export {
|
|
|
31737
33025
|
DonutImageProcessor,
|
|
31738
33026
|
DonutSwinModel,
|
|
31739
33027
|
DonutSwinPreTrainedModel,
|
|
33028
|
+
DynamicCache,
|
|
31740
33029
|
EdgeTamModel,
|
|
31741
33030
|
EfficientNetForImageClassification,
|
|
31742
33031
|
EfficientNetImageProcessor,
|
|
@@ -31760,6 +33049,11 @@ export {
|
|
|
31760
33049
|
EsmModel,
|
|
31761
33050
|
EsmPreTrainedModel,
|
|
31762
33051
|
EsmTokenizer,
|
|
33052
|
+
EuroBertForMaskedLM,
|
|
33053
|
+
EuroBertForSequenceClassification,
|
|
33054
|
+
EuroBertForTokenClassification,
|
|
33055
|
+
EuroBertModel,
|
|
33056
|
+
EuroBertPreTrainedModel,
|
|
31763
33057
|
ExaoneForCausalLM,
|
|
31764
33058
|
ExaoneModel,
|
|
31765
33059
|
ExaonePreTrainedModel,
|
|
@@ -31809,6 +33103,7 @@ export {
|
|
|
31809
33103
|
Gemma3Model,
|
|
31810
33104
|
Gemma3PreTrainedModel,
|
|
31811
33105
|
Gemma3nAudioFeatureExtractor,
|
|
33106
|
+
Gemma3nForCausalLM,
|
|
31812
33107
|
Gemma3nForConditionalGeneration,
|
|
31813
33108
|
Gemma3nPreTrainedModel,
|
|
31814
33109
|
Gemma3nProcessor,
|
|
@@ -31816,8 +33111,14 @@ export {
|
|
|
31816
33111
|
GemmaModel,
|
|
31817
33112
|
GemmaPreTrainedModel,
|
|
31818
33113
|
GemmaTokenizer,
|
|
33114
|
+
Glm46VImageProcessor,
|
|
33115
|
+
Glm46VProcessor,
|
|
31819
33116
|
GlmForCausalLM,
|
|
31820
33117
|
GlmModel,
|
|
33118
|
+
GlmMoeDsaForCausalLM,
|
|
33119
|
+
GlmMoeDsaModel,
|
|
33120
|
+
GlmMoeDsaPreTrainedModel,
|
|
33121
|
+
GlmOcrForConditionalGeneration,
|
|
31821
33122
|
GlmPreTrainedModel,
|
|
31822
33123
|
GptOssForCausalLM,
|
|
31823
33124
|
GptOssModel,
|
|
@@ -31828,6 +33129,9 @@ export {
|
|
|
31828
33129
|
GraniteMoeHybridModel,
|
|
31829
33130
|
GraniteMoeHybridPreTrainedModel,
|
|
31830
33131
|
GranitePreTrainedModel,
|
|
33132
|
+
GraniteSpeechFeatureExtractor,
|
|
33133
|
+
GraniteSpeechForConditionalGeneration,
|
|
33134
|
+
GraniteSpeechProcessor,
|
|
31831
33135
|
GroundingDinoForObjectDetection,
|
|
31832
33136
|
GroundingDinoImageProcessor,
|
|
31833
33137
|
GroundingDinoPreTrainedModel,
|
|
@@ -31853,7 +33157,6 @@ export {
|
|
|
31853
33157
|
IJepaPreTrainedModel,
|
|
31854
33158
|
Idefics3ForConditionalGeneration,
|
|
31855
33159
|
Idefics3ImageProcessor,
|
|
31856
|
-
Idefics3PreTrainedModel,
|
|
31857
33160
|
Idefics3Processor,
|
|
31858
33161
|
ImageClassificationPipeline,
|
|
31859
33162
|
ImageFeatureExtractionPipeline,
|
|
@@ -31878,6 +33181,10 @@ export {
|
|
|
31878
33181
|
Lfm2MoeModel,
|
|
31879
33182
|
Lfm2MoePreTrainedModel,
|
|
31880
33183
|
Lfm2PreTrainedModel,
|
|
33184
|
+
Lfm2VlForConditionalGeneration,
|
|
33185
|
+
Lfm2VlImageProcessor,
|
|
33186
|
+
Lfm2VlProcessor,
|
|
33187
|
+
LightOnOcrForConditionalGeneration,
|
|
31881
33188
|
LiteWhisperForConditionalGeneration,
|
|
31882
33189
|
Llama4ForCausalLM,
|
|
31883
33190
|
Llama4PreTrainedModel,
|
|
@@ -31947,6 +33254,9 @@ export {
|
|
|
31947
33254
|
MimiPreTrainedModel,
|
|
31948
33255
|
MinLengthLogitsProcessor,
|
|
31949
33256
|
MinNewTokensLengthLogitsProcessor,
|
|
33257
|
+
Mistral4ForCausalLM,
|
|
33258
|
+
Mistral4Model,
|
|
33259
|
+
Mistral4PreTrainedModel,
|
|
31950
33260
|
MistralForCausalLM,
|
|
31951
33261
|
MistralModel,
|
|
31952
33262
|
MistralPreTrainedModel,
|
|
@@ -32018,6 +33328,9 @@ export {
|
|
|
32018
33328
|
NanoChatForCausalLM,
|
|
32019
33329
|
NanoChatModel,
|
|
32020
33330
|
NanoChatPreTrainedModel,
|
|
33331
|
+
NemotronHForCausalLM,
|
|
33332
|
+
NemotronHModel,
|
|
33333
|
+
NemotronHPreTrainedModel,
|
|
32021
33334
|
NeoBertForMaskedLM,
|
|
32022
33335
|
NeoBertForQuestionAnswering,
|
|
32023
33336
|
NeoBertForSequenceClassification,
|
|
@@ -32061,7 +33374,6 @@ export {
|
|
|
32061
33374
|
Owlv2Model,
|
|
32062
33375
|
Owlv2PreTrainedModel,
|
|
32063
33376
|
PaliGemmaForConditionalGeneration,
|
|
32064
|
-
PaliGemmaPreTrainedModel,
|
|
32065
33377
|
PaliGemmaProcessor,
|
|
32066
33378
|
ParakeetFeatureExtractor,
|
|
32067
33379
|
ParakeetForCTC,
|
|
@@ -32105,10 +33417,12 @@ export {
|
|
|
32105
33417
|
Qwen2MoePreTrainedModel,
|
|
32106
33418
|
Qwen2PreTrainedModel,
|
|
32107
33419
|
Qwen2Tokenizer,
|
|
33420
|
+
Qwen2VLForCausalLM,
|
|
32108
33421
|
Qwen2VLForConditionalGeneration,
|
|
32109
33422
|
Qwen2VLImageProcessor,
|
|
32110
33423
|
Qwen2VLPreTrainedModel,
|
|
32111
33424
|
Qwen2VLProcessor,
|
|
33425
|
+
Qwen2_5_VLForCausalLM,
|
|
32112
33426
|
Qwen2_5_VLForConditionalGeneration,
|
|
32113
33427
|
Qwen2_5_VLProcessor,
|
|
32114
33428
|
Qwen3ForCausalLM,
|
|
@@ -32120,10 +33434,14 @@ export {
|
|
|
32120
33434
|
Qwen3NextModel,
|
|
32121
33435
|
Qwen3NextPreTrainedModel,
|
|
32122
33436
|
Qwen3PreTrainedModel,
|
|
33437
|
+
Qwen3VLForCausalLM,
|
|
32123
33438
|
Qwen3VLForConditionalGeneration,
|
|
33439
|
+
Qwen3VLMoeForCausalLM,
|
|
32124
33440
|
Qwen3VLMoeForConditionalGeneration,
|
|
32125
33441
|
Qwen3VLProcessor,
|
|
33442
|
+
Qwen3_5ForCausalLM,
|
|
32126
33443
|
Qwen3_5ForConditionalGeneration,
|
|
33444
|
+
Qwen3_5MoeForCausalLM,
|
|
32127
33445
|
Qwen3_5MoeForConditionalGeneration,
|
|
32128
33446
|
RFDetrForObjectDetection,
|
|
32129
33447
|
RFDetrModel,
|
|
@@ -32195,7 +33513,6 @@ export {
|
|
|
32195
33513
|
SmolLM3ForCausalLM,
|
|
32196
33514
|
SmolLM3Model,
|
|
32197
33515
|
SmolLM3PreTrainedModel,
|
|
32198
|
-
SmolVLMForConditionalGeneration,
|
|
32199
33516
|
Idefics3ImageProcessor as SmolVLMImageProcessor,
|
|
32200
33517
|
Idefics3Processor as SmolVLMProcessor,
|
|
32201
33518
|
SnacDecoderModel,
|
|
@@ -32203,6 +33520,9 @@ export {
|
|
|
32203
33520
|
SnacFeatureExtractor,
|
|
32204
33521
|
SnacModel,
|
|
32205
33522
|
SnacPreTrainedModel,
|
|
33523
|
+
SolarOpenForCausalLM,
|
|
33524
|
+
SolarOpenModel,
|
|
33525
|
+
SolarOpenPreTrainedModel,
|
|
32206
33526
|
SpeechT5FeatureExtractor,
|
|
32207
33527
|
SpeechT5ForSpeechToText,
|
|
32208
33528
|
SpeechT5ForTextToSpeech,
|
|
@@ -32301,6 +33621,10 @@ export {
|
|
|
32301
33621
|
VitsTokenizer,
|
|
32302
33622
|
VoxtralForConditionalGeneration,
|
|
32303
33623
|
VoxtralProcessor,
|
|
33624
|
+
VoxtralRealtimeFeatureExtractor,
|
|
33625
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
33626
|
+
VoxtralRealtimePreTrainedModel,
|
|
33627
|
+
VoxtralRealtimeProcessor,
|
|
32304
33628
|
Wav2Vec2BertForCTC,
|
|
32305
33629
|
Wav2Vec2BertForSequenceClassification,
|
|
32306
33630
|
Wav2Vec2BertModel,
|
|
@@ -32396,7 +33720,7 @@ export {
|
|
|
32396
33720
|
|
|
32397
33721
|
onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
|
|
32398
33722
|
(*!
|
|
32399
|
-
* ONNX Runtime Web v1.25.0-dev.
|
|
33723
|
+
* ONNX Runtime Web v1.25.0-dev.20260307-d626b568e0
|
|
32400
33724
|
* Copyright (c) Microsoft Corporation. All rights reserved.
|
|
32401
33725
|
* Licensed under the MIT License.
|
|
32402
33726
|
*)
|