@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +2255 -931
- package/dist/transformers.min.js +19 -19
- package/dist/transformers.node.cjs +2300 -934
- package/dist/transformers.node.min.cjs +20 -20
- package/dist/transformers.node.min.mjs +20 -20
- package/dist/transformers.node.mjs +2336 -1012
- package/dist/transformers.web.js +2327 -1003
- package/dist/transformers.web.min.js +17 -17
- package/package.json +4 -4
- package/src/cache_utils.js +62 -0
- package/src/configs.js +45 -24
- package/src/env.js +8 -1
- package/src/image_processors_utils.js +27 -17
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/chmv2/image_processing_chmv2.js +3 -0
- package/src/models/chmv2/modeling_chmv2.js +4 -0
- package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/eurobert/modeling_eurobert.js +41 -0
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/glm46v/image_processing_glm46v.js +12 -0
- package/src/models/glm46v/processing_glm46v.js +5 -0
- package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
- package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +3 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/mistral4/modeling_mistral4.js +5 -0
- package/src/models/modeling_utils.js +224 -308
- package/src/models/models.js +14 -1
- package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +4 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +194 -143
- package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +2 -1
- package/src/models/registry.js +42 -0
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/solar_open/modeling_solar_open.js +5 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/pipelines.js +1 -0
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +5 -0
- package/src/utils/hub.js +4 -1
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/get_file_metadata.js +15 -2
- package/src/utils/model_registry/get_model_files.js +52 -78
- package/src/utils/tensor.js +18 -2
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +8 -0
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +18 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
- package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
- package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
- package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
- package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
- package/types/models/glm46v/processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +3 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
- package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
- package/types/models/modeling_utils.d.ts +44 -35
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +14 -1
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +4 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +43 -6
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +3 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
- package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/pipelines.d.ts +1 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/models/ast/modeling_ast.d.ts.map +0 -1
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
- /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
package/dist/transformers.web.js
CHANGED
|
@@ -14,7 +14,7 @@ var node_path_default = {};
|
|
|
14
14
|
var node_url_default = {};
|
|
15
15
|
|
|
16
16
|
// src/env.js
|
|
17
|
-
var VERSION = "4.0.0-next.6";
|
|
17
|
+
var VERSION = "4.0.0-next.8";
|
|
18
18
|
var HAS_SELF = typeof self !== "undefined";
|
|
19
19
|
var IS_FS_AVAILABLE = !isEmpty(node_fs_default);
|
|
20
20
|
var IS_PATH_AVAILABLE = !isEmpty(node_path_default);
|
|
@@ -142,6 +142,7 @@ var env = {
|
|
|
142
142
|
customCache: null,
|
|
143
143
|
useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
|
|
144
144
|
cacheKey: "transformers-cache",
|
|
145
|
+
experimental_useCrossOriginStorage: false,
|
|
145
146
|
/////////////////// Custom fetch /////////////////////
|
|
146
147
|
fetch: DEFAULT_FETCH
|
|
147
148
|
//////////////////////////////////////////////////////
|
|
@@ -243,7 +244,7 @@ var logger = {
|
|
|
243
244
|
}
|
|
244
245
|
};
|
|
245
246
|
|
|
246
|
-
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.
|
|
247
|
+
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
|
|
247
248
|
var DictionarySplitter = class {
|
|
248
249
|
/**
|
|
249
250
|
* @param dictionary The dictionary of words to use for splitting.
|
|
@@ -1899,10 +1900,10 @@ var BPE = class extends TokenizerModel_default {
|
|
|
1899
1900
|
);
|
|
1900
1901
|
if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
|
|
1901
1902
|
output_tokens.push(...byte_tokens);
|
|
1902
|
-
} else {
|
|
1903
|
+
} else if (this.unk_token != null) {
|
|
1903
1904
|
output_tokens.push(this.unk_token);
|
|
1904
1905
|
}
|
|
1905
|
-
} else {
|
|
1906
|
+
} else if (this.unk_token != null) {
|
|
1906
1907
|
output_tokens.push(this.unk_token);
|
|
1907
1908
|
}
|
|
1908
1909
|
}
|
|
@@ -2692,7 +2693,7 @@ var Tokenizer = class {
|
|
|
2692
2693
|
};
|
|
2693
2694
|
var Tokenizer_default = Tokenizer;
|
|
2694
2695
|
|
|
2695
|
-
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.
|
|
2696
|
+
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.6/node_modules/@huggingface/jinja/dist/index.js
|
|
2696
2697
|
var TOKEN_TYPES = Object.freeze({
|
|
2697
2698
|
Text: "Text",
|
|
2698
2699
|
// The text between Jinja statements or expressions
|
|
@@ -4211,7 +4212,11 @@ var Environment = class {
|
|
|
4211
4212
|
["number", (operand) => operand instanceof IntegerValue || operand instanceof FloatValue],
|
|
4212
4213
|
["integer", (operand) => operand instanceof IntegerValue],
|
|
4213
4214
|
["iterable", (operand) => operand.type === "ArrayValue" || operand.type === "StringValue"],
|
|
4214
|
-
["mapping", (operand) => operand
|
|
4215
|
+
["mapping", (operand) => operand instanceof ObjectValue],
|
|
4216
|
+
[
|
|
4217
|
+
"sequence",
|
|
4218
|
+
(operand) => operand instanceof ArrayValue || operand instanceof ObjectValue || operand instanceof StringValue
|
|
4219
|
+
],
|
|
4215
4220
|
[
|
|
4216
4221
|
"lower",
|
|
4217
4222
|
(operand) => {
|
|
@@ -4484,6 +4489,9 @@ var Interpreter = class {
|
|
|
4484
4489
|
applyFilter(operand, filterNode, environment) {
|
|
4485
4490
|
if (filterNode.type === "Identifier") {
|
|
4486
4491
|
const filter = filterNode;
|
|
4492
|
+
if (filter.value === "safe") {
|
|
4493
|
+
return operand;
|
|
4494
|
+
}
|
|
4487
4495
|
if (filter.value === "tojson") {
|
|
4488
4496
|
return new StringValue(toJSON(operand, {}));
|
|
4489
4497
|
}
|
|
@@ -4573,6 +4581,8 @@ var Interpreter = class {
|
|
|
4573
4581
|
return new IntegerValue(Math.floor(operand.value));
|
|
4574
4582
|
case "float":
|
|
4575
4583
|
return new FloatValue(operand.value);
|
|
4584
|
+
case "string":
|
|
4585
|
+
return new StringValue(operand.toString());
|
|
4576
4586
|
default:
|
|
4577
4587
|
throw new Error(`Unknown NumericValue filter: ${filter.value}`);
|
|
4578
4588
|
}
|
|
@@ -5995,9 +6005,216 @@ function toAbsoluteURL(url) {
|
|
|
5995
6005
|
return new URL(url, baseURL).href;
|
|
5996
6006
|
}
|
|
5997
6007
|
|
|
6008
|
+
// src/utils/cache/CrossOriginStorageCache.js
|
|
6009
|
+
var HASH_ALGORITHM = "SHA-256";
|
|
6010
|
+
var HASH_CACHE_NAME = "experimental_transformers-hash-cache";
|
|
6011
|
+
var makeHashDescriptor = (value) => ({ algorithm: HASH_ALGORITHM, value });
|
|
6012
|
+
var CrossOriginStorage = class {
|
|
6013
|
+
/** @type {Promise<Cache> | null} */
|
|
6014
|
+
#hashCache = null;
|
|
6015
|
+
/**
|
|
6016
|
+
* Returns (and lazily opens) the hash cache, reusing the same promise across concurrent callers.
|
|
6017
|
+
* @returns {Promise<Cache>}
|
|
6018
|
+
*/
|
|
6019
|
+
_getHashCache = () => {
|
|
6020
|
+
this.#hashCache ??= caches.open(HASH_CACHE_NAME);
|
|
6021
|
+
return this.#hashCache;
|
|
6022
|
+
};
|
|
6023
|
+
/**
|
|
6024
|
+
* Returns whether the `navigator.crossOriginStorage` API is available in the current environment.
|
|
6025
|
+
* @returns {boolean}
|
|
6026
|
+
*/
|
|
6027
|
+
static isAvailable = () => typeof navigator !== "undefined" && "crossOriginStorage" in navigator;
|
|
6028
|
+
/**
|
|
6029
|
+
* Looks up a cached response for the given URL by resolving its SHA-256 hash and requesting
|
|
6030
|
+
* the corresponding file handle from cross-origin storage.
|
|
6031
|
+
*
|
|
6032
|
+
* Implements `CacheInterface.match`.
|
|
6033
|
+
*
|
|
6034
|
+
* @param {string} request The URL of the resource to look up.
|
|
6035
|
+
* @returns {Promise<Response|undefined>} The cached `Response`, or `undefined` if not found.
|
|
6036
|
+
*/
|
|
6037
|
+
match = async (request) => {
|
|
6038
|
+
const hashValue = await this._getFileHash(request);
|
|
6039
|
+
if (!hashValue) {
|
|
6040
|
+
return void 0;
|
|
6041
|
+
}
|
|
6042
|
+
try {
|
|
6043
|
+
const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashValue)]);
|
|
6044
|
+
const blob = await handle.getFile();
|
|
6045
|
+
return new Response(blob, {
|
|
6046
|
+
headers: {
|
|
6047
|
+
"Content-Length": String(blob.size)
|
|
6048
|
+
}
|
|
6049
|
+
});
|
|
6050
|
+
} catch {
|
|
6051
|
+
return void 0;
|
|
6052
|
+
}
|
|
6053
|
+
};
|
|
6054
|
+
/**
|
|
6055
|
+
* Stores a response in cross-origin storage, keyed by its SHA-256 hash.
|
|
6056
|
+
*
|
|
6057
|
+
* For LFS-backed URLs the hash is resolved cheaply via `_getFileHash` (which checks
|
|
6058
|
+
* `HASH_CACHE_NAME` first, then falls back to fetching the Git LFS pointer file)
|
|
6059
|
+
* without reading the response body a second time.
|
|
6060
|
+
*
|
|
6061
|
+
* For non-LFS resources the hash is unknown upfront. In that case the body is consumed
|
|
6062
|
+
* in the background: the stream is read to compute the content hash, the file is written
|
|
6063
|
+
* into cross-origin storage, and the computed hash is persisted to `HASH_CACHE_NAME`
|
|
6064
|
+
* so that future `match` calls can resolve the file without a network round-trip.
|
|
6065
|
+
*
|
|
6066
|
+
* Implements `CacheInterface.put`.
|
|
6067
|
+
*
|
|
6068
|
+
* @param {string} request The URL of the resource (used as the hash-cache key).
|
|
6069
|
+
* @param {Response} response The response whose body will be written to the cache.
|
|
6070
|
+
* @returns {Promise<void>}
|
|
6071
|
+
*/
|
|
6072
|
+
put = async (request, response) => {
|
|
6073
|
+
const hashValue = await this._getFileHash(request);
|
|
6074
|
+
if (hashValue) {
|
|
6075
|
+
const blob = await response.blob();
|
|
6076
|
+
await this._storeBlobInCOS(blob, hashValue);
|
|
6077
|
+
} else {
|
|
6078
|
+
this._processAndStore(request, response.body);
|
|
6079
|
+
}
|
|
6080
|
+
};
|
|
6081
|
+
/**
|
|
6082
|
+
* Writes a blob into cross-origin storage using the given pre-computed hex hash string.
|
|
6083
|
+
*
|
|
6084
|
+
* @param {Blob} blob
|
|
6085
|
+
* @param {string} hashHex Hex-encoded SHA-256 hash of `blob`.
|
|
6086
|
+
* @returns {Promise<void>}
|
|
6087
|
+
*/
|
|
6088
|
+
_storeBlobInCOS = async (blob, hashHex) => {
|
|
6089
|
+
const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashHex)], {
|
|
6090
|
+
create: true
|
|
6091
|
+
});
|
|
6092
|
+
const writableStream = await handle.createWritable();
|
|
6093
|
+
await writableStream.write(blob);
|
|
6094
|
+
await writableStream.close();
|
|
6095
|
+
};
|
|
6096
|
+
/**
|
|
6097
|
+
* Background task for non-LFS resources: consumes `stream`, computes the SHA-256 hash
|
|
6098
|
+
* of the resulting blob, stores it in cross-origin storage, and persists the computed
|
|
6099
|
+
* hash to `HASH_CACHE_NAME` keyed by `request` so future `match` calls can resolve the
|
|
6100
|
+
* file without a network round-trip.
|
|
6101
|
+
*
|
|
6102
|
+
* Called fire-and-forget from `put` — errors are swallowed so failures never surface to
|
|
6103
|
+
* the caller.
|
|
6104
|
+
*
|
|
6105
|
+
* @param {string} request The original resource URL.
|
|
6106
|
+
* @param {ReadableStream} stream The response body stream to consume.
|
|
6107
|
+
* @returns {Promise<void>}
|
|
6108
|
+
*/
|
|
6109
|
+
_processAndStore = async (request, stream) => {
|
|
6110
|
+
try {
|
|
6111
|
+
const chunks = [];
|
|
6112
|
+
for await (const chunk2 of stream) {
|
|
6113
|
+
chunks.push(chunk2);
|
|
6114
|
+
}
|
|
6115
|
+
const blob = new Blob(chunks);
|
|
6116
|
+
const hashHex = await this._getBlobHash(blob);
|
|
6117
|
+
await this._storeBlobInCOS(blob, hashHex);
|
|
6118
|
+
try {
|
|
6119
|
+
const hashCache = await this._getHashCache();
|
|
6120
|
+
await hashCache.put(request, new Response(hashHex));
|
|
6121
|
+
} catch {
|
|
6122
|
+
}
|
|
6123
|
+
} catch {
|
|
6124
|
+
}
|
|
6125
|
+
};
|
|
6126
|
+
/**
|
|
6127
|
+
* Deletes the cache entry for the given request.
|
|
6128
|
+
*
|
|
6129
|
+
* Removes the hash entry from `HASH_CACHE_NAME`. Note: cross-origin storage itself does not
|
|
6130
|
+
* expose a delete API, so only the local hash mapping is removed. For non-LFS URLs this
|
|
6131
|
+
* permanently prevents `match` from resolving the file. For LFS-backed URLs, `match` will
|
|
6132
|
+
* re-fetch the LFS pointer file on the next call and repopulate the hash cache automatically.
|
|
6133
|
+
*
|
|
6134
|
+
* Implements `CacheInterface.delete`.
|
|
6135
|
+
*
|
|
6136
|
+
* @param {string} request
|
|
6137
|
+
* @returns {Promise<boolean>} Resolves to `true` if the hash entry was deleted, `false` otherwise.
|
|
6138
|
+
*/
|
|
6139
|
+
delete = async (request) => {
|
|
6140
|
+
try {
|
|
6141
|
+
const hashCache = await this._getHashCache();
|
|
6142
|
+
return await hashCache.delete(request);
|
|
6143
|
+
} catch {
|
|
6144
|
+
return false;
|
|
6145
|
+
}
|
|
6146
|
+
};
|
|
6147
|
+
/**
|
|
6148
|
+
* Resolves the SHA-256 hash for a given URL.
|
|
6149
|
+
*
|
|
6150
|
+
* Returns the cached hash immediately if one has been persisted to `HASH_CACHE_NAME`.
|
|
6151
|
+
* Otherwise falls back to `_getLfsFileHash` to retrieve the hash from the Hugging Face
|
|
6152
|
+
* LFS pointer file, persisting the result to `HASH_CACHE_NAME` for future lookups.
|
|
6153
|
+
*
|
|
6154
|
+
* Returns `null` if the hash cannot be determined (e.g. non-LFS URL with no cached entry).
|
|
6155
|
+
*
|
|
6156
|
+
* @param {string} url The resource URL to resolve a hash for.
|
|
6157
|
+
* @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
|
|
6158
|
+
*/
|
|
6159
|
+
_getFileHash = async (url) => {
|
|
6160
|
+
try {
|
|
6161
|
+
const hashCache = await this._getHashCache();
|
|
6162
|
+
const cached = await hashCache.match(url);
|
|
6163
|
+
if (cached) {
|
|
6164
|
+
return cached.text();
|
|
6165
|
+
}
|
|
6166
|
+
const hash = await this._getLfsFileHash(url);
|
|
6167
|
+
if (hash) {
|
|
6168
|
+
await hashCache.put(url, new Response(hash));
|
|
6169
|
+
return hash;
|
|
6170
|
+
}
|
|
6171
|
+
return null;
|
|
6172
|
+
} catch {
|
|
6173
|
+
return null;
|
|
6174
|
+
}
|
|
6175
|
+
};
|
|
6176
|
+
/**
|
|
6177
|
+
* Attempts to retrieve the SHA-256 hash for a Hugging Face resource URL from its raw
|
|
6178
|
+
* Git LFS pointer file.
|
|
6179
|
+
*
|
|
6180
|
+
* Only applicable to URLs containing `/resolve/` (i.e. Hugging Face resolved file URLs).
|
|
6181
|
+
* The `/resolve/` segment is rewritten to `/raw/` to fetch the LFS pointer directly.
|
|
6182
|
+
* Returns `null` for non-LFS URLs or when the network request fails.
|
|
6183
|
+
*
|
|
6184
|
+
* @see https://huggingface.co/docs/hub/en/storage-backends#xet
|
|
6185
|
+
* @param {string} url The resolved Hugging Face URL of the resource.
|
|
6186
|
+
* @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
|
|
6187
|
+
*/
|
|
6188
|
+
_getLfsFileHash = async (url) => {
|
|
6189
|
+
if (!url.includes("/resolve/")) {
|
|
6190
|
+
return null;
|
|
6191
|
+
}
|
|
6192
|
+
const rawUrl = url.replace("/resolve/", "/raw/");
|
|
6193
|
+
try {
|
|
6194
|
+
const text = await fetch(rawUrl).then((r) => r.text());
|
|
6195
|
+
const match = text.match(/^oid sha256:([0-9a-f]+)$/m);
|
|
6196
|
+
return match ? match[1] : null;
|
|
6197
|
+
} catch {
|
|
6198
|
+
return null;
|
|
6199
|
+
}
|
|
6200
|
+
};
|
|
6201
|
+
/**
|
|
6202
|
+
* Computes the SHA-256 hash of a `Blob`'s contents.
|
|
6203
|
+
*
|
|
6204
|
+
* @param {Blob} blob The blob to hash.
|
|
6205
|
+
* @returns {Promise<string>} The lowercase hex-encoded SHA-256 hash.
|
|
6206
|
+
*/
|
|
6207
|
+
_getBlobHash = async (blob) => {
|
|
6208
|
+
const arrayBuffer = await blob.arrayBuffer();
|
|
6209
|
+
const hashBuffer = await crypto.subtle.digest(HASH_ALGORITHM, arrayBuffer);
|
|
6210
|
+
const hashArray = Array.from(new Uint8Array(hashBuffer));
|
|
6211
|
+
return hashArray.map((byte) => byte.toString(16).padStart(2, "0")).join("");
|
|
6212
|
+
};
|
|
6213
|
+
};
|
|
6214
|
+
|
|
5998
6215
|
// src/utils/cache.js
|
|
5999
6216
|
async function getCache(file_cache_dir = null) {
|
|
6000
|
-
let cache = null;
|
|
6217
|
+
let cache2 = null;
|
|
6001
6218
|
if (env.useCustomCache) {
|
|
6002
6219
|
if (!env.customCache) {
|
|
6003
6220
|
throw Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
|
|
@@ -6007,30 +6224,33 @@ async function getCache(file_cache_dir = null) {
|
|
|
6007
6224
|
"`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
|
|
6008
6225
|
);
|
|
6009
6226
|
}
|
|
6010
|
-
|
|
6227
|
+
cache2 = env.customCache;
|
|
6228
|
+
}
|
|
6229
|
+
if (!cache2 && env.experimental_useCrossOriginStorage && CrossOriginStorage.isAvailable()) {
|
|
6230
|
+
cache2 = new CrossOriginStorage();
|
|
6011
6231
|
}
|
|
6012
|
-
if (!cache && env.useBrowserCache) {
|
|
6232
|
+
if (!cache2 && env.useBrowserCache) {
|
|
6013
6233
|
if (typeof caches === "undefined") {
|
|
6014
6234
|
throw Error("Browser cache is not available in this environment.");
|
|
6015
6235
|
}
|
|
6016
6236
|
try {
|
|
6017
|
-
|
|
6237
|
+
cache2 = await caches.open(env.cacheKey);
|
|
6018
6238
|
} catch (e) {
|
|
6019
6239
|
logger.warn("An error occurred while opening the browser cache:", e);
|
|
6020
6240
|
}
|
|
6021
6241
|
}
|
|
6022
|
-
if (!cache && env.useFSCache) {
|
|
6242
|
+
if (!cache2 && env.useFSCache) {
|
|
6023
6243
|
if (!apis.IS_FS_AVAILABLE) {
|
|
6024
6244
|
throw Error("File System Cache is not available in this environment.");
|
|
6025
6245
|
}
|
|
6026
|
-
|
|
6246
|
+
cache2 = new FileCache(file_cache_dir ?? env.cacheDir);
|
|
6027
6247
|
}
|
|
6028
|
-
return cache;
|
|
6248
|
+
return cache2;
|
|
6029
6249
|
}
|
|
6030
|
-
async function tryCache(cache, ...names) {
|
|
6250
|
+
async function tryCache(cache2, ...names) {
|
|
6031
6251
|
for (let name of names) {
|
|
6032
6252
|
try {
|
|
6033
|
-
let result = await cache.match(name);
|
|
6253
|
+
let result = await cache2.match(name);
|
|
6034
6254
|
if (result) return result;
|
|
6035
6255
|
} catch (e) {
|
|
6036
6256
|
continue;
|
|
@@ -6039,6 +6259,83 @@ async function tryCache(cache, ...names) {
|
|
|
6039
6259
|
return void 0;
|
|
6040
6260
|
}
|
|
6041
6261
|
|
|
6262
|
+
// src/utils/lru_cache.js
|
|
6263
|
+
var LRUCache2 = class {
|
|
6264
|
+
/** @type {number} */
|
|
6265
|
+
#capacity;
|
|
6266
|
+
/** @type {Map<any, any>} */
|
|
6267
|
+
#cache;
|
|
6268
|
+
/**
|
|
6269
|
+
* Creates an LRUCache instance.
|
|
6270
|
+
* @param {number} capacity The maximum number of items the cache can hold.
|
|
6271
|
+
*/
|
|
6272
|
+
constructor(capacity) {
|
|
6273
|
+
this.#capacity = capacity;
|
|
6274
|
+
this.#cache = /* @__PURE__ */ new Map();
|
|
6275
|
+
}
|
|
6276
|
+
/**
|
|
6277
|
+
* Retrieves the value associated with the given key and marks the key as recently used.
|
|
6278
|
+
* @param {any} key The key to retrieve.
|
|
6279
|
+
* @returns {any} The value associated with the key, or undefined if the key does not exist.
|
|
6280
|
+
*/
|
|
6281
|
+
get(key) {
|
|
6282
|
+
if (!this.#cache.has(key)) return void 0;
|
|
6283
|
+
const value = this.#cache.get(key);
|
|
6284
|
+
this.#cache.delete(key);
|
|
6285
|
+
this.#cache.set(key, value);
|
|
6286
|
+
return value;
|
|
6287
|
+
}
|
|
6288
|
+
/**
|
|
6289
|
+
* Inserts or updates the key-value pair in the cache.
|
|
6290
|
+
* If the key already exists, it is updated and marked as recently used.
|
|
6291
|
+
* If the cache exceeds its capacity, the least recently used item is evicted.
|
|
6292
|
+
* @param {any} key The key to add or update.
|
|
6293
|
+
* @param {any} value The value to associate with the key.
|
|
6294
|
+
*/
|
|
6295
|
+
put(key, value) {
|
|
6296
|
+
if (this.#cache.has(key)) {
|
|
6297
|
+
this.#cache.delete(key);
|
|
6298
|
+
}
|
|
6299
|
+
this.#cache.set(key, value);
|
|
6300
|
+
if (this.#cache.size > this.#capacity) {
|
|
6301
|
+
this.#cache.delete(this.#cache.keys().next().value);
|
|
6302
|
+
}
|
|
6303
|
+
}
|
|
6304
|
+
/**
|
|
6305
|
+
* Removes the entry for the given key from the cache.
|
|
6306
|
+
* @param {any} key The key to delete.
|
|
6307
|
+
* @returns {boolean} `true` if the entry existed and was removed, `false` otherwise.
|
|
6308
|
+
*/
|
|
6309
|
+
delete(key) {
|
|
6310
|
+
return this.#cache.delete(key);
|
|
6311
|
+
}
|
|
6312
|
+
/**
|
|
6313
|
+
* Clears the cache.
|
|
6314
|
+
*/
|
|
6315
|
+
clear() {
|
|
6316
|
+
this.#cache.clear();
|
|
6317
|
+
}
|
|
6318
|
+
};
|
|
6319
|
+
|
|
6320
|
+
// src/utils/memoize_promise.js
|
|
6321
|
+
var MAX_CACHE_SIZE = 100;
|
|
6322
|
+
var cache = new LRUCache2(MAX_CACHE_SIZE);
|
|
6323
|
+
function memoizePromise(key, factory) {
|
|
6324
|
+
const cached = cache.get(key);
|
|
6325
|
+
if (cached !== void 0) {
|
|
6326
|
+
return cached;
|
|
6327
|
+
}
|
|
6328
|
+
const promise = factory().then(
|
|
6329
|
+
(value) => value,
|
|
6330
|
+
(err) => {
|
|
6331
|
+
cache.delete(key);
|
|
6332
|
+
return Promise.reject(err);
|
|
6333
|
+
}
|
|
6334
|
+
);
|
|
6335
|
+
cache.put(key, promise);
|
|
6336
|
+
return promise;
|
|
6337
|
+
}
|
|
6338
|
+
|
|
6042
6339
|
// src/utils/model_registry/get_file_metadata.js
|
|
6043
6340
|
async function fetch_file_head(urlOrPath) {
|
|
6044
6341
|
if (!isValidUrl(urlOrPath, ["http:", "https:"])) {
|
|
@@ -6046,17 +6343,27 @@ async function fetch_file_head(urlOrPath) {
|
|
|
6046
6343
|
}
|
|
6047
6344
|
const headers = getFetchHeaders(urlOrPath);
|
|
6048
6345
|
headers.set("Range", "bytes=0-0");
|
|
6049
|
-
return env.fetch(urlOrPath, { method: "GET", headers });
|
|
6346
|
+
return env.fetch(urlOrPath, { method: "GET", headers, cache: "no-store" });
|
|
6347
|
+
}
|
|
6348
|
+
function get_file_metadata(path_or_repo_id, filename, options = {}) {
|
|
6349
|
+
const key = JSON.stringify([
|
|
6350
|
+
path_or_repo_id,
|
|
6351
|
+
filename,
|
|
6352
|
+
options?.revision,
|
|
6353
|
+
options?.cache_dir,
|
|
6354
|
+
options?.local_files_only
|
|
6355
|
+
]);
|
|
6356
|
+
return memoizePromise(key, () => _get_file_metadata(path_or_repo_id, filename, options));
|
|
6050
6357
|
}
|
|
6051
|
-
async function
|
|
6052
|
-
const cache = await getCache(options?.cache_dir);
|
|
6358
|
+
async function _get_file_metadata(path_or_repo_id, filename, options) {
|
|
6359
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6053
6360
|
const { localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6054
6361
|
path_or_repo_id,
|
|
6055
6362
|
filename,
|
|
6056
6363
|
options,
|
|
6057
|
-
|
|
6364
|
+
cache2
|
|
6058
6365
|
);
|
|
6059
|
-
const cachedResponse = await checkCachedResource(
|
|
6366
|
+
const cachedResponse = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6060
6367
|
if (cachedResponse !== void 0 && typeof cachedResponse !== "string") {
|
|
6061
6368
|
const size = cachedResponse.headers.get("content-length");
|
|
6062
6369
|
const contentType = cachedResponse.headers.get("content-type");
|
|
@@ -6154,7 +6461,7 @@ function getFetchHeaders(urlOrPath) {
|
|
|
6154
6461
|
}
|
|
6155
6462
|
return headers;
|
|
6156
6463
|
}
|
|
6157
|
-
function buildResourcePaths(path_or_repo_id, filename, options = {},
|
|
6464
|
+
function buildResourcePaths(path_or_repo_id, filename, options = {}, cache2 = null) {
|
|
6158
6465
|
const revision = options.revision ?? "main";
|
|
6159
6466
|
const requestURL = pathJoin(path_or_repo_id, filename);
|
|
6160
6467
|
const validModelId = isValidHfModelId(path_or_repo_id);
|
|
@@ -6164,7 +6471,7 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
6164
6471
|
env.remotePathTemplate.replaceAll("{model}", path_or_repo_id).replaceAll("{revision}", encodeURIComponent(revision)),
|
|
6165
6472
|
filename
|
|
6166
6473
|
);
|
|
6167
|
-
const proposedCacheKey =
|
|
6474
|
+
const proposedCacheKey = cache2 instanceof FileCache ? (
|
|
6168
6475
|
// Choose cache key for filesystem cache
|
|
6169
6476
|
// When using the main revision (default), we use the request URL as the cache key.
|
|
6170
6477
|
// If a specific revision is requested, we account for this in the cache key.
|
|
@@ -6178,14 +6485,14 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
6178
6485
|
validModelId
|
|
6179
6486
|
};
|
|
6180
6487
|
}
|
|
6181
|
-
async function checkCachedResource(
|
|
6182
|
-
if (!
|
|
6488
|
+
/**
 * Look a resource up in the given cache, if there is one.
 *
 * @param {Object|null|undefined} cache2 Cache to query; a falsy value means no cache is available.
 * @param {string} localPath First lookup key, passed through to `tryCache`.
 * @param {string} proposedCacheKey Second lookup key, passed through to `tryCache`.
 * @returns {Promise<any>} The result of `tryCache`, or `undefined` when no cache was supplied.
 */
async function checkCachedResource(cache2, localPath, proposedCacheKey) {
  return cache2 ? await tryCache(cache2, localPath, proposedCacheKey) : void 0;
}
|
|
6187
|
-
async function storeCachedResource(path_or_repo_id, filename,
|
|
6188
|
-
if (await
|
|
6494
|
+
async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options = {}) {
|
|
6495
|
+
if (await cache2.match(cacheKey) !== void 0) {
|
|
6189
6496
|
return;
|
|
6190
6497
|
}
|
|
6191
6498
|
if (!result) {
|
|
@@ -6195,20 +6502,22 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
6195
6502
|
file: filename,
|
|
6196
6503
|
...data
|
|
6197
6504
|
}) : void 0;
|
|
6198
|
-
await
|
|
6505
|
+
await cache2.put(
|
|
6199
6506
|
cacheKey,
|
|
6200
6507
|
/** @type {Response} */
|
|
6201
6508
|
response,
|
|
6202
6509
|
wrapped_progress
|
|
6203
6510
|
);
|
|
6204
6511
|
} else if (typeof response !== "string") {
|
|
6205
|
-
|
|
6512
|
+
const headers = new Headers(response.headers);
|
|
6513
|
+
headers.set("content-length", result.byteLength.toString());
|
|
6514
|
+
await cache2.put(
|
|
6206
6515
|
cacheKey,
|
|
6207
6516
|
new Response(
|
|
6208
6517
|
/** @type {any} */
|
|
6209
6518
|
result,
|
|
6210
6519
|
{
|
|
6211
|
-
headers
|
|
6520
|
+
headers
|
|
6212
6521
|
}
|
|
6213
6522
|
)
|
|
6214
6523
|
).catch((err) => {
|
|
@@ -6216,17 +6525,17 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
6216
6525
|
});
|
|
6217
6526
|
}
|
|
6218
6527
|
}
|
|
6219
|
-
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false,
|
|
6528
|
+
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache2 = null) {
|
|
6220
6529
|
const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6221
6530
|
path_or_repo_id,
|
|
6222
6531
|
filename,
|
|
6223
6532
|
options,
|
|
6224
|
-
|
|
6533
|
+
cache2
|
|
6225
6534
|
);
|
|
6226
6535
|
let cacheKey;
|
|
6227
6536
|
let toCacheResponse = false;
|
|
6228
6537
|
let response;
|
|
6229
|
-
response = await checkCachedResource(
|
|
6538
|
+
response = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6230
6539
|
const cacheHit = response !== void 0;
|
|
6231
6540
|
if (!cacheHit) {
|
|
6232
6541
|
if (env.allowLocalModels) {
|
|
@@ -6267,7 +6576,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6267
6576
|
}
|
|
6268
6577
|
cacheKey = proposedCacheKey;
|
|
6269
6578
|
}
|
|
6270
|
-
toCacheResponse =
|
|
6579
|
+
toCacheResponse = cache2 && // 1. A caching system is available
|
|
6271
6580
|
typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment)
|
|
6272
6581
|
response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`)
|
|
6273
6582
|
response.status === 200;
|
|
@@ -6329,7 +6638,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6329
6638
|
// i.e., do not cache FileResponses (prevents duplication)
|
|
6330
6639
|
toCacheResponse && cacheKey && typeof response !== "string"
|
|
6331
6640
|
) {
|
|
6332
|
-
await storeCachedResource(path_or_repo_id, filename,
|
|
6641
|
+
await storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options);
|
|
6333
6642
|
}
|
|
6334
6643
|
dispatchCallback(options.progress_callback, {
|
|
6335
6644
|
status: "done",
|
|
@@ -6345,7 +6654,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6345
6654
|
if (response instanceof FileResponse) {
|
|
6346
6655
|
return response.filePath;
|
|
6347
6656
|
}
|
|
6348
|
-
const cachedResponse = await
|
|
6657
|
+
const cachedResponse = await cache2?.match(cacheKey);
|
|
6349
6658
|
if (cachedResponse instanceof FileResponse) {
|
|
6350
6659
|
return cachedResponse.filePath;
|
|
6351
6660
|
} else if (cachedResponse instanceof Response) {
|
|
@@ -6372,8 +6681,8 @@ async function getModelFile(path_or_repo_id, filename, fatal = true, options = {
|
|
|
6372
6681
|
name: path_or_repo_id,
|
|
6373
6682
|
file: filename
|
|
6374
6683
|
});
|
|
6375
|
-
const
|
|
6376
|
-
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path,
|
|
6684
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6685
|
+
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache2);
|
|
6377
6686
|
}
|
|
6378
6687
|
async function getModelText(modelPath, fileName, fatal = true, options = {}) {
|
|
6379
6688
|
const buffer = await getModelFile(modelPath, fileName, fatal, options, false);
|
|
@@ -7176,11 +7485,11 @@ import * as ONNX_WEB from "onnxruntime-web/webgpu";
|
|
|
7176
7485
|
// src/backends/utils/cacheWasm.js
|
|
7177
7486
|
async function loadAndCacheFile(url) {
|
|
7178
7487
|
const fileName = url.split("/").pop();
|
|
7179
|
-
let
|
|
7488
|
+
let cache2;
|
|
7180
7489
|
try {
|
|
7181
|
-
|
|
7182
|
-
if (
|
|
7183
|
-
const result = await
|
|
7490
|
+
cache2 = await getCache();
|
|
7491
|
+
if (cache2) {
|
|
7492
|
+
const result = await cache2.match(url);
|
|
7184
7493
|
if (result) {
|
|
7185
7494
|
return result;
|
|
7186
7495
|
}
|
|
@@ -7192,9 +7501,9 @@ async function loadAndCacheFile(url) {
|
|
|
7192
7501
|
if (!response.ok) {
|
|
7193
7502
|
throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
|
|
7194
7503
|
}
|
|
7195
|
-
if (
|
|
7504
|
+
if (cache2) {
|
|
7196
7505
|
try {
|
|
7197
|
-
await
|
|
7506
|
+
await cache2.put(url, response.clone());
|
|
7198
7507
|
} catch (e) {
|
|
7199
7508
|
logger.warn(`Failed to cache ${fileName}:`, e);
|
|
7200
7509
|
}
|
|
@@ -9046,9 +9355,23 @@ var Tensor2 = class _Tensor {
|
|
|
9046
9355
|
throw Error(`Unsupported norm: ${p}`);
|
|
9047
9356
|
}
|
|
9048
9357
|
const this_data = this.data;
|
|
9049
|
-
const
|
|
9358
|
+
const is_bigint = this_data instanceof BigInt64Array || this_data instanceof BigUint64Array;
|
|
9359
|
+
if (is_bigint && p !== 1) {
|
|
9360
|
+
throw Error(`Expected a floating point tensor as input. Got ${this.type}`);
|
|
9361
|
+
}
|
|
9362
|
+
let fn, zero;
|
|
9363
|
+
if (is_bigint) {
|
|
9364
|
+
fn = (a, b) => a + b;
|
|
9365
|
+
zero = 0n;
|
|
9366
|
+
} else {
|
|
9367
|
+
fn = (a, b) => a + b ** p;
|
|
9368
|
+
zero = 0;
|
|
9369
|
+
}
|
|
9050
9370
|
if (dim === null) {
|
|
9051
|
-
|
|
9371
|
+
let val = this_data.reduce(fn, zero);
|
|
9372
|
+
if (p !== 1) {
|
|
9373
|
+
val = val ** (1 / p);
|
|
9374
|
+
}
|
|
9052
9375
|
return new _Tensor(this.type, [val], []);
|
|
9053
9376
|
}
|
|
9054
9377
|
const [type, result, resultDims] = reduce_helper(fn, this, dim, keepdim);
|
|
@@ -11508,9 +11831,12 @@ __export(processors_exports, {
|
|
|
11508
11831
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
11509
11832
|
Florence2Processor: () => Florence2Processor,
|
|
11510
11833
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
11834
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
11835
|
+
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
11511
11836
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
11512
11837
|
Idefics3Processor: () => Idefics3Processor,
|
|
11513
11838
|
JinaCLIPProcessor: () => JinaCLIPProcessor,
|
|
11839
|
+
Lfm2VlProcessor: () => Lfm2VlProcessor,
|
|
11514
11840
|
LlavaProcessor: () => LlavaProcessor,
|
|
11515
11841
|
MgpstrProcessor: () => MgpstrProcessor,
|
|
11516
11842
|
MoonshineProcessor: () => MoonshineProcessor,
|
|
@@ -11531,6 +11857,7 @@ __export(processors_exports, {
|
|
|
11531
11857
|
UltravoxProcessor: () => UltravoxProcessor,
|
|
11532
11858
|
VLChatProcessor: () => VLChatProcessor,
|
|
11533
11859
|
VoxtralProcessor: () => VoxtralProcessor,
|
|
11860
|
+
VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
|
|
11534
11861
|
Wav2Vec2Processor: () => Wav2Vec2Processor,
|
|
11535
11862
|
Wav2Vec2ProcessorWithLM: () => Wav2Vec2ProcessorWithLM,
|
|
11536
11863
|
WhisperProcessor: () => WhisperProcessor
|
|
@@ -11585,12 +11912,14 @@ __export(feature_extractors_exports, {
|
|
|
11585
11912
|
EncodecFeatureExtractor: () => EncodecFeatureExtractor,
|
|
11586
11913
|
FeatureExtractor: () => FeatureExtractor,
|
|
11587
11914
|
Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
|
|
11915
|
+
GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
|
|
11588
11916
|
MoonshineFeatureExtractor: () => MoonshineFeatureExtractor,
|
|
11589
11917
|
ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
|
|
11590
11918
|
PyAnnoteFeatureExtractor: () => PyAnnoteFeatureExtractor,
|
|
11591
11919
|
SeamlessM4TFeatureExtractor: () => SeamlessM4TFeatureExtractor,
|
|
11592
11920
|
SnacFeatureExtractor: () => SnacFeatureExtractor,
|
|
11593
11921
|
SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
|
|
11922
|
+
VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
|
|
11594
11923
|
Wav2Vec2FeatureExtractor: () => Wav2Vec2FeatureExtractor,
|
|
11595
11924
|
WeSpeakerFeatureExtractor: () => WeSpeakerFeatureExtractor,
|
|
11596
11925
|
WhisperFeatureExtractor: () => WhisperFeatureExtractor
|
|
@@ -11825,6 +12154,7 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
11825
12154
|
mel_filters = null,
|
|
11826
12155
|
mel_floor = 1e-10,
|
|
11827
12156
|
log_mel = null,
|
|
12157
|
+
max_log_mel = null,
|
|
11828
12158
|
reference = 1,
|
|
11829
12159
|
min_value = 1e-10,
|
|
11830
12160
|
db_range = null,
|
|
@@ -11964,6 +12294,17 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
11964
12294
|
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
11965
12295
|
}
|
|
11966
12296
|
break;
|
|
12297
|
+
case "log10_max_norm": {
|
|
12298
|
+
for (let i = 0; i < o; ++i) {
|
|
12299
|
+
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
12300
|
+
}
|
|
12301
|
+
const logMax = max_log_mel ?? max(mel_spec_data)[0];
|
|
12302
|
+
const threshold = logMax - 8;
|
|
12303
|
+
for (let i = 0; i < o; ++i) {
|
|
12304
|
+
mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4) / 4;
|
|
12305
|
+
}
|
|
12306
|
+
break;
|
|
12307
|
+
}
|
|
11967
12308
|
case "dB":
|
|
11968
12309
|
if (power === 1) {
|
|
11969
12310
|
amplitude_to_db(mel_spec_data, reference, min_value, db_range);
|
|
@@ -11974,7 +12315,9 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
11974
12315
|
}
|
|
11975
12316
|
break;
|
|
11976
12317
|
default:
|
|
11977
|
-
throw new Error(
|
|
12318
|
+
throw new Error(
|
|
12319
|
+
`log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`
|
|
12320
|
+
);
|
|
11978
12321
|
}
|
|
11979
12322
|
}
|
|
11980
12323
|
return mel_spec;
|
|
@@ -12479,6 +12822,56 @@ var Gemma3nAudioFeatureExtractor = class extends FeatureExtractor {
|
|
|
12479
12822
|
}
|
|
12480
12823
|
};
|
|
12481
12824
|
|
|
12825
|
+
// src/models/granite_speech/feature_extraction_granite_speech.js
|
|
12826
|
+
var GraniteSpeechFeatureExtractor = class extends FeatureExtractor {
  /**
   * Precompute the mel filter bank and the analysis window.
   * @param {Object} config Extractor configuration; `config.melspec_kwargs` supplies
   * `n_fft`, `win_length`, `n_mels` and `sample_rate`.
   */
  constructor(config) {
    super(config);
    const {
      n_fft: fft_size,
      win_length: window_size,
      n_mels: mel_bins,
      sample_rate: rate
    } = config.melspec_kwargs;

    const frequency_bins = Math.floor(1 + fft_size / 2); // e.g. 257
    this.mel_filters = mel_filter_bank(
      frequency_bins,
      mel_bins, // e.g. 80
      0, // min_frequency
      rate / 2, // max_frequency, e.g. 8000
      rate, // e.g. 16000
      null, // norm (torchaudio default: no norm)
      "htk" // mel_scale (torchaudio default)
    );

    // The Hann window has `win_length` samples; it is copied into a zero-filled
    // buffer of `n_fft` samples starting at floor((n_fft - win_length) / 2).
    const hann = window_function(window_size, "hann");
    const left_pad = Math.floor((fft_size - window_size) / 2);
    this.window = new Float64Array(fft_size);
    this.window.set(hann, left_pad);
  }
  /**
   * Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @returns {Promise<{input_features: Tensor}>}
   */
  async _call(audio) {
    validate_audio_inputs(audio, "GraniteSpeechFeatureExtractor");
    const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;

    // Keep an even number of frames so the result can be viewed as rows of 2 * n_mels values.
    const total_frames = Math.floor((audio.length - 1) / hop_length) + 1;
    const even_frames = total_frames - (total_frames % 2);

    const spectrogram_options = {
      power: 2,
      mel_filters: this.mel_filters,
      log_mel: "log10_max_norm",
      transpose: true, // [time, n_mels]
      max_num_frames: even_frames,
      do_pad: false
    };
    const mel = await spectrogram(audio, this.window, n_fft, hop_length, spectrogram_options);

    const stacked = mel.view(-1, 2 * n_mels);
    return { input_features: stacked.unsqueeze_(0) };
  }
};
|
|
12874
|
+
|
|
12482
12875
|
// src/models/moonshine/feature_extraction_moonshine.js
|
|
12483
12876
|
var MoonshineFeatureExtractor = class extends FeatureExtractor {
|
|
12484
12877
|
/**
|
|
@@ -12959,6 +13352,71 @@ var WeSpeakerFeatureExtractor = class extends FeatureExtractor {
|
|
|
12959
13352
|
}
|
|
12960
13353
|
};
|
|
12961
13354
|
|
|
13355
|
+
// src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js
|
|
13356
|
+
var VoxtralRealtimeFeatureExtractor = class extends FeatureExtractor {
  /**
   * Build the mel filter bank (unless the config already provides one) and the Hann window.
   * @param {Object} config Extractor configuration.
   */
  constructor(config) {
    super(config);
    const settings = this.config;
    if (settings.mel_filters === null || settings.mel_filters === void 0) {
      settings.mel_filters = mel_filter_bank(
        Math.floor(1 + settings.n_fft / 2), // num_frequency_bins
        settings.feature_size, // num_mel_filters
        0, // min_frequency
        8e3, // max_frequency
        settings.sampling_rate, // sampling_rate
        "slaney", // norm
        "slaney" // mel_scale
      );
    }
    this.window = window_function(settings.n_fft, "hann");
  }
  /**
   * Computes the log-Mel spectrogram of the provided audio waveform.
   * @param {Float32Array|Float64Array} waveform The audio waveform to process.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform for STFT.
   * @returns {Promise<import('../../utils/tensor.js').Tensor>} The log-Mel spectrogram tensor of shape [num_mel_bins, num_frames].
   */
  async _extract_fbank_features(waveform, { center = true } = {}) {
    const { n_fft, hop_length, mel_filters, global_log_mel_max } = this.config;

    // Without centering, the final `n_fft` samples are excluded from the frame budget.
    const usable_samples = center ? waveform.length : waveform.length - n_fft;
    const max_num_frames = Math.floor(usable_samples / hop_length);

    const spectrogram_options = {
      power: 2,
      mel_filters,
      log_mel: "log10_max_norm",
      max_log_mel: global_log_mel_max,
      center,
      max_num_frames,
      do_pad: false
    };
    return await spectrogram(
      waveform,
      this.window,
      n_fft, // frame_length
      hop_length,
      spectrogram_options
    );
  }
  /**
   * Extract mel spectrogram features from audio.
   * @param {Float32Array|Float64Array} audio The audio data.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform.
   * @returns {Promise<{ input_features: import('../../utils/tensor.js').Tensor }>}
   */
  async _call(audio, { center = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeFeatureExtractor");
    const fbank = await this._extract_fbank_features(audio, { center });
    const input_features = fbank.unsqueeze_(0);
    return { input_features };
  }
};
|
|
13419
|
+
|
|
12962
13420
|
// src/models/whisper/feature_extraction_whisper.js
|
|
12963
13421
|
var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
12964
13422
|
constructor(config) {
|
|
@@ -12987,7 +13445,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
12987
13445
|
* @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
|
|
12988
13446
|
*/
|
|
12989
13447
|
async _extract_fbank_features(waveform) {
|
|
12990
|
-
|
|
13448
|
+
return await spectrogram(
|
|
12991
13449
|
waveform,
|
|
12992
13450
|
this.window,
|
|
12993
13451
|
// window
|
|
@@ -12998,7 +13456,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
12998
13456
|
{
|
|
12999
13457
|
power: 2,
|
|
13000
13458
|
mel_filters: this.config.mel_filters,
|
|
13001
|
-
log_mel: "
|
|
13459
|
+
log_mel: "log10_max_norm",
|
|
13002
13460
|
// Custom
|
|
13003
13461
|
max_num_frames: Math.min(
|
|
13004
13462
|
Math.floor(waveform.length / this.config.hop_length),
|
|
@@ -13007,15 +13465,6 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
13007
13465
|
)
|
|
13008
13466
|
}
|
|
13009
13467
|
);
|
|
13010
|
-
const data = features.data;
|
|
13011
|
-
const maxValue = max(
|
|
13012
|
-
/** @type {Float32Array} */
|
|
13013
|
-
data
|
|
13014
|
-
)[0];
|
|
13015
|
-
for (let i = 0; i < data.length; ++i) {
|
|
13016
|
-
data[i] = (Math.max(data[i], maxValue - 8) + 4) / 4;
|
|
13017
|
-
}
|
|
13018
|
-
return features;
|
|
13019
13468
|
}
|
|
13020
13469
|
/**
|
|
13021
13470
|
* Asynchronously extracts features from a given audio using the provided configuration.
|
|
@@ -13896,6 +14345,30 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
13896
14345
|
}
|
|
13897
14346
|
return [segmentation, segments];
|
|
13898
14347
|
}
|
|
14348
|
+
/**
 * Compute a target size whose sides are multiples of `factor` and whose
 * (temporally weighted) pixel count lies within `[min_pixels, max_pixels]`.
 *
 * Inputs smaller than `factor` on either side are first scaled up so that both
 * sides reach at least `factor`.
 *
 * @param {number} height Input height; must be a positive finite number.
 * @param {number} width Input width; must be a positive finite number.
 * @param {number} [factor=28] Both output sides are multiples of this value.
 * @param {number} [min_pixels=3136] Lower bound on `temporal_factor * height * width`.
 * @param {number} [max_pixels=1003520] Upper bound on `temporal_factor * height * width`.
 * @param {number} [temporal_factor=1] Multiplier applied to the pixel count before comparing against the bounds.
 * @returns {[number, number]} The resized dimensions as `[width, height]` (note the order).
 * @throws {Error} If a dimension is not a positive finite number, or if the aspect ratio exceeds 200.
 */
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
  // Zero, negative or non-finite sizes would otherwise propagate NaN/Infinity
  // through the divisions below and produce a meaningless target size.
  if (!(Number.isFinite(height) && Number.isFinite(width) && height > 0 && width > 0)) {
    throw new Error(`height and width must be positive finite numbers, got height=${height}, width=${width}`);
  }

  let h = height;
  let w = width;
  if (h < factor || w < factor) {
    // Scale up so that the smaller side reaches `factor`.
    const scale = Math.max(factor / h, factor / w);
    h = Math.round(h * scale);
    w = Math.round(w * scale);
  }

  const aspect_ratio = Math.max(h, w) / Math.min(h, w);
  if (aspect_ratio > 200) {
    throw new Error(
      `absolute aspect ratio must be smaller than 200, got ${aspect_ratio}`
    );
  }

  let h_bar = Math.round(h / factor) * factor;
  let w_bar = Math.round(w / factor) * factor;
  if (temporal_factor * h_bar * w_bar > max_pixels) {
    // Too large: shrink both sides by a common factor, rounding down to multiples of `factor`.
    const beta = Math.sqrt(temporal_factor * h * w / max_pixels);
    h_bar = Math.max(factor, Math.floor(h / beta / factor) * factor);
    w_bar = Math.max(factor, Math.floor(w / beta / factor) * factor);
  } else if (temporal_factor * h_bar * w_bar < min_pixels) {
    // Too small: grow both sides by a common factor, rounding up to multiples of `factor`.
    const beta = Math.sqrt(min_pixels / (temporal_factor * h * w));
    h_bar = Math.ceil(h * beta / factor) * factor;
    w_bar = Math.ceil(w * beta / factor) * factor;
  }
  return [w_bar, h_bar];
}
|
|
13899
14372
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
13900
14373
|
if (label_ids_to_fuse === null) {
|
|
13901
14374
|
logger.warn("`label_ids_to_fuse` unset. No instance will be fused.");
|
|
@@ -13973,7 +14446,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
13973
14446
|
this.do_pad = config.do_pad;
|
|
13974
14447
|
this.min_pixels = config.min_pixels;
|
|
13975
14448
|
this.max_pixels = config.max_pixels;
|
|
13976
|
-
if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
14449
|
+
if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
13977
14450
|
this.pad_size = this.size;
|
|
13978
14451
|
}
|
|
13979
14452
|
this.do_flip_channel_order = config.do_flip_channel_order ?? false;
|
|
@@ -14184,7 +14657,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
14184
14657
|
});
|
|
14185
14658
|
}
|
|
14186
14659
|
/**
|
|
14187
|
-
* @typedef {
|
|
14660
|
+
* @typedef {Object} PreprocessedImage
|
|
14188
14661
|
* @property {HeightWidth} original_size The original size of the image.
|
|
14189
14662
|
* @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
|
|
14190
14663
|
* @property {Tensor} pixel_values The pixel values of the preprocessed image.
|
|
@@ -14261,10 +14734,8 @@ var ImageProcessor = class extends Callable2 {
|
|
|
14261
14734
|
const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
|
|
14262
14735
|
[pixelData, imgDims] = padded;
|
|
14263
14736
|
} else if (this.size_divisibility) {
|
|
14264
|
-
const
|
|
14265
|
-
|
|
14266
|
-
this.size_divisibility
|
|
14267
|
-
);
|
|
14737
|
+
const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
|
|
14738
|
+
const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
|
|
14268
14739
|
[pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
|
|
14269
14740
|
}
|
|
14270
14741
|
}
|
|
@@ -14341,6 +14812,7 @@ var image_processors_exports = {};
|
|
|
14341
14812
|
__export(image_processors_exports, {
|
|
14342
14813
|
BeitFeatureExtractor: () => BeitFeatureExtractor,
|
|
14343
14814
|
BitImageProcessor: () => BitImageProcessor,
|
|
14815
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
14344
14816
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
14345
14817
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
14346
14818
|
ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
|
|
@@ -14357,11 +14829,13 @@ __export(image_processors_exports, {
|
|
|
14357
14829
|
DonutImageProcessor: () => DonutImageProcessor,
|
|
14358
14830
|
EfficientNetImageProcessor: () => EfficientNetImageProcessor,
|
|
14359
14831
|
GLPNFeatureExtractor: () => GLPNFeatureExtractor,
|
|
14832
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
14360
14833
|
GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
|
|
14361
14834
|
Idefics3ImageProcessor: () => Idefics3ImageProcessor,
|
|
14362
14835
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
14363
14836
|
ImageProcessor: () => ImageProcessor,
|
|
14364
14837
|
JinaCLIPImageProcessor: () => JinaCLIPImageProcessor,
|
|
14838
|
+
Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
|
|
14365
14839
|
LlavaOnevisionImageProcessor: () => LlavaOnevisionImageProcessor,
|
|
14366
14840
|
Mask2FormerImageProcessor: () => Mask2FormerImageProcessor,
|
|
14367
14841
|
MaskFormerFeatureExtractor: () => MaskFormerFeatureExtractor,
|
|
@@ -14416,6 +14890,10 @@ var BitImageProcessor = class extends ImageProcessor {
|
|
|
14416
14890
|
var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
|
|
14417
14891
|
};
|
|
14418
14892
|
|
|
14893
|
+
// src/models/chmv2/image_processing_chmv2.js
|
|
14894
|
+
// CHMv2 image processor: declares no members of its own, so all behavior is inherited from ImageProcessor.
var CHMv2ImageProcessor = class extends ImageProcessor {
|
|
14895
|
+
};
|
|
14896
|
+
|
|
14419
14897
|
// src/models/clip/image_processing_clip.js
|
|
14420
14898
|
var CLIPImageProcessor = class extends ImageProcessor {
|
|
14421
14899
|
};
|
|
@@ -14535,32 +15013,91 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
|
|
|
14535
15013
|
}
|
|
14536
15014
|
};
|
|
14537
15015
|
|
|
14538
|
-
// src/models/
|
|
14539
|
-
var
|
|
14540
|
-
};
|
|
14541
|
-
|
|
14542
|
-
// src/models/grounding_dino/image_processing_grounding_dino.js
|
|
14543
|
-
var GroundingDinoImageProcessor = class extends ImageProcessor {
|
|
14544
|
-
/**
|
|
14545
|
-
* Calls the feature extraction process on an array of images, preprocesses
|
|
14546
|
-
* each image, and concatenates the resulting features into a single Tensor.
|
|
14547
|
-
* @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from.
|
|
14548
|
-
* @returns {Promise<GroundingDinoFeatureExtractorResult>} An object containing the concatenated pixel values of the preprocessed images.
|
|
14549
|
-
*/
|
|
14550
|
-
async _call(images) {
|
|
14551
|
-
const result = await super._call(images);
|
|
14552
|
-
const dims = result.pixel_values.dims;
|
|
14553
|
-
const pixel_mask = ones([dims[0], dims[2], dims[3]]);
|
|
14554
|
-
return { ...result, pixel_mask };
|
|
14555
|
-
}
|
|
14556
|
-
};
|
|
14557
|
-
|
|
14558
|
-
// src/models/idefics3/image_processing_idefics3.js
|
|
14559
|
-
var Idefics3ImageProcessor = class extends ImageProcessor {
|
|
15016
|
+
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
15017
|
+
var Qwen2VLImageProcessor = class extends ImageProcessor {
  /**
   * @param {Object} config Processor configuration. Pixel bounds come from
   * `min_pixels` / `max_pixels`, falling back to `size.shortest_edge` / `size.longest_edge`.
   */
  constructor(config) {
    super(config);
    const { size, patch_size, merge_size } = config;
    this.min_pixels = config.min_pixels ?? size?.shortest_edge;
    this.max_pixels = config.max_pixels ?? size?.longest_edge;
    this.patch_size = patch_size;
    this.merge_size = merge_size;
  }
  /** @type {ImageProcessor['get_resize_output_image_size']} */
  get_resize_output_image_size(image, size) {
    // Output sides must be multiples of patch_size * merge_size.
    const { height, width } = image;
    return smart_resize(height, width, this.patch_size * this.merge_size, this.min_pixels, this.max_pixels);
  }
  /**
   * Preprocess the images with the base processor, then flatten the pixel
   * values into one row per patch and report the patch grid.
   * @param {any} images The image(s) to process, forwarded to the base `_call`.
   * @param {...any} args Extra arguments forwarded to the base `_call`.
   * @returns {Promise<Object>} `pixel_values` (flattened patches), `image_grid_thw`,
   * `original_sizes` and `reshaped_input_sizes`.
   */
  async _call(images, ...args) {
    const base_output = await super._call(images, ...args);
    const { pixel_values, original_sizes, reshaped_input_sizes } = base_output;
    const { temporal_patch_size, merge_size, patch_size } = this.config;

    // A single frame is repeated `temporal_patch_size` times along the first axis.
    const frames = pixel_values.dims[0] === 1
      ? cat(Array.from({ length: temporal_patch_size }, () => pixel_values), 0)
      : pixel_values;

    const grid_t = frames.dims[0] / temporal_patch_size;
    const channel = frames.dims[1];
    const grid_h = Math.floor(frames.dims[2] / patch_size);
    const grid_w = Math.floor(frames.dims[3] / patch_size);

    // Split each axis into (block, within-block) parts, reorder the axes, then
    // collapse to [number of patches, values per patch].
    const blocked = frames.view(
      grid_t,
      temporal_patch_size,
      channel,
      Math.floor(grid_h / merge_size),
      merge_size,
      patch_size,
      Math.floor(grid_w / merge_size),
      merge_size,
      patch_size
    );
    const reordered = blocked.permute(0, 3, 6, 4, 7, 2, 1, 5, 8);
    const num_patches = grid_t * grid_h * grid_w;
    const values_per_patch = channel * temporal_patch_size * patch_size * patch_size;
    const flatten_patches = reordered.view(num_patches, values_per_patch);

    const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
    return {
      pixel_values: flatten_patches,
      image_grid_thw,
      original_sizes,
      reshaped_input_sizes
    };
  }
};
|
|
15064
|
+
|
|
15065
|
+
// src/models/glm46v/image_processing_glm46v.js
|
|
15066
|
+
var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
  /**
   * Same as the parent implementation, except that the pixel budget is also
   * weighted by `temporal_patch_size` (2 when the config does not set it).
   * @type {Qwen2VLImageProcessor['get_resize_output_image_size']}
   */
  get_resize_output_image_size(image, size) {
    const { patch_size, merge_size, min_pixels, max_pixels } = this;
    const side_multiple = patch_size * merge_size;
    const temporal_weight = this.config.temporal_patch_size ?? 2;
    return smart_resize(image.height, image.width, side_multiple, min_pixels, max_pixels, temporal_weight);
  }
};
|
|
15074
|
+
|
|
15075
|
+
// src/models/glpn/image_processing_glpn.js
|
|
15076
|
+
// GLPN feature extractor: declares no members of its own, so all behavior is inherited from ImageProcessor.
var GLPNFeatureExtractor = class extends ImageProcessor {
|
|
15077
|
+
};
|
|
15078
|
+
|
|
15079
|
+
// src/models/grounding_dino/image_processing_grounding_dino.js
|
|
15080
|
+
var GroundingDinoImageProcessor = class extends ImageProcessor {
  /**
   * Runs the base preprocessing on the images and additionally returns a
   * `pixel_mask` filled with ones, shaped from the first, third and fourth
   * dimensions of `pixel_values`.
   * @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from.
   * @returns {Promise<GroundingDinoFeatureExtractorResult>} The base result plus `pixel_mask`.
   */
  async _call(images) {
    const features = await super._call(images);
    const shape = features.pixel_values.dims;
    const mask_shape = [shape[0], shape[2], shape[3]];
    return { ...features, pixel_mask: ones(mask_shape) };
  }
};
|
|
15094
|
+
|
|
15095
|
+
// src/models/idefics3/image_processing_idefics3.js
|
|
15096
|
+
var Idefics3ImageProcessor = class extends ImageProcessor {
|
|
15097
|
+
constructor(config) {
|
|
15098
|
+
super(config);
|
|
15099
|
+
this.do_image_splitting = config.do_image_splitting ?? true;
|
|
15100
|
+
this.max_image_size = config.max_image_size;
|
|
14564
15101
|
}
|
|
14565
15102
|
/**
|
|
14566
15103
|
* @typedef {import('../../utils/image.js').RawImage} RawImage
|
|
@@ -14765,6 +15302,237 @@ var JinaCLIPImageProcessor = class extends ImageProcessor {
|
|
|
14765
15302
|
}
|
|
14766
15303
|
};
|
|
14767
15304
|
|
|
15305
|
+
// src/models/lfm2_vl/image_processing_lfm2_vl.js
|
|
15306
|
+
/**
 * Rounds `number` to the nearest multiple of `factor` (using `Math.round`).
 * @param {number} number Value to round.
 * @param {number} factor Step the result must be a multiple of.
 * @returns {number}
 */
function round_by_factor(number, factor) {
  const steps = Math.round(number / factor);
  return steps * factor;
}
|
|
15309
|
+
/**
 * Picks the `[w, h]` entry of `target_ratios` whose `w / h` is closest to `aspect_ratio`.
 * On an exact tie with the current best distance, the later candidate wins only
 * when the image area exceeds half the area of that candidate's tile grid
 * (`0.5 * image_size² * w * h`). Returns `[1, 1]` when nothing matches.
 * @param {number} aspect_ratio Width / height of the source image.
 * @param {number[][]} target_ratios Candidate `[w, h]` grids, scanned in order.
 * @param {number} width Source image width.
 * @param {number} height Source image height.
 * @param {number} image_size Side length of one tile.
 * @returns {number[]} The selected `[w, h]` pair (the same array object taken from `target_ratios`).
 */
function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
  const image_area = width * height;
  let winner = [1, 1];
  let smallest_gap = Infinity;

  target_ratios.forEach((candidate) => {
    const gap = Math.abs(aspect_ratio - candidate[0] / candidate[1]);
    const is_tie = gap === smallest_gap;
    if (gap < smallest_gap) {
      smallest_gap = gap;
      winner = candidate;
      return;
    }
    if (is_tie && image_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]) {
      winner = candidate;
    }
  });

  return winner;
}
|
|
15324
|
+
/**
 * Enumerates every distinct `[w, h]` grid whose tile count `w * h` lies in
 * `[min_tiles, max_tiles]`, ordered by ascending tile count.
 * Grids with equal tile counts keep their discovery order (the sort is stable),
 * so the nested enumeration order below is part of the observable result.
 * @param {number} min_tiles Smallest allowed number of tiles.
 * @param {number} max_tiles Largest allowed number of tiles.
 * @returns {number[][]} List of `[w, h]` pairs.
 */
function get_target_ratios(min_tiles, max_tiles) {
  // Map preserves first-insertion order, which doubles as de-duplication.
  const discovered = new Map();
  for (let limit = min_tiles; limit <= max_tiles; limit += 1) {
    for (let cols = 1; cols <= limit; cols += 1) {
      for (let rows = 1; rows <= limit; rows += 1) {
        const tile_count = cols * rows;
        if (tile_count < min_tiles || tile_count > max_tiles) continue;
        const id = `${cols}x${rows}`;
        if (!discovered.has(id)) {
          discovered.set(id, [cols, rows]);
        }
      }
    }
  }
  const by_tile_count = (lhs, rhs) => lhs[0] * lhs[1] - rhs[0] * rhs[1];
  return [...discovered.values()].sort(by_tile_count);
}
|
|
15343
|
+
/**
 * Cuts a 4-D image tensor (dims destructured as `[B, C, H, W]`) into
 * non-overlapping `patch_size × patch_size` patches and flattens each one.
 * Within a patch, values are laid out row by row, then column by column, with
 * the channel index varying fastest. Rows/columns that do not fill a whole
 * patch are dropped (`Math.floor`).
 * @param {{ dims: number[], data: Float32Array }} images Source tensor.
 * @param {number} patch_size Side length of a patch in pixels.
 * @returns {Tensor2} float32 tensor of dims `[B, patches_per_image, patch_size * patch_size * C]`.
 */
function convert_image_to_patches(images, patch_size) {
  const [batch, channels, height, width] = images.dims;
  const grid_rows = Math.floor(height / patch_size);
  const grid_cols = Math.floor(width / patch_size);
  const patches_per_image = grid_rows * grid_cols;
  const values_per_patch = patch_size * patch_size * channels;
  const plane = height * width; // distance between two channels of one pixel
  const source = (
    /** @type {Float32Array} */
    images.data
  );
  const flattened = new Float32Array(batch * patches_per_image * values_per_patch);

  // Patches are emitted strictly in output order, so one running cursor suffices.
  let cursor = 0;
  for (let n = 0; n < batch; ++n) {
    const image_base = n * channels * plane;
    for (let patch = 0; patch < patches_per_image; ++patch) {
      const top = Math.floor(patch / grid_cols) * patch_size;
      const left = (patch % grid_cols) * patch_size;
      for (let y = top; y < top + patch_size; ++y) {
        for (let x = left; x < left + patch_size; ++x) {
          const pixel = image_base + y * width + x;
          for (let ch = 0; ch < channels; ++ch) {
            flattened[cursor++] = source[pixel + ch * plane];
          }
        }
      }
    }
  }
  return new Tensor2("float32", flattened, [batch, patches_per_image, values_per_patch]);
}
|
|
15373
|
+
/**
 * Zero-pads a patch tensor along its second axis up to `target_length` and
 * builds the matching int64 mask (1 for original positions, 0 for padding).
 * When no padding is needed the input tensor is returned as-is. A padded result
 * is always created with a leading dimension of 1.
 * @param {{ dims: number[], data: Float32Array }} patches Tensor with dims `[_, length, dim]`.
 * @param {number} target_length Desired length of the second axis.
 * @returns {{ padded: any, mask: Tensor2 }} The (possibly padded) tensor and a mask of dims `[target_length]`.
 */
function pad_along_first_dim(patches, target_length) {
  const current_length = patches.dims[1];
  const feature_dim = patches.dims[2];

  const mask_values = new BigInt64Array(target_length);
  mask_values.subarray(0, current_length).fill(1n);

  const needs_padding = current_length < target_length;
  let padded = patches;
  if (needs_padding) {
    // A fresh typed array is zero-initialised; copying the data leaves the tail as padding.
    const buffer = new Float32Array(target_length * feature_dim);
    buffer.set(
      /** @type {Float32Array} */
      patches.data
    );
    padded = new Tensor2("float32", buffer, [1, target_length, feature_dim]);
  }

  const mask = new Tensor2("int64", mask_values, [target_length]);
  return { padded, mask };
}
|
|
15388
|
+
var Lfm2VlImageProcessor = class extends ImageProcessor {
  /**
   * Reads the tiling options from `config`, substituting the defaults below for
   * any option that is `null`/`undefined`, and derives `max_num_patches` (the
   * length every patch sequence is padded to).
   * @param {Record<string, any>} config Image processor configuration.
   */
  constructor(config) {
    super(config);
    const fallbacks = {
      downsample_factor: 2,
      do_image_splitting: true,
      min_tiles: 2,
      max_tiles: 10,
      use_thumbnail: true,
      min_image_tokens: 64,
      max_image_tokens: 256,
      // `encoder_patch_size` falls back to `patch_size` before the literal default.
      encoder_patch_size: config.patch_size ?? 16,
      tile_size: 512,
      max_pixels_tolerance: 2,
      return_row_col_info: false
    };
    for (const [option, fallback] of Object.entries(fallbacks)) {
      this[option] = config[option] ?? fallback;
    }
    const thumbnail_patches = this.max_image_tokens * this.downsample_factor ** 2;
    const tile_patches = this.do_image_splitting ? (this.tile_size / this.encoder_patch_size) ** 2 : 0;
    this.max_num_patches = Math.max(thumbnail_patches, tile_patches);
  }
  /**
   * Check if the image is too large to be processed as a single tile.
   * Both sides are rounded to a multiple of `encoder_patch_size * downsample_factor`
   * (never below `encoder_patch_size`) and the resulting area is compared against
   * the `max_image_tokens` pixel budget scaled by `max_pixels_tolerance`.
   * @param {number} height
   * @param {number} width
   * @returns {boolean}
   */
  _is_image_too_large(height, width) {
    const patch = this.encoder_patch_size;
    const total_factor = patch * this.downsample_factor;
    const snapped_height = Math.max(patch, round_by_factor(height, total_factor));
    const snapped_width = Math.max(patch, round_by_factor(width, total_factor));
    const pixel_budget = this.max_image_tokens * total_factor ** 2 * this.max_pixels_tolerance;
    return snapped_height * snapped_width > pixel_budget;
  }
  /**
   * Get the grid layout for tiling a large image.
   * @param {number} height
   * @param {number} width
   * @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
   */
  _get_grid_layout(height, width) {
    const candidates = get_target_ratios(this.min_tiles, this.max_tiles);
    const best = find_closest_aspect_ratio(width / height, candidates, width, height, this.tile_size);
    const grid_width = best[0];
    const grid_height = best[1];
    return {
      grid_width,
      grid_height,
      target_width: this.tile_size * grid_width,
      target_height: this.tile_size * grid_height
    };
  }
  /**
   * Preprocesses every image, optionally splits large ones into `tile_size`
   * tiles (plus a thumbnail), converts each tile to padded patch sequences and
   * collects them into batched tensors.
   * @param {RawImage|RawImage[]|RawImage[][]} images
   */
  // @ts-expect-error
  async _call(images, { return_row_col_info = null } = {}) {
    // Normalise the input to a list of batches, each holding a list of images.
    const batches = !Array.isArray(images) ? [[images]] : Array.isArray(images[0]) ? images : [images];

    const total_factor = this.encoder_patch_size * this.downsample_factor;
    const pixels_per_token = total_factor ** 2;
    const splitting_enabled = this.do_image_splitting && (this.min_tiles !== 1 || this.max_tiles !== 1);
    const tile = this.tile_size;

    const pixel_value_chunks = [];
    const mask_chunks = [];
    const spatial_shapes = [];
    const rows_per_image = [];
    const cols_per_image = [];
    const resized_sizes = [];

    for (const batch of batches) {
      const preprocessed = await Promise.all(batch.map((image) => this.preprocess(image, { do_pad: false })));
      for (const { pixel_values } of preprocessed) {
        // Read the spatial dims before the in-place unsqueeze adds the leading axis.
        const [, height, width] = pixel_values.dims;
        const img = pixel_values.unsqueeze_(0);
        // NOTE(review): the result is destructured as [width, height], exactly as the
        // original code does; `smart_resize` itself is defined outside this block.
        const [new_width, new_height] = smart_resize(
          Math.max(total_factor, height),
          Math.max(total_factor, width),
          total_factor,
          this.min_image_tokens * pixels_per_token,
          this.max_image_tokens * pixels_per_token
        ).map((side) => Math.max(total_factor, side));

        const tiles = [];
        let num_rows = 1;
        let num_cols = 1;
        const too_large = this._is_image_too_large(height, width);
        if (too_large && splitting_enabled) {
          const layout = this._get_grid_layout(height, width);
          num_rows = layout.grid_height;
          num_cols = layout.grid_width;
          const resized = await interpolate_4d(img, {
            size: [layout.target_height, layout.target_width]
          });
          for (let row = 0; row < num_rows; ++row) {
            const top = row * tile;
            for (let col = 0; col < num_cols; ++col) {
              const left = col * tile;
              tiles.push(resized.slice(null, null, [top, top + tile], [left, left + tile]));
            }
          }
          if (this.use_thumbnail && num_cols * num_rows !== 1) {
            tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
          }
        } else {
          tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
        }

        for (const current of tiles) {
          const tile_height = current.dims[2];
          const tile_width = current.dims[3];
          const patches = convert_image_to_patches(current, this.encoder_patch_size);
          const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
          pixel_value_chunks.push(padded);
          mask_chunks.push(mask);
          spatial_shapes.push([
            Math.floor(tile_height / this.encoder_patch_size),
            Math.floor(tile_width / this.encoder_patch_size)
          ]);
        }
        rows_per_image.push(num_rows);
        cols_per_image.push(num_cols);
        resized_sizes.push([new_height, new_width]);
      }
    }

    const result = {
      pixel_values: cat(pixel_value_chunks, 0),
      pixel_attention_mask: stack(mask_chunks, 0),
      spatial_shapes: new Tensor2("int64", BigInt64Array.from(spatial_shapes.flat(), BigInt), [
        spatial_shapes.length,
        2
      ])
    };
    // The per-call option wins; the configured default applies only when it is nullish.
    if (return_row_col_info ?? this.return_row_col_info) {
      result.image_rows = rows_per_image;
      result.image_cols = cols_per_image;
      result.image_sizes = resized_sizes;
    }
    return result;
  }
};
|
|
15535
|
+
|
|
14768
15536
|
// src/models/llava_onevision/image_processing_llava_onevision.js
|
|
14769
15537
|
var LlavaOnevisionImageProcessor = class extends ImageProcessor {
|
|
14770
15538
|
};
|
|
@@ -14987,76 +15755,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
|
|
|
14987
15755
|
var PvtImageProcessor = class extends ImageProcessor {
|
|
14988
15756
|
};
|
|
14989
15757
|
|
|
14990
|
-
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
14991
|
-
/**
 * Computes a target size whose sides are multiples of `factor` and whose area
 * is steered into `[min_pixels, max_pixels]` while roughly keeping the aspect ratio.
 *
 * When the image must be shrunk, each side is rounded DOWN to a multiple of
 * `factor`; for elongated images this could previously collapse the short side
 * to 0. Each side is now clamped to at least `factor`, so the result is always
 * a valid, non-empty size (the area may then slightly exceed `max_pixels`).
 *
 * @param {number} height Source height in pixels.
 * @param {number} width Source width in pixels.
 * @param {number} [factor=28] Both output sides are multiples of this value.
 * @param {number} [min_pixels=56*56] Lower bound on the output area.
 * @param {number} [max_pixels=14*14*4*1280] Upper bound on the output area.
 * @returns {[number, number]} `[height, width]` of the resized image.
 * @throws {Error} If either side is smaller than `factor`, or the aspect ratio exceeds 200.
 */
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
  if (height < factor || width < factor) {
    throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
  }
  const aspect = Math.max(height, width) / Math.min(height, width);
  if (aspect > 200) {
    throw new Error(`absolute aspect ratio must be smaller than 200, got ${aspect}`);
  }

  let h_bar = Math.round(height / factor) * factor;
  let w_bar = Math.round(width / factor) * factor;
  if (h_bar * w_bar > max_pixels) {
    // Shrink: scale both sides by 1/beta, round down, but never below one `factor`.
    const beta = Math.sqrt(height * width / max_pixels);
    h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
    w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
  } else if (h_bar * w_bar < min_pixels) {
    // Grow: scale both sides by beta and round up so the area reaches min_pixels.
    const beta = Math.sqrt(min_pixels / (height * width));
    h_bar = Math.ceil(height * beta / factor) * factor;
    w_bar = Math.ceil(width * beta / factor) * factor;
  }
  return [h_bar, w_bar];
}
|
|
15012
|
-
var Qwen2VLImageProcessor = class extends ImageProcessor {
  /**
   * Resolves the pixel budget from `min_pixels`/`max_pixels`, falling back to
   * `size.shortest_edge`/`size.longest_edge`, and stores the patch geometry.
   */
  constructor(config) {
    super(config);
    const { size, patch_size, merge_size } = config;
    this.min_pixels = config.min_pixels ?? size?.shortest_edge;
    this.max_pixels = config.max_pixels ?? size?.longest_edge;
    this.patch_size = patch_size;
    this.merge_size = merge_size;
  }
  /**
   * Delegates to `smart_resize` with `patch_size * merge_size` as the factor.
   * @type {ImageProcessor['get_resize_output_image_size']}
   */
  get_resize_output_image_size(image, size) {
    const { height, width } = image;
    const merged_patch = this.patch_size * this.merge_size;
    return smart_resize(height, width, merged_patch, this.min_pixels, this.max_pixels);
  }
  /**
   * Runs the base preprocessing, then rearranges the pixel values into a flat
   * list of patches and reports the patch grid as `image_grid_thw`.
   */
  async _call(images, ...args) {
    const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
    const { temporal_patch_size, merge_size, patch_size } = this.config;

    // A single frame is repeated so that it fills one temporal patch.
    const frames = pixel_values.dims[0] === 1
      ? cat(Array.from({ length: temporal_patch_size }, () => pixel_values), 0)
      : pixel_values;

    const [num_frames, channel, height, width] = frames.dims;
    const grid_t = num_frames / temporal_patch_size;
    const grid_h = Math.floor(height / patch_size);
    const grid_w = Math.floor(width / patch_size);

    const patch_grid = frames.view(
      grid_t,
      temporal_patch_size,
      channel,
      Math.floor(grid_h / merge_size),
      merge_size,
      patch_size,
      Math.floor(grid_w / merge_size),
      merge_size,
      patch_size
    );
    const flatten_patches = patch_grid
      .permute(0, 3, 6, 4, 7, 2, 1, 5, 8)
      .view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);

    return {
      pixel_values: flatten_patches,
      image_grid_thw: new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]),
      original_sizes,
      reshaped_input_sizes
    };
  }
};
|
|
15059
|
-
|
|
15060
15758
|
// src/models/rt_detr/image_processing_rt_detr.js
|
|
15061
15759
|
var RTDetrImageProcessor = class extends ImageProcessor {
|
|
15062
15760
|
/** @type {typeof post_process_object_detection} */
|
|
@@ -15610,6 +16308,107 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
15610
16308
|
}
|
|
15611
16309
|
};
|
|
15612
16310
|
|
|
16311
|
+
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
16312
|
+
var Qwen2VLProcessor = class extends Processor {
  static image_processor_class = AutoImageProcessor;
  static tokenizer_class = AutoTokenizer;
  static image_token = "<|image_pad|>";
  /**
   * Expands every image token in the text to one token per merged patch of the
   * corresponding image, then tokenizes the text and merges in the image inputs.
   * Images are matched to tokens in order of appearance across all texts.
   *
   * @param {string|string[]} text
   * @param {RawImage|RawImage[]} images
   * @param {...any} args Accepted for interface compatibility; not used here.
   * @returns {Promise<any>}
   */
  async _call(text, images = null, ...args) {
    let prompts = Array.isArray(text) ? text : [text];

    let image_inputs;
    let image_grid_thw;
    if (images) {
      image_inputs = await this.image_processor(images);
      ({ image_grid_thw } = image_inputs);
    }

    if (image_grid_thw) {
      const merge_length = this.image_processor.config.merge_size ** 2;
      const image_token = (
        /** @type {typeof Qwen2VLProcessor} */
        this.constructor.image_token
      );
      const grids = image_grid_thw.tolist();
      let next_image = 0;

      // Expand via a temporary placeholder so already-expanded tokens are not matched again.
      const expand = (prompt) => {
        let expanded = prompt;
        while (expanded.includes(image_token)) {
          const grid = grids[next_image++];
          const patch_count = Number(grid.reduce((acc, value) => acc * value, 1n));
          const repeats = Math.floor(patch_count / merge_length);
          expanded = expanded.replace(image_token, "<|placeholder|>".repeat(repeats));
        }
        return expanded.replaceAll("<|placeholder|>", image_token);
      };
      prompts = prompts.map((prompt) => expand(prompt));
    }

    return {
      ...this.tokenizer(prompts),
      ...image_inputs
    };
  }
};
|
|
16355
|
+
|
|
16356
|
+
// src/models/glm46v/processing_glm46v.js
|
|
16357
|
+
var Glm46VProcessor = class extends Qwen2VLProcessor {
|
|
16358
|
+
static image_token = "<|image|>";
|
|
16359
|
+
};
|
|
16360
|
+
|
|
16361
|
+
// src/models/granite_speech/processing_granite_speech.js
|
|
16362
|
+
var GraniteSpeechProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = true;
  /**
   * Compute the number of audio tokens for a given raw audio length.
   * The sample count is converted to mel frames (`floor(len / hop_length) + 1`),
   * halved, grouped into blocks of `projector_window_size`, and each block
   * contributes `floor(projector_window_size / projector_downsample_rate)` tokens.
   * @param {number} audioLength Raw audio sample count.
   * @returns {number} Number of projector output tokens.
   */
  _get_num_audio_features(audioLength) {
    const extractor_config = this.feature_extractor.config;
    const hop_length = extractor_config.melspec_kwargs.hop_length;
    const window = extractor_config.projector_window_size;
    const tokens_per_block = Math.floor(window / extractor_config.projector_downsample_rate);

    const mel_frames = Math.floor(audioLength / hop_length) + 1;
    const encoder_frames = Math.floor(mel_frames / 2);
    const block_count = Math.ceil(encoder_frames / window);
    return block_count * tokens_per_block;
  }
  /**
   * Extracts audio features (when audio is given), repeats each audio token in
   * the text once per audio feature, and tokenizes the result.
   * @param {string} text The text input to process.
   * @param {Float32Array} audio The audio input to process.
   * @param {Record<string, any>} [kwargs] Extra tokenizer options; they override `add_special_tokens: false`.
   */
  async _call(text, audio = null, kwargs = {}) {
    if (Array.isArray(text)) {
      throw new Error("Batched inputs are not supported yet.");
    }

    const audio_inputs = {};
    let prompt = text;
    if (audio) {
      const { input_features } = await this.feature_extractor(audio);
      audio_inputs.input_features = input_features;

      const num_audio_tokens = this._get_num_audio_features(audio.length);
      const all_ones = new Uint8Array(num_audio_tokens).fill(1);
      audio_inputs.input_features_mask = new Tensor2("bool", all_ones, [1, num_audio_tokens]);

      const audio_token = this.config.audio_token ?? "<|audio|>";
      if (!prompt.includes(audio_token)) {
        throw new Error(`The input text does not contain the audio token ${audio_token}.`);
      }
      prompt = prompt.replaceAll(audio_token, audio_token.repeat(num_audio_tokens));
    }

    const tokenizer_options = { add_special_tokens: false, ...kwargs };
    return {
      ...this.tokenizer(prompt, tokenizer_options),
      ...audio_inputs
    };
  }
};
|
|
16411
|
+
|
|
15613
16412
|
// src/models/grounding_dino/processing_grounding_dino.js
|
|
15614
16413
|
function get_phrases_from_posmap(posmaps, input_ids) {
|
|
15615
16414
|
const left_idx = 0;
|
|
@@ -15886,6 +16685,66 @@ var JinaCLIPProcessor = class extends Processor {
|
|
|
15886
16685
|
}
|
|
15887
16686
|
};
|
|
15888
16687
|
|
|
16688
|
+
// src/models/lfm2_vl/processing_lfm2_vl.js
|
|
16689
|
+
var Lfm2VlProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static image_processor_class = AutoImageProcessor;
  /**
   * Runs the image processor (always requesting row/column info) and, when text
   * is given, replaces each image token with the full image-token block for the
   * corresponding image before tokenizing. Images are matched to image tokens
   * in order of appearance across all text samples.
   *
   * Per image the expansion is:
   *   image_start, then either
   *   - for a tiled image (more than one row or column): one
   *     `<|img_row_R_col_C|>` marker plus `tokens_per_tile` image tokens per tile,
   *     optionally followed by the thumbnail token and the thumbnail's image tokens; or
   *   - for a single-tile image: the image tokens for the resized image,
   *   then image_end.
   *
   * @param {RawImage|RawImage[]} images
   * @param {string|string[]|null} [text]
   * @param {Record<string, any>} [kwargs] Forwarded to both the image processor and the tokenizer.
   */
  async _call(images, text = null, kwargs = {}) {
    const { image_rows, image_cols, image_sizes, ...image_inputs } = await this.image_processor(images, {
      ...kwargs,
      return_row_col_info: true
    });
    if (text) {
      const image_token = this.config.image_token ?? "<image>";
      const image_config = (
        /** @type {Record<string, any>} */
        this.image_processor.config
      );
      // Resolve the geometry options with the same nullish fallbacks that
      // Lfm2VlImageProcessor's constructor applies (including
      // `encoder_patch_size -> patch_size -> 16`); otherwise the number of image
      // tokens inserted here can disagree with the patches the image processor produced.
      const tile_size = image_config.tile_size ?? 512;
      const downsample_factor = image_config.downsample_factor ?? 2;
      const encoder_patch_size = image_config.encoder_patch_size ?? image_config.patch_size ?? 16;
      const use_thumbnail = image_config.use_thumbnail ?? true;

      // Number of image tokens along one side of `s` pixels.
      const ds = (s) => Math.ceil(Math.floor(s / encoder_patch_size) / downsample_factor);
      const tokens_per_tile = ds(tile_size) ** 2;
      const image_start = this.config.image_start_token ?? "<|image_start|>";
      const image_end = this.config.image_end_token ?? "<|image_end|>";
      const thumbnail_token = this.config.image_thumbnail ?? "<|img_thumbnail|>";
      if (!Array.isArray(text)) text = [text];
      let image_idx = 0;
      text = text.map((sample) => {
        // Every gap between two split parts corresponds to one image token.
        const parts = sample.split(image_token);
        return parts[0] + parts.slice(1).map((part) => {
          const idx = image_idx++;
          const [h, w] = image_sizes[idx];
          const rows = image_rows[idx];
          const cols = image_cols[idx];
          const tokens_for_image = ds(h) * ds(w);
          let expanded = image_start;
          if (rows > 1 || cols > 1) {
            const tile_str = image_token.repeat(tokens_per_tile);
            for (let r = 0; r < rows; ++r) {
              for (let c = 0; c < cols; ++c) {
                expanded += `<|img_row_${r + 1}_col_${c + 1}|>` + tile_str;
              }
            }
            if (use_thumbnail) expanded += thumbnail_token + image_token.repeat(tokens_for_image);
          } else {
            expanded += image_token.repeat(tokens_for_image);
          }
          return expanded + image_end + part;
        }).join("");
      });
    }
    return {
      ...image_inputs,
      ...text ? this.tokenizer(text, kwargs) : {}
    };
  }
};
|
|
16747
|
+
|
|
15889
16748
|
// src/models/llava/processing_llava.js
|
|
15890
16749
|
var LlavaProcessor = class extends Processor {
|
|
15891
16750
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -16229,47 +17088,6 @@ var PyAnnoteProcessor = class extends Processor {
|
|
|
16229
17088
|
}
|
|
16230
17089
|
};
|
|
16231
17090
|
|
|
16232
|
-
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
16233
|
-
var Qwen2VLProcessor = class extends Processor {
  static image_processor_class = AutoImageProcessor;
  static tokenizer_class = AutoTokenizer;
  /**
   * Expands every `<|image_pad|>` token in the text to one token per merged
   * patch of the corresponding image, then tokenizes the text and merges in the
   * image inputs. Images are matched to tokens in order of appearance.
   *
   * @param {string|string[]} text
   * @param {RawImage|RawImage[]} images
   * @param {...any} args Accepted for interface compatibility; not used here.
   * @returns {Promise<any>}
   */
  async _call(text, images = null, ...args) {
    let prompts = Array.isArray(text) ? text : [text];

    let image_inputs;
    let image_grid_thw;
    if (images) {
      image_inputs = await this.image_processor(images);
      ({ image_grid_thw } = image_inputs);
    }

    if (image_grid_thw) {
      const merge_length = this.image_processor.config.merge_size ** 2;
      const grids = image_grid_thw.tolist();
      let next_image = 0;

      // Expand via a temporary placeholder so already-expanded tokens are not matched again.
      const expand = (prompt) => {
        let expanded = prompt;
        while (expanded.includes("<|image_pad|>")) {
          const grid = grids[next_image++];
          const patch_count = Number(grid.reduce((acc, value) => acc * value, 1n));
          const repeats = Math.floor(patch_count / merge_length);
          expanded = expanded.replace("<|image_pad|>", "<|placeholder|>".repeat(repeats));
        }
        return expanded.replaceAll("<|placeholder|>", "<|image_pad|>");
      };
      prompts = prompts.map((prompt) => expand(prompt));
    }

    return {
      ...this.tokenizer(prompts),
      ...image_inputs
      // TODO: ...videos_inputs,
    };
  }
};
|
|
16272
|
-
|
|
16273
17091
|
// src/models/qwen2_5_vl/processing_qwen2_5_vl.js
|
|
16274
17092
|
var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
|
|
16275
17093
|
};
|
|
@@ -16418,6 +17236,94 @@ var VoxtralProcessor = class extends Processor {
|
|
|
16418
17236
|
}
|
|
16419
17237
|
};
|
|
16420
17238
|
|
|
17239
|
+
// src/models/voxtral_realtime/processing_voxtral_realtime.js
|
|
17240
|
+
var NUM_LEFT_PAD_TOKENS = 32;
|
|
17241
|
+
var NUM_DELAY_TOKENS = 6;
|
|
17242
|
+
var AUDIO_LENGTH_PER_TOK = 8;
|
|
17243
|
+
var OFFLINE_STREAMING_BUFFER_TOKENS = 10;
|
|
17244
|
+
var STREAMING_PAD_TOKEN_ID = 32;
|
|
17245
|
+
var VoxtralRealtimeProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = false;
  /** Number of mel frames in the first audio chunk. */
  get num_mel_frames_first_audio_chunk() {
    const tokens_in_first_chunk = NUM_DELAY_TOKENS + 1;
    return tokens_in_first_chunk * AUDIO_LENGTH_PER_TOK;
  }
  /** Number of raw audio samples in the first audio chunk. */
  get num_samples_first_audio_chunk() {
    const { hop_length, n_fft } = this.feature_extractor.config;
    const hops = this.num_mel_frames_first_audio_chunk - 1;
    return hops * hop_length + Math.floor(n_fft / 2);
  }
  /** Number of raw audio samples per subsequent audio chunk. */
  get num_samples_per_audio_chunk() {
    const { hop_length, n_fft } = this.feature_extractor.config;
    return AUDIO_LENGTH_PER_TOK * hop_length + n_fft;
  }
  /** Number of right-pad tokens for non-streaming mode. */
  get num_right_pad_tokens() {
    return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
  }
  /** Number of mel frames per text token. */
  get audio_length_per_tok() {
    return AUDIO_LENGTH_PER_TOK;
  }
  /** Number of raw audio samples per token. */
  get raw_audio_length_per_tok() {
    const { hop_length } = this.feature_extractor.config;
    return AUDIO_LENGTH_PER_TOK * hop_length;
  }
  /**
   * Process audio input for VoxtralRealtime.
   *
   * - Streaming, first chunk: the audio is left-padded with silence, features
   *   are extracted with `center=true`, and `input_ids` (token id 1 followed by
   *   streaming pad tokens) are returned together with the extractor output.
   * - Streaming, later chunks: the chunk is passed to the feature extractor
   *   with `center=false` and only the extractor output is returned.
   * - Non-streaming: the audio is right-padded with silence and processed with
   *   `center=true`; only the extractor output is returned.
   *
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @param {Object} [options]
   * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
   * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
   * @returns {Promise<Object>}
   * @throws {Error} If `is_first_audio_chunk` is false outside streaming mode.
   */
  async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeProcessor");
    if (!(is_streaming || is_first_audio_chunk)) {
      throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");
    }

    // Follow-up streaming chunk: no padding, no prompt tokens.
    if (!is_first_audio_chunk) {
      return await this.feature_extractor(audio, { center: false });
    }

    // Offline transcription: append trailing silence.
    if (!is_streaming) {
      const trailing_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
      const with_tail = new Float32Array(audio.length + trailing_samples);
      with_tail.set(audio);
      return await this.feature_extractor(with_tail, { center: true });
    }

    // First streaming chunk: prepend silence and build the initial prompt ids.
    const leading_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
    const with_head = new Float32Array(leading_samples + audio.length);
    with_head.set(audio, leading_samples);
    const audio_encoding = await this.feature_extractor(with_head, { center: true });

    const prompt_length = 1 + NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
    const prompt_ids = new BigInt64Array(prompt_length).fill(BigInt(STREAMING_PAD_TOKEN_ID));
    prompt_ids[0] = 1n;
    return {
      input_ids: new Tensor2("int64", prompt_ids, [1, prompt_length]),
      ...audio_encoding
    };
  }
};
|
|
17326
|
+
|
|
16421
17327
|
// src/models/wav2vec2/processing_wav2vec2.js
|
|
16422
17328
|
var Wav2Vec2Processor = class extends Processor {
|
|
16423
17329
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -16517,11 +17423,16 @@ function getNormalizedConfig(config) {
|
|
|
16517
17423
|
case "florence2":
|
|
16518
17424
|
case "llava_onevision":
|
|
16519
17425
|
case "idefics3":
|
|
17426
|
+
case "granite_speech":
|
|
16520
17427
|
case "ultravox":
|
|
16521
17428
|
case "voxtral":
|
|
17429
|
+
case "voxtral_realtime":
|
|
16522
17430
|
case "smolvlm":
|
|
16523
17431
|
case "gemma3n":
|
|
17432
|
+
case "lfm2_vl":
|
|
16524
17433
|
case "chatterbox":
|
|
17434
|
+
case "lighton_ocr":
|
|
17435
|
+
case "glm_ocr":
|
|
16525
17436
|
case "mistral3":
|
|
16526
17437
|
case "qwen2_5_vl":
|
|
16527
17438
|
case "qwen3_vl":
|
|
@@ -16575,10 +17486,13 @@ function getNormalizedConfig(config) {
|
|
|
16575
17486
|
case "cohere":
|
|
16576
17487
|
case "cohere2":
|
|
16577
17488
|
case "mistral":
|
|
17489
|
+
case "voxtral_realtime_text":
|
|
17490
|
+
case "voxtral_realtime_encoder":
|
|
16578
17491
|
case "starcoder2":
|
|
16579
17492
|
case "qwen2":
|
|
16580
17493
|
case "qwen2_moe":
|
|
16581
17494
|
case "qwen2_vl":
|
|
17495
|
+
case "qwen2_vl_text":
|
|
16582
17496
|
case "qwen2_5_vl_text":
|
|
16583
17497
|
case "qwen3_moe":
|
|
16584
17498
|
case "qwen3_vl_text":
|
|
@@ -16594,6 +17508,8 @@ function getNormalizedConfig(config) {
|
|
|
16594
17508
|
mapping["dim_kv"] = "head_dim";
|
|
16595
17509
|
break;
|
|
16596
17510
|
case "qwen3":
|
|
17511
|
+
case "solar_open":
|
|
17512
|
+
case "glm_ocr_text":
|
|
16597
17513
|
case "gemma":
|
|
16598
17514
|
case "gemma2":
|
|
16599
17515
|
case "vaultgemma":
|
|
@@ -16604,6 +17520,7 @@ function getNormalizedConfig(config) {
|
|
|
16604
17520
|
case "ernie4_5":
|
|
16605
17521
|
case "hunyuan_v1_dense":
|
|
16606
17522
|
case "falcon_h1":
|
|
17523
|
+
case "nemotron_h":
|
|
16607
17524
|
case "ministral":
|
|
16608
17525
|
case "ministral3":
|
|
16609
17526
|
mapping["num_heads"] = "num_key_value_heads";
|
|
@@ -16638,6 +17555,9 @@ function getNormalizedConfig(config) {
|
|
|
16638
17555
|
mapping["num_attention_heads"] = "num_attention_heads";
|
|
16639
17556
|
break;
|
|
16640
17557
|
case "youtu":
|
|
17558
|
+
case "deepseek_v3":
|
|
17559
|
+
case "glm_moe_dsa":
|
|
17560
|
+
case "mistral4":
|
|
16641
17561
|
mapping["num_heads"] = "num_key_value_heads";
|
|
16642
17562
|
mapping["num_layers"] = "num_hidden_layers";
|
|
16643
17563
|
mapping["dim_kv"] = "qk_head_dim";
|
|
@@ -16723,6 +17643,10 @@ function getNormalizedConfig(config) {
|
|
|
16723
17643
|
return normalized_config;
|
|
16724
17644
|
}
|
|
16725
17645
|
function getCacheShapes(config, options) {
|
|
17646
|
+
if (!(config instanceof PretrainedConfig)) {
|
|
17647
|
+
config = new PretrainedConfig(config);
|
|
17648
|
+
}
|
|
17649
|
+
const batch_size = options?.batch_size ?? 1;
|
|
16726
17650
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
16727
17651
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
16728
17652
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -16732,7 +17656,6 @@ function getCacheShapes(config, options) {
|
|
|
16732
17656
|
config
|
|
16733
17657
|
);
|
|
16734
17658
|
const head_dim = hidden_size / num_attention_heads;
|
|
16735
|
-
const batch_size = options?.batch_size ?? 1;
|
|
16736
17659
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
16737
17660
|
if (layer_types[i] === "full_attention") {
|
|
16738
17661
|
for (const kv of ["key", "value"]) {
|
|
@@ -16745,31 +17668,26 @@ function getCacheShapes(config, options) {
|
|
|
16745
17668
|
}
|
|
16746
17669
|
}
|
|
16747
17670
|
return cache_values;
|
|
16748
|
-
} else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
|
|
17671
|
+
} else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
|
|
16749
17672
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
16750
17673
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
16751
|
-
const
|
|
16752
|
-
const {
|
|
16753
|
-
layer_types,
|
|
16754
|
-
num_hidden_layers,
|
|
16755
|
-
num_attention_heads,
|
|
16756
|
-
num_key_value_heads,
|
|
16757
|
-
hidden_size,
|
|
16758
|
-
mamba_d_conv,
|
|
16759
|
-
mamba_n_heads,
|
|
16760
|
-
mamba_d_head,
|
|
16761
|
-
mamba_d_state,
|
|
16762
|
-
mamba_n_groups,
|
|
16763
|
-
mamba_expand,
|
|
16764
|
-
mamba_d_ssm
|
|
16765
|
-
} = (
|
|
17674
|
+
const c = (
|
|
16766
17675
|
/** @type {any} */
|
|
16767
17676
|
config
|
|
16768
17677
|
);
|
|
16769
|
-
const
|
|
16770
|
-
const
|
|
16771
|
-
const
|
|
16772
|
-
|
|
17678
|
+
const layer_types = c.layer_types ?? c.layers_block_type;
|
|
17679
|
+
const num_layers = c.num_hidden_layers ?? layer_types?.length;
|
|
17680
|
+
const num_key_value_heads = c.num_key_value_heads;
|
|
17681
|
+
const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
|
|
17682
|
+
const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
|
|
17683
|
+
const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
|
|
17684
|
+
const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
|
|
17685
|
+
const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
|
|
17686
|
+
const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
|
|
17687
|
+
const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
|
|
17688
|
+
const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
|
|
17689
|
+
const cache_values = {};
|
|
17690
|
+
for (let i = 0; i < num_layers; ++i) {
|
|
16773
17691
|
if (!layer_types || layer_types[i] === "mamba") {
|
|
16774
17692
|
cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
|
|
16775
17693
|
cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
|
|
@@ -16803,7 +17721,6 @@ function getCacheShapes(config, options) {
|
|
|
16803
17721
|
const key_dim = linear_key_head_dim * linear_num_key_heads;
|
|
16804
17722
|
const value_dim = linear_value_head_dim * linear_num_value_heads;
|
|
16805
17723
|
const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
|
|
16806
|
-
const batch_size = options?.batch_size ?? 1;
|
|
16807
17724
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
16808
17725
|
if (layer_types[i] === "full_attention") {
|
|
16809
17726
|
for (const kv of ["key", "value"]) {
|
|
@@ -16829,12 +17746,16 @@ function getCacheShapes(config, options) {
|
|
|
16829
17746
|
}
|
|
16830
17747
|
}
|
|
16831
17748
|
return cache_values;
|
|
16832
|
-
} else if (["qwen3_5", "qwen3_5_moe"].includes(config.model_type)) {
|
|
16833
|
-
|
|
16834
|
-
|
|
16835
|
-
|
|
16836
|
-
|
|
16837
|
-
|
|
17749
|
+
} else if (["lfm2_vl", "qwen3_5", "qwen3_5_moe", "voxtral_realtime"].includes(config.model_type)) {
|
|
17750
|
+
let subConfig;
|
|
17751
|
+
if (config.model_type === "voxtral_realtime" && options?.session_name === "audio_encoder") {
|
|
17752
|
+
subConfig = /** @type {any} */
|
|
17753
|
+
config.audio_config;
|
|
17754
|
+
} else {
|
|
17755
|
+
subConfig = /** @type {any} */
|
|
17756
|
+
config.text_config;
|
|
17757
|
+
}
|
|
17758
|
+
return getCacheShapes(subConfig, options);
|
|
16838
17759
|
}
|
|
16839
17760
|
return getKeyValueShapes(config, options);
|
|
16840
17761
|
}
|
|
@@ -17000,7 +17921,7 @@ async function getModelDataFiles(pretrained_model_name_or_path, fileName, suffix
|
|
|
17000
17921
|
}
|
|
17001
17922
|
|
|
17002
17923
|
// src/models/session.js
|
|
17003
|
-
async function getSession(pretrained_model_name_or_path, fileName, options,
|
|
17924
|
+
async function getSession(pretrained_model_name_or_path, fileName, options, cache_config = false, session_name = void 0) {
|
|
17004
17925
|
let custom_config = options.config?.["transformers.js_config"] ?? {};
|
|
17005
17926
|
const selectedDevice = (
|
|
17006
17927
|
/** @type {import("../utils/devices.js").DeviceType} */
|
|
@@ -17058,9 +17979,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
17058
17979
|
if (externalData.length > 0 && !apis.IS_NODE_ENV) {
|
|
17059
17980
|
session_options.externalData = externalData;
|
|
17060
17981
|
}
|
|
17061
|
-
if (
|
|
17982
|
+
if (cache_config && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
|
|
17062
17983
|
const shapes = getCacheShapes(options.config, {
|
|
17063
|
-
prefix: "present"
|
|
17984
|
+
prefix: "present",
|
|
17985
|
+
session_name
|
|
17064
17986
|
});
|
|
17065
17987
|
if (Object.keys(shapes).length > 0 && !isONNXProxy()) {
|
|
17066
17988
|
const preferredOutputLocation = {};
|
|
@@ -17078,15 +18000,17 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
17078
18000
|
};
|
|
17079
18001
|
return { buffer_or_path, session_options, session_config };
|
|
17080
18002
|
}
|
|
17081
|
-
async function constructSessions(pretrained_model_name_or_path, names, options,
|
|
18003
|
+
async function constructSessions(pretrained_model_name_or_path, names, options, cache_sessions = void 0) {
|
|
17082
18004
|
return Object.fromEntries(
|
|
17083
18005
|
await Promise.all(
|
|
17084
18006
|
Object.keys(names).map(async (name) => {
|
|
18007
|
+
const cache_config = cache_sessions?.[name] ?? false;
|
|
17085
18008
|
const { buffer_or_path, session_options, session_config } = await getSession(
|
|
17086
18009
|
pretrained_model_name_or_path,
|
|
17087
18010
|
names[name],
|
|
17088
18011
|
options,
|
|
17089
|
-
|
|
18012
|
+
cache_config,
|
|
18013
|
+
name
|
|
17090
18014
|
);
|
|
17091
18015
|
const session = await createInferenceSession(buffer_or_path, session_options, session_config);
|
|
17092
18016
|
return [name, session];
|
|
@@ -18386,19 +19310,71 @@ var BeamSearchSampler = class extends LogitsSampler {
|
|
|
18386
19310
|
}
|
|
18387
19311
|
};
|
|
18388
19312
|
|
|
19313
|
+
// src/cache_utils.js
|
|
19314
|
+
var _DynamicCache = class {
  /**
   * Create a DynamicCache, optionally pre-populated with entries.
   *
   * Each own enumerable property of `entries` is copied onto the instance under
   * the same name, so cache entries are read back as plain properties.
   *
   * @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
   * @throws {TypeError} If a key collides with a property that already exists on
   * the cache (including its methods), or if a value is not a Tensor.
   */
  constructor(entries) {
    if (!entries) return;
    // Object.entries only visits own enumerable keys, so members inherited through
    // the prototype chain of `entries` are never mistaken for cache entries.
    for (const [key, value] of Object.entries(entries)) {
      if (key in this) {
        throw new TypeError(`Key "${key}" conflicts with an existing property on DynamicCache`);
      }
      if (!(value instanceof Tensor2)) {
        throw new TypeError(`Expected a Tensor for key "${key}", got ${typeof value}`);
      }
      this[key] = value;
    }
  }
  /**
   * Get the cached sequence length. This requires at least one attention cache entry to be present.
   *
   * The length is read from the second-to-last dimension of the first entry
   * whose name starts with "past_key_values.".
   *
   * @returns {number} The past sequence length.
   * @throws {Error} If no entry name starts with "past_key_values.".
   */
  get_seq_length() {
    const self2 = (
      /** @type {any} */
      this
    );
    for (const name of Object.keys(self2)) {
      if (name.startsWith("past_key_values.")) {
        return self2[name].dims.at(-2);
      }
    }
    throw new Error("Unable to determine sequence length from the cache.");
  }
  /**
   * Dispose all contained tensors whose `location` is "gpu-buffer".
   * Entries stored elsewhere are left untouched, and empty (null/undefined)
   * slots are skipped instead of aborting the whole disposal.
   *
   * @returns {Promise<void>} Promise that resolves when all GPU tensors are disposed.
   */
  async dispose() {
    const pending = [];
    for (
      const t of
      /** @type {Tensor[]} */
      Object.values(this)
    ) {
      // `?.` guards against a slot that was cleared by assigning null/undefined.
      if (t?.location === "gpu-buffer") {
        pending.push(t.dispose());
      }
    }
    await Promise.all(pending);
  }
};
var DynamicCache = (
  /** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */
  /** @type {unknown} */
  _DynamicCache
);
|
|
19372
|
+
|
|
18389
19373
|
// src/models/modeling_utils.js
|
|
18390
19374
|
var MODEL_MAPPING_NAMES = null;
|
|
18391
19375
|
/**
 * Store the supplied mappings in the module-level `MODEL_MAPPING_NAMES`
 * variable, replacing whatever was registered before.
 *
 * @param {any} mappings Value to register; it is stored as-is, without copying or validation.
 * @returns {void}
 */
function registerTaskMappings(mappings) {
  MODEL_MAPPING_NAMES = mappings;
}
|
|
18394
|
-
function getPastLength(past_key_values) {
|
|
18395
|
-
for (const name in past_key_values) {
|
|
18396
|
-
if (name.startsWith("past_key_values.")) {
|
|
18397
|
-
return past_key_values[name].dims.at(-2);
|
|
18398
|
-
}
|
|
18399
|
-
}
|
|
18400
|
-
return Object.values(past_key_values)[0].dims.at(-2);
|
|
18401
|
-
}
|
|
18402
19378
|
function toI64Tensor(items) {
|
|
18403
19379
|
if (items instanceof Tensor2) {
|
|
18404
19380
|
return items;
|
|
@@ -18439,71 +19415,181 @@ var MODEL_TYPES = {
|
|
|
18439
19415
|
AutoEncoder: 12,
|
|
18440
19416
|
ImageAudioTextToText: 13,
|
|
18441
19417
|
Supertonic: 14,
|
|
18442
|
-
Chatterbox: 15
|
|
19418
|
+
Chatterbox: 15,
|
|
19419
|
+
MultimodalLanguageModelOnly: 16,
|
|
19420
|
+
VoxtralRealtime: 17
|
|
18443
19421
|
};
|
|
18444
19422
|
var MODEL_TYPE_CONFIG = {
|
|
18445
19423
|
[MODEL_TYPES.DecoderOnly]: {
|
|
18446
19424
|
can_generate: true,
|
|
18447
19425
|
forward: decoder_forward,
|
|
18448
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
19426
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
19427
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" }),
|
|
19428
|
+
cache_sessions: { model: true },
|
|
19429
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18449
19430
|
},
|
|
18450
19431
|
[MODEL_TYPES.DecoderOnlyWithoutHead]: {
|
|
18451
19432
|
can_generate: false,
|
|
18452
19433
|
forward: decoder_forward,
|
|
18453
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
19434
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
19435
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
18454
19436
|
},
|
|
18455
19437
|
[MODEL_TYPES.Seq2Seq]: {
|
|
18456
19438
|
can_generate: true,
|
|
18457
19439
|
forward: seq2seq_forward,
|
|
18458
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
19440
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
19441
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
19442
|
+
cache_sessions: { decoder_model_merged: true },
|
|
19443
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18459
19444
|
},
|
|
18460
19445
|
[MODEL_TYPES.Vision2Seq]: {
|
|
18461
19446
|
can_generate: true,
|
|
18462
19447
|
forward: seq2seq_forward,
|
|
18463
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
19448
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
19449
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
19450
|
+
cache_sessions: { decoder_model_merged: true },
|
|
19451
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18464
19452
|
},
|
|
18465
19453
|
[MODEL_TYPES.Musicgen]: {
|
|
18466
19454
|
can_generate: true,
|
|
18467
|
-
forward: seq2seq_forward
|
|
19455
|
+
forward: seq2seq_forward,
|
|
19456
|
+
sessions: () => ({
|
|
19457
|
+
model: "text_encoder",
|
|
19458
|
+
decoder_model_merged: "decoder_model_merged",
|
|
19459
|
+
encodec_decode: "encodec_decode"
|
|
19460
|
+
}),
|
|
19461
|
+
cache_sessions: { decoder_model_merged: true },
|
|
19462
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18468
19463
|
},
|
|
18469
19464
|
[MODEL_TYPES.EncoderDecoder]: {
|
|
18470
19465
|
can_generate: false,
|
|
18471
|
-
forward: seq2seq_forward
|
|
19466
|
+
forward: seq2seq_forward,
|
|
19467
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
19468
|
+
cache_sessions: { decoder_model_merged: true }
|
|
19469
|
+
},
|
|
19470
|
+
[MODEL_TYPES.MaskGeneration]: {
|
|
19471
|
+
sessions: () => ({ model: "vision_encoder", prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder" })
|
|
18472
19472
|
},
|
|
18473
19473
|
[MODEL_TYPES.ImageTextToText]: {
|
|
18474
19474
|
can_generate: true,
|
|
18475
19475
|
forward: image_text_to_text_forward,
|
|
18476
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
19476
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
19477
|
+
sessions: (config) => {
|
|
19478
|
+
const s = {
|
|
19479
|
+
embed_tokens: "embed_tokens",
|
|
19480
|
+
vision_encoder: "vision_encoder",
|
|
19481
|
+
decoder_model_merged: "decoder_model_merged"
|
|
19482
|
+
};
|
|
19483
|
+
if (config.is_encoder_decoder) s["model"] = "encoder_model";
|
|
19484
|
+
return s;
|
|
19485
|
+
},
|
|
19486
|
+
cache_sessions: { decoder_model_merged: true },
|
|
19487
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18477
19488
|
},
|
|
18478
19489
|
[MODEL_TYPES.AudioTextToText]: {
|
|
18479
19490
|
can_generate: true,
|
|
18480
19491
|
forward: audio_text_to_text_forward,
|
|
18481
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
19492
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
19493
|
+
sessions: () => ({
|
|
19494
|
+
embed_tokens: "embed_tokens",
|
|
19495
|
+
audio_encoder: "audio_encoder",
|
|
19496
|
+
decoder_model_merged: "decoder_model_merged"
|
|
19497
|
+
}),
|
|
19498
|
+
cache_sessions: { decoder_model_merged: true },
|
|
19499
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18482
19500
|
},
|
|
18483
|
-
[MODEL_TYPES.
|
|
19501
|
+
[MODEL_TYPES.ImageAudioTextToText]: {
|
|
18484
19502
|
can_generate: true,
|
|
18485
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
19503
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
19504
|
+
sessions: () => ({
|
|
19505
|
+
embed_tokens: "embed_tokens",
|
|
19506
|
+
audio_encoder: "audio_encoder",
|
|
19507
|
+
vision_encoder: "vision_encoder",
|
|
19508
|
+
decoder_model_merged: "decoder_model_merged"
|
|
19509
|
+
}),
|
|
19510
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18486
19511
|
},
|
|
18487
|
-
[MODEL_TYPES.
|
|
19512
|
+
[MODEL_TYPES.Phi3V]: {
|
|
18488
19513
|
can_generate: true,
|
|
18489
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
19514
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
19515
|
+
sessions: () => ({
|
|
19516
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
19517
|
+
model: "model",
|
|
19518
|
+
vision_encoder: "vision_encoder"
|
|
19519
|
+
}),
|
|
19520
|
+
cache_sessions: { model: true },
|
|
19521
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18490
19522
|
},
|
|
18491
19523
|
[MODEL_TYPES.MultiModality]: {
|
|
18492
|
-
can_generate: true
|
|
19524
|
+
can_generate: true,
|
|
19525
|
+
sessions: () => ({
|
|
19526
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
19527
|
+
model: "language_model",
|
|
19528
|
+
lm_head: "lm_head",
|
|
19529
|
+
gen_head: "gen_head",
|
|
19530
|
+
gen_img_embeds: "gen_img_embeds",
|
|
19531
|
+
image_decode: "image_decode"
|
|
19532
|
+
}),
|
|
19533
|
+
cache_sessions: { model: true },
|
|
19534
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18493
19535
|
},
|
|
18494
19536
|
[MODEL_TYPES.AutoEncoder]: {
|
|
18495
19537
|
can_generate: false,
|
|
18496
|
-
forward: auto_encoder_forward
|
|
19538
|
+
forward: auto_encoder_forward,
|
|
19539
|
+
sessions: () => ({ encoder_model: "encoder_model", decoder_model: "decoder_model" })
|
|
19540
|
+
},
|
|
19541
|
+
[MODEL_TYPES.Supertonic]: {
|
|
19542
|
+
sessions: () => ({
|
|
19543
|
+
text_encoder: "text_encoder",
|
|
19544
|
+
latent_denoiser: "latent_denoiser",
|
|
19545
|
+
voice_decoder: "voice_decoder"
|
|
19546
|
+
})
|
|
18497
19547
|
},
|
|
18498
19548
|
[MODEL_TYPES.Chatterbox]: {
|
|
18499
19549
|
can_generate: true,
|
|
18500
|
-
forward: encoder_forward
|
|
19550
|
+
forward: encoder_forward,
|
|
19551
|
+
sessions: () => ({
|
|
19552
|
+
embed_tokens: "embed_tokens",
|
|
19553
|
+
speech_encoder: "speech_encoder",
|
|
19554
|
+
model: "language_model",
|
|
19555
|
+
conditional_decoder: "conditional_decoder"
|
|
19556
|
+
}),
|
|
19557
|
+
cache_sessions: { model: true },
|
|
19558
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
19559
|
+
},
|
|
19560
|
+
[MODEL_TYPES.MultimodalLanguageModelOnly]: {
|
|
19561
|
+
can_generate: true,
|
|
19562
|
+
forward: image_text_to_text_forward,
|
|
19563
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
19564
|
+
sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
|
|
19565
|
+
cache_sessions: { decoder_model_merged: true },
|
|
19566
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
19567
|
+
},
|
|
19568
|
+
[MODEL_TYPES.VoxtralRealtime]: {
|
|
19569
|
+
can_generate: true,
|
|
19570
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
19571
|
+
sessions: () => ({
|
|
19572
|
+
embed_tokens: "embed_tokens",
|
|
19573
|
+
audio_encoder: "audio_encoder",
|
|
19574
|
+
decoder_model_merged: "decoder_model_merged"
|
|
19575
|
+
}),
|
|
19576
|
+
cache_sessions: { decoder_model_merged: true, audio_encoder: true },
|
|
19577
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18501
19578
|
},
|
|
18502
19579
|
default: {
|
|
18503
19580
|
can_generate: false,
|
|
18504
|
-
forward: encoder_forward
|
|
19581
|
+
forward: encoder_forward,
|
|
19582
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
18505
19583
|
}
|
|
18506
19584
|
};
|
|
19585
|
+
/**
 * Resolve the session layout for a model type.
 *
 * Looks up `modelType` in `MODEL_TYPE_CONFIG`, falling back to its `default`
 * entry when there is no entry for that type, and evaluates the entry's
 * `sessions` factory with the given config and options.
 *
 * @param {number|string} modelType Key into `MODEL_TYPE_CONFIG`.
 * @param {Object} config Model configuration forwarded to the `sessions` factory.
 * @param {Object} [options={}] Loading options forwarded to the `sessions` factory.
 * @returns {{sessions: Object, cache_sessions: (Object|undefined), optional_configs: (Object|undefined)}}
 * The evaluated session map together with the entry's `cache_sessions` and
 * `optional_configs` fields (undefined when the entry does not define them).
 */
function getSessionsConfig(modelType, config, options = {}) {
  const entry = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
  const sessions = entry.sessions(config, options);
  const { cache_sessions, optional_configs } = entry;
  return { sessions, cache_sessions, optional_configs };
}
|
|
18507
19593
|
var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
|
|
18508
19594
|
var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
|
|
18509
19595
|
var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
|
|
@@ -18589,300 +19675,78 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
18589
19675
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
|
|
18590
19676
|
const modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
18591
19677
|
config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
|
|
18592
|
-
|
|
18593
|
-
if (modelType ===
|
|
18594
|
-
|
|
18595
|
-
|
|
18596
|
-
|
|
18597
|
-
{
|
|
18598
|
-
|
|
18599
|
-
},
|
|
18600
|
-
options,
|
|
18601
|
-
"model"
|
|
18602
|
-
),
|
|
18603
|
-
get_optional_configs(
|
|
18604
|
-
pretrained_model_name_or_path,
|
|
18605
|
-
{
|
|
18606
|
-
generation_config: "generation_config.json"
|
|
18607
|
-
},
|
|
18608
|
-
options
|
|
18609
|
-
)
|
|
18610
|
-
]);
|
|
18611
|
-
} else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
|
|
18612
|
-
info = await Promise.all([
|
|
18613
|
-
constructSessions(
|
|
18614
|
-
pretrained_model_name_or_path,
|
|
18615
|
-
{
|
|
18616
|
-
model: "encoder_model",
|
|
18617
|
-
decoder_model_merged: "decoder_model_merged"
|
|
18618
|
-
},
|
|
18619
|
-
options,
|
|
18620
|
-
"decoder_model_merged"
|
|
18621
|
-
),
|
|
18622
|
-
get_optional_configs(
|
|
18623
|
-
pretrained_model_name_or_path,
|
|
18624
|
-
{
|
|
18625
|
-
generation_config: "generation_config.json"
|
|
18626
|
-
},
|
|
18627
|
-
options
|
|
18628
|
-
)
|
|
18629
|
-
]);
|
|
18630
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
18631
|
-
info = await Promise.all([
|
|
18632
|
-
constructSessions(
|
|
18633
|
-
pretrained_model_name_or_path,
|
|
18634
|
-
{
|
|
18635
|
-
model: "vision_encoder",
|
|
18636
|
-
prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder"
|
|
18637
|
-
},
|
|
18638
|
-
options
|
|
18639
|
-
)
|
|
18640
|
-
]);
|
|
18641
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
18642
|
-
info = await Promise.all([
|
|
18643
|
-
constructSessions(
|
|
18644
|
-
pretrained_model_name_or_path,
|
|
18645
|
-
{
|
|
18646
|
-
model: "encoder_model",
|
|
18647
|
-
decoder_model_merged: "decoder_model_merged"
|
|
18648
|
-
},
|
|
18649
|
-
options,
|
|
18650
|
-
"decoder_model_merged"
|
|
18651
|
-
)
|
|
18652
|
-
]);
|
|
18653
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
18654
|
-
const sessions = {
|
|
18655
|
-
embed_tokens: "embed_tokens",
|
|
18656
|
-
vision_encoder: "vision_encoder",
|
|
18657
|
-
decoder_model_merged: "decoder_model_merged"
|
|
18658
|
-
};
|
|
18659
|
-
if (config.is_encoder_decoder) {
|
|
18660
|
-
sessions["model"] = "encoder_model";
|
|
19678
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
19679
|
+
if (modelType === void 0) {
|
|
19680
|
+
const type = modelName ?? config?.model_type;
|
|
19681
|
+
if (type !== "custom") {
|
|
19682
|
+
logger.warn(
|
|
19683
|
+
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
19684
|
+
);
|
|
18661
19685
|
}
|
|
18662
|
-
|
|
18663
|
-
|
|
18664
|
-
|
|
18665
|
-
|
|
18666
|
-
|
|
18667
|
-
|
|
18668
|
-
|
|
18669
|
-
|
|
18670
|
-
|
|
18671
|
-
|
|
18672
|
-
|
|
18673
|
-
|
|
18674
|
-
|
|
18675
|
-
|
|
18676
|
-
|
|
18677
|
-
|
|
18678
|
-
|
|
18679
|
-
|
|
18680
|
-
|
|
18681
|
-
|
|
18682
|
-
|
|
18683
|
-
|
|
18684
|
-
|
|
18685
|
-
|
|
18686
|
-
|
|
18687
|
-
|
|
18688
|
-
|
|
18689
|
-
|
|
18690
|
-
|
|
18691
|
-
|
|
18692
|
-
|
|
18693
|
-
|
|
18694
|
-
|
|
18695
|
-
|
|
18696
|
-
|
|
18697
|
-
|
|
18698
|
-
|
|
18699
|
-
|
|
18700
|
-
|
|
18701
|
-
|
|
18702
|
-
|
|
18703
|
-
|
|
18704
|
-
|
|
18705
|
-
|
|
18706
|
-
|
|
18707
|
-
|
|
18708
|
-
|
|
18709
|
-
|
|
18710
|
-
|
|
18711
|
-
|
|
18712
|
-
|
|
18713
|
-
|
|
18714
|
-
|
|
18715
|
-
|
|
18716
|
-
|
|
18717
|
-
|
|
18718
|
-
|
|
18719
|
-
|
|
18720
|
-
|
|
18721
|
-
|
|
18722
|
-
|
|
18723
|
-
|
|
18724
|
-
|
|
18725
|
-
|
|
18726
|
-
info = await Promise.all([
|
|
18727
|
-
constructSessions(
|
|
18728
|
-
pretrained_model_name_or_path,
|
|
18729
|
-
{
|
|
18730
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
18731
|
-
model: "language_model",
|
|
18732
|
-
lm_head: "lm_head",
|
|
18733
|
-
gen_head: "gen_head",
|
|
18734
|
-
gen_img_embeds: "gen_img_embeds",
|
|
18735
|
-
image_decode: "image_decode"
|
|
18736
|
-
},
|
|
18737
|
-
options,
|
|
18738
|
-
"model"
|
|
18739
|
-
),
|
|
18740
|
-
get_optional_configs(
|
|
18741
|
-
pretrained_model_name_or_path,
|
|
18742
|
-
{
|
|
18743
|
-
generation_config: "generation_config.json"
|
|
18744
|
-
},
|
|
18745
|
-
options
|
|
18746
|
-
)
|
|
18747
|
-
]);
|
|
18748
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
18749
|
-
info = await Promise.all([
|
|
18750
|
-
constructSessions(
|
|
18751
|
-
pretrained_model_name_or_path,
|
|
18752
|
-
{
|
|
18753
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
18754
|
-
model: "model",
|
|
18755
|
-
vision_encoder: "vision_encoder"
|
|
18756
|
-
},
|
|
18757
|
-
options,
|
|
18758
|
-
"model"
|
|
18759
|
-
),
|
|
18760
|
-
get_optional_configs(
|
|
18761
|
-
pretrained_model_name_or_path,
|
|
18762
|
-
{
|
|
18763
|
-
generation_config: "generation_config.json"
|
|
18764
|
-
},
|
|
18765
|
-
options
|
|
18766
|
-
)
|
|
18767
|
-
]);
|
|
18768
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
18769
|
-
info = await Promise.all([
|
|
18770
|
-
constructSessions(
|
|
18771
|
-
pretrained_model_name_or_path,
|
|
18772
|
-
{
|
|
18773
|
-
embed_tokens: "embed_tokens",
|
|
18774
|
-
speech_encoder: "speech_encoder",
|
|
18775
|
-
model: "language_model",
|
|
18776
|
-
conditional_decoder: "conditional_decoder"
|
|
18777
|
-
},
|
|
18778
|
-
options,
|
|
18779
|
-
"model"
|
|
18780
|
-
),
|
|
18781
|
-
get_optional_configs(
|
|
18782
|
-
pretrained_model_name_or_path,
|
|
18783
|
-
{
|
|
18784
|
-
generation_config: "generation_config.json"
|
|
18785
|
-
},
|
|
18786
|
-
options
|
|
18787
|
-
)
|
|
18788
|
-
]);
|
|
18789
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
18790
|
-
info = await Promise.all([
|
|
18791
|
-
constructSessions(
|
|
18792
|
-
pretrained_model_name_or_path,
|
|
18793
|
-
{
|
|
18794
|
-
encoder_model: "encoder_model",
|
|
18795
|
-
decoder_model: "decoder_model"
|
|
18796
|
-
},
|
|
18797
|
-
options
|
|
18798
|
-
)
|
|
18799
|
-
]);
|
|
18800
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
18801
|
-
info = await Promise.all([
|
|
18802
|
-
constructSessions(
|
|
18803
|
-
pretrained_model_name_or_path,
|
|
18804
|
-
{
|
|
18805
|
-
text_encoder: "text_encoder",
|
|
18806
|
-
latent_denoiser: "latent_denoiser",
|
|
18807
|
-
voice_decoder: "voice_decoder"
|
|
18808
|
-
},
|
|
18809
|
-
options
|
|
18810
|
-
)
|
|
18811
|
-
]);
|
|
18812
|
-
} else {
|
|
18813
|
-
if (modelType === void 0) {
|
|
18814
|
-
const type = modelName ?? config?.model_type;
|
|
18815
|
-
if (type !== "custom") {
|
|
18816
|
-
logger.warn(
|
|
18817
|
-
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
18818
|
-
);
|
|
18819
|
-
}
|
|
18820
|
-
}
|
|
18821
|
-
info = await Promise.all([
|
|
18822
|
-
constructSessions(
|
|
18823
|
-
pretrained_model_name_or_path,
|
|
18824
|
-
{
|
|
18825
|
-
model: options.model_file_name ?? "model"
|
|
18826
|
-
},
|
|
18827
|
-
options
|
|
18828
|
-
)
|
|
18829
|
-
]);
|
|
18830
|
-
}
|
|
18831
|
-
return new this(config, ...info);
|
|
18832
|
-
}
|
|
18833
|
-
/**
|
|
18834
|
-
* Runs the model with the provided inputs
|
|
18835
|
-
* @param {Object} model_inputs Object containing input tensors
|
|
18836
|
-
* @returns {Promise<Object>} Object containing output tensors
|
|
18837
|
-
*/
|
|
18838
|
-
async _call(model_inputs) {
|
|
18839
|
-
return await this.forward(model_inputs);
|
|
18840
|
-
}
|
|
18841
|
-
/**
|
|
18842
|
-
* Forward method for a pretrained model. If not overridden by a subclass, the correct forward method
|
|
18843
|
-
* will be chosen based on the model type.
|
|
18844
|
-
* @param {Object} model_inputs The input data to the model in the format specified in the ONNX model.
|
|
18845
|
-
* @returns {Promise<Object>} The output data from the model in the format specified in the ONNX model.
|
|
18846
|
-
* @throws {Error} This method must be implemented in subclasses.
|
|
18847
|
-
*/
|
|
18848
|
-
async forward(model_inputs) {
|
|
18849
|
-
return await this._forward(this, model_inputs);
|
|
18850
|
-
}
|
|
18851
|
-
/**
|
|
18852
|
-
* Get the model's generation config, if it exists.
|
|
18853
|
-
* @returns {GenerationConfig|null} The model's generation config if it exists, otherwise `null`.
|
|
18854
|
-
*/
|
|
18855
|
-
get generation_config() {
|
|
18856
|
-
return this.configs?.generation_config ?? null;
|
|
18857
|
-
}
|
|
18858
|
-
/**
|
|
18859
|
-
* @param {GenerationConfig} generation_config
|
|
18860
|
-
* @param {number} input_ids_seq_length The starting sequence length for the input ids.
|
|
18861
|
-
* @returns {LogitsProcessorList}
|
|
18862
|
-
* @private
|
|
18863
|
-
*/
|
|
18864
|
-
_get_logits_processor(generation_config, input_ids_seq_length, logits_processor = null) {
|
|
18865
|
-
const processors = new LogitsProcessorList();
|
|
18866
|
-
if (generation_config.repetition_penalty !== null && generation_config.repetition_penalty !== 1) {
|
|
18867
|
-
processors.push(new RepetitionPenaltyLogitsProcessor(generation_config.repetition_penalty));
|
|
18868
|
-
}
|
|
18869
|
-
if (generation_config.no_repeat_ngram_size !== null && generation_config.no_repeat_ngram_size > 0) {
|
|
18870
|
-
processors.push(new NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size));
|
|
18871
|
-
}
|
|
18872
|
-
if (generation_config.bad_words_ids !== null) {
|
|
18873
|
-
processors.push(
|
|
18874
|
-
new NoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id)
|
|
18875
|
-
);
|
|
18876
|
-
}
|
|
18877
|
-
if (generation_config.min_length !== null && generation_config.eos_token_id !== null && generation_config.min_length > 0) {
|
|
18878
|
-
processors.push(new MinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id));
|
|
18879
|
-
}
|
|
18880
|
-
if (generation_config.min_new_tokens !== null && generation_config.eos_token_id !== null && generation_config.min_new_tokens > 0) {
|
|
18881
|
-
processors.push(
|
|
18882
|
-
new MinNewTokensLengthLogitsProcessor(
|
|
18883
|
-
input_ids_seq_length,
|
|
18884
|
-
generation_config.min_new_tokens,
|
|
18885
|
-
generation_config.eos_token_id
|
|
19686
|
+
}
|
|
19687
|
+
const sessions = typeConfig.sessions(config, options);
|
|
19688
|
+
const promises = [
|
|
19689
|
+
constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
|
|
19690
|
+
];
|
|
19691
|
+
if (typeConfig.optional_configs) {
|
|
19692
|
+
promises.push(get_optional_configs(pretrained_model_name_or_path, typeConfig.optional_configs, options));
|
|
19693
|
+
}
|
|
19694
|
+
const info = await Promise.all(promises);
|
|
19695
|
+
return new this(config, ...info);
|
|
19696
|
+
}
|
|
19697
|
+
/**
|
|
19698
|
+
* Runs the model with the provided inputs
|
|
19699
|
+
* @param {Object} model_inputs Object containing input tensors
|
|
19700
|
+
* @returns {Promise<Object>} Object containing output tensors
|
|
19701
|
+
*/
|
|
19702
|
+
async _call(model_inputs) {
|
|
19703
|
+
return await this.forward(model_inputs);
|
|
19704
|
+
}
|
|
19705
|
+
/**
|
|
19706
|
+
* Forward method for a pretrained model. If not overridden by a subclass, the correct forward method
|
|
19707
|
+
* will be chosen based on the model type.
|
|
19708
|
+
* @param {Object} model_inputs The input data to the model in the format specified in the ONNX model.
|
|
19709
|
+
* @returns {Promise<Object>} The output data from the model in the format specified in the ONNX model.
|
|
19710
|
+
* @throws {Error} This method must be implemented in subclasses.
|
|
19711
|
+
*/
|
|
19712
|
+
async forward(model_inputs) {
|
|
19713
|
+
return await this._forward(this, model_inputs);
|
|
19714
|
+
}
|
|
19715
|
+
/**
|
|
19716
|
+
* Get the model's generation config, if it exists.
|
|
19717
|
+
* @returns {GenerationConfig|null} The model's generation config if it exists, otherwise `null`.
|
|
19718
|
+
*/
|
|
19719
|
+
get generation_config() {
|
|
19720
|
+
return this.configs?.generation_config ?? null;
|
|
19721
|
+
}
|
|
19722
|
+
/**
|
|
19723
|
+
* @param {GenerationConfig} generation_config
|
|
19724
|
+
* @param {number} input_ids_seq_length The starting sequence length for the input ids.
|
|
19725
|
+
* @returns {LogitsProcessorList}
|
|
19726
|
+
* @private
|
|
19727
|
+
*/
|
|
19728
|
+
_get_logits_processor(generation_config, input_ids_seq_length, logits_processor = null) {
|
|
19729
|
+
const processors = new LogitsProcessorList();
|
|
19730
|
+
if (generation_config.repetition_penalty !== null && generation_config.repetition_penalty !== 1) {
|
|
19731
|
+
processors.push(new RepetitionPenaltyLogitsProcessor(generation_config.repetition_penalty));
|
|
19732
|
+
}
|
|
19733
|
+
if (generation_config.no_repeat_ngram_size !== null && generation_config.no_repeat_ngram_size > 0) {
|
|
19734
|
+
processors.push(new NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size));
|
|
19735
|
+
}
|
|
19736
|
+
if (generation_config.bad_words_ids !== null) {
|
|
19737
|
+
processors.push(
|
|
19738
|
+
new NoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id)
|
|
19739
|
+
);
|
|
19740
|
+
}
|
|
19741
|
+
if (generation_config.min_length !== null && generation_config.eos_token_id !== null && generation_config.min_length > 0) {
|
|
19742
|
+
processors.push(new MinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id));
|
|
19743
|
+
}
|
|
19744
|
+
if (generation_config.min_new_tokens !== null && generation_config.eos_token_id !== null && generation_config.min_new_tokens > 0) {
|
|
19745
|
+
processors.push(
|
|
19746
|
+
new MinNewTokensLengthLogitsProcessor(
|
|
19747
|
+
input_ids_seq_length,
|
|
19748
|
+
generation_config.min_new_tokens,
|
|
19749
|
+
generation_config.eos_token_id
|
|
18886
19750
|
)
|
|
18887
19751
|
);
|
|
18888
19752
|
}
|
|
@@ -19026,7 +19890,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19026
19890
|
* @param {Tensor} [params.inputs=null]
|
|
19027
19891
|
* @param {number} [params.bos_token_id=null]
|
|
19028
19892
|
* @param {Record<string, Tensor|number[]>} [params.model_kwargs]
|
|
19029
|
-
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor
|
|
19893
|
+
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor> & {past_key_values?: DynamicCache}, model_input_name: string}} The model-specific inputs for generation.
|
|
19030
19894
|
*/
|
|
19031
19895
|
_prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) {
|
|
19032
19896
|
const model_inputs = pick(model_kwargs, this.forward_params);
|
|
@@ -19267,11 +20131,12 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19267
20131
|
}
|
|
19268
20132
|
}
|
|
19269
20133
|
/**
|
|
19270
|
-
* Returns
|
|
20134
|
+
* Returns a DynamicCache containing past key values from the given decoder results object.
|
|
19271
20135
|
*
|
|
19272
20136
|
* @param {Object} decoderResults The decoder results object.
|
|
19273
|
-
* @param {
|
|
19274
|
-
* @
|
|
20137
|
+
* @param {DynamicCache} pastKeyValues The previous past key values.
|
|
20138
|
+
* @param {boolean} [disposeEncoderPKVs=false] Whether to dispose encoder past key values.
|
|
20139
|
+
* @returns {DynamicCache} A new DynamicCache containing the updated past key values.
|
|
19275
20140
|
*/
|
|
19276
20141
|
getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
|
|
19277
20142
|
const pkvs = /* @__PURE__ */ Object.create(null);
|
|
@@ -19292,7 +20157,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19292
20157
|
}
|
|
19293
20158
|
}
|
|
19294
20159
|
}
|
|
19295
|
-
return pkvs;
|
|
20160
|
+
return new DynamicCache(pkvs);
|
|
19296
20161
|
}
|
|
19297
20162
|
/**
|
|
19298
20163
|
* Returns an object containing attentions from the given model output object.
|
|
@@ -19317,8 +20182,8 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19317
20182
|
/**
|
|
19318
20183
|
* Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values.
|
|
19319
20184
|
*
|
|
19320
|
-
* @param {
|
|
19321
|
-
* @param {
|
|
20185
|
+
* @param {Record<string, any>} decoderFeeds The decoder feeds object to add past key values to.
|
|
20186
|
+
* @param {DynamicCache|null} pastKeyValues The cache containing past key values.
|
|
19322
20187
|
*/
|
|
19323
20188
|
addPastKeyValues(decoderFeeds, pastKeyValues) {
|
|
19324
20189
|
if (pastKeyValues) {
|
|
@@ -19335,14 +20200,29 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19335
20200
|
}
|
|
19336
20201
|
}
|
|
19337
20202
|
}
|
|
19338
|
-
|
|
19339
|
-
|
|
20203
|
+
/**
|
|
20204
|
+
* Helper function to select valid inputs and run through the appropriate encoder (vision, text, audio) based on the input type.
|
|
20205
|
+
* @param {string} sessionName
|
|
20206
|
+
* @param {Record<string, Tensor>} inputs
|
|
20207
|
+
* @param {string} outputName
|
|
20208
|
+
* @private
|
|
20209
|
+
*/
|
|
20210
|
+
async _encode_input(sessionName, inputs, outputName) {
|
|
20211
|
+
if (!Object.hasOwn(this.sessions, sessionName)) {
|
|
20212
|
+
throw new Error(`Model does not have a ${sessionName} session.`);
|
|
20213
|
+
}
|
|
20214
|
+
const session = this.sessions[sessionName];
|
|
20215
|
+
const output = await sessionRun(session, pick(inputs, session.inputNames));
|
|
20216
|
+
return output[outputName];
|
|
20217
|
+
}
|
|
20218
|
+
async encode_image(inputs) {
|
|
20219
|
+
return this._encode_input("vision_encoder", inputs, "image_features");
|
|
19340
20220
|
}
|
|
19341
|
-
async encode_text(
|
|
19342
|
-
return
|
|
20221
|
+
async encode_text(inputs) {
|
|
20222
|
+
return this._encode_input("embed_tokens", inputs, "inputs_embeds");
|
|
19343
20223
|
}
|
|
19344
|
-
async encode_audio(
|
|
19345
|
-
return
|
|
20224
|
+
async encode_audio(inputs) {
|
|
20225
|
+
return this._encode_input("audio_encoder", inputs, "audio_features");
|
|
19346
20226
|
}
|
|
19347
20227
|
};
|
|
19348
20228
|
async function seq2seq_forward(self2, model_inputs) {
|
|
@@ -19397,6 +20277,9 @@ async function decoder_forward(self2, model_inputs, is_encoder_decoder = false)
|
|
|
19397
20277
|
const start_index = ["paligemma", "gemma3_text", "gemma3"].includes(self2.config.model_type) ? 1 : 0;
|
|
19398
20278
|
new_model_inputs.position_ids = create_position_ids(new_model_inputs, past_key_values, start_index);
|
|
19399
20279
|
}
|
|
20280
|
+
if (session.inputNames.includes("num_logits_to_keep") && !new_model_inputs.num_logits_to_keep) {
|
|
20281
|
+
new_model_inputs.num_logits_to_keep = new Tensor2("int64", [0n], []);
|
|
20282
|
+
}
|
|
19400
20283
|
self2.addPastKeyValues(new_model_inputs, past_key_values);
|
|
19401
20284
|
const fixed = pick(new_model_inputs, session.inputNames);
|
|
19402
20285
|
return await sessionRun(session, fixed);
|
|
@@ -19405,7 +20288,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
19405
20288
|
// Generic parameters:
|
|
19406
20289
|
encode_function,
|
|
19407
20290
|
merge_function,
|
|
19408
|
-
|
|
20291
|
+
modality_input_names,
|
|
19409
20292
|
modality_output_name,
|
|
19410
20293
|
// Produced by the tokenizer/processor:
|
|
19411
20294
|
input_ids = null,
|
|
@@ -19420,32 +20303,34 @@ async function generic_text_to_text_forward(self2, {
|
|
|
19420
20303
|
// Additional parameters
|
|
19421
20304
|
...kwargs
|
|
19422
20305
|
}) {
|
|
19423
|
-
const modality_values = kwargs[modality_input_name];
|
|
19424
20306
|
if (!inputs_embeds) {
|
|
19425
20307
|
inputs_embeds = await self2.encode_text({ input_ids, ...kwargs });
|
|
19426
|
-
|
|
19427
|
-
|
|
19428
|
-
|
|
19429
|
-
|
|
19430
|
-
|
|
19431
|
-
|
|
19432
|
-
|
|
19433
|
-
|
|
19434
|
-
|
|
19435
|
-
inputs_embeds,
|
|
19436
|
-
|
|
19437
|
-
|
|
19438
|
-
|
|
19439
|
-
|
|
19440
|
-
|
|
19441
|
-
|
|
19442
|
-
|
|
19443
|
-
|
|
19444
|
-
|
|
19445
|
-
|
|
19446
|
-
|
|
19447
|
-
|
|
19448
|
-
|
|
20308
|
+
const modality_values = pick(kwargs, modality_input_names);
|
|
20309
|
+
if (Object.keys(modality_values).length > 0) {
|
|
20310
|
+
if (input_ids.dims[1] !== 1) {
|
|
20311
|
+
const modality_features = await encode_function({
|
|
20312
|
+
// Pass the modality values under its expected key.
|
|
20313
|
+
// The caller knows whether this is audio or image.
|
|
20314
|
+
...modality_values,
|
|
20315
|
+
...kwargs
|
|
20316
|
+
});
|
|
20317
|
+
({ inputs_embeds, attention_mask } = merge_function({
|
|
20318
|
+
[modality_output_name]: modality_features,
|
|
20319
|
+
inputs_embeds,
|
|
20320
|
+
input_ids,
|
|
20321
|
+
attention_mask
|
|
20322
|
+
}));
|
|
20323
|
+
} else if (past_key_values && input_ids.dims[1] === 1) {
|
|
20324
|
+
const target_length = input_ids.dims[1];
|
|
20325
|
+
const past_length = past_key_values.get_seq_length();
|
|
20326
|
+
attention_mask = cat(
|
|
20327
|
+
[
|
|
20328
|
+
ones([input_ids.dims[0], past_length]),
|
|
20329
|
+
attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
|
|
20330
|
+
],
|
|
20331
|
+
1
|
|
20332
|
+
);
|
|
20333
|
+
}
|
|
19449
20334
|
}
|
|
19450
20335
|
}
|
|
19451
20336
|
if (!position_ids) {
|
|
@@ -19453,14 +20338,19 @@ async function generic_text_to_text_forward(self2, {
|
|
|
19453
20338
|
// Handle special case for qwen vl models
|
|
19454
20339
|
[
|
|
19455
20340
|
"qwen2_vl",
|
|
20341
|
+
"qwen2_vl_text",
|
|
19456
20342
|
"qwen2_5_vl",
|
|
19457
20343
|
"qwen2_5_vl_text",
|
|
19458
20344
|
"qwen3_vl",
|
|
19459
20345
|
"qwen3_vl_text",
|
|
20346
|
+
"qwen3_vl_moe",
|
|
20347
|
+
"qwen3_vl_moe_text",
|
|
19460
20348
|
"qwen3_5",
|
|
19461
20349
|
"qwen3_5_text",
|
|
19462
20350
|
"qwen3_5_moe",
|
|
19463
|
-
"qwen3_5_moe_text"
|
|
20351
|
+
"qwen3_5_moe_text",
|
|
20352
|
+
"glm_ocr",
|
|
20353
|
+
"glm_ocr_text"
|
|
19464
20354
|
].includes(self2.config.model_type)
|
|
19465
20355
|
) {
|
|
19466
20356
|
const { image_grid_thw, video_grid_thw } = kwargs;
|
|
@@ -19484,7 +20374,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
19484
20374
|
async function audio_text_to_text_forward(self2, params) {
|
|
19485
20375
|
return await generic_text_to_text_forward(self2, {
|
|
19486
20376
|
...params,
|
|
19487
|
-
|
|
20377
|
+
modality_input_names: ["audio_values", "input_features"],
|
|
19488
20378
|
modality_output_name: "audio_features",
|
|
19489
20379
|
encode_function: self2.encode_audio.bind(self2),
|
|
19490
20380
|
merge_function: self2._merge_input_ids_with_audio_features.bind(self2)
|
|
@@ -19493,7 +20383,7 @@ async function audio_text_to_text_forward(self2, params) {
|
|
|
19493
20383
|
async function image_text_to_text_forward(self2, params) {
|
|
19494
20384
|
return await generic_text_to_text_forward(self2, {
|
|
19495
20385
|
...params,
|
|
19496
|
-
|
|
20386
|
+
modality_input_names: ["pixel_values"],
|
|
19497
20387
|
modality_output_name: "image_features",
|
|
19498
20388
|
encode_function: self2.encode_image.bind(self2),
|
|
19499
20389
|
merge_function: self2._merge_input_ids_with_image_features.bind(self2)
|
|
@@ -19529,7 +20419,11 @@ function create_position_ids(model_inputs, past_key_values = null, start_index =
|
|
|
19529
20419
|
return position_ids;
|
|
19530
20420
|
}
|
|
19531
20421
|
function decoder_prepare_inputs_for_generation(self2, input_ids, model_inputs, generation_config) {
|
|
19532
|
-
const past_length = model_inputs.past_key_values ?
|
|
20422
|
+
const past_length = model_inputs.past_key_values ? model_inputs.past_key_values.get_seq_length() : 0;
|
|
20423
|
+
const session = self2.sessions["decoder_model_merged"] ?? self2.sessions["model"];
|
|
20424
|
+
if (session?.inputNames.includes("num_logits_to_keep") && !model_inputs.num_logits_to_keep) {
|
|
20425
|
+
model_inputs.num_logits_to_keep = new Tensor2("int64", [1n], []);
|
|
20426
|
+
}
|
|
19533
20427
|
if (!model_inputs.attention_mask) {
|
|
19534
20428
|
let dims;
|
|
19535
20429
|
for (const key of ["input_ids", "inputs_embeds", "position_ids"]) {
|
|
@@ -19680,6 +20574,8 @@ __export(models_exports, {
|
|
|
19680
20574
|
BloomForCausalLM: () => BloomForCausalLM,
|
|
19681
20575
|
BloomModel: () => BloomModel,
|
|
19682
20576
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
20577
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
20578
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
19683
20579
|
CLIPModel: () => CLIPModel,
|
|
19684
20580
|
CLIPPreTrainedModel: () => CLIPPreTrainedModel,
|
|
19685
20581
|
CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
|
|
@@ -19754,6 +20650,9 @@ __export(models_exports, {
|
|
|
19754
20650
|
DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
|
|
19755
20651
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
19756
20652
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
20653
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
20654
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
20655
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
19757
20656
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
19758
20657
|
DeiTModel: () => DeiTModel,
|
|
19759
20658
|
DeiTPreTrainedModel: () => DeiTPreTrainedModel,
|
|
@@ -19799,6 +20698,11 @@ __export(models_exports, {
|
|
|
19799
20698
|
EsmForTokenClassification: () => EsmForTokenClassification,
|
|
19800
20699
|
EsmModel: () => EsmModel,
|
|
19801
20700
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
20701
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
20702
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
20703
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
20704
|
+
EuroBertModel: () => EuroBertModel,
|
|
20705
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
19802
20706
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
19803
20707
|
ExaoneModel: () => ExaoneModel,
|
|
19804
20708
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -19837,6 +20741,7 @@ __export(models_exports, {
|
|
|
19837
20741
|
Gemma3ForCausalLM: () => Gemma3ForCausalLM,
|
|
19838
20742
|
Gemma3Model: () => Gemma3Model,
|
|
19839
20743
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
20744
|
+
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
19840
20745
|
Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
|
|
19841
20746
|
Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
|
|
19842
20747
|
GemmaForCausalLM: () => GemmaForCausalLM,
|
|
@@ -19844,6 +20749,10 @@ __export(models_exports, {
|
|
|
19844
20749
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
19845
20750
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
19846
20751
|
GlmModel: () => GlmModel,
|
|
20752
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
20753
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
20754
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
20755
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
19847
20756
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
19848
20757
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
19849
20758
|
GptOssModel: () => GptOssModel,
|
|
@@ -19854,6 +20763,7 @@ __export(models_exports, {
|
|
|
19854
20763
|
GraniteMoeHybridModel: () => GraniteMoeHybridModel,
|
|
19855
20764
|
GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
|
|
19856
20765
|
GranitePreTrainedModel: () => GranitePreTrainedModel,
|
|
20766
|
+
GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
|
|
19857
20767
|
GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
|
|
19858
20768
|
GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
|
|
19859
20769
|
GroupViTModel: () => GroupViTModel,
|
|
@@ -19875,7 +20785,6 @@ __export(models_exports, {
|
|
|
19875
20785
|
IJepaModel: () => IJepaModel,
|
|
19876
20786
|
IJepaPreTrainedModel: () => IJepaPreTrainedModel,
|
|
19877
20787
|
Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
|
|
19878
|
-
Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
|
|
19879
20788
|
JAISLMHeadModel: () => JAISLMHeadModel,
|
|
19880
20789
|
JAISModel: () => JAISModel,
|
|
19881
20790
|
JAISPreTrainedModel: () => JAISPreTrainedModel,
|
|
@@ -19889,6 +20798,8 @@ __export(models_exports, {
|
|
|
19889
20798
|
Lfm2MoeModel: () => Lfm2MoeModel,
|
|
19890
20799
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
19891
20800
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
20801
|
+
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
20802
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
19892
20803
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
19893
20804
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
19894
20805
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -19938,6 +20849,9 @@ __export(models_exports, {
|
|
|
19938
20849
|
MimiEncoderOutput: () => MimiEncoderOutput,
|
|
19939
20850
|
MimiModel: () => MimiModel,
|
|
19940
20851
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
20852
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
20853
|
+
Mistral4Model: () => Mistral4Model,
|
|
20854
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
19941
20855
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
19942
20856
|
MistralModel: () => MistralModel,
|
|
19943
20857
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -19995,6 +20909,9 @@ __export(models_exports, {
|
|
|
19995
20909
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
19996
20910
|
NanoChatModel: () => NanoChatModel,
|
|
19997
20911
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
20912
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
20913
|
+
NemotronHModel: () => NemotronHModel,
|
|
20914
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
19998
20915
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
19999
20916
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
20000
20917
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -20028,7 +20945,6 @@ __export(models_exports, {
|
|
|
20028
20945
|
Owlv2Model: () => Owlv2Model,
|
|
20029
20946
|
Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
|
|
20030
20947
|
PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
|
|
20031
|
-
PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
|
|
20032
20948
|
ParakeetForCTC: () => ParakeetForCTC,
|
|
20033
20949
|
ParakeetPreTrainedModel: () => ParakeetPreTrainedModel,
|
|
20034
20950
|
PatchTSMixerForPrediction: () => PatchTSMixerForPrediction,
|
|
@@ -20058,8 +20974,10 @@ __export(models_exports, {
|
|
|
20058
20974
|
Qwen2MoeModel: () => Qwen2MoeModel,
|
|
20059
20975
|
Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
|
|
20060
20976
|
Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
|
|
20977
|
+
Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
|
|
20061
20978
|
Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
|
|
20062
20979
|
Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
|
|
20980
|
+
Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
|
|
20063
20981
|
Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
|
|
20064
20982
|
Qwen3ForCausalLM: () => Qwen3ForCausalLM,
|
|
20065
20983
|
Qwen3Model: () => Qwen3Model,
|
|
@@ -20070,9 +20988,13 @@ __export(models_exports, {
|
|
|
20070
20988
|
Qwen3NextModel: () => Qwen3NextModel,
|
|
20071
20989
|
Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
|
|
20072
20990
|
Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
|
|
20991
|
+
Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
|
|
20073
20992
|
Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
|
|
20993
|
+
Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
|
|
20074
20994
|
Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
|
|
20995
|
+
Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
|
|
20075
20996
|
Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
|
|
20997
|
+
Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
|
|
20076
20998
|
Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
|
|
20077
20999
|
RFDetrForObjectDetection: () => RFDetrForObjectDetection,
|
|
20078
21000
|
RFDetrModel: () => RFDetrModel,
|
|
@@ -20123,11 +21045,13 @@ __export(models_exports, {
|
|
|
20123
21045
|
SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
|
|
20124
21046
|
SmolLM3Model: () => SmolLM3Model,
|
|
20125
21047
|
SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
|
|
20126
|
-
SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
|
|
20127
21048
|
SnacDecoderModel: () => SnacDecoderModel,
|
|
20128
21049
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
20129
21050
|
SnacModel: () => SnacModel,
|
|
20130
21051
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
21052
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
21053
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
21054
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
20131
21055
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
20132
21056
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
20133
21057
|
SpeechT5HifiGan: () => SpeechT5HifiGan,
|
|
@@ -20195,6 +21119,8 @@ __export(models_exports, {
|
|
|
20195
21119
|
VitsModelOutput: () => VitsModelOutput,
|
|
20196
21120
|
VitsPreTrainedModel: () => VitsPreTrainedModel,
|
|
20197
21121
|
VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
|
|
21122
|
+
VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
|
|
21123
|
+
VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
|
|
20198
21124
|
Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
|
|
20199
21125
|
Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
|
|
20200
21126
|
Wav2Vec2BertModel: () => Wav2Vec2BertModel,
|
|
@@ -20300,7 +21226,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
|
|
|
20300
21226
|
var ArceeForCausalLM = class extends ArceePreTrainedModel {
|
|
20301
21227
|
};
|
|
20302
21228
|
|
|
20303
|
-
// src/models/
|
|
21229
|
+
// src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
|
|
20304
21230
|
var ASTPreTrainedModel = class extends PreTrainedModel {
|
|
20305
21231
|
};
|
|
20306
21232
|
var ASTModel = class extends ASTPreTrainedModel {
|
|
@@ -20555,7 +21481,7 @@ var ChatterboxModel = class extends ChatterboxPreTrainedModel {
|
|
|
20555
21481
|
if (!past_key_values || target_length !== 1) {
|
|
20556
21482
|
throw new Error("Incorrect state encountered during generation.");
|
|
20557
21483
|
}
|
|
20558
|
-
const past_length =
|
|
21484
|
+
const past_length = past_key_values.get_seq_length();
|
|
20559
21485
|
attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
|
|
20560
21486
|
}
|
|
20561
21487
|
}
|
|
@@ -20635,6 +21561,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
|
|
|
20635
21561
|
var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
|
|
20636
21562
|
};
|
|
20637
21563
|
|
|
21564
|
+
// src/models/chmv2/modeling_chmv2.js
|
|
21565
|
+
var CHMv2PreTrainedModel = class extends PreTrainedModel {
|
|
21566
|
+
};
|
|
21567
|
+
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
|
|
21568
|
+
};
|
|
21569
|
+
|
|
20638
21570
|
// src/models/clap/modeling_clap.js
|
|
20639
21571
|
var ClapPreTrainedModel = class extends PreTrainedModel {
|
|
20640
21572
|
};
|
|
@@ -20973,6 +21905,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
|
|
|
20973
21905
|
}
|
|
20974
21906
|
};
|
|
20975
21907
|
|
|
21908
|
+
// src/models/deepseek_v3/modeling_deepseek_v3.js
|
|
21909
|
+
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
|
|
21910
|
+
};
|
|
21911
|
+
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
|
|
21912
|
+
};
|
|
21913
|
+
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
|
|
21914
|
+
};
|
|
21915
|
+
|
|
20976
21916
|
// src/models/deberta_v2/modeling_deberta_v2.js
|
|
20977
21917
|
var DebertaV2PreTrainedModel = class extends PreTrainedModel {
|
|
20978
21918
|
};
|
|
@@ -21321,6 +22261,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
|
|
|
21321
22261
|
}
|
|
21322
22262
|
};
|
|
21323
22263
|
|
|
22264
|
+
// src/models/eurobert/modeling_eurobert.js
|
|
22265
|
+
var EuroBertPreTrainedModel = class extends PreTrainedModel {
|
|
22266
|
+
};
|
|
22267
|
+
var EuroBertModel = class extends EuroBertPreTrainedModel {
|
|
22268
|
+
};
|
|
22269
|
+
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
|
|
22270
|
+
/**
|
|
22271
|
+
* Calls the model on new inputs.
|
|
22272
|
+
*
|
|
22273
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
22274
|
+
* @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
|
|
22275
|
+
*/
|
|
22276
|
+
async _call(model_inputs) {
|
|
22277
|
+
return new MaskedLMOutput(await super._call(model_inputs));
|
|
22278
|
+
}
|
|
22279
|
+
};
|
|
22280
|
+
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
|
|
22281
|
+
/**
|
|
22282
|
+
* Calls the model on new inputs.
|
|
22283
|
+
*
|
|
22284
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
22285
|
+
* @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
|
|
22286
|
+
*/
|
|
22287
|
+
async _call(model_inputs) {
|
|
22288
|
+
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
22289
|
+
}
|
|
22290
|
+
};
|
|
22291
|
+
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
|
|
22292
|
+
/**
|
|
22293
|
+
* Calls the model on new inputs.
|
|
22294
|
+
*
|
|
22295
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
22296
|
+
* @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
|
|
22297
|
+
*/
|
|
22298
|
+
async _call(model_inputs) {
|
|
22299
|
+
return new TokenClassifierOutput(await super._call(model_inputs));
|
|
22300
|
+
}
|
|
22301
|
+
};
|
|
22302
|
+
|
|
21324
22303
|
// src/models/exaone/modeling_exaone.js
|
|
21325
22304
|
var ExaonePreTrainedModel = class extends PreTrainedModel {
|
|
21326
22305
|
};
|
|
@@ -21585,6 +22564,8 @@ var Gemma3nForConditionalGeneration = class extends Gemma3nPreTrainedModel {
|
|
|
21585
22564
|
});
|
|
21586
22565
|
}
|
|
21587
22566
|
};
|
|
22567
|
+
var Gemma3nForCausalLM = class extends Gemma3nForConditionalGeneration {
|
|
22568
|
+
};
|
|
21588
22569
|
|
|
21589
22570
|
// src/models/glm/modeling_glm.js
|
|
21590
22571
|
var GlmPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -21594,6 +22575,377 @@ var GlmModel = class extends GlmPreTrainedModel {
|
|
|
21594
22575
|
var GlmForCausalLM = class extends GlmPreTrainedModel {
|
|
21595
22576
|
};
|
|
21596
22577
|
|
|
22578
|
+
// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
|
|
22579
|
+
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
|
|
22580
|
+
};
|
|
22581
|
+
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
|
|
22582
|
+
};
|
|
22583
|
+
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
|
|
22584
|
+
};
|
|
22585
|
+
|
|
22586
|
+
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
22587
|
+
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
22588
|
+
forward_params = [
|
|
22589
|
+
// Text inputs
|
|
22590
|
+
"input_ids",
|
|
22591
|
+
"attention_mask",
|
|
22592
|
+
"position_ids",
|
|
22593
|
+
"past_key_values",
|
|
22594
|
+
// Vision inputs
|
|
22595
|
+
"pixel_values",
|
|
22596
|
+
"image_grid_thw"
|
|
22597
|
+
];
|
|
22598
|
+
};
|
|
22599
|
+
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
22600
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
22601
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
22602
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
22603
|
+
image_grid_thw_name = "grid_thw";
|
|
22604
|
+
/**
|
|
22605
|
+
* Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
|
|
22606
|
+
* @param {Tensor} input_ids
|
|
22607
|
+
* @param {Tensor} attention_mask
|
|
22608
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
22609
|
+
*/
|
|
22610
|
+
_get_text_only_rope_index(input_ids, attention_mask) {
|
|
22611
|
+
if (attention_mask) {
|
|
22612
|
+
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
22613
|
+
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
22614
|
+
const mrope_position_deltas = Array.from(
|
|
22615
|
+
{ length: dims[0] },
|
|
22616
|
+
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
22617
|
+
);
|
|
22618
|
+
return [
|
|
22619
|
+
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
22620
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
22621
|
+
];
|
|
22622
|
+
} else {
|
|
22623
|
+
const [batch_size, seq_length] = input_ids.dims;
|
|
22624
|
+
const position_ids = BigInt64Array.from(
|
|
22625
|
+
{ length: 3 * batch_size * seq_length },
|
|
22626
|
+
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
22627
|
+
);
|
|
22628
|
+
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
22629
|
+
}
|
|
22630
|
+
}
|
|
22631
|
+
/**
|
|
22632
|
+
* Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
|
|
22633
|
+
* global [all_t, all_h, all_w] order, then write back into the position_ids array
|
|
22634
|
+
* respecting attention mask.
|
|
22635
|
+
* @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
|
|
22636
|
+
* @param {number[]} attn_mask Attention mask for this batch element
|
|
22637
|
+
* @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
|
|
22638
|
+
* @param {number} batch_idx Current batch index
|
|
22639
|
+
* @returns {number[]} Flat reordered positions of length total_len
|
|
22640
|
+
*/
|
|
22641
|
+
_reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
|
|
22642
|
+
const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
22643
|
+
const llm_positions = new Array(total_len);
|
|
22644
|
+
let index = 0;
|
|
22645
|
+
for (let x = 0; x < 3; ++x) {
|
|
22646
|
+
for (const val of llm_pos_ids_list) {
|
|
22647
|
+
const seg_len = val.length / 3;
|
|
22648
|
+
for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
|
|
22649
|
+
llm_positions[index++] = val[z];
|
|
22650
|
+
}
|
|
22651
|
+
}
|
|
22652
|
+
}
|
|
22653
|
+
let count2 = 0;
|
|
22654
|
+
for (let y = 0; y < attn_mask.length; ++y) {
|
|
22655
|
+
if (attn_mask[y] == 1) {
|
|
22656
|
+
for (let x = 0; x < 3; ++x) {
|
|
22657
|
+
position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
|
|
22658
|
+
}
|
|
22659
|
+
++count2;
|
|
22660
|
+
}
|
|
22661
|
+
}
|
|
22662
|
+
return llm_positions;
|
|
22663
|
+
}
|
|
22664
|
+
/**
|
|
22665
|
+
* Build per-batch position ID segments for multimodal rope.
|
|
22666
|
+
* Override this in subclasses to change how vision/text segments are identified and positioned.
|
|
22667
|
+
* @param {object} params
|
|
22668
|
+
* @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
|
|
22669
|
+
* @param {any[][]} params.image_grid_thw_list - all image grid dimensions
|
|
22670
|
+
* @param {any[][]} params.video_grid_thw_list - all video grid dimensions
|
|
22671
|
+
* @param {number} params.spatial_merge_size
|
|
22672
|
+
* @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
|
|
22673
|
+
* @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
|
|
22674
|
+
*/
|
|
22675
|
+
_get_multimodal_rope_positions({
|
|
22676
|
+
filtered_ids,
|
|
22677
|
+
image_grid_thw_list,
|
|
22678
|
+
video_grid_thw_list,
|
|
22679
|
+
spatial_merge_size,
|
|
22680
|
+
state
|
|
22681
|
+
}) {
|
|
22682
|
+
const { image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
22683
|
+
const ids = filtered_ids;
|
|
22684
|
+
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
22685
|
+
if (x == vision_start_token_id) acc.push(idx);
|
|
22686
|
+
return acc;
|
|
22687
|
+
}, []);
|
|
22688
|
+
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
22689
|
+
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
22690
|
+
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
22691
|
+
const llm_pos_ids_list = [];
|
|
22692
|
+
let st = 0;
|
|
22693
|
+
let remain_images = image_nums;
|
|
22694
|
+
let remain_videos = video_nums;
|
|
22695
|
+
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
22696
|
+
const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id);
|
|
22697
|
+
const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id);
|
|
22698
|
+
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
22699
|
+
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
22700
|
+
let ed;
|
|
22701
|
+
let t, h, w;
|
|
22702
|
+
if (ed_image < ed_video) {
|
|
22703
|
+
[t, h, w] = image_grid_thw_list[state.image_index];
|
|
22704
|
+
++state.image_index;
|
|
22705
|
+
--remain_images;
|
|
22706
|
+
ed = ed_image;
|
|
22707
|
+
} else {
|
|
22708
|
+
[t, h, w] = video_grid_thw_list[state.video_index];
|
|
22709
|
+
++state.video_index;
|
|
22710
|
+
--remain_videos;
|
|
22711
|
+
ed = ed_video;
|
|
22712
|
+
}
|
|
22713
|
+
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
22714
|
+
Number(t),
|
|
22715
|
+
Math.floor(Number(h) / spatial_merge_size),
|
|
22716
|
+
Math.floor(Number(w) / spatial_merge_size)
|
|
22717
|
+
];
|
|
22718
|
+
const text_len = ed - st;
|
|
22719
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
22720
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
22721
|
+
const offset = text_len + st_idx;
|
|
22722
|
+
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
22723
|
+
const t_index = Array.from(
|
|
22724
|
+
{ length: grid_size },
|
|
22725
|
+
(_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
|
|
22726
|
+
);
|
|
22727
|
+
const h_index = Array.from(
|
|
22728
|
+
{ length: grid_size },
|
|
22729
|
+
(_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
|
|
22730
|
+
);
|
|
22731
|
+
const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
|
|
22732
|
+
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
22733
|
+
st = ed + grid_size;
|
|
22734
|
+
}
|
|
22735
|
+
if (st < ids.length) {
|
|
22736
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
22737
|
+
const text_len = ids.length - st;
|
|
22738
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
22739
|
+
}
|
|
22740
|
+
return llm_pos_ids_list;
|
|
22741
|
+
}
|
|
22742
|
+
/**
|
|
22743
|
+
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
22744
|
+
*
|
|
22745
|
+
* Explanation:
|
|
22746
|
+
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
22747
|
+
*
|
|
22748
|
+
* For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
|
|
22749
|
+
* Examples:
|
|
22750
|
+
* input_ids: [T T T T T], here T is for text.
|
|
22751
|
+
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
22752
|
+
* height position_ids: [0, 1, 2, 3, 4]
|
|
22753
|
+
* width position_ids: [0, 1, 2, 3, 4]
|
|
22754
|
+
*
|
|
22755
|
+
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
22756
|
+
* and 1D rotary position embeddin for text part.
|
|
22757
|
+
* Examples:
|
|
22758
|
+
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
22759
|
+
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
22760
|
+
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
22761
|
+
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
22762
|
+
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
22763
|
+
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
22764
|
+
* text height position_ids: [3, 4, 5, 6, 7]
|
|
22765
|
+
* text width position_ids: [3, 4, 5, 6, 7]
|
|
22766
|
+
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
22767
|
+
*
|
|
22768
|
+
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
22769
|
+
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
22770
|
+
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
22771
|
+
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
|
|
22772
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
22773
|
+
*/
|
|
22774
|
+
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
22775
|
+
const { vision_config } = this.config;
|
|
22776
|
+
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
22777
|
+
if (image_grid_thw || video_grid_thw) {
|
|
22778
|
+
const total_input_ids = input_ids.tolist();
|
|
22779
|
+
if (!attention_mask) {
|
|
22780
|
+
attention_mask = ones_like(input_ids);
|
|
22781
|
+
}
|
|
22782
|
+
const attention_mask_list = attention_mask.tolist();
|
|
22783
|
+
const position_ids_list = Array.from(
|
|
22784
|
+
{ length: 3 },
|
|
22785
|
+
() => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
|
|
22786
|
+
);
|
|
22787
|
+
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
22788
|
+
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
22789
|
+
const state = { image_index: 0, video_index: 0 };
|
|
22790
|
+
const mrope_position_deltas = [];
|
|
22791
|
+
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
22792
|
+
const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
22793
|
+
const llm_pos_ids_list = this._get_multimodal_rope_positions({
|
|
22794
|
+
filtered_ids,
|
|
22795
|
+
image_grid_thw_list,
|
|
22796
|
+
video_grid_thw_list,
|
|
22797
|
+
spatial_merge_size,
|
|
22798
|
+
state
|
|
22799
|
+
});
|
|
22800
|
+
const llm_positions = this._reorder_and_write_positions(
|
|
22801
|
+
llm_pos_ids_list,
|
|
22802
|
+
attention_mask_list[i],
|
|
22803
|
+
position_ids_list,
|
|
22804
|
+
i
|
|
22805
|
+
);
|
|
22806
|
+
mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
|
|
22807
|
+
}
|
|
22808
|
+
return [
|
|
22809
|
+
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
22810
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
22811
|
+
];
|
|
22812
|
+
} else {
|
|
22813
|
+
return this._get_text_only_rope_index(input_ids, attention_mask);
|
|
22814
|
+
}
|
|
22815
|
+
}
|
|
22816
|
+
async encode_image({ pixel_values, image_grid_thw }) {
|
|
22817
|
+
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
22818
|
+
pixel_values,
|
|
22819
|
+
[this.image_grid_thw_name]: image_grid_thw
|
|
22820
|
+
})).image_features;
|
|
22821
|
+
return features;
|
|
22822
|
+
}
|
|
22823
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
22824
|
+
return default_merge_input_ids_with_image_features({
|
|
22825
|
+
// @ts-ignore
|
|
22826
|
+
image_token_id: this.config.image_token_id,
|
|
22827
|
+
...kwargs
|
|
22828
|
+
});
|
|
22829
|
+
}
|
|
22830
|
+
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
22831
|
+
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
22832
|
+
if (!model_inputs.past_key_values) {
|
|
22833
|
+
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
22834
|
+
model_inputs.input_ids,
|
|
22835
|
+
model_inputs.image_grid_thw,
|
|
22836
|
+
model_inputs.video_grid_thw,
|
|
22837
|
+
model_inputs.attention_mask
|
|
22838
|
+
);
|
|
22839
|
+
} else {
|
|
22840
|
+
model_inputs.pixel_values = null;
|
|
22841
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
22842
|
+
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
22843
|
+
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
22844
|
+
model_inputs.input_ids,
|
|
22845
|
+
model_inputs.image_grid_thw,
|
|
22846
|
+
model_inputs.video_grid_thw,
|
|
22847
|
+
model_inputs.attention_mask
|
|
22848
|
+
);
|
|
22849
|
+
model_inputs.rope_deltas = rope_deltas;
|
|
22850
|
+
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
22851
|
+
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
22852
|
+
} else {
|
|
22853
|
+
if (!model_inputs.rope_deltas) {
|
|
22854
|
+
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
22855
|
+
model_inputs.input_ids,
|
|
22856
|
+
model_inputs.image_grid_thw,
|
|
22857
|
+
model_inputs.video_grid_thw,
|
|
22858
|
+
model_inputs.attention_mask
|
|
22859
|
+
);
|
|
22860
|
+
}
|
|
22861
|
+
const delta = BigInt(past_length);
|
|
22862
|
+
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
22863
|
+
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
22864
|
+
}
|
|
22865
|
+
}
|
|
22866
|
+
}
|
|
22867
|
+
return model_inputs;
|
|
22868
|
+
}
|
|
22869
|
+
};
|
|
22870
|
+
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
22871
|
+
};
|
|
22872
|
+
|
|
22873
|
+
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
22874
|
+
/**
 * Qwen2.5-VL conditional-generation model. Inherits everything from the Qwen2-VL
 * implementation and only sets `image_grid_thw_name`.
 */
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
  // Name stored for the grid-dimension input of the vision encoder.
  image_grid_thw_name = "image_grid_thw";
};
|
|
22877
|
+
/**
 * Qwen2.5-VL causal-LM variant. Inherits everything from `Qwen2VLForCausalLM` and
 * only sets `image_grid_thw_name`.
 */
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
  // Name stored for the grid-dimension input of the vision encoder.
  image_grid_thw_name = "image_grid_thw";
};
|
|
22880
|
+
|
|
22881
|
+
// src/models/glm_ocr/modeling_glm_ocr.js
|
|
22882
|
+
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
  /**
   * Compute 3D positional indices for the tokens of one vision segment.
   * Temporal is constant, height is repeat-interleaved, width tiles.
   * @param {number} start_position Position assigned to the first row/column of the grid
   * @param {number[]} grid_thw [T, H, W]
   * @param {number} temp_merge_size Divisor applied to T (floored)
   * @param {number} spatial_merge_size Divisor applied to H and W (floored)
   * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
   */
  get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
    const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
    const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
    const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
    const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;

    // Every vision token of the segment shares the same temporal position.
    const t_pos = Array.from({ length: seq_len }, () => start_position);
    // Each height value is repeated for a full row (times the temporal extent).
    const h_pos = Array.from(
      { length: seq_len },
      (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
    );
    // Width cycles through the columns.
    const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
    return [...t_pos, ...h_pos, ...w_pos];
  }
  /**
   * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
   * instead of vision_start_token_id scanning used by Qwen2VL.
   * After a vision segment, position advances by floor(max(h, w) / spatial_merge_size).
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - accepted for interface compatibility; not read here
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches;
   * `image_index` is advanced once per image group
   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id } = this.config;

    // Split the row into maximal runs of equal modality: 1 = image token, 0 = anything else.
    // Loose equality on purpose: ids may be BigInt while the config id is a number.
    const groups = [];
    let group_start = 0;
    let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
    for (let j = 1; j <= filtered_ids.length; ++j) {
      // -1 is a sentinel past the end that forces the final run to be flushed.
      const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
      if (t !== current_type) {
        groups.push([current_type, group_start, j]);
        group_start = j;
        current_type = t;
      }
    }

    let current_pos = 0;
    const llm_pos_ids_list = [];
    for (const [modality_type, start_idx, end_idx] of groups) {
      if (modality_type === 0) {
        // Text run: identical increasing positions on all three axes.
        const text_len = end_idx - start_idx;
        llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
        current_pos += text_len;
      } else {
        const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
        // Dividing T by itself collapses the temporal axis to a single step.
        const temp_merge_size = grid_thw[0];
        llm_pos_ids_list.push(
          this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
        );
        // Floor the advance so positions stay integral, matching the floored grid
        // sizes in get_vision_position_ids; a fractional position cannot be stored
        // in the int64 position tensor built from these lists.
        current_pos += Math.floor(Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size);
      }
    }
    return llm_pos_ids_list;
  }
};
|
|
22948
|
+
|
|
21597
22949
|
// src/models/glpn/modeling_glpn.js
|
|
21598
22950
|
var GLPNPreTrainedModel = class extends PreTrainedModel {
|
|
21599
22951
|
};
|
|
@@ -21666,6 +23018,28 @@ var GraniteMoeHybridModel = class extends GraniteMoeHybridPreTrainedModel {
|
|
|
21666
23018
|
var GraniteMoeHybridForCausalLM = class extends GraniteMoeHybridPreTrainedModel {
|
|
21667
23019
|
};
|
|
21668
23020
|
|
|
23021
|
+
// src/models/ultravox/modeling_ultravox.js
|
|
23022
|
+
/**
 * Base class for Ultravox models; declares the input names listed in `forward_params`.
 */
var UltravoxPreTrainedModel = class extends PreTrainedModel {
  forward_params = [
    "input_ids",
    "attention_mask",
    "position_ids",
    "audio_values",
    "past_key_values"
  ];
};
|
|
23025
|
+
var UltravoxModel = class extends UltravoxPreTrainedModel {
  /**
   * Flatten the audio features to 2D (`view(-1, last_dim)`) and hand them to the
   * default audio-feature merge.
   *
   * The audio placeholder id is taken from the config, preferring `ignore_index`,
   * then `audio_token_id`, then `audio_token_index`. Because `kwargs` is spread after
   * it, an `audio_token_id` inside `kwargs` overrides the config value.
   * @param {object} kwargs Must contain `audio_features`; forwarded to the default merge helper.
   * @returns {*} Whatever `default_merge_input_ids_with_audio_features` returns.
   */
  _merge_input_ids_with_audio_features(kwargs) {
    const { audio_features } = kwargs;
    const flattened_audio_features = audio_features.view(-1, audio_features.dims.at(-1));

    // @ts-ignore
    const { ignore_index, audio_token_id, audio_token_index } = this.config;
    return default_merge_input_ids_with_audio_features({
      audio_token_id: ignore_index ?? audio_token_id ?? audio_token_index,
      ...kwargs,
      audio_features: flattened_audio_features
    });
  }
};
|
|
23037
|
+
|
|
23038
|
+
// src/models/granite_speech/modeling_granite_speech.js
|
|
23039
|
+
/**
 * Granite Speech model. Reuses the Ultravox implementation and replaces
 * `forward_params` with its own list of input names.
 */
var GraniteSpeechForConditionalGeneration = class extends UltravoxModel {
  forward_params = [
    "input_ids",
    "attention_mask",
    "input_features",
    "past_key_values"
  ];
};
|
|
23042
|
+
|
|
21669
23043
|
// src/models/grounding_dino/modeling_grounding_dino.js
|
|
21670
23044
|
var GroundingDinoPreTrainedModel = class extends PreTrainedModel {
|
|
21671
23045
|
};
|
|
@@ -21770,34 +23144,37 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
|
|
|
21770
23144
|
var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
|
|
21771
23145
|
};
|
|
21772
23146
|
|
|
21773
|
-
// src/models/
|
|
21774
|
-
var
|
|
21775
|
-
forward_params = [
|
|
21776
|
-
"input_ids",
|
|
21777
|
-
"attention_mask",
|
|
21778
|
-
"pixel_values",
|
|
21779
|
-
"pixel_attention_mask",
|
|
21780
|
-
"position_ids",
|
|
21781
|
-
"past_key_values"
|
|
21782
|
-
];
|
|
23147
|
+
// src/models/llava/modeling_llava.js
|
|
23148
|
+
/**
 * Base class for LLaVA-style models; declares the input names listed in `forward_params`.
 */
var LlavaPreTrainedModel = class extends PreTrainedModel {
  forward_params = [
    "input_ids",
    "attention_mask",
    "pixel_values",
    "position_ids",
    "past_key_values"
  ];
};
|
|
21784
|
-
var
|
|
21785
|
-
async encode_image({ pixel_values, pixel_attention_mask }) {
|
|
21786
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], { pixel_values, pixel_attention_mask })).image_features;
|
|
21787
|
-
return features;
|
|
21788
|
-
}
|
|
23151
|
+
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
  /**
   * Flatten the image features to 2D (`view(-1, last_dim)`) and hand them to the
   * default image-feature merge.
   *
   * The image placeholder id is read from the config, preferring `image_token_index`
   * and falling back to `image_token_id`. Because `kwargs` is spread after it, an
   * `image_token_id` inside `kwargs` overrides the config value.
   * @param {object} kwargs Must contain `image_features`; forwarded to the default merge helper.
   * @returns {*} Whatever `default_merge_input_ids_with_image_features` returns.
   */
  _merge_input_ids_with_image_features(kwargs) {
    const { image_features } = kwargs;
    const reshaped_image_hidden_states = image_features.view(-1, image_features.dims.at(-1));

    // @ts-ignore
    const { image_token_index, image_token_id } = this.config;
    return default_merge_input_ids_with_image_features({
      image_token_id: image_token_index ?? image_token_id,
      ...kwargs,
      image_features: reshaped_image_hidden_states
    });
  }
};
|
|
21800
|
-
var
|
|
23163
|
+
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
23164
|
+
};
|
|
23165
|
+
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
23166
|
+
};
|
|
23167
|
+
|
|
23168
|
+
// src/models/idefics3/modeling_idefics3.js
|
|
23169
|
+
/**
 * Idefics3 model. Reuses the LLaVA merge logic and replaces `forward_params` with a
 * list that additionally names `pixel_attention_mask`.
 */
var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
  // prettier-ignore
  forward_params = ["input_ids", "attention_mask", "pixel_values", "pixel_attention_mask", "position_ids", "past_key_values"];
};
|
|
21802
23179
|
|
|
21803
23180
|
// src/models/ijepa/modeling_ijepa.js
|
|
@@ -21881,6 +23258,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
|
|
|
21881
23258
|
var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
|
|
21882
23259
|
};
|
|
21883
23260
|
|
|
23261
|
+
// src/models/lighton_ocr/modeling_lighton_ocr.js
|
|
23262
|
+
var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
23263
|
+
};
|
|
23264
|
+
|
|
21884
23265
|
// src/models/lfm2_moe/modeling_lfm2_moe.js
|
|
21885
23266
|
var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
|
|
21886
23267
|
};
|
|
@@ -21889,6 +23270,19 @@ var Lfm2MoeModel = class extends Lfm2MoePreTrainedModel {
|
|
|
21889
23270
|
var Lfm2MoeForCausalLM = class extends Lfm2MoePreTrainedModel {
|
|
21890
23271
|
};
|
|
21891
23272
|
|
|
23273
|
+
// src/models/lfm2_vl/modeling_lfm2_vl.js
|
|
23274
|
+
/**
 * LFM2-VL model. Reuses the LLaVA merge logic and replaces `forward_params` with a
 * list that additionally names `pixel_attention_mask` and `spatial_shapes`.
 */
var Lfm2VlForConditionalGeneration = class extends LlavaForConditionalGeneration {
  // prettier-ignore
  forward_params = ["input_ids", "attention_mask", "pixel_values", "pixel_attention_mask", "spatial_shapes", "position_ids", "past_key_values"];
};
|
|
23285
|
+
|
|
21892
23286
|
// src/models/llama/modeling_llama.js
|
|
21893
23287
|
var LlamaPreTrainedModel = class extends PreTrainedModel {
|
|
21894
23288
|
};
|
|
@@ -21903,27 +23297,6 @@ var Llama4PreTrainedModel = class extends PreTrainedModel {
|
|
|
21903
23297
|
var Llama4ForCausalLM = class extends Llama4PreTrainedModel {
|
|
21904
23298
|
};
|
|
21905
23299
|
|
|
21906
|
-
// src/models/llava/modeling_llava.js
|
|
21907
|
-
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
21908
|
-
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
21909
|
-
};
|
|
21910
|
-
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
21911
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
21912
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
21913
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
21914
|
-
return default_merge_input_ids_with_image_features({
|
|
21915
|
-
// @ts-ignore
|
|
21916
|
-
image_token_id: this.config.image_token_index,
|
|
21917
|
-
...kwargs,
|
|
21918
|
-
image_features: reshaped_image_hidden_states
|
|
21919
|
-
});
|
|
21920
|
-
}
|
|
21921
|
-
};
|
|
21922
|
-
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
21923
|
-
};
|
|
21924
|
-
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
21925
|
-
};
|
|
21926
|
-
|
|
21927
23300
|
// src/models/longt5/modeling_longt5.js
|
|
21928
23301
|
var LongT5PreTrainedModel = class extends PreTrainedModel {
|
|
21929
23302
|
};
|
|
@@ -22085,6 +23458,14 @@ var MistralModel = class extends MistralPreTrainedModel {
|
|
|
22085
23458
|
var MistralForCausalLM = class extends MistralPreTrainedModel {
|
|
22086
23459
|
};
|
|
22087
23460
|
|
|
23461
|
+
// src/models/mistral4/modeling_mistral4.js
|
|
23462
|
+
var Mistral4PreTrainedModel = class extends PreTrainedModel {
|
|
23463
|
+
};
|
|
23464
|
+
var Mistral4Model = class extends Mistral4PreTrainedModel {
|
|
23465
|
+
};
|
|
23466
|
+
var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
|
|
23467
|
+
};
|
|
23468
|
+
|
|
22088
23469
|
// src/models/mobilebert/modeling_mobilebert.js
|
|
22089
23470
|
var MobileBertPreTrainedModel = class extends PreTrainedModel {
|
|
22090
23471
|
};
|
|
@@ -22553,6 +23934,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
|
|
|
22553
23934
|
var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
|
|
22554
23935
|
};
|
|
22555
23936
|
|
|
23937
|
+
// src/models/nemotron_h/modeling_nemotron_h.js
|
|
23938
|
+
var NemotronHPreTrainedModel = class extends PreTrainedModel {
|
|
23939
|
+
};
|
|
23940
|
+
var NemotronHModel = class extends NemotronHPreTrainedModel {
|
|
23941
|
+
};
|
|
23942
|
+
var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
|
|
23943
|
+
};
|
|
23944
|
+
|
|
22556
23945
|
// src/models/neobert/modeling_neobert.js
|
|
22557
23946
|
var NeoBertPreTrainedModel = class extends PreTrainedModel {
|
|
22558
23947
|
};
|
|
@@ -22674,27 +24063,7 @@ var OwlViTForObjectDetection = class extends OwlViTPreTrainedModel {
|
|
|
22674
24063
|
};
|
|
22675
24064
|
|
|
22676
24065
|
// src/models/paligemma/modeling_paligemma.js
|
|
22677
|
-
var
|
|
22678
|
-
forward_params = [
|
|
22679
|
-
"input_ids",
|
|
22680
|
-
// 'inputs_embeds',
|
|
22681
|
-
"attention_mask",
|
|
22682
|
-
"pixel_values",
|
|
22683
|
-
"position_ids",
|
|
22684
|
-
"past_key_values"
|
|
22685
|
-
];
|
|
22686
|
-
};
|
|
22687
|
-
var PaliGemmaForConditionalGeneration = class extends PaliGemmaPreTrainedModel {
|
|
22688
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
22689
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
22690
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
22691
|
-
return default_merge_input_ids_with_image_features({
|
|
22692
|
-
// @ts-ignore
|
|
22693
|
-
image_token_id: this.config.image_token_index,
|
|
22694
|
-
...kwargs,
|
|
22695
|
-
image_features: reshaped_image_hidden_states
|
|
22696
|
-
});
|
|
22697
|
-
}
|
|
24066
|
+
var PaliGemmaForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
22698
24067
|
};
|
|
22699
24068
|
|
|
22700
24069
|
// src/models/parakeet/modeling_parakeet.js
|
|
@@ -22853,244 +24222,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
|
|
|
22853
24222
|
var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
|
|
22854
24223
|
};
|
|
22855
24224
|
|
|
22856
|
-
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
22857
|
-
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
22858
|
-
forward_params = [
|
|
22859
|
-
// Text inputs
|
|
22860
|
-
"input_ids",
|
|
22861
|
-
"attention_mask",
|
|
22862
|
-
"position_ids",
|
|
22863
|
-
"past_key_values",
|
|
22864
|
-
// Vision inputs
|
|
22865
|
-
"pixel_values",
|
|
22866
|
-
"image_grid_thw"
|
|
22867
|
-
];
|
|
22868
|
-
};
|
|
22869
|
-
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
22870
|
-
image_grid_thw_name = "grid_thw";
|
|
22871
|
-
/**
|
|
22872
|
-
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
22873
|
-
*
|
|
22874
|
-
* Explanation:
|
|
22875
|
-
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
22876
|
-
*
|
|
22877
|
-
* For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
|
|
22878
|
-
* Examples:
|
|
22879
|
-
* input_ids: [T T T T T], here T is for text.
|
|
22880
|
-
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
22881
|
-
* height position_ids: [0, 1, 2, 3, 4]
|
|
22882
|
-
* width position_ids: [0, 1, 2, 3, 4]
|
|
22883
|
-
*
|
|
22884
|
-
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
22885
|
-
* and 1D rotary position embeddin for text part.
|
|
22886
|
-
* Examples:
|
|
22887
|
-
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
22888
|
-
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
22889
|
-
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
22890
|
-
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
22891
|
-
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
22892
|
-
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
22893
|
-
* text height position_ids: [3, 4, 5, 6, 7]
|
|
22894
|
-
* text width position_ids: [3, 4, 5, 6, 7]
|
|
22895
|
-
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
22896
|
-
*
|
|
22897
|
-
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
22898
|
-
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
22899
|
-
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
22900
|
-
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
|
|
22901
|
-
* - 1 for tokens that are **not masked**,
|
|
22902
|
-
* - 0 for tokens that are **masked**.
|
|
22903
|
-
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
|
|
22904
|
-
* - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
|
|
22905
|
-
* - mrope_position_deltas: Tensor of shape `(batch_size)`.
|
|
22906
|
-
*/
|
|
22907
|
-
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
22908
|
-
const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
22909
|
-
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
22910
|
-
const mrope_position_deltas = [];
|
|
22911
|
-
if (image_grid_thw || video_grid_thw) {
|
|
22912
|
-
let total_input_ids = input_ids.tolist();
|
|
22913
|
-
if (!attention_mask) {
|
|
22914
|
-
attention_mask = ones_like(input_ids);
|
|
22915
|
-
}
|
|
22916
|
-
const attention_mask_list = attention_mask.tolist();
|
|
22917
|
-
const position_ids_list = Array.from(
|
|
22918
|
-
{ length: 3 },
|
|
22919
|
-
(_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
|
|
22920
|
-
);
|
|
22921
|
-
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
22922
|
-
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
22923
|
-
let image_index = 0;
|
|
22924
|
-
let video_index = 0;
|
|
22925
|
-
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
22926
|
-
const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
22927
|
-
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
22928
|
-
if (x == vision_start_token_id) acc.push(idx);
|
|
22929
|
-
return acc;
|
|
22930
|
-
}, []);
|
|
22931
|
-
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
22932
|
-
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
22933
|
-
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
22934
|
-
let llm_pos_ids_list = [];
|
|
22935
|
-
let st = 0;
|
|
22936
|
-
let remain_images = image_nums;
|
|
22937
|
-
let remain_videos = video_nums;
|
|
22938
|
-
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
22939
|
-
const next_image_token = ids.findIndex((x, i2) => i2 > st && x == image_token_id);
|
|
22940
|
-
const next_video_token = ids.findIndex((x, i2) => i2 > st && x == video_token_id);
|
|
22941
|
-
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
22942
|
-
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
22943
|
-
let ed;
|
|
22944
|
-
let t, h, w;
|
|
22945
|
-
if (ed_image < ed_video) {
|
|
22946
|
-
[t, h, w] = image_grid_thw_list[image_index];
|
|
22947
|
-
++image_index;
|
|
22948
|
-
--remain_images;
|
|
22949
|
-
ed = ed_image;
|
|
22950
|
-
} else {
|
|
22951
|
-
[t, h, w] = video_grid_thw_list[video_index];
|
|
22952
|
-
++video_index;
|
|
22953
|
-
--remain_videos;
|
|
22954
|
-
ed = ed_video;
|
|
22955
|
-
}
|
|
22956
|
-
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
22957
|
-
Number(t),
|
|
22958
|
-
Math.floor(Number(h) / spatial_merge_size),
|
|
22959
|
-
Math.floor(Number(w) / spatial_merge_size)
|
|
22960
|
-
];
|
|
22961
|
-
const text_len = ed - st;
|
|
22962
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
22963
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
22964
|
-
const offset = text_len + st_idx;
|
|
22965
|
-
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
22966
|
-
const t_index = Array.from(
|
|
22967
|
-
{ length: grid_size },
|
|
22968
|
-
(_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
|
|
22969
|
-
);
|
|
22970
|
-
const h_index = Array.from(
|
|
22971
|
-
{ length: grid_size },
|
|
22972
|
-
(_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
|
|
22973
|
-
);
|
|
22974
|
-
const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
|
|
22975
|
-
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
22976
|
-
st = ed + grid_size;
|
|
22977
|
-
}
|
|
22978
|
-
if (st < ids.length) {
|
|
22979
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
22980
|
-
const text_len = ids.length - st;
|
|
22981
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
22982
|
-
}
|
|
22983
|
-
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
22984
|
-
const llm_positions = new Array(num_items);
|
|
22985
|
-
let index = 0;
|
|
22986
|
-
for (let x = 0; x < 3; ++x) {
|
|
22987
|
-
for (let y = 0; y < llm_pos_ids_list.length; ++y) {
|
|
22988
|
-
const val = llm_pos_ids_list[y];
|
|
22989
|
-
const text_len = val.length / 3;
|
|
22990
|
-
for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
|
|
22991
|
-
llm_positions[index++] = val[z];
|
|
22992
|
-
}
|
|
22993
|
-
}
|
|
22994
|
-
}
|
|
22995
|
-
let count2 = 0;
|
|
22996
|
-
const attn_mask = attention_mask_list[i];
|
|
22997
|
-
for (let y = 0; y < attn_mask.length; ++y) {
|
|
22998
|
-
if (attn_mask[y] == 1) {
|
|
22999
|
-
for (let x = 0; x < 3; ++x) {
|
|
23000
|
-
position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
|
|
23001
|
-
}
|
|
23002
|
-
++count2;
|
|
23003
|
-
}
|
|
23004
|
-
}
|
|
23005
|
-
const max_llm_positions = max(llm_positions)[0];
|
|
23006
|
-
mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
|
|
23007
|
-
}
|
|
23008
|
-
return [
|
|
23009
|
-
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
23010
|
-
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
23011
|
-
];
|
|
23012
|
-
} else {
|
|
23013
|
-
if (attention_mask) {
|
|
23014
|
-
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
23015
|
-
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
23016
|
-
const mrope_position_deltas2 = Array.from(
|
|
23017
|
-
{ length: dims[0] },
|
|
23018
|
-
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
23019
|
-
);
|
|
23020
|
-
return [
|
|
23021
|
-
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
23022
|
-
new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
|
|
23023
|
-
];
|
|
23024
|
-
} else {
|
|
23025
|
-
const [batch_size, seq_length] = input_ids.dims;
|
|
23026
|
-
const position_ids = BigInt64Array.from(
|
|
23027
|
-
{ length: 3 * batch_size * seq_length },
|
|
23028
|
-
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
23029
|
-
);
|
|
23030
|
-
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
23031
|
-
}
|
|
23032
|
-
}
|
|
23033
|
-
}
|
|
23034
|
-
async encode_image({ pixel_values, image_grid_thw }) {
|
|
23035
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
23036
|
-
pixel_values,
|
|
23037
|
-
[this.image_grid_thw_name]: image_grid_thw
|
|
23038
|
-
})).image_features;
|
|
23039
|
-
return features;
|
|
23040
|
-
}
|
|
23041
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
23042
|
-
return default_merge_input_ids_with_image_features({
|
|
23043
|
-
// @ts-ignore
|
|
23044
|
-
image_token_id: this.config.image_token_id,
|
|
23045
|
-
...kwargs
|
|
23046
|
-
});
|
|
23047
|
-
}
|
|
23048
|
-
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
23049
|
-
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
23050
|
-
if (!model_inputs.past_key_values) {
|
|
23051
|
-
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
23052
|
-
model_inputs.input_ids,
|
|
23053
|
-
model_inputs.image_grid_thw,
|
|
23054
|
-
model_inputs.video_grid_thw,
|
|
23055
|
-
model_inputs.attention_mask
|
|
23056
|
-
);
|
|
23057
|
-
} else {
|
|
23058
|
-
model_inputs.pixel_values = null;
|
|
23059
|
-
const past_length = getPastLength(model_inputs.past_key_values);
|
|
23060
|
-
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
23061
|
-
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
23062
|
-
model_inputs.input_ids,
|
|
23063
|
-
model_inputs.image_grid_thw,
|
|
23064
|
-
model_inputs.video_grid_thw,
|
|
23065
|
-
model_inputs.attention_mask
|
|
23066
|
-
);
|
|
23067
|
-
model_inputs.rope_deltas = rope_deltas;
|
|
23068
|
-
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
23069
|
-
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
23070
|
-
} else {
|
|
23071
|
-
if (!model_inputs.rope_deltas) {
|
|
23072
|
-
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
23073
|
-
model_inputs.input_ids,
|
|
23074
|
-
model_inputs.image_grid_thw,
|
|
23075
|
-
model_inputs.video_grid_thw,
|
|
23076
|
-
model_inputs.attention_mask
|
|
23077
|
-
);
|
|
23078
|
-
}
|
|
23079
|
-
const delta = BigInt(past_length);
|
|
23080
|
-
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
23081
|
-
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
23082
|
-
}
|
|
23083
|
-
}
|
|
23084
|
-
}
|
|
23085
|
-
return model_inputs;
|
|
23086
|
-
}
|
|
23087
|
-
};
|
|
23088
|
-
|
|
23089
|
-
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
23090
|
-
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
23091
|
-
image_grid_thw_name = "image_grid_thw";
|
|
23092
|
-
};
|
|
23093
|
-
|
|
23094
24225
|
// src/models/qwen3/modeling_qwen3.js
|
|
23095
24226
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
23096
24227
|
};
|
|
@@ -23118,18 +24249,26 @@ var Qwen3NextForCausalLM = class extends Qwen3NextPreTrainedModel {
|
|
|
23118
24249
|
// src/models/qwen3_vl/modeling_qwen3_vl.js
|
|
23119
24250
|
var Qwen3VLForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
23120
24251
|
};
|
|
24252
|
+
var Qwen3VLForCausalLM = class extends Qwen2_5_VLForCausalLM {
|
|
24253
|
+
};
|
|
23121
24254
|
|
|
23122
24255
|
// src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js
|
|
23123
24256
|
var Qwen3VLMoeForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
23124
24257
|
};
|
|
24258
|
+
var Qwen3VLMoeForCausalLM = class extends Qwen3VLForCausalLM {
|
|
24259
|
+
};
|
|
23125
24260
|
|
|
23126
24261
|
// src/models/qwen3_5/modeling_qwen3_5.js
|
|
23127
24262
|
var Qwen3_5ForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
23128
24263
|
};
|
|
24264
|
+
var Qwen3_5ForCausalLM = class extends Qwen3_5ForConditionalGeneration {
|
|
24265
|
+
};
|
|
23129
24266
|
|
|
23130
24267
|
// src/models/qwen3_5_moe/modeling_qwen3_5_moe.js
|
|
23131
24268
|
var Qwen3_5MoeForConditionalGeneration = class extends Qwen3_5ForConditionalGeneration {
|
|
23132
24269
|
};
|
|
24270
|
+
var Qwen3_5MoeForCausalLM = class extends Qwen3_5ForCausalLM {
|
|
24271
|
+
};
|
|
23133
24272
|
|
|
23134
24273
|
// src/models/resnet/modeling_resnet.js
|
|
23135
24274
|
var ResNetPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -23528,6 +24667,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
|
|
|
23528
24667
|
}
|
|
23529
24668
|
};
|
|
23530
24669
|
|
|
24670
|
+
// src/models/solar_open/modeling_solar_open.js
|
|
24671
|
+
var SolarOpenPreTrainedModel = class extends PreTrainedModel {
|
|
24672
|
+
};
|
|
24673
|
+
var SolarOpenModel = class extends SolarOpenPreTrainedModel {
|
|
24674
|
+
};
|
|
24675
|
+
var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
|
|
24676
|
+
};
|
|
24677
|
+
|
|
23531
24678
|
// src/models/speecht5/modeling_speecht5.js
|
|
23532
24679
|
var SpeechT5PreTrainedModel = class extends PreTrainedModel {
|
|
23533
24680
|
};
|
|
@@ -23810,25 +24957,6 @@ var TrOCRPreTrainedModel = class extends PreTrainedModel {
|
|
|
23810
24957
|
var TrOCRForCausalLM = class extends TrOCRPreTrainedModel {
|
|
23811
24958
|
};
|
|
23812
24959
|
|
|
23813
|
-
// src/models/ultravox/modeling_ultravox.js
|
|
23814
|
-
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
23815
|
-
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
23816
|
-
};
|
|
23817
|
-
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
23818
|
-
_merge_input_ids_with_audio_features(kwargs) {
|
|
23819
|
-
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
23820
|
-
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
23821
|
-
return default_merge_input_ids_with_audio_features({
|
|
23822
|
-
// @ts-ignore
|
|
23823
|
-
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id,
|
|
23824
|
-
...kwargs,
|
|
23825
|
-
audio_features: reshaped_audio_features
|
|
23826
|
-
});
|
|
23827
|
-
}
|
|
23828
|
-
};
|
|
23829
|
-
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
23830
|
-
};
|
|
23831
|
-
|
|
23832
24960
|
// src/models/unispeech/modeling_unispeech.js
|
|
23833
24961
|
var UniSpeechPreTrainedModel = class extends PreTrainedModel {
|
|
23834
24962
|
};
|
|
@@ -23994,6 +25122,170 @@ var VitsModel = class extends VitsPreTrainedModel {
|
|
|
23994
25122
|
}
|
|
23995
25123
|
};
|
|
23996
25124
|
|
|
25125
|
+
// src/models/voxtral/modeling_voxtral.js
|
|
25126
|
+
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
25127
|
+
};
|
|
25128
|
+
|
|
25129
|
+
// src/models/voxtral_realtime/modeling_voxtral_realtime.js
|
|
25130
|
+
var CONV1_LEFT_PAD = 2;
|
|
25131
|
+
var CONV2_LEFT_PAD = 1;
|
|
25132
|
+
var states = /* @__PURE__ */ new WeakMap();
|
|
25133
|
+
function createEncoderState(model, input_features) {
|
|
25134
|
+
const { text_config, audio_config } = (
|
|
25135
|
+
/** @type {any} */
|
|
25136
|
+
model.config
|
|
25137
|
+
);
|
|
25138
|
+
const encoder_session = model.sessions["audio_encoder"];
|
|
25139
|
+
const { num_mel_bins, hidden_size: enc_hidden_size } = audio_config;
|
|
25140
|
+
const PADDING_CACHE_CHANNELS = num_mel_bins + enc_hidden_size;
|
|
25141
|
+
const enc_kv_cache = new DynamicCache();
|
|
25142
|
+
const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? "float32";
|
|
25143
|
+
const enc_cls = enc_dtype === "float16" ? DataTypeMap.float16 : DataTypeMap.float32;
|
|
25144
|
+
const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
|
|
25145
|
+
for (const name in enc_shapes) {
|
|
25146
|
+
const size = enc_shapes[name].reduce((a, b) => a * b, 1);
|
|
25147
|
+
enc_kv_cache[name] = new Tensor2(enc_dtype, new enc_cls(size), enc_shapes[name]);
|
|
25148
|
+
}
|
|
25149
|
+
const enc_padding_cache = new Tensor2(enc_dtype, new enc_cls(PADDING_CACHE_CHANNELS * CONV1_LEFT_PAD), [
|
|
25150
|
+
1,
|
|
25151
|
+
PADDING_CACHE_CHANNELS,
|
|
25152
|
+
CONV1_LEFT_PAD
|
|
25153
|
+
]);
|
|
25154
|
+
const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
|
|
25155
|
+
if (!chunks_iter) {
|
|
25156
|
+
throw new Error("input_features must be iterable or async iterable");
|
|
25157
|
+
}
|
|
25158
|
+
return {
|
|
25159
|
+
encoder_session,
|
|
25160
|
+
enc_kv_cache,
|
|
25161
|
+
enc_padding_cache,
|
|
25162
|
+
enc_past_seq_len: 0,
|
|
25163
|
+
audio_embed_queue: [],
|
|
25164
|
+
audio_embed_total_tokens: 0,
|
|
25165
|
+
audio_queue_offset: 0,
|
|
25166
|
+
audio_consumed: 0,
|
|
25167
|
+
stream_exhausted: false,
|
|
25168
|
+
chunks_iter,
|
|
25169
|
+
text_hidden_size: text_config.hidden_size
|
|
25170
|
+
};
|
|
25171
|
+
}
|
|
25172
|
+
async function encodeChunk(s, chunk_features) {
|
|
25173
|
+
const audio_seq_len = chunk_features.dims[2];
|
|
25174
|
+
const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1;
|
|
25175
|
+
const position_ids = new Tensor2(
|
|
25176
|
+
"int64",
|
|
25177
|
+
BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
|
|
25178
|
+
[1, conv2_output_len]
|
|
25179
|
+
);
|
|
25180
|
+
const total_seq_len = s.enc_past_seq_len + conv2_output_len;
|
|
25181
|
+
const attention_mask = ones([1, total_seq_len]);
|
|
25182
|
+
const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
|
|
25183
|
+
input_features: chunk_features,
|
|
25184
|
+
attention_mask,
|
|
25185
|
+
position_ids,
|
|
25186
|
+
past_padding_cache: s.enc_padding_cache,
|
|
25187
|
+
...s.enc_kv_cache
|
|
25188
|
+
});
|
|
25189
|
+
if (s.enc_padding_cache.location === "gpu-buffer") {
|
|
25190
|
+
s.enc_padding_cache.dispose();
|
|
25191
|
+
}
|
|
25192
|
+
s.enc_padding_cache = present_padding_cache;
|
|
25193
|
+
for (const name in present_cache) {
|
|
25194
|
+
if (name.startsWith("present.")) {
|
|
25195
|
+
const pastName = name.replace("present", "past_key_values");
|
|
25196
|
+
const prev = s.enc_kv_cache[pastName];
|
|
25197
|
+
if (prev?.location === "gpu-buffer") {
|
|
25198
|
+
prev.dispose();
|
|
25199
|
+
}
|
|
25200
|
+
s.enc_kv_cache[pastName] = present_cache[name];
|
|
25201
|
+
}
|
|
25202
|
+
}
|
|
25203
|
+
s.enc_past_seq_len = total_seq_len;
|
|
25204
|
+
return audio_embeds;
|
|
25205
|
+
}
|
|
25206
|
+
async function fillAudioBuffer(s, needed) {
|
|
25207
|
+
while (s.audio_embed_total_tokens < needed && !s.stream_exhausted) {
|
|
25208
|
+
const result = await s.chunks_iter.next();
|
|
25209
|
+
if (result.done) {
|
|
25210
|
+
s.stream_exhausted = true;
|
|
25211
|
+
break;
|
|
25212
|
+
}
|
|
25213
|
+
const new_embeds = await encodeChunk(s, result.value);
|
|
25214
|
+
s.audio_embed_queue.push({ data: new_embeds.data, tokens: new_embeds.dims[1] });
|
|
25215
|
+
s.audio_embed_total_tokens += new_embeds.dims[1];
|
|
25216
|
+
}
|
|
25217
|
+
}
|
|
25218
|
+
function addAudioEmbeddings(s, inputs_embeds, current_len) {
|
|
25219
|
+
if (s.audio_embed_queue.length === 0) return;
|
|
25220
|
+
const embed_data = inputs_embeds.data;
|
|
25221
|
+
let embed_write_pos = 0;
|
|
25222
|
+
let remaining = current_len;
|
|
25223
|
+
while (remaining > 0 && s.audio_embed_queue.length > 0) {
|
|
25224
|
+
const front = s.audio_embed_queue[0];
|
|
25225
|
+
const available = front.tokens - s.audio_queue_offset;
|
|
25226
|
+
const n = Math.min(remaining, available);
|
|
25227
|
+
const src_offset = s.audio_queue_offset * s.text_hidden_size;
|
|
25228
|
+
for (let i = 0; i < n * s.text_hidden_size; ++i) {
|
|
25229
|
+
embed_data[embed_write_pos * s.text_hidden_size + i] += front.data[src_offset + i];
|
|
25230
|
+
}
|
|
25231
|
+
embed_write_pos += n;
|
|
25232
|
+
remaining -= n;
|
|
25233
|
+
s.audio_queue_offset += n;
|
|
25234
|
+
if (s.audio_queue_offset >= front.tokens) {
|
|
25235
|
+
s.audio_embed_queue.shift();
|
|
25236
|
+
s.audio_queue_offset = 0;
|
|
25237
|
+
}
|
|
25238
|
+
}
|
|
25239
|
+
s.audio_consumed += current_len - remaining;
|
|
25240
|
+
}
|
|
25241
|
+
var AudioExhaustedCriteria = class extends StoppingCriteria {
|
|
25242
|
+
constructor(enc_state) {
|
|
25243
|
+
super();
|
|
25244
|
+
this._s = enc_state;
|
|
25245
|
+
}
|
|
25246
|
+
_call(input_ids) {
|
|
25247
|
+
const done = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
|
|
25248
|
+
return input_ids.map(() => done);
|
|
25249
|
+
}
|
|
25250
|
+
};
|
|
25251
|
+
var VoxtralRealtimePreTrainedModel = class extends PreTrainedModel {
|
|
25252
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "past_key_values"];
|
|
25253
|
+
};
|
|
25254
|
+
var VoxtralRealtimeForConditionalGeneration = class extends VoxtralRealtimePreTrainedModel {
|
|
25255
|
+
async forward({ input_ids, past_key_values, ...kwargs }) {
|
|
25256
|
+
const current_len = input_ids.dims[1];
|
|
25257
|
+
const enc = states.get(this);
|
|
25258
|
+
if (enc) {
|
|
25259
|
+
await fillAudioBuffer(enc, enc.audio_consumed + current_len);
|
|
25260
|
+
}
|
|
25261
|
+
const { inputs_embeds } = await sessionRun(this.sessions["embed_tokens"], { input_ids });
|
|
25262
|
+
if (enc) {
|
|
25263
|
+
addAudioEmbeddings(enc, inputs_embeds, current_len);
|
|
25264
|
+
}
|
|
25265
|
+
const decoder_feeds = { inputs_embeds, ...kwargs };
|
|
25266
|
+
this.addPastKeyValues(decoder_feeds, past_key_values);
|
|
25267
|
+
const session = this.sessions["decoder_model_merged"];
|
|
25268
|
+
const fixed = pick(decoder_feeds, session.inputNames);
|
|
25269
|
+
return await sessionRun(session, fixed);
|
|
25270
|
+
}
|
|
25271
|
+
async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
|
|
25272
|
+
if (!input_features) {
|
|
25273
|
+
throw new Error("input_features (generator/iterable) must be provided");
|
|
25274
|
+
}
|
|
25275
|
+
const enc_state = createEncoderState(this, input_features);
|
|
25276
|
+
states.set(this, enc_state);
|
|
25277
|
+
const stopping_criteria = new StoppingCriteriaList();
|
|
25278
|
+
stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
|
|
25279
|
+
if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);
|
|
25280
|
+
try {
|
|
25281
|
+
return await super.generate({ ...kwargs, stopping_criteria });
|
|
25282
|
+
} finally {
|
|
25283
|
+
enc_state.enc_kv_cache.dispose();
|
|
25284
|
+
states.delete(this);
|
|
25285
|
+
}
|
|
25286
|
+
}
|
|
25287
|
+
};
|
|
25288
|
+
|
|
23997
25289
|
// src/models/wav2vec2_bert/modeling_wav2vec2_bert.js
|
|
23998
25290
|
var Wav2Vec2BertPreTrainedModel = class extends PreTrainedModel {
|
|
23999
25291
|
};
|
|
@@ -24499,6 +25791,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
|
|
|
24499
25791
|
// src/models/registry.js
|
|
24500
25792
|
var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
|
|
24501
25793
|
["bert", "BertModel"],
|
|
25794
|
+
["eurobert", "EuroBertModel"],
|
|
24502
25795
|
["neobert", "NeoBertModel"],
|
|
24503
25796
|
["modernbert", "ModernBertModel"],
|
|
24504
25797
|
["nomic_bert", "NomicBertModel"],
|
|
@@ -24630,6 +25923,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
24630
25923
|
["gemma3_text", "Gemma3Model"],
|
|
24631
25924
|
["helium", "HeliumModel"],
|
|
24632
25925
|
["glm", "GlmModel"],
|
|
25926
|
+
["glm_moe_dsa", "GlmMoeDsaModel"],
|
|
24633
25927
|
["openelm", "OpenELMModel"],
|
|
24634
25928
|
["qwen2", "Qwen2Model"],
|
|
24635
25929
|
["qwen2_moe", "Qwen2MoeModel"],
|
|
@@ -24641,12 +25935,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
24641
25935
|
["mpt", "MptModel"],
|
|
24642
25936
|
["opt", "OPTModel"],
|
|
24643
25937
|
["mistral", "MistralModel"],
|
|
25938
|
+
["mistral4", "Mistral4Model"],
|
|
24644
25939
|
["ministral", "MinistralModel"],
|
|
24645
25940
|
["ministral3", "Ministral3Model"],
|
|
24646
25941
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
24647
25942
|
["starcoder2", "Starcoder2Model"],
|
|
25943
|
+
["deepseek_v3", "DeepseekV3Model"],
|
|
24648
25944
|
["falcon", "FalconModel"],
|
|
24649
25945
|
["falcon_h1", "FalconH1Model"],
|
|
25946
|
+
["nemotron_h", "NemotronHModel"],
|
|
25947
|
+
["solar_open", "SolarOpenModel"],
|
|
24650
25948
|
["stablelm", "StableLmModel"],
|
|
24651
25949
|
["modernbert-decoder", "ModernBertDecoderModel"],
|
|
24652
25950
|
["hunyuan_v1_dense", "HunYuanDenseV1Model"],
|
|
@@ -24666,6 +25964,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24666
25964
|
]);
|
|
24667
25965
|
var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
24668
25966
|
["bert", "BertForSequenceClassification"],
|
|
25967
|
+
["eurobert", "EuroBertForSequenceClassification"],
|
|
24669
25968
|
["neobert", "NeoBertForSequenceClassification"],
|
|
24670
25969
|
["modernbert", "ModernBertForSequenceClassification"],
|
|
24671
25970
|
["roformer", "RoFormerForSequenceClassification"],
|
|
@@ -24688,6 +25987,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24688
25987
|
]);
|
|
24689
25988
|
var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
24690
25989
|
["bert", "BertForTokenClassification"],
|
|
25990
|
+
["eurobert", "EuroBertForTokenClassification"],
|
|
24691
25991
|
["neobert", "NeoBertForTokenClassification"],
|
|
24692
25992
|
["modernbert", "ModernBertForTokenClassification"],
|
|
24693
25993
|
["roformer", "RoFormerForTokenClassification"],
|
|
@@ -24747,27 +26047,40 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24747
26047
|
["gemma2", "Gemma2ForCausalLM"],
|
|
24748
26048
|
["vaultgemma", "VaultGemmaForCausalLM"],
|
|
24749
26049
|
["gemma3_text", "Gemma3ForCausalLM"],
|
|
26050
|
+
["gemma3", "Gemma3ForCausalLM"],
|
|
24750
26051
|
["helium", "HeliumForCausalLM"],
|
|
24751
26052
|
["glm", "GlmForCausalLM"],
|
|
26053
|
+
["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
|
|
24752
26054
|
["openelm", "OpenELMForCausalLM"],
|
|
24753
26055
|
["qwen2", "Qwen2ForCausalLM"],
|
|
24754
26056
|
["qwen2_moe", "Qwen2MoeForCausalLM"],
|
|
24755
26057
|
["qwen3", "Qwen3ForCausalLM"],
|
|
24756
26058
|
["qwen3_moe", "Qwen3MoeForCausalLM"],
|
|
24757
26059
|
["qwen3_next", "Qwen3NextForCausalLM"],
|
|
26060
|
+
["qwen2_vl", "Qwen2VLForCausalLM"],
|
|
26061
|
+
["qwen2_5_vl", "Qwen2_5_VLForCausalLM"],
|
|
26062
|
+
["qwen3_vl", "Qwen3VLForCausalLM"],
|
|
26063
|
+
["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
|
|
26064
|
+
["qwen3_5", "Qwen3_5ForCausalLM"],
|
|
26065
|
+
["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
|
|
26066
|
+
["gemma3n", "Gemma3nForCausalLM"],
|
|
24758
26067
|
["phi", "PhiForCausalLM"],
|
|
24759
26068
|
["phi3", "Phi3ForCausalLM"],
|
|
24760
26069
|
["mpt", "MptForCausalLM"],
|
|
24761
26070
|
["opt", "OPTForCausalLM"],
|
|
24762
26071
|
["mbart", "MBartForCausalLM"],
|
|
24763
26072
|
["mistral", "MistralForCausalLM"],
|
|
26073
|
+
["mistral4", "Mistral4ForCausalLM"],
|
|
24764
26074
|
["ministral", "MinistralForCausalLM"],
|
|
24765
26075
|
["ministral3", "Ministral3ForCausalLM"],
|
|
24766
26076
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
24767
26077
|
["starcoder2", "Starcoder2ForCausalLM"],
|
|
26078
|
+
["deepseek_v3", "DeepseekV3ForCausalLM"],
|
|
24768
26079
|
["falcon", "FalconForCausalLM"],
|
|
24769
26080
|
["falcon_h1", "FalconH1ForCausalLM"],
|
|
26081
|
+
["nemotron_h", "NemotronHForCausalLM"],
|
|
24770
26082
|
["trocr", "TrOCRForCausalLM"],
|
|
26083
|
+
["solar_open", "SolarOpenForCausalLM"],
|
|
24771
26084
|
["stablelm", "StableLmForCausalLM"],
|
|
24772
26085
|
["modernbert-decoder", "ModernBertDecoderForCausalLM"],
|
|
24773
26086
|
["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
|
|
@@ -24778,6 +26091,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24778
26091
|
var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
|
|
24779
26092
|
var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
24780
26093
|
["bert", "BertForMaskedLM"],
|
|
26094
|
+
["eurobert", "EuroBertForMaskedLM"],
|
|
24781
26095
|
["neobert", "NeoBertForMaskedLM"],
|
|
24782
26096
|
["modernbert", "ModernBertForMaskedLM"],
|
|
24783
26097
|
["roformer", "RoFormerForMaskedLM"],
|
|
@@ -24830,16 +26144,21 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24830
26144
|
["qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"],
|
|
24831
26145
|
["qwen3_5", "Qwen3_5ForConditionalGeneration"],
|
|
24832
26146
|
["qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"],
|
|
26147
|
+
["lfm2_vl", "Lfm2VlForConditionalGeneration"],
|
|
24833
26148
|
["idefics3", "Idefics3ForConditionalGeneration"],
|
|
24834
26149
|
["smolvlm", "SmolVLMForConditionalGeneration"],
|
|
24835
26150
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
24836
26151
|
["llava_qwen2", "LlavaQwen2ForCausalLM"],
|
|
24837
26152
|
["gemma3n", "Gemma3nForConditionalGeneration"],
|
|
24838
|
-
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
26153
|
+
["mistral3", "Mistral3ForConditionalGeneration"],
|
|
26154
|
+
["lighton_ocr", "LightOnOcrForConditionalGeneration"],
|
|
26155
|
+
["glm_ocr", "GlmOcrForConditionalGeneration"]
|
|
24839
26156
|
]);
|
|
24840
26157
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
26158
|
+
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
24841
26159
|
["ultravox", "UltravoxModel"],
|
|
24842
|
-
["voxtral", "VoxtralForConditionalGeneration"]
|
|
26160
|
+
["voxtral", "VoxtralForConditionalGeneration"],
|
|
26161
|
+
["voxtral_realtime", "VoxtralRealtimeForConditionalGeneration"]
|
|
24843
26162
|
]);
|
|
24844
26163
|
var MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
24845
26164
|
["vision-encoder-decoder", "VisionEncoderDecoderModel"]
|
|
@@ -24938,6 +26257,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24938
26257
|
]);
|
|
24939
26258
|
var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
|
|
24940
26259
|
var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
26260
|
+
["chmv2", "CHMv2ForDepthEstimation"],
|
|
24941
26261
|
["dpt", "DPTForDepthEstimation"],
|
|
24942
26262
|
["depth_anything", "DepthAnythingForDepthEstimation"],
|
|
24943
26263
|
["glpn", "GLPNForDepthEstimation"],
|
|
@@ -25022,7 +26342,19 @@ var CUSTOM_MAPPING = [
|
|
|
25022
26342
|
MODEL_TYPES.ImageAudioTextToText
|
|
25023
26343
|
],
|
|
25024
26344
|
["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
|
|
25025
|
-
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox]
|
|
26345
|
+
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
|
|
26346
|
+
["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26347
|
+
["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26348
|
+
["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26349
|
+
["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26350
|
+
["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26351
|
+
["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26352
|
+
["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26353
|
+
[
|
|
26354
|
+
"VoxtralRealtimeForConditionalGeneration",
|
|
26355
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
26356
|
+
MODEL_TYPES.VoxtralRealtime
|
|
26357
|
+
]
|
|
25026
26358
|
];
|
|
25027
26359
|
for (const [name, model, type] of CUSTOM_MAPPING) {
|
|
25028
26360
|
MODEL_TYPE_MAPPING.set(name, type);
|
|
@@ -26700,8 +28032,18 @@ var TASK_ALIASES = Object.freeze({
|
|
|
26700
28032
|
});
|
|
26701
28033
|
|
|
26702
28034
|
// src/utils/model_registry/get_model_files.js
|
|
28035
|
+
function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
|
|
28036
|
+
if (config !== null) {
|
|
28037
|
+
return AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision });
|
|
28038
|
+
}
|
|
28039
|
+
const key = JSON.stringify([modelId, cache_dir, local_files_only, revision]);
|
|
28040
|
+
return memoizePromise(
|
|
28041
|
+
key,
|
|
28042
|
+
() => AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision })
|
|
28043
|
+
);
|
|
28044
|
+
}
|
|
26703
28045
|
async function get_model_files(modelId, { config = null, dtype: overrideDtype = null, device: overrideDevice = null, model_file_name = null } = {}) {
|
|
26704
|
-
config = await
|
|
28046
|
+
config = await get_config(modelId, { config });
|
|
26705
28047
|
const files = [
|
|
26706
28048
|
// Add config.json (always loaded)
|
|
26707
28049
|
"config.json"
|
|
@@ -26762,74 +28104,14 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
|
|
|
26762
28104
|
files.push(dataFilePath);
|
|
26763
28105
|
}
|
|
26764
28106
|
};
|
|
26765
|
-
const
|
|
26766
|
-
|
|
26767
|
-
add_model_file(
|
|
26768
|
-
|
|
26769
|
-
|
|
26770
|
-
|
|
26771
|
-
|
|
26772
|
-
|
|
26773
|
-
add_model_file("decoder_model_merged");
|
|
26774
|
-
files.push("generation_config.json");
|
|
26775
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
26776
|
-
add_model_file("model", "vision_encoder");
|
|
26777
|
-
add_model_file("prompt_encoder_mask_decoder");
|
|
26778
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
26779
|
-
add_model_file("model", "encoder_model");
|
|
26780
|
-
add_model_file("decoder_model_merged");
|
|
26781
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
26782
|
-
add_model_file("embed_tokens");
|
|
26783
|
-
add_model_file("vision_encoder");
|
|
26784
|
-
add_model_file("decoder_model_merged");
|
|
26785
|
-
if (config.is_encoder_decoder) {
|
|
26786
|
-
add_model_file("model", "encoder_model");
|
|
26787
|
-
}
|
|
26788
|
-
files.push("generation_config.json");
|
|
26789
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
26790
|
-
add_model_file("embed_tokens");
|
|
26791
|
-
add_model_file("audio_encoder");
|
|
26792
|
-
add_model_file("decoder_model_merged");
|
|
26793
|
-
files.push("generation_config.json");
|
|
26794
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
26795
|
-
add_model_file("embed_tokens");
|
|
26796
|
-
add_model_file("audio_encoder");
|
|
26797
|
-
add_model_file("vision_encoder");
|
|
26798
|
-
add_model_file("decoder_model_merged");
|
|
26799
|
-
files.push("generation_config.json");
|
|
26800
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
26801
|
-
add_model_file("model", "text_encoder");
|
|
26802
|
-
add_model_file("decoder_model_merged");
|
|
26803
|
-
add_model_file("encodec_decode");
|
|
26804
|
-
files.push("generation_config.json");
|
|
26805
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
26806
|
-
add_model_file("prepare_inputs_embeds");
|
|
26807
|
-
add_model_file("model", "language_model");
|
|
26808
|
-
add_model_file("lm_head");
|
|
26809
|
-
add_model_file("gen_head");
|
|
26810
|
-
add_model_file("gen_img_embeds");
|
|
26811
|
-
add_model_file("image_decode");
|
|
26812
|
-
files.push("generation_config.json");
|
|
26813
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
26814
|
-
add_model_file("prepare_inputs_embeds");
|
|
26815
|
-
add_model_file("model");
|
|
26816
|
-
add_model_file("vision_encoder");
|
|
26817
|
-
files.push("generation_config.json");
|
|
26818
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
26819
|
-
add_model_file("embed_tokens");
|
|
26820
|
-
add_model_file("speech_encoder");
|
|
26821
|
-
add_model_file("model", "language_model");
|
|
26822
|
-
add_model_file("conditional_decoder");
|
|
26823
|
-
files.push("generation_config.json");
|
|
26824
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
26825
|
-
add_model_file("encoder_model");
|
|
26826
|
-
add_model_file("decoder_model");
|
|
26827
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
26828
|
-
add_model_file("text_encoder");
|
|
26829
|
-
add_model_file("latent_denoiser");
|
|
26830
|
-
add_model_file("voice_decoder");
|
|
26831
|
-
} else {
|
|
26832
|
-
add_model_file("model", singleModelName);
|
|
28107
|
+
const { sessions, optional_configs } = getSessionsConfig(modelType, config, { model_file_name });
|
|
28108
|
+
for (const [sessionKey, baseName] of Object.entries(sessions)) {
|
|
28109
|
+
add_model_file(sessionKey, baseName);
|
|
28110
|
+
}
|
|
28111
|
+
if (optional_configs) {
|
|
28112
|
+
for (const configFile of Object.values(optional_configs)) {
|
|
28113
|
+
files.push(configFile);
|
|
28114
|
+
}
|
|
26833
28115
|
}
|
|
26834
28116
|
return files;
|
|
26835
28117
|
}
|
|
@@ -27280,25 +28562,25 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
|
|
|
27280
28562
|
|
|
27281
28563
|
// src/utils/model_registry/is_cached.js
|
|
27282
28564
|
async function check_files_cache(modelId, files, options = {}) {
|
|
27283
|
-
const
|
|
27284
|
-
if (!
|
|
28565
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
28566
|
+
if (!cache2) {
|
|
27285
28567
|
const fileStatuses2 = files.map((filename) => ({ file: filename, cached: false }));
|
|
27286
28568
|
return { allCached: false, files: fileStatuses2 };
|
|
27287
28569
|
}
|
|
27288
28570
|
const fileStatuses = await Promise.all(
|
|
27289
28571
|
files.map(async (filename) => {
|
|
27290
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
27291
|
-
const cached = await checkCachedResource(
|
|
28572
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
28573
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
27292
28574
|
return { file: filename, cached: !!cached };
|
|
27293
28575
|
})
|
|
27294
28576
|
);
|
|
27295
28577
|
return { allCached: fileStatuses.every((f) => f.cached), files: fileStatuses };
|
|
27296
28578
|
}
|
|
27297
28579
|
async function is_file_cached(modelId, filename, options = {}) {
|
|
27298
|
-
const
|
|
27299
|
-
if (!
|
|
27300
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
27301
|
-
return !!await checkCachedResource(
|
|
28580
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
28581
|
+
if (!cache2) return false;
|
|
28582
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
28583
|
+
return !!await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
27302
28584
|
}
|
|
27303
28585
|
async function is_cached(modelId, options = {}) {
|
|
27304
28586
|
if (!modelId) {
|
|
@@ -27345,26 +28627,26 @@ async function is_pipeline_cached_files(task, modelId, options = {}) {
|
|
|
27345
28627
|
|
|
27346
28628
|
// src/utils/model_registry/clear_cache.js
|
|
27347
28629
|
async function clear_files_from_cache(modelId, files, options = {}) {
|
|
27348
|
-
const
|
|
27349
|
-
if (!
|
|
28630
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
28631
|
+
if (!cache2) {
|
|
27350
28632
|
return {
|
|
27351
28633
|
filesDeleted: 0,
|
|
27352
28634
|
filesCached: 0,
|
|
27353
28635
|
files: files.map((filename) => ({ file: filename, deleted: false, wasCached: false }))
|
|
27354
28636
|
};
|
|
27355
28637
|
}
|
|
27356
|
-
if (!
|
|
28638
|
+
if (!cache2.delete) {
|
|
27357
28639
|
throw new Error("Cache does not support delete operation");
|
|
27358
28640
|
}
|
|
27359
28641
|
const results = await Promise.all(
|
|
27360
28642
|
files.map(async (filename) => {
|
|
27361
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
27362
|
-
const cached = await checkCachedResource(
|
|
28643
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
28644
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
27363
28645
|
const wasCached = !!cached;
|
|
27364
28646
|
let deleted = false;
|
|
27365
28647
|
if (wasCached) {
|
|
27366
|
-
const deletedWithProposed = await
|
|
27367
|
-
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await
|
|
28648
|
+
const deletedWithProposed = await cache2.delete(proposedCacheKey);
|
|
28649
|
+
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache2.delete(localPath) : false;
|
|
27368
28650
|
deleted = deletedWithProposed || deletedWithLocal;
|
|
27369
28651
|
}
|
|
27370
28652
|
return { file: filename, deleted, wasCached };
|
|
@@ -27714,6 +28996,9 @@ export {
|
|
|
27714
28996
|
BloomModel,
|
|
27715
28997
|
BloomPreTrainedModel,
|
|
27716
28998
|
BloomTokenizer,
|
|
28999
|
+
CHMv2ForDepthEstimation,
|
|
29000
|
+
CHMv2ImageProcessor,
|
|
29001
|
+
CHMv2PreTrainedModel,
|
|
27717
29002
|
CLIPFeatureExtractor,
|
|
27718
29003
|
CLIPImageProcessor,
|
|
27719
29004
|
CLIPModel,
|
|
@@ -27809,6 +29094,9 @@ export {
|
|
|
27809
29094
|
DebertaV2Tokenizer,
|
|
27810
29095
|
DecisionTransformerModel,
|
|
27811
29096
|
DecisionTransformerPreTrainedModel,
|
|
29097
|
+
DeepseekV3ForCausalLM,
|
|
29098
|
+
DeepseekV3Model,
|
|
29099
|
+
DeepseekV3PreTrainedModel,
|
|
27812
29100
|
DeiTFeatureExtractor,
|
|
27813
29101
|
DeiTForImageClassification,
|
|
27814
29102
|
DeiTImageProcessor,
|
|
@@ -27845,6 +29133,7 @@ export {
|
|
|
27845
29133
|
DonutImageProcessor,
|
|
27846
29134
|
DonutSwinModel,
|
|
27847
29135
|
DonutSwinPreTrainedModel,
|
|
29136
|
+
DynamicCache,
|
|
27848
29137
|
EdgeTamModel,
|
|
27849
29138
|
EfficientNetForImageClassification,
|
|
27850
29139
|
EfficientNetImageProcessor,
|
|
@@ -27868,6 +29157,11 @@ export {
|
|
|
27868
29157
|
EsmModel,
|
|
27869
29158
|
EsmPreTrainedModel,
|
|
27870
29159
|
EsmTokenizer,
|
|
29160
|
+
EuroBertForMaskedLM,
|
|
29161
|
+
EuroBertForSequenceClassification,
|
|
29162
|
+
EuroBertForTokenClassification,
|
|
29163
|
+
EuroBertModel,
|
|
29164
|
+
EuroBertPreTrainedModel,
|
|
27871
29165
|
ExaoneForCausalLM,
|
|
27872
29166
|
ExaoneModel,
|
|
27873
29167
|
ExaonePreTrainedModel,
|
|
@@ -27917,6 +29211,7 @@ export {
|
|
|
27917
29211
|
Gemma3Model,
|
|
27918
29212
|
Gemma3PreTrainedModel,
|
|
27919
29213
|
Gemma3nAudioFeatureExtractor,
|
|
29214
|
+
Gemma3nForCausalLM,
|
|
27920
29215
|
Gemma3nForConditionalGeneration,
|
|
27921
29216
|
Gemma3nPreTrainedModel,
|
|
27922
29217
|
Gemma3nProcessor,
|
|
@@ -27924,8 +29219,14 @@ export {
|
|
|
27924
29219
|
GemmaModel,
|
|
27925
29220
|
GemmaPreTrainedModel,
|
|
27926
29221
|
GemmaTokenizer,
|
|
29222
|
+
Glm46VImageProcessor,
|
|
29223
|
+
Glm46VProcessor,
|
|
27927
29224
|
GlmForCausalLM,
|
|
27928
29225
|
GlmModel,
|
|
29226
|
+
GlmMoeDsaForCausalLM,
|
|
29227
|
+
GlmMoeDsaModel,
|
|
29228
|
+
GlmMoeDsaPreTrainedModel,
|
|
29229
|
+
GlmOcrForConditionalGeneration,
|
|
27929
29230
|
GlmPreTrainedModel,
|
|
27930
29231
|
GptOssForCausalLM,
|
|
27931
29232
|
GptOssModel,
|
|
@@ -27936,6 +29237,9 @@ export {
|
|
|
27936
29237
|
GraniteMoeHybridModel,
|
|
27937
29238
|
GraniteMoeHybridPreTrainedModel,
|
|
27938
29239
|
GranitePreTrainedModel,
|
|
29240
|
+
GraniteSpeechFeatureExtractor,
|
|
29241
|
+
GraniteSpeechForConditionalGeneration,
|
|
29242
|
+
GraniteSpeechProcessor,
|
|
27939
29243
|
GroundingDinoForObjectDetection,
|
|
27940
29244
|
GroundingDinoImageProcessor,
|
|
27941
29245
|
GroundingDinoPreTrainedModel,
|
|
@@ -27961,7 +29265,6 @@ export {
|
|
|
27961
29265
|
IJepaPreTrainedModel,
|
|
27962
29266
|
Idefics3ForConditionalGeneration,
|
|
27963
29267
|
Idefics3ImageProcessor,
|
|
27964
|
-
Idefics3PreTrainedModel,
|
|
27965
29268
|
Idefics3Processor,
|
|
27966
29269
|
ImageClassificationPipeline,
|
|
27967
29270
|
ImageFeatureExtractionPipeline,
|
|
@@ -27986,6 +29289,10 @@ export {
|
|
|
27986
29289
|
Lfm2MoeModel,
|
|
27987
29290
|
Lfm2MoePreTrainedModel,
|
|
27988
29291
|
Lfm2PreTrainedModel,
|
|
29292
|
+
Lfm2VlForConditionalGeneration,
|
|
29293
|
+
Lfm2VlImageProcessor,
|
|
29294
|
+
Lfm2VlProcessor,
|
|
29295
|
+
LightOnOcrForConditionalGeneration,
|
|
27989
29296
|
LiteWhisperForConditionalGeneration,
|
|
27990
29297
|
Llama4ForCausalLM,
|
|
27991
29298
|
Llama4PreTrainedModel,
|
|
@@ -28055,6 +29362,9 @@ export {
|
|
|
28055
29362
|
MimiPreTrainedModel,
|
|
28056
29363
|
MinLengthLogitsProcessor,
|
|
28057
29364
|
MinNewTokensLengthLogitsProcessor,
|
|
29365
|
+
Mistral4ForCausalLM,
|
|
29366
|
+
Mistral4Model,
|
|
29367
|
+
Mistral4PreTrainedModel,
|
|
28058
29368
|
MistralForCausalLM,
|
|
28059
29369
|
MistralModel,
|
|
28060
29370
|
MistralPreTrainedModel,
|
|
@@ -28126,6 +29436,9 @@ export {
|
|
|
28126
29436
|
NanoChatForCausalLM,
|
|
28127
29437
|
NanoChatModel,
|
|
28128
29438
|
NanoChatPreTrainedModel,
|
|
29439
|
+
NemotronHForCausalLM,
|
|
29440
|
+
NemotronHModel,
|
|
29441
|
+
NemotronHPreTrainedModel,
|
|
28129
29442
|
NeoBertForMaskedLM,
|
|
28130
29443
|
NeoBertForQuestionAnswering,
|
|
28131
29444
|
NeoBertForSequenceClassification,
|
|
@@ -28169,7 +29482,6 @@ export {
|
|
|
28169
29482
|
Owlv2Model,
|
|
28170
29483
|
Owlv2PreTrainedModel,
|
|
28171
29484
|
PaliGemmaForConditionalGeneration,
|
|
28172
|
-
PaliGemmaPreTrainedModel,
|
|
28173
29485
|
PaliGemmaProcessor,
|
|
28174
29486
|
ParakeetFeatureExtractor,
|
|
28175
29487
|
ParakeetForCTC,
|
|
@@ -28213,10 +29525,12 @@ export {
|
|
|
28213
29525
|
Qwen2MoePreTrainedModel,
|
|
28214
29526
|
Qwen2PreTrainedModel,
|
|
28215
29527
|
Qwen2Tokenizer,
|
|
29528
|
+
Qwen2VLForCausalLM,
|
|
28216
29529
|
Qwen2VLForConditionalGeneration,
|
|
28217
29530
|
Qwen2VLImageProcessor,
|
|
28218
29531
|
Qwen2VLPreTrainedModel,
|
|
28219
29532
|
Qwen2VLProcessor,
|
|
29533
|
+
Qwen2_5_VLForCausalLM,
|
|
28220
29534
|
Qwen2_5_VLForConditionalGeneration,
|
|
28221
29535
|
Qwen2_5_VLProcessor,
|
|
28222
29536
|
Qwen3ForCausalLM,
|
|
@@ -28228,10 +29542,14 @@ export {
|
|
|
28228
29542
|
Qwen3NextModel,
|
|
28229
29543
|
Qwen3NextPreTrainedModel,
|
|
28230
29544
|
Qwen3PreTrainedModel,
|
|
29545
|
+
Qwen3VLForCausalLM,
|
|
28231
29546
|
Qwen3VLForConditionalGeneration,
|
|
29547
|
+
Qwen3VLMoeForCausalLM,
|
|
28232
29548
|
Qwen3VLMoeForConditionalGeneration,
|
|
28233
29549
|
Qwen3VLProcessor,
|
|
29550
|
+
Qwen3_5ForCausalLM,
|
|
28234
29551
|
Qwen3_5ForConditionalGeneration,
|
|
29552
|
+
Qwen3_5MoeForCausalLM,
|
|
28235
29553
|
Qwen3_5MoeForConditionalGeneration,
|
|
28236
29554
|
RFDetrForObjectDetection,
|
|
28237
29555
|
RFDetrModel,
|
|
@@ -28303,7 +29621,6 @@ export {
|
|
|
28303
29621
|
SmolLM3ForCausalLM,
|
|
28304
29622
|
SmolLM3Model,
|
|
28305
29623
|
SmolLM3PreTrainedModel,
|
|
28306
|
-
SmolVLMForConditionalGeneration,
|
|
28307
29624
|
Idefics3ImageProcessor as SmolVLMImageProcessor,
|
|
28308
29625
|
Idefics3Processor as SmolVLMProcessor,
|
|
28309
29626
|
SnacDecoderModel,
|
|
@@ -28311,6 +29628,9 @@ export {
|
|
|
28311
29628
|
SnacFeatureExtractor,
|
|
28312
29629
|
SnacModel,
|
|
28313
29630
|
SnacPreTrainedModel,
|
|
29631
|
+
SolarOpenForCausalLM,
|
|
29632
|
+
SolarOpenModel,
|
|
29633
|
+
SolarOpenPreTrainedModel,
|
|
28314
29634
|
SpeechT5FeatureExtractor,
|
|
28315
29635
|
SpeechT5ForSpeechToText,
|
|
28316
29636
|
SpeechT5ForTextToSpeech,
|
|
@@ -28409,6 +29729,10 @@ export {
|
|
|
28409
29729
|
VitsTokenizer,
|
|
28410
29730
|
VoxtralForConditionalGeneration,
|
|
28411
29731
|
VoxtralProcessor,
|
|
29732
|
+
VoxtralRealtimeFeatureExtractor,
|
|
29733
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
29734
|
+
VoxtralRealtimePreTrainedModel,
|
|
29735
|
+
VoxtralRealtimeProcessor,
|
|
28412
29736
|
Wav2Vec2BertForCTC,
|
|
28413
29737
|
Wav2Vec2BertForSequenceClassification,
|
|
28414
29738
|
Wav2Vec2BertModel,
|