@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +2255 -931
- package/dist/transformers.min.js +19 -19
- package/dist/transformers.node.cjs +2300 -934
- package/dist/transformers.node.min.cjs +20 -20
- package/dist/transformers.node.min.mjs +20 -20
- package/dist/transformers.node.mjs +2336 -1012
- package/dist/transformers.web.js +2327 -1003
- package/dist/transformers.web.min.js +17 -17
- package/package.json +4 -4
- package/src/cache_utils.js +62 -0
- package/src/configs.js +45 -24
- package/src/env.js +8 -1
- package/src/image_processors_utils.js +27 -17
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/chmv2/image_processing_chmv2.js +3 -0
- package/src/models/chmv2/modeling_chmv2.js +4 -0
- package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/eurobert/modeling_eurobert.js +41 -0
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/glm46v/image_processing_glm46v.js +12 -0
- package/src/models/glm46v/processing_glm46v.js +5 -0
- package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
- package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +3 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/mistral4/modeling_mistral4.js +5 -0
- package/src/models/modeling_utils.js +224 -308
- package/src/models/models.js +14 -1
- package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +4 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +194 -143
- package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +2 -1
- package/src/models/registry.js +42 -0
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/solar_open/modeling_solar_open.js +5 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/pipelines.js +1 -0
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +5 -0
- package/src/utils/hub.js +4 -1
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/get_file_metadata.js +15 -2
- package/src/utils/model_registry/get_model_files.js +52 -78
- package/src/utils/tensor.js +18 -2
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +8 -0
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +18 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
- package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
- package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
- package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
- package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
- package/types/models/glm46v/processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +3 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
- package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
- package/types/models/modeling_utils.d.ts +44 -35
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +14 -1
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +4 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +43 -6
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +3 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
- package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/pipelines.d.ts +1 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/models/ast/modeling_ast.d.ts.map +0 -1
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
- /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
package/dist/transformers.js
CHANGED
|
@@ -20,7 +20,7 @@ var node_path_default = {};
|
|
|
20
20
|
var node_url_default = {};
|
|
21
21
|
|
|
22
22
|
// src/env.js
|
|
23
|
-
var VERSION = "4.0.0-next.6";
|
|
23
|
+
var VERSION = "4.0.0-next.8";
|
|
24
24
|
var HAS_SELF = typeof self !== "undefined";
|
|
25
25
|
var IS_FS_AVAILABLE = !isEmpty(node_fs_default);
|
|
26
26
|
var IS_PATH_AVAILABLE = !isEmpty(node_path_default);
|
|
@@ -148,6 +148,7 @@ var env = {
|
|
|
148
148
|
customCache: null,
|
|
149
149
|
useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
|
|
150
150
|
cacheKey: "transformers-cache",
|
|
151
|
+
experimental_useCrossOriginStorage: false,
|
|
151
152
|
/////////////////// Custom fetch /////////////////////
|
|
152
153
|
fetch: DEFAULT_FETCH
|
|
153
154
|
//////////////////////////////////////////////////////
|
|
@@ -249,7 +250,7 @@ var logger = {
|
|
|
249
250
|
}
|
|
250
251
|
};
|
|
251
252
|
|
|
252
|
-
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.
|
|
253
|
+
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
|
|
253
254
|
var DictionarySplitter = class {
|
|
254
255
|
/**
|
|
255
256
|
* @param dictionary The dictionary of words to use for splitting.
|
|
@@ -1905,10 +1906,10 @@ var BPE = class extends TokenizerModel_default {
|
|
|
1905
1906
|
);
|
|
1906
1907
|
if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
|
|
1907
1908
|
output_tokens.push(...byte_tokens);
|
|
1908
|
-
} else {
|
|
1909
|
+
} else if (this.unk_token != null) {
|
|
1909
1910
|
output_tokens.push(this.unk_token);
|
|
1910
1911
|
}
|
|
1911
|
-
} else {
|
|
1912
|
+
} else if (this.unk_token != null) {
|
|
1912
1913
|
output_tokens.push(this.unk_token);
|
|
1913
1914
|
}
|
|
1914
1915
|
}
|
|
@@ -2698,7 +2699,7 @@ var Tokenizer = class {
|
|
|
2698
2699
|
};
|
|
2699
2700
|
var Tokenizer_default = Tokenizer;
|
|
2700
2701
|
|
|
2701
|
-
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.
|
|
2702
|
+
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.6/node_modules/@huggingface/jinja/dist/index.js
|
|
2702
2703
|
var TOKEN_TYPES = Object.freeze({
|
|
2703
2704
|
Text: "Text",
|
|
2704
2705
|
// The text between Jinja statements or expressions
|
|
@@ -4217,7 +4218,11 @@ var Environment = class {
|
|
|
4217
4218
|
["number", (operand) => operand instanceof IntegerValue || operand instanceof FloatValue],
|
|
4218
4219
|
["integer", (operand) => operand instanceof IntegerValue],
|
|
4219
4220
|
["iterable", (operand) => operand.type === "ArrayValue" || operand.type === "StringValue"],
|
|
4220
|
-
["mapping", (operand) => operand
|
|
4221
|
+
["mapping", (operand) => operand instanceof ObjectValue],
|
|
4222
|
+
[
|
|
4223
|
+
"sequence",
|
|
4224
|
+
(operand) => operand instanceof ArrayValue || operand instanceof ObjectValue || operand instanceof StringValue
|
|
4225
|
+
],
|
|
4221
4226
|
[
|
|
4222
4227
|
"lower",
|
|
4223
4228
|
(operand) => {
|
|
@@ -4490,6 +4495,9 @@ var Interpreter = class {
|
|
|
4490
4495
|
applyFilter(operand, filterNode, environment) {
|
|
4491
4496
|
if (filterNode.type === "Identifier") {
|
|
4492
4497
|
const filter = filterNode;
|
|
4498
|
+
if (filter.value === "safe") {
|
|
4499
|
+
return operand;
|
|
4500
|
+
}
|
|
4493
4501
|
if (filter.value === "tojson") {
|
|
4494
4502
|
return new StringValue(toJSON(operand, {}));
|
|
4495
4503
|
}
|
|
@@ -4579,6 +4587,8 @@ var Interpreter = class {
|
|
|
4579
4587
|
return new IntegerValue(Math.floor(operand.value));
|
|
4580
4588
|
case "float":
|
|
4581
4589
|
return new FloatValue(operand.value);
|
|
4590
|
+
case "string":
|
|
4591
|
+
return new StringValue(operand.toString());
|
|
4582
4592
|
default:
|
|
4583
4593
|
throw new Error(`Unknown NumericValue filter: ${filter.value}`);
|
|
4584
4594
|
}
|
|
@@ -6001,9 +6011,216 @@ function toAbsoluteURL(url) {
|
|
|
6001
6011
|
return new URL(url, baseURL).href;
|
|
6002
6012
|
}
|
|
6003
6013
|
|
|
6014
|
+
// src/utils/cache/CrossOriginStorageCache.js
|
|
6015
|
+
var HASH_ALGORITHM = "SHA-256";
|
|
6016
|
+
var HASH_CACHE_NAME = "experimental_transformers-hash-cache";
|
|
6017
|
+
var makeHashDescriptor = (value) => ({ algorithm: HASH_ALGORITHM, value });
|
|
6018
|
+
var CrossOriginStorage = class {
|
|
6019
|
+
/** @type {Promise<Cache> | null} */
|
|
6020
|
+
#hashCache = null;
|
|
6021
|
+
/**
|
|
6022
|
+
* Returns (and lazily opens) the hash cache, reusing the same promise across concurrent callers.
|
|
6023
|
+
* @returns {Promise<Cache>}
|
|
6024
|
+
*/
|
|
6025
|
+
_getHashCache = () => {
|
|
6026
|
+
this.#hashCache ??= caches.open(HASH_CACHE_NAME);
|
|
6027
|
+
return this.#hashCache;
|
|
6028
|
+
};
|
|
6029
|
+
/**
|
|
6030
|
+
* Returns whether the `navigator.crossOriginStorage` API is available in the current environment.
|
|
6031
|
+
* @returns {boolean}
|
|
6032
|
+
*/
|
|
6033
|
+
static isAvailable = () => typeof navigator !== "undefined" && "crossOriginStorage" in navigator;
|
|
6034
|
+
/**
|
|
6035
|
+
* Looks up a cached response for the given URL by resolving its SHA-256 hash and requesting
|
|
6036
|
+
* the corresponding file handle from cross-origin storage.
|
|
6037
|
+
*
|
|
6038
|
+
* Implements `CacheInterface.match`.
|
|
6039
|
+
*
|
|
6040
|
+
* @param {string} request The URL of the resource to look up.
|
|
6041
|
+
* @returns {Promise<Response|undefined>} The cached `Response`, or `undefined` if not found.
|
|
6042
|
+
*/
|
|
6043
|
+
match = async (request) => {
|
|
6044
|
+
const hashValue = await this._getFileHash(request);
|
|
6045
|
+
if (!hashValue) {
|
|
6046
|
+
return void 0;
|
|
6047
|
+
}
|
|
6048
|
+
try {
|
|
6049
|
+
const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashValue)]);
|
|
6050
|
+
const blob = await handle.getFile();
|
|
6051
|
+
return new Response(blob, {
|
|
6052
|
+
headers: {
|
|
6053
|
+
"Content-Length": String(blob.size)
|
|
6054
|
+
}
|
|
6055
|
+
});
|
|
6056
|
+
} catch {
|
|
6057
|
+
return void 0;
|
|
6058
|
+
}
|
|
6059
|
+
};
|
|
6060
|
+
/**
|
|
6061
|
+
* Stores a response in cross-origin storage, keyed by its SHA-256 hash.
|
|
6062
|
+
*
|
|
6063
|
+
* For LFS-backed URLs the hash is resolved cheaply via `_getFileHash` (which checks
|
|
6064
|
+
* `HASH_CACHE_NAME` first, then falls back to fetching the Git LFS pointer file)
|
|
6065
|
+
* without reading the response body a second time.
|
|
6066
|
+
*
|
|
6067
|
+
* For non-LFS resources the hash is unknown upfront. In that case the body is consumed
|
|
6068
|
+
* in the background: the stream is read to compute the content hash, the file is written
|
|
6069
|
+
* into cross-origin storage, and the computed hash is persisted to `HASH_CACHE_NAME`
|
|
6070
|
+
* so that future `match` calls can resolve the file without a network round-trip.
|
|
6071
|
+
*
|
|
6072
|
+
* Implements `CacheInterface.put`.
|
|
6073
|
+
*
|
|
6074
|
+
* @param {string} request The URL of the resource (used as the hash-cache key).
|
|
6075
|
+
* @param {Response} response The response whose body will be written to the cache.
|
|
6076
|
+
* @returns {Promise<void>}
|
|
6077
|
+
*/
|
|
6078
|
+
put = async (request, response) => {
|
|
6079
|
+
const hashValue = await this._getFileHash(request);
|
|
6080
|
+
if (hashValue) {
|
|
6081
|
+
const blob = await response.blob();
|
|
6082
|
+
await this._storeBlobInCOS(blob, hashValue);
|
|
6083
|
+
} else {
|
|
6084
|
+
this._processAndStore(request, response.body);
|
|
6085
|
+
}
|
|
6086
|
+
};
|
|
6087
|
+
/**
|
|
6088
|
+
* Writes a blob into cross-origin storage using the given pre-computed hex hash string.
|
|
6089
|
+
*
|
|
6090
|
+
* @param {Blob} blob
|
|
6091
|
+
* @param {string} hashHex Hex-encoded SHA-256 hash of `blob`.
|
|
6092
|
+
* @returns {Promise<void>}
|
|
6093
|
+
*/
|
|
6094
|
+
_storeBlobInCOS = async (blob, hashHex) => {
|
|
6095
|
+
const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashHex)], {
|
|
6096
|
+
create: true
|
|
6097
|
+
});
|
|
6098
|
+
const writableStream = await handle.createWritable();
|
|
6099
|
+
await writableStream.write(blob);
|
|
6100
|
+
await writableStream.close();
|
|
6101
|
+
};
|
|
6102
|
+
/**
|
|
6103
|
+
* Background task for non-LFS resources: consumes `stream`, computes the SHA-256 hash
|
|
6104
|
+
* of the resulting blob, stores it in cross-origin storage, and persists the computed
|
|
6105
|
+
* hash to `HASH_CACHE_NAME` keyed by `request` so future `match` calls can resolve the
|
|
6106
|
+
* file without a network round-trip.
|
|
6107
|
+
*
|
|
6108
|
+
* Called fire-and-forget from `put` — errors are swallowed so failures never surface to
|
|
6109
|
+
* the caller.
|
|
6110
|
+
*
|
|
6111
|
+
* @param {string} request The original resource URL.
|
|
6112
|
+
* @param {ReadableStream} stream The response body stream to consume.
|
|
6113
|
+
* @returns {Promise<void>}
|
|
6114
|
+
*/
|
|
6115
|
+
_processAndStore = async (request, stream) => {
|
|
6116
|
+
try {
|
|
6117
|
+
const chunks = [];
|
|
6118
|
+
for await (const chunk2 of stream) {
|
|
6119
|
+
chunks.push(chunk2);
|
|
6120
|
+
}
|
|
6121
|
+
const blob = new Blob(chunks);
|
|
6122
|
+
const hashHex = await this._getBlobHash(blob);
|
|
6123
|
+
await this._storeBlobInCOS(blob, hashHex);
|
|
6124
|
+
try {
|
|
6125
|
+
const hashCache = await this._getHashCache();
|
|
6126
|
+
await hashCache.put(request, new Response(hashHex));
|
|
6127
|
+
} catch {
|
|
6128
|
+
}
|
|
6129
|
+
} catch {
|
|
6130
|
+
}
|
|
6131
|
+
};
|
|
6132
|
+
/**
|
|
6133
|
+
* Deletes the cache entry for the given request.
|
|
6134
|
+
*
|
|
6135
|
+
* Removes the hash entry from `HASH_CACHE_NAME`. Note: cross-origin storage itself does not
|
|
6136
|
+
* expose a delete API, so only the local hash mapping is removed. For non-LFS URLs this
|
|
6137
|
+
* permanently prevents `match` from resolving the file. For LFS-backed URLs, `match` will
|
|
6138
|
+
* re-fetch the LFS pointer file on the next call and repopulate the hash cache automatically.
|
|
6139
|
+
*
|
|
6140
|
+
* Implements `CacheInterface.delete`.
|
|
6141
|
+
*
|
|
6142
|
+
* @param {string} request
|
|
6143
|
+
* @returns {Promise<boolean>} Resolves to `true` if the hash entry was deleted, `false` otherwise.
|
|
6144
|
+
*/
|
|
6145
|
+
delete = async (request) => {
|
|
6146
|
+
try {
|
|
6147
|
+
const hashCache = await this._getHashCache();
|
|
6148
|
+
return await hashCache.delete(request);
|
|
6149
|
+
} catch {
|
|
6150
|
+
return false;
|
|
6151
|
+
}
|
|
6152
|
+
};
|
|
6153
|
+
/**
|
|
6154
|
+
* Resolves the SHA-256 hash for a given URL.
|
|
6155
|
+
*
|
|
6156
|
+
* Returns the cached hash immediately if one has been persisted to `HASH_CACHE_NAME`.
|
|
6157
|
+
* Otherwise falls back to `_getLfsFileHash` to retrieve the hash from the Hugging Face
|
|
6158
|
+
* LFS pointer file, persisting the result to `HASH_CACHE_NAME` for future lookups.
|
|
6159
|
+
*
|
|
6160
|
+
* Returns `null` if the hash cannot be determined (e.g. non-LFS URL with no cached entry).
|
|
6161
|
+
*
|
|
6162
|
+
* @param {string} url The resource URL to resolve a hash for.
|
|
6163
|
+
* @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
|
|
6164
|
+
*/
|
|
6165
|
+
_getFileHash = async (url) => {
|
|
6166
|
+
try {
|
|
6167
|
+
const hashCache = await this._getHashCache();
|
|
6168
|
+
const cached = await hashCache.match(url);
|
|
6169
|
+
if (cached) {
|
|
6170
|
+
return cached.text();
|
|
6171
|
+
}
|
|
6172
|
+
const hash = await this._getLfsFileHash(url);
|
|
6173
|
+
if (hash) {
|
|
6174
|
+
await hashCache.put(url, new Response(hash));
|
|
6175
|
+
return hash;
|
|
6176
|
+
}
|
|
6177
|
+
return null;
|
|
6178
|
+
} catch {
|
|
6179
|
+
return null;
|
|
6180
|
+
}
|
|
6181
|
+
};
|
|
6182
|
+
/**
|
|
6183
|
+
* Attempts to retrieve the SHA-256 hash for a Hugging Face resource URL from its raw
|
|
6184
|
+
* Git LFS pointer file.
|
|
6185
|
+
*
|
|
6186
|
+
* Only applicable to URLs containing `/resolve/` (i.e. Hugging Face resolved file URLs).
|
|
6187
|
+
* The `/resolve/` segment is rewritten to `/raw/` to fetch the LFS pointer directly.
|
|
6188
|
+
* Returns `null` for non-LFS URLs or when the network request fails.
|
|
6189
|
+
*
|
|
6190
|
+
* @see https://huggingface.co/docs/hub/en/storage-backends#xet
|
|
6191
|
+
* @param {string} url The resolved Hugging Face URL of the resource.
|
|
6192
|
+
* @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
|
|
6193
|
+
*/
|
|
6194
|
+
_getLfsFileHash = async (url) => {
|
|
6195
|
+
if (!url.includes("/resolve/")) {
|
|
6196
|
+
return null;
|
|
6197
|
+
}
|
|
6198
|
+
const rawUrl = url.replace("/resolve/", "/raw/");
|
|
6199
|
+
try {
|
|
6200
|
+
const text = await fetch(rawUrl).then((r) => r.text());
|
|
6201
|
+
const match = text.match(/^oid sha256:([0-9a-f]+)$/m);
|
|
6202
|
+
return match ? match[1] : null;
|
|
6203
|
+
} catch {
|
|
6204
|
+
return null;
|
|
6205
|
+
}
|
|
6206
|
+
};
|
|
6207
|
+
/**
|
|
6208
|
+
* Computes the SHA-256 hash of a `Blob`'s contents.
|
|
6209
|
+
*
|
|
6210
|
+
* @param {Blob} blob The blob to hash.
|
|
6211
|
+
* @returns {Promise<string>} The lowercase hex-encoded SHA-256 hash.
|
|
6212
|
+
*/
|
|
6213
|
+
_getBlobHash = async (blob) => {
|
|
6214
|
+
const arrayBuffer = await blob.arrayBuffer();
|
|
6215
|
+
const hashBuffer = await crypto.subtle.digest(HASH_ALGORITHM, arrayBuffer);
|
|
6216
|
+
const hashArray = Array.from(new Uint8Array(hashBuffer));
|
|
6217
|
+
return hashArray.map((byte) => byte.toString(16).padStart(2, "0")).join("");
|
|
6218
|
+
};
|
|
6219
|
+
};
|
|
6220
|
+
|
|
6004
6221
|
// src/utils/cache.js
|
|
6005
6222
|
async function getCache(file_cache_dir = null) {
|
|
6006
|
-
let cache = null;
|
|
6223
|
+
let cache2 = null;
|
|
6007
6224
|
if (env.useCustomCache) {
|
|
6008
6225
|
if (!env.customCache) {
|
|
6009
6226
|
throw Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
|
|
@@ -6013,30 +6230,33 @@ async function getCache(file_cache_dir = null) {
|
|
|
6013
6230
|
"`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
|
|
6014
6231
|
);
|
|
6015
6232
|
}
|
|
6016
|
-
|
|
6233
|
+
cache2 = env.customCache;
|
|
6017
6234
|
}
|
|
6018
|
-
if (!cache && env.useBrowserCache) {
|
|
6235
|
+
if (!cache2 && env.experimental_useCrossOriginStorage && CrossOriginStorage.isAvailable()) {
|
|
6236
|
+
cache2 = new CrossOriginStorage();
|
|
6237
|
+
}
|
|
6238
|
+
if (!cache2 && env.useBrowserCache) {
|
|
6019
6239
|
if (typeof caches === "undefined") {
|
|
6020
6240
|
throw Error("Browser cache is not available in this environment.");
|
|
6021
6241
|
}
|
|
6022
6242
|
try {
|
|
6023
|
-
|
|
6243
|
+
cache2 = await caches.open(env.cacheKey);
|
|
6024
6244
|
} catch (e) {
|
|
6025
6245
|
logger.warn("An error occurred while opening the browser cache:", e);
|
|
6026
6246
|
}
|
|
6027
6247
|
}
|
|
6028
|
-
if (!cache && env.useFSCache) {
|
|
6248
|
+
if (!cache2 && env.useFSCache) {
|
|
6029
6249
|
if (!apis.IS_FS_AVAILABLE) {
|
|
6030
6250
|
throw Error("File System Cache is not available in this environment.");
|
|
6031
6251
|
}
|
|
6032
|
-
|
|
6252
|
+
cache2 = new FileCache(file_cache_dir ?? env.cacheDir);
|
|
6033
6253
|
}
|
|
6034
|
-
return cache;
|
|
6254
|
+
return cache2;
|
|
6035
6255
|
}
|
|
6036
|
-
async function tryCache(cache, ...names) {
|
|
6256
|
+
async function tryCache(cache2, ...names) {
|
|
6037
6257
|
for (let name of names) {
|
|
6038
6258
|
try {
|
|
6039
|
-
let result = await cache.match(name);
|
|
6259
|
+
let result = await cache2.match(name);
|
|
6040
6260
|
if (result) return result;
|
|
6041
6261
|
} catch (e) {
|
|
6042
6262
|
continue;
|
|
@@ -6045,6 +6265,83 @@ async function tryCache(cache, ...names) {
|
|
|
6045
6265
|
return void 0;
|
|
6046
6266
|
}
|
|
6047
6267
|
|
|
6268
|
+
// src/utils/lru_cache.js
|
|
6269
|
+
var LRUCache2 = class {
|
|
6270
|
+
/** @type {number} */
|
|
6271
|
+
#capacity;
|
|
6272
|
+
/** @type {Map<any, any>} */
|
|
6273
|
+
#cache;
|
|
6274
|
+
/**
|
|
6275
|
+
* Creates an LRUCache instance.
|
|
6276
|
+
* @param {number} capacity The maximum number of items the cache can hold.
|
|
6277
|
+
*/
|
|
6278
|
+
constructor(capacity) {
|
|
6279
|
+
this.#capacity = capacity;
|
|
6280
|
+
this.#cache = /* @__PURE__ */ new Map();
|
|
6281
|
+
}
|
|
6282
|
+
/**
|
|
6283
|
+
* Retrieves the value associated with the given key and marks the key as recently used.
|
|
6284
|
+
* @param {any} key The key to retrieve.
|
|
6285
|
+
* @returns {any} The value associated with the key, or undefined if the key does not exist.
|
|
6286
|
+
*/
|
|
6287
|
+
get(key) {
|
|
6288
|
+
if (!this.#cache.has(key)) return void 0;
|
|
6289
|
+
const value = this.#cache.get(key);
|
|
6290
|
+
this.#cache.delete(key);
|
|
6291
|
+
this.#cache.set(key, value);
|
|
6292
|
+
return value;
|
|
6293
|
+
}
|
|
6294
|
+
/**
|
|
6295
|
+
* Inserts or updates the key-value pair in the cache.
|
|
6296
|
+
* If the key already exists, it is updated and marked as recently used.
|
|
6297
|
+
* If the cache exceeds its capacity, the least recently used item is evicted.
|
|
6298
|
+
* @param {any} key The key to add or update.
|
|
6299
|
+
* @param {any} value The value to associate with the key.
|
|
6300
|
+
*/
|
|
6301
|
+
put(key, value) {
|
|
6302
|
+
if (this.#cache.has(key)) {
|
|
6303
|
+
this.#cache.delete(key);
|
|
6304
|
+
}
|
|
6305
|
+
this.#cache.set(key, value);
|
|
6306
|
+
if (this.#cache.size > this.#capacity) {
|
|
6307
|
+
this.#cache.delete(this.#cache.keys().next().value);
|
|
6308
|
+
}
|
|
6309
|
+
}
|
|
6310
|
+
/**
|
|
6311
|
+
* Removes the entry for the given key from the cache.
|
|
6312
|
+
* @param {any} key The key to delete.
|
|
6313
|
+
* @returns {boolean} `true` if the entry existed and was removed, `false` otherwise.
|
|
6314
|
+
*/
|
|
6315
|
+
delete(key) {
|
|
6316
|
+
return this.#cache.delete(key);
|
|
6317
|
+
}
|
|
6318
|
+
/**
|
|
6319
|
+
* Clears the cache.
|
|
6320
|
+
*/
|
|
6321
|
+
clear() {
|
|
6322
|
+
this.#cache.clear();
|
|
6323
|
+
}
|
|
6324
|
+
};
|
|
6325
|
+
|
|
6326
|
+
// src/utils/memoize_promise.js
|
|
6327
|
+
var MAX_CACHE_SIZE = 100;
|
|
6328
|
+
var cache = new LRUCache2(MAX_CACHE_SIZE);
|
|
6329
|
+
function memoizePromise(key, factory) {
|
|
6330
|
+
const cached = cache.get(key);
|
|
6331
|
+
if (cached !== void 0) {
|
|
6332
|
+
return cached;
|
|
6333
|
+
}
|
|
6334
|
+
const promise = factory().then(
|
|
6335
|
+
(value) => value,
|
|
6336
|
+
(err) => {
|
|
6337
|
+
cache.delete(key);
|
|
6338
|
+
return Promise.reject(err);
|
|
6339
|
+
}
|
|
6340
|
+
);
|
|
6341
|
+
cache.put(key, promise);
|
|
6342
|
+
return promise;
|
|
6343
|
+
}
|
|
6344
|
+
|
|
6048
6345
|
// src/utils/model_registry/get_file_metadata.js
|
|
6049
6346
|
async function fetch_file_head(urlOrPath) {
|
|
6050
6347
|
if (!isValidUrl(urlOrPath, ["http:", "https:"])) {
|
|
@@ -6052,17 +6349,27 @@ async function fetch_file_head(urlOrPath) {
|
|
|
6052
6349
|
}
|
|
6053
6350
|
const headers = getFetchHeaders(urlOrPath);
|
|
6054
6351
|
headers.set("Range", "bytes=0-0");
|
|
6055
|
-
return env.fetch(urlOrPath, { method: "GET", headers });
|
|
6352
|
+
return env.fetch(urlOrPath, { method: "GET", headers, cache: "no-store" });
|
|
6353
|
+
}
|
|
6354
|
+
function get_file_metadata(path_or_repo_id, filename, options = {}) {
|
|
6355
|
+
const key = JSON.stringify([
|
|
6356
|
+
path_or_repo_id,
|
|
6357
|
+
filename,
|
|
6358
|
+
options?.revision,
|
|
6359
|
+
options?.cache_dir,
|
|
6360
|
+
options?.local_files_only
|
|
6361
|
+
]);
|
|
6362
|
+
return memoizePromise(key, () => _get_file_metadata(path_or_repo_id, filename, options));
|
|
6056
6363
|
}
|
|
6057
|
-
async function
|
|
6058
|
-
const cache = await getCache(options?.cache_dir);
|
|
6364
|
+
async function _get_file_metadata(path_or_repo_id, filename, options) {
|
|
6365
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6059
6366
|
const { localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6060
6367
|
path_or_repo_id,
|
|
6061
6368
|
filename,
|
|
6062
6369
|
options,
|
|
6063
|
-
|
|
6370
|
+
cache2
|
|
6064
6371
|
);
|
|
6065
|
-
const cachedResponse = await checkCachedResource(
|
|
6372
|
+
const cachedResponse = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6066
6373
|
if (cachedResponse !== void 0 && typeof cachedResponse !== "string") {
|
|
6067
6374
|
const size = cachedResponse.headers.get("content-length");
|
|
6068
6375
|
const contentType = cachedResponse.headers.get("content-type");
|
|
@@ -6160,7 +6467,7 @@ function getFetchHeaders(urlOrPath) {
|
|
|
6160
6467
|
}
|
|
6161
6468
|
return headers;
|
|
6162
6469
|
}
|
|
6163
|
-
function buildResourcePaths(path_or_repo_id, filename, options = {},
|
|
6470
|
+
function buildResourcePaths(path_or_repo_id, filename, options = {}, cache2 = null) {
|
|
6164
6471
|
const revision = options.revision ?? "main";
|
|
6165
6472
|
const requestURL = pathJoin(path_or_repo_id, filename);
|
|
6166
6473
|
const validModelId = isValidHfModelId(path_or_repo_id);
|
|
@@ -6170,7 +6477,7 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
6170
6477
|
env.remotePathTemplate.replaceAll("{model}", path_or_repo_id).replaceAll("{revision}", encodeURIComponent(revision)),
|
|
6171
6478
|
filename
|
|
6172
6479
|
);
|
|
6173
|
-
const proposedCacheKey =
|
|
6480
|
+
const proposedCacheKey = cache2 instanceof FileCache ? (
|
|
6174
6481
|
// Choose cache key for filesystem cache
|
|
6175
6482
|
// When using the main revision (default), we use the request URL as the cache key.
|
|
6176
6483
|
// If a specific revision is requested, we account for this in the cache key.
|
|
@@ -6184,14 +6491,14 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
6184
6491
|
validModelId
|
|
6185
6492
|
};
|
|
6186
6493
|
}
|
|
6187
|
-
async function checkCachedResource(
|
|
6188
|
-
if (!
|
|
6494
|
+
async function checkCachedResource(cache2, localPath, proposedCacheKey) {
|
|
6495
|
+
if (!cache2) {
|
|
6189
6496
|
return void 0;
|
|
6190
6497
|
}
|
|
6191
|
-
return await tryCache(
|
|
6498
|
+
return await tryCache(cache2, localPath, proposedCacheKey);
|
|
6192
6499
|
}
|
|
6193
|
-
async function storeCachedResource(path_or_repo_id, filename,
|
|
6194
|
-
if (await
|
|
6500
|
+
async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options = {}) {
|
|
6501
|
+
if (await cache2.match(cacheKey) !== void 0) {
|
|
6195
6502
|
return;
|
|
6196
6503
|
}
|
|
6197
6504
|
if (!result) {
|
|
@@ -6201,20 +6508,22 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
6201
6508
|
file: filename,
|
|
6202
6509
|
...data
|
|
6203
6510
|
}) : void 0;
|
|
6204
|
-
await
|
|
6511
|
+
await cache2.put(
|
|
6205
6512
|
cacheKey,
|
|
6206
6513
|
/** @type {Response} */
|
|
6207
6514
|
response,
|
|
6208
6515
|
wrapped_progress
|
|
6209
6516
|
);
|
|
6210
6517
|
} else if (typeof response !== "string") {
|
|
6211
|
-
|
|
6518
|
+
const headers = new Headers(response.headers);
|
|
6519
|
+
headers.set("content-length", result.byteLength.toString());
|
|
6520
|
+
await cache2.put(
|
|
6212
6521
|
cacheKey,
|
|
6213
6522
|
new Response(
|
|
6214
6523
|
/** @type {any} */
|
|
6215
6524
|
result,
|
|
6216
6525
|
{
|
|
6217
|
-
headers
|
|
6526
|
+
headers
|
|
6218
6527
|
}
|
|
6219
6528
|
)
|
|
6220
6529
|
).catch((err) => {
|
|
@@ -6222,17 +6531,17 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
6222
6531
|
});
|
|
6223
6532
|
}
|
|
6224
6533
|
}
|
|
6225
|
-
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false,
|
|
6534
|
+
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache2 = null) {
|
|
6226
6535
|
const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6227
6536
|
path_or_repo_id,
|
|
6228
6537
|
filename,
|
|
6229
6538
|
options,
|
|
6230
|
-
|
|
6539
|
+
cache2
|
|
6231
6540
|
);
|
|
6232
6541
|
let cacheKey;
|
|
6233
6542
|
let toCacheResponse = false;
|
|
6234
6543
|
let response;
|
|
6235
|
-
response = await checkCachedResource(
|
|
6544
|
+
response = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6236
6545
|
const cacheHit = response !== void 0;
|
|
6237
6546
|
if (!cacheHit) {
|
|
6238
6547
|
if (env.allowLocalModels) {
|
|
@@ -6273,7 +6582,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6273
6582
|
}
|
|
6274
6583
|
cacheKey = proposedCacheKey;
|
|
6275
6584
|
}
|
|
6276
|
-
toCacheResponse =
|
|
6585
|
+
toCacheResponse = cache2 && // 1. A caching system is available
|
|
6277
6586
|
typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment)
|
|
6278
6587
|
response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`)
|
|
6279
6588
|
response.status === 200;
|
|
@@ -6335,7 +6644,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6335
6644
|
// i.e., do not cache FileResponses (prevents duplication)
|
|
6336
6645
|
toCacheResponse && cacheKey && typeof response !== "string"
|
|
6337
6646
|
) {
|
|
6338
|
-
await storeCachedResource(path_or_repo_id, filename,
|
|
6647
|
+
await storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options);
|
|
6339
6648
|
}
|
|
6340
6649
|
dispatchCallback(options.progress_callback, {
|
|
6341
6650
|
status: "done",
|
|
@@ -6351,7 +6660,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6351
6660
|
if (response instanceof FileResponse) {
|
|
6352
6661
|
return response.filePath;
|
|
6353
6662
|
}
|
|
6354
|
-
const cachedResponse = await
|
|
6663
|
+
const cachedResponse = await cache2?.match(cacheKey);
|
|
6355
6664
|
if (cachedResponse instanceof FileResponse) {
|
|
6356
6665
|
return cachedResponse.filePath;
|
|
6357
6666
|
} else if (cachedResponse instanceof Response) {
|
|
@@ -6378,8 +6687,8 @@ async function getModelFile(path_or_repo_id, filename, fatal = true, options = {
|
|
|
6378
6687
|
name: path_or_repo_id,
|
|
6379
6688
|
file: filename
|
|
6380
6689
|
});
|
|
6381
|
-
const
|
|
6382
|
-
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path,
|
|
6690
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6691
|
+
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache2);
|
|
6383
6692
|
}
|
|
6384
6693
|
async function getModelText(modelPath, fileName, fatal = true, options = {}) {
|
|
6385
6694
|
const buffer = await getModelFile(modelPath, fileName, fatal, options, false);
|
|
@@ -7176,7 +7485,7 @@ __export(onnxruntime_node_exports, {
|
|
|
7176
7485
|
});
|
|
7177
7486
|
var onnxruntime_node_default = {};
|
|
7178
7487
|
|
|
7179
|
-
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.
|
|
7488
|
+
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260307-d626b568e0/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
|
|
7180
7489
|
var ort_webgpu_bundle_min_exports = {};
|
|
7181
7490
|
__export(ort_webgpu_bundle_min_exports, {
|
|
7182
7491
|
InferenceSession: () => Jf,
|
|
@@ -7944,7 +8253,7 @@ async function ts(a = {}) {
|
|
|
7944
8253
|
throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
|
|
7945
8254
|
}
|
|
7946
8255
|
function Ye() {
|
|
7947
|
-
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii,
|
|
8256
|
+
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
|
|
7948
8257
|
}
|
|
7949
8258
|
async function bt() {
|
|
7950
8259
|
function e(o, u) {
|
|
@@ -9131,7 +9440,7 @@ async function ts(a = {}) {
|
|
|
9131
9440
|
Te(`invalid type for getValue: ${t}`);
|
|
9132
9441
|
}
|
|
9133
9442
|
}, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
|
|
9134
|
-
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = {
|
|
9443
|
+
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 925676: (e, t, n, o, u) => {
|
|
9135
9444
|
if (r === void 0 || !r.Uc) return 1;
|
|
9136
9445
|
if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
|
|
9137
9446
|
if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
|
|
@@ -9151,11 +9460,11 @@ async function ts(a = {}) {
|
|
|
9151
9460
|
} catch {
|
|
9152
9461
|
return 4;
|
|
9153
9462
|
}
|
|
9154
|
-
},
|
|
9463
|
+
}, 926500: (e, t, n) => {
|
|
9155
9464
|
r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
|
|
9156
|
-
},
|
|
9465
|
+
}, 926564: () => r.me(), 926606: (e) => {
|
|
9157
9466
|
r.jd(e);
|
|
9158
|
-
},
|
|
9467
|
+
}, 926643: () => typeof wasmOffsetConverter < "u" };
|
|
9159
9468
|
function af(e, t, n, o) {
|
|
9160
9469
|
var u = P();
|
|
9161
9470
|
try {
|
|
@@ -11071,7 +11380,7 @@ var $s = k(() => {
|
|
|
11071
11380
|
Ve();
|
|
11072
11381
|
Ve();
|
|
11073
11382
|
Ve();
|
|
11074
|
-
var Xa = "1.25.0-dev.
|
|
11383
|
+
var Xa = "1.25.0-dev.20260307-d626b568e0";
|
|
11075
11384
|
var Tl = Zr;
|
|
11076
11385
|
{
|
|
11077
11386
|
let a = ($s(), $t(Gs)).wasmBackend;
|
|
@@ -11082,11 +11391,11 @@ Object.defineProperty(K.versions, "web", { value: Xa, enumerable: true });
|
|
|
11082
11391
|
// src/backends/utils/cacheWasm.js
|
|
11083
11392
|
async function loadAndCacheFile(url) {
|
|
11084
11393
|
const fileName = url.split("/").pop();
|
|
11085
|
-
let
|
|
11394
|
+
let cache2;
|
|
11086
11395
|
try {
|
|
11087
|
-
|
|
11088
|
-
if (
|
|
11089
|
-
const result = await
|
|
11396
|
+
cache2 = await getCache();
|
|
11397
|
+
if (cache2) {
|
|
11398
|
+
const result = await cache2.match(url);
|
|
11090
11399
|
if (result) {
|
|
11091
11400
|
return result;
|
|
11092
11401
|
}
|
|
@@ -11098,9 +11407,9 @@ async function loadAndCacheFile(url) {
|
|
|
11098
11407
|
if (!response.ok) {
|
|
11099
11408
|
throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
|
|
11100
11409
|
}
|
|
11101
|
-
if (
|
|
11410
|
+
if (cache2) {
|
|
11102
11411
|
try {
|
|
11103
|
-
await
|
|
11412
|
+
await cache2.put(url, response.clone());
|
|
11104
11413
|
} catch (e) {
|
|
11105
11414
|
logger.warn(`Failed to cache ${fileName}:`, e);
|
|
11106
11415
|
}
|
|
@@ -13715,9 +14024,23 @@ var Tensor3 = class _Tensor {
|
|
|
13715
14024
|
throw Error(`Unsupported norm: ${p}`);
|
|
13716
14025
|
}
|
|
13717
14026
|
const this_data = this.data;
|
|
13718
|
-
const
|
|
14027
|
+
const is_bigint = this_data instanceof BigInt64Array || this_data instanceof BigUint64Array;
|
|
14028
|
+
if (is_bigint && p !== 1) {
|
|
14029
|
+
throw Error(`Expected a floating point tensor as input. Got ${this.type}`);
|
|
14030
|
+
}
|
|
14031
|
+
let fn2, zero;
|
|
14032
|
+
if (is_bigint) {
|
|
14033
|
+
fn2 = (a, b) => a + b;
|
|
14034
|
+
zero = 0n;
|
|
14035
|
+
} else {
|
|
14036
|
+
fn2 = (a, b) => a + b ** p;
|
|
14037
|
+
zero = 0;
|
|
14038
|
+
}
|
|
13719
14039
|
if (dim === null) {
|
|
13720
|
-
|
|
14040
|
+
let val = this_data.reduce(fn2, zero);
|
|
14041
|
+
if (p !== 1) {
|
|
14042
|
+
val = val ** (1 / p);
|
|
14043
|
+
}
|
|
13721
14044
|
return new _Tensor(this.type, [val], []);
|
|
13722
14045
|
}
|
|
13723
14046
|
const [type, result, resultDims] = reduce_helper(fn2, this, dim, keepdim);
|
|
@@ -16177,9 +16500,12 @@ __export(processors_exports, {
|
|
|
16177
16500
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
16178
16501
|
Florence2Processor: () => Florence2Processor,
|
|
16179
16502
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
16503
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
16504
|
+
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
16180
16505
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
16181
16506
|
Idefics3Processor: () => Idefics3Processor,
|
|
16182
16507
|
JinaCLIPProcessor: () => JinaCLIPProcessor,
|
|
16508
|
+
Lfm2VlProcessor: () => Lfm2VlProcessor,
|
|
16183
16509
|
LlavaProcessor: () => LlavaProcessor,
|
|
16184
16510
|
MgpstrProcessor: () => MgpstrProcessor,
|
|
16185
16511
|
MoonshineProcessor: () => MoonshineProcessor,
|
|
@@ -16200,6 +16526,7 @@ __export(processors_exports, {
|
|
|
16200
16526
|
UltravoxProcessor: () => UltravoxProcessor,
|
|
16201
16527
|
VLChatProcessor: () => VLChatProcessor,
|
|
16202
16528
|
VoxtralProcessor: () => VoxtralProcessor,
|
|
16529
|
+
VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
|
|
16203
16530
|
Wav2Vec2Processor: () => Wav2Vec2Processor,
|
|
16204
16531
|
Wav2Vec2ProcessorWithLM: () => Wav2Vec2ProcessorWithLM,
|
|
16205
16532
|
WhisperProcessor: () => WhisperProcessor
|
|
@@ -16254,12 +16581,14 @@ __export(feature_extractors_exports, {
|
|
|
16254
16581
|
EncodecFeatureExtractor: () => EncodecFeatureExtractor,
|
|
16255
16582
|
FeatureExtractor: () => FeatureExtractor,
|
|
16256
16583
|
Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
|
|
16584
|
+
GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
|
|
16257
16585
|
MoonshineFeatureExtractor: () => MoonshineFeatureExtractor,
|
|
16258
16586
|
ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
|
|
16259
16587
|
PyAnnoteFeatureExtractor: () => PyAnnoteFeatureExtractor,
|
|
16260
16588
|
SeamlessM4TFeatureExtractor: () => SeamlessM4TFeatureExtractor,
|
|
16261
16589
|
SnacFeatureExtractor: () => SnacFeatureExtractor,
|
|
16262
16590
|
SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
|
|
16591
|
+
VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
|
|
16263
16592
|
Wav2Vec2FeatureExtractor: () => Wav2Vec2FeatureExtractor,
|
|
16264
16593
|
WeSpeakerFeatureExtractor: () => WeSpeakerFeatureExtractor,
|
|
16265
16594
|
WhisperFeatureExtractor: () => WhisperFeatureExtractor
|
|
@@ -16494,6 +16823,7 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
16494
16823
|
mel_filters = null,
|
|
16495
16824
|
mel_floor = 1e-10,
|
|
16496
16825
|
log_mel = null,
|
|
16826
|
+
max_log_mel = null,
|
|
16497
16827
|
reference = 1,
|
|
16498
16828
|
min_value = 1e-10,
|
|
16499
16829
|
db_range = null,
|
|
@@ -16633,6 +16963,17 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
16633
16963
|
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
16634
16964
|
}
|
|
16635
16965
|
break;
|
|
16966
|
+
case "log10_max_norm": {
|
|
16967
|
+
for (let i = 0; i < o; ++i) {
|
|
16968
|
+
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
16969
|
+
}
|
|
16970
|
+
const logMax = max_log_mel ?? max(mel_spec_data)[0];
|
|
16971
|
+
const threshold = logMax - 8;
|
|
16972
|
+
for (let i = 0; i < o; ++i) {
|
|
16973
|
+
mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4) / 4;
|
|
16974
|
+
}
|
|
16975
|
+
break;
|
|
16976
|
+
}
|
|
16636
16977
|
case "dB":
|
|
16637
16978
|
if (power === 1) {
|
|
16638
16979
|
amplitude_to_db(mel_spec_data, reference, min_value, db_range);
|
|
@@ -16643,7 +16984,9 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
16643
16984
|
}
|
|
16644
16985
|
break;
|
|
16645
16986
|
default:
|
|
16646
|
-
throw new Error(
|
|
16987
|
+
throw new Error(
|
|
16988
|
+
`log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`
|
|
16989
|
+
);
|
|
16647
16990
|
}
|
|
16648
16991
|
}
|
|
16649
16992
|
return mel_spec;
|
|
@@ -17148,6 +17491,56 @@ var Gemma3nAudioFeatureExtractor = class extends FeatureExtractor {
|
|
|
17148
17491
|
}
|
|
17149
17492
|
};
|
|
17150
17493
|
|
|
17494
|
+
// src/models/granite_speech/feature_extraction_granite_speech.js
|
|
17495
|
+
var GraniteSpeechFeatureExtractor = class extends FeatureExtractor {
|
|
17496
|
+
constructor(config) {
|
|
17497
|
+
super(config);
|
|
17498
|
+
const { n_fft, win_length, n_mels, sample_rate } = config.melspec_kwargs;
|
|
17499
|
+
this.mel_filters = mel_filter_bank(
|
|
17500
|
+
Math.floor(1 + n_fft / 2),
|
|
17501
|
+
// num_frequency_bins = 257
|
|
17502
|
+
n_mels,
|
|
17503
|
+
// 80
|
|
17504
|
+
0,
|
|
17505
|
+
// min_frequency
|
|
17506
|
+
sample_rate / 2,
|
|
17507
|
+
// max_frequency = 8000
|
|
17508
|
+
sample_rate,
|
|
17509
|
+
// 16000
|
|
17510
|
+
null,
|
|
17511
|
+
// norm (torchaudio default: no norm)
|
|
17512
|
+
"htk"
|
|
17513
|
+
// mel_scale (torchaudio default)
|
|
17514
|
+
);
|
|
17515
|
+
const raw_window = window_function(win_length, "hann");
|
|
17516
|
+
this.window = new Float64Array(n_fft);
|
|
17517
|
+
const pad = Math.floor((n_fft - win_length) / 2);
|
|
17518
|
+
this.window.set(raw_window, pad);
|
|
17519
|
+
}
|
|
17520
|
+
/**
|
|
17521
|
+
* Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
|
|
17522
|
+
* @param {Float32Array|Float64Array} audio The audio waveform.
|
|
17523
|
+
* @returns {Promise<{input_features: Tensor}>}
|
|
17524
|
+
*/
|
|
17525
|
+
async _call(audio) {
|
|
17526
|
+
validate_audio_inputs(audio, "GraniteSpeechFeatureExtractor");
|
|
17527
|
+
const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;
|
|
17528
|
+
const num_frames = 1 + Math.floor((audio.length - 1) / hop_length);
|
|
17529
|
+
const max_num_frames = num_frames - num_frames % 2;
|
|
17530
|
+
const mel = await spectrogram(audio, this.window, n_fft, hop_length, {
|
|
17531
|
+
power: 2,
|
|
17532
|
+
mel_filters: this.mel_filters,
|
|
17533
|
+
log_mel: "log10_max_norm",
|
|
17534
|
+
transpose: true,
|
|
17535
|
+
// [time, n_mels]
|
|
17536
|
+
max_num_frames,
|
|
17537
|
+
do_pad: false
|
|
17538
|
+
});
|
|
17539
|
+
const input_features = mel.view(-1, 2 * n_mels).unsqueeze_(0);
|
|
17540
|
+
return { input_features };
|
|
17541
|
+
}
|
|
17542
|
+
};
|
|
17543
|
+
|
|
17151
17544
|
// src/models/moonshine/feature_extraction_moonshine.js
|
|
17152
17545
|
var MoonshineFeatureExtractor = class extends FeatureExtractor {
|
|
17153
17546
|
/**
|
|
@@ -17628,6 +18021,71 @@ var WeSpeakerFeatureExtractor = class extends FeatureExtractor {
|
|
|
17628
18021
|
}
|
|
17629
18022
|
};
|
|
17630
18023
|
|
|
18024
|
+
// src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js
|
|
18025
|
+
var VoxtralRealtimeFeatureExtractor = class extends FeatureExtractor {
|
|
18026
|
+
constructor(config) {
|
|
18027
|
+
super(config);
|
|
18028
|
+
this.config.mel_filters ??= mel_filter_bank(
|
|
18029
|
+
Math.floor(1 + this.config.n_fft / 2),
|
|
18030
|
+
// num_frequency_bins
|
|
18031
|
+
this.config.feature_size,
|
|
18032
|
+
// num_mel_filters
|
|
18033
|
+
0,
|
|
18034
|
+
// min_frequency
|
|
18035
|
+
8e3,
|
|
18036
|
+
// max_frequency
|
|
18037
|
+
this.config.sampling_rate,
|
|
18038
|
+
// sampling_rate
|
|
18039
|
+
"slaney",
|
|
18040
|
+
// norm
|
|
18041
|
+
"slaney"
|
|
18042
|
+
// mel_scale
|
|
18043
|
+
);
|
|
18044
|
+
this.window = window_function(this.config.n_fft, "hann");
|
|
18045
|
+
}
|
|
18046
|
+
/**
|
|
18047
|
+
* Computes the log-Mel spectrogram of the provided audio waveform.
|
|
18048
|
+
* @param {Float32Array|Float64Array} waveform The audio waveform to process.
|
|
18049
|
+
* @param {Object} [options]
|
|
18050
|
+
* @param {boolean} [options.center=true] Whether to center-pad the waveform for STFT.
|
|
18051
|
+
* @returns {Promise<import('../../utils/tensor.js').Tensor>} The log-Mel spectrogram tensor of shape [num_mel_bins, num_frames].
|
|
18052
|
+
*/
|
|
18053
|
+
async _extract_fbank_features(waveform, { center = true } = {}) {
|
|
18054
|
+
const { n_fft, hop_length, mel_filters, global_log_mel_max } = this.config;
|
|
18055
|
+
const max_num_frames = center ? Math.floor(waveform.length / hop_length) : Math.floor((waveform.length - n_fft) / hop_length);
|
|
18056
|
+
return await spectrogram(
|
|
18057
|
+
waveform,
|
|
18058
|
+
this.window,
|
|
18059
|
+
n_fft,
|
|
18060
|
+
// frame_length
|
|
18061
|
+
hop_length,
|
|
18062
|
+
{
|
|
18063
|
+
power: 2,
|
|
18064
|
+
mel_filters,
|
|
18065
|
+
log_mel: "log10_max_norm",
|
|
18066
|
+
max_log_mel: global_log_mel_max,
|
|
18067
|
+
center,
|
|
18068
|
+
max_num_frames,
|
|
18069
|
+
do_pad: false
|
|
18070
|
+
}
|
|
18071
|
+
);
|
|
18072
|
+
}
|
|
18073
|
+
/**
|
|
18074
|
+
* Extract mel spectrogram features from audio.
|
|
18075
|
+
* @param {Float32Array|Float64Array} audio The audio data.
|
|
18076
|
+
* @param {Object} [options]
|
|
18077
|
+
* @param {boolean} [options.center=true] Whether to center-pad the waveform.
|
|
18078
|
+
* @returns {Promise<{ input_features: import('../../utils/tensor.js').Tensor }>}
|
|
18079
|
+
*/
|
|
18080
|
+
async _call(audio, { center = true } = {}) {
|
|
18081
|
+
validate_audio_inputs(audio, "VoxtralRealtimeFeatureExtractor");
|
|
18082
|
+
const features = await this._extract_fbank_features(audio, { center });
|
|
18083
|
+
return {
|
|
18084
|
+
input_features: features.unsqueeze_(0)
|
|
18085
|
+
};
|
|
18086
|
+
}
|
|
18087
|
+
};
|
|
18088
|
+
|
|
17631
18089
|
// src/models/whisper/feature_extraction_whisper.js
|
|
17632
18090
|
var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
17633
18091
|
constructor(config) {
|
|
@@ -17656,7 +18114,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
17656
18114
|
* @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
|
|
17657
18115
|
*/
|
|
17658
18116
|
async _extract_fbank_features(waveform) {
|
|
17659
|
-
|
|
18117
|
+
return await spectrogram(
|
|
17660
18118
|
waveform,
|
|
17661
18119
|
this.window,
|
|
17662
18120
|
// window
|
|
@@ -17667,7 +18125,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
17667
18125
|
{
|
|
17668
18126
|
power: 2,
|
|
17669
18127
|
mel_filters: this.config.mel_filters,
|
|
17670
|
-
log_mel: "
|
|
18128
|
+
log_mel: "log10_max_norm",
|
|
17671
18129
|
// Custom
|
|
17672
18130
|
max_num_frames: Math.min(
|
|
17673
18131
|
Math.floor(waveform.length / this.config.hop_length),
|
|
@@ -17676,15 +18134,6 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
17676
18134
|
)
|
|
17677
18135
|
}
|
|
17678
18136
|
);
|
|
17679
|
-
const data = features.data;
|
|
17680
|
-
const maxValue = max(
|
|
17681
|
-
/** @type {Float32Array} */
|
|
17682
|
-
data
|
|
17683
|
-
)[0];
|
|
17684
|
-
for (let i = 0; i < data.length; ++i) {
|
|
17685
|
-
data[i] = (Math.max(data[i], maxValue - 8) + 4) / 4;
|
|
17686
|
-
}
|
|
17687
|
-
return features;
|
|
17688
18137
|
}
|
|
17689
18138
|
/**
|
|
17690
18139
|
* Asynchronously extracts features from a given audio using the provided configuration.
|
|
@@ -18565,6 +19014,30 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
18565
19014
|
}
|
|
18566
19015
|
return [segmentation, segments];
|
|
18567
19016
|
}
|
|
19017
|
+
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
|
|
19018
|
+
if (height < factor || width < factor) {
|
|
19019
|
+
const scale = Math.max(factor / height, factor / width);
|
|
19020
|
+
height = Math.round(height * scale);
|
|
19021
|
+
width = Math.round(width * scale);
|
|
19022
|
+
}
|
|
19023
|
+
if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
19024
|
+
throw new Error(
|
|
19025
|
+
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
19026
|
+
);
|
|
19027
|
+
}
|
|
19028
|
+
let h_bar = Math.round(height / factor) * factor;
|
|
19029
|
+
let w_bar = Math.round(width / factor) * factor;
|
|
19030
|
+
if (temporal_factor * h_bar * w_bar > max_pixels) {
|
|
19031
|
+
const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
|
|
19032
|
+
h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
|
|
19033
|
+
w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
|
|
19034
|
+
} else if (temporal_factor * h_bar * w_bar < min_pixels) {
|
|
19035
|
+
const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
|
|
19036
|
+
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
19037
|
+
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
19038
|
+
}
|
|
19039
|
+
return [w_bar, h_bar];
|
|
19040
|
+
}
|
|
18568
19041
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
18569
19042
|
if (label_ids_to_fuse === null) {
|
|
18570
19043
|
logger.warn("`label_ids_to_fuse` unset. No instance will be fused.");
|
|
@@ -18642,7 +19115,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18642
19115
|
this.do_pad = config.do_pad;
|
|
18643
19116
|
this.min_pixels = config.min_pixels;
|
|
18644
19117
|
this.max_pixels = config.max_pixels;
|
|
18645
|
-
if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
19118
|
+
if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
18646
19119
|
this.pad_size = this.size;
|
|
18647
19120
|
}
|
|
18648
19121
|
this.do_flip_channel_order = config.do_flip_channel_order ?? false;
|
|
@@ -18853,7 +19326,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18853
19326
|
});
|
|
18854
19327
|
}
|
|
18855
19328
|
/**
|
|
18856
|
-
* @typedef {
|
|
19329
|
+
* @typedef {Object} PreprocessedImage
|
|
18857
19330
|
* @property {HeightWidth} original_size The original size of the image.
|
|
18858
19331
|
* @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
|
|
18859
19332
|
* @property {Tensor} pixel_values The pixel values of the preprocessed image.
|
|
@@ -18930,10 +19403,8 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18930
19403
|
const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
|
|
18931
19404
|
[pixelData, imgDims] = padded;
|
|
18932
19405
|
} else if (this.size_divisibility) {
|
|
18933
|
-
const
|
|
18934
|
-
|
|
18935
|
-
this.size_divisibility
|
|
18936
|
-
);
|
|
19406
|
+
const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
|
|
19407
|
+
const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
|
|
18937
19408
|
[pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
|
|
18938
19409
|
}
|
|
18939
19410
|
}
|
|
@@ -19010,6 +19481,7 @@ var image_processors_exports = {};
|
|
|
19010
19481
|
__export(image_processors_exports, {
|
|
19011
19482
|
BeitFeatureExtractor: () => BeitFeatureExtractor,
|
|
19012
19483
|
BitImageProcessor: () => BitImageProcessor,
|
|
19484
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
19013
19485
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
19014
19486
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
19015
19487
|
ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
|
|
@@ -19026,11 +19498,13 @@ __export(image_processors_exports, {
|
|
|
19026
19498
|
DonutImageProcessor: () => DonutImageProcessor,
|
|
19027
19499
|
EfficientNetImageProcessor: () => EfficientNetImageProcessor,
|
|
19028
19500
|
GLPNFeatureExtractor: () => GLPNFeatureExtractor,
|
|
19501
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
19029
19502
|
GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
|
|
19030
19503
|
Idefics3ImageProcessor: () => Idefics3ImageProcessor,
|
|
19031
19504
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
19032
19505
|
ImageProcessor: () => ImageProcessor,
|
|
19033
19506
|
JinaCLIPImageProcessor: () => JinaCLIPImageProcessor,
|
|
19507
|
+
Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
|
|
19034
19508
|
LlavaOnevisionImageProcessor: () => LlavaOnevisionImageProcessor,
|
|
19035
19509
|
Mask2FormerImageProcessor: () => Mask2FormerImageProcessor,
|
|
19036
19510
|
MaskFormerFeatureExtractor: () => MaskFormerFeatureExtractor,
|
|
@@ -19085,6 +19559,10 @@ var BitImageProcessor = class extends ImageProcessor {
|
|
|
19085
19559
|
var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
|
|
19086
19560
|
};
|
|
19087
19561
|
|
|
19562
|
+
// src/models/chmv2/image_processing_chmv2.js
|
|
19563
|
+
var CHMv2ImageProcessor = class extends ImageProcessor {
|
|
19564
|
+
};
|
|
19565
|
+
|
|
19088
19566
|
// src/models/clip/image_processing_clip.js
|
|
19089
19567
|
var CLIPImageProcessor = class extends ImageProcessor {
|
|
19090
19568
|
};
|
|
@@ -19204,6 +19682,65 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
|
|
|
19204
19682
|
}
|
|
19205
19683
|
};
|
|
19206
19684
|
|
|
19685
|
+
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
19686
|
+
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
19687
|
+
constructor(config) {
|
|
19688
|
+
super(config);
|
|
19689
|
+
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
19690
|
+
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
19691
|
+
this.patch_size = config.patch_size;
|
|
19692
|
+
this.merge_size = config.merge_size;
|
|
19693
|
+
}
|
|
19694
|
+
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
19695
|
+
get_resize_output_image_size(image, size) {
|
|
19696
|
+
const factor = this.patch_size * this.merge_size;
|
|
19697
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
19698
|
+
}
|
|
19699
|
+
async _call(images, ...args) {
|
|
19700
|
+
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
19701
|
+
let patches = pixel_values;
|
|
19702
|
+
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
19703
|
+
if (patches.dims[0] === 1) {
|
|
19704
|
+
patches = cat(
|
|
19705
|
+
Array.from({ length: temporal_patch_size }, () => patches),
|
|
19706
|
+
0
|
|
19707
|
+
);
|
|
19708
|
+
}
|
|
19709
|
+
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
19710
|
+
const channel = patches.dims[1];
|
|
19711
|
+
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
19712
|
+
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
19713
|
+
const flatten_patches = patches.view(
|
|
19714
|
+
grid_t,
|
|
19715
|
+
temporal_patch_size,
|
|
19716
|
+
channel,
|
|
19717
|
+
Math.floor(grid_h / merge_size),
|
|
19718
|
+
merge_size,
|
|
19719
|
+
patch_size,
|
|
19720
|
+
Math.floor(grid_w / merge_size),
|
|
19721
|
+
merge_size,
|
|
19722
|
+
patch_size
|
|
19723
|
+
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
19724
|
+
const image_grid_thw = new Tensor3("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
19725
|
+
return {
|
|
19726
|
+
pixel_values: flatten_patches,
|
|
19727
|
+
image_grid_thw,
|
|
19728
|
+
original_sizes,
|
|
19729
|
+
reshaped_input_sizes
|
|
19730
|
+
};
|
|
19731
|
+
}
|
|
19732
|
+
};
|
|
19733
|
+
|
|
19734
|
+
// src/models/glm46v/image_processing_glm46v.js
|
|
19735
|
+
var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
|
|
19736
|
+
/** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
|
|
19737
|
+
get_resize_output_image_size(image, size) {
|
|
19738
|
+
const factor = this.patch_size * this.merge_size;
|
|
19739
|
+
const temporal_factor = this.config.temporal_patch_size ?? 2;
|
|
19740
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
|
|
19741
|
+
}
|
|
19742
|
+
};
|
|
19743
|
+
|
|
19207
19744
|
// src/models/glpn/image_processing_glpn.js
|
|
19208
19745
|
var GLPNFeatureExtractor = class extends ImageProcessor {
|
|
19209
19746
|
};
|
|
@@ -19434,6 +19971,237 @@ var JinaCLIPImageProcessor = class extends ImageProcessor {
|
|
|
19434
19971
|
}
|
|
19435
19972
|
};
|
|
19436
19973
|
|
|
19974
|
+
// src/models/lfm2_vl/image_processing_lfm2_vl.js
|
|
19975
|
+
function round_by_factor(number, factor) {
|
|
19976
|
+
return Math.round(number / factor) * factor;
|
|
19977
|
+
}
|
|
19978
|
+
function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
|
|
19979
|
+
let best_ratio_diff = Infinity;
|
|
19980
|
+
let best_ratio = [1, 1];
|
|
19981
|
+
const area = width * height;
|
|
19982
|
+
for (const ratio of target_ratios) {
|
|
19983
|
+
const ratio_diff = Math.abs(aspect_ratio - ratio[0] / ratio[1]);
|
|
19984
|
+
if (ratio_diff < best_ratio_diff) {
|
|
19985
|
+
best_ratio_diff = ratio_diff;
|
|
19986
|
+
best_ratio = ratio;
|
|
19987
|
+
} else if (ratio_diff === best_ratio_diff && area > 0.5 * image_size * image_size * ratio[0] * ratio[1]) {
|
|
19988
|
+
best_ratio = ratio;
|
|
19989
|
+
}
|
|
19990
|
+
}
|
|
19991
|
+
return best_ratio;
|
|
19992
|
+
}
|
|
19993
|
+
function get_target_ratios(min_tiles, max_tiles) {
|
|
19994
|
+
const ratios = [];
|
|
19995
|
+
const seen = /* @__PURE__ */ new Set();
|
|
19996
|
+
for (let n = min_tiles; n <= max_tiles; ++n) {
|
|
19997
|
+
for (let w = 1; w <= n; ++w) {
|
|
19998
|
+
for (let h = 1; h <= n; ++h) {
|
|
19999
|
+
const product2 = w * h;
|
|
20000
|
+
if (product2 >= min_tiles && product2 <= max_tiles) {
|
|
20001
|
+
const key = w << 16 | h;
|
|
20002
|
+
if (!seen.has(key)) {
|
|
20003
|
+
seen.add(key);
|
|
20004
|
+
ratios.push([w, h]);
|
|
20005
|
+
}
|
|
20006
|
+
}
|
|
20007
|
+
}
|
|
20008
|
+
}
|
|
20009
|
+
}
|
|
20010
|
+
return ratios.sort((a, b) => a[0] * a[1] - b[0] * b[1]);
|
|
20011
|
+
}
|
|
20012
|
+
function convert_image_to_patches(images, patch_size) {
|
|
20013
|
+
const [B, C, H, W] = images.dims;
|
|
20014
|
+
const ph = Math.floor(H / patch_size), pw = Math.floor(W / patch_size);
|
|
20015
|
+
const patch_dim = patch_size * patch_size * C;
|
|
20016
|
+
const data = (
|
|
20017
|
+
/** @type {Float32Array} */
|
|
20018
|
+
images.data
|
|
20019
|
+
);
|
|
20020
|
+
const result = new Float32Array(B * ph * pw * patch_dim);
|
|
20021
|
+
const ch_stride = H * W;
|
|
20022
|
+
for (let b = 0; b < B; ++b) {
|
|
20023
|
+
const b_src = b * C * ch_stride;
|
|
20024
|
+
const b_dst = b * ph * pw * patch_dim;
|
|
20025
|
+
for (let py = 0; py < ph; ++py) {
|
|
20026
|
+
for (let px = 0; px < pw; ++px) {
|
|
20027
|
+
let off = b_dst + (py * pw + px) * patch_dim;
|
|
20028
|
+
for (let dy = 0; dy < patch_size; ++dy) {
|
|
20029
|
+
const row = (py * patch_size + dy) * W + px * patch_size;
|
|
20030
|
+
for (let dx = 0; dx < patch_size; ++dx) {
|
|
20031
|
+
const pixel = row + dx;
|
|
20032
|
+
for (let c = 0; c < C; ++c) {
|
|
20033
|
+
result[off++] = data[b_src + c * ch_stride + pixel];
|
|
20034
|
+
}
|
|
20035
|
+
}
|
|
20036
|
+
}
|
|
20037
|
+
}
|
|
20038
|
+
}
|
|
20039
|
+
}
|
|
20040
|
+
return new Tensor3("float32", result, [B, ph * pw, patch_dim]);
|
|
20041
|
+
}
|
|
20042
|
+
function pad_along_first_dim(patches, target_length) {
|
|
20043
|
+
const [, len2, dim] = patches.dims;
|
|
20044
|
+
const mask_data = new BigInt64Array(target_length);
|
|
20045
|
+
mask_data.fill(1n, 0, len2);
|
|
20046
|
+
let padded = patches;
|
|
20047
|
+
if (len2 < target_length) {
|
|
20048
|
+
const padded_data = new Float32Array(target_length * dim);
|
|
20049
|
+
padded_data.set(
|
|
20050
|
+
/** @type {Float32Array} */
|
|
20051
|
+
patches.data
|
|
20052
|
+
);
|
|
20053
|
+
padded = new Tensor3("float32", padded_data, [1, target_length, dim]);
|
|
20054
|
+
}
|
|
20055
|
+
return { padded, mask: new Tensor3("int64", mask_data, [target_length]) };
|
|
20056
|
+
}
|
|
20057
|
+
var Lfm2VlImageProcessor = class extends ImageProcessor {
|
|
20058
|
+
constructor(config) {
|
|
20059
|
+
super(config);
|
|
20060
|
+
this.downsample_factor = config.downsample_factor ?? 2;
|
|
20061
|
+
this.do_image_splitting = config.do_image_splitting ?? true;
|
|
20062
|
+
this.min_tiles = config.min_tiles ?? 2;
|
|
20063
|
+
this.max_tiles = config.max_tiles ?? 10;
|
|
20064
|
+
this.use_thumbnail = config.use_thumbnail ?? true;
|
|
20065
|
+
this.min_image_tokens = config.min_image_tokens ?? 64;
|
|
20066
|
+
this.max_image_tokens = config.max_image_tokens ?? 256;
|
|
20067
|
+
this.encoder_patch_size = config.encoder_patch_size ?? config.patch_size ?? 16;
|
|
20068
|
+
this.tile_size = config.tile_size ?? 512;
|
|
20069
|
+
this.max_pixels_tolerance = config.max_pixels_tolerance ?? 2;
|
|
20070
|
+
this.return_row_col_info = config.return_row_col_info ?? false;
|
|
20071
|
+
const max_thumbnail_patches = this.max_image_tokens * this.downsample_factor ** 2;
|
|
20072
|
+
const tile_size_patches = this.do_image_splitting ? (this.tile_size / this.encoder_patch_size) ** 2 : 0;
|
|
20073
|
+
this.max_num_patches = Math.max(max_thumbnail_patches, tile_size_patches);
|
|
20074
|
+
}
|
|
20075
|
+
/**
|
|
20076
|
+
* Check if the image is too large to be processed as a single tile.
|
|
20077
|
+
* @param {number} height
|
|
20078
|
+
* @param {number} width
|
|
20079
|
+
* @returns {boolean}
|
|
20080
|
+
*/
|
|
20081
|
+
_is_image_too_large(height, width) {
|
|
20082
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
20083
|
+
const h_bar = Math.max(this.encoder_patch_size, round_by_factor(height, total_factor));
|
|
20084
|
+
const w_bar = Math.max(this.encoder_patch_size, round_by_factor(width, total_factor));
|
|
20085
|
+
return h_bar * w_bar > this.max_image_tokens * (this.encoder_patch_size * this.downsample_factor) ** 2 * this.max_pixels_tolerance;
|
|
20086
|
+
}
|
|
20087
|
+
/**
|
|
20088
|
+
* Get the grid layout for tiling a large image.
|
|
20089
|
+
* @param {number} height
|
|
20090
|
+
* @param {number} width
|
|
20091
|
+
* @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
|
|
20092
|
+
*/
|
|
20093
|
+
_get_grid_layout(height, width) {
|
|
20094
|
+
const target_ratios = get_target_ratios(this.min_tiles, this.max_tiles);
|
|
20095
|
+
const [grid_width, grid_height] = find_closest_aspect_ratio(
|
|
20096
|
+
width / height,
|
|
20097
|
+
target_ratios,
|
|
20098
|
+
width,
|
|
20099
|
+
height,
|
|
20100
|
+
this.tile_size
|
|
20101
|
+
);
|
|
20102
|
+
return {
|
|
20103
|
+
grid_width,
|
|
20104
|
+
grid_height,
|
|
20105
|
+
target_width: this.tile_size * grid_width,
|
|
20106
|
+
target_height: this.tile_size * grid_height
|
|
20107
|
+
};
|
|
20108
|
+
}
|
|
20109
|
+
/** @param {RawImage|RawImage[]|RawImage[][]} images */
|
|
20110
|
+
// @ts-expect-error
|
|
20111
|
+
async _call(images, { return_row_col_info = null } = {}) {
|
|
20112
|
+
let batched_images;
|
|
20113
|
+
if (!Array.isArray(images)) {
|
|
20114
|
+
batched_images = [[images]];
|
|
20115
|
+
} else if (!Array.isArray(images[0])) {
|
|
20116
|
+
batched_images = [
|
|
20117
|
+
/** @type {RawImage[]} */
|
|
20118
|
+
images
|
|
20119
|
+
];
|
|
20120
|
+
} else {
|
|
20121
|
+
batched_images = /** @type {RawImage[][]} */
|
|
20122
|
+
images;
|
|
20123
|
+
}
|
|
20124
|
+
const all_pixel_values = [];
|
|
20125
|
+
const all_pixel_masks = [];
|
|
20126
|
+
const all_spatial_shapes = [];
|
|
20127
|
+
const all_rows = [];
|
|
20128
|
+
const all_cols = [];
|
|
20129
|
+
const all_image_sizes = [];
|
|
20130
|
+
for (const image_batch of batched_images) {
|
|
20131
|
+
const preprocessed = await Promise.all(image_batch.map((x) => this.preprocess(x, { do_pad: false })));
|
|
20132
|
+
for (const { pixel_values } of preprocessed) {
|
|
20133
|
+
const [, height, width] = pixel_values.dims;
|
|
20134
|
+
const img = pixel_values.unsqueeze_(0);
|
|
20135
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
20136
|
+
const f2 = total_factor ** 2;
|
|
20137
|
+
const [new_width, new_height] = smart_resize(
|
|
20138
|
+
Math.max(total_factor, height),
|
|
20139
|
+
Math.max(total_factor, width),
|
|
20140
|
+
total_factor,
|
|
20141
|
+
this.min_image_tokens * f2,
|
|
20142
|
+
this.max_image_tokens * f2
|
|
20143
|
+
).map((x) => Math.max(total_factor, x));
|
|
20144
|
+
let tiles;
|
|
20145
|
+
let num_rows = 1, num_cols = 1;
|
|
20146
|
+
const is_large = this._is_image_too_large(height, width);
|
|
20147
|
+
const do_splitting = this.do_image_splitting && !(this.min_tiles === 1 && this.max_tiles === 1);
|
|
20148
|
+
if (is_large && do_splitting) {
|
|
20149
|
+
const { grid_width, grid_height, target_width, target_height } = this._get_grid_layout(
|
|
20150
|
+
height,
|
|
20151
|
+
width
|
|
20152
|
+
);
|
|
20153
|
+
num_rows = grid_height;
|
|
20154
|
+
num_cols = grid_width;
|
|
20155
|
+
const resized = await interpolate_4d(img, {
|
|
20156
|
+
size: [target_height, target_width]
|
|
20157
|
+
});
|
|
20158
|
+
tiles = [];
|
|
20159
|
+
for (let r = 0; r < grid_height; ++r) {
|
|
20160
|
+
for (let c = 0; c < grid_width; ++c) {
|
|
20161
|
+
const y = r * this.tile_size;
|
|
20162
|
+
const x = c * this.tile_size;
|
|
20163
|
+
tiles.push(resized.slice(null, null, [y, y + this.tile_size], [x, x + this.tile_size]));
|
|
20164
|
+
}
|
|
20165
|
+
}
|
|
20166
|
+
if (this.use_thumbnail && grid_width * grid_height !== 1) {
|
|
20167
|
+
tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
|
|
20168
|
+
}
|
|
20169
|
+
} else {
|
|
20170
|
+
tiles = [await interpolate_4d(img, { size: [new_height, new_width] })];
|
|
20171
|
+
}
|
|
20172
|
+
for (const tile of tiles) {
|
|
20173
|
+
const [, , th, tw] = tile.dims;
|
|
20174
|
+
const patches = convert_image_to_patches(tile, this.encoder_patch_size);
|
|
20175
|
+
const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
|
|
20176
|
+
all_pixel_values.push(padded);
|
|
20177
|
+
all_pixel_masks.push(mask);
|
|
20178
|
+
all_spatial_shapes.push([
|
|
20179
|
+
Math.floor(th / this.encoder_patch_size),
|
|
20180
|
+
Math.floor(tw / this.encoder_patch_size)
|
|
20181
|
+
]);
|
|
20182
|
+
}
|
|
20183
|
+
all_rows.push(num_rows);
|
|
20184
|
+
all_cols.push(num_cols);
|
|
20185
|
+
all_image_sizes.push([new_height, new_width]);
|
|
20186
|
+
}
|
|
20187
|
+
}
|
|
20188
|
+
const result = {
|
|
20189
|
+
pixel_values: cat(all_pixel_values, 0),
|
|
20190
|
+
pixel_attention_mask: stack(all_pixel_masks, 0),
|
|
20191
|
+
spatial_shapes: new Tensor3("int64", BigInt64Array.from(all_spatial_shapes.flat(), BigInt), [
|
|
20192
|
+
all_spatial_shapes.length,
|
|
20193
|
+
2
|
|
20194
|
+
])
|
|
20195
|
+
};
|
|
20196
|
+
if (return_row_col_info ?? this.return_row_col_info) {
|
|
20197
|
+
result.image_rows = all_rows;
|
|
20198
|
+
result.image_cols = all_cols;
|
|
20199
|
+
result.image_sizes = all_image_sizes;
|
|
20200
|
+
}
|
|
20201
|
+
return result;
|
|
20202
|
+
}
|
|
20203
|
+
};
|
|
20204
|
+
|
|
19437
20205
|
// src/models/llava_onevision/image_processing_llava_onevision.js
|
|
19438
20206
|
var LlavaOnevisionImageProcessor = class extends ImageProcessor {
|
|
19439
20207
|
};
|
|
@@ -19656,76 +20424,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
|
|
|
19656
20424
|
var PvtImageProcessor = class extends ImageProcessor {
|
|
19657
20425
|
};
|
|
19658
20426
|
|
|
19659
|
-
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
19660
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
19661
|
-
if (height < factor || width < factor) {
|
|
19662
|
-
throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
|
|
19663
|
-
} else if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
19664
|
-
throw new Error(
|
|
19665
|
-
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
19666
|
-
);
|
|
19667
|
-
}
|
|
19668
|
-
let h_bar = Math.round(height / factor) * factor;
|
|
19669
|
-
let w_bar = Math.round(width / factor) * factor;
|
|
19670
|
-
if (h_bar * w_bar > max_pixels) {
|
|
19671
|
-
const beta = Math.sqrt(height * width / max_pixels);
|
|
19672
|
-
h_bar = Math.floor(height / beta / factor) * factor;
|
|
19673
|
-
w_bar = Math.floor(width / beta / factor) * factor;
|
|
19674
|
-
} else if (h_bar * w_bar < min_pixels) {
|
|
19675
|
-
const beta = Math.sqrt(min_pixels / (height * width));
|
|
19676
|
-
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
19677
|
-
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
19678
|
-
}
|
|
19679
|
-
return [h_bar, w_bar];
|
|
19680
|
-
}
|
|
19681
|
-
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
19682
|
-
constructor(config) {
|
|
19683
|
-
super(config);
|
|
19684
|
-
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
19685
|
-
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
19686
|
-
this.patch_size = config.patch_size;
|
|
19687
|
-
this.merge_size = config.merge_size;
|
|
19688
|
-
}
|
|
19689
|
-
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
19690
|
-
get_resize_output_image_size(image, size) {
|
|
19691
|
-
const factor = this.patch_size * this.merge_size;
|
|
19692
|
-
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
19693
|
-
}
|
|
19694
|
-
async _call(images, ...args) {
|
|
19695
|
-
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
19696
|
-
let patches = pixel_values;
|
|
19697
|
-
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
19698
|
-
if (patches.dims[0] === 1) {
|
|
19699
|
-
patches = cat(
|
|
19700
|
-
Array.from({ length: temporal_patch_size }, () => patches),
|
|
19701
|
-
0
|
|
19702
|
-
);
|
|
19703
|
-
}
|
|
19704
|
-
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
19705
|
-
const channel = patches.dims[1];
|
|
19706
|
-
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
19707
|
-
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
19708
|
-
const flatten_patches = patches.view(
|
|
19709
|
-
grid_t,
|
|
19710
|
-
temporal_patch_size,
|
|
19711
|
-
channel,
|
|
19712
|
-
Math.floor(grid_h / merge_size),
|
|
19713
|
-
merge_size,
|
|
19714
|
-
patch_size,
|
|
19715
|
-
Math.floor(grid_w / merge_size),
|
|
19716
|
-
merge_size,
|
|
19717
|
-
patch_size
|
|
19718
|
-
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
19719
|
-
const image_grid_thw = new Tensor3("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
19720
|
-
return {
|
|
19721
|
-
pixel_values: flatten_patches,
|
|
19722
|
-
image_grid_thw,
|
|
19723
|
-
original_sizes,
|
|
19724
|
-
reshaped_input_sizes
|
|
19725
|
-
};
|
|
19726
|
-
}
|
|
19727
|
-
};
|
|
19728
|
-
|
|
19729
20427
|
// src/models/rt_detr/image_processing_rt_detr.js
|
|
19730
20428
|
var RTDetrImageProcessor = class extends ImageProcessor {
|
|
19731
20429
|
/** @type {typeof post_process_object_detection} */
|
|
@@ -20279,6 +20977,107 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
20279
20977
|
}
|
|
20280
20978
|
};
|
|
20281
20979
|
|
|
20980
|
+
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
20981
|
+
var Qwen2VLProcessor = class extends Processor {
|
|
20982
|
+
static image_processor_class = AutoImageProcessor;
|
|
20983
|
+
static tokenizer_class = AutoTokenizer;
|
|
20984
|
+
static image_token = "<|image_pad|>";
|
|
20985
|
+
/**
|
|
20986
|
+
*
|
|
20987
|
+
* @param {string|string[]} text
|
|
20988
|
+
* @param {RawImage|RawImage[]} images
|
|
20989
|
+
* @param {...any} args
|
|
20990
|
+
* @returns {Promise<any>}
|
|
20991
|
+
*/
|
|
20992
|
+
async _call(text, images = null, ...args) {
|
|
20993
|
+
if (!Array.isArray(text)) {
|
|
20994
|
+
text = [text];
|
|
20995
|
+
}
|
|
20996
|
+
let image_inputs, image_grid_thw;
|
|
20997
|
+
if (images) {
|
|
20998
|
+
image_inputs = await this.image_processor(images);
|
|
20999
|
+
image_grid_thw = image_inputs.image_grid_thw;
|
|
21000
|
+
}
|
|
21001
|
+
if (image_grid_thw) {
|
|
21002
|
+
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
21003
|
+
let index = 0;
|
|
21004
|
+
const image_token = (
|
|
21005
|
+
/** @type {typeof Qwen2VLProcessor} */
|
|
21006
|
+
this.constructor.image_token
|
|
21007
|
+
);
|
|
21008
|
+
const image_grid_thw_list = image_grid_thw.tolist();
|
|
21009
|
+
text = text.map((t) => {
|
|
21010
|
+
while (t.includes(image_token)) {
|
|
21011
|
+
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
21012
|
+
t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
21013
|
+
}
|
|
21014
|
+
return t.replaceAll("<|placeholder|>", image_token);
|
|
21015
|
+
});
|
|
21016
|
+
}
|
|
21017
|
+
const text_inputs = this.tokenizer(text);
|
|
21018
|
+
return {
|
|
21019
|
+
...text_inputs,
|
|
21020
|
+
...image_inputs
|
|
21021
|
+
};
|
|
21022
|
+
}
|
|
21023
|
+
};
|
|
21024
|
+
|
|
21025
|
+
// src/models/glm46v/processing_glm46v.js
|
|
21026
|
+
var Glm46VProcessor = class extends Qwen2VLProcessor {
|
|
21027
|
+
static image_token = "<|image|>";
|
|
21028
|
+
};
|
|
21029
|
+
|
|
21030
|
+
// src/models/granite_speech/processing_granite_speech.js
|
|
21031
|
+
var GraniteSpeechProcessor = class extends Processor {
|
|
21032
|
+
static tokenizer_class = AutoTokenizer;
|
|
21033
|
+
static feature_extractor_class = AutoFeatureExtractor;
|
|
21034
|
+
static uses_processor_config = true;
|
|
21035
|
+
/**
|
|
21036
|
+
* Compute the number of audio tokens for a given raw audio length.
|
|
21037
|
+
* @param {number} audioLength Raw audio sample count.
|
|
21038
|
+
* @returns {number} Number of projector output tokens.
|
|
21039
|
+
*/
|
|
21040
|
+
_get_num_audio_features(audioLength) {
|
|
21041
|
+
const { hop_length } = this.feature_extractor.config.melspec_kwargs;
|
|
21042
|
+
const { projector_window_size, projector_downsample_rate } = this.feature_extractor.config;
|
|
21043
|
+
const effective_window_size = Math.floor(projector_window_size / projector_downsample_rate);
|
|
21044
|
+
const mel_length = Math.floor(audioLength / hop_length) + 1;
|
|
21045
|
+
const encoder_length = Math.floor(mel_length / 2);
|
|
21046
|
+
const nblocks = Math.ceil(encoder_length / projector_window_size);
|
|
21047
|
+
return nblocks * effective_window_size;
|
|
21048
|
+
}
|
|
21049
|
+
/**
|
|
21050
|
+
* @param {string} text The text input to process.
|
|
21051
|
+
* @param {Float32Array} audio The audio input to process.
|
|
21052
|
+
*/
|
|
21053
|
+
async _call(text, audio = null, kwargs = {}) {
|
|
21054
|
+
if (Array.isArray(text)) {
|
|
21055
|
+
throw new Error("Batched inputs are not supported yet.");
|
|
21056
|
+
}
|
|
21057
|
+
let audio_inputs = {};
|
|
21058
|
+
if (audio) {
|
|
21059
|
+
const { input_features } = await this.feature_extractor(audio);
|
|
21060
|
+
audio_inputs["input_features"] = input_features;
|
|
21061
|
+
const audio_embed_size = this._get_num_audio_features(audio.length);
|
|
21062
|
+
const mask_data = new Uint8Array(audio_embed_size).fill(1);
|
|
21063
|
+
audio_inputs["input_features_mask"] = new Tensor3("bool", mask_data, [1, audio_embed_size]);
|
|
21064
|
+
const audio_token = this.config.audio_token ?? "<|audio|>";
|
|
21065
|
+
if (!text.includes(audio_token)) {
|
|
21066
|
+
throw new Error(`The input text does not contain the audio token ${audio_token}.`);
|
|
21067
|
+
}
|
|
21068
|
+
text = text.replaceAll(audio_token, audio_token.repeat(audio_embed_size));
|
|
21069
|
+
}
|
|
21070
|
+
const text_inputs = this.tokenizer(text, {
|
|
21071
|
+
add_special_tokens: false,
|
|
21072
|
+
...kwargs
|
|
21073
|
+
});
|
|
21074
|
+
return {
|
|
21075
|
+
...text_inputs,
|
|
21076
|
+
...audio_inputs
|
|
21077
|
+
};
|
|
21078
|
+
}
|
|
21079
|
+
};
|
|
21080
|
+
|
|
20282
21081
|
// src/models/grounding_dino/processing_grounding_dino.js
|
|
20283
21082
|
function get_phrases_from_posmap(posmaps, input_ids) {
|
|
20284
21083
|
const left_idx = 0;
|
|
@@ -20555,6 +21354,66 @@ var JinaCLIPProcessor = class extends Processor {
|
|
|
20555
21354
|
}
|
|
20556
21355
|
};
|
|
20557
21356
|
|
|
21357
|
+
// src/models/lfm2_vl/processing_lfm2_vl.js
|
|
21358
|
+
var Lfm2VlProcessor = class extends Processor {
|
|
21359
|
+
static tokenizer_class = AutoTokenizer;
|
|
21360
|
+
static image_processor_class = AutoImageProcessor;
|
|
21361
|
+
/**
|
|
21362
|
+
* @param {RawImage|RawImage[]} images
|
|
21363
|
+
* @param {string|string[]|null} [text]
|
|
21364
|
+
* @param {Record<string, any>} [kwargs]
|
|
21365
|
+
*/
|
|
21366
|
+
async _call(images, text = null, kwargs = {}) {
|
|
21367
|
+
const { image_rows, image_cols, image_sizes, ...image_inputs } = await this.image_processor(images, {
|
|
21368
|
+
...kwargs,
|
|
21369
|
+
return_row_col_info: true
|
|
21370
|
+
});
|
|
21371
|
+
if (text) {
|
|
21372
|
+
const image_token = this.config.image_token ?? "<image>";
|
|
21373
|
+
const {
|
|
21374
|
+
tile_size = 512,
|
|
21375
|
+
downsample_factor = 2,
|
|
21376
|
+
encoder_patch_size = 16,
|
|
21377
|
+
use_thumbnail = true
|
|
21378
|
+
} = (
|
|
21379
|
+
/** @type {Record<string, any>} */
|
|
21380
|
+
this.image_processor.config
|
|
21381
|
+
);
|
|
21382
|
+
const ds2 = (s) => Math.ceil(Math.floor(s / encoder_patch_size) / downsample_factor);
|
|
21383
|
+
const tokens_per_tile = ds2(tile_size) ** 2;
|
|
21384
|
+
const image_start = this.config.image_start_token ?? "<|image_start|>";
|
|
21385
|
+
const image_end = this.config.image_end_token ?? "<|image_end|>";
|
|
21386
|
+
const thumbnail_token = this.config.image_thumbnail ?? "<|img_thumbnail|>";
|
|
21387
|
+
if (!Array.isArray(text)) text = [text];
|
|
21388
|
+
let image_idx = 0;
|
|
21389
|
+
text = text.map((sample) => {
|
|
21390
|
+
const parts = sample.split(image_token);
|
|
21391
|
+
return parts[0] + parts.slice(1).map((part) => {
|
|
21392
|
+
const idx = image_idx++;
|
|
21393
|
+
const [h, w] = image_sizes[idx];
|
|
21394
|
+
const rows = image_rows[idx], cols = image_cols[idx];
|
|
21395
|
+
const tokens_for_image = ds2(h) * ds2(w);
|
|
21396
|
+
let expanded = image_start;
|
|
21397
|
+
if (rows > 1 || cols > 1) {
|
|
21398
|
+
const tile_str = image_token.repeat(tokens_per_tile);
|
|
21399
|
+
for (let r = 0; r < rows; ++r)
|
|
21400
|
+
for (let c = 0; c < cols; ++c)
|
|
21401
|
+
expanded += `<|img_row_${r + 1}_col_${c + 1}|>` + tile_str;
|
|
21402
|
+
if (use_thumbnail) expanded += thumbnail_token + image_token.repeat(tokens_for_image);
|
|
21403
|
+
} else {
|
|
21404
|
+
expanded += image_token.repeat(tokens_for_image);
|
|
21405
|
+
}
|
|
21406
|
+
return expanded + image_end + part;
|
|
21407
|
+
}).join("");
|
|
21408
|
+
});
|
|
21409
|
+
}
|
|
21410
|
+
return {
|
|
21411
|
+
...image_inputs,
|
|
21412
|
+
...text ? this.tokenizer(text, kwargs) : {}
|
|
21413
|
+
};
|
|
21414
|
+
}
|
|
21415
|
+
};
|
|
21416
|
+
|
|
20558
21417
|
// src/models/llava/processing_llava.js
|
|
20559
21418
|
var LlavaProcessor = class extends Processor {
|
|
20560
21419
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -20898,47 +21757,6 @@ var PyAnnoteProcessor = class extends Processor {
|
|
|
20898
21757
|
}
|
|
20899
21758
|
};
|
|
20900
21759
|
|
|
20901
|
-
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
20902
|
-
var Qwen2VLProcessor = class extends Processor {
|
|
20903
|
-
static image_processor_class = AutoImageProcessor;
|
|
20904
|
-
static tokenizer_class = AutoTokenizer;
|
|
20905
|
-
/**
|
|
20906
|
-
*
|
|
20907
|
-
* @param {string|string[]} text
|
|
20908
|
-
* @param {RawImage|RawImage[]} images
|
|
20909
|
-
* @param {...any} args
|
|
20910
|
-
* @returns {Promise<any>}
|
|
20911
|
-
*/
|
|
20912
|
-
async _call(text, images = null, ...args) {
|
|
20913
|
-
if (!Array.isArray(text)) {
|
|
20914
|
-
text = [text];
|
|
20915
|
-
}
|
|
20916
|
-
let image_inputs, image_grid_thw;
|
|
20917
|
-
if (images) {
|
|
20918
|
-
image_inputs = await this.image_processor(images);
|
|
20919
|
-
image_grid_thw = image_inputs.image_grid_thw;
|
|
20920
|
-
}
|
|
20921
|
-
if (image_grid_thw) {
|
|
20922
|
-
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
20923
|
-
let index = 0;
|
|
20924
|
-
const image_grid_thw_list = image_grid_thw.tolist();
|
|
20925
|
-
text = text.map((t) => {
|
|
20926
|
-
while (t.includes("<|image_pad|>")) {
|
|
20927
|
-
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
20928
|
-
t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
20929
|
-
}
|
|
20930
|
-
return t.replaceAll("<|placeholder|>", "<|image_pad|>");
|
|
20931
|
-
});
|
|
20932
|
-
}
|
|
20933
|
-
const text_inputs = this.tokenizer(text);
|
|
20934
|
-
return {
|
|
20935
|
-
...text_inputs,
|
|
20936
|
-
...image_inputs
|
|
20937
|
-
// TODO: ...videos_inputs,
|
|
20938
|
-
};
|
|
20939
|
-
}
|
|
20940
|
-
};
|
|
20941
|
-
|
|
20942
21760
|
// src/models/qwen2_5_vl/processing_qwen2_5_vl.js
|
|
20943
21761
|
var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
|
|
20944
21762
|
};
|
|
@@ -21087,6 +21905,94 @@ var VoxtralProcessor = class extends Processor {
|
|
|
21087
21905
|
}
|
|
21088
21906
|
};
|
|
21089
21907
|
|
|
21908
|
+
// src/models/voxtral_realtime/processing_voxtral_realtime.js
|
|
21909
|
+
var NUM_LEFT_PAD_TOKENS = 32;
|
|
21910
|
+
var NUM_DELAY_TOKENS = 6;
|
|
21911
|
+
var AUDIO_LENGTH_PER_TOK = 8;
|
|
21912
|
+
var OFFLINE_STREAMING_BUFFER_TOKENS = 10;
|
|
21913
|
+
var STREAMING_PAD_TOKEN_ID = 32;
|
|
21914
|
+
var VoxtralRealtimeProcessor = class extends Processor {
|
|
21915
|
+
static tokenizer_class = AutoTokenizer;
|
|
21916
|
+
static feature_extractor_class = AutoFeatureExtractor;
|
|
21917
|
+
static uses_processor_config = false;
|
|
21918
|
+
/** Number of mel frames in the first audio chunk. */
|
|
21919
|
+
get num_mel_frames_first_audio_chunk() {
|
|
21920
|
+
return (NUM_DELAY_TOKENS + 1) * AUDIO_LENGTH_PER_TOK;
|
|
21921
|
+
}
|
|
21922
|
+
/** Number of raw audio samples in the first audio chunk. */
|
|
21923
|
+
get num_samples_first_audio_chunk() {
|
|
21924
|
+
const { hop_length, n_fft } = this.feature_extractor.config;
|
|
21925
|
+
return (this.num_mel_frames_first_audio_chunk - 1) * hop_length + Math.floor(n_fft / 2);
|
|
21926
|
+
}
|
|
21927
|
+
/** Number of raw audio samples per subsequent audio chunk. */
|
|
21928
|
+
get num_samples_per_audio_chunk() {
|
|
21929
|
+
const { hop_length, n_fft } = this.feature_extractor.config;
|
|
21930
|
+
return AUDIO_LENGTH_PER_TOK * hop_length + n_fft;
|
|
21931
|
+
}
|
|
21932
|
+
/** Number of right-pad tokens for non-streaming mode. */
|
|
21933
|
+
get num_right_pad_tokens() {
|
|
21934
|
+
return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
|
|
21935
|
+
}
|
|
21936
|
+
/** Number of mel frames per text token. */
|
|
21937
|
+
get audio_length_per_tok() {
|
|
21938
|
+
return AUDIO_LENGTH_PER_TOK;
|
|
21939
|
+
}
|
|
21940
|
+
/** Number of raw audio samples per token. */
|
|
21941
|
+
get raw_audio_length_per_tok() {
|
|
21942
|
+
return AUDIO_LENGTH_PER_TOK * this.feature_extractor.config.hop_length;
|
|
21943
|
+
}
|
|
21944
|
+
/**
|
|
21945
|
+
* Process audio input for VoxtralRealtime.
|
|
21946
|
+
*
|
|
21947
|
+
* In streaming mode with `is_first_audio_chunk=true`, the audio is left-padded
|
|
21948
|
+
* with silence and mel features are extracted with `center=true`.
|
|
21949
|
+
* Returns `{ input_ids, input_features }`.
|
|
21950
|
+
*
|
|
21951
|
+
* In streaming mode with `is_first_audio_chunk=false`, the audio chunk is
|
|
21952
|
+
* processed with `center=false` and only `{ input_features }` is returned.
|
|
21953
|
+
*
|
|
21954
|
+
* In non-streaming mode, the audio is right-padded to ensure the model
|
|
21955
|
+
* transcribes the full audio, then processed with `center=true`.
|
|
21956
|
+
* Returns `{ input_features }`.
|
|
21957
|
+
*
|
|
21958
|
+
* @param {Float32Array|Float64Array} audio The audio waveform.
|
|
21959
|
+
* @param {Object} [options]
|
|
21960
|
+
* @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
|
|
21961
|
+
* @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
|
|
21962
|
+
* @returns {Promise<Object>}
|
|
21963
|
+
*/
|
|
21964
|
+
async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
|
|
21965
|
+
validate_audio_inputs(audio, "VoxtralRealtimeProcessor");
|
|
21966
|
+
if (!is_streaming && !is_first_audio_chunk) {
|
|
21967
|
+
throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");
|
|
21968
|
+
}
|
|
21969
|
+
if (is_first_audio_chunk) {
|
|
21970
|
+
if (is_streaming) {
|
|
21971
|
+
const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
|
|
21972
|
+
const padded_audio = new Float32Array(num_left_pad_samples + audio.length);
|
|
21973
|
+
padded_audio.set(audio, num_left_pad_samples);
|
|
21974
|
+
const audio_encoding = await this.feature_extractor(padded_audio, { center: true });
|
|
21975
|
+
const num_pad_tokens = NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
|
|
21976
|
+
const num_input_tokens = 1 + num_pad_tokens;
|
|
21977
|
+
const input_ids_data = new BigInt64Array(num_input_tokens).fill(BigInt(STREAMING_PAD_TOKEN_ID));
|
|
21978
|
+
input_ids_data[0] = 1n;
|
|
21979
|
+
const input_ids = new Tensor3("int64", input_ids_data, [1, num_input_tokens]);
|
|
21980
|
+
return {
|
|
21981
|
+
input_ids,
|
|
21982
|
+
...audio_encoding
|
|
21983
|
+
};
|
|
21984
|
+
} else {
|
|
21985
|
+
const right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
|
|
21986
|
+
const padded_audio = new Float32Array(audio.length + right_pad_samples);
|
|
21987
|
+
padded_audio.set(audio);
|
|
21988
|
+
return await this.feature_extractor(padded_audio, { center: true });
|
|
21989
|
+
}
|
|
21990
|
+
} else {
|
|
21991
|
+
return await this.feature_extractor(audio, { center: false });
|
|
21992
|
+
}
|
|
21993
|
+
}
|
|
21994
|
+
};
|
|
21995
|
+
|
|
21090
21996
|
// src/models/wav2vec2/processing_wav2vec2.js
|
|
21091
21997
|
var Wav2Vec2Processor = class extends Processor {
|
|
21092
21998
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -21186,11 +22092,16 @@ function getNormalizedConfig(config) {
|
|
|
21186
22092
|
case "florence2":
|
|
21187
22093
|
case "llava_onevision":
|
|
21188
22094
|
case "idefics3":
|
|
22095
|
+
case "granite_speech":
|
|
21189
22096
|
case "ultravox":
|
|
21190
22097
|
case "voxtral":
|
|
22098
|
+
case "voxtral_realtime":
|
|
21191
22099
|
case "smolvlm":
|
|
21192
22100
|
case "gemma3n":
|
|
22101
|
+
case "lfm2_vl":
|
|
21193
22102
|
case "chatterbox":
|
|
22103
|
+
case "lighton_ocr":
|
|
22104
|
+
case "glm_ocr":
|
|
21194
22105
|
case "mistral3":
|
|
21195
22106
|
case "qwen2_5_vl":
|
|
21196
22107
|
case "qwen3_vl":
|
|
@@ -21244,10 +22155,13 @@ function getNormalizedConfig(config) {
|
|
|
21244
22155
|
case "cohere":
|
|
21245
22156
|
case "cohere2":
|
|
21246
22157
|
case "mistral":
|
|
22158
|
+
case "voxtral_realtime_text":
|
|
22159
|
+
case "voxtral_realtime_encoder":
|
|
21247
22160
|
case "starcoder2":
|
|
21248
22161
|
case "qwen2":
|
|
21249
22162
|
case "qwen2_moe":
|
|
21250
22163
|
case "qwen2_vl":
|
|
22164
|
+
case "qwen2_vl_text":
|
|
21251
22165
|
case "qwen2_5_vl_text":
|
|
21252
22166
|
case "qwen3_moe":
|
|
21253
22167
|
case "qwen3_vl_text":
|
|
@@ -21263,6 +22177,8 @@ function getNormalizedConfig(config) {
|
|
|
21263
22177
|
mapping["dim_kv"] = "head_dim";
|
|
21264
22178
|
break;
|
|
21265
22179
|
case "qwen3":
|
|
22180
|
+
case "solar_open":
|
|
22181
|
+
case "glm_ocr_text":
|
|
21266
22182
|
case "gemma":
|
|
21267
22183
|
case "gemma2":
|
|
21268
22184
|
case "vaultgemma":
|
|
@@ -21273,6 +22189,7 @@ function getNormalizedConfig(config) {
|
|
|
21273
22189
|
case "ernie4_5":
|
|
21274
22190
|
case "hunyuan_v1_dense":
|
|
21275
22191
|
case "falcon_h1":
|
|
22192
|
+
case "nemotron_h":
|
|
21276
22193
|
case "ministral":
|
|
21277
22194
|
case "ministral3":
|
|
21278
22195
|
mapping["num_heads"] = "num_key_value_heads";
|
|
@@ -21307,6 +22224,9 @@ function getNormalizedConfig(config) {
|
|
|
21307
22224
|
mapping["num_attention_heads"] = "num_attention_heads";
|
|
21308
22225
|
break;
|
|
21309
22226
|
case "youtu":
|
|
22227
|
+
case "deepseek_v3":
|
|
22228
|
+
case "glm_moe_dsa":
|
|
22229
|
+
case "mistral4":
|
|
21310
22230
|
mapping["num_heads"] = "num_key_value_heads";
|
|
21311
22231
|
mapping["num_layers"] = "num_hidden_layers";
|
|
21312
22232
|
mapping["dim_kv"] = "qk_head_dim";
|
|
@@ -21392,6 +22312,10 @@ function getNormalizedConfig(config) {
|
|
|
21392
22312
|
return normalized_config;
|
|
21393
22313
|
}
|
|
21394
22314
|
function getCacheShapes(config, options) {
|
|
22315
|
+
if (!(config instanceof PretrainedConfig)) {
|
|
22316
|
+
config = new PretrainedConfig(config);
|
|
22317
|
+
}
|
|
22318
|
+
const batch_size = options?.batch_size ?? 1;
|
|
21395
22319
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
21396
22320
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
21397
22321
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -21401,7 +22325,6 @@ function getCacheShapes(config, options) {
|
|
|
21401
22325
|
config
|
|
21402
22326
|
);
|
|
21403
22327
|
const head_dim = hidden_size / num_attention_heads;
|
|
21404
|
-
const batch_size = options?.batch_size ?? 1;
|
|
21405
22328
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
21406
22329
|
if (layer_types[i] === "full_attention") {
|
|
21407
22330
|
for (const kv of ["key", "value"]) {
|
|
@@ -21414,31 +22337,26 @@ function getCacheShapes(config, options) {
|
|
|
21414
22337
|
}
|
|
21415
22338
|
}
|
|
21416
22339
|
return cache_values;
|
|
21417
|
-
} else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
|
|
22340
|
+
} else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
|
|
21418
22341
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
21419
22342
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
21420
|
-
const
|
|
21421
|
-
const {
|
|
21422
|
-
layer_types,
|
|
21423
|
-
num_hidden_layers,
|
|
21424
|
-
num_attention_heads,
|
|
21425
|
-
num_key_value_heads,
|
|
21426
|
-
hidden_size,
|
|
21427
|
-
mamba_d_conv,
|
|
21428
|
-
mamba_n_heads,
|
|
21429
|
-
mamba_d_head,
|
|
21430
|
-
mamba_d_state,
|
|
21431
|
-
mamba_n_groups,
|
|
21432
|
-
mamba_expand,
|
|
21433
|
-
mamba_d_ssm
|
|
21434
|
-
} = (
|
|
22343
|
+
const c = (
|
|
21435
22344
|
/** @type {any} */
|
|
21436
22345
|
config
|
|
21437
22346
|
);
|
|
21438
|
-
const
|
|
21439
|
-
const
|
|
21440
|
-
const
|
|
21441
|
-
|
|
22347
|
+
const layer_types = c.layer_types ?? c.layers_block_type;
|
|
22348
|
+
const num_layers = c.num_hidden_layers ?? layer_types?.length;
|
|
22349
|
+
const num_key_value_heads = c.num_key_value_heads;
|
|
22350
|
+
const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
|
|
22351
|
+
const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
|
|
22352
|
+
const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
|
|
22353
|
+
const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
|
|
22354
|
+
const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
|
|
22355
|
+
const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
|
|
22356
|
+
const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
|
|
22357
|
+
const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
|
|
22358
|
+
const cache_values = {};
|
|
22359
|
+
for (let i = 0; i < num_layers; ++i) {
|
|
21442
22360
|
if (!layer_types || layer_types[i] === "mamba") {
|
|
21443
22361
|
cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
|
|
21444
22362
|
cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
|
|
@@ -21472,7 +22390,6 @@ function getCacheShapes(config, options) {
|
|
|
21472
22390
|
const key_dim = linear_key_head_dim * linear_num_key_heads;
|
|
21473
22391
|
const value_dim = linear_value_head_dim * linear_num_value_heads;
|
|
21474
22392
|
const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
|
|
21475
|
-
const batch_size = options?.batch_size ?? 1;
|
|
21476
22393
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
21477
22394
|
if (layer_types[i] === "full_attention") {
|
|
21478
22395
|
for (const kv of ["key", "value"]) {
|
|
@@ -21498,12 +22415,16 @@ function getCacheShapes(config, options) {
|
|
|
21498
22415
|
}
|
|
21499
22416
|
}
|
|
21500
22417
|
return cache_values;
|
|
21501
|
-
} else if (["qwen3_5", "qwen3_5_moe"].includes(config.model_type)) {
|
|
21502
|
-
|
|
21503
|
-
|
|
21504
|
-
|
|
21505
|
-
|
|
21506
|
-
|
|
22418
|
+
} else if (["lfm2_vl", "qwen3_5", "qwen3_5_moe", "voxtral_realtime"].includes(config.model_type)) {
|
|
22419
|
+
let subConfig;
|
|
22420
|
+
if (config.model_type === "voxtral_realtime" && options?.session_name === "audio_encoder") {
|
|
22421
|
+
subConfig = /** @type {any} */
|
|
22422
|
+
config.audio_config;
|
|
22423
|
+
} else {
|
|
22424
|
+
subConfig = /** @type {any} */
|
|
22425
|
+
config.text_config;
|
|
22426
|
+
}
|
|
22427
|
+
return getCacheShapes(subConfig, options);
|
|
21507
22428
|
}
|
|
21508
22429
|
return getKeyValueShapes(config, options);
|
|
21509
22430
|
}
|
|
@@ -21669,7 +22590,7 @@ async function getModelDataFiles(pretrained_model_name_or_path, fileName, suffix
|
|
|
21669
22590
|
}
|
|
21670
22591
|
|
|
21671
22592
|
// src/models/session.js
|
|
21672
|
-
async function getSession(pretrained_model_name_or_path, fileName, options,
|
|
22593
|
+
async function getSession(pretrained_model_name_or_path, fileName, options, cache_config = false, session_name = void 0) {
|
|
21673
22594
|
let custom_config = options.config?.["transformers.js_config"] ?? {};
|
|
21674
22595
|
const selectedDevice = (
|
|
21675
22596
|
/** @type {import("../utils/devices.js").DeviceType} */
|
|
@@ -21727,9 +22648,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
21727
22648
|
if (externalData.length > 0 && !apis.IS_NODE_ENV) {
|
|
21728
22649
|
session_options.externalData = externalData;
|
|
21729
22650
|
}
|
|
21730
|
-
if (
|
|
22651
|
+
if (cache_config && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
|
|
21731
22652
|
const shapes = getCacheShapes(options.config, {
|
|
21732
|
-
prefix: "present"
|
|
22653
|
+
prefix: "present",
|
|
22654
|
+
session_name
|
|
21733
22655
|
});
|
|
21734
22656
|
if (Object.keys(shapes).length > 0 && !isONNXProxy()) {
|
|
21735
22657
|
const preferredOutputLocation = {};
|
|
@@ -21747,15 +22669,17 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
21747
22669
|
};
|
|
21748
22670
|
return { buffer_or_path, session_options, session_config };
|
|
21749
22671
|
}
|
|
21750
|
-
async function constructSessions(pretrained_model_name_or_path, names, options,
|
|
22672
|
+
async function constructSessions(pretrained_model_name_or_path, names, options, cache_sessions = void 0) {
|
|
21751
22673
|
return Object.fromEntries(
|
|
21752
22674
|
await Promise.all(
|
|
21753
22675
|
Object.keys(names).map(async (name) => {
|
|
22676
|
+
const cache_config = cache_sessions?.[name] ?? false;
|
|
21754
22677
|
const { buffer_or_path, session_options, session_config } = await getSession(
|
|
21755
22678
|
pretrained_model_name_or_path,
|
|
21756
22679
|
names[name],
|
|
21757
22680
|
options,
|
|
21758
|
-
|
|
22681
|
+
cache_config,
|
|
22682
|
+
name
|
|
21759
22683
|
);
|
|
21760
22684
|
const session = await createInferenceSession(buffer_or_path, session_options, session_config);
|
|
21761
22685
|
return [name, session];
|
|
@@ -23055,19 +23979,71 @@ var BeamSearchSampler = class extends LogitsSampler {
|
|
|
23055
23979
|
}
|
|
23056
23980
|
};
|
|
23057
23981
|
|
|
23982
|
+
// src/cache_utils.js
|
|
23983
|
+
var _DynamicCache = class {
|
|
23984
|
+
/**
|
|
23985
|
+
* Create a DynamicCache, optionally pre-populated with entries.
|
|
23986
|
+
* @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
|
|
23987
|
+
*/
|
|
23988
|
+
constructor(entries) {
|
|
23989
|
+
if (!entries) return;
|
|
23990
|
+
for (const key in entries) {
|
|
23991
|
+
if (key in this) {
|
|
23992
|
+
throw new TypeError(`Key "${key}" conflicts with an existing property on DynamicCache`);
|
|
23993
|
+
}
|
|
23994
|
+
const value = entries[key];
|
|
23995
|
+
if (!(value instanceof Tensor3)) {
|
|
23996
|
+
throw new TypeError(`Expected a Tensor for key "${key}", got ${typeof value}`);
|
|
23997
|
+
}
|
|
23998
|
+
this[key] = value;
|
|
23999
|
+
}
|
|
24000
|
+
}
|
|
24001
|
+
/**
|
|
24002
|
+
* Get the cached sequence length. This requires at least one attention cache entry to be present.
|
|
24003
|
+
* @returns {number} The past sequence length.
|
|
24004
|
+
*/
|
|
24005
|
+
get_seq_length() {
|
|
24006
|
+
const self2 = (
|
|
24007
|
+
/** @type {any} */
|
|
24008
|
+
this
|
|
24009
|
+
);
|
|
24010
|
+
for (const name in self2) {
|
|
24011
|
+
if (name.startsWith("past_key_values.")) {
|
|
24012
|
+
return self2[name].dims.at(-2);
|
|
24013
|
+
}
|
|
24014
|
+
}
|
|
24015
|
+
throw new Error("Unable to determine sequence length from the cache.");
|
|
24016
|
+
}
|
|
24017
|
+
/**
|
|
24018
|
+
* Dispose all contained tensors whose data resides on the GPU.
|
|
24019
|
+
* Returns a promise that resolves when all disposals are complete.
|
|
24020
|
+
* @returns {Promise<void>} Promise that resolves when all GPU tensors are disposed.
|
|
24021
|
+
*/
|
|
24022
|
+
async dispose() {
|
|
24023
|
+
const promises = [];
|
|
24024
|
+
for (
|
|
24025
|
+
const t of
|
|
24026
|
+
/** @type {Tensor[]} */
|
|
24027
|
+
Object.values(this)
|
|
24028
|
+
) {
|
|
24029
|
+
if (t.location === "gpu-buffer") {
|
|
24030
|
+
promises.push(t.dispose());
|
|
24031
|
+
}
|
|
24032
|
+
}
|
|
24033
|
+
await Promise.all(promises);
|
|
24034
|
+
}
|
|
24035
|
+
};
|
|
24036
|
+
var DynamicCache = (
|
|
24037
|
+
/** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */
|
|
24038
|
+
/** @type {unknown} */
|
|
24039
|
+
_DynamicCache
|
|
24040
|
+
);
|
|
24041
|
+
|
|
23058
24042
|
// src/models/modeling_utils.js
|
|
23059
24043
|
var MODEL_MAPPING_NAMES = null;
|
|
23060
24044
|
function registerTaskMappings(mappings) {
|
|
23061
24045
|
MODEL_MAPPING_NAMES = mappings;
|
|
23062
24046
|
}
|
|
23063
|
-
function getPastLength(past_key_values) {
|
|
23064
|
-
for (const name in past_key_values) {
|
|
23065
|
-
if (name.startsWith("past_key_values.")) {
|
|
23066
|
-
return past_key_values[name].dims.at(-2);
|
|
23067
|
-
}
|
|
23068
|
-
}
|
|
23069
|
-
return Object.values(past_key_values)[0].dims.at(-2);
|
|
23070
|
-
}
|
|
23071
24047
|
function toI64Tensor(items) {
|
|
23072
24048
|
if (items instanceof Tensor3) {
|
|
23073
24049
|
return items;
|
|
@@ -23108,71 +24084,181 @@ var MODEL_TYPES = {
|
|
|
23108
24084
|
AutoEncoder: 12,
|
|
23109
24085
|
ImageAudioTextToText: 13,
|
|
23110
24086
|
Supertonic: 14,
|
|
23111
|
-
Chatterbox: 15
|
|
24087
|
+
Chatterbox: 15,
|
|
24088
|
+
MultimodalLanguageModelOnly: 16,
|
|
24089
|
+
VoxtralRealtime: 17
|
|
23112
24090
|
};
|
|
23113
24091
|
var MODEL_TYPE_CONFIG = {
|
|
23114
24092
|
[MODEL_TYPES.DecoderOnly]: {
|
|
23115
24093
|
can_generate: true,
|
|
23116
24094
|
forward: decoder_forward,
|
|
23117
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
24095
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
24096
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" }),
|
|
24097
|
+
cache_sessions: { model: true },
|
|
24098
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23118
24099
|
},
|
|
23119
24100
|
[MODEL_TYPES.DecoderOnlyWithoutHead]: {
|
|
23120
24101
|
can_generate: false,
|
|
23121
24102
|
forward: decoder_forward,
|
|
23122
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
24103
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
24104
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
23123
24105
|
},
|
|
23124
24106
|
[MODEL_TYPES.Seq2Seq]: {
|
|
23125
24107
|
can_generate: true,
|
|
23126
24108
|
forward: seq2seq_forward,
|
|
23127
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
24109
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
24110
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
24111
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24112
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23128
24113
|
},
|
|
23129
24114
|
[MODEL_TYPES.Vision2Seq]: {
|
|
23130
24115
|
can_generate: true,
|
|
23131
24116
|
forward: seq2seq_forward,
|
|
23132
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
24117
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
24118
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
24119
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24120
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23133
24121
|
},
|
|
23134
24122
|
[MODEL_TYPES.Musicgen]: {
|
|
23135
24123
|
can_generate: true,
|
|
23136
|
-
forward: seq2seq_forward
|
|
24124
|
+
forward: seq2seq_forward,
|
|
24125
|
+
sessions: () => ({
|
|
24126
|
+
model: "text_encoder",
|
|
24127
|
+
decoder_model_merged: "decoder_model_merged",
|
|
24128
|
+
encodec_decode: "encodec_decode"
|
|
24129
|
+
}),
|
|
24130
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24131
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23137
24132
|
},
|
|
23138
24133
|
[MODEL_TYPES.EncoderDecoder]: {
|
|
23139
24134
|
can_generate: false,
|
|
23140
|
-
forward: seq2seq_forward
|
|
24135
|
+
forward: seq2seq_forward,
|
|
24136
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
24137
|
+
cache_sessions: { decoder_model_merged: true }
|
|
24138
|
+
},
|
|
24139
|
+
[MODEL_TYPES.MaskGeneration]: {
|
|
24140
|
+
sessions: () => ({ model: "vision_encoder", prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder" })
|
|
23141
24141
|
},
|
|
23142
24142
|
[MODEL_TYPES.ImageTextToText]: {
|
|
23143
24143
|
can_generate: true,
|
|
23144
24144
|
forward: image_text_to_text_forward,
|
|
23145
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24145
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24146
|
+
sessions: (config) => {
|
|
24147
|
+
const s = {
|
|
24148
|
+
embed_tokens: "embed_tokens",
|
|
24149
|
+
vision_encoder: "vision_encoder",
|
|
24150
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24151
|
+
};
|
|
24152
|
+
if (config.is_encoder_decoder) s["model"] = "encoder_model";
|
|
24153
|
+
return s;
|
|
24154
|
+
},
|
|
24155
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24156
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23146
24157
|
},
|
|
23147
24158
|
[MODEL_TYPES.AudioTextToText]: {
|
|
23148
24159
|
can_generate: true,
|
|
23149
24160
|
forward: audio_text_to_text_forward,
|
|
23150
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24161
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24162
|
+
sessions: () => ({
|
|
24163
|
+
embed_tokens: "embed_tokens",
|
|
24164
|
+
audio_encoder: "audio_encoder",
|
|
24165
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24166
|
+
}),
|
|
24167
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24168
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23151
24169
|
},
|
|
23152
|
-
[MODEL_TYPES.
|
|
24170
|
+
[MODEL_TYPES.ImageAudioTextToText]: {
|
|
23153
24171
|
can_generate: true,
|
|
23154
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24172
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24173
|
+
sessions: () => ({
|
|
24174
|
+
embed_tokens: "embed_tokens",
|
|
24175
|
+
audio_encoder: "audio_encoder",
|
|
24176
|
+
vision_encoder: "vision_encoder",
|
|
24177
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24178
|
+
}),
|
|
24179
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23155
24180
|
},
|
|
23156
|
-
[MODEL_TYPES.
|
|
24181
|
+
[MODEL_TYPES.Phi3V]: {
|
|
23157
24182
|
can_generate: true,
|
|
23158
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24183
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24184
|
+
sessions: () => ({
|
|
24185
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
24186
|
+
model: "model",
|
|
24187
|
+
vision_encoder: "vision_encoder"
|
|
24188
|
+
}),
|
|
24189
|
+
cache_sessions: { model: true },
|
|
24190
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23159
24191
|
},
|
|
23160
24192
|
[MODEL_TYPES.MultiModality]: {
|
|
23161
|
-
can_generate: true
|
|
24193
|
+
can_generate: true,
|
|
24194
|
+
sessions: () => ({
|
|
24195
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
24196
|
+
model: "language_model",
|
|
24197
|
+
lm_head: "lm_head",
|
|
24198
|
+
gen_head: "gen_head",
|
|
24199
|
+
gen_img_embeds: "gen_img_embeds",
|
|
24200
|
+
image_decode: "image_decode"
|
|
24201
|
+
}),
|
|
24202
|
+
cache_sessions: { model: true },
|
|
24203
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23162
24204
|
},
|
|
23163
24205
|
[MODEL_TYPES.AutoEncoder]: {
|
|
23164
24206
|
can_generate: false,
|
|
23165
|
-
forward: auto_encoder_forward
|
|
24207
|
+
forward: auto_encoder_forward,
|
|
24208
|
+
sessions: () => ({ encoder_model: "encoder_model", decoder_model: "decoder_model" })
|
|
24209
|
+
},
|
|
24210
|
+
[MODEL_TYPES.Supertonic]: {
|
|
24211
|
+
sessions: () => ({
|
|
24212
|
+
text_encoder: "text_encoder",
|
|
24213
|
+
latent_denoiser: "latent_denoiser",
|
|
24214
|
+
voice_decoder: "voice_decoder"
|
|
24215
|
+
})
|
|
23166
24216
|
},
|
|
23167
24217
|
[MODEL_TYPES.Chatterbox]: {
|
|
23168
24218
|
can_generate: true,
|
|
23169
|
-
forward: encoder_forward
|
|
24219
|
+
forward: encoder_forward,
|
|
24220
|
+
sessions: () => ({
|
|
24221
|
+
embed_tokens: "embed_tokens",
|
|
24222
|
+
speech_encoder: "speech_encoder",
|
|
24223
|
+
model: "language_model",
|
|
24224
|
+
conditional_decoder: "conditional_decoder"
|
|
24225
|
+
}),
|
|
24226
|
+
cache_sessions: { model: true },
|
|
24227
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
24228
|
+
},
|
|
24229
|
+
[MODEL_TYPES.MultimodalLanguageModelOnly]: {
|
|
24230
|
+
can_generate: true,
|
|
24231
|
+
forward: image_text_to_text_forward,
|
|
24232
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24233
|
+
sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
|
|
24234
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24235
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
24236
|
+
},
|
|
24237
|
+
[MODEL_TYPES.VoxtralRealtime]: {
|
|
24238
|
+
can_generate: true,
|
|
24239
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
24240
|
+
sessions: () => ({
|
|
24241
|
+
embed_tokens: "embed_tokens",
|
|
24242
|
+
audio_encoder: "audio_encoder",
|
|
24243
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24244
|
+
}),
|
|
24245
|
+
cache_sessions: { decoder_model_merged: true, audio_encoder: true },
|
|
24246
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23170
24247
|
},
|
|
23171
24248
|
default: {
|
|
23172
24249
|
can_generate: false,
|
|
23173
|
-
forward: encoder_forward
|
|
24250
|
+
forward: encoder_forward,
|
|
24251
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
23174
24252
|
}
|
|
23175
24253
|
};
|
|
24254
|
+
function getSessionsConfig(modelType, config, options = {}) {
|
|
24255
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
24256
|
+
return {
|
|
24257
|
+
sessions: typeConfig.sessions(config, options),
|
|
24258
|
+
cache_sessions: typeConfig.cache_sessions,
|
|
24259
|
+
optional_configs: typeConfig.optional_configs
|
|
24260
|
+
};
|
|
24261
|
+
}
|
|
23176
24262
|
var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
|
|
23177
24263
|
var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
|
|
23178
24264
|
var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
|
|
@@ -23258,245 +24344,23 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23258
24344
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
|
|
23259
24345
|
const modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
23260
24346
|
config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
|
|
23261
|
-
|
|
23262
|
-
if (modelType ===
|
|
23263
|
-
|
|
23264
|
-
|
|
23265
|
-
|
|
23266
|
-
{
|
|
23267
|
-
|
|
23268
|
-
},
|
|
23269
|
-
options,
|
|
23270
|
-
"model"
|
|
23271
|
-
),
|
|
23272
|
-
get_optional_configs(
|
|
23273
|
-
pretrained_model_name_or_path,
|
|
23274
|
-
{
|
|
23275
|
-
generation_config: "generation_config.json"
|
|
23276
|
-
},
|
|
23277
|
-
options
|
|
23278
|
-
)
|
|
23279
|
-
]);
|
|
23280
|
-
} else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
|
|
23281
|
-
info = await Promise.all([
|
|
23282
|
-
constructSessions(
|
|
23283
|
-
pretrained_model_name_or_path,
|
|
23284
|
-
{
|
|
23285
|
-
model: "encoder_model",
|
|
23286
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23287
|
-
},
|
|
23288
|
-
options,
|
|
23289
|
-
"decoder_model_merged"
|
|
23290
|
-
),
|
|
23291
|
-
get_optional_configs(
|
|
23292
|
-
pretrained_model_name_or_path,
|
|
23293
|
-
{
|
|
23294
|
-
generation_config: "generation_config.json"
|
|
23295
|
-
},
|
|
23296
|
-
options
|
|
23297
|
-
)
|
|
23298
|
-
]);
|
|
23299
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
23300
|
-
info = await Promise.all([
|
|
23301
|
-
constructSessions(
|
|
23302
|
-
pretrained_model_name_or_path,
|
|
23303
|
-
{
|
|
23304
|
-
model: "vision_encoder",
|
|
23305
|
-
prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder"
|
|
23306
|
-
},
|
|
23307
|
-
options
|
|
23308
|
-
)
|
|
23309
|
-
]);
|
|
23310
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
23311
|
-
info = await Promise.all([
|
|
23312
|
-
constructSessions(
|
|
23313
|
-
pretrained_model_name_or_path,
|
|
23314
|
-
{
|
|
23315
|
-
model: "encoder_model",
|
|
23316
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23317
|
-
},
|
|
23318
|
-
options,
|
|
23319
|
-
"decoder_model_merged"
|
|
23320
|
-
)
|
|
23321
|
-
]);
|
|
23322
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
23323
|
-
const sessions = {
|
|
23324
|
-
embed_tokens: "embed_tokens",
|
|
23325
|
-
vision_encoder: "vision_encoder",
|
|
23326
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23327
|
-
};
|
|
23328
|
-
if (config.is_encoder_decoder) {
|
|
23329
|
-
sessions["model"] = "encoder_model";
|
|
23330
|
-
}
|
|
23331
|
-
info = await Promise.all([
|
|
23332
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
23333
|
-
get_optional_configs(
|
|
23334
|
-
pretrained_model_name_or_path,
|
|
23335
|
-
{
|
|
23336
|
-
generation_config: "generation_config.json"
|
|
23337
|
-
},
|
|
23338
|
-
options
|
|
23339
|
-
)
|
|
23340
|
-
]);
|
|
23341
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
23342
|
-
const sessions = {
|
|
23343
|
-
embed_tokens: "embed_tokens",
|
|
23344
|
-
audio_encoder: "audio_encoder",
|
|
23345
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23346
|
-
};
|
|
23347
|
-
info = await Promise.all([
|
|
23348
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
23349
|
-
get_optional_configs(
|
|
23350
|
-
pretrained_model_name_or_path,
|
|
23351
|
-
{
|
|
23352
|
-
generation_config: "generation_config.json"
|
|
23353
|
-
},
|
|
23354
|
-
options
|
|
23355
|
-
)
|
|
23356
|
-
]);
|
|
23357
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
23358
|
-
const sessions = {
|
|
23359
|
-
embed_tokens: "embed_tokens",
|
|
23360
|
-
audio_encoder: "audio_encoder",
|
|
23361
|
-
vision_encoder: "vision_encoder",
|
|
23362
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23363
|
-
};
|
|
23364
|
-
info = await Promise.all([
|
|
23365
|
-
constructSessions(pretrained_model_name_or_path, sessions, options),
|
|
23366
|
-
get_optional_configs(
|
|
23367
|
-
pretrained_model_name_or_path,
|
|
23368
|
-
{
|
|
23369
|
-
generation_config: "generation_config.json"
|
|
23370
|
-
},
|
|
23371
|
-
options
|
|
23372
|
-
)
|
|
23373
|
-
]);
|
|
23374
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
23375
|
-
info = await Promise.all([
|
|
23376
|
-
constructSessions(
|
|
23377
|
-
pretrained_model_name_or_path,
|
|
23378
|
-
{
|
|
23379
|
-
model: "text_encoder",
|
|
23380
|
-
decoder_model_merged: "decoder_model_merged",
|
|
23381
|
-
encodec_decode: "encodec_decode"
|
|
23382
|
-
},
|
|
23383
|
-
options,
|
|
23384
|
-
"decoder_model_merged"
|
|
23385
|
-
),
|
|
23386
|
-
get_optional_configs(
|
|
23387
|
-
pretrained_model_name_or_path,
|
|
23388
|
-
{
|
|
23389
|
-
generation_config: "generation_config.json"
|
|
23390
|
-
},
|
|
23391
|
-
options
|
|
23392
|
-
)
|
|
23393
|
-
]);
|
|
23394
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
23395
|
-
info = await Promise.all([
|
|
23396
|
-
constructSessions(
|
|
23397
|
-
pretrained_model_name_or_path,
|
|
23398
|
-
{
|
|
23399
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
23400
|
-
model: "language_model",
|
|
23401
|
-
lm_head: "lm_head",
|
|
23402
|
-
gen_head: "gen_head",
|
|
23403
|
-
gen_img_embeds: "gen_img_embeds",
|
|
23404
|
-
image_decode: "image_decode"
|
|
23405
|
-
},
|
|
23406
|
-
options,
|
|
23407
|
-
"model"
|
|
23408
|
-
),
|
|
23409
|
-
get_optional_configs(
|
|
23410
|
-
pretrained_model_name_or_path,
|
|
23411
|
-
{
|
|
23412
|
-
generation_config: "generation_config.json"
|
|
23413
|
-
},
|
|
23414
|
-
options
|
|
23415
|
-
)
|
|
23416
|
-
]);
|
|
23417
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
23418
|
-
info = await Promise.all([
|
|
23419
|
-
constructSessions(
|
|
23420
|
-
pretrained_model_name_or_path,
|
|
23421
|
-
{
|
|
23422
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
23423
|
-
model: "model",
|
|
23424
|
-
vision_encoder: "vision_encoder"
|
|
23425
|
-
},
|
|
23426
|
-
options,
|
|
23427
|
-
"model"
|
|
23428
|
-
),
|
|
23429
|
-
get_optional_configs(
|
|
23430
|
-
pretrained_model_name_or_path,
|
|
23431
|
-
{
|
|
23432
|
-
generation_config: "generation_config.json"
|
|
23433
|
-
},
|
|
23434
|
-
options
|
|
23435
|
-
)
|
|
23436
|
-
]);
|
|
23437
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
23438
|
-
info = await Promise.all([
|
|
23439
|
-
constructSessions(
|
|
23440
|
-
pretrained_model_name_or_path,
|
|
23441
|
-
{
|
|
23442
|
-
embed_tokens: "embed_tokens",
|
|
23443
|
-
speech_encoder: "speech_encoder",
|
|
23444
|
-
model: "language_model",
|
|
23445
|
-
conditional_decoder: "conditional_decoder"
|
|
23446
|
-
},
|
|
23447
|
-
options,
|
|
23448
|
-
"model"
|
|
23449
|
-
),
|
|
23450
|
-
get_optional_configs(
|
|
23451
|
-
pretrained_model_name_or_path,
|
|
23452
|
-
{
|
|
23453
|
-
generation_config: "generation_config.json"
|
|
23454
|
-
},
|
|
23455
|
-
options
|
|
23456
|
-
)
|
|
23457
|
-
]);
|
|
23458
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
23459
|
-
info = await Promise.all([
|
|
23460
|
-
constructSessions(
|
|
23461
|
-
pretrained_model_name_or_path,
|
|
23462
|
-
{
|
|
23463
|
-
encoder_model: "encoder_model",
|
|
23464
|
-
decoder_model: "decoder_model"
|
|
23465
|
-
},
|
|
23466
|
-
options
|
|
23467
|
-
)
|
|
23468
|
-
]);
|
|
23469
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
23470
|
-
info = await Promise.all([
|
|
23471
|
-
constructSessions(
|
|
23472
|
-
pretrained_model_name_or_path,
|
|
23473
|
-
{
|
|
23474
|
-
text_encoder: "text_encoder",
|
|
23475
|
-
latent_denoiser: "latent_denoiser",
|
|
23476
|
-
voice_decoder: "voice_decoder"
|
|
23477
|
-
},
|
|
23478
|
-
options
|
|
23479
|
-
)
|
|
23480
|
-
]);
|
|
23481
|
-
} else {
|
|
23482
|
-
if (modelType === void 0) {
|
|
23483
|
-
const type = modelName ?? config?.model_type;
|
|
23484
|
-
if (type !== "custom") {
|
|
23485
|
-
logger.warn(
|
|
23486
|
-
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
23487
|
-
);
|
|
23488
|
-
}
|
|
24347
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
24348
|
+
if (modelType === void 0) {
|
|
24349
|
+
const type = modelName ?? config?.model_type;
|
|
24350
|
+
if (type !== "custom") {
|
|
24351
|
+
logger.warn(
|
|
24352
|
+
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
24353
|
+
);
|
|
23489
24354
|
}
|
|
23490
|
-
info = await Promise.all([
|
|
23491
|
-
constructSessions(
|
|
23492
|
-
pretrained_model_name_or_path,
|
|
23493
|
-
{
|
|
23494
|
-
model: options.model_file_name ?? "model"
|
|
23495
|
-
},
|
|
23496
|
-
options
|
|
23497
|
-
)
|
|
23498
|
-
]);
|
|
23499
24355
|
}
|
|
24356
|
+
const sessions = typeConfig.sessions(config, options);
|
|
24357
|
+
const promises = [
|
|
24358
|
+
constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
|
|
24359
|
+
];
|
|
24360
|
+
if (typeConfig.optional_configs) {
|
|
24361
|
+
promises.push(get_optional_configs(pretrained_model_name_or_path, typeConfig.optional_configs, options));
|
|
24362
|
+
}
|
|
24363
|
+
const info = await Promise.all(promises);
|
|
23500
24364
|
return new this(config, ...info);
|
|
23501
24365
|
}
|
|
23502
24366
|
/**
|
|
@@ -23695,7 +24559,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23695
24559
|
* @param {Tensor} [params.inputs=null]
|
|
23696
24560
|
* @param {number} [params.bos_token_id=null]
|
|
23697
24561
|
* @param {Record<string, Tensor|number[]>} [params.model_kwargs]
|
|
23698
|
-
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor
|
|
24562
|
+
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor> & {past_key_values?: DynamicCache}, model_input_name: string}} The model-specific inputs for generation.
|
|
23699
24563
|
*/
|
|
23700
24564
|
_prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) {
|
|
23701
24565
|
const model_inputs = pick(model_kwargs, this.forward_params);
|
|
@@ -23936,11 +24800,12 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23936
24800
|
}
|
|
23937
24801
|
}
|
|
23938
24802
|
/**
|
|
23939
|
-
* Returns
|
|
24803
|
+
* Returns a DynamicCache containing past key values from the given decoder results object.
|
|
23940
24804
|
*
|
|
23941
24805
|
* @param {Object} decoderResults The decoder results object.
|
|
23942
|
-
* @param {
|
|
23943
|
-
* @
|
|
24806
|
+
* @param {DynamicCache} pastKeyValues The previous past key values.
|
|
24807
|
+
* @param {boolean} [disposeEncoderPKVs=false] Whether to dispose encoder past key values.
|
|
24808
|
+
* @returns {DynamicCache} A new DynamicCache containing the updated past key values.
|
|
23944
24809
|
*/
|
|
23945
24810
|
getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
|
|
23946
24811
|
const pkvs = /* @__PURE__ */ Object.create(null);
|
|
@@ -23961,7 +24826,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23961
24826
|
}
|
|
23962
24827
|
}
|
|
23963
24828
|
}
|
|
23964
|
-
return pkvs;
|
|
24829
|
+
return new DynamicCache(pkvs);
|
|
23965
24830
|
}
|
|
23966
24831
|
/**
|
|
23967
24832
|
* Returns an object containing attentions from the given model output object.
|
|
@@ -23986,8 +24851,8 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23986
24851
|
/**
|
|
23987
24852
|
* Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values.
|
|
23988
24853
|
*
|
|
23989
|
-
* @param {
|
|
23990
|
-
* @param {
|
|
24854
|
+
* @param {Record<string, any>} decoderFeeds The decoder feeds object to add past key values to.
|
|
24855
|
+
* @param {DynamicCache|null} pastKeyValues The cache containing past key values.
|
|
23991
24856
|
*/
|
|
23992
24857
|
addPastKeyValues(decoderFeeds, pastKeyValues) {
|
|
23993
24858
|
if (pastKeyValues) {
|
|
@@ -24004,14 +24869,29 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
24004
24869
|
}
|
|
24005
24870
|
}
|
|
24006
24871
|
}
|
|
24007
|
-
|
|
24008
|
-
|
|
24872
|
+
/**
|
|
24873
|
+
* Helper function to select valid inputs and run through the appropriate encoder (vision, text, audio) based on the input type.
|
|
24874
|
+
* @param {string} sessionName
|
|
24875
|
+
* @param {Record<string, Tensor>} inputs
|
|
24876
|
+
* @param {string} outputName
|
|
24877
|
+
* @private
|
|
24878
|
+
*/
|
|
24879
|
+
async _encode_input(sessionName, inputs, outputName) {
|
|
24880
|
+
if (!Object.hasOwn(this.sessions, sessionName)) {
|
|
24881
|
+
throw new Error(`Model does not have a ${sessionName} session.`);
|
|
24882
|
+
}
|
|
24883
|
+
const session = this.sessions[sessionName];
|
|
24884
|
+
const output = await sessionRun(session, pick(inputs, session.inputNames));
|
|
24885
|
+
return output[outputName];
|
|
24009
24886
|
}
|
|
24010
|
-
async
|
|
24011
|
-
return
|
|
24887
|
+
async encode_image(inputs) {
|
|
24888
|
+
return this._encode_input("vision_encoder", inputs, "image_features");
|
|
24012
24889
|
}
|
|
24013
|
-
async
|
|
24014
|
-
return
|
|
24890
|
+
async encode_text(inputs) {
|
|
24891
|
+
return this._encode_input("embed_tokens", inputs, "inputs_embeds");
|
|
24892
|
+
}
|
|
24893
|
+
async encode_audio(inputs) {
|
|
24894
|
+
return this._encode_input("audio_encoder", inputs, "audio_features");
|
|
24015
24895
|
}
|
|
24016
24896
|
};
|
|
24017
24897
|
async function seq2seq_forward(self2, model_inputs) {
|
|
@@ -24066,6 +24946,9 @@ async function decoder_forward(self2, model_inputs, is_encoder_decoder = false)
|
|
|
24066
24946
|
const start_index = ["paligemma", "gemma3_text", "gemma3"].includes(self2.config.model_type) ? 1 : 0;
|
|
24067
24947
|
new_model_inputs.position_ids = create_position_ids(new_model_inputs, past_key_values, start_index);
|
|
24068
24948
|
}
|
|
24949
|
+
if (session.inputNames.includes("num_logits_to_keep") && !new_model_inputs.num_logits_to_keep) {
|
|
24950
|
+
new_model_inputs.num_logits_to_keep = new Tensor3("int64", [0n], []);
|
|
24951
|
+
}
|
|
24069
24952
|
self2.addPastKeyValues(new_model_inputs, past_key_values);
|
|
24070
24953
|
const fixed = pick(new_model_inputs, session.inputNames);
|
|
24071
24954
|
return await sessionRun(session, fixed);
|
|
@@ -24074,7 +24957,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24074
24957
|
// Generic parameters:
|
|
24075
24958
|
encode_function,
|
|
24076
24959
|
merge_function,
|
|
24077
|
-
|
|
24960
|
+
modality_input_names,
|
|
24078
24961
|
modality_output_name,
|
|
24079
24962
|
// Produced by the tokenizer/processor:
|
|
24080
24963
|
input_ids = null,
|
|
@@ -24089,32 +24972,34 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24089
24972
|
// Additional parameters
|
|
24090
24973
|
...kwargs
|
|
24091
24974
|
}) {
|
|
24092
|
-
const modality_values = kwargs[modality_input_name];
|
|
24093
24975
|
if (!inputs_embeds) {
|
|
24094
24976
|
inputs_embeds = await self2.encode_text({ input_ids, ...kwargs });
|
|
24095
|
-
|
|
24096
|
-
|
|
24097
|
-
|
|
24098
|
-
|
|
24099
|
-
|
|
24100
|
-
|
|
24101
|
-
|
|
24102
|
-
|
|
24103
|
-
|
|
24104
|
-
inputs_embeds,
|
|
24105
|
-
|
|
24106
|
-
|
|
24107
|
-
|
|
24108
|
-
|
|
24109
|
-
|
|
24110
|
-
|
|
24111
|
-
|
|
24112
|
-
|
|
24113
|
-
|
|
24114
|
-
|
|
24115
|
-
|
|
24116
|
-
|
|
24117
|
-
|
|
24977
|
+
const modality_values = pick(kwargs, modality_input_names);
|
|
24978
|
+
if (Object.keys(modality_values).length > 0) {
|
|
24979
|
+
if (input_ids.dims[1] !== 1) {
|
|
24980
|
+
const modality_features = await encode_function({
|
|
24981
|
+
// Pass the modality values under its expected key.
|
|
24982
|
+
// The caller knows whether this is audio or image.
|
|
24983
|
+
...modality_values,
|
|
24984
|
+
...kwargs
|
|
24985
|
+
});
|
|
24986
|
+
({ inputs_embeds, attention_mask } = merge_function({
|
|
24987
|
+
[modality_output_name]: modality_features,
|
|
24988
|
+
inputs_embeds,
|
|
24989
|
+
input_ids,
|
|
24990
|
+
attention_mask
|
|
24991
|
+
}));
|
|
24992
|
+
} else if (past_key_values && input_ids.dims[1] === 1) {
|
|
24993
|
+
const target_length = input_ids.dims[1];
|
|
24994
|
+
const past_length = past_key_values.get_seq_length();
|
|
24995
|
+
attention_mask = cat(
|
|
24996
|
+
[
|
|
24997
|
+
ones([input_ids.dims[0], past_length]),
|
|
24998
|
+
attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
|
|
24999
|
+
],
|
|
25000
|
+
1
|
|
25001
|
+
);
|
|
25002
|
+
}
|
|
24118
25003
|
}
|
|
24119
25004
|
}
|
|
24120
25005
|
if (!position_ids) {
|
|
@@ -24122,14 +25007,19 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24122
25007
|
// Handle special case for qwen vl models
|
|
24123
25008
|
[
|
|
24124
25009
|
"qwen2_vl",
|
|
25010
|
+
"qwen2_vl_text",
|
|
24125
25011
|
"qwen2_5_vl",
|
|
24126
25012
|
"qwen2_5_vl_text",
|
|
24127
25013
|
"qwen3_vl",
|
|
24128
25014
|
"qwen3_vl_text",
|
|
25015
|
+
"qwen3_vl_moe",
|
|
25016
|
+
"qwen3_vl_moe_text",
|
|
24129
25017
|
"qwen3_5",
|
|
24130
25018
|
"qwen3_5_text",
|
|
24131
25019
|
"qwen3_5_moe",
|
|
24132
|
-
"qwen3_5_moe_text"
|
|
25020
|
+
"qwen3_5_moe_text",
|
|
25021
|
+
"glm_ocr",
|
|
25022
|
+
"glm_ocr_text"
|
|
24133
25023
|
].includes(self2.config.model_type)
|
|
24134
25024
|
) {
|
|
24135
25025
|
const { image_grid_thw, video_grid_thw } = kwargs;
|
|
@@ -24153,7 +25043,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24153
25043
|
async function audio_text_to_text_forward(self2, params) {
|
|
24154
25044
|
return await generic_text_to_text_forward(self2, {
|
|
24155
25045
|
...params,
|
|
24156
|
-
|
|
25046
|
+
modality_input_names: ["audio_values", "input_features"],
|
|
24157
25047
|
modality_output_name: "audio_features",
|
|
24158
25048
|
encode_function: self2.encode_audio.bind(self2),
|
|
24159
25049
|
merge_function: self2._merge_input_ids_with_audio_features.bind(self2)
|
|
@@ -24162,7 +25052,7 @@ async function audio_text_to_text_forward(self2, params) {
|
|
|
24162
25052
|
async function image_text_to_text_forward(self2, params) {
|
|
24163
25053
|
return await generic_text_to_text_forward(self2, {
|
|
24164
25054
|
...params,
|
|
24165
|
-
|
|
25055
|
+
modality_input_names: ["pixel_values"],
|
|
24166
25056
|
modality_output_name: "image_features",
|
|
24167
25057
|
encode_function: self2.encode_image.bind(self2),
|
|
24168
25058
|
merge_function: self2._merge_input_ids_with_image_features.bind(self2)
|
|
@@ -24198,7 +25088,11 @@ function create_position_ids(model_inputs, past_key_values = null, start_index =
|
|
|
24198
25088
|
return position_ids;
|
|
24199
25089
|
}
|
|
24200
25090
|
function decoder_prepare_inputs_for_generation(self2, input_ids, model_inputs, generation_config) {
|
|
24201
|
-
const past_length = model_inputs.past_key_values ?
|
|
25091
|
+
const past_length = model_inputs.past_key_values ? model_inputs.past_key_values.get_seq_length() : 0;
|
|
25092
|
+
const session = self2.sessions["decoder_model_merged"] ?? self2.sessions["model"];
|
|
25093
|
+
if (session?.inputNames.includes("num_logits_to_keep") && !model_inputs.num_logits_to_keep) {
|
|
25094
|
+
model_inputs.num_logits_to_keep = new Tensor3("int64", [1n], []);
|
|
25095
|
+
}
|
|
24202
25096
|
if (!model_inputs.attention_mask) {
|
|
24203
25097
|
let dims;
|
|
24204
25098
|
for (const key of ["input_ids", "inputs_embeds", "position_ids"]) {
|
|
@@ -24349,6 +25243,8 @@ __export(models_exports, {
|
|
|
24349
25243
|
BloomForCausalLM: () => BloomForCausalLM,
|
|
24350
25244
|
BloomModel: () => BloomModel,
|
|
24351
25245
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
25246
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
25247
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
24352
25248
|
CLIPModel: () => CLIPModel,
|
|
24353
25249
|
CLIPPreTrainedModel: () => CLIPPreTrainedModel,
|
|
24354
25250
|
CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
|
|
@@ -24423,6 +25319,9 @@ __export(models_exports, {
|
|
|
24423
25319
|
DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
|
|
24424
25320
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
24425
25321
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
25322
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
25323
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
25324
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
24426
25325
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
24427
25326
|
DeiTModel: () => DeiTModel,
|
|
24428
25327
|
DeiTPreTrainedModel: () => DeiTPreTrainedModel,
|
|
@@ -24468,6 +25367,11 @@ __export(models_exports, {
|
|
|
24468
25367
|
EsmForTokenClassification: () => EsmForTokenClassification,
|
|
24469
25368
|
EsmModel: () => EsmModel,
|
|
24470
25369
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
25370
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
25371
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
25372
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
25373
|
+
EuroBertModel: () => EuroBertModel,
|
|
25374
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
24471
25375
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
24472
25376
|
ExaoneModel: () => ExaoneModel,
|
|
24473
25377
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -24506,6 +25410,7 @@ __export(models_exports, {
|
|
|
24506
25410
|
Gemma3ForCausalLM: () => Gemma3ForCausalLM,
|
|
24507
25411
|
Gemma3Model: () => Gemma3Model,
|
|
24508
25412
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
25413
|
+
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
24509
25414
|
Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
|
|
24510
25415
|
Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
|
|
24511
25416
|
GemmaForCausalLM: () => GemmaForCausalLM,
|
|
@@ -24513,6 +25418,10 @@ __export(models_exports, {
|
|
|
24513
25418
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
24514
25419
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
24515
25420
|
GlmModel: () => GlmModel,
|
|
25421
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
25422
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
25423
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
25424
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
24516
25425
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
24517
25426
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
24518
25427
|
GptOssModel: () => GptOssModel,
|
|
@@ -24523,6 +25432,7 @@ __export(models_exports, {
|
|
|
24523
25432
|
GraniteMoeHybridModel: () => GraniteMoeHybridModel,
|
|
24524
25433
|
GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
|
|
24525
25434
|
GranitePreTrainedModel: () => GranitePreTrainedModel,
|
|
25435
|
+
GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
|
|
24526
25436
|
GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
|
|
24527
25437
|
GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
|
|
24528
25438
|
GroupViTModel: () => GroupViTModel,
|
|
@@ -24544,7 +25454,6 @@ __export(models_exports, {
|
|
|
24544
25454
|
IJepaModel: () => IJepaModel,
|
|
24545
25455
|
IJepaPreTrainedModel: () => IJepaPreTrainedModel,
|
|
24546
25456
|
Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
|
|
24547
|
-
Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
|
|
24548
25457
|
JAISLMHeadModel: () => JAISLMHeadModel,
|
|
24549
25458
|
JAISModel: () => JAISModel,
|
|
24550
25459
|
JAISPreTrainedModel: () => JAISPreTrainedModel,
|
|
@@ -24558,6 +25467,8 @@ __export(models_exports, {
|
|
|
24558
25467
|
Lfm2MoeModel: () => Lfm2MoeModel,
|
|
24559
25468
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
24560
25469
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
25470
|
+
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
25471
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
24561
25472
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
24562
25473
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
24563
25474
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -24607,6 +25518,9 @@ __export(models_exports, {
|
|
|
24607
25518
|
MimiEncoderOutput: () => MimiEncoderOutput,
|
|
24608
25519
|
MimiModel: () => MimiModel,
|
|
24609
25520
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
25521
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
25522
|
+
Mistral4Model: () => Mistral4Model,
|
|
25523
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
24610
25524
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
24611
25525
|
MistralModel: () => MistralModel,
|
|
24612
25526
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -24664,6 +25578,9 @@ __export(models_exports, {
|
|
|
24664
25578
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
24665
25579
|
NanoChatModel: () => NanoChatModel,
|
|
24666
25580
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
25581
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
25582
|
+
NemotronHModel: () => NemotronHModel,
|
|
25583
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
24667
25584
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
24668
25585
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
24669
25586
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -24697,7 +25614,6 @@ __export(models_exports, {
|
|
|
24697
25614
|
Owlv2Model: () => Owlv2Model,
|
|
24698
25615
|
Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
|
|
24699
25616
|
PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
|
|
24700
|
-
PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
|
|
24701
25617
|
ParakeetForCTC: () => ParakeetForCTC,
|
|
24702
25618
|
ParakeetPreTrainedModel: () => ParakeetPreTrainedModel,
|
|
24703
25619
|
PatchTSMixerForPrediction: () => PatchTSMixerForPrediction,
|
|
@@ -24727,8 +25643,10 @@ __export(models_exports, {
|
|
|
24727
25643
|
Qwen2MoeModel: () => Qwen2MoeModel,
|
|
24728
25644
|
Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
|
|
24729
25645
|
Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
|
|
25646
|
+
Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
|
|
24730
25647
|
Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
|
|
24731
25648
|
Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
|
|
25649
|
+
Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
|
|
24732
25650
|
Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
|
|
24733
25651
|
Qwen3ForCausalLM: () => Qwen3ForCausalLM,
|
|
24734
25652
|
Qwen3Model: () => Qwen3Model,
|
|
@@ -24739,9 +25657,13 @@ __export(models_exports, {
|
|
|
24739
25657
|
Qwen3NextModel: () => Qwen3NextModel,
|
|
24740
25658
|
Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
|
|
24741
25659
|
Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
|
|
25660
|
+
Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
|
|
24742
25661
|
Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
|
|
25662
|
+
Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
|
|
24743
25663
|
Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
|
|
25664
|
+
Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
|
|
24744
25665
|
Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
|
|
25666
|
+
Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
|
|
24745
25667
|
Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
|
|
24746
25668
|
RFDetrForObjectDetection: () => RFDetrForObjectDetection,
|
|
24747
25669
|
RFDetrModel: () => RFDetrModel,
|
|
@@ -24792,11 +25714,13 @@ __export(models_exports, {
|
|
|
24792
25714
|
SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
|
|
24793
25715
|
SmolLM3Model: () => SmolLM3Model,
|
|
24794
25716
|
SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
|
|
24795
|
-
SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
|
|
24796
25717
|
SnacDecoderModel: () => SnacDecoderModel,
|
|
24797
25718
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
24798
25719
|
SnacModel: () => SnacModel,
|
|
24799
25720
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
25721
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
25722
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
25723
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
24800
25724
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
24801
25725
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
24802
25726
|
SpeechT5HifiGan: () => SpeechT5HifiGan,
|
|
@@ -24864,6 +25788,8 @@ __export(models_exports, {
|
|
|
24864
25788
|
VitsModelOutput: () => VitsModelOutput,
|
|
24865
25789
|
VitsPreTrainedModel: () => VitsPreTrainedModel,
|
|
24866
25790
|
VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
|
|
25791
|
+
VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
|
|
25792
|
+
VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
|
|
24867
25793
|
Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
|
|
24868
25794
|
Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
|
|
24869
25795
|
Wav2Vec2BertModel: () => Wav2Vec2BertModel,
|
|
@@ -24969,7 +25895,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
|
|
|
24969
25895
|
var ArceeForCausalLM = class extends ArceePreTrainedModel {
|
|
24970
25896
|
};
|
|
24971
25897
|
|
|
24972
|
-
// src/models/
|
|
25898
|
+
// src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
|
|
24973
25899
|
var ASTPreTrainedModel = class extends PreTrainedModel {
|
|
24974
25900
|
};
|
|
24975
25901
|
var ASTModel = class extends ASTPreTrainedModel {
|
|
@@ -25224,7 +26150,7 @@ var ChatterboxModel = class extends ChatterboxPreTrainedModel {
|
|
|
25224
26150
|
if (!past_key_values || target_length !== 1) {
|
|
25225
26151
|
throw new Error("Incorrect state encountered during generation.");
|
|
25226
26152
|
}
|
|
25227
|
-
const past_length =
|
|
26153
|
+
const past_length = past_key_values.get_seq_length();
|
|
25228
26154
|
attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
|
|
25229
26155
|
}
|
|
25230
26156
|
}
|
|
@@ -25304,6 +26230,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
|
|
|
25304
26230
|
var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
|
|
25305
26231
|
};
|
|
25306
26232
|
|
|
26233
|
+
// src/models/chmv2/modeling_chmv2.js
|
|
26234
|
+
var CHMv2PreTrainedModel = class extends PreTrainedModel {
|
|
26235
|
+
};
|
|
26236
|
+
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
|
|
26237
|
+
};
|
|
26238
|
+
|
|
25307
26239
|
// src/models/clap/modeling_clap.js
|
|
25308
26240
|
var ClapPreTrainedModel = class extends PreTrainedModel {
|
|
25309
26241
|
};
|
|
@@ -25642,6 +26574,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
|
|
|
25642
26574
|
}
|
|
25643
26575
|
};
|
|
25644
26576
|
|
|
26577
|
+
// src/models/deepseek_v3/modeling_deepseek_v3.js
|
|
26578
|
+
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
|
|
26579
|
+
};
|
|
26580
|
+
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
|
|
26581
|
+
};
|
|
26582
|
+
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
|
|
26583
|
+
};
|
|
26584
|
+
|
|
25645
26585
|
// src/models/deberta_v2/modeling_deberta_v2.js
|
|
25646
26586
|
var DebertaV2PreTrainedModel = class extends PreTrainedModel {
|
|
25647
26587
|
};
|
|
@@ -25990,6 +26930,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
|
|
|
25990
26930
|
}
|
|
25991
26931
|
};
|
|
25992
26932
|
|
|
26933
|
+
// src/models/eurobert/modeling_eurobert.js
|
|
26934
|
+
var EuroBertPreTrainedModel = class extends PreTrainedModel {
|
|
26935
|
+
};
|
|
26936
|
+
var EuroBertModel = class extends EuroBertPreTrainedModel {
|
|
26937
|
+
};
|
|
26938
|
+
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
|
|
26939
|
+
/**
|
|
26940
|
+
* Calls the model on new inputs.
|
|
26941
|
+
*
|
|
26942
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
26943
|
+
* @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
|
|
26944
|
+
*/
|
|
26945
|
+
async _call(model_inputs) {
|
|
26946
|
+
return new MaskedLMOutput(await super._call(model_inputs));
|
|
26947
|
+
}
|
|
26948
|
+
};
|
|
26949
|
+
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
|
|
26950
|
+
/**
|
|
26951
|
+
* Calls the model on new inputs.
|
|
26952
|
+
*
|
|
26953
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
26954
|
+
* @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
|
|
26955
|
+
*/
|
|
26956
|
+
async _call(model_inputs) {
|
|
26957
|
+
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
26958
|
+
}
|
|
26959
|
+
};
|
|
26960
|
+
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
|
|
26961
|
+
/**
|
|
26962
|
+
* Calls the model on new inputs.
|
|
26963
|
+
*
|
|
26964
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
26965
|
+
* @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
|
|
26966
|
+
*/
|
|
26967
|
+
async _call(model_inputs) {
|
|
26968
|
+
return new TokenClassifierOutput(await super._call(model_inputs));
|
|
26969
|
+
}
|
|
26970
|
+
};
|
|
26971
|
+
|
|
25993
26972
|
// src/models/exaone/modeling_exaone.js
|
|
25994
26973
|
var ExaonePreTrainedModel = class extends PreTrainedModel {
|
|
25995
26974
|
};
|
|
@@ -26254,6 +27233,8 @@ var Gemma3nForConditionalGeneration = class extends Gemma3nPreTrainedModel {
|
|
|
26254
27233
|
});
|
|
26255
27234
|
}
|
|
26256
27235
|
};
|
|
27236
|
+
var Gemma3nForCausalLM = class extends Gemma3nForConditionalGeneration {
|
|
27237
|
+
};
|
|
26257
27238
|
|
|
26258
27239
|
// src/models/glm/modeling_glm.js
|
|
26259
27240
|
var GlmPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -26263,6 +27244,377 @@ var GlmModel = class extends GlmPreTrainedModel {
|
|
|
26263
27244
|
var GlmForCausalLM = class extends GlmPreTrainedModel {
|
|
26264
27245
|
};
|
|
26265
27246
|
|
|
27247
|
+
// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
|
|
27248
|
+
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
|
|
27249
|
+
};
|
|
27250
|
+
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
|
|
27251
|
+
};
|
|
27252
|
+
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
|
|
27253
|
+
};
|
|
27254
|
+
|
|
27255
|
+
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
27256
|
+
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
27257
|
+
forward_params = [
|
|
27258
|
+
// Text inputs
|
|
27259
|
+
"input_ids",
|
|
27260
|
+
"attention_mask",
|
|
27261
|
+
"position_ids",
|
|
27262
|
+
"past_key_values",
|
|
27263
|
+
// Vision inputs
|
|
27264
|
+
"pixel_values",
|
|
27265
|
+
"image_grid_thw"
|
|
27266
|
+
];
|
|
27267
|
+
};
|
|
27268
|
+
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
27269
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
27270
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
27271
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
27272
|
+
image_grid_thw_name = "grid_thw";
|
|
27273
|
+
/**
|
|
27274
|
+
* Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
|
|
27275
|
+
* @param {Tensor} input_ids
|
|
27276
|
+
* @param {Tensor} attention_mask
|
|
27277
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
27278
|
+
*/
|
|
27279
|
+
_get_text_only_rope_index(input_ids, attention_mask) {
|
|
27280
|
+
if (attention_mask) {
|
|
27281
|
+
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
27282
|
+
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
27283
|
+
const mrope_position_deltas = Array.from(
|
|
27284
|
+
{ length: dims[0] },
|
|
27285
|
+
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
27286
|
+
);
|
|
27287
|
+
return [
|
|
27288
|
+
new Tensor3("int64", position_ids, [3, ...dims]),
|
|
27289
|
+
new Tensor3("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
27290
|
+
];
|
|
27291
|
+
} else {
|
|
27292
|
+
const [batch_size, seq_length] = input_ids.dims;
|
|
27293
|
+
const position_ids = BigInt64Array.from(
|
|
27294
|
+
{ length: 3 * batch_size * seq_length },
|
|
27295
|
+
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
27296
|
+
);
|
|
27297
|
+
return [new Tensor3("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
27298
|
+
}
|
|
27299
|
+
}
|
|
27300
|
+
/**
|
|
27301
|
+
* Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
|
|
27302
|
+
* global [all_t, all_h, all_w] order, then write back into the position_ids array
|
|
27303
|
+
* respecting attention mask.
|
|
27304
|
+
* @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
|
|
27305
|
+
* @param {number[]} attn_mask Attention mask for this batch element
|
|
27306
|
+
* @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
|
|
27307
|
+
* @param {number} batch_idx Current batch index
|
|
27308
|
+
* @returns {number[]} Flat reordered positions of length total_len
|
|
27309
|
+
*/
|
|
27310
|
+
_reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
|
|
27311
|
+
const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
27312
|
+
const llm_positions = new Array(total_len);
|
|
27313
|
+
let index = 0;
|
|
27314
|
+
for (let x = 0; x < 3; ++x) {
|
|
27315
|
+
for (const val of llm_pos_ids_list) {
|
|
27316
|
+
const seg_len = val.length / 3;
|
|
27317
|
+
for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
|
|
27318
|
+
llm_positions[index++] = val[z];
|
|
27319
|
+
}
|
|
27320
|
+
}
|
|
27321
|
+
}
|
|
27322
|
+
let count2 = 0;
|
|
27323
|
+
for (let y = 0; y < attn_mask.length; ++y) {
|
|
27324
|
+
if (attn_mask[y] == 1) {
|
|
27325
|
+
for (let x = 0; x < 3; ++x) {
|
|
27326
|
+
position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
|
|
27327
|
+
}
|
|
27328
|
+
++count2;
|
|
27329
|
+
}
|
|
27330
|
+
}
|
|
27331
|
+
return llm_positions;
|
|
27332
|
+
}
|
|
27333
|
+
/**
|
|
27334
|
+
* Build per-batch position ID segments for multimodal rope.
|
|
27335
|
+
* Override this in subclasses to change how vision/text segments are identified and positioned.
|
|
27336
|
+
* @param {object} params
|
|
27337
|
+
* @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
|
|
27338
|
+
* @param {any[][]} params.image_grid_thw_list - all image grid dimensions
|
|
27339
|
+
* @param {any[][]} params.video_grid_thw_list - all video grid dimensions
|
|
27340
|
+
* @param {number} params.spatial_merge_size
|
|
27341
|
+
* @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
|
|
27342
|
+
* @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
|
|
27343
|
+
*/
|
|
27344
|
+
_get_multimodal_rope_positions({
|
|
27345
|
+
filtered_ids,
|
|
27346
|
+
image_grid_thw_list,
|
|
27347
|
+
video_grid_thw_list,
|
|
27348
|
+
spatial_merge_size,
|
|
27349
|
+
state
|
|
27350
|
+
}) {
|
|
27351
|
+
const { image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
27352
|
+
const ids = filtered_ids;
|
|
27353
|
+
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
27354
|
+
if (x == vision_start_token_id) acc.push(idx);
|
|
27355
|
+
return acc;
|
|
27356
|
+
}, []);
|
|
27357
|
+
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
27358
|
+
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
27359
|
+
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
27360
|
+
const llm_pos_ids_list = [];
|
|
27361
|
+
let st2 = 0;
|
|
27362
|
+
let remain_images = image_nums;
|
|
27363
|
+
let remain_videos = video_nums;
|
|
27364
|
+
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
27365
|
+
const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
|
|
27366
|
+
const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
|
|
27367
|
+
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
27368
|
+
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
27369
|
+
let ed;
|
|
27370
|
+
let t, h, w;
|
|
27371
|
+
if (ed_image < ed_video) {
|
|
27372
|
+
[t, h, w] = image_grid_thw_list[state.image_index];
|
|
27373
|
+
++state.image_index;
|
|
27374
|
+
--remain_images;
|
|
27375
|
+
ed = ed_image;
|
|
27376
|
+
} else {
|
|
27377
|
+
[t, h, w] = video_grid_thw_list[state.video_index];
|
|
27378
|
+
++state.video_index;
|
|
27379
|
+
--remain_videos;
|
|
27380
|
+
ed = ed_video;
|
|
27381
|
+
}
|
|
27382
|
+
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
27383
|
+
Number(t),
|
|
27384
|
+
Math.floor(Number(h) / spatial_merge_size),
|
|
27385
|
+
Math.floor(Number(w) / spatial_merge_size)
|
|
27386
|
+
];
|
|
27387
|
+
const text_len = ed - st2;
|
|
27388
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27389
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
27390
|
+
const offset = text_len + st_idx;
|
|
27391
|
+
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
27392
|
+
const t_index = Array.from(
|
|
27393
|
+
{ length: grid_size },
|
|
27394
|
+
(_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
|
|
27395
|
+
);
|
|
27396
|
+
const h_index = Array.from(
|
|
27397
|
+
{ length: grid_size },
|
|
27398
|
+
(_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
|
|
27399
|
+
);
|
|
27400
|
+
const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
|
|
27401
|
+
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
27402
|
+
st2 = ed + grid_size;
|
|
27403
|
+
}
|
|
27404
|
+
if (st2 < ids.length) {
|
|
27405
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27406
|
+
const text_len = ids.length - st2;
|
|
27407
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
27408
|
+
}
|
|
27409
|
+
return llm_pos_ids_list;
|
|
27410
|
+
}
|
|
27411
|
+
/**
|
|
27412
|
+
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
27413
|
+
*
|
|
27414
|
+
* Explanation:
|
|
27415
|
+
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
27416
|
+
*
|
|
27417
|
+
* For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
|
|
27418
|
+
* Examples:
|
|
27419
|
+
* input_ids: [T T T T T], here T is for text.
|
|
27420
|
+
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
27421
|
+
* height position_ids: [0, 1, 2, 3, 4]
|
|
27422
|
+
* width position_ids: [0, 1, 2, 3, 4]
|
|
27423
|
+
*
|
|
27424
|
+
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
27425
|
+
* and 1D rotary position embeddin for text part.
|
|
27426
|
+
* Examples:
|
|
27427
|
+
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
27428
|
+
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
27429
|
+
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
27430
|
+
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
27431
|
+
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
27432
|
+
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
27433
|
+
* text height position_ids: [3, 4, 5, 6, 7]
|
|
27434
|
+
* text width position_ids: [3, 4, 5, 6, 7]
|
|
27435
|
+
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
27436
|
+
*
|
|
27437
|
+
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
27438
|
+
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
27439
|
+
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
27440
|
+
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
|
|
27441
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
27442
|
+
*/
|
|
27443
|
+
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
27444
|
+
const { vision_config } = this.config;
|
|
27445
|
+
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
27446
|
+
if (image_grid_thw || video_grid_thw) {
|
|
27447
|
+
const total_input_ids = input_ids.tolist();
|
|
27448
|
+
if (!attention_mask) {
|
|
27449
|
+
attention_mask = ones_like(input_ids);
|
|
27450
|
+
}
|
|
27451
|
+
const attention_mask_list = attention_mask.tolist();
|
|
27452
|
+
const position_ids_list = Array.from(
|
|
27453
|
+
{ length: 3 },
|
|
27454
|
+
() => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
|
|
27455
|
+
);
|
|
27456
|
+
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
27457
|
+
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
27458
|
+
const state = { image_index: 0, video_index: 0 };
|
|
27459
|
+
const mrope_position_deltas = [];
|
|
27460
|
+
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
27461
|
+
const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
27462
|
+
const llm_pos_ids_list = this._get_multimodal_rope_positions({
|
|
27463
|
+
filtered_ids,
|
|
27464
|
+
image_grid_thw_list,
|
|
27465
|
+
video_grid_thw_list,
|
|
27466
|
+
spatial_merge_size,
|
|
27467
|
+
state
|
|
27468
|
+
});
|
|
27469
|
+
const llm_positions = this._reorder_and_write_positions(
|
|
27470
|
+
llm_pos_ids_list,
|
|
27471
|
+
attention_mask_list[i],
|
|
27472
|
+
position_ids_list,
|
|
27473
|
+
i
|
|
27474
|
+
);
|
|
27475
|
+
mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
|
|
27476
|
+
}
|
|
27477
|
+
return [
|
|
27478
|
+
new Tensor3("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
27479
|
+
new Tensor3("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
27480
|
+
];
|
|
27481
|
+
} else {
|
|
27482
|
+
return this._get_text_only_rope_index(input_ids, attention_mask);
|
|
27483
|
+
}
|
|
27484
|
+
}
|
|
27485
|
+
async encode_image({ pixel_values, image_grid_thw }) {
|
|
27486
|
+
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
27487
|
+
pixel_values,
|
|
27488
|
+
[this.image_grid_thw_name]: image_grid_thw
|
|
27489
|
+
})).image_features;
|
|
27490
|
+
return features;
|
|
27491
|
+
}
|
|
27492
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
27493
|
+
return default_merge_input_ids_with_image_features({
|
|
27494
|
+
// @ts-ignore
|
|
27495
|
+
image_token_id: this.config.image_token_id,
|
|
27496
|
+
...kwargs
|
|
27497
|
+
});
|
|
27498
|
+
}
|
|
27499
|
+
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
27500
|
+
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
27501
|
+
if (!model_inputs.past_key_values) {
|
|
27502
|
+
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27503
|
+
model_inputs.input_ids,
|
|
27504
|
+
model_inputs.image_grid_thw,
|
|
27505
|
+
model_inputs.video_grid_thw,
|
|
27506
|
+
model_inputs.attention_mask
|
|
27507
|
+
);
|
|
27508
|
+
} else {
|
|
27509
|
+
model_inputs.pixel_values = null;
|
|
27510
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
27511
|
+
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
27512
|
+
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
27513
|
+
model_inputs.input_ids,
|
|
27514
|
+
model_inputs.image_grid_thw,
|
|
27515
|
+
model_inputs.video_grid_thw,
|
|
27516
|
+
model_inputs.attention_mask
|
|
27517
|
+
);
|
|
27518
|
+
model_inputs.rope_deltas = rope_deltas;
|
|
27519
|
+
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
27520
|
+
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
27521
|
+
} else {
|
|
27522
|
+
if (!model_inputs.rope_deltas) {
|
|
27523
|
+
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27524
|
+
model_inputs.input_ids,
|
|
27525
|
+
model_inputs.image_grid_thw,
|
|
27526
|
+
model_inputs.video_grid_thw,
|
|
27527
|
+
model_inputs.attention_mask
|
|
27528
|
+
);
|
|
27529
|
+
}
|
|
27530
|
+
const delta = BigInt(past_length);
|
|
27531
|
+
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
27532
|
+
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
27533
|
+
}
|
|
27534
|
+
}
|
|
27535
|
+
}
|
|
27536
|
+
return model_inputs;
|
|
27537
|
+
}
|
|
27538
|
+
};
|
|
27539
|
+
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
27540
|
+
};
|
|
27541
|
+
|
|
27542
|
+
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
27543
|
+
/**
 * Qwen2.5-VL conditional-generation model. Reuses the Qwen2-VL implementation and only
 * sets `image_grid_thw_name`; the inherited `encode_image` uses that value as the key
 * under which the grid tensor is passed to the vision encoder session.
 */
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
27544
|
+
image_grid_thw_name = "image_grid_thw";
|
|
27545
|
+
};
|
|
27546
|
+
/**
 * Qwen2.5-VL causal-LM model. Extends `Qwen2VLForCausalLM` and only sets
 * `image_grid_thw_name`, the key `encode_image` uses for the grid tensor
 * when calling the vision encoder session.
 */
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
27547
|
+
image_grid_thw_name = "image_grid_thw";
|
|
27548
|
+
};
|
|
27549
|
+
|
|
27550
|
+
// src/models/glm_ocr/modeling_glm_ocr.js
|
|
27551
|
+
/**
 * GLM-OCR conditional-generation model. Reuses the Qwen2.5-VL implementation and
 * overrides how multimodal rope position segments are built.
 */
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
  /**
   * Compute 3D positional indices for the vision tokens of one grid.
   * Temporal is constant, height is repeat-interleaved, width tiles.
   * @param {number} start_position position id of the first vision token
   * @param {number[]} grid_thw [T, H, W]
   * @param {number} temp_merge_size divisor applied (with flooring) to T
   * @param {number} spatial_merge_size divisor applied (with flooring) to H and W
   * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
   */
  get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
    const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
    const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
    const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
    const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;

    const t_pos = Array.from({ length: seq_len }, () => start_position);
    // Each height value is held for a full (width * temporal) run before it advances.
    const h_pos = Array.from(
      { length: seq_len },
      (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
    );
    // Width cycles 0..llm_grid_w-1 over and over.
    const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + (i % llm_grid_w));
    return [...t_pos, ...h_pos, ...w_pos];
  }

  /**
   * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
   * instead of the vision_start_token_id scanning used by Qwen2VL.
   * After a vision segment, position advances by floor(max(h, w) / spatial_merge_size), i.e. by
   * the same floored extent that `get_vision_position_ids` uses, so later positions stay integral
   * even when a grid side is not a multiple of the merge size.
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - accepted for interface parity; not read here
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches; only `image_index` is advanced
   * @returns {number[][]} segments of [t..., h..., w...] positions
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id } = this.config;

    // Split the sequence into maximal runs of equal type: 1 = image tokens, 0 = everything else.
    // Loose equality is kept from the original comparison (ids may not share a numeric type).
    const groups = [];
    let group_start = 0;
    let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
    for (let j = 1; j <= filtered_ids.length; ++j) {
      // -1 is an end-of-sequence sentinel that never matches, forcing the last run to be flushed.
      const type = j < filtered_ids.length ? (filtered_ids[j] == image_token_id ? 1 : 0) : -1;
      if (type !== current_type) {
        groups.push([current_type, group_start, j]);
        group_start = j;
        current_type = type;
      }
    }

    let current_pos = 0;
    const llm_pos_ids_list = [];
    for (const [modality_type, start_idx, end_idx] of groups) {
      if (modality_type === 0) {
        // Text run: the same ramp for t, h and w.
        const text_len = end_idx - start_idx;
        llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + (i % text_len)));
        current_pos += text_len;
      } else {
        // Image run: consume the next grid (shared cursor across batch elements).
        const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
        // Dividing T by itself collapses the temporal axis to a single step.
        const temp_merge_size = grid_thw[0];
        llm_pos_ids_list.push(
          this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
        );
        // Floor so that the advance matches the floored grid extents used above.
        current_pos += Math.floor(Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size);
      }
    }
    return llm_pos_ids_list;
  }
};
|
|
27617
|
+
|
|
26266
27618
|
// src/models/glpn/modeling_glpn.js
|
|
26267
27619
|
var GLPNPreTrainedModel = class extends PreTrainedModel {
|
|
26268
27620
|
};
|
|
@@ -26335,6 +27687,28 @@ var GraniteMoeHybridModel = class extends GraniteMoeHybridPreTrainedModel {
|
|
|
26335
27687
|
var GraniteMoeHybridForCausalLM = class extends GraniteMoeHybridPreTrainedModel {
|
|
26336
27688
|
};
|
|
26337
27689
|
|
|
27690
|
+
// src/models/ultravox/modeling_ultravox.js
|
|
27691
|
+
/**
 * Base class for the Ultravox models. Adds only the `forward_params` list of input names
 * (text inputs, `audio_values` and the key/value cache) on top of `PreTrainedModel`.
 * NOTE(review): how `forward_params` is consumed is not visible in this part of the file.
 */
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
27692
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
27693
|
+
};
|
|
27694
|
+
/**
 * Ultravox model: merges audio features into the token sequence via the default audio merge helper.
 */
var UltravoxModel = class extends UltravoxPreTrainedModel {
  /**
   * Flatten the audio features to two dimensions (rows x hidden size) and delegate to
   * `default_merge_input_ids_with_audio_features`.
   * The audio token id is the first defined value among `config.ignore_index`,
   * `config.audio_token_id` and `config.audio_token_index`; an `audio_token_id` present in
   * `kwargs` overrides it, while `audio_features` is always replaced by the flattened view.
   * @param {object} kwargs arguments for the merge helper; must include `audio_features`
   * @returns {*} whatever the default merge helper returns
   */
  _merge_input_ids_with_audio_features(kwargs) {
    const { audio_features } = kwargs;
    // Keep the last dimension, collapse all leading dimensions into one.
    const hidden_size = audio_features.dims.at(-1);
    const flattened_features = audio_features.view(-1, hidden_size);

    const { config } = this;
    const merge_arguments = {
      // @ts-ignore
      audio_token_id: config.ignore_index ?? config.audio_token_id ?? config.audio_token_index,
      ...kwargs,
      audio_features: flattened_features
    };
    return default_merge_input_ids_with_audio_features(merge_arguments);
  }
};
|
|
27706
|
+
|
|
27707
|
+
// src/models/granite_speech/modeling_granite_speech.js
|
|
27708
|
+
/**
 * Granite Speech conditional-generation model. Inherits the audio-feature merging of
 * `UltravoxModel` and replaces `forward_params`: compared with `UltravoxPreTrainedModel`,
 * the list drops `position_ids` and `audio_values` and adds `input_features`.
 */
var GraniteSpeechForConditionalGeneration = class extends UltravoxModel {
|
|
27709
|
+
forward_params = ["input_ids", "attention_mask", "input_features", "past_key_values"];
|
|
27710
|
+
};
|
|
27711
|
+
|
|
26338
27712
|
// src/models/grounding_dino/modeling_grounding_dino.js
|
|
26339
27713
|
var GroundingDinoPreTrainedModel = class extends PreTrainedModel {
|
|
26340
27714
|
};
|
|
@@ -26439,34 +27813,37 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
|
|
|
26439
27813
|
var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
|
|
26440
27814
|
};
|
|
26441
27815
|
|
|
26442
|
-
// src/models/
|
|
26443
|
-
var
|
|
26444
|
-
forward_params = [
|
|
26445
|
-
"input_ids",
|
|
26446
|
-
"attention_mask",
|
|
26447
|
-
"pixel_values",
|
|
26448
|
-
"pixel_attention_mask",
|
|
26449
|
-
"position_ids",
|
|
26450
|
-
"past_key_values"
|
|
26451
|
-
];
|
|
27816
|
+
// src/models/llava/modeling_llava.js
|
|
27817
|
+
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
27818
|
+
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
26452
27819
|
};
|
|
26453
|
-
var
|
|
26454
|
-
async encode_image({ pixel_values, pixel_attention_mask }) {
|
|
26455
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], { pixel_values, pixel_attention_mask })).image_features;
|
|
26456
|
-
return features;
|
|
26457
|
-
}
|
|
27820
|
+
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
26458
27821
|
_merge_input_ids_with_image_features(kwargs) {
|
|
26459
27822
|
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
26460
27823
|
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
26461
27824
|
return default_merge_input_ids_with_image_features({
|
|
26462
27825
|
// @ts-ignore
|
|
26463
|
-
image_token_id: this.config.image_token_id,
|
|
27826
|
+
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
26464
27827
|
...kwargs,
|
|
26465
27828
|
image_features: reshaped_image_hidden_states
|
|
26466
27829
|
});
|
|
26467
27830
|
}
|
|
26468
27831
|
};
|
|
26469
|
-
var
|
|
27832
|
+
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27833
|
+
};
|
|
27834
|
+
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
27835
|
+
};
|
|
27836
|
+
|
|
27837
|
+
// src/models/idefics3/modeling_idefics3.js
|
|
27838
|
+
var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27839
|
+
forward_params = [
|
|
27840
|
+
"input_ids",
|
|
27841
|
+
"attention_mask",
|
|
27842
|
+
"pixel_values",
|
|
27843
|
+
"pixel_attention_mask",
|
|
27844
|
+
"position_ids",
|
|
27845
|
+
"past_key_values"
|
|
27846
|
+
];
|
|
26470
27847
|
};
|
|
26471
27848
|
|
|
26472
27849
|
// src/models/ijepa/modeling_ijepa.js
|
|
@@ -26550,6 +27927,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
|
|
|
26550
27927
|
var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
|
|
26551
27928
|
};
|
|
26552
27929
|
|
|
27930
|
+
// src/models/lighton_ocr/modeling_lighton_ocr.js
|
|
27931
|
+
var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27932
|
+
};
|
|
27933
|
+
|
|
26553
27934
|
// src/models/lfm2_moe/modeling_lfm2_moe.js
|
|
26554
27935
|
var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
|
|
26555
27936
|
};
|
|
@@ -26558,6 +27939,19 @@ var Lfm2MoeModel = class extends Lfm2MoePreTrainedModel {
|
|
|
26558
27939
|
var Lfm2MoeForCausalLM = class extends Lfm2MoePreTrainedModel {
|
|
26559
27940
|
};
|
|
26560
27941
|
|
|
27942
|
+
// src/models/lfm2_vl/modeling_lfm2_vl.js
|
|
27943
|
+
/**
 * LFM2-VL conditional-generation model. Extends `LlavaForConditionalGeneration` and only
 * replaces `forward_params`, whose list here additionally names `pixel_attention_mask`
 * and `spatial_shapes` next to the text, pixel and cache inputs.
 */
var Lfm2VlForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27944
|
+
forward_params = [
|
|
27945
|
+
"input_ids",
|
|
27946
|
+
"attention_mask",
|
|
27947
|
+
"pixel_values",
|
|
27948
|
+
"pixel_attention_mask",
|
|
27949
|
+
"spatial_shapes",
|
|
27950
|
+
"position_ids",
|
|
27951
|
+
"past_key_values"
|
|
27952
|
+
];
|
|
27953
|
+
};
|
|
27954
|
+
|
|
26561
27955
|
// src/models/llama/modeling_llama.js
|
|
26562
27956
|
var LlamaPreTrainedModel = class extends PreTrainedModel {
|
|
26563
27957
|
};
|
|
@@ -26572,27 +27966,6 @@ var Llama4PreTrainedModel = class extends PreTrainedModel {
|
|
|
26572
27966
|
var Llama4ForCausalLM = class extends Llama4PreTrainedModel {
|
|
26573
27967
|
};
|
|
26574
27968
|
|
|
26575
|
-
// src/models/llava/modeling_llava.js
|
|
26576
|
-
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
26577
|
-
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
26578
|
-
};
|
|
26579
|
-
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
26580
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
26581
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
26582
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
26583
|
-
return default_merge_input_ids_with_image_features({
|
|
26584
|
-
// @ts-ignore
|
|
26585
|
-
image_token_id: this.config.image_token_index,
|
|
26586
|
-
...kwargs,
|
|
26587
|
-
image_features: reshaped_image_hidden_states
|
|
26588
|
-
});
|
|
26589
|
-
}
|
|
26590
|
-
};
|
|
26591
|
-
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
26592
|
-
};
|
|
26593
|
-
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
26594
|
-
};
|
|
26595
|
-
|
|
26596
27969
|
// src/models/longt5/modeling_longt5.js
|
|
26597
27970
|
var LongT5PreTrainedModel = class extends PreTrainedModel {
|
|
26598
27971
|
};
|
|
@@ -26754,6 +28127,14 @@ var MistralModel = class extends MistralPreTrainedModel {
|
|
|
26754
28127
|
var MistralForCausalLM = class extends MistralPreTrainedModel {
|
|
26755
28128
|
};
|
|
26756
28129
|
|
|
28130
|
+
// src/models/mistral4/modeling_mistral4.js
|
|
28131
|
+
var Mistral4PreTrainedModel = class extends PreTrainedModel {
|
|
28132
|
+
};
|
|
28133
|
+
var Mistral4Model = class extends Mistral4PreTrainedModel {
|
|
28134
|
+
};
|
|
28135
|
+
var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
|
|
28136
|
+
};
|
|
28137
|
+
|
|
26757
28138
|
// src/models/mobilebert/modeling_mobilebert.js
|
|
26758
28139
|
var MobileBertPreTrainedModel = class extends PreTrainedModel {
|
|
26759
28140
|
};
|
|
@@ -27222,6 +28603,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
|
|
|
27222
28603
|
var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
|
|
27223
28604
|
};
|
|
27224
28605
|
|
|
28606
|
+
// src/models/nemotron_h/modeling_nemotron_h.js
|
|
28607
|
+
var NemotronHPreTrainedModel = class extends PreTrainedModel {
|
|
28608
|
+
};
|
|
28609
|
+
var NemotronHModel = class extends NemotronHPreTrainedModel {
|
|
28610
|
+
};
|
|
28611
|
+
var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
|
|
28612
|
+
};
|
|
28613
|
+
|
|
27225
28614
|
// src/models/neobert/modeling_neobert.js
|
|
27226
28615
|
var NeoBertPreTrainedModel = class extends PreTrainedModel {
|
|
27227
28616
|
};
|
|
@@ -27343,27 +28732,7 @@ var OwlViTForObjectDetection = class extends OwlViTPreTrainedModel {
|
|
|
27343
28732
|
};
|
|
27344
28733
|
|
|
27345
28734
|
// src/models/paligemma/modeling_paligemma.js
|
|
27346
|
-
var
|
|
27347
|
-
forward_params = [
|
|
27348
|
-
"input_ids",
|
|
27349
|
-
// 'inputs_embeds',
|
|
27350
|
-
"attention_mask",
|
|
27351
|
-
"pixel_values",
|
|
27352
|
-
"position_ids",
|
|
27353
|
-
"past_key_values"
|
|
27354
|
-
];
|
|
27355
|
-
};
|
|
27356
|
-
var PaliGemmaForConditionalGeneration = class extends PaliGemmaPreTrainedModel {
|
|
27357
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
27358
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
27359
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
27360
|
-
return default_merge_input_ids_with_image_features({
|
|
27361
|
-
// @ts-ignore
|
|
27362
|
-
image_token_id: this.config.image_token_index,
|
|
27363
|
-
...kwargs,
|
|
27364
|
-
image_features: reshaped_image_hidden_states
|
|
27365
|
-
});
|
|
27366
|
-
}
|
|
28735
|
+
var PaliGemmaForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27367
28736
|
};
|
|
27368
28737
|
|
|
27369
28738
|
// src/models/parakeet/modeling_parakeet.js
|
|
@@ -27522,244 +28891,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
|
|
|
27522
28891
|
var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
|
|
27523
28892
|
};
|
|
27524
28893
|
|
|
27525
|
-
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
27526
|
-
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
27527
|
-
forward_params = [
|
|
27528
|
-
// Text inputs
|
|
27529
|
-
"input_ids",
|
|
27530
|
-
"attention_mask",
|
|
27531
|
-
"position_ids",
|
|
27532
|
-
"past_key_values",
|
|
27533
|
-
// Vision inputs
|
|
27534
|
-
"pixel_values",
|
|
27535
|
-
"image_grid_thw"
|
|
27536
|
-
];
|
|
27537
|
-
};
|
|
27538
|
-
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
27539
|
-
image_grid_thw_name = "grid_thw";
|
|
27540
|
-
/**
|
|
27541
|
-
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
27542
|
-
*
|
|
27543
|
-
* Explanation:
|
|
27544
|
-
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
27545
|
-
*
|
|
27546
|
-
* For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
|
|
27547
|
-
* Examples:
|
|
27548
|
-
* input_ids: [T T T T T], here T is for text.
|
|
27549
|
-
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
27550
|
-
* height position_ids: [0, 1, 2, 3, 4]
|
|
27551
|
-
* width position_ids: [0, 1, 2, 3, 4]
|
|
27552
|
-
*
|
|
27553
|
-
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
27554
|
-
* and 1D rotary position embeddin for text part.
|
|
27555
|
-
* Examples:
|
|
27556
|
-
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
27557
|
-
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
27558
|
-
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
27559
|
-
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
27560
|
-
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
27561
|
-
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
27562
|
-
* text height position_ids: [3, 4, 5, 6, 7]
|
|
27563
|
-
* text width position_ids: [3, 4, 5, 6, 7]
|
|
27564
|
-
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
27565
|
-
*
|
|
27566
|
-
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
27567
|
-
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
27568
|
-
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
27569
|
-
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
|
|
27570
|
-
* - 1 for tokens that are **not masked**,
|
|
27571
|
-
* - 0 for tokens that are **masked**.
|
|
27572
|
-
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
|
|
27573
|
-
* - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
|
|
27574
|
-
* - mrope_position_deltas: Tensor of shape `(batch_size)`.
|
|
27575
|
-
*/
|
|
27576
|
-
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
27577
|
-
const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
27578
|
-
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
27579
|
-
const mrope_position_deltas = [];
|
|
27580
|
-
if (image_grid_thw || video_grid_thw) {
|
|
27581
|
-
let total_input_ids = input_ids.tolist();
|
|
27582
|
-
if (!attention_mask) {
|
|
27583
|
-
attention_mask = ones_like(input_ids);
|
|
27584
|
-
}
|
|
27585
|
-
const attention_mask_list = attention_mask.tolist();
|
|
27586
|
-
const position_ids_list = Array.from(
|
|
27587
|
-
{ length: 3 },
|
|
27588
|
-
(_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
|
|
27589
|
-
);
|
|
27590
|
-
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
27591
|
-
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
27592
|
-
let image_index = 0;
|
|
27593
|
-
let video_index = 0;
|
|
27594
|
-
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
27595
|
-
const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
27596
|
-
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
27597
|
-
if (x == vision_start_token_id) acc.push(idx);
|
|
27598
|
-
return acc;
|
|
27599
|
-
}, []);
|
|
27600
|
-
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
27601
|
-
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
27602
|
-
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
27603
|
-
let llm_pos_ids_list = [];
|
|
27604
|
-
let st2 = 0;
|
|
27605
|
-
let remain_images = image_nums;
|
|
27606
|
-
let remain_videos = video_nums;
|
|
27607
|
-
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
27608
|
-
const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
|
|
27609
|
-
const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
|
|
27610
|
-
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
27611
|
-
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
27612
|
-
let ed;
|
|
27613
|
-
let t, h, w;
|
|
27614
|
-
if (ed_image < ed_video) {
|
|
27615
|
-
[t, h, w] = image_grid_thw_list[image_index];
|
|
27616
|
-
++image_index;
|
|
27617
|
-
--remain_images;
|
|
27618
|
-
ed = ed_image;
|
|
27619
|
-
} else {
|
|
27620
|
-
[t, h, w] = video_grid_thw_list[video_index];
|
|
27621
|
-
++video_index;
|
|
27622
|
-
--remain_videos;
|
|
27623
|
-
ed = ed_video;
|
|
27624
|
-
}
|
|
27625
|
-
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
27626
|
-
Number(t),
|
|
27627
|
-
Math.floor(Number(h) / spatial_merge_size),
|
|
27628
|
-
Math.floor(Number(w) / spatial_merge_size)
|
|
27629
|
-
];
|
|
27630
|
-
const text_len = ed - st2;
|
|
27631
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27632
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
27633
|
-
const offset = text_len + st_idx;
|
|
27634
|
-
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
27635
|
-
const t_index = Array.from(
|
|
27636
|
-
{ length: grid_size },
|
|
27637
|
-
(_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
|
|
27638
|
-
);
|
|
27639
|
-
const h_index = Array.from(
|
|
27640
|
-
{ length: grid_size },
|
|
27641
|
-
(_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
|
|
27642
|
-
);
|
|
27643
|
-
const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
|
|
27644
|
-
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
27645
|
-
st2 = ed + grid_size;
|
|
27646
|
-
}
|
|
27647
|
-
if (st2 < ids.length) {
|
|
27648
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27649
|
-
const text_len = ids.length - st2;
|
|
27650
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
27651
|
-
}
|
|
27652
|
-
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
27653
|
-
const llm_positions = new Array(num_items);
|
|
27654
|
-
let index = 0;
|
|
27655
|
-
for (let x = 0; x < 3; ++x) {
|
|
27656
|
-
for (let y = 0; y < llm_pos_ids_list.length; ++y) {
|
|
27657
|
-
const val = llm_pos_ids_list[y];
|
|
27658
|
-
const text_len = val.length / 3;
|
|
27659
|
-
for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
|
|
27660
|
-
llm_positions[index++] = val[z];
|
|
27661
|
-
}
|
|
27662
|
-
}
|
|
27663
|
-
}
|
|
27664
|
-
let count2 = 0;
|
|
27665
|
-
const attn_mask = attention_mask_list[i];
|
|
27666
|
-
for (let y = 0; y < attn_mask.length; ++y) {
|
|
27667
|
-
if (attn_mask[y] == 1) {
|
|
27668
|
-
for (let x = 0; x < 3; ++x) {
|
|
27669
|
-
position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
|
|
27670
|
-
}
|
|
27671
|
-
++count2;
|
|
27672
|
-
}
|
|
27673
|
-
}
|
|
27674
|
-
const max_llm_positions = max(llm_positions)[0];
|
|
27675
|
-
mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
|
|
27676
|
-
}
|
|
27677
|
-
return [
|
|
27678
|
-
new Tensor3("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
27679
|
-
new Tensor3("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
27680
|
-
];
|
|
27681
|
-
} else {
|
|
27682
|
-
if (attention_mask) {
|
|
27683
|
-
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
27684
|
-
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
27685
|
-
const mrope_position_deltas2 = Array.from(
|
|
27686
|
-
{ length: dims[0] },
|
|
27687
|
-
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
27688
|
-
);
|
|
27689
|
-
return [
|
|
27690
|
-
new Tensor3("int64", position_ids, [3, ...dims]),
|
|
27691
|
-
new Tensor3("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
|
|
27692
|
-
];
|
|
27693
|
-
} else {
|
|
27694
|
-
const [batch_size, seq_length] = input_ids.dims;
|
|
27695
|
-
const position_ids = BigInt64Array.from(
|
|
27696
|
-
{ length: 3 * batch_size * seq_length },
|
|
27697
|
-
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
27698
|
-
);
|
|
27699
|
-
return [new Tensor3("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
27700
|
-
}
|
|
27701
|
-
}
|
|
27702
|
-
}
|
|
27703
|
-
async encode_image({ pixel_values, image_grid_thw }) {
|
|
27704
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
27705
|
-
pixel_values,
|
|
27706
|
-
[this.image_grid_thw_name]: image_grid_thw
|
|
27707
|
-
})).image_features;
|
|
27708
|
-
return features;
|
|
27709
|
-
}
|
|
27710
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
27711
|
-
return default_merge_input_ids_with_image_features({
|
|
27712
|
-
// @ts-ignore
|
|
27713
|
-
image_token_id: this.config.image_token_id,
|
|
27714
|
-
...kwargs
|
|
27715
|
-
});
|
|
27716
|
-
}
|
|
27717
|
-
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
27718
|
-
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
27719
|
-
if (!model_inputs.past_key_values) {
|
|
27720
|
-
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27721
|
-
model_inputs.input_ids,
|
|
27722
|
-
model_inputs.image_grid_thw,
|
|
27723
|
-
model_inputs.video_grid_thw,
|
|
27724
|
-
model_inputs.attention_mask
|
|
27725
|
-
);
|
|
27726
|
-
} else {
|
|
27727
|
-
model_inputs.pixel_values = null;
|
|
27728
|
-
const past_length = getPastLength(model_inputs.past_key_values);
|
|
27729
|
-
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
27730
|
-
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
27731
|
-
model_inputs.input_ids,
|
|
27732
|
-
model_inputs.image_grid_thw,
|
|
27733
|
-
model_inputs.video_grid_thw,
|
|
27734
|
-
model_inputs.attention_mask
|
|
27735
|
-
);
|
|
27736
|
-
model_inputs.rope_deltas = rope_deltas;
|
|
27737
|
-
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
27738
|
-
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
27739
|
-
} else {
|
|
27740
|
-
if (!model_inputs.rope_deltas) {
|
|
27741
|
-
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27742
|
-
model_inputs.input_ids,
|
|
27743
|
-
model_inputs.image_grid_thw,
|
|
27744
|
-
model_inputs.video_grid_thw,
|
|
27745
|
-
model_inputs.attention_mask
|
|
27746
|
-
);
|
|
27747
|
-
}
|
|
27748
|
-
const delta = BigInt(past_length);
|
|
27749
|
-
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
27750
|
-
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
27751
|
-
}
|
|
27752
|
-
}
|
|
27753
|
-
}
|
|
27754
|
-
return model_inputs;
|
|
27755
|
-
}
|
|
27756
|
-
};
|
|
27757
|
-
|
|
27758
|
-
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
27759
|
-
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
27760
|
-
image_grid_thw_name = "image_grid_thw";
|
|
27761
|
-
};
|
|
27762
|
-
|
|
27763
28894
|
// src/models/qwen3/modeling_qwen3.js
|
|
27764
28895
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
27765
28896
|
};
|
|
@@ -27787,18 +28918,26 @@ var Qwen3NextForCausalLM = class extends Qwen3NextPreTrainedModel {
|
|
|
27787
28918
|
// src/models/qwen3_vl/modeling_qwen3_vl.js
|
|
27788
28919
|
var Qwen3VLForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
27789
28920
|
};
|
|
28921
|
+
var Qwen3VLForCausalLM = class extends Qwen2_5_VLForCausalLM {
|
|
28922
|
+
};
|
|
27790
28923
|
|
|
27791
28924
|
// src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js
|
|
27792
28925
|
var Qwen3VLMoeForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
27793
28926
|
};
|
|
28927
|
+
var Qwen3VLMoeForCausalLM = class extends Qwen3VLForCausalLM {
|
|
28928
|
+
};
|
|
27794
28929
|
|
|
27795
28930
|
// src/models/qwen3_5/modeling_qwen3_5.js
|
|
27796
28931
|
var Qwen3_5ForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
27797
28932
|
};
|
|
28933
|
+
var Qwen3_5ForCausalLM = class extends Qwen3_5ForConditionalGeneration {
|
|
28934
|
+
};
|
|
27798
28935
|
|
|
27799
28936
|
// src/models/qwen3_5_moe/modeling_qwen3_5_moe.js
|
|
27800
28937
|
var Qwen3_5MoeForConditionalGeneration = class extends Qwen3_5ForConditionalGeneration {
|
|
27801
28938
|
};
|
|
28939
|
+
var Qwen3_5MoeForCausalLM = class extends Qwen3_5ForCausalLM {
|
|
28940
|
+
};
|
|
27802
28941
|
|
|
27803
28942
|
// src/models/resnet/modeling_resnet.js
|
|
27804
28943
|
var ResNetPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -28197,6 +29336,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
|
|
|
28197
29336
|
}
|
|
28198
29337
|
};
|
|
28199
29338
|
|
|
29339
|
+
// src/models/solar_open/modeling_solar_open.js
|
|
29340
|
+
var SolarOpenPreTrainedModel = class extends PreTrainedModel {
|
|
29341
|
+
};
|
|
29342
|
+
var SolarOpenModel = class extends SolarOpenPreTrainedModel {
|
|
29343
|
+
};
|
|
29344
|
+
var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
|
|
29345
|
+
};
|
|
29346
|
+
|
|
28200
29347
|
// src/models/speecht5/modeling_speecht5.js
|
|
28201
29348
|
var SpeechT5PreTrainedModel = class extends PreTrainedModel {
|
|
28202
29349
|
};
|
|
@@ -28479,25 +29626,6 @@ var TrOCRPreTrainedModel = class extends PreTrainedModel {
|
|
|
28479
29626
|
var TrOCRForCausalLM = class extends TrOCRPreTrainedModel {
|
|
28480
29627
|
};
|
|
28481
29628
|
|
|
28482
|
-
// src/models/ultravox/modeling_ultravox.js
|
|
28483
|
-
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
28484
|
-
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
28485
|
-
};
|
|
28486
|
-
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
28487
|
-
_merge_input_ids_with_audio_features(kwargs) {
|
|
28488
|
-
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
28489
|
-
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
28490
|
-
return default_merge_input_ids_with_audio_features({
|
|
28491
|
-
// @ts-ignore
|
|
28492
|
-
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id,
|
|
28493
|
-
...kwargs,
|
|
28494
|
-
audio_features: reshaped_audio_features
|
|
28495
|
-
});
|
|
28496
|
-
}
|
|
28497
|
-
};
|
|
28498
|
-
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
28499
|
-
};
|
|
28500
|
-
|
|
28501
29629
|
// src/models/unispeech/modeling_unispeech.js
|
|
28502
29630
|
var UniSpeechPreTrainedModel = class extends PreTrainedModel {
|
|
28503
29631
|
};
|
|
@@ -28663,6 +29791,170 @@ var VitsModel = class extends VitsPreTrainedModel {
|
|
|
28663
29791
|
}
|
|
28664
29792
|
};
|
|
28665
29793
|
|
|
29794
|
+
// src/models/voxtral/modeling_voxtral.js
|
|
29795
|
+
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
29796
|
+
};
|
|
29797
|
+
|
|
29798
|
+
// src/models/voxtral_realtime/modeling_voxtral_realtime.js
|
|
29799
|
+
var CONV1_LEFT_PAD = 2;
|
|
29800
|
+
var CONV2_LEFT_PAD = 1;
|
|
29801
|
+
var states = /* @__PURE__ */ new WeakMap();
|
|
29802
|
+
function createEncoderState(model, input_features) {
|
|
29803
|
+
const { text_config, audio_config } = (
|
|
29804
|
+
/** @type {any} */
|
|
29805
|
+
model.config
|
|
29806
|
+
);
|
|
29807
|
+
const encoder_session = model.sessions["audio_encoder"];
|
|
29808
|
+
const { num_mel_bins, hidden_size: enc_hidden_size } = audio_config;
|
|
29809
|
+
const PADDING_CACHE_CHANNELS = num_mel_bins + enc_hidden_size;
|
|
29810
|
+
const enc_kv_cache = new DynamicCache();
|
|
29811
|
+
const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? "float32";
|
|
29812
|
+
const enc_cls = enc_dtype === "float16" ? DataTypeMap.float16 : DataTypeMap.float32;
|
|
29813
|
+
const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
|
|
29814
|
+
for (const name in enc_shapes) {
|
|
29815
|
+
const size = enc_shapes[name].reduce((a, b) => a * b, 1);
|
|
29816
|
+
enc_kv_cache[name] = new Tensor3(enc_dtype, new enc_cls(size), enc_shapes[name]);
|
|
29817
|
+
}
|
|
29818
|
+
const enc_padding_cache = new Tensor3(enc_dtype, new enc_cls(PADDING_CACHE_CHANNELS * CONV1_LEFT_PAD), [
|
|
29819
|
+
1,
|
|
29820
|
+
PADDING_CACHE_CHANNELS,
|
|
29821
|
+
CONV1_LEFT_PAD
|
|
29822
|
+
]);
|
|
29823
|
+
const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
|
|
29824
|
+
if (!chunks_iter) {
|
|
29825
|
+
throw new Error("input_features must be iterable or async iterable");
|
|
29826
|
+
}
|
|
29827
|
+
return {
|
|
29828
|
+
encoder_session,
|
|
29829
|
+
enc_kv_cache,
|
|
29830
|
+
enc_padding_cache,
|
|
29831
|
+
enc_past_seq_len: 0,
|
|
29832
|
+
audio_embed_queue: [],
|
|
29833
|
+
audio_embed_total_tokens: 0,
|
|
29834
|
+
audio_queue_offset: 0,
|
|
29835
|
+
audio_consumed: 0,
|
|
29836
|
+
stream_exhausted: false,
|
|
29837
|
+
chunks_iter,
|
|
29838
|
+
text_hidden_size: text_config.hidden_size
|
|
29839
|
+
};
|
|
29840
|
+
}
|
|
29841
|
+
async function encodeChunk(s, chunk_features) {
|
|
29842
|
+
const audio_seq_len = chunk_features.dims[2];
|
|
29843
|
+
const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1;
|
|
29844
|
+
const position_ids = new Tensor3(
|
|
29845
|
+
"int64",
|
|
29846
|
+
BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
|
|
29847
|
+
[1, conv2_output_len]
|
|
29848
|
+
);
|
|
29849
|
+
const total_seq_len = s.enc_past_seq_len + conv2_output_len;
|
|
29850
|
+
const attention_mask = ones([1, total_seq_len]);
|
|
29851
|
+
const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
|
|
29852
|
+
input_features: chunk_features,
|
|
29853
|
+
attention_mask,
|
|
29854
|
+
position_ids,
|
|
29855
|
+
past_padding_cache: s.enc_padding_cache,
|
|
29856
|
+
...s.enc_kv_cache
|
|
29857
|
+
});
|
|
29858
|
+
if (s.enc_padding_cache.location === "gpu-buffer") {
|
|
29859
|
+
s.enc_padding_cache.dispose();
|
|
29860
|
+
}
|
|
29861
|
+
s.enc_padding_cache = present_padding_cache;
|
|
29862
|
+
for (const name in present_cache) {
|
|
29863
|
+
if (name.startsWith("present.")) {
|
|
29864
|
+
const pastName = name.replace("present", "past_key_values");
|
|
29865
|
+
const prev = s.enc_kv_cache[pastName];
|
|
29866
|
+
if (prev?.location === "gpu-buffer") {
|
|
29867
|
+
prev.dispose();
|
|
29868
|
+
}
|
|
29869
|
+
s.enc_kv_cache[pastName] = present_cache[name];
|
|
29870
|
+
}
|
|
29871
|
+
}
|
|
29872
|
+
s.enc_past_seq_len = total_seq_len;
|
|
29873
|
+
return audio_embeds;
|
|
29874
|
+
}
|
|
29875
|
+
async function fillAudioBuffer(s, needed) {
|
|
29876
|
+
while (s.audio_embed_total_tokens < needed && !s.stream_exhausted) {
|
|
29877
|
+
const result = await s.chunks_iter.next();
|
|
29878
|
+
if (result.done) {
|
|
29879
|
+
s.stream_exhausted = true;
|
|
29880
|
+
break;
|
|
29881
|
+
}
|
|
29882
|
+
const new_embeds = await encodeChunk(s, result.value);
|
|
29883
|
+
s.audio_embed_queue.push({ data: new_embeds.data, tokens: new_embeds.dims[1] });
|
|
29884
|
+
s.audio_embed_total_tokens += new_embeds.dims[1];
|
|
29885
|
+
}
|
|
29886
|
+
}
|
|
29887
|
+
function addAudioEmbeddings(s, inputs_embeds, current_len) {
|
|
29888
|
+
if (s.audio_embed_queue.length === 0) return;
|
|
29889
|
+
const embed_data = inputs_embeds.data;
|
|
29890
|
+
let embed_write_pos = 0;
|
|
29891
|
+
let remaining = current_len;
|
|
29892
|
+
while (remaining > 0 && s.audio_embed_queue.length > 0) {
|
|
29893
|
+
const front = s.audio_embed_queue[0];
|
|
29894
|
+
const available = front.tokens - s.audio_queue_offset;
|
|
29895
|
+
const n = Math.min(remaining, available);
|
|
29896
|
+
const src_offset = s.audio_queue_offset * s.text_hidden_size;
|
|
29897
|
+
for (let i = 0; i < n * s.text_hidden_size; ++i) {
|
|
29898
|
+
embed_data[embed_write_pos * s.text_hidden_size + i] += front.data[src_offset + i];
|
|
29899
|
+
}
|
|
29900
|
+
embed_write_pos += n;
|
|
29901
|
+
remaining -= n;
|
|
29902
|
+
s.audio_queue_offset += n;
|
|
29903
|
+
if (s.audio_queue_offset >= front.tokens) {
|
|
29904
|
+
s.audio_embed_queue.shift();
|
|
29905
|
+
s.audio_queue_offset = 0;
|
|
29906
|
+
}
|
|
29907
|
+
}
|
|
29908
|
+
s.audio_consumed += current_len - remaining;
|
|
29909
|
+
}
|
|
29910
|
+
var AudioExhaustedCriteria = class extends StoppingCriteria {
|
|
29911
|
+
constructor(enc_state) {
|
|
29912
|
+
super();
|
|
29913
|
+
this._s = enc_state;
|
|
29914
|
+
}
|
|
29915
|
+
_call(input_ids) {
|
|
29916
|
+
const done = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
|
|
29917
|
+
return input_ids.map(() => done);
|
|
29918
|
+
}
|
|
29919
|
+
};
|
|
29920
|
+
var VoxtralRealtimePreTrainedModel = class extends PreTrainedModel {
|
|
29921
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "past_key_values"];
|
|
29922
|
+
};
|
|
29923
|
+
var VoxtralRealtimeForConditionalGeneration = class extends VoxtralRealtimePreTrainedModel {
|
|
29924
|
+
async forward({ input_ids, past_key_values, ...kwargs }) {
|
|
29925
|
+
const current_len = input_ids.dims[1];
|
|
29926
|
+
const enc = states.get(this);
|
|
29927
|
+
if (enc) {
|
|
29928
|
+
await fillAudioBuffer(enc, enc.audio_consumed + current_len);
|
|
29929
|
+
}
|
|
29930
|
+
const { inputs_embeds } = await sessionRun(this.sessions["embed_tokens"], { input_ids });
|
|
29931
|
+
if (enc) {
|
|
29932
|
+
addAudioEmbeddings(enc, inputs_embeds, current_len);
|
|
29933
|
+
}
|
|
29934
|
+
const decoder_feeds = { inputs_embeds, ...kwargs };
|
|
29935
|
+
this.addPastKeyValues(decoder_feeds, past_key_values);
|
|
29936
|
+
const session = this.sessions["decoder_model_merged"];
|
|
29937
|
+
const fixed = pick(decoder_feeds, session.inputNames);
|
|
29938
|
+
return await sessionRun(session, fixed);
|
|
29939
|
+
}
|
|
29940
|
+
async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
|
|
29941
|
+
if (!input_features) {
|
|
29942
|
+
throw new Error("input_features (generator/iterable) must be provided");
|
|
29943
|
+
}
|
|
29944
|
+
const enc_state = createEncoderState(this, input_features);
|
|
29945
|
+
states.set(this, enc_state);
|
|
29946
|
+
const stopping_criteria = new StoppingCriteriaList();
|
|
29947
|
+
stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
|
|
29948
|
+
if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);
|
|
29949
|
+
try {
|
|
29950
|
+
return await super.generate({ ...kwargs, stopping_criteria });
|
|
29951
|
+
} finally {
|
|
29952
|
+
enc_state.enc_kv_cache.dispose();
|
|
29953
|
+
states.delete(this);
|
|
29954
|
+
}
|
|
29955
|
+
}
|
|
29956
|
+
};
|
|
29957
|
+
|
|
28666
29958
|
// src/models/wav2vec2_bert/modeling_wav2vec2_bert.js
|
|
28667
29959
|
var Wav2Vec2BertPreTrainedModel = class extends PreTrainedModel {
|
|
28668
29960
|
};
|
|
@@ -29168,6 +30460,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
|
|
|
29168
30460
|
// src/models/registry.js
|
|
29169
30461
|
var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
|
|
29170
30462
|
["bert", "BertModel"],
|
|
30463
|
+
["eurobert", "EuroBertModel"],
|
|
29171
30464
|
["neobert", "NeoBertModel"],
|
|
29172
30465
|
["modernbert", "ModernBertModel"],
|
|
29173
30466
|
["nomic_bert", "NomicBertModel"],
|
|
@@ -29299,6 +30592,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
29299
30592
|
["gemma3_text", "Gemma3Model"],
|
|
29300
30593
|
["helium", "HeliumModel"],
|
|
29301
30594
|
["glm", "GlmModel"],
|
|
30595
|
+
["glm_moe_dsa", "GlmMoeDsaModel"],
|
|
29302
30596
|
["openelm", "OpenELMModel"],
|
|
29303
30597
|
["qwen2", "Qwen2Model"],
|
|
29304
30598
|
["qwen2_moe", "Qwen2MoeModel"],
|
|
@@ -29310,12 +30604,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
29310
30604
|
["mpt", "MptModel"],
|
|
29311
30605
|
["opt", "OPTModel"],
|
|
29312
30606
|
["mistral", "MistralModel"],
|
|
30607
|
+
["mistral4", "Mistral4Model"],
|
|
29313
30608
|
["ministral", "MinistralModel"],
|
|
29314
30609
|
["ministral3", "Ministral3Model"],
|
|
29315
30610
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
29316
30611
|
["starcoder2", "Starcoder2Model"],
|
|
30612
|
+
["deepseek_v3", "DeepseekV3Model"],
|
|
29317
30613
|
["falcon", "FalconModel"],
|
|
29318
30614
|
["falcon_h1", "FalconH1Model"],
|
|
30615
|
+
["nemotron_h", "NemotronHModel"],
|
|
30616
|
+
["solar_open", "SolarOpenModel"],
|
|
29319
30617
|
["stablelm", "StableLmModel"],
|
|
29320
30618
|
["modernbert-decoder", "ModernBertDecoderModel"],
|
|
29321
30619
|
["hunyuan_v1_dense", "HunYuanDenseV1Model"],
|
|
@@ -29335,6 +30633,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29335
30633
|
]);
|
|
29336
30634
|
var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29337
30635
|
["bert", "BertForSequenceClassification"],
|
|
30636
|
+
["eurobert", "EuroBertForSequenceClassification"],
|
|
29338
30637
|
["neobert", "NeoBertForSequenceClassification"],
|
|
29339
30638
|
["modernbert", "ModernBertForSequenceClassification"],
|
|
29340
30639
|
["roformer", "RoFormerForSequenceClassification"],
|
|
@@ -29357,6 +30656,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29357
30656
|
]);
|
|
29358
30657
|
var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29359
30658
|
["bert", "BertForTokenClassification"],
|
|
30659
|
+
["eurobert", "EuroBertForTokenClassification"],
|
|
29360
30660
|
["neobert", "NeoBertForTokenClassification"],
|
|
29361
30661
|
["modernbert", "ModernBertForTokenClassification"],
|
|
29362
30662
|
["roformer", "RoFormerForTokenClassification"],
|
|
@@ -29416,27 +30716,40 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29416
30716
|
["gemma2", "Gemma2ForCausalLM"],
|
|
29417
30717
|
["vaultgemma", "VaultGemmaForCausalLM"],
|
|
29418
30718
|
["gemma3_text", "Gemma3ForCausalLM"],
|
|
30719
|
+
["gemma3", "Gemma3ForCausalLM"],
|
|
29419
30720
|
["helium", "HeliumForCausalLM"],
|
|
29420
30721
|
["glm", "GlmForCausalLM"],
|
|
30722
|
+
["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
|
|
29421
30723
|
["openelm", "OpenELMForCausalLM"],
|
|
29422
30724
|
["qwen2", "Qwen2ForCausalLM"],
|
|
29423
30725
|
["qwen2_moe", "Qwen2MoeForCausalLM"],
|
|
29424
30726
|
["qwen3", "Qwen3ForCausalLM"],
|
|
29425
30727
|
["qwen3_moe", "Qwen3MoeForCausalLM"],
|
|
29426
30728
|
["qwen3_next", "Qwen3NextForCausalLM"],
|
|
30729
|
+
["qwen2_vl", "Qwen2VLForCausalLM"],
|
|
30730
|
+
["qwen2_5_vl", "Qwen2_5_VLForCausalLM"],
|
|
30731
|
+
["qwen3_vl", "Qwen3VLForCausalLM"],
|
|
30732
|
+
["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
|
|
30733
|
+
["qwen3_5", "Qwen3_5ForCausalLM"],
|
|
30734
|
+
["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
|
|
30735
|
+
["gemma3n", "Gemma3nForCausalLM"],
|
|
29427
30736
|
["phi", "PhiForCausalLM"],
|
|
29428
30737
|
["phi3", "Phi3ForCausalLM"],
|
|
29429
30738
|
["mpt", "MptForCausalLM"],
|
|
29430
30739
|
["opt", "OPTForCausalLM"],
|
|
29431
30740
|
["mbart", "MBartForCausalLM"],
|
|
29432
30741
|
["mistral", "MistralForCausalLM"],
|
|
30742
|
+
["mistral4", "Mistral4ForCausalLM"],
|
|
29433
30743
|
["ministral", "MinistralForCausalLM"],
|
|
29434
30744
|
["ministral3", "Ministral3ForCausalLM"],
|
|
29435
30745
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
29436
30746
|
["starcoder2", "Starcoder2ForCausalLM"],
|
|
30747
|
+
["deepseek_v3", "DeepseekV3ForCausalLM"],
|
|
29437
30748
|
["falcon", "FalconForCausalLM"],
|
|
29438
30749
|
["falcon_h1", "FalconH1ForCausalLM"],
|
|
30750
|
+
["nemotron_h", "NemotronHForCausalLM"],
|
|
29439
30751
|
["trocr", "TrOCRForCausalLM"],
|
|
30752
|
+
["solar_open", "SolarOpenForCausalLM"],
|
|
29440
30753
|
["stablelm", "StableLmForCausalLM"],
|
|
29441
30754
|
["modernbert-decoder", "ModernBertDecoderForCausalLM"],
|
|
29442
30755
|
["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
|
|
@@ -29447,6 +30760,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29447
30760
|
var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
|
|
29448
30761
|
var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29449
30762
|
["bert", "BertForMaskedLM"],
|
|
30763
|
+
["eurobert", "EuroBertForMaskedLM"],
|
|
29450
30764
|
["neobert", "NeoBertForMaskedLM"],
|
|
29451
30765
|
["modernbert", "ModernBertForMaskedLM"],
|
|
29452
30766
|
["roformer", "RoFormerForMaskedLM"],
|
|
@@ -29499,16 +30813,21 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29499
30813
|
["qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"],
|
|
29500
30814
|
["qwen3_5", "Qwen3_5ForConditionalGeneration"],
|
|
29501
30815
|
["qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"],
|
|
30816
|
+
["lfm2_vl", "Lfm2VlForConditionalGeneration"],
|
|
29502
30817
|
["idefics3", "Idefics3ForConditionalGeneration"],
|
|
29503
30818
|
["smolvlm", "SmolVLMForConditionalGeneration"],
|
|
29504
30819
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
29505
30820
|
["llava_qwen2", "LlavaQwen2ForCausalLM"],
|
|
29506
30821
|
["gemma3n", "Gemma3nForConditionalGeneration"],
|
|
29507
|
-
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
30822
|
+
["mistral3", "Mistral3ForConditionalGeneration"],
|
|
30823
|
+
["lighton_ocr", "LightOnOcrForConditionalGeneration"],
|
|
30824
|
+
["glm_ocr", "GlmOcrForConditionalGeneration"]
|
|
29508
30825
|
]);
|
|
29509
30826
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30827
|
+
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
29510
30828
|
["ultravox", "UltravoxModel"],
|
|
29511
|
-
["voxtral", "VoxtralForConditionalGeneration"]
|
|
30829
|
+
["voxtral", "VoxtralForConditionalGeneration"],
|
|
30830
|
+
["voxtral_realtime", "VoxtralRealtimeForConditionalGeneration"]
|
|
29512
30831
|
]);
|
|
29513
30832
|
var MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29514
30833
|
["vision-encoder-decoder", "VisionEncoderDecoderModel"]
|
|
@@ -29607,6 +30926,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29607
30926
|
]);
|
|
29608
30927
|
var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
|
|
29609
30928
|
var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30929
|
+
["chmv2", "CHMv2ForDepthEstimation"],
|
|
29610
30930
|
["dpt", "DPTForDepthEstimation"],
|
|
29611
30931
|
["depth_anything", "DepthAnythingForDepthEstimation"],
|
|
29612
30932
|
["glpn", "GLPNForDepthEstimation"],
|
|
@@ -29691,7 +31011,19 @@ var CUSTOM_MAPPING = [
|
|
|
29691
31011
|
MODEL_TYPES.ImageAudioTextToText
|
|
29692
31012
|
],
|
|
29693
31013
|
["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
|
|
29694
|
-
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox]
|
|
31014
|
+
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
|
|
31015
|
+
["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31016
|
+
["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31017
|
+
["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31018
|
+
["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31019
|
+
["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31020
|
+
["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31021
|
+
["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31022
|
+
[
|
|
31023
|
+
"VoxtralRealtimeForConditionalGeneration",
|
|
31024
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
31025
|
+
MODEL_TYPES.VoxtralRealtime
|
|
31026
|
+
]
|
|
29695
31027
|
];
|
|
29696
31028
|
for (const [name, model, type] of CUSTOM_MAPPING) {
|
|
29697
31029
|
MODEL_TYPE_MAPPING.set(name, type);
|
|
@@ -31369,8 +32701,18 @@ var TASK_ALIASES = Object.freeze({
|
|
|
31369
32701
|
});
|
|
31370
32702
|
|
|
31371
32703
|
// src/utils/model_registry/get_model_files.js
|
|
32704
|
+
function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
|
|
32705
|
+
if (config !== null) {
|
|
32706
|
+
return AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision });
|
|
32707
|
+
}
|
|
32708
|
+
const key = JSON.stringify([modelId, cache_dir, local_files_only, revision]);
|
|
32709
|
+
return memoizePromise(
|
|
32710
|
+
key,
|
|
32711
|
+
() => AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision })
|
|
32712
|
+
);
|
|
32713
|
+
}
|
|
31372
32714
|
async function get_model_files(modelId, { config = null, dtype: overrideDtype = null, device: overrideDevice = null, model_file_name = null } = {}) {
|
|
31373
|
-
config = await
|
|
32715
|
+
config = await get_config(modelId, { config });
|
|
31374
32716
|
const files = [
|
|
31375
32717
|
// Add config.json (always loaded)
|
|
31376
32718
|
"config.json"
|
|
@@ -31431,74 +32773,14 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
|
|
|
31431
32773
|
files.push(dataFilePath);
|
|
31432
32774
|
}
|
|
31433
32775
|
};
|
|
31434
|
-
const
|
|
31435
|
-
|
|
31436
|
-
add_model_file(
|
|
31437
|
-
|
|
31438
|
-
|
|
31439
|
-
|
|
31440
|
-
|
|
31441
|
-
|
|
31442
|
-
add_model_file("decoder_model_merged");
|
|
31443
|
-
files.push("generation_config.json");
|
|
31444
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
31445
|
-
add_model_file("model", "vision_encoder");
|
|
31446
|
-
add_model_file("prompt_encoder_mask_decoder");
|
|
31447
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
31448
|
-
add_model_file("model", "encoder_model");
|
|
31449
|
-
add_model_file("decoder_model_merged");
|
|
31450
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
31451
|
-
add_model_file("embed_tokens");
|
|
31452
|
-
add_model_file("vision_encoder");
|
|
31453
|
-
add_model_file("decoder_model_merged");
|
|
31454
|
-
if (config.is_encoder_decoder) {
|
|
31455
|
-
add_model_file("model", "encoder_model");
|
|
31456
|
-
}
|
|
31457
|
-
files.push("generation_config.json");
|
|
31458
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
31459
|
-
add_model_file("embed_tokens");
|
|
31460
|
-
add_model_file("audio_encoder");
|
|
31461
|
-
add_model_file("decoder_model_merged");
|
|
31462
|
-
files.push("generation_config.json");
|
|
31463
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
31464
|
-
add_model_file("embed_tokens");
|
|
31465
|
-
add_model_file("audio_encoder");
|
|
31466
|
-
add_model_file("vision_encoder");
|
|
31467
|
-
add_model_file("decoder_model_merged");
|
|
31468
|
-
files.push("generation_config.json");
|
|
31469
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
31470
|
-
add_model_file("model", "text_encoder");
|
|
31471
|
-
add_model_file("decoder_model_merged");
|
|
31472
|
-
add_model_file("encodec_decode");
|
|
31473
|
-
files.push("generation_config.json");
|
|
31474
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
31475
|
-
add_model_file("prepare_inputs_embeds");
|
|
31476
|
-
add_model_file("model", "language_model");
|
|
31477
|
-
add_model_file("lm_head");
|
|
31478
|
-
add_model_file("gen_head");
|
|
31479
|
-
add_model_file("gen_img_embeds");
|
|
31480
|
-
add_model_file("image_decode");
|
|
31481
|
-
files.push("generation_config.json");
|
|
31482
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
31483
|
-
add_model_file("prepare_inputs_embeds");
|
|
31484
|
-
add_model_file("model");
|
|
31485
|
-
add_model_file("vision_encoder");
|
|
31486
|
-
files.push("generation_config.json");
|
|
31487
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
31488
|
-
add_model_file("embed_tokens");
|
|
31489
|
-
add_model_file("speech_encoder");
|
|
31490
|
-
add_model_file("model", "language_model");
|
|
31491
|
-
add_model_file("conditional_decoder");
|
|
31492
|
-
files.push("generation_config.json");
|
|
31493
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
31494
|
-
add_model_file("encoder_model");
|
|
31495
|
-
add_model_file("decoder_model");
|
|
31496
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
31497
|
-
add_model_file("text_encoder");
|
|
31498
|
-
add_model_file("latent_denoiser");
|
|
31499
|
-
add_model_file("voice_decoder");
|
|
31500
|
-
} else {
|
|
31501
|
-
add_model_file("model", singleModelName);
|
|
32776
|
+
const { sessions, optional_configs } = getSessionsConfig(modelType, config, { model_file_name });
|
|
32777
|
+
for (const [sessionKey, baseName] of Object.entries(sessions)) {
|
|
32778
|
+
add_model_file(sessionKey, baseName);
|
|
32779
|
+
}
|
|
32780
|
+
if (optional_configs) {
|
|
32781
|
+
for (const configFile of Object.values(optional_configs)) {
|
|
32782
|
+
files.push(configFile);
|
|
32783
|
+
}
|
|
31502
32784
|
}
|
|
31503
32785
|
return files;
|
|
31504
32786
|
}
|
|
@@ -31949,25 +33231,25 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
|
|
|
31949
33231
|
|
|
31950
33232
|
// src/utils/model_registry/is_cached.js
|
|
31951
33233
|
async function check_files_cache(modelId, files, options = {}) {
|
|
31952
|
-
const
|
|
31953
|
-
if (!
|
|
33234
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
33235
|
+
if (!cache2) {
|
|
31954
33236
|
const fileStatuses2 = files.map((filename) => ({ file: filename, cached: false }));
|
|
31955
33237
|
return { allCached: false, files: fileStatuses2 };
|
|
31956
33238
|
}
|
|
31957
33239
|
const fileStatuses = await Promise.all(
|
|
31958
33240
|
files.map(async (filename) => {
|
|
31959
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
31960
|
-
const cached = await checkCachedResource(
|
|
33241
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
33242
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
31961
33243
|
return { file: filename, cached: !!cached };
|
|
31962
33244
|
})
|
|
31963
33245
|
);
|
|
31964
33246
|
return { allCached: fileStatuses.every((f) => f.cached), files: fileStatuses };
|
|
31965
33247
|
}
|
|
31966
33248
|
async function is_file_cached(modelId, filename, options = {}) {
|
|
31967
|
-
const
|
|
31968
|
-
if (!
|
|
31969
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
31970
|
-
return !!await checkCachedResource(
|
|
33249
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
33250
|
+
if (!cache2) return false;
|
|
33251
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
33252
|
+
return !!await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
31971
33253
|
}
|
|
31972
33254
|
async function is_cached(modelId, options = {}) {
|
|
31973
33255
|
if (!modelId) {
|
|
@@ -32014,26 +33296,26 @@ async function is_pipeline_cached_files(task, modelId, options = {}) {
|
|
|
32014
33296
|
|
|
32015
33297
|
// src/utils/model_registry/clear_cache.js
|
|
32016
33298
|
async function clear_files_from_cache(modelId, files, options = {}) {
|
|
32017
|
-
const
|
|
32018
|
-
if (!
|
|
33299
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
33300
|
+
if (!cache2) {
|
|
32019
33301
|
return {
|
|
32020
33302
|
filesDeleted: 0,
|
|
32021
33303
|
filesCached: 0,
|
|
32022
33304
|
files: files.map((filename) => ({ file: filename, deleted: false, wasCached: false }))
|
|
32023
33305
|
};
|
|
32024
33306
|
}
|
|
32025
|
-
if (!
|
|
33307
|
+
if (!cache2.delete) {
|
|
32026
33308
|
throw new Error("Cache does not support delete operation");
|
|
32027
33309
|
}
|
|
32028
33310
|
const results = await Promise.all(
|
|
32029
33311
|
files.map(async (filename) => {
|
|
32030
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
32031
|
-
const cached = await checkCachedResource(
|
|
33312
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
33313
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
32032
33314
|
const wasCached = !!cached;
|
|
32033
33315
|
let deleted = false;
|
|
32034
33316
|
if (wasCached) {
|
|
32035
|
-
const deletedWithProposed = await
|
|
32036
|
-
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await
|
|
33317
|
+
const deletedWithProposed = await cache2.delete(proposedCacheKey);
|
|
33318
|
+
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache2.delete(localPath) : false;
|
|
32037
33319
|
deleted = deletedWithProposed || deletedWithLocal;
|
|
32038
33320
|
}
|
|
32039
33321
|
return { file: filename, deleted, wasCached };
|
|
@@ -32383,6 +33665,9 @@ export {
|
|
|
32383
33665
|
BloomModel,
|
|
32384
33666
|
BloomPreTrainedModel,
|
|
32385
33667
|
BloomTokenizer,
|
|
33668
|
+
CHMv2ForDepthEstimation,
|
|
33669
|
+
CHMv2ImageProcessor,
|
|
33670
|
+
CHMv2PreTrainedModel,
|
|
32386
33671
|
CLIPFeatureExtractor,
|
|
32387
33672
|
CLIPImageProcessor,
|
|
32388
33673
|
CLIPModel,
|
|
@@ -32478,6 +33763,9 @@ export {
|
|
|
32478
33763
|
DebertaV2Tokenizer,
|
|
32479
33764
|
DecisionTransformerModel,
|
|
32480
33765
|
DecisionTransformerPreTrainedModel,
|
|
33766
|
+
DeepseekV3ForCausalLM,
|
|
33767
|
+
DeepseekV3Model,
|
|
33768
|
+
DeepseekV3PreTrainedModel,
|
|
32481
33769
|
DeiTFeatureExtractor,
|
|
32482
33770
|
DeiTForImageClassification,
|
|
32483
33771
|
DeiTImageProcessor,
|
|
@@ -32514,6 +33802,7 @@ export {
|
|
|
32514
33802
|
DonutImageProcessor,
|
|
32515
33803
|
DonutSwinModel,
|
|
32516
33804
|
DonutSwinPreTrainedModel,
|
|
33805
|
+
DynamicCache,
|
|
32517
33806
|
EdgeTamModel,
|
|
32518
33807
|
EfficientNetForImageClassification,
|
|
32519
33808
|
EfficientNetImageProcessor,
|
|
@@ -32537,6 +33826,11 @@ export {
|
|
|
32537
33826
|
EsmModel,
|
|
32538
33827
|
EsmPreTrainedModel,
|
|
32539
33828
|
EsmTokenizer,
|
|
33829
|
+
EuroBertForMaskedLM,
|
|
33830
|
+
EuroBertForSequenceClassification,
|
|
33831
|
+
EuroBertForTokenClassification,
|
|
33832
|
+
EuroBertModel,
|
|
33833
|
+
EuroBertPreTrainedModel,
|
|
32540
33834
|
ExaoneForCausalLM,
|
|
32541
33835
|
ExaoneModel,
|
|
32542
33836
|
ExaonePreTrainedModel,
|
|
@@ -32586,6 +33880,7 @@ export {
|
|
|
32586
33880
|
Gemma3Model,
|
|
32587
33881
|
Gemma3PreTrainedModel,
|
|
32588
33882
|
Gemma3nAudioFeatureExtractor,
|
|
33883
|
+
Gemma3nForCausalLM,
|
|
32589
33884
|
Gemma3nForConditionalGeneration,
|
|
32590
33885
|
Gemma3nPreTrainedModel,
|
|
32591
33886
|
Gemma3nProcessor,
|
|
@@ -32593,8 +33888,14 @@ export {
|
|
|
32593
33888
|
GemmaModel,
|
|
32594
33889
|
GemmaPreTrainedModel,
|
|
32595
33890
|
GemmaTokenizer,
|
|
33891
|
+
Glm46VImageProcessor,
|
|
33892
|
+
Glm46VProcessor,
|
|
32596
33893
|
GlmForCausalLM,
|
|
32597
33894
|
GlmModel,
|
|
33895
|
+
GlmMoeDsaForCausalLM,
|
|
33896
|
+
GlmMoeDsaModel,
|
|
33897
|
+
GlmMoeDsaPreTrainedModel,
|
|
33898
|
+
GlmOcrForConditionalGeneration,
|
|
32598
33899
|
GlmPreTrainedModel,
|
|
32599
33900
|
GptOssForCausalLM,
|
|
32600
33901
|
GptOssModel,
|
|
@@ -32605,6 +33906,9 @@ export {
|
|
|
32605
33906
|
GraniteMoeHybridModel,
|
|
32606
33907
|
GraniteMoeHybridPreTrainedModel,
|
|
32607
33908
|
GranitePreTrainedModel,
|
|
33909
|
+
GraniteSpeechFeatureExtractor,
|
|
33910
|
+
GraniteSpeechForConditionalGeneration,
|
|
33911
|
+
GraniteSpeechProcessor,
|
|
32608
33912
|
GroundingDinoForObjectDetection,
|
|
32609
33913
|
GroundingDinoImageProcessor,
|
|
32610
33914
|
GroundingDinoPreTrainedModel,
|
|
@@ -32630,7 +33934,6 @@ export {
|
|
|
32630
33934
|
IJepaPreTrainedModel,
|
|
32631
33935
|
Idefics3ForConditionalGeneration,
|
|
32632
33936
|
Idefics3ImageProcessor,
|
|
32633
|
-
Idefics3PreTrainedModel,
|
|
32634
33937
|
Idefics3Processor,
|
|
32635
33938
|
ImageClassificationPipeline,
|
|
32636
33939
|
ImageFeatureExtractionPipeline,
|
|
@@ -32655,6 +33958,10 @@ export {
|
|
|
32655
33958
|
Lfm2MoeModel,
|
|
32656
33959
|
Lfm2MoePreTrainedModel,
|
|
32657
33960
|
Lfm2PreTrainedModel,
|
|
33961
|
+
Lfm2VlForConditionalGeneration,
|
|
33962
|
+
Lfm2VlImageProcessor,
|
|
33963
|
+
Lfm2VlProcessor,
|
|
33964
|
+
LightOnOcrForConditionalGeneration,
|
|
32658
33965
|
LiteWhisperForConditionalGeneration,
|
|
32659
33966
|
Llama4ForCausalLM,
|
|
32660
33967
|
Llama4PreTrainedModel,
|
|
@@ -32724,6 +34031,9 @@ export {
|
|
|
32724
34031
|
MimiPreTrainedModel,
|
|
32725
34032
|
MinLengthLogitsProcessor,
|
|
32726
34033
|
MinNewTokensLengthLogitsProcessor,
|
|
34034
|
+
Mistral4ForCausalLM,
|
|
34035
|
+
Mistral4Model,
|
|
34036
|
+
Mistral4PreTrainedModel,
|
|
32727
34037
|
MistralForCausalLM,
|
|
32728
34038
|
MistralModel,
|
|
32729
34039
|
MistralPreTrainedModel,
|
|
@@ -32795,6 +34105,9 @@ export {
|
|
|
32795
34105
|
NanoChatForCausalLM,
|
|
32796
34106
|
NanoChatModel,
|
|
32797
34107
|
NanoChatPreTrainedModel,
|
|
34108
|
+
NemotronHForCausalLM,
|
|
34109
|
+
NemotronHModel,
|
|
34110
|
+
NemotronHPreTrainedModel,
|
|
32798
34111
|
NeoBertForMaskedLM,
|
|
32799
34112
|
NeoBertForQuestionAnswering,
|
|
32800
34113
|
NeoBertForSequenceClassification,
|
|
@@ -32838,7 +34151,6 @@ export {
|
|
|
32838
34151
|
Owlv2Model,
|
|
32839
34152
|
Owlv2PreTrainedModel,
|
|
32840
34153
|
PaliGemmaForConditionalGeneration,
|
|
32841
|
-
PaliGemmaPreTrainedModel,
|
|
32842
34154
|
PaliGemmaProcessor,
|
|
32843
34155
|
ParakeetFeatureExtractor,
|
|
32844
34156
|
ParakeetForCTC,
|
|
@@ -32882,10 +34194,12 @@ export {
|
|
|
32882
34194
|
Qwen2MoePreTrainedModel,
|
|
32883
34195
|
Qwen2PreTrainedModel,
|
|
32884
34196
|
Qwen2Tokenizer,
|
|
34197
|
+
Qwen2VLForCausalLM,
|
|
32885
34198
|
Qwen2VLForConditionalGeneration,
|
|
32886
34199
|
Qwen2VLImageProcessor,
|
|
32887
34200
|
Qwen2VLPreTrainedModel,
|
|
32888
34201
|
Qwen2VLProcessor,
|
|
34202
|
+
Qwen2_5_VLForCausalLM,
|
|
32889
34203
|
Qwen2_5_VLForConditionalGeneration,
|
|
32890
34204
|
Qwen2_5_VLProcessor,
|
|
32891
34205
|
Qwen3ForCausalLM,
|
|
@@ -32897,10 +34211,14 @@ export {
|
|
|
32897
34211
|
Qwen3NextModel,
|
|
32898
34212
|
Qwen3NextPreTrainedModel,
|
|
32899
34213
|
Qwen3PreTrainedModel,
|
|
34214
|
+
Qwen3VLForCausalLM,
|
|
32900
34215
|
Qwen3VLForConditionalGeneration,
|
|
34216
|
+
Qwen3VLMoeForCausalLM,
|
|
32901
34217
|
Qwen3VLMoeForConditionalGeneration,
|
|
32902
34218
|
Qwen3VLProcessor,
|
|
34219
|
+
Qwen3_5ForCausalLM,
|
|
32903
34220
|
Qwen3_5ForConditionalGeneration,
|
|
34221
|
+
Qwen3_5MoeForCausalLM,
|
|
32904
34222
|
Qwen3_5MoeForConditionalGeneration,
|
|
32905
34223
|
RFDetrForObjectDetection,
|
|
32906
34224
|
RFDetrModel,
|
|
@@ -32972,7 +34290,6 @@ export {
|
|
|
32972
34290
|
SmolLM3ForCausalLM,
|
|
32973
34291
|
SmolLM3Model,
|
|
32974
34292
|
SmolLM3PreTrainedModel,
|
|
32975
|
-
SmolVLMForConditionalGeneration,
|
|
32976
34293
|
Idefics3ImageProcessor as SmolVLMImageProcessor,
|
|
32977
34294
|
Idefics3Processor as SmolVLMProcessor,
|
|
32978
34295
|
SnacDecoderModel,
|
|
@@ -32980,6 +34297,9 @@ export {
|
|
|
32980
34297
|
SnacFeatureExtractor,
|
|
32981
34298
|
SnacModel,
|
|
32982
34299
|
SnacPreTrainedModel,
|
|
34300
|
+
SolarOpenForCausalLM,
|
|
34301
|
+
SolarOpenModel,
|
|
34302
|
+
SolarOpenPreTrainedModel,
|
|
32983
34303
|
SpeechT5FeatureExtractor,
|
|
32984
34304
|
SpeechT5ForSpeechToText,
|
|
32985
34305
|
SpeechT5ForTextToSpeech,
|
|
@@ -33078,6 +34398,10 @@ export {
|
|
|
33078
34398
|
VitsTokenizer,
|
|
33079
34399
|
VoxtralForConditionalGeneration,
|
|
33080
34400
|
VoxtralProcessor,
|
|
34401
|
+
VoxtralRealtimeFeatureExtractor,
|
|
34402
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
34403
|
+
VoxtralRealtimePreTrainedModel,
|
|
34404
|
+
VoxtralRealtimeProcessor,
|
|
33081
34405
|
Wav2Vec2BertForCTC,
|
|
33082
34406
|
Wav2Vec2BertForSequenceClassification,
|
|
33083
34407
|
Wav2Vec2BertModel,
|
|
@@ -33173,7 +34497,7 @@ export {
|
|
|
33173
34497
|
|
|
33174
34498
|
onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
|
|
33175
34499
|
(*!
|
|
33176
|
-
* ONNX Runtime Web v1.25.0-dev.
|
|
34500
|
+
* ONNX Runtime Web v1.25.0-dev.20260307-d626b568e0
|
|
33177
34501
|
* Copyright (c) Microsoft Corporation. All rights reserved.
|
|
33178
34502
|
* Licensed under the MIT License.
|
|
33179
34503
|
*)
|