@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +1587 -570
- package/dist/transformers.min.js +17 -17
- package/dist/transformers.node.cjs +1605 -573
- package/dist/transformers.node.min.cjs +21 -21
- package/dist/transformers.node.min.mjs +21 -21
- package/dist/transformers.node.mjs +1600 -583
- package/dist/transformers.web.js +1592 -575
- package/dist/transformers.web.min.js +15 -15
- package/package.json +3 -3
- package/src/cache_utils.js +62 -0
- package/src/configs.js +17 -2
- package/src/env.js +8 -1
- package/src/image_processors_utils.js +3 -3
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +1 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/modeling_utils.js +222 -308
- package/src/models/models.js +4 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +3 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +7 -7
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +2 -1
- package/src/models/registry.js +25 -0
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +5 -0
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/get_file_metadata.js +14 -2
- package/src/utils/model_registry/get_model_files.js +52 -78
- package/src/utils/tensor.js +18 -2
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +8 -0
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +17 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/modeling_utils.d.ts +44 -35
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +4 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +2 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +3 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
package/dist/transformers.web.js
CHANGED
|
@@ -14,7 +14,7 @@ var node_path_default = {};
|
|
|
14
14
|
var node_url_default = {};
|
|
15
15
|
|
|
16
16
|
// src/env.js
|
|
17
|
-
var VERSION = "4.0.0-next.6";
|
|
17
|
+
var VERSION = "4.0.0-next.7";
|
|
18
18
|
var HAS_SELF = typeof self !== "undefined";
|
|
19
19
|
var IS_FS_AVAILABLE = !isEmpty(node_fs_default);
|
|
20
20
|
var IS_PATH_AVAILABLE = !isEmpty(node_path_default);
|
|
@@ -142,6 +142,7 @@ var env = {
|
|
|
142
142
|
customCache: null,
|
|
143
143
|
useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
|
|
144
144
|
cacheKey: "transformers-cache",
|
|
145
|
+
experimental_useCrossOriginStorage: false,
|
|
145
146
|
/////////////////// Custom fetch /////////////////////
|
|
146
147
|
fetch: DEFAULT_FETCH
|
|
147
148
|
//////////////////////////////////////////////////////
|
|
@@ -2692,7 +2693,7 @@ var Tokenizer = class {
|
|
|
2692
2693
|
};
|
|
2693
2694
|
var Tokenizer_default = Tokenizer;
|
|
2694
2695
|
|
|
2695
|
-
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.
|
|
2696
|
+
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.6/node_modules/@huggingface/jinja/dist/index.js
|
|
2696
2697
|
var TOKEN_TYPES = Object.freeze({
|
|
2697
2698
|
Text: "Text",
|
|
2698
2699
|
// The text between Jinja statements or expressions
|
|
@@ -4211,7 +4212,11 @@ var Environment = class {
|
|
|
4211
4212
|
["number", (operand) => operand instanceof IntegerValue || operand instanceof FloatValue],
|
|
4212
4213
|
["integer", (operand) => operand instanceof IntegerValue],
|
|
4213
4214
|
["iterable", (operand) => operand.type === "ArrayValue" || operand.type === "StringValue"],
|
|
4214
|
-
["mapping", (operand) => operand
|
|
4215
|
+
["mapping", (operand) => operand instanceof ObjectValue],
|
|
4216
|
+
[
|
|
4217
|
+
"sequence",
|
|
4218
|
+
(operand) => operand instanceof ArrayValue || operand instanceof ObjectValue || operand instanceof StringValue
|
|
4219
|
+
],
|
|
4215
4220
|
[
|
|
4216
4221
|
"lower",
|
|
4217
4222
|
(operand) => {
|
|
@@ -4484,6 +4489,9 @@ var Interpreter = class {
|
|
|
4484
4489
|
applyFilter(operand, filterNode, environment) {
|
|
4485
4490
|
if (filterNode.type === "Identifier") {
|
|
4486
4491
|
const filter = filterNode;
|
|
4492
|
+
if (filter.value === "safe") {
|
|
4493
|
+
return operand;
|
|
4494
|
+
}
|
|
4487
4495
|
if (filter.value === "tojson") {
|
|
4488
4496
|
return new StringValue(toJSON(operand, {}));
|
|
4489
4497
|
}
|
|
@@ -4573,6 +4581,8 @@ var Interpreter = class {
|
|
|
4573
4581
|
return new IntegerValue(Math.floor(operand.value));
|
|
4574
4582
|
case "float":
|
|
4575
4583
|
return new FloatValue(operand.value);
|
|
4584
|
+
case "string":
|
|
4585
|
+
return new StringValue(operand.toString());
|
|
4576
4586
|
default:
|
|
4577
4587
|
throw new Error(`Unknown NumericValue filter: ${filter.value}`);
|
|
4578
4588
|
}
|
|
@@ -5995,9 +6005,216 @@ function toAbsoluteURL(url) {
|
|
|
5995
6005
|
return new URL(url, baseURL).href;
|
|
5996
6006
|
}
|
|
5997
6007
|
|
|
6008
|
+
// src/utils/cache/CrossOriginStorageCache.js
|
|
6009
|
+
var HASH_ALGORITHM = "SHA-256";
|
|
6010
|
+
var HASH_CACHE_NAME = "experimental_transformers-hash-cache";
|
|
6011
|
+
var makeHashDescriptor = (value) => ({ algorithm: HASH_ALGORITHM, value });
|
|
6012
|
+
var CrossOriginStorage = class {
|
|
6013
|
+
/** @type {Promise<Cache> | null} */
|
|
6014
|
+
#hashCache = null;
|
|
6015
|
+
/**
|
|
6016
|
+
* Returns (and lazily opens) the hash cache, reusing the same promise across concurrent callers.
|
|
6017
|
+
* @returns {Promise<Cache>}
|
|
6018
|
+
*/
|
|
6019
|
+
_getHashCache = () => {
|
|
6020
|
+
this.#hashCache ??= caches.open(HASH_CACHE_NAME);
|
|
6021
|
+
return this.#hashCache;
|
|
6022
|
+
};
|
|
6023
|
+
/**
|
|
6024
|
+
* Returns whether the `navigator.crossOriginStorage` API is available in the current environment.
|
|
6025
|
+
* @returns {boolean}
|
|
6026
|
+
*/
|
|
6027
|
+
static isAvailable = () => typeof navigator !== "undefined" && "crossOriginStorage" in navigator;
|
|
6028
|
+
/**
|
|
6029
|
+
* Looks up a cached response for the given URL by resolving its SHA-256 hash and requesting
|
|
6030
|
+
* the corresponding file handle from cross-origin storage.
|
|
6031
|
+
*
|
|
6032
|
+
* Implements `CacheInterface.match`.
|
|
6033
|
+
*
|
|
6034
|
+
* @param {string} request The URL of the resource to look up.
|
|
6035
|
+
* @returns {Promise<Response|undefined>} The cached `Response`, or `undefined` if not found.
|
|
6036
|
+
*/
|
|
6037
|
+
match = async (request) => {
|
|
6038
|
+
const hashValue = await this._getFileHash(request);
|
|
6039
|
+
if (!hashValue) {
|
|
6040
|
+
return void 0;
|
|
6041
|
+
}
|
|
6042
|
+
try {
|
|
6043
|
+
const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashValue)]);
|
|
6044
|
+
const blob = await handle.getFile();
|
|
6045
|
+
return new Response(blob, {
|
|
6046
|
+
headers: {
|
|
6047
|
+
"Content-Length": String(blob.size)
|
|
6048
|
+
}
|
|
6049
|
+
});
|
|
6050
|
+
} catch {
|
|
6051
|
+
return void 0;
|
|
6052
|
+
}
|
|
6053
|
+
};
|
|
6054
|
+
/**
|
|
6055
|
+
* Stores a response in cross-origin storage, keyed by its SHA-256 hash.
|
|
6056
|
+
*
|
|
6057
|
+
* For LFS-backed URLs the hash is resolved cheaply via `_getFileHash` (which checks
|
|
6058
|
+
* `HASH_CACHE_NAME` first, then falls back to fetching the Git LFS pointer file)
|
|
6059
|
+
* without reading the response body a second time.
|
|
6060
|
+
*
|
|
6061
|
+
* For non-LFS resources the hash is unknown upfront. In that case the body is consumed
|
|
6062
|
+
* in the background: the stream is read to compute the content hash, the file is written
|
|
6063
|
+
* into cross-origin storage, and the computed hash is persisted to `HASH_CACHE_NAME`
|
|
6064
|
+
* so that future `match` calls can resolve the file without a network round-trip.
|
|
6065
|
+
*
|
|
6066
|
+
* Implements `CacheInterface.put`.
|
|
6067
|
+
*
|
|
6068
|
+
* @param {string} request The URL of the resource (used as the hash-cache key).
|
|
6069
|
+
* @param {Response} response The response whose body will be written to the cache.
|
|
6070
|
+
* @returns {Promise<void>}
|
|
6071
|
+
*/
|
|
6072
|
+
put = async (request, response) => {
|
|
6073
|
+
const hashValue = await this._getFileHash(request);
|
|
6074
|
+
if (hashValue) {
|
|
6075
|
+
const blob = await response.blob();
|
|
6076
|
+
await this._storeBlobInCOS(blob, hashValue);
|
|
6077
|
+
} else {
|
|
6078
|
+
this._processAndStore(request, response.body);
|
|
6079
|
+
}
|
|
6080
|
+
};
|
|
6081
|
+
/**
|
|
6082
|
+
* Writes a blob into cross-origin storage using the given pre-computed hex hash string.
|
|
6083
|
+
*
|
|
6084
|
+
* @param {Blob} blob
|
|
6085
|
+
* @param {string} hashHex Hex-encoded SHA-256 hash of `blob`.
|
|
6086
|
+
* @returns {Promise<void>}
|
|
6087
|
+
*/
|
|
6088
|
+
_storeBlobInCOS = async (blob, hashHex) => {
|
|
6089
|
+
const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashHex)], {
|
|
6090
|
+
create: true
|
|
6091
|
+
});
|
|
6092
|
+
const writableStream = await handle.createWritable();
|
|
6093
|
+
await writableStream.write(blob);
|
|
6094
|
+
await writableStream.close();
|
|
6095
|
+
};
|
|
6096
|
+
/**
|
|
6097
|
+
* Background task for non-LFS resources: consumes `stream`, computes the SHA-256 hash
|
|
6098
|
+
* of the resulting blob, stores it in cross-origin storage, and persists the computed
|
|
6099
|
+
* hash to `HASH_CACHE_NAME` keyed by `request` so future `match` calls can resolve the
|
|
6100
|
+
* file without a network round-trip.
|
|
6101
|
+
*
|
|
6102
|
+
* Called fire-and-forget from `put` — errors are swallowed so failures never surface to
|
|
6103
|
+
* the caller.
|
|
6104
|
+
*
|
|
6105
|
+
* @param {string} request The original resource URL.
|
|
6106
|
+
* @param {ReadableStream} stream The response body stream to consume.
|
|
6107
|
+
* @returns {Promise<void>}
|
|
6108
|
+
*/
|
|
6109
|
+
_processAndStore = async (request, stream) => {
|
|
6110
|
+
try {
|
|
6111
|
+
const chunks = [];
|
|
6112
|
+
for await (const chunk2 of stream) {
|
|
6113
|
+
chunks.push(chunk2);
|
|
6114
|
+
}
|
|
6115
|
+
const blob = new Blob(chunks);
|
|
6116
|
+
const hashHex = await this._getBlobHash(blob);
|
|
6117
|
+
await this._storeBlobInCOS(blob, hashHex);
|
|
6118
|
+
try {
|
|
6119
|
+
const hashCache = await this._getHashCache();
|
|
6120
|
+
await hashCache.put(request, new Response(hashHex));
|
|
6121
|
+
} catch {
|
|
6122
|
+
}
|
|
6123
|
+
} catch {
|
|
6124
|
+
}
|
|
6125
|
+
};
|
|
6126
|
+
/**
|
|
6127
|
+
* Deletes the cache entry for the given request.
|
|
6128
|
+
*
|
|
6129
|
+
* Removes the hash entry from `HASH_CACHE_NAME`. Note: cross-origin storage itself does not
|
|
6130
|
+
* expose a delete API, so only the local hash mapping is removed. For non-LFS URLs this
|
|
6131
|
+
* permanently prevents `match` from resolving the file. For LFS-backed URLs, `match` will
|
|
6132
|
+
* re-fetch the LFS pointer file on the next call and repopulate the hash cache automatically.
|
|
6133
|
+
*
|
|
6134
|
+
* Implements `CacheInterface.delete`.
|
|
6135
|
+
*
|
|
6136
|
+
* @param {string} request
|
|
6137
|
+
* @returns {Promise<boolean>} Resolves to `true` if the hash entry was deleted, `false` otherwise.
|
|
6138
|
+
*/
|
|
6139
|
+
delete = async (request) => {
|
|
6140
|
+
try {
|
|
6141
|
+
const hashCache = await this._getHashCache();
|
|
6142
|
+
return await hashCache.delete(request);
|
|
6143
|
+
} catch {
|
|
6144
|
+
return false;
|
|
6145
|
+
}
|
|
6146
|
+
};
|
|
6147
|
+
/**
|
|
6148
|
+
* Resolves the SHA-256 hash for a given URL.
|
|
6149
|
+
*
|
|
6150
|
+
* Returns the cached hash immediately if one has been persisted to `HASH_CACHE_NAME`.
|
|
6151
|
+
* Otherwise falls back to `_getLfsFileHash` to retrieve the hash from the Hugging Face
|
|
6152
|
+
* LFS pointer file, persisting the result to `HASH_CACHE_NAME` for future lookups.
|
|
6153
|
+
*
|
|
6154
|
+
* Returns `null` if the hash cannot be determined (e.g. non-LFS URL with no cached entry).
|
|
6155
|
+
*
|
|
6156
|
+
* @param {string} url The resource URL to resolve a hash for.
|
|
6157
|
+
* @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
|
|
6158
|
+
*/
|
|
6159
|
+
_getFileHash = async (url) => {
|
|
6160
|
+
try {
|
|
6161
|
+
const hashCache = await this._getHashCache();
|
|
6162
|
+
const cached = await hashCache.match(url);
|
|
6163
|
+
if (cached) {
|
|
6164
|
+
return cached.text();
|
|
6165
|
+
}
|
|
6166
|
+
const hash = await this._getLfsFileHash(url);
|
|
6167
|
+
if (hash) {
|
|
6168
|
+
await hashCache.put(url, new Response(hash));
|
|
6169
|
+
return hash;
|
|
6170
|
+
}
|
|
6171
|
+
return null;
|
|
6172
|
+
} catch {
|
|
6173
|
+
return null;
|
|
6174
|
+
}
|
|
6175
|
+
};
|
|
6176
|
+
/**
|
|
6177
|
+
* Attempts to retrieve the SHA-256 hash for a Hugging Face resource URL from its raw
|
|
6178
|
+
* Git LFS pointer file.
|
|
6179
|
+
*
|
|
6180
|
+
* Only applicable to URLs containing `/resolve/` (i.e. Hugging Face resolved file URLs).
|
|
6181
|
+
* The `/resolve/` segment is rewritten to `/raw/` to fetch the LFS pointer directly.
|
|
6182
|
+
* Returns `null` for non-LFS URLs or when the network request fails.
|
|
6183
|
+
*
|
|
6184
|
+
* @see https://huggingface.co/docs/hub/en/storage-backends#xet
|
|
6185
|
+
* @param {string} url The resolved Hugging Face URL of the resource.
|
|
6186
|
+
* @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
|
|
6187
|
+
*/
|
|
6188
|
+
_getLfsFileHash = async (url) => {
|
|
6189
|
+
if (!url.includes("/resolve/")) {
|
|
6190
|
+
return null;
|
|
6191
|
+
}
|
|
6192
|
+
const rawUrl = url.replace("/resolve/", "/raw/");
|
|
6193
|
+
try {
|
|
6194
|
+
const text = await fetch(rawUrl).then((r) => r.text());
|
|
6195
|
+
const match = text.match(/^oid sha256:([0-9a-f]+)$/m);
|
|
6196
|
+
return match ? match[1] : null;
|
|
6197
|
+
} catch {
|
|
6198
|
+
return null;
|
|
6199
|
+
}
|
|
6200
|
+
};
|
|
6201
|
+
/**
|
|
6202
|
+
* Computes the SHA-256 hash of a `Blob`'s contents.
|
|
6203
|
+
*
|
|
6204
|
+
* @param {Blob} blob The blob to hash.
|
|
6205
|
+
* @returns {Promise<string>} The lowercase hex-encoded SHA-256 hash.
|
|
6206
|
+
*/
|
|
6207
|
+
_getBlobHash = async (blob) => {
|
|
6208
|
+
const arrayBuffer = await blob.arrayBuffer();
|
|
6209
|
+
const hashBuffer = await crypto.subtle.digest(HASH_ALGORITHM, arrayBuffer);
|
|
6210
|
+
const hashArray = Array.from(new Uint8Array(hashBuffer));
|
|
6211
|
+
return hashArray.map((byte) => byte.toString(16).padStart(2, "0")).join("");
|
|
6212
|
+
};
|
|
6213
|
+
};
|
|
6214
|
+
|
|
5998
6215
|
// src/utils/cache.js
|
|
5999
6216
|
async function getCache(file_cache_dir = null) {
|
|
6000
|
-
let cache = null;
|
|
6217
|
+
let cache2 = null;
|
|
6001
6218
|
if (env.useCustomCache) {
|
|
6002
6219
|
if (!env.customCache) {
|
|
6003
6220
|
throw Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
|
|
@@ -6007,30 +6224,33 @@ async function getCache(file_cache_dir = null) {
|
|
|
6007
6224
|
"`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
|
|
6008
6225
|
);
|
|
6009
6226
|
}
|
|
6010
|
-
|
|
6227
|
+
cache2 = env.customCache;
|
|
6228
|
+
}
|
|
6229
|
+
if (!cache2 && env.experimental_useCrossOriginStorage && CrossOriginStorage.isAvailable()) {
|
|
6230
|
+
cache2 = new CrossOriginStorage();
|
|
6011
6231
|
}
|
|
6012
|
-
if (!cache && env.useBrowserCache) {
|
|
6232
|
+
if (!cache2 && env.useBrowserCache) {
|
|
6013
6233
|
if (typeof caches === "undefined") {
|
|
6014
6234
|
throw Error("Browser cache is not available in this environment.");
|
|
6015
6235
|
}
|
|
6016
6236
|
try {
|
|
6017
|
-
|
|
6237
|
+
cache2 = await caches.open(env.cacheKey);
|
|
6018
6238
|
} catch (e) {
|
|
6019
6239
|
logger.warn("An error occurred while opening the browser cache:", e);
|
|
6020
6240
|
}
|
|
6021
6241
|
}
|
|
6022
|
-
if (!cache && env.useFSCache) {
|
|
6242
|
+
if (!cache2 && env.useFSCache) {
|
|
6023
6243
|
if (!apis.IS_FS_AVAILABLE) {
|
|
6024
6244
|
throw Error("File System Cache is not available in this environment.");
|
|
6025
6245
|
}
|
|
6026
|
-
|
|
6246
|
+
cache2 = new FileCache(file_cache_dir ?? env.cacheDir);
|
|
6027
6247
|
}
|
|
6028
|
-
return cache;
|
|
6248
|
+
return cache2;
|
|
6029
6249
|
}
|
|
6030
|
-
async function tryCache(cache, ...names) {
|
|
6250
|
+
async function tryCache(cache2, ...names) {
|
|
6031
6251
|
for (let name of names) {
|
|
6032
6252
|
try {
|
|
6033
|
-
let result = await cache.match(name);
|
|
6253
|
+
let result = await cache2.match(name);
|
|
6034
6254
|
if (result) return result;
|
|
6035
6255
|
} catch (e) {
|
|
6036
6256
|
continue;
|
|
@@ -6039,6 +6259,83 @@ async function tryCache(cache, ...names) {
|
|
|
6039
6259
|
return void 0;
|
|
6040
6260
|
}
|
|
6041
6261
|
|
|
6262
|
+
// src/utils/lru_cache.js
|
|
6263
|
+
var LRUCache2 = class {
|
|
6264
|
+
/** @type {number} */
|
|
6265
|
+
#capacity;
|
|
6266
|
+
/** @type {Map<any, any>} */
|
|
6267
|
+
#cache;
|
|
6268
|
+
/**
|
|
6269
|
+
* Creates an LRUCache instance.
|
|
6270
|
+
* @param {number} capacity The maximum number of items the cache can hold.
|
|
6271
|
+
*/
|
|
6272
|
+
constructor(capacity) {
|
|
6273
|
+
this.#capacity = capacity;
|
|
6274
|
+
this.#cache = /* @__PURE__ */ new Map();
|
|
6275
|
+
}
|
|
6276
|
+
/**
|
|
6277
|
+
* Retrieves the value associated with the given key and marks the key as recently used.
|
|
6278
|
+
* @param {any} key The key to retrieve.
|
|
6279
|
+
* @returns {any} The value associated with the key, or undefined if the key does not exist.
|
|
6280
|
+
*/
|
|
6281
|
+
get(key) {
|
|
6282
|
+
if (!this.#cache.has(key)) return void 0;
|
|
6283
|
+
const value = this.#cache.get(key);
|
|
6284
|
+
this.#cache.delete(key);
|
|
6285
|
+
this.#cache.set(key, value);
|
|
6286
|
+
return value;
|
|
6287
|
+
}
|
|
6288
|
+
/**
|
|
6289
|
+
* Inserts or updates the key-value pair in the cache.
|
|
6290
|
+
* If the key already exists, it is updated and marked as recently used.
|
|
6291
|
+
* If the cache exceeds its capacity, the least recently used item is evicted.
|
|
6292
|
+
* @param {any} key The key to add or update.
|
|
6293
|
+
* @param {any} value The value to associate with the key.
|
|
6294
|
+
*/
|
|
6295
|
+
put(key, value) {
|
|
6296
|
+
if (this.#cache.has(key)) {
|
|
6297
|
+
this.#cache.delete(key);
|
|
6298
|
+
}
|
|
6299
|
+
this.#cache.set(key, value);
|
|
6300
|
+
if (this.#cache.size > this.#capacity) {
|
|
6301
|
+
this.#cache.delete(this.#cache.keys().next().value);
|
|
6302
|
+
}
|
|
6303
|
+
}
|
|
6304
|
+
/**
|
|
6305
|
+
* Removes the entry for the given key from the cache.
|
|
6306
|
+
* @param {any} key The key to delete.
|
|
6307
|
+
* @returns {boolean} `true` if the entry existed and was removed, `false` otherwise.
|
|
6308
|
+
*/
|
|
6309
|
+
delete(key) {
|
|
6310
|
+
return this.#cache.delete(key);
|
|
6311
|
+
}
|
|
6312
|
+
/**
|
|
6313
|
+
* Clears the cache.
|
|
6314
|
+
*/
|
|
6315
|
+
clear() {
|
|
6316
|
+
this.#cache.clear();
|
|
6317
|
+
}
|
|
6318
|
+
};
|
|
6319
|
+
|
|
6320
|
+
// src/utils/memoize_promise.js
|
|
6321
|
+
var MAX_CACHE_SIZE = 100;
|
|
6322
|
+
var cache = new LRUCache2(MAX_CACHE_SIZE);
|
|
6323
|
+
function memoizePromise(key, factory) {
|
|
6324
|
+
const cached = cache.get(key);
|
|
6325
|
+
if (cached !== void 0) {
|
|
6326
|
+
return cached;
|
|
6327
|
+
}
|
|
6328
|
+
const promise = factory().then(
|
|
6329
|
+
(value) => value,
|
|
6330
|
+
(err) => {
|
|
6331
|
+
cache.delete(key);
|
|
6332
|
+
return Promise.reject(err);
|
|
6333
|
+
}
|
|
6334
|
+
);
|
|
6335
|
+
cache.put(key, promise);
|
|
6336
|
+
return promise;
|
|
6337
|
+
}
|
|
6338
|
+
|
|
6042
6339
|
// src/utils/model_registry/get_file_metadata.js
|
|
6043
6340
|
async function fetch_file_head(urlOrPath) {
|
|
6044
6341
|
if (!isValidUrl(urlOrPath, ["http:", "https:"])) {
|
|
@@ -6046,17 +6343,27 @@ async function fetch_file_head(urlOrPath) {
|
|
|
6046
6343
|
}
|
|
6047
6344
|
const headers = getFetchHeaders(urlOrPath);
|
|
6048
6345
|
headers.set("Range", "bytes=0-0");
|
|
6049
|
-
return env.fetch(urlOrPath, { method: "GET", headers });
|
|
6346
|
+
return env.fetch(urlOrPath, { method: "GET", headers, cache: "no-store" });
|
|
6347
|
+
}
|
|
6348
|
+
function get_file_metadata(path_or_repo_id, filename, options = {}) {
|
|
6349
|
+
const key = JSON.stringify([
|
|
6350
|
+
path_or_repo_id,
|
|
6351
|
+
filename,
|
|
6352
|
+
options?.revision,
|
|
6353
|
+
options?.cache_dir,
|
|
6354
|
+
options?.local_files_only
|
|
6355
|
+
]);
|
|
6356
|
+
return memoizePromise(key, () => _get_file_metadata(path_or_repo_id, filename, options));
|
|
6050
6357
|
}
|
|
6051
|
-
async function
|
|
6052
|
-
const cache = await getCache(options?.cache_dir);
|
|
6358
|
+
async function _get_file_metadata(path_or_repo_id, filename, options) {
|
|
6359
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6053
6360
|
const { localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6054
6361
|
path_or_repo_id,
|
|
6055
6362
|
filename,
|
|
6056
6363
|
options,
|
|
6057
|
-
|
|
6364
|
+
cache2
|
|
6058
6365
|
);
|
|
6059
|
-
const cachedResponse = await checkCachedResource(cache, localPath, proposedCacheKey);
|
|
6366
|
+
const cachedResponse = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6060
6367
|
if (cachedResponse !== void 0 && typeof cachedResponse !== "string") {
|
|
6061
6368
|
const size = cachedResponse.headers.get("content-length");
|
|
6062
6369
|
const contentType = cachedResponse.headers.get("content-type");
|
|
@@ -6154,7 +6461,7 @@ function getFetchHeaders(urlOrPath) {
|
|
|
6154
6461
|
}
|
|
6155
6462
|
return headers;
|
|
6156
6463
|
}
|
|
6157
|
-
function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = null) {
|
|
6464
|
+
function buildResourcePaths(path_or_repo_id, filename, options = {}, cache2 = null) {
|
|
6158
6465
|
const revision = options.revision ?? "main";
|
|
6159
6466
|
const requestURL = pathJoin(path_or_repo_id, filename);
|
|
6160
6467
|
const validModelId = isValidHfModelId(path_or_repo_id);
|
|
@@ -6164,7 +6471,7 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
6164
6471
|
env.remotePathTemplate.replaceAll("{model}", path_or_repo_id).replaceAll("{revision}", encodeURIComponent(revision)),
|
|
6165
6472
|
filename
|
|
6166
6473
|
);
|
|
6167
|
-
const proposedCacheKey = cache instanceof FileCache ? (
|
|
6474
|
+
const proposedCacheKey = cache2 instanceof FileCache ? (
|
|
6168
6475
|
// Choose cache key for filesystem cache
|
|
6169
6476
|
// When using the main revision (default), we use the request URL as the cache key.
|
|
6170
6477
|
// If a specific revision is requested, we account for this in the cache key.
|
|
@@ -6178,14 +6485,14 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
6178
6485
|
validModelId
|
|
6179
6486
|
};
|
|
6180
6487
|
}
|
|
6181
|
-
async function checkCachedResource(cache, localPath, proposedCacheKey) {
|
|
6182
|
-
if (!cache) {
|
|
6488
|
+
async function checkCachedResource(cache2, localPath, proposedCacheKey) {
|
|
6489
|
+
if (!cache2) {
|
|
6183
6490
|
return void 0;
|
|
6184
6491
|
}
|
|
6185
|
-
return await tryCache(cache, localPath, proposedCacheKey);
|
|
6492
|
+
return await tryCache(cache2, localPath, proposedCacheKey);
|
|
6186
6493
|
}
|
|
6187
|
-
async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, response, result, options = {}) {
|
|
6188
|
-
if (await cache.match(cacheKey) !== void 0) {
|
|
6494
|
+
async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options = {}) {
|
|
6495
|
+
if (await cache2.match(cacheKey) !== void 0) {
|
|
6189
6496
|
return;
|
|
6190
6497
|
}
|
|
6191
6498
|
if (!result) {
|
|
@@ -6195,14 +6502,14 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
6195
6502
|
file: filename,
|
|
6196
6503
|
...data
|
|
6197
6504
|
}) : void 0;
|
|
6198
|
-
await cache.put(
|
|
6505
|
+
await cache2.put(
|
|
6199
6506
|
cacheKey,
|
|
6200
6507
|
/** @type {Response} */
|
|
6201
6508
|
response,
|
|
6202
6509
|
wrapped_progress
|
|
6203
6510
|
);
|
|
6204
6511
|
} else if (typeof response !== "string") {
|
|
6205
|
-
await cache.put(
|
|
6512
|
+
await cache2.put(
|
|
6206
6513
|
cacheKey,
|
|
6207
6514
|
new Response(
|
|
6208
6515
|
/** @type {any} */
|
|
@@ -6216,17 +6523,17 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
6216
6523
|
});
|
|
6217
6524
|
}
|
|
6218
6525
|
}
|
|
6219
|
-
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache = null) {
|
|
6526
|
+
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache2 = null) {
|
|
6220
6527
|
const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6221
6528
|
path_or_repo_id,
|
|
6222
6529
|
filename,
|
|
6223
6530
|
options,
|
|
6224
|
-
|
|
6531
|
+
cache2
|
|
6225
6532
|
);
|
|
6226
6533
|
let cacheKey;
|
|
6227
6534
|
let toCacheResponse = false;
|
|
6228
6535
|
let response;
|
|
6229
|
-
response = await checkCachedResource(cache, localPath, proposedCacheKey);
|
|
6536
|
+
response = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6230
6537
|
const cacheHit = response !== void 0;
|
|
6231
6538
|
if (!cacheHit) {
|
|
6232
6539
|
if (env.allowLocalModels) {
|
|
@@ -6267,7 +6574,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6267
6574
|
}
|
|
6268
6575
|
cacheKey = proposedCacheKey;
|
|
6269
6576
|
}
|
|
6270
|
-
toCacheResponse =
|
|
6577
|
+
toCacheResponse = cache2 && // 1. A caching system is available
|
|
6271
6578
|
typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment)
|
|
6272
6579
|
response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`)
|
|
6273
6580
|
response.status === 200;
|
|
@@ -6329,7 +6636,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6329
6636
|
// i.e., do not cache FileResponses (prevents duplication)
|
|
6330
6637
|
toCacheResponse && cacheKey && typeof response !== "string"
|
|
6331
6638
|
) {
|
|
6332
|
-
await storeCachedResource(path_or_repo_id, filename,
|
|
6639
|
+
await storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options);
|
|
6333
6640
|
}
|
|
6334
6641
|
dispatchCallback(options.progress_callback, {
|
|
6335
6642
|
status: "done",
|
|
@@ -6345,7 +6652,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6345
6652
|
if (response instanceof FileResponse) {
|
|
6346
6653
|
return response.filePath;
|
|
6347
6654
|
}
|
|
6348
|
-
const cachedResponse = await
|
|
6655
|
+
const cachedResponse = await cache2?.match(cacheKey);
|
|
6349
6656
|
if (cachedResponse instanceof FileResponse) {
|
|
6350
6657
|
return cachedResponse.filePath;
|
|
6351
6658
|
} else if (cachedResponse instanceof Response) {
|
|
@@ -6372,8 +6679,8 @@ async function getModelFile(path_or_repo_id, filename, fatal = true, options = {
|
|
|
6372
6679
|
name: path_or_repo_id,
|
|
6373
6680
|
file: filename
|
|
6374
6681
|
});
|
|
6375
|
-
const
|
|
6376
|
-
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path,
|
|
6682
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6683
|
+
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache2);
|
|
6377
6684
|
}
|
|
6378
6685
|
async function getModelText(modelPath, fileName, fatal = true, options = {}) {
|
|
6379
6686
|
const buffer = await getModelFile(modelPath, fileName, fatal, options, false);
|
|
@@ -7176,11 +7483,11 @@ import * as ONNX_WEB from "onnxruntime-web/webgpu";
|
|
|
7176
7483
|
// src/backends/utils/cacheWasm.js
|
|
7177
7484
|
async function loadAndCacheFile(url) {
|
|
7178
7485
|
const fileName = url.split("/").pop();
|
|
7179
|
-
let
|
|
7486
|
+
let cache2;
|
|
7180
7487
|
try {
|
|
7181
|
-
|
|
7182
|
-
if (
|
|
7183
|
-
const result = await
|
|
7488
|
+
cache2 = await getCache();
|
|
7489
|
+
if (cache2) {
|
|
7490
|
+
const result = await cache2.match(url);
|
|
7184
7491
|
if (result) {
|
|
7185
7492
|
return result;
|
|
7186
7493
|
}
|
|
@@ -7192,9 +7499,9 @@ async function loadAndCacheFile(url) {
|
|
|
7192
7499
|
if (!response.ok) {
|
|
7193
7500
|
throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
|
|
7194
7501
|
}
|
|
7195
|
-
if (
|
|
7502
|
+
if (cache2) {
|
|
7196
7503
|
try {
|
|
7197
|
-
await
|
|
7504
|
+
await cache2.put(url, response.clone());
|
|
7198
7505
|
} catch (e) {
|
|
7199
7506
|
logger.warn(`Failed to cache ${fileName}:`, e);
|
|
7200
7507
|
}
|
|
@@ -9046,9 +9353,23 @@ var Tensor2 = class _Tensor {
|
|
|
9046
9353
|
throw Error(`Unsupported norm: ${p}`);
|
|
9047
9354
|
}
|
|
9048
9355
|
const this_data = this.data;
|
|
9049
|
-
const
|
|
9356
|
+
const is_bigint = this_data instanceof BigInt64Array || this_data instanceof BigUint64Array;
|
|
9357
|
+
if (is_bigint && p !== 1) {
|
|
9358
|
+
throw Error(`Expected a floating point tensor as input. Got ${this.type}`);
|
|
9359
|
+
}
|
|
9360
|
+
let fn, zero;
|
|
9361
|
+
if (is_bigint) {
|
|
9362
|
+
fn = (a, b) => a + b;
|
|
9363
|
+
zero = 0n;
|
|
9364
|
+
} else {
|
|
9365
|
+
fn = (a, b) => a + b ** p;
|
|
9366
|
+
zero = 0;
|
|
9367
|
+
}
|
|
9050
9368
|
if (dim === null) {
|
|
9051
|
-
|
|
9369
|
+
let val = this_data.reduce(fn, zero);
|
|
9370
|
+
if (p !== 1) {
|
|
9371
|
+
val = val ** (1 / p);
|
|
9372
|
+
}
|
|
9052
9373
|
return new _Tensor(this.type, [val], []);
|
|
9053
9374
|
}
|
|
9054
9375
|
const [type, result, resultDims] = reduce_helper(fn, this, dim, keepdim);
|
|
@@ -11508,9 +11829,11 @@ __export(processors_exports, {
|
|
|
11508
11829
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
11509
11830
|
Florence2Processor: () => Florence2Processor,
|
|
11510
11831
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
11832
|
+
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
11511
11833
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
11512
11834
|
Idefics3Processor: () => Idefics3Processor,
|
|
11513
11835
|
JinaCLIPProcessor: () => JinaCLIPProcessor,
|
|
11836
|
+
Lfm2VlProcessor: () => Lfm2VlProcessor,
|
|
11514
11837
|
LlavaProcessor: () => LlavaProcessor,
|
|
11515
11838
|
MgpstrProcessor: () => MgpstrProcessor,
|
|
11516
11839
|
MoonshineProcessor: () => MoonshineProcessor,
|
|
@@ -11531,6 +11854,7 @@ __export(processors_exports, {
|
|
|
11531
11854
|
UltravoxProcessor: () => UltravoxProcessor,
|
|
11532
11855
|
VLChatProcessor: () => VLChatProcessor,
|
|
11533
11856
|
VoxtralProcessor: () => VoxtralProcessor,
|
|
11857
|
+
VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
|
|
11534
11858
|
Wav2Vec2Processor: () => Wav2Vec2Processor,
|
|
11535
11859
|
Wav2Vec2ProcessorWithLM: () => Wav2Vec2ProcessorWithLM,
|
|
11536
11860
|
WhisperProcessor: () => WhisperProcessor
|
|
@@ -11585,12 +11909,14 @@ __export(feature_extractors_exports, {
|
|
|
11585
11909
|
EncodecFeatureExtractor: () => EncodecFeatureExtractor,
|
|
11586
11910
|
FeatureExtractor: () => FeatureExtractor,
|
|
11587
11911
|
Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
|
|
11912
|
+
GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
|
|
11588
11913
|
MoonshineFeatureExtractor: () => MoonshineFeatureExtractor,
|
|
11589
11914
|
ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
|
|
11590
11915
|
PyAnnoteFeatureExtractor: () => PyAnnoteFeatureExtractor,
|
|
11591
11916
|
SeamlessM4TFeatureExtractor: () => SeamlessM4TFeatureExtractor,
|
|
11592
11917
|
SnacFeatureExtractor: () => SnacFeatureExtractor,
|
|
11593
11918
|
SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
|
|
11919
|
+
VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
|
|
11594
11920
|
Wav2Vec2FeatureExtractor: () => Wav2Vec2FeatureExtractor,
|
|
11595
11921
|
WeSpeakerFeatureExtractor: () => WeSpeakerFeatureExtractor,
|
|
11596
11922
|
WhisperFeatureExtractor: () => WhisperFeatureExtractor
|
|
@@ -11825,6 +12151,7 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
11825
12151
|
mel_filters = null,
|
|
11826
12152
|
mel_floor = 1e-10,
|
|
11827
12153
|
log_mel = null,
|
|
12154
|
+
max_log_mel = null,
|
|
11828
12155
|
reference = 1,
|
|
11829
12156
|
min_value = 1e-10,
|
|
11830
12157
|
db_range = null,
|
|
@@ -11964,6 +12291,17 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
11964
12291
|
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
11965
12292
|
}
|
|
11966
12293
|
break;
|
|
12294
|
+
case "log10_max_norm": {
|
|
12295
|
+
for (let i = 0; i < o; ++i) {
|
|
12296
|
+
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
12297
|
+
}
|
|
12298
|
+
const logMax = max_log_mel ?? max(mel_spec_data)[0];
|
|
12299
|
+
const threshold = logMax - 8;
|
|
12300
|
+
for (let i = 0; i < o; ++i) {
|
|
12301
|
+
mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4) / 4;
|
|
12302
|
+
}
|
|
12303
|
+
break;
|
|
12304
|
+
}
|
|
11967
12305
|
case "dB":
|
|
11968
12306
|
if (power === 1) {
|
|
11969
12307
|
amplitude_to_db(mel_spec_data, reference, min_value, db_range);
|
|
@@ -11974,7 +12312,9 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
11974
12312
|
}
|
|
11975
12313
|
break;
|
|
11976
12314
|
default:
|
|
11977
|
-
throw new Error(
|
|
12315
|
+
throw new Error(
|
|
12316
|
+
`log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`
|
|
12317
|
+
);
|
|
11978
12318
|
}
|
|
11979
12319
|
}
|
|
11980
12320
|
return mel_spec;
|
|
@@ -12479,6 +12819,56 @@ var Gemma3nAudioFeatureExtractor = class extends FeatureExtractor {
|
|
|
12479
12819
|
}
|
|
12480
12820
|
};
|
|
12481
12821
|
|
|
12822
|
+
// src/models/granite_speech/feature_extraction_granite_speech.js
|
|
12823
|
+
var GraniteSpeechFeatureExtractor = class extends FeatureExtractor {
|
|
12824
|
+
constructor(config) {
|
|
12825
|
+
super(config);
|
|
12826
|
+
const { n_fft, win_length, n_mels, sample_rate } = config.melspec_kwargs;
|
|
12827
|
+
this.mel_filters = mel_filter_bank(
|
|
12828
|
+
Math.floor(1 + n_fft / 2),
|
|
12829
|
+
// num_frequency_bins = 257
|
|
12830
|
+
n_mels,
|
|
12831
|
+
// 80
|
|
12832
|
+
0,
|
|
12833
|
+
// min_frequency
|
|
12834
|
+
sample_rate / 2,
|
|
12835
|
+
// max_frequency = 8000
|
|
12836
|
+
sample_rate,
|
|
12837
|
+
// 16000
|
|
12838
|
+
null,
|
|
12839
|
+
// norm (torchaudio default: no norm)
|
|
12840
|
+
"htk"
|
|
12841
|
+
// mel_scale (torchaudio default)
|
|
12842
|
+
);
|
|
12843
|
+
const raw_window = window_function(win_length, "hann");
|
|
12844
|
+
this.window = new Float64Array(n_fft);
|
|
12845
|
+
const pad = Math.floor((n_fft - win_length) / 2);
|
|
12846
|
+
this.window.set(raw_window, pad);
|
|
12847
|
+
}
|
|
12848
|
+
/**
|
|
12849
|
+
* Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
|
|
12850
|
+
* @param {Float32Array|Float64Array} audio The audio waveform.
|
|
12851
|
+
* @returns {Promise<{input_features: Tensor}>}
|
|
12852
|
+
*/
|
|
12853
|
+
async _call(audio) {
|
|
12854
|
+
validate_audio_inputs(audio, "GraniteSpeechFeatureExtractor");
|
|
12855
|
+
const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;
|
|
12856
|
+
const num_frames = 1 + Math.floor((audio.length - 1) / hop_length);
|
|
12857
|
+
const max_num_frames = num_frames - num_frames % 2;
|
|
12858
|
+
const mel = await spectrogram(audio, this.window, n_fft, hop_length, {
|
|
12859
|
+
power: 2,
|
|
12860
|
+
mel_filters: this.mel_filters,
|
|
12861
|
+
log_mel: "log10_max_norm",
|
|
12862
|
+
transpose: true,
|
|
12863
|
+
// [time, n_mels]
|
|
12864
|
+
max_num_frames,
|
|
12865
|
+
do_pad: false
|
|
12866
|
+
});
|
|
12867
|
+
const input_features = mel.view(-1, 2 * n_mels).unsqueeze_(0);
|
|
12868
|
+
return { input_features };
|
|
12869
|
+
}
|
|
12870
|
+
};
|
|
12871
|
+
|
|
12482
12872
|
// src/models/moonshine/feature_extraction_moonshine.js
|
|
12483
12873
|
var MoonshineFeatureExtractor = class extends FeatureExtractor {
|
|
12484
12874
|
/**
|
|
@@ -12959,6 +13349,71 @@ var WeSpeakerFeatureExtractor = class extends FeatureExtractor {
|
|
|
12959
13349
|
}
|
|
12960
13350
|
};
|
|
12961
13351
|
|
|
13352
|
+
// src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js
|
|
13353
|
+
var VoxtralRealtimeFeatureExtractor = class extends FeatureExtractor {
|
|
13354
|
+
constructor(config) {
|
|
13355
|
+
super(config);
|
|
13356
|
+
this.config.mel_filters ??= mel_filter_bank(
|
|
13357
|
+
Math.floor(1 + this.config.n_fft / 2),
|
|
13358
|
+
// num_frequency_bins
|
|
13359
|
+
this.config.feature_size,
|
|
13360
|
+
// num_mel_filters
|
|
13361
|
+
0,
|
|
13362
|
+
// min_frequency
|
|
13363
|
+
8e3,
|
|
13364
|
+
// max_frequency
|
|
13365
|
+
this.config.sampling_rate,
|
|
13366
|
+
// sampling_rate
|
|
13367
|
+
"slaney",
|
|
13368
|
+
// norm
|
|
13369
|
+
"slaney"
|
|
13370
|
+
// mel_scale
|
|
13371
|
+
);
|
|
13372
|
+
this.window = window_function(this.config.n_fft, "hann");
|
|
13373
|
+
}
|
|
13374
|
+
/**
|
|
13375
|
+
* Computes the log-Mel spectrogram of the provided audio waveform.
|
|
13376
|
+
* @param {Float32Array|Float64Array} waveform The audio waveform to process.
|
|
13377
|
+
* @param {Object} [options]
|
|
13378
|
+
* @param {boolean} [options.center=true] Whether to center-pad the waveform for STFT.
|
|
13379
|
+
* @returns {Promise<import('../../utils/tensor.js').Tensor>} The log-Mel spectrogram tensor of shape [num_mel_bins, num_frames].
|
|
13380
|
+
*/
|
|
13381
|
+
async _extract_fbank_features(waveform, { center = true } = {}) {
|
|
13382
|
+
const { n_fft, hop_length, mel_filters, global_log_mel_max } = this.config;
|
|
13383
|
+
const max_num_frames = center ? Math.floor(waveform.length / hop_length) : Math.floor((waveform.length - n_fft) / hop_length);
|
|
13384
|
+
return await spectrogram(
|
|
13385
|
+
waveform,
|
|
13386
|
+
this.window,
|
|
13387
|
+
n_fft,
|
|
13388
|
+
// frame_length
|
|
13389
|
+
hop_length,
|
|
13390
|
+
{
|
|
13391
|
+
power: 2,
|
|
13392
|
+
mel_filters,
|
|
13393
|
+
log_mel: "log10_max_norm",
|
|
13394
|
+
max_log_mel: global_log_mel_max,
|
|
13395
|
+
center,
|
|
13396
|
+
max_num_frames,
|
|
13397
|
+
do_pad: false
|
|
13398
|
+
}
|
|
13399
|
+
);
|
|
13400
|
+
}
|
|
13401
|
+
/**
|
|
13402
|
+
* Extract mel spectrogram features from audio.
|
|
13403
|
+
* @param {Float32Array|Float64Array} audio The audio data.
|
|
13404
|
+
* @param {Object} [options]
|
|
13405
|
+
* @param {boolean} [options.center=true] Whether to center-pad the waveform.
|
|
13406
|
+
* @returns {Promise<{ input_features: import('../../utils/tensor.js').Tensor }>}
|
|
13407
|
+
*/
|
|
13408
|
+
async _call(audio, { center = true } = {}) {
|
|
13409
|
+
validate_audio_inputs(audio, "VoxtralRealtimeFeatureExtractor");
|
|
13410
|
+
const features = await this._extract_fbank_features(audio, { center });
|
|
13411
|
+
return {
|
|
13412
|
+
input_features: features.unsqueeze_(0)
|
|
13413
|
+
};
|
|
13414
|
+
}
|
|
13415
|
+
};
|
|
13416
|
+
|
|
12962
13417
|
// src/models/whisper/feature_extraction_whisper.js
|
|
12963
13418
|
var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
12964
13419
|
constructor(config) {
|
|
@@ -12987,7 +13442,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
12987
13442
|
* @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
|
|
12988
13443
|
*/
|
|
12989
13444
|
async _extract_fbank_features(waveform) {
|
|
12990
|
-
|
|
13445
|
+
return await spectrogram(
|
|
12991
13446
|
waveform,
|
|
12992
13447
|
this.window,
|
|
12993
13448
|
// window
|
|
@@ -12998,7 +13453,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
12998
13453
|
{
|
|
12999
13454
|
power: 2,
|
|
13000
13455
|
mel_filters: this.config.mel_filters,
|
|
13001
|
-
log_mel: "
|
|
13456
|
+
log_mel: "log10_max_norm",
|
|
13002
13457
|
// Custom
|
|
13003
13458
|
max_num_frames: Math.min(
|
|
13004
13459
|
Math.floor(waveform.length / this.config.hop_length),
|
|
@@ -13007,15 +13462,6 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
13007
13462
|
)
|
|
13008
13463
|
}
|
|
13009
13464
|
);
|
|
13010
|
-
const data = features.data;
|
|
13011
|
-
const maxValue = max(
|
|
13012
|
-
/** @type {Float32Array} */
|
|
13013
|
-
data
|
|
13014
|
-
)[0];
|
|
13015
|
-
for (let i = 0; i < data.length; ++i) {
|
|
13016
|
-
data[i] = (Math.max(data[i], maxValue - 8) + 4) / 4;
|
|
13017
|
-
}
|
|
13018
|
-
return features;
|
|
13019
13465
|
}
|
|
13020
13466
|
/**
|
|
13021
13467
|
* Asynchronously extracts features from a given audio using the provided configuration.
|
|
@@ -13896,6 +14342,27 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
13896
14342
|
}
|
|
13897
14343
|
return [segmentation, segments];
|
|
13898
14344
|
}
|
|
14345
|
+
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
14346
|
+
if (height < factor || width < factor) {
|
|
14347
|
+
throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
|
|
14348
|
+
} else if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
14349
|
+
throw new Error(
|
|
14350
|
+
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
14351
|
+
);
|
|
14352
|
+
}
|
|
14353
|
+
let h_bar = Math.round(height / factor) * factor;
|
|
14354
|
+
let w_bar = Math.round(width / factor) * factor;
|
|
14355
|
+
if (h_bar * w_bar > max_pixels) {
|
|
14356
|
+
const beta = Math.sqrt(height * width / max_pixels);
|
|
14357
|
+
h_bar = Math.floor(height / beta / factor) * factor;
|
|
14358
|
+
w_bar = Math.floor(width / beta / factor) * factor;
|
|
14359
|
+
} else if (h_bar * w_bar < min_pixels) {
|
|
14360
|
+
const beta = Math.sqrt(min_pixels / (height * width));
|
|
14361
|
+
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
14362
|
+
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
14363
|
+
}
|
|
14364
|
+
return [h_bar, w_bar];
|
|
14365
|
+
}
|
|
13899
14366
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
13900
14367
|
if (label_ids_to_fuse === null) {
|
|
13901
14368
|
logger.warn("`label_ids_to_fuse` unset. No instance will be fused.");
|
|
@@ -14184,7 +14651,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
14184
14651
|
});
|
|
14185
14652
|
}
|
|
14186
14653
|
/**
|
|
14187
|
-
* @typedef {
|
|
14654
|
+
* @typedef {Object} PreprocessedImage
|
|
14188
14655
|
* @property {HeightWidth} original_size The original size of the image.
|
|
14189
14656
|
* @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
|
|
14190
14657
|
* @property {Tensor} pixel_values The pixel values of the preprocessed image.
|
|
@@ -14362,6 +14829,7 @@ __export(image_processors_exports, {
|
|
|
14362
14829
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
14363
14830
|
ImageProcessor: () => ImageProcessor,
|
|
14364
14831
|
JinaCLIPImageProcessor: () => JinaCLIPImageProcessor,
|
|
14832
|
+
Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
|
|
14365
14833
|
LlavaOnevisionImageProcessor: () => LlavaOnevisionImageProcessor,
|
|
14366
14834
|
Mask2FormerImageProcessor: () => Mask2FormerImageProcessor,
|
|
14367
14835
|
MaskFormerFeatureExtractor: () => MaskFormerFeatureExtractor,
|
|
@@ -14765,21 +15233,252 @@ var JinaCLIPImageProcessor = class extends ImageProcessor {
|
|
|
14765
15233
|
}
|
|
14766
15234
|
};
|
|
14767
15235
|
|
|
14768
|
-
// src/models/
|
|
14769
|
-
|
|
14770
|
-
|
|
14771
|
-
|
|
14772
|
-
|
|
14773
|
-
|
|
14774
|
-
|
|
14775
|
-
|
|
14776
|
-
|
|
15236
|
+
// src/models/lfm2_vl/image_processing_lfm2_vl.js
|
|
15237
|
+
function round_by_factor(number, factor) {
|
|
15238
|
+
return Math.round(number / factor) * factor;
|
|
15239
|
+
}
|
|
15240
|
+
function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
|
|
15241
|
+
let best_ratio_diff = Infinity;
|
|
15242
|
+
let best_ratio = [1, 1];
|
|
15243
|
+
const area = width * height;
|
|
15244
|
+
for (const ratio of target_ratios) {
|
|
15245
|
+
const ratio_diff = Math.abs(aspect_ratio - ratio[0] / ratio[1]);
|
|
15246
|
+
if (ratio_diff < best_ratio_diff) {
|
|
15247
|
+
best_ratio_diff = ratio_diff;
|
|
15248
|
+
best_ratio = ratio;
|
|
15249
|
+
} else if (ratio_diff === best_ratio_diff && area > 0.5 * image_size * image_size * ratio[0] * ratio[1]) {
|
|
15250
|
+
best_ratio = ratio;
|
|
15251
|
+
}
|
|
15252
|
+
}
|
|
15253
|
+
return best_ratio;
|
|
15254
|
+
}
|
|
15255
|
+
function get_target_ratios(min_tiles, max_tiles) {
|
|
15256
|
+
const ratios = [];
|
|
15257
|
+
const seen = /* @__PURE__ */ new Set();
|
|
15258
|
+
for (let n = min_tiles; n <= max_tiles; ++n) {
|
|
15259
|
+
for (let w = 1; w <= n; ++w) {
|
|
15260
|
+
for (let h = 1; h <= n; ++h) {
|
|
15261
|
+
const product2 = w * h;
|
|
15262
|
+
if (product2 >= min_tiles && product2 <= max_tiles) {
|
|
15263
|
+
const key = w << 16 | h;
|
|
15264
|
+
if (!seen.has(key)) {
|
|
15265
|
+
seen.add(key);
|
|
15266
|
+
ratios.push([w, h]);
|
|
15267
|
+
}
|
|
15268
|
+
}
|
|
15269
|
+
}
|
|
15270
|
+
}
|
|
14777
15271
|
}
|
|
14778
|
-
|
|
14779
|
-
|
|
14780
|
-
|
|
15272
|
+
return ratios.sort((a, b) => a[0] * a[1] - b[0] * b[1]);
|
|
15273
|
+
}
|
|
15274
|
+
function convert_image_to_patches(images, patch_size) {
|
|
15275
|
+
const [B, C, H, W] = images.dims;
|
|
15276
|
+
const ph = Math.floor(H / patch_size), pw = Math.floor(W / patch_size);
|
|
15277
|
+
const patch_dim = patch_size * patch_size * C;
|
|
15278
|
+
const data = (
|
|
15279
|
+
/** @type {Float32Array} */
|
|
15280
|
+
images.data
|
|
15281
|
+
);
|
|
15282
|
+
const result = new Float32Array(B * ph * pw * patch_dim);
|
|
15283
|
+
const ch_stride = H * W;
|
|
15284
|
+
for (let b = 0; b < B; ++b) {
|
|
15285
|
+
const b_src = b * C * ch_stride;
|
|
15286
|
+
const b_dst = b * ph * pw * patch_dim;
|
|
15287
|
+
for (let py = 0; py < ph; ++py) {
|
|
15288
|
+
for (let px = 0; px < pw; ++px) {
|
|
15289
|
+
let off = b_dst + (py * pw + px) * patch_dim;
|
|
15290
|
+
for (let dy = 0; dy < patch_size; ++dy) {
|
|
15291
|
+
const row = (py * patch_size + dy) * W + px * patch_size;
|
|
15292
|
+
for (let dx = 0; dx < patch_size; ++dx) {
|
|
15293
|
+
const pixel = row + dx;
|
|
15294
|
+
for (let c = 0; c < C; ++c) {
|
|
15295
|
+
result[off++] = data[b_src + c * ch_stride + pixel];
|
|
15296
|
+
}
|
|
15297
|
+
}
|
|
15298
|
+
}
|
|
15299
|
+
}
|
|
15300
|
+
}
|
|
14781
15301
|
}
|
|
14782
|
-
|
|
15302
|
+
return new Tensor2("float32", result, [B, ph * pw, patch_dim]);
|
|
15303
|
+
}
|
|
15304
|
+
function pad_along_first_dim(patches, target_length) {
|
|
15305
|
+
const [, len2, dim] = patches.dims;
|
|
15306
|
+
const mask_data = new BigInt64Array(target_length);
|
|
15307
|
+
mask_data.fill(1n, 0, len2);
|
|
15308
|
+
let padded = patches;
|
|
15309
|
+
if (len2 < target_length) {
|
|
15310
|
+
const padded_data = new Float32Array(target_length * dim);
|
|
15311
|
+
padded_data.set(
|
|
15312
|
+
/** @type {Float32Array} */
|
|
15313
|
+
patches.data
|
|
15314
|
+
);
|
|
15315
|
+
padded = new Tensor2("float32", padded_data, [1, target_length, dim]);
|
|
15316
|
+
}
|
|
15317
|
+
return { padded, mask: new Tensor2("int64", mask_data, [target_length]) };
|
|
15318
|
+
}
|
|
15319
|
+
var Lfm2VlImageProcessor = class extends ImageProcessor {
|
|
15320
|
+
constructor(config) {
|
|
15321
|
+
super(config);
|
|
15322
|
+
this.downsample_factor = config.downsample_factor ?? 2;
|
|
15323
|
+
this.do_image_splitting = config.do_image_splitting ?? true;
|
|
15324
|
+
this.min_tiles = config.min_tiles ?? 2;
|
|
15325
|
+
this.max_tiles = config.max_tiles ?? 10;
|
|
15326
|
+
this.use_thumbnail = config.use_thumbnail ?? true;
|
|
15327
|
+
this.min_image_tokens = config.min_image_tokens ?? 64;
|
|
15328
|
+
this.max_image_tokens = config.max_image_tokens ?? 256;
|
|
15329
|
+
this.encoder_patch_size = config.encoder_patch_size ?? config.patch_size ?? 16;
|
|
15330
|
+
this.tile_size = config.tile_size ?? 512;
|
|
15331
|
+
this.max_pixels_tolerance = config.max_pixels_tolerance ?? 2;
|
|
15332
|
+
this.return_row_col_info = config.return_row_col_info ?? false;
|
|
15333
|
+
const max_thumbnail_patches = this.max_image_tokens * this.downsample_factor ** 2;
|
|
15334
|
+
const tile_size_patches = this.do_image_splitting ? (this.tile_size / this.encoder_patch_size) ** 2 : 0;
|
|
15335
|
+
this.max_num_patches = Math.max(max_thumbnail_patches, tile_size_patches);
|
|
15336
|
+
}
|
|
15337
|
+
/**
|
|
15338
|
+
* Check if the image is too large to be processed as a single tile.
|
|
15339
|
+
* @param {number} height
|
|
15340
|
+
* @param {number} width
|
|
15341
|
+
* @returns {boolean}
|
|
15342
|
+
*/
|
|
15343
|
+
_is_image_too_large(height, width) {
|
|
15344
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
15345
|
+
const h_bar = Math.max(this.encoder_patch_size, round_by_factor(height, total_factor));
|
|
15346
|
+
const w_bar = Math.max(this.encoder_patch_size, round_by_factor(width, total_factor));
|
|
15347
|
+
return h_bar * w_bar > this.max_image_tokens * (this.encoder_patch_size * this.downsample_factor) ** 2 * this.max_pixels_tolerance;
|
|
15348
|
+
}
|
|
15349
|
+
/**
|
|
15350
|
+
* Get the grid layout for tiling a large image.
|
|
15351
|
+
* @param {number} height
|
|
15352
|
+
* @param {number} width
|
|
15353
|
+
* @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
|
|
15354
|
+
*/
|
|
15355
|
+
_get_grid_layout(height, width) {
|
|
15356
|
+
const target_ratios = get_target_ratios(this.min_tiles, this.max_tiles);
|
|
15357
|
+
const [grid_width, grid_height] = find_closest_aspect_ratio(
|
|
15358
|
+
width / height,
|
|
15359
|
+
target_ratios,
|
|
15360
|
+
width,
|
|
15361
|
+
height,
|
|
15362
|
+
this.tile_size
|
|
15363
|
+
);
|
|
15364
|
+
return {
|
|
15365
|
+
grid_width,
|
|
15366
|
+
grid_height,
|
|
15367
|
+
target_width: this.tile_size * grid_width,
|
|
15368
|
+
target_height: this.tile_size * grid_height
|
|
15369
|
+
};
|
|
15370
|
+
}
|
|
15371
|
+
/** @param {RawImage|RawImage[]|RawImage[][]} images */
|
|
15372
|
+
// @ts-expect-error
|
|
15373
|
+
async _call(images, { return_row_col_info = null } = {}) {
|
|
15374
|
+
let batched_images;
|
|
15375
|
+
if (!Array.isArray(images)) {
|
|
15376
|
+
batched_images = [[images]];
|
|
15377
|
+
} else if (!Array.isArray(images[0])) {
|
|
15378
|
+
batched_images = [
|
|
15379
|
+
/** @type {RawImage[]} */
|
|
15380
|
+
images
|
|
15381
|
+
];
|
|
15382
|
+
} else {
|
|
15383
|
+
batched_images = /** @type {RawImage[][]} */
|
|
15384
|
+
images;
|
|
15385
|
+
}
|
|
15386
|
+
const all_pixel_values = [];
|
|
15387
|
+
const all_pixel_masks = [];
|
|
15388
|
+
const all_spatial_shapes = [];
|
|
15389
|
+
const all_rows = [];
|
|
15390
|
+
const all_cols = [];
|
|
15391
|
+
const all_image_sizes = [];
|
|
15392
|
+
for (const image_batch of batched_images) {
|
|
15393
|
+
const preprocessed = await Promise.all(image_batch.map((x) => this.preprocess(x, { do_pad: false })));
|
|
15394
|
+
for (const { pixel_values } of preprocessed) {
|
|
15395
|
+
const [, height, width] = pixel_values.dims;
|
|
15396
|
+
const img = pixel_values.unsqueeze_(0);
|
|
15397
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
15398
|
+
const f2 = total_factor ** 2;
|
|
15399
|
+
const [new_height, new_width] = smart_resize(
|
|
15400
|
+
Math.max(total_factor, height),
|
|
15401
|
+
Math.max(total_factor, width),
|
|
15402
|
+
total_factor,
|
|
15403
|
+
this.min_image_tokens * f2,
|
|
15404
|
+
this.max_image_tokens * f2
|
|
15405
|
+
).map((x) => Math.max(total_factor, x));
|
|
15406
|
+
let tiles;
|
|
15407
|
+
let num_rows = 1, num_cols = 1;
|
|
15408
|
+
const is_large = this._is_image_too_large(height, width);
|
|
15409
|
+
const do_splitting = this.do_image_splitting && !(this.min_tiles === 1 && this.max_tiles === 1);
|
|
15410
|
+
if (is_large && do_splitting) {
|
|
15411
|
+
const { grid_width, grid_height, target_width, target_height } = this._get_grid_layout(
|
|
15412
|
+
height,
|
|
15413
|
+
width
|
|
15414
|
+
);
|
|
15415
|
+
num_rows = grid_height;
|
|
15416
|
+
num_cols = grid_width;
|
|
15417
|
+
const resized = await interpolate_4d(img, {
|
|
15418
|
+
size: [target_height, target_width]
|
|
15419
|
+
});
|
|
15420
|
+
tiles = [];
|
|
15421
|
+
for (let r = 0; r < grid_height; ++r) {
|
|
15422
|
+
for (let c = 0; c < grid_width; ++c) {
|
|
15423
|
+
const y = r * this.tile_size;
|
|
15424
|
+
const x = c * this.tile_size;
|
|
15425
|
+
tiles.push(resized.slice(null, null, [y, y + this.tile_size], [x, x + this.tile_size]));
|
|
15426
|
+
}
|
|
15427
|
+
}
|
|
15428
|
+
if (this.use_thumbnail && grid_width * grid_height !== 1) {
|
|
15429
|
+
tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
|
|
15430
|
+
}
|
|
15431
|
+
} else {
|
|
15432
|
+
tiles = [await interpolate_4d(img, { size: [new_height, new_width] })];
|
|
15433
|
+
}
|
|
15434
|
+
for (const tile of tiles) {
|
|
15435
|
+
const [, , th, tw] = tile.dims;
|
|
15436
|
+
const patches = convert_image_to_patches(tile, this.encoder_patch_size);
|
|
15437
|
+
const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
|
|
15438
|
+
all_pixel_values.push(padded);
|
|
15439
|
+
all_pixel_masks.push(mask);
|
|
15440
|
+
all_spatial_shapes.push([
|
|
15441
|
+
Math.floor(th / this.encoder_patch_size),
|
|
15442
|
+
Math.floor(tw / this.encoder_patch_size)
|
|
15443
|
+
]);
|
|
15444
|
+
}
|
|
15445
|
+
all_rows.push(num_rows);
|
|
15446
|
+
all_cols.push(num_cols);
|
|
15447
|
+
all_image_sizes.push([new_height, new_width]);
|
|
15448
|
+
}
|
|
15449
|
+
}
|
|
15450
|
+
const result = {
|
|
15451
|
+
pixel_values: cat(all_pixel_values, 0),
|
|
15452
|
+
pixel_attention_mask: stack(all_pixel_masks, 0),
|
|
15453
|
+
spatial_shapes: new Tensor2("int64", BigInt64Array.from(all_spatial_shapes.flat(), BigInt), [
|
|
15454
|
+
all_spatial_shapes.length,
|
|
15455
|
+
2
|
|
15456
|
+
])
|
|
15457
|
+
};
|
|
15458
|
+
if (return_row_col_info ?? this.return_row_col_info) {
|
|
15459
|
+
result.image_rows = all_rows;
|
|
15460
|
+
result.image_cols = all_cols;
|
|
15461
|
+
result.image_sizes = all_image_sizes;
|
|
15462
|
+
}
|
|
15463
|
+
return result;
|
|
15464
|
+
}
|
|
15465
|
+
};
|
|
15466
|
+
|
|
15467
|
+
// src/models/llava_onevision/image_processing_llava_onevision.js
|
|
15468
|
+
var LlavaOnevisionImageProcessor = class extends ImageProcessor {
|
|
15469
|
+
};
|
|
15470
|
+
|
|
15471
|
+
// src/models/maskformer/image_processing_maskformer.js
|
|
15472
|
+
var MaskFormerImageProcessor = class extends ImageProcessor {
|
|
15473
|
+
/** @type {typeof post_process_panoptic_segmentation} */
|
|
15474
|
+
post_process_panoptic_segmentation(...args) {
|
|
15475
|
+
return post_process_panoptic_segmentation(...args);
|
|
15476
|
+
}
|
|
15477
|
+
/** @type {typeof post_process_instance_segmentation} */
|
|
15478
|
+
post_process_instance_segmentation(...args) {
|
|
15479
|
+
return post_process_instance_segmentation(...args);
|
|
15480
|
+
}
|
|
15481
|
+
};
|
|
14783
15482
|
var MaskFormerFeatureExtractor = class extends MaskFormerImageProcessor {
|
|
14784
15483
|
};
|
|
14785
15484
|
|
|
@@ -14988,27 +15687,6 @@ var PvtImageProcessor = class extends ImageProcessor {
|
|
|
14988
15687
|
};
|
|
14989
15688
|
|
|
14990
15689
|
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
14991
|
-
/**
 * Compute a target `[height, width]` such that:
 *  1. both dimensions are multiples of `factor`,
 *  2. the pixel count is pushed towards the `[min_pixels, max_pixels]` range,
 *  3. the aspect ratio is kept as close to the input as the rounding allows.
 *
 * @param {number} height Original image height.
 * @param {number} width Original image width.
 * @param {number} [factor=28] Both returned dimensions are multiples of this value.
 * @param {number} [min_pixels=56*56] Lower bound on the pixel count to aim for.
 * @param {number} [max_pixels=14*14*4*1280] Upper bound on the pixel count to aim for.
 * @returns {[number, number]} The resized `[height, width]`.
 * @throws {Error} If either dimension is smaller than `factor`, or if the
 * aspect ratio exceeds 200.
 */
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
  if (height < factor || width < factor) {
    throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
  }
  const aspect_ratio = Math.max(height, width) / Math.min(height, width);
  if (aspect_ratio > 200) {
    throw new Error(`absolute aspect ratio must be smaller than 200, got ${aspect_ratio}`);
  }

  let h_bar = Math.round(height / factor) * factor;
  let w_bar = Math.round(width / factor) * factor;

  if (h_bar * w_bar > max_pixels) {
    // Too many pixels: shrink both sides by the same ratio and round down.
    // Rounding down can reach 0 for very elongated images combined with a
    // small `max_pixels`, so clamp each side to at least one `factor`.
    const beta = Math.sqrt((height * width) / max_pixels);
    h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
    w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
  } else if (h_bar * w_bar < min_pixels) {
    // Too few pixels: grow both sides by the same ratio and round up.
    const beta = Math.sqrt(min_pixels / (height * width));
    h_bar = Math.ceil((height * beta) / factor) * factor;
    w_bar = Math.ceil((width * beta) / factor) * factor;
  }
  return [h_bar, w_bar];
}
|
|
15012
15690
|
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
15013
15691
|
constructor(config) {
|
|
15014
15692
|
super(config);
|
|
@@ -15610,6 +16288,57 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
15610
16288
|
}
|
|
15611
16289
|
};
|
|
15612
16290
|
|
|
16291
|
+
// src/models/granite_speech/processing_granite_speech.js
|
|
16292
|
+
/**
 * Processor for Granite Speech: runs the audio feature extractor, expands the
 * audio placeholder token in the prompt to one token per audio feature, and
 * tokenizes the resulting text.
 */
var GraniteSpeechProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = true;

  /**
   * Compute the number of audio tokens for a given raw audio length.
   * @param {number} audioLength Raw audio sample count.
   * @returns {number} Number of projector output tokens.
   */
  _get_num_audio_features(audioLength) {
    const { hop_length } = this.feature_extractor.config.melspec_kwargs;
    const { projector_window_size, projector_downsample_rate } = this.feature_extractor.config;
    const effective_window_size = Math.floor(projector_window_size / projector_downsample_rate);
    const mel_length = Math.floor(audioLength / hop_length) + 1;
    const encoder_length = Math.floor(mel_length / 2);
    const nblocks = Math.ceil(encoder_length / projector_window_size);
    return nblocks * effective_window_size;
  }

  /**
   * @param {string} text The text input to process. When `audio` is given it
   * must contain the audio token (`config.audio_token`, default `<|audio|>`).
   * @param {Float32Array} [audio] The audio input to process.
   * @param {Record<string, any>} [kwargs] Extra options forwarded to the tokenizer.
   * @returns {Promise<Record<string, any>>} The tokenizer outputs, plus
   * `input_features` and `input_features_mask` when audio is provided.
   * @throws {Error} If `text` is an array (batching is unsupported) or does not
   * contain the audio token while audio is provided.
   * @throws {TypeError} If audio is provided and `text` is not a string.
   */
  async _call(text, audio = null, kwargs = {}) {
    if (Array.isArray(text)) {
      throw new Error("Batched inputs are not supported yet.");
    }

    const audio_inputs = {};
    if (audio) {
      // Fail early, with a readable message, before any feature extraction runs.
      if (typeof text !== "string") {
        throw new TypeError(`Expected \`text\` to be a string when audio is provided, got ${typeof text}.`);
      }
      const audio_token = this.config.audio_token ?? "<|audio|>";
      if (!text.includes(audio_token)) {
        throw new Error(`The input text does not contain the audio token ${audio_token}.`);
      }

      const { input_features } = await this.feature_extractor(audio);
      audio_inputs["input_features"] = input_features;

      const audio_embed_size = this._get_num_audio_features(audio.length);
      const mask_data = new Uint8Array(audio_embed_size).fill(1);
      audio_inputs["input_features_mask"] = new Tensor2("bool", mask_data, [1, audio_embed_size]);

      // A function replacer inserts the text literally; a replacement *string*
      // would interpret `$&`, `$1`, ... if the audio token contained a `$`.
      const expanded_audio_tokens = audio_token.repeat(audio_embed_size);
      text = text.replaceAll(audio_token, () => expanded_audio_tokens);
    }

    const text_inputs = this.tokenizer(text, {
      add_special_tokens: false,
      ...kwargs,
    });
    return {
      ...text_inputs,
      ...audio_inputs,
    };
  }
};
|
|
16341
|
+
|
|
15613
16342
|
// src/models/grounding_dino/processing_grounding_dino.js
|
|
15614
16343
|
function get_phrases_from_posmap(posmaps, input_ids) {
|
|
15615
16344
|
const left_idx = 0;
|
|
@@ -15886,6 +16615,66 @@ var JinaCLIPProcessor = class extends Processor {
|
|
|
15886
16615
|
}
|
|
15887
16616
|
};
|
|
15888
16617
|
|
|
16618
|
+
// src/models/lfm2_vl/processing_lfm2_vl.js
|
|
16619
|
+
/**
 * Processor for LFM2-VL: runs the image processor and, when text is given,
 * expands each image placeholder into the full sequence of image tokens
 * (per-tile tokens, optional thumbnail) before tokenizing.
 */
var Lfm2VlProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static image_processor_class = AutoImageProcessor;

  /**
   * @param {RawImage|RawImage[]} images
   * @param {string|string[]|null} [text]
   * @param {Record<string, any>} [kwargs] Forwarded to both the image processor and the tokenizer.
   * @returns {Promise<Record<string, any>>} The image processor outputs (without the
   * row/column/size bookkeeping) merged with the tokenizer outputs when text is given.
   * @throws {Error} If the text contains more image placeholders than there are images.
   */
  async _call(images, text = null, kwargs = {}) {
    const { image_rows, image_cols, image_sizes, ...image_inputs } = await this.image_processor(images, {
      ...kwargs,
      return_row_col_info: true,
    });

    if (text) {
      const image_token = this.config.image_token ?? "<image>";
      const {
        tile_size = 512,
        downsample_factor = 2,
        encoder_patch_size = 16,
        use_thumbnail = true,
      } = /** @type {Record<string, any>} */ (this.image_processor.config);

      // Number of tokens along one side of length `s`, after patching and downsampling.
      const ds = (s) => Math.ceil(Math.floor(s / encoder_patch_size) / downsample_factor);
      const tokens_per_tile = ds(tile_size) ** 2;
      // Identical for every tile of every image, so build it once.
      const tile_str = image_token.repeat(tokens_per_tile);

      const image_start = this.config.image_start_token ?? "<|image_start|>";
      const image_end = this.config.image_end_token ?? "<|image_end|>";
      const thumbnail_token = this.config.image_thumbnail ?? "<|img_thumbnail|>";

      if (!Array.isArray(text)) text = [text];

      // Placeholders are matched to images in order, across all text samples.
      let image_idx = 0;
      text = text.map((sample) => {
        const parts = sample.split(image_token);
        const expanded_parts = parts.slice(1).map((part) => {
          const idx = image_idx++;
          const size = image_sizes[idx];
          if (size == null) {
            throw new Error(
              `The text contains more image placeholders (${image_token}) than the number of provided images.`,
            );
          }
          const [h, w] = size;
          const rows = image_rows[idx];
          const cols = image_cols[idx];
          const tokens_for_image = ds(h) * ds(w);

          let expanded = image_start;
          if (rows > 1 || cols > 1) {
            // Tiled image: one positional marker plus a full tile of tokens per tile.
            for (let r = 0; r < rows; ++r) {
              for (let c = 0; c < cols; ++c) {
                expanded += `<|img_row_${r + 1}_col_${c + 1}|>` + tile_str;
              }
            }
            if (use_thumbnail) expanded += thumbnail_token + image_token.repeat(tokens_for_image);
          } else {
            expanded += image_token.repeat(tokens_for_image);
          }
          return expanded + image_end + part;
        });
        return parts[0] + expanded_parts.join("");
      });
    }

    return {
      ...image_inputs,
      ...(text ? this.tokenizer(text, kwargs) : {}),
    };
  }
};
|
|
16677
|
+
|
|
15889
16678
|
// src/models/llava/processing_llava.js
|
|
15890
16679
|
var LlavaProcessor = class extends Processor {
|
|
15891
16680
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -16418,6 +17207,94 @@ var VoxtralProcessor = class extends Processor {
|
|
|
16418
17207
|
}
|
|
16419
17208
|
};
|
|
16420
17209
|
|
|
17210
|
+
// src/models/voxtral_realtime/processing_voxtral_realtime.js
|
|
17211
|
+
// Token-level layout constants used by the VoxtralRealtime processor below.
var NUM_LEFT_PAD_TOKENS = 32;
var NUM_DELAY_TOKENS = 6;
var AUDIO_LENGTH_PER_TOK = 8;
var OFFLINE_STREAMING_BUFFER_TOKENS = 10;
var STREAMING_PAD_TOKEN_ID = 32;

/**
 * Processor for VoxtralRealtime: pads raw audio as required by the selected
 * mode and runs the feature extractor on it.
 */
var VoxtralRealtimeProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = false;

  /** Mel frame count of the first audio chunk. */
  get num_mel_frames_first_audio_chunk() {
    const tokens_in_first_chunk = NUM_DELAY_TOKENS + 1;
    return tokens_in_first_chunk * AUDIO_LENGTH_PER_TOK;
  }

  /** Raw audio sample count of the first audio chunk. */
  get num_samples_first_audio_chunk() {
    const { hop_length, n_fft } = this.feature_extractor.config;
    const hop_samples = (this.num_mel_frames_first_audio_chunk - 1) * hop_length;
    return hop_samples + Math.floor(n_fft / 2);
  }

  /** Raw audio sample count of every audio chunk after the first. */
  get num_samples_per_audio_chunk() {
    const { hop_length, n_fft } = this.feature_extractor.config;
    const hop_samples = AUDIO_LENGTH_PER_TOK * hop_length;
    return hop_samples + n_fft;
  }

  /** Right-pad token count used in non-streaming mode. */
  get num_right_pad_tokens() {
    return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
  }

  /** Mel frame count per text token. */
  get audio_length_per_tok() {
    return AUDIO_LENGTH_PER_TOK;
  }

  /** Raw audio sample count per token. */
  get raw_audio_length_per_tok() {
    const { hop_length } = this.feature_extractor.config;
    return AUDIO_LENGTH_PER_TOK * hop_length;
  }

  /**
   * Prepare audio for VoxtralRealtime.
   *
   * - Streaming, first chunk: zeros are prepended to the audio, features are
   *   extracted with `center=true`, and `{ input_ids, input_features }` style
   *   output is returned (the feature extractor output plus `input_ids`).
   * - Streaming, later chunks: the chunk is passed through the feature
   *   extractor with `center=false`; only its output is returned.
   * - Non-streaming: zeros are appended to the audio and features are
   *   extracted with `center=true`; only the extractor output is returned.
   *
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @param {Object} [options]
   * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
   * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
   * @returns {Promise<Object>}
   * @throws {Error} If `is_streaming` is false while `is_first_audio_chunk` is false.
   */
  async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeProcessor");

    // Later chunks only exist in streaming mode.
    if (!is_first_audio_chunk) {
      if (!is_streaming) {
        throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");
      }
      return await this.feature_extractor(audio, { center: false });
    }

    if (!is_streaming) {
      // Offline: append zeros after the audio (typed arrays are zero-initialised).
      const trailing_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
      const right_padded = new Float32Array(audio.length + trailing_samples);
      right_padded.set(audio);
      return await this.feature_extractor(right_padded, { center: true });
    }

    // Streaming, first chunk: prepend zeros before the audio.
    const leading_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
    const left_padded = new Float32Array(leading_samples + audio.length);
    left_padded.set(audio, leading_samples);
    const audio_encoding = await this.feature_extractor(left_padded, { center: true });

    // One leading token with id 1, followed by pad tokens.
    const token_count = 1 + (NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS);
    const token_ids = new BigInt64Array(token_count).fill(BigInt(STREAMING_PAD_TOKEN_ID));
    token_ids[0] = 1n;

    return {
      input_ids: new Tensor2("int64", token_ids, [1, token_count]),
      ...audio_encoding,
    };
  }
};
|
|
17297
|
+
|
|
16421
17298
|
// src/models/wav2vec2/processing_wav2vec2.js
|
|
16422
17299
|
var Wav2Vec2Processor = class extends Processor {
|
|
16423
17300
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -16517,10 +17394,13 @@ function getNormalizedConfig(config) {
|
|
|
16517
17394
|
case "florence2":
|
|
16518
17395
|
case "llava_onevision":
|
|
16519
17396
|
case "idefics3":
|
|
17397
|
+
case "granite_speech":
|
|
16520
17398
|
case "ultravox":
|
|
16521
17399
|
case "voxtral":
|
|
17400
|
+
case "voxtral_realtime":
|
|
16522
17401
|
case "smolvlm":
|
|
16523
17402
|
case "gemma3n":
|
|
17403
|
+
case "lfm2_vl":
|
|
16524
17404
|
case "chatterbox":
|
|
16525
17405
|
case "mistral3":
|
|
16526
17406
|
case "qwen2_5_vl":
|
|
@@ -16575,10 +17455,13 @@ function getNormalizedConfig(config) {
|
|
|
16575
17455
|
case "cohere":
|
|
16576
17456
|
case "cohere2":
|
|
16577
17457
|
case "mistral":
|
|
17458
|
+
case "voxtral_realtime_text":
|
|
17459
|
+
case "voxtral_realtime_encoder":
|
|
16578
17460
|
case "starcoder2":
|
|
16579
17461
|
case "qwen2":
|
|
16580
17462
|
case "qwen2_moe":
|
|
16581
17463
|
case "qwen2_vl":
|
|
17464
|
+
case "qwen2_vl_text":
|
|
16582
17465
|
case "qwen2_5_vl_text":
|
|
16583
17466
|
case "qwen3_moe":
|
|
16584
17467
|
case "qwen3_vl_text":
|
|
@@ -16723,6 +17606,9 @@ function getNormalizedConfig(config) {
|
|
|
16723
17606
|
return normalized_config;
|
|
16724
17607
|
}
|
|
16725
17608
|
function getCacheShapes(config, options) {
|
|
17609
|
+
if (!(config instanceof PretrainedConfig)) {
|
|
17610
|
+
config = new PretrainedConfig(config);
|
|
17611
|
+
}
|
|
16726
17612
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
16727
17613
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
16728
17614
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -16829,12 +17715,16 @@ function getCacheShapes(config, options) {
|
|
|
16829
17715
|
}
|
|
16830
17716
|
}
|
|
16831
17717
|
return cache_values;
|
|
16832
|
-
} else if (["qwen3_5", "qwen3_5_moe"].includes(config.model_type)) {
|
|
16833
|
-
|
|
16834
|
-
|
|
16835
|
-
|
|
16836
|
-
|
|
16837
|
-
|
|
17718
|
+
} else if (["lfm2_vl", "qwen3_5", "qwen3_5_moe", "voxtral_realtime"].includes(config.model_type)) {
|
|
17719
|
+
let subConfig;
|
|
17720
|
+
if (config.model_type === "voxtral_realtime" && options?.session_name === "audio_encoder") {
|
|
17721
|
+
subConfig = /** @type {any} */
|
|
17722
|
+
config.audio_config;
|
|
17723
|
+
} else {
|
|
17724
|
+
subConfig = /** @type {any} */
|
|
17725
|
+
config.text_config;
|
|
17726
|
+
}
|
|
17727
|
+
return getCacheShapes(subConfig, options);
|
|
16838
17728
|
}
|
|
16839
17729
|
return getKeyValueShapes(config, options);
|
|
16840
17730
|
}
|
|
@@ -17000,7 +17890,7 @@ async function getModelDataFiles(pretrained_model_name_or_path, fileName, suffix
|
|
|
17000
17890
|
}
|
|
17001
17891
|
|
|
17002
17892
|
// src/models/session.js
|
|
17003
|
-
async function getSession(pretrained_model_name_or_path, fileName, options,
|
|
17893
|
+
async function getSession(pretrained_model_name_or_path, fileName, options, cache_config = false, session_name = void 0) {
|
|
17004
17894
|
let custom_config = options.config?.["transformers.js_config"] ?? {};
|
|
17005
17895
|
const selectedDevice = (
|
|
17006
17896
|
/** @type {import("../utils/devices.js").DeviceType} */
|
|
@@ -17058,9 +17948,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
17058
17948
|
if (externalData.length > 0 && !apis.IS_NODE_ENV) {
|
|
17059
17949
|
session_options.externalData = externalData;
|
|
17060
17950
|
}
|
|
17061
|
-
if (
|
|
17951
|
+
if (cache_config && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
|
|
17062
17952
|
const shapes = getCacheShapes(options.config, {
|
|
17063
|
-
prefix: "present"
|
|
17953
|
+
prefix: "present",
|
|
17954
|
+
session_name
|
|
17064
17955
|
});
|
|
17065
17956
|
if (Object.keys(shapes).length > 0 && !isONNXProxy()) {
|
|
17066
17957
|
const preferredOutputLocation = {};
|
|
@@ -17078,15 +17969,17 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
17078
17969
|
};
|
|
17079
17970
|
return { buffer_or_path, session_options, session_config };
|
|
17080
17971
|
}
|
|
17081
|
-
async function constructSessions(pretrained_model_name_or_path, names, options,
|
|
17972
|
+
async function constructSessions(pretrained_model_name_or_path, names, options, cache_sessions = void 0) {
|
|
17082
17973
|
return Object.fromEntries(
|
|
17083
17974
|
await Promise.all(
|
|
17084
17975
|
Object.keys(names).map(async (name) => {
|
|
17976
|
+
const cache_config = cache_sessions?.[name] ?? false;
|
|
17085
17977
|
const { buffer_or_path, session_options, session_config } = await getSession(
|
|
17086
17978
|
pretrained_model_name_or_path,
|
|
17087
17979
|
names[name],
|
|
17088
17980
|
options,
|
|
17089
|
-
|
|
17981
|
+
cache_config,
|
|
17982
|
+
name
|
|
17090
17983
|
);
|
|
17091
17984
|
const session = await createInferenceSession(buffer_or_path, session_options, session_config);
|
|
17092
17985
|
return [name, session];
|
|
@@ -18386,19 +19279,71 @@ var BeamSearchSampler = class extends LogitsSampler {
|
|
|
18386
19279
|
}
|
|
18387
19280
|
};
|
|
18388
19281
|
|
|
19282
|
+
// src/cache_utils.js
|
|
19283
|
+
/**
 * Cache object whose own properties map entry names to tensors.
 */
var _DynamicCache = class {
  /**
   * Build a cache, optionally seeding it with name→Tensor entries.
   * @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
   * @throws {TypeError} If a key clashes with an existing property of the
   * cache, or if a value is not a Tensor.
   */
  constructor(entries) {
    if (entries) {
      for (const name in entries) {
        // Refuse keys that would shadow methods or previously added entries.
        if (name in this) {
          throw new TypeError(`Key "${name}" conflicts with an existing property on DynamicCache`);
        }
        const tensor = entries[name];
        if (!(tensor instanceof Tensor2)) {
          throw new TypeError(`Expected a Tensor for key "${name}", got ${typeof tensor}`);
        }
        this[name] = tensor;
      }
    }
  }

  /**
   * Read the cached sequence length from the first entry whose name starts
   * with `past_key_values.` (second-to-last dimension of that tensor).
   * @returns {number} The past sequence length.
   * @throws {Error} If no such entry exists.
   */
  get_seq_length() {
    const cache = /** @type {any} */ (this);
    for (const name in cache) {
      if (!name.startsWith("past_key_values.")) continue;
      return cache[name].dims.at(-2);
    }
    throw new Error("Unable to determine sequence length from the cache.");
  }

  /**
   * Dispose every contained tensor whose `location` is `"gpu-buffer"`.
   * @returns {Promise<void>} Resolves once all of those disposals have completed.
   */
  async dispose() {
    const tensors = /** @type {Tensor[]} */ (Object.values(this));
    // Single pass: check the location and start the disposal together.
    const pending = tensors.flatMap((tensor) => (tensor.location === "gpu-buffer" ? [tensor.dispose()] : []));
    await Promise.all(pending);
  }
};
var DynamicCache = /** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */ (
  /** @type {unknown} */ (_DynamicCache)
);
|
|
19341
|
+
|
|
18389
19342
|
// src/models/modeling_utils.js
|
|
18390
19343
|
var MODEL_MAPPING_NAMES = null;
|
|
18391
19344
|
function registerTaskMappings(mappings) {
|
|
18392
19345
|
MODEL_MAPPING_NAMES = mappings;
|
|
18393
19346
|
}
|
|
18394
|
-
function getPastLength(past_key_values) {
|
|
18395
|
-
for (const name in past_key_values) {
|
|
18396
|
-
if (name.startsWith("past_key_values.")) {
|
|
18397
|
-
return past_key_values[name].dims.at(-2);
|
|
18398
|
-
}
|
|
18399
|
-
}
|
|
18400
|
-
return Object.values(past_key_values)[0].dims.at(-2);
|
|
18401
|
-
}
|
|
18402
19347
|
function toI64Tensor(items) {
|
|
18403
19348
|
if (items instanceof Tensor2) {
|
|
18404
19349
|
return items;
|
|
@@ -18439,71 +19384,181 @@ var MODEL_TYPES = {
|
|
|
18439
19384
|
AutoEncoder: 12,
|
|
18440
19385
|
ImageAudioTextToText: 13,
|
|
18441
19386
|
Supertonic: 14,
|
|
18442
|
-
Chatterbox: 15
|
|
19387
|
+
Chatterbox: 15,
|
|
19388
|
+
MultimodalLanguageModelOnly: 16,
|
|
19389
|
+
VoxtralRealtime: 17
|
|
18443
19390
|
};
|
|
18444
19391
|
var MODEL_TYPE_CONFIG = {
|
|
18445
19392
|
[MODEL_TYPES.DecoderOnly]: {
|
|
18446
19393
|
can_generate: true,
|
|
18447
19394
|
forward: decoder_forward,
|
|
18448
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
19395
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
19396
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" }),
|
|
19397
|
+
cache_sessions: { model: true },
|
|
19398
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18449
19399
|
},
|
|
18450
19400
|
[MODEL_TYPES.DecoderOnlyWithoutHead]: {
|
|
18451
19401
|
can_generate: false,
|
|
18452
19402
|
forward: decoder_forward,
|
|
18453
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
19403
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
19404
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
18454
19405
|
},
|
|
18455
19406
|
[MODEL_TYPES.Seq2Seq]: {
|
|
18456
19407
|
can_generate: true,
|
|
18457
19408
|
forward: seq2seq_forward,
|
|
18458
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
19409
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
19410
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
19411
|
+
cache_sessions: { decoder_model_merged: true },
|
|
19412
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18459
19413
|
},
|
|
18460
19414
|
[MODEL_TYPES.Vision2Seq]: {
|
|
18461
19415
|
can_generate: true,
|
|
18462
19416
|
forward: seq2seq_forward,
|
|
18463
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
19417
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
19418
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
19419
|
+
cache_sessions: { decoder_model_merged: true },
|
|
19420
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18464
19421
|
},
|
|
18465
19422
|
[MODEL_TYPES.Musicgen]: {
|
|
18466
19423
|
can_generate: true,
|
|
18467
|
-
forward: seq2seq_forward
|
|
19424
|
+
forward: seq2seq_forward,
|
|
19425
|
+
sessions: () => ({
|
|
19426
|
+
model: "text_encoder",
|
|
19427
|
+
decoder_model_merged: "decoder_model_merged",
|
|
19428
|
+
encodec_decode: "encodec_decode"
|
|
19429
|
+
}),
|
|
19430
|
+
cache_sessions: { decoder_model_merged: true },
|
|
19431
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18468
19432
|
},
|
|
18469
19433
|
[MODEL_TYPES.EncoderDecoder]: {
|
|
18470
19434
|
can_generate: false,
|
|
18471
|
-
forward: seq2seq_forward
|
|
19435
|
+
forward: seq2seq_forward,
|
|
19436
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
19437
|
+
cache_sessions: { decoder_model_merged: true }
|
|
19438
|
+
},
|
|
19439
|
+
[MODEL_TYPES.MaskGeneration]: {
|
|
19440
|
+
sessions: () => ({ model: "vision_encoder", prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder" })
|
|
18472
19441
|
},
|
|
18473
19442
|
[MODEL_TYPES.ImageTextToText]: {
|
|
18474
19443
|
can_generate: true,
|
|
18475
19444
|
forward: image_text_to_text_forward,
|
|
18476
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
19445
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
19446
|
+
sessions: (config) => {
|
|
19447
|
+
const s = {
|
|
19448
|
+
embed_tokens: "embed_tokens",
|
|
19449
|
+
vision_encoder: "vision_encoder",
|
|
19450
|
+
decoder_model_merged: "decoder_model_merged"
|
|
19451
|
+
};
|
|
19452
|
+
if (config.is_encoder_decoder) s["model"] = "encoder_model";
|
|
19453
|
+
return s;
|
|
19454
|
+
},
|
|
19455
|
+
cache_sessions: { decoder_model_merged: true },
|
|
19456
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18477
19457
|
},
|
|
18478
19458
|
[MODEL_TYPES.AudioTextToText]: {
|
|
18479
19459
|
can_generate: true,
|
|
18480
19460
|
forward: audio_text_to_text_forward,
|
|
18481
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
19461
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
19462
|
+
sessions: () => ({
|
|
19463
|
+
embed_tokens: "embed_tokens",
|
|
19464
|
+
audio_encoder: "audio_encoder",
|
|
19465
|
+
decoder_model_merged: "decoder_model_merged"
|
|
19466
|
+
}),
|
|
19467
|
+
cache_sessions: { decoder_model_merged: true },
|
|
19468
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18482
19469
|
},
|
|
18483
|
-
[MODEL_TYPES.
|
|
19470
|
+
[MODEL_TYPES.ImageAudioTextToText]: {
|
|
18484
19471
|
can_generate: true,
|
|
18485
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
19472
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
19473
|
+
sessions: () => ({
|
|
19474
|
+
embed_tokens: "embed_tokens",
|
|
19475
|
+
audio_encoder: "audio_encoder",
|
|
19476
|
+
vision_encoder: "vision_encoder",
|
|
19477
|
+
decoder_model_merged: "decoder_model_merged"
|
|
19478
|
+
}),
|
|
19479
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18486
19480
|
},
|
|
18487
|
-
[MODEL_TYPES.
|
|
19481
|
+
[MODEL_TYPES.Phi3V]: {
|
|
18488
19482
|
can_generate: true,
|
|
18489
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
19483
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
19484
|
+
sessions: () => ({
|
|
19485
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
19486
|
+
model: "model",
|
|
19487
|
+
vision_encoder: "vision_encoder"
|
|
19488
|
+
}),
|
|
19489
|
+
cache_sessions: { model: true },
|
|
19490
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18490
19491
|
},
|
|
18491
19492
|
[MODEL_TYPES.MultiModality]: {
|
|
18492
|
-
can_generate: true
|
|
19493
|
+
can_generate: true,
|
|
19494
|
+
sessions: () => ({
|
|
19495
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
19496
|
+
model: "language_model",
|
|
19497
|
+
lm_head: "lm_head",
|
|
19498
|
+
gen_head: "gen_head",
|
|
19499
|
+
gen_img_embeds: "gen_img_embeds",
|
|
19500
|
+
image_decode: "image_decode"
|
|
19501
|
+
}),
|
|
19502
|
+
cache_sessions: { model: true },
|
|
19503
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18493
19504
|
},
|
|
18494
19505
|
[MODEL_TYPES.AutoEncoder]: {
|
|
18495
19506
|
can_generate: false,
|
|
18496
|
-
forward: auto_encoder_forward
|
|
19507
|
+
forward: auto_encoder_forward,
|
|
19508
|
+
sessions: () => ({ encoder_model: "encoder_model", decoder_model: "decoder_model" })
|
|
19509
|
+
},
|
|
19510
|
+
[MODEL_TYPES.Supertonic]: {
|
|
19511
|
+
sessions: () => ({
|
|
19512
|
+
text_encoder: "text_encoder",
|
|
19513
|
+
latent_denoiser: "latent_denoiser",
|
|
19514
|
+
voice_decoder: "voice_decoder"
|
|
19515
|
+
})
|
|
18497
19516
|
},
|
|
18498
19517
|
[MODEL_TYPES.Chatterbox]: {
|
|
18499
19518
|
can_generate: true,
|
|
18500
|
-
forward: encoder_forward
|
|
19519
|
+
forward: encoder_forward,
|
|
19520
|
+
sessions: () => ({
|
|
19521
|
+
embed_tokens: "embed_tokens",
|
|
19522
|
+
speech_encoder: "speech_encoder",
|
|
19523
|
+
model: "language_model",
|
|
19524
|
+
conditional_decoder: "conditional_decoder"
|
|
19525
|
+
}),
|
|
19526
|
+
cache_sessions: { model: true },
|
|
19527
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
19528
|
+
},
|
|
19529
|
+
[MODEL_TYPES.MultimodalLanguageModelOnly]: {
|
|
19530
|
+
can_generate: true,
|
|
19531
|
+
forward: image_text_to_text_forward,
|
|
19532
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
19533
|
+
sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
|
|
19534
|
+
cache_sessions: { decoder_model_merged: true },
|
|
19535
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
19536
|
+
},
|
|
19537
|
+
[MODEL_TYPES.VoxtralRealtime]: {
|
|
19538
|
+
can_generate: true,
|
|
19539
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
19540
|
+
sessions: () => ({
|
|
19541
|
+
embed_tokens: "embed_tokens",
|
|
19542
|
+
audio_encoder: "audio_encoder",
|
|
19543
|
+
decoder_model_merged: "decoder_model_merged"
|
|
19544
|
+
}),
|
|
19545
|
+
cache_sessions: { decoder_model_merged: true, audio_encoder: true },
|
|
19546
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
18501
19547
|
},
|
|
18502
19548
|
default: {
|
|
18503
19549
|
can_generate: false,
|
|
18504
|
-
forward: encoder_forward
|
|
19550
|
+
forward: encoder_forward,
|
|
19551
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
18505
19552
|
}
|
|
18506
19553
|
};
|
|
19554
|
+
/**
 * Look up the session layout for a model type in `MODEL_TYPE_CONFIG`,
 * falling back to the `default` entry for types without one.
 *
 * @param {number} modelType Key into `MODEL_TYPE_CONFIG`.
 * @param {Object} config Model config, passed to the entry's `sessions` callback.
 * @param {Object} [options={}] Options, passed to the entry's `sessions` callback.
 * @returns {{sessions: any, cache_sessions: any, optional_configs: any}}
 * The result of the entry's `sessions(config, options)` call together with the
 * entry's `cache_sessions` and `optional_configs` values (possibly undefined).
 */
function getSessionsConfig(modelType, config, options = {}) {
  let entry = MODEL_TYPE_CONFIG[modelType];
  entry ??= MODEL_TYPE_CONFIG.default;

  const sessions = entry.sessions(config, options);
  const { cache_sessions, optional_configs } = entry;
  return { sessions, cache_sessions, optional_configs };
}
|
|
18507
19562
|
var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
|
|
18508
19563
|
var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
|
|
18509
19564
|
var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
|
|
@@ -18589,245 +19644,23 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
18589
19644
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
|
|
18590
19645
|
const modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
18591
19646
|
config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
|
|
18592
|
-
|
|
18593
|
-
if (modelType ===
|
|
18594
|
-
|
|
18595
|
-
|
|
18596
|
-
|
|
18597
|
-
{
|
|
18598
|
-
|
|
18599
|
-
},
|
|
18600
|
-
options,
|
|
18601
|
-
"model"
|
|
18602
|
-
),
|
|
18603
|
-
get_optional_configs(
|
|
18604
|
-
pretrained_model_name_or_path,
|
|
18605
|
-
{
|
|
18606
|
-
generation_config: "generation_config.json"
|
|
18607
|
-
},
|
|
18608
|
-
options
|
|
18609
|
-
)
|
|
18610
|
-
]);
|
|
18611
|
-
} else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
|
|
18612
|
-
info = await Promise.all([
|
|
18613
|
-
constructSessions(
|
|
18614
|
-
pretrained_model_name_or_path,
|
|
18615
|
-
{
|
|
18616
|
-
model: "encoder_model",
|
|
18617
|
-
decoder_model_merged: "decoder_model_merged"
|
|
18618
|
-
},
|
|
18619
|
-
options,
|
|
18620
|
-
"decoder_model_merged"
|
|
18621
|
-
),
|
|
18622
|
-
get_optional_configs(
|
|
18623
|
-
pretrained_model_name_or_path,
|
|
18624
|
-
{
|
|
18625
|
-
generation_config: "generation_config.json"
|
|
18626
|
-
},
|
|
18627
|
-
options
|
|
18628
|
-
)
|
|
18629
|
-
]);
|
|
18630
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
18631
|
-
info = await Promise.all([
|
|
18632
|
-
constructSessions(
|
|
18633
|
-
pretrained_model_name_or_path,
|
|
18634
|
-
{
|
|
18635
|
-
model: "vision_encoder",
|
|
18636
|
-
prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder"
|
|
18637
|
-
},
|
|
18638
|
-
options
|
|
18639
|
-
)
|
|
18640
|
-
]);
|
|
18641
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
18642
|
-
info = await Promise.all([
|
|
18643
|
-
constructSessions(
|
|
18644
|
-
pretrained_model_name_or_path,
|
|
18645
|
-
{
|
|
18646
|
-
model: "encoder_model",
|
|
18647
|
-
decoder_model_merged: "decoder_model_merged"
|
|
18648
|
-
},
|
|
18649
|
-
options,
|
|
18650
|
-
"decoder_model_merged"
|
|
18651
|
-
)
|
|
18652
|
-
]);
|
|
18653
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
18654
|
-
const sessions = {
|
|
18655
|
-
embed_tokens: "embed_tokens",
|
|
18656
|
-
vision_encoder: "vision_encoder",
|
|
18657
|
-
decoder_model_merged: "decoder_model_merged"
|
|
18658
|
-
};
|
|
18659
|
-
if (config.is_encoder_decoder) {
|
|
18660
|
-
sessions["model"] = "encoder_model";
|
|
18661
|
-
}
|
|
18662
|
-
info = await Promise.all([
|
|
18663
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
18664
|
-
get_optional_configs(
|
|
18665
|
-
pretrained_model_name_or_path,
|
|
18666
|
-
{
|
|
18667
|
-
generation_config: "generation_config.json"
|
|
18668
|
-
},
|
|
18669
|
-
options
|
|
18670
|
-
)
|
|
18671
|
-
]);
|
|
18672
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
18673
|
-
const sessions = {
|
|
18674
|
-
embed_tokens: "embed_tokens",
|
|
18675
|
-
audio_encoder: "audio_encoder",
|
|
18676
|
-
decoder_model_merged: "decoder_model_merged"
|
|
18677
|
-
};
|
|
18678
|
-
info = await Promise.all([
|
|
18679
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
18680
|
-
get_optional_configs(
|
|
18681
|
-
pretrained_model_name_or_path,
|
|
18682
|
-
{
|
|
18683
|
-
generation_config: "generation_config.json"
|
|
18684
|
-
},
|
|
18685
|
-
options
|
|
18686
|
-
)
|
|
18687
|
-
]);
|
|
18688
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
18689
|
-
const sessions = {
|
|
18690
|
-
embed_tokens: "embed_tokens",
|
|
18691
|
-
audio_encoder: "audio_encoder",
|
|
18692
|
-
vision_encoder: "vision_encoder",
|
|
18693
|
-
decoder_model_merged: "decoder_model_merged"
|
|
18694
|
-
};
|
|
18695
|
-
info = await Promise.all([
|
|
18696
|
-
constructSessions(pretrained_model_name_or_path, sessions, options),
|
|
18697
|
-
get_optional_configs(
|
|
18698
|
-
pretrained_model_name_or_path,
|
|
18699
|
-
{
|
|
18700
|
-
generation_config: "generation_config.json"
|
|
18701
|
-
},
|
|
18702
|
-
options
|
|
18703
|
-
)
|
|
18704
|
-
]);
|
|
18705
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
18706
|
-
info = await Promise.all([
|
|
18707
|
-
constructSessions(
|
|
18708
|
-
pretrained_model_name_or_path,
|
|
18709
|
-
{
|
|
18710
|
-
model: "text_encoder",
|
|
18711
|
-
decoder_model_merged: "decoder_model_merged",
|
|
18712
|
-
encodec_decode: "encodec_decode"
|
|
18713
|
-
},
|
|
18714
|
-
options,
|
|
18715
|
-
"decoder_model_merged"
|
|
18716
|
-
),
|
|
18717
|
-
get_optional_configs(
|
|
18718
|
-
pretrained_model_name_or_path,
|
|
18719
|
-
{
|
|
18720
|
-
generation_config: "generation_config.json"
|
|
18721
|
-
},
|
|
18722
|
-
options
|
|
18723
|
-
)
|
|
18724
|
-
]);
|
|
18725
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
18726
|
-
info = await Promise.all([
|
|
18727
|
-
constructSessions(
|
|
18728
|
-
pretrained_model_name_or_path,
|
|
18729
|
-
{
|
|
18730
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
18731
|
-
model: "language_model",
|
|
18732
|
-
lm_head: "lm_head",
|
|
18733
|
-
gen_head: "gen_head",
|
|
18734
|
-
gen_img_embeds: "gen_img_embeds",
|
|
18735
|
-
image_decode: "image_decode"
|
|
18736
|
-
},
|
|
18737
|
-
options,
|
|
18738
|
-
"model"
|
|
18739
|
-
),
|
|
18740
|
-
get_optional_configs(
|
|
18741
|
-
pretrained_model_name_or_path,
|
|
18742
|
-
{
|
|
18743
|
-
generation_config: "generation_config.json"
|
|
18744
|
-
},
|
|
18745
|
-
options
|
|
18746
|
-
)
|
|
18747
|
-
]);
|
|
18748
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
18749
|
-
info = await Promise.all([
|
|
18750
|
-
constructSessions(
|
|
18751
|
-
pretrained_model_name_or_path,
|
|
18752
|
-
{
|
|
18753
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
18754
|
-
model: "model",
|
|
18755
|
-
vision_encoder: "vision_encoder"
|
|
18756
|
-
},
|
|
18757
|
-
options,
|
|
18758
|
-
"model"
|
|
18759
|
-
),
|
|
18760
|
-
get_optional_configs(
|
|
18761
|
-
pretrained_model_name_or_path,
|
|
18762
|
-
{
|
|
18763
|
-
generation_config: "generation_config.json"
|
|
18764
|
-
},
|
|
18765
|
-
options
|
|
18766
|
-
)
|
|
18767
|
-
]);
|
|
18768
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
18769
|
-
info = await Promise.all([
|
|
18770
|
-
constructSessions(
|
|
18771
|
-
pretrained_model_name_or_path,
|
|
18772
|
-
{
|
|
18773
|
-
embed_tokens: "embed_tokens",
|
|
18774
|
-
speech_encoder: "speech_encoder",
|
|
18775
|
-
model: "language_model",
|
|
18776
|
-
conditional_decoder: "conditional_decoder"
|
|
18777
|
-
},
|
|
18778
|
-
options,
|
|
18779
|
-
"model"
|
|
18780
|
-
),
|
|
18781
|
-
get_optional_configs(
|
|
18782
|
-
pretrained_model_name_or_path,
|
|
18783
|
-
{
|
|
18784
|
-
generation_config: "generation_config.json"
|
|
18785
|
-
},
|
|
18786
|
-
options
|
|
18787
|
-
)
|
|
18788
|
-
]);
|
|
18789
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
18790
|
-
info = await Promise.all([
|
|
18791
|
-
constructSessions(
|
|
18792
|
-
pretrained_model_name_or_path,
|
|
18793
|
-
{
|
|
18794
|
-
encoder_model: "encoder_model",
|
|
18795
|
-
decoder_model: "decoder_model"
|
|
18796
|
-
},
|
|
18797
|
-
options
|
|
18798
|
-
)
|
|
18799
|
-
]);
|
|
18800
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
18801
|
-
info = await Promise.all([
|
|
18802
|
-
constructSessions(
|
|
18803
|
-
pretrained_model_name_or_path,
|
|
18804
|
-
{
|
|
18805
|
-
text_encoder: "text_encoder",
|
|
18806
|
-
latent_denoiser: "latent_denoiser",
|
|
18807
|
-
voice_decoder: "voice_decoder"
|
|
18808
|
-
},
|
|
18809
|
-
options
|
|
18810
|
-
)
|
|
18811
|
-
]);
|
|
18812
|
-
} else {
|
|
18813
|
-
if (modelType === void 0) {
|
|
18814
|
-
const type = modelName ?? config?.model_type;
|
|
18815
|
-
if (type !== "custom") {
|
|
18816
|
-
logger.warn(
|
|
18817
|
-
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
18818
|
-
);
|
|
18819
|
-
}
|
|
19647
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
19648
|
+
if (modelType === void 0) {
|
|
19649
|
+
const type = modelName ?? config?.model_type;
|
|
19650
|
+
if (type !== "custom") {
|
|
19651
|
+
logger.warn(
|
|
19652
|
+
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
19653
|
+
);
|
|
18820
19654
|
}
|
|
18821
|
-
info = await Promise.all([
|
|
18822
|
-
constructSessions(
|
|
18823
|
-
pretrained_model_name_or_path,
|
|
18824
|
-
{
|
|
18825
|
-
model: options.model_file_name ?? "model"
|
|
18826
|
-
},
|
|
18827
|
-
options
|
|
18828
|
-
)
|
|
18829
|
-
]);
|
|
18830
19655
|
}
|
|
19656
|
+
const sessions = typeConfig.sessions(config, options);
|
|
19657
|
+
const promises = [
|
|
19658
|
+
constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
|
|
19659
|
+
];
|
|
19660
|
+
if (typeConfig.optional_configs) {
|
|
19661
|
+
promises.push(get_optional_configs(pretrained_model_name_or_path, typeConfig.optional_configs, options));
|
|
19662
|
+
}
|
|
19663
|
+
const info = await Promise.all(promises);
|
|
18831
19664
|
return new this(config, ...info);
|
|
18832
19665
|
}
|
|
18833
19666
|
/**
|
|
@@ -19026,7 +19859,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19026
19859
|
* @param {Tensor} [params.inputs=null]
|
|
19027
19860
|
* @param {number} [params.bos_token_id=null]
|
|
19028
19861
|
* @param {Record<string, Tensor|number[]>} [params.model_kwargs]
|
|
19029
|
-
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor
|
|
19862
|
+
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor> & {past_key_values?: DynamicCache}, model_input_name: string}} The model-specific inputs for generation.
|
|
19030
19863
|
*/
|
|
19031
19864
|
_prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) {
|
|
19032
19865
|
const model_inputs = pick(model_kwargs, this.forward_params);
|
|
@@ -19267,11 +20100,12 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19267
20100
|
}
|
|
19268
20101
|
}
|
|
19269
20102
|
/**
|
|
19270
|
-
* Returns
|
|
20103
|
+
* Returns a DynamicCache containing past key values from the given decoder results object.
|
|
19271
20104
|
*
|
|
19272
20105
|
* @param {Object} decoderResults The decoder results object.
|
|
19273
|
-
* @param {
|
|
19274
|
-
* @
|
|
20106
|
+
* @param {DynamicCache} pastKeyValues The previous past key values.
|
|
20107
|
+
* @param {boolean} [disposeEncoderPKVs=false] Whether to dispose encoder past key values.
|
|
20108
|
+
* @returns {DynamicCache} A new DynamicCache containing the updated past key values.
|
|
19275
20109
|
*/
|
|
19276
20110
|
getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
|
|
19277
20111
|
const pkvs = /* @__PURE__ */ Object.create(null);
|
|
@@ -19292,7 +20126,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19292
20126
|
}
|
|
19293
20127
|
}
|
|
19294
20128
|
}
|
|
19295
|
-
return pkvs;
|
|
20129
|
+
return new DynamicCache(pkvs);
|
|
19296
20130
|
}
|
|
19297
20131
|
/**
|
|
19298
20132
|
* Returns an object containing attentions from the given model output object.
|
|
@@ -19317,8 +20151,8 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19317
20151
|
/**
|
|
19318
20152
|
* Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values.
|
|
19319
20153
|
*
|
|
19320
|
-
* @param {
|
|
19321
|
-
* @param {
|
|
20154
|
+
* @param {Record<string, any>} decoderFeeds The decoder feeds object to add past key values to.
|
|
20155
|
+
* @param {DynamicCache|null} pastKeyValues The cache containing past key values.
|
|
19322
20156
|
*/
|
|
19323
20157
|
addPastKeyValues(decoderFeeds, pastKeyValues) {
|
|
19324
20158
|
if (pastKeyValues) {
|
|
@@ -19335,14 +20169,29 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19335
20169
|
}
|
|
19336
20170
|
}
|
|
19337
20171
|
}
|
|
19338
|
-
|
|
19339
|
-
|
|
20172
|
+
/**
|
|
20173
|
+
* Helper function to select valid inputs and run through the appropriate encoder (vision, text, audio) based on the input type.
|
|
20174
|
+
* @param {string} sessionName
|
|
20175
|
+
* @param {Record<string, Tensor>} inputs
|
|
20176
|
+
* @param {string} outputName
|
|
20177
|
+
* @private
|
|
20178
|
+
*/
|
|
20179
|
+
async _encode_input(sessionName, inputs, outputName) {
|
|
20180
|
+
if (!Object.hasOwn(this.sessions, sessionName)) {
|
|
20181
|
+
throw new Error(`Model does not have a ${sessionName} session.`);
|
|
20182
|
+
}
|
|
20183
|
+
const session = this.sessions[sessionName];
|
|
20184
|
+
const output = await sessionRun(session, pick(inputs, session.inputNames));
|
|
20185
|
+
return output[outputName];
|
|
19340
20186
|
}
|
|
19341
|
-
async
|
|
19342
|
-
return
|
|
20187
|
+
async encode_image(inputs) {
|
|
20188
|
+
return this._encode_input("vision_encoder", inputs, "image_features");
|
|
19343
20189
|
}
|
|
19344
|
-
async
|
|
19345
|
-
return
|
|
20190
|
+
async encode_text(inputs) {
|
|
20191
|
+
return this._encode_input("embed_tokens", inputs, "inputs_embeds");
|
|
20192
|
+
}
|
|
20193
|
+
async encode_audio(inputs) {
|
|
20194
|
+
return this._encode_input("audio_encoder", inputs, "audio_features");
|
|
19346
20195
|
}
|
|
19347
20196
|
};
|
|
19348
20197
|
async function seq2seq_forward(self2, model_inputs) {
|
|
@@ -19397,6 +20246,9 @@ async function decoder_forward(self2, model_inputs, is_encoder_decoder = false)
|
|
|
19397
20246
|
const start_index = ["paligemma", "gemma3_text", "gemma3"].includes(self2.config.model_type) ? 1 : 0;
|
|
19398
20247
|
new_model_inputs.position_ids = create_position_ids(new_model_inputs, past_key_values, start_index);
|
|
19399
20248
|
}
|
|
20249
|
+
if (session.inputNames.includes("num_logits_to_keep") && !new_model_inputs.num_logits_to_keep) {
|
|
20250
|
+
new_model_inputs.num_logits_to_keep = new Tensor2("int64", [0n], []);
|
|
20251
|
+
}
|
|
19400
20252
|
self2.addPastKeyValues(new_model_inputs, past_key_values);
|
|
19401
20253
|
const fixed = pick(new_model_inputs, session.inputNames);
|
|
19402
20254
|
return await sessionRun(session, fixed);
|
|
@@ -19405,7 +20257,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
19405
20257
|
// Generic parameters:
|
|
19406
20258
|
encode_function,
|
|
19407
20259
|
merge_function,
|
|
19408
|
-
|
|
20260
|
+
modality_input_names,
|
|
19409
20261
|
modality_output_name,
|
|
19410
20262
|
// Produced by the tokenizer/processor:
|
|
19411
20263
|
input_ids = null,
|
|
@@ -19420,32 +20272,34 @@ async function generic_text_to_text_forward(self2, {
|
|
|
19420
20272
|
// Additional parameters
|
|
19421
20273
|
...kwargs
|
|
19422
20274
|
}) {
|
|
19423
|
-
const modality_values = kwargs[modality_input_name];
|
|
19424
20275
|
if (!inputs_embeds) {
|
|
19425
20276
|
inputs_embeds = await self2.encode_text({ input_ids, ...kwargs });
|
|
19426
|
-
|
|
19427
|
-
|
|
19428
|
-
|
|
19429
|
-
|
|
19430
|
-
|
|
19431
|
-
|
|
19432
|
-
|
|
19433
|
-
|
|
19434
|
-
|
|
19435
|
-
inputs_embeds,
|
|
19436
|
-
|
|
19437
|
-
|
|
19438
|
-
|
|
19439
|
-
|
|
19440
|
-
|
|
19441
|
-
|
|
19442
|
-
|
|
19443
|
-
|
|
19444
|
-
|
|
19445
|
-
|
|
19446
|
-
|
|
19447
|
-
|
|
19448
|
-
|
|
20277
|
+
const modality_values = pick(kwargs, modality_input_names);
|
|
20278
|
+
if (Object.keys(modality_values).length > 0) {
|
|
20279
|
+
if (input_ids.dims[1] !== 1) {
|
|
20280
|
+
const modality_features = await encode_function({
|
|
20281
|
+
// Pass the modality values under its expected key.
|
|
20282
|
+
// The caller knows whether this is audio or image.
|
|
20283
|
+
...modality_values,
|
|
20284
|
+
...kwargs
|
|
20285
|
+
});
|
|
20286
|
+
({ inputs_embeds, attention_mask } = merge_function({
|
|
20287
|
+
[modality_output_name]: modality_features,
|
|
20288
|
+
inputs_embeds,
|
|
20289
|
+
input_ids,
|
|
20290
|
+
attention_mask
|
|
20291
|
+
}));
|
|
20292
|
+
} else if (past_key_values && input_ids.dims[1] === 1) {
|
|
20293
|
+
const target_length = input_ids.dims[1];
|
|
20294
|
+
const past_length = past_key_values.get_seq_length();
|
|
20295
|
+
attention_mask = cat(
|
|
20296
|
+
[
|
|
20297
|
+
ones([input_ids.dims[0], past_length]),
|
|
20298
|
+
attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
|
|
20299
|
+
],
|
|
20300
|
+
1
|
|
20301
|
+
);
|
|
20302
|
+
}
|
|
19449
20303
|
}
|
|
19450
20304
|
}
|
|
19451
20305
|
if (!position_ids) {
|
|
@@ -19453,10 +20307,13 @@ async function generic_text_to_text_forward(self2, {
|
|
|
19453
20307
|
// Handle special case for qwen vl models
|
|
19454
20308
|
[
|
|
19455
20309
|
"qwen2_vl",
|
|
20310
|
+
"qwen2_vl_text",
|
|
19456
20311
|
"qwen2_5_vl",
|
|
19457
20312
|
"qwen2_5_vl_text",
|
|
19458
20313
|
"qwen3_vl",
|
|
19459
20314
|
"qwen3_vl_text",
|
|
20315
|
+
"qwen3_vl_moe",
|
|
20316
|
+
"qwen3_vl_moe_text",
|
|
19460
20317
|
"qwen3_5",
|
|
19461
20318
|
"qwen3_5_text",
|
|
19462
20319
|
"qwen3_5_moe",
|
|
@@ -19484,7 +20341,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
19484
20341
|
async function audio_text_to_text_forward(self2, params) {
|
|
19485
20342
|
return await generic_text_to_text_forward(self2, {
|
|
19486
20343
|
...params,
|
|
19487
|
-
|
|
20344
|
+
modality_input_names: ["audio_values", "input_features"],
|
|
19488
20345
|
modality_output_name: "audio_features",
|
|
19489
20346
|
encode_function: self2.encode_audio.bind(self2),
|
|
19490
20347
|
merge_function: self2._merge_input_ids_with_audio_features.bind(self2)
|
|
@@ -19493,7 +20350,7 @@ async function audio_text_to_text_forward(self2, params) {
|
|
|
19493
20350
|
async function image_text_to_text_forward(self2, params) {
|
|
19494
20351
|
return await generic_text_to_text_forward(self2, {
|
|
19495
20352
|
...params,
|
|
19496
|
-
|
|
20353
|
+
modality_input_names: ["pixel_values"],
|
|
19497
20354
|
modality_output_name: "image_features",
|
|
19498
20355
|
encode_function: self2.encode_image.bind(self2),
|
|
19499
20356
|
merge_function: self2._merge_input_ids_with_image_features.bind(self2)
|
|
@@ -19529,7 +20386,11 @@ function create_position_ids(model_inputs, past_key_values = null, start_index =
|
|
|
19529
20386
|
return position_ids;
|
|
19530
20387
|
}
|
|
19531
20388
|
function decoder_prepare_inputs_for_generation(self2, input_ids, model_inputs, generation_config) {
|
|
19532
|
-
const past_length = model_inputs.past_key_values ?
|
|
20389
|
+
const past_length = model_inputs.past_key_values ? model_inputs.past_key_values.get_seq_length() : 0;
|
|
20390
|
+
const session = self2.sessions["decoder_model_merged"] ?? self2.sessions["model"];
|
|
20391
|
+
if (session?.inputNames.includes("num_logits_to_keep") && !model_inputs.num_logits_to_keep) {
|
|
20392
|
+
model_inputs.num_logits_to_keep = new Tensor2("int64", [1n], []);
|
|
20393
|
+
}
|
|
19533
20394
|
if (!model_inputs.attention_mask) {
|
|
19534
20395
|
let dims;
|
|
19535
20396
|
for (const key of ["input_ids", "inputs_embeds", "position_ids"]) {
|
|
@@ -19837,6 +20698,7 @@ __export(models_exports, {
|
|
|
19837
20698
|
Gemma3ForCausalLM: () => Gemma3ForCausalLM,
|
|
19838
20699
|
Gemma3Model: () => Gemma3Model,
|
|
19839
20700
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
20701
|
+
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
19840
20702
|
Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
|
|
19841
20703
|
Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
|
|
19842
20704
|
GemmaForCausalLM: () => GemmaForCausalLM,
|
|
@@ -19854,6 +20716,7 @@ __export(models_exports, {
|
|
|
19854
20716
|
GraniteMoeHybridModel: () => GraniteMoeHybridModel,
|
|
19855
20717
|
GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
|
|
19856
20718
|
GranitePreTrainedModel: () => GranitePreTrainedModel,
|
|
20719
|
+
GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
|
|
19857
20720
|
GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
|
|
19858
20721
|
GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
|
|
19859
20722
|
GroupViTModel: () => GroupViTModel,
|
|
@@ -19875,7 +20738,6 @@ __export(models_exports, {
|
|
|
19875
20738
|
IJepaModel: () => IJepaModel,
|
|
19876
20739
|
IJepaPreTrainedModel: () => IJepaPreTrainedModel,
|
|
19877
20740
|
Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
|
|
19878
|
-
Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
|
|
19879
20741
|
JAISLMHeadModel: () => JAISLMHeadModel,
|
|
19880
20742
|
JAISModel: () => JAISModel,
|
|
19881
20743
|
JAISPreTrainedModel: () => JAISPreTrainedModel,
|
|
@@ -19889,6 +20751,7 @@ __export(models_exports, {
|
|
|
19889
20751
|
Lfm2MoeModel: () => Lfm2MoeModel,
|
|
19890
20752
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
19891
20753
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
20754
|
+
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
19892
20755
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
19893
20756
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
19894
20757
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -20028,7 +20891,6 @@ __export(models_exports, {
|
|
|
20028
20891
|
Owlv2Model: () => Owlv2Model,
|
|
20029
20892
|
Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
|
|
20030
20893
|
PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
|
|
20031
|
-
PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
|
|
20032
20894
|
ParakeetForCTC: () => ParakeetForCTC,
|
|
20033
20895
|
ParakeetPreTrainedModel: () => ParakeetPreTrainedModel,
|
|
20034
20896
|
PatchTSMixerForPrediction: () => PatchTSMixerForPrediction,
|
|
@@ -20058,8 +20920,10 @@ __export(models_exports, {
|
|
|
20058
20920
|
Qwen2MoeModel: () => Qwen2MoeModel,
|
|
20059
20921
|
Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
|
|
20060
20922
|
Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
|
|
20923
|
+
Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
|
|
20061
20924
|
Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
|
|
20062
20925
|
Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
|
|
20926
|
+
Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
|
|
20063
20927
|
Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
|
|
20064
20928
|
Qwen3ForCausalLM: () => Qwen3ForCausalLM,
|
|
20065
20929
|
Qwen3Model: () => Qwen3Model,
|
|
@@ -20070,9 +20934,13 @@ __export(models_exports, {
|
|
|
20070
20934
|
Qwen3NextModel: () => Qwen3NextModel,
|
|
20071
20935
|
Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
|
|
20072
20936
|
Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
|
|
20937
|
+
Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
|
|
20073
20938
|
Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
|
|
20939
|
+
Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
|
|
20074
20940
|
Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
|
|
20941
|
+
Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
|
|
20075
20942
|
Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
|
|
20943
|
+
Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
|
|
20076
20944
|
Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
|
|
20077
20945
|
RFDetrForObjectDetection: () => RFDetrForObjectDetection,
|
|
20078
20946
|
RFDetrModel: () => RFDetrModel,
|
|
@@ -20123,7 +20991,6 @@ __export(models_exports, {
|
|
|
20123
20991
|
SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
|
|
20124
20992
|
SmolLM3Model: () => SmolLM3Model,
|
|
20125
20993
|
SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
|
|
20126
|
-
SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
|
|
20127
20994
|
SnacDecoderModel: () => SnacDecoderModel,
|
|
20128
20995
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
20129
20996
|
SnacModel: () => SnacModel,
|
|
@@ -20195,6 +21062,8 @@ __export(models_exports, {
|
|
|
20195
21062
|
VitsModelOutput: () => VitsModelOutput,
|
|
20196
21063
|
VitsPreTrainedModel: () => VitsPreTrainedModel,
|
|
20197
21064
|
VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
|
|
21065
|
+
VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
|
|
21066
|
+
VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
|
|
20198
21067
|
Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
|
|
20199
21068
|
Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
|
|
20200
21069
|
Wav2Vec2BertModel: () => Wav2Vec2BertModel,
|
|
@@ -20555,7 +21424,7 @@ var ChatterboxModel = class extends ChatterboxPreTrainedModel {
|
|
|
20555
21424
|
if (!past_key_values || target_length !== 1) {
|
|
20556
21425
|
throw new Error("Incorrect state encountered during generation.");
|
|
20557
21426
|
}
|
|
20558
|
-
const past_length =
|
|
21427
|
+
const past_length = past_key_values.get_seq_length();
|
|
20559
21428
|
attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
|
|
20560
21429
|
}
|
|
20561
21430
|
}
|
|
@@ -21585,6 +22454,8 @@ var Gemma3nForConditionalGeneration = class extends Gemma3nPreTrainedModel {
|
|
|
21585
22454
|
});
|
|
21586
22455
|
}
|
|
21587
22456
|
};
|
|
22457
|
+
var Gemma3nForCausalLM = class extends Gemma3nForConditionalGeneration {
|
|
22458
|
+
};
|
|
21588
22459
|
|
|
21589
22460
|
// src/models/glm/modeling_glm.js
|
|
21590
22461
|
var GlmPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -21666,6 +22537,28 @@ var GraniteMoeHybridModel = class extends GraniteMoeHybridPreTrainedModel {
|
|
|
21666
22537
|
var GraniteMoeHybridForCausalLM = class extends GraniteMoeHybridPreTrainedModel {
|
|
21667
22538
|
};
|
|
21668
22539
|
|
|
22540
|
+
// src/models/ultravox/modeling_ultravox.js
|
|
22541
|
+
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
22542
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
22543
|
+
};
|
|
22544
|
+
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
22545
|
+
_merge_input_ids_with_audio_features(kwargs) {
|
|
22546
|
+
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
22547
|
+
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
22548
|
+
return default_merge_input_ids_with_audio_features({
|
|
22549
|
+
// @ts-ignore
|
|
22550
|
+
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id ?? this.config.audio_token_index,
|
|
22551
|
+
...kwargs,
|
|
22552
|
+
audio_features: reshaped_audio_features
|
|
22553
|
+
});
|
|
22554
|
+
}
|
|
22555
|
+
};
|
|
22556
|
+
|
|
22557
|
+
// src/models/granite_speech/modeling_granite_speech.js
|
|
22558
|
+
var GraniteSpeechForConditionalGeneration = class extends UltravoxModel {
|
|
22559
|
+
forward_params = ["input_ids", "attention_mask", "input_features", "past_key_values"];
|
|
22560
|
+
};
|
|
22561
|
+
|
|
21669
22562
|
// src/models/grounding_dino/modeling_grounding_dino.js
|
|
21670
22563
|
var GroundingDinoPreTrainedModel = class extends PreTrainedModel {
|
|
21671
22564
|
};
|
|
@@ -21770,34 +22663,37 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
|
|
|
21770
22663
|
var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
|
|
21771
22664
|
};
|
|
21772
22665
|
|
|
21773
|
-
// src/models/
|
|
21774
|
-
var
|
|
21775
|
-
forward_params = [
|
|
21776
|
-
"input_ids",
|
|
21777
|
-
"attention_mask",
|
|
21778
|
-
"pixel_values",
|
|
21779
|
-
"pixel_attention_mask",
|
|
21780
|
-
"position_ids",
|
|
21781
|
-
"past_key_values"
|
|
21782
|
-
];
|
|
22666
|
+
// src/models/llava/modeling_llava.js
|
|
22667
|
+
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
22668
|
+
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
21783
22669
|
};
|
|
21784
|
-
var
|
|
21785
|
-
async encode_image({ pixel_values, pixel_attention_mask }) {
|
|
21786
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], { pixel_values, pixel_attention_mask })).image_features;
|
|
21787
|
-
return features;
|
|
21788
|
-
}
|
|
22670
|
+
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
21789
22671
|
_merge_input_ids_with_image_features(kwargs) {
|
|
21790
22672
|
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
21791
22673
|
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
21792
22674
|
return default_merge_input_ids_with_image_features({
|
|
21793
22675
|
// @ts-ignore
|
|
21794
|
-
image_token_id: this.config.image_token_id,
|
|
22676
|
+
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
21795
22677
|
...kwargs,
|
|
21796
22678
|
image_features: reshaped_image_hidden_states
|
|
21797
22679
|
});
|
|
21798
22680
|
}
|
|
21799
22681
|
};
|
|
21800
|
-
var
|
|
22682
|
+
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
22683
|
+
};
|
|
22684
|
+
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
22685
|
+
};
|
|
22686
|
+
|
|
22687
|
+
// src/models/idefics3/modeling_idefics3.js
|
|
22688
|
+
var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
22689
|
+
forward_params = [
|
|
22690
|
+
"input_ids",
|
|
22691
|
+
"attention_mask",
|
|
22692
|
+
"pixel_values",
|
|
22693
|
+
"pixel_attention_mask",
|
|
22694
|
+
"position_ids",
|
|
22695
|
+
"past_key_values"
|
|
22696
|
+
];
|
|
21801
22697
|
};
|
|
21802
22698
|
|
|
21803
22699
|
// src/models/ijepa/modeling_ijepa.js
|
|
@@ -21889,6 +22785,19 @@ var Lfm2MoeModel = class extends Lfm2MoePreTrainedModel {
|
|
|
21889
22785
|
var Lfm2MoeForCausalLM = class extends Lfm2MoePreTrainedModel {
|
|
21890
22786
|
};
|
|
21891
22787
|
|
|
22788
|
+
// src/models/lfm2_vl/modeling_lfm2_vl.js
|
|
22789
|
+
var Lfm2VlForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
22790
|
+
forward_params = [
|
|
22791
|
+
"input_ids",
|
|
22792
|
+
"attention_mask",
|
|
22793
|
+
"pixel_values",
|
|
22794
|
+
"pixel_attention_mask",
|
|
22795
|
+
"spatial_shapes",
|
|
22796
|
+
"position_ids",
|
|
22797
|
+
"past_key_values"
|
|
22798
|
+
];
|
|
22799
|
+
};
|
|
22800
|
+
|
|
21892
22801
|
// src/models/llama/modeling_llama.js
|
|
21893
22802
|
var LlamaPreTrainedModel = class extends PreTrainedModel {
|
|
21894
22803
|
};
|
|
@@ -21903,27 +22812,6 @@ var Llama4PreTrainedModel = class extends PreTrainedModel {
|
|
|
21903
22812
|
var Llama4ForCausalLM = class extends Llama4PreTrainedModel {
|
|
21904
22813
|
};
|
|
21905
22814
|
|
|
21906
|
-
// src/models/llava/modeling_llava.js
|
|
21907
|
-
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
21908
|
-
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
21909
|
-
};
|
|
21910
|
-
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
21911
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
21912
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
21913
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
21914
|
-
return default_merge_input_ids_with_image_features({
|
|
21915
|
-
// @ts-ignore
|
|
21916
|
-
image_token_id: this.config.image_token_index,
|
|
21917
|
-
...kwargs,
|
|
21918
|
-
image_features: reshaped_image_hidden_states
|
|
21919
|
-
});
|
|
21920
|
-
}
|
|
21921
|
-
};
|
|
21922
|
-
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
21923
|
-
};
|
|
21924
|
-
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
21925
|
-
};
|
|
21926
|
-
|
|
21927
22815
|
// src/models/longt5/modeling_longt5.js
|
|
21928
22816
|
var LongT5PreTrainedModel = class extends PreTrainedModel {
|
|
21929
22817
|
};
|
|
@@ -22674,27 +23562,7 @@ var OwlViTForObjectDetection = class extends OwlViTPreTrainedModel {
|
|
|
22674
23562
|
};
|
|
22675
23563
|
|
|
22676
23564
|
// src/models/paligemma/modeling_paligemma.js
|
|
22677
|
-
var
|
|
22678
|
-
forward_params = [
|
|
22679
|
-
"input_ids",
|
|
22680
|
-
// 'inputs_embeds',
|
|
22681
|
-
"attention_mask",
|
|
22682
|
-
"pixel_values",
|
|
22683
|
-
"position_ids",
|
|
22684
|
-
"past_key_values"
|
|
22685
|
-
];
|
|
22686
|
-
};
|
|
22687
|
-
var PaliGemmaForConditionalGeneration = class extends PaliGemmaPreTrainedModel {
|
|
22688
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
22689
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
22690
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
22691
|
-
return default_merge_input_ids_with_image_features({
|
|
22692
|
-
// @ts-ignore
|
|
22693
|
-
image_token_id: this.config.image_token_index,
|
|
22694
|
-
...kwargs,
|
|
22695
|
-
image_features: reshaped_image_hidden_states
|
|
22696
|
-
});
|
|
22697
|
-
}
|
|
23565
|
+
var PaliGemmaForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
22698
23566
|
};
|
|
22699
23567
|
|
|
22700
23568
|
// src/models/parakeet/modeling_parakeet.js
|
|
@@ -22867,6 +23735,9 @@ var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
|
22867
23735
|
];
|
|
22868
23736
|
};
|
|
22869
23737
|
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
23738
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
23739
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
23740
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
22870
23741
|
image_grid_thw_name = "grid_thw";
|
|
22871
23742
|
/**
|
|
22872
23743
|
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
@@ -23056,7 +23927,7 @@ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
|
23056
23927
|
);
|
|
23057
23928
|
} else {
|
|
23058
23929
|
model_inputs.pixel_values = null;
|
|
23059
|
-
const past_length =
|
|
23930
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
23060
23931
|
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
23061
23932
|
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
23062
23933
|
model_inputs.input_ids,
|
|
@@ -23085,11 +23956,16 @@ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
|
23085
23956
|
return model_inputs;
|
|
23086
23957
|
}
|
|
23087
23958
|
};
|
|
23959
|
+
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
23960
|
+
};
|
|
23088
23961
|
|
|
23089
23962
|
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
23090
23963
|
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
23091
23964
|
image_grid_thw_name = "image_grid_thw";
|
|
23092
23965
|
};
|
|
23966
|
+
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
23967
|
+
image_grid_thw_name = "image_grid_thw";
|
|
23968
|
+
};
|
|
23093
23969
|
|
|
23094
23970
|
// src/models/qwen3/modeling_qwen3.js
|
|
23095
23971
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
@@ -23118,18 +23994,26 @@ var Qwen3NextForCausalLM = class extends Qwen3NextPreTrainedModel {
|
|
|
23118
23994
|
// src/models/qwen3_vl/modeling_qwen3_vl.js
|
|
23119
23995
|
var Qwen3VLForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
23120
23996
|
};
|
|
23997
|
+
var Qwen3VLForCausalLM = class extends Qwen2_5_VLForCausalLM {
|
|
23998
|
+
};
|
|
23121
23999
|
|
|
23122
24000
|
// src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js
|
|
23123
24001
|
var Qwen3VLMoeForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
23124
24002
|
};
|
|
24003
|
+
var Qwen3VLMoeForCausalLM = class extends Qwen3VLForCausalLM {
|
|
24004
|
+
};
|
|
23125
24005
|
|
|
23126
24006
|
// src/models/qwen3_5/modeling_qwen3_5.js
|
|
23127
24007
|
var Qwen3_5ForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
23128
24008
|
};
|
|
24009
|
+
var Qwen3_5ForCausalLM = class extends Qwen3_5ForConditionalGeneration {
|
|
24010
|
+
};
|
|
23129
24011
|
|
|
23130
24012
|
// src/models/qwen3_5_moe/modeling_qwen3_5_moe.js
|
|
23131
24013
|
var Qwen3_5MoeForConditionalGeneration = class extends Qwen3_5ForConditionalGeneration {
|
|
23132
24014
|
};
|
|
24015
|
+
var Qwen3_5MoeForCausalLM = class extends Qwen3_5ForCausalLM {
|
|
24016
|
+
};
|
|
23133
24017
|
|
|
23134
24018
|
// src/models/resnet/modeling_resnet.js
|
|
23135
24019
|
var ResNetPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -23810,25 +24694,6 @@ var TrOCRPreTrainedModel = class extends PreTrainedModel {
|
|
|
23810
24694
|
var TrOCRForCausalLM = class extends TrOCRPreTrainedModel {
|
|
23811
24695
|
};
|
|
23812
24696
|
|
|
23813
|
-
// src/models/ultravox/modeling_ultravox.js
|
|
23814
|
-
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
23815
|
-
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
23816
|
-
};
|
|
23817
|
-
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
23818
|
-
_merge_input_ids_with_audio_features(kwargs) {
|
|
23819
|
-
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
23820
|
-
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
23821
|
-
return default_merge_input_ids_with_audio_features({
|
|
23822
|
-
// @ts-ignore
|
|
23823
|
-
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id,
|
|
23824
|
-
...kwargs,
|
|
23825
|
-
audio_features: reshaped_audio_features
|
|
23826
|
-
});
|
|
23827
|
-
}
|
|
23828
|
-
};
|
|
23829
|
-
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
23830
|
-
};
|
|
23831
|
-
|
|
23832
24697
|
// src/models/unispeech/modeling_unispeech.js
|
|
23833
24698
|
var UniSpeechPreTrainedModel = class extends PreTrainedModel {
|
|
23834
24699
|
};
|
|
@@ -23994,6 +24859,170 @@ var VitsModel = class extends VitsPreTrainedModel {
|
|
|
23994
24859
|
}
|
|
23995
24860
|
};
|
|
23996
24861
|
|
|
24862
|
+
// src/models/voxtral/modeling_voxtral.js
|
|
24863
|
+
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
24864
|
+
};
|
|
24865
|
+
|
|
24866
|
+
// src/models/voxtral_realtime/modeling_voxtral_realtime.js
|
|
24867
|
+
var CONV1_LEFT_PAD = 2;
|
|
24868
|
+
var CONV2_LEFT_PAD = 1;
|
|
24869
|
+
var states = /* @__PURE__ */ new WeakMap();
|
|
24870
|
+
function createEncoderState(model, input_features) {
|
|
24871
|
+
const { text_config, audio_config } = (
|
|
24872
|
+
/** @type {any} */
|
|
24873
|
+
model.config
|
|
24874
|
+
);
|
|
24875
|
+
const encoder_session = model.sessions["audio_encoder"];
|
|
24876
|
+
const { num_mel_bins, hidden_size: enc_hidden_size } = audio_config;
|
|
24877
|
+
const PADDING_CACHE_CHANNELS = num_mel_bins + enc_hidden_size;
|
|
24878
|
+
const enc_kv_cache = new DynamicCache();
|
|
24879
|
+
const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? "float32";
|
|
24880
|
+
const enc_cls = enc_dtype === "float16" ? DataTypeMap.float16 : DataTypeMap.float32;
|
|
24881
|
+
const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
|
|
24882
|
+
for (const name in enc_shapes) {
|
|
24883
|
+
const size = enc_shapes[name].reduce((a, b) => a * b, 1);
|
|
24884
|
+
enc_kv_cache[name] = new Tensor2(enc_dtype, new enc_cls(size), enc_shapes[name]);
|
|
24885
|
+
}
|
|
24886
|
+
const enc_padding_cache = new Tensor2(enc_dtype, new enc_cls(PADDING_CACHE_CHANNELS * CONV1_LEFT_PAD), [
|
|
24887
|
+
1,
|
|
24888
|
+
PADDING_CACHE_CHANNELS,
|
|
24889
|
+
CONV1_LEFT_PAD
|
|
24890
|
+
]);
|
|
24891
|
+
const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
|
|
24892
|
+
if (!chunks_iter) {
|
|
24893
|
+
throw new Error("input_features must be iterable or async iterable");
|
|
24894
|
+
}
|
|
24895
|
+
return {
|
|
24896
|
+
encoder_session,
|
|
24897
|
+
enc_kv_cache,
|
|
24898
|
+
enc_padding_cache,
|
|
24899
|
+
enc_past_seq_len: 0,
|
|
24900
|
+
audio_embed_queue: [],
|
|
24901
|
+
audio_embed_total_tokens: 0,
|
|
24902
|
+
audio_queue_offset: 0,
|
|
24903
|
+
audio_consumed: 0,
|
|
24904
|
+
stream_exhausted: false,
|
|
24905
|
+
chunks_iter,
|
|
24906
|
+
text_hidden_size: text_config.hidden_size
|
|
24907
|
+
};
|
|
24908
|
+
}
|
|
24909
|
+
async function encodeChunk(s, chunk_features) {
|
|
24910
|
+
const audio_seq_len = chunk_features.dims[2];
|
|
24911
|
+
const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1;
|
|
24912
|
+
const position_ids = new Tensor2(
|
|
24913
|
+
"int64",
|
|
24914
|
+
BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
|
|
24915
|
+
[1, conv2_output_len]
|
|
24916
|
+
);
|
|
24917
|
+
const total_seq_len = s.enc_past_seq_len + conv2_output_len;
|
|
24918
|
+
const attention_mask = ones([1, total_seq_len]);
|
|
24919
|
+
const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
|
|
24920
|
+
input_features: chunk_features,
|
|
24921
|
+
attention_mask,
|
|
24922
|
+
position_ids,
|
|
24923
|
+
past_padding_cache: s.enc_padding_cache,
|
|
24924
|
+
...s.enc_kv_cache
|
|
24925
|
+
});
|
|
24926
|
+
if (s.enc_padding_cache.location === "gpu-buffer") {
|
|
24927
|
+
s.enc_padding_cache.dispose();
|
|
24928
|
+
}
|
|
24929
|
+
s.enc_padding_cache = present_padding_cache;
|
|
24930
|
+
for (const name in present_cache) {
|
|
24931
|
+
if (name.startsWith("present.")) {
|
|
24932
|
+
const pastName = name.replace("present", "past_key_values");
|
|
24933
|
+
const prev = s.enc_kv_cache[pastName];
|
|
24934
|
+
if (prev?.location === "gpu-buffer") {
|
|
24935
|
+
prev.dispose();
|
|
24936
|
+
}
|
|
24937
|
+
s.enc_kv_cache[pastName] = present_cache[name];
|
|
24938
|
+
}
|
|
24939
|
+
}
|
|
24940
|
+
s.enc_past_seq_len = total_seq_len;
|
|
24941
|
+
return audio_embeds;
|
|
24942
|
+
}
|
|
24943
|
+
async function fillAudioBuffer(s, needed) {
|
|
24944
|
+
while (s.audio_embed_total_tokens < needed && !s.stream_exhausted) {
|
|
24945
|
+
const result = await s.chunks_iter.next();
|
|
24946
|
+
if (result.done) {
|
|
24947
|
+
s.stream_exhausted = true;
|
|
24948
|
+
break;
|
|
24949
|
+
}
|
|
24950
|
+
const new_embeds = await encodeChunk(s, result.value);
|
|
24951
|
+
s.audio_embed_queue.push({ data: new_embeds.data, tokens: new_embeds.dims[1] });
|
|
24952
|
+
s.audio_embed_total_tokens += new_embeds.dims[1];
|
|
24953
|
+
}
|
|
24954
|
+
}
|
|
24955
|
+
function addAudioEmbeddings(s, inputs_embeds, current_len) {
|
|
24956
|
+
if (s.audio_embed_queue.length === 0) return;
|
|
24957
|
+
const embed_data = inputs_embeds.data;
|
|
24958
|
+
let embed_write_pos = 0;
|
|
24959
|
+
let remaining = current_len;
|
|
24960
|
+
while (remaining > 0 && s.audio_embed_queue.length > 0) {
|
|
24961
|
+
const front = s.audio_embed_queue[0];
|
|
24962
|
+
const available = front.tokens - s.audio_queue_offset;
|
|
24963
|
+
const n = Math.min(remaining, available);
|
|
24964
|
+
const src_offset = s.audio_queue_offset * s.text_hidden_size;
|
|
24965
|
+
for (let i = 0; i < n * s.text_hidden_size; ++i) {
|
|
24966
|
+
embed_data[embed_write_pos * s.text_hidden_size + i] += front.data[src_offset + i];
|
|
24967
|
+
}
|
|
24968
|
+
embed_write_pos += n;
|
|
24969
|
+
remaining -= n;
|
|
24970
|
+
s.audio_queue_offset += n;
|
|
24971
|
+
if (s.audio_queue_offset >= front.tokens) {
|
|
24972
|
+
s.audio_embed_queue.shift();
|
|
24973
|
+
s.audio_queue_offset = 0;
|
|
24974
|
+
}
|
|
24975
|
+
}
|
|
24976
|
+
s.audio_consumed += current_len - remaining;
|
|
24977
|
+
}
|
|
24978
|
+
var AudioExhaustedCriteria = class extends StoppingCriteria {
|
|
24979
|
+
constructor(enc_state) {
|
|
24980
|
+
super();
|
|
24981
|
+
this._s = enc_state;
|
|
24982
|
+
}
|
|
24983
|
+
_call(input_ids) {
|
|
24984
|
+
const done = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
|
|
24985
|
+
return input_ids.map(() => done);
|
|
24986
|
+
}
|
|
24987
|
+
};
|
|
24988
|
+
var VoxtralRealtimePreTrainedModel = class extends PreTrainedModel {
|
|
24989
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "past_key_values"];
|
|
24990
|
+
};
|
|
24991
|
+
var VoxtralRealtimeForConditionalGeneration = class extends VoxtralRealtimePreTrainedModel {
|
|
24992
|
+
async forward({ input_ids, past_key_values, ...kwargs }) {
|
|
24993
|
+
const current_len = input_ids.dims[1];
|
|
24994
|
+
const enc = states.get(this);
|
|
24995
|
+
if (enc) {
|
|
24996
|
+
await fillAudioBuffer(enc, enc.audio_consumed + current_len);
|
|
24997
|
+
}
|
|
24998
|
+
const { inputs_embeds } = await sessionRun(this.sessions["embed_tokens"], { input_ids });
|
|
24999
|
+
if (enc) {
|
|
25000
|
+
addAudioEmbeddings(enc, inputs_embeds, current_len);
|
|
25001
|
+
}
|
|
25002
|
+
const decoder_feeds = { inputs_embeds, ...kwargs };
|
|
25003
|
+
this.addPastKeyValues(decoder_feeds, past_key_values);
|
|
25004
|
+
const session = this.sessions["decoder_model_merged"];
|
|
25005
|
+
const fixed = pick(decoder_feeds, session.inputNames);
|
|
25006
|
+
return await sessionRun(session, fixed);
|
|
25007
|
+
}
|
|
25008
|
+
async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
|
|
25009
|
+
if (!input_features) {
|
|
25010
|
+
throw new Error("input_features (generator/iterable) must be provided");
|
|
25011
|
+
}
|
|
25012
|
+
const enc_state = createEncoderState(this, input_features);
|
|
25013
|
+
states.set(this, enc_state);
|
|
25014
|
+
const stopping_criteria = new StoppingCriteriaList();
|
|
25015
|
+
stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
|
|
25016
|
+
if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);
|
|
25017
|
+
try {
|
|
25018
|
+
return await super.generate({ ...kwargs, stopping_criteria });
|
|
25019
|
+
} finally {
|
|
25020
|
+
enc_state.enc_kv_cache.dispose();
|
|
25021
|
+
states.delete(this);
|
|
25022
|
+
}
|
|
25023
|
+
}
|
|
25024
|
+
};
|
|
25025
|
+
|
|
23997
25026
|
// src/models/wav2vec2_bert/modeling_wav2vec2_bert.js
|
|
23998
25027
|
var Wav2Vec2BertPreTrainedModel = class extends PreTrainedModel {
|
|
23999
25028
|
};
|
|
@@ -24747,6 +25776,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24747
25776
|
["gemma2", "Gemma2ForCausalLM"],
|
|
24748
25777
|
["vaultgemma", "VaultGemmaForCausalLM"],
|
|
24749
25778
|
["gemma3_text", "Gemma3ForCausalLM"],
|
|
25779
|
+
["gemma3", "Gemma3ForCausalLM"],
|
|
24750
25780
|
["helium", "HeliumForCausalLM"],
|
|
24751
25781
|
["glm", "GlmForCausalLM"],
|
|
24752
25782
|
["openelm", "OpenELMForCausalLM"],
|
|
@@ -24755,6 +25785,13 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24755
25785
|
["qwen3", "Qwen3ForCausalLM"],
|
|
24756
25786
|
["qwen3_moe", "Qwen3MoeForCausalLM"],
|
|
24757
25787
|
["qwen3_next", "Qwen3NextForCausalLM"],
|
|
25788
|
+
["qwen2_vl", "Qwen2VLForCausalLM"],
|
|
25789
|
+
["qwen2_5_vl", "Qwen2_5_VLForCausalLM"],
|
|
25790
|
+
["qwen3_vl", "Qwen3VLForCausalLM"],
|
|
25791
|
+
["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
|
|
25792
|
+
["qwen3_5", "Qwen3_5ForCausalLM"],
|
|
25793
|
+
["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
|
|
25794
|
+
["gemma3n", "Gemma3nForCausalLM"],
|
|
24758
25795
|
["phi", "PhiForCausalLM"],
|
|
24759
25796
|
["phi3", "Phi3ForCausalLM"],
|
|
24760
25797
|
["mpt", "MptForCausalLM"],
|
|
@@ -24830,6 +25867,7 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24830
25867
|
["qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"],
|
|
24831
25868
|
["qwen3_5", "Qwen3_5ForConditionalGeneration"],
|
|
24832
25869
|
["qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"],
|
|
25870
|
+
["lfm2_vl", "Lfm2VlForConditionalGeneration"],
|
|
24833
25871
|
["idefics3", "Idefics3ForConditionalGeneration"],
|
|
24834
25872
|
["smolvlm", "SmolVLMForConditionalGeneration"],
|
|
24835
25873
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
@@ -24838,8 +25876,10 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24838
25876
|
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
24839
25877
|
]);
|
|
24840
25878
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
25879
|
+
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
24841
25880
|
["ultravox", "UltravoxModel"],
|
|
24842
|
-
["voxtral", "VoxtralForConditionalGeneration"]
|
|
25881
|
+
["voxtral", "VoxtralForConditionalGeneration"],
|
|
25882
|
+
["voxtral_realtime", "VoxtralRealtimeForConditionalGeneration"]
|
|
24843
25883
|
]);
|
|
24844
25884
|
var MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
24845
25885
|
["vision-encoder-decoder", "VisionEncoderDecoderModel"]
|
|
@@ -25022,7 +26062,19 @@ var CUSTOM_MAPPING = [
|
|
|
25022
26062
|
MODEL_TYPES.ImageAudioTextToText
|
|
25023
26063
|
],
|
|
25024
26064
|
["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
|
|
25025
|
-
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox]
|
|
26065
|
+
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
|
|
26066
|
+
["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26067
|
+
["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26068
|
+
["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26069
|
+
["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26070
|
+
["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26071
|
+
["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26072
|
+
["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26073
|
+
[
|
|
26074
|
+
"VoxtralRealtimeForConditionalGeneration",
|
|
26075
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
26076
|
+
MODEL_TYPES.VoxtralRealtime
|
|
26077
|
+
]
|
|
25026
26078
|
];
|
|
25027
26079
|
for (const [name, model, type] of CUSTOM_MAPPING) {
|
|
25028
26080
|
MODEL_TYPE_MAPPING.set(name, type);
|
|
@@ -26700,8 +27752,18 @@ var TASK_ALIASES = Object.freeze({
|
|
|
26700
27752
|
});
|
|
26701
27753
|
|
|
26702
27754
|
// src/utils/model_registry/get_model_files.js
|
|
27755
|
+
function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
|
|
27756
|
+
if (config !== null) {
|
|
27757
|
+
return AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision });
|
|
27758
|
+
}
|
|
27759
|
+
const key = JSON.stringify([modelId, cache_dir, local_files_only, revision]);
|
|
27760
|
+
return memoizePromise(
|
|
27761
|
+
key,
|
|
27762
|
+
() => AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision })
|
|
27763
|
+
);
|
|
27764
|
+
}
|
|
26703
27765
|
async function get_model_files(modelId, { config = null, dtype: overrideDtype = null, device: overrideDevice = null, model_file_name = null } = {}) {
|
|
26704
|
-
config = await
|
|
27766
|
+
config = await get_config(modelId, { config });
|
|
26705
27767
|
const files = [
|
|
26706
27768
|
// Add config.json (always loaded)
|
|
26707
27769
|
"config.json"
|
|
@@ -26762,74 +27824,14 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
|
|
|
26762
27824
|
files.push(dataFilePath);
|
|
26763
27825
|
}
|
|
26764
27826
|
};
|
|
26765
|
-
const
|
|
26766
|
-
|
|
26767
|
-
add_model_file(
|
|
26768
|
-
|
|
26769
|
-
|
|
26770
|
-
|
|
26771
|
-
|
|
26772
|
-
|
|
26773
|
-
add_model_file("decoder_model_merged");
|
|
26774
|
-
files.push("generation_config.json");
|
|
26775
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
26776
|
-
add_model_file("model", "vision_encoder");
|
|
26777
|
-
add_model_file("prompt_encoder_mask_decoder");
|
|
26778
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
26779
|
-
add_model_file("model", "encoder_model");
|
|
26780
|
-
add_model_file("decoder_model_merged");
|
|
26781
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
26782
|
-
add_model_file("embed_tokens");
|
|
26783
|
-
add_model_file("vision_encoder");
|
|
26784
|
-
add_model_file("decoder_model_merged");
|
|
26785
|
-
if (config.is_encoder_decoder) {
|
|
26786
|
-
add_model_file("model", "encoder_model");
|
|
26787
|
-
}
|
|
26788
|
-
files.push("generation_config.json");
|
|
26789
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
26790
|
-
add_model_file("embed_tokens");
|
|
26791
|
-
add_model_file("audio_encoder");
|
|
26792
|
-
add_model_file("decoder_model_merged");
|
|
26793
|
-
files.push("generation_config.json");
|
|
26794
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
26795
|
-
add_model_file("embed_tokens");
|
|
26796
|
-
add_model_file("audio_encoder");
|
|
26797
|
-
add_model_file("vision_encoder");
|
|
26798
|
-
add_model_file("decoder_model_merged");
|
|
26799
|
-
files.push("generation_config.json");
|
|
26800
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
26801
|
-
add_model_file("model", "text_encoder");
|
|
26802
|
-
add_model_file("decoder_model_merged");
|
|
26803
|
-
add_model_file("encodec_decode");
|
|
26804
|
-
files.push("generation_config.json");
|
|
26805
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
26806
|
-
add_model_file("prepare_inputs_embeds");
|
|
26807
|
-
add_model_file("model", "language_model");
|
|
26808
|
-
add_model_file("lm_head");
|
|
26809
|
-
add_model_file("gen_head");
|
|
26810
|
-
add_model_file("gen_img_embeds");
|
|
26811
|
-
add_model_file("image_decode");
|
|
26812
|
-
files.push("generation_config.json");
|
|
26813
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
26814
|
-
add_model_file("prepare_inputs_embeds");
|
|
26815
|
-
add_model_file("model");
|
|
26816
|
-
add_model_file("vision_encoder");
|
|
26817
|
-
files.push("generation_config.json");
|
|
26818
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
26819
|
-
add_model_file("embed_tokens");
|
|
26820
|
-
add_model_file("speech_encoder");
|
|
26821
|
-
add_model_file("model", "language_model");
|
|
26822
|
-
add_model_file("conditional_decoder");
|
|
26823
|
-
files.push("generation_config.json");
|
|
26824
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
26825
|
-
add_model_file("encoder_model");
|
|
26826
|
-
add_model_file("decoder_model");
|
|
26827
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
26828
|
-
add_model_file("text_encoder");
|
|
26829
|
-
add_model_file("latent_denoiser");
|
|
26830
|
-
add_model_file("voice_decoder");
|
|
26831
|
-
} else {
|
|
26832
|
-
add_model_file("model", singleModelName);
|
|
27827
|
+
const { sessions, optional_configs } = getSessionsConfig(modelType, config, { model_file_name });
|
|
27828
|
+
for (const [sessionKey, baseName] of Object.entries(sessions)) {
|
|
27829
|
+
add_model_file(sessionKey, baseName);
|
|
27830
|
+
}
|
|
27831
|
+
if (optional_configs) {
|
|
27832
|
+
for (const configFile of Object.values(optional_configs)) {
|
|
27833
|
+
files.push(configFile);
|
|
27834
|
+
}
|
|
26833
27835
|
}
|
|
26834
27836
|
return files;
|
|
26835
27837
|
}
|
|
@@ -27280,25 +28282,25 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
|
|
|
27280
28282
|
|
|
27281
28283
|
// src/utils/model_registry/is_cached.js
|
|
27282
28284
|
async function check_files_cache(modelId, files, options = {}) {
|
|
27283
|
-
const
|
|
27284
|
-
if (!
|
|
28285
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
28286
|
+
if (!cache2) {
|
|
27285
28287
|
const fileStatuses2 = files.map((filename) => ({ file: filename, cached: false }));
|
|
27286
28288
|
return { allCached: false, files: fileStatuses2 };
|
|
27287
28289
|
}
|
|
27288
28290
|
const fileStatuses = await Promise.all(
|
|
27289
28291
|
files.map(async (filename) => {
|
|
27290
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
27291
|
-
const cached = await checkCachedResource(
|
|
28292
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
28293
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
27292
28294
|
return { file: filename, cached: !!cached };
|
|
27293
28295
|
})
|
|
27294
28296
|
);
|
|
27295
28297
|
return { allCached: fileStatuses.every((f) => f.cached), files: fileStatuses };
|
|
27296
28298
|
}
|
|
27297
28299
|
async function is_file_cached(modelId, filename, options = {}) {
|
|
27298
|
-
const
|
|
27299
|
-
if (!
|
|
27300
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
27301
|
-
return !!await checkCachedResource(
|
|
28300
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
28301
|
+
if (!cache2) return false;
|
|
28302
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
28303
|
+
return !!await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
27302
28304
|
}
|
|
27303
28305
|
async function is_cached(modelId, options = {}) {
|
|
27304
28306
|
if (!modelId) {
|
|
@@ -27345,26 +28347,26 @@ async function is_pipeline_cached_files(task, modelId, options = {}) {
|
|
|
27345
28347
|
|
|
27346
28348
|
// src/utils/model_registry/clear_cache.js
|
|
27347
28349
|
async function clear_files_from_cache(modelId, files, options = {}) {
|
|
27348
|
-
const
|
|
27349
|
-
if (!
|
|
28350
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
28351
|
+
if (!cache2) {
|
|
27350
28352
|
return {
|
|
27351
28353
|
filesDeleted: 0,
|
|
27352
28354
|
filesCached: 0,
|
|
27353
28355
|
files: files.map((filename) => ({ file: filename, deleted: false, wasCached: false }))
|
|
27354
28356
|
};
|
|
27355
28357
|
}
|
|
27356
|
-
if (!
|
|
28358
|
+
if (!cache2.delete) {
|
|
27357
28359
|
throw new Error("Cache does not support delete operation");
|
|
27358
28360
|
}
|
|
27359
28361
|
const results = await Promise.all(
|
|
27360
28362
|
files.map(async (filename) => {
|
|
27361
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
27362
|
-
const cached = await checkCachedResource(
|
|
28363
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
28364
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
27363
28365
|
const wasCached = !!cached;
|
|
27364
28366
|
let deleted = false;
|
|
27365
28367
|
if (wasCached) {
|
|
27366
|
-
const deletedWithProposed = await
|
|
27367
|
-
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await
|
|
28368
|
+
const deletedWithProposed = await cache2.delete(proposedCacheKey);
|
|
28369
|
+
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache2.delete(localPath) : false;
|
|
27368
28370
|
deleted = deletedWithProposed || deletedWithLocal;
|
|
27369
28371
|
}
|
|
27370
28372
|
return { file: filename, deleted, wasCached };
|
|
@@ -27845,6 +28847,7 @@ export {
|
|
|
27845
28847
|
DonutImageProcessor,
|
|
27846
28848
|
DonutSwinModel,
|
|
27847
28849
|
DonutSwinPreTrainedModel,
|
|
28850
|
+
DynamicCache,
|
|
27848
28851
|
EdgeTamModel,
|
|
27849
28852
|
EfficientNetForImageClassification,
|
|
27850
28853
|
EfficientNetImageProcessor,
|
|
@@ -27917,6 +28920,7 @@ export {
|
|
|
27917
28920
|
Gemma3Model,
|
|
27918
28921
|
Gemma3PreTrainedModel,
|
|
27919
28922
|
Gemma3nAudioFeatureExtractor,
|
|
28923
|
+
Gemma3nForCausalLM,
|
|
27920
28924
|
Gemma3nForConditionalGeneration,
|
|
27921
28925
|
Gemma3nPreTrainedModel,
|
|
27922
28926
|
Gemma3nProcessor,
|
|
@@ -27936,6 +28940,9 @@ export {
|
|
|
27936
28940
|
GraniteMoeHybridModel,
|
|
27937
28941
|
GraniteMoeHybridPreTrainedModel,
|
|
27938
28942
|
GranitePreTrainedModel,
|
|
28943
|
+
GraniteSpeechFeatureExtractor,
|
|
28944
|
+
GraniteSpeechForConditionalGeneration,
|
|
28945
|
+
GraniteSpeechProcessor,
|
|
27939
28946
|
GroundingDinoForObjectDetection,
|
|
27940
28947
|
GroundingDinoImageProcessor,
|
|
27941
28948
|
GroundingDinoPreTrainedModel,
|
|
@@ -27961,7 +28968,6 @@ export {
|
|
|
27961
28968
|
IJepaPreTrainedModel,
|
|
27962
28969
|
Idefics3ForConditionalGeneration,
|
|
27963
28970
|
Idefics3ImageProcessor,
|
|
27964
|
-
Idefics3PreTrainedModel,
|
|
27965
28971
|
Idefics3Processor,
|
|
27966
28972
|
ImageClassificationPipeline,
|
|
27967
28973
|
ImageFeatureExtractionPipeline,
|
|
@@ -27986,6 +28992,9 @@ export {
|
|
|
27986
28992
|
Lfm2MoeModel,
|
|
27987
28993
|
Lfm2MoePreTrainedModel,
|
|
27988
28994
|
Lfm2PreTrainedModel,
|
|
28995
|
+
Lfm2VlForConditionalGeneration,
|
|
28996
|
+
Lfm2VlImageProcessor,
|
|
28997
|
+
Lfm2VlProcessor,
|
|
27989
28998
|
LiteWhisperForConditionalGeneration,
|
|
27990
28999
|
Llama4ForCausalLM,
|
|
27991
29000
|
Llama4PreTrainedModel,
|
|
@@ -28169,7 +29178,6 @@ export {
|
|
|
28169
29178
|
Owlv2Model,
|
|
28170
29179
|
Owlv2PreTrainedModel,
|
|
28171
29180
|
PaliGemmaForConditionalGeneration,
|
|
28172
|
-
PaliGemmaPreTrainedModel,
|
|
28173
29181
|
PaliGemmaProcessor,
|
|
28174
29182
|
ParakeetFeatureExtractor,
|
|
28175
29183
|
ParakeetForCTC,
|
|
@@ -28213,10 +29221,12 @@ export {
|
|
|
28213
29221
|
Qwen2MoePreTrainedModel,
|
|
28214
29222
|
Qwen2PreTrainedModel,
|
|
28215
29223
|
Qwen2Tokenizer,
|
|
29224
|
+
Qwen2VLForCausalLM,
|
|
28216
29225
|
Qwen2VLForConditionalGeneration,
|
|
28217
29226
|
Qwen2VLImageProcessor,
|
|
28218
29227
|
Qwen2VLPreTrainedModel,
|
|
28219
29228
|
Qwen2VLProcessor,
|
|
29229
|
+
Qwen2_5_VLForCausalLM,
|
|
28220
29230
|
Qwen2_5_VLForConditionalGeneration,
|
|
28221
29231
|
Qwen2_5_VLProcessor,
|
|
28222
29232
|
Qwen3ForCausalLM,
|
|
@@ -28228,10 +29238,14 @@ export {
|
|
|
28228
29238
|
Qwen3NextModel,
|
|
28229
29239
|
Qwen3NextPreTrainedModel,
|
|
28230
29240
|
Qwen3PreTrainedModel,
|
|
29241
|
+
Qwen3VLForCausalLM,
|
|
28231
29242
|
Qwen3VLForConditionalGeneration,
|
|
29243
|
+
Qwen3VLMoeForCausalLM,
|
|
28232
29244
|
Qwen3VLMoeForConditionalGeneration,
|
|
28233
29245
|
Qwen3VLProcessor,
|
|
29246
|
+
Qwen3_5ForCausalLM,
|
|
28234
29247
|
Qwen3_5ForConditionalGeneration,
|
|
29248
|
+
Qwen3_5MoeForCausalLM,
|
|
28235
29249
|
Qwen3_5MoeForConditionalGeneration,
|
|
28236
29250
|
RFDetrForObjectDetection,
|
|
28237
29251
|
RFDetrModel,
|
|
@@ -28303,7 +29317,6 @@ export {
|
|
|
28303
29317
|
SmolLM3ForCausalLM,
|
|
28304
29318
|
SmolLM3Model,
|
|
28305
29319
|
SmolLM3PreTrainedModel,
|
|
28306
|
-
SmolVLMForConditionalGeneration,
|
|
28307
29320
|
Idefics3ImageProcessor as SmolVLMImageProcessor,
|
|
28308
29321
|
Idefics3Processor as SmolVLMProcessor,
|
|
28309
29322
|
SnacDecoderModel,
|
|
@@ -28409,6 +29422,10 @@ export {
|
|
|
28409
29422
|
VitsTokenizer,
|
|
28410
29423
|
VoxtralForConditionalGeneration,
|
|
28411
29424
|
VoxtralProcessor,
|
|
29425
|
+
VoxtralRealtimeFeatureExtractor,
|
|
29426
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
29427
|
+
VoxtralRealtimePreTrainedModel,
|
|
29428
|
+
VoxtralRealtimeProcessor,
|
|
28412
29429
|
Wav2Vec2BertForCTC,
|
|
28413
29430
|
Wav2Vec2BertForSequenceClassification,
|
|
28414
29431
|
Wav2Vec2BertModel,
|