@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +1587 -570
- package/dist/transformers.min.js +17 -17
- package/dist/transformers.node.cjs +1605 -573
- package/dist/transformers.node.min.cjs +21 -21
- package/dist/transformers.node.min.mjs +21 -21
- package/dist/transformers.node.mjs +1600 -583
- package/dist/transformers.web.js +1592 -575
- package/dist/transformers.web.min.js +15 -15
- package/package.json +3 -3
- package/src/cache_utils.js +62 -0
- package/src/configs.js +17 -2
- package/src/env.js +8 -1
- package/src/image_processors_utils.js +3 -3
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +1 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/modeling_utils.js +222 -308
- package/src/models/models.js +4 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +3 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +7 -7
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +2 -1
- package/src/models/registry.js +25 -0
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +5 -0
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/get_file_metadata.js +14 -2
- package/src/utils/model_registry/get_model_files.js +52 -78
- package/src/utils/tensor.js +18 -2
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +8 -0
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +17 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/modeling_utils.d.ts +44 -35
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +4 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +2 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +3 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
package/dist/transformers.js
CHANGED
|
@@ -20,7 +20,7 @@ var node_path_default = {};
|
|
|
20
20
|
var node_url_default = {};
|
|
21
21
|
|
|
22
22
|
// src/env.js
|
|
23
|
-
var VERSION = "4.0.0-next.
|
|
23
|
+
var VERSION = "4.0.0-next.7";
|
|
24
24
|
var HAS_SELF = typeof self !== "undefined";
|
|
25
25
|
var IS_FS_AVAILABLE = !isEmpty(node_fs_default);
|
|
26
26
|
var IS_PATH_AVAILABLE = !isEmpty(node_path_default);
|
|
@@ -148,6 +148,7 @@ var env = {
|
|
|
148
148
|
customCache: null,
|
|
149
149
|
useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
|
|
150
150
|
cacheKey: "transformers-cache",
|
|
151
|
+
experimental_useCrossOriginStorage: false,
|
|
151
152
|
/////////////////// Custom fetch /////////////////////
|
|
152
153
|
fetch: DEFAULT_FETCH
|
|
153
154
|
//////////////////////////////////////////////////////
|
|
@@ -2698,7 +2699,7 @@ var Tokenizer = class {
|
|
|
2698
2699
|
};
|
|
2699
2700
|
var Tokenizer_default = Tokenizer;
|
|
2700
2701
|
|
|
2701
|
-
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.
|
|
2702
|
+
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.6/node_modules/@huggingface/jinja/dist/index.js
|
|
2702
2703
|
var TOKEN_TYPES = Object.freeze({
|
|
2703
2704
|
Text: "Text",
|
|
2704
2705
|
// The text between Jinja statements or expressions
|
|
@@ -4217,7 +4218,11 @@ var Environment = class {
|
|
|
4217
4218
|
["number", (operand) => operand instanceof IntegerValue || operand instanceof FloatValue],
|
|
4218
4219
|
["integer", (operand) => operand instanceof IntegerValue],
|
|
4219
4220
|
["iterable", (operand) => operand.type === "ArrayValue" || operand.type === "StringValue"],
|
|
4220
|
-
["mapping", (operand) => operand
|
|
4221
|
+
["mapping", (operand) => operand instanceof ObjectValue],
|
|
4222
|
+
[
|
|
4223
|
+
"sequence",
|
|
4224
|
+
(operand) => operand instanceof ArrayValue || operand instanceof ObjectValue || operand instanceof StringValue
|
|
4225
|
+
],
|
|
4221
4226
|
[
|
|
4222
4227
|
"lower",
|
|
4223
4228
|
(operand) => {
|
|
@@ -4490,6 +4495,9 @@ var Interpreter = class {
|
|
|
4490
4495
|
applyFilter(operand, filterNode, environment) {
|
|
4491
4496
|
if (filterNode.type === "Identifier") {
|
|
4492
4497
|
const filter = filterNode;
|
|
4498
|
+
if (filter.value === "safe") {
|
|
4499
|
+
return operand;
|
|
4500
|
+
}
|
|
4493
4501
|
if (filter.value === "tojson") {
|
|
4494
4502
|
return new StringValue(toJSON(operand, {}));
|
|
4495
4503
|
}
|
|
@@ -4579,6 +4587,8 @@ var Interpreter = class {
|
|
|
4579
4587
|
return new IntegerValue(Math.floor(operand.value));
|
|
4580
4588
|
case "float":
|
|
4581
4589
|
return new FloatValue(operand.value);
|
|
4590
|
+
case "string":
|
|
4591
|
+
return new StringValue(operand.toString());
|
|
4582
4592
|
default:
|
|
4583
4593
|
throw new Error(`Unknown NumericValue filter: ${filter.value}`);
|
|
4584
4594
|
}
|
|
@@ -6001,9 +6011,216 @@ function toAbsoluteURL(url) {
|
|
|
6001
6011
|
return new URL(url, baseURL).href;
|
|
6002
6012
|
}
|
|
6003
6013
|
|
|
6014
|
+
// src/utils/cache/CrossOriginStorageCache.js
|
|
6015
|
+
// Hash algorithm used for cross-origin storage descriptors and for `crypto.subtle.digest`.
var HASH_ALGORITHM = "SHA-256";
// Name of the Cache Storage bucket that maps resource URLs to their hex-encoded hashes.
var HASH_CACHE_NAME = "experimental_transformers-hash-cache";
/**
 * Builds the hash descriptor passed to `navigator.crossOriginStorage.requestFileHandles`.
 * @param {string} value Hex-encoded hash.
 * @returns {{algorithm: string, value: string}}
 */
var makeHashDescriptor = (value) => ({ algorithm: HASH_ALGORITHM, value });
/**
 * Cache backend exposing `match` / `put` / `delete` on top of `navigator.crossOriginStorage`.
 *
 * Files are addressed by their SHA-256 hash. The URL -> hash mapping is kept in a
 * separate Cache Storage bucket (`HASH_CACHE_NAME`).
 */
var CrossOriginStorage = class {
  /** @type {Promise<Cache> | null} */
  #hashCache = null;

  /**
   * Returns (and lazily opens) the hash cache, reusing the same promise across concurrent callers.
   * If opening fails, the memoized promise is cleared so that a later call can retry
   * instead of receiving the same rejection forever.
   * @returns {Promise<Cache>}
   */
  _getHashCache = () => {
    this.#hashCache ??= caches.open(HASH_CACHE_NAME).catch((err) => {
      this.#hashCache = null;
      throw err;
    });
    return this.#hashCache;
  };

  /**
   * Returns whether the `navigator.crossOriginStorage` API is available in the current environment.
   * @returns {boolean}
   */
  static isAvailable = () => typeof navigator !== "undefined" && "crossOriginStorage" in navigator;

  /**
   * Looks up a cached response for the given URL by resolving its SHA-256 hash and requesting
   * the corresponding file handle from cross-origin storage.
   *
   * Implements `CacheInterface.match`.
   *
   * @param {string} request The URL of the resource to look up.
   * @returns {Promise<Response|undefined>} The cached `Response`, or `undefined` if not found.
   */
  match = async (request) => {
    const hashValue = await this._getFileHash(request);
    if (!hashValue) {
      return void 0;
    }
    try {
      const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashValue)]);
      const blob = await handle.getFile();
      return new Response(blob, {
        headers: {
          "Content-Length": String(blob.size)
        }
      });
    } catch {
      // Any failure while resolving the handle or reading the file is reported as a cache miss.
      return void 0;
    }
  };

  /**
   * Stores a response in cross-origin storage, keyed by its SHA-256 hash.
   *
   * When `_getFileHash` can resolve a hash for `request`, the body is read as a blob and
   * written under that hash; errors from that write propagate to the caller.
   *
   * Otherwise the body stream is handed to `_processAndStore`, which runs in the background
   * (not awaited) and never surfaces errors.
   *
   * Implements `CacheInterface.put`.
   *
   * @param {string} request The URL of the resource (used as the hash-cache key).
   * @param {Response} response The response whose body will be written to the cache.
   * @returns {Promise<void>}
   */
  put = async (request, response) => {
    const hashValue = await this._getFileHash(request);
    if (hashValue) {
      const blob = await response.blob();
      await this._storeBlobInCOS(blob, hashValue);
    } else {
      // Intentionally not awaited: `_processAndStore` swallows its own errors.
      void this._processAndStore(request, response.body);
    }
  };

  /**
   * Writes a blob into cross-origin storage using the given pre-computed hex hash string.
   * If writing or closing fails, the writable stream is aborted (best effort) before the
   * original error is rethrown, so the stream is not left open.
   *
   * @param {Blob} blob
   * @param {string} hashHex Hex-encoded SHA-256 hash of `blob`.
   * @returns {Promise<void>}
   */
  _storeBlobInCOS = async (blob, hashHex) => {
    const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashHex)], {
      create: true
    });
    const writableStream = await handle.createWritable();
    try {
      await writableStream.write(blob);
      await writableStream.close();
    } catch (err) {
      try {
        await writableStream.abort(err);
      } catch {
        // A failure to abort must not mask the original write error.
      }
      throw err;
    }
  };

  /**
   * Background task used when no hash is known upfront: consumes `stream`, computes the
   * SHA-256 hash of the resulting blob, stores it in cross-origin storage, and persists the
   * computed hash to `HASH_CACHE_NAME` keyed by `request`.
   *
   * Called fire-and-forget from `put`; every error is swallowed on purpose so failures never
   * surface to the caller.
   *
   * @param {string} request The original resource URL.
   * @param {ReadableStream} stream The response body stream to consume.
   * @returns {Promise<void>}
   */
  _processAndStore = async (request, stream) => {
    try {
      const chunks = [];
      for await (const chunk of stream) {
        chunks.push(chunk);
      }
      const blob = new Blob(chunks);
      const hashHex = await this._getBlobHash(blob);
      await this._storeBlobInCOS(blob, hashHex);
      try {
        const hashCache = await this._getHashCache();
        await hashCache.put(request, new Response(hashHex));
      } catch {
        // Best effort: the file is stored even if the URL -> hash mapping could not be saved.
      }
    } catch {
      // Best effort: caching failures are deliberately ignored.
    }
  };

  /**
   * Deletes the cache entry for the given request.
   *
   * Only the URL -> hash entry in `HASH_CACHE_NAME` is removed; this method does not touch
   * cross-origin storage itself.
   *
   * Implements `CacheInterface.delete`.
   *
   * @param {string} request
   * @returns {Promise<boolean>} Resolves to `true` if the hash entry was deleted, `false` otherwise.
   */
  delete = async (request) => {
    try {
      const hashCache = await this._getHashCache();
      return await hashCache.delete(request);
    } catch {
      return false;
    }
  };

  /**
   * Resolves the SHA-256 hash for a given URL.
   *
   * Returns the hash persisted in `HASH_CACHE_NAME` if there is one. Otherwise falls back to
   * `_getLfsFileHash` and, on success, persists the result for future lookups. Persisting is
   * best effort: a failure to write the hash cache does not discard the resolved hash.
   *
   * Never rejects: any failure yields `null`.
   *
   * @param {string} url The resource URL to resolve a hash for.
   * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
   */
  _getFileHash = async (url) => {
    try {
      const hashCache = await this._getHashCache();
      const cached = await hashCache.match(url);
      if (cached) {
        // `await` keeps a failed body read inside this try/catch.
        return await cached.text();
      }
      const hash = await this._getLfsFileHash(url);
      if (!hash) {
        return null;
      }
      try {
        await hashCache.put(url, new Response(hash));
      } catch {
        // Best effort: the hash is still valid even if it could not be persisted.
      }
      return hash;
    } catch {
      return null;
    }
  };

  /**
   * Attempts to retrieve the SHA-256 hash for a resource URL from its raw Git LFS pointer file.
   *
   * Only applicable to URLs containing `/resolve/`. The first `/resolve/` segment is rewritten
   * to `/raw/` and the result is fetched; the hash is read from the `oid sha256:<hex>` line.
   * Returns `null` when the URL has no `/resolve/` segment, the response is not OK, no such
   * line is present, or the request fails.
   *
   * @see https://huggingface.co/docs/hub/en/storage-backends#xet
   * @param {string} url The resolved URL of the resource.
   * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
   */
  _getLfsFileHash = async (url) => {
    if (!url.includes("/resolve/")) {
      return null;
    }
    const rawUrl = url.replace("/resolve/", "/raw/");
    try {
      const response = await fetch(rawUrl);
      if (!response.ok) {
        // Do not try to parse error pages as pointer files.
        return null;
      }
      const text = await response.text();
      const match = text.match(/^oid sha256:([0-9a-f]+)$/m);
      return match ? match[1] : null;
    } catch {
      return null;
    }
  };

  /**
   * Computes the SHA-256 hash of a `Blob`'s contents.
   *
   * @param {Blob} blob The blob to hash.
   * @returns {Promise<string>} The lowercase hex-encoded SHA-256 hash.
   */
  _getBlobHash = async (blob) => {
    const arrayBuffer = await blob.arrayBuffer();
    const hashBuffer = await crypto.subtle.digest(HASH_ALGORITHM, arrayBuffer);
    const hashArray = Array.from(new Uint8Array(hashBuffer));
    return hashArray.map((byte) => byte.toString(16).padStart(2, "0")).join("");
  };
};
|
|
6220
|
+
|
|
6004
6221
|
// src/utils/cache.js
|
|
6005
6222
|
async function getCache(file_cache_dir = null) {
|
|
6006
|
-
let
|
|
6223
|
+
let cache2 = null;
|
|
6007
6224
|
if (env.useCustomCache) {
|
|
6008
6225
|
if (!env.customCache) {
|
|
6009
6226
|
throw Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
|
|
@@ -6013,30 +6230,33 @@ async function getCache(file_cache_dir = null) {
|
|
|
6013
6230
|
"`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
|
|
6014
6231
|
);
|
|
6015
6232
|
}
|
|
6016
|
-
|
|
6233
|
+
cache2 = env.customCache;
|
|
6234
|
+
}
|
|
6235
|
+
if (!cache2 && env.experimental_useCrossOriginStorage && CrossOriginStorage.isAvailable()) {
|
|
6236
|
+
cache2 = new CrossOriginStorage();
|
|
6017
6237
|
}
|
|
6018
|
-
if (!
|
|
6238
|
+
if (!cache2 && env.useBrowserCache) {
|
|
6019
6239
|
if (typeof caches === "undefined") {
|
|
6020
6240
|
throw Error("Browser cache is not available in this environment.");
|
|
6021
6241
|
}
|
|
6022
6242
|
try {
|
|
6023
|
-
|
|
6243
|
+
cache2 = await caches.open(env.cacheKey);
|
|
6024
6244
|
} catch (e) {
|
|
6025
6245
|
logger.warn("An error occurred while opening the browser cache:", e);
|
|
6026
6246
|
}
|
|
6027
6247
|
}
|
|
6028
|
-
if (!
|
|
6248
|
+
if (!cache2 && env.useFSCache) {
|
|
6029
6249
|
if (!apis.IS_FS_AVAILABLE) {
|
|
6030
6250
|
throw Error("File System Cache is not available in this environment.");
|
|
6031
6251
|
}
|
|
6032
|
-
|
|
6252
|
+
cache2 = new FileCache(file_cache_dir ?? env.cacheDir);
|
|
6033
6253
|
}
|
|
6034
|
-
return
|
|
6254
|
+
return cache2;
|
|
6035
6255
|
}
|
|
6036
|
-
async function tryCache(
|
|
6256
|
+
async function tryCache(cache2, ...names) {
|
|
6037
6257
|
for (let name of names) {
|
|
6038
6258
|
try {
|
|
6039
|
-
let result = await
|
|
6259
|
+
let result = await cache2.match(name);
|
|
6040
6260
|
if (result) return result;
|
|
6041
6261
|
} catch (e) {
|
|
6042
6262
|
continue;
|
|
@@ -6045,6 +6265,83 @@ async function tryCache(cache, ...names) {
|
|
|
6045
6265
|
return void 0;
|
|
6046
6266
|
}
|
|
6047
6267
|
|
|
6268
|
+
// src/utils/lru_cache.js
|
|
6269
|
+
/**
 * Least-recently-used cache backed by a `Map`, whose insertion order doubles as the
 * recency order (first key = least recently used).
 */
var LRUCache2 = class {
  /** @type {number} */
  #capacity;
  /** @type {Map<any, any>} */
  #cache;

  /**
   * Creates an LRUCache instance.
   * @param {number} capacity The maximum number of items the cache can hold.
   * @throws {RangeError} If `capacity` does not compare as `>= 0` (e.g. `NaN`, `undefined`
   * or a negative number). Such values would otherwise silently disable eviction or evict
   * every entry immediately.
   */
  constructor(capacity) {
    // `!(x >= 0)` is true for NaN/undefined as well as for negative numbers.
    if (!(capacity >= 0)) {
      throw new RangeError(`LRUCache capacity must be a non-negative number, got: ${String(capacity)}`);
    }
    this.#capacity = capacity;
    this.#cache = /* @__PURE__ */ new Map();
  }

  /**
   * Retrieves the value associated with the given key and marks the key as recently used.
   * @param {any} key The key to retrieve.
   * @returns {any} The value associated with the key, or undefined if the key does not exist.
   */
  get(key) {
    if (!this.#cache.has(key)) return void 0;
    const value = this.#cache.get(key);
    // Re-insert so the key moves to the most-recently-used end of the Map.
    this.#cache.delete(key);
    this.#cache.set(key, value);
    return value;
  }

  /**
   * Inserts or updates the key-value pair in the cache.
   * If the key already exists, it is updated and marked as recently used.
   * If the cache exceeds its capacity, the least recently used item is evicted.
   * @param {any} key The key to add or update.
   * @param {any} value The value to associate with the key.
   */
  put(key, value) {
    // Deleting first guarantees the key ends up at the most-recently-used end.
    this.#cache.delete(key);
    this.#cache.set(key, value);
    if (this.#cache.size > this.#capacity) {
      // The first key in iteration order is the least recently used one.
      this.#cache.delete(this.#cache.keys().next().value);
    }
  }

  /**
   * Removes the entry for the given key from the cache.
   * @param {any} key The key to delete.
   * @returns {boolean} `true` if the entry existed and was removed, `false` otherwise.
   */
  delete(key) {
    return this.#cache.delete(key);
  }

  /**
   * Clears the cache.
   */
  clear() {
    this.#cache.clear();
  }
};
|
|
6325
|
+
|
|
6326
|
+
// src/utils/memoize_promise.js
|
|
6327
|
+
// Upper bound on the number of memoized promises kept at once.
var MAX_CACHE_SIZE = 100;
// Module-level LRU store mapping a key to its in-flight or fulfilled promise.
var cache = new LRUCache2(MAX_CACHE_SIZE);
/**
 * Returns the promise memoized under `key`, creating it with `factory` on a miss.
 *
 * Concurrent callers using the same key share one promise. A rejected promise is removed
 * from the cache so that a later call can retry. The removal only happens if the cache
 * still holds this exact promise: if the entry was evicted and replaced in the meantime,
 * the newer entry is left untouched.
 *
 * @template T
 * @param {string} key Cache key identifying the operation.
 * @param {() => Promise<T>} factory Creates the promise on a cache miss.
 * @returns {Promise<T>} The memoized promise.
 */
function memoizePromise(key, factory) {
  const cached = cache.get(key);
  if (cached !== void 0) {
    return cached;
  }
  const promise = factory().then(
    (value) => value,
    (err) => {
      // Only drop our own entry; never a newer promise stored under the same key.
      if (cache.get(key) === promise) {
        cache.delete(key);
      }
      return Promise.reject(err);
    }
  );
  cache.put(key, promise);
  return promise;
}
|
|
6344
|
+
|
|
6048
6345
|
// src/utils/model_registry/get_file_metadata.js
|
|
6049
6346
|
async function fetch_file_head(urlOrPath) {
|
|
6050
6347
|
if (!isValidUrl(urlOrPath, ["http:", "https:"])) {
|
|
@@ -6052,17 +6349,27 @@ async function fetch_file_head(urlOrPath) {
|
|
|
6052
6349
|
}
|
|
6053
6350
|
const headers = getFetchHeaders(urlOrPath);
|
|
6054
6351
|
headers.set("Range", "bytes=0-0");
|
|
6055
|
-
return env.fetch(urlOrPath, { method: "GET", headers });
|
|
6352
|
+
return env.fetch(urlOrPath, { method: "GET", headers, cache: "no-store" });
|
|
6353
|
+
}
|
|
6354
|
+
/**
 * Memoizing front-end for `_get_file_metadata`.
 *
 * The memoization key is the JSON encoding of the repo/path, the filename and the
 * `revision`, `cache_dir` and `local_files_only` options, so calls that agree on all of
 * these share the promise returned by `memoizePromise`.
 *
 * @param {string} path_or_repo_id Forwarded unchanged to `_get_file_metadata`.
 * @param {string} filename Forwarded unchanged to `_get_file_metadata`.
 * @param {Object} [options] Forwarded unchanged to `_get_file_metadata`.
 * @returns {Promise<any>} Whatever `memoizePromise` returns for this key.
 */
function get_file_metadata(path_or_repo_id, filename, options = {}) {
  const { revision, cache_dir, local_files_only } = options ?? {};
  const memoKey = JSON.stringify([path_or_repo_id, filename, revision, cache_dir, local_files_only]);
  const load = () => _get_file_metadata(path_or_repo_id, filename, options);
  return memoizePromise(memoKey, load);
}
|
|
6057
|
-
async function
|
|
6058
|
-
const
|
|
6364
|
+
async function _get_file_metadata(path_or_repo_id, filename, options) {
|
|
6365
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6059
6366
|
const { localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6060
6367
|
path_or_repo_id,
|
|
6061
6368
|
filename,
|
|
6062
6369
|
options,
|
|
6063
|
-
|
|
6370
|
+
cache2
|
|
6064
6371
|
);
|
|
6065
|
-
const cachedResponse = await checkCachedResource(
|
|
6372
|
+
const cachedResponse = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6066
6373
|
if (cachedResponse !== void 0 && typeof cachedResponse !== "string") {
|
|
6067
6374
|
const size = cachedResponse.headers.get("content-length");
|
|
6068
6375
|
const contentType = cachedResponse.headers.get("content-type");
|
|
@@ -6160,7 +6467,7 @@ function getFetchHeaders(urlOrPath) {
|
|
|
6160
6467
|
}
|
|
6161
6468
|
return headers;
|
|
6162
6469
|
}
|
|
6163
|
-
function buildResourcePaths(path_or_repo_id, filename, options = {},
|
|
6470
|
+
function buildResourcePaths(path_or_repo_id, filename, options = {}, cache2 = null) {
|
|
6164
6471
|
const revision = options.revision ?? "main";
|
|
6165
6472
|
const requestURL = pathJoin(path_or_repo_id, filename);
|
|
6166
6473
|
const validModelId = isValidHfModelId(path_or_repo_id);
|
|
@@ -6170,7 +6477,7 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
6170
6477
|
env.remotePathTemplate.replaceAll("{model}", path_or_repo_id).replaceAll("{revision}", encodeURIComponent(revision)),
|
|
6171
6478
|
filename
|
|
6172
6479
|
);
|
|
6173
|
-
const proposedCacheKey =
|
|
6480
|
+
const proposedCacheKey = cache2 instanceof FileCache ? (
|
|
6174
6481
|
// Choose cache key for filesystem cache
|
|
6175
6482
|
// When using the main revision (default), we use the request URL as the cache key.
|
|
6176
6483
|
// If a specific revision is requested, we account for this in the cache key.
|
|
@@ -6184,14 +6491,14 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
6184
6491
|
validModelId
|
|
6185
6492
|
};
|
|
6186
6493
|
}
|
|
6187
|
-
async function checkCachedResource(
|
|
6188
|
-
if (!
|
|
6494
|
+
async function checkCachedResource(cache2, localPath, proposedCacheKey) {
|
|
6495
|
+
if (!cache2) {
|
|
6189
6496
|
return void 0;
|
|
6190
6497
|
}
|
|
6191
|
-
return await tryCache(
|
|
6498
|
+
return await tryCache(cache2, localPath, proposedCacheKey);
|
|
6192
6499
|
}
|
|
6193
|
-
async function storeCachedResource(path_or_repo_id, filename,
|
|
6194
|
-
if (await
|
|
6500
|
+
async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options = {}) {
|
|
6501
|
+
if (await cache2.match(cacheKey) !== void 0) {
|
|
6195
6502
|
return;
|
|
6196
6503
|
}
|
|
6197
6504
|
if (!result) {
|
|
@@ -6201,14 +6508,14 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
6201
6508
|
file: filename,
|
|
6202
6509
|
...data
|
|
6203
6510
|
}) : void 0;
|
|
6204
|
-
await
|
|
6511
|
+
await cache2.put(
|
|
6205
6512
|
cacheKey,
|
|
6206
6513
|
/** @type {Response} */
|
|
6207
6514
|
response,
|
|
6208
6515
|
wrapped_progress
|
|
6209
6516
|
);
|
|
6210
6517
|
} else if (typeof response !== "string") {
|
|
6211
|
-
await
|
|
6518
|
+
await cache2.put(
|
|
6212
6519
|
cacheKey,
|
|
6213
6520
|
new Response(
|
|
6214
6521
|
/** @type {any} */
|
|
@@ -6222,17 +6529,17 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
6222
6529
|
});
|
|
6223
6530
|
}
|
|
6224
6531
|
}
|
|
6225
|
-
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false,
|
|
6532
|
+
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache2 = null) {
|
|
6226
6533
|
const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6227
6534
|
path_or_repo_id,
|
|
6228
6535
|
filename,
|
|
6229
6536
|
options,
|
|
6230
|
-
|
|
6537
|
+
cache2
|
|
6231
6538
|
);
|
|
6232
6539
|
let cacheKey;
|
|
6233
6540
|
let toCacheResponse = false;
|
|
6234
6541
|
let response;
|
|
6235
|
-
response = await checkCachedResource(
|
|
6542
|
+
response = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6236
6543
|
const cacheHit = response !== void 0;
|
|
6237
6544
|
if (!cacheHit) {
|
|
6238
6545
|
if (env.allowLocalModels) {
|
|
@@ -6273,7 +6580,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6273
6580
|
}
|
|
6274
6581
|
cacheKey = proposedCacheKey;
|
|
6275
6582
|
}
|
|
6276
|
-
toCacheResponse =
|
|
6583
|
+
toCacheResponse = cache2 && // 1. A caching system is available
|
|
6277
6584
|
typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment)
|
|
6278
6585
|
response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`)
|
|
6279
6586
|
response.status === 200;
|
|
@@ -6335,7 +6642,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6335
6642
|
// i.e., do not cache FileResponses (prevents duplication)
|
|
6336
6643
|
toCacheResponse && cacheKey && typeof response !== "string"
|
|
6337
6644
|
) {
|
|
6338
|
-
await storeCachedResource(path_or_repo_id, filename,
|
|
6645
|
+
await storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options);
|
|
6339
6646
|
}
|
|
6340
6647
|
dispatchCallback(options.progress_callback, {
|
|
6341
6648
|
status: "done",
|
|
@@ -6351,7 +6658,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6351
6658
|
if (response instanceof FileResponse) {
|
|
6352
6659
|
return response.filePath;
|
|
6353
6660
|
}
|
|
6354
|
-
const cachedResponse = await
|
|
6661
|
+
const cachedResponse = await cache2?.match(cacheKey);
|
|
6355
6662
|
if (cachedResponse instanceof FileResponse) {
|
|
6356
6663
|
return cachedResponse.filePath;
|
|
6357
6664
|
} else if (cachedResponse instanceof Response) {
|
|
@@ -6378,8 +6685,8 @@ async function getModelFile(path_or_repo_id, filename, fatal = true, options = {
|
|
|
6378
6685
|
name: path_or_repo_id,
|
|
6379
6686
|
file: filename
|
|
6380
6687
|
});
|
|
6381
|
-
const
|
|
6382
|
-
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path,
|
|
6688
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6689
|
+
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache2);
|
|
6383
6690
|
}
|
|
6384
6691
|
async function getModelText(modelPath, fileName, fatal = true, options = {}) {
|
|
6385
6692
|
const buffer = await getModelFile(modelPath, fileName, fatal, options, false);
|
|
@@ -7176,7 +7483,7 @@ __export(onnxruntime_node_exports, {
|
|
|
7176
7483
|
});
|
|
7177
7484
|
var onnxruntime_node_default = {};
|
|
7178
7485
|
|
|
7179
|
-
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.
|
|
7486
|
+
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260307-d626b568e0/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
|
|
7180
7487
|
var ort_webgpu_bundle_min_exports = {};
|
|
7181
7488
|
__export(ort_webgpu_bundle_min_exports, {
|
|
7182
7489
|
InferenceSession: () => Jf,
|
|
@@ -7944,7 +8251,7 @@ async function ts(a = {}) {
|
|
|
7944
8251
|
throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
|
|
7945
8252
|
}
|
|
7946
8253
|
function Ye() {
|
|
7947
|
-
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii,
|
|
8254
|
+
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
|
|
7948
8255
|
}
|
|
7949
8256
|
async function bt() {
|
|
7950
8257
|
function e(o, u) {
|
|
@@ -9131,7 +9438,7 @@ async function ts(a = {}) {
|
|
|
9131
9438
|
Te(`invalid type for getValue: ${t}`);
|
|
9132
9439
|
}
|
|
9133
9440
|
}, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
|
|
9134
|
-
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = {
|
|
9441
|
+
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 925676: (e, t, n, o, u) => {
|
|
9135
9442
|
if (r === void 0 || !r.Uc) return 1;
|
|
9136
9443
|
if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
|
|
9137
9444
|
if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
|
|
@@ -9151,11 +9458,11 @@ async function ts(a = {}) {
|
|
|
9151
9458
|
} catch {
|
|
9152
9459
|
return 4;
|
|
9153
9460
|
}
|
|
9154
|
-
},
|
|
9461
|
+
}, 926500: (e, t, n) => {
|
|
9155
9462
|
r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
|
|
9156
|
-
},
|
|
9463
|
+
}, 926564: () => r.me(), 926606: (e) => {
|
|
9157
9464
|
r.jd(e);
|
|
9158
|
-
},
|
|
9465
|
+
}, 926643: () => typeof wasmOffsetConverter < "u" };
|
|
9159
9466
|
function af(e, t, n, o) {
|
|
9160
9467
|
var u = P();
|
|
9161
9468
|
try {
|
|
@@ -11071,7 +11378,7 @@ var $s = k(() => {
|
|
|
11071
11378
|
Ve();
|
|
11072
11379
|
Ve();
|
|
11073
11380
|
Ve();
|
|
11074
|
-
var Xa = "1.25.0-dev.
|
|
11381
|
+
var Xa = "1.25.0-dev.20260307-d626b568e0";
|
|
11075
11382
|
var Tl = Zr;
|
|
11076
11383
|
{
|
|
11077
11384
|
let a = ($s(), $t(Gs)).wasmBackend;
|
|
@@ -11082,11 +11389,11 @@ Object.defineProperty(K.versions, "web", { value: Xa, enumerable: true });
|
|
|
11082
11389
|
// src/backends/utils/cacheWasm.js
|
|
11083
11390
|
async function loadAndCacheFile(url) {
|
|
11084
11391
|
const fileName = url.split("/").pop();
|
|
11085
|
-
let
|
|
11392
|
+
let cache2;
|
|
11086
11393
|
try {
|
|
11087
|
-
|
|
11088
|
-
if (
|
|
11089
|
-
const result = await
|
|
11394
|
+
cache2 = await getCache();
|
|
11395
|
+
if (cache2) {
|
|
11396
|
+
const result = await cache2.match(url);
|
|
11090
11397
|
if (result) {
|
|
11091
11398
|
return result;
|
|
11092
11399
|
}
|
|
@@ -11098,9 +11405,9 @@ async function loadAndCacheFile(url) {
|
|
|
11098
11405
|
if (!response.ok) {
|
|
11099
11406
|
throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
|
|
11100
11407
|
}
|
|
11101
|
-
if (
|
|
11408
|
+
if (cache2) {
|
|
11102
11409
|
try {
|
|
11103
|
-
await
|
|
11410
|
+
await cache2.put(url, response.clone());
|
|
11104
11411
|
} catch (e) {
|
|
11105
11412
|
logger.warn(`Failed to cache ${fileName}:`, e);
|
|
11106
11413
|
}
|
|
@@ -13715,9 +14022,23 @@ var Tensor3 = class _Tensor {
|
|
|
13715
14022
|
throw Error(`Unsupported norm: ${p}`);
|
|
13716
14023
|
}
|
|
13717
14024
|
const this_data = this.data;
|
|
13718
|
-
const
|
|
14025
|
+
const is_bigint = this_data instanceof BigInt64Array || this_data instanceof BigUint64Array;
|
|
14026
|
+
if (is_bigint && p !== 1) {
|
|
14027
|
+
throw Error(`Expected a floating point tensor as input. Got ${this.type}`);
|
|
14028
|
+
}
|
|
14029
|
+
let fn2, zero;
|
|
14030
|
+
if (is_bigint) {
|
|
14031
|
+
fn2 = (a, b) => a + b;
|
|
14032
|
+
zero = 0n;
|
|
14033
|
+
} else {
|
|
14034
|
+
fn2 = (a, b) => a + b ** p;
|
|
14035
|
+
zero = 0;
|
|
14036
|
+
}
|
|
13719
14037
|
if (dim === null) {
|
|
13720
|
-
|
|
14038
|
+
let val = this_data.reduce(fn2, zero);
|
|
14039
|
+
if (p !== 1) {
|
|
14040
|
+
val = val ** (1 / p);
|
|
14041
|
+
}
|
|
13721
14042
|
return new _Tensor(this.type, [val], []);
|
|
13722
14043
|
}
|
|
13723
14044
|
const [type, result, resultDims] = reduce_helper(fn2, this, dim, keepdim);
|
|
@@ -16177,9 +16498,11 @@ __export(processors_exports, {
|
|
|
16177
16498
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
16178
16499
|
Florence2Processor: () => Florence2Processor,
|
|
16179
16500
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
16501
|
+
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
16180
16502
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
16181
16503
|
Idefics3Processor: () => Idefics3Processor,
|
|
16182
16504
|
JinaCLIPProcessor: () => JinaCLIPProcessor,
|
|
16505
|
+
Lfm2VlProcessor: () => Lfm2VlProcessor,
|
|
16183
16506
|
LlavaProcessor: () => LlavaProcessor,
|
|
16184
16507
|
MgpstrProcessor: () => MgpstrProcessor,
|
|
16185
16508
|
MoonshineProcessor: () => MoonshineProcessor,
|
|
@@ -16200,6 +16523,7 @@ __export(processors_exports, {
|
|
|
16200
16523
|
UltravoxProcessor: () => UltravoxProcessor,
|
|
16201
16524
|
VLChatProcessor: () => VLChatProcessor,
|
|
16202
16525
|
VoxtralProcessor: () => VoxtralProcessor,
|
|
16526
|
+
VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
|
|
16203
16527
|
Wav2Vec2Processor: () => Wav2Vec2Processor,
|
|
16204
16528
|
Wav2Vec2ProcessorWithLM: () => Wav2Vec2ProcessorWithLM,
|
|
16205
16529
|
WhisperProcessor: () => WhisperProcessor
|
|
@@ -16254,12 +16578,14 @@ __export(feature_extractors_exports, {
|
|
|
16254
16578
|
EncodecFeatureExtractor: () => EncodecFeatureExtractor,
|
|
16255
16579
|
FeatureExtractor: () => FeatureExtractor,
|
|
16256
16580
|
Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
|
|
16581
|
+
GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
|
|
16257
16582
|
MoonshineFeatureExtractor: () => MoonshineFeatureExtractor,
|
|
16258
16583
|
ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
|
|
16259
16584
|
PyAnnoteFeatureExtractor: () => PyAnnoteFeatureExtractor,
|
|
16260
16585
|
SeamlessM4TFeatureExtractor: () => SeamlessM4TFeatureExtractor,
|
|
16261
16586
|
SnacFeatureExtractor: () => SnacFeatureExtractor,
|
|
16262
16587
|
SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
|
|
16588
|
+
VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
|
|
16263
16589
|
Wav2Vec2FeatureExtractor: () => Wav2Vec2FeatureExtractor,
|
|
16264
16590
|
WeSpeakerFeatureExtractor: () => WeSpeakerFeatureExtractor,
|
|
16265
16591
|
WhisperFeatureExtractor: () => WhisperFeatureExtractor
|
|
@@ -16494,6 +16820,7 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
16494
16820
|
mel_filters = null,
|
|
16495
16821
|
mel_floor = 1e-10,
|
|
16496
16822
|
log_mel = null,
|
|
16823
|
+
max_log_mel = null,
|
|
16497
16824
|
reference = 1,
|
|
16498
16825
|
min_value = 1e-10,
|
|
16499
16826
|
db_range = null,
|
|
@@ -16633,6 +16960,17 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
16633
16960
|
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
16634
16961
|
}
|
|
16635
16962
|
break;
|
|
16963
|
+
case "log10_max_norm": {
|
|
16964
|
+
for (let i = 0; i < o; ++i) {
|
|
16965
|
+
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
16966
|
+
}
|
|
16967
|
+
const logMax = max_log_mel ?? max(mel_spec_data)[0];
|
|
16968
|
+
const threshold = logMax - 8;
|
|
16969
|
+
for (let i = 0; i < o; ++i) {
|
|
16970
|
+
mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4) / 4;
|
|
16971
|
+
}
|
|
16972
|
+
break;
|
|
16973
|
+
}
|
|
16636
16974
|
case "dB":
|
|
16637
16975
|
if (power === 1) {
|
|
16638
16976
|
amplitude_to_db(mel_spec_data, reference, min_value, db_range);
|
|
@@ -16643,7 +16981,9 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
16643
16981
|
}
|
|
16644
16982
|
break;
|
|
16645
16983
|
default:
|
|
16646
|
-
throw new Error(
|
|
16984
|
+
throw new Error(
|
|
16985
|
+
`log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`
|
|
16986
|
+
);
|
|
16647
16987
|
}
|
|
16648
16988
|
}
|
|
16649
16989
|
return mel_spec;
|
|
@@ -17148,6 +17488,56 @@ var Gemma3nAudioFeatureExtractor = class extends FeatureExtractor {
|
|
|
17148
17488
|
}
|
|
17149
17489
|
};
|
|
17150
17490
|
|
|
17491
|
+
// src/models/granite_speech/feature_extraction_granite_speech.js
|
|
17492
|
+
var GraniteSpeechFeatureExtractor = class extends FeatureExtractor {
|
|
17493
|
+
constructor(config) {
|
|
17494
|
+
super(config);
|
|
17495
|
+
const { n_fft, win_length, n_mels, sample_rate } = config.melspec_kwargs;
|
|
17496
|
+
this.mel_filters = mel_filter_bank(
|
|
17497
|
+
Math.floor(1 + n_fft / 2),
|
|
17498
|
+
// num_frequency_bins = 257
|
|
17499
|
+
n_mels,
|
|
17500
|
+
// 80
|
|
17501
|
+
0,
|
|
17502
|
+
// min_frequency
|
|
17503
|
+
sample_rate / 2,
|
|
17504
|
+
// max_frequency = 8000
|
|
17505
|
+
sample_rate,
|
|
17506
|
+
// 16000
|
|
17507
|
+
null,
|
|
17508
|
+
// norm (torchaudio default: no norm)
|
|
17509
|
+
"htk"
|
|
17510
|
+
// mel_scale (torchaudio default)
|
|
17511
|
+
);
|
|
17512
|
+
const raw_window = window_function(win_length, "hann");
|
|
17513
|
+
this.window = new Float64Array(n_fft);
|
|
17514
|
+
const pad = Math.floor((n_fft - win_length) / 2);
|
|
17515
|
+
this.window.set(raw_window, pad);
|
|
17516
|
+
}
|
|
17517
|
+
/**
|
|
17518
|
+
* Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
|
|
17519
|
+
* @param {Float32Array|Float64Array} audio The audio waveform.
|
|
17520
|
+
* @returns {Promise<{input_features: Tensor}>}
|
|
17521
|
+
*/
|
|
17522
|
+
async _call(audio) {
|
|
17523
|
+
validate_audio_inputs(audio, "GraniteSpeechFeatureExtractor");
|
|
17524
|
+
const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;
|
|
17525
|
+
const num_frames = 1 + Math.floor((audio.length - 1) / hop_length);
|
|
17526
|
+
const max_num_frames = num_frames - num_frames % 2;
|
|
17527
|
+
const mel = await spectrogram(audio, this.window, n_fft, hop_length, {
|
|
17528
|
+
power: 2,
|
|
17529
|
+
mel_filters: this.mel_filters,
|
|
17530
|
+
log_mel: "log10_max_norm",
|
|
17531
|
+
transpose: true,
|
|
17532
|
+
// [time, n_mels]
|
|
17533
|
+
max_num_frames,
|
|
17534
|
+
do_pad: false
|
|
17535
|
+
});
|
|
17536
|
+
const input_features = mel.view(-1, 2 * n_mels).unsqueeze_(0);
|
|
17537
|
+
return { input_features };
|
|
17538
|
+
}
|
|
17539
|
+
};
|
|
17540
|
+
|
|
17151
17541
|
// src/models/moonshine/feature_extraction_moonshine.js
|
|
17152
17542
|
var MoonshineFeatureExtractor = class extends FeatureExtractor {
|
|
17153
17543
|
/**
|
|
@@ -17628,6 +18018,71 @@ var WeSpeakerFeatureExtractor = class extends FeatureExtractor {
|
|
|
17628
18018
|
}
|
|
17629
18019
|
};
|
|
17630
18020
|
|
|
18021
|
+
// src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js
|
|
18022
|
+
var VoxtralRealtimeFeatureExtractor = class extends FeatureExtractor {
|
|
18023
|
+
constructor(config) {
|
|
18024
|
+
super(config);
|
|
18025
|
+
this.config.mel_filters ??= mel_filter_bank(
|
|
18026
|
+
Math.floor(1 + this.config.n_fft / 2),
|
|
18027
|
+
// num_frequency_bins
|
|
18028
|
+
this.config.feature_size,
|
|
18029
|
+
// num_mel_filters
|
|
18030
|
+
0,
|
|
18031
|
+
// min_frequency
|
|
18032
|
+
8e3,
|
|
18033
|
+
// max_frequency
|
|
18034
|
+
this.config.sampling_rate,
|
|
18035
|
+
// sampling_rate
|
|
18036
|
+
"slaney",
|
|
18037
|
+
// norm
|
|
18038
|
+
"slaney"
|
|
18039
|
+
// mel_scale
|
|
18040
|
+
);
|
|
18041
|
+
this.window = window_function(this.config.n_fft, "hann");
|
|
18042
|
+
}
|
|
18043
|
+
/**
|
|
18044
|
+
* Computes the log-Mel spectrogram of the provided audio waveform.
|
|
18045
|
+
* @param {Float32Array|Float64Array} waveform The audio waveform to process.
|
|
18046
|
+
* @param {Object} [options]
|
|
18047
|
+
* @param {boolean} [options.center=true] Whether to center-pad the waveform for STFT.
|
|
18048
|
+
* @returns {Promise<import('../../utils/tensor.js').Tensor>} The log-Mel spectrogram tensor of shape [num_mel_bins, num_frames].
|
|
18049
|
+
*/
|
|
18050
|
+
async _extract_fbank_features(waveform, { center = true } = {}) {
|
|
18051
|
+
const { n_fft, hop_length, mel_filters, global_log_mel_max } = this.config;
|
|
18052
|
+
const max_num_frames = center ? Math.floor(waveform.length / hop_length) : Math.floor((waveform.length - n_fft) / hop_length);
|
|
18053
|
+
return await spectrogram(
|
|
18054
|
+
waveform,
|
|
18055
|
+
this.window,
|
|
18056
|
+
n_fft,
|
|
18057
|
+
// frame_length
|
|
18058
|
+
hop_length,
|
|
18059
|
+
{
|
|
18060
|
+
power: 2,
|
|
18061
|
+
mel_filters,
|
|
18062
|
+
log_mel: "log10_max_norm",
|
|
18063
|
+
max_log_mel: global_log_mel_max,
|
|
18064
|
+
center,
|
|
18065
|
+
max_num_frames,
|
|
18066
|
+
do_pad: false
|
|
18067
|
+
}
|
|
18068
|
+
);
|
|
18069
|
+
}
|
|
18070
|
+
/**
|
|
18071
|
+
* Extract mel spectrogram features from audio.
|
|
18072
|
+
* @param {Float32Array|Float64Array} audio The audio data.
|
|
18073
|
+
* @param {Object} [options]
|
|
18074
|
+
* @param {boolean} [options.center=true] Whether to center-pad the waveform.
|
|
18075
|
+
* @returns {Promise<{ input_features: import('../../utils/tensor.js').Tensor }>}
|
|
18076
|
+
*/
|
|
18077
|
+
async _call(audio, { center = true } = {}) {
|
|
18078
|
+
validate_audio_inputs(audio, "VoxtralRealtimeFeatureExtractor");
|
|
18079
|
+
const features = await this._extract_fbank_features(audio, { center });
|
|
18080
|
+
return {
|
|
18081
|
+
input_features: features.unsqueeze_(0)
|
|
18082
|
+
};
|
|
18083
|
+
}
|
|
18084
|
+
};
|
|
18085
|
+
|
|
17631
18086
|
// src/models/whisper/feature_extraction_whisper.js
|
|
17632
18087
|
var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
17633
18088
|
constructor(config) {
|
|
@@ -17656,7 +18111,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
17656
18111
|
* @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
|
|
17657
18112
|
*/
|
|
17658
18113
|
async _extract_fbank_features(waveform) {
|
|
17659
|
-
|
|
18114
|
+
return await spectrogram(
|
|
17660
18115
|
waveform,
|
|
17661
18116
|
this.window,
|
|
17662
18117
|
// window
|
|
@@ -17667,7 +18122,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
17667
18122
|
{
|
|
17668
18123
|
power: 2,
|
|
17669
18124
|
mel_filters: this.config.mel_filters,
|
|
17670
|
-
log_mel: "
|
|
18125
|
+
log_mel: "log10_max_norm",
|
|
17671
18126
|
// Custom
|
|
17672
18127
|
max_num_frames: Math.min(
|
|
17673
18128
|
Math.floor(waveform.length / this.config.hop_length),
|
|
@@ -17676,15 +18131,6 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
17676
18131
|
)
|
|
17677
18132
|
}
|
|
17678
18133
|
);
|
|
17679
|
-
const data = features.data;
|
|
17680
|
-
const maxValue = max(
|
|
17681
|
-
/** @type {Float32Array} */
|
|
17682
|
-
data
|
|
17683
|
-
)[0];
|
|
17684
|
-
for (let i = 0; i < data.length; ++i) {
|
|
17685
|
-
data[i] = (Math.max(data[i], maxValue - 8) + 4) / 4;
|
|
17686
|
-
}
|
|
17687
|
-
return features;
|
|
17688
18134
|
}
|
|
17689
18135
|
/**
|
|
17690
18136
|
* Asynchronously extracts features from a given audio using the provided configuration.
|
|
@@ -18565,6 +19011,27 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
18565
19011
|
}
|
|
18566
19012
|
return [segmentation, segments];
|
|
18567
19013
|
}
|
|
19014
|
+
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
19015
|
+
if (height < factor || width < factor) {
|
|
19016
|
+
throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
|
|
19017
|
+
} else if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
19018
|
+
throw new Error(
|
|
19019
|
+
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
19020
|
+
);
|
|
19021
|
+
}
|
|
19022
|
+
let h_bar = Math.round(height / factor) * factor;
|
|
19023
|
+
let w_bar = Math.round(width / factor) * factor;
|
|
19024
|
+
if (h_bar * w_bar > max_pixels) {
|
|
19025
|
+
const beta = Math.sqrt(height * width / max_pixels);
|
|
19026
|
+
h_bar = Math.floor(height / beta / factor) * factor;
|
|
19027
|
+
w_bar = Math.floor(width / beta / factor) * factor;
|
|
19028
|
+
} else if (h_bar * w_bar < min_pixels) {
|
|
19029
|
+
const beta = Math.sqrt(min_pixels / (height * width));
|
|
19030
|
+
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
19031
|
+
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
19032
|
+
}
|
|
19033
|
+
return [h_bar, w_bar];
|
|
19034
|
+
}
|
|
18568
19035
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
18569
19036
|
if (label_ids_to_fuse === null) {
|
|
18570
19037
|
logger.warn("`label_ids_to_fuse` unset. No instance will be fused.");
|
|
@@ -18853,7 +19320,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18853
19320
|
});
|
|
18854
19321
|
}
|
|
18855
19322
|
/**
|
|
18856
|
-
* @typedef {
|
|
19323
|
+
* @typedef {Object} PreprocessedImage
|
|
18857
19324
|
* @property {HeightWidth} original_size The original size of the image.
|
|
18858
19325
|
* @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
|
|
18859
19326
|
* @property {Tensor} pixel_values The pixel values of the preprocessed image.
|
|
@@ -19031,6 +19498,7 @@ __export(image_processors_exports, {
|
|
|
19031
19498
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
19032
19499
|
ImageProcessor: () => ImageProcessor,
|
|
19033
19500
|
JinaCLIPImageProcessor: () => JinaCLIPImageProcessor,
|
|
19501
|
+
Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
|
|
19034
19502
|
LlavaOnevisionImageProcessor: () => LlavaOnevisionImageProcessor,
|
|
19035
19503
|
Mask2FormerImageProcessor: () => Mask2FormerImageProcessor,
|
|
19036
19504
|
MaskFormerFeatureExtractor: () => MaskFormerFeatureExtractor,
|
|
@@ -19434,6 +19902,237 @@ var JinaCLIPImageProcessor = class extends ImageProcessor {
|
|
|
19434
19902
|
}
|
|
19435
19903
|
};
|
|
19436
19904
|
|
|
19905
|
+
// src/models/lfm2_vl/image_processing_lfm2_vl.js
|
|
19906
|
+
function round_by_factor(number, factor) {
|
|
19907
|
+
return Math.round(number / factor) * factor;
|
|
19908
|
+
}
|
|
19909
|
+
function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
|
|
19910
|
+
let best_ratio_diff = Infinity;
|
|
19911
|
+
let best_ratio = [1, 1];
|
|
19912
|
+
const area = width * height;
|
|
19913
|
+
for (const ratio of target_ratios) {
|
|
19914
|
+
const ratio_diff = Math.abs(aspect_ratio - ratio[0] / ratio[1]);
|
|
19915
|
+
if (ratio_diff < best_ratio_diff) {
|
|
19916
|
+
best_ratio_diff = ratio_diff;
|
|
19917
|
+
best_ratio = ratio;
|
|
19918
|
+
} else if (ratio_diff === best_ratio_diff && area > 0.5 * image_size * image_size * ratio[0] * ratio[1]) {
|
|
19919
|
+
best_ratio = ratio;
|
|
19920
|
+
}
|
|
19921
|
+
}
|
|
19922
|
+
return best_ratio;
|
|
19923
|
+
}
|
|
19924
|
+
function get_target_ratios(min_tiles, max_tiles) {
|
|
19925
|
+
const ratios = [];
|
|
19926
|
+
const seen = /* @__PURE__ */ new Set();
|
|
19927
|
+
for (let n = min_tiles; n <= max_tiles; ++n) {
|
|
19928
|
+
for (let w = 1; w <= n; ++w) {
|
|
19929
|
+
for (let h = 1; h <= n; ++h) {
|
|
19930
|
+
const product2 = w * h;
|
|
19931
|
+
if (product2 >= min_tiles && product2 <= max_tiles) {
|
|
19932
|
+
const key = w << 16 | h;
|
|
19933
|
+
if (!seen.has(key)) {
|
|
19934
|
+
seen.add(key);
|
|
19935
|
+
ratios.push([w, h]);
|
|
19936
|
+
}
|
|
19937
|
+
}
|
|
19938
|
+
}
|
|
19939
|
+
}
|
|
19940
|
+
}
|
|
19941
|
+
return ratios.sort((a, b) => a[0] * a[1] - b[0] * b[1]);
|
|
19942
|
+
}
|
|
19943
|
+
function convert_image_to_patches(images, patch_size) {
|
|
19944
|
+
const [B, C, H, W] = images.dims;
|
|
19945
|
+
const ph = Math.floor(H / patch_size), pw = Math.floor(W / patch_size);
|
|
19946
|
+
const patch_dim = patch_size * patch_size * C;
|
|
19947
|
+
const data = (
|
|
19948
|
+
/** @type {Float32Array} */
|
|
19949
|
+
images.data
|
|
19950
|
+
);
|
|
19951
|
+
const result = new Float32Array(B * ph * pw * patch_dim);
|
|
19952
|
+
const ch_stride = H * W;
|
|
19953
|
+
for (let b = 0; b < B; ++b) {
|
|
19954
|
+
const b_src = b * C * ch_stride;
|
|
19955
|
+
const b_dst = b * ph * pw * patch_dim;
|
|
19956
|
+
for (let py = 0; py < ph; ++py) {
|
|
19957
|
+
for (let px = 0; px < pw; ++px) {
|
|
19958
|
+
let off = b_dst + (py * pw + px) * patch_dim;
|
|
19959
|
+
for (let dy = 0; dy < patch_size; ++dy) {
|
|
19960
|
+
const row = (py * patch_size + dy) * W + px * patch_size;
|
|
19961
|
+
for (let dx = 0; dx < patch_size; ++dx) {
|
|
19962
|
+
const pixel = row + dx;
|
|
19963
|
+
for (let c = 0; c < C; ++c) {
|
|
19964
|
+
result[off++] = data[b_src + c * ch_stride + pixel];
|
|
19965
|
+
}
|
|
19966
|
+
}
|
|
19967
|
+
}
|
|
19968
|
+
}
|
|
19969
|
+
}
|
|
19970
|
+
}
|
|
19971
|
+
return new Tensor3("float32", result, [B, ph * pw, patch_dim]);
|
|
19972
|
+
}
|
|
19973
|
+
function pad_along_first_dim(patches, target_length) {
|
|
19974
|
+
const [, len2, dim] = patches.dims;
|
|
19975
|
+
const mask_data = new BigInt64Array(target_length);
|
|
19976
|
+
mask_data.fill(1n, 0, len2);
|
|
19977
|
+
let padded = patches;
|
|
19978
|
+
if (len2 < target_length) {
|
|
19979
|
+
const padded_data = new Float32Array(target_length * dim);
|
|
19980
|
+
padded_data.set(
|
|
19981
|
+
/** @type {Float32Array} */
|
|
19982
|
+
patches.data
|
|
19983
|
+
);
|
|
19984
|
+
padded = new Tensor3("float32", padded_data, [1, target_length, dim]);
|
|
19985
|
+
}
|
|
19986
|
+
return { padded, mask: new Tensor3("int64", mask_data, [target_length]) };
|
|
19987
|
+
}
|
|
19988
|
+
var Lfm2VlImageProcessor = class extends ImageProcessor {
|
|
19989
|
+
constructor(config) {
|
|
19990
|
+
super(config);
|
|
19991
|
+
this.downsample_factor = config.downsample_factor ?? 2;
|
|
19992
|
+
this.do_image_splitting = config.do_image_splitting ?? true;
|
|
19993
|
+
this.min_tiles = config.min_tiles ?? 2;
|
|
19994
|
+
this.max_tiles = config.max_tiles ?? 10;
|
|
19995
|
+
this.use_thumbnail = config.use_thumbnail ?? true;
|
|
19996
|
+
this.min_image_tokens = config.min_image_tokens ?? 64;
|
|
19997
|
+
this.max_image_tokens = config.max_image_tokens ?? 256;
|
|
19998
|
+
this.encoder_patch_size = config.encoder_patch_size ?? config.patch_size ?? 16;
|
|
19999
|
+
this.tile_size = config.tile_size ?? 512;
|
|
20000
|
+
this.max_pixels_tolerance = config.max_pixels_tolerance ?? 2;
|
|
20001
|
+
this.return_row_col_info = config.return_row_col_info ?? false;
|
|
20002
|
+
const max_thumbnail_patches = this.max_image_tokens * this.downsample_factor ** 2;
|
|
20003
|
+
const tile_size_patches = this.do_image_splitting ? (this.tile_size / this.encoder_patch_size) ** 2 : 0;
|
|
20004
|
+
this.max_num_patches = Math.max(max_thumbnail_patches, tile_size_patches);
|
|
20005
|
+
}
|
|
20006
|
+
/**
|
|
20007
|
+
* Check if the image is too large to be processed as a single tile.
|
|
20008
|
+
* @param {number} height
|
|
20009
|
+
* @param {number} width
|
|
20010
|
+
* @returns {boolean}
|
|
20011
|
+
*/
|
|
20012
|
+
_is_image_too_large(height, width) {
|
|
20013
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
20014
|
+
const h_bar = Math.max(this.encoder_patch_size, round_by_factor(height, total_factor));
|
|
20015
|
+
const w_bar = Math.max(this.encoder_patch_size, round_by_factor(width, total_factor));
|
|
20016
|
+
return h_bar * w_bar > this.max_image_tokens * (this.encoder_patch_size * this.downsample_factor) ** 2 * this.max_pixels_tolerance;
|
|
20017
|
+
}
|
|
20018
|
+
/**
|
|
20019
|
+
* Get the grid layout for tiling a large image.
|
|
20020
|
+
* @param {number} height
|
|
20021
|
+
* @param {number} width
|
|
20022
|
+
* @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
|
|
20023
|
+
*/
|
|
20024
|
+
_get_grid_layout(height, width) {
|
|
20025
|
+
const target_ratios = get_target_ratios(this.min_tiles, this.max_tiles);
|
|
20026
|
+
const [grid_width, grid_height] = find_closest_aspect_ratio(
|
|
20027
|
+
width / height,
|
|
20028
|
+
target_ratios,
|
|
20029
|
+
width,
|
|
20030
|
+
height,
|
|
20031
|
+
this.tile_size
|
|
20032
|
+
);
|
|
20033
|
+
return {
|
|
20034
|
+
grid_width,
|
|
20035
|
+
grid_height,
|
|
20036
|
+
target_width: this.tile_size * grid_width,
|
|
20037
|
+
target_height: this.tile_size * grid_height
|
|
20038
|
+
};
|
|
20039
|
+
}
|
|
20040
|
+
/** @param {RawImage|RawImage[]|RawImage[][]} images */
|
|
20041
|
+
// @ts-expect-error
|
|
20042
|
+
async _call(images, { return_row_col_info = null } = {}) {
|
|
20043
|
+
let batched_images;
|
|
20044
|
+
if (!Array.isArray(images)) {
|
|
20045
|
+
batched_images = [[images]];
|
|
20046
|
+
} else if (!Array.isArray(images[0])) {
|
|
20047
|
+
batched_images = [
|
|
20048
|
+
/** @type {RawImage[]} */
|
|
20049
|
+
images
|
|
20050
|
+
];
|
|
20051
|
+
} else {
|
|
20052
|
+
batched_images = /** @type {RawImage[][]} */
|
|
20053
|
+
images;
|
|
20054
|
+
}
|
|
20055
|
+
const all_pixel_values = [];
|
|
20056
|
+
const all_pixel_masks = [];
|
|
20057
|
+
const all_spatial_shapes = [];
|
|
20058
|
+
const all_rows = [];
|
|
20059
|
+
const all_cols = [];
|
|
20060
|
+
const all_image_sizes = [];
|
|
20061
|
+
for (const image_batch of batched_images) {
|
|
20062
|
+
const preprocessed = await Promise.all(image_batch.map((x) => this.preprocess(x, { do_pad: false })));
|
|
20063
|
+
for (const { pixel_values } of preprocessed) {
|
|
20064
|
+
const [, height, width] = pixel_values.dims;
|
|
20065
|
+
const img = pixel_values.unsqueeze_(0);
|
|
20066
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
20067
|
+
const f2 = total_factor ** 2;
|
|
20068
|
+
const [new_height, new_width] = smart_resize(
|
|
20069
|
+
Math.max(total_factor, height),
|
|
20070
|
+
Math.max(total_factor, width),
|
|
20071
|
+
total_factor,
|
|
20072
|
+
this.min_image_tokens * f2,
|
|
20073
|
+
this.max_image_tokens * f2
|
|
20074
|
+
).map((x) => Math.max(total_factor, x));
|
|
20075
|
+
let tiles;
|
|
20076
|
+
let num_rows = 1, num_cols = 1;
|
|
20077
|
+
const is_large = this._is_image_too_large(height, width);
|
|
20078
|
+
const do_splitting = this.do_image_splitting && !(this.min_tiles === 1 && this.max_tiles === 1);
|
|
20079
|
+
if (is_large && do_splitting) {
|
|
20080
|
+
const { grid_width, grid_height, target_width, target_height } = this._get_grid_layout(
|
|
20081
|
+
height,
|
|
20082
|
+
width
|
|
20083
|
+
);
|
|
20084
|
+
num_rows = grid_height;
|
|
20085
|
+
num_cols = grid_width;
|
|
20086
|
+
const resized = await interpolate_4d(img, {
|
|
20087
|
+
size: [target_height, target_width]
|
|
20088
|
+
});
|
|
20089
|
+
tiles = [];
|
|
20090
|
+
for (let r = 0; r < grid_height; ++r) {
|
|
20091
|
+
for (let c = 0; c < grid_width; ++c) {
|
|
20092
|
+
const y = r * this.tile_size;
|
|
20093
|
+
const x = c * this.tile_size;
|
|
20094
|
+
tiles.push(resized.slice(null, null, [y, y + this.tile_size], [x, x + this.tile_size]));
|
|
20095
|
+
}
|
|
20096
|
+
}
|
|
20097
|
+
if (this.use_thumbnail && grid_width * grid_height !== 1) {
|
|
20098
|
+
tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
|
|
20099
|
+
}
|
|
20100
|
+
} else {
|
|
20101
|
+
tiles = [await interpolate_4d(img, { size: [new_height, new_width] })];
|
|
20102
|
+
}
|
|
20103
|
+
for (const tile of tiles) {
|
|
20104
|
+
const [, , th, tw] = tile.dims;
|
|
20105
|
+
const patches = convert_image_to_patches(tile, this.encoder_patch_size);
|
|
20106
|
+
const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
|
|
20107
|
+
all_pixel_values.push(padded);
|
|
20108
|
+
all_pixel_masks.push(mask);
|
|
20109
|
+
all_spatial_shapes.push([
|
|
20110
|
+
Math.floor(th / this.encoder_patch_size),
|
|
20111
|
+
Math.floor(tw / this.encoder_patch_size)
|
|
20112
|
+
]);
|
|
20113
|
+
}
|
|
20114
|
+
all_rows.push(num_rows);
|
|
20115
|
+
all_cols.push(num_cols);
|
|
20116
|
+
all_image_sizes.push([new_height, new_width]);
|
|
20117
|
+
}
|
|
20118
|
+
}
|
|
20119
|
+
const result = {
|
|
20120
|
+
pixel_values: cat(all_pixel_values, 0),
|
|
20121
|
+
pixel_attention_mask: stack(all_pixel_masks, 0),
|
|
20122
|
+
spatial_shapes: new Tensor3("int64", BigInt64Array.from(all_spatial_shapes.flat(), BigInt), [
|
|
20123
|
+
all_spatial_shapes.length,
|
|
20124
|
+
2
|
|
20125
|
+
])
|
|
20126
|
+
};
|
|
20127
|
+
if (return_row_col_info ?? this.return_row_col_info) {
|
|
20128
|
+
result.image_rows = all_rows;
|
|
20129
|
+
result.image_cols = all_cols;
|
|
20130
|
+
result.image_sizes = all_image_sizes;
|
|
20131
|
+
}
|
|
20132
|
+
return result;
|
|
20133
|
+
}
|
|
20134
|
+
};
|
|
20135
|
+
|
|
19437
20136
|
// src/models/llava_onevision/image_processing_llava_onevision.js
|
|
19438
20137
|
var LlavaOnevisionImageProcessor = class extends ImageProcessor {
|
|
19439
20138
|
};
|
|
@@ -19657,27 +20356,6 @@ var PvtImageProcessor = class extends ImageProcessor {
|
|
|
19657
20356
|
};
|
|
19658
20357
|
|
|
19659
20358
|
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
19660
|
-
/**
 * Rescale image dimensions so that both sides are multiples of `factor` and the
 * total pixel count is moved towards the `[min_pixels, max_pixels]` range while
 * roughly keeping the aspect ratio.
 *
 * @param {number} height Original image height.
 * @param {number} width Original image width.
 * @param {number} [factor=28] Both returned sides are multiples of this value.
 * @param {number} [min_pixels=56*56] Lower bound for `height * width` of the result.
 * @param {number} [max_pixels=14*14*4*1280] Upper bound for `height * width` of the result.
 * @returns {[number, number]} The resized `[height, width]`; each side is at least `factor`.
 * @throws {Error} If either side is smaller than `factor`, or the aspect ratio exceeds 200.
 */
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
  if (height < factor || width < factor) {
    throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
  }
  const aspect_ratio = Math.max(height, width) / Math.min(height, width);
  if (aspect_ratio > 200) {
    throw new Error(`absolute aspect ratio must be smaller than 200, got ${aspect_ratio}`);
  }

  let h_bar = Math.round(height / factor) * factor;
  let w_bar = Math.round(width / factor) * factor;

  if (h_bar * w_bar > max_pixels) {
    // Shrink both sides by the same ratio. For elongated images with a tight
    // budget, flooring can take the short side down to 0, so clamp each side to
    // one `factor`; a zero-sized dimension is never a valid resize target.
    const beta = Math.sqrt(height * width / max_pixels);
    h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
    w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
  } else if (h_bar * w_bar < min_pixels) {
    // Grow both sides by the same ratio, rounding up so the minimum is reached.
    const beta = Math.sqrt(min_pixels / (height * width));
    h_bar = Math.ceil(height * beta / factor) * factor;
    w_bar = Math.ceil(width * beta / factor) * factor;
  }
  return [h_bar, w_bar];
}
|
|
19681
20359
|
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
19682
20360
|
constructor(config) {
|
|
19683
20361
|
super(config);
|
|
@@ -20279,6 +20957,57 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
20279
20957
|
}
|
|
20280
20958
|
};
|
|
20281
20959
|
|
|
20960
|
+
// src/models/granite_speech/processing_granite_speech.js
|
|
20961
|
+
/**
 * Processor that pairs a tokenizer with an audio feature extractor and expands
 * the audio placeholder token in the prompt to one token per audio embedding.
 */
var GraniteSpeechProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = true;

  /**
   * Compute the number of audio tokens for a given raw audio length.
   * @param {number} audioLength Raw audio sample count.
   * @returns {number} Number of projector output tokens.
   */
  _get_num_audio_features(audioLength) {
    const fe_config = this.feature_extractor.config;
    const { hop_length } = fe_config.melspec_kwargs;
    const { projector_window_size, projector_downsample_rate } = fe_config;

    // Tokens emitted by the projector for each window of encoder frames.
    const tokens_per_window = Math.floor(projector_window_size / projector_downsample_rate);

    const mel_frames = Math.floor(audioLength / hop_length) + 1;
    const encoder_frames = Math.floor(mel_frames / 2);
    const num_windows = Math.ceil(encoder_frames / projector_window_size);
    return num_windows * tokens_per_window;
  }

  /**
   * @param {string} text The text input to process.
   * @param {Float32Array} audio The audio input to process.
   * @param {Object} [kwargs] Extra options forwarded to the tokenizer.
   * @returns {Promise<Object>} The tokenizer outputs, plus `input_features` and
   * `input_features_mask` when audio is given.
   */
  async _call(text, audio = null, kwargs = {}) {
    if (Array.isArray(text)) {
      throw new Error("Batched inputs are not supported yet.");
    }

    const audio_inputs = {};
    let prompt = text;

    if (audio) {
      const extracted = await this.feature_extractor(audio);
      audio_inputs.input_features = extracted.input_features;

      // All-ones mask with one entry per audio token.
      const num_audio_tokens = this._get_num_audio_features(audio.length);
      audio_inputs.input_features_mask = new Tensor3(
        "bool",
        new Uint8Array(num_audio_tokens).fill(1),
        [1, num_audio_tokens]
      );

      const audio_token = this.config.audio_token ?? "<|audio|>";
      if (!prompt.includes(audio_token)) {
        throw new Error(`The input text does not contain the audio token ${audio_token}.`);
      }
      // Every placeholder is expanded to one token per audio embedding.
      prompt = prompt.replaceAll(audio_token, audio_token.repeat(num_audio_tokens));
    }

    const text_inputs = this.tokenizer(prompt, { add_special_tokens: false, ...kwargs });
    return { ...text_inputs, ...audio_inputs };
  }
};
|
|
21010
|
+
|
|
20282
21011
|
// src/models/grounding_dino/processing_grounding_dino.js
|
|
20283
21012
|
function get_phrases_from_posmap(posmaps, input_ids) {
|
|
20284
21013
|
const left_idx = 0;
|
|
@@ -20555,6 +21284,66 @@ var JinaCLIPProcessor = class extends Processor {
|
|
|
20555
21284
|
}
|
|
20556
21285
|
};
|
|
20557
21286
|
|
|
21287
|
+
// src/models/lfm2_vl/processing_lfm2_vl.js
|
|
21288
|
+
/**
 * Processor that runs the image processor and expands every image placeholder
 * in the prompt into the tile/thumbnail token layout of the matching image.
 */
var Lfm2VlProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static image_processor_class = AutoImageProcessor;

  /**
   * @param {RawImage|RawImage[]} images
   * @param {string|string[]|null} [text]
   * @param {Record<string, any>} [kwargs] Forwarded to both the image processor and the tokenizer.
   * @returns {Promise<Object>} The image processor outputs (without the row/col bookkeeping),
   * merged with the tokenizer outputs when `text` is given.
   */
  async _call(images, text = null, kwargs = {}) {
    // Row/col info is always requested: it is needed to expand the placeholders
    // and is removed from the returned inputs again.
    const processed = await this.image_processor(images, {
      ...kwargs,
      return_row_col_info: true
    });
    const { image_rows, image_cols, image_sizes, ...image_inputs } = processed;

    if (!text) {
      return { ...image_inputs };
    }

    const image_token = this.config.image_token ?? "<image>";
    const {
      tile_size = 512,
      downsample_factor = 2,
      encoder_patch_size = 16,
      use_thumbnail = true
    } = /** @type {Record<string, any>} */ (this.image_processor.config);

    // Number of tokens along one side of length `side`, after patching and downsampling.
    const tokens_along = (side) => Math.ceil(Math.floor(side / encoder_patch_size) / downsample_factor);
    const tokens_per_tile = tokens_along(tile_size) ** 2;

    const image_start = this.config.image_start_token ?? "<|image_start|>";
    const image_end = this.config.image_end_token ?? "<|image_end|>";
    const thumbnail_token = this.config.image_thumbnail ?? "<|img_thumbnail|>";

    // Images are consumed in order across all text samples.
    let image_idx = 0;

    // Builds the replacement for one placeholder, followed by the text after it.
    const expand_placeholder = (trailing_text) => {
      const idx = image_idx++;
      const [h, w] = image_sizes[idx];
      const rows = image_rows[idx];
      const cols = image_cols[idx];
      const tokens_for_image = tokens_along(h) * tokens_along(w);

      let expanded = image_start;
      if (rows > 1 || cols > 1) {
        const tile_str = image_token.repeat(tokens_per_tile);
        for (let r = 1; r <= rows; ++r) {
          for (let c = 1; c <= cols; ++c) {
            expanded += `<|img_row_${r}_col_${c}|>${tile_str}`;
          }
        }
        if (use_thumbnail) {
          expanded += `${thumbnail_token}${image_token.repeat(tokens_for_image)}`;
        }
      } else {
        expanded += image_token.repeat(tokens_for_image);
      }
      return `${expanded}${image_end}${trailing_text}`;
    };

    const samples = Array.isArray(text) ? text : [text];
    const expanded_text = samples.map((sample) => {
      const parts = sample.split(image_token);
      return parts[0] + parts.slice(1).map(expand_placeholder).join("");
    });

    return {
      ...image_inputs,
      ...this.tokenizer(expanded_text, kwargs)
    };
  }
};
|
|
21346
|
+
|
|
20558
21347
|
// src/models/llava/processing_llava.js
|
|
20559
21348
|
var LlavaProcessor = class extends Processor {
|
|
20560
21349
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -21087,6 +21876,94 @@ var VoxtralProcessor = class extends Processor {
|
|
|
21087
21876
|
}
|
|
21088
21877
|
};
|
|
21089
21878
|
|
|
21879
|
+
// src/models/voxtral_realtime/processing_voxtral_realtime.js
|
|
21880
|
+
// Tokens' worth of silence prepended to the first audio chunk in streaming mode.
var NUM_LEFT_PAD_TOKENS = 32;
// Delay tokens; counted in the first-chunk size, the streaming prompt and the right padding.
var NUM_DELAY_TOKENS = 6;
// Number of mel frames per text token.
var AUDIO_LENGTH_PER_TOK = 8;
// Extra tokens of right padding used in non-streaming mode.
var OFFLINE_STREAMING_BUFFER_TOKENS = 10;
// Token id used to fill the streaming prompt after its first position.
var STREAMING_PAD_TOKEN_ID = 32;

var VoxtralRealtimeProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = false;

  /** Number of mel frames in the first audio chunk. */
  get num_mel_frames_first_audio_chunk() {
    return AUDIO_LENGTH_PER_TOK * (1 + NUM_DELAY_TOKENS);
  }

  /** Number of raw audio samples in the first audio chunk. */
  get num_samples_first_audio_chunk() {
    const fe_config = this.feature_extractor.config;
    const half_window = Math.floor(fe_config.n_fft / 2);
    return (this.num_mel_frames_first_audio_chunk - 1) * fe_config.hop_length + half_window;
  }

  /** Number of raw audio samples per subsequent audio chunk. */
  get num_samples_per_audio_chunk() {
    const fe_config = this.feature_extractor.config;
    return AUDIO_LENGTH_PER_TOK * fe_config.hop_length + fe_config.n_fft;
  }

  /** Number of right-pad tokens for non-streaming mode. */
  get num_right_pad_tokens() {
    return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
  }

  /** Number of mel frames per text token. */
  get audio_length_per_tok() {
    return AUDIO_LENGTH_PER_TOK;
  }

  /** Number of raw audio samples per token. */
  get raw_audio_length_per_tok() {
    return AUDIO_LENGTH_PER_TOK * this.feature_extractor.config.hop_length;
  }

  /**
   * Process audio input for VoxtralRealtime.
   *
   * - Streaming, first chunk (`is_streaming=true`, `is_first_audio_chunk=true`):
   *   the audio is left-padded with silence and mel features are extracted with
   *   `center=true`. Returns `{ input_ids, input_features }`.
   * - Streaming, later chunks (`is_first_audio_chunk=false`): the chunk is
   *   processed as-is with `center=false`; only `{ input_features }` is returned.
   * - Non-streaming: the audio is right-padded so that the model transcribes the
   *   full audio, then processed with `center=true`. Returns `{ input_features }`.
   *
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @param {Object} [options]
   * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
   * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
   * @returns {Promise<Object>}
   * @throws {Error} If `is_first_audio_chunk` is `false` outside of streaming mode.
   */
  async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeProcessor");

    // Later chunks only exist in streaming mode.
    if (!is_first_audio_chunk) {
      if (!is_streaming) {
        throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");
      }
      return await this.feature_extractor(audio, { center: false });
    }

    // Non-streaming: append trailing silence (the typed array is zero-initialised).
    if (!is_streaming) {
      const num_right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
      const right_padded = new Float32Array(audio.length + num_right_pad_samples);
      right_padded.set(audio);
      return await this.feature_extractor(right_padded, { center: true });
    }

    // Streaming, first chunk: prepend silence and build the initial prompt.
    const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
    const left_padded = new Float32Array(num_left_pad_samples + audio.length);
    left_padded.set(audio, num_left_pad_samples);
    const audio_encoding = await this.feature_extractor(left_padded, { center: true });

    // Prompt layout: token id 1 first, then one pad token per left-pad and delay token.
    const num_input_tokens = 1 + NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
    const pad_id = BigInt(STREAMING_PAD_TOKEN_ID);
    const input_ids_data = BigInt64Array.from(
      { length: num_input_tokens },
      (_, position) => (position === 0 ? 1n : pad_id)
    );
    const input_ids = new Tensor3("int64", input_ids_data, [1, num_input_tokens]);

    return { input_ids, ...audio_encoding };
  }
};
|
|
21966
|
+
|
|
21090
21967
|
// src/models/wav2vec2/processing_wav2vec2.js
|
|
21091
21968
|
var Wav2Vec2Processor = class extends Processor {
|
|
21092
21969
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -21186,10 +22063,13 @@ function getNormalizedConfig(config) {
|
|
|
21186
22063
|
case "florence2":
|
|
21187
22064
|
case "llava_onevision":
|
|
21188
22065
|
case "idefics3":
|
|
22066
|
+
case "granite_speech":
|
|
21189
22067
|
case "ultravox":
|
|
21190
22068
|
case "voxtral":
|
|
22069
|
+
case "voxtral_realtime":
|
|
21191
22070
|
case "smolvlm":
|
|
21192
22071
|
case "gemma3n":
|
|
22072
|
+
case "lfm2_vl":
|
|
21193
22073
|
case "chatterbox":
|
|
21194
22074
|
case "mistral3":
|
|
21195
22075
|
case "qwen2_5_vl":
|
|
@@ -21244,10 +22124,13 @@ function getNormalizedConfig(config) {
|
|
|
21244
22124
|
case "cohere":
|
|
21245
22125
|
case "cohere2":
|
|
21246
22126
|
case "mistral":
|
|
22127
|
+
case "voxtral_realtime_text":
|
|
22128
|
+
case "voxtral_realtime_encoder":
|
|
21247
22129
|
case "starcoder2":
|
|
21248
22130
|
case "qwen2":
|
|
21249
22131
|
case "qwen2_moe":
|
|
21250
22132
|
case "qwen2_vl":
|
|
22133
|
+
case "qwen2_vl_text":
|
|
21251
22134
|
case "qwen2_5_vl_text":
|
|
21252
22135
|
case "qwen3_moe":
|
|
21253
22136
|
case "qwen3_vl_text":
|
|
@@ -21392,6 +22275,9 @@ function getNormalizedConfig(config) {
|
|
|
21392
22275
|
return normalized_config;
|
|
21393
22276
|
}
|
|
21394
22277
|
function getCacheShapes(config, options) {
|
|
22278
|
+
if (!(config instanceof PretrainedConfig)) {
|
|
22279
|
+
config = new PretrainedConfig(config);
|
|
22280
|
+
}
|
|
21395
22281
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
21396
22282
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
21397
22283
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -21498,12 +22384,16 @@ function getCacheShapes(config, options) {
|
|
|
21498
22384
|
}
|
|
21499
22385
|
}
|
|
21500
22386
|
return cache_values;
|
|
21501
|
-
} else if (["qwen3_5", "qwen3_5_moe"].includes(config.model_type)) {
|
|
21502
|
-
|
|
21503
|
-
|
|
21504
|
-
|
|
21505
|
-
|
|
21506
|
-
|
|
22387
|
+
} else if (["lfm2_vl", "qwen3_5", "qwen3_5_moe", "voxtral_realtime"].includes(config.model_type)) {
|
|
22388
|
+
let subConfig;
|
|
22389
|
+
if (config.model_type === "voxtral_realtime" && options?.session_name === "audio_encoder") {
|
|
22390
|
+
subConfig = /** @type {any} */
|
|
22391
|
+
config.audio_config;
|
|
22392
|
+
} else {
|
|
22393
|
+
subConfig = /** @type {any} */
|
|
22394
|
+
config.text_config;
|
|
22395
|
+
}
|
|
22396
|
+
return getCacheShapes(subConfig, options);
|
|
21507
22397
|
}
|
|
21508
22398
|
return getKeyValueShapes(config, options);
|
|
21509
22399
|
}
|
|
@@ -21669,7 +22559,7 @@ async function getModelDataFiles(pretrained_model_name_or_path, fileName, suffix
|
|
|
21669
22559
|
}
|
|
21670
22560
|
|
|
21671
22561
|
// src/models/session.js
|
|
21672
|
-
async function getSession(pretrained_model_name_or_path, fileName, options,
|
|
22562
|
+
async function getSession(pretrained_model_name_or_path, fileName, options, cache_config = false, session_name = void 0) {
|
|
21673
22563
|
let custom_config = options.config?.["transformers.js_config"] ?? {};
|
|
21674
22564
|
const selectedDevice = (
|
|
21675
22565
|
/** @type {import("../utils/devices.js").DeviceType} */
|
|
@@ -21727,9 +22617,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
21727
22617
|
if (externalData.length > 0 && !apis.IS_NODE_ENV) {
|
|
21728
22618
|
session_options.externalData = externalData;
|
|
21729
22619
|
}
|
|
21730
|
-
if (
|
|
22620
|
+
if (cache_config && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
|
|
21731
22621
|
const shapes = getCacheShapes(options.config, {
|
|
21732
|
-
prefix: "present"
|
|
22622
|
+
prefix: "present",
|
|
22623
|
+
session_name
|
|
21733
22624
|
});
|
|
21734
22625
|
if (Object.keys(shapes).length > 0 && !isONNXProxy()) {
|
|
21735
22626
|
const preferredOutputLocation = {};
|
|
@@ -21747,15 +22638,17 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
21747
22638
|
};
|
|
21748
22639
|
return { buffer_or_path, session_options, session_config };
|
|
21749
22640
|
}
|
|
21750
|
-
async function constructSessions(pretrained_model_name_or_path, names, options,
|
|
22641
|
+
async function constructSessions(pretrained_model_name_or_path, names, options, cache_sessions = void 0) {
|
|
21751
22642
|
return Object.fromEntries(
|
|
21752
22643
|
await Promise.all(
|
|
21753
22644
|
Object.keys(names).map(async (name) => {
|
|
22645
|
+
const cache_config = cache_sessions?.[name] ?? false;
|
|
21754
22646
|
const { buffer_or_path, session_options, session_config } = await getSession(
|
|
21755
22647
|
pretrained_model_name_or_path,
|
|
21756
22648
|
names[name],
|
|
21757
22649
|
options,
|
|
21758
|
-
|
|
22650
|
+
cache_config,
|
|
22651
|
+
name
|
|
21759
22652
|
);
|
|
21760
22653
|
const session = await createInferenceSession(buffer_or_path, session_options, session_config);
|
|
21761
22654
|
return [name, session];
|
|
@@ -23055,19 +23948,71 @@ var BeamSearchSampler = class extends LogitsSampler {
|
|
|
23055
23948
|
}
|
|
23056
23949
|
};
|
|
23057
23950
|
|
|
23951
|
+
// src/cache_utils.js
|
|
23952
|
+
/**
 * Cache object whose own properties map entry names to tensors.
 */
var _DynamicCache = class {
  /**
   * Create a DynamicCache, optionally pre-populated with entries.
   * @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
   * @throws {TypeError} If a key clashes with an existing property (for example a
   * method name) or a value is not a Tensor.
   */
  constructor(entries) {
    if (entries) {
      for (const name in entries) {
        // `in` also sees prototype members, so entries cannot shadow methods.
        if (name in this) {
          throw new TypeError(`Key "${name}" conflicts with an existing property on DynamicCache`);
        }
        const tensor = entries[name];
        if (!(tensor instanceof Tensor3)) {
          throw new TypeError(`Expected a Tensor for key "${name}", got ${typeof tensor}`);
        }
        this[name] = tensor;
      }
    }
  }

  /**
   * Get the cached sequence length. This requires at least one attention cache entry to be present.
   * @returns {number} The past sequence length, read from the second-to-last
   * dimension of the first `past_key_values.*` entry.
   * @throws {Error} If the cache holds no `past_key_values.*` entry.
   */
  get_seq_length() {
    const cache = /** @type {any} */ (this);
    for (const name in cache) {
      if (!name.startsWith("past_key_values.")) {
        continue;
      }
      return cache[name].dims.at(-2);
    }
    throw new Error("Unable to determine sequence length from the cache.");
  }

  /**
   * Dispose all contained tensors whose data resides on the GPU.
   * Returns a promise that resolves when all disposals are complete.
   * @returns {Promise<void>} Promise that resolves when all GPU tensors are disposed.
   */
  async dispose() {
    const tensors = /** @type {Tensor[]} */ (Object.values(this));
    // Only entries located in a GPU buffer are disposed; the rest are left alone.
    const pending = tensors.flatMap((tensor) => (tensor.location === "gpu-buffer" ? [tensor.dispose()] : []));
    await Promise.all(pending);
  }
};
// Exposed under the public name via a JSDoc cast that gives it a typed constructor signature.
var DynamicCache = (
  /** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */
  /** @type {unknown} */
  _DynamicCache
);
|
|
24010
|
+
|
|
23058
24011
|
// src/models/modeling_utils.js
|
|
23059
24012
|
var MODEL_MAPPING_NAMES = null;
|
|
23060
24013
|
function registerTaskMappings(mappings) {
|
|
23061
24014
|
MODEL_MAPPING_NAMES = mappings;
|
|
23062
24015
|
}
|
|
23063
|
-
function getPastLength(past_key_values) {
|
|
23064
|
-
for (const name in past_key_values) {
|
|
23065
|
-
if (name.startsWith("past_key_values.")) {
|
|
23066
|
-
return past_key_values[name].dims.at(-2);
|
|
23067
|
-
}
|
|
23068
|
-
}
|
|
23069
|
-
return Object.values(past_key_values)[0].dims.at(-2);
|
|
23070
|
-
}
|
|
23071
24016
|
function toI64Tensor(items) {
|
|
23072
24017
|
if (items instanceof Tensor3) {
|
|
23073
24018
|
return items;
|
|
@@ -23108,71 +24053,181 @@ var MODEL_TYPES = {
|
|
|
23108
24053
|
AutoEncoder: 12,
|
|
23109
24054
|
ImageAudioTextToText: 13,
|
|
23110
24055
|
Supertonic: 14,
|
|
23111
|
-
Chatterbox: 15
|
|
24056
|
+
Chatterbox: 15,
|
|
24057
|
+
MultimodalLanguageModelOnly: 16,
|
|
24058
|
+
VoxtralRealtime: 17
|
|
23112
24059
|
};
|
|
23113
24060
|
var MODEL_TYPE_CONFIG = {
|
|
23114
24061
|
[MODEL_TYPES.DecoderOnly]: {
|
|
23115
24062
|
can_generate: true,
|
|
23116
24063
|
forward: decoder_forward,
|
|
23117
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
24064
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
24065
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" }),
|
|
24066
|
+
cache_sessions: { model: true },
|
|
24067
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23118
24068
|
},
|
|
23119
24069
|
[MODEL_TYPES.DecoderOnlyWithoutHead]: {
|
|
23120
24070
|
can_generate: false,
|
|
23121
24071
|
forward: decoder_forward,
|
|
23122
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
24072
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
24073
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
23123
24074
|
},
|
|
23124
24075
|
[MODEL_TYPES.Seq2Seq]: {
|
|
23125
24076
|
can_generate: true,
|
|
23126
24077
|
forward: seq2seq_forward,
|
|
23127
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
24078
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
24079
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
24080
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24081
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23128
24082
|
},
|
|
23129
24083
|
[MODEL_TYPES.Vision2Seq]: {
|
|
23130
24084
|
can_generate: true,
|
|
23131
24085
|
forward: seq2seq_forward,
|
|
23132
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
24086
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
24087
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
24088
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24089
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23133
24090
|
},
|
|
23134
24091
|
[MODEL_TYPES.Musicgen]: {
|
|
23135
24092
|
can_generate: true,
|
|
23136
|
-
forward: seq2seq_forward
|
|
24093
|
+
forward: seq2seq_forward,
|
|
24094
|
+
sessions: () => ({
|
|
24095
|
+
model: "text_encoder",
|
|
24096
|
+
decoder_model_merged: "decoder_model_merged",
|
|
24097
|
+
encodec_decode: "encodec_decode"
|
|
24098
|
+
}),
|
|
24099
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24100
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23137
24101
|
},
|
|
23138
24102
|
[MODEL_TYPES.EncoderDecoder]: {
|
|
23139
24103
|
can_generate: false,
|
|
23140
|
-
forward: seq2seq_forward
|
|
24104
|
+
forward: seq2seq_forward,
|
|
24105
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
24106
|
+
cache_sessions: { decoder_model_merged: true }
|
|
24107
|
+
},
|
|
24108
|
+
[MODEL_TYPES.MaskGeneration]: {
|
|
24109
|
+
sessions: () => ({ model: "vision_encoder", prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder" })
|
|
23141
24110
|
},
|
|
23142
24111
|
[MODEL_TYPES.ImageTextToText]: {
|
|
23143
24112
|
can_generate: true,
|
|
23144
24113
|
forward: image_text_to_text_forward,
|
|
23145
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24114
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24115
|
+
sessions: (config) => {
|
|
24116
|
+
const s = {
|
|
24117
|
+
embed_tokens: "embed_tokens",
|
|
24118
|
+
vision_encoder: "vision_encoder",
|
|
24119
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24120
|
+
};
|
|
24121
|
+
if (config.is_encoder_decoder) s["model"] = "encoder_model";
|
|
24122
|
+
return s;
|
|
24123
|
+
},
|
|
24124
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24125
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23146
24126
|
},
|
|
23147
24127
|
[MODEL_TYPES.AudioTextToText]: {
|
|
23148
24128
|
can_generate: true,
|
|
23149
24129
|
forward: audio_text_to_text_forward,
|
|
23150
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24130
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24131
|
+
sessions: () => ({
|
|
24132
|
+
embed_tokens: "embed_tokens",
|
|
24133
|
+
audio_encoder: "audio_encoder",
|
|
24134
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24135
|
+
}),
|
|
24136
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24137
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23151
24138
|
},
|
|
23152
|
-
[MODEL_TYPES.
|
|
24139
|
+
[MODEL_TYPES.ImageAudioTextToText]: {
|
|
23153
24140
|
can_generate: true,
|
|
23154
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24141
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24142
|
+
sessions: () => ({
|
|
24143
|
+
embed_tokens: "embed_tokens",
|
|
24144
|
+
audio_encoder: "audio_encoder",
|
|
24145
|
+
vision_encoder: "vision_encoder",
|
|
24146
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24147
|
+
}),
|
|
24148
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23155
24149
|
},
|
|
23156
|
-
[MODEL_TYPES.
|
|
24150
|
+
[MODEL_TYPES.Phi3V]: {
|
|
23157
24151
|
can_generate: true,
|
|
23158
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24152
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24153
|
+
sessions: () => ({
|
|
24154
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
24155
|
+
model: "model",
|
|
24156
|
+
vision_encoder: "vision_encoder"
|
|
24157
|
+
}),
|
|
24158
|
+
cache_sessions: { model: true },
|
|
24159
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23159
24160
|
},
|
|
23160
24161
|
[MODEL_TYPES.MultiModality]: {
|
|
23161
|
-
can_generate: true
|
|
24162
|
+
can_generate: true,
|
|
24163
|
+
sessions: () => ({
|
|
24164
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
24165
|
+
model: "language_model",
|
|
24166
|
+
lm_head: "lm_head",
|
|
24167
|
+
gen_head: "gen_head",
|
|
24168
|
+
gen_img_embeds: "gen_img_embeds",
|
|
24169
|
+
image_decode: "image_decode"
|
|
24170
|
+
}),
|
|
24171
|
+
cache_sessions: { model: true },
|
|
24172
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23162
24173
|
},
|
|
23163
24174
|
[MODEL_TYPES.AutoEncoder]: {
|
|
23164
24175
|
can_generate: false,
|
|
23165
|
-
forward: auto_encoder_forward
|
|
24176
|
+
forward: auto_encoder_forward,
|
|
24177
|
+
sessions: () => ({ encoder_model: "encoder_model", decoder_model: "decoder_model" })
|
|
24178
|
+
},
|
|
24179
|
+
[MODEL_TYPES.Supertonic]: {
|
|
24180
|
+
sessions: () => ({
|
|
24181
|
+
text_encoder: "text_encoder",
|
|
24182
|
+
latent_denoiser: "latent_denoiser",
|
|
24183
|
+
voice_decoder: "voice_decoder"
|
|
24184
|
+
})
|
|
23166
24185
|
},
|
|
23167
24186
|
[MODEL_TYPES.Chatterbox]: {
|
|
23168
24187
|
can_generate: true,
|
|
23169
|
-
forward: encoder_forward
|
|
24188
|
+
forward: encoder_forward,
|
|
24189
|
+
sessions: () => ({
|
|
24190
|
+
embed_tokens: "embed_tokens",
|
|
24191
|
+
speech_encoder: "speech_encoder",
|
|
24192
|
+
model: "language_model",
|
|
24193
|
+
conditional_decoder: "conditional_decoder"
|
|
24194
|
+
}),
|
|
24195
|
+
cache_sessions: { model: true },
|
|
24196
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
24197
|
+
},
|
|
24198
|
+
[MODEL_TYPES.MultimodalLanguageModelOnly]: {
|
|
24199
|
+
can_generate: true,
|
|
24200
|
+
forward: image_text_to_text_forward,
|
|
24201
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24202
|
+
sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
|
|
24203
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24204
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
24205
|
+
},
|
|
24206
|
+
[MODEL_TYPES.VoxtralRealtime]: {
|
|
24207
|
+
can_generate: true,
|
|
24208
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
24209
|
+
sessions: () => ({
|
|
24210
|
+
embed_tokens: "embed_tokens",
|
|
24211
|
+
audio_encoder: "audio_encoder",
|
|
24212
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24213
|
+
}),
|
|
24214
|
+
cache_sessions: { decoder_model_merged: true, audio_encoder: true },
|
|
24215
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23170
24216
|
},
|
|
23171
24217
|
default: {
|
|
23172
24218
|
can_generate: false,
|
|
23173
|
-
forward: encoder_forward
|
|
24219
|
+
forward: encoder_forward,
|
|
24220
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
23174
24221
|
}
|
|
23175
24222
|
};
|
|
24223
|
+
function getSessionsConfig(modelType, config, options = {}) {
|
|
24224
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
24225
|
+
return {
|
|
24226
|
+
sessions: typeConfig.sessions(config, options),
|
|
24227
|
+
cache_sessions: typeConfig.cache_sessions,
|
|
24228
|
+
optional_configs: typeConfig.optional_configs
|
|
24229
|
+
};
|
|
24230
|
+
}
|
|
23176
24231
|
var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
|
|
23177
24232
|
var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
|
|
23178
24233
|
var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
|
|
@@ -23258,245 +24313,23 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23258
24313
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
|
|
23259
24314
|
const modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
23260
24315
|
config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
|
|
23261
|
-
|
|
23262
|
-
if (modelType ===
|
|
23263
|
-
|
|
23264
|
-
|
|
23265
|
-
|
|
23266
|
-
{
|
|
23267
|
-
|
|
23268
|
-
},
|
|
23269
|
-
options,
|
|
23270
|
-
"model"
|
|
23271
|
-
),
|
|
23272
|
-
get_optional_configs(
|
|
23273
|
-
pretrained_model_name_or_path,
|
|
23274
|
-
{
|
|
23275
|
-
generation_config: "generation_config.json"
|
|
23276
|
-
},
|
|
23277
|
-
options
|
|
23278
|
-
)
|
|
23279
|
-
]);
|
|
23280
|
-
} else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
|
|
23281
|
-
info = await Promise.all([
|
|
23282
|
-
constructSessions(
|
|
23283
|
-
pretrained_model_name_or_path,
|
|
23284
|
-
{
|
|
23285
|
-
model: "encoder_model",
|
|
23286
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23287
|
-
},
|
|
23288
|
-
options,
|
|
23289
|
-
"decoder_model_merged"
|
|
23290
|
-
),
|
|
23291
|
-
get_optional_configs(
|
|
23292
|
-
pretrained_model_name_or_path,
|
|
23293
|
-
{
|
|
23294
|
-
generation_config: "generation_config.json"
|
|
23295
|
-
},
|
|
23296
|
-
options
|
|
23297
|
-
)
|
|
23298
|
-
]);
|
|
23299
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
23300
|
-
info = await Promise.all([
|
|
23301
|
-
constructSessions(
|
|
23302
|
-
pretrained_model_name_or_path,
|
|
23303
|
-
{
|
|
23304
|
-
model: "vision_encoder",
|
|
23305
|
-
prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder"
|
|
23306
|
-
},
|
|
23307
|
-
options
|
|
23308
|
-
)
|
|
23309
|
-
]);
|
|
23310
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
23311
|
-
info = await Promise.all([
|
|
23312
|
-
constructSessions(
|
|
23313
|
-
pretrained_model_name_or_path,
|
|
23314
|
-
{
|
|
23315
|
-
model: "encoder_model",
|
|
23316
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23317
|
-
},
|
|
23318
|
-
options,
|
|
23319
|
-
"decoder_model_merged"
|
|
23320
|
-
)
|
|
23321
|
-
]);
|
|
23322
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
23323
|
-
const sessions = {
|
|
23324
|
-
embed_tokens: "embed_tokens",
|
|
23325
|
-
vision_encoder: "vision_encoder",
|
|
23326
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23327
|
-
};
|
|
23328
|
-
if (config.is_encoder_decoder) {
|
|
23329
|
-
sessions["model"] = "encoder_model";
|
|
23330
|
-
}
|
|
23331
|
-
info = await Promise.all([
|
|
23332
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
23333
|
-
get_optional_configs(
|
|
23334
|
-
pretrained_model_name_or_path,
|
|
23335
|
-
{
|
|
23336
|
-
generation_config: "generation_config.json"
|
|
23337
|
-
},
|
|
23338
|
-
options
|
|
23339
|
-
)
|
|
23340
|
-
]);
|
|
23341
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
23342
|
-
const sessions = {
|
|
23343
|
-
embed_tokens: "embed_tokens",
|
|
23344
|
-
audio_encoder: "audio_encoder",
|
|
23345
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23346
|
-
};
|
|
23347
|
-
info = await Promise.all([
|
|
23348
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
23349
|
-
get_optional_configs(
|
|
23350
|
-
pretrained_model_name_or_path,
|
|
23351
|
-
{
|
|
23352
|
-
generation_config: "generation_config.json"
|
|
23353
|
-
},
|
|
23354
|
-
options
|
|
23355
|
-
)
|
|
23356
|
-
]);
|
|
23357
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
23358
|
-
const sessions = {
|
|
23359
|
-
embed_tokens: "embed_tokens",
|
|
23360
|
-
audio_encoder: "audio_encoder",
|
|
23361
|
-
vision_encoder: "vision_encoder",
|
|
23362
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23363
|
-
};
|
|
23364
|
-
info = await Promise.all([
|
|
23365
|
-
constructSessions(pretrained_model_name_or_path, sessions, options),
|
|
23366
|
-
get_optional_configs(
|
|
23367
|
-
pretrained_model_name_or_path,
|
|
23368
|
-
{
|
|
23369
|
-
generation_config: "generation_config.json"
|
|
23370
|
-
},
|
|
23371
|
-
options
|
|
23372
|
-
)
|
|
23373
|
-
]);
|
|
23374
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
23375
|
-
info = await Promise.all([
|
|
23376
|
-
constructSessions(
|
|
23377
|
-
pretrained_model_name_or_path,
|
|
23378
|
-
{
|
|
23379
|
-
model: "text_encoder",
|
|
23380
|
-
decoder_model_merged: "decoder_model_merged",
|
|
23381
|
-
encodec_decode: "encodec_decode"
|
|
23382
|
-
},
|
|
23383
|
-
options,
|
|
23384
|
-
"decoder_model_merged"
|
|
23385
|
-
),
|
|
23386
|
-
get_optional_configs(
|
|
23387
|
-
pretrained_model_name_or_path,
|
|
23388
|
-
{
|
|
23389
|
-
generation_config: "generation_config.json"
|
|
23390
|
-
},
|
|
23391
|
-
options
|
|
23392
|
-
)
|
|
23393
|
-
]);
|
|
23394
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
23395
|
-
info = await Promise.all([
|
|
23396
|
-
constructSessions(
|
|
23397
|
-
pretrained_model_name_or_path,
|
|
23398
|
-
{
|
|
23399
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
23400
|
-
model: "language_model",
|
|
23401
|
-
lm_head: "lm_head",
|
|
23402
|
-
gen_head: "gen_head",
|
|
23403
|
-
gen_img_embeds: "gen_img_embeds",
|
|
23404
|
-
image_decode: "image_decode"
|
|
23405
|
-
},
|
|
23406
|
-
options,
|
|
23407
|
-
"model"
|
|
23408
|
-
),
|
|
23409
|
-
get_optional_configs(
|
|
23410
|
-
pretrained_model_name_or_path,
|
|
23411
|
-
{
|
|
23412
|
-
generation_config: "generation_config.json"
|
|
23413
|
-
},
|
|
23414
|
-
options
|
|
23415
|
-
)
|
|
23416
|
-
]);
|
|
23417
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
23418
|
-
info = await Promise.all([
|
|
23419
|
-
constructSessions(
|
|
23420
|
-
pretrained_model_name_or_path,
|
|
23421
|
-
{
|
|
23422
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
23423
|
-
model: "model",
|
|
23424
|
-
vision_encoder: "vision_encoder"
|
|
23425
|
-
},
|
|
23426
|
-
options,
|
|
23427
|
-
"model"
|
|
23428
|
-
),
|
|
23429
|
-
get_optional_configs(
|
|
23430
|
-
pretrained_model_name_or_path,
|
|
23431
|
-
{
|
|
23432
|
-
generation_config: "generation_config.json"
|
|
23433
|
-
},
|
|
23434
|
-
options
|
|
23435
|
-
)
|
|
23436
|
-
]);
|
|
23437
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
23438
|
-
info = await Promise.all([
|
|
23439
|
-
constructSessions(
|
|
23440
|
-
pretrained_model_name_or_path,
|
|
23441
|
-
{
|
|
23442
|
-
embed_tokens: "embed_tokens",
|
|
23443
|
-
speech_encoder: "speech_encoder",
|
|
23444
|
-
model: "language_model",
|
|
23445
|
-
conditional_decoder: "conditional_decoder"
|
|
23446
|
-
},
|
|
23447
|
-
options,
|
|
23448
|
-
"model"
|
|
23449
|
-
),
|
|
23450
|
-
get_optional_configs(
|
|
23451
|
-
pretrained_model_name_or_path,
|
|
23452
|
-
{
|
|
23453
|
-
generation_config: "generation_config.json"
|
|
23454
|
-
},
|
|
23455
|
-
options
|
|
23456
|
-
)
|
|
23457
|
-
]);
|
|
23458
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
23459
|
-
info = await Promise.all([
|
|
23460
|
-
constructSessions(
|
|
23461
|
-
pretrained_model_name_or_path,
|
|
23462
|
-
{
|
|
23463
|
-
encoder_model: "encoder_model",
|
|
23464
|
-
decoder_model: "decoder_model"
|
|
23465
|
-
},
|
|
23466
|
-
options
|
|
23467
|
-
)
|
|
23468
|
-
]);
|
|
23469
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
23470
|
-
info = await Promise.all([
|
|
23471
|
-
constructSessions(
|
|
23472
|
-
pretrained_model_name_or_path,
|
|
23473
|
-
{
|
|
23474
|
-
text_encoder: "text_encoder",
|
|
23475
|
-
latent_denoiser: "latent_denoiser",
|
|
23476
|
-
voice_decoder: "voice_decoder"
|
|
23477
|
-
},
|
|
23478
|
-
options
|
|
23479
|
-
)
|
|
23480
|
-
]);
|
|
23481
|
-
} else {
|
|
23482
|
-
if (modelType === void 0) {
|
|
23483
|
-
const type = modelName ?? config?.model_type;
|
|
23484
|
-
if (type !== "custom") {
|
|
23485
|
-
logger.warn(
|
|
23486
|
-
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
23487
|
-
);
|
|
23488
|
-
}
|
|
24316
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
24317
|
+
if (modelType === void 0) {
|
|
24318
|
+
const type = modelName ?? config?.model_type;
|
|
24319
|
+
if (type !== "custom") {
|
|
24320
|
+
logger.warn(
|
|
24321
|
+
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
24322
|
+
);
|
|
23489
24323
|
}
|
|
23490
|
-
info = await Promise.all([
|
|
23491
|
-
constructSessions(
|
|
23492
|
-
pretrained_model_name_or_path,
|
|
23493
|
-
{
|
|
23494
|
-
model: options.model_file_name ?? "model"
|
|
23495
|
-
},
|
|
23496
|
-
options
|
|
23497
|
-
)
|
|
23498
|
-
]);
|
|
23499
24324
|
}
|
|
24325
|
+
const sessions = typeConfig.sessions(config, options);
|
|
24326
|
+
const promises = [
|
|
24327
|
+
constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
|
|
24328
|
+
];
|
|
24329
|
+
if (typeConfig.optional_configs) {
|
|
24330
|
+
promises.push(get_optional_configs(pretrained_model_name_or_path, typeConfig.optional_configs, options));
|
|
24331
|
+
}
|
|
24332
|
+
const info = await Promise.all(promises);
|
|
23500
24333
|
return new this(config, ...info);
|
|
23501
24334
|
}
|
|
23502
24335
|
/**
|
|
@@ -23695,7 +24528,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23695
24528
|
* @param {Tensor} [params.inputs=null]
|
|
23696
24529
|
* @param {number} [params.bos_token_id=null]
|
|
23697
24530
|
* @param {Record<string, Tensor|number[]>} [params.model_kwargs]
|
|
23698
|
-
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor
|
|
24531
|
+
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor> & {past_key_values?: DynamicCache}, model_input_name: string}} The model-specific inputs for generation.
|
|
23699
24532
|
*/
|
|
23700
24533
|
_prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) {
|
|
23701
24534
|
const model_inputs = pick(model_kwargs, this.forward_params);
|
|
@@ -23936,11 +24769,12 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23936
24769
|
}
|
|
23937
24770
|
}
|
|
23938
24771
|
/**
|
|
23939
|
-
* Returns
|
|
24772
|
+
* Returns a DynamicCache containing past key values from the given decoder results object.
|
|
23940
24773
|
*
|
|
23941
24774
|
* @param {Object} decoderResults The decoder results object.
|
|
23942
|
-
* @param {
|
|
23943
|
-
* @
|
|
24775
|
+
* @param {DynamicCache} pastKeyValues The previous past key values.
|
|
24776
|
+
* @param {boolean} [disposeEncoderPKVs=false] Whether to dispose encoder past key values.
|
|
24777
|
+
* @returns {DynamicCache} A new DynamicCache containing the updated past key values.
|
|
23944
24778
|
*/
|
|
23945
24779
|
getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
|
|
23946
24780
|
const pkvs = /* @__PURE__ */ Object.create(null);
|
|
@@ -23961,7 +24795,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23961
24795
|
}
|
|
23962
24796
|
}
|
|
23963
24797
|
}
|
|
23964
|
-
return pkvs;
|
|
24798
|
+
return new DynamicCache(pkvs);
|
|
23965
24799
|
}
|
|
23966
24800
|
/**
|
|
23967
24801
|
* Returns an object containing attentions from the given model output object.
|
|
@@ -23986,8 +24820,8 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23986
24820
|
/**
|
|
23987
24821
|
* Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values.
|
|
23988
24822
|
*
|
|
23989
|
-
* @param {
|
|
23990
|
-
* @param {
|
|
24823
|
+
* @param {Record<string, any>} decoderFeeds The decoder feeds object to add past key values to.
|
|
24824
|
+
* @param {DynamicCache|null} pastKeyValues The cache containing past key values.
|
|
23991
24825
|
*/
|
|
23992
24826
|
addPastKeyValues(decoderFeeds, pastKeyValues) {
|
|
23993
24827
|
if (pastKeyValues) {
|
|
@@ -24004,14 +24838,29 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
24004
24838
|
}
|
|
24005
24839
|
}
|
|
24006
24840
|
}
|
|
24007
|
-
|
|
24008
|
-
|
|
24841
|
+
/**
|
|
24842
|
+
* Helper function to select valid inputs and run through the appropriate encoder (vision, text, audio) based on the input type.
|
|
24843
|
+
* @param {string} sessionName
|
|
24844
|
+
* @param {Record<string, Tensor>} inputs
|
|
24845
|
+
* @param {string} outputName
|
|
24846
|
+
* @private
|
|
24847
|
+
*/
|
|
24848
|
+
async _encode_input(sessionName, inputs, outputName) {
|
|
24849
|
+
if (!Object.hasOwn(this.sessions, sessionName)) {
|
|
24850
|
+
throw new Error(`Model does not have a ${sessionName} session.`);
|
|
24851
|
+
}
|
|
24852
|
+
const session = this.sessions[sessionName];
|
|
24853
|
+
const output = await sessionRun(session, pick(inputs, session.inputNames));
|
|
24854
|
+
return output[outputName];
|
|
24855
|
+
}
|
|
24856
|
+
async encode_image(inputs) {
|
|
24857
|
+
return this._encode_input("vision_encoder", inputs, "image_features");
|
|
24009
24858
|
}
|
|
24010
|
-
async encode_text(
|
|
24011
|
-
return
|
|
24859
|
+
async encode_text(inputs) {
|
|
24860
|
+
return this._encode_input("embed_tokens", inputs, "inputs_embeds");
|
|
24012
24861
|
}
|
|
24013
|
-
async encode_audio(
|
|
24014
|
-
return
|
|
24862
|
+
async encode_audio(inputs) {
|
|
24863
|
+
return this._encode_input("audio_encoder", inputs, "audio_features");
|
|
24015
24864
|
}
|
|
24016
24865
|
};
|
|
24017
24866
|
async function seq2seq_forward(self2, model_inputs) {
|
|
@@ -24066,6 +24915,9 @@ async function decoder_forward(self2, model_inputs, is_encoder_decoder = false)
|
|
|
24066
24915
|
const start_index = ["paligemma", "gemma3_text", "gemma3"].includes(self2.config.model_type) ? 1 : 0;
|
|
24067
24916
|
new_model_inputs.position_ids = create_position_ids(new_model_inputs, past_key_values, start_index);
|
|
24068
24917
|
}
|
|
24918
|
+
if (session.inputNames.includes("num_logits_to_keep") && !new_model_inputs.num_logits_to_keep) {
|
|
24919
|
+
new_model_inputs.num_logits_to_keep = new Tensor3("int64", [0n], []);
|
|
24920
|
+
}
|
|
24069
24921
|
self2.addPastKeyValues(new_model_inputs, past_key_values);
|
|
24070
24922
|
const fixed = pick(new_model_inputs, session.inputNames);
|
|
24071
24923
|
return await sessionRun(session, fixed);
|
|
@@ -24074,7 +24926,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24074
24926
|
// Generic parameters:
|
|
24075
24927
|
encode_function,
|
|
24076
24928
|
merge_function,
|
|
24077
|
-
|
|
24929
|
+
modality_input_names,
|
|
24078
24930
|
modality_output_name,
|
|
24079
24931
|
// Produced by the tokenizer/processor:
|
|
24080
24932
|
input_ids = null,
|
|
@@ -24089,32 +24941,34 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24089
24941
|
// Additional parameters
|
|
24090
24942
|
...kwargs
|
|
24091
24943
|
}) {
|
|
24092
|
-
const modality_values = kwargs[modality_input_name];
|
|
24093
24944
|
if (!inputs_embeds) {
|
|
24094
24945
|
inputs_embeds = await self2.encode_text({ input_ids, ...kwargs });
|
|
24095
|
-
|
|
24096
|
-
|
|
24097
|
-
|
|
24098
|
-
|
|
24099
|
-
|
|
24100
|
-
|
|
24101
|
-
|
|
24102
|
-
|
|
24103
|
-
|
|
24104
|
-
inputs_embeds,
|
|
24105
|
-
|
|
24106
|
-
|
|
24107
|
-
|
|
24108
|
-
|
|
24109
|
-
|
|
24110
|
-
|
|
24111
|
-
|
|
24112
|
-
|
|
24113
|
-
|
|
24114
|
-
|
|
24115
|
-
|
|
24116
|
-
|
|
24117
|
-
|
|
24946
|
+
const modality_values = pick(kwargs, modality_input_names);
|
|
24947
|
+
if (Object.keys(modality_values).length > 0) {
|
|
24948
|
+
if (input_ids.dims[1] !== 1) {
|
|
24949
|
+
const modality_features = await encode_function({
|
|
24950
|
+
// Pass the modality values under its expected key.
|
|
24951
|
+
// The caller knows whether this is audio or image.
|
|
24952
|
+
...modality_values,
|
|
24953
|
+
...kwargs
|
|
24954
|
+
});
|
|
24955
|
+
({ inputs_embeds, attention_mask } = merge_function({
|
|
24956
|
+
[modality_output_name]: modality_features,
|
|
24957
|
+
inputs_embeds,
|
|
24958
|
+
input_ids,
|
|
24959
|
+
attention_mask
|
|
24960
|
+
}));
|
|
24961
|
+
} else if (past_key_values && input_ids.dims[1] === 1) {
|
|
24962
|
+
const target_length = input_ids.dims[1];
|
|
24963
|
+
const past_length = past_key_values.get_seq_length();
|
|
24964
|
+
attention_mask = cat(
|
|
24965
|
+
[
|
|
24966
|
+
ones([input_ids.dims[0], past_length]),
|
|
24967
|
+
attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
|
|
24968
|
+
],
|
|
24969
|
+
1
|
|
24970
|
+
);
|
|
24971
|
+
}
|
|
24118
24972
|
}
|
|
24119
24973
|
}
|
|
24120
24974
|
if (!position_ids) {
|
|
@@ -24122,10 +24976,13 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24122
24976
|
// Handle special case for qwen vl models
|
|
24123
24977
|
[
|
|
24124
24978
|
"qwen2_vl",
|
|
24979
|
+
"qwen2_vl_text",
|
|
24125
24980
|
"qwen2_5_vl",
|
|
24126
24981
|
"qwen2_5_vl_text",
|
|
24127
24982
|
"qwen3_vl",
|
|
24128
24983
|
"qwen3_vl_text",
|
|
24984
|
+
"qwen3_vl_moe",
|
|
24985
|
+
"qwen3_vl_moe_text",
|
|
24129
24986
|
"qwen3_5",
|
|
24130
24987
|
"qwen3_5_text",
|
|
24131
24988
|
"qwen3_5_moe",
|
|
@@ -24153,7 +25010,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24153
25010
|
async function audio_text_to_text_forward(self2, params) {
|
|
24154
25011
|
return await generic_text_to_text_forward(self2, {
|
|
24155
25012
|
...params,
|
|
24156
|
-
|
|
25013
|
+
modality_input_names: ["audio_values", "input_features"],
|
|
24157
25014
|
modality_output_name: "audio_features",
|
|
24158
25015
|
encode_function: self2.encode_audio.bind(self2),
|
|
24159
25016
|
merge_function: self2._merge_input_ids_with_audio_features.bind(self2)
|
|
@@ -24162,7 +25019,7 @@ async function audio_text_to_text_forward(self2, params) {
|
|
|
24162
25019
|
async function image_text_to_text_forward(self2, params) {
|
|
24163
25020
|
return await generic_text_to_text_forward(self2, {
|
|
24164
25021
|
...params,
|
|
24165
|
-
|
|
25022
|
+
modality_input_names: ["pixel_values"],
|
|
24166
25023
|
modality_output_name: "image_features",
|
|
24167
25024
|
encode_function: self2.encode_image.bind(self2),
|
|
24168
25025
|
merge_function: self2._merge_input_ids_with_image_features.bind(self2)
|
|
@@ -24198,7 +25055,11 @@ function create_position_ids(model_inputs, past_key_values = null, start_index =
|
|
|
24198
25055
|
return position_ids;
|
|
24199
25056
|
}
|
|
24200
25057
|
function decoder_prepare_inputs_for_generation(self2, input_ids, model_inputs, generation_config) {
|
|
24201
|
-
const past_length = model_inputs.past_key_values ?
|
|
25058
|
+
const past_length = model_inputs.past_key_values ? model_inputs.past_key_values.get_seq_length() : 0;
|
|
25059
|
+
const session = self2.sessions["decoder_model_merged"] ?? self2.sessions["model"];
|
|
25060
|
+
if (session?.inputNames.includes("num_logits_to_keep") && !model_inputs.num_logits_to_keep) {
|
|
25061
|
+
model_inputs.num_logits_to_keep = new Tensor3("int64", [1n], []);
|
|
25062
|
+
}
|
|
24202
25063
|
if (!model_inputs.attention_mask) {
|
|
24203
25064
|
let dims;
|
|
24204
25065
|
for (const key of ["input_ids", "inputs_embeds", "position_ids"]) {
|
|
@@ -24506,6 +25367,7 @@ __export(models_exports, {
|
|
|
24506
25367
|
Gemma3ForCausalLM: () => Gemma3ForCausalLM,
|
|
24507
25368
|
Gemma3Model: () => Gemma3Model,
|
|
24508
25369
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
25370
|
+
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
24509
25371
|
Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
|
|
24510
25372
|
Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
|
|
24511
25373
|
GemmaForCausalLM: () => GemmaForCausalLM,
|
|
@@ -24523,6 +25385,7 @@ __export(models_exports, {
|
|
|
24523
25385
|
GraniteMoeHybridModel: () => GraniteMoeHybridModel,
|
|
24524
25386
|
GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
|
|
24525
25387
|
GranitePreTrainedModel: () => GranitePreTrainedModel,
|
|
25388
|
+
GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
|
|
24526
25389
|
GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
|
|
24527
25390
|
GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
|
|
24528
25391
|
GroupViTModel: () => GroupViTModel,
|
|
@@ -24544,7 +25407,6 @@ __export(models_exports, {
|
|
|
24544
25407
|
IJepaModel: () => IJepaModel,
|
|
24545
25408
|
IJepaPreTrainedModel: () => IJepaPreTrainedModel,
|
|
24546
25409
|
Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
|
|
24547
|
-
Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
|
|
24548
25410
|
JAISLMHeadModel: () => JAISLMHeadModel,
|
|
24549
25411
|
JAISModel: () => JAISModel,
|
|
24550
25412
|
JAISPreTrainedModel: () => JAISPreTrainedModel,
|
|
@@ -24558,6 +25420,7 @@ __export(models_exports, {
|
|
|
24558
25420
|
Lfm2MoeModel: () => Lfm2MoeModel,
|
|
24559
25421
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
24560
25422
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
25423
|
+
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
24561
25424
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
24562
25425
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
24563
25426
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -24697,7 +25560,6 @@ __export(models_exports, {
|
|
|
24697
25560
|
Owlv2Model: () => Owlv2Model,
|
|
24698
25561
|
Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
|
|
24699
25562
|
PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
|
|
24700
|
-
PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
|
|
24701
25563
|
ParakeetForCTC: () => ParakeetForCTC,
|
|
24702
25564
|
ParakeetPreTrainedModel: () => ParakeetPreTrainedModel,
|
|
24703
25565
|
PatchTSMixerForPrediction: () => PatchTSMixerForPrediction,
|
|
@@ -24727,8 +25589,10 @@ __export(models_exports, {
|
|
|
24727
25589
|
Qwen2MoeModel: () => Qwen2MoeModel,
|
|
24728
25590
|
Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
|
|
24729
25591
|
Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
|
|
25592
|
+
Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
|
|
24730
25593
|
Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
|
|
24731
25594
|
Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
|
|
25595
|
+
Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
|
|
24732
25596
|
Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
|
|
24733
25597
|
Qwen3ForCausalLM: () => Qwen3ForCausalLM,
|
|
24734
25598
|
Qwen3Model: () => Qwen3Model,
|
|
@@ -24739,9 +25603,13 @@ __export(models_exports, {
|
|
|
24739
25603
|
Qwen3NextModel: () => Qwen3NextModel,
|
|
24740
25604
|
Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
|
|
24741
25605
|
Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
|
|
25606
|
+
Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
|
|
24742
25607
|
Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
|
|
25608
|
+
Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
|
|
24743
25609
|
Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
|
|
25610
|
+
Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
|
|
24744
25611
|
Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
|
|
25612
|
+
Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
|
|
24745
25613
|
Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
|
|
24746
25614
|
RFDetrForObjectDetection: () => RFDetrForObjectDetection,
|
|
24747
25615
|
RFDetrModel: () => RFDetrModel,
|
|
@@ -24792,7 +25660,6 @@ __export(models_exports, {
|
|
|
24792
25660
|
SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
|
|
24793
25661
|
SmolLM3Model: () => SmolLM3Model,
|
|
24794
25662
|
SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
|
|
24795
|
-
SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
|
|
24796
25663
|
SnacDecoderModel: () => SnacDecoderModel,
|
|
24797
25664
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
24798
25665
|
SnacModel: () => SnacModel,
|
|
@@ -24864,6 +25731,8 @@ __export(models_exports, {
|
|
|
24864
25731
|
VitsModelOutput: () => VitsModelOutput,
|
|
24865
25732
|
VitsPreTrainedModel: () => VitsPreTrainedModel,
|
|
24866
25733
|
VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
|
|
25734
|
+
VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
|
|
25735
|
+
VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
|
|
24867
25736
|
Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
|
|
24868
25737
|
Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
|
|
24869
25738
|
Wav2Vec2BertModel: () => Wav2Vec2BertModel,
|
|
@@ -25224,7 +26093,7 @@ var ChatterboxModel = class extends ChatterboxPreTrainedModel {
|
|
|
25224
26093
|
if (!past_key_values || target_length !== 1) {
|
|
25225
26094
|
throw new Error("Incorrect state encountered during generation.");
|
|
25226
26095
|
}
|
|
25227
|
-
const past_length =
|
|
26096
|
+
const past_length = past_key_values.get_seq_length();
|
|
25228
26097
|
attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
|
|
25229
26098
|
}
|
|
25230
26099
|
}
|
|
@@ -26254,6 +27123,8 @@ var Gemma3nForConditionalGeneration = class extends Gemma3nPreTrainedModel {
|
|
|
26254
27123
|
});
|
|
26255
27124
|
}
|
|
26256
27125
|
};
|
|
27126
|
+
var Gemma3nForCausalLM = class extends Gemma3nForConditionalGeneration {
|
|
27127
|
+
};
|
|
26257
27128
|
|
|
26258
27129
|
// src/models/glm/modeling_glm.js
|
|
26259
27130
|
var GlmPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -26335,6 +27206,28 @@ var GraniteMoeHybridModel = class extends GraniteMoeHybridPreTrainedModel {
|
|
|
26335
27206
|
var GraniteMoeHybridForCausalLM = class extends GraniteMoeHybridPreTrainedModel {
|
|
26336
27207
|
};
|
|
26337
27208
|
|
|
27209
|
+
// src/models/ultravox/modeling_ultravox.js
|
|
27210
|
+
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
27211
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
27212
|
+
};
|
|
27213
|
+
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
27214
|
+
_merge_input_ids_with_audio_features(kwargs) {
|
|
27215
|
+
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
27216
|
+
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
27217
|
+
return default_merge_input_ids_with_audio_features({
|
|
27218
|
+
// @ts-ignore
|
|
27219
|
+
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id ?? this.config.audio_token_index,
|
|
27220
|
+
...kwargs,
|
|
27221
|
+
audio_features: reshaped_audio_features
|
|
27222
|
+
});
|
|
27223
|
+
}
|
|
27224
|
+
};
|
|
27225
|
+
|
|
27226
|
+
// src/models/granite_speech/modeling_granite_speech.js
|
|
27227
|
+
var GraniteSpeechForConditionalGeneration = class extends UltravoxModel {
|
|
27228
|
+
forward_params = ["input_ids", "attention_mask", "input_features", "past_key_values"];
|
|
27229
|
+
};
|
|
27230
|
+
|
|
26338
27231
|
// src/models/grounding_dino/modeling_grounding_dino.js
|
|
26339
27232
|
var GroundingDinoPreTrainedModel = class extends PreTrainedModel {
|
|
26340
27233
|
};
|
|
@@ -26439,34 +27332,37 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
|
|
|
26439
27332
|
var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
|
|
26440
27333
|
};
|
|
26441
27334
|
|
|
26442
|
-
// src/models/
|
|
26443
|
-
var
|
|
26444
|
-
forward_params = [
|
|
26445
|
-
"input_ids",
|
|
26446
|
-
"attention_mask",
|
|
26447
|
-
"pixel_values",
|
|
26448
|
-
"pixel_attention_mask",
|
|
26449
|
-
"position_ids",
|
|
26450
|
-
"past_key_values"
|
|
26451
|
-
];
|
|
27335
|
+
// src/models/llava/modeling_llava.js
|
|
27336
|
+
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
27337
|
+
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
26452
27338
|
};
|
|
26453
|
-
var
|
|
26454
|
-
async encode_image({ pixel_values, pixel_attention_mask }) {
|
|
26455
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], { pixel_values, pixel_attention_mask })).image_features;
|
|
26456
|
-
return features;
|
|
26457
|
-
}
|
|
27339
|
+
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
26458
27340
|
_merge_input_ids_with_image_features(kwargs) {
|
|
26459
27341
|
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
26460
27342
|
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
26461
27343
|
return default_merge_input_ids_with_image_features({
|
|
26462
27344
|
// @ts-ignore
|
|
26463
|
-
image_token_id: this.config.image_token_id,
|
|
27345
|
+
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
26464
27346
|
...kwargs,
|
|
26465
27347
|
image_features: reshaped_image_hidden_states
|
|
26466
27348
|
});
|
|
26467
27349
|
}
|
|
26468
27350
|
};
|
|
26469
|
-
var
|
|
27351
|
+
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27352
|
+
};
|
|
27353
|
+
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
27354
|
+
};
|
|
27355
|
+
|
|
27356
|
+
// src/models/idefics3/modeling_idefics3.js
|
|
27357
|
+
var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27358
|
+
forward_params = [
|
|
27359
|
+
"input_ids",
|
|
27360
|
+
"attention_mask",
|
|
27361
|
+
"pixel_values",
|
|
27362
|
+
"pixel_attention_mask",
|
|
27363
|
+
"position_ids",
|
|
27364
|
+
"past_key_values"
|
|
27365
|
+
];
|
|
26470
27366
|
};
|
|
26471
27367
|
|
|
26472
27368
|
// src/models/ijepa/modeling_ijepa.js
|
|
@@ -26558,6 +27454,19 @@ var Lfm2MoeModel = class extends Lfm2MoePreTrainedModel {
|
|
|
26558
27454
|
var Lfm2MoeForCausalLM = class extends Lfm2MoePreTrainedModel {
|
|
26559
27455
|
};
|
|
26560
27456
|
|
|
27457
|
+
// src/models/lfm2_vl/modeling_lfm2_vl.js
|
|
27458
|
+
var Lfm2VlForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27459
|
+
forward_params = [
|
|
27460
|
+
"input_ids",
|
|
27461
|
+
"attention_mask",
|
|
27462
|
+
"pixel_values",
|
|
27463
|
+
"pixel_attention_mask",
|
|
27464
|
+
"spatial_shapes",
|
|
27465
|
+
"position_ids",
|
|
27466
|
+
"past_key_values"
|
|
27467
|
+
];
|
|
27468
|
+
};
|
|
27469
|
+
|
|
26561
27470
|
// src/models/llama/modeling_llama.js
|
|
26562
27471
|
var LlamaPreTrainedModel = class extends PreTrainedModel {
|
|
26563
27472
|
};
|
|
@@ -26572,27 +27481,6 @@ var Llama4PreTrainedModel = class extends PreTrainedModel {
|
|
|
26572
27481
|
var Llama4ForCausalLM = class extends Llama4PreTrainedModel {
|
|
26573
27482
|
};
|
|
26574
27483
|
|
|
26575
|
-
// src/models/llava/modeling_llava.js
|
|
26576
|
-
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
26577
|
-
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
26578
|
-
};
|
|
26579
|
-
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
26580
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
26581
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
26582
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
26583
|
-
return default_merge_input_ids_with_image_features({
|
|
26584
|
-
// @ts-ignore
|
|
26585
|
-
image_token_id: this.config.image_token_index,
|
|
26586
|
-
...kwargs,
|
|
26587
|
-
image_features: reshaped_image_hidden_states
|
|
26588
|
-
});
|
|
26589
|
-
}
|
|
26590
|
-
};
|
|
26591
|
-
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
26592
|
-
};
|
|
26593
|
-
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
26594
|
-
};
|
|
26595
|
-
|
|
26596
27484
|
// src/models/longt5/modeling_longt5.js
|
|
26597
27485
|
var LongT5PreTrainedModel = class extends PreTrainedModel {
|
|
26598
27486
|
};
|
|
@@ -27343,27 +28231,7 @@ var OwlViTForObjectDetection = class extends OwlViTPreTrainedModel {
|
|
|
27343
28231
|
};
|
|
27344
28232
|
|
|
27345
28233
|
// src/models/paligemma/modeling_paligemma.js
|
|
27346
|
-
var
|
|
27347
|
-
forward_params = [
|
|
27348
|
-
"input_ids",
|
|
27349
|
-
// 'inputs_embeds',
|
|
27350
|
-
"attention_mask",
|
|
27351
|
-
"pixel_values",
|
|
27352
|
-
"position_ids",
|
|
27353
|
-
"past_key_values"
|
|
27354
|
-
];
|
|
27355
|
-
};
|
|
27356
|
-
var PaliGemmaForConditionalGeneration = class extends PaliGemmaPreTrainedModel {
|
|
27357
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
27358
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
27359
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
27360
|
-
return default_merge_input_ids_with_image_features({
|
|
27361
|
-
// @ts-ignore
|
|
27362
|
-
image_token_id: this.config.image_token_index,
|
|
27363
|
-
...kwargs,
|
|
27364
|
-
image_features: reshaped_image_hidden_states
|
|
27365
|
-
});
|
|
27366
|
-
}
|
|
28234
|
+
var PaliGemmaForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27367
28235
|
};
|
|
27368
28236
|
|
|
27369
28237
|
// src/models/parakeet/modeling_parakeet.js
|
|
@@ -27536,6 +28404,9 @@ var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
|
27536
28404
|
];
|
|
27537
28405
|
};
|
|
27538
28406
|
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
28407
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
28408
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
28409
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
27539
28410
|
image_grid_thw_name = "grid_thw";
|
|
27540
28411
|
/**
|
|
27541
28412
|
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
@@ -27725,7 +28596,7 @@ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
|
27725
28596
|
);
|
|
27726
28597
|
} else {
|
|
27727
28598
|
model_inputs.pixel_values = null;
|
|
27728
|
-
const past_length =
|
|
28599
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
27729
28600
|
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
27730
28601
|
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
27731
28602
|
model_inputs.input_ids,
|
|
@@ -27754,11 +28625,16 @@ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
|
27754
28625
|
return model_inputs;
|
|
27755
28626
|
}
|
|
27756
28627
|
};
|
|
28628
|
+
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
28629
|
+
};
|
|
27757
28630
|
|
|
27758
28631
|
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
27759
28632
|
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
27760
28633
|
image_grid_thw_name = "image_grid_thw";
|
|
27761
28634
|
};
|
|
28635
|
+
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
28636
|
+
image_grid_thw_name = "image_grid_thw";
|
|
28637
|
+
};
|
|
27762
28638
|
|
|
27763
28639
|
// src/models/qwen3/modeling_qwen3.js
|
|
27764
28640
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
@@ -27787,18 +28663,26 @@ var Qwen3NextForCausalLM = class extends Qwen3NextPreTrainedModel {
|
|
|
27787
28663
|
// src/models/qwen3_vl/modeling_qwen3_vl.js
|
|
27788
28664
|
var Qwen3VLForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
27789
28665
|
};
|
|
28666
|
+
var Qwen3VLForCausalLM = class extends Qwen2_5_VLForCausalLM {
|
|
28667
|
+
};
|
|
27790
28668
|
|
|
27791
28669
|
// src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js
|
|
27792
28670
|
var Qwen3VLMoeForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
27793
28671
|
};
|
|
28672
|
+
var Qwen3VLMoeForCausalLM = class extends Qwen3VLForCausalLM {
|
|
28673
|
+
};
|
|
27794
28674
|
|
|
27795
28675
|
// src/models/qwen3_5/modeling_qwen3_5.js
|
|
27796
28676
|
var Qwen3_5ForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
27797
28677
|
};
|
|
28678
|
+
var Qwen3_5ForCausalLM = class extends Qwen3_5ForConditionalGeneration {
|
|
28679
|
+
};
|
|
27798
28680
|
|
|
27799
28681
|
// src/models/qwen3_5_moe/modeling_qwen3_5_moe.js
|
|
27800
28682
|
var Qwen3_5MoeForConditionalGeneration = class extends Qwen3_5ForConditionalGeneration {
|
|
27801
28683
|
};
|
|
28684
|
+
var Qwen3_5MoeForCausalLM = class extends Qwen3_5ForCausalLM {
|
|
28685
|
+
};
|
|
27802
28686
|
|
|
27803
28687
|
// src/models/resnet/modeling_resnet.js
|
|
27804
28688
|
var ResNetPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -28479,25 +29363,6 @@ var TrOCRPreTrainedModel = class extends PreTrainedModel {
|
|
|
28479
29363
|
var TrOCRForCausalLM = class extends TrOCRPreTrainedModel {
|
|
28480
29364
|
};
|
|
28481
29365
|
|
|
28482
|
-
// src/models/ultravox/modeling_ultravox.js
|
|
28483
|
-
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
28484
|
-
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
28485
|
-
};
|
|
28486
|
-
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
28487
|
-
_merge_input_ids_with_audio_features(kwargs) {
|
|
28488
|
-
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
28489
|
-
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
28490
|
-
return default_merge_input_ids_with_audio_features({
|
|
28491
|
-
// @ts-ignore
|
|
28492
|
-
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id,
|
|
28493
|
-
...kwargs,
|
|
28494
|
-
audio_features: reshaped_audio_features
|
|
28495
|
-
});
|
|
28496
|
-
}
|
|
28497
|
-
};
|
|
28498
|
-
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
28499
|
-
};
|
|
28500
|
-
|
|
28501
29366
|
// src/models/unispeech/modeling_unispeech.js
|
|
28502
29367
|
var UniSpeechPreTrainedModel = class extends PreTrainedModel {
|
|
28503
29368
|
};
|
|
@@ -28663,6 +29528,170 @@ var VitsModel = class extends VitsPreTrainedModel {
|
|
|
28663
29528
|
}
|
|
28664
29529
|
};
|
|
28665
29530
|
|
|
29531
|
+
// src/models/voxtral/modeling_voxtral.js
|
|
29532
|
+
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
29533
|
+
};
|
|
29534
|
+
|
|
29535
|
+
// src/models/voxtral_realtime/modeling_voxtral_realtime.js
|
|
29536
|
+
var CONV1_LEFT_PAD = 2;
|
|
29537
|
+
var CONV2_LEFT_PAD = 1;
|
|
29538
|
+
var states = /* @__PURE__ */ new WeakMap();
|
|
29539
|
+
function createEncoderState(model, input_features) {
|
|
29540
|
+
const { text_config, audio_config } = (
|
|
29541
|
+
/** @type {any} */
|
|
29542
|
+
model.config
|
|
29543
|
+
);
|
|
29544
|
+
const encoder_session = model.sessions["audio_encoder"];
|
|
29545
|
+
const { num_mel_bins, hidden_size: enc_hidden_size } = audio_config;
|
|
29546
|
+
const PADDING_CACHE_CHANNELS = num_mel_bins + enc_hidden_size;
|
|
29547
|
+
const enc_kv_cache = new DynamicCache();
|
|
29548
|
+
const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? "float32";
|
|
29549
|
+
const enc_cls = enc_dtype === "float16" ? DataTypeMap.float16 : DataTypeMap.float32;
|
|
29550
|
+
const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
|
|
29551
|
+
for (const name in enc_shapes) {
|
|
29552
|
+
const size = enc_shapes[name].reduce((a, b) => a * b, 1);
|
|
29553
|
+
enc_kv_cache[name] = new Tensor3(enc_dtype, new enc_cls(size), enc_shapes[name]);
|
|
29554
|
+
}
|
|
29555
|
+
const enc_padding_cache = new Tensor3(enc_dtype, new enc_cls(PADDING_CACHE_CHANNELS * CONV1_LEFT_PAD), [
|
|
29556
|
+
1,
|
|
29557
|
+
PADDING_CACHE_CHANNELS,
|
|
29558
|
+
CONV1_LEFT_PAD
|
|
29559
|
+
]);
|
|
29560
|
+
const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
|
|
29561
|
+
if (!chunks_iter) {
|
|
29562
|
+
throw new Error("input_features must be iterable or async iterable");
|
|
29563
|
+
}
|
|
29564
|
+
return {
|
|
29565
|
+
encoder_session,
|
|
29566
|
+
enc_kv_cache,
|
|
29567
|
+
enc_padding_cache,
|
|
29568
|
+
enc_past_seq_len: 0,
|
|
29569
|
+
audio_embed_queue: [],
|
|
29570
|
+
audio_embed_total_tokens: 0,
|
|
29571
|
+
audio_queue_offset: 0,
|
|
29572
|
+
audio_consumed: 0,
|
|
29573
|
+
stream_exhausted: false,
|
|
29574
|
+
chunks_iter,
|
|
29575
|
+
text_hidden_size: text_config.hidden_size
|
|
29576
|
+
};
|
|
29577
|
+
}
|
|
29578
|
+
async function encodeChunk(s, chunk_features) {
|
|
29579
|
+
const audio_seq_len = chunk_features.dims[2];
|
|
29580
|
+
const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1;
|
|
29581
|
+
const position_ids = new Tensor3(
|
|
29582
|
+
"int64",
|
|
29583
|
+
BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
|
|
29584
|
+
[1, conv2_output_len]
|
|
29585
|
+
);
|
|
29586
|
+
const total_seq_len = s.enc_past_seq_len + conv2_output_len;
|
|
29587
|
+
const attention_mask = ones([1, total_seq_len]);
|
|
29588
|
+
const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
|
|
29589
|
+
input_features: chunk_features,
|
|
29590
|
+
attention_mask,
|
|
29591
|
+
position_ids,
|
|
29592
|
+
past_padding_cache: s.enc_padding_cache,
|
|
29593
|
+
...s.enc_kv_cache
|
|
29594
|
+
});
|
|
29595
|
+
if (s.enc_padding_cache.location === "gpu-buffer") {
|
|
29596
|
+
s.enc_padding_cache.dispose();
|
|
29597
|
+
}
|
|
29598
|
+
s.enc_padding_cache = present_padding_cache;
|
|
29599
|
+
for (const name in present_cache) {
|
|
29600
|
+
if (name.startsWith("present.")) {
|
|
29601
|
+
const pastName = name.replace("present", "past_key_values");
|
|
29602
|
+
const prev = s.enc_kv_cache[pastName];
|
|
29603
|
+
if (prev?.location === "gpu-buffer") {
|
|
29604
|
+
prev.dispose();
|
|
29605
|
+
}
|
|
29606
|
+
s.enc_kv_cache[pastName] = present_cache[name];
|
|
29607
|
+
}
|
|
29608
|
+
}
|
|
29609
|
+
s.enc_past_seq_len = total_seq_len;
|
|
29610
|
+
return audio_embeds;
|
|
29611
|
+
}
|
|
29612
|
+
async function fillAudioBuffer(s, needed) {
|
|
29613
|
+
while (s.audio_embed_total_tokens < needed && !s.stream_exhausted) {
|
|
29614
|
+
const result = await s.chunks_iter.next();
|
|
29615
|
+
if (result.done) {
|
|
29616
|
+
s.stream_exhausted = true;
|
|
29617
|
+
break;
|
|
29618
|
+
}
|
|
29619
|
+
const new_embeds = await encodeChunk(s, result.value);
|
|
29620
|
+
s.audio_embed_queue.push({ data: new_embeds.data, tokens: new_embeds.dims[1] });
|
|
29621
|
+
s.audio_embed_total_tokens += new_embeds.dims[1];
|
|
29622
|
+
}
|
|
29623
|
+
}
|
|
29624
|
+
function addAudioEmbeddings(s, inputs_embeds, current_len) {
|
|
29625
|
+
if (s.audio_embed_queue.length === 0) return;
|
|
29626
|
+
const embed_data = inputs_embeds.data;
|
|
29627
|
+
let embed_write_pos = 0;
|
|
29628
|
+
let remaining = current_len;
|
|
29629
|
+
while (remaining > 0 && s.audio_embed_queue.length > 0) {
|
|
29630
|
+
const front = s.audio_embed_queue[0];
|
|
29631
|
+
const available = front.tokens - s.audio_queue_offset;
|
|
29632
|
+
const n = Math.min(remaining, available);
|
|
29633
|
+
const src_offset = s.audio_queue_offset * s.text_hidden_size;
|
|
29634
|
+
for (let i = 0; i < n * s.text_hidden_size; ++i) {
|
|
29635
|
+
embed_data[embed_write_pos * s.text_hidden_size + i] += front.data[src_offset + i];
|
|
29636
|
+
}
|
|
29637
|
+
embed_write_pos += n;
|
|
29638
|
+
remaining -= n;
|
|
29639
|
+
s.audio_queue_offset += n;
|
|
29640
|
+
if (s.audio_queue_offset >= front.tokens) {
|
|
29641
|
+
s.audio_embed_queue.shift();
|
|
29642
|
+
s.audio_queue_offset = 0;
|
|
29643
|
+
}
|
|
29644
|
+
}
|
|
29645
|
+
s.audio_consumed += current_len - remaining;
|
|
29646
|
+
}
|
|
29647
|
+
var AudioExhaustedCriteria = class extends StoppingCriteria {
|
|
29648
|
+
constructor(enc_state) {
|
|
29649
|
+
super();
|
|
29650
|
+
this._s = enc_state;
|
|
29651
|
+
}
|
|
29652
|
+
_call(input_ids) {
|
|
29653
|
+
const done = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
|
|
29654
|
+
return input_ids.map(() => done);
|
|
29655
|
+
}
|
|
29656
|
+
};
|
|
29657
|
+
var VoxtralRealtimePreTrainedModel = class extends PreTrainedModel {
|
|
29658
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "past_key_values"];
|
|
29659
|
+
};
|
|
29660
|
+
var VoxtralRealtimeForConditionalGeneration = class extends VoxtralRealtimePreTrainedModel {
|
|
29661
|
+
async forward({ input_ids, past_key_values, ...kwargs }) {
|
|
29662
|
+
const current_len = input_ids.dims[1];
|
|
29663
|
+
const enc = states.get(this);
|
|
29664
|
+
if (enc) {
|
|
29665
|
+
await fillAudioBuffer(enc, enc.audio_consumed + current_len);
|
|
29666
|
+
}
|
|
29667
|
+
const { inputs_embeds } = await sessionRun(this.sessions["embed_tokens"], { input_ids });
|
|
29668
|
+
if (enc) {
|
|
29669
|
+
addAudioEmbeddings(enc, inputs_embeds, current_len);
|
|
29670
|
+
}
|
|
29671
|
+
const decoder_feeds = { inputs_embeds, ...kwargs };
|
|
29672
|
+
this.addPastKeyValues(decoder_feeds, past_key_values);
|
|
29673
|
+
const session = this.sessions["decoder_model_merged"];
|
|
29674
|
+
const fixed = pick(decoder_feeds, session.inputNames);
|
|
29675
|
+
return await sessionRun(session, fixed);
|
|
29676
|
+
}
|
|
29677
|
+
async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
|
|
29678
|
+
if (!input_features) {
|
|
29679
|
+
throw new Error("input_features (generator/iterable) must be provided");
|
|
29680
|
+
}
|
|
29681
|
+
const enc_state = createEncoderState(this, input_features);
|
|
29682
|
+
states.set(this, enc_state);
|
|
29683
|
+
const stopping_criteria = new StoppingCriteriaList();
|
|
29684
|
+
stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
|
|
29685
|
+
if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);
|
|
29686
|
+
try {
|
|
29687
|
+
return await super.generate({ ...kwargs, stopping_criteria });
|
|
29688
|
+
} finally {
|
|
29689
|
+
enc_state.enc_kv_cache.dispose();
|
|
29690
|
+
states.delete(this);
|
|
29691
|
+
}
|
|
29692
|
+
}
|
|
29693
|
+
};
|
|
29694
|
+
|
|
28666
29695
|
// src/models/wav2vec2_bert/modeling_wav2vec2_bert.js
|
|
28667
29696
|
var Wav2Vec2BertPreTrainedModel = class extends PreTrainedModel {
|
|
28668
29697
|
};
|
|
@@ -29416,6 +30445,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29416
30445
|
["gemma2", "Gemma2ForCausalLM"],
|
|
29417
30446
|
["vaultgemma", "VaultGemmaForCausalLM"],
|
|
29418
30447
|
["gemma3_text", "Gemma3ForCausalLM"],
|
|
30448
|
+
["gemma3", "Gemma3ForCausalLM"],
|
|
29419
30449
|
["helium", "HeliumForCausalLM"],
|
|
29420
30450
|
["glm", "GlmForCausalLM"],
|
|
29421
30451
|
["openelm", "OpenELMForCausalLM"],
|
|
@@ -29424,6 +30454,13 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29424
30454
|
["qwen3", "Qwen3ForCausalLM"],
|
|
29425
30455
|
["qwen3_moe", "Qwen3MoeForCausalLM"],
|
|
29426
30456
|
["qwen3_next", "Qwen3NextForCausalLM"],
|
|
30457
|
+
["qwen2_vl", "Qwen2VLForCausalLM"],
|
|
30458
|
+
["qwen2_5_vl", "Qwen2_5_VLForCausalLM"],
|
|
30459
|
+
["qwen3_vl", "Qwen3VLForCausalLM"],
|
|
30460
|
+
["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
|
|
30461
|
+
["qwen3_5", "Qwen3_5ForCausalLM"],
|
|
30462
|
+
["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
|
|
30463
|
+
["gemma3n", "Gemma3nForCausalLM"],
|
|
29427
30464
|
["phi", "PhiForCausalLM"],
|
|
29428
30465
|
["phi3", "Phi3ForCausalLM"],
|
|
29429
30466
|
["mpt", "MptForCausalLM"],
|
|
@@ -29499,6 +30536,7 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29499
30536
|
["qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"],
|
|
29500
30537
|
["qwen3_5", "Qwen3_5ForConditionalGeneration"],
|
|
29501
30538
|
["qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"],
|
|
30539
|
+
["lfm2_vl", "Lfm2VlForConditionalGeneration"],
|
|
29502
30540
|
["idefics3", "Idefics3ForConditionalGeneration"],
|
|
29503
30541
|
["smolvlm", "SmolVLMForConditionalGeneration"],
|
|
29504
30542
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
@@ -29507,8 +30545,10 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29507
30545
|
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
29508
30546
|
]);
|
|
29509
30547
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30548
|
+
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
29510
30549
|
["ultravox", "UltravoxModel"],
|
|
29511
|
-
["voxtral", "VoxtralForConditionalGeneration"]
|
|
30550
|
+
["voxtral", "VoxtralForConditionalGeneration"],
|
|
30551
|
+
["voxtral_realtime", "VoxtralRealtimeForConditionalGeneration"]
|
|
29512
30552
|
]);
|
|
29513
30553
|
var MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29514
30554
|
["vision-encoder-decoder", "VisionEncoderDecoderModel"]
|
|
@@ -29691,7 +30731,19 @@ var CUSTOM_MAPPING = [
|
|
|
29691
30731
|
MODEL_TYPES.ImageAudioTextToText
|
|
29692
30732
|
],
|
|
29693
30733
|
["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
|
|
29694
|
-
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox]
|
|
30734
|
+
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
|
|
30735
|
+
["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30736
|
+
["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30737
|
+
["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30738
|
+
["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30739
|
+
["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30740
|
+
["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30741
|
+
["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30742
|
+
[
|
|
30743
|
+
"VoxtralRealtimeForConditionalGeneration",
|
|
30744
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
30745
|
+
MODEL_TYPES.VoxtralRealtime
|
|
30746
|
+
]
|
|
29695
30747
|
];
|
|
29696
30748
|
for (const [name, model, type] of CUSTOM_MAPPING) {
|
|
29697
30749
|
MODEL_TYPE_MAPPING.set(name, type);
|
|
@@ -31369,8 +32421,18 @@ var TASK_ALIASES = Object.freeze({
|
|
|
31369
32421
|
});
|
|
31370
32422
|
|
|
31371
32423
|
// src/utils/model_registry/get_model_files.js
|
|
32424
|
+
function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
|
|
32425
|
+
if (config !== null) {
|
|
32426
|
+
return AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision });
|
|
32427
|
+
}
|
|
32428
|
+
const key = JSON.stringify([modelId, cache_dir, local_files_only, revision]);
|
|
32429
|
+
return memoizePromise(
|
|
32430
|
+
key,
|
|
32431
|
+
() => AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision })
|
|
32432
|
+
);
|
|
32433
|
+
}
|
|
31372
32434
|
async function get_model_files(modelId, { config = null, dtype: overrideDtype = null, device: overrideDevice = null, model_file_name = null } = {}) {
|
|
31373
|
-
config = await
|
|
32435
|
+
config = await get_config(modelId, { config });
|
|
31374
32436
|
const files = [
|
|
31375
32437
|
// Add config.json (always loaded)
|
|
31376
32438
|
"config.json"
|
|
@@ -31431,74 +32493,14 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
|
|
|
31431
32493
|
files.push(dataFilePath);
|
|
31432
32494
|
}
|
|
31433
32495
|
};
|
|
31434
|
-
const
|
|
31435
|
-
|
|
31436
|
-
add_model_file(
|
|
31437
|
-
|
|
31438
|
-
|
|
31439
|
-
|
|
31440
|
-
|
|
31441
|
-
|
|
31442
|
-
add_model_file("decoder_model_merged");
|
|
31443
|
-
files.push("generation_config.json");
|
|
31444
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
31445
|
-
add_model_file("model", "vision_encoder");
|
|
31446
|
-
add_model_file("prompt_encoder_mask_decoder");
|
|
31447
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
31448
|
-
add_model_file("model", "encoder_model");
|
|
31449
|
-
add_model_file("decoder_model_merged");
|
|
31450
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
31451
|
-
add_model_file("embed_tokens");
|
|
31452
|
-
add_model_file("vision_encoder");
|
|
31453
|
-
add_model_file("decoder_model_merged");
|
|
31454
|
-
if (config.is_encoder_decoder) {
|
|
31455
|
-
add_model_file("model", "encoder_model");
|
|
31456
|
-
}
|
|
31457
|
-
files.push("generation_config.json");
|
|
31458
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
31459
|
-
add_model_file("embed_tokens");
|
|
31460
|
-
add_model_file("audio_encoder");
|
|
31461
|
-
add_model_file("decoder_model_merged");
|
|
31462
|
-
files.push("generation_config.json");
|
|
31463
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
31464
|
-
add_model_file("embed_tokens");
|
|
31465
|
-
add_model_file("audio_encoder");
|
|
31466
|
-
add_model_file("vision_encoder");
|
|
31467
|
-
add_model_file("decoder_model_merged");
|
|
31468
|
-
files.push("generation_config.json");
|
|
31469
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
31470
|
-
add_model_file("model", "text_encoder");
|
|
31471
|
-
add_model_file("decoder_model_merged");
|
|
31472
|
-
add_model_file("encodec_decode");
|
|
31473
|
-
files.push("generation_config.json");
|
|
31474
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
31475
|
-
add_model_file("prepare_inputs_embeds");
|
|
31476
|
-
add_model_file("model", "language_model");
|
|
31477
|
-
add_model_file("lm_head");
|
|
31478
|
-
add_model_file("gen_head");
|
|
31479
|
-
add_model_file("gen_img_embeds");
|
|
31480
|
-
add_model_file("image_decode");
|
|
31481
|
-
files.push("generation_config.json");
|
|
31482
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
31483
|
-
add_model_file("prepare_inputs_embeds");
|
|
31484
|
-
add_model_file("model");
|
|
31485
|
-
add_model_file("vision_encoder");
|
|
31486
|
-
files.push("generation_config.json");
|
|
31487
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
31488
|
-
add_model_file("embed_tokens");
|
|
31489
|
-
add_model_file("speech_encoder");
|
|
31490
|
-
add_model_file("model", "language_model");
|
|
31491
|
-
add_model_file("conditional_decoder");
|
|
31492
|
-
files.push("generation_config.json");
|
|
31493
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
31494
|
-
add_model_file("encoder_model");
|
|
31495
|
-
add_model_file("decoder_model");
|
|
31496
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
31497
|
-
add_model_file("text_encoder");
|
|
31498
|
-
add_model_file("latent_denoiser");
|
|
31499
|
-
add_model_file("voice_decoder");
|
|
31500
|
-
} else {
|
|
31501
|
-
add_model_file("model", singleModelName);
|
|
32496
|
+
const { sessions, optional_configs } = getSessionsConfig(modelType, config, { model_file_name });
|
|
32497
|
+
for (const [sessionKey, baseName] of Object.entries(sessions)) {
|
|
32498
|
+
add_model_file(sessionKey, baseName);
|
|
32499
|
+
}
|
|
32500
|
+
if (optional_configs) {
|
|
32501
|
+
for (const configFile of Object.values(optional_configs)) {
|
|
32502
|
+
files.push(configFile);
|
|
32503
|
+
}
|
|
31502
32504
|
}
|
|
31503
32505
|
return files;
|
|
31504
32506
|
}
|
|
@@ -31949,25 +32951,25 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
|
|
|
31949
32951
|
|
|
31950
32952
|
// src/utils/model_registry/is_cached.js
|
|
31951
32953
|
async function check_files_cache(modelId, files, options = {}) {
|
|
31952
|
-
const
|
|
31953
|
-
if (!
|
|
32954
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
32955
|
+
if (!cache2) {
|
|
31954
32956
|
const fileStatuses2 = files.map((filename) => ({ file: filename, cached: false }));
|
|
31955
32957
|
return { allCached: false, files: fileStatuses2 };
|
|
31956
32958
|
}
|
|
31957
32959
|
const fileStatuses = await Promise.all(
|
|
31958
32960
|
files.map(async (filename) => {
|
|
31959
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
31960
|
-
const cached = await checkCachedResource(
|
|
32961
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
32962
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
31961
32963
|
return { file: filename, cached: !!cached };
|
|
31962
32964
|
})
|
|
31963
32965
|
);
|
|
31964
32966
|
return { allCached: fileStatuses.every((f) => f.cached), files: fileStatuses };
|
|
31965
32967
|
}
|
|
31966
32968
|
async function is_file_cached(modelId, filename, options = {}) {
|
|
31967
|
-
const
|
|
31968
|
-
if (!
|
|
31969
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
31970
|
-
return !!await checkCachedResource(
|
|
32969
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
32970
|
+
if (!cache2) return false;
|
|
32971
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
32972
|
+
return !!await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
31971
32973
|
}
|
|
31972
32974
|
async function is_cached(modelId, options = {}) {
|
|
31973
32975
|
if (!modelId) {
|
|
@@ -32014,26 +33016,26 @@ async function is_pipeline_cached_files(task, modelId, options = {}) {
|
|
|
32014
33016
|
|
|
32015
33017
|
// src/utils/model_registry/clear_cache.js
|
|
32016
33018
|
async function clear_files_from_cache(modelId, files, options = {}) {
|
|
32017
|
-
const
|
|
32018
|
-
if (!
|
|
33019
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
33020
|
+
if (!cache2) {
|
|
32019
33021
|
return {
|
|
32020
33022
|
filesDeleted: 0,
|
|
32021
33023
|
filesCached: 0,
|
|
32022
33024
|
files: files.map((filename) => ({ file: filename, deleted: false, wasCached: false }))
|
|
32023
33025
|
};
|
|
32024
33026
|
}
|
|
32025
|
-
if (!
|
|
33027
|
+
if (!cache2.delete) {
|
|
32026
33028
|
throw new Error("Cache does not support delete operation");
|
|
32027
33029
|
}
|
|
32028
33030
|
const results = await Promise.all(
|
|
32029
33031
|
files.map(async (filename) => {
|
|
32030
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
32031
|
-
const cached = await checkCachedResource(
|
|
33032
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
33033
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
32032
33034
|
const wasCached = !!cached;
|
|
32033
33035
|
let deleted = false;
|
|
32034
33036
|
if (wasCached) {
|
|
32035
|
-
const deletedWithProposed = await
|
|
32036
|
-
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await
|
|
33037
|
+
const deletedWithProposed = await cache2.delete(proposedCacheKey);
|
|
33038
|
+
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache2.delete(localPath) : false;
|
|
32037
33039
|
deleted = deletedWithProposed || deletedWithLocal;
|
|
32038
33040
|
}
|
|
32039
33041
|
return { file: filename, deleted, wasCached };
|
|
@@ -32514,6 +33516,7 @@ export {
|
|
|
32514
33516
|
DonutImageProcessor,
|
|
32515
33517
|
DonutSwinModel,
|
|
32516
33518
|
DonutSwinPreTrainedModel,
|
|
33519
|
+
DynamicCache,
|
|
32517
33520
|
EdgeTamModel,
|
|
32518
33521
|
EfficientNetForImageClassification,
|
|
32519
33522
|
EfficientNetImageProcessor,
|
|
@@ -32586,6 +33589,7 @@ export {
|
|
|
32586
33589
|
Gemma3Model,
|
|
32587
33590
|
Gemma3PreTrainedModel,
|
|
32588
33591
|
Gemma3nAudioFeatureExtractor,
|
|
33592
|
+
Gemma3nForCausalLM,
|
|
32589
33593
|
Gemma3nForConditionalGeneration,
|
|
32590
33594
|
Gemma3nPreTrainedModel,
|
|
32591
33595
|
Gemma3nProcessor,
|
|
@@ -32605,6 +33609,9 @@ export {
|
|
|
32605
33609
|
GraniteMoeHybridModel,
|
|
32606
33610
|
GraniteMoeHybridPreTrainedModel,
|
|
32607
33611
|
GranitePreTrainedModel,
|
|
33612
|
+
GraniteSpeechFeatureExtractor,
|
|
33613
|
+
GraniteSpeechForConditionalGeneration,
|
|
33614
|
+
GraniteSpeechProcessor,
|
|
32608
33615
|
GroundingDinoForObjectDetection,
|
|
32609
33616
|
GroundingDinoImageProcessor,
|
|
32610
33617
|
GroundingDinoPreTrainedModel,
|
|
@@ -32630,7 +33637,6 @@ export {
|
|
|
32630
33637
|
IJepaPreTrainedModel,
|
|
32631
33638
|
Idefics3ForConditionalGeneration,
|
|
32632
33639
|
Idefics3ImageProcessor,
|
|
32633
|
-
Idefics3PreTrainedModel,
|
|
32634
33640
|
Idefics3Processor,
|
|
32635
33641
|
ImageClassificationPipeline,
|
|
32636
33642
|
ImageFeatureExtractionPipeline,
|
|
@@ -32655,6 +33661,9 @@ export {
|
|
|
32655
33661
|
Lfm2MoeModel,
|
|
32656
33662
|
Lfm2MoePreTrainedModel,
|
|
32657
33663
|
Lfm2PreTrainedModel,
|
|
33664
|
+
Lfm2VlForConditionalGeneration,
|
|
33665
|
+
Lfm2VlImageProcessor,
|
|
33666
|
+
Lfm2VlProcessor,
|
|
32658
33667
|
LiteWhisperForConditionalGeneration,
|
|
32659
33668
|
Llama4ForCausalLM,
|
|
32660
33669
|
Llama4PreTrainedModel,
|
|
@@ -32838,7 +33847,6 @@ export {
|
|
|
32838
33847
|
Owlv2Model,
|
|
32839
33848
|
Owlv2PreTrainedModel,
|
|
32840
33849
|
PaliGemmaForConditionalGeneration,
|
|
32841
|
-
PaliGemmaPreTrainedModel,
|
|
32842
33850
|
PaliGemmaProcessor,
|
|
32843
33851
|
ParakeetFeatureExtractor,
|
|
32844
33852
|
ParakeetForCTC,
|
|
@@ -32882,10 +33890,12 @@ export {
|
|
|
32882
33890
|
Qwen2MoePreTrainedModel,
|
|
32883
33891
|
Qwen2PreTrainedModel,
|
|
32884
33892
|
Qwen2Tokenizer,
|
|
33893
|
+
Qwen2VLForCausalLM,
|
|
32885
33894
|
Qwen2VLForConditionalGeneration,
|
|
32886
33895
|
Qwen2VLImageProcessor,
|
|
32887
33896
|
Qwen2VLPreTrainedModel,
|
|
32888
33897
|
Qwen2VLProcessor,
|
|
33898
|
+
Qwen2_5_VLForCausalLM,
|
|
32889
33899
|
Qwen2_5_VLForConditionalGeneration,
|
|
32890
33900
|
Qwen2_5_VLProcessor,
|
|
32891
33901
|
Qwen3ForCausalLM,
|
|
@@ -32897,10 +33907,14 @@ export {
|
|
|
32897
33907
|
Qwen3NextModel,
|
|
32898
33908
|
Qwen3NextPreTrainedModel,
|
|
32899
33909
|
Qwen3PreTrainedModel,
|
|
33910
|
+
Qwen3VLForCausalLM,
|
|
32900
33911
|
Qwen3VLForConditionalGeneration,
|
|
33912
|
+
Qwen3VLMoeForCausalLM,
|
|
32901
33913
|
Qwen3VLMoeForConditionalGeneration,
|
|
32902
33914
|
Qwen3VLProcessor,
|
|
33915
|
+
Qwen3_5ForCausalLM,
|
|
32903
33916
|
Qwen3_5ForConditionalGeneration,
|
|
33917
|
+
Qwen3_5MoeForCausalLM,
|
|
32904
33918
|
Qwen3_5MoeForConditionalGeneration,
|
|
32905
33919
|
RFDetrForObjectDetection,
|
|
32906
33920
|
RFDetrModel,
|
|
@@ -32972,7 +33986,6 @@ export {
|
|
|
32972
33986
|
SmolLM3ForCausalLM,
|
|
32973
33987
|
SmolLM3Model,
|
|
32974
33988
|
SmolLM3PreTrainedModel,
|
|
32975
|
-
SmolVLMForConditionalGeneration,
|
|
32976
33989
|
Idefics3ImageProcessor as SmolVLMImageProcessor,
|
|
32977
33990
|
Idefics3Processor as SmolVLMProcessor,
|
|
32978
33991
|
SnacDecoderModel,
|
|
@@ -33078,6 +34091,10 @@ export {
|
|
|
33078
34091
|
VitsTokenizer,
|
|
33079
34092
|
VoxtralForConditionalGeneration,
|
|
33080
34093
|
VoxtralProcessor,
|
|
34094
|
+
VoxtralRealtimeFeatureExtractor,
|
|
34095
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
34096
|
+
VoxtralRealtimePreTrainedModel,
|
|
34097
|
+
VoxtralRealtimeProcessor,
|
|
33081
34098
|
Wav2Vec2BertForCTC,
|
|
33082
34099
|
Wav2Vec2BertForSequenceClassification,
|
|
33083
34100
|
Wav2Vec2BertModel,
|
|
@@ -33173,7 +34190,7 @@ export {
|
|
|
33173
34190
|
|
|
33174
34191
|
onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
|
|
33175
34192
|
(*!
|
|
33176
|
-
* ONNX Runtime Web v1.25.0-dev.
|
|
34193
|
+
* ONNX Runtime Web v1.25.0-dev.20260307-d626b568e0
|
|
33177
34194
|
* Copyright (c) Microsoft Corporation. All rights reserved.
|
|
33178
34195
|
* Licensed under the MIT License.
|
|
33179
34196
|
*)
|