@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +1587 -570
- package/dist/transformers.min.js +17 -17
- package/dist/transformers.node.cjs +1605 -573
- package/dist/transformers.node.min.cjs +21 -21
- package/dist/transformers.node.min.mjs +21 -21
- package/dist/transformers.node.mjs +1600 -583
- package/dist/transformers.web.js +1592 -575
- package/dist/transformers.web.min.js +15 -15
- package/package.json +3 -3
- package/src/cache_utils.js +62 -0
- package/src/configs.js +17 -2
- package/src/env.js +8 -1
- package/src/image_processors_utils.js +3 -3
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +1 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/modeling_utils.js +222 -308
- package/src/models/models.js +4 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +3 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +7 -7
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +2 -1
- package/src/models/registry.js +25 -0
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +5 -0
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/get_file_metadata.js +14 -2
- package/src/utils/model_registry/get_model_files.js +52 -78
- package/src/utils/tensor.js +18 -2
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +8 -0
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +17 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/modeling_utils.d.ts +44 -35
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +4 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +2 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +3 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
|
@@ -14,7 +14,7 @@ var __export = (target, all) => {
|
|
|
14
14
|
import fs from "fs";
|
|
15
15
|
import path from "path";
|
|
16
16
|
import url from "url";
|
|
17
|
-
var VERSION = "4.0.0-next.6";
|
|
17
|
+
var VERSION = "4.0.0-next.7";
|
|
18
18
|
var HAS_SELF = typeof self !== "undefined";
|
|
19
19
|
var IS_FS_AVAILABLE = !isEmpty(fs);
|
|
20
20
|
var IS_PATH_AVAILABLE = !isEmpty(path);
|
|
@@ -142,6 +142,7 @@ var env = {
|
|
|
142
142
|
customCache: null,
|
|
143
143
|
useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
|
|
144
144
|
cacheKey: "transformers-cache",
|
|
145
|
+
experimental_useCrossOriginStorage: false,
|
|
145
146
|
/////////////////// Custom fetch /////////////////////
|
|
146
147
|
fetch: DEFAULT_FETCH
|
|
147
148
|
//////////////////////////////////////////////////////
|
|
@@ -2692,7 +2693,7 @@ var Tokenizer = class {
|
|
|
2692
2693
|
};
|
|
2693
2694
|
var Tokenizer_default = Tokenizer;
|
|
2694
2695
|
|
|
2695
|
-
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.
|
|
2696
|
+
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.6/node_modules/@huggingface/jinja/dist/index.js
|
|
2696
2697
|
var TOKEN_TYPES = Object.freeze({
|
|
2697
2698
|
Text: "Text",
|
|
2698
2699
|
// The text between Jinja statements or expressions
|
|
@@ -4211,7 +4212,11 @@ var Environment = class {
|
|
|
4211
4212
|
["number", (operand) => operand instanceof IntegerValue || operand instanceof FloatValue],
|
|
4212
4213
|
["integer", (operand) => operand instanceof IntegerValue],
|
|
4213
4214
|
["iterable", (operand) => operand.type === "ArrayValue" || operand.type === "StringValue"],
|
|
4214
|
-
["mapping", (operand) => operand
|
|
4215
|
+
["mapping", (operand) => operand instanceof ObjectValue],
|
|
4216
|
+
[
|
|
4217
|
+
"sequence",
|
|
4218
|
+
(operand) => operand instanceof ArrayValue || operand instanceof ObjectValue || operand instanceof StringValue
|
|
4219
|
+
],
|
|
4215
4220
|
[
|
|
4216
4221
|
"lower",
|
|
4217
4222
|
(operand) => {
|
|
@@ -4484,6 +4489,9 @@ var Interpreter = class {
|
|
|
4484
4489
|
applyFilter(operand, filterNode, environment) {
|
|
4485
4490
|
if (filterNode.type === "Identifier") {
|
|
4486
4491
|
const filter = filterNode;
|
|
4492
|
+
if (filter.value === "safe") {
|
|
4493
|
+
return operand;
|
|
4494
|
+
}
|
|
4487
4495
|
if (filter.value === "tojson") {
|
|
4488
4496
|
return new StringValue(toJSON(operand, {}));
|
|
4489
4497
|
}
|
|
@@ -4573,6 +4581,8 @@ var Interpreter = class {
|
|
|
4573
4581
|
return new IntegerValue(Math.floor(operand.value));
|
|
4574
4582
|
case "float":
|
|
4575
4583
|
return new FloatValue(operand.value);
|
|
4584
|
+
case "string":
|
|
4585
|
+
return new StringValue(operand.toString());
|
|
4576
4586
|
default:
|
|
4577
4587
|
throw new Error(`Unknown NumericValue filter: ${filter.value}`);
|
|
4578
4588
|
}
|
|
@@ -6000,9 +6010,216 @@ function toAbsoluteURL(url2) {
|
|
|
6000
6010
|
return new URL(url2, baseURL).href;
|
|
6001
6011
|
}
|
|
6002
6012
|
|
|
6013
|
+
// src/utils/cache/CrossOriginStorageCache.js
|
|
6014
|
+
var HASH_ALGORITHM = "SHA-256";
var HASH_CACHE_NAME = "experimental_transformers-hash-cache";
// Builds the `{ algorithm, value }` descriptor passed to `requestFileHandles`.
var makeHashDescriptor = (value) => ({ algorithm: HASH_ALGORITHM, value });
var CrossOriginStorage = class {
  /** @type {Promise<Cache> | null} */
  #hashCache = null;
  /**
   * Returns (and lazily opens) the hash cache, reusing the same promise across concurrent callers.
   * @returns {Promise<Cache>}
   */
  _getHashCache = () => {
    this.#hashCache ??= caches.open(HASH_CACHE_NAME);
    return this.#hashCache;
  };
  /**
   * Returns whether the `navigator.crossOriginStorage` API is available in the current environment.
   * @returns {boolean}
   */
  static isAvailable = () => typeof navigator !== "undefined" && "crossOriginStorage" in navigator;
  /**
   * Looks up a cached response for the given URL by resolving its SHA-256 hash and requesting
   * the corresponding file handle from cross-origin storage.
   *
   * Implements `CacheInterface.match`.
   *
   * @param {string} request The URL of the resource to look up.
   * @returns {Promise<Response|undefined>} The cached `Response`, or `undefined` if not found.
   */
  match = async (request) => {
    const hashValue = await this._getFileHash(request);
    if (!hashValue) {
      return undefined;
    }
    try {
      const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashValue)]);
      const blob = await handle.getFile();
      return new Response(blob, {
        headers: {
          "Content-Length": String(blob.size)
        }
      });
    } catch {
      // Any failure while requesting or reading the handle is reported as a cache miss.
      return undefined;
    }
  };
  /**
   * Stores a response in cross-origin storage, keyed by its SHA-256 hash.
   *
   * For LFS-backed URLs the hash is resolved cheaply via `_getFileHash` (which checks
   * `HASH_CACHE_NAME` first, then falls back to fetching the Git LFS pointer file)
   * without reading the response body a second time.
   *
   * For non-LFS resources the hash is unknown upfront. In that case the body is consumed
   * in the background: the stream is read to compute the content hash, the file is written
   * into cross-origin storage, and the computed hash is persisted to `HASH_CACHE_NAME`
   * so that future `match` calls can resolve the file without a network round-trip.
   *
   * Implements `CacheInterface.put`.
   *
   * @param {string} request The URL of the resource (used as the hash-cache key).
   * @param {Response} response The response whose body will be written to the cache.
   * @returns {Promise<void>}
   */
  put = async (request, response) => {
    const hashValue = await this._getFileHash(request);
    if (hashValue) {
      const blob = await response.blob();
      await this._storeBlobInCOS(blob, hashValue);
    } else {
      // Deliberately not awaited: `_processAndStore` swallows its own errors,
      // so this background task can never produce an unhandled rejection.
      void this._processAndStore(request, response.body);
    }
  };
  /**
   * Writes a blob into cross-origin storage using the given pre-computed hex hash string.
   *
   * @param {Blob} blob
   * @param {string} hashHex Hex-encoded SHA-256 hash of `blob`.
   * @returns {Promise<void>}
   */
  _storeBlobInCOS = async (blob, hashHex) => {
    const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashHex)], {
      create: true
    });
    const writableStream = await handle.createWritable();
    await writableStream.write(blob);
    await writableStream.close();
  };
  /**
   * Background task for non-LFS resources: consumes `stream`, computes the SHA-256 hash
   * of the resulting blob, stores it in cross-origin storage, and persists the computed
   * hash to `HASH_CACHE_NAME` keyed by `request` so future `match` calls can resolve the
   * file without a network round-trip.
   *
   * Called fire-and-forget from `put` — errors are swallowed so failures never surface to
   * the caller.
   *
   * @param {string} request The original resource URL.
   * @param {ReadableStream} stream The response body stream to consume.
   * @returns {Promise<void>}
   */
  _processAndStore = async (request, stream) => {
    try {
      const chunks = [];
      for await (const chunk2 of stream) {
        chunks.push(chunk2);
      }
      const blob = new Blob(chunks);
      const hashHex = await this._getBlobHash(blob);
      await this._storeBlobInCOS(blob, hashHex);
      try {
        const hashCache = await this._getHashCache();
        await hashCache.put(request, new Response(hashHex));
      } catch {
        // Best effort: the blob is already stored; only the URL -> hash mapping is missing.
      }
    } catch {
      // Best effort: a failed background store must never surface to the caller of `put`.
    }
  };
  /**
   * Deletes the cache entry for the given request.
   *
   * Removes the hash entry from `HASH_CACHE_NAME`. Note: cross-origin storage itself does not
   * expose a delete API, so only the local hash mapping is removed. For non-LFS URLs this
   * permanently prevents `match` from resolving the file. For LFS-backed URLs, `match` will
   * re-fetch the LFS pointer file on the next call and repopulate the hash cache automatically.
   *
   * Implements `CacheInterface.delete`.
   *
   * @param {string} request
   * @returns {Promise<boolean>} Resolves to `true` if the hash entry was deleted, `false` otherwise.
   */
  delete = async (request) => {
    try {
      const hashCache = await this._getHashCache();
      return await hashCache.delete(request);
    } catch {
      return false;
    }
  };
  /**
   * Resolves the SHA-256 hash for a given URL.
   *
   * Returns the cached hash immediately if one has been persisted to `HASH_CACHE_NAME`.
   * Otherwise falls back to `_getLfsFileHash` to retrieve the hash from the Hugging Face
   * LFS pointer file, persisting the result to `HASH_CACHE_NAME` for future lookups.
   *
   * Returns `null` if the hash cannot be determined (e.g. non-LFS URL with no cached entry),
   * or if any step of the lookup throws or rejects.
   *
   * @param {string} url The resource URL to resolve a hash for.
   * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
   */
  _getFileHash = async (url2) => {
    try {
      const hashCache = await this._getHashCache();
      const cached = await hashCache.match(url2);
      if (cached) {
        // `await` here so a rejected body read is handled by the `catch` below
        // instead of escaping as a rejection from this method.
        return await cached.text();
      }
      const hash = await this._getLfsFileHash(url2);
      if (hash) {
        await hashCache.put(url2, new Response(hash));
        return hash;
      }
      return null;
    } catch {
      return null;
    }
  };
  /**
   * Attempts to retrieve the SHA-256 hash for a Hugging Face resource URL from its raw
   * Git LFS pointer file.
   *
   * Only applicable to URLs containing `/resolve/` (i.e. Hugging Face resolved file URLs).
   * The `/resolve/` segment is rewritten to `/raw/` to fetch the LFS pointer directly.
   * Returns `null` for non-LFS URLs, for non-OK HTTP responses, or when the network request fails.
   *
   * @see https://huggingface.co/docs/hub/en/storage-backends#xet
   * @param {string} url The resolved Hugging Face URL of the resource.
   * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
   */
  _getLfsFileHash = async (url2) => {
    if (!url2.includes("/resolve/")) {
      return null;
    }
    const rawUrl = url2.replace("/resolve/", "/raw/");
    try {
      const response = await fetch(rawUrl);
      if (!response.ok) {
        // An error response cannot be an LFS pointer; skip reading its body.
        return null;
      }
      const text = await response.text();
      const match = text.match(/^oid sha256:([0-9a-f]+)$/m);
      return match ? match[1] : null;
    } catch {
      return null;
    }
  };
  /**
   * Computes the SHA-256 hash of a `Blob`'s contents.
   *
   * @param {Blob} blob The blob to hash.
   * @returns {Promise<string>} The lowercase hex-encoded SHA-256 hash.
   */
  _getBlobHash = async (blob) => {
    const arrayBuffer = await blob.arrayBuffer();
    const hashBuffer = await crypto.subtle.digest(HASH_ALGORITHM, arrayBuffer);
    const hashArray = Array.from(new Uint8Array(hashBuffer));
    return hashArray.map((byte) => byte.toString(16).padStart(2, "0")).join("");
  };
};
|
|
6219
|
+
|
|
6003
6220
|
// src/utils/cache.js
|
|
6004
6221
|
async function getCache(file_cache_dir = null) {
|
|
6005
|
-
let
|
|
6222
|
+
let cache2 = null;
|
|
6006
6223
|
if (env.useCustomCache) {
|
|
6007
6224
|
if (!env.customCache) {
|
|
6008
6225
|
throw Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
|
|
@@ -6012,30 +6229,33 @@ async function getCache(file_cache_dir = null) {
|
|
|
6012
6229
|
"`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
|
|
6013
6230
|
);
|
|
6014
6231
|
}
|
|
6015
|
-
|
|
6232
|
+
cache2 = env.customCache;
|
|
6233
|
+
}
|
|
6234
|
+
if (!cache2 && env.experimental_useCrossOriginStorage && CrossOriginStorage.isAvailable()) {
|
|
6235
|
+
cache2 = new CrossOriginStorage();
|
|
6016
6236
|
}
|
|
6017
|
-
if (!
|
|
6237
|
+
if (!cache2 && env.useBrowserCache) {
|
|
6018
6238
|
if (typeof caches === "undefined") {
|
|
6019
6239
|
throw Error("Browser cache is not available in this environment.");
|
|
6020
6240
|
}
|
|
6021
6241
|
try {
|
|
6022
|
-
|
|
6242
|
+
cache2 = await caches.open(env.cacheKey);
|
|
6023
6243
|
} catch (e) {
|
|
6024
6244
|
logger.warn("An error occurred while opening the browser cache:", e);
|
|
6025
6245
|
}
|
|
6026
6246
|
}
|
|
6027
|
-
if (!
|
|
6247
|
+
if (!cache2 && env.useFSCache) {
|
|
6028
6248
|
if (!apis.IS_FS_AVAILABLE) {
|
|
6029
6249
|
throw Error("File System Cache is not available in this environment.");
|
|
6030
6250
|
}
|
|
6031
|
-
|
|
6251
|
+
cache2 = new FileCache(file_cache_dir ?? env.cacheDir);
|
|
6032
6252
|
}
|
|
6033
|
-
return
|
|
6253
|
+
return cache2;
|
|
6034
6254
|
}
|
|
6035
|
-
async function tryCache(cache, ...names) {
|
|
6255
|
+
async function tryCache(cache2, ...names) {
|
|
6036
6256
|
for (let name of names) {
|
|
6037
6257
|
try {
|
|
6038
|
-
let result = await
|
|
6258
|
+
let result = await cache2.match(name);
|
|
6039
6259
|
if (result) return result;
|
|
6040
6260
|
} catch (e) {
|
|
6041
6261
|
continue;
|
|
@@ -6044,6 +6264,83 @@ async function tryCache(cache, ...names) {
|
|
|
6044
6264
|
return void 0;
|
|
6045
6265
|
}
|
|
6046
6266
|
|
|
6267
|
+
// src/utils/lru_cache.js
|
|
6268
|
+
var LRUCache2 = class {
  /**
   * Maximum number of entries kept before the oldest one is evicted.
   * @type {number}
   */
  #limit;
  /**
   * Backing store; a Map's insertion order doubles as the recency order (oldest first).
   * @type {Map<any, any>}
   */
  #entries = /* @__PURE__ */ new Map();
  /**
   * Creates an LRUCache instance.
   * @param {number} capacity The maximum number of items the cache can hold.
   */
  constructor(capacity) {
    this.#limit = capacity;
  }
  /**
   * Looks up `key` and, on a hit, moves it to the most-recently-used position.
   * @param {any} key The key to retrieve.
   * @returns {any} The stored value, or undefined if the key is not present.
   */
  get(key) {
    const entries = this.#entries;
    if (entries.has(key)) {
      const hit = entries.get(key);
      // Re-insert so the key moves to the end of the Map's iteration order.
      entries.delete(key);
      entries.set(key, hit);
      return hit;
    }
    return undefined;
  }
  /**
   * Stores `value` under `key` as the most recently used entry.
   * When the cache grows past its capacity, the least recently used entry is dropped.
   * @param {any} key The key to add or update.
   * @param {any} value The value to associate with the key.
   */
  put(key, value) {
    const entries = this.#entries;
    // Deleting first is a no-op for new keys and refreshes recency for existing ones.
    entries.delete(key);
    entries.set(key, value);
    if (entries.size > this.#limit) {
      // The first key in iteration order is the least recently used one.
      const [oldest] = entries.keys();
      entries.delete(oldest);
    }
  }
  /**
   * Removes the entry for the given key from the cache.
   * @param {any} key The key to delete.
   * @returns {boolean} `true` if the entry existed and was removed, `false` otherwise.
   */
  delete(key) {
    return this.#entries.delete(key);
  }
  /**
   * Removes every entry from the cache.
   */
  clear() {
    this.#entries.clear();
  }
};
|
|
6324
|
+
|
|
6325
|
+
// src/utils/memoize_promise.js
|
|
6326
|
+
var MAX_CACHE_SIZE = 100;
var cache = new LRUCache2(MAX_CACHE_SIZE);
/**
 * Returns the promise already cached under `key`, or calls `factory` to create one,
 * caches it, and returns it. A promise that rejects removes its key from the cache,
 * so a later call with the same key invokes `factory` again.
 *
 * @param {any} key Cache key identifying the operation.
 * @param {() => Promise<any>} factory Called (synchronously) to start the operation on a cache miss.
 * @returns {Promise<any>} The cached or newly created promise.
 */
function memoizePromise(key, factory) {
  const existing = cache.get(key);
  if (existing === undefined) {
    const pending = factory().then(undefined, (reason) => {
      // Drop the failed entry so the next caller retries.
      cache.delete(key);
      return Promise.reject(reason);
    });
    cache.put(key, pending);
    return pending;
  }
  return existing;
}
|
|
6343
|
+
|
|
6047
6344
|
// src/utils/model_registry/get_file_metadata.js
|
|
6048
6345
|
async function fetch_file_head(urlOrPath) {
|
|
6049
6346
|
if (!isValidUrl(urlOrPath, ["http:", "https:"])) {
|
|
@@ -6051,17 +6348,27 @@ async function fetch_file_head(urlOrPath) {
|
|
|
6051
6348
|
}
|
|
6052
6349
|
const headers = getFetchHeaders(urlOrPath);
|
|
6053
6350
|
headers.set("Range", "bytes=0-0");
|
|
6054
|
-
return env.fetch(urlOrPath, { method: "GET", headers });
|
|
6351
|
+
return env.fetch(urlOrPath, { method: "GET", headers, cache: "no-store" });
|
|
6352
|
+
}
|
|
6353
|
+
/**
 * Memoized entry point for file-metadata lookups.
 *
 * Builds a key from the repo id/path, the filename and the `revision`, `cache_dir` and
 * `local_files_only` options, then delegates to `_get_file_metadata` through
 * `memoizePromise`.
 *
 * @param {string} path_or_repo_id
 * @param {string} filename
 * @param {Object} [options]
 * @returns {Promise<any>} The promise returned by `memoizePromise` for this key.
 */
function get_file_metadata(path_or_repo_id, filename, options = {}) {
  const { revision, cache_dir, local_files_only } = options ?? {};
  const memoKey = JSON.stringify([path_or_repo_id, filename, revision, cache_dir, local_files_only]);
  const load = () => _get_file_metadata(path_or_repo_id, filename, options);
  return memoizePromise(memoKey, load);
}
|
|
6056
|
-
async function
|
|
6057
|
-
const
|
|
6363
|
+
async function _get_file_metadata(path_or_repo_id, filename, options) {
|
|
6364
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6058
6365
|
const { localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6059
6366
|
path_or_repo_id,
|
|
6060
6367
|
filename,
|
|
6061
6368
|
options,
|
|
6062
|
-
|
|
6369
|
+
cache2
|
|
6063
6370
|
);
|
|
6064
|
-
const cachedResponse = await checkCachedResource(
|
|
6371
|
+
const cachedResponse = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6065
6372
|
if (cachedResponse !== void 0 && typeof cachedResponse !== "string") {
|
|
6066
6373
|
const size = cachedResponse.headers.get("content-length");
|
|
6067
6374
|
const contentType = cachedResponse.headers.get("content-type");
|
|
@@ -6159,7 +6466,7 @@ function getFetchHeaders(urlOrPath) {
|
|
|
6159
6466
|
}
|
|
6160
6467
|
return headers;
|
|
6161
6468
|
}
|
|
6162
|
-
function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = null) {
|
|
6469
|
+
function buildResourcePaths(path_or_repo_id, filename, options = {}, cache2 = null) {
|
|
6163
6470
|
const revision = options.revision ?? "main";
|
|
6164
6471
|
const requestURL = pathJoin(path_or_repo_id, filename);
|
|
6165
6472
|
const validModelId = isValidHfModelId(path_or_repo_id);
|
|
@@ -6169,7 +6476,7 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
6169
6476
|
env.remotePathTemplate.replaceAll("{model}", path_or_repo_id).replaceAll("{revision}", encodeURIComponent(revision)),
|
|
6170
6477
|
filename
|
|
6171
6478
|
);
|
|
6172
|
-
const proposedCacheKey =
|
|
6479
|
+
const proposedCacheKey = cache2 instanceof FileCache ? (
|
|
6173
6480
|
// Choose cache key for filesystem cache
|
|
6174
6481
|
// When using the main revision (default), we use the request URL as the cache key.
|
|
6175
6482
|
// If a specific revision is requested, we account for this in the cache key.
|
|
@@ -6183,14 +6490,14 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
6183
6490
|
validModelId
|
|
6184
6491
|
};
|
|
6185
6492
|
}
|
|
6186
|
-
async function checkCachedResource(
|
|
6187
|
-
if (!
|
|
6493
|
+
async function checkCachedResource(cache2, localPath, proposedCacheKey) {
|
|
6494
|
+
if (!cache2) {
|
|
6188
6495
|
return void 0;
|
|
6189
6496
|
}
|
|
6190
|
-
return await tryCache(
|
|
6497
|
+
return await tryCache(cache2, localPath, proposedCacheKey);
|
|
6191
6498
|
}
|
|
6192
|
-
async function storeCachedResource(path_or_repo_id, filename,
|
|
6193
|
-
if (await
|
|
6499
|
+
async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options = {}) {
|
|
6500
|
+
if (await cache2.match(cacheKey) !== void 0) {
|
|
6194
6501
|
return;
|
|
6195
6502
|
}
|
|
6196
6503
|
if (!result) {
|
|
@@ -6200,14 +6507,14 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
6200
6507
|
file: filename,
|
|
6201
6508
|
...data
|
|
6202
6509
|
}) : void 0;
|
|
6203
|
-
await
|
|
6510
|
+
await cache2.put(
|
|
6204
6511
|
cacheKey,
|
|
6205
6512
|
/** @type {Response} */
|
|
6206
6513
|
response,
|
|
6207
6514
|
wrapped_progress
|
|
6208
6515
|
);
|
|
6209
6516
|
} else if (typeof response !== "string") {
|
|
6210
|
-
await
|
|
6517
|
+
await cache2.put(
|
|
6211
6518
|
cacheKey,
|
|
6212
6519
|
new Response(
|
|
6213
6520
|
/** @type {any} */
|
|
@@ -6221,17 +6528,17 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
6221
6528
|
});
|
|
6222
6529
|
}
|
|
6223
6530
|
}
|
|
6224
|
-
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false,
|
|
6531
|
+
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache2 = null) {
|
|
6225
6532
|
const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6226
6533
|
path_or_repo_id,
|
|
6227
6534
|
filename,
|
|
6228
6535
|
options,
|
|
6229
|
-
|
|
6536
|
+
cache2
|
|
6230
6537
|
);
|
|
6231
6538
|
let cacheKey;
|
|
6232
6539
|
let toCacheResponse = false;
|
|
6233
6540
|
let response;
|
|
6234
|
-
response = await checkCachedResource(
|
|
6541
|
+
response = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6235
6542
|
const cacheHit = response !== void 0;
|
|
6236
6543
|
if (!cacheHit) {
|
|
6237
6544
|
if (env.allowLocalModels) {
|
|
@@ -6272,7 +6579,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6272
6579
|
}
|
|
6273
6580
|
cacheKey = proposedCacheKey;
|
|
6274
6581
|
}
|
|
6275
|
-
toCacheResponse =
|
|
6582
|
+
toCacheResponse = cache2 && // 1. A caching system is available
|
|
6276
6583
|
typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment)
|
|
6277
6584
|
response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`)
|
|
6278
6585
|
response.status === 200;
|
|
@@ -6334,7 +6641,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6334
6641
|
// i.e., do not cache FileResponses (prevents duplication)
|
|
6335
6642
|
toCacheResponse && cacheKey && typeof response !== "string"
|
|
6336
6643
|
) {
|
|
6337
|
-
await storeCachedResource(path_or_repo_id, filename,
|
|
6644
|
+
await storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options);
|
|
6338
6645
|
}
|
|
6339
6646
|
dispatchCallback(options.progress_callback, {
|
|
6340
6647
|
status: "done",
|
|
@@ -6350,7 +6657,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6350
6657
|
if (response instanceof FileResponse) {
|
|
6351
6658
|
return response.filePath;
|
|
6352
6659
|
}
|
|
6353
|
-
const cachedResponse = await
|
|
6660
|
+
const cachedResponse = await cache2?.match(cacheKey);
|
|
6354
6661
|
if (cachedResponse instanceof FileResponse) {
|
|
6355
6662
|
return cachedResponse.filePath;
|
|
6356
6663
|
} else if (cachedResponse instanceof Response) {
|
|
@@ -6377,8 +6684,8 @@ async function getModelFile(path_or_repo_id, filename, fatal = true, options = {
|
|
|
6377
6684
|
name: path_or_repo_id,
|
|
6378
6685
|
file: filename
|
|
6379
6686
|
});
|
|
6380
|
-
const
|
|
6381
|
-
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path,
|
|
6687
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6688
|
+
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache2);
|
|
6382
6689
|
}
|
|
6383
6690
|
async function getModelText(modelPath, fileName, fatal = true, options = {}) {
|
|
6384
6691
|
const buffer = await getModelFile(modelPath, fileName, fatal, options, false);
|
|
@@ -7171,7 +7478,7 @@ var uint16_to_float32 = /* @__PURE__ */ (function() {
|
|
|
7171
7478
|
// src/backends/onnx.js
|
|
7172
7479
|
import * as ONNX_NODE from "onnxruntime-node";
|
|
7173
7480
|
|
|
7174
|
-
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.
|
|
7481
|
+
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260307-d626b568e0/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
|
|
7175
7482
|
var ort_webgpu_bundle_min_exports = {};
|
|
7176
7483
|
__export(ort_webgpu_bundle_min_exports, {
|
|
7177
7484
|
InferenceSession: () => Jf,
|
|
@@ -7939,7 +8246,7 @@ async function ts(a = {}) {
|
|
|
7939
8246
|
throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
|
|
7940
8247
|
}
|
|
7941
8248
|
function Ye() {
|
|
7942
|
-
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii,
|
|
8249
|
+
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
|
|
7943
8250
|
}
|
|
7944
8251
|
async function bt() {
|
|
7945
8252
|
function e(o, u) {
|
|
@@ -9126,7 +9433,7 @@ async function ts(a = {}) {
|
|
|
9126
9433
|
Te(`invalid type for getValue: ${t}`);
|
|
9127
9434
|
}
|
|
9128
9435
|
}, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
|
|
9129
|
-
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = {
|
|
9436
|
+
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 925676: (e, t, n, o, u) => {
|
|
9130
9437
|
if (r === void 0 || !r.Uc) return 1;
|
|
9131
9438
|
if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
|
|
9132
9439
|
if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
|
|
@@ -9146,11 +9453,11 @@ async function ts(a = {}) {
|
|
|
9146
9453
|
} catch {
|
|
9147
9454
|
return 4;
|
|
9148
9455
|
}
|
|
9149
|
-
},
|
|
9456
|
+
}, 926500: (e, t, n) => {
|
|
9150
9457
|
r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
|
|
9151
|
-
},
|
|
9458
|
+
}, 926564: () => r.me(), 926606: (e) => {
|
|
9152
9459
|
r.jd(e);
|
|
9153
|
-
},
|
|
9460
|
+
}, 926643: () => typeof wasmOffsetConverter < "u" };
|
|
9154
9461
|
function af(e, t, n, o) {
|
|
9155
9462
|
var u = P();
|
|
9156
9463
|
try {
|
|
@@ -11066,7 +11373,7 @@ var $s = k(() => {
|
|
|
11066
11373
|
Ve();
|
|
11067
11374
|
Ve();
|
|
11068
11375
|
Ve();
|
|
11069
|
-
var Xa = "1.25.0-dev.
|
|
11376
|
+
var Xa = "1.25.0-dev.20260307-d626b568e0";
|
|
11070
11377
|
var Tl = Zr;
|
|
11071
11378
|
{
|
|
11072
11379
|
let a = ($s(), $t(Gs)).wasmBackend;
|
|
@@ -11077,11 +11384,11 @@ Object.defineProperty(K.versions, "web", { value: Xa, enumerable: true });
|
|
|
11077
11384
|
// src/backends/utils/cacheWasm.js
|
|
11078
11385
|
async function loadAndCacheFile(url2) {
|
|
11079
11386
|
const fileName = url2.split("/").pop();
|
|
11080
|
-
let
|
|
11387
|
+
let cache2;
|
|
11081
11388
|
try {
|
|
11082
|
-
|
|
11083
|
-
if (
|
|
11084
|
-
const result = await
|
|
11389
|
+
cache2 = await getCache();
|
|
11390
|
+
if (cache2) {
|
|
11391
|
+
const result = await cache2.match(url2);
|
|
11085
11392
|
if (result) {
|
|
11086
11393
|
return result;
|
|
11087
11394
|
}
|
|
@@ -11093,9 +11400,9 @@ async function loadAndCacheFile(url2) {
|
|
|
11093
11400
|
if (!response.ok) {
|
|
11094
11401
|
throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
|
|
11095
11402
|
}
|
|
11096
|
-
if (
|
|
11403
|
+
if (cache2) {
|
|
11097
11404
|
try {
|
|
11098
|
-
await
|
|
11405
|
+
await cache2.put(url2, response.clone());
|
|
11099
11406
|
} catch (e) {
|
|
11100
11407
|
logger.warn(`Failed to cache ${fileName}:`, e);
|
|
11101
11408
|
}
|
|
@@ -12947,9 +13254,23 @@ var Tensor2 = class _Tensor {
|
|
|
12947
13254
|
throw Error(`Unsupported norm: ${p}`);
|
|
12948
13255
|
}
|
|
12949
13256
|
const this_data = this.data;
|
|
12950
|
-
const
|
|
13257
|
+
const is_bigint = this_data instanceof BigInt64Array || this_data instanceof BigUint64Array;
|
|
13258
|
+
if (is_bigint && p !== 1) {
|
|
13259
|
+
throw Error(`Expected a floating point tensor as input. Got ${this.type}`);
|
|
13260
|
+
}
|
|
13261
|
+
let fn2, zero;
|
|
13262
|
+
if (is_bigint) {
|
|
13263
|
+
fn2 = (a, b) => a + b;
|
|
13264
|
+
zero = 0n;
|
|
13265
|
+
} else {
|
|
13266
|
+
fn2 = (a, b) => a + b ** p;
|
|
13267
|
+
zero = 0;
|
|
13268
|
+
}
|
|
12951
13269
|
if (dim === null) {
|
|
12952
|
-
|
|
13270
|
+
let val = this_data.reduce(fn2, zero);
|
|
13271
|
+
if (p !== 1) {
|
|
13272
|
+
val = val ** (1 / p);
|
|
13273
|
+
}
|
|
12953
13274
|
return new _Tensor(this.type, [val], []);
|
|
12954
13275
|
}
|
|
12955
13276
|
const [type, result, resultDims] = reduce_helper(fn2, this, dim, keepdim);
|
|
@@ -15409,9 +15730,11 @@ __export(processors_exports, {
|
|
|
15409
15730
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
15410
15731
|
Florence2Processor: () => Florence2Processor,
|
|
15411
15732
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
15733
|
+
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
15412
15734
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
15413
15735
|
Idefics3Processor: () => Idefics3Processor,
|
|
15414
15736
|
JinaCLIPProcessor: () => JinaCLIPProcessor,
|
|
15737
|
+
Lfm2VlProcessor: () => Lfm2VlProcessor,
|
|
15415
15738
|
LlavaProcessor: () => LlavaProcessor,
|
|
15416
15739
|
MgpstrProcessor: () => MgpstrProcessor,
|
|
15417
15740
|
MoonshineProcessor: () => MoonshineProcessor,
|
|
@@ -15432,6 +15755,7 @@ __export(processors_exports, {
|
|
|
15432
15755
|
UltravoxProcessor: () => UltravoxProcessor,
|
|
15433
15756
|
VLChatProcessor: () => VLChatProcessor,
|
|
15434
15757
|
VoxtralProcessor: () => VoxtralProcessor,
|
|
15758
|
+
VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
|
|
15435
15759
|
Wav2Vec2Processor: () => Wav2Vec2Processor,
|
|
15436
15760
|
Wav2Vec2ProcessorWithLM: () => Wav2Vec2ProcessorWithLM,
|
|
15437
15761
|
WhisperProcessor: () => WhisperProcessor
|
|
@@ -15486,12 +15810,14 @@ __export(feature_extractors_exports, {
|
|
|
15486
15810
|
EncodecFeatureExtractor: () => EncodecFeatureExtractor,
|
|
15487
15811
|
FeatureExtractor: () => FeatureExtractor,
|
|
15488
15812
|
Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
|
|
15813
|
+
GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
|
|
15489
15814
|
MoonshineFeatureExtractor: () => MoonshineFeatureExtractor,
|
|
15490
15815
|
ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
|
|
15491
15816
|
PyAnnoteFeatureExtractor: () => PyAnnoteFeatureExtractor,
|
|
15492
15817
|
SeamlessM4TFeatureExtractor: () => SeamlessM4TFeatureExtractor,
|
|
15493
15818
|
SnacFeatureExtractor: () => SnacFeatureExtractor,
|
|
15494
15819
|
SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
|
|
15820
|
+
VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
|
|
15495
15821
|
Wav2Vec2FeatureExtractor: () => Wav2Vec2FeatureExtractor,
|
|
15496
15822
|
WeSpeakerFeatureExtractor: () => WeSpeakerFeatureExtractor,
|
|
15497
15823
|
WhisperFeatureExtractor: () => WhisperFeatureExtractor
|
|
@@ -15719,6 +16045,7 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
15719
16045
|
mel_filters = null,
|
|
15720
16046
|
mel_floor = 1e-10,
|
|
15721
16047
|
log_mel = null,
|
|
16048
|
+
max_log_mel = null,
|
|
15722
16049
|
reference = 1,
|
|
15723
16050
|
min_value = 1e-10,
|
|
15724
16051
|
db_range = null,
|
|
@@ -15858,6 +16185,17 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
15858
16185
|
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
15859
16186
|
}
|
|
15860
16187
|
break;
|
|
16188
|
+
case "log10_max_norm": {
|
|
16189
|
+
for (let i = 0; i < o; ++i) {
|
|
16190
|
+
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
16191
|
+
}
|
|
16192
|
+
const logMax = max_log_mel ?? max(mel_spec_data)[0];
|
|
16193
|
+
const threshold = logMax - 8;
|
|
16194
|
+
for (let i = 0; i < o; ++i) {
|
|
16195
|
+
mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4) / 4;
|
|
16196
|
+
}
|
|
16197
|
+
break;
|
|
16198
|
+
}
|
|
15861
16199
|
case "dB":
|
|
15862
16200
|
if (power === 1) {
|
|
15863
16201
|
amplitude_to_db(mel_spec_data, reference, min_value, db_range);
|
|
@@ -15868,7 +16206,9 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
15868
16206
|
}
|
|
15869
16207
|
break;
|
|
15870
16208
|
default:
|
|
15871
|
-
throw new Error(
|
|
16209
|
+
throw new Error(
|
|
16210
|
+
`log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`
|
|
16211
|
+
);
|
|
15872
16212
|
}
|
|
15873
16213
|
}
|
|
15874
16214
|
return mel_spec;
|
|
@@ -16373,6 +16713,56 @@ var Gemma3nAudioFeatureExtractor = class extends FeatureExtractor {
|
|
|
16373
16713
|
}
|
|
16374
16714
|
};
|
|
16375
16715
|
|
|
16716
|
+
// src/models/granite_speech/feature_extraction_granite_speech.js
|
|
16717
|
+
/**
 * Feature extractor producing stacked log-mel features for Granite Speech.
 *
 * All spectrogram hyper-parameters are read from `config.melspec_kwargs`
 * (`n_fft`, `win_length`, `hop_length`, `n_mels`, `sample_rate`).
 */
var GraniteSpeechFeatureExtractor = class extends FeatureExtractor {
  /**
   * Precomputes the mel filter bank and the analysis window.
   * @param {Object} config Feature extractor configuration; must contain `melspec_kwargs`.
   */
  constructor(config) {
    super(config);
    const melspec = config.melspec_kwargs;
    const num_frequency_bins = Math.floor(1 + melspec.n_fft / 2);
    const nyquist = melspec.sample_rate / 2;

    // HTK mel scale, no filter normalisation, spanning 0 Hz up to half the sample rate.
    this.mel_filters = mel_filter_bank(
      num_frequency_bins,
      melspec.n_mels,
      0,
      nyquist,
      melspec.sample_rate,
      null,
      "htk"
    );

    // The Hann window has `win_length` taps; place it in the middle of a
    // zero-initialised buffer of length `n_fft`.
    const hann = window_function(melspec.win_length, "hann");
    const left_offset = Math.floor((melspec.n_fft - melspec.win_length) / 2);
    this.window = new Float64Array(melspec.n_fft);
    this.window.set(hann, left_offset);
  }

  /**
   * Extract mel spectrogram features from audio.
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @returns {Promise<{input_features: Tensor}>} Features with consecutive frame
   * pairs merged into rows of width `2 * n_mels`, plus a leading batch dimension.
   */
  async _call(audio) {
    validate_audio_inputs(audio, "GraniteSpeechFeatureExtractor");

    const melspec = this.config.melspec_kwargs;
    const frame_count = 1 + Math.floor((audio.length - 1) / melspec.hop_length);
    // Keep an even number of frames so that they can be stacked two at a time below.
    const max_num_frames = frame_count - (frame_count % 2);

    const spectrogram_options = {
      power: 2,
      mel_filters: this.mel_filters,
      log_mel: "log10_max_norm",
      // Request a [time, n_mels] layout.
      transpose: true,
      max_num_frames,
      do_pad: false
    };
    const log_mel = await spectrogram(audio, this.window, melspec.n_fft, melspec.hop_length, spectrogram_options);

    return { input_features: log_mel.view(-1, 2 * melspec.n_mels).unsqueeze_(0) };
  }
};
|
|
16765
|
+
|
|
16376
16766
|
// src/models/moonshine/feature_extraction_moonshine.js
|
|
16377
16767
|
var MoonshineFeatureExtractor = class extends FeatureExtractor {
|
|
16378
16768
|
/**
|
|
@@ -16853,6 +17243,71 @@ var WeSpeakerFeatureExtractor = class extends FeatureExtractor {
|
|
|
16853
17243
|
}
|
|
16854
17244
|
};
|
|
16855
17245
|
|
|
17246
|
+
// src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js
|
|
17247
|
+
/**
 * Feature extractor computing log-mel spectrograms for Voxtral Realtime.
 */
var VoxtralRealtimeFeatureExtractor = class extends FeatureExtractor {
  /**
   * Builds the mel filter bank (unless the config already supplies one) and the Hann window.
   * @param {Object} config Feature extractor configuration.
   */
  constructor(config) {
    super(config);

    if (this.config.mel_filters == null) {
      // Slaney norm and Slaney mel scale, covering 0 Hz to 8 kHz.
      const num_frequency_bins = Math.floor(1 + this.config.n_fft / 2);
      const num_mel_filters = this.config.feature_size;
      const sampling_rate = this.config.sampling_rate;
      this.config.mel_filters = mel_filter_bank(
        num_frequency_bins,
        num_mel_filters,
        0,
        8e3,
        sampling_rate,
        "slaney",
        "slaney"
      );
    }

    this.window = window_function(this.config.n_fft, "hann");
  }

  /**
   * Computes the log-Mel spectrogram of the provided audio waveform.
   * @param {Float32Array|Float64Array} waveform The audio waveform to process.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform for STFT.
   * @returns {Promise<import('../../utils/tensor.js').Tensor>} The log-Mel spectrogram tensor of shape [num_mel_bins, num_frames].
   */
  async _extract_fbank_features(waveform, { center = true } = {}) {
    const { n_fft, hop_length, mel_filters, global_log_mel_max } = this.config;

    // Without centering, the frame budget is computed from the length minus one FFT frame.
    const usable_samples = center ? waveform.length : waveform.length - n_fft;
    const max_num_frames = Math.floor(usable_samples / hop_length);

    const spectrogram_options = {
      power: 2,
      mel_filters,
      log_mel: "log10_max_norm",
      // When nullish, `spectrogram` falls back to the maximum of the computed log-mel values.
      max_log_mel: global_log_mel_max,
      center,
      max_num_frames,
      do_pad: false
    };
    return await spectrogram(waveform, this.window, n_fft, hop_length, spectrogram_options);
  }

  /**
   * Extract mel spectrogram features from audio.
   * @param {Float32Array|Float64Array} audio The audio data.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform.
   * @returns {Promise<{ input_features: import('../../utils/tensor.js').Tensor }>}
   */
  async _call(audio, { center = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeFeatureExtractor");
    const log_mel = await this._extract_fbank_features(audio, { center });
    // Prepend a batch dimension (in place).
    const input_features = log_mel.unsqueeze_(0);
    return { input_features };
  }
};
|
|
17310
|
+
|
|
16856
17311
|
// src/models/whisper/feature_extraction_whisper.js
|
|
16857
17312
|
var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
16858
17313
|
constructor(config) {
|
|
@@ -16881,7 +17336,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
16881
17336
|
* @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
|
|
16882
17337
|
*/
|
|
16883
17338
|
async _extract_fbank_features(waveform) {
|
|
16884
|
-
|
|
17339
|
+
return await spectrogram(
|
|
16885
17340
|
waveform,
|
|
16886
17341
|
this.window,
|
|
16887
17342
|
// window
|
|
@@ -16892,7 +17347,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
16892
17347
|
{
|
|
16893
17348
|
power: 2,
|
|
16894
17349
|
mel_filters: this.config.mel_filters,
|
|
16895
|
-
log_mel: "
|
|
17350
|
+
log_mel: "log10_max_norm",
|
|
16896
17351
|
// Custom
|
|
16897
17352
|
max_num_frames: Math.min(
|
|
16898
17353
|
Math.floor(waveform.length / this.config.hop_length),
|
|
@@ -16901,15 +17356,6 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
16901
17356
|
)
|
|
16902
17357
|
}
|
|
16903
17358
|
);
|
|
16904
|
-
const data = features.data;
|
|
16905
|
-
const maxValue = max(
|
|
16906
|
-
/** @type {Float32Array} */
|
|
16907
|
-
data
|
|
16908
|
-
)[0];
|
|
16909
|
-
for (let i = 0; i < data.length; ++i) {
|
|
16910
|
-
data[i] = (Math.max(data[i], maxValue - 8) + 4) / 4;
|
|
16911
|
-
}
|
|
16912
|
-
return features;
|
|
16913
17359
|
}
|
|
16914
17360
|
/**
|
|
16915
17361
|
* Asynchronously extracts features from a given audio using the provided configuration.
|
|
@@ -17788,6 +18234,27 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
17788
18234
|
}
|
|
17789
18235
|
return [segmentation, segments];
|
|
17790
18236
|
}
|
|
18237
|
+
/**
 * Compute a target size whose sides are multiples of `factor` and whose area
 * lies (where possible) between `min_pixels` and `max_pixels`, while staying
 * close to the input aspect ratio.
 *
 * @param {number} height Input height in pixels.
 * @param {number} width Input width in pixels.
 * @param {number} [factor=28] Both returned sides are multiples of this value.
 * @param {number} [min_pixels=3136] Lower bound on the returned area.
 * @param {number} [max_pixels=1003520] Upper bound on the returned area.
 * @returns {[number, number]} `[resized_height, resized_width]`; each side is at least `factor`.
 * @throws {Error} If either side is smaller than `factor`, or the aspect ratio exceeds 200.
 */
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
  if (height < factor || width < factor) {
    throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
  }

  const aspect_ratio = Math.max(height, width) / Math.min(height, width);
  if (aspect_ratio > 200) {
    throw new Error(`absolute aspect ratio must be smaller than 200, got ${aspect_ratio}`);
  }

  let h_bar = Math.round(height / factor) * factor;
  let w_bar = Math.round(width / factor) * factor;

  if (h_bar * w_bar > max_pixels) {
    // Shrink both sides by the same scale and round down to a multiple of `factor`.
    // For very elongated inputs with a small pixel budget the short side can round
    // down to 0, so clamp to one `factor` to never return an empty dimension.
    const beta = Math.sqrt(height * width / max_pixels);
    h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
    w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
  } else if (h_bar * w_bar < min_pixels) {
    // Grow both sides by the same scale and round up to a multiple of `factor`.
    const beta = Math.sqrt(min_pixels / (height * width));
    h_bar = Math.ceil(height * beta / factor) * factor;
    w_bar = Math.ceil(width * beta / factor) * factor;
  }

  return [h_bar, w_bar];
}
|
|
17791
18258
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
17792
18259
|
if (label_ids_to_fuse === null) {
|
|
17793
18260
|
logger.warn("`label_ids_to_fuse` unset. No instance will be fused.");
|
|
@@ -18076,7 +18543,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18076
18543
|
});
|
|
18077
18544
|
}
|
|
18078
18545
|
/**
|
|
18079
|
-
* @typedef {
|
|
18546
|
+
* @typedef {Object} PreprocessedImage
|
|
18080
18547
|
* @property {HeightWidth} original_size The original size of the image.
|
|
18081
18548
|
* @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
|
|
18082
18549
|
* @property {Tensor} pixel_values The pixel values of the preprocessed image.
|
|
@@ -18254,6 +18721,7 @@ __export(image_processors_exports, {
|
|
|
18254
18721
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
18255
18722
|
ImageProcessor: () => ImageProcessor,
|
|
18256
18723
|
JinaCLIPImageProcessor: () => JinaCLIPImageProcessor,
|
|
18724
|
+
Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
|
|
18257
18725
|
LlavaOnevisionImageProcessor: () => LlavaOnevisionImageProcessor,
|
|
18258
18726
|
Mask2FormerImageProcessor: () => Mask2FormerImageProcessor,
|
|
18259
18727
|
MaskFormerFeatureExtractor: () => MaskFormerFeatureExtractor,
|
|
@@ -18657,21 +19125,252 @@ var JinaCLIPImageProcessor = class extends ImageProcessor {
|
|
|
18657
19125
|
}
|
|
18658
19126
|
};
|
|
18659
19127
|
|
|
18660
|
-
// src/models/
|
|
18661
|
-
|
|
18662
|
-
|
|
18663
|
-
|
|
18664
|
-
|
|
18665
|
-
|
|
18666
|
-
|
|
18667
|
-
|
|
18668
|
-
|
|
19128
|
+
// src/models/lfm2_vl/image_processing_lfm2_vl.js
|
|
19129
|
+
/**
 * Round `number` to the nearest multiple of `factor`.
 * @param {number} number Value to round.
 * @param {number} factor Step size of the multiples.
 * @returns {number} The multiple of `factor` nearest to `number` (ties follow `Math.round`).
 */
function round_by_factor(number, factor) {
  const multiples = Math.round(number / factor);
  return multiples * factor;
}
|
|
19132
|
+
/**
 * Pick the `[w, h]` grid from `target_ratios` whose `w / h` is closest to `aspect_ratio`.
 *
 * When a candidate exactly ties the current best, it replaces the best only if
 * the image area exceeds half the area the candidate grid would cover with
 * tiles of side `image_size`.
 *
 * @param {number} aspect_ratio Aspect ratio of the image (width / height).
 * @param {number[][]} target_ratios Candidate `[w, h]` grids, checked in order.
 * @param {number} width Image width.
 * @param {number} height Image height.
 * @param {number} image_size Side length of one tile.
 * @returns {number[]} The selected candidate, or `[1, 1]` if none was selected.
 */
function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
  const image_area = width * height;
  const half_tile_area = 0.5 * image_size * image_size;

  let winner = [1, 1];
  let smallest_gap = Infinity;

  for (const candidate of target_ratios) {
    const gap = Math.abs(aspect_ratio - candidate[0] / candidate[1]);
    const is_closer = gap < smallest_gap;
    const wins_tie = !is_closer
      && gap === smallest_gap
      && image_area > half_tile_area * candidate[0] * candidate[1];

    if (is_closer) {
      smallest_gap = gap;
    }
    if (is_closer || wins_tie) {
      winner = candidate;
    }
  }

  return winner;
}
|
|
19147
|
+
/**
 * Enumerate every `[w, h]` grid whose tile count `w * h` lies in
 * `[min_tiles, max_tiles]`, ordered by ascending tile count.
 *
 * Grids are discovered by growing the maximum side length `n` from `min_tiles`
 * to `max_tiles`; that first-seen order is kept among grids with the same
 * tile count (the final sort is stable).
 *
 * @param {number} min_tiles Minimum number of tiles.
 * @param {number} max_tiles Maximum number of tiles.
 * @returns {number[][]} List of `[w, h]` pairs.
 */
function get_target_ratios(min_tiles, max_tiles) {
  // A Map iterates in insertion order, so it records both "already seen" and discovery order.
  const discovered = new Map();

  for (let side = min_tiles; side <= max_tiles; ++side) {
    for (let w = 1; w <= side; ++w) {
      for (let h = 1; h <= side; ++h) {
        const tile_count = w * h;
        if (tile_count < min_tiles || tile_count > max_tiles) {
          continue;
        }
        // Pack the pair into a single integer key.
        const key = w << 16 | h;
        if (discovered.has(key)) {
          continue;
        }
        discovered.set(key, [w, h]);
      }
    }
  }

  const by_tile_count = (a, b) => a[0] * a[1] - b[0] * b[1];
  return Array.from(discovered.values()).sort(by_tile_count);
}
|
|
19166
|
+
/**
 * Rearrange a batch of channel-first images into flattened, non-overlapping square patches.
 *
 * Patches are emitted row by row; inside a patch the values are ordered
 * (patch row, patch column, channel). Pixels beyond the last full patch along
 * either axis are dropped.
 *
 * @param {Tensor} images Tensor with dims `[B, C, H, W]`.
 * @param {number} patch_size Side length of a square patch, in pixels.
 * @returns {Tensor} float32 tensor with dims
 * `[B, floor(H / patch_size) * floor(W / patch_size), patch_size * patch_size * C]`.
 */
function convert_image_to_patches(images, patch_size) {
  const [batch_size, num_channels, height, width] = images.dims;
  const patch_rows = Math.floor(height / patch_size);
  const patch_cols = Math.floor(width / patch_size);
  const patches_per_image = patch_rows * patch_cols;
  const patch_dim = patch_size * patch_size * num_channels;

  const source = /** @type {Float32Array} */ (images.data);
  const plane_size = height * width; // elements in one channel plane
  const output = new Float32Array(batch_size * patches_per_image * patch_dim);

  // Patches are produced in exactly the order they are stored, so a single
  // running write index is sufficient.
  let write_index = 0;
  for (let b = 0; b < batch_size; ++b) {
    const image_base = b * num_channels * plane_size;
    for (let patch_row = 0; patch_row < patch_rows; ++patch_row) {
      const top = patch_row * patch_size;
      for (let patch_col = 0; patch_col < patch_cols; ++patch_col) {
        const left = patch_col * patch_size;
        for (let dy = 0; dy < patch_size; ++dy) {
          const row_start = image_base + (top + dy) * width + left;
          for (let dx = 0; dx < patch_size; ++dx) {
            for (let c = 0; c < num_channels; ++c) {
              output[write_index++] = source[row_start + dx + c * plane_size];
            }
          }
        }
      }
    }
  }

  return new Tensor2("float32", output, [batch_size, patches_per_image, patch_dim]);
}
|
|
19196
|
+
/**
 * Zero-pad a patch tensor to `target_length` patches and build the matching validity mask.
 *
 * @param {Tensor} patches Tensor whose dims are read as `[_, length, dim]`.
 * NOTE(review): the padded output is always given a leading dim of 1, so a
 * batch size of 1 appears to be assumed — confirm against callers.
 * @param {number} target_length Desired number of patches.
 * @returns {{ padded: Tensor, mask: Tensor }} `padded` is the input itself when it
 * already has at least `target_length` patches, otherwise a new float32 tensor
 * of dims `[1, target_length, dim]`; `mask` is an int64 tensor of dims
 * `[target_length]` holding 1 for original patches and 0 for padding.
 */
function pad_along_first_dim(patches, target_length) {
  const num_patches = patches.dims[1];
  const patch_dim = patches.dims[2];

  // BigInt64Array starts zero-filled; mark the positions backed by real patches.
  const validity = new BigInt64Array(target_length);
  validity.fill(1n, 0, num_patches);

  const build_padded = () => {
    // Copy the existing values to the front; the remainder stays zero.
    const buffer = new Float32Array(target_length * patch_dim);
    buffer.set(/** @type {Float32Array} */ (patches.data));
    return new Tensor2("float32", buffer, [1, target_length, patch_dim]);
  };
  const padded = num_patches < target_length ? build_padded() : patches;

  const mask = new Tensor2("int64", validity, [target_length]);
  return { padded, mask };
}
|
|
19211
|
+
var Lfm2VlImageProcessor = class extends ImageProcessor {
|
|
19212
|
+
constructor(config) {
|
|
19213
|
+
super(config);
|
|
19214
|
+
this.downsample_factor = config.downsample_factor ?? 2;
|
|
19215
|
+
this.do_image_splitting = config.do_image_splitting ?? true;
|
|
19216
|
+
this.min_tiles = config.min_tiles ?? 2;
|
|
19217
|
+
this.max_tiles = config.max_tiles ?? 10;
|
|
19218
|
+
this.use_thumbnail = config.use_thumbnail ?? true;
|
|
19219
|
+
this.min_image_tokens = config.min_image_tokens ?? 64;
|
|
19220
|
+
this.max_image_tokens = config.max_image_tokens ?? 256;
|
|
19221
|
+
this.encoder_patch_size = config.encoder_patch_size ?? config.patch_size ?? 16;
|
|
19222
|
+
this.tile_size = config.tile_size ?? 512;
|
|
19223
|
+
this.max_pixels_tolerance = config.max_pixels_tolerance ?? 2;
|
|
19224
|
+
this.return_row_col_info = config.return_row_col_info ?? false;
|
|
19225
|
+
const max_thumbnail_patches = this.max_image_tokens * this.downsample_factor ** 2;
|
|
19226
|
+
const tile_size_patches = this.do_image_splitting ? (this.tile_size / this.encoder_patch_size) ** 2 : 0;
|
|
19227
|
+
this.max_num_patches = Math.max(max_thumbnail_patches, tile_size_patches);
|
|
19228
|
+
}
|
|
19229
|
+
/**
|
|
19230
|
+
* Check if the image is too large to be processed as a single tile.
|
|
19231
|
+
* @param {number} height
|
|
19232
|
+
* @param {number} width
|
|
19233
|
+
* @returns {boolean}
|
|
19234
|
+
*/
|
|
19235
|
+
_is_image_too_large(height, width) {
|
|
19236
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
19237
|
+
const h_bar = Math.max(this.encoder_patch_size, round_by_factor(height, total_factor));
|
|
19238
|
+
const w_bar = Math.max(this.encoder_patch_size, round_by_factor(width, total_factor));
|
|
19239
|
+
return h_bar * w_bar > this.max_image_tokens * (this.encoder_patch_size * this.downsample_factor) ** 2 * this.max_pixels_tolerance;
|
|
19240
|
+
}
|
|
19241
|
+
/**
|
|
19242
|
+
* Get the grid layout for tiling a large image.
|
|
19243
|
+
* @param {number} height
|
|
19244
|
+
* @param {number} width
|
|
19245
|
+
* @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
|
|
19246
|
+
*/
|
|
19247
|
+
_get_grid_layout(height, width) {
|
|
19248
|
+
const target_ratios = get_target_ratios(this.min_tiles, this.max_tiles);
|
|
19249
|
+
const [grid_width, grid_height] = find_closest_aspect_ratio(
|
|
19250
|
+
width / height,
|
|
19251
|
+
target_ratios,
|
|
19252
|
+
width,
|
|
19253
|
+
height,
|
|
19254
|
+
this.tile_size
|
|
19255
|
+
);
|
|
19256
|
+
return {
|
|
19257
|
+
grid_width,
|
|
19258
|
+
grid_height,
|
|
19259
|
+
target_width: this.tile_size * grid_width,
|
|
19260
|
+
target_height: this.tile_size * grid_height
|
|
19261
|
+
};
|
|
19262
|
+
}
|
|
19263
|
+
/** @param {RawImage|RawImage[]|RawImage[][]} images */
|
|
19264
|
+
// @ts-expect-error
|
|
19265
|
+
async _call(images, { return_row_col_info = null } = {}) {
|
|
19266
|
+
let batched_images;
|
|
19267
|
+
if (!Array.isArray(images)) {
|
|
19268
|
+
batched_images = [[images]];
|
|
19269
|
+
} else if (!Array.isArray(images[0])) {
|
|
19270
|
+
batched_images = [
|
|
19271
|
+
/** @type {RawImage[]} */
|
|
19272
|
+
images
|
|
19273
|
+
];
|
|
19274
|
+
} else {
|
|
19275
|
+
batched_images = /** @type {RawImage[][]} */
|
|
19276
|
+
images;
|
|
19277
|
+
}
|
|
19278
|
+
const all_pixel_values = [];
|
|
19279
|
+
const all_pixel_masks = [];
|
|
19280
|
+
const all_spatial_shapes = [];
|
|
19281
|
+
const all_rows = [];
|
|
19282
|
+
const all_cols = [];
|
|
19283
|
+
const all_image_sizes = [];
|
|
19284
|
+
for (const image_batch of batched_images) {
|
|
19285
|
+
const preprocessed = await Promise.all(image_batch.map((x) => this.preprocess(x, { do_pad: false })));
|
|
19286
|
+
for (const { pixel_values } of preprocessed) {
|
|
19287
|
+
const [, height, width] = pixel_values.dims;
|
|
19288
|
+
const img = pixel_values.unsqueeze_(0);
|
|
19289
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
19290
|
+
const f2 = total_factor ** 2;
|
|
19291
|
+
const [new_height, new_width] = smart_resize(
|
|
19292
|
+
Math.max(total_factor, height),
|
|
19293
|
+
Math.max(total_factor, width),
|
|
19294
|
+
total_factor,
|
|
19295
|
+
this.min_image_tokens * f2,
|
|
19296
|
+
this.max_image_tokens * f2
|
|
19297
|
+
).map((x) => Math.max(total_factor, x));
|
|
19298
|
+
let tiles;
|
|
19299
|
+
let num_rows = 1, num_cols = 1;
|
|
19300
|
+
const is_large = this._is_image_too_large(height, width);
|
|
19301
|
+
const do_splitting = this.do_image_splitting && !(this.min_tiles === 1 && this.max_tiles === 1);
|
|
19302
|
+
if (is_large && do_splitting) {
|
|
19303
|
+
const { grid_width, grid_height, target_width, target_height } = this._get_grid_layout(
|
|
19304
|
+
height,
|
|
19305
|
+
width
|
|
19306
|
+
);
|
|
19307
|
+
num_rows = grid_height;
|
|
19308
|
+
num_cols = grid_width;
|
|
19309
|
+
const resized = await interpolate_4d(img, {
|
|
19310
|
+
size: [target_height, target_width]
|
|
19311
|
+
});
|
|
19312
|
+
tiles = [];
|
|
19313
|
+
for (let r = 0; r < grid_height; ++r) {
|
|
19314
|
+
for (let c = 0; c < grid_width; ++c) {
|
|
19315
|
+
const y = r * this.tile_size;
|
|
19316
|
+
const x = c * this.tile_size;
|
|
19317
|
+
tiles.push(resized.slice(null, null, [y, y + this.tile_size], [x, x + this.tile_size]));
|
|
19318
|
+
}
|
|
19319
|
+
}
|
|
19320
|
+
if (this.use_thumbnail && grid_width * grid_height !== 1) {
|
|
19321
|
+
tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
|
|
19322
|
+
}
|
|
19323
|
+
} else {
|
|
19324
|
+
tiles = [await interpolate_4d(img, { size: [new_height, new_width] })];
|
|
19325
|
+
}
|
|
19326
|
+
for (const tile of tiles) {
|
|
19327
|
+
const [, , th, tw] = tile.dims;
|
|
19328
|
+
const patches = convert_image_to_patches(tile, this.encoder_patch_size);
|
|
19329
|
+
const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
|
|
19330
|
+
all_pixel_values.push(padded);
|
|
19331
|
+
all_pixel_masks.push(mask);
|
|
19332
|
+
all_spatial_shapes.push([
|
|
19333
|
+
Math.floor(th / this.encoder_patch_size),
|
|
19334
|
+
Math.floor(tw / this.encoder_patch_size)
|
|
19335
|
+
]);
|
|
19336
|
+
}
|
|
19337
|
+
all_rows.push(num_rows);
|
|
19338
|
+
all_cols.push(num_cols);
|
|
19339
|
+
all_image_sizes.push([new_height, new_width]);
|
|
19340
|
+
}
|
|
19341
|
+
}
|
|
19342
|
+
const result = {
|
|
19343
|
+
pixel_values: cat(all_pixel_values, 0),
|
|
19344
|
+
pixel_attention_mask: stack(all_pixel_masks, 0),
|
|
19345
|
+
spatial_shapes: new Tensor2("int64", BigInt64Array.from(all_spatial_shapes.flat(), BigInt), [
|
|
19346
|
+
all_spatial_shapes.length,
|
|
19347
|
+
2
|
|
19348
|
+
])
|
|
19349
|
+
};
|
|
19350
|
+
if (return_row_col_info ?? this.return_row_col_info) {
|
|
19351
|
+
result.image_rows = all_rows;
|
|
19352
|
+
result.image_cols = all_cols;
|
|
19353
|
+
result.image_sizes = all_image_sizes;
|
|
19354
|
+
}
|
|
19355
|
+
return result;
|
|
19356
|
+
}
|
|
19357
|
+
};
|
|
19358
|
+
|
|
19359
|
+
// src/models/llava_onevision/image_processing_llava_onevision.js
|
|
19360
|
+
/** Image processor for LLaVA-OneVision; adds nothing on top of the base `ImageProcessor`. */
var LlavaOnevisionImageProcessor = class extends ImageProcessor {};
|
|
19362
|
+
|
|
19363
|
+
// src/models/maskformer/image_processing_maskformer.js
|
|
19364
|
+
/**
 * Image processor for MaskFormer. Exposes the shared segmentation
 * post-processing helpers as instance methods.
 */
var MaskFormerImageProcessor = class extends ImageProcessor {
  /**
   * Forwards all arguments to the module-level `post_process_panoptic_segmentation`.
   * @type {typeof post_process_panoptic_segmentation}
   */
  post_process_panoptic_segmentation(...params) {
    const segmentation = post_process_panoptic_segmentation(...params);
    return segmentation;
  }
  /**
   * Forwards all arguments to the module-level `post_process_instance_segmentation`.
   * @type {typeof post_process_instance_segmentation}
   */
  post_process_instance_segmentation(...params) {
    const segmentation = post_process_instance_segmentation(...params);
    return segmentation;
  }
};
/** Alternate name for the same processor; adds no behaviour of its own. */
var MaskFormerFeatureExtractor = class extends MaskFormerImageProcessor {};
|
|
18677
19376
|
|
|
@@ -18880,27 +19579,6 @@ var PvtImageProcessor = class extends ImageProcessor {
|
|
|
18880
19579
|
};
|
|
18881
19580
|
|
|
18882
19581
|
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
18883
|
-
/**
 * Compute a target size whose sides are multiples of `factor` and whose area
 * lies (where possible) between `min_pixels` and `max_pixels`, while roughly
 * keeping the aspect ratio.
 *
 * @param {number} height Original height.
 * @param {number} width Original width.
 * @param {number} [factor=28] Both returned sides are multiples of this value.
 * @param {number} [min_pixels=56*56] Lower bound on the target area.
 * @param {number} [max_pixels=14*14*4*1280] Upper bound on the target area.
 * @returns {[number, number]} `[height, width]` of the resized image.
 * @throws {Error} If a side is smaller than `factor` or the aspect ratio exceeds 200.
 */
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
  if (height < factor || width < factor) {
    throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
  }
  const aspect_ratio = Math.max(height, width) / Math.min(height, width);
  if (aspect_ratio > 200) {
    throw new Error(`absolute aspect ratio must be smaller than 200, got ${aspect_ratio}`);
  }

  let h_bar = Math.round(height / factor) * factor;
  let w_bar = Math.round(width / factor) * factor;
  if (h_bar * w_bar > max_pixels) {
    const beta = Math.sqrt(height * width / max_pixels);
    // Flooring can reach 0 for the short side of an elongated image; a zero-sized
    // dimension is never valid, so keep at least one `factor`-sized step per side.
    h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
    w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
  } else if (h_bar * w_bar < min_pixels) {
    const beta = Math.sqrt(min_pixels / (height * width));
    h_bar = Math.ceil(height * beta / factor) * factor;
    w_bar = Math.ceil(width * beta / factor) * factor;
  }
  return [h_bar, w_bar];
}
|
|
18904
19582
|
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
18905
19583
|
constructor(config) {
|
|
18906
19584
|
super(config);
|
|
@@ -19502,6 +20180,57 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
19502
20180
|
}
|
|
19503
20181
|
};
|
|
19504
20182
|
|
|
20183
|
+
// src/models/granite_speech/processing_granite_speech.js
|
|
20184
|
+
/**
 * Processor pairing a tokenizer with an audio feature extractor for Granite Speech.
 */
var GraniteSpeechProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = true;

  /**
   * Compute the number of audio tokens for a given raw audio length.
   * @param {number} audioLength Raw audio sample count.
   * @returns {number} Number of projector output tokens.
   */
  _get_num_audio_features(audioLength) {
    const extractor_config = this.feature_extractor.config;
    const hop = extractor_config.melspec_kwargs.hop_length;
    const window_size = extractor_config.projector_window_size;
    const tokens_per_block = Math.floor(window_size / extractor_config.projector_downsample_rate);

    // samples -> mel frames -> encoder frames (halved) -> projector blocks
    const num_mel_frames = Math.floor(audioLength / hop) + 1;
    const num_encoder_frames = Math.floor(num_mel_frames / 2);
    const num_blocks = Math.ceil(num_encoder_frames / window_size);
    return num_blocks * tokens_per_block;
  }

  /**
   * Tokenize `text` and, when audio is given, extract its features and expand
   * every audio placeholder in the text to one placeholder per audio token.
   * @param {string} text The text input to process.
   * @param {Float32Array} audio The audio input to process.
   * @param {Object} [kwargs] Extra options forwarded to the tokenizer.
   * @returns {Promise<Object>} Tokenizer outputs merged with `input_features` and
   * `input_features_mask` when audio is provided.
   */
  async _call(text, audio = null, kwargs = {}) {
    if (Array.isArray(text)) {
      throw new Error("Batched inputs are not supported yet.");
    }

    const audio_inputs = {};
    let prompt = text;
    if (audio) {
      const extracted = await this.feature_extractor(audio);
      audio_inputs["input_features"] = extracted.input_features;

      // The mask marks every audio token position as valid.
      const num_audio_tokens = this._get_num_audio_features(audio.length);
      audio_inputs["input_features_mask"] = new Tensor2(
        "bool",
        new Uint8Array(num_audio_tokens).fill(1),
        [1, num_audio_tokens]
      );

      const audio_token = this.config.audio_token ?? "<|audio|>";
      if (!prompt.includes(audio_token)) {
        throw new Error(`The input text does not contain the audio token ${audio_token}.`);
      }
      prompt = prompt.replaceAll(audio_token, audio_token.repeat(num_audio_tokens));
    }

    const tokenizer_options = { add_special_tokens: false, ...kwargs };
    const text_inputs = this.tokenizer(prompt, tokenizer_options);
    return { ...text_inputs, ...audio_inputs };
  }
};
|
|
20233
|
+
|
|
19505
20234
|
// src/models/grounding_dino/processing_grounding_dino.js
|
|
19506
20235
|
function get_phrases_from_posmap(posmaps, input_ids) {
|
|
19507
20236
|
const left_idx = 0;
|
|
@@ -19778,6 +20507,66 @@ var JinaCLIPProcessor = class extends Processor {
|
|
|
19778
20507
|
}
|
|
19779
20508
|
};
|
|
19780
20509
|
|
|
20510
|
+
// src/models/lfm2_vl/processing_lfm2_vl.js
|
|
20511
|
+
/**
 * Processor pairing a tokenizer with an image processor for LFM2-VL.
 * Expands each image placeholder in the text to the number of image tokens
 * produced for the corresponding image.
 */
var Lfm2VlProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static image_processor_class = AutoImageProcessor;

  /**
   * @param {RawImage|RawImage[]} images
   * @param {string|string[]|null} [text] Prompt(s) containing one image placeholder per image.
   * When falsy, only the image inputs are returned.
   * @param {Record<string, any>} [kwargs] Forwarded to both the image processor and the tokenizer.
   * @returns {Promise<Object>} Image processor outputs (minus the row/col info) merged with the tokenizer outputs.
   * @throws {Error} If the text contains more image placeholders than there are processed images.
   */
  async _call(images, text = null, kwargs = {}) {
    // The row/col info is always requested because it is needed to expand the
    // placeholders; it is stripped from the returned inputs.
    const { image_rows, image_cols, image_sizes, ...image_inputs } = await this.image_processor(images, {
      ...kwargs,
      return_row_col_info: true
    });

    let prompts = null;
    if (text) {
      const image_token = this.config.image_token ?? "<image>";
      const {
        tile_size = 512,
        downsample_factor = 2,
        encoder_patch_size = 16,
        use_thumbnail = true
      } = (
        /** @type {Record<string, any>} */
        this.image_processor.config
      );
      // Number of image tokens along one side of `s` pixels.
      const ds2 = (s) => Math.ceil(Math.floor(s / encoder_patch_size) / downsample_factor);
      const tokens_per_tile = ds2(tile_size) ** 2;
      const image_start = this.config.image_start_token ?? "<|image_start|>";
      const image_end = this.config.image_end_token ?? "<|image_end|>";
      const thumbnail_token = this.config.image_thumbnail ?? "<|img_thumbnail|>";

      // Images are consumed in order across all text samples.
      let image_idx = 0;
      const samples = Array.isArray(text) ? text : [text];
      prompts = samples.map((sample) => {
        const parts = sample.split(image_token);
        return parts[0] + parts.slice(1).map((part) => {
          const idx = image_idx++;
          if (idx >= image_sizes.length) {
            // Previously this surfaced as an opaque "undefined is not iterable" TypeError.
            throw new Error(
              `Found more "${image_token}" placeholders in the text than images provided (${image_sizes.length}).`
            );
          }
          const [h, w] = image_sizes[idx];
          const rows = image_rows[idx];
          const cols = image_cols[idx];
          const tokens_for_image = ds2(h) * ds2(w);

          let expanded = image_start;
          if (rows > 1 || cols > 1) {
            // Tiled image: one positional marker plus a full tile of tokens per grid cell.
            const tile_str = image_token.repeat(tokens_per_tile);
            for (let r = 0; r < rows; ++r) {
              for (let c = 0; c < cols; ++c) {
                expanded += `<|img_row_${r + 1}_col_${c + 1}|>` + tile_str;
              }
            }
            if (use_thumbnail) {
              expanded += thumbnail_token + image_token.repeat(tokens_for_image);
            }
          } else {
            expanded += image_token.repeat(tokens_for_image);
          }
          return expanded + image_end + part;
        }).join("");
      });
    }

    return {
      ...image_inputs,
      ...(prompts ? this.tokenizer(prompts, kwargs) : {})
    };
  }
};
|
|
20569
|
+
|
|
19781
20570
|
// src/models/llava/processing_llava.js
|
|
19782
20571
|
var LlavaProcessor = class extends Processor {
|
|
19783
20572
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -20310,6 +21099,94 @@ var VoxtralProcessor = class extends Processor {
|
|
|
20310
21099
|
}
|
|
20311
21100
|
};
|
|
20312
21101
|
|
|
21102
|
+
// src/models/voxtral_realtime/processing_voxtral_realtime.js
|
|
21103
|
+
// Fixed token/frame counts used by the processor below.
var NUM_LEFT_PAD_TOKENS = 32;
var NUM_DELAY_TOKENS = 6;
var AUDIO_LENGTH_PER_TOK = 8;
var OFFLINE_STREAMING_BUFFER_TOKENS = 10;
var STREAMING_PAD_TOKEN_ID = 32;

/**
 * Processor pairing a tokenizer with an audio feature extractor for VoxtralRealtime.
 */
var VoxtralRealtimeProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = false;

  /** Number of mel frames in the first audio chunk. */
  get num_mel_frames_first_audio_chunk() {
    const num_tokens = NUM_DELAY_TOKENS + 1;
    return num_tokens * AUDIO_LENGTH_PER_TOK;
  }

  /** Number of raw audio samples in the first audio chunk. */
  get num_samples_first_audio_chunk() {
    const extractor_config = this.feature_extractor.config;
    const half_window = Math.floor(extractor_config.n_fft / 2);
    return (this.num_mel_frames_first_audio_chunk - 1) * extractor_config.hop_length + half_window;
  }

  /** Number of raw audio samples per subsequent audio chunk. */
  get num_samples_per_audio_chunk() {
    const extractor_config = this.feature_extractor.config;
    return AUDIO_LENGTH_PER_TOK * extractor_config.hop_length + extractor_config.n_fft;
  }

  /** Number of right-pad tokens for non-streaming mode. */
  get num_right_pad_tokens() {
    return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
  }

  /** Number of mel frames per text token. */
  get audio_length_per_tok() {
    return AUDIO_LENGTH_PER_TOK;
  }

  /** Number of raw audio samples per token. */
  get raw_audio_length_per_tok() {
    return AUDIO_LENGTH_PER_TOK * this.feature_extractor.config.hop_length;
  }

  /**
   * Process audio input for VoxtralRealtime.
   *
   * - Streaming, first chunk: the audio is left-padded with zeros and features are
   *   extracted with `center=true`. Returns `{ input_ids, ...features }`.
   * - Streaming, later chunks: the chunk is passed through unchanged with
   *   `center=false`. Returns the feature extractor output only.
   * - Non-streaming: the audio is right-padded with zeros and features are
   *   extracted with `center=true`. Returns the feature extractor output only.
   *
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @param {Object} [options]
   * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
   * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
   * @returns {Promise<Object>}
   * @throws {Error} If `is_streaming` is false while `is_first_audio_chunk` is false.
   */
  async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeProcessor");
    if (!(is_streaming || is_first_audio_chunk)) {
      throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");
    }

    // Follow-up streaming chunk: no padding.
    if (!is_first_audio_chunk) {
      return await this.feature_extractor(audio, { center: false });
    }

    // Non-streaming: pad zeros on the right.
    if (!is_streaming) {
      const num_right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
      const right_padded = new Float32Array(audio.length + num_right_pad_samples);
      right_padded.set(audio);
      return await this.feature_extractor(right_padded, { center: true });
    }

    // First streaming chunk: pad zeros on the left.
    const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
    const left_padded = new Float32Array(num_left_pad_samples + audio.length);
    left_padded.set(audio, num_left_pad_samples);
    const audio_encoding = await this.feature_extractor(left_padded, { center: true });

    // One leading token with id 1, followed by pad tokens covering the left padding and the delay.
    const num_input_tokens = 1 + (NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS);
    const token_ids = new BigInt64Array(num_input_tokens).fill(BigInt(STREAMING_PAD_TOKEN_ID));
    token_ids[0] = 1n;
    return {
      input_ids: new Tensor2("int64", token_ids, [1, num_input_tokens]),
      ...audio_encoding
    };
  }
};
|
|
21189
|
+
|
|
20313
21190
|
// src/models/wav2vec2/processing_wav2vec2.js
|
|
20314
21191
|
var Wav2Vec2Processor = class extends Processor {
|
|
20315
21192
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -20409,10 +21286,13 @@ function getNormalizedConfig(config) {
|
|
|
20409
21286
|
case "florence2":
|
|
20410
21287
|
case "llava_onevision":
|
|
20411
21288
|
case "idefics3":
|
|
21289
|
+
case "granite_speech":
|
|
20412
21290
|
case "ultravox":
|
|
20413
21291
|
case "voxtral":
|
|
21292
|
+
case "voxtral_realtime":
|
|
20414
21293
|
case "smolvlm":
|
|
20415
21294
|
case "gemma3n":
|
|
21295
|
+
case "lfm2_vl":
|
|
20416
21296
|
case "chatterbox":
|
|
20417
21297
|
case "mistral3":
|
|
20418
21298
|
case "qwen2_5_vl":
|
|
@@ -20467,10 +21347,13 @@ function getNormalizedConfig(config) {
|
|
|
20467
21347
|
case "cohere":
|
|
20468
21348
|
case "cohere2":
|
|
20469
21349
|
case "mistral":
|
|
21350
|
+
case "voxtral_realtime_text":
|
|
21351
|
+
case "voxtral_realtime_encoder":
|
|
20470
21352
|
case "starcoder2":
|
|
20471
21353
|
case "qwen2":
|
|
20472
21354
|
case "qwen2_moe":
|
|
20473
21355
|
case "qwen2_vl":
|
|
21356
|
+
case "qwen2_vl_text":
|
|
20474
21357
|
case "qwen2_5_vl_text":
|
|
20475
21358
|
case "qwen3_moe":
|
|
20476
21359
|
case "qwen3_vl_text":
|
|
@@ -20615,6 +21498,9 @@ function getNormalizedConfig(config) {
|
|
|
20615
21498
|
return normalized_config;
|
|
20616
21499
|
}
|
|
20617
21500
|
function getCacheShapes(config, options) {
|
|
21501
|
+
if (!(config instanceof PretrainedConfig)) {
|
|
21502
|
+
config = new PretrainedConfig(config);
|
|
21503
|
+
}
|
|
20618
21504
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
20619
21505
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
20620
21506
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -20721,12 +21607,16 @@ function getCacheShapes(config, options) {
|
|
|
20721
21607
|
}
|
|
20722
21608
|
}
|
|
20723
21609
|
return cache_values;
|
|
20724
|
-
} else if (["qwen3_5", "qwen3_5_moe"].includes(config.model_type)) {
|
|
20725
|
-
|
|
20726
|
-
|
|
20727
|
-
|
|
20728
|
-
|
|
20729
|
-
|
|
21610
|
+
} else if (["lfm2_vl", "qwen3_5", "qwen3_5_moe", "voxtral_realtime"].includes(config.model_type)) {
|
|
21611
|
+
let subConfig;
|
|
21612
|
+
if (config.model_type === "voxtral_realtime" && options?.session_name === "audio_encoder") {
|
|
21613
|
+
subConfig = /** @type {any} */
|
|
21614
|
+
config.audio_config;
|
|
21615
|
+
} else {
|
|
21616
|
+
subConfig = /** @type {any} */
|
|
21617
|
+
config.text_config;
|
|
21618
|
+
}
|
|
21619
|
+
return getCacheShapes(subConfig, options);
|
|
20730
21620
|
}
|
|
20731
21621
|
return getKeyValueShapes(config, options);
|
|
20732
21622
|
}
|
|
@@ -20892,7 +21782,7 @@ async function getModelDataFiles(pretrained_model_name_or_path, fileName, suffix
|
|
|
20892
21782
|
}
|
|
20893
21783
|
|
|
20894
21784
|
// src/models/session.js
|
|
20895
|
-
async function getSession(pretrained_model_name_or_path, fileName, options,
|
|
21785
|
+
async function getSession(pretrained_model_name_or_path, fileName, options, cache_config = false, session_name = void 0) {
|
|
20896
21786
|
let custom_config = options.config?.["transformers.js_config"] ?? {};
|
|
20897
21787
|
const selectedDevice = (
|
|
20898
21788
|
/** @type {import("../utils/devices.js").DeviceType} */
|
|
@@ -20950,9 +21840,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
20950
21840
|
if (externalData.length > 0 && !apis.IS_NODE_ENV) {
|
|
20951
21841
|
session_options.externalData = externalData;
|
|
20952
21842
|
}
|
|
20953
|
-
if (
|
|
21843
|
+
if (cache_config && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
|
|
20954
21844
|
const shapes = getCacheShapes(options.config, {
|
|
20955
|
-
prefix: "present"
|
|
21845
|
+
prefix: "present",
|
|
21846
|
+
session_name
|
|
20956
21847
|
});
|
|
20957
21848
|
if (Object.keys(shapes).length > 0 && !isONNXProxy()) {
|
|
20958
21849
|
const preferredOutputLocation = {};
|
|
@@ -20970,15 +21861,17 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
20970
21861
|
};
|
|
20971
21862
|
return { buffer_or_path, session_options, session_config };
|
|
20972
21863
|
}
|
|
20973
|
-
async function constructSessions(pretrained_model_name_or_path, names, options,
|
|
21864
|
+
async function constructSessions(pretrained_model_name_or_path, names, options, cache_sessions = void 0) {
|
|
20974
21865
|
return Object.fromEntries(
|
|
20975
21866
|
await Promise.all(
|
|
20976
21867
|
Object.keys(names).map(async (name) => {
|
|
21868
|
+
const cache_config = cache_sessions?.[name] ?? false;
|
|
20977
21869
|
const { buffer_or_path, session_options, session_config } = await getSession(
|
|
20978
21870
|
pretrained_model_name_or_path,
|
|
20979
21871
|
names[name],
|
|
20980
21872
|
options,
|
|
20981
|
-
|
|
21873
|
+
cache_config,
|
|
21874
|
+
name
|
|
20982
21875
|
);
|
|
20983
21876
|
const session = await createInferenceSession(buffer_or_path, session_options, session_config);
|
|
20984
21877
|
return [name, session];
|
|
@@ -22278,19 +23171,71 @@ var BeamSearchSampler = class extends LogitsSampler {
|
|
|
22278
23171
|
}
|
|
22279
23172
|
};
|
|
22280
23173
|
|
|
23174
|
+
// src/cache_utils.js
|
|
23175
|
+
/**
 * A cache object whose own properties are name→Tensor entries.
 */
var _DynamicCache = class {
  /**
   * Create a DynamicCache, optionally pre-populated with entries.
   * @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
   * @throws {TypeError} If a key clashes with an existing property (e.g. a method name)
   * or a value is not a Tensor.
   */
  constructor(entries) {
    if (entries) {
      for (const name in entries) {
        // Entries live directly on the instance, so they must not shadow existing members.
        if (name in this) {
          throw new TypeError(`Key "${name}" conflicts with an existing property on DynamicCache`);
        }
        const tensor = entries[name];
        if (!(tensor instanceof Tensor2)) {
          throw new TypeError(`Expected a Tensor for key "${name}", got ${typeof tensor}`);
        }
        this[name] = tensor;
      }
    }
  }

  /**
   * Get the cached sequence length. This requires at least one attention cache entry to be present.
   * @returns {number} The past sequence length.
   * @throws {Error} If no entry whose name starts with "past_key_values." exists.
   */
  get_seq_length() {
    const cache = (
      /** @type {any} */
      this
    );
    for (const name in cache) {
      if (!name.startsWith("past_key_values.")) continue;
      // The sequence length is the second-to-last dimension of the first matching entry.
      return cache[name].dims.at(-2);
    }
    throw new Error("Unable to determine sequence length from the cache.");
  }

  /**
   * Dispose all contained tensors whose data resides on the GPU.
   * Returns a promise that resolves when all disposals are complete.
   * @returns {Promise<void>} Promise that resolves when all GPU tensors are disposed.
   */
  async dispose() {
    const tensors = (
      /** @type {Tensor[]} */
      Object.values(this)
    );
    const pending = tensors
      .filter((tensor) => tensor.location === "gpu-buffer")
      .map((tensor) => tensor.dispose());
    await Promise.all(pending);
  }
};
var DynamicCache = (
  /** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */
  /** @type {unknown} */
  _DynamicCache
);
|
|
23233
|
+
|
|
22281
23234
|
// src/models/modeling_utils.js
|
|
22282
23235
|
/** Task mappings registered via `registerTaskMappings`; `null` until then. */
var MODEL_MAPPING_NAMES = null;

/**
 * Store the given mappings in `MODEL_MAPPING_NAMES`, replacing any previous value.
 * @param {any} mappings The mappings to register.
 * @returns {void}
 */
function registerTaskMappings(mappings) {
  MODEL_MAPPING_NAMES = mappings;
}
|
|
22286
|
-
/**
 * Read the past sequence length from a name→tensor cache object.
 *
 * Uses the first entry whose name starts with "past_key_values."; if there is
 * none, falls back to the first value in the object. The length is the
 * second-to-last dimension of that entry.
 *
 * @param {Record<string, any>} past_key_values Cache entries exposing a `dims` array.
 * @returns {number} The past sequence length.
 */
function getPastLength(past_key_values) {
  let attention_entry_name = null;
  for (const name in past_key_values) {
    if (name.startsWith("past_key_values.")) {
      attention_entry_name = name;
      break;
    }
  }
  const entry = attention_entry_name !== null
    ? past_key_values[attention_entry_name]
    : Object.values(past_key_values)[0];
  return entry.dims.at(-2);
}
|
|
22294
23239
|
function toI64Tensor(items) {
|
|
22295
23240
|
if (items instanceof Tensor2) {
|
|
22296
23241
|
return items;
|
|
@@ -22331,71 +23276,181 @@ var MODEL_TYPES = {
|
|
|
22331
23276
|
AutoEncoder: 12,
|
|
22332
23277
|
ImageAudioTextToText: 13,
|
|
22333
23278
|
Supertonic: 14,
|
|
22334
|
-
Chatterbox: 15
|
|
23279
|
+
Chatterbox: 15,
|
|
23280
|
+
MultimodalLanguageModelOnly: 16,
|
|
23281
|
+
VoxtralRealtime: 17
|
|
22335
23282
|
};
|
|
22336
23283
|
var MODEL_TYPE_CONFIG = {
|
|
22337
23284
|
[MODEL_TYPES.DecoderOnly]: {
|
|
22338
23285
|
can_generate: true,
|
|
22339
23286
|
forward: decoder_forward,
|
|
22340
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
23287
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
23288
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" }),
|
|
23289
|
+
cache_sessions: { model: true },
|
|
23290
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22341
23291
|
},
|
|
22342
23292
|
[MODEL_TYPES.DecoderOnlyWithoutHead]: {
|
|
22343
23293
|
can_generate: false,
|
|
22344
23294
|
forward: decoder_forward,
|
|
22345
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
23295
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
23296
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
22346
23297
|
},
|
|
22347
23298
|
[MODEL_TYPES.Seq2Seq]: {
|
|
22348
23299
|
can_generate: true,
|
|
22349
23300
|
forward: seq2seq_forward,
|
|
22350
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
23301
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
23302
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
23303
|
+
cache_sessions: { decoder_model_merged: true },
|
|
23304
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22351
23305
|
},
|
|
22352
23306
|
[MODEL_TYPES.Vision2Seq]: {
|
|
22353
23307
|
can_generate: true,
|
|
22354
23308
|
forward: seq2seq_forward,
|
|
22355
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
23309
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
23310
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
23311
|
+
cache_sessions: { decoder_model_merged: true },
|
|
23312
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22356
23313
|
},
|
|
22357
23314
|
[MODEL_TYPES.Musicgen]: {
|
|
22358
23315
|
can_generate: true,
|
|
22359
|
-
forward: seq2seq_forward
|
|
23316
|
+
forward: seq2seq_forward,
|
|
23317
|
+
sessions: () => ({
|
|
23318
|
+
model: "text_encoder",
|
|
23319
|
+
decoder_model_merged: "decoder_model_merged",
|
|
23320
|
+
encodec_decode: "encodec_decode"
|
|
23321
|
+
}),
|
|
23322
|
+
cache_sessions: { decoder_model_merged: true },
|
|
23323
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22360
23324
|
},
|
|
22361
23325
|
[MODEL_TYPES.EncoderDecoder]: {
|
|
22362
23326
|
can_generate: false,
|
|
22363
|
-
forward: seq2seq_forward
|
|
23327
|
+
forward: seq2seq_forward,
|
|
23328
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
23329
|
+
cache_sessions: { decoder_model_merged: true }
|
|
23330
|
+
},
|
|
23331
|
+
[MODEL_TYPES.MaskGeneration]: {
|
|
23332
|
+
sessions: () => ({ model: "vision_encoder", prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder" })
|
|
22364
23333
|
},
|
|
22365
23334
|
[MODEL_TYPES.ImageTextToText]: {
|
|
22366
23335
|
can_generate: true,
|
|
22367
23336
|
forward: image_text_to_text_forward,
|
|
22368
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
23337
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
23338
|
+
sessions: (config) => {
|
|
23339
|
+
const s = {
|
|
23340
|
+
embed_tokens: "embed_tokens",
|
|
23341
|
+
vision_encoder: "vision_encoder",
|
|
23342
|
+
decoder_model_merged: "decoder_model_merged"
|
|
23343
|
+
};
|
|
23344
|
+
if (config.is_encoder_decoder) s["model"] = "encoder_model";
|
|
23345
|
+
return s;
|
|
23346
|
+
},
|
|
23347
|
+
cache_sessions: { decoder_model_merged: true },
|
|
23348
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22369
23349
|
},
|
|
22370
23350
|
[MODEL_TYPES.AudioTextToText]: {
|
|
22371
23351
|
can_generate: true,
|
|
22372
23352
|
forward: audio_text_to_text_forward,
|
|
22373
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
23353
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
23354
|
+
sessions: () => ({
|
|
23355
|
+
embed_tokens: "embed_tokens",
|
|
23356
|
+
audio_encoder: "audio_encoder",
|
|
23357
|
+
decoder_model_merged: "decoder_model_merged"
|
|
23358
|
+
}),
|
|
23359
|
+
cache_sessions: { decoder_model_merged: true },
|
|
23360
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22374
23361
|
},
|
|
22375
|
-
[MODEL_TYPES.
|
|
23362
|
+
[MODEL_TYPES.ImageAudioTextToText]: {
|
|
22376
23363
|
can_generate: true,
|
|
22377
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
23364
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
23365
|
+
sessions: () => ({
|
|
23366
|
+
embed_tokens: "embed_tokens",
|
|
23367
|
+
audio_encoder: "audio_encoder",
|
|
23368
|
+
vision_encoder: "vision_encoder",
|
|
23369
|
+
decoder_model_merged: "decoder_model_merged"
|
|
23370
|
+
}),
|
|
23371
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22378
23372
|
},
|
|
22379
|
-
[MODEL_TYPES.
|
|
23373
|
+
[MODEL_TYPES.Phi3V]: {
|
|
22380
23374
|
can_generate: true,
|
|
22381
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
23375
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
23376
|
+
sessions: () => ({
|
|
23377
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
23378
|
+
model: "model",
|
|
23379
|
+
vision_encoder: "vision_encoder"
|
|
23380
|
+
}),
|
|
23381
|
+
cache_sessions: { model: true },
|
|
23382
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22382
23383
|
},
|
|
22383
23384
|
[MODEL_TYPES.MultiModality]: {
|
|
22384
|
-
can_generate: true
|
|
23385
|
+
can_generate: true,
|
|
23386
|
+
sessions: () => ({
|
|
23387
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
23388
|
+
model: "language_model",
|
|
23389
|
+
lm_head: "lm_head",
|
|
23390
|
+
gen_head: "gen_head",
|
|
23391
|
+
gen_img_embeds: "gen_img_embeds",
|
|
23392
|
+
image_decode: "image_decode"
|
|
23393
|
+
}),
|
|
23394
|
+
cache_sessions: { model: true },
|
|
23395
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22385
23396
|
},
|
|
22386
23397
|
[MODEL_TYPES.AutoEncoder]: {
|
|
22387
23398
|
can_generate: false,
|
|
22388
|
-
forward: auto_encoder_forward
|
|
23399
|
+
forward: auto_encoder_forward,
|
|
23400
|
+
sessions: () => ({ encoder_model: "encoder_model", decoder_model: "decoder_model" })
|
|
23401
|
+
},
|
|
23402
|
+
[MODEL_TYPES.Supertonic]: {
|
|
23403
|
+
sessions: () => ({
|
|
23404
|
+
text_encoder: "text_encoder",
|
|
23405
|
+
latent_denoiser: "latent_denoiser",
|
|
23406
|
+
voice_decoder: "voice_decoder"
|
|
23407
|
+
})
|
|
22389
23408
|
},
|
|
22390
23409
|
[MODEL_TYPES.Chatterbox]: {
|
|
22391
23410
|
can_generate: true,
|
|
22392
|
-
forward: encoder_forward
|
|
23411
|
+
forward: encoder_forward,
|
|
23412
|
+
sessions: () => ({
|
|
23413
|
+
embed_tokens: "embed_tokens",
|
|
23414
|
+
speech_encoder: "speech_encoder",
|
|
23415
|
+
model: "language_model",
|
|
23416
|
+
conditional_decoder: "conditional_decoder"
|
|
23417
|
+
}),
|
|
23418
|
+
cache_sessions: { model: true },
|
|
23419
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23420
|
+
},
|
|
23421
|
+
[MODEL_TYPES.MultimodalLanguageModelOnly]: {
|
|
23422
|
+
can_generate: true,
|
|
23423
|
+
forward: image_text_to_text_forward,
|
|
23424
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
23425
|
+
sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
|
|
23426
|
+
cache_sessions: { decoder_model_merged: true },
|
|
23427
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23428
|
+
},
|
|
23429
|
+
[MODEL_TYPES.VoxtralRealtime]: {
|
|
23430
|
+
can_generate: true,
|
|
23431
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
23432
|
+
sessions: () => ({
|
|
23433
|
+
embed_tokens: "embed_tokens",
|
|
23434
|
+
audio_encoder: "audio_encoder",
|
|
23435
|
+
decoder_model_merged: "decoder_model_merged"
|
|
23436
|
+
}),
|
|
23437
|
+
cache_sessions: { decoder_model_merged: true, audio_encoder: true },
|
|
23438
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
22393
23439
|
},
|
|
22394
23440
|
default: {
|
|
22395
23441
|
can_generate: false,
|
|
22396
|
-
forward: encoder_forward
|
|
23442
|
+
forward: encoder_forward,
|
|
23443
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
22397
23444
|
}
|
|
22398
23445
|
};
|
|
23446
|
+
function getSessionsConfig(modelType, config, options = {}) {
|
|
23447
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
23448
|
+
return {
|
|
23449
|
+
sessions: typeConfig.sessions(config, options),
|
|
23450
|
+
cache_sessions: typeConfig.cache_sessions,
|
|
23451
|
+
optional_configs: typeConfig.optional_configs
|
|
23452
|
+
};
|
|
23453
|
+
}
|
|
22399
23454
|
var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
|
|
22400
23455
|
var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
|
|
22401
23456
|
var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
|
|
@@ -22481,245 +23536,23 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
22481
23536
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
|
|
22482
23537
|
const modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
22483
23538
|
config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
|
|
22484
|
-
|
|
22485
|
-
if (modelType ===
|
|
22486
|
-
|
|
22487
|
-
|
|
22488
|
-
|
|
22489
|
-
{
|
|
22490
|
-
|
|
22491
|
-
},
|
|
22492
|
-
options,
|
|
22493
|
-
"model"
|
|
22494
|
-
),
|
|
22495
|
-
get_optional_configs(
|
|
22496
|
-
pretrained_model_name_or_path,
|
|
22497
|
-
{
|
|
22498
|
-
generation_config: "generation_config.json"
|
|
22499
|
-
},
|
|
22500
|
-
options
|
|
22501
|
-
)
|
|
22502
|
-
]);
|
|
22503
|
-
} else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
|
|
22504
|
-
info = await Promise.all([
|
|
22505
|
-
constructSessions(
|
|
22506
|
-
pretrained_model_name_or_path,
|
|
22507
|
-
{
|
|
22508
|
-
model: "encoder_model",
|
|
22509
|
-
decoder_model_merged: "decoder_model_merged"
|
|
22510
|
-
},
|
|
22511
|
-
options,
|
|
22512
|
-
"decoder_model_merged"
|
|
22513
|
-
),
|
|
22514
|
-
get_optional_configs(
|
|
22515
|
-
pretrained_model_name_or_path,
|
|
22516
|
-
{
|
|
22517
|
-
generation_config: "generation_config.json"
|
|
22518
|
-
},
|
|
22519
|
-
options
|
|
22520
|
-
)
|
|
22521
|
-
]);
|
|
22522
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
22523
|
-
info = await Promise.all([
|
|
22524
|
-
constructSessions(
|
|
22525
|
-
pretrained_model_name_or_path,
|
|
22526
|
-
{
|
|
22527
|
-
model: "vision_encoder",
|
|
22528
|
-
prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder"
|
|
22529
|
-
},
|
|
22530
|
-
options
|
|
22531
|
-
)
|
|
22532
|
-
]);
|
|
22533
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
22534
|
-
info = await Promise.all([
|
|
22535
|
-
constructSessions(
|
|
22536
|
-
pretrained_model_name_or_path,
|
|
22537
|
-
{
|
|
22538
|
-
model: "encoder_model",
|
|
22539
|
-
decoder_model_merged: "decoder_model_merged"
|
|
22540
|
-
},
|
|
22541
|
-
options,
|
|
22542
|
-
"decoder_model_merged"
|
|
22543
|
-
)
|
|
22544
|
-
]);
|
|
22545
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
22546
|
-
const sessions = {
|
|
22547
|
-
embed_tokens: "embed_tokens",
|
|
22548
|
-
vision_encoder: "vision_encoder",
|
|
22549
|
-
decoder_model_merged: "decoder_model_merged"
|
|
22550
|
-
};
|
|
22551
|
-
if (config.is_encoder_decoder) {
|
|
22552
|
-
sessions["model"] = "encoder_model";
|
|
22553
|
-
}
|
|
22554
|
-
info = await Promise.all([
|
|
22555
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
22556
|
-
get_optional_configs(
|
|
22557
|
-
pretrained_model_name_or_path,
|
|
22558
|
-
{
|
|
22559
|
-
generation_config: "generation_config.json"
|
|
22560
|
-
},
|
|
22561
|
-
options
|
|
22562
|
-
)
|
|
22563
|
-
]);
|
|
22564
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
22565
|
-
const sessions = {
|
|
22566
|
-
embed_tokens: "embed_tokens",
|
|
22567
|
-
audio_encoder: "audio_encoder",
|
|
22568
|
-
decoder_model_merged: "decoder_model_merged"
|
|
22569
|
-
};
|
|
22570
|
-
info = await Promise.all([
|
|
22571
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
22572
|
-
get_optional_configs(
|
|
22573
|
-
pretrained_model_name_or_path,
|
|
22574
|
-
{
|
|
22575
|
-
generation_config: "generation_config.json"
|
|
22576
|
-
},
|
|
22577
|
-
options
|
|
22578
|
-
)
|
|
22579
|
-
]);
|
|
22580
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
22581
|
-
const sessions = {
|
|
22582
|
-
embed_tokens: "embed_tokens",
|
|
22583
|
-
audio_encoder: "audio_encoder",
|
|
22584
|
-
vision_encoder: "vision_encoder",
|
|
22585
|
-
decoder_model_merged: "decoder_model_merged"
|
|
22586
|
-
};
|
|
22587
|
-
info = await Promise.all([
|
|
22588
|
-
constructSessions(pretrained_model_name_or_path, sessions, options),
|
|
22589
|
-
get_optional_configs(
|
|
22590
|
-
pretrained_model_name_or_path,
|
|
22591
|
-
{
|
|
22592
|
-
generation_config: "generation_config.json"
|
|
22593
|
-
},
|
|
22594
|
-
options
|
|
22595
|
-
)
|
|
22596
|
-
]);
|
|
22597
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
22598
|
-
info = await Promise.all([
|
|
22599
|
-
constructSessions(
|
|
22600
|
-
pretrained_model_name_or_path,
|
|
22601
|
-
{
|
|
22602
|
-
model: "text_encoder",
|
|
22603
|
-
decoder_model_merged: "decoder_model_merged",
|
|
22604
|
-
encodec_decode: "encodec_decode"
|
|
22605
|
-
},
|
|
22606
|
-
options,
|
|
22607
|
-
"decoder_model_merged"
|
|
22608
|
-
),
|
|
22609
|
-
get_optional_configs(
|
|
22610
|
-
pretrained_model_name_or_path,
|
|
22611
|
-
{
|
|
22612
|
-
generation_config: "generation_config.json"
|
|
22613
|
-
},
|
|
22614
|
-
options
|
|
22615
|
-
)
|
|
22616
|
-
]);
|
|
22617
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
22618
|
-
info = await Promise.all([
|
|
22619
|
-
constructSessions(
|
|
22620
|
-
pretrained_model_name_or_path,
|
|
22621
|
-
{
|
|
22622
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
22623
|
-
model: "language_model",
|
|
22624
|
-
lm_head: "lm_head",
|
|
22625
|
-
gen_head: "gen_head",
|
|
22626
|
-
gen_img_embeds: "gen_img_embeds",
|
|
22627
|
-
image_decode: "image_decode"
|
|
22628
|
-
},
|
|
22629
|
-
options,
|
|
22630
|
-
"model"
|
|
22631
|
-
),
|
|
22632
|
-
get_optional_configs(
|
|
22633
|
-
pretrained_model_name_or_path,
|
|
22634
|
-
{
|
|
22635
|
-
generation_config: "generation_config.json"
|
|
22636
|
-
},
|
|
22637
|
-
options
|
|
22638
|
-
)
|
|
22639
|
-
]);
|
|
22640
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
22641
|
-
info = await Promise.all([
|
|
22642
|
-
constructSessions(
|
|
22643
|
-
pretrained_model_name_or_path,
|
|
22644
|
-
{
|
|
22645
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
22646
|
-
model: "model",
|
|
22647
|
-
vision_encoder: "vision_encoder"
|
|
22648
|
-
},
|
|
22649
|
-
options,
|
|
22650
|
-
"model"
|
|
22651
|
-
),
|
|
22652
|
-
get_optional_configs(
|
|
22653
|
-
pretrained_model_name_or_path,
|
|
22654
|
-
{
|
|
22655
|
-
generation_config: "generation_config.json"
|
|
22656
|
-
},
|
|
22657
|
-
options
|
|
22658
|
-
)
|
|
22659
|
-
]);
|
|
22660
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
22661
|
-
info = await Promise.all([
|
|
22662
|
-
constructSessions(
|
|
22663
|
-
pretrained_model_name_or_path,
|
|
22664
|
-
{
|
|
22665
|
-
embed_tokens: "embed_tokens",
|
|
22666
|
-
speech_encoder: "speech_encoder",
|
|
22667
|
-
model: "language_model",
|
|
22668
|
-
conditional_decoder: "conditional_decoder"
|
|
22669
|
-
},
|
|
22670
|
-
options,
|
|
22671
|
-
"model"
|
|
22672
|
-
),
|
|
22673
|
-
get_optional_configs(
|
|
22674
|
-
pretrained_model_name_or_path,
|
|
22675
|
-
{
|
|
22676
|
-
generation_config: "generation_config.json"
|
|
22677
|
-
},
|
|
22678
|
-
options
|
|
22679
|
-
)
|
|
22680
|
-
]);
|
|
22681
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
22682
|
-
info = await Promise.all([
|
|
22683
|
-
constructSessions(
|
|
22684
|
-
pretrained_model_name_or_path,
|
|
22685
|
-
{
|
|
22686
|
-
encoder_model: "encoder_model",
|
|
22687
|
-
decoder_model: "decoder_model"
|
|
22688
|
-
},
|
|
22689
|
-
options
|
|
22690
|
-
)
|
|
22691
|
-
]);
|
|
22692
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
22693
|
-
info = await Promise.all([
|
|
22694
|
-
constructSessions(
|
|
22695
|
-
pretrained_model_name_or_path,
|
|
22696
|
-
{
|
|
22697
|
-
text_encoder: "text_encoder",
|
|
22698
|
-
latent_denoiser: "latent_denoiser",
|
|
22699
|
-
voice_decoder: "voice_decoder"
|
|
22700
|
-
},
|
|
22701
|
-
options
|
|
22702
|
-
)
|
|
22703
|
-
]);
|
|
22704
|
-
} else {
|
|
22705
|
-
if (modelType === void 0) {
|
|
22706
|
-
const type = modelName ?? config?.model_type;
|
|
22707
|
-
if (type !== "custom") {
|
|
22708
|
-
logger.warn(
|
|
22709
|
-
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
22710
|
-
);
|
|
22711
|
-
}
|
|
23539
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
23540
|
+
if (modelType === void 0) {
|
|
23541
|
+
const type = modelName ?? config?.model_type;
|
|
23542
|
+
if (type !== "custom") {
|
|
23543
|
+
logger.warn(
|
|
23544
|
+
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
23545
|
+
);
|
|
22712
23546
|
}
|
|
22713
|
-
info = await Promise.all([
|
|
22714
|
-
constructSessions(
|
|
22715
|
-
pretrained_model_name_or_path,
|
|
22716
|
-
{
|
|
22717
|
-
model: options.model_file_name ?? "model"
|
|
22718
|
-
},
|
|
22719
|
-
options
|
|
22720
|
-
)
|
|
22721
|
-
]);
|
|
22722
23547
|
}
|
|
23548
|
+
const sessions = typeConfig.sessions(config, options);
|
|
23549
|
+
const promises = [
|
|
23550
|
+
constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
|
|
23551
|
+
];
|
|
23552
|
+
if (typeConfig.optional_configs) {
|
|
23553
|
+
promises.push(get_optional_configs(pretrained_model_name_or_path, typeConfig.optional_configs, options));
|
|
23554
|
+
}
|
|
23555
|
+
const info = await Promise.all(promises);
|
|
22723
23556
|
return new this(config, ...info);
|
|
22724
23557
|
}
|
|
22725
23558
|
/**
|
|
@@ -22918,7 +23751,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
22918
23751
|
* @param {Tensor} [params.inputs=null]
|
|
22919
23752
|
* @param {number} [params.bos_token_id=null]
|
|
22920
23753
|
* @param {Record<string, Tensor|number[]>} [params.model_kwargs]
|
|
22921
|
-
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor
|
|
23754
|
+
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor> & {past_key_values?: DynamicCache}, model_input_name: string}} The model-specific inputs for generation.
|
|
22922
23755
|
*/
|
|
22923
23756
|
_prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) {
|
|
22924
23757
|
const model_inputs = pick(model_kwargs, this.forward_params);
|
|
@@ -23159,11 +23992,12 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23159
23992
|
}
|
|
23160
23993
|
}
|
|
23161
23994
|
/**
|
|
23162
|
-
* Returns
|
|
23995
|
+
* Returns a DynamicCache containing past key values from the given decoder results object.
|
|
23163
23996
|
*
|
|
23164
23997
|
* @param {Object} decoderResults The decoder results object.
|
|
23165
|
-
* @param {
|
|
23166
|
-
* @
|
|
23998
|
+
* @param {DynamicCache} pastKeyValues The previous past key values.
|
|
23999
|
+
* @param {boolean} [disposeEncoderPKVs=false] Whether to dispose encoder past key values.
|
|
24000
|
+
* @returns {DynamicCache} A new DynamicCache containing the updated past key values.
|
|
23167
24001
|
*/
|
|
23168
24002
|
getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
|
|
23169
24003
|
const pkvs = /* @__PURE__ */ Object.create(null);
|
|
@@ -23184,7 +24018,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23184
24018
|
}
|
|
23185
24019
|
}
|
|
23186
24020
|
}
|
|
23187
|
-
return pkvs;
|
|
24021
|
+
return new DynamicCache(pkvs);
|
|
23188
24022
|
}
|
|
23189
24023
|
/**
|
|
23190
24024
|
* Returns an object containing attentions from the given model output object.
|
|
@@ -23209,8 +24043,8 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23209
24043
|
/**
|
|
23210
24044
|
* Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values.
|
|
23211
24045
|
*
|
|
23212
|
-
* @param {
|
|
23213
|
-
* @param {
|
|
24046
|
+
* @param {Record<string, any>} decoderFeeds The decoder feeds object to add past key values to.
|
|
24047
|
+
* @param {DynamicCache|null} pastKeyValues The cache containing past key values.
|
|
23214
24048
|
*/
|
|
23215
24049
|
addPastKeyValues(decoderFeeds, pastKeyValues) {
|
|
23216
24050
|
if (pastKeyValues) {
|
|
@@ -23227,14 +24061,29 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23227
24061
|
}
|
|
23228
24062
|
}
|
|
23229
24063
|
}
|
|
23230
|
-
|
|
23231
|
-
|
|
24064
|
+
/**
|
|
24065
|
+
* Helper function to select valid inputs and run through the appropriate encoder (vision, text, audio) based on the input type.
|
|
24066
|
+
* @param {string} sessionName
|
|
24067
|
+
* @param {Record<string, Tensor>} inputs
|
|
24068
|
+
* @param {string} outputName
|
|
24069
|
+
* @private
|
|
24070
|
+
*/
|
|
24071
|
+
async _encode_input(sessionName, inputs, outputName) {
|
|
24072
|
+
if (!Object.hasOwn(this.sessions, sessionName)) {
|
|
24073
|
+
throw new Error(`Model does not have a ${sessionName} session.`);
|
|
24074
|
+
}
|
|
24075
|
+
const session = this.sessions[sessionName];
|
|
24076
|
+
const output = await sessionRun(session, pick(inputs, session.inputNames));
|
|
24077
|
+
return output[outputName];
|
|
24078
|
+
}
|
|
24079
|
+
async encode_image(inputs) {
|
|
24080
|
+
return this._encode_input("vision_encoder", inputs, "image_features");
|
|
23232
24081
|
}
|
|
23233
|
-
async encode_text(
|
|
23234
|
-
return
|
|
24082
|
+
async encode_text(inputs) {
|
|
24083
|
+
return this._encode_input("embed_tokens", inputs, "inputs_embeds");
|
|
23235
24084
|
}
|
|
23236
|
-
async encode_audio(
|
|
23237
|
-
return
|
|
24085
|
+
async encode_audio(inputs) {
|
|
24086
|
+
return this._encode_input("audio_encoder", inputs, "audio_features");
|
|
23238
24087
|
}
|
|
23239
24088
|
};
|
|
23240
24089
|
async function seq2seq_forward(self2, model_inputs) {
|
|
@@ -23289,6 +24138,9 @@ async function decoder_forward(self2, model_inputs, is_encoder_decoder = false)
|
|
|
23289
24138
|
const start_index = ["paligemma", "gemma3_text", "gemma3"].includes(self2.config.model_type) ? 1 : 0;
|
|
23290
24139
|
new_model_inputs.position_ids = create_position_ids(new_model_inputs, past_key_values, start_index);
|
|
23291
24140
|
}
|
|
24141
|
+
if (session.inputNames.includes("num_logits_to_keep") && !new_model_inputs.num_logits_to_keep) {
|
|
24142
|
+
new_model_inputs.num_logits_to_keep = new Tensor2("int64", [0n], []);
|
|
24143
|
+
}
|
|
23292
24144
|
self2.addPastKeyValues(new_model_inputs, past_key_values);
|
|
23293
24145
|
const fixed = pick(new_model_inputs, session.inputNames);
|
|
23294
24146
|
return await sessionRun(session, fixed);
|
|
@@ -23297,7 +24149,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
23297
24149
|
// Generic parameters:
|
|
23298
24150
|
encode_function,
|
|
23299
24151
|
merge_function,
|
|
23300
|
-
|
|
24152
|
+
modality_input_names,
|
|
23301
24153
|
modality_output_name,
|
|
23302
24154
|
// Produced by the tokenizer/processor:
|
|
23303
24155
|
input_ids = null,
|
|
@@ -23312,32 +24164,34 @@ async function generic_text_to_text_forward(self2, {
|
|
|
23312
24164
|
// Additional parameters
|
|
23313
24165
|
...kwargs
|
|
23314
24166
|
}) {
|
|
23315
|
-
const modality_values = kwargs[modality_input_name];
|
|
23316
24167
|
if (!inputs_embeds) {
|
|
23317
24168
|
inputs_embeds = await self2.encode_text({ input_ids, ...kwargs });
|
|
23318
|
-
|
|
23319
|
-
|
|
23320
|
-
|
|
23321
|
-
|
|
23322
|
-
|
|
23323
|
-
|
|
23324
|
-
|
|
23325
|
-
|
|
23326
|
-
|
|
23327
|
-
inputs_embeds,
|
|
23328
|
-
|
|
23329
|
-
|
|
23330
|
-
|
|
23331
|
-
|
|
23332
|
-
|
|
23333
|
-
|
|
23334
|
-
|
|
23335
|
-
|
|
23336
|
-
|
|
23337
|
-
|
|
23338
|
-
|
|
23339
|
-
|
|
23340
|
-
|
|
24169
|
+
const modality_values = pick(kwargs, modality_input_names);
|
|
24170
|
+
if (Object.keys(modality_values).length > 0) {
|
|
24171
|
+
if (input_ids.dims[1] !== 1) {
|
|
24172
|
+
const modality_features = await encode_function({
|
|
24173
|
+
// Pass the modality values under its expected key.
|
|
24174
|
+
// The caller knows whether this is audio or image.
|
|
24175
|
+
...modality_values,
|
|
24176
|
+
...kwargs
|
|
24177
|
+
});
|
|
24178
|
+
({ inputs_embeds, attention_mask } = merge_function({
|
|
24179
|
+
[modality_output_name]: modality_features,
|
|
24180
|
+
inputs_embeds,
|
|
24181
|
+
input_ids,
|
|
24182
|
+
attention_mask
|
|
24183
|
+
}));
|
|
24184
|
+
} else if (past_key_values && input_ids.dims[1] === 1) {
|
|
24185
|
+
const target_length = input_ids.dims[1];
|
|
24186
|
+
const past_length = past_key_values.get_seq_length();
|
|
24187
|
+
attention_mask = cat(
|
|
24188
|
+
[
|
|
24189
|
+
ones([input_ids.dims[0], past_length]),
|
|
24190
|
+
attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
|
|
24191
|
+
],
|
|
24192
|
+
1
|
|
24193
|
+
);
|
|
24194
|
+
}
|
|
23341
24195
|
}
|
|
23342
24196
|
}
|
|
23343
24197
|
if (!position_ids) {
|
|
@@ -23345,10 +24199,13 @@ async function generic_text_to_text_forward(self2, {
|
|
|
23345
24199
|
// Handle special case for qwen vl models
|
|
23346
24200
|
[
|
|
23347
24201
|
"qwen2_vl",
|
|
24202
|
+
"qwen2_vl_text",
|
|
23348
24203
|
"qwen2_5_vl",
|
|
23349
24204
|
"qwen2_5_vl_text",
|
|
23350
24205
|
"qwen3_vl",
|
|
23351
24206
|
"qwen3_vl_text",
|
|
24207
|
+
"qwen3_vl_moe",
|
|
24208
|
+
"qwen3_vl_moe_text",
|
|
23352
24209
|
"qwen3_5",
|
|
23353
24210
|
"qwen3_5_text",
|
|
23354
24211
|
"qwen3_5_moe",
|
|
@@ -23376,7 +24233,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
23376
24233
|
async function audio_text_to_text_forward(self2, params) {
|
|
23377
24234
|
return await generic_text_to_text_forward(self2, {
|
|
23378
24235
|
...params,
|
|
23379
|
-
|
|
24236
|
+
modality_input_names: ["audio_values", "input_features"],
|
|
23380
24237
|
modality_output_name: "audio_features",
|
|
23381
24238
|
encode_function: self2.encode_audio.bind(self2),
|
|
23382
24239
|
merge_function: self2._merge_input_ids_with_audio_features.bind(self2)
|
|
@@ -23385,7 +24242,7 @@ async function audio_text_to_text_forward(self2, params) {
|
|
|
23385
24242
|
async function image_text_to_text_forward(self2, params) {
|
|
23386
24243
|
return await generic_text_to_text_forward(self2, {
|
|
23387
24244
|
...params,
|
|
23388
|
-
|
|
24245
|
+
modality_input_names: ["pixel_values"],
|
|
23389
24246
|
modality_output_name: "image_features",
|
|
23390
24247
|
encode_function: self2.encode_image.bind(self2),
|
|
23391
24248
|
merge_function: self2._merge_input_ids_with_image_features.bind(self2)
|
|
@@ -23421,7 +24278,11 @@ function create_position_ids(model_inputs, past_key_values = null, start_index =
|
|
|
23421
24278
|
return position_ids;
|
|
23422
24279
|
}
|
|
23423
24280
|
function decoder_prepare_inputs_for_generation(self2, input_ids, model_inputs, generation_config) {
|
|
23424
|
-
const past_length = model_inputs.past_key_values ?
|
|
24281
|
+
const past_length = model_inputs.past_key_values ? model_inputs.past_key_values.get_seq_length() : 0;
|
|
24282
|
+
const session = self2.sessions["decoder_model_merged"] ?? self2.sessions["model"];
|
|
24283
|
+
if (session?.inputNames.includes("num_logits_to_keep") && !model_inputs.num_logits_to_keep) {
|
|
24284
|
+
model_inputs.num_logits_to_keep = new Tensor2("int64", [1n], []);
|
|
24285
|
+
}
|
|
23425
24286
|
if (!model_inputs.attention_mask) {
|
|
23426
24287
|
let dims;
|
|
23427
24288
|
for (const key of ["input_ids", "inputs_embeds", "position_ids"]) {
|
|
@@ -23729,6 +24590,7 @@ __export(models_exports, {
|
|
|
23729
24590
|
Gemma3ForCausalLM: () => Gemma3ForCausalLM,
|
|
23730
24591
|
Gemma3Model: () => Gemma3Model,
|
|
23731
24592
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
24593
|
+
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
23732
24594
|
Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
|
|
23733
24595
|
Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
|
|
23734
24596
|
GemmaForCausalLM: () => GemmaForCausalLM,
|
|
@@ -23746,6 +24608,7 @@ __export(models_exports, {
|
|
|
23746
24608
|
GraniteMoeHybridModel: () => GraniteMoeHybridModel,
|
|
23747
24609
|
GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
|
|
23748
24610
|
GranitePreTrainedModel: () => GranitePreTrainedModel,
|
|
24611
|
+
GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
|
|
23749
24612
|
GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
|
|
23750
24613
|
GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
|
|
23751
24614
|
GroupViTModel: () => GroupViTModel,
|
|
@@ -23767,7 +24630,6 @@ __export(models_exports, {
|
|
|
23767
24630
|
IJepaModel: () => IJepaModel,
|
|
23768
24631
|
IJepaPreTrainedModel: () => IJepaPreTrainedModel,
|
|
23769
24632
|
Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
|
|
23770
|
-
Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
|
|
23771
24633
|
JAISLMHeadModel: () => JAISLMHeadModel,
|
|
23772
24634
|
JAISModel: () => JAISModel,
|
|
23773
24635
|
JAISPreTrainedModel: () => JAISPreTrainedModel,
|
|
@@ -23781,6 +24643,7 @@ __export(models_exports, {
|
|
|
23781
24643
|
Lfm2MoeModel: () => Lfm2MoeModel,
|
|
23782
24644
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
23783
24645
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
24646
|
+
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
23784
24647
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
23785
24648
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
23786
24649
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -23920,7 +24783,6 @@ __export(models_exports, {
|
|
|
23920
24783
|
Owlv2Model: () => Owlv2Model,
|
|
23921
24784
|
Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
|
|
23922
24785
|
PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
|
|
23923
|
-
PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
|
|
23924
24786
|
ParakeetForCTC: () => ParakeetForCTC,
|
|
23925
24787
|
ParakeetPreTrainedModel: () => ParakeetPreTrainedModel,
|
|
23926
24788
|
PatchTSMixerForPrediction: () => PatchTSMixerForPrediction,
|
|
@@ -23950,8 +24812,10 @@ __export(models_exports, {
|
|
|
23950
24812
|
Qwen2MoeModel: () => Qwen2MoeModel,
|
|
23951
24813
|
Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
|
|
23952
24814
|
Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
|
|
24815
|
+
Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
|
|
23953
24816
|
Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
|
|
23954
24817
|
Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
|
|
24818
|
+
Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
|
|
23955
24819
|
Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
|
|
23956
24820
|
Qwen3ForCausalLM: () => Qwen3ForCausalLM,
|
|
23957
24821
|
Qwen3Model: () => Qwen3Model,
|
|
@@ -23962,9 +24826,13 @@ __export(models_exports, {
|
|
|
23962
24826
|
Qwen3NextModel: () => Qwen3NextModel,
|
|
23963
24827
|
Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
|
|
23964
24828
|
Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
|
|
24829
|
+
Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
|
|
23965
24830
|
Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
|
|
24831
|
+
Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
|
|
23966
24832
|
Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
|
|
24833
|
+
Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
|
|
23967
24834
|
Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
|
|
24835
|
+
Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
|
|
23968
24836
|
Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
|
|
23969
24837
|
RFDetrForObjectDetection: () => RFDetrForObjectDetection,
|
|
23970
24838
|
RFDetrModel: () => RFDetrModel,
|
|
@@ -24015,7 +24883,6 @@ __export(models_exports, {
|
|
|
24015
24883
|
SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
|
|
24016
24884
|
SmolLM3Model: () => SmolLM3Model,
|
|
24017
24885
|
SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
|
|
24018
|
-
SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
|
|
24019
24886
|
SnacDecoderModel: () => SnacDecoderModel,
|
|
24020
24887
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
24021
24888
|
SnacModel: () => SnacModel,
|
|
@@ -24087,6 +24954,8 @@ __export(models_exports, {
|
|
|
24087
24954
|
VitsModelOutput: () => VitsModelOutput,
|
|
24088
24955
|
VitsPreTrainedModel: () => VitsPreTrainedModel,
|
|
24089
24956
|
VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
|
|
24957
|
+
VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
|
|
24958
|
+
VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
|
|
24090
24959
|
Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
|
|
24091
24960
|
Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
|
|
24092
24961
|
Wav2Vec2BertModel: () => Wav2Vec2BertModel,
|
|
@@ -24447,7 +25316,7 @@ var ChatterboxModel = class extends ChatterboxPreTrainedModel {
|
|
|
24447
25316
|
if (!past_key_values || target_length !== 1) {
|
|
24448
25317
|
throw new Error("Incorrect state encountered during generation.");
|
|
24449
25318
|
}
|
|
24450
|
-
const past_length =
|
|
25319
|
+
const past_length = past_key_values.get_seq_length();
|
|
24451
25320
|
attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
|
|
24452
25321
|
}
|
|
24453
25322
|
}
|
|
@@ -25477,6 +26346,8 @@ var Gemma3nForConditionalGeneration = class extends Gemma3nPreTrainedModel {
|
|
|
25477
26346
|
});
|
|
25478
26347
|
}
|
|
25479
26348
|
};
|
|
26349
|
+
var Gemma3nForCausalLM = class extends Gemma3nForConditionalGeneration {
|
|
26350
|
+
};
|
|
25480
26351
|
|
|
25481
26352
|
// src/models/glm/modeling_glm.js
|
|
25482
26353
|
var GlmPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -25558,6 +26429,28 @@ var GraniteMoeHybridModel = class extends GraniteMoeHybridPreTrainedModel {
|
|
|
25558
26429
|
var GraniteMoeHybridForCausalLM = class extends GraniteMoeHybridPreTrainedModel {
|
|
25559
26430
|
};
|
|
25560
26431
|
|
|
26432
|
+
// src/models/ultravox/modeling_ultravox.js
|
|
26433
|
+
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
26434
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
26435
|
+
};
|
|
26436
|
+
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
26437
|
+
_merge_input_ids_with_audio_features(kwargs) {
|
|
26438
|
+
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
26439
|
+
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
26440
|
+
return default_merge_input_ids_with_audio_features({
|
|
26441
|
+
// @ts-ignore
|
|
26442
|
+
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id ?? this.config.audio_token_index,
|
|
26443
|
+
...kwargs,
|
|
26444
|
+
audio_features: reshaped_audio_features
|
|
26445
|
+
});
|
|
26446
|
+
}
|
|
26447
|
+
};
|
|
26448
|
+
|
|
26449
|
+
// src/models/granite_speech/modeling_granite_speech.js
|
|
26450
|
+
var GraniteSpeechForConditionalGeneration = class extends UltravoxModel {
|
|
26451
|
+
forward_params = ["input_ids", "attention_mask", "input_features", "past_key_values"];
|
|
26452
|
+
};
|
|
26453
|
+
|
|
25561
26454
|
// src/models/grounding_dino/modeling_grounding_dino.js
|
|
25562
26455
|
var GroundingDinoPreTrainedModel = class extends PreTrainedModel {
|
|
25563
26456
|
};
|
|
@@ -25662,34 +26555,37 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
|
|
|
25662
26555
|
var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
|
|
25663
26556
|
};
|
|
25664
26557
|
|
|
25665
|
-
// src/models/
|
|
25666
|
-
var
|
|
25667
|
-
forward_params = [
|
|
25668
|
-
"input_ids",
|
|
25669
|
-
"attention_mask",
|
|
25670
|
-
"pixel_values",
|
|
25671
|
-
"pixel_attention_mask",
|
|
25672
|
-
"position_ids",
|
|
25673
|
-
"past_key_values"
|
|
25674
|
-
];
|
|
26558
|
+
// src/models/llava/modeling_llava.js
|
|
26559
|
+
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
26560
|
+
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
25675
26561
|
};
|
|
25676
|
-
var
|
|
25677
|
-
async encode_image({ pixel_values, pixel_attention_mask }) {
|
|
25678
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], { pixel_values, pixel_attention_mask })).image_features;
|
|
25679
|
-
return features;
|
|
25680
|
-
}
|
|
26562
|
+
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
25681
26563
|
_merge_input_ids_with_image_features(kwargs) {
|
|
25682
26564
|
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
25683
26565
|
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
25684
26566
|
return default_merge_input_ids_with_image_features({
|
|
25685
26567
|
// @ts-ignore
|
|
25686
|
-
image_token_id: this.config.image_token_id,
|
|
26568
|
+
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
25687
26569
|
...kwargs,
|
|
25688
26570
|
image_features: reshaped_image_hidden_states
|
|
25689
26571
|
});
|
|
25690
26572
|
}
|
|
25691
26573
|
};
|
|
25692
|
-
var
|
|
26574
|
+
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
26575
|
+
};
|
|
26576
|
+
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
26577
|
+
};
|
|
26578
|
+
|
|
26579
|
+
// src/models/idefics3/modeling_idefics3.js
|
|
26580
|
+
var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
26581
|
+
forward_params = [
|
|
26582
|
+
"input_ids",
|
|
26583
|
+
"attention_mask",
|
|
26584
|
+
"pixel_values",
|
|
26585
|
+
"pixel_attention_mask",
|
|
26586
|
+
"position_ids",
|
|
26587
|
+
"past_key_values"
|
|
26588
|
+
];
|
|
25693
26589
|
};
|
|
25694
26590
|
|
|
25695
26591
|
// src/models/ijepa/modeling_ijepa.js
|
|
@@ -25781,6 +26677,19 @@ var Lfm2MoeModel = class extends Lfm2MoePreTrainedModel {
|
|
|
25781
26677
|
var Lfm2MoeForCausalLM = class extends Lfm2MoePreTrainedModel {
|
|
25782
26678
|
};
|
|
25783
26679
|
|
|
26680
|
+
// src/models/lfm2_vl/modeling_lfm2_vl.js
|
|
26681
|
+
var Lfm2VlForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
26682
|
+
forward_params = [
|
|
26683
|
+
"input_ids",
|
|
26684
|
+
"attention_mask",
|
|
26685
|
+
"pixel_values",
|
|
26686
|
+
"pixel_attention_mask",
|
|
26687
|
+
"spatial_shapes",
|
|
26688
|
+
"position_ids",
|
|
26689
|
+
"past_key_values"
|
|
26690
|
+
];
|
|
26691
|
+
};
|
|
26692
|
+
|
|
25784
26693
|
// src/models/llama/modeling_llama.js
|
|
25785
26694
|
var LlamaPreTrainedModel = class extends PreTrainedModel {
|
|
25786
26695
|
};
|
|
@@ -25795,27 +26704,6 @@ var Llama4PreTrainedModel = class extends PreTrainedModel {
|
|
|
25795
26704
|
var Llama4ForCausalLM = class extends Llama4PreTrainedModel {
|
|
25796
26705
|
};
|
|
25797
26706
|
|
|
25798
|
-
// src/models/llava/modeling_llava.js
|
|
25799
|
-
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
25800
|
-
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
25801
|
-
};
|
|
25802
|
-
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
25803
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
25804
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
25805
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
25806
|
-
return default_merge_input_ids_with_image_features({
|
|
25807
|
-
// @ts-ignore
|
|
25808
|
-
image_token_id: this.config.image_token_index,
|
|
25809
|
-
...kwargs,
|
|
25810
|
-
image_features: reshaped_image_hidden_states
|
|
25811
|
-
});
|
|
25812
|
-
}
|
|
25813
|
-
};
|
|
25814
|
-
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
25815
|
-
};
|
|
25816
|
-
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
25817
|
-
};
|
|
25818
|
-
|
|
25819
26707
|
// src/models/longt5/modeling_longt5.js
|
|
25820
26708
|
var LongT5PreTrainedModel = class extends PreTrainedModel {
|
|
25821
26709
|
};
|
|
@@ -26566,27 +27454,7 @@ var OwlViTForObjectDetection = class extends OwlViTPreTrainedModel {
|
|
|
26566
27454
|
};
|
|
26567
27455
|
|
|
26568
27456
|
// src/models/paligemma/modeling_paligemma.js
|
|
26569
|
-
var
|
|
26570
|
-
forward_params = [
|
|
26571
|
-
"input_ids",
|
|
26572
|
-
// 'inputs_embeds',
|
|
26573
|
-
"attention_mask",
|
|
26574
|
-
"pixel_values",
|
|
26575
|
-
"position_ids",
|
|
26576
|
-
"past_key_values"
|
|
26577
|
-
];
|
|
26578
|
-
};
|
|
26579
|
-
var PaliGemmaForConditionalGeneration = class extends PaliGemmaPreTrainedModel {
|
|
26580
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
26581
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
26582
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
26583
|
-
return default_merge_input_ids_with_image_features({
|
|
26584
|
-
// @ts-ignore
|
|
26585
|
-
image_token_id: this.config.image_token_index,
|
|
26586
|
-
...kwargs,
|
|
26587
|
-
image_features: reshaped_image_hidden_states
|
|
26588
|
-
});
|
|
26589
|
-
}
|
|
27457
|
+
var PaliGemmaForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
26590
27458
|
};
|
|
26591
27459
|
|
|
26592
27460
|
// src/models/parakeet/modeling_parakeet.js
|
|
@@ -26759,6 +27627,9 @@ var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
|
26759
27627
|
];
|
|
26760
27628
|
};
|
|
26761
27629
|
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
27630
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
27631
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
27632
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
26762
27633
|
image_grid_thw_name = "grid_thw";
|
|
26763
27634
|
/**
|
|
26764
27635
|
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
@@ -26948,7 +27819,7 @@ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
|
26948
27819
|
);
|
|
26949
27820
|
} else {
|
|
26950
27821
|
model_inputs.pixel_values = null;
|
|
26951
|
-
const past_length =
|
|
27822
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
26952
27823
|
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
26953
27824
|
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
26954
27825
|
model_inputs.input_ids,
|
|
@@ -26977,11 +27848,16 @@ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
|
26977
27848
|
return model_inputs;
|
|
26978
27849
|
}
|
|
26979
27850
|
};
|
|
27851
|
+
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
27852
|
+
};
|
|
26980
27853
|
|
|
26981
27854
|
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
26982
27855
|
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
26983
27856
|
image_grid_thw_name = "image_grid_thw";
|
|
26984
27857
|
};
|
|
27858
|
+
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
27859
|
+
image_grid_thw_name = "image_grid_thw";
|
|
27860
|
+
};
|
|
26985
27861
|
|
|
26986
27862
|
// src/models/qwen3/modeling_qwen3.js
|
|
26987
27863
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
@@ -27010,18 +27886,26 @@ var Qwen3NextForCausalLM = class extends Qwen3NextPreTrainedModel {
|
|
|
27010
27886
|
// src/models/qwen3_vl/modeling_qwen3_vl.js
|
|
27011
27887
|
var Qwen3VLForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
27012
27888
|
};
|
|
27889
|
+
var Qwen3VLForCausalLM = class extends Qwen2_5_VLForCausalLM {
|
|
27890
|
+
};
|
|
27013
27891
|
|
|
27014
27892
|
// src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js
|
|
27015
27893
|
var Qwen3VLMoeForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
27016
27894
|
};
|
|
27895
|
+
var Qwen3VLMoeForCausalLM = class extends Qwen3VLForCausalLM {
|
|
27896
|
+
};
|
|
27017
27897
|
|
|
27018
27898
|
// src/models/qwen3_5/modeling_qwen3_5.js
|
|
27019
27899
|
var Qwen3_5ForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
27020
27900
|
};
|
|
27901
|
+
var Qwen3_5ForCausalLM = class extends Qwen3_5ForConditionalGeneration {
|
|
27902
|
+
};
|
|
27021
27903
|
|
|
27022
27904
|
// src/models/qwen3_5_moe/modeling_qwen3_5_moe.js
|
|
27023
27905
|
var Qwen3_5MoeForConditionalGeneration = class extends Qwen3_5ForConditionalGeneration {
|
|
27024
27906
|
};
|
|
27907
|
+
var Qwen3_5MoeForCausalLM = class extends Qwen3_5ForCausalLM {
|
|
27908
|
+
};
|
|
27025
27909
|
|
|
27026
27910
|
// src/models/resnet/modeling_resnet.js
|
|
27027
27911
|
var ResNetPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -27702,25 +28586,6 @@ var TrOCRPreTrainedModel = class extends PreTrainedModel {
|
|
|
27702
28586
|
var TrOCRForCausalLM = class extends TrOCRPreTrainedModel {
|
|
27703
28587
|
};
|
|
27704
28588
|
|
|
27705
|
-
// src/models/ultravox/modeling_ultravox.js
|
|
27706
|
-
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
27707
|
-
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
27708
|
-
};
|
|
27709
|
-
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
27710
|
-
_merge_input_ids_with_audio_features(kwargs) {
|
|
27711
|
-
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
27712
|
-
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
27713
|
-
return default_merge_input_ids_with_audio_features({
|
|
27714
|
-
// @ts-ignore
|
|
27715
|
-
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id,
|
|
27716
|
-
...kwargs,
|
|
27717
|
-
audio_features: reshaped_audio_features
|
|
27718
|
-
});
|
|
27719
|
-
}
|
|
27720
|
-
};
|
|
27721
|
-
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
27722
|
-
};
|
|
27723
|
-
|
|
27724
28589
|
// src/models/unispeech/modeling_unispeech.js
|
|
27725
28590
|
var UniSpeechPreTrainedModel = class extends PreTrainedModel {
|
|
27726
28591
|
};
|
|
@@ -27886,6 +28751,170 @@ var VitsModel = class extends VitsPreTrainedModel {
|
|
|
27886
28751
|
}
|
|
27887
28752
|
};
|
|
27888
28753
|
|
|
28754
|
+
// src/models/voxtral/modeling_voxtral.js
|
|
28755
|
+
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
28756
|
+
};
|
|
28757
|
+
|
|
28758
|
+
// src/models/voxtral_realtime/modeling_voxtral_realtime.js
|
|
28759
|
+
var CONV1_LEFT_PAD = 2;
|
|
28760
|
+
var CONV2_LEFT_PAD = 1;
|
|
28761
|
+
var states = /* @__PURE__ */ new WeakMap();
|
|
28762
|
+
function createEncoderState(model, input_features) {
|
|
28763
|
+
const { text_config, audio_config } = (
|
|
28764
|
+
/** @type {any} */
|
|
28765
|
+
model.config
|
|
28766
|
+
);
|
|
28767
|
+
const encoder_session = model.sessions["audio_encoder"];
|
|
28768
|
+
const { num_mel_bins, hidden_size: enc_hidden_size } = audio_config;
|
|
28769
|
+
const PADDING_CACHE_CHANNELS = num_mel_bins + enc_hidden_size;
|
|
28770
|
+
const enc_kv_cache = new DynamicCache();
|
|
28771
|
+
const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? "float32";
|
|
28772
|
+
const enc_cls = enc_dtype === "float16" ? DataTypeMap.float16 : DataTypeMap.float32;
|
|
28773
|
+
const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
|
|
28774
|
+
for (const name in enc_shapes) {
|
|
28775
|
+
const size = enc_shapes[name].reduce((a, b) => a * b, 1);
|
|
28776
|
+
enc_kv_cache[name] = new Tensor2(enc_dtype, new enc_cls(size), enc_shapes[name]);
|
|
28777
|
+
}
|
|
28778
|
+
const enc_padding_cache = new Tensor2(enc_dtype, new enc_cls(PADDING_CACHE_CHANNELS * CONV1_LEFT_PAD), [
|
|
28779
|
+
1,
|
|
28780
|
+
PADDING_CACHE_CHANNELS,
|
|
28781
|
+
CONV1_LEFT_PAD
|
|
28782
|
+
]);
|
|
28783
|
+
const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
|
|
28784
|
+
if (!chunks_iter) {
|
|
28785
|
+
throw new Error("input_features must be iterable or async iterable");
|
|
28786
|
+
}
|
|
28787
|
+
return {
|
|
28788
|
+
encoder_session,
|
|
28789
|
+
enc_kv_cache,
|
|
28790
|
+
enc_padding_cache,
|
|
28791
|
+
enc_past_seq_len: 0,
|
|
28792
|
+
audio_embed_queue: [],
|
|
28793
|
+
audio_embed_total_tokens: 0,
|
|
28794
|
+
audio_queue_offset: 0,
|
|
28795
|
+
audio_consumed: 0,
|
|
28796
|
+
stream_exhausted: false,
|
|
28797
|
+
chunks_iter,
|
|
28798
|
+
text_hidden_size: text_config.hidden_size
|
|
28799
|
+
};
|
|
28800
|
+
}
|
|
28801
|
+
async function encodeChunk(s, chunk_features) {
|
|
28802
|
+
const audio_seq_len = chunk_features.dims[2];
|
|
28803
|
+
const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1;
|
|
28804
|
+
const position_ids = new Tensor2(
|
|
28805
|
+
"int64",
|
|
28806
|
+
BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
|
|
28807
|
+
[1, conv2_output_len]
|
|
28808
|
+
);
|
|
28809
|
+
const total_seq_len = s.enc_past_seq_len + conv2_output_len;
|
|
28810
|
+
const attention_mask = ones([1, total_seq_len]);
|
|
28811
|
+
const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
|
|
28812
|
+
input_features: chunk_features,
|
|
28813
|
+
attention_mask,
|
|
28814
|
+
position_ids,
|
|
28815
|
+
past_padding_cache: s.enc_padding_cache,
|
|
28816
|
+
...s.enc_kv_cache
|
|
28817
|
+
});
|
|
28818
|
+
if (s.enc_padding_cache.location === "gpu-buffer") {
|
|
28819
|
+
s.enc_padding_cache.dispose();
|
|
28820
|
+
}
|
|
28821
|
+
s.enc_padding_cache = present_padding_cache;
|
|
28822
|
+
for (const name in present_cache) {
|
|
28823
|
+
if (name.startsWith("present.")) {
|
|
28824
|
+
const pastName = name.replace("present", "past_key_values");
|
|
28825
|
+
const prev = s.enc_kv_cache[pastName];
|
|
28826
|
+
if (prev?.location === "gpu-buffer") {
|
|
28827
|
+
prev.dispose();
|
|
28828
|
+
}
|
|
28829
|
+
s.enc_kv_cache[pastName] = present_cache[name];
|
|
28830
|
+
}
|
|
28831
|
+
}
|
|
28832
|
+
s.enc_past_seq_len = total_seq_len;
|
|
28833
|
+
return audio_embeds;
|
|
28834
|
+
}
|
|
28835
|
+
async function fillAudioBuffer(s, needed) {
|
|
28836
|
+
while (s.audio_embed_total_tokens < needed && !s.stream_exhausted) {
|
|
28837
|
+
const result = await s.chunks_iter.next();
|
|
28838
|
+
if (result.done) {
|
|
28839
|
+
s.stream_exhausted = true;
|
|
28840
|
+
break;
|
|
28841
|
+
}
|
|
28842
|
+
const new_embeds = await encodeChunk(s, result.value);
|
|
28843
|
+
s.audio_embed_queue.push({ data: new_embeds.data, tokens: new_embeds.dims[1] });
|
|
28844
|
+
s.audio_embed_total_tokens += new_embeds.dims[1];
|
|
28845
|
+
}
|
|
28846
|
+
}
|
|
28847
|
+
function addAudioEmbeddings(s, inputs_embeds, current_len) {
|
|
28848
|
+
if (s.audio_embed_queue.length === 0) return;
|
|
28849
|
+
const embed_data = inputs_embeds.data;
|
|
28850
|
+
let embed_write_pos = 0;
|
|
28851
|
+
let remaining = current_len;
|
|
28852
|
+
while (remaining > 0 && s.audio_embed_queue.length > 0) {
|
|
28853
|
+
const front = s.audio_embed_queue[0];
|
|
28854
|
+
const available = front.tokens - s.audio_queue_offset;
|
|
28855
|
+
const n = Math.min(remaining, available);
|
|
28856
|
+
const src_offset = s.audio_queue_offset * s.text_hidden_size;
|
|
28857
|
+
for (let i = 0; i < n * s.text_hidden_size; ++i) {
|
|
28858
|
+
embed_data[embed_write_pos * s.text_hidden_size + i] += front.data[src_offset + i];
|
|
28859
|
+
}
|
|
28860
|
+
embed_write_pos += n;
|
|
28861
|
+
remaining -= n;
|
|
28862
|
+
s.audio_queue_offset += n;
|
|
28863
|
+
if (s.audio_queue_offset >= front.tokens) {
|
|
28864
|
+
s.audio_embed_queue.shift();
|
|
28865
|
+
s.audio_queue_offset = 0;
|
|
28866
|
+
}
|
|
28867
|
+
}
|
|
28868
|
+
s.audio_consumed += current_len - remaining;
|
|
28869
|
+
}
|
|
28870
|
+
var AudioExhaustedCriteria = class extends StoppingCriteria {
|
|
28871
|
+
constructor(enc_state) {
|
|
28872
|
+
super();
|
|
28873
|
+
this._s = enc_state;
|
|
28874
|
+
}
|
|
28875
|
+
_call(input_ids) {
|
|
28876
|
+
const done = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
|
|
28877
|
+
return input_ids.map(() => done);
|
|
28878
|
+
}
|
|
28879
|
+
};
|
|
28880
|
+
var VoxtralRealtimePreTrainedModel = class extends PreTrainedModel {
|
|
28881
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "past_key_values"];
|
|
28882
|
+
};
|
|
28883
|
+
var VoxtralRealtimeForConditionalGeneration = class extends VoxtralRealtimePreTrainedModel {
|
|
28884
|
+
async forward({ input_ids, past_key_values, ...kwargs }) {
|
|
28885
|
+
const current_len = input_ids.dims[1];
|
|
28886
|
+
const enc = states.get(this);
|
|
28887
|
+
if (enc) {
|
|
28888
|
+
await fillAudioBuffer(enc, enc.audio_consumed + current_len);
|
|
28889
|
+
}
|
|
28890
|
+
const { inputs_embeds } = await sessionRun(this.sessions["embed_tokens"], { input_ids });
|
|
28891
|
+
if (enc) {
|
|
28892
|
+
addAudioEmbeddings(enc, inputs_embeds, current_len);
|
|
28893
|
+
}
|
|
28894
|
+
const decoder_feeds = { inputs_embeds, ...kwargs };
|
|
28895
|
+
this.addPastKeyValues(decoder_feeds, past_key_values);
|
|
28896
|
+
const session = this.sessions["decoder_model_merged"];
|
|
28897
|
+
const fixed = pick(decoder_feeds, session.inputNames);
|
|
28898
|
+
return await sessionRun(session, fixed);
|
|
28899
|
+
}
|
|
28900
|
+
async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
|
|
28901
|
+
if (!input_features) {
|
|
28902
|
+
throw new Error("input_features (generator/iterable) must be provided");
|
|
28903
|
+
}
|
|
28904
|
+
const enc_state = createEncoderState(this, input_features);
|
|
28905
|
+
states.set(this, enc_state);
|
|
28906
|
+
const stopping_criteria = new StoppingCriteriaList();
|
|
28907
|
+
stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
|
|
28908
|
+
if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);
|
|
28909
|
+
try {
|
|
28910
|
+
return await super.generate({ ...kwargs, stopping_criteria });
|
|
28911
|
+
} finally {
|
|
28912
|
+
enc_state.enc_kv_cache.dispose();
|
|
28913
|
+
states.delete(this);
|
|
28914
|
+
}
|
|
28915
|
+
}
|
|
28916
|
+
};
|
|
28917
|
+
|
|
27889
28918
|
// src/models/wav2vec2_bert/modeling_wav2vec2_bert.js
|
|
27890
28919
|
var Wav2Vec2BertPreTrainedModel = class extends PreTrainedModel {
|
|
27891
28920
|
};
|
|
@@ -28639,6 +29668,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
28639
29668
|
["gemma2", "Gemma2ForCausalLM"],
|
|
28640
29669
|
["vaultgemma", "VaultGemmaForCausalLM"],
|
|
28641
29670
|
["gemma3_text", "Gemma3ForCausalLM"],
|
|
29671
|
+
["gemma3", "Gemma3ForCausalLM"],
|
|
28642
29672
|
["helium", "HeliumForCausalLM"],
|
|
28643
29673
|
["glm", "GlmForCausalLM"],
|
|
28644
29674
|
["openelm", "OpenELMForCausalLM"],
|
|
@@ -28647,6 +29677,13 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
28647
29677
|
["qwen3", "Qwen3ForCausalLM"],
|
|
28648
29678
|
["qwen3_moe", "Qwen3MoeForCausalLM"],
|
|
28649
29679
|
["qwen3_next", "Qwen3NextForCausalLM"],
|
|
29680
|
+
["qwen2_vl", "Qwen2VLForCausalLM"],
|
|
29681
|
+
["qwen2_5_vl", "Qwen2_5_VLForCausalLM"],
|
|
29682
|
+
["qwen3_vl", "Qwen3VLForCausalLM"],
|
|
29683
|
+
["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
|
|
29684
|
+
["qwen3_5", "Qwen3_5ForCausalLM"],
|
|
29685
|
+
["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
|
|
29686
|
+
["gemma3n", "Gemma3nForCausalLM"],
|
|
28650
29687
|
["phi", "PhiForCausalLM"],
|
|
28651
29688
|
["phi3", "Phi3ForCausalLM"],
|
|
28652
29689
|
["mpt", "MptForCausalLM"],
|
|
@@ -28722,6 +29759,7 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
28722
29759
|
["qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"],
|
|
28723
29760
|
["qwen3_5", "Qwen3_5ForConditionalGeneration"],
|
|
28724
29761
|
["qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"],
|
|
29762
|
+
["lfm2_vl", "Lfm2VlForConditionalGeneration"],
|
|
28725
29763
|
["idefics3", "Idefics3ForConditionalGeneration"],
|
|
28726
29764
|
["smolvlm", "SmolVLMForConditionalGeneration"],
|
|
28727
29765
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
@@ -28730,8 +29768,10 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
28730
29768
|
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
28731
29769
|
]);
|
|
28732
29770
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29771
|
+
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
28733
29772
|
["ultravox", "UltravoxModel"],
|
|
28734
|
-
["voxtral", "VoxtralForConditionalGeneration"]
|
|
29773
|
+
["voxtral", "VoxtralForConditionalGeneration"],
|
|
29774
|
+
["voxtral_realtime", "VoxtralRealtimeForConditionalGeneration"]
|
|
28735
29775
|
]);
|
|
28736
29776
|
var MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
28737
29777
|
["vision-encoder-decoder", "VisionEncoderDecoderModel"]
|
|
@@ -28914,7 +29954,19 @@ var CUSTOM_MAPPING = [
|
|
|
28914
29954
|
MODEL_TYPES.ImageAudioTextToText
|
|
28915
29955
|
],
|
|
28916
29956
|
["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
|
|
28917
|
-
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox]
|
|
29957
|
+
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
|
|
29958
|
+
["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29959
|
+
["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29960
|
+
["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29961
|
+
["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29962
|
+
["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29963
|
+
["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29964
|
+
["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
29965
|
+
[
|
|
29966
|
+
"VoxtralRealtimeForConditionalGeneration",
|
|
29967
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
29968
|
+
MODEL_TYPES.VoxtralRealtime
|
|
29969
|
+
]
|
|
28918
29970
|
];
|
|
28919
29971
|
for (const [name, model, type] of CUSTOM_MAPPING) {
|
|
28920
29972
|
MODEL_TYPE_MAPPING.set(name, type);
|
|
@@ -30592,8 +31644,18 @@ var TASK_ALIASES = Object.freeze({
|
|
|
30592
31644
|
});
|
|
30593
31645
|
|
|
30594
31646
|
// src/utils/model_registry/get_model_files.js
|
|
31647
|
+
function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
|
|
31648
|
+
if (config !== null) {
|
|
31649
|
+
return AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision });
|
|
31650
|
+
}
|
|
31651
|
+
const key = JSON.stringify([modelId, cache_dir, local_files_only, revision]);
|
|
31652
|
+
return memoizePromise(
|
|
31653
|
+
key,
|
|
31654
|
+
() => AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision })
|
|
31655
|
+
);
|
|
31656
|
+
}
|
|
30595
31657
|
async function get_model_files(modelId, { config = null, dtype: overrideDtype = null, device: overrideDevice = null, model_file_name = null } = {}) {
|
|
30596
|
-
config = await
|
|
31658
|
+
config = await get_config(modelId, { config });
|
|
30597
31659
|
const files = [
|
|
30598
31660
|
// Add config.json (always loaded)
|
|
30599
31661
|
"config.json"
|
|
@@ -30654,74 +31716,14 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
|
|
|
30654
31716
|
files.push(dataFilePath);
|
|
30655
31717
|
}
|
|
30656
31718
|
};
|
|
30657
|
-
const
|
|
30658
|
-
|
|
30659
|
-
add_model_file(
|
|
30660
|
-
|
|
30661
|
-
|
|
30662
|
-
|
|
30663
|
-
|
|
30664
|
-
|
|
30665
|
-
add_model_file("decoder_model_merged");
|
|
30666
|
-
files.push("generation_config.json");
|
|
30667
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
30668
|
-
add_model_file("model", "vision_encoder");
|
|
30669
|
-
add_model_file("prompt_encoder_mask_decoder");
|
|
30670
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
30671
|
-
add_model_file("model", "encoder_model");
|
|
30672
|
-
add_model_file("decoder_model_merged");
|
|
30673
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
30674
|
-
add_model_file("embed_tokens");
|
|
30675
|
-
add_model_file("vision_encoder");
|
|
30676
|
-
add_model_file("decoder_model_merged");
|
|
30677
|
-
if (config.is_encoder_decoder) {
|
|
30678
|
-
add_model_file("model", "encoder_model");
|
|
30679
|
-
}
|
|
30680
|
-
files.push("generation_config.json");
|
|
30681
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
30682
|
-
add_model_file("embed_tokens");
|
|
30683
|
-
add_model_file("audio_encoder");
|
|
30684
|
-
add_model_file("decoder_model_merged");
|
|
30685
|
-
files.push("generation_config.json");
|
|
30686
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
30687
|
-
add_model_file("embed_tokens");
|
|
30688
|
-
add_model_file("audio_encoder");
|
|
30689
|
-
add_model_file("vision_encoder");
|
|
30690
|
-
add_model_file("decoder_model_merged");
|
|
30691
|
-
files.push("generation_config.json");
|
|
30692
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
30693
|
-
add_model_file("model", "text_encoder");
|
|
30694
|
-
add_model_file("decoder_model_merged");
|
|
30695
|
-
add_model_file("encodec_decode");
|
|
30696
|
-
files.push("generation_config.json");
|
|
30697
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
30698
|
-
add_model_file("prepare_inputs_embeds");
|
|
30699
|
-
add_model_file("model", "language_model");
|
|
30700
|
-
add_model_file("lm_head");
|
|
30701
|
-
add_model_file("gen_head");
|
|
30702
|
-
add_model_file("gen_img_embeds");
|
|
30703
|
-
add_model_file("image_decode");
|
|
30704
|
-
files.push("generation_config.json");
|
|
30705
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
30706
|
-
add_model_file("prepare_inputs_embeds");
|
|
30707
|
-
add_model_file("model");
|
|
30708
|
-
add_model_file("vision_encoder");
|
|
30709
|
-
files.push("generation_config.json");
|
|
30710
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
30711
|
-
add_model_file("embed_tokens");
|
|
30712
|
-
add_model_file("speech_encoder");
|
|
30713
|
-
add_model_file("model", "language_model");
|
|
30714
|
-
add_model_file("conditional_decoder");
|
|
30715
|
-
files.push("generation_config.json");
|
|
30716
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
30717
|
-
add_model_file("encoder_model");
|
|
30718
|
-
add_model_file("decoder_model");
|
|
30719
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
30720
|
-
add_model_file("text_encoder");
|
|
30721
|
-
add_model_file("latent_denoiser");
|
|
30722
|
-
add_model_file("voice_decoder");
|
|
30723
|
-
} else {
|
|
30724
|
-
add_model_file("model", singleModelName);
|
|
31719
|
+
const { sessions, optional_configs } = getSessionsConfig(modelType, config, { model_file_name });
|
|
31720
|
+
for (const [sessionKey, baseName] of Object.entries(sessions)) {
|
|
31721
|
+
add_model_file(sessionKey, baseName);
|
|
31722
|
+
}
|
|
31723
|
+
if (optional_configs) {
|
|
31724
|
+
for (const configFile of Object.values(optional_configs)) {
|
|
31725
|
+
files.push(configFile);
|
|
31726
|
+
}
|
|
30725
31727
|
}
|
|
30726
31728
|
return files;
|
|
30727
31729
|
}
|
|
@@ -31172,25 +32174,25 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
|
|
|
31172
32174
|
|
|
31173
32175
|
// src/utils/model_registry/is_cached.js
|
|
31174
32176
|
async function check_files_cache(modelId, files, options = {}) {
|
|
31175
|
-
const
|
|
31176
|
-
if (!
|
|
32177
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
32178
|
+
if (!cache2) {
|
|
31177
32179
|
const fileStatuses2 = files.map((filename) => ({ file: filename, cached: false }));
|
|
31178
32180
|
return { allCached: false, files: fileStatuses2 };
|
|
31179
32181
|
}
|
|
31180
32182
|
const fileStatuses = await Promise.all(
|
|
31181
32183
|
files.map(async (filename) => {
|
|
31182
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
31183
|
-
const cached = await checkCachedResource(
|
|
32184
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
32185
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
31184
32186
|
return { file: filename, cached: !!cached };
|
|
31185
32187
|
})
|
|
31186
32188
|
);
|
|
31187
32189
|
return { allCached: fileStatuses.every((f) => f.cached), files: fileStatuses };
|
|
31188
32190
|
}
|
|
31189
32191
|
async function is_file_cached(modelId, filename, options = {}) {
|
|
31190
|
-
const
|
|
31191
|
-
if (!
|
|
31192
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
31193
|
-
return !!await checkCachedResource(
|
|
32192
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
32193
|
+
if (!cache2) return false;
|
|
32194
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
32195
|
+
return !!await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
31194
32196
|
}
|
|
31195
32197
|
async function is_cached(modelId, options = {}) {
|
|
31196
32198
|
if (!modelId) {
|
|
@@ -31237,26 +32239,26 @@ async function is_pipeline_cached_files(task, modelId, options = {}) {
|
|
|
31237
32239
|
|
|
31238
32240
|
// src/utils/model_registry/clear_cache.js
|
|
31239
32241
|
async function clear_files_from_cache(modelId, files, options = {}) {
|
|
31240
|
-
const
|
|
31241
|
-
if (!
|
|
32242
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
32243
|
+
if (!cache2) {
|
|
31242
32244
|
return {
|
|
31243
32245
|
filesDeleted: 0,
|
|
31244
32246
|
filesCached: 0,
|
|
31245
32247
|
files: files.map((filename) => ({ file: filename, deleted: false, wasCached: false }))
|
|
31246
32248
|
};
|
|
31247
32249
|
}
|
|
31248
|
-
if (!
|
|
32250
|
+
if (!cache2.delete) {
|
|
31249
32251
|
throw new Error("Cache does not support delete operation");
|
|
31250
32252
|
}
|
|
31251
32253
|
const results = await Promise.all(
|
|
31252
32254
|
files.map(async (filename) => {
|
|
31253
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
31254
|
-
const cached = await checkCachedResource(
|
|
32255
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
32256
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
31255
32257
|
const wasCached = !!cached;
|
|
31256
32258
|
let deleted = false;
|
|
31257
32259
|
if (wasCached) {
|
|
31258
|
-
const deletedWithProposed = await
|
|
31259
|
-
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await
|
|
32260
|
+
const deletedWithProposed = await cache2.delete(proposedCacheKey);
|
|
32261
|
+
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache2.delete(localPath) : false;
|
|
31260
32262
|
deleted = deletedWithProposed || deletedWithLocal;
|
|
31261
32263
|
}
|
|
31262
32264
|
return { file: filename, deleted, wasCached };
|
|
@@ -31737,6 +32739,7 @@ export {
|
|
|
31737
32739
|
DonutImageProcessor,
|
|
31738
32740
|
DonutSwinModel,
|
|
31739
32741
|
DonutSwinPreTrainedModel,
|
|
32742
|
+
DynamicCache,
|
|
31740
32743
|
EdgeTamModel,
|
|
31741
32744
|
EfficientNetForImageClassification,
|
|
31742
32745
|
EfficientNetImageProcessor,
|
|
@@ -31809,6 +32812,7 @@ export {
|
|
|
31809
32812
|
Gemma3Model,
|
|
31810
32813
|
Gemma3PreTrainedModel,
|
|
31811
32814
|
Gemma3nAudioFeatureExtractor,
|
|
32815
|
+
Gemma3nForCausalLM,
|
|
31812
32816
|
Gemma3nForConditionalGeneration,
|
|
31813
32817
|
Gemma3nPreTrainedModel,
|
|
31814
32818
|
Gemma3nProcessor,
|
|
@@ -31828,6 +32832,9 @@ export {
|
|
|
31828
32832
|
GraniteMoeHybridModel,
|
|
31829
32833
|
GraniteMoeHybridPreTrainedModel,
|
|
31830
32834
|
GranitePreTrainedModel,
|
|
32835
|
+
GraniteSpeechFeatureExtractor,
|
|
32836
|
+
GraniteSpeechForConditionalGeneration,
|
|
32837
|
+
GraniteSpeechProcessor,
|
|
31831
32838
|
GroundingDinoForObjectDetection,
|
|
31832
32839
|
GroundingDinoImageProcessor,
|
|
31833
32840
|
GroundingDinoPreTrainedModel,
|
|
@@ -31853,7 +32860,6 @@ export {
|
|
|
31853
32860
|
IJepaPreTrainedModel,
|
|
31854
32861
|
Idefics3ForConditionalGeneration,
|
|
31855
32862
|
Idefics3ImageProcessor,
|
|
31856
|
-
Idefics3PreTrainedModel,
|
|
31857
32863
|
Idefics3Processor,
|
|
31858
32864
|
ImageClassificationPipeline,
|
|
31859
32865
|
ImageFeatureExtractionPipeline,
|
|
@@ -31878,6 +32884,9 @@ export {
|
|
|
31878
32884
|
Lfm2MoeModel,
|
|
31879
32885
|
Lfm2MoePreTrainedModel,
|
|
31880
32886
|
Lfm2PreTrainedModel,
|
|
32887
|
+
Lfm2VlForConditionalGeneration,
|
|
32888
|
+
Lfm2VlImageProcessor,
|
|
32889
|
+
Lfm2VlProcessor,
|
|
31881
32890
|
LiteWhisperForConditionalGeneration,
|
|
31882
32891
|
Llama4ForCausalLM,
|
|
31883
32892
|
Llama4PreTrainedModel,
|
|
@@ -32061,7 +33070,6 @@ export {
|
|
|
32061
33070
|
Owlv2Model,
|
|
32062
33071
|
Owlv2PreTrainedModel,
|
|
32063
33072
|
PaliGemmaForConditionalGeneration,
|
|
32064
|
-
PaliGemmaPreTrainedModel,
|
|
32065
33073
|
PaliGemmaProcessor,
|
|
32066
33074
|
ParakeetFeatureExtractor,
|
|
32067
33075
|
ParakeetForCTC,
|
|
@@ -32105,10 +33113,12 @@ export {
|
|
|
32105
33113
|
Qwen2MoePreTrainedModel,
|
|
32106
33114
|
Qwen2PreTrainedModel,
|
|
32107
33115
|
Qwen2Tokenizer,
|
|
33116
|
+
Qwen2VLForCausalLM,
|
|
32108
33117
|
Qwen2VLForConditionalGeneration,
|
|
32109
33118
|
Qwen2VLImageProcessor,
|
|
32110
33119
|
Qwen2VLPreTrainedModel,
|
|
32111
33120
|
Qwen2VLProcessor,
|
|
33121
|
+
Qwen2_5_VLForCausalLM,
|
|
32112
33122
|
Qwen2_5_VLForConditionalGeneration,
|
|
32113
33123
|
Qwen2_5_VLProcessor,
|
|
32114
33124
|
Qwen3ForCausalLM,
|
|
@@ -32120,10 +33130,14 @@ export {
|
|
|
32120
33130
|
Qwen3NextModel,
|
|
32121
33131
|
Qwen3NextPreTrainedModel,
|
|
32122
33132
|
Qwen3PreTrainedModel,
|
|
33133
|
+
Qwen3VLForCausalLM,
|
|
32123
33134
|
Qwen3VLForConditionalGeneration,
|
|
33135
|
+
Qwen3VLMoeForCausalLM,
|
|
32124
33136
|
Qwen3VLMoeForConditionalGeneration,
|
|
32125
33137
|
Qwen3VLProcessor,
|
|
33138
|
+
Qwen3_5ForCausalLM,
|
|
32126
33139
|
Qwen3_5ForConditionalGeneration,
|
|
33140
|
+
Qwen3_5MoeForCausalLM,
|
|
32127
33141
|
Qwen3_5MoeForConditionalGeneration,
|
|
32128
33142
|
RFDetrForObjectDetection,
|
|
32129
33143
|
RFDetrModel,
|
|
@@ -32195,7 +33209,6 @@ export {
|
|
|
32195
33209
|
SmolLM3ForCausalLM,
|
|
32196
33210
|
SmolLM3Model,
|
|
32197
33211
|
SmolLM3PreTrainedModel,
|
|
32198
|
-
SmolVLMForConditionalGeneration,
|
|
32199
33212
|
Idefics3ImageProcessor as SmolVLMImageProcessor,
|
|
32200
33213
|
Idefics3Processor as SmolVLMProcessor,
|
|
32201
33214
|
SnacDecoderModel,
|
|
@@ -32301,6 +33314,10 @@ export {
|
|
|
32301
33314
|
VitsTokenizer,
|
|
32302
33315
|
VoxtralForConditionalGeneration,
|
|
32303
33316
|
VoxtralProcessor,
|
|
33317
|
+
VoxtralRealtimeFeatureExtractor,
|
|
33318
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
33319
|
+
VoxtralRealtimePreTrainedModel,
|
|
33320
|
+
VoxtralRealtimeProcessor,
|
|
32304
33321
|
Wav2Vec2BertForCTC,
|
|
32305
33322
|
Wav2Vec2BertForSequenceClassification,
|
|
32306
33323
|
Wav2Vec2BertModel,
|
|
@@ -32396,7 +33413,7 @@ export {
|
|
|
32396
33413
|
|
|
32397
33414
|
onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
|
|
32398
33415
|
(*!
|
|
32399
|
-
* ONNX Runtime Web v1.25.0-dev.
|
|
33416
|
+
* ONNX Runtime Web v1.25.0-dev.20260307-d626b568e0
|
|
32400
33417
|
* Copyright (c) Microsoft Corporation. All rights reserved.
|
|
32401
33418
|
* Licensed under the MIT License.
|
|
32402
33419
|
*)
|