@huggingface/transformers 4.0.0-next.5 → 4.0.0-next.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +12 -4
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +2189 -1015
- package/dist/transformers.min.js +16 -16
- package/dist/transformers.node.cjs +2234 -1029
- package/dist/transformers.node.min.cjs +20 -20
- package/dist/transformers.node.min.mjs +20 -20
- package/dist/transformers.node.mjs +2194 -1017
- package/dist/transformers.web.js +2175 -1001
- package/dist/transformers.web.min.js +18 -18
- package/package.json +4 -4
- package/src/backends/onnx.js +77 -58
- package/src/backends/utils/cacheWasm.js +22 -43
- package/src/cache_utils.js +62 -0
- package/src/configs.js +32 -5
- package/src/env.js +36 -6
- package/src/image_processors_utils.js +3 -3
- package/src/models/auto/modeling_auto.js +14 -1
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +1 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/modeling_utils.js +234 -292
- package/src/models/models.js +9 -0
- package/src/models/olmo_hybrid/modeling_olmo_hybrid.js +5 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +3 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_moe/modeling_qwen2_moe.js +5 -0
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +36 -3
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_moe/modeling_qwen3_moe.js +5 -0
- package/src/models/qwen3_next/modeling_qwen3_next.js +5 -0
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +4 -0
- package/src/models/registry.js +39 -4
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/pipelines/index.js +2 -84
- package/src/pipelines.js +40 -77
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/FileCache.js +128 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +8 -3
- package/src/utils/hub/{files.js → FileResponse.js} +0 -105
- package/src/utils/hub/utils.js +35 -1
- package/src/utils/hub.js +6 -5
- package/src/utils/image.js +12 -13
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/ModelRegistry.js +70 -23
- package/src/utils/model_registry/get_file_metadata.js +14 -2
- package/src/utils/model_registry/get_model_files.js +63 -78
- package/src/utils/model_registry/get_pipeline_files.js +15 -24
- package/src/utils/model_registry/is_cached.js +81 -4
- package/src/utils/tensor.js +18 -2
- package/types/backends/onnx.d.ts.map +1 -1
- package/types/backends/utils/cacheWasm.d.ts +3 -17
- package/types/backends/utils/cacheWasm.d.ts.map +1 -1
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +18 -3
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +17 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/auto/modeling_auto.d.ts +6 -0
- package/types/models/auto/modeling_auto.d.ts.map +1 -1
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/modeling_utils.d.ts +44 -24
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +9 -0
- package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts +8 -0
- package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts.map +1 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts +8 -0
- package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts.map +1 -0
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +2 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts +8 -0
- package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts.map +1 -0
- package/types/models/qwen3_next/modeling_qwen3_next.d.ts +8 -0
- package/types/models/qwen3_next/modeling_qwen3_next.d.ts.map +1 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +7 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -0
- package/types/models/registry.d.ts +2 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/pipelines/index.d.ts +0 -34
- package/types/pipelines/index.d.ts.map +1 -1
- package/types/pipelines.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache/FileCache.d.ts +39 -0
- package/types/utils/cache/FileCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts +4 -4
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/hub/{files.d.ts → FileResponse.d.ts} +1 -38
- package/types/utils/hub/FileResponse.d.ts.map +1 -0
- package/types/utils/hub/utils.d.ts +17 -2
- package/types/utils/hub/utils.d.ts.map +1 -1
- package/types/utils/hub.d.ts +7 -7
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/image.d.ts.map +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/ModelRegistry.d.ts +66 -6
- package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -1
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/model_registry/get_pipeline_files.d.ts +2 -1
- package/types/utils/model_registry/get_pipeline_files.d.ts.map +1 -1
- package/types/utils/model_registry/is_cached.d.ts +47 -4
- package/types/utils/model_registry/is_cached.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
- package/types/utils/hub/files.d.ts.map +0 -1
package/dist/transformers.web.js
CHANGED
|
@@ -14,22 +14,32 @@ var node_path_default = {};
|
|
|
14
14
|
var node_url_default = {};
|
|
15
15
|
|
|
16
16
|
// src/env.js
|
|
17
|
-
var VERSION = "4.0.0-next.
|
|
17
|
+
var VERSION = "4.0.0-next.7";
|
|
18
|
+
var HAS_SELF = typeof self !== "undefined";
|
|
18
19
|
var IS_FS_AVAILABLE = !isEmpty(node_fs_default);
|
|
19
20
|
var IS_PATH_AVAILABLE = !isEmpty(node_path_default);
|
|
20
|
-
var IS_WEB_CACHE_AVAILABLE =
|
|
21
|
+
var IS_WEB_CACHE_AVAILABLE = HAS_SELF && "caches" in self;
|
|
21
22
|
var IS_DENO_RUNTIME = typeof globalThis.Deno !== "undefined";
|
|
22
23
|
var IS_BUN_RUNTIME = typeof globalThis.Bun !== "undefined";
|
|
23
24
|
var IS_DENO_WEB_RUNTIME = IS_DENO_RUNTIME && IS_WEB_CACHE_AVAILABLE && !IS_FS_AVAILABLE;
|
|
24
25
|
var IS_PROCESS_AVAILABLE = typeof process !== "undefined";
|
|
25
26
|
var IS_NODE_ENV = IS_PROCESS_AVAILABLE && process?.release?.name === "node" && !IS_DENO_WEB_RUNTIME;
|
|
26
27
|
var IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
|
|
27
|
-
var IS_WEBWORKER_ENV =
|
|
28
|
+
var IS_WEBWORKER_ENV = HAS_SELF && ["DedicatedWorkerGlobalScope", "ServiceWorkerGlobalScope", "SharedWorkerGlobalScope"].includes(
|
|
28
29
|
self.constructor?.name
|
|
29
30
|
);
|
|
31
|
+
var IS_WEB_ENV = IS_BROWSER_ENV || IS_WEBWORKER_ENV || IS_DENO_WEB_RUNTIME;
|
|
30
32
|
var IS_WEBGPU_AVAILABLE = IS_NODE_ENV || typeof navigator !== "undefined" && "gpu" in navigator;
|
|
31
33
|
var IS_WEBNN_AVAILABLE = typeof navigator !== "undefined" && "ml" in navigator;
|
|
32
34
|
var IS_CRYPTO_AVAILABLE = typeof crypto !== "undefined" && typeof crypto.getRandomValues === "function";
|
|
35
|
+
var IS_CHROME_AVAILABLE = (
|
|
36
|
+
// @ts-ignore - chrome may not exist in all environments
|
|
37
|
+
typeof chrome !== "undefined" && typeof chrome.runtime !== "undefined" && typeof chrome.runtime.id === "string"
|
|
38
|
+
);
|
|
39
|
+
var IS_SERVICE_WORKER_ENV = (
|
|
40
|
+
// @ts-ignore - ServiceWorkerGlobalScope may not exist in all environments
|
|
41
|
+
typeof ServiceWorkerGlobalScope !== "undefined" && HAS_SELF && self instanceof ServiceWorkerGlobalScope
|
|
42
|
+
);
|
|
33
43
|
var isSafari = () => {
|
|
34
44
|
if (typeof navigator === "undefined") {
|
|
35
45
|
return false;
|
|
@@ -46,6 +56,12 @@ var apis = Object.freeze({
|
|
|
46
56
|
IS_BROWSER_ENV,
|
|
47
57
|
/** Whether we are running in a web worker environment */
|
|
48
58
|
IS_WEBWORKER_ENV,
|
|
59
|
+
/** Whether we are running in a web-like environment (browser, web worker, or Deno web runtime) */
|
|
60
|
+
IS_WEB_ENV,
|
|
61
|
+
/** Whether we are running in a service worker environment */
|
|
62
|
+
IS_SERVICE_WORKER_ENV,
|
|
63
|
+
/** Whether we are running in Deno's web runtime (CDN imports, Cache API available, no filesystem) */
|
|
64
|
+
IS_DENO_WEB_RUNTIME,
|
|
49
65
|
/** Whether the Cache API is available */
|
|
50
66
|
IS_WEB_CACHE_AVAILABLE,
|
|
51
67
|
/** Whether the WebGPU API is available */
|
|
@@ -63,7 +79,9 @@ var apis = Object.freeze({
|
|
|
63
79
|
/** Whether the path API is available */
|
|
64
80
|
IS_PATH_AVAILABLE,
|
|
65
81
|
/** Whether the crypto API is available */
|
|
66
|
-
IS_CRYPTO_AVAILABLE
|
|
82
|
+
IS_CRYPTO_AVAILABLE,
|
|
83
|
+
/** Whether the Chrome runtime API is available */
|
|
84
|
+
IS_CHROME_AVAILABLE
|
|
67
85
|
});
|
|
68
86
|
var RUNNING_LOCALLY = IS_FS_AVAILABLE && IS_PATH_AVAILABLE;
|
|
69
87
|
var dirname__ = "./";
|
|
@@ -124,6 +142,7 @@ var env = {
|
|
|
124
142
|
customCache: null,
|
|
125
143
|
useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
|
|
126
144
|
cacheKey: "transformers-cache",
|
|
145
|
+
experimental_useCrossOriginStorage: false,
|
|
127
146
|
/////////////////// Custom fetch /////////////////////
|
|
128
147
|
fetch: DEFAULT_FETCH
|
|
129
148
|
//////////////////////////////////////////////////////
|
|
@@ -2674,7 +2693,7 @@ var Tokenizer = class {
|
|
|
2674
2693
|
};
|
|
2675
2694
|
var Tokenizer_default = Tokenizer;
|
|
2676
2695
|
|
|
2677
|
-
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.
|
|
2696
|
+
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.6/node_modules/@huggingface/jinja/dist/index.js
|
|
2678
2697
|
var TOKEN_TYPES = Object.freeze({
|
|
2679
2698
|
Text: "Text",
|
|
2680
2699
|
// The text between Jinja statements or expressions
|
|
@@ -4193,7 +4212,11 @@ var Environment = class {
|
|
|
4193
4212
|
["number", (operand) => operand instanceof IntegerValue || operand instanceof FloatValue],
|
|
4194
4213
|
["integer", (operand) => operand instanceof IntegerValue],
|
|
4195
4214
|
["iterable", (operand) => operand.type === "ArrayValue" || operand.type === "StringValue"],
|
|
4196
|
-
["mapping", (operand) => operand
|
|
4215
|
+
["mapping", (operand) => operand instanceof ObjectValue],
|
|
4216
|
+
[
|
|
4217
|
+
"sequence",
|
|
4218
|
+
(operand) => operand instanceof ArrayValue || operand instanceof ObjectValue || operand instanceof StringValue
|
|
4219
|
+
],
|
|
4197
4220
|
[
|
|
4198
4221
|
"lower",
|
|
4199
4222
|
(operand) => {
|
|
@@ -4466,6 +4489,9 @@ var Interpreter = class {
|
|
|
4466
4489
|
applyFilter(operand, filterNode, environment) {
|
|
4467
4490
|
if (filterNode.type === "Identifier") {
|
|
4468
4491
|
const filter = filterNode;
|
|
4492
|
+
if (filter.value === "safe") {
|
|
4493
|
+
return operand;
|
|
4494
|
+
}
|
|
4469
4495
|
if (filter.value === "tojson") {
|
|
4470
4496
|
return new StringValue(toJSON(operand, {}));
|
|
4471
4497
|
}
|
|
@@ -4555,6 +4581,8 @@ var Interpreter = class {
|
|
|
4555
4581
|
return new IntegerValue(Math.floor(operand.value));
|
|
4556
4582
|
case "float":
|
|
4557
4583
|
return new FloatValue(operand.value);
|
|
4584
|
+
case "string":
|
|
4585
|
+
return new StringValue(operand.toString());
|
|
4558
4586
|
default:
|
|
4559
4587
|
throw new Error(`Unknown NumericValue filter: ${filter.value}`);
|
|
4560
4588
|
}
|
|
@@ -5508,7 +5536,7 @@ var Callable2 = (
|
|
|
5508
5536
|
}
|
|
5509
5537
|
);
|
|
5510
5538
|
|
|
5511
|
-
// src/utils/hub/
|
|
5539
|
+
// src/utils/hub/FileResponse.js
|
|
5512
5540
|
var CONTENT_TYPE_MAP = {
|
|
5513
5541
|
txt: "text/plain",
|
|
5514
5542
|
html: "text/html",
|
|
@@ -5619,6 +5647,170 @@ var FileResponse = class _FileResponse {
|
|
|
5619
5647
|
return JSON.parse(await this.text());
|
|
5620
5648
|
}
|
|
5621
5649
|
};
|
|
5650
|
+
|
|
5651
|
+
// src/utils/random.js
|
|
5652
|
+
var Random = class {
|
|
5653
|
+
constructor(seed) {
|
|
5654
|
+
this._mt = new Uint32Array(624);
|
|
5655
|
+
this._idx = 625;
|
|
5656
|
+
this._gauss_next = null;
|
|
5657
|
+
this._random_fn = this.random.bind(this);
|
|
5658
|
+
this.seed(seed);
|
|
5659
|
+
}
|
|
5660
|
+
/**
|
|
5661
|
+
* Seeds this instance's PRNG.
|
|
5662
|
+
*
|
|
5663
|
+
* When called with a number, initializes the state deterministically from that value.
|
|
5664
|
+
* When called with no arguments (or `undefined`/`null`), seeds from OS entropy
|
|
5665
|
+
* via `crypto.getRandomValues`, matching Python's `random.seed()` behaviour.
|
|
5666
|
+
*
|
|
5667
|
+
* @param {number} [n] The seed value. Omit to seed from OS entropy.
|
|
5668
|
+
*/
|
|
5669
|
+
seed(n) {
|
|
5670
|
+
if (n === void 0 || n === null) {
|
|
5671
|
+
if (apis.IS_CRYPTO_AVAILABLE) {
|
|
5672
|
+
const buf = new Uint32Array(1);
|
|
5673
|
+
crypto.getRandomValues(buf);
|
|
5674
|
+
n = buf[0];
|
|
5675
|
+
} else {
|
|
5676
|
+
n = Date.now() >>> 0;
|
|
5677
|
+
}
|
|
5678
|
+
}
|
|
5679
|
+
const mt = this._mt;
|
|
5680
|
+
const u = (a, b) => Math.imul(a, b) >>> 0, key = [];
|
|
5681
|
+
for (let v = n || 0; v > 0; v = Math.floor(v / 4294967296)) key.push(v & 4294967295);
|
|
5682
|
+
if (!key.length) key.push(0);
|
|
5683
|
+
mt[0] = 19650218;
|
|
5684
|
+
for (let k = 1; k < 624; ++k) mt[k] = u(1812433253, mt[k - 1] ^ mt[k - 1] >>> 30) + k >>> 0;
|
|
5685
|
+
let i = 1, j = 0;
|
|
5686
|
+
for (let k = Math.max(624, key.length); k > 0; --k, ++i, ++j) {
|
|
5687
|
+
if (i >= 624) {
|
|
5688
|
+
mt[0] = mt[623];
|
|
5689
|
+
i = 1;
|
|
5690
|
+
}
|
|
5691
|
+
if (j >= key.length) j = 0;
|
|
5692
|
+
mt[i] = (mt[i] ^ u(mt[i - 1] ^ mt[i - 1] >>> 30, 1664525)) + key[j] + j >>> 0;
|
|
5693
|
+
}
|
|
5694
|
+
for (let k = 623; k > 0; --k, ++i) {
|
|
5695
|
+
if (i >= 624) {
|
|
5696
|
+
mt[0] = mt[623];
|
|
5697
|
+
i = 1;
|
|
5698
|
+
}
|
|
5699
|
+
mt[i] = (mt[i] ^ u(mt[i - 1] ^ mt[i - 1] >>> 30, 1566083941)) - i >>> 0;
|
|
5700
|
+
}
|
|
5701
|
+
mt[0] = 2147483648;
|
|
5702
|
+
this._idx = 624;
|
|
5703
|
+
this._gauss_next = null;
|
|
5704
|
+
}
|
|
5705
|
+
/**
|
|
5706
|
+
* Generates a random unsigned 32-bit integer.
|
|
5707
|
+
*
|
|
5708
|
+
* Performs the "twist" step when the state buffer is exhausted,
|
|
5709
|
+
* then applies the standard MT19937 tempering transform.
|
|
5710
|
+
*
|
|
5711
|
+
* @returns {number} A random integer in the range [0, 2^32 - 1].
|
|
5712
|
+
*/
|
|
5713
|
+
_int32() {
|
|
5714
|
+
const mt = this._mt;
|
|
5715
|
+
if (this._idx >= 624) {
|
|
5716
|
+
for (let k = 0; k < 624; ++k) {
|
|
5717
|
+
const y2 = mt[k] & 2147483648 | mt[(k + 1) % 624] & 2147483647;
|
|
5718
|
+
mt[k] = (mt[(k + 397) % 624] ^ y2 >>> 1 ^ (y2 & 1 ? 2567483615 : 0)) >>> 0;
|
|
5719
|
+
}
|
|
5720
|
+
this._idx = 0;
|
|
5721
|
+
}
|
|
5722
|
+
let y = mt[this._idx++];
|
|
5723
|
+
y ^= y >>> 11;
|
|
5724
|
+
y ^= y << 7 & 2636928640;
|
|
5725
|
+
y ^= y << 15 & 4022730752;
|
|
5726
|
+
y ^= y >>> 18;
|
|
5727
|
+
return y >>> 0;
|
|
5728
|
+
}
|
|
5729
|
+
/**
|
|
5730
|
+
* Generates a random floating-point number in the half-open interval [0, 1).
|
|
5731
|
+
*
|
|
5732
|
+
* Combines two 32-bit integers (using 53 bits of precision) to produce
|
|
5733
|
+
* a uniformly distributed double, matching Python's `random.random()`.
|
|
5734
|
+
*
|
|
5735
|
+
* @returns {number} A random float in [0, 1).
|
|
5736
|
+
*/
|
|
5737
|
+
random() {
|
|
5738
|
+
return ((this._int32() >>> 5) * 67108864 + (this._int32() >>> 6)) / 9007199254740992;
|
|
5739
|
+
}
|
|
5740
|
+
/**
|
|
5741
|
+
* Generates a random number from a Gaussian (normal) distribution.
|
|
5742
|
+
*
|
|
5743
|
+
* Uses the Box-Muller transform with a cached spare value,
|
|
5744
|
+
* matching Python's `random.gauss()` output for the same seed.
|
|
5745
|
+
*
|
|
5746
|
+
* @param {number} [mu=0] The mean of the distribution.
|
|
5747
|
+
* @param {number} [sigma=1] The standard deviation of the distribution.
|
|
5748
|
+
* @returns {number} A normally distributed random value.
|
|
5749
|
+
*/
|
|
5750
|
+
gauss(mu = 0, sigma = 1) {
|
|
5751
|
+
let z = this._gauss_next;
|
|
5752
|
+
this._gauss_next = null;
|
|
5753
|
+
if (z === null) {
|
|
5754
|
+
const x2pi = this.random() * 2 * Math.PI, g2rad = Math.sqrt(-2 * Math.log(1 - this.random()));
|
|
5755
|
+
z = Math.cos(x2pi) * g2rad;
|
|
5756
|
+
this._gauss_next = Math.sin(x2pi) * g2rad;
|
|
5757
|
+
}
|
|
5758
|
+
return mu + z * sigma;
|
|
5759
|
+
}
|
|
5760
|
+
/**
|
|
5761
|
+
* Shuffles an array in-place using the Fisher-Yates algorithm.
|
|
5762
|
+
*
|
|
5763
|
+
* Uses rejection sampling via `getrandbits`-style bit masking to ensure
|
|
5764
|
+
* a uniform distribution, matching Python's `random.shuffle()`.
|
|
5765
|
+
*
|
|
5766
|
+
* @param {any[]} arr The array to shuffle in-place.
|
|
5767
|
+
*/
|
|
5768
|
+
shuffle(arr) {
|
|
5769
|
+
for (let i = arr.length - 1; i > 0; --i) {
|
|
5770
|
+
const k = 32 - Math.clz32(i + 1);
|
|
5771
|
+
let r = this._int32() >>> 32 - k;
|
|
5772
|
+
while (r > i) r = this._int32() >>> 32 - k;
|
|
5773
|
+
const t = arr[i];
|
|
5774
|
+
arr[i] = arr[r];
|
|
5775
|
+
arr[r] = t;
|
|
5776
|
+
}
|
|
5777
|
+
}
|
|
5778
|
+
/**
|
|
5779
|
+
* Selects a single element from a weighted population.
|
|
5780
|
+
*
|
|
5781
|
+
* Matches Python's `random.choices(population, weights=weights, k=1)[0]`
|
|
5782
|
+
*
|
|
5783
|
+
* @param {any[]} population The array of items to choose from.
|
|
5784
|
+
* @param {number[]} weights An array of non-negative weights, one per population element.
|
|
5785
|
+
* @returns {*} A single randomly selected element from the population.
|
|
5786
|
+
*/
|
|
5787
|
+
choices(population, weights) {
|
|
5788
|
+
return population[_weightedIndexWith(this._random_fn, weights)];
|
|
5789
|
+
}
|
|
5790
|
+
};
|
|
5791
|
+
function _weightedIndexWith(randomFn, weights) {
|
|
5792
|
+
let sum = 0;
|
|
5793
|
+
for (let i = 0; i < weights.length; ++i) sum += weights[i];
|
|
5794
|
+
let x = randomFn() * sum;
|
|
5795
|
+
for (let i = 0; i < weights.length; ++i) {
|
|
5796
|
+
x -= weights[i];
|
|
5797
|
+
if (x < 0) return i;
|
|
5798
|
+
}
|
|
5799
|
+
return weights.length - 1;
|
|
5800
|
+
}
|
|
5801
|
+
var _default = new Random();
|
|
5802
|
+
var random = Object.freeze({
|
|
5803
|
+
Random,
|
|
5804
|
+
seed: _default.seed.bind(_default),
|
|
5805
|
+
random: _default.random.bind(_default),
|
|
5806
|
+
gauss: _default.gauss.bind(_default),
|
|
5807
|
+
shuffle: _default.shuffle.bind(_default),
|
|
5808
|
+
choices: _default.choices.bind(_default)
|
|
5809
|
+
});
|
|
5810
|
+
var _weightedIndex = (weights) => _weightedIndexWith(random.random, weights);
|
|
5811
|
+
|
|
5812
|
+
// src/utils/cache/FileCache.js
|
|
5813
|
+
var rng = new Random();
|
|
5622
5814
|
var FileCache = class {
|
|
5623
5815
|
/**
|
|
5624
5816
|
* Instantiate a `FileCache` object.
|
|
@@ -5650,13 +5842,16 @@ var FileCache = class {
|
|
|
5650
5842
|
* @returns {Promise<void>}
|
|
5651
5843
|
*/
|
|
5652
5844
|
async put(request, response, progress_callback = void 0) {
|
|
5653
|
-
|
|
5845
|
+
const filePath = node_path_default.join(this.path, request);
|
|
5846
|
+
const id = apis.IS_PROCESS_AVAILABLE ? process.pid : Date.now();
|
|
5847
|
+
const randomSuffix = rng._int32().toString(36);
|
|
5848
|
+
const tmpPath = filePath + `.tmp.${id}.${randomSuffix}`;
|
|
5654
5849
|
try {
|
|
5655
5850
|
const contentLength = response.headers.get("Content-Length");
|
|
5656
5851
|
const total = parseInt(contentLength ?? "0");
|
|
5657
5852
|
let loaded = 0;
|
|
5658
5853
|
await node_fs_default.promises.mkdir(node_path_default.dirname(filePath), { recursive: true });
|
|
5659
|
-
const fileStream = node_fs_default.createWriteStream(
|
|
5854
|
+
const fileStream = node_fs_default.createWriteStream(tmpPath);
|
|
5660
5855
|
const reader = response.body.getReader();
|
|
5661
5856
|
while (true) {
|
|
5662
5857
|
const { done, value } = await reader.read();
|
|
@@ -5676,10 +5871,13 @@ var FileCache = class {
|
|
|
5676
5871
|
const progress = total ? loaded / total * 100 : 0;
|
|
5677
5872
|
progress_callback?.({ progress, loaded, total });
|
|
5678
5873
|
}
|
|
5679
|
-
|
|
5874
|
+
await new Promise((resolve, reject) => {
|
|
5875
|
+
fileStream.close((err) => err ? reject(err) : resolve());
|
|
5876
|
+
});
|
|
5877
|
+
await node_fs_default.promises.rename(tmpPath, filePath);
|
|
5680
5878
|
} catch (error) {
|
|
5681
5879
|
try {
|
|
5682
|
-
await node_fs_default.promises.unlink(
|
|
5880
|
+
await node_fs_default.promises.unlink(tmpPath);
|
|
5683
5881
|
} catch {
|
|
5684
5882
|
}
|
|
5685
5883
|
throw error;
|
|
@@ -5702,6 +5900,7 @@ var FileCache = class {
|
|
|
5702
5900
|
// TODO add the rest?
|
|
5703
5901
|
// addAll(requests: RequestInfo[]): Promise<void>;
|
|
5704
5902
|
// keys(request?: RequestInfo | URL, options?: CacheQueryOptions): Promise<ReadonlyArray<Request>>;
|
|
5903
|
+
// match(request: RequestInfo | URL, options?: CacheQueryOptions): Promise<Response | undefined>;
|
|
5705
5904
|
// matchAll(request?: RequestInfo | URL, options?: CacheQueryOptions): Promise<ReadonlyArray<Response>>;
|
|
5706
5905
|
};
|
|
5707
5906
|
|
|
@@ -5791,69 +5990,380 @@ async function readResponse(response, progress_callback, expectedSize) {
|
|
|
5791
5990
|
await read();
|
|
5792
5991
|
return buffer;
|
|
5793
5992
|
}
|
|
5794
|
-
|
|
5795
|
-
|
|
5796
|
-
|
|
5797
|
-
|
|
5798
|
-
|
|
5799
|
-
|
|
5800
|
-
|
|
5801
|
-
|
|
5802
|
-
|
|
5803
|
-
|
|
5804
|
-
|
|
5805
|
-
);
|
|
5806
|
-
}
|
|
5807
|
-
cache = env.customCache;
|
|
5993
|
+
function isBlobURL(url) {
|
|
5994
|
+
return isValidUrl(url, ["blob:"]);
|
|
5995
|
+
}
|
|
5996
|
+
function toAbsoluteURL(url) {
|
|
5997
|
+
let baseURL;
|
|
5998
|
+
if (typeof location !== "undefined" && location.href) {
|
|
5999
|
+
baseURL = location.href;
|
|
6000
|
+
} else if (typeof import.meta !== "undefined" && import.meta.url) {
|
|
6001
|
+
baseURL = import.meta.url;
|
|
6002
|
+
} else {
|
|
6003
|
+
return url;
|
|
5808
6004
|
}
|
|
5809
|
-
|
|
5810
|
-
|
|
5811
|
-
|
|
6005
|
+
return new URL(url, baseURL).href;
|
|
6006
|
+
}
|
|
6007
|
+
|
|
6008
|
+
// src/utils/cache/CrossOriginStorageCache.js
|
|
6009
|
+
var HASH_ALGORITHM = "SHA-256";
|
|
6010
|
+
var HASH_CACHE_NAME = "experimental_transformers-hash-cache";
|
|
6011
|
+
var makeHashDescriptor = (value) => ({ algorithm: HASH_ALGORITHM, value });
|
|
6012
|
+
var CrossOriginStorage = class {
|
|
6013
|
+
/** @type {Promise<Cache> | null} */
|
|
6014
|
+
#hashCache = null;
|
|
6015
|
+
/**
|
|
6016
|
+
* Returns (and lazily opens) the hash cache, reusing the same promise across concurrent callers.
|
|
6017
|
+
* @returns {Promise<Cache>}
|
|
6018
|
+
*/
|
|
6019
|
+
_getHashCache = () => {
|
|
6020
|
+
this.#hashCache ??= caches.open(HASH_CACHE_NAME);
|
|
6021
|
+
return this.#hashCache;
|
|
6022
|
+
};
|
|
6023
|
+
/**
|
|
6024
|
+
* Returns whether the `navigator.crossOriginStorage` API is available in the current environment.
|
|
6025
|
+
* @returns {boolean}
|
|
6026
|
+
*/
|
|
6027
|
+
static isAvailable = () => typeof navigator !== "undefined" && "crossOriginStorage" in navigator;
|
|
6028
|
+
/**
|
|
6029
|
+
* Looks up a cached response for the given URL by resolving its SHA-256 hash and requesting
|
|
6030
|
+
* the corresponding file handle from cross-origin storage.
|
|
6031
|
+
*
|
|
6032
|
+
* Implements `CacheInterface.match`.
|
|
6033
|
+
*
|
|
6034
|
+
* @param {string} request The URL of the resource to look up.
|
|
6035
|
+
* @returns {Promise<Response|undefined>} The cached `Response`, or `undefined` if not found.
|
|
6036
|
+
*/
|
|
6037
|
+
match = async (request) => {
|
|
6038
|
+
const hashValue = await this._getFileHash(request);
|
|
6039
|
+
if (!hashValue) {
|
|
6040
|
+
return void 0;
|
|
5812
6041
|
}
|
|
5813
6042
|
try {
|
|
5814
|
-
|
|
5815
|
-
|
|
5816
|
-
|
|
6043
|
+
const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashValue)]);
|
|
6044
|
+
const blob = await handle.getFile();
|
|
6045
|
+
return new Response(blob, {
|
|
6046
|
+
headers: {
|
|
6047
|
+
"Content-Length": String(blob.size)
|
|
6048
|
+
}
|
|
6049
|
+
});
|
|
6050
|
+
} catch {
|
|
6051
|
+
return void 0;
|
|
5817
6052
|
}
|
|
5818
|
-
}
|
|
5819
|
-
|
|
5820
|
-
|
|
5821
|
-
|
|
6053
|
+
};
|
|
6054
|
+
/**
|
|
6055
|
+
* Stores a response in cross-origin storage, keyed by its SHA-256 hash.
|
|
6056
|
+
*
|
|
6057
|
+
* For LFS-backed URLs the hash is resolved cheaply via `_getFileHash` (which checks
|
|
6058
|
+
* `HASH_CACHE_NAME` first, then falls back to fetching the Git LFS pointer file)
|
|
6059
|
+
* without reading the response body a second time.
|
|
6060
|
+
*
|
|
6061
|
+
* For non-LFS resources the hash is unknown upfront. In that case the body is consumed
|
|
6062
|
+
* in the background: the stream is read to compute the content hash, the file is written
|
|
6063
|
+
* into cross-origin storage, and the computed hash is persisted to `HASH_CACHE_NAME`
|
|
6064
|
+
* so that future `match` calls can resolve the file without a network round-trip.
|
|
6065
|
+
*
|
|
6066
|
+
* Implements `CacheInterface.put`.
|
|
6067
|
+
*
|
|
6068
|
+
* @param {string} request The URL of the resource (used as the hash-cache key).
|
|
6069
|
+
* @param {Response} response The response whose body will be written to the cache.
|
|
6070
|
+
* @returns {Promise<void>}
|
|
6071
|
+
*/
|
|
6072
|
+
put = async (request, response) => {
|
|
6073
|
+
const hashValue = await this._getFileHash(request);
|
|
6074
|
+
if (hashValue) {
|
|
6075
|
+
const blob = await response.blob();
|
|
6076
|
+
await this._storeBlobInCOS(blob, hashValue);
|
|
6077
|
+
} else {
|
|
6078
|
+
this._processAndStore(request, response.body);
|
|
5822
6079
|
}
|
|
5823
|
-
|
|
5824
|
-
|
|
5825
|
-
|
|
5826
|
-
|
|
5827
|
-
|
|
5828
|
-
|
|
6080
|
+
};
|
|
/**
 * Writes a blob into cross-origin storage using the given pre-computed hex hash string.
 *
 * Requests a file handle for the hash descriptor with `create: true`, then writes the
 * blob through a writable stream obtained from that handle and closes the stream.
 *
 * @param {Blob} blob
 * @param {string} hashHex Hex-encoded SHA-256 hash of `blob`.
 * @returns {Promise<void>}
 */
_storeBlobInCOS = async (blob, hashHex) => {
  const descriptors = [makeHashDescriptor(hashHex)];
  // Only the first (and only) handle is needed: one descriptor was requested.
  const [handle] = await navigator.crossOriginStorage.requestFileHandles(descriptors, {
    create: true
  });
  const writableStream = await handle.createWritable();
  await writableStream.write(blob);
  await writableStream.close();
};
/**
 * Background task for non-LFS resources: consumes `stream`, computes the SHA-256 hash
 * of the resulting blob, stores it in cross-origin storage, and persists the computed
 * hash to `HASH_CACHE_NAME` keyed by `request` so future `match` calls can resolve the
 * file without a network round-trip.
 *
 * Called fire-and-forget from `put` — errors are swallowed so failures never surface to
 * the caller.
 *
 * @param {string} request The original resource URL.
 * @param {ReadableStream} stream The response body stream to consume.
 * @returns {Promise<void>}
 */
_processAndStore = async (request, stream) => {
  try {
    // Buffer the whole stream; the hash can only be computed over the full content.
    const chunks = [];
    for await (const chunk2 of stream) {
      chunks.push(chunk2);
    }
    const blob = new Blob(chunks);
    const hashHex = await this._getBlobHash(blob);
    await this._storeBlobInCOS(blob, hashHex);
    try {
      const hashCache = await this._getHashCache();
      await hashCache.put(request, new Response(hashHex));
    } catch {
      // Best effort: the blob is already stored; failing to persist the
      // URL -> hash mapping must not turn the whole task into a failure.
    }
  } catch {
    // Best effort: this runs un-awaited from `put`, so every error is
    // intentionally swallowed here rather than becoming an unhandled rejection.
  }
};
6126
|
+
/**
|
|
6127
|
+
* Deletes the cache entry for the given request.
|
|
6128
|
+
*
|
|
6129
|
+
* Removes the hash entry from `HASH_CACHE_NAME`. Note: cross-origin storage itself does not
|
|
6130
|
+
* expose a delete API, so only the local hash mapping is removed. For non-LFS URLs this
|
|
6131
|
+
* permanently prevents `match` from resolving the file. For LFS-backed URLs, `match` will
|
|
6132
|
+
* re-fetch the LFS pointer file on the next call and repopulate the hash cache automatically.
|
|
6133
|
+
*
|
|
6134
|
+
* Implements `CacheInterface.delete`.
|
|
6135
|
+
*
|
|
6136
|
+
* @param {string} request
|
|
6137
|
+
* @returns {Promise<boolean>} Resolves to `true` if the hash entry was deleted, `false` otherwise.
|
|
6138
|
+
*/
|
|
6139
|
+
delete = async (request) => {
|
|
6140
|
+
try {
|
|
6141
|
+
const hashCache = await this._getHashCache();
|
|
6142
|
+
return await hashCache.delete(request);
|
|
6143
|
+
} catch {
|
|
6144
|
+
return false;
|
|
6145
|
+
}
|
|
6146
|
+
};
|
|
/**
 * Resolves the SHA-256 hash for a given URL.
 *
 * Returns the cached hash immediately if one has been persisted to `HASH_CACHE_NAME`.
 * Otherwise falls back to `_getLfsFileHash` to retrieve the hash from the Hugging Face
 * LFS pointer file, persisting the result to `HASH_CACHE_NAME` for future lookups.
 *
 * Returns `null` if the hash cannot be determined (e.g. non-LFS URL with no cached entry),
 * or if any step (opening the hash cache, reading the cached entry, persisting) throws.
 *
 * @param {string} url The resource URL to resolve a hash for.
 * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
 */
_getFileHash = async (url) => {
  try {
    const hashCache = await this._getHashCache();
    const cached = await hashCache.match(url);
    if (cached) {
      // `await` is required here: without it a rejected `text()` would bypass
      // the surrounding try/catch and reject instead of resolving to `null`.
      return await cached.text();
    }
    const hash = await this._getLfsFileHash(url);
    if (!hash) {
      return null;
    }
    await hashCache.put(url, new Response(hash));
    return hash;
  } catch {
    return null;
  }
};
/**
 * Attempts to retrieve the SHA-256 hash for a Hugging Face resource URL from its raw
 * Git LFS pointer file.
 *
 * Only applicable to URLs containing `/resolve/` (i.e. Hugging Face resolved file URLs).
 * The first `/resolve/` segment is rewritten to `/raw/` to fetch the LFS pointer directly,
 * and the hash is taken from the pointer's `oid sha256:<hex>` line.
 * Returns `null` for non-LFS URLs, for non-OK HTTP responses, or when the network
 * request fails.
 *
 * @see https://huggingface.co/docs/hub/en/storage-backends#xet
 * @param {string} url The resolved Hugging Face URL of the resource.
 * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
 */
_getLfsFileHash = async (url) => {
  if (!url.includes("/resolve/")) {
    return null;
  }
  const rawUrl = url.replace("/resolve/", "/raw/");
  try {
    const response = await fetch(rawUrl);
    if (!response.ok) {
      // An error page is not a pointer file; do not download and scan it.
      return null;
    }
    const text = await response.text();
    const match = text.match(/^oid sha256:([0-9a-f]+)$/m);
    return match ? match[1] : null;
  } catch {
    return null;
  }
};
/**
 * Computes the hash of a `Blob`'s contents using `HASH_ALGORITHM` via Web Crypto.
 *
 * The whole blob is read into memory as an `ArrayBuffer` before hashing.
 *
 * @param {Blob} blob The blob to hash.
 * @returns {Promise<string>} The lowercase hex-encoded digest (two hex digits per byte).
 */
_getBlobHash = async (blob) => {
  const arrayBuffer = await blob.arrayBuffer();
  const hashBuffer = await crypto.subtle.digest(HASH_ALGORITHM, arrayBuffer);
  const hexBytes = Array.from(new Uint8Array(hashBuffer), (byte) => byte.toString(16).padStart(2, "0"));
  return hexBytes.join("");
};
6213
|
+
};
|
|
6214
|
+
|
|
6215
|
+
// src/utils/cache.js
|
|
/**
 * Selects the cache backend to use, based on the flags on `env`.
 *
 * Backends are tried in this order and the first one that yields a cache wins:
 *  1. `env.customCache` (when `env.useCustomCache` is set),
 *  2. `CrossOriginStorage` (when `env.experimental_useCrossOriginStorage` is set and it is available),
 *  3. the browser Cache API opened under `env.cacheKey` (when `env.useBrowserCache` is set),
 *  4. `FileCache` (when `env.useFSCache` is set).
 *
 * @param {string|null} [file_cache_dir=null] Directory for the file-system cache;
 * falls back to `env.cacheDir` when `null`/`undefined`.
 * @returns {Promise<any|null>} The selected cache, or `null` when no backend is enabled.
 * @throws {Error} If the custom cache is enabled but missing or lacks `match`/`put`,
 * if the browser cache is enabled but `caches` is undefined, or if the FS cache is
 * enabled but the file system is unavailable.
 */
async function getCache(file_cache_dir = null) {
  let cache2 = null;

  if (env.useCustomCache) {
    if (!env.customCache) {
      throw new Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
    }
    if (!env.customCache.match || !env.customCache.put) {
      throw new Error(
        "`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
      );
    }
    cache2 = env.customCache;
  }

  if (!cache2 && env.experimental_useCrossOriginStorage && CrossOriginStorage.isAvailable()) {
    cache2 = new CrossOriginStorage();
  }

  if (!cache2 && env.useBrowserCache) {
    if (typeof caches === "undefined") {
      throw new Error("Browser cache is not available in this environment.");
    }
    try {
      cache2 = await caches.open(env.cacheKey);
    } catch (e) {
      // Opening can fail at runtime; warn and fall through to the next backend.
      logger.warn("An error occurred while opening the browser cache:", e);
    }
  }

  if (!cache2 && env.useFSCache) {
    if (!apis.IS_FS_AVAILABLE) {
      throw new Error("File System Cache is not available in this environment.");
    }
    cache2 = new FileCache(file_cache_dir ?? env.cacheDir);
  }

  return cache2;
}
/**
 * Looks up several candidate keys in a cache and returns the first hit.
 *
 * Keys are tried in the order given. A key whose lookup throws is skipped
 * rather than aborting the search.
 *
 * @param {{ match: (name: string) => any }} cache2 Cache exposing a `match` function.
 * @param {...string} names Candidate cache keys, in priority order.
 * @returns {Promise<any|undefined>} The first truthy `match` result, or `undefined`
 * if no key produced one.
 */
async function tryCache(cache2, ...names) {
  for (const name of names) {
    try {
      const result = await cache2.match(name);
      if (result) return result;
    } catch (e) {
      // A failing lookup for one key should not prevent trying the others.
      continue;
    }
  }
  return void 0;
}
6261
|
+
|
|
6262
|
+
// src/utils/lru_cache.js
|
|
/**
 * A least-recently-used cache backed by a `Map`.
 *
 * Relies on `Map` preserving insertion order: every read or write re-inserts the
 * key at the end, so the first key in iteration order is always the least recently used.
 */
var LRUCache2 = class {
  /** @type {number} */
  #capacity;
  /** @type {Map<any, any>} */
  #cache;

  /**
   * Creates an LRUCache instance.
   * @param {number} capacity The maximum number of items the cache can hold.
   */
  constructor(capacity) {
    this.#capacity = capacity;
    this.#cache = /* @__PURE__ */ new Map();
  }

  /**
   * Retrieves the value associated with the given key and marks the key as recently used.
   * @param {any} key The key to retrieve.
   * @returns {any} The value associated with the key, or undefined if the key does not exist.
   */
  get(key) {
    if (!this.#cache.has(key)) return void 0;
    const value = this.#cache.get(key);
    // Delete + set moves the key to the most-recently-used end of the Map.
    this.#cache.delete(key);
    this.#cache.set(key, value);
    return value;
  }

  /**
   * Inserts or updates the key-value pair in the cache.
   * If the key already exists, it is updated and marked as recently used.
   * If the cache exceeds its capacity, the least recently used item is evicted.
   * @param {any} key The key to add or update.
   * @param {any} value The value to associate with the key.
   */
  put(key, value) {
    if (this.#cache.has(key)) {
      this.#cache.delete(key);
    }
    this.#cache.set(key, value);
    if (this.#cache.size > this.#capacity) {
      // The first key in iteration order is the least recently used one.
      this.#cache.delete(this.#cache.keys().next().value);
    }
  }

  /**
   * Removes the entry for the given key from the cache.
   * @param {any} key The key to delete.
   * @returns {boolean} `true` if the entry existed and was removed, `false` otherwise.
   */
  delete(key) {
    return this.#cache.delete(key);
  }

  /**
   * Clears the cache.
   */
  clear() {
    this.#cache.clear();
  }
};
6319
|
+
|
|
6320
|
+
// src/utils/memoize_promise.js
|
|
// Maximum number of memoized promises kept at once; older entries are evicted by the LRU cache.
var MAX_CACHE_SIZE = 100;
// Module-level store shared by every `memoizePromise` call.
var cache = new LRUCache2(MAX_CACHE_SIZE);
/**
 * Returns a shared promise for `key`, creating it with `factory` on first use.
 *
 * While an entry for `key` is in the module-level cache, every call returns the
 * same promise, so concurrent callers share one underlying operation. A rejected
 * promise is removed from the cache so that a later call can retry.
 *
 * @param {any} key Cache key identifying the operation.
 * @param {() => Promise<any>} factory Called (synchronously) to start the operation on a cache miss.
 * @returns {Promise<any>} The cached or newly created promise.
 */
function memoizePromise(key, factory) {
  const cached = cache.get(key);
  if (cached !== void 0) {
    return cached;
  }
  const promise = factory().then(
    (value) => value,
    (err) => {
      // Only evict our own entry: if this promise was already evicted and a newer
      // one was stored under the same key, that newer entry must be left alone.
      if (cache.get(key) === promise) {
        cache.delete(key);
      }
      return Promise.reject(err);
    }
  );
  cache.put(key, promise);
  return promise;
}
6338
|
+
|
|
6339
|
+
// src/utils/model_registry/get_file_metadata.js
|
|
/**
 * Issues a minimal ranged GET request for a remote file.
 *
 * A `Range: bytes=0-0` header is set so that at most one byte of the body is
 * requested, and `cache: "no-store"` is passed to the fetch call.
 *
 * @param {string} urlOrPath URL (or local path) of the file.
 * @returns {Promise<Response|null>} The fetch response, or `null` when `urlOrPath`
 * is not an `http:`/`https:` URL.
 */
async function fetch_file_head(urlOrPath) {
  const isRemote = isValidUrl(urlOrPath, ["http:", "https:"]);
  if (!isRemote) {
    return null;
  }
  const headers = getFetchHeaders(urlOrPath);
  headers.set("Range", "bytes=0-0");
  return env.fetch(urlOrPath, { method: "GET", headers, cache: "no-store" });
}
/**
 * Memoized entry point for file-metadata lookups.
 *
 * Calls with the same repo/path, filename, revision, cache directory and
 * `local_files_only` flag share one promise via `memoizePromise`; the actual
 * work is delegated to `_get_file_metadata`.
 *
 * @param {string} path_or_repo_id Model id or local path.
 * @param {string} filename Name of the file within the model.
 * @param {Object} [options={}] Lookup options; only `revision`, `cache_dir` and
 * `local_files_only` take part in the memoization key.
 * @returns {Promise<any>} Whatever `_get_file_metadata` resolves to.
 */
function get_file_metadata(path_or_repo_id, filename, options = {}) {
  // JSON-encode the tuple so distinct argument combinations cannot collide.
  const key = JSON.stringify([
    path_or_repo_id,
    filename,
    options?.revision,
    options?.cache_dir,
    options?.local_files_only
  ]);
  return memoizePromise(key, () => _get_file_metadata(path_or_repo_id, filename, options));
}
5848
|
-
async function
|
|
5849
|
-
const
|
|
6358
|
+
async function _get_file_metadata(path_or_repo_id, filename, options) {
|
|
6359
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
5850
6360
|
const { localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
5851
6361
|
path_or_repo_id,
|
|
5852
6362
|
filename,
|
|
5853
6363
|
options,
|
|
5854
|
-
|
|
6364
|
+
cache2
|
|
5855
6365
|
);
|
|
5856
|
-
const cachedResponse = await checkCachedResource(
|
|
6366
|
+
const cachedResponse = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
5857
6367
|
if (cachedResponse !== void 0 && typeof cachedResponse !== "string") {
|
|
5858
6368
|
const size = cachedResponse.headers.get("content-length");
|
|
5859
6369
|
const contentType = cachedResponse.headers.get("content-type");
|
|
@@ -5951,7 +6461,7 @@ function getFetchHeaders(urlOrPath) {
|
|
|
5951
6461
|
}
|
|
5952
6462
|
return headers;
|
|
5953
6463
|
}
|
|
5954
|
-
function buildResourcePaths(path_or_repo_id, filename, options = {},
|
|
6464
|
+
function buildResourcePaths(path_or_repo_id, filename, options = {}, cache2 = null) {
|
|
5955
6465
|
const revision = options.revision ?? "main";
|
|
5956
6466
|
const requestURL = pathJoin(path_or_repo_id, filename);
|
|
5957
6467
|
const validModelId = isValidHfModelId(path_or_repo_id);
|
|
@@ -5961,7 +6471,7 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
5961
6471
|
env.remotePathTemplate.replaceAll("{model}", path_or_repo_id).replaceAll("{revision}", encodeURIComponent(revision)),
|
|
5962
6472
|
filename
|
|
5963
6473
|
);
|
|
5964
|
-
const proposedCacheKey =
|
|
6474
|
+
const proposedCacheKey = cache2 instanceof FileCache ? (
|
|
5965
6475
|
// Choose cache key for filesystem cache
|
|
5966
6476
|
// When using the main revision (default), we use the request URL as the cache key.
|
|
5967
6477
|
// If a specific revision is requested, we account for this in the cache key.
|
|
@@ -5975,14 +6485,14 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
5975
6485
|
validModelId
|
|
5976
6486
|
};
|
|
5977
6487
|
}
|
|
/**
 * Looks a resource up in the given cache, trying the local path first and the
 * proposed cache key second.
 *
 * @param {any} cache2 The cache to query, or a falsy value when caching is disabled.
 * @param {string} localPath Local path of the resource.
 * @param {string} proposedCacheKey Cache key proposed for the resource.
 * @returns {Promise<any|undefined>} The cached entry, or `undefined` when there is
 * no cache or neither key matches.
 */
async function checkCachedResource(cache2, localPath, proposedCacheKey) {
  if (!cache2) {
    return void 0;
  }
  return await tryCache(cache2, localPath, proposedCacheKey);
}
5984
|
-
async function storeCachedResource(path_or_repo_id, filename,
|
|
5985
|
-
if (await
|
|
6494
|
+
async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options = {}) {
|
|
6495
|
+
if (await cache2.match(cacheKey) !== void 0) {
|
|
5986
6496
|
return;
|
|
5987
6497
|
}
|
|
5988
6498
|
if (!result) {
|
|
@@ -5992,14 +6502,14 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
5992
6502
|
file: filename,
|
|
5993
6503
|
...data
|
|
5994
6504
|
}) : void 0;
|
|
5995
|
-
await
|
|
6505
|
+
await cache2.put(
|
|
5996
6506
|
cacheKey,
|
|
5997
6507
|
/** @type {Response} */
|
|
5998
6508
|
response,
|
|
5999
6509
|
wrapped_progress
|
|
6000
6510
|
);
|
|
6001
6511
|
} else if (typeof response !== "string") {
|
|
6002
|
-
await
|
|
6512
|
+
await cache2.put(
|
|
6003
6513
|
cacheKey,
|
|
6004
6514
|
new Response(
|
|
6005
6515
|
/** @type {any} */
|
|
@@ -6013,17 +6523,17 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
6013
6523
|
});
|
|
6014
6524
|
}
|
|
6015
6525
|
}
|
|
6016
|
-
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false,
|
|
6526
|
+
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache2 = null) {
|
|
6017
6527
|
const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6018
6528
|
path_or_repo_id,
|
|
6019
6529
|
filename,
|
|
6020
6530
|
options,
|
|
6021
|
-
|
|
6531
|
+
cache2
|
|
6022
6532
|
);
|
|
6023
6533
|
let cacheKey;
|
|
6024
6534
|
let toCacheResponse = false;
|
|
6025
6535
|
let response;
|
|
6026
|
-
response = await checkCachedResource(
|
|
6536
|
+
response = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6027
6537
|
const cacheHit = response !== void 0;
|
|
6028
6538
|
if (!cacheHit) {
|
|
6029
6539
|
if (env.allowLocalModels) {
|
|
@@ -6064,7 +6574,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6064
6574
|
}
|
|
6065
6575
|
cacheKey = proposedCacheKey;
|
|
6066
6576
|
}
|
|
6067
|
-
toCacheResponse =
|
|
6577
|
+
toCacheResponse = cache2 && // 1. A caching system is available
|
|
6068
6578
|
typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment)
|
|
6069
6579
|
response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`)
|
|
6070
6580
|
response.status === 200;
|
|
@@ -6126,7 +6636,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6126
6636
|
// i.e., do not cache FileResponses (prevents duplication)
|
|
6127
6637
|
toCacheResponse && cacheKey && typeof response !== "string"
|
|
6128
6638
|
) {
|
|
6129
|
-
await storeCachedResource(path_or_repo_id, filename,
|
|
6639
|
+
await storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options);
|
|
6130
6640
|
}
|
|
6131
6641
|
dispatchCallback(options.progress_callback, {
|
|
6132
6642
|
status: "done",
|
|
@@ -6142,7 +6652,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
6142
6652
|
if (response instanceof FileResponse) {
|
|
6143
6653
|
return response.filePath;
|
|
6144
6654
|
}
|
|
6145
|
-
const cachedResponse = await
|
|
6655
|
+
const cachedResponse = await cache2?.match(cacheKey);
|
|
6146
6656
|
if (cachedResponse instanceof FileResponse) {
|
|
6147
6657
|
return cachedResponse.filePath;
|
|
6148
6658
|
} else if (cachedResponse instanceof Response) {
|
|
@@ -6169,8 +6679,8 @@ async function getModelFile(path_or_repo_id, filename, fatal = true, options = {
|
|
|
6169
6679
|
name: path_or_repo_id,
|
|
6170
6680
|
file: filename
|
|
6171
6681
|
});
|
|
6172
|
-
const
|
|
6173
|
-
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path,
|
|
6682
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6683
|
+
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache2);
|
|
6174
6684
|
}
|
|
6175
6685
|
async function getModelText(modelPath, fileName, fatal = true, options = {}) {
|
|
6176
6686
|
const buffer = await getModelFile(modelPath, fileName, fatal, options, false);
|
|
@@ -6973,11 +7483,11 @@ import * as ONNX_WEB from "onnxruntime-web/webgpu";
|
|
|
6973
7483
|
// src/backends/utils/cacheWasm.js
|
|
6974
7484
|
async function loadAndCacheFile(url) {
|
|
6975
7485
|
const fileName = url.split("/").pop();
|
|
6976
|
-
let
|
|
7486
|
+
let cache2;
|
|
6977
7487
|
try {
|
|
6978
|
-
|
|
6979
|
-
if (
|
|
6980
|
-
const result = await
|
|
7488
|
+
cache2 = await getCache();
|
|
7489
|
+
if (cache2) {
|
|
7490
|
+
const result = await cache2.match(url);
|
|
6981
7491
|
if (result) {
|
|
6982
7492
|
return result;
|
|
6983
7493
|
}
|
|
@@ -6989,9 +7499,9 @@ async function loadAndCacheFile(url) {
|
|
|
6989
7499
|
if (!response.ok) {
|
|
6990
7500
|
throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
|
|
6991
7501
|
}
|
|
6992
|
-
if (
|
|
7502
|
+
if (cache2) {
|
|
6993
7503
|
try {
|
|
6994
|
-
await
|
|
7504
|
+
await cache2.put(url, response.clone());
|
|
6995
7505
|
} catch (e) {
|
|
6996
7506
|
logger.warn(`Failed to cache ${fileName}:`, e);
|
|
6997
7507
|
}
|
|
@@ -7009,34 +7519,21 @@ async function loadWasmBinary(wasmURL) {
|
|
|
7009
7519
|
}
|
|
7010
7520
|
}
|
|
/**
 * Loads the WASM factory script and exposes it as a blob URL.
 *
 * When `apis.IS_SERVICE_WORKER_ENV` or `apis.IS_CHROME_AVAILABLE` is set, the
 * original URL is returned unchanged and nothing is fetched.
 * NOTE(review): presumably because blob URLs cannot be imported in those
 * environments — confirm against the caller.
 *
 * Otherwise the script is obtained through `loadAndCacheFile`, every occurrence of
 * `globalThis.process?.versions?.node` in it is replaced with `false`, and the
 * patched source is wrapped in a `text/javascript` blob.
 *
 * @param {string} libURL URL of the factory (`.mjs`) script.
 * @returns {Promise<string|null>} `libURL` itself, a blob URL for the patched script,
 * or `null` if the script could not be loaded or read.
 */
async function loadWasmFactory(libURL) {
  if (apis.IS_SERVICE_WORKER_ENV || apis.IS_CHROME_AVAILABLE) {
    return libURL;
  }
  const response = await loadAndCacheFile(libURL);
  if (!response || typeof response === "string") {
    return null;
  }
  try {
    const source = await response.text();
    const code = source.replaceAll("globalThis.process?.versions?.node", "false");
    const blob = new Blob([code], { type: "text/javascript" });
    return URL.createObjectURL(blob);
  } catch (error) {
    logger.warn("Failed to read WASM factory:", error);
    return null;
  }
}
7026
|
-
function isBlobURL(url) {
|
|
7027
|
-
return isValidUrl(url, ["blob:"]);
|
|
7028
|
-
}
|
|
7029
|
-
function toAbsoluteURL(url) {
|
|
7030
|
-
let baseURL;
|
|
7031
|
-
if (typeof location !== "undefined" && location.href) {
|
|
7032
|
-
baseURL = location.href;
|
|
7033
|
-
} else if (typeof import.meta !== "undefined" && import.meta.url) {
|
|
7034
|
-
baseURL = import.meta.url;
|
|
7035
|
-
} else {
|
|
7036
|
-
return url;
|
|
7037
|
-
}
|
|
7038
|
-
return new URL(url, baseURL).href;
|
|
7039
|
-
}
|
|
7040
7537
|
|
|
7041
7538
|
// src/backends/onnx.js
|
|
7042
7539
|
import { Tensor } from "onnxruntime-common";
|
|
@@ -7135,7 +7632,6 @@ function deviceToExecutionProviders(device = null) {
|
|
|
7135
7632
|
}
|
|
7136
7633
|
throw new Error(`Unsupported device: "${device}". Should be one of: ${supportedDevices.join(", ")}.`);
|
|
7137
7634
|
}
|
|
7138
|
-
var IS_WEB_ENV = apis.IS_BROWSER_ENV || apis.IS_WEBWORKER_ENV;
|
|
7139
7635
|
var webInitChain = Promise.resolve();
|
|
7140
7636
|
var wasmLoadPromise = null;
|
|
7141
7637
|
async function ensureWasmLoaded() {
|
|
@@ -7144,6 +7640,11 @@ async function ensureWasmLoaded() {
|
|
|
7144
7640
|
}
|
|
7145
7641
|
const shouldUseWasmCache = env.useWasmCache && typeof ONNX_ENV?.wasm?.wasmPaths === "object" && ONNX_ENV?.wasm?.wasmPaths?.wasm && ONNX_ENV?.wasm?.wasmPaths?.mjs;
|
|
7146
7642
|
if (!shouldUseWasmCache) {
|
|
7643
|
+
if (apis.IS_DENO_WEB_RUNTIME) {
|
|
7644
|
+
throw new Error(
|
|
7645
|
+
"env.useWasmCache=false is not supported in Deno's web runtime. Remove the useWasmCache override."
|
|
7646
|
+
);
|
|
7647
|
+
}
|
|
7147
7648
|
wasmLoadPromise = Promise.resolve();
|
|
7148
7649
|
return wasmLoadPromise;
|
|
7149
7650
|
}
|
|
@@ -7152,6 +7653,7 @@ async function ensureWasmLoaded() {
|
|
|
7152
7653
|
/** @type {{ wasm: string, mjs: string }} */
|
|
7153
7654
|
ONNX_ENV.wasm.wasmPaths
|
|
7154
7655
|
);
|
|
7656
|
+
let wasmBinaryLoaded = false;
|
|
7155
7657
|
await Promise.all([
|
|
7156
7658
|
// Load and cache the WASM binary
|
|
7157
7659
|
urls.wasm && !isBlobURL(urls.wasm) ? (async () => {
|
|
@@ -7159,12 +7661,13 @@ async function ensureWasmLoaded() {
|
|
|
7159
7661
|
const wasmBinary = await loadWasmBinary(toAbsoluteURL(urls.wasm));
|
|
7160
7662
|
if (wasmBinary) {
|
|
7161
7663
|
ONNX_ENV.wasm.wasmBinary = wasmBinary;
|
|
7664
|
+
wasmBinaryLoaded = true;
|
|
7162
7665
|
}
|
|
7163
7666
|
} catch (err) {
|
|
7164
7667
|
logger.warn("Failed to pre-load WASM binary:", err);
|
|
7165
7668
|
}
|
|
7166
7669
|
})() : Promise.resolve(),
|
|
7167
|
-
// Load and cache the WASM factory
|
|
7670
|
+
// Load and cache the WASM factory as a blob URL
|
|
7168
7671
|
urls.mjs && !isBlobURL(urls.mjs) ? (async () => {
|
|
7169
7672
|
try {
|
|
7170
7673
|
const wasmFactoryBlob = await loadWasmFactory(toAbsoluteURL(urls.mjs));
|
|
@@ -7176,6 +7679,9 @@ async function ensureWasmLoaded() {
|
|
|
7176
7679
|
}
|
|
7177
7680
|
})() : Promise.resolve()
|
|
7178
7681
|
]);
|
|
7682
|
+
if (!wasmBinaryLoaded) {
|
|
7683
|
+
ONNX_ENV.wasm.wasmPaths.mjs = urls.mjs;
|
|
7684
|
+
}
|
|
7179
7685
|
})();
|
|
7180
7686
|
return wasmLoadPromise;
|
|
7181
7687
|
}
|
|
@@ -7187,51 +7693,52 @@ async function createInferenceSession(buffer_or_path, session_options, session_c
|
|
|
7187
7693
|
logSeverityLevel,
|
|
7188
7694
|
...session_options
|
|
7189
7695
|
});
|
|
7190
|
-
const session = await (IS_WEB_ENV ? webInitChain = webInitChain.then(load) : load());
|
|
7696
|
+
const session = await (apis.IS_WEB_ENV ? webInitChain = webInitChain.then(load) : load());
|
|
7191
7697
|
session.config = session_config;
|
|
7192
7698
|
return session;
|
|
7193
7699
|
}
|
|
// Tail of the queue of inference runs in web environments; each run starts
// only after the previously queued one has settled.
var webInferenceChain = Promise.resolve();
/**
 * Runs an inference session with the given feeds.
 *
 * When `apis.IS_WEB_ENV` is truthy, runs are queued on `webInferenceChain` and
 * executed one after another. Otherwise `session.run` is called directly.
 *
 * A failed run rejects only the promise returned to its own caller: the stored
 * chain tail swallows the outcome, so later runs still execute instead of
 * inheriting the earlier rejection.
 *
 * @param {{ run: (feeds: any) => Promise<any> }} session The session to run.
 * @param {any} ortFeed The feeds passed to `session.run`.
 * @returns {Promise<any>} The result of `session.run(ortFeed)`.
 */
async function runInferenceSession(session, ortFeed) {
  const run = () => session.run(ortFeed);
  if (!apis.IS_WEB_ENV) {
    return run();
  }
  const pending = webInferenceChain.then(run);
  // Keep the queue alive regardless of how this run settles.
  webInferenceChain = pending.then(
    () => {},
    () => {}
  );
  return pending;
}
/**
 * Checks whether a value is an ONNX tensor.
 *
 * @param {any} x The value to check.
 * @returns {boolean} `true` if `x` is an instance of `ONNX.Tensor`.
 */
function isONNXTensor(x) {
  return x instanceof ONNX.Tensor;
}
7203
7708
|
var ONNX_ENV = ONNX?.env;
|
|
7204
|
-
if (ONNX_ENV?.wasm) {
|
|
7205
|
-
if (
|
|
7206
|
-
// @ts-ignore Cannot find name 'ServiceWorkerGlobalScope'.ts(2304)
|
|
7207
|
-
!(typeof ServiceWorkerGlobalScope !== "undefined" && self instanceof ServiceWorkerGlobalScope) && ONNX_ENV.versions?.web && !ONNX_ENV.wasm.wasmPaths
|
|
7208
|
-
) {
|
|
7209
|
-
const wasmPathPrefix = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${ONNX_ENV.versions.web}/dist/`;
|
|
7210
|
-
ONNX_ENV.wasm.wasmPaths = apis.IS_SAFARI ? {
|
|
7211
|
-
mjs: `${wasmPathPrefix}ort-wasm-simd-threaded.mjs`,
|
|
7212
|
-
wasm: `${wasmPathPrefix}ort-wasm-simd-threaded.wasm`
|
|
7213
|
-
} : {
|
|
7214
|
-
mjs: `${wasmPathPrefix}ort-wasm-simd-threaded.asyncify.mjs`,
|
|
7215
|
-
wasm: `${wasmPathPrefix}ort-wasm-simd-threaded.asyncify.wasm`
|
|
7216
|
-
};
|
|
7217
|
-
}
|
|
7218
|
-
ONNX_ENV.wasm.proxy = false;
|
|
7219
|
-
}
|
|
7220
|
-
if (ONNX_ENV?.webgpu) {
|
|
7221
|
-
ONNX_ENV.webgpu.powerPreference = "high-performance";
|
|
7222
|
-
}
|
|
/**
 * Reports the ONNX Runtime WASM proxy setting.
 *
 * @returns {boolean|undefined} The value of `ONNX_ENV.wasm.proxy`, or `undefined`
 * when `ONNX_ENV` or its `wasm` section is missing.
 */
function isONNXProxy() {
  return ONNX_ENV?.wasm?.proxy;
}
7226
|
-
|
|
7227
|
-
|
|
7228
|
-
|
|
// One-time ONNX Runtime environment setup. Everything is skipped when the
// runtime exposes no `env` object, so nothing below dereferences `undefined`.
if (ONNX_ENV) {
  /**
   * Maps a library log level to an ONNX severity and applies it to `ONNX_ENV.logLevel`.
   * @param {any} logLevel2 Log level as accepted by `getOnnxLogSeverityLevel`.
   */
  const setLogLevel = function(logLevel2) {
    const severityLevel = getOnnxLogSeverityLevel(logLevel2);
    ONNX_ENV.logLevel = ONNX_LOG_LEVEL_NAMES[severityLevel];
  };

  if (ONNX_ENV.wasm) {
    if (
      // @ts-ignore Cannot find name 'ServiceWorkerGlobalScope'.ts(2304)
      !(typeof ServiceWorkerGlobalScope !== "undefined" && self instanceof ServiceWorkerGlobalScope) && ONNX_ENV.versions?.web && !ONNX_ENV.wasm.wasmPaths
    ) {
      // Outside service workers, when a web runtime version is known and no paths
      // were configured, default to the matching jsDelivr build of onnxruntime-web.
      const wasmPathPrefix = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${ONNX_ENV.versions.web}/dist/`;
      // Safari gets the plain build; every other browser gets the asyncify build.
      ONNX_ENV.wasm.wasmPaths = apis.IS_SAFARI ? {
        mjs: `${wasmPathPrefix}ort-wasm-simd-threaded.mjs`,
        wasm: `${wasmPathPrefix}ort-wasm-simd-threaded.wasm`
      } : {
        mjs: `${wasmPathPrefix}ort-wasm-simd-threaded.asyncify.mjs`,
        wasm: `${wasmPathPrefix}ort-wasm-simd-threaded.asyncify.wasm`
      };
    }
    ONNX_ENV.wasm.proxy = false;
  }

  if (ONNX_ENV.webgpu) {
    ONNX_ENV.webgpu.powerPreference = "high-performance";
  }

  setLogLevel(env.logLevel ?? LogLevel.WARNING);
  // Shallow copy: top-level properties are snapshotted here, nested objects
  // (e.g. `wasm`) are still shared with `ONNX_ENV`.
  env.backends.onnx = {
    ...ONNX_ENV,
    setLogLevel
  };
}
7230
|
-
setLogLevel(env.logLevel ?? LogLevel.WARNING);
|
|
7231
|
-
env.backends.onnx = {
|
|
7232
|
-
...ONNX_ENV,
|
|
7233
|
-
setLogLevel
|
|
7234
|
-
};
|
|
7235
7742
|
|
|
7236
7743
|
// src/ops/registry.js
|
|
7237
7744
|
var wrap = async (session_bytes, session_options, names) => {
|
|
@@ -8423,179 +8930,18 @@ var DataTypeMap = Object.freeze({
|
|
|
8423
8930
|
float64: Float64Array,
|
|
8424
8931
|
string: Array,
|
|
8425
8932
|
// string[]
|
|
8426
|
-
int8: Int8Array,
|
|
8427
|
-
uint8: Uint8Array,
|
|
8428
|
-
int16: Int16Array,
|
|
8429
|
-
uint16: Uint16Array,
|
|
8430
|
-
int32: Int32Array,
|
|
8431
|
-
uint32: Uint32Array,
|
|
8432
|
-
int64: BigInt64Array,
|
|
8433
|
-
uint64: BigUint64Array,
|
|
8434
|
-
bool: Uint8Array,
|
|
8435
|
-
uint4: Uint8Array,
|
|
8436
|
-
int4: Int8Array
|
|
8437
|
-
});
|
|
8438
|
-
|
|
8439
|
-
// src/utils/random.js
|
|
8440
|
-
var Random = class {
|
|
8441
|
-
constructor(seed) {
|
|
8442
|
-
this._mt = new Uint32Array(624);
|
|
8443
|
-
this._idx = 625;
|
|
8444
|
-
this._gauss_next = null;
|
|
8445
|
-
this._random_fn = this.random.bind(this);
|
|
8446
|
-
this.seed(seed);
|
|
8447
|
-
}
|
|
8448
|
-
/**
|
|
8449
|
-
* Seeds this instance's PRNG.
|
|
8450
|
-
*
|
|
8451
|
-
* When called with a number, initializes the state deterministically from that value.
|
|
8452
|
-
* When called with no arguments (or `undefined`/`null`), seeds from OS entropy
|
|
8453
|
-
* via `crypto.getRandomValues`, matching Python's `random.seed()` behaviour.
|
|
8454
|
-
*
|
|
8455
|
-
* @param {number} [n] The seed value. Omit to seed from OS entropy.
|
|
8456
|
-
*/
|
|
8457
|
-
seed(n) {
|
|
8458
|
-
if (n === void 0 || n === null) {
|
|
8459
|
-
if (apis.IS_CRYPTO_AVAILABLE) {
|
|
8460
|
-
const buf = new Uint32Array(1);
|
|
8461
|
-
crypto.getRandomValues(buf);
|
|
8462
|
-
n = buf[0];
|
|
8463
|
-
} else {
|
|
8464
|
-
n = Date.now() >>> 0;
|
|
8465
|
-
}
|
|
8466
|
-
}
|
|
8467
|
-
const mt = this._mt;
|
|
8468
|
-
const u = (a, b) => Math.imul(a, b) >>> 0, key = [];
|
|
8469
|
-
for (let v = n || 0; v > 0; v = Math.floor(v / 4294967296)) key.push(v & 4294967295);
|
|
8470
|
-
if (!key.length) key.push(0);
|
|
8471
|
-
mt[0] = 19650218;
|
|
8472
|
-
for (let k = 1; k < 624; ++k) mt[k] = u(1812433253, mt[k - 1] ^ mt[k - 1] >>> 30) + k >>> 0;
|
|
8473
|
-
let i = 1, j = 0;
|
|
8474
|
-
for (let k = Math.max(624, key.length); k > 0; --k, ++i, ++j) {
|
|
8475
|
-
if (i >= 624) {
|
|
8476
|
-
mt[0] = mt[623];
|
|
8477
|
-
i = 1;
|
|
8478
|
-
}
|
|
8479
|
-
if (j >= key.length) j = 0;
|
|
8480
|
-
mt[i] = (mt[i] ^ u(mt[i - 1] ^ mt[i - 1] >>> 30, 1664525)) + key[j] + j >>> 0;
|
|
8481
|
-
}
|
|
8482
|
-
for (let k = 623; k > 0; --k, ++i) {
|
|
8483
|
-
if (i >= 624) {
|
|
8484
|
-
mt[0] = mt[623];
|
|
8485
|
-
i = 1;
|
|
8486
|
-
}
|
|
8487
|
-
mt[i] = (mt[i] ^ u(mt[i - 1] ^ mt[i - 1] >>> 30, 1566083941)) - i >>> 0;
|
|
8488
|
-
}
|
|
8489
|
-
mt[0] = 2147483648;
|
|
8490
|
-
this._idx = 624;
|
|
8491
|
-
this._gauss_next = null;
|
|
8492
|
-
}
|
|
8493
|
-
/**
|
|
8494
|
-
* Generates a random unsigned 32-bit integer.
|
|
8495
|
-
*
|
|
8496
|
-
* Performs the "twist" step when the state buffer is exhausted,
|
|
8497
|
-
* then applies the standard MT19937 tempering transform.
|
|
8498
|
-
*
|
|
8499
|
-
* @returns {number} A random integer in the range [0, 2^32 - 1].
|
|
8500
|
-
*/
|
|
8501
|
-
_int32() {
|
|
8502
|
-
const mt = this._mt;
|
|
8503
|
-
if (this._idx >= 624) {
|
|
8504
|
-
for (let k = 0; k < 624; ++k) {
|
|
8505
|
-
const y2 = mt[k] & 2147483648 | mt[(k + 1) % 624] & 2147483647;
|
|
8506
|
-
mt[k] = (mt[(k + 397) % 624] ^ y2 >>> 1 ^ (y2 & 1 ? 2567483615 : 0)) >>> 0;
|
|
8507
|
-
}
|
|
8508
|
-
this._idx = 0;
|
|
8509
|
-
}
|
|
8510
|
-
let y = mt[this._idx++];
|
|
8511
|
-
y ^= y >>> 11;
|
|
8512
|
-
y ^= y << 7 & 2636928640;
|
|
8513
|
-
y ^= y << 15 & 4022730752;
|
|
8514
|
-
y ^= y >>> 18;
|
|
8515
|
-
return y >>> 0;
|
|
8516
|
-
}
|
|
8517
|
-
/**
|
|
8518
|
-
* Generates a random floating-point number in the half-open interval [0, 1).
|
|
8519
|
-
*
|
|
8520
|
-
* Combines two 32-bit integers (using 53 bits of precision) to produce
|
|
8521
|
-
* a uniformly distributed double, matching Python's `random.random()`.
|
|
8522
|
-
*
|
|
8523
|
-
* @returns {number} A random float in [0, 1).
|
|
8524
|
-
*/
|
|
8525
|
-
random() {
|
|
8526
|
-
return ((this._int32() >>> 5) * 67108864 + (this._int32() >>> 6)) / 9007199254740992;
|
|
8527
|
-
}
|
|
8528
|
-
/**
|
|
8529
|
-
* Generates a random number from a Gaussian (normal) distribution.
|
|
8530
|
-
*
|
|
8531
|
-
* Uses the Box-Muller transform with a cached spare value,
|
|
8532
|
-
* matching Python's `random.gauss()` output for the same seed.
|
|
8533
|
-
*
|
|
8534
|
-
* @param {number} [mu=0] The mean of the distribution.
|
|
8535
|
-
* @param {number} [sigma=1] The standard deviation of the distribution.
|
|
8536
|
-
* @returns {number} A normally distributed random value.
|
|
8537
|
-
*/
|
|
8538
|
-
gauss(mu = 0, sigma = 1) {
|
|
8539
|
-
let z = this._gauss_next;
|
|
8540
|
-
this._gauss_next = null;
|
|
8541
|
-
if (z === null) {
|
|
8542
|
-
const x2pi = this.random() * 2 * Math.PI, g2rad = Math.sqrt(-2 * Math.log(1 - this.random()));
|
|
8543
|
-
z = Math.cos(x2pi) * g2rad;
|
|
8544
|
-
this._gauss_next = Math.sin(x2pi) * g2rad;
|
|
8545
|
-
}
|
|
8546
|
-
return mu + z * sigma;
|
|
8547
|
-
}
|
|
8548
|
-
/**
|
|
8549
|
-
* Shuffles an array in-place using the Fisher-Yates algorithm.
|
|
8550
|
-
*
|
|
8551
|
-
* Uses rejection sampling via `getrandbits`-style bit masking to ensure
|
|
8552
|
-
* a uniform distribution, matching Python's `random.shuffle()`.
|
|
8553
|
-
*
|
|
8554
|
-
* @param {any[]} arr The array to shuffle in-place.
|
|
8555
|
-
*/
|
|
8556
|
-
shuffle(arr) {
|
|
8557
|
-
for (let i = arr.length - 1; i > 0; --i) {
|
|
8558
|
-
const k = 32 - Math.clz32(i + 1);
|
|
8559
|
-
let r = this._int32() >>> 32 - k;
|
|
8560
|
-
while (r > i) r = this._int32() >>> 32 - k;
|
|
8561
|
-
const t = arr[i];
|
|
8562
|
-
arr[i] = arr[r];
|
|
8563
|
-
arr[r] = t;
|
|
8564
|
-
}
|
|
8565
|
-
}
|
|
8566
|
-
/**
|
|
8567
|
-
* Selects a single element from a weighted population.
|
|
8568
|
-
*
|
|
8569
|
-
* Matches Python's `random.choices(population, weights=weights, k=1)[0]`
|
|
8570
|
-
*
|
|
8571
|
-
* @param {any[]} population The array of items to choose from.
|
|
8572
|
-
* @param {number[]} weights An array of non-negative weights, one per population element.
|
|
8573
|
-
* @returns {*} A single randomly selected element from the population.
|
|
8574
|
-
*/
|
|
8575
|
-
choices(population, weights) {
|
|
8576
|
-
return population[_weightedIndexWith(this._random_fn, weights)];
|
|
8577
|
-
}
|
|
8578
|
-
};
|
|
8579
|
-
/**
 * Selects an index with probability proportional to its weight.
 *
 * A point is drawn in [0, total) using `randomFn`, then the weights are
 * subtracted one by one until the remainder becomes negative.
 *
 * @param {() => number} randomFn Source of uniform random numbers in [0, 1).
 * @param {ArrayLike<number>} weights The weight of each index.
 * @returns {number} The selected index, or `weights.length - 1` when no
 * index was selected by the scan (e.g. because of rounding).
 */
function _weightedIndexWith(randomFn, weights) {
  const count = weights.length;

  let total = 0;
  for (let i = 0; i < count; ++i) {
    total += weights[i];
  }

  let remaining = randomFn() * total;
  let index = 0;
  while (index < count) {
    remaining -= weights[index];
    if (remaining < 0) {
      return index;
    }
    ++index;
  }
  return count - 1;
}
|
|
8589
|
-
var _default = new Random();
|
|
8590
|
-
var random = Object.freeze({
|
|
8591
|
-
Random,
|
|
8592
|
-
seed: _default.seed.bind(_default),
|
|
8593
|
-
random: _default.random.bind(_default),
|
|
8594
|
-
gauss: _default.gauss.bind(_default),
|
|
8595
|
-
shuffle: _default.shuffle.bind(_default),
|
|
8596
|
-
choices: _default.choices.bind(_default)
|
|
8933
|
+
int8: Int8Array,
|
|
8934
|
+
uint8: Uint8Array,
|
|
8935
|
+
int16: Int16Array,
|
|
8936
|
+
uint16: Uint16Array,
|
|
8937
|
+
int32: Int32Array,
|
|
8938
|
+
uint32: Uint32Array,
|
|
8939
|
+
int64: BigInt64Array,
|
|
8940
|
+
uint64: BigUint64Array,
|
|
8941
|
+
bool: Uint8Array,
|
|
8942
|
+
uint4: Uint8Array,
|
|
8943
|
+
int4: Int8Array
|
|
8597
8944
|
});
|
|
8598
|
-
var _weightedIndex = (weights) => _weightedIndexWith(random.random, weights);
|
|
8599
8945
|
|
|
8600
8946
|
// src/utils/tensor.js
|
|
8601
8947
|
var Tensor2 = class _Tensor {
|
|
@@ -9007,9 +9353,23 @@ var Tensor2 = class _Tensor {
|
|
|
9007
9353
|
throw Error(`Unsupported norm: ${p}`);
|
|
9008
9354
|
}
|
|
9009
9355
|
const this_data = this.data;
|
|
9010
|
-
const
|
|
9356
|
+
const is_bigint = this_data instanceof BigInt64Array || this_data instanceof BigUint64Array;
|
|
9357
|
+
if (is_bigint && p !== 1) {
|
|
9358
|
+
throw Error(`Expected a floating point tensor as input. Got ${this.type}`);
|
|
9359
|
+
}
|
|
9360
|
+
let fn, zero;
|
|
9361
|
+
if (is_bigint) {
|
|
9362
|
+
fn = (a, b) => a + b;
|
|
9363
|
+
zero = 0n;
|
|
9364
|
+
} else {
|
|
9365
|
+
fn = (a, b) => a + b ** p;
|
|
9366
|
+
zero = 0;
|
|
9367
|
+
}
|
|
9011
9368
|
if (dim === null) {
|
|
9012
|
-
|
|
9369
|
+
let val = this_data.reduce(fn, zero);
|
|
9370
|
+
if (p !== 1) {
|
|
9371
|
+
val = val ** (1 / p);
|
|
9372
|
+
}
|
|
9013
9373
|
return new _Tensor(this.type, [val], []);
|
|
9014
9374
|
}
|
|
9015
9375
|
const [type, result, resultDims] = reduce_helper(fn, this, dim, keepdim);
|
|
@@ -11469,9 +11829,11 @@ __export(processors_exports, {
|
|
|
11469
11829
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
11470
11830
|
Florence2Processor: () => Florence2Processor,
|
|
11471
11831
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
11832
|
+
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
11472
11833
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
11473
11834
|
Idefics3Processor: () => Idefics3Processor,
|
|
11474
11835
|
JinaCLIPProcessor: () => JinaCLIPProcessor,
|
|
11836
|
+
Lfm2VlProcessor: () => Lfm2VlProcessor,
|
|
11475
11837
|
LlavaProcessor: () => LlavaProcessor,
|
|
11476
11838
|
MgpstrProcessor: () => MgpstrProcessor,
|
|
11477
11839
|
MoonshineProcessor: () => MoonshineProcessor,
|
|
@@ -11492,6 +11854,7 @@ __export(processors_exports, {
|
|
|
11492
11854
|
UltravoxProcessor: () => UltravoxProcessor,
|
|
11493
11855
|
VLChatProcessor: () => VLChatProcessor,
|
|
11494
11856
|
VoxtralProcessor: () => VoxtralProcessor,
|
|
11857
|
+
VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
|
|
11495
11858
|
Wav2Vec2Processor: () => Wav2Vec2Processor,
|
|
11496
11859
|
Wav2Vec2ProcessorWithLM: () => Wav2Vec2ProcessorWithLM,
|
|
11497
11860
|
WhisperProcessor: () => WhisperProcessor
|
|
@@ -11546,12 +11909,14 @@ __export(feature_extractors_exports, {
|
|
|
11546
11909
|
EncodecFeatureExtractor: () => EncodecFeatureExtractor,
|
|
11547
11910
|
FeatureExtractor: () => FeatureExtractor,
|
|
11548
11911
|
Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
|
|
11912
|
+
GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
|
|
11549
11913
|
MoonshineFeatureExtractor: () => MoonshineFeatureExtractor,
|
|
11550
11914
|
ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
|
|
11551
11915
|
PyAnnoteFeatureExtractor: () => PyAnnoteFeatureExtractor,
|
|
11552
11916
|
SeamlessM4TFeatureExtractor: () => SeamlessM4TFeatureExtractor,
|
|
11553
11917
|
SnacFeatureExtractor: () => SnacFeatureExtractor,
|
|
11554
11918
|
SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
|
|
11919
|
+
VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
|
|
11555
11920
|
Wav2Vec2FeatureExtractor: () => Wav2Vec2FeatureExtractor,
|
|
11556
11921
|
WeSpeakerFeatureExtractor: () => WeSpeakerFeatureExtractor,
|
|
11557
11922
|
WhisperFeatureExtractor: () => WhisperFeatureExtractor
|
|
@@ -11786,6 +12151,7 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
11786
12151
|
mel_filters = null,
|
|
11787
12152
|
mel_floor = 1e-10,
|
|
11788
12153
|
log_mel = null,
|
|
12154
|
+
max_log_mel = null,
|
|
11789
12155
|
reference = 1,
|
|
11790
12156
|
min_value = 1e-10,
|
|
11791
12157
|
db_range = null,
|
|
@@ -11925,6 +12291,17 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
11925
12291
|
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
11926
12292
|
}
|
|
11927
12293
|
break;
|
|
12294
|
+
case "log10_max_norm": {
|
|
12295
|
+
for (let i = 0; i < o; ++i) {
|
|
12296
|
+
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
12297
|
+
}
|
|
12298
|
+
const logMax = max_log_mel ?? max(mel_spec_data)[0];
|
|
12299
|
+
const threshold = logMax - 8;
|
|
12300
|
+
for (let i = 0; i < o; ++i) {
|
|
12301
|
+
mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4) / 4;
|
|
12302
|
+
}
|
|
12303
|
+
break;
|
|
12304
|
+
}
|
|
11928
12305
|
case "dB":
|
|
11929
12306
|
if (power === 1) {
|
|
11930
12307
|
amplitude_to_db(mel_spec_data, reference, min_value, db_range);
|
|
@@ -11935,7 +12312,9 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
11935
12312
|
}
|
|
11936
12313
|
break;
|
|
11937
12314
|
default:
|
|
11938
|
-
throw new Error(
|
|
12315
|
+
throw new Error(
|
|
12316
|
+
`log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`
|
|
12317
|
+
);
|
|
11939
12318
|
}
|
|
11940
12319
|
}
|
|
11941
12320
|
return mel_spec;
|
|
@@ -12440,6 +12819,56 @@ var Gemma3nAudioFeatureExtractor = class extends FeatureExtractor {
|
|
|
12440
12819
|
}
|
|
12441
12820
|
};
|
|
12442
12821
|
|
|
12822
|
+
// src/models/granite_speech/feature_extraction_granite_speech.js
|
|
12823
|
+
var GraniteSpeechFeatureExtractor = class extends FeatureExtractor {
  /**
   * Precomputes the mel filter bank and the analysis window from
   * `config.melspec_kwargs` (`n_fft`, `win_length`, `n_mels`, `sample_rate`).
   * @param {Object} config The feature extractor configuration.
   */
  constructor(config) {
    super(config);
    const melspec = config.melspec_kwargs;
    const fft_size = melspec.n_fft;
    const window_length = melspec.win_length;
    const rate = melspec.sample_rate;

    const num_frequency_bins = Math.floor(1 + fft_size / 2);
    this.mel_filters = mel_filter_bank(
      num_frequency_bins,
      melspec.n_mels, // num_mel_filters
      0, // min_frequency
      rate / 2, // max_frequency (half the sample rate)
      rate, // sampling_rate
      null, // norm (none; the original comment cites the torchaudio default)
      "htk", // mel_scale (the original comment cites the torchaudio default)
    );

    // The Hann window has `win_length` samples; place it in the middle of a
    // zero-filled buffer of `n_fft` samples.
    const hann = window_function(window_length, "hann");
    const centered = new Float64Array(fft_size);
    centered.set(hann, Math.floor((fft_size - window_length) / 2));
    this.window = centered;
  }

  /**
   * Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @returns {Promise<{input_features: Tensor}>}
   */
  async _call(audio) {
    validate_audio_inputs(audio, "GraniteSpeechFeatureExtractor");
    const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;

    // Keep an even number of frames so that frames can be stacked in pairs below.
    const frame_count = 1 + Math.floor((audio.length - 1) / hop_length);
    const even_frame_count = frame_count - (frame_count % 2);

    const spectrogram_options = {
      power: 2,
      mel_filters: this.mel_filters,
      log_mel: "log10_max_norm",
      transpose: true, // [time, n_mels]
      max_num_frames: even_frame_count,
      do_pad: false,
    };
    const log_mel = await spectrogram(audio, this.window, n_fft, hop_length, spectrogram_options);

    // Merge each pair of consecutive frames into one row, then add a batch dimension.
    const stacked = log_mel.view(-1, 2 * n_mels);
    return { input_features: stacked.unsqueeze_(0) };
  }
};
|
|
12871
|
+
|
|
12443
12872
|
// src/models/moonshine/feature_extraction_moonshine.js
|
|
12444
12873
|
var MoonshineFeatureExtractor = class extends FeatureExtractor {
|
|
12445
12874
|
/**
|
|
@@ -12920,6 +13349,71 @@ var WeSpeakerFeatureExtractor = class extends FeatureExtractor {
|
|
|
12920
13349
|
}
|
|
12921
13350
|
};
|
|
12922
13351
|
|
|
13352
|
+
// src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js
|
|
13353
|
+
var VoxtralRealtimeFeatureExtractor = class extends FeatureExtractor {
  /**
   * Builds the mel filter bank (unless the config already provides one) and
   * the Hann analysis window.
   * @param {Object} config The feature extractor configuration.
   */
  constructor(config) {
    super(config);
    const cfg = this.config;
    if (cfg.mel_filters == null) {
      cfg.mel_filters = mel_filter_bank(
        Math.floor(1 + cfg.n_fft / 2), // num_frequency_bins
        cfg.feature_size, // num_mel_filters
        0, // min_frequency
        8e3, // max_frequency
        cfg.sampling_rate, // sampling_rate
        "slaney", // norm
        "slaney", // mel_scale
      );
    }
    this.window = window_function(cfg.n_fft, "hann");
  }

  /**
   * Computes the log-Mel spectrogram of the provided audio waveform.
   * @param {Float32Array|Float64Array} waveform The audio waveform to process.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform for STFT.
   * @returns {Promise<import('../../utils/tensor.js').Tensor>} The log-Mel spectrogram tensor of shape [num_mel_bins, num_frames].
   */
  async _extract_fbank_features(waveform, { center = true } = {}) {
    const { n_fft, hop_length, mel_filters, global_log_mel_max } = this.config;

    // Without centering, a full window must fit before the first hop.
    const usable_samples = center ? waveform.length : waveform.length - n_fft;
    const max_num_frames = Math.floor(usable_samples / hop_length);

    const spectrogram_options = {
      power: 2,
      mel_filters,
      log_mel: "log10_max_norm",
      max_log_mel: global_log_mel_max,
      center,
      max_num_frames,
      do_pad: false,
    };
    return await spectrogram(
      waveform,
      this.window,
      n_fft, // frame_length
      hop_length,
      spectrogram_options,
    );
  }

  /**
   * Extract mel spectrogram features from audio.
   * @param {Float32Array|Float64Array} audio The audio data.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform.
   * @returns {Promise<{ input_features: import('../../utils/tensor.js').Tensor }>}
   */
  async _call(audio, { center = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeFeatureExtractor");
    const log_mel = await this._extract_fbank_features(audio, { center });
    // Add a leading batch dimension (in place).
    const input_features = log_mel.unsqueeze_(0);
    return { input_features };
  }
};
|
|
13416
|
+
|
|
12923
13417
|
// src/models/whisper/feature_extraction_whisper.js
|
|
12924
13418
|
var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
12925
13419
|
constructor(config) {
|
|
@@ -12948,7 +13442,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
12948
13442
|
* @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
|
|
12949
13443
|
*/
|
|
12950
13444
|
async _extract_fbank_features(waveform) {
|
|
12951
|
-
|
|
13445
|
+
return await spectrogram(
|
|
12952
13446
|
waveform,
|
|
12953
13447
|
this.window,
|
|
12954
13448
|
// window
|
|
@@ -12959,7 +13453,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
12959
13453
|
{
|
|
12960
13454
|
power: 2,
|
|
12961
13455
|
mel_filters: this.config.mel_filters,
|
|
12962
|
-
log_mel: "
|
|
13456
|
+
log_mel: "log10_max_norm",
|
|
12963
13457
|
// Custom
|
|
12964
13458
|
max_num_frames: Math.min(
|
|
12965
13459
|
Math.floor(waveform.length / this.config.hop_length),
|
|
@@ -12968,15 +13462,6 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
12968
13462
|
)
|
|
12969
13463
|
}
|
|
12970
13464
|
);
|
|
12971
|
-
const data = features.data;
|
|
12972
|
-
const maxValue = max(
|
|
12973
|
-
/** @type {Float32Array} */
|
|
12974
|
-
data
|
|
12975
|
-
)[0];
|
|
12976
|
-
for (let i = 0; i < data.length; ++i) {
|
|
12977
|
-
data[i] = (Math.max(data[i], maxValue - 8) + 4) / 4;
|
|
12978
|
-
}
|
|
12979
|
-
return features;
|
|
12980
13465
|
}
|
|
12981
13466
|
/**
|
|
12982
13467
|
* Asynchronously extracts features from a given audio using the provided configuration.
|
|
@@ -13042,11 +13527,10 @@ var sharp_default = {};
|
|
|
13042
13527
|
var createCanvasFunction;
|
|
13043
13528
|
var ImageDataClass;
|
|
13044
13529
|
var loadImageFunction;
|
|
13045
|
-
|
|
13046
|
-
if (IS_BROWSER_OR_WEBWORKER) {
|
|
13530
|
+
if (apis.IS_WEB_ENV) {
|
|
13047
13531
|
createCanvasFunction = (width, height) => {
|
|
13048
13532
|
if (!self.OffscreenCanvas) {
|
|
13049
|
-
throw new Error("OffscreenCanvas not supported by this
|
|
13533
|
+
throw new Error("OffscreenCanvas not supported by this environment.");
|
|
13050
13534
|
}
|
|
13051
13535
|
return new self.OffscreenCanvas(width, height);
|
|
13052
13536
|
};
|
|
@@ -13136,7 +13620,7 @@ var RawImage = class _RawImage {
|
|
|
13136
13620
|
* @returns {RawImage} The image object.
|
|
13137
13621
|
*/
|
|
13138
13622
|
static fromCanvas(canvas) {
|
|
13139
|
-
if (!
|
|
13623
|
+
if (!apis.IS_WEB_ENV) {
|
|
13140
13624
|
throw new Error("fromCanvas() is only supported in browser environments.");
|
|
13141
13625
|
}
|
|
13142
13626
|
const ctx = (
|
|
@@ -13165,7 +13649,7 @@ var RawImage = class _RawImage {
|
|
|
13165
13649
|
* @returns {Promise<RawImage>} The image object.
|
|
13166
13650
|
*/
|
|
13167
13651
|
static async fromBlob(blob) {
|
|
13168
|
-
if (
|
|
13652
|
+
if (apis.IS_WEB_ENV) {
|
|
13169
13653
|
const img = await loadImageFunction(blob);
|
|
13170
13654
|
const ctx = createCanvasFunction(img.width, img.height).getContext("2d");
|
|
13171
13655
|
ctx.drawImage(img, 0, 0);
|
|
@@ -13346,7 +13830,7 @@ var RawImage = class _RawImage {
|
|
|
13346
13830
|
} else if (nullish_height) {
|
|
13347
13831
|
height = width / this.width * this.height;
|
|
13348
13832
|
}
|
|
13349
|
-
if (
|
|
13833
|
+
if (apis.IS_WEB_ENV) {
|
|
13350
13834
|
const numChannels = this.channels;
|
|
13351
13835
|
const canvas = this.toCanvas();
|
|
13352
13836
|
const ctx = createCanvasFunction(width, height).getContext("2d");
|
|
@@ -13394,7 +13878,7 @@ var RawImage = class _RawImage {
|
|
|
13394
13878
|
if (left === 0 && right === 0 && top === 0 && bottom === 0) {
|
|
13395
13879
|
return this;
|
|
13396
13880
|
}
|
|
13397
|
-
if (
|
|
13881
|
+
if (apis.IS_WEB_ENV) {
|
|
13398
13882
|
const numChannels = this.channels;
|
|
13399
13883
|
const canvas = this.toCanvas();
|
|
13400
13884
|
const newWidth = this.width + left + right;
|
|
@@ -13418,7 +13902,7 @@ var RawImage = class _RawImage {
|
|
|
13418
13902
|
}
|
|
13419
13903
|
const crop_width = x_max - x_min + 1;
|
|
13420
13904
|
const crop_height = y_max - y_min + 1;
|
|
13421
|
-
if (
|
|
13905
|
+
if (apis.IS_WEB_ENV) {
|
|
13422
13906
|
const numChannels = this.channels;
|
|
13423
13907
|
const canvas = this.toCanvas();
|
|
13424
13908
|
const ctx = createCanvasFunction(crop_width, crop_height).getContext("2d");
|
|
@@ -13446,7 +13930,7 @@ var RawImage = class _RawImage {
|
|
|
13446
13930
|
}
|
|
13447
13931
|
const width_offset = (this.width - crop_width) / 2;
|
|
13448
13932
|
const height_offset = (this.height - crop_height) / 2;
|
|
13449
|
-
if (
|
|
13933
|
+
if (apis.IS_WEB_ENV) {
|
|
13450
13934
|
const numChannels = this.channels;
|
|
13451
13935
|
const canvas = this.toCanvas();
|
|
13452
13936
|
const ctx = createCanvasFunction(crop_width, crop_height).getContext("2d");
|
|
@@ -13524,7 +14008,7 @@ var RawImage = class _RawImage {
|
|
|
13524
14008
|
}
|
|
13525
14009
|
}
|
|
13526
14010
|
async toBlob(type = "image/png", quality = 1) {
|
|
13527
|
-
if (!
|
|
14011
|
+
if (!apis.IS_WEB_ENV) {
|
|
13528
14012
|
throw new Error("toBlob() is only supported in browser environments.");
|
|
13529
14013
|
}
|
|
13530
14014
|
const canvas = this.toCanvas();
|
|
@@ -13541,7 +14025,7 @@ var RawImage = class _RawImage {
|
|
|
13541
14025
|
return tensor;
|
|
13542
14026
|
}
|
|
13543
14027
|
toCanvas() {
|
|
13544
|
-
if (!
|
|
14028
|
+
if (!apis.IS_WEB_ENV) {
|
|
13545
14029
|
throw new Error("toCanvas() is only supported in browser environments.");
|
|
13546
14030
|
}
|
|
13547
14031
|
const cloned = this.clone().rgba();
|
|
@@ -13625,7 +14109,7 @@ var RawImage = class _RawImage {
|
|
|
13625
14109
|
* @returns {Promise<void>}
|
|
13626
14110
|
*/
|
|
13627
14111
|
async save(path) {
|
|
13628
|
-
if (
|
|
14112
|
+
if (apis.IS_WEB_ENV) {
|
|
13629
14113
|
if (apis.IS_WEBWORKER_ENV) {
|
|
13630
14114
|
throw new Error("Unable to save an image from a Web Worker.");
|
|
13631
14115
|
}
|
|
@@ -13645,7 +14129,7 @@ var RawImage = class _RawImage {
|
|
|
13645
14129
|
* @returns {import('sharp').Sharp} The Sharp instance.
|
|
13646
14130
|
*/
|
|
13647
14131
|
toSharp() {
|
|
13648
|
-
if (
|
|
14132
|
+
if (apis.IS_WEB_ENV) {
|
|
13649
14133
|
throw new Error("toSharp() is only supported in server-side environments.");
|
|
13650
14134
|
}
|
|
13651
14135
|
return sharp_default(this.data, {
|
|
@@ -13858,6 +14342,27 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
13858
14342
|
}
|
|
13859
14343
|
return [segmentation, segments];
|
|
13860
14344
|
}
|
|
14345
|
+
/**
 * Computes a target size whose sides are multiples of `factor` and whose
 * area is steered towards the range [`min_pixels`, `max_pixels`], while
 * approximately preserving the aspect ratio.
 *
 * @param {number} height The input height.
 * @param {number} width The input width.
 * @param {number} [factor=28] Both returned sides are multiples of this value.
 * @param {number} [min_pixels=3136] Lower bound on the target area.
 * @param {number} [max_pixels=1003520] Upper bound on the target area.
 * @returns {[number, number]} The resized `[height, width]`.
 * @throws {Error} If a side is smaller than `factor`, or the aspect ratio exceeds 200.
 */
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
  if (height < factor || width < factor) {
    throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
  }
  const aspect_ratio = Math.max(height, width) / Math.min(height, width);
  if (aspect_ratio > 200) {
    throw new Error(`absolute aspect ratio must be smaller than 200, got ${aspect_ratio}`);
  }

  let h_bar = Math.round(height / factor) * factor;
  let w_bar = Math.round(width / factor) * factor;

  if (h_bar * w_bar > max_pixels) {
    // Too large: shrink both sides by the same ratio, rounding down.
    // Clamp to `factor` so that a very elongated image can never end up
    // with a zero-sized side (the floor may otherwise reach 0).
    const beta = Math.sqrt((height * width) / max_pixels);
    h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
    w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
  } else if (h_bar * w_bar < min_pixels) {
    // Too small: grow both sides by the same ratio, rounding up.
    const beta = Math.sqrt(min_pixels / (height * width));
    h_bar = Math.ceil((height * beta) / factor) * factor;
    w_bar = Math.ceil((width * beta) / factor) * factor;
  }
  return [h_bar, w_bar];
}
|
|
13861
14366
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
13862
14367
|
if (label_ids_to_fuse === null) {
|
|
13863
14368
|
logger.warn("`label_ids_to_fuse` unset. No instance will be fused.");
|
|
@@ -14146,7 +14651,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
14146
14651
|
});
|
|
14147
14652
|
}
|
|
14148
14653
|
/**
|
|
14149
|
-
* @typedef {
|
|
14654
|
+
* @typedef {Object} PreprocessedImage
|
|
14150
14655
|
* @property {HeightWidth} original_size The original size of the image.
|
|
14151
14656
|
* @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
|
|
14152
14657
|
* @property {Tensor} pixel_values The pixel values of the preprocessed image.
|
|
@@ -14324,6 +14829,7 @@ __export(image_processors_exports, {
|
|
|
14324
14829
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
14325
14830
|
ImageProcessor: () => ImageProcessor,
|
|
14326
14831
|
JinaCLIPImageProcessor: () => JinaCLIPImageProcessor,
|
|
14832
|
+
Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
|
|
14327
14833
|
LlavaOnevisionImageProcessor: () => LlavaOnevisionImageProcessor,
|
|
14328
14834
|
Mask2FormerImageProcessor: () => Mask2FormerImageProcessor,
|
|
14329
14835
|
MaskFormerFeatureExtractor: () => MaskFormerFeatureExtractor,
|
|
@@ -14727,6 +15233,237 @@ var JinaCLIPImageProcessor = class extends ImageProcessor {
|
|
|
14727
15233
|
}
|
|
14728
15234
|
};
|
|
14729
15235
|
|
|
15236
|
+
// src/models/lfm2_vl/image_processing_lfm2_vl.js
|
|
15237
|
+
/**
 * Rounds `number` to the nearest multiple of `factor` (using `Math.round`).
 * @param {number} number The value to round.
 * @param {number} factor The step the result must be a multiple of.
 * @returns {number} The multiple of `factor` closest to `number`.
 */
function round_by_factor(number, factor) {
  const steps = Math.round(number / factor);
  return steps * factor;
}
|
|
15240
|
+
/**
 * Picks, among `target_ratios`, the `[w, h]` grid whose `w / h` is closest to `aspect_ratio`.
 *
 * On an exact tie with the current best, the later candidate replaces it
 * only if the image area exceeds half of the area covered by that
 * candidate's tiles (`image_size * image_size * w * h`).
 *
 * @param {number} aspect_ratio The aspect ratio (width / height) to approximate.
 * @param {number[][]} target_ratios Candidate `[w, h]` grids, in priority order.
 * @param {number} width The image width.
 * @param {number} height The image height.
 * @param {number} image_size The side length of a single tile.
 * @returns {number[]} The selected `[w, h]` grid (`[1, 1]` if there are no candidates).
 */
function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
  const image_area = width * height;
  const half_tile_area = 0.5 * image_size * image_size;

  let winner = [1, 1];
  let smallest_gap = Infinity;
  for (const candidate of target_ratios) {
    const gap = Math.abs(aspect_ratio - candidate[0] / candidate[1]);
    if (gap < smallest_gap) {
      smallest_gap = gap;
      winner = candidate;
      continue;
    }
    if (gap === smallest_gap && image_area > half_tile_area * candidate[0] * candidate[1]) {
      winner = candidate;
    }
  }
  return winner;
}
|
|
15255
|
+
/**
 * Enumerates every `[w, h]` grid whose number of tiles (`w * h`) lies in
 * [`min_tiles`, `max_tiles`], sorted by ascending tile count.
 *
 * @param {number} min_tiles Minimum number of tiles.
 * @param {number} max_tiles Maximum number of tiles.
 * @returns {number[][]} The unique `[w, h]` grids. Grids with the same tile
 * count keep their discovery order.
 */
function get_target_ratios(min_tiles, max_tiles) {
  const visited = /* @__PURE__ */ new Set();
  const grids = [];

  for (let limit = min_tiles; limit <= max_tiles; ++limit) {
    for (let w = 1; w <= limit; ++w) {
      for (let h = 1; h <= limit; ++h) {
        const tile_count = w * h;
        if (tile_count < min_tiles || tile_count > max_tiles) continue;

        // The same grid is reachable for several values of `limit`; keep the first.
        const id = `${w}x${h}`;
        if (visited.has(id)) continue;
        visited.add(id);
        grids.push([w, h]);
      }
    }
  }

  grids.sort((left, right) => left[0] * left[1] - right[0] * right[1]);
  return grids;
}
|
|
15274
|
+
/**
 * Splits a batch of images into non-overlapping square patches.
 *
 * The input is read as `[batch, channels, height, width]` (from `images.dims`).
 * Each patch is flattened row by row, pixel by pixel, with the channel
 * values of a pixel stored next to each other. Rows/columns that do not
 * fill a whole patch are dropped.
 *
 * @param {Tensor} images The images to split.
 * @param {number} patch_size The side length of a patch, in pixels.
 * @returns {Tensor} A float32 tensor of dims `[batch, num_patches, patch_size * patch_size * channels]`.
 */
function convert_image_to_patches(images, patch_size) {
  const [batch, channels, height, width] = images.dims;
  const grid_rows = Math.floor(height / patch_size);
  const grid_cols = Math.floor(width / patch_size);
  const patches_per_image = grid_rows * grid_cols;
  const patch_dim = patch_size * patch_size * channels;
  const plane_size = height * width;

  const source = /** @type {Float32Array} */ (images.data);
  const output = new Float32Array(batch * patches_per_image * patch_dim);

  // Patches are produced in exactly the order they are stored, so a single
  // running write position is enough.
  let cursor = 0;
  for (let b = 0; b < batch; ++b) {
    const image_start = b * channels * plane_size;
    for (let patch_row = 0; patch_row < grid_rows; ++patch_row) {
      for (let patch_col = 0; patch_col < grid_cols; ++patch_col) {
        const left = patch_col * patch_size;
        for (let dy = 0; dy < patch_size; ++dy) {
          const row_start = (patch_row * patch_size + dy) * width + left;
          for (let dx = 0; dx < patch_size; ++dx) {
            const pixel = image_start + row_start + dx;
            for (let c = 0; c < channels; ++c) {
              output[cursor++] = source[pixel + c * plane_size];
            }
          }
        }
      }
    }
  }
  return new Tensor2("float32", output, [batch, patches_per_image, patch_dim]);
}
|
|
15304
|
+
/**
 * Zero-pads a `[1, num_patches, dim]` patch tensor to `target_length`
 * patches and builds the matching attention mask.
 *
 * @param {Tensor} patches The patches to pad; the patch count and the patch size are read from `dims[1]` and `dims[2]`.
 * @param {number} target_length The number of patches after padding.
 * @returns {{ padded: Tensor, mask: Tensor }} `padded` is the input itself when no
 * padding is needed, otherwise a new float32 tensor of dims `[1, target_length, dim]`.
 * `mask` is an int64 tensor of dims `[target_length]`, 1 for real patches and 0 for padding.
 */
function pad_along_first_dim(patches, target_length) {
  const [, num_patches, patch_dim] = patches.dims;

  const mask_values = new BigInt64Array(target_length).fill(1n, 0, num_patches);

  let padded = patches;
  if (num_patches < target_length) {
    // A fresh Float32Array is zero-filled, so only the real patches are copied.
    const buffer = new Float32Array(target_length * patch_dim);
    buffer.set(/** @type {Float32Array} */ (patches.data));
    padded = new Tensor2("float32", buffer, [1, target_length, patch_dim]);
  }

  const mask = new Tensor2("int64", mask_values, [target_length]);
  return { padded, mask };
}
|
|
15319
|
+
var Lfm2VlImageProcessor = class extends ImageProcessor {
  /**
   * Reads the tiling/patching options from `config`, using the defaults
   * below for options that are missing (nullish).
   * @param {Object} config The image processor configuration.
   */
  constructor(config) {
    super(config);
    this.downsample_factor = config.downsample_factor ?? 2;
    this.do_image_splitting = config.do_image_splitting ?? true;
    this.min_tiles = config.min_tiles ?? 2;
    this.max_tiles = config.max_tiles ?? 10;
    this.use_thumbnail = config.use_thumbnail ?? true;
    this.min_image_tokens = config.min_image_tokens ?? 64;
    this.max_image_tokens = config.max_image_tokens ?? 256;
    this.encoder_patch_size = config.encoder_patch_size ?? config.patch_size ?? 16;
    this.tile_size = config.tile_size ?? 512;
    this.max_pixels_tolerance = config.max_pixels_tolerance ?? 2;
    this.return_row_col_info = config.return_row_col_info ?? false;

    // Every tile/thumbnail is padded to the largest patch count that can occur.
    const thumbnail_patches = this.max_image_tokens * this.downsample_factor ** 2;
    const patches_per_tile = this.do_image_splitting ? (this.tile_size / this.encoder_patch_size) ** 2 : 0;
    this.max_num_patches = Math.max(thumbnail_patches, patches_per_tile);
  }

  /**
   * Check if the image is too large to be processed as a single tile.
   * @param {number} height
   * @param {number} width
   * @returns {boolean}
   */
  _is_image_too_large(height, width) {
    const total_factor = this.encoder_patch_size * this.downsample_factor;
    const [h_bar, w_bar] = [height, width].map((side) =>
      Math.max(this.encoder_patch_size, round_by_factor(side, total_factor)),
    );
    const pixel_budget = this.max_image_tokens * total_factor ** 2 * this.max_pixels_tolerance;
    return h_bar * w_bar > pixel_budget;
  }

  /**
   * Get the grid layout for tiling a large image.
   * @param {number} height
   * @param {number} width
   * @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
   */
  _get_grid_layout(height, width) {
    const candidates = get_target_ratios(this.min_tiles, this.max_tiles);
    const [grid_width, grid_height] = find_closest_aspect_ratio(
      width / height,
      candidates,
      width,
      height,
      this.tile_size,
    );
    const target_width = this.tile_size * grid_width;
    const target_height = this.tile_size * grid_height;
    return { grid_width, grid_height, target_width, target_height };
  }

  /** @param {RawImage|RawImage[]|RawImage[][]} images */
  // @ts-expect-error
  async _call(images, { return_row_col_info = null } = {}) {
    // Normalize a single image or a flat list of images to a list of batches.
    const batches = !Array.isArray(images)
      ? [[images]]
      : Array.isArray(images[0])
        ? /** @type {RawImage[][]} */ (images)
        : [/** @type {RawImage[]} */ (images)];

    const total_factor = this.encoder_patch_size * this.downsample_factor;
    const factor_area = total_factor ** 2;
    // Splitting is pointless when exactly one tile is allowed.
    const splitting_enabled = this.do_image_splitting && (this.min_tiles !== 1 || this.max_tiles !== 1);

    const patch_tensors = [];
    const patch_masks = [];
    const patch_grid_shapes = [];
    const rows_per_image = [];
    const cols_per_image = [];
    const sizes_per_image = [];

    for (const batch of batches) {
      const preprocessed = await Promise.all(batch.map((image) => this.preprocess(image, { do_pad: false })));

      for (const { pixel_values } of preprocessed) {
        const [, height, width] = pixel_values.dims;
        const image4d = pixel_values.unsqueeze_(0);

        // Size of the single-tile / thumbnail version of the image.
        const [new_height, new_width] = smart_resize(
          Math.max(total_factor, height),
          Math.max(total_factor, width),
          total_factor,
          this.min_image_tokens * factor_area,
          this.max_image_tokens * factor_area,
        ).map((side) => Math.max(total_factor, side));

        const tiles = [];
        let num_rows = 1;
        let num_cols = 1;

        if (this._is_image_too_large(height, width) && splitting_enabled) {
          const layout = this._get_grid_layout(height, width);
          num_rows = layout.grid_height;
          num_cols = layout.grid_width;

          const resized = await interpolate_4d(image4d, {
            size: [layout.target_height, layout.target_width],
          });

          // Cut the resized image into tiles, row by row.
          for (let row = 0; row < num_rows; ++row) {
            const top = row * this.tile_size;
            for (let col = 0; col < num_cols; ++col) {
              const left = col * this.tile_size;
              tiles.push(resized.slice(null, null, [top, top + this.tile_size], [left, left + this.tile_size]));
            }
          }

          // The downscaled full image goes after the tiles.
          if (this.use_thumbnail && num_cols * num_rows !== 1) {
            tiles.push(await interpolate_4d(image4d, { size: [new_height, new_width] }));
          }
        } else {
          tiles.push(await interpolate_4d(image4d, { size: [new_height, new_width] }));
        }

        for (const tile of tiles) {
          const [, , tile_height, tile_width] = tile.dims;
          const patches = convert_image_to_patches(tile, this.encoder_patch_size);
          const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
          patch_tensors.push(padded);
          patch_masks.push(mask);
          patch_grid_shapes.push([
            Math.floor(tile_height / this.encoder_patch_size),
            Math.floor(tile_width / this.encoder_patch_size),
          ]);
        }

        rows_per_image.push(num_rows);
        cols_per_image.push(num_cols);
        sizes_per_image.push([new_height, new_width]);
      }
    }

    const spatial_shapes = new Tensor2("int64", BigInt64Array.from(patch_grid_shapes.flat(), BigInt), [
      patch_grid_shapes.length,
      2,
    ]);
    const result = {
      pixel_values: cat(patch_tensors, 0),
      pixel_attention_mask: stack(patch_masks, 0),
      spatial_shapes,
    };

    // The per-call option wins; when it is nullish, fall back to the configured value.
    if (return_row_col_info ?? this.return_row_col_info) {
      result.image_rows = rows_per_image;
      result.image_cols = cols_per_image;
      result.image_sizes = sizes_per_image;
    }
    return result;
  }
};
|
|
15466
|
+
|
|
14730
15467
|
// src/models/llava_onevision/image_processing_llava_onevision.js
|
|
14731
15468
|
/**
 * Image processor registered under the LLaVA-OneVision name.
 * The class body is empty: every behavior is inherited unchanged from `ImageProcessor`.
 */
var LlavaOnevisionImageProcessor = class extends ImageProcessor {
};
|
|
@@ -14950,27 +15687,6 @@ var PvtImageProcessor = class extends ImageProcessor {
|
|
|
14950
15687
|
};
|
|
14951
15688
|
|
|
14952
15689
|
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
14953
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
14954
|
-
if (height < factor || width < factor) {
|
|
14955
|
-
throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
|
|
14956
|
-
} else if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
14957
|
-
throw new Error(
|
|
14958
|
-
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
14959
|
-
);
|
|
14960
|
-
}
|
|
14961
|
-
let h_bar = Math.round(height / factor) * factor;
|
|
14962
|
-
let w_bar = Math.round(width / factor) * factor;
|
|
14963
|
-
if (h_bar * w_bar > max_pixels) {
|
|
14964
|
-
const beta = Math.sqrt(height * width / max_pixels);
|
|
14965
|
-
h_bar = Math.floor(height / beta / factor) * factor;
|
|
14966
|
-
w_bar = Math.floor(width / beta / factor) * factor;
|
|
14967
|
-
} else if (h_bar * w_bar < min_pixels) {
|
|
14968
|
-
const beta = Math.sqrt(min_pixels / (height * width));
|
|
14969
|
-
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
14970
|
-
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
14971
|
-
}
|
|
14972
|
-
return [h_bar, w_bar];
|
|
14973
|
-
}
|
|
14974
15690
|
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
14975
15691
|
constructor(config) {
|
|
14976
15692
|
super(config);
|
|
@@ -15572,6 +16288,57 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
15572
16288
|
}
|
|
15573
16289
|
};
|
|
15574
16290
|
|
|
16291
|
+
// src/models/granite_speech/processing_granite_speech.js
|
|
16292
|
+
/**
 * Processor that pairs a tokenizer with an audio feature extractor.
 * When audio is supplied, every audio placeholder in the prompt is expanded to
 * one placeholder per projected audio token before tokenization.
 */
var GraniteSpeechProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = true;

  /**
   * Compute the number of audio tokens for a given raw audio length.
   * @param {number} audioLength Raw audio sample count.
   * @returns {number} Number of projector output tokens.
   */
  _get_num_audio_features(audioLength) {
    const fe_config = this.feature_extractor.config;
    const hop = fe_config.melspec_kwargs.hop_length;
    const window_size = fe_config.projector_window_size;
    const tokens_per_block = Math.floor(window_size / fe_config.projector_downsample_rate);

    // One mel frame per hop (plus one); the encoder length is half of that, floored.
    const num_mel_frames = Math.floor(audioLength / hop) + 1;
    const num_encoder_frames = Math.floor(num_mel_frames / 2);

    // The projector works on whole windows, so a partial window still counts as a full block.
    return Math.ceil(num_encoder_frames / window_size) * tokens_per_block;
  }

  /**
   * @param {string} text The text input to process.
   * @param {Float32Array} audio The audio input to process.
   * @param {Record<string, any>} [kwargs] Extra options forwarded to the tokenizer.
   * @returns {Promise<Record<string, any>>} Tokenizer outputs merged with
   * `input_features` / `input_features_mask` when audio was given.
   */
  async _call(text, audio = null, kwargs = {}) {
    if (Array.isArray(text)) {
      throw new Error("Batched inputs are not supported yet.");
    }

    const audio_inputs = {};
    let prompt = text;

    if (audio) {
      const features = await this.feature_extractor(audio);
      audio_inputs.input_features = features.input_features;

      // All-ones mask with one entry per projected audio token.
      const num_audio_tokens = this._get_num_audio_features(audio.length);
      audio_inputs.input_features_mask = new Tensor2(
        "bool",
        new Uint8Array(num_audio_tokens).fill(1),
        [1, num_audio_tokens],
      );

      const placeholder = this.config.audio_token ?? "<|audio|>";
      if (!prompt.includes(placeholder)) {
        throw new Error(`The input text does not contain the audio token ${placeholder}.`);
      }
      prompt = prompt.replaceAll(placeholder, placeholder.repeat(num_audio_tokens));
    }

    const text_inputs = this.tokenizer(prompt, {
      add_special_tokens: false,
      ...kwargs,
    });

    return { ...text_inputs, ...audio_inputs };
  }
};
|
|
16341
|
+
|
|
15575
16342
|
// src/models/grounding_dino/processing_grounding_dino.js
|
|
15576
16343
|
function get_phrases_from_posmap(posmaps, input_ids) {
|
|
15577
16344
|
const left_idx = 0;
|
|
@@ -15848,6 +16615,66 @@ var JinaCLIPProcessor = class extends Processor {
|
|
|
15848
16615
|
}
|
|
15849
16616
|
};
|
|
15850
16617
|
|
|
16618
|
+
// src/models/lfm2_vl/processing_lfm2_vl.js
|
|
16619
|
+
/**
 * Processor that combines an image processor with a tokenizer.
 * When text is given, every image placeholder in the prompt is replaced by the
 * full run of image tokens for the corresponding processed image (per-tile
 * position markers, optional thumbnail), wrapped in start/end markers, and the
 * expanded prompt is then tokenized.
 */
var Lfm2VlProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static image_processor_class = AutoImageProcessor;

  /**
   * @param {RawImage|RawImage[]} images
   * @param {string|string[]|null} [text]
   * @param {Record<string, any>} [kwargs] Forwarded to both the image processor and the tokenizer.
   * @returns {Promise<Record<string, any>>} Image processor outputs (without the
   * row/col/size bookkeeping) merged with the tokenizer outputs when text was given.
   * @throws {Error} If the text contains more image placeholders than images were processed.
   */
  async _call(images, text = null, kwargs = {}) {
    // Row/col/size info is always requested: it is needed to expand the placeholders below.
    const { image_rows, image_cols, image_sizes, ...image_inputs } = await this.image_processor(images, {
      ...kwargs,
      return_row_col_info: true,
    });

    let prompts = text;
    if (text) {
      const image_token = this.config.image_token ?? "<image>";
      const {
        tile_size = 512,
        downsample_factor = 2,
        encoder_patch_size = 16,
        use_thumbnail = true,
      } = /** @type {Record<string, any>} */ (this.image_processor.config);

      // Tokens along one side: patches per side, then reduced by the downsample factor (rounded up).
      const ds = (s) => Math.ceil(Math.floor(s / encoder_patch_size) / downsample_factor);
      const tokens_per_tile = ds(tile_size) ** 2;

      const image_start = this.config.image_start_token ?? "<|image_start|>";
      const image_end = this.config.image_end_token ?? "<|image_end|>";
      const thumbnail_token = this.config.image_thumbnail ?? "<|img_thumbnail|>";

      /** Build the replacement string for the `idx`-th processed image. */
      const expand_image = (idx) => {
        if (idx >= image_sizes.length) {
          // Without this guard the destructuring below fails with an opaque TypeError.
          throw new Error(
            `The text contains more "${image_token}" placeholders (at least ${idx + 1}) ` +
              `than images were provided (${image_sizes.length}).`,
          );
        }
        const [h, w] = image_sizes[idx];
        const rows = image_rows[idx];
        const cols = image_cols[idx];
        const tokens_for_image = ds(h) * ds(w);

        let expanded = image_start;
        if (rows > 1 || cols > 1) {
          // Tiled image: one positional marker + fixed-size token run per tile, row-major.
          const tile_str = image_token.repeat(tokens_per_tile);
          for (let r = 0; r < rows; ++r) {
            for (let c = 0; c < cols; ++c) {
              expanded += `<|img_row_${r + 1}_col_${c + 1}|>${tile_str}`;
            }
          }
          if (use_thumbnail) {
            expanded += thumbnail_token + image_token.repeat(tokens_for_image);
          }
        } else {
          expanded += image_token.repeat(tokens_for_image);
        }
        return expanded + image_end;
      };

      const samples = Array.isArray(text) ? text : [text];
      // Images are consumed in order across ALL samples, not per sample.
      let image_idx = 0;
      prompts = samples.map((sample) => {
        const [head, ...rest] = sample.split(image_token);
        return head + rest.map((part) => expand_image(image_idx++) + part).join("");
      });
    }

    return {
      ...image_inputs,
      ...(prompts ? this.tokenizer(prompts, kwargs) : {}),
    };
  }
};
|
|
16677
|
+
|
|
15851
16678
|
// src/models/llava/processing_llava.js
|
|
15852
16679
|
var LlavaProcessor = class extends Processor {
|
|
15853
16680
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -16367,16 +17194,104 @@ var VoxtralProcessor = class extends Processor {
|
|
|
16367
17194
|
}
|
|
16368
17195
|
new_text += text_parts[i + 1];
|
|
16369
17196
|
}
|
|
16370
|
-
text = new_text;
|
|
17197
|
+
text = new_text;
|
|
17198
|
+
}
|
|
17199
|
+
const text_inputs = this.tokenizer(text, {
|
|
17200
|
+
add_special_tokens: false,
|
|
17201
|
+
...kwargs
|
|
17202
|
+
});
|
|
17203
|
+
return {
|
|
17204
|
+
...text_inputs,
|
|
17205
|
+
...audio_inputs
|
|
17206
|
+
};
|
|
17207
|
+
}
|
|
17208
|
+
};
|
|
17209
|
+
|
|
17210
|
+
// src/models/voxtral_realtime/processing_voxtral_realtime.js
|
|
17211
|
+
// Tokens' worth of silence prepended to the first chunk in streaming mode.
var NUM_LEFT_PAD_TOKENS = 32;
// Delay (in tokens) used when sizing the first chunk, the initial input_ids and the offline right padding.
var NUM_DELAY_TOKENS = 6;
// Mel frames per text token.
var AUDIO_LENGTH_PER_TOK = 8;
// Extra tokens of right padding added in non-streaming mode.
var OFFLINE_STREAMING_BUFFER_TOKENS = 10;
// Token id used to fill the initial input_ids in streaming mode.
var STREAMING_PAD_TOKEN_ID = 32;

var VoxtralRealtimeProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = false;

  /** Number of mel frames in the first audio chunk. */
  get num_mel_frames_first_audio_chunk() {
    return (NUM_DELAY_TOKENS + 1) * AUDIO_LENGTH_PER_TOK;
  }

  /** Number of raw audio samples in the first audio chunk. */
  get num_samples_first_audio_chunk() {
    const fe_config = this.feature_extractor.config;
    const half_window = Math.floor(fe_config.n_fft / 2);
    return (this.num_mel_frames_first_audio_chunk - 1) * fe_config.hop_length + half_window;
  }

  /** Number of raw audio samples per subsequent audio chunk. */
  get num_samples_per_audio_chunk() {
    const fe_config = this.feature_extractor.config;
    return AUDIO_LENGTH_PER_TOK * fe_config.hop_length + fe_config.n_fft;
  }

  /** Number of right-pad tokens for non-streaming mode. */
  get num_right_pad_tokens() {
    return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
  }

  /** Number of mel frames per text token. */
  get audio_length_per_tok() {
    return AUDIO_LENGTH_PER_TOK;
  }

  /** Number of raw audio samples per token. */
  get raw_audio_length_per_tok() {
    return AUDIO_LENGTH_PER_TOK * this.feature_extractor.config.hop_length;
  }

  /**
   * Process audio input for VoxtralRealtime.
   *
   * - Streaming, first chunk: the audio is left-padded with silence, features are
   *   extracted with `center=true`, and `{ input_ids, input_features }` is returned.
   * - Streaming, later chunks: the chunk is processed as-is with `center=false`
   *   and only the feature extractor output is returned.
   * - Non-streaming: the audio is right-padded so the model transcribes the full
   *   audio, then processed with `center=true`; only the feature extractor output
   *   is returned.
   *
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @param {Object} [options]
   * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
   * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
   * @returns {Promise<Object>}
   */
  async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeProcessor");

    if (!is_first_audio_chunk) {
      if (!is_streaming) {
        throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");
      }
      // Follow-up streaming chunk: no padding, no input_ids.
      return await this.feature_extractor(audio, { center: false });
    }

    if (!is_streaming) {
      // Offline: append trailing zeros (a fresh Float32Array is zero-filled).
      const tail_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
      const waveform = new Float32Array(audio.length + tail_samples);
      waveform.set(audio);
      return await this.feature_extractor(waveform, { center: true });
    }

    // First streaming chunk: prepend zeros, then build the initial token ids.
    const lead_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
    const waveform = new Float32Array(lead_samples + audio.length);
    waveform.set(audio, lead_samples);
    const audio_encoding = await this.feature_extractor(waveform, { center: true });

    const num_input_tokens = 1 + (NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS);
    const ids = new BigInt64Array(num_input_tokens).fill(BigInt(STREAMING_PAD_TOKEN_ID));
    ids[0] = 1n; // NOTE(review): presumably the BOS token id; confirm against the tokenizer.

    return {
      input_ids: new Tensor2("int64", ids, [1, num_input_tokens]),
      ...audio_encoding,
    };
  }
};
|
|
16382
17297
|
|
|
@@ -16479,14 +17394,18 @@ function getNormalizedConfig(config) {
|
|
|
16479
17394
|
case "florence2":
|
|
16480
17395
|
case "llava_onevision":
|
|
16481
17396
|
case "idefics3":
|
|
17397
|
+
case "granite_speech":
|
|
16482
17398
|
case "ultravox":
|
|
16483
17399
|
case "voxtral":
|
|
17400
|
+
case "voxtral_realtime":
|
|
16484
17401
|
case "smolvlm":
|
|
16485
17402
|
case "gemma3n":
|
|
17403
|
+
case "lfm2_vl":
|
|
16486
17404
|
case "chatterbox":
|
|
16487
17405
|
case "mistral3":
|
|
16488
17406
|
case "qwen2_5_vl":
|
|
16489
17407
|
case "qwen3_vl":
|
|
17408
|
+
case "qwen3_vl_moe":
|
|
16490
17409
|
init_normalized_config = getNormalizedConfig(config.text_config);
|
|
16491
17410
|
break;
|
|
16492
17411
|
case "moondream1":
|
|
@@ -16536,11 +17455,17 @@ function getNormalizedConfig(config) {
|
|
|
16536
17455
|
case "cohere":
|
|
16537
17456
|
case "cohere2":
|
|
16538
17457
|
case "mistral":
|
|
17458
|
+
case "voxtral_realtime_text":
|
|
17459
|
+
case "voxtral_realtime_encoder":
|
|
16539
17460
|
case "starcoder2":
|
|
16540
17461
|
case "qwen2":
|
|
17462
|
+
case "qwen2_moe":
|
|
16541
17463
|
case "qwen2_vl":
|
|
17464
|
+
case "qwen2_vl_text":
|
|
16542
17465
|
case "qwen2_5_vl_text":
|
|
17466
|
+
case "qwen3_moe":
|
|
16543
17467
|
case "qwen3_vl_text":
|
|
17468
|
+
case "qwen3_vl_moe_text":
|
|
16544
17469
|
case "phi":
|
|
16545
17470
|
case "phi3":
|
|
16546
17471
|
case "phi3_v":
|
|
@@ -16681,6 +17606,9 @@ function getNormalizedConfig(config) {
|
|
|
16681
17606
|
return normalized_config;
|
|
16682
17607
|
}
|
|
16683
17608
|
function getCacheShapes(config, options) {
|
|
17609
|
+
if (!(config instanceof PretrainedConfig)) {
|
|
17610
|
+
config = new PretrainedConfig(config);
|
|
17611
|
+
}
|
|
16684
17612
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
16685
17613
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
16686
17614
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -16739,7 +17667,7 @@ function getCacheShapes(config, options) {
|
|
|
16739
17667
|
}
|
|
16740
17668
|
}
|
|
16741
17669
|
return cache_values;
|
|
16742
|
-
} else if (["
|
|
17670
|
+
} else if (["qwen3_next", "qwen3_5_text", "qwen3_5_moe_text", "olmo_hybrid"].includes(config.model_type)) {
|
|
16743
17671
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
16744
17672
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
16745
17673
|
const cache_values = {};
|
|
@@ -16756,11 +17684,10 @@ function getCacheShapes(config, options) {
|
|
|
16756
17684
|
linear_conv_kernel_dim
|
|
16757
17685
|
} = (
|
|
16758
17686
|
/** @type {any} */
|
|
16759
|
-
config
|
|
17687
|
+
config
|
|
16760
17688
|
);
|
|
16761
17689
|
const key_dim = linear_key_head_dim * linear_num_key_heads;
|
|
16762
17690
|
const value_dim = linear_value_head_dim * linear_num_value_heads;
|
|
16763
|
-
const conv_dim = key_dim * 2 + value_dim;
|
|
16764
17691
|
const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
|
|
16765
17692
|
const batch_size = options?.batch_size ?? 1;
|
|
16766
17693
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
@@ -16769,7 +17696,14 @@ function getCacheShapes(config, options) {
|
|
|
16769
17696
|
cache_values[`${pkv_prefix}.${i}.${kv}`] = [batch_size, num_key_value_heads, 0, final_head_dim];
|
|
16770
17697
|
}
|
|
16771
17698
|
} else if (layer_types[i] === "linear_attention") {
|
|
16772
|
-
|
|
17699
|
+
if (config.model_type === "olmo_hybrid") {
|
|
17700
|
+
cache_values[`${conv_prefix}_conv.${i}.key`] = [batch_size, key_dim, linear_conv_kernel_dim];
|
|
17701
|
+
cache_values[`${conv_prefix}_conv.${i}.value`] = [batch_size, value_dim, linear_conv_kernel_dim];
|
|
17702
|
+
cache_values[`${conv_prefix}_conv.${i}.query`] = [batch_size, key_dim, linear_conv_kernel_dim];
|
|
17703
|
+
} else {
|
|
17704
|
+
const conv_dim = key_dim * 2 + value_dim;
|
|
17705
|
+
cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_dim, linear_conv_kernel_dim];
|
|
17706
|
+
}
|
|
16773
17707
|
cache_values[`${conv_prefix}_recurrent.${i}`] = [
|
|
16774
17708
|
batch_size,
|
|
16775
17709
|
linear_num_value_heads,
|
|
@@ -16781,6 +17715,16 @@ function getCacheShapes(config, options) {
|
|
|
16781
17715
|
}
|
|
16782
17716
|
}
|
|
16783
17717
|
return cache_values;
|
|
17718
|
+
} else if (["lfm2_vl", "qwen3_5", "qwen3_5_moe", "voxtral_realtime"].includes(config.model_type)) {
|
|
17719
|
+
let subConfig;
|
|
17720
|
+
if (config.model_type === "voxtral_realtime" && options?.session_name === "audio_encoder") {
|
|
17721
|
+
subConfig = /** @type {any} */
|
|
17722
|
+
config.audio_config;
|
|
17723
|
+
} else {
|
|
17724
|
+
subConfig = /** @type {any} */
|
|
17725
|
+
config.text_config;
|
|
17726
|
+
}
|
|
17727
|
+
return getCacheShapes(subConfig, options);
|
|
16784
17728
|
}
|
|
16785
17729
|
return getKeyValueShapes(config, options);
|
|
16786
17730
|
}
|
|
@@ -16946,7 +17890,7 @@ async function getModelDataFiles(pretrained_model_name_or_path, fileName, suffix
|
|
|
16946
17890
|
}
|
|
16947
17891
|
|
|
16948
17892
|
// src/models/session.js
|
|
16949
|
-
async function getSession(pretrained_model_name_or_path, fileName, options,
|
|
17893
|
+
async function getSession(pretrained_model_name_or_path, fileName, options, cache_config = false, session_name = void 0) {
|
|
16950
17894
|
let custom_config = options.config?.["transformers.js_config"] ?? {};
|
|
16951
17895
|
const selectedDevice = (
|
|
16952
17896
|
/** @type {import("../utils/devices.js").DeviceType} */
|
|
@@ -17004,9 +17948,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
17004
17948
|
if (externalData.length > 0 && !apis.IS_NODE_ENV) {
|
|
17005
17949
|
session_options.externalData = externalData;
|
|
17006
17950
|
}
|
|
17007
|
-
if (
|
|
17951
|
+
if (cache_config && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
|
|
17008
17952
|
const shapes = getCacheShapes(options.config, {
|
|
17009
|
-
prefix: "present"
|
|
17953
|
+
prefix: "present",
|
|
17954
|
+
session_name
|
|
17010
17955
|
});
|
|
17011
17956
|
if (Object.keys(shapes).length > 0 && !isONNXProxy()) {
|
|
17012
17957
|
const preferredOutputLocation = {};
|
|
@@ -17024,15 +17969,17 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
17024
17969
|
};
|
|
17025
17970
|
return { buffer_or_path, session_options, session_config };
|
|
17026
17971
|
}
|
|
17027
|
-
async function constructSessions(pretrained_model_name_or_path, names, options,
|
|
17972
|
+
async function constructSessions(pretrained_model_name_or_path, names, options, cache_sessions = void 0) {
|
|
17028
17973
|
return Object.fromEntries(
|
|
17029
17974
|
await Promise.all(
|
|
17030
17975
|
Object.keys(names).map(async (name) => {
|
|
17976
|
+
const cache_config = cache_sessions?.[name] ?? false;
|
|
17031
17977
|
const { buffer_or_path, session_options, session_config } = await getSession(
|
|
17032
17978
|
pretrained_model_name_or_path,
|
|
17033
17979
|
names[name],
|
|
17034
17980
|
options,
|
|
17035
|
-
|
|
17981
|
+
cache_config,
|
|
17982
|
+
name
|
|
17036
17983
|
);
|
|
17037
17984
|
const session = await createInferenceSession(buffer_or_path, session_options, session_config);
|
|
17038
17985
|
return [name, session];
|
|
@@ -18332,6 +19279,66 @@ var BeamSearchSampler = class extends LogitsSampler {
|
|
|
18332
19279
|
}
|
|
18333
19280
|
};
|
|
18334
19281
|
|
|
19282
|
+
// src/cache_utils.js
|
|
19283
|
+
/**
 * Name→Tensor container whose entries are stored directly as own properties
 * of the instance (e.g. `cache["past_key_values.0.key"]`).
 */
var _DynamicCache = class {
  /**
   * Create a DynamicCache, optionally pre-populated with entries.
   * @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
   * @throws {TypeError} If a key collides with an existing property/method, or a value is not a Tensor.
   */
  constructor(entries) {
    if (entries) {
      for (const name in entries) {
        // `in` also sees prototype members, so method names such as "dispose" are rejected.
        if (name in this) {
          throw new TypeError(`Key "${name}" conflicts with an existing property on DynamicCache`);
        }
        const tensor = entries[name];
        if (!(tensor instanceof Tensor2)) {
          throw new TypeError(`Expected a Tensor for key "${name}", got ${typeof tensor}`);
        }
        this[name] = tensor;
      }
    }
  }

  /**
   * Get the cached sequence length. This requires at least one attention cache entry to be present.
   * @returns {number} The past sequence length.
   * @throws {Error} If no `past_key_values.*` entry exists.
   */
  get_seq_length() {
    const cache = /** @type {any} */ (this);
    for (const key in cache) {
      if (!key.startsWith("past_key_values.")) continue;
      // The second-to-last dimension of the first matching entry is reported as the length.
      return cache[key].dims.at(-2);
    }
    throw new Error("Unable to determine sequence length from the cache.");
  }

  /**
   * Dispose all contained tensors whose data resides on the GPU.
   * Returns a promise that resolves when all disposals are complete.
   * @returns {Promise<void>} Promise that resolves when all GPU tensors are disposed.
   */
  async dispose() {
    const pending = [];
    for (const tensor of /** @type {Tensor[]} */ (Object.values(this))) {
      if (tensor.location !== "gpu-buffer") continue;
      pending.push(tensor.dispose());
    }
    await Promise.all(pending);
  }
};

// Public alias; the JSDoc casts only adjust the constructor type seen by type checkers.
var DynamicCache = /** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */ (
  /** @type {unknown} */ (_DynamicCache)
);
|
|
19341
|
+
|
|
18335
19342
|
// src/models/modeling_utils.js
|
|
18336
19343
|
var MODEL_MAPPING_NAMES = null;
|
|
18337
19344
|
function registerTaskMappings(mappings) {
|
|
@@ -18377,71 +19384,181 @@ var MODEL_TYPES = {
|
|
|
18377
19384
|
AutoEncoder: 12,
|
|
18378
19385
|
ImageAudioTextToText: 13,
|
|
18379
19386
|
Supertonic: 14,
|
|
18380
|
-
Chatterbox: 15
|
|
19387
|
+
Chatterbox: 15,
|
|
19388
|
+
MultimodalLanguageModelOnly: 16,
|
|
19389
|
+
VoxtralRealtime: 17
|
|
18381
19390
|
};
|
|
18382
19391
|
/**
 * Per-model-type settings, keyed by `MODEL_TYPES` value, with a `default` fallback entry.
 * Fields (all optional except `sessions`):
 *  - `can_generate`: whether the type is flagged as able to generate.
 *  - `forward` / `prepare_inputs`: functions associated with the type.
 *  - `sessions(config, options)`: returns a map of session name → model file name.
 *  - `cache_sessions`: map of session name → `true` for sessions flagged for cache handling
 *    (NOTE(review): presumably the per-session `cache_config` consumed by `constructSessions`; confirm at the call site).
 *  - `optional_configs`: map of config name → file name for optional extra config files.
 */
var MODEL_TYPE_CONFIG = {
  [MODEL_TYPES.DecoderOnly]: {
    can_generate: true,
    forward: decoder_forward,
    prepare_inputs: decoder_prepare_inputs_for_generation,
    // The single model file can be overridden through `options.model_file_name`.
    sessions: (config, options) => ({ model: options.model_file_name ?? "model" }),
    cache_sessions: { model: true },
    optional_configs: { generation_config: "generation_config.json" }
  },
  [MODEL_TYPES.DecoderOnlyWithoutHead]: {
    can_generate: false,
    forward: decoder_forward,
    prepare_inputs: decoder_prepare_inputs_for_generation,
    sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
  },
  [MODEL_TYPES.Seq2Seq]: {
    can_generate: true,
    forward: seq2seq_forward,
    prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
    sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
    cache_sessions: { decoder_model_merged: true },
    optional_configs: { generation_config: "generation_config.json" }
  },
  [MODEL_TYPES.Vision2Seq]: {
    can_generate: true,
    forward: seq2seq_forward,
    prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
    sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
    cache_sessions: { decoder_model_merged: true },
    optional_configs: { generation_config: "generation_config.json" }
  },
  [MODEL_TYPES.Musicgen]: {
    can_generate: true,
    forward: seq2seq_forward,
    sessions: () => ({
      model: "text_encoder",
      decoder_model_merged: "decoder_model_merged",
      encodec_decode: "encodec_decode"
    }),
    cache_sessions: { decoder_model_merged: true },
    optional_configs: { generation_config: "generation_config.json" }
  },
  [MODEL_TYPES.EncoderDecoder]: {
    can_generate: false,
    forward: seq2seq_forward,
    sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
    cache_sessions: { decoder_model_merged: true }
  },
  [MODEL_TYPES.MaskGeneration]: {
    sessions: () => ({ model: "vision_encoder", prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder" })
  },
  [MODEL_TYPES.ImageTextToText]: {
    can_generate: true,
    forward: image_text_to_text_forward,
    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
    sessions: (config) => {
      const s = {
        embed_tokens: "embed_tokens",
        vision_encoder: "vision_encoder",
        decoder_model_merged: "decoder_model_merged"
      };
      // Encoder-decoder variants additionally load a separate encoder model.
      if (config.is_encoder_decoder) s["model"] = "encoder_model";
      return s;
    },
    cache_sessions: { decoder_model_merged: true },
    optional_configs: { generation_config: "generation_config.json" }
  },
  [MODEL_TYPES.AudioTextToText]: {
    can_generate: true,
    forward: audio_text_to_text_forward,
    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
    sessions: () => ({
      embed_tokens: "embed_tokens",
      audio_encoder: "audio_encoder",
      decoder_model_merged: "decoder_model_merged"
    }),
    cache_sessions: { decoder_model_merged: true },
    optional_configs: { generation_config: "generation_config.json" }
  },
  // NOTE(review): unlike the other text-generation multimodal entries, this one declares
  // neither `forward` nor `cache_sessions` — confirm this is intentional.
  [MODEL_TYPES.ImageAudioTextToText]: {
    can_generate: true,
    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
    sessions: () => ({
      embed_tokens: "embed_tokens",
      audio_encoder: "audio_encoder",
      vision_encoder: "vision_encoder",
      decoder_model_merged: "decoder_model_merged"
    }),
    optional_configs: { generation_config: "generation_config.json" }
  },
  [MODEL_TYPES.Phi3V]: {
    can_generate: true,
    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
    sessions: () => ({
      prepare_inputs_embeds: "prepare_inputs_embeds",
      model: "model",
      vision_encoder: "vision_encoder"
    }),
    cache_sessions: { model: true },
    optional_configs: { generation_config: "generation_config.json" }
  },
  [MODEL_TYPES.MultiModality]: {
    can_generate: true,
    sessions: () => ({
      prepare_inputs_embeds: "prepare_inputs_embeds",
      model: "language_model",
      lm_head: "lm_head",
      gen_head: "gen_head",
      gen_img_embeds: "gen_img_embeds",
      image_decode: "image_decode"
    }),
    cache_sessions: { model: true },
    optional_configs: { generation_config: "generation_config.json" }
  },
  [MODEL_TYPES.AutoEncoder]: {
    can_generate: false,
    forward: auto_encoder_forward,
    sessions: () => ({ encoder_model: "encoder_model", decoder_model: "decoder_model" })
  },
  [MODEL_TYPES.Supertonic]: {
    sessions: () => ({
      text_encoder: "text_encoder",
      latent_denoiser: "latent_denoiser",
      voice_decoder: "voice_decoder"
    })
  },
  [MODEL_TYPES.Chatterbox]: {
    can_generate: true,
    forward: encoder_forward,
    sessions: () => ({
      embed_tokens: "embed_tokens",
      speech_encoder: "speech_encoder",
      model: "language_model",
      conditional_decoder: "conditional_decoder"
    }),
    cache_sessions: { model: true },
    optional_configs: { generation_config: "generation_config.json" }
  },
  [MODEL_TYPES.MultimodalLanguageModelOnly]: {
    can_generate: true,
    forward: image_text_to_text_forward,
    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
    sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
    cache_sessions: { decoder_model_merged: true },
    optional_configs: { generation_config: "generation_config.json" }
  },
  [MODEL_TYPES.VoxtralRealtime]: {
    can_generate: true,
    prepare_inputs: decoder_prepare_inputs_for_generation,
    sessions: () => ({
      embed_tokens: "embed_tokens",
      audio_encoder: "audio_encoder",
      decoder_model_merged: "decoder_model_merged"
    }),
    // The only entry that flags two sessions (decoder and audio encoder).
    cache_sessions: { decoder_model_merged: true, audio_encoder: true },
    optional_configs: { generation_config: "generation_config.json" }
  },
  // Fallback entry (read by `getSessionsConfig` when a model type has no entry of its own).
  default: {
    can_generate: false,
    forward: encoder_forward,
    sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
  }
};
|
|
19554
|
+
function getSessionsConfig(modelType, config, options = {}) {
|
|
19555
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
19556
|
+
return {
|
|
19557
|
+
sessions: typeConfig.sessions(config, options),
|
|
19558
|
+
cache_sessions: typeConfig.cache_sessions,
|
|
19559
|
+
optional_configs: typeConfig.optional_configs
|
|
19560
|
+
};
|
|
19561
|
+
}
|
|
18445
19562
|
var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
|
|
18446
19563
|
var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
|
|
18447
19564
|
var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
|
|
@@ -18527,245 +19644,23 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
18527
19644
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
|
|
18528
19645
|
const modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
18529
19646
|
config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
|
|
18530
|
-
|
|
18531
|
-
if (modelType ===
|
|
18532
|
-
|
|
18533
|
-
|
|
18534
|
-
|
|
18535
|
-
{
|
|
18536
|
-
|
|
18537
|
-
},
|
|
18538
|
-
options,
|
|
18539
|
-
"model"
|
|
18540
|
-
),
|
|
18541
|
-
get_optional_configs(
|
|
18542
|
-
pretrained_model_name_or_path,
|
|
18543
|
-
{
|
|
18544
|
-
generation_config: "generation_config.json"
|
|
18545
|
-
},
|
|
18546
|
-
options
|
|
18547
|
-
)
|
|
18548
|
-
]);
|
|
18549
|
-
} else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
|
|
18550
|
-
info = await Promise.all([
|
|
18551
|
-
constructSessions(
|
|
18552
|
-
pretrained_model_name_or_path,
|
|
18553
|
-
{
|
|
18554
|
-
model: "encoder_model",
|
|
18555
|
-
decoder_model_merged: "decoder_model_merged"
|
|
18556
|
-
},
|
|
18557
|
-
options,
|
|
18558
|
-
"decoder_model_merged"
|
|
18559
|
-
),
|
|
18560
|
-
get_optional_configs(
|
|
18561
|
-
pretrained_model_name_or_path,
|
|
18562
|
-
{
|
|
18563
|
-
generation_config: "generation_config.json"
|
|
18564
|
-
},
|
|
18565
|
-
options
|
|
18566
|
-
)
|
|
18567
|
-
]);
|
|
18568
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
18569
|
-
info = await Promise.all([
|
|
18570
|
-
constructSessions(
|
|
18571
|
-
pretrained_model_name_or_path,
|
|
18572
|
-
{
|
|
18573
|
-
model: "vision_encoder",
|
|
18574
|
-
prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder"
|
|
18575
|
-
},
|
|
18576
|
-
options
|
|
18577
|
-
)
|
|
18578
|
-
]);
|
|
18579
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
18580
|
-
info = await Promise.all([
|
|
18581
|
-
constructSessions(
|
|
18582
|
-
pretrained_model_name_or_path,
|
|
18583
|
-
{
|
|
18584
|
-
model: "encoder_model",
|
|
18585
|
-
decoder_model_merged: "decoder_model_merged"
|
|
18586
|
-
},
|
|
18587
|
-
options,
|
|
18588
|
-
"decoder_model_merged"
|
|
18589
|
-
)
|
|
18590
|
-
]);
|
|
18591
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
18592
|
-
const sessions = {
|
|
18593
|
-
embed_tokens: "embed_tokens",
|
|
18594
|
-
vision_encoder: "vision_encoder",
|
|
18595
|
-
decoder_model_merged: "decoder_model_merged"
|
|
18596
|
-
};
|
|
18597
|
-
if (config.is_encoder_decoder) {
|
|
18598
|
-
sessions["model"] = "encoder_model";
|
|
18599
|
-
}
|
|
18600
|
-
info = await Promise.all([
|
|
18601
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
18602
|
-
get_optional_configs(
|
|
18603
|
-
pretrained_model_name_or_path,
|
|
18604
|
-
{
|
|
18605
|
-
generation_config: "generation_config.json"
|
|
18606
|
-
},
|
|
18607
|
-
options
|
|
18608
|
-
)
|
|
18609
|
-
]);
|
|
18610
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
18611
|
-
const sessions = {
|
|
18612
|
-
embed_tokens: "embed_tokens",
|
|
18613
|
-
audio_encoder: "audio_encoder",
|
|
18614
|
-
decoder_model_merged: "decoder_model_merged"
|
|
18615
|
-
};
|
|
18616
|
-
info = await Promise.all([
|
|
18617
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
18618
|
-
get_optional_configs(
|
|
18619
|
-
pretrained_model_name_or_path,
|
|
18620
|
-
{
|
|
18621
|
-
generation_config: "generation_config.json"
|
|
18622
|
-
},
|
|
18623
|
-
options
|
|
18624
|
-
)
|
|
18625
|
-
]);
|
|
18626
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
18627
|
-
const sessions = {
|
|
18628
|
-
embed_tokens: "embed_tokens",
|
|
18629
|
-
audio_encoder: "audio_encoder",
|
|
18630
|
-
vision_encoder: "vision_encoder",
|
|
18631
|
-
decoder_model_merged: "decoder_model_merged"
|
|
18632
|
-
};
|
|
18633
|
-
info = await Promise.all([
|
|
18634
|
-
constructSessions(pretrained_model_name_or_path, sessions, options),
|
|
18635
|
-
get_optional_configs(
|
|
18636
|
-
pretrained_model_name_or_path,
|
|
18637
|
-
{
|
|
18638
|
-
generation_config: "generation_config.json"
|
|
18639
|
-
},
|
|
18640
|
-
options
|
|
18641
|
-
)
|
|
18642
|
-
]);
|
|
18643
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
18644
|
-
info = await Promise.all([
|
|
18645
|
-
constructSessions(
|
|
18646
|
-
pretrained_model_name_or_path,
|
|
18647
|
-
{
|
|
18648
|
-
model: "text_encoder",
|
|
18649
|
-
decoder_model_merged: "decoder_model_merged",
|
|
18650
|
-
encodec_decode: "encodec_decode"
|
|
18651
|
-
},
|
|
18652
|
-
options,
|
|
18653
|
-
"decoder_model_merged"
|
|
18654
|
-
),
|
|
18655
|
-
get_optional_configs(
|
|
18656
|
-
pretrained_model_name_or_path,
|
|
18657
|
-
{
|
|
18658
|
-
generation_config: "generation_config.json"
|
|
18659
|
-
},
|
|
18660
|
-
options
|
|
18661
|
-
)
|
|
18662
|
-
]);
|
|
18663
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
18664
|
-
info = await Promise.all([
|
|
18665
|
-
constructSessions(
|
|
18666
|
-
pretrained_model_name_or_path,
|
|
18667
|
-
{
|
|
18668
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
18669
|
-
model: "language_model",
|
|
18670
|
-
lm_head: "lm_head",
|
|
18671
|
-
gen_head: "gen_head",
|
|
18672
|
-
gen_img_embeds: "gen_img_embeds",
|
|
18673
|
-
image_decode: "image_decode"
|
|
18674
|
-
},
|
|
18675
|
-
options,
|
|
18676
|
-
"model"
|
|
18677
|
-
),
|
|
18678
|
-
get_optional_configs(
|
|
18679
|
-
pretrained_model_name_or_path,
|
|
18680
|
-
{
|
|
18681
|
-
generation_config: "generation_config.json"
|
|
18682
|
-
},
|
|
18683
|
-
options
|
|
18684
|
-
)
|
|
18685
|
-
]);
|
|
18686
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
18687
|
-
info = await Promise.all([
|
|
18688
|
-
constructSessions(
|
|
18689
|
-
pretrained_model_name_or_path,
|
|
18690
|
-
{
|
|
18691
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
18692
|
-
model: "model",
|
|
18693
|
-
vision_encoder: "vision_encoder"
|
|
18694
|
-
},
|
|
18695
|
-
options,
|
|
18696
|
-
"model"
|
|
18697
|
-
),
|
|
18698
|
-
get_optional_configs(
|
|
18699
|
-
pretrained_model_name_or_path,
|
|
18700
|
-
{
|
|
18701
|
-
generation_config: "generation_config.json"
|
|
18702
|
-
},
|
|
18703
|
-
options
|
|
18704
|
-
)
|
|
18705
|
-
]);
|
|
18706
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
18707
|
-
info = await Promise.all([
|
|
18708
|
-
constructSessions(
|
|
18709
|
-
pretrained_model_name_or_path,
|
|
18710
|
-
{
|
|
18711
|
-
embed_tokens: "embed_tokens",
|
|
18712
|
-
speech_encoder: "speech_encoder",
|
|
18713
|
-
model: "language_model",
|
|
18714
|
-
conditional_decoder: "conditional_decoder"
|
|
18715
|
-
},
|
|
18716
|
-
options,
|
|
18717
|
-
"model"
|
|
18718
|
-
),
|
|
18719
|
-
get_optional_configs(
|
|
18720
|
-
pretrained_model_name_or_path,
|
|
18721
|
-
{
|
|
18722
|
-
generation_config: "generation_config.json"
|
|
18723
|
-
},
|
|
18724
|
-
options
|
|
18725
|
-
)
|
|
18726
|
-
]);
|
|
18727
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
18728
|
-
info = await Promise.all([
|
|
18729
|
-
constructSessions(
|
|
18730
|
-
pretrained_model_name_or_path,
|
|
18731
|
-
{
|
|
18732
|
-
encoder_model: "encoder_model",
|
|
18733
|
-
decoder_model: "decoder_model"
|
|
18734
|
-
},
|
|
18735
|
-
options
|
|
18736
|
-
)
|
|
18737
|
-
]);
|
|
18738
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
18739
|
-
info = await Promise.all([
|
|
18740
|
-
constructSessions(
|
|
18741
|
-
pretrained_model_name_or_path,
|
|
18742
|
-
{
|
|
18743
|
-
text_encoder: "text_encoder",
|
|
18744
|
-
latent_denoiser: "latent_denoiser",
|
|
18745
|
-
voice_decoder: "voice_decoder"
|
|
18746
|
-
},
|
|
18747
|
-
options
|
|
18748
|
-
)
|
|
18749
|
-
]);
|
|
18750
|
-
} else {
|
|
18751
|
-
if (modelType === void 0) {
|
|
18752
|
-
const type = modelName ?? config?.model_type;
|
|
18753
|
-
if (type !== "custom") {
|
|
18754
|
-
logger.warn(
|
|
18755
|
-
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
18756
|
-
);
|
|
18757
|
-
}
|
|
19647
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
19648
|
+
if (modelType === void 0) {
|
|
19649
|
+
const type = modelName ?? config?.model_type;
|
|
19650
|
+
if (type !== "custom") {
|
|
19651
|
+
logger.warn(
|
|
19652
|
+
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
19653
|
+
);
|
|
18758
19654
|
}
|
|
18759
|
-
info = await Promise.all([
|
|
18760
|
-
constructSessions(
|
|
18761
|
-
pretrained_model_name_or_path,
|
|
18762
|
-
{
|
|
18763
|
-
model: options.model_file_name ?? "model"
|
|
18764
|
-
},
|
|
18765
|
-
options
|
|
18766
|
-
)
|
|
18767
|
-
]);
|
|
18768
19655
|
}
|
|
19656
|
+
const sessions = typeConfig.sessions(config, options);
|
|
19657
|
+
const promises = [
|
|
19658
|
+
constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
|
|
19659
|
+
];
|
|
19660
|
+
if (typeConfig.optional_configs) {
|
|
19661
|
+
promises.push(get_optional_configs(pretrained_model_name_or_path, typeConfig.optional_configs, options));
|
|
19662
|
+
}
|
|
19663
|
+
const info = await Promise.all(promises);
|
|
18769
19664
|
return new this(config, ...info);
|
|
18770
19665
|
}
|
|
18771
19666
|
/**
|
|
@@ -18964,7 +19859,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
18964
19859
|
* @param {Tensor} [params.inputs=null]
|
|
18965
19860
|
* @param {number} [params.bos_token_id=null]
|
|
18966
19861
|
* @param {Record<string, Tensor|number[]>} [params.model_kwargs]
|
|
18967
|
-
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor
|
|
19862
|
+
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor> & {past_key_values?: DynamicCache}, model_input_name: string}} The model-specific inputs for generation.
|
|
18968
19863
|
*/
|
|
18969
19864
|
_prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) {
|
|
18970
19865
|
const model_inputs = pick(model_kwargs, this.forward_params);
|
|
@@ -19205,11 +20100,12 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19205
20100
|
}
|
|
19206
20101
|
}
|
|
19207
20102
|
/**
|
|
19208
|
-
* Returns
|
|
20103
|
+
* Returns a DynamicCache containing past key values from the given decoder results object.
|
|
19209
20104
|
*
|
|
19210
20105
|
* @param {Object} decoderResults The decoder results object.
|
|
19211
|
-
* @param {
|
|
19212
|
-
* @
|
|
20106
|
+
* @param {DynamicCache} pastKeyValues The previous past key values.
|
|
20107
|
+
* @param {boolean} [disposeEncoderPKVs=false] Whether to dispose encoder past key values.
|
|
20108
|
+
* @returns {DynamicCache} A new DynamicCache containing the updated past key values.
|
|
19213
20109
|
*/
|
|
19214
20110
|
getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
|
|
19215
20111
|
const pkvs = /* @__PURE__ */ Object.create(null);
|
|
@@ -19230,7 +20126,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19230
20126
|
}
|
|
19231
20127
|
}
|
|
19232
20128
|
}
|
|
19233
|
-
return pkvs;
|
|
20129
|
+
return new DynamicCache(pkvs);
|
|
19234
20130
|
}
|
|
19235
20131
|
/**
|
|
19236
20132
|
* Returns an object containing attentions from the given model output object.
|
|
@@ -19255,8 +20151,8 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19255
20151
|
/**
|
|
19256
20152
|
* Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values.
|
|
19257
20153
|
*
|
|
19258
|
-
* @param {
|
|
19259
|
-
* @param {
|
|
20154
|
+
* @param {Record<string, any>} decoderFeeds The decoder feeds object to add past key values to.
|
|
20155
|
+
* @param {DynamicCache|null} pastKeyValues The cache containing past key values.
|
|
19260
20156
|
*/
|
|
19261
20157
|
addPastKeyValues(decoderFeeds, pastKeyValues) {
|
|
19262
20158
|
if (pastKeyValues) {
|
|
@@ -19273,14 +20169,29 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19273
20169
|
}
|
|
19274
20170
|
}
|
|
19275
20171
|
}
|
|
19276
|
-
|
|
19277
|
-
|
|
20172
|
+
/**
|
|
20173
|
+
* Helper function to select valid inputs and run through the appropriate encoder (vision, text, audio) based on the input type.
|
|
20174
|
+
* @param {string} sessionName
|
|
20175
|
+
* @param {Record<string, Tensor>} inputs
|
|
20176
|
+
* @param {string} outputName
|
|
20177
|
+
* @private
|
|
20178
|
+
*/
|
|
20179
|
+
async _encode_input(sessionName, inputs, outputName) {
|
|
20180
|
+
if (!Object.hasOwn(this.sessions, sessionName)) {
|
|
20181
|
+
throw new Error(`Model does not have a ${sessionName} session.`);
|
|
20182
|
+
}
|
|
20183
|
+
const session = this.sessions[sessionName];
|
|
20184
|
+
const output = await sessionRun(session, pick(inputs, session.inputNames));
|
|
20185
|
+
return output[outputName];
|
|
20186
|
+
}
|
|
20187
|
+
async encode_image(inputs) {
|
|
20188
|
+
return this._encode_input("vision_encoder", inputs, "image_features");
|
|
19278
20189
|
}
|
|
19279
|
-
async encode_text(
|
|
19280
|
-
return
|
|
20190
|
+
async encode_text(inputs) {
|
|
20191
|
+
return this._encode_input("embed_tokens", inputs, "inputs_embeds");
|
|
19281
20192
|
}
|
|
19282
|
-
async encode_audio(
|
|
19283
|
-
return
|
|
20193
|
+
async encode_audio(inputs) {
|
|
20194
|
+
return this._encode_input("audio_encoder", inputs, "audio_features");
|
|
19284
20195
|
}
|
|
19285
20196
|
};
|
|
19286
20197
|
async function seq2seq_forward(self2, model_inputs) {
|
|
@@ -19335,6 +20246,9 @@ async function decoder_forward(self2, model_inputs, is_encoder_decoder = false)
|
|
|
19335
20246
|
const start_index = ["paligemma", "gemma3_text", "gemma3"].includes(self2.config.model_type) ? 1 : 0;
|
|
19336
20247
|
new_model_inputs.position_ids = create_position_ids(new_model_inputs, past_key_values, start_index);
|
|
19337
20248
|
}
|
|
20249
|
+
if (session.inputNames.includes("num_logits_to_keep") && !new_model_inputs.num_logits_to_keep) {
|
|
20250
|
+
new_model_inputs.num_logits_to_keep = new Tensor2("int64", [0n], []);
|
|
20251
|
+
}
|
|
19338
20252
|
self2.addPastKeyValues(new_model_inputs, past_key_values);
|
|
19339
20253
|
const fixed = pick(new_model_inputs, session.inputNames);
|
|
19340
20254
|
return await sessionRun(session, fixed);
|
|
@@ -19343,7 +20257,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
19343
20257
|
// Generic parameters:
|
|
19344
20258
|
encode_function,
|
|
19345
20259
|
merge_function,
|
|
19346
|
-
|
|
20260
|
+
modality_input_names,
|
|
19347
20261
|
modality_output_name,
|
|
19348
20262
|
// Produced by the tokenizer/processor:
|
|
19349
20263
|
input_ids = null,
|
|
@@ -19358,38 +20272,54 @@ async function generic_text_to_text_forward(self2, {
|
|
|
19358
20272
|
// Additional parameters
|
|
19359
20273
|
...kwargs
|
|
19360
20274
|
}) {
|
|
19361
|
-
const modality_values = kwargs[modality_input_name];
|
|
19362
20275
|
if (!inputs_embeds) {
|
|
19363
20276
|
inputs_embeds = await self2.encode_text({ input_ids, ...kwargs });
|
|
19364
|
-
|
|
19365
|
-
|
|
19366
|
-
|
|
19367
|
-
|
|
19368
|
-
|
|
19369
|
-
|
|
19370
|
-
|
|
19371
|
-
|
|
19372
|
-
|
|
19373
|
-
inputs_embeds,
|
|
19374
|
-
|
|
19375
|
-
|
|
19376
|
-
|
|
19377
|
-
|
|
19378
|
-
|
|
19379
|
-
|
|
19380
|
-
|
|
19381
|
-
|
|
19382
|
-
|
|
19383
|
-
|
|
19384
|
-
|
|
19385
|
-
|
|
19386
|
-
|
|
20277
|
+
const modality_values = pick(kwargs, modality_input_names);
|
|
20278
|
+
if (Object.keys(modality_values).length > 0) {
|
|
20279
|
+
if (input_ids.dims[1] !== 1) {
|
|
20280
|
+
const modality_features = await encode_function({
|
|
20281
|
+
// Pass the modality values under its expected key.
|
|
20282
|
+
// The caller knows whether this is audio or image.
|
|
20283
|
+
...modality_values,
|
|
20284
|
+
...kwargs
|
|
20285
|
+
});
|
|
20286
|
+
({ inputs_embeds, attention_mask } = merge_function({
|
|
20287
|
+
[modality_output_name]: modality_features,
|
|
20288
|
+
inputs_embeds,
|
|
20289
|
+
input_ids,
|
|
20290
|
+
attention_mask
|
|
20291
|
+
}));
|
|
20292
|
+
} else if (past_key_values && input_ids.dims[1] === 1) {
|
|
20293
|
+
const target_length = input_ids.dims[1];
|
|
20294
|
+
const past_length = past_key_values.get_seq_length();
|
|
20295
|
+
attention_mask = cat(
|
|
20296
|
+
[
|
|
20297
|
+
ones([input_ids.dims[0], past_length]),
|
|
20298
|
+
attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
|
|
20299
|
+
],
|
|
20300
|
+
1
|
|
20301
|
+
);
|
|
20302
|
+
}
|
|
19387
20303
|
}
|
|
19388
20304
|
}
|
|
19389
20305
|
if (!position_ids) {
|
|
19390
|
-
if (
|
|
19391
|
-
|
|
19392
|
-
|
|
20306
|
+
if (
|
|
20307
|
+
// Handle special case for qwen vl models
|
|
20308
|
+
[
|
|
20309
|
+
"qwen2_vl",
|
|
20310
|
+
"qwen2_vl_text",
|
|
20311
|
+
"qwen2_5_vl",
|
|
20312
|
+
"qwen2_5_vl_text",
|
|
20313
|
+
"qwen3_vl",
|
|
20314
|
+
"qwen3_vl_text",
|
|
20315
|
+
"qwen3_vl_moe",
|
|
20316
|
+
"qwen3_vl_moe_text",
|
|
20317
|
+
"qwen3_5",
|
|
20318
|
+
"qwen3_5_text",
|
|
20319
|
+
"qwen3_5_moe",
|
|
20320
|
+
"qwen3_5_moe_text"
|
|
20321
|
+
].includes(self2.config.model_type)
|
|
20322
|
+
) {
|
|
19393
20323
|
const { image_grid_thw, video_grid_thw } = kwargs;
|
|
19394
20324
|
[position_ids] = self2.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask);
|
|
19395
20325
|
}
|
|
@@ -19411,7 +20341,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
19411
20341
|
async function audio_text_to_text_forward(self2, params) {
|
|
19412
20342
|
return await generic_text_to_text_forward(self2, {
|
|
19413
20343
|
...params,
|
|
19414
|
-
|
|
20344
|
+
modality_input_names: ["audio_values", "input_features"],
|
|
19415
20345
|
modality_output_name: "audio_features",
|
|
19416
20346
|
encode_function: self2.encode_audio.bind(self2),
|
|
19417
20347
|
merge_function: self2._merge_input_ids_with_audio_features.bind(self2)
|
|
@@ -19420,7 +20350,7 @@ async function audio_text_to_text_forward(self2, params) {
|
|
|
19420
20350
|
async function image_text_to_text_forward(self2, params) {
|
|
19421
20351
|
return await generic_text_to_text_forward(self2, {
|
|
19422
20352
|
...params,
|
|
19423
|
-
|
|
20353
|
+
modality_input_names: ["pixel_values"],
|
|
19424
20354
|
modality_output_name: "image_features",
|
|
19425
20355
|
encode_function: self2.encode_image.bind(self2),
|
|
19426
20356
|
merge_function: self2._merge_input_ids_with_image_features.bind(self2)
|
|
@@ -19456,7 +20386,11 @@ function create_position_ids(model_inputs, past_key_values = null, start_index =
|
|
|
19456
20386
|
return position_ids;
|
|
19457
20387
|
}
|
|
19458
20388
|
function decoder_prepare_inputs_for_generation(self2, input_ids, model_inputs, generation_config) {
|
|
19459
|
-
const past_length = model_inputs.past_key_values ?
|
|
20389
|
+
const past_length = model_inputs.past_key_values ? model_inputs.past_key_values.get_seq_length() : 0;
|
|
20390
|
+
const session = self2.sessions["decoder_model_merged"] ?? self2.sessions["model"];
|
|
20391
|
+
if (session?.inputNames.includes("num_logits_to_keep") && !model_inputs.num_logits_to_keep) {
|
|
20392
|
+
model_inputs.num_logits_to_keep = new Tensor2("int64", [1n], []);
|
|
20393
|
+
}
|
|
19460
20394
|
if (!model_inputs.attention_mask) {
|
|
19461
20395
|
let dims;
|
|
19462
20396
|
for (const key of ["input_ids", "inputs_embeds", "position_ids"]) {
|
|
@@ -19764,6 +20698,7 @@ __export(models_exports, {
|
|
|
19764
20698
|
Gemma3ForCausalLM: () => Gemma3ForCausalLM,
|
|
19765
20699
|
Gemma3Model: () => Gemma3Model,
|
|
19766
20700
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
20701
|
+
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
19767
20702
|
Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
|
|
19768
20703
|
Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
|
|
19769
20704
|
GemmaForCausalLM: () => GemmaForCausalLM,
|
|
@@ -19781,6 +20716,7 @@ __export(models_exports, {
|
|
|
19781
20716
|
GraniteMoeHybridModel: () => GraniteMoeHybridModel,
|
|
19782
20717
|
GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
|
|
19783
20718
|
GranitePreTrainedModel: () => GranitePreTrainedModel,
|
|
20719
|
+
GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
|
|
19784
20720
|
GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
|
|
19785
20721
|
GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
|
|
19786
20722
|
GroupViTModel: () => GroupViTModel,
|
|
@@ -19802,7 +20738,6 @@ __export(models_exports, {
|
|
|
19802
20738
|
IJepaModel: () => IJepaModel,
|
|
19803
20739
|
IJepaPreTrainedModel: () => IJepaPreTrainedModel,
|
|
19804
20740
|
Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
|
|
19805
|
-
Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
|
|
19806
20741
|
JAISLMHeadModel: () => JAISLMHeadModel,
|
|
19807
20742
|
JAISModel: () => JAISModel,
|
|
19808
20743
|
JAISPreTrainedModel: () => JAISPreTrainedModel,
|
|
@@ -19816,6 +20751,7 @@ __export(models_exports, {
|
|
|
19816
20751
|
Lfm2MoeModel: () => Lfm2MoeModel,
|
|
19817
20752
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
19818
20753
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
20754
|
+
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
19819
20755
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
19820
20756
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
19821
20757
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -19940,6 +20876,9 @@ __export(models_exports, {
|
|
|
19940
20876
|
Olmo3Model: () => Olmo3Model,
|
|
19941
20877
|
Olmo3PreTrainedModel: () => Olmo3PreTrainedModel,
|
|
19942
20878
|
OlmoForCausalLM: () => OlmoForCausalLM,
|
|
20879
|
+
OlmoHybridForCausalLM: () => OlmoHybridForCausalLM,
|
|
20880
|
+
OlmoHybridModel: () => OlmoHybridModel,
|
|
20881
|
+
OlmoHybridPreTrainedModel: () => OlmoHybridPreTrainedModel,
|
|
19943
20882
|
OlmoModel: () => OlmoModel,
|
|
19944
20883
|
OlmoPreTrainedModel: () => OlmoPreTrainedModel,
|
|
19945
20884
|
OpenELMForCausalLM: () => OpenELMForCausalLM,
|
|
@@ -19952,7 +20891,6 @@ __export(models_exports, {
|
|
|
19952
20891
|
Owlv2Model: () => Owlv2Model,
|
|
19953
20892
|
Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
|
|
19954
20893
|
PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
|
|
19955
|
-
PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
|
|
19956
20894
|
ParakeetForCTC: () => ParakeetForCTC,
|
|
19957
20895
|
ParakeetPreTrainedModel: () => ParakeetPreTrainedModel,
|
|
19958
20896
|
PatchTSMixerForPrediction: () => PatchTSMixerForPrediction,
|
|
@@ -19978,15 +20916,31 @@ __export(models_exports, {
|
|
|
19978
20916
|
PyAnnotePreTrainedModel: () => PyAnnotePreTrainedModel,
|
|
19979
20917
|
Qwen2ForCausalLM: () => Qwen2ForCausalLM,
|
|
19980
20918
|
Qwen2Model: () => Qwen2Model,
|
|
20919
|
+
Qwen2MoeForCausalLM: () => Qwen2MoeForCausalLM,
|
|
20920
|
+
Qwen2MoeModel: () => Qwen2MoeModel,
|
|
20921
|
+
Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
|
|
19981
20922
|
Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
|
|
20923
|
+
Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
|
|
19982
20924
|
Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
|
|
19983
20925
|
Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
|
|
20926
|
+
Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
|
|
19984
20927
|
Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
|
|
19985
20928
|
Qwen3ForCausalLM: () => Qwen3ForCausalLM,
|
|
19986
20929
|
Qwen3Model: () => Qwen3Model,
|
|
20930
|
+
Qwen3MoeForCausalLM: () => Qwen3MoeForCausalLM,
|
|
20931
|
+
Qwen3MoeModel: () => Qwen3MoeModel,
|
|
20932
|
+
Qwen3MoePreTrainedModel: () => Qwen3MoePreTrainedModel,
|
|
20933
|
+
Qwen3NextForCausalLM: () => Qwen3NextForCausalLM,
|
|
20934
|
+
Qwen3NextModel: () => Qwen3NextModel,
|
|
20935
|
+
Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
|
|
19987
20936
|
Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
|
|
20937
|
+
Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
|
|
19988
20938
|
Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
|
|
20939
|
+
Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
|
|
20940
|
+
Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
|
|
20941
|
+
Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
|
|
19989
20942
|
Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
|
|
20943
|
+
Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
|
|
19990
20944
|
Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
|
|
19991
20945
|
RFDetrForObjectDetection: () => RFDetrForObjectDetection,
|
|
19992
20946
|
RFDetrModel: () => RFDetrModel,
|
|
@@ -20037,7 +20991,6 @@ __export(models_exports, {
|
|
|
20037
20991
|
SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
|
|
20038
20992
|
SmolLM3Model: () => SmolLM3Model,
|
|
20039
20993
|
SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
|
|
20040
|
-
SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
|
|
20041
20994
|
SnacDecoderModel: () => SnacDecoderModel,
|
|
20042
20995
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
20043
20996
|
SnacModel: () => SnacModel,
|
|
@@ -20109,6 +21062,8 @@ __export(models_exports, {
|
|
|
20109
21062
|
VitsModelOutput: () => VitsModelOutput,
|
|
20110
21063
|
VitsPreTrainedModel: () => VitsPreTrainedModel,
|
|
20111
21064
|
VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
|
|
21065
|
+
VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
|
|
21066
|
+
VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
|
|
20112
21067
|
Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
|
|
20113
21068
|
Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
|
|
20114
21069
|
Wav2Vec2BertModel: () => Wav2Vec2BertModel,
|
|
@@ -20469,7 +21424,7 @@ var ChatterboxModel = class extends ChatterboxPreTrainedModel {
|
|
|
20469
21424
|
if (!past_key_values || target_length !== 1) {
|
|
20470
21425
|
throw new Error("Incorrect state encountered during generation.");
|
|
20471
21426
|
}
|
|
20472
|
-
const past_length =
|
|
21427
|
+
const past_length = past_key_values.get_seq_length();
|
|
20473
21428
|
attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
|
|
20474
21429
|
}
|
|
20475
21430
|
}
|
|
@@ -21499,6 +22454,8 @@ var Gemma3nForConditionalGeneration = class extends Gemma3nPreTrainedModel {
|
|
|
21499
22454
|
});
|
|
21500
22455
|
}
|
|
21501
22456
|
};
|
|
22457
|
+
var Gemma3nForCausalLM = class extends Gemma3nForConditionalGeneration {
|
|
22458
|
+
};
|
|
21502
22459
|
|
|
21503
22460
|
// src/models/glm/modeling_glm.js
|
|
21504
22461
|
var GlmPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -21580,6 +22537,28 @@ var GraniteMoeHybridModel = class extends GraniteMoeHybridPreTrainedModel {
|
|
|
21580
22537
|
var GraniteMoeHybridForCausalLM = class extends GraniteMoeHybridPreTrainedModel {
|
|
21581
22538
|
};
|
|
21582
22539
|
|
|
22540
|
+
// src/models/ultravox/modeling_ultravox.js
|
|
22541
|
+
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
22542
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
22543
|
+
};
|
|
22544
|
+
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
22545
|
+
_merge_input_ids_with_audio_features(kwargs) {
|
|
22546
|
+
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
22547
|
+
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
22548
|
+
return default_merge_input_ids_with_audio_features({
|
|
22549
|
+
// @ts-ignore
|
|
22550
|
+
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id ?? this.config.audio_token_index,
|
|
22551
|
+
...kwargs,
|
|
22552
|
+
audio_features: reshaped_audio_features
|
|
22553
|
+
});
|
|
22554
|
+
}
|
|
22555
|
+
};
|
|
22556
|
+
|
|
22557
|
+
// src/models/granite_speech/modeling_granite_speech.js
|
|
22558
|
+
var GraniteSpeechForConditionalGeneration = class extends UltravoxModel {
|
|
22559
|
+
forward_params = ["input_ids", "attention_mask", "input_features", "past_key_values"];
|
|
22560
|
+
};
|
|
22561
|
+
|
|
21583
22562
|
// src/models/grounding_dino/modeling_grounding_dino.js
|
|
21584
22563
|
var GroundingDinoPreTrainedModel = class extends PreTrainedModel {
|
|
21585
22564
|
};
|
|
@@ -21675,17 +22654,38 @@ var HubertForSequenceClassification = class extends Wav2Vec2PreTrainedModel {
|
|
|
21675
22654
|
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
21676
22655
|
}
|
|
21677
22656
|
};
|
|
21678
|
-
|
|
21679
|
-
// src/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.js
|
|
21680
|
-
var HunYuanDenseV1PreTrainedModel = class extends PreTrainedModel {
|
|
21681
|
-
};
|
|
21682
|
-
var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
|
|
22657
|
+
|
|
22658
|
+
// src/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.js
|
|
22659
|
+
var HunYuanDenseV1PreTrainedModel = class extends PreTrainedModel {
|
|
22660
|
+
};
|
|
22661
|
+
var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
|
|
22662
|
+
};
|
|
22663
|
+
var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
|
|
22664
|
+
};
|
|
22665
|
+
|
|
22666
|
+
// src/models/llava/modeling_llava.js
|
|
22667
|
+
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
22668
|
+
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
22669
|
+
};
|
|
22670
|
+
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
22671
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
22672
|
+
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
22673
|
+
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
22674
|
+
return default_merge_input_ids_with_image_features({
|
|
22675
|
+
// @ts-ignore
|
|
22676
|
+
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
22677
|
+
...kwargs,
|
|
22678
|
+
image_features: reshaped_image_hidden_states
|
|
22679
|
+
});
|
|
22680
|
+
}
|
|
22681
|
+
};
|
|
22682
|
+
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
21683
22683
|
};
|
|
21684
|
-
var
|
|
22684
|
+
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
21685
22685
|
};
|
|
21686
22686
|
|
|
21687
22687
|
// src/models/idefics3/modeling_idefics3.js
|
|
21688
|
-
var
|
|
22688
|
+
var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
21689
22689
|
forward_params = [
|
|
21690
22690
|
"input_ids",
|
|
21691
22691
|
"attention_mask",
|
|
@@ -21695,24 +22695,6 @@ var Idefics3PreTrainedModel = class extends PreTrainedModel {
|
|
|
21695
22695
|
"past_key_values"
|
|
21696
22696
|
];
|
|
21697
22697
|
};
|
|
21698
|
-
var Idefics3ForConditionalGeneration = class extends Idefics3PreTrainedModel {
|
|
21699
|
-
async encode_image({ pixel_values, pixel_attention_mask }) {
|
|
21700
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], { pixel_values, pixel_attention_mask })).image_features;
|
|
21701
|
-
return features;
|
|
21702
|
-
}
|
|
21703
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
21704
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
21705
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
21706
|
-
return default_merge_input_ids_with_image_features({
|
|
21707
|
-
// @ts-ignore
|
|
21708
|
-
image_token_id: this.config.image_token_id,
|
|
21709
|
-
...kwargs,
|
|
21710
|
-
image_features: reshaped_image_hidden_states
|
|
21711
|
-
});
|
|
21712
|
-
}
|
|
21713
|
-
};
|
|
21714
|
-
var SmolVLMForConditionalGeneration = class extends Idefics3ForConditionalGeneration {
|
|
21715
|
-
};
|
|
21716
22698
|
|
|
21717
22699
|
// src/models/ijepa/modeling_ijepa.js
|
|
21718
22700
|
var IJepaPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -21803,6 +22785,19 @@ var Lfm2MoeModel = class extends Lfm2MoePreTrainedModel {
|
|
|
21803
22785
|
var Lfm2MoeForCausalLM = class extends Lfm2MoePreTrainedModel {
|
|
21804
22786
|
};
|
|
21805
22787
|
|
|
22788
|
+
// src/models/lfm2_vl/modeling_lfm2_vl.js
|
|
22789
|
+
var Lfm2VlForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
22790
|
+
forward_params = [
|
|
22791
|
+
"input_ids",
|
|
22792
|
+
"attention_mask",
|
|
22793
|
+
"pixel_values",
|
|
22794
|
+
"pixel_attention_mask",
|
|
22795
|
+
"spatial_shapes",
|
|
22796
|
+
"position_ids",
|
|
22797
|
+
"past_key_values"
|
|
22798
|
+
];
|
|
22799
|
+
};
|
|
22800
|
+
|
|
21806
22801
|
// src/models/llama/modeling_llama.js
|
|
21807
22802
|
var LlamaPreTrainedModel = class extends PreTrainedModel {
|
|
21808
22803
|
};
|
|
@@ -21817,27 +22812,6 @@ var Llama4PreTrainedModel = class extends PreTrainedModel {
|
|
|
21817
22812
|
var Llama4ForCausalLM = class extends Llama4PreTrainedModel {
|
|
21818
22813
|
};
|
|
21819
22814
|
|
|
21820
|
-
// src/models/llava/modeling_llava.js
|
|
21821
|
-
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
21822
|
-
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
21823
|
-
};
|
|
21824
|
-
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
21825
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
21826
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
21827
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
21828
|
-
return default_merge_input_ids_with_image_features({
|
|
21829
|
-
// @ts-ignore
|
|
21830
|
-
image_token_id: this.config.image_token_index,
|
|
21831
|
-
...kwargs,
|
|
21832
|
-
image_features: reshaped_image_hidden_states
|
|
21833
|
-
});
|
|
21834
|
-
}
|
|
21835
|
-
};
|
|
21836
|
-
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
21837
|
-
};
|
|
21838
|
-
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
21839
|
-
};
|
|
21840
|
-
|
|
21841
22815
|
// src/models/longt5/modeling_longt5.js
|
|
21842
22816
|
var LongT5PreTrainedModel = class extends PreTrainedModel {
|
|
21843
22817
|
};
|
|
@@ -22547,6 +23521,14 @@ var Olmo3Model = class extends Olmo3PreTrainedModel {
|
|
|
22547
23521
|
var Olmo3ForCausalLM = class extends Olmo3PreTrainedModel {
|
|
22548
23522
|
};
|
|
22549
23523
|
|
|
23524
|
+
// src/models/olmo_hybrid/modeling_olmo_hybrid.js
|
|
23525
|
+
var OlmoHybridPreTrainedModel = class extends PreTrainedModel {
|
|
23526
|
+
};
|
|
23527
|
+
var OlmoHybridModel = class extends OlmoHybridPreTrainedModel {
|
|
23528
|
+
};
|
|
23529
|
+
var OlmoHybridForCausalLM = class extends OlmoHybridPreTrainedModel {
|
|
23530
|
+
};
|
|
23531
|
+
|
|
22550
23532
|
// src/models/openelm/modeling_openelm.js
|
|
22551
23533
|
var OpenELMPreTrainedModel = class extends PreTrainedModel {
|
|
22552
23534
|
};
|
|
@@ -22580,27 +23562,7 @@ var OwlViTForObjectDetection = class extends OwlViTPreTrainedModel {
|
|
|
22580
23562
|
};
|
|
22581
23563
|
|
|
22582
23564
|
// src/models/paligemma/modeling_paligemma.js
|
|
22583
|
-
var
|
|
22584
|
-
forward_params = [
|
|
22585
|
-
"input_ids",
|
|
22586
|
-
// 'inputs_embeds',
|
|
22587
|
-
"attention_mask",
|
|
22588
|
-
"pixel_values",
|
|
22589
|
-
"position_ids",
|
|
22590
|
-
"past_key_values"
|
|
22591
|
-
];
|
|
22592
|
-
};
|
|
22593
|
-
var PaliGemmaForConditionalGeneration = class extends PaliGemmaPreTrainedModel {
|
|
22594
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
22595
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
22596
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
22597
|
-
return default_merge_input_ids_with_image_features({
|
|
22598
|
-
// @ts-ignore
|
|
22599
|
-
image_token_id: this.config.image_token_index,
|
|
22600
|
-
...kwargs,
|
|
22601
|
-
image_features: reshaped_image_hidden_states
|
|
22602
|
-
});
|
|
22603
|
-
}
|
|
23565
|
+
var PaliGemmaForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
22604
23566
|
};
|
|
22605
23567
|
|
|
22606
23568
|
// src/models/parakeet/modeling_parakeet.js
|
|
@@ -22751,6 +23713,14 @@ var Qwen2Model = class extends Qwen2PreTrainedModel {
|
|
|
22751
23713
|
var Qwen2ForCausalLM = class extends Qwen2PreTrainedModel {
|
|
22752
23714
|
};
|
|
22753
23715
|
|
|
23716
|
+
// src/models/qwen2_moe/modeling_qwen2_moe.js
|
|
23717
|
+
var Qwen2MoePreTrainedModel = class extends PreTrainedModel {
|
|
23718
|
+
};
|
|
23719
|
+
var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
|
|
23720
|
+
};
|
|
23721
|
+
var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
|
|
23722
|
+
};
|
|
23723
|
+
|
|
22754
23724
|
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
22755
23725
|
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
22756
23726
|
forward_params = [
|
|
@@ -22765,6 +23735,9 @@ var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
|
22765
23735
|
];
|
|
22766
23736
|
};
|
|
22767
23737
|
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
23738
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
23739
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
23740
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
22768
23741
|
image_grid_thw_name = "grid_thw";
|
|
22769
23742
|
/**
|
|
22770
23743
|
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
@@ -22954,19 +23927,45 @@ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
|
22954
23927
|
);
|
|
22955
23928
|
} else {
|
|
22956
23929
|
model_inputs.pixel_values = null;
|
|
22957
|
-
const
|
|
22958
|
-
|
|
22959
|
-
|
|
23930
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
23931
|
+
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
23932
|
+
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
23933
|
+
model_inputs.input_ids,
|
|
23934
|
+
model_inputs.image_grid_thw,
|
|
23935
|
+
model_inputs.video_grid_thw,
|
|
23936
|
+
model_inputs.attention_mask
|
|
23937
|
+
);
|
|
23938
|
+
model_inputs.rope_deltas = rope_deltas;
|
|
23939
|
+
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
23940
|
+
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
23941
|
+
} else {
|
|
23942
|
+
if (!model_inputs.rope_deltas) {
|
|
23943
|
+
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
23944
|
+
model_inputs.input_ids,
|
|
23945
|
+
model_inputs.image_grid_thw,
|
|
23946
|
+
model_inputs.video_grid_thw,
|
|
23947
|
+
model_inputs.attention_mask
|
|
23948
|
+
);
|
|
23949
|
+
}
|
|
23950
|
+
const delta = BigInt(past_length);
|
|
23951
|
+
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
23952
|
+
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
23953
|
+
}
|
|
22960
23954
|
}
|
|
22961
23955
|
}
|
|
22962
23956
|
return model_inputs;
|
|
22963
23957
|
}
|
|
22964
23958
|
};
|
|
23959
|
+
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
23960
|
+
};
|
|
22965
23961
|
|
|
22966
23962
|
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
22967
23963
|
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
22968
23964
|
image_grid_thw_name = "image_grid_thw";
|
|
22969
23965
|
};
|
|
23966
|
+
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
23967
|
+
image_grid_thw_name = "image_grid_thw";
|
|
23968
|
+
};
|
|
22970
23969
|
|
|
22971
23970
|
// src/models/qwen3/modeling_qwen3.js
|
|
22972
23971
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
@@ -22976,17 +23975,45 @@ var Qwen3Model = class extends Qwen3PreTrainedModel {
|
|
|
22976
23975
|
var Qwen3ForCausalLM = class extends Qwen3PreTrainedModel {
|
|
22977
23976
|
};
|
|
22978
23977
|
|
|
23978
|
+
// src/models/qwen3_moe/modeling_qwen3_moe.js
|
|
23979
|
+
var Qwen3MoePreTrainedModel = class extends PreTrainedModel {
|
|
23980
|
+
};
|
|
23981
|
+
var Qwen3MoeModel = class extends Qwen3MoePreTrainedModel {
|
|
23982
|
+
};
|
|
23983
|
+
var Qwen3MoeForCausalLM = class extends Qwen3MoePreTrainedModel {
|
|
23984
|
+
};
|
|
23985
|
+
|
|
23986
|
+
// src/models/qwen3_next/modeling_qwen3_next.js
|
|
23987
|
+
var Qwen3NextPreTrainedModel = class extends PreTrainedModel {
|
|
23988
|
+
};
|
|
23989
|
+
var Qwen3NextModel = class extends Qwen3NextPreTrainedModel {
|
|
23990
|
+
};
|
|
23991
|
+
var Qwen3NextForCausalLM = class extends Qwen3NextPreTrainedModel {
|
|
23992
|
+
};
|
|
23993
|
+
|
|
22979
23994
|
// src/models/qwen3_vl/modeling_qwen3_vl.js
|
|
22980
23995
|
var Qwen3VLForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
22981
23996
|
};
|
|
23997
|
+
var Qwen3VLForCausalLM = class extends Qwen2_5_VLForCausalLM {
|
|
23998
|
+
};
|
|
23999
|
+
|
|
24000
|
+
// src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js
|
|
24001
|
+
var Qwen3VLMoeForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
24002
|
+
};
|
|
24003
|
+
var Qwen3VLMoeForCausalLM = class extends Qwen3VLForCausalLM {
|
|
24004
|
+
};
|
|
22982
24005
|
|
|
22983
24006
|
// src/models/qwen3_5/modeling_qwen3_5.js
|
|
22984
24007
|
var Qwen3_5ForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
22985
24008
|
};
|
|
24009
|
+
var Qwen3_5ForCausalLM = class extends Qwen3_5ForConditionalGeneration {
|
|
24010
|
+
};
|
|
22986
24011
|
|
|
22987
24012
|
// src/models/qwen3_5_moe/modeling_qwen3_5_moe.js
|
|
22988
24013
|
var Qwen3_5MoeForConditionalGeneration = class extends Qwen3_5ForConditionalGeneration {
|
|
22989
24014
|
};
|
|
24015
|
+
var Qwen3_5MoeForCausalLM = class extends Qwen3_5ForCausalLM {
|
|
24016
|
+
};
|
|
22990
24017
|
|
|
22991
24018
|
// src/models/resnet/modeling_resnet.js
|
|
22992
24019
|
var ResNetPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -23667,25 +24694,6 @@ var TrOCRPreTrainedModel = class extends PreTrainedModel {
|
|
|
23667
24694
|
var TrOCRForCausalLM = class extends TrOCRPreTrainedModel {
|
|
23668
24695
|
};
|
|
23669
24696
|
|
|
23670
|
-
// src/models/ultravox/modeling_ultravox.js
|
|
23671
|
-
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
23672
|
-
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
23673
|
-
};
|
|
23674
|
-
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
23675
|
-
_merge_input_ids_with_audio_features(kwargs) {
|
|
23676
|
-
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
23677
|
-
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
23678
|
-
return default_merge_input_ids_with_audio_features({
|
|
23679
|
-
// @ts-ignore
|
|
23680
|
-
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id,
|
|
23681
|
-
...kwargs,
|
|
23682
|
-
audio_features: reshaped_audio_features
|
|
23683
|
-
});
|
|
23684
|
-
}
|
|
23685
|
-
};
|
|
23686
|
-
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
23687
|
-
};
|
|
23688
|
-
|
|
23689
24697
|
// src/models/unispeech/modeling_unispeech.js
|
|
23690
24698
|
var UniSpeechPreTrainedModel = class extends PreTrainedModel {
|
|
23691
24699
|
};
|
|
@@ -23851,6 +24859,170 @@ var VitsModel = class extends VitsPreTrainedModel {
|
|
|
23851
24859
|
}
|
|
23852
24860
|
};
|
|
23853
24861
|
|
|
24862
|
+
// src/models/voxtral/modeling_voxtral.js
|
|
24863
|
+
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
24864
|
+
};
|
|
24865
|
+
|
|
24866
|
+
// src/models/voxtral_realtime/modeling_voxtral_realtime.js
|
|
24867
|
+
var CONV1_LEFT_PAD = 2;
|
|
24868
|
+
var CONV2_LEFT_PAD = 1;
|
|
24869
|
+
var states = /* @__PURE__ */ new WeakMap();
|
|
24870
|
+
function createEncoderState(model, input_features) {
|
|
24871
|
+
const { text_config, audio_config } = (
|
|
24872
|
+
/** @type {any} */
|
|
24873
|
+
model.config
|
|
24874
|
+
);
|
|
24875
|
+
const encoder_session = model.sessions["audio_encoder"];
|
|
24876
|
+
const { num_mel_bins, hidden_size: enc_hidden_size } = audio_config;
|
|
24877
|
+
const PADDING_CACHE_CHANNELS = num_mel_bins + enc_hidden_size;
|
|
24878
|
+
const enc_kv_cache = new DynamicCache();
|
|
24879
|
+
const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? "float32";
|
|
24880
|
+
const enc_cls = enc_dtype === "float16" ? DataTypeMap.float16 : DataTypeMap.float32;
|
|
24881
|
+
const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
|
|
24882
|
+
for (const name in enc_shapes) {
|
|
24883
|
+
const size = enc_shapes[name].reduce((a, b) => a * b, 1);
|
|
24884
|
+
enc_kv_cache[name] = new Tensor2(enc_dtype, new enc_cls(size), enc_shapes[name]);
|
|
24885
|
+
}
|
|
24886
|
+
const enc_padding_cache = new Tensor2(enc_dtype, new enc_cls(PADDING_CACHE_CHANNELS * CONV1_LEFT_PAD), [
|
|
24887
|
+
1,
|
|
24888
|
+
PADDING_CACHE_CHANNELS,
|
|
24889
|
+
CONV1_LEFT_PAD
|
|
24890
|
+
]);
|
|
24891
|
+
const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
|
|
24892
|
+
if (!chunks_iter) {
|
|
24893
|
+
throw new Error("input_features must be iterable or async iterable");
|
|
24894
|
+
}
|
|
24895
|
+
return {
|
|
24896
|
+
encoder_session,
|
|
24897
|
+
enc_kv_cache,
|
|
24898
|
+
enc_padding_cache,
|
|
24899
|
+
enc_past_seq_len: 0,
|
|
24900
|
+
audio_embed_queue: [],
|
|
24901
|
+
audio_embed_total_tokens: 0,
|
|
24902
|
+
audio_queue_offset: 0,
|
|
24903
|
+
audio_consumed: 0,
|
|
24904
|
+
stream_exhausted: false,
|
|
24905
|
+
chunks_iter,
|
|
24906
|
+
text_hidden_size: text_config.hidden_size
|
|
24907
|
+
};
|
|
24908
|
+
}
|
|
24909
|
+
async function encodeChunk(s, chunk_features) {
|
|
24910
|
+
const audio_seq_len = chunk_features.dims[2];
|
|
24911
|
+
const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1;
|
|
24912
|
+
const position_ids = new Tensor2(
|
|
24913
|
+
"int64",
|
|
24914
|
+
BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
|
|
24915
|
+
[1, conv2_output_len]
|
|
24916
|
+
);
|
|
24917
|
+
const total_seq_len = s.enc_past_seq_len + conv2_output_len;
|
|
24918
|
+
const attention_mask = ones([1, total_seq_len]);
|
|
24919
|
+
const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
|
|
24920
|
+
input_features: chunk_features,
|
|
24921
|
+
attention_mask,
|
|
24922
|
+
position_ids,
|
|
24923
|
+
past_padding_cache: s.enc_padding_cache,
|
|
24924
|
+
...s.enc_kv_cache
|
|
24925
|
+
});
|
|
24926
|
+
if (s.enc_padding_cache.location === "gpu-buffer") {
|
|
24927
|
+
s.enc_padding_cache.dispose();
|
|
24928
|
+
}
|
|
24929
|
+
s.enc_padding_cache = present_padding_cache;
|
|
24930
|
+
for (const name in present_cache) {
|
|
24931
|
+
if (name.startsWith("present.")) {
|
|
24932
|
+
const pastName = name.replace("present", "past_key_values");
|
|
24933
|
+
const prev = s.enc_kv_cache[pastName];
|
|
24934
|
+
if (prev?.location === "gpu-buffer") {
|
|
24935
|
+
prev.dispose();
|
|
24936
|
+
}
|
|
24937
|
+
s.enc_kv_cache[pastName] = present_cache[name];
|
|
24938
|
+
}
|
|
24939
|
+
}
|
|
24940
|
+
s.enc_past_seq_len = total_seq_len;
|
|
24941
|
+
return audio_embeds;
|
|
24942
|
+
}
|
|
24943
|
+
async function fillAudioBuffer(s, needed) {
|
|
24944
|
+
while (s.audio_embed_total_tokens < needed && !s.stream_exhausted) {
|
|
24945
|
+
const result = await s.chunks_iter.next();
|
|
24946
|
+
if (result.done) {
|
|
24947
|
+
s.stream_exhausted = true;
|
|
24948
|
+
break;
|
|
24949
|
+
}
|
|
24950
|
+
const new_embeds = await encodeChunk(s, result.value);
|
|
24951
|
+
s.audio_embed_queue.push({ data: new_embeds.data, tokens: new_embeds.dims[1] });
|
|
24952
|
+
s.audio_embed_total_tokens += new_embeds.dims[1];
|
|
24953
|
+
}
|
|
24954
|
+
}
|
|
24955
|
+
function addAudioEmbeddings(s, inputs_embeds, current_len) {
|
|
24956
|
+
if (s.audio_embed_queue.length === 0) return;
|
|
24957
|
+
const embed_data = inputs_embeds.data;
|
|
24958
|
+
let embed_write_pos = 0;
|
|
24959
|
+
let remaining = current_len;
|
|
24960
|
+
while (remaining > 0 && s.audio_embed_queue.length > 0) {
|
|
24961
|
+
const front = s.audio_embed_queue[0];
|
|
24962
|
+
const available = front.tokens - s.audio_queue_offset;
|
|
24963
|
+
const n = Math.min(remaining, available);
|
|
24964
|
+
const src_offset = s.audio_queue_offset * s.text_hidden_size;
|
|
24965
|
+
for (let i = 0; i < n * s.text_hidden_size; ++i) {
|
|
24966
|
+
embed_data[embed_write_pos * s.text_hidden_size + i] += front.data[src_offset + i];
|
|
24967
|
+
}
|
|
24968
|
+
embed_write_pos += n;
|
|
24969
|
+
remaining -= n;
|
|
24970
|
+
s.audio_queue_offset += n;
|
|
24971
|
+
if (s.audio_queue_offset >= front.tokens) {
|
|
24972
|
+
s.audio_embed_queue.shift();
|
|
24973
|
+
s.audio_queue_offset = 0;
|
|
24974
|
+
}
|
|
24975
|
+
}
|
|
24976
|
+
s.audio_consumed += current_len - remaining;
|
|
24977
|
+
}
|
|
24978
|
+
var AudioExhaustedCriteria = class extends StoppingCriteria {
|
|
24979
|
+
constructor(enc_state) {
|
|
24980
|
+
super();
|
|
24981
|
+
this._s = enc_state;
|
|
24982
|
+
}
|
|
24983
|
+
_call(input_ids) {
|
|
24984
|
+
const done = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
|
|
24985
|
+
return input_ids.map(() => done);
|
|
24986
|
+
}
|
|
24987
|
+
};
|
|
24988
|
+
var VoxtralRealtimePreTrainedModel = class extends PreTrainedModel {
|
|
24989
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "past_key_values"];
|
|
24990
|
+
};
|
|
24991
|
+
var VoxtralRealtimeForConditionalGeneration = class extends VoxtralRealtimePreTrainedModel {
|
|
24992
|
+
async forward({ input_ids, past_key_values, ...kwargs }) {
|
|
24993
|
+
const current_len = input_ids.dims[1];
|
|
24994
|
+
const enc = states.get(this);
|
|
24995
|
+
if (enc) {
|
|
24996
|
+
await fillAudioBuffer(enc, enc.audio_consumed + current_len);
|
|
24997
|
+
}
|
|
24998
|
+
const { inputs_embeds } = await sessionRun(this.sessions["embed_tokens"], { input_ids });
|
|
24999
|
+
if (enc) {
|
|
25000
|
+
addAudioEmbeddings(enc, inputs_embeds, current_len);
|
|
25001
|
+
}
|
|
25002
|
+
const decoder_feeds = { inputs_embeds, ...kwargs };
|
|
25003
|
+
this.addPastKeyValues(decoder_feeds, past_key_values);
|
|
25004
|
+
const session = this.sessions["decoder_model_merged"];
|
|
25005
|
+
const fixed = pick(decoder_feeds, session.inputNames);
|
|
25006
|
+
return await sessionRun(session, fixed);
|
|
25007
|
+
}
|
|
25008
|
+
async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
|
|
25009
|
+
if (!input_features) {
|
|
25010
|
+
throw new Error("input_features (generator/iterable) must be provided");
|
|
25011
|
+
}
|
|
25012
|
+
const enc_state = createEncoderState(this, input_features);
|
|
25013
|
+
states.set(this, enc_state);
|
|
25014
|
+
const stopping_criteria = new StoppingCriteriaList();
|
|
25015
|
+
stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
|
|
25016
|
+
if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);
|
|
25017
|
+
try {
|
|
25018
|
+
return await super.generate({ ...kwargs, stopping_criteria });
|
|
25019
|
+
} finally {
|
|
25020
|
+
enc_state.enc_kv_cache.dispose();
|
|
25021
|
+
states.delete(this);
|
|
25022
|
+
}
|
|
25023
|
+
}
|
|
25024
|
+
};
|
|
25025
|
+
|
|
23854
25026
|
// src/models/wav2vec2_bert/modeling_wav2vec2_bert.js
|
|
23855
25027
|
var Wav2Vec2BertPreTrainedModel = class extends PreTrainedModel {
|
|
23856
25028
|
};
|
|
@@ -24475,6 +25647,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
24475
25647
|
["olmo", "OlmoModel"],
|
|
24476
25648
|
["olmo2", "Olmo2Model"],
|
|
24477
25649
|
["olmo3", "Olmo3Model"],
|
|
25650
|
+
["olmo_hybrid", "OlmoHybridModel"],
|
|
24478
25651
|
["mobilellm", "MobileLLMModel"],
|
|
24479
25652
|
["granite", "GraniteModel"],
|
|
24480
25653
|
["granitemoehybrid", "GraniteMoeHybridModel"],
|
|
@@ -24488,7 +25661,10 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
24488
25661
|
["glm", "GlmModel"],
|
|
24489
25662
|
["openelm", "OpenELMModel"],
|
|
24490
25663
|
["qwen2", "Qwen2Model"],
|
|
25664
|
+
["qwen2_moe", "Qwen2MoeModel"],
|
|
24491
25665
|
["qwen3", "Qwen3Model"],
|
|
25666
|
+
["qwen3_moe", "Qwen3MoeModel"],
|
|
25667
|
+
["qwen3_next", "Qwen3NextModel"],
|
|
24492
25668
|
["phi", "PhiModel"],
|
|
24493
25669
|
["phi3", "Phi3Model"],
|
|
24494
25670
|
["mpt", "MptModel"],
|
|
@@ -24496,7 +25672,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
24496
25672
|
["mistral", "MistralModel"],
|
|
24497
25673
|
["ministral", "MinistralModel"],
|
|
24498
25674
|
["ministral3", "Ministral3Model"],
|
|
24499
|
-
["ernie4_5", "
|
|
25675
|
+
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
24500
25676
|
["starcoder2", "Starcoder2Model"],
|
|
24501
25677
|
["falcon", "FalconModel"],
|
|
24502
25678
|
["falcon_h1", "FalconH1Model"],
|
|
@@ -24590,6 +25766,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24590
25766
|
["olmo", "OlmoForCausalLM"],
|
|
24591
25767
|
["olmo2", "Olmo2ForCausalLM"],
|
|
24592
25768
|
["olmo3", "Olmo3ForCausalLM"],
|
|
25769
|
+
["olmo_hybrid", "OlmoHybridForCausalLM"],
|
|
24593
25770
|
["mobilellm", "MobileLLMForCausalLM"],
|
|
24594
25771
|
["granite", "GraniteForCausalLM"],
|
|
24595
25772
|
["granitemoehybrid", "GraniteMoeHybridForCausalLM"],
|
|
@@ -24599,11 +25776,22 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24599
25776
|
["gemma2", "Gemma2ForCausalLM"],
|
|
24600
25777
|
["vaultgemma", "VaultGemmaForCausalLM"],
|
|
24601
25778
|
["gemma3_text", "Gemma3ForCausalLM"],
|
|
25779
|
+
["gemma3", "Gemma3ForCausalLM"],
|
|
24602
25780
|
["helium", "HeliumForCausalLM"],
|
|
24603
25781
|
["glm", "GlmForCausalLM"],
|
|
24604
25782
|
["openelm", "OpenELMForCausalLM"],
|
|
24605
25783
|
["qwen2", "Qwen2ForCausalLM"],
|
|
25784
|
+
["qwen2_moe", "Qwen2MoeForCausalLM"],
|
|
24606
25785
|
["qwen3", "Qwen3ForCausalLM"],
|
|
25786
|
+
["qwen3_moe", "Qwen3MoeForCausalLM"],
|
|
25787
|
+
["qwen3_next", "Qwen3NextForCausalLM"],
|
|
25788
|
+
["qwen2_vl", "Qwen2VLForCausalLM"],
|
|
25789
|
+
["qwen2_5_vl", "Qwen2_5_VLForCausalLM"],
|
|
25790
|
+
["qwen3_vl", "Qwen3VLForCausalLM"],
|
|
25791
|
+
["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
|
|
25792
|
+
["qwen3_5", "Qwen3_5ForCausalLM"],
|
|
25793
|
+
["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
|
|
25794
|
+
["gemma3n", "Gemma3nForCausalLM"],
|
|
24607
25795
|
["phi", "PhiForCausalLM"],
|
|
24608
25796
|
["phi3", "Phi3ForCausalLM"],
|
|
24609
25797
|
["mpt", "MptForCausalLM"],
|
|
@@ -24612,7 +25800,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24612
25800
|
["mistral", "MistralForCausalLM"],
|
|
24613
25801
|
["ministral", "MinistralForCausalLM"],
|
|
24614
25802
|
["ministral3", "Ministral3ForCausalLM"],
|
|
24615
|
-
["ernie4_5", "
|
|
25803
|
+
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
24616
25804
|
["starcoder2", "Starcoder2ForCausalLM"],
|
|
24617
25805
|
["falcon", "FalconForCausalLM"],
|
|
24618
25806
|
["falcon_h1", "FalconH1ForCausalLM"],
|
|
@@ -24676,8 +25864,10 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24676
25864
|
["qwen2_vl", "Qwen2VLForConditionalGeneration"],
|
|
24677
25865
|
["qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"],
|
|
24678
25866
|
["qwen3_vl", "Qwen3VLForConditionalGeneration"],
|
|
25867
|
+
["qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"],
|
|
24679
25868
|
["qwen3_5", "Qwen3_5ForConditionalGeneration"],
|
|
24680
25869
|
["qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"],
|
|
25870
|
+
["lfm2_vl", "Lfm2VlForConditionalGeneration"],
|
|
24681
25871
|
["idefics3", "Idefics3ForConditionalGeneration"],
|
|
24682
25872
|
["smolvlm", "SmolVLMForConditionalGeneration"],
|
|
24683
25873
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
@@ -24686,8 +25876,10 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
24686
25876
|
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
24687
25877
|
]);
|
|
24688
25878
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
25879
|
+
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
24689
25880
|
["ultravox", "UltravoxModel"],
|
|
24690
|
-
["voxtral", "VoxtralForConditionalGeneration"]
|
|
25881
|
+
["voxtral", "VoxtralForConditionalGeneration"],
|
|
25882
|
+
["voxtral_realtime", "VoxtralRealtimeForConditionalGeneration"]
|
|
24691
25883
|
]);
|
|
24692
25884
|
var MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
24693
25885
|
["vision-encoder-decoder", "VisionEncoderDecoderModel"]
|
|
@@ -24870,24 +26062,37 @@ var CUSTOM_MAPPING = [
|
|
|
24870
26062
|
MODEL_TYPES.ImageAudioTextToText
|
|
24871
26063
|
],
|
|
24872
26064
|
["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
|
|
24873
|
-
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox]
|
|
26065
|
+
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
|
|
26066
|
+
["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26067
|
+
["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26068
|
+
["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26069
|
+
["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26070
|
+
["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26071
|
+
["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26072
|
+
["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26073
|
+
[
|
|
26074
|
+
"VoxtralRealtimeForConditionalGeneration",
|
|
26075
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
26076
|
+
MODEL_TYPES.VoxtralRealtime
|
|
26077
|
+
]
|
|
24874
26078
|
];
|
|
24875
26079
|
for (const [name, model, type] of CUSTOM_MAPPING) {
|
|
24876
26080
|
MODEL_TYPE_MAPPING.set(name, type);
|
|
24877
26081
|
MODEL_CLASS_TO_NAME_MAPPING.set(model, name);
|
|
24878
26082
|
MODEL_NAME_TO_CLASS_MAPPING.set(name, model);
|
|
24879
26083
|
}
|
|
24880
|
-
var
|
|
26084
|
+
var CUSTOM_ARCHITECTURES_MAPPING = /* @__PURE__ */ new Map([
|
|
24881
26085
|
["modnet", MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
|
|
24882
26086
|
["birefnet", MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
|
|
24883
26087
|
["isnet", MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
|
|
24884
26088
|
["ben", MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES]
|
|
24885
26089
|
]);
|
|
24886
|
-
for (const [name, mapping] of
|
|
26090
|
+
for (const [name, mapping] of CUSTOM_ARCHITECTURES_MAPPING.entries()) {
|
|
24887
26091
|
mapping.set(name, "PreTrainedModel");
|
|
24888
26092
|
MODEL_TYPE_MAPPING.set(name, MODEL_TYPES.EncoderOnly);
|
|
24889
26093
|
MODEL_NAME_TO_CLASS_MAPPING.set(name, PreTrainedModel);
|
|
24890
26094
|
}
|
|
26095
|
+
var CUSTOM_ARCHITECTURES = new Set(CUSTOM_ARCHITECTURES_MAPPING.keys());
|
|
24891
26096
|
MODEL_TYPE_MAPPING.set("PreTrainedModel", MODEL_TYPES.EncoderOnly);
|
|
24892
26097
|
MODEL_CLASS_TO_NAME_MAPPING.set(PreTrainedModel, "PreTrainedModel");
|
|
24893
26098
|
var MODEL_MAPPINGS = {
|
|
@@ -24936,6 +26141,18 @@ var PretrainedMixin = class {
|
|
|
24936
26141
|
* the model type is not found in the mapping.
|
|
24937
26142
|
*/
|
|
24938
26143
|
static BASE_IF_FAIL = false;
|
|
26144
|
+
/**
|
|
26145
|
+
* Check whether this AutoModel class supports a given model type.
|
|
26146
|
+
* @param {string} model_type The model type from config (e.g., 'bert', 'whisper').
|
|
26147
|
+
* @returns {boolean} Whether this class can handle the given model type.
|
|
26148
|
+
*/
|
|
26149
|
+
static supports(model_type) {
|
|
26150
|
+
if (!this.MODEL_CLASS_MAPPINGS) return false;
|
|
26151
|
+
for (const mapping of this.MODEL_CLASS_MAPPINGS) {
|
|
26152
|
+
if (mapping.has(model_type)) return true;
|
|
26153
|
+
}
|
|
26154
|
+
return this.BASE_IF_FAIL;
|
|
26155
|
+
}
|
|
24939
26156
|
/** @type {typeof PreTrainedModel.from_pretrained} */
|
|
24940
26157
|
static async from_pretrained(pretrained_model_name_or_path, {
|
|
24941
26158
|
progress_callback = null,
|
|
@@ -24967,7 +26184,7 @@ var PretrainedMixin = class {
|
|
|
24967
26184
|
if (!this.MODEL_CLASS_MAPPINGS) {
|
|
24968
26185
|
throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: " + this.name);
|
|
24969
26186
|
}
|
|
24970
|
-
const model_type = options.config
|
|
26187
|
+
const { model_type } = options.config;
|
|
24971
26188
|
for (const MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) {
|
|
24972
26189
|
let modelInfo = MODEL_CLASS_MAPPING.get(model_type);
|
|
24973
26190
|
if (!modelInfo) {
|
|
@@ -26319,40 +27536,30 @@ Pipeline {
|
|
|
26319
27536
|
// src/pipelines/index.js
|
|
26320
27537
|
var SUPPORTED_TASKS = Object.freeze({
|
|
26321
27538
|
"text-classification": {
|
|
26322
|
-
tokenizer: AutoTokenizer,
|
|
26323
27539
|
pipeline: TextClassificationPipeline,
|
|
26324
27540
|
model: AutoModelForSequenceClassification,
|
|
26325
27541
|
default: {
|
|
26326
|
-
// TODO: replace with original
|
|
26327
|
-
// "model": "distilbert-base-uncased-finetuned-sst-2-english",
|
|
26328
27542
|
model: "Xenova/distilbert-base-uncased-finetuned-sst-2-english"
|
|
26329
27543
|
},
|
|
26330
27544
|
type: "text"
|
|
26331
27545
|
},
|
|
26332
27546
|
"token-classification": {
|
|
26333
|
-
tokenizer: AutoTokenizer,
|
|
26334
27547
|
pipeline: TokenClassificationPipeline,
|
|
26335
27548
|
model: AutoModelForTokenClassification,
|
|
26336
27549
|
default: {
|
|
26337
|
-
// TODO: replace with original
|
|
26338
|
-
// "model": "Davlan/bert-base-multilingual-cased-ner-hrl",
|
|
26339
27550
|
model: "Xenova/bert-base-multilingual-cased-ner-hrl"
|
|
26340
27551
|
},
|
|
26341
27552
|
type: "text"
|
|
26342
27553
|
},
|
|
26343
27554
|
"question-answering": {
|
|
26344
|
-
tokenizer: AutoTokenizer,
|
|
26345
27555
|
pipeline: QuestionAnsweringPipeline,
|
|
26346
27556
|
model: AutoModelForQuestionAnswering,
|
|
26347
27557
|
default: {
|
|
26348
|
-
// TODO: replace with original
|
|
26349
|
-
// "model": "distilbert-base-cased-distilled-squad",
|
|
26350
27558
|
model: "Xenova/distilbert-base-cased-distilled-squad"
|
|
26351
27559
|
},
|
|
26352
27560
|
type: "text"
|
|
26353
27561
|
},
|
|
26354
27562
|
"fill-mask": {
|
|
26355
|
-
tokenizer: AutoTokenizer,
|
|
26356
27563
|
pipeline: FillMaskPipeline,
|
|
26357
27564
|
model: AutoModelForMaskedLM,
|
|
26358
27565
|
default: {
|
|
@@ -26362,40 +27569,30 @@ var SUPPORTED_TASKS = Object.freeze({
|
|
|
26362
27569
|
type: "text"
|
|
26363
27570
|
},
|
|
26364
27571
|
summarization: {
|
|
26365
|
-
tokenizer: AutoTokenizer,
|
|
26366
27572
|
pipeline: SummarizationPipeline,
|
|
26367
27573
|
model: AutoModelForSeq2SeqLM,
|
|
26368
27574
|
default: {
|
|
26369
|
-
// TODO: replace with original
|
|
26370
|
-
// "model": "sshleifer/distilbart-cnn-6-6",
|
|
26371
27575
|
model: "Xenova/distilbart-cnn-6-6"
|
|
26372
27576
|
},
|
|
26373
27577
|
type: "text"
|
|
26374
27578
|
},
|
|
26375
27579
|
translation: {
|
|
26376
|
-
tokenizer: AutoTokenizer,
|
|
26377
27580
|
pipeline: TranslationPipeline,
|
|
26378
27581
|
model: AutoModelForSeq2SeqLM,
|
|
26379
27582
|
default: {
|
|
26380
|
-
// TODO: replace with original
|
|
26381
|
-
// "model": "t5-small",
|
|
26382
27583
|
model: "Xenova/t5-small"
|
|
26383
27584
|
},
|
|
26384
27585
|
type: "text"
|
|
26385
27586
|
},
|
|
26386
27587
|
"text2text-generation": {
|
|
26387
|
-
tokenizer: AutoTokenizer,
|
|
26388
27588
|
pipeline: Text2TextGenerationPipeline,
|
|
26389
27589
|
model: AutoModelForSeq2SeqLM,
|
|
26390
27590
|
default: {
|
|
26391
|
-
// TODO: replace with original
|
|
26392
|
-
// "model": "google/flan-t5-small",
|
|
26393
27591
|
model: "Xenova/flan-t5-small"
|
|
26394
27592
|
},
|
|
26395
27593
|
type: "text"
|
|
26396
27594
|
},
|
|
26397
27595
|
"text-generation": {
|
|
26398
|
-
tokenizer: AutoTokenizer,
|
|
26399
27596
|
pipeline: TextGenerationPipeline,
|
|
26400
27597
|
model: AutoModelForCausalLM,
|
|
26401
27598
|
default: {
|
|
@@ -26405,12 +27602,9 @@ var SUPPORTED_TASKS = Object.freeze({
|
|
|
26405
27602
|
type: "text"
|
|
26406
27603
|
},
|
|
26407
27604
|
"zero-shot-classification": {
|
|
26408
|
-
tokenizer: AutoTokenizer,
|
|
26409
27605
|
pipeline: ZeroShotClassificationPipeline,
|
|
26410
27606
|
model: AutoModelForSequenceClassification,
|
|
26411
27607
|
default: {
|
|
26412
|
-
// TODO: replace with original
|
|
26413
|
-
// "model": "typeform/distilbert-base-uncased-mnli",
|
|
26414
27608
|
model: "Xenova/distilbert-base-uncased-mnli"
|
|
26415
27609
|
},
|
|
26416
27610
|
type: "text"
|
|
@@ -26418,47 +27612,30 @@ var SUPPORTED_TASKS = Object.freeze({
|
|
|
26418
27612
|
"audio-classification": {
|
|
26419
27613
|
pipeline: AudioClassificationPipeline,
|
|
26420
27614
|
model: AutoModelForAudioClassification,
|
|
26421
|
-
processor: AutoProcessor,
|
|
26422
27615
|
default: {
|
|
26423
|
-
// TODO: replace with original
|
|
26424
|
-
// "model": "superb/wav2vec2-base-superb-ks",
|
|
26425
27616
|
model: "Xenova/wav2vec2-base-superb-ks"
|
|
26426
27617
|
},
|
|
26427
27618
|
type: "audio"
|
|
26428
27619
|
},
|
|
26429
27620
|
"zero-shot-audio-classification": {
|
|
26430
|
-
tokenizer: AutoTokenizer,
|
|
26431
27621
|
pipeline: ZeroShotAudioClassificationPipeline,
|
|
26432
27622
|
model: AutoModel,
|
|
26433
|
-
processor: AutoProcessor,
|
|
26434
27623
|
default: {
|
|
26435
|
-
// TODO: replace with original
|
|
26436
|
-
// "model": "laion/clap-htsat-fused",
|
|
26437
27624
|
model: "Xenova/clap-htsat-unfused"
|
|
26438
27625
|
},
|
|
26439
27626
|
type: "multimodal"
|
|
26440
27627
|
},
|
|
26441
27628
|
"automatic-speech-recognition": {
|
|
26442
|
-
tokenizer: AutoTokenizer,
|
|
26443
27629
|
pipeline: AutomaticSpeechRecognitionPipeline,
|
|
26444
27630
|
model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
|
|
26445
|
-
processor: AutoProcessor,
|
|
26446
27631
|
default: {
|
|
26447
|
-
// TODO: replace with original
|
|
26448
|
-
// "model": "openai/whisper-tiny.en",
|
|
26449
27632
|
model: "Xenova/whisper-tiny.en"
|
|
26450
27633
|
},
|
|
26451
27634
|
type: "multimodal"
|
|
26452
27635
|
},
|
|
26453
27636
|
"text-to-audio": {
|
|
26454
|
-
tokenizer: AutoTokenizer,
|
|
26455
27637
|
pipeline: TextToAudioPipeline,
|
|
26456
27638
|
model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
|
|
26457
|
-
processor: [
|
|
26458
|
-
AutoProcessor,
|
|
26459
|
-
/* Some don't use a processor */
|
|
26460
|
-
null
|
|
26461
|
-
],
|
|
26462
27639
|
default: {
|
|
26463
27640
|
model: "onnx-community/Supertonic-TTS-ONNX",
|
|
26464
27641
|
dtype: "fp32"
|
|
@@ -26466,124 +27643,86 @@ var SUPPORTED_TASKS = Object.freeze({
|
|
|
26466
27643
|
type: "text"
|
|
26467
27644
|
},
|
|
26468
27645
|
"image-to-text": {
|
|
26469
|
-
tokenizer: AutoTokenizer,
|
|
26470
27646
|
pipeline: ImageToTextPipeline,
|
|
26471
27647
|
model: AutoModelForVision2Seq,
|
|
26472
|
-
processor: AutoProcessor,
|
|
26473
27648
|
default: {
|
|
26474
|
-
// TODO: replace with original
|
|
26475
|
-
// "model": "nlpconnect/vit-gpt2-image-captioning",
|
|
26476
27649
|
model: "Xenova/vit-gpt2-image-captioning"
|
|
26477
27650
|
},
|
|
26478
27651
|
type: "multimodal"
|
|
26479
27652
|
},
|
|
26480
27653
|
"image-classification": {
|
|
26481
|
-
// no tokenizer
|
|
26482
27654
|
pipeline: ImageClassificationPipeline,
|
|
26483
27655
|
model: AutoModelForImageClassification,
|
|
26484
|
-
processor: AutoProcessor,
|
|
26485
27656
|
default: {
|
|
26486
|
-
// TODO: replace with original
|
|
26487
|
-
// "model": "google/vit-base-patch16-224",
|
|
26488
27657
|
model: "Xenova/vit-base-patch16-224"
|
|
26489
27658
|
},
|
|
26490
27659
|
type: "multimodal"
|
|
26491
27660
|
},
|
|
26492
27661
|
"image-segmentation": {
|
|
26493
|
-
// no tokenizer
|
|
26494
27662
|
pipeline: ImageSegmentationPipeline,
|
|
26495
27663
|
model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
|
|
26496
|
-
processor: AutoProcessor,
|
|
26497
27664
|
default: {
|
|
26498
|
-
// TODO: replace with original
|
|
26499
|
-
// "model": "facebook/detr-resnet-50-panoptic",
|
|
26500
27665
|
model: "Xenova/detr-resnet-50-panoptic"
|
|
26501
27666
|
},
|
|
26502
27667
|
type: "multimodal"
|
|
26503
27668
|
},
|
|
26504
27669
|
"background-removal": {
|
|
26505
|
-
// no tokenizer
|
|
26506
27670
|
pipeline: BackgroundRemovalPipeline,
|
|
26507
27671
|
model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
|
|
26508
|
-
processor: AutoProcessor,
|
|
26509
27672
|
default: {
|
|
26510
27673
|
model: "Xenova/modnet"
|
|
26511
27674
|
},
|
|
26512
27675
|
type: "image"
|
|
26513
27676
|
},
|
|
26514
27677
|
"zero-shot-image-classification": {
|
|
26515
|
-
tokenizer: AutoTokenizer,
|
|
26516
27678
|
pipeline: ZeroShotImageClassificationPipeline,
|
|
26517
27679
|
model: AutoModel,
|
|
26518
|
-
processor: AutoProcessor,
|
|
26519
27680
|
default: {
|
|
26520
|
-
// TODO: replace with original
|
|
26521
|
-
// "model": "openai/clip-vit-base-patch32",
|
|
26522
27681
|
model: "Xenova/clip-vit-base-patch32"
|
|
26523
27682
|
},
|
|
26524
27683
|
type: "multimodal"
|
|
26525
27684
|
},
|
|
26526
27685
|
"object-detection": {
|
|
26527
|
-
// no tokenizer
|
|
26528
27686
|
pipeline: ObjectDetectionPipeline,
|
|
26529
27687
|
model: AutoModelForObjectDetection,
|
|
26530
|
-
processor: AutoProcessor,
|
|
26531
27688
|
default: {
|
|
26532
|
-
// TODO: replace with original
|
|
26533
|
-
// "model": "facebook/detr-resnet-50",
|
|
26534
27689
|
model: "Xenova/detr-resnet-50"
|
|
26535
27690
|
},
|
|
26536
27691
|
type: "multimodal"
|
|
26537
27692
|
},
|
|
26538
27693
|
"zero-shot-object-detection": {
|
|
26539
|
-
tokenizer: AutoTokenizer,
|
|
26540
27694
|
pipeline: ZeroShotObjectDetectionPipeline,
|
|
26541
27695
|
model: AutoModelForZeroShotObjectDetection,
|
|
26542
|
-
processor: AutoProcessor,
|
|
26543
27696
|
default: {
|
|
26544
|
-
// TODO: replace with original
|
|
26545
|
-
// "model": "google/owlvit-base-patch32",
|
|
26546
27697
|
model: "Xenova/owlvit-base-patch32"
|
|
26547
27698
|
},
|
|
26548
27699
|
type: "multimodal"
|
|
26549
27700
|
},
|
|
26550
27701
|
"document-question-answering": {
|
|
26551
|
-
tokenizer: AutoTokenizer,
|
|
26552
27702
|
pipeline: DocumentQuestionAnsweringPipeline,
|
|
26553
27703
|
model: AutoModelForDocumentQuestionAnswering,
|
|
26554
|
-
processor: AutoProcessor,
|
|
26555
27704
|
default: {
|
|
26556
|
-
// TODO: replace with original
|
|
26557
|
-
// "model": "naver-clova-ix/donut-base-finetuned-docvqa",
|
|
26558
27705
|
model: "Xenova/donut-base-finetuned-docvqa"
|
|
26559
27706
|
},
|
|
26560
27707
|
type: "multimodal"
|
|
26561
27708
|
},
|
|
26562
27709
|
"image-to-image": {
|
|
26563
|
-
// no tokenizer
|
|
26564
27710
|
pipeline: ImageToImagePipeline,
|
|
26565
27711
|
model: AutoModelForImageToImage,
|
|
26566
|
-
processor: AutoProcessor,
|
|
26567
27712
|
default: {
|
|
26568
|
-
// TODO: replace with original
|
|
26569
|
-
// "model": "caidas/swin2SR-classical-sr-x2-64",
|
|
26570
27713
|
model: "Xenova/swin2SR-classical-sr-x2-64"
|
|
26571
27714
|
},
|
|
26572
27715
|
type: "image"
|
|
26573
27716
|
},
|
|
26574
27717
|
"depth-estimation": {
|
|
26575
|
-
// no tokenizer
|
|
26576
27718
|
pipeline: DepthEstimationPipeline,
|
|
26577
27719
|
model: AutoModelForDepthEstimation,
|
|
26578
|
-
processor: AutoProcessor,
|
|
26579
27720
|
default: {
|
|
26580
27721
|
model: "onnx-community/depth-anything-v2-small"
|
|
26581
27722
|
},
|
|
26582
27723
|
type: "image"
|
|
26583
27724
|
},
|
|
26584
|
-
// This task serves as a useful interface for dealing with sentence-transformers (https://huggingface.co/sentence-transformers).
|
|
26585
27725
|
"feature-extraction": {
|
|
26586
|
-
tokenizer: AutoTokenizer,
|
|
26587
27726
|
pipeline: FeatureExtractionPipeline,
|
|
26588
27727
|
model: AutoModel,
|
|
26589
27728
|
default: {
|
|
@@ -26593,7 +27732,6 @@ var SUPPORTED_TASKS = Object.freeze({
|
|
|
26593
27732
|
type: "text"
|
|
26594
27733
|
},
|
|
26595
27734
|
"image-feature-extraction": {
|
|
26596
|
-
processor: AutoProcessor,
|
|
26597
27735
|
pipeline: ImageFeatureExtractionPipeline,
|
|
26598
27736
|
model: [AutoModelForImageFeatureExtraction, AutoModel],
|
|
26599
27737
|
default: {
|
|
@@ -26614,8 +27752,18 @@ var TASK_ALIASES = Object.freeze({
|
|
|
26614
27752
|
});
|
|
26615
27753
|
|
|
26616
27754
|
// src/utils/model_registry/get_model_files.js
|
|
27755
|
+
function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
|
|
27756
|
+
if (config !== null) {
|
|
27757
|
+
return AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision });
|
|
27758
|
+
}
|
|
27759
|
+
const key = JSON.stringify([modelId, cache_dir, local_files_only, revision]);
|
|
27760
|
+
return memoizePromise(
|
|
27761
|
+
key,
|
|
27762
|
+
() => AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision })
|
|
27763
|
+
);
|
|
27764
|
+
}
|
|
26617
27765
|
async function get_model_files(modelId, { config = null, dtype: overrideDtype = null, device: overrideDevice = null, model_file_name = null } = {}) {
|
|
26618
|
-
config = await
|
|
27766
|
+
config = await get_config(modelId, { config });
|
|
26619
27767
|
const files = [
|
|
26620
27768
|
// Add config.json (always loaded)
|
|
26621
27769
|
"config.json"
|
|
@@ -26645,6 +27793,15 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
|
|
|
26645
27793
|
modelType = mappedType;
|
|
26646
27794
|
foundInMapping = true;
|
|
26647
27795
|
}
|
|
27796
|
+
if (!foundInMapping) {
|
|
27797
|
+
for (const mapping of Object.values(MODEL_MAPPING_NAMES)) {
|
|
27798
|
+
if (mapping.has(config.model_type)) {
|
|
27799
|
+
modelType = MODEL_TYPE_MAPPING.get(mapping.get(config.model_type));
|
|
27800
|
+
foundInMapping = true;
|
|
27801
|
+
break;
|
|
27802
|
+
}
|
|
27803
|
+
}
|
|
27804
|
+
}
|
|
26648
27805
|
}
|
|
26649
27806
|
if (!foundInMapping) {
|
|
26650
27807
|
const archList = architectures.length > 0 ? architectures.join(", ") : "(none)";
|
|
@@ -26667,74 +27824,14 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
|
|
|
26667
27824
|
files.push(dataFilePath);
|
|
26668
27825
|
}
|
|
26669
27826
|
};
|
|
26670
|
-
const
|
|
26671
|
-
|
|
26672
|
-
add_model_file(
|
|
26673
|
-
|
|
26674
|
-
|
|
26675
|
-
|
|
26676
|
-
|
|
26677
|
-
|
|
26678
|
-
add_model_file("decoder_model_merged");
|
|
26679
|
-
files.push("generation_config.json");
|
|
26680
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
26681
|
-
add_model_file("model", "vision_encoder");
|
|
26682
|
-
add_model_file("prompt_encoder_mask_decoder");
|
|
26683
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
26684
|
-
add_model_file("model", "encoder_model");
|
|
26685
|
-
add_model_file("decoder_model_merged");
|
|
26686
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
26687
|
-
add_model_file("embed_tokens");
|
|
26688
|
-
add_model_file("vision_encoder");
|
|
26689
|
-
add_model_file("decoder_model_merged");
|
|
26690
|
-
if (config.is_encoder_decoder) {
|
|
26691
|
-
add_model_file("model", "encoder_model");
|
|
26692
|
-
}
|
|
26693
|
-
files.push("generation_config.json");
|
|
26694
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
26695
|
-
add_model_file("embed_tokens");
|
|
26696
|
-
add_model_file("audio_encoder");
|
|
26697
|
-
add_model_file("decoder_model_merged");
|
|
26698
|
-
files.push("generation_config.json");
|
|
26699
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
26700
|
-
add_model_file("embed_tokens");
|
|
26701
|
-
add_model_file("audio_encoder");
|
|
26702
|
-
add_model_file("vision_encoder");
|
|
26703
|
-
add_model_file("decoder_model_merged");
|
|
26704
|
-
files.push("generation_config.json");
|
|
26705
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
26706
|
-
add_model_file("model", "text_encoder");
|
|
26707
|
-
add_model_file("decoder_model_merged");
|
|
26708
|
-
add_model_file("encodec_decode");
|
|
26709
|
-
files.push("generation_config.json");
|
|
26710
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
26711
|
-
add_model_file("prepare_inputs_embeds");
|
|
26712
|
-
add_model_file("model", "language_model");
|
|
26713
|
-
add_model_file("lm_head");
|
|
26714
|
-
add_model_file("gen_head");
|
|
26715
|
-
add_model_file("gen_img_embeds");
|
|
26716
|
-
add_model_file("image_decode");
|
|
26717
|
-
files.push("generation_config.json");
|
|
26718
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
26719
|
-
add_model_file("prepare_inputs_embeds");
|
|
26720
|
-
add_model_file("model");
|
|
26721
|
-
add_model_file("vision_encoder");
|
|
26722
|
-
files.push("generation_config.json");
|
|
26723
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
26724
|
-
add_model_file("embed_tokens");
|
|
26725
|
-
add_model_file("speech_encoder");
|
|
26726
|
-
add_model_file("model", "language_model");
|
|
26727
|
-
add_model_file("conditional_decoder");
|
|
26728
|
-
files.push("generation_config.json");
|
|
26729
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
26730
|
-
add_model_file("encoder_model");
|
|
26731
|
-
add_model_file("decoder_model");
|
|
26732
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
26733
|
-
add_model_file("text_encoder");
|
|
26734
|
-
add_model_file("latent_denoiser");
|
|
26735
|
-
add_model_file("voice_decoder");
|
|
26736
|
-
} else {
|
|
26737
|
-
add_model_file("model", singleModelName);
|
|
27827
|
+
const { sessions, optional_configs } = getSessionsConfig(modelType, config, { model_file_name });
|
|
27828
|
+
for (const [sessionKey, baseName] of Object.entries(sessions)) {
|
|
27829
|
+
add_model_file(sessionKey, baseName);
|
|
27830
|
+
}
|
|
27831
|
+
if (optional_configs) {
|
|
27832
|
+
for (const configFile of Object.values(optional_configs)) {
|
|
27833
|
+
files.push(configFile);
|
|
27834
|
+
}
|
|
26738
27835
|
}
|
|
26739
27836
|
return files;
|
|
26740
27837
|
}
|
|
@@ -26770,28 +27867,21 @@ async function get_files(modelId, {
|
|
|
26770
27867
|
}
|
|
26771
27868
|
|
|
26772
27869
|
// src/utils/model_registry/get_pipeline_files.js
|
|
26773
|
-
function get_task_components(task) {
|
|
26774
|
-
const taskConfig = SUPPORTED_TASKS[task];
|
|
26775
|
-
if (!taskConfig) {
|
|
26776
|
-
return null;
|
|
26777
|
-
}
|
|
26778
|
-
return {
|
|
26779
|
-
tokenizer: !!taskConfig.tokenizer,
|
|
26780
|
-
processor: !!taskConfig.processor
|
|
26781
|
-
};
|
|
26782
|
-
}
|
|
26783
27870
|
async function get_pipeline_files(task, modelId, options = {}) {
|
|
26784
27871
|
task = TASK_ALIASES[task] ?? task;
|
|
26785
|
-
const
|
|
26786
|
-
if (!
|
|
27872
|
+
const taskConfig = SUPPORTED_TASKS[task];
|
|
27873
|
+
if (!taskConfig) {
|
|
26787
27874
|
throw new Error(
|
|
26788
27875
|
`Unsupported pipeline task: ${task}. Must be one of [${Object.keys(SUPPORTED_TASKS).join(", ")}]`
|
|
26789
27876
|
);
|
|
26790
27877
|
}
|
|
27878
|
+
const { type } = taskConfig;
|
|
27879
|
+
const include_tokenizer = type !== "audio" && type !== "image";
|
|
27880
|
+
const include_processor = type !== "text";
|
|
26791
27881
|
return get_files(modelId, {
|
|
26792
27882
|
...options,
|
|
26793
|
-
include_tokenizer
|
|
26794
|
-
include_processor
|
|
27883
|
+
include_tokenizer,
|
|
27884
|
+
include_processor
|
|
26795
27885
|
});
|
|
26796
27886
|
}
|
|
26797
27887
|
|
|
@@ -26821,12 +27911,12 @@ async function pipeline2(task, model = null, {
|
|
|
26821
27911
|
dtype = pipelineInfo.default.dtype;
|
|
26822
27912
|
}
|
|
26823
27913
|
}
|
|
27914
|
+
const expected_files = await get_pipeline_files(task, model, {
|
|
27915
|
+
device,
|
|
27916
|
+
dtype
|
|
27917
|
+
});
|
|
26824
27918
|
let files_loading = {};
|
|
26825
27919
|
if (progress_callback) {
|
|
26826
|
-
const expected_files = await get_pipeline_files(task, model, {
|
|
26827
|
-
device,
|
|
26828
|
-
dtype
|
|
26829
|
-
});
|
|
26830
27920
|
const metadata = await Promise.all(expected_files.map(async (file) => get_file_metadata(model, file)));
|
|
26831
27921
|
metadata.forEach((m, i) => {
|
|
26832
27922
|
if (m.exists) {
|
|
@@ -26872,13 +27962,31 @@ async function pipeline2(task, model = null, {
|
|
|
26872
27962
|
model_file_name,
|
|
26873
27963
|
session_options
|
|
26874
27964
|
};
|
|
26875
|
-
const
|
|
26876
|
-
|
|
26877
|
-
|
|
26878
|
-
|
|
27965
|
+
const hasTokenizer = expected_files.includes("tokenizer.json");
|
|
27966
|
+
const hasProcessor = expected_files.includes("preprocessor_config.json");
|
|
27967
|
+
const modelClasses = pipelineInfo.model;
|
|
27968
|
+
let modelPromise;
|
|
27969
|
+
if (Array.isArray(modelClasses)) {
|
|
27970
|
+
const resolvedConfig = config ?? await AutoConfig.from_pretrained(model, pretrainedOptions);
|
|
27971
|
+
const { model_type } = resolvedConfig;
|
|
27972
|
+
const matchedClass = modelClasses.find((cls) => cls.supports(model_type));
|
|
27973
|
+
if (!matchedClass) {
|
|
27974
|
+
throw Error(
|
|
27975
|
+
`Unsupported model type "${model_type}" for task "${task}". None of the candidate model classes support this type.`
|
|
27976
|
+
);
|
|
27977
|
+
}
|
|
27978
|
+
modelPromise = matchedClass.from_pretrained(model, { ...pretrainedOptions, config: resolvedConfig });
|
|
27979
|
+
} else {
|
|
27980
|
+
modelPromise = modelClasses.from_pretrained(model, pretrainedOptions);
|
|
27981
|
+
}
|
|
27982
|
+
const [tokenizer, processor, model_loaded] = await Promise.all([
|
|
27983
|
+
hasTokenizer ? AutoTokenizer.from_pretrained(model, pretrainedOptions) : null,
|
|
27984
|
+
hasProcessor ? AutoProcessor.from_pretrained(model, pretrainedOptions) : null,
|
|
27985
|
+
modelPromise
|
|
26879
27986
|
]);
|
|
26880
|
-
const results =
|
|
26881
|
-
results.
|
|
27987
|
+
const results = { task, model: model_loaded };
|
|
27988
|
+
if (tokenizer) results.tokenizer = tokenizer;
|
|
27989
|
+
if (processor) results.processor = processor;
|
|
26882
27990
|
dispatchCallback(progress_callback, {
|
|
26883
27991
|
status: "ready",
|
|
26884
27992
|
task,
|
|
@@ -26887,48 +27995,6 @@ async function pipeline2(task, model = null, {
|
|
|
26887
27995
|
const pipelineClass = pipelineInfo.pipeline;
|
|
26888
27996
|
return new pipelineClass(results);
|
|
26889
27997
|
}
|
|
26890
|
-
async function loadItems(mapping, model, pretrainedOptions) {
|
|
26891
|
-
const result = /* @__PURE__ */ Object.create(null);
|
|
26892
|
-
const promises = [];
|
|
26893
|
-
for (const [name, cls] of mapping.entries()) {
|
|
26894
|
-
if (!cls) continue;
|
|
26895
|
-
let promise;
|
|
26896
|
-
if (Array.isArray(cls)) {
|
|
26897
|
-
promise = new Promise(async (resolve, reject) => {
|
|
26898
|
-
let e;
|
|
26899
|
-
for (const c of cls) {
|
|
26900
|
-
if (c === null) {
|
|
26901
|
-
resolve(null);
|
|
26902
|
-
return;
|
|
26903
|
-
}
|
|
26904
|
-
try {
|
|
26905
|
-
resolve(await c.from_pretrained(model, pretrainedOptions));
|
|
26906
|
-
return;
|
|
26907
|
-
} catch (err) {
|
|
26908
|
-
if (err.message?.includes("Unsupported model type")) {
|
|
26909
|
-
e = err;
|
|
26910
|
-
} else if (err.message?.includes("Could not locate file")) {
|
|
26911
|
-
e = err;
|
|
26912
|
-
} else {
|
|
26913
|
-
reject(err);
|
|
26914
|
-
return;
|
|
26915
|
-
}
|
|
26916
|
-
}
|
|
26917
|
-
}
|
|
26918
|
-
reject(e);
|
|
26919
|
-
});
|
|
26920
|
-
} else {
|
|
26921
|
-
promise = cls.from_pretrained(model, pretrainedOptions);
|
|
26922
|
-
}
|
|
26923
|
-
result[name] = promise;
|
|
26924
|
-
promises.push(promise);
|
|
26925
|
-
}
|
|
26926
|
-
await Promise.all(promises);
|
|
26927
|
-
for (const [name, promise] of Object.entries(result)) {
|
|
26928
|
-
result[name] = await promise;
|
|
26929
|
-
}
|
|
26930
|
-
return result;
|
|
26931
|
-
}
|
|
26932
27998
|
|
|
26933
27999
|
// src/generation/streamers.js
|
|
26934
28000
|
var is_chinese_char2 = (cp) => cp >= 19968 && cp <= 40959 || cp >= 13312 && cp <= 19903 || cp >= 131072 && cp <= 173791 || cp >= 173824 && cp <= 177983 || cp >= 177984 && cp <= 178207 || cp >= 178208 && cp <= 183983 || cp >= 63744 && cp <= 64255 || cp >= 194560 && cp <= 195103;
|
|
@@ -27216,21 +28282,38 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
|
|
|
27216
28282
|
|
|
27217
28283
|
// src/utils/model_registry/is_cached.js
|
|
27218
28284
|
async function check_files_cache(modelId, files, options = {}) {
|
|
27219
|
-
const
|
|
27220
|
-
if (!
|
|
28285
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
28286
|
+
if (!cache2) {
|
|
27221
28287
|
const fileStatuses2 = files.map((filename) => ({ file: filename, cached: false }));
|
|
27222
28288
|
return { allCached: false, files: fileStatuses2 };
|
|
27223
28289
|
}
|
|
27224
28290
|
const fileStatuses = await Promise.all(
|
|
27225
28291
|
files.map(async (filename) => {
|
|
27226
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
27227
|
-
const cached = await checkCachedResource(
|
|
28292
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
28293
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
27228
28294
|
return { file: filename, cached: !!cached };
|
|
27229
28295
|
})
|
|
27230
28296
|
);
|
|
27231
28297
|
return { allCached: fileStatuses.every((f) => f.cached), files: fileStatuses };
|
|
27232
28298
|
}
|
|
28299
|
+
async function is_file_cached(modelId, filename, options = {}) {
|
|
28300
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
28301
|
+
if (!cache2) return false;
|
|
28302
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
28303
|
+
return !!await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
28304
|
+
}
|
|
27233
28305
|
async function is_cached(modelId, options = {}) {
|
|
28306
|
+
if (!modelId) {
|
|
28307
|
+
throw new Error("modelId is required");
|
|
28308
|
+
}
|
|
28309
|
+
if (!await is_file_cached(modelId, "config.json", options)) {
|
|
28310
|
+
return false;
|
|
28311
|
+
}
|
|
28312
|
+
const files = await get_files(modelId, options);
|
|
28313
|
+
const result = await check_files_cache(modelId, files, options);
|
|
28314
|
+
return result.allCached;
|
|
28315
|
+
}
|
|
28316
|
+
async function is_cached_files(modelId, options = {}) {
|
|
27234
28317
|
if (!modelId) {
|
|
27235
28318
|
throw new Error("modelId is required");
|
|
27236
28319
|
}
|
|
@@ -27238,6 +28321,20 @@ async function is_cached(modelId, options = {}) {
|
|
|
27238
28321
|
return await check_files_cache(modelId, files, options);
|
|
27239
28322
|
}
|
|
27240
28323
|
async function is_pipeline_cached(task, modelId, options = {}) {
|
|
28324
|
+
if (!task) {
|
|
28325
|
+
throw new Error("task is required");
|
|
28326
|
+
}
|
|
28327
|
+
if (!modelId) {
|
|
28328
|
+
throw new Error("modelId is required");
|
|
28329
|
+
}
|
|
28330
|
+
if (!await is_file_cached(modelId, "config.json", options)) {
|
|
28331
|
+
return false;
|
|
28332
|
+
}
|
|
28333
|
+
const files = await get_pipeline_files(task, modelId, options);
|
|
28334
|
+
const result = await check_files_cache(modelId, files, options);
|
|
28335
|
+
return result.allCached;
|
|
28336
|
+
}
|
|
28337
|
+
async function is_pipeline_cached_files(task, modelId, options = {}) {
|
|
27241
28338
|
if (!task) {
|
|
27242
28339
|
throw new Error("task is required");
|
|
27243
28340
|
}
|
|
@@ -27250,26 +28347,26 @@ async function is_pipeline_cached(task, modelId, options = {}) {
|
|
|
27250
28347
|
|
|
27251
28348
|
// src/utils/model_registry/clear_cache.js
|
|
27252
28349
|
async function clear_files_from_cache(modelId, files, options = {}) {
|
|
27253
|
-
const
|
|
27254
|
-
if (!
|
|
28350
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
28351
|
+
if (!cache2) {
|
|
27255
28352
|
return {
|
|
27256
28353
|
filesDeleted: 0,
|
|
27257
28354
|
filesCached: 0,
|
|
27258
28355
|
files: files.map((filename) => ({ file: filename, deleted: false, wasCached: false }))
|
|
27259
28356
|
};
|
|
27260
28357
|
}
|
|
27261
|
-
if (!
|
|
28358
|
+
if (!cache2.delete) {
|
|
27262
28359
|
throw new Error("Cache does not support delete operation");
|
|
27263
28360
|
}
|
|
27264
28361
|
const results = await Promise.all(
|
|
27265
28362
|
files.map(async (filename) => {
|
|
27266
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
27267
|
-
const cached = await checkCachedResource(
|
|
28363
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
28364
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
27268
28365
|
const wasCached = !!cached;
|
|
27269
28366
|
let deleted = false;
|
|
27270
28367
|
if (wasCached) {
|
|
27271
|
-
const deletedWithProposed = await
|
|
27272
|
-
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await
|
|
28368
|
+
const deletedWithProposed = await cache2.delete(proposedCacheKey);
|
|
28369
|
+
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache2.delete(localPath) : false;
|
|
27273
28370
|
deleted = deletedWithProposed || deletedWithLocal;
|
|
27274
28371
|
}
|
|
27275
28372
|
return { file: filename, deleted, wasCached };
|
|
@@ -27386,26 +28483,30 @@ var ModelRegistry = class {
|
|
|
27386
28483
|
return get_processor_files(modelId);
|
|
27387
28484
|
}
|
|
27388
28485
|
/**
|
|
27389
|
-
*
|
|
28486
|
+
* Quickly checks if a model is fully cached by verifying `config.json` is present,
|
|
28487
|
+
* then confirming all required files are cached.
|
|
28488
|
+
* Returns a plain boolean — use `is_cached_files` if you need per-file detail.
|
|
27390
28489
|
*
|
|
27391
28490
|
* @param {string} modelId - The model id
|
|
27392
28491
|
* @param {Object} [options] - Optional parameters
|
|
28492
|
+
* @param {string} [options.cache_dir] - Custom cache directory
|
|
28493
|
+
* @param {string} [options.revision] - Model revision (default: 'main')
|
|
28494
|
+
* @param {import('../../configs.js').PretrainedConfig} [options.config] - Pre-loaded config
|
|
27393
28495
|
* @param {import('../dtypes.js').DataType|Record<string, import('../dtypes.js').DataType>} [options.dtype=null] - Override dtype
|
|
27394
28496
|
* @param {import('../devices.js').DeviceType|Record<string, import('../devices.js').DeviceType>} [options.device=null] - Override device
|
|
27395
|
-
* @returns {Promise<
|
|
28497
|
+
* @returns {Promise<boolean>} Whether all required files are cached
|
|
27396
28498
|
*
|
|
27397
28499
|
* @example
|
|
27398
|
-
* const
|
|
27399
|
-
* console.log(
|
|
28500
|
+
* const cached = await ModelRegistry.is_cached('onnx-community/bert-base-uncased-ONNX');
|
|
28501
|
+
* console.log(cached); // true or false
|
|
27400
28502
|
*/
|
|
27401
28503
|
static async is_cached(modelId, options = {}) {
|
|
27402
28504
|
return is_cached(modelId, options);
|
|
27403
28505
|
}
|
|
27404
28506
|
/**
|
|
27405
|
-
*
|
|
27406
|
-
* Automatically determines which
|
|
28507
|
+
* Checks if all files for a given model are already cached, with per-file detail.
|
|
28508
|
+
* Automatically determines which files are needed using get_files().
|
|
27407
28509
|
*
|
|
27408
|
-
* @param {string} task - The pipeline task (e.g., "text-generation", "background-removal")
|
|
27409
28510
|
* @param {string} modelId - The model id
|
|
27410
28511
|
* @param {Object} [options] - Optional parameters
|
|
27411
28512
|
* @param {string} [options.cache_dir] - Custom cache directory
|
|
@@ -27416,12 +28517,57 @@ var ModelRegistry = class {
|
|
|
27416
28517
|
* @returns {Promise<import('./is_cached.js').CacheCheckResult>} Object with allCached boolean and files array with cache status
|
|
27417
28518
|
*
|
|
27418
28519
|
* @example
|
|
27419
|
-
* const status = await ModelRegistry.
|
|
28520
|
+
* const status = await ModelRegistry.is_cached_files('onnx-community/bert-base-uncased-ONNX');
|
|
27420
28521
|
* console.log(status.allCached); // true or false
|
|
28522
|
+
* console.log(status.files); // [{ file: 'config.json', cached: true }, ...]
|
|
28523
|
+
*/
|
|
28524
|
+
static async is_cached_files(modelId, options = {}) {
|
|
28525
|
+
return is_cached_files(modelId, options);
|
|
28526
|
+
}
|
|
28527
|
+
/**
|
|
28528
|
+
* Quickly checks if all files for a specific pipeline task are cached by verifying
|
|
28529
|
+
* `config.json` is present, then confirming all required files are cached.
|
|
28530
|
+
* Returns a plain boolean — use `is_pipeline_cached_files` if you need per-file detail.
|
|
28531
|
+
*
|
|
28532
|
+
* @param {string} task - The pipeline task (e.g., "text-generation", "background-removal")
|
|
28533
|
+
* @param {string} modelId - The model id
|
|
28534
|
+
* @param {Object} [options] - Optional parameters
|
|
28535
|
+
* @param {string} [options.cache_dir] - Custom cache directory
|
|
28536
|
+
* @param {string} [options.revision] - Model revision (default: 'main')
|
|
28537
|
+
* @param {import('../../configs.js').PretrainedConfig} [options.config] - Pre-loaded config
|
|
28538
|
+
* @param {import('../dtypes.js').DataType|Record<string, import('../dtypes.js').DataType>} [options.dtype=null] - Override dtype
|
|
28539
|
+
* @param {import('../devices.js').DeviceType|Record<string, import('../devices.js').DeviceType>} [options.device=null] - Override device
|
|
28540
|
+
* @returns {Promise<boolean>} Whether all required files are cached
|
|
28541
|
+
*
|
|
28542
|
+
* @example
|
|
28543
|
+
* const cached = await ModelRegistry.is_pipeline_cached('text-generation', 'onnx-community/gpt2-ONNX');
|
|
28544
|
+
* console.log(cached); // true or false
|
|
27421
28545
|
*/
|
|
27422
28546
|
static async is_pipeline_cached(task, modelId, options = {}) {
|
|
27423
28547
|
return is_pipeline_cached(task, modelId, options);
|
|
27424
28548
|
}
|
|
28549
|
+
/**
|
|
28550
|
+
* Checks if all files for a specific pipeline task are already cached, with per-file detail.
|
|
28551
|
+
* Automatically determines which components are needed based on the task.
|
|
28552
|
+
*
|
|
28553
|
+
* @param {string} task - The pipeline task (e.g., "text-generation", "background-removal")
|
|
28554
|
+
* @param {string} modelId - The model id
|
|
28555
|
+
* @param {Object} [options] - Optional parameters
|
|
28556
|
+
* @param {string} [options.cache_dir] - Custom cache directory
|
|
28557
|
+
* @param {string} [options.revision] - Model revision (default: 'main')
|
|
28558
|
+
* @param {import('../../configs.js').PretrainedConfig} [options.config] - Pre-loaded config
|
|
28559
|
+
* @param {import('../dtypes.js').DataType|Record<string, import('../dtypes.js').DataType>} [options.dtype=null] - Override dtype
|
|
28560
|
+
* @param {import('../devices.js').DeviceType|Record<string, import('../devices.js').DeviceType>} [options.device=null] - Override device
|
|
28561
|
+
* @returns {Promise<import('./is_cached.js').CacheCheckResult>} Object with allCached boolean and files array with cache status
|
|
28562
|
+
*
|
|
28563
|
+
* @example
|
|
28564
|
+
* const status = await ModelRegistry.is_pipeline_cached_files('text-generation', 'onnx-community/gpt2-ONNX');
|
|
28565
|
+
* console.log(status.allCached); // true or false
|
|
28566
|
+
* console.log(status.files); // [{ file: 'config.json', cached: true }, ...]
|
|
28567
|
+
*/
|
|
28568
|
+
static async is_pipeline_cached_files(task, modelId, options = {}) {
|
|
28569
|
+
return is_pipeline_cached_files(task, modelId, options);
|
|
28570
|
+
}
|
|
27425
28571
|
/**
|
|
27426
28572
|
* Get metadata for a specific file without downloading it.
|
|
27427
28573
|
*
|
|
@@ -27701,6 +28847,7 @@ export {
|
|
|
27701
28847
|
DonutImageProcessor,
|
|
27702
28848
|
DonutSwinModel,
|
|
27703
28849
|
DonutSwinPreTrainedModel,
|
|
28850
|
+
DynamicCache,
|
|
27704
28851
|
EdgeTamModel,
|
|
27705
28852
|
EfficientNetForImageClassification,
|
|
27706
28853
|
EfficientNetImageProcessor,
|
|
@@ -27773,6 +28920,7 @@ export {
|
|
|
27773
28920
|
Gemma3Model,
|
|
27774
28921
|
Gemma3PreTrainedModel,
|
|
27775
28922
|
Gemma3nAudioFeatureExtractor,
|
|
28923
|
+
Gemma3nForCausalLM,
|
|
27776
28924
|
Gemma3nForConditionalGeneration,
|
|
27777
28925
|
Gemma3nPreTrainedModel,
|
|
27778
28926
|
Gemma3nProcessor,
|
|
@@ -27792,6 +28940,9 @@ export {
|
|
|
27792
28940
|
GraniteMoeHybridModel,
|
|
27793
28941
|
GraniteMoeHybridPreTrainedModel,
|
|
27794
28942
|
GranitePreTrainedModel,
|
|
28943
|
+
GraniteSpeechFeatureExtractor,
|
|
28944
|
+
GraniteSpeechForConditionalGeneration,
|
|
28945
|
+
GraniteSpeechProcessor,
|
|
27795
28946
|
GroundingDinoForObjectDetection,
|
|
27796
28947
|
GroundingDinoImageProcessor,
|
|
27797
28948
|
GroundingDinoPreTrainedModel,
|
|
@@ -27817,7 +28968,6 @@ export {
|
|
|
27817
28968
|
IJepaPreTrainedModel,
|
|
27818
28969
|
Idefics3ForConditionalGeneration,
|
|
27819
28970
|
Idefics3ImageProcessor,
|
|
27820
|
-
Idefics3PreTrainedModel,
|
|
27821
28971
|
Idefics3Processor,
|
|
27822
28972
|
ImageClassificationPipeline,
|
|
27823
28973
|
ImageFeatureExtractionPipeline,
|
|
@@ -27842,6 +28992,9 @@ export {
|
|
|
27842
28992
|
Lfm2MoeModel,
|
|
27843
28993
|
Lfm2MoePreTrainedModel,
|
|
27844
28994
|
Lfm2PreTrainedModel,
|
|
28995
|
+
Lfm2VlForConditionalGeneration,
|
|
28996
|
+
Lfm2VlImageProcessor,
|
|
28997
|
+
Lfm2VlProcessor,
|
|
27845
28998
|
LiteWhisperForConditionalGeneration,
|
|
27846
28999
|
Llama4ForCausalLM,
|
|
27847
29000
|
Llama4PreTrainedModel,
|
|
@@ -28006,6 +29159,9 @@ export {
|
|
|
28006
29159
|
Olmo3Model,
|
|
28007
29160
|
Olmo3PreTrainedModel,
|
|
28008
29161
|
OlmoForCausalLM,
|
|
29162
|
+
OlmoHybridForCausalLM,
|
|
29163
|
+
OlmoHybridModel,
|
|
29164
|
+
OlmoHybridPreTrainedModel,
|
|
28009
29165
|
OlmoModel,
|
|
28010
29166
|
OlmoPreTrainedModel,
|
|
28011
29167
|
OpenELMForCausalLM,
|
|
@@ -28022,7 +29178,6 @@ export {
|
|
|
28022
29178
|
Owlv2Model,
|
|
28023
29179
|
Owlv2PreTrainedModel,
|
|
28024
29180
|
PaliGemmaForConditionalGeneration,
|
|
28025
|
-
PaliGemmaPreTrainedModel,
|
|
28026
29181
|
PaliGemmaProcessor,
|
|
28027
29182
|
ParakeetFeatureExtractor,
|
|
28028
29183
|
ParakeetForCTC,
|
|
@@ -28061,20 +29216,36 @@ export {
|
|
|
28061
29216
|
QuestionAnsweringPipeline,
|
|
28062
29217
|
Qwen2ForCausalLM,
|
|
28063
29218
|
Qwen2Model,
|
|
29219
|
+
Qwen2MoeForCausalLM,
|
|
29220
|
+
Qwen2MoeModel,
|
|
29221
|
+
Qwen2MoePreTrainedModel,
|
|
28064
29222
|
Qwen2PreTrainedModel,
|
|
28065
29223
|
Qwen2Tokenizer,
|
|
29224
|
+
Qwen2VLForCausalLM,
|
|
28066
29225
|
Qwen2VLForConditionalGeneration,
|
|
28067
29226
|
Qwen2VLImageProcessor,
|
|
28068
29227
|
Qwen2VLPreTrainedModel,
|
|
28069
29228
|
Qwen2VLProcessor,
|
|
29229
|
+
Qwen2_5_VLForCausalLM,
|
|
28070
29230
|
Qwen2_5_VLForConditionalGeneration,
|
|
28071
29231
|
Qwen2_5_VLProcessor,
|
|
28072
29232
|
Qwen3ForCausalLM,
|
|
28073
29233
|
Qwen3Model,
|
|
29234
|
+
Qwen3MoeForCausalLM,
|
|
29235
|
+
Qwen3MoeModel,
|
|
29236
|
+
Qwen3MoePreTrainedModel,
|
|
29237
|
+
Qwen3NextForCausalLM,
|
|
29238
|
+
Qwen3NextModel,
|
|
29239
|
+
Qwen3NextPreTrainedModel,
|
|
28074
29240
|
Qwen3PreTrainedModel,
|
|
29241
|
+
Qwen3VLForCausalLM,
|
|
28075
29242
|
Qwen3VLForConditionalGeneration,
|
|
29243
|
+
Qwen3VLMoeForCausalLM,
|
|
29244
|
+
Qwen3VLMoeForConditionalGeneration,
|
|
28076
29245
|
Qwen3VLProcessor,
|
|
29246
|
+
Qwen3_5ForCausalLM,
|
|
28077
29247
|
Qwen3_5ForConditionalGeneration,
|
|
29248
|
+
Qwen3_5MoeForCausalLM,
|
|
28078
29249
|
Qwen3_5MoeForConditionalGeneration,
|
|
28079
29250
|
RFDetrForObjectDetection,
|
|
28080
29251
|
RFDetrModel,
|
|
@@ -28146,7 +29317,6 @@ export {
|
|
|
28146
29317
|
SmolLM3ForCausalLM,
|
|
28147
29318
|
SmolLM3Model,
|
|
28148
29319
|
SmolLM3PreTrainedModel,
|
|
28149
|
-
SmolVLMForConditionalGeneration,
|
|
28150
29320
|
Idefics3ImageProcessor as SmolVLMImageProcessor,
|
|
28151
29321
|
Idefics3Processor as SmolVLMProcessor,
|
|
28152
29322
|
SnacDecoderModel,
|
|
@@ -28252,6 +29422,10 @@ export {
|
|
|
28252
29422
|
VitsTokenizer,
|
|
28253
29423
|
VoxtralForConditionalGeneration,
|
|
28254
29424
|
VoxtralProcessor,
|
|
29425
|
+
VoxtralRealtimeFeatureExtractor,
|
|
29426
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
29427
|
+
VoxtralRealtimePreTrainedModel,
|
|
29428
|
+
VoxtralRealtimeProcessor,
|
|
28255
29429
|
Wav2Vec2BertForCTC,
|
|
28256
29430
|
Wav2Vec2BertForSequenceClassification,
|
|
28257
29431
|
Wav2Vec2BertModel,
|