@huggingface/transformers 4.0.0-next.5 → 4.0.0-next.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. package/README.md +12 -4
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
  3. package/dist/transformers.js +2189 -1015
  4. package/dist/transformers.min.js +16 -16
  5. package/dist/transformers.node.cjs +2234 -1029
  6. package/dist/transformers.node.min.cjs +20 -20
  7. package/dist/transformers.node.min.mjs +20 -20
  8. package/dist/transformers.node.mjs +2194 -1017
  9. package/dist/transformers.web.js +2175 -1001
  10. package/dist/transformers.web.min.js +18 -18
  11. package/package.json +4 -4
  12. package/src/backends/onnx.js +77 -58
  13. package/src/backends/utils/cacheWasm.js +22 -43
  14. package/src/cache_utils.js +62 -0
  15. package/src/configs.js +32 -5
  16. package/src/env.js +36 -6
  17. package/src/image_processors_utils.js +3 -3
  18. package/src/models/auto/modeling_auto.js +14 -1
  19. package/src/models/chatterbox/modeling_chatterbox.js +1 -1
  20. package/src/models/detr/image_processing_detr.js +1 -1
  21. package/src/models/feature_extractors.js +2 -0
  22. package/src/models/gemma3n/modeling_gemma3n.js +2 -0
  23. package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
  24. package/src/models/granite_speech/modeling_granite_speech.js +5 -0
  25. package/src/models/granite_speech/processing_granite_speech.js +62 -0
  26. package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
  27. package/src/models/idefics3/modeling_idefics3.js +5 -32
  28. package/src/models/image_processors.js +1 -0
  29. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
  30. package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
  31. package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
  32. package/src/models/llava/modeling_llava.js +1 -1
  33. package/src/models/mistral3/modeling_mistral3.js +2 -2
  34. package/src/models/modeling_utils.js +234 -292
  35. package/src/models/models.js +9 -0
  36. package/src/models/olmo_hybrid/modeling_olmo_hybrid.js +5 -0
  37. package/src/models/paligemma/modeling_paligemma.js +2 -25
  38. package/src/models/processors.js +3 -0
  39. package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
  40. package/src/models/qwen2_moe/modeling_qwen2_moe.js +5 -0
  41. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
  42. package/src/models/qwen2_vl/modeling_qwen2_vl.js +36 -3
  43. package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
  44. package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
  45. package/src/models/qwen3_moe/modeling_qwen3_moe.js +5 -0
  46. package/src/models/qwen3_next/modeling_qwen3_next.js +5 -0
  47. package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
  48. package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +4 -0
  49. package/src/models/registry.js +39 -4
  50. package/src/models/sam/image_processing_sam.js +1 -1
  51. package/src/models/session.js +17 -6
  52. package/src/models/smolvlm/modeling_smolvlm.js +7 -0
  53. package/src/models/ultravox/modeling_ultravox.js +1 -3
  54. package/src/models/voxtral/modeling_voxtral.js +3 -0
  55. package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
  56. package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
  57. package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
  58. package/src/models/whisper/feature_extraction_whisper.js +2 -12
  59. package/src/pipelines/index.js +2 -84
  60. package/src/pipelines.js +40 -77
  61. package/src/transformers.js +2 -0
  62. package/src/utils/audio.js +18 -2
  63. package/src/utils/cache/CrossOriginStorageCache.js +251 -0
  64. package/src/utils/cache/FileCache.js +128 -0
  65. package/src/utils/cache/cross-origin-storage.d.ts +38 -0
  66. package/src/utils/cache.js +8 -3
  67. package/src/utils/hub/{files.js → FileResponse.js} +0 -105
  68. package/src/utils/hub/utils.js +35 -1
  69. package/src/utils/hub.js +6 -5
  70. package/src/utils/image.js +12 -13
  71. package/src/utils/lru_cache.js +67 -0
  72. package/src/utils/memoize_promise.js +45 -0
  73. package/src/utils/model_registry/ModelRegistry.js +70 -23
  74. package/src/utils/model_registry/get_file_metadata.js +14 -2
  75. package/src/utils/model_registry/get_model_files.js +63 -78
  76. package/src/utils/model_registry/get_pipeline_files.js +15 -24
  77. package/src/utils/model_registry/is_cached.js +81 -4
  78. package/src/utils/tensor.js +18 -2
  79. package/types/backends/onnx.d.ts.map +1 -1
  80. package/types/backends/utils/cacheWasm.d.ts +3 -17
  81. package/types/backends/utils/cacheWasm.d.ts.map +1 -1
  82. package/types/cache_utils.d.ts +29 -0
  83. package/types/cache_utils.d.ts.map +1 -0
  84. package/types/configs.d.ts.map +1 -1
  85. package/types/env.d.ts +18 -3
  86. package/types/env.d.ts.map +1 -1
  87. package/types/image_processors_utils.d.ts +17 -1
  88. package/types/image_processors_utils.d.ts.map +1 -1
  89. package/types/models/auto/modeling_auto.d.ts +6 -0
  90. package/types/models/auto/modeling_auto.d.ts.map +1 -1
  91. package/types/models/detr/image_processing_detr.d.ts +1 -1
  92. package/types/models/feature_extractors.d.ts +2 -0
  93. package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
  94. package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
  95. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
  96. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
  97. package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
  98. package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
  99. package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
  100. package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
  101. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
  102. package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
  103. package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
  104. package/types/models/image_processors.d.ts +1 -0
  105. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
  106. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
  107. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
  108. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
  109. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
  110. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
  111. package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
  112. package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
  113. package/types/models/modeling_utils.d.ts +44 -24
  114. package/types/models/modeling_utils.d.ts.map +1 -1
  115. package/types/models/models.d.ts +9 -0
  116. package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts +8 -0
  117. package/types/models/olmo_hybrid/modeling_olmo_hybrid.d.ts.map +1 -0
  118. package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
  119. package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
  120. package/types/models/processors.d.ts +3 -0
  121. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
  122. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
  123. package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts +8 -0
  124. package/types/models/qwen2_moe/modeling_qwen2_moe.d.ts.map +1 -0
  125. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
  126. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +2 -0
  127. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  128. package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
  129. package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
  130. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
  131. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
  132. package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts +8 -0
  133. package/types/models/qwen3_moe/modeling_qwen3_moe.d.ts.map +1 -0
  134. package/types/models/qwen3_next/modeling_qwen3_next.d.ts +8 -0
  135. package/types/models/qwen3_next/modeling_qwen3_next.d.ts.map +1 -0
  136. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
  137. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
  138. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +7 -0
  139. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -0
  140. package/types/models/registry.d.ts +2 -1
  141. package/types/models/registry.d.ts.map +1 -1
  142. package/types/models/sam/image_processing_sam.d.ts +1 -1
  143. package/types/models/session.d.ts +3 -2
  144. package/types/models/session.d.ts.map +1 -1
  145. package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
  146. package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
  147. package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
  148. package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
  149. package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
  150. package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
  151. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
  152. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
  153. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
  154. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
  155. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
  156. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
  157. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  158. package/types/pipelines/index.d.ts +0 -34
  159. package/types/pipelines/index.d.ts.map +1 -1
  160. package/types/pipelines.d.ts.map +1 -1
  161. package/types/transformers.d.ts +1 -0
  162. package/types/transformers.d.ts.map +1 -1
  163. package/types/utils/audio.d.ts +5 -2
  164. package/types/utils/audio.d.ts.map +1 -1
  165. package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
  166. package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
  167. package/types/utils/cache/FileCache.d.ts +39 -0
  168. package/types/utils/cache/FileCache.d.ts.map +1 -0
  169. package/types/utils/cache.d.ts +4 -4
  170. package/types/utils/cache.d.ts.map +1 -1
  171. package/types/utils/dtypes.d.ts +1 -1
  172. package/types/utils/hub/{files.d.ts → FileResponse.d.ts} +1 -38
  173. package/types/utils/hub/FileResponse.d.ts.map +1 -0
  174. package/types/utils/hub/utils.d.ts +17 -2
  175. package/types/utils/hub/utils.d.ts.map +1 -1
  176. package/types/utils/hub.d.ts +7 -7
  177. package/types/utils/hub.d.ts.map +1 -1
  178. package/types/utils/image.d.ts +1 -1
  179. package/types/utils/image.d.ts.map +1 -1
  180. package/types/utils/lru_cache.d.ts +38 -0
  181. package/types/utils/lru_cache.d.ts.map +1 -0
  182. package/types/utils/memoize_promise.d.ts +14 -0
  183. package/types/utils/memoize_promise.d.ts.map +1 -0
  184. package/types/utils/model_registry/ModelRegistry.d.ts +66 -6
  185. package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -1
  186. package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
  187. package/types/utils/model_registry/get_model_files.d.ts +1 -0
  188. package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
  189. package/types/utils/model_registry/get_pipeline_files.d.ts +2 -1
  190. package/types/utils/model_registry/get_pipeline_files.d.ts.map +1 -1
  191. package/types/utils/model_registry/is_cached.d.ts +47 -4
  192. package/types/utils/model_registry/is_cached.d.ts.map +1 -1
  193. package/types/utils/tensor.d.ts.map +1 -1
  194. package/src/utils/data-structures.js +0 -572
  195. package/types/utils/data-structures.d.ts +0 -294
  196. package/types/utils/data-structures.d.ts.map +0 -1
  197. package/types/utils/hub/files.d.ts.map +0 -1
@@ -14,22 +14,32 @@ var __export = (target, all) => {
14
14
  import fs from "fs";
15
15
  import path from "path";
16
16
  import url from "url";
17
- var VERSION = "4.0.0-next.5";
17
+ var VERSION = "4.0.0-next.7";
18
+ var HAS_SELF = typeof self !== "undefined";
18
19
  var IS_FS_AVAILABLE = !isEmpty(fs);
19
20
  var IS_PATH_AVAILABLE = !isEmpty(path);
20
- var IS_WEB_CACHE_AVAILABLE = typeof self !== "undefined" && "caches" in self;
21
+ var IS_WEB_CACHE_AVAILABLE = HAS_SELF && "caches" in self;
21
22
  var IS_DENO_RUNTIME = typeof globalThis.Deno !== "undefined";
22
23
  var IS_BUN_RUNTIME = typeof globalThis.Bun !== "undefined";
23
24
  var IS_DENO_WEB_RUNTIME = IS_DENO_RUNTIME && IS_WEB_CACHE_AVAILABLE && !IS_FS_AVAILABLE;
24
25
  var IS_PROCESS_AVAILABLE = typeof process !== "undefined";
25
26
  var IS_NODE_ENV = IS_PROCESS_AVAILABLE && process?.release?.name === "node" && !IS_DENO_WEB_RUNTIME;
26
27
  var IS_BROWSER_ENV = typeof window !== "undefined" && typeof window.document !== "undefined";
27
- var IS_WEBWORKER_ENV = typeof self !== "undefined" && ["DedicatedWorkerGlobalScope", "ServiceWorkerGlobalScope", "SharedWorkerGlobalScope"].includes(
28
+ var IS_WEBWORKER_ENV = HAS_SELF && ["DedicatedWorkerGlobalScope", "ServiceWorkerGlobalScope", "SharedWorkerGlobalScope"].includes(
28
29
  self.constructor?.name
29
30
  );
31
+ var IS_WEB_ENV = IS_BROWSER_ENV || IS_WEBWORKER_ENV || IS_DENO_WEB_RUNTIME;
30
32
  var IS_WEBGPU_AVAILABLE = IS_NODE_ENV || typeof navigator !== "undefined" && "gpu" in navigator;
31
33
  var IS_WEBNN_AVAILABLE = typeof navigator !== "undefined" && "ml" in navigator;
32
34
  var IS_CRYPTO_AVAILABLE = typeof crypto !== "undefined" && typeof crypto.getRandomValues === "function";
35
+ var IS_CHROME_AVAILABLE = (
36
+ // @ts-ignore - chrome may not exist in all environments
37
+ typeof chrome !== "undefined" && typeof chrome.runtime !== "undefined" && typeof chrome.runtime.id === "string"
38
+ );
39
+ var IS_SERVICE_WORKER_ENV = (
40
+ // @ts-ignore - ServiceWorkerGlobalScope may not exist in all environments
41
+ typeof ServiceWorkerGlobalScope !== "undefined" && HAS_SELF && self instanceof ServiceWorkerGlobalScope
42
+ );
33
43
  var isSafari = () => {
34
44
  if (typeof navigator === "undefined") {
35
45
  return false;
@@ -46,6 +56,12 @@ var apis = Object.freeze({
46
56
  IS_BROWSER_ENV,
47
57
  /** Whether we are running in a web worker environment */
48
58
  IS_WEBWORKER_ENV,
59
+ /** Whether we are running in a web-like environment (browser, web worker, or Deno web runtime) */
60
+ IS_WEB_ENV,
61
+ /** Whether we are running in a service worker environment */
62
+ IS_SERVICE_WORKER_ENV,
63
+ /** Whether we are running in Deno's web runtime (CDN imports, Cache API available, no filesystem) */
64
+ IS_DENO_WEB_RUNTIME,
49
65
  /** Whether the Cache API is available */
50
66
  IS_WEB_CACHE_AVAILABLE,
51
67
  /** Whether the WebGPU API is available */
@@ -63,7 +79,9 @@ var apis = Object.freeze({
63
79
  /** Whether the path API is available */
64
80
  IS_PATH_AVAILABLE,
65
81
  /** Whether the crypto API is available */
66
- IS_CRYPTO_AVAILABLE
82
+ IS_CRYPTO_AVAILABLE,
83
+ /** Whether the Chrome runtime API is available */
84
+ IS_CHROME_AVAILABLE
67
85
  });
68
86
  var RUNNING_LOCALLY = IS_FS_AVAILABLE && IS_PATH_AVAILABLE;
69
87
  var dirname__ = "./";
@@ -124,6 +142,7 @@ var env = {
124
142
  customCache: null,
125
143
  useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
126
144
  cacheKey: "transformers-cache",
145
+ experimental_useCrossOriginStorage: false,
127
146
  /////////////////// Custom fetch /////////////////////
128
147
  fetch: DEFAULT_FETCH
129
148
  //////////////////////////////////////////////////////
@@ -2674,7 +2693,7 @@ var Tokenizer = class {
2674
2693
  };
2675
2694
  var Tokenizer_default = Tokenizer;
2676
2695
 
2677
- // ../../node_modules/.pnpm/@huggingface+jinja@0.5.5/node_modules/@huggingface/jinja/dist/index.js
2696
+ // ../../node_modules/.pnpm/@huggingface+jinja@0.5.6/node_modules/@huggingface/jinja/dist/index.js
2678
2697
  var TOKEN_TYPES = Object.freeze({
2679
2698
  Text: "Text",
2680
2699
  // The text between Jinja statements or expressions
@@ -4193,7 +4212,11 @@ var Environment = class {
4193
4212
  ["number", (operand) => operand instanceof IntegerValue || operand instanceof FloatValue],
4194
4213
  ["integer", (operand) => operand instanceof IntegerValue],
4195
4214
  ["iterable", (operand) => operand.type === "ArrayValue" || operand.type === "StringValue"],
4196
- ["mapping", (operand) => operand.type === "ObjectValue"],
4215
+ ["mapping", (operand) => operand instanceof ObjectValue],
4216
+ [
4217
+ "sequence",
4218
+ (operand) => operand instanceof ArrayValue || operand instanceof ObjectValue || operand instanceof StringValue
4219
+ ],
4197
4220
  [
4198
4221
  "lower",
4199
4222
  (operand) => {
@@ -4466,6 +4489,9 @@ var Interpreter = class {
4466
4489
  applyFilter(operand, filterNode, environment) {
4467
4490
  if (filterNode.type === "Identifier") {
4468
4491
  const filter = filterNode;
4492
+ if (filter.value === "safe") {
4493
+ return operand;
4494
+ }
4469
4495
  if (filter.value === "tojson") {
4470
4496
  return new StringValue(toJSON(operand, {}));
4471
4497
  }
@@ -4555,6 +4581,8 @@ var Interpreter = class {
4555
4581
  return new IntegerValue(Math.floor(operand.value));
4556
4582
  case "float":
4557
4583
  return new FloatValue(operand.value);
4584
+ case "string":
4585
+ return new StringValue(operand.toString());
4558
4586
  default:
4559
4587
  throw new Error(`Unknown NumericValue filter: ${filter.value}`);
4560
4588
  }
@@ -5508,9 +5536,8 @@ var Callable2 = (
5508
5536
  }
5509
5537
  );
5510
5538
 
5511
- // src/utils/hub/files.js
5539
+ // src/utils/hub/FileResponse.js
5512
5540
  import fs2 from "fs";
5513
- import path2 from "path";
5514
5541
  var CONTENT_TYPE_MAP = {
5515
5542
  txt: "text/plain",
5516
5543
  html: "text/html",
@@ -5621,6 +5648,174 @@ var FileResponse = class _FileResponse {
5621
5648
  return JSON.parse(await this.text());
5622
5649
  }
5623
5650
  };
5651
+
5652
+ // src/utils/cache/FileCache.js
5653
+ import fs3 from "fs";
5654
+ import path2 from "path";
5655
+
5656
+ // src/utils/random.js
5657
+ var Random = class {
5658
+ constructor(seed) {
5659
+ this._mt = new Uint32Array(624);
5660
+ this._idx = 625;
5661
+ this._gauss_next = null;
5662
+ this._random_fn = this.random.bind(this);
5663
+ this.seed(seed);
5664
+ }
5665
+ /**
5666
+ * Seeds this instance's PRNG.
5667
+ *
5668
+ * When called with a number, initializes the state deterministically from that value.
5669
+ * When called with no arguments (or `undefined`/`null`), seeds from OS entropy
5670
+ * via `crypto.getRandomValues`, matching Python's `random.seed()` behaviour.
5671
+ *
5672
+ * @param {number} [n] The seed value. Omit to seed from OS entropy.
5673
+ */
5674
+ seed(n) {
5675
+ if (n === void 0 || n === null) {
5676
+ if (apis.IS_CRYPTO_AVAILABLE) {
5677
+ const buf = new Uint32Array(1);
5678
+ crypto.getRandomValues(buf);
5679
+ n = buf[0];
5680
+ } else {
5681
+ n = Date.now() >>> 0;
5682
+ }
5683
+ }
5684
+ const mt2 = this._mt;
5685
+ const u = (a, b) => Math.imul(a, b) >>> 0, key = [];
5686
+ for (let v = n || 0; v > 0; v = Math.floor(v / 4294967296)) key.push(v & 4294967295);
5687
+ if (!key.length) key.push(0);
5688
+ mt2[0] = 19650218;
5689
+ for (let k2 = 1; k2 < 624; ++k2) mt2[k2] = u(1812433253, mt2[k2 - 1] ^ mt2[k2 - 1] >>> 30) + k2 >>> 0;
5690
+ let i = 1, j = 0;
5691
+ for (let k2 = Math.max(624, key.length); k2 > 0; --k2, ++i, ++j) {
5692
+ if (i >= 624) {
5693
+ mt2[0] = mt2[623];
5694
+ i = 1;
5695
+ }
5696
+ if (j >= key.length) j = 0;
5697
+ mt2[i] = (mt2[i] ^ u(mt2[i - 1] ^ mt2[i - 1] >>> 30, 1664525)) + key[j] + j >>> 0;
5698
+ }
5699
+ for (let k2 = 623; k2 > 0; --k2, ++i) {
5700
+ if (i >= 624) {
5701
+ mt2[0] = mt2[623];
5702
+ i = 1;
5703
+ }
5704
+ mt2[i] = (mt2[i] ^ u(mt2[i - 1] ^ mt2[i - 1] >>> 30, 1566083941)) - i >>> 0;
5705
+ }
5706
+ mt2[0] = 2147483648;
5707
+ this._idx = 624;
5708
+ this._gauss_next = null;
5709
+ }
5710
+ /**
5711
+ * Generates a random unsigned 32-bit integer.
5712
+ *
5713
+ * Performs the "twist" step when the state buffer is exhausted,
5714
+ * then applies the standard MT19937 tempering transform.
5715
+ *
5716
+ * @returns {number} A random integer in the range [0, 2^32 - 1].
5717
+ */
5718
+ _int32() {
5719
+ const mt2 = this._mt;
5720
+ if (this._idx >= 624) {
5721
+ for (let k2 = 0; k2 < 624; ++k2) {
5722
+ const y2 = mt2[k2] & 2147483648 | mt2[(k2 + 1) % 624] & 2147483647;
5723
+ mt2[k2] = (mt2[(k2 + 397) % 624] ^ y2 >>> 1 ^ (y2 & 1 ? 2567483615 : 0)) >>> 0;
5724
+ }
5725
+ this._idx = 0;
5726
+ }
5727
+ let y = mt2[this._idx++];
5728
+ y ^= y >>> 11;
5729
+ y ^= y << 7 & 2636928640;
5730
+ y ^= y << 15 & 4022730752;
5731
+ y ^= y >>> 18;
5732
+ return y >>> 0;
5733
+ }
5734
+ /**
5735
+ * Generates a random floating-point number in the half-open interval [0, 1).
5736
+ *
5737
+ * Combines two 32-bit integers (using 53 bits of precision) to produce
5738
+ * a uniformly distributed double, matching Python's `random.random()`.
5739
+ *
5740
+ * @returns {number} A random float in [0, 1).
5741
+ */
5742
+ random() {
5743
+ return ((this._int32() >>> 5) * 67108864 + (this._int32() >>> 6)) / 9007199254740992;
5744
+ }
5745
+ /**
5746
+ * Generates a random number from a Gaussian (normal) distribution.
5747
+ *
5748
+ * Uses the Box-Muller transform with a cached spare value,
5749
+ * matching Python's `random.gauss()` output for the same seed.
5750
+ *
5751
+ * @param {number} [mu=0] The mean of the distribution.
5752
+ * @param {number} [sigma=1] The standard deviation of the distribution.
5753
+ * @returns {number} A normally distributed random value.
5754
+ */
5755
+ gauss(mu = 0, sigma = 1) {
5756
+ let z = this._gauss_next;
5757
+ this._gauss_next = null;
5758
+ if (z === null) {
5759
+ const x2pi = this.random() * 2 * Math.PI, g2rad = Math.sqrt(-2 * Math.log(1 - this.random()));
5760
+ z = Math.cos(x2pi) * g2rad;
5761
+ this._gauss_next = Math.sin(x2pi) * g2rad;
5762
+ }
5763
+ return mu + z * sigma;
5764
+ }
5765
+ /**
5766
+ * Shuffles an array in-place using the Fisher-Yates algorithm.
5767
+ *
5768
+ * Uses rejection sampling via `getrandbits`-style bit masking to ensure
5769
+ * a uniform distribution, matching Python's `random.shuffle()`.
5770
+ *
5771
+ * @param {any[]} arr The array to shuffle in-place.
5772
+ */
5773
+ shuffle(arr) {
5774
+ for (let i = arr.length - 1; i > 0; --i) {
5775
+ const k2 = 32 - Math.clz32(i + 1);
5776
+ let r = this._int32() >>> 32 - k2;
5777
+ while (r > i) r = this._int32() >>> 32 - k2;
5778
+ const t = arr[i];
5779
+ arr[i] = arr[r];
5780
+ arr[r] = t;
5781
+ }
5782
+ }
5783
+ /**
5784
+ * Selects a single element from a weighted population.
5785
+ *
5786
+ * Matches Python's `random.choices(population, weights=weights, k=1)[0]`
5787
+ *
5788
+ * @param {any[]} population The array of items to choose from.
5789
+ * @param {number[]} weights An array of non-negative weights, one per population element.
5790
+ * @returns {*} A single randomly selected element from the population.
5791
+ */
5792
+ choices(population, weights) {
5793
+ return population[_weightedIndexWith(this._random_fn, weights)];
5794
+ }
5795
+ };
5796
+ function _weightedIndexWith(randomFn, weights) {
5797
+ let sum = 0;
5798
+ for (let i = 0; i < weights.length; ++i) sum += weights[i];
5799
+ let x = randomFn() * sum;
5800
+ for (let i = 0; i < weights.length; ++i) {
5801
+ x -= weights[i];
5802
+ if (x < 0) return i;
5803
+ }
5804
+ return weights.length - 1;
5805
+ }
5806
+ var _default = new Random();
5807
+ var random = Object.freeze({
5808
+ Random,
5809
+ seed: _default.seed.bind(_default),
5810
+ random: _default.random.bind(_default),
5811
+ gauss: _default.gauss.bind(_default),
5812
+ shuffle: _default.shuffle.bind(_default),
5813
+ choices: _default.choices.bind(_default)
5814
+ });
5815
+ var _weightedIndex = (weights) => _weightedIndexWith(random.random, weights);
5816
+
5817
+ // src/utils/cache/FileCache.js
5818
+ var rng = new Random();
5624
5819
  var FileCache = class {
5625
5820
  /**
5626
5821
  * Instantiate a `FileCache` object.
@@ -5652,13 +5847,16 @@ var FileCache = class {
5652
5847
  * @returns {Promise<void>}
5653
5848
  */
5654
5849
  async put(request, response, progress_callback = void 0) {
5655
- let filePath = path2.join(this.path, request);
5850
+ const filePath = path2.join(this.path, request);
5851
+ const id = apis.IS_PROCESS_AVAILABLE ? process.pid : Date.now();
5852
+ const randomSuffix = rng._int32().toString(36);
5853
+ const tmpPath = filePath + `.tmp.${id}.${randomSuffix}`;
5656
5854
  try {
5657
5855
  const contentLength = response.headers.get("Content-Length");
5658
5856
  const total = parseInt(contentLength ?? "0");
5659
5857
  let loaded = 0;
5660
- await fs2.promises.mkdir(path2.dirname(filePath), { recursive: true });
5661
- const fileStream = fs2.createWriteStream(filePath);
5858
+ await fs3.promises.mkdir(path2.dirname(filePath), { recursive: true });
5859
+ const fileStream = fs3.createWriteStream(tmpPath);
5662
5860
  const reader = response.body.getReader();
5663
5861
  while (true) {
5664
5862
  const { done, value } = await reader.read();
@@ -5678,10 +5876,13 @@ var FileCache = class {
5678
5876
  const progress = total ? loaded / total * 100 : 0;
5679
5877
  progress_callback?.({ progress, loaded, total });
5680
5878
  }
5681
- fileStream.close();
5879
+ await new Promise((resolve, reject) => {
5880
+ fileStream.close((err) => err ? reject(err) : resolve());
5881
+ });
5882
+ await fs3.promises.rename(tmpPath, filePath);
5682
5883
  } catch (error) {
5683
5884
  try {
5684
- await fs2.promises.unlink(filePath);
5885
+ await fs3.promises.unlink(tmpPath);
5685
5886
  } catch {
5686
5887
  }
5687
5888
  throw error;
@@ -5695,7 +5896,7 @@ var FileCache = class {
5695
5896
  async delete(request) {
5696
5897
  let filePath = path2.join(this.path, request);
5697
5898
  try {
5698
- await fs2.promises.unlink(filePath);
5899
+ await fs3.promises.unlink(filePath);
5699
5900
  return true;
5700
5901
  } catch (error) {
5701
5902
  return false;
@@ -5704,6 +5905,7 @@ var FileCache = class {
5704
5905
  // TODO add the rest?
5705
5906
  // addAll(requests: RequestInfo[]): Promise<void>;
5706
5907
  // keys(request?: RequestInfo | URL, options?: CacheQueryOptions): Promise<ReadonlyArray<Request>>;
5908
+ // match(request: RequestInfo | URL, options?: CacheQueryOptions): Promise<Response | undefined>;
5707
5909
  // matchAll(request?: RequestInfo | URL, options?: CacheQueryOptions): Promise<ReadonlyArray<Response>>;
5708
5910
  };
5709
5911
 
@@ -5793,50 +5995,351 @@ async function readResponse(response, progress_callback, expectedSize) {
5793
5995
  await read();
5794
5996
  return buffer;
5795
5997
  }
5796
-
5797
- // src/utils/cache.js
5798
- async function getCache(file_cache_dir = null) {
5799
- let cache = null;
5800
- if (env.useCustomCache) {
5801
- if (!env.customCache) {
5802
- throw Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
5803
- }
5804
- if (!env.customCache.match || !env.customCache.put) {
5805
- throw new Error(
5806
- "`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
5807
- );
5808
- }
5809
- cache = env.customCache;
5998
+ function isBlobURL(url2) {
5999
+ return isValidUrl(url2, ["blob:"]);
6000
+ }
6001
+ function toAbsoluteURL(url2) {
6002
+ let baseURL;
6003
+ if (typeof location !== "undefined" && location.href) {
6004
+ baseURL = location.href;
6005
+ } else if (typeof import.meta !== "undefined" && import.meta.url) {
6006
+ baseURL = import.meta.url;
6007
+ } else {
6008
+ return url2;
5810
6009
  }
5811
- if (!cache && env.useBrowserCache) {
5812
- if (typeof caches === "undefined") {
5813
- throw Error("Browser cache is not available in this environment.");
6010
+ return new URL(url2, baseURL).href;
6011
+ }
6012
+
6013
+ // src/utils/cache/CrossOriginStorageCache.js
6014
+ var HASH_ALGORITHM = "SHA-256";
6015
+ var HASH_CACHE_NAME = "experimental_transformers-hash-cache";
6016
+ var makeHashDescriptor = (value) => ({ algorithm: HASH_ALGORITHM, value });
6017
+ var CrossOriginStorage = class {
6018
+ /** @type {Promise<Cache> | null} */
6019
+ #hashCache = null;
6020
+ /**
6021
+ * Returns (and lazily opens) the hash cache, reusing the same promise across concurrent callers.
6022
+ * @returns {Promise<Cache>}
6023
+ */
6024
+ _getHashCache = () => {
6025
+ this.#hashCache ??= caches.open(HASH_CACHE_NAME);
6026
+ return this.#hashCache;
6027
+ };
6028
+ /**
6029
+ * Returns whether the `navigator.crossOriginStorage` API is available in the current environment.
6030
+ * @returns {boolean}
6031
+ */
6032
+ static isAvailable = () => typeof navigator !== "undefined" && "crossOriginStorage" in navigator;
6033
+ /**
6034
+ * Looks up a cached response for the given URL by resolving its SHA-256 hash and requesting
6035
+ * the corresponding file handle from cross-origin storage.
6036
+ *
6037
+ * Implements `CacheInterface.match`.
6038
+ *
6039
+ * @param {string} request The URL of the resource to look up.
6040
+ * @returns {Promise<Response|undefined>} The cached `Response`, or `undefined` if not found.
6041
+ */
6042
+ match = async (request) => {
6043
+ const hashValue = await this._getFileHash(request);
6044
+ if (!hashValue) {
6045
+ return void 0;
5814
6046
  }
5815
6047
  try {
5816
- cache = await caches.open(env.cacheKey);
5817
- } catch (e) {
5818
- logger.warn("An error occurred while opening the browser cache:", e);
6048
+ const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashValue)]);
6049
+ const blob = await handle.getFile();
6050
+ return new Response(blob, {
6051
+ headers: {
6052
+ "Content-Length": String(blob.size)
6053
+ }
6054
+ });
6055
+ } catch {
6056
+ return void 0;
5819
6057
  }
5820
- }
5821
- if (!cache && env.useFSCache) {
5822
- if (!apis.IS_FS_AVAILABLE) {
5823
- throw Error("File System Cache is not available in this environment.");
6058
+ };
6059
+ /**
6060
+ * Stores a response in cross-origin storage, keyed by its SHA-256 hash.
6061
+ *
6062
+ * For LFS-backed URLs the hash is resolved cheaply via `_getFileHash` (which checks
6063
+ * `HASH_CACHE_NAME` first, then falls back to fetching the Git LFS pointer file)
6064
+ * without reading the response body a second time.
6065
+ *
6066
+ * For non-LFS resources the hash is unknown upfront. In that case the body is consumed
6067
+ * in the background: the stream is read to compute the content hash, the file is written
6068
+ * into cross-origin storage, and the computed hash is persisted to `HASH_CACHE_NAME`
6069
+ * so that future `match` calls can resolve the file without a network round-trip.
6070
+ *
6071
+ * Implements `CacheInterface.put`.
6072
+ *
6073
+ * @param {string} request The URL of the resource (used as the hash-cache key).
6074
+ * @param {Response} response The response whose body will be written to the cache.
6075
+ * @returns {Promise<void>}
6076
+ */
6077
+ put = async (request, response) => {
6078
+ const hashValue = await this._getFileHash(request);
6079
+ if (hashValue) {
6080
+ const blob = await response.blob();
6081
+ await this._storeBlobInCOS(blob, hashValue);
6082
+ } else {
6083
+ this._processAndStore(request, response.body);
5824
6084
  }
5825
- cache = new FileCache(file_cache_dir ?? env.cacheDir);
5826
- }
5827
- return cache;
5828
- }
5829
- async function tryCache(cache, ...names) {
5830
- for (let name of names) {
6085
+ };
6086
+ /**
6087
+ * Writes a blob into cross-origin storage using the given pre-computed hex hash string.
6088
+ *
6089
+ * @param {Blob} blob
6090
+ * @param {string} hashHex Hex-encoded SHA-256 hash of `blob`.
6091
+ * @returns {Promise<void>}
6092
+ */
6093
+ _storeBlobInCOS = async (blob, hashHex) => {
6094
+ const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashHex)], {
6095
+ create: true
6096
+ });
6097
+ const writableStream = await handle.createWritable();
6098
+ await writableStream.write(blob);
6099
+ await writableStream.close();
6100
+ };
6101
+ /**
6102
+ * Background task for non-LFS resources: consumes `stream`, computes the SHA-256 hash
6103
+ * of the resulting blob, stores it in cross-origin storage, and persists the computed
6104
+ * hash to `HASH_CACHE_NAME` keyed by `request` so future `match` calls can resolve the
6105
+ * file without a network round-trip.
6106
+ *
6107
+ * Called fire-and-forget from `put` — errors are swallowed so failures never surface to
6108
+ * the caller.
6109
+ *
6110
+ * @param {string} request The original resource URL.
6111
+ * @param {ReadableStream} stream The response body stream to consume.
6112
+ * @returns {Promise<void>}
6113
+ */
6114
+ _processAndStore = async (request, stream) => {
5831
6115
  try {
5832
- let result = await cache.match(name);
5833
- if (result) return result;
5834
- } catch (e) {
5835
- continue;
6116
+ const chunks = [];
6117
+ for await (const chunk2 of stream) {
6118
+ chunks.push(chunk2);
6119
+ }
6120
+ const blob = new Blob(chunks);
6121
+ const hashHex = await this._getBlobHash(blob);
6122
+ await this._storeBlobInCOS(blob, hashHex);
6123
+ try {
6124
+ const hashCache = await this._getHashCache();
6125
+ await hashCache.put(request, new Response(hashHex));
6126
+ } catch {
6127
+ }
6128
+ } catch {
5836
6129
  }
5837
- }
5838
- return void 0;
5839
- }
6130
+ };
6131
+ /**
6132
+ * Deletes the cache entry for the given request.
6133
+ *
6134
+ * Removes the hash entry from `HASH_CACHE_NAME`. Note: cross-origin storage itself does not
6135
+ * expose a delete API, so only the local hash mapping is removed. For non-LFS URLs this
6136
+ * permanently prevents `match` from resolving the file. For LFS-backed URLs, `match` will
6137
+ * re-fetch the LFS pointer file on the next call and repopulate the hash cache automatically.
6138
+ *
6139
+ * Implements `CacheInterface.delete`.
6140
+ *
6141
+ * @param {string} request
6142
+ * @returns {Promise<boolean>} Resolves to `true` if the hash entry was deleted, `false` otherwise.
6143
+ */
6144
+ delete = async (request) => {
6145
+ try {
6146
+ const hashCache = await this._getHashCache();
6147
+ return await hashCache.delete(request);
6148
+ } catch {
6149
+ return false;
6150
+ }
6151
+ };
6152
+ /**
6153
+ * Resolves the SHA-256 hash for a given URL.
6154
+ *
6155
+ * Returns the cached hash immediately if one has been persisted to `HASH_CACHE_NAME`.
6156
+ * Otherwise falls back to `_getLfsFileHash` to retrieve the hash from the Hugging Face
6157
+ * LFS pointer file, persisting the result to `HASH_CACHE_NAME` for future lookups.
6158
+ *
6159
+ * Returns `null` if the hash cannot be determined (e.g. non-LFS URL with no cached entry).
6160
+ *
6161
+ * @param {string} url The resource URL to resolve a hash for.
6162
+ * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
6163
+ */
6164
+ _getFileHash = async (url2) => {
6165
+ try {
6166
+ const hashCache = await this._getHashCache();
6167
+ const cached = await hashCache.match(url2);
6168
+ if (cached) {
6169
+ return cached.text();
6170
+ }
6171
+ const hash = await this._getLfsFileHash(url2);
6172
+ if (hash) {
6173
+ await hashCache.put(url2, new Response(hash));
6174
+ return hash;
6175
+ }
6176
+ return null;
6177
+ } catch {
6178
+ return null;
6179
+ }
6180
+ };
6181
+ /**
6182
+ * Attempts to retrieve the SHA-256 hash for a Hugging Face resource URL from its raw
6183
+ * Git LFS pointer file.
6184
+ *
6185
+ * Only applicable to URLs containing `/resolve/` (i.e. Hugging Face resolved file URLs).
6186
+ * The `/resolve/` segment is rewritten to `/raw/` to fetch the LFS pointer directly.
6187
+ * Returns `null` for non-LFS URLs or when the network request fails.
6188
+ *
6189
+ * @see https://huggingface.co/docs/hub/en/storage-backends#xet
6190
+ * @param {string} url The resolved Hugging Face URL of the resource.
6191
+ * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
6192
+ */
6193
+ _getLfsFileHash = async (url2) => {
6194
+ if (!url2.includes("/resolve/")) {
6195
+ return null;
6196
+ }
6197
+ const rawUrl = url2.replace("/resolve/", "/raw/");
6198
+ try {
6199
+ const text = await fetch(rawUrl).then((r) => r.text());
6200
+ const match = text.match(/^oid sha256:([0-9a-f]+)$/m);
6201
+ return match ? match[1] : null;
6202
+ } catch {
6203
+ return null;
6204
+ }
6205
+ };
6206
+ /**
6207
+ * Computes the SHA-256 hash of a `Blob`'s contents.
6208
+ *
6209
+ * @param {Blob} blob The blob to hash.
6210
+ * @returns {Promise<string>} The lowercase hex-encoded SHA-256 hash.
6211
+ */
6212
+ _getBlobHash = async (blob) => {
6213
+ const arrayBuffer = await blob.arrayBuffer();
6214
+ const hashBuffer = await crypto.subtle.digest(HASH_ALGORITHM, arrayBuffer);
6215
+ const hashArray = Array.from(new Uint8Array(hashBuffer));
6216
+ return hashArray.map((byte) => byte.toString(16).padStart(2, "0")).join("");
6217
+ };
6218
+ };
6219
+
6220
+ // src/utils/cache.js
6221
+ async function getCache(file_cache_dir = null) {
6222
+ let cache2 = null;
6223
+ if (env.useCustomCache) {
6224
+ if (!env.customCache) {
6225
+ throw Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
6226
+ }
6227
+ if (!env.customCache.match || !env.customCache.put) {
6228
+ throw new Error(
6229
+ "`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
6230
+ );
6231
+ }
6232
+ cache2 = env.customCache;
6233
+ }
6234
+ if (!cache2 && env.experimental_useCrossOriginStorage && CrossOriginStorage.isAvailable()) {
6235
+ cache2 = new CrossOriginStorage();
6236
+ }
6237
+ if (!cache2 && env.useBrowserCache) {
6238
+ if (typeof caches === "undefined") {
6239
+ throw Error("Browser cache is not available in this environment.");
6240
+ }
6241
+ try {
6242
+ cache2 = await caches.open(env.cacheKey);
6243
+ } catch (e) {
6244
+ logger.warn("An error occurred while opening the browser cache:", e);
6245
+ }
6246
+ }
6247
+ if (!cache2 && env.useFSCache) {
6248
+ if (!apis.IS_FS_AVAILABLE) {
6249
+ throw Error("File System Cache is not available in this environment.");
6250
+ }
6251
+ cache2 = new FileCache(file_cache_dir ?? env.cacheDir);
6252
+ }
6253
+ return cache2;
6254
+ }
6255
+ async function tryCache(cache2, ...names) {
6256
+ for (let name of names) {
6257
+ try {
6258
+ let result = await cache2.match(name);
6259
+ if (result) return result;
6260
+ } catch (e) {
6261
+ continue;
6262
+ }
6263
+ }
6264
+ return void 0;
6265
+ }
6266
+
6267
+ // src/utils/lru_cache.js
6268
+ var LRUCache2 = class {
6269
+ /** @type {number} */
6270
+ #capacity;
6271
+ /** @type {Map<any, any>} */
6272
+ #cache;
6273
+ /**
6274
+ * Creates an LRUCache instance.
6275
+ * @param {number} capacity The maximum number of items the cache can hold.
6276
+ */
6277
+ constructor(capacity) {
6278
+ this.#capacity = capacity;
6279
+ this.#cache = /* @__PURE__ */ new Map();
6280
+ }
6281
+ /**
6282
+ * Retrieves the value associated with the given key and marks the key as recently used.
6283
+ * @param {any} key The key to retrieve.
6284
+ * @returns {any} The value associated with the key, or undefined if the key does not exist.
6285
+ */
6286
+ get(key) {
6287
+ if (!this.#cache.has(key)) return void 0;
6288
+ const value = this.#cache.get(key);
6289
+ this.#cache.delete(key);
6290
+ this.#cache.set(key, value);
6291
+ return value;
6292
+ }
6293
+ /**
6294
+ * Inserts or updates the key-value pair in the cache.
6295
+ * If the key already exists, it is updated and marked as recently used.
6296
+ * If the cache exceeds its capacity, the least recently used item is evicted.
6297
+ * @param {any} key The key to add or update.
6298
+ * @param {any} value The value to associate with the key.
6299
+ */
6300
+ put(key, value) {
6301
+ if (this.#cache.has(key)) {
6302
+ this.#cache.delete(key);
6303
+ }
6304
+ this.#cache.set(key, value);
6305
+ if (this.#cache.size > this.#capacity) {
6306
+ this.#cache.delete(this.#cache.keys().next().value);
6307
+ }
6308
+ }
6309
+ /**
6310
+ * Removes the entry for the given key from the cache.
6311
+ * @param {any} key The key to delete.
6312
+ * @returns {boolean} `true` if the entry existed and was removed, `false` otherwise.
6313
+ */
6314
+ delete(key) {
6315
+ return this.#cache.delete(key);
6316
+ }
6317
+ /**
6318
+ * Clears the cache.
6319
+ */
6320
+ clear() {
6321
+ this.#cache.clear();
6322
+ }
6323
+ };
6324
+
6325
+ // src/utils/memoize_promise.js
6326
+ var MAX_CACHE_SIZE = 100;
6327
+ var cache = new LRUCache2(MAX_CACHE_SIZE);
6328
+ function memoizePromise(key, factory) {
6329
+ const cached = cache.get(key);
6330
+ if (cached !== void 0) {
6331
+ return cached;
6332
+ }
6333
+ const promise = factory().then(
6334
+ (value) => value,
6335
+ (err) => {
6336
+ cache.delete(key);
6337
+ return Promise.reject(err);
6338
+ }
6339
+ );
6340
+ cache.put(key, promise);
6341
+ return promise;
6342
+ }
5840
6343
 
5841
6344
  // src/utils/model_registry/get_file_metadata.js
5842
6345
  async function fetch_file_head(urlOrPath) {
@@ -5845,17 +6348,27 @@ async function fetch_file_head(urlOrPath) {
5845
6348
  }
5846
6349
  const headers = getFetchHeaders(urlOrPath);
5847
6350
  headers.set("Range", "bytes=0-0");
5848
- return env.fetch(urlOrPath, { method: "GET", headers });
6351
+ return env.fetch(urlOrPath, { method: "GET", headers, cache: "no-store" });
6352
+ }
6353
+ function get_file_metadata(path_or_repo_id, filename, options = {}) {
6354
+ const key = JSON.stringify([
6355
+ path_or_repo_id,
6356
+ filename,
6357
+ options?.revision,
6358
+ options?.cache_dir,
6359
+ options?.local_files_only
6360
+ ]);
6361
+ return memoizePromise(key, () => _get_file_metadata(path_or_repo_id, filename, options));
5849
6362
  }
5850
- async function get_file_metadata(path_or_repo_id, filename, options = {}) {
5851
- const cache = await getCache(options?.cache_dir);
6363
+ async function _get_file_metadata(path_or_repo_id, filename, options) {
6364
+ const cache2 = await getCache(options?.cache_dir);
5852
6365
  const { localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
5853
6366
  path_or_repo_id,
5854
6367
  filename,
5855
6368
  options,
5856
- cache
6369
+ cache2
5857
6370
  );
5858
- const cachedResponse = await checkCachedResource(cache, localPath, proposedCacheKey);
6371
+ const cachedResponse = await checkCachedResource(cache2, localPath, proposedCacheKey);
5859
6372
  if (cachedResponse !== void 0 && typeof cachedResponse !== "string") {
5860
6373
  const size = cachedResponse.headers.get("content-length");
5861
6374
  const contentType = cachedResponse.headers.get("content-type");
@@ -5953,7 +6466,7 @@ function getFetchHeaders(urlOrPath) {
5953
6466
  }
5954
6467
  return headers;
5955
6468
  }
5956
- function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = null) {
6469
+ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache2 = null) {
5957
6470
  const revision = options.revision ?? "main";
5958
6471
  const requestURL = pathJoin(path_or_repo_id, filename);
5959
6472
  const validModelId = isValidHfModelId(path_or_repo_id);
@@ -5963,7 +6476,7 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
5963
6476
  env.remotePathTemplate.replaceAll("{model}", path_or_repo_id).replaceAll("{revision}", encodeURIComponent(revision)),
5964
6477
  filename
5965
6478
  );
5966
- const proposedCacheKey = cache instanceof FileCache ? (
6479
+ const proposedCacheKey = cache2 instanceof FileCache ? (
5967
6480
  // Choose cache key for filesystem cache
5968
6481
  // When using the main revision (default), we use the request URL as the cache key.
5969
6482
  // If a specific revision is requested, we account for this in the cache key.
@@ -5977,14 +6490,14 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
5977
6490
  validModelId
5978
6491
  };
5979
6492
  }
5980
- async function checkCachedResource(cache, localPath, proposedCacheKey) {
5981
- if (!cache) {
6493
+ async function checkCachedResource(cache2, localPath, proposedCacheKey) {
6494
+ if (!cache2) {
5982
6495
  return void 0;
5983
6496
  }
5984
- return await tryCache(cache, localPath, proposedCacheKey);
6497
+ return await tryCache(cache2, localPath, proposedCacheKey);
5985
6498
  }
5986
- async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, response, result, options = {}) {
5987
- if (await cache.match(cacheKey) !== void 0) {
6499
+ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options = {}) {
6500
+ if (await cache2.match(cacheKey) !== void 0) {
5988
6501
  return;
5989
6502
  }
5990
6503
  if (!result) {
@@ -5994,14 +6507,14 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
5994
6507
  file: filename,
5995
6508
  ...data
5996
6509
  }) : void 0;
5997
- await cache.put(
6510
+ await cache2.put(
5998
6511
  cacheKey,
5999
6512
  /** @type {Response} */
6000
6513
  response,
6001
6514
  wrapped_progress
6002
6515
  );
6003
6516
  } else if (typeof response !== "string") {
6004
- await cache.put(
6517
+ await cache2.put(
6005
6518
  cacheKey,
6006
6519
  new Response(
6007
6520
  /** @type {any} */
@@ -6015,17 +6528,17 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
6015
6528
  });
6016
6529
  }
6017
6530
  }
6018
- async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache = null) {
6531
+ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache2 = null) {
6019
6532
  const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
6020
6533
  path_or_repo_id,
6021
6534
  filename,
6022
6535
  options,
6023
- cache
6536
+ cache2
6024
6537
  );
6025
6538
  let cacheKey;
6026
6539
  let toCacheResponse = false;
6027
6540
  let response;
6028
- response = await checkCachedResource(cache, localPath, proposedCacheKey);
6541
+ response = await checkCachedResource(cache2, localPath, proposedCacheKey);
6029
6542
  const cacheHit = response !== void 0;
6030
6543
  if (!cacheHit) {
6031
6544
  if (env.allowLocalModels) {
@@ -6066,7 +6579,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
6066
6579
  }
6067
6580
  cacheKey = proposedCacheKey;
6068
6581
  }
6069
- toCacheResponse = cache && // 1. A caching system is available
6582
+ toCacheResponse = cache2 && // 1. A caching system is available
6070
6583
  typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment)
6071
6584
  response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`)
6072
6585
  response.status === 200;
@@ -6128,7 +6641,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
6128
6641
  // i.e., do not cache FileResponses (prevents duplication)
6129
6642
  toCacheResponse && cacheKey && typeof response !== "string"
6130
6643
  ) {
6131
- await storeCachedResource(path_or_repo_id, filename, cache, cacheKey, response, result, options);
6644
+ await storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options);
6132
6645
  }
6133
6646
  dispatchCallback(options.progress_callback, {
6134
6647
  status: "done",
@@ -6144,7 +6657,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
6144
6657
  if (response instanceof FileResponse) {
6145
6658
  return response.filePath;
6146
6659
  }
6147
- const cachedResponse = await cache?.match(cacheKey);
6660
+ const cachedResponse = await cache2?.match(cacheKey);
6148
6661
  if (cachedResponse instanceof FileResponse) {
6149
6662
  return cachedResponse.filePath;
6150
6663
  } else if (cachedResponse instanceof Response) {
@@ -6171,8 +6684,8 @@ async function getModelFile(path_or_repo_id, filename, fatal = true, options = {
6171
6684
  name: path_or_repo_id,
6172
6685
  file: filename
6173
6686
  });
6174
- const cache = await getCache(options?.cache_dir);
6175
- return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache);
6687
+ const cache2 = await getCache(options?.cache_dir);
6688
+ return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache2);
6176
6689
  }
6177
6690
  async function getModelText(modelPath, fileName, fatal = true, options = {}) {
6178
6691
  const buffer = await getModelFile(modelPath, fileName, fatal, options, false);
@@ -6965,7 +7478,7 @@ var uint16_to_float32 = /* @__PURE__ */ (function() {
6965
7478
  // src/backends/onnx.js
6966
7479
  import * as ONNX_NODE from "onnxruntime-node";
6967
7480
 
6968
- // ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260228-6e72d31970/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
7481
+ // ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260307-d626b568e0/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
6969
7482
  var ort_webgpu_bundle_min_exports = {};
6970
7483
  __export(ort_webgpu_bundle_min_exports, {
6971
7484
  InferenceSession: () => Jf,
@@ -7733,7 +8246,7 @@ async function ts(a = {}) {
7733
8246
  throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
7734
8247
  }
7735
8248
  function Ye() {
7736
- return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, H: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, g: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, I: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, h: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
8249
+ return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
7737
8250
  }
7738
8251
  async function bt() {
7739
8252
  function e(o, u) {
@@ -8920,7 +9433,7 @@ async function ts(a = {}) {
8920
9433
  Te(`invalid type for getValue: ${t}`);
8921
9434
  }
8922
9435
  }, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
8923
- var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 922732: (e, t, n, o, u) => {
9436
+ var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 925676: (e, t, n, o, u) => {
8924
9437
  if (r === void 0 || !r.Uc) return 1;
8925
9438
  if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
8926
9439
  if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
@@ -8940,11 +9453,11 @@ async function ts(a = {}) {
8940
9453
  } catch {
8941
9454
  return 4;
8942
9455
  }
8943
- }, 923556: (e, t, n) => {
9456
+ }, 926500: (e, t, n) => {
8944
9457
  r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
8945
- }, 923620: () => r.me(), 923662: (e) => {
9458
+ }, 926564: () => r.me(), 926606: (e) => {
8946
9459
  r.jd(e);
8947
- }, 923699: () => typeof wasmOffsetConverter < "u" };
9460
+ }, 926643: () => typeof wasmOffsetConverter < "u" };
8948
9461
  function af(e, t, n, o) {
8949
9462
  var u = P();
8950
9463
  try {
@@ -9375,7 +9888,7 @@ var tc;
9375
9888
  var us;
9376
9889
  var rc;
9377
9890
  var os;
9378
- var fs3;
9891
+ var fs4;
9379
9892
  var as;
9380
9893
  var cs;
9381
9894
  var Yt = k(() => {
@@ -9412,7 +9925,7 @@ var Yt = k(() => {
9412
9925
  /*webpackIgnore:true*/
9413
9926
  /*@vite-ignore*/
9414
9927
  a
9415
- )).default, os = (es(), $t(Ka)).default, fs3 = async () => {
9928
+ )).default, os = (es(), $t(Ka)).default, fs4 = async () => {
9416
9929
  if (!ge) throw new Error("Failed to load proxy worker: cannot determine the script source URL.");
9417
9930
  if (en(ge)) return [void 0, os()];
9418
9931
  let a = await us(ge);
@@ -10688,7 +11201,7 @@ var mn = k(() => {
10688
11201
  if (Mt) throw new Error("multiple calls to 'initWasm()' detected.");
10689
11202
  if (lr) throw new Error("previous call to 'initWasm()' failed.");
10690
11203
  if (Mt = true, ut()) return new Promise((a, r) => {
10691
- Ee?.terminate(), fs3().then(([s, f]) => {
11204
+ Ee?.terminate(), fs4().then(([s, f]) => {
10692
11205
  try {
10693
11206
  Ee = f, Ee.onerror = (d) => r(d), Ee.onmessage = gc, ln = [a, r];
10694
11207
  let i = { type: "init-wasm", in: K };
@@ -10860,7 +11373,7 @@ var $s = k(() => {
10860
11373
  Ve();
10861
11374
  Ve();
10862
11375
  Ve();
10863
- var Xa = "1.25.0-dev.20260228-6e72d31970";
11376
+ var Xa = "1.25.0-dev.20260307-d626b568e0";
10864
11377
  var Tl = Zr;
10865
11378
  {
10866
11379
  let a = ($s(), $t(Gs)).wasmBackend;
@@ -10871,11 +11384,11 @@ Object.defineProperty(K.versions, "web", { value: Xa, enumerable: true });
10871
11384
  // src/backends/utils/cacheWasm.js
10872
11385
  async function loadAndCacheFile(url2) {
10873
11386
  const fileName = url2.split("/").pop();
10874
- let cache;
11387
+ let cache2;
10875
11388
  try {
10876
- cache = await getCache();
10877
- if (cache) {
10878
- const result = await cache.match(url2);
11389
+ cache2 = await getCache();
11390
+ if (cache2) {
11391
+ const result = await cache2.match(url2);
10879
11392
  if (result) {
10880
11393
  return result;
10881
11394
  }
@@ -10887,9 +11400,9 @@ async function loadAndCacheFile(url2) {
10887
11400
  if (!response.ok) {
10888
11401
  throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
10889
11402
  }
10890
- if (cache) {
11403
+ if (cache2) {
10891
11404
  try {
10892
- await cache.put(url2, response.clone());
11405
+ await cache2.put(url2, response.clone());
10893
11406
  } catch (e) {
10894
11407
  logger.warn(`Failed to cache ${fileName}:`, e);
10895
11408
  }
@@ -10907,34 +11420,21 @@ async function loadWasmBinary(wasmURL) {
10907
11420
  }
10908
11421
  }
10909
11422
  async function loadWasmFactory(libURL) {
11423
+ if (apis.IS_SERVICE_WORKER_ENV || apis.IS_CHROME_AVAILABLE) {
11424
+ return libURL;
11425
+ }
10910
11426
  const response = await loadAndCacheFile(libURL);
10911
11427
  if (!response || typeof response === "string") return null;
10912
11428
  try {
10913
11429
  let code = await response.text();
10914
- const baseUrl = libURL.split("/").slice(0, -1).join("/");
10915
- code = code.replaceAll("import.meta.url", `"${baseUrl}"`);
10916
11430
  code = code.replaceAll("globalThis.process?.versions?.node", "false");
10917
11431
  const blob = new Blob([code], { type: "text/javascript" });
10918
11432
  return URL.createObjectURL(blob);
10919
11433
  } catch (error) {
10920
- logger.warn("Failed to read WASM binary:", error);
11434
+ logger.warn("Failed to read WASM factory:", error);
10921
11435
  return null;
10922
11436
  }
10923
11437
  }
10924
- function isBlobURL(url2) {
10925
- return isValidUrl(url2, ["blob:"]);
10926
- }
10927
- function toAbsoluteURL(url2) {
10928
- let baseURL;
10929
- if (typeof location !== "undefined" && location.href) {
10930
- baseURL = location.href;
10931
- } else if (typeof import.meta !== "undefined" && import.meta.url) {
10932
- baseURL = import.meta.url;
10933
- } else {
10934
- return url2;
10935
- }
10936
- return new URL(url2, baseURL).href;
10937
- }
10938
11438
 
10939
11439
  // src/backends/onnx.js
10940
11440
  import { Tensor } from "onnxruntime-common";
@@ -11033,7 +11533,6 @@ function deviceToExecutionProviders(device = null) {
11033
11533
  }
11034
11534
  throw new Error(`Unsupported device: "${device}". Should be one of: ${supportedDevices.join(", ")}.`);
11035
11535
  }
11036
- var IS_WEB_ENV = apis.IS_BROWSER_ENV || apis.IS_WEBWORKER_ENV;
11037
11536
  var webInitChain = Promise.resolve();
11038
11537
  var wasmLoadPromise = null;
11039
11538
  async function ensureWasmLoaded() {
@@ -11042,6 +11541,11 @@ async function ensureWasmLoaded() {
11042
11541
  }
11043
11542
  const shouldUseWasmCache = env.useWasmCache && typeof ONNX_ENV?.wasm?.wasmPaths === "object" && ONNX_ENV?.wasm?.wasmPaths?.wasm && ONNX_ENV?.wasm?.wasmPaths?.mjs;
11044
11543
  if (!shouldUseWasmCache) {
11544
+ if (apis.IS_DENO_WEB_RUNTIME) {
11545
+ throw new Error(
11546
+ "env.useWasmCache=false is not supported in Deno's web runtime. Remove the useWasmCache override."
11547
+ );
11548
+ }
11045
11549
  wasmLoadPromise = Promise.resolve();
11046
11550
  return wasmLoadPromise;
11047
11551
  }
@@ -11050,6 +11554,7 @@ async function ensureWasmLoaded() {
11050
11554
  /** @type {{ wasm: string, mjs: string }} */
11051
11555
  ONNX_ENV.wasm.wasmPaths
11052
11556
  );
11557
+ let wasmBinaryLoaded = false;
11053
11558
  await Promise.all([
11054
11559
  // Load and cache the WASM binary
11055
11560
  urls.wasm && !isBlobURL(urls.wasm) ? (async () => {
@@ -11057,12 +11562,13 @@ async function ensureWasmLoaded() {
11057
11562
  const wasmBinary = await loadWasmBinary(toAbsoluteURL(urls.wasm));
11058
11563
  if (wasmBinary) {
11059
11564
  ONNX_ENV.wasm.wasmBinary = wasmBinary;
11565
+ wasmBinaryLoaded = true;
11060
11566
  }
11061
11567
  } catch (err) {
11062
11568
  logger.warn("Failed to pre-load WASM binary:", err);
11063
11569
  }
11064
11570
  })() : Promise.resolve(),
11065
- // Load and cache the WASM factory
11571
+ // Load and cache the WASM factory as a blob URL
11066
11572
  urls.mjs && !isBlobURL(urls.mjs) ? (async () => {
11067
11573
  try {
11068
11574
  const wasmFactoryBlob = await loadWasmFactory(toAbsoluteURL(urls.mjs));
@@ -11074,6 +11580,9 @@ async function ensureWasmLoaded() {
11074
11580
  }
11075
11581
  })() : Promise.resolve()
11076
11582
  ]);
11583
+ if (!wasmBinaryLoaded) {
11584
+ ONNX_ENV.wasm.wasmPaths.mjs = urls.mjs;
11585
+ }
11077
11586
  })();
11078
11587
  return wasmLoadPromise;
11079
11588
  }
@@ -11085,51 +11594,52 @@ async function createInferenceSession(buffer_or_path, session_options, session_c
11085
11594
  logSeverityLevel,
11086
11595
  ...session_options
11087
11596
  });
11088
- const session = await (IS_WEB_ENV ? webInitChain = webInitChain.then(load) : load());
11597
+ const session = await (apis.IS_WEB_ENV ? webInitChain = webInitChain.then(load) : load());
11089
11598
  session.config = session_config;
11090
11599
  return session;
11091
11600
  }
11092
11601
  var webInferenceChain = Promise.resolve();
11093
11602
  async function runInferenceSession(session, ortFeed) {
11094
11603
  const run = () => session.run(ortFeed);
11095
- const output = await (IS_WEB_ENV ? webInferenceChain = webInferenceChain.then(run) : run());
11096
- return output;
11604
+ return apis.IS_WEB_ENV ? webInferenceChain = webInferenceChain.then(run) : run();
11097
11605
  }
11098
11606
  function isONNXTensor(x) {
11099
11607
  return x instanceof ONNX.Tensor;
11100
11608
  }
11101
11609
  var ONNX_ENV = ONNX?.env;
11102
- if (ONNX_ENV?.wasm) {
11103
- if (
11104
- // @ts-ignore Cannot find name 'ServiceWorkerGlobalScope'.ts(2304)
11105
- !(typeof ServiceWorkerGlobalScope !== "undefined" && self instanceof ServiceWorkerGlobalScope) && ONNX_ENV.versions?.web && !ONNX_ENV.wasm.wasmPaths
11106
- ) {
11107
- const wasmPathPrefix = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${ONNX_ENV.versions.web}/dist/`;
11108
- ONNX_ENV.wasm.wasmPaths = apis.IS_SAFARI ? {
11109
- mjs: `${wasmPathPrefix}ort-wasm-simd-threaded.mjs`,
11110
- wasm: `${wasmPathPrefix}ort-wasm-simd-threaded.wasm`
11111
- } : {
11112
- mjs: `${wasmPathPrefix}ort-wasm-simd-threaded.asyncify.mjs`,
11113
- wasm: `${wasmPathPrefix}ort-wasm-simd-threaded.asyncify.wasm`
11114
- };
11115
- }
11116
- ONNX_ENV.wasm.proxy = false;
11117
- }
11118
- if (ONNX_ENV?.webgpu) {
11119
- ONNX_ENV.webgpu.powerPreference = "high-performance";
11120
- }
11121
11610
  function isONNXProxy() {
11122
11611
  return ONNX_ENV?.wasm?.proxy;
11123
11612
  }
11124
- function setLogLevel(logLevel2) {
11125
- const severityLevel = getOnnxLogSeverityLevel(logLevel2);
11126
- ONNX_ENV.logLevel = ONNX_LOG_LEVEL_NAMES[severityLevel];
11613
+ if (ONNX_ENV) {
11614
+ let setLogLevel = function(logLevel2) {
11615
+ const severityLevel = getOnnxLogSeverityLevel(logLevel2);
11616
+ ONNX_ENV.logLevel = ONNX_LOG_LEVEL_NAMES[severityLevel];
11617
+ };
11618
+ if (ONNX_ENV.wasm) {
11619
+ if (
11620
+ // @ts-ignore Cannot find name 'ServiceWorkerGlobalScope'.ts(2304)
11621
+ !(typeof ServiceWorkerGlobalScope !== "undefined" && self instanceof ServiceWorkerGlobalScope) && ONNX_ENV.versions?.web && !ONNX_ENV.wasm.wasmPaths
11622
+ ) {
11623
+ const wasmPathPrefix = `https://cdn.jsdelivr.net/npm/onnxruntime-web@${ONNX_ENV.versions.web}/dist/`;
11624
+ ONNX_ENV.wasm.wasmPaths = apis.IS_SAFARI ? {
11625
+ mjs: `${wasmPathPrefix}ort-wasm-simd-threaded.mjs`,
11626
+ wasm: `${wasmPathPrefix}ort-wasm-simd-threaded.wasm`
11627
+ } : {
11628
+ mjs: `${wasmPathPrefix}ort-wasm-simd-threaded.asyncify.mjs`,
11629
+ wasm: `${wasmPathPrefix}ort-wasm-simd-threaded.asyncify.wasm`
11630
+ };
11631
+ }
11632
+ ONNX_ENV.wasm.proxy = false;
11633
+ }
11634
+ if (ONNX_ENV.webgpu) {
11635
+ ONNX_ENV.webgpu.powerPreference = "high-performance";
11636
+ }
11637
+ setLogLevel(env.logLevel ?? LogLevel.WARNING);
11638
+ env.backends.onnx = {
11639
+ ...ONNX_ENV,
11640
+ setLogLevel
11641
+ };
11127
11642
  }
11128
- setLogLevel(env.logLevel ?? LogLevel.WARNING);
11129
- env.backends.onnx = {
11130
- ...ONNX_ENV,
11131
- setLogLevel
11132
- };
11133
11643
 
11134
11644
  // src/ops/registry.js
11135
11645
  var wrap = async (session_bytes, session_options, names) => {
@@ -12334,199 +12844,38 @@ var DataTypeMap = Object.freeze({
12334
12844
  int4: Int8Array
12335
12845
  });
12336
12846
 
12337
- // src/utils/random.js
12338
- var Random = class {
12339
- constructor(seed) {
12340
- this._mt = new Uint32Array(624);
12341
- this._idx = 625;
12342
- this._gauss_next = null;
12343
- this._random_fn = this.random.bind(this);
12344
- this.seed(seed);
12847
+ // src/utils/tensor.js
12848
+ var Tensor2 = class _Tensor {
12849
+ /**
12850
+ * Dimensions of the tensor.
12851
+ * @type {number[]}
12852
+ */
12853
+ get dims() {
12854
+ return this.ort_tensor.dims;
12855
+ }
12856
+ set dims(value) {
12857
+ this.ort_tensor.dims = value;
12345
12858
  }
12346
12859
  /**
12347
- * Seeds this instance's PRNG.
12348
- *
12349
- * When called with a number, initializes the state deterministically from that value.
12350
- * When called with no arguments (or `undefined`/`null`), seeds from OS entropy
12351
- * via `crypto.getRandomValues`, matching Python's `random.seed()` behaviour.
12352
- *
12353
- * @param {number} [n] The seed value. Omit to seed from OS entropy.
12860
+ * Type of the tensor.
12861
+ * @type {DataType}
12354
12862
  */
12355
- seed(n) {
12356
- if (n === void 0 || n === null) {
12357
- if (apis.IS_CRYPTO_AVAILABLE) {
12358
- const buf = new Uint32Array(1);
12359
- crypto.getRandomValues(buf);
12360
- n = buf[0];
12361
- } else {
12362
- n = Date.now() >>> 0;
12363
- }
12364
- }
12365
- const mt2 = this._mt;
12366
- const u = (a, b) => Math.imul(a, b) >>> 0, key = [];
12367
- for (let v = n || 0; v > 0; v = Math.floor(v / 4294967296)) key.push(v & 4294967295);
12368
- if (!key.length) key.push(0);
12369
- mt2[0] = 19650218;
12370
- for (let k2 = 1; k2 < 624; ++k2) mt2[k2] = u(1812433253, mt2[k2 - 1] ^ mt2[k2 - 1] >>> 30) + k2 >>> 0;
12371
- let i = 1, j = 0;
12372
- for (let k2 = Math.max(624, key.length); k2 > 0; --k2, ++i, ++j) {
12373
- if (i >= 624) {
12374
- mt2[0] = mt2[623];
12375
- i = 1;
12376
- }
12377
- if (j >= key.length) j = 0;
12378
- mt2[i] = (mt2[i] ^ u(mt2[i - 1] ^ mt2[i - 1] >>> 30, 1664525)) + key[j] + j >>> 0;
12379
- }
12380
- for (let k2 = 623; k2 > 0; --k2, ++i) {
12381
- if (i >= 624) {
12382
- mt2[0] = mt2[623];
12383
- i = 1;
12384
- }
12385
- mt2[i] = (mt2[i] ^ u(mt2[i - 1] ^ mt2[i - 1] >>> 30, 1566083941)) - i >>> 0;
12386
- }
12387
- mt2[0] = 2147483648;
12388
- this._idx = 624;
12389
- this._gauss_next = null;
12863
+ get type() {
12864
+ return this.ort_tensor.type;
12390
12865
  }
12391
12866
  /**
12392
- * Generates a random unsigned 32-bit integer.
12393
- *
12394
- * Performs the "twist" step when the state buffer is exhausted,
12395
- * then applies the standard MT19937 tempering transform.
12396
- *
12397
- * @returns {number} A random integer in the range [0, 2^32 - 1].
12867
+ * The data stored in the tensor.
12868
+ * @type {DataArray}
12398
12869
  */
12399
- _int32() {
12400
- const mt2 = this._mt;
12401
- if (this._idx >= 624) {
12402
- for (let k2 = 0; k2 < 624; ++k2) {
12403
- const y2 = mt2[k2] & 2147483648 | mt2[(k2 + 1) % 624] & 2147483647;
12404
- mt2[k2] = (mt2[(k2 + 397) % 624] ^ y2 >>> 1 ^ (y2 & 1 ? 2567483615 : 0)) >>> 0;
12405
- }
12406
- this._idx = 0;
12407
- }
12408
- let y = mt2[this._idx++];
12409
- y ^= y >>> 11;
12410
- y ^= y << 7 & 2636928640;
12411
- y ^= y << 15 & 4022730752;
12412
- y ^= y >>> 18;
12413
- return y >>> 0;
12870
+ get data() {
12871
+ return this.ort_tensor.data;
12414
12872
  }
12415
12873
  /**
12416
- * Generates a random floating-point number in the half-open interval [0, 1).
12417
- *
12418
- * Combines two 32-bit integers (using 53 bits of precision) to produce
12419
- * a uniformly distributed double, matching Python's `random.random()`.
12420
- *
12421
- * @returns {number} A random float in [0, 1).
12874
+ * The number of elements in the tensor.
12875
+ * @type {number}
12422
12876
  */
12423
- random() {
12424
- return ((this._int32() >>> 5) * 67108864 + (this._int32() >>> 6)) / 9007199254740992;
12425
- }
12426
- /**
12427
- * Generates a random number from a Gaussian (normal) distribution.
12428
- *
12429
- * Uses the Box-Muller transform with a cached spare value,
12430
- * matching Python's `random.gauss()` output for the same seed.
12431
- *
12432
- * @param {number} [mu=0] The mean of the distribution.
12433
- * @param {number} [sigma=1] The standard deviation of the distribution.
12434
- * @returns {number} A normally distributed random value.
12435
- */
12436
- gauss(mu = 0, sigma = 1) {
12437
- let z = this._gauss_next;
12438
- this._gauss_next = null;
12439
- if (z === null) {
12440
- const x2pi = this.random() * 2 * Math.PI, g2rad = Math.sqrt(-2 * Math.log(1 - this.random()));
12441
- z = Math.cos(x2pi) * g2rad;
12442
- this._gauss_next = Math.sin(x2pi) * g2rad;
12443
- }
12444
- return mu + z * sigma;
12445
- }
12446
- /**
12447
- * Shuffles an array in-place using the Fisher-Yates algorithm.
12448
- *
12449
- * Uses rejection sampling via `getrandbits`-style bit masking to ensure
12450
- * a uniform distribution, matching Python's `random.shuffle()`.
12451
- *
12452
- * @param {any[]} arr The array to shuffle in-place.
12453
- */
12454
- shuffle(arr) {
12455
- for (let i = arr.length - 1; i > 0; --i) {
12456
- const k2 = 32 - Math.clz32(i + 1);
12457
- let r = this._int32() >>> 32 - k2;
12458
- while (r > i) r = this._int32() >>> 32 - k2;
12459
- const t = arr[i];
12460
- arr[i] = arr[r];
12461
- arr[r] = t;
12462
- }
12463
- }
12464
- /**
12465
- * Selects a single element from a weighted population.
12466
- *
12467
- * Matches Python's `random.choices(population, weights=weights, k=1)[0]`
12468
- *
12469
- * @param {any[]} population The array of items to choose from.
12470
- * @param {number[]} weights An array of non-negative weights, one per population element.
12471
- * @returns {*} A single randomly selected element from the population.
12472
- */
12473
- choices(population, weights) {
12474
- return population[_weightedIndexWith(this._random_fn, weights)];
12475
- }
12476
- };
12477
- function _weightedIndexWith(randomFn, weights) {
12478
- let sum = 0;
12479
- for (let i = 0; i < weights.length; ++i) sum += weights[i];
12480
- let x = randomFn() * sum;
12481
- for (let i = 0; i < weights.length; ++i) {
12482
- x -= weights[i];
12483
- if (x < 0) return i;
12484
- }
12485
- return weights.length - 1;
12486
- }
12487
- var _default = new Random();
12488
- var random = Object.freeze({
12489
- Random,
12490
- seed: _default.seed.bind(_default),
12491
- random: _default.random.bind(_default),
12492
- gauss: _default.gauss.bind(_default),
12493
- shuffle: _default.shuffle.bind(_default),
12494
- choices: _default.choices.bind(_default)
12495
- });
12496
- var _weightedIndex = (weights) => _weightedIndexWith(random.random, weights);
12497
-
12498
- // src/utils/tensor.js
12499
- var Tensor2 = class _Tensor {
12500
- /**
12501
- * Dimensions of the tensor.
12502
- * @type {number[]}
12503
- */
12504
- get dims() {
12505
- return this.ort_tensor.dims;
12506
- }
12507
- set dims(value) {
12508
- this.ort_tensor.dims = value;
12509
- }
12510
- /**
12511
- * Type of the tensor.
12512
- * @type {DataType}
12513
- */
12514
- get type() {
12515
- return this.ort_tensor.type;
12516
- }
12517
- /**
12518
- * The data stored in the tensor.
12519
- * @type {DataArray}
12520
- */
12521
- get data() {
12522
- return this.ort_tensor.data;
12523
- }
12524
- /**
12525
- * The number of elements in the tensor.
12526
- * @type {number}
12527
- */
12528
- get size() {
12529
- return this.ort_tensor.size;
12877
+ get size() {
12878
+ return this.ort_tensor.size;
12530
12879
  }
12531
12880
  /**
12532
12881
  * The location of the tensor data.
@@ -12905,9 +13254,23 @@ var Tensor2 = class _Tensor {
12905
13254
  throw Error(`Unsupported norm: ${p}`);
12906
13255
  }
12907
13256
  const this_data = this.data;
12908
- const fn2 = (a, b) => a + b ** p;
13257
+ const is_bigint = this_data instanceof BigInt64Array || this_data instanceof BigUint64Array;
13258
+ if (is_bigint && p !== 1) {
13259
+ throw Error(`Expected a floating point tensor as input. Got ${this.type}`);
13260
+ }
13261
+ let fn2, zero;
13262
+ if (is_bigint) {
13263
+ fn2 = (a, b) => a + b;
13264
+ zero = 0n;
13265
+ } else {
13266
+ fn2 = (a, b) => a + b ** p;
13267
+ zero = 0;
13268
+ }
12909
13269
  if (dim === null) {
12910
- const val = this_data.reduce(fn2, 0) ** (1 / p);
13270
+ let val = this_data.reduce(fn2, zero);
13271
+ if (p !== 1) {
13272
+ val = val ** (1 / p);
13273
+ }
12911
13274
  return new _Tensor(this.type, [val], []);
12912
13275
  }
12913
13276
  const [type, result, resultDims] = reduce_helper(fn2, this, dim, keepdim);
@@ -15367,9 +15730,11 @@ __export(processors_exports, {
15367
15730
  ChatterboxProcessor: () => ChatterboxProcessor,
15368
15731
  Florence2Processor: () => Florence2Processor,
15369
15732
  Gemma3nProcessor: () => Gemma3nProcessor,
15733
+ GraniteSpeechProcessor: () => GraniteSpeechProcessor,
15370
15734
  GroundingDinoProcessor: () => GroundingDinoProcessor,
15371
15735
  Idefics3Processor: () => Idefics3Processor,
15372
15736
  JinaCLIPProcessor: () => JinaCLIPProcessor,
15737
+ Lfm2VlProcessor: () => Lfm2VlProcessor,
15373
15738
  LlavaProcessor: () => LlavaProcessor,
15374
15739
  MgpstrProcessor: () => MgpstrProcessor,
15375
15740
  MoonshineProcessor: () => MoonshineProcessor,
@@ -15390,6 +15755,7 @@ __export(processors_exports, {
15390
15755
  UltravoxProcessor: () => UltravoxProcessor,
15391
15756
  VLChatProcessor: () => VLChatProcessor,
15392
15757
  VoxtralProcessor: () => VoxtralProcessor,
15758
+ VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
15393
15759
  Wav2Vec2Processor: () => Wav2Vec2Processor,
15394
15760
  Wav2Vec2ProcessorWithLM: () => Wav2Vec2ProcessorWithLM,
15395
15761
  WhisperProcessor: () => WhisperProcessor
@@ -15444,19 +15810,21 @@ __export(feature_extractors_exports, {
15444
15810
  EncodecFeatureExtractor: () => EncodecFeatureExtractor,
15445
15811
  FeatureExtractor: () => FeatureExtractor,
15446
15812
  Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
15813
+ GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
15447
15814
  MoonshineFeatureExtractor: () => MoonshineFeatureExtractor,
15448
15815
  ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
15449
15816
  PyAnnoteFeatureExtractor: () => PyAnnoteFeatureExtractor,
15450
15817
  SeamlessM4TFeatureExtractor: () => SeamlessM4TFeatureExtractor,
15451
15818
  SnacFeatureExtractor: () => SnacFeatureExtractor,
15452
15819
  SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
15820
+ VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
15453
15821
  Wav2Vec2FeatureExtractor: () => Wav2Vec2FeatureExtractor,
15454
15822
  WeSpeakerFeatureExtractor: () => WeSpeakerFeatureExtractor,
15455
15823
  WhisperFeatureExtractor: () => WhisperFeatureExtractor
15456
15824
  });
15457
15825
 
15458
15826
  // src/utils/io.js
15459
- import fs4 from "fs";
15827
+ import fs5 from "fs";
15460
15828
  import { Readable } from "stream";
15461
15829
  import { pipeline as pipe } from "stream/promises";
15462
15830
  async function saveBlob(path3, blob) {
@@ -15474,7 +15842,7 @@ async function saveBlob(path3, blob) {
15474
15842
  } else if (apis.IS_FS_AVAILABLE) {
15475
15843
  const webStream = blob.stream();
15476
15844
  const nodeStream = Readable.fromWeb(webStream);
15477
- const fileStream = fs4.createWriteStream(path3);
15845
+ const fileStream = fs5.createWriteStream(path3);
15478
15846
  await pipe(nodeStream, fileStream);
15479
15847
  } else {
15480
15848
  throw new Error("Unable to save because filesystem is disabled in this environment.");
@@ -15677,6 +16045,7 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
15677
16045
  mel_filters = null,
15678
16046
  mel_floor = 1e-10,
15679
16047
  log_mel = null,
16048
+ max_log_mel = null,
15680
16049
  reference = 1,
15681
16050
  min_value = 1e-10,
15682
16051
  db_range = null,
@@ -15816,6 +16185,17 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
15816
16185
  mel_spec_data[i] = Math.log10(mel_spec_data[i]);
15817
16186
  }
15818
16187
  break;
16188
+ case "log10_max_norm": {
16189
+ for (let i = 0; i < o; ++i) {
16190
+ mel_spec_data[i] = Math.log10(mel_spec_data[i]);
16191
+ }
16192
+ const logMax = max_log_mel ?? max(mel_spec_data)[0];
16193
+ const threshold = logMax - 8;
16194
+ for (let i = 0; i < o; ++i) {
16195
+ mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4) / 4;
16196
+ }
16197
+ break;
16198
+ }
15819
16199
  case "dB":
15820
16200
  if (power === 1) {
15821
16201
  amplitude_to_db(mel_spec_data, reference, min_value, db_range);
@@ -15826,7 +16206,9 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
15826
16206
  }
15827
16207
  break;
15828
16208
  default:
15829
- throw new Error(`log_mel must be one of null, 'log', 'log10' or 'dB'. Got '${log_mel}'`);
16209
+ throw new Error(
16210
+ `log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`
16211
+ );
15830
16212
  }
15831
16213
  }
15832
16214
  return mel_spec;
@@ -16331,6 +16713,56 @@ var Gemma3nAudioFeatureExtractor = class extends FeatureExtractor {
16331
16713
  }
16332
16714
  };
16333
16715
 
16716
+ // src/models/granite_speech/feature_extraction_granite_speech.js
16717
+ var GraniteSpeechFeatureExtractor = class extends FeatureExtractor {
16718
+ constructor(config) {
16719
+ super(config);
16720
+ const { n_fft, win_length, n_mels, sample_rate } = config.melspec_kwargs;
16721
+ this.mel_filters = mel_filter_bank(
16722
+ Math.floor(1 + n_fft / 2),
16723
+ // num_frequency_bins = 257
16724
+ n_mels,
16725
+ // 80
16726
+ 0,
16727
+ // min_frequency
16728
+ sample_rate / 2,
16729
+ // max_frequency = 8000
16730
+ sample_rate,
16731
+ // 16000
16732
+ null,
16733
+ // norm (torchaudio default: no norm)
16734
+ "htk"
16735
+ // mel_scale (torchaudio default)
16736
+ );
16737
+ const raw_window = window_function(win_length, "hann");
16738
+ this.window = new Float64Array(n_fft);
16739
+ const pad = Math.floor((n_fft - win_length) / 2);
16740
+ this.window.set(raw_window, pad);
16741
+ }
16742
+ /**
16743
+ * Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
16744
+ * @param {Float32Array|Float64Array} audio The audio waveform.
16745
+ * @returns {Promise<{input_features: Tensor}>}
16746
+ */
16747
+ async _call(audio) {
16748
+ validate_audio_inputs(audio, "GraniteSpeechFeatureExtractor");
16749
+ const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;
16750
+ const num_frames = 1 + Math.floor((audio.length - 1) / hop_length);
16751
+ const max_num_frames = num_frames - num_frames % 2;
16752
+ const mel = await spectrogram(audio, this.window, n_fft, hop_length, {
16753
+ power: 2,
16754
+ mel_filters: this.mel_filters,
16755
+ log_mel: "log10_max_norm",
16756
+ transpose: true,
16757
+ // [time, n_mels]
16758
+ max_num_frames,
16759
+ do_pad: false
16760
+ });
16761
+ const input_features = mel.view(-1, 2 * n_mels).unsqueeze_(0);
16762
+ return { input_features };
16763
+ }
16764
+ };
16765
+
16334
16766
  // src/models/moonshine/feature_extraction_moonshine.js
16335
16767
  var MoonshineFeatureExtractor = class extends FeatureExtractor {
16336
16768
  /**
@@ -16811,6 +17243,71 @@ var WeSpeakerFeatureExtractor = class extends FeatureExtractor {
16811
17243
  }
16812
17244
  };
16813
17245
 
17246
+ // src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js
17247
+ var VoxtralRealtimeFeatureExtractor = class extends FeatureExtractor {
17248
+ constructor(config) {
17249
+ super(config);
17250
+ this.config.mel_filters ??= mel_filter_bank(
17251
+ Math.floor(1 + this.config.n_fft / 2),
17252
+ // num_frequency_bins
17253
+ this.config.feature_size,
17254
+ // num_mel_filters
17255
+ 0,
17256
+ // min_frequency
17257
+ 8e3,
17258
+ // max_frequency
17259
+ this.config.sampling_rate,
17260
+ // sampling_rate
17261
+ "slaney",
17262
+ // norm
17263
+ "slaney"
17264
+ // mel_scale
17265
+ );
17266
+ this.window = window_function(this.config.n_fft, "hann");
17267
+ }
17268
+ /**
17269
+ * Computes the log-Mel spectrogram of the provided audio waveform.
17270
+ * @param {Float32Array|Float64Array} waveform The audio waveform to process.
17271
+ * @param {Object} [options]
17272
+ * @param {boolean} [options.center=true] Whether to center-pad the waveform for STFT.
17273
+ * @returns {Promise<import('../../utils/tensor.js').Tensor>} The log-Mel spectrogram tensor of shape [num_mel_bins, num_frames].
17274
+ */
17275
+ async _extract_fbank_features(waveform, { center = true } = {}) {
17276
+ const { n_fft, hop_length, mel_filters, global_log_mel_max } = this.config;
17277
+ const max_num_frames = center ? Math.floor(waveform.length / hop_length) : Math.floor((waveform.length - n_fft) / hop_length);
17278
+ return await spectrogram(
17279
+ waveform,
17280
+ this.window,
17281
+ n_fft,
17282
+ // frame_length
17283
+ hop_length,
17284
+ {
17285
+ power: 2,
17286
+ mel_filters,
17287
+ log_mel: "log10_max_norm",
17288
+ max_log_mel: global_log_mel_max,
17289
+ center,
17290
+ max_num_frames,
17291
+ do_pad: false
17292
+ }
17293
+ );
17294
+ }
17295
+ /**
17296
+ * Extract mel spectrogram features from audio.
17297
+ * @param {Float32Array|Float64Array} audio The audio data.
17298
+ * @param {Object} [options]
17299
+ * @param {boolean} [options.center=true] Whether to center-pad the waveform.
17300
+ * @returns {Promise<{ input_features: import('../../utils/tensor.js').Tensor }>}
17301
+ */
17302
+ async _call(audio, { center = true } = {}) {
17303
+ validate_audio_inputs(audio, "VoxtralRealtimeFeatureExtractor");
17304
+ const features = await this._extract_fbank_features(audio, { center });
17305
+ return {
17306
+ input_features: features.unsqueeze_(0)
17307
+ };
17308
+ }
17309
+ };
17310
+
16814
17311
  // src/models/whisper/feature_extraction_whisper.js
16815
17312
  var WhisperFeatureExtractor = class extends FeatureExtractor {
16816
17313
  constructor(config) {
@@ -16839,7 +17336,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
16839
17336
  * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
16840
17337
  */
16841
17338
  async _extract_fbank_features(waveform) {
16842
- const features = await spectrogram(
17339
+ return await spectrogram(
16843
17340
  waveform,
16844
17341
  this.window,
16845
17342
  // window
@@ -16850,7 +17347,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
16850
17347
  {
16851
17348
  power: 2,
16852
17349
  mel_filters: this.config.mel_filters,
16853
- log_mel: "log10",
17350
+ log_mel: "log10_max_norm",
16854
17351
  // Custom
16855
17352
  max_num_frames: Math.min(
16856
17353
  Math.floor(waveform.length / this.config.hop_length),
@@ -16859,15 +17356,6 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
16859
17356
  )
16860
17357
  }
16861
17358
  );
16862
- const data = features.data;
16863
- const maxValue = max(
16864
- /** @type {Float32Array} */
16865
- data
16866
- )[0];
16867
- for (let i = 0; i < data.length; ++i) {
16868
- data[i] = (Math.max(data[i], maxValue - 8) + 4) / 4;
16869
- }
16870
- return features;
16871
17359
  }
16872
17360
  /**
16873
17361
  * Asynchronously extracts features from a given audio using the provided configuration.
@@ -16931,11 +17419,10 @@ import sharp from "sharp";
16931
17419
  var createCanvasFunction;
16932
17420
  var ImageDataClass;
16933
17421
  var loadImageFunction;
16934
- var IS_BROWSER_OR_WEBWORKER = apis.IS_BROWSER_ENV || apis.IS_WEBWORKER_ENV;
16935
- if (IS_BROWSER_OR_WEBWORKER) {
17422
+ if (apis.IS_WEB_ENV) {
16936
17423
  createCanvasFunction = (width, height) => {
16937
17424
  if (!self.OffscreenCanvas) {
16938
- throw new Error("OffscreenCanvas not supported by this browser.");
17425
+ throw new Error("OffscreenCanvas not supported by this environment.");
16939
17426
  }
16940
17427
  return new self.OffscreenCanvas(width, height);
16941
17428
  };
@@ -17025,7 +17512,7 @@ var RawImage = class _RawImage {
17025
17512
  * @returns {RawImage} The image object.
17026
17513
  */
17027
17514
  static fromCanvas(canvas) {
17028
- if (!IS_BROWSER_OR_WEBWORKER) {
17515
+ if (!apis.IS_WEB_ENV) {
17029
17516
  throw new Error("fromCanvas() is only supported in browser environments.");
17030
17517
  }
17031
17518
  const ctx = (
@@ -17054,7 +17541,7 @@ var RawImage = class _RawImage {
17054
17541
  * @returns {Promise<RawImage>} The image object.
17055
17542
  */
17056
17543
  static async fromBlob(blob) {
17057
- if (IS_BROWSER_OR_WEBWORKER) {
17544
+ if (apis.IS_WEB_ENV) {
17058
17545
  const img = await loadImageFunction(blob);
17059
17546
  const ctx = createCanvasFunction(img.width, img.height).getContext("2d");
17060
17547
  ctx.drawImage(img, 0, 0);
@@ -17235,7 +17722,7 @@ var RawImage = class _RawImage {
17235
17722
  } else if (nullish_height) {
17236
17723
  height = width / this.width * this.height;
17237
17724
  }
17238
- if (IS_BROWSER_OR_WEBWORKER) {
17725
+ if (apis.IS_WEB_ENV) {
17239
17726
  const numChannels = this.channels;
17240
17727
  const canvas = this.toCanvas();
17241
17728
  const ctx = createCanvasFunction(width, height).getContext("2d");
@@ -17283,7 +17770,7 @@ var RawImage = class _RawImage {
17283
17770
  if (left === 0 && right === 0 && top === 0 && bottom === 0) {
17284
17771
  return this;
17285
17772
  }
17286
- if (IS_BROWSER_OR_WEBWORKER) {
17773
+ if (apis.IS_WEB_ENV) {
17287
17774
  const numChannels = this.channels;
17288
17775
  const canvas = this.toCanvas();
17289
17776
  const newWidth = this.width + left + right;
@@ -17307,7 +17794,7 @@ var RawImage = class _RawImage {
17307
17794
  }
17308
17795
  const crop_width = x_max - x_min + 1;
17309
17796
  const crop_height = y_max - y_min + 1;
17310
- if (IS_BROWSER_OR_WEBWORKER) {
17797
+ if (apis.IS_WEB_ENV) {
17311
17798
  const numChannels = this.channels;
17312
17799
  const canvas = this.toCanvas();
17313
17800
  const ctx = createCanvasFunction(crop_width, crop_height).getContext("2d");
@@ -17335,7 +17822,7 @@ var RawImage = class _RawImage {
17335
17822
  }
17336
17823
  const width_offset = (this.width - crop_width) / 2;
17337
17824
  const height_offset = (this.height - crop_height) / 2;
17338
- if (IS_BROWSER_OR_WEBWORKER) {
17825
+ if (apis.IS_WEB_ENV) {
17339
17826
  const numChannels = this.channels;
17340
17827
  const canvas = this.toCanvas();
17341
17828
  const ctx = createCanvasFunction(crop_width, crop_height).getContext("2d");
@@ -17413,7 +17900,7 @@ var RawImage = class _RawImage {
17413
17900
  }
17414
17901
  }
17415
17902
  async toBlob(type = "image/png", quality = 1) {
17416
- if (!IS_BROWSER_OR_WEBWORKER) {
17903
+ if (!apis.IS_WEB_ENV) {
17417
17904
  throw new Error("toBlob() is only supported in browser environments.");
17418
17905
  }
17419
17906
  const canvas = this.toCanvas();
@@ -17430,7 +17917,7 @@ var RawImage = class _RawImage {
17430
17917
  return tensor;
17431
17918
  }
17432
17919
  toCanvas() {
17433
- if (!IS_BROWSER_OR_WEBWORKER) {
17920
+ if (!apis.IS_WEB_ENV) {
17434
17921
  throw new Error("toCanvas() is only supported in browser environments.");
17435
17922
  }
17436
17923
  const cloned = this.clone().rgba();
@@ -17514,7 +18001,7 @@ var RawImage = class _RawImage {
17514
18001
  * @returns {Promise<void>}
17515
18002
  */
17516
18003
  async save(path3) {
17517
- if (IS_BROWSER_OR_WEBWORKER) {
18004
+ if (apis.IS_WEB_ENV) {
17518
18005
  if (apis.IS_WEBWORKER_ENV) {
17519
18006
  throw new Error("Unable to save an image from a Web Worker.");
17520
18007
  }
@@ -17534,7 +18021,7 @@ var RawImage = class _RawImage {
17534
18021
  * @returns {import('sharp').Sharp} The Sharp instance.
17535
18022
  */
17536
18023
  toSharp() {
17537
- if (IS_BROWSER_OR_WEBWORKER) {
18024
+ if (apis.IS_WEB_ENV) {
17538
18025
  throw new Error("toSharp() is only supported in server-side environments.");
17539
18026
  }
17540
18027
  return sharp(this.data, {
@@ -17747,6 +18234,27 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
17747
18234
  }
17748
18235
  return [segmentation, segments];
17749
18236
  }
18237
+ function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
18238
+ if (height < factor || width < factor) {
18239
+ throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
18240
+ } else if (Math.max(height, width) / Math.min(height, width) > 200) {
18241
+ throw new Error(
18242
+ `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
18243
+ );
18244
+ }
18245
+ let h_bar = Math.round(height / factor) * factor;
18246
+ let w_bar = Math.round(width / factor) * factor;
18247
+ if (h_bar * w_bar > max_pixels) {
18248
+ const beta = Math.sqrt(height * width / max_pixels);
18249
+ h_bar = Math.floor(height / beta / factor) * factor;
18250
+ w_bar = Math.floor(width / beta / factor) * factor;
18251
+ } else if (h_bar * w_bar < min_pixels) {
18252
+ const beta = Math.sqrt(min_pixels / (height * width));
18253
+ h_bar = Math.ceil(height * beta / factor) * factor;
18254
+ w_bar = Math.ceil(width * beta / factor) * factor;
18255
+ }
18256
+ return [h_bar, w_bar];
18257
+ }
17750
18258
  function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
17751
18259
  if (label_ids_to_fuse === null) {
17752
18260
  logger.warn("`label_ids_to_fuse` unset. No instance will be fused.");
@@ -18035,7 +18543,7 @@ var ImageProcessor = class extends Callable2 {
18035
18543
  });
18036
18544
  }
18037
18545
  /**
18038
- * @typedef {object} PreprocessedImage
18546
+ * @typedef {Object} PreprocessedImage
18039
18547
  * @property {HeightWidth} original_size The original size of the image.
18040
18548
  * @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
18041
18549
  * @property {Tensor} pixel_values The pixel values of the preprocessed image.
@@ -18213,6 +18721,7 @@ __export(image_processors_exports, {
18213
18721
  ImageFeatureExtractor: () => ImageProcessor,
18214
18722
  ImageProcessor: () => ImageProcessor,
18215
18723
  JinaCLIPImageProcessor: () => JinaCLIPImageProcessor,
18724
+ Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
18216
18725
  LlavaOnevisionImageProcessor: () => LlavaOnevisionImageProcessor,
18217
18726
  Mask2FormerImageProcessor: () => Mask2FormerImageProcessor,
18218
18727
  MaskFormerFeatureExtractor: () => MaskFormerFeatureExtractor,
@@ -18616,6 +19125,237 @@ var JinaCLIPImageProcessor = class extends ImageProcessor {
18616
19125
  }
18617
19126
  };
18618
19127
 
19128
+ // src/models/lfm2_vl/image_processing_lfm2_vl.js
19129
/**
 * Round `number` to the nearest multiple of `factor`.
 * @param {number} number Value to round.
 * @param {number} factor Step size of the result.
 * @returns {number} The multiple of `factor` closest to `number`.
 */
function round_by_factor(number, factor) {
  const multiples = Math.round(number / factor);
  return multiples * factor;
}
19132
/**
 * Pick the `[columns, rows]` grid whose aspect ratio is closest to `aspect_ratio`.
 *
 * When two candidates are exactly equally close, the later one replaces the
 * current choice only if the image area exceeds half of the area that grid
 * would cover with `image_size`-sided tiles.
 *
 * @param {number} aspect_ratio Width divided by height of the image.
 * @param {number[][]} target_ratios Candidate `[columns, rows]` pairs, in priority order.
 * @param {number} width Image width.
 * @param {number} height Image height.
 * @param {number} image_size Side length of a single tile.
 * @returns {number[]} The chosen pair (an element of `target_ratios`), or `[1, 1]` if there are no candidates.
 */
function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
  const image_area = width * height;
  const half_tile_area = 0.5 * image_size * image_size;

  let chosen = [1, 1];
  let smallest_diff = Infinity;
  for (let i = 0; i < target_ratios.length; ++i) {
    const candidate = target_ratios[i];
    const diff = Math.abs(aspect_ratio - candidate[0] / candidate[1]);
    if (diff < smallest_diff) {
      smallest_diff = diff;
      chosen = candidate;
      continue;
    }
    const is_tie = diff === smallest_diff;
    if (is_tie && image_area > half_tile_area * candidate[0] * candidate[1]) {
      chosen = candidate;
    }
  }
  return chosen;
}
19147
/**
 * Enumerate every `[w, h]` grid whose tile count `w * h` lies in
 * `[min_tiles, max_tiles]`, ordered by ascending tile count.
 *
 * Grids with the same tile count keep their discovery order (the sort is
 * stable), so the enumeration order below is significant.
 *
 * @param {number} min_tiles Minimum number of tiles in a grid.
 * @param {number} max_tiles Maximum number of tiles in a grid.
 * @returns {number[][]} Unique `[w, h]` pairs sorted by `w * h`.
 */
function get_target_ratios(min_tiles, max_tiles) {
  const visited = new Set();
  const grids = [];
  for (let n = min_tiles; n <= max_tiles; ++n) {
    for (let w = 1; w <= n; ++w) {
      for (let h = 1; h <= n; ++h) {
        const tile_count = w * h;
        const in_range = tile_count >= min_tiles && tile_count <= max_tiles;
        if (!in_range) continue;

        // The same grid is reached again for every larger `n`; keep the first hit only.
        const id = `${w}x${h}`;
        if (visited.has(id)) continue;
        visited.add(id);
        grids.push([w, h]);
      }
    }
  }
  grids.sort((left, right) => left[0] * left[1] - right[0] * right[1]);
  return grids;
}
19166
/**
 * Cut a batch of channel-first images into non-overlapping square patches and
 * flatten each patch with the channel index varying fastest.
 *
 * Rows/columns that do not fill a whole patch at the bottom/right are dropped.
 *
 * @param {Tensor} images Tensor with dims `[B, C, H, W]`; data is read as a Float32Array.
 * @param {number} patch_size Side length of a patch in pixels.
 * @returns {Tensor} float32 tensor with dims `[B, num_patches, patch_size * patch_size * C]`.
 */
function convert_image_to_patches(images, patch_size) {
  const [batch, channels, height, width] = images.dims;
  const grid_h = Math.floor(height / patch_size);
  const grid_w = Math.floor(width / patch_size);
  const patches_per_image = grid_h * grid_w;
  const patch_dim = patch_size * patch_size * channels;
  const plane = height * width; // elements in one channel of one image

  const src = /** @type {Float32Array} */ (images.data);
  const out = new Float32Array(batch * patches_per_image * patch_dim);

  for (let b = 0; b < batch; ++b) {
    const image_base = b * channels * plane;
    const out_base = b * patches_per_image * patch_dim;
    for (let py = 0; py < grid_h; ++py) {
      for (let px = 0; px < grid_w; ++px) {
        let cursor = out_base + (py * grid_w + px) * patch_dim;
        const left = px * patch_size;
        for (let dy = 0; dy < patch_size; ++dy) {
          const row_start = (py * patch_size + dy) * width + left;
          for (let dx = 0; dx < patch_size; ++dx) {
            const pixel = row_start + dx;
            // Interleave channels: (dy, dx, c) ordering inside each patch.
            for (let c = 0; c < channels; ++c) {
              out[cursor++] = src[image_base + c * plane + pixel];
            }
          }
        }
      }
    }
  }
  return new Tensor2("float32", out, [batch, patches_per_image, patch_dim]);
}
19196
/**
 * Zero-pad a patch tensor along its patch axis up to `target_length` and build
 * the matching attention mask (1 for real patches, 0 for padding).
 *
 * If the input already has `target_length` patches or more, it is returned as is.
 *
 * @param {Tensor} patches Tensor with dims `[_, num_patches, dim]`.
 *   NOTE(review): the padded output is always given a leading dim of 1 — presumably callers pass a single image; confirm.
 * @param {number} target_length Number of patches after padding.
 * @returns {{ padded: Tensor, mask: Tensor }} Padded patches and an int64 mask of dims `[target_length]`.
 */
function pad_along_first_dim(patches, target_length) {
  const [, num_patches, patch_dim] = patches.dims;

  const mask_values = new BigInt64Array(target_length);
  mask_values.fill(1n, 0, num_patches);

  const needs_padding = num_patches < target_length;
  let padded;
  if (needs_padding) {
    // Typed arrays are zero-initialised, so copying the data in is enough.
    const buffer = new Float32Array(target_length * patch_dim);
    buffer.set(/** @type {Float32Array} */ (patches.data));
    padded = new Tensor2("float32", buffer, [1, target_length, patch_dim]);
  } else {
    padded = patches;
  }

  const mask = new Tensor2("int64", mask_values, [target_length]);
  return { padded, mask };
}
19211
/**
 * Image processor for LFM2-VL.
 *
 * Each image becomes one or more "tiles" (a grid of `tile_size` squares plus an
 * optional thumbnail, or a single resized image). Every tile is cut into
 * `encoder_patch_size` patches and padded to `max_num_patches`.
 */
var Lfm2VlImageProcessor = class extends ImageProcessor {
  constructor(config) {
    super(config);
    this.downsample_factor = config.downsample_factor ?? 2;
    this.do_image_splitting = config.do_image_splitting ?? true;
    this.min_tiles = config.min_tiles ?? 2;
    this.max_tiles = config.max_tiles ?? 10;
    this.use_thumbnail = config.use_thumbnail ?? true;
    this.min_image_tokens = config.min_image_tokens ?? 64;
    this.max_image_tokens = config.max_image_tokens ?? 256;
    this.encoder_patch_size = config.encoder_patch_size ?? config.patch_size ?? 16;
    this.tile_size = config.tile_size ?? 512;
    this.max_pixels_tolerance = config.max_pixels_tolerance ?? 2;
    this.return_row_col_info = config.return_row_col_info ?? false;

    const max_thumbnail_patches = this.max_image_tokens * this.downsample_factor ** 2;
    // A tile only yields whole patches (convert_image_to_patches floors as well),
    // so floor here to keep `max_num_patches` an integer for any tile size.
    const tile_size_patches = this.do_image_splitting
      ? Math.floor(this.tile_size / this.encoder_patch_size) ** 2
      : 0;
    this.max_num_patches = Math.max(max_thumbnail_patches, tile_size_patches);
  }

  /**
   * Check if the image is too large to be processed as a single tile.
   * @param {number} height
   * @param {number} width
   * @returns {boolean}
   */
  _is_image_too_large(height, width) {
    const total_factor = this.encoder_patch_size * this.downsample_factor;
    const h_bar = Math.max(this.encoder_patch_size, round_by_factor(height, total_factor));
    const w_bar = Math.max(this.encoder_patch_size, round_by_factor(width, total_factor));
    const pixel_budget = this.max_image_tokens * total_factor ** 2 * this.max_pixels_tolerance;
    return h_bar * w_bar > pixel_budget;
  }

  /**
   * Get the grid layout for tiling a large image.
   * @param {number} height
   * @param {number} width
   * @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
   */
  _get_grid_layout(height, width) {
    const target_ratios = get_target_ratios(this.min_tiles, this.max_tiles);
    const [grid_width, grid_height] = find_closest_aspect_ratio(
      width / height,
      target_ratios,
      width,
      height,
      this.tile_size
    );
    return {
      grid_width,
      grid_height,
      target_width: this.tile_size * grid_width,
      target_height: this.tile_size * grid_height
    };
  }

  /**
   * Preprocess one image, a list of images, or a list of per-sample image lists.
   *
   * @param {RawImage|RawImage[]|RawImage[][]} images
   * @param {Object} [options]
   * @param {boolean|null} [options.return_row_col_info=null] Overrides the configured
   *   `return_row_col_info` when not null/undefined.
   * @returns {Promise<Object>} `pixel_values`, `pixel_attention_mask` and `spatial_shapes`
   *   (one row per tile), plus `image_rows`, `image_cols` and `image_sizes`
   *   (one entry per image) when row/column info is requested.
   */
  // @ts-expect-error
  async _call(images, { return_row_col_info = null } = {}) {
    // Normalise the input to a list of image lists.
    let batched_images;
    if (!Array.isArray(images)) {
      batched_images = [[images]];
    } else if (!Array.isArray(images[0])) {
      batched_images = [/** @type {RawImage[]} */ (images)];
    } else {
      batched_images = /** @type {RawImage[][]} */ (images);
    }

    const all_pixel_values = [];
    const all_pixel_masks = [];
    const all_spatial_shapes = [];
    const all_rows = [];
    const all_cols = [];
    const all_image_sizes = [];

    const total_factor = this.encoder_patch_size * this.downsample_factor;
    const f2 = total_factor ** 2;
    const do_splitting = this.do_image_splitting && !(this.min_tiles === 1 && this.max_tiles === 1);

    for (const image_batch of batched_images) {
      const preprocessed = await Promise.all(image_batch.map((x) => this.preprocess(x, { do_pad: false })));
      for (const { pixel_values } of preprocessed) {
        // `pixel_values` is read as a 3-D tensor whose last two dims are height and width.
        const [, height, width] = pixel_values.dims;
        const img = pixel_values.unsqueeze_(0);

        // Size used for the single-tile case and for the thumbnail.
        const [new_height, new_width] = smart_resize(
          Math.max(total_factor, height),
          Math.max(total_factor, width),
          total_factor,
          this.min_image_tokens * f2,
          this.max_image_tokens * f2
        ).map((x) => Math.max(total_factor, x));

        let tiles;
        let num_rows = 1;
        let num_cols = 1;
        const is_large = this._is_image_too_large(height, width);
        if (is_large && do_splitting) {
          const { grid_width, grid_height, target_width, target_height } = this._get_grid_layout(
            height,
            width
          );
          num_rows = grid_height;
          num_cols = grid_width;
          const resized = await interpolate_4d(img, {
            size: [target_height, target_width]
          });
          // Tiles are emitted row by row, left to right.
          tiles = [];
          for (let r = 0; r < grid_height; ++r) {
            for (let c = 0; c < grid_width; ++c) {
              const y = r * this.tile_size;
              const x = c * this.tile_size;
              tiles.push(resized.slice(null, null, [y, y + this.tile_size], [x, x + this.tile_size]));
            }
          }
          if (this.use_thumbnail && grid_width * grid_height !== 1) {
            tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
          }
        } else {
          tiles = [await interpolate_4d(img, { size: [new_height, new_width] })];
        }

        for (const tile of tiles) {
          const [, , th, tw] = tile.dims;
          const patches = convert_image_to_patches(tile, this.encoder_patch_size);
          const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
          all_pixel_values.push(padded);
          all_pixel_masks.push(mask);
          all_spatial_shapes.push([
            Math.floor(th / this.encoder_patch_size),
            Math.floor(tw / this.encoder_patch_size)
          ]);
        }
        all_rows.push(num_rows);
        all_cols.push(num_cols);
        all_image_sizes.push([new_height, new_width]);
      }
    }

    const result = {
      pixel_values: cat(all_pixel_values, 0),
      pixel_attention_mask: stack(all_pixel_masks, 0),
      spatial_shapes: new Tensor2("int64", BigInt64Array.from(all_spatial_shapes.flat(), BigInt), [
        all_spatial_shapes.length,
        2
      ])
    };
    if (return_row_col_info ?? this.return_row_col_info) {
      result.image_rows = all_rows;
      result.image_cols = all_cols;
      result.image_sizes = all_image_sizes;
    }
    return result;
  }
};
19358
+
18619
19359
  // src/models/llava_onevision/image_processing_llava_onevision.js
18620
19360
  var LlavaOnevisionImageProcessor = class extends ImageProcessor {
18621
19361
  };
@@ -18839,27 +19579,6 @@ var PvtImageProcessor = class extends ImageProcessor {
18839
19579
  };
18840
19580
 
18841
19581
  // src/models/qwen2_vl/image_processing_qwen2_vl.js
18842
- function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
18843
- if (height < factor || width < factor) {
18844
- throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
18845
- } else if (Math.max(height, width) / Math.min(height, width) > 200) {
18846
- throw new Error(
18847
- `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
18848
- );
18849
- }
18850
- let h_bar = Math.round(height / factor) * factor;
18851
- let w_bar = Math.round(width / factor) * factor;
18852
- if (h_bar * w_bar > max_pixels) {
18853
- const beta = Math.sqrt(height * width / max_pixels);
18854
- h_bar = Math.floor(height / beta / factor) * factor;
18855
- w_bar = Math.floor(width / beta / factor) * factor;
18856
- } else if (h_bar * w_bar < min_pixels) {
18857
- const beta = Math.sqrt(min_pixels / (height * width));
18858
- h_bar = Math.ceil(height * beta / factor) * factor;
18859
- w_bar = Math.ceil(width * beta / factor) * factor;
18860
- }
18861
- return [h_bar, w_bar];
18862
- }
18863
19582
  var Qwen2VLImageProcessor = class extends ImageProcessor {
18864
19583
  constructor(config) {
18865
19584
  super(config);
@@ -19461,6 +20180,57 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
19461
20180
  }
19462
20181
  };
19463
20182
 
20183
+ // src/models/granite_speech/processing_granite_speech.js
20184
/**
 * Processor for Granite Speech: extracts audio features and expands the audio
 * placeholder in the prompt to one token per projector output position.
 */
var GraniteSpeechProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = true;

  /**
   * Compute the number of audio tokens for a given raw audio length.
   * @param {number} audioLength Raw audio sample count.
   * @returns {number} Number of projector output tokens.
   */
  _get_num_audio_features(audioLength) {
    const fe_config = this.feature_extractor.config;
    const hop = fe_config.melspec_kwargs.hop_length;
    const window_size = fe_config.projector_window_size;
    const tokens_per_block = Math.floor(window_size / fe_config.projector_downsample_rate);

    const mel_frames = Math.floor(audioLength / hop) + 1;
    const encoder_frames = Math.floor(mel_frames / 2);
    const num_blocks = Math.ceil(encoder_frames / window_size);
    return num_blocks * tokens_per_block;
  }

  /**
   * @param {string} text The text input to process.
   * @param {Float32Array} audio The audio input to process.
   * @param {Object} [kwargs] Extra options forwarded to the tokenizer.
   * @returns {Promise<Object>} Tokenizer outputs, plus `input_features` and
   *   `input_features_mask` when audio is given.
   * @throws {Error} If `text` is an array, or audio is given but `text` lacks the audio token.
   */
  async _call(text, audio = null, kwargs = {}) {
    if (Array.isArray(text)) {
      throw new Error("Batched inputs are not supported yet.");
    }

    const audio_inputs = {};
    if (audio) {
      const extracted = await this.feature_extractor(audio);
      audio_inputs["input_features"] = extracted.input_features;

      const num_audio_tokens = this._get_num_audio_features(audio.length);
      audio_inputs["input_features_mask"] = new Tensor2(
        "bool",
        new Uint8Array(num_audio_tokens).fill(1),
        [1, num_audio_tokens]
      );

      const audio_token = this.config.audio_token ?? "<|audio|>";
      if (!text.includes(audio_token)) {
        throw new Error(`The input text does not contain the audio token ${audio_token}.`);
      }
      // Every placeholder occurrence is expanded to the full token run.
      text = text.replaceAll(audio_token, audio_token.repeat(num_audio_tokens));
    }

    const tokenizer_options = { add_special_tokens: false, ...kwargs };
    const text_inputs = this.tokenizer(text, tokenizer_options);
    return { ...text_inputs, ...audio_inputs };
  }
};
20233
+
19464
20234
  // src/models/grounding_dino/processing_grounding_dino.js
19465
20235
  function get_phrases_from_posmap(posmaps, input_ids) {
19466
20236
  const left_idx = 0;
@@ -19737,6 +20507,66 @@ var JinaCLIPProcessor = class extends Processor {
19737
20507
  }
19738
20508
  };
19739
20509
 
20510
+ // src/models/lfm2_vl/processing_lfm2_vl.js
20511
/**
 * Processor for LFM2-VL: runs the image processor and expands every image
 * placeholder in the prompt into the token layout of the corresponding image
 * (per-tile runs with row/column markers plus an optional thumbnail, or a
 * single run for untiled images).
 */
var Lfm2VlProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static image_processor_class = AutoImageProcessor;

  /**
   * @param {RawImage|RawImage[]} images
   * @param {string|string[]|null} [text]
   * @param {Record<string, any>} [kwargs] Forwarded to both the image processor and the tokenizer.
   * @returns {Promise<Object>} Image processor outputs (without row/column info) merged with
   *   tokenizer outputs when `text` is given.
   * @throws {Error} If `text` contains more image placeholders than images were provided.
   */
  async _call(images, text = null, kwargs = {}) {
    const { image_rows, image_cols, image_sizes, ...image_inputs } = await this.image_processor(images, {
      ...kwargs,
      return_row_col_info: true
    });

    if (text) {
      const image_token = this.config.image_token ?? "<image>";

      // Resolve geometry with the same fallbacks as Lfm2VlImageProcessor
      // (`??` defaults and the `patch_size` fallback); otherwise the number of
      // placeholder tokens can disagree with the patches it actually produced.
      const ip_config = /** @type {Record<string, any>} */ (this.image_processor.config);
      const tile_size = ip_config.tile_size ?? 512;
      const downsample_factor = ip_config.downsample_factor ?? 2;
      const encoder_patch_size = ip_config.encoder_patch_size ?? ip_config.patch_size ?? 16;
      const use_thumbnail = ip_config.use_thumbnail ?? true;

      // Tokens along one side of length `s` pixels after patching and downsampling.
      const ds2 = (s) => Math.ceil(Math.floor(s / encoder_patch_size) / downsample_factor);
      const tokens_per_tile = ds2(tile_size) ** 2;

      const image_start = this.config.image_start_token ?? "<|image_start|>";
      const image_end = this.config.image_end_token ?? "<|image_end|>";
      const thumbnail_token = this.config.image_thumbnail ?? "<|img_thumbnail|>";

      if (!Array.isArray(text)) text = [text];

      // Images are consumed in order across all text samples.
      let image_idx = 0;
      text = text.map((sample) => {
        const parts = sample.split(image_token);
        return parts[0] + parts.slice(1).map((part) => {
          const idx = image_idx++;
          if (idx >= image_sizes.length) {
            throw new Error(
              `The input text contains more ${image_token} placeholders than the ${image_sizes.length} image(s) provided.`
            );
          }
          const [h, w] = image_sizes[idx];
          const rows = image_rows[idx];
          const cols = image_cols[idx];
          const tokens_for_image = ds2(h) * ds2(w);

          let expanded = image_start;
          if (rows > 1 || cols > 1) {
            const tile_str = image_token.repeat(tokens_per_tile);
            for (let r = 0; r < rows; ++r) {
              for (let c = 0; c < cols; ++c) {
                expanded += `<|img_row_${r + 1}_col_${c + 1}|>` + tile_str;
              }
            }
            if (use_thumbnail) {
              expanded += thumbnail_token + image_token.repeat(tokens_for_image);
            }
          } else {
            expanded += image_token.repeat(tokens_for_image);
          }
          return expanded + image_end + part;
        }).join("");
      });
    }

    return {
      ...image_inputs,
      ...(text ? this.tokenizer(text, kwargs) : {})
    };
  }
};
20569
+
19740
20570
  // src/models/llava/processing_llava.js
19741
20571
  var LlavaProcessor = class extends Processor {
19742
20572
  static tokenizer_class = AutoTokenizer;
@@ -20269,6 +21099,94 @@ var VoxtralProcessor = class extends Processor {
20269
21099
  }
20270
21100
  };
20271
21101
 
21102
+ // src/models/voxtral_realtime/processing_voxtral_realtime.js
21103
// Token-level layout constants used by VoxtralRealtimeProcessor.
var NUM_LEFT_PAD_TOKENS = 32; // tokens of silence prepended to the first streaming chunk
var NUM_DELAY_TOKENS = 6; // extra pad tokens following the left padding
var AUDIO_LENGTH_PER_TOK = 8; // mel frames per text token
var OFFLINE_STREAMING_BUFFER_TOKENS = 10; // additional right padding in non-streaming mode
var STREAMING_PAD_TOKEN_ID = 32; // id used to fill the initial `input_ids`
var VoxtralRealtimeProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = false;

  /** Number of mel frames in the first audio chunk. */
  get num_mel_frames_first_audio_chunk() {
    const first_chunk_tokens = NUM_DELAY_TOKENS + 1;
    return first_chunk_tokens * AUDIO_LENGTH_PER_TOK;
  }

  /** Number of raw audio samples in the first audio chunk. */
  get num_samples_first_audio_chunk() {
    const { hop_length, n_fft } = this.feature_extractor.config;
    const frames_before_last = this.num_mel_frames_first_audio_chunk - 1;
    return frames_before_last * hop_length + Math.floor(n_fft / 2);
  }

  /** Number of raw audio samples per subsequent audio chunk. */
  get num_samples_per_audio_chunk() {
    const { hop_length, n_fft } = this.feature_extractor.config;
    return AUDIO_LENGTH_PER_TOK * hop_length + n_fft;
  }

  /** Number of right-pad tokens for non-streaming mode. */
  get num_right_pad_tokens() {
    return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
  }

  /** Number of mel frames per text token. */
  get audio_length_per_tok() {
    return AUDIO_LENGTH_PER_TOK;
  }

  /** Number of raw audio samples per token. */
  get raw_audio_length_per_tok() {
    const { hop_length } = this.feature_extractor.config;
    return AUDIO_LENGTH_PER_TOK * hop_length;
  }

  /**
   * Process audio input for VoxtralRealtime.
   *
   * - Streaming, first chunk: the audio is left-padded with silence and features
   *   are extracted with `center=true`; returns `{ input_ids, input_features }`.
   * - Streaming, later chunks: features are extracted with `center=false`;
   *   only the feature extractor output is returned.
   * - Non-streaming: the audio is right-padded so the full audio is transcribed,
   *   then processed with `center=true`; only the feature extractor output is returned.
   *
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @param {Object} [options]
   * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
   * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
   * @returns {Promise<Object>}
   * @throws {Error} If `is_streaming` is false while `is_first_audio_chunk` is false.
   */
  async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeProcessor");
    if (!is_streaming && !is_first_audio_chunk) {
      throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");
    }

    // Later streaming chunks: no padding, no prompt tokens.
    if (!is_first_audio_chunk) {
      const features = await this.feature_extractor(audio, { center: false });
      return features;
    }

    // Non-streaming: append silence after the audio.
    if (!is_streaming) {
      const num_right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
      const right_padded = new Float32Array(audio.length + num_right_pad_samples);
      right_padded.set(audio);
      const features = await this.feature_extractor(right_padded, { center: true });
      return features;
    }

    // First streaming chunk: prepend silence and build the initial token ids.
    const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
    const left_padded = new Float32Array(num_left_pad_samples + audio.length);
    left_padded.set(audio, num_left_pad_samples);
    const audio_encoding = await this.feature_extractor(left_padded, { center: true });

    const num_input_tokens = 1 + NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
    const ids = new BigInt64Array(num_input_tokens);
    ids.fill(BigInt(STREAMING_PAD_TOKEN_ID));
    ids[0] = 1n; // presumably the BOS token id; confirm against the tokenizer
    const input_ids = new Tensor2("int64", ids, [1, num_input_tokens]);

    return { input_ids, ...audio_encoding };
  }
};
21189
+
20272
21190
  // src/models/wav2vec2/processing_wav2vec2.js
20273
21191
  var Wav2Vec2Processor = class extends Processor {
20274
21192
  static tokenizer_class = AutoTokenizer;
@@ -20368,14 +21286,18 @@ function getNormalizedConfig(config) {
20368
21286
  case "florence2":
20369
21287
  case "llava_onevision":
20370
21288
  case "idefics3":
21289
+ case "granite_speech":
20371
21290
  case "ultravox":
20372
21291
  case "voxtral":
21292
+ case "voxtral_realtime":
20373
21293
  case "smolvlm":
20374
21294
  case "gemma3n":
21295
+ case "lfm2_vl":
20375
21296
  case "chatterbox":
20376
21297
  case "mistral3":
20377
21298
  case "qwen2_5_vl":
20378
21299
  case "qwen3_vl":
21300
+ case "qwen3_vl_moe":
20379
21301
  init_normalized_config = getNormalizedConfig(config.text_config);
20380
21302
  break;
20381
21303
  case "moondream1":
@@ -20425,11 +21347,17 @@ function getNormalizedConfig(config) {
20425
21347
  case "cohere":
20426
21348
  case "cohere2":
20427
21349
  case "mistral":
21350
+ case "voxtral_realtime_text":
21351
+ case "voxtral_realtime_encoder":
20428
21352
  case "starcoder2":
20429
21353
  case "qwen2":
21354
+ case "qwen2_moe":
20430
21355
  case "qwen2_vl":
21356
+ case "qwen2_vl_text":
20431
21357
  case "qwen2_5_vl_text":
21358
+ case "qwen3_moe":
20432
21359
  case "qwen3_vl_text":
21360
+ case "qwen3_vl_moe_text":
20433
21361
  case "phi":
20434
21362
  case "phi3":
20435
21363
  case "phi3_v":
@@ -20570,6 +21498,9 @@ function getNormalizedConfig(config) {
20570
21498
  return normalized_config;
20571
21499
  }
20572
21500
  function getCacheShapes(config, options) {
21501
+ if (!(config instanceof PretrainedConfig)) {
21502
+ config = new PretrainedConfig(config);
21503
+ }
20573
21504
  if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
20574
21505
  const pkv_prefix = options?.prefix ?? "past_key_values";
20575
21506
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
@@ -20628,7 +21559,7 @@ function getCacheShapes(config, options) {
20628
21559
  }
20629
21560
  }
20630
21561
  return cache_values;
20631
- } else if (["qwen3_5", "qwen3_5_moe"].includes(config.model_type)) {
21562
+ } else if (["qwen3_next", "qwen3_5_text", "qwen3_5_moe_text", "olmo_hybrid"].includes(config.model_type)) {
20632
21563
  const pkv_prefix = options?.prefix ?? "past_key_values";
20633
21564
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
20634
21565
  const cache_values = {};
@@ -20645,11 +21576,10 @@ function getCacheShapes(config, options) {
20645
21576
  linear_conv_kernel_dim
20646
21577
  } = (
20647
21578
  /** @type {any} */
20648
- config.text_config
21579
+ config
20649
21580
  );
20650
21581
  const key_dim = linear_key_head_dim * linear_num_key_heads;
20651
21582
  const value_dim = linear_value_head_dim * linear_num_value_heads;
20652
- const conv_dim = key_dim * 2 + value_dim;
20653
21583
  const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
20654
21584
  const batch_size = options?.batch_size ?? 1;
20655
21585
  for (let i = 0; i < layer_types.length; ++i) {
@@ -20658,7 +21588,14 @@ function getCacheShapes(config, options) {
20658
21588
  cache_values[`${pkv_prefix}.${i}.${kv}`] = [batch_size, num_key_value_heads, 0, final_head_dim];
20659
21589
  }
20660
21590
  } else if (layer_types[i] === "linear_attention") {
20661
- cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_dim, linear_conv_kernel_dim];
21591
+ if (config.model_type === "olmo_hybrid") {
21592
+ cache_values[`${conv_prefix}_conv.${i}.key`] = [batch_size, key_dim, linear_conv_kernel_dim];
21593
+ cache_values[`${conv_prefix}_conv.${i}.value`] = [batch_size, value_dim, linear_conv_kernel_dim];
21594
+ cache_values[`${conv_prefix}_conv.${i}.query`] = [batch_size, key_dim, linear_conv_kernel_dim];
21595
+ } else {
21596
+ const conv_dim = key_dim * 2 + value_dim;
21597
+ cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_dim, linear_conv_kernel_dim];
21598
+ }
20662
21599
  cache_values[`${conv_prefix}_recurrent.${i}`] = [
20663
21600
  batch_size,
20664
21601
  linear_num_value_heads,
@@ -20670,6 +21607,16 @@ function getCacheShapes(config, options) {
20670
21607
  }
20671
21608
  }
20672
21609
  return cache_values;
21610
+ } else if (["lfm2_vl", "qwen3_5", "qwen3_5_moe", "voxtral_realtime"].includes(config.model_type)) {
21611
+ let subConfig;
21612
+ if (config.model_type === "voxtral_realtime" && options?.session_name === "audio_encoder") {
21613
+ subConfig = /** @type {any} */
21614
+ config.audio_config;
21615
+ } else {
21616
+ subConfig = /** @type {any} */
21617
+ config.text_config;
21618
+ }
21619
+ return getCacheShapes(subConfig, options);
20673
21620
  }
20674
21621
  return getKeyValueShapes(config, options);
20675
21622
  }
@@ -20835,7 +21782,7 @@ async function getModelDataFiles(pretrained_model_name_or_path, fileName, suffix
20835
21782
  }
20836
21783
 
20837
21784
  // src/models/session.js
20838
- async function getSession(pretrained_model_name_or_path, fileName, options, is_decoder = false) {
21785
+ async function getSession(pretrained_model_name_or_path, fileName, options, cache_config = false, session_name = void 0) {
20839
21786
  let custom_config = options.config?.["transformers.js_config"] ?? {};
20840
21787
  const selectedDevice = (
20841
21788
  /** @type {import("../utils/devices.js").DeviceType} */
@@ -20893,9 +21840,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
20893
21840
  if (externalData.length > 0 && !apis.IS_NODE_ENV) {
20894
21841
  session_options.externalData = externalData;
20895
21842
  }
20896
- if (is_decoder && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
21843
+ if (cache_config && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
20897
21844
  const shapes = getCacheShapes(options.config, {
20898
- prefix: "present"
21845
+ prefix: "present",
21846
+ session_name
20899
21847
  });
20900
21848
  if (Object.keys(shapes).length > 0 && !isONNXProxy()) {
20901
21849
  const preferredOutputLocation = {};
@@ -20913,15 +21861,17 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
20913
21861
  };
20914
21862
  return { buffer_or_path, session_options, session_config };
20915
21863
  }
20916
- async function constructSessions(pretrained_model_name_or_path, names, options, decoder_name = void 0) {
21864
+ async function constructSessions(pretrained_model_name_or_path, names, options, cache_sessions = void 0) {
20917
21865
  return Object.fromEntries(
20918
21866
  await Promise.all(
20919
21867
  Object.keys(names).map(async (name) => {
21868
+ const cache_config = cache_sessions?.[name] ?? false;
20920
21869
  const { buffer_or_path, session_options, session_config } = await getSession(
20921
21870
  pretrained_model_name_or_path,
20922
21871
  names[name],
20923
21872
  options,
20924
- name === decoder_name
21873
+ cache_config,
21874
+ name
20925
21875
  );
20926
21876
  const session = await createInferenceSession(buffer_or_path, session_options, session_config);
20927
21877
  return [name, session];
@@ -22221,6 +23171,66 @@ var BeamSearchSampler = class extends LogitsSampler {
22221
23171
  }
22222
23172
  };
22223
23173
 
23174
+ // src/cache_utils.js
23175
/**
 * Cache container that stores tensors directly as own properties, keyed by
 * their name (e.g. `past_key_values.<layer>.<key|value>`).
 */
var _DynamicCache = class {
  /**
   * Create a DynamicCache, optionally pre-populated with entries.
   * @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
   * @throws {TypeError} If a key collides with an existing member, or a value is not a Tensor.
   */
  constructor(entries) {
    if (!entries) return;
    for (const name in entries) {
      // `in` also sees prototype members, so names such as "dispose" are rejected.
      const collides = name in this;
      if (collides) {
        throw new TypeError(`Key "${name}" conflicts with an existing property on DynamicCache`);
      }
      const tensor = entries[name];
      const is_tensor = tensor instanceof Tensor2;
      if (!is_tensor) {
        throw new TypeError(`Expected a Tensor for key "${name}", got ${typeof tensor}`);
      }
      this[name] = tensor;
    }
  }

  /**
   * Get the cached sequence length. This requires at least one attention cache entry to be present.
   * @returns {number} The past sequence length.
   * @throws {Error} If no `past_key_values.*` entry exists.
   */
  get_seq_length() {
    const entries = /** @type {any} */ (this);
    for (const key in entries) {
      if (!key.startsWith("past_key_values.")) continue;
      // The sequence axis is the second-to-last dimension of the first attention entry.
      return entries[key].dims.at(-2);
    }
    throw new Error("Unable to determine sequence length from the cache.");
  }

  /**
   * Dispose all contained tensors whose data resides on the GPU.
   * Returns a promise that resolves when all disposals are complete.
   * @returns {Promise<void>} Promise that resolves when all GPU tensors are disposed.
   */
  async dispose() {
    const tensors = /** @type {Tensor[]} */ (Object.values(this));
    const pending = tensors
      .filter((tensor) => tensor.location === "gpu-buffer")
      .map((tensor) => tensor.dispose());
    await Promise.all(pending);
  }
};
var DynamicCache = (
  /** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */
  /** @type {unknown} */
  (_DynamicCache)
);
23233
+
22224
23234
  // src/models/modeling_utils.js
22225
23235
  var MODEL_MAPPING_NAMES = null;
22226
23236
  function registerTaskMappings(mappings) {
@@ -22266,71 +23276,181 @@ var MODEL_TYPES = {
22266
23276
  AutoEncoder: 12,
22267
23277
  ImageAudioTextToText: 13,
22268
23278
  Supertonic: 14,
22269
- Chatterbox: 15
23279
+ Chatterbox: 15,
23280
+ MultimodalLanguageModelOnly: 16,
23281
+ VoxtralRealtime: 17
22270
23282
  };
22271
23283
  var MODEL_TYPE_CONFIG = {
22272
23284
  [MODEL_TYPES.DecoderOnly]: {
22273
23285
  can_generate: true,
22274
23286
  forward: decoder_forward,
22275
- prepare_inputs: decoder_prepare_inputs_for_generation
23287
+ prepare_inputs: decoder_prepare_inputs_for_generation,
23288
+ sessions: (config, options) => ({ model: options.model_file_name ?? "model" }),
23289
+ cache_sessions: { model: true },
23290
+ optional_configs: { generation_config: "generation_config.json" }
22276
23291
  },
22277
23292
  [MODEL_TYPES.DecoderOnlyWithoutHead]: {
22278
23293
  can_generate: false,
22279
23294
  forward: decoder_forward,
22280
- prepare_inputs: decoder_prepare_inputs_for_generation
23295
+ prepare_inputs: decoder_prepare_inputs_for_generation,
23296
+ sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
22281
23297
  },
22282
23298
  [MODEL_TYPES.Seq2Seq]: {
22283
23299
  can_generate: true,
22284
23300
  forward: seq2seq_forward,
22285
- prepare_inputs: encoder_decoder_prepare_inputs_for_generation
23301
+ prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
23302
+ sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
23303
+ cache_sessions: { decoder_model_merged: true },
23304
+ optional_configs: { generation_config: "generation_config.json" }
22286
23305
  },
22287
23306
  [MODEL_TYPES.Vision2Seq]: {
22288
23307
  can_generate: true,
22289
23308
  forward: seq2seq_forward,
22290
- prepare_inputs: encoder_decoder_prepare_inputs_for_generation
23309
+ prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
23310
+ sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
23311
+ cache_sessions: { decoder_model_merged: true },
23312
+ optional_configs: { generation_config: "generation_config.json" }
22291
23313
  },
22292
23314
  [MODEL_TYPES.Musicgen]: {
22293
23315
  can_generate: true,
22294
- forward: seq2seq_forward
23316
+ forward: seq2seq_forward,
23317
+ sessions: () => ({
23318
+ model: "text_encoder",
23319
+ decoder_model_merged: "decoder_model_merged",
23320
+ encodec_decode: "encodec_decode"
23321
+ }),
23322
+ cache_sessions: { decoder_model_merged: true },
23323
+ optional_configs: { generation_config: "generation_config.json" }
22295
23324
  },
22296
23325
  [MODEL_TYPES.EncoderDecoder]: {
22297
23326
  can_generate: false,
22298
- forward: seq2seq_forward
23327
+ forward: seq2seq_forward,
23328
+ sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
23329
+ cache_sessions: { decoder_model_merged: true }
23330
+ },
23331
+ [MODEL_TYPES.MaskGeneration]: {
23332
+ sessions: () => ({ model: "vision_encoder", prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder" })
22299
23333
  },
22300
23334
  [MODEL_TYPES.ImageTextToText]: {
22301
23335
  can_generate: true,
22302
23336
  forward: image_text_to_text_forward,
22303
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
23337
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
23338
+ sessions: (config) => {
23339
+ const s = {
23340
+ embed_tokens: "embed_tokens",
23341
+ vision_encoder: "vision_encoder",
23342
+ decoder_model_merged: "decoder_model_merged"
23343
+ };
23344
+ if (config.is_encoder_decoder) s["model"] = "encoder_model";
23345
+ return s;
23346
+ },
23347
+ cache_sessions: { decoder_model_merged: true },
23348
+ optional_configs: { generation_config: "generation_config.json" }
22304
23349
  },
22305
23350
  [MODEL_TYPES.AudioTextToText]: {
22306
23351
  can_generate: true,
22307
23352
  forward: audio_text_to_text_forward,
22308
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
23353
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
23354
+ sessions: () => ({
23355
+ embed_tokens: "embed_tokens",
23356
+ audio_encoder: "audio_encoder",
23357
+ decoder_model_merged: "decoder_model_merged"
23358
+ }),
23359
+ cache_sessions: { decoder_model_merged: true },
23360
+ optional_configs: { generation_config: "generation_config.json" }
22309
23361
  },
22310
- [MODEL_TYPES.Phi3V]: {
23362
+ [MODEL_TYPES.ImageAudioTextToText]: {
22311
23363
  can_generate: true,
22312
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
23364
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
23365
+ sessions: () => ({
23366
+ embed_tokens: "embed_tokens",
23367
+ audio_encoder: "audio_encoder",
23368
+ vision_encoder: "vision_encoder",
23369
+ decoder_model_merged: "decoder_model_merged"
23370
+ }),
23371
+ optional_configs: { generation_config: "generation_config.json" }
22313
23372
  },
22314
- [MODEL_TYPES.ImageAudioTextToText]: {
23373
+ [MODEL_TYPES.Phi3V]: {
22315
23374
  can_generate: true,
22316
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
23375
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
23376
+ sessions: () => ({
23377
+ prepare_inputs_embeds: "prepare_inputs_embeds",
23378
+ model: "model",
23379
+ vision_encoder: "vision_encoder"
23380
+ }),
23381
+ cache_sessions: { model: true },
23382
+ optional_configs: { generation_config: "generation_config.json" }
22317
23383
  },
22318
23384
  [MODEL_TYPES.MultiModality]: {
22319
- can_generate: true
23385
+ can_generate: true,
23386
+ sessions: () => ({
23387
+ prepare_inputs_embeds: "prepare_inputs_embeds",
23388
+ model: "language_model",
23389
+ lm_head: "lm_head",
23390
+ gen_head: "gen_head",
23391
+ gen_img_embeds: "gen_img_embeds",
23392
+ image_decode: "image_decode"
23393
+ }),
23394
+ cache_sessions: { model: true },
23395
+ optional_configs: { generation_config: "generation_config.json" }
22320
23396
  },
22321
23397
  [MODEL_TYPES.AutoEncoder]: {
22322
23398
  can_generate: false,
22323
- forward: auto_encoder_forward
23399
+ forward: auto_encoder_forward,
23400
+ sessions: () => ({ encoder_model: "encoder_model", decoder_model: "decoder_model" })
23401
+ },
23402
+ [MODEL_TYPES.Supertonic]: {
23403
+ sessions: () => ({
23404
+ text_encoder: "text_encoder",
23405
+ latent_denoiser: "latent_denoiser",
23406
+ voice_decoder: "voice_decoder"
23407
+ })
22324
23408
  },
22325
23409
  [MODEL_TYPES.Chatterbox]: {
22326
23410
  can_generate: true,
22327
- forward: encoder_forward
23411
+ forward: encoder_forward,
23412
+ sessions: () => ({
23413
+ embed_tokens: "embed_tokens",
23414
+ speech_encoder: "speech_encoder",
23415
+ model: "language_model",
23416
+ conditional_decoder: "conditional_decoder"
23417
+ }),
23418
+ cache_sessions: { model: true },
23419
+ optional_configs: { generation_config: "generation_config.json" }
23420
+ },
23421
+ [MODEL_TYPES.MultimodalLanguageModelOnly]: {
23422
+ can_generate: true,
23423
+ forward: image_text_to_text_forward,
23424
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
23425
+ sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
23426
+ cache_sessions: { decoder_model_merged: true },
23427
+ optional_configs: { generation_config: "generation_config.json" }
23428
+ },
23429
+ [MODEL_TYPES.VoxtralRealtime]: {
23430
+ can_generate: true,
23431
+ prepare_inputs: decoder_prepare_inputs_for_generation,
23432
+ sessions: () => ({
23433
+ embed_tokens: "embed_tokens",
23434
+ audio_encoder: "audio_encoder",
23435
+ decoder_model_merged: "decoder_model_merged"
23436
+ }),
23437
+ cache_sessions: { decoder_model_merged: true, audio_encoder: true },
23438
+ optional_configs: { generation_config: "generation_config.json" }
22328
23439
  },
22329
23440
  default: {
22330
23441
  can_generate: false,
22331
- forward: encoder_forward
23442
+ forward: encoder_forward,
23443
+ sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
22332
23444
  }
22333
23445
  };
23446
+ function getSessionsConfig(modelType, config, options = {}) {
23447
+ const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
23448
+ return {
23449
+ sessions: typeConfig.sessions(config, options),
23450
+ cache_sessions: typeConfig.cache_sessions,
23451
+ optional_configs: typeConfig.optional_configs
23452
+ };
23453
+ }
22334
23454
  var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
22335
23455
  var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
22336
23456
  var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
@@ -22416,245 +23536,23 @@ var PreTrainedModel = class extends Callable2 {
22416
23536
  const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
22417
23537
  const modelType = MODEL_TYPE_MAPPING.get(modelName);
22418
23538
  config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
22419
- let info;
22420
- if (modelType === MODEL_TYPES.DecoderOnly) {
22421
- info = await Promise.all([
22422
- constructSessions(
22423
- pretrained_model_name_or_path,
22424
- {
22425
- model: options.model_file_name ?? "model"
22426
- },
22427
- options,
22428
- "model"
22429
- ),
22430
- get_optional_configs(
22431
- pretrained_model_name_or_path,
22432
- {
22433
- generation_config: "generation_config.json"
22434
- },
22435
- options
22436
- )
22437
- ]);
22438
- } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
22439
- info = await Promise.all([
22440
- constructSessions(
22441
- pretrained_model_name_or_path,
22442
- {
22443
- model: "encoder_model",
22444
- decoder_model_merged: "decoder_model_merged"
22445
- },
22446
- options,
22447
- "decoder_model_merged"
22448
- ),
22449
- get_optional_configs(
22450
- pretrained_model_name_or_path,
22451
- {
22452
- generation_config: "generation_config.json"
22453
- },
22454
- options
22455
- )
22456
- ]);
22457
- } else if (modelType === MODEL_TYPES.MaskGeneration) {
22458
- info = await Promise.all([
22459
- constructSessions(
22460
- pretrained_model_name_or_path,
22461
- {
22462
- model: "vision_encoder",
22463
- prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder"
22464
- },
22465
- options
22466
- )
22467
- ]);
22468
- } else if (modelType === MODEL_TYPES.EncoderDecoder) {
22469
- info = await Promise.all([
22470
- constructSessions(
22471
- pretrained_model_name_or_path,
22472
- {
22473
- model: "encoder_model",
22474
- decoder_model_merged: "decoder_model_merged"
22475
- },
22476
- options,
22477
- "decoder_model_merged"
22478
- )
22479
- ]);
22480
- } else if (modelType === MODEL_TYPES.ImageTextToText) {
22481
- const sessions = {
22482
- embed_tokens: "embed_tokens",
22483
- vision_encoder: "vision_encoder",
22484
- decoder_model_merged: "decoder_model_merged"
22485
- };
22486
- if (config.is_encoder_decoder) {
22487
- sessions["model"] = "encoder_model";
22488
- }
22489
- info = await Promise.all([
22490
- constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
22491
- get_optional_configs(
22492
- pretrained_model_name_or_path,
22493
- {
22494
- generation_config: "generation_config.json"
22495
- },
22496
- options
22497
- )
22498
- ]);
22499
- } else if (modelType === MODEL_TYPES.AudioTextToText) {
22500
- const sessions = {
22501
- embed_tokens: "embed_tokens",
22502
- audio_encoder: "audio_encoder",
22503
- decoder_model_merged: "decoder_model_merged"
22504
- };
22505
- info = await Promise.all([
22506
- constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
22507
- get_optional_configs(
22508
- pretrained_model_name_or_path,
22509
- {
22510
- generation_config: "generation_config.json"
22511
- },
22512
- options
22513
- )
22514
- ]);
22515
- } else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
22516
- const sessions = {
22517
- embed_tokens: "embed_tokens",
22518
- audio_encoder: "audio_encoder",
22519
- vision_encoder: "vision_encoder",
22520
- decoder_model_merged: "decoder_model_merged"
22521
- };
22522
- info = await Promise.all([
22523
- constructSessions(pretrained_model_name_or_path, sessions, options),
22524
- get_optional_configs(
22525
- pretrained_model_name_or_path,
22526
- {
22527
- generation_config: "generation_config.json"
22528
- },
22529
- options
22530
- )
22531
- ]);
22532
- } else if (modelType === MODEL_TYPES.Musicgen) {
22533
- info = await Promise.all([
22534
- constructSessions(
22535
- pretrained_model_name_or_path,
22536
- {
22537
- model: "text_encoder",
22538
- decoder_model_merged: "decoder_model_merged",
22539
- encodec_decode: "encodec_decode"
22540
- },
22541
- options,
22542
- "decoder_model_merged"
22543
- ),
22544
- get_optional_configs(
22545
- pretrained_model_name_or_path,
22546
- {
22547
- generation_config: "generation_config.json"
22548
- },
22549
- options
22550
- )
22551
- ]);
22552
- } else if (modelType === MODEL_TYPES.MultiModality) {
22553
- info = await Promise.all([
22554
- constructSessions(
22555
- pretrained_model_name_or_path,
22556
- {
22557
- prepare_inputs_embeds: "prepare_inputs_embeds",
22558
- model: "language_model",
22559
- lm_head: "lm_head",
22560
- gen_head: "gen_head",
22561
- gen_img_embeds: "gen_img_embeds",
22562
- image_decode: "image_decode"
22563
- },
22564
- options,
22565
- "model"
22566
- ),
22567
- get_optional_configs(
22568
- pretrained_model_name_or_path,
22569
- {
22570
- generation_config: "generation_config.json"
22571
- },
22572
- options
22573
- )
22574
- ]);
22575
- } else if (modelType === MODEL_TYPES.Phi3V) {
22576
- info = await Promise.all([
22577
- constructSessions(
22578
- pretrained_model_name_or_path,
22579
- {
22580
- prepare_inputs_embeds: "prepare_inputs_embeds",
22581
- model: "model",
22582
- vision_encoder: "vision_encoder"
22583
- },
22584
- options,
22585
- "model"
22586
- ),
22587
- get_optional_configs(
22588
- pretrained_model_name_or_path,
22589
- {
22590
- generation_config: "generation_config.json"
22591
- },
22592
- options
22593
- )
22594
- ]);
22595
- } else if (modelType === MODEL_TYPES.Chatterbox) {
22596
- info = await Promise.all([
22597
- constructSessions(
22598
- pretrained_model_name_or_path,
22599
- {
22600
- embed_tokens: "embed_tokens",
22601
- speech_encoder: "speech_encoder",
22602
- model: "language_model",
22603
- conditional_decoder: "conditional_decoder"
22604
- },
22605
- options,
22606
- "model"
22607
- ),
22608
- get_optional_configs(
22609
- pretrained_model_name_or_path,
22610
- {
22611
- generation_config: "generation_config.json"
22612
- },
22613
- options
22614
- )
22615
- ]);
22616
- } else if (modelType === MODEL_TYPES.AutoEncoder) {
22617
- info = await Promise.all([
22618
- constructSessions(
22619
- pretrained_model_name_or_path,
22620
- {
22621
- encoder_model: "encoder_model",
22622
- decoder_model: "decoder_model"
22623
- },
22624
- options
22625
- )
22626
- ]);
22627
- } else if (modelType === MODEL_TYPES.Supertonic) {
22628
- info = await Promise.all([
22629
- constructSessions(
22630
- pretrained_model_name_or_path,
22631
- {
22632
- text_encoder: "text_encoder",
22633
- latent_denoiser: "latent_denoiser",
22634
- voice_decoder: "voice_decoder"
22635
- },
22636
- options
22637
- )
22638
- ]);
22639
- } else {
22640
- if (modelType === void 0) {
22641
- const type = modelName ?? config?.model_type;
22642
- if (type !== "custom") {
22643
- logger.warn(
22644
- `Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
22645
- );
22646
- }
23539
+ const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
23540
+ if (modelType === void 0) {
23541
+ const type = modelName ?? config?.model_type;
23542
+ if (type !== "custom") {
23543
+ logger.warn(
23544
+ `Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
23545
+ );
22647
23546
  }
22648
- info = await Promise.all([
22649
- constructSessions(
22650
- pretrained_model_name_or_path,
22651
- {
22652
- model: options.model_file_name ?? "model"
22653
- },
22654
- options
22655
- )
22656
- ]);
22657
23547
  }
23548
+ const sessions = typeConfig.sessions(config, options);
23549
+ const promises = [
23550
+ constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
23551
+ ];
23552
+ if (typeConfig.optional_configs) {
23553
+ promises.push(get_optional_configs(pretrained_model_name_or_path, typeConfig.optional_configs, options));
23554
+ }
23555
+ const info = await Promise.all(promises);
22658
23556
  return new this(config, ...info);
22659
23557
  }
22660
23558
  /**
@@ -22853,7 +23751,7 @@ var PreTrainedModel = class extends Callable2 {
22853
23751
  * @param {Tensor} [params.inputs=null]
22854
23752
  * @param {number} [params.bos_token_id=null]
22855
23753
  * @param {Record<string, Tensor|number[]>} [params.model_kwargs]
22856
- * @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor>, model_input_name: string}} The model-specific inputs for generation.
23754
+ * @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor> & {past_key_values?: DynamicCache}, model_input_name: string}} The model-specific inputs for generation.
22857
23755
  */
22858
23756
  _prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) {
22859
23757
  const model_inputs = pick(model_kwargs, this.forward_params);
@@ -23094,11 +23992,12 @@ var PreTrainedModel = class extends Callable2 {
23094
23992
  }
23095
23993
  }
23096
23994
  /**
23097
- * Returns an object containing past key values from the given decoder results object.
23995
+ * Returns a DynamicCache containing past key values from the given decoder results object.
23098
23996
  *
23099
23997
  * @param {Object} decoderResults The decoder results object.
23100
- * @param {Object} pastKeyValues The previous past key values.
23101
- * @returns {Object} An object containing past key values.
23998
+ * @param {DynamicCache} pastKeyValues The previous past key values.
23999
+ * @param {boolean} [disposeEncoderPKVs=false] Whether to dispose encoder past key values.
24000
+ * @returns {DynamicCache} A new DynamicCache containing the updated past key values.
23102
24001
  */
23103
24002
  getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
23104
24003
  const pkvs = /* @__PURE__ */ Object.create(null);
@@ -23119,7 +24018,7 @@ var PreTrainedModel = class extends Callable2 {
23119
24018
  }
23120
24019
  }
23121
24020
  }
23122
- return pkvs;
24021
+ return new DynamicCache(pkvs);
23123
24022
  }
23124
24023
  /**
23125
24024
  * Returns an object containing attentions from the given model output object.
@@ -23144,8 +24043,8 @@ var PreTrainedModel = class extends Callable2 {
23144
24043
  /**
23145
24044
  * Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values.
23146
24045
  *
23147
- * @param {Object} decoderFeeds The decoder feeds object to add past key values to.
23148
- * @param {Object} pastKeyValues An object containing past key values.
24046
+ * @param {Record<string, any>} decoderFeeds The decoder feeds object to add past key values to.
24047
+ * @param {DynamicCache|null} pastKeyValues The cache containing past key values.
23149
24048
  */
23150
24049
  addPastKeyValues(decoderFeeds, pastKeyValues) {
23151
24050
  if (pastKeyValues) {
@@ -23162,14 +24061,29 @@ var PreTrainedModel = class extends Callable2 {
23162
24061
  }
23163
24062
  }
23164
24063
  }
23165
- async encode_image({ pixel_values }) {
23166
- return (await sessionRun(this.sessions["vision_encoder"], { pixel_values })).image_features;
24064
+ /**
24065
+ * Helper function to select valid inputs and run through the appropriate encoder (vision, text, audio) based on the input type.
24066
+ * @param {string} sessionName
24067
+ * @param {Record<string, Tensor>} inputs
24068
+ * @param {string} outputName
24069
+ * @private
24070
+ */
24071
+ async _encode_input(sessionName, inputs, outputName) {
24072
+ if (!Object.hasOwn(this.sessions, sessionName)) {
24073
+ throw new Error(`Model does not have a ${sessionName} session.`);
24074
+ }
24075
+ const session = this.sessions[sessionName];
24076
+ const output = await sessionRun(session, pick(inputs, session.inputNames));
24077
+ return output[outputName];
24078
+ }
24079
+ async encode_image(inputs) {
24080
+ return this._encode_input("vision_encoder", inputs, "image_features");
23167
24081
  }
23168
- async encode_text({ input_ids }) {
23169
- return (await sessionRun(this.sessions["embed_tokens"], { input_ids })).inputs_embeds;
24082
+ async encode_text(inputs) {
24083
+ return this._encode_input("embed_tokens", inputs, "inputs_embeds");
23170
24084
  }
23171
- async encode_audio({ audio_values }) {
23172
- return (await sessionRun(this.sessions["audio_encoder"], { audio_values })).audio_features;
24085
+ async encode_audio(inputs) {
24086
+ return this._encode_input("audio_encoder", inputs, "audio_features");
23173
24087
  }
23174
24088
  };
23175
24089
  async function seq2seq_forward(self2, model_inputs) {
@@ -23224,6 +24138,9 @@ async function decoder_forward(self2, model_inputs, is_encoder_decoder = false)
23224
24138
  const start_index = ["paligemma", "gemma3_text", "gemma3"].includes(self2.config.model_type) ? 1 : 0;
23225
24139
  new_model_inputs.position_ids = create_position_ids(new_model_inputs, past_key_values, start_index);
23226
24140
  }
24141
+ if (session.inputNames.includes("num_logits_to_keep") && !new_model_inputs.num_logits_to_keep) {
24142
+ new_model_inputs.num_logits_to_keep = new Tensor2("int64", [0n], []);
24143
+ }
23227
24144
  self2.addPastKeyValues(new_model_inputs, past_key_values);
23228
24145
  const fixed = pick(new_model_inputs, session.inputNames);
23229
24146
  return await sessionRun(session, fixed);
@@ -23232,7 +24149,7 @@ async function generic_text_to_text_forward(self2, {
23232
24149
  // Generic parameters:
23233
24150
  encode_function,
23234
24151
  merge_function,
23235
- modality_input_name,
24152
+ modality_input_names,
23236
24153
  modality_output_name,
23237
24154
  // Produced by the tokenizer/processor:
23238
24155
  input_ids = null,
@@ -23247,38 +24164,54 @@ async function generic_text_to_text_forward(self2, {
23247
24164
  // Additional parameters
23248
24165
  ...kwargs
23249
24166
  }) {
23250
- const modality_values = kwargs[modality_input_name];
23251
24167
  if (!inputs_embeds) {
23252
24168
  inputs_embeds = await self2.encode_text({ input_ids, ...kwargs });
23253
- if (modality_values && input_ids.dims[1] !== 1) {
23254
- const modality_features = await encode_function({
23255
- // Pass the modality values under its expected key.
23256
- // The caller knows whether this is audio or image.
23257
- [modality_input_name]: modality_values,
23258
- ...kwargs
23259
- });
23260
- ({ inputs_embeds, attention_mask } = merge_function({
23261
- [modality_output_name]: modality_features,
23262
- inputs_embeds,
23263
- input_ids,
23264
- attention_mask
23265
- }));
23266
- } else if (past_key_values && modality_values && input_ids.dims[1] === 1) {
23267
- const target_length = input_ids.dims[1];
23268
- const past_length = Object.values(past_key_values)[0].dims.at(-2);
23269
- attention_mask = cat(
23270
- [
23271
- ones([input_ids.dims[0], past_length]),
23272
- attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
23273
- ],
23274
- 1
23275
- );
24169
+ const modality_values = pick(kwargs, modality_input_names);
24170
+ if (Object.keys(modality_values).length > 0) {
24171
+ if (input_ids.dims[1] !== 1) {
24172
+ const modality_features = await encode_function({
24173
+ // Pass the modality values under its expected key.
24174
+ // The caller knows whether this is audio or image.
24175
+ ...modality_values,
24176
+ ...kwargs
24177
+ });
24178
+ ({ inputs_embeds, attention_mask } = merge_function({
24179
+ [modality_output_name]: modality_features,
24180
+ inputs_embeds,
24181
+ input_ids,
24182
+ attention_mask
24183
+ }));
24184
+ } else if (past_key_values && input_ids.dims[1] === 1) {
24185
+ const target_length = input_ids.dims[1];
24186
+ const past_length = past_key_values.get_seq_length();
24187
+ attention_mask = cat(
24188
+ [
24189
+ ones([input_ids.dims[0], past_length]),
24190
+ attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
24191
+ ],
24192
+ 1
24193
+ );
24194
+ }
23276
24195
  }
23277
24196
  }
23278
24197
  if (!position_ids) {
23279
- if (["qwen2_vl", "qwen2_5_vl_text", "qwen3_vl_text", "qwen3_5_text", "qwen3_5_moe_text"].includes(
23280
- self2.config.model_type
23281
- )) {
24198
+ if (
24199
+ // Handle special case for qwen vl models
24200
+ [
24201
+ "qwen2_vl",
24202
+ "qwen2_vl_text",
24203
+ "qwen2_5_vl",
24204
+ "qwen2_5_vl_text",
24205
+ "qwen3_vl",
24206
+ "qwen3_vl_text",
24207
+ "qwen3_vl_moe",
24208
+ "qwen3_vl_moe_text",
24209
+ "qwen3_5",
24210
+ "qwen3_5_text",
24211
+ "qwen3_5_moe",
24212
+ "qwen3_5_moe_text"
24213
+ ].includes(self2.config.model_type)
24214
+ ) {
23282
24215
  const { image_grid_thw, video_grid_thw } = kwargs;
23283
24216
  [position_ids] = self2.get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask);
23284
24217
  }
@@ -23300,7 +24233,7 @@ async function generic_text_to_text_forward(self2, {
23300
24233
  async function audio_text_to_text_forward(self2, params) {
23301
24234
  return await generic_text_to_text_forward(self2, {
23302
24235
  ...params,
23303
- modality_input_name: "audio_values",
24236
+ modality_input_names: ["audio_values", "input_features"],
23304
24237
  modality_output_name: "audio_features",
23305
24238
  encode_function: self2.encode_audio.bind(self2),
23306
24239
  merge_function: self2._merge_input_ids_with_audio_features.bind(self2)
@@ -23309,7 +24242,7 @@ async function audio_text_to_text_forward(self2, params) {
23309
24242
  async function image_text_to_text_forward(self2, params) {
23310
24243
  return await generic_text_to_text_forward(self2, {
23311
24244
  ...params,
23312
- modality_input_name: "pixel_values",
24245
+ modality_input_names: ["pixel_values"],
23313
24246
  modality_output_name: "image_features",
23314
24247
  encode_function: self2.encode_image.bind(self2),
23315
24248
  merge_function: self2._merge_input_ids_with_image_features.bind(self2)
@@ -23345,7 +24278,11 @@ function create_position_ids(model_inputs, past_key_values = null, start_index =
23345
24278
  return position_ids;
23346
24279
  }
23347
24280
  function decoder_prepare_inputs_for_generation(self2, input_ids, model_inputs, generation_config) {
23348
- const past_length = model_inputs.past_key_values ? Object.values(model_inputs.past_key_values)[0].dims.at(-2) : 0;
24281
+ const past_length = model_inputs.past_key_values ? model_inputs.past_key_values.get_seq_length() : 0;
24282
+ const session = self2.sessions["decoder_model_merged"] ?? self2.sessions["model"];
24283
+ if (session?.inputNames.includes("num_logits_to_keep") && !model_inputs.num_logits_to_keep) {
24284
+ model_inputs.num_logits_to_keep = new Tensor2("int64", [1n], []);
24285
+ }
23349
24286
  if (!model_inputs.attention_mask) {
23350
24287
  let dims;
23351
24288
  for (const key of ["input_ids", "inputs_embeds", "position_ids"]) {
@@ -23653,6 +24590,7 @@ __export(models_exports, {
23653
24590
  Gemma3ForCausalLM: () => Gemma3ForCausalLM,
23654
24591
  Gemma3Model: () => Gemma3Model,
23655
24592
  Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
24593
+ Gemma3nForCausalLM: () => Gemma3nForCausalLM,
23656
24594
  Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
23657
24595
  Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
23658
24596
  GemmaForCausalLM: () => GemmaForCausalLM,
@@ -23670,6 +24608,7 @@ __export(models_exports, {
23670
24608
  GraniteMoeHybridModel: () => GraniteMoeHybridModel,
23671
24609
  GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
23672
24610
  GranitePreTrainedModel: () => GranitePreTrainedModel,
24611
+ GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
23673
24612
  GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
23674
24613
  GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
23675
24614
  GroupViTModel: () => GroupViTModel,
@@ -23691,7 +24630,6 @@ __export(models_exports, {
23691
24630
  IJepaModel: () => IJepaModel,
23692
24631
  IJepaPreTrainedModel: () => IJepaPreTrainedModel,
23693
24632
  Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
23694
- Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
23695
24633
  JAISLMHeadModel: () => JAISLMHeadModel,
23696
24634
  JAISModel: () => JAISModel,
23697
24635
  JAISPreTrainedModel: () => JAISPreTrainedModel,
@@ -23705,6 +24643,7 @@ __export(models_exports, {
23705
24643
  Lfm2MoeModel: () => Lfm2MoeModel,
23706
24644
  Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
23707
24645
  Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
24646
+ Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
23708
24647
  LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
23709
24648
  Llama4ForCausalLM: () => Llama4ForCausalLM,
23710
24649
  Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -23829,6 +24768,9 @@ __export(models_exports, {
23829
24768
  Olmo3Model: () => Olmo3Model,
23830
24769
  Olmo3PreTrainedModel: () => Olmo3PreTrainedModel,
23831
24770
  OlmoForCausalLM: () => OlmoForCausalLM,
24771
+ OlmoHybridForCausalLM: () => OlmoHybridForCausalLM,
24772
+ OlmoHybridModel: () => OlmoHybridModel,
24773
+ OlmoHybridPreTrainedModel: () => OlmoHybridPreTrainedModel,
23832
24774
  OlmoModel: () => OlmoModel,
23833
24775
  OlmoPreTrainedModel: () => OlmoPreTrainedModel,
23834
24776
  OpenELMForCausalLM: () => OpenELMForCausalLM,
@@ -23841,7 +24783,6 @@ __export(models_exports, {
23841
24783
  Owlv2Model: () => Owlv2Model,
23842
24784
  Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
23843
24785
  PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
23844
- PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
23845
24786
  ParakeetForCTC: () => ParakeetForCTC,
23846
24787
  ParakeetPreTrainedModel: () => ParakeetPreTrainedModel,
23847
24788
  PatchTSMixerForPrediction: () => PatchTSMixerForPrediction,
@@ -23867,15 +24808,31 @@ __export(models_exports, {
23867
24808
  PyAnnotePreTrainedModel: () => PyAnnotePreTrainedModel,
23868
24809
  Qwen2ForCausalLM: () => Qwen2ForCausalLM,
23869
24810
  Qwen2Model: () => Qwen2Model,
24811
+ Qwen2MoeForCausalLM: () => Qwen2MoeForCausalLM,
24812
+ Qwen2MoeModel: () => Qwen2MoeModel,
24813
+ Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
23870
24814
  Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
24815
+ Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
23871
24816
  Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
23872
24817
  Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
24818
+ Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
23873
24819
  Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
23874
24820
  Qwen3ForCausalLM: () => Qwen3ForCausalLM,
23875
24821
  Qwen3Model: () => Qwen3Model,
24822
+ Qwen3MoeForCausalLM: () => Qwen3MoeForCausalLM,
24823
+ Qwen3MoeModel: () => Qwen3MoeModel,
24824
+ Qwen3MoePreTrainedModel: () => Qwen3MoePreTrainedModel,
24825
+ Qwen3NextForCausalLM: () => Qwen3NextForCausalLM,
24826
+ Qwen3NextModel: () => Qwen3NextModel,
24827
+ Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
23876
24828
  Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
24829
+ Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
23877
24830
  Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
24831
+ Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
24832
+ Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
24833
+ Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
23878
24834
  Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
24835
+ Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
23879
24836
  Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
23880
24837
  RFDetrForObjectDetection: () => RFDetrForObjectDetection,
23881
24838
  RFDetrModel: () => RFDetrModel,
@@ -23926,7 +24883,6 @@ __export(models_exports, {
23926
24883
  SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
23927
24884
  SmolLM3Model: () => SmolLM3Model,
23928
24885
  SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
23929
- SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
23930
24886
  SnacDecoderModel: () => SnacDecoderModel,
23931
24887
  SnacEncoderModel: () => SnacEncoderModel,
23932
24888
  SnacModel: () => SnacModel,
@@ -23998,6 +24954,8 @@ __export(models_exports, {
23998
24954
  VitsModelOutput: () => VitsModelOutput,
23999
24955
  VitsPreTrainedModel: () => VitsPreTrainedModel,
24000
24956
  VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
24957
+ VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
24958
+ VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
24001
24959
  Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
24002
24960
  Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
24003
24961
  Wav2Vec2BertModel: () => Wav2Vec2BertModel,
@@ -24358,7 +25316,7 @@ var ChatterboxModel = class extends ChatterboxPreTrainedModel {
24358
25316
  if (!past_key_values || target_length !== 1) {
24359
25317
  throw new Error("Incorrect state encountered during generation.");
24360
25318
  }
24361
- const past_length = Object.values(past_key_values)[0].dims.at(-2);
25319
+ const past_length = past_key_values.get_seq_length();
24362
25320
  attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
24363
25321
  }
24364
25322
  }
@@ -25388,6 +26346,8 @@ var Gemma3nForConditionalGeneration = class extends Gemma3nPreTrainedModel {
25388
26346
  });
25389
26347
  }
25390
26348
  };
26349
/**
 * Causal-LM flavour of Gemma3n. Adds no members of its own; everything is
 * inherited from `Gemma3nForConditionalGeneration`.
 */
var Gemma3nForCausalLM = class extends Gemma3nForConditionalGeneration {};
25391
26351
 
25392
26352
  // src/models/glm/modeling_glm.js
25393
26353
  var GlmPreTrainedModel = class extends PreTrainedModel {
@@ -25469,6 +26429,28 @@ var GraniteMoeHybridModel = class extends GraniteMoeHybridPreTrainedModel {
25469
26429
  var GraniteMoeHybridForCausalLM = class extends GraniteMoeHybridPreTrainedModel {
25470
26430
  };
25471
26431
 
26432
+ // src/models/ultravox/modeling_ultravox.js
26433
/**
 * Base class of the Ultravox family. Its only contribution is the
 * `forward_params` list below.
 */
var UltravoxPreTrainedModel = class extends PreTrainedModel {
  forward_params = [
    "input_ids",
    "attention_mask",
    "position_ids",
    "audio_values",
    "past_key_values",
  ];
};
26436
var UltravoxModel = class extends UltravoxPreTrainedModel {
  /**
   * Reshapes `kwargs.audio_features` to two dimensions (keeping the last
   * dimension) and hands everything to `default_merge_input_ids_with_audio_features`.
   *
   * The audio token id defaults to the first non-nullish of
   * `config.ignore_index`, `config.audio_token_id`, `config.audio_token_index`;
   * an `audio_token_id` present in `kwargs` takes precedence because `kwargs`
   * is spread after the default.
   *
   * @param {Object} kwargs Merge arguments; must contain an `audio_features` tensor.
   * @returns {*} Whatever `default_merge_input_ids_with_audio_features` returns.
   */
  _merge_input_ids_with_audio_features(kwargs) {
    const { audio_features } = kwargs;
    const flattened = audio_features.view(-1, audio_features.dims.at(-1));
    const { config } = this;
    return default_merge_input_ids_with_audio_features({
      // @ts-ignore
      audio_token_id: config.ignore_index ?? config.audio_token_id ?? config.audio_token_index,
      ...kwargs,
      audio_features: flattened,
    });
  }
};
26448
+
26449
+ // src/models/granite_speech/modeling_granite_speech.js
26450
/**
 * Granite Speech model. Reuses `UltravoxModel` and only overrides
 * `forward_params` (audio arrives as `input_features`; no `position_ids`).
 */
var GraniteSpeechForConditionalGeneration = class extends UltravoxModel {
  forward_params = [
    "input_ids",
    "attention_mask",
    "input_features",
    "past_key_values",
  ];
};
26453
+
25472
26454
  // src/models/grounding_dino/modeling_grounding_dino.js
25473
26455
  var GroundingDinoPreTrainedModel = class extends PreTrainedModel {
25474
26456
  };
@@ -25564,17 +26546,38 @@ var HubertForSequenceClassification = class extends Wav2Vec2PreTrainedModel {
25564
26546
  return new SequenceClassifierOutput(await super._call(model_inputs));
25565
26547
  }
25566
26548
  };
25567
-
25568
- // src/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.js
25569
- var HunYuanDenseV1PreTrainedModel = class extends PreTrainedModel {
25570
- };
25571
- var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
26549
+
26550
+ // src/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.js
26551
/** Base class of the HunYuan Dense V1 family; adds nothing to `PreTrainedModel`. */
var HunYuanDenseV1PreTrainedModel = class extends PreTrainedModel {};

/** Bare HunYuan Dense V1 model; no members beyond the base class. */
var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {};

/** HunYuan Dense V1 causal-LM class; no members beyond the base class. */
var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {};
26557
+
26558
+ // src/models/llava/modeling_llava.js
26559
/**
 * Base class of the LLaVA family. Its only contribution is the
 * `forward_params` list below.
 */
var LlavaPreTrainedModel = class extends PreTrainedModel {
  forward_params = [
    "input_ids",
    "attention_mask",
    "pixel_values",
    "position_ids",
    "past_key_values",
  ];
};
26562
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
  /**
   * Reshapes `kwargs.image_features` to two dimensions (keeping the last
   * dimension) and hands everything to `default_merge_input_ids_with_image_features`.
   *
   * The image token id defaults to `config.image_token_index`, falling back to
   * `config.image_token_id` when the former is nullish; an `image_token_id`
   * present in `kwargs` takes precedence because `kwargs` is spread after the
   * default.
   *
   * @param {Object} kwargs Merge arguments; must contain an `image_features` tensor.
   * @returns {*} Whatever `default_merge_input_ids_with_image_features` returns.
   */
  _merge_input_ids_with_image_features(kwargs) {
    const { image_features } = kwargs;
    const flattened = image_features.view(-1, image_features.dims.at(-1));
    const { config } = this;
    return default_merge_input_ids_with_image_features({
      // @ts-ignore
      image_token_id: config.image_token_index ?? config.image_token_id,
      ...kwargs,
      image_features: flattened,
    });
  }
};
26574
+ var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
25572
26575
  };
25573
- var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
26576
+ var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
25574
26577
  };
25575
26578
 
25576
26579
  // src/models/idefics3/modeling_idefics3.js
25577
- var Idefics3PreTrainedModel = class extends PreTrainedModel {
26580
+ var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
25578
26581
  forward_params = [
25579
26582
  "input_ids",
25580
26583
  "attention_mask",
@@ -25584,24 +26587,6 @@ var Idefics3PreTrainedModel = class extends PreTrainedModel {
25584
26587
  "past_key_values"
25585
26588
  ];
25586
26589
  };
25587
- var Idefics3ForConditionalGeneration = class extends Idefics3PreTrainedModel {
25588
- async encode_image({ pixel_values, pixel_attention_mask }) {
25589
- const features = (await sessionRun(this.sessions["vision_encoder"], { pixel_values, pixel_attention_mask })).image_features;
25590
- return features;
25591
- }
25592
- _merge_input_ids_with_image_features(kwargs) {
25593
- const vision_hidden_size = kwargs.image_features.dims.at(-1);
25594
- const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
25595
- return default_merge_input_ids_with_image_features({
25596
- // @ts-ignore
25597
- image_token_id: this.config.image_token_id,
25598
- ...kwargs,
25599
- image_features: reshaped_image_hidden_states
25600
- });
25601
- }
25602
- };
25603
- var SmolVLMForConditionalGeneration = class extends Idefics3ForConditionalGeneration {
25604
- };
25605
26590
 
25606
26591
  // src/models/ijepa/modeling_ijepa.js
25607
26592
  var IJepaPreTrainedModel = class extends PreTrainedModel {
@@ -25692,6 +26677,19 @@ var Lfm2MoeModel = class extends Lfm2MoePreTrainedModel {
25692
26677
  var Lfm2MoeForCausalLM = class extends Lfm2MoePreTrainedModel {
25693
26678
  };
25694
26679
 
26680
+ // src/models/lfm2_vl/modeling_lfm2_vl.js
26681
/**
 * LFM2-VL model. Reuses `LlavaForConditionalGeneration` and only overrides
 * `forward_params`, which additionally lists `pixel_attention_mask` and
 * `spatial_shapes`.
 */
var Lfm2VlForConditionalGeneration = class extends LlavaForConditionalGeneration {
  forward_params = [
    "input_ids",
    "attention_mask",
    "pixel_values",
    "pixel_attention_mask",
    "spatial_shapes",
    "position_ids",
    "past_key_values",
  ];
};
26692
+
25695
26693
  // src/models/llama/modeling_llama.js
25696
26694
  var LlamaPreTrainedModel = class extends PreTrainedModel {
25697
26695
  };
@@ -25706,27 +26704,6 @@ var Llama4PreTrainedModel = class extends PreTrainedModel {
25706
26704
  var Llama4ForCausalLM = class extends Llama4PreTrainedModel {
25707
26705
  };
25708
26706
 
25709
- // src/models/llava/modeling_llava.js
25710
- var LlavaPreTrainedModel = class extends PreTrainedModel {
25711
- forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
25712
- };
25713
- var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
25714
- _merge_input_ids_with_image_features(kwargs) {
25715
- const vision_hidden_size = kwargs.image_features.dims.at(-1);
25716
- const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
25717
- return default_merge_input_ids_with_image_features({
25718
- // @ts-ignore
25719
- image_token_id: this.config.image_token_index,
25720
- ...kwargs,
25721
- image_features: reshaped_image_hidden_states
25722
- });
25723
- }
25724
- };
25725
- var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
25726
- };
25727
- var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
25728
- };
25729
-
25730
26707
  // src/models/longt5/modeling_longt5.js
25731
26708
  var LongT5PreTrainedModel = class extends PreTrainedModel {
25732
26709
  };
@@ -26436,6 +27413,14 @@ var Olmo3Model = class extends Olmo3PreTrainedModel {
26436
27413
  var Olmo3ForCausalLM = class extends Olmo3PreTrainedModel {
26437
27414
  };
26438
27415
 
27416
+ // src/models/olmo_hybrid/modeling_olmo_hybrid.js
27417
/** Base class of the OLMo-Hybrid family; adds nothing to `PreTrainedModel`. */
var OlmoHybridPreTrainedModel = class extends PreTrainedModel {};

/** Bare OLMo-Hybrid model; no members beyond the base class. */
var OlmoHybridModel = class extends OlmoHybridPreTrainedModel {};

/** OLMo-Hybrid causal-LM class; no members beyond the base class. */
var OlmoHybridForCausalLM = class extends OlmoHybridPreTrainedModel {};
27423
+
26439
27424
  // src/models/openelm/modeling_openelm.js
26440
27425
  var OpenELMPreTrainedModel = class extends PreTrainedModel {
26441
27426
  };
@@ -26469,27 +27454,7 @@ var OwlViTForObjectDetection = class extends OwlViTPreTrainedModel {
26469
27454
  };
26470
27455
 
26471
27456
  // src/models/paligemma/modeling_paligemma.js
26472
- var PaliGemmaPreTrainedModel = class extends PreTrainedModel {
26473
- forward_params = [
26474
- "input_ids",
26475
- // 'inputs_embeds',
26476
- "attention_mask",
26477
- "pixel_values",
26478
- "position_ids",
26479
- "past_key_values"
26480
- ];
26481
- };
26482
- var PaliGemmaForConditionalGeneration = class extends PaliGemmaPreTrainedModel {
26483
- _merge_input_ids_with_image_features(kwargs) {
26484
- const vision_hidden_size = kwargs.image_features.dims.at(-1);
26485
- const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
26486
- return default_merge_input_ids_with_image_features({
26487
- // @ts-ignore
26488
- image_token_id: this.config.image_token_index,
26489
- ...kwargs,
26490
- image_features: reshaped_image_hidden_states
26491
- });
26492
- }
27457
+ var PaliGemmaForConditionalGeneration = class extends LlavaForConditionalGeneration {
26493
27458
  };
26494
27459
 
26495
27460
  // src/models/parakeet/modeling_parakeet.js
@@ -26640,6 +27605,14 @@ var Qwen2Model = class extends Qwen2PreTrainedModel {
26640
27605
  var Qwen2ForCausalLM = class extends Qwen2PreTrainedModel {
26641
27606
  };
26642
27607
 
27608
+ // src/models/qwen2_moe/modeling_qwen2_moe.js
27609
/** Base class of the Qwen2-MoE family; adds nothing to `PreTrainedModel`. */
var Qwen2MoePreTrainedModel = class extends PreTrainedModel {};

/** Bare Qwen2-MoE model; no members beyond the base class. */
var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {};

/** Qwen2-MoE causal-LM class; no members beyond the base class. */
var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {};
27615
+
26643
27616
  // src/models/qwen2_vl/modeling_qwen2_vl.js
26644
27617
  var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
26645
27618
  forward_params = [
@@ -26654,6 +27627,9 @@ var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
26654
27627
  ];
26655
27628
  };
26656
27629
  var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
27630
+ // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
27631
+ // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
27632
+ // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
26657
27633
  image_grid_thw_name = "grid_thw";
26658
27634
  /**
26659
27635
  * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
@@ -26843,19 +27819,45 @@ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
26843
27819
  );
26844
27820
  } else {
26845
27821
  model_inputs.pixel_values = null;
26846
- const delta = BigInt(Object.values(model_inputs.past_key_values)[0].dims.at(-2));
26847
- const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
26848
- model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
27822
+ const past_length = model_inputs.past_key_values.get_seq_length();
27823
+ if (past_length < model_inputs.input_ids.dims[1]) {
27824
+ const [full_position_ids, rope_deltas] = this.get_rope_index(
27825
+ model_inputs.input_ids,
27826
+ model_inputs.image_grid_thw,
27827
+ model_inputs.video_grid_thw,
27828
+ model_inputs.attention_mask
27829
+ );
27830
+ model_inputs.rope_deltas = rope_deltas;
27831
+ model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
27832
+ model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
27833
+ } else {
27834
+ if (!model_inputs.rope_deltas) {
27835
+ [, model_inputs.rope_deltas] = this.get_rope_index(
27836
+ model_inputs.input_ids,
27837
+ model_inputs.image_grid_thw,
27838
+ model_inputs.video_grid_thw,
27839
+ model_inputs.attention_mask
27840
+ );
27841
+ }
27842
+ const delta = BigInt(past_length);
27843
+ const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
27844
+ model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
27845
+ }
26849
27846
  }
26850
27847
  }
26851
27848
  return model_inputs;
26852
27849
  }
26853
27850
  };
27851
/**
 * Causal-LM flavour of Qwen2-VL. Adds no members of its own; everything is
 * inherited from `Qwen2VLForConditionalGeneration`.
 */
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {};
26854
27853
 
26855
27854
  // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
26856
27855
  var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
26857
27856
  image_grid_thw_name = "image_grid_thw";
26858
27857
  };
27858
/**
 * Causal-LM flavour of Qwen2.5-VL. Extends `Qwen2VLForCausalLM` and sets
 * `image_grid_thw_name` to "image_grid_thw", the same value
 * `Qwen2_5_VLForConditionalGeneration` uses.
 */
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
  image_grid_thw_name = "image_grid_thw";
};
26859
27861
 
26860
27862
  // src/models/qwen3/modeling_qwen3.js
26861
27863
  var Qwen3PreTrainedModel = class extends PreTrainedModel {
@@ -26865,17 +27867,45 @@ var Qwen3Model = class extends Qwen3PreTrainedModel {
26865
27867
  var Qwen3ForCausalLM = class extends Qwen3PreTrainedModel {
26866
27868
  };
26867
27869
 
27870
+ // src/models/qwen3_moe/modeling_qwen3_moe.js
27871
/** Base class of the Qwen3-MoE family; adds nothing to `PreTrainedModel`. */
var Qwen3MoePreTrainedModel = class extends PreTrainedModel {};

/** Bare Qwen3-MoE model; no members beyond the base class. */
var Qwen3MoeModel = class extends Qwen3MoePreTrainedModel {};

/** Qwen3-MoE causal-LM class; no members beyond the base class. */
var Qwen3MoeForCausalLM = class extends Qwen3MoePreTrainedModel {};
27877
+
27878
+ // src/models/qwen3_next/modeling_qwen3_next.js
27879
/** Base class of the Qwen3-Next family; adds nothing to `PreTrainedModel`. */
var Qwen3NextPreTrainedModel = class extends PreTrainedModel {};

/** Bare Qwen3-Next model; no members beyond the base class. */
var Qwen3NextModel = class extends Qwen3NextPreTrainedModel {};

/** Qwen3-Next causal-LM class; no members beyond the base class. */
var Qwen3NextForCausalLM = class extends Qwen3NextPreTrainedModel {};
27885
+
26868
27886
  // src/models/qwen3_vl/modeling_qwen3_vl.js
26869
27887
  var Qwen3VLForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
26870
27888
  };
27889
/**
 * Causal-LM flavour of Qwen3-VL. Adds no members of its own; everything is
 * inherited from `Qwen2_5_VLForCausalLM`.
 */
var Qwen3VLForCausalLM = class extends Qwen2_5_VLForCausalLM {};
27891
+
27892
+ // src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js
27893
/** Qwen3-VL-MoE model; inherits everything from `Qwen3VLForConditionalGeneration`. */
var Qwen3VLMoeForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {};

/** Causal-LM flavour of Qwen3-VL-MoE; inherits everything from `Qwen3VLForCausalLM`. */
var Qwen3VLMoeForCausalLM = class extends Qwen3VLForCausalLM {};
26871
27897
 
26872
27898
  // src/models/qwen3_5/modeling_qwen3_5.js
26873
27899
  var Qwen3_5ForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
26874
27900
  };
27901
/**
 * Causal-LM flavour of Qwen3.5. Adds no members of its own; everything is
 * inherited from `Qwen3_5ForConditionalGeneration`.
 */
var Qwen3_5ForCausalLM = class extends Qwen3_5ForConditionalGeneration {};
26875
27903
 
26876
27904
  // src/models/qwen3_5_moe/modeling_qwen3_5_moe.js
26877
27905
  var Qwen3_5MoeForConditionalGeneration = class extends Qwen3_5ForConditionalGeneration {
26878
27906
  };
27907
/**
 * Causal-LM flavour of Qwen3.5-MoE. Adds no members of its own; everything is
 * inherited from `Qwen3_5ForCausalLM`.
 */
var Qwen3_5MoeForCausalLM = class extends Qwen3_5ForCausalLM {};
26879
27909
 
26880
27910
  // src/models/resnet/modeling_resnet.js
26881
27911
  var ResNetPreTrainedModel = class extends PreTrainedModel {
@@ -27556,25 +28586,6 @@ var TrOCRPreTrainedModel = class extends PreTrainedModel {
27556
28586
  var TrOCRForCausalLM = class extends TrOCRPreTrainedModel {
27557
28587
  };
27558
28588
 
27559
- // src/models/ultravox/modeling_ultravox.js
27560
- var UltravoxPreTrainedModel = class extends PreTrainedModel {
27561
- forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
27562
- };
27563
- var UltravoxModel = class extends UltravoxPreTrainedModel {
27564
- _merge_input_ids_with_audio_features(kwargs) {
27565
- const audio_hidden_size = kwargs.audio_features.dims.at(-1);
27566
- const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
27567
- return default_merge_input_ids_with_audio_features({
27568
- // @ts-ignore
27569
- audio_token_id: this.config.ignore_index ?? this.config.audio_token_id,
27570
- ...kwargs,
27571
- audio_features: reshaped_audio_features
27572
- });
27573
- }
27574
- };
27575
- var VoxtralForConditionalGeneration = class extends UltravoxModel {
27576
- };
27577
-
27578
28589
  // src/models/unispeech/modeling_unispeech.js
27579
28590
  var UniSpeechPreTrainedModel = class extends PreTrainedModel {
27580
28591
  };
@@ -27740,6 +28751,170 @@ var VitsModel = class extends VitsPreTrainedModel {
27740
28751
  }
27741
28752
  };
27742
28753
 
28754
+ // src/models/voxtral/modeling_voxtral.js
28755
/** Voxtral model; adds no members of its own and inherits everything from `UltravoxModel`. */
var VoxtralForConditionalGeneration = class extends UltravoxModel {};
28757
+
28758
+ // src/models/voxtral_realtime/modeling_voxtral_realtime.js
28759
// Size of the last dimension of the encoder padding-cache tensor built in
// createEncoderState. NOTE(review): presumably the first conv's left padding.
var CONV1_LEFT_PAD = 2;

// Left-pad term used by encodeChunk when computing how many encoder positions
// a chunk produces. NOTE(review): presumably the second conv's left padding.
var CONV2_LEFT_PAD = 1;

// Streaming encoder state for the generate() call in flight, keyed by model
// instance. Written and cleared by generate(); read by forward().
var states = /* @__PURE__ */ new WeakMap();
28762
/**
 * Builds the mutable state object used while streaming audio through the
 * audio encoder during one generation run.
 *
 * Reads `text_config` / `audio_config` from `model.config` and the
 * "audio_encoder" session from `model.sessions`. Allocates a zero-filled
 * tensor for every entry returned by `getCacheShapes(audio_config, { batch_size: 1 })`
 * plus a zero-filled padding cache of shape
 * `[1, num_mel_bins + hidden_size, CONV1_LEFT_PAD]`. Tensors use the session's
 * `config.kv_cache_dtype` when set, otherwise "float32".
 *
 * @param {Object} model Model exposing `config` and `sessions`.
 * @param {Iterable|AsyncIterable} input_features Source of feature chunks; the
 *   async iterator is preferred when both protocols are implemented.
 * @returns {Object} Fresh encoder state (caches, counters, chunk iterator).
 * @throws {Error} If `input_features` implements neither iteration protocol.
 */
function createEncoderState(model, input_features) {
  /** @type {any} */
  const config = model.config;
  const { text_config, audio_config } = config;
  const encoder_session = model.sessions["audio_encoder"];
  const padding_channels = audio_config.num_mel_bins + audio_config.hidden_size;

  const enc_kv_cache = new DynamicCache();
  const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? "float32";
  const ArrayType = enc_dtype === "float16" ? DataTypeMap.float16 : DataTypeMap.float32;

  // Typed arrays start zero-filled, so each cache tensor starts out as zeros.
  const zeroTensor = (dims) => {
    const numel = dims.reduce((acc, dim) => acc * dim, 1);
    return new Tensor2(enc_dtype, new ArrayType(numel), dims);
  };

  const cache_shapes = getCacheShapes(audio_config, { batch_size: 1 });
  for (const [name, dims] of Object.entries(cache_shapes)) {
    enc_kv_cache[name] = zeroTensor(dims);
  }
  const enc_padding_cache = zeroTensor([1, padding_channels, CONV1_LEFT_PAD]);

  const chunks_iter =
    input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
  if (!chunks_iter) {
    throw new Error("input_features must be iterable or async iterable");
  }

  return {
    encoder_session,
    enc_kv_cache,
    enc_padding_cache,
    enc_past_seq_len: 0,
    audio_embed_queue: [],
    audio_embed_total_tokens: 0,
    audio_queue_offset: 0,
    audio_consumed: 0,
    stream_exhausted: false,
    chunks_iter,
    text_hidden_size: text_config.hidden_size,
  };
}
28801
/**
 * Runs one feature chunk through the encoder session and rolls the encoder
 * caches forward.
 *
 * The number of new encoder positions is
 * `floor((CONV2_LEFT_PAD + T - 3) / 2) + 1`, where `T = chunk_features.dims[2]`.
 * Position ids continue from `s.enc_past_seq_len`, and the attention mask is
 * all ones over past + new positions.
 *
 * After the run, the previous padding cache and any replaced KV entry are
 * disposed when they live in a GPU buffer. Outputs named `present.*` are
 * stored back under `past_key_values.*`, and `s.enc_past_seq_len` is advanced.
 *
 * @param {Object} s Encoder state created by `createEncoderState`.
 * @param {Object} chunk_features Feature tensor for this chunk.
 * @returns {Promise<Object>} The session's `audio_embeds` output.
 */
async function encodeChunk(s, chunk_features) {
  const past_len = s.enc_past_seq_len;
  const num_frames = chunk_features.dims[2];
  const new_len = Math.floor((CONV2_LEFT_PAD + num_frames - 3) / 2) + 1;

  const position_values = BigInt64Array.from({ length: new_len }, (_, i) => BigInt(past_len + i));
  const position_ids = new Tensor2("int64", position_values, [1, new_len]);

  const total_len = past_len + new_len;
  const attention_mask = ones([1, total_len]);

  const outputs = await sessionRun(s.encoder_session, {
    input_features: chunk_features,
    attention_mask,
    position_ids,
    past_padding_cache: s.enc_padding_cache,
    ...s.enc_kv_cache,
  });
  const { audio_embeds, present_padding_cache, ...present_cache } = outputs;

  // The old padding cache is no longer needed once the session has produced a new one.
  const old_padding = s.enc_padding_cache;
  if (old_padding.location === "gpu-buffer") {
    old_padding.dispose();
  }
  s.enc_padding_cache = present_padding_cache;

  for (const [name, tensor] of Object.entries(present_cache)) {
    if (!name.startsWith("present.")) continue;
    const past_name = name.replace("present", "past_key_values");
    const stale = s.enc_kv_cache[past_name];
    if (stale?.location === "gpu-buffer") {
      stale.dispose();
    }
    s.enc_kv_cache[past_name] = tensor;
  }

  s.enc_past_seq_len = total_len;
  return audio_embeds;
}
28835
/**
 * Pulls chunks from `s.chunks_iter` and encodes them until the running total
 * of produced audio tokens reaches `needed` or the iterator is finished.
 *
 * Each encoded chunk is appended to `s.audio_embed_queue` as
 * `{ data, tokens }`, and `s.audio_embed_total_tokens` grows by `tokens`.
 * When the iterator reports `done`, `s.stream_exhausted` is set to true.
 *
 * @param {Object} s Encoder state created by `createEncoderState`.
 * @param {number} needed Target value for `s.audio_embed_total_tokens`.
 * @returns {Promise<void>}
 */
async function fillAudioBuffer(s, needed) {
  for (;;) {
    const wants_more = s.audio_embed_total_tokens < needed;
    if (!wants_more || s.stream_exhausted) {
      return;
    }
    const step = await s.chunks_iter.next();
    if (step.done) {
      s.stream_exhausted = true;
      return;
    }
    const embeds = await encodeChunk(s, step.value);
    s.audio_embed_queue.push({ data: embeds.data, tokens: embeds.dims[1] });
    s.audio_embed_total_tokens += embeds.dims[1];
  }
}
28847
/**
 * Adds queued audio embeddings element-wise, in place, onto the first
 * `current_len` token rows of `inputs_embeds.data`.
 *
 * Rows are taken from the front of `s.audio_embed_queue`, starting at
 * `s.audio_queue_offset` within the front entry. Entries that are fully used
 * are shifted off the queue and the offset resets to 0. Stops early when the
 * queue runs dry. `s.audio_consumed` grows by the number of rows actually
 * added. Does nothing when the queue is empty on entry.
 *
 * @param {Object} s Encoder state created by `createEncoderState`.
 * @param {Object} inputs_embeds Object whose `data` is a flat numeric array of
 *   token rows, `s.text_hidden_size` values per row.
 * @param {number} current_len Number of token rows to fill.
 * @returns {void}
 */
function addAudioEmbeddings(s, inputs_embeds, current_len) {
  const queue = s.audio_embed_queue;
  if (queue.length === 0) return;

  const target = inputs_embeds.data;
  const hidden = s.text_hidden_size;
  let row = 0;
  let left = current_len;

  while (left > 0 && queue.length > 0) {
    const chunk = queue[0];
    const take = Math.min(left, chunk.tokens - s.audio_queue_offset);
    const dst_base = row * hidden;
    const src_base = s.audio_queue_offset * hidden;
    const count = take * hidden;
    for (let k = 0; k < count; ++k) {
      target[dst_base + k] += chunk.data[src_base + k];
    }

    row += take;
    left -= take;
    s.audio_queue_offset += take;

    if (s.audio_queue_offset >= chunk.tokens) {
      queue.shift();
      s.audio_queue_offset = 0;
    }
  }

  s.audio_consumed += current_len - left;
}
28870
/**
 * Stopping criterion that fires once the audio stream is exhausted and every
 * queued audio embedding has been consumed.
 */
var AudioExhaustedCriteria = class extends StoppingCriteria {
  /**
   * @param {Object} enc_state Encoder state; `stream_exhausted` and
   *   `audio_embed_queue` are read on every call.
   */
  constructor(enc_state) {
    super();
    this._s = enc_state;
  }

  /**
   * @param {Array} input_ids One entry per sequence; only its length is used.
   * @returns {Array} The same verdict repeated for every entry of `input_ids`.
   */
  _call(input_ids) {
    const { stream_exhausted, audio_embed_queue } = this._s;
    const finished = stream_exhausted && audio_embed_queue.length === 0;
    return input_ids.map(() => finished);
  }
};
28880
/**
 * Base class of the Voxtral Realtime family. Its only contribution is the
 * `forward_params` list below.
 */
var VoxtralRealtimePreTrainedModel = class extends PreTrainedModel {
  forward_params = [
    "input_ids",
    "attention_mask",
    "position_ids",
    "past_key_values",
  ];
};
28883
/**
 * Voxtral Realtime model: text generation driven by a stream of audio feature
 * chunks. `generate()` installs per-call encoder state in `states`; `forward()`
 * picks it up to encode audio lazily and add it onto the token embeddings.
 */
var VoxtralRealtimeForConditionalGeneration = class extends VoxtralRealtimePreTrainedModel {
  /**
   * Single decoding step.
   *
   * When encoder state is registered for this model (i.e. inside `generate()`),
   * enough audio is encoded to cover `input_ids.dims[1]` more tokens and the
   * queued audio embeddings are added in place onto the token embeddings
   * before the decoder runs.
   *
   * @param {Object} inputs
   * @param {Object} inputs.input_ids Token ids; `dims[1]` is the step length.
   * @param {Object} [inputs.past_key_values] Decoder cache from earlier steps.
   * @returns {Promise<Object>} Outputs of the "decoder_model_merged" session.
   */
  async forward({ input_ids, past_key_values, ...kwargs }) {
    const current_len = input_ids.dims[1];
    const enc = states.get(this);
    if (enc) {
      await fillAudioBuffer(enc, enc.audio_consumed + current_len);
    }
    const { inputs_embeds } = await sessionRun(this.sessions["embed_tokens"], { input_ids });
    if (enc) {
      addAudioEmbeddings(enc, inputs_embeds, current_len);
    }
    const decoder_feeds = { inputs_embeds, ...kwargs };
    this.addPastKeyValues(decoder_feeds, past_key_values);
    const session = this.sessions["decoder_model_merged"];
    // Only pass the inputs the decoder session actually declares.
    const fixed = pick(decoder_feeds, session.inputNames);
    return await sessionRun(session, fixed);
  }

  /**
   * Generates text while consuming `input_features` chunk by chunk.
   *
   * An `AudioExhaustedCriteria` is always installed ahead of any
   * caller-supplied stopping criteria. Encoder state is registered for the
   * duration of the call and torn down afterwards, whether generation
   * succeeds or throws.
   *
   * @param {Object} options
   * @param {Iterable|AsyncIterable} options.input_features Source of audio feature chunks.
   * @param {*} [options.stopping_criteria] Extra stopping criteria to append.
   * @returns {Promise<*>} Whatever the inherited `generate` returns.
   * @throws {Error} If `input_features` is missing or not iterable.
   */
  async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
    if (!input_features) {
      throw new Error("input_features (generator/iterable) must be provided");
    }
    const enc_state = createEncoderState(this, input_features);
    states.set(this, enc_state);

    const stopping_criteria = new StoppingCriteriaList();
    stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
    if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);

    try {
      return await super.generate({ ...kwargs, stopping_criteria });
    } finally {
      try {
        enc_state.enc_kv_cache.dispose();
        // encodeChunk swaps the padding cache for a session output on every
        // chunk and only disposes the one it replaces, so the last one is
        // still alive here and must be released when it sits in a GPU buffer.
        const padding_cache = enc_state.enc_padding_cache;
        if (padding_cache?.location === "gpu-buffer") {
          padding_cache.dispose();
        }
      } finally {
        // Always unregister, even if disposing a cache throws, so a later
        // forward() outside generate() does not see stale encoder state.
        states.delete(this);
      }
    }
  }
};
28917
+
27743
28918
  // src/models/wav2vec2_bert/modeling_wav2vec2_bert.js
27744
28919
  var Wav2Vec2BertPreTrainedModel = class extends PreTrainedModel {
27745
28920
  };
@@ -28364,6 +29539,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
28364
29539
  ["olmo", "OlmoModel"],
28365
29540
  ["olmo2", "Olmo2Model"],
28366
29541
  ["olmo3", "Olmo3Model"],
29542
+ ["olmo_hybrid", "OlmoHybridModel"],
28367
29543
  ["mobilellm", "MobileLLMModel"],
28368
29544
  ["granite", "GraniteModel"],
28369
29545
  ["granitemoehybrid", "GraniteMoeHybridModel"],
@@ -28377,7 +29553,10 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
28377
29553
  ["glm", "GlmModel"],
28378
29554
  ["openelm", "OpenELMModel"],
28379
29555
  ["qwen2", "Qwen2Model"],
29556
+ ["qwen2_moe", "Qwen2MoeModel"],
28380
29557
  ["qwen3", "Qwen3Model"],
29558
+ ["qwen3_moe", "Qwen3MoeModel"],
29559
+ ["qwen3_next", "Qwen3NextModel"],
28381
29560
  ["phi", "PhiModel"],
28382
29561
  ["phi3", "Phi3Model"],
28383
29562
  ["mpt", "MptModel"],
@@ -28385,7 +29564,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
28385
29564
  ["mistral", "MistralModel"],
28386
29565
  ["ministral", "MinistralModel"],
28387
29566
  ["ministral3", "Ministral3Model"],
28388
- ["ernie4_5", "Ernie4_5_Model"],
29567
+ ["ernie4_5", "Ernie4_5ForCausalLM"],
28389
29568
  ["starcoder2", "Starcoder2Model"],
28390
29569
  ["falcon", "FalconModel"],
28391
29570
  ["falcon_h1", "FalconH1Model"],
@@ -28479,6 +29658,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
28479
29658
  ["olmo", "OlmoForCausalLM"],
28480
29659
  ["olmo2", "Olmo2ForCausalLM"],
28481
29660
  ["olmo3", "Olmo3ForCausalLM"],
29661
+ ["olmo_hybrid", "OlmoHybridForCausalLM"],
28482
29662
  ["mobilellm", "MobileLLMForCausalLM"],
28483
29663
  ["granite", "GraniteForCausalLM"],
28484
29664
  ["granitemoehybrid", "GraniteMoeHybridForCausalLM"],
@@ -28488,11 +29668,22 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
28488
29668
  ["gemma2", "Gemma2ForCausalLM"],
28489
29669
  ["vaultgemma", "VaultGemmaForCausalLM"],
28490
29670
  ["gemma3_text", "Gemma3ForCausalLM"],
29671
+ ["gemma3", "Gemma3ForCausalLM"],
28491
29672
  ["helium", "HeliumForCausalLM"],
28492
29673
  ["glm", "GlmForCausalLM"],
28493
29674
  ["openelm", "OpenELMForCausalLM"],
28494
29675
  ["qwen2", "Qwen2ForCausalLM"],
29676
+ ["qwen2_moe", "Qwen2MoeForCausalLM"],
28495
29677
  ["qwen3", "Qwen3ForCausalLM"],
29678
+ ["qwen3_moe", "Qwen3MoeForCausalLM"],
29679
+ ["qwen3_next", "Qwen3NextForCausalLM"],
29680
+ ["qwen2_vl", "Qwen2VLForCausalLM"],
29681
+ ["qwen2_5_vl", "Qwen2_5_VLForCausalLM"],
29682
+ ["qwen3_vl", "Qwen3VLForCausalLM"],
29683
+ ["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
29684
+ ["qwen3_5", "Qwen3_5ForCausalLM"],
29685
+ ["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
29686
+ ["gemma3n", "Gemma3nForCausalLM"],
28496
29687
  ["phi", "PhiForCausalLM"],
28497
29688
  ["phi3", "Phi3ForCausalLM"],
28498
29689
  ["mpt", "MptForCausalLM"],
@@ -28501,7 +29692,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
28501
29692
  ["mistral", "MistralForCausalLM"],
28502
29693
  ["ministral", "MinistralForCausalLM"],
28503
29694
  ["ministral3", "Ministral3ForCausalLM"],
28504
- ["ernie4_5", "Ernie4_5_ForCausalLM"],
29695
+ ["ernie4_5", "Ernie4_5ForCausalLM"],
28505
29696
  ["starcoder2", "Starcoder2ForCausalLM"],
28506
29697
  ["falcon", "FalconForCausalLM"],
28507
29698
  ["falcon_h1", "FalconH1ForCausalLM"],
@@ -28565,8 +29756,10 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
28565
29756
  ["qwen2_vl", "Qwen2VLForConditionalGeneration"],
28566
29757
  ["qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"],
28567
29758
  ["qwen3_vl", "Qwen3VLForConditionalGeneration"],
29759
+ ["qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"],
28568
29760
  ["qwen3_5", "Qwen3_5ForConditionalGeneration"],
28569
29761
  ["qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"],
29762
+ ["lfm2_vl", "Lfm2VlForConditionalGeneration"],
28570
29763
  ["idefics3", "Idefics3ForConditionalGeneration"],
28571
29764
  ["smolvlm", "SmolVLMForConditionalGeneration"],
28572
29765
  ["paligemma", "PaliGemmaForConditionalGeneration"],
@@ -28575,8 +29768,10 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
28575
29768
  ["mistral3", "Mistral3ForConditionalGeneration"]
28576
29769
  ]);
28577
29770
  var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
29771
+ ["granite_speech", "GraniteSpeechForConditionalGeneration"],
28578
29772
  ["ultravox", "UltravoxModel"],
28579
- ["voxtral", "VoxtralForConditionalGeneration"]
29773
+ ["voxtral", "VoxtralForConditionalGeneration"],
29774
+ ["voxtral_realtime", "VoxtralRealtimeForConditionalGeneration"]
28580
29775
  ]);
28581
29776
  var MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = /* @__PURE__ */ new Map([
28582
29777
  ["vision-encoder-decoder", "VisionEncoderDecoderModel"]
@@ -28759,24 +29954,37 @@ var CUSTOM_MAPPING = [
28759
29954
  MODEL_TYPES.ImageAudioTextToText
28760
29955
  ],
28761
29956
  ["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
28762
- ["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox]
29957
+ ["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
29958
+ ["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29959
+ ["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29960
+ ["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29961
+ ["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29962
+ ["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29963
+ ["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29964
+ ["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
29965
+ [
29966
+ "VoxtralRealtimeForConditionalGeneration",
29967
+ VoxtralRealtimeForConditionalGeneration,
29968
+ MODEL_TYPES.VoxtralRealtime
29969
+ ]
28763
29970
  ];
28764
29971
  for (const [name, model, type] of CUSTOM_MAPPING) {
28765
29972
  MODEL_TYPE_MAPPING.set(name, type);
28766
29973
  MODEL_CLASS_TO_NAME_MAPPING.set(model, name);
28767
29974
  MODEL_NAME_TO_CLASS_MAPPING.set(name, model);
28768
29975
  }
28769
- var CUSTOM_ARCHITECTURES = /* @__PURE__ */ new Map([
29976
+ var CUSTOM_ARCHITECTURES_MAPPING = /* @__PURE__ */ new Map([
28770
29977
  ["modnet", MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
28771
29978
  ["birefnet", MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
28772
29979
  ["isnet", MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES],
28773
29980
  ["ben", MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES]
28774
29981
  ]);
28775
- for (const [name, mapping] of CUSTOM_ARCHITECTURES.entries()) {
29982
+ for (const [name, mapping] of CUSTOM_ARCHITECTURES_MAPPING.entries()) {
28776
29983
  mapping.set(name, "PreTrainedModel");
28777
29984
  MODEL_TYPE_MAPPING.set(name, MODEL_TYPES.EncoderOnly);
28778
29985
  MODEL_NAME_TO_CLASS_MAPPING.set(name, PreTrainedModel);
28779
29986
  }
29987
+ var CUSTOM_ARCHITECTURES = new Set(CUSTOM_ARCHITECTURES_MAPPING.keys());
28780
29988
  MODEL_TYPE_MAPPING.set("PreTrainedModel", MODEL_TYPES.EncoderOnly);
28781
29989
  MODEL_CLASS_TO_NAME_MAPPING.set(PreTrainedModel, "PreTrainedModel");
28782
29990
  var MODEL_MAPPINGS = {
@@ -28825,6 +30033,18 @@ var PretrainedMixin = class {
28825
30033
  * the model type is not found in the mapping.
28826
30034
  */
28827
30035
  static BASE_IF_FAIL = false;
30036
+ /**
30037
+ * Check whether this AutoModel class supports a given model type.
30038
+ * @param {string} model_type The model type from config (e.g., 'bert', 'whisper').
30039
+ * @returns {boolean} Whether this class can handle the given model type.
30040
+ */
30041
+ static supports(model_type) {
30042
+ if (!this.MODEL_CLASS_MAPPINGS) return false;
30043
+ for (const mapping of this.MODEL_CLASS_MAPPINGS) {
30044
+ if (mapping.has(model_type)) return true;
30045
+ }
30046
+ return this.BASE_IF_FAIL;
30047
+ }
28828
30048
  /** @type {typeof PreTrainedModel.from_pretrained} */
28829
30049
  static async from_pretrained(pretrained_model_name_or_path, {
28830
30050
  progress_callback = null,
@@ -28856,7 +30076,7 @@ var PretrainedMixin = class {
28856
30076
  if (!this.MODEL_CLASS_MAPPINGS) {
28857
30077
  throw new Error("`MODEL_CLASS_MAPPINGS` not implemented for this type of `AutoClass`: " + this.name);
28858
30078
  }
28859
- const model_type = options.config.model_type;
30079
+ const { model_type } = options.config;
28860
30080
  for (const MODEL_CLASS_MAPPING of this.MODEL_CLASS_MAPPINGS) {
28861
30081
  let modelInfo = MODEL_CLASS_MAPPING.get(model_type);
28862
30082
  if (!modelInfo) {
@@ -30208,40 +31428,30 @@ Pipeline {
30208
31428
  // src/pipelines/index.js
30209
31429
  var SUPPORTED_TASKS = Object.freeze({
30210
31430
  "text-classification": {
30211
- tokenizer: AutoTokenizer,
30212
31431
  pipeline: TextClassificationPipeline,
30213
31432
  model: AutoModelForSequenceClassification,
30214
31433
  default: {
30215
- // TODO: replace with original
30216
- // "model": "distilbert-base-uncased-finetuned-sst-2-english",
30217
31434
  model: "Xenova/distilbert-base-uncased-finetuned-sst-2-english"
30218
31435
  },
30219
31436
  type: "text"
30220
31437
  },
30221
31438
  "token-classification": {
30222
- tokenizer: AutoTokenizer,
30223
31439
  pipeline: TokenClassificationPipeline,
30224
31440
  model: AutoModelForTokenClassification,
30225
31441
  default: {
30226
- // TODO: replace with original
30227
- // "model": "Davlan/bert-base-multilingual-cased-ner-hrl",
30228
31442
  model: "Xenova/bert-base-multilingual-cased-ner-hrl"
30229
31443
  },
30230
31444
  type: "text"
30231
31445
  },
30232
31446
  "question-answering": {
30233
- tokenizer: AutoTokenizer,
30234
31447
  pipeline: QuestionAnsweringPipeline,
30235
31448
  model: AutoModelForQuestionAnswering,
30236
31449
  default: {
30237
- // TODO: replace with original
30238
- // "model": "distilbert-base-cased-distilled-squad",
30239
31450
  model: "Xenova/distilbert-base-cased-distilled-squad"
30240
31451
  },
30241
31452
  type: "text"
30242
31453
  },
30243
31454
  "fill-mask": {
30244
- tokenizer: AutoTokenizer,
30245
31455
  pipeline: FillMaskPipeline,
30246
31456
  model: AutoModelForMaskedLM,
30247
31457
  default: {
@@ -30251,40 +31461,30 @@ var SUPPORTED_TASKS = Object.freeze({
30251
31461
  type: "text"
30252
31462
  },
30253
31463
  summarization: {
30254
- tokenizer: AutoTokenizer,
30255
31464
  pipeline: SummarizationPipeline,
30256
31465
  model: AutoModelForSeq2SeqLM,
30257
31466
  default: {
30258
- // TODO: replace with original
30259
- // "model": "sshleifer/distilbart-cnn-6-6",
30260
31467
  model: "Xenova/distilbart-cnn-6-6"
30261
31468
  },
30262
31469
  type: "text"
30263
31470
  },
30264
31471
  translation: {
30265
- tokenizer: AutoTokenizer,
30266
31472
  pipeline: TranslationPipeline,
30267
31473
  model: AutoModelForSeq2SeqLM,
30268
31474
  default: {
30269
- // TODO: replace with original
30270
- // "model": "t5-small",
30271
31475
  model: "Xenova/t5-small"
30272
31476
  },
30273
31477
  type: "text"
30274
31478
  },
30275
31479
  "text2text-generation": {
30276
- tokenizer: AutoTokenizer,
30277
31480
  pipeline: Text2TextGenerationPipeline,
30278
31481
  model: AutoModelForSeq2SeqLM,
30279
31482
  default: {
30280
- // TODO: replace with original
30281
- // "model": "google/flan-t5-small",
30282
31483
  model: "Xenova/flan-t5-small"
30283
31484
  },
30284
31485
  type: "text"
30285
31486
  },
30286
31487
  "text-generation": {
30287
- tokenizer: AutoTokenizer,
30288
31488
  pipeline: TextGenerationPipeline,
30289
31489
  model: AutoModelForCausalLM,
30290
31490
  default: {
@@ -30294,12 +31494,9 @@ var SUPPORTED_TASKS = Object.freeze({
30294
31494
  type: "text"
30295
31495
  },
30296
31496
  "zero-shot-classification": {
30297
- tokenizer: AutoTokenizer,
30298
31497
  pipeline: ZeroShotClassificationPipeline,
30299
31498
  model: AutoModelForSequenceClassification,
30300
31499
  default: {
30301
- // TODO: replace with original
30302
- // "model": "typeform/distilbert-base-uncased-mnli",
30303
31500
  model: "Xenova/distilbert-base-uncased-mnli"
30304
31501
  },
30305
31502
  type: "text"
@@ -30307,47 +31504,30 @@ var SUPPORTED_TASKS = Object.freeze({
30307
31504
  "audio-classification": {
30308
31505
  pipeline: AudioClassificationPipeline,
30309
31506
  model: AutoModelForAudioClassification,
30310
- processor: AutoProcessor,
30311
31507
  default: {
30312
- // TODO: replace with original
30313
- // "model": "superb/wav2vec2-base-superb-ks",
30314
31508
  model: "Xenova/wav2vec2-base-superb-ks"
30315
31509
  },
30316
31510
  type: "audio"
30317
31511
  },
30318
31512
  "zero-shot-audio-classification": {
30319
- tokenizer: AutoTokenizer,
30320
31513
  pipeline: ZeroShotAudioClassificationPipeline,
30321
31514
  model: AutoModel,
30322
- processor: AutoProcessor,
30323
31515
  default: {
30324
- // TODO: replace with original
30325
- // "model": "laion/clap-htsat-fused",
30326
31516
  model: "Xenova/clap-htsat-unfused"
30327
31517
  },
30328
31518
  type: "multimodal"
30329
31519
  },
30330
31520
  "automatic-speech-recognition": {
30331
- tokenizer: AutoTokenizer,
30332
31521
  pipeline: AutomaticSpeechRecognitionPipeline,
30333
31522
  model: [AutoModelForSpeechSeq2Seq, AutoModelForCTC],
30334
- processor: AutoProcessor,
30335
31523
  default: {
30336
- // TODO: replace with original
30337
- // "model": "openai/whisper-tiny.en",
30338
31524
  model: "Xenova/whisper-tiny.en"
30339
31525
  },
30340
31526
  type: "multimodal"
30341
31527
  },
30342
31528
  "text-to-audio": {
30343
- tokenizer: AutoTokenizer,
30344
31529
  pipeline: TextToAudioPipeline,
30345
31530
  model: [AutoModelForTextToWaveform, AutoModelForTextToSpectrogram],
30346
- processor: [
30347
- AutoProcessor,
30348
- /* Some don't use a processor */
30349
- null
30350
- ],
30351
31531
  default: {
30352
31532
  model: "onnx-community/Supertonic-TTS-ONNX",
30353
31533
  dtype: "fp32"
@@ -30355,124 +31535,86 @@ var SUPPORTED_TASKS = Object.freeze({
30355
31535
  type: "text"
30356
31536
  },
30357
31537
  "image-to-text": {
30358
- tokenizer: AutoTokenizer,
30359
31538
  pipeline: ImageToTextPipeline,
30360
31539
  model: AutoModelForVision2Seq,
30361
- processor: AutoProcessor,
30362
31540
  default: {
30363
- // TODO: replace with original
30364
- // "model": "nlpconnect/vit-gpt2-image-captioning",
30365
31541
  model: "Xenova/vit-gpt2-image-captioning"
30366
31542
  },
30367
31543
  type: "multimodal"
30368
31544
  },
30369
31545
  "image-classification": {
30370
- // no tokenizer
30371
31546
  pipeline: ImageClassificationPipeline,
30372
31547
  model: AutoModelForImageClassification,
30373
- processor: AutoProcessor,
30374
31548
  default: {
30375
- // TODO: replace with original
30376
- // "model": "google/vit-base-patch16-224",
30377
31549
  model: "Xenova/vit-base-patch16-224"
30378
31550
  },
30379
31551
  type: "multimodal"
30380
31552
  },
30381
31553
  "image-segmentation": {
30382
- // no tokenizer
30383
31554
  pipeline: ImageSegmentationPipeline,
30384
31555
  model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
30385
- processor: AutoProcessor,
30386
31556
  default: {
30387
- // TODO: replace with original
30388
- // "model": "facebook/detr-resnet-50-panoptic",
30389
31557
  model: "Xenova/detr-resnet-50-panoptic"
30390
31558
  },
30391
31559
  type: "multimodal"
30392
31560
  },
30393
31561
  "background-removal": {
30394
- // no tokenizer
30395
31562
  pipeline: BackgroundRemovalPipeline,
30396
31563
  model: [AutoModelForImageSegmentation, AutoModelForSemanticSegmentation, AutoModelForUniversalSegmentation],
30397
- processor: AutoProcessor,
30398
31564
  default: {
30399
31565
  model: "Xenova/modnet"
30400
31566
  },
30401
31567
  type: "image"
30402
31568
  },
30403
31569
  "zero-shot-image-classification": {
30404
- tokenizer: AutoTokenizer,
30405
31570
  pipeline: ZeroShotImageClassificationPipeline,
30406
31571
  model: AutoModel,
30407
- processor: AutoProcessor,
30408
31572
  default: {
30409
- // TODO: replace with original
30410
- // "model": "openai/clip-vit-base-patch32",
30411
31573
  model: "Xenova/clip-vit-base-patch32"
30412
31574
  },
30413
31575
  type: "multimodal"
30414
31576
  },
30415
31577
  "object-detection": {
30416
- // no tokenizer
30417
31578
  pipeline: ObjectDetectionPipeline,
30418
31579
  model: AutoModelForObjectDetection,
30419
- processor: AutoProcessor,
30420
31580
  default: {
30421
- // TODO: replace with original
30422
- // "model": "facebook/detr-resnet-50",
30423
31581
  model: "Xenova/detr-resnet-50"
30424
31582
  },
30425
31583
  type: "multimodal"
30426
31584
  },
30427
31585
  "zero-shot-object-detection": {
30428
- tokenizer: AutoTokenizer,
30429
31586
  pipeline: ZeroShotObjectDetectionPipeline,
30430
31587
  model: AutoModelForZeroShotObjectDetection,
30431
- processor: AutoProcessor,
30432
31588
  default: {
30433
- // TODO: replace with original
30434
- // "model": "google/owlvit-base-patch32",
30435
31589
  model: "Xenova/owlvit-base-patch32"
30436
31590
  },
30437
31591
  type: "multimodal"
30438
31592
  },
30439
31593
  "document-question-answering": {
30440
- tokenizer: AutoTokenizer,
30441
31594
  pipeline: DocumentQuestionAnsweringPipeline,
30442
31595
  model: AutoModelForDocumentQuestionAnswering,
30443
- processor: AutoProcessor,
30444
31596
  default: {
30445
- // TODO: replace with original
30446
- // "model": "naver-clova-ix/donut-base-finetuned-docvqa",
30447
31597
  model: "Xenova/donut-base-finetuned-docvqa"
30448
31598
  },
30449
31599
  type: "multimodal"
30450
31600
  },
30451
31601
  "image-to-image": {
30452
- // no tokenizer
30453
31602
  pipeline: ImageToImagePipeline,
30454
31603
  model: AutoModelForImageToImage,
30455
- processor: AutoProcessor,
30456
31604
  default: {
30457
- // TODO: replace with original
30458
- // "model": "caidas/swin2SR-classical-sr-x2-64",
30459
31605
  model: "Xenova/swin2SR-classical-sr-x2-64"
30460
31606
  },
30461
31607
  type: "image"
30462
31608
  },
30463
31609
  "depth-estimation": {
30464
- // no tokenizer
30465
31610
  pipeline: DepthEstimationPipeline,
30466
31611
  model: AutoModelForDepthEstimation,
30467
- processor: AutoProcessor,
30468
31612
  default: {
30469
31613
  model: "onnx-community/depth-anything-v2-small"
30470
31614
  },
30471
31615
  type: "image"
30472
31616
  },
30473
- // This task serves as a useful interface for dealing with sentence-transformers (https://huggingface.co/sentence-transformers).
30474
31617
  "feature-extraction": {
30475
- tokenizer: AutoTokenizer,
30476
31618
  pipeline: FeatureExtractionPipeline,
30477
31619
  model: AutoModel,
30478
31620
  default: {
@@ -30482,7 +31624,6 @@ var SUPPORTED_TASKS = Object.freeze({
30482
31624
  type: "text"
30483
31625
  },
30484
31626
  "image-feature-extraction": {
30485
- processor: AutoProcessor,
30486
31627
  pipeline: ImageFeatureExtractionPipeline,
30487
31628
  model: [AutoModelForImageFeatureExtraction, AutoModel],
30488
31629
  default: {
@@ -30503,8 +31644,18 @@ var TASK_ALIASES = Object.freeze({
30503
31644
  });
30504
31645
 
30505
31646
  // src/utils/model_registry/get_model_files.js
31647
+ function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
31648
+ if (config !== null) {
31649
+ return AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision });
31650
+ }
31651
+ const key = JSON.stringify([modelId, cache_dir, local_files_only, revision]);
31652
+ return memoizePromise(
31653
+ key,
31654
+ () => AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision })
31655
+ );
31656
+ }
30506
31657
  async function get_model_files(modelId, { config = null, dtype: overrideDtype = null, device: overrideDevice = null, model_file_name = null } = {}) {
30507
- config = await AutoConfig.from_pretrained(modelId, { config });
31658
+ config = await get_config(modelId, { config });
30508
31659
  const files = [
30509
31660
  // Add config.json (always loaded)
30510
31661
  "config.json"
@@ -30534,6 +31685,15 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
30534
31685
  modelType = mappedType;
30535
31686
  foundInMapping = true;
30536
31687
  }
31688
+ if (!foundInMapping) {
31689
+ for (const mapping of Object.values(MODEL_MAPPING_NAMES)) {
31690
+ if (mapping.has(config.model_type)) {
31691
+ modelType = MODEL_TYPE_MAPPING.get(mapping.get(config.model_type));
31692
+ foundInMapping = true;
31693
+ break;
31694
+ }
31695
+ }
31696
+ }
30537
31697
  }
30538
31698
  if (!foundInMapping) {
30539
31699
  const archList = architectures.length > 0 ? architectures.join(", ") : "(none)";
@@ -30556,74 +31716,14 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
30556
31716
  files.push(dataFilePath);
30557
31717
  }
30558
31718
  };
30559
- const singleModelName = model_file_name ?? "model";
30560
- if (modelType === MODEL_TYPES.DecoderOnly) {
30561
- add_model_file("model", singleModelName);
30562
- files.push("generation_config.json");
30563
- } else if (modelType === MODEL_TYPES.DecoderOnlyWithoutHead) {
30564
- add_model_file("model", singleModelName);
30565
- } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
30566
- add_model_file("model", "encoder_model");
30567
- add_model_file("decoder_model_merged");
30568
- files.push("generation_config.json");
30569
- } else if (modelType === MODEL_TYPES.MaskGeneration) {
30570
- add_model_file("model", "vision_encoder");
30571
- add_model_file("prompt_encoder_mask_decoder");
30572
- } else if (modelType === MODEL_TYPES.EncoderDecoder) {
30573
- add_model_file("model", "encoder_model");
30574
- add_model_file("decoder_model_merged");
30575
- } else if (modelType === MODEL_TYPES.ImageTextToText) {
30576
- add_model_file("embed_tokens");
30577
- add_model_file("vision_encoder");
30578
- add_model_file("decoder_model_merged");
30579
- if (config.is_encoder_decoder) {
30580
- add_model_file("model", "encoder_model");
30581
- }
30582
- files.push("generation_config.json");
30583
- } else if (modelType === MODEL_TYPES.AudioTextToText) {
30584
- add_model_file("embed_tokens");
30585
- add_model_file("audio_encoder");
30586
- add_model_file("decoder_model_merged");
30587
- files.push("generation_config.json");
30588
- } else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
30589
- add_model_file("embed_tokens");
30590
- add_model_file("audio_encoder");
30591
- add_model_file("vision_encoder");
30592
- add_model_file("decoder_model_merged");
30593
- files.push("generation_config.json");
30594
- } else if (modelType === MODEL_TYPES.Musicgen) {
30595
- add_model_file("model", "text_encoder");
30596
- add_model_file("decoder_model_merged");
30597
- add_model_file("encodec_decode");
30598
- files.push("generation_config.json");
30599
- } else if (modelType === MODEL_TYPES.MultiModality) {
30600
- add_model_file("prepare_inputs_embeds");
30601
- add_model_file("model", "language_model");
30602
- add_model_file("lm_head");
30603
- add_model_file("gen_head");
30604
- add_model_file("gen_img_embeds");
30605
- add_model_file("image_decode");
30606
- files.push("generation_config.json");
30607
- } else if (modelType === MODEL_TYPES.Phi3V) {
30608
- add_model_file("prepare_inputs_embeds");
30609
- add_model_file("model");
30610
- add_model_file("vision_encoder");
30611
- files.push("generation_config.json");
30612
- } else if (modelType === MODEL_TYPES.Chatterbox) {
30613
- add_model_file("embed_tokens");
30614
- add_model_file("speech_encoder");
30615
- add_model_file("model", "language_model");
30616
- add_model_file("conditional_decoder");
30617
- files.push("generation_config.json");
30618
- } else if (modelType === MODEL_TYPES.AutoEncoder) {
30619
- add_model_file("encoder_model");
30620
- add_model_file("decoder_model");
30621
- } else if (modelType === MODEL_TYPES.Supertonic) {
30622
- add_model_file("text_encoder");
30623
- add_model_file("latent_denoiser");
30624
- add_model_file("voice_decoder");
30625
- } else {
30626
- add_model_file("model", singleModelName);
31719
+ const { sessions, optional_configs } = getSessionsConfig(modelType, config, { model_file_name });
31720
+ for (const [sessionKey, baseName] of Object.entries(sessions)) {
31721
+ add_model_file(sessionKey, baseName);
31722
+ }
31723
+ if (optional_configs) {
31724
+ for (const configFile of Object.values(optional_configs)) {
31725
+ files.push(configFile);
31726
+ }
30627
31727
  }
30628
31728
  return files;
30629
31729
  }
@@ -30659,28 +31759,21 @@ async function get_files(modelId, {
30659
31759
  }
30660
31760
 
30661
31761
  // src/utils/model_registry/get_pipeline_files.js
30662
- function get_task_components(task) {
30663
- const taskConfig = SUPPORTED_TASKS[task];
30664
- if (!taskConfig) {
30665
- return null;
30666
- }
30667
- return {
30668
- tokenizer: !!taskConfig.tokenizer,
30669
- processor: !!taskConfig.processor
30670
- };
30671
- }
30672
31762
  async function get_pipeline_files(task, modelId, options = {}) {
30673
31763
  task = TASK_ALIASES[task] ?? task;
30674
- const components = get_task_components(task);
30675
- if (!components) {
31764
+ const taskConfig = SUPPORTED_TASKS[task];
31765
+ if (!taskConfig) {
30676
31766
  throw new Error(
30677
31767
  `Unsupported pipeline task: ${task}. Must be one of [${Object.keys(SUPPORTED_TASKS).join(", ")}]`
30678
31768
  );
30679
31769
  }
31770
+ const { type } = taskConfig;
31771
+ const include_tokenizer = type !== "audio" && type !== "image";
31772
+ const include_processor = type !== "text";
30680
31773
  return get_files(modelId, {
30681
31774
  ...options,
30682
- include_tokenizer: components.tokenizer,
30683
- include_processor: components.processor
31775
+ include_tokenizer,
31776
+ include_processor
30684
31777
  });
30685
31778
  }
30686
31779
 
@@ -30710,12 +31803,12 @@ async function pipeline(task, model = null, {
30710
31803
  dtype = pipelineInfo.default.dtype;
30711
31804
  }
30712
31805
  }
31806
+ const expected_files = await get_pipeline_files(task, model, {
31807
+ device,
31808
+ dtype
31809
+ });
30713
31810
  let files_loading = {};
30714
31811
  if (progress_callback) {
30715
- const expected_files = await get_pipeline_files(task, model, {
30716
- device,
30717
- dtype
30718
- });
30719
31812
  const metadata = await Promise.all(expected_files.map(async (file) => get_file_metadata(model, file)));
30720
31813
  metadata.forEach((m, i) => {
30721
31814
  if (m.exists) {
@@ -30761,13 +31854,31 @@ async function pipeline(task, model = null, {
30761
31854
  model_file_name,
30762
31855
  session_options
30763
31856
  };
30764
- const classes = /* @__PURE__ */ new Map([
30765
- ["tokenizer", pipelineInfo.tokenizer],
30766
- ["model", pipelineInfo.model],
30767
- ["processor", pipelineInfo.processor]
31857
+ const hasTokenizer = expected_files.includes("tokenizer.json");
31858
+ const hasProcessor = expected_files.includes("preprocessor_config.json");
31859
+ const modelClasses = pipelineInfo.model;
31860
+ let modelPromise;
31861
+ if (Array.isArray(modelClasses)) {
31862
+ const resolvedConfig = config ?? await AutoConfig.from_pretrained(model, pretrainedOptions);
31863
+ const { model_type } = resolvedConfig;
31864
+ const matchedClass = modelClasses.find((cls) => cls.supports(model_type));
31865
+ if (!matchedClass) {
31866
+ throw Error(
31867
+ `Unsupported model type "${model_type}" for task "${task}". None of the candidate model classes support this type.`
31868
+ );
31869
+ }
31870
+ modelPromise = matchedClass.from_pretrained(model, { ...pretrainedOptions, config: resolvedConfig });
31871
+ } else {
31872
+ modelPromise = modelClasses.from_pretrained(model, pretrainedOptions);
31873
+ }
31874
+ const [tokenizer, processor, model_loaded] = await Promise.all([
31875
+ hasTokenizer ? AutoTokenizer.from_pretrained(model, pretrainedOptions) : null,
31876
+ hasProcessor ? AutoProcessor.from_pretrained(model, pretrainedOptions) : null,
31877
+ modelPromise
30768
31878
  ]);
30769
- const results = await loadItems(classes, model, pretrainedOptions);
30770
- results.task = task;
31879
+ const results = { task, model: model_loaded };
31880
+ if (tokenizer) results.tokenizer = tokenizer;
31881
+ if (processor) results.processor = processor;
30771
31882
  dispatchCallback(progress_callback, {
30772
31883
  status: "ready",
30773
31884
  task,
@@ -30776,48 +31887,6 @@ async function pipeline(task, model = null, {
30776
31887
  const pipelineClass = pipelineInfo.pipeline;
30777
31888
  return new pipelineClass(results);
30778
31889
  }
30779
- async function loadItems(mapping, model, pretrainedOptions) {
30780
- const result = /* @__PURE__ */ Object.create(null);
30781
- const promises = [];
30782
- for (const [name, cls] of mapping.entries()) {
30783
- if (!cls) continue;
30784
- let promise;
30785
- if (Array.isArray(cls)) {
30786
- promise = new Promise(async (resolve, reject) => {
30787
- let e;
30788
- for (const c of cls) {
30789
- if (c === null) {
30790
- resolve(null);
30791
- return;
30792
- }
30793
- try {
30794
- resolve(await c.from_pretrained(model, pretrainedOptions));
30795
- return;
30796
- } catch (err) {
30797
- if (err.message?.includes("Unsupported model type")) {
30798
- e = err;
30799
- } else if (err.message?.includes("Could not locate file")) {
30800
- e = err;
30801
- } else {
30802
- reject(err);
30803
- return;
30804
- }
30805
- }
30806
- }
30807
- reject(e);
30808
- });
30809
- } else {
30810
- promise = cls.from_pretrained(model, pretrainedOptions);
30811
- }
30812
- result[name] = promise;
30813
- promises.push(promise);
30814
- }
30815
- await Promise.all(promises);
30816
- for (const [name, promise] of Object.entries(result)) {
30817
- result[name] = await promise;
30818
- }
30819
- return result;
30820
- }
30821
31890
 
30822
31891
  // src/generation/streamers.js
30823
31892
  var is_chinese_char2 = (cp) => cp >= 19968 && cp <= 40959 || cp >= 13312 && cp <= 19903 || cp >= 131072 && cp <= 173791 || cp >= 173824 && cp <= 177983 || cp >= 177984 && cp <= 178207 || cp >= 178208 && cp <= 183983 || cp >= 63744 && cp <= 64255 || cp >= 194560 && cp <= 195103;
@@ -31105,21 +32174,38 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
31105
32174
 
31106
32175
  // src/utils/model_registry/is_cached.js
31107
32176
  async function check_files_cache(modelId, files, options = {}) {
31108
- const cache = await getCache(options?.cache_dir);
31109
- if (!cache) {
32177
+ const cache2 = await getCache(options?.cache_dir);
32178
+ if (!cache2) {
31110
32179
  const fileStatuses2 = files.map((filename) => ({ file: filename, cached: false }));
31111
32180
  return { allCached: false, files: fileStatuses2 };
31112
32181
  }
31113
32182
  const fileStatuses = await Promise.all(
31114
32183
  files.map(async (filename) => {
31115
- const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache);
31116
- const cached = await checkCachedResource(cache, localPath, proposedCacheKey);
32184
+ const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
32185
+ const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
31117
32186
  return { file: filename, cached: !!cached };
31118
32187
  })
31119
32188
  );
31120
32189
  return { allCached: fileStatuses.every((f) => f.cached), files: fileStatuses };
31121
32190
  }
32191
+ async function is_file_cached(modelId, filename, options = {}) {
32192
+ const cache2 = await getCache(options?.cache_dir);
32193
+ if (!cache2) return false;
32194
+ const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
32195
+ return !!await checkCachedResource(cache2, localPath, proposedCacheKey);
32196
+ }
31122
32197
  async function is_cached(modelId, options = {}) {
32198
+ if (!modelId) {
32199
+ throw new Error("modelId is required");
32200
+ }
32201
+ if (!await is_file_cached(modelId, "config.json", options)) {
32202
+ return false;
32203
+ }
32204
+ const files = await get_files(modelId, options);
32205
+ const result = await check_files_cache(modelId, files, options);
32206
+ return result.allCached;
32207
+ }
32208
+ async function is_cached_files(modelId, options = {}) {
31123
32209
  if (!modelId) {
31124
32210
  throw new Error("modelId is required");
31125
32211
  }
@@ -31127,6 +32213,20 @@ async function is_cached(modelId, options = {}) {
31127
32213
  return await check_files_cache(modelId, files, options);
31128
32214
  }
31129
32215
  async function is_pipeline_cached(task, modelId, options = {}) {
32216
+ if (!task) {
32217
+ throw new Error("task is required");
32218
+ }
32219
+ if (!modelId) {
32220
+ throw new Error("modelId is required");
32221
+ }
32222
+ if (!await is_file_cached(modelId, "config.json", options)) {
32223
+ return false;
32224
+ }
32225
+ const files = await get_pipeline_files(task, modelId, options);
32226
+ const result = await check_files_cache(modelId, files, options);
32227
+ return result.allCached;
32228
+ }
32229
+ async function is_pipeline_cached_files(task, modelId, options = {}) {
31130
32230
  if (!task) {
31131
32231
  throw new Error("task is required");
31132
32232
  }
@@ -31139,26 +32239,26 @@ async function is_pipeline_cached(task, modelId, options = {}) {
31139
32239
 
31140
32240
  // src/utils/model_registry/clear_cache.js
31141
32241
  async function clear_files_from_cache(modelId, files, options = {}) {
31142
- const cache = await getCache(options?.cache_dir);
31143
- if (!cache) {
32242
+ const cache2 = await getCache(options?.cache_dir);
32243
+ if (!cache2) {
31144
32244
  return {
31145
32245
  filesDeleted: 0,
31146
32246
  filesCached: 0,
31147
32247
  files: files.map((filename) => ({ file: filename, deleted: false, wasCached: false }))
31148
32248
  };
31149
32249
  }
31150
- if (!cache.delete) {
32250
+ if (!cache2.delete) {
31151
32251
  throw new Error("Cache does not support delete operation");
31152
32252
  }
31153
32253
  const results = await Promise.all(
31154
32254
  files.map(async (filename) => {
31155
- const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache);
31156
- const cached = await checkCachedResource(cache, localPath, proposedCacheKey);
32255
+ const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
32256
+ const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
31157
32257
  const wasCached = !!cached;
31158
32258
  let deleted = false;
31159
32259
  if (wasCached) {
31160
- const deletedWithProposed = await cache.delete(proposedCacheKey);
31161
- const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache.delete(localPath) : false;
32260
+ const deletedWithProposed = await cache2.delete(proposedCacheKey);
32261
+ const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache2.delete(localPath) : false;
31162
32262
  deleted = deletedWithProposed || deletedWithLocal;
31163
32263
  }
31164
32264
  return { file: filename, deleted, wasCached };
@@ -31275,26 +32375,30 @@ var ModelRegistry = class {
31275
32375
  return get_processor_files(modelId);
31276
32376
  }
31277
32377
  /**
31278
- * Check if a model and all its required files are cached.
32378
+ * Quickly checks if a model is fully cached by verifying `config.json` is present,
32379
+ * then confirming all required files are cached.
32380
+ * Returns a plain boolean — use `is_cached_files` if you need per-file detail.
31279
32381
  *
31280
32382
  * @param {string} modelId - The model id
31281
32383
  * @param {Object} [options] - Optional parameters
32384
+ * @param {string} [options.cache_dir] - Custom cache directory
32385
+ * @param {string} [options.revision] - Model revision (default: 'main')
32386
+ * @param {import('../../configs.js').PretrainedConfig} [options.config] - Pre-loaded config
31282
32387
  * @param {import('../dtypes.js').DataType|Record<string, import('../dtypes.js').DataType>} [options.dtype=null] - Override dtype
31283
32388
  * @param {import('../devices.js').DeviceType|Record<string, import('../devices.js').DeviceType>} [options.device=null] - Override device
31284
- * @returns {Promise<import('./is_cached.js').CacheCheckResult>} Object with allCached boolean and files array with cache status
32389
+ * @returns {Promise<boolean>} Whether all required files are cached
31285
32390
  *
31286
32391
  * @example
31287
- * const status = await ModelRegistry.is_cached('onnx-community/bert-base-uncased-ONNX');
31288
- * console.log(status.allCached); // true or false
32392
+ * const cached = await ModelRegistry.is_cached('onnx-community/bert-base-uncased-ONNX');
32393
+ * console.log(cached); // true or false
31289
32394
  */
31290
32395
  static async is_cached(modelId, options = {}) {
31291
32396
  return is_cached(modelId, options);
31292
32397
  }
31293
32398
  /**
31294
- * Check if all files for a specific pipeline task are cached.
31295
- * Automatically determines which components are needed based on the task.
32399
+ * Checks if all files for a given model are already cached, with per-file detail.
32400
+ * Automatically determines which files are needed using get_files().
31296
32401
  *
31297
- * @param {string} task - The pipeline task (e.g., "text-generation", "background-removal")
31298
32402
  * @param {string} modelId - The model id
31299
32403
  * @param {Object} [options] - Optional parameters
31300
32404
  * @param {string} [options.cache_dir] - Custom cache directory
@@ -31305,12 +32409,57 @@ var ModelRegistry = class {
31305
32409
  * @returns {Promise<import('./is_cached.js').CacheCheckResult>} Object with allCached boolean and files array with cache status
31306
32410
  *
31307
32411
  * @example
31308
- * const status = await ModelRegistry.is_pipeline_cached('text-generation', 'onnx-community/gpt2-ONNX');
32412
+ * const status = await ModelRegistry.is_cached_files('onnx-community/bert-base-uncased-ONNX');
31309
32413
  * console.log(status.allCached); // true or false
32414
+ * console.log(status.files); // [{ file: 'config.json', cached: true }, ...]
32415
+ */
32416
+ static async is_cached_files(modelId, options = {}) {
32417
+ return is_cached_files(modelId, options);
32418
+ }
32419
+ /**
32420
+ * Quickly checks if all files for a specific pipeline task are cached by verifying
32421
+ * `config.json` is present, then confirming all required files are cached.
32422
+ * Returns a plain boolean — use `is_pipeline_cached_files` if you need per-file detail.
32423
+ *
32424
+ * @param {string} task - The pipeline task (e.g., "text-generation", "background-removal")
32425
+ * @param {string} modelId - The model id
32426
+ * @param {Object} [options] - Optional parameters
32427
+ * @param {string} [options.cache_dir] - Custom cache directory
32428
+ * @param {string} [options.revision] - Model revision (default: 'main')
32429
+ * @param {import('../../configs.js').PretrainedConfig} [options.config] - Pre-loaded config
32430
+ * @param {import('../dtypes.js').DataType|Record<string, import('../dtypes.js').DataType>} [options.dtype=null] - Override dtype
32431
+ * @param {import('../devices.js').DeviceType|Record<string, import('../devices.js').DeviceType>} [options.device=null] - Override device
32432
+ * @returns {Promise<boolean>} Whether all required files are cached
32433
+ *
32434
+ * @example
32435
+ * const cached = await ModelRegistry.is_pipeline_cached('text-generation', 'onnx-community/gpt2-ONNX');
32436
+ * console.log(cached); // true or false
31310
32437
  */
31311
32438
  static async is_pipeline_cached(task, modelId, options = {}) {
31312
32439
  return is_pipeline_cached(task, modelId, options);
31313
32440
  }
32441
+ /**
32442
+ * Checks if all files for a specific pipeline task are already cached, with per-file detail.
32443
+ * Automatically determines which components are needed based on the task.
32444
+ *
32445
+ * @param {string} task - The pipeline task (e.g., "text-generation", "background-removal")
32446
+ * @param {string} modelId - The model id
32447
+ * @param {Object} [options] - Optional parameters
32448
+ * @param {string} [options.cache_dir] - Custom cache directory
32449
+ * @param {string} [options.revision] - Model revision (default: 'main')
32450
+ * @param {import('../../configs.js').PretrainedConfig} [options.config] - Pre-loaded config
32451
+ * @param {import('../dtypes.js').DataType|Record<string, import('../dtypes.js').DataType>} [options.dtype=null] - Override dtype
32452
+ * @param {import('../devices.js').DeviceType|Record<string, import('../devices.js').DeviceType>} [options.device=null] - Override device
32453
+ * @returns {Promise<import('./is_cached.js').CacheCheckResult>} Object with allCached boolean and files array with cache status
32454
+ *
32455
+ * @example
32456
+ * const status = await ModelRegistry.is_pipeline_cached_files('text-generation', 'onnx-community/gpt2-ONNX');
32457
+ * console.log(status.allCached); // true or false
32458
+ * console.log(status.files); // [{ file: 'config.json', cached: true }, ...]
32459
+ */
32460
+ static async is_pipeline_cached_files(task, modelId, options = {}) {
32461
+ return is_pipeline_cached_files(task, modelId, options);
32462
+ }
31314
32463
  /**
31315
32464
  * Get metadata for a specific file without downloading it.
31316
32465
  *
@@ -31590,6 +32739,7 @@ export {
31590
32739
  DonutImageProcessor,
31591
32740
  DonutSwinModel,
31592
32741
  DonutSwinPreTrainedModel,
32742
+ DynamicCache,
31593
32743
  EdgeTamModel,
31594
32744
  EfficientNetForImageClassification,
31595
32745
  EfficientNetImageProcessor,
@@ -31662,6 +32812,7 @@ export {
31662
32812
  Gemma3Model,
31663
32813
  Gemma3PreTrainedModel,
31664
32814
  Gemma3nAudioFeatureExtractor,
32815
+ Gemma3nForCausalLM,
31665
32816
  Gemma3nForConditionalGeneration,
31666
32817
  Gemma3nPreTrainedModel,
31667
32818
  Gemma3nProcessor,
@@ -31681,6 +32832,9 @@ export {
31681
32832
  GraniteMoeHybridModel,
31682
32833
  GraniteMoeHybridPreTrainedModel,
31683
32834
  GranitePreTrainedModel,
32835
+ GraniteSpeechFeatureExtractor,
32836
+ GraniteSpeechForConditionalGeneration,
32837
+ GraniteSpeechProcessor,
31684
32838
  GroundingDinoForObjectDetection,
31685
32839
  GroundingDinoImageProcessor,
31686
32840
  GroundingDinoPreTrainedModel,
@@ -31706,7 +32860,6 @@ export {
31706
32860
  IJepaPreTrainedModel,
31707
32861
  Idefics3ForConditionalGeneration,
31708
32862
  Idefics3ImageProcessor,
31709
- Idefics3PreTrainedModel,
31710
32863
  Idefics3Processor,
31711
32864
  ImageClassificationPipeline,
31712
32865
  ImageFeatureExtractionPipeline,
@@ -31731,6 +32884,9 @@ export {
31731
32884
  Lfm2MoeModel,
31732
32885
  Lfm2MoePreTrainedModel,
31733
32886
  Lfm2PreTrainedModel,
32887
+ Lfm2VlForConditionalGeneration,
32888
+ Lfm2VlImageProcessor,
32889
+ Lfm2VlProcessor,
31734
32890
  LiteWhisperForConditionalGeneration,
31735
32891
  Llama4ForCausalLM,
31736
32892
  Llama4PreTrainedModel,
@@ -31895,6 +33051,9 @@ export {
31895
33051
  Olmo3Model,
31896
33052
  Olmo3PreTrainedModel,
31897
33053
  OlmoForCausalLM,
33054
+ OlmoHybridForCausalLM,
33055
+ OlmoHybridModel,
33056
+ OlmoHybridPreTrainedModel,
31898
33057
  OlmoModel,
31899
33058
  OlmoPreTrainedModel,
31900
33059
  OpenELMForCausalLM,
@@ -31911,7 +33070,6 @@ export {
31911
33070
  Owlv2Model,
31912
33071
  Owlv2PreTrainedModel,
31913
33072
  PaliGemmaForConditionalGeneration,
31914
- PaliGemmaPreTrainedModel,
31915
33073
  PaliGemmaProcessor,
31916
33074
  ParakeetFeatureExtractor,
31917
33075
  ParakeetForCTC,
@@ -31950,20 +33108,36 @@ export {
31950
33108
  QuestionAnsweringPipeline,
31951
33109
  Qwen2ForCausalLM,
31952
33110
  Qwen2Model,
33111
+ Qwen2MoeForCausalLM,
33112
+ Qwen2MoeModel,
33113
+ Qwen2MoePreTrainedModel,
31953
33114
  Qwen2PreTrainedModel,
31954
33115
  Qwen2Tokenizer,
33116
+ Qwen2VLForCausalLM,
31955
33117
  Qwen2VLForConditionalGeneration,
31956
33118
  Qwen2VLImageProcessor,
31957
33119
  Qwen2VLPreTrainedModel,
31958
33120
  Qwen2VLProcessor,
33121
+ Qwen2_5_VLForCausalLM,
31959
33122
  Qwen2_5_VLForConditionalGeneration,
31960
33123
  Qwen2_5_VLProcessor,
31961
33124
  Qwen3ForCausalLM,
31962
33125
  Qwen3Model,
33126
+ Qwen3MoeForCausalLM,
33127
+ Qwen3MoeModel,
33128
+ Qwen3MoePreTrainedModel,
33129
+ Qwen3NextForCausalLM,
33130
+ Qwen3NextModel,
33131
+ Qwen3NextPreTrainedModel,
31963
33132
  Qwen3PreTrainedModel,
33133
+ Qwen3VLForCausalLM,
31964
33134
  Qwen3VLForConditionalGeneration,
33135
+ Qwen3VLMoeForCausalLM,
33136
+ Qwen3VLMoeForConditionalGeneration,
31965
33137
  Qwen3VLProcessor,
33138
+ Qwen3_5ForCausalLM,
31966
33139
  Qwen3_5ForConditionalGeneration,
33140
+ Qwen3_5MoeForCausalLM,
31967
33141
  Qwen3_5MoeForConditionalGeneration,
31968
33142
  RFDetrForObjectDetection,
31969
33143
  RFDetrModel,
@@ -32035,7 +33209,6 @@ export {
32035
33209
  SmolLM3ForCausalLM,
32036
33210
  SmolLM3Model,
32037
33211
  SmolLM3PreTrainedModel,
32038
- SmolVLMForConditionalGeneration,
32039
33212
  Idefics3ImageProcessor as SmolVLMImageProcessor,
32040
33213
  Idefics3Processor as SmolVLMProcessor,
32041
33214
  SnacDecoderModel,
@@ -32141,6 +33314,10 @@ export {
32141
33314
  VitsTokenizer,
32142
33315
  VoxtralForConditionalGeneration,
32143
33316
  VoxtralProcessor,
33317
+ VoxtralRealtimeFeatureExtractor,
33318
+ VoxtralRealtimeForConditionalGeneration,
33319
+ VoxtralRealtimePreTrainedModel,
33320
+ VoxtralRealtimeProcessor,
32144
33321
  Wav2Vec2BertForCTC,
32145
33322
  Wav2Vec2BertForSequenceClassification,
32146
33323
  Wav2Vec2BertModel,
@@ -32236,7 +33413,7 @@ export {
32236
33413
 
32237
33414
  onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
32238
33415
  (*!
32239
- * ONNX Runtime Web v1.25.0-dev.20260228-6e72d31970
33416
+ * ONNX Runtime Web v1.25.0-dev.20260307-d626b568e0
32240
33417
  * Copyright (c) Microsoft Corporation. All rights reserved.
32241
33418
  * Licensed under the MIT License.
32242
33419
  *)