@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (194)
  1. package/README.md +16 -2
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
  3. package/dist/transformers.js +2255 -931
  4. package/dist/transformers.min.js +19 -19
  5. package/dist/transformers.node.cjs +2300 -934
  6. package/dist/transformers.node.min.cjs +20 -20
  7. package/dist/transformers.node.min.mjs +20 -20
  8. package/dist/transformers.node.mjs +2336 -1012
  9. package/dist/transformers.web.js +2327 -1003
  10. package/dist/transformers.web.min.js +17 -17
  11. package/package.json +4 -4
  12. package/src/cache_utils.js +62 -0
  13. package/src/configs.js +45 -24
  14. package/src/env.js +8 -1
  15. package/src/image_processors_utils.js +27 -17
  16. package/src/models/chatterbox/modeling_chatterbox.js +1 -1
  17. package/src/models/chmv2/image_processing_chmv2.js +3 -0
  18. package/src/models/chmv2/modeling_chmv2.js +4 -0
  19. package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
  20. package/src/models/detr/image_processing_detr.js +1 -1
  21. package/src/models/eurobert/modeling_eurobert.js +41 -0
  22. package/src/models/feature_extractors.js +2 -0
  23. package/src/models/gemma3n/modeling_gemma3n.js +2 -0
  24. package/src/models/glm46v/image_processing_glm46v.js +12 -0
  25. package/src/models/glm46v/processing_glm46v.js +5 -0
  26. package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
  27. package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
  28. package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
  29. package/src/models/granite_speech/modeling_granite_speech.js +5 -0
  30. package/src/models/granite_speech/processing_granite_speech.js +62 -0
  31. package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
  32. package/src/models/idefics3/modeling_idefics3.js +5 -32
  33. package/src/models/image_processors.js +3 -0
  34. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
  35. package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
  36. package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
  37. package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
  38. package/src/models/llava/modeling_llava.js +1 -1
  39. package/src/models/mistral3/modeling_mistral3.js +2 -2
  40. package/src/models/mistral4/modeling_mistral4.js +5 -0
  41. package/src/models/modeling_utils.js +224 -308
  42. package/src/models/models.js +14 -1
  43. package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
  44. package/src/models/paligemma/modeling_paligemma.js +2 -25
  45. package/src/models/processors.js +4 -0
  46. package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
  47. package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
  48. package/src/models/qwen2_vl/modeling_qwen2_vl.js +194 -143
  49. package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
  50. package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
  51. package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
  52. package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
  53. package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +2 -1
  54. package/src/models/registry.js +42 -0
  55. package/src/models/sam/image_processing_sam.js +1 -1
  56. package/src/models/session.js +17 -6
  57. package/src/models/smolvlm/modeling_smolvlm.js +7 -0
  58. package/src/models/solar_open/modeling_solar_open.js +5 -0
  59. package/src/models/ultravox/modeling_ultravox.js +1 -3
  60. package/src/models/voxtral/modeling_voxtral.js +3 -0
  61. package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
  62. package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
  63. package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
  64. package/src/models/whisper/feature_extraction_whisper.js +2 -12
  65. package/src/pipelines.js +1 -0
  66. package/src/transformers.js +2 -0
  67. package/src/utils/audio.js +18 -2
  68. package/src/utils/cache/CrossOriginStorageCache.js +251 -0
  69. package/src/utils/cache/cross-origin-storage.d.ts +38 -0
  70. package/src/utils/cache.js +5 -0
  71. package/src/utils/hub.js +4 -1
  72. package/src/utils/lru_cache.js +67 -0
  73. package/src/utils/memoize_promise.js +45 -0
  74. package/src/utils/model_registry/get_file_metadata.js +15 -2
  75. package/src/utils/model_registry/get_model_files.js +52 -78
  76. package/src/utils/tensor.js +18 -2
  77. package/types/cache_utils.d.ts +29 -0
  78. package/types/cache_utils.d.ts.map +1 -0
  79. package/types/configs.d.ts.map +1 -1
  80. package/types/env.d.ts +8 -0
  81. package/types/env.d.ts.map +1 -1
  82. package/types/image_processors_utils.d.ts +18 -1
  83. package/types/image_processors_utils.d.ts.map +1 -1
  84. package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
  85. package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
  86. package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
  87. package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
  88. package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
  89. package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
  90. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
  91. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
  92. package/types/models/detr/image_processing_detr.d.ts +1 -1
  93. package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
  94. package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
  95. package/types/models/feature_extractors.d.ts +2 -0
  96. package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
  97. package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
  98. package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
  99. package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
  100. package/types/models/glm46v/processing_glm46v.d.ts +4 -0
  101. package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
  102. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
  103. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
  104. package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
  105. package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
  106. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
  107. package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
  108. package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
  109. package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
  110. package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
  111. package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
  112. package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
  113. package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
  114. package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
  115. package/types/models/image_processors.d.ts +3 -0
  116. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
  117. package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
  118. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
  119. package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
  120. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
  121. package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
  122. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
  123. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
  124. package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
  125. package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
  126. package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
  127. package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
  128. package/types/models/modeling_utils.d.ts +44 -35
  129. package/types/models/modeling_utils.d.ts.map +1 -1
  130. package/types/models/models.d.ts +14 -1
  131. package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
  132. package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
  133. package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
  134. package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
  135. package/types/models/processors.d.ts +4 -0
  136. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
  137. package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
  138. package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
  139. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +43 -6
  140. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  141. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
  142. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  143. package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
  144. package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
  145. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
  146. package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
  147. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
  148. package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
  149. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +3 -0
  150. package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -1
  151. package/types/models/registry.d.ts.map +1 -1
  152. package/types/models/sam/image_processing_sam.d.ts +1 -1
  153. package/types/models/session.d.ts +3 -2
  154. package/types/models/session.d.ts.map +1 -1
  155. package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
  156. package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
  157. package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
  158. package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
  159. package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
  160. package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
  161. package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
  162. package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
  163. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
  164. package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
  165. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
  166. package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
  167. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
  168. package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
  169. package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
  170. package/types/pipelines.d.ts +1 -0
  171. package/types/pipelines.d.ts.map +1 -1
  172. package/types/transformers.d.ts +1 -0
  173. package/types/transformers.d.ts.map +1 -1
  174. package/types/utils/audio.d.ts +5 -2
  175. package/types/utils/audio.d.ts.map +1 -1
  176. package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
  177. package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
  178. package/types/utils/cache.d.ts.map +1 -1
  179. package/types/utils/dtypes.d.ts +1 -1
  180. package/types/utils/hub.d.ts.map +1 -1
  181. package/types/utils/image.d.ts +1 -1
  182. package/types/utils/lru_cache.d.ts +38 -0
  183. package/types/utils/lru_cache.d.ts.map +1 -0
  184. package/types/utils/memoize_promise.d.ts +14 -0
  185. package/types/utils/memoize_promise.d.ts.map +1 -0
  186. package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
  187. package/types/utils/model_registry/get_model_files.d.ts +1 -0
  188. package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
  189. package/types/utils/tensor.d.ts.map +1 -1
  190. package/src/utils/data-structures.js +0 -572
  191. package/types/models/ast/modeling_ast.d.ts.map +0 -1
  192. package/types/utils/data-structures.d.ts +0 -294
  193. package/types/utils/data-structures.d.ts.map +0 -1
  194. package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
@@ -14,7 +14,7 @@ var __export = (target, all) => {
14
14
  import fs from "fs";
15
15
  import path from "path";
16
16
  import url from "url";
17
- var VERSION = "4.0.0-next.6";
17
+ var VERSION = "4.0.0-next.8";
18
18
  var HAS_SELF = typeof self !== "undefined";
19
19
  var IS_FS_AVAILABLE = !isEmpty(fs);
20
20
  var IS_PATH_AVAILABLE = !isEmpty(path);
@@ -142,6 +142,7 @@ var env = {
142
142
  customCache: null,
143
143
  useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
144
144
  cacheKey: "transformers-cache",
145
+ experimental_useCrossOriginStorage: false,
145
146
  /////////////////// Custom fetch /////////////////////
146
147
  fetch: DEFAULT_FETCH
147
148
  //////////////////////////////////////////////////////
@@ -243,7 +244,7 @@ var logger = {
243
244
  }
244
245
  };
245
246
 
246
- // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.2/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
247
+ // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
247
248
  var DictionarySplitter = class {
248
249
  /**
249
250
  * @param dictionary The dictionary of words to use for splitting.
@@ -1899,10 +1900,10 @@ var BPE = class extends TokenizerModel_default {
1899
1900
  );
1900
1901
  if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
1901
1902
  output_tokens.push(...byte_tokens);
1902
- } else {
1903
+ } else if (this.unk_token != null) {
1903
1904
  output_tokens.push(this.unk_token);
1904
1905
  }
1905
- } else {
1906
+ } else if (this.unk_token != null) {
1906
1907
  output_tokens.push(this.unk_token);
1907
1908
  }
1908
1909
  }
@@ -2692,7 +2693,7 @@ var Tokenizer = class {
2692
2693
  };
2693
2694
  var Tokenizer_default = Tokenizer;
2694
2695
 
2695
- // ../../node_modules/.pnpm/@huggingface+jinja@0.5.5/node_modules/@huggingface/jinja/dist/index.js
2696
+ // ../../node_modules/.pnpm/@huggingface+jinja@0.5.6/node_modules/@huggingface/jinja/dist/index.js
2696
2697
  var TOKEN_TYPES = Object.freeze({
2697
2698
  Text: "Text",
2698
2699
  // The text between Jinja statements or expressions
@@ -4211,7 +4212,11 @@ var Environment = class {
4211
4212
  ["number", (operand) => operand instanceof IntegerValue || operand instanceof FloatValue],
4212
4213
  ["integer", (operand) => operand instanceof IntegerValue],
4213
4214
  ["iterable", (operand) => operand.type === "ArrayValue" || operand.type === "StringValue"],
4214
- ["mapping", (operand) => operand.type === "ObjectValue"],
4215
+ ["mapping", (operand) => operand instanceof ObjectValue],
4216
+ [
4217
+ "sequence",
4218
+ (operand) => operand instanceof ArrayValue || operand instanceof ObjectValue || operand instanceof StringValue
4219
+ ],
4215
4220
  [
4216
4221
  "lower",
4217
4222
  (operand) => {
@@ -4484,6 +4489,9 @@ var Interpreter = class {
4484
4489
  applyFilter(operand, filterNode, environment) {
4485
4490
  if (filterNode.type === "Identifier") {
4486
4491
  const filter = filterNode;
4492
+ if (filter.value === "safe") {
4493
+ return operand;
4494
+ }
4487
4495
  if (filter.value === "tojson") {
4488
4496
  return new StringValue(toJSON(operand, {}));
4489
4497
  }
@@ -4573,6 +4581,8 @@ var Interpreter = class {
4573
4581
  return new IntegerValue(Math.floor(operand.value));
4574
4582
  case "float":
4575
4583
  return new FloatValue(operand.value);
4584
+ case "string":
4585
+ return new StringValue(operand.toString());
4576
4586
  default:
4577
4587
  throw new Error(`Unknown NumericValue filter: ${filter.value}`);
4578
4588
  }
@@ -6000,9 +6010,216 @@ function toAbsoluteURL(url2) {
6000
6010
  return new URL(url2, baseURL).href;
6001
6011
  }
6002
6012
 
6013
+ // src/utils/cache/CrossOriginStorageCache.js
6014
+ var HASH_ALGORITHM = "SHA-256";
6015
+ var HASH_CACHE_NAME = "experimental_transformers-hash-cache";
6016
+ var makeHashDescriptor = (value) => ({ algorithm: HASH_ALGORITHM, value });
6017
+ var CrossOriginStorage = class {
6018
+ /** @type {Promise<Cache> | null} */
6019
+ #hashCache = null;
6020
+ /**
6021
+ * Returns (and lazily opens) the hash cache, reusing the same promise across concurrent callers.
6022
+ * @returns {Promise<Cache>}
6023
+ */
6024
+ _getHashCache = () => {
6025
+ this.#hashCache ??= caches.open(HASH_CACHE_NAME);
6026
+ return this.#hashCache;
6027
+ };
6028
+ /**
6029
+ * Returns whether the `navigator.crossOriginStorage` API is available in the current environment.
6030
+ * @returns {boolean}
6031
+ */
6032
+ static isAvailable = () => typeof navigator !== "undefined" && "crossOriginStorage" in navigator;
6033
+ /**
6034
+ * Looks up a cached response for the given URL by resolving its SHA-256 hash and requesting
6035
+ * the corresponding file handle from cross-origin storage.
6036
+ *
6037
+ * Implements `CacheInterface.match`.
6038
+ *
6039
+ * @param {string} request The URL of the resource to look up.
6040
+ * @returns {Promise<Response|undefined>} The cached `Response`, or `undefined` if not found.
6041
+ */
6042
+ match = async (request) => {
6043
+ const hashValue = await this._getFileHash(request);
6044
+ if (!hashValue) {
6045
+ return void 0;
6046
+ }
6047
+ try {
6048
+ const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashValue)]);
6049
+ const blob = await handle.getFile();
6050
+ return new Response(blob, {
6051
+ headers: {
6052
+ "Content-Length": String(blob.size)
6053
+ }
6054
+ });
6055
+ } catch {
6056
+ return void 0;
6057
+ }
6058
+ };
6059
+ /**
6060
+ * Stores a response in cross-origin storage, keyed by its SHA-256 hash.
6061
+ *
6062
+ * For LFS-backed URLs the hash is resolved cheaply via `_getFileHash` (which checks
6063
+ * `HASH_CACHE_NAME` first, then falls back to fetching the Git LFS pointer file)
6064
+ * without reading the response body a second time.
6065
+ *
6066
+ * For non-LFS resources the hash is unknown upfront. In that case the body is consumed
6067
+ * in the background: the stream is read to compute the content hash, the file is written
6068
+ * into cross-origin storage, and the computed hash is persisted to `HASH_CACHE_NAME`
6069
+ * so that future `match` calls can resolve the file without a network round-trip.
6070
+ *
6071
+ * Implements `CacheInterface.put`.
6072
+ *
6073
+ * @param {string} request The URL of the resource (used as the hash-cache key).
6074
+ * @param {Response} response The response whose body will be written to the cache.
6075
+ * @returns {Promise<void>}
6076
+ */
6077
+ put = async (request, response) => {
6078
+ const hashValue = await this._getFileHash(request);
6079
+ if (hashValue) {
6080
+ const blob = await response.blob();
6081
+ await this._storeBlobInCOS(blob, hashValue);
6082
+ } else {
6083
+ this._processAndStore(request, response.body);
6084
+ }
6085
+ };
6086
+ /**
6087
+ * Writes a blob into cross-origin storage using the given pre-computed hex hash string.
6088
+ *
6089
+ * @param {Blob} blob
6090
+ * @param {string} hashHex Hex-encoded SHA-256 hash of `blob`.
6091
+ * @returns {Promise<void>}
6092
+ */
6093
+ _storeBlobInCOS = async (blob, hashHex) => {
6094
+ const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashHex)], {
6095
+ create: true
6096
+ });
6097
+ const writableStream = await handle.createWritable();
6098
+ await writableStream.write(blob);
6099
+ await writableStream.close();
6100
+ };
6101
+ /**
6102
+ * Background task for non-LFS resources: consumes `stream`, computes the SHA-256 hash
6103
+ * of the resulting blob, stores it in cross-origin storage, and persists the computed
6104
+ * hash to `HASH_CACHE_NAME` keyed by `request` so future `match` calls can resolve the
6105
+ * file without a network round-trip.
6106
+ *
6107
+ * Called fire-and-forget from `put` — errors are swallowed so failures never surface to
6108
+ * the caller.
6109
+ *
6110
+ * @param {string} request The original resource URL.
6111
+ * @param {ReadableStream} stream The response body stream to consume.
6112
+ * @returns {Promise<void>}
6113
+ */
6114
+ _processAndStore = async (request, stream) => {
6115
+ try {
6116
+ const chunks = [];
6117
+ for await (const chunk2 of stream) {
6118
+ chunks.push(chunk2);
6119
+ }
6120
+ const blob = new Blob(chunks);
6121
+ const hashHex = await this._getBlobHash(blob);
6122
+ await this._storeBlobInCOS(blob, hashHex);
6123
+ try {
6124
+ const hashCache = await this._getHashCache();
6125
+ await hashCache.put(request, new Response(hashHex));
6126
+ } catch {
6127
+ }
6128
+ } catch {
6129
+ }
6130
+ };
6131
+ /**
6132
+ * Deletes the cache entry for the given request.
6133
+ *
6134
+ * Removes the hash entry from `HASH_CACHE_NAME`. Note: cross-origin storage itself does not
6135
+ * expose a delete API, so only the local hash mapping is removed. For non-LFS URLs this
6136
+ * permanently prevents `match` from resolving the file. For LFS-backed URLs, `match` will
6137
+ * re-fetch the LFS pointer file on the next call and repopulate the hash cache automatically.
6138
+ *
6139
+ * Implements `CacheInterface.delete`.
6140
+ *
6141
+ * @param {string} request
6142
+ * @returns {Promise<boolean>} Resolves to `true` if the hash entry was deleted, `false` otherwise.
6143
+ */
6144
+ delete = async (request) => {
6145
+ try {
6146
+ const hashCache = await this._getHashCache();
6147
+ return await hashCache.delete(request);
6148
+ } catch {
6149
+ return false;
6150
+ }
6151
+ };
6152
+ /**
6153
+ * Resolves the SHA-256 hash for a given URL.
6154
+ *
6155
+ * Returns the cached hash immediately if one has been persisted to `HASH_CACHE_NAME`.
6156
+ * Otherwise falls back to `_getLfsFileHash` to retrieve the hash from the Hugging Face
6157
+ * LFS pointer file, persisting the result to `HASH_CACHE_NAME` for future lookups.
6158
+ *
6159
+ * Returns `null` if the hash cannot be determined (e.g. non-LFS URL with no cached entry).
6160
+ *
6161
+ * @param {string} url The resource URL to resolve a hash for.
6162
+ * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
6163
+ */
6164
+ _getFileHash = async (url2) => {
6165
+ try {
6166
+ const hashCache = await this._getHashCache();
6167
+ const cached = await hashCache.match(url2);
6168
+ if (cached) {
6169
+ return cached.text();
6170
+ }
6171
+ const hash = await this._getLfsFileHash(url2);
6172
+ if (hash) {
6173
+ await hashCache.put(url2, new Response(hash));
6174
+ return hash;
6175
+ }
6176
+ return null;
6177
+ } catch {
6178
+ return null;
6179
+ }
6180
+ };
6181
+ /**
6182
+ * Attempts to retrieve the SHA-256 hash for a Hugging Face resource URL from its raw
6183
+ * Git LFS pointer file.
6184
+ *
6185
+ * Only applicable to URLs containing `/resolve/` (i.e. Hugging Face resolved file URLs).
6186
+ * The `/resolve/` segment is rewritten to `/raw/` to fetch the LFS pointer directly.
6187
+ * Returns `null` for non-LFS URLs or when the network request fails.
6188
+ *
6189
+ * @see https://huggingface.co/docs/hub/en/storage-backends#xet
6190
+ * @param {string} url The resolved Hugging Face URL of the resource.
6191
+ * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
6192
+ */
6193
+ _getLfsFileHash = async (url2) => {
6194
+ if (!url2.includes("/resolve/")) {
6195
+ return null;
6196
+ }
6197
+ const rawUrl = url2.replace("/resolve/", "/raw/");
6198
+ try {
6199
+ const text = await fetch(rawUrl).then((r) => r.text());
6200
+ const match = text.match(/^oid sha256:([0-9a-f]+)$/m);
6201
+ return match ? match[1] : null;
6202
+ } catch {
6203
+ return null;
6204
+ }
6205
+ };
6206
+ /**
6207
+ * Computes the SHA-256 hash of a `Blob`'s contents.
6208
+ *
6209
+ * @param {Blob} blob The blob to hash.
6210
+ * @returns {Promise<string>} The lowercase hex-encoded SHA-256 hash.
6211
+ */
6212
+ _getBlobHash = async (blob) => {
6213
+ const arrayBuffer = await blob.arrayBuffer();
6214
+ const hashBuffer = await crypto.subtle.digest(HASH_ALGORITHM, arrayBuffer);
6215
+ const hashArray = Array.from(new Uint8Array(hashBuffer));
6216
+ return hashArray.map((byte) => byte.toString(16).padStart(2, "0")).join("");
6217
+ };
6218
+ };
6219
+
6003
6220
  // src/utils/cache.js
6004
6221
  async function getCache(file_cache_dir = null) {
6005
- let cache = null;
6222
+ let cache2 = null;
6006
6223
  if (env.useCustomCache) {
6007
6224
  if (!env.customCache) {
6008
6225
  throw Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
@@ -6012,30 +6229,33 @@ async function getCache(file_cache_dir = null) {
6012
6229
  "`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
6013
6230
  );
6014
6231
  }
6015
- cache = env.customCache;
6232
+ cache2 = env.customCache;
6233
+ }
6234
+ if (!cache2 && env.experimental_useCrossOriginStorage && CrossOriginStorage.isAvailable()) {
6235
+ cache2 = new CrossOriginStorage();
6016
6236
  }
6017
- if (!cache && env.useBrowserCache) {
6237
+ if (!cache2 && env.useBrowserCache) {
6018
6238
  if (typeof caches === "undefined") {
6019
6239
  throw Error("Browser cache is not available in this environment.");
6020
6240
  }
6021
6241
  try {
6022
- cache = await caches.open(env.cacheKey);
6242
+ cache2 = await caches.open(env.cacheKey);
6023
6243
  } catch (e) {
6024
6244
  logger.warn("An error occurred while opening the browser cache:", e);
6025
6245
  }
6026
6246
  }
6027
- if (!cache && env.useFSCache) {
6247
+ if (!cache2 && env.useFSCache) {
6028
6248
  if (!apis.IS_FS_AVAILABLE) {
6029
6249
  throw Error("File System Cache is not available in this environment.");
6030
6250
  }
6031
- cache = new FileCache(file_cache_dir ?? env.cacheDir);
6251
+ cache2 = new FileCache(file_cache_dir ?? env.cacheDir);
6032
6252
  }
6033
- return cache;
6253
+ return cache2;
6034
6254
  }
6035
- async function tryCache(cache, ...names) {
6255
+ async function tryCache(cache2, ...names) {
6036
6256
  for (let name of names) {
6037
6257
  try {
6038
- let result = await cache.match(name);
6258
+ let result = await cache2.match(name);
6039
6259
  if (result) return result;
6040
6260
  } catch (e) {
6041
6261
  continue;
@@ -6044,6 +6264,83 @@ async function tryCache(cache, ...names) {
6044
6264
  return void 0;
6045
6265
  }
6046
6266
 
6267
+ // src/utils/lru_cache.js
6268
+ var LRUCache2 = class {
6269
+ /** @type {number} */
6270
+ #capacity;
6271
+ /** @type {Map<any, any>} */
6272
+ #cache;
6273
+ /**
6274
+ * Creates an LRUCache instance.
6275
+ * @param {number} capacity The maximum number of items the cache can hold.
6276
+ */
6277
+ constructor(capacity) {
6278
+ this.#capacity = capacity;
6279
+ this.#cache = /* @__PURE__ */ new Map();
6280
+ }
6281
+ /**
6282
+ * Retrieves the value associated with the given key and marks the key as recently used.
6283
+ * @param {any} key The key to retrieve.
6284
+ * @returns {any} The value associated with the key, or undefined if the key does not exist.
6285
+ */
6286
+ get(key) {
6287
+ if (!this.#cache.has(key)) return void 0;
6288
+ const value = this.#cache.get(key);
6289
+ this.#cache.delete(key);
6290
+ this.#cache.set(key, value);
6291
+ return value;
6292
+ }
6293
+ /**
6294
+ * Inserts or updates the key-value pair in the cache.
6295
+ * If the key already exists, it is updated and marked as recently used.
6296
+ * If the cache exceeds its capacity, the least recently used item is evicted.
6297
+ * @param {any} key The key to add or update.
6298
+ * @param {any} value The value to associate with the key.
6299
+ */
6300
+ put(key, value) {
6301
+ if (this.#cache.has(key)) {
6302
+ this.#cache.delete(key);
6303
+ }
6304
+ this.#cache.set(key, value);
6305
+ if (this.#cache.size > this.#capacity) {
6306
+ this.#cache.delete(this.#cache.keys().next().value);
6307
+ }
6308
+ }
6309
+ /**
6310
+ * Removes the entry for the given key from the cache.
6311
+ * @param {any} key The key to delete.
6312
+ * @returns {boolean} `true` if the entry existed and was removed, `false` otherwise.
6313
+ */
6314
+ delete(key) {
6315
+ return this.#cache.delete(key);
6316
+ }
6317
+ /**
6318
+ * Clears the cache.
6319
+ */
6320
+ clear() {
6321
+ this.#cache.clear();
6322
+ }
6323
+ };
6324
+
6325
+ // src/utils/memoize_promise.js
6326
+ var MAX_CACHE_SIZE = 100;
6327
+ var cache = new LRUCache2(MAX_CACHE_SIZE);
6328
+ function memoizePromise(key, factory) {
6329
+ const cached = cache.get(key);
6330
+ if (cached !== void 0) {
6331
+ return cached;
6332
+ }
6333
+ const promise = factory().then(
6334
+ (value) => value,
6335
+ (err) => {
6336
+ cache.delete(key);
6337
+ return Promise.reject(err);
6338
+ }
6339
+ );
6340
+ cache.put(key, promise);
6341
+ return promise;
6342
+ }
6343
+
6047
6344
  // src/utils/model_registry/get_file_metadata.js
6048
6345
  async function fetch_file_head(urlOrPath) {
6049
6346
  if (!isValidUrl(urlOrPath, ["http:", "https:"])) {
@@ -6051,17 +6348,27 @@ async function fetch_file_head(urlOrPath) {
6051
6348
  }
6052
6349
  const headers = getFetchHeaders(urlOrPath);
6053
6350
  headers.set("Range", "bytes=0-0");
6054
- return env.fetch(urlOrPath, { method: "GET", headers });
6351
+ return env.fetch(urlOrPath, { method: "GET", headers, cache: "no-store" });
6352
+ }
6353
+ function get_file_metadata(path_or_repo_id, filename, options = {}) {
6354
+ const key = JSON.stringify([
6355
+ path_or_repo_id,
6356
+ filename,
6357
+ options?.revision,
6358
+ options?.cache_dir,
6359
+ options?.local_files_only
6360
+ ]);
6361
+ return memoizePromise(key, () => _get_file_metadata(path_or_repo_id, filename, options));
6055
6362
  }
6056
- async function get_file_metadata(path_or_repo_id, filename, options = {}) {
6057
- const cache = await getCache(options?.cache_dir);
6363
+ async function _get_file_metadata(path_or_repo_id, filename, options) {
6364
+ const cache2 = await getCache(options?.cache_dir);
6058
6365
  const { localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
6059
6366
  path_or_repo_id,
6060
6367
  filename,
6061
6368
  options,
6062
- cache
6369
+ cache2
6063
6370
  );
6064
- const cachedResponse = await checkCachedResource(cache, localPath, proposedCacheKey);
6371
+ const cachedResponse = await checkCachedResource(cache2, localPath, proposedCacheKey);
6065
6372
  if (cachedResponse !== void 0 && typeof cachedResponse !== "string") {
6066
6373
  const size = cachedResponse.headers.get("content-length");
6067
6374
  const contentType = cachedResponse.headers.get("content-type");
@@ -6159,7 +6466,7 @@ function getFetchHeaders(urlOrPath) {
6159
6466
  }
6160
6467
  return headers;
6161
6468
  }
6162
- function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = null) {
6469
+ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache2 = null) {
6163
6470
  const revision = options.revision ?? "main";
6164
6471
  const requestURL = pathJoin(path_or_repo_id, filename);
6165
6472
  const validModelId = isValidHfModelId(path_or_repo_id);
@@ -6169,7 +6476,7 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
6169
6476
  env.remotePathTemplate.replaceAll("{model}", path_or_repo_id).replaceAll("{revision}", encodeURIComponent(revision)),
6170
6477
  filename
6171
6478
  );
6172
- const proposedCacheKey = cache instanceof FileCache ? (
6479
+ const proposedCacheKey = cache2 instanceof FileCache ? (
6173
6480
  // Choose cache key for filesystem cache
6174
6481
  // When using the main revision (default), we use the request URL as the cache key.
6175
6482
  // If a specific revision is requested, we account for this in the cache key.
@@ -6183,14 +6490,14 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
6183
6490
  validModelId
6184
6491
  };
6185
6492
  }
6186
- async function checkCachedResource(cache, localPath, proposedCacheKey) {
6187
- if (!cache) {
6493
+ async function checkCachedResource(cache2, localPath, proposedCacheKey) {
6494
+ if (!cache2) {
6188
6495
  return void 0;
6189
6496
  }
6190
- return await tryCache(cache, localPath, proposedCacheKey);
6497
+ return await tryCache(cache2, localPath, proposedCacheKey);
6191
6498
  }
6192
- async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, response, result, options = {}) {
6193
- if (await cache.match(cacheKey) !== void 0) {
6499
+ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options = {}) {
6500
+ if (await cache2.match(cacheKey) !== void 0) {
6194
6501
  return;
6195
6502
  }
6196
6503
  if (!result) {
@@ -6200,20 +6507,22 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
6200
6507
  file: filename,
6201
6508
  ...data
6202
6509
  }) : void 0;
6203
- await cache.put(
6510
+ await cache2.put(
6204
6511
  cacheKey,
6205
6512
  /** @type {Response} */
6206
6513
  response,
6207
6514
  wrapped_progress
6208
6515
  );
6209
6516
  } else if (typeof response !== "string") {
6210
- await cache.put(
6517
+ const headers = new Headers(response.headers);
6518
+ headers.set("content-length", result.byteLength.toString());
6519
+ await cache2.put(
6211
6520
  cacheKey,
6212
6521
  new Response(
6213
6522
  /** @type {any} */
6214
6523
  result,
6215
6524
  {
6216
- headers: response.headers
6525
+ headers
6217
6526
  }
6218
6527
  )
6219
6528
  ).catch((err) => {
@@ -6221,17 +6530,17 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
6221
6530
  });
6222
6531
  }
6223
6532
  }
6224
- async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache = null) {
6533
+ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache2 = null) {
6225
6534
  const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
6226
6535
  path_or_repo_id,
6227
6536
  filename,
6228
6537
  options,
6229
- cache
6538
+ cache2
6230
6539
  );
6231
6540
  let cacheKey;
6232
6541
  let toCacheResponse = false;
6233
6542
  let response;
6234
- response = await checkCachedResource(cache, localPath, proposedCacheKey);
6543
+ response = await checkCachedResource(cache2, localPath, proposedCacheKey);
6235
6544
  const cacheHit = response !== void 0;
6236
6545
  if (!cacheHit) {
6237
6546
  if (env.allowLocalModels) {
@@ -6272,7 +6581,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
6272
6581
  }
6273
6582
  cacheKey = proposedCacheKey;
6274
6583
  }
6275
- toCacheResponse = cache && // 1. A caching system is available
6584
+ toCacheResponse = cache2 && // 1. A caching system is available
6276
6585
  typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment)
6277
6586
  response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`)
6278
6587
  response.status === 200;
@@ -6334,7 +6643,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
6334
6643
  // i.e., do not cache FileResponses (prevents duplication)
6335
6644
  toCacheResponse && cacheKey && typeof response !== "string"
6336
6645
  ) {
6337
- await storeCachedResource(path_or_repo_id, filename, cache, cacheKey, response, result, options);
6646
+ await storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options);
6338
6647
  }
6339
6648
  dispatchCallback(options.progress_callback, {
6340
6649
  status: "done",
@@ -6350,7 +6659,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
6350
6659
  if (response instanceof FileResponse) {
6351
6660
  return response.filePath;
6352
6661
  }
6353
- const cachedResponse = await cache?.match(cacheKey);
6662
+ const cachedResponse = await cache2?.match(cacheKey);
6354
6663
  if (cachedResponse instanceof FileResponse) {
6355
6664
  return cachedResponse.filePath;
6356
6665
  } else if (cachedResponse instanceof Response) {
@@ -6377,8 +6686,8 @@ async function getModelFile(path_or_repo_id, filename, fatal = true, options = {
6377
6686
  name: path_or_repo_id,
6378
6687
  file: filename
6379
6688
  });
6380
- const cache = await getCache(options?.cache_dir);
6381
- return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache);
6689
+ const cache2 = await getCache(options?.cache_dir);
6690
+ return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache2);
6382
6691
  }
6383
6692
  async function getModelText(modelPath, fileName, fatal = true, options = {}) {
6384
6693
  const buffer = await getModelFile(modelPath, fileName, fatal, options, false);
@@ -7171,7 +7480,7 @@ var uint16_to_float32 = /* @__PURE__ */ (function() {
7171
7480
  // src/backends/onnx.js
7172
7481
  import * as ONNX_NODE from "onnxruntime-node";
7173
7482
 
7174
- // ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260303-e7e64dc112/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
7483
+ // ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260307-d626b568e0/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
7175
7484
  var ort_webgpu_bundle_min_exports = {};
7176
7485
  __export(ort_webgpu_bundle_min_exports, {
7177
7486
  InferenceSession: () => Jf,
@@ -7939,7 +8248,7 @@ async function ts(a = {}) {
7939
8248
  throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
7940
8249
  }
7941
8250
  function Ye() {
7942
- return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, H: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, g: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, I: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, h: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
8251
+ return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
7943
8252
  }
7944
8253
  async function bt() {
7945
8254
  function e(o, u) {
@@ -9126,7 +9435,7 @@ async function ts(a = {}) {
9126
9435
  Te(`invalid type for getValue: ${t}`);
9127
9436
  }
9128
9437
  }, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
9129
- var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 923180: (e, t, n, o, u) => {
9438
+ var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 925676: (e, t, n, o, u) => {
9130
9439
  if (r === void 0 || !r.Uc) return 1;
9131
9440
  if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
9132
9441
  if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
@@ -9146,11 +9455,11 @@ async function ts(a = {}) {
9146
9455
  } catch {
9147
9456
  return 4;
9148
9457
  }
9149
- }, 924004: (e, t, n) => {
9458
+ }, 926500: (e, t, n) => {
9150
9459
  r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
9151
- }, 924068: () => r.me(), 924110: (e) => {
9460
+ }, 926564: () => r.me(), 926606: (e) => {
9152
9461
  r.jd(e);
9153
- }, 924147: () => typeof wasmOffsetConverter < "u" };
9462
+ }, 926643: () => typeof wasmOffsetConverter < "u" };
9154
9463
  function af(e, t, n, o) {
9155
9464
  var u = P();
9156
9465
  try {
@@ -11066,7 +11375,7 @@ var $s = k(() => {
11066
11375
  Ve();
11067
11376
  Ve();
11068
11377
  Ve();
11069
- var Xa = "1.25.0-dev.20260303-e7e64dc112";
11378
+ var Xa = "1.25.0-dev.20260307-d626b568e0";
11070
11379
  var Tl = Zr;
11071
11380
  {
11072
11381
  let a = ($s(), $t(Gs)).wasmBackend;
@@ -11077,11 +11386,11 @@ Object.defineProperty(K.versions, "web", { value: Xa, enumerable: true });
11077
11386
  // src/backends/utils/cacheWasm.js
11078
11387
  async function loadAndCacheFile(url2) {
11079
11388
  const fileName = url2.split("/").pop();
11080
- let cache;
11389
+ let cache2;
11081
11390
  try {
11082
- cache = await getCache();
11083
- if (cache) {
11084
- const result = await cache.match(url2);
11391
+ cache2 = await getCache();
11392
+ if (cache2) {
11393
+ const result = await cache2.match(url2);
11085
11394
  if (result) {
11086
11395
  return result;
11087
11396
  }
@@ -11093,9 +11402,9 @@ async function loadAndCacheFile(url2) {
11093
11402
  if (!response.ok) {
11094
11403
  throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
11095
11404
  }
11096
- if (cache) {
11405
+ if (cache2) {
11097
11406
  try {
11098
- await cache.put(url2, response.clone());
11407
+ await cache2.put(url2, response.clone());
11099
11408
  } catch (e) {
11100
11409
  logger.warn(`Failed to cache ${fileName}:`, e);
11101
11410
  }
@@ -12947,9 +13256,23 @@ var Tensor2 = class _Tensor {
12947
13256
  throw Error(`Unsupported norm: ${p}`);
12948
13257
  }
12949
13258
  const this_data = this.data;
12950
- const fn2 = (a, b) => a + b ** p;
13259
+ const is_bigint = this_data instanceof BigInt64Array || this_data instanceof BigUint64Array;
13260
+ if (is_bigint && p !== 1) {
13261
+ throw Error(`Expected a floating point tensor as input. Got ${this.type}`);
13262
+ }
13263
+ let fn2, zero;
13264
+ if (is_bigint) {
13265
+ fn2 = (a, b) => a + b;
13266
+ zero = 0n;
13267
+ } else {
13268
+ fn2 = (a, b) => a + b ** p;
13269
+ zero = 0;
13270
+ }
12951
13271
  if (dim === null) {
12952
- const val = this_data.reduce(fn2, 0) ** (1 / p);
13272
+ let val = this_data.reduce(fn2, zero);
13273
+ if (p !== 1) {
13274
+ val = val ** (1 / p);
13275
+ }
12953
13276
  return new _Tensor(this.type, [val], []);
12954
13277
  }
12955
13278
  const [type, result, resultDims] = reduce_helper(fn2, this, dim, keepdim);
@@ -15409,9 +15732,12 @@ __export(processors_exports, {
15409
15732
  ChatterboxProcessor: () => ChatterboxProcessor,
15410
15733
  Florence2Processor: () => Florence2Processor,
15411
15734
  Gemma3nProcessor: () => Gemma3nProcessor,
15735
+ Glm46VProcessor: () => Glm46VProcessor,
15736
+ GraniteSpeechProcessor: () => GraniteSpeechProcessor,
15412
15737
  GroundingDinoProcessor: () => GroundingDinoProcessor,
15413
15738
  Idefics3Processor: () => Idefics3Processor,
15414
15739
  JinaCLIPProcessor: () => JinaCLIPProcessor,
15740
+ Lfm2VlProcessor: () => Lfm2VlProcessor,
15415
15741
  LlavaProcessor: () => LlavaProcessor,
15416
15742
  MgpstrProcessor: () => MgpstrProcessor,
15417
15743
  MoonshineProcessor: () => MoonshineProcessor,
@@ -15432,6 +15758,7 @@ __export(processors_exports, {
15432
15758
  UltravoxProcessor: () => UltravoxProcessor,
15433
15759
  VLChatProcessor: () => VLChatProcessor,
15434
15760
  VoxtralProcessor: () => VoxtralProcessor,
15761
+ VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
15435
15762
  Wav2Vec2Processor: () => Wav2Vec2Processor,
15436
15763
  Wav2Vec2ProcessorWithLM: () => Wav2Vec2ProcessorWithLM,
15437
15764
  WhisperProcessor: () => WhisperProcessor
@@ -15486,12 +15813,14 @@ __export(feature_extractors_exports, {
15486
15813
  EncodecFeatureExtractor: () => EncodecFeatureExtractor,
15487
15814
  FeatureExtractor: () => FeatureExtractor,
15488
15815
  Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
15816
+ GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
15489
15817
  MoonshineFeatureExtractor: () => MoonshineFeatureExtractor,
15490
15818
  ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
15491
15819
  PyAnnoteFeatureExtractor: () => PyAnnoteFeatureExtractor,
15492
15820
  SeamlessM4TFeatureExtractor: () => SeamlessM4TFeatureExtractor,
15493
15821
  SnacFeatureExtractor: () => SnacFeatureExtractor,
15494
15822
  SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
15823
+ VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
15495
15824
  Wav2Vec2FeatureExtractor: () => Wav2Vec2FeatureExtractor,
15496
15825
  WeSpeakerFeatureExtractor: () => WeSpeakerFeatureExtractor,
15497
15826
  WhisperFeatureExtractor: () => WhisperFeatureExtractor
@@ -15719,6 +16048,7 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
15719
16048
  mel_filters = null,
15720
16049
  mel_floor = 1e-10,
15721
16050
  log_mel = null,
16051
+ max_log_mel = null,
15722
16052
  reference = 1,
15723
16053
  min_value = 1e-10,
15724
16054
  db_range = null,
@@ -15858,6 +16188,17 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
15858
16188
  mel_spec_data[i] = Math.log10(mel_spec_data[i]);
15859
16189
  }
15860
16190
  break;
16191
+ case "log10_max_norm": {
16192
+ for (let i = 0; i < o; ++i) {
16193
+ mel_spec_data[i] = Math.log10(mel_spec_data[i]);
16194
+ }
16195
+ const logMax = max_log_mel ?? max(mel_spec_data)[0];
16196
+ const threshold = logMax - 8;
16197
+ for (let i = 0; i < o; ++i) {
16198
+ mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4) / 4;
16199
+ }
16200
+ break;
16201
+ }
15861
16202
  case "dB":
15862
16203
  if (power === 1) {
15863
16204
  amplitude_to_db(mel_spec_data, reference, min_value, db_range);
@@ -15868,7 +16209,9 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
15868
16209
  }
15869
16210
  break;
15870
16211
  default:
15871
- throw new Error(`log_mel must be one of null, 'log', 'log10' or 'dB'. Got '${log_mel}'`);
16212
+ throw new Error(
16213
+ `log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`
16214
+ );
15872
16215
  }
15873
16216
  }
15874
16217
  return mel_spec;
@@ -16373,6 +16716,56 @@ var Gemma3nAudioFeatureExtractor = class extends FeatureExtractor {
16373
16716
  }
16374
16717
  };
16375
16718
 
16719
+ // src/models/granite_speech/feature_extraction_granite_speech.js
16720
+ var GraniteSpeechFeatureExtractor = class extends FeatureExtractor {
16721
+ constructor(config) {
16722
+ super(config);
16723
+ const { n_fft, win_length, n_mels, sample_rate } = config.melspec_kwargs;
16724
+ this.mel_filters = mel_filter_bank(
16725
+ Math.floor(1 + n_fft / 2),
16726
+ // num_frequency_bins = 257
16727
+ n_mels,
16728
+ // 80
16729
+ 0,
16730
+ // min_frequency
16731
+ sample_rate / 2,
16732
+ // max_frequency = 8000
16733
+ sample_rate,
16734
+ // 16000
16735
+ null,
16736
+ // norm (torchaudio default: no norm)
16737
+ "htk"
16738
+ // mel_scale (torchaudio default)
16739
+ );
16740
+ const raw_window = window_function(win_length, "hann");
16741
+ this.window = new Float64Array(n_fft);
16742
+ const pad = Math.floor((n_fft - win_length) / 2);
16743
+ this.window.set(raw_window, pad);
16744
+ }
16745
+ /**
16746
+ * Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
16747
+ * @param {Float32Array|Float64Array} audio The audio waveform.
16748
+ * @returns {Promise<{input_features: Tensor}>}
16749
+ */
16750
+ async _call(audio) {
16751
+ validate_audio_inputs(audio, "GraniteSpeechFeatureExtractor");
16752
+ const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;
16753
+ const num_frames = 1 + Math.floor((audio.length - 1) / hop_length);
16754
+ const max_num_frames = num_frames - num_frames % 2;
16755
+ const mel = await spectrogram(audio, this.window, n_fft, hop_length, {
16756
+ power: 2,
16757
+ mel_filters: this.mel_filters,
16758
+ log_mel: "log10_max_norm",
16759
+ transpose: true,
16760
+ // [time, n_mels]
16761
+ max_num_frames,
16762
+ do_pad: false
16763
+ });
16764
+ const input_features = mel.view(-1, 2 * n_mels).unsqueeze_(0);
16765
+ return { input_features };
16766
+ }
16767
+ };
16768
+
16376
16769
  // src/models/moonshine/feature_extraction_moonshine.js
16377
16770
  var MoonshineFeatureExtractor = class extends FeatureExtractor {
16378
16771
  /**
@@ -16853,6 +17246,71 @@ var WeSpeakerFeatureExtractor = class extends FeatureExtractor {
16853
17246
  }
16854
17247
  };
16855
17248
 
17249
+ // src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js
17250
+ var VoxtralRealtimeFeatureExtractor = class extends FeatureExtractor {
17251
+ constructor(config) {
17252
+ super(config);
17253
+ this.config.mel_filters ??= mel_filter_bank(
17254
+ Math.floor(1 + this.config.n_fft / 2),
17255
+ // num_frequency_bins
17256
+ this.config.feature_size,
17257
+ // num_mel_filters
17258
+ 0,
17259
+ // min_frequency
17260
+ 8e3,
17261
+ // max_frequency
17262
+ this.config.sampling_rate,
17263
+ // sampling_rate
17264
+ "slaney",
17265
+ // norm
17266
+ "slaney"
17267
+ // mel_scale
17268
+ );
17269
+ this.window = window_function(this.config.n_fft, "hann");
17270
+ }
17271
+ /**
17272
+ * Computes the log-Mel spectrogram of the provided audio waveform.
17273
+ * @param {Float32Array|Float64Array} waveform The audio waveform to process.
17274
+ * @param {Object} [options]
17275
+ * @param {boolean} [options.center=true] Whether to center-pad the waveform for STFT.
17276
+ * @returns {Promise<import('../../utils/tensor.js').Tensor>} The log-Mel spectrogram tensor of shape [num_mel_bins, num_frames].
17277
+ */
17278
+ async _extract_fbank_features(waveform, { center = true } = {}) {
17279
+ const { n_fft, hop_length, mel_filters, global_log_mel_max } = this.config;
17280
+ const max_num_frames = center ? Math.floor(waveform.length / hop_length) : Math.floor((waveform.length - n_fft) / hop_length);
17281
+ return await spectrogram(
17282
+ waveform,
17283
+ this.window,
17284
+ n_fft,
17285
+ // frame_length
17286
+ hop_length,
17287
+ {
17288
+ power: 2,
17289
+ mel_filters,
17290
+ log_mel: "log10_max_norm",
17291
+ max_log_mel: global_log_mel_max,
17292
+ center,
17293
+ max_num_frames,
17294
+ do_pad: false
17295
+ }
17296
+ );
17297
+ }
17298
+ /**
17299
+ * Extract mel spectrogram features from audio.
17300
+ * @param {Float32Array|Float64Array} audio The audio data.
17301
+ * @param {Object} [options]
17302
+ * @param {boolean} [options.center=true] Whether to center-pad the waveform.
17303
+ * @returns {Promise<{ input_features: import('../../utils/tensor.js').Tensor }>}
17304
+ */
17305
+ async _call(audio, { center = true } = {}) {
17306
+ validate_audio_inputs(audio, "VoxtralRealtimeFeatureExtractor");
17307
+ const features = await this._extract_fbank_features(audio, { center });
17308
+ return {
17309
+ input_features: features.unsqueeze_(0)
17310
+ };
17311
+ }
17312
+ };
17313
+
16856
17314
  // src/models/whisper/feature_extraction_whisper.js
16857
17315
  var WhisperFeatureExtractor = class extends FeatureExtractor {
16858
17316
  constructor(config) {
@@ -16881,7 +17339,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
16881
17339
  * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
16882
17340
  */
16883
17341
  async _extract_fbank_features(waveform) {
16884
- const features = await spectrogram(
17342
+ return await spectrogram(
16885
17343
  waveform,
16886
17344
  this.window,
16887
17345
  // window
@@ -16892,7 +17350,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
16892
17350
  {
16893
17351
  power: 2,
16894
17352
  mel_filters: this.config.mel_filters,
16895
- log_mel: "log10",
17353
+ log_mel: "log10_max_norm",
16896
17354
  // Custom
16897
17355
  max_num_frames: Math.min(
16898
17356
  Math.floor(waveform.length / this.config.hop_length),
@@ -16901,15 +17359,6 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
16901
17359
  )
16902
17360
  }
16903
17361
  );
16904
- const data = features.data;
16905
- const maxValue = max(
16906
- /** @type {Float32Array} */
16907
- data
16908
- )[0];
16909
- for (let i = 0; i < data.length; ++i) {
16910
- data[i] = (Math.max(data[i], maxValue - 8) + 4) / 4;
16911
- }
16912
- return features;
16913
17362
  }
16914
17363
  /**
16915
17364
  * Asynchronously extracts features from a given audio using the provided configuration.
@@ -17788,6 +18237,30 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
17788
18237
  }
17789
18238
  return [segmentation, segments];
17790
18239
  }
18240
+ function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
18241
+ if (height < factor || width < factor) {
18242
+ const scale = Math.max(factor / height, factor / width);
18243
+ height = Math.round(height * scale);
18244
+ width = Math.round(width * scale);
18245
+ }
18246
+ if (Math.max(height, width) / Math.min(height, width) > 200) {
18247
+ throw new Error(
18248
+ `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
18249
+ );
18250
+ }
18251
+ let h_bar = Math.round(height / factor) * factor;
18252
+ let w_bar = Math.round(width / factor) * factor;
18253
+ if (temporal_factor * h_bar * w_bar > max_pixels) {
18254
+ const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
18255
+ h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
18256
+ w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
18257
+ } else if (temporal_factor * h_bar * w_bar < min_pixels) {
18258
+ const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
18259
+ h_bar = Math.ceil(height * beta / factor) * factor;
18260
+ w_bar = Math.ceil(width * beta / factor) * factor;
18261
+ }
18262
+ return [w_bar, h_bar];
18263
+ }
17791
18264
  function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
17792
18265
  if (label_ids_to_fuse === null) {
17793
18266
  logger.warn("`label_ids_to_fuse` unset. No instance will be fused.");
@@ -17865,7 +18338,7 @@ var ImageProcessor = class extends Callable2 {
17865
18338
  this.do_pad = config.do_pad;
17866
18339
  this.min_pixels = config.min_pixels;
17867
18340
  this.max_pixels = config.max_pixels;
17868
- if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
18341
+ if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
17869
18342
  this.pad_size = this.size;
17870
18343
  }
17871
18344
  this.do_flip_channel_order = config.do_flip_channel_order ?? false;
@@ -18076,7 +18549,7 @@ var ImageProcessor = class extends Callable2 {
18076
18549
  });
18077
18550
  }
18078
18551
  /**
18079
- * @typedef {object} PreprocessedImage
18552
+ * @typedef {Object} PreprocessedImage
18080
18553
  * @property {HeightWidth} original_size The original size of the image.
18081
18554
  * @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
18082
18555
  * @property {Tensor} pixel_values The pixel values of the preprocessed image.
@@ -18153,10 +18626,8 @@ var ImageProcessor = class extends Callable2 {
18153
18626
  const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
18154
18627
  [pixelData, imgDims] = padded;
18155
18628
  } else if (this.size_divisibility) {
18156
- const [paddedWidth, paddedHeight] = enforce_size_divisibility(
18157
- [imgDims[1], imgDims[0]],
18158
- this.size_divisibility
18159
- );
18629
+ const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
18630
+ const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
18160
18631
  [pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
18161
18632
  }
18162
18633
  }
@@ -18233,6 +18704,7 @@ var image_processors_exports = {};
18233
18704
  __export(image_processors_exports, {
18234
18705
  BeitFeatureExtractor: () => BeitFeatureExtractor,
18235
18706
  BitImageProcessor: () => BitImageProcessor,
18707
+ CHMv2ImageProcessor: () => CHMv2ImageProcessor,
18236
18708
  CLIPFeatureExtractor: () => CLIPFeatureExtractor,
18237
18709
  CLIPImageProcessor: () => CLIPImageProcessor,
18238
18710
  ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
@@ -18249,11 +18721,13 @@ __export(image_processors_exports, {
18249
18721
  DonutImageProcessor: () => DonutImageProcessor,
18250
18722
  EfficientNetImageProcessor: () => EfficientNetImageProcessor,
18251
18723
  GLPNFeatureExtractor: () => GLPNFeatureExtractor,
18724
+ Glm46VImageProcessor: () => Glm46VImageProcessor,
18252
18725
  GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
18253
18726
  Idefics3ImageProcessor: () => Idefics3ImageProcessor,
18254
18727
  ImageFeatureExtractor: () => ImageProcessor,
18255
18728
  ImageProcessor: () => ImageProcessor,
18256
18729
  JinaCLIPImageProcessor: () => JinaCLIPImageProcessor,
18730
+ Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
18257
18731
  LlavaOnevisionImageProcessor: () => LlavaOnevisionImageProcessor,
18258
18732
  Mask2FormerImageProcessor: () => Mask2FormerImageProcessor,
18259
18733
  MaskFormerFeatureExtractor: () => MaskFormerFeatureExtractor,
@@ -18308,6 +18782,10 @@ var BitImageProcessor = class extends ImageProcessor {
18308
18782
  var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
18309
18783
  };
18310
18784
 
18785
+ // src/models/chmv2/image_processing_chmv2.js
18786
+ var CHMv2ImageProcessor = class extends ImageProcessor {
18787
+ };
18788
+
18311
18789
  // src/models/clip/image_processing_clip.js
18312
18790
  var CLIPImageProcessor = class extends ImageProcessor {
18313
18791
  };
@@ -18427,32 +18905,91 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
18427
18905
  }
18428
18906
  };
18429
18907
 
18430
- // src/models/glpn/image_processing_glpn.js
18431
- var GLPNFeatureExtractor = class extends ImageProcessor {
18432
- };
18433
-
18434
- // src/models/grounding_dino/image_processing_grounding_dino.js
18435
- var GroundingDinoImageProcessor = class extends ImageProcessor {
18436
- /**
18437
- * Calls the feature extraction process on an array of images, preprocesses
18438
- * each image, and concatenates the resulting features into a single Tensor.
18439
- * @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from.
18440
- * @returns {Promise<GroundingDinoFeatureExtractorResult>} An object containing the concatenated pixel values of the preprocessed images.
18441
- */
18442
- async _call(images) {
18443
- const result = await super._call(images);
18444
- const dims = result.pixel_values.dims;
18445
- const pixel_mask = ones([dims[0], dims[2], dims[3]]);
18446
- return { ...result, pixel_mask };
18447
- }
18448
- };
18449
-
18450
- // src/models/idefics3/image_processing_idefics3.js
18451
- var Idefics3ImageProcessor = class extends ImageProcessor {
18908
+ // src/models/qwen2_vl/image_processing_qwen2_vl.js
18909
/**
 * Image processor that resizes images with `smart_resize` and then flattens the
 * resulting pixel tensor into a 2D matrix of patches plus a `[t, h, w]` grid descriptor.
 */
var Qwen2VLImageProcessor = class extends ImageProcessor {
  /**
   * @param {Record<string, any>} config Preprocessor configuration. `min_pixels` / `max_pixels`
   * fall back to `size.shortest_edge` / `size.longest_edge` when they are not set directly.
   */
  constructor(config) {
    super(config);
    const { size } = config;
    this.min_pixels = config.min_pixels ?? size?.shortest_edge;
    this.max_pixels = config.max_pixels ?? size?.longest_edge;
    this.patch_size = config.patch_size;
    this.merge_size = config.merge_size;
  }

  /** @type {ImageProcessor['get_resize_output_image_size']} */
  get_resize_output_image_size(image, size) {
    const { height, width } = image;
    // Both output sides are constrained to multiples of one merged patch block.
    const block = this.patch_size * this.merge_size;
    return smart_resize(height, width, block, this.min_pixels, this.max_pixels);
  }

  /**
   * Runs the base preprocessing and rearranges the pixel tensor into flattened patches.
   * @param {any} images Forwarded unchanged to `ImageProcessor._call`.
   * @param {...any} args Extra arguments forwarded to `ImageProcessor._call`.
   * @returns {Promise<Object>} `pixel_values` (flattened patches), `image_grid_thw`,
   * and the `original_sizes` / `reshaped_input_sizes` reported by the base class.
   */
  async _call(images, ...args) {
    const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
    const { temporal_patch_size, merge_size, patch_size } = this.config;

    // A batch of exactly one entry is repeated so that it fills one temporal patch.
    const frames =
      pixel_values.dims[0] === 1
        ? cat(
            Array.from({ length: temporal_patch_size }, () => pixel_values),
            0
          )
        : pixel_values;

    const [num_frames, channel, height, width] = frames.dims;
    const grid_t = num_frames / temporal_patch_size;
    const grid_h = Math.floor(height / patch_size);
    const grid_w = Math.floor(width / patch_size);

    // Split every axis into (block index, offset inside block) pairs ...
    const blocked = frames.view(
      grid_t,
      temporal_patch_size,
      channel,
      Math.floor(grid_h / merge_size),
      merge_size,
      patch_size,
      Math.floor(grid_w / merge_size),
      merge_size,
      patch_size
    );
    // ... move the block indices to the front and the per-patch contents to the back ...
    const reordered = blocked.permute(0, 3, 6, 4, 7, 2, 1, 5, 8);
    // ... and collapse to one row per patch.
    const flatten_patches = reordered.view(
      grid_t * grid_h * grid_w,
      channel * temporal_patch_size * patch_size * patch_size
    );

    const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
    return {
      pixel_values: flatten_patches,
      image_grid_thw,
      original_sizes,
      reshaped_input_sizes
    };
  }
};
18956
+
18957
+ // src/models/glm46v/image_processing_glm46v.js
18958
/**
 * Variant of `Qwen2VLImageProcessor` whose resize step additionally passes a
 * temporal factor to `smart_resize`.
 */
var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
  /** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
  get_resize_output_image_size(image, size) {
    const { height, width } = image;
    const spatial_factor = this.patch_size * this.merge_size;
    // Falls back to 2 when the config does not define `temporal_patch_size`.
    const temporal_factor = this.config.temporal_patch_size ?? 2;
    return smart_resize(height, width, spatial_factor, this.min_pixels, this.max_pixels, temporal_factor);
  }
};
18966
+
18967
+ // src/models/glpn/image_processing_glpn.js
18968
/**
 * Feature extractor registered under the GLPN name.
 * It adds nothing of its own: every behavior is inherited from `ImageProcessor`.
 */
var GLPNFeatureExtractor = class extends ImageProcessor {};
18970
+
18971
+ // src/models/grounding_dino/image_processing_grounding_dino.js
18972
var GroundingDinoImageProcessor = class extends ImageProcessor {
  /**
   * Preprocesses the given image(s) with the base processor and attaches an
   * all-ones `pixel_mask` built from the first, third and fourth dimensions of `pixel_values`.
   * @param {import('../../utils/image.js').RawImage[]} images The image(s) to extract features from.
   * @returns {Promise<GroundingDinoFeatureExtractorResult>} The base result extended with `pixel_mask`.
   */
  async _call(images) {
    const features = await super._call(images);
    const [batch_size, , height, width] = features.pixel_values.dims;
    return {
      ...features,
      pixel_mask: ones([batch_size, height, width])
    };
  }
};
18986
+
18987
+ // src/models/idefics3/image_processing_idefics3.js
18988
+ var Idefics3ImageProcessor = class extends ImageProcessor {
18989
+ constructor(config) {
18990
+ super(config);
18991
+ this.do_image_splitting = config.do_image_splitting ?? true;
18992
+ this.max_image_size = config.max_image_size;
18456
18993
  }
18457
18994
  /**
18458
18995
  * @typedef {import('../../utils/image.js').RawImage} RawImage
@@ -18657,6 +19194,237 @@ var JinaCLIPImageProcessor = class extends ImageProcessor {
18657
19194
  }
18658
19195
  };
18659
19196
 
19197
+ // src/models/lfm2_vl/image_processing_lfm2_vl.js
19198
/**
 * Rounds `number` to the nearest multiple of `factor` (ties follow `Math.round`).
 * @param {number} number Value to round.
 * @param {number} factor Step the result must be a multiple of.
 * @returns {number} The closest multiple of `factor`.
 */
function round_by_factor(number, factor) {
  const multiples = Math.round(number / factor);
  return multiples * factor;
}
19201
/**
 * Picks the grid ratio whose width/height quotient is closest to `aspect_ratio`.
 * On an exact tie, the later candidate wins only if the image area exceeds half the
 * area that candidate's grid of `image_size`-sided tiles would cover.
 * @param {number} aspect_ratio Width divided by height of the source image.
 * @param {number[][]} target_ratios Candidate `[width_tiles, height_tiles]` pairs.
 * @param {number} width Source image width.
 * @param {number} height Source image height.
 * @param {number} image_size Side length of one tile.
 * @returns {number[]} The chosen entry of `target_ratios`, or `[1, 1]` when none qualifies.
 */
function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
  const image_area = width * height;
  let winner = [1, 1];
  let smallest_gap = Infinity;
  for (const candidate of target_ratios) {
    const [tiles_wide, tiles_high] = candidate;
    const gap = Math.abs(aspect_ratio - tiles_wide / tiles_high);
    if (gap < smallest_gap) {
      smallest_gap = gap;
      winner = candidate;
      continue;
    }
    const is_tie = gap === smallest_gap;
    if (is_tie && image_area > 0.5 * image_size * image_size * tiles_wide * tiles_high) {
      winner = candidate;
    }
  }
  return winner;
}
19216
/**
 * Enumerates every `[w, h]` grid whose tile count `w * h` lies in
 * `[min_tiles, max_tiles]`, without duplicates, ordered by ascending tile count.
 * Grids with the same tile count keep their discovery order.
 * @param {number} min_tiles Smallest allowed number of tiles.
 * @param {number} max_tiles Largest allowed number of tiles.
 * @returns {number[][]} The sorted list of `[w, h]` pairs.
 */
function get_target_ratios(min_tiles, max_tiles) {
  const visited = /* @__PURE__ */ new Set();
  const grids = [];
  const tile_count_ok = (count) => count >= min_tiles && count <= max_tiles;

  for (let limit = min_tiles; limit <= max_tiles; ++limit) {
    for (let cols = 1; cols <= limit; ++cols) {
      for (let rows = 1; rows <= limit; ++rows) {
        if (!tile_count_ok(cols * rows)) continue;
        // Pack both sides into a single integer so the Set can deduplicate pairs.
        const packed = (cols << 16) | rows;
        if (visited.has(packed)) continue;
        visited.add(packed);
        grids.push([cols, rows]);
      }
    }
  }

  const by_tile_count = (left, right) => left[0] * left[1] - right[0] * right[1];
  return grids.sort(by_tile_count);
}
19235
/**
 * Cuts a `[B, C, H, W]` tensor into non-overlapping `patch_size` x `patch_size` patches.
 * Patches are emitted row by row; inside a patch the values are ordered by
 * row, then column, then channel. Rows/columns that do not fill a whole patch are dropped.
 * @param {Tensor2} images Tensor whose `dims` are `[B, C, H, W]`.
 * @param {number} patch_size Side length of one square patch.
 * @returns {Tensor2} A float32 tensor of dims `[B, patch_count, patch_size * patch_size * C]`.
 */
function convert_image_to_patches(images, patch_size) {
  const [batch, channels, height, width] = images.dims;
  const patch_rows = Math.floor(height / patch_size);
  const patch_cols = Math.floor(width / patch_size);
  const values_per_patch = patch_size * patch_size * channels;
  const plane = height * width;
  const source = /** @type {Float32Array} */ (images.data);
  const flattened = new Float32Array(batch * patch_rows * patch_cols * values_per_patch);

  // Patches are produced in exactly the order they are stored, so one running cursor suffices.
  let cursor = 0;
  for (let b = 0; b < batch; ++b) {
    const image_base = b * channels * plane;
    for (let patch_y = 0; patch_y < patch_rows; ++patch_y) {
      const top = patch_y * patch_size;
      for (let patch_x = 0; patch_x < patch_cols; ++patch_x) {
        const left = patch_x * patch_size;
        for (let dy = 0; dy < patch_size; ++dy) {
          const line_start = (top + dy) * width + left;
          for (let dx = 0; dx < patch_size; ++dx) {
            const pixel = line_start + dx;
            for (let c = 0; c < channels; ++c) {
              flattened[cursor++] = source[image_base + c * plane + pixel];
            }
          }
        }
      }
    }
  }
  return new Tensor2("float32", flattened, [batch, patch_rows * patch_cols, values_per_patch]);
}
19265
/**
 * Zero-pads a patch tensor along its second axis up to `target_length` and builds
 * the matching validity mask (1 for real patches, 0 for padding).
 * When no padding is needed the input tensor itself is returned as `padded`.
 * @param {Tensor2} patches Tensor whose `dims` are `[*, length, dim]`; the padded copy
 * is always created with a leading dimension of 1.
 * @param {number} target_length Number of patch slots in the output.
 * @returns {{ padded: Tensor2, mask: Tensor2 }} The padded tensor and an int64 mask of dims `[target_length]`.
 */
function pad_along_first_dim(patches, target_length) {
  const [, current_length, feature_dim] = patches.dims;

  const mask_values = new BigInt64Array(target_length);
  mask_values.fill(1n, 0, current_length);

  if (!(current_length < target_length)) {
    return { padded: patches, mask: new Tensor2("int64", mask_values, [target_length]) };
  }

  // The fresh buffer is zero-initialised, so copying the real data leaves the tail as padding.
  const buffer = new Float32Array(target_length * feature_dim);
  buffer.set(/** @type {Float32Array} */ (patches.data));
  const padded = new Tensor2("float32", buffer, [1, target_length, feature_dim]);
  return { padded, mask: new Tensor2("int64", mask_values, [target_length]) };
}
19280
/**
 * Image processor that turns each image into one or more patch sequences:
 * either a single resized image, or a grid of `tile_size` tiles (optionally followed
 * by a thumbnail) when the image is considered too large.
 */
var Lfm2VlImageProcessor = class extends ImageProcessor {
  /**
   * @param {Record<string, any>} config Preprocessor configuration; every option
   * read below has a built-in fallback used when the config value is null/undefined.
   */
  constructor(config) {
    super(config);
    Object.assign(this, {
      downsample_factor: config.downsample_factor ?? 2,
      do_image_splitting: config.do_image_splitting ?? true,
      min_tiles: config.min_tiles ?? 2,
      max_tiles: config.max_tiles ?? 10,
      use_thumbnail: config.use_thumbnail ?? true,
      min_image_tokens: config.min_image_tokens ?? 64,
      max_image_tokens: config.max_image_tokens ?? 256,
      encoder_patch_size: config.encoder_patch_size ?? config.patch_size ?? 16,
      tile_size: config.tile_size ?? 512,
      max_pixels_tolerance: config.max_pixels_tolerance ?? 2,
      return_row_col_info: config.return_row_col_info ?? false
    });

    // Every patch sequence is padded to the larger of the thumbnail and tile budgets.
    const thumbnail_budget = this.max_image_tokens * this.downsample_factor ** 2;
    const tile_budget = this.do_image_splitting ? (this.tile_size / this.encoder_patch_size) ** 2 : 0;
    this.max_num_patches = Math.max(thumbnail_budget, tile_budget);
  }

  /**
   * Check if the image is too large to be processed as a single tile.
   * @param {number} height
   * @param {number} width
   * @returns {boolean}
   */
  _is_image_too_large(height, width) {
    const patch = this.encoder_patch_size;
    const total_factor = patch * this.downsample_factor;
    const rounded_height = Math.max(patch, round_by_factor(height, total_factor));
    const rounded_width = Math.max(patch, round_by_factor(width, total_factor));
    const pixel_budget = this.max_image_tokens * total_factor ** 2 * this.max_pixels_tolerance;
    return rounded_height * rounded_width > pixel_budget;
  }

  /**
   * Get the grid layout for tiling a large image.
   * @param {number} height
   * @param {number} width
   * @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
   */
  _get_grid_layout(height, width) {
    const candidates = get_target_ratios(this.min_tiles, this.max_tiles);
    const chosen = find_closest_aspect_ratio(width / height, candidates, width, height, this.tile_size);
    const [grid_width, grid_height] = chosen;
    return {
      grid_width,
      grid_height,
      target_width: this.tile_size * grid_width,
      target_height: this.tile_size * grid_height
    };
  }

  /** @param {RawImage|RawImage[]|RawImage[][]} images */
  // @ts-expect-error
  async _call(images, { return_row_col_info = null } = {}) {
    // Normalise the input to a list of samples, each holding a list of images.
    let batched_images;
    if (!Array.isArray(images)) {
      batched_images = [[images]];
    } else if (Array.isArray(images[0])) {
      batched_images = /** @type {RawImage[][]} */ (images);
    } else {
      batched_images = [/** @type {RawImage[]} */ (images)];
    }

    const pixel_value_chunks = [];
    const mask_chunks = [];
    const patch_grids = [];
    const row_counts = [];
    const col_counts = [];
    const resized_sizes = [];

    for (const sample of batched_images) {
      const preprocessed = await Promise.all(sample.map((image) => this.preprocess(image, { do_pad: false })));

      for (const { pixel_values } of preprocessed) {
        const [, height, width] = pixel_values.dims;
        const img = pixel_values.unsqueeze_(0);

        // Size of the single-image / thumbnail rendition, never smaller than one merged block.
        const total_factor = this.encoder_patch_size * this.downsample_factor;
        const block_area = total_factor ** 2;
        const [new_width, new_height] = smart_resize(
          Math.max(total_factor, height),
          Math.max(total_factor, width),
          total_factor,
          this.min_image_tokens * block_area,
          this.max_image_tokens * block_area
        ).map((side) => Math.max(total_factor, side));

        const is_large = this._is_image_too_large(height, width);
        // Splitting is pointless when the tile range only allows a single tile.
        const do_splitting = this.do_image_splitting && !(this.min_tiles === 1 && this.max_tiles === 1);

        let num_rows = 1;
        let num_cols = 1;
        const tiles = [];
        if (is_large && do_splitting) {
          const { grid_width, grid_height, target_width, target_height } = this._get_grid_layout(height, width);
          num_rows = grid_height;
          num_cols = grid_width;

          const resized = await interpolate_4d(img, {
            size: [target_height, target_width]
          });
          // Tiles are collected row by row, left to right.
          for (let row = 0; row < grid_height; ++row) {
            const top = row * this.tile_size;
            for (let col = 0; col < grid_width; ++col) {
              const left = col * this.tile_size;
              tiles.push(resized.slice(null, null, [top, top + this.tile_size], [left, left + this.tile_size]));
            }
          }
          if (this.use_thumbnail && grid_width * grid_height !== 1) {
            tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
          }
        } else {
          tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
        }

        for (const tile of tiles) {
          const [, , tile_height, tile_width] = tile.dims;
          const patches = convert_image_to_patches(tile, this.encoder_patch_size);
          const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
          pixel_value_chunks.push(padded);
          mask_chunks.push(mask);
          patch_grids.push([
            Math.floor(tile_height / this.encoder_patch_size),
            Math.floor(tile_width / this.encoder_patch_size)
          ]);
        }

        row_counts.push(num_rows);
        col_counts.push(num_cols);
        resized_sizes.push([new_height, new_width]);
      }
    }

    const spatial_shape_data = BigInt64Array.from(patch_grids.flat(), BigInt);
    const result = {
      pixel_values: cat(pixel_value_chunks, 0),
      pixel_attention_mask: stack(mask_chunks, 0),
      spatial_shapes: new Tensor2("int64", spatial_shape_data, [patch_grids.length, 2])
    };
    // The per-call option wins; the configured default applies only when it is null/undefined.
    if (return_row_col_info ?? this.return_row_col_info) {
      result.image_rows = row_counts;
      result.image_cols = col_counts;
      result.image_sizes = resized_sizes;
    }
    return result;
  }
};
19427
+
18660
19428
  // src/models/llava_onevision/image_processing_llava_onevision.js
18661
19429
  var LlavaOnevisionImageProcessor = class extends ImageProcessor {
18662
19430
  };
@@ -18879,76 +19647,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
18879
19647
  var PvtImageProcessor = class extends ImageProcessor {
18880
19648
  };
18881
19649
 
18882
- // src/models/qwen2_vl/image_processing_qwen2_vl.js
18883
- function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
18884
- if (height < factor || width < factor) {
18885
- throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
18886
- } else if (Math.max(height, width) / Math.min(height, width) > 200) {
18887
- throw new Error(
18888
- `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
18889
- );
18890
- }
18891
- let h_bar = Math.round(height / factor) * factor;
18892
- let w_bar = Math.round(width / factor) * factor;
18893
- if (h_bar * w_bar > max_pixels) {
18894
- const beta = Math.sqrt(height * width / max_pixels);
18895
- h_bar = Math.floor(height / beta / factor) * factor;
18896
- w_bar = Math.floor(width / beta / factor) * factor;
18897
- } else if (h_bar * w_bar < min_pixels) {
18898
- const beta = Math.sqrt(min_pixels / (height * width));
18899
- h_bar = Math.ceil(height * beta / factor) * factor;
18900
- w_bar = Math.ceil(width * beta / factor) * factor;
18901
- }
18902
- return [h_bar, w_bar];
18903
- }
18904
- var Qwen2VLImageProcessor = class extends ImageProcessor {
18905
- constructor(config) {
18906
- super(config);
18907
- this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
18908
- this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
18909
- this.patch_size = config.patch_size;
18910
- this.merge_size = config.merge_size;
18911
- }
18912
- /** @type {ImageProcessor['get_resize_output_image_size']} */
18913
- get_resize_output_image_size(image, size) {
18914
- const factor = this.patch_size * this.merge_size;
18915
- return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
18916
- }
18917
- async _call(images, ...args) {
18918
- const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
18919
- let patches = pixel_values;
18920
- const { temporal_patch_size, merge_size, patch_size } = this.config;
18921
- if (patches.dims[0] === 1) {
18922
- patches = cat(
18923
- Array.from({ length: temporal_patch_size }, () => patches),
18924
- 0
18925
- );
18926
- }
18927
- const grid_t = patches.dims[0] / temporal_patch_size;
18928
- const channel = patches.dims[1];
18929
- const grid_h = Math.floor(patches.dims[2] / patch_size);
18930
- const grid_w = Math.floor(patches.dims[3] / patch_size);
18931
- const flatten_patches = patches.view(
18932
- grid_t,
18933
- temporal_patch_size,
18934
- channel,
18935
- Math.floor(grid_h / merge_size),
18936
- merge_size,
18937
- patch_size,
18938
- Math.floor(grid_w / merge_size),
18939
- merge_size,
18940
- patch_size
18941
- ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
18942
- const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
18943
- return {
18944
- pixel_values: flatten_patches,
18945
- image_grid_thw,
18946
- original_sizes,
18947
- reshaped_input_sizes
18948
- };
18949
- }
18950
- };
18951
-
18952
19650
  // src/models/rt_detr/image_processing_rt_detr.js
18953
19651
  var RTDetrImageProcessor = class extends ImageProcessor {
18954
19652
  /** @type {typeof post_process_object_detection} */
@@ -19502,6 +20200,107 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
19502
20200
  }
19503
20201
  };
19504
20202
 
20203
+ // src/models/qwen2_vl/processing_qwen2_vl.js
20204
/**
 * Processor that pairs an image processor with a tokenizer and expands every
 * image token in the prompt to the number of merged patches of the matching image.
 */
var Qwen2VLProcessor = class extends Processor {
  static image_processor_class = AutoImageProcessor;
  static tokenizer_class = AutoTokenizer;
  /** Token in the prompt that marks where an image goes; subclasses may override it. */
  static image_token = "<|image_pad|>";

  /**
   * @param {string|string[]} text Prompt(s) containing one image token per image.
   * @param {RawImage|RawImage[]} images Image(s) referenced by the prompt(s); may be omitted.
   * @param {...any} args Accepted but not used by this implementation.
   * @returns {Promise<any>} Tokenizer outputs merged with the image processor outputs.
   */
  async _call(text, images = null, ...args) {
    let prompts = Array.isArray(text) ? text : [text];

    let image_inputs;
    let image_grid_thw;
    if (images) {
      image_inputs = await this.image_processor(images);
      image_grid_thw = image_inputs.image_grid_thw;
    }

    if (image_grid_thw) {
      const merge_length = this.image_processor.config.merge_size ** 2;
      const image_token = /** @type {typeof Qwen2VLProcessor} */ (this.constructor).image_token;
      const grids = image_grid_thw.tolist();
      // Shared across all prompts: images are consumed in order of appearance.
      let next_grid = 0;

      const expand = (prompt) => {
        let expanded = prompt;
        while (expanded.includes(image_token)) {
          const patch_count = Number(grids[next_grid++].reduce((a, b) => a * b, 1n));
          const repeats = Math.floor(patch_count / merge_length);
          // A temporary placeholder keeps already-expanded tokens out of the next search.
          expanded = expanded.replace(image_token, "<|placeholder|>".repeat(repeats));
        }
        return expanded.replaceAll("<|placeholder|>", image_token);
      };
      prompts = prompts.map((prompt) => expand(prompt));
    }

    const text_inputs = this.tokenizer(prompts);
    return {
      ...text_inputs,
      ...image_inputs
    };
  }
};
20247
+
20248
+ // src/models/glm46v/processing_glm46v.js
20249
/**
 * Same processing as `Qwen2VLProcessor`; only the image token searched for in the prompt differs.
 */
var Glm46VProcessor = class extends Qwen2VLProcessor {
  /** Token in the prompt that marks where an image goes. */
  static image_token = "<|image|>";
};
20252
+
20253
+ // src/models/granite_speech/processing_granite_speech.js
20254
/**
 * Processor that combines a tokenizer with an audio feature extractor and expands
 * the audio token in the prompt to one token per projected audio feature.
 */
var GraniteSpeechProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = true;

  /**
   * Compute the number of audio tokens for a given raw audio length.
   * @param {number} audioLength Raw audio sample count.
   * @returns {number} Number of projector output tokens.
   */
  _get_num_audio_features(audioLength) {
    const extractor_config = this.feature_extractor.config;
    const { hop_length } = extractor_config.melspec_kwargs;
    const { projector_window_size, projector_downsample_rate } = extractor_config;

    const tokens_per_block = Math.floor(projector_window_size / projector_downsample_rate);
    const mel_length = Math.floor(audioLength / hop_length) + 1;
    const encoder_length = Math.floor(mel_length / 2);
    // A partially filled trailing window still yields a full block of tokens.
    const block_count = Math.ceil(encoder_length / projector_window_size);
    return block_count * tokens_per_block;
  }

  /**
   * @param {string} text The text input to process.
   * @param {Float32Array} audio The audio input to process.
   * @param {Record<string, any>} [kwargs] Extra tokenizer options; they override `add_special_tokens: false`.
   * @returns {Promise<Object>} Tokenizer outputs plus, when audio is given,
   * `input_features` and `input_features_mask`.
   */
  async _call(text, audio = null, kwargs = {}) {
    if (Array.isArray(text)) {
      throw new Error("Batched inputs are not supported yet.");
    }

    const audio_inputs = {};
    let prompt = text;
    if (audio) {
      const { input_features } = await this.feature_extractor(audio);
      audio_inputs.input_features = input_features;

      const audio_embed_size = this._get_num_audio_features(audio.length);
      const all_valid = new Uint8Array(audio_embed_size).fill(1);
      audio_inputs.input_features_mask = new Tensor2("bool", all_valid, [1, audio_embed_size]);

      const audio_token = this.config.audio_token ?? "<|audio|>";
      if (!prompt.includes(audio_token)) {
        throw new Error(`The input text does not contain the audio token ${audio_token}.`);
      }
      prompt = prompt.replaceAll(audio_token, audio_token.repeat(audio_embed_size));
    }

    const text_inputs = this.tokenizer(prompt, {
      add_special_tokens: false,
      ...kwargs
    });
    return {
      ...text_inputs,
      ...audio_inputs
    };
  }
};
20303
+
19505
20304
  // src/models/grounding_dino/processing_grounding_dino.js
19506
20305
  function get_phrases_from_posmap(posmaps, input_ids) {
19507
20306
  const left_idx = 0;
@@ -19778,7 +20577,67 @@ var JinaCLIPProcessor = class extends Processor {
19778
20577
  }
19779
20578
  };
19780
20579
 
19781
- // src/models/llava/processing_llava.js
20580
+ // src/models/lfm2_vl/processing_lfm2_vl.js
20581
/**
 * Processor that runs the image processor and, when text is given, replaces every
 * image token in the prompt with the start/end markers, optional per-tile position
 * markers and the number of image tokens the corresponding image produces.
 */
var Lfm2VlProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static image_processor_class = AutoImageProcessor;

  /**
   * @param {RawImage|RawImage[]} images
   * @param {string|string[]|null} [text]
   * @param {Record<string, any>} [kwargs] Forwarded to both the image processor and the tokenizer.
   * @returns {Promise<Object>} Image processor outputs (without the row/col/size bookkeeping)
   * merged with the tokenizer outputs when `text` is provided.
   */
  async _call(images, text = null, kwargs = {}) {
    const { image_rows, image_cols, image_sizes, ...image_inputs } = await this.image_processor(images, {
      ...kwargs,
      return_row_col_info: true
    });

    if (text) {
      const image_token = this.config.image_token ?? "<image>";
      const image_config = /** @type {Record<string, any>} */ (this.image_processor.config);

      // Resolve the settings exactly like Lfm2VlImageProcessor's constructor does
      // (`??` so that explicit nulls fall back too, and `patch_size` as a secondary
      // source for the encoder patch size). Otherwise the number of image tokens
      // written into the prompt can disagree with the patches actually produced.
      const tile_size = image_config.tile_size ?? 512;
      const downsample_factor = image_config.downsample_factor ?? 2;
      const encoder_patch_size = image_config.encoder_patch_size ?? image_config.patch_size ?? 16;
      const use_thumbnail = image_config.use_thumbnail ?? true;

      // Number of image tokens along one side of length `s` after patching and downsampling.
      const ds2 = (s) => Math.ceil(Math.floor(s / encoder_patch_size) / downsample_factor);
      const tokens_per_tile = ds2(tile_size) ** 2;
      const image_start = this.config.image_start_token ?? "<|image_start|>";
      const image_end = this.config.image_end_token ?? "<|image_end|>";
      const thumbnail_token = this.config.image_thumbnail ?? "<|img_thumbnail|>";

      if (!Array.isArray(text)) text = [text];

      // Images are consumed in order of appearance across all prompts.
      let image_idx = 0;
      text = text.map((sample) => {
        const parts = sample.split(image_token);
        return (
          parts[0] +
          parts
            .slice(1)
            .map((part) => {
              const idx = image_idx++;
              const [h, w] = image_sizes[idx];
              const rows = image_rows[idx];
              const cols = image_cols[idx];
              const tokens_for_image = ds2(h) * ds2(w);

              let expanded = image_start;
              if (rows > 1 || cols > 1) {
                // Tiled image: one position marker plus a full tile of tokens per grid cell.
                const tile_str = image_token.repeat(tokens_per_tile);
                for (let r = 0; r < rows; ++r) {
                  for (let c = 0; c < cols; ++c) {
                    expanded += `<|img_row_${r + 1}_col_${c + 1}|>` + tile_str;
                  }
                }
                if (use_thumbnail) expanded += thumbnail_token + image_token.repeat(tokens_for_image);
              } else {
                expanded += image_token.repeat(tokens_for_image);
              }
              return expanded + image_end + part;
            })
            .join("")
        );
      });
    }

    return {
      ...image_inputs,
      ...(text ? this.tokenizer(text, kwargs) : {})
    };
  }
};
20639
+
20640
+ // src/models/llava/processing_llava.js
19782
20641
  var LlavaProcessor = class extends Processor {
19783
20642
  static tokenizer_class = AutoTokenizer;
19784
20643
  static image_processor_class = AutoImageProcessor;
@@ -20121,47 +20980,6 @@ var PyAnnoteProcessor = class extends Processor {
20121
20980
  }
20122
20981
  };
20123
20982
 
20124
- // src/models/qwen2_vl/processing_qwen2_vl.js
20125
- var Qwen2VLProcessor = class extends Processor {
20126
- static image_processor_class = AutoImageProcessor;
20127
- static tokenizer_class = AutoTokenizer;
20128
- /**
20129
- *
20130
- * @param {string|string[]} text
20131
- * @param {RawImage|RawImage[]} images
20132
- * @param {...any} args
20133
- * @returns {Promise<any>}
20134
- */
20135
- async _call(text, images = null, ...args) {
20136
- if (!Array.isArray(text)) {
20137
- text = [text];
20138
- }
20139
- let image_inputs, image_grid_thw;
20140
- if (images) {
20141
- image_inputs = await this.image_processor(images);
20142
- image_grid_thw = image_inputs.image_grid_thw;
20143
- }
20144
- if (image_grid_thw) {
20145
- let merge_length = this.image_processor.config.merge_size ** 2;
20146
- let index = 0;
20147
- const image_grid_thw_list = image_grid_thw.tolist();
20148
- text = text.map((t) => {
20149
- while (t.includes("<|image_pad|>")) {
20150
- const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
20151
- t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
20152
- }
20153
- return t.replaceAll("<|placeholder|>", "<|image_pad|>");
20154
- });
20155
- }
20156
- const text_inputs = this.tokenizer(text);
20157
- return {
20158
- ...text_inputs,
20159
- ...image_inputs
20160
- // TODO: ...videos_inputs,
20161
- };
20162
- }
20163
- };
20164
-
20165
20983
  // src/models/qwen2_5_vl/processing_qwen2_5_vl.js
20166
20984
  var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
20167
20985
  };
@@ -20310,6 +21128,94 @@ var VoxtralProcessor = class extends Processor {
20310
21128
  }
20311
21129
  };
20312
21130
 
21131
+ // src/models/voxtral_realtime/processing_voxtral_realtime.js
21132
// Number of tokens' worth of silence prepended to the first streaming chunk.
var NUM_LEFT_PAD_TOKENS = 32;
// Extra pad tokens appended to the initial streaming prompt (presumably the model's output delay).
var NUM_DELAY_TOKENS = 6;
// Mel frames that correspond to one text token.
var AUDIO_LENGTH_PER_TOK = 8;
// Additional right-padding tokens used in non-streaming mode.
var OFFLINE_STREAMING_BUFFER_TOKENS = 10;
// Token id used to fill the initial streaming `input_ids` after the first position.
var STREAMING_PAD_TOKEN_ID = 32;

var VoxtralRealtimeProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = false;

  /** Number of mel frames in the first audio chunk. */
  get num_mel_frames_first_audio_chunk() {
    const first_chunk_tokens = NUM_DELAY_TOKENS + 1;
    return first_chunk_tokens * AUDIO_LENGTH_PER_TOK;
  }

  /** Number of raw audio samples in the first audio chunk. */
  get num_samples_first_audio_chunk() {
    const { hop_length, n_fft } = this.feature_extractor.config;
    const hops = this.num_mel_frames_first_audio_chunk - 1;
    return hops * hop_length + Math.floor(n_fft / 2);
  }

  /** Number of raw audio samples per subsequent audio chunk. */
  get num_samples_per_audio_chunk() {
    const { hop_length, n_fft } = this.feature_extractor.config;
    return AUDIO_LENGTH_PER_TOK * hop_length + n_fft;
  }

  /** Number of right-pad tokens for non-streaming mode. */
  get num_right_pad_tokens() {
    return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
  }

  /** Number of mel frames per text token. */
  get audio_length_per_tok() {
    return AUDIO_LENGTH_PER_TOK;
  }

  /** Number of raw audio samples per token. */
  get raw_audio_length_per_tok() {
    const { hop_length } = this.feature_extractor.config;
    return AUDIO_LENGTH_PER_TOK * hop_length;
  }

  /**
   * Process audio input for VoxtralRealtime.
   *
   * - Streaming, first chunk: the audio is left-padded with silence, features are
   *   extracted with `center: true`, and `{ input_ids, ...features }` is returned.
   * - Streaming, later chunk: the audio is passed through with `center: false`
   *   and only the extracted features are returned.
   * - Non-streaming: the audio is right-padded with silence and processed with
   *   `center: true`; only the extracted features are returned.
   *
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @param {Object} [options]
   * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
   * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
   * @returns {Promise<Object>}
   * @throws {Error} If `is_streaming` is false while `is_first_audio_chunk` is false.
   */
  async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeProcessor");
    if (!(is_streaming || is_first_audio_chunk)) {
      throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");
    }

    // Follow-up chunk of a stream: no padding, no prompt tokens.
    if (!is_first_audio_chunk) {
      return await this.feature_extractor(audio, { center: false });
    }

    // Whole-file transcription: append trailing silence after the audio.
    if (!is_streaming) {
      const trailing_silence = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
      const padded_audio = new Float32Array(audio.length + trailing_silence);
      padded_audio.set(audio);
      return await this.feature_extractor(padded_audio, { center: true });
    }

    // First chunk of a stream: prepend silence and emit the initial prompt tokens.
    const leading_silence = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
    const padded_audio = new Float32Array(leading_silence + audio.length);
    padded_audio.set(audio, leading_silence);
    const audio_encoding = await this.feature_extractor(padded_audio, { center: true });

    const num_pad_tokens = NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
    const num_input_tokens = 1 + num_pad_tokens;
    const input_ids_data = new BigInt64Array(num_input_tokens).fill(BigInt(STREAMING_PAD_TOKEN_ID));
    // Position 0 holds token id 1; every other position holds the streaming pad id.
    input_ids_data[0] = 1n;
    const input_ids = new Tensor2("int64", input_ids_data, [1, num_input_tokens]);
    return {
      input_ids,
      ...audio_encoding
    };
  }
};
21218
+
20313
21219
  // src/models/wav2vec2/processing_wav2vec2.js
20314
21220
  var Wav2Vec2Processor = class extends Processor {
20315
21221
  static tokenizer_class = AutoTokenizer;
@@ -20409,11 +21315,16 @@ function getNormalizedConfig(config) {
20409
21315
  case "florence2":
20410
21316
  case "llava_onevision":
20411
21317
  case "idefics3":
21318
+ case "granite_speech":
20412
21319
  case "ultravox":
20413
21320
  case "voxtral":
21321
+ case "voxtral_realtime":
20414
21322
  case "smolvlm":
20415
21323
  case "gemma3n":
21324
+ case "lfm2_vl":
20416
21325
  case "chatterbox":
21326
+ case "lighton_ocr":
21327
+ case "glm_ocr":
20417
21328
  case "mistral3":
20418
21329
  case "qwen2_5_vl":
20419
21330
  case "qwen3_vl":
@@ -20467,10 +21378,13 @@ function getNormalizedConfig(config) {
20467
21378
  case "cohere":
20468
21379
  case "cohere2":
20469
21380
  case "mistral":
21381
+ case "voxtral_realtime_text":
21382
+ case "voxtral_realtime_encoder":
20470
21383
  case "starcoder2":
20471
21384
  case "qwen2":
20472
21385
  case "qwen2_moe":
20473
21386
  case "qwen2_vl":
21387
+ case "qwen2_vl_text":
20474
21388
  case "qwen2_5_vl_text":
20475
21389
  case "qwen3_moe":
20476
21390
  case "qwen3_vl_text":
@@ -20486,6 +21400,8 @@ function getNormalizedConfig(config) {
20486
21400
  mapping["dim_kv"] = "head_dim";
20487
21401
  break;
20488
21402
  case "qwen3":
21403
+ case "solar_open":
21404
+ case "glm_ocr_text":
20489
21405
  case "gemma":
20490
21406
  case "gemma2":
20491
21407
  case "vaultgemma":
@@ -20496,6 +21412,7 @@ function getNormalizedConfig(config) {
20496
21412
  case "ernie4_5":
20497
21413
  case "hunyuan_v1_dense":
20498
21414
  case "falcon_h1":
21415
+ case "nemotron_h":
20499
21416
  case "ministral":
20500
21417
  case "ministral3":
20501
21418
  mapping["num_heads"] = "num_key_value_heads";
@@ -20530,6 +21447,9 @@ function getNormalizedConfig(config) {
20530
21447
  mapping["num_attention_heads"] = "num_attention_heads";
20531
21448
  break;
20532
21449
  case "youtu":
21450
+ case "deepseek_v3":
21451
+ case "glm_moe_dsa":
21452
+ case "mistral4":
20533
21453
  mapping["num_heads"] = "num_key_value_heads";
20534
21454
  mapping["num_layers"] = "num_hidden_layers";
20535
21455
  mapping["dim_kv"] = "qk_head_dim";
@@ -20615,6 +21535,10 @@ function getNormalizedConfig(config) {
20615
21535
  return normalized_config;
20616
21536
  }
20617
21537
  function getCacheShapes(config, options) {
21538
+ if (!(config instanceof PretrainedConfig)) {
21539
+ config = new PretrainedConfig(config);
21540
+ }
21541
+ const batch_size = options?.batch_size ?? 1;
20618
21542
  if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
20619
21543
  const pkv_prefix = options?.prefix ?? "past_key_values";
20620
21544
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
@@ -20624,7 +21548,6 @@ function getCacheShapes(config, options) {
20624
21548
  config
20625
21549
  );
20626
21550
  const head_dim = hidden_size / num_attention_heads;
20627
- const batch_size = options?.batch_size ?? 1;
20628
21551
  for (let i = 0; i < layer_types.length; ++i) {
20629
21552
  if (layer_types[i] === "full_attention") {
20630
21553
  for (const kv of ["key", "value"]) {
@@ -20637,31 +21560,26 @@ function getCacheShapes(config, options) {
20637
21560
  }
20638
21561
  }
20639
21562
  return cache_values;
20640
- } else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
21563
+ } else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
20641
21564
  const pkv_prefix = options?.prefix ?? "past_key_values";
20642
21565
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
20643
- const cache_values = {};
20644
- const {
20645
- layer_types,
20646
- num_hidden_layers,
20647
- num_attention_heads,
20648
- num_key_value_heads,
20649
- hidden_size,
20650
- mamba_d_conv,
20651
- mamba_n_heads,
20652
- mamba_d_head,
20653
- mamba_d_state,
20654
- mamba_n_groups,
20655
- mamba_expand,
20656
- mamba_d_ssm
20657
- } = (
21566
+ const c = (
20658
21567
  /** @type {any} */
20659
21568
  config
20660
21569
  );
20661
- const head_dim = hidden_size / num_attention_heads;
20662
- const batch_size = options?.batch_size ?? 1;
20663
- const conv_d_inner = (mamba_d_ssm ?? mamba_expand * hidden_size) + 2 * mamba_n_groups * mamba_d_state;
20664
- for (let i = 0; i < num_hidden_layers; ++i) {
21570
+ const layer_types = c.layer_types ?? c.layers_block_type;
21571
+ const num_layers = c.num_hidden_layers ?? layer_types?.length;
21572
+ const num_key_value_heads = c.num_key_value_heads;
21573
+ const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
21574
+ const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
21575
+ const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
21576
+ const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
21577
+ const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
21578
+ const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
21579
+ const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
21580
+ const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
21581
+ const cache_values = {};
21582
+ for (let i = 0; i < num_layers; ++i) {
20665
21583
  if (!layer_types || layer_types[i] === "mamba") {
20666
21584
  cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
20667
21585
  cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
@@ -20695,7 +21613,6 @@ function getCacheShapes(config, options) {
20695
21613
  const key_dim = linear_key_head_dim * linear_num_key_heads;
20696
21614
  const value_dim = linear_value_head_dim * linear_num_value_heads;
20697
21615
  const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
20698
- const batch_size = options?.batch_size ?? 1;
20699
21616
  for (let i = 0; i < layer_types.length; ++i) {
20700
21617
  if (layer_types[i] === "full_attention") {
20701
21618
  for (const kv of ["key", "value"]) {
@@ -20721,12 +21638,16 @@ function getCacheShapes(config, options) {
20721
21638
  }
20722
21639
  }
20723
21640
  return cache_values;
20724
- } else if (["qwen3_5", "qwen3_5_moe"].includes(config.model_type)) {
20725
- return getCacheShapes(
20726
- /**@type {any} */
20727
- config.text_config,
20728
- options
20729
- );
21641
+ } else if (["lfm2_vl", "qwen3_5", "qwen3_5_moe", "voxtral_realtime"].includes(config.model_type)) {
21642
+ let subConfig;
21643
+ if (config.model_type === "voxtral_realtime" && options?.session_name === "audio_encoder") {
21644
+ subConfig = /** @type {any} */
21645
+ config.audio_config;
21646
+ } else {
21647
+ subConfig = /** @type {any} */
21648
+ config.text_config;
21649
+ }
21650
+ return getCacheShapes(subConfig, options);
20730
21651
  }
20731
21652
  return getKeyValueShapes(config, options);
20732
21653
  }
@@ -20892,7 +21813,7 @@ async function getModelDataFiles(pretrained_model_name_or_path, fileName, suffix
20892
21813
  }
20893
21814
 
20894
21815
  // src/models/session.js
20895
- async function getSession(pretrained_model_name_or_path, fileName, options, is_decoder = false) {
21816
+ async function getSession(pretrained_model_name_or_path, fileName, options, cache_config = false, session_name = void 0) {
20896
21817
  let custom_config = options.config?.["transformers.js_config"] ?? {};
20897
21818
  const selectedDevice = (
20898
21819
  /** @type {import("../utils/devices.js").DeviceType} */
@@ -20950,9 +21871,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
20950
21871
  if (externalData.length > 0 && !apis.IS_NODE_ENV) {
20951
21872
  session_options.externalData = externalData;
20952
21873
  }
20953
- if (is_decoder && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
21874
+ if (cache_config && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
20954
21875
  const shapes = getCacheShapes(options.config, {
20955
- prefix: "present"
21876
+ prefix: "present",
21877
+ session_name
20956
21878
  });
20957
21879
  if (Object.keys(shapes).length > 0 && !isONNXProxy()) {
20958
21880
  const preferredOutputLocation = {};
@@ -20970,15 +21892,17 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
20970
21892
  };
20971
21893
  return { buffer_or_path, session_options, session_config };
20972
21894
  }
20973
- async function constructSessions(pretrained_model_name_or_path, names, options, decoder_name = void 0) {
21895
+ async function constructSessions(pretrained_model_name_or_path, names, options, cache_sessions = void 0) {
20974
21896
  return Object.fromEntries(
20975
21897
  await Promise.all(
20976
21898
  Object.keys(names).map(async (name) => {
21899
+ const cache_config = cache_sessions?.[name] ?? false;
20977
21900
  const { buffer_or_path, session_options, session_config } = await getSession(
20978
21901
  pretrained_model_name_or_path,
20979
21902
  names[name],
20980
21903
  options,
20981
- name === decoder_name
21904
+ cache_config,
21905
+ name
20982
21906
  );
20983
21907
  const session = await createInferenceSession(buffer_or_path, session_options, session_config);
20984
21908
  return [name, session];
@@ -22278,19 +23202,71 @@ var BeamSearchSampler = class extends LogitsSampler {
22278
23202
  }
22279
23203
  };
22280
23204
 
23205
+ // src/cache_utils.js
23206
var _DynamicCache = class {
  /**
   * Create a DynamicCache, optionally pre-populated with entries.
   *
   * Only the own enumerable properties of `entries` are copied; anything inherited
   * through its prototype chain is ignored. Each entry is stored as a property of
   * the cache under the same name.
   *
   * @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
   * @throws {TypeError} If a key collides with an existing property or method of the
   * cache, or if a value is not a Tensor.
   */
  constructor(entries) {
    if (!entries) return;
    for (const [key, value] of Object.entries(entries)) {
      // `in` also sees prototype members, so method names such as "dispose" are rejected.
      if (key in this) {
        throw new TypeError(`Key "${key}" conflicts with an existing property on DynamicCache`);
      }
      if (!(value instanceof Tensor2)) {
        throw new TypeError(`Expected a Tensor for key "${key}", got ${typeof value}`);
      }
      this[key] = value;
    }
  }

  /**
   * Get the cached sequence length. This requires at least one attention cache entry
   * (a property whose name starts with `past_key_values.`) to be present; the length is
   * read from the second-to-last dimension of the first such entry.
   * @returns {number} The past sequence length.
   * @throws {Error} If the cache holds no `past_key_values.*` entry.
   */
  get_seq_length() {
    const self2 = /** @type {any} */ (this);
    // Own keys only: inherited enumerable properties must never be mistaken for cache entries.
    for (const name of Object.keys(self2)) {
      if (name.startsWith("past_key_values.")) {
        return self2[name].dims.at(-2);
      }
    }
    throw new Error("Unable to determine sequence length from the cache.");
  }

  /**
   * Dispose all contained tensors whose data resides on the GPU
   * (i.e. whose `location` is `"gpu-buffer"`); other tensors are left untouched.
   * @returns {Promise<void>} Promise that resolves when all GPU tensors are disposed.
   */
  async dispose() {
    const pending = [];
    for (const t of /** @type {Tensor[]} */ (Object.values(this))) {
      if (t.location === "gpu-buffer") {
        pending.push(t.dispose());
      }
    }
    await Promise.all(pending);
  }
};

// Public alias, re-typed so that consumers see the `DynamicCache` constructor signature.
var DynamicCache = /** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */ (
  /** @type {unknown} */ (_DynamicCache)
);
23264
+
22281
23265
  // src/models/modeling_utils.js
22282
23266
  var MODEL_MAPPING_NAMES = null;
22283
23267
  function registerTaskMappings(mappings) {
22284
23268
  MODEL_MAPPING_NAMES = mappings;
22285
23269
  }
22286
- function getPastLength(past_key_values) {
22287
- for (const name in past_key_values) {
22288
- if (name.startsWith("past_key_values.")) {
22289
- return past_key_values[name].dims.at(-2);
22290
- }
22291
- }
22292
- return Object.values(past_key_values)[0].dims.at(-2);
22293
- }
22294
23270
  function toI64Tensor(items) {
22295
23271
  if (items instanceof Tensor2) {
22296
23272
  return items;
@@ -22331,71 +23307,181 @@ var MODEL_TYPES = {
22331
23307
  AutoEncoder: 12,
22332
23308
  ImageAudioTextToText: 13,
22333
23309
  Supertonic: 14,
22334
- Chatterbox: 15
23310
+ Chatterbox: 15,
23311
+ MultimodalLanguageModelOnly: 16,
23312
+ VoxtralRealtime: 17
22335
23313
  };
22336
23314
  var MODEL_TYPE_CONFIG = {
22337
23315
  [MODEL_TYPES.DecoderOnly]: {
22338
23316
  can_generate: true,
22339
23317
  forward: decoder_forward,
22340
- prepare_inputs: decoder_prepare_inputs_for_generation
23318
+ prepare_inputs: decoder_prepare_inputs_for_generation,
23319
+ sessions: (config, options) => ({ model: options.model_file_name ?? "model" }),
23320
+ cache_sessions: { model: true },
23321
+ optional_configs: { generation_config: "generation_config.json" }
22341
23322
  },
22342
23323
  [MODEL_TYPES.DecoderOnlyWithoutHead]: {
22343
23324
  can_generate: false,
22344
23325
  forward: decoder_forward,
22345
- prepare_inputs: decoder_prepare_inputs_for_generation
23326
+ prepare_inputs: decoder_prepare_inputs_for_generation,
23327
+ sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
22346
23328
  },
22347
23329
  [MODEL_TYPES.Seq2Seq]: {
22348
23330
  can_generate: true,
22349
23331
  forward: seq2seq_forward,
22350
- prepare_inputs: encoder_decoder_prepare_inputs_for_generation
23332
+ prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
23333
+ sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
23334
+ cache_sessions: { decoder_model_merged: true },
23335
+ optional_configs: { generation_config: "generation_config.json" }
22351
23336
  },
22352
23337
  [MODEL_TYPES.Vision2Seq]: {
22353
23338
  can_generate: true,
22354
23339
  forward: seq2seq_forward,
22355
- prepare_inputs: encoder_decoder_prepare_inputs_for_generation
23340
+ prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
23341
+ sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
23342
+ cache_sessions: { decoder_model_merged: true },
23343
+ optional_configs: { generation_config: "generation_config.json" }
22356
23344
  },
22357
23345
  [MODEL_TYPES.Musicgen]: {
22358
23346
  can_generate: true,
22359
- forward: seq2seq_forward
23347
+ forward: seq2seq_forward,
23348
+ sessions: () => ({
23349
+ model: "text_encoder",
23350
+ decoder_model_merged: "decoder_model_merged",
23351
+ encodec_decode: "encodec_decode"
23352
+ }),
23353
+ cache_sessions: { decoder_model_merged: true },
23354
+ optional_configs: { generation_config: "generation_config.json" }
22360
23355
  },
22361
23356
  [MODEL_TYPES.EncoderDecoder]: {
22362
23357
  can_generate: false,
22363
- forward: seq2seq_forward
23358
+ forward: seq2seq_forward,
23359
+ sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
23360
+ cache_sessions: { decoder_model_merged: true }
23361
+ },
23362
+ [MODEL_TYPES.MaskGeneration]: {
23363
+ sessions: () => ({ model: "vision_encoder", prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder" })
22364
23364
  },
22365
23365
  [MODEL_TYPES.ImageTextToText]: {
22366
23366
  can_generate: true,
22367
23367
  forward: image_text_to_text_forward,
22368
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
23368
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
23369
+ sessions: (config) => {
23370
+ const s = {
23371
+ embed_tokens: "embed_tokens",
23372
+ vision_encoder: "vision_encoder",
23373
+ decoder_model_merged: "decoder_model_merged"
23374
+ };
23375
+ if (config.is_encoder_decoder) s["model"] = "encoder_model";
23376
+ return s;
23377
+ },
23378
+ cache_sessions: { decoder_model_merged: true },
23379
+ optional_configs: { generation_config: "generation_config.json" }
22369
23380
  },
22370
23381
  [MODEL_TYPES.AudioTextToText]: {
22371
23382
  can_generate: true,
22372
23383
  forward: audio_text_to_text_forward,
22373
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
23384
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
23385
+ sessions: () => ({
23386
+ embed_tokens: "embed_tokens",
23387
+ audio_encoder: "audio_encoder",
23388
+ decoder_model_merged: "decoder_model_merged"
23389
+ }),
23390
+ cache_sessions: { decoder_model_merged: true },
23391
+ optional_configs: { generation_config: "generation_config.json" }
22374
23392
  },
22375
- [MODEL_TYPES.Phi3V]: {
23393
+ [MODEL_TYPES.ImageAudioTextToText]: {
22376
23394
  can_generate: true,
22377
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
23395
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
23396
+ sessions: () => ({
23397
+ embed_tokens: "embed_tokens",
23398
+ audio_encoder: "audio_encoder",
23399
+ vision_encoder: "vision_encoder",
23400
+ decoder_model_merged: "decoder_model_merged"
23401
+ }),
23402
+ optional_configs: { generation_config: "generation_config.json" }
22378
23403
  },
22379
- [MODEL_TYPES.ImageAudioTextToText]: {
23404
+ [MODEL_TYPES.Phi3V]: {
22380
23405
  can_generate: true,
22381
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
23406
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
23407
+ sessions: () => ({
23408
+ prepare_inputs_embeds: "prepare_inputs_embeds",
23409
+ model: "model",
23410
+ vision_encoder: "vision_encoder"
23411
+ }),
23412
+ cache_sessions: { model: true },
23413
+ optional_configs: { generation_config: "generation_config.json" }
22382
23414
  },
22383
23415
  [MODEL_TYPES.MultiModality]: {
22384
- can_generate: true
23416
+ can_generate: true,
23417
+ sessions: () => ({
23418
+ prepare_inputs_embeds: "prepare_inputs_embeds",
23419
+ model: "language_model",
23420
+ lm_head: "lm_head",
23421
+ gen_head: "gen_head",
23422
+ gen_img_embeds: "gen_img_embeds",
23423
+ image_decode: "image_decode"
23424
+ }),
23425
+ cache_sessions: { model: true },
23426
+ optional_configs: { generation_config: "generation_config.json" }
22385
23427
  },
22386
23428
  [MODEL_TYPES.AutoEncoder]: {
22387
23429
  can_generate: false,
22388
- forward: auto_encoder_forward
23430
+ forward: auto_encoder_forward,
23431
+ sessions: () => ({ encoder_model: "encoder_model", decoder_model: "decoder_model" })
23432
+ },
23433
+ [MODEL_TYPES.Supertonic]: {
23434
+ sessions: () => ({
23435
+ text_encoder: "text_encoder",
23436
+ latent_denoiser: "latent_denoiser",
23437
+ voice_decoder: "voice_decoder"
23438
+ })
22389
23439
  },
22390
23440
  [MODEL_TYPES.Chatterbox]: {
22391
23441
  can_generate: true,
22392
- forward: encoder_forward
23442
+ forward: encoder_forward,
23443
+ sessions: () => ({
23444
+ embed_tokens: "embed_tokens",
23445
+ speech_encoder: "speech_encoder",
23446
+ model: "language_model",
23447
+ conditional_decoder: "conditional_decoder"
23448
+ }),
23449
+ cache_sessions: { model: true },
23450
+ optional_configs: { generation_config: "generation_config.json" }
23451
+ },
23452
+ [MODEL_TYPES.MultimodalLanguageModelOnly]: {
23453
+ can_generate: true,
23454
+ forward: image_text_to_text_forward,
23455
+ prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
23456
+ sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
23457
+ cache_sessions: { decoder_model_merged: true },
23458
+ optional_configs: { generation_config: "generation_config.json" }
23459
+ },
23460
+ [MODEL_TYPES.VoxtralRealtime]: {
23461
+ can_generate: true,
23462
+ prepare_inputs: decoder_prepare_inputs_for_generation,
23463
+ sessions: () => ({
23464
+ embed_tokens: "embed_tokens",
23465
+ audio_encoder: "audio_encoder",
23466
+ decoder_model_merged: "decoder_model_merged"
23467
+ }),
23468
+ cache_sessions: { decoder_model_merged: true, audio_encoder: true },
23469
+ optional_configs: { generation_config: "generation_config.json" }
22393
23470
  },
22394
23471
  default: {
22395
23472
  can_generate: false,
22396
- forward: encoder_forward
23473
+ forward: encoder_forward,
23474
+ sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
22397
23475
  }
22398
23476
  };
23477
/**
 * Look up the session layout registered for a model type.
 * Falls back to the `default` entry of `MODEL_TYPE_CONFIG` when the type has no entry.
 *
 * @param {number} modelType Key into `MODEL_TYPE_CONFIG`.
 * @param {Object} config Model config, forwarded to the entry's `sessions` factory.
 * @param {Object} [options={}] Loading options, forwarded to the entry's `sessions` factory.
 * @returns {{sessions: Object, cache_sessions: Object|undefined, optional_configs: Object|undefined}}
 * The session name→file mapping plus the entry's `cache_sessions` and `optional_configs` values.
 */
function getSessionsConfig(modelType, config, options = {}) {
  const entry = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
  const sessions = entry.sessions(config, options);
  const { cache_sessions, optional_configs } = entry;
  return { sessions, cache_sessions, optional_configs };
}
22399
23485
  var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
22400
23486
  var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
22401
23487
  var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
@@ -22481,300 +23567,78 @@ var PreTrainedModel = class extends Callable2 {
22481
23567
  const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
22482
23568
  const modelType = MODEL_TYPE_MAPPING.get(modelName);
22483
23569
  config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
22484
- let info;
22485
- if (modelType === MODEL_TYPES.DecoderOnly) {
22486
- info = await Promise.all([
22487
- constructSessions(
22488
- pretrained_model_name_or_path,
22489
- {
22490
- model: options.model_file_name ?? "model"
22491
- },
22492
- options,
22493
- "model"
22494
- ),
22495
- get_optional_configs(
22496
- pretrained_model_name_or_path,
22497
- {
22498
- generation_config: "generation_config.json"
22499
- },
22500
- options
22501
- )
22502
- ]);
22503
- } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
22504
- info = await Promise.all([
22505
- constructSessions(
22506
- pretrained_model_name_or_path,
22507
- {
22508
- model: "encoder_model",
22509
- decoder_model_merged: "decoder_model_merged"
22510
- },
22511
- options,
22512
- "decoder_model_merged"
22513
- ),
22514
- get_optional_configs(
22515
- pretrained_model_name_or_path,
22516
- {
22517
- generation_config: "generation_config.json"
22518
- },
22519
- options
22520
- )
22521
- ]);
22522
- } else if (modelType === MODEL_TYPES.MaskGeneration) {
22523
- info = await Promise.all([
22524
- constructSessions(
22525
- pretrained_model_name_or_path,
22526
- {
22527
- model: "vision_encoder",
22528
- prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder"
22529
- },
22530
- options
22531
- )
22532
- ]);
22533
- } else if (modelType === MODEL_TYPES.EncoderDecoder) {
22534
- info = await Promise.all([
22535
- constructSessions(
22536
- pretrained_model_name_or_path,
22537
- {
22538
- model: "encoder_model",
22539
- decoder_model_merged: "decoder_model_merged"
22540
- },
22541
- options,
22542
- "decoder_model_merged"
22543
- )
22544
- ]);
22545
- } else if (modelType === MODEL_TYPES.ImageTextToText) {
22546
- const sessions = {
22547
- embed_tokens: "embed_tokens",
22548
- vision_encoder: "vision_encoder",
22549
- decoder_model_merged: "decoder_model_merged"
22550
- };
22551
- if (config.is_encoder_decoder) {
22552
- sessions["model"] = "encoder_model";
23570
+ const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
23571
+ if (modelType === void 0) {
23572
+ const type = modelName ?? config?.model_type;
23573
+ if (type !== "custom") {
23574
+ logger.warn(
23575
+ `Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
23576
+ );
22553
23577
  }
22554
- info = await Promise.all([
22555
- constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
22556
- get_optional_configs(
22557
- pretrained_model_name_or_path,
22558
- {
22559
- generation_config: "generation_config.json"
22560
- },
22561
- options
22562
- )
22563
- ]);
22564
- } else if (modelType === MODEL_TYPES.AudioTextToText) {
22565
- const sessions = {
22566
- embed_tokens: "embed_tokens",
22567
- audio_encoder: "audio_encoder",
22568
- decoder_model_merged: "decoder_model_merged"
22569
- };
22570
- info = await Promise.all([
22571
- constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
22572
- get_optional_configs(
22573
- pretrained_model_name_or_path,
22574
- {
22575
- generation_config: "generation_config.json"
22576
- },
22577
- options
22578
- )
22579
- ]);
22580
- } else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
22581
- const sessions = {
22582
- embed_tokens: "embed_tokens",
22583
- audio_encoder: "audio_encoder",
22584
- vision_encoder: "vision_encoder",
22585
- decoder_model_merged: "decoder_model_merged"
22586
- };
22587
- info = await Promise.all([
22588
- constructSessions(pretrained_model_name_or_path, sessions, options),
22589
- get_optional_configs(
22590
- pretrained_model_name_or_path,
22591
- {
22592
- generation_config: "generation_config.json"
22593
- },
22594
- options
22595
- )
22596
- ]);
22597
- } else if (modelType === MODEL_TYPES.Musicgen) {
22598
- info = await Promise.all([
22599
- constructSessions(
22600
- pretrained_model_name_or_path,
22601
- {
22602
- model: "text_encoder",
22603
- decoder_model_merged: "decoder_model_merged",
22604
- encodec_decode: "encodec_decode"
22605
- },
22606
- options,
22607
- "decoder_model_merged"
22608
- ),
22609
- get_optional_configs(
22610
- pretrained_model_name_or_path,
22611
- {
22612
- generation_config: "generation_config.json"
22613
- },
22614
- options
22615
- )
22616
- ]);
22617
- } else if (modelType === MODEL_TYPES.MultiModality) {
22618
- info = await Promise.all([
22619
- constructSessions(
22620
- pretrained_model_name_or_path,
22621
- {
22622
- prepare_inputs_embeds: "prepare_inputs_embeds",
22623
- model: "language_model",
22624
- lm_head: "lm_head",
22625
- gen_head: "gen_head",
22626
- gen_img_embeds: "gen_img_embeds",
22627
- image_decode: "image_decode"
22628
- },
22629
- options,
22630
- "model"
22631
- ),
22632
- get_optional_configs(
22633
- pretrained_model_name_or_path,
22634
- {
22635
- generation_config: "generation_config.json"
22636
- },
22637
- options
22638
- )
22639
- ]);
22640
- } else if (modelType === MODEL_TYPES.Phi3V) {
22641
- info = await Promise.all([
22642
- constructSessions(
22643
- pretrained_model_name_or_path,
22644
- {
22645
- prepare_inputs_embeds: "prepare_inputs_embeds",
22646
- model: "model",
22647
- vision_encoder: "vision_encoder"
22648
- },
22649
- options,
22650
- "model"
22651
- ),
22652
- get_optional_configs(
22653
- pretrained_model_name_or_path,
22654
- {
22655
- generation_config: "generation_config.json"
22656
- },
22657
- options
22658
- )
22659
- ]);
22660
- } else if (modelType === MODEL_TYPES.Chatterbox) {
22661
- info = await Promise.all([
22662
- constructSessions(
22663
- pretrained_model_name_or_path,
22664
- {
22665
- embed_tokens: "embed_tokens",
22666
- speech_encoder: "speech_encoder",
22667
- model: "language_model",
22668
- conditional_decoder: "conditional_decoder"
22669
- },
22670
- options,
22671
- "model"
22672
- ),
22673
- get_optional_configs(
22674
- pretrained_model_name_or_path,
22675
- {
22676
- generation_config: "generation_config.json"
22677
- },
22678
- options
22679
- )
22680
- ]);
22681
- } else if (modelType === MODEL_TYPES.AutoEncoder) {
22682
- info = await Promise.all([
22683
- constructSessions(
22684
- pretrained_model_name_or_path,
22685
- {
22686
- encoder_model: "encoder_model",
22687
- decoder_model: "decoder_model"
22688
- },
22689
- options
22690
- )
22691
- ]);
22692
- } else if (modelType === MODEL_TYPES.Supertonic) {
22693
- info = await Promise.all([
22694
- constructSessions(
22695
- pretrained_model_name_or_path,
22696
- {
22697
- text_encoder: "text_encoder",
22698
- latent_denoiser: "latent_denoiser",
22699
- voice_decoder: "voice_decoder"
22700
- },
22701
- options
22702
- )
22703
- ]);
22704
- } else {
22705
- if (modelType === void 0) {
22706
- const type = modelName ?? config?.model_type;
22707
- if (type !== "custom") {
22708
- logger.warn(
22709
- `Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
22710
- );
22711
- }
22712
- }
22713
- info = await Promise.all([
22714
- constructSessions(
22715
- pretrained_model_name_or_path,
22716
- {
22717
- model: options.model_file_name ?? "model"
22718
- },
22719
- options
22720
- )
22721
- ]);
22722
- }
22723
- return new this(config, ...info);
22724
- }
22725
- /**
22726
- * Runs the model with the provided inputs
22727
- * @param {Object} model_inputs Object containing input tensors
22728
- * @returns {Promise<Object>} Object containing output tensors
22729
- */
22730
- async _call(model_inputs) {
22731
- return await this.forward(model_inputs);
22732
- }
22733
- /**
22734
- * Forward method for a pretrained model. If not overridden by a subclass, the correct forward method
22735
- * will be chosen based on the model type.
22736
- * @param {Object} model_inputs The input data to the model in the format specified in the ONNX model.
22737
- * @returns {Promise<Object>} The output data from the model in the format specified in the ONNX model.
22738
- * @throws {Error} This method must be implemented in subclasses.
22739
- */
22740
- async forward(model_inputs) {
22741
- return await this._forward(this, model_inputs);
22742
- }
22743
- /**
22744
- * Get the model's generation config, if it exists.
22745
- * @returns {GenerationConfig|null} The model's generation config if it exists, otherwise `null`.
22746
- */
22747
- get generation_config() {
22748
- return this.configs?.generation_config ?? null;
22749
- }
22750
- /**
22751
- * @param {GenerationConfig} generation_config
22752
- * @param {number} input_ids_seq_length The starting sequence length for the input ids.
22753
- * @returns {LogitsProcessorList}
22754
- * @private
22755
- */
22756
- _get_logits_processor(generation_config, input_ids_seq_length, logits_processor = null) {
22757
- const processors = new LogitsProcessorList();
22758
- if (generation_config.repetition_penalty !== null && generation_config.repetition_penalty !== 1) {
22759
- processors.push(new RepetitionPenaltyLogitsProcessor(generation_config.repetition_penalty));
22760
- }
22761
- if (generation_config.no_repeat_ngram_size !== null && generation_config.no_repeat_ngram_size > 0) {
22762
- processors.push(new NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size));
22763
- }
22764
- if (generation_config.bad_words_ids !== null) {
22765
- processors.push(
22766
- new NoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id)
22767
- );
22768
- }
22769
- if (generation_config.min_length !== null && generation_config.eos_token_id !== null && generation_config.min_length > 0) {
22770
- processors.push(new MinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id));
22771
- }
22772
- if (generation_config.min_new_tokens !== null && generation_config.eos_token_id !== null && generation_config.min_new_tokens > 0) {
22773
- processors.push(
22774
- new MinNewTokensLengthLogitsProcessor(
22775
- input_ids_seq_length,
22776
- generation_config.min_new_tokens,
22777
- generation_config.eos_token_id
23578
+ }
23579
+ const sessions = typeConfig.sessions(config, options);
23580
+ const promises = [
23581
+ constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
23582
+ ];
23583
+ if (typeConfig.optional_configs) {
23584
+ promises.push(get_optional_configs(pretrained_model_name_or_path, typeConfig.optional_configs, options));
23585
+ }
23586
+ const info = await Promise.all(promises);
23587
+ return new this(config, ...info);
23588
+ }
23589
+ /**
23590
+ * Runs the model with the provided inputs
23591
+ * @param {Object} model_inputs Object containing input tensors
23592
+ * @returns {Promise<Object>} Object containing output tensors
23593
+ */
23594
+ async _call(model_inputs) {
23595
+ return await this.forward(model_inputs);
23596
+ }
23597
+ /**
23598
+ * Forward method for a pretrained model. If not overridden by a subclass, the correct forward method
23599
+ * will be chosen based on the model type.
23600
+ * @param {Object} model_inputs The input data to the model in the format specified in the ONNX model.
23601
+ * @returns {Promise<Object>} The output data from the model in the format specified in the ONNX model.
23602
+ * @throws {Error} This method must be implemented in subclasses.
23603
+ */
23604
+ async forward(model_inputs) {
23605
+ return await this._forward(this, model_inputs);
23606
+ }
23607
+ /**
23608
+ * Get the model's generation config, if it exists.
23609
+ * @returns {GenerationConfig|null} The model's generation config if it exists, otherwise `null`.
23610
+ */
23611
+ get generation_config() {
23612
+ return this.configs?.generation_config ?? null;
23613
+ }
23614
+ /**
23615
+ * @param {GenerationConfig} generation_config
23616
+ * @param {number} input_ids_seq_length The starting sequence length for the input ids.
23617
+ * @returns {LogitsProcessorList}
23618
+ * @private
23619
+ */
23620
+ _get_logits_processor(generation_config, input_ids_seq_length, logits_processor = null) {
23621
+ const processors = new LogitsProcessorList();
23622
+ if (generation_config.repetition_penalty !== null && generation_config.repetition_penalty !== 1) {
23623
+ processors.push(new RepetitionPenaltyLogitsProcessor(generation_config.repetition_penalty));
23624
+ }
23625
+ if (generation_config.no_repeat_ngram_size !== null && generation_config.no_repeat_ngram_size > 0) {
23626
+ processors.push(new NoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size));
23627
+ }
23628
+ if (generation_config.bad_words_ids !== null) {
23629
+ processors.push(
23630
+ new NoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id)
23631
+ );
23632
+ }
23633
+ if (generation_config.min_length !== null && generation_config.eos_token_id !== null && generation_config.min_length > 0) {
23634
+ processors.push(new MinLengthLogitsProcessor(generation_config.min_length, generation_config.eos_token_id));
23635
+ }
23636
+ if (generation_config.min_new_tokens !== null && generation_config.eos_token_id !== null && generation_config.min_new_tokens > 0) {
23637
+ processors.push(
23638
+ new MinNewTokensLengthLogitsProcessor(
23639
+ input_ids_seq_length,
23640
+ generation_config.min_new_tokens,
23641
+ generation_config.eos_token_id
22778
23642
  )
22779
23643
  );
22780
23644
  }
@@ -22918,7 +23782,7 @@ var PreTrainedModel = class extends Callable2 {
22918
23782
  * @param {Tensor} [params.inputs=null]
22919
23783
  * @param {number} [params.bos_token_id=null]
22920
23784
  * @param {Record<string, Tensor|number[]>} [params.model_kwargs]
22921
- * @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor>, model_input_name: string}} The model-specific inputs for generation.
23785
+ * @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor> & {past_key_values?: DynamicCache}, model_input_name: string}} The model-specific inputs for generation.
22922
23786
  */
22923
23787
  _prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) {
22924
23788
  const model_inputs = pick(model_kwargs, this.forward_params);
@@ -23159,11 +24023,12 @@ var PreTrainedModel = class extends Callable2 {
23159
24023
  }
23160
24024
  }
23161
24025
  /**
23162
- * Returns an object containing past key values from the given decoder results object.
24026
+ * Returns a DynamicCache containing past key values from the given decoder results object.
23163
24027
  *
23164
24028
  * @param {Object} decoderResults The decoder results object.
23165
- * @param {Object} pastKeyValues The previous past key values.
23166
- * @returns {Object} An object containing past key values.
24029
+ * @param {DynamicCache} pastKeyValues The previous past key values.
24030
+ * @param {boolean} [disposeEncoderPKVs=false] Whether to dispose encoder past key values.
24031
+ * @returns {DynamicCache} A new DynamicCache containing the updated past key values.
23167
24032
  */
23168
24033
  getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
23169
24034
  const pkvs = /* @__PURE__ */ Object.create(null);
@@ -23184,7 +24049,7 @@ var PreTrainedModel = class extends Callable2 {
23184
24049
  }
23185
24050
  }
23186
24051
  }
23187
- return pkvs;
24052
+ return new DynamicCache(pkvs);
23188
24053
  }
23189
24054
  /**
23190
24055
  * Returns an object containing attentions from the given model output object.
@@ -23209,8 +24074,8 @@ var PreTrainedModel = class extends Callable2 {
23209
24074
  /**
23210
24075
  * Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values.
23211
24076
  *
23212
- * @param {Object} decoderFeeds The decoder feeds object to add past key values to.
23213
- * @param {Object} pastKeyValues An object containing past key values.
24077
+ * @param {Record<string, any>} decoderFeeds The decoder feeds object to add past key values to.
24078
+ * @param {DynamicCache|null} pastKeyValues The cache containing past key values.
23214
24079
  */
23215
24080
  addPastKeyValues(decoderFeeds, pastKeyValues) {
23216
24081
  if (pastKeyValues) {
@@ -23227,14 +24092,29 @@ var PreTrainedModel = class extends Callable2 {
23227
24092
  }
23228
24093
  }
23229
24094
  }
23230
- async encode_image({ pixel_values }) {
23231
- return (await sessionRun(this.sessions["vision_encoder"], { pixel_values })).image_features;
24095
+ /**
24096
+ * Helper function to select valid inputs and run through the appropriate encoder (vision, text, audio) based on the input type.
24097
+ * @param {string} sessionName
24098
+ * @param {Record<string, Tensor>} inputs
24099
+ * @param {string} outputName
24100
+ * @private
24101
+ */
24102
+ async _encode_input(sessionName, inputs, outputName) {
24103
+ if (!Object.hasOwn(this.sessions, sessionName)) {
24104
+ throw new Error(`Model does not have a ${sessionName} session.`);
24105
+ }
24106
+ const session = this.sessions[sessionName];
24107
+ const output = await sessionRun(session, pick(inputs, session.inputNames));
24108
+ return output[outputName];
24109
+ }
24110
+ async encode_image(inputs) {
24111
+ return this._encode_input("vision_encoder", inputs, "image_features");
23232
24112
  }
23233
- async encode_text({ input_ids }) {
23234
- return (await sessionRun(this.sessions["embed_tokens"], { input_ids })).inputs_embeds;
24113
+ async encode_text(inputs) {
24114
+ return this._encode_input("embed_tokens", inputs, "inputs_embeds");
23235
24115
  }
23236
- async encode_audio({ audio_values }) {
23237
- return (await sessionRun(this.sessions["audio_encoder"], { audio_values })).audio_features;
24116
+ async encode_audio(inputs) {
24117
+ return this._encode_input("audio_encoder", inputs, "audio_features");
23238
24118
  }
23239
24119
  };
23240
24120
  async function seq2seq_forward(self2, model_inputs) {
@@ -23289,6 +24169,9 @@ async function decoder_forward(self2, model_inputs, is_encoder_decoder = false)
23289
24169
  const start_index = ["paligemma", "gemma3_text", "gemma3"].includes(self2.config.model_type) ? 1 : 0;
23290
24170
  new_model_inputs.position_ids = create_position_ids(new_model_inputs, past_key_values, start_index);
23291
24171
  }
24172
+ if (session.inputNames.includes("num_logits_to_keep") && !new_model_inputs.num_logits_to_keep) {
24173
+ new_model_inputs.num_logits_to_keep = new Tensor2("int64", [0n], []);
24174
+ }
23292
24175
  self2.addPastKeyValues(new_model_inputs, past_key_values);
23293
24176
  const fixed = pick(new_model_inputs, session.inputNames);
23294
24177
  return await sessionRun(session, fixed);
@@ -23297,7 +24180,7 @@ async function generic_text_to_text_forward(self2, {
23297
24180
  // Generic parameters:
23298
24181
  encode_function,
23299
24182
  merge_function,
23300
- modality_input_name,
24183
+ modality_input_names,
23301
24184
  modality_output_name,
23302
24185
  // Produced by the tokenizer/processor:
23303
24186
  input_ids = null,
@@ -23312,32 +24195,34 @@ async function generic_text_to_text_forward(self2, {
23312
24195
  // Additional parameters
23313
24196
  ...kwargs
23314
24197
  }) {
23315
- const modality_values = kwargs[modality_input_name];
23316
24198
  if (!inputs_embeds) {
23317
24199
  inputs_embeds = await self2.encode_text({ input_ids, ...kwargs });
23318
- if (modality_values && input_ids.dims[1] !== 1) {
23319
- const modality_features = await encode_function({
23320
- // Pass the modality values under its expected key.
23321
- // The caller knows whether this is audio or image.
23322
- [modality_input_name]: modality_values,
23323
- ...kwargs
23324
- });
23325
- ({ inputs_embeds, attention_mask } = merge_function({
23326
- [modality_output_name]: modality_features,
23327
- inputs_embeds,
23328
- input_ids,
23329
- attention_mask
23330
- }));
23331
- } else if (past_key_values && modality_values && input_ids.dims[1] === 1) {
23332
- const target_length = input_ids.dims[1];
23333
- const past_length = getPastLength(past_key_values);
23334
- attention_mask = cat(
23335
- [
23336
- ones([input_ids.dims[0], past_length]),
23337
- attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
23338
- ],
23339
- 1
23340
- );
24200
+ const modality_values = pick(kwargs, modality_input_names);
24201
+ if (Object.keys(modality_values).length > 0) {
24202
+ if (input_ids.dims[1] !== 1) {
24203
+ const modality_features = await encode_function({
24204
+ // Pass the modality values under its expected key.
24205
+ // The caller knows whether this is audio or image.
24206
+ ...modality_values,
24207
+ ...kwargs
24208
+ });
24209
+ ({ inputs_embeds, attention_mask } = merge_function({
24210
+ [modality_output_name]: modality_features,
24211
+ inputs_embeds,
24212
+ input_ids,
24213
+ attention_mask
24214
+ }));
24215
+ } else if (past_key_values && input_ids.dims[1] === 1) {
24216
+ const target_length = input_ids.dims[1];
24217
+ const past_length = past_key_values.get_seq_length();
24218
+ attention_mask = cat(
24219
+ [
24220
+ ones([input_ids.dims[0], past_length]),
24221
+ attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
24222
+ ],
24223
+ 1
24224
+ );
24225
+ }
23341
24226
  }
23342
24227
  }
23343
24228
  if (!position_ids) {
@@ -23345,14 +24230,19 @@ async function generic_text_to_text_forward(self2, {
23345
24230
  // Handle special case for qwen vl models
23346
24231
  [
23347
24232
  "qwen2_vl",
24233
+ "qwen2_vl_text",
23348
24234
  "qwen2_5_vl",
23349
24235
  "qwen2_5_vl_text",
23350
24236
  "qwen3_vl",
23351
24237
  "qwen3_vl_text",
24238
+ "qwen3_vl_moe",
24239
+ "qwen3_vl_moe_text",
23352
24240
  "qwen3_5",
23353
24241
  "qwen3_5_text",
23354
24242
  "qwen3_5_moe",
23355
- "qwen3_5_moe_text"
24243
+ "qwen3_5_moe_text",
24244
+ "glm_ocr",
24245
+ "glm_ocr_text"
23356
24246
  ].includes(self2.config.model_type)
23357
24247
  ) {
23358
24248
  const { image_grid_thw, video_grid_thw } = kwargs;
@@ -23376,7 +24266,7 @@ async function generic_text_to_text_forward(self2, {
23376
24266
  async function audio_text_to_text_forward(self2, params) {
23377
24267
  return await generic_text_to_text_forward(self2, {
23378
24268
  ...params,
23379
- modality_input_name: "audio_values",
24269
+ modality_input_names: ["audio_values", "input_features"],
23380
24270
  modality_output_name: "audio_features",
23381
24271
  encode_function: self2.encode_audio.bind(self2),
23382
24272
  merge_function: self2._merge_input_ids_with_audio_features.bind(self2)
@@ -23385,7 +24275,7 @@ async function audio_text_to_text_forward(self2, params) {
23385
24275
  async function image_text_to_text_forward(self2, params) {
23386
24276
  return await generic_text_to_text_forward(self2, {
23387
24277
  ...params,
23388
- modality_input_name: "pixel_values",
24278
+ modality_input_names: ["pixel_values"],
23389
24279
  modality_output_name: "image_features",
23390
24280
  encode_function: self2.encode_image.bind(self2),
23391
24281
  merge_function: self2._merge_input_ids_with_image_features.bind(self2)
@@ -23421,7 +24311,11 @@ function create_position_ids(model_inputs, past_key_values = null, start_index =
23421
24311
  return position_ids;
23422
24312
  }
23423
24313
  function decoder_prepare_inputs_for_generation(self2, input_ids, model_inputs, generation_config) {
23424
- const past_length = model_inputs.past_key_values ? getPastLength(model_inputs.past_key_values) : 0;
24314
+ const past_length = model_inputs.past_key_values ? model_inputs.past_key_values.get_seq_length() : 0;
24315
+ const session = self2.sessions["decoder_model_merged"] ?? self2.sessions["model"];
24316
+ if (session?.inputNames.includes("num_logits_to_keep") && !model_inputs.num_logits_to_keep) {
24317
+ model_inputs.num_logits_to_keep = new Tensor2("int64", [1n], []);
24318
+ }
23425
24319
  if (!model_inputs.attention_mask) {
23426
24320
  let dims;
23427
24321
  for (const key of ["input_ids", "inputs_embeds", "position_ids"]) {
@@ -23572,6 +24466,8 @@ __export(models_exports, {
23572
24466
  BloomForCausalLM: () => BloomForCausalLM,
23573
24467
  BloomModel: () => BloomModel,
23574
24468
  BloomPreTrainedModel: () => BloomPreTrainedModel,
24469
+ CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
24470
+ CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
23575
24471
  CLIPModel: () => CLIPModel,
23576
24472
  CLIPPreTrainedModel: () => CLIPPreTrainedModel,
23577
24473
  CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
@@ -23646,6 +24542,9 @@ __export(models_exports, {
23646
24542
  DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
23647
24543
  DecisionTransformerModel: () => DecisionTransformerModel,
23648
24544
  DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
24545
+ DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
24546
+ DeepseekV3Model: () => DeepseekV3Model,
24547
+ DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
23649
24548
  DeiTForImageClassification: () => DeiTForImageClassification,
23650
24549
  DeiTModel: () => DeiTModel,
23651
24550
  DeiTPreTrainedModel: () => DeiTPreTrainedModel,
@@ -23691,6 +24590,11 @@ __export(models_exports, {
23691
24590
  EsmForTokenClassification: () => EsmForTokenClassification,
23692
24591
  EsmModel: () => EsmModel,
23693
24592
  EsmPreTrainedModel: () => EsmPreTrainedModel,
24593
+ EuroBertForMaskedLM: () => EuroBertForMaskedLM,
24594
+ EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
24595
+ EuroBertForTokenClassification: () => EuroBertForTokenClassification,
24596
+ EuroBertModel: () => EuroBertModel,
24597
+ EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
23694
24598
  ExaoneForCausalLM: () => ExaoneForCausalLM,
23695
24599
  ExaoneModel: () => ExaoneModel,
23696
24600
  ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -23729,6 +24633,7 @@ __export(models_exports, {
23729
24633
  Gemma3ForCausalLM: () => Gemma3ForCausalLM,
23730
24634
  Gemma3Model: () => Gemma3Model,
23731
24635
  Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
24636
+ Gemma3nForCausalLM: () => Gemma3nForCausalLM,
23732
24637
  Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
23733
24638
  Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
23734
24639
  GemmaForCausalLM: () => GemmaForCausalLM,
@@ -23736,6 +24641,10 @@ __export(models_exports, {
23736
24641
  GemmaPreTrainedModel: () => GemmaPreTrainedModel,
23737
24642
  GlmForCausalLM: () => GlmForCausalLM,
23738
24643
  GlmModel: () => GlmModel,
24644
+ GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
24645
+ GlmMoeDsaModel: () => GlmMoeDsaModel,
24646
+ GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
24647
+ GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
23739
24648
  GlmPreTrainedModel: () => GlmPreTrainedModel,
23740
24649
  GptOssForCausalLM: () => GptOssForCausalLM,
23741
24650
  GptOssModel: () => GptOssModel,
@@ -23746,6 +24655,7 @@ __export(models_exports, {
23746
24655
  GraniteMoeHybridModel: () => GraniteMoeHybridModel,
23747
24656
  GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
23748
24657
  GranitePreTrainedModel: () => GranitePreTrainedModel,
24658
+ GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
23749
24659
  GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
23750
24660
  GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
23751
24661
  GroupViTModel: () => GroupViTModel,
@@ -23767,7 +24677,6 @@ __export(models_exports, {
23767
24677
  IJepaModel: () => IJepaModel,
23768
24678
  IJepaPreTrainedModel: () => IJepaPreTrainedModel,
23769
24679
  Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
23770
- Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
23771
24680
  JAISLMHeadModel: () => JAISLMHeadModel,
23772
24681
  JAISModel: () => JAISModel,
23773
24682
  JAISPreTrainedModel: () => JAISPreTrainedModel,
@@ -23781,6 +24690,8 @@ __export(models_exports, {
23781
24690
  Lfm2MoeModel: () => Lfm2MoeModel,
23782
24691
  Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
23783
24692
  Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
24693
+ Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
24694
+ LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
23784
24695
  LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
23785
24696
  Llama4ForCausalLM: () => Llama4ForCausalLM,
23786
24697
  Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -23830,6 +24741,9 @@ __export(models_exports, {
23830
24741
  MimiEncoderOutput: () => MimiEncoderOutput,
23831
24742
  MimiModel: () => MimiModel,
23832
24743
  MimiPreTrainedModel: () => MimiPreTrainedModel,
24744
+ Mistral4ForCausalLM: () => Mistral4ForCausalLM,
24745
+ Mistral4Model: () => Mistral4Model,
24746
+ Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
23833
24747
  MistralForCausalLM: () => MistralForCausalLM,
23834
24748
  MistralModel: () => MistralModel,
23835
24749
  MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -23887,6 +24801,9 @@ __export(models_exports, {
23887
24801
  NanoChatForCausalLM: () => NanoChatForCausalLM,
23888
24802
  NanoChatModel: () => NanoChatModel,
23889
24803
  NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
24804
+ NemotronHForCausalLM: () => NemotronHForCausalLM,
24805
+ NemotronHModel: () => NemotronHModel,
24806
+ NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
23890
24807
  NeoBertForMaskedLM: () => NeoBertForMaskedLM,
23891
24808
  NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
23892
24809
  NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -23920,7 +24837,6 @@ __export(models_exports, {
23920
24837
  Owlv2Model: () => Owlv2Model,
23921
24838
  Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
23922
24839
  PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
23923
- PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
23924
24840
  ParakeetForCTC: () => ParakeetForCTC,
23925
24841
  ParakeetPreTrainedModel: () => ParakeetPreTrainedModel,
23926
24842
  PatchTSMixerForPrediction: () => PatchTSMixerForPrediction,
@@ -23950,8 +24866,10 @@ __export(models_exports, {
23950
24866
  Qwen2MoeModel: () => Qwen2MoeModel,
23951
24867
  Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
23952
24868
  Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
24869
+ Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
23953
24870
  Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
23954
24871
  Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
24872
+ Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
23955
24873
  Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
23956
24874
  Qwen3ForCausalLM: () => Qwen3ForCausalLM,
23957
24875
  Qwen3Model: () => Qwen3Model,
@@ -23962,9 +24880,13 @@ __export(models_exports, {
23962
24880
  Qwen3NextModel: () => Qwen3NextModel,
23963
24881
  Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
23964
24882
  Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
24883
+ Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
23965
24884
  Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
24885
+ Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
23966
24886
  Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
24887
+ Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
23967
24888
  Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
24889
+ Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
23968
24890
  Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
23969
24891
  RFDetrForObjectDetection: () => RFDetrForObjectDetection,
23970
24892
  RFDetrModel: () => RFDetrModel,
@@ -24015,11 +24937,13 @@ __export(models_exports, {
24015
24937
  SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
24016
24938
  SmolLM3Model: () => SmolLM3Model,
24017
24939
  SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
24018
- SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
24019
24940
  SnacDecoderModel: () => SnacDecoderModel,
24020
24941
  SnacEncoderModel: () => SnacEncoderModel,
24021
24942
  SnacModel: () => SnacModel,
24022
24943
  SnacPreTrainedModel: () => SnacPreTrainedModel,
24944
+ SolarOpenForCausalLM: () => SolarOpenForCausalLM,
24945
+ SolarOpenModel: () => SolarOpenModel,
24946
+ SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
24023
24947
  SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
24024
24948
  SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
24025
24949
  SpeechT5HifiGan: () => SpeechT5HifiGan,
@@ -24087,6 +25011,8 @@ __export(models_exports, {
24087
25011
  VitsModelOutput: () => VitsModelOutput,
24088
25012
  VitsPreTrainedModel: () => VitsPreTrainedModel,
24089
25013
  VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
25014
+ VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
25015
+ VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
24090
25016
  Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
24091
25017
  Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
24092
25018
  Wav2Vec2BertModel: () => Wav2Vec2BertModel,
@@ -24192,7 +25118,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
24192
25118
  var ArceeForCausalLM = class extends ArceePreTrainedModel {
24193
25119
  };
24194
25120
 
24195
- // src/models/ast/modeling_ast.js
25121
+ // src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
24196
25122
  var ASTPreTrainedModel = class extends PreTrainedModel {
24197
25123
  };
24198
25124
  var ASTModel = class extends ASTPreTrainedModel {
@@ -24447,7 +25373,7 @@ var ChatterboxModel = class extends ChatterboxPreTrainedModel {
24447
25373
  if (!past_key_values || target_length !== 1) {
24448
25374
  throw new Error("Incorrect state encountered during generation.");
24449
25375
  }
24450
- const past_length = Object.values(past_key_values)[0].dims.at(-2);
25376
+ const past_length = past_key_values.get_seq_length();
24451
25377
  attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
24452
25378
  }
24453
25379
  }
@@ -24527,6 +25453,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
24527
25453
  var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
24528
25454
  };
24529
25455
 
// src/models/chmv2/modeling_chmv2.js

/**
 * Common parent of the CHMv2 classes. Declares no members of its own;
 * all behavior comes from `PreTrainedModel`.
 */
var CHMv2PreTrainedModel = class extends PreTrainedModel {};

/**
 * CHMv2 class exported under the depth-estimation name. Declares no members of its own;
 * all behavior is inherited from `CHMv2PreTrainedModel`.
 */
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {};
25461
+
24530
25462
  // src/models/clap/modeling_clap.js
24531
25463
  var ClapPreTrainedModel = class extends PreTrainedModel {
24532
25464
  };
@@ -24865,6 +25797,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
24865
25797
  }
24866
25798
  };
24867
25799
 
// src/models/deepseek_v3/modeling_deepseek_v3.js

/**
 * Common parent of the DeepseekV3 classes. Declares no members of its own;
 * all behavior comes from `PreTrainedModel`.
 */
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {};

/** DeepseekV3 class with no overrides; everything is inherited from `DeepseekV3PreTrainedModel`. */
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {};

/** DeepseekV3 causal-LM class with no overrides; everything is inherited from `DeepseekV3PreTrainedModel`. */
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {};
25807
+
24868
25808
  // src/models/deberta_v2/modeling_deberta_v2.js
24869
25809
  var DebertaV2PreTrainedModel = class extends PreTrainedModel {
24870
25810
  };
@@ -25213,6 +26153,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
25213
26153
  }
25214
26154
  };
25215
26155
 
// src/models/eurobert/modeling_eurobert.js

/**
 * Common parent of the EuroBert classes. Declares no members of its own;
 * all behavior comes from `PreTrainedModel`.
 */
var EuroBertPreTrainedModel = class extends PreTrainedModel {};

/** EuroBert class with no overrides; everything is inherited from `EuroBertPreTrainedModel`. */
var EuroBertModel = class extends EuroBertPreTrainedModel {};
/**
 * EuroBert class whose call result is wrapped in a `MaskedLMOutput`.
 */
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` on the given inputs and wraps whatever it
   * resolves to in a `MaskedLMOutput`.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new MaskedLMOutput(outputs);
  }
};
/**
 * EuroBert class whose call result is wrapped in a `SequenceClassifierOutput`.
 */
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` on the given inputs and wraps whatever it
   * resolves to in a `SequenceClassifierOutput`.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new SequenceClassifierOutput(outputs);
  }
};
/**
 * EuroBert class whose call result is wrapped in a `TokenClassifierOutput`.
 */
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` on the given inputs and wraps whatever it
   * resolves to in a `TokenClassifierOutput`.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new TokenClassifierOutput(outputs);
  }
};
26194
+
25216
26195
  // src/models/exaone/modeling_exaone.js
25217
26196
  var ExaonePreTrainedModel = class extends PreTrainedModel {
25218
26197
  };
@@ -25477,6 +26456,8 @@ var Gemma3nForConditionalGeneration = class extends Gemma3nPreTrainedModel {
25477
26456
  });
25478
26457
  }
25479
26458
  };
/**
 * Gemma3n class exported under the causal-LM name. Declares no members of its own;
 * all behavior is inherited from `Gemma3nForConditionalGeneration`.
 */
var Gemma3nForCausalLM = class extends Gemma3nForConditionalGeneration {};
25480
26461
 
25481
26462
  // src/models/glm/modeling_glm.js
25482
26463
  var GlmPreTrainedModel = class extends PreTrainedModel {
@@ -25486,6 +26467,377 @@ var GlmModel = class extends GlmPreTrainedModel {
25486
26467
  var GlmForCausalLM = class extends GlmPreTrainedModel {
25487
26468
  };
25488
26469
 
// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js

/**
 * Common parent of the GlmMoeDsa classes. Declares no members of its own;
 * all behavior comes from `PreTrainedModel`.
 */
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {};

/** GlmMoeDsa class with no overrides; everything is inherited from `GlmMoeDsaPreTrainedModel`. */
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {};

/** GlmMoeDsa causal-LM class with no overrides; everything is inherited from `GlmMoeDsaPreTrainedModel`. */
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {};
26477
+
// src/models/qwen2_vl/modeling_qwen2_vl.js

/**
 * Common parent of the Qwen2-VL classes. The only thing it adds to
 * `PreTrainedModel` is the `forward_params` list below.
 */
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
  /**
   * Input names for this model family: four text inputs followed by two vision inputs.
   * @type {string[]}
   */
  forward_params = [
    // Text inputs
    "input_ids", "attention_mask", "position_ids", "past_key_values",
    // Vision inputs
    "pixel_values", "image_grid_thw"
  ];
};
26491
+ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
26492
+ // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
26493
+ // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
26494
+ // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
26495
+ image_grid_thw_name = "grid_thw";
26496
+ /**
26497
+ * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
26498
+ * @param {Tensor} input_ids
26499
+ * @param {Tensor} attention_mask
26500
+ * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
26501
+ */
26502
+ _get_text_only_rope_index(input_ids, attention_mask) {
26503
+ if (attention_mask) {
26504
+ const { data, dims } = cumsum_masked_fill(attention_mask);
26505
+ const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
26506
+ const mrope_position_deltas = Array.from(
26507
+ { length: dims[0] },
26508
+ (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
26509
+ );
26510
+ return [
26511
+ new Tensor2("int64", position_ids, [3, ...dims]),
26512
+ new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
26513
+ ];
26514
+ } else {
26515
+ const [batch_size, seq_length] = input_ids.dims;
26516
+ const position_ids = BigInt64Array.from(
26517
+ { length: 3 * batch_size * seq_length },
26518
+ (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
26519
+ );
26520
+ return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
26521
+ }
26522
+ }
26523
+ /**
26524
+ * Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
26525
+ * global [all_t, all_h, all_w] order, then write back into the position_ids array
26526
+ * respecting attention mask.
26527
+ * @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
26528
+ * @param {number[]} attn_mask Attention mask for this batch element
26529
+ * @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
26530
+ * @param {number} batch_idx Current batch index
26531
+ * @returns {number[]} Flat reordered positions of length total_len
26532
+ */
26533
+ _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
26534
+ const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
26535
+ const llm_positions = new Array(total_len);
26536
+ let index = 0;
26537
+ for (let x = 0; x < 3; ++x) {
26538
+ for (const val of llm_pos_ids_list) {
26539
+ const seg_len = val.length / 3;
26540
+ for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
26541
+ llm_positions[index++] = val[z];
26542
+ }
26543
+ }
26544
+ }
26545
+ let count2 = 0;
26546
+ for (let y = 0; y < attn_mask.length; ++y) {
26547
+ if (attn_mask[y] == 1) {
26548
+ for (let x = 0; x < 3; ++x) {
26549
+ position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
26550
+ }
26551
+ ++count2;
26552
+ }
26553
+ }
26554
+ return llm_positions;
26555
+ }
26556
+ /**
26557
+ * Build per-batch position ID segments for multimodal rope.
26558
+ * Override this in subclasses to change how vision/text segments are identified and positioned.
26559
+ * @param {object} params
26560
+ * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
26561
+ * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
26562
+ * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
26563
+ * @param {number} params.spatial_merge_size
26564
+ * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
26565
+ * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
26566
+ */
26567
+ _get_multimodal_rope_positions({
26568
+ filtered_ids,
26569
+ image_grid_thw_list,
26570
+ video_grid_thw_list,
26571
+ spatial_merge_size,
26572
+ state
26573
+ }) {
26574
+ const { image_token_id, video_token_id, vision_start_token_id } = this.config;
26575
+ const ids = filtered_ids;
26576
+ const vision_start_indices = ids.reduce((acc, x, idx) => {
26577
+ if (x == vision_start_token_id) acc.push(idx);
26578
+ return acc;
26579
+ }, []);
26580
+ const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
26581
+ const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
26582
+ const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
26583
+ const llm_pos_ids_list = [];
26584
+ let st2 = 0;
26585
+ let remain_images = image_nums;
26586
+ let remain_videos = video_nums;
26587
+ for (let j = 0; j < vision_tokens.length; ++j) {
26588
+ const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
26589
+ const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
26590
+ const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
26591
+ const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
26592
+ let ed;
26593
+ let t, h, w;
26594
+ if (ed_image < ed_video) {
26595
+ [t, h, w] = image_grid_thw_list[state.image_index];
26596
+ ++state.image_index;
26597
+ --remain_images;
26598
+ ed = ed_image;
26599
+ } else {
26600
+ [t, h, w] = video_grid_thw_list[state.video_index];
26601
+ ++state.video_index;
26602
+ --remain_videos;
26603
+ ed = ed_video;
26604
+ }
26605
+ const [llm_grid_t, llm_grid_h, llm_grid_w] = [
26606
+ Number(t),
26607
+ Math.floor(Number(h) / spatial_merge_size),
26608
+ Math.floor(Number(w) / spatial_merge_size)
26609
+ ];
26610
+ const text_len = ed - st2;
26611
+ const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
26612
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
26613
+ const offset = text_len + st_idx;
26614
+ const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
26615
+ const t_index = Array.from(
26616
+ { length: grid_size },
26617
+ (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
26618
+ );
26619
+ const h_index = Array.from(
26620
+ { length: grid_size },
26621
+ (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
26622
+ );
26623
+ const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
26624
+ llm_pos_ids_list.push([t_index, h_index, w_index].flat());
26625
+ st2 = ed + grid_size;
26626
+ }
26627
+ if (st2 < ids.length) {
26628
+ const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
26629
+ const text_len = ids.length - st2;
26630
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
26631
+ }
26632
+ return llm_pos_ids_list;
26633
+ }
26634
+ /**
26635
+ * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
26636
+ *
26637
+ * Explanation:
26638
+ * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
26639
+ *
26640
+ * For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs.
26641
+ * Examples:
26642
+ * input_ids: [T T T T T], here T is for text.
26643
+ * temporal position_ids: [0, 1, 2, 3, 4]
26644
+ * height position_ids: [0, 1, 2, 3, 4]
26645
+ * width position_ids: [0, 1, 2, 3, 4]
26646
+ *
26647
+ * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
26648
+ * and 1D rotary position embedding for the text part.
26649
+ * Examples:
26650
+ * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
26651
+ * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
26652
+ * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
26653
+ * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
26654
+ * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
26655
+ * text temporal position_ids: [3, 4, 5, 6, 7]
26656
+ * text height position_ids: [3, 4, 5, 6, 7]
26657
+ * text width position_ids: [3, 4, 5, 6, 7]
26658
+ * Here we calculate the text start position_ids as the max vision position_ids plus 1.
26659
+ *
26660
+ * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
26661
+ * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
26662
+ * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
26663
+ * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
26664
+ * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
26665
+ */
26666
+ get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
26667
+ const { vision_config } = this.config;
26668
+ const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
26669
+ if (image_grid_thw || video_grid_thw) {
26670
+ const total_input_ids = input_ids.tolist();
26671
+ if (!attention_mask) {
26672
+ attention_mask = ones_like(input_ids);
26673
+ }
26674
+ const attention_mask_list = attention_mask.tolist();
26675
+ const position_ids_list = Array.from(
26676
+ { length: 3 },
26677
+ () => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
26678
+ );
26679
+ const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
26680
+ const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
26681
+ const state = { image_index: 0, video_index: 0 };
26682
+ const mrope_position_deltas = [];
26683
+ for (let i = 0; i < total_input_ids.length; ++i) {
26684
+ const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
26685
+ const llm_pos_ids_list = this._get_multimodal_rope_positions({
26686
+ filtered_ids,
26687
+ image_grid_thw_list,
26688
+ video_grid_thw_list,
26689
+ spatial_merge_size,
26690
+ state
26691
+ });
26692
+ const llm_positions = this._reorder_and_write_positions(
26693
+ llm_pos_ids_list,
26694
+ attention_mask_list[i],
26695
+ position_ids_list,
26696
+ i
26697
+ );
26698
+ mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
26699
+ }
26700
+ return [
26701
+ new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
26702
+ new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
26703
+ ];
26704
+ } else {
26705
+ return this._get_text_only_rope_index(input_ids, attention_mask);
26706
+ }
26707
+ }
26708
+ async encode_image({ pixel_values, image_grid_thw }) {
26709
+ const features = (await sessionRun(this.sessions["vision_encoder"], {
26710
+ pixel_values,
26711
+ [this.image_grid_thw_name]: image_grid_thw
26712
+ })).image_features;
26713
+ return features;
26714
+ }
26715
+ _merge_input_ids_with_image_features(kwargs) {
26716
+ return default_merge_input_ids_with_image_features({
26717
+ // @ts-ignore
26718
+ image_token_id: this.config.image_token_id,
26719
+ ...kwargs
26720
+ });
26721
+ }
26722
+ prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
26723
+ if (model_inputs.attention_mask && !model_inputs.position_ids) {
26724
+ if (!model_inputs.past_key_values) {
26725
+ [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
26726
+ model_inputs.input_ids,
26727
+ model_inputs.image_grid_thw,
26728
+ model_inputs.video_grid_thw,
26729
+ model_inputs.attention_mask
26730
+ );
26731
+ } else {
26732
+ model_inputs.pixel_values = null;
26733
+ const past_length = model_inputs.past_key_values.get_seq_length();
26734
+ if (past_length < model_inputs.input_ids.dims[1]) {
26735
+ const [full_position_ids, rope_deltas] = this.get_rope_index(
26736
+ model_inputs.input_ids,
26737
+ model_inputs.image_grid_thw,
26738
+ model_inputs.video_grid_thw,
26739
+ model_inputs.attention_mask
26740
+ );
26741
+ model_inputs.rope_deltas = rope_deltas;
26742
+ model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
26743
+ model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
26744
+ } else {
26745
+ if (!model_inputs.rope_deltas) {
26746
+ [, model_inputs.rope_deltas] = this.get_rope_index(
26747
+ model_inputs.input_ids,
26748
+ model_inputs.image_grid_thw,
26749
+ model_inputs.video_grid_thw,
26750
+ model_inputs.attention_mask
26751
+ );
26752
+ }
26753
+ const delta = BigInt(past_length);
26754
+ const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
26755
+ model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
26756
+ }
26757
+ }
26758
+ }
26759
+ return model_inputs;
26760
+ }
26761
+ };
26762
/** Causal-LM entry point for Qwen2-VL; adds no members beyond those of Qwen2VLForConditionalGeneration. */
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {};
26764
+
26765
+ // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
26766
/**
 * Qwen2.5-VL variant: identical to the Qwen2-VL class except for the input
 * name under which the image grid tensor is passed to the vision encoder.
 */
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
  // Input name used by `encode_image` for the grid tensor.
  image_grid_thw_name = "image_grid_thw";
};
26769
/**
 * Causal-LM counterpart of the Qwen2.5-VL class; it only overrides the input
 * name under which the image grid tensor is passed to the vision encoder.
 */
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
  // Input name used by `encode_image` for the grid tensor.
  image_grid_thw_name = "image_grid_thw";
};
26772
+
26773
+ // src/models/glm_ocr/modeling_glm_ocr.js
26774
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
  /**
   * Compute 3D positional indices for the vision tokens of a single image.
   * Temporal is constant, height is repeat-interleaved, width tiles.
   * @param {number} start_position Position assigned to the first vision token on every axis.
   * @param {number[]} grid_thw [T, H, W]
   * @param {number} temp_merge_size Divisor applied (with floor) to T.
   * @param {number} spatial_merge_size Divisor applied (with floor) to H and W.
   * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
   */
  get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
    const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
    const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
    const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
    const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
    const t_pos = Array.from({ length: seq_len }, () => start_position);
    const h_pos = Array.from(
      { length: seq_len },
      (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
    );
    const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
    return [...t_pos, ...h_pos, ...w_pos];
  }
  /**
   * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
   * instead of vision_start_token_id scanning used by Qwen2VL.
   * After a vision segment, position advances by floor(max(h, w) / spatial_merge_size).
   *
   * @param {Object} args
   * @param {Array} args.filtered_ids Token ids of one row with masked-out tokens removed.
   * @param {Array[]} args.image_grid_thw_list One [T, H, W] entry per image, consumed in order.
   * @param {Array[]} args.video_grid_thw_list Accepted for interface parity; not read here.
   * @param {number} args.spatial_merge_size Spatial merge factor of the vision encoder.
   * @param {{image_index: number}} args.state Cursor into `image_grid_thw_list`; advanced in place.
   * @returns {number[][]} One flat `[axis0..., axis1..., axis2...]` array per segment.
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id } = this.config;
    // Split the row into maximal runs of text (type 0) or image (type 1) tokens.
    // Loose equality is intentional: ids may be BigInt while the config id is a number.
    const groups = [];
    let group_start = 0;
    let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
    for (let j = 1; j <= filtered_ids.length; ++j) {
      // The sentinel -1 at j === length forces the final run to be flushed.
      const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
      if (t !== current_type) {
        groups.push([current_type, group_start, j]);
        group_start = j;
        current_type = t;
      }
    }
    let current_pos = 0;
    const llm_pos_ids_list = [];
    for (const [modality_type, start_idx, end_idx] of groups) {
      if (modality_type === 0) {
        // Text: the same consecutive positions on all three axes.
        const text_len = end_idx - start_idx;
        llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
        current_pos += text_len;
      } else {
        const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
        // Dividing T by itself collapses the temporal axis to a single step.
        const temp_merge_size = grid_thw[0];
        llm_pos_ids_list.push(
          this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
        );
        // Floor the advance so positions stay integral even when the larger
        // side is not a multiple of the merge size (matches the flooring
        // applied to the grid in get_vision_position_ids).
        current_pos += Math.floor(Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size);
      }
    }
    return llm_pos_ids_list;
  }
};
26840
+
25489
26841
  // src/models/glpn/modeling_glpn.js
25490
26842
  var GLPNPreTrainedModel = class extends PreTrainedModel {
25491
26843
  };
@@ -25558,6 +26910,28 @@ var GraniteMoeHybridModel = class extends GraniteMoeHybridPreTrainedModel {
25558
26910
  var GraniteMoeHybridForCausalLM = class extends GraniteMoeHybridPreTrainedModel {
25559
26911
  };
25560
26912
 
26913
+ // src/models/ultravox/modeling_ultravox.js
26914
/** Base class for Ultravox models; declares the input names accepted by the forward pass. */
var UltravoxPreTrainedModel = class extends PreTrainedModel {
  forward_params = [
    "input_ids",
    "attention_mask",
    "position_ids",
    "audio_values",
    "past_key_values",
  ];
};
26917
var UltravoxModel = class extends UltravoxPreTrainedModel {
  /**
   * Flatten the audio features to 2D (rows x hidden size) and hand them to the
   * shared audio merge helper together with the audio placeholder token id.
   *
   * The token id is the first non-nullish of `ignore_index`, `audio_token_id`
   * and `audio_token_index` from the config. `kwargs` is spread after it, so a
   * caller-supplied `audio_token_id` wins; the reshaped `audio_features` always
   * replaces the one in `kwargs`.
   */
  _merge_input_ids_with_audio_features(kwargs) {
    const { audio_features } = kwargs;
    const hidden_size = audio_features.dims.at(-1);
    const flat_audio_features = audio_features.view(-1, hidden_size);
    const { config } = this;
    return default_merge_input_ids_with_audio_features({
      // @ts-ignore
      audio_token_id: config.ignore_index ?? config.audio_token_id ?? config.audio_token_index,
      ...kwargs,
      audio_features: flat_audio_features,
    });
  }
};
26929
+
26930
+ // src/models/granite_speech/modeling_granite_speech.js
26931
/**
 * Granite Speech reuses the Ultravox audio-merge logic and only replaces the
 * list of forward-pass input names (`input_features`, no `position_ids`).
 */
var GraniteSpeechForConditionalGeneration = class extends UltravoxModel {
  forward_params = [
    "input_ids",
    "attention_mask",
    "input_features",
    "past_key_values",
  ];
};
26934
+
25561
26935
  // src/models/grounding_dino/modeling_grounding_dino.js
25562
26936
  var GroundingDinoPreTrainedModel = class extends PreTrainedModel {
25563
26937
  };
@@ -25662,34 +27036,37 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
25662
27036
  var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
25663
27037
  };
25664
27038
 
25665
- // src/models/idefics3/modeling_idefics3.js
25666
- var Idefics3PreTrainedModel = class extends PreTrainedModel {
25667
- forward_params = [
25668
- "input_ids",
25669
- "attention_mask",
25670
- "pixel_values",
25671
- "pixel_attention_mask",
25672
- "position_ids",
25673
- "past_key_values"
25674
- ];
27039
+ // src/models/llava/modeling_llava.js
27040
+ var LlavaPreTrainedModel = class extends PreTrainedModel {
27041
+ forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
25675
27042
  };
25676
- var Idefics3ForConditionalGeneration = class extends Idefics3PreTrainedModel {
25677
- async encode_image({ pixel_values, pixel_attention_mask }) {
25678
- const features = (await sessionRun(this.sessions["vision_encoder"], { pixel_values, pixel_attention_mask })).image_features;
25679
- return features;
25680
- }
27043
+ var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
25681
27044
  _merge_input_ids_with_image_features(kwargs) {
25682
27045
  const vision_hidden_size = kwargs.image_features.dims.at(-1);
25683
27046
  const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
25684
27047
  return default_merge_input_ids_with_image_features({
25685
27048
  // @ts-ignore
25686
- image_token_id: this.config.image_token_id,
27049
+ image_token_id: this.config.image_token_index ?? this.config.image_token_id,
25687
27050
  ...kwargs,
25688
27051
  image_features: reshaped_image_hidden_states
25689
27052
  });
25690
27053
  }
25691
27054
  };
25692
- var SmolVLMForConditionalGeneration = class extends Idefics3ForConditionalGeneration {
27055
+ var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
27056
+ };
27057
+ var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
27058
+ };
27059
+
27060
+ // src/models/idefics3/modeling_idefics3.js
27061
+ var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
27062
+ forward_params = [
27063
+ "input_ids",
27064
+ "attention_mask",
27065
+ "pixel_values",
27066
+ "pixel_attention_mask",
27067
+ "position_ids",
27068
+ "past_key_values"
27069
+ ];
25693
27070
  };
25694
27071
 
25695
27072
  // src/models/ijepa/modeling_ijepa.js
@@ -25773,6 +27150,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
25773
27150
  var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
25774
27151
  };
25775
27152
 
27153
+ // src/models/lighton_ocr/modeling_lighton_ocr.js
27154
+ var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
27155
+ };
27156
+
25776
27157
  // src/models/lfm2_moe/modeling_lfm2_moe.js
25777
27158
  var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
25778
27159
  };
@@ -25781,6 +27162,19 @@ var Lfm2MoeModel = class extends Lfm2MoePreTrainedModel {
25781
27162
  var Lfm2MoeForCausalLM = class extends Lfm2MoePreTrainedModel {
25782
27163
  };
25783
27164
 
27165
+ // src/models/lfm2_vl/modeling_lfm2_vl.js
27166
+ var Lfm2VlForConditionalGeneration = class extends LlavaForConditionalGeneration {
27167
+ forward_params = [
27168
+ "input_ids",
27169
+ "attention_mask",
27170
+ "pixel_values",
27171
+ "pixel_attention_mask",
27172
+ "spatial_shapes",
27173
+ "position_ids",
27174
+ "past_key_values"
27175
+ ];
27176
+ };
27177
+
25784
27178
  // src/models/llama/modeling_llama.js
25785
27179
  var LlamaPreTrainedModel = class extends PreTrainedModel {
25786
27180
  };
@@ -25795,27 +27189,6 @@ var Llama4PreTrainedModel = class extends PreTrainedModel {
25795
27189
  var Llama4ForCausalLM = class extends Llama4PreTrainedModel {
25796
27190
  };
25797
27191
 
25798
- // src/models/llava/modeling_llava.js
25799
- var LlavaPreTrainedModel = class extends PreTrainedModel {
25800
- forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
25801
- };
25802
- var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
25803
- _merge_input_ids_with_image_features(kwargs) {
25804
- const vision_hidden_size = kwargs.image_features.dims.at(-1);
25805
- const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
25806
- return default_merge_input_ids_with_image_features({
25807
- // @ts-ignore
25808
- image_token_id: this.config.image_token_index,
25809
- ...kwargs,
25810
- image_features: reshaped_image_hidden_states
25811
- });
25812
- }
25813
- };
25814
- var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
25815
- };
25816
- var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
25817
- };
25818
-
25819
27192
  // src/models/longt5/modeling_longt5.js
25820
27193
  var LongT5PreTrainedModel = class extends PreTrainedModel {
25821
27194
  };
@@ -25977,6 +27350,14 @@ var MistralModel = class extends MistralPreTrainedModel {
25977
27350
  var MistralForCausalLM = class extends MistralPreTrainedModel {
25978
27351
  };
25979
27352
 
27353
+ // src/models/mistral4/modeling_mistral4.js
27354
+ var Mistral4PreTrainedModel = class extends PreTrainedModel {
27355
+ };
27356
+ var Mistral4Model = class extends Mistral4PreTrainedModel {
27357
+ };
27358
+ var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
27359
+ };
27360
+
25980
27361
  // src/models/mobilebert/modeling_mobilebert.js
25981
27362
  var MobileBertPreTrainedModel = class extends PreTrainedModel {
25982
27363
  };
@@ -26445,6 +27826,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
26445
27826
  var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
26446
27827
  };
26447
27828
 
27829
+ // src/models/nemotron_h/modeling_nemotron_h.js
27830
+ var NemotronHPreTrainedModel = class extends PreTrainedModel {
27831
+ };
27832
+ var NemotronHModel = class extends NemotronHPreTrainedModel {
27833
+ };
27834
+ var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
27835
+ };
27836
+
26448
27837
  // src/models/neobert/modeling_neobert.js
26449
27838
  var NeoBertPreTrainedModel = class extends PreTrainedModel {
26450
27839
  };
@@ -26566,27 +27955,7 @@ var OwlViTForObjectDetection = class extends OwlViTPreTrainedModel {
26566
27955
  };
26567
27956
 
26568
27957
  // src/models/paligemma/modeling_paligemma.js
26569
- var PaliGemmaPreTrainedModel = class extends PreTrainedModel {
26570
- forward_params = [
26571
- "input_ids",
26572
- // 'inputs_embeds',
26573
- "attention_mask",
26574
- "pixel_values",
26575
- "position_ids",
26576
- "past_key_values"
26577
- ];
26578
- };
26579
- var PaliGemmaForConditionalGeneration = class extends PaliGemmaPreTrainedModel {
26580
- _merge_input_ids_with_image_features(kwargs) {
26581
- const vision_hidden_size = kwargs.image_features.dims.at(-1);
26582
- const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
26583
- return default_merge_input_ids_with_image_features({
26584
- // @ts-ignore
26585
- image_token_id: this.config.image_token_index,
26586
- ...kwargs,
26587
- image_features: reshaped_image_hidden_states
26588
- });
26589
- }
27958
+ var PaliGemmaForConditionalGeneration = class extends LlavaForConditionalGeneration {
26590
27959
  };
26591
27960
 
26592
27961
  // src/models/parakeet/modeling_parakeet.js
@@ -26745,244 +28114,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
26745
28114
  var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
26746
28115
  };
26747
28116
 
26748
- // src/models/qwen2_vl/modeling_qwen2_vl.js
26749
- var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
26750
- forward_params = [
26751
- // Text inputs
26752
- "input_ids",
26753
- "attention_mask",
26754
- "position_ids",
26755
- "past_key_values",
26756
- // Vision inputs
26757
- "pixel_values",
26758
- "image_grid_thw"
26759
- ];
26760
- };
26761
- var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
26762
- image_grid_thw_name = "grid_thw";
26763
- /**
26764
- * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
26765
- *
26766
- * Explanation:
26767
- * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
26768
- *
26769
- * For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
26770
- * Examples:
26771
- * input_ids: [T T T T T], here T is for text.
26772
- * temporal position_ids: [0, 1, 2, 3, 4]
26773
- * height position_ids: [0, 1, 2, 3, 4]
26774
- * width position_ids: [0, 1, 2, 3, 4]
26775
- *
26776
- * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
26777
- * and 1D rotary position embeddin for text part.
26778
- * Examples:
26779
- * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
26780
- * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
26781
- * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
26782
- * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
26783
- * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
26784
- * text temporal position_ids: [3, 4, 5, 6, 7]
26785
- * text height position_ids: [3, 4, 5, 6, 7]
26786
- * text width position_ids: [3, 4, 5, 6, 7]
26787
- * Here we calculate the text start position_ids as the max vision position_ids plus 1.
26788
- *
26789
- * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
26790
- * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
26791
- * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
26792
- * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
26793
- * - 1 for tokens that are **not masked**,
26794
- * - 0 for tokens that are **masked**.
26795
- * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
26796
- * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
26797
- * - mrope_position_deltas: Tensor of shape `(batch_size)`.
26798
- */
26799
- get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
26800
- const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
26801
- const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
26802
- const mrope_position_deltas = [];
26803
- if (image_grid_thw || video_grid_thw) {
26804
- let total_input_ids = input_ids.tolist();
26805
- if (!attention_mask) {
26806
- attention_mask = ones_like(input_ids);
26807
- }
26808
- const attention_mask_list = attention_mask.tolist();
26809
- const position_ids_list = Array.from(
26810
- { length: 3 },
26811
- (_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
26812
- );
26813
- const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
26814
- const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
26815
- let image_index = 0;
26816
- let video_index = 0;
26817
- for (let i = 0; i < total_input_ids.length; ++i) {
26818
- const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
26819
- const vision_start_indices = ids.reduce((acc, x, idx) => {
26820
- if (x == vision_start_token_id) acc.push(idx);
26821
- return acc;
26822
- }, []);
26823
- const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
26824
- const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
26825
- const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
26826
- let llm_pos_ids_list = [];
26827
- let st2 = 0;
26828
- let remain_images = image_nums;
26829
- let remain_videos = video_nums;
26830
- for (let j = 0; j < vision_tokens.length; ++j) {
26831
- const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
26832
- const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
26833
- const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
26834
- const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
26835
- let ed;
26836
- let t, h, w;
26837
- if (ed_image < ed_video) {
26838
- [t, h, w] = image_grid_thw_list[image_index];
26839
- ++image_index;
26840
- --remain_images;
26841
- ed = ed_image;
26842
- } else {
26843
- [t, h, w] = video_grid_thw_list[video_index];
26844
- ++video_index;
26845
- --remain_videos;
26846
- ed = ed_video;
26847
- }
26848
- const [llm_grid_t, llm_grid_h, llm_grid_w] = [
26849
- Number(t),
26850
- Math.floor(Number(h) / spatial_merge_size),
26851
- Math.floor(Number(w) / spatial_merge_size)
26852
- ];
26853
- const text_len = ed - st2;
26854
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
26855
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
26856
- const offset = text_len + st_idx;
26857
- const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
26858
- const t_index = Array.from(
26859
- { length: grid_size },
26860
- (_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
26861
- );
26862
- const h_index = Array.from(
26863
- { length: grid_size },
26864
- (_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
26865
- );
26866
- const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
26867
- llm_pos_ids_list.push([t_index, h_index, w_index].flat());
26868
- st2 = ed + grid_size;
26869
- }
26870
- if (st2 < ids.length) {
26871
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
26872
- const text_len = ids.length - st2;
26873
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
26874
- }
26875
- const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
26876
- const llm_positions = new Array(num_items);
26877
- let index = 0;
26878
- for (let x = 0; x < 3; ++x) {
26879
- for (let y = 0; y < llm_pos_ids_list.length; ++y) {
26880
- const val = llm_pos_ids_list[y];
26881
- const text_len = val.length / 3;
26882
- for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
26883
- llm_positions[index++] = val[z];
26884
- }
26885
- }
26886
- }
26887
- let count2 = 0;
26888
- const attn_mask = attention_mask_list[i];
26889
- for (let y = 0; y < attn_mask.length; ++y) {
26890
- if (attn_mask[y] == 1) {
26891
- for (let x = 0; x < 3; ++x) {
26892
- position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
26893
- }
26894
- ++count2;
26895
- }
26896
- }
26897
- const max_llm_positions = max(llm_positions)[0];
26898
- mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
26899
- }
26900
- return [
26901
- new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
26902
- new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
26903
- ];
26904
- } else {
26905
- if (attention_mask) {
26906
- const { data, dims } = cumsum_masked_fill(attention_mask);
26907
- const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
26908
- const mrope_position_deltas2 = Array.from(
26909
- { length: dims[0] },
26910
- (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
26911
- );
26912
- return [
26913
- new Tensor2("int64", position_ids, [3, ...dims]),
26914
- new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
26915
- ];
26916
- } else {
26917
- const [batch_size, seq_length] = input_ids.dims;
26918
- const position_ids = BigInt64Array.from(
26919
- { length: 3 * batch_size * seq_length },
26920
- (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
26921
- );
26922
- return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
26923
- }
26924
- }
26925
- }
26926
- async encode_image({ pixel_values, image_grid_thw }) {
26927
- const features = (await sessionRun(this.sessions["vision_encoder"], {
26928
- pixel_values,
26929
- [this.image_grid_thw_name]: image_grid_thw
26930
- })).image_features;
26931
- return features;
26932
- }
26933
- _merge_input_ids_with_image_features(kwargs) {
26934
- return default_merge_input_ids_with_image_features({
26935
- // @ts-ignore
26936
- image_token_id: this.config.image_token_id,
26937
- ...kwargs
26938
- });
26939
- }
26940
- prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
26941
- if (model_inputs.attention_mask && !model_inputs.position_ids) {
26942
- if (!model_inputs.past_key_values) {
26943
- [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
26944
- model_inputs.input_ids,
26945
- model_inputs.image_grid_thw,
26946
- model_inputs.video_grid_thw,
26947
- model_inputs.attention_mask
26948
- );
26949
- } else {
26950
- model_inputs.pixel_values = null;
26951
- const past_length = getPastLength(model_inputs.past_key_values);
26952
- if (past_length < model_inputs.input_ids.dims[1]) {
26953
- const [full_position_ids, rope_deltas] = this.get_rope_index(
26954
- model_inputs.input_ids,
26955
- model_inputs.image_grid_thw,
26956
- model_inputs.video_grid_thw,
26957
- model_inputs.attention_mask
26958
- );
26959
- model_inputs.rope_deltas = rope_deltas;
26960
- model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
26961
- model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
26962
- } else {
26963
- if (!model_inputs.rope_deltas) {
26964
- [, model_inputs.rope_deltas] = this.get_rope_index(
26965
- model_inputs.input_ids,
26966
- model_inputs.image_grid_thw,
26967
- model_inputs.video_grid_thw,
26968
- model_inputs.attention_mask
26969
- );
26970
- }
26971
- const delta = BigInt(past_length);
26972
- const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
26973
- model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
26974
- }
26975
- }
26976
- }
26977
- return model_inputs;
26978
- }
26979
- };
26980
-
26981
- // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
26982
- var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
26983
- image_grid_thw_name = "image_grid_thw";
26984
- };
26985
-
26986
28117
  // src/models/qwen3/modeling_qwen3.js
26987
28118
  var Qwen3PreTrainedModel = class extends PreTrainedModel {
26988
28119
  };
@@ -27010,18 +28141,26 @@ var Qwen3NextForCausalLM = class extends Qwen3NextPreTrainedModel {
27010
28141
  // src/models/qwen3_vl/modeling_qwen3_vl.js
27011
28142
  var Qwen3VLForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
27012
28143
  };
28144
+ var Qwen3VLForCausalLM = class extends Qwen2_5_VLForCausalLM {
28145
+ };
27013
28146
 
27014
28147
  // src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js
27015
28148
  var Qwen3VLMoeForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
27016
28149
  };
28150
+ var Qwen3VLMoeForCausalLM = class extends Qwen3VLForCausalLM {
28151
+ };
27017
28152
 
27018
28153
  // src/models/qwen3_5/modeling_qwen3_5.js
27019
28154
  var Qwen3_5ForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
27020
28155
  };
28156
+ var Qwen3_5ForCausalLM = class extends Qwen3_5ForConditionalGeneration {
28157
+ };
27021
28158
 
27022
28159
  // src/models/qwen3_5_moe/modeling_qwen3_5_moe.js
27023
28160
  var Qwen3_5MoeForConditionalGeneration = class extends Qwen3_5ForConditionalGeneration {
27024
28161
  };
28162
+ var Qwen3_5MoeForCausalLM = class extends Qwen3_5ForCausalLM {
28163
+ };
27025
28164
 
27026
28165
  // src/models/resnet/modeling_resnet.js
27027
28166
  var ResNetPreTrainedModel = class extends PreTrainedModel {
@@ -27420,6 +28559,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
27420
28559
  }
27421
28560
  };
27422
28561
 
28562
+ // src/models/solar_open/modeling_solar_open.js
28563
+ var SolarOpenPreTrainedModel = class extends PreTrainedModel {
28564
+ };
28565
+ var SolarOpenModel = class extends SolarOpenPreTrainedModel {
28566
+ };
28567
+ var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
28568
+ };
28569
+
27423
28570
  // src/models/speecht5/modeling_speecht5.js
27424
28571
  var SpeechT5PreTrainedModel = class extends PreTrainedModel {
27425
28572
  };
@@ -27702,25 +28849,6 @@ var TrOCRPreTrainedModel = class extends PreTrainedModel {
27702
28849
  var TrOCRForCausalLM = class extends TrOCRPreTrainedModel {
27703
28850
  };
27704
28851
 
27705
- // src/models/ultravox/modeling_ultravox.js
27706
- var UltravoxPreTrainedModel = class extends PreTrainedModel {
27707
- forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
27708
- };
27709
- var UltravoxModel = class extends UltravoxPreTrainedModel {
27710
- _merge_input_ids_with_audio_features(kwargs) {
27711
- const audio_hidden_size = kwargs.audio_features.dims.at(-1);
27712
- const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
27713
- return default_merge_input_ids_with_audio_features({
27714
- // @ts-ignore
27715
- audio_token_id: this.config.ignore_index ?? this.config.audio_token_id,
27716
- ...kwargs,
27717
- audio_features: reshaped_audio_features
27718
- });
27719
- }
27720
- };
27721
- var VoxtralForConditionalGeneration = class extends UltravoxModel {
27722
- };
27723
-
27724
28852
  // src/models/unispeech/modeling_unispeech.js
27725
28853
  var UniSpeechPreTrainedModel = class extends PreTrainedModel {
27726
28854
  };
@@ -27886,6 +29014,170 @@ var VitsModel = class extends VitsPreTrainedModel {
27886
29014
  }
27887
29015
  };
27888
29016
 
29017
+ // src/models/voxtral/modeling_voxtral.js
29018
+ var VoxtralForConditionalGeneration = class extends UltravoxModel {
29019
+ };
29020
+
29021
+ // src/models/voxtral_realtime/modeling_voxtral_realtime.js
29022
+ var CONV1_LEFT_PAD = 2;
29023
+ var CONV2_LEFT_PAD = 1;
29024
+ var states = /* @__PURE__ */ new WeakMap();
29025
+ function createEncoderState(model, input_features) {
29026
+ const { text_config, audio_config } = (
29027
+ /** @type {any} */
29028
+ model.config
29029
+ );
29030
+ const encoder_session = model.sessions["audio_encoder"];
29031
+ const { num_mel_bins, hidden_size: enc_hidden_size } = audio_config;
29032
+ const PADDING_CACHE_CHANNELS = num_mel_bins + enc_hidden_size;
29033
+ const enc_kv_cache = new DynamicCache();
29034
+ const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? "float32";
29035
+ const enc_cls = enc_dtype === "float16" ? DataTypeMap.float16 : DataTypeMap.float32;
29036
+ const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
29037
+ for (const name in enc_shapes) {
29038
+ const size = enc_shapes[name].reduce((a, b) => a * b, 1);
29039
+ enc_kv_cache[name] = new Tensor2(enc_dtype, new enc_cls(size), enc_shapes[name]);
29040
+ }
29041
+ const enc_padding_cache = new Tensor2(enc_dtype, new enc_cls(PADDING_CACHE_CHANNELS * CONV1_LEFT_PAD), [
29042
+ 1,
29043
+ PADDING_CACHE_CHANNELS,
29044
+ CONV1_LEFT_PAD
29045
+ ]);
29046
+ const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
29047
+ if (!chunks_iter) {
29048
+ throw new Error("input_features must be iterable or async iterable");
29049
+ }
29050
+ return {
29051
+ encoder_session,
29052
+ enc_kv_cache,
29053
+ enc_padding_cache,
29054
+ enc_past_seq_len: 0,
29055
+ audio_embed_queue: [],
29056
+ audio_embed_total_tokens: 0,
29057
+ audio_queue_offset: 0,
29058
+ audio_consumed: 0,
29059
+ stream_exhausted: false,
29060
+ chunks_iter,
29061
+ text_hidden_size: text_config.hidden_size
29062
+ };
29063
+ }
29064
+ async function encodeChunk(s, chunk_features) {
29065
+ const audio_seq_len = chunk_features.dims[2];
29066
+ const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1;
29067
+ const position_ids = new Tensor2(
29068
+ "int64",
29069
+ BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
29070
+ [1, conv2_output_len]
29071
+ );
29072
+ const total_seq_len = s.enc_past_seq_len + conv2_output_len;
29073
+ const attention_mask = ones([1, total_seq_len]);
29074
+ const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
29075
+ input_features: chunk_features,
29076
+ attention_mask,
29077
+ position_ids,
29078
+ past_padding_cache: s.enc_padding_cache,
29079
+ ...s.enc_kv_cache
29080
+ });
29081
+ if (s.enc_padding_cache.location === "gpu-buffer") {
29082
+ s.enc_padding_cache.dispose();
29083
+ }
29084
+ s.enc_padding_cache = present_padding_cache;
29085
+ for (const name in present_cache) {
29086
+ if (name.startsWith("present.")) {
29087
+ const pastName = name.replace("present", "past_key_values");
29088
+ const prev = s.enc_kv_cache[pastName];
29089
+ if (prev?.location === "gpu-buffer") {
29090
+ prev.dispose();
29091
+ }
29092
+ s.enc_kv_cache[pastName] = present_cache[name];
29093
+ }
29094
+ }
29095
+ s.enc_past_seq_len = total_seq_len;
29096
+ return audio_embeds;
29097
+ }
29098
+ async function fillAudioBuffer(s, needed) {
29099
+ while (s.audio_embed_total_tokens < needed && !s.stream_exhausted) {
29100
+ const result = await s.chunks_iter.next();
29101
+ if (result.done) {
29102
+ s.stream_exhausted = true;
29103
+ break;
29104
+ }
29105
+ const new_embeds = await encodeChunk(s, result.value);
29106
+ s.audio_embed_queue.push({ data: new_embeds.data, tokens: new_embeds.dims[1] });
29107
+ s.audio_embed_total_tokens += new_embeds.dims[1];
29108
+ }
29109
+ }
29110
+ function addAudioEmbeddings(s, inputs_embeds, current_len) {
29111
+ if (s.audio_embed_queue.length === 0) return;
29112
+ const embed_data = inputs_embeds.data;
29113
+ let embed_write_pos = 0;
29114
+ let remaining = current_len;
29115
+ while (remaining > 0 && s.audio_embed_queue.length > 0) {
29116
+ const front = s.audio_embed_queue[0];
29117
+ const available = front.tokens - s.audio_queue_offset;
29118
+ const n = Math.min(remaining, available);
29119
+ const src_offset = s.audio_queue_offset * s.text_hidden_size;
29120
+ for (let i = 0; i < n * s.text_hidden_size; ++i) {
29121
+ embed_data[embed_write_pos * s.text_hidden_size + i] += front.data[src_offset + i];
29122
+ }
29123
+ embed_write_pos += n;
29124
+ remaining -= n;
29125
+ s.audio_queue_offset += n;
29126
+ if (s.audio_queue_offset >= front.tokens) {
29127
+ s.audio_embed_queue.shift();
29128
+ s.audio_queue_offset = 0;
29129
+ }
29130
+ }
29131
+ s.audio_consumed += current_len - remaining;
29132
+ }
29133
+ var AudioExhaustedCriteria = class extends StoppingCriteria {
29134
+ constructor(enc_state) {
29135
+ super();
29136
+ this._s = enc_state;
29137
+ }
29138
+ _call(input_ids) {
29139
+ const done = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
29140
+ return input_ids.map(() => done);
29141
+ }
29142
+ };
29143
+ var VoxtralRealtimePreTrainedModel = class extends PreTrainedModel {
29144
+ forward_params = ["input_ids", "attention_mask", "position_ids", "past_key_values"];
29145
+ };
29146
+ var VoxtralRealtimeForConditionalGeneration = class extends VoxtralRealtimePreTrainedModel {
29147
+ async forward({ input_ids, past_key_values, ...kwargs }) {
29148
+ const current_len = input_ids.dims[1];
29149
+ const enc = states.get(this);
29150
+ if (enc) {
29151
+ await fillAudioBuffer(enc, enc.audio_consumed + current_len);
29152
+ }
29153
+ const { inputs_embeds } = await sessionRun(this.sessions["embed_tokens"], { input_ids });
29154
+ if (enc) {
29155
+ addAudioEmbeddings(enc, inputs_embeds, current_len);
29156
+ }
29157
+ const decoder_feeds = { inputs_embeds, ...kwargs };
29158
+ this.addPastKeyValues(decoder_feeds, past_key_values);
29159
+ const session = this.sessions["decoder_model_merged"];
29160
+ const fixed = pick(decoder_feeds, session.inputNames);
29161
+ return await sessionRun(session, fixed);
29162
+ }
29163
+ async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
29164
+ if (!input_features) {
29165
+ throw new Error("input_features (generator/iterable) must be provided");
29166
+ }
29167
+ const enc_state = createEncoderState(this, input_features);
29168
+ states.set(this, enc_state);
29169
+ const stopping_criteria = new StoppingCriteriaList();
29170
+ stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
29171
+ if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);
29172
+ try {
29173
+ return await super.generate({ ...kwargs, stopping_criteria });
29174
+ } finally {
29175
+ enc_state.enc_kv_cache.dispose();
29176
+ states.delete(this);
29177
+ }
29178
+ }
29179
+ };
29180
+
27889
29181
  // src/models/wav2vec2_bert/modeling_wav2vec2_bert.js
27890
29182
  var Wav2Vec2BertPreTrainedModel = class extends PreTrainedModel {
27891
29183
  };
@@ -28391,6 +29683,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
28391
29683
  // src/models/registry.js
28392
29684
  var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
28393
29685
  ["bert", "BertModel"],
29686
+ ["eurobert", "EuroBertModel"],
28394
29687
  ["neobert", "NeoBertModel"],
28395
29688
  ["modernbert", "ModernBertModel"],
28396
29689
  ["nomic_bert", "NomicBertModel"],
@@ -28522,6 +29815,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
28522
29815
  ["gemma3_text", "Gemma3Model"],
28523
29816
  ["helium", "HeliumModel"],
28524
29817
  ["glm", "GlmModel"],
29818
+ ["glm_moe_dsa", "GlmMoeDsaModel"],
28525
29819
  ["openelm", "OpenELMModel"],
28526
29820
  ["qwen2", "Qwen2Model"],
28527
29821
  ["qwen2_moe", "Qwen2MoeModel"],
@@ -28533,12 +29827,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
28533
29827
  ["mpt", "MptModel"],
28534
29828
  ["opt", "OPTModel"],
28535
29829
  ["mistral", "MistralModel"],
29830
+ ["mistral4", "Mistral4Model"],
28536
29831
  ["ministral", "MinistralModel"],
28537
29832
  ["ministral3", "Ministral3Model"],
28538
29833
  ["ernie4_5", "Ernie4_5ForCausalLM"],
28539
29834
  ["starcoder2", "Starcoder2Model"],
29835
+ ["deepseek_v3", "DeepseekV3Model"],
28540
29836
  ["falcon", "FalconModel"],
28541
29837
  ["falcon_h1", "FalconH1Model"],
29838
+ ["nemotron_h", "NemotronHModel"],
29839
+ ["solar_open", "SolarOpenModel"],
28542
29840
  ["stablelm", "StableLmModel"],
28543
29841
  ["modernbert-decoder", "ModernBertDecoderModel"],
28544
29842
  ["hunyuan_v1_dense", "HunYuanDenseV1Model"],
@@ -28558,6 +29856,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
28558
29856
  ]);
28559
29857
  var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
28560
29858
  ["bert", "BertForSequenceClassification"],
29859
+ ["eurobert", "EuroBertForSequenceClassification"],
28561
29860
  ["neobert", "NeoBertForSequenceClassification"],
28562
29861
  ["modernbert", "ModernBertForSequenceClassification"],
28563
29862
  ["roformer", "RoFormerForSequenceClassification"],
@@ -28580,6 +29879,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
28580
29879
  ]);
28581
29880
  var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
28582
29881
  ["bert", "BertForTokenClassification"],
29882
+ ["eurobert", "EuroBertForTokenClassification"],
28583
29883
  ["neobert", "NeoBertForTokenClassification"],
28584
29884
  ["modernbert", "ModernBertForTokenClassification"],
28585
29885
  ["roformer", "RoFormerForTokenClassification"],
@@ -28639,27 +29939,40 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
28639
29939
  ["gemma2", "Gemma2ForCausalLM"],
28640
29940
  ["vaultgemma", "VaultGemmaForCausalLM"],
28641
29941
  ["gemma3_text", "Gemma3ForCausalLM"],
29942
+ ["gemma3", "Gemma3ForCausalLM"],
28642
29943
  ["helium", "HeliumForCausalLM"],
28643
29944
  ["glm", "GlmForCausalLM"],
29945
+ ["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
28644
29946
  ["openelm", "OpenELMForCausalLM"],
28645
29947
  ["qwen2", "Qwen2ForCausalLM"],
28646
29948
  ["qwen2_moe", "Qwen2MoeForCausalLM"],
28647
29949
  ["qwen3", "Qwen3ForCausalLM"],
28648
29950
  ["qwen3_moe", "Qwen3MoeForCausalLM"],
28649
29951
  ["qwen3_next", "Qwen3NextForCausalLM"],
29952
+ ["qwen2_vl", "Qwen2VLForCausalLM"],
29953
+ ["qwen2_5_vl", "Qwen2_5_VLForCausalLM"],
29954
+ ["qwen3_vl", "Qwen3VLForCausalLM"],
29955
+ ["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
29956
+ ["qwen3_5", "Qwen3_5ForCausalLM"],
29957
+ ["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
29958
+ ["gemma3n", "Gemma3nForCausalLM"],
28650
29959
  ["phi", "PhiForCausalLM"],
28651
29960
  ["phi3", "Phi3ForCausalLM"],
28652
29961
  ["mpt", "MptForCausalLM"],
28653
29962
  ["opt", "OPTForCausalLM"],
28654
29963
  ["mbart", "MBartForCausalLM"],
28655
29964
  ["mistral", "MistralForCausalLM"],
29965
+ ["mistral4", "Mistral4ForCausalLM"],
28656
29966
  ["ministral", "MinistralForCausalLM"],
28657
29967
  ["ministral3", "Ministral3ForCausalLM"],
28658
29968
  ["ernie4_5", "Ernie4_5ForCausalLM"],
28659
29969
  ["starcoder2", "Starcoder2ForCausalLM"],
29970
+ ["deepseek_v3", "DeepseekV3ForCausalLM"],
28660
29971
  ["falcon", "FalconForCausalLM"],
28661
29972
  ["falcon_h1", "FalconH1ForCausalLM"],
29973
+ ["nemotron_h", "NemotronHForCausalLM"],
28662
29974
  ["trocr", "TrOCRForCausalLM"],
29975
+ ["solar_open", "SolarOpenForCausalLM"],
28663
29976
  ["stablelm", "StableLmForCausalLM"],
28664
29977
  ["modernbert-decoder", "ModernBertDecoderForCausalLM"],
28665
29978
  ["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
@@ -28670,6 +29983,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
28670
29983
  var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
28671
29984
  var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
28672
29985
  ["bert", "BertForMaskedLM"],
29986
+ ["eurobert", "EuroBertForMaskedLM"],
28673
29987
  ["neobert", "NeoBertForMaskedLM"],
28674
29988
  ["modernbert", "ModernBertForMaskedLM"],
28675
29989
  ["roformer", "RoFormerForMaskedLM"],
@@ -28722,16 +30036,21 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
28722
30036
  ["qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"],
28723
30037
  ["qwen3_5", "Qwen3_5ForConditionalGeneration"],
28724
30038
  ["qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"],
30039
+ ["lfm2_vl", "Lfm2VlForConditionalGeneration"],
28725
30040
  ["idefics3", "Idefics3ForConditionalGeneration"],
28726
30041
  ["smolvlm", "SmolVLMForConditionalGeneration"],
28727
30042
  ["paligemma", "PaliGemmaForConditionalGeneration"],
28728
30043
  ["llava_qwen2", "LlavaQwen2ForCausalLM"],
28729
30044
  ["gemma3n", "Gemma3nForConditionalGeneration"],
28730
- ["mistral3", "Mistral3ForConditionalGeneration"]
30045
+ ["mistral3", "Mistral3ForConditionalGeneration"],
30046
+ ["lighton_ocr", "LightOnOcrForConditionalGeneration"],
30047
+ ["glm_ocr", "GlmOcrForConditionalGeneration"]
28731
30048
  ]);
28732
30049
  var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
30050
+ ["granite_speech", "GraniteSpeechForConditionalGeneration"],
28733
30051
  ["ultravox", "UltravoxModel"],
28734
- ["voxtral", "VoxtralForConditionalGeneration"]
30052
+ ["voxtral", "VoxtralForConditionalGeneration"],
30053
+ ["voxtral_realtime", "VoxtralRealtimeForConditionalGeneration"]
28735
30054
  ]);
28736
30055
  var MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = /* @__PURE__ */ new Map([
28737
30056
  ["vision-encoder-decoder", "VisionEncoderDecoderModel"]
@@ -28830,6 +30149,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
28830
30149
  ]);
28831
30150
  var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
28832
30151
  var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30152
+ ["chmv2", "CHMv2ForDepthEstimation"],
28833
30153
  ["dpt", "DPTForDepthEstimation"],
28834
30154
  ["depth_anything", "DepthAnythingForDepthEstimation"],
28835
30155
  ["glpn", "GLPNForDepthEstimation"],
@@ -28914,7 +30234,19 @@ var CUSTOM_MAPPING = [
28914
30234
  MODEL_TYPES.ImageAudioTextToText
28915
30235
  ],
28916
30236
  ["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
28917
- ["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox]
30237
+ ["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
30238
+ ["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30239
+ ["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30240
+ ["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30241
+ ["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30242
+ ["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30243
+ ["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30244
+ ["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
30245
+ [
30246
+ "VoxtralRealtimeForConditionalGeneration",
30247
+ VoxtralRealtimeForConditionalGeneration,
30248
+ MODEL_TYPES.VoxtralRealtime
30249
+ ]
28918
30250
  ];
28919
30251
  for (const [name, model, type] of CUSTOM_MAPPING) {
28920
30252
  MODEL_TYPE_MAPPING.set(name, type);
@@ -30592,8 +31924,18 @@ var TASK_ALIASES = Object.freeze({
30592
31924
  });
30593
31925
 
30594
31926
  // src/utils/model_registry/get_model_files.js
31927
+ function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
31928
+ if (config !== null) {
31929
+ return AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision });
31930
+ }
31931
+ const key = JSON.stringify([modelId, cache_dir, local_files_only, revision]);
31932
+ return memoizePromise(
31933
+ key,
31934
+ () => AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision })
31935
+ );
31936
+ }
30595
31937
  async function get_model_files(modelId, { config = null, dtype: overrideDtype = null, device: overrideDevice = null, model_file_name = null } = {}) {
30596
- config = await AutoConfig.from_pretrained(modelId, { config });
31938
+ config = await get_config(modelId, { config });
30597
31939
  const files = [
30598
31940
  // Add config.json (always loaded)
30599
31941
  "config.json"
@@ -30654,74 +31996,14 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
30654
31996
  files.push(dataFilePath);
30655
31997
  }
30656
31998
  };
30657
- const singleModelName = model_file_name ?? "model";
30658
- if (modelType === MODEL_TYPES.DecoderOnly) {
30659
- add_model_file("model", singleModelName);
30660
- files.push("generation_config.json");
30661
- } else if (modelType === MODEL_TYPES.DecoderOnlyWithoutHead) {
30662
- add_model_file("model", singleModelName);
30663
- } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
30664
- add_model_file("model", "encoder_model");
30665
- add_model_file("decoder_model_merged");
30666
- files.push("generation_config.json");
30667
- } else if (modelType === MODEL_TYPES.MaskGeneration) {
30668
- add_model_file("model", "vision_encoder");
30669
- add_model_file("prompt_encoder_mask_decoder");
30670
- } else if (modelType === MODEL_TYPES.EncoderDecoder) {
30671
- add_model_file("model", "encoder_model");
30672
- add_model_file("decoder_model_merged");
30673
- } else if (modelType === MODEL_TYPES.ImageTextToText) {
30674
- add_model_file("embed_tokens");
30675
- add_model_file("vision_encoder");
30676
- add_model_file("decoder_model_merged");
30677
- if (config.is_encoder_decoder) {
30678
- add_model_file("model", "encoder_model");
30679
- }
30680
- files.push("generation_config.json");
30681
- } else if (modelType === MODEL_TYPES.AudioTextToText) {
30682
- add_model_file("embed_tokens");
30683
- add_model_file("audio_encoder");
30684
- add_model_file("decoder_model_merged");
30685
- files.push("generation_config.json");
30686
- } else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
30687
- add_model_file("embed_tokens");
30688
- add_model_file("audio_encoder");
30689
- add_model_file("vision_encoder");
30690
- add_model_file("decoder_model_merged");
30691
- files.push("generation_config.json");
30692
- } else if (modelType === MODEL_TYPES.Musicgen) {
30693
- add_model_file("model", "text_encoder");
30694
- add_model_file("decoder_model_merged");
30695
- add_model_file("encodec_decode");
30696
- files.push("generation_config.json");
30697
- } else if (modelType === MODEL_TYPES.MultiModality) {
30698
- add_model_file("prepare_inputs_embeds");
30699
- add_model_file("model", "language_model");
30700
- add_model_file("lm_head");
30701
- add_model_file("gen_head");
30702
- add_model_file("gen_img_embeds");
30703
- add_model_file("image_decode");
30704
- files.push("generation_config.json");
30705
- } else if (modelType === MODEL_TYPES.Phi3V) {
30706
- add_model_file("prepare_inputs_embeds");
30707
- add_model_file("model");
30708
- add_model_file("vision_encoder");
30709
- files.push("generation_config.json");
30710
- } else if (modelType === MODEL_TYPES.Chatterbox) {
30711
- add_model_file("embed_tokens");
30712
- add_model_file("speech_encoder");
30713
- add_model_file("model", "language_model");
30714
- add_model_file("conditional_decoder");
30715
- files.push("generation_config.json");
30716
- } else if (modelType === MODEL_TYPES.AutoEncoder) {
30717
- add_model_file("encoder_model");
30718
- add_model_file("decoder_model");
30719
- } else if (modelType === MODEL_TYPES.Supertonic) {
30720
- add_model_file("text_encoder");
30721
- add_model_file("latent_denoiser");
30722
- add_model_file("voice_decoder");
30723
- } else {
30724
- add_model_file("model", singleModelName);
31999
+ const { sessions, optional_configs } = getSessionsConfig(modelType, config, { model_file_name });
32000
+ for (const [sessionKey, baseName] of Object.entries(sessions)) {
32001
+ add_model_file(sessionKey, baseName);
32002
+ }
32003
+ if (optional_configs) {
32004
+ for (const configFile of Object.values(optional_configs)) {
32005
+ files.push(configFile);
32006
+ }
30725
32007
  }
30726
32008
  return files;
30727
32009
  }
@@ -31172,25 +32454,25 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
31172
32454
 
31173
32455
  // src/utils/model_registry/is_cached.js
31174
32456
  async function check_files_cache(modelId, files, options = {}) {
31175
- const cache = await getCache(options?.cache_dir);
31176
- if (!cache) {
32457
+ const cache2 = await getCache(options?.cache_dir);
32458
+ if (!cache2) {
31177
32459
  const fileStatuses2 = files.map((filename) => ({ file: filename, cached: false }));
31178
32460
  return { allCached: false, files: fileStatuses2 };
31179
32461
  }
31180
32462
  const fileStatuses = await Promise.all(
31181
32463
  files.map(async (filename) => {
31182
- const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache);
31183
- const cached = await checkCachedResource(cache, localPath, proposedCacheKey);
32464
+ const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
32465
+ const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
31184
32466
  return { file: filename, cached: !!cached };
31185
32467
  })
31186
32468
  );
31187
32469
  return { allCached: fileStatuses.every((f) => f.cached), files: fileStatuses };
31188
32470
  }
31189
32471
  async function is_file_cached(modelId, filename, options = {}) {
31190
- const cache = await getCache(options?.cache_dir);
31191
- if (!cache) return false;
31192
- const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache);
31193
- return !!await checkCachedResource(cache, localPath, proposedCacheKey);
32472
+ const cache2 = await getCache(options?.cache_dir);
32473
+ if (!cache2) return false;
32474
+ const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
32475
+ return !!await checkCachedResource(cache2, localPath, proposedCacheKey);
31194
32476
  }
31195
32477
  async function is_cached(modelId, options = {}) {
31196
32478
  if (!modelId) {
@@ -31237,26 +32519,26 @@ async function is_pipeline_cached_files(task, modelId, options = {}) {
31237
32519
 
31238
32520
  // src/utils/model_registry/clear_cache.js
31239
32521
  async function clear_files_from_cache(modelId, files, options = {}) {
31240
- const cache = await getCache(options?.cache_dir);
31241
- if (!cache) {
32522
+ const cache2 = await getCache(options?.cache_dir);
32523
+ if (!cache2) {
31242
32524
  return {
31243
32525
  filesDeleted: 0,
31244
32526
  filesCached: 0,
31245
32527
  files: files.map((filename) => ({ file: filename, deleted: false, wasCached: false }))
31246
32528
  };
31247
32529
  }
31248
- if (!cache.delete) {
32530
+ if (!cache2.delete) {
31249
32531
  throw new Error("Cache does not support delete operation");
31250
32532
  }
31251
32533
  const results = await Promise.all(
31252
32534
  files.map(async (filename) => {
31253
- const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache);
31254
- const cached = await checkCachedResource(cache, localPath, proposedCacheKey);
32535
+ const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
32536
+ const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
31255
32537
  const wasCached = !!cached;
31256
32538
  let deleted = false;
31257
32539
  if (wasCached) {
31258
- const deletedWithProposed = await cache.delete(proposedCacheKey);
31259
- const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache.delete(localPath) : false;
32540
+ const deletedWithProposed = await cache2.delete(proposedCacheKey);
32541
+ const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache2.delete(localPath) : false;
31260
32542
  deleted = deletedWithProposed || deletedWithLocal;
31261
32543
  }
31262
32544
  return { file: filename, deleted, wasCached };
@@ -31606,6 +32888,9 @@ export {
31606
32888
  BloomModel,
31607
32889
  BloomPreTrainedModel,
31608
32890
  BloomTokenizer,
32891
+ CHMv2ForDepthEstimation,
32892
+ CHMv2ImageProcessor,
32893
+ CHMv2PreTrainedModel,
31609
32894
  CLIPFeatureExtractor,
31610
32895
  CLIPImageProcessor,
31611
32896
  CLIPModel,
@@ -31701,6 +32986,9 @@ export {
31701
32986
  DebertaV2Tokenizer,
31702
32987
  DecisionTransformerModel,
31703
32988
  DecisionTransformerPreTrainedModel,
32989
+ DeepseekV3ForCausalLM,
32990
+ DeepseekV3Model,
32991
+ DeepseekV3PreTrainedModel,
31704
32992
  DeiTFeatureExtractor,
31705
32993
  DeiTForImageClassification,
31706
32994
  DeiTImageProcessor,
@@ -31737,6 +33025,7 @@ export {
31737
33025
  DonutImageProcessor,
31738
33026
  DonutSwinModel,
31739
33027
  DonutSwinPreTrainedModel,
33028
+ DynamicCache,
31740
33029
  EdgeTamModel,
31741
33030
  EfficientNetForImageClassification,
31742
33031
  EfficientNetImageProcessor,
@@ -31760,6 +33049,11 @@ export {
31760
33049
  EsmModel,
31761
33050
  EsmPreTrainedModel,
31762
33051
  EsmTokenizer,
33052
+ EuroBertForMaskedLM,
33053
+ EuroBertForSequenceClassification,
33054
+ EuroBertForTokenClassification,
33055
+ EuroBertModel,
33056
+ EuroBertPreTrainedModel,
31763
33057
  ExaoneForCausalLM,
31764
33058
  ExaoneModel,
31765
33059
  ExaonePreTrainedModel,
@@ -31809,6 +33103,7 @@ export {
31809
33103
  Gemma3Model,
31810
33104
  Gemma3PreTrainedModel,
31811
33105
  Gemma3nAudioFeatureExtractor,
33106
+ Gemma3nForCausalLM,
31812
33107
  Gemma3nForConditionalGeneration,
31813
33108
  Gemma3nPreTrainedModel,
31814
33109
  Gemma3nProcessor,
@@ -31816,8 +33111,14 @@ export {
31816
33111
  GemmaModel,
31817
33112
  GemmaPreTrainedModel,
31818
33113
  GemmaTokenizer,
33114
+ Glm46VImageProcessor,
33115
+ Glm46VProcessor,
31819
33116
  GlmForCausalLM,
31820
33117
  GlmModel,
33118
+ GlmMoeDsaForCausalLM,
33119
+ GlmMoeDsaModel,
33120
+ GlmMoeDsaPreTrainedModel,
33121
+ GlmOcrForConditionalGeneration,
31821
33122
  GlmPreTrainedModel,
31822
33123
  GptOssForCausalLM,
31823
33124
  GptOssModel,
@@ -31828,6 +33129,9 @@ export {
31828
33129
  GraniteMoeHybridModel,
31829
33130
  GraniteMoeHybridPreTrainedModel,
31830
33131
  GranitePreTrainedModel,
33132
+ GraniteSpeechFeatureExtractor,
33133
+ GraniteSpeechForConditionalGeneration,
33134
+ GraniteSpeechProcessor,
31831
33135
  GroundingDinoForObjectDetection,
31832
33136
  GroundingDinoImageProcessor,
31833
33137
  GroundingDinoPreTrainedModel,
@@ -31853,7 +33157,6 @@ export {
31853
33157
  IJepaPreTrainedModel,
31854
33158
  Idefics3ForConditionalGeneration,
31855
33159
  Idefics3ImageProcessor,
31856
- Idefics3PreTrainedModel,
31857
33160
  Idefics3Processor,
31858
33161
  ImageClassificationPipeline,
31859
33162
  ImageFeatureExtractionPipeline,
@@ -31878,6 +33181,10 @@ export {
31878
33181
  Lfm2MoeModel,
31879
33182
  Lfm2MoePreTrainedModel,
31880
33183
  Lfm2PreTrainedModel,
33184
+ Lfm2VlForConditionalGeneration,
33185
+ Lfm2VlImageProcessor,
33186
+ Lfm2VlProcessor,
33187
+ LightOnOcrForConditionalGeneration,
31881
33188
  LiteWhisperForConditionalGeneration,
31882
33189
  Llama4ForCausalLM,
31883
33190
  Llama4PreTrainedModel,
@@ -31947,6 +33254,9 @@ export {
31947
33254
  MimiPreTrainedModel,
31948
33255
  MinLengthLogitsProcessor,
31949
33256
  MinNewTokensLengthLogitsProcessor,
33257
+ Mistral4ForCausalLM,
33258
+ Mistral4Model,
33259
+ Mistral4PreTrainedModel,
31950
33260
  MistralForCausalLM,
31951
33261
  MistralModel,
31952
33262
  MistralPreTrainedModel,
@@ -32018,6 +33328,9 @@ export {
32018
33328
  NanoChatForCausalLM,
32019
33329
  NanoChatModel,
32020
33330
  NanoChatPreTrainedModel,
33331
+ NemotronHForCausalLM,
33332
+ NemotronHModel,
33333
+ NemotronHPreTrainedModel,
32021
33334
  NeoBertForMaskedLM,
32022
33335
  NeoBertForQuestionAnswering,
32023
33336
  NeoBertForSequenceClassification,
@@ -32061,7 +33374,6 @@ export {
32061
33374
  Owlv2Model,
32062
33375
  Owlv2PreTrainedModel,
32063
33376
  PaliGemmaForConditionalGeneration,
32064
- PaliGemmaPreTrainedModel,
32065
33377
  PaliGemmaProcessor,
32066
33378
  ParakeetFeatureExtractor,
32067
33379
  ParakeetForCTC,
@@ -32105,10 +33417,12 @@ export {
32105
33417
  Qwen2MoePreTrainedModel,
32106
33418
  Qwen2PreTrainedModel,
32107
33419
  Qwen2Tokenizer,
33420
+ Qwen2VLForCausalLM,
32108
33421
  Qwen2VLForConditionalGeneration,
32109
33422
  Qwen2VLImageProcessor,
32110
33423
  Qwen2VLPreTrainedModel,
32111
33424
  Qwen2VLProcessor,
33425
+ Qwen2_5_VLForCausalLM,
32112
33426
  Qwen2_5_VLForConditionalGeneration,
32113
33427
  Qwen2_5_VLProcessor,
32114
33428
  Qwen3ForCausalLM,
@@ -32120,10 +33434,14 @@ export {
32120
33434
  Qwen3NextModel,
32121
33435
  Qwen3NextPreTrainedModel,
32122
33436
  Qwen3PreTrainedModel,
33437
+ Qwen3VLForCausalLM,
32123
33438
  Qwen3VLForConditionalGeneration,
33439
+ Qwen3VLMoeForCausalLM,
32124
33440
  Qwen3VLMoeForConditionalGeneration,
32125
33441
  Qwen3VLProcessor,
33442
+ Qwen3_5ForCausalLM,
32126
33443
  Qwen3_5ForConditionalGeneration,
33444
+ Qwen3_5MoeForCausalLM,
32127
33445
  Qwen3_5MoeForConditionalGeneration,
32128
33446
  RFDetrForObjectDetection,
32129
33447
  RFDetrModel,
@@ -32195,7 +33513,6 @@ export {
32195
33513
  SmolLM3ForCausalLM,
32196
33514
  SmolLM3Model,
32197
33515
  SmolLM3PreTrainedModel,
32198
- SmolVLMForConditionalGeneration,
32199
33516
  Idefics3ImageProcessor as SmolVLMImageProcessor,
32200
33517
  Idefics3Processor as SmolVLMProcessor,
32201
33518
  SnacDecoderModel,
@@ -32203,6 +33520,9 @@ export {
32203
33520
  SnacFeatureExtractor,
32204
33521
  SnacModel,
32205
33522
  SnacPreTrainedModel,
33523
+ SolarOpenForCausalLM,
33524
+ SolarOpenModel,
33525
+ SolarOpenPreTrainedModel,
32206
33526
  SpeechT5FeatureExtractor,
32207
33527
  SpeechT5ForSpeechToText,
32208
33528
  SpeechT5ForTextToSpeech,
@@ -32301,6 +33621,10 @@ export {
32301
33621
  VitsTokenizer,
32302
33622
  VoxtralForConditionalGeneration,
32303
33623
  VoxtralProcessor,
33624
+ VoxtralRealtimeFeatureExtractor,
33625
+ VoxtralRealtimeForConditionalGeneration,
33626
+ VoxtralRealtimePreTrainedModel,
33627
+ VoxtralRealtimeProcessor,
32304
33628
  Wav2Vec2BertForCTC,
32305
33629
  Wav2Vec2BertForSequenceClassification,
32306
33630
  Wav2Vec2BertModel,
@@ -32396,7 +33720,7 @@ export {
32396
33720
 
32397
33721
  onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
32398
33722
  (*!
32399
- * ONNX Runtime Web v1.25.0-dev.20260303-e7e64dc112
33723
+ * ONNX Runtime Web v1.25.0-dev.20260307-d626b568e0
32400
33724
  * Copyright (c) Microsoft Corporation. All rights reserved.
32401
33725
  * Licensed under the MIT License.
32402
33726
  *)