npm - @huggingface/transformers - Versions diffs - 4.0.0-next.6 → 4.0.0-next.8 - Mend

@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (194) hide show

package/dist/transformers.node.cjs CHANGED Viewed

@@ -117,6 +117,9 @@ __export(transformers_exports, {
   BloomModel: () => BloomModel,
   BloomPreTrainedModel: () => BloomPreTrainedModel,
   BloomTokenizer: () => BloomTokenizer,
+  CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
+  CHMv2ImageProcessor: () => CHMv2ImageProcessor,
+  CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
   CLIPFeatureExtractor: () => CLIPFeatureExtractor,
   CLIPImageProcessor: () => CLIPImageProcessor,
   CLIPModel: () => CLIPModel,
@@ -212,6 +215,9 @@ __export(transformers_exports, {
   DebertaV2Tokenizer: () => DebertaV2Tokenizer,
   DecisionTransformerModel: () => DecisionTransformerModel,
   DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
+  DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
+  DeepseekV3Model: () => DeepseekV3Model,
+  DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
   DeiTFeatureExtractor: () => DeiTFeatureExtractor,
   DeiTForImageClassification: () => DeiTForImageClassification,
   DeiTImageProcessor: () => DeiTImageProcessor,
@@ -248,6 +254,7 @@ __export(transformers_exports, {
   DonutImageProcessor: () => DonutImageProcessor,
   DonutSwinModel: () => DonutSwinModel,
   DonutSwinPreTrainedModel: () => DonutSwinPreTrainedModel,
+  DynamicCache: () => DynamicCache,
   EdgeTamModel: () => EdgeTamModel,
   EfficientNetForImageClassification: () => EfficientNetForImageClassification,
   EfficientNetImageProcessor: () => EfficientNetImageProcessor,
@@ -271,6 +278,11 @@ __export(transformers_exports, {
   EsmModel: () => EsmModel,
   EsmPreTrainedModel: () => EsmPreTrainedModel,
   EsmTokenizer: () => EsmTokenizer,
+  EuroBertForMaskedLM: () => EuroBertForMaskedLM,
+  EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
+  EuroBertForTokenClassification: () => EuroBertForTokenClassification,
+  EuroBertModel: () => EuroBertModel,
+  EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
   ExaoneForCausalLM: () => ExaoneForCausalLM,
   ExaoneModel: () => ExaoneModel,
   ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -320,6 +332,7 @@ __export(transformers_exports, {
   Gemma3Model: () => Gemma3Model,
   Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
   Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
+  Gemma3nForCausalLM: () => Gemma3nForCausalLM,
   Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
   Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
   Gemma3nProcessor: () => Gemma3nProcessor,
@@ -327,8 +340,14 @@ __export(transformers_exports, {
   GemmaModel: () => GemmaModel,
   GemmaPreTrainedModel: () => GemmaPreTrainedModel,
   GemmaTokenizer: () => GemmaTokenizer,
+  Glm46VImageProcessor: () => Glm46VImageProcessor,
+  Glm46VProcessor: () => Glm46VProcessor,
   GlmForCausalLM: () => GlmForCausalLM,
   GlmModel: () => GlmModel,
+  GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
+  GlmMoeDsaModel: () => GlmMoeDsaModel,
+  GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
+  GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
   GlmPreTrainedModel: () => GlmPreTrainedModel,
   GptOssForCausalLM: () => GptOssForCausalLM,
   GptOssModel: () => GptOssModel,
@@ -339,6 +358,9 @@ __export(transformers_exports, {
   GraniteMoeHybridModel: () => GraniteMoeHybridModel,
   GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
   GranitePreTrainedModel: () => GranitePreTrainedModel,
+  GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
+  GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
+  GraniteSpeechProcessor: () => GraniteSpeechProcessor,
   GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
   GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
   GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
@@ -364,7 +386,6 @@ __export(transformers_exports, {
   IJepaPreTrainedModel: () => IJepaPreTrainedModel,
   Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
   Idefics3ImageProcessor: () => Idefics3ImageProcessor,
-  Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
   Idefics3Processor: () => Idefics3Processor,
   ImageClassificationPipeline: () => ImageClassificationPipeline,
   ImageFeatureExtractionPipeline: () => ImageFeatureExtractionPipeline,
@@ -389,6 +410,10 @@ __export(transformers_exports, {
   Lfm2MoeModel: () => Lfm2MoeModel,
   Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
   Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
+  Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
+  Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
+  Lfm2VlProcessor: () => Lfm2VlProcessor,
+  LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
   LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
   Llama4ForCausalLM: () => Llama4ForCausalLM,
   Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -458,6 +483,9 @@ __export(transformers_exports, {
   MimiPreTrainedModel: () => MimiPreTrainedModel,
   MinLengthLogitsProcessor: () => MinLengthLogitsProcessor,
   MinNewTokensLengthLogitsProcessor: () => MinNewTokensLengthLogitsProcessor,
+  Mistral4ForCausalLM: () => Mistral4ForCausalLM,
+  Mistral4Model: () => Mistral4Model,
+  Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
   MistralForCausalLM: () => MistralForCausalLM,
   MistralModel: () => MistralModel,
   MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -529,6 +557,9 @@ __export(transformers_exports, {
   NanoChatForCausalLM: () => NanoChatForCausalLM,
   NanoChatModel: () => NanoChatModel,
   NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
+  NemotronHForCausalLM: () => NemotronHForCausalLM,
+  NemotronHModel: () => NemotronHModel,
+  NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
   NeoBertForMaskedLM: () => NeoBertForMaskedLM,
   NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
   NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -572,7 +603,6 @@ __export(transformers_exports, {
   Owlv2Model: () => Owlv2Model,
   Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
   PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
-  PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
   PaliGemmaProcessor: () => PaliGemmaProcessor,
   ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
   ParakeetForCTC: () => ParakeetForCTC,
@@ -616,10 +646,12 @@ __export(transformers_exports, {
   Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
   Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
   Qwen2Tokenizer: () => Qwen2Tokenizer,
+  Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
   Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
   Qwen2VLImageProcessor: () => Qwen2VLImageProcessor,
   Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
   Qwen2VLProcessor: () => Qwen2VLProcessor,
+  Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
   Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
   Qwen2_5_VLProcessor: () => Qwen2_5_VLProcessor,
   Qwen3ForCausalLM: () => Qwen3ForCausalLM,
@@ -631,10 +663,14 @@ __export(transformers_exports, {
   Qwen3NextModel: () => Qwen3NextModel,
   Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
   Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
+  Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
   Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
+  Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
   Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
   Qwen3VLProcessor: () => Qwen3VLProcessor,
+  Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
   Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
+  Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
   Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
   RFDetrForObjectDetection: () => RFDetrForObjectDetection,
   RFDetrModel: () => RFDetrModel,
@@ -706,7 +742,6 @@ __export(transformers_exports, {
   SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
   SmolLM3Model: () => SmolLM3Model,
   SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
-  SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
   SmolVLMImageProcessor: () => Idefics3ImageProcessor,
   SmolVLMProcessor: () => Idefics3Processor,
   SnacDecoderModel: () => SnacDecoderModel,
@@ -714,6 +749,9 @@ __export(transformers_exports, {
   SnacFeatureExtractor: () => SnacFeatureExtractor,
   SnacModel: () => SnacModel,
   SnacPreTrainedModel: () => SnacPreTrainedModel,
+  SolarOpenForCausalLM: () => SolarOpenForCausalLM,
+  SolarOpenModel: () => SolarOpenModel,
+  SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
   SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
   SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
   SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
@@ -812,6 +850,10 @@ __export(transformers_exports, {
   VitsTokenizer: () => VitsTokenizer,
   VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
   VoxtralProcessor: () => VoxtralProcessor,
+  VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
+  VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
+  VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
+  VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
   Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
   Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
   Wav2Vec2BertModel: () => Wav2Vec2BertModel,
@@ -910,7 +952,7 @@ var import_node_fs = __toESM(require("fs"), 1);
 var import_node_path = __toESM(require("path"), 1);
 var import_node_url = __toESM(require("url"), 1);
 var import_meta = {};
-var VERSION = "4.0.0-next.6";
+var VERSION = "4.0.0-next.8";
 var HAS_SELF = typeof self !== "undefined";
 var IS_FS_AVAILABLE = !isEmpty(import_node_fs.default);
 var IS_PATH_AVAILABLE = !isEmpty(import_node_path.default);
@@ -1038,6 +1080,7 @@ var env = {
   customCache: null,
   useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
   cacheKey: "transformers-cache",
+  experimental_useCrossOriginStorage: false,
   /////////////////// Custom fetch /////////////////////
   fetch: DEFAULT_FETCH
   //////////////////////////////////////////////////////
@@ -1139,7 +1182,7 @@ var logger = {
   }
 };
-// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.2/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
+// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
 var DictionarySplitter = class {
   /**
    * @param dictionary The dictionary of words to use for splitting.
@@ -2795,10 +2838,10 @@ var BPE = class extends TokenizerModel_default {
           );
           if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
             output_tokens.push(...byte_tokens);
-          } else {
+          } else if (this.unk_token != null) {
             output_tokens.push(this.unk_token);
           }
-        } else {
+        } else if (this.unk_token != null) {
           output_tokens.push(this.unk_token);
         }
       }
@@ -3588,7 +3631,7 @@ var Tokenizer = class {
 };
 var Tokenizer_default = Tokenizer;
-// ../../node_modules/.pnpm/@huggingface+jinja@0.5.5/node_modules/@huggingface/jinja/dist/index.js
+// ../../node_modules/.pnpm/@huggingface+jinja@0.5.6/node_modules/@huggingface/jinja/dist/index.js
 var TOKEN_TYPES = Object.freeze({
   Text: "Text",
   // The text between Jinja statements or expressions
@@ -5107,7 +5150,11 @@ var Environment = class {
     ["number", (operand) => operand instanceof IntegerValue || operand instanceof FloatValue],
     ["integer", (operand) => operand instanceof IntegerValue],
     ["iterable", (operand) => operand.type === "ArrayValue" || operand.type === "StringValue"],
-    ["mapping", (operand) => operand.type === "ObjectValue"],
+    ["mapping", (operand) => operand instanceof ObjectValue],
+    [
+      "sequence",
+      (operand) => operand instanceof ArrayValue || operand instanceof ObjectValue || operand instanceof StringValue
+    ],
     [
       "lower",
       (operand) => {
@@ -5380,6 +5427,9 @@ var Interpreter = class {
   applyFilter(operand, filterNode, environment) {
     if (filterNode.type === "Identifier") {
       const filter = filterNode;
+      if (filter.value === "safe") {
+        return operand;
+      }
       if (filter.value === "tojson") {
         return new StringValue(toJSON(operand, {}));
       }
@@ -5469,6 +5519,8 @@ var Interpreter = class {
             return new IntegerValue(Math.floor(operand.value));
           case "float":
             return new FloatValue(operand.value);
+          case "string":
+            return new StringValue(operand.toString());
           default:
             throw new Error(`Unknown NumericValue filter: ${filter.value}`);
         }
@@ -6897,9 +6949,216 @@ function toAbsoluteURL(url2) {
   return new URL(url2, baseURL).href;
 }
+// src/utils/cache/CrossOriginStorageCache.js
+// Digest algorithm name, used both for `crypto.subtle.digest` and in the hash descriptors built below.
+var HASH_ALGORITHM = "SHA-256";
+// Name of the Cache API cache that maps a resource URL to its hex-encoded hash string.
+var HASH_CACHE_NAME = "experimental_transformers-hash-cache";
+// Builds the `{ algorithm, value }` descriptor passed to `navigator.crossOriginStorage.requestFileHandles`.
+var makeHashDescriptor = (value) => ({ algorithm: HASH_ALGORITHM, value });
+/**
+ * Cache backend (`match` / `put` / `delete`) that stores file contents in
+ * `navigator.crossOriginStorage`, addressed by SHA-256 hash, and keeps a URL -> hash
+ * mapping in the Cache API cache named `HASH_CACHE_NAME`.
+ *
+ * All methods are arrow-function instance fields, so they stay bound to the instance
+ * when passed around as callbacks.
+ */
+var CrossOriginStorage = class {
+  /** @type {Promise<Cache> | null} */
+  #hashCache = null;
+  /**
+   * Returns (and lazily opens) the hash cache, reusing the same promise across concurrent callers.
+   *
+   * NOTE(review): the promise itself is memoized and never reset, so if `caches.open`
+   * rejects, that rejected promise is what every later caller receives as well.
+   * @returns {Promise<Cache>}
+   */
+  _getHashCache = () => {
+    this.#hashCache ??= caches.open(HASH_CACHE_NAME);
+    return this.#hashCache;
+  };
+  /**
+   * Returns whether the `navigator.crossOriginStorage` API is available in the current environment.
+   * @returns {boolean}
+   */
+  static isAvailable = () => typeof navigator !== "undefined" && "crossOriginStorage" in navigator;
+  /**
+   * Looks up a cached response for the given URL by resolving its SHA-256 hash and requesting
+   * the corresponding file handle from cross-origin storage.
+   *
+   * The returned `Response` carries only a `Content-Length` header (the blob size).
+   * Any error thrown while requesting the handle or reading the file is swallowed and
+   * reported as a cache miss.
+   *
+   * Implements `CacheInterface.match`.
+   *
+   * @param {string} request The URL of the resource to look up.
+   * @returns {Promise<Response|undefined>} The cached `Response`, or `undefined` if not found.
+   */
+  match = async (request) => {
+    const hashValue = await this._getFileHash(request);
+    if (!hashValue) {
+      return void 0;
+    }
+    try {
+      const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashValue)]);
+      const blob = await handle.getFile();
+      return new Response(blob, {
+        headers: {
+          "Content-Length": String(blob.size)
+        }
+      });
+    } catch {
+      // Deliberate best-effort: any failure here is treated as "not cached".
+      return void 0;
+    }
+  };
+  /**
+   * Stores a response in cross-origin storage, keyed by its SHA-256 hash.
+   *
+   * For LFS-backed URLs the hash is resolved cheaply via `_getFileHash` (which checks
+   * `HASH_CACHE_NAME` first, then falls back to fetching the Git LFS pointer file)
+   * without reading the response body a second time.
+   * NOTE(review): on this path the body is stored under the looked-up hash as-is; the
+   * blob is not hashed here to confirm it matches. Errors from `_storeBlobInCOS`
+   * propagate and reject the returned promise.
+   *
+   * For non-LFS resources the hash is unknown upfront.  In that case the body is consumed
+   * in the background: the stream is read to compute the content hash, the file is written
+   * into cross-origin storage, and the computed hash is persisted to `HASH_CACHE_NAME`
+   * so that future `match` calls can resolve the file without a network round-trip.
+   * `_processAndStore` is not awaited, so the returned promise resolves before that
+   * background work has finished (or failed).
+   *
+   * Implements `CacheInterface.put`.
+   *
+   * @param {string} request The URL of the resource (used as the hash-cache key).
+   * @param {Response} response The response whose body will be written to the cache.
+   * @returns {Promise<void>}
+   */
+  put = async (request, response) => {
+    const hashValue = await this._getFileHash(request);
+    if (hashValue) {
+      const blob = await response.blob();
+      await this._storeBlobInCOS(blob, hashValue);
+    } else {
+      this._processAndStore(request, response.body);
+    }
+  };
+  /**
+   * Writes a blob into cross-origin storage using the given pre-computed hex hash string.
+   *
+   * Requests a file handle with `create: true`, writes the blob through a writable
+   * stream and closes it. No errors are caught here; they propagate to the caller.
+   *
+   * @param {Blob} blob
+   * @param {string} hashHex Hex-encoded SHA-256 hash of `blob`.
+   * @returns {Promise<void>}
+   */
+  _storeBlobInCOS = async (blob, hashHex) => {
+    const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashHex)], {
+      create: true
+    });
+    const writableStream = await handle.createWritable();
+    await writableStream.write(blob);
+    await writableStream.close();
+  };
+  /**
+   * Background task for non-LFS resources: consumes `stream`, computes the SHA-256 hash
+   * of the resulting blob, stores it in cross-origin storage, and persists the computed
+   * hash to `HASH_CACHE_NAME` keyed by `request` so future `match` calls can resolve the
+   * file without a network round-trip.
+   *
+   * The whole body is buffered in memory (chunk array -> `Blob` -> `ArrayBuffer` for
+   * hashing) before anything is written.
+   *
+   * Called fire-and-forget from `put` — errors are swallowed so failures never surface to
+   * the caller.
+   *
+   * @param {string} request The original resource URL.
+   * @param {ReadableStream} stream The response body stream to consume.
+   * @returns {Promise<void>}
+   */
+  _processAndStore = async (request, stream) => {
+    try {
+      const chunks = [];
+      for await (const chunk2 of stream) {
+        chunks.push(chunk2);
+      }
+      const blob = new Blob(chunks);
+      const hashHex = await this._getBlobHash(blob);
+      await this._storeBlobInCOS(blob, hashHex);
+      try {
+        const hashCache = await this._getHashCache();
+        await hashCache.put(request, new Response(hashHex));
+      } catch {
+        // Ignored: the file is already stored; only the URL -> hash mapping failed to
+        // persist, so a later `match` for this URL will simply miss.
+      }
+    } catch {
+      // Ignored: reading, hashing or storing failed; nothing is cached for this request.
+    }
+  };
+  /**
+   * Deletes the cache entry for the given request.
+   *
+   * Removes the hash entry from `HASH_CACHE_NAME`. Note: cross-origin storage itself does not
+   * expose a delete API, so only the local hash mapping is removed. For non-LFS URLs this
+   * prevents `match` from resolving the file until the resource is stored again via `put`,
+   * which recomputes the hash and re-persists the mapping. For LFS-backed URLs, `match` will
+   * re-fetch the LFS pointer file on the next call and repopulate the hash cache automatically.
+   *
+   * Implements `CacheInterface.delete`.
+   *
+   * @param {string} request
+   * @returns {Promise<boolean>} Resolves to `true` if the hash entry was deleted, `false` otherwise.
+   */
+  delete = async (request) => {
+    try {
+      const hashCache = await this._getHashCache();
+      return await hashCache.delete(request);
+    } catch {
+      // Opening the hash cache or deleting from it failed; report "nothing deleted".
+      return false;
+    }
+  };
+  /**
+   * Resolves the SHA-256 hash for a given URL.
+   *
+   * Returns the cached hash immediately if one has been persisted to `HASH_CACHE_NAME`.
+   * Otherwise falls back to `_getLfsFileHash` to retrieve the hash from the Hugging Face
+   * LFS pointer file, persisting the result to `HASH_CACHE_NAME` for future lookups.
+   *
+   * Returns `null` if the hash cannot be determined (e.g. non-LFS URL with no cached entry),
+   * and also when any step throws (opening the hash cache, reading it, or writing to it).
+   *
+   * @param {string} url The resource URL to resolve a hash for.
+   * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
+   */
+  _getFileHash = async (url2) => {
+    try {
+      const hashCache = await this._getHashCache();
+      const cached = await hashCache.match(url2);
+      if (cached) {
+        return cached.text();
+      }
+      const hash = await this._getLfsFileHash(url2);
+      if (hash) {
+        await hashCache.put(url2, new Response(hash));
+        return hash;
+      }
+      return null;
+    } catch {
+      return null;
+    }
+  };
+  /**
+   * Attempts to retrieve the SHA-256 hash for a Hugging Face resource URL from its raw
+   * Git LFS pointer file.
+   *
+   * Only applicable to URLs containing `/resolve/` (i.e. Hugging Face resolved file URLs).
+   * The first `/resolve/` segment is rewritten to `/raw/` to fetch the LFS pointer directly.
+   * Returns `null` when the URL has no `/resolve/` segment, when the fetched text contains
+   * no `oid sha256:<hex>` line, or when the network request fails.
+   *
+   * NOTE(review): the HTTP status is not checked, so the body of any response is read
+   * in full as text and scanned; and the request goes through the global `fetch`, not
+   * `env.fetch` — confirm both are intended.
+   *
+   * @see https://huggingface.co/docs/hub/en/storage-backends#xet
+   * @param {string} url The resolved Hugging Face URL of the resource.
+   * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
+   */
+  _getLfsFileHash = async (url2) => {
+    if (!url2.includes("/resolve/")) {
+      return null;
+    }
+    const rawUrl = url2.replace("/resolve/", "/raw/");
+    try {
+      const text = await fetch(rawUrl).then((r) => r.text());
+      const match = text.match(/^oid sha256:([0-9a-f]+)$/m);
+      return match ? match[1] : null;
+    } catch {
+      return null;
+    }
+  };
+  /**
+   * Computes the SHA-256 hash of a `Blob`'s contents.
+   *
+   * Reads the entire blob into an `ArrayBuffer` before digesting it.
+   *
+   * @param {Blob} blob The blob to hash.
+   * @returns {Promise<string>} The lowercase hex-encoded SHA-256 hash.
+   */
+  _getBlobHash = async (blob) => {
+    const arrayBuffer = await blob.arrayBuffer();
+    const hashBuffer = await crypto.subtle.digest(HASH_ALGORITHM, arrayBuffer);
+    const hashArray = Array.from(new Uint8Array(hashBuffer));
+    return hashArray.map((byte) => byte.toString(16).padStart(2, "0")).join("");
+  };
+};
 // src/utils/cache.js
 async function getCache(file_cache_dir = null) {
-  let cache = null;
+  let cache2 = null;
   if (env.useCustomCache) {
     if (!env.customCache) {
       throw Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
@@ -6909,30 +7168,33 @@ async function getCache(file_cache_dir = null) {
         "`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
       );
     }
-    cache = env.customCache;
+    cache2 = env.customCache;
   }
-  if (!cache && env.useBrowserCache) {
+  if (!cache2 && env.experimental_useCrossOriginStorage && CrossOriginStorage.isAvailable()) {
+    cache2 = new CrossOriginStorage();
+  }
+  if (!cache2 && env.useBrowserCache) {
     if (typeof caches === "undefined") {
       throw Error("Browser cache is not available in this environment.");
     }
     try {
-      cache = await caches.open(env.cacheKey);
+      cache2 = await caches.open(env.cacheKey);
     } catch (e) {
       logger.warn("An error occurred while opening the browser cache:", e);
     }
   }
-  if (!cache && env.useFSCache) {
+  if (!cache2 && env.useFSCache) {
     if (!apis.IS_FS_AVAILABLE) {
       throw Error("File System Cache is not available in this environment.");
     }
-    cache = new FileCache(file_cache_dir ?? env.cacheDir);
+    cache2 = new FileCache(file_cache_dir ?? env.cacheDir);
   }
-  return cache;
+  return cache2;
 }
-async function tryCache(cache, ...names) {
+async function tryCache(cache2, ...names) {
   for (let name of names) {
     try {
-      let result = await cache.match(name);
+      let result = await cache2.match(name);
       if (result) return result;
     } catch (e) {
       continue;
@@ -6941,6 +7203,83 @@ async function tryCache(cache, ...names) {
   return void 0;
 }
+// src/utils/lru_cache.js
+/**
+ * Least-recently-used cache backed by a single `Map`.
+ *
+ * Recency is tracked through the Map's insertion order: every read or write re-inserts
+ * the key at the end, so the first key yielded by `keys()` is always the least recently
+ * used one.
+ */
+var LRUCache2 = class {
+  /** @type {number} */
+  #capacity;
+  /** @type {Map<any, any>} */
+  #cache;
+  /**
+   * Creates an LRUCache instance.
+   * @param {number} capacity The maximum number of items the cache can hold.
+   */
+  constructor(capacity) {
+    this.#capacity = capacity;
+    this.#cache = /* @__PURE__ */ new Map();
+  }
+  /**
+   * Retrieves the value associated with the given key and marks the key as recently used.
+   * @param {any} key The key to retrieve.
+   * @returns {any} The value associated with the key, or undefined if the key does not exist.
+   */
+  get(key) {
+    if (!this.#cache.has(key)) return void 0;
+    const value = this.#cache.get(key);
+    // Delete then re-insert so the key moves to the end of the Map (most recently used).
+    this.#cache.delete(key);
+    this.#cache.set(key, value);
+    return value;
+  }
+  /**
+   * Inserts or updates the key-value pair in the cache.
+   * If the key already exists, it is updated and marked as recently used.
+   * If the cache exceeds its capacity, the least recently used item is evicted.
+   * @param {any} key The key to add or update.
+   * @param {any} value The value to associate with the key.
+   */
+  put(key, value) {
+    // Remove any existing entry first so the following `set` places the key at the end.
+    if (this.#cache.has(key)) {
+      this.#cache.delete(key);
+    }
+    this.#cache.set(key, value);
+    if (this.#cache.size > this.#capacity) {
+      // The first key in iteration order is the least recently used entry.
+      this.#cache.delete(this.#cache.keys().next().value);
+    }
+  }
+  /**
+   * Removes the entry for the given key from the cache.
+   * @param {any} key The key to delete.
+   * @returns {boolean} `true` if the entry existed and was removed, `false` otherwise.
+   */
+  delete(key) {
+    return this.#cache.delete(key);
+  }
+  /**
+   * Clears the cache.
+   */
+  clear() {
+    this.#cache.clear();
+  }
+};
+// src/utils/memoize_promise.js
+// Maximum number of in-flight or settled promises kept by `memoizePromise`.
+var MAX_CACHE_SIZE = 100;
+// Module-level LRU store shared by every `memoizePromise` call, keyed by the caller-supplied key.
+var cache = new LRUCache2(MAX_CACHE_SIZE);
+/**
+ * Returns the promise previously stored under `key`, or calls `factory()` and stores
+ * the resulting promise so that concurrent and later callers share it.
+ *
+ * Fulfilled promises stay cached until evicted by the LRU. If the promise rejects, its
+ * key is removed from the cache so a later call runs `factory` again; the rejection is
+ * still passed on to everyone already holding the promise.
+ *
+ * If `factory()` throws synchronously (before returning a promise), nothing is cached
+ * and the exception propagates to the caller.
+ *
+ * NOTE(review): the rejection handler deletes whatever is stored under `key` at that
+ * time. If the original entry was evicted and a newer promise stored under the same key
+ * in the meantime, that newer entry appears to be the one removed — confirm acceptable.
+ *
+ * @param {any} key Cache key identifying the operation.
+ * @param {() => Promise<any>} factory Function that starts the operation.
+ * @returns {Promise<any>} The shared promise for `key`.
+ */
+function memoizePromise(key, factory) {
+  const cached = cache.get(key);
+  if (cached !== void 0) {
+    return cached;
+  }
+  const promise = factory().then(
+    (value) => value,
+    (err) => {
+      cache.delete(key);
+      return Promise.reject(err);
+    }
+  );
+  cache.put(key, promise);
+  return promise;
+}
 // src/utils/model_registry/get_file_metadata.js
 async function fetch_file_head(urlOrPath) {
   if (!isValidUrl(urlOrPath, ["http:", "https:"])) {
@@ -6948,17 +7287,27 @@ async function fetch_file_head(urlOrPath) {
   }
   const headers = getFetchHeaders(urlOrPath);
   headers.set("Range", "bytes=0-0");
-  return env.fetch(urlOrPath, { method: "GET", headers });
+  return env.fetch(urlOrPath, { method: "GET", headers, cache: "no-store" });
+}
+/**
+ * Memoized entry point for file metadata lookups: delegates to `_get_file_metadata`
+ * through `memoizePromise`, so calls with the same key share one promise.
+ *
+ * The key is the JSON encoding of `[path_or_repo_id, filename, revision, cache_dir,
+ * local_files_only]`. Any other field of `options` is not part of the key, so calls
+ * that differ only in those fields receive the result of whichever call ran first.
+ * `JSON.stringify` encodes `undefined` array entries as `null`, so an omitted option
+ * and an explicit `null` yield the same key.
+ *
+ * @param {string} path_or_repo_id Forwarded unchanged to `_get_file_metadata`.
+ * @param {string} filename Forwarded unchanged to `_get_file_metadata`.
+ * @param {Object} [options] Forwarded unchanged; only the fields listed above affect the key.
+ * @returns {Promise<any>} Whatever `_get_file_metadata` resolves to.
+ */
+function get_file_metadata(path_or_repo_id, filename, options = {}) {
+  const key = JSON.stringify([
+    path_or_repo_id,
+    filename,
+    options?.revision,
+    options?.cache_dir,
+    options?.local_files_only
+  ]);
+  return memoizePromise(key, () => _get_file_metadata(path_or_repo_id, filename, options));
 }
-async function get_file_metadata(path_or_repo_id, filename, options = {}) {
-  const cache = await getCache(options?.cache_dir);
+async function _get_file_metadata(path_or_repo_id, filename, options) {
+  const cache2 = await getCache(options?.cache_dir);
   const { localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
     path_or_repo_id,
     filename,
     options,
-    cache
+    cache2
   );
-  const cachedResponse = await checkCachedResource(cache, localPath, proposedCacheKey);
+  const cachedResponse = await checkCachedResource(cache2, localPath, proposedCacheKey);
   if (cachedResponse !== void 0 && typeof cachedResponse !== "string") {
     const size = cachedResponse.headers.get("content-length");
     const contentType = cachedResponse.headers.get("content-type");
@@ -7056,7 +7405,7 @@ function getFetchHeaders(urlOrPath) {
   }
   return headers;
 }
-function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = null) {
+function buildResourcePaths(path_or_repo_id, filename, options = {}, cache2 = null) {
   const revision = options.revision ?? "main";
   const requestURL = pathJoin(path_or_repo_id, filename);
   const validModelId = isValidHfModelId(path_or_repo_id);
@@ -7066,7 +7415,7 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
     env.remotePathTemplate.replaceAll("{model}", path_or_repo_id).replaceAll("{revision}", encodeURIComponent(revision)),
     filename
   );
-  const proposedCacheKey = cache instanceof FileCache ? (
+  const proposedCacheKey = cache2 instanceof FileCache ? (
     // Choose cache key for filesystem cache
     // When using the main revision (default), we use the request URL as the cache key.
     // If a specific revision is requested, we account for this in the cache key.
@@ -7080,14 +7429,14 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
     validModelId
   };
 }
-async function checkCachedResource(cache, localPath, proposedCacheKey) {
-  if (!cache) {
+async function checkCachedResource(cache2, localPath, proposedCacheKey) {
+  if (!cache2) {
     return void 0;
   }
-  return await tryCache(cache, localPath, proposedCacheKey);
+  return await tryCache(cache2, localPath, proposedCacheKey);
 }
-async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, response, result, options = {}) {
-  if (await cache.match(cacheKey) !== void 0) {
+async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options = {}) {
+  if (await cache2.match(cacheKey) !== void 0) {
     return;
   }
   if (!result) {
@@ -7097,20 +7446,22 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
       file: filename,
       ...data
     }) : void 0;
-    await cache.put(
+    await cache2.put(
       cacheKey,
       /** @type {Response} */
       response,
       wrapped_progress
     );
   } else if (typeof response !== "string") {
-    await cache.put(
+    const headers = new Headers(response.headers);
+    headers.set("content-length", result.byteLength.toString());
+    await cache2.put(
       cacheKey,
       new Response(
         /** @type {any} */
         result,
         {
-          headers: response.headers
+          headers
         }
       )
     ).catch((err) => {
@@ -7118,17 +7469,17 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
     });
   }
 }
-async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache = null) {
+async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache2 = null) {
   const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
     path_or_repo_id,
     filename,
     options,
-    cache
+    cache2
   );
   let cacheKey;
   let toCacheResponse = false;
   let response;
-  response = await checkCachedResource(cache, localPath, proposedCacheKey);
+  response = await checkCachedResource(cache2, localPath, proposedCacheKey);
   const cacheHit = response !== void 0;
   if (!cacheHit) {
     if (env.allowLocalModels) {
@@ -7169,7 +7520,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
       }
       cacheKey = proposedCacheKey;
     }
-    toCacheResponse = cache && // 1. A caching system is available
+    toCacheResponse = cache2 && // 1. A caching system is available
     typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment)
     response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`)
     response.status === 200;
@@ -7231,7 +7582,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
     // i.e., do not cache FileResponses (prevents duplication)
     toCacheResponse && cacheKey && typeof response !== "string"
   ) {
-    await storeCachedResource(path_or_repo_id, filename, cache, cacheKey, response, result, options);
+    await storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options);
   }
   dispatchCallback(options.progress_callback, {
     status: "done",
@@ -7247,7 +7598,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
   if (response instanceof FileResponse) {
     return response.filePath;
   }
-  const cachedResponse = await cache?.match(cacheKey);
+  const cachedResponse = await cache2?.match(cacheKey);
   if (cachedResponse instanceof FileResponse) {
     return cachedResponse.filePath;
   } else if (cachedResponse instanceof Response) {
@@ -7274,8 +7625,8 @@ async function getModelFile(path_or_repo_id, filename, fatal = true, options = {
     name: path_or_repo_id,
     file: filename
   });
-  const cache = await getCache(options?.cache_dir);
-  return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache);
+  const cache2 = await getCache(options?.cache_dir);
+  return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache2);
 }
 async function getModelText(modelPath, fileName, fatal = true, options = {}) {
   const buffer = await getModelFile(modelPath, fileName, fatal, options, false);
@@ -8068,7 +8419,7 @@ var uint16_to_float32 = /* @__PURE__ */ (function() {
 // src/backends/onnx.js
 var ONNX_NODE = __toESM(require("onnxruntime-node"), 1);
-// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260303-e7e64dc112/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
+// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260307-d626b568e0/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
 var ort_webgpu_bundle_min_exports = {};
 __export(ort_webgpu_bundle_min_exports, {
   InferenceSession: () => Jf,
@@ -8837,7 +9188,7 @@ async function ts(a = {}) {
     throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
   }
   function Ye() {
-    return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, H: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, g: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, I: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, h: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
+    return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
   }
   async function bt() {
     function e(o, u) {
@@ -10024,7 +10375,7 @@ async function ts(a = {}) {
         Te(`invalid type for getValue: ${t}`);
     }
   }, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
-  var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 923180: (e, t, n, o, u) => {
+  var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 925676: (e, t, n, o, u) => {
     if (r === void 0 || !r.Uc) return 1;
     if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
     if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
@@ -10044,11 +10395,11 @@ async function ts(a = {}) {
     } catch {
       return 4;
     }
-  }, 924004: (e, t, n) => {
+  }, 926500: (e, t, n) => {
     r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
-  }, 924068: () => r.me(), 924110: (e) => {
+  }, 926564: () => r.me(), 926606: (e) => {
     r.jd(e);
-  }, 924147: () => typeof wasmOffsetConverter < "u" };
+  }, 926643: () => typeof wasmOffsetConverter < "u" };
   function af(e, t, n, o) {
     var u = P();
     try {
@@ -11964,7 +12315,7 @@ var $s = k(() => {
 Ve();
 Ve();
 Ve();
-var Xa = "1.25.0-dev.20260303-e7e64dc112";
+var Xa = "1.25.0-dev.20260307-d626b568e0";
 var Tl = Zr;
 {
   let a = ($s(), $t(Gs)).wasmBackend;
@@ -11975,11 +12326,11 @@ Object.defineProperty(K.versions, "web", { value: Xa, enumerable: true });
 // src/backends/utils/cacheWasm.js
 async function loadAndCacheFile(url2) {
   const fileName = url2.split("/").pop();
-  let cache;
+  let cache2;
   try {
-    cache = await getCache();
-    if (cache) {
-      const result = await cache.match(url2);
+    cache2 = await getCache();
+    if (cache2) {
+      const result = await cache2.match(url2);
       if (result) {
         return result;
       }
@@ -11991,9 +12342,9 @@ async function loadAndCacheFile(url2) {
   if (!response.ok) {
     throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
   }
-  if (cache) {
+  if (cache2) {
     try {
-      await cache.put(url2, response.clone());
+      await cache2.put(url2, response.clone());
     } catch (e) {
       logger.warn(`Failed to cache ${fileName}:`, e);
     }
@@ -13845,9 +14196,23 @@ var Tensor2 = class _Tensor {
       throw Error(`Unsupported norm: ${p}`);
     }
     const this_data = this.data;
-    const fn2 = (a, b) => a + b ** p;
+    const is_bigint = this_data instanceof BigInt64Array || this_data instanceof BigUint64Array;
+    if (is_bigint && p !== 1) {
+      throw Error(`Expected a floating point tensor as input. Got ${this.type}`);
+    }
+    let fn2, zero;
+    if (is_bigint) {
+      fn2 = (a, b) => a + b;
+      zero = 0n;
+    } else {
+      fn2 = (a, b) => a + b ** p;
+      zero = 0;
+    }
     if (dim === null) {
-      const val = this_data.reduce(fn2, 0) ** (1 / p);
+      let val = this_data.reduce(fn2, zero);
+      if (p !== 1) {
+        val = val ** (1 / p);
+      }
       return new _Tensor(this.type, [val], []);
     }
     const [type, result, resultDims] = reduce_helper(fn2, this, dim, keepdim);
@@ -16307,9 +16672,12 @@ __export(processors_exports, {
   ChatterboxProcessor: () => ChatterboxProcessor,
   Florence2Processor: () => Florence2Processor,
   Gemma3nProcessor: () => Gemma3nProcessor,
+  Glm46VProcessor: () => Glm46VProcessor,
+  GraniteSpeechProcessor: () => GraniteSpeechProcessor,
   GroundingDinoProcessor: () => GroundingDinoProcessor,
   Idefics3Processor: () => Idefics3Processor,
   JinaCLIPProcessor: () => JinaCLIPProcessor,
+  Lfm2VlProcessor: () => Lfm2VlProcessor,
   LlavaProcessor: () => LlavaProcessor,
   MgpstrProcessor: () => MgpstrProcessor,
   MoonshineProcessor: () => MoonshineProcessor,
@@ -16330,6 +16698,7 @@ __export(processors_exports, {
   UltravoxProcessor: () => UltravoxProcessor,
   VLChatProcessor: () => VLChatProcessor,
   VoxtralProcessor: () => VoxtralProcessor,
+  VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
   Wav2Vec2Processor: () => Wav2Vec2Processor,
   Wav2Vec2ProcessorWithLM: () => Wav2Vec2ProcessorWithLM,
   WhisperProcessor: () => WhisperProcessor
@@ -16384,12 +16753,14 @@ __export(feature_extractors_exports, {
   EncodecFeatureExtractor: () => EncodecFeatureExtractor,
   FeatureExtractor: () => FeatureExtractor,
   Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
+  GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
   MoonshineFeatureExtractor: () => MoonshineFeatureExtractor,
   ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
   PyAnnoteFeatureExtractor: () => PyAnnoteFeatureExtractor,
   SeamlessM4TFeatureExtractor: () => SeamlessM4TFeatureExtractor,
   SnacFeatureExtractor: () => SnacFeatureExtractor,
   SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
+  VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
   Wav2Vec2FeatureExtractor: () => Wav2Vec2FeatureExtractor,
   WeSpeakerFeatureExtractor: () => WeSpeakerFeatureExtractor,
   WhisperFeatureExtractor: () => WhisperFeatureExtractor
@@ -16617,6 +16988,7 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
   mel_filters = null,
   mel_floor = 1e-10,
   log_mel = null,
+  max_log_mel = null,
   reference = 1,
   min_value = 1e-10,
   db_range = null,
@@ -16756,6 +17128,17 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
           mel_spec_data[i] = Math.log10(mel_spec_data[i]);
         }
         break;
+      case "log10_max_norm": {
+        for (let i = 0; i < o; ++i) {
+          mel_spec_data[i] = Math.log10(mel_spec_data[i]);
+        }
+        const logMax = max_log_mel ?? max(mel_spec_data)[0];
+        const threshold = logMax - 8;
+        for (let i = 0; i < o; ++i) {
+          mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4) / 4;
+        }
+        break;
+      }
       case "dB":
         if (power === 1) {
           amplitude_to_db(mel_spec_data, reference, min_value, db_range);
@@ -16766,7 +17149,9 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
         }
         break;
       default:
-        throw new Error(`log_mel must be one of null, 'log', 'log10' or 'dB'. Got '${log_mel}'`);
+        throw new Error(
+          `log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`
+        );
     }
   }
   return mel_spec;
@@ -17271,6 +17656,56 @@ var Gemma3nAudioFeatureExtractor = class extends FeatureExtractor {
   }
 };
+// src/models/granite_speech/feature_extraction_granite_speech.js
+var GraniteSpeechFeatureExtractor = class extends FeatureExtractor {
+  constructor(config) {
+    super(config);
+    const { n_fft, win_length, n_mels, sample_rate } = config.melspec_kwargs;
+    this.mel_filters = mel_filter_bank(
+      Math.floor(1 + n_fft / 2),
+      // num_frequency_bins = 257
+      n_mels,
+      // 80
+      0,
+      // min_frequency
+      sample_rate / 2,
+      // max_frequency = 8000
+      sample_rate,
+      // 16000
+      null,
+      // norm (torchaudio default: no norm)
+      "htk"
+      // mel_scale (torchaudio default)
+    );
+    const raw_window = window_function(win_length, "hann");
+    this.window = new Float64Array(n_fft);
+    const pad = Math.floor((n_fft - win_length) / 2);
+    this.window.set(raw_window, pad);
+  }
+  /**
+   * Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
+   * @param {Float32Array|Float64Array} audio The audio waveform.
+   * @returns {Promise<{input_features: Tensor}>}
+   */
+  async _call(audio) {
+    validate_audio_inputs(audio, "GraniteSpeechFeatureExtractor");
+    const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;
+    const num_frames = 1 + Math.floor((audio.length - 1) / hop_length);
+    const max_num_frames = num_frames - num_frames % 2;
+    const mel = await spectrogram(audio, this.window, n_fft, hop_length, {
+      power: 2,
+      mel_filters: this.mel_filters,
+      log_mel: "log10_max_norm",
+      transpose: true,
+      // [time, n_mels]
+      max_num_frames,
+      do_pad: false
+    });
+    const input_features = mel.view(-1, 2 * n_mels).unsqueeze_(0);
+    return { input_features };
+  }
+};
 // src/models/moonshine/feature_extraction_moonshine.js
 var MoonshineFeatureExtractor = class extends FeatureExtractor {
   /**
@@ -17751,6 +18186,71 @@ var WeSpeakerFeatureExtractor = class extends FeatureExtractor {
   }
 };
+// src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js
+var VoxtralRealtimeFeatureExtractor = class extends FeatureExtractor {
+  constructor(config) {
+    super(config);
+    this.config.mel_filters ??= mel_filter_bank(
+      Math.floor(1 + this.config.n_fft / 2),
+      // num_frequency_bins
+      this.config.feature_size,
+      // num_mel_filters
+      0,
+      // min_frequency
+      8e3,
+      // max_frequency
+      this.config.sampling_rate,
+      // sampling_rate
+      "slaney",
+      // norm
+      "slaney"
+      // mel_scale
+    );
+    this.window = window_function(this.config.n_fft, "hann");
+  }
+  /**
+   * Computes the log-Mel spectrogram of the provided audio waveform.
+   * @param {Float32Array|Float64Array} waveform The audio waveform to process.
+   * @param {Object} [options]
+   * @param {boolean} [options.center=true] Whether to center-pad the waveform for STFT.
+   * @returns {Promise<import('../../utils/tensor.js').Tensor>} The log-Mel spectrogram tensor of shape [num_mel_bins, num_frames].
+   */
+  async _extract_fbank_features(waveform, { center = true } = {}) {
+    const { n_fft, hop_length, mel_filters, global_log_mel_max } = this.config;
+    const max_num_frames = center ? Math.floor(waveform.length / hop_length) : Math.floor((waveform.length - n_fft) / hop_length);
+    return await spectrogram(
+      waveform,
+      this.window,
+      n_fft,
+      // frame_length
+      hop_length,
+      {
+        power: 2,
+        mel_filters,
+        log_mel: "log10_max_norm",
+        max_log_mel: global_log_mel_max,
+        center,
+        max_num_frames,
+        do_pad: false
+      }
+    );
+  }
+  /**
+   * Extract mel spectrogram features from audio.
+   * @param {Float32Array|Float64Array} audio The audio data.
+   * @param {Object} [options]
+   * @param {boolean} [options.center=true] Whether to center-pad the waveform.
+   * @returns {Promise<{ input_features: import('../../utils/tensor.js').Tensor }>}
+   */
+  async _call(audio, { center = true } = {}) {
+    validate_audio_inputs(audio, "VoxtralRealtimeFeatureExtractor");
+    const features = await this._extract_fbank_features(audio, { center });
+    return {
+      input_features: features.unsqueeze_(0)
+    };
+  }
+};
 // src/models/whisper/feature_extraction_whisper.js
 var WhisperFeatureExtractor = class extends FeatureExtractor {
   constructor(config) {
@@ -17779,7 +18279,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
    * @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
    */
   async _extract_fbank_features(waveform) {
-    const features = await spectrogram(
+    return await spectrogram(
       waveform,
       this.window,
       // window
@@ -17790,7 +18290,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
       {
         power: 2,
         mel_filters: this.config.mel_filters,
-        log_mel: "log10",
+        log_mel: "log10_max_norm",
         // Custom
         max_num_frames: Math.min(
           Math.floor(waveform.length / this.config.hop_length),
@@ -17799,15 +18299,6 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
         )
       }
     );
-    const data = features.data;
-    const maxValue = max(
-      /** @type {Float32Array} */
-      data
-    )[0];
-    for (let i = 0; i < data.length; ++i) {
-      data[i] = (Math.max(data[i], maxValue - 8) + 4) / 4;
-    }
-    return features;
   }
   /**
    * Asynchronously extracts features from a given audio using the provided configuration.
@@ -18686,6 +19177,30 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
   }
   return [segmentation, segments];
 }
+function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
+  if (height < factor || width < factor) {
+    const scale = Math.max(factor / height, factor / width);
+    height = Math.round(height * scale);
+    width = Math.round(width * scale);
+  }
+  if (Math.max(height, width) / Math.min(height, width) > 200) {
+    throw new Error(
+      `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
+    );
+  }
+  let h_bar = Math.round(height / factor) * factor;
+  let w_bar = Math.round(width / factor) * factor;
+  if (temporal_factor * h_bar * w_bar > max_pixels) {
+    const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
+    h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
+    w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
+  } else if (temporal_factor * h_bar * w_bar < min_pixels) {
+    const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
+    h_bar = Math.ceil(height * beta / factor) * factor;
+    w_bar = Math.ceil(width * beta / factor) * factor;
+  }
+  return [w_bar, h_bar];
+}
 function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
   if (label_ids_to_fuse === null) {
     logger.warn("`label_ids_to_fuse` unset. No instance will be fused.");
@@ -18763,7 +19278,7 @@ var ImageProcessor = class extends Callable2 {
     this.do_pad = config.do_pad;
     this.min_pixels = config.min_pixels;
     this.max_pixels = config.max_pixels;
-    if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
+    if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
       this.pad_size = this.size;
     }
     this.do_flip_channel_order = config.do_flip_channel_order ?? false;
@@ -18974,7 +19489,7 @@ var ImageProcessor = class extends Callable2 {
     });
   }
   /**
-   * @typedef {object} PreprocessedImage
+   * @typedef {Object} PreprocessedImage
    * @property {HeightWidth} original_size The original size of the image.
    * @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
    * @property {Tensor} pixel_values The pixel values of the preprocessed image.
@@ -19051,10 +19566,8 @@ var ImageProcessor = class extends Callable2 {
         const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
         [pixelData, imgDims] = padded;
       } else if (this.size_divisibility) {
-        const [paddedWidth, paddedHeight] = enforce_size_divisibility(
-          [imgDims[1], imgDims[0]],
-          this.size_divisibility
-        );
+        const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
+        const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
         [pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
       }
     }
@@ -19131,6 +19644,7 @@ var image_processors_exports = {};
 __export(image_processors_exports, {
   BeitFeatureExtractor: () => BeitFeatureExtractor,
   BitImageProcessor: () => BitImageProcessor,
+  CHMv2ImageProcessor: () => CHMv2ImageProcessor,
   CLIPFeatureExtractor: () => CLIPFeatureExtractor,
   CLIPImageProcessor: () => CLIPImageProcessor,
   ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
@@ -19147,11 +19661,13 @@ __export(image_processors_exports, {
   DonutImageProcessor: () => DonutImageProcessor,
   EfficientNetImageProcessor: () => EfficientNetImageProcessor,
   GLPNFeatureExtractor: () => GLPNFeatureExtractor,
+  Glm46VImageProcessor: () => Glm46VImageProcessor,
   GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
   Idefics3ImageProcessor: () => Idefics3ImageProcessor,
   ImageFeatureExtractor: () => ImageProcessor,
   ImageProcessor: () => ImageProcessor,
   JinaCLIPImageProcessor: () => JinaCLIPImageProcessor,
+  Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
   LlavaOnevisionImageProcessor: () => LlavaOnevisionImageProcessor,
   Mask2FormerImageProcessor: () => Mask2FormerImageProcessor,
   MaskFormerFeatureExtractor: () => MaskFormerFeatureExtractor,
@@ -19206,6 +19722,10 @@ var BitImageProcessor = class extends ImageProcessor {
 var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
 };
+// src/models/chmv2/image_processing_chmv2.js
+var CHMv2ImageProcessor = class extends ImageProcessor {
+};
 // src/models/clip/image_processing_clip.js
 var CLIPImageProcessor = class extends ImageProcessor {
 };
@@ -19325,6 +19845,65 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
   }
 };
+// src/models/qwen2_vl/image_processing_qwen2_vl.js
+var Qwen2VLImageProcessor = class extends ImageProcessor {
+  constructor(config) {
+    super(config);
+    this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
+    this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
+    this.patch_size = config.patch_size;
+    this.merge_size = config.merge_size;
+  }
+  /** @type {ImageProcessor['get_resize_output_image_size']} */
+  get_resize_output_image_size(image, size) {
+    const factor = this.patch_size * this.merge_size;
+    return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
+  }
+  async _call(images, ...args) {
+    const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
+    let patches = pixel_values;
+    const { temporal_patch_size, merge_size, patch_size } = this.config;
+    if (patches.dims[0] === 1) {
+      patches = cat(
+        Array.from({ length: temporal_patch_size }, () => patches),
+        0
+      );
+    }
+    const grid_t = patches.dims[0] / temporal_patch_size;
+    const channel = patches.dims[1];
+    const grid_h = Math.floor(patches.dims[2] / patch_size);
+    const grid_w = Math.floor(patches.dims[3] / patch_size);
+    const flatten_patches = patches.view(
+      grid_t,
+      temporal_patch_size,
+      channel,
+      Math.floor(grid_h / merge_size),
+      merge_size,
+      patch_size,
+      Math.floor(grid_w / merge_size),
+      merge_size,
+      patch_size
+    ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
+    const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
+    return {
+      pixel_values: flatten_patches,
+      image_grid_thw,
+      original_sizes,
+      reshaped_input_sizes
+    };
+  }
+};
+// src/models/glm46v/image_processing_glm46v.js
+var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
+  /** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
+  get_resize_output_image_size(image, size) {
+    const factor = this.patch_size * this.merge_size;
+    const temporal_factor = this.config.temporal_patch_size ?? 2;
+    return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
+  }
+};
 // src/models/glpn/image_processing_glpn.js
 var GLPNFeatureExtractor = class extends ImageProcessor {
 };
@@ -19555,6 +20134,237 @@ var JinaCLIPImageProcessor = class extends ImageProcessor {
   }
 };
+// src/models/lfm2_vl/image_processing_lfm2_vl.js
+function round_by_factor(number, factor) {
+  return Math.round(number / factor) * factor;
+}
+function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
+  let best_ratio_diff = Infinity;
+  let best_ratio = [1, 1];
+  const area = width * height;
+  for (const ratio of target_ratios) {
+    const ratio_diff = Math.abs(aspect_ratio - ratio[0] / ratio[1]);
+    if (ratio_diff < best_ratio_diff) {
+      best_ratio_diff = ratio_diff;
+      best_ratio = ratio;
+    } else if (ratio_diff === best_ratio_diff && area > 0.5 * image_size * image_size * ratio[0] * ratio[1]) {
+      best_ratio = ratio;
+    }
+  }
+  return best_ratio;
+}
+function get_target_ratios(min_tiles, max_tiles) {
+  const ratios = [];
+  const seen = /* @__PURE__ */ new Set();
+  for (let n = min_tiles; n <= max_tiles; ++n) {
+    for (let w = 1; w <= n; ++w) {
+      for (let h = 1; h <= n; ++h) {
+        const product2 = w * h;
+        if (product2 >= min_tiles && product2 <= max_tiles) {
+          const key = w << 16 | h;
+          if (!seen.has(key)) {
+            seen.add(key);
+            ratios.push([w, h]);
+          }
+        }
+      }
+    }
+  }
+  return ratios.sort((a, b) => a[0] * a[1] - b[0] * b[1]);
+}
+function convert_image_to_patches(images, patch_size) {
+  const [B, C, H, W] = images.dims;
+  const ph = Math.floor(H / patch_size), pw = Math.floor(W / patch_size);
+  const patch_dim = patch_size * patch_size * C;
+  const data = (
+    /** @type {Float32Array} */
+    images.data
+  );
+  const result = new Float32Array(B * ph * pw * patch_dim);
+  const ch_stride = H * W;
+  for (let b = 0; b < B; ++b) {
+    const b_src = b * C * ch_stride;
+    const b_dst = b * ph * pw * patch_dim;
+    for (let py = 0; py < ph; ++py) {
+      for (let px = 0; px < pw; ++px) {
+        let off = b_dst + (py * pw + px) * patch_dim;
+        for (let dy = 0; dy < patch_size; ++dy) {
+          const row = (py * patch_size + dy) * W + px * patch_size;
+          for (let dx = 0; dx < patch_size; ++dx) {
+            const pixel = row + dx;
+            for (let c = 0; c < C; ++c) {
+              result[off++] = data[b_src + c * ch_stride + pixel];
+            }
+          }
+        }
+      }
+    }
+  }
+  return new Tensor2("float32", result, [B, ph * pw, patch_dim]);
+}
+function pad_along_first_dim(patches, target_length) {
+  const [, len2, dim] = patches.dims;
+  const mask_data = new BigInt64Array(target_length);
+  mask_data.fill(1n, 0, len2);
+  let padded = patches;
+  if (len2 < target_length) {
+    const padded_data = new Float32Array(target_length * dim);
+    padded_data.set(
+      /** @type {Float32Array} */
+      patches.data
+    );
+    padded = new Tensor2("float32", padded_data, [1, target_length, dim]);
+  }
+  return { padded, mask: new Tensor2("int64", mask_data, [target_length]) };
+}
+var Lfm2VlImageProcessor = class extends ImageProcessor {
+  constructor(config) {
+    super(config);
+    this.downsample_factor = config.downsample_factor ?? 2;
+    this.do_image_splitting = config.do_image_splitting ?? true;
+    this.min_tiles = config.min_tiles ?? 2;
+    this.max_tiles = config.max_tiles ?? 10;
+    this.use_thumbnail = config.use_thumbnail ?? true;
+    this.min_image_tokens = config.min_image_tokens ?? 64;
+    this.max_image_tokens = config.max_image_tokens ?? 256;
+    this.encoder_patch_size = config.encoder_patch_size ?? config.patch_size ?? 16;
+    this.tile_size = config.tile_size ?? 512;
+    this.max_pixels_tolerance = config.max_pixels_tolerance ?? 2;
+    this.return_row_col_info = config.return_row_col_info ?? false;
+    const max_thumbnail_patches = this.max_image_tokens * this.downsample_factor ** 2;
+    const tile_size_patches = this.do_image_splitting ? (this.tile_size / this.encoder_patch_size) ** 2 : 0;
+    this.max_num_patches = Math.max(max_thumbnail_patches, tile_size_patches);
+  }
+  /**
+   * Check if the image is too large to be processed as a single tile.
+   * @param {number} height
+   * @param {number} width
+   * @returns {boolean}
+   */
+  _is_image_too_large(height, width) {
+    const total_factor = this.encoder_patch_size * this.downsample_factor;
+    const h_bar = Math.max(this.encoder_patch_size, round_by_factor(height, total_factor));
+    const w_bar = Math.max(this.encoder_patch_size, round_by_factor(width, total_factor));
+    return h_bar * w_bar > this.max_image_tokens * (this.encoder_patch_size * this.downsample_factor) ** 2 * this.max_pixels_tolerance;
+  }
+  /**
+   * Get the grid layout for tiling a large image.
+   * @param {number} height
+   * @param {number} width
+   * @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
+   */
+  _get_grid_layout(height, width) {
+    const target_ratios = get_target_ratios(this.min_tiles, this.max_tiles);
+    const [grid_width, grid_height] = find_closest_aspect_ratio(
+      width / height,
+      target_ratios,
+      width,
+      height,
+      this.tile_size
+    );
+    return {
+      grid_width,
+      grid_height,
+      target_width: this.tile_size * grid_width,
+      target_height: this.tile_size * grid_height
+    };
+  }
+  /** @param {RawImage|RawImage[]|RawImage[][]} images */
+  // @ts-expect-error
+  async _call(images, { return_row_col_info = null } = {}) {
+    let batched_images;
+    if (!Array.isArray(images)) {
+      batched_images = [[images]];
+    } else if (!Array.isArray(images[0])) {
+      batched_images = [
+        /** @type {RawImage[]} */
+        images
+      ];
+    } else {
+      batched_images = /** @type {RawImage[][]} */
+      images;
+    }
+    const all_pixel_values = [];
+    const all_pixel_masks = [];
+    const all_spatial_shapes = [];
+    const all_rows = [];
+    const all_cols = [];
+    const all_image_sizes = [];
+    for (const image_batch of batched_images) {
+      const preprocessed = await Promise.all(image_batch.map((x) => this.preprocess(x, { do_pad: false })));
+      for (const { pixel_values } of preprocessed) {
+        const [, height, width] = pixel_values.dims;
+        const img = pixel_values.unsqueeze_(0);
+        const total_factor = this.encoder_patch_size * this.downsample_factor;
+        const f2 = total_factor ** 2;
+        const [new_width, new_height] = smart_resize(
+          Math.max(total_factor, height),
+          Math.max(total_factor, width),
+          total_factor,
+          this.min_image_tokens * f2,
+          this.max_image_tokens * f2
+        ).map((x) => Math.max(total_factor, x));
+        let tiles;
+        let num_rows = 1, num_cols = 1;
+        const is_large = this._is_image_too_large(height, width);
+        const do_splitting = this.do_image_splitting && !(this.min_tiles === 1 && this.max_tiles === 1);
+        if (is_large && do_splitting) {
+          const { grid_width, grid_height, target_width, target_height } = this._get_grid_layout(
+            height,
+            width
+          );
+          num_rows = grid_height;
+          num_cols = grid_width;
+          const resized = await interpolate_4d(img, {
+            size: [target_height, target_width]
+          });
+          tiles = [];
+          for (let r = 0; r < grid_height; ++r) {
+            for (let c = 0; c < grid_width; ++c) {
+              const y = r * this.tile_size;
+              const x = c * this.tile_size;
+              tiles.push(resized.slice(null, null, [y, y + this.tile_size], [x, x + this.tile_size]));
+            }
+          }
+          if (this.use_thumbnail && grid_width * grid_height !== 1) {
+            tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
+          }
+        } else {
+          tiles = [await interpolate_4d(img, { size: [new_height, new_width] })];
+        }
+        for (const tile of tiles) {
+          const [, , th, tw] = tile.dims;
+          const patches = convert_image_to_patches(tile, this.encoder_patch_size);
+          const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
+          all_pixel_values.push(padded);
+          all_pixel_masks.push(mask);
+          all_spatial_shapes.push([
+            Math.floor(th / this.encoder_patch_size),
+            Math.floor(tw / this.encoder_patch_size)
+          ]);
+        }
+        all_rows.push(num_rows);
+        all_cols.push(num_cols);
+        all_image_sizes.push([new_height, new_width]);
+      }
+    }
+    const result = {
+      pixel_values: cat(all_pixel_values, 0),
+      pixel_attention_mask: stack(all_pixel_masks, 0),
+      spatial_shapes: new Tensor2("int64", BigInt64Array.from(all_spatial_shapes.flat(), BigInt), [
+        all_spatial_shapes.length,
+        2
+      ])
+    };
+    if (return_row_col_info ?? this.return_row_col_info) {
+      result.image_rows = all_rows;
+      result.image_cols = all_cols;
+      result.image_sizes = all_image_sizes;
+    }
+    return result;
+  }
+};
 // src/models/llava_onevision/image_processing_llava_onevision.js
 var LlavaOnevisionImageProcessor = class extends ImageProcessor {
 };
@@ -19777,76 +20587,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
 var PvtImageProcessor = class extends ImageProcessor {
 };
-// src/models/qwen2_vl/image_processing_qwen2_vl.js
-function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
-  if (height < factor || width < factor) {
-    throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
-  } else if (Math.max(height, width) / Math.min(height, width) > 200) {
-    throw new Error(
-      `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
-    );
-  }
-  let h_bar = Math.round(height / factor) * factor;
-  let w_bar = Math.round(width / factor) * factor;
-  if (h_bar * w_bar > max_pixels) {
-    const beta = Math.sqrt(height * width / max_pixels);
-    h_bar = Math.floor(height / beta / factor) * factor;
-    w_bar = Math.floor(width / beta / factor) * factor;
-  } else if (h_bar * w_bar < min_pixels) {
-    const beta = Math.sqrt(min_pixels / (height * width));
-    h_bar = Math.ceil(height * beta / factor) * factor;
-    w_bar = Math.ceil(width * beta / factor) * factor;
-  }
-  return [h_bar, w_bar];
-}
-var Qwen2VLImageProcessor = class extends ImageProcessor {
-  constructor(config) {
-    super(config);
-    this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
-    this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
-    this.patch_size = config.patch_size;
-    this.merge_size = config.merge_size;
-  }
-  /** @type {ImageProcessor['get_resize_output_image_size']} */
-  get_resize_output_image_size(image, size) {
-    const factor = this.patch_size * this.merge_size;
-    return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
-  }
-  async _call(images, ...args) {
-    const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
-    let patches = pixel_values;
-    const { temporal_patch_size, merge_size, patch_size } = this.config;
-    if (patches.dims[0] === 1) {
-      patches = cat(
-        Array.from({ length: temporal_patch_size }, () => patches),
-        0
-      );
-    }
-    const grid_t = patches.dims[0] / temporal_patch_size;
-    const channel = patches.dims[1];
-    const grid_h = Math.floor(patches.dims[2] / patch_size);
-    const grid_w = Math.floor(patches.dims[3] / patch_size);
-    const flatten_patches = patches.view(
-      grid_t,
-      temporal_patch_size,
-      channel,
-      Math.floor(grid_h / merge_size),
-      merge_size,
-      patch_size,
-      Math.floor(grid_w / merge_size),
-      merge_size,
-      patch_size
-    ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
-    const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
-    return {
-      pixel_values: flatten_patches,
-      image_grid_thw,
-      original_sizes,
-      reshaped_input_sizes
-    };
-  }
-};
 // src/models/rt_detr/image_processing_rt_detr.js
 var RTDetrImageProcessor = class extends ImageProcessor {
   /** @type {typeof post_process_object_detection} */
@@ -20400,6 +21140,107 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
   }
 };
+// src/models/qwen2_vl/processing_qwen2_vl.js
+var Qwen2VLProcessor = class extends Processor {
+  static image_processor_class = AutoImageProcessor;
+  static tokenizer_class = AutoTokenizer;
+  static image_token = "<|image_pad|>";
+  /**
+   *
+   * @param {string|string[]} text
+   * @param {RawImage|RawImage[]} images
+   * @param  {...any} args
+   * @returns {Promise<any>}
+   */
+  async _call(text, images = null, ...args) {
+    if (!Array.isArray(text)) {
+      text = [text];
+    }
+    let image_inputs, image_grid_thw;
+    if (images) {
+      image_inputs = await this.image_processor(images);
+      image_grid_thw = image_inputs.image_grid_thw;
+    }
+    if (image_grid_thw) {
+      let merge_length = this.image_processor.config.merge_size ** 2;
+      let index = 0;
+      const image_token = (
+        /** @type {typeof Qwen2VLProcessor} */
+        this.constructor.image_token
+      );
+      const image_grid_thw_list = image_grid_thw.tolist();
+      text = text.map((t) => {
+        while (t.includes(image_token)) {
+          const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
+          t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
+        }
+        return t.replaceAll("<|placeholder|>", image_token);
+      });
+    }
+    const text_inputs = this.tokenizer(text);
+    return {
+      ...text_inputs,
+      ...image_inputs
+    };
+  }
+};
+// src/models/glm46v/processing_glm46v.js
+/** Same processing as Qwen2VLProcessor; only the image token string differs ("<|image|>"). */
+var Glm46VProcessor = class extends Qwen2VLProcessor {
+  static image_token = "<|image|>";
+};
+// src/models/granite_speech/processing_granite_speech.js
+var GraniteSpeechProcessor = class extends Processor {
+  static tokenizer_class = AutoTokenizer;
+  static feature_extractor_class = AutoFeatureExtractor;
+  static uses_processor_config = true;
+  /**
+   * Compute the number of audio tokens for a given raw audio length.
+   * @param {number} audioLength Raw audio sample count.
+   * @returns {number} Number of projector output tokens.
+   */
+  _get_num_audio_features(audioLength) {
+    const { hop_length } = this.feature_extractor.config.melspec_kwargs;
+    const { projector_window_size, projector_downsample_rate } = this.feature_extractor.config;
+    const effective_window_size = Math.floor(projector_window_size / projector_downsample_rate);
+    const mel_length = Math.floor(audioLength / hop_length) + 1;
+    const encoder_length = Math.floor(mel_length / 2);
+    const nblocks = Math.ceil(encoder_length / projector_window_size);
+    return nblocks * effective_window_size;
+  }
+  /**
+   * @param {string} text The text input to process.
+   * @param {Float32Array} audio The audio input to process.
+   */
+  async _call(text, audio = null, kwargs = {}) {
+    if (Array.isArray(text)) {
+      throw new Error("Batched inputs are not supported yet.");
+    }
+    let audio_inputs = {};
+    if (audio) {
+      const { input_features } = await this.feature_extractor(audio);
+      audio_inputs["input_features"] = input_features;
+      const audio_embed_size = this._get_num_audio_features(audio.length);
+      const mask_data = new Uint8Array(audio_embed_size).fill(1);
+      audio_inputs["input_features_mask"] = new Tensor2("bool", mask_data, [1, audio_embed_size]);
+      const audio_token = this.config.audio_token ?? "<|audio|>";
+      if (!text.includes(audio_token)) {
+        throw new Error(`The input text does not contain the audio token ${audio_token}.`);
+      }
+      text = text.replaceAll(audio_token, audio_token.repeat(audio_embed_size));
+    }
+    const text_inputs = this.tokenizer(text, {
+      add_special_tokens: false,
+      ...kwargs
+    });
+    return {
+      ...text_inputs,
+      ...audio_inputs
+    };
+  }
+};
 // src/models/grounding_dino/processing_grounding_dino.js
 function get_phrases_from_posmap(posmaps, input_ids) {
   const left_idx = 0;
@@ -20676,6 +21517,66 @@ var JinaCLIPProcessor = class extends Processor {
   }
 };
+// src/models/lfm2_vl/processing_lfm2_vl.js
+var Lfm2VlProcessor = class extends Processor {
+  static tokenizer_class = AutoTokenizer;
+  static image_processor_class = AutoImageProcessor;
+  /**
+   * @param {RawImage|RawImage[]} images
+   * @param {string|string[]|null} [text]
+   * @param {Record<string, any>} [kwargs]
+   */
+  async _call(images, text = null, kwargs = {}) {
+    const { image_rows, image_cols, image_sizes, ...image_inputs } = await this.image_processor(images, {
+      ...kwargs,
+      return_row_col_info: true
+    });
+    if (text) {
+      const image_token = this.config.image_token ?? "<image>";
+      const {
+        tile_size = 512,
+        downsample_factor = 2,
+        encoder_patch_size = 16,
+        use_thumbnail = true
+      } = (
+        /** @type {Record<string, any>} */
+        this.image_processor.config
+      );
+      const ds2 = (s) => Math.ceil(Math.floor(s / encoder_patch_size) / downsample_factor);
+      const tokens_per_tile = ds2(tile_size) ** 2;
+      const image_start = this.config.image_start_token ?? "<|image_start|>";
+      const image_end = this.config.image_end_token ?? "<|image_end|>";
+      const thumbnail_token = this.config.image_thumbnail ?? "<|img_thumbnail|>";
+      if (!Array.isArray(text)) text = [text];
+      let image_idx = 0;
+      text = text.map((sample) => {
+        const parts = sample.split(image_token);
+        return parts[0] + parts.slice(1).map((part) => {
+          const idx = image_idx++;
+          const [h, w] = image_sizes[idx];
+          const rows = image_rows[idx], cols = image_cols[idx];
+          const tokens_for_image = ds2(h) * ds2(w);
+          let expanded = image_start;
+          if (rows > 1 || cols > 1) {
+            const tile_str = image_token.repeat(tokens_per_tile);
+            for (let r = 0; r < rows; ++r)
+              for (let c = 0; c < cols; ++c)
+                expanded += `<|img_row_${r + 1}_col_${c + 1}|>` + tile_str;
+            if (use_thumbnail) expanded += thumbnail_token + image_token.repeat(tokens_for_image);
+          } else {
+            expanded += image_token.repeat(tokens_for_image);
+          }
+          return expanded + image_end + part;
+        }).join("");
+      });
+    }
+    return {
+      ...image_inputs,
+      ...text ? this.tokenizer(text, kwargs) : {}
+    };
+  }
+};
 // src/models/llava/processing_llava.js
 var LlavaProcessor = class extends Processor {
   static tokenizer_class = AutoTokenizer;
@@ -21019,47 +21920,6 @@ var PyAnnoteProcessor = class extends Processor {
   }
 };
-// src/models/qwen2_vl/processing_qwen2_vl.js
-var Qwen2VLProcessor = class extends Processor {
-  static image_processor_class = AutoImageProcessor;
-  static tokenizer_class = AutoTokenizer;
-  /**
-   *
-   * @param {string|string[]} text
-   * @param {RawImage|RawImage[]} images
-   * @param  {...any} args
-   * @returns {Promise<any>}
-   */
-  async _call(text, images = null, ...args) {
-    if (!Array.isArray(text)) {
-      text = [text];
-    }
-    let image_inputs, image_grid_thw;
-    if (images) {
-      image_inputs = await this.image_processor(images);
-      image_grid_thw = image_inputs.image_grid_thw;
-    }
-    if (image_grid_thw) {
-      let merge_length = this.image_processor.config.merge_size ** 2;
-      let index = 0;
-      const image_grid_thw_list = image_grid_thw.tolist();
-      text = text.map((t) => {
-        while (t.includes("<|image_pad|>")) {
-          const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
-          t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
-        }
-        return t.replaceAll("<|placeholder|>", "<|image_pad|>");
-      });
-    }
-    const text_inputs = this.tokenizer(text);
-    return {
-      ...text_inputs,
-      ...image_inputs
-      // TODO: ...videos_inputs,
-    };
-  }
-};
 // src/models/qwen2_5_vl/processing_qwen2_5_vl.js
 var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
 };
@@ -21208,6 +22068,94 @@ var VoxtralProcessor = class extends Processor {
   }
 };
+// src/models/voxtral_realtime/processing_voxtral_realtime.js
+var NUM_LEFT_PAD_TOKENS = 32;
+var NUM_DELAY_TOKENS = 6;
+var AUDIO_LENGTH_PER_TOK = 8;
+var OFFLINE_STREAMING_BUFFER_TOKENS = 10;
+var STREAMING_PAD_TOKEN_ID = 32;
+var VoxtralRealtimeProcessor = class extends Processor {
+  static tokenizer_class = AutoTokenizer;
+  static feature_extractor_class = AutoFeatureExtractor;
+  static uses_processor_config = false;
+  /** Number of mel frames in the first audio chunk. */
+  get num_mel_frames_first_audio_chunk() {
+    return (NUM_DELAY_TOKENS + 1) * AUDIO_LENGTH_PER_TOK;
+  }
+  /** Number of raw audio samples in the first audio chunk. */
+  get num_samples_first_audio_chunk() {
+    const { hop_length, n_fft } = this.feature_extractor.config;
+    return (this.num_mel_frames_first_audio_chunk - 1) * hop_length + Math.floor(n_fft / 2);
+  }
+  /** Number of raw audio samples per subsequent audio chunk. */
+  get num_samples_per_audio_chunk() {
+    const { hop_length, n_fft } = this.feature_extractor.config;
+    return AUDIO_LENGTH_PER_TOK * hop_length + n_fft;
+  }
+  /** Number of right-pad tokens for non-streaming mode. */
+  get num_right_pad_tokens() {
+    return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
+  }
+  /** Number of mel frames per text token. */
+  get audio_length_per_tok() {
+    return AUDIO_LENGTH_PER_TOK;
+  }
+  /** Number of raw audio samples per token. */
+  get raw_audio_length_per_tok() {
+    return AUDIO_LENGTH_PER_TOK * this.feature_extractor.config.hop_length;
+  }
+  /**
+   * Process audio input for VoxtralRealtime.
+   *
+   * In streaming mode with `is_first_audio_chunk=true`, the audio is left-padded
+   * with silence and mel features are extracted with `center=true`.
+   * Returns `{ input_ids, input_features }`.
+   *
+   * In streaming mode with `is_first_audio_chunk=false`, the audio chunk is
+   * processed with `center=false` and only `{ input_features }` is returned.
+   *
+   * In non-streaming mode, the audio is right-padded to ensure the model
+   * transcribes the full audio, then processed with `center=true`.
+   * Returns `{ input_features }`.
+   *
+   * @param {Float32Array|Float64Array} audio The audio waveform.
+   * @param {Object} [options]
+   * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
+   * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
+   * @returns {Promise<Object>}
+   */
+  async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
+    validate_audio_inputs(audio, "VoxtralRealtimeProcessor");
+    if (!is_streaming && !is_first_audio_chunk) {
+      throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");
+    }
+    if (is_first_audio_chunk) {
+      if (is_streaming) {
+        const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
+        const padded_audio = new Float32Array(num_left_pad_samples + audio.length);
+        padded_audio.set(audio, num_left_pad_samples);
+        const audio_encoding = await this.feature_extractor(padded_audio, { center: true });
+        const num_pad_tokens = NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
+        const num_input_tokens = 1 + num_pad_tokens;
+        const input_ids_data = new BigInt64Array(num_input_tokens).fill(BigInt(STREAMING_PAD_TOKEN_ID));
+        input_ids_data[0] = 1n;
+        const input_ids = new Tensor2("int64", input_ids_data, [1, num_input_tokens]);
+        return {
+          input_ids,
+          ...audio_encoding
+        };
+      } else {
+        const right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
+        const padded_audio = new Float32Array(audio.length + right_pad_samples);
+        padded_audio.set(audio);
+        return await this.feature_extractor(padded_audio, { center: true });
+      }
+    } else {
+      return await this.feature_extractor(audio, { center: false });
+    }
+  }
+};
 // src/models/wav2vec2/processing_wav2vec2.js
 var Wav2Vec2Processor = class extends Processor {
   static tokenizer_class = AutoTokenizer;
@@ -21307,11 +22255,16 @@ function getNormalizedConfig(config) {
     case "florence2":
     case "llava_onevision":
     case "idefics3":
+    case "granite_speech":
     case "ultravox":
     case "voxtral":
+    case "voxtral_realtime":
     case "smolvlm":
     case "gemma3n":
+    case "lfm2_vl":
     case "chatterbox":
+    case "lighton_ocr":
+    case "glm_ocr":
     case "mistral3":
     case "qwen2_5_vl":
     case "qwen3_vl":
@@ -21365,10 +22318,13 @@ function getNormalizedConfig(config) {
     case "cohere":
     case "cohere2":
     case "mistral":
+    case "voxtral_realtime_text":
+    case "voxtral_realtime_encoder":
     case "starcoder2":
     case "qwen2":
     case "qwen2_moe":
     case "qwen2_vl":
+    case "qwen2_vl_text":
     case "qwen2_5_vl_text":
     case "qwen3_moe":
     case "qwen3_vl_text":
@@ -21384,6 +22340,8 @@ function getNormalizedConfig(config) {
       mapping["dim_kv"] = "head_dim";
       break;
     case "qwen3":
+    case "solar_open":
+    case "glm_ocr_text":
     case "gemma":
     case "gemma2":
     case "vaultgemma":
@@ -21394,6 +22352,7 @@ function getNormalizedConfig(config) {
     case "ernie4_5":
     case "hunyuan_v1_dense":
     case "falcon_h1":
+    case "nemotron_h":
     case "ministral":
     case "ministral3":
       mapping["num_heads"] = "num_key_value_heads";
@@ -21428,6 +22387,9 @@ function getNormalizedConfig(config) {
       mapping["num_attention_heads"] = "num_attention_heads";
       break;
     case "youtu":
+    case "deepseek_v3":
+    case "glm_moe_dsa":
+    case "mistral4":
       mapping["num_heads"] = "num_key_value_heads";
       mapping["num_layers"] = "num_hidden_layers";
       mapping["dim_kv"] = "qk_head_dim";
@@ -21513,6 +22475,10 @@ function getNormalizedConfig(config) {
   return normalized_config;
 }
 function getCacheShapes(config, options) {
+  if (!(config instanceof PretrainedConfig)) {
+    config = new PretrainedConfig(config);
+  }
+  const batch_size = options?.batch_size ?? 1;
   if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
     const pkv_prefix = options?.prefix ?? "past_key_values";
     const conv_prefix = pkv_prefix === "present" ? "present" : "past";
@@ -21522,7 +22488,6 @@ function getCacheShapes(config, options) {
       config
     );
     const head_dim = hidden_size / num_attention_heads;
-    const batch_size = options?.batch_size ?? 1;
     for (let i = 0; i < layer_types.length; ++i) {
       if (layer_types[i] === "full_attention") {
         for (const kv of ["key", "value"]) {
@@ -21535,31 +22500,26 @@ function getCacheShapes(config, options) {
       }
     }
     return cache_values;
-  } else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
+  } else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
     const pkv_prefix = options?.prefix ?? "past_key_values";
     const conv_prefix = pkv_prefix === "present" ? "present" : "past";
-    const cache_values = {};
-    const {
-      layer_types,
-      num_hidden_layers,
-      num_attention_heads,
-      num_key_value_heads,
-      hidden_size,
-      mamba_d_conv,
-      mamba_n_heads,
-      mamba_d_head,
-      mamba_d_state,
-      mamba_n_groups,
-      mamba_expand,
-      mamba_d_ssm
-    } = (
+    const c = (
       /** @type {any} */
       config
     );
-    const head_dim = hidden_size / num_attention_heads;
-    const batch_size = options?.batch_size ?? 1;
-    const conv_d_inner = (mamba_d_ssm ?? mamba_expand * hidden_size) + 2 * mamba_n_groups * mamba_d_state;
-    for (let i = 0; i < num_hidden_layers; ++i) {
+    const layer_types = c.layer_types ?? c.layers_block_type;
+    const num_layers = c.num_hidden_layers ?? layer_types?.length;
+    const num_key_value_heads = c.num_key_value_heads;
+    const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
+    const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
+    const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
+    const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
+    const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
+    const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
+    const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
+    const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
+    const cache_values = {};
+    for (let i = 0; i < num_layers; ++i) {
       if (!layer_types || layer_types[i] === "mamba") {
         cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
         cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
@@ -21593,7 +22553,6 @@ function getCacheShapes(config, options) {
     const key_dim = linear_key_head_dim * linear_num_key_heads;
     const value_dim = linear_value_head_dim * linear_num_value_heads;
     const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
-    const batch_size = options?.batch_size ?? 1;
     for (let i = 0; i < layer_types.length; ++i) {
       if (layer_types[i] === "full_attention") {
         for (const kv of ["key", "value"]) {
@@ -21619,12 +22578,16 @@ function getCacheShapes(config, options) {
       }
     }
     return cache_values;
-  } else if (["qwen3_5", "qwen3_5_moe"].includes(config.model_type)) {
-    return getCacheShapes(
-      /**@type {any} */
-      config.text_config,
-      options
-    );
+  } else if (["lfm2_vl", "qwen3_5", "qwen3_5_moe", "voxtral_realtime"].includes(config.model_type)) {
+    let subConfig;
+    if (config.model_type === "voxtral_realtime" && options?.session_name === "audio_encoder") {
+      subConfig = /** @type {any} */
+      config.audio_config;
+    } else {
+      subConfig = /** @type {any} */
+      config.text_config;
+    }
+    return getCacheShapes(subConfig, options);
   }
   return getKeyValueShapes(config, options);
 }
@@ -21790,7 +22753,7 @@ async function getModelDataFiles(pretrained_model_name_or_path, fileName, suffix
 }
 // src/models/session.js
-async function getSession(pretrained_model_name_or_path, fileName, options, is_decoder = false) {
+async function getSession(pretrained_model_name_or_path, fileName, options, cache_config = false, session_name = void 0) {
   let custom_config = options.config?.["transformers.js_config"] ?? {};
   const selectedDevice = (
     /** @type {import("../utils/devices.js").DeviceType} */
@@ -21848,9 +22811,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
   if (externalData.length > 0 && !apis.IS_NODE_ENV) {
     session_options.externalData = externalData;
   }
-  if (is_decoder && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
+  if (cache_config && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
     const shapes = getCacheShapes(options.config, {
-      prefix: "present"
+      prefix: "present",
+      session_name
     });
     if (Object.keys(shapes).length > 0 && !isONNXProxy()) {
       const preferredOutputLocation = {};
@@ -21868,15 +22832,17 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
   };
   return { buffer_or_path, session_options, session_config };
 }
-async function constructSessions(pretrained_model_name_or_path, names, options, decoder_name = void 0) {
+async function constructSessions(pretrained_model_name_or_path, names, options, cache_sessions = void 0) {
   return Object.fromEntries(
     await Promise.all(
       Object.keys(names).map(async (name) => {
+        const cache_config = cache_sessions?.[name] ?? false;
         const { buffer_or_path, session_options, session_config } = await getSession(
           pretrained_model_name_or_path,
           names[name],
           options,
-          name === decoder_name
+          cache_config,
+          name
         );
         const session = await createInferenceSession(buffer_or_path, session_options, session_config);
         return [name, session];
@@ -23176,19 +24142,71 @@ var BeamSearchSampler = class extends LogitsSampler {
   }
 };
+// src/cache_utils.js
+var _DynamicCache = class {
+  /**
+   * Create a DynamicCache, optionally pre-populated with entries.
+   * @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
+   */
+  constructor(entries) {
+    if (!entries) return;
+    for (const key in entries) {
+      if (key in this) {
+        throw new TypeError(`Key "${key}" conflicts with an existing property on DynamicCache`);
+      }
+      const value = entries[key];
+      if (!(value instanceof Tensor2)) {
+        throw new TypeError(`Expected a Tensor for key "${key}", got ${typeof value}`);
+      }
+      this[key] = value;
+    }
+  }
+  /**
+   * Get the cached sequence length. This requires at least one attention cache entry to be present.
+   * @returns {number} The past sequence length.
+   */
+  get_seq_length() {
+    const self2 = (
+      /** @type {any} */
+      this
+    );
+    for (const name in self2) {
+      if (name.startsWith("past_key_values.")) {
+        return self2[name].dims.at(-2);
+      }
+    }
+    throw new Error("Unable to determine sequence length from the cache.");
+  }
+  /**
+   * Dispose all contained tensors whose data resides on the GPU.
+   * Returns a promise that resolves when all disposals are complete.
+   * @returns {Promise<void>} Promise that resolves when all GPU tensors are disposed.
+   */
+  async dispose() {
+    const promises = [];
+    for (
+      const t of
+      /** @type {Tensor[]} */
+      Object.values(this)
+    ) {
+      if (t.location === "gpu-buffer") {
+        promises.push(t.dispose());
+      }
+    }
+    await Promise.all(promises);
+  }
+};
+var DynamicCache = (
+  /** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */
+  /** @type {unknown} */
+  _DynamicCache
+);
 // src/models/modeling_utils.js
 var MODEL_MAPPING_NAMES = null;
 function registerTaskMappings(mappings) {
   MODEL_MAPPING_NAMES = mappings;
 }
-function getPastLength(past_key_values) {
-  for (const name in past_key_values) {
-    if (name.startsWith("past_key_values.")) {
-      return past_key_values[name].dims.at(-2);
-    }
-  }
-  return Object.values(past_key_values)[0].dims.at(-2);
-}
 function toI64Tensor(items) {
   if (items instanceof Tensor2) {
     return items;
@@ -23229,71 +24247,181 @@ var MODEL_TYPES = {
   AutoEncoder: 12,
   ImageAudioTextToText: 13,
   Supertonic: 14,
-  Chatterbox: 15
+  Chatterbox: 15,
+  MultimodalLanguageModelOnly: 16,
+  VoxtralRealtime: 17
 };
 var MODEL_TYPE_CONFIG = {
   [MODEL_TYPES.DecoderOnly]: {
     can_generate: true,
     forward: decoder_forward,
-    prepare_inputs: decoder_prepare_inputs_for_generation
+    prepare_inputs: decoder_prepare_inputs_for_generation,
+    sessions: (config, options) => ({ model: options.model_file_name ?? "model" }),
+    cache_sessions: { model: true },
+    optional_configs: { generation_config: "generation_config.json" }
   },
   [MODEL_TYPES.DecoderOnlyWithoutHead]: {
     can_generate: false,
     forward: decoder_forward,
-    prepare_inputs: decoder_prepare_inputs_for_generation
+    prepare_inputs: decoder_prepare_inputs_for_generation,
+    sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
   },
   [MODEL_TYPES.Seq2Seq]: {
     can_generate: true,
     forward: seq2seq_forward,
-    prepare_inputs: encoder_decoder_prepare_inputs_for_generation
+    prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
+    sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
+    cache_sessions: { decoder_model_merged: true },
+    optional_configs: { generation_config: "generation_config.json" }
   },
   [MODEL_TYPES.Vision2Seq]: {
     can_generate: true,
     forward: seq2seq_forward,
-    prepare_inputs: encoder_decoder_prepare_inputs_for_generation
+    prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
+    sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
+    cache_sessions: { decoder_model_merged: true },
+    optional_configs: { generation_config: "generation_config.json" }
   },
   [MODEL_TYPES.Musicgen]: {
     can_generate: true,
-    forward: seq2seq_forward
+    forward: seq2seq_forward,
+    sessions: () => ({
+      model: "text_encoder",
+      decoder_model_merged: "decoder_model_merged",
+      encodec_decode: "encodec_decode"
+    }),
+    cache_sessions: { decoder_model_merged: true },
+    optional_configs: { generation_config: "generation_config.json" }
   },
   [MODEL_TYPES.EncoderDecoder]: {
     can_generate: false,
-    forward: seq2seq_forward
+    forward: seq2seq_forward,
+    sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
+    cache_sessions: { decoder_model_merged: true }
+  },
+  [MODEL_TYPES.MaskGeneration]: {
+    sessions: () => ({ model: "vision_encoder", prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder" })
   },
   [MODEL_TYPES.ImageTextToText]: {
     can_generate: true,
     forward: image_text_to_text_forward,
-    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
+    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
+    sessions: (config) => {
+      const s = {
+        embed_tokens: "embed_tokens",
+        vision_encoder: "vision_encoder",
+        decoder_model_merged: "decoder_model_merged"
+      };
+      if (config.is_encoder_decoder) s["model"] = "encoder_model";
+      return s;
+    },
+    cache_sessions: { decoder_model_merged: true },
+    optional_configs: { generation_config: "generation_config.json" }
   },
   [MODEL_TYPES.AudioTextToText]: {
     can_generate: true,
     forward: audio_text_to_text_forward,
-    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
+    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
+    sessions: () => ({
+      embed_tokens: "embed_tokens",
+      audio_encoder: "audio_encoder",
+      decoder_model_merged: "decoder_model_merged"
+    }),
+    cache_sessions: { decoder_model_merged: true },
+    optional_configs: { generation_config: "generation_config.json" }
   },
-  [MODEL_TYPES.Phi3V]: {
+  [MODEL_TYPES.ImageAudioTextToText]: {
     can_generate: true,
-    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
+    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
+    sessions: () => ({
+      embed_tokens: "embed_tokens",
+      audio_encoder: "audio_encoder",
+      vision_encoder: "vision_encoder",
+      decoder_model_merged: "decoder_model_merged"
+    }),
+    optional_configs: { generation_config: "generation_config.json" }
   },
-  [MODEL_TYPES.ImageAudioTextToText]: {
+  [MODEL_TYPES.Phi3V]: {
     can_generate: true,
-    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
+    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
+    sessions: () => ({
+      prepare_inputs_embeds: "prepare_inputs_embeds",
+      model: "model",
+      vision_encoder: "vision_encoder"
+    }),
+    cache_sessions: { model: true },
+    optional_configs: { generation_config: "generation_config.json" }
   },
   [MODEL_TYPES.MultiModality]: {
-    can_generate: true
+    can_generate: true,
+    sessions: () => ({
+      prepare_inputs_embeds: "prepare_inputs_embeds",
+      model: "language_model",
+      lm_head: "lm_head",
+      gen_head: "gen_head",
+      gen_img_embeds: "gen_img_embeds",
+      image_decode: "image_decode"
+    }),
+    cache_sessions: { model: true },
+    optional_configs: { generation_config: "generation_config.json" }
   },
   [MODEL_TYPES.AutoEncoder]: {
     can_generate: false,
-    forward: auto_encoder_forward
+    forward: auto_encoder_forward,
+    sessions: () => ({ encoder_model: "encoder_model", decoder_model: "decoder_model" })
+  },
+  [MODEL_TYPES.Supertonic]: {
+    sessions: () => ({
+      text_encoder: "text_encoder",
+      latent_denoiser: "latent_denoiser",
+      voice_decoder: "voice_decoder"
+    })
   },
   [MODEL_TYPES.Chatterbox]: {
     can_generate: true,
-    forward: encoder_forward
+    forward: encoder_forward,
+    sessions: () => ({
+      embed_tokens: "embed_tokens",
+      speech_encoder: "speech_encoder",
+      model: "language_model",
+      conditional_decoder: "conditional_decoder"
+    }),
+    cache_sessions: { model: true },
+    optional_configs: { generation_config: "generation_config.json" }
+  },
+  [MODEL_TYPES.MultimodalLanguageModelOnly]: {
+    can_generate: true,
+    forward: image_text_to_text_forward,
+    prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
+    sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
+    cache_sessions: { decoder_model_merged: true },
+    optional_configs: { generation_config: "generation_config.json" }
+  },
+  [MODEL_TYPES.VoxtralRealtime]: {
+    can_generate: true,
+    prepare_inputs: decoder_prepare_inputs_for_generation,
+    sessions: () => ({
+      embed_tokens: "embed_tokens",
+      audio_encoder: "audio_encoder",
+      decoder_model_merged: "decoder_model_merged"
+    }),
+    cache_sessions: { decoder_model_merged: true, audio_encoder: true },
+    optional_configs: { generation_config: "generation_config.json" }
   },
   default: {
     can_generate: false,
-    forward: encoder_forward
+    forward: encoder_forward,
+    sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
   }
 };
+function getSessionsConfig(modelType, config, options = {}) {
+  const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
+  return {
+    sessions: typeConfig.sessions(config, options),
+    cache_sessions: typeConfig.cache_sessions,
+    optional_configs: typeConfig.optional_configs
+  };
+}
 var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
 var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
 var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
@@ -23379,245 +24507,23 @@ var PreTrainedModel = class extends Callable2 {
     const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
     const modelType = MODEL_TYPE_MAPPING.get(modelName);
     config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
-    let info;
-    if (modelType === MODEL_TYPES.DecoderOnly) {
-      info = await Promise.all([
-        constructSessions(
-          pretrained_model_name_or_path,
-          {
-            model: options.model_file_name ?? "model"
-          },
-          options,
-          "model"
-        ),
-        get_optional_configs(
-          pretrained_model_name_or_path,
-          {
-            generation_config: "generation_config.json"
-          },
-          options
-        )
-      ]);
-    } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
-      info = await Promise.all([
-        constructSessions(
-          pretrained_model_name_or_path,
-          {
-            model: "encoder_model",
-            decoder_model_merged: "decoder_model_merged"
-          },
-          options,
-          "decoder_model_merged"
-        ),
-        get_optional_configs(
-          pretrained_model_name_or_path,
-          {
-            generation_config: "generation_config.json"
-          },
-          options
-        )
-      ]);
-    } else if (modelType === MODEL_TYPES.MaskGeneration) {
-      info = await Promise.all([
-        constructSessions(
-          pretrained_model_name_or_path,
-          {
-            model: "vision_encoder",
-            prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder"
-          },
-          options
-        )
-      ]);
-    } else if (modelType === MODEL_TYPES.EncoderDecoder) {
-      info = await Promise.all([
-        constructSessions(
-          pretrained_model_name_or_path,
-          {
-            model: "encoder_model",
-            decoder_model_merged: "decoder_model_merged"
-          },
-          options,
-          "decoder_model_merged"
-        )
-      ]);
-    } else if (modelType === MODEL_TYPES.ImageTextToText) {
-      const sessions = {
-        embed_tokens: "embed_tokens",
-        vision_encoder: "vision_encoder",
-        decoder_model_merged: "decoder_model_merged"
-      };
-      if (config.is_encoder_decoder) {
-        sessions["model"] = "encoder_model";
-      }
-      info = await Promise.all([
-        constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
-        get_optional_configs(
-          pretrained_model_name_or_path,
-          {
-            generation_config: "generation_config.json"
-          },
-          options
-        )
-      ]);
-    } else if (modelType === MODEL_TYPES.AudioTextToText) {
-      const sessions = {
-        embed_tokens: "embed_tokens",
-        audio_encoder: "audio_encoder",
-        decoder_model_merged: "decoder_model_merged"
-      };
-      info = await Promise.all([
-        constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
-        get_optional_configs(
-          pretrained_model_name_or_path,
-          {
-            generation_config: "generation_config.json"
-          },
-          options
-        )
-      ]);
-    } else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
-      const sessions = {
-        embed_tokens: "embed_tokens",
-        audio_encoder: "audio_encoder",
-        vision_encoder: "vision_encoder",
-        decoder_model_merged: "decoder_model_merged"
-      };
-      info = await Promise.all([
-        constructSessions(pretrained_model_name_or_path, sessions, options),
-        get_optional_configs(
-          pretrained_model_name_or_path,
-          {
-            generation_config: "generation_config.json"
-          },
-          options
-        )
-      ]);
-    } else if (modelType === MODEL_TYPES.Musicgen) {
-      info = await Promise.all([
-        constructSessions(
-          pretrained_model_name_or_path,
-          {
-            model: "text_encoder",
-            decoder_model_merged: "decoder_model_merged",
-            encodec_decode: "encodec_decode"
-          },
-          options,
-          "decoder_model_merged"
-        ),
-        get_optional_configs(
-          pretrained_model_name_or_path,
-          {
-            generation_config: "generation_config.json"
-          },
-          options
-        )
-      ]);
-    } else if (modelType === MODEL_TYPES.MultiModality) {
-      info = await Promise.all([
-        constructSessions(
-          pretrained_model_name_or_path,
-          {
-            prepare_inputs_embeds: "prepare_inputs_embeds",
-            model: "language_model",
-            lm_head: "lm_head",
-            gen_head: "gen_head",
-            gen_img_embeds: "gen_img_embeds",
-            image_decode: "image_decode"
-          },
-          options,
-          "model"
-        ),
-        get_optional_configs(
-          pretrained_model_name_or_path,
-          {
-            generation_config: "generation_config.json"
-          },
-          options
-        )
-      ]);
-    } else if (modelType === MODEL_TYPES.Phi3V) {
-      info = await Promise.all([
-        constructSessions(
-          pretrained_model_name_or_path,
-          {
-            prepare_inputs_embeds: "prepare_inputs_embeds",
-            model: "model",
-            vision_encoder: "vision_encoder"
-          },
-          options,
-          "model"
-        ),
-        get_optional_configs(
-          pretrained_model_name_or_path,
-          {
-            generation_config: "generation_config.json"
-          },
-          options
-        )
-      ]);
-    } else if (modelType === MODEL_TYPES.Chatterbox) {
-      info = await Promise.all([
-        constructSessions(
-          pretrained_model_name_or_path,
-          {
-            embed_tokens: "embed_tokens",
-            speech_encoder: "speech_encoder",
-            model: "language_model",
-            conditional_decoder: "conditional_decoder"
-          },
-          options,
-          "model"
-        ),
-        get_optional_configs(
-          pretrained_model_name_or_path,
-          {
-            generation_config: "generation_config.json"
-          },
-          options
-        )
-      ]);
-    } else if (modelType === MODEL_TYPES.AutoEncoder) {
-      info = await Promise.all([
-        constructSessions(
-          pretrained_model_name_or_path,
-          {
-            encoder_model: "encoder_model",
-            decoder_model: "decoder_model"
-          },
-          options
-        )
-      ]);
-    } else if (modelType === MODEL_TYPES.Supertonic) {
-      info = await Promise.all([
-        constructSessions(
-          pretrained_model_name_or_path,
-          {
-            text_encoder: "text_encoder",
-            latent_denoiser: "latent_denoiser",
-            voice_decoder: "voice_decoder"
-          },
-          options
-        )
-      ]);
-    } else {
-      if (modelType === void 0) {
-        const type = modelName ?? config?.model_type;
-        if (type !== "custom") {
-          logger.warn(
-            `Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
-          );
-        }
+    const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
+    if (modelType === void 0) {
+      const type = modelName ?? config?.model_type;
+      if (type !== "custom") {
+        logger.warn(
+          `Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
+        );
       }
-      info = await Promise.all([
-        constructSessions(
-          pretrained_model_name_or_path,
-          {
-            model: options.model_file_name ?? "model"
-          },
-          options
-        )
-      ]);
     }
+    const sessions = typeConfig.sessions(config, options);
+    const promises = [
+      constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
+    ];
+    if (typeConfig.optional_configs) {
+      promises.push(get_optional_configs(pretrained_model_name_or_path, typeConfig.optional_configs, options));
+    }
+    const info = await Promise.all(promises);
     return new this(config, ...info);
   }
   /**
@@ -23816,7 +24722,7 @@ var PreTrainedModel = class extends Callable2 {
    * @param {Tensor} [params.inputs=null]
    * @param {number} [params.bos_token_id=null]
    * @param {Record<string, Tensor|number[]>} [params.model_kwargs]
-   * @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor>, model_input_name: string}} The model-specific inputs for generation.
+   * @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor> & {past_key_values?: DynamicCache}, model_input_name: string}} The model-specific inputs for generation.
    */
   _prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) {
     const model_inputs = pick(model_kwargs, this.forward_params);
@@ -24057,11 +24963,12 @@ var PreTrainedModel = class extends Callable2 {
     }
   }
   /**
-   * Returns an object containing past key values from the given decoder results object.
+   * Returns a DynamicCache containing past key values from the given decoder results object.
    *
    * @param {Object} decoderResults The decoder results object.
-   * @param {Object} pastKeyValues The previous past key values.
-   * @returns {Object} An object containing past key values.
+   * @param {DynamicCache} pastKeyValues The previous past key values.
+   * @param {boolean} [disposeEncoderPKVs=false] Whether to dispose encoder past key values.
+   * @returns {DynamicCache} A new DynamicCache containing the updated past key values.
    */
   getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
     const pkvs = /* @__PURE__ */ Object.create(null);
@@ -24082,7 +24989,7 @@ var PreTrainedModel = class extends Callable2 {
         }
       }
     }
-    return pkvs;
+    return new DynamicCache(pkvs);
   }
   /**
    * Returns an object containing attentions from the given model output object.
@@ -24107,8 +25014,8 @@ var PreTrainedModel = class extends Callable2 {
   /**
    * Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values.
    *
-   * @param {Object} decoderFeeds The decoder feeds object to add past key values to.
-   * @param {Object} pastKeyValues An object containing past key values.
+   * @param {Record<string, any>} decoderFeeds The decoder feeds object to add past key values to.
+   * @param {DynamicCache|null} pastKeyValues The cache containing past key values.
    */
   addPastKeyValues(decoderFeeds, pastKeyValues) {
     if (pastKeyValues) {
@@ -24125,14 +25032,29 @@ var PreTrainedModel = class extends Callable2 {
       }
     }
   }
-  async encode_image({ pixel_values }) {
-    return (await sessionRun(this.sessions["vision_encoder"], { pixel_values })).image_features;
+  /**
+   * Helper that runs one named session (e.g. "vision_encoder", "embed_tokens" or "audio_encoder")
+   * and returns a single entry of its output.
+   *
+   * Only the entries of `inputs` selected by `pick(inputs, session.inputNames)` are passed to the session.
+   * @param {string} sessionName Key into `this.sessions` identifying the session to run.
+   * @param {Record<string, Tensor>} inputs Candidate inputs; filtered against the session's `inputNames` before the run.
+   * @param {string} outputName Key of the session output to return.
+   * @returns {Promise<any>} The `outputName` entry of the session output (expected to be a Tensor; confirm against `sessionRun`).
+   * @throws {Error} If `this.sessions` has no own property named `sessionName`.
+   * @private
+   */
+  async _encode_input(sessionName, inputs, outputName) {
+    if (!Object.hasOwn(this.sessions, sessionName)) {
+      throw new Error(`Model does not have a ${sessionName} session.`);
+    }
+    const session = this.sessions[sessionName];
+    const output = await sessionRun(session, pick(inputs, session.inputNames));
+    return output[outputName];
   }
-  async encode_text({ input_ids }) {
-    return (await sessionRun(this.sessions["embed_tokens"], { input_ids })).inputs_embeds;
+  async encode_image(inputs) {
+    return this._encode_input("vision_encoder", inputs, "image_features");
   }
-  async encode_audio({ audio_values }) {
-    return (await sessionRun(this.sessions["audio_encoder"], { audio_values })).audio_features;
+  async encode_text(inputs) {
+    return this._encode_input("embed_tokens", inputs, "inputs_embeds");
+  }
+  async encode_audio(inputs) {
+    return this._encode_input("audio_encoder", inputs, "audio_features");
   }
 };
 async function seq2seq_forward(self2, model_inputs) {
@@ -24187,6 +25109,9 @@ async function decoder_forward(self2, model_inputs, is_encoder_decoder = false)
     const start_index = ["paligemma", "gemma3_text", "gemma3"].includes(self2.config.model_type) ? 1 : 0;
     new_model_inputs.position_ids = create_position_ids(new_model_inputs, past_key_values, start_index);
   }
+  if (session.inputNames.includes("num_logits_to_keep") && !new_model_inputs.num_logits_to_keep) {
+    new_model_inputs.num_logits_to_keep = new Tensor2("int64", [0n], []);
+  }
   self2.addPastKeyValues(new_model_inputs, past_key_values);
   const fixed = pick(new_model_inputs, session.inputNames);
   return await sessionRun(session, fixed);
@@ -24195,7 +25120,7 @@ async function generic_text_to_text_forward(self2, {
   // Generic parameters:
   encode_function,
   merge_function,
-  modality_input_name,
+  modality_input_names,
   modality_output_name,
   // Produced by the tokenizer/processor:
   input_ids = null,
@@ -24210,32 +25135,34 @@ async function generic_text_to_text_forward(self2, {
   // Additional parameters
   ...kwargs
 }) {
-  const modality_values = kwargs[modality_input_name];
   if (!inputs_embeds) {
     inputs_embeds = await self2.encode_text({ input_ids, ...kwargs });
-    if (modality_values && input_ids.dims[1] !== 1) {
-      const modality_features = await encode_function({
-        // Pass the modality values under its expected key.
-        // The caller knows whether this is audio or image.
-        [modality_input_name]: modality_values,
-        ...kwargs
-      });
-      ({ inputs_embeds, attention_mask } = merge_function({
-        [modality_output_name]: modality_features,
-        inputs_embeds,
-        input_ids,
-        attention_mask
-      }));
-    } else if (past_key_values && modality_values && input_ids.dims[1] === 1) {
-      const target_length = input_ids.dims[1];
-      const past_length = getPastLength(past_key_values);
-      attention_mask = cat(
-        [
-          ones([input_ids.dims[0], past_length]),
-          attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
-        ],
-        1
-      );
+    const modality_values = pick(kwargs, modality_input_names);
+    if (Object.keys(modality_values).length > 0) {
+      if (input_ids.dims[1] !== 1) {
+        const modality_features = await encode_function({
+          // Pass the modality values under its expected key.
+          // The caller knows whether this is audio or image.
+          ...modality_values,
+          ...kwargs
+        });
+        ({ inputs_embeds, attention_mask } = merge_function({
+          [modality_output_name]: modality_features,
+          inputs_embeds,
+          input_ids,
+          attention_mask
+        }));
+      } else if (past_key_values && input_ids.dims[1] === 1) {
+        const target_length = input_ids.dims[1];
+        const past_length = past_key_values.get_seq_length();
+        attention_mask = cat(
+          [
+            ones([input_ids.dims[0], past_length]),
+            attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
+          ],
+          1
+        );
+      }
     }
   }
   if (!position_ids) {
@@ -24243,14 +25170,19 @@ async function generic_text_to_text_forward(self2, {
       // Handle special case for qwen vl models
       [
         "qwen2_vl",
+        "qwen2_vl_text",
         "qwen2_5_vl",
         "qwen2_5_vl_text",
         "qwen3_vl",
         "qwen3_vl_text",
+        "qwen3_vl_moe",
+        "qwen3_vl_moe_text",
         "qwen3_5",
         "qwen3_5_text",
         "qwen3_5_moe",
-        "qwen3_5_moe_text"
+        "qwen3_5_moe_text",
+        "glm_ocr",
+        "glm_ocr_text"
       ].includes(self2.config.model_type)
     ) {
       const { image_grid_thw, video_grid_thw } = kwargs;
@@ -24274,7 +25206,7 @@ async function generic_text_to_text_forward(self2, {
 async function audio_text_to_text_forward(self2, params) {
   return await generic_text_to_text_forward(self2, {
     ...params,
-    modality_input_name: "audio_values",
+    modality_input_names: ["audio_values", "input_features"],
     modality_output_name: "audio_features",
     encode_function: self2.encode_audio.bind(self2),
     merge_function: self2._merge_input_ids_with_audio_features.bind(self2)
@@ -24283,7 +25215,7 @@ async function audio_text_to_text_forward(self2, params) {
 async function image_text_to_text_forward(self2, params) {
   return await generic_text_to_text_forward(self2, {
     ...params,
-    modality_input_name: "pixel_values",
+    modality_input_names: ["pixel_values"],
     modality_output_name: "image_features",
     encode_function: self2.encode_image.bind(self2),
     merge_function: self2._merge_input_ids_with_image_features.bind(self2)
@@ -24319,7 +25251,11 @@ function create_position_ids(model_inputs, past_key_values = null, start_index =
   return position_ids;
 }
 function decoder_prepare_inputs_for_generation(self2, input_ids, model_inputs, generation_config) {
-  const past_length = model_inputs.past_key_values ? getPastLength(model_inputs.past_key_values) : 0;
+  const past_length = model_inputs.past_key_values ? model_inputs.past_key_values.get_seq_length() : 0;
+  const session = self2.sessions["decoder_model_merged"] ?? self2.sessions["model"];
+  if (session?.inputNames.includes("num_logits_to_keep") && !model_inputs.num_logits_to_keep) {
+    model_inputs.num_logits_to_keep = new Tensor2("int64", [1n], []);
+  }
   if (!model_inputs.attention_mask) {
     let dims;
     for (const key of ["input_ids", "inputs_embeds", "position_ids"]) {
@@ -24470,6 +25406,8 @@ __export(models_exports, {
   BloomForCausalLM: () => BloomForCausalLM,
   BloomModel: () => BloomModel,
   BloomPreTrainedModel: () => BloomPreTrainedModel,
+  CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
+  CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
   CLIPModel: () => CLIPModel,
   CLIPPreTrainedModel: () => CLIPPreTrainedModel,
   CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
@@ -24544,6 +25482,9 @@ __export(models_exports, {
   DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
   DecisionTransformerModel: () => DecisionTransformerModel,
   DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
+  DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
+  DeepseekV3Model: () => DeepseekV3Model,
+  DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
   DeiTForImageClassification: () => DeiTForImageClassification,
   DeiTModel: () => DeiTModel,
   DeiTPreTrainedModel: () => DeiTPreTrainedModel,
@@ -24589,6 +25530,11 @@ __export(models_exports, {
   EsmForTokenClassification: () => EsmForTokenClassification,
   EsmModel: () => EsmModel,
   EsmPreTrainedModel: () => EsmPreTrainedModel,
+  EuroBertForMaskedLM: () => EuroBertForMaskedLM,
+  EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
+  EuroBertForTokenClassification: () => EuroBertForTokenClassification,
+  EuroBertModel: () => EuroBertModel,
+  EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
   ExaoneForCausalLM: () => ExaoneForCausalLM,
   ExaoneModel: () => ExaoneModel,
   ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -24627,6 +25573,7 @@ __export(models_exports, {
   Gemma3ForCausalLM: () => Gemma3ForCausalLM,
   Gemma3Model: () => Gemma3Model,
   Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
+  Gemma3nForCausalLM: () => Gemma3nForCausalLM,
   Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
   Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
   GemmaForCausalLM: () => GemmaForCausalLM,
@@ -24634,6 +25581,10 @@ __export(models_exports, {
   GemmaPreTrainedModel: () => GemmaPreTrainedModel,
   GlmForCausalLM: () => GlmForCausalLM,
   GlmModel: () => GlmModel,
+  GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
+  GlmMoeDsaModel: () => GlmMoeDsaModel,
+  GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
+  GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
   GlmPreTrainedModel: () => GlmPreTrainedModel,
   GptOssForCausalLM: () => GptOssForCausalLM,
   GptOssModel: () => GptOssModel,
@@ -24644,6 +25595,7 @@ __export(models_exports, {
   GraniteMoeHybridModel: () => GraniteMoeHybridModel,
   GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
   GranitePreTrainedModel: () => GranitePreTrainedModel,
+  GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
   GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
   GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
   GroupViTModel: () => GroupViTModel,
@@ -24665,7 +25617,6 @@ __export(models_exports, {
   IJepaModel: () => IJepaModel,
   IJepaPreTrainedModel: () => IJepaPreTrainedModel,
   Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
-  Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
   JAISLMHeadModel: () => JAISLMHeadModel,
   JAISModel: () => JAISModel,
   JAISPreTrainedModel: () => JAISPreTrainedModel,
@@ -24679,6 +25630,8 @@ __export(models_exports, {
   Lfm2MoeModel: () => Lfm2MoeModel,
   Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
   Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
+  Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
+  LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
   LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
   Llama4ForCausalLM: () => Llama4ForCausalLM,
   Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -24728,6 +25681,9 @@ __export(models_exports, {
   MimiEncoderOutput: () => MimiEncoderOutput,
   MimiModel: () => MimiModel,
   MimiPreTrainedModel: () => MimiPreTrainedModel,
+  Mistral4ForCausalLM: () => Mistral4ForCausalLM,
+  Mistral4Model: () => Mistral4Model,
+  Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
   MistralForCausalLM: () => MistralForCausalLM,
   MistralModel: () => MistralModel,
   MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -24785,6 +25741,9 @@ __export(models_exports, {
   NanoChatForCausalLM: () => NanoChatForCausalLM,
   NanoChatModel: () => NanoChatModel,
   NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
+  NemotronHForCausalLM: () => NemotronHForCausalLM,
+  NemotronHModel: () => NemotronHModel,
+  NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
   NeoBertForMaskedLM: () => NeoBertForMaskedLM,
   NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
   NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -24818,7 +25777,6 @@ __export(models_exports, {
   Owlv2Model: () => Owlv2Model,
   Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
   PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
-  PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
   ParakeetForCTC: () => ParakeetForCTC,
   ParakeetPreTrainedModel: () => ParakeetPreTrainedModel,
   PatchTSMixerForPrediction: () => PatchTSMixerForPrediction,
@@ -24848,8 +25806,10 @@ __export(models_exports, {
   Qwen2MoeModel: () => Qwen2MoeModel,
   Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
   Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
+  Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
   Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
   Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
+  Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
   Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
   Qwen3ForCausalLM: () => Qwen3ForCausalLM,
   Qwen3Model: () => Qwen3Model,
@@ -24860,9 +25820,13 @@ __export(models_exports, {
   Qwen3NextModel: () => Qwen3NextModel,
   Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
   Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
+  Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
   Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
+  Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
   Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
+  Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
   Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
+  Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
   Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
   RFDetrForObjectDetection: () => RFDetrForObjectDetection,
   RFDetrModel: () => RFDetrModel,
@@ -24913,11 +25877,13 @@ __export(models_exports, {
   SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
   SmolLM3Model: () => SmolLM3Model,
   SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
-  SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
   SnacDecoderModel: () => SnacDecoderModel,
   SnacEncoderModel: () => SnacEncoderModel,
   SnacModel: () => SnacModel,
   SnacPreTrainedModel: () => SnacPreTrainedModel,
+  SolarOpenForCausalLM: () => SolarOpenForCausalLM,
+  SolarOpenModel: () => SolarOpenModel,
+  SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
   SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
   SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
   SpeechT5HifiGan: () => SpeechT5HifiGan,
@@ -24985,6 +25951,8 @@ __export(models_exports, {
   VitsModelOutput: () => VitsModelOutput,
   VitsPreTrainedModel: () => VitsPreTrainedModel,
   VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
+  VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
+  VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
   Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
   Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
   Wav2Vec2BertModel: () => Wav2Vec2BertModel,
@@ -25090,7 +26058,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
 var ArceeForCausalLM = class extends ArceePreTrainedModel {
 };
-// src/models/ast/modeling_ast.js
+// src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
 var ASTPreTrainedModel = class extends PreTrainedModel {
 };
 var ASTModel = class extends ASTPreTrainedModel {
@@ -25345,7 +26313,7 @@ var ChatterboxModel = class extends ChatterboxPreTrainedModel {
         if (!past_key_values || target_length !== 1) {
           throw new Error("Incorrect state encountered during generation.");
         }
-        const past_length = Object.values(past_key_values)[0].dims.at(-2);
+        const past_length = past_key_values.get_seq_length();
         attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
       }
     }
@@ -25425,6 +26393,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
 var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
 };
+// src/models/chmv2/modeling_chmv2.js
+var CHMv2PreTrainedModel = class extends PreTrainedModel { // Base class for the CHMv2 models; declares no members of its own.
+};
+var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel { // Depth-estimation class (per its name); adds no overrides to the base class.
+};
 // src/models/clap/modeling_clap.js
 var ClapPreTrainedModel = class extends PreTrainedModel {
 };
@@ -25763,6 +26737,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
   }
 };
+// src/models/deepseek_v3/modeling_deepseek_v3.js
+var DeepseekV3PreTrainedModel = class extends PreTrainedModel { // Base class for the DeepseekV3 models; declares no members of its own.
+};
+var DeepseekV3Model = class extends DeepseekV3PreTrainedModel { // Adds no overrides to the base class.
+};
+var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel { // Causal-LM class (per its name); adds no overrides to the base class.
+};
 // src/models/deberta_v2/modeling_deberta_v2.js
 var DebertaV2PreTrainedModel = class extends PreTrainedModel {
 };
@@ -26111,6 +27093,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
   }
 };
+// src/models/eurobert/modeling_eurobert.js
+var EuroBertPreTrainedModel = class extends PreTrainedModel { // Base class for the EuroBert models; declares no members of its own.
+};
+var EuroBertModel = class extends EuroBertPreTrainedModel { // Adds no overrides to the base class.
+};
+var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
+  /**
+   * Calls the model on new inputs.
+   *
+   * Delegates to the inherited `_call` and wraps the awaited result in a `MaskedLMOutput`.
+   *
+   * @param {Object} model_inputs The inputs to the model; forwarded unchanged to `super._call`.
+   * @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
+   */
+  async _call(model_inputs) {
+    return new MaskedLMOutput(await super._call(model_inputs));
+  }
+};
+var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
+  /**
+   * Calls the model on new inputs.
+   *
+   * Delegates to the inherited `_call` and wraps the awaited result in a `SequenceClassifierOutput`.
+   *
+   * @param {Object} model_inputs The inputs to the model; forwarded unchanged to `super._call`.
+   * @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
+   */
+  async _call(model_inputs) {
+    return new SequenceClassifierOutput(await super._call(model_inputs));
+  }
+};
+var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
+  /**
+   * Calls the model on new inputs.
+   *
+   * Delegates to the inherited `_call` and wraps the awaited result in a `TokenClassifierOutput`.
+   *
+   * @param {Object} model_inputs The inputs to the model; forwarded unchanged to `super._call`.
+   * @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
+   */
+  async _call(model_inputs) {
+    return new TokenClassifierOutput(await super._call(model_inputs));
+  }
+};
 // src/models/exaone/modeling_exaone.js
 var ExaonePreTrainedModel = class extends PreTrainedModel {
 };
@@ -26375,6 +27396,8 @@ var Gemma3nForConditionalGeneration = class extends Gemma3nPreTrainedModel {
     });
   }
 };
+var Gemma3nForCausalLM = class extends Gemma3nForConditionalGeneration { // Inherits everything from Gemma3nForConditionalGeneration; adds no overrides.
+};
 // src/models/glm/modeling_glm.js
 var GlmPreTrainedModel = class extends PreTrainedModel {
@@ -26384,6 +27407,377 @@ var GlmModel = class extends GlmPreTrainedModel {
 var GlmForCausalLM = class extends GlmPreTrainedModel {
 };
+// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
+var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
+};
+var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
+};
+var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
+};
+// src/models/qwen2_vl/modeling_qwen2_vl.js
+// Base class that Qwen2VLForConditionalGeneration (and, through it, the Qwen2.5-VL / GlmOcr
+// classes in this file) derive from. It only declares the `forward_params` list.
+var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
+  // NOTE(review): presumably the input names consumed by the inherited forward pass —
+  // confirm against PreTrainedModel, which is defined elsewhere in the bundle.
+  forward_params = [
+    // Text inputs
+    "input_ids",
+    "attention_mask",
+    "position_ids",
+    "past_key_values",
+    // Vision inputs
+    "pixel_values",
+    "image_grid_thw"
+  ];
+};
+var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
+  // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
+  // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
+  // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
+  image_grid_thw_name = "grid_thw";
+  /**
+   * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
+   *
+   * With an attention mask, the per-token positions are taken from
+   * `cumsum_masked_fill(attention_mask)` and repeated once per rope dimension.
+   * Without a mask, every row is `0, 1, ..., seq_length - 1`.
+   *
+   * The returned delta is `max_position + 1 - sequence_length`, matching the multimodal
+   * branch of `get_rope_index`, so that `past_length + delta` (as computed in
+   * `prepare_inputs_for_generation`) is the position of the next generated token.
+   *
+   * @param {Tensor} input_ids Tensor of shape `(batch_size, sequence_length)`; only its dims are read.
+   * @param {Tensor} attention_mask (Optional) Tensor of shape `(batch_size, sequence_length)`.
+   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas], of shapes
+   * `(3, batch_size, sequence_length)` and `(batch_size, 1)`.
+   */
+  _get_text_only_rope_index(input_ids, attention_mask) {
+    if (attention_mask) {
+      const { data, dims } = cumsum_masked_fill(attention_mask);
+      // Tile the (batch, seq) positions three times: one copy per rope dimension.
+      const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
+      const row_length = BigInt(dims[1]);
+      const mrope_position_deltas = Array.from(
+        { length: dims[0] },
+        // Subtract (not add) the row length: an unpadded row must yield a delta of 0.
+        (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n - row_length
+      );
+      return [
+        new Tensor2("int64", position_ids, [3, ...dims]),
+        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
+      ];
+    } else {
+      const [batch_size, seq_length] = input_ids.dims;
+      const position_ids = BigInt64Array.from(
+        { length: 3 * batch_size * seq_length },
+        // The buffer is laid out as [3, batch, seq], so the in-row position is simply
+        // the flat index modulo seq_length, independent of the batch size.
+        (_, i) => BigInt(i % seq_length)
+      );
+      return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
+    }
+  }
+  /**
+   * Flatten per-segment position lists (each laid out as [t..., h..., w...]) into one
+   * array laid out as [all_t..., all_h..., all_w...], then copy those values into
+   * `position_ids_list` at the sequence slots whose attention-mask entry equals 1.
+   * @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
+   * @param {number[]} attn_mask Attention mask for this batch element
+   * @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
+   * @param {number} batch_idx Current batch index
+   * @returns {number[]} Flat reordered positions of length total_len
+   */
+  _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
+    let total_len = 0;
+    for (const segment of llm_pos_ids_list) {
+      total_len += segment.length;
+    }
+    const llm_positions = new Array(total_len);
+    let write_at = 0;
+    for (let dim = 0; dim < 3; ++dim) {
+      for (const segment of llm_pos_ids_list) {
+        const seg_len = segment.length / 3;
+        const stop = (dim + 1) * seg_len;
+        let read_at = dim * seg_len;
+        while (read_at < stop) {
+          llm_positions[write_at] = segment[read_at];
+          ++write_at;
+          ++read_at;
+        }
+      }
+    }
+    // Number of unmasked tokens consumed so far; indexes into each dimension's run.
+    let visible_count = 0;
+    for (const [seq_idx, mask_value] of attn_mask.entries()) {
+      // Loose equality is kept on purpose so the comparison behaves as before for any mask value type.
+      if (!(mask_value == 1)) continue;
+      for (let dim = 0; dim < 3; ++dim) {
+        position_ids_list[dim][batch_idx][seq_idx] = llm_positions[dim * total_len / 3 + visible_count];
+      }
+      ++visible_count;
+    }
+    return llm_positions;
+  }
+  /**
+   * Build per-batch position ID segments for multimodal rope.
+   * Override this in subclasses to change how vision/text segments are identified and positioned.
+   *
+   * The result alternates text segments and vision segments. Every segment is a flat array
+   * of length 3 * segment_length laid out as [t..., h..., w...]; text segments repeat the
+   * same 1D run three times.
+   * @param {object} params
+   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
+   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
+   * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
+   * @param {number} params.spatial_merge_size
+   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
+   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
+   */
+  _get_multimodal_rope_positions({
+    filtered_ids,
+    image_grid_thw_list,
+    video_grid_thw_list,
+    spatial_merge_size,
+    state
+  }) {
+    const { image_token_id, video_token_id, vision_start_token_id } = this.config;
+    const ids = filtered_ids;
+    // Indices of every vision-start marker in the sequence.
+    // NOTE(review): the loose `==` comparisons below presumably exist because token IDs
+    // and config IDs may differ in type (e.g. BigInt vs number) — confirm before tightening.
+    const vision_start_indices = ids.reduce((acc, x, idx) => {
+      if (x == vision_start_token_id) acc.push(idx);
+      return acc;
+    }, []);
+    // The token right after each vision-start marker tells whether it opens an image or a video.
+    const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
+    const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
+    const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
+    const llm_pos_ids_list = [];
+    // `st2` is the scan cursor: index just past the last vision segment handled so far.
+    let st2 = 0;
+    let remain_images = image_nums;
+    let remain_videos = video_nums;
+    for (let j = 0; j < vision_tokens.length; ++j) {
+      // NOTE(review): the search is strictly after `st2` (`i > st2`), so a matching token
+      // located exactly at index `st2` is not found — confirm this is intended.
+      const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
+      const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
+      // `ids.length + 1` acts as a "none left" sentinel that loses every comparison below.
+      const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
+      const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
+      let ed;
+      let t, h, w;
+      // Whichever modality starts first is consumed; on a tie the video branch is taken.
+      if (ed_image < ed_video) {
+        [t, h, w] = image_grid_thw_list[state.image_index];
+        ++state.image_index;
+        --remain_images;
+        ed = ed_image;
+      } else {
+        [t, h, w] = video_grid_thw_list[state.video_index];
+        ++state.video_index;
+        --remain_videos;
+        ed = ed_video;
+      }
+      // Height and width are divided by the spatial merge size; the temporal size is used as-is.
+      const [llm_grid_t, llm_grid_h, llm_grid_w] = [
+        Number(t),
+        Math.floor(Number(h) / spatial_merge_size),
+        Math.floor(Number(w) / spatial_merge_size)
+      ];
+      // Text tokens between the cursor and the start of this vision segment.
+      const text_len = ed - st2;
+      // Text positions continue from one past the largest position of the previous segment.
+      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
+      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
+      // Vision positions start right after the text run that was just emitted.
+      const offset = text_len + st_idx;
+      const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
+      // Temporal index: constant within a frame, increasing by one per frame.
+      const t_index = Array.from(
+        { length: grid_size },
+        (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
+      );
+      // Height index: row number within the frame.
+      const h_index = Array.from(
+        { length: grid_size },
+        (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
+      );
+      // Width index: column number within the row.
+      const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
+      llm_pos_ids_list.push([t_index, h_index, w_index].flat());
+      // Move the cursor past the vision tokens that were just covered.
+      st2 = ed + grid_size;
+    }
+    // Trailing text after the last vision segment (or the whole sequence when there is none).
+    if (st2 < ids.length) {
+      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
+      const text_len = ids.length - st2;
+      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
+    }
+    return llm_pos_ids_list;
+  }
+  /**
+   * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
+   *
+   * Explanation:
+   *     Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
+   *
+   *     For a pure-text embedding sequence, the rotary position embedding is no different from that of modern LLMs.
+   *     Examples:
+   *         input_ids: [T T T T T], here T is for text.
+   *         temporal position_ids: [0, 1, 2, 3, 4]
+   *         height position_ids: [0, 1, 2, 3, 4]
+   *         width position_ids: [0, 1, 2, 3, 4]
+   *
+   *     For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
+   *     and 1D rotary position embedding for the text part.
+   *     Examples:
+   *         Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
+   *         input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
+   *         vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
+   *         vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
+   *         vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+   *         text temporal position_ids: [3, 4, 5, 6, 7]
+   *         text height position_ids: [3, 4, 5, 6, 7]
+   *         text width position_ids: [3, 4, 5, 6, 7]
+   *         Here we calculate the text start position_ids as the max vision position_ids plus 1.
+   *
+   * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
+   * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
+   * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
+   * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
+   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
+   */
+  get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
+    const spatial_merge_size = this.config.vision_config.spatial_merge_size ?? 2;
+    // No vision grids at all: use the text-only path.
+    if (!image_grid_thw && !video_grid_thw) {
+      return this._get_text_only_rope_index(input_ids, attention_mask);
+    }
+    const total_input_ids = input_ids.tolist();
+    // A missing mask is replaced by an all-ones mask shaped like input_ids.
+    const attention_mask_list = (attention_mask || ones_like(input_ids)).tolist();
+    const [batch_size, seq_length] = input_ids.dims;
+    // [3][batch][seq] grid, zero-initialised; slots that are never written keep the 0.
+    const position_ids_list = Array.from(
+      { length: 3 },
+      () => Array.from({ length: batch_size }, () => new Array(seq_length).fill(0))
+    );
+    const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
+    const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
+    // Image/video counters are shared across rows because the grid lists cover the whole batch.
+    const state = { image_index: 0, video_index: 0 };
+    const mrope_position_deltas = [];
+    for (const [batch_idx, row_ids] of total_input_ids.entries()) {
+      const row_mask = attention_mask_list[batch_idx];
+      const filtered_ids = row_ids.filter((_, j) => row_mask[j] == 1);
+      const llm_pos_ids_list = this._get_multimodal_rope_positions({
+        filtered_ids,
+        image_grid_thw_list,
+        video_grid_thw_list,
+        spatial_merge_size,
+        state
+      });
+      const llm_positions = this._reorder_and_write_positions(
+        llm_pos_ids_list,
+        row_mask,
+        position_ids_list,
+        batch_idx
+      );
+      const largest_position = max(llm_positions)[0];
+      mrope_position_deltas.push(largest_position + 1 - row_ids.length);
+    }
+    const position_ids = new Tensor2("int64", position_ids_list.flat(Infinity), [3, batch_size, seq_length]);
+    const rope_deltas = new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1]);
+    return [position_ids, rope_deltas];
+  }
+  /**
+   * Run the "vision_encoder" session and return its `image_features` output.
+   * The grid tensor is passed under the input name stored in `this.image_grid_thw_name`.
+   */
+  async encode_image({ pixel_values, image_grid_thw }) {
+    const vision_session = this.sessions["vision_encoder"];
+    const vision_inputs = { pixel_values, [this.image_grid_thw_name]: image_grid_thw };
+    const vision_outputs = await sessionRun(vision_session, vision_inputs);
+    return vision_outputs.image_features;
+  }
+  /**
+   * Delegate to `default_merge_input_ids_with_image_features`, supplying the configured
+   * `image_token_id` unless `kwargs` already carries one (later keys win).
+   */
+  _merge_input_ids_with_image_features(kwargs) {
+    // @ts-ignore
+    const { image_token_id } = this.config;
+    const merge_args = { image_token_id, ...kwargs };
+    return default_merge_input_ids_with_image_features(merge_args);
+  }
+  /**
+   * Fill in `position_ids` / `rope_deltas` on `model_inputs` when an attention mask is
+   * present and no position IDs were supplied; otherwise return `model_inputs` untouched.
+   * `model_inputs` is mutated in place and also returned.
+   */
+  prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
+    if (!model_inputs.attention_mask || model_inputs.position_ids) {
+      return model_inputs;
+    }
+    // Reads the fields at call time, so it always sees the current model_inputs values.
+    const compute_rope_index = () => this.get_rope_index(
+      model_inputs.input_ids,
+      model_inputs.image_grid_thw,
+      model_inputs.video_grid_thw,
+      model_inputs.attention_mask
+    );
+    // Case 1: no cache yet, so positions cover the whole input.
+    if (!model_inputs.past_key_values) {
+      [model_inputs.position_ids, model_inputs.rope_deltas] = compute_rope_index();
+      return model_inputs;
+    }
+    model_inputs.pixel_values = null;
+    const past_length = model_inputs.past_key_values.get_seq_length();
+    // Case 2: the cache covers only a prefix, so keep the positions and tokens after it.
+    if (past_length < model_inputs.input_ids.dims[1]) {
+      const [full_position_ids, rope_deltas] = compute_rope_index();
+      model_inputs.rope_deltas = rope_deltas;
+      model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
+      model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
+      return model_inputs;
+    }
+    // Case 3: the cache is at least as long as the input, so offset the stored deltas by its length.
+    if (!model_inputs.rope_deltas) {
+      [, model_inputs.rope_deltas] = compute_rope_index();
+    }
+    const delta = BigInt(past_length);
+    const shifted_deltas = model_inputs.rope_deltas.map((x) => delta + x);
+    model_inputs.position_ids = stack([shifted_deltas, shifted_deltas, shifted_deltas], 0);
+    return model_inputs;
+  }
+};
+var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
+};
+// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
+// Same behaviour as Qwen2VLForConditionalGeneration except for the field overridden below.
+var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
+  // Input name under which `encode_image` passes the grid tensor to the vision encoder
+  // session (the parent class uses "grid_thw").
+  image_grid_thw_name = "image_grid_thw";
+};
+// Same behaviour as Qwen2VLForCausalLM except for the field overridden below.
+var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
+  // Input name under which the inherited `encode_image` passes the grid tensor to the
+  // vision encoder session (the Qwen2-VL base class uses "grid_thw").
+  image_grid_thw_name = "image_grid_thw";
+};
+// src/models/glm_ocr/modeling_glm_ocr.js
+var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
+  /**
+   * Compute 3D positional indices for vision tokens.
+   * Temporal is constant, height is repeat-interleaved, width tiles.
+   * All three grid sizes are floored after division by their merge size.
+   * @param {number} start_position Position assigned to the first vision token
+   * @param {number[]} grid_thw [T, H, W]
+   * @param {number} temp_merge_size Divisor applied to T
+   * @param {number} spatial_merge_size Divisor applied to H and W
+   * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
+   */
+  get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
+    const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
+    const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
+    const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
+    const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
+    const t_pos = Array.from({ length: seq_len }, () => start_position);
+    const h_pos = Array.from(
+      { length: seq_len },
+      (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
+    );
+    const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
+    return [...t_pos, ...h_pos, ...w_pos];
+  }
+  /**
+   * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
+   * instead of vision_start_token_id scanning used by Qwen2VL.
+   * After a vision segment, position advances by floor(max(h, w) / spatial_merge_size).
+   *
+   * `video_grid_thw_list` is accepted for signature compatibility with the parent class but
+   * is not read here.
+   * @param {object} params
+   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
+   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
+   * @param {any[][]} params.video_grid_thw_list - unused by this override
+   * @param {number} params.spatial_merge_size
+   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
+   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
+   */
+  _get_multimodal_rope_positions({
+    filtered_ids,
+    image_grid_thw_list,
+    video_grid_thw_list,
+    spatial_merge_size,
+    state
+  }) {
+    const { image_token_id } = this.config;
+    // Split the sequence into maximal runs of the same type: 1 = image tokens, 0 = everything else.
+    const groups = [];
+    let group_start = 0;
+    let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
+    for (let j = 1; j <= filtered_ids.length; ++j) {
+      // -1 is an end-of-sequence sentinel that forces the final run to be flushed.
+      const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
+      if (t !== current_type) {
+        groups.push([current_type, group_start, j]);
+        group_start = j;
+        current_type = t;
+      }
+    }
+    let current_pos = 0;
+    const llm_pos_ids_list = [];
+    for (const [modality_type, start_idx, end_idx] of groups) {
+      if (modality_type === 0) {
+        const text_len = end_idx - start_idx;
+        llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
+        current_pos += text_len;
+      } else {
+        const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
+        // Dividing T by itself collapses the temporal axis to a single step.
+        const temp_merge_size = grid_thw[0];
+        llm_pos_ids_list.push(
+          this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
+        );
+        // Floor the advance, as get_vision_position_ids floors the same quotients: this keeps
+        // every later position an integer when H or W is not a multiple of the merge size.
+        current_pos += Math.floor(Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size);
+      }
+    }
+    return llm_pos_ids_list;
+  }
+};
 // src/models/glpn/modeling_glpn.js
 var GLPNPreTrainedModel = class extends PreTrainedModel {
 };
@@ -26456,6 +27850,28 @@ var GraniteMoeHybridModel = class extends GraniteMoeHybridPreTrainedModel {
 var GraniteMoeHybridForCausalLM = class extends GraniteMoeHybridPreTrainedModel {
 };
+// src/models/ultravox/modeling_ultravox.js
+// Base class that UltravoxModel derives from. It only declares the `forward_params` list.
+var UltravoxPreTrainedModel = class extends PreTrainedModel {
+  // NOTE(review): presumably the input names consumed by the inherited forward pass —
+  // confirm against PreTrainedModel, which is defined elsewhere in the bundle.
+  forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
+};
+var UltravoxModel = class extends UltravoxPreTrainedModel {
+  /**
+   * Flatten `kwargs.audio_features` to 2D (rows x last-dim size) and delegate to
+   * `default_merge_input_ids_with_audio_features`. The audio token ID is the first
+   * non-nullish of `ignore_index`, `audio_token_id`, `audio_token_index` from the config,
+   * unless `kwargs` supplies its own `audio_token_id`.
+   */
+  _merge_input_ids_with_audio_features(kwargs) {
+    const { audio_features } = kwargs;
+    const flattened_audio_features = audio_features.view(-1, audio_features.dims.at(-1));
+    // @ts-ignore
+    const audio_token_id = this.config.ignore_index ?? this.config.audio_token_id ?? this.config.audio_token_index;
+    const merge_args = {
+      audio_token_id,
+      ...kwargs,
+      audio_features: flattened_audio_features
+    };
+    return default_merge_input_ids_with_audio_features(merge_args);
+  }
+};
+// src/models/granite_speech/modeling_granite_speech.js
+var GraniteSpeechForConditionalGeneration = class extends UltravoxModel {
+  forward_params = ["input_ids", "attention_mask", "input_features", "past_key_values"];
+};
 // src/models/grounding_dino/modeling_grounding_dino.js
 var GroundingDinoPreTrainedModel = class extends PreTrainedModel {
 };
@@ -26560,34 +27976,37 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
 var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
 };
-// src/models/idefics3/modeling_idefics3.js
-var Idefics3PreTrainedModel = class extends PreTrainedModel {
-  forward_params = [
-    "input_ids",
-    "attention_mask",
-    "pixel_values",
-    "pixel_attention_mask",
-    "position_ids",
-    "past_key_values"
-  ];
+// src/models/llava/modeling_llava.js
+var LlavaPreTrainedModel = class extends PreTrainedModel {
+  forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
 };
-var Idefics3ForConditionalGeneration = class extends Idefics3PreTrainedModel {
-  async encode_image({ pixel_values, pixel_attention_mask }) {
-    const features = (await sessionRun(this.sessions["vision_encoder"], { pixel_values, pixel_attention_mask })).image_features;
-    return features;
-  }
+var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
   _merge_input_ids_with_image_features(kwargs) {
     const vision_hidden_size = kwargs.image_features.dims.at(-1);
     const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
     return default_merge_input_ids_with_image_features({
       // @ts-ignore
-      image_token_id: this.config.image_token_id,
+      image_token_id: this.config.image_token_index ?? this.config.image_token_id,
       ...kwargs,
       image_features: reshaped_image_hidden_states
     });
   }
 };
-var SmolVLMForConditionalGeneration = class extends Idefics3ForConditionalGeneration {
+var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
+};
+var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
+};
+// src/models/idefics3/modeling_idefics3.js
+var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
+  forward_params = [
+    "input_ids",
+    "attention_mask",
+    "pixel_values",
+    "pixel_attention_mask",
+    "position_ids",
+    "past_key_values"
+  ];
 };
 // src/models/ijepa/modeling_ijepa.js
@@ -26671,6 +28090,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
 var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
 };
+// src/models/lighton_ocr/modeling_lighton_ocr.js
+var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
+};
 // src/models/lfm2_moe/modeling_lfm2_moe.js
 var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
 };
@@ -26679,6 +28102,19 @@ var Lfm2MoeModel = class extends Lfm2MoePreTrainedModel {
 var Lfm2MoeForCausalLM = class extends Lfm2MoePreTrainedModel {
 };
+// src/models/lfm2_vl/modeling_lfm2_vl.js
+var Lfm2VlForConditionalGeneration = class extends LlavaForConditionalGeneration {
+  forward_params = [
+    "input_ids",
+    "attention_mask",
+    "pixel_values",
+    "pixel_attention_mask",
+    "spatial_shapes",
+    "position_ids",
+    "past_key_values"
+  ];
+};
 // src/models/llama/modeling_llama.js
 var LlamaPreTrainedModel = class extends PreTrainedModel {
 };
@@ -26693,27 +28129,6 @@ var Llama4PreTrainedModel = class extends PreTrainedModel {
 var Llama4ForCausalLM = class extends Llama4PreTrainedModel {
 };
-// src/models/llava/modeling_llava.js
-var LlavaPreTrainedModel = class extends PreTrainedModel {
-  forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
-};
-var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
-  _merge_input_ids_with_image_features(kwargs) {
-    const vision_hidden_size = kwargs.image_features.dims.at(-1);
-    const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
-    return default_merge_input_ids_with_image_features({
-      // @ts-ignore
-      image_token_id: this.config.image_token_index,
-      ...kwargs,
-      image_features: reshaped_image_hidden_states
-    });
-  }
-};
-var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
-};
-var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
-};
 // src/models/longt5/modeling_longt5.js
 var LongT5PreTrainedModel = class extends PreTrainedModel {
 };
@@ -26875,6 +28290,14 @@ var MistralModel = class extends MistralPreTrainedModel {
 var MistralForCausalLM = class extends MistralPreTrainedModel {
 };
+// src/models/mistral4/modeling_mistral4.js
+var Mistral4PreTrainedModel = class extends PreTrainedModel {
+};
+var Mistral4Model = class extends Mistral4PreTrainedModel {
+};
+var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
+};
 // src/models/mobilebert/modeling_mobilebert.js
 var MobileBertPreTrainedModel = class extends PreTrainedModel {
 };
@@ -27343,6 +28766,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
 var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
 };
+// src/models/nemotron_h/modeling_nemotron_h.js
+var NemotronHPreTrainedModel = class extends PreTrainedModel {
+};
+var NemotronHModel = class extends NemotronHPreTrainedModel {
+};
+var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
+};
 // src/models/neobert/modeling_neobert.js
 var NeoBertPreTrainedModel = class extends PreTrainedModel {
 };
@@ -27464,27 +28895,7 @@ var OwlViTForObjectDetection = class extends OwlViTPreTrainedModel {
 };
 // src/models/paligemma/modeling_paligemma.js
-var PaliGemmaPreTrainedModel = class extends PreTrainedModel {
-  forward_params = [
-    "input_ids",
-    // 'inputs_embeds',
-    "attention_mask",
-    "pixel_values",
-    "position_ids",
-    "past_key_values"
-  ];
-};
-var PaliGemmaForConditionalGeneration = class extends PaliGemmaPreTrainedModel {
-  _merge_input_ids_with_image_features(kwargs) {
-    const vision_hidden_size = kwargs.image_features.dims.at(-1);
-    const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
-    return default_merge_input_ids_with_image_features({
-      // @ts-ignore
-      image_token_id: this.config.image_token_index,
-      ...kwargs,
-      image_features: reshaped_image_hidden_states
-    });
-  }
+var PaliGemmaForConditionalGeneration = class extends LlavaForConditionalGeneration {
 };
 // src/models/parakeet/modeling_parakeet.js
@@ -27643,244 +29054,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
 var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
 };
-// src/models/qwen2_vl/modeling_qwen2_vl.js
-var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
-  forward_params = [
-    // Text inputs
-    "input_ids",
-    "attention_mask",
-    "position_ids",
-    "past_key_values",
-    // Vision inputs
-    "pixel_values",
-    "image_grid_thw"
-  ];
-};
-var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
-  image_grid_thw_name = "grid_thw";
-  /**
-   * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
-   *
-   * Explanation:
-   *     Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
-   *
-   *     For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
-   *     Examples:
-   *         input_ids: [T T T T T], here T is for text.
-   *         temporal position_ids: [0, 1, 2, 3, 4]
-   *         height position_ids: [0, 1, 2, 3, 4]
-   *         width position_ids: [0, 1, 2, 3, 4]
-   *
-   *     For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
-   *     and 1D rotary position embeddin for text part.
-   *     Examples:
-   *         Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
-   *         input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
-   *         vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
-   *         vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
-   *         vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
-   *         text temporal position_ids: [3, 4, 5, 6, 7]
-   *         text height position_ids: [3, 4, 5, 6, 7]
-   *         text width position_ids: [3, 4, 5, 6, 7]
-   *         Here we calculate the text start position_ids as the max vision position_ids plus 1.
-   *
-   * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
-   * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
-   * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
-   * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
-   * - 1 for tokens that are **not masked**,
-   * - 0 for tokens that are **masked**.
-   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
-   * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
-   * - mrope_position_deltas: Tensor of shape `(batch_size)`.
-   */
-  get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
-    const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
-    const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
-    const mrope_position_deltas = [];
-    if (image_grid_thw || video_grid_thw) {
-      let total_input_ids = input_ids.tolist();
-      if (!attention_mask) {
-        attention_mask = ones_like(input_ids);
-      }
-      const attention_mask_list = attention_mask.tolist();
-      const position_ids_list = Array.from(
-        { length: 3 },
-        (_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
-      );
-      const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
-      const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
-      let image_index = 0;
-      let video_index = 0;
-      for (let i = 0; i < total_input_ids.length; ++i) {
-        const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
-        const vision_start_indices = ids.reduce((acc, x, idx) => {
-          if (x == vision_start_token_id) acc.push(idx);
-          return acc;
-        }, []);
-        const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
-        const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
-        const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
-        let llm_pos_ids_list = [];
-        let st2 = 0;
-        let remain_images = image_nums;
-        let remain_videos = video_nums;
-        for (let j = 0; j < vision_tokens.length; ++j) {
-          const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
-          const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
-          const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
-          const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
-          let ed;
-          let t, h, w;
-          if (ed_image < ed_video) {
-            [t, h, w] = image_grid_thw_list[image_index];
-            ++image_index;
-            --remain_images;
-            ed = ed_image;
-          } else {
-            [t, h, w] = video_grid_thw_list[video_index];
-            ++video_index;
-            --remain_videos;
-            ed = ed_video;
-          }
-          const [llm_grid_t, llm_grid_h, llm_grid_w] = [
-            Number(t),
-            Math.floor(Number(h) / spatial_merge_size),
-            Math.floor(Number(w) / spatial_merge_size)
-          ];
-          const text_len = ed - st2;
-          const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
-          llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
-          const offset = text_len + st_idx;
-          const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
-          const t_index = Array.from(
-            { length: grid_size },
-            (_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
-          );
-          const h_index = Array.from(
-            { length: grid_size },
-            (_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
-          );
-          const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
-          llm_pos_ids_list.push([t_index, h_index, w_index].flat());
-          st2 = ed + grid_size;
-        }
-        if (st2 < ids.length) {
-          const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
-          const text_len = ids.length - st2;
-          llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
-        }
-        const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
-        const llm_positions = new Array(num_items);
-        let index = 0;
-        for (let x = 0; x < 3; ++x) {
-          for (let y = 0; y < llm_pos_ids_list.length; ++y) {
-            const val = llm_pos_ids_list[y];
-            const text_len = val.length / 3;
-            for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
-              llm_positions[index++] = val[z];
-            }
-          }
-        }
-        let count2 = 0;
-        const attn_mask = attention_mask_list[i];
-        for (let y = 0; y < attn_mask.length; ++y) {
-          if (attn_mask[y] == 1) {
-            for (let x = 0; x < 3; ++x) {
-              position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
-            }
-            ++count2;
-          }
-        }
-        const max_llm_positions = max(llm_positions)[0];
-        mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
-      }
-      return [
-        new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
-        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
-      ];
-    } else {
-      if (attention_mask) {
-        const { data, dims } = cumsum_masked_fill(attention_mask);
-        const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
-        const mrope_position_deltas2 = Array.from(
-          { length: dims[0] },
-          (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
-        );
-        return [
-          new Tensor2("int64", position_ids, [3, ...dims]),
-          new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
-        ];
-      } else {
-        const [batch_size, seq_length] = input_ids.dims;
-        const position_ids = BigInt64Array.from(
-          { length: 3 * batch_size * seq_length },
-          (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
-        );
-        return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
-      }
-    }
-  }
-  async encode_image({ pixel_values, image_grid_thw }) {
-    const features = (await sessionRun(this.sessions["vision_encoder"], {
-      pixel_values,
-      [this.image_grid_thw_name]: image_grid_thw
-    })).image_features;
-    return features;
-  }
-  _merge_input_ids_with_image_features(kwargs) {
-    return default_merge_input_ids_with_image_features({
-      // @ts-ignore
-      image_token_id: this.config.image_token_id,
-      ...kwargs
-    });
-  }
-  prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
-    if (model_inputs.attention_mask && !model_inputs.position_ids) {
-      if (!model_inputs.past_key_values) {
-        [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
-          model_inputs.input_ids,
-          model_inputs.image_grid_thw,
-          model_inputs.video_grid_thw,
-          model_inputs.attention_mask
-        );
-      } else {
-        model_inputs.pixel_values = null;
-        const past_length = getPastLength(model_inputs.past_key_values);
-        if (past_length < model_inputs.input_ids.dims[1]) {
-          const [full_position_ids, rope_deltas] = this.get_rope_index(
-            model_inputs.input_ids,
-            model_inputs.image_grid_thw,
-            model_inputs.video_grid_thw,
-            model_inputs.attention_mask
-          );
-          model_inputs.rope_deltas = rope_deltas;
-          model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
-          model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
-        } else {
-          if (!model_inputs.rope_deltas) {
-            [, model_inputs.rope_deltas] = this.get_rope_index(
-              model_inputs.input_ids,
-              model_inputs.image_grid_thw,
-              model_inputs.video_grid_thw,
-              model_inputs.attention_mask
-            );
-          }
-          const delta = BigInt(past_length);
-          const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
-          model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
-        }
-      }
-    }
-    return model_inputs;
-  }
-};
-// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
-var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
-  image_grid_thw_name = "image_grid_thw";
-};
 // src/models/qwen3/modeling_qwen3.js
 var Qwen3PreTrainedModel = class extends PreTrainedModel {
 };
@@ -27908,18 +29081,26 @@ var Qwen3NextForCausalLM = class extends Qwen3NextPreTrainedModel {
 // src/models/qwen3_vl/modeling_qwen3_vl.js
 var Qwen3VLForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
 };
+var Qwen3VLForCausalLM = class extends Qwen2_5_VLForCausalLM {
+};
 // src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js
 var Qwen3VLMoeForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
 };
+var Qwen3VLMoeForCausalLM = class extends Qwen3VLForCausalLM {
+};
 // src/models/qwen3_5/modeling_qwen3_5.js
 var Qwen3_5ForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
 };
+var Qwen3_5ForCausalLM = class extends Qwen3_5ForConditionalGeneration {
+};
 // src/models/qwen3_5_moe/modeling_qwen3_5_moe.js
 var Qwen3_5MoeForConditionalGeneration = class extends Qwen3_5ForConditionalGeneration {
 };
+var Qwen3_5MoeForCausalLM = class extends Qwen3_5ForCausalLM {
+};
 // src/models/resnet/modeling_resnet.js
 var ResNetPreTrainedModel = class extends PreTrainedModel {
@@ -28318,6 +29499,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
   }
 };
+// src/models/solar_open/modeling_solar_open.js
+var SolarOpenPreTrainedModel = class extends PreTrainedModel {
+};
+var SolarOpenModel = class extends SolarOpenPreTrainedModel {
+};
+var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
+};
 // src/models/speecht5/modeling_speecht5.js
 var SpeechT5PreTrainedModel = class extends PreTrainedModel {
 };
@@ -28600,25 +29789,6 @@ var TrOCRPreTrainedModel = class extends PreTrainedModel {
 var TrOCRForCausalLM = class extends TrOCRPreTrainedModel {
 };
-// src/models/ultravox/modeling_ultravox.js
-var UltravoxPreTrainedModel = class extends PreTrainedModel {
-  forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
-};
-var UltravoxModel = class extends UltravoxPreTrainedModel {
-  _merge_input_ids_with_audio_features(kwargs) {
-    const audio_hidden_size = kwargs.audio_features.dims.at(-1);
-    const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
-    return default_merge_input_ids_with_audio_features({
-      // @ts-ignore
-      audio_token_id: this.config.ignore_index ?? this.config.audio_token_id,
-      ...kwargs,
-      audio_features: reshaped_audio_features
-    });
-  }
-};
-var VoxtralForConditionalGeneration = class extends UltravoxModel {
-};
 // src/models/unispeech/modeling_unispeech.js
 var UniSpeechPreTrainedModel = class extends PreTrainedModel {
 };
@@ -28784,6 +29954,170 @@ var VitsModel = class extends VitsPreTrainedModel {
   }
 };
+// src/models/voxtral/modeling_voxtral.js
+var VoxtralForConditionalGeneration = class extends UltravoxModel {
+};
+// src/models/voxtral_realtime/modeling_voxtral_realtime.js
+var CONV1_LEFT_PAD = 2;
+var CONV2_LEFT_PAD = 1;
+var states = /* @__PURE__ */ new WeakMap();
+function createEncoderState(model, input_features) {
+  const { text_config, audio_config } = (
+    /** @type {any} */
+    model.config
+  );
+  const encoder_session = model.sessions["audio_encoder"];
+  const { num_mel_bins, hidden_size: enc_hidden_size } = audio_config;
+  const PADDING_CACHE_CHANNELS = num_mel_bins + enc_hidden_size;
+  const enc_kv_cache = new DynamicCache();
+  const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? "float32";
+  const enc_cls = enc_dtype === "float16" ? DataTypeMap.float16 : DataTypeMap.float32;
+  const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
+  for (const name in enc_shapes) {
+    const size = enc_shapes[name].reduce((a, b) => a * b, 1);
+    enc_kv_cache[name] = new Tensor2(enc_dtype, new enc_cls(size), enc_shapes[name]);
+  }
+  const enc_padding_cache = new Tensor2(enc_dtype, new enc_cls(PADDING_CACHE_CHANNELS * CONV1_LEFT_PAD), [
+    1,
+    PADDING_CACHE_CHANNELS,
+    CONV1_LEFT_PAD
+  ]);
+  const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
+  if (!chunks_iter) {
+    throw new Error("input_features must be iterable or async iterable");
+  }
+  return {
+    encoder_session,
+    enc_kv_cache,
+    enc_padding_cache,
+    enc_past_seq_len: 0,
+    audio_embed_queue: [],
+    audio_embed_total_tokens: 0,
+    audio_queue_offset: 0,
+    audio_consumed: 0,
+    stream_exhausted: false,
+    chunks_iter,
+    text_hidden_size: text_config.hidden_size
+  };
+}
+async function encodeChunk(s, chunk_features) {
+  const audio_seq_len = chunk_features.dims[2];
+  const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1;
+  const position_ids = new Tensor2(
+    "int64",
+    BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
+    [1, conv2_output_len]
+  );
+  const total_seq_len = s.enc_past_seq_len + conv2_output_len;
+  const attention_mask = ones([1, total_seq_len]);
+  const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
+    input_features: chunk_features,
+    attention_mask,
+    position_ids,
+    past_padding_cache: s.enc_padding_cache,
+    ...s.enc_kv_cache
+  });
+  if (s.enc_padding_cache.location === "gpu-buffer") {
+    s.enc_padding_cache.dispose();
+  }
+  s.enc_padding_cache = present_padding_cache;
+  for (const name in present_cache) {
+    if (name.startsWith("present.")) {
+      const pastName = name.replace("present", "past_key_values");
+      const prev = s.enc_kv_cache[pastName];
+      if (prev?.location === "gpu-buffer") {
+        prev.dispose();
+      }
+      s.enc_kv_cache[pastName] = present_cache[name];
+    }
+  }
+  s.enc_past_seq_len = total_seq_len;
+  return audio_embeds;
+}
+async function fillAudioBuffer(s, needed) {
+  while (s.audio_embed_total_tokens < needed && !s.stream_exhausted) {
+    const result = await s.chunks_iter.next();
+    if (result.done) {
+      s.stream_exhausted = true;
+      break;
+    }
+    const new_embeds = await encodeChunk(s, result.value);
+    s.audio_embed_queue.push({ data: new_embeds.data, tokens: new_embeds.dims[1] });
+    s.audio_embed_total_tokens += new_embeds.dims[1];
+  }
+}
+function addAudioEmbeddings(s, inputs_embeds, current_len) {
+  if (s.audio_embed_queue.length === 0) return;
+  const embed_data = inputs_embeds.data;
+  let embed_write_pos = 0;
+  let remaining = current_len;
+  while (remaining > 0 && s.audio_embed_queue.length > 0) {
+    const front = s.audio_embed_queue[0];
+    const available = front.tokens - s.audio_queue_offset;
+    const n = Math.min(remaining, available);
+    const src_offset = s.audio_queue_offset * s.text_hidden_size;
+    for (let i = 0; i < n * s.text_hidden_size; ++i) {
+      embed_data[embed_write_pos * s.text_hidden_size + i] += front.data[src_offset + i];
+    }
+    embed_write_pos += n;
+    remaining -= n;
+    s.audio_queue_offset += n;
+    if (s.audio_queue_offset >= front.tokens) {
+      s.audio_embed_queue.shift();
+      s.audio_queue_offset = 0;
+    }
+  }
+  s.audio_consumed += current_len - remaining;
+}
+var AudioExhaustedCriteria = class extends StoppingCriteria {
+  constructor(enc_state) {
+    super();
+    this._s = enc_state;
+  }
+  _call(input_ids) {
+    const done = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
+    return input_ids.map(() => done);
+  }
+};
+var VoxtralRealtimePreTrainedModel = class extends PreTrainedModel {
+  forward_params = ["input_ids", "attention_mask", "position_ids", "past_key_values"];
+};
+var VoxtralRealtimeForConditionalGeneration = class extends VoxtralRealtimePreTrainedModel {
+  async forward({ input_ids, past_key_values, ...kwargs }) {
+    const current_len = input_ids.dims[1];
+    const enc = states.get(this);
+    if (enc) {
+      await fillAudioBuffer(enc, enc.audio_consumed + current_len);
+    }
+    const { inputs_embeds } = await sessionRun(this.sessions["embed_tokens"], { input_ids });
+    if (enc) {
+      addAudioEmbeddings(enc, inputs_embeds, current_len);
+    }
+    const decoder_feeds = { inputs_embeds, ...kwargs };
+    this.addPastKeyValues(decoder_feeds, past_key_values);
+    const session = this.sessions["decoder_model_merged"];
+    const fixed = pick(decoder_feeds, session.inputNames);
+    return await sessionRun(session, fixed);
+  }
+  async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
+    if (!input_features) {
+      throw new Error("input_features (generator/iterable) must be provided");
+    }
+    const enc_state = createEncoderState(this, input_features);
+    states.set(this, enc_state);
+    const stopping_criteria = new StoppingCriteriaList();
+    stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
+    if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);
+    try {
+      return await super.generate({ ...kwargs, stopping_criteria });
+    } finally {
+      enc_state.enc_kv_cache.dispose();
+      states.delete(this);
+    }
+  }
+};
 // src/models/wav2vec2_bert/modeling_wav2vec2_bert.js
 var Wav2Vec2BertPreTrainedModel = class extends PreTrainedModel {
 };
@@ -29289,6 +30623,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
 // src/models/registry.js
 var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
   ["bert", "BertModel"],
+  ["eurobert", "EuroBertModel"],
   ["neobert", "NeoBertModel"],
   ["modernbert", "ModernBertModel"],
   ["nomic_bert", "NomicBertModel"],
@@ -29420,6 +30755,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
   ["gemma3_text", "Gemma3Model"],
   ["helium", "HeliumModel"],
   ["glm", "GlmModel"],
+  ["glm_moe_dsa", "GlmMoeDsaModel"],
   ["openelm", "OpenELMModel"],
   ["qwen2", "Qwen2Model"],
   ["qwen2_moe", "Qwen2MoeModel"],
@@ -29431,12 +30767,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
   ["mpt", "MptModel"],
   ["opt", "OPTModel"],
   ["mistral", "MistralModel"],
+  ["mistral4", "Mistral4Model"],
   ["ministral", "MinistralModel"],
   ["ministral3", "Ministral3Model"],
   ["ernie4_5", "Ernie4_5ForCausalLM"],
   ["starcoder2", "Starcoder2Model"],
+  ["deepseek_v3", "DeepseekV3Model"],
   ["falcon", "FalconModel"],
   ["falcon_h1", "FalconH1Model"],
+  ["nemotron_h", "NemotronHModel"],
+  ["solar_open", "SolarOpenModel"],
   ["stablelm", "StableLmModel"],
   ["modernbert-decoder", "ModernBertDecoderModel"],
   ["hunyuan_v1_dense", "HunYuanDenseV1Model"],
@@ -29456,6 +30796,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
 ]);
 var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
   ["bert", "BertForSequenceClassification"],
+  ["eurobert", "EuroBertForSequenceClassification"],
   ["neobert", "NeoBertForSequenceClassification"],
   ["modernbert", "ModernBertForSequenceClassification"],
   ["roformer", "RoFormerForSequenceClassification"],
@@ -29478,6 +30819,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
 ]);
 var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
   ["bert", "BertForTokenClassification"],
+  ["eurobert", "EuroBertForTokenClassification"],
   ["neobert", "NeoBertForTokenClassification"],
   ["modernbert", "ModernBertForTokenClassification"],
   ["roformer", "RoFormerForTokenClassification"],
@@ -29537,27 +30879,40 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
   ["gemma2", "Gemma2ForCausalLM"],
   ["vaultgemma", "VaultGemmaForCausalLM"],
   ["gemma3_text", "Gemma3ForCausalLM"],
+  ["gemma3", "Gemma3ForCausalLM"],
   ["helium", "HeliumForCausalLM"],
   ["glm", "GlmForCausalLM"],
+  ["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
   ["openelm", "OpenELMForCausalLM"],
   ["qwen2", "Qwen2ForCausalLM"],
   ["qwen2_moe", "Qwen2MoeForCausalLM"],
   ["qwen3", "Qwen3ForCausalLM"],
   ["qwen3_moe", "Qwen3MoeForCausalLM"],
   ["qwen3_next", "Qwen3NextForCausalLM"],
+  ["qwen2_vl", "Qwen2VLForCausalLM"],
+  ["qwen2_5_vl", "Qwen2_5_VLForCausalLM"],
+  ["qwen3_vl", "Qwen3VLForCausalLM"],
+  ["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
+  ["qwen3_5", "Qwen3_5ForCausalLM"],
+  ["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
+  ["gemma3n", "Gemma3nForCausalLM"],
   ["phi", "PhiForCausalLM"],
   ["phi3", "Phi3ForCausalLM"],
   ["mpt", "MptForCausalLM"],
   ["opt", "OPTForCausalLM"],
   ["mbart", "MBartForCausalLM"],
   ["mistral", "MistralForCausalLM"],
+  ["mistral4", "Mistral4ForCausalLM"],
   ["ministral", "MinistralForCausalLM"],
   ["ministral3", "Ministral3ForCausalLM"],
   ["ernie4_5", "Ernie4_5ForCausalLM"],
   ["starcoder2", "Starcoder2ForCausalLM"],
+  ["deepseek_v3", "DeepseekV3ForCausalLM"],
   ["falcon", "FalconForCausalLM"],
   ["falcon_h1", "FalconH1ForCausalLM"],
+  ["nemotron_h", "NemotronHForCausalLM"],
   ["trocr", "TrOCRForCausalLM"],
+  ["solar_open", "SolarOpenForCausalLM"],
   ["stablelm", "StableLmForCausalLM"],
   ["modernbert-decoder", "ModernBertDecoderForCausalLM"],
   ["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
@@ -29568,6 +30923,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
 var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
 var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
   ["bert", "BertForMaskedLM"],
+  ["eurobert", "EuroBertForMaskedLM"],
   ["neobert", "NeoBertForMaskedLM"],
   ["modernbert", "ModernBertForMaskedLM"],
   ["roformer", "RoFormerForMaskedLM"],
@@ -29620,16 +30976,21 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
   ["qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"],
   ["qwen3_5", "Qwen3_5ForConditionalGeneration"],
   ["qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"],
+  ["lfm2_vl", "Lfm2VlForConditionalGeneration"],
   ["idefics3", "Idefics3ForConditionalGeneration"],
   ["smolvlm", "SmolVLMForConditionalGeneration"],
   ["paligemma", "PaliGemmaForConditionalGeneration"],
   ["llava_qwen2", "LlavaQwen2ForCausalLM"],
   ["gemma3n", "Gemma3nForConditionalGeneration"],
-  ["mistral3", "Mistral3ForConditionalGeneration"]
+  ["mistral3", "Mistral3ForConditionalGeneration"],
+  ["lighton_ocr", "LightOnOcrForConditionalGeneration"],
+  ["glm_ocr", "GlmOcrForConditionalGeneration"]
 ]);
 var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
+  ["granite_speech", "GraniteSpeechForConditionalGeneration"],
   ["ultravox", "UltravoxModel"],
-  ["voxtral", "VoxtralForConditionalGeneration"]
+  ["voxtral", "VoxtralForConditionalGeneration"],
+  ["voxtral_realtime", "VoxtralRealtimeForConditionalGeneration"]
 ]);
 var MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = /* @__PURE__ */ new Map([
   ["vision-encoder-decoder", "VisionEncoderDecoderModel"]
@@ -29728,6 +31089,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
 ]);
 var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
 var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
+  ["chmv2", "CHMv2ForDepthEstimation"],
   ["dpt", "DPTForDepthEstimation"],
   ["depth_anything", "DepthAnythingForDepthEstimation"],
   ["glpn", "GLPNForDepthEstimation"],
@@ -29812,7 +31174,19 @@ var CUSTOM_MAPPING = [
     MODEL_TYPES.ImageAudioTextToText
   ],
   ["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
-  ["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox]
+  ["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
+  ["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
+  ["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
+  ["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
+  ["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
+  ["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
+  ["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
+  ["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
+  [
+    "VoxtralRealtimeForConditionalGeneration",
+    VoxtralRealtimeForConditionalGeneration,
+    MODEL_TYPES.VoxtralRealtime
+  ]
 ];
 for (const [name, model, type] of CUSTOM_MAPPING) {
   MODEL_TYPE_MAPPING.set(name, type);
@@ -31490,8 +32864,18 @@ var TASK_ALIASES = Object.freeze({
 });
 // src/utils/model_registry/get_model_files.js
+function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
+  if (config !== null) {
+    return AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision });
+  }
+  const key = JSON.stringify([modelId, cache_dir, local_files_only, revision]);
+  return memoizePromise(
+    key,
+    () => AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision })
+  );
+}
 async function get_model_files(modelId, { config = null, dtype: overrideDtype = null, device: overrideDevice = null, model_file_name = null } = {}) {
-  config = await AutoConfig.from_pretrained(modelId, { config });
+  config = await get_config(modelId, { config });
   const files = [
     // Add config.json (always loaded)
     "config.json"
@@ -31552,74 +32936,14 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
       files.push(dataFilePath);
     }
   };
-  const singleModelName = model_file_name ?? "model";
-  if (modelType === MODEL_TYPES.DecoderOnly) {
-    add_model_file("model", singleModelName);
-    files.push("generation_config.json");
-  } else if (modelType === MODEL_TYPES.DecoderOnlyWithoutHead) {
-    add_model_file("model", singleModelName);
-  } else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
-    add_model_file("model", "encoder_model");
-    add_model_file("decoder_model_merged");
-    files.push("generation_config.json");
-  } else if (modelType === MODEL_TYPES.MaskGeneration) {
-    add_model_file("model", "vision_encoder");
-    add_model_file("prompt_encoder_mask_decoder");
-  } else if (modelType === MODEL_TYPES.EncoderDecoder) {
-    add_model_file("model", "encoder_model");
-    add_model_file("decoder_model_merged");
-  } else if (modelType === MODEL_TYPES.ImageTextToText) {
-    add_model_file("embed_tokens");
-    add_model_file("vision_encoder");
-    add_model_file("decoder_model_merged");
-    if (config.is_encoder_decoder) {
-      add_model_file("model", "encoder_model");
-    }
-    files.push("generation_config.json");
-  } else if (modelType === MODEL_TYPES.AudioTextToText) {
-    add_model_file("embed_tokens");
-    add_model_file("audio_encoder");
-    add_model_file("decoder_model_merged");
-    files.push("generation_config.json");
-  } else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
-    add_model_file("embed_tokens");
-    add_model_file("audio_encoder");
-    add_model_file("vision_encoder");
-    add_model_file("decoder_model_merged");
-    files.push("generation_config.json");
-  } else if (modelType === MODEL_TYPES.Musicgen) {
-    add_model_file("model", "text_encoder");
-    add_model_file("decoder_model_merged");
-    add_model_file("encodec_decode");
-    files.push("generation_config.json");
-  } else if (modelType === MODEL_TYPES.MultiModality) {
-    add_model_file("prepare_inputs_embeds");
-    add_model_file("model", "language_model");
-    add_model_file("lm_head");
-    add_model_file("gen_head");
-    add_model_file("gen_img_embeds");
-    add_model_file("image_decode");
-    files.push("generation_config.json");
-  } else if (modelType === MODEL_TYPES.Phi3V) {
-    add_model_file("prepare_inputs_embeds");
-    add_model_file("model");
-    add_model_file("vision_encoder");
-    files.push("generation_config.json");
-  } else if (modelType === MODEL_TYPES.Chatterbox) {
-    add_model_file("embed_tokens");
-    add_model_file("speech_encoder");
-    add_model_file("model", "language_model");
-    add_model_file("conditional_decoder");
-    files.push("generation_config.json");
-  } else if (modelType === MODEL_TYPES.AutoEncoder) {
-    add_model_file("encoder_model");
-    add_model_file("decoder_model");
-  } else if (modelType === MODEL_TYPES.Supertonic) {
-    add_model_file("text_encoder");
-    add_model_file("latent_denoiser");
-    add_model_file("voice_decoder");
-  } else {
-    add_model_file("model", singleModelName);
+  const { sessions, optional_configs } = getSessionsConfig(modelType, config, { model_file_name });
+  for (const [sessionKey, baseName] of Object.entries(sessions)) {
+    add_model_file(sessionKey, baseName);
+  }
+  if (optional_configs) {
+    for (const configFile of Object.values(optional_configs)) {
+      files.push(configFile);
+    }
   }
   return files;
 }
@@ -32070,25 +33394,25 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
 // src/utils/model_registry/is_cached.js
 async function check_files_cache(modelId, files, options = {}) {
-  const cache = await getCache(options?.cache_dir);
-  if (!cache) {
+  const cache2 = await getCache(options?.cache_dir);
+  if (!cache2) {
     const fileStatuses2 = files.map((filename) => ({ file: filename, cached: false }));
     return { allCached: false, files: fileStatuses2 };
   }
   const fileStatuses = await Promise.all(
     files.map(async (filename) => {
-      const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache);
-      const cached = await checkCachedResource(cache, localPath, proposedCacheKey);
+      const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
+      const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
       return { file: filename, cached: !!cached };
     })
   );
   return { allCached: fileStatuses.every((f) => f.cached), files: fileStatuses };
 }
 async function is_file_cached(modelId, filename, options = {}) {
-  const cache = await getCache(options?.cache_dir);
-  if (!cache) return false;
-  const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache);
-  return !!await checkCachedResource(cache, localPath, proposedCacheKey);
+  const cache2 = await getCache(options?.cache_dir);
+  if (!cache2) return false;
+  const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
+  return !!await checkCachedResource(cache2, localPath, proposedCacheKey);
 }
 async function is_cached(modelId, options = {}) {
   if (!modelId) {
@@ -32135,26 +33459,26 @@ async function is_pipeline_cached_files(task, modelId, options = {}) {
 // src/utils/model_registry/clear_cache.js
 async function clear_files_from_cache(modelId, files, options = {}) {
-  const cache = await getCache(options?.cache_dir);
-  if (!cache) {
+  const cache2 = await getCache(options?.cache_dir);
+  if (!cache2) {
     return {
       filesDeleted: 0,
       filesCached: 0,
       files: files.map((filename) => ({ file: filename, deleted: false, wasCached: false }))
     };
   }
-  if (!cache.delete) {
+  if (!cache2.delete) {
     throw new Error("Cache does not support delete operation");
   }
   const results = await Promise.all(
     files.map(async (filename) => {
-      const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache);
-      const cached = await checkCachedResource(cache, localPath, proposedCacheKey);
+      const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
+      const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
       const wasCached = !!cached;
       let deleted = false;
       if (wasCached) {
-        const deletedWithProposed = await cache.delete(proposedCacheKey);
-        const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache.delete(localPath) : false;
+        const deletedWithProposed = await cache2.delete(proposedCacheKey);
+        const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache2.delete(localPath) : false;
         deleted = deletedWithProposed || deletedWithLocal;
       }
       return { file: filename, deleted, wasCached };
@@ -32505,6 +33829,9 @@ var ModelRegistry = class {
   BloomModel,
   BloomPreTrainedModel,
   BloomTokenizer,
+  CHMv2ForDepthEstimation,
+  CHMv2ImageProcessor,
+  CHMv2PreTrainedModel,
   CLIPFeatureExtractor,
   CLIPImageProcessor,
   CLIPModel,
@@ -32600,6 +33927,9 @@ var ModelRegistry = class {
   DebertaV2Tokenizer,
   DecisionTransformerModel,
   DecisionTransformerPreTrainedModel,
+  DeepseekV3ForCausalLM,
+  DeepseekV3Model,
+  DeepseekV3PreTrainedModel,
   DeiTFeatureExtractor,
   DeiTForImageClassification,
   DeiTImageProcessor,
@@ -32636,6 +33966,7 @@ var ModelRegistry = class {
   DonutImageProcessor,
   DonutSwinModel,
   DonutSwinPreTrainedModel,
+  DynamicCache,
   EdgeTamModel,
   EfficientNetForImageClassification,
   EfficientNetImageProcessor,
@@ -32659,6 +33990,11 @@ var ModelRegistry = class {
   EsmModel,
   EsmPreTrainedModel,
   EsmTokenizer,
+  EuroBertForMaskedLM,
+  EuroBertForSequenceClassification,
+  EuroBertForTokenClassification,
+  EuroBertModel,
+  EuroBertPreTrainedModel,
   ExaoneForCausalLM,
   ExaoneModel,
   ExaonePreTrainedModel,
@@ -32708,6 +34044,7 @@ var ModelRegistry = class {
   Gemma3Model,
   Gemma3PreTrainedModel,
   Gemma3nAudioFeatureExtractor,
+  Gemma3nForCausalLM,
   Gemma3nForConditionalGeneration,
   Gemma3nPreTrainedModel,
   Gemma3nProcessor,
@@ -32715,8 +34052,14 @@ var ModelRegistry = class {
   GemmaModel,
   GemmaPreTrainedModel,
   GemmaTokenizer,
+  Glm46VImageProcessor,
+  Glm46VProcessor,
   GlmForCausalLM,
   GlmModel,
+  GlmMoeDsaForCausalLM,
+  GlmMoeDsaModel,
+  GlmMoeDsaPreTrainedModel,
+  GlmOcrForConditionalGeneration,
   GlmPreTrainedModel,
   GptOssForCausalLM,
   GptOssModel,
@@ -32727,6 +34070,9 @@ var ModelRegistry = class {
   GraniteMoeHybridModel,
   GraniteMoeHybridPreTrainedModel,
   GranitePreTrainedModel,
+  GraniteSpeechFeatureExtractor,
+  GraniteSpeechForConditionalGeneration,
+  GraniteSpeechProcessor,
   GroundingDinoForObjectDetection,
   GroundingDinoImageProcessor,
   GroundingDinoPreTrainedModel,
@@ -32752,7 +34098,6 @@ var ModelRegistry = class {
   IJepaPreTrainedModel,
   Idefics3ForConditionalGeneration,
   Idefics3ImageProcessor,
-  Idefics3PreTrainedModel,
   Idefics3Processor,
   ImageClassificationPipeline,
   ImageFeatureExtractionPipeline,
@@ -32777,6 +34122,10 @@ var ModelRegistry = class {
   Lfm2MoeModel,
   Lfm2MoePreTrainedModel,
   Lfm2PreTrainedModel,
+  Lfm2VlForConditionalGeneration,
+  Lfm2VlImageProcessor,
+  Lfm2VlProcessor,
+  LightOnOcrForConditionalGeneration,
   LiteWhisperForConditionalGeneration,
   Llama4ForCausalLM,
   Llama4PreTrainedModel,
@@ -32846,6 +34195,9 @@ var ModelRegistry = class {
   MimiPreTrainedModel,
   MinLengthLogitsProcessor,
   MinNewTokensLengthLogitsProcessor,
+  Mistral4ForCausalLM,
+  Mistral4Model,
+  Mistral4PreTrainedModel,
   MistralForCausalLM,
   MistralModel,
   MistralPreTrainedModel,
@@ -32917,6 +34269,9 @@ var ModelRegistry = class {
   NanoChatForCausalLM,
   NanoChatModel,
   NanoChatPreTrainedModel,
+  NemotronHForCausalLM,
+  NemotronHModel,
+  NemotronHPreTrainedModel,
   NeoBertForMaskedLM,
   NeoBertForQuestionAnswering,
   NeoBertForSequenceClassification,
@@ -32960,7 +34315,6 @@ var ModelRegistry = class {
   Owlv2Model,
   Owlv2PreTrainedModel,
   PaliGemmaForConditionalGeneration,
-  PaliGemmaPreTrainedModel,
   PaliGemmaProcessor,
   ParakeetFeatureExtractor,
   ParakeetForCTC,
@@ -33004,10 +34358,12 @@ var ModelRegistry = class {
   Qwen2MoePreTrainedModel,
   Qwen2PreTrainedModel,
   Qwen2Tokenizer,
+  Qwen2VLForCausalLM,
   Qwen2VLForConditionalGeneration,
   Qwen2VLImageProcessor,
   Qwen2VLPreTrainedModel,
   Qwen2VLProcessor,
+  Qwen2_5_VLForCausalLM,
   Qwen2_5_VLForConditionalGeneration,
   Qwen2_5_VLProcessor,
   Qwen3ForCausalLM,
@@ -33019,10 +34375,14 @@ var ModelRegistry = class {
   Qwen3NextModel,
   Qwen3NextPreTrainedModel,
   Qwen3PreTrainedModel,
+  Qwen3VLForCausalLM,
   Qwen3VLForConditionalGeneration,
+  Qwen3VLMoeForCausalLM,
   Qwen3VLMoeForConditionalGeneration,
   Qwen3VLProcessor,
+  Qwen3_5ForCausalLM,
   Qwen3_5ForConditionalGeneration,
+  Qwen3_5MoeForCausalLM,
   Qwen3_5MoeForConditionalGeneration,
   RFDetrForObjectDetection,
   RFDetrModel,
@@ -33094,7 +34454,6 @@ var ModelRegistry = class {
   SmolLM3ForCausalLM,
   SmolLM3Model,
   SmolLM3PreTrainedModel,
-  SmolVLMForConditionalGeneration,
   SmolVLMImageProcessor,
   SmolVLMProcessor,
   SnacDecoderModel,
@@ -33102,6 +34461,9 @@ var ModelRegistry = class {
   SnacFeatureExtractor,
   SnacModel,
   SnacPreTrainedModel,
+  SolarOpenForCausalLM,
+  SolarOpenModel,
+  SolarOpenPreTrainedModel,
   SpeechT5FeatureExtractor,
   SpeechT5ForSpeechToText,
   SpeechT5ForTextToSpeech,
@@ -33200,6 +34562,10 @@ var ModelRegistry = class {
   VitsTokenizer,
   VoxtralForConditionalGeneration,
   VoxtralProcessor,
+  VoxtralRealtimeFeatureExtractor,
+  VoxtralRealtimeForConditionalGeneration,
+  VoxtralRealtimePreTrainedModel,
+  VoxtralRealtimeProcessor,
   Wav2Vec2BertForCTC,
   Wav2Vec2BertForSequenceClassification,
   Wav2Vec2BertModel,
@@ -33295,7 +34661,7 @@ var ModelRegistry = class {
 onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
   (*!
-   * ONNX Runtime Web v1.25.0-dev.20260303-e7e64dc112
+   * ONNX Runtime Web v1.25.0-dev.20260307-d626b568e0
    * Copyright (c) Microsoft Corporation. All rights reserved.
    * Licensed under the MIT License.
    *)