npm - @huggingface/transformers - Versions diffs - 4.0.0-next.7 → 4.0.0-next.8 - Mend

@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (79)

package/README.md +13 -2
package/dist/transformers.js +689 -382
package/dist/transformers.min.js +19 -19
package/dist/transformers.node.cjs +716 -382
package/dist/transformers.node.min.cjs +19 -19
package/dist/transformers.node.min.mjs +19 -19
package/dist/transformers.node.mjs +689 -382
package/dist/transformers.web.js +697 -390
package/dist/transformers.web.min.js +17 -17
package/package.json +2 -2
package/src/configs.js +28 -22
package/src/env.js +1 -1
package/src/image_processors_utils.js +25 -15
package/src/models/chmv2/image_processing_chmv2.js +3 -0
package/src/models/chmv2/modeling_chmv2.js +4 -0
package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
package/src/models/eurobert/modeling_eurobert.js +41 -0
package/src/models/glm46v/image_processing_glm46v.js +12 -0
package/src/models/glm46v/processing_glm46v.js +5 -0
package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
package/src/models/image_processors.js +2 -0
package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
package/src/models/mistral4/modeling_mistral4.js +5 -0
package/src/models/modeling_utils.js +2 -0
package/src/models/models.js +10 -1
package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
package/src/models/processors.js +1 -0
package/src/models/qwen2_vl/modeling_qwen2_vl.js +187 -136
package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
package/src/models/registry.js +17 -0
package/src/models/solar_open/modeling_solar_open.js +5 -0
package/src/pipelines.js +1 -0
package/src/utils/hub.js +4 -1
package/src/utils/model_registry/get_file_metadata.js +1 -0
package/types/configs.d.ts.map +1 -1
package/types/image_processors_utils.d.ts +3 -2
package/types/image_processors_utils.d.ts.map +1 -1
package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
package/types/models/glm46v/processing_glm46v.d.ts +4 -0
package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
package/types/models/image_processors.d.ts +2 -0
package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
package/types/models/modeling_utils.d.ts.map +1 -1
package/types/models/models.d.ts +10 -1
package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
package/types/models/processors.d.ts +1 -0
package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
package/types/models/registry.d.ts.map +1 -1
package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
package/types/pipelines.d.ts +1 -0
package/types/pipelines.d.ts.map +1 -1
package/types/utils/hub.d.ts.map +1 -1
package/types/models/ast/modeling_ast.d.ts.map +0 -1
package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0

package/dist/transformers.node.cjs CHANGED

@@ -117,6 +117,9 @@ __export(transformers_exports, {
   BloomModel: () => BloomModel,
   BloomPreTrainedModel: () => BloomPreTrainedModel,
   BloomTokenizer: () => BloomTokenizer,
+  CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
+  CHMv2ImageProcessor: () => CHMv2ImageProcessor,
+  CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
   CLIPFeatureExtractor: () => CLIPFeatureExtractor,
   CLIPImageProcessor: () => CLIPImageProcessor,
   CLIPModel: () => CLIPModel,
@@ -212,6 +215,9 @@ __export(transformers_exports, {
   DebertaV2Tokenizer: () => DebertaV2Tokenizer,
   DecisionTransformerModel: () => DecisionTransformerModel,
   DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
+  DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
+  DeepseekV3Model: () => DeepseekV3Model,
+  DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
   DeiTFeatureExtractor: () => DeiTFeatureExtractor,
   DeiTForImageClassification: () => DeiTForImageClassification,
   DeiTImageProcessor: () => DeiTImageProcessor,
@@ -272,6 +278,11 @@ __export(transformers_exports, {
   EsmModel: () => EsmModel,
   EsmPreTrainedModel: () => EsmPreTrainedModel,
   EsmTokenizer: () => EsmTokenizer,
+  EuroBertForMaskedLM: () => EuroBertForMaskedLM,
+  EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
+  EuroBertForTokenClassification: () => EuroBertForTokenClassification,
+  EuroBertModel: () => EuroBertModel,
+  EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
   ExaoneForCausalLM: () => ExaoneForCausalLM,
   ExaoneModel: () => ExaoneModel,
   ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -329,8 +340,14 @@ __export(transformers_exports, {
   GemmaModel: () => GemmaModel,
   GemmaPreTrainedModel: () => GemmaPreTrainedModel,
   GemmaTokenizer: () => GemmaTokenizer,
+  Glm46VImageProcessor: () => Glm46VImageProcessor,
+  Glm46VProcessor: () => Glm46VProcessor,
   GlmForCausalLM: () => GlmForCausalLM,
   GlmModel: () => GlmModel,
+  GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
+  GlmMoeDsaModel: () => GlmMoeDsaModel,
+  GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
+  GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
   GlmPreTrainedModel: () => GlmPreTrainedModel,
   GptOssForCausalLM: () => GptOssForCausalLM,
   GptOssModel: () => GptOssModel,
@@ -396,6 +413,7 @@ __export(transformers_exports, {
   Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
   Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
   Lfm2VlProcessor: () => Lfm2VlProcessor,
+  LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
   LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
   Llama4ForCausalLM: () => Llama4ForCausalLM,
   Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -465,6 +483,9 @@ __export(transformers_exports, {
   MimiPreTrainedModel: () => MimiPreTrainedModel,
   MinLengthLogitsProcessor: () => MinLengthLogitsProcessor,
   MinNewTokensLengthLogitsProcessor: () => MinNewTokensLengthLogitsProcessor,
+  Mistral4ForCausalLM: () => Mistral4ForCausalLM,
+  Mistral4Model: () => Mistral4Model,
+  Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
   MistralForCausalLM: () => MistralForCausalLM,
   MistralModel: () => MistralModel,
   MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -536,6 +557,9 @@ __export(transformers_exports, {
   NanoChatForCausalLM: () => NanoChatForCausalLM,
   NanoChatModel: () => NanoChatModel,
   NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
+  NemotronHForCausalLM: () => NemotronHForCausalLM,
+  NemotronHModel: () => NemotronHModel,
+  NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
   NeoBertForMaskedLM: () => NeoBertForMaskedLM,
   NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
   NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -725,6 +749,9 @@ __export(transformers_exports, {
   SnacFeatureExtractor: () => SnacFeatureExtractor,
   SnacModel: () => SnacModel,
   SnacPreTrainedModel: () => SnacPreTrainedModel,
+  SolarOpenForCausalLM: () => SolarOpenForCausalLM,
+  SolarOpenModel: () => SolarOpenModel,
+  SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
   SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
   SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
   SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
@@ -925,7 +952,7 @@ var import_node_fs = __toESM(require("fs"), 1);
 var import_node_path = __toESM(require("path"), 1);
 var import_node_url = __toESM(require("url"), 1);
 var import_meta = {};
-var VERSION = "4.0.0-next.7";
+var VERSION = "4.0.0-next.8";
 var HAS_SELF = typeof self !== "undefined";
 var IS_FS_AVAILABLE = !isEmpty(import_node_fs.default);
 var IS_PATH_AVAILABLE = !isEmpty(import_node_path.default);
@@ -1155,7 +1182,7 @@ var logger = {
   }
 };
-// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.2/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
+// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
 var DictionarySplitter = class {
   /**
    * @param dictionary The dictionary of words to use for splitting.
@@ -2811,10 +2838,10 @@ var BPE = class extends TokenizerModel_default {
           );
           if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
             output_tokens.push(...byte_tokens);
-          } else {
+          } else if (this.unk_token != null) {
             output_tokens.push(this.unk_token);
           }
-        } else {
+        } else if (this.unk_token != null) {
           output_tokens.push(this.unk_token);
         }
       }
@@ -7426,13 +7453,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
       wrapped_progress
     );
   } else if (typeof response !== "string") {
+    const headers = new Headers(response.headers);
+    headers.set("content-length", result.byteLength.toString());
     await cache2.put(
       cacheKey,
       new Response(
         /** @type {any} */
         result,
         {
-          headers: response.headers
+          headers
         }
       )
     ).catch((err) => {
@@ -16643,6 +16672,7 @@ __export(processors_exports, {
   ChatterboxProcessor: () => ChatterboxProcessor,
   Florence2Processor: () => Florence2Processor,
   Gemma3nProcessor: () => Gemma3nProcessor,
+  Glm46VProcessor: () => Glm46VProcessor,
   GraniteSpeechProcessor: () => GraniteSpeechProcessor,
   GroundingDinoProcessor: () => GroundingDinoProcessor,
   Idefics3Processor: () => Idefics3Processor,
@@ -19147,26 +19177,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
   }
   return [segmentation, segments];
 }
-function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
+function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
   if (height < factor || width < factor) {
-    throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
-  } else if (Math.max(height, width) / Math.min(height, width) > 200) {
+    const scale = Math.max(factor / height, factor / width);
+    height = Math.round(height * scale);
+    width = Math.round(width * scale);
+  }
+  if (Math.max(height, width) / Math.min(height, width) > 200) {
     throw new Error(
       `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
     );
   }
   let h_bar = Math.round(height / factor) * factor;
   let w_bar = Math.round(width / factor) * factor;
-  if (h_bar * w_bar > max_pixels) {
-    const beta = Math.sqrt(height * width / max_pixels);
-    h_bar = Math.floor(height / beta / factor) * factor;
-    w_bar = Math.floor(width / beta / factor) * factor;
-  } else if (h_bar * w_bar < min_pixels) {
-    const beta = Math.sqrt(min_pixels / (height * width));
+  if (temporal_factor * h_bar * w_bar > max_pixels) {
+    const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
+    h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
+    w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
+  } else if (temporal_factor * h_bar * w_bar < min_pixels) {
+    const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
     h_bar = Math.ceil(height * beta / factor) * factor;
     w_bar = Math.ceil(width * beta / factor) * factor;
   }
-  return [h_bar, w_bar];
+  return [w_bar, h_bar];
 }
 function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
   if (label_ids_to_fuse === null) {
@@ -19245,7 +19278,7 @@ var ImageProcessor = class extends Callable2 {
     this.do_pad = config.do_pad;
     this.min_pixels = config.min_pixels;
     this.max_pixels = config.max_pixels;
-    if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
+    if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
       this.pad_size = this.size;
     }
     this.do_flip_channel_order = config.do_flip_channel_order ?? false;
@@ -19533,10 +19566,8 @@ var ImageProcessor = class extends Callable2 {
         const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
         [pixelData, imgDims] = padded;
       } else if (this.size_divisibility) {
-        const [paddedWidth, paddedHeight] = enforce_size_divisibility(
-          [imgDims[1], imgDims[0]],
-          this.size_divisibility
-        );
+        const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
+        const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
         [pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
       }
     }
@@ -19613,6 +19644,7 @@ var image_processors_exports = {};
 __export(image_processors_exports, {
   BeitFeatureExtractor: () => BeitFeatureExtractor,
   BitImageProcessor: () => BitImageProcessor,
+  CHMv2ImageProcessor: () => CHMv2ImageProcessor,
   CLIPFeatureExtractor: () => CLIPFeatureExtractor,
   CLIPImageProcessor: () => CLIPImageProcessor,
   ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
@@ -19629,6 +19661,7 @@ __export(image_processors_exports, {
   DonutImageProcessor: () => DonutImageProcessor,
   EfficientNetImageProcessor: () => EfficientNetImageProcessor,
   GLPNFeatureExtractor: () => GLPNFeatureExtractor,
+  Glm46VImageProcessor: () => Glm46VImageProcessor,
   GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
   Idefics3ImageProcessor: () => Idefics3ImageProcessor,
   ImageFeatureExtractor: () => ImageProcessor,
@@ -19689,6 +19722,10 @@ var BitImageProcessor = class extends ImageProcessor {
 var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
 };
+// src/models/chmv2/image_processing_chmv2.js
+var CHMv2ImageProcessor = class extends ImageProcessor {
+};
 // src/models/clip/image_processing_clip.js
 var CLIPImageProcessor = class extends ImageProcessor {
 };
@@ -19808,6 +19845,65 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
   }
 };
+// src/models/qwen2_vl/image_processing_qwen2_vl.js
+var Qwen2VLImageProcessor = class extends ImageProcessor {
+  constructor(config) {
+    super(config);
+    this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
+    this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
+    this.patch_size = config.patch_size;
+    this.merge_size = config.merge_size;
+  }
+  /** @type {ImageProcessor['get_resize_output_image_size']} */
+  get_resize_output_image_size(image, size) {
+    const factor = this.patch_size * this.merge_size;
+    return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
+  }
+  async _call(images, ...args) {
+    const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
+    let patches = pixel_values;
+    const { temporal_patch_size, merge_size, patch_size } = this.config;
+    if (patches.dims[0] === 1) {
+      patches = cat(
+        Array.from({ length: temporal_patch_size }, () => patches),
+        0
+      );
+    }
+    const grid_t = patches.dims[0] / temporal_patch_size;
+    const channel = patches.dims[1];
+    const grid_h = Math.floor(patches.dims[2] / patch_size);
+    const grid_w = Math.floor(patches.dims[3] / patch_size);
+    const flatten_patches = patches.view(
+      grid_t,
+      temporal_patch_size,
+      channel,
+      Math.floor(grid_h / merge_size),
+      merge_size,
+      patch_size,
+      Math.floor(grid_w / merge_size),
+      merge_size,
+      patch_size
+    ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
+    const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
+    return {
+      pixel_values: flatten_patches,
+      image_grid_thw,
+      original_sizes,
+      reshaped_input_sizes
+    };
+  }
+};
+// src/models/glm46v/image_processing_glm46v.js
+var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
+  /** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
+  get_resize_output_image_size(image, size) {
+    const factor = this.patch_size * this.merge_size;
+    const temporal_factor = this.config.temporal_patch_size ?? 2;
+    return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
+  }
+};
 // src/models/glpn/image_processing_glpn.js
 var GLPNFeatureExtractor = class extends ImageProcessor {
 };
@@ -20201,7 +20297,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
         const img = pixel_values.unsqueeze_(0);
         const total_factor = this.encoder_patch_size * this.downsample_factor;
         const f2 = total_factor ** 2;
-        const [new_height, new_width] = smart_resize(
+        const [new_width, new_height] = smart_resize(
           Math.max(total_factor, height),
           Math.max(total_factor, width),
           total_factor,
@@ -20491,55 +20587,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
 var PvtImageProcessor = class extends ImageProcessor {
 };
-// src/models/qwen2_vl/image_processing_qwen2_vl.js
-var Qwen2VLImageProcessor = class extends ImageProcessor {
-  constructor(config) {
-    super(config);
-    this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
-    this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
-    this.patch_size = config.patch_size;
-    this.merge_size = config.merge_size;
-  }
-  /** @type {ImageProcessor['get_resize_output_image_size']} */
-  get_resize_output_image_size(image, size) {
-    const factor = this.patch_size * this.merge_size;
-    return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
-  }
-  async _call(images, ...args) {
-    const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
-    let patches = pixel_values;
-    const { temporal_patch_size, merge_size, patch_size } = this.config;
-    if (patches.dims[0] === 1) {
-      patches = cat(
-        Array.from({ length: temporal_patch_size }, () => patches),
-        0
-      );
-    }
-    const grid_t = patches.dims[0] / temporal_patch_size;
-    const channel = patches.dims[1];
-    const grid_h = Math.floor(patches.dims[2] / patch_size);
-    const grid_w = Math.floor(patches.dims[3] / patch_size);
-    const flatten_patches = patches.view(
-      grid_t,
-      temporal_patch_size,
-      channel,
-      Math.floor(grid_h / merge_size),
-      merge_size,
-      patch_size,
-      Math.floor(grid_w / merge_size),
-      merge_size,
-      patch_size
-    ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
-    const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
-    return {
-      pixel_values: flatten_patches,
-      image_grid_thw,
-      original_sizes,
-      reshaped_input_sizes
-    };
-  }
-};
 // src/models/rt_detr/image_processing_rt_detr.js
 var RTDetrImageProcessor = class extends ImageProcessor {
   /** @type {typeof post_process_object_detection} */
@@ -21093,6 +21140,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
   }
 };
+// src/models/qwen2_vl/processing_qwen2_vl.js
+var Qwen2VLProcessor = class extends Processor {
+  static image_processor_class = AutoImageProcessor;
+  static tokenizer_class = AutoTokenizer;
+  static image_token = "<|image_pad|>";
+  /**
+   *
+   * @param {string|string[]} text
+   * @param {RawImage|RawImage[]} images
+   * @param  {...any} args
+   * @returns {Promise<any>}
+   */
+  async _call(text, images = null, ...args) {
+    if (!Array.isArray(text)) {
+      text = [text];
+    }
+    let image_inputs, image_grid_thw;
+    if (images) {
+      image_inputs = await this.image_processor(images);
+      image_grid_thw = image_inputs.image_grid_thw;
+    }
+    if (image_grid_thw) {
+      let merge_length = this.image_processor.config.merge_size ** 2;
+      let index = 0;
+      const image_token = (
+        /** @type {typeof Qwen2VLProcessor} */
+        this.constructor.image_token
+      );
+      const image_grid_thw_list = image_grid_thw.tolist();
+      text = text.map((t) => {
+        while (t.includes(image_token)) {
+          const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
+          t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
+        }
+        return t.replaceAll("<|placeholder|>", image_token);
+      });
+    }
+    const text_inputs = this.tokenizer(text);
+    return {
+      ...text_inputs,
+      ...image_inputs
+    };
+  }
+};
+// src/models/glm46v/processing_glm46v.js
+var Glm46VProcessor = class extends Qwen2VLProcessor {
+  static image_token = "<|image|>";
+};
 // src/models/granite_speech/processing_granite_speech.js
 var GraniteSpeechProcessor = class extends Processor {
   static tokenizer_class = AutoTokenizer;
@@ -21823,47 +21920,6 @@ var PyAnnoteProcessor = class extends Processor {
   }
 };
-// src/models/qwen2_vl/processing_qwen2_vl.js
-var Qwen2VLProcessor = class extends Processor {
-  static image_processor_class = AutoImageProcessor;
-  static tokenizer_class = AutoTokenizer;
-  /**
-   *
-   * @param {string|string[]} text
-   * @param {RawImage|RawImage[]} images
-   * @param  {...any} args
-   * @returns {Promise<any>}
-   */
-  async _call(text, images = null, ...args) {
-    if (!Array.isArray(text)) {
-      text = [text];
-    }
-    let image_inputs, image_grid_thw;
-    if (images) {
-      image_inputs = await this.image_processor(images);
-      image_grid_thw = image_inputs.image_grid_thw;
-    }
-    if (image_grid_thw) {
-      let merge_length = this.image_processor.config.merge_size ** 2;
-      let index = 0;
-      const image_grid_thw_list = image_grid_thw.tolist();
-      text = text.map((t) => {
-        while (t.includes("<|image_pad|>")) {
-          const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
-          t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
-        }
-        return t.replaceAll("<|placeholder|>", "<|image_pad|>");
-      });
-    }
-    const text_inputs = this.tokenizer(text);
-    return {
-      ...text_inputs,
-      ...image_inputs
-      // TODO: ...videos_inputs,
-    };
-  }
-};
 // src/models/qwen2_5_vl/processing_qwen2_5_vl.js
 var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
 };
@@ -22207,6 +22263,8 @@ function getNormalizedConfig(config) {
     case "gemma3n":
     case "lfm2_vl":
     case "chatterbox":
+    case "lighton_ocr":
+    case "glm_ocr":
     case "mistral3":
     case "qwen2_5_vl":
     case "qwen3_vl":
@@ -22282,6 +22340,8 @@ function getNormalizedConfig(config) {
       mapping["dim_kv"] = "head_dim";
       break;
     case "qwen3":
+    case "solar_open":
+    case "glm_ocr_text":
     case "gemma":
     case "gemma2":
     case "vaultgemma":
@@ -22292,6 +22352,7 @@ function getNormalizedConfig(config) {
     case "ernie4_5":
     case "hunyuan_v1_dense":
     case "falcon_h1":
+    case "nemotron_h":
     case "ministral":
     case "ministral3":
       mapping["num_heads"] = "num_key_value_heads";
@@ -22326,6 +22387,9 @@ function getNormalizedConfig(config) {
       mapping["num_attention_heads"] = "num_attention_heads";
       break;
     case "youtu":
+    case "deepseek_v3":
+    case "glm_moe_dsa":
+    case "mistral4":
       mapping["num_heads"] = "num_key_value_heads";
       mapping["num_layers"] = "num_hidden_layers";
       mapping["dim_kv"] = "qk_head_dim";
@@ -22414,6 +22478,7 @@ function getCacheShapes(config, options) {
   if (!(config instanceof PretrainedConfig)) {
     config = new PretrainedConfig(config);
   }
+  const batch_size = options?.batch_size ?? 1;
   if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
     const pkv_prefix = options?.prefix ?? "past_key_values";
     const conv_prefix = pkv_prefix === "present" ? "present" : "past";
@@ -22423,7 +22488,6 @@ function getCacheShapes(config, options) {
       config
     );
     const head_dim = hidden_size / num_attention_heads;
-    const batch_size = options?.batch_size ?? 1;
     for (let i = 0; i < layer_types.length; ++i) {
       if (layer_types[i] === "full_attention") {
         for (const kv of ["key", "value"]) {
@@ -22436,31 +22500,26 @@ function getCacheShapes(config, options) {
       }
     }
     return cache_values;
-  } else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
+  } else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
     const pkv_prefix = options?.prefix ?? "past_key_values";
     const conv_prefix = pkv_prefix === "present" ? "present" : "past";
-    const cache_values = {};
-    const {
-      layer_types,
-      num_hidden_layers,
-      num_attention_heads,
-      num_key_value_heads,
-      hidden_size,
-      mamba_d_conv,
-      mamba_n_heads,
-      mamba_d_head,
-      mamba_d_state,
-      mamba_n_groups,
-      mamba_expand,
-      mamba_d_ssm
-    } = (
+    const c = (
       /** @type {any} */
       config
     );
-    const head_dim = hidden_size / num_attention_heads;
-    const batch_size = options?.batch_size ?? 1;
-    const conv_d_inner = (mamba_d_ssm ?? mamba_expand * hidden_size) + 2 * mamba_n_groups * mamba_d_state;
-    for (let i = 0; i < num_hidden_layers; ++i) {
+    const layer_types = c.layer_types ?? c.layers_block_type;
+    const num_layers = c.num_hidden_layers ?? layer_types?.length;
+    const num_key_value_heads = c.num_key_value_heads;
+    const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
+    const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
+    const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
+    const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
+    const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
+    const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
+    const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
+    const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
+    const cache_values = {};
+    for (let i = 0; i < num_layers; ++i) {
       if (!layer_types || layer_types[i] === "mamba") {
         cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
         cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
@@ -22494,7 +22553,6 @@ function getCacheShapes(config, options) {
     const key_dim = linear_key_head_dim * linear_num_key_heads;
     const value_dim = linear_value_head_dim * linear_num_value_heads;
     const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
-    const batch_size = options?.batch_size ?? 1;
     for (let i = 0; i < layer_types.length; ++i) {
       if (layer_types[i] === "full_attention") {
         for (const kv of ["key", "value"]) {
@@ -25122,7 +25180,9 @@ async function generic_text_to_text_forward(self2, {
         "qwen3_5",
         "qwen3_5_text",
         "qwen3_5_moe",
-        "qwen3_5_moe_text"
+        "qwen3_5_moe_text",
+        "glm_ocr",
+        "glm_ocr_text"
       ].includes(self2.config.model_type)
     ) {
       const { image_grid_thw, video_grid_thw } = kwargs;
@@ -25346,6 +25406,8 @@ __export(models_exports, {
   BloomForCausalLM: () => BloomForCausalLM,
   BloomModel: () => BloomModel,
   BloomPreTrainedModel: () => BloomPreTrainedModel,
+  CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
+  CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
   CLIPModel: () => CLIPModel,
   CLIPPreTrainedModel: () => CLIPPreTrainedModel,
   CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
@@ -25420,6 +25482,9 @@ __export(models_exports, {
   DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
   DecisionTransformerModel: () => DecisionTransformerModel,
   DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
+  DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
+  DeepseekV3Model: () => DeepseekV3Model,
+  DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
   DeiTForImageClassification: () => DeiTForImageClassification,
   DeiTModel: () => DeiTModel,
   DeiTPreTrainedModel: () => DeiTPreTrainedModel,
@@ -25465,6 +25530,11 @@ __export(models_exports, {
   EsmForTokenClassification: () => EsmForTokenClassification,
   EsmModel: () => EsmModel,
   EsmPreTrainedModel: () => EsmPreTrainedModel,
+  EuroBertForMaskedLM: () => EuroBertForMaskedLM,
+  EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
+  EuroBertForTokenClassification: () => EuroBertForTokenClassification,
+  EuroBertModel: () => EuroBertModel,
+  EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
   ExaoneForCausalLM: () => ExaoneForCausalLM,
   ExaoneModel: () => ExaoneModel,
   ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -25511,6 +25581,10 @@ __export(models_exports, {
   GemmaPreTrainedModel: () => GemmaPreTrainedModel,
   GlmForCausalLM: () => GlmForCausalLM,
   GlmModel: () => GlmModel,
+  GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
+  GlmMoeDsaModel: () => GlmMoeDsaModel,
+  GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
+  GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
   GlmPreTrainedModel: () => GlmPreTrainedModel,
   GptOssForCausalLM: () => GptOssForCausalLM,
   GptOssModel: () => GptOssModel,
@@ -25557,6 +25631,7 @@ __export(models_exports, {
   Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
   Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
   Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
+  LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
   LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
   Llama4ForCausalLM: () => Llama4ForCausalLM,
   Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -25606,6 +25681,9 @@ __export(models_exports, {
   MimiEncoderOutput: () => MimiEncoderOutput,
   MimiModel: () => MimiModel,
   MimiPreTrainedModel: () => MimiPreTrainedModel,
+  Mistral4ForCausalLM: () => Mistral4ForCausalLM,
+  Mistral4Model: () => Mistral4Model,
+  Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
   MistralForCausalLM: () => MistralForCausalLM,
   MistralModel: () => MistralModel,
   MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -25663,6 +25741,9 @@ __export(models_exports, {
   NanoChatForCausalLM: () => NanoChatForCausalLM,
   NanoChatModel: () => NanoChatModel,
   NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
+  NemotronHForCausalLM: () => NemotronHForCausalLM,
+  NemotronHModel: () => NemotronHModel,
+  NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
   NeoBertForMaskedLM: () => NeoBertForMaskedLM,
   NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
   NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -25800,6 +25881,9 @@ __export(models_exports, {
   SnacEncoderModel: () => SnacEncoderModel,
   SnacModel: () => SnacModel,
   SnacPreTrainedModel: () => SnacPreTrainedModel,
+  SolarOpenForCausalLM: () => SolarOpenForCausalLM,
+  SolarOpenModel: () => SolarOpenModel,
+  SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
   SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
   SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
   SpeechT5HifiGan: () => SpeechT5HifiGan,
@@ -25974,7 +26058,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
 var ArceeForCausalLM = class extends ArceePreTrainedModel {
 };
-// src/models/ast/modeling_ast.js
+// src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
 var ASTPreTrainedModel = class extends PreTrainedModel {
 };
 var ASTModel = class extends ASTPreTrainedModel {
@@ -26309,6 +26393,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
 var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
 };
+// src/models/chmv2/modeling_chmv2.js
+/**
+ * Base class for the CHMv2 models. The body is empty, so all behavior is inherited from `PreTrainedModel`.
+ */
+var CHMv2PreTrainedModel = class extends PreTrainedModel {
+};
+/**
+ * CHMv2 depth-estimation class (per its name). The body is empty, so all behavior is inherited
+ * from `CHMv2PreTrainedModel`.
+ */
+var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
+};
 // src/models/clap/modeling_clap.js
 var ClapPreTrainedModel = class extends PreTrainedModel {
 };
@@ -26647,6 +26737,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
   }
 };
+// src/models/deepseek_v3/modeling_deepseek_v3.js
+/**
+ * Base class for the DeepseekV3 models. The body is empty, so all behavior is inherited from `PreTrainedModel`.
+ */
+var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
+};
+/**
+ * DeepseekV3 model class. Adds nothing on top of `DeepseekV3PreTrainedModel`.
+ */
+var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
+};
+/**
+ * DeepseekV3 causal-LM class (per its name). Adds nothing on top of `DeepseekV3PreTrainedModel`.
+ */
+var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
+};
 // src/models/deberta_v2/modeling_deberta_v2.js
 var DebertaV2PreTrainedModel = class extends PreTrainedModel {
 };
@@ -26995,6 +27093,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
   }
 };
+// src/models/eurobert/modeling_eurobert.js
+/**
+ * Base class for the EuroBert models. The body is empty, so all behavior is inherited from `PreTrainedModel`.
+ */
+var EuroBertPreTrainedModel = class extends PreTrainedModel {
+};
+/**
+ * EuroBert model class. Adds nothing on top of `EuroBertPreTrainedModel`.
+ */
+var EuroBertModel = class extends EuroBertPreTrainedModel {
+};
+var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
+  /**
+   * Runs the inherited `_call` and wraps its result in a `MaskedLMOutput`.
+   *
+   * @param {Object} model_inputs The inputs to the model.
+   * @returns {Promise<MaskedLMOutput>} The wrapped model output for masked language modeling.
+   */
+  async _call(model_inputs) {
+    const outputs = await super._call(model_inputs);
+    return new MaskedLMOutput(outputs);
+  }
+};
+var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
+  /**
+   * Runs the inherited `_call` and wraps its result in a `SequenceClassifierOutput`.
+   *
+   * @param {Object} model_inputs The inputs to the model.
+   * @returns {Promise<SequenceClassifierOutput>} The wrapped model output for sequence classification.
+   */
+  async _call(model_inputs) {
+    const outputs = await super._call(model_inputs);
+    return new SequenceClassifierOutput(outputs);
+  }
+};
+var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
+  /**
+   * Runs the inherited `_call` and wraps its result in a `TokenClassifierOutput`.
+   *
+   * @param {Object} model_inputs The inputs to the model.
+   * @returns {Promise<TokenClassifierOutput>} The wrapped model output for token classification.
+   */
+  async _call(model_inputs) {
+    const outputs = await super._call(model_inputs);
+    return new TokenClassifierOutput(outputs);
+  }
+};
 // src/models/exaone/modeling_exaone.js
 var ExaonePreTrainedModel = class extends PreTrainedModel {
 };
@@ -27270,6 +27407,377 @@ var GlmModel = class extends GlmPreTrainedModel {
 var GlmForCausalLM = class extends GlmPreTrainedModel {
 };
+// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
+/**
+ * Base class for the GlmMoeDsa models. The body is empty, so all behavior is inherited from `PreTrainedModel`.
+ */
+var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
+};
+/**
+ * GlmMoeDsa model class. Adds nothing on top of `GlmMoeDsaPreTrainedModel`.
+ */
+var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
+};
+/**
+ * GlmMoeDsa causal-LM class (per its name). Adds nothing on top of `GlmMoeDsaPreTrainedModel`.
+ */
+var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
+};
+// src/models/qwen2_vl/modeling_qwen2_vl.js
+/**
+ * Base class for the Qwen2-VL models. Its only member is `forward_params`, a list of input names
+ * covering both the text and the vision inputs.
+ * NOTE(review): how `forward_params` is consumed is decided by `PreTrainedModel`, which is not
+ * visible here; presumably it selects which inputs reach the forward pass. Confirm against the base class.
+ */
+var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
+  forward_params = [
+    // Text inputs
+    "input_ids",
+    "attention_mask",
+    "position_ids",
+    "past_key_values",
+    // Vision inputs
+    "pixel_values",
+    "image_grid_thw"
+  ];
+};
+var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
+  // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
+  // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
+  // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
+  image_grid_thw_name = "grid_thw";
+  /**
+   * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
+   *
+   * With an attention mask, the per-token positions come from `cumsum_masked_fill(attention_mask)`
+   * and are repeated once per rope dimension. Without a mask, every row is `0..seq_length-1`.
+   * @param {Tensor} input_ids Token IDs; only `dims` is read, and only when no mask is given.
+   * @param {Tensor} attention_mask (Optional) Attention mask; its cumulative-sum result supplies the positions.
+   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
+   */
+  _get_text_only_rope_index(input_ids, attention_mask) {
+    if (attention_mask) {
+      const { data, dims } = cumsum_masked_fill(attention_mask);
+      const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
+      // delta = (max position + 1) - sequence_length, the same formula the multimodal branch of
+      // `get_rope_index` uses. During decoding it is added to the cache length to obtain the next
+      // position, so it has to be 0 for an unpadded text-only prompt.
+      const seq_length = BigInt(dims[1]);
+      const mrope_position_deltas = Array.from(
+        { length: dims[0] },
+        (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n - seq_length
+      );
+      return [
+        new Tensor2("int64", position_ids, [3, ...dims]),
+        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
+      ];
+    } else {
+      const [batch_size, seq_length] = input_ids.dims;
+      // The flat buffer is laid out as [3, batch_size, seq_length], so the position of element `i`
+      // inside its row is simply `i % seq_length`, independent of the batch size.
+      const position_ids = BigInt64Array.from(
+        { length: 3 * batch_size * seq_length },
+        (_, i) => BigInt(i % seq_length)
+      );
+      return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
+    }
+  }
+  /**
+   * Flatten per-segment position lists into one array ordered as [all_t, all_h, all_w], then copy
+   * those positions into `position_ids_list` at the slots where the attention mask equals 1.
+   * @param {number[][]} llm_pos_ids_list Per-segment position arrays, each laid out as [t..., h..., w...] (length 3*seg_len)
+   * @param {number[]} attn_mask Attention mask for this batch element
+   * @param {number[][][]} position_ids_list [3][batch][seq] output array, mutated in place
+   * @param {number} batch_idx Current batch index
+   * @returns {number[]} Flat reordered positions of length total_len
+   */
+  _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
+    let total_len = 0;
+    for (const segment of llm_pos_ids_list) {
+      total_len += segment.length;
+    }
+    const llm_positions = new Array(total_len);
+    // Gather dimension by dimension: first every segment's t-part, then the h-parts, then the w-parts.
+    let cursor = 0;
+    for (let dim = 0; dim < 3; ++dim) {
+      for (const segment of llm_pos_ids_list) {
+        const seg_len = segment.length / 3;
+        const begin = dim * seg_len;
+        for (let k = 0; k < seg_len; ++k) {
+          llm_positions[cursor++] = segment[begin + k];
+        }
+      }
+    }
+    // Scatter into the output, skipping masked slots. `visible` counts unmasked tokens seen so far.
+    let visible = 0;
+    for (let pos = 0; pos < attn_mask.length; ++pos) {
+      // Loose comparison is kept on purpose so the check behaves exactly as before for any mask value type.
+      if (attn_mask[pos] != 1) continue;
+      for (let dim = 0; dim < 3; ++dim) {
+        position_ids_list[dim][batch_idx][pos] = llm_positions[dim * total_len / 3 + visible];
+      }
+      ++visible;
+    }
+    return llm_positions;
+  }
+  /**
+   * Build per-batch position ID segments for multimodal rope.
+   * Override this in subclasses to change how vision/text segments are identified and positioned.
+   *
+   * The sequence is walked left to right. For every vision block (a token following a
+   * `vision_start_token_id`), one text segment (tokens since the previous block) and one vision
+   * segment (a t/h/w grid) are appended. Any trailing text becomes a final segment.
+   * @param {object} params
+   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
+   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
+   * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
+   * @param {number} params.spatial_merge_size
+   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
+   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
+   */
+  _get_multimodal_rope_positions({
+    filtered_ids,
+    image_grid_thw_list,
+    video_grid_thw_list,
+    spatial_merge_size,
+    state
+  }) {
+    const { image_token_id, video_token_id, vision_start_token_id } = this.config;
+    const ids = filtered_ids;
+    // Indices of every vision-start marker. Comparisons use `==` throughout;
+    // NOTE(review): presumably token IDs and config IDs can differ in numeric type — confirm before tightening.
+    const vision_start_indices = ids.reduce((acc, x, idx) => {
+      if (x == vision_start_token_id) acc.push(idx);
+      return acc;
+    }, []);
+    // The token right after each marker tells whether the block is an image or a video.
+    const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
+    const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
+    const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
+    const llm_pos_ids_list = [];
+    // `st2` is the index just past the last consumed vision segment.
+    let st2 = 0;
+    let remain_images = image_nums;
+    let remain_videos = video_nums;
+    for (let j = 0; j < vision_tokens.length; ++j) {
+      const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
+      const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
+      // `ids.length + 1` is a sentinel meaning "no such block left", so the other modality wins the comparison.
+      const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
+      const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
+      let ed;
+      let t, h, w;
+      // Take whichever block starts first and consume its grid entry from the shared counters.
+      if (ed_image < ed_video) {
+        [t, h, w] = image_grid_thw_list[state.image_index];
+        ++state.image_index;
+        --remain_images;
+        ed = ed_image;
+      } else {
+        [t, h, w] = video_grid_thw_list[state.video_index];
+        ++state.video_index;
+        --remain_videos;
+        ed = ed_video;
+      }
+      // Height and width are reduced by the spatial merge factor; the temporal size is used as is.
+      const [llm_grid_t, llm_grid_h, llm_grid_w] = [
+        Number(t),
+        Math.floor(Number(h) / spatial_merge_size),
+        Math.floor(Number(w) / spatial_merge_size)
+      ];
+      // Text segment preceding this vision block: the same running position on all three axes,
+      // continuing from one past the maximum of the previous segment.
+      const text_len = ed - st2;
+      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
+      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
+      // Vision segment: positions are grid coordinates shifted by `offset`.
+      const offset = text_len + st_idx;
+      const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
+      const t_index = Array.from(
+        { length: grid_size },
+        (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
+      );
+      const h_index = Array.from(
+        { length: grid_size },
+        (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
+      );
+      const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
+      llm_pos_ids_list.push([t_index, h_index, w_index].flat());
+      st2 = ed + grid_size;
+    }
+    // Remaining text after the last vision block (or the whole sequence when there is none).
+    if (st2 < ids.length) {
+      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
+      const text_len = ids.length - st2;
+      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
+    }
+    return llm_pos_ids_list;
+  }
+  /**
+   * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
+   *
+   * Explanation:
+   *     Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
+   *
+   *     For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
+   *     Examples:
+   *         input_ids: [T T T T T], here T is for text.
+   *         temporal position_ids: [0, 1, 2, 3, 4]
+   *         height position_ids: [0, 1, 2, 3, 4]
+   *         width position_ids: [0, 1, 2, 3, 4]
+   *
+   *     For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
+   *     and 1D rotary position embedding for text part.
+   *     Examples:
+   *         Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
+   *         input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
+   *         vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
+   *         vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
+   *         vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+   *         text temporal position_ids: [3, 4, 5, 6, 7]
+   *         text height position_ids: [3, 4, 5, 6, 7]
+   *         text width position_ids: [3, 4, 5, 6, 7]
+   *         Here we calculate the text start position_ids as the max vision position_ids plus 1.
+   *
+   * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
+   * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
+   * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
+   * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
+   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
+   */
+  get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
+    const { vision_config } = this.config;
+    const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
+    if (image_grid_thw || video_grid_thw) {
+      const total_input_ids = input_ids.tolist();
+      // Without a mask, every token is treated as visible.
+      if (!attention_mask) {
+        attention_mask = ones_like(input_ids);
+      }
+      const attention_mask_list = attention_mask.tolist();
+      // [3][batch][seq] buffer pre-filled with 0; visible slots are overwritten per batch element below.
+      const position_ids_list = Array.from(
+        { length: 3 },
+        () => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
+      );
+      const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
+      const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
+      // Grid counters are shared across the batch so each image/video grid is consumed exactly once.
+      const state = { image_index: 0, video_index: 0 };
+      const mrope_position_deltas = [];
+      for (let i = 0; i < total_input_ids.length; ++i) {
+        // Keep only the tokens whose mask entry equals 1.
+        const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
+        const llm_pos_ids_list = this._get_multimodal_rope_positions({
+          filtered_ids,
+          image_grid_thw_list,
+          video_grid_thw_list,
+          spatial_merge_size,
+          state
+        });
+        const llm_positions = this._reorder_and_write_positions(
+          llm_pos_ids_list,
+          attention_mask_list[i],
+          position_ids_list,
+          i
+        );
+        // Delta between the next free position and the (padded) sequence length of this row.
+        mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
+      }
+      return [
+        new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
+        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
+      ];
+    } else {
+      // No vision grids at all: fall back to plain 1D positions repeated on the three axes.
+      return this._get_text_only_rope_index(input_ids, attention_mask);
+    }
+  }
+  /**
+   * Run the `vision_encoder` session and return its `image_features` output.
+   * The grid tensor is passed under the input name stored in `this.image_grid_thw_name`.
+   * @param {object} inputs
+   * @param {Tensor} inputs.pixel_values Forwarded to the session as `pixel_values`.
+   * @param {Tensor} inputs.image_grid_thw Forwarded under the name given by `image_grid_thw_name`.
+   * @returns {Promise<any>} The `image_features` entry of the session result.
+   */
+  async encode_image({ pixel_values, image_grid_thw }) {
+    const session = this.sessions["vision_encoder"];
+    const feeds = {
+      pixel_values,
+      [this.image_grid_thw_name]: image_grid_thw
+    };
+    const { image_features } = await sessionRun(session, feeds);
+    return image_features;
+  }
+  /**
+   * Delegate to `default_merge_input_ids_with_image_features`, supplying this model's
+   * `config.image_token_id`. Because `kwargs` is spread last, an `image_token_id` passed by the
+   * caller takes precedence over the configured one.
+   * @param {object} kwargs Arguments forwarded unchanged to the default merge helper.
+   * @returns {any} Whatever the default merge helper returns.
+   */
+  _merge_input_ids_with_image_features(kwargs) {
+    return default_merge_input_ids_with_image_features({
+      // @ts-ignore
+      image_token_id: this.config.image_token_id,
+      ...kwargs
+    });
+  }
+  /**
+   * Fill in `position_ids` (and `rope_deltas`) on `model_inputs` before a generation step.
+   * Nothing is changed unless an attention mask is present and `position_ids` is not already set.
+   * The `input_ids` and `generation_config` parameters are not read by this implementation.
+   * @param {any} input_ids Unused here.
+   * @param {object} model_inputs Inputs for the next forward pass; mutated in place and returned.
+   * @param {any} generation_config Unused here.
+   * @returns {object} The same `model_inputs` object.
+   */
+  prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
+    if (model_inputs.attention_mask && !model_inputs.position_ids) {
+      if (!model_inputs.past_key_values) {
+        // No cache yet: compute positions and deltas for the whole prompt.
+        [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
+          model_inputs.input_ids,
+          model_inputs.image_grid_thw,
+          model_inputs.video_grid_thw,
+          model_inputs.attention_mask
+        );
+      } else {
+        // A cache exists, so pixel values are cleared for this step.
+        model_inputs.pixel_values = null;
+        const past_length = model_inputs.past_key_values.get_seq_length();
+        if (past_length < model_inputs.input_ids.dims[1]) {
+          // The input is longer than the cache: compute positions for the full input, then keep
+          // only the part (positions and token IDs) that lies beyond the cached length.
+          const [full_position_ids, rope_deltas] = this.get_rope_index(
+            model_inputs.input_ids,
+            model_inputs.image_grid_thw,
+            model_inputs.video_grid_thw,
+            model_inputs.attention_mask
+          );
+          model_inputs.rope_deltas = rope_deltas;
+          model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
+          model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
+        } else {
+          // Input is not longer than the cache: the position is cache length + rope delta.
+          if (!model_inputs.rope_deltas) {
+            [, model_inputs.rope_deltas] = this.get_rope_index(
+              model_inputs.input_ids,
+              model_inputs.image_grid_thw,
+              model_inputs.video_grid_thw,
+              model_inputs.attention_mask
+            );
+          }
+          const delta = BigInt(past_length);
+          const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
+          // The same positions are used for all three rope axes.
+          model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
+        }
+      }
+    }
+    return model_inputs;
+  }
+};
+/**
+ * Causal-LM variant of Qwen2-VL. The body is empty, so it inherits everything from
+ * `Qwen2VLForConditionalGeneration` (see the NOTE at the top of that class).
+ */
+var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
+};
+// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
+/**
+ * Qwen2.5-VL conditional-generation class. Identical to `Qwen2VLForConditionalGeneration` except
+ * that the grid input of the vision encoder is named "image_grid_thw" instead of "grid_thw".
+ */
+var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
+  image_grid_thw_name = "image_grid_thw";
+};
+/**
+ * Qwen2.5-VL causal-LM class. Extends `Qwen2VLForCausalLM` and applies the same
+ * "image_grid_thw" input-name override.
+ */
+var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
+  image_grid_thw_name = "image_grid_thw";
+};
+// src/models/glm_ocr/modeling_glm_ocr.js
+/**
+ * GLM-OCR conditional-generation class. Reuses the Qwen2.5-VL machinery and overrides only how
+ * multimodal rope positions are assigned.
+ */
+var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
+  /**
+   * Compute 3D positional indices for vision tokens.
+   * Temporal is constant, height is repeat-interleaved, width tiles.
+   * @param {number} start_position Position given to the first vision token on every axis.
+   * @param {number[]} grid_thw [T, H, W]
+   * @param {number} temp_merge_size Divisor applied to T (result floored).
+   * @param {number} spatial_merge_size Divisor applied to H and W (results floored).
+   * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
+   */
+  get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
+    const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
+    const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
+    const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
+    const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
+    const t_pos = Array.from({ length: seq_len }, () => start_position);
+    const h_pos = Array.from(
+      { length: seq_len },
+      (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
+    );
+    const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
+    return [...t_pos, ...h_pos, ...w_pos];
+  }
+  /**
+   * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
+   * instead of vision_start_token_id scanning used by Qwen2VL.
+   * After a vision segment, position advances by floor(max(h, w) / spatial_merge_size).
+   * @param {object} params
+   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
+   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
+   * @param {any[][]} params.video_grid_thw_list - accepted for interface parity; not read by this override
+   * @param {number} params.spatial_merge_size
+   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
+   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
+   * @throws {Error} If an image token group has no matching entry in `image_grid_thw_list`.
+   */
+  _get_multimodal_rope_positions({
+    filtered_ids,
+    image_grid_thw_list,
+    video_grid_thw_list,
+    spatial_merge_size,
+    state
+  }) {
+    const { image_token_id } = this.config;
+    // Split the sequence into maximal runs of same-type tokens (1 = image token, 0 = anything else).
+    // Each group is [type, start, end). The sentinel type -1 at j == length flushes the final run.
+    const groups = [];
+    let group_start = 0;
+    let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
+    for (let j = 1; j <= filtered_ids.length; ++j) {
+      const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
+      if (t !== current_type) {
+        groups.push([current_type, group_start, j]);
+        group_start = j;
+        current_type = t;
+      }
+    }
+    let current_pos = 0;
+    const llm_pos_ids_list = [];
+    for (const [modality_type, start_idx, end_idx] of groups) {
+      if (modality_type === 0) {
+        // Text run: the same running position on all three axes.
+        const text_len = end_idx - start_idx;
+        llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
+        current_pos += text_len;
+      } else {
+        const raw_grid_thw = image_grid_thw_list[state.image_index++];
+        if (!raw_grid_thw) {
+          // Fail with a clear message instead of an opaque TypeError on `undefined.map`.
+          throw new Error(
+            `Found more image token groups than image grids (no grid at index ${state.image_index - 1}).`
+          );
+        }
+        const grid_thw = raw_grid_thw.map(Number);
+        // Using T itself as the temporal divisor makes llm_grid_t equal 1 for any positive T.
+        const temp_merge_size = grid_thw[0];
+        llm_pos_ids_list.push(
+          this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
+        );
+        // Floor so positions stay integral, consistent with the floored grid sizes used in
+        // `get_vision_position_ids`; a fractional value here would leak into every later position.
+        current_pos += Math.floor(Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size);
+      }
+    }
+    return llm_pos_ids_list;
+  }
+};
 // src/models/glpn/modeling_glpn.js
 var GLPNPreTrainedModel = class extends PreTrainedModel {
 };
@@ -27582,6 +28090,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
 var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
 };
+// src/models/lighton_ocr/modeling_lighton_ocr.js
+/**
+ * LightOnOcr conditional-generation class. The body is empty, so all behavior is inherited from
+ * `LlavaForConditionalGeneration`.
+ */
+var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
+};
 // src/models/lfm2_moe/modeling_lfm2_moe.js
 var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
 };
@@ -27778,6 +28290,14 @@ var MistralModel = class extends MistralPreTrainedModel {
 var MistralForCausalLM = class extends MistralPreTrainedModel {
 };
+// src/models/mistral4/modeling_mistral4.js
+/**
+ * Base class for the Mistral4 models. The body is empty, so all behavior is inherited from `PreTrainedModel`.
+ */
+var Mistral4PreTrainedModel = class extends PreTrainedModel {
+};
+/**
+ * Mistral4 model class. Adds nothing on top of `Mistral4PreTrainedModel`.
+ */
+var Mistral4Model = class extends Mistral4PreTrainedModel {
+};
+/**
+ * Mistral4 causal-LM class (per its name). Adds nothing on top of `Mistral4PreTrainedModel`.
+ */
+var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
+};
 // src/models/mobilebert/modeling_mobilebert.js
 var MobileBertPreTrainedModel = class extends PreTrainedModel {
 };
@@ -28246,6 +28766,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
 var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
 };
+// src/models/nemotron_h/modeling_nemotron_h.js
+/**
+ * Base class for the NemotronH models. The body is empty, so all behavior is inherited from `PreTrainedModel`.
+ */
+var NemotronHPreTrainedModel = class extends PreTrainedModel {
+};
+/**
+ * NemotronH model class. Adds nothing on top of `NemotronHPreTrainedModel`.
+ */
+var NemotronHModel = class extends NemotronHPreTrainedModel {
+};
+/**
+ * NemotronH causal-LM class (per its name). Adds nothing on top of `NemotronHPreTrainedModel`.
+ */
+var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
+};
 // src/models/neobert/modeling_neobert.js
 var NeoBertPreTrainedModel = class extends PreTrainedModel {
 };
@@ -28526,252 +29054,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
 var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
 };
-// src/models/qwen2_vl/modeling_qwen2_vl.js
-var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
-  forward_params = [
-    // Text inputs
-    "input_ids",
-    "attention_mask",
-    "position_ids",
-    "past_key_values",
-    // Vision inputs
-    "pixel_values",
-    "image_grid_thw"
-  ];
-};
-var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
-  // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
-  // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
-  // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
-  image_grid_thw_name = "grid_thw";
-  /**
-   * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
-   *
-   * Explanation:
-   *     Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
-   *
-   *     For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
-   *     Examples:
-   *         input_ids: [T T T T T], here T is for text.
-   *         temporal position_ids: [0, 1, 2, 3, 4]
-   *         height position_ids: [0, 1, 2, 3, 4]
-   *         width position_ids: [0, 1, 2, 3, 4]
-   *
-   *     For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
-   *     and 1D rotary position embeddin for text part.
-   *     Examples:
-   *         Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
-   *         input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
-   *         vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
-   *         vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
-   *         vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
-   *         text temporal position_ids: [3, 4, 5, 6, 7]
-   *         text height position_ids: [3, 4, 5, 6, 7]
-   *         text width position_ids: [3, 4, 5, 6, 7]
-   *         Here we calculate the text start position_ids as the max vision position_ids plus 1.
-   *
-   * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
-   * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
-   * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
-   * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
-   * - 1 for tokens that are **not masked**,
-   * - 0 for tokens that are **masked**.
-   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
-   * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
-   * - mrope_position_deltas: Tensor of shape `(batch_size)`.
-   */
-  get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
-    const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
-    const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
-    const mrope_position_deltas = [];
-    if (image_grid_thw || video_grid_thw) {
-      let total_input_ids = input_ids.tolist();
-      if (!attention_mask) {
-        attention_mask = ones_like(input_ids);
-      }
-      const attention_mask_list = attention_mask.tolist();
-      const position_ids_list = Array.from(
-        { length: 3 },
-        (_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
-      );
-      const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
-      const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
-      let image_index = 0;
-      let video_index = 0;
-      for (let i = 0; i < total_input_ids.length; ++i) {
-        const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
-        const vision_start_indices = ids.reduce((acc, x, idx) => {
-          if (x == vision_start_token_id) acc.push(idx);
-          return acc;
-        }, []);
-        const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
-        const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
-        const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
-        let llm_pos_ids_list = [];
-        let st2 = 0;
-        let remain_images = image_nums;
-        let remain_videos = video_nums;
-        for (let j = 0; j < vision_tokens.length; ++j) {
-          const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
-          const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
-          const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
-          const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
-          let ed;
-          let t, h, w;
-          if (ed_image < ed_video) {
-            [t, h, w] = image_grid_thw_list[image_index];
-            ++image_index;
-            --remain_images;
-            ed = ed_image;
-          } else {
-            [t, h, w] = video_grid_thw_list[video_index];
-            ++video_index;
-            --remain_videos;
-            ed = ed_video;
-          }
-          const [llm_grid_t, llm_grid_h, llm_grid_w] = [
-            Number(t),
-            Math.floor(Number(h) / spatial_merge_size),
-            Math.floor(Number(w) / spatial_merge_size)
-          ];
-          const text_len = ed - st2;
-          const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
-          llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
-          const offset = text_len + st_idx;
-          const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
-          const t_index = Array.from(
-            { length: grid_size },
-            (_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
-          );
-          const h_index = Array.from(
-            { length: grid_size },
-            (_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
-          );
-          const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
-          llm_pos_ids_list.push([t_index, h_index, w_index].flat());
-          st2 = ed + grid_size;
-        }
-        if (st2 < ids.length) {
-          const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
-          const text_len = ids.length - st2;
-          llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
-        }
-        const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
-        const llm_positions = new Array(num_items);
-        let index = 0;
-        for (let x = 0; x < 3; ++x) {
-          for (let y = 0; y < llm_pos_ids_list.length; ++y) {
-            const val = llm_pos_ids_list[y];
-            const text_len = val.length / 3;
-            for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
-              llm_positions[index++] = val[z];
-            }
-          }
-        }
-        let count2 = 0;
-        const attn_mask = attention_mask_list[i];
-        for (let y = 0; y < attn_mask.length; ++y) {
-          if (attn_mask[y] == 1) {
-            for (let x = 0; x < 3; ++x) {
-              position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
-            }
-            ++count2;
-          }
-        }
-        const max_llm_positions = max(llm_positions)[0];
-        mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
-      }
-      return [
-        new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
-        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
-      ];
-    } else {
-      if (attention_mask) {
-        const { data, dims } = cumsum_masked_fill(attention_mask);
-        const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
-        const mrope_position_deltas2 = Array.from(
-          { length: dims[0] },
-          (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
-        );
-        return [
-          new Tensor2("int64", position_ids, [3, ...dims]),
-          new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
-        ];
-      } else {
-        const [batch_size, seq_length] = input_ids.dims;
-        const position_ids = BigInt64Array.from(
-          { length: 3 * batch_size * seq_length },
-          (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
-        );
-        return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
-      }
-    }
-  }
-  async encode_image({ pixel_values, image_grid_thw }) {
-    const features = (await sessionRun(this.sessions["vision_encoder"], {
-      pixel_values,
-      [this.image_grid_thw_name]: image_grid_thw
-    })).image_features;
-    return features;
-  }
-  _merge_input_ids_with_image_features(kwargs) {
-    return default_merge_input_ids_with_image_features({
-      // @ts-ignore
-      image_token_id: this.config.image_token_id,
-      ...kwargs
-    });
-  }
-  prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
-    if (model_inputs.attention_mask && !model_inputs.position_ids) {
-      if (!model_inputs.past_key_values) {
-        [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
-          model_inputs.input_ids,
-          model_inputs.image_grid_thw,
-          model_inputs.video_grid_thw,
-          model_inputs.attention_mask
-        );
-      } else {
-        model_inputs.pixel_values = null;
-        const past_length = model_inputs.past_key_values.get_seq_length();
-        if (past_length < model_inputs.input_ids.dims[1]) {
-          const [full_position_ids, rope_deltas] = this.get_rope_index(
-            model_inputs.input_ids,
-            model_inputs.image_grid_thw,
-            model_inputs.video_grid_thw,
-            model_inputs.attention_mask
-          );
-          model_inputs.rope_deltas = rope_deltas;
-          model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
-          model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
-        } else {
-          if (!model_inputs.rope_deltas) {
-            [, model_inputs.rope_deltas] = this.get_rope_index(
-              model_inputs.input_ids,
-              model_inputs.image_grid_thw,
-              model_inputs.video_grid_thw,
-              model_inputs.attention_mask
-            );
-          }
-          const delta = BigInt(past_length);
-          const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
-          model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
-        }
-      }
-    }
-    return model_inputs;
-  }
-};
-var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
-};
-// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
-var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
-  image_grid_thw_name = "image_grid_thw";
-};
-var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
-  image_grid_thw_name = "image_grid_thw";
-};
 // src/models/qwen3/modeling_qwen3.js
 var Qwen3PreTrainedModel = class extends PreTrainedModel {
 };
@@ -29217,6 +29499,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
   }
 };
+// src/models/solar_open/modeling_solar_open.js
+var SolarOpenPreTrainedModel = class extends PreTrainedModel {
+};
+var SolarOpenModel = class extends SolarOpenPreTrainedModel {
+};
+var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
+};
 // src/models/speecht5/modeling_speecht5.js
 var SpeechT5PreTrainedModel = class extends PreTrainedModel {
 };
@@ -30333,6 +30623,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
 // src/models/registry.js
 var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
   ["bert", "BertModel"],
+  ["eurobert", "EuroBertModel"],
   ["neobert", "NeoBertModel"],
   ["modernbert", "ModernBertModel"],
   ["nomic_bert", "NomicBertModel"],
@@ -30464,6 +30755,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
   ["gemma3_text", "Gemma3Model"],
   ["helium", "HeliumModel"],
   ["glm", "GlmModel"],
+  ["glm_moe_dsa", "GlmMoeDsaModel"],
   ["openelm", "OpenELMModel"],
   ["qwen2", "Qwen2Model"],
   ["qwen2_moe", "Qwen2MoeModel"],
@@ -30475,12 +30767,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
   ["mpt", "MptModel"],
   ["opt", "OPTModel"],
   ["mistral", "MistralModel"],
+  ["mistral4", "Mistral4Model"],
   ["ministral", "MinistralModel"],
   ["ministral3", "Ministral3Model"],
   ["ernie4_5", "Ernie4_5ForCausalLM"],
   ["starcoder2", "Starcoder2Model"],
+  ["deepseek_v3", "DeepseekV3Model"],
   ["falcon", "FalconModel"],
   ["falcon_h1", "FalconH1Model"],
+  ["nemotron_h", "NemotronHModel"],
+  ["solar_open", "SolarOpenModel"],
   ["stablelm", "StableLmModel"],
   ["modernbert-decoder", "ModernBertDecoderModel"],
   ["hunyuan_v1_dense", "HunYuanDenseV1Model"],
@@ -30500,6 +30796,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
 ]);
 var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
   ["bert", "BertForSequenceClassification"],
+  ["eurobert", "EuroBertForSequenceClassification"],
   ["neobert", "NeoBertForSequenceClassification"],
   ["modernbert", "ModernBertForSequenceClassification"],
   ["roformer", "RoFormerForSequenceClassification"],
@@ -30522,6 +30819,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
 ]);
 var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
   ["bert", "BertForTokenClassification"],
+  ["eurobert", "EuroBertForTokenClassification"],
   ["neobert", "NeoBertForTokenClassification"],
   ["modernbert", "ModernBertForTokenClassification"],
   ["roformer", "RoFormerForTokenClassification"],
@@ -30584,6 +30882,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
   ["gemma3", "Gemma3ForCausalLM"],
   ["helium", "HeliumForCausalLM"],
   ["glm", "GlmForCausalLM"],
+  ["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
   ["openelm", "OpenELMForCausalLM"],
   ["qwen2", "Qwen2ForCausalLM"],
   ["qwen2_moe", "Qwen2MoeForCausalLM"],
@@ -30603,13 +30902,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
   ["opt", "OPTForCausalLM"],
   ["mbart", "MBartForCausalLM"],
   ["mistral", "MistralForCausalLM"],
+  ["mistral4", "Mistral4ForCausalLM"],
   ["ministral", "MinistralForCausalLM"],
   ["ministral3", "Ministral3ForCausalLM"],
   ["ernie4_5", "Ernie4_5ForCausalLM"],
   ["starcoder2", "Starcoder2ForCausalLM"],
+  ["deepseek_v3", "DeepseekV3ForCausalLM"],
   ["falcon", "FalconForCausalLM"],
   ["falcon_h1", "FalconH1ForCausalLM"],
+  ["nemotron_h", "NemotronHForCausalLM"],
   ["trocr", "TrOCRForCausalLM"],
+  ["solar_open", "SolarOpenForCausalLM"],
   ["stablelm", "StableLmForCausalLM"],
   ["modernbert-decoder", "ModernBertDecoderForCausalLM"],
   ["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
@@ -30620,6 +30923,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
 var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
 var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
   ["bert", "BertForMaskedLM"],
+  ["eurobert", "EuroBertForMaskedLM"],
   ["neobert", "NeoBertForMaskedLM"],
   ["modernbert", "ModernBertForMaskedLM"],
   ["roformer", "RoFormerForMaskedLM"],
@@ -30678,7 +30982,9 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
   ["paligemma", "PaliGemmaForConditionalGeneration"],
   ["llava_qwen2", "LlavaQwen2ForCausalLM"],
   ["gemma3n", "Gemma3nForConditionalGeneration"],
-  ["mistral3", "Mistral3ForConditionalGeneration"]
+  ["mistral3", "Mistral3ForConditionalGeneration"],
+  ["lighton_ocr", "LightOnOcrForConditionalGeneration"],
+  ["glm_ocr", "GlmOcrForConditionalGeneration"]
 ]);
 var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
   ["granite_speech", "GraniteSpeechForConditionalGeneration"],
@@ -30783,6 +31089,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
 ]);
 var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
 var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
+  ["chmv2", "CHMv2ForDepthEstimation"],
   ["dpt", "DPTForDepthEstimation"],
   ["depth_anything", "DepthAnythingForDepthEstimation"],
   ["glpn", "GLPNForDepthEstimation"],
@@ -33522,6 +33829,9 @@ var ModelRegistry = class {
   BloomModel,
   BloomPreTrainedModel,
   BloomTokenizer,
+  CHMv2ForDepthEstimation,
+  CHMv2ImageProcessor,
+  CHMv2PreTrainedModel,
   CLIPFeatureExtractor,
   CLIPImageProcessor,
   CLIPModel,
@@ -33617,6 +33927,9 @@ var ModelRegistry = class {
   DebertaV2Tokenizer,
   DecisionTransformerModel,
   DecisionTransformerPreTrainedModel,
+  DeepseekV3ForCausalLM,
+  DeepseekV3Model,
+  DeepseekV3PreTrainedModel,
   DeiTFeatureExtractor,
   DeiTForImageClassification,
   DeiTImageProcessor,
@@ -33677,6 +33990,11 @@ var ModelRegistry = class {
   EsmModel,
   EsmPreTrainedModel,
   EsmTokenizer,
+  EuroBertForMaskedLM,
+  EuroBertForSequenceClassification,
+  EuroBertForTokenClassification,
+  EuroBertModel,
+  EuroBertPreTrainedModel,
   ExaoneForCausalLM,
   ExaoneModel,
   ExaonePreTrainedModel,
@@ -33734,8 +34052,14 @@ var ModelRegistry = class {
   GemmaModel,
   GemmaPreTrainedModel,
   GemmaTokenizer,
+  Glm46VImageProcessor,
+  Glm46VProcessor,
   GlmForCausalLM,
   GlmModel,
+  GlmMoeDsaForCausalLM,
+  GlmMoeDsaModel,
+  GlmMoeDsaPreTrainedModel,
+  GlmOcrForConditionalGeneration,
   GlmPreTrainedModel,
   GptOssForCausalLM,
   GptOssModel,
@@ -33801,6 +34125,7 @@ var ModelRegistry = class {
   Lfm2VlForConditionalGeneration,
   Lfm2VlImageProcessor,
   Lfm2VlProcessor,
+  LightOnOcrForConditionalGeneration,
   LiteWhisperForConditionalGeneration,
   Llama4ForCausalLM,
   Llama4PreTrainedModel,
@@ -33870,6 +34195,9 @@ var ModelRegistry = class {
   MimiPreTrainedModel,
   MinLengthLogitsProcessor,
   MinNewTokensLengthLogitsProcessor,
+  Mistral4ForCausalLM,
+  Mistral4Model,
+  Mistral4PreTrainedModel,
   MistralForCausalLM,
   MistralModel,
   MistralPreTrainedModel,
@@ -33941,6 +34269,9 @@ var ModelRegistry = class {
   NanoChatForCausalLM,
   NanoChatModel,
   NanoChatPreTrainedModel,
+  NemotronHForCausalLM,
+  NemotronHModel,
+  NemotronHPreTrainedModel,
   NeoBertForMaskedLM,
   NeoBertForQuestionAnswering,
   NeoBertForSequenceClassification,
@@ -34130,6 +34461,9 @@ var ModelRegistry = class {
   SnacFeatureExtractor,
   SnacModel,
   SnacPreTrainedModel,
+  SolarOpenForCausalLM,
+  SolarOpenModel,
+  SolarOpenPreTrainedModel,
   SpeechT5FeatureExtractor,
   SpeechT5ForSpeechToText,
   SpeechT5ForTextToSpeech,