@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. package/README.md +13 -2
  2. package/dist/transformers.js +689 -382
  3. package/dist/transformers.min.js +19 -19
  4. package/dist/transformers.node.cjs +716 -382
  5. package/dist/transformers.node.min.cjs +19 -19
  6. package/dist/transformers.node.min.mjs +19 -19
  7. package/dist/transformers.node.mjs +689 -382
  8. package/dist/transformers.web.js +697 -390
  9. package/dist/transformers.web.min.js +17 -17
  10. package/package.json +2 -2
  11. package/src/configs.js +28 -22
  12. package/src/env.js +1 -1
  13. package/src/image_processors_utils.js +25 -15
  14. package/src/models/chmv2/image_processing_chmv2.js +3 -0
  15. package/src/models/chmv2/modeling_chmv2.js +4 -0
  16. package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
  17. package/src/models/eurobert/modeling_eurobert.js +41 -0
  18. package/src/models/glm46v/image_processing_glm46v.js +12 -0
  19. package/src/models/glm46v/processing_glm46v.js +5 -0
  20. package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
  21. package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
  22. package/src/models/image_processors.js +2 -0
  23. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
  24. package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
  25. package/src/models/mistral4/modeling_mistral4.js +5 -0
  26. package/src/models/modeling_utils.js +2 -0
  27. package/src/models/models.js +10 -1
  28. package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
  29. package/src/models/processors.js +1 -0
  30. package/src/models/qwen2_vl/modeling_qwen2_vl.js +187 -136
  31. package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
  32. package/src/models/registry.js +17 -0
  33. package/src/models/solar_open/modeling_solar_open.js +5 -0
  34. package/src/pipelines.js +1 -0
  35. package/src/utils/hub.js +4 -1
  36. package/src/utils/model_registry/get_file_metadata.js +1 -0
  37. package/types/configs.d.ts.map +1 -1
  38. package/types/image_processors_utils.d.ts +3 -2
  39. package/types/image_processors_utils.d.ts.map +1 -1
  40. package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
  41. package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
  42. package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
  43. package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
  44. package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
  45. package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
  46. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
  47. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
  48. package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
  49. package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
  50. package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
  51. package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
  52. package/types/models/glm46v/processing_glm46v.d.ts +4 -0
  53. package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
  54. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
  55. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
  56. package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
  57. package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
  58. package/types/models/image_processors.d.ts +2 -0
  59. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
  60. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
  61. package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
  62. package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
  63. package/types/models/modeling_utils.d.ts.map +1 -1
  64. package/types/models/models.d.ts +10 -1
  65. package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
  66. package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
  67. package/types/models/processors.d.ts +1 -0
  68. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
  69. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  70. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
  71. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  72. package/types/models/registry.d.ts.map +1 -1
  73. package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
  74. package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
  75. package/types/pipelines.d.ts +1 -0
  76. package/types/pipelines.d.ts.map +1 -1
  77. package/types/utils/hub.d.ts.map +1 -1
  78. package/types/models/ast/modeling_ast.d.ts.map +0 -1
  79. /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
@@ -117,6 +117,9 @@ __export(transformers_exports, {
117
117
  BloomModel: () => BloomModel,
118
118
  BloomPreTrainedModel: () => BloomPreTrainedModel,
119
119
  BloomTokenizer: () => BloomTokenizer,
120
+ CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
121
+ CHMv2ImageProcessor: () => CHMv2ImageProcessor,
122
+ CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
120
123
  CLIPFeatureExtractor: () => CLIPFeatureExtractor,
121
124
  CLIPImageProcessor: () => CLIPImageProcessor,
122
125
  CLIPModel: () => CLIPModel,
@@ -212,6 +215,9 @@ __export(transformers_exports, {
212
215
  DebertaV2Tokenizer: () => DebertaV2Tokenizer,
213
216
  DecisionTransformerModel: () => DecisionTransformerModel,
214
217
  DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
218
+ DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
219
+ DeepseekV3Model: () => DeepseekV3Model,
220
+ DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
215
221
  DeiTFeatureExtractor: () => DeiTFeatureExtractor,
216
222
  DeiTForImageClassification: () => DeiTForImageClassification,
217
223
  DeiTImageProcessor: () => DeiTImageProcessor,
@@ -272,6 +278,11 @@ __export(transformers_exports, {
272
278
  EsmModel: () => EsmModel,
273
279
  EsmPreTrainedModel: () => EsmPreTrainedModel,
274
280
  EsmTokenizer: () => EsmTokenizer,
281
+ EuroBertForMaskedLM: () => EuroBertForMaskedLM,
282
+ EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
283
+ EuroBertForTokenClassification: () => EuroBertForTokenClassification,
284
+ EuroBertModel: () => EuroBertModel,
285
+ EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
275
286
  ExaoneForCausalLM: () => ExaoneForCausalLM,
276
287
  ExaoneModel: () => ExaoneModel,
277
288
  ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -329,8 +340,14 @@ __export(transformers_exports, {
329
340
  GemmaModel: () => GemmaModel,
330
341
  GemmaPreTrainedModel: () => GemmaPreTrainedModel,
331
342
  GemmaTokenizer: () => GemmaTokenizer,
343
+ Glm46VImageProcessor: () => Glm46VImageProcessor,
344
+ Glm46VProcessor: () => Glm46VProcessor,
332
345
  GlmForCausalLM: () => GlmForCausalLM,
333
346
  GlmModel: () => GlmModel,
347
+ GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
348
+ GlmMoeDsaModel: () => GlmMoeDsaModel,
349
+ GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
350
+ GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
334
351
  GlmPreTrainedModel: () => GlmPreTrainedModel,
335
352
  GptOssForCausalLM: () => GptOssForCausalLM,
336
353
  GptOssModel: () => GptOssModel,
@@ -396,6 +413,7 @@ __export(transformers_exports, {
396
413
  Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
397
414
  Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
398
415
  Lfm2VlProcessor: () => Lfm2VlProcessor,
416
+ LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
399
417
  LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
400
418
  Llama4ForCausalLM: () => Llama4ForCausalLM,
401
419
  Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -465,6 +483,9 @@ __export(transformers_exports, {
465
483
  MimiPreTrainedModel: () => MimiPreTrainedModel,
466
484
  MinLengthLogitsProcessor: () => MinLengthLogitsProcessor,
467
485
  MinNewTokensLengthLogitsProcessor: () => MinNewTokensLengthLogitsProcessor,
486
+ Mistral4ForCausalLM: () => Mistral4ForCausalLM,
487
+ Mistral4Model: () => Mistral4Model,
488
+ Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
468
489
  MistralForCausalLM: () => MistralForCausalLM,
469
490
  MistralModel: () => MistralModel,
470
491
  MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -536,6 +557,9 @@ __export(transformers_exports, {
536
557
  NanoChatForCausalLM: () => NanoChatForCausalLM,
537
558
  NanoChatModel: () => NanoChatModel,
538
559
  NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
560
+ NemotronHForCausalLM: () => NemotronHForCausalLM,
561
+ NemotronHModel: () => NemotronHModel,
562
+ NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
539
563
  NeoBertForMaskedLM: () => NeoBertForMaskedLM,
540
564
  NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
541
565
  NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -725,6 +749,9 @@ __export(transformers_exports, {
725
749
  SnacFeatureExtractor: () => SnacFeatureExtractor,
726
750
  SnacModel: () => SnacModel,
727
751
  SnacPreTrainedModel: () => SnacPreTrainedModel,
752
+ SolarOpenForCausalLM: () => SolarOpenForCausalLM,
753
+ SolarOpenModel: () => SolarOpenModel,
754
+ SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
728
755
  SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
729
756
  SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
730
757
  SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
@@ -925,7 +952,7 @@ var import_node_fs = __toESM(require("fs"), 1);
925
952
  var import_node_path = __toESM(require("path"), 1);
926
953
  var import_node_url = __toESM(require("url"), 1);
927
954
  var import_meta = {};
928
- var VERSION = "4.0.0-next.7";
955
+ var VERSION = "4.0.0-next.8";
929
956
  var HAS_SELF = typeof self !== "undefined";
930
957
  var IS_FS_AVAILABLE = !isEmpty(import_node_fs.default);
931
958
  var IS_PATH_AVAILABLE = !isEmpty(import_node_path.default);
@@ -1155,7 +1182,7 @@ var logger = {
1155
1182
  }
1156
1183
  };
1157
1184
 
1158
- // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.2/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
1185
+ // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
1159
1186
  var DictionarySplitter = class {
1160
1187
  /**
1161
1188
  * @param dictionary The dictionary of words to use for splitting.
@@ -2811,10 +2838,10 @@ var BPE = class extends TokenizerModel_default {
2811
2838
  );
2812
2839
  if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
2813
2840
  output_tokens.push(...byte_tokens);
2814
- } else {
2841
+ } else if (this.unk_token != null) {
2815
2842
  output_tokens.push(this.unk_token);
2816
2843
  }
2817
- } else {
2844
+ } else if (this.unk_token != null) {
2818
2845
  output_tokens.push(this.unk_token);
2819
2846
  }
2820
2847
  }
@@ -7426,13 +7453,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
7426
7453
  wrapped_progress
7427
7454
  );
7428
7455
  } else if (typeof response !== "string") {
7456
+ const headers = new Headers(response.headers);
7457
+ headers.set("content-length", result.byteLength.toString());
7429
7458
  await cache2.put(
7430
7459
  cacheKey,
7431
7460
  new Response(
7432
7461
  /** @type {any} */
7433
7462
  result,
7434
7463
  {
7435
- headers: response.headers
7464
+ headers
7436
7465
  }
7437
7466
  )
7438
7467
  ).catch((err) => {
@@ -16643,6 +16672,7 @@ __export(processors_exports, {
16643
16672
  ChatterboxProcessor: () => ChatterboxProcessor,
16644
16673
  Florence2Processor: () => Florence2Processor,
16645
16674
  Gemma3nProcessor: () => Gemma3nProcessor,
16675
+ Glm46VProcessor: () => Glm46VProcessor,
16646
16676
  GraniteSpeechProcessor: () => GraniteSpeechProcessor,
16647
16677
  GroundingDinoProcessor: () => GroundingDinoProcessor,
16648
16678
  Idefics3Processor: () => Idefics3Processor,
@@ -19147,26 +19177,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
19147
19177
  }
19148
19178
  return [segmentation, segments];
19149
19179
  }
19150
- function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
19180
+ function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
19151
19181
  if (height < factor || width < factor) {
19152
- throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
19153
- } else if (Math.max(height, width) / Math.min(height, width) > 200) {
19182
+ const scale = Math.max(factor / height, factor / width);
19183
+ height = Math.round(height * scale);
19184
+ width = Math.round(width * scale);
19185
+ }
19186
+ if (Math.max(height, width) / Math.min(height, width) > 200) {
19154
19187
  throw new Error(
19155
19188
  `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
19156
19189
  );
19157
19190
  }
19158
19191
  let h_bar = Math.round(height / factor) * factor;
19159
19192
  let w_bar = Math.round(width / factor) * factor;
19160
- if (h_bar * w_bar > max_pixels) {
19161
- const beta = Math.sqrt(height * width / max_pixels);
19162
- h_bar = Math.floor(height / beta / factor) * factor;
19163
- w_bar = Math.floor(width / beta / factor) * factor;
19164
- } else if (h_bar * w_bar < min_pixels) {
19165
- const beta = Math.sqrt(min_pixels / (height * width));
19193
+ if (temporal_factor * h_bar * w_bar > max_pixels) {
19194
+ const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
19195
+ h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
19196
+ w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
19197
+ } else if (temporal_factor * h_bar * w_bar < min_pixels) {
19198
+ const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
19166
19199
  h_bar = Math.ceil(height * beta / factor) * factor;
19167
19200
  w_bar = Math.ceil(width * beta / factor) * factor;
19168
19201
  }
19169
- return [h_bar, w_bar];
19202
+ return [w_bar, h_bar];
19170
19203
  }
19171
19204
  function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
19172
19205
  if (label_ids_to_fuse === null) {
@@ -19245,7 +19278,7 @@ var ImageProcessor = class extends Callable2 {
19245
19278
  this.do_pad = config.do_pad;
19246
19279
  this.min_pixels = config.min_pixels;
19247
19280
  this.max_pixels = config.max_pixels;
19248
- if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
19281
+ if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
19249
19282
  this.pad_size = this.size;
19250
19283
  }
19251
19284
  this.do_flip_channel_order = config.do_flip_channel_order ?? false;
@@ -19533,10 +19566,8 @@ var ImageProcessor = class extends Callable2 {
19533
19566
  const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
19534
19567
  [pixelData, imgDims] = padded;
19535
19568
  } else if (this.size_divisibility) {
19536
- const [paddedWidth, paddedHeight] = enforce_size_divisibility(
19537
- [imgDims[1], imgDims[0]],
19538
- this.size_divisibility
19539
- );
19569
+ const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
19570
+ const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
19540
19571
  [pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
19541
19572
  }
19542
19573
  }
@@ -19613,6 +19644,7 @@ var image_processors_exports = {};
19613
19644
  __export(image_processors_exports, {
19614
19645
  BeitFeatureExtractor: () => BeitFeatureExtractor,
19615
19646
  BitImageProcessor: () => BitImageProcessor,
19647
+ CHMv2ImageProcessor: () => CHMv2ImageProcessor,
19616
19648
  CLIPFeatureExtractor: () => CLIPFeatureExtractor,
19617
19649
  CLIPImageProcessor: () => CLIPImageProcessor,
19618
19650
  ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
@@ -19629,6 +19661,7 @@ __export(image_processors_exports, {
19629
19661
  DonutImageProcessor: () => DonutImageProcessor,
19630
19662
  EfficientNetImageProcessor: () => EfficientNetImageProcessor,
19631
19663
  GLPNFeatureExtractor: () => GLPNFeatureExtractor,
19664
+ Glm46VImageProcessor: () => Glm46VImageProcessor,
19632
19665
  GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
19633
19666
  Idefics3ImageProcessor: () => Idefics3ImageProcessor,
19634
19667
  ImageFeatureExtractor: () => ImageProcessor,
@@ -19689,6 +19722,10 @@ var BitImageProcessor = class extends ImageProcessor {
19689
19722
  var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
19690
19723
  };
19691
19724
 
19725
+ // src/models/chmv2/image_processing_chmv2.js
19726
+ var CHMv2ImageProcessor = class extends ImageProcessor {
19727
+ };
19728
+
19692
19729
  // src/models/clip/image_processing_clip.js
19693
19730
  var CLIPImageProcessor = class extends ImageProcessor {
19694
19731
  };
@@ -19808,6 +19845,65 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
19808
19845
  }
19809
19846
  };
19810
19847
 
19848
+ // src/models/qwen2_vl/image_processing_qwen2_vl.js
19849
+ var Qwen2VLImageProcessor = class extends ImageProcessor {
19850
+ constructor(config) {
19851
+ super(config);
19852
+ this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
19853
+ this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
19854
+ this.patch_size = config.patch_size;
19855
+ this.merge_size = config.merge_size;
19856
+ }
19857
+ /** @type {ImageProcessor['get_resize_output_image_size']} */
19858
+ get_resize_output_image_size(image, size) {
19859
+ const factor = this.patch_size * this.merge_size;
19860
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
19861
+ }
19862
+ async _call(images, ...args) {
19863
+ const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
19864
+ let patches = pixel_values;
19865
+ const { temporal_patch_size, merge_size, patch_size } = this.config;
19866
+ if (patches.dims[0] === 1) {
19867
+ patches = cat(
19868
+ Array.from({ length: temporal_patch_size }, () => patches),
19869
+ 0
19870
+ );
19871
+ }
19872
+ const grid_t = patches.dims[0] / temporal_patch_size;
19873
+ const channel = patches.dims[1];
19874
+ const grid_h = Math.floor(patches.dims[2] / patch_size);
19875
+ const grid_w = Math.floor(patches.dims[3] / patch_size);
19876
+ const flatten_patches = patches.view(
19877
+ grid_t,
19878
+ temporal_patch_size,
19879
+ channel,
19880
+ Math.floor(grid_h / merge_size),
19881
+ merge_size,
19882
+ patch_size,
19883
+ Math.floor(grid_w / merge_size),
19884
+ merge_size,
19885
+ patch_size
19886
+ ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
19887
+ const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
19888
+ return {
19889
+ pixel_values: flatten_patches,
19890
+ image_grid_thw,
19891
+ original_sizes,
19892
+ reshaped_input_sizes
19893
+ };
19894
+ }
19895
+ };
19896
+
19897
+ // src/models/glm46v/image_processing_glm46v.js
19898
+ var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
19899
+ /** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
19900
+ get_resize_output_image_size(image, size) {
19901
+ const factor = this.patch_size * this.merge_size;
19902
+ const temporal_factor = this.config.temporal_patch_size ?? 2;
19903
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
19904
+ }
19905
+ };
19906
+
19811
19907
  // src/models/glpn/image_processing_glpn.js
19812
19908
  var GLPNFeatureExtractor = class extends ImageProcessor {
19813
19909
  };
@@ -20201,7 +20297,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
20201
20297
  const img = pixel_values.unsqueeze_(0);
20202
20298
  const total_factor = this.encoder_patch_size * this.downsample_factor;
20203
20299
  const f2 = total_factor ** 2;
20204
- const [new_height, new_width] = smart_resize(
20300
+ const [new_width, new_height] = smart_resize(
20205
20301
  Math.max(total_factor, height),
20206
20302
  Math.max(total_factor, width),
20207
20303
  total_factor,
@@ -20491,55 +20587,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
20491
20587
  var PvtImageProcessor = class extends ImageProcessor {
20492
20588
  };
20493
20589
 
20494
- // src/models/qwen2_vl/image_processing_qwen2_vl.js
20495
- var Qwen2VLImageProcessor = class extends ImageProcessor {
20496
- constructor(config) {
20497
- super(config);
20498
- this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
20499
- this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
20500
- this.patch_size = config.patch_size;
20501
- this.merge_size = config.merge_size;
20502
- }
20503
- /** @type {ImageProcessor['get_resize_output_image_size']} */
20504
- get_resize_output_image_size(image, size) {
20505
- const factor = this.patch_size * this.merge_size;
20506
- return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
20507
- }
20508
- async _call(images, ...args) {
20509
- const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
20510
- let patches = pixel_values;
20511
- const { temporal_patch_size, merge_size, patch_size } = this.config;
20512
- if (patches.dims[0] === 1) {
20513
- patches = cat(
20514
- Array.from({ length: temporal_patch_size }, () => patches),
20515
- 0
20516
- );
20517
- }
20518
- const grid_t = patches.dims[0] / temporal_patch_size;
20519
- const channel = patches.dims[1];
20520
- const grid_h = Math.floor(patches.dims[2] / patch_size);
20521
- const grid_w = Math.floor(patches.dims[3] / patch_size);
20522
- const flatten_patches = patches.view(
20523
- grid_t,
20524
- temporal_patch_size,
20525
- channel,
20526
- Math.floor(grid_h / merge_size),
20527
- merge_size,
20528
- patch_size,
20529
- Math.floor(grid_w / merge_size),
20530
- merge_size,
20531
- patch_size
20532
- ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
20533
- const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
20534
- return {
20535
- pixel_values: flatten_patches,
20536
- image_grid_thw,
20537
- original_sizes,
20538
- reshaped_input_sizes
20539
- };
20540
- }
20541
- };
20542
-
20543
20590
  // src/models/rt_detr/image_processing_rt_detr.js
20544
20591
  var RTDetrImageProcessor = class extends ImageProcessor {
20545
20592
  /** @type {typeof post_process_object_detection} */
@@ -21093,6 +21140,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
21093
21140
  }
21094
21141
  };
21095
21142
 
21143
+ // src/models/qwen2_vl/processing_qwen2_vl.js
21144
+ var Qwen2VLProcessor = class extends Processor {
21145
+ static image_processor_class = AutoImageProcessor;
21146
+ static tokenizer_class = AutoTokenizer;
21147
+ static image_token = "<|image_pad|>";
21148
+ /**
21149
+ *
21150
+ * @param {string|string[]} text
21151
+ * @param {RawImage|RawImage[]} images
21152
+ * @param {...any} args
21153
+ * @returns {Promise<any>}
21154
+ */
21155
+ async _call(text, images = null, ...args) {
21156
+ if (!Array.isArray(text)) {
21157
+ text = [text];
21158
+ }
21159
+ let image_inputs, image_grid_thw;
21160
+ if (images) {
21161
+ image_inputs = await this.image_processor(images);
21162
+ image_grid_thw = image_inputs.image_grid_thw;
21163
+ }
21164
+ if (image_grid_thw) {
21165
+ let merge_length = this.image_processor.config.merge_size ** 2;
21166
+ let index = 0;
21167
+ const image_token = (
21168
+ /** @type {typeof Qwen2VLProcessor} */
21169
+ this.constructor.image_token
21170
+ );
21171
+ const image_grid_thw_list = image_grid_thw.tolist();
21172
+ text = text.map((t) => {
21173
+ while (t.includes(image_token)) {
21174
+ const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
21175
+ t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
21176
+ }
21177
+ return t.replaceAll("<|placeholder|>", image_token);
21178
+ });
21179
+ }
21180
+ const text_inputs = this.tokenizer(text);
21181
+ return {
21182
+ ...text_inputs,
21183
+ ...image_inputs
21184
+ };
21185
+ }
21186
+ };
21187
+
21188
+ // src/models/glm46v/processing_glm46v.js
21189
+ var Glm46VProcessor = class extends Qwen2VLProcessor {
21190
+ static image_token = "<|image|>";
21191
+ };
21192
+
21096
21193
  // src/models/granite_speech/processing_granite_speech.js
21097
21194
  var GraniteSpeechProcessor = class extends Processor {
21098
21195
  static tokenizer_class = AutoTokenizer;
@@ -21823,47 +21920,6 @@ var PyAnnoteProcessor = class extends Processor {
21823
21920
  }
21824
21921
  };
21825
21922
 
21826
- // src/models/qwen2_vl/processing_qwen2_vl.js
21827
- var Qwen2VLProcessor = class extends Processor {
21828
- static image_processor_class = AutoImageProcessor;
21829
- static tokenizer_class = AutoTokenizer;
21830
- /**
21831
- *
21832
- * @param {string|string[]} text
21833
- * @param {RawImage|RawImage[]} images
21834
- * @param {...any} args
21835
- * @returns {Promise<any>}
21836
- */
21837
- async _call(text, images = null, ...args) {
21838
- if (!Array.isArray(text)) {
21839
- text = [text];
21840
- }
21841
- let image_inputs, image_grid_thw;
21842
- if (images) {
21843
- image_inputs = await this.image_processor(images);
21844
- image_grid_thw = image_inputs.image_grid_thw;
21845
- }
21846
- if (image_grid_thw) {
21847
- let merge_length = this.image_processor.config.merge_size ** 2;
21848
- let index = 0;
21849
- const image_grid_thw_list = image_grid_thw.tolist();
21850
- text = text.map((t) => {
21851
- while (t.includes("<|image_pad|>")) {
21852
- const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
21853
- t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
21854
- }
21855
- return t.replaceAll("<|placeholder|>", "<|image_pad|>");
21856
- });
21857
- }
21858
- const text_inputs = this.tokenizer(text);
21859
- return {
21860
- ...text_inputs,
21861
- ...image_inputs
21862
- // TODO: ...videos_inputs,
21863
- };
21864
- }
21865
- };
21866
-
21867
21923
  // src/models/qwen2_5_vl/processing_qwen2_5_vl.js
21868
21924
  var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
21869
21925
  };
@@ -22207,6 +22263,8 @@ function getNormalizedConfig(config) {
22207
22263
  case "gemma3n":
22208
22264
  case "lfm2_vl":
22209
22265
  case "chatterbox":
22266
+ case "lighton_ocr":
22267
+ case "glm_ocr":
22210
22268
  case "mistral3":
22211
22269
  case "qwen2_5_vl":
22212
22270
  case "qwen3_vl":
@@ -22282,6 +22340,8 @@ function getNormalizedConfig(config) {
22282
22340
  mapping["dim_kv"] = "head_dim";
22283
22341
  break;
22284
22342
  case "qwen3":
22343
+ case "solar_open":
22344
+ case "glm_ocr_text":
22285
22345
  case "gemma":
22286
22346
  case "gemma2":
22287
22347
  case "vaultgemma":
@@ -22292,6 +22352,7 @@ function getNormalizedConfig(config) {
22292
22352
  case "ernie4_5":
22293
22353
  case "hunyuan_v1_dense":
22294
22354
  case "falcon_h1":
22355
+ case "nemotron_h":
22295
22356
  case "ministral":
22296
22357
  case "ministral3":
22297
22358
  mapping["num_heads"] = "num_key_value_heads";
@@ -22326,6 +22387,9 @@ function getNormalizedConfig(config) {
22326
22387
  mapping["num_attention_heads"] = "num_attention_heads";
22327
22388
  break;
22328
22389
  case "youtu":
22390
+ case "deepseek_v3":
22391
+ case "glm_moe_dsa":
22392
+ case "mistral4":
22329
22393
  mapping["num_heads"] = "num_key_value_heads";
22330
22394
  mapping["num_layers"] = "num_hidden_layers";
22331
22395
  mapping["dim_kv"] = "qk_head_dim";
@@ -22414,6 +22478,7 @@ function getCacheShapes(config, options) {
22414
22478
  if (!(config instanceof PretrainedConfig)) {
22415
22479
  config = new PretrainedConfig(config);
22416
22480
  }
22481
+ const batch_size = options?.batch_size ?? 1;
22417
22482
  if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
22418
22483
  const pkv_prefix = options?.prefix ?? "past_key_values";
22419
22484
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
@@ -22423,7 +22488,6 @@ function getCacheShapes(config, options) {
22423
22488
  config
22424
22489
  );
22425
22490
  const head_dim = hidden_size / num_attention_heads;
22426
- const batch_size = options?.batch_size ?? 1;
22427
22491
  for (let i = 0; i < layer_types.length; ++i) {
22428
22492
  if (layer_types[i] === "full_attention") {
22429
22493
  for (const kv of ["key", "value"]) {
@@ -22436,31 +22500,26 @@ function getCacheShapes(config, options) {
22436
22500
  }
22437
22501
  }
22438
22502
  return cache_values;
22439
- } else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
22503
+ } else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
22440
22504
  const pkv_prefix = options?.prefix ?? "past_key_values";
22441
22505
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
22442
- const cache_values = {};
22443
- const {
22444
- layer_types,
22445
- num_hidden_layers,
22446
- num_attention_heads,
22447
- num_key_value_heads,
22448
- hidden_size,
22449
- mamba_d_conv,
22450
- mamba_n_heads,
22451
- mamba_d_head,
22452
- mamba_d_state,
22453
- mamba_n_groups,
22454
- mamba_expand,
22455
- mamba_d_ssm
22456
- } = (
22506
+ const c = (
22457
22507
  /** @type {any} */
22458
22508
  config
22459
22509
  );
22460
- const head_dim = hidden_size / num_attention_heads;
22461
- const batch_size = options?.batch_size ?? 1;
22462
- const conv_d_inner = (mamba_d_ssm ?? mamba_expand * hidden_size) + 2 * mamba_n_groups * mamba_d_state;
22463
- for (let i = 0; i < num_hidden_layers; ++i) {
22510
+ const layer_types = c.layer_types ?? c.layers_block_type;
22511
+ const num_layers = c.num_hidden_layers ?? layer_types?.length;
22512
+ const num_key_value_heads = c.num_key_value_heads;
22513
+ const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
22514
+ const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
22515
+ const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
22516
+ const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
22517
+ const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
22518
+ const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
22519
+ const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
22520
+ const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
22521
+ const cache_values = {};
22522
+ for (let i = 0; i < num_layers; ++i) {
22464
22523
  if (!layer_types || layer_types[i] === "mamba") {
22465
22524
  cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
22466
22525
  cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
@@ -22494,7 +22553,6 @@ function getCacheShapes(config, options) {
22494
22553
  const key_dim = linear_key_head_dim * linear_num_key_heads;
22495
22554
  const value_dim = linear_value_head_dim * linear_num_value_heads;
22496
22555
  const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
22497
- const batch_size = options?.batch_size ?? 1;
22498
22556
  for (let i = 0; i < layer_types.length; ++i) {
22499
22557
  if (layer_types[i] === "full_attention") {
22500
22558
  for (const kv of ["key", "value"]) {
@@ -25122,7 +25180,9 @@ async function generic_text_to_text_forward(self2, {
25122
25180
  "qwen3_5",
25123
25181
  "qwen3_5_text",
25124
25182
  "qwen3_5_moe",
25125
- "qwen3_5_moe_text"
25183
+ "qwen3_5_moe_text",
25184
+ "glm_ocr",
25185
+ "glm_ocr_text"
25126
25186
  ].includes(self2.config.model_type)
25127
25187
  ) {
25128
25188
  const { image_grid_thw, video_grid_thw } = kwargs;
@@ -25346,6 +25406,8 @@ __export(models_exports, {
25346
25406
  BloomForCausalLM: () => BloomForCausalLM,
25347
25407
  BloomModel: () => BloomModel,
25348
25408
  BloomPreTrainedModel: () => BloomPreTrainedModel,
25409
+ CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
25410
+ CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
25349
25411
  CLIPModel: () => CLIPModel,
25350
25412
  CLIPPreTrainedModel: () => CLIPPreTrainedModel,
25351
25413
  CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
@@ -25420,6 +25482,9 @@ __export(models_exports, {
25420
25482
  DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
25421
25483
  DecisionTransformerModel: () => DecisionTransformerModel,
25422
25484
  DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
25485
+ DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
25486
+ DeepseekV3Model: () => DeepseekV3Model,
25487
+ DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
25423
25488
  DeiTForImageClassification: () => DeiTForImageClassification,
25424
25489
  DeiTModel: () => DeiTModel,
25425
25490
  DeiTPreTrainedModel: () => DeiTPreTrainedModel,
@@ -25465,6 +25530,11 @@ __export(models_exports, {
25465
25530
  EsmForTokenClassification: () => EsmForTokenClassification,
25466
25531
  EsmModel: () => EsmModel,
25467
25532
  EsmPreTrainedModel: () => EsmPreTrainedModel,
25533
+ EuroBertForMaskedLM: () => EuroBertForMaskedLM,
25534
+ EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
25535
+ EuroBertForTokenClassification: () => EuroBertForTokenClassification,
25536
+ EuroBertModel: () => EuroBertModel,
25537
+ EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
25468
25538
  ExaoneForCausalLM: () => ExaoneForCausalLM,
25469
25539
  ExaoneModel: () => ExaoneModel,
25470
25540
  ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -25511,6 +25581,10 @@ __export(models_exports, {
25511
25581
  GemmaPreTrainedModel: () => GemmaPreTrainedModel,
25512
25582
  GlmForCausalLM: () => GlmForCausalLM,
25513
25583
  GlmModel: () => GlmModel,
25584
+ GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
25585
+ GlmMoeDsaModel: () => GlmMoeDsaModel,
25586
+ GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
25587
+ GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
25514
25588
  GlmPreTrainedModel: () => GlmPreTrainedModel,
25515
25589
  GptOssForCausalLM: () => GptOssForCausalLM,
25516
25590
  GptOssModel: () => GptOssModel,
@@ -25557,6 +25631,7 @@ __export(models_exports, {
25557
25631
  Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
25558
25632
  Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
25559
25633
  Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
25634
+ LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
25560
25635
  LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
25561
25636
  Llama4ForCausalLM: () => Llama4ForCausalLM,
25562
25637
  Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -25606,6 +25681,9 @@ __export(models_exports, {
25606
25681
  MimiEncoderOutput: () => MimiEncoderOutput,
25607
25682
  MimiModel: () => MimiModel,
25608
25683
  MimiPreTrainedModel: () => MimiPreTrainedModel,
25684
+ Mistral4ForCausalLM: () => Mistral4ForCausalLM,
25685
+ Mistral4Model: () => Mistral4Model,
25686
+ Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
25609
25687
  MistralForCausalLM: () => MistralForCausalLM,
25610
25688
  MistralModel: () => MistralModel,
25611
25689
  MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -25663,6 +25741,9 @@ __export(models_exports, {
25663
25741
  NanoChatForCausalLM: () => NanoChatForCausalLM,
25664
25742
  NanoChatModel: () => NanoChatModel,
25665
25743
  NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
25744
+ NemotronHForCausalLM: () => NemotronHForCausalLM,
25745
+ NemotronHModel: () => NemotronHModel,
25746
+ NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
25666
25747
  NeoBertForMaskedLM: () => NeoBertForMaskedLM,
25667
25748
  NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
25668
25749
  NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -25800,6 +25881,9 @@ __export(models_exports, {
25800
25881
  SnacEncoderModel: () => SnacEncoderModel,
25801
25882
  SnacModel: () => SnacModel,
25802
25883
  SnacPreTrainedModel: () => SnacPreTrainedModel,
25884
+ SolarOpenForCausalLM: () => SolarOpenForCausalLM,
25885
+ SolarOpenModel: () => SolarOpenModel,
25886
+ SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
25803
25887
  SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
25804
25888
  SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
25805
25889
  SpeechT5HifiGan: () => SpeechT5HifiGan,
@@ -25974,7 +26058,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
25974
26058
  var ArceeForCausalLM = class extends ArceePreTrainedModel {
25975
26059
  };
25976
26060
 
25977
- // src/models/ast/modeling_ast.js
26061
+ // src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
25978
26062
  var ASTPreTrainedModel = class extends PreTrainedModel {
25979
26063
  };
25980
26064
  var ASTModel = class extends ASTPreTrainedModel {
@@ -26309,6 +26393,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
26309
26393
  var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
26310
26394
  };
26311
26395
 
26396
+ // src/models/chmv2/modeling_chmv2.js
26397
+ var CHMv2PreTrainedModel = class extends PreTrainedModel {
26398
+ };
26399
+ var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
26400
+ };
26401
+
26312
26402
  // src/models/clap/modeling_clap.js
26313
26403
  var ClapPreTrainedModel = class extends PreTrainedModel {
26314
26404
  };
@@ -26647,6 +26737,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
26647
26737
  }
26648
26738
  };
26649
26739
 
26740
+ // src/models/deepseek_v3/modeling_deepseek_v3.js
26741
+ var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
26742
+ };
26743
+ var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
26744
+ };
26745
+ var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
26746
+ };
26747
+
26650
26748
  // src/models/deberta_v2/modeling_deberta_v2.js
26651
26749
  var DebertaV2PreTrainedModel = class extends PreTrainedModel {
26652
26750
  };
@@ -26995,6 +27093,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
26995
27093
  }
26996
27094
  };
26997
27095
 
27096
+ // src/models/eurobert/modeling_eurobert.js
27097
+ var EuroBertPreTrainedModel = class extends PreTrainedModel {
27098
+ };
27099
+ var EuroBertModel = class extends EuroBertPreTrainedModel {
27100
+ };
27101
+ var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
27102
+ /**
27103
+ * Calls the model on new inputs.
27104
+ *
27105
+ * @param {Object} model_inputs The inputs to the model.
27106
+ * @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
27107
+ */
27108
+ async _call(model_inputs) {
27109
+ return new MaskedLMOutput(await super._call(model_inputs));
27110
+ }
27111
+ };
27112
+ var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
27113
+ /**
27114
+ * Calls the model on new inputs.
27115
+ *
27116
+ * @param {Object} model_inputs The inputs to the model.
27117
+ * @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
27118
+ */
27119
+ async _call(model_inputs) {
27120
+ return new SequenceClassifierOutput(await super._call(model_inputs));
27121
+ }
27122
+ };
27123
+ var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
27124
+ /**
27125
+ * Calls the model on new inputs.
27126
+ *
27127
+ * @param {Object} model_inputs The inputs to the model.
27128
+ * @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
27129
+ */
27130
+ async _call(model_inputs) {
27131
+ return new TokenClassifierOutput(await super._call(model_inputs));
27132
+ }
27133
+ };
27134
+
26998
27135
  // src/models/exaone/modeling_exaone.js
26999
27136
  var ExaonePreTrainedModel = class extends PreTrainedModel {
27000
27137
  };
@@ -27270,6 +27407,377 @@ var GlmModel = class extends GlmPreTrainedModel {
27270
27407
  var GlmForCausalLM = class extends GlmPreTrainedModel {
27271
27408
  };
27272
27409
 
27410
+ // src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
27411
+ var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
27412
+ };
27413
+ var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
27414
+ };
27415
+ var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
27416
+ };
27417
+
27418
+ // src/models/qwen2_vl/modeling_qwen2_vl.js
27419
+ var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
27420
+ forward_params = [
27421
+ // Text inputs
27422
+ "input_ids",
27423
+ "attention_mask",
27424
+ "position_ids",
27425
+ "past_key_values",
27426
+ // Vision inputs
27427
+ "pixel_values",
27428
+ "image_grid_thw"
27429
+ ];
27430
+ };
27431
+ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
27432
+ // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
27433
+ // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
27434
+ // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
27435
+ image_grid_thw_name = "grid_thw";
27436
+ /**
27437
+ * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
27438
+ * @param {Tensor} input_ids
27439
+ * @param {Tensor} attention_mask
27440
+ * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
27441
+ */
27442
+ _get_text_only_rope_index(input_ids, attention_mask) {
27443
+ if (attention_mask) {
27444
+ const { data, dims } = cumsum_masked_fill(attention_mask);
27445
+ const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
27446
+ const mrope_position_deltas = Array.from(
27447
+ { length: dims[0] },
27448
+ (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
27449
+ );
27450
+ return [
27451
+ new Tensor2("int64", position_ids, [3, ...dims]),
27452
+ new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
27453
+ ];
27454
+ } else {
27455
+ const [batch_size, seq_length] = input_ids.dims;
27456
+ const position_ids = BigInt64Array.from(
27457
+ { length: 3 * batch_size * seq_length },
27458
+ (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
27459
+ );
27460
+ return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
27461
+ }
27462
+ }
27463
+ /**
27464
+ * Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
27465
+ * global [all_t, all_h, all_w] order, then write back into the position_ids array
27466
+ * respecting attention mask.
27467
+ * @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
27468
+ * @param {number[]} attn_mask Attention mask for this batch element
27469
+ * @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
27470
+ * @param {number} batch_idx Current batch index
27471
+ * @returns {number[]} Flat reordered positions of length total_len
27472
+ */
27473
+ _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
27474
+ const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
27475
+ const llm_positions = new Array(total_len);
27476
+ let index = 0;
27477
+ for (let x = 0; x < 3; ++x) {
27478
+ for (const val of llm_pos_ids_list) {
27479
+ const seg_len = val.length / 3;
27480
+ for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
27481
+ llm_positions[index++] = val[z];
27482
+ }
27483
+ }
27484
+ }
27485
+ let count2 = 0;
27486
+ for (let y = 0; y < attn_mask.length; ++y) {
27487
+ if (attn_mask[y] == 1) {
27488
+ for (let x = 0; x < 3; ++x) {
27489
+ position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
27490
+ }
27491
+ ++count2;
27492
+ }
27493
+ }
27494
+ return llm_positions;
27495
+ }
27496
+ /**
27497
+ * Build per-batch position ID segments for multimodal rope.
27498
+ * Override this in subclasses to change how vision/text segments are identified and positioned.
27499
+ * @param {object} params
27500
+ * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
27501
+ * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
27502
+ * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
27503
+ * @param {number} params.spatial_merge_size
27504
+ * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
27505
+ * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
27506
+ */
27507
+ _get_multimodal_rope_positions({
27508
+ filtered_ids,
27509
+ image_grid_thw_list,
27510
+ video_grid_thw_list,
27511
+ spatial_merge_size,
27512
+ state
27513
+ }) {
27514
+ const { image_token_id, video_token_id, vision_start_token_id } = this.config;
27515
+ const ids = filtered_ids;
27516
+ const vision_start_indices = ids.reduce((acc, x, idx) => {
27517
+ if (x == vision_start_token_id) acc.push(idx);
27518
+ return acc;
27519
+ }, []);
27520
+ const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
27521
+ const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
27522
+ const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
27523
+ const llm_pos_ids_list = [];
27524
+ let st2 = 0;
27525
+ let remain_images = image_nums;
27526
+ let remain_videos = video_nums;
27527
+ for (let j = 0; j < vision_tokens.length; ++j) {
27528
+ const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
27529
+ const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
27530
+ const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
27531
+ const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
27532
+ let ed;
27533
+ let t, h, w;
27534
+ if (ed_image < ed_video) {
27535
+ [t, h, w] = image_grid_thw_list[state.image_index];
27536
+ ++state.image_index;
27537
+ --remain_images;
27538
+ ed = ed_image;
27539
+ } else {
27540
+ [t, h, w] = video_grid_thw_list[state.video_index];
27541
+ ++state.video_index;
27542
+ --remain_videos;
27543
+ ed = ed_video;
27544
+ }
27545
+ const [llm_grid_t, llm_grid_h, llm_grid_w] = [
27546
+ Number(t),
27547
+ Math.floor(Number(h) / spatial_merge_size),
27548
+ Math.floor(Number(w) / spatial_merge_size)
27549
+ ];
27550
+ const text_len = ed - st2;
27551
+ const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
27552
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
27553
+ const offset = text_len + st_idx;
27554
+ const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
27555
+ const t_index = Array.from(
27556
+ { length: grid_size },
27557
+ (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
27558
+ );
27559
+ const h_index = Array.from(
27560
+ { length: grid_size },
27561
+ (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
27562
+ );
27563
+ const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
27564
+ llm_pos_ids_list.push([t_index, h_index, w_index].flat());
27565
+ st2 = ed + grid_size;
27566
+ }
27567
+ if (st2 < ids.length) {
27568
+ const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
27569
+ const text_len = ids.length - st2;
27570
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
27571
+ }
27572
+ return llm_pos_ids_list;
27573
+ }
27574
+ /**
27575
+ * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
27576
+ *
27577
+ * Explanation:
27578
+ * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
27579
+ *
27580
+ * For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs.
27581
+ * Examples:
27582
+ * input_ids: [T T T T T], here T is for text.
27583
+ * temporal position_ids: [0, 1, 2, 3, 4]
27584
+ * height position_ids: [0, 1, 2, 3, 4]
27585
+ * width position_ids: [0, 1, 2, 3, 4]
27586
+ *
27587
+ * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
27588
+ * and 1D rotary position embedding for text part.
27589
+ * Examples:
27590
+ * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
27591
+ * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
27592
+ * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
27593
+ * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
27594
+ * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
27595
+ * text temporal position_ids: [3, 4, 5, 6, 7]
27596
+ * text height position_ids: [3, 4, 5, 6, 7]
27597
+ * text width position_ids: [3, 4, 5, 6, 7]
27598
+ * Here we calculate the text start position_ids as the max vision position_ids plus 1.
27599
+ *
27600
+ * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
27601
+ * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
27602
+ * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
27603
+ * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
27604
+ * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
27605
+ */
27606
+ get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
27607
+ const { vision_config } = this.config;
27608
+ const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
27609
+ if (image_grid_thw || video_grid_thw) {
27610
+ const total_input_ids = input_ids.tolist();
27611
+ if (!attention_mask) {
27612
+ attention_mask = ones_like(input_ids);
27613
+ }
27614
+ const attention_mask_list = attention_mask.tolist();
27615
+ const position_ids_list = Array.from(
27616
+ { length: 3 },
27617
+ () => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
27618
+ );
27619
+ const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
27620
+ const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
27621
+ const state = { image_index: 0, video_index: 0 };
27622
+ const mrope_position_deltas = [];
27623
+ for (let i = 0; i < total_input_ids.length; ++i) {
27624
+ const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
27625
+ const llm_pos_ids_list = this._get_multimodal_rope_positions({
27626
+ filtered_ids,
27627
+ image_grid_thw_list,
27628
+ video_grid_thw_list,
27629
+ spatial_merge_size,
27630
+ state
27631
+ });
27632
+ const llm_positions = this._reorder_and_write_positions(
27633
+ llm_pos_ids_list,
27634
+ attention_mask_list[i],
27635
+ position_ids_list,
27636
+ i
27637
+ );
27638
+ mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
27639
+ }
27640
+ return [
27641
+ new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
27642
+ new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
27643
+ ];
27644
+ } else {
27645
+ return this._get_text_only_rope_index(input_ids, attention_mask);
27646
+ }
27647
+ }
27648
+ async encode_image({ pixel_values, image_grid_thw }) {
27649
+ const features = (await sessionRun(this.sessions["vision_encoder"], {
27650
+ pixel_values,
27651
+ [this.image_grid_thw_name]: image_grid_thw
27652
+ })).image_features;
27653
+ return features;
27654
+ }
27655
+ _merge_input_ids_with_image_features(kwargs) {
27656
+ return default_merge_input_ids_with_image_features({
27657
+ // @ts-ignore
27658
+ image_token_id: this.config.image_token_id,
27659
+ ...kwargs
27660
+ });
27661
+ }
27662
+ prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
27663
+ if (model_inputs.attention_mask && !model_inputs.position_ids) {
27664
+ if (!model_inputs.past_key_values) {
27665
+ [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
27666
+ model_inputs.input_ids,
27667
+ model_inputs.image_grid_thw,
27668
+ model_inputs.video_grid_thw,
27669
+ model_inputs.attention_mask
27670
+ );
27671
+ } else {
27672
+ model_inputs.pixel_values = null;
27673
+ const past_length = model_inputs.past_key_values.get_seq_length();
27674
+ if (past_length < model_inputs.input_ids.dims[1]) {
27675
+ const [full_position_ids, rope_deltas] = this.get_rope_index(
27676
+ model_inputs.input_ids,
27677
+ model_inputs.image_grid_thw,
27678
+ model_inputs.video_grid_thw,
27679
+ model_inputs.attention_mask
27680
+ );
27681
+ model_inputs.rope_deltas = rope_deltas;
27682
+ model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
27683
+ model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
27684
+ } else {
27685
+ if (!model_inputs.rope_deltas) {
27686
+ [, model_inputs.rope_deltas] = this.get_rope_index(
27687
+ model_inputs.input_ids,
27688
+ model_inputs.image_grid_thw,
27689
+ model_inputs.video_grid_thw,
27690
+ model_inputs.attention_mask
27691
+ );
27692
+ }
27693
+ const delta = BigInt(past_length);
27694
+ const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
27695
+ model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
27696
+ }
27697
+ }
27698
+ }
27699
+ return model_inputs;
27700
+ }
27701
+ };
27702
+ var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
27703
+ };
27704
+
27705
+ // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
27706
+ var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
27707
+ image_grid_thw_name = "image_grid_thw";
27708
+ };
27709
+ var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
27710
+ image_grid_thw_name = "image_grid_thw";
27711
+ };
27712
+
27713
+ // src/models/glm_ocr/modeling_glm_ocr.js
27714
+ var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
27715
+ /**
27716
+ * Compute 3D positional indices for vision tokens.
27717
+ * Temporal is constant, height is repeat-interleaved, width tiles.
27718
+ * @param {number} start_position
27719
+ * @param {number[]} grid_thw [T, H, W]
27720
+ * @param {number} temp_merge_size
27721
+ * @param {number} spatial_merge_size
27722
+ * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
27723
+ */
27724
+ get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
27725
+ const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
27726
+ const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
27727
+ const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
27728
+ const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
27729
+ const t_pos = Array.from({ length: seq_len }, () => start_position);
27730
+ const h_pos = Array.from(
27731
+ { length: seq_len },
27732
+ (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
27733
+ );
27734
+ const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
27735
+ return [...t_pos, ...h_pos, ...w_pos];
27736
+ }
27737
+ /**
27738
+ * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
27739
+ * instead of vision_start_token_id scanning used by Qwen2VL.
27740
+ * After a vision segment, position advances by max(h, w) / spatial_merge_size.
27741
+ */
27742
+ _get_multimodal_rope_positions({
27743
+ filtered_ids,
27744
+ image_grid_thw_list,
27745
+ video_grid_thw_list,
27746
+ spatial_merge_size,
27747
+ state
27748
+ }) {
27749
+ const { image_token_id } = this.config;
27750
+ const groups = [];
27751
+ let group_start = 0;
27752
+ let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
27753
+ for (let j = 1; j <= filtered_ids.length; ++j) {
27754
+ const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
27755
+ if (t !== current_type) {
27756
+ groups.push([current_type, group_start, j]);
27757
+ group_start = j;
27758
+ current_type = t;
27759
+ }
27760
+ }
27761
+ let current_pos = 0;
27762
+ const llm_pos_ids_list = [];
27763
+ for (const [modality_type, start_idx, end_idx] of groups) {
27764
+ if (modality_type === 0) {
27765
+ const text_len = end_idx - start_idx;
27766
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
27767
+ current_pos += text_len;
27768
+ } else {
27769
+ const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
27770
+ const temp_merge_size = grid_thw[0];
27771
+ llm_pos_ids_list.push(
27772
+ this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
27773
+ );
27774
+ current_pos += Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size;
27775
+ }
27776
+ }
27777
+ return llm_pos_ids_list;
27778
+ }
27779
+ };
27780
+
27273
27781
  // src/models/glpn/modeling_glpn.js
27274
27782
  var GLPNPreTrainedModel = class extends PreTrainedModel {
27275
27783
  };
@@ -27582,6 +28090,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
27582
28090
  var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
27583
28091
  };
27584
28092
 
28093
+ // src/models/lighton_ocr/modeling_lighton_ocr.js
28094
+ var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
28095
+ };
28096
+
27585
28097
  // src/models/lfm2_moe/modeling_lfm2_moe.js
27586
28098
  var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
27587
28099
  };
@@ -27778,6 +28290,14 @@ var MistralModel = class extends MistralPreTrainedModel {
27778
28290
  var MistralForCausalLM = class extends MistralPreTrainedModel {
27779
28291
  };
27780
28292
 
28293
+ // src/models/mistral4/modeling_mistral4.js
28294
+ var Mistral4PreTrainedModel = class extends PreTrainedModel {
28295
+ };
28296
+ var Mistral4Model = class extends Mistral4PreTrainedModel {
28297
+ };
28298
+ var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
28299
+ };
28300
+
27781
28301
  // src/models/mobilebert/modeling_mobilebert.js
27782
28302
  var MobileBertPreTrainedModel = class extends PreTrainedModel {
27783
28303
  };
@@ -28246,6 +28766,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
28246
28766
  var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
28247
28767
  };
28248
28768
 
28769
+ // src/models/nemotron_h/modeling_nemotron_h.js
28770
+ var NemotronHPreTrainedModel = class extends PreTrainedModel {
28771
+ };
28772
+ var NemotronHModel = class extends NemotronHPreTrainedModel {
28773
+ };
28774
+ var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
28775
+ };
28776
+
28249
28777
  // src/models/neobert/modeling_neobert.js
28250
28778
  var NeoBertPreTrainedModel = class extends PreTrainedModel {
28251
28779
  };
@@ -28526,252 +29054,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
28526
29054
  var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
28527
29055
  };
28528
29056
 
28529
- // src/models/qwen2_vl/modeling_qwen2_vl.js
28530
- var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
28531
- forward_params = [
28532
- // Text inputs
28533
- "input_ids",
28534
- "attention_mask",
28535
- "position_ids",
28536
- "past_key_values",
28537
- // Vision inputs
28538
- "pixel_values",
28539
- "image_grid_thw"
28540
- ];
28541
- };
28542
- var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
28543
- // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
28544
- // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
28545
- // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
28546
- image_grid_thw_name = "grid_thw";
28547
- /**
28548
- * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
28549
- *
28550
- * Explanation:
28551
- * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
28552
- *
28553
- * For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
28554
- * Examples:
28555
- * input_ids: [T T T T T], here T is for text.
28556
- * temporal position_ids: [0, 1, 2, 3, 4]
28557
- * height position_ids: [0, 1, 2, 3, 4]
28558
- * width position_ids: [0, 1, 2, 3, 4]
28559
- *
28560
- * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
28561
- * and 1D rotary position embedding for text part.
28562
- * Examples:
28563
- * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
28564
- * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
28565
- * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
28566
- * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
28567
- * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
28568
- * text temporal position_ids: [3, 4, 5, 6, 7]
28569
- * text height position_ids: [3, 4, 5, 6, 7]
28570
- * text width position_ids: [3, 4, 5, 6, 7]
28571
- * Here we calculate the text start position_ids as the max vision position_ids plus 1.
28572
- *
28573
- * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
28574
- * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
28575
- * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
28576
- * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
28577
- * - 1 for tokens that are **not masked**,
28578
- * - 0 for tokens that are **masked**.
28579
- * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
28580
- * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
28581
- * - mrope_position_deltas: Tensor of shape `(batch_size)`.
28582
- */
28583
- get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
28584
- const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
28585
- const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
28586
- const mrope_position_deltas = [];
28587
- if (image_grid_thw || video_grid_thw) {
28588
- let total_input_ids = input_ids.tolist();
28589
- if (!attention_mask) {
28590
- attention_mask = ones_like(input_ids);
28591
- }
28592
- const attention_mask_list = attention_mask.tolist();
28593
- const position_ids_list = Array.from(
28594
- { length: 3 },
28595
- (_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
28596
- );
28597
- const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
28598
- const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
28599
- let image_index = 0;
28600
- let video_index = 0;
28601
- for (let i = 0; i < total_input_ids.length; ++i) {
28602
- const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
28603
- const vision_start_indices = ids.reduce((acc, x, idx) => {
28604
- if (x == vision_start_token_id) acc.push(idx);
28605
- return acc;
28606
- }, []);
28607
- const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
28608
- const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
28609
- const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
28610
- let llm_pos_ids_list = [];
28611
- let st2 = 0;
28612
- let remain_images = image_nums;
28613
- let remain_videos = video_nums;
28614
- for (let j = 0; j < vision_tokens.length; ++j) {
28615
- const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
28616
- const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
28617
- const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
28618
- const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
28619
- let ed;
28620
- let t, h, w;
28621
- if (ed_image < ed_video) {
28622
- [t, h, w] = image_grid_thw_list[image_index];
28623
- ++image_index;
28624
- --remain_images;
28625
- ed = ed_image;
28626
- } else {
28627
- [t, h, w] = video_grid_thw_list[video_index];
28628
- ++video_index;
28629
- --remain_videos;
28630
- ed = ed_video;
28631
- }
28632
- const [llm_grid_t, llm_grid_h, llm_grid_w] = [
28633
- Number(t),
28634
- Math.floor(Number(h) / spatial_merge_size),
28635
- Math.floor(Number(w) / spatial_merge_size)
28636
- ];
28637
- const text_len = ed - st2;
28638
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
28639
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
28640
- const offset = text_len + st_idx;
28641
- const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
28642
- const t_index = Array.from(
28643
- { length: grid_size },
28644
- (_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
28645
- );
28646
- const h_index = Array.from(
28647
- { length: grid_size },
28648
- (_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
28649
- );
28650
- const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
28651
- llm_pos_ids_list.push([t_index, h_index, w_index].flat());
28652
- st2 = ed + grid_size;
28653
- }
28654
- if (st2 < ids.length) {
28655
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
28656
- const text_len = ids.length - st2;
28657
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
28658
- }
28659
- const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
28660
- const llm_positions = new Array(num_items);
28661
- let index = 0;
28662
- for (let x = 0; x < 3; ++x) {
28663
- for (let y = 0; y < llm_pos_ids_list.length; ++y) {
28664
- const val = llm_pos_ids_list[y];
28665
- const text_len = val.length / 3;
28666
- for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
28667
- llm_positions[index++] = val[z];
28668
- }
28669
- }
28670
- }
28671
- let count2 = 0;
28672
- const attn_mask = attention_mask_list[i];
28673
- for (let y = 0; y < attn_mask.length; ++y) {
28674
- if (attn_mask[y] == 1) {
28675
- for (let x = 0; x < 3; ++x) {
28676
- position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
28677
- }
28678
- ++count2;
28679
- }
28680
- }
28681
- const max_llm_positions = max(llm_positions)[0];
28682
- mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
28683
- }
28684
- return [
28685
- new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
28686
- new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
28687
- ];
28688
- } else {
28689
- if (attention_mask) {
28690
- const { data, dims } = cumsum_masked_fill(attention_mask);
28691
- const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
28692
- const mrope_position_deltas2 = Array.from(
28693
- { length: dims[0] },
28694
- (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
28695
- );
28696
- return [
28697
- new Tensor2("int64", position_ids, [3, ...dims]),
28698
- new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
28699
- ];
28700
- } else {
28701
- const [batch_size, seq_length] = input_ids.dims;
28702
- const position_ids = BigInt64Array.from(
28703
- { length: 3 * batch_size * seq_length },
28704
- (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
28705
- );
28706
- return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
28707
- }
28708
- }
28709
- }
28710
- async encode_image({ pixel_values, image_grid_thw }) {
28711
- const features = (await sessionRun(this.sessions["vision_encoder"], {
28712
- pixel_values,
28713
- [this.image_grid_thw_name]: image_grid_thw
28714
- })).image_features;
28715
- return features;
28716
- }
28717
- _merge_input_ids_with_image_features(kwargs) {
28718
- return default_merge_input_ids_with_image_features({
28719
- // @ts-ignore
28720
- image_token_id: this.config.image_token_id,
28721
- ...kwargs
28722
- });
28723
- }
28724
- prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
28725
- if (model_inputs.attention_mask && !model_inputs.position_ids) {
28726
- if (!model_inputs.past_key_values) {
28727
- [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
28728
- model_inputs.input_ids,
28729
- model_inputs.image_grid_thw,
28730
- model_inputs.video_grid_thw,
28731
- model_inputs.attention_mask
28732
- );
28733
- } else {
28734
- model_inputs.pixel_values = null;
28735
- const past_length = model_inputs.past_key_values.get_seq_length();
28736
- if (past_length < model_inputs.input_ids.dims[1]) {
28737
- const [full_position_ids, rope_deltas] = this.get_rope_index(
28738
- model_inputs.input_ids,
28739
- model_inputs.image_grid_thw,
28740
- model_inputs.video_grid_thw,
28741
- model_inputs.attention_mask
28742
- );
28743
- model_inputs.rope_deltas = rope_deltas;
28744
- model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
28745
- model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
28746
- } else {
28747
- if (!model_inputs.rope_deltas) {
28748
- [, model_inputs.rope_deltas] = this.get_rope_index(
28749
- model_inputs.input_ids,
28750
- model_inputs.image_grid_thw,
28751
- model_inputs.video_grid_thw,
28752
- model_inputs.attention_mask
28753
- );
28754
- }
28755
- const delta = BigInt(past_length);
28756
- const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
28757
- model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
28758
- }
28759
- }
28760
- }
28761
- return model_inputs;
28762
- }
28763
- };
28764
- var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
28765
- };
28766
-
28767
- // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
28768
- var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
28769
- image_grid_thw_name = "image_grid_thw";
28770
- };
28771
- var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
28772
- image_grid_thw_name = "image_grid_thw";
28773
- };
28774
-
28775
29057
  // src/models/qwen3/modeling_qwen3.js
28776
29058
  var Qwen3PreTrainedModel = class extends PreTrainedModel {
28777
29059
  };
@@ -29217,6 +29499,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
29217
29499
  }
29218
29500
  };
29219
29501
 
29502
+ // src/models/solar_open/modeling_solar_open.js
29503
+ var SolarOpenPreTrainedModel = class extends PreTrainedModel {
29504
+ };
29505
+ var SolarOpenModel = class extends SolarOpenPreTrainedModel {
29506
+ };
29507
+ var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
29508
+ };
29509
+
29220
29510
  // src/models/speecht5/modeling_speecht5.js
29221
29511
  var SpeechT5PreTrainedModel = class extends PreTrainedModel {
29222
29512
  };
@@ -30333,6 +30623,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
30333
30623
  // src/models/registry.js
30334
30624
  var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
30335
30625
  ["bert", "BertModel"],
30626
+ ["eurobert", "EuroBertModel"],
30336
30627
  ["neobert", "NeoBertModel"],
30337
30628
  ["modernbert", "ModernBertModel"],
30338
30629
  ["nomic_bert", "NomicBertModel"],
@@ -30464,6 +30755,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
30464
30755
  ["gemma3_text", "Gemma3Model"],
30465
30756
  ["helium", "HeliumModel"],
30466
30757
  ["glm", "GlmModel"],
30758
+ ["glm_moe_dsa", "GlmMoeDsaModel"],
30467
30759
  ["openelm", "OpenELMModel"],
30468
30760
  ["qwen2", "Qwen2Model"],
30469
30761
  ["qwen2_moe", "Qwen2MoeModel"],
@@ -30475,12 +30767,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
30475
30767
  ["mpt", "MptModel"],
30476
30768
  ["opt", "OPTModel"],
30477
30769
  ["mistral", "MistralModel"],
30770
+ ["mistral4", "Mistral4Model"],
30478
30771
  ["ministral", "MinistralModel"],
30479
30772
  ["ministral3", "Ministral3Model"],
30480
30773
  ["ernie4_5", "Ernie4_5ForCausalLM"],
30481
30774
  ["starcoder2", "Starcoder2Model"],
30775
+ ["deepseek_v3", "DeepseekV3Model"],
30482
30776
  ["falcon", "FalconModel"],
30483
30777
  ["falcon_h1", "FalconH1Model"],
30778
+ ["nemotron_h", "NemotronHModel"],
30779
+ ["solar_open", "SolarOpenModel"],
30484
30780
  ["stablelm", "StableLmModel"],
30485
30781
  ["modernbert-decoder", "ModernBertDecoderModel"],
30486
30782
  ["hunyuan_v1_dense", "HunYuanDenseV1Model"],
@@ -30500,6 +30796,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30500
30796
  ]);
30501
30797
  var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30502
30798
  ["bert", "BertForSequenceClassification"],
30799
+ ["eurobert", "EuroBertForSequenceClassification"],
30503
30800
  ["neobert", "NeoBertForSequenceClassification"],
30504
30801
  ["modernbert", "ModernBertForSequenceClassification"],
30505
30802
  ["roformer", "RoFormerForSequenceClassification"],
@@ -30522,6 +30819,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30522
30819
  ]);
30523
30820
  var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30524
30821
  ["bert", "BertForTokenClassification"],
30822
+ ["eurobert", "EuroBertForTokenClassification"],
30525
30823
  ["neobert", "NeoBertForTokenClassification"],
30526
30824
  ["modernbert", "ModernBertForTokenClassification"],
30527
30825
  ["roformer", "RoFormerForTokenClassification"],
@@ -30584,6 +30882,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30584
30882
  ["gemma3", "Gemma3ForCausalLM"],
30585
30883
  ["helium", "HeliumForCausalLM"],
30586
30884
  ["glm", "GlmForCausalLM"],
30885
+ ["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
30587
30886
  ["openelm", "OpenELMForCausalLM"],
30588
30887
  ["qwen2", "Qwen2ForCausalLM"],
30589
30888
  ["qwen2_moe", "Qwen2MoeForCausalLM"],
@@ -30603,13 +30902,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30603
30902
  ["opt", "OPTForCausalLM"],
30604
30903
  ["mbart", "MBartForCausalLM"],
30605
30904
  ["mistral", "MistralForCausalLM"],
30905
+ ["mistral4", "Mistral4ForCausalLM"],
30606
30906
  ["ministral", "MinistralForCausalLM"],
30607
30907
  ["ministral3", "Ministral3ForCausalLM"],
30608
30908
  ["ernie4_5", "Ernie4_5ForCausalLM"],
30609
30909
  ["starcoder2", "Starcoder2ForCausalLM"],
30910
+ ["deepseek_v3", "DeepseekV3ForCausalLM"],
30610
30911
  ["falcon", "FalconForCausalLM"],
30611
30912
  ["falcon_h1", "FalconH1ForCausalLM"],
30913
+ ["nemotron_h", "NemotronHForCausalLM"],
30612
30914
  ["trocr", "TrOCRForCausalLM"],
30915
+ ["solar_open", "SolarOpenForCausalLM"],
30613
30916
  ["stablelm", "StableLmForCausalLM"],
30614
30917
  ["modernbert-decoder", "ModernBertDecoderForCausalLM"],
30615
30918
  ["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
@@ -30620,6 +30923,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30620
30923
  var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
30621
30924
  var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30622
30925
  ["bert", "BertForMaskedLM"],
30926
+ ["eurobert", "EuroBertForMaskedLM"],
30623
30927
  ["neobert", "NeoBertForMaskedLM"],
30624
30928
  ["modernbert", "ModernBertForMaskedLM"],
30625
30929
  ["roformer", "RoFormerForMaskedLM"],
@@ -30678,7 +30982,9 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
30678
30982
  ["paligemma", "PaliGemmaForConditionalGeneration"],
30679
30983
  ["llava_qwen2", "LlavaQwen2ForCausalLM"],
30680
30984
  ["gemma3n", "Gemma3nForConditionalGeneration"],
30681
- ["mistral3", "Mistral3ForConditionalGeneration"]
30985
+ ["mistral3", "Mistral3ForConditionalGeneration"],
30986
+ ["lighton_ocr", "LightOnOcrForConditionalGeneration"],
30987
+ ["glm_ocr", "GlmOcrForConditionalGeneration"]
30682
30988
  ]);
30683
30989
  var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
30684
30990
  ["granite_speech", "GraniteSpeechForConditionalGeneration"],
@@ -30783,6 +31089,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30783
31089
  ]);
30784
31090
  var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
30785
31091
  var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
31092
+ ["chmv2", "CHMv2ForDepthEstimation"],
30786
31093
  ["dpt", "DPTForDepthEstimation"],
30787
31094
  ["depth_anything", "DepthAnythingForDepthEstimation"],
30788
31095
  ["glpn", "GLPNForDepthEstimation"],
@@ -33522,6 +33829,9 @@ var ModelRegistry = class {
33522
33829
  BloomModel,
33523
33830
  BloomPreTrainedModel,
33524
33831
  BloomTokenizer,
33832
+ CHMv2ForDepthEstimation,
33833
+ CHMv2ImageProcessor,
33834
+ CHMv2PreTrainedModel,
33525
33835
  CLIPFeatureExtractor,
33526
33836
  CLIPImageProcessor,
33527
33837
  CLIPModel,
@@ -33617,6 +33927,9 @@ var ModelRegistry = class {
33617
33927
  DebertaV2Tokenizer,
33618
33928
  DecisionTransformerModel,
33619
33929
  DecisionTransformerPreTrainedModel,
33930
+ DeepseekV3ForCausalLM,
33931
+ DeepseekV3Model,
33932
+ DeepseekV3PreTrainedModel,
33620
33933
  DeiTFeatureExtractor,
33621
33934
  DeiTForImageClassification,
33622
33935
  DeiTImageProcessor,
@@ -33677,6 +33990,11 @@ var ModelRegistry = class {
33677
33990
  EsmModel,
33678
33991
  EsmPreTrainedModel,
33679
33992
  EsmTokenizer,
33993
+ EuroBertForMaskedLM,
33994
+ EuroBertForSequenceClassification,
33995
+ EuroBertForTokenClassification,
33996
+ EuroBertModel,
33997
+ EuroBertPreTrainedModel,
33680
33998
  ExaoneForCausalLM,
33681
33999
  ExaoneModel,
33682
34000
  ExaonePreTrainedModel,
@@ -33734,8 +34052,14 @@ var ModelRegistry = class {
33734
34052
  GemmaModel,
33735
34053
  GemmaPreTrainedModel,
33736
34054
  GemmaTokenizer,
34055
+ Glm46VImageProcessor,
34056
+ Glm46VProcessor,
33737
34057
  GlmForCausalLM,
33738
34058
  GlmModel,
34059
+ GlmMoeDsaForCausalLM,
34060
+ GlmMoeDsaModel,
34061
+ GlmMoeDsaPreTrainedModel,
34062
+ GlmOcrForConditionalGeneration,
33739
34063
  GlmPreTrainedModel,
33740
34064
  GptOssForCausalLM,
33741
34065
  GptOssModel,
@@ -33801,6 +34125,7 @@ var ModelRegistry = class {
33801
34125
  Lfm2VlForConditionalGeneration,
33802
34126
  Lfm2VlImageProcessor,
33803
34127
  Lfm2VlProcessor,
34128
+ LightOnOcrForConditionalGeneration,
33804
34129
  LiteWhisperForConditionalGeneration,
33805
34130
  Llama4ForCausalLM,
33806
34131
  Llama4PreTrainedModel,
@@ -33870,6 +34195,9 @@ var ModelRegistry = class {
33870
34195
  MimiPreTrainedModel,
33871
34196
  MinLengthLogitsProcessor,
33872
34197
  MinNewTokensLengthLogitsProcessor,
34198
+ Mistral4ForCausalLM,
34199
+ Mistral4Model,
34200
+ Mistral4PreTrainedModel,
33873
34201
  MistralForCausalLM,
33874
34202
  MistralModel,
33875
34203
  MistralPreTrainedModel,
@@ -33941,6 +34269,9 @@ var ModelRegistry = class {
33941
34269
  NanoChatForCausalLM,
33942
34270
  NanoChatModel,
33943
34271
  NanoChatPreTrainedModel,
34272
+ NemotronHForCausalLM,
34273
+ NemotronHModel,
34274
+ NemotronHPreTrainedModel,
33944
34275
  NeoBertForMaskedLM,
33945
34276
  NeoBertForQuestionAnswering,
33946
34277
  NeoBertForSequenceClassification,
@@ -34130,6 +34461,9 @@ var ModelRegistry = class {
34130
34461
  SnacFeatureExtractor,
34131
34462
  SnacModel,
34132
34463
  SnacPreTrainedModel,
34464
+ SolarOpenForCausalLM,
34465
+ SolarOpenModel,
34466
+ SolarOpenPreTrainedModel,
34133
34467
  SpeechT5FeatureExtractor,
34134
34468
  SpeechT5ForSpeechToText,
34135
34469
  SpeechT5ForTextToSpeech,