@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (102)
  1. package/README.md +13 -2
  2. package/dist/ort-wasm-simd-threaded.jsep.mjs +26 -26
  3. package/dist/transformers.js +1002 -587
  4. package/dist/transformers.min.js +23 -19
  5. package/dist/transformers.node.cjs +1030 -585
  6. package/dist/transformers.node.min.cjs +21 -17
  7. package/dist/transformers.node.min.mjs +21 -17
  8. package/dist/transformers.node.mjs +1000 -585
  9. package/dist/transformers.web.js +887 -472
  10. package/dist/transformers.web.min.js +21 -17
  11. package/package.json +3 -3
  12. package/src/configs.js +28 -22
  13. package/src/env.js +1 -1
  14. package/src/image_processors_utils.js +25 -15
  15. package/src/models/chmv2/image_processing_chmv2.js +3 -0
  16. package/src/models/chmv2/modeling_chmv2.js +4 -0
  17. package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
  18. package/src/models/eurobert/modeling_eurobert.js +41 -0
  19. package/src/models/gemma3/image_processing_gemma3.js +3 -0
  20. package/src/models/gemma3/modeling_gemma3.js +4 -1
  21. package/src/models/gemma3/processing_gemma3.js +45 -0
  22. package/src/models/glm46v/image_processing_glm46v.js +12 -0
  23. package/src/models/glm46v/processing_glm46v.js +5 -0
  24. package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
  25. package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
  26. package/src/models/image_processors.js +3 -0
  27. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
  28. package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
  29. package/src/models/mistral4/modeling_mistral4.js +5 -0
  30. package/src/models/modeling_utils.js +48 -25
  31. package/src/models/models.js +10 -1
  32. package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
  33. package/src/models/processors.js +2 -0
  34. package/src/models/qwen2_vl/modeling_qwen2_vl.js +226 -168
  35. package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
  36. package/src/models/registry.js +19 -8
  37. package/src/models/solar_open/modeling_solar_open.js +5 -0
  38. package/src/pipelines.js +1 -0
  39. package/src/utils/hub.js +4 -1
  40. package/src/utils/model_registry/ModelRegistry.js +36 -0
  41. package/src/utils/model_registry/get_available_dtypes.js +68 -0
  42. package/src/utils/model_registry/get_file_metadata.js +1 -0
  43. package/src/utils/model_registry/get_model_files.js +7 -60
  44. package/src/utils/model_registry/resolve_model_type.js +66 -0
  45. package/types/configs.d.ts.map +1 -1
  46. package/types/image_processors_utils.d.ts +3 -2
  47. package/types/image_processors_utils.d.ts.map +1 -1
  48. package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
  49. package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
  50. package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
  51. package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
  52. package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
  53. package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
  54. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
  55. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
  56. package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
  57. package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
  58. package/types/models/gemma3/image_processing_gemma3.d.ts +4 -0
  59. package/types/models/gemma3/image_processing_gemma3.d.ts.map +1 -0
  60. package/types/models/gemma3/modeling_gemma3.d.ts +4 -1
  61. package/types/models/gemma3/modeling_gemma3.d.ts.map +1 -1
  62. package/types/models/gemma3/processing_gemma3.d.ts +20 -0
  63. package/types/models/gemma3/processing_gemma3.d.ts.map +1 -0
  64. package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
  65. package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
  66. package/types/models/glm46v/processing_glm46v.d.ts +4 -0
  67. package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
  68. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
  69. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
  70. package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
  71. package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
  72. package/types/models/image_processors.d.ts +3 -0
  73. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
  74. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
  75. package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
  76. package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
  77. package/types/models/modeling_utils.d.ts +2 -3
  78. package/types/models/modeling_utils.d.ts.map +1 -1
  79. package/types/models/models.d.ts +10 -1
  80. package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
  81. package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
  82. package/types/models/processors.d.ts +2 -0
  83. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
  84. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  85. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
  86. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  87. package/types/models/registry.d.ts.map +1 -1
  88. package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
  89. package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
  90. package/types/pipelines.d.ts +1 -0
  91. package/types/pipelines.d.ts.map +1 -1
  92. package/types/utils/hub.d.ts.map +1 -1
  93. package/types/utils/model_registry/ModelRegistry.d.ts +27 -0
  94. package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -1
  95. package/types/utils/model_registry/get_available_dtypes.d.ts +26 -0
  96. package/types/utils/model_registry/get_available_dtypes.d.ts.map +1 -0
  97. package/types/utils/model_registry/get_model_files.d.ts +25 -0
  98. package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
  99. package/types/utils/model_registry/resolve_model_type.d.ts +24 -0
  100. package/types/utils/model_registry/resolve_model_type.d.ts.map +1 -0
  101. package/types/models/ast/modeling_ast.d.ts.map +0 -1
  102. package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
@@ -14,7 +14,7 @@ var node_path_default = {};
14
14
  var node_url_default = {};
15
15
 
16
16
  // src/env.js
17
- var VERSION = "4.0.0-next.7";
17
+ var VERSION = "4.0.0-next.9";
18
18
  var HAS_SELF = typeof self !== "undefined";
19
19
  var IS_FS_AVAILABLE = !isEmpty(node_fs_default);
20
20
  var IS_PATH_AVAILABLE = !isEmpty(node_path_default);
@@ -244,7 +244,7 @@ var logger = {
244
244
  }
245
245
  };
246
246
 
247
- // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.2/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
247
+ // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
248
248
  var DictionarySplitter = class {
249
249
  /**
250
250
  * @param dictionary The dictionary of words to use for splitting.
@@ -1900,10 +1900,10 @@ var BPE = class extends TokenizerModel_default {
1900
1900
  );
1901
1901
  if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
1902
1902
  output_tokens.push(...byte_tokens);
1903
- } else {
1903
+ } else if (this.unk_token != null) {
1904
1904
  output_tokens.push(this.unk_token);
1905
1905
  }
1906
- } else {
1906
+ } else if (this.unk_token != null) {
1907
1907
  output_tokens.push(this.unk_token);
1908
1908
  }
1909
1909
  }
@@ -6509,13 +6509,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
6509
6509
  wrapped_progress
6510
6510
  );
6511
6511
  } else if (typeof response !== "string") {
6512
+ const headers = new Headers(response.headers);
6513
+ headers.set("content-length", result.byteLength.toString());
6512
6514
  await cache2.put(
6513
6515
  cacheKey,
6514
6516
  new Response(
6515
6517
  /** @type {any} */
6516
6518
  result,
6517
6519
  {
6518
- headers: response.headers
6520
+ headers
6519
6521
  }
6520
6522
  )
6521
6523
  ).catch((err) => {
@@ -11828,7 +11830,9 @@ var processors_exports = {};
11828
11830
  __export(processors_exports, {
11829
11831
  ChatterboxProcessor: () => ChatterboxProcessor,
11830
11832
  Florence2Processor: () => Florence2Processor,
11833
+ Gemma3Processor: () => Gemma3Processor,
11831
11834
  Gemma3nProcessor: () => Gemma3nProcessor,
11835
+ Glm46VProcessor: () => Glm46VProcessor,
11832
11836
  GraniteSpeechProcessor: () => GraniteSpeechProcessor,
11833
11837
  GroundingDinoProcessor: () => GroundingDinoProcessor,
11834
11838
  Idefics3Processor: () => Idefics3Processor,
@@ -14342,26 +14346,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
14342
14346
  }
14343
14347
  return [segmentation, segments];
14344
14348
  }
14345
- function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
14349
+ function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
14346
14350
  if (height < factor || width < factor) {
14347
- throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
14348
- } else if (Math.max(height, width) / Math.min(height, width) > 200) {
14351
+ const scale = Math.max(factor / height, factor / width);
14352
+ height = Math.round(height * scale);
14353
+ width = Math.round(width * scale);
14354
+ }
14355
+ if (Math.max(height, width) / Math.min(height, width) > 200) {
14349
14356
  throw new Error(
14350
14357
  `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
14351
14358
  );
14352
14359
  }
14353
14360
  let h_bar = Math.round(height / factor) * factor;
14354
14361
  let w_bar = Math.round(width / factor) * factor;
14355
- if (h_bar * w_bar > max_pixels) {
14356
- const beta = Math.sqrt(height * width / max_pixels);
14357
- h_bar = Math.floor(height / beta / factor) * factor;
14358
- w_bar = Math.floor(width / beta / factor) * factor;
14359
- } else if (h_bar * w_bar < min_pixels) {
14360
- const beta = Math.sqrt(min_pixels / (height * width));
14362
+ if (temporal_factor * h_bar * w_bar > max_pixels) {
14363
+ const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
14364
+ h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
14365
+ w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
14366
+ } else if (temporal_factor * h_bar * w_bar < min_pixels) {
14367
+ const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
14361
14368
  h_bar = Math.ceil(height * beta / factor) * factor;
14362
14369
  w_bar = Math.ceil(width * beta / factor) * factor;
14363
14370
  }
14364
- return [h_bar, w_bar];
14371
+ return [w_bar, h_bar];
14365
14372
  }
14366
14373
  function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
14367
14374
  if (label_ids_to_fuse === null) {
@@ -14440,7 +14447,7 @@ var ImageProcessor = class extends Callable2 {
14440
14447
  this.do_pad = config.do_pad;
14441
14448
  this.min_pixels = config.min_pixels;
14442
14449
  this.max_pixels = config.max_pixels;
14443
- if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
14450
+ if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
14444
14451
  this.pad_size = this.size;
14445
14452
  }
14446
14453
  this.do_flip_channel_order = config.do_flip_channel_order ?? false;
@@ -14728,10 +14735,8 @@ var ImageProcessor = class extends Callable2 {
14728
14735
  const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
14729
14736
  [pixelData, imgDims] = padded;
14730
14737
  } else if (this.size_divisibility) {
14731
- const [paddedWidth, paddedHeight] = enforce_size_divisibility(
14732
- [imgDims[1], imgDims[0]],
14733
- this.size_divisibility
14734
- );
14738
+ const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
14739
+ const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
14735
14740
  [pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
14736
14741
  }
14737
14742
  }
@@ -14808,6 +14813,7 @@ var image_processors_exports = {};
14808
14813
  __export(image_processors_exports, {
14809
14814
  BeitFeatureExtractor: () => BeitFeatureExtractor,
14810
14815
  BitImageProcessor: () => BitImageProcessor,
14816
+ CHMv2ImageProcessor: () => CHMv2ImageProcessor,
14811
14817
  CLIPFeatureExtractor: () => CLIPFeatureExtractor,
14812
14818
  CLIPImageProcessor: () => CLIPImageProcessor,
14813
14819
  ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
@@ -14824,6 +14830,8 @@ __export(image_processors_exports, {
14824
14830
  DonutImageProcessor: () => DonutImageProcessor,
14825
14831
  EfficientNetImageProcessor: () => EfficientNetImageProcessor,
14826
14832
  GLPNFeatureExtractor: () => GLPNFeatureExtractor,
14833
+ Gemma3ImageProcessor: () => Gemma3ImageProcessor,
14834
+ Glm46VImageProcessor: () => Glm46VImageProcessor,
14827
14835
  GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
14828
14836
  Idefics3ImageProcessor: () => Idefics3ImageProcessor,
14829
14837
  ImageFeatureExtractor: () => ImageProcessor,
@@ -14884,6 +14892,10 @@ var BitImageProcessor = class extends ImageProcessor {
14884
14892
  var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
14885
14893
  };
14886
14894
 
14895
+ // src/models/chmv2/image_processing_chmv2.js
14896
+ var CHMv2ImageProcessor = class extends ImageProcessor {
14897
+ };
14898
+
14887
14899
  // src/models/clip/image_processing_clip.js
14888
14900
  var CLIPImageProcessor = class extends ImageProcessor {
14889
14901
  };
@@ -15003,6 +15015,69 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
15003
15015
  }
15004
15016
  };
15005
15017
 
15018
+ // src/models/gemma3/image_processing_gemma3.js
15019
+ var Gemma3ImageProcessor = class extends ImageProcessor {
15020
+ };
15021
+
15022
+ // src/models/qwen2_vl/image_processing_qwen2_vl.js
15023
+ var Qwen2VLImageProcessor = class extends ImageProcessor {
15024
+ constructor(config) {
15025
+ super(config);
15026
+ this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
15027
+ this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
15028
+ this.patch_size = config.patch_size;
15029
+ this.merge_size = config.merge_size;
15030
+ }
15031
+ /** @type {ImageProcessor['get_resize_output_image_size']} */
15032
+ get_resize_output_image_size(image, size) {
15033
+ const factor = this.patch_size * this.merge_size;
15034
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
15035
+ }
15036
+ async _call(images, ...args) {
15037
+ const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
15038
+ let patches = pixel_values;
15039
+ const { temporal_patch_size, merge_size, patch_size } = this.config;
15040
+ if (patches.dims[0] === 1) {
15041
+ patches = cat(
15042
+ Array.from({ length: temporal_patch_size }, () => patches),
15043
+ 0
15044
+ );
15045
+ }
15046
+ const grid_t = patches.dims[0] / temporal_patch_size;
15047
+ const channel = patches.dims[1];
15048
+ const grid_h = Math.floor(patches.dims[2] / patch_size);
15049
+ const grid_w = Math.floor(patches.dims[3] / patch_size);
15050
+ const flatten_patches = patches.view(
15051
+ grid_t,
15052
+ temporal_patch_size,
15053
+ channel,
15054
+ Math.floor(grid_h / merge_size),
15055
+ merge_size,
15056
+ patch_size,
15057
+ Math.floor(grid_w / merge_size),
15058
+ merge_size,
15059
+ patch_size
15060
+ ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
15061
+ const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
15062
+ return {
15063
+ pixel_values: flatten_patches,
15064
+ image_grid_thw,
15065
+ original_sizes,
15066
+ reshaped_input_sizes
15067
+ };
15068
+ }
15069
+ };
15070
+
15071
+ // src/models/glm46v/image_processing_glm46v.js
15072
+ var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
15073
+ /** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
15074
+ get_resize_output_image_size(image, size) {
15075
+ const factor = this.patch_size * this.merge_size;
15076
+ const temporal_factor = this.config.temporal_patch_size ?? 2;
15077
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
15078
+ }
15079
+ };
15080
+
15006
15081
  // src/models/glpn/image_processing_glpn.js
15007
15082
  var GLPNFeatureExtractor = class extends ImageProcessor {
15008
15083
  };
@@ -15396,7 +15471,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
15396
15471
  const img = pixel_values.unsqueeze_(0);
15397
15472
  const total_factor = this.encoder_patch_size * this.downsample_factor;
15398
15473
  const f2 = total_factor ** 2;
15399
- const [new_height, new_width] = smart_resize(
15474
+ const [new_width, new_height] = smart_resize(
15400
15475
  Math.max(total_factor, height),
15401
15476
  Math.max(total_factor, width),
15402
15477
  total_factor,
@@ -15686,55 +15761,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
15686
15761
  var PvtImageProcessor = class extends ImageProcessor {
15687
15762
  };
15688
15763
 
15689
- // src/models/qwen2_vl/image_processing_qwen2_vl.js
15690
- var Qwen2VLImageProcessor = class extends ImageProcessor {
15691
- constructor(config) {
15692
- super(config);
15693
- this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
15694
- this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
15695
- this.patch_size = config.patch_size;
15696
- this.merge_size = config.merge_size;
15697
- }
15698
- /** @type {ImageProcessor['get_resize_output_image_size']} */
15699
- get_resize_output_image_size(image, size) {
15700
- const factor = this.patch_size * this.merge_size;
15701
- return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
15702
- }
15703
- async _call(images, ...args) {
15704
- const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
15705
- let patches = pixel_values;
15706
- const { temporal_patch_size, merge_size, patch_size } = this.config;
15707
- if (patches.dims[0] === 1) {
15708
- patches = cat(
15709
- Array.from({ length: temporal_patch_size }, () => patches),
15710
- 0
15711
- );
15712
- }
15713
- const grid_t = patches.dims[0] / temporal_patch_size;
15714
- const channel = patches.dims[1];
15715
- const grid_h = Math.floor(patches.dims[2] / patch_size);
15716
- const grid_w = Math.floor(patches.dims[3] / patch_size);
15717
- const flatten_patches = patches.view(
15718
- grid_t,
15719
- temporal_patch_size,
15720
- channel,
15721
- Math.floor(grid_h / merge_size),
15722
- merge_size,
15723
- patch_size,
15724
- Math.floor(grid_w / merge_size),
15725
- merge_size,
15726
- patch_size
15727
- ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
15728
- const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
15729
- return {
15730
- pixel_values: flatten_patches,
15731
- image_grid_thw,
15732
- original_sizes,
15733
- reshaped_input_sizes
15734
- };
15735
- }
15736
- };
15737
-
15738
15764
  // src/models/rt_detr/image_processing_rt_detr.js
15739
15765
  var RTDetrImageProcessor = class extends ImageProcessor {
15740
15766
  /** @type {typeof post_process_object_detection} */
@@ -16216,6 +16242,48 @@ var Florence2Processor = class extends Processor {
16216
16242
  }
16217
16243
  };
16218
16244
 
16245
+ // src/models/gemma3/processing_gemma3.js
16246
+ var Gemma3Processor = class extends Processor {
16247
+ static tokenizer_class = AutoTokenizer;
16248
+ static image_processor_class = AutoImageProcessor;
16249
+ static uses_processor_config = true;
16250
+ static uses_chat_template_file = true;
16251
+ constructor(config, components, chat_template) {
16252
+ super(config, components, chat_template);
16253
+ this.image_seq_length = this.config.image_seq_length;
16254
+ const { boi_token, image_token, eoi_token } = this.tokenizer.config;
16255
+ this.boi_token = boi_token;
16256
+ this.image_token = image_token;
16257
+ this.eoi_token = eoi_token;
16258
+ const image_tokens_expanded = image_token.repeat(this.image_seq_length);
16259
+ this.full_image_sequence = `
16260
+
16261
+ ${boi_token}${image_tokens_expanded}${eoi_token}
16262
+
16263
+ `;
16264
+ }
16265
+ /**
16266
+ * @param {string|string[]} text
16267
+ * @param {import('../../utils/image.js').RawImage|import('../../utils/image.js').RawImage[]} [images]
16268
+ * @param {Object} [options]
16269
+ */
16270
+ async _call(text, images = null, options = {}) {
16271
+ if (typeof text === "string") {
16272
+ text = [text];
16273
+ }
16274
+ let image_inputs;
16275
+ if (images) {
16276
+ image_inputs = await this.image_processor(images, options);
16277
+ text = text.map((prompt) => prompt.replaceAll(this.boi_token, this.full_image_sequence));
16278
+ }
16279
+ const text_inputs = this.tokenizer(text, options);
16280
+ return {
16281
+ ...text_inputs,
16282
+ ...image_inputs
16283
+ };
16284
+ }
16285
+ };
16286
+
16219
16287
  // src/models/gemma3n/processing_gemma3n.js
16220
16288
  var Gemma3nProcessor = class extends Processor {
16221
16289
  static image_processor_class = AutoImageProcessor;
@@ -16288,6 +16356,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
16288
16356
  }
16289
16357
  };
16290
16358
 
16359
+ // src/models/qwen2_vl/processing_qwen2_vl.js
16360
+ var Qwen2VLProcessor = class extends Processor {
16361
+ static image_processor_class = AutoImageProcessor;
16362
+ static tokenizer_class = AutoTokenizer;
16363
+ static image_token = "<|image_pad|>";
16364
+ /**
16365
+ *
16366
+ * @param {string|string[]} text
16367
+ * @param {RawImage|RawImage[]} images
16368
+ * @param {...any} args
16369
+ * @returns {Promise<any>}
16370
+ */
16371
+ async _call(text, images = null, ...args) {
16372
+ if (!Array.isArray(text)) {
16373
+ text = [text];
16374
+ }
16375
+ let image_inputs, image_grid_thw;
16376
+ if (images) {
16377
+ image_inputs = await this.image_processor(images);
16378
+ image_grid_thw = image_inputs.image_grid_thw;
16379
+ }
16380
+ if (image_grid_thw) {
16381
+ let merge_length = this.image_processor.config.merge_size ** 2;
16382
+ let index = 0;
16383
+ const image_token = (
16384
+ /** @type {typeof Qwen2VLProcessor} */
16385
+ this.constructor.image_token
16386
+ );
16387
+ const image_grid_thw_list = image_grid_thw.tolist();
16388
+ text = text.map((t) => {
16389
+ while (t.includes(image_token)) {
16390
+ const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
16391
+ t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
16392
+ }
16393
+ return t.replaceAll("<|placeholder|>", image_token);
16394
+ });
16395
+ }
16396
+ const text_inputs = this.tokenizer(text);
16397
+ return {
16398
+ ...text_inputs,
16399
+ ...image_inputs
16400
+ };
16401
+ }
16402
+ };
16403
+
16404
+ // src/models/glm46v/processing_glm46v.js
16405
+ var Glm46VProcessor = class extends Qwen2VLProcessor {
16406
+ static image_token = "<|image|>";
16407
+ };
16408
+
16291
16409
  // src/models/granite_speech/processing_granite_speech.js
16292
16410
  var GraniteSpeechProcessor = class extends Processor {
16293
16411
  static tokenizer_class = AutoTokenizer;
@@ -17018,47 +17136,6 @@ var PyAnnoteProcessor = class extends Processor {
17018
17136
  }
17019
17137
  };
17020
17138
 
17021
- // src/models/qwen2_vl/processing_qwen2_vl.js
17022
- var Qwen2VLProcessor = class extends Processor {
17023
- static image_processor_class = AutoImageProcessor;
17024
- static tokenizer_class = AutoTokenizer;
17025
- /**
17026
- *
17027
- * @param {string|string[]} text
17028
- * @param {RawImage|RawImage[]} images
17029
- * @param {...any} args
17030
- * @returns {Promise<any>}
17031
- */
17032
- async _call(text, images = null, ...args) {
17033
- if (!Array.isArray(text)) {
17034
- text = [text];
17035
- }
17036
- let image_inputs, image_grid_thw;
17037
- if (images) {
17038
- image_inputs = await this.image_processor(images);
17039
- image_grid_thw = image_inputs.image_grid_thw;
17040
- }
17041
- if (image_grid_thw) {
17042
- let merge_length = this.image_processor.config.merge_size ** 2;
17043
- let index = 0;
17044
- const image_grid_thw_list = image_grid_thw.tolist();
17045
- text = text.map((t) => {
17046
- while (t.includes("<|image_pad|>")) {
17047
- const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
17048
- t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
17049
- }
17050
- return t.replaceAll("<|placeholder|>", "<|image_pad|>");
17051
- });
17052
- }
17053
- const text_inputs = this.tokenizer(text);
17054
- return {
17055
- ...text_inputs,
17056
- ...image_inputs
17057
- // TODO: ...videos_inputs,
17058
- };
17059
- }
17060
- };
17061
-
17062
17139
  // src/models/qwen2_5_vl/processing_qwen2_5_vl.js
17063
17140
  var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
17064
17141
  };
@@ -17402,6 +17479,8 @@ function getNormalizedConfig(config) {
17402
17479
  case "gemma3n":
17403
17480
  case "lfm2_vl":
17404
17481
  case "chatterbox":
17482
+ case "lighton_ocr":
17483
+ case "glm_ocr":
17405
17484
  case "mistral3":
17406
17485
  case "qwen2_5_vl":
17407
17486
  case "qwen3_vl":
@@ -17477,6 +17556,8 @@ function getNormalizedConfig(config) {
17477
17556
  mapping["dim_kv"] = "head_dim";
17478
17557
  break;
17479
17558
  case "qwen3":
17559
+ case "solar_open":
17560
+ case "glm_ocr_text":
17480
17561
  case "gemma":
17481
17562
  case "gemma2":
17482
17563
  case "vaultgemma":
@@ -17487,6 +17568,7 @@ function getNormalizedConfig(config) {
17487
17568
  case "ernie4_5":
17488
17569
  case "hunyuan_v1_dense":
17489
17570
  case "falcon_h1":
17571
+ case "nemotron_h":
17490
17572
  case "ministral":
17491
17573
  case "ministral3":
17492
17574
  mapping["num_heads"] = "num_key_value_heads";
@@ -17521,6 +17603,9 @@ function getNormalizedConfig(config) {
17521
17603
  mapping["num_attention_heads"] = "num_attention_heads";
17522
17604
  break;
17523
17605
  case "youtu":
17606
+ case "deepseek_v3":
17607
+ case "glm_moe_dsa":
17608
+ case "mistral4":
17524
17609
  mapping["num_heads"] = "num_key_value_heads";
17525
17610
  mapping["num_layers"] = "num_hidden_layers";
17526
17611
  mapping["dim_kv"] = "qk_head_dim";
@@ -17609,6 +17694,7 @@ function getCacheShapes(config, options) {
17609
17694
  if (!(config instanceof PretrainedConfig)) {
17610
17695
  config = new PretrainedConfig(config);
17611
17696
  }
17697
+ const batch_size = options?.batch_size ?? 1;
17612
17698
  if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
17613
17699
  const pkv_prefix = options?.prefix ?? "past_key_values";
17614
17700
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
@@ -17618,7 +17704,6 @@ function getCacheShapes(config, options) {
17618
17704
  config
17619
17705
  );
17620
17706
  const head_dim = hidden_size / num_attention_heads;
17621
- const batch_size = options?.batch_size ?? 1;
17622
17707
  for (let i = 0; i < layer_types.length; ++i) {
17623
17708
  if (layer_types[i] === "full_attention") {
17624
17709
  for (const kv of ["key", "value"]) {
@@ -17631,31 +17716,26 @@ function getCacheShapes(config, options) {
17631
17716
  }
17632
17717
  }
17633
17718
  return cache_values;
17634
- } else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
17719
+ } else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
17635
17720
  const pkv_prefix = options?.prefix ?? "past_key_values";
17636
17721
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
17637
- const cache_values = {};
17638
- const {
17639
- layer_types,
17640
- num_hidden_layers,
17641
- num_attention_heads,
17642
- num_key_value_heads,
17643
- hidden_size,
17644
- mamba_d_conv,
17645
- mamba_n_heads,
17646
- mamba_d_head,
17647
- mamba_d_state,
17648
- mamba_n_groups,
17649
- mamba_expand,
17650
- mamba_d_ssm
17651
- } = (
17722
+ const c = (
17652
17723
  /** @type {any} */
17653
17724
  config
17654
17725
  );
17655
- const head_dim = hidden_size / num_attention_heads;
17656
- const batch_size = options?.batch_size ?? 1;
17657
- const conv_d_inner = (mamba_d_ssm ?? mamba_expand * hidden_size) + 2 * mamba_n_groups * mamba_d_state;
17658
- for (let i = 0; i < num_hidden_layers; ++i) {
17726
+ const layer_types = c.layer_types ?? c.layers_block_type;
17727
+ const num_layers = c.num_hidden_layers ?? layer_types?.length;
17728
+ const num_key_value_heads = c.num_key_value_heads;
17729
+ const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
17730
+ const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
17731
+ const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
17732
+ const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
17733
+ const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
17734
+ const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
17735
+ const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
17736
+ const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
17737
+ const cache_values = {};
17738
+ for (let i = 0; i < num_layers; ++i) {
17659
17739
  if (!layer_types || layer_types[i] === "mamba") {
17660
17740
  cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
17661
17741
  cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
@@ -17689,7 +17769,6 @@ function getCacheShapes(config, options) {
17689
17769
  const key_dim = linear_key_head_dim * linear_num_key_heads;
17690
17770
  const value_dim = linear_value_head_dim * linear_num_value_heads;
17691
17771
  const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
17692
- const batch_size = options?.batch_size ?? 1;
17693
17772
  for (let i = 0; i < layer_types.length; ++i) {
17694
17773
  if (layer_types[i] === "full_attention") {
17695
17774
  for (const kv of ["key", "value"]) {
@@ -19385,8 +19464,7 @@ var MODEL_TYPES = {
19385
19464
  ImageAudioTextToText: 13,
19386
19465
  Supertonic: 14,
19387
19466
  Chatterbox: 15,
19388
- MultimodalLanguageModelOnly: 16,
19389
- VoxtralRealtime: 17
19467
+ VoxtralRealtime: 16
19390
19468
  };
19391
19469
  var MODEL_TYPE_CONFIG = {
19392
19470
  [MODEL_TYPES.DecoderOnly]: {
@@ -19443,12 +19521,12 @@ var MODEL_TYPE_CONFIG = {
19443
19521
  can_generate: true,
19444
19522
  forward: image_text_to_text_forward,
19445
19523
  prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
19446
- sessions: (config) => {
19524
+ sessions: (config, options, textOnly) => {
19447
19525
  const s = {
19448
19526
  embed_tokens: "embed_tokens",
19449
- vision_encoder: "vision_encoder",
19450
19527
  decoder_model_merged: "decoder_model_merged"
19451
19528
  };
19529
+ if (!textOnly) s["vision_encoder"] = "vision_encoder";
19452
19530
  if (config.is_encoder_decoder) s["model"] = "encoder_model";
19453
19531
  return s;
19454
19532
  },
@@ -19470,12 +19548,17 @@ var MODEL_TYPE_CONFIG = {
19470
19548
  [MODEL_TYPES.ImageAudioTextToText]: {
19471
19549
  can_generate: true,
19472
19550
  prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
19473
- sessions: () => ({
19474
- embed_tokens: "embed_tokens",
19475
- audio_encoder: "audio_encoder",
19476
- vision_encoder: "vision_encoder",
19477
- decoder_model_merged: "decoder_model_merged"
19478
- }),
19551
+ sessions: (config, options, textOnly) => {
19552
+ const s = {
19553
+ embed_tokens: "embed_tokens",
19554
+ decoder_model_merged: "decoder_model_merged"
19555
+ };
19556
+ if (!textOnly) {
19557
+ s["audio_encoder"] = "audio_encoder";
19558
+ s["vision_encoder"] = "vision_encoder";
19559
+ }
19560
+ return s;
19561
+ },
19479
19562
  optional_configs: { generation_config: "generation_config.json" }
19480
19563
  },
19481
19564
  [MODEL_TYPES.Phi3V]: {
@@ -19526,14 +19609,6 @@ var MODEL_TYPE_CONFIG = {
19526
19609
  cache_sessions: { model: true },
19527
19610
  optional_configs: { generation_config: "generation_config.json" }
19528
19611
  },
19529
- [MODEL_TYPES.MultimodalLanguageModelOnly]: {
19530
- can_generate: true,
19531
- forward: image_text_to_text_forward,
19532
- prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
19533
- sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
19534
- cache_sessions: { decoder_model_merged: true },
19535
- optional_configs: { generation_config: "generation_config.json" }
19536
- },
19537
19612
  [MODEL_TYPES.VoxtralRealtime]: {
19538
19613
  can_generate: true,
19539
19614
  prepare_inputs: decoder_prepare_inputs_for_generation,
@@ -19559,6 +19634,19 @@ function getSessionsConfig(modelType, config, options = {}) {
19559
19634
  optional_configs: typeConfig.optional_configs
19560
19635
  };
19561
19636
  }
19637
/**
 * Resolve the MODEL_TYPE_CONFIG entry to use for a model class.
 *
 * The type registered for `modelName` is used by default. When the class is a
 * `*ForCausalLM` but the checkpoint's first declared architecture is a different,
 * registered `*ForConditionalGeneration` class, the native architecture's type is
 * used instead and the result is flagged as text-only.
 *
 * @param {string|undefined} modelName Name registered for the model class.
 * @param {Object} [config] Model config; only `architectures[0]` is read.
 * @returns {{typeConfig: Object, textOnly: boolean, modelType: (number|undefined)}}
 * `typeConfig` falls back to `MODEL_TYPE_CONFIG.default` when the type has no entry.
 */
function resolveTypeConfig(modelName, config) {
  const describe = (modelType, textOnly) => ({
    typeConfig: MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default,
    textOnly,
    modelType
  });

  const registeredType = MODEL_TYPE_MAPPING.get(modelName);
  const nativeArch = config?.architectures?.[0];

  // Only a CausalLM class loaded from a ConditionalGeneration checkpoint qualifies.
  const isTextOnlyCandidate = nativeArch && nativeArch !== modelName && modelName?.endsWith("ForCausalLM") && nativeArch.endsWith("ForConditionalGeneration");
  if (!isTextOnlyCandidate) {
    return describe(registeredType, false);
  }

  // An unregistered native architecture leaves the registered type in effect.
  const nativeType = MODEL_TYPE_MAPPING.get(nativeArch);
  return nativeType === void 0 ? describe(registeredType, false) : describe(nativeType, true);
}
19562
19650
  var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
19563
19651
  var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
19564
19652
  var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
@@ -19578,8 +19666,7 @@ var PreTrainedModel = class extends Callable2 {
19578
19666
  this.sessions = sessions;
19579
19667
  this.configs = configs;
19580
19668
  const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor);
19581
- const modelType = MODEL_TYPE_MAPPING.get(modelName);
19582
- const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
19669
+ const { typeConfig } = resolveTypeConfig(modelName, config);
19583
19670
  this.can_generate = typeConfig.can_generate;
19584
19671
  this._forward = typeConfig.forward;
19585
19672
  this._prepare_inputs_for_generation = typeConfig.prepare_inputs;
@@ -19642,9 +19729,8 @@ var PreTrainedModel = class extends Callable2 {
19642
19729
  session_options
19643
19730
  };
19644
19731
  const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
19645
- const modelType = MODEL_TYPE_MAPPING.get(modelName);
19646
19732
  config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
19647
- const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
19733
+ const { typeConfig, textOnly, modelType } = resolveTypeConfig(modelName, config);
19648
19734
  if (modelType === void 0) {
19649
19735
  const type = modelName ?? config?.model_type;
19650
19736
  if (type !== "custom") {
@@ -19653,7 +19739,7 @@ var PreTrainedModel = class extends Callable2 {
19653
19739
  );
19654
19740
  }
19655
19741
  }
19656
- const sessions = typeConfig.sessions(config, options);
19742
+ const sessions = typeConfig.sessions(config, options, textOnly);
19657
19743
  const promises = [
19658
19744
  constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
19659
19745
  ];
@@ -20317,7 +20403,9 @@ async function generic_text_to_text_forward(self2, {
20317
20403
  "qwen3_5",
20318
20404
  "qwen3_5_text",
20319
20405
  "qwen3_5_moe",
20320
- "qwen3_5_moe_text"
20406
+ "qwen3_5_moe_text",
20407
+ "glm_ocr",
20408
+ "glm_ocr_text"
20321
20409
  ].includes(self2.config.model_type)
20322
20410
  ) {
20323
20411
  const { image_grid_thw, video_grid_thw } = kwargs;
@@ -20541,6 +20629,8 @@ __export(models_exports, {
20541
20629
  BloomForCausalLM: () => BloomForCausalLM,
20542
20630
  BloomModel: () => BloomModel,
20543
20631
  BloomPreTrainedModel: () => BloomPreTrainedModel,
20632
+ CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
20633
+ CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
20544
20634
  CLIPModel: () => CLIPModel,
20545
20635
  CLIPPreTrainedModel: () => CLIPPreTrainedModel,
20546
20636
  CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
@@ -20615,6 +20705,9 @@ __export(models_exports, {
20615
20705
  DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
20616
20706
  DecisionTransformerModel: () => DecisionTransformerModel,
20617
20707
  DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
20708
+ DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
20709
+ DeepseekV3Model: () => DeepseekV3Model,
20710
+ DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
20618
20711
  DeiTForImageClassification: () => DeiTForImageClassification,
20619
20712
  DeiTModel: () => DeiTModel,
20620
20713
  DeiTPreTrainedModel: () => DeiTPreTrainedModel,
@@ -20660,6 +20753,11 @@ __export(models_exports, {
20660
20753
  EsmForTokenClassification: () => EsmForTokenClassification,
20661
20754
  EsmModel: () => EsmModel,
20662
20755
  EsmPreTrainedModel: () => EsmPreTrainedModel,
20756
+ EuroBertForMaskedLM: () => EuroBertForMaskedLM,
20757
+ EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
20758
+ EuroBertForTokenClassification: () => EuroBertForTokenClassification,
20759
+ EuroBertModel: () => EuroBertModel,
20760
+ EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
20663
20761
  ExaoneForCausalLM: () => ExaoneForCausalLM,
20664
20762
  ExaoneModel: () => ExaoneModel,
20665
20763
  ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -20696,6 +20794,7 @@ __export(models_exports, {
20696
20794
  Gemma2Model: () => Gemma2Model,
20697
20795
  Gemma2PreTrainedModel: () => Gemma2PreTrainedModel,
20698
20796
  Gemma3ForCausalLM: () => Gemma3ForCausalLM,
20797
+ Gemma3ForConditionalGeneration: () => Gemma3ForConditionalGeneration,
20699
20798
  Gemma3Model: () => Gemma3Model,
20700
20799
  Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
20701
20800
  Gemma3nForCausalLM: () => Gemma3nForCausalLM,
@@ -20706,6 +20805,10 @@ __export(models_exports, {
20706
20805
  GemmaPreTrainedModel: () => GemmaPreTrainedModel,
20707
20806
  GlmForCausalLM: () => GlmForCausalLM,
20708
20807
  GlmModel: () => GlmModel,
20808
+ GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
20809
+ GlmMoeDsaModel: () => GlmMoeDsaModel,
20810
+ GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
20811
+ GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
20709
20812
  GlmPreTrainedModel: () => GlmPreTrainedModel,
20710
20813
  GptOssForCausalLM: () => GptOssForCausalLM,
20711
20814
  GptOssModel: () => GptOssModel,
@@ -20752,6 +20855,7 @@ __export(models_exports, {
20752
20855
  Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
20753
20856
  Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
20754
20857
  Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
20858
+ LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
20755
20859
  LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
20756
20860
  Llama4ForCausalLM: () => Llama4ForCausalLM,
20757
20861
  Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -20801,6 +20905,9 @@ __export(models_exports, {
20801
20905
  MimiEncoderOutput: () => MimiEncoderOutput,
20802
20906
  MimiModel: () => MimiModel,
20803
20907
  MimiPreTrainedModel: () => MimiPreTrainedModel,
20908
+ Mistral4ForCausalLM: () => Mistral4ForCausalLM,
20909
+ Mistral4Model: () => Mistral4Model,
20910
+ Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
20804
20911
  MistralForCausalLM: () => MistralForCausalLM,
20805
20912
  MistralModel: () => MistralModel,
20806
20913
  MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -20858,6 +20965,9 @@ __export(models_exports, {
20858
20965
  NanoChatForCausalLM: () => NanoChatForCausalLM,
20859
20966
  NanoChatModel: () => NanoChatModel,
20860
20967
  NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
20968
+ NemotronHForCausalLM: () => NemotronHForCausalLM,
20969
+ NemotronHModel: () => NemotronHModel,
20970
+ NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
20861
20971
  NeoBertForMaskedLM: () => NeoBertForMaskedLM,
20862
20972
  NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
20863
20973
  NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -20995,6 +21105,9 @@ __export(models_exports, {
20995
21105
  SnacEncoderModel: () => SnacEncoderModel,
20996
21106
  SnacModel: () => SnacModel,
20997
21107
  SnacPreTrainedModel: () => SnacPreTrainedModel,
21108
+ SolarOpenForCausalLM: () => SolarOpenForCausalLM,
21109
+ SolarOpenModel: () => SolarOpenModel,
21110
+ SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
20998
21111
  SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
20999
21112
  SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
21000
21113
  SpeechT5HifiGan: () => SpeechT5HifiGan,
@@ -21169,7 +21282,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
21169
21282
  var ArceeForCausalLM = class extends ArceePreTrainedModel {
21170
21283
  };
21171
21284
 
21172
- // src/models/ast/modeling_ast.js
21285
+ // src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
21173
21286
  var ASTPreTrainedModel = class extends PreTrainedModel {
21174
21287
  };
21175
21288
  var ASTModel = class extends ASTPreTrainedModel {
@@ -21504,6 +21617,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
21504
21617
  var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
21505
21618
  };
21506
21619
 
21620
+ // src/models/chmv2/modeling_chmv2.js
21621
/** Common base for the CHMv2 model classes; adds nothing on top of PreTrainedModel. */
var CHMv2PreTrainedModel = class extends PreTrainedModel {};

/** CHMv2 model class exported as the depth-estimation variant; defines no overrides. */
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {};
21625
+
21507
21626
  // src/models/clap/modeling_clap.js
21508
21627
  var ClapPreTrainedModel = class extends PreTrainedModel {
21509
21628
  };
@@ -21842,7 +21961,15 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
21842
21961
  }
21843
21962
  };
21844
21963
 
21845
- // src/models/deberta_v2/modeling_deberta_v2.js
21964
+ // src/models/deepseek_v3/modeling_deepseek_v3.js
21965
/** Common base for the DeepseekV3 model classes; adds nothing on top of PreTrainedModel. */
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {};

/** DeepseekV3 base-model class; defines no overrides. */
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {};

/** DeepseekV3 causal-LM class; defines no overrides. */
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {};
21971
+
21972
+ // src/models/deberta_v2/modeling_deberta_v2.js
21846
21973
  var DebertaV2PreTrainedModel = class extends PreTrainedModel {
21847
21974
  };
21848
21975
  var DebertaV2Model = class extends DebertaV2PreTrainedModel {
@@ -22190,6 +22317,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
22190
22317
  }
22191
22318
  };
22192
22319
 
22320
+ // src/models/eurobert/modeling_eurobert.js
22321
/** Common base for the EuroBert model classes; adds nothing on top of PreTrainedModel. */
var EuroBertPreTrainedModel = class extends PreTrainedModel {};

/** EuroBert base-model class; defines no overrides. */
var EuroBertModel = class extends EuroBertPreTrainedModel {};
22325
/** EuroBert class whose call result is wrapped in a MaskedLMOutput. */
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps its result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<MaskedLMOutput>} The inherited call's output wrapped for masked language modeling.
   */
  async _call(model_inputs) {
    const raw_outputs = await super._call(model_inputs);
    return new MaskedLMOutput(raw_outputs);
  }
};
22336
/** EuroBert class whose call result is wrapped in a SequenceClassifierOutput. */
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps its result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<SequenceClassifierOutput>} The inherited call's output wrapped for sequence classification.
   */
  async _call(model_inputs) {
    const raw_outputs = await super._call(model_inputs);
    return new SequenceClassifierOutput(raw_outputs);
  }
};
22347
/** EuroBert class whose call result is wrapped in a TokenClassifierOutput. */
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps its result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<TokenClassifierOutput>} The inherited call's output wrapped for token classification.
   */
  async _call(model_inputs) {
    const raw_outputs = await super._call(model_inputs);
    return new TokenClassifierOutput(raw_outputs);
  }
};
22358
+
22193
22359
  // src/models/exaone/modeling_exaone.js
22194
22360
  var ExaonePreTrainedModel = class extends PreTrainedModel {
22195
22361
  };
@@ -22347,12 +22513,35 @@ var Gemma2Model = class extends Gemma2PreTrainedModel {
22347
22513
  var Gemma2ForCausalLM = class extends Gemma2PreTrainedModel {
22348
22514
  };
22349
22515
 
22516
+ // src/models/llava/modeling_llava.js
22517
/** Common base for the Llava-style model classes; declares the forward parameter names. */
var LlavaPreTrainedModel = class extends PreTrainedModel {
  /** Names of the inputs this model family forwards. */
  forward_params = [
    "input_ids",
    "attention_mask",
    "pixel_values",
    "position_ids",
    "past_key_values"
  ];
};
22520
/** Llava conditional-generation class; also used as a base by other vision-language classes in this file. */
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
  /**
   * Flattens the image features to two dimensions (keeping the last dimension)
   * and delegates to `default_merge_input_ids_with_image_features`.
   *
   * @param {Object} kwargs Merge arguments; must contain `image_features`.
   * @returns {*} Whatever the default merge helper returns.
   */
  _merge_input_ids_with_image_features(kwargs) {
    const { image_features } = kwargs;
    const feature_dim = image_features.dims.at(-1);
    const flat_image_features = image_features.view(-1, feature_dim);

    const merge_args = {
      // Config default; a caller-supplied `image_token_id` in kwargs takes precedence.
      // @ts-ignore
      image_token_id: this.config.image_token_index ?? this.config.image_token_id,
      ...kwargs,
      // Always use the flattened features, even though kwargs carries the original.
      image_features: flat_image_features
    };
    return default_merge_input_ids_with_image_features(merge_args);
  }
};
22532
/** Moondream1 class; reuses LlavaForConditionalGeneration without overrides. */
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {};

/** LlavaQwen2 causal-LM class; reuses LlavaForConditionalGeneration without overrides. */
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {};
22536
+
22350
22537
  // src/models/gemma3/modeling_gemma3.js
22351
22538
  var Gemma3PreTrainedModel = class extends PreTrainedModel {
22352
22539
  };
22353
22540
  var Gemma3Model = class extends Gemma3PreTrainedModel {
22354
22541
  };
22355
- var Gemma3ForCausalLM = class extends Gemma3PreTrainedModel {
22542
/** Gemma3 conditional-generation class; reuses LlavaForConditionalGeneration without overrides. */
var Gemma3ForConditionalGeneration = class extends LlavaForConditionalGeneration {};

/** Gemma3 causal-LM class; derives from the conditional-generation class and defines no overrides. */
var Gemma3ForCausalLM = class extends Gemma3ForConditionalGeneration {};
22357
22546
 
22358
22547
  // src/models/gemma3n/modeling_gemma3n.js
@@ -22465,6 +22654,382 @@ var GlmModel = class extends GlmPreTrainedModel {
22465
22654
  var GlmForCausalLM = class extends GlmPreTrainedModel {
22466
22655
  };
22467
22656
 
22657
+ // src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
22658
/** Common base for the GlmMoeDsa model classes; adds nothing on top of PreTrainedModel. */
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {};

/** GlmMoeDsa base-model class; defines no overrides. */
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {};

/** GlmMoeDsa causal-LM class; defines no overrides. */
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {};
22664
+
22665
+ // src/models/qwen2_vl/modeling_qwen2_vl.js
22666
/** Common base for the Qwen2-VL model classes; declares the forward parameter names. */
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
  /** Names of the inputs this model family forwards: text inputs first, then vision inputs. */
  forward_params = [
    // Text inputs
    "input_ids", "attention_mask", "position_ids", "past_key_values",
    // Vision inputs
    "pixel_values", "image_grid_thw"
  ];
};
22678
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
  // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
  // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class. When such a class is
  // loaded from a checkpoint whose native architecture is a registered
  // `*ForConditionalGeneration` class, `resolveTypeConfig` flags the load as text-only and
  // passes that flag to the model type's `sessions` builder.

  /** Name of the vision-encoder input that receives `image_grid_thw` (see `encode_image`). */
  image_grid_thw_name = "grid_thw";

  /**
   * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
   * @param {Tensor} input_ids Only read (for its dims) when `attention_mask` is not given.
   * @param {Tensor} attention_mask
   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
   */
  _get_text_only_rope_index(input_ids, attention_mask) {
    if (attention_mask) {
      // NOTE(review): cumsum_masked_fill is defined elsewhere; presumably it yields
      // per-row cumulative positions with masked entries filled in — confirm.
      const { data, dims } = cumsum_masked_fill(attention_mask);
      // Repeat the [batch, seq] positions once for each of the three rope dimensions.
      const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
      // delta = (largest position + 1) - sequence length. This is the same formula the
      // multimodal branch of `get_rope_index` uses, so that `past_length + delta` is the
      // position of the next generated token.
      const mrope_position_deltas = Array.from(
        { length: dims[0] },
        (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n - BigInt(dims[1])
      );
      return [
        new Tensor2("int64", position_ids, [3, ...dims]),
        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
      ];
    } else {
      const [batch_size, seq_length] = input_ids.dims;
      // The flat buffer is laid out as [3, batch, seq]; the position of an element is its
      // index along the last axis, independent of the batch size.
      const position_ids = BigInt64Array.from(
        { length: 3 * batch_size * seq_length },
        (_, i) => BigInt(i % seq_length)
      );
      return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
    }
  }

  /**
   * Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
   * global [all_t, all_h, all_w] order, then write back into the position_ids array
   * respecting attention mask.
   * @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
   * @param {number[]} attn_mask Attention mask for this batch element
   * @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
   * @param {number} batch_idx Current batch index
   * @returns {number[]} Flat reordered positions of length total_len
   */
  _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
    const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
    const llm_positions = new Array(total_len);
    let index = 0;
    // For each rope dimension, concatenate that dimension's slice of every segment.
    for (let x = 0; x < 3; ++x) {
      for (const val of llm_pos_ids_list) {
        const seg_len = val.length / 3;
        for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
          llm_positions[index++] = val[z];
        }
      }
    }
    // Scatter the positions into the unmasked slots only; masked slots keep their value.
    let count2 = 0;
    for (let y = 0; y < attn_mask.length; ++y) {
      // Loose equality on purpose: mask entries may be BigInt (1n) or Number (1).
      if (attn_mask[y] == 1) {
        for (let x = 0; x < 3; ++x) {
          position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
        }
        ++count2;
      }
    }
    return llm_positions;
  }

  /**
   * Build per-batch position ID segments for multimodal rope.
   * Override this in subclasses to change how vision/text segments are identified and positioned.
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id, video_token_id, vision_start_token_id } = this.config;
    const ids = filtered_ids;
    // Loose equality throughout: token ids may be BigInt while config ids are Numbers.
    const vision_start_indices = ids.reduce((acc, x, idx) => {
      if (x == vision_start_token_id) acc.push(idx);
      return acc;
    }, []);
    // The token right after each vision-start marker tells whether it opens an image or a video.
    const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
    const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
    const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
    const llm_pos_ids_list = [];
    let st = 0;
    let remain_images = image_nums;
    let remain_videos = video_nums;
    for (let j = 0; j < vision_tokens.length; ++j) {
      const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id);
      const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id);
      // `ids.length + 1` is a sentinel meaning "no further token of this kind".
      const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
      const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
      let ed;
      let t, h, w;
      // Whichever vision segment starts first is consumed next.
      if (ed_image < ed_video) {
        [t, h, w] = image_grid_thw_list[state.image_index];
        ++state.image_index;
        --remain_images;
        ed = ed_image;
      } else {
        [t, h, w] = video_grid_thw_list[state.video_index];
        ++state.video_index;
        --remain_videos;
        ed = ed_video;
      }
      const [llm_grid_t, llm_grid_h, llm_grid_w] = [
        Number(t),
        Math.floor(Number(h) / spatial_merge_size),
        Math.floor(Number(w) / spatial_merge_size)
      ];
      // Text preceding this vision segment: the same 1D positions on all three dimensions.
      const text_len = ed - st;
      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
      // Vision segment: separate temporal / height / width indices, offset past the text.
      const offset = text_len + st_idx;
      const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
      const t_index = Array.from(
        { length: grid_size },
        (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
      );
      const h_index = Array.from(
        { length: grid_size },
        (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
      );
      const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
      llm_pos_ids_list.push([t_index, h_index, w_index].flat());
      st = ed + grid_size;
    }
    // Trailing text after the last vision segment.
    if (st < ids.length) {
      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
      const text_len = ids.length - st;
      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
    }
    return llm_pos_ids_list;
  }

  /**
   * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
   *
   * Explanation:
   * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
   *
   * For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
   * Examples:
   * input_ids: [T T T T T], here T is for text.
   * temporal position_ids: [0, 1, 2, 3, 4]
   * height position_ids: [0, 1, 2, 3, 4]
   * width position_ids: [0, 1, 2, 3, 4]
   *
   * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
   * and 1D rotary position embedding for text part.
   * Examples:
   * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
   * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
   * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
   * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
   * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
   * text temporal position_ids: [3, 4, 5, 6, 7]
   * text height position_ids: [3, 4, 5, 6, 7]
   * text width position_ids: [3, 4, 5, 6, 7]
   * Here we calculate the text start position_ids as the max vision position_ids plus 1.
   *
   * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
   * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
   * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
   * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
   */
  get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
    const { vision_config } = this.config;
    const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
    if (image_grid_thw || video_grid_thw) {
      const total_input_ids = input_ids.tolist();
      if (!attention_mask) {
        attention_mask = ones_like(input_ids);
      }
      const attention_mask_list = attention_mask.tolist();
      // [3][batch][seq] scratch array, zero-initialised.
      const position_ids_list = Array.from(
        { length: 3 },
        () => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
      );
      const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
      const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
      // Counters are shared across batch elements because the grid lists are flat.
      const state = { image_index: 0, video_index: 0 };
      const mrope_position_deltas = [];
      for (let i = 0; i < total_input_ids.length; ++i) {
        const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
        const llm_pos_ids_list = this._get_multimodal_rope_positions({
          filtered_ids,
          image_grid_thw_list,
          video_grid_thw_list,
          spatial_merge_size,
          state
        });
        const llm_positions = this._reorder_and_write_positions(
          llm_pos_ids_list,
          attention_mask_list[i],
          position_ids_list,
          i
        );
        mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
      }
      return [
        new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
      ];
    } else {
      return this._get_text_only_rope_index(input_ids, attention_mask);
    }
  }

  /**
   * Run the vision encoder session on the given pixel values.
   * The grid tensor is passed under the input name stored in `image_grid_thw_name`.
   * @param {Object} inputs
   * @param {Tensor} inputs.pixel_values
   * @param {Tensor} inputs.image_grid_thw
   * @returns {Promise<Tensor>} The `image_features` output of the vision encoder.
   */
  async encode_image({ pixel_values, image_grid_thw }) {
    const features = (await sessionRun(this.sessions["vision_encoder"], {
      pixel_values,
      [this.image_grid_thw_name]: image_grid_thw
    })).image_features;
    return features;
  }

  /**
   * Delegate to `default_merge_input_ids_with_image_features`, supplying the configured
   * image token id as a default that `kwargs` may override.
   * @param {Object} kwargs
   */
  _merge_input_ids_with_image_features(kwargs) {
    return default_merge_input_ids_with_image_features({
      // @ts-ignore
      image_token_id: this.config.image_token_id,
      ...kwargs
    });
  }

  /**
   * Fill in `position_ids` (and `rope_deltas`) on `model_inputs` for the next forward pass.
   * Inputs are returned untouched when there is no attention mask, when position ids were
   * already supplied, or when the decoder session has no `position_ids` input.
   * @param {*} input_ids Unused here; the ids are read from `model_inputs.input_ids`.
   * @param {Object} model_inputs Mutated in place and returned.
   * @param {*} generation_config Unused here.
   * @returns {Object} The same `model_inputs` object.
   */
  prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
    if (!model_inputs.attention_mask || model_inputs.position_ids) {
      return model_inputs;
    }
    const session = this.sessions["decoder_model_merged"] ?? this.sessions["model"];
    if (!session.inputNames.includes("position_ids")) {
      return model_inputs;
    }
    if (!model_inputs.past_key_values) {
      // First pass: compute positions for the whole prompt.
      [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
        model_inputs.input_ids,
        model_inputs.image_grid_thw,
        model_inputs.video_grid_thw,
        model_inputs.attention_mask
      );
    } else {
      // With a cache present the pixel values are dropped.
      model_inputs.pixel_values = null;
      const past_length = model_inputs.past_key_values.get_seq_length();
      if (past_length < model_inputs.input_ids.dims[1]) {
        // input_ids still holds already-cached tokens: compute full positions, then keep
        // only the part beyond the cached length.
        const [full_position_ids, rope_deltas] = this.get_rope_index(
          model_inputs.input_ids,
          model_inputs.image_grid_thw,
          model_inputs.video_grid_thw,
          model_inputs.attention_mask
        );
        model_inputs.rope_deltas = rope_deltas;
        model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
        model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
      } else {
        if (!model_inputs.rope_deltas) {
          [, model_inputs.rope_deltas] = this.get_rope_index(
            model_inputs.input_ids,
            model_inputs.image_grid_thw,
            model_inputs.video_grid_thw,
            model_inputs.attention_mask
          );
        }
        // Next position = cached length + per-sequence delta, identical on all three dims.
        const delta = BigInt(past_length);
        const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
        model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
      }
    }
    return model_inputs;
  }
};
22954
/** Qwen2-VL causal-LM class; derives from Qwen2VLForConditionalGeneration and defines no overrides. */
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {};
22956
+
22957
+ // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
22958
/** Qwen2.5-VL conditional-generation class; only changes the grid input name to "image_grid_thw". */
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
  /** Overrides the inherited "grid_thw" input name. */
  image_grid_thw_name = "image_grid_thw";
};

/** Qwen2.5-VL causal-LM class; only changes the grid input name to "image_grid_thw". */
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
  /** Overrides the inherited "grid_thw" input name. */
  image_grid_thw_name = "image_grid_thw";
};
22964
+
22965
+ // src/models/glm_ocr/modeling_glm_ocr.js
22966
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
  /**
   * Compute 3D positional indices for vision tokens.
   * Temporal is constant, height is repeat-interleaved, width tiles.
   * @param {number} start_position
   * @param {number[]} grid_thw [T, H, W]
   * @param {number} temp_merge_size
   * @param {number} spatial_merge_size
   * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
   */
  get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
    const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
    const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
    const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
    const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
    const t_pos = Array.from({ length: seq_len }, () => start_position);
    const h_pos = Array.from(
      { length: seq_len },
      (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
    );
    const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
    return [...t_pos, ...h_pos, ...w_pos];
  }

  /**
   * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
   * instead of vision_start_token_id scanning used by Qwen2VL.
   * After a vision segment, position advances by floor(max(h, w) / spatial_merge_size).
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - unused by this override
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters; only `image_index` is advanced here
   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id } = this.config;
    // Split the sequence into maximal runs of image tokens (type 1) and other tokens (type 0).
    // Loose equality on purpose: token ids may be BigInt while the config id is a Number.
    const groups = [];
    let group_start = 0;
    let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
    for (let j = 1; j <= filtered_ids.length; ++j) {
      // Type -1 past the end forces the final run to be flushed.
      const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
      if (t !== current_type) {
        groups.push([current_type, group_start, j]);
        group_start = j;
        current_type = t;
      }
    }
    let current_pos = 0;
    const llm_pos_ids_list = [];
    for (const [modality_type, start_idx, end_idx] of groups) {
      if (modality_type === 0) {
        // Text run: the same 1D positions on all three dimensions.
        const text_len = end_idx - start_idx;
        llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
        current_pos += text_len;
      } else {
        // Image run: consume the next grid entry.
        const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
        // Dividing T by itself makes the temporal extent of the grid 1.
        const temp_merge_size = grid_thw[0];
        llm_pos_ids_list.push(
          this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
        );
        // Floor so positions stay integral when the larger side is not a multiple of the
        // merge size, consistent with the flooring in get_vision_position_ids.
        current_pos += Math.floor(Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size);
      }
    }
    return llm_pos_ids_list;
  }
};
23032
+
22468
23033
  // src/models/glpn/modeling_glpn.js
22469
23034
  var GLPNPreTrainedModel = class extends PreTrainedModel {
22470
23035
  };
@@ -22663,27 +23228,6 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
22663
23228
  var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
22664
23229
  };
22665
23230
 
22666
- // src/models/llava/modeling_llava.js
22667
- var LlavaPreTrainedModel = class extends PreTrainedModel {
22668
- forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
22669
- };
22670
- var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
22671
- _merge_input_ids_with_image_features(kwargs) {
22672
- const vision_hidden_size = kwargs.image_features.dims.at(-1);
22673
- const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
22674
- return default_merge_input_ids_with_image_features({
22675
- // @ts-ignore
22676
- image_token_id: this.config.image_token_index ?? this.config.image_token_id,
22677
- ...kwargs,
22678
- image_features: reshaped_image_hidden_states
22679
- });
22680
- }
22681
- };
22682
- var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
22683
- };
22684
- var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
22685
- };
22686
-
22687
23231
  // src/models/idefics3/modeling_idefics3.js
22688
23232
  var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
22689
23233
  forward_params = [
@@ -22777,6 +23321,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
22777
23321
  var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
22778
23322
  };
22779
23323
 
23324
+ // src/models/lighton_ocr/modeling_lighton_ocr.js
23325
+ var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
23326
+ };
23327
+
22780
23328
  // src/models/lfm2_moe/modeling_lfm2_moe.js
22781
23329
  var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
22782
23330
  };
@@ -22973,6 +23521,14 @@ var MistralModel = class extends MistralPreTrainedModel {
22973
23521
  var MistralForCausalLM = class extends MistralPreTrainedModel {
22974
23522
  };
22975
23523
 
23524
+ // src/models/mistral4/modeling_mistral4.js
23525
+ var Mistral4PreTrainedModel = class extends PreTrainedModel {
23526
+ };
23527
+ var Mistral4Model = class extends Mistral4PreTrainedModel {
23528
+ };
23529
+ var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
23530
+ };
23531
+
22976
23532
  // src/models/mobilebert/modeling_mobilebert.js
22977
23533
  var MobileBertPreTrainedModel = class extends PreTrainedModel {
22978
23534
  };
@@ -23441,6 +23997,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
23441
23997
  var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
23442
23998
  };
23443
23999
 
24000
+ // src/models/nemotron_h/modeling_nemotron_h.js
24001
+ var NemotronHPreTrainedModel = class extends PreTrainedModel {
24002
+ };
24003
+ var NemotronHModel = class extends NemotronHPreTrainedModel {
24004
+ };
24005
+ var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
24006
+ };
24007
+
23444
24008
  // src/models/neobert/modeling_neobert.js
23445
24009
  var NeoBertPreTrainedModel = class extends PreTrainedModel {
23446
24010
  };
@@ -23721,252 +24285,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
23721
24285
  var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
23722
24286
  };
23723
24287
 
23724
- // src/models/qwen2_vl/modeling_qwen2_vl.js
23725
- var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
23726
- forward_params = [
23727
- // Text inputs
23728
- "input_ids",
23729
- "attention_mask",
23730
- "position_ids",
23731
- "past_key_values",
23732
- // Vision inputs
23733
- "pixel_values",
23734
- "image_grid_thw"
23735
- ];
23736
- };
23737
- var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
23738
- // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
23739
- // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
23740
- // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
23741
- image_grid_thw_name = "grid_thw";
23742
- /**
23743
- * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
23744
- *
23745
- * Explanation:
23746
- * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
23747
- *
23748
- * For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs.
23749
- * Examples:
23750
- * input_ids: [T T T T T], here T is for text.
23751
- * temporal position_ids: [0, 1, 2, 3, 4]
23752
- * height position_ids: [0, 1, 2, 3, 4]
23753
- * width position_ids: [0, 1, 2, 3, 4]
23754
- *
23755
- * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
23756
- * and 1D rotary position embedding for text part.
23757
- * Examples:
23758
- * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
23759
- * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
23760
- * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
23761
- * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
23762
- * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
23763
- * text temporal position_ids: [3, 4, 5, 6, 7]
23764
- * text height position_ids: [3, 4, 5, 6, 7]
23765
- * text width position_ids: [3, 4, 5, 6, 7]
23766
- * Here we calculate the text start position_ids as the max vision position_ids plus 1.
23767
- *
23768
- * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
23769
- * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
23770
- * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
23771
- * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
23772
- * - 1 for tokens that are **not masked**,
23773
- * - 0 for tokens that are **masked**.
23774
- * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
23775
- * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
23776
- * - mrope_position_deltas: Tensor of shape `(batch_size)`.
23777
- */
23778
- get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
23779
- const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
23780
- const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
23781
- const mrope_position_deltas = [];
23782
- if (image_grid_thw || video_grid_thw) {
23783
- let total_input_ids = input_ids.tolist();
23784
- if (!attention_mask) {
23785
- attention_mask = ones_like(input_ids);
23786
- }
23787
- const attention_mask_list = attention_mask.tolist();
23788
- const position_ids_list = Array.from(
23789
- { length: 3 },
23790
- (_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
23791
- );
23792
- const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
23793
- const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
23794
- let image_index = 0;
23795
- let video_index = 0;
23796
- for (let i = 0; i < total_input_ids.length; ++i) {
23797
- const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
23798
- const vision_start_indices = ids.reduce((acc, x, idx) => {
23799
- if (x == vision_start_token_id) acc.push(idx);
23800
- return acc;
23801
- }, []);
23802
- const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
23803
- const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
23804
- const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
23805
- let llm_pos_ids_list = [];
23806
- let st = 0;
23807
- let remain_images = image_nums;
23808
- let remain_videos = video_nums;
23809
- for (let j = 0; j < vision_tokens.length; ++j) {
23810
- const next_image_token = ids.findIndex((x, i2) => i2 > st && x == image_token_id);
23811
- const next_video_token = ids.findIndex((x, i2) => i2 > st && x == video_token_id);
23812
- const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
23813
- const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
23814
- let ed;
23815
- let t, h, w;
23816
- if (ed_image < ed_video) {
23817
- [t, h, w] = image_grid_thw_list[image_index];
23818
- ++image_index;
23819
- --remain_images;
23820
- ed = ed_image;
23821
- } else {
23822
- [t, h, w] = video_grid_thw_list[video_index];
23823
- ++video_index;
23824
- --remain_videos;
23825
- ed = ed_video;
23826
- }
23827
- const [llm_grid_t, llm_grid_h, llm_grid_w] = [
23828
- Number(t),
23829
- Math.floor(Number(h) / spatial_merge_size),
23830
- Math.floor(Number(w) / spatial_merge_size)
23831
- ];
23832
- const text_len = ed - st;
23833
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
23834
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
23835
- const offset = text_len + st_idx;
23836
- const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
23837
- const t_index = Array.from(
23838
- { length: grid_size },
23839
- (_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
23840
- );
23841
- const h_index = Array.from(
23842
- { length: grid_size },
23843
- (_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
23844
- );
23845
- const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
23846
- llm_pos_ids_list.push([t_index, h_index, w_index].flat());
23847
- st = ed + grid_size;
23848
- }
23849
- if (st < ids.length) {
23850
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
23851
- const text_len = ids.length - st;
23852
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
23853
- }
23854
- const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
23855
- const llm_positions = new Array(num_items);
23856
- let index = 0;
23857
- for (let x = 0; x < 3; ++x) {
23858
- for (let y = 0; y < llm_pos_ids_list.length; ++y) {
23859
- const val = llm_pos_ids_list[y];
23860
- const text_len = val.length / 3;
23861
- for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
23862
- llm_positions[index++] = val[z];
23863
- }
23864
- }
23865
- }
23866
- let count2 = 0;
23867
- const attn_mask = attention_mask_list[i];
23868
- for (let y = 0; y < attn_mask.length; ++y) {
23869
- if (attn_mask[y] == 1) {
23870
- for (let x = 0; x < 3; ++x) {
23871
- position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
23872
- }
23873
- ++count2;
23874
- }
23875
- }
23876
- const max_llm_positions = max(llm_positions)[0];
23877
- mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
23878
- }
23879
- return [
23880
- new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
23881
- new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
23882
- ];
23883
- } else {
23884
- if (attention_mask) {
23885
- const { data, dims } = cumsum_masked_fill(attention_mask);
23886
- const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
23887
- const mrope_position_deltas2 = Array.from(
23888
- { length: dims[0] },
23889
- (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
23890
- );
23891
- return [
23892
- new Tensor2("int64", position_ids, [3, ...dims]),
23893
- new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
23894
- ];
23895
- } else {
23896
- const [batch_size, seq_length] = input_ids.dims;
23897
- const position_ids = BigInt64Array.from(
23898
- { length: 3 * batch_size * seq_length },
23899
- (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
23900
- );
23901
- return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
23902
- }
23903
- }
23904
- }
23905
- async encode_image({ pixel_values, image_grid_thw }) {
23906
- const features = (await sessionRun(this.sessions["vision_encoder"], {
23907
- pixel_values,
23908
- [this.image_grid_thw_name]: image_grid_thw
23909
- })).image_features;
23910
- return features;
23911
- }
23912
- _merge_input_ids_with_image_features(kwargs) {
23913
- return default_merge_input_ids_with_image_features({
23914
- // @ts-ignore
23915
- image_token_id: this.config.image_token_id,
23916
- ...kwargs
23917
- });
23918
- }
23919
- prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
23920
- if (model_inputs.attention_mask && !model_inputs.position_ids) {
23921
- if (!model_inputs.past_key_values) {
23922
- [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
23923
- model_inputs.input_ids,
23924
- model_inputs.image_grid_thw,
23925
- model_inputs.video_grid_thw,
23926
- model_inputs.attention_mask
23927
- );
23928
- } else {
23929
- model_inputs.pixel_values = null;
23930
- const past_length = model_inputs.past_key_values.get_seq_length();
23931
- if (past_length < model_inputs.input_ids.dims[1]) {
23932
- const [full_position_ids, rope_deltas] = this.get_rope_index(
23933
- model_inputs.input_ids,
23934
- model_inputs.image_grid_thw,
23935
- model_inputs.video_grid_thw,
23936
- model_inputs.attention_mask
23937
- );
23938
- model_inputs.rope_deltas = rope_deltas;
23939
- model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
23940
- model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
23941
- } else {
23942
- if (!model_inputs.rope_deltas) {
23943
- [, model_inputs.rope_deltas] = this.get_rope_index(
23944
- model_inputs.input_ids,
23945
- model_inputs.image_grid_thw,
23946
- model_inputs.video_grid_thw,
23947
- model_inputs.attention_mask
23948
- );
23949
- }
23950
- const delta = BigInt(past_length);
23951
- const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
23952
- model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
23953
- }
23954
- }
23955
- }
23956
- return model_inputs;
23957
- }
23958
- };
23959
- var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
23960
- };
23961
-
23962
- // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
23963
- var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
23964
- image_grid_thw_name = "image_grid_thw";
23965
- };
23966
- var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
23967
- image_grid_thw_name = "image_grid_thw";
23968
- };
23969
-
23970
24288
  // src/models/qwen3/modeling_qwen3.js
23971
24289
  var Qwen3PreTrainedModel = class extends PreTrainedModel {
23972
24290
  };
@@ -24412,6 +24730,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
24412
24730
  }
24413
24731
  };
24414
24732
 
24733
+ // src/models/solar_open/modeling_solar_open.js
24734
+ var SolarOpenPreTrainedModel = class extends PreTrainedModel {
24735
+ };
24736
+ var SolarOpenModel = class extends SolarOpenPreTrainedModel {
24737
+ };
24738
+ var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
24739
+ };
24740
+
24415
24741
  // src/models/speecht5/modeling_speecht5.js
24416
24742
  var SpeechT5PreTrainedModel = class extends PreTrainedModel {
24417
24743
  };
@@ -25528,6 +25854,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
25528
25854
  // src/models/registry.js
25529
25855
  var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
25530
25856
  ["bert", "BertModel"],
25857
+ ["eurobert", "EuroBertModel"],
25531
25858
  ["neobert", "NeoBertModel"],
25532
25859
  ["modernbert", "ModernBertModel"],
25533
25860
  ["nomic_bert", "NomicBertModel"],
@@ -25659,6 +25986,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
25659
25986
  ["gemma3_text", "Gemma3Model"],
25660
25987
  ["helium", "HeliumModel"],
25661
25988
  ["glm", "GlmModel"],
25989
+ ["glm_moe_dsa", "GlmMoeDsaModel"],
25662
25990
  ["openelm", "OpenELMModel"],
25663
25991
  ["qwen2", "Qwen2Model"],
25664
25992
  ["qwen2_moe", "Qwen2MoeModel"],
@@ -25670,12 +25998,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
25670
25998
  ["mpt", "MptModel"],
25671
25999
  ["opt", "OPTModel"],
25672
26000
  ["mistral", "MistralModel"],
26001
+ ["mistral4", "Mistral4Model"],
25673
26002
  ["ministral", "MinistralModel"],
25674
26003
  ["ministral3", "Ministral3Model"],
25675
26004
  ["ernie4_5", "Ernie4_5ForCausalLM"],
25676
26005
  ["starcoder2", "Starcoder2Model"],
26006
+ ["deepseek_v3", "DeepseekV3Model"],
25677
26007
  ["falcon", "FalconModel"],
25678
26008
  ["falcon_h1", "FalconH1Model"],
26009
+ ["nemotron_h", "NemotronHModel"],
26010
+ ["solar_open", "SolarOpenModel"],
25679
26011
  ["stablelm", "StableLmModel"],
25680
26012
  ["modernbert-decoder", "ModernBertDecoderModel"],
25681
26013
  ["hunyuan_v1_dense", "HunYuanDenseV1Model"],
@@ -25695,6 +26027,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
25695
26027
  ]);
25696
26028
  var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
25697
26029
  ["bert", "BertForSequenceClassification"],
26030
+ ["eurobert", "EuroBertForSequenceClassification"],
25698
26031
  ["neobert", "NeoBertForSequenceClassification"],
25699
26032
  ["modernbert", "ModernBertForSequenceClassification"],
25700
26033
  ["roformer", "RoFormerForSequenceClassification"],
@@ -25717,6 +26050,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
25717
26050
  ]);
25718
26051
  var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
25719
26052
  ["bert", "BertForTokenClassification"],
26053
+ ["eurobert", "EuroBertForTokenClassification"],
25720
26054
  ["neobert", "NeoBertForTokenClassification"],
25721
26055
  ["modernbert", "ModernBertForTokenClassification"],
25722
26056
  ["roformer", "RoFormerForTokenClassification"],
@@ -25779,6 +26113,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
25779
26113
  ["gemma3", "Gemma3ForCausalLM"],
25780
26114
  ["helium", "HeliumForCausalLM"],
25781
26115
  ["glm", "GlmForCausalLM"],
26116
+ ["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
25782
26117
  ["openelm", "OpenELMForCausalLM"],
25783
26118
  ["qwen2", "Qwen2ForCausalLM"],
25784
26119
  ["qwen2_moe", "Qwen2MoeForCausalLM"],
@@ -25790,6 +26125,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
25790
26125
  ["qwen3_vl", "Qwen3VLForCausalLM"],
25791
26126
  ["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
25792
26127
  ["qwen3_5", "Qwen3_5ForCausalLM"],
26128
+ ["qwen3_5_text", "Qwen3_5ForCausalLM"],
25793
26129
  ["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
25794
26130
  ["gemma3n", "Gemma3nForCausalLM"],
25795
26131
  ["phi", "PhiForCausalLM"],
@@ -25798,13 +26134,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
25798
26134
  ["opt", "OPTForCausalLM"],
25799
26135
  ["mbart", "MBartForCausalLM"],
25800
26136
  ["mistral", "MistralForCausalLM"],
26137
+ ["mistral4", "Mistral4ForCausalLM"],
25801
26138
  ["ministral", "MinistralForCausalLM"],
25802
26139
  ["ministral3", "Ministral3ForCausalLM"],
25803
26140
  ["ernie4_5", "Ernie4_5ForCausalLM"],
25804
26141
  ["starcoder2", "Starcoder2ForCausalLM"],
26142
+ ["deepseek_v3", "DeepseekV3ForCausalLM"],
25805
26143
  ["falcon", "FalconForCausalLM"],
25806
26144
  ["falcon_h1", "FalconH1ForCausalLM"],
26145
+ ["nemotron_h", "NemotronHForCausalLM"],
25807
26146
  ["trocr", "TrOCRForCausalLM"],
26147
+ ["solar_open", "SolarOpenForCausalLM"],
25808
26148
  ["stablelm", "StableLmForCausalLM"],
25809
26149
  ["modernbert-decoder", "ModernBertDecoderForCausalLM"],
25810
26150
  ["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
@@ -25815,6 +26155,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
25815
26155
  var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
25816
26156
  var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
25817
26157
  ["bert", "BertForMaskedLM"],
26158
+ ["eurobert", "EuroBertForMaskedLM"],
25818
26159
  ["neobert", "NeoBertForMaskedLM"],
25819
26160
  ["modernbert", "ModernBertForMaskedLM"],
25820
26161
  ["roformer", "RoFormerForMaskedLM"],
@@ -25872,8 +26213,11 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
25872
26213
  ["smolvlm", "SmolVLMForConditionalGeneration"],
25873
26214
  ["paligemma", "PaliGemmaForConditionalGeneration"],
25874
26215
  ["llava_qwen2", "LlavaQwen2ForCausalLM"],
26216
+ ["gemma3", "Gemma3ForConditionalGeneration"],
25875
26217
  ["gemma3n", "Gemma3nForConditionalGeneration"],
25876
- ["mistral3", "Mistral3ForConditionalGeneration"]
26218
+ ["mistral3", "Mistral3ForConditionalGeneration"],
26219
+ ["lighton_ocr", "LightOnOcrForConditionalGeneration"],
26220
+ ["glm_ocr", "GlmOcrForConditionalGeneration"]
25877
26221
  ]);
25878
26222
  var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
25879
26223
  ["granite_speech", "GraniteSpeechForConditionalGeneration"],
@@ -25978,6 +26322,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
25978
26322
  ]);
25979
26323
  var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
25980
26324
  var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
26325
+ ["chmv2", "CHMv2ForDepthEstimation"],
25981
26326
  ["dpt", "DPTForDepthEstimation"],
25982
26327
  ["depth_anything", "DepthAnythingForDepthEstimation"],
25983
26328
  ["glpn", "GLPNForDepthEstimation"],
@@ -26063,13 +26408,6 @@ var CUSTOM_MAPPING = [
26063
26408
  ],
26064
26409
  ["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
26065
26410
  ["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
26066
- ["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
26067
- ["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
26068
- ["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
26069
- ["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
26070
- ["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
26071
- ["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
26072
- ["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
26073
26411
  [
26074
26412
  "VoxtralRealtimeForConditionalGeneration",
26075
26413
  VoxtralRealtimeForConditionalGeneration,
@@ -27751,6 +28089,41 @@ var TASK_ALIASES = Object.freeze({
27751
28089
  embeddings: "feature-extraction"
27752
28090
  });
27753
28091
 
28092
+ // src/utils/model_registry/resolve_model_type.js
28093
+ function resolve_model_type(config, { warn = true } = {}) {
28094
+ const architectures = (
28095
+ /** @type {string[]} */
28096
+ config.architectures || []
28097
+ );
28098
+ for (const arch of architectures) {
28099
+ const mappedType = MODEL_TYPE_MAPPING.get(arch);
28100
+ if (mappedType !== void 0) {
28101
+ return mappedType;
28102
+ }
28103
+ }
28104
+ if (config.model_type) {
28105
+ const mappedType = MODEL_TYPE_MAPPING.get(config.model_type);
28106
+ if (mappedType !== void 0) {
28107
+ return mappedType;
28108
+ }
28109
+ for (const mapping of Object.values(MODEL_MAPPING_NAMES)) {
28110
+ if (mapping.has(config.model_type)) {
28111
+ const resolved = MODEL_TYPE_MAPPING.get(mapping.get(config.model_type));
28112
+ if (resolved !== void 0) {
28113
+ return resolved;
28114
+ }
28115
+ }
28116
+ }
28117
+ }
28118
+ if (warn) {
28119
+ const archList = architectures.length > 0 ? architectures.join(", ") : "(none)";
28120
+ logger.warn(
28121
+ `[resolve_model_type] Architecture(s) not found in MODEL_TYPE_MAPPING: [${archList}] for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`
28122
+ );
28123
+ }
28124
+ return MODEL_TYPES.EncoderOnly;
28125
+ }
28126
+
27754
28127
  // src/utils/model_registry/get_model_files.js
27755
28128
  function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
27756
28129
  if (config !== null) {
@@ -27773,43 +28146,7 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
27773
28146
  const subfolder = "onnx";
27774
28147
  const rawDevice = overrideDevice ?? custom_config.device;
27775
28148
  let dtype = overrideDtype ?? custom_config.dtype;
27776
- let modelType;
27777
- const architectures = (
27778
- /** @type {string[]} */
27779
- config.architectures || []
27780
- );
27781
- let foundInMapping = false;
27782
- for (const arch of architectures) {
27783
- const mappedType = MODEL_TYPE_MAPPING.get(arch);
27784
- if (mappedType !== void 0) {
27785
- modelType = mappedType;
27786
- foundInMapping = true;
27787
- break;
27788
- }
27789
- }
27790
- if (!foundInMapping && config.model_type) {
27791
- const mappedType = MODEL_TYPE_MAPPING.get(config.model_type);
27792
- if (mappedType !== void 0) {
27793
- modelType = mappedType;
27794
- foundInMapping = true;
27795
- }
27796
- if (!foundInMapping) {
27797
- for (const mapping of Object.values(MODEL_MAPPING_NAMES)) {
27798
- if (mapping.has(config.model_type)) {
27799
- modelType = MODEL_TYPE_MAPPING.get(mapping.get(config.model_type));
27800
- foundInMapping = true;
27801
- break;
27802
- }
27803
- }
27804
- }
27805
- }
27806
- if (!foundInMapping) {
27807
- const archList = architectures.length > 0 ? architectures.join(", ") : "(none)";
27808
- logger.warn(
27809
- `[get_model_files] Architecture(s) not found in MODEL_TYPE_MAPPING: [${archList}] for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`
27810
- );
27811
- modelType = MODEL_TYPES.EncoderOnly;
27812
- }
28149
+ const modelType = resolve_model_type(config);
27813
28150
  const add_model_file = (fileName, baseName = null) => {
27814
28151
  baseName = baseName ?? fileName;
27815
28152
  const selectedDevice = selectDevice(rawDevice, fileName);
@@ -28396,6 +28733,31 @@ async function clear_pipeline_cache(task, modelId, options = {}) {
28396
28733
  return await clear_files_from_cache(modelId, files, options);
28397
28734
  }
28398
28735
 
28736
+ // src/utils/model_registry/get_available_dtypes.js
28737
+ var CONCRETE_DTYPES = Object.keys(DEFAULT_DTYPE_SUFFIX_MAPPING);
28738
+ async function get_available_dtypes(modelId, { config = null, model_file_name = null, revision = "main", cache_dir = null, local_files_only = false } = {}) {
28739
+ config = await get_config(modelId, { config, cache_dir, local_files_only, revision });
28740
+ const subfolder = "onnx";
28741
+ const modelType = resolve_model_type(config);
28742
+ const { sessions } = getSessionsConfig(modelType, config, { model_file_name });
28743
+ const baseNames = Object.values(sessions);
28744
+ const metadataOptions = { revision, cache_dir, local_files_only };
28745
+ const probeResults = await Promise.all(
28746
+ CONCRETE_DTYPES.map(async (dtype) => {
28747
+ const suffix = DEFAULT_DTYPE_SUFFIX_MAPPING[dtype] ?? "";
28748
+ const allExist = await Promise.all(
28749
+ baseNames.map(async (baseName) => {
28750
+ const filename = `${subfolder}/${baseName}${suffix}.onnx`;
28751
+ const metadata = await get_file_metadata(modelId, filename, metadataOptions);
28752
+ return metadata.exists;
28753
+ })
28754
+ );
28755
+ return { dtype, available: allExist.every(Boolean) };
28756
+ })
28757
+ );
28758
+ return probeResults.filter((r) => r.available).map((r) => r.dtype);
28759
+ }
28760
+
28399
28761
  // src/utils/model_registry/ModelRegistry.js
28400
28762
  var ModelRegistry = class {
28401
28763
  /**
@@ -28482,6 +28844,29 @@ var ModelRegistry = class {
28482
28844
  static async get_processor_files(modelId) {
28483
28845
  return get_processor_files(modelId);
28484
28846
  }
28847
+ /**
28848
+ * Detects which quantization levels (dtypes) are available for a model
28849
+ * by checking which ONNX files exist on the hub or locally.
28850
+ *
28851
+ * A dtype is considered available if all required model session files
28852
+ * exist for that dtype.
28853
+ *
28854
+ * @param {string} modelId - The model id (e.g., "onnx-community/all-MiniLM-L6-v2-ONNX")
28855
+ * @param {Object} [options] - Optional parameters
28856
+ * @param {import('../../configs.js').PretrainedConfig} [options.config=null] - Pre-loaded config
28857
+ * @param {string} [options.model_file_name=null] - Override the model file name (excluding .onnx suffix)
28858
+ * @param {string} [options.revision='main'] - Model revision
28859
+ * @param {string} [options.cache_dir=null] - Custom cache directory
28860
+ * @param {boolean} [options.local_files_only=false] - Only check local files
28861
+ * @returns {Promise<string[]>} Array of available dtype strings (e.g., ['fp32', 'fp16', 'q4', 'q8'])
28862
+ *
28863
+ * @example
28864
+ * const dtypes = await ModelRegistry.get_available_dtypes('onnx-community/all-MiniLM-L6-v2-ONNX');
28865
+ * console.log(dtypes); // ['fp32', 'fp16', 'int8', 'uint8', 'q8', 'q4']
28866
+ */
28867
+ static async get_available_dtypes(modelId, options = {}) {
28868
+ return get_available_dtypes(modelId, options);
28869
+ }
28485
28870
  /**
28486
28871
  * Quickly checks if a model is fully cached by verifying `config.json` is present,
28487
28872
  * then confirming all required files are cached.
@@ -28716,6 +29101,9 @@ export {
28716
29101
  BloomModel,
28717
29102
  BloomPreTrainedModel,
28718
29103
  BloomTokenizer,
29104
+ CHMv2ForDepthEstimation,
29105
+ CHMv2ImageProcessor,
29106
+ CHMv2PreTrainedModel,
28719
29107
  CLIPFeatureExtractor,
28720
29108
  CLIPImageProcessor,
28721
29109
  CLIPModel,
@@ -28811,6 +29199,9 @@ export {
28811
29199
  DebertaV2Tokenizer,
28812
29200
  DecisionTransformerModel,
28813
29201
  DecisionTransformerPreTrainedModel,
29202
+ DeepseekV3ForCausalLM,
29203
+ DeepseekV3Model,
29204
+ DeepseekV3PreTrainedModel,
28814
29205
  DeiTFeatureExtractor,
28815
29206
  DeiTForImageClassification,
28816
29207
  DeiTImageProcessor,
@@ -28871,6 +29262,11 @@ export {
28871
29262
  EsmModel,
28872
29263
  EsmPreTrainedModel,
28873
29264
  EsmTokenizer,
29265
+ EuroBertForMaskedLM,
29266
+ EuroBertForSequenceClassification,
29267
+ EuroBertForTokenClassification,
29268
+ EuroBertModel,
29269
+ EuroBertPreTrainedModel,
28874
29270
  ExaoneForCausalLM,
28875
29271
  ExaoneModel,
28876
29272
  ExaonePreTrainedModel,
@@ -28917,8 +29313,11 @@ export {
28917
29313
  Gemma2Model,
28918
29314
  Gemma2PreTrainedModel,
28919
29315
  Gemma3ForCausalLM,
29316
+ Gemma3ForConditionalGeneration,
29317
+ Gemma3ImageProcessor,
28920
29318
  Gemma3Model,
28921
29319
  Gemma3PreTrainedModel,
29320
+ Gemma3Processor,
28922
29321
  Gemma3nAudioFeatureExtractor,
28923
29322
  Gemma3nForCausalLM,
28924
29323
  Gemma3nForConditionalGeneration,
@@ -28928,8 +29327,14 @@ export {
28928
29327
  GemmaModel,
28929
29328
  GemmaPreTrainedModel,
28930
29329
  GemmaTokenizer,
29330
+ Glm46VImageProcessor,
29331
+ Glm46VProcessor,
28931
29332
  GlmForCausalLM,
28932
29333
  GlmModel,
29334
+ GlmMoeDsaForCausalLM,
29335
+ GlmMoeDsaModel,
29336
+ GlmMoeDsaPreTrainedModel,
29337
+ GlmOcrForConditionalGeneration,
28933
29338
  GlmPreTrainedModel,
28934
29339
  GptOssForCausalLM,
28935
29340
  GptOssModel,
@@ -28995,6 +29400,7 @@ export {
28995
29400
  Lfm2VlForConditionalGeneration,
28996
29401
  Lfm2VlImageProcessor,
28997
29402
  Lfm2VlProcessor,
29403
+ LightOnOcrForConditionalGeneration,
28998
29404
  LiteWhisperForConditionalGeneration,
28999
29405
  Llama4ForCausalLM,
29000
29406
  Llama4PreTrainedModel,
@@ -29064,6 +29470,9 @@ export {
29064
29470
  MimiPreTrainedModel,
29065
29471
  MinLengthLogitsProcessor,
29066
29472
  MinNewTokensLengthLogitsProcessor,
29473
+ Mistral4ForCausalLM,
29474
+ Mistral4Model,
29475
+ Mistral4PreTrainedModel,
29067
29476
  MistralForCausalLM,
29068
29477
  MistralModel,
29069
29478
  MistralPreTrainedModel,
@@ -29135,6 +29544,9 @@ export {
29135
29544
  NanoChatForCausalLM,
29136
29545
  NanoChatModel,
29137
29546
  NanoChatPreTrainedModel,
29547
+ NemotronHForCausalLM,
29548
+ NemotronHModel,
29549
+ NemotronHPreTrainedModel,
29138
29550
  NeoBertForMaskedLM,
29139
29551
  NeoBertForQuestionAnswering,
29140
29552
  NeoBertForSequenceClassification,
@@ -29324,6 +29736,9 @@ export {
29324
29736
  SnacFeatureExtractor,
29325
29737
  SnacModel,
29326
29738
  SnacPreTrainedModel,
29739
+ SolarOpenForCausalLM,
29740
+ SolarOpenModel,
29741
+ SolarOpenPreTrainedModel,
29327
29742
  SpeechT5FeatureExtractor,
29328
29743
  SpeechT5ForSpeechToText,
29329
29744
  SpeechT5ForTextToSpeech,