@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. package/README.md +13 -2
  2. package/dist/transformers.js +689 -382
  3. package/dist/transformers.min.js +19 -19
  4. package/dist/transformers.node.cjs +716 -382
  5. package/dist/transformers.node.min.cjs +19 -19
  6. package/dist/transformers.node.min.mjs +19 -19
  7. package/dist/transformers.node.mjs +689 -382
  8. package/dist/transformers.web.js +697 -390
  9. package/dist/transformers.web.min.js +17 -17
  10. package/package.json +2 -2
  11. package/src/configs.js +28 -22
  12. package/src/env.js +1 -1
  13. package/src/image_processors_utils.js +25 -15
  14. package/src/models/chmv2/image_processing_chmv2.js +3 -0
  15. package/src/models/chmv2/modeling_chmv2.js +4 -0
  16. package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
  17. package/src/models/eurobert/modeling_eurobert.js +41 -0
  18. package/src/models/glm46v/image_processing_glm46v.js +12 -0
  19. package/src/models/glm46v/processing_glm46v.js +5 -0
  20. package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
  21. package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
  22. package/src/models/image_processors.js +2 -0
  23. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
  24. package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
  25. package/src/models/mistral4/modeling_mistral4.js +5 -0
  26. package/src/models/modeling_utils.js +2 -0
  27. package/src/models/models.js +10 -1
  28. package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
  29. package/src/models/processors.js +1 -0
  30. package/src/models/qwen2_vl/modeling_qwen2_vl.js +187 -136
  31. package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
  32. package/src/models/registry.js +17 -0
  33. package/src/models/solar_open/modeling_solar_open.js +5 -0
  34. package/src/pipelines.js +1 -0
  35. package/src/utils/hub.js +4 -1
  36. package/src/utils/model_registry/get_file_metadata.js +1 -0
  37. package/types/configs.d.ts.map +1 -1
  38. package/types/image_processors_utils.d.ts +3 -2
  39. package/types/image_processors_utils.d.ts.map +1 -1
  40. package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
  41. package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
  42. package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
  43. package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
  44. package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
  45. package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
  46. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
  47. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
  48. package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
  49. package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
  50. package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
  51. package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
  52. package/types/models/glm46v/processing_glm46v.d.ts +4 -0
  53. package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
  54. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
  55. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
  56. package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
  57. package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
  58. package/types/models/image_processors.d.ts +2 -0
  59. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
  60. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
  61. package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
  62. package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
  63. package/types/models/modeling_utils.d.ts.map +1 -1
  64. package/types/models/models.d.ts +10 -1
  65. package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
  66. package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
  67. package/types/models/processors.d.ts +1 -0
  68. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
  69. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  70. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
  71. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  72. package/types/models/registry.d.ts.map +1 -1
  73. package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
  74. package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
  75. package/types/pipelines.d.ts +1 -0
  76. package/types/pipelines.d.ts.map +1 -1
  77. package/types/utils/hub.d.ts.map +1 -1
  78. package/types/models/ast/modeling_ast.d.ts.map +0 -1
  79. /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
@@ -20,7 +20,7 @@ var node_path_default = {};
20
20
  var node_url_default = {};
21
21
 
22
22
  // src/env.js
23
- var VERSION = "4.0.0-next.7";
23
+ var VERSION = "4.0.0-next.8";
24
24
  var HAS_SELF = typeof self !== "undefined";
25
25
  var IS_FS_AVAILABLE = !isEmpty(node_fs_default);
26
26
  var IS_PATH_AVAILABLE = !isEmpty(node_path_default);
@@ -250,7 +250,7 @@ var logger = {
250
250
  }
251
251
  };
252
252
 
253
- // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.2/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
253
+ // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
254
254
  var DictionarySplitter = class {
255
255
  /**
256
256
  * @param dictionary The dictionary of words to use for splitting.
@@ -1906,10 +1906,10 @@ var BPE = class extends TokenizerModel_default {
1906
1906
  );
1907
1907
  if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
1908
1908
  output_tokens.push(...byte_tokens);
1909
- } else {
1909
+ } else if (this.unk_token != null) {
1910
1910
  output_tokens.push(this.unk_token);
1911
1911
  }
1912
- } else {
1912
+ } else if (this.unk_token != null) {
1913
1913
  output_tokens.push(this.unk_token);
1914
1914
  }
1915
1915
  }
@@ -6515,13 +6515,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
6515
6515
  wrapped_progress
6516
6516
  );
6517
6517
  } else if (typeof response !== "string") {
6518
+ const headers = new Headers(response.headers);
6519
+ headers.set("content-length", result.byteLength.toString());
6518
6520
  await cache2.put(
6519
6521
  cacheKey,
6520
6522
  new Response(
6521
6523
  /** @type {any} */
6522
6524
  result,
6523
6525
  {
6524
- headers: response.headers
6526
+ headers
6525
6527
  }
6526
6528
  )
6527
6529
  ).catch((err) => {
@@ -16498,6 +16500,7 @@ __export(processors_exports, {
16498
16500
  ChatterboxProcessor: () => ChatterboxProcessor,
16499
16501
  Florence2Processor: () => Florence2Processor,
16500
16502
  Gemma3nProcessor: () => Gemma3nProcessor,
16503
+ Glm46VProcessor: () => Glm46VProcessor,
16501
16504
  GraniteSpeechProcessor: () => GraniteSpeechProcessor,
16502
16505
  GroundingDinoProcessor: () => GroundingDinoProcessor,
16503
16506
  Idefics3Processor: () => Idefics3Processor,
@@ -19011,26 +19014,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
19011
19014
  }
19012
19015
  return [segmentation, segments];
19013
19016
  }
19014
- function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
19017
+ function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
19015
19018
  if (height < factor || width < factor) {
19016
- throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
19017
- } else if (Math.max(height, width) / Math.min(height, width) > 200) {
19019
+ const scale = Math.max(factor / height, factor / width);
19020
+ height = Math.round(height * scale);
19021
+ width = Math.round(width * scale);
19022
+ }
19023
+ if (Math.max(height, width) / Math.min(height, width) > 200) {
19018
19024
  throw new Error(
19019
19025
  `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
19020
19026
  );
19021
19027
  }
19022
19028
  let h_bar = Math.round(height / factor) * factor;
19023
19029
  let w_bar = Math.round(width / factor) * factor;
19024
- if (h_bar * w_bar > max_pixels) {
19025
- const beta = Math.sqrt(height * width / max_pixels);
19026
- h_bar = Math.floor(height / beta / factor) * factor;
19027
- w_bar = Math.floor(width / beta / factor) * factor;
19028
- } else if (h_bar * w_bar < min_pixels) {
19029
- const beta = Math.sqrt(min_pixels / (height * width));
19030
+ if (temporal_factor * h_bar * w_bar > max_pixels) {
19031
+ const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
19032
+ h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
19033
+ w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
19034
+ } else if (temporal_factor * h_bar * w_bar < min_pixels) {
19035
+ const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
19030
19036
  h_bar = Math.ceil(height * beta / factor) * factor;
19031
19037
  w_bar = Math.ceil(width * beta / factor) * factor;
19032
19038
  }
19033
- return [h_bar, w_bar];
19039
+ return [w_bar, h_bar];
19034
19040
  }
19035
19041
  function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
19036
19042
  if (label_ids_to_fuse === null) {
@@ -19109,7 +19115,7 @@ var ImageProcessor = class extends Callable2 {
19109
19115
  this.do_pad = config.do_pad;
19110
19116
  this.min_pixels = config.min_pixels;
19111
19117
  this.max_pixels = config.max_pixels;
19112
- if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
19118
+ if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
19113
19119
  this.pad_size = this.size;
19114
19120
  }
19115
19121
  this.do_flip_channel_order = config.do_flip_channel_order ?? false;
@@ -19397,10 +19403,8 @@ var ImageProcessor = class extends Callable2 {
19397
19403
  const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
19398
19404
  [pixelData, imgDims] = padded;
19399
19405
  } else if (this.size_divisibility) {
19400
- const [paddedWidth, paddedHeight] = enforce_size_divisibility(
19401
- [imgDims[1], imgDims[0]],
19402
- this.size_divisibility
19403
- );
19406
+ const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
19407
+ const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
19404
19408
  [pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
19405
19409
  }
19406
19410
  }
@@ -19477,6 +19481,7 @@ var image_processors_exports = {};
19477
19481
  __export(image_processors_exports, {
19478
19482
  BeitFeatureExtractor: () => BeitFeatureExtractor,
19479
19483
  BitImageProcessor: () => BitImageProcessor,
19484
+ CHMv2ImageProcessor: () => CHMv2ImageProcessor,
19480
19485
  CLIPFeatureExtractor: () => CLIPFeatureExtractor,
19481
19486
  CLIPImageProcessor: () => CLIPImageProcessor,
19482
19487
  ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
@@ -19493,6 +19498,7 @@ __export(image_processors_exports, {
19493
19498
  DonutImageProcessor: () => DonutImageProcessor,
19494
19499
  EfficientNetImageProcessor: () => EfficientNetImageProcessor,
19495
19500
  GLPNFeatureExtractor: () => GLPNFeatureExtractor,
19501
+ Glm46VImageProcessor: () => Glm46VImageProcessor,
19496
19502
  GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
19497
19503
  Idefics3ImageProcessor: () => Idefics3ImageProcessor,
19498
19504
  ImageFeatureExtractor: () => ImageProcessor,
@@ -19553,6 +19559,10 @@ var BitImageProcessor = class extends ImageProcessor {
19553
19559
  var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
19554
19560
  };
19555
19561
 
19562
+ // src/models/chmv2/image_processing_chmv2.js
19563
+ var CHMv2ImageProcessor = class extends ImageProcessor {
19564
+ };
19565
+
19556
19566
  // src/models/clip/image_processing_clip.js
19557
19567
  var CLIPImageProcessor = class extends ImageProcessor {
19558
19568
  };
@@ -19672,6 +19682,65 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
19672
19682
  }
19673
19683
  };
19674
19684
 
19685
+ // src/models/qwen2_vl/image_processing_qwen2_vl.js
19686
+ var Qwen2VLImageProcessor = class extends ImageProcessor {
19687
+ constructor(config) {
19688
+ super(config);
19689
+ this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
19690
+ this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
19691
+ this.patch_size = config.patch_size;
19692
+ this.merge_size = config.merge_size;
19693
+ }
19694
+ /** @type {ImageProcessor['get_resize_output_image_size']} */
19695
+ get_resize_output_image_size(image, size) {
19696
+ const factor = this.patch_size * this.merge_size;
19697
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
19698
+ }
19699
+ async _call(images, ...args) {
19700
+ const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
19701
+ let patches = pixel_values;
19702
+ const { temporal_patch_size, merge_size, patch_size } = this.config;
19703
+ if (patches.dims[0] === 1) {
19704
+ patches = cat(
19705
+ Array.from({ length: temporal_patch_size }, () => patches),
19706
+ 0
19707
+ );
19708
+ }
19709
+ const grid_t = patches.dims[0] / temporal_patch_size;
19710
+ const channel = patches.dims[1];
19711
+ const grid_h = Math.floor(patches.dims[2] / patch_size);
19712
+ const grid_w = Math.floor(patches.dims[3] / patch_size);
19713
+ const flatten_patches = patches.view(
19714
+ grid_t,
19715
+ temporal_patch_size,
19716
+ channel,
19717
+ Math.floor(grid_h / merge_size),
19718
+ merge_size,
19719
+ patch_size,
19720
+ Math.floor(grid_w / merge_size),
19721
+ merge_size,
19722
+ patch_size
19723
+ ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
19724
+ const image_grid_thw = new Tensor3("int64", [grid_t, grid_h, grid_w], [1, 3]);
19725
+ return {
19726
+ pixel_values: flatten_patches,
19727
+ image_grid_thw,
19728
+ original_sizes,
19729
+ reshaped_input_sizes
19730
+ };
19731
+ }
19732
+ };
19733
+
19734
+ // src/models/glm46v/image_processing_glm46v.js
19735
+ var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
19736
+ /** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
19737
+ get_resize_output_image_size(image, size) {
19738
+ const factor = this.patch_size * this.merge_size;
19739
+ const temporal_factor = this.config.temporal_patch_size ?? 2;
19740
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
19741
+ }
19742
+ };
19743
+
19675
19744
  // src/models/glpn/image_processing_glpn.js
19676
19745
  var GLPNFeatureExtractor = class extends ImageProcessor {
19677
19746
  };
@@ -20065,7 +20134,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
20065
20134
  const img = pixel_values.unsqueeze_(0);
20066
20135
  const total_factor = this.encoder_patch_size * this.downsample_factor;
20067
20136
  const f2 = total_factor ** 2;
20068
- const [new_height, new_width] = smart_resize(
20137
+ const [new_width, new_height] = smart_resize(
20069
20138
  Math.max(total_factor, height),
20070
20139
  Math.max(total_factor, width),
20071
20140
  total_factor,
@@ -20355,55 +20424,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
20355
20424
  var PvtImageProcessor = class extends ImageProcessor {
20356
20425
  };
20357
20426
 
20358
- // src/models/qwen2_vl/image_processing_qwen2_vl.js
20359
- var Qwen2VLImageProcessor = class extends ImageProcessor {
20360
- constructor(config) {
20361
- super(config);
20362
- this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
20363
- this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
20364
- this.patch_size = config.patch_size;
20365
- this.merge_size = config.merge_size;
20366
- }
20367
- /** @type {ImageProcessor['get_resize_output_image_size']} */
20368
- get_resize_output_image_size(image, size) {
20369
- const factor = this.patch_size * this.merge_size;
20370
- return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
20371
- }
20372
- async _call(images, ...args) {
20373
- const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
20374
- let patches = pixel_values;
20375
- const { temporal_patch_size, merge_size, patch_size } = this.config;
20376
- if (patches.dims[0] === 1) {
20377
- patches = cat(
20378
- Array.from({ length: temporal_patch_size }, () => patches),
20379
- 0
20380
- );
20381
- }
20382
- const grid_t = patches.dims[0] / temporal_patch_size;
20383
- const channel = patches.dims[1];
20384
- const grid_h = Math.floor(patches.dims[2] / patch_size);
20385
- const grid_w = Math.floor(patches.dims[3] / patch_size);
20386
- const flatten_patches = patches.view(
20387
- grid_t,
20388
- temporal_patch_size,
20389
- channel,
20390
- Math.floor(grid_h / merge_size),
20391
- merge_size,
20392
- patch_size,
20393
- Math.floor(grid_w / merge_size),
20394
- merge_size,
20395
- patch_size
20396
- ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
20397
- const image_grid_thw = new Tensor3("int64", [grid_t, grid_h, grid_w], [1, 3]);
20398
- return {
20399
- pixel_values: flatten_patches,
20400
- image_grid_thw,
20401
- original_sizes,
20402
- reshaped_input_sizes
20403
- };
20404
- }
20405
- };
20406
-
20407
20427
  // src/models/rt_detr/image_processing_rt_detr.js
20408
20428
  var RTDetrImageProcessor = class extends ImageProcessor {
20409
20429
  /** @type {typeof post_process_object_detection} */
@@ -20957,6 +20977,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
20957
20977
  }
20958
20978
  };
20959
20979
 
20980
+ // src/models/qwen2_vl/processing_qwen2_vl.js
20981
+ var Qwen2VLProcessor = class extends Processor {
20982
+ static image_processor_class = AutoImageProcessor;
20983
+ static tokenizer_class = AutoTokenizer;
20984
+ static image_token = "<|image_pad|>";
20985
+ /**
20986
+ *
20987
+ * @param {string|string[]} text
20988
+ * @param {RawImage|RawImage[]} images
20989
+ * @param {...any} args
20990
+ * @returns {Promise<any>}
20991
+ */
20992
+ async _call(text, images = null, ...args) {
20993
+ if (!Array.isArray(text)) {
20994
+ text = [text];
20995
+ }
20996
+ let image_inputs, image_grid_thw;
20997
+ if (images) {
20998
+ image_inputs = await this.image_processor(images);
20999
+ image_grid_thw = image_inputs.image_grid_thw;
21000
+ }
21001
+ if (image_grid_thw) {
21002
+ let merge_length = this.image_processor.config.merge_size ** 2;
21003
+ let index = 0;
21004
+ const image_token = (
21005
+ /** @type {typeof Qwen2VLProcessor} */
21006
+ this.constructor.image_token
21007
+ );
21008
+ const image_grid_thw_list = image_grid_thw.tolist();
21009
+ text = text.map((t) => {
21010
+ while (t.includes(image_token)) {
21011
+ const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
21012
+ t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
21013
+ }
21014
+ return t.replaceAll("<|placeholder|>", image_token);
21015
+ });
21016
+ }
21017
+ const text_inputs = this.tokenizer(text);
21018
+ return {
21019
+ ...text_inputs,
21020
+ ...image_inputs
21021
+ };
21022
+ }
21023
+ };
21024
+
21025
+ // src/models/glm46v/processing_glm46v.js
21026
+ var Glm46VProcessor = class extends Qwen2VLProcessor {
21027
+ static image_token = "<|image|>";
21028
+ };
21029
+
20960
21030
  // src/models/granite_speech/processing_granite_speech.js
20961
21031
  var GraniteSpeechProcessor = class extends Processor {
20962
21032
  static tokenizer_class = AutoTokenizer;
@@ -21687,47 +21757,6 @@ var PyAnnoteProcessor = class extends Processor {
21687
21757
  }
21688
21758
  };
21689
21759
 
21690
- // src/models/qwen2_vl/processing_qwen2_vl.js
21691
- var Qwen2VLProcessor = class extends Processor {
21692
- static image_processor_class = AutoImageProcessor;
21693
- static tokenizer_class = AutoTokenizer;
21694
- /**
21695
- *
21696
- * @param {string|string[]} text
21697
- * @param {RawImage|RawImage[]} images
21698
- * @param {...any} args
21699
- * @returns {Promise<any>}
21700
- */
21701
- async _call(text, images = null, ...args) {
21702
- if (!Array.isArray(text)) {
21703
- text = [text];
21704
- }
21705
- let image_inputs, image_grid_thw;
21706
- if (images) {
21707
- image_inputs = await this.image_processor(images);
21708
- image_grid_thw = image_inputs.image_grid_thw;
21709
- }
21710
- if (image_grid_thw) {
21711
- let merge_length = this.image_processor.config.merge_size ** 2;
21712
- let index = 0;
21713
- const image_grid_thw_list = image_grid_thw.tolist();
21714
- text = text.map((t) => {
21715
- while (t.includes("<|image_pad|>")) {
21716
- const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
21717
- t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
21718
- }
21719
- return t.replaceAll("<|placeholder|>", "<|image_pad|>");
21720
- });
21721
- }
21722
- const text_inputs = this.tokenizer(text);
21723
- return {
21724
- ...text_inputs,
21725
- ...image_inputs
21726
- // TODO: ...videos_inputs,
21727
- };
21728
- }
21729
- };
21730
-
21731
21760
  // src/models/qwen2_5_vl/processing_qwen2_5_vl.js
21732
21761
  var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
21733
21762
  };
@@ -22071,6 +22100,8 @@ function getNormalizedConfig(config) {
22071
22100
  case "gemma3n":
22072
22101
  case "lfm2_vl":
22073
22102
  case "chatterbox":
22103
+ case "lighton_ocr":
22104
+ case "glm_ocr":
22074
22105
  case "mistral3":
22075
22106
  case "qwen2_5_vl":
22076
22107
  case "qwen3_vl":
@@ -22146,6 +22177,8 @@ function getNormalizedConfig(config) {
22146
22177
  mapping["dim_kv"] = "head_dim";
22147
22178
  break;
22148
22179
  case "qwen3":
22180
+ case "solar_open":
22181
+ case "glm_ocr_text":
22149
22182
  case "gemma":
22150
22183
  case "gemma2":
22151
22184
  case "vaultgemma":
@@ -22156,6 +22189,7 @@ function getNormalizedConfig(config) {
22156
22189
  case "ernie4_5":
22157
22190
  case "hunyuan_v1_dense":
22158
22191
  case "falcon_h1":
22192
+ case "nemotron_h":
22159
22193
  case "ministral":
22160
22194
  case "ministral3":
22161
22195
  mapping["num_heads"] = "num_key_value_heads";
@@ -22190,6 +22224,9 @@ function getNormalizedConfig(config) {
22190
22224
  mapping["num_attention_heads"] = "num_attention_heads";
22191
22225
  break;
22192
22226
  case "youtu":
22227
+ case "deepseek_v3":
22228
+ case "glm_moe_dsa":
22229
+ case "mistral4":
22193
22230
  mapping["num_heads"] = "num_key_value_heads";
22194
22231
  mapping["num_layers"] = "num_hidden_layers";
22195
22232
  mapping["dim_kv"] = "qk_head_dim";
@@ -22278,6 +22315,7 @@ function getCacheShapes(config, options) {
22278
22315
  if (!(config instanceof PretrainedConfig)) {
22279
22316
  config = new PretrainedConfig(config);
22280
22317
  }
22318
+ const batch_size = options?.batch_size ?? 1;
22281
22319
  if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
22282
22320
  const pkv_prefix = options?.prefix ?? "past_key_values";
22283
22321
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
@@ -22287,7 +22325,6 @@ function getCacheShapes(config, options) {
22287
22325
  config
22288
22326
  );
22289
22327
  const head_dim = hidden_size / num_attention_heads;
22290
- const batch_size = options?.batch_size ?? 1;
22291
22328
  for (let i = 0; i < layer_types.length; ++i) {
22292
22329
  if (layer_types[i] === "full_attention") {
22293
22330
  for (const kv of ["key", "value"]) {
@@ -22300,31 +22337,26 @@ function getCacheShapes(config, options) {
22300
22337
  }
22301
22338
  }
22302
22339
  return cache_values;
22303
- } else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
22340
+ } else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
22304
22341
  const pkv_prefix = options?.prefix ?? "past_key_values";
22305
22342
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
22306
- const cache_values = {};
22307
- const {
22308
- layer_types,
22309
- num_hidden_layers,
22310
- num_attention_heads,
22311
- num_key_value_heads,
22312
- hidden_size,
22313
- mamba_d_conv,
22314
- mamba_n_heads,
22315
- mamba_d_head,
22316
- mamba_d_state,
22317
- mamba_n_groups,
22318
- mamba_expand,
22319
- mamba_d_ssm
22320
- } = (
22343
+ const c = (
22321
22344
  /** @type {any} */
22322
22345
  config
22323
22346
  );
22324
- const head_dim = hidden_size / num_attention_heads;
22325
- const batch_size = options?.batch_size ?? 1;
22326
- const conv_d_inner = (mamba_d_ssm ?? mamba_expand * hidden_size) + 2 * mamba_n_groups * mamba_d_state;
22327
- for (let i = 0; i < num_hidden_layers; ++i) {
22347
+ const layer_types = c.layer_types ?? c.layers_block_type;
22348
+ const num_layers = c.num_hidden_layers ?? layer_types?.length;
22349
+ const num_key_value_heads = c.num_key_value_heads;
22350
+ const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
22351
+ const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
22352
+ const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
22353
+ const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
22354
+ const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
22355
+ const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
22356
+ const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
22357
+ const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
22358
+ const cache_values = {};
22359
+ for (let i = 0; i < num_layers; ++i) {
22328
22360
  if (!layer_types || layer_types[i] === "mamba") {
22329
22361
  cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
22330
22362
  cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
@@ -22358,7 +22390,6 @@ function getCacheShapes(config, options) {
22358
22390
  const key_dim = linear_key_head_dim * linear_num_key_heads;
22359
22391
  const value_dim = linear_value_head_dim * linear_num_value_heads;
22360
22392
  const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
22361
- const batch_size = options?.batch_size ?? 1;
22362
22393
  for (let i = 0; i < layer_types.length; ++i) {
22363
22394
  if (layer_types[i] === "full_attention") {
22364
22395
  for (const kv of ["key", "value"]) {
@@ -24986,7 +25017,9 @@ async function generic_text_to_text_forward(self2, {
24986
25017
  "qwen3_5",
24987
25018
  "qwen3_5_text",
24988
25019
  "qwen3_5_moe",
24989
- "qwen3_5_moe_text"
25020
+ "qwen3_5_moe_text",
25021
+ "glm_ocr",
25022
+ "glm_ocr_text"
24990
25023
  ].includes(self2.config.model_type)
24991
25024
  ) {
24992
25025
  const { image_grid_thw, video_grid_thw } = kwargs;
@@ -25210,6 +25243,8 @@ __export(models_exports, {
25210
25243
  BloomForCausalLM: () => BloomForCausalLM,
25211
25244
  BloomModel: () => BloomModel,
25212
25245
  BloomPreTrainedModel: () => BloomPreTrainedModel,
25246
+ CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
25247
+ CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
25213
25248
  CLIPModel: () => CLIPModel,
25214
25249
  CLIPPreTrainedModel: () => CLIPPreTrainedModel,
25215
25250
  CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
@@ -25284,6 +25319,9 @@ __export(models_exports, {
25284
25319
  DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
25285
25320
  DecisionTransformerModel: () => DecisionTransformerModel,
25286
25321
  DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
25322
+ DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
25323
+ DeepseekV3Model: () => DeepseekV3Model,
25324
+ DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
25287
25325
  DeiTForImageClassification: () => DeiTForImageClassification,
25288
25326
  DeiTModel: () => DeiTModel,
25289
25327
  DeiTPreTrainedModel: () => DeiTPreTrainedModel,
@@ -25329,6 +25367,11 @@ __export(models_exports, {
25329
25367
  EsmForTokenClassification: () => EsmForTokenClassification,
25330
25368
  EsmModel: () => EsmModel,
25331
25369
  EsmPreTrainedModel: () => EsmPreTrainedModel,
25370
+ EuroBertForMaskedLM: () => EuroBertForMaskedLM,
25371
+ EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
25372
+ EuroBertForTokenClassification: () => EuroBertForTokenClassification,
25373
+ EuroBertModel: () => EuroBertModel,
25374
+ EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
25332
25375
  ExaoneForCausalLM: () => ExaoneForCausalLM,
25333
25376
  ExaoneModel: () => ExaoneModel,
25334
25377
  ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -25375,6 +25418,10 @@ __export(models_exports, {
25375
25418
  GemmaPreTrainedModel: () => GemmaPreTrainedModel,
25376
25419
  GlmForCausalLM: () => GlmForCausalLM,
25377
25420
  GlmModel: () => GlmModel,
25421
+ GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
25422
+ GlmMoeDsaModel: () => GlmMoeDsaModel,
25423
+ GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
25424
+ GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
25378
25425
  GlmPreTrainedModel: () => GlmPreTrainedModel,
25379
25426
  GptOssForCausalLM: () => GptOssForCausalLM,
25380
25427
  GptOssModel: () => GptOssModel,
@@ -25421,6 +25468,7 @@ __export(models_exports, {
25421
25468
  Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
25422
25469
  Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
25423
25470
  Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
25471
+ LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
25424
25472
  LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
25425
25473
  Llama4ForCausalLM: () => Llama4ForCausalLM,
25426
25474
  Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -25470,6 +25518,9 @@ __export(models_exports, {
25470
25518
  MimiEncoderOutput: () => MimiEncoderOutput,
25471
25519
  MimiModel: () => MimiModel,
25472
25520
  MimiPreTrainedModel: () => MimiPreTrainedModel,
25521
+ Mistral4ForCausalLM: () => Mistral4ForCausalLM,
25522
+ Mistral4Model: () => Mistral4Model,
25523
+ Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
25473
25524
  MistralForCausalLM: () => MistralForCausalLM,
25474
25525
  MistralModel: () => MistralModel,
25475
25526
  MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -25527,6 +25578,9 @@ __export(models_exports, {
25527
25578
  NanoChatForCausalLM: () => NanoChatForCausalLM,
25528
25579
  NanoChatModel: () => NanoChatModel,
25529
25580
  NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
25581
+ NemotronHForCausalLM: () => NemotronHForCausalLM,
25582
+ NemotronHModel: () => NemotronHModel,
25583
+ NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
25530
25584
  NeoBertForMaskedLM: () => NeoBertForMaskedLM,
25531
25585
  NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
25532
25586
  NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -25664,6 +25718,9 @@ __export(models_exports, {
25664
25718
  SnacEncoderModel: () => SnacEncoderModel,
25665
25719
  SnacModel: () => SnacModel,
25666
25720
  SnacPreTrainedModel: () => SnacPreTrainedModel,
25721
+ SolarOpenForCausalLM: () => SolarOpenForCausalLM,
25722
+ SolarOpenModel: () => SolarOpenModel,
25723
+ SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
25667
25724
  SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
25668
25725
  SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
25669
25726
  SpeechT5HifiGan: () => SpeechT5HifiGan,
@@ -25838,7 +25895,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
25838
25895
  var ArceeForCausalLM = class extends ArceePreTrainedModel {
25839
25896
  };
25840
25897
 
25841
- // src/models/ast/modeling_ast.js
25898
+ // src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
25842
25899
  var ASTPreTrainedModel = class extends PreTrainedModel {
25843
25900
  };
25844
25901
  var ASTModel = class extends ASTPreTrainedModel {
@@ -26173,6 +26230,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
26173
26230
  var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
26174
26231
  };
26175
26232
 
26233
+ // src/models/chmv2/modeling_chmv2.js
26234
+ var CHMv2PreTrainedModel = class extends PreTrainedModel {
26235
+ };
26236
+ var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
26237
+ };
26238
+
26176
26239
  // src/models/clap/modeling_clap.js
26177
26240
  var ClapPreTrainedModel = class extends PreTrainedModel {
26178
26241
  };
@@ -26511,6 +26574,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
26511
26574
  }
26512
26575
  };
26513
26576
 
26577
+ // src/models/deepseek_v3/modeling_deepseek_v3.js
26578
+ var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
26579
+ };
26580
+ var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
26581
+ };
26582
+ var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
26583
+ };
26584
+
26514
26585
  // src/models/deberta_v2/modeling_deberta_v2.js
26515
26586
  var DebertaV2PreTrainedModel = class extends PreTrainedModel {
26516
26587
  };
@@ -26859,6 +26930,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
26859
26930
  }
26860
26931
  };
26861
26932
 
26933
+ // src/models/eurobert/modeling_eurobert.js
26934
+ var EuroBertPreTrainedModel = class extends PreTrainedModel {
26935
+ };
26936
+ var EuroBertModel = class extends EuroBertPreTrainedModel {
26937
+ };
26938
+ var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
26939
+ /**
26940
+ * Calls the model on new inputs.
26941
+ *
26942
+ * @param {Object} model_inputs The inputs to the model.
26943
+ * @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
26944
+ */
26945
+ async _call(model_inputs) {
26946
+ return new MaskedLMOutput(await super._call(model_inputs));
26947
+ }
26948
+ };
26949
+ var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
26950
+ /**
26951
+ * Calls the model on new inputs.
26952
+ *
26953
+ * @param {Object} model_inputs The inputs to the model.
26954
+ * @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
26955
+ */
26956
+ async _call(model_inputs) {
26957
+ return new SequenceClassifierOutput(await super._call(model_inputs));
26958
+ }
26959
+ };
26960
+ var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
26961
+ /**
26962
+ * Calls the model on new inputs.
26963
+ *
26964
+ * @param {Object} model_inputs The inputs to the model.
26965
+ * @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
26966
+ */
26967
+ async _call(model_inputs) {
26968
+ return new TokenClassifierOutput(await super._call(model_inputs));
26969
+ }
26970
+ };
26971
+
26862
26972
  // src/models/exaone/modeling_exaone.js
26863
26973
  var ExaonePreTrainedModel = class extends PreTrainedModel {
26864
26974
  };
@@ -27134,6 +27244,377 @@ var GlmModel = class extends GlmPreTrainedModel {
27134
27244
  var GlmForCausalLM = class extends GlmPreTrainedModel {
27135
27245
  };
27136
27246
 
27247
+ // src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
27248
+ var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
27249
+ };
27250
+ var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
27251
+ };
27252
+ var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
27253
+ };
27254
+
27255
+ // src/models/qwen2_vl/modeling_qwen2_vl.js
27256
+ var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
27257
+ forward_params = [
27258
+ // Text inputs
27259
+ "input_ids",
27260
+ "attention_mask",
27261
+ "position_ids",
27262
+ "past_key_values",
27263
+ // Vision inputs
27264
+ "pixel_values",
27265
+ "image_grid_thw"
27266
+ ];
27267
+ };
27268
+ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
27269
+ // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
27270
+ // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
27271
+ // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
27272
+ image_grid_thw_name = "grid_thw";
27273
+ /**
27274
+ * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
27275
+ * @param {Tensor} input_ids
27276
+ * @param {Tensor} attention_mask
27277
+ * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
27278
+ */
27279
+ _get_text_only_rope_index(input_ids, attention_mask) {
27280
+ if (attention_mask) {
27281
+ const { data, dims } = cumsum_masked_fill(attention_mask);
27282
+ const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
27283
+ const mrope_position_deltas = Array.from(
27284
+ { length: dims[0] },
27285
+ (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
27286
+ );
27287
+ return [
27288
+ new Tensor3("int64", position_ids, [3, ...dims]),
27289
+ new Tensor3("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
27290
+ ];
27291
+ } else {
27292
+ const [batch_size, seq_length] = input_ids.dims;
27293
+ const position_ids = BigInt64Array.from(
27294
+ { length: 3 * batch_size * seq_length },
27295
+ (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
27296
+ );
27297
+ return [new Tensor3("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
27298
+ }
27299
+ }
27300
+ /**
27301
+ * Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
27302
+ * global [all_t, all_h, all_w] order, then write back into the position_ids array
27303
+ * respecting attention mask.
27304
+ * @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
27305
+ * @param {number[]} attn_mask Attention mask for this batch element
27306
+ * @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
27307
+ * @param {number} batch_idx Current batch index
27308
+ * @returns {number[]} Flat reordered positions of length total_len
27309
+ */
27310
+ _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
27311
+ const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
27312
+ const llm_positions = new Array(total_len);
27313
+ let index = 0;
27314
+ for (let x = 0; x < 3; ++x) {
27315
+ for (const val of llm_pos_ids_list) {
27316
+ const seg_len = val.length / 3;
27317
+ for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
27318
+ llm_positions[index++] = val[z];
27319
+ }
27320
+ }
27321
+ }
27322
+ let count2 = 0;
27323
+ for (let y = 0; y < attn_mask.length; ++y) {
27324
+ if (attn_mask[y] == 1) {
27325
+ for (let x = 0; x < 3; ++x) {
27326
+ position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
27327
+ }
27328
+ ++count2;
27329
+ }
27330
+ }
27331
+ return llm_positions;
27332
+ }
27333
+ /**
27334
+ * Build per-batch position ID segments for multimodal rope.
27335
+ * Override this in subclasses to change how vision/text segments are identified and positioned.
27336
+ * @param {object} params
27337
+ * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
27338
+ * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
27339
+ * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
27340
+ * @param {number} params.spatial_merge_size
27341
+ * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
27342
+ * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
27343
+ */
27344
+ _get_multimodal_rope_positions({
27345
+ filtered_ids,
27346
+ image_grid_thw_list,
27347
+ video_grid_thw_list,
27348
+ spatial_merge_size,
27349
+ state
27350
+ }) {
27351
+ const { image_token_id, video_token_id, vision_start_token_id } = this.config;
27352
+ const ids = filtered_ids;
27353
+ const vision_start_indices = ids.reduce((acc, x, idx) => {
27354
+ if (x == vision_start_token_id) acc.push(idx);
27355
+ return acc;
27356
+ }, []);
27357
+ const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
27358
+ const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
27359
+ const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
27360
+ const llm_pos_ids_list = [];
27361
+ let st2 = 0;
27362
+ let remain_images = image_nums;
27363
+ let remain_videos = video_nums;
27364
+ for (let j = 0; j < vision_tokens.length; ++j) {
27365
+ const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
27366
+ const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
27367
+ const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
27368
+ const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
27369
+ let ed;
27370
+ let t, h, w;
27371
+ if (ed_image < ed_video) {
27372
+ [t, h, w] = image_grid_thw_list[state.image_index];
27373
+ ++state.image_index;
27374
+ --remain_images;
27375
+ ed = ed_image;
27376
+ } else {
27377
+ [t, h, w] = video_grid_thw_list[state.video_index];
27378
+ ++state.video_index;
27379
+ --remain_videos;
27380
+ ed = ed_video;
27381
+ }
27382
+ const [llm_grid_t, llm_grid_h, llm_grid_w] = [
27383
+ Number(t),
27384
+ Math.floor(Number(h) / spatial_merge_size),
27385
+ Math.floor(Number(w) / spatial_merge_size)
27386
+ ];
27387
+ const text_len = ed - st2;
27388
+ const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
27389
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
27390
+ const offset = text_len + st_idx;
27391
+ const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
27392
+ const t_index = Array.from(
27393
+ { length: grid_size },
27394
+ (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
27395
+ );
27396
+ const h_index = Array.from(
27397
+ { length: grid_size },
27398
+ (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
27399
+ );
27400
+ const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
27401
+ llm_pos_ids_list.push([t_index, h_index, w_index].flat());
27402
+ st2 = ed + grid_size;
27403
+ }
27404
+ if (st2 < ids.length) {
27405
+ const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
27406
+ const text_len = ids.length - st2;
27407
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
27408
+ }
27409
+ return llm_pos_ids_list;
27410
+ }
27411
+ /**
27412
+ * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
27413
+ *
27414
+ * Explanation:
27415
+ * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
27416
+ *
27417
+ * For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
27418
+ * Examples:
27419
+ * input_ids: [T T T T T], here T is for text.
27420
+ * temporal position_ids: [0, 1, 2, 3, 4]
27421
+ * height position_ids: [0, 1, 2, 3, 4]
27422
+ * width position_ids: [0, 1, 2, 3, 4]
27423
+ *
27424
+ * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
27425
+ * and 1D rotary position embedding for text part.
27426
+ * Examples:
27427
+ * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
27428
+ * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
27429
+ * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
27430
+ * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
27431
+ * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
27432
+ * text temporal position_ids: [3, 4, 5, 6, 7]
27433
+ * text height position_ids: [3, 4, 5, 6, 7]
27434
+ * text width position_ids: [3, 4, 5, 6, 7]
27435
+ * Here we calculate the text start position_ids as the max vision position_ids plus 1.
27436
+ *
27437
+ * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
27438
+ * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
27439
+ * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
27440
+ * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
27441
+ * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
27442
+ */
27443
+ get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
27444
+ const { vision_config } = this.config;
27445
+ const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
27446
+ if (image_grid_thw || video_grid_thw) {
27447
+ const total_input_ids = input_ids.tolist();
27448
+ if (!attention_mask) {
27449
+ attention_mask = ones_like(input_ids);
27450
+ }
27451
+ const attention_mask_list = attention_mask.tolist();
27452
+ const position_ids_list = Array.from(
27453
+ { length: 3 },
27454
+ () => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
27455
+ );
27456
+ const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
27457
+ const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
27458
+ const state = { image_index: 0, video_index: 0 };
27459
+ const mrope_position_deltas = [];
27460
+ for (let i = 0; i < total_input_ids.length; ++i) {
27461
+ const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
27462
+ const llm_pos_ids_list = this._get_multimodal_rope_positions({
27463
+ filtered_ids,
27464
+ image_grid_thw_list,
27465
+ video_grid_thw_list,
27466
+ spatial_merge_size,
27467
+ state
27468
+ });
27469
+ const llm_positions = this._reorder_and_write_positions(
27470
+ llm_pos_ids_list,
27471
+ attention_mask_list[i],
27472
+ position_ids_list,
27473
+ i
27474
+ );
27475
+ mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
27476
+ }
27477
+ return [
27478
+ new Tensor3("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
27479
+ new Tensor3("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
27480
+ ];
27481
+ } else {
27482
+ return this._get_text_only_rope_index(input_ids, attention_mask);
27483
+ }
27484
+ }
27485
+ async encode_image({ pixel_values, image_grid_thw }) {
27486
+ const features = (await sessionRun(this.sessions["vision_encoder"], {
27487
+ pixel_values,
27488
+ [this.image_grid_thw_name]: image_grid_thw
27489
+ })).image_features;
27490
+ return features;
27491
+ }
27492
+ _merge_input_ids_with_image_features(kwargs) {
27493
+ return default_merge_input_ids_with_image_features({
27494
+ // @ts-ignore
27495
+ image_token_id: this.config.image_token_id,
27496
+ ...kwargs
27497
+ });
27498
+ }
27499
+ prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
27500
+ if (model_inputs.attention_mask && !model_inputs.position_ids) {
27501
+ if (!model_inputs.past_key_values) {
27502
+ [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
27503
+ model_inputs.input_ids,
27504
+ model_inputs.image_grid_thw,
27505
+ model_inputs.video_grid_thw,
27506
+ model_inputs.attention_mask
27507
+ );
27508
+ } else {
27509
+ model_inputs.pixel_values = null;
27510
+ const past_length = model_inputs.past_key_values.get_seq_length();
27511
+ if (past_length < model_inputs.input_ids.dims[1]) {
27512
+ const [full_position_ids, rope_deltas] = this.get_rope_index(
27513
+ model_inputs.input_ids,
27514
+ model_inputs.image_grid_thw,
27515
+ model_inputs.video_grid_thw,
27516
+ model_inputs.attention_mask
27517
+ );
27518
+ model_inputs.rope_deltas = rope_deltas;
27519
+ model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
27520
+ model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
27521
+ } else {
27522
+ if (!model_inputs.rope_deltas) {
27523
+ [, model_inputs.rope_deltas] = this.get_rope_index(
27524
+ model_inputs.input_ids,
27525
+ model_inputs.image_grid_thw,
27526
+ model_inputs.video_grid_thw,
27527
+ model_inputs.attention_mask
27528
+ );
27529
+ }
27530
+ const delta = BigInt(past_length);
27531
+ const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
27532
+ model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
27533
+ }
27534
+ }
27535
+ }
27536
+ return model_inputs;
27537
+ }
27538
+ };
27539
+ var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
27540
+ };
27541
+
27542
+ // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
27543
+ var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
27544
+ image_grid_thw_name = "image_grid_thw";
27545
+ };
27546
+ var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
27547
+ image_grid_thw_name = "image_grid_thw";
27548
+ };
27549
+
27550
+ // src/models/glm_ocr/modeling_glm_ocr.js
27551
+ var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
27552
+ /**
27553
+ * Compute 3D positional indices for vision tokens.
27554
+ * Temporal is constant, height is repeat-interleaved, width tiles.
27555
+ * @param {number} start_position
27556
+ * @param {number[]} grid_thw [T, H, W]
27557
+ * @param {number} temp_merge_size
27558
+ * @param {number} spatial_merge_size
27559
+ * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
27560
+ */
27561
+ get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
27562
+ const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
27563
+ const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
27564
+ const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
27565
+ const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
27566
+ const t_pos = Array.from({ length: seq_len }, () => start_position);
27567
+ const h_pos = Array.from(
27568
+ { length: seq_len },
27569
+ (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
27570
+ );
27571
+ const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
27572
+ return [...t_pos, ...h_pos, ...w_pos];
27573
+ }
27574
+ /**
27575
+ * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
27576
+ * instead of vision_start_token_id scanning used by Qwen2VL.
27577
+ * After a vision segment, position advances by max(h, w) / spatial_merge_size.
27578
+ */
27579
+ _get_multimodal_rope_positions({
27580
+ filtered_ids,
27581
+ image_grid_thw_list,
27582
+ video_grid_thw_list,
27583
+ spatial_merge_size,
27584
+ state
27585
+ }) {
27586
+ const { image_token_id } = this.config;
27587
+ const groups = [];
27588
+ let group_start = 0;
27589
+ let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
27590
+ for (let j = 1; j <= filtered_ids.length; ++j) {
27591
+ const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
27592
+ if (t !== current_type) {
27593
+ groups.push([current_type, group_start, j]);
27594
+ group_start = j;
27595
+ current_type = t;
27596
+ }
27597
+ }
27598
+ let current_pos = 0;
27599
+ const llm_pos_ids_list = [];
27600
+ for (const [modality_type, start_idx, end_idx] of groups) {
27601
+ if (modality_type === 0) {
27602
+ const text_len = end_idx - start_idx;
27603
+ llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
27604
+ current_pos += text_len;
27605
+ } else {
27606
+ const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
27607
+ const temp_merge_size = grid_thw[0];
27608
+ llm_pos_ids_list.push(
27609
+ this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
27610
+ );
27611
+ current_pos += Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size;
27612
+ }
27613
+ }
27614
+ return llm_pos_ids_list;
27615
+ }
27616
+ };
27617
+
27137
27618
  // src/models/glpn/modeling_glpn.js
27138
27619
  var GLPNPreTrainedModel = class extends PreTrainedModel {
27139
27620
  };
@@ -27446,6 +27927,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
27446
27927
  var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
27447
27928
  };
27448
27929
 
27930
+ // src/models/lighton_ocr/modeling_lighton_ocr.js
27931
+ var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
27932
+ };
27933
+
27449
27934
  // src/models/lfm2_moe/modeling_lfm2_moe.js
27450
27935
  var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
27451
27936
  };
@@ -27642,6 +28127,14 @@ var MistralModel = class extends MistralPreTrainedModel {
27642
28127
  var MistralForCausalLM = class extends MistralPreTrainedModel {
27643
28128
  };
27644
28129
 
28130
+ // src/models/mistral4/modeling_mistral4.js
28131
+ var Mistral4PreTrainedModel = class extends PreTrainedModel {
28132
+ };
28133
+ var Mistral4Model = class extends Mistral4PreTrainedModel {
28134
+ };
28135
+ var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
28136
+ };
28137
+
27645
28138
  // src/models/mobilebert/modeling_mobilebert.js
27646
28139
  var MobileBertPreTrainedModel = class extends PreTrainedModel {
27647
28140
  };
@@ -28110,6 +28603,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
28110
28603
  var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
28111
28604
  };
28112
28605
 
28606
+ // src/models/nemotron_h/modeling_nemotron_h.js
28607
+ var NemotronHPreTrainedModel = class extends PreTrainedModel {
28608
+ };
28609
+ var NemotronHModel = class extends NemotronHPreTrainedModel {
28610
+ };
28611
+ var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
28612
+ };
28613
+
28113
28614
  // src/models/neobert/modeling_neobert.js
28114
28615
  var NeoBertPreTrainedModel = class extends PreTrainedModel {
28115
28616
  };
@@ -28390,252 +28891,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
28390
28891
  var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
28391
28892
  };
28392
28893
 
28393
- // src/models/qwen2_vl/modeling_qwen2_vl.js
28394
- var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
28395
- forward_params = [
28396
- // Text inputs
28397
- "input_ids",
28398
- "attention_mask",
28399
- "position_ids",
28400
- "past_key_values",
28401
- // Vision inputs
28402
- "pixel_values",
28403
- "image_grid_thw"
28404
- ];
28405
- };
28406
- var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
28407
- // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
28408
- // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
28409
- // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
28410
- image_grid_thw_name = "grid_thw";
28411
- /**
28412
- * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
28413
- *
28414
- * Explanation:
28415
- * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
28416
- *
28417
- * For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
28418
- * Examples:
28419
- * input_ids: [T T T T T], here T is for text.
28420
- * temporal position_ids: [0, 1, 2, 3, 4]
28421
- * height position_ids: [0, 1, 2, 3, 4]
28422
- * width position_ids: [0, 1, 2, 3, 4]
28423
- *
28424
- * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
28425
- * and 1D rotary position embeddin for text part.
28426
- * Examples:
28427
- * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
28428
- * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
28429
- * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
28430
- * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
28431
- * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
28432
- * text temporal position_ids: [3, 4, 5, 6, 7]
28433
- * text height position_ids: [3, 4, 5, 6, 7]
28434
- * text width position_ids: [3, 4, 5, 6, 7]
28435
- * Here we calculate the text start position_ids as the max vision position_ids plus 1.
28436
- *
28437
- * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
28438
- * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
28439
- * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
28440
- * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
28441
- * - 1 for tokens that are **not masked**,
28442
- * - 0 for tokens that are **masked**.
28443
- * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
28444
- * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
28445
- * - mrope_position_deltas: Tensor of shape `(batch_size)`.
28446
- */
28447
- get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
28448
- const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
28449
- const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
28450
- const mrope_position_deltas = [];
28451
- if (image_grid_thw || video_grid_thw) {
28452
- let total_input_ids = input_ids.tolist();
28453
- if (!attention_mask) {
28454
- attention_mask = ones_like(input_ids);
28455
- }
28456
- const attention_mask_list = attention_mask.tolist();
28457
- const position_ids_list = Array.from(
28458
- { length: 3 },
28459
- (_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
28460
- );
28461
- const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
28462
- const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
28463
- let image_index = 0;
28464
- let video_index = 0;
28465
- for (let i = 0; i < total_input_ids.length; ++i) {
28466
- const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
28467
- const vision_start_indices = ids.reduce((acc, x, idx) => {
28468
- if (x == vision_start_token_id) acc.push(idx);
28469
- return acc;
28470
- }, []);
28471
- const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
28472
- const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
28473
- const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
28474
- let llm_pos_ids_list = [];
28475
- let st2 = 0;
28476
- let remain_images = image_nums;
28477
- let remain_videos = video_nums;
28478
- for (let j = 0; j < vision_tokens.length; ++j) {
28479
- const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
28480
- const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
28481
- const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
28482
- const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
28483
- let ed;
28484
- let t, h, w;
28485
- if (ed_image < ed_video) {
28486
- [t, h, w] = image_grid_thw_list[image_index];
28487
- ++image_index;
28488
- --remain_images;
28489
- ed = ed_image;
28490
- } else {
28491
- [t, h, w] = video_grid_thw_list[video_index];
28492
- ++video_index;
28493
- --remain_videos;
28494
- ed = ed_video;
28495
- }
28496
- const [llm_grid_t, llm_grid_h, llm_grid_w] = [
28497
- Number(t),
28498
- Math.floor(Number(h) / spatial_merge_size),
28499
- Math.floor(Number(w) / spatial_merge_size)
28500
- ];
28501
- const text_len = ed - st2;
28502
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
28503
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
28504
- const offset = text_len + st_idx;
28505
- const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
28506
- const t_index = Array.from(
28507
- { length: grid_size },
28508
- (_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
28509
- );
28510
- const h_index = Array.from(
28511
- { length: grid_size },
28512
- (_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
28513
- );
28514
- const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
28515
- llm_pos_ids_list.push([t_index, h_index, w_index].flat());
28516
- st2 = ed + grid_size;
28517
- }
28518
- if (st2 < ids.length) {
28519
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
28520
- const text_len = ids.length - st2;
28521
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
28522
- }
28523
- const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
28524
- const llm_positions = new Array(num_items);
28525
- let index = 0;
28526
- for (let x = 0; x < 3; ++x) {
28527
- for (let y = 0; y < llm_pos_ids_list.length; ++y) {
28528
- const val = llm_pos_ids_list[y];
28529
- const text_len = val.length / 3;
28530
- for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
28531
- llm_positions[index++] = val[z];
28532
- }
28533
- }
28534
- }
28535
- let count2 = 0;
28536
- const attn_mask = attention_mask_list[i];
28537
- for (let y = 0; y < attn_mask.length; ++y) {
28538
- if (attn_mask[y] == 1) {
28539
- for (let x = 0; x < 3; ++x) {
28540
- position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
28541
- }
28542
- ++count2;
28543
- }
28544
- }
28545
- const max_llm_positions = max(llm_positions)[0];
28546
- mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
28547
- }
28548
- return [
28549
- new Tensor3("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
28550
- new Tensor3("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
28551
- ];
28552
- } else {
28553
- if (attention_mask) {
28554
- const { data, dims } = cumsum_masked_fill(attention_mask);
28555
- const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
28556
- const mrope_position_deltas2 = Array.from(
28557
- { length: dims[0] },
28558
- (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
28559
- );
28560
- return [
28561
- new Tensor3("int64", position_ids, [3, ...dims]),
28562
- new Tensor3("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
28563
- ];
28564
- } else {
28565
- const [batch_size, seq_length] = input_ids.dims;
28566
- const position_ids = BigInt64Array.from(
28567
- { length: 3 * batch_size * seq_length },
28568
- (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
28569
- );
28570
- return [new Tensor3("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
28571
- }
28572
- }
28573
- }
28574
- async encode_image({ pixel_values, image_grid_thw }) {
28575
- const features = (await sessionRun(this.sessions["vision_encoder"], {
28576
- pixel_values,
28577
- [this.image_grid_thw_name]: image_grid_thw
28578
- })).image_features;
28579
- return features;
28580
- }
28581
- _merge_input_ids_with_image_features(kwargs) {
28582
- return default_merge_input_ids_with_image_features({
28583
- // @ts-ignore
28584
- image_token_id: this.config.image_token_id,
28585
- ...kwargs
28586
- });
28587
- }
28588
- prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
28589
- if (model_inputs.attention_mask && !model_inputs.position_ids) {
28590
- if (!model_inputs.past_key_values) {
28591
- [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
28592
- model_inputs.input_ids,
28593
- model_inputs.image_grid_thw,
28594
- model_inputs.video_grid_thw,
28595
- model_inputs.attention_mask
28596
- );
28597
- } else {
28598
- model_inputs.pixel_values = null;
28599
- const past_length = model_inputs.past_key_values.get_seq_length();
28600
- if (past_length < model_inputs.input_ids.dims[1]) {
28601
- const [full_position_ids, rope_deltas] = this.get_rope_index(
28602
- model_inputs.input_ids,
28603
- model_inputs.image_grid_thw,
28604
- model_inputs.video_grid_thw,
28605
- model_inputs.attention_mask
28606
- );
28607
- model_inputs.rope_deltas = rope_deltas;
28608
- model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
28609
- model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
28610
- } else {
28611
- if (!model_inputs.rope_deltas) {
28612
- [, model_inputs.rope_deltas] = this.get_rope_index(
28613
- model_inputs.input_ids,
28614
- model_inputs.image_grid_thw,
28615
- model_inputs.video_grid_thw,
28616
- model_inputs.attention_mask
28617
- );
28618
- }
28619
- const delta = BigInt(past_length);
28620
- const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
28621
- model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
28622
- }
28623
- }
28624
- }
28625
- return model_inputs;
28626
- }
28627
- };
28628
- var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
28629
- };
28630
-
28631
- // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
28632
- var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
28633
- image_grid_thw_name = "image_grid_thw";
28634
- };
28635
- var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
28636
- image_grid_thw_name = "image_grid_thw";
28637
- };
28638
-
28639
28894
  // src/models/qwen3/modeling_qwen3.js
28640
28895
  var Qwen3PreTrainedModel = class extends PreTrainedModel {
28641
28896
  };
@@ -29081,6 +29336,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
29081
29336
  }
29082
29337
  };
29083
29338
 
29339
+ // src/models/solar_open/modeling_solar_open.js
29340
+ var SolarOpenPreTrainedModel = class extends PreTrainedModel {
29341
+ };
29342
+ var SolarOpenModel = class extends SolarOpenPreTrainedModel {
29343
+ };
29344
+ var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
29345
+ };
29346
+
29084
29347
  // src/models/speecht5/modeling_speecht5.js
29085
29348
  var SpeechT5PreTrainedModel = class extends PreTrainedModel {
29086
29349
  };
@@ -30197,6 +30460,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
30197
30460
  // src/models/registry.js
30198
30461
  var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
30199
30462
  ["bert", "BertModel"],
30463
+ ["eurobert", "EuroBertModel"],
30200
30464
  ["neobert", "NeoBertModel"],
30201
30465
  ["modernbert", "ModernBertModel"],
30202
30466
  ["nomic_bert", "NomicBertModel"],
@@ -30328,6 +30592,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
30328
30592
  ["gemma3_text", "Gemma3Model"],
30329
30593
  ["helium", "HeliumModel"],
30330
30594
  ["glm", "GlmModel"],
30595
+ ["glm_moe_dsa", "GlmMoeDsaModel"],
30331
30596
  ["openelm", "OpenELMModel"],
30332
30597
  ["qwen2", "Qwen2Model"],
30333
30598
  ["qwen2_moe", "Qwen2MoeModel"],
@@ -30339,12 +30604,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
30339
30604
  ["mpt", "MptModel"],
30340
30605
  ["opt", "OPTModel"],
30341
30606
  ["mistral", "MistralModel"],
30607
+ ["mistral4", "Mistral4Model"],
30342
30608
  ["ministral", "MinistralModel"],
30343
30609
  ["ministral3", "Ministral3Model"],
30344
30610
  ["ernie4_5", "Ernie4_5ForCausalLM"],
30345
30611
  ["starcoder2", "Starcoder2Model"],
30612
+ ["deepseek_v3", "DeepseekV3Model"],
30346
30613
  ["falcon", "FalconModel"],
30347
30614
  ["falcon_h1", "FalconH1Model"],
30615
+ ["nemotron_h", "NemotronHModel"],
30616
+ ["solar_open", "SolarOpenModel"],
30348
30617
  ["stablelm", "StableLmModel"],
30349
30618
  ["modernbert-decoder", "ModernBertDecoderModel"],
30350
30619
  ["hunyuan_v1_dense", "HunYuanDenseV1Model"],
@@ -30364,6 +30633,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30364
30633
  ]);
30365
30634
  var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30366
30635
  ["bert", "BertForSequenceClassification"],
30636
+ ["eurobert", "EuroBertForSequenceClassification"],
30367
30637
  ["neobert", "NeoBertForSequenceClassification"],
30368
30638
  ["modernbert", "ModernBertForSequenceClassification"],
30369
30639
  ["roformer", "RoFormerForSequenceClassification"],
@@ -30386,6 +30656,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30386
30656
  ]);
30387
30657
  var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30388
30658
  ["bert", "BertForTokenClassification"],
30659
+ ["eurobert", "EuroBertForTokenClassification"],
30389
30660
  ["neobert", "NeoBertForTokenClassification"],
30390
30661
  ["modernbert", "ModernBertForTokenClassification"],
30391
30662
  ["roformer", "RoFormerForTokenClassification"],
@@ -30448,6 +30719,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30448
30719
  ["gemma3", "Gemma3ForCausalLM"],
30449
30720
  ["helium", "HeliumForCausalLM"],
30450
30721
  ["glm", "GlmForCausalLM"],
30722
+ ["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
30451
30723
  ["openelm", "OpenELMForCausalLM"],
30452
30724
  ["qwen2", "Qwen2ForCausalLM"],
30453
30725
  ["qwen2_moe", "Qwen2MoeForCausalLM"],
@@ -30467,13 +30739,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30467
30739
  ["opt", "OPTForCausalLM"],
30468
30740
  ["mbart", "MBartForCausalLM"],
30469
30741
  ["mistral", "MistralForCausalLM"],
30742
+ ["mistral4", "Mistral4ForCausalLM"],
30470
30743
  ["ministral", "MinistralForCausalLM"],
30471
30744
  ["ministral3", "Ministral3ForCausalLM"],
30472
30745
  ["ernie4_5", "Ernie4_5ForCausalLM"],
30473
30746
  ["starcoder2", "Starcoder2ForCausalLM"],
30747
+ ["deepseek_v3", "DeepseekV3ForCausalLM"],
30474
30748
  ["falcon", "FalconForCausalLM"],
30475
30749
  ["falcon_h1", "FalconH1ForCausalLM"],
30750
+ ["nemotron_h", "NemotronHForCausalLM"],
30476
30751
  ["trocr", "TrOCRForCausalLM"],
30752
+ ["solar_open", "SolarOpenForCausalLM"],
30477
30753
  ["stablelm", "StableLmForCausalLM"],
30478
30754
  ["modernbert-decoder", "ModernBertDecoderForCausalLM"],
30479
30755
  ["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
@@ -30484,6 +30760,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30484
30760
  var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
30485
30761
  var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
30486
30762
  ["bert", "BertForMaskedLM"],
30763
+ ["eurobert", "EuroBertForMaskedLM"],
30487
30764
  ["neobert", "NeoBertForMaskedLM"],
30488
30765
  ["modernbert", "ModernBertForMaskedLM"],
30489
30766
  ["roformer", "RoFormerForMaskedLM"],
@@ -30542,7 +30819,9 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
30542
30819
  ["paligemma", "PaliGemmaForConditionalGeneration"],
30543
30820
  ["llava_qwen2", "LlavaQwen2ForCausalLM"],
30544
30821
  ["gemma3n", "Gemma3nForConditionalGeneration"],
30545
- ["mistral3", "Mistral3ForConditionalGeneration"]
30822
+ ["mistral3", "Mistral3ForConditionalGeneration"],
30823
+ ["lighton_ocr", "LightOnOcrForConditionalGeneration"],
30824
+ ["glm_ocr", "GlmOcrForConditionalGeneration"]
30546
30825
  ]);
30547
30826
  var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
30548
30827
  ["granite_speech", "GraniteSpeechForConditionalGeneration"],
@@ -30647,6 +30926,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30647
30926
  ]);
30648
30927
  var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
30649
30928
  var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30929
+ ["chmv2", "CHMv2ForDepthEstimation"],
30650
30930
  ["dpt", "DPTForDepthEstimation"],
30651
30931
  ["depth_anything", "DepthAnythingForDepthEstimation"],
30652
30932
  ["glpn", "GLPNForDepthEstimation"],
@@ -33385,6 +33665,9 @@ export {
33385
33665
  BloomModel,
33386
33666
  BloomPreTrainedModel,
33387
33667
  BloomTokenizer,
33668
+ CHMv2ForDepthEstimation,
33669
+ CHMv2ImageProcessor,
33670
+ CHMv2PreTrainedModel,
33388
33671
  CLIPFeatureExtractor,
33389
33672
  CLIPImageProcessor,
33390
33673
  CLIPModel,
@@ -33480,6 +33763,9 @@ export {
33480
33763
  DebertaV2Tokenizer,
33481
33764
  DecisionTransformerModel,
33482
33765
  DecisionTransformerPreTrainedModel,
33766
+ DeepseekV3ForCausalLM,
33767
+ DeepseekV3Model,
33768
+ DeepseekV3PreTrainedModel,
33483
33769
  DeiTFeatureExtractor,
33484
33770
  DeiTForImageClassification,
33485
33771
  DeiTImageProcessor,
@@ -33540,6 +33826,11 @@ export {
33540
33826
  EsmModel,
33541
33827
  EsmPreTrainedModel,
33542
33828
  EsmTokenizer,
33829
+ EuroBertForMaskedLM,
33830
+ EuroBertForSequenceClassification,
33831
+ EuroBertForTokenClassification,
33832
+ EuroBertModel,
33833
+ EuroBertPreTrainedModel,
33543
33834
  ExaoneForCausalLM,
33544
33835
  ExaoneModel,
33545
33836
  ExaonePreTrainedModel,
@@ -33597,8 +33888,14 @@ export {
33597
33888
  GemmaModel,
33598
33889
  GemmaPreTrainedModel,
33599
33890
  GemmaTokenizer,
33891
+ Glm46VImageProcessor,
33892
+ Glm46VProcessor,
33600
33893
  GlmForCausalLM,
33601
33894
  GlmModel,
33895
+ GlmMoeDsaForCausalLM,
33896
+ GlmMoeDsaModel,
33897
+ GlmMoeDsaPreTrainedModel,
33898
+ GlmOcrForConditionalGeneration,
33602
33899
  GlmPreTrainedModel,
33603
33900
  GptOssForCausalLM,
33604
33901
  GptOssModel,
@@ -33664,6 +33961,7 @@ export {
33664
33961
  Lfm2VlForConditionalGeneration,
33665
33962
  Lfm2VlImageProcessor,
33666
33963
  Lfm2VlProcessor,
33964
+ LightOnOcrForConditionalGeneration,
33667
33965
  LiteWhisperForConditionalGeneration,
33668
33966
  Llama4ForCausalLM,
33669
33967
  Llama4PreTrainedModel,
@@ -33733,6 +34031,9 @@ export {
33733
34031
  MimiPreTrainedModel,
33734
34032
  MinLengthLogitsProcessor,
33735
34033
  MinNewTokensLengthLogitsProcessor,
34034
+ Mistral4ForCausalLM,
34035
+ Mistral4Model,
34036
+ Mistral4PreTrainedModel,
33736
34037
  MistralForCausalLM,
33737
34038
  MistralModel,
33738
34039
  MistralPreTrainedModel,
@@ -33804,6 +34105,9 @@ export {
33804
34105
  NanoChatForCausalLM,
33805
34106
  NanoChatModel,
33806
34107
  NanoChatPreTrainedModel,
34108
+ NemotronHForCausalLM,
34109
+ NemotronHModel,
34110
+ NemotronHPreTrainedModel,
33807
34111
  NeoBertForMaskedLM,
33808
34112
  NeoBertForQuestionAnswering,
33809
34113
  NeoBertForSequenceClassification,
@@ -33993,6 +34297,9 @@ export {
33993
34297
  SnacFeatureExtractor,
33994
34298
  SnacModel,
33995
34299
  SnacPreTrainedModel,
34300
+ SolarOpenForCausalLM,
34301
+ SolarOpenModel,
34302
+ SolarOpenPreTrainedModel,
33996
34303
  SpeechT5FeatureExtractor,
33997
34304
  SpeechT5ForSpeechToText,
33998
34305
  SpeechT5ForTextToSpeech,