@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. package/README.md +13 -2
  2. package/dist/transformers.js +689 -382
  3. package/dist/transformers.min.js +19 -19
  4. package/dist/transformers.node.cjs +716 -382
  5. package/dist/transformers.node.min.cjs +19 -19
  6. package/dist/transformers.node.min.mjs +19 -19
  7. package/dist/transformers.node.mjs +689 -382
  8. package/dist/transformers.web.js +697 -390
  9. package/dist/transformers.web.min.js +17 -17
  10. package/package.json +2 -2
  11. package/src/configs.js +28 -22
  12. package/src/env.js +1 -1
  13. package/src/image_processors_utils.js +25 -15
  14. package/src/models/chmv2/image_processing_chmv2.js +3 -0
  15. package/src/models/chmv2/modeling_chmv2.js +4 -0
  16. package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
  17. package/src/models/eurobert/modeling_eurobert.js +41 -0
  18. package/src/models/glm46v/image_processing_glm46v.js +12 -0
  19. package/src/models/glm46v/processing_glm46v.js +5 -0
  20. package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
  21. package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
  22. package/src/models/image_processors.js +2 -0
  23. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
  24. package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
  25. package/src/models/mistral4/modeling_mistral4.js +5 -0
  26. package/src/models/modeling_utils.js +2 -0
  27. package/src/models/models.js +10 -1
  28. package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
  29. package/src/models/processors.js +1 -0
  30. package/src/models/qwen2_vl/modeling_qwen2_vl.js +187 -136
  31. package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
  32. package/src/models/registry.js +17 -0
  33. package/src/models/solar_open/modeling_solar_open.js +5 -0
  34. package/src/pipelines.js +1 -0
  35. package/src/utils/hub.js +4 -1
  36. package/src/utils/model_registry/get_file_metadata.js +1 -0
  37. package/types/configs.d.ts.map +1 -1
  38. package/types/image_processors_utils.d.ts +3 -2
  39. package/types/image_processors_utils.d.ts.map +1 -1
  40. package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
  41. package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
  42. package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
  43. package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
  44. package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
  45. package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
  46. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
  47. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
  48. package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
  49. package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
  50. package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
  51. package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
  52. package/types/models/glm46v/processing_glm46v.d.ts +4 -0
  53. package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
  54. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
  55. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
  56. package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
  57. package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
  58. package/types/models/image_processors.d.ts +2 -0
  59. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
  60. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
  61. package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
  62. package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
  63. package/types/models/modeling_utils.d.ts.map +1 -1
  64. package/types/models/models.d.ts +10 -1
  65. package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
  66. package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
  67. package/types/models/processors.d.ts +1 -0
  68. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
  69. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  70. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
  71. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  72. package/types/models/registry.d.ts.map +1 -1
  73. package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
  74. package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
  75. package/types/pipelines.d.ts +1 -0
  76. package/types/pipelines.d.ts.map +1 -1
  77. package/types/utils/hub.d.ts.map +1 -1
  78. package/types/models/ast/modeling_ast.d.ts.map +0 -1
  79. package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
@@ -14,7 +14,7 @@ var node_path_default = {};
14
14
  var node_url_default = {};
15
15
 
16
16
  // src/env.js
17
- var VERSION = "4.0.0-next.7";
17
+ var VERSION = "4.0.0-next.8";
18
18
  var HAS_SELF = typeof self !== "undefined";
19
19
  var IS_FS_AVAILABLE = !isEmpty(node_fs_default);
20
20
  var IS_PATH_AVAILABLE = !isEmpty(node_path_default);
@@ -244,7 +244,7 @@ var logger = {
244
244
  }
245
245
  };
246
246
 
247
- // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.2/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
247
+ // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
248
248
  var DictionarySplitter = class {
249
249
  /**
250
250
  * @param dictionary The dictionary of words to use for splitting.
@@ -1900,10 +1900,10 @@ var BPE = class extends TokenizerModel_default {
1900
1900
  );
1901
1901
  if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
1902
1902
  output_tokens.push(...byte_tokens);
1903
- } else {
1903
+ } else if (this.unk_token != null) {
1904
1904
  output_tokens.push(this.unk_token);
1905
1905
  }
1906
- } else {
1906
+ } else if (this.unk_token != null) {
1907
1907
  output_tokens.push(this.unk_token);
1908
1908
  }
1909
1909
  }
@@ -6509,13 +6509,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
6509
6509
  wrapped_progress
6510
6510
  );
6511
6511
  } else if (typeof response !== "string") {
6512
+ const headers = new Headers(response.headers);
6513
+ headers.set("content-length", result.byteLength.toString());
6512
6514
  await cache2.put(
6513
6515
  cacheKey,
6514
6516
  new Response(
6515
6517
  /** @type {any} */
6516
6518
  result,
6517
6519
  {
6518
- headers: response.headers
6520
+ headers
6519
6521
  }
6520
6522
  )
6521
6523
  ).catch((err) => {
@@ -11829,6 +11831,7 @@ __export(processors_exports, {
11829
11831
  ChatterboxProcessor: () => ChatterboxProcessor,
11830
11832
  Florence2Processor: () => Florence2Processor,
11831
11833
  Gemma3nProcessor: () => Gemma3nProcessor,
11834
+ Glm46VProcessor: () => Glm46VProcessor,
11832
11835
  GraniteSpeechProcessor: () => GraniteSpeechProcessor,
11833
11836
  GroundingDinoProcessor: () => GroundingDinoProcessor,
11834
11837
  Idefics3Processor: () => Idefics3Processor,
@@ -14342,26 +14345,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
14342
14345
  }
14343
14346
  return [segmentation, segments];
14344
14347
  }
14345
- function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
14348
+ function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
14346
14349
  if (height < factor || width < factor) {
14347
- throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
14348
- } else if (Math.max(height, width) / Math.min(height, width) > 200) {
14350
+ const scale = Math.max(factor / height, factor / width);
14351
+ height = Math.round(height * scale);
14352
+ width = Math.round(width * scale);
14353
+ }
14354
+ if (Math.max(height, width) / Math.min(height, width) > 200) {
14349
14355
  throw new Error(
14350
14356
  `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
14351
14357
  );
14352
14358
  }
14353
14359
  let h_bar = Math.round(height / factor) * factor;
14354
14360
  let w_bar = Math.round(width / factor) * factor;
14355
- if (h_bar * w_bar > max_pixels) {
14356
- const beta = Math.sqrt(height * width / max_pixels);
14357
- h_bar = Math.floor(height / beta / factor) * factor;
14358
- w_bar = Math.floor(width / beta / factor) * factor;
14359
- } else if (h_bar * w_bar < min_pixels) {
14360
- const beta = Math.sqrt(min_pixels / (height * width));
14361
+ if (temporal_factor * h_bar * w_bar > max_pixels) {
14362
+ const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
14363
+ h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
14364
+ w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
14365
+ } else if (temporal_factor * h_bar * w_bar < min_pixels) {
14366
+ const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
14361
14367
  h_bar = Math.ceil(height * beta / factor) * factor;
14362
14368
  w_bar = Math.ceil(width * beta / factor) * factor;
14363
14369
  }
14364
- return [h_bar, w_bar];
14370
+ return [w_bar, h_bar];
14365
14371
  }
14366
14372
  function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
14367
14373
  if (label_ids_to_fuse === null) {
@@ -14440,7 +14446,7 @@ var ImageProcessor = class extends Callable2 {
14440
14446
  this.do_pad = config.do_pad;
14441
14447
  this.min_pixels = config.min_pixels;
14442
14448
  this.max_pixels = config.max_pixels;
14443
- if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
14449
+ if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
14444
14450
  this.pad_size = this.size;
14445
14451
  }
14446
14452
  this.do_flip_channel_order = config.do_flip_channel_order ?? false;
@@ -14728,10 +14734,8 @@ var ImageProcessor = class extends Callable2 {
14728
14734
  const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
14729
14735
  [pixelData, imgDims] = padded;
14730
14736
  } else if (this.size_divisibility) {
14731
- const [paddedWidth, paddedHeight] = enforce_size_divisibility(
14732
- [imgDims[1], imgDims[0]],
14733
- this.size_divisibility
14734
- );
14737
+ const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
14738
+ const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
14735
14739
  [pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
14736
14740
  }
14737
14741
  }
@@ -14808,6 +14812,7 @@ var image_processors_exports = {};
14808
14812
  __export(image_processors_exports, {
14809
14813
  BeitFeatureExtractor: () => BeitFeatureExtractor,
14810
14814
  BitImageProcessor: () => BitImageProcessor,
14815
+ CHMv2ImageProcessor: () => CHMv2ImageProcessor,
14811
14816
  CLIPFeatureExtractor: () => CLIPFeatureExtractor,
14812
14817
  CLIPImageProcessor: () => CLIPImageProcessor,
14813
14818
  ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
@@ -14824,6 +14829,7 @@ __export(image_processors_exports, {
14824
14829
  DonutImageProcessor: () => DonutImageProcessor,
14825
14830
  EfficientNetImageProcessor: () => EfficientNetImageProcessor,
14826
14831
  GLPNFeatureExtractor: () => GLPNFeatureExtractor,
14832
+ Glm46VImageProcessor: () => Glm46VImageProcessor,
14827
14833
  GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
14828
14834
  Idefics3ImageProcessor: () => Idefics3ImageProcessor,
14829
14835
  ImageFeatureExtractor: () => ImageProcessor,
@@ -14884,6 +14890,10 @@ var BitImageProcessor = class extends ImageProcessor {
14884
14890
  var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
14885
14891
  };
14886
14892
 
14893
+ // src/models/chmv2/image_processing_chmv2.js
14894
+ var CHMv2ImageProcessor = class extends ImageProcessor {
14895
+ };
14896
+
14887
14897
  // src/models/clip/image_processing_clip.js
14888
14898
  var CLIPImageProcessor = class extends ImageProcessor {
14889
14899
  };
@@ -15003,6 +15013,65 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
15003
15013
  }
15004
15014
  };
15005
15015
 
15016
+ // src/models/qwen2_vl/image_processing_qwen2_vl.js
15017
+ var Qwen2VLImageProcessor = class extends ImageProcessor {
15018
+ constructor(config) {
15019
+ super(config);
15020
+ this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
15021
+ this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
15022
+ this.patch_size = config.patch_size;
15023
+ this.merge_size = config.merge_size;
15024
+ }
15025
+ /** @type {ImageProcessor['get_resize_output_image_size']} */
15026
+ get_resize_output_image_size(image, size) {
15027
+ const factor = this.patch_size * this.merge_size;
15028
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
15029
+ }
15030
+ async _call(images, ...args) {
15031
+ const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
15032
+ let patches = pixel_values;
15033
+ const { temporal_patch_size, merge_size, patch_size } = this.config;
15034
+ if (patches.dims[0] === 1) {
15035
+ patches = cat(
15036
+ Array.from({ length: temporal_patch_size }, () => patches),
15037
+ 0
15038
+ );
15039
+ }
15040
+ const grid_t = patches.dims[0] / temporal_patch_size;
15041
+ const channel = patches.dims[1];
15042
+ const grid_h = Math.floor(patches.dims[2] / patch_size);
15043
+ const grid_w = Math.floor(patches.dims[3] / patch_size);
15044
+ const flatten_patches = patches.view(
15045
+ grid_t,
15046
+ temporal_patch_size,
15047
+ channel,
15048
+ Math.floor(grid_h / merge_size),
15049
+ merge_size,
15050
+ patch_size,
15051
+ Math.floor(grid_w / merge_size),
15052
+ merge_size,
15053
+ patch_size
15054
+ ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
15055
+ const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
15056
+ return {
15057
+ pixel_values: flatten_patches,
15058
+ image_grid_thw,
15059
+ original_sizes,
15060
+ reshaped_input_sizes
15061
+ };
15062
+ }
15063
+ };
15064
+
15065
+ // src/models/glm46v/image_processing_glm46v.js
15066
+ var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
15067
+ /** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
15068
+ get_resize_output_image_size(image, size) {
15069
+ const factor = this.patch_size * this.merge_size;
15070
+ const temporal_factor = this.config.temporal_patch_size ?? 2;
15071
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
15072
+ }
15073
+ };
15074
+
15006
15075
  // src/models/glpn/image_processing_glpn.js
15007
15076
  var GLPNFeatureExtractor = class extends ImageProcessor {
15008
15077
  };
@@ -15396,7 +15465,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
15396
15465
  const img = pixel_values.unsqueeze_(0);
15397
15466
  const total_factor = this.encoder_patch_size * this.downsample_factor;
15398
15467
  const f2 = total_factor ** 2;
15399
- const [new_height, new_width] = smart_resize(
15468
+ const [new_width, new_height] = smart_resize(
15400
15469
  Math.max(total_factor, height),
15401
15470
  Math.max(total_factor, width),
15402
15471
  total_factor,
@@ -15686,55 +15755,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
15686
15755
  var PvtImageProcessor = class extends ImageProcessor {
15687
15756
  };
15688
15757
 
15689
- // src/models/qwen2_vl/image_processing_qwen2_vl.js
15690
- var Qwen2VLImageProcessor = class extends ImageProcessor {
15691
- constructor(config) {
15692
- super(config);
15693
- this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
15694
- this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
15695
- this.patch_size = config.patch_size;
15696
- this.merge_size = config.merge_size;
15697
- }
15698
- /** @type {ImageProcessor['get_resize_output_image_size']} */
15699
- get_resize_output_image_size(image, size) {
15700
- const factor = this.patch_size * this.merge_size;
15701
- return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
15702
- }
15703
- async _call(images, ...args) {
15704
- const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
15705
- let patches = pixel_values;
15706
- const { temporal_patch_size, merge_size, patch_size } = this.config;
15707
- if (patches.dims[0] === 1) {
15708
- patches = cat(
15709
- Array.from({ length: temporal_patch_size }, () => patches),
15710
- 0
15711
- );
15712
- }
15713
- const grid_t = patches.dims[0] / temporal_patch_size;
15714
- const channel = patches.dims[1];
15715
- const grid_h = Math.floor(patches.dims[2] / patch_size);
15716
- const grid_w = Math.floor(patches.dims[3] / patch_size);
15717
- const flatten_patches = patches.view(
15718
- grid_t,
15719
- temporal_patch_size,
15720
- channel,
15721
- Math.floor(grid_h / merge_size),
15722
- merge_size,
15723
- patch_size,
15724
- Math.floor(grid_w / merge_size),
15725
- merge_size,
15726
- patch_size
15727
- ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
15728
- const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
15729
- return {
15730
- pixel_values: flatten_patches,
15731
- image_grid_thw,
15732
- original_sizes,
15733
- reshaped_input_sizes
15734
- };
15735
- }
15736
- };
15737
-
15738
15758
  // src/models/rt_detr/image_processing_rt_detr.js
15739
15759
  var RTDetrImageProcessor = class extends ImageProcessor {
15740
15760
  /** @type {typeof post_process_object_detection} */
@@ -16288,6 +16308,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
16288
16308
  }
16289
16309
  };
16290
16310
 
16311
+ // src/models/qwen2_vl/processing_qwen2_vl.js
16312
+ var Qwen2VLProcessor = class extends Processor {
16313
+ static image_processor_class = AutoImageProcessor;
16314
+ static tokenizer_class = AutoTokenizer;
16315
+ static image_token = "<|image_pad|>";
16316
+ /**
16317
+ *
16318
+ * @param {string|string[]} text
16319
+ * @param {RawImage|RawImage[]} images
16320
+ * @param {...any} args
16321
+ * @returns {Promise<any>}
16322
+ */
16323
+ async _call(text, images = null, ...args) {
16324
+ if (!Array.isArray(text)) {
16325
+ text = [text];
16326
+ }
16327
+ let image_inputs, image_grid_thw;
16328
+ if (images) {
16329
+ image_inputs = await this.image_processor(images);
16330
+ image_grid_thw = image_inputs.image_grid_thw;
16331
+ }
16332
+ if (image_grid_thw) {
16333
+ let merge_length = this.image_processor.config.merge_size ** 2;
16334
+ let index = 0;
16335
+ const image_token = (
16336
+ /** @type {typeof Qwen2VLProcessor} */
16337
+ this.constructor.image_token
16338
+ );
16339
+ const image_grid_thw_list = image_grid_thw.tolist();
16340
+ text = text.map((t) => {
16341
+ while (t.includes(image_token)) {
16342
+ const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
16343
+ t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
16344
+ }
16345
+ return t.replaceAll("<|placeholder|>", image_token);
16346
+ });
16347
+ }
16348
+ const text_inputs = this.tokenizer(text);
16349
+ return {
16350
+ ...text_inputs,
16351
+ ...image_inputs
16352
+ };
16353
+ }
16354
+ };
16355
+
16356
+ // src/models/glm46v/processing_glm46v.js
16357
+ var Glm46VProcessor = class extends Qwen2VLProcessor {
16358
+ static image_token = "<|image|>";
16359
+ };
16360
+
16291
16361
  // src/models/granite_speech/processing_granite_speech.js
16292
16362
  var GraniteSpeechProcessor = class extends Processor {
16293
16363
  static tokenizer_class = AutoTokenizer;
@@ -17018,47 +17088,6 @@ var PyAnnoteProcessor = class extends Processor {
17018
17088
  }
17019
17089
  };
17020
17090
 
17021
- // src/models/qwen2_vl/processing_qwen2_vl.js
17022
- var Qwen2VLProcessor = class extends Processor {
17023
- static image_processor_class = AutoImageProcessor;
17024
- static tokenizer_class = AutoTokenizer;
17025
- /**
17026
- *
17027
- * @param {string|string[]} text
17028
- * @param {RawImage|RawImage[]} images
17029
- * @param {...any} args
17030
- * @returns {Promise<any>}
17031
- */
17032
- async _call(text, images = null, ...args) {
17033
- if (!Array.isArray(text)) {
17034
- text = [text];
17035
- }
17036
- let image_inputs, image_grid_thw;
17037
- if (images) {
17038
- image_inputs = await this.image_processor(images);
17039
- image_grid_thw = image_inputs.image_grid_thw;
17040
- }
17041
- if (image_grid_thw) {
17042
- let merge_length = this.image_processor.config.merge_size ** 2;
17043
- let index = 0;
17044
- const image_grid_thw_list = image_grid_thw.tolist();
17045
- text = text.map((t) => {
17046
- while (t.includes("<|image_pad|>")) {
17047
- const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
17048
- t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
17049
- }
17050
- return t.replaceAll("<|placeholder|>", "<|image_pad|>");
17051
- });
17052
- }
17053
- const text_inputs = this.tokenizer(text);
17054
- return {
17055
- ...text_inputs,
17056
- ...image_inputs
17057
- // TODO: ...videos_inputs,
17058
- };
17059
- }
17060
- };
17061
-
17062
17091
  // src/models/qwen2_5_vl/processing_qwen2_5_vl.js
17063
17092
  var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
17064
17093
  };
@@ -17402,6 +17431,8 @@ function getNormalizedConfig(config) {
17402
17431
  case "gemma3n":
17403
17432
  case "lfm2_vl":
17404
17433
  case "chatterbox":
17434
+ case "lighton_ocr":
17435
+ case "glm_ocr":
17405
17436
  case "mistral3":
17406
17437
  case "qwen2_5_vl":
17407
17438
  case "qwen3_vl":
@@ -17477,6 +17508,8 @@ function getNormalizedConfig(config) {
17477
17508
  mapping["dim_kv"] = "head_dim";
17478
17509
  break;
17479
17510
  case "qwen3":
17511
+ case "solar_open":
17512
+ case "glm_ocr_text":
17480
17513
  case "gemma":
17481
17514
  case "gemma2":
17482
17515
  case "vaultgemma":
@@ -17487,6 +17520,7 @@ function getNormalizedConfig(config) {
17487
17520
  case "ernie4_5":
17488
17521
  case "hunyuan_v1_dense":
17489
17522
  case "falcon_h1":
17523
+ case "nemotron_h":
17490
17524
  case "ministral":
17491
17525
  case "ministral3":
17492
17526
  mapping["num_heads"] = "num_key_value_heads";
@@ -17521,6 +17555,9 @@ function getNormalizedConfig(config) {
17521
17555
  mapping["num_attention_heads"] = "num_attention_heads";
17522
17556
  break;
17523
17557
  case "youtu":
17558
+ case "deepseek_v3":
17559
+ case "glm_moe_dsa":
17560
+ case "mistral4":
17524
17561
  mapping["num_heads"] = "num_key_value_heads";
17525
17562
  mapping["num_layers"] = "num_hidden_layers";
17526
17563
  mapping["dim_kv"] = "qk_head_dim";
@@ -17609,6 +17646,7 @@ function getCacheShapes(config, options) {
17609
17646
  if (!(config instanceof PretrainedConfig)) {
17610
17647
  config = new PretrainedConfig(config);
17611
17648
  }
17649
+ const batch_size = options?.batch_size ?? 1;
17612
17650
  if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
17613
17651
  const pkv_prefix = options?.prefix ?? "past_key_values";
17614
17652
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
@@ -17618,7 +17656,6 @@ function getCacheShapes(config, options) {
17618
17656
  config
17619
17657
  );
17620
17658
  const head_dim = hidden_size / num_attention_heads;
17621
- const batch_size = options?.batch_size ?? 1;
17622
17659
  for (let i = 0; i < layer_types.length; ++i) {
17623
17660
  if (layer_types[i] === "full_attention") {
17624
17661
  for (const kv of ["key", "value"]) {
@@ -17631,31 +17668,26 @@ function getCacheShapes(config, options) {
17631
17668
  }
17632
17669
  }
17633
17670
  return cache_values;
17634
- } else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
17671
+ } else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
17635
17672
  const pkv_prefix = options?.prefix ?? "past_key_values";
17636
17673
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
17637
- const cache_values = {};
17638
- const {
17639
- layer_types,
17640
- num_hidden_layers,
17641
- num_attention_heads,
17642
- num_key_value_heads,
17643
- hidden_size,
17644
- mamba_d_conv,
17645
- mamba_n_heads,
17646
- mamba_d_head,
17647
- mamba_d_state,
17648
- mamba_n_groups,
17649
- mamba_expand,
17650
- mamba_d_ssm
17651
- } = (
17674
+ const c = (
17652
17675
  /** @type {any} */
17653
17676
  config
17654
17677
  );
17655
- const head_dim = hidden_size / num_attention_heads;
17656
- const batch_size = options?.batch_size ?? 1;
17657
- const conv_d_inner = (mamba_d_ssm ?? mamba_expand * hidden_size) + 2 * mamba_n_groups * mamba_d_state;
17658
- for (let i = 0; i < num_hidden_layers; ++i) {
17678
+ const layer_types = c.layer_types ?? c.layers_block_type;
17679
+ const num_layers = c.num_hidden_layers ?? layer_types?.length;
17680
+ const num_key_value_heads = c.num_key_value_heads;
17681
+ const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
17682
+ const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
17683
+ const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
17684
+ const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
17685
+ const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
17686
+ const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
17687
+ const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
17688
+ const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
17689
+ const cache_values = {};
17690
+ for (let i = 0; i < num_layers; ++i) {
17659
17691
  if (!layer_types || layer_types[i] === "mamba") {
17660
17692
  cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
17661
17693
  cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
@@ -17689,7 +17721,6 @@ function getCacheShapes(config, options) {
17689
17721
  const key_dim = linear_key_head_dim * linear_num_key_heads;
17690
17722
  const value_dim = linear_value_head_dim * linear_num_value_heads;
17691
17723
  const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
17692
- const batch_size = options?.batch_size ?? 1;
17693
17724
  for (let i = 0; i < layer_types.length; ++i) {
17694
17725
  if (layer_types[i] === "full_attention") {
17695
17726
  for (const kv of ["key", "value"]) {
@@ -20317,7 +20348,9 @@ async function generic_text_to_text_forward(self2, {
20317
20348
  "qwen3_5",
20318
20349
  "qwen3_5_text",
20319
20350
  "qwen3_5_moe",
20320
- "qwen3_5_moe_text"
20351
+ "qwen3_5_moe_text",
20352
+ "glm_ocr",
20353
+ "glm_ocr_text"
20321
20354
  ].includes(self2.config.model_type)
20322
20355
  ) {
20323
20356
  const { image_grid_thw, video_grid_thw } = kwargs;
@@ -20541,6 +20574,8 @@ __export(models_exports, {
20541
20574
  BloomForCausalLM: () => BloomForCausalLM,
20542
20575
  BloomModel: () => BloomModel,
20543
20576
  BloomPreTrainedModel: () => BloomPreTrainedModel,
20577
+ CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
20578
+ CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
20544
20579
  CLIPModel: () => CLIPModel,
20545
20580
  CLIPPreTrainedModel: () => CLIPPreTrainedModel,
20546
20581
  CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
@@ -20615,6 +20650,9 @@ __export(models_exports, {
20615
20650
  DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
20616
20651
  DecisionTransformerModel: () => DecisionTransformerModel,
20617
20652
  DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
20653
+ DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
20654
+ DeepseekV3Model: () => DeepseekV3Model,
20655
+ DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
20618
20656
  DeiTForImageClassification: () => DeiTForImageClassification,
20619
20657
  DeiTModel: () => DeiTModel,
20620
20658
  DeiTPreTrainedModel: () => DeiTPreTrainedModel,
@@ -20660,6 +20698,11 @@ __export(models_exports, {
20660
20698
  EsmForTokenClassification: () => EsmForTokenClassification,
20661
20699
  EsmModel: () => EsmModel,
20662
20700
  EsmPreTrainedModel: () => EsmPreTrainedModel,
20701
+ EuroBertForMaskedLM: () => EuroBertForMaskedLM,
20702
+ EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
20703
+ EuroBertForTokenClassification: () => EuroBertForTokenClassification,
20704
+ EuroBertModel: () => EuroBertModel,
20705
+ EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
20663
20706
  ExaoneForCausalLM: () => ExaoneForCausalLM,
20664
20707
  ExaoneModel: () => ExaoneModel,
20665
20708
  ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -20706,6 +20749,10 @@ __export(models_exports, {
20706
20749
  GemmaPreTrainedModel: () => GemmaPreTrainedModel,
20707
20750
  GlmForCausalLM: () => GlmForCausalLM,
20708
20751
  GlmModel: () => GlmModel,
20752
+ GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
20753
+ GlmMoeDsaModel: () => GlmMoeDsaModel,
20754
+ GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
20755
+ GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
20709
20756
  GlmPreTrainedModel: () => GlmPreTrainedModel,
20710
20757
  GptOssForCausalLM: () => GptOssForCausalLM,
20711
20758
  GptOssModel: () => GptOssModel,
@@ -20752,6 +20799,7 @@ __export(models_exports, {
20752
20799
  Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
20753
20800
  Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
20754
20801
  Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
20802
+ LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
20755
20803
  LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
20756
20804
  Llama4ForCausalLM: () => Llama4ForCausalLM,
20757
20805
  Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -20801,6 +20849,9 @@ __export(models_exports, {
20801
20849
  MimiEncoderOutput: () => MimiEncoderOutput,
20802
20850
  MimiModel: () => MimiModel,
20803
20851
  MimiPreTrainedModel: () => MimiPreTrainedModel,
20852
+ Mistral4ForCausalLM: () => Mistral4ForCausalLM,
20853
+ Mistral4Model: () => Mistral4Model,
20854
+ Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
20804
20855
  MistralForCausalLM: () => MistralForCausalLM,
20805
20856
  MistralModel: () => MistralModel,
20806
20857
  MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -20858,6 +20909,9 @@ __export(models_exports, {
20858
20909
  NanoChatForCausalLM: () => NanoChatForCausalLM,
20859
20910
  NanoChatModel: () => NanoChatModel,
20860
20911
  NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
20912
+ NemotronHForCausalLM: () => NemotronHForCausalLM,
20913
+ NemotronHModel: () => NemotronHModel,
20914
+ NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
20861
20915
  NeoBertForMaskedLM: () => NeoBertForMaskedLM,
20862
20916
  NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
20863
20917
  NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -20995,6 +21049,9 @@ __export(models_exports, {
20995
21049
  SnacEncoderModel: () => SnacEncoderModel,
20996
21050
  SnacModel: () => SnacModel,
20997
21051
  SnacPreTrainedModel: () => SnacPreTrainedModel,
21052
+ SolarOpenForCausalLM: () => SolarOpenForCausalLM,
21053
+ SolarOpenModel: () => SolarOpenModel,
21054
+ SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
20998
21055
  SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
20999
21056
  SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
21000
21057
  SpeechT5HifiGan: () => SpeechT5HifiGan,
@@ -21169,7 +21226,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
21169
21226
  var ArceeForCausalLM = class extends ArceePreTrainedModel {
21170
21227
  };
21171
21228
 
21172
- // src/models/ast/modeling_ast.js
21229
+ // src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
21173
21230
  var ASTPreTrainedModel = class extends PreTrainedModel {
21174
21231
  };
21175
21232
  var ASTModel = class extends ASTPreTrainedModel {
@@ -21504,6 +21561,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
21504
21561
  var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
21505
21562
  };
21506
21563
 
21564
+ // src/models/chmv2/modeling_chmv2.js
21565
/**
 * Common base for the CHMv2 classes. Declares no members of its own, so every
 * behavior comes from `PreTrainedModel`.
 */
var CHMv2PreTrainedModel = class extends PreTrainedModel {};

/**
 * CHMv2 variant exposed under the depth-estimation name. Adds nothing on top of
 * `CHMv2PreTrainedModel` in this module.
 */
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {};
21569
+
21507
21570
  // src/models/clap/modeling_clap.js
21508
21571
  var ClapPreTrainedModel = class extends PreTrainedModel {
21509
21572
  };
@@ -21842,6 +21905,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
21842
21905
  }
21843
21906
  };
21844
21907
 
21908
+ // src/models/deepseek_v3/modeling_deepseek_v3.js
21909
/**
 * Common base for the DeepseekV3 classes. Declares no members of its own, so
 * every behavior comes from `PreTrainedModel`.
 */
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {};

/** Bare DeepseekV3 model class; inherits everything from `DeepseekV3PreTrainedModel`. */
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {};

/** DeepseekV3 class exposed under the causal-LM name; inherits everything from `DeepseekV3PreTrainedModel`. */
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {};
21915
+
21845
21916
  // src/models/deberta_v2/modeling_deberta_v2.js
21846
21917
  var DebertaV2PreTrainedModel = class extends PreTrainedModel {
21847
21918
  };
@@ -22190,6 +22261,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
22190
22261
  }
22191
22262
  };
22192
22263
 
22264
+ // src/models/eurobert/modeling_eurobert.js
22265
/**
 * Common base for the EuroBert classes. Declares no members of its own, so
 * every behavior comes from `PreTrainedModel`.
 */
var EuroBertPreTrainedModel = class extends PreTrainedModel {};

/** Bare EuroBert model class; inherits everything from `EuroBertPreTrainedModel`. */
var EuroBertModel = class extends EuroBertPreTrainedModel {};

/** EuroBert class whose `_call` result is wrapped in a `MaskedLMOutput`. */
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps whatever it resolves to.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<MaskedLMOutput>} The inherited call's result wrapped as a `MaskedLMOutput`.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new MaskedLMOutput(outputs);
  }
};

/** EuroBert class whose `_call` result is wrapped in a `SequenceClassifierOutput`. */
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps whatever it resolves to.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<SequenceClassifierOutput>} The inherited call's result wrapped as a `SequenceClassifierOutput`.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new SequenceClassifierOutput(outputs);
  }
};

/** EuroBert class whose `_call` result is wrapped in a `TokenClassifierOutput`. */
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps whatever it resolves to.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<TokenClassifierOutput>} The inherited call's result wrapped as a `TokenClassifierOutput`.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new TokenClassifierOutput(outputs);
  }
};
22302
+
22193
22303
  // src/models/exaone/modeling_exaone.js
22194
22304
  var ExaonePreTrainedModel = class extends PreTrainedModel {
22195
22305
  };
@@ -22465,18 +22575,389 @@ var GlmModel = class extends GlmPreTrainedModel {
22465
22575
  var GlmForCausalLM = class extends GlmPreTrainedModel {
22466
22576
  };
22467
22577
 
22468
- // src/models/glpn/modeling_glpn.js
22469
- var GLPNPreTrainedModel = class extends PreTrainedModel {
22578
+ // src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
22579
/**
 * Common base for the GlmMoeDsa classes. Declares no members of its own, so
 * every behavior comes from `PreTrainedModel`.
 */
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {};

/** Bare GlmMoeDsa model class; inherits everything from `GlmMoeDsaPreTrainedModel`. */
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {};

/** GlmMoeDsa class exposed under the causal-LM name; inherits everything from `GlmMoeDsaPreTrainedModel`. */
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {};
22475
22585
 
22476
- // src/models/gpt_bigcode/modeling_gpt_bigcode.js
22477
- var GPTBigCodePreTrainedModel = class extends PreTrainedModel {
22478
- };
22479
- var GPTBigCodeModel = class extends GPTBigCodePreTrainedModel {
22586
+ // src/models/qwen2_vl/modeling_qwen2_vl.js
22587
/**
 * Common base for the Qwen2-VL classes. The only thing it adds on top of
 * `PreTrainedModel` is the `forward_params` field below.
 */
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
  /**
   * Input names declared for this model family, text inputs first and vision
   * inputs second. How the base class consumes this list is not visible here.
   */
  forward_params = [
    // Text inputs
    "input_ids", "attention_mask", "position_ids", "past_key_values",
    // Vision inputs
    "pixel_values", "image_grid_thw"
  ];
};
22599
/**
 * Qwen2-VL class that computes the 3-axis (temporal / height / width) rope
 * position IDs, runs the vision encoder session, and prepares inputs for each
 * generation step.
 */
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
  // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
  // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
  // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.

  /** Key under which `image_grid_thw` is handed to the vision encoder session in `encode_image`. */
  image_grid_thw_name = "grid_thw";

  /**
   * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
   * @param {Tensor} input_ids
   * @param {Tensor} attention_mask
   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
   */
  _get_text_only_rope_index(input_ids, attention_mask) {
    if (attention_mask) {
      // NOTE(review): `cumsum_masked_fill` is defined elsewhere; it is assumed to return
      // flat per-token positions (`data`) together with the mask's `[batch, seq]` dims.
      const { data, dims } = cumsum_masked_fill(attention_mask);
      const [batch_size, seq_length] = dims;
      // The same (batch, seq) positions are repeated once per rope axis.
      const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
      // delta = (largest position in the row + 1) - padded sequence length.
      // This mirrors the multimodal branch of `get_rope_index`, which also subtracts
      // the sequence length; adding it would shift every decoded position by 2 * seq_length.
      const mrope_position_deltas = Array.from(
        { length: batch_size },
        (_, b) => max(data.subarray(seq_length * b, seq_length * (b + 1)))[0] + 1n - BigInt(seq_length)
      );
      return [
        new Tensor2("int64", position_ids, [3, ...dims]),
        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
      ];
    }

    const [batch_size, seq_length] = input_ids.dims;
    // The output is laid out as [3, batch, seq] in row-major order, so the position of
    // flat element `i` within its sequence is simply `i % seq_length`, for any batch size.
    const position_ids = BigInt64Array.from(
      { length: 3 * batch_size * seq_length },
      (_, i) => BigInt(i % seq_length)
    );
    return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
  }

  /**
   * Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
   * global [all_t, all_h, all_w] order, then write back into the position_ids array
   * respecting attention mask.
   * @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
   * @param {number[]} attn_mask Attention mask for this batch element
   * @param {number[][][]} position_ids_list [3][batch][seq] output array to write into (mutated in place)
   * @param {number} batch_idx Current batch index
   * @returns {number[]} Flat reordered positions of length total_len
   */
  _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
    const total_len = llm_pos_ids_list.reduce((acc, segment) => acc + segment.length, 0);
    const llm_positions = new Array(total_len);

    // Gather axis by axis: first every segment's t-slice, then every h-slice, then every w-slice.
    let write_idx = 0;
    for (let axis = 0; axis < 3; ++axis) {
      for (const segment of llm_pos_ids_list) {
        const seg_len = segment.length / 3;
        for (let z = axis * seg_len; z < (axis + 1) * seg_len; ++z) {
          llm_positions[write_idx++] = segment[z];
        }
      }
    }

    // Scatter into the unmasked slots only; masked slots keep their initial value.
    const axis_len = total_len / 3;
    let unmasked_seen = 0;
    for (let y = 0; y < attn_mask.length; ++y) {
      // Loose equality on purpose: the mask entries may be BigInt (e.g. 1n) or number.
      if (attn_mask[y] == 1) {
        for (let axis = 0; axis < 3; ++axis) {
          position_ids_list[axis][batch_idx][y] = llm_positions[axis * axis_len + unmasked_seen];
        }
        ++unmasked_seen;
      }
    }
    return llm_positions;
  }

  /**
   * Build per-batch position ID segments for multimodal rope.
   * Override this in subclasses to change how vision/text segments are identified and positioned.
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id, video_token_id, vision_start_token_id } = this.config;
    const ids = filtered_ids;

    // Token comparisons below use loose equality on purpose: the IDs in `ids` may be
    // BigInt while the config IDs are compared as given.
    const vision_start_indices = ids.reduce((acc, x, idx) => {
      if (x == vision_start_token_id) acc.push(idx);
      return acc;
    }, []);
    // The token right after each vision-start marker tells whether the segment is an image or a video.
    const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
    const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
    const video_nums = vision_tokens.filter((x) => x == video_token_id).length;

    const llm_pos_ids_list = [];
    let st = 0;
    let remain_images = image_nums;
    let remain_videos = video_nums;
    for (let j = 0; j < vision_tokens.length; ++j) {
      const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id);
      const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id);
      // `ids.length + 1` acts as a "not present" sentinel that always loses the comparison below.
      const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
      const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;

      let ed;
      let t, h, w;
      if (ed_image < ed_video) {
        [t, h, w] = image_grid_thw_list[state.image_index];
        ++state.image_index;
        --remain_images;
        ed = ed_image;
      } else {
        [t, h, w] = video_grid_thw_list[state.video_index];
        ++state.video_index;
        --remain_videos;
        ed = ed_video;
      }

      // Grid entries are passed through Number() before arithmetic; the spatial axes
      // are divided by the merge size.
      const [llm_grid_t, llm_grid_h, llm_grid_w] = [
        Number(t),
        Math.floor(Number(h) / spatial_merge_size),
        Math.floor(Number(w) / spatial_merge_size)
      ];

      // Text that precedes this vision segment: identical 1D positions on all three axes.
      const text_len = ed - st;
      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));

      // Vision segment: per-axis grid coordinates offset by the end of the preceding text.
      const offset = text_len + st_idx;
      const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
      const t_index = Array.from(
        { length: grid_size },
        (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
      );
      const h_index = Array.from(
        { length: grid_size },
        (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
      );
      const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
      llm_pos_ids_list.push([t_index, h_index, w_index].flat());
      st = ed + grid_size;
    }

    // Trailing text after the last vision segment (or the whole sequence when there is none).
    if (st < ids.length) {
      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
      const text_len = ids.length - st;
      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
    }
    return llm_pos_ids_list;
  }

  /**
   * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
   *
   * Explanation:
   * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
   *
   * For a pure text embedding sequence, the rotary position embedding is no different from that of modern LLMs.
   * Examples:
   * input_ids: [T T T T T], here T is for text.
   * temporal position_ids: [0, 1, 2, 3, 4]
   * height position_ids: [0, 1, 2, 3, 4]
   * width position_ids: [0, 1, 2, 3, 4]
   *
   * For a vision and text embedding sequence, we calculate 3D rotary position embedding for the vision part
   * and 1D rotary position embedding for the text part.
   * Examples:
   * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
   * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
   * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
   * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
   * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
   * text temporal position_ids: [3, 4, 5, 6, 7]
   * text height position_ids: [3, 4, 5, 6, 7]
   * text width position_ids: [3, 4, 5, 6, 7]
   * Here we calculate the text start position_ids as the max vision position_ids plus 1.
   *
   * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
   * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
   * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
   * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
   */
  get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
    const { vision_config } = this.config;
    const spatial_merge_size = vision_config.spatial_merge_size ?? 2;

    // No vision grids at all: plain 1D positions replicated on the three axes.
    if (!image_grid_thw && !video_grid_thw) {
      return this._get_text_only_rope_index(input_ids, attention_mask);
    }

    const total_input_ids = input_ids.tolist();
    const mask = attention_mask ? attention_mask : ones_like(input_ids);
    const attention_mask_list = mask.tolist();
    const [batch_size, seq_length] = input_ids.dims;

    // [3][batch][seq] scratch array, zero-filled; only unmasked slots get overwritten.
    const position_ids_list = Array.from(
      { length: 3 },
      () => Array.from({ length: batch_size }, () => Array.from({ length: seq_length }, () => 0))
    );
    const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
    const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
    // Shared across batch rows so that each row consumes the next unused image/video grid.
    const state = { image_index: 0, video_index: 0 };
    const mrope_position_deltas = [];

    for (let i = 0; i < total_input_ids.length; ++i) {
      const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
      const llm_pos_ids_list = this._get_multimodal_rope_positions({
        filtered_ids,
        image_grid_thw_list,
        video_grid_thw_list,
        spatial_merge_size,
        state
      });
      const llm_positions = this._reorder_and_write_positions(
        llm_pos_ids_list,
        attention_mask_list[i],
        position_ids_list,
        i
      );
      mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
    }

    return [
      new Tensor2("int64", position_ids_list.flat(Infinity), [3, batch_size, seq_length]),
      new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
    ];
  }

  /**
   * Run the `vision_encoder` session and return its `image_features` output.
   * @param {object} inputs
   * @param {Tensor} inputs.pixel_values Passed to the session under the `pixel_values` key.
   * @param {Tensor} inputs.image_grid_thw Passed to the session under the key named by `this.image_grid_thw_name`.
   * @returns {Promise<any>} The `image_features` entry of the session output.
   */
  async encode_image({ pixel_values, image_grid_thw }) {
    const outputs = await sessionRun(this.sessions["vision_encoder"], {
      pixel_values,
      [this.image_grid_thw_name]: image_grid_thw
    });
    return outputs.image_features;
  }

  /**
   * Delegate to `default_merge_input_ids_with_image_features`, supplying this model's
   * `image_token_id`. Keys in `kwargs` are spread last, so they take precedence.
   * @param {object} kwargs Arguments forwarded to the default merge helper.
   * @returns {any} Whatever the default merge helper returns.
   */
  _merge_input_ids_with_image_features(kwargs) {
    return default_merge_input_ids_with_image_features({
      // @ts-ignore
      image_token_id: this.config.image_token_id,
      ...kwargs
    });
  }

  /**
   * Fill in `position_ids` / `rope_deltas` on `model_inputs` for the next forward pass.
   * Nothing is changed unless an attention mask is present and `position_ids` is not set yet.
   * @param {any} input_ids Unused here; kept for signature compatibility.
   * @param {object} model_inputs Inputs for the upcoming forward pass (mutated in place).
   * @param {any} generation_config Unused here; kept for signature compatibility.
   * @returns {object} The same `model_inputs` object.
   */
  prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
    if (!model_inputs.attention_mask || model_inputs.position_ids) {
      return model_inputs;
    }

    const compute_rope_index = () => this.get_rope_index(
      model_inputs.input_ids,
      model_inputs.image_grid_thw,
      model_inputs.video_grid_thw,
      model_inputs.attention_mask
    );

    // First step (no cache yet): positions for the full prompt.
    if (!model_inputs.past_key_values) {
      [model_inputs.position_ids, model_inputs.rope_deltas] = compute_rope_index();
      return model_inputs;
    }

    // A cache exists, so pixel values are cleared before the next forward pass.
    model_inputs.pixel_values = null;
    const past_length = model_inputs.past_key_values.get_seq_length();

    if (past_length < model_inputs.input_ids.dims[1]) {
      // The cache covers only a prefix of `input_ids`: compute positions for the whole
      // sequence, then keep just the part that is not cached yet.
      const [full_position_ids, rope_deltas] = compute_rope_index();
      model_inputs.rope_deltas = rope_deltas;
      model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
      model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
    } else {
      if (!model_inputs.rope_deltas) {
        [, model_inputs.rope_deltas] = compute_rope_index();
      }
      // Next position = cached length + per-row rope delta, replicated on the three axes.
      const delta = BigInt(past_length);
      const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
      model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
    }
    return model_inputs;
  }
};
22870
/**
 * Qwen2-VL class exposed under the causal-LM name. Adds no members of its own;
 * everything is inherited from `Qwen2VLForConditionalGeneration`.
 */
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {};
22872
+
22873
+ // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
22874
/**
 * Qwen2.5-VL conditional-generation class. Differs from its Qwen2-VL parent only
 * in the value of `image_grid_thw_name`.
 */
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
  /** Overrides the inherited field value. */
  image_grid_thw_name = "image_grid_thw";
};

/**
 * Qwen2.5-VL class exposed under the causal-LM name. Differs from
 * `Qwen2VLForCausalLM` only in the value of `image_grid_thw_name`.
 */
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
  /** Overrides the inherited field value. */
  image_grid_thw_name = "image_grid_thw";
};
22880
+
22881
+ // src/models/glm_ocr/modeling_glm_ocr.js
22882
/**
 * GLM-OCR conditional-generation class. Reuses the Qwen2.5-VL machinery and
 * replaces only the way multimodal rope position segments are built.
 */
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
  /**
   * Compute 3D positional indices for vision tokens.
   * Temporal is constant, height is repeat-interleaved, width tiles.
   * @param {number} start_position
   * @param {number[]} grid_thw [T, H, W]
   * @param {number} temp_merge_size
   * @param {number} spatial_merge_size
   * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
   */
  get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
    const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
    const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
    const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
    const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;

    // Temporal axis: every vision token shares the starting position.
    const t_pos = Array.from({ length: seq_len }, () => start_position);
    // Height axis: each row index is held for `llm_grid_w * llm_grid_t` consecutive tokens.
    const h_pos = Array.from(
      { length: seq_len },
      (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
    );
    // Width axis: column indices 0..llm_grid_w-1 repeated over and over.
    const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
    return [...t_pos, ...h_pos, ...w_pos];
  }

  /**
   * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
   * instead of vision_start_token_id scanning used by Qwen2VL.
   * After a vision segment, position advances by floor(max(h, w) / spatial_merge_size).
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - accepted for signature parity with the parent; not read here
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id } = this.config;

    // Loose equality on purpose: the IDs in `filtered_ids` may be BigInt.
    const type_of = (token) => token == image_token_id ? 1 : 0;

    // Split the sequence into maximal runs of equal type: [type, start, end), 1 = image, 0 = text.
    const groups = [];
    let group_start = 0;
    let current_type = type_of(filtered_ids[0]);
    for (let j = 1; j <= filtered_ids.length; ++j) {
      // -1 is an end-of-sequence sentinel that forces the final run to be flushed.
      const t = j < filtered_ids.length ? type_of(filtered_ids[j]) : -1;
      if (t !== current_type) {
        groups.push([current_type, group_start, j]);
        group_start = j;
        current_type = t;
      }
    }

    let current_pos = 0;
    const llm_pos_ids_list = [];
    for (const [modality_type, start_idx, end_idx] of groups) {
      if (modality_type === 0) {
        // Text run: identical 1D positions on all three axes.
        const text_len = end_idx - start_idx;
        llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
        current_pos += text_len;
      } else {
        // Image run: consume the next unused image grid.
        const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
        // Dividing T by itself collapses the temporal axis to a single step.
        const temp_merge_size = grid_thw[0];
        llm_pos_ids_list.push(
          this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
        );
        // Floor the advance so positions stay integral (they end up in an int64 tensor),
        // matching the flooring applied to the grid sizes in `get_vision_position_ids`.
        current_pos += Math.floor(Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size);
      }
    }
    return llm_pos_ids_list;
  }
};
22948
+
22949
+ // src/models/glpn/modeling_glpn.js
22950
/**
 * Common base for the GLPN classes. Declares no members of its own, so every
 * behavior comes from `PreTrainedModel`.
 */
var GLPNPreTrainedModel = class extends PreTrainedModel {};

/** Bare GLPN model class; inherits everything from `GLPNPreTrainedModel`. */
var GLPNModel = class extends GLPNPreTrainedModel {};

/** GLPN class exposed under the depth-estimation name; inherits everything from `GLPNPreTrainedModel`. */
var GLPNForDepthEstimation = class extends GLPNPreTrainedModel {};
22955
+ };
22956
+
22957
+ // src/models/gpt_bigcode/modeling_gpt_bigcode.js
22958
/**
 * Common base for the GPTBigCode classes. Declares no members of its own, so
 * every behavior comes from `PreTrainedModel`.
 */
var GPTBigCodePreTrainedModel = class extends PreTrainedModel {};

/** Bare GPTBigCode model class; inherits everything from `GPTBigCodePreTrainedModel`. */
var GPTBigCodeModel = class extends GPTBigCodePreTrainedModel {};

/** GPTBigCode class exposed under the causal-LM name; inherits everything from `GPTBigCodePreTrainedModel`. */
var GPTBigCodeForCausalLM = class extends GPTBigCodePreTrainedModel {};
@@ -22777,6 +23258,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
22777
23258
  var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
22778
23259
  };
22779
23260
 
23261
+ // src/models/lighton_ocr/modeling_lighton_ocr.js
23262
/**
 * LightOnOCR conditional-generation class. Adds no members of its own;
 * everything is inherited from `LlavaForConditionalGeneration`.
 */
var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {};
23264
+
22780
23265
  // src/models/lfm2_moe/modeling_lfm2_moe.js
22781
23266
  var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
22782
23267
  };
@@ -22973,6 +23458,14 @@ var MistralModel = class extends MistralPreTrainedModel {
22973
23458
  var MistralForCausalLM = class extends MistralPreTrainedModel {
22974
23459
  };
22975
23460
 
23461
+ // src/models/mistral4/modeling_mistral4.js
23462
/**
 * Common base for the Mistral4 classes. Declares no members of its own, so
 * every behavior comes from `PreTrainedModel`.
 */
var Mistral4PreTrainedModel = class extends PreTrainedModel {};

/** Bare Mistral4 model class; inherits everything from `Mistral4PreTrainedModel`. */
var Mistral4Model = class extends Mistral4PreTrainedModel {};

/** Mistral4 class exposed under the causal-LM name; inherits everything from `Mistral4PreTrainedModel`. */
var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {};
23468
+
22976
23469
  // src/models/mobilebert/modeling_mobilebert.js
22977
23470
  var MobileBertPreTrainedModel = class extends PreTrainedModel {
22978
23471
  };
@@ -23441,6 +23934,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
23441
23934
  var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
23442
23935
  };
23443
23936
 
23937
+ // src/models/nemotron_h/modeling_nemotron_h.js
23938
/**
 * Common base for the NemotronH classes. Declares no members of its own, so
 * every behavior comes from `PreTrainedModel`.
 */
var NemotronHPreTrainedModel = class extends PreTrainedModel {};

/** Bare NemotronH model class; inherits everything from `NemotronHPreTrainedModel`. */
var NemotronHModel = class extends NemotronHPreTrainedModel {};

/** NemotronH class exposed under the causal-LM name; inherits everything from `NemotronHPreTrainedModel`. */
var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {};
23944
+
23444
23945
  // src/models/neobert/modeling_neobert.js
23445
23946
  var NeoBertPreTrainedModel = class extends PreTrainedModel {
23446
23947
  };
@@ -23721,252 +24222,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
23721
24222
  var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
23722
24223
  };
23723
24224
 
23724
- // src/models/qwen2_vl/modeling_qwen2_vl.js
23725
- var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
23726
- forward_params = [
23727
- // Text inputs
23728
- "input_ids",
23729
- "attention_mask",
23730
- "position_ids",
23731
- "past_key_values",
23732
- // Vision inputs
23733
- "pixel_values",
23734
- "image_grid_thw"
23735
- ];
23736
- };
23737
- var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
23738
- // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
23739
- // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
23740
- // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
23741
- image_grid_thw_name = "grid_thw";
23742
- /**
23743
- * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
23744
- *
23745
- * Explanation:
23746
- * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
23747
- *
23748
- * For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
23749
- * Examples:
23750
- * input_ids: [T T T T T], here T is for text.
23751
- * temporal position_ids: [0, 1, 2, 3, 4]
23752
- * height position_ids: [0, 1, 2, 3, 4]
23753
- * width position_ids: [0, 1, 2, 3, 4]
23754
- *
23755
- * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
23756
- * and 1D rotary position embeddin for text part.
23757
- * Examples:
23758
- * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
23759
- * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
23760
- * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
23761
- * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
23762
- * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
23763
- * text temporal position_ids: [3, 4, 5, 6, 7]
23764
- * text height position_ids: [3, 4, 5, 6, 7]
23765
- * text width position_ids: [3, 4, 5, 6, 7]
23766
- * Here we calculate the text start position_ids as the max vision position_ids plus 1.
23767
- *
23768
- * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
23769
- * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
23770
- * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
23771
- * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
23772
- * - 1 for tokens that are **not masked**,
23773
- * - 0 for tokens that are **masked**.
23774
- * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
23775
- * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
23776
- * - mrope_position_deltas: Tensor of shape `(batch_size)`.
23777
- */
23778
- get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
23779
- const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
23780
- const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
23781
- const mrope_position_deltas = [];
23782
- if (image_grid_thw || video_grid_thw) {
23783
- let total_input_ids = input_ids.tolist();
23784
- if (!attention_mask) {
23785
- attention_mask = ones_like(input_ids);
23786
- }
23787
- const attention_mask_list = attention_mask.tolist();
23788
- const position_ids_list = Array.from(
23789
- { length: 3 },
23790
- (_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
23791
- );
23792
- const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
23793
- const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
23794
- let image_index = 0;
23795
- let video_index = 0;
23796
- for (let i = 0; i < total_input_ids.length; ++i) {
23797
- const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
23798
- const vision_start_indices = ids.reduce((acc, x, idx) => {
23799
- if (x == vision_start_token_id) acc.push(idx);
23800
- return acc;
23801
- }, []);
23802
- const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
23803
- const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
23804
- const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
23805
- let llm_pos_ids_list = [];
23806
- let st = 0;
23807
- let remain_images = image_nums;
23808
- let remain_videos = video_nums;
23809
- for (let j = 0; j < vision_tokens.length; ++j) {
23810
- const next_image_token = ids.findIndex((x, i2) => i2 > st && x == image_token_id);
23811
- const next_video_token = ids.findIndex((x, i2) => i2 > st && x == video_token_id);
23812
- const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
23813
- const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
23814
- let ed;
23815
- let t, h, w;
23816
- if (ed_image < ed_video) {
23817
- [t, h, w] = image_grid_thw_list[image_index];
23818
- ++image_index;
23819
- --remain_images;
23820
- ed = ed_image;
23821
- } else {
23822
- [t, h, w] = video_grid_thw_list[video_index];
23823
- ++video_index;
23824
- --remain_videos;
23825
- ed = ed_video;
23826
- }
23827
- const [llm_grid_t, llm_grid_h, llm_grid_w] = [
23828
- Number(t),
23829
- Math.floor(Number(h) / spatial_merge_size),
23830
- Math.floor(Number(w) / spatial_merge_size)
23831
- ];
23832
- const text_len = ed - st;
23833
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
23834
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
23835
- const offset = text_len + st_idx;
23836
- const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
23837
- const t_index = Array.from(
23838
- { length: grid_size },
23839
- (_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
23840
- );
23841
- const h_index = Array.from(
23842
- { length: grid_size },
23843
- (_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
23844
- );
23845
- const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
23846
- llm_pos_ids_list.push([t_index, h_index, w_index].flat());
23847
- st = ed + grid_size;
23848
- }
23849
- if (st < ids.length) {
23850
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
23851
- const text_len = ids.length - st;
23852
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
23853
- }
23854
- const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
23855
- const llm_positions = new Array(num_items);
23856
- let index = 0;
23857
- for (let x = 0; x < 3; ++x) {
23858
- for (let y = 0; y < llm_pos_ids_list.length; ++y) {
23859
- const val = llm_pos_ids_list[y];
23860
- const text_len = val.length / 3;
23861
- for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
23862
- llm_positions[index++] = val[z];
23863
- }
23864
- }
23865
- }
23866
- let count2 = 0;
23867
- const attn_mask = attention_mask_list[i];
23868
- for (let y = 0; y < attn_mask.length; ++y) {
23869
- if (attn_mask[y] == 1) {
23870
- for (let x = 0; x < 3; ++x) {
23871
- position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
23872
- }
23873
- ++count2;
23874
- }
23875
- }
23876
- const max_llm_positions = max(llm_positions)[0];
23877
- mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
23878
- }
23879
- return [
23880
- new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
23881
- new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
23882
- ];
23883
- } else {
23884
- if (attention_mask) {
23885
- const { data, dims } = cumsum_masked_fill(attention_mask);
23886
- const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
23887
- const mrope_position_deltas2 = Array.from(
23888
- { length: dims[0] },
23889
- (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
23890
- );
23891
- return [
23892
- new Tensor2("int64", position_ids, [3, ...dims]),
23893
- new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
23894
- ];
23895
- } else {
23896
- const [batch_size, seq_length] = input_ids.dims;
23897
- const position_ids = BigInt64Array.from(
23898
- { length: 3 * batch_size * seq_length },
23899
- (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
23900
- );
23901
- return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
23902
- }
23903
- }
23904
- }
23905
- async encode_image({ pixel_values, image_grid_thw }) {
23906
- const features = (await sessionRun(this.sessions["vision_encoder"], {
23907
- pixel_values,
23908
- [this.image_grid_thw_name]: image_grid_thw
23909
- })).image_features;
23910
- return features;
23911
- }
23912
- _merge_input_ids_with_image_features(kwargs) {
23913
- return default_merge_input_ids_with_image_features({
23914
- // @ts-ignore
23915
- image_token_id: this.config.image_token_id,
23916
- ...kwargs
23917
- });
23918
- }
23919
- prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
23920
- if (model_inputs.attention_mask && !model_inputs.position_ids) {
23921
- if (!model_inputs.past_key_values) {
23922
- [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
23923
- model_inputs.input_ids,
23924
- model_inputs.image_grid_thw,
23925
- model_inputs.video_grid_thw,
23926
- model_inputs.attention_mask
23927
- );
23928
- } else {
23929
- model_inputs.pixel_values = null;
23930
- const past_length = model_inputs.past_key_values.get_seq_length();
23931
- if (past_length < model_inputs.input_ids.dims[1]) {
23932
- const [full_position_ids, rope_deltas] = this.get_rope_index(
23933
- model_inputs.input_ids,
23934
- model_inputs.image_grid_thw,
23935
- model_inputs.video_grid_thw,
23936
- model_inputs.attention_mask
23937
- );
23938
- model_inputs.rope_deltas = rope_deltas;
23939
- model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
23940
- model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
23941
- } else {
23942
- if (!model_inputs.rope_deltas) {
23943
- [, model_inputs.rope_deltas] = this.get_rope_index(
23944
- model_inputs.input_ids,
23945
- model_inputs.image_grid_thw,
23946
- model_inputs.video_grid_thw,
23947
- model_inputs.attention_mask
23948
- );
23949
- }
23950
- const delta = BigInt(past_length);
23951
- const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
23952
- model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
23953
- }
23954
- }
23955
- }
23956
- return model_inputs;
23957
- }
23958
- };
23959
- var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
23960
- };
23961
-
23962
- // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
23963
- var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
23964
- image_grid_thw_name = "image_grid_thw";
23965
- };
23966
- var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
23967
- image_grid_thw_name = "image_grid_thw";
23968
- };
23969
-
23970
24225
  // src/models/qwen3/modeling_qwen3.js
23971
24226
  var Qwen3PreTrainedModel = class extends PreTrainedModel {
23972
24227
  };
@@ -24412,6 +24667,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
24412
24667
  }
24413
24668
  };
24414
24669
 
24670
+ // src/models/solar_open/modeling_solar_open.js
24671
+ var SolarOpenPreTrainedModel = class extends PreTrainedModel {
24672
+ };
24673
+ var SolarOpenModel = class extends SolarOpenPreTrainedModel {
24674
+ };
24675
+ var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
24676
+ };
24677
+
24415
24678
  // src/models/speecht5/modeling_speecht5.js
24416
24679
  var SpeechT5PreTrainedModel = class extends PreTrainedModel {
24417
24680
  };
@@ -25528,6 +25791,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
25528
25791
  // src/models/registry.js
25529
25792
  var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
25530
25793
  ["bert", "BertModel"],
25794
+ ["eurobert", "EuroBertModel"],
25531
25795
  ["neobert", "NeoBertModel"],
25532
25796
  ["modernbert", "ModernBertModel"],
25533
25797
  ["nomic_bert", "NomicBertModel"],
@@ -25659,6 +25923,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
25659
25923
  ["gemma3_text", "Gemma3Model"],
25660
25924
  ["helium", "HeliumModel"],
25661
25925
  ["glm", "GlmModel"],
25926
+ ["glm_moe_dsa", "GlmMoeDsaModel"],
25662
25927
  ["openelm", "OpenELMModel"],
25663
25928
  ["qwen2", "Qwen2Model"],
25664
25929
  ["qwen2_moe", "Qwen2MoeModel"],
@@ -25670,12 +25935,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
25670
25935
  ["mpt", "MptModel"],
25671
25936
  ["opt", "OPTModel"],
25672
25937
  ["mistral", "MistralModel"],
25938
+ ["mistral4", "Mistral4Model"],
25673
25939
  ["ministral", "MinistralModel"],
25674
25940
  ["ministral3", "Ministral3Model"],
25675
25941
  ["ernie4_5", "Ernie4_5ForCausalLM"],
25676
25942
  ["starcoder2", "Starcoder2Model"],
25943
+ ["deepseek_v3", "DeepseekV3Model"],
25677
25944
  ["falcon", "FalconModel"],
25678
25945
  ["falcon_h1", "FalconH1Model"],
25946
+ ["nemotron_h", "NemotronHModel"],
25947
+ ["solar_open", "SolarOpenModel"],
25679
25948
  ["stablelm", "StableLmModel"],
25680
25949
  ["modernbert-decoder", "ModernBertDecoderModel"],
25681
25950
  ["hunyuan_v1_dense", "HunYuanDenseV1Model"],
@@ -25695,6 +25964,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
25695
25964
  ]);
25696
25965
  var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
25697
25966
  ["bert", "BertForSequenceClassification"],
25967
+ ["eurobert", "EuroBertForSequenceClassification"],
25698
25968
  ["neobert", "NeoBertForSequenceClassification"],
25699
25969
  ["modernbert", "ModernBertForSequenceClassification"],
25700
25970
  ["roformer", "RoFormerForSequenceClassification"],
@@ -25717,6 +25987,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
25717
25987
  ]);
25718
25988
  var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
25719
25989
  ["bert", "BertForTokenClassification"],
25990
+ ["eurobert", "EuroBertForTokenClassification"],
25720
25991
  ["neobert", "NeoBertForTokenClassification"],
25721
25992
  ["modernbert", "ModernBertForTokenClassification"],
25722
25993
  ["roformer", "RoFormerForTokenClassification"],
@@ -25779,6 +26050,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
25779
26050
  ["gemma3", "Gemma3ForCausalLM"],
25780
26051
  ["helium", "HeliumForCausalLM"],
25781
26052
  ["glm", "GlmForCausalLM"],
26053
+ ["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
25782
26054
  ["openelm", "OpenELMForCausalLM"],
25783
26055
  ["qwen2", "Qwen2ForCausalLM"],
25784
26056
  ["qwen2_moe", "Qwen2MoeForCausalLM"],
@@ -25798,13 +26070,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
25798
26070
  ["opt", "OPTForCausalLM"],
25799
26071
  ["mbart", "MBartForCausalLM"],
25800
26072
  ["mistral", "MistralForCausalLM"],
26073
+ ["mistral4", "Mistral4ForCausalLM"],
25801
26074
  ["ministral", "MinistralForCausalLM"],
25802
26075
  ["ministral3", "Ministral3ForCausalLM"],
25803
26076
  ["ernie4_5", "Ernie4_5ForCausalLM"],
25804
26077
  ["starcoder2", "Starcoder2ForCausalLM"],
26078
+ ["deepseek_v3", "DeepseekV3ForCausalLM"],
25805
26079
  ["falcon", "FalconForCausalLM"],
25806
26080
  ["falcon_h1", "FalconH1ForCausalLM"],
26081
+ ["nemotron_h", "NemotronHForCausalLM"],
25807
26082
  ["trocr", "TrOCRForCausalLM"],
26083
+ ["solar_open", "SolarOpenForCausalLM"],
25808
26084
  ["stablelm", "StableLmForCausalLM"],
25809
26085
  ["modernbert-decoder", "ModernBertDecoderForCausalLM"],
25810
26086
  ["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
@@ -25815,6 +26091,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
25815
26091
  var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
25816
26092
  var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
25817
26093
  ["bert", "BertForMaskedLM"],
26094
+ ["eurobert", "EuroBertForMaskedLM"],
25818
26095
  ["neobert", "NeoBertForMaskedLM"],
25819
26096
  ["modernbert", "ModernBertForMaskedLM"],
25820
26097
  ["roformer", "RoFormerForMaskedLM"],
@@ -25873,7 +26150,9 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
25873
26150
  ["paligemma", "PaliGemmaForConditionalGeneration"],
25874
26151
  ["llava_qwen2", "LlavaQwen2ForCausalLM"],
25875
26152
  ["gemma3n", "Gemma3nForConditionalGeneration"],
25876
- ["mistral3", "Mistral3ForConditionalGeneration"]
26153
+ ["mistral3", "Mistral3ForConditionalGeneration"],
26154
+ ["lighton_ocr", "LightOnOcrForConditionalGeneration"],
26155
+ ["glm_ocr", "GlmOcrForConditionalGeneration"]
25877
26156
  ]);
25878
26157
  var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
25879
26158
  ["granite_speech", "GraniteSpeechForConditionalGeneration"],
@@ -25978,6 +26257,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
25978
26257
  ]);
25979
26258
  var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
25980
26259
  var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
26260
+ ["chmv2", "CHMv2ForDepthEstimation"],
25981
26261
  ["dpt", "DPTForDepthEstimation"],
25982
26262
  ["depth_anything", "DepthAnythingForDepthEstimation"],
25983
26263
  ["glpn", "GLPNForDepthEstimation"],
@@ -28716,6 +28996,9 @@ export {
28716
28996
  BloomModel,
28717
28997
  BloomPreTrainedModel,
28718
28998
  BloomTokenizer,
28999
+ CHMv2ForDepthEstimation,
29000
+ CHMv2ImageProcessor,
29001
+ CHMv2PreTrainedModel,
28719
29002
  CLIPFeatureExtractor,
28720
29003
  CLIPImageProcessor,
28721
29004
  CLIPModel,
@@ -28811,6 +29094,9 @@ export {
28811
29094
  DebertaV2Tokenizer,
28812
29095
  DecisionTransformerModel,
28813
29096
  DecisionTransformerPreTrainedModel,
29097
+ DeepseekV3ForCausalLM,
29098
+ DeepseekV3Model,
29099
+ DeepseekV3PreTrainedModel,
28814
29100
  DeiTFeatureExtractor,
28815
29101
  DeiTForImageClassification,
28816
29102
  DeiTImageProcessor,
@@ -28871,6 +29157,11 @@ export {
28871
29157
  EsmModel,
28872
29158
  EsmPreTrainedModel,
28873
29159
  EsmTokenizer,
29160
+ EuroBertForMaskedLM,
29161
+ EuroBertForSequenceClassification,
29162
+ EuroBertForTokenClassification,
29163
+ EuroBertModel,
29164
+ EuroBertPreTrainedModel,
28874
29165
  ExaoneForCausalLM,
28875
29166
  ExaoneModel,
28876
29167
  ExaonePreTrainedModel,
@@ -28928,8 +29219,14 @@ export {
28928
29219
  GemmaModel,
28929
29220
  GemmaPreTrainedModel,
28930
29221
  GemmaTokenizer,
29222
+ Glm46VImageProcessor,
29223
+ Glm46VProcessor,
28931
29224
  GlmForCausalLM,
28932
29225
  GlmModel,
29226
+ GlmMoeDsaForCausalLM,
29227
+ GlmMoeDsaModel,
29228
+ GlmMoeDsaPreTrainedModel,
29229
+ GlmOcrForConditionalGeneration,
28933
29230
  GlmPreTrainedModel,
28934
29231
  GptOssForCausalLM,
28935
29232
  GptOssModel,
@@ -28995,6 +29292,7 @@ export {
28995
29292
  Lfm2VlForConditionalGeneration,
28996
29293
  Lfm2VlImageProcessor,
28997
29294
  Lfm2VlProcessor,
29295
+ LightOnOcrForConditionalGeneration,
28998
29296
  LiteWhisperForConditionalGeneration,
28999
29297
  Llama4ForCausalLM,
29000
29298
  Llama4PreTrainedModel,
@@ -29064,6 +29362,9 @@ export {
29064
29362
  MimiPreTrainedModel,
29065
29363
  MinLengthLogitsProcessor,
29066
29364
  MinNewTokensLengthLogitsProcessor,
29365
+ Mistral4ForCausalLM,
29366
+ Mistral4Model,
29367
+ Mistral4PreTrainedModel,
29067
29368
  MistralForCausalLM,
29068
29369
  MistralModel,
29069
29370
  MistralPreTrainedModel,
@@ -29135,6 +29436,9 @@ export {
29135
29436
  NanoChatForCausalLM,
29136
29437
  NanoChatModel,
29137
29438
  NanoChatPreTrainedModel,
29439
+ NemotronHForCausalLM,
29440
+ NemotronHModel,
29441
+ NemotronHPreTrainedModel,
29138
29442
  NeoBertForMaskedLM,
29139
29443
  NeoBertForQuestionAnswering,
29140
29444
  NeoBertForSequenceClassification,
@@ -29324,6 +29628,9 @@ export {
29324
29628
  SnacFeatureExtractor,
29325
29629
  SnacModel,
29326
29630
  SnacPreTrainedModel,
29631
+ SolarOpenForCausalLM,
29632
+ SolarOpenModel,
29633
+ SolarOpenPreTrainedModel,
29327
29634
  SpeechT5FeatureExtractor,
29328
29635
  SpeechT5ForSpeechToText,
29329
29636
  SpeechT5ForTextToSpeech,