@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. package/README.md +13 -2
  2. package/dist/transformers.js +689 -382
  3. package/dist/transformers.min.js +19 -19
  4. package/dist/transformers.node.cjs +716 -382
  5. package/dist/transformers.node.min.cjs +19 -19
  6. package/dist/transformers.node.min.mjs +19 -19
  7. package/dist/transformers.node.mjs +689 -382
  8. package/dist/transformers.web.js +697 -390
  9. package/dist/transformers.web.min.js +17 -17
  10. package/package.json +2 -2
  11. package/src/configs.js +28 -22
  12. package/src/env.js +1 -1
  13. package/src/image_processors_utils.js +25 -15
  14. package/src/models/chmv2/image_processing_chmv2.js +3 -0
  15. package/src/models/chmv2/modeling_chmv2.js +4 -0
  16. package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
  17. package/src/models/eurobert/modeling_eurobert.js +41 -0
  18. package/src/models/glm46v/image_processing_glm46v.js +12 -0
  19. package/src/models/glm46v/processing_glm46v.js +5 -0
  20. package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
  21. package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
  22. package/src/models/image_processors.js +2 -0
  23. package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
  24. package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
  25. package/src/models/mistral4/modeling_mistral4.js +5 -0
  26. package/src/models/modeling_utils.js +2 -0
  27. package/src/models/models.js +10 -1
  28. package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
  29. package/src/models/processors.js +1 -0
  30. package/src/models/qwen2_vl/modeling_qwen2_vl.js +187 -136
  31. package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
  32. package/src/models/registry.js +17 -0
  33. package/src/models/solar_open/modeling_solar_open.js +5 -0
  34. package/src/pipelines.js +1 -0
  35. package/src/utils/hub.js +4 -1
  36. package/src/utils/model_registry/get_file_metadata.js +1 -0
  37. package/types/configs.d.ts.map +1 -1
  38. package/types/image_processors_utils.d.ts +3 -2
  39. package/types/image_processors_utils.d.ts.map +1 -1
  40. package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
  41. package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
  42. package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
  43. package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
  44. package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
  45. package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
  46. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
  47. package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
  48. package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
  49. package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
  50. package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
  51. package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
  52. package/types/models/glm46v/processing_glm46v.d.ts +4 -0
  53. package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
  54. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
  55. package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
  56. package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
  57. package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
  58. package/types/models/image_processors.d.ts +2 -0
  59. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
  60. package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
  61. package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
  62. package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
  63. package/types/models/modeling_utils.d.ts.map +1 -1
  64. package/types/models/models.d.ts +10 -1
  65. package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
  66. package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
  67. package/types/models/processors.d.ts +1 -0
  68. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
  69. package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
  70. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
  71. package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
  72. package/types/models/registry.d.ts.map +1 -1
  73. package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
  74. package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
  75. package/types/pipelines.d.ts +1 -0
  76. package/types/pipelines.d.ts.map +1 -1
  77. package/types/utils/hub.d.ts.map +1 -1
  78. package/types/models/ast/modeling_ast.d.ts.map +0 -1
  79. /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
@@ -14,7 +14,7 @@ var __export = (target, all) => {
14
14
  import fs from "fs";
15
15
  import path from "path";
16
16
  import url from "url";
17
- var VERSION = "4.0.0-next.7";
17
+ var VERSION = "4.0.0-next.8";
18
18
  var HAS_SELF = typeof self !== "undefined";
19
19
  var IS_FS_AVAILABLE = !isEmpty(fs);
20
20
  var IS_PATH_AVAILABLE = !isEmpty(path);
@@ -244,7 +244,7 @@ var logger = {
244
244
  }
245
245
  };
246
246
 
247
- // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.2/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
247
+ // ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
248
248
  var DictionarySplitter = class {
249
249
  /**
250
250
  * @param dictionary The dictionary of words to use for splitting.
@@ -1900,10 +1900,10 @@ var BPE = class extends TokenizerModel_default {
1900
1900
  );
1901
1901
  if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
1902
1902
  output_tokens.push(...byte_tokens);
1903
- } else {
1903
+ } else if (this.unk_token != null) {
1904
1904
  output_tokens.push(this.unk_token);
1905
1905
  }
1906
- } else {
1906
+ } else if (this.unk_token != null) {
1907
1907
  output_tokens.push(this.unk_token);
1908
1908
  }
1909
1909
  }
@@ -6514,13 +6514,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
6514
6514
  wrapped_progress
6515
6515
  );
6516
6516
  } else if (typeof response !== "string") {
6517
+ const headers = new Headers(response.headers);
6518
+ headers.set("content-length", result.byteLength.toString());
6517
6519
  await cache2.put(
6518
6520
  cacheKey,
6519
6521
  new Response(
6520
6522
  /** @type {any} */
6521
6523
  result,
6522
6524
  {
6523
- headers: response.headers
6525
+ headers
6524
6526
  }
6525
6527
  )
6526
6528
  ).catch((err) => {
@@ -15730,6 +15732,7 @@ __export(processors_exports, {
15730
15732
  ChatterboxProcessor: () => ChatterboxProcessor,
15731
15733
  Florence2Processor: () => Florence2Processor,
15732
15734
  Gemma3nProcessor: () => Gemma3nProcessor,
15735
+ Glm46VProcessor: () => Glm46VProcessor,
15733
15736
  GraniteSpeechProcessor: () => GraniteSpeechProcessor,
15734
15737
  GroundingDinoProcessor: () => GroundingDinoProcessor,
15735
15738
  Idefics3Processor: () => Idefics3Processor,
@@ -18234,26 +18237,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
18234
18237
  }
18235
18238
  return [segmentation, segments];
18236
18239
  }
18237
- function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
18240
+ function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
18238
18241
  if (height < factor || width < factor) {
18239
- throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
18240
- } else if (Math.max(height, width) / Math.min(height, width) > 200) {
18242
+ const scale = Math.max(factor / height, factor / width);
18243
+ height = Math.round(height * scale);
18244
+ width = Math.round(width * scale);
18245
+ }
18246
+ if (Math.max(height, width) / Math.min(height, width) > 200) {
18241
18247
  throw new Error(
18242
18248
  `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
18243
18249
  );
18244
18250
  }
18245
18251
  let h_bar = Math.round(height / factor) * factor;
18246
18252
  let w_bar = Math.round(width / factor) * factor;
18247
- if (h_bar * w_bar > max_pixels) {
18248
- const beta = Math.sqrt(height * width / max_pixels);
18249
- h_bar = Math.floor(height / beta / factor) * factor;
18250
- w_bar = Math.floor(width / beta / factor) * factor;
18251
- } else if (h_bar * w_bar < min_pixels) {
18252
- const beta = Math.sqrt(min_pixels / (height * width));
18253
+ if (temporal_factor * h_bar * w_bar > max_pixels) {
18254
+ const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
18255
+ h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
18256
+ w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
18257
+ } else if (temporal_factor * h_bar * w_bar < min_pixels) {
18258
+ const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
18253
18259
  h_bar = Math.ceil(height * beta / factor) * factor;
18254
18260
  w_bar = Math.ceil(width * beta / factor) * factor;
18255
18261
  }
18256
- return [h_bar, w_bar];
18262
+ return [w_bar, h_bar];
18257
18263
  }
18258
18264
  function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
18259
18265
  if (label_ids_to_fuse === null) {
@@ -18332,7 +18338,7 @@ var ImageProcessor = class extends Callable2 {
18332
18338
  this.do_pad = config.do_pad;
18333
18339
  this.min_pixels = config.min_pixels;
18334
18340
  this.max_pixels = config.max_pixels;
18335
- if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
18341
+ if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
18336
18342
  this.pad_size = this.size;
18337
18343
  }
18338
18344
  this.do_flip_channel_order = config.do_flip_channel_order ?? false;
@@ -18620,10 +18626,8 @@ var ImageProcessor = class extends Callable2 {
18620
18626
  const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
18621
18627
  [pixelData, imgDims] = padded;
18622
18628
  } else if (this.size_divisibility) {
18623
- const [paddedWidth, paddedHeight] = enforce_size_divisibility(
18624
- [imgDims[1], imgDims[0]],
18625
- this.size_divisibility
18626
- );
18629
+ const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
18630
+ const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
18627
18631
  [pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
18628
18632
  }
18629
18633
  }
@@ -18700,6 +18704,7 @@ var image_processors_exports = {};
18700
18704
  __export(image_processors_exports, {
18701
18705
  BeitFeatureExtractor: () => BeitFeatureExtractor,
18702
18706
  BitImageProcessor: () => BitImageProcessor,
18707
+ CHMv2ImageProcessor: () => CHMv2ImageProcessor,
18703
18708
  CLIPFeatureExtractor: () => CLIPFeatureExtractor,
18704
18709
  CLIPImageProcessor: () => CLIPImageProcessor,
18705
18710
  ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
@@ -18716,6 +18721,7 @@ __export(image_processors_exports, {
18716
18721
  DonutImageProcessor: () => DonutImageProcessor,
18717
18722
  EfficientNetImageProcessor: () => EfficientNetImageProcessor,
18718
18723
  GLPNFeatureExtractor: () => GLPNFeatureExtractor,
18724
+ Glm46VImageProcessor: () => Glm46VImageProcessor,
18719
18725
  GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
18720
18726
  Idefics3ImageProcessor: () => Idefics3ImageProcessor,
18721
18727
  ImageFeatureExtractor: () => ImageProcessor,
@@ -18776,6 +18782,10 @@ var BitImageProcessor = class extends ImageProcessor {
18776
18782
  var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
18777
18783
  };
18778
18784
 
18785
+ // src/models/chmv2/image_processing_chmv2.js
18786
+ var CHMv2ImageProcessor = class extends ImageProcessor {
18787
+ };
18788
+
18779
18789
  // src/models/clip/image_processing_clip.js
18780
18790
  var CLIPImageProcessor = class extends ImageProcessor {
18781
18791
  };
@@ -18895,6 +18905,65 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
18895
18905
  }
18896
18906
  };
18897
18907
 
18908
+ // src/models/qwen2_vl/image_processing_qwen2_vl.js
18909
+ var Qwen2VLImageProcessor = class extends ImageProcessor {
18910
+ constructor(config) {
18911
+ super(config);
18912
+ this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
18913
+ this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
18914
+ this.patch_size = config.patch_size;
18915
+ this.merge_size = config.merge_size;
18916
+ }
18917
+ /** @type {ImageProcessor['get_resize_output_image_size']} */
18918
+ get_resize_output_image_size(image, size) {
18919
+ const factor = this.patch_size * this.merge_size;
18920
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
18921
+ }
18922
+ async _call(images, ...args) {
18923
+ const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
18924
+ let patches = pixel_values;
18925
+ const { temporal_patch_size, merge_size, patch_size } = this.config;
18926
+ if (patches.dims[0] === 1) {
18927
+ patches = cat(
18928
+ Array.from({ length: temporal_patch_size }, () => patches),
18929
+ 0
18930
+ );
18931
+ }
18932
+ const grid_t = patches.dims[0] / temporal_patch_size;
18933
+ const channel = patches.dims[1];
18934
+ const grid_h = Math.floor(patches.dims[2] / patch_size);
18935
+ const grid_w = Math.floor(patches.dims[3] / patch_size);
18936
+ const flatten_patches = patches.view(
18937
+ grid_t,
18938
+ temporal_patch_size,
18939
+ channel,
18940
+ Math.floor(grid_h / merge_size),
18941
+ merge_size,
18942
+ patch_size,
18943
+ Math.floor(grid_w / merge_size),
18944
+ merge_size,
18945
+ patch_size
18946
+ ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
18947
+ const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
18948
+ return {
18949
+ pixel_values: flatten_patches,
18950
+ image_grid_thw,
18951
+ original_sizes,
18952
+ reshaped_input_sizes
18953
+ };
18954
+ }
18955
+ };
18956
+
18957
+ // src/models/glm46v/image_processing_glm46v.js
18958
+ var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
18959
+ /** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
18960
+ get_resize_output_image_size(image, size) {
18961
+ const factor = this.patch_size * this.merge_size;
18962
+ const temporal_factor = this.config.temporal_patch_size ?? 2;
18963
+ return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
18964
+ }
18965
+ };
18966
+
18898
18967
  // src/models/glpn/image_processing_glpn.js
18899
18968
  var GLPNFeatureExtractor = class extends ImageProcessor {
18900
18969
  };
@@ -19288,7 +19357,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
19288
19357
  const img = pixel_values.unsqueeze_(0);
19289
19358
  const total_factor = this.encoder_patch_size * this.downsample_factor;
19290
19359
  const f2 = total_factor ** 2;
19291
- const [new_height, new_width] = smart_resize(
19360
+ const [new_width, new_height] = smart_resize(
19292
19361
  Math.max(total_factor, height),
19293
19362
  Math.max(total_factor, width),
19294
19363
  total_factor,
@@ -19578,55 +19647,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
19578
19647
  var PvtImageProcessor = class extends ImageProcessor {
19579
19648
  };
19580
19649
 
19581
- // src/models/qwen2_vl/image_processing_qwen2_vl.js
19582
- var Qwen2VLImageProcessor = class extends ImageProcessor {
19583
- constructor(config) {
19584
- super(config);
19585
- this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
19586
- this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
19587
- this.patch_size = config.patch_size;
19588
- this.merge_size = config.merge_size;
19589
- }
19590
- /** @type {ImageProcessor['get_resize_output_image_size']} */
19591
- get_resize_output_image_size(image, size) {
19592
- const factor = this.patch_size * this.merge_size;
19593
- return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
19594
- }
19595
- async _call(images, ...args) {
19596
- const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
19597
- let patches = pixel_values;
19598
- const { temporal_patch_size, merge_size, patch_size } = this.config;
19599
- if (patches.dims[0] === 1) {
19600
- patches = cat(
19601
- Array.from({ length: temporal_patch_size }, () => patches),
19602
- 0
19603
- );
19604
- }
19605
- const grid_t = patches.dims[0] / temporal_patch_size;
19606
- const channel = patches.dims[1];
19607
- const grid_h = Math.floor(patches.dims[2] / patch_size);
19608
- const grid_w = Math.floor(patches.dims[3] / patch_size);
19609
- const flatten_patches = patches.view(
19610
- grid_t,
19611
- temporal_patch_size,
19612
- channel,
19613
- Math.floor(grid_h / merge_size),
19614
- merge_size,
19615
- patch_size,
19616
- Math.floor(grid_w / merge_size),
19617
- merge_size,
19618
- patch_size
19619
- ).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
19620
- const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
19621
- return {
19622
- pixel_values: flatten_patches,
19623
- image_grid_thw,
19624
- original_sizes,
19625
- reshaped_input_sizes
19626
- };
19627
- }
19628
- };
19629
-
19630
19650
  // src/models/rt_detr/image_processing_rt_detr.js
19631
19651
  var RTDetrImageProcessor = class extends ImageProcessor {
19632
19652
  /** @type {typeof post_process_object_detection} */
@@ -20180,6 +20200,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
20180
20200
  }
20181
20201
  };
20182
20202
 
20203
+ // src/models/qwen2_vl/processing_qwen2_vl.js
20204
+ var Qwen2VLProcessor = class extends Processor {
20205
+ static image_processor_class = AutoImageProcessor;
20206
+ static tokenizer_class = AutoTokenizer;
20207
+ static image_token = "<|image_pad|>";
20208
+ /**
20209
+ *
20210
+ * @param {string|string[]} text
20211
+ * @param {RawImage|RawImage[]} images
20212
+ * @param {...any} args
20213
+ * @returns {Promise<any>}
20214
+ */
20215
+ async _call(text, images = null, ...args) {
20216
+ if (!Array.isArray(text)) {
20217
+ text = [text];
20218
+ }
20219
+ let image_inputs, image_grid_thw;
20220
+ if (images) {
20221
+ image_inputs = await this.image_processor(images);
20222
+ image_grid_thw = image_inputs.image_grid_thw;
20223
+ }
20224
+ if (image_grid_thw) {
20225
+ let merge_length = this.image_processor.config.merge_size ** 2;
20226
+ let index = 0;
20227
+ const image_token = (
20228
+ /** @type {typeof Qwen2VLProcessor} */
20229
+ this.constructor.image_token
20230
+ );
20231
+ const image_grid_thw_list = image_grid_thw.tolist();
20232
+ text = text.map((t) => {
20233
+ while (t.includes(image_token)) {
20234
+ const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
20235
+ t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
20236
+ }
20237
+ return t.replaceAll("<|placeholder|>", image_token);
20238
+ });
20239
+ }
20240
+ const text_inputs = this.tokenizer(text);
20241
+ return {
20242
+ ...text_inputs,
20243
+ ...image_inputs
20244
+ };
20245
+ }
20246
+ };
20247
+
20248
+ // src/models/glm46v/processing_glm46v.js
20249
+ var Glm46VProcessor = class extends Qwen2VLProcessor {
20250
+ static image_token = "<|image|>";
20251
+ };
20252
+
20183
20253
  // src/models/granite_speech/processing_granite_speech.js
20184
20254
  var GraniteSpeechProcessor = class extends Processor {
20185
20255
  static tokenizer_class = AutoTokenizer;
@@ -20910,47 +20980,6 @@ var PyAnnoteProcessor = class extends Processor {
20910
20980
  }
20911
20981
  };
20912
20982
 
20913
- // src/models/qwen2_vl/processing_qwen2_vl.js
20914
- var Qwen2VLProcessor = class extends Processor {
20915
- static image_processor_class = AutoImageProcessor;
20916
- static tokenizer_class = AutoTokenizer;
20917
- /**
20918
- *
20919
- * @param {string|string[]} text
20920
- * @param {RawImage|RawImage[]} images
20921
- * @param {...any} args
20922
- * @returns {Promise<any>}
20923
- */
20924
- async _call(text, images = null, ...args) {
20925
- if (!Array.isArray(text)) {
20926
- text = [text];
20927
- }
20928
- let image_inputs, image_grid_thw;
20929
- if (images) {
20930
- image_inputs = await this.image_processor(images);
20931
- image_grid_thw = image_inputs.image_grid_thw;
20932
- }
20933
- if (image_grid_thw) {
20934
- let merge_length = this.image_processor.config.merge_size ** 2;
20935
- let index = 0;
20936
- const image_grid_thw_list = image_grid_thw.tolist();
20937
- text = text.map((t) => {
20938
- while (t.includes("<|image_pad|>")) {
20939
- const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
20940
- t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
20941
- }
20942
- return t.replaceAll("<|placeholder|>", "<|image_pad|>");
20943
- });
20944
- }
20945
- const text_inputs = this.tokenizer(text);
20946
- return {
20947
- ...text_inputs,
20948
- ...image_inputs
20949
- // TODO: ...videos_inputs,
20950
- };
20951
- }
20952
- };
20953
-
20954
20983
  // src/models/qwen2_5_vl/processing_qwen2_5_vl.js
20955
20984
  var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
20956
20985
  };
@@ -21294,6 +21323,8 @@ function getNormalizedConfig(config) {
21294
21323
  case "gemma3n":
21295
21324
  case "lfm2_vl":
21296
21325
  case "chatterbox":
21326
+ case "lighton_ocr":
21327
+ case "glm_ocr":
21297
21328
  case "mistral3":
21298
21329
  case "qwen2_5_vl":
21299
21330
  case "qwen3_vl":
@@ -21369,6 +21400,8 @@ function getNormalizedConfig(config) {
21369
21400
  mapping["dim_kv"] = "head_dim";
21370
21401
  break;
21371
21402
  case "qwen3":
21403
+ case "solar_open":
21404
+ case "glm_ocr_text":
21372
21405
  case "gemma":
21373
21406
  case "gemma2":
21374
21407
  case "vaultgemma":
@@ -21379,6 +21412,7 @@ function getNormalizedConfig(config) {
21379
21412
  case "ernie4_5":
21380
21413
  case "hunyuan_v1_dense":
21381
21414
  case "falcon_h1":
21415
+ case "nemotron_h":
21382
21416
  case "ministral":
21383
21417
  case "ministral3":
21384
21418
  mapping["num_heads"] = "num_key_value_heads";
@@ -21413,6 +21447,9 @@ function getNormalizedConfig(config) {
21413
21447
  mapping["num_attention_heads"] = "num_attention_heads";
21414
21448
  break;
21415
21449
  case "youtu":
21450
+ case "deepseek_v3":
21451
+ case "glm_moe_dsa":
21452
+ case "mistral4":
21416
21453
  mapping["num_heads"] = "num_key_value_heads";
21417
21454
  mapping["num_layers"] = "num_hidden_layers";
21418
21455
  mapping["dim_kv"] = "qk_head_dim";
@@ -21501,6 +21538,7 @@ function getCacheShapes(config, options) {
21501
21538
  if (!(config instanceof PretrainedConfig)) {
21502
21539
  config = new PretrainedConfig(config);
21503
21540
  }
21541
+ const batch_size = options?.batch_size ?? 1;
21504
21542
  if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
21505
21543
  const pkv_prefix = options?.prefix ?? "past_key_values";
21506
21544
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
@@ -21510,7 +21548,6 @@ function getCacheShapes(config, options) {
21510
21548
  config
21511
21549
  );
21512
21550
  const head_dim = hidden_size / num_attention_heads;
21513
- const batch_size = options?.batch_size ?? 1;
21514
21551
  for (let i = 0; i < layer_types.length; ++i) {
21515
21552
  if (layer_types[i] === "full_attention") {
21516
21553
  for (const kv of ["key", "value"]) {
@@ -21523,31 +21560,26 @@ function getCacheShapes(config, options) {
21523
21560
  }
21524
21561
  }
21525
21562
  return cache_values;
21526
- } else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
21563
+ } else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
21527
21564
  const pkv_prefix = options?.prefix ?? "past_key_values";
21528
21565
  const conv_prefix = pkv_prefix === "present" ? "present" : "past";
21529
- const cache_values = {};
21530
- const {
21531
- layer_types,
21532
- num_hidden_layers,
21533
- num_attention_heads,
21534
- num_key_value_heads,
21535
- hidden_size,
21536
- mamba_d_conv,
21537
- mamba_n_heads,
21538
- mamba_d_head,
21539
- mamba_d_state,
21540
- mamba_n_groups,
21541
- mamba_expand,
21542
- mamba_d_ssm
21543
- } = (
21566
+ const c = (
21544
21567
  /** @type {any} */
21545
21568
  config
21546
21569
  );
21547
- const head_dim = hidden_size / num_attention_heads;
21548
- const batch_size = options?.batch_size ?? 1;
21549
- const conv_d_inner = (mamba_d_ssm ?? mamba_expand * hidden_size) + 2 * mamba_n_groups * mamba_d_state;
21550
- for (let i = 0; i < num_hidden_layers; ++i) {
21570
+ const layer_types = c.layer_types ?? c.layers_block_type;
21571
+ const num_layers = c.num_hidden_layers ?? layer_types?.length;
21572
+ const num_key_value_heads = c.num_key_value_heads;
21573
+ const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
21574
+ const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
21575
+ const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
21576
+ const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
21577
+ const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
21578
+ const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
21579
+ const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
21580
+ const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
21581
+ const cache_values = {};
21582
+ for (let i = 0; i < num_layers; ++i) {
21551
21583
  if (!layer_types || layer_types[i] === "mamba") {
21552
21584
  cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
21553
21585
  cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
@@ -21581,7 +21613,6 @@ function getCacheShapes(config, options) {
21581
21613
  const key_dim = linear_key_head_dim * linear_num_key_heads;
21582
21614
  const value_dim = linear_value_head_dim * linear_num_value_heads;
21583
21615
  const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
21584
- const batch_size = options?.batch_size ?? 1;
21585
21616
  for (let i = 0; i < layer_types.length; ++i) {
21586
21617
  if (layer_types[i] === "full_attention") {
21587
21618
  for (const kv of ["key", "value"]) {
@@ -24209,7 +24240,9 @@ async function generic_text_to_text_forward(self2, {
24209
24240
  "qwen3_5",
24210
24241
  "qwen3_5_text",
24211
24242
  "qwen3_5_moe",
24212
- "qwen3_5_moe_text"
24243
+ "qwen3_5_moe_text",
24244
+ "glm_ocr",
24245
+ "glm_ocr_text"
24213
24246
  ].includes(self2.config.model_type)
24214
24247
  ) {
24215
24248
  const { image_grid_thw, video_grid_thw } = kwargs;
@@ -24433,6 +24466,8 @@ __export(models_exports, {
24433
24466
  BloomForCausalLM: () => BloomForCausalLM,
24434
24467
  BloomModel: () => BloomModel,
24435
24468
  BloomPreTrainedModel: () => BloomPreTrainedModel,
24469
+ CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
24470
+ CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
24436
24471
  CLIPModel: () => CLIPModel,
24437
24472
  CLIPPreTrainedModel: () => CLIPPreTrainedModel,
24438
24473
  CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
@@ -24507,6 +24542,9 @@ __export(models_exports, {
24507
24542
  DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
24508
24543
  DecisionTransformerModel: () => DecisionTransformerModel,
24509
24544
  DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
24545
+ DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
24546
+ DeepseekV3Model: () => DeepseekV3Model,
24547
+ DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
24510
24548
  DeiTForImageClassification: () => DeiTForImageClassification,
24511
24549
  DeiTModel: () => DeiTModel,
24512
24550
  DeiTPreTrainedModel: () => DeiTPreTrainedModel,
@@ -24552,6 +24590,11 @@ __export(models_exports, {
24552
24590
  EsmForTokenClassification: () => EsmForTokenClassification,
24553
24591
  EsmModel: () => EsmModel,
24554
24592
  EsmPreTrainedModel: () => EsmPreTrainedModel,
24593
+ EuroBertForMaskedLM: () => EuroBertForMaskedLM,
24594
+ EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
24595
+ EuroBertForTokenClassification: () => EuroBertForTokenClassification,
24596
+ EuroBertModel: () => EuroBertModel,
24597
+ EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
24555
24598
  ExaoneForCausalLM: () => ExaoneForCausalLM,
24556
24599
  ExaoneModel: () => ExaoneModel,
24557
24600
  ExaonePreTrainedModel: () => ExaonePreTrainedModel,
@@ -24598,6 +24641,10 @@ __export(models_exports, {
24598
24641
  GemmaPreTrainedModel: () => GemmaPreTrainedModel,
24599
24642
  GlmForCausalLM: () => GlmForCausalLM,
24600
24643
  GlmModel: () => GlmModel,
24644
+ GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
24645
+ GlmMoeDsaModel: () => GlmMoeDsaModel,
24646
+ GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
24647
+ GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
24601
24648
  GlmPreTrainedModel: () => GlmPreTrainedModel,
24602
24649
  GptOssForCausalLM: () => GptOssForCausalLM,
24603
24650
  GptOssModel: () => GptOssModel,
@@ -24644,6 +24691,7 @@ __export(models_exports, {
24644
24691
  Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
24645
24692
  Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
24646
24693
  Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
24694
+ LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
24647
24695
  LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
24648
24696
  Llama4ForCausalLM: () => Llama4ForCausalLM,
24649
24697
  Llama4PreTrainedModel: () => Llama4PreTrainedModel,
@@ -24693,6 +24741,9 @@ __export(models_exports, {
24693
24741
  MimiEncoderOutput: () => MimiEncoderOutput,
24694
24742
  MimiModel: () => MimiModel,
24695
24743
  MimiPreTrainedModel: () => MimiPreTrainedModel,
24744
+ Mistral4ForCausalLM: () => Mistral4ForCausalLM,
24745
+ Mistral4Model: () => Mistral4Model,
24746
+ Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
24696
24747
  MistralForCausalLM: () => MistralForCausalLM,
24697
24748
  MistralModel: () => MistralModel,
24698
24749
  MistralPreTrainedModel: () => MistralPreTrainedModel,
@@ -24750,6 +24801,9 @@ __export(models_exports, {
24750
24801
  NanoChatForCausalLM: () => NanoChatForCausalLM,
24751
24802
  NanoChatModel: () => NanoChatModel,
24752
24803
  NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
24804
+ NemotronHForCausalLM: () => NemotronHForCausalLM,
24805
+ NemotronHModel: () => NemotronHModel,
24806
+ NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
24753
24807
  NeoBertForMaskedLM: () => NeoBertForMaskedLM,
24754
24808
  NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
24755
24809
  NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
@@ -24887,6 +24941,9 @@ __export(models_exports, {
24887
24941
  SnacEncoderModel: () => SnacEncoderModel,
24888
24942
  SnacModel: () => SnacModel,
24889
24943
  SnacPreTrainedModel: () => SnacPreTrainedModel,
24944
+ SolarOpenForCausalLM: () => SolarOpenForCausalLM,
24945
+ SolarOpenModel: () => SolarOpenModel,
24946
+ SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
24890
24947
  SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
24891
24948
  SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
24892
24949
  SpeechT5HifiGan: () => SpeechT5HifiGan,
@@ -25061,7 +25118,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
25061
25118
  var ArceeForCausalLM = class extends ArceePreTrainedModel {
25062
25119
  };
25063
25120
 
25064
- // src/models/ast/modeling_ast.js
25121
+ // src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
25065
25122
  var ASTPreTrainedModel = class extends PreTrainedModel {
25066
25123
  };
25067
25124
  var ASTModel = class extends ASTPreTrainedModel {
@@ -25396,6 +25453,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
25396
25453
  var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
25397
25454
  };
25398
25455
 
25456
+ // src/models/chmv2/modeling_chmv2.js
25457
// Common parent for the CHMv2 classes; declares no members beyond those of PreTrainedModel.
var CHMv2PreTrainedModel = class extends PreTrainedModel {};

// CHMv2 depth-estimation entry point; all behavior is inherited from CHMv2PreTrainedModel.
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {};
25461
+
25399
25462
  // src/models/clap/modeling_clap.js
25400
25463
  var ClapPreTrainedModel = class extends PreTrainedModel {
25401
25464
  };
@@ -25734,6 +25797,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
25734
25797
  }
25735
25798
  };
25736
25799
 
25800
+ // src/models/deepseek_v3/modeling_deepseek_v3.js
25801
// Common parent for the DeepseekV3 classes; declares no members beyond those of PreTrainedModel.
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {};

// Bare DeepseekV3 model; all behavior is inherited.
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {};

// DeepseekV3 causal-LM entry point; all behavior is inherited.
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {};
25807
+
25737
25808
  // src/models/deberta_v2/modeling_deberta_v2.js
25738
25809
  var DebertaV2PreTrainedModel = class extends PreTrainedModel {
25739
25810
  };
@@ -26082,6 +26153,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
26082
26153
  }
26083
26154
  };
26084
26155
 
26156
+ // src/models/eurobert/modeling_eurobert.js
26157
// Common parent for the EuroBert classes; declares no members beyond those of PreTrainedModel.
var EuroBertPreTrainedModel = class extends PreTrainedModel {};

// Bare EuroBert model; all behavior is inherited.
var EuroBertModel = class extends EuroBertPreTrainedModel {};
26161
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps its result for masked language modeling.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<MaskedLMOutput>} The inherited call's result wrapped in a `MaskedLMOutput`.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new MaskedLMOutput(outputs);
  }
};
26172
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps its result for sequence classification.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<SequenceClassifierOutput>} The inherited call's result wrapped in a `SequenceClassifierOutput`.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new SequenceClassifierOutput(outputs);
  }
};
26183
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps its result for token classification.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<TokenClassifierOutput>} The inherited call's result wrapped in a `TokenClassifierOutput`.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new TokenClassifierOutput(outputs);
  }
};
26194
+
26085
26195
  // src/models/exaone/modeling_exaone.js
26086
26196
  var ExaonePreTrainedModel = class extends PreTrainedModel {
26087
26197
  };
@@ -26357,6 +26467,377 @@ var GlmModel = class extends GlmPreTrainedModel {
26357
26467
  var GlmForCausalLM = class extends GlmPreTrainedModel {
26358
26468
  };
26359
26469
 
26470
+ // src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
26471
// Common parent for the GlmMoeDsa classes; declares no members beyond those of PreTrainedModel.
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {};

// Bare GlmMoeDsa model; all behavior is inherited.
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {};

// GlmMoeDsa causal-LM entry point; all behavior is inherited.
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {};
26477
+
26478
+ // src/models/qwen2_vl/modeling_qwen2_vl.js
26479
// Common parent for the Qwen2-VL classes. It only overrides `forward_params`,
// listing the text inputs first and the vision inputs after them.
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
  forward_params = [
    // Text inputs
    "input_ids", "attention_mask", "position_ids", "past_key_values",
    // Vision inputs
    "pixel_values", "image_grid_thw"
  ];
};
26491
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
  // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
  // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
  // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.

  // Name of the vision-encoder input that receives `image_grid_thw` (see `encode_image`).
  image_grid_thw_name = "grid_thw";

  /**
   * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
   *
   * With an attention mask, positions come from `cumsum_masked_fill(attention_mask)` and the
   * per-row delta is `max(position) + 1 - sequence_length`, the same formula the multimodal
   * branch of `get_rope_index` uses. Without a mask, every row is simply `0..seq_length-1`
   * and the deltas are zero.
   *
   * @param {Tensor} input_ids Tensor of shape `(batch_size, sequence_length)`.
   * @param {Tensor} attention_mask (Optional) Tensor of shape `(batch_size, sequence_length)`.
   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with shapes
   *   `(3, batch_size, sequence_length)` and `(batch_size, 1)`.
   */
  _get_text_only_rope_index(input_ids, attention_mask) {
    if (attention_mask) {
      const { data, dims } = cumsum_masked_fill(attention_mask);
      // Tile the (batch, seq) positions three times: one copy per rope axis.
      const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
      const seq_length = BigInt(dims[1]);
      const mrope_position_deltas = Array.from(
        { length: dims[0] },
        // delta = (max position + 1) - sequence length, so that `past_length + delta`
        // continues right after the last real position during decoding.
        (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n - seq_length
      );
      return [
        new Tensor2("int64", position_ids, [3, ...dims]),
        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
      ];
    } else {
      const [batch_size, seq_length] = input_ids.dims;
      // In the flattened (3, batch, seq) layout the sequence coordinate of element i is
      // i % seq_length, independent of the batch size.
      const position_ids = BigInt64Array.from(
        { length: 3 * batch_size * seq_length },
        (_, i) => BigInt(i % seq_length)
      );
      return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
    }
  }

  /**
   * Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
   * global [all_t, all_h, all_w] order, then write back into the position_ids array
   * respecting attention mask.
   *
   * @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
   * @param {number[]} attn_mask Attention mask for this batch element
   * @param {number[][][]} position_ids_list [3][batch][seq] output array to write into (mutated in place)
   * @param {number} batch_idx Current batch index
   * @returns {number[]} Flat reordered positions of length total_len
   */
  _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
    const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
    const llm_positions = new Array(total_len);
    let index = 0;
    // Gather axis by axis: first every segment's t-part, then every h-part, then every w-part.
    for (let x = 0; x < 3; ++x) {
      for (const val of llm_pos_ids_list) {
        const seg_len = val.length / 3;
        for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
          llm_positions[index++] = val[z];
        }
      }
    }
    // Only unmasked slots receive a position; masked slots keep their initial value.
    // Loose `==` is kept on purpose: it matches both number and BigInt mask entries.
    let count2 = 0;
    for (let y = 0; y < attn_mask.length; ++y) {
      if (attn_mask[y] == 1) {
        for (let x = 0; x < 3; ++x) {
          position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
        }
        ++count2;
      }
    }
    return llm_positions;
  }

  /**
   * Build per-batch position ID segments for multimodal rope.
   * Override this in subclasses to change how vision/text segments are identified and positioned.
   *
   * Vision segments are located by scanning for `vision_start_token_id` and looking at the
   * token that follows it; each one consumes the next entry of the image or video grid list.
   *
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id, video_token_id, vision_start_token_id } = this.config;
    const ids = filtered_ids;
    // NOTE(review): comparisons below use loose `==`, presumably because token ids coming from
    // `tolist()` may be BigInt while the config ids are numbers — do not switch to `===`/`indexOf`.
    const vision_start_indices = ids.reduce((acc, x, idx) => {
      if (x == vision_start_token_id) acc.push(idx);
      return acc;
    }, []);
    const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
    const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
    const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
    const llm_pos_ids_list = [];
    let st2 = 0;
    let remain_images = image_nums;
    let remain_videos = video_nums;
    for (let j = 0; j < vision_tokens.length; ++j) {
      const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
      const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
      // `ids.length + 1` acts as "not found / none remaining" so the other modality wins the comparison.
      const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
      const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
      let ed;
      let t, h, w;
      if (ed_image < ed_video) {
        [t, h, w] = image_grid_thw_list[state.image_index];
        ++state.image_index;
        --remain_images;
        ed = ed_image;
      } else {
        [t, h, w] = video_grid_thw_list[state.video_index];
        ++state.video_index;
        --remain_videos;
        ed = ed_video;
      }
      const [llm_grid_t, llm_grid_h, llm_grid_w] = [
        Number(t),
        Math.floor(Number(h) / spatial_merge_size),
        Math.floor(Number(w) / spatial_merge_size)
      ];
      // Text run preceding this vision segment: same 1D positions on all three axes.
      const text_len = ed - st2;
      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
      // Vision segment: 3D grid positions offset past the preceding text.
      const offset = text_len + st_idx;
      const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
      const t_index = Array.from(
        { length: grid_size },
        (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
      );
      const h_index = Array.from(
        { length: grid_size },
        (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
      );
      const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
      llm_pos_ids_list.push([t_index, h_index, w_index].flat());
      st2 = ed + grid_size;
    }
    // Trailing text after the last vision segment (or the whole sequence if there was none).
    if (st2 < ids.length) {
      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
      const text_len = ids.length - st2;
      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
    }
    return llm_pos_ids_list;
  }

  /**
   * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
   *
   * Explanation:
   * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
   *
   * For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
   * Examples:
   * input_ids: [T T T T T], here T is for text.
   * temporal position_ids: [0, 1, 2, 3, 4]
   * height position_ids: [0, 1, 2, 3, 4]
   * width position_ids: [0, 1, 2, 3, 4]
   *
   * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
   * and 1D rotary position embedding for text part.
   * Examples:
   * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
   * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
   * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
   * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
   * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
   * text temporal position_ids: [3, 4, 5, 6, 7]
   * text height position_ids: [3, 4, 5, 6, 7]
   * text width position_ids: [3, 4, 5, 6, 7]
   * Here we calculate the text start position_ids as the max vision position_ids plus 1.
   *
   * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
   * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
   * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
   * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
   */
  get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
    const { vision_config } = this.config;
    const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
    if (image_grid_thw || video_grid_thw) {
      const total_input_ids = input_ids.tolist();
      if (!attention_mask) {
        attention_mask = ones_like(input_ids);
      }
      const attention_mask_list = attention_mask.tolist();
      // [3][batch][seq] scratch array, zero-initialised; masked slots stay 0.
      const position_ids_list = Array.from(
        { length: 3 },
        () => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
      );
      const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
      const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
      // Grid counters are shared across batch elements: grids are consumed in order of appearance.
      const state = { image_index: 0, video_index: 0 };
      const mrope_position_deltas = [];
      for (let i = 0; i < total_input_ids.length; ++i) {
        const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
        const llm_pos_ids_list = this._get_multimodal_rope_positions({
          filtered_ids,
          image_grid_thw_list,
          video_grid_thw_list,
          spatial_merge_size,
          state
        });
        const llm_positions = this._reorder_and_write_positions(
          llm_pos_ids_list,
          attention_mask_list[i],
          position_ids_list,
          i
        );
        mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
      }
      return [
        new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
      ];
    } else {
      return this._get_text_only_rope_index(input_ids, attention_mask);
    }
  }

  /**
   * Run the `vision_encoder` session on the given pixels and grid.
   * The grid tensor is passed under the input name stored in `this.image_grid_thw_name`.
   *
   * @param {object} inputs
   * @param {Tensor} inputs.pixel_values
   * @param {Tensor} inputs.image_grid_thw
   * @returns {Promise<Tensor>} The `image_features` output of the vision encoder session.
   */
  async encode_image({ pixel_values, image_grid_thw }) {
    const features = (await sessionRun(this.sessions["vision_encoder"], {
      pixel_values,
      [this.image_grid_thw_name]: image_grid_thw
    })).image_features;
    return features;
  }

  /**
   * Delegate to `default_merge_input_ids_with_image_features`, supplying this model's
   * `image_token_id` as a default that `kwargs` may override.
   *
   * @param {object} kwargs Arguments forwarded to the default merge helper.
   */
  _merge_input_ids_with_image_features(kwargs) {
    return default_merge_input_ids_with_image_features({
      // @ts-ignore
      image_token_id: this.config.image_token_id,
      ...kwargs
    });
  }

  /**
   * Fill in `position_ids` (and `rope_deltas`) on `model_inputs` when an attention mask is
   * present and no position IDs were supplied. `model_inputs` is mutated and returned.
   *
   * - No cache yet: compute the full rope index.
   * - Cache shorter than the input: compute the full rope index, then keep only the part of
   *   `position_ids` / `input_ids` that comes after the cached prefix.
   * - Otherwise (decoding): positions are `past_length + rope_deltas` on all three axes.
   *
   * @param {*} input_ids Unused here; the IDs are read from `model_inputs.input_ids`.
   * @param {object} model_inputs
   * @param {*} generation_config Unused here.
   * @returns {object} The same `model_inputs` object.
   */
  prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
    if (model_inputs.attention_mask && !model_inputs.position_ids) {
      if (!model_inputs.past_key_values) {
        [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
          model_inputs.input_ids,
          model_inputs.image_grid_thw,
          model_inputs.video_grid_thw,
          model_inputs.attention_mask
        );
      } else {
        // A cache exists, so pixel values are cleared for this step.
        model_inputs.pixel_values = null;
        const past_length = model_inputs.past_key_values.get_seq_length();
        if (past_length < model_inputs.input_ids.dims[1]) {
          const [full_position_ids, rope_deltas] = this.get_rope_index(
            model_inputs.input_ids,
            model_inputs.image_grid_thw,
            model_inputs.video_grid_thw,
            model_inputs.attention_mask
          );
          model_inputs.rope_deltas = rope_deltas;
          model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
          model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
        } else {
          if (!model_inputs.rope_deltas) {
            [, model_inputs.rope_deltas] = this.get_rope_index(
              model_inputs.input_ids,
              model_inputs.image_grid_thw,
              model_inputs.video_grid_thw,
              model_inputs.attention_mask
            );
          }
          const delta = BigInt(past_length);
          const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
          model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
        }
      }
    }
    return model_inputs;
  }
};
26762
// Qwen2-VL causal-LM variant; adds no members and inherits everything from
// Qwen2VLForConditionalGeneration.
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {};
26764
+
26765
+ // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
26766
// Qwen2.5-VL differs from its Qwen2-VL parent only in the value of `image_grid_thw_name`,
// which is "image_grid_thw" here.
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
  image_grid_thw_name = "image_grid_thw";
};

// Causal-LM counterpart with the same `image_grid_thw_name` override.
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
  image_grid_thw_name = "image_grid_thw";
};
26772
+
26773
+ // src/models/glm_ocr/modeling_glm_ocr.js
26774
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
  /**
   * Compute 3D positional indices for vision tokens.
   * Temporal is constant, height is repeat-interleaved, width tiles.
   *
   * @param {number} start_position Position assigned to the first vision token.
   * @param {number[]} grid_thw [T, H, W]
   * @param {number} temp_merge_size Divisor applied to T.
   * @param {number} spatial_merge_size Divisor applied to H and W.
   * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
   */
  get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
    const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
    const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
    const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
    const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
    const t_pos = Array.from({ length: seq_len }, () => start_position);
    const h_pos = Array.from(
      { length: seq_len },
      (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
    );
    const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
    return [...t_pos, ...h_pos, ...w_pos];
  }

  /**
   * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
   * instead of vision_start_token_id scanning used by Qwen2VL.
   * After a vision segment, position advances by floor(max(h, w) / spatial_merge_size).
   *
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - accepted for signature parity; not read here
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id } = this.config;
    // Split the sequence into maximal runs of equal type: 1 = image token, 0 = anything else.
    // Each group is [type, start, end) over `filtered_ids`.
    const groups = [];
    let group_start = 0;
    let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
    for (let j = 1; j <= filtered_ids.length; ++j) {
      // -1 is a sentinel past the end that forces the final run to be flushed.
      const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
      if (t !== current_type) {
        groups.push([current_type, group_start, j]);
        group_start = j;
        current_type = t;
      }
    }
    let current_pos = 0;
    const llm_pos_ids_list = [];
    for (const [modality_type, start_idx, end_idx] of groups) {
      if (modality_type === 0) {
        // Text run: identical 1D positions on all three axes.
        const text_len = end_idx - start_idx;
        llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
        current_pos += text_len;
      } else {
        const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
        // Using T itself as the temporal merge size collapses the temporal grid to 1.
        const temp_merge_size = grid_thw[0];
        llm_pos_ids_list.push(
          this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
        );
        // Floor the advance so positions stay integral (they end up in an int64 tensor) and
        // agree with the floored grid sizes used in `get_vision_position_ids`.
        current_pos += Math.floor(Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size);
      }
    }
    return llm_pos_ids_list;
  }
};
26840
+
26360
26841
  // src/models/glpn/modeling_glpn.js
26361
26842
  var GLPNPreTrainedModel = class extends PreTrainedModel {
26362
26843
  };
@@ -26669,6 +27150,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
26669
27150
  var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
26670
27151
  };
26671
27152
 
27153
+ // src/models/lighton_ocr/modeling_lighton_ocr.js
27154
// LightOnOcr adds no members; all behavior is inherited from LlavaForConditionalGeneration.
var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {};
27156
+
26672
27157
  // src/models/lfm2_moe/modeling_lfm2_moe.js
26673
27158
  var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
26674
27159
  };
@@ -26865,6 +27350,14 @@ var MistralModel = class extends MistralPreTrainedModel {
26865
27350
  var MistralForCausalLM = class extends MistralPreTrainedModel {
26866
27351
  };
26867
27352
 
27353
+ // src/models/mistral4/modeling_mistral4.js
27354
// Common parent for the Mistral4 classes; declares no members beyond those of PreTrainedModel.
var Mistral4PreTrainedModel = class extends PreTrainedModel {};

// Bare Mistral4 model; all behavior is inherited.
var Mistral4Model = class extends Mistral4PreTrainedModel {};

// Mistral4 causal-LM entry point; all behavior is inherited.
var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {};
27360
+
26868
27361
  // src/models/mobilebert/modeling_mobilebert.js
26869
27362
  var MobileBertPreTrainedModel = class extends PreTrainedModel {
26870
27363
  };
@@ -27333,6 +27826,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
27333
27826
  var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
27334
27827
  };
27335
27828
 
27829
+ // src/models/nemotron_h/modeling_nemotron_h.js
27830
// Common parent for the NemotronH classes; declares no members beyond those of PreTrainedModel.
var NemotronHPreTrainedModel = class extends PreTrainedModel {};

// Bare NemotronH model; all behavior is inherited.
var NemotronHModel = class extends NemotronHPreTrainedModel {};

// NemotronH causal-LM entry point; all behavior is inherited.
var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {};
27836
+
27336
27837
  // src/models/neobert/modeling_neobert.js
27337
27838
  var NeoBertPreTrainedModel = class extends PreTrainedModel {
27338
27839
  };
@@ -27613,252 +28114,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
27613
28114
  var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
27614
28115
  };
27615
28116
 
27616
- // src/models/qwen2_vl/modeling_qwen2_vl.js
27617
- var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
27618
- forward_params = [
27619
- // Text inputs
27620
- "input_ids",
27621
- "attention_mask",
27622
- "position_ids",
27623
- "past_key_values",
27624
- // Vision inputs
27625
- "pixel_values",
27626
- "image_grid_thw"
27627
- ];
27628
- };
27629
- var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
27630
- // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
27631
- // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
27632
- // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
27633
- image_grid_thw_name = "grid_thw";
27634
- /**
27635
- * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
27636
- *
27637
- * Explanation:
27638
- * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
27639
- *
27640
- * For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
27641
- * Examples:
27642
- * input_ids: [T T T T T], here T is for text.
27643
- * temporal position_ids: [0, 1, 2, 3, 4]
27644
- * height position_ids: [0, 1, 2, 3, 4]
27645
- * width position_ids: [0, 1, 2, 3, 4]
27646
- *
27647
- * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
27648
- * and 1D rotary position embeddin for text part.
27649
- * Examples:
27650
- * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
27651
- * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
27652
- * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
27653
- * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
27654
- * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
27655
- * text temporal position_ids: [3, 4, 5, 6, 7]
27656
- * text height position_ids: [3, 4, 5, 6, 7]
27657
- * text width position_ids: [3, 4, 5, 6, 7]
27658
- * Here we calculate the text start position_ids as the max vision position_ids plus 1.
27659
- *
27660
- * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
27661
- * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
27662
- * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
27663
- * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
27664
- * - 1 for tokens that are **not masked**,
27665
- * - 0 for tokens that are **masked**.
27666
- * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
27667
- * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
27668
- * - mrope_position_deltas: Tensor of shape `(batch_size)`.
27669
- */
27670
- get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
27671
- const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
27672
- const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
27673
- const mrope_position_deltas = [];
27674
- if (image_grid_thw || video_grid_thw) {
27675
- let total_input_ids = input_ids.tolist();
27676
- if (!attention_mask) {
27677
- attention_mask = ones_like(input_ids);
27678
- }
27679
- const attention_mask_list = attention_mask.tolist();
27680
- const position_ids_list = Array.from(
27681
- { length: 3 },
27682
- (_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
27683
- );
27684
- const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
27685
- const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
27686
- let image_index = 0;
27687
- let video_index = 0;
27688
- for (let i = 0; i < total_input_ids.length; ++i) {
27689
- const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
27690
- const vision_start_indices = ids.reduce((acc, x, idx) => {
27691
- if (x == vision_start_token_id) acc.push(idx);
27692
- return acc;
27693
- }, []);
27694
- const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
27695
- const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
27696
- const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
27697
- let llm_pos_ids_list = [];
27698
- let st2 = 0;
27699
- let remain_images = image_nums;
27700
- let remain_videos = video_nums;
27701
- for (let j = 0; j < vision_tokens.length; ++j) {
27702
- const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
27703
- const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
27704
- const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
27705
- const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
27706
- let ed;
27707
- let t, h, w;
27708
- if (ed_image < ed_video) {
27709
- [t, h, w] = image_grid_thw_list[image_index];
27710
- ++image_index;
27711
- --remain_images;
27712
- ed = ed_image;
27713
- } else {
27714
- [t, h, w] = video_grid_thw_list[video_index];
27715
- ++video_index;
27716
- --remain_videos;
27717
- ed = ed_video;
27718
- }
27719
- const [llm_grid_t, llm_grid_h, llm_grid_w] = [
27720
- Number(t),
27721
- Math.floor(Number(h) / spatial_merge_size),
27722
- Math.floor(Number(w) / spatial_merge_size)
27723
- ];
27724
- const text_len = ed - st2;
27725
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
27726
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
27727
- const offset = text_len + st_idx;
27728
- const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
27729
- const t_index = Array.from(
27730
- { length: grid_size },
27731
- (_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
27732
- );
27733
- const h_index = Array.from(
27734
- { length: grid_size },
27735
- (_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
27736
- );
27737
- const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
27738
- llm_pos_ids_list.push([t_index, h_index, w_index].flat());
27739
- st2 = ed + grid_size;
27740
- }
27741
- if (st2 < ids.length) {
27742
- const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
27743
- const text_len = ids.length - st2;
27744
- llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
27745
- }
27746
- const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
27747
- const llm_positions = new Array(num_items);
27748
- let index = 0;
27749
- for (let x = 0; x < 3; ++x) {
27750
- for (let y = 0; y < llm_pos_ids_list.length; ++y) {
27751
- const val = llm_pos_ids_list[y];
27752
- const text_len = val.length / 3;
27753
- for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
27754
- llm_positions[index++] = val[z];
27755
- }
27756
- }
27757
- }
27758
- let count2 = 0;
27759
- const attn_mask = attention_mask_list[i];
27760
- for (let y = 0; y < attn_mask.length; ++y) {
27761
- if (attn_mask[y] == 1) {
27762
- for (let x = 0; x < 3; ++x) {
27763
- position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
27764
- }
27765
- ++count2;
27766
- }
27767
- }
27768
- const max_llm_positions = max(llm_positions)[0];
27769
- mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
27770
- }
27771
- return [
27772
- new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
27773
- new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
27774
- ];
27775
- } else {
27776
- if (attention_mask) {
27777
- const { data, dims } = cumsum_masked_fill(attention_mask);
27778
- const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
27779
- const mrope_position_deltas2 = Array.from(
27780
- { length: dims[0] },
27781
- (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
27782
- );
27783
- return [
27784
- new Tensor2("int64", position_ids, [3, ...dims]),
27785
- new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
27786
- ];
27787
- } else {
27788
- const [batch_size, seq_length] = input_ids.dims;
27789
- const position_ids = BigInt64Array.from(
27790
- { length: 3 * batch_size * seq_length },
27791
- (_, i) => BigInt(Math.floor(i % seq_length / batch_size))
27792
- );
27793
- return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
27794
- }
27795
- }
27796
- }
27797
- async encode_image({ pixel_values, image_grid_thw }) {
27798
- const features = (await sessionRun(this.sessions["vision_encoder"], {
27799
- pixel_values,
27800
- [this.image_grid_thw_name]: image_grid_thw
27801
- })).image_features;
27802
- return features;
27803
- }
27804
- _merge_input_ids_with_image_features(kwargs) {
27805
- return default_merge_input_ids_with_image_features({
27806
- // @ts-ignore
27807
- image_token_id: this.config.image_token_id,
27808
- ...kwargs
27809
- });
27810
- }
27811
- prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
27812
- if (model_inputs.attention_mask && !model_inputs.position_ids) {
27813
- if (!model_inputs.past_key_values) {
27814
- [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
27815
- model_inputs.input_ids,
27816
- model_inputs.image_grid_thw,
27817
- model_inputs.video_grid_thw,
27818
- model_inputs.attention_mask
27819
- );
27820
- } else {
27821
- model_inputs.pixel_values = null;
27822
- const past_length = model_inputs.past_key_values.get_seq_length();
27823
- if (past_length < model_inputs.input_ids.dims[1]) {
27824
- const [full_position_ids, rope_deltas] = this.get_rope_index(
27825
- model_inputs.input_ids,
27826
- model_inputs.image_grid_thw,
27827
- model_inputs.video_grid_thw,
27828
- model_inputs.attention_mask
27829
- );
27830
- model_inputs.rope_deltas = rope_deltas;
27831
- model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
27832
- model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
27833
- } else {
27834
- if (!model_inputs.rope_deltas) {
27835
- [, model_inputs.rope_deltas] = this.get_rope_index(
27836
- model_inputs.input_ids,
27837
- model_inputs.image_grid_thw,
27838
- model_inputs.video_grid_thw,
27839
- model_inputs.attention_mask
27840
- );
27841
- }
27842
- const delta = BigInt(past_length);
27843
- const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
27844
- model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
27845
- }
27846
- }
27847
- }
27848
- return model_inputs;
27849
- }
27850
- };
27851
- var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
27852
- };
27853
-
27854
- // src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
27855
- var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
27856
- image_grid_thw_name = "image_grid_thw";
27857
- };
27858
- var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
27859
- image_grid_thw_name = "image_grid_thw";
27860
- };
27861
-
27862
28117
  // src/models/qwen3/modeling_qwen3.js
27863
28118
  var Qwen3PreTrainedModel = class extends PreTrainedModel {
27864
28119
  };
@@ -28304,6 +28559,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
28304
28559
  }
28305
28560
  };
28306
28561
 
28562
+ // src/models/solar_open/modeling_solar_open.js
28563
+ var SolarOpenPreTrainedModel = class extends PreTrainedModel {
28564
+ };
28565
+ var SolarOpenModel = class extends SolarOpenPreTrainedModel {
28566
+ };
28567
+ var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
28568
+ };
28569
+
28307
28570
  // src/models/speecht5/modeling_speecht5.js
28308
28571
  var SpeechT5PreTrainedModel = class extends PreTrainedModel {
28309
28572
  };
@@ -29420,6 +29683,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
29420
29683
  // src/models/registry.js
29421
29684
  var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
29422
29685
  ["bert", "BertModel"],
29686
+ ["eurobert", "EuroBertModel"],
29423
29687
  ["neobert", "NeoBertModel"],
29424
29688
  ["modernbert", "ModernBertModel"],
29425
29689
  ["nomic_bert", "NomicBertModel"],
@@ -29551,6 +29815,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
29551
29815
  ["gemma3_text", "Gemma3Model"],
29552
29816
  ["helium", "HeliumModel"],
29553
29817
  ["glm", "GlmModel"],
29818
+ ["glm_moe_dsa", "GlmMoeDsaModel"],
29554
29819
  ["openelm", "OpenELMModel"],
29555
29820
  ["qwen2", "Qwen2Model"],
29556
29821
  ["qwen2_moe", "Qwen2MoeModel"],
@@ -29562,12 +29827,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
29562
29827
  ["mpt", "MptModel"],
29563
29828
  ["opt", "OPTModel"],
29564
29829
  ["mistral", "MistralModel"],
29830
+ ["mistral4", "Mistral4Model"],
29565
29831
  ["ministral", "MinistralModel"],
29566
29832
  ["ministral3", "Ministral3Model"],
29567
29833
  ["ernie4_5", "Ernie4_5ForCausalLM"],
29568
29834
  ["starcoder2", "Starcoder2Model"],
29835
+ ["deepseek_v3", "DeepseekV3Model"],
29569
29836
  ["falcon", "FalconModel"],
29570
29837
  ["falcon_h1", "FalconH1Model"],
29838
+ ["nemotron_h", "NemotronHModel"],
29839
+ ["solar_open", "SolarOpenModel"],
29571
29840
  ["stablelm", "StableLmModel"],
29572
29841
  ["modernbert-decoder", "ModernBertDecoderModel"],
29573
29842
  ["hunyuan_v1_dense", "HunYuanDenseV1Model"],
@@ -29587,6 +29856,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29587
29856
  ]);
29588
29857
  var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
29589
29858
  ["bert", "BertForSequenceClassification"],
29859
+ ["eurobert", "EuroBertForSequenceClassification"],
29590
29860
  ["neobert", "NeoBertForSequenceClassification"],
29591
29861
  ["modernbert", "ModernBertForSequenceClassification"],
29592
29862
  ["roformer", "RoFormerForSequenceClassification"],
@@ -29609,6 +29879,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
29609
29879
  ]);
29610
29880
  var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
29611
29881
  ["bert", "BertForTokenClassification"],
29882
+ ["eurobert", "EuroBertForTokenClassification"],
29612
29883
  ["neobert", "NeoBertForTokenClassification"],
29613
29884
  ["modernbert", "ModernBertForTokenClassification"],
29614
29885
  ["roformer", "RoFormerForTokenClassification"],
@@ -29671,6 +29942,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29671
29942
  ["gemma3", "Gemma3ForCausalLM"],
29672
29943
  ["helium", "HeliumForCausalLM"],
29673
29944
  ["glm", "GlmForCausalLM"],
29945
+ ["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
29674
29946
  ["openelm", "OpenELMForCausalLM"],
29675
29947
  ["qwen2", "Qwen2ForCausalLM"],
29676
29948
  ["qwen2_moe", "Qwen2MoeForCausalLM"],
@@ -29690,13 +29962,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29690
29962
  ["opt", "OPTForCausalLM"],
29691
29963
  ["mbart", "MBartForCausalLM"],
29692
29964
  ["mistral", "MistralForCausalLM"],
29965
+ ["mistral4", "Mistral4ForCausalLM"],
29693
29966
  ["ministral", "MinistralForCausalLM"],
29694
29967
  ["ministral3", "Ministral3ForCausalLM"],
29695
29968
  ["ernie4_5", "Ernie4_5ForCausalLM"],
29696
29969
  ["starcoder2", "Starcoder2ForCausalLM"],
29970
+ ["deepseek_v3", "DeepseekV3ForCausalLM"],
29697
29971
  ["falcon", "FalconForCausalLM"],
29698
29972
  ["falcon_h1", "FalconH1ForCausalLM"],
29973
+ ["nemotron_h", "NemotronHForCausalLM"],
29699
29974
  ["trocr", "TrOCRForCausalLM"],
29975
+ ["solar_open", "SolarOpenForCausalLM"],
29700
29976
  ["stablelm", "StableLmForCausalLM"],
29701
29977
  ["modernbert-decoder", "ModernBertDecoderForCausalLM"],
29702
29978
  ["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
@@ -29707,6 +29983,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29707
29983
  var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
29708
29984
  var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
29709
29985
  ["bert", "BertForMaskedLM"],
29986
+ ["eurobert", "EuroBertForMaskedLM"],
29710
29987
  ["neobert", "NeoBertForMaskedLM"],
29711
29988
  ["modernbert", "ModernBertForMaskedLM"],
29712
29989
  ["roformer", "RoFormerForMaskedLM"],
@@ -29765,7 +30042,9 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
29765
30042
  ["paligemma", "PaliGemmaForConditionalGeneration"],
29766
30043
  ["llava_qwen2", "LlavaQwen2ForCausalLM"],
29767
30044
  ["gemma3n", "Gemma3nForConditionalGeneration"],
29768
- ["mistral3", "Mistral3ForConditionalGeneration"]
30045
+ ["mistral3", "Mistral3ForConditionalGeneration"],
30046
+ ["lighton_ocr", "LightOnOcrForConditionalGeneration"],
30047
+ ["glm_ocr", "GlmOcrForConditionalGeneration"]
29769
30048
  ]);
29770
30049
  var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
29771
30050
  ["granite_speech", "GraniteSpeechForConditionalGeneration"],
@@ -29870,6 +30149,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
29870
30149
  ]);
29871
30150
  var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
29872
30151
  var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
30152
+ ["chmv2", "CHMv2ForDepthEstimation"],
29873
30153
  ["dpt", "DPTForDepthEstimation"],
29874
30154
  ["depth_anything", "DepthAnythingForDepthEstimation"],
29875
30155
  ["glpn", "GLPNForDepthEstimation"],
@@ -32608,6 +32888,9 @@ export {
32608
32888
  BloomModel,
32609
32889
  BloomPreTrainedModel,
32610
32890
  BloomTokenizer,
32891
+ CHMv2ForDepthEstimation,
32892
+ CHMv2ImageProcessor,
32893
+ CHMv2PreTrainedModel,
32611
32894
  CLIPFeatureExtractor,
32612
32895
  CLIPImageProcessor,
32613
32896
  CLIPModel,
@@ -32703,6 +32986,9 @@ export {
32703
32986
  DebertaV2Tokenizer,
32704
32987
  DecisionTransformerModel,
32705
32988
  DecisionTransformerPreTrainedModel,
32989
+ DeepseekV3ForCausalLM,
32990
+ DeepseekV3Model,
32991
+ DeepseekV3PreTrainedModel,
32706
32992
  DeiTFeatureExtractor,
32707
32993
  DeiTForImageClassification,
32708
32994
  DeiTImageProcessor,
@@ -32763,6 +33049,11 @@ export {
32763
33049
  EsmModel,
32764
33050
  EsmPreTrainedModel,
32765
33051
  EsmTokenizer,
33052
+ EuroBertForMaskedLM,
33053
+ EuroBertForSequenceClassification,
33054
+ EuroBertForTokenClassification,
33055
+ EuroBertModel,
33056
+ EuroBertPreTrainedModel,
32766
33057
  ExaoneForCausalLM,
32767
33058
  ExaoneModel,
32768
33059
  ExaonePreTrainedModel,
@@ -32820,8 +33111,14 @@ export {
32820
33111
  GemmaModel,
32821
33112
  GemmaPreTrainedModel,
32822
33113
  GemmaTokenizer,
33114
+ Glm46VImageProcessor,
33115
+ Glm46VProcessor,
32823
33116
  GlmForCausalLM,
32824
33117
  GlmModel,
33118
+ GlmMoeDsaForCausalLM,
33119
+ GlmMoeDsaModel,
33120
+ GlmMoeDsaPreTrainedModel,
33121
+ GlmOcrForConditionalGeneration,
32825
33122
  GlmPreTrainedModel,
32826
33123
  GptOssForCausalLM,
32827
33124
  GptOssModel,
@@ -32887,6 +33184,7 @@ export {
32887
33184
  Lfm2VlForConditionalGeneration,
32888
33185
  Lfm2VlImageProcessor,
32889
33186
  Lfm2VlProcessor,
33187
+ LightOnOcrForConditionalGeneration,
32890
33188
  LiteWhisperForConditionalGeneration,
32891
33189
  Llama4ForCausalLM,
32892
33190
  Llama4PreTrainedModel,
@@ -32956,6 +33254,9 @@ export {
32956
33254
  MimiPreTrainedModel,
32957
33255
  MinLengthLogitsProcessor,
32958
33256
  MinNewTokensLengthLogitsProcessor,
33257
+ Mistral4ForCausalLM,
33258
+ Mistral4Model,
33259
+ Mistral4PreTrainedModel,
32959
33260
  MistralForCausalLM,
32960
33261
  MistralModel,
32961
33262
  MistralPreTrainedModel,
@@ -33027,6 +33328,9 @@ export {
33027
33328
  NanoChatForCausalLM,
33028
33329
  NanoChatModel,
33029
33330
  NanoChatPreTrainedModel,
33331
+ NemotronHForCausalLM,
33332
+ NemotronHModel,
33333
+ NemotronHPreTrainedModel,
33030
33334
  NeoBertForMaskedLM,
33031
33335
  NeoBertForQuestionAnswering,
33032
33336
  NeoBertForSequenceClassification,
@@ -33216,6 +33520,9 @@ export {
33216
33520
  SnacFeatureExtractor,
33217
33521
  SnacModel,
33218
33522
  SnacPreTrainedModel,
33523
+ SolarOpenForCausalLM,
33524
+ SolarOpenModel,
33525
+ SolarOpenPreTrainedModel,
33219
33526
  SpeechT5FeatureExtractor,
33220
33527
  SpeechT5ForSpeechToText,
33221
33528
  SpeechT5ForTextToSpeech,