@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/transformers.js +689 -382
- package/dist/transformers.min.js +19 -19
- package/dist/transformers.node.cjs +716 -382
- package/dist/transformers.node.min.cjs +19 -19
- package/dist/transformers.node.min.mjs +19 -19
- package/dist/transformers.node.mjs +689 -382
- package/dist/transformers.web.js +697 -390
- package/dist/transformers.web.min.js +17 -17
- package/package.json +2 -2
- package/src/configs.js +28 -22
- package/src/env.js +1 -1
- package/src/image_processors_utils.js +25 -15
- package/src/models/chmv2/image_processing_chmv2.js +3 -0
- package/src/models/chmv2/modeling_chmv2.js +4 -0
- package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
- package/src/models/eurobert/modeling_eurobert.js +41 -0
- package/src/models/glm46v/image_processing_glm46v.js +12 -0
- package/src/models/glm46v/processing_glm46v.js +5 -0
- package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
- package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
- package/src/models/image_processors.js +2 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
- package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
- package/src/models/mistral4/modeling_mistral4.js +5 -0
- package/src/models/modeling_utils.js +2 -0
- package/src/models/models.js +10 -1
- package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
- package/src/models/processors.js +1 -0
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +187 -136
- package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
- package/src/models/registry.js +17 -0
- package/src/models/solar_open/modeling_solar_open.js +5 -0
- package/src/pipelines.js +1 -0
- package/src/utils/hub.js +4 -1
- package/src/utils/model_registry/get_file_metadata.js +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +3 -2
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
- package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
- package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
- package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
- package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
- package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
- package/types/models/glm46v/processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
- package/types/models/image_processors.d.ts +2 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
- package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
- package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +10 -1
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
- package/types/models/processors.d.ts +1 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
- package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
- package/types/pipelines.d.ts +1 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/models/ast/modeling_ast.d.ts.map +0 -1
- /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
package/dist/transformers.js
CHANGED
|
@@ -20,7 +20,7 @@ var node_path_default = {};
|
|
|
20
20
|
var node_url_default = {};
|
|
21
21
|
|
|
22
22
|
// src/env.js
|
|
23
|
-
var VERSION = "4.0.0-next.
|
|
23
|
+
var VERSION = "4.0.0-next.8";
|
|
24
24
|
var HAS_SELF = typeof self !== "undefined";
|
|
25
25
|
var IS_FS_AVAILABLE = !isEmpty(node_fs_default);
|
|
26
26
|
var IS_PATH_AVAILABLE = !isEmpty(node_path_default);
|
|
@@ -250,7 +250,7 @@ var logger = {
|
|
|
250
250
|
}
|
|
251
251
|
};
|
|
252
252
|
|
|
253
|
-
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.
|
|
253
|
+
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
|
|
254
254
|
var DictionarySplitter = class {
|
|
255
255
|
/**
|
|
256
256
|
* @param dictionary The dictionary of words to use for splitting.
|
|
@@ -1906,10 +1906,10 @@ var BPE = class extends TokenizerModel_default {
|
|
|
1906
1906
|
);
|
|
1907
1907
|
if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
|
|
1908
1908
|
output_tokens.push(...byte_tokens);
|
|
1909
|
-
} else {
|
|
1909
|
+
} else if (this.unk_token != null) {
|
|
1910
1910
|
output_tokens.push(this.unk_token);
|
|
1911
1911
|
}
|
|
1912
|
-
} else {
|
|
1912
|
+
} else if (this.unk_token != null) {
|
|
1913
1913
|
output_tokens.push(this.unk_token);
|
|
1914
1914
|
}
|
|
1915
1915
|
}
|
|
@@ -6515,13 +6515,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
|
|
|
6515
6515
|
wrapped_progress
|
|
6516
6516
|
);
|
|
6517
6517
|
} else if (typeof response !== "string") {
|
|
6518
|
+
const headers = new Headers(response.headers);
|
|
6519
|
+
headers.set("content-length", result.byteLength.toString());
|
|
6518
6520
|
await cache2.put(
|
|
6519
6521
|
cacheKey,
|
|
6520
6522
|
new Response(
|
|
6521
6523
|
/** @type {any} */
|
|
6522
6524
|
result,
|
|
6523
6525
|
{
|
|
6524
|
-
headers
|
|
6526
|
+
headers
|
|
6525
6527
|
}
|
|
6526
6528
|
)
|
|
6527
6529
|
).catch((err) => {
|
|
@@ -16498,6 +16500,7 @@ __export(processors_exports, {
|
|
|
16498
16500
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
16499
16501
|
Florence2Processor: () => Florence2Processor,
|
|
16500
16502
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
16503
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
16501
16504
|
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
16502
16505
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
16503
16506
|
Idefics3Processor: () => Idefics3Processor,
|
|
@@ -19011,26 +19014,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
19011
19014
|
}
|
|
19012
19015
|
return [segmentation, segments];
|
|
19013
19016
|
}
|
|
19014
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
19017
|
+
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
|
|
19015
19018
|
if (height < factor || width < factor) {
|
|
19016
|
-
|
|
19017
|
-
|
|
19019
|
+
const scale = Math.max(factor / height, factor / width);
|
|
19020
|
+
height = Math.round(height * scale);
|
|
19021
|
+
width = Math.round(width * scale);
|
|
19022
|
+
}
|
|
19023
|
+
if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
19018
19024
|
throw new Error(
|
|
19019
19025
|
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
19020
19026
|
);
|
|
19021
19027
|
}
|
|
19022
19028
|
let h_bar = Math.round(height / factor) * factor;
|
|
19023
19029
|
let w_bar = Math.round(width / factor) * factor;
|
|
19024
|
-
if (h_bar * w_bar > max_pixels) {
|
|
19025
|
-
const beta = Math.sqrt(height * width / max_pixels);
|
|
19026
|
-
h_bar = Math.floor(height / beta / factor) * factor;
|
|
19027
|
-
w_bar = Math.floor(width / beta / factor) * factor;
|
|
19028
|
-
} else if (h_bar * w_bar < min_pixels) {
|
|
19029
|
-
const beta = Math.sqrt(min_pixels / (height * width));
|
|
19030
|
+
if (temporal_factor * h_bar * w_bar > max_pixels) {
|
|
19031
|
+
const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
|
|
19032
|
+
h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
|
|
19033
|
+
w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
|
|
19034
|
+
} else if (temporal_factor * h_bar * w_bar < min_pixels) {
|
|
19035
|
+
const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
|
|
19030
19036
|
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
19031
19037
|
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
19032
19038
|
}
|
|
19033
|
-
return [
|
|
19039
|
+
return [w_bar, h_bar];
|
|
19034
19040
|
}
|
|
19035
19041
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
19036
19042
|
if (label_ids_to_fuse === null) {
|
|
@@ -19109,7 +19115,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
19109
19115
|
this.do_pad = config.do_pad;
|
|
19110
19116
|
this.min_pixels = config.min_pixels;
|
|
19111
19117
|
this.max_pixels = config.max_pixels;
|
|
19112
|
-
if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
19118
|
+
if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
19113
19119
|
this.pad_size = this.size;
|
|
19114
19120
|
}
|
|
19115
19121
|
this.do_flip_channel_order = config.do_flip_channel_order ?? false;
|
|
@@ -19397,10 +19403,8 @@ var ImageProcessor = class extends Callable2 {
|
|
|
19397
19403
|
const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
|
|
19398
19404
|
[pixelData, imgDims] = padded;
|
|
19399
19405
|
} else if (this.size_divisibility) {
|
|
19400
|
-
const
|
|
19401
|
-
|
|
19402
|
-
this.size_divisibility
|
|
19403
|
-
);
|
|
19406
|
+
const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
|
|
19407
|
+
const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
|
|
19404
19408
|
[pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
|
|
19405
19409
|
}
|
|
19406
19410
|
}
|
|
@@ -19477,6 +19481,7 @@ var image_processors_exports = {};
|
|
|
19477
19481
|
__export(image_processors_exports, {
|
|
19478
19482
|
BeitFeatureExtractor: () => BeitFeatureExtractor,
|
|
19479
19483
|
BitImageProcessor: () => BitImageProcessor,
|
|
19484
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
19480
19485
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
19481
19486
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
19482
19487
|
ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
|
|
@@ -19493,6 +19498,7 @@ __export(image_processors_exports, {
|
|
|
19493
19498
|
DonutImageProcessor: () => DonutImageProcessor,
|
|
19494
19499
|
EfficientNetImageProcessor: () => EfficientNetImageProcessor,
|
|
19495
19500
|
GLPNFeatureExtractor: () => GLPNFeatureExtractor,
|
|
19501
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
19496
19502
|
GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
|
|
19497
19503
|
Idefics3ImageProcessor: () => Idefics3ImageProcessor,
|
|
19498
19504
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
@@ -19553,6 +19559,10 @@ var BitImageProcessor = class extends ImageProcessor {
|
|
|
19553
19559
|
var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
|
|
19554
19560
|
};
|
|
19555
19561
|
|
|
19562
|
+
// src/models/chmv2/image_processing_chmv2.js
|
|
19563
|
+
var CHMv2ImageProcessor = class extends ImageProcessor {
|
|
19564
|
+
};
|
|
19565
|
+
|
|
19556
19566
|
// src/models/clip/image_processing_clip.js
|
|
19557
19567
|
var CLIPImageProcessor = class extends ImageProcessor {
|
|
19558
19568
|
};
|
|
@@ -19672,6 +19682,65 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
|
|
|
19672
19682
|
}
|
|
19673
19683
|
};
|
|
19674
19684
|
|
|
19685
|
+
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
19686
|
+
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
19687
|
+
constructor(config) {
|
|
19688
|
+
super(config);
|
|
19689
|
+
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
19690
|
+
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
19691
|
+
this.patch_size = config.patch_size;
|
|
19692
|
+
this.merge_size = config.merge_size;
|
|
19693
|
+
}
|
|
19694
|
+
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
19695
|
+
get_resize_output_image_size(image, size) {
|
|
19696
|
+
const factor = this.patch_size * this.merge_size;
|
|
19697
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
19698
|
+
}
|
|
19699
|
+
async _call(images, ...args) {
|
|
19700
|
+
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
19701
|
+
let patches = pixel_values;
|
|
19702
|
+
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
19703
|
+
if (patches.dims[0] === 1) {
|
|
19704
|
+
patches = cat(
|
|
19705
|
+
Array.from({ length: temporal_patch_size }, () => patches),
|
|
19706
|
+
0
|
|
19707
|
+
);
|
|
19708
|
+
}
|
|
19709
|
+
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
19710
|
+
const channel = patches.dims[1];
|
|
19711
|
+
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
19712
|
+
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
19713
|
+
const flatten_patches = patches.view(
|
|
19714
|
+
grid_t,
|
|
19715
|
+
temporal_patch_size,
|
|
19716
|
+
channel,
|
|
19717
|
+
Math.floor(grid_h / merge_size),
|
|
19718
|
+
merge_size,
|
|
19719
|
+
patch_size,
|
|
19720
|
+
Math.floor(grid_w / merge_size),
|
|
19721
|
+
merge_size,
|
|
19722
|
+
patch_size
|
|
19723
|
+
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
19724
|
+
const image_grid_thw = new Tensor3("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
19725
|
+
return {
|
|
19726
|
+
pixel_values: flatten_patches,
|
|
19727
|
+
image_grid_thw,
|
|
19728
|
+
original_sizes,
|
|
19729
|
+
reshaped_input_sizes
|
|
19730
|
+
};
|
|
19731
|
+
}
|
|
19732
|
+
};
|
|
19733
|
+
|
|
19734
|
+
// src/models/glm46v/image_processing_glm46v.js
|
|
19735
|
+
var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
|
|
19736
|
+
/** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
|
|
19737
|
+
get_resize_output_image_size(image, size) {
|
|
19738
|
+
const factor = this.patch_size * this.merge_size;
|
|
19739
|
+
const temporal_factor = this.config.temporal_patch_size ?? 2;
|
|
19740
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
|
|
19741
|
+
}
|
|
19742
|
+
};
|
|
19743
|
+
|
|
19675
19744
|
// src/models/glpn/image_processing_glpn.js
|
|
19676
19745
|
var GLPNFeatureExtractor = class extends ImageProcessor {
|
|
19677
19746
|
};
|
|
@@ -20065,7 +20134,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
|
|
|
20065
20134
|
const img = pixel_values.unsqueeze_(0);
|
|
20066
20135
|
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
20067
20136
|
const f2 = total_factor ** 2;
|
|
20068
|
-
const [
|
|
20137
|
+
const [new_width, new_height] = smart_resize(
|
|
20069
20138
|
Math.max(total_factor, height),
|
|
20070
20139
|
Math.max(total_factor, width),
|
|
20071
20140
|
total_factor,
|
|
@@ -20355,55 +20424,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
|
|
|
20355
20424
|
var PvtImageProcessor = class extends ImageProcessor {
|
|
20356
20425
|
};
|
|
20357
20426
|
|
|
20358
|
-
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
20359
|
-
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
20360
|
-
constructor(config) {
|
|
20361
|
-
super(config);
|
|
20362
|
-
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
20363
|
-
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
20364
|
-
this.patch_size = config.patch_size;
|
|
20365
|
-
this.merge_size = config.merge_size;
|
|
20366
|
-
}
|
|
20367
|
-
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
20368
|
-
get_resize_output_image_size(image, size) {
|
|
20369
|
-
const factor = this.patch_size * this.merge_size;
|
|
20370
|
-
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
20371
|
-
}
|
|
20372
|
-
async _call(images, ...args) {
|
|
20373
|
-
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
20374
|
-
let patches = pixel_values;
|
|
20375
|
-
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
20376
|
-
if (patches.dims[0] === 1) {
|
|
20377
|
-
patches = cat(
|
|
20378
|
-
Array.from({ length: temporal_patch_size }, () => patches),
|
|
20379
|
-
0
|
|
20380
|
-
);
|
|
20381
|
-
}
|
|
20382
|
-
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
20383
|
-
const channel = patches.dims[1];
|
|
20384
|
-
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
20385
|
-
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
20386
|
-
const flatten_patches = patches.view(
|
|
20387
|
-
grid_t,
|
|
20388
|
-
temporal_patch_size,
|
|
20389
|
-
channel,
|
|
20390
|
-
Math.floor(grid_h / merge_size),
|
|
20391
|
-
merge_size,
|
|
20392
|
-
patch_size,
|
|
20393
|
-
Math.floor(grid_w / merge_size),
|
|
20394
|
-
merge_size,
|
|
20395
|
-
patch_size
|
|
20396
|
-
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
20397
|
-
const image_grid_thw = new Tensor3("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
20398
|
-
return {
|
|
20399
|
-
pixel_values: flatten_patches,
|
|
20400
|
-
image_grid_thw,
|
|
20401
|
-
original_sizes,
|
|
20402
|
-
reshaped_input_sizes
|
|
20403
|
-
};
|
|
20404
|
-
}
|
|
20405
|
-
};
|
|
20406
|
-
|
|
20407
20427
|
// src/models/rt_detr/image_processing_rt_detr.js
|
|
20408
20428
|
var RTDetrImageProcessor = class extends ImageProcessor {
|
|
20409
20429
|
/** @type {typeof post_process_object_detection} */
|
|
@@ -20957,6 +20977,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
20957
20977
|
}
|
|
20958
20978
|
};
|
|
20959
20979
|
|
|
20980
|
+
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
20981
|
+
var Qwen2VLProcessor = class extends Processor {
|
|
20982
|
+
static image_processor_class = AutoImageProcessor;
|
|
20983
|
+
static tokenizer_class = AutoTokenizer;
|
|
20984
|
+
static image_token = "<|image_pad|>";
|
|
20985
|
+
/**
|
|
20986
|
+
*
|
|
20987
|
+
* @param {string|string[]} text
|
|
20988
|
+
* @param {RawImage|RawImage[]} images
|
|
20989
|
+
* @param {...any} args
|
|
20990
|
+
* @returns {Promise<any>}
|
|
20991
|
+
*/
|
|
20992
|
+
async _call(text, images = null, ...args) {
|
|
20993
|
+
if (!Array.isArray(text)) {
|
|
20994
|
+
text = [text];
|
|
20995
|
+
}
|
|
20996
|
+
let image_inputs, image_grid_thw;
|
|
20997
|
+
if (images) {
|
|
20998
|
+
image_inputs = await this.image_processor(images);
|
|
20999
|
+
image_grid_thw = image_inputs.image_grid_thw;
|
|
21000
|
+
}
|
|
21001
|
+
if (image_grid_thw) {
|
|
21002
|
+
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
21003
|
+
let index = 0;
|
|
21004
|
+
const image_token = (
|
|
21005
|
+
/** @type {typeof Qwen2VLProcessor} */
|
|
21006
|
+
this.constructor.image_token
|
|
21007
|
+
);
|
|
21008
|
+
const image_grid_thw_list = image_grid_thw.tolist();
|
|
21009
|
+
text = text.map((t) => {
|
|
21010
|
+
while (t.includes(image_token)) {
|
|
21011
|
+
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
21012
|
+
t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
21013
|
+
}
|
|
21014
|
+
return t.replaceAll("<|placeholder|>", image_token);
|
|
21015
|
+
});
|
|
21016
|
+
}
|
|
21017
|
+
const text_inputs = this.tokenizer(text);
|
|
21018
|
+
return {
|
|
21019
|
+
...text_inputs,
|
|
21020
|
+
...image_inputs
|
|
21021
|
+
};
|
|
21022
|
+
}
|
|
21023
|
+
};
|
|
21024
|
+
|
|
21025
|
+
// src/models/glm46v/processing_glm46v.js
|
|
21026
|
+
var Glm46VProcessor = class extends Qwen2VLProcessor {
|
|
21027
|
+
static image_token = "<|image|>";
|
|
21028
|
+
};
|
|
21029
|
+
|
|
20960
21030
|
// src/models/granite_speech/processing_granite_speech.js
|
|
20961
21031
|
var GraniteSpeechProcessor = class extends Processor {
|
|
20962
21032
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -21687,47 +21757,6 @@ var PyAnnoteProcessor = class extends Processor {
|
|
|
21687
21757
|
}
|
|
21688
21758
|
};
|
|
21689
21759
|
|
|
21690
|
-
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
21691
|
-
var Qwen2VLProcessor = class extends Processor {
|
|
21692
|
-
static image_processor_class = AutoImageProcessor;
|
|
21693
|
-
static tokenizer_class = AutoTokenizer;
|
|
21694
|
-
/**
|
|
21695
|
-
*
|
|
21696
|
-
* @param {string|string[]} text
|
|
21697
|
-
* @param {RawImage|RawImage[]} images
|
|
21698
|
-
* @param {...any} args
|
|
21699
|
-
* @returns {Promise<any>}
|
|
21700
|
-
*/
|
|
21701
|
-
async _call(text, images = null, ...args) {
|
|
21702
|
-
if (!Array.isArray(text)) {
|
|
21703
|
-
text = [text];
|
|
21704
|
-
}
|
|
21705
|
-
let image_inputs, image_grid_thw;
|
|
21706
|
-
if (images) {
|
|
21707
|
-
image_inputs = await this.image_processor(images);
|
|
21708
|
-
image_grid_thw = image_inputs.image_grid_thw;
|
|
21709
|
-
}
|
|
21710
|
-
if (image_grid_thw) {
|
|
21711
|
-
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
21712
|
-
let index = 0;
|
|
21713
|
-
const image_grid_thw_list = image_grid_thw.tolist();
|
|
21714
|
-
text = text.map((t) => {
|
|
21715
|
-
while (t.includes("<|image_pad|>")) {
|
|
21716
|
-
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
21717
|
-
t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
21718
|
-
}
|
|
21719
|
-
return t.replaceAll("<|placeholder|>", "<|image_pad|>");
|
|
21720
|
-
});
|
|
21721
|
-
}
|
|
21722
|
-
const text_inputs = this.tokenizer(text);
|
|
21723
|
-
return {
|
|
21724
|
-
...text_inputs,
|
|
21725
|
-
...image_inputs
|
|
21726
|
-
// TODO: ...videos_inputs,
|
|
21727
|
-
};
|
|
21728
|
-
}
|
|
21729
|
-
};
|
|
21730
|
-
|
|
21731
21760
|
// src/models/qwen2_5_vl/processing_qwen2_5_vl.js
|
|
21732
21761
|
var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
|
|
21733
21762
|
};
|
|
@@ -22071,6 +22100,8 @@ function getNormalizedConfig(config) {
|
|
|
22071
22100
|
case "gemma3n":
|
|
22072
22101
|
case "lfm2_vl":
|
|
22073
22102
|
case "chatterbox":
|
|
22103
|
+
case "lighton_ocr":
|
|
22104
|
+
case "glm_ocr":
|
|
22074
22105
|
case "mistral3":
|
|
22075
22106
|
case "qwen2_5_vl":
|
|
22076
22107
|
case "qwen3_vl":
|
|
@@ -22146,6 +22177,8 @@ function getNormalizedConfig(config) {
|
|
|
22146
22177
|
mapping["dim_kv"] = "head_dim";
|
|
22147
22178
|
break;
|
|
22148
22179
|
case "qwen3":
|
|
22180
|
+
case "solar_open":
|
|
22181
|
+
case "glm_ocr_text":
|
|
22149
22182
|
case "gemma":
|
|
22150
22183
|
case "gemma2":
|
|
22151
22184
|
case "vaultgemma":
|
|
@@ -22156,6 +22189,7 @@ function getNormalizedConfig(config) {
|
|
|
22156
22189
|
case "ernie4_5":
|
|
22157
22190
|
case "hunyuan_v1_dense":
|
|
22158
22191
|
case "falcon_h1":
|
|
22192
|
+
case "nemotron_h":
|
|
22159
22193
|
case "ministral":
|
|
22160
22194
|
case "ministral3":
|
|
22161
22195
|
mapping["num_heads"] = "num_key_value_heads";
|
|
@@ -22190,6 +22224,9 @@ function getNormalizedConfig(config) {
|
|
|
22190
22224
|
mapping["num_attention_heads"] = "num_attention_heads";
|
|
22191
22225
|
break;
|
|
22192
22226
|
case "youtu":
|
|
22227
|
+
case "deepseek_v3":
|
|
22228
|
+
case "glm_moe_dsa":
|
|
22229
|
+
case "mistral4":
|
|
22193
22230
|
mapping["num_heads"] = "num_key_value_heads";
|
|
22194
22231
|
mapping["num_layers"] = "num_hidden_layers";
|
|
22195
22232
|
mapping["dim_kv"] = "qk_head_dim";
|
|
@@ -22278,6 +22315,7 @@ function getCacheShapes(config, options) {
|
|
|
22278
22315
|
if (!(config instanceof PretrainedConfig)) {
|
|
22279
22316
|
config = new PretrainedConfig(config);
|
|
22280
22317
|
}
|
|
22318
|
+
const batch_size = options?.batch_size ?? 1;
|
|
22281
22319
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
22282
22320
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
22283
22321
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -22287,7 +22325,6 @@ function getCacheShapes(config, options) {
|
|
|
22287
22325
|
config
|
|
22288
22326
|
);
|
|
22289
22327
|
const head_dim = hidden_size / num_attention_heads;
|
|
22290
|
-
const batch_size = options?.batch_size ?? 1;
|
|
22291
22328
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
22292
22329
|
if (layer_types[i] === "full_attention") {
|
|
22293
22330
|
for (const kv of ["key", "value"]) {
|
|
@@ -22300,31 +22337,26 @@ function getCacheShapes(config, options) {
|
|
|
22300
22337
|
}
|
|
22301
22338
|
}
|
|
22302
22339
|
return cache_values;
|
|
22303
|
-
} else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
|
|
22340
|
+
} else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
|
|
22304
22341
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
22305
22342
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
22306
|
-
const
|
|
22307
|
-
const {
|
|
22308
|
-
layer_types,
|
|
22309
|
-
num_hidden_layers,
|
|
22310
|
-
num_attention_heads,
|
|
22311
|
-
num_key_value_heads,
|
|
22312
|
-
hidden_size,
|
|
22313
|
-
mamba_d_conv,
|
|
22314
|
-
mamba_n_heads,
|
|
22315
|
-
mamba_d_head,
|
|
22316
|
-
mamba_d_state,
|
|
22317
|
-
mamba_n_groups,
|
|
22318
|
-
mamba_expand,
|
|
22319
|
-
mamba_d_ssm
|
|
22320
|
-
} = (
|
|
22343
|
+
const c = (
|
|
22321
22344
|
/** @type {any} */
|
|
22322
22345
|
config
|
|
22323
22346
|
);
|
|
22324
|
-
const
|
|
22325
|
-
const
|
|
22326
|
-
const
|
|
22327
|
-
|
|
22347
|
+
const layer_types = c.layer_types ?? c.layers_block_type;
|
|
22348
|
+
const num_layers = c.num_hidden_layers ?? layer_types?.length;
|
|
22349
|
+
const num_key_value_heads = c.num_key_value_heads;
|
|
22350
|
+
const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
|
|
22351
|
+
const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
|
|
22352
|
+
const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
|
|
22353
|
+
const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
|
|
22354
|
+
const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
|
|
22355
|
+
const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
|
|
22356
|
+
const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
|
|
22357
|
+
const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
|
|
22358
|
+
const cache_values = {};
|
|
22359
|
+
for (let i = 0; i < num_layers; ++i) {
|
|
22328
22360
|
if (!layer_types || layer_types[i] === "mamba") {
|
|
22329
22361
|
cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
|
|
22330
22362
|
cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
|
|
@@ -22358,7 +22390,6 @@ function getCacheShapes(config, options) {
|
|
|
22358
22390
|
const key_dim = linear_key_head_dim * linear_num_key_heads;
|
|
22359
22391
|
const value_dim = linear_value_head_dim * linear_num_value_heads;
|
|
22360
22392
|
const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
|
|
22361
|
-
const batch_size = options?.batch_size ?? 1;
|
|
22362
22393
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
22363
22394
|
if (layer_types[i] === "full_attention") {
|
|
22364
22395
|
for (const kv of ["key", "value"]) {
|
|
@@ -24986,7 +25017,9 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24986
25017
|
"qwen3_5",
|
|
24987
25018
|
"qwen3_5_text",
|
|
24988
25019
|
"qwen3_5_moe",
|
|
24989
|
-
"qwen3_5_moe_text"
|
|
25020
|
+
"qwen3_5_moe_text",
|
|
25021
|
+
"glm_ocr",
|
|
25022
|
+
"glm_ocr_text"
|
|
24990
25023
|
].includes(self2.config.model_type)
|
|
24991
25024
|
) {
|
|
24992
25025
|
const { image_grid_thw, video_grid_thw } = kwargs;
|
|
@@ -25210,6 +25243,8 @@ __export(models_exports, {
|
|
|
25210
25243
|
BloomForCausalLM: () => BloomForCausalLM,
|
|
25211
25244
|
BloomModel: () => BloomModel,
|
|
25212
25245
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
25246
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
25247
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
25213
25248
|
CLIPModel: () => CLIPModel,
|
|
25214
25249
|
CLIPPreTrainedModel: () => CLIPPreTrainedModel,
|
|
25215
25250
|
CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
|
|
@@ -25284,6 +25319,9 @@ __export(models_exports, {
|
|
|
25284
25319
|
DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
|
|
25285
25320
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
25286
25321
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
25322
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
25323
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
25324
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
25287
25325
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
25288
25326
|
DeiTModel: () => DeiTModel,
|
|
25289
25327
|
DeiTPreTrainedModel: () => DeiTPreTrainedModel,
|
|
@@ -25329,6 +25367,11 @@ __export(models_exports, {
|
|
|
25329
25367
|
EsmForTokenClassification: () => EsmForTokenClassification,
|
|
25330
25368
|
EsmModel: () => EsmModel,
|
|
25331
25369
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
25370
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
25371
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
25372
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
25373
|
+
EuroBertModel: () => EuroBertModel,
|
|
25374
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
25332
25375
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
25333
25376
|
ExaoneModel: () => ExaoneModel,
|
|
25334
25377
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -25375,6 +25418,10 @@ __export(models_exports, {
|
|
|
25375
25418
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
25376
25419
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
25377
25420
|
GlmModel: () => GlmModel,
|
|
25421
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
25422
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
25423
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
25424
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
25378
25425
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
25379
25426
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
25380
25427
|
GptOssModel: () => GptOssModel,
|
|
@@ -25421,6 +25468,7 @@ __export(models_exports, {
|
|
|
25421
25468
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
25422
25469
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
25423
25470
|
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
25471
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
25424
25472
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
25425
25473
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
25426
25474
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -25470,6 +25518,9 @@ __export(models_exports, {
|
|
|
25470
25518
|
MimiEncoderOutput: () => MimiEncoderOutput,
|
|
25471
25519
|
MimiModel: () => MimiModel,
|
|
25472
25520
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
25521
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
25522
|
+
Mistral4Model: () => Mistral4Model,
|
|
25523
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
25473
25524
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
25474
25525
|
MistralModel: () => MistralModel,
|
|
25475
25526
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -25527,6 +25578,9 @@ __export(models_exports, {
|
|
|
25527
25578
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
25528
25579
|
NanoChatModel: () => NanoChatModel,
|
|
25529
25580
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
25581
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
25582
|
+
NemotronHModel: () => NemotronHModel,
|
|
25583
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
25530
25584
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
25531
25585
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
25532
25586
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -25664,6 +25718,9 @@ __export(models_exports, {
|
|
|
25664
25718
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
25665
25719
|
SnacModel: () => SnacModel,
|
|
25666
25720
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
25721
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
25722
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
25723
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
25667
25724
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
25668
25725
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
25669
25726
|
SpeechT5HifiGan: () => SpeechT5HifiGan,
|
|
@@ -25838,7 +25895,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
|
|
|
25838
25895
|
var ArceeForCausalLM = class extends ArceePreTrainedModel {
|
|
25839
25896
|
};
|
|
25840
25897
|
|
|
25841
|
-
// src/models/
|
|
25898
|
+
// src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
|
|
25842
25899
|
var ASTPreTrainedModel = class extends PreTrainedModel {
|
|
25843
25900
|
};
|
|
25844
25901
|
var ASTModel = class extends ASTPreTrainedModel {
|
|
@@ -26173,6 +26230,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
|
|
|
26173
26230
|
var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
|
|
26174
26231
|
};
|
|
26175
26232
|
|
|
26233
|
+
// src/models/chmv2/modeling_chmv2.js
|
|
26234
|
+
/**
 * Base class for the CHMv2 family. Declares no members of its own; everything
 * is inherited from `PreTrainedModel`.
 */
var CHMv2PreTrainedModel = class extends PreTrainedModel {};

/**
 * CHMv2 variant exported under the `ForDepthEstimation` name. Declares no
 * members of its own; everything is inherited from `CHMv2PreTrainedModel`.
 */
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {};
|
|
26238
|
+
|
|
26176
26239
|
// src/models/clap/modeling_clap.js
|
|
26177
26240
|
var ClapPreTrainedModel = class extends PreTrainedModel {
|
|
26178
26241
|
};
|
|
@@ -26511,6 +26574,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
|
|
|
26511
26574
|
}
|
|
26512
26575
|
};
|
|
26513
26576
|
|
|
26577
|
+
// src/models/deepseek_v3/modeling_deepseek_v3.js
|
|
26578
|
+
/**
 * Base class for the DeepseekV3 family. Adds nothing on top of `PreTrainedModel`.
 */
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {};

/** Bare DeepseekV3 model; an empty subclass of `DeepseekV3PreTrainedModel`. */
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {};

/** DeepseekV3 causal-LM variant; an empty subclass of `DeepseekV3PreTrainedModel`. */
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {};
|
|
26584
|
+
|
|
26514
26585
|
// src/models/deberta_v2/modeling_deberta_v2.js
|
|
26515
26586
|
var DebertaV2PreTrainedModel = class extends PreTrainedModel {
|
|
26516
26587
|
};
|
|
@@ -26859,6 +26930,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
|
|
|
26859
26930
|
}
|
|
26860
26931
|
};
|
|
26861
26932
|
|
|
26933
|
+
// src/models/eurobert/modeling_eurobert.js
|
|
26934
|
+
/**
 * Base class for the EuroBert family. Adds nothing on top of `PreTrainedModel`.
 */
var EuroBertPreTrainedModel = class extends PreTrainedModel {};

/** Bare EuroBert model; an empty subclass of `EuroBertPreTrainedModel`. */
var EuroBertModel = class extends EuroBertPreTrainedModel {};
|
|
26938
|
+
/**
 * EuroBert variant whose `_call` result is wrapped in a `MaskedLMOutput`.
 */
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps its result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<MaskedLMOutput>} The inherited call's result wrapped in a `MaskedLMOutput`.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new MaskedLMOutput(outputs);
  }
};
|
|
26949
|
+
/**
 * EuroBert variant whose `_call` result is wrapped in a `SequenceClassifierOutput`.
 */
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps its result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<SequenceClassifierOutput>} The inherited call's result wrapped in a `SequenceClassifierOutput`.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new SequenceClassifierOutput(outputs);
  }
};
|
|
26960
|
+
/**
 * EuroBert variant whose `_call` result is wrapped in a `TokenClassifierOutput`.
 */
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps its result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<TokenClassifierOutput>} The inherited call's result wrapped in a `TokenClassifierOutput`.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new TokenClassifierOutput(outputs);
  }
};
|
|
26971
|
+
|
|
26862
26972
|
// src/models/exaone/modeling_exaone.js
|
|
26863
26973
|
var ExaonePreTrainedModel = class extends PreTrainedModel {
|
|
26864
26974
|
};
|
|
@@ -27134,6 +27244,377 @@ var GlmModel = class extends GlmPreTrainedModel {
|
|
|
27134
27244
|
var GlmForCausalLM = class extends GlmPreTrainedModel {
|
|
27135
27245
|
};
|
|
27136
27246
|
|
|
27247
|
+
// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
|
|
27248
|
+
/**
 * Base class for the GlmMoeDsa family. Adds nothing on top of `PreTrainedModel`.
 */
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {};

/** Bare GlmMoeDsa model; an empty subclass of `GlmMoeDsaPreTrainedModel`. */
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {};

/** GlmMoeDsa causal-LM variant; an empty subclass of `GlmMoeDsaPreTrainedModel`. */
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {};
|
|
27254
|
+
|
|
27255
|
+
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
27256
|
+
/**
 * Base class for the Qwen2-VL family. Its only addition over `PreTrainedModel`
 * is the `forward_params` list below.
 */
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
  /** Input names declared by this model family: four text inputs followed by two vision inputs. */
  forward_params = [
    // Text inputs
    "input_ids", "attention_mask", "position_ids", "past_key_values",
    // Vision inputs
    "pixel_values", "image_grid_thw"
  ];
};
|
|
27268
|
+
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
  // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
  // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
  // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.

  /** Name of the vision-encoder session input that receives `image_grid_thw` (used by `encode_image`). */
  image_grid_thw_name = "grid_thw";

  /**
   * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
   *
   * With an attention mask, the per-token positions come from `cumsum_masked_fill`
   * (defined elsewhere) and the delta for each row is `max_position + 1 - sequence_length`,
   * the same formula the multimodal branch of `get_rope_index` uses. Without a mask,
   * every row gets positions `0..sequence_length-1` and a zero delta.
   *
   * @param {Tensor} input_ids Tensor of shape `(batch_size, sequence_length)`; only `dims` is read.
   * @param {Tensor} attention_mask Optional mask of shape `(batch_size, sequence_length)`.
   * @returns {[Tensor, Tensor]} [position_ids `(3, batch_size, sequence_length)`, mrope_position_deltas `(batch_size, 1)`]
   */
  _get_text_only_rope_index(input_ids, attention_mask) {
    if (attention_mask) {
      const { data, dims } = cumsum_masked_fill(attention_mask);
      // Replicate the (batch, seq) positions once per rope dimension (t, h, w).
      const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
      const mrope_position_deltas = Array.from(
        { length: dims[0] },
        // delta = (max position in this row) + 1 - sequence_length, so that a fully
        // attended row yields 0, matching the no-mask branch below.
        (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n - BigInt(dims[1])
      );
      return [
        new Tensor3("int64", position_ids, [3, ...dims]),
        new Tensor3("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
      ];
    } else {
      const [batch_size, seq_length] = input_ids.dims;
      // Flattened layout is (3, batch, seq), so the position of element i is its
      // offset inside the row, independent of the batch size.
      const position_ids = BigInt64Array.from(
        { length: 3 * batch_size * seq_length },
        (_, i) => BigInt(i % seq_length)
      );
      return [new Tensor3("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
    }
  }

  /**
   * Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
   * global [all_t, all_h, all_w] order, then write back into the position_ids array
   * respecting attention mask.
   *
   * @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
   * @param {number[]} attn_mask Attention mask for this batch element
   * @param {number[][][]} position_ids_list [3][batch][seq] output array to write into (mutated in place)
   * @param {number} batch_idx Current batch index
   * @returns {number[]} Flat reordered positions of length total_len
   */
  _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
    const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
    const llm_positions = new Array(total_len);
    let index = 0;
    // For each rope dimension, concatenate that dimension's slice of every segment.
    for (let x = 0; x < 3; ++x) {
      for (const val of llm_pos_ids_list) {
        const seg_len = val.length / 3;
        for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
          llm_positions[index++] = val[z];
        }
      }
    }
    // Scatter onto attended slots only; masked slots keep their initial value.
    let count2 = 0;
    for (let y = 0; y < attn_mask.length; ++y) {
      // Loose equality on purpose: mask entries may be BigInt (1n) or number (1).
      if (attn_mask[y] == 1) {
        for (let x = 0; x < 3; ++x) {
          position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
        }
        ++count2;
      }
    }
    return llm_positions;
  }

  /**
   * Build per-batch position ID segments for multimodal rope.
   * Override this in subclasses to change how vision/text segments are identified and positioned.
   *
   * Vision segments are counted from tokens that directly follow `vision_start_token_id`.
   * For each one, a text segment (up to the next image/video token) and a vision segment
   * (sized by the merged grid) are emitted; any trailing text becomes a final segment.
   *
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id, video_token_id, vision_start_token_id } = this.config;
    const ids = filtered_ids;
    // Loose equality throughout: token IDs may be BigInt while config IDs are numbers.
    const vision_start_indices = ids.reduce((acc, x, idx) => {
      if (x == vision_start_token_id) acc.push(idx);
      return acc;
    }, []);
    const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
    const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
    const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
    const llm_pos_ids_list = [];
    let st2 = 0;
    let remain_images = image_nums;
    let remain_videos = video_nums;
    for (let j = 0; j < vision_tokens.length; ++j) {
      const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
      const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
      // `ids.length + 1` acts as a "not available" sentinel that always loses the comparison below.
      const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
      const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
      let ed;
      let t, h, w;
      if (ed_image < ed_video) {
        [t, h, w] = image_grid_thw_list[state.image_index];
        ++state.image_index;
        --remain_images;
        ed = ed_image;
      } else {
        [t, h, w] = video_grid_thw_list[state.video_index];
        ++state.video_index;
        --remain_videos;
        ed = ed_video;
      }
      const [llm_grid_t, llm_grid_h, llm_grid_w] = [
        Number(t),
        Math.floor(Number(h) / spatial_merge_size),
        Math.floor(Number(w) / spatial_merge_size)
      ];
      // Text preceding this vision segment: identical positions in all three dimensions.
      const text_len = ed - st2;
      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
      // Vision segment: a (t, h, w) grid index per token, shifted past the preceding text.
      const offset = text_len + st_idx;
      const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
      const t_index = Array.from(
        { length: grid_size },
        (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
      );
      const h_index = Array.from(
        { length: grid_size },
        (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
      );
      const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
      llm_pos_ids_list.push([t_index, h_index, w_index].flat());
      st2 = ed + grid_size;
    }
    if (st2 < ids.length) {
      // Trailing text after the last vision segment.
      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
      const text_len = ids.length - st2;
      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
    }
    return llm_pos_ids_list;
  }

  /**
   * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
   *
   * Explanation:
   *     Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
   *
   *     For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
   *     Examples:
   *         input_ids: [T T T T T], here T is for text.
   *         temporal position_ids: [0, 1, 2, 3, 4]
   *         height position_ids: [0, 1, 2, 3, 4]
   *         width position_ids: [0, 1, 2, 3, 4]
   *
   *     For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
   *     and 1D rotary position embedding for text part.
   *     Examples:
   *         Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
   *         input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
   *         vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
   *         vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
   *         vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
   *         text temporal position_ids: [3, 4, 5, 6, 7]
   *         text height position_ids: [3, 4, 5, 6, 7]
   *         text width position_ids: [3, 4, 5, 6, 7]
   *         Here we calculate the text start position_ids as the max vision position_ids plus 1.
   *
   * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
   * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
   * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
   * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
   * @returns {[Tensor, Tensor]} [position_ids `(3, batch_size, sequence_length)`, mrope_position_deltas `(batch_size, 1)`]
   */
  get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
    const { vision_config } = this.config;
    const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
    if (image_grid_thw || video_grid_thw) {
      const total_input_ids = input_ids.tolist();
      if (!attention_mask) {
        attention_mask = ones_like(input_ids);
      }
      const attention_mask_list = attention_mask.tolist();
      // [3][batch][seq] buffer, zero-filled; masked slots are never overwritten.
      const position_ids_list = Array.from(
        { length: 3 },
        () => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
      );
      const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
      const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
      // Grids are consumed in order across the whole batch, so the counters are shared.
      const state = { image_index: 0, video_index: 0 };
      const mrope_position_deltas = [];
      for (let i = 0; i < total_input_ids.length; ++i) {
        const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
        const llm_pos_ids_list = this._get_multimodal_rope_positions({
          filtered_ids,
          image_grid_thw_list,
          video_grid_thw_list,
          spatial_merge_size,
          state
        });
        const llm_positions = this._reorder_and_write_positions(
          llm_pos_ids_list,
          attention_mask_list[i],
          position_ids_list,
          i
        );
        mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
      }
      return [
        new Tensor3("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
        new Tensor3("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
      ];
    } else {
      return this._get_text_only_rope_index(input_ids, attention_mask);
    }
  }

  /**
   * Run the `vision_encoder` session and return its `image_features` output.
   * The grid tensor is passed under the input name stored in `image_grid_thw_name`.
   *
   * @param {object} inputs
   * @param {Tensor} inputs.pixel_values
   * @param {Tensor} inputs.image_grid_thw
   * @returns {Promise<Tensor>} The `image_features` output of the vision encoder session.
   */
  async encode_image({ pixel_values, image_grid_thw }) {
    const features = (await sessionRun(this.sessions["vision_encoder"], {
      pixel_values,
      [this.image_grid_thw_name]: image_grid_thw
    })).image_features;
    return features;
  }

  /**
   * Delegate to `default_merge_input_ids_with_image_features`, supplying this model's
   * `image_token_id` (caller-provided keys in `kwargs` take precedence).
   *
   * @param {object} kwargs Arguments forwarded to the default merge helper.
   * @returns {*} Whatever the default merge helper returns.
   */
  _merge_input_ids_with_image_features(kwargs) {
    return default_merge_input_ids_with_image_features({
      // @ts-ignore
      image_token_id: this.config.image_token_id,
      ...kwargs
    });
  }

  /**
   * Fill in `position_ids` (and `rope_deltas`) on `model_inputs` when an attention mask
   * is present and no position IDs were supplied. Three cases:
   *  1. No cache: compute positions for the full input.
   *  2. Cache shorter than the input: compute full positions, then keep only the
   *     part of `position_ids` / `input_ids` that lies beyond the cached length.
   *  3. Otherwise: position = cached length + rope delta, replicated for the 3 rope dims.
   * In cases 2 and 3 `pixel_values` is cleared.
   *
   * @param {*} input_ids Unused by this implementation.
   * @param {object} model_inputs Model inputs; mutated in place.
   * @param {*} generation_config Unused by this implementation.
   * @returns {object} The same `model_inputs` object.
   */
  prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
    if (model_inputs.attention_mask && !model_inputs.position_ids) {
      if (!model_inputs.past_key_values) {
        [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
          model_inputs.input_ids,
          model_inputs.image_grid_thw,
          model_inputs.video_grid_thw,
          model_inputs.attention_mask
        );
      } else {
        model_inputs.pixel_values = null;
        const past_length = model_inputs.past_key_values.get_seq_length();
        if (past_length < model_inputs.input_ids.dims[1]) {
          const [full_position_ids, rope_deltas] = this.get_rope_index(
            model_inputs.input_ids,
            model_inputs.image_grid_thw,
            model_inputs.video_grid_thw,
            model_inputs.attention_mask
          );
          model_inputs.rope_deltas = rope_deltas;
          model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
          model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
        } else {
          if (!model_inputs.rope_deltas) {
            [, model_inputs.rope_deltas] = this.get_rope_index(
              model_inputs.input_ids,
              model_inputs.image_grid_thw,
              model_inputs.video_grid_thw,
              model_inputs.attention_mask
            );
          }
          const delta = BigInt(past_length);
          const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
          model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
        }
      }
    }
    return model_inputs;
  }
};
|
|
27539
|
+
/**
 * Causal-LM variant of Qwen2-VL. Declares no members of its own; everything is
 * inherited from `Qwen2VLForConditionalGeneration`.
 */
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {};
|
|
27541
|
+
|
|
27542
|
+
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
27543
|
+
/**
 * Qwen2.5-VL conditional-generation model. Differs from its Qwen2-VL parent only
 * in the value of `image_grid_thw_name`.
 */
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
  /** Overrides the inherited `image_grid_thw_name` field. */
  image_grid_thw_name = "image_grid_thw";
};

/**
 * Qwen2.5-VL causal-LM variant. Differs from `Qwen2VLForCausalLM` only in the
 * value of `image_grid_thw_name`.
 */
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
  /** Overrides the inherited `image_grid_thw_name` field. */
  image_grid_thw_name = "image_grid_thw";
};
|
|
27549
|
+
|
|
27550
|
+
// src/models/glm_ocr/modeling_glm_ocr.js
|
|
27551
|
+
/**
 * GLM-OCR conditional-generation model. Reuses the Qwen2.5-VL implementation and
 * overrides only how multimodal rope position segments are built.
 */
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
  /**
   * Compute 3D positional indices for vision tokens.
   * Temporal is constant, height is repeat-interleaved, width tiles.
   *
   * @param {number} start_position Position assigned to the first row/column of the grid.
   * @param {number[]} grid_thw [T, H, W]
   * @param {number} temp_merge_size Divisor applied (with flooring) to T.
   * @param {number} spatial_merge_size Divisor applied (with flooring) to H and W.
   * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
   */
  get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
    const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
    const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
    const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
    const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
    const t_pos = Array.from({ length: seq_len }, () => start_position);
    const h_pos = Array.from(
      { length: seq_len },
      // Each height index is repeated for llm_grid_w * llm_grid_t consecutive tokens.
      (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
    );
    const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
    return [...t_pos, ...h_pos, ...w_pos];
  }

  /**
   * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
   * instead of vision_start_token_id scanning used by Qwen2VL.
   * After a vision segment, position advances by floor(max(h, w) / spatial_merge_size).
   *
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - accepted for signature parity with the parent; not read here
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
   * @throws {Error} If there are more image-token groups than entries in `image_grid_thw_list`.
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id } = this.config;
    // Split the sequence into maximal runs of [type, start, end): 1 = image tokens, 0 = anything else.
    const groups = [];
    let group_start = 0;
    // Loose equality on purpose: token IDs may be BigInt while the config ID is a number.
    let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
    for (let j = 1; j <= filtered_ids.length; ++j) {
      // -1 is an end-of-sequence sentinel that forces the last run to be flushed.
      const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
      if (t !== current_type) {
        groups.push([current_type, group_start, j]);
        group_start = j;
        current_type = t;
      }
    }
    let current_pos = 0;
    const llm_pos_ids_list = [];
    for (const [modality_type, start_idx, end_idx] of groups) {
      if (modality_type === 0) {
        const text_len = end_idx - start_idx;
        llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
        current_pos += text_len;
      } else {
        const raw_grid = image_grid_thw_list[state.image_index];
        if (raw_grid === undefined) {
          throw new Error(
            `Found more image token groups than entries in image_grid_thw (missing index ${state.image_index}).`
          );
        }
        ++state.image_index;
        const grid_thw = raw_grid.map(Number);
        // Using T itself as the temporal merge size collapses the temporal axis to 1.
        const temp_merge_size = grid_thw[0];
        llm_pos_ids_list.push(
          this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
        );
        // Floor so positions stay integral when the grid is not a multiple of the merge
        // size, consistent with the flooring in get_vision_position_ids.
        current_pos += Math.floor(Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size);
      }
    }
    return llm_pos_ids_list;
  }
};
|
|
27617
|
+
|
|
27137
27618
|
// src/models/glpn/modeling_glpn.js
|
|
27138
27619
|
var GLPNPreTrainedModel = class extends PreTrainedModel {
|
|
27139
27620
|
};
|
|
@@ -27446,6 +27927,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
|
|
|
27446
27927
|
var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
|
|
27447
27928
|
};
|
|
27448
27929
|
|
|
27930
|
+
// src/models/lighton_ocr/modeling_lighton_ocr.js
|
|
27931
|
+
/**
 * LightOnOcr conditional-generation model. Declares no members of its own;
 * everything is inherited from `LlavaForConditionalGeneration`.
 */
var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {};
|
|
27933
|
+
|
|
27449
27934
|
// src/models/lfm2_moe/modeling_lfm2_moe.js
|
|
27450
27935
|
var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
|
|
27451
27936
|
};
|
|
@@ -27642,6 +28127,14 @@ var MistralModel = class extends MistralPreTrainedModel {
|
|
|
27642
28127
|
var MistralForCausalLM = class extends MistralPreTrainedModel {
|
|
27643
28128
|
};
|
|
27644
28129
|
|
|
28130
|
+
// src/models/mistral4/modeling_mistral4.js
|
|
28131
|
+
/**
 * Base class for the Mistral4 family. Adds nothing on top of `PreTrainedModel`.
 */
var Mistral4PreTrainedModel = class extends PreTrainedModel {};

/** Bare Mistral4 model; an empty subclass of `Mistral4PreTrainedModel`. */
var Mistral4Model = class extends Mistral4PreTrainedModel {};

/** Mistral4 causal-LM variant; an empty subclass of `Mistral4PreTrainedModel`. */
var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {};
|
|
28137
|
+
|
|
27645
28138
|
// src/models/mobilebert/modeling_mobilebert.js
|
|
27646
28139
|
var MobileBertPreTrainedModel = class extends PreTrainedModel {
|
|
27647
28140
|
};
|
|
@@ -28110,6 +28603,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
|
|
|
28110
28603
|
var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
|
|
28111
28604
|
};
|
|
28112
28605
|
|
|
28606
|
+
// src/models/nemotron_h/modeling_nemotron_h.js
|
|
28607
|
+
/**
 * Base class for the NemotronH family. Adds nothing on top of `PreTrainedModel`.
 */
var NemotronHPreTrainedModel = class extends PreTrainedModel {};

/** Bare NemotronH model; an empty subclass of `NemotronHPreTrainedModel`. */
var NemotronHModel = class extends NemotronHPreTrainedModel {};

/** NemotronH causal-LM variant; an empty subclass of `NemotronHPreTrainedModel`. */
var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {};
|
|
28613
|
+
|
|
28113
28614
|
// src/models/neobert/modeling_neobert.js
|
|
28114
28615
|
var NeoBertPreTrainedModel = class extends PreTrainedModel {
|
|
28115
28616
|
};
|
|
@@ -28390,252 +28891,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
|
|
|
28390
28891
|
var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
|
|
28391
28892
|
};
|
|
28392
28893
|
|
|
28393
|
-
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
28394
|
-
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
28395
|
-
forward_params = [
|
|
28396
|
-
// Text inputs
|
|
28397
|
-
"input_ids",
|
|
28398
|
-
"attention_mask",
|
|
28399
|
-
"position_ids",
|
|
28400
|
-
"past_key_values",
|
|
28401
|
-
// Vision inputs
|
|
28402
|
-
"pixel_values",
|
|
28403
|
-
"image_grid_thw"
|
|
28404
|
-
];
|
|
28405
|
-
};
|
|
28406
|
-
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
28407
|
-
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
28408
|
-
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
28409
|
-
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
28410
|
-
image_grid_thw_name = "grid_thw";
|
|
28411
|
-
/**
|
|
28412
|
-
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
28413
|
-
*
|
|
28414
|
-
* Explanation:
|
|
28415
|
-
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
28416
|
-
*
|
|
28417
|
-
* For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
|
|
28418
|
-
* Examples:
|
|
28419
|
-
* input_ids: [T T T T T], here T is for text.
|
|
28420
|
-
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
28421
|
-
* height position_ids: [0, 1, 2, 3, 4]
|
|
28422
|
-
* width position_ids: [0, 1, 2, 3, 4]
|
|
28423
|
-
*
|
|
28424
|
-
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
28425
|
-
* and 1D rotary position embeddin for text part.
|
|
28426
|
-
* Examples:
|
|
28427
|
-
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
28428
|
-
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
28429
|
-
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
28430
|
-
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
28431
|
-
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
28432
|
-
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
28433
|
-
* text height position_ids: [3, 4, 5, 6, 7]
|
|
28434
|
-
* text width position_ids: [3, 4, 5, 6, 7]
|
|
28435
|
-
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
28436
|
-
*
|
|
28437
|
-
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
28438
|
-
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
28439
|
-
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
28440
|
-
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
|
|
28441
|
-
* - 1 for tokens that are **not masked**,
|
|
28442
|
-
* - 0 for tokens that are **masked**.
|
|
28443
|
-
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
|
|
28444
|
-
* - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
|
|
28445
|
-
* - mrope_position_deltas: Tensor of shape `(batch_size)`.
|
|
28446
|
-
*/
|
|
28447
|
-
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
28448
|
-
const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
28449
|
-
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
28450
|
-
const mrope_position_deltas = [];
|
|
28451
|
-
if (image_grid_thw || video_grid_thw) {
|
|
28452
|
-
let total_input_ids = input_ids.tolist();
|
|
28453
|
-
if (!attention_mask) {
|
|
28454
|
-
attention_mask = ones_like(input_ids);
|
|
28455
|
-
}
|
|
28456
|
-
const attention_mask_list = attention_mask.tolist();
|
|
28457
|
-
const position_ids_list = Array.from(
|
|
28458
|
-
{ length: 3 },
|
|
28459
|
-
(_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
|
|
28460
|
-
);
|
|
28461
|
-
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
28462
|
-
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
28463
|
-
let image_index = 0;
|
|
28464
|
-
let video_index = 0;
|
|
28465
|
-
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
28466
|
-
const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
28467
|
-
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
28468
|
-
if (x == vision_start_token_id) acc.push(idx);
|
|
28469
|
-
return acc;
|
|
28470
|
-
}, []);
|
|
28471
|
-
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
28472
|
-
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
28473
|
-
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
28474
|
-
let llm_pos_ids_list = [];
|
|
28475
|
-
let st2 = 0;
|
|
28476
|
-
let remain_images = image_nums;
|
|
28477
|
-
let remain_videos = video_nums;
|
|
28478
|
-
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
28479
|
-
const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
|
|
28480
|
-
const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
|
|
28481
|
-
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
28482
|
-
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
28483
|
-
let ed;
|
|
28484
|
-
let t, h, w;
|
|
28485
|
-
if (ed_image < ed_video) {
|
|
28486
|
-
[t, h, w] = image_grid_thw_list[image_index];
|
|
28487
|
-
++image_index;
|
|
28488
|
-
--remain_images;
|
|
28489
|
-
ed = ed_image;
|
|
28490
|
-
} else {
|
|
28491
|
-
[t, h, w] = video_grid_thw_list[video_index];
|
|
28492
|
-
++video_index;
|
|
28493
|
-
--remain_videos;
|
|
28494
|
-
ed = ed_video;
|
|
28495
|
-
}
|
|
28496
|
-
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
28497
|
-
Number(t),
|
|
28498
|
-
Math.floor(Number(h) / spatial_merge_size),
|
|
28499
|
-
Math.floor(Number(w) / spatial_merge_size)
|
|
28500
|
-
];
|
|
28501
|
-
const text_len = ed - st2;
|
|
28502
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
28503
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
28504
|
-
const offset = text_len + st_idx;
|
|
28505
|
-
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
28506
|
-
const t_index = Array.from(
|
|
28507
|
-
{ length: grid_size },
|
|
28508
|
-
(_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
|
|
28509
|
-
);
|
|
28510
|
-
const h_index = Array.from(
|
|
28511
|
-
{ length: grid_size },
|
|
28512
|
-
(_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
|
|
28513
|
-
);
|
|
28514
|
-
const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
|
|
28515
|
-
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
28516
|
-
st2 = ed + grid_size;
|
|
28517
|
-
}
|
|
28518
|
-
if (st2 < ids.length) {
|
|
28519
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
28520
|
-
const text_len = ids.length - st2;
|
|
28521
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
28522
|
-
}
|
|
28523
|
-
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
28524
|
-
const llm_positions = new Array(num_items);
|
|
28525
|
-
let index = 0;
|
|
28526
|
-
for (let x = 0; x < 3; ++x) {
|
|
28527
|
-
for (let y = 0; y < llm_pos_ids_list.length; ++y) {
|
|
28528
|
-
const val = llm_pos_ids_list[y];
|
|
28529
|
-
const text_len = val.length / 3;
|
|
28530
|
-
for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
|
|
28531
|
-
llm_positions[index++] = val[z];
|
|
28532
|
-
}
|
|
28533
|
-
}
|
|
28534
|
-
}
|
|
28535
|
-
let count2 = 0;
|
|
28536
|
-
const attn_mask = attention_mask_list[i];
|
|
28537
|
-
for (let y = 0; y < attn_mask.length; ++y) {
|
|
28538
|
-
if (attn_mask[y] == 1) {
|
|
28539
|
-
for (let x = 0; x < 3; ++x) {
|
|
28540
|
-
position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
|
|
28541
|
-
}
|
|
28542
|
-
++count2;
|
|
28543
|
-
}
|
|
28544
|
-
}
|
|
28545
|
-
const max_llm_positions = max(llm_positions)[0];
|
|
28546
|
-
mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
|
|
28547
|
-
}
|
|
28548
|
-
return [
|
|
28549
|
-
new Tensor3("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
28550
|
-
new Tensor3("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
28551
|
-
];
|
|
28552
|
-
} else {
|
|
28553
|
-
if (attention_mask) {
|
|
28554
|
-
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
28555
|
-
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
28556
|
-
const mrope_position_deltas2 = Array.from(
|
|
28557
|
-
{ length: dims[0] },
|
|
28558
|
-
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
28559
|
-
);
|
|
28560
|
-
return [
|
|
28561
|
-
new Tensor3("int64", position_ids, [3, ...dims]),
|
|
28562
|
-
new Tensor3("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
|
|
28563
|
-
];
|
|
28564
|
-
} else {
|
|
28565
|
-
const [batch_size, seq_length] = input_ids.dims;
|
|
28566
|
-
const position_ids = BigInt64Array.from(
|
|
28567
|
-
{ length: 3 * batch_size * seq_length },
|
|
28568
|
-
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
28569
|
-
);
|
|
28570
|
-
return [new Tensor3("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
28571
|
-
}
|
|
28572
|
-
}
|
|
28573
|
-
}
|
|
28574
|
-
async encode_image({ pixel_values, image_grid_thw }) {
|
|
28575
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
28576
|
-
pixel_values,
|
|
28577
|
-
[this.image_grid_thw_name]: image_grid_thw
|
|
28578
|
-
})).image_features;
|
|
28579
|
-
return features;
|
|
28580
|
-
}
|
|
28581
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
28582
|
-
return default_merge_input_ids_with_image_features({
|
|
28583
|
-
// @ts-ignore
|
|
28584
|
-
image_token_id: this.config.image_token_id,
|
|
28585
|
-
...kwargs
|
|
28586
|
-
});
|
|
28587
|
-
}
|
|
28588
|
-
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
28589
|
-
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
28590
|
-
if (!model_inputs.past_key_values) {
|
|
28591
|
-
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
28592
|
-
model_inputs.input_ids,
|
|
28593
|
-
model_inputs.image_grid_thw,
|
|
28594
|
-
model_inputs.video_grid_thw,
|
|
28595
|
-
model_inputs.attention_mask
|
|
28596
|
-
);
|
|
28597
|
-
} else {
|
|
28598
|
-
model_inputs.pixel_values = null;
|
|
28599
|
-
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
28600
|
-
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
28601
|
-
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
28602
|
-
model_inputs.input_ids,
|
|
28603
|
-
model_inputs.image_grid_thw,
|
|
28604
|
-
model_inputs.video_grid_thw,
|
|
28605
|
-
model_inputs.attention_mask
|
|
28606
|
-
);
|
|
28607
|
-
model_inputs.rope_deltas = rope_deltas;
|
|
28608
|
-
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
28609
|
-
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
28610
|
-
} else {
|
|
28611
|
-
if (!model_inputs.rope_deltas) {
|
|
28612
|
-
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
28613
|
-
model_inputs.input_ids,
|
|
28614
|
-
model_inputs.image_grid_thw,
|
|
28615
|
-
model_inputs.video_grid_thw,
|
|
28616
|
-
model_inputs.attention_mask
|
|
28617
|
-
);
|
|
28618
|
-
}
|
|
28619
|
-
const delta = BigInt(past_length);
|
|
28620
|
-
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
28621
|
-
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
28622
|
-
}
|
|
28623
|
-
}
|
|
28624
|
-
}
|
|
28625
|
-
return model_inputs;
|
|
28626
|
-
}
|
|
28627
|
-
};
|
|
28628
|
-
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
28629
|
-
};
|
|
28630
|
-
|
|
28631
|
-
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
28632
|
-
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
28633
|
-
image_grid_thw_name = "image_grid_thw";
|
|
28634
|
-
};
|
|
28635
|
-
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
28636
|
-
image_grid_thw_name = "image_grid_thw";
|
|
28637
|
-
};
|
|
28638
|
-
|
|
28639
28894
|
// src/models/qwen3/modeling_qwen3.js
|
|
28640
28895
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
28641
28896
|
};
|
|
@@ -29081,6 +29336,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
|
|
|
29081
29336
|
}
|
|
29082
29337
|
};
|
|
29083
29338
|
|
|
29339
|
+
// src/models/solar_open/modeling_solar_open.js
|
|
29340
|
+
var SolarOpenPreTrainedModel = class extends PreTrainedModel {
|
|
29341
|
+
};
|
|
29342
|
+
var SolarOpenModel = class extends SolarOpenPreTrainedModel {
|
|
29343
|
+
};
|
|
29344
|
+
var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
|
|
29345
|
+
};
|
|
29346
|
+
|
|
29084
29347
|
// src/models/speecht5/modeling_speecht5.js
|
|
29085
29348
|
var SpeechT5PreTrainedModel = class extends PreTrainedModel {
|
|
29086
29349
|
};
|
|
@@ -30197,6 +30460,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
|
|
|
30197
30460
|
// src/models/registry.js
|
|
30198
30461
|
var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
|
|
30199
30462
|
["bert", "BertModel"],
|
|
30463
|
+
["eurobert", "EuroBertModel"],
|
|
30200
30464
|
["neobert", "NeoBertModel"],
|
|
30201
30465
|
["modernbert", "ModernBertModel"],
|
|
30202
30466
|
["nomic_bert", "NomicBertModel"],
|
|
@@ -30328,6 +30592,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
30328
30592
|
["gemma3_text", "Gemma3Model"],
|
|
30329
30593
|
["helium", "HeliumModel"],
|
|
30330
30594
|
["glm", "GlmModel"],
|
|
30595
|
+
["glm_moe_dsa", "GlmMoeDsaModel"],
|
|
30331
30596
|
["openelm", "OpenELMModel"],
|
|
30332
30597
|
["qwen2", "Qwen2Model"],
|
|
30333
30598
|
["qwen2_moe", "Qwen2MoeModel"],
|
|
@@ -30339,12 +30604,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
30339
30604
|
["mpt", "MptModel"],
|
|
30340
30605
|
["opt", "OPTModel"],
|
|
30341
30606
|
["mistral", "MistralModel"],
|
|
30607
|
+
["mistral4", "Mistral4Model"],
|
|
30342
30608
|
["ministral", "MinistralModel"],
|
|
30343
30609
|
["ministral3", "Ministral3Model"],
|
|
30344
30610
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
30345
30611
|
["starcoder2", "Starcoder2Model"],
|
|
30612
|
+
["deepseek_v3", "DeepseekV3Model"],
|
|
30346
30613
|
["falcon", "FalconModel"],
|
|
30347
30614
|
["falcon_h1", "FalconH1Model"],
|
|
30615
|
+
["nemotron_h", "NemotronHModel"],
|
|
30616
|
+
["solar_open", "SolarOpenModel"],
|
|
30348
30617
|
["stablelm", "StableLmModel"],
|
|
30349
30618
|
["modernbert-decoder", "ModernBertDecoderModel"],
|
|
30350
30619
|
["hunyuan_v1_dense", "HunYuanDenseV1Model"],
|
|
@@ -30364,6 +30633,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30364
30633
|
]);
|
|
30365
30634
|
var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30366
30635
|
["bert", "BertForSequenceClassification"],
|
|
30636
|
+
["eurobert", "EuroBertForSequenceClassification"],
|
|
30367
30637
|
["neobert", "NeoBertForSequenceClassification"],
|
|
30368
30638
|
["modernbert", "ModernBertForSequenceClassification"],
|
|
30369
30639
|
["roformer", "RoFormerForSequenceClassification"],
|
|
@@ -30386,6 +30656,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30386
30656
|
]);
|
|
30387
30657
|
var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30388
30658
|
["bert", "BertForTokenClassification"],
|
|
30659
|
+
["eurobert", "EuroBertForTokenClassification"],
|
|
30389
30660
|
["neobert", "NeoBertForTokenClassification"],
|
|
30390
30661
|
["modernbert", "ModernBertForTokenClassification"],
|
|
30391
30662
|
["roformer", "RoFormerForTokenClassification"],
|
|
@@ -30448,6 +30719,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30448
30719
|
["gemma3", "Gemma3ForCausalLM"],
|
|
30449
30720
|
["helium", "HeliumForCausalLM"],
|
|
30450
30721
|
["glm", "GlmForCausalLM"],
|
|
30722
|
+
["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
|
|
30451
30723
|
["openelm", "OpenELMForCausalLM"],
|
|
30452
30724
|
["qwen2", "Qwen2ForCausalLM"],
|
|
30453
30725
|
["qwen2_moe", "Qwen2MoeForCausalLM"],
|
|
@@ -30467,13 +30739,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30467
30739
|
["opt", "OPTForCausalLM"],
|
|
30468
30740
|
["mbart", "MBartForCausalLM"],
|
|
30469
30741
|
["mistral", "MistralForCausalLM"],
|
|
30742
|
+
["mistral4", "Mistral4ForCausalLM"],
|
|
30470
30743
|
["ministral", "MinistralForCausalLM"],
|
|
30471
30744
|
["ministral3", "Ministral3ForCausalLM"],
|
|
30472
30745
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
30473
30746
|
["starcoder2", "Starcoder2ForCausalLM"],
|
|
30747
|
+
["deepseek_v3", "DeepseekV3ForCausalLM"],
|
|
30474
30748
|
["falcon", "FalconForCausalLM"],
|
|
30475
30749
|
["falcon_h1", "FalconH1ForCausalLM"],
|
|
30750
|
+
["nemotron_h", "NemotronHForCausalLM"],
|
|
30476
30751
|
["trocr", "TrOCRForCausalLM"],
|
|
30752
|
+
["solar_open", "SolarOpenForCausalLM"],
|
|
30477
30753
|
["stablelm", "StableLmForCausalLM"],
|
|
30478
30754
|
["modernbert-decoder", "ModernBertDecoderForCausalLM"],
|
|
30479
30755
|
["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
|
|
@@ -30484,6 +30760,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30484
30760
|
var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
|
|
30485
30761
|
var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30486
30762
|
["bert", "BertForMaskedLM"],
|
|
30763
|
+
["eurobert", "EuroBertForMaskedLM"],
|
|
30487
30764
|
["neobert", "NeoBertForMaskedLM"],
|
|
30488
30765
|
["modernbert", "ModernBertForMaskedLM"],
|
|
30489
30766
|
["roformer", "RoFormerForMaskedLM"],
|
|
@@ -30542,7 +30819,9 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30542
30819
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
30543
30820
|
["llava_qwen2", "LlavaQwen2ForCausalLM"],
|
|
30544
30821
|
["gemma3n", "Gemma3nForConditionalGeneration"],
|
|
30545
|
-
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
30822
|
+
["mistral3", "Mistral3ForConditionalGeneration"],
|
|
30823
|
+
["lighton_ocr", "LightOnOcrForConditionalGeneration"],
|
|
30824
|
+
["glm_ocr", "GlmOcrForConditionalGeneration"]
|
|
30546
30825
|
]);
|
|
30547
30826
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30548
30827
|
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
@@ -30647,6 +30926,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30647
30926
|
]);
|
|
30648
30927
|
var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
|
|
30649
30928
|
var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30929
|
+
["chmv2", "CHMv2ForDepthEstimation"],
|
|
30650
30930
|
["dpt", "DPTForDepthEstimation"],
|
|
30651
30931
|
["depth_anything", "DepthAnythingForDepthEstimation"],
|
|
30652
30932
|
["glpn", "GLPNForDepthEstimation"],
|
|
@@ -33385,6 +33665,9 @@ export {
|
|
|
33385
33665
|
BloomModel,
|
|
33386
33666
|
BloomPreTrainedModel,
|
|
33387
33667
|
BloomTokenizer,
|
|
33668
|
+
CHMv2ForDepthEstimation,
|
|
33669
|
+
CHMv2ImageProcessor,
|
|
33670
|
+
CHMv2PreTrainedModel,
|
|
33388
33671
|
CLIPFeatureExtractor,
|
|
33389
33672
|
CLIPImageProcessor,
|
|
33390
33673
|
CLIPModel,
|
|
@@ -33480,6 +33763,9 @@ export {
|
|
|
33480
33763
|
DebertaV2Tokenizer,
|
|
33481
33764
|
DecisionTransformerModel,
|
|
33482
33765
|
DecisionTransformerPreTrainedModel,
|
|
33766
|
+
DeepseekV3ForCausalLM,
|
|
33767
|
+
DeepseekV3Model,
|
|
33768
|
+
DeepseekV3PreTrainedModel,
|
|
33483
33769
|
DeiTFeatureExtractor,
|
|
33484
33770
|
DeiTForImageClassification,
|
|
33485
33771
|
DeiTImageProcessor,
|
|
@@ -33540,6 +33826,11 @@ export {
|
|
|
33540
33826
|
EsmModel,
|
|
33541
33827
|
EsmPreTrainedModel,
|
|
33542
33828
|
EsmTokenizer,
|
|
33829
|
+
EuroBertForMaskedLM,
|
|
33830
|
+
EuroBertForSequenceClassification,
|
|
33831
|
+
EuroBertForTokenClassification,
|
|
33832
|
+
EuroBertModel,
|
|
33833
|
+
EuroBertPreTrainedModel,
|
|
33543
33834
|
ExaoneForCausalLM,
|
|
33544
33835
|
ExaoneModel,
|
|
33545
33836
|
ExaonePreTrainedModel,
|
|
@@ -33597,8 +33888,14 @@ export {
|
|
|
33597
33888
|
GemmaModel,
|
|
33598
33889
|
GemmaPreTrainedModel,
|
|
33599
33890
|
GemmaTokenizer,
|
|
33891
|
+
Glm46VImageProcessor,
|
|
33892
|
+
Glm46VProcessor,
|
|
33600
33893
|
GlmForCausalLM,
|
|
33601
33894
|
GlmModel,
|
|
33895
|
+
GlmMoeDsaForCausalLM,
|
|
33896
|
+
GlmMoeDsaModel,
|
|
33897
|
+
GlmMoeDsaPreTrainedModel,
|
|
33898
|
+
GlmOcrForConditionalGeneration,
|
|
33602
33899
|
GlmPreTrainedModel,
|
|
33603
33900
|
GptOssForCausalLM,
|
|
33604
33901
|
GptOssModel,
|
|
@@ -33664,6 +33961,7 @@ export {
|
|
|
33664
33961
|
Lfm2VlForConditionalGeneration,
|
|
33665
33962
|
Lfm2VlImageProcessor,
|
|
33666
33963
|
Lfm2VlProcessor,
|
|
33964
|
+
LightOnOcrForConditionalGeneration,
|
|
33667
33965
|
LiteWhisperForConditionalGeneration,
|
|
33668
33966
|
Llama4ForCausalLM,
|
|
33669
33967
|
Llama4PreTrainedModel,
|
|
@@ -33733,6 +34031,9 @@ export {
|
|
|
33733
34031
|
MimiPreTrainedModel,
|
|
33734
34032
|
MinLengthLogitsProcessor,
|
|
33735
34033
|
MinNewTokensLengthLogitsProcessor,
|
|
34034
|
+
Mistral4ForCausalLM,
|
|
34035
|
+
Mistral4Model,
|
|
34036
|
+
Mistral4PreTrainedModel,
|
|
33736
34037
|
MistralForCausalLM,
|
|
33737
34038
|
MistralModel,
|
|
33738
34039
|
MistralPreTrainedModel,
|
|
@@ -33804,6 +34105,9 @@ export {
|
|
|
33804
34105
|
NanoChatForCausalLM,
|
|
33805
34106
|
NanoChatModel,
|
|
33806
34107
|
NanoChatPreTrainedModel,
|
|
34108
|
+
NemotronHForCausalLM,
|
|
34109
|
+
NemotronHModel,
|
|
34110
|
+
NemotronHPreTrainedModel,
|
|
33807
34111
|
NeoBertForMaskedLM,
|
|
33808
34112
|
NeoBertForQuestionAnswering,
|
|
33809
34113
|
NeoBertForSequenceClassification,
|
|
@@ -33993,6 +34297,9 @@ export {
|
|
|
33993
34297
|
SnacFeatureExtractor,
|
|
33994
34298
|
SnacModel,
|
|
33995
34299
|
SnacPreTrainedModel,
|
|
34300
|
+
SolarOpenForCausalLM,
|
|
34301
|
+
SolarOpenModel,
|
|
34302
|
+
SolarOpenPreTrainedModel,
|
|
33996
34303
|
SpeechT5FeatureExtractor,
|
|
33997
34304
|
SpeechT5ForSpeechToText,
|
|
33998
34305
|
SpeechT5ForTextToSpeech,
|