@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/transformers.js +689 -382
- package/dist/transformers.min.js +19 -19
- package/dist/transformers.node.cjs +716 -382
- package/dist/transformers.node.min.cjs +19 -19
- package/dist/transformers.node.min.mjs +19 -19
- package/dist/transformers.node.mjs +689 -382
- package/dist/transformers.web.js +697 -390
- package/dist/transformers.web.min.js +17 -17
- package/package.json +2 -2
- package/src/configs.js +28 -22
- package/src/env.js +1 -1
- package/src/image_processors_utils.js +25 -15
- package/src/models/chmv2/image_processing_chmv2.js +3 -0
- package/src/models/chmv2/modeling_chmv2.js +4 -0
- package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
- package/src/models/eurobert/modeling_eurobert.js +41 -0
- package/src/models/glm46v/image_processing_glm46v.js +12 -0
- package/src/models/glm46v/processing_glm46v.js +5 -0
- package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
- package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
- package/src/models/image_processors.js +2 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
- package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
- package/src/models/mistral4/modeling_mistral4.js +5 -0
- package/src/models/modeling_utils.js +2 -0
- package/src/models/models.js +10 -1
- package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
- package/src/models/processors.js +1 -0
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +187 -136
- package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
- package/src/models/registry.js +17 -0
- package/src/models/solar_open/modeling_solar_open.js +5 -0
- package/src/pipelines.js +1 -0
- package/src/utils/hub.js +4 -1
- package/src/utils/model_registry/get_file_metadata.js +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +3 -2
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
- package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
- package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
- package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
- package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
- package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
- package/types/models/glm46v/processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
- package/types/models/image_processors.d.ts +2 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
- package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
- package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +10 -1
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
- package/types/models/processors.d.ts +1 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
- package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
- package/types/pipelines.d.ts +1 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/models/ast/modeling_ast.d.ts.map +0 -1
- /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
|
@@ -14,7 +14,7 @@ var __export = (target, all) => {
|
|
|
14
14
|
import fs from "fs";
|
|
15
15
|
import path from "path";
|
|
16
16
|
import url from "url";
|
|
17
|
-
var VERSION = "4.0.0-next.7";
|
|
17
|
+
var VERSION = "4.0.0-next.8";
|
|
18
18
|
var HAS_SELF = typeof self !== "undefined";
|
|
19
19
|
var IS_FS_AVAILABLE = !isEmpty(fs);
|
|
20
20
|
var IS_PATH_AVAILABLE = !isEmpty(path);
|
|
@@ -244,7 +244,7 @@ var logger = {
|
|
|
244
244
|
}
|
|
245
245
|
};
|
|
246
246
|
|
|
247
|
-
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.
|
|
247
|
+
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
|
|
248
248
|
var DictionarySplitter = class {
|
|
249
249
|
/**
|
|
250
250
|
* @param dictionary The dictionary of words to use for splitting.
|
|
@@ -1900,10 +1900,10 @@ var BPE = class extends TokenizerModel_default {
|
|
|
1900
1900
|
);
|
|
1901
1901
|
if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
|
|
1902
1902
|
output_tokens.push(...byte_tokens);
|
|
1903
|
-
} else {
|
|
1903
|
+
} else if (this.unk_token != null) {
|
|
1904
1904
|
output_tokens.push(this.unk_token);
|
|
1905
1905
|
}
|
|
1906
|
-
} else {
|
|
1906
|
+
} else if (this.unk_token != null) {
|
|
1907
1907
|
output_tokens.push(this.unk_token);
|
|
1908
1908
|
}
|
|
1909
1909
|
}
|
|
@@ -6514,13 +6514,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
|
|
|
6514
6514
|
wrapped_progress
|
|
6515
6515
|
);
|
|
6516
6516
|
} else if (typeof response !== "string") {
|
|
6517
|
+
const headers = new Headers(response.headers);
|
|
6518
|
+
headers.set("content-length", result.byteLength.toString());
|
|
6517
6519
|
await cache2.put(
|
|
6518
6520
|
cacheKey,
|
|
6519
6521
|
new Response(
|
|
6520
6522
|
/** @type {any} */
|
|
6521
6523
|
result,
|
|
6522
6524
|
{
|
|
6523
|
-
headers
|
|
6525
|
+
headers
|
|
6524
6526
|
}
|
|
6525
6527
|
)
|
|
6526
6528
|
).catch((err) => {
|
|
@@ -15730,6 +15732,7 @@ __export(processors_exports, {
|
|
|
15730
15732
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
15731
15733
|
Florence2Processor: () => Florence2Processor,
|
|
15732
15734
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
15735
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
15733
15736
|
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
15734
15737
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
15735
15738
|
Idefics3Processor: () => Idefics3Processor,
|
|
@@ -18234,26 +18237,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
18234
18237
|
}
|
|
18235
18238
|
return [segmentation, segments];
|
|
18236
18239
|
}
|
|
18237
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
18240
|
+
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
|
|
18238
18241
|
if (height < factor || width < factor) {
|
|
18239
|
-
|
|
18240
|
-
|
|
18242
|
+
const scale = Math.max(factor / height, factor / width);
|
|
18243
|
+
height = Math.round(height * scale);
|
|
18244
|
+
width = Math.round(width * scale);
|
|
18245
|
+
}
|
|
18246
|
+
if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
18241
18247
|
throw new Error(
|
|
18242
18248
|
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
18243
18249
|
);
|
|
18244
18250
|
}
|
|
18245
18251
|
let h_bar = Math.round(height / factor) * factor;
|
|
18246
18252
|
let w_bar = Math.round(width / factor) * factor;
|
|
18247
|
-
if (h_bar * w_bar > max_pixels) {
|
|
18248
|
-
const beta = Math.sqrt(height * width / max_pixels);
|
|
18249
|
-
h_bar = Math.floor(height / beta / factor) * factor;
|
|
18250
|
-
w_bar = Math.floor(width / beta / factor) * factor;
|
|
18251
|
-
} else if (h_bar * w_bar < min_pixels) {
|
|
18252
|
-
const beta = Math.sqrt(min_pixels / (height * width));
|
|
18253
|
+
if (temporal_factor * h_bar * w_bar > max_pixels) {
|
|
18254
|
+
const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
|
|
18255
|
+
h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
|
|
18256
|
+
w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
|
|
18257
|
+
} else if (temporal_factor * h_bar * w_bar < min_pixels) {
|
|
18258
|
+
const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
|
|
18253
18259
|
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
18254
18260
|
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
18255
18261
|
}
|
|
18256
|
-
return [h_bar, w_bar];
|
|
18262
|
+
return [w_bar, h_bar];
|
|
18257
18263
|
}
|
|
18258
18264
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
18259
18265
|
if (label_ids_to_fuse === null) {
|
|
@@ -18332,7 +18338,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18332
18338
|
this.do_pad = config.do_pad;
|
|
18333
18339
|
this.min_pixels = config.min_pixels;
|
|
18334
18340
|
this.max_pixels = config.max_pixels;
|
|
18335
|
-
if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
18341
|
+
if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
18336
18342
|
this.pad_size = this.size;
|
|
18337
18343
|
}
|
|
18338
18344
|
this.do_flip_channel_order = config.do_flip_channel_order ?? false;
|
|
@@ -18620,10 +18626,8 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18620
18626
|
const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
|
|
18621
18627
|
[pixelData, imgDims] = padded;
|
|
18622
18628
|
} else if (this.size_divisibility) {
|
|
18623
|
-
const
|
|
18624
|
-
|
|
18625
|
-
this.size_divisibility
|
|
18626
|
-
);
|
|
18629
|
+
const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
|
|
18630
|
+
const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
|
|
18627
18631
|
[pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
|
|
18628
18632
|
}
|
|
18629
18633
|
}
|
|
@@ -18700,6 +18704,7 @@ var image_processors_exports = {};
|
|
|
18700
18704
|
__export(image_processors_exports, {
|
|
18701
18705
|
BeitFeatureExtractor: () => BeitFeatureExtractor,
|
|
18702
18706
|
BitImageProcessor: () => BitImageProcessor,
|
|
18707
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
18703
18708
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
18704
18709
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
18705
18710
|
ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
|
|
@@ -18716,6 +18721,7 @@ __export(image_processors_exports, {
|
|
|
18716
18721
|
DonutImageProcessor: () => DonutImageProcessor,
|
|
18717
18722
|
EfficientNetImageProcessor: () => EfficientNetImageProcessor,
|
|
18718
18723
|
GLPNFeatureExtractor: () => GLPNFeatureExtractor,
|
|
18724
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
18719
18725
|
GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
|
|
18720
18726
|
Idefics3ImageProcessor: () => Idefics3ImageProcessor,
|
|
18721
18727
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
@@ -18776,6 +18782,10 @@ var BitImageProcessor = class extends ImageProcessor {
|
|
|
18776
18782
|
var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
|
|
18777
18783
|
};
|
|
18778
18784
|
|
|
18785
|
+
// src/models/chmv2/image_processing_chmv2.js
|
|
18786
|
+
var CHMv2ImageProcessor = class extends ImageProcessor {
|
|
18787
|
+
};
|
|
18788
|
+
|
|
18779
18789
|
// src/models/clip/image_processing_clip.js
|
|
18780
18790
|
var CLIPImageProcessor = class extends ImageProcessor {
|
|
18781
18791
|
};
|
|
@@ -18895,6 +18905,65 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
|
|
|
18895
18905
|
}
|
|
18896
18906
|
};
|
|
18897
18907
|
|
|
18908
|
+
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
18909
|
+
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
18910
|
+
constructor(config) {
|
|
18911
|
+
super(config);
|
|
18912
|
+
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
18913
|
+
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
18914
|
+
this.patch_size = config.patch_size;
|
|
18915
|
+
this.merge_size = config.merge_size;
|
|
18916
|
+
}
|
|
18917
|
+
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
18918
|
+
get_resize_output_image_size(image, size) {
|
|
18919
|
+
const factor = this.patch_size * this.merge_size;
|
|
18920
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
18921
|
+
}
|
|
18922
|
+
async _call(images, ...args) {
|
|
18923
|
+
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
18924
|
+
let patches = pixel_values;
|
|
18925
|
+
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
18926
|
+
if (patches.dims[0] === 1) {
|
|
18927
|
+
patches = cat(
|
|
18928
|
+
Array.from({ length: temporal_patch_size }, () => patches),
|
|
18929
|
+
0
|
|
18930
|
+
);
|
|
18931
|
+
}
|
|
18932
|
+
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
18933
|
+
const channel = patches.dims[1];
|
|
18934
|
+
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
18935
|
+
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
18936
|
+
const flatten_patches = patches.view(
|
|
18937
|
+
grid_t,
|
|
18938
|
+
temporal_patch_size,
|
|
18939
|
+
channel,
|
|
18940
|
+
Math.floor(grid_h / merge_size),
|
|
18941
|
+
merge_size,
|
|
18942
|
+
patch_size,
|
|
18943
|
+
Math.floor(grid_w / merge_size),
|
|
18944
|
+
merge_size,
|
|
18945
|
+
patch_size
|
|
18946
|
+
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
18947
|
+
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
18948
|
+
return {
|
|
18949
|
+
pixel_values: flatten_patches,
|
|
18950
|
+
image_grid_thw,
|
|
18951
|
+
original_sizes,
|
|
18952
|
+
reshaped_input_sizes
|
|
18953
|
+
};
|
|
18954
|
+
}
|
|
18955
|
+
};
|
|
18956
|
+
|
|
18957
|
+
// src/models/glm46v/image_processing_glm46v.js
|
|
18958
|
+
var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
|
|
18959
|
+
/** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
|
|
18960
|
+
get_resize_output_image_size(image, size) {
|
|
18961
|
+
const factor = this.patch_size * this.merge_size;
|
|
18962
|
+
const temporal_factor = this.config.temporal_patch_size ?? 2;
|
|
18963
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
|
|
18964
|
+
}
|
|
18965
|
+
};
|
|
18966
|
+
|
|
18898
18967
|
// src/models/glpn/image_processing_glpn.js
|
|
18899
18968
|
var GLPNFeatureExtractor = class extends ImageProcessor {
|
|
18900
18969
|
};
|
|
@@ -19288,7 +19357,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
|
|
|
19288
19357
|
const img = pixel_values.unsqueeze_(0);
|
|
19289
19358
|
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
19290
19359
|
const f2 = total_factor ** 2;
|
|
19291
|
-
const [new_height, new_width] = smart_resize(
|
|
19360
|
+
const [new_width, new_height] = smart_resize(
|
|
19292
19361
|
Math.max(total_factor, height),
|
|
19293
19362
|
Math.max(total_factor, width),
|
|
19294
19363
|
total_factor,
|
|
@@ -19578,55 +19647,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
|
|
|
19578
19647
|
var PvtImageProcessor = class extends ImageProcessor {
|
|
19579
19648
|
};
|
|
19580
19649
|
|
|
19581
|
-
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
19582
|
-
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
19583
|
-
constructor(config) {
|
|
19584
|
-
super(config);
|
|
19585
|
-
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
19586
|
-
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
19587
|
-
this.patch_size = config.patch_size;
|
|
19588
|
-
this.merge_size = config.merge_size;
|
|
19589
|
-
}
|
|
19590
|
-
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
19591
|
-
get_resize_output_image_size(image, size) {
|
|
19592
|
-
const factor = this.patch_size * this.merge_size;
|
|
19593
|
-
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
19594
|
-
}
|
|
19595
|
-
async _call(images, ...args) {
|
|
19596
|
-
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
19597
|
-
let patches = pixel_values;
|
|
19598
|
-
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
19599
|
-
if (patches.dims[0] === 1) {
|
|
19600
|
-
patches = cat(
|
|
19601
|
-
Array.from({ length: temporal_patch_size }, () => patches),
|
|
19602
|
-
0
|
|
19603
|
-
);
|
|
19604
|
-
}
|
|
19605
|
-
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
19606
|
-
const channel = patches.dims[1];
|
|
19607
|
-
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
19608
|
-
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
19609
|
-
const flatten_patches = patches.view(
|
|
19610
|
-
grid_t,
|
|
19611
|
-
temporal_patch_size,
|
|
19612
|
-
channel,
|
|
19613
|
-
Math.floor(grid_h / merge_size),
|
|
19614
|
-
merge_size,
|
|
19615
|
-
patch_size,
|
|
19616
|
-
Math.floor(grid_w / merge_size),
|
|
19617
|
-
merge_size,
|
|
19618
|
-
patch_size
|
|
19619
|
-
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
19620
|
-
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
19621
|
-
return {
|
|
19622
|
-
pixel_values: flatten_patches,
|
|
19623
|
-
image_grid_thw,
|
|
19624
|
-
original_sizes,
|
|
19625
|
-
reshaped_input_sizes
|
|
19626
|
-
};
|
|
19627
|
-
}
|
|
19628
|
-
};
|
|
19629
|
-
|
|
19630
19650
|
// src/models/rt_detr/image_processing_rt_detr.js
|
|
19631
19651
|
var RTDetrImageProcessor = class extends ImageProcessor {
|
|
19632
19652
|
/** @type {typeof post_process_object_detection} */
|
|
@@ -20180,6 +20200,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
20180
20200
|
}
|
|
20181
20201
|
};
|
|
20182
20202
|
|
|
20203
|
+
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
20204
|
+
var Qwen2VLProcessor = class extends Processor {
|
|
20205
|
+
static image_processor_class = AutoImageProcessor;
|
|
20206
|
+
static tokenizer_class = AutoTokenizer;
|
|
20207
|
+
static image_token = "<|image_pad|>";
|
|
20208
|
+
/**
|
|
20209
|
+
*
|
|
20210
|
+
* @param {string|string[]} text
|
|
20211
|
+
* @param {RawImage|RawImage[]} images
|
|
20212
|
+
* @param {...any} args
|
|
20213
|
+
* @returns {Promise<any>}
|
|
20214
|
+
*/
|
|
20215
|
+
async _call(text, images = null, ...args) {
|
|
20216
|
+
if (!Array.isArray(text)) {
|
|
20217
|
+
text = [text];
|
|
20218
|
+
}
|
|
20219
|
+
let image_inputs, image_grid_thw;
|
|
20220
|
+
if (images) {
|
|
20221
|
+
image_inputs = await this.image_processor(images);
|
|
20222
|
+
image_grid_thw = image_inputs.image_grid_thw;
|
|
20223
|
+
}
|
|
20224
|
+
if (image_grid_thw) {
|
|
20225
|
+
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
20226
|
+
let index = 0;
|
|
20227
|
+
const image_token = (
|
|
20228
|
+
/** @type {typeof Qwen2VLProcessor} */
|
|
20229
|
+
this.constructor.image_token
|
|
20230
|
+
);
|
|
20231
|
+
const image_grid_thw_list = image_grid_thw.tolist();
|
|
20232
|
+
text = text.map((t) => {
|
|
20233
|
+
while (t.includes(image_token)) {
|
|
20234
|
+
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
20235
|
+
t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
20236
|
+
}
|
|
20237
|
+
return t.replaceAll("<|placeholder|>", image_token);
|
|
20238
|
+
});
|
|
20239
|
+
}
|
|
20240
|
+
const text_inputs = this.tokenizer(text);
|
|
20241
|
+
return {
|
|
20242
|
+
...text_inputs,
|
|
20243
|
+
...image_inputs
|
|
20244
|
+
};
|
|
20245
|
+
}
|
|
20246
|
+
};
|
|
20247
|
+
|
|
20248
|
+
// src/models/glm46v/processing_glm46v.js
|
|
20249
|
+
var Glm46VProcessor = class extends Qwen2VLProcessor {
|
|
20250
|
+
static image_token = "<|image|>";
|
|
20251
|
+
};
|
|
20252
|
+
|
|
20183
20253
|
// src/models/granite_speech/processing_granite_speech.js
|
|
20184
20254
|
var GraniteSpeechProcessor = class extends Processor {
|
|
20185
20255
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -20910,47 +20980,6 @@ var PyAnnoteProcessor = class extends Processor {
|
|
|
20910
20980
|
}
|
|
20911
20981
|
};
|
|
20912
20982
|
|
|
20913
|
-
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
20914
|
-
var Qwen2VLProcessor = class extends Processor {
|
|
20915
|
-
static image_processor_class = AutoImageProcessor;
|
|
20916
|
-
static tokenizer_class = AutoTokenizer;
|
|
20917
|
-
/**
|
|
20918
|
-
*
|
|
20919
|
-
* @param {string|string[]} text
|
|
20920
|
-
* @param {RawImage|RawImage[]} images
|
|
20921
|
-
* @param {...any} args
|
|
20922
|
-
* @returns {Promise<any>}
|
|
20923
|
-
*/
|
|
20924
|
-
async _call(text, images = null, ...args) {
|
|
20925
|
-
if (!Array.isArray(text)) {
|
|
20926
|
-
text = [text];
|
|
20927
|
-
}
|
|
20928
|
-
let image_inputs, image_grid_thw;
|
|
20929
|
-
if (images) {
|
|
20930
|
-
image_inputs = await this.image_processor(images);
|
|
20931
|
-
image_grid_thw = image_inputs.image_grid_thw;
|
|
20932
|
-
}
|
|
20933
|
-
if (image_grid_thw) {
|
|
20934
|
-
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
20935
|
-
let index = 0;
|
|
20936
|
-
const image_grid_thw_list = image_grid_thw.tolist();
|
|
20937
|
-
text = text.map((t) => {
|
|
20938
|
-
while (t.includes("<|image_pad|>")) {
|
|
20939
|
-
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
20940
|
-
t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
20941
|
-
}
|
|
20942
|
-
return t.replaceAll("<|placeholder|>", "<|image_pad|>");
|
|
20943
|
-
});
|
|
20944
|
-
}
|
|
20945
|
-
const text_inputs = this.tokenizer(text);
|
|
20946
|
-
return {
|
|
20947
|
-
...text_inputs,
|
|
20948
|
-
...image_inputs
|
|
20949
|
-
// TODO: ...videos_inputs,
|
|
20950
|
-
};
|
|
20951
|
-
}
|
|
20952
|
-
};
|
|
20953
|
-
|
|
20954
20983
|
// src/models/qwen2_5_vl/processing_qwen2_5_vl.js
|
|
20955
20984
|
var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
|
|
20956
20985
|
};
|
|
@@ -21294,6 +21323,8 @@ function getNormalizedConfig(config) {
|
|
|
21294
21323
|
case "gemma3n":
|
|
21295
21324
|
case "lfm2_vl":
|
|
21296
21325
|
case "chatterbox":
|
|
21326
|
+
case "lighton_ocr":
|
|
21327
|
+
case "glm_ocr":
|
|
21297
21328
|
case "mistral3":
|
|
21298
21329
|
case "qwen2_5_vl":
|
|
21299
21330
|
case "qwen3_vl":
|
|
@@ -21369,6 +21400,8 @@ function getNormalizedConfig(config) {
|
|
|
21369
21400
|
mapping["dim_kv"] = "head_dim";
|
|
21370
21401
|
break;
|
|
21371
21402
|
case "qwen3":
|
|
21403
|
+
case "solar_open":
|
|
21404
|
+
case "glm_ocr_text":
|
|
21372
21405
|
case "gemma":
|
|
21373
21406
|
case "gemma2":
|
|
21374
21407
|
case "vaultgemma":
|
|
@@ -21379,6 +21412,7 @@ function getNormalizedConfig(config) {
|
|
|
21379
21412
|
case "ernie4_5":
|
|
21380
21413
|
case "hunyuan_v1_dense":
|
|
21381
21414
|
case "falcon_h1":
|
|
21415
|
+
case "nemotron_h":
|
|
21382
21416
|
case "ministral":
|
|
21383
21417
|
case "ministral3":
|
|
21384
21418
|
mapping["num_heads"] = "num_key_value_heads";
|
|
@@ -21413,6 +21447,9 @@ function getNormalizedConfig(config) {
|
|
|
21413
21447
|
mapping["num_attention_heads"] = "num_attention_heads";
|
|
21414
21448
|
break;
|
|
21415
21449
|
case "youtu":
|
|
21450
|
+
case "deepseek_v3":
|
|
21451
|
+
case "glm_moe_dsa":
|
|
21452
|
+
case "mistral4":
|
|
21416
21453
|
mapping["num_heads"] = "num_key_value_heads";
|
|
21417
21454
|
mapping["num_layers"] = "num_hidden_layers";
|
|
21418
21455
|
mapping["dim_kv"] = "qk_head_dim";
|
|
@@ -21501,6 +21538,7 @@ function getCacheShapes(config, options) {
|
|
|
21501
21538
|
if (!(config instanceof PretrainedConfig)) {
|
|
21502
21539
|
config = new PretrainedConfig(config);
|
|
21503
21540
|
}
|
|
21541
|
+
const batch_size = options?.batch_size ?? 1;
|
|
21504
21542
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
21505
21543
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
21506
21544
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -21510,7 +21548,6 @@ function getCacheShapes(config, options) {
|
|
|
21510
21548
|
config
|
|
21511
21549
|
);
|
|
21512
21550
|
const head_dim = hidden_size / num_attention_heads;
|
|
21513
|
-
const batch_size = options?.batch_size ?? 1;
|
|
21514
21551
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
21515
21552
|
if (layer_types[i] === "full_attention") {
|
|
21516
21553
|
for (const kv of ["key", "value"]) {
|
|
@@ -21523,31 +21560,26 @@ function getCacheShapes(config, options) {
|
|
|
21523
21560
|
}
|
|
21524
21561
|
}
|
|
21525
21562
|
return cache_values;
|
|
21526
|
-
} else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
|
|
21563
|
+
} else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
|
|
21527
21564
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
21528
21565
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
21529
|
-
const
|
|
21530
|
-
const {
|
|
21531
|
-
layer_types,
|
|
21532
|
-
num_hidden_layers,
|
|
21533
|
-
num_attention_heads,
|
|
21534
|
-
num_key_value_heads,
|
|
21535
|
-
hidden_size,
|
|
21536
|
-
mamba_d_conv,
|
|
21537
|
-
mamba_n_heads,
|
|
21538
|
-
mamba_d_head,
|
|
21539
|
-
mamba_d_state,
|
|
21540
|
-
mamba_n_groups,
|
|
21541
|
-
mamba_expand,
|
|
21542
|
-
mamba_d_ssm
|
|
21543
|
-
} = (
|
|
21566
|
+
const c = (
|
|
21544
21567
|
/** @type {any} */
|
|
21545
21568
|
config
|
|
21546
21569
|
);
|
|
21547
|
-
const
|
|
21548
|
-
const
|
|
21549
|
-
const
|
|
21550
|
-
|
|
21570
|
+
const layer_types = c.layer_types ?? c.layers_block_type;
|
|
21571
|
+
const num_layers = c.num_hidden_layers ?? layer_types?.length;
|
|
21572
|
+
const num_key_value_heads = c.num_key_value_heads;
|
|
21573
|
+
const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
|
|
21574
|
+
const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
|
|
21575
|
+
const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
|
|
21576
|
+
const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
|
|
21577
|
+
const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
|
|
21578
|
+
const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
|
|
21579
|
+
const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
|
|
21580
|
+
const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
|
|
21581
|
+
const cache_values = {};
|
|
21582
|
+
for (let i = 0; i < num_layers; ++i) {
|
|
21551
21583
|
if (!layer_types || layer_types[i] === "mamba") {
|
|
21552
21584
|
cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
|
|
21553
21585
|
cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
|
|
@@ -21581,7 +21613,6 @@ function getCacheShapes(config, options) {
|
|
|
21581
21613
|
const key_dim = linear_key_head_dim * linear_num_key_heads;
|
|
21582
21614
|
const value_dim = linear_value_head_dim * linear_num_value_heads;
|
|
21583
21615
|
const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
|
|
21584
|
-
const batch_size = options?.batch_size ?? 1;
|
|
21585
21616
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
21586
21617
|
if (layer_types[i] === "full_attention") {
|
|
21587
21618
|
for (const kv of ["key", "value"]) {
|
|
@@ -24209,7 +24240,9 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24209
24240
|
"qwen3_5",
|
|
24210
24241
|
"qwen3_5_text",
|
|
24211
24242
|
"qwen3_5_moe",
|
|
24212
|
-
"qwen3_5_moe_text"
|
|
24243
|
+
"qwen3_5_moe_text",
|
|
24244
|
+
"glm_ocr",
|
|
24245
|
+
"glm_ocr_text"
|
|
24213
24246
|
].includes(self2.config.model_type)
|
|
24214
24247
|
) {
|
|
24215
24248
|
const { image_grid_thw, video_grid_thw } = kwargs;
|
|
@@ -24433,6 +24466,8 @@ __export(models_exports, {
|
|
|
24433
24466
|
BloomForCausalLM: () => BloomForCausalLM,
|
|
24434
24467
|
BloomModel: () => BloomModel,
|
|
24435
24468
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
24469
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
24470
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
24436
24471
|
CLIPModel: () => CLIPModel,
|
|
24437
24472
|
CLIPPreTrainedModel: () => CLIPPreTrainedModel,
|
|
24438
24473
|
CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
|
|
@@ -24507,6 +24542,9 @@ __export(models_exports, {
|
|
|
24507
24542
|
DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
|
|
24508
24543
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
24509
24544
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
24545
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
24546
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
24547
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
24510
24548
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
24511
24549
|
DeiTModel: () => DeiTModel,
|
|
24512
24550
|
DeiTPreTrainedModel: () => DeiTPreTrainedModel,
|
|
@@ -24552,6 +24590,11 @@ __export(models_exports, {
|
|
|
24552
24590
|
EsmForTokenClassification: () => EsmForTokenClassification,
|
|
24553
24591
|
EsmModel: () => EsmModel,
|
|
24554
24592
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
24593
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
24594
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
24595
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
24596
|
+
EuroBertModel: () => EuroBertModel,
|
|
24597
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
24555
24598
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
24556
24599
|
ExaoneModel: () => ExaoneModel,
|
|
24557
24600
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -24598,6 +24641,10 @@ __export(models_exports, {
|
|
|
24598
24641
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
24599
24642
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
24600
24643
|
GlmModel: () => GlmModel,
|
|
24644
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
24645
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
24646
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
24647
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
24601
24648
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
24602
24649
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
24603
24650
|
GptOssModel: () => GptOssModel,
|
|
@@ -24644,6 +24691,7 @@ __export(models_exports, {
|
|
|
24644
24691
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
24645
24692
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
24646
24693
|
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
24694
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
24647
24695
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
24648
24696
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
24649
24697
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -24693,6 +24741,9 @@ __export(models_exports, {
|
|
|
24693
24741
|
MimiEncoderOutput: () => MimiEncoderOutput,
|
|
24694
24742
|
MimiModel: () => MimiModel,
|
|
24695
24743
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
24744
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
24745
|
+
Mistral4Model: () => Mistral4Model,
|
|
24746
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
24696
24747
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
24697
24748
|
MistralModel: () => MistralModel,
|
|
24698
24749
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -24750,6 +24801,9 @@ __export(models_exports, {
|
|
|
24750
24801
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
24751
24802
|
NanoChatModel: () => NanoChatModel,
|
|
24752
24803
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
24804
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
24805
|
+
NemotronHModel: () => NemotronHModel,
|
|
24806
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
24753
24807
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
24754
24808
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
24755
24809
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -24887,6 +24941,9 @@ __export(models_exports, {
|
|
|
24887
24941
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
24888
24942
|
SnacModel: () => SnacModel,
|
|
24889
24943
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
24944
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
24945
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
24946
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
24890
24947
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
24891
24948
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
24892
24949
|
SpeechT5HifiGan: () => SpeechT5HifiGan,
|
|
@@ -25061,7 +25118,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
|
|
|
25061
25118
|
var ArceeForCausalLM = class extends ArceePreTrainedModel {
|
|
25062
25119
|
};
|
|
25063
25120
|
|
|
25064
|
-
// src/models/
|
|
25121
|
+
// src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
|
|
25065
25122
|
var ASTPreTrainedModel = class extends PreTrainedModel {
|
|
25066
25123
|
};
|
|
25067
25124
|
var ASTModel = class extends ASTPreTrainedModel {
|
|
@@ -25396,6 +25453,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
|
|
|
25396
25453
|
var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
|
|
25397
25454
|
};
|
|
25398
25455
|
|
|
25456
|
+
// src/models/chmv2/modeling_chmv2.js
|
|
25457
|
+
var CHMv2PreTrainedModel = class extends PreTrainedModel {
|
|
25458
|
+
};
|
|
25459
|
+
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
|
|
25460
|
+
};
|
|
25461
|
+
|
|
25399
25462
|
// src/models/clap/modeling_clap.js
|
|
25400
25463
|
var ClapPreTrainedModel = class extends PreTrainedModel {
|
|
25401
25464
|
};
|
|
@@ -25734,6 +25797,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
|
|
|
25734
25797
|
}
|
|
25735
25798
|
};
|
|
25736
25799
|
|
|
25800
|
+
// src/models/deepseek_v3/modeling_deepseek_v3.js
|
|
25801
|
+
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
|
|
25802
|
+
};
|
|
25803
|
+
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
|
|
25804
|
+
};
|
|
25805
|
+
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
|
|
25806
|
+
};
|
|
25807
|
+
|
|
25737
25808
|
// src/models/deberta_v2/modeling_deberta_v2.js
|
|
25738
25809
|
var DebertaV2PreTrainedModel = class extends PreTrainedModel {
|
|
25739
25810
|
};
|
|
@@ -26082,6 +26153,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
|
|
|
26082
26153
|
}
|
|
26083
26154
|
};
|
|
26084
26155
|
|
|
26156
|
+
// src/models/eurobert/modeling_eurobert.js
|
|
26157
|
+
var EuroBertPreTrainedModel = class extends PreTrainedModel {
|
|
26158
|
+
};
|
|
26159
|
+
var EuroBertModel = class extends EuroBertPreTrainedModel {
|
|
26160
|
+
};
|
|
26161
|
+
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
  /**
   * Runs the parent `_call` on the given inputs and wraps its result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<MaskedLMOutput>} The parent's output wrapped in a `MaskedLMOutput`.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new MaskedLMOutput(outputs);
  }
};
|
|
26172
|
+
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the parent `_call` on the given inputs and wraps its result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<SequenceClassifierOutput>} The parent's output wrapped in a `SequenceClassifierOutput`.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new SequenceClassifierOutput(outputs);
  }
};
|
|
26183
|
+
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the parent `_call` on the given inputs and wraps its result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<TokenClassifierOutput>} The parent's output wrapped in a `TokenClassifierOutput`.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new TokenClassifierOutput(outputs);
  }
};
|
|
26194
|
+
|
|
26085
26195
|
// src/models/exaone/modeling_exaone.js
|
|
26086
26196
|
var ExaonePreTrainedModel = class extends PreTrainedModel {
|
|
26087
26197
|
};
|
|
@@ -26357,6 +26467,377 @@ var GlmModel = class extends GlmPreTrainedModel {
|
|
|
26357
26467
|
var GlmForCausalLM = class extends GlmPreTrainedModel {
|
|
26358
26468
|
};
|
|
26359
26469
|
|
|
26470
|
+
// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
|
|
26471
|
+
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
|
|
26472
|
+
};
|
|
26473
|
+
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
|
|
26474
|
+
};
|
|
26475
|
+
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
|
|
26476
|
+
};
|
|
26477
|
+
|
|
26478
|
+
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
26479
|
+
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
26480
|
+
forward_params = [
|
|
26481
|
+
// Text inputs
|
|
26482
|
+
"input_ids",
|
|
26483
|
+
"attention_mask",
|
|
26484
|
+
"position_ids",
|
|
26485
|
+
"past_key_values",
|
|
26486
|
+
// Vision inputs
|
|
26487
|
+
"pixel_values",
|
|
26488
|
+
"image_grid_thw"
|
|
26489
|
+
];
|
|
26490
|
+
};
|
|
26491
|
+
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
26492
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
26493
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
26494
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
26495
|
+
image_grid_thw_name = "grid_thw";
|
|
26496
|
+
/**
|
|
26497
|
+
* Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
|
|
26498
|
+
* @param {Tensor} input_ids
|
|
26499
|
+
* @param {Tensor} attention_mask
|
|
26500
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
26501
|
+
*/
|
|
26502
|
+
_get_text_only_rope_index(input_ids, attention_mask) {
|
|
26503
|
+
if (attention_mask) {
|
|
26504
|
+
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
26505
|
+
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
26506
|
+
const mrope_position_deltas = Array.from(
|
|
26507
|
+
{ length: dims[0] },
|
|
26508
|
+
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
26509
|
+
);
|
|
26510
|
+
return [
|
|
26511
|
+
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
26512
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
26513
|
+
];
|
|
26514
|
+
} else {
|
|
26515
|
+
const [batch_size, seq_length] = input_ids.dims;
|
|
26516
|
+
const position_ids = BigInt64Array.from(
|
|
26517
|
+
{ length: 3 * batch_size * seq_length },
|
|
26518
|
+
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
26519
|
+
);
|
|
26520
|
+
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
26521
|
+
}
|
|
26522
|
+
}
|
|
26523
|
+
/**
|
|
26524
|
+
* Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
|
|
26525
|
+
* global [all_t, all_h, all_w] order, then write back into the position_ids array
|
|
26526
|
+
* respecting attention mask.
|
|
26527
|
+
* @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
|
|
26528
|
+
* @param {number[]} attn_mask Attention mask for this batch element
|
|
26529
|
+
* @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
|
|
26530
|
+
* @param {number} batch_idx Current batch index
|
|
26531
|
+
* @returns {number[]} Flat reordered positions of length total_len
|
|
26532
|
+
*/
|
|
26533
|
+
_reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
|
|
26534
|
+
const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
26535
|
+
const llm_positions = new Array(total_len);
|
|
26536
|
+
let index = 0;
|
|
26537
|
+
for (let x = 0; x < 3; ++x) {
|
|
26538
|
+
for (const val of llm_pos_ids_list) {
|
|
26539
|
+
const seg_len = val.length / 3;
|
|
26540
|
+
for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
|
|
26541
|
+
llm_positions[index++] = val[z];
|
|
26542
|
+
}
|
|
26543
|
+
}
|
|
26544
|
+
}
|
|
26545
|
+
let count2 = 0;
|
|
26546
|
+
for (let y = 0; y < attn_mask.length; ++y) {
|
|
26547
|
+
if (attn_mask[y] == 1) {
|
|
26548
|
+
for (let x = 0; x < 3; ++x) {
|
|
26549
|
+
position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
|
|
26550
|
+
}
|
|
26551
|
+
++count2;
|
|
26552
|
+
}
|
|
26553
|
+
}
|
|
26554
|
+
return llm_positions;
|
|
26555
|
+
}
|
|
26556
|
+
/**
|
|
26557
|
+
* Build per-batch position ID segments for multimodal rope.
|
|
26558
|
+
* Override this in subclasses to change how vision/text segments are identified and positioned.
|
|
26559
|
+
* @param {object} params
|
|
26560
|
+
* @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
|
|
26561
|
+
* @param {any[][]} params.image_grid_thw_list - all image grid dimensions
|
|
26562
|
+
* @param {any[][]} params.video_grid_thw_list - all video grid dimensions
|
|
26563
|
+
* @param {number} params.spatial_merge_size
|
|
26564
|
+
* @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
|
|
26565
|
+
* @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
|
|
26566
|
+
*/
|
|
26567
|
+
_get_multimodal_rope_positions({
|
|
26568
|
+
filtered_ids,
|
|
26569
|
+
image_grid_thw_list,
|
|
26570
|
+
video_grid_thw_list,
|
|
26571
|
+
spatial_merge_size,
|
|
26572
|
+
state
|
|
26573
|
+
}) {
|
|
26574
|
+
const { image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
26575
|
+
const ids = filtered_ids;
|
|
26576
|
+
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
26577
|
+
if (x == vision_start_token_id) acc.push(idx);
|
|
26578
|
+
return acc;
|
|
26579
|
+
}, []);
|
|
26580
|
+
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
26581
|
+
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
26582
|
+
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
26583
|
+
const llm_pos_ids_list = [];
|
|
26584
|
+
let st2 = 0;
|
|
26585
|
+
let remain_images = image_nums;
|
|
26586
|
+
let remain_videos = video_nums;
|
|
26587
|
+
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
26588
|
+
const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
|
|
26589
|
+
const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
|
|
26590
|
+
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
26591
|
+
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
26592
|
+
let ed;
|
|
26593
|
+
let t, h, w;
|
|
26594
|
+
if (ed_image < ed_video) {
|
|
26595
|
+
[t, h, w] = image_grid_thw_list[state.image_index];
|
|
26596
|
+
++state.image_index;
|
|
26597
|
+
--remain_images;
|
|
26598
|
+
ed = ed_image;
|
|
26599
|
+
} else {
|
|
26600
|
+
[t, h, w] = video_grid_thw_list[state.video_index];
|
|
26601
|
+
++state.video_index;
|
|
26602
|
+
--remain_videos;
|
|
26603
|
+
ed = ed_video;
|
|
26604
|
+
}
|
|
26605
|
+
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
26606
|
+
Number(t),
|
|
26607
|
+
Math.floor(Number(h) / spatial_merge_size),
|
|
26608
|
+
Math.floor(Number(w) / spatial_merge_size)
|
|
26609
|
+
];
|
|
26610
|
+
const text_len = ed - st2;
|
|
26611
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
26612
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
26613
|
+
const offset = text_len + st_idx;
|
|
26614
|
+
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
26615
|
+
const t_index = Array.from(
|
|
26616
|
+
{ length: grid_size },
|
|
26617
|
+
(_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
|
|
26618
|
+
);
|
|
26619
|
+
const h_index = Array.from(
|
|
26620
|
+
{ length: grid_size },
|
|
26621
|
+
(_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
|
|
26622
|
+
);
|
|
26623
|
+
const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
|
|
26624
|
+
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
26625
|
+
st2 = ed + grid_size;
|
|
26626
|
+
}
|
|
26627
|
+
if (st2 < ids.length) {
|
|
26628
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
26629
|
+
const text_len = ids.length - st2;
|
|
26630
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
26631
|
+
}
|
|
26632
|
+
return llm_pos_ids_list;
|
|
26633
|
+
}
|
|
26634
|
+
/**
|
|
26635
|
+
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
26636
|
+
*
|
|
26637
|
+
* Explanation:
|
|
26638
|
+
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
26639
|
+
*
|
|
26640
|
+
* For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
|
|
26641
|
+
* Examples:
|
|
26642
|
+
* input_ids: [T T T T T], here T is for text.
|
|
26643
|
+
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
26644
|
+
* height position_ids: [0, 1, 2, 3, 4]
|
|
26645
|
+
* width position_ids: [0, 1, 2, 3, 4]
|
|
26646
|
+
*
|
|
26647
|
+
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
26648
|
+
* and 1D rotary position embedding for text part.
|
|
26649
|
+
* Examples:
|
|
26650
|
+
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
26651
|
+
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
26652
|
+
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
26653
|
+
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
26654
|
+
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
26655
|
+
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
26656
|
+
* text height position_ids: [3, 4, 5, 6, 7]
|
|
26657
|
+
* text width position_ids: [3, 4, 5, 6, 7]
|
|
26658
|
+
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
26659
|
+
*
|
|
26660
|
+
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
26661
|
+
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
26662
|
+
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
26663
|
+
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
|
|
26664
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
26665
|
+
*/
|
|
26666
|
+
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
26667
|
+
const { vision_config } = this.config;
|
|
26668
|
+
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
26669
|
+
if (image_grid_thw || video_grid_thw) {
|
|
26670
|
+
const total_input_ids = input_ids.tolist();
|
|
26671
|
+
if (!attention_mask) {
|
|
26672
|
+
attention_mask = ones_like(input_ids);
|
|
26673
|
+
}
|
|
26674
|
+
const attention_mask_list = attention_mask.tolist();
|
|
26675
|
+
const position_ids_list = Array.from(
|
|
26676
|
+
{ length: 3 },
|
|
26677
|
+
() => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
|
|
26678
|
+
);
|
|
26679
|
+
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
26680
|
+
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
26681
|
+
const state = { image_index: 0, video_index: 0 };
|
|
26682
|
+
const mrope_position_deltas = [];
|
|
26683
|
+
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
26684
|
+
const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
26685
|
+
const llm_pos_ids_list = this._get_multimodal_rope_positions({
|
|
26686
|
+
filtered_ids,
|
|
26687
|
+
image_grid_thw_list,
|
|
26688
|
+
video_grid_thw_list,
|
|
26689
|
+
spatial_merge_size,
|
|
26690
|
+
state
|
|
26691
|
+
});
|
|
26692
|
+
const llm_positions = this._reorder_and_write_positions(
|
|
26693
|
+
llm_pos_ids_list,
|
|
26694
|
+
attention_mask_list[i],
|
|
26695
|
+
position_ids_list,
|
|
26696
|
+
i
|
|
26697
|
+
);
|
|
26698
|
+
mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
|
|
26699
|
+
}
|
|
26700
|
+
return [
|
|
26701
|
+
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
26702
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
26703
|
+
];
|
|
26704
|
+
} else {
|
|
26705
|
+
return this._get_text_only_rope_index(input_ids, attention_mask);
|
|
26706
|
+
}
|
|
26707
|
+
}
|
|
26708
|
+
async encode_image({ pixel_values, image_grid_thw }) {
|
|
26709
|
+
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
26710
|
+
pixel_values,
|
|
26711
|
+
[this.image_grid_thw_name]: image_grid_thw
|
|
26712
|
+
})).image_features;
|
|
26713
|
+
return features;
|
|
26714
|
+
}
|
|
26715
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
26716
|
+
return default_merge_input_ids_with_image_features({
|
|
26717
|
+
// @ts-ignore
|
|
26718
|
+
image_token_id: this.config.image_token_id,
|
|
26719
|
+
...kwargs
|
|
26720
|
+
});
|
|
26721
|
+
}
|
|
26722
|
+
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
26723
|
+
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
26724
|
+
if (!model_inputs.past_key_values) {
|
|
26725
|
+
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
26726
|
+
model_inputs.input_ids,
|
|
26727
|
+
model_inputs.image_grid_thw,
|
|
26728
|
+
model_inputs.video_grid_thw,
|
|
26729
|
+
model_inputs.attention_mask
|
|
26730
|
+
);
|
|
26731
|
+
} else {
|
|
26732
|
+
model_inputs.pixel_values = null;
|
|
26733
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
26734
|
+
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
26735
|
+
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
26736
|
+
model_inputs.input_ids,
|
|
26737
|
+
model_inputs.image_grid_thw,
|
|
26738
|
+
model_inputs.video_grid_thw,
|
|
26739
|
+
model_inputs.attention_mask
|
|
26740
|
+
);
|
|
26741
|
+
model_inputs.rope_deltas = rope_deltas;
|
|
26742
|
+
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
26743
|
+
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
26744
|
+
} else {
|
|
26745
|
+
if (!model_inputs.rope_deltas) {
|
|
26746
|
+
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
26747
|
+
model_inputs.input_ids,
|
|
26748
|
+
model_inputs.image_grid_thw,
|
|
26749
|
+
model_inputs.video_grid_thw,
|
|
26750
|
+
model_inputs.attention_mask
|
|
26751
|
+
);
|
|
26752
|
+
}
|
|
26753
|
+
const delta = BigInt(past_length);
|
|
26754
|
+
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
26755
|
+
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
26756
|
+
}
|
|
26757
|
+
}
|
|
26758
|
+
}
|
|
26759
|
+
return model_inputs;
|
|
26760
|
+
}
|
|
26761
|
+
};
|
|
26762
|
+
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
26763
|
+
};
|
|
26764
|
+
|
|
26765
|
+
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
26766
|
+
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
26767
|
+
image_grid_thw_name = "image_grid_thw";
|
|
26768
|
+
};
|
|
26769
|
+
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
26770
|
+
image_grid_thw_name = "image_grid_thw";
|
|
26771
|
+
};
|
|
26772
|
+
|
|
26773
|
+
// src/models/glm_ocr/modeling_glm_ocr.js
|
|
26774
|
+
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
26775
|
+
/**
|
|
26776
|
+
* Compute 3D positional indices for vision tokens.
|
|
26777
|
+
* Temporal is constant, height is repeat-interleaved, width tiles.
|
|
26778
|
+
* @param {number} start_position
|
|
26779
|
+
* @param {number[]} grid_thw [T, H, W]
|
|
26780
|
+
* @param {number} temp_merge_size
|
|
26781
|
+
* @param {number} spatial_merge_size
|
|
26782
|
+
* @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
|
|
26783
|
+
*/
|
|
26784
|
+
get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
|
|
26785
|
+
const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
|
|
26786
|
+
const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
|
|
26787
|
+
const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
|
|
26788
|
+
const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
|
|
26789
|
+
const t_pos = Array.from({ length: seq_len }, () => start_position);
|
|
26790
|
+
const h_pos = Array.from(
|
|
26791
|
+
{ length: seq_len },
|
|
26792
|
+
(_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
|
|
26793
|
+
);
|
|
26794
|
+
const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
|
|
26795
|
+
return [...t_pos, ...h_pos, ...w_pos];
|
|
26796
|
+
}
|
|
26797
|
+
/**
|
|
26798
|
+
* GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
|
|
26799
|
+
* instead of vision_start_token_id scanning used by Qwen2VL.
|
|
26800
|
+
* After a vision segment, position advances by max(h, w) / spatial_merge_size.
|
|
26801
|
+
*/
|
|
26802
|
+
_get_multimodal_rope_positions({
|
|
26803
|
+
filtered_ids,
|
|
26804
|
+
image_grid_thw_list,
|
|
26805
|
+
video_grid_thw_list,
|
|
26806
|
+
spatial_merge_size,
|
|
26807
|
+
state
|
|
26808
|
+
}) {
|
|
26809
|
+
const { image_token_id } = this.config;
|
|
26810
|
+
const groups = [];
|
|
26811
|
+
let group_start = 0;
|
|
26812
|
+
let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
|
|
26813
|
+
for (let j = 1; j <= filtered_ids.length; ++j) {
|
|
26814
|
+
const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
|
|
26815
|
+
if (t !== current_type) {
|
|
26816
|
+
groups.push([current_type, group_start, j]);
|
|
26817
|
+
group_start = j;
|
|
26818
|
+
current_type = t;
|
|
26819
|
+
}
|
|
26820
|
+
}
|
|
26821
|
+
let current_pos = 0;
|
|
26822
|
+
const llm_pos_ids_list = [];
|
|
26823
|
+
for (const [modality_type, start_idx, end_idx] of groups) {
|
|
26824
|
+
if (modality_type === 0) {
|
|
26825
|
+
const text_len = end_idx - start_idx;
|
|
26826
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
|
|
26827
|
+
current_pos += text_len;
|
|
26828
|
+
} else {
|
|
26829
|
+
const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
|
|
26830
|
+
const temp_merge_size = grid_thw[0];
|
|
26831
|
+
llm_pos_ids_list.push(
|
|
26832
|
+
this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
|
|
26833
|
+
);
|
|
26834
|
+
current_pos += Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size;
|
|
26835
|
+
}
|
|
26836
|
+
}
|
|
26837
|
+
return llm_pos_ids_list;
|
|
26838
|
+
}
|
|
26839
|
+
};
|
|
26840
|
+
|
|
26360
26841
|
// src/models/glpn/modeling_glpn.js
|
|
26361
26842
|
var GLPNPreTrainedModel = class extends PreTrainedModel {
|
|
26362
26843
|
};
|
|
@@ -26669,6 +27150,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
|
|
|
26669
27150
|
var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
|
|
26670
27151
|
};
|
|
26671
27152
|
|
|
27153
|
+
// src/models/lighton_ocr/modeling_lighton_ocr.js
|
|
27154
|
+
var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27155
|
+
};
|
|
27156
|
+
|
|
26672
27157
|
// src/models/lfm2_moe/modeling_lfm2_moe.js
|
|
26673
27158
|
var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
|
|
26674
27159
|
};
|
|
@@ -26865,6 +27350,14 @@ var MistralModel = class extends MistralPreTrainedModel {
|
|
|
26865
27350
|
var MistralForCausalLM = class extends MistralPreTrainedModel {
|
|
26866
27351
|
};
|
|
26867
27352
|
|
|
27353
|
+
// src/models/mistral4/modeling_mistral4.js
|
|
27354
|
+
var Mistral4PreTrainedModel = class extends PreTrainedModel {
|
|
27355
|
+
};
|
|
27356
|
+
var Mistral4Model = class extends Mistral4PreTrainedModel {
|
|
27357
|
+
};
|
|
27358
|
+
var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
|
|
27359
|
+
};
|
|
27360
|
+
|
|
26868
27361
|
// src/models/mobilebert/modeling_mobilebert.js
|
|
26869
27362
|
var MobileBertPreTrainedModel = class extends PreTrainedModel {
|
|
26870
27363
|
};
|
|
@@ -27333,6 +27826,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
|
|
|
27333
27826
|
var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
|
|
27334
27827
|
};
|
|
27335
27828
|
|
|
27829
|
+
// src/models/nemotron_h/modeling_nemotron_h.js
|
|
27830
|
+
var NemotronHPreTrainedModel = class extends PreTrainedModel {
|
|
27831
|
+
};
|
|
27832
|
+
var NemotronHModel = class extends NemotronHPreTrainedModel {
|
|
27833
|
+
};
|
|
27834
|
+
var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
|
|
27835
|
+
};
|
|
27836
|
+
|
|
27336
27837
|
// src/models/neobert/modeling_neobert.js
|
|
27337
27838
|
var NeoBertPreTrainedModel = class extends PreTrainedModel {
|
|
27338
27839
|
};
|
|
@@ -27613,252 +28114,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
|
|
|
27613
28114
|
var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
|
|
27614
28115
|
};
|
|
27615
28116
|
|
|
27616
|
-
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
27617
|
-
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
27618
|
-
forward_params = [
|
|
27619
|
-
// Text inputs
|
|
27620
|
-
"input_ids",
|
|
27621
|
-
"attention_mask",
|
|
27622
|
-
"position_ids",
|
|
27623
|
-
"past_key_values",
|
|
27624
|
-
// Vision inputs
|
|
27625
|
-
"pixel_values",
|
|
27626
|
-
"image_grid_thw"
|
|
27627
|
-
];
|
|
27628
|
-
};
|
|
27629
|
-
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
27630
|
-
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
27631
|
-
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
27632
|
-
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
27633
|
-
image_grid_thw_name = "grid_thw";
|
|
27634
|
-
/**
|
|
27635
|
-
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
27636
|
-
*
|
|
27637
|
-
* Explanation:
|
|
27638
|
-
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
27639
|
-
*
|
|
27640
|
-
* For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
|
|
27641
|
-
* Examples:
|
|
27642
|
-
* input_ids: [T T T T T], here T is for text.
|
|
27643
|
-
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
27644
|
-
* height position_ids: [0, 1, 2, 3, 4]
|
|
27645
|
-
* width position_ids: [0, 1, 2, 3, 4]
|
|
27646
|
-
*
|
|
27647
|
-
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
27648
|
-
* and 1D rotary position embeddin for text part.
|
|
27649
|
-
* Examples:
|
|
27650
|
-
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
27651
|
-
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
27652
|
-
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
27653
|
-
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
27654
|
-
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
27655
|
-
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
27656
|
-
* text height position_ids: [3, 4, 5, 6, 7]
|
|
27657
|
-
* text width position_ids: [3, 4, 5, 6, 7]
|
|
27658
|
-
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
27659
|
-
*
|
|
27660
|
-
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
27661
|
-
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
27662
|
-
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
27663
|
-
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
|
|
27664
|
-
* - 1 for tokens that are **not masked**,
|
|
27665
|
-
* - 0 for tokens that are **masked**.
|
|
27666
|
-
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
|
|
27667
|
-
* - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
|
|
27668
|
-
* - mrope_position_deltas: Tensor of shape `(batch_size)`.
|
|
27669
|
-
*/
|
|
27670
|
-
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
27671
|
-
const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
27672
|
-
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
27673
|
-
const mrope_position_deltas = [];
|
|
27674
|
-
if (image_grid_thw || video_grid_thw) {
|
|
27675
|
-
let total_input_ids = input_ids.tolist();
|
|
27676
|
-
if (!attention_mask) {
|
|
27677
|
-
attention_mask = ones_like(input_ids);
|
|
27678
|
-
}
|
|
27679
|
-
const attention_mask_list = attention_mask.tolist();
|
|
27680
|
-
const position_ids_list = Array.from(
|
|
27681
|
-
{ length: 3 },
|
|
27682
|
-
(_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
|
|
27683
|
-
);
|
|
27684
|
-
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
27685
|
-
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
27686
|
-
let image_index = 0;
|
|
27687
|
-
let video_index = 0;
|
|
27688
|
-
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
27689
|
-
const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
27690
|
-
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
27691
|
-
if (x == vision_start_token_id) acc.push(idx);
|
|
27692
|
-
return acc;
|
|
27693
|
-
}, []);
|
|
27694
|
-
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
27695
|
-
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
27696
|
-
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
27697
|
-
let llm_pos_ids_list = [];
|
|
27698
|
-
let st2 = 0;
|
|
27699
|
-
let remain_images = image_nums;
|
|
27700
|
-
let remain_videos = video_nums;
|
|
27701
|
-
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
27702
|
-
const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
|
|
27703
|
-
const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
|
|
27704
|
-
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
27705
|
-
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
27706
|
-
let ed;
|
|
27707
|
-
let t, h, w;
|
|
27708
|
-
if (ed_image < ed_video) {
|
|
27709
|
-
[t, h, w] = image_grid_thw_list[image_index];
|
|
27710
|
-
++image_index;
|
|
27711
|
-
--remain_images;
|
|
27712
|
-
ed = ed_image;
|
|
27713
|
-
} else {
|
|
27714
|
-
[t, h, w] = video_grid_thw_list[video_index];
|
|
27715
|
-
++video_index;
|
|
27716
|
-
--remain_videos;
|
|
27717
|
-
ed = ed_video;
|
|
27718
|
-
}
|
|
27719
|
-
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
27720
|
-
Number(t),
|
|
27721
|
-
Math.floor(Number(h) / spatial_merge_size),
|
|
27722
|
-
Math.floor(Number(w) / spatial_merge_size)
|
|
27723
|
-
];
|
|
27724
|
-
const text_len = ed - st2;
|
|
27725
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27726
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
27727
|
-
const offset = text_len + st_idx;
|
|
27728
|
-
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
27729
|
-
const t_index = Array.from(
|
|
27730
|
-
{ length: grid_size },
|
|
27731
|
-
(_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
|
|
27732
|
-
);
|
|
27733
|
-
const h_index = Array.from(
|
|
27734
|
-
{ length: grid_size },
|
|
27735
|
-
(_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
|
|
27736
|
-
);
|
|
27737
|
-
const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
|
|
27738
|
-
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
27739
|
-
st2 = ed + grid_size;
|
|
27740
|
-
}
|
|
27741
|
-
if (st2 < ids.length) {
|
|
27742
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27743
|
-
const text_len = ids.length - st2;
|
|
27744
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
27745
|
-
}
|
|
27746
|
-
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
27747
|
-
const llm_positions = new Array(num_items);
|
|
27748
|
-
let index = 0;
|
|
27749
|
-
for (let x = 0; x < 3; ++x) {
|
|
27750
|
-
for (let y = 0; y < llm_pos_ids_list.length; ++y) {
|
|
27751
|
-
const val = llm_pos_ids_list[y];
|
|
27752
|
-
const text_len = val.length / 3;
|
|
27753
|
-
for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
|
|
27754
|
-
llm_positions[index++] = val[z];
|
|
27755
|
-
}
|
|
27756
|
-
}
|
|
27757
|
-
}
|
|
27758
|
-
let count2 = 0;
|
|
27759
|
-
const attn_mask = attention_mask_list[i];
|
|
27760
|
-
for (let y = 0; y < attn_mask.length; ++y) {
|
|
27761
|
-
if (attn_mask[y] == 1) {
|
|
27762
|
-
for (let x = 0; x < 3; ++x) {
|
|
27763
|
-
position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
|
|
27764
|
-
}
|
|
27765
|
-
++count2;
|
|
27766
|
-
}
|
|
27767
|
-
}
|
|
27768
|
-
const max_llm_positions = max(llm_positions)[0];
|
|
27769
|
-
mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
|
|
27770
|
-
}
|
|
27771
|
-
return [
|
|
27772
|
-
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
27773
|
-
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
27774
|
-
];
|
|
27775
|
-
} else {
|
|
27776
|
-
if (attention_mask) {
|
|
27777
|
-
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
27778
|
-
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
27779
|
-
const mrope_position_deltas2 = Array.from(
|
|
27780
|
-
{ length: dims[0] },
|
|
27781
|
-
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
27782
|
-
);
|
|
27783
|
-
return [
|
|
27784
|
-
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
27785
|
-
new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
|
|
27786
|
-
];
|
|
27787
|
-
} else {
|
|
27788
|
-
const [batch_size, seq_length] = input_ids.dims;
|
|
27789
|
-
const position_ids = BigInt64Array.from(
|
|
27790
|
-
{ length: 3 * batch_size * seq_length },
|
|
27791
|
-
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
27792
|
-
);
|
|
27793
|
-
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
27794
|
-
}
|
|
27795
|
-
}
|
|
27796
|
-
}
|
|
27797
|
-
async encode_image({ pixel_values, image_grid_thw }) {
|
|
27798
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
27799
|
-
pixel_values,
|
|
27800
|
-
[this.image_grid_thw_name]: image_grid_thw
|
|
27801
|
-
})).image_features;
|
|
27802
|
-
return features;
|
|
27803
|
-
}
|
|
27804
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
27805
|
-
return default_merge_input_ids_with_image_features({
|
|
27806
|
-
// @ts-ignore
|
|
27807
|
-
image_token_id: this.config.image_token_id,
|
|
27808
|
-
...kwargs
|
|
27809
|
-
});
|
|
27810
|
-
}
|
|
27811
|
-
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
27812
|
-
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
27813
|
-
if (!model_inputs.past_key_values) {
|
|
27814
|
-
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27815
|
-
model_inputs.input_ids,
|
|
27816
|
-
model_inputs.image_grid_thw,
|
|
27817
|
-
model_inputs.video_grid_thw,
|
|
27818
|
-
model_inputs.attention_mask
|
|
27819
|
-
);
|
|
27820
|
-
} else {
|
|
27821
|
-
model_inputs.pixel_values = null;
|
|
27822
|
-
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
27823
|
-
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
27824
|
-
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
27825
|
-
model_inputs.input_ids,
|
|
27826
|
-
model_inputs.image_grid_thw,
|
|
27827
|
-
model_inputs.video_grid_thw,
|
|
27828
|
-
model_inputs.attention_mask
|
|
27829
|
-
);
|
|
27830
|
-
model_inputs.rope_deltas = rope_deltas;
|
|
27831
|
-
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
27832
|
-
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
27833
|
-
} else {
|
|
27834
|
-
if (!model_inputs.rope_deltas) {
|
|
27835
|
-
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27836
|
-
model_inputs.input_ids,
|
|
27837
|
-
model_inputs.image_grid_thw,
|
|
27838
|
-
model_inputs.video_grid_thw,
|
|
27839
|
-
model_inputs.attention_mask
|
|
27840
|
-
);
|
|
27841
|
-
}
|
|
27842
|
-
const delta = BigInt(past_length);
|
|
27843
|
-
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
27844
|
-
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
27845
|
-
}
|
|
27846
|
-
}
|
|
27847
|
-
}
|
|
27848
|
-
return model_inputs;
|
|
27849
|
-
}
|
|
27850
|
-
};
|
|
27851
|
-
/** Subclass of `Qwen2VLForConditionalGeneration` that declares no members of its own. */
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {};
|
|
27853
|
-
|
|
27854
|
-
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
27855
|
-
/**
 * Qwen2.5-VL variant of `Qwen2VLForConditionalGeneration`.
 * Its only own member is the `image_grid_thw_name` field.
 */
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
  // Input name under which the inherited `encode_image` passes the image grid tensor.
  image_grid_thw_name = "image_grid_thw";
};
|
|
27858
|
-
/**
 * Qwen2.5-VL variant of `Qwen2VLForCausalLM`.
 * Its only own member is the `image_grid_thw_name` field.
 */
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
  // Input name under which the inherited `encode_image` passes the image grid tensor.
  image_grid_thw_name = "image_grid_thw";
};
|
|
27861
|
-
|
|
27862
28117
|
// src/models/qwen3/modeling_qwen3.js
|
|
27863
28118
|
/** Base class for the Qwen3 models; adds nothing on top of `PreTrainedModel`. */
var Qwen3PreTrainedModel = class extends PreTrainedModel {};
|
|
@@ -28304,6 +28559,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
|
|
|
28304
28559
|
}
|
|
28305
28560
|
};
|
|
28306
28561
|
|
|
28562
|
+
// src/models/solar_open/modeling_solar_open.js
|
|
28563
|
+
/** Base class for the Solar Open models; adds nothing on top of `PreTrainedModel`. */
var SolarOpenPreTrainedModel = class extends PreTrainedModel {};
|
|
28565
|
+
/** Solar Open model class; declares no members beyond those of `SolarOpenPreTrainedModel`. */
var SolarOpenModel = class extends SolarOpenPreTrainedModel {};
|
|
28567
|
+
/** Solar Open causal-LM class; declares no members beyond those of `SolarOpenPreTrainedModel`. */
var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {};
|
|
28569
|
+
|
|
28307
28570
|
// src/models/speecht5/modeling_speecht5.js
|
|
28308
28571
|
/** Base class for the SpeechT5 models; adds nothing on top of `PreTrainedModel`. */
var SpeechT5PreTrainedModel = class extends PreTrainedModel {};
|
|
@@ -29420,6 +29683,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
|
|
|
29420
29683
|
// src/models/registry.js
|
|
29421
29684
|
var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
|
|
29422
29685
|
["bert", "BertModel"],
|
|
29686
|
+
["eurobert", "EuroBertModel"],
|
|
29423
29687
|
["neobert", "NeoBertModel"],
|
|
29424
29688
|
["modernbert", "ModernBertModel"],
|
|
29425
29689
|
["nomic_bert", "NomicBertModel"],
|
|
@@ -29551,6 +29815,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
29551
29815
|
["gemma3_text", "Gemma3Model"],
|
|
29552
29816
|
["helium", "HeliumModel"],
|
|
29553
29817
|
["glm", "GlmModel"],
|
|
29818
|
+
["glm_moe_dsa", "GlmMoeDsaModel"],
|
|
29554
29819
|
["openelm", "OpenELMModel"],
|
|
29555
29820
|
["qwen2", "Qwen2Model"],
|
|
29556
29821
|
["qwen2_moe", "Qwen2MoeModel"],
|
|
@@ -29562,12 +29827,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
29562
29827
|
["mpt", "MptModel"],
|
|
29563
29828
|
["opt", "OPTModel"],
|
|
29564
29829
|
["mistral", "MistralModel"],
|
|
29830
|
+
["mistral4", "Mistral4Model"],
|
|
29565
29831
|
["ministral", "MinistralModel"],
|
|
29566
29832
|
["ministral3", "Ministral3Model"],
|
|
29567
29833
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
29568
29834
|
["starcoder2", "Starcoder2Model"],
|
|
29835
|
+
["deepseek_v3", "DeepseekV3Model"],
|
|
29569
29836
|
["falcon", "FalconModel"],
|
|
29570
29837
|
["falcon_h1", "FalconH1Model"],
|
|
29838
|
+
["nemotron_h", "NemotronHModel"],
|
|
29839
|
+
["solar_open", "SolarOpenModel"],
|
|
29571
29840
|
["stablelm", "StableLmModel"],
|
|
29572
29841
|
["modernbert-decoder", "ModernBertDecoderModel"],
|
|
29573
29842
|
["hunyuan_v1_dense", "HunYuanDenseV1Model"],
|
|
@@ -29587,6 +29856,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29587
29856
|
]);
|
|
29588
29857
|
var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29589
29858
|
["bert", "BertForSequenceClassification"],
|
|
29859
|
+
["eurobert", "EuroBertForSequenceClassification"],
|
|
29590
29860
|
["neobert", "NeoBertForSequenceClassification"],
|
|
29591
29861
|
["modernbert", "ModernBertForSequenceClassification"],
|
|
29592
29862
|
["roformer", "RoFormerForSequenceClassification"],
|
|
@@ -29609,6 +29879,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29609
29879
|
]);
|
|
29610
29880
|
var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29611
29881
|
["bert", "BertForTokenClassification"],
|
|
29882
|
+
["eurobert", "EuroBertForTokenClassification"],
|
|
29612
29883
|
["neobert", "NeoBertForTokenClassification"],
|
|
29613
29884
|
["modernbert", "ModernBertForTokenClassification"],
|
|
29614
29885
|
["roformer", "RoFormerForTokenClassification"],
|
|
@@ -29671,6 +29942,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29671
29942
|
["gemma3", "Gemma3ForCausalLM"],
|
|
29672
29943
|
["helium", "HeliumForCausalLM"],
|
|
29673
29944
|
["glm", "GlmForCausalLM"],
|
|
29945
|
+
["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
|
|
29674
29946
|
["openelm", "OpenELMForCausalLM"],
|
|
29675
29947
|
["qwen2", "Qwen2ForCausalLM"],
|
|
29676
29948
|
["qwen2_moe", "Qwen2MoeForCausalLM"],
|
|
@@ -29690,13 +29962,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29690
29962
|
["opt", "OPTForCausalLM"],
|
|
29691
29963
|
["mbart", "MBartForCausalLM"],
|
|
29692
29964
|
["mistral", "MistralForCausalLM"],
|
|
29965
|
+
["mistral4", "Mistral4ForCausalLM"],
|
|
29693
29966
|
["ministral", "MinistralForCausalLM"],
|
|
29694
29967
|
["ministral3", "Ministral3ForCausalLM"],
|
|
29695
29968
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
29696
29969
|
["starcoder2", "Starcoder2ForCausalLM"],
|
|
29970
|
+
["deepseek_v3", "DeepseekV3ForCausalLM"],
|
|
29697
29971
|
["falcon", "FalconForCausalLM"],
|
|
29698
29972
|
["falcon_h1", "FalconH1ForCausalLM"],
|
|
29973
|
+
["nemotron_h", "NemotronHForCausalLM"],
|
|
29699
29974
|
["trocr", "TrOCRForCausalLM"],
|
|
29975
|
+
["solar_open", "SolarOpenForCausalLM"],
|
|
29700
29976
|
["stablelm", "StableLmForCausalLM"],
|
|
29701
29977
|
["modernbert-decoder", "ModernBertDecoderForCausalLM"],
|
|
29702
29978
|
["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
|
|
@@ -29707,6 +29983,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29707
29983
|
var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
|
|
29708
29984
|
var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29709
29985
|
["bert", "BertForMaskedLM"],
|
|
29986
|
+
["eurobert", "EuroBertForMaskedLM"],
|
|
29710
29987
|
["neobert", "NeoBertForMaskedLM"],
|
|
29711
29988
|
["modernbert", "ModernBertForMaskedLM"],
|
|
29712
29989
|
["roformer", "RoFormerForMaskedLM"],
|
|
@@ -29765,7 +30042,9 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29765
30042
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
29766
30043
|
["llava_qwen2", "LlavaQwen2ForCausalLM"],
|
|
29767
30044
|
["gemma3n", "Gemma3nForConditionalGeneration"],
|
|
29768
|
-
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
30045
|
+
["mistral3", "Mistral3ForConditionalGeneration"],
|
|
30046
|
+
["lighton_ocr", "LightOnOcrForConditionalGeneration"],
|
|
30047
|
+
["glm_ocr", "GlmOcrForConditionalGeneration"]
|
|
29769
30048
|
]);
|
|
29770
30049
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29771
30050
|
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
@@ -29870,6 +30149,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29870
30149
|
]);
|
|
29871
30150
|
var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
|
|
29872
30151
|
var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30152
|
+
["chmv2", "CHMv2ForDepthEstimation"],
|
|
29873
30153
|
["dpt", "DPTForDepthEstimation"],
|
|
29874
30154
|
["depth_anything", "DepthAnythingForDepthEstimation"],
|
|
29875
30155
|
["glpn", "GLPNForDepthEstimation"],
|
|
@@ -32608,6 +32888,9 @@ export {
|
|
|
32608
32888
|
BloomModel,
|
|
32609
32889
|
BloomPreTrainedModel,
|
|
32610
32890
|
BloomTokenizer,
|
|
32891
|
+
CHMv2ForDepthEstimation,
|
|
32892
|
+
CHMv2ImageProcessor,
|
|
32893
|
+
CHMv2PreTrainedModel,
|
|
32611
32894
|
CLIPFeatureExtractor,
|
|
32612
32895
|
CLIPImageProcessor,
|
|
32613
32896
|
CLIPModel,
|
|
@@ -32703,6 +32986,9 @@ export {
|
|
|
32703
32986
|
DebertaV2Tokenizer,
|
|
32704
32987
|
DecisionTransformerModel,
|
|
32705
32988
|
DecisionTransformerPreTrainedModel,
|
|
32989
|
+
DeepseekV3ForCausalLM,
|
|
32990
|
+
DeepseekV3Model,
|
|
32991
|
+
DeepseekV3PreTrainedModel,
|
|
32706
32992
|
DeiTFeatureExtractor,
|
|
32707
32993
|
DeiTForImageClassification,
|
|
32708
32994
|
DeiTImageProcessor,
|
|
@@ -32763,6 +33049,11 @@ export {
|
|
|
32763
33049
|
EsmModel,
|
|
32764
33050
|
EsmPreTrainedModel,
|
|
32765
33051
|
EsmTokenizer,
|
|
33052
|
+
EuroBertForMaskedLM,
|
|
33053
|
+
EuroBertForSequenceClassification,
|
|
33054
|
+
EuroBertForTokenClassification,
|
|
33055
|
+
EuroBertModel,
|
|
33056
|
+
EuroBertPreTrainedModel,
|
|
32766
33057
|
ExaoneForCausalLM,
|
|
32767
33058
|
ExaoneModel,
|
|
32768
33059
|
ExaonePreTrainedModel,
|
|
@@ -32820,8 +33111,14 @@ export {
|
|
|
32820
33111
|
GemmaModel,
|
|
32821
33112
|
GemmaPreTrainedModel,
|
|
32822
33113
|
GemmaTokenizer,
|
|
33114
|
+
Glm46VImageProcessor,
|
|
33115
|
+
Glm46VProcessor,
|
|
32823
33116
|
GlmForCausalLM,
|
|
32824
33117
|
GlmModel,
|
|
33118
|
+
GlmMoeDsaForCausalLM,
|
|
33119
|
+
GlmMoeDsaModel,
|
|
33120
|
+
GlmMoeDsaPreTrainedModel,
|
|
33121
|
+
GlmOcrForConditionalGeneration,
|
|
32825
33122
|
GlmPreTrainedModel,
|
|
32826
33123
|
GptOssForCausalLM,
|
|
32827
33124
|
GptOssModel,
|
|
@@ -32887,6 +33184,7 @@ export {
|
|
|
32887
33184
|
Lfm2VlForConditionalGeneration,
|
|
32888
33185
|
Lfm2VlImageProcessor,
|
|
32889
33186
|
Lfm2VlProcessor,
|
|
33187
|
+
LightOnOcrForConditionalGeneration,
|
|
32890
33188
|
LiteWhisperForConditionalGeneration,
|
|
32891
33189
|
Llama4ForCausalLM,
|
|
32892
33190
|
Llama4PreTrainedModel,
|
|
@@ -32956,6 +33254,9 @@ export {
|
|
|
32956
33254
|
MimiPreTrainedModel,
|
|
32957
33255
|
MinLengthLogitsProcessor,
|
|
32958
33256
|
MinNewTokensLengthLogitsProcessor,
|
|
33257
|
+
Mistral4ForCausalLM,
|
|
33258
|
+
Mistral4Model,
|
|
33259
|
+
Mistral4PreTrainedModel,
|
|
32959
33260
|
MistralForCausalLM,
|
|
32960
33261
|
MistralModel,
|
|
32961
33262
|
MistralPreTrainedModel,
|
|
@@ -33027,6 +33328,9 @@ export {
|
|
|
33027
33328
|
NanoChatForCausalLM,
|
|
33028
33329
|
NanoChatModel,
|
|
33029
33330
|
NanoChatPreTrainedModel,
|
|
33331
|
+
NemotronHForCausalLM,
|
|
33332
|
+
NemotronHModel,
|
|
33333
|
+
NemotronHPreTrainedModel,
|
|
33030
33334
|
NeoBertForMaskedLM,
|
|
33031
33335
|
NeoBertForQuestionAnswering,
|
|
33032
33336
|
NeoBertForSequenceClassification,
|
|
@@ -33216,6 +33520,9 @@ export {
|
|
|
33216
33520
|
SnacFeatureExtractor,
|
|
33217
33521
|
SnacModel,
|
|
33218
33522
|
SnacPreTrainedModel,
|
|
33523
|
+
SolarOpenForCausalLM,
|
|
33524
|
+
SolarOpenModel,
|
|
33525
|
+
SolarOpenPreTrainedModel,
|
|
33219
33526
|
SpeechT5FeatureExtractor,
|
|
33220
33527
|
SpeechT5ForSpeechToText,
|
|
33221
33528
|
SpeechT5ForTextToSpeech,
|