@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/transformers.js +689 -382
- package/dist/transformers.min.js +19 -19
- package/dist/transformers.node.cjs +716 -382
- package/dist/transformers.node.min.cjs +19 -19
- package/dist/transformers.node.min.mjs +19 -19
- package/dist/transformers.node.mjs +689 -382
- package/dist/transformers.web.js +697 -390
- package/dist/transformers.web.min.js +17 -17
- package/package.json +2 -2
- package/src/configs.js +28 -22
- package/src/env.js +1 -1
- package/src/image_processors_utils.js +25 -15
- package/src/models/chmv2/image_processing_chmv2.js +3 -0
- package/src/models/chmv2/modeling_chmv2.js +4 -0
- package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
- package/src/models/eurobert/modeling_eurobert.js +41 -0
- package/src/models/glm46v/image_processing_glm46v.js +12 -0
- package/src/models/glm46v/processing_glm46v.js +5 -0
- package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
- package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
- package/src/models/image_processors.js +2 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
- package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
- package/src/models/mistral4/modeling_mistral4.js +5 -0
- package/src/models/modeling_utils.js +2 -0
- package/src/models/models.js +10 -1
- package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
- package/src/models/processors.js +1 -0
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +187 -136
- package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
- package/src/models/registry.js +17 -0
- package/src/models/solar_open/modeling_solar_open.js +5 -0
- package/src/pipelines.js +1 -0
- package/src/utils/hub.js +4 -1
- package/src/utils/model_registry/get_file_metadata.js +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +3 -2
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
- package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
- package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
- package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
- package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
- package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
- package/types/models/glm46v/processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
- package/types/models/image_processors.d.ts +2 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
- package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
- package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +10 -1
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
- package/types/models/processors.d.ts +1 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
- package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
- package/types/pipelines.d.ts +1 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/models/ast/modeling_ast.d.ts.map +0 -1
- /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
package/dist/transformers.web.js
CHANGED
|
@@ -14,7 +14,7 @@ var node_path_default = {};
|
|
|
14
14
|
var node_url_default = {};
|
|
15
15
|
|
|
16
16
|
// src/env.js
|
|
17
|
-
var VERSION = "4.0.0-next.7";
|
|
17
|
+
var VERSION = "4.0.0-next.8";
|
|
18
18
|
var HAS_SELF = typeof self !== "undefined";
|
|
19
19
|
var IS_FS_AVAILABLE = !isEmpty(node_fs_default);
|
|
20
20
|
var IS_PATH_AVAILABLE = !isEmpty(node_path_default);
|
|
@@ -244,7 +244,7 @@ var logger = {
|
|
|
244
244
|
}
|
|
245
245
|
};
|
|
246
246
|
|
|
247
|
-
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.
|
|
247
|
+
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
|
|
248
248
|
var DictionarySplitter = class {
|
|
249
249
|
/**
|
|
250
250
|
* @param dictionary The dictionary of words to use for splitting.
|
|
@@ -1900,10 +1900,10 @@ var BPE = class extends TokenizerModel_default {
|
|
|
1900
1900
|
);
|
|
1901
1901
|
if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
|
|
1902
1902
|
output_tokens.push(...byte_tokens);
|
|
1903
|
-
} else {
|
|
1903
|
+
} else if (this.unk_token != null) {
|
|
1904
1904
|
output_tokens.push(this.unk_token);
|
|
1905
1905
|
}
|
|
1906
|
-
} else {
|
|
1906
|
+
} else if (this.unk_token != null) {
|
|
1907
1907
|
output_tokens.push(this.unk_token);
|
|
1908
1908
|
}
|
|
1909
1909
|
}
|
|
@@ -6509,13 +6509,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
|
|
|
6509
6509
|
wrapped_progress
|
|
6510
6510
|
);
|
|
6511
6511
|
} else if (typeof response !== "string") {
|
|
6512
|
+
const headers = new Headers(response.headers);
|
|
6513
|
+
headers.set("content-length", result.byteLength.toString());
|
|
6512
6514
|
await cache2.put(
|
|
6513
6515
|
cacheKey,
|
|
6514
6516
|
new Response(
|
|
6515
6517
|
/** @type {any} */
|
|
6516
6518
|
result,
|
|
6517
6519
|
{
|
|
6518
|
-
headers: response.headers
|
|
6520
|
+
headers
|
|
6519
6521
|
}
|
|
6520
6522
|
)
|
|
6521
6523
|
).catch((err) => {
|
|
@@ -11829,6 +11831,7 @@ __export(processors_exports, {
|
|
|
11829
11831
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
11830
11832
|
Florence2Processor: () => Florence2Processor,
|
|
11831
11833
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
11834
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
11832
11835
|
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
11833
11836
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
11834
11837
|
Idefics3Processor: () => Idefics3Processor,
|
|
@@ -14342,26 +14345,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
14342
14345
|
}
|
|
14343
14346
|
return [segmentation, segments];
|
|
14344
14347
|
}
|
|
14345
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
14348
|
+
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
|
|
14346
14349
|
if (height < factor || width < factor) {
|
|
14347
|
-
|
|
14348
|
-
|
|
14350
|
+
const scale = Math.max(factor / height, factor / width);
|
|
14351
|
+
height = Math.round(height * scale);
|
|
14352
|
+
width = Math.round(width * scale);
|
|
14353
|
+
}
|
|
14354
|
+
if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
14349
14355
|
throw new Error(
|
|
14350
14356
|
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
14351
14357
|
);
|
|
14352
14358
|
}
|
|
14353
14359
|
let h_bar = Math.round(height / factor) * factor;
|
|
14354
14360
|
let w_bar = Math.round(width / factor) * factor;
|
|
14355
|
-
if (h_bar * w_bar > max_pixels) {
|
|
14356
|
-
const beta = Math.sqrt(height * width / max_pixels);
|
|
14357
|
-
h_bar = Math.floor(height / beta / factor) * factor;
|
|
14358
|
-
w_bar = Math.floor(width / beta / factor) * factor;
|
|
14359
|
-
} else if (h_bar * w_bar < min_pixels) {
|
|
14360
|
-
const beta = Math.sqrt(min_pixels / (height * width));
|
|
14361
|
+
if (temporal_factor * h_bar * w_bar > max_pixels) {
|
|
14362
|
+
const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
|
|
14363
|
+
h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
|
|
14364
|
+
w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
|
|
14365
|
+
} else if (temporal_factor * h_bar * w_bar < min_pixels) {
|
|
14366
|
+
const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
|
|
14361
14367
|
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
14362
14368
|
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
14363
14369
|
}
|
|
14364
|
-
return [h_bar, w_bar];
|
|
14370
|
+
return [w_bar, h_bar];
|
|
14365
14371
|
}
|
|
14366
14372
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
14367
14373
|
if (label_ids_to_fuse === null) {
|
|
@@ -14440,7 +14446,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
14440
14446
|
this.do_pad = config.do_pad;
|
|
14441
14447
|
this.min_pixels = config.min_pixels;
|
|
14442
14448
|
this.max_pixels = config.max_pixels;
|
|
14443
|
-
if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
14449
|
+
if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
14444
14450
|
this.pad_size = this.size;
|
|
14445
14451
|
}
|
|
14446
14452
|
this.do_flip_channel_order = config.do_flip_channel_order ?? false;
|
|
@@ -14728,10 +14734,8 @@ var ImageProcessor = class extends Callable2 {
|
|
|
14728
14734
|
const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
|
|
14729
14735
|
[pixelData, imgDims] = padded;
|
|
14730
14736
|
} else if (this.size_divisibility) {
|
|
14731
|
-
const
|
|
14732
|
-
|
|
14733
|
-
this.size_divisibility
|
|
14734
|
-
);
|
|
14737
|
+
const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
|
|
14738
|
+
const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
|
|
14735
14739
|
[pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
|
|
14736
14740
|
}
|
|
14737
14741
|
}
|
|
@@ -14808,6 +14812,7 @@ var image_processors_exports = {};
|
|
|
14808
14812
|
__export(image_processors_exports, {
|
|
14809
14813
|
BeitFeatureExtractor: () => BeitFeatureExtractor,
|
|
14810
14814
|
BitImageProcessor: () => BitImageProcessor,
|
|
14815
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
14811
14816
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
14812
14817
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
14813
14818
|
ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
|
|
@@ -14824,6 +14829,7 @@ __export(image_processors_exports, {
|
|
|
14824
14829
|
DonutImageProcessor: () => DonutImageProcessor,
|
|
14825
14830
|
EfficientNetImageProcessor: () => EfficientNetImageProcessor,
|
|
14826
14831
|
GLPNFeatureExtractor: () => GLPNFeatureExtractor,
|
|
14832
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
14827
14833
|
GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
|
|
14828
14834
|
Idefics3ImageProcessor: () => Idefics3ImageProcessor,
|
|
14829
14835
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
@@ -14884,6 +14890,10 @@ var BitImageProcessor = class extends ImageProcessor {
|
|
|
14884
14890
|
var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
|
|
14885
14891
|
};
|
|
14886
14892
|
|
|
14893
|
+
// src/models/chmv2/image_processing_chmv2.js
|
|
14894
|
+
var CHMv2ImageProcessor = class extends ImageProcessor {
|
|
14895
|
+
};
|
|
14896
|
+
|
|
14887
14897
|
// src/models/clip/image_processing_clip.js
|
|
14888
14898
|
var CLIPImageProcessor = class extends ImageProcessor {
|
|
14889
14899
|
};
|
|
@@ -15003,6 +15013,65 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
|
|
|
15003
15013
|
}
|
|
15004
15014
|
};
|
|
15005
15015
|
|
|
15016
|
+
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
15017
|
+
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
15018
|
+
constructor(config) {
|
|
15019
|
+
super(config);
|
|
15020
|
+
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
15021
|
+
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
15022
|
+
this.patch_size = config.patch_size;
|
|
15023
|
+
this.merge_size = config.merge_size;
|
|
15024
|
+
}
|
|
15025
|
+
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
15026
|
+
get_resize_output_image_size(image, size) {
|
|
15027
|
+
const factor = this.patch_size * this.merge_size;
|
|
15028
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
15029
|
+
}
|
|
15030
|
+
async _call(images, ...args) {
|
|
15031
|
+
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
15032
|
+
let patches = pixel_values;
|
|
15033
|
+
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
15034
|
+
if (patches.dims[0] === 1) {
|
|
15035
|
+
patches = cat(
|
|
15036
|
+
Array.from({ length: temporal_patch_size }, () => patches),
|
|
15037
|
+
0
|
|
15038
|
+
);
|
|
15039
|
+
}
|
|
15040
|
+
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
15041
|
+
const channel = patches.dims[1];
|
|
15042
|
+
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
15043
|
+
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
15044
|
+
const flatten_patches = patches.view(
|
|
15045
|
+
grid_t,
|
|
15046
|
+
temporal_patch_size,
|
|
15047
|
+
channel,
|
|
15048
|
+
Math.floor(grid_h / merge_size),
|
|
15049
|
+
merge_size,
|
|
15050
|
+
patch_size,
|
|
15051
|
+
Math.floor(grid_w / merge_size),
|
|
15052
|
+
merge_size,
|
|
15053
|
+
patch_size
|
|
15054
|
+
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
15055
|
+
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
15056
|
+
return {
|
|
15057
|
+
pixel_values: flatten_patches,
|
|
15058
|
+
image_grid_thw,
|
|
15059
|
+
original_sizes,
|
|
15060
|
+
reshaped_input_sizes
|
|
15061
|
+
};
|
|
15062
|
+
}
|
|
15063
|
+
};
|
|
15064
|
+
|
|
15065
|
+
// src/models/glm46v/image_processing_glm46v.js
|
|
15066
|
+
var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
|
|
15067
|
+
/** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
|
|
15068
|
+
get_resize_output_image_size(image, size) {
|
|
15069
|
+
const factor = this.patch_size * this.merge_size;
|
|
15070
|
+
const temporal_factor = this.config.temporal_patch_size ?? 2;
|
|
15071
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
|
|
15072
|
+
}
|
|
15073
|
+
};
|
|
15074
|
+
|
|
15006
15075
|
// src/models/glpn/image_processing_glpn.js
|
|
15007
15076
|
var GLPNFeatureExtractor = class extends ImageProcessor {
|
|
15008
15077
|
};
|
|
@@ -15396,7 +15465,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
|
|
|
15396
15465
|
const img = pixel_values.unsqueeze_(0);
|
|
15397
15466
|
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
15398
15467
|
const f2 = total_factor ** 2;
|
|
15399
|
-
const [new_height, new_width] = smart_resize(
|
|
15468
|
+
const [new_width, new_height] = smart_resize(
|
|
15400
15469
|
Math.max(total_factor, height),
|
|
15401
15470
|
Math.max(total_factor, width),
|
|
15402
15471
|
total_factor,
|
|
@@ -15686,55 +15755,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
|
|
|
15686
15755
|
var PvtImageProcessor = class extends ImageProcessor {
|
|
15687
15756
|
};
|
|
15688
15757
|
|
|
15689
|
-
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
15690
|
-
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
15691
|
-
constructor(config) {
|
|
15692
|
-
super(config);
|
|
15693
|
-
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
15694
|
-
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
15695
|
-
this.patch_size = config.patch_size;
|
|
15696
|
-
this.merge_size = config.merge_size;
|
|
15697
|
-
}
|
|
15698
|
-
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
15699
|
-
get_resize_output_image_size(image, size) {
|
|
15700
|
-
const factor = this.patch_size * this.merge_size;
|
|
15701
|
-
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
15702
|
-
}
|
|
15703
|
-
async _call(images, ...args) {
|
|
15704
|
-
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
15705
|
-
let patches = pixel_values;
|
|
15706
|
-
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
15707
|
-
if (patches.dims[0] === 1) {
|
|
15708
|
-
patches = cat(
|
|
15709
|
-
Array.from({ length: temporal_patch_size }, () => patches),
|
|
15710
|
-
0
|
|
15711
|
-
);
|
|
15712
|
-
}
|
|
15713
|
-
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
15714
|
-
const channel = patches.dims[1];
|
|
15715
|
-
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
15716
|
-
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
15717
|
-
const flatten_patches = patches.view(
|
|
15718
|
-
grid_t,
|
|
15719
|
-
temporal_patch_size,
|
|
15720
|
-
channel,
|
|
15721
|
-
Math.floor(grid_h / merge_size),
|
|
15722
|
-
merge_size,
|
|
15723
|
-
patch_size,
|
|
15724
|
-
Math.floor(grid_w / merge_size),
|
|
15725
|
-
merge_size,
|
|
15726
|
-
patch_size
|
|
15727
|
-
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
15728
|
-
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
15729
|
-
return {
|
|
15730
|
-
pixel_values: flatten_patches,
|
|
15731
|
-
image_grid_thw,
|
|
15732
|
-
original_sizes,
|
|
15733
|
-
reshaped_input_sizes
|
|
15734
|
-
};
|
|
15735
|
-
}
|
|
15736
|
-
};
|
|
15737
|
-
|
|
15738
15758
|
// src/models/rt_detr/image_processing_rt_detr.js
|
|
15739
15759
|
var RTDetrImageProcessor = class extends ImageProcessor {
|
|
15740
15760
|
/** @type {typeof post_process_object_detection} */
|
|
@@ -16288,6 +16308,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
16288
16308
|
}
|
|
16289
16309
|
};
|
|
16290
16310
|
|
|
16311
|
+
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
16312
|
+
var Qwen2VLProcessor = class extends Processor {
|
|
16313
|
+
static image_processor_class = AutoImageProcessor;
|
|
16314
|
+
static tokenizer_class = AutoTokenizer;
|
|
16315
|
+
static image_token = "<|image_pad|>";
|
|
16316
|
+
/**
|
|
16317
|
+
*
|
|
16318
|
+
* @param {string|string[]} text
|
|
16319
|
+
* @param {RawImage|RawImage[]} images
|
|
16320
|
+
* @param {...any} args
|
|
16321
|
+
* @returns {Promise<any>}
|
|
16322
|
+
*/
|
|
16323
|
+
async _call(text, images = null, ...args) {
|
|
16324
|
+
if (!Array.isArray(text)) {
|
|
16325
|
+
text = [text];
|
|
16326
|
+
}
|
|
16327
|
+
let image_inputs, image_grid_thw;
|
|
16328
|
+
if (images) {
|
|
16329
|
+
image_inputs = await this.image_processor(images);
|
|
16330
|
+
image_grid_thw = image_inputs.image_grid_thw;
|
|
16331
|
+
}
|
|
16332
|
+
if (image_grid_thw) {
|
|
16333
|
+
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
16334
|
+
let index = 0;
|
|
16335
|
+
const image_token = (
|
|
16336
|
+
/** @type {typeof Qwen2VLProcessor} */
|
|
16337
|
+
this.constructor.image_token
|
|
16338
|
+
);
|
|
16339
|
+
const image_grid_thw_list = image_grid_thw.tolist();
|
|
16340
|
+
text = text.map((t) => {
|
|
16341
|
+
while (t.includes(image_token)) {
|
|
16342
|
+
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
16343
|
+
t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
16344
|
+
}
|
|
16345
|
+
return t.replaceAll("<|placeholder|>", image_token);
|
|
16346
|
+
});
|
|
16347
|
+
}
|
|
16348
|
+
const text_inputs = this.tokenizer(text);
|
|
16349
|
+
return {
|
|
16350
|
+
...text_inputs,
|
|
16351
|
+
...image_inputs
|
|
16352
|
+
};
|
|
16353
|
+
}
|
|
16354
|
+
};
|
|
16355
|
+
|
|
16356
|
+
// src/models/glm46v/processing_glm46v.js
|
|
16357
|
+
var Glm46VProcessor = class extends Qwen2VLProcessor {
|
|
16358
|
+
static image_token = "<|image|>";
|
|
16359
|
+
};
|
|
16360
|
+
|
|
16291
16361
|
// src/models/granite_speech/processing_granite_speech.js
|
|
16292
16362
|
var GraniteSpeechProcessor = class extends Processor {
|
|
16293
16363
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -17018,47 +17088,6 @@ var PyAnnoteProcessor = class extends Processor {
|
|
|
17018
17088
|
}
|
|
17019
17089
|
};
|
|
17020
17090
|
|
|
17021
|
-
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
17022
|
-
var Qwen2VLProcessor = class extends Processor {
|
|
17023
|
-
static image_processor_class = AutoImageProcessor;
|
|
17024
|
-
static tokenizer_class = AutoTokenizer;
|
|
17025
|
-
/**
|
|
17026
|
-
*
|
|
17027
|
-
* @param {string|string[]} text
|
|
17028
|
-
* @param {RawImage|RawImage[]} images
|
|
17029
|
-
* @param {...any} args
|
|
17030
|
-
* @returns {Promise<any>}
|
|
17031
|
-
*/
|
|
17032
|
-
async _call(text, images = null, ...args) {
|
|
17033
|
-
if (!Array.isArray(text)) {
|
|
17034
|
-
text = [text];
|
|
17035
|
-
}
|
|
17036
|
-
let image_inputs, image_grid_thw;
|
|
17037
|
-
if (images) {
|
|
17038
|
-
image_inputs = await this.image_processor(images);
|
|
17039
|
-
image_grid_thw = image_inputs.image_grid_thw;
|
|
17040
|
-
}
|
|
17041
|
-
if (image_grid_thw) {
|
|
17042
|
-
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
17043
|
-
let index = 0;
|
|
17044
|
-
const image_grid_thw_list = image_grid_thw.tolist();
|
|
17045
|
-
text = text.map((t) => {
|
|
17046
|
-
while (t.includes("<|image_pad|>")) {
|
|
17047
|
-
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
17048
|
-
t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
17049
|
-
}
|
|
17050
|
-
return t.replaceAll("<|placeholder|>", "<|image_pad|>");
|
|
17051
|
-
});
|
|
17052
|
-
}
|
|
17053
|
-
const text_inputs = this.tokenizer(text);
|
|
17054
|
-
return {
|
|
17055
|
-
...text_inputs,
|
|
17056
|
-
...image_inputs
|
|
17057
|
-
// TODO: ...videos_inputs,
|
|
17058
|
-
};
|
|
17059
|
-
}
|
|
17060
|
-
};
|
|
17061
|
-
|
|
17062
17091
|
// src/models/qwen2_5_vl/processing_qwen2_5_vl.js
|
|
17063
17092
|
var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
|
|
17064
17093
|
};
|
|
@@ -17402,6 +17431,8 @@ function getNormalizedConfig(config) {
|
|
|
17402
17431
|
case "gemma3n":
|
|
17403
17432
|
case "lfm2_vl":
|
|
17404
17433
|
case "chatterbox":
|
|
17434
|
+
case "lighton_ocr":
|
|
17435
|
+
case "glm_ocr":
|
|
17405
17436
|
case "mistral3":
|
|
17406
17437
|
case "qwen2_5_vl":
|
|
17407
17438
|
case "qwen3_vl":
|
|
@@ -17477,6 +17508,8 @@ function getNormalizedConfig(config) {
|
|
|
17477
17508
|
mapping["dim_kv"] = "head_dim";
|
|
17478
17509
|
break;
|
|
17479
17510
|
case "qwen3":
|
|
17511
|
+
case "solar_open":
|
|
17512
|
+
case "glm_ocr_text":
|
|
17480
17513
|
case "gemma":
|
|
17481
17514
|
case "gemma2":
|
|
17482
17515
|
case "vaultgemma":
|
|
@@ -17487,6 +17520,7 @@ function getNormalizedConfig(config) {
|
|
|
17487
17520
|
case "ernie4_5":
|
|
17488
17521
|
case "hunyuan_v1_dense":
|
|
17489
17522
|
case "falcon_h1":
|
|
17523
|
+
case "nemotron_h":
|
|
17490
17524
|
case "ministral":
|
|
17491
17525
|
case "ministral3":
|
|
17492
17526
|
mapping["num_heads"] = "num_key_value_heads";
|
|
@@ -17521,6 +17555,9 @@ function getNormalizedConfig(config) {
|
|
|
17521
17555
|
mapping["num_attention_heads"] = "num_attention_heads";
|
|
17522
17556
|
break;
|
|
17523
17557
|
case "youtu":
|
|
17558
|
+
case "deepseek_v3":
|
|
17559
|
+
case "glm_moe_dsa":
|
|
17560
|
+
case "mistral4":
|
|
17524
17561
|
mapping["num_heads"] = "num_key_value_heads";
|
|
17525
17562
|
mapping["num_layers"] = "num_hidden_layers";
|
|
17526
17563
|
mapping["dim_kv"] = "qk_head_dim";
|
|
@@ -17609,6 +17646,7 @@ function getCacheShapes(config, options) {
|
|
|
17609
17646
|
if (!(config instanceof PretrainedConfig)) {
|
|
17610
17647
|
config = new PretrainedConfig(config);
|
|
17611
17648
|
}
|
|
17649
|
+
const batch_size = options?.batch_size ?? 1;
|
|
17612
17650
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
17613
17651
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
17614
17652
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -17618,7 +17656,6 @@ function getCacheShapes(config, options) {
|
|
|
17618
17656
|
config
|
|
17619
17657
|
);
|
|
17620
17658
|
const head_dim = hidden_size / num_attention_heads;
|
|
17621
|
-
const batch_size = options?.batch_size ?? 1;
|
|
17622
17659
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
17623
17660
|
if (layer_types[i] === "full_attention") {
|
|
17624
17661
|
for (const kv of ["key", "value"]) {
|
|
@@ -17631,31 +17668,26 @@ function getCacheShapes(config, options) {
|
|
|
17631
17668
|
}
|
|
17632
17669
|
}
|
|
17633
17670
|
return cache_values;
|
|
17634
|
-
} else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
|
|
17671
|
+
} else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
|
|
17635
17672
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
17636
17673
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
17637
|
-
const
|
|
17638
|
-
const {
|
|
17639
|
-
layer_types,
|
|
17640
|
-
num_hidden_layers,
|
|
17641
|
-
num_attention_heads,
|
|
17642
|
-
num_key_value_heads,
|
|
17643
|
-
hidden_size,
|
|
17644
|
-
mamba_d_conv,
|
|
17645
|
-
mamba_n_heads,
|
|
17646
|
-
mamba_d_head,
|
|
17647
|
-
mamba_d_state,
|
|
17648
|
-
mamba_n_groups,
|
|
17649
|
-
mamba_expand,
|
|
17650
|
-
mamba_d_ssm
|
|
17651
|
-
} = (
|
|
17674
|
+
const c = (
|
|
17652
17675
|
/** @type {any} */
|
|
17653
17676
|
config
|
|
17654
17677
|
);
|
|
17655
|
-
const
|
|
17656
|
-
const
|
|
17657
|
-
const
|
|
17658
|
-
|
|
17678
|
+
const layer_types = c.layer_types ?? c.layers_block_type;
|
|
17679
|
+
const num_layers = c.num_hidden_layers ?? layer_types?.length;
|
|
17680
|
+
const num_key_value_heads = c.num_key_value_heads;
|
|
17681
|
+
const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
|
|
17682
|
+
const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
|
|
17683
|
+
const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
|
|
17684
|
+
const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
|
|
17685
|
+
const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
|
|
17686
|
+
const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
|
|
17687
|
+
const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
|
|
17688
|
+
const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
|
|
17689
|
+
const cache_values = {};
|
|
17690
|
+
for (let i = 0; i < num_layers; ++i) {
|
|
17659
17691
|
if (!layer_types || layer_types[i] === "mamba") {
|
|
17660
17692
|
cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
|
|
17661
17693
|
cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
|
|
@@ -17689,7 +17721,6 @@ function getCacheShapes(config, options) {
|
|
|
17689
17721
|
const key_dim = linear_key_head_dim * linear_num_key_heads;
|
|
17690
17722
|
const value_dim = linear_value_head_dim * linear_num_value_heads;
|
|
17691
17723
|
const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
|
|
17692
|
-
const batch_size = options?.batch_size ?? 1;
|
|
17693
17724
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
17694
17725
|
if (layer_types[i] === "full_attention") {
|
|
17695
17726
|
for (const kv of ["key", "value"]) {
|
|
@@ -20317,7 +20348,9 @@ async function generic_text_to_text_forward(self2, {
|
|
|
20317
20348
|
"qwen3_5",
|
|
20318
20349
|
"qwen3_5_text",
|
|
20319
20350
|
"qwen3_5_moe",
|
|
20320
|
-
"qwen3_5_moe_text"
|
|
20351
|
+
"qwen3_5_moe_text",
|
|
20352
|
+
"glm_ocr",
|
|
20353
|
+
"glm_ocr_text"
|
|
20321
20354
|
].includes(self2.config.model_type)
|
|
20322
20355
|
) {
|
|
20323
20356
|
const { image_grid_thw, video_grid_thw } = kwargs;
|
|
@@ -20541,6 +20574,8 @@ __export(models_exports, {
|
|
|
20541
20574
|
BloomForCausalLM: () => BloomForCausalLM,
|
|
20542
20575
|
BloomModel: () => BloomModel,
|
|
20543
20576
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
20577
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
20578
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
20544
20579
|
CLIPModel: () => CLIPModel,
|
|
20545
20580
|
CLIPPreTrainedModel: () => CLIPPreTrainedModel,
|
|
20546
20581
|
CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
|
|
@@ -20615,6 +20650,9 @@ __export(models_exports, {
|
|
|
20615
20650
|
DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
|
|
20616
20651
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
20617
20652
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
20653
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
20654
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
20655
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
20618
20656
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
20619
20657
|
DeiTModel: () => DeiTModel,
|
|
20620
20658
|
DeiTPreTrainedModel: () => DeiTPreTrainedModel,
|
|
@@ -20660,6 +20698,11 @@ __export(models_exports, {
|
|
|
20660
20698
|
EsmForTokenClassification: () => EsmForTokenClassification,
|
|
20661
20699
|
EsmModel: () => EsmModel,
|
|
20662
20700
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
20701
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
20702
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
20703
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
20704
|
+
EuroBertModel: () => EuroBertModel,
|
|
20705
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
20663
20706
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
20664
20707
|
ExaoneModel: () => ExaoneModel,
|
|
20665
20708
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -20706,6 +20749,10 @@ __export(models_exports, {
|
|
|
20706
20749
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
20707
20750
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
20708
20751
|
GlmModel: () => GlmModel,
|
|
20752
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
20753
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
20754
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
20755
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
20709
20756
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
20710
20757
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
20711
20758
|
GptOssModel: () => GptOssModel,
|
|
@@ -20752,6 +20799,7 @@ __export(models_exports, {
|
|
|
20752
20799
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
20753
20800
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
20754
20801
|
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
20802
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
20755
20803
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
20756
20804
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
20757
20805
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -20801,6 +20849,9 @@ __export(models_exports, {
|
|
|
20801
20849
|
MimiEncoderOutput: () => MimiEncoderOutput,
|
|
20802
20850
|
MimiModel: () => MimiModel,
|
|
20803
20851
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
20852
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
20853
|
+
Mistral4Model: () => Mistral4Model,
|
|
20854
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
20804
20855
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
20805
20856
|
MistralModel: () => MistralModel,
|
|
20806
20857
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -20858,6 +20909,9 @@ __export(models_exports, {
|
|
|
20858
20909
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
20859
20910
|
NanoChatModel: () => NanoChatModel,
|
|
20860
20911
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
20912
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
20913
|
+
NemotronHModel: () => NemotronHModel,
|
|
20914
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
20861
20915
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
20862
20916
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
20863
20917
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -20995,6 +21049,9 @@ __export(models_exports, {
|
|
|
20995
21049
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
20996
21050
|
SnacModel: () => SnacModel,
|
|
20997
21051
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
21052
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
21053
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
21054
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
20998
21055
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
20999
21056
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
21000
21057
|
SpeechT5HifiGan: () => SpeechT5HifiGan,
|
|
@@ -21169,7 +21226,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
|
|
|
21169
21226
|
var ArceeForCausalLM = class extends ArceePreTrainedModel {
|
|
21170
21227
|
};
|
|
21171
21228
|
|
|
21172
|
-
// src/models/
|
|
21229
|
+
// src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
|
|
21173
21230
|
var ASTPreTrainedModel = class extends PreTrainedModel {
|
|
21174
21231
|
};
|
|
21175
21232
|
var ASTModel = class extends ASTPreTrainedModel {
|
|
@@ -21504,6 +21561,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
|
|
|
21504
21561
|
var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
|
|
21505
21562
|
};
|
|
21506
21563
|
|
|
21564
|
+
// src/models/chmv2/modeling_chmv2.js
|
|
21565
|
+
// CHMv2PreTrainedModel: extends PreTrainedModel and declares no members of its own; CHMv2ForDepthEstimation derives from it.
var CHMv2PreTrainedModel = class extends PreTrainedModel {
|
|
21566
|
+
};
|
|
21567
|
+
// CHMv2ForDepthEstimation: extends CHMv2PreTrainedModel and declares no members of its own.
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
|
|
21568
|
+
};
|
|
21569
|
+
|
|
21507
21570
|
// src/models/clap/modeling_clap.js
|
|
21508
21571
|
var ClapPreTrainedModel = class extends PreTrainedModel {
|
|
21509
21572
|
};
|
|
@@ -21842,6 +21905,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
|
|
|
21842
21905
|
}
|
|
21843
21906
|
};
|
|
21844
21907
|
|
|
21908
|
+
// src/models/deepseek_v3/modeling_deepseek_v3.js
|
|
21909
|
+
// DeepseekV3PreTrainedModel: extends PreTrainedModel and declares no members of its own; the other DeepseekV3 classes derive from it.
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
|
|
21910
|
+
};
|
|
21911
|
+
// DeepseekV3Model: extends DeepseekV3PreTrainedModel and declares no members of its own.
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
|
|
21912
|
+
};
|
|
21913
|
+
// DeepseekV3ForCausalLM: extends DeepseekV3PreTrainedModel and declares no members of its own.
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
|
|
21914
|
+
};
|
|
21915
|
+
|
|
21845
21916
|
// src/models/deberta_v2/modeling_deberta_v2.js
|
|
21846
21917
|
var DebertaV2PreTrainedModel = class extends PreTrainedModel {
|
|
21847
21918
|
};
|
|
@@ -22190,6 +22261,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
|
|
|
22190
22261
|
}
|
|
22191
22262
|
};
|
|
22192
22263
|
|
|
22264
|
+
// src/models/eurobert/modeling_eurobert.js
|
|
22265
|
+
// EuroBertPreTrainedModel: extends PreTrainedModel and declares no members of its own; the other EuroBert classes derive from it.
var EuroBertPreTrainedModel = class extends PreTrainedModel {
|
|
22266
|
+
};
|
|
22267
|
+
// EuroBertModel: extends EuroBertPreTrainedModel and declares no members of its own.
var EuroBertModel = class extends EuroBertPreTrainedModel {
|
|
22268
|
+
};
|
|
22269
|
+
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` on the given inputs and wraps what it resolves to
   * in a `MaskedLMOutput`.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<MaskedLMOutput>} The wrapped output for masked language modeling.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new MaskedLMOutput(outputs);
  }
};
|
|
22280
|
+
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` on the given inputs and wraps what it resolves to
   * in a `SequenceClassifierOutput`.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<SequenceClassifierOutput>} The wrapped output for sequence classification.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new SequenceClassifierOutput(outputs);
  }
};
|
|
22291
|
+
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` on the given inputs and wraps what it resolves to
   * in a `TokenClassifierOutput`.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<TokenClassifierOutput>} The wrapped output for token classification.
   */
  async _call(model_inputs) {
    const outputs = await super._call(model_inputs);
    return new TokenClassifierOutput(outputs);
  }
};
|
|
22302
|
+
|
|
22193
22303
|
// src/models/exaone/modeling_exaone.js
|
|
22194
22304
|
var ExaonePreTrainedModel = class extends PreTrainedModel {
|
|
22195
22305
|
};
|
|
@@ -22465,18 +22575,389 @@ var GlmModel = class extends GlmPreTrainedModel {
|
|
|
22465
22575
|
var GlmForCausalLM = class extends GlmPreTrainedModel {
|
|
22466
22576
|
};
|
|
22467
22577
|
|
|
22468
|
-
// src/models/
|
|
22469
|
-
var
|
|
22578
|
+
// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
|
|
22579
|
+
// GlmMoeDsaPreTrainedModel: extends PreTrainedModel and declares no members of its own; the other GlmMoeDsa classes derive from it.
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
|
|
22470
22580
|
};
|
|
22471
|
-
var
|
|
22581
|
+
// GlmMoeDsaModel: extends GlmMoeDsaPreTrainedModel and declares no members of its own.
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
|
|
22472
22582
|
};
|
|
22473
|
-
var
|
|
22583
|
+
// GlmMoeDsaForCausalLM: extends GlmMoeDsaPreTrainedModel and declares no members of its own.
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
|
|
22474
22584
|
};
|
|
22475
22585
|
|
|
22476
|
-
// src/models/
|
|
22477
|
-
var
|
|
22478
|
-
|
|
22479
|
-
|
|
22586
|
+
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
22587
|
+
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
  // Input names exposed through `forward_params`, grouped as text inputs followed by vision inputs.
  forward_params = [
    ...["input_ids", "attention_mask", "position_ids", "past_key_values"],
    ...["pixel_values", "image_grid_thw"]
  ];
};
|
|
22599
|
+
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
22600
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
22601
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
22602
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
22603
|
+
image_grid_thw_name = "grid_thw";
|
|
22604
|
+
/**
|
|
22605
|
+
* Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
|
|
22606
|
+
* @param {Tensor} input_ids
|
|
22607
|
+
* @param {Tensor} attention_mask
|
|
22608
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
22609
|
+
*/
|
|
22610
|
+
_get_text_only_rope_index(input_ids, attention_mask) {
|
|
22611
|
+
if (attention_mask) {
|
|
22612
|
+
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
22613
|
+
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
22614
|
+
const mrope_position_deltas = Array.from(
|
|
22615
|
+
{ length: dims[0] },
|
|
22616
|
+
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
22617
|
+
);
|
|
22618
|
+
return [
|
|
22619
|
+
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
22620
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
22621
|
+
];
|
|
22622
|
+
} else {
|
|
22623
|
+
const [batch_size, seq_length] = input_ids.dims;
|
|
22624
|
+
const position_ids = BigInt64Array.from(
|
|
22625
|
+
{ length: 3 * batch_size * seq_length },
|
|
22626
|
+
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
22627
|
+
);
|
|
22628
|
+
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
22629
|
+
}
|
|
22630
|
+
}
|
|
22631
|
+
/**
|
|
22632
|
+
* Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
|
|
22633
|
+
* global [all_t, all_h, all_w] order, then write back into the position_ids array
|
|
22634
|
+
* respecting attention mask.
|
|
22635
|
+
* @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
|
|
22636
|
+
* @param {number[]} attn_mask Attention mask for this batch element
|
|
22637
|
+
* @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
|
|
22638
|
+
* @param {number} batch_idx Current batch index
|
|
22639
|
+
* @returns {number[]} Flat reordered positions of length total_len
|
|
22640
|
+
*/
|
|
22641
|
+
_reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
|
|
22642
|
+
const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
22643
|
+
const llm_positions = new Array(total_len);
|
|
22644
|
+
let index = 0;
|
|
22645
|
+
for (let x = 0; x < 3; ++x) {
|
|
22646
|
+
for (const val of llm_pos_ids_list) {
|
|
22647
|
+
const seg_len = val.length / 3;
|
|
22648
|
+
for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
|
|
22649
|
+
llm_positions[index++] = val[z];
|
|
22650
|
+
}
|
|
22651
|
+
}
|
|
22652
|
+
}
|
|
22653
|
+
let count2 = 0;
|
|
22654
|
+
for (let y = 0; y < attn_mask.length; ++y) {
|
|
22655
|
+
if (attn_mask[y] == 1) {
|
|
22656
|
+
for (let x = 0; x < 3; ++x) {
|
|
22657
|
+
position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
|
|
22658
|
+
}
|
|
22659
|
+
++count2;
|
|
22660
|
+
}
|
|
22661
|
+
}
|
|
22662
|
+
return llm_positions;
|
|
22663
|
+
}
|
|
22664
|
+
/**
|
|
22665
|
+
* Build per-batch position ID segments for multimodal rope.
|
|
22666
|
+
* Override this in subclasses to change how vision/text segments are identified and positioned.
|
|
22667
|
+
* @param {object} params
|
|
22668
|
+
* @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
|
|
22669
|
+
* @param {any[][]} params.image_grid_thw_list - all image grid dimensions
|
|
22670
|
+
* @param {any[][]} params.video_grid_thw_list - all video grid dimensions
|
|
22671
|
+
* @param {number} params.spatial_merge_size
|
|
22672
|
+
* @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
|
|
22673
|
+
* @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
|
|
22674
|
+
*/
|
|
22675
|
+
_get_multimodal_rope_positions({
|
|
22676
|
+
filtered_ids,
|
|
22677
|
+
image_grid_thw_list,
|
|
22678
|
+
video_grid_thw_list,
|
|
22679
|
+
spatial_merge_size,
|
|
22680
|
+
state
|
|
22681
|
+
}) {
|
|
22682
|
+
const { image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
22683
|
+
const ids = filtered_ids;
|
|
22684
|
+
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
22685
|
+
if (x == vision_start_token_id) acc.push(idx);
|
|
22686
|
+
return acc;
|
|
22687
|
+
}, []);
|
|
22688
|
+
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
22689
|
+
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
22690
|
+
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
22691
|
+
const llm_pos_ids_list = [];
|
|
22692
|
+
let st = 0;
|
|
22693
|
+
let remain_images = image_nums;
|
|
22694
|
+
let remain_videos = video_nums;
|
|
22695
|
+
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
22696
|
+
const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id);
|
|
22697
|
+
const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id);
|
|
22698
|
+
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
22699
|
+
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
22700
|
+
let ed;
|
|
22701
|
+
let t, h, w;
|
|
22702
|
+
if (ed_image < ed_video) {
|
|
22703
|
+
[t, h, w] = image_grid_thw_list[state.image_index];
|
|
22704
|
+
++state.image_index;
|
|
22705
|
+
--remain_images;
|
|
22706
|
+
ed = ed_image;
|
|
22707
|
+
} else {
|
|
22708
|
+
[t, h, w] = video_grid_thw_list[state.video_index];
|
|
22709
|
+
++state.video_index;
|
|
22710
|
+
--remain_videos;
|
|
22711
|
+
ed = ed_video;
|
|
22712
|
+
}
|
|
22713
|
+
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
22714
|
+
Number(t),
|
|
22715
|
+
Math.floor(Number(h) / spatial_merge_size),
|
|
22716
|
+
Math.floor(Number(w) / spatial_merge_size)
|
|
22717
|
+
];
|
|
22718
|
+
const text_len = ed - st;
|
|
22719
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
22720
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
22721
|
+
const offset = text_len + st_idx;
|
|
22722
|
+
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
22723
|
+
const t_index = Array.from(
|
|
22724
|
+
{ length: grid_size },
|
|
22725
|
+
(_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
|
|
22726
|
+
);
|
|
22727
|
+
const h_index = Array.from(
|
|
22728
|
+
{ length: grid_size },
|
|
22729
|
+
(_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
|
|
22730
|
+
);
|
|
22731
|
+
const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
|
|
22732
|
+
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
22733
|
+
st = ed + grid_size;
|
|
22734
|
+
}
|
|
22735
|
+
if (st < ids.length) {
|
|
22736
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
22737
|
+
const text_len = ids.length - st;
|
|
22738
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
22739
|
+
}
|
|
22740
|
+
return llm_pos_ids_list;
|
|
22741
|
+
}
|
|
22742
|
+
/**
|
|
22743
|
+
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
22744
|
+
*
|
|
22745
|
+
* Explanation:
|
|
22746
|
+
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
22747
|
+
*
|
|
22748
|
+
* For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
|
|
22749
|
+
* Examples:
|
|
22750
|
+
* input_ids: [T T T T T], here T is for text.
|
|
22751
|
+
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
22752
|
+
* height position_ids: [0, 1, 2, 3, 4]
|
|
22753
|
+
* width position_ids: [0, 1, 2, 3, 4]
|
|
22754
|
+
*
|
|
22755
|
+
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
22756
|
+
* and 1D rotary position embeddin for text part.
|
|
22757
|
+
* Examples:
|
|
22758
|
+
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
22759
|
+
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
22760
|
+
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
22761
|
+
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
22762
|
+
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
22763
|
+
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
22764
|
+
* text height position_ids: [3, 4, 5, 6, 7]
|
|
22765
|
+
* text width position_ids: [3, 4, 5, 6, 7]
|
|
22766
|
+
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
22767
|
+
*
|
|
22768
|
+
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
22769
|
+
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
22770
|
+
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
22771
|
+
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
|
|
22772
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
22773
|
+
*/
|
|
22774
|
+
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
22775
|
+
const { vision_config } = this.config;
|
|
22776
|
+
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
22777
|
+
if (image_grid_thw || video_grid_thw) {
|
|
22778
|
+
const total_input_ids = input_ids.tolist();
|
|
22779
|
+
if (!attention_mask) {
|
|
22780
|
+
attention_mask = ones_like(input_ids);
|
|
22781
|
+
}
|
|
22782
|
+
const attention_mask_list = attention_mask.tolist();
|
|
22783
|
+
const position_ids_list = Array.from(
|
|
22784
|
+
{ length: 3 },
|
|
22785
|
+
() => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
|
|
22786
|
+
);
|
|
22787
|
+
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
22788
|
+
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
22789
|
+
const state = { image_index: 0, video_index: 0 };
|
|
22790
|
+
const mrope_position_deltas = [];
|
|
22791
|
+
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
22792
|
+
const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
22793
|
+
const llm_pos_ids_list = this._get_multimodal_rope_positions({
|
|
22794
|
+
filtered_ids,
|
|
22795
|
+
image_grid_thw_list,
|
|
22796
|
+
video_grid_thw_list,
|
|
22797
|
+
spatial_merge_size,
|
|
22798
|
+
state
|
|
22799
|
+
});
|
|
22800
|
+
const llm_positions = this._reorder_and_write_positions(
|
|
22801
|
+
llm_pos_ids_list,
|
|
22802
|
+
attention_mask_list[i],
|
|
22803
|
+
position_ids_list,
|
|
22804
|
+
i
|
|
22805
|
+
);
|
|
22806
|
+
mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
|
|
22807
|
+
}
|
|
22808
|
+
return [
|
|
22809
|
+
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
22810
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
22811
|
+
];
|
|
22812
|
+
} else {
|
|
22813
|
+
return this._get_text_only_rope_index(input_ids, attention_mask);
|
|
22814
|
+
}
|
|
22815
|
+
}
|
|
22816
|
+
async encode_image({ pixel_values, image_grid_thw }) {
|
|
22817
|
+
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
22818
|
+
pixel_values,
|
|
22819
|
+
[this.image_grid_thw_name]: image_grid_thw
|
|
22820
|
+
})).image_features;
|
|
22821
|
+
return features;
|
|
22822
|
+
}
|
|
22823
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
22824
|
+
return default_merge_input_ids_with_image_features({
|
|
22825
|
+
// @ts-ignore
|
|
22826
|
+
image_token_id: this.config.image_token_id,
|
|
22827
|
+
...kwargs
|
|
22828
|
+
});
|
|
22829
|
+
}
|
|
22830
|
+
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
22831
|
+
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
22832
|
+
if (!model_inputs.past_key_values) {
|
|
22833
|
+
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
22834
|
+
model_inputs.input_ids,
|
|
22835
|
+
model_inputs.image_grid_thw,
|
|
22836
|
+
model_inputs.video_grid_thw,
|
|
22837
|
+
model_inputs.attention_mask
|
|
22838
|
+
);
|
|
22839
|
+
} else {
|
|
22840
|
+
model_inputs.pixel_values = null;
|
|
22841
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
22842
|
+
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
22843
|
+
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
22844
|
+
model_inputs.input_ids,
|
|
22845
|
+
model_inputs.image_grid_thw,
|
|
22846
|
+
model_inputs.video_grid_thw,
|
|
22847
|
+
model_inputs.attention_mask
|
|
22848
|
+
);
|
|
22849
|
+
model_inputs.rope_deltas = rope_deltas;
|
|
22850
|
+
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
22851
|
+
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
22852
|
+
} else {
|
|
22853
|
+
if (!model_inputs.rope_deltas) {
|
|
22854
|
+
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
22855
|
+
model_inputs.input_ids,
|
|
22856
|
+
model_inputs.image_grid_thw,
|
|
22857
|
+
model_inputs.video_grid_thw,
|
|
22858
|
+
model_inputs.attention_mask
|
|
22859
|
+
);
|
|
22860
|
+
}
|
|
22861
|
+
const delta = BigInt(past_length);
|
|
22862
|
+
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
22863
|
+
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
22864
|
+
}
|
|
22865
|
+
}
|
|
22866
|
+
}
|
|
22867
|
+
return model_inputs;
|
|
22868
|
+
}
|
|
22869
|
+
};
|
|
22870
|
+
// Qwen2VLForCausalLM: extends Qwen2VLForConditionalGeneration and declares no members of its own.
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
22871
|
+
};
|
|
22872
|
+
|
|
22873
|
+
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
22874
|
+
// Qwen2_5_VLForConditionalGeneration: extends Qwen2VLForConditionalGeneration and only overrides
// `image_grid_thw_name` ("image_grid_thw" instead of the base class's "grid_thw"). `encode_image`
// uses that field as the input name for the grid tensor passed to the vision encoder session.
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
22875
|
+
image_grid_thw_name = "image_grid_thw";
|
|
22876
|
+
};
|
|
22877
|
+
// Qwen2_5_VLForCausalLM: extends Qwen2VLForCausalLM and only overrides `image_grid_thw_name`
// with "image_grid_thw".
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
22878
|
+
image_grid_thw_name = "image_grid_thw";
|
|
22879
|
+
};
|
|
22880
|
+
|
|
22881
|
+
// src/models/glm_ocr/modeling_glm_ocr.js
|
|
22882
|
+
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
22883
|
+
/**
|
|
22884
|
+
* Compute 3D positional indices for vision tokens.
|
|
22885
|
+
* Temporal is constant, height is repeat-interleaved, width tiles.
|
|
22886
|
+
* @param {number} start_position
|
|
22887
|
+
* @param {number[]} grid_thw [T, H, W]
|
|
22888
|
+
* @param {number} temp_merge_size
|
|
22889
|
+
* @param {number} spatial_merge_size
|
|
22890
|
+
* @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
|
|
22891
|
+
*/
|
|
22892
|
+
get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
|
|
22893
|
+
const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
|
|
22894
|
+
const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
|
|
22895
|
+
const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
|
|
22896
|
+
const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
|
|
22897
|
+
const t_pos = Array.from({ length: seq_len }, () => start_position);
|
|
22898
|
+
const h_pos = Array.from(
|
|
22899
|
+
{ length: seq_len },
|
|
22900
|
+
(_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
|
|
22901
|
+
);
|
|
22902
|
+
const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
|
|
22903
|
+
return [...t_pos, ...h_pos, ...w_pos];
|
|
22904
|
+
}
|
|
22905
|
+
/**
|
|
22906
|
+
* GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
|
|
22907
|
+
* instead of vision_start_token_id scanning used by Qwen2VL.
|
|
22908
|
+
* After a vision segment, position advances by max(h, w) / spatial_merge_size.
|
|
22909
|
+
*/
|
|
22910
|
+
_get_multimodal_rope_positions({
|
|
22911
|
+
filtered_ids,
|
|
22912
|
+
image_grid_thw_list,
|
|
22913
|
+
video_grid_thw_list,
|
|
22914
|
+
spatial_merge_size,
|
|
22915
|
+
state
|
|
22916
|
+
}) {
|
|
22917
|
+
const { image_token_id } = this.config;
|
|
22918
|
+
const groups = [];
|
|
22919
|
+
let group_start = 0;
|
|
22920
|
+
let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
|
|
22921
|
+
for (let j = 1; j <= filtered_ids.length; ++j) {
|
|
22922
|
+
const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
|
|
22923
|
+
if (t !== current_type) {
|
|
22924
|
+
groups.push([current_type, group_start, j]);
|
|
22925
|
+
group_start = j;
|
|
22926
|
+
current_type = t;
|
|
22927
|
+
}
|
|
22928
|
+
}
|
|
22929
|
+
let current_pos = 0;
|
|
22930
|
+
const llm_pos_ids_list = [];
|
|
22931
|
+
for (const [modality_type, start_idx, end_idx] of groups) {
|
|
22932
|
+
if (modality_type === 0) {
|
|
22933
|
+
const text_len = end_idx - start_idx;
|
|
22934
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
|
|
22935
|
+
current_pos += text_len;
|
|
22936
|
+
} else {
|
|
22937
|
+
const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
|
|
22938
|
+
const temp_merge_size = grid_thw[0];
|
|
22939
|
+
llm_pos_ids_list.push(
|
|
22940
|
+
this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
|
|
22941
|
+
);
|
|
22942
|
+
current_pos += Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size;
|
|
22943
|
+
}
|
|
22944
|
+
}
|
|
22945
|
+
return llm_pos_ids_list;
|
|
22946
|
+
}
|
|
22947
|
+
};
|
|
22948
|
+
|
|
22949
|
+
// src/models/glpn/modeling_glpn.js
|
|
22950
|
+
// GLPNPreTrainedModel: extends PreTrainedModel and declares no members of its own; the other GLPN classes derive from it.
var GLPNPreTrainedModel = class extends PreTrainedModel {
|
|
22951
|
+
};
|
|
22952
|
+
// GLPNModel: extends GLPNPreTrainedModel and declares no members of its own.
var GLPNModel = class extends GLPNPreTrainedModel {
|
|
22953
|
+
};
|
|
22954
|
+
// GLPNForDepthEstimation: extends GLPNPreTrainedModel and declares no members of its own.
var GLPNForDepthEstimation = class extends GLPNPreTrainedModel {
|
|
22955
|
+
};
|
|
22956
|
+
|
|
22957
|
+
// src/models/gpt_bigcode/modeling_gpt_bigcode.js
|
|
22958
|
+
// GPTBigCodePreTrainedModel: extends PreTrainedModel and declares no members of its own; the other GPTBigCode classes derive from it.
var GPTBigCodePreTrainedModel = class extends PreTrainedModel {
|
|
22959
|
+
};
|
|
22960
|
+
// GPTBigCodeModel: extends GPTBigCodePreTrainedModel and declares no members of its own.
var GPTBigCodeModel = class extends GPTBigCodePreTrainedModel {
|
|
22480
22961
|
};
|
|
22481
22962
|
var GPTBigCodeForCausalLM = class extends GPTBigCodePreTrainedModel {
|
|
22482
22963
|
};
|
|
@@ -22777,6 +23258,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
|
|
|
22777
23258
|
var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
|
|
22778
23259
|
};
|
|
22779
23260
|
|
|
23261
|
+
// src/models/lighton_ocr/modeling_lighton_ocr.js
|
|
23262
|
+
// LightOnOcrForConditionalGeneration: extends LlavaForConditionalGeneration and declares no members of its own.
var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
23263
|
+
};
|
|
23264
|
+
|
|
22780
23265
|
// src/models/lfm2_moe/modeling_lfm2_moe.js
|
|
22781
23266
|
var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
|
|
22782
23267
|
};
|
|
@@ -22973,6 +23458,14 @@ var MistralModel = class extends MistralPreTrainedModel {
|
|
|
22973
23458
|
var MistralForCausalLM = class extends MistralPreTrainedModel {
|
|
22974
23459
|
};
|
|
22975
23460
|
|
|
23461
|
+
// src/models/mistral4/modeling_mistral4.js
|
|
23462
|
+
// Mistral4PreTrainedModel: extends PreTrainedModel and declares no members of its own; the other Mistral4 classes derive from it.
var Mistral4PreTrainedModel = class extends PreTrainedModel {
|
|
23463
|
+
};
|
|
23464
|
+
// Mistral4Model: extends Mistral4PreTrainedModel and declares no members of its own.
var Mistral4Model = class extends Mistral4PreTrainedModel {
|
|
23465
|
+
};
|
|
23466
|
+
// Mistral4ForCausalLM: extends Mistral4PreTrainedModel and declares no members of its own.
var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
|
|
23467
|
+
};
|
|
23468
|
+
|
|
22976
23469
|
// src/models/mobilebert/modeling_mobilebert.js
|
|
22977
23470
|
var MobileBertPreTrainedModel = class extends PreTrainedModel {
|
|
22978
23471
|
};
|
|
@@ -23441,6 +23934,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
|
|
|
23441
23934
|
var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
|
|
23442
23935
|
};
|
|
23443
23936
|
|
|
23937
|
+
// src/models/nemotron_h/modeling_nemotron_h.js
|
|
23938
|
+
// NemotronHPreTrainedModel: extends PreTrainedModel and declares no members of its own; the other NemotronH classes derive from it.
var NemotronHPreTrainedModel = class extends PreTrainedModel {
|
|
23939
|
+
};
|
|
23940
|
+
// NemotronHModel: extends NemotronHPreTrainedModel and declares no members of its own.
var NemotronHModel = class extends NemotronHPreTrainedModel {
|
|
23941
|
+
};
|
|
23942
|
+
// NemotronHForCausalLM: extends NemotronHPreTrainedModel and declares no members of its own.
var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
|
|
23943
|
+
};
|
|
23944
|
+
|
|
23444
23945
|
// src/models/neobert/modeling_neobert.js
|
|
23445
23946
|
var NeoBertPreTrainedModel = class extends PreTrainedModel {
|
|
23446
23947
|
};
|
|
@@ -23721,252 +24222,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
|
|
|
23721
24222
|
var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
|
|
23722
24223
|
};
|
|
23723
24224
|
|
|
23724
|
-
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
23725
|
-
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
23726
|
-
forward_params = [
|
|
23727
|
-
// Text inputs
|
|
23728
|
-
"input_ids",
|
|
23729
|
-
"attention_mask",
|
|
23730
|
-
"position_ids",
|
|
23731
|
-
"past_key_values",
|
|
23732
|
-
// Vision inputs
|
|
23733
|
-
"pixel_values",
|
|
23734
|
-
"image_grid_thw"
|
|
23735
|
-
];
|
|
23736
|
-
};
|
|
23737
|
-
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
23738
|
-
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
23739
|
-
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
23740
|
-
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
23741
|
-
image_grid_thw_name = "grid_thw";
|
|
23742
|
-
/**
|
|
23743
|
-
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
23744
|
-
*
|
|
23745
|
-
* Explanation:
|
|
23746
|
-
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
23747
|
-
*
|
|
23748
|
-
* For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
|
|
23749
|
-
* Examples:
|
|
23750
|
-
* input_ids: [T T T T T], here T is for text.
|
|
23751
|
-
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
23752
|
-
* height position_ids: [0, 1, 2, 3, 4]
|
|
23753
|
-
* width position_ids: [0, 1, 2, 3, 4]
|
|
23754
|
-
*
|
|
23755
|
-
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
23756
|
-
* and 1D rotary position embeddin for text part.
|
|
23757
|
-
* Examples:
|
|
23758
|
-
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
23759
|
-
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
23760
|
-
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
23761
|
-
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
23762
|
-
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
23763
|
-
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
23764
|
-
* text height position_ids: [3, 4, 5, 6, 7]
|
|
23765
|
-
* text width position_ids: [3, 4, 5, 6, 7]
|
|
23766
|
-
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
23767
|
-
*
|
|
23768
|
-
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
23769
|
-
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
23770
|
-
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
23771
|
-
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
|
|
23772
|
-
* - 1 for tokens that are **not masked**,
|
|
23773
|
-
* - 0 for tokens that are **masked**.
|
|
23774
|
-
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
|
|
23775
|
-
* - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
|
|
23776
|
-
* - mrope_position_deltas: Tensor of shape `(batch_size)`.
|
|
23777
|
-
*/
|
|
23778
|
-
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
23779
|
-
const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
23780
|
-
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
23781
|
-
const mrope_position_deltas = [];
|
|
23782
|
-
if (image_grid_thw || video_grid_thw) {
|
|
23783
|
-
let total_input_ids = input_ids.tolist();
|
|
23784
|
-
if (!attention_mask) {
|
|
23785
|
-
attention_mask = ones_like(input_ids);
|
|
23786
|
-
}
|
|
23787
|
-
const attention_mask_list = attention_mask.tolist();
|
|
23788
|
-
const position_ids_list = Array.from(
|
|
23789
|
-
{ length: 3 },
|
|
23790
|
-
(_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
|
|
23791
|
-
);
|
|
23792
|
-
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
23793
|
-
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
23794
|
-
let image_index = 0;
|
|
23795
|
-
let video_index = 0;
|
|
23796
|
-
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
23797
|
-
const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
23798
|
-
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
23799
|
-
if (x == vision_start_token_id) acc.push(idx);
|
|
23800
|
-
return acc;
|
|
23801
|
-
}, []);
|
|
23802
|
-
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
23803
|
-
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
23804
|
-
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
23805
|
-
let llm_pos_ids_list = [];
|
|
23806
|
-
let st = 0;
|
|
23807
|
-
let remain_images = image_nums;
|
|
23808
|
-
let remain_videos = video_nums;
|
|
23809
|
-
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
23810
|
-
const next_image_token = ids.findIndex((x, i2) => i2 > st && x == image_token_id);
|
|
23811
|
-
const next_video_token = ids.findIndex((x, i2) => i2 > st && x == video_token_id);
|
|
23812
|
-
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
23813
|
-
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
23814
|
-
let ed;
|
|
23815
|
-
let t, h, w;
|
|
23816
|
-
if (ed_image < ed_video) {
|
|
23817
|
-
[t, h, w] = image_grid_thw_list[image_index];
|
|
23818
|
-
++image_index;
|
|
23819
|
-
--remain_images;
|
|
23820
|
-
ed = ed_image;
|
|
23821
|
-
} else {
|
|
23822
|
-
[t, h, w] = video_grid_thw_list[video_index];
|
|
23823
|
-
++video_index;
|
|
23824
|
-
--remain_videos;
|
|
23825
|
-
ed = ed_video;
|
|
23826
|
-
}
|
|
23827
|
-
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
23828
|
-
Number(t),
|
|
23829
|
-
Math.floor(Number(h) / spatial_merge_size),
|
|
23830
|
-
Math.floor(Number(w) / spatial_merge_size)
|
|
23831
|
-
];
|
|
23832
|
-
const text_len = ed - st;
|
|
23833
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
23834
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
23835
|
-
const offset = text_len + st_idx;
|
|
23836
|
-
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
23837
|
-
const t_index = Array.from(
|
|
23838
|
-
{ length: grid_size },
|
|
23839
|
-
(_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
|
|
23840
|
-
);
|
|
23841
|
-
const h_index = Array.from(
|
|
23842
|
-
{ length: grid_size },
|
|
23843
|
-
(_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
|
|
23844
|
-
);
|
|
23845
|
-
const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
|
|
23846
|
-
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
23847
|
-
st = ed + grid_size;
|
|
23848
|
-
}
|
|
23849
|
-
if (st < ids.length) {
|
|
23850
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
23851
|
-
const text_len = ids.length - st;
|
|
23852
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
23853
|
-
}
|
|
23854
|
-
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
23855
|
-
const llm_positions = new Array(num_items);
|
|
23856
|
-
let index = 0;
|
|
23857
|
-
for (let x = 0; x < 3; ++x) {
|
|
23858
|
-
for (let y = 0; y < llm_pos_ids_list.length; ++y) {
|
|
23859
|
-
const val = llm_pos_ids_list[y];
|
|
23860
|
-
const text_len = val.length / 3;
|
|
23861
|
-
for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
|
|
23862
|
-
llm_positions[index++] = val[z];
|
|
23863
|
-
}
|
|
23864
|
-
}
|
|
23865
|
-
}
|
|
23866
|
-
let count2 = 0;
|
|
23867
|
-
const attn_mask = attention_mask_list[i];
|
|
23868
|
-
for (let y = 0; y < attn_mask.length; ++y) {
|
|
23869
|
-
if (attn_mask[y] == 1) {
|
|
23870
|
-
for (let x = 0; x < 3; ++x) {
|
|
23871
|
-
position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
|
|
23872
|
-
}
|
|
23873
|
-
++count2;
|
|
23874
|
-
}
|
|
23875
|
-
}
|
|
23876
|
-
const max_llm_positions = max(llm_positions)[0];
|
|
23877
|
-
mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
|
|
23878
|
-
}
|
|
23879
|
-
return [
|
|
23880
|
-
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
23881
|
-
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
23882
|
-
];
|
|
23883
|
-
} else {
|
|
23884
|
-
if (attention_mask) {
|
|
23885
|
-
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
23886
|
-
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
23887
|
-
const mrope_position_deltas2 = Array.from(
|
|
23888
|
-
{ length: dims[0] },
|
|
23889
|
-
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
23890
|
-
);
|
|
23891
|
-
return [
|
|
23892
|
-
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
23893
|
-
new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
|
|
23894
|
-
];
|
|
23895
|
-
} else {
|
|
23896
|
-
const [batch_size, seq_length] = input_ids.dims;
|
|
23897
|
-
const position_ids = BigInt64Array.from(
|
|
23898
|
-
{ length: 3 * batch_size * seq_length },
|
|
23899
|
-
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
23900
|
-
);
|
|
23901
|
-
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
23902
|
-
}
|
|
23903
|
-
}
|
|
23904
|
-
}
|
|
23905
|
-
async encode_image({ pixel_values, image_grid_thw }) {
|
|
23906
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
23907
|
-
pixel_values,
|
|
23908
|
-
[this.image_grid_thw_name]: image_grid_thw
|
|
23909
|
-
})).image_features;
|
|
23910
|
-
return features;
|
|
23911
|
-
}
|
|
23912
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
23913
|
-
return default_merge_input_ids_with_image_features({
|
|
23914
|
-
// @ts-ignore
|
|
23915
|
-
image_token_id: this.config.image_token_id,
|
|
23916
|
-
...kwargs
|
|
23917
|
-
});
|
|
23918
|
-
}
|
|
23919
|
-
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
23920
|
-
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
23921
|
-
if (!model_inputs.past_key_values) {
|
|
23922
|
-
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
23923
|
-
model_inputs.input_ids,
|
|
23924
|
-
model_inputs.image_grid_thw,
|
|
23925
|
-
model_inputs.video_grid_thw,
|
|
23926
|
-
model_inputs.attention_mask
|
|
23927
|
-
);
|
|
23928
|
-
} else {
|
|
23929
|
-
model_inputs.pixel_values = null;
|
|
23930
|
-
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
23931
|
-
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
23932
|
-
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
23933
|
-
model_inputs.input_ids,
|
|
23934
|
-
model_inputs.image_grid_thw,
|
|
23935
|
-
model_inputs.video_grid_thw,
|
|
23936
|
-
model_inputs.attention_mask
|
|
23937
|
-
);
|
|
23938
|
-
model_inputs.rope_deltas = rope_deltas;
|
|
23939
|
-
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
23940
|
-
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
23941
|
-
} else {
|
|
23942
|
-
if (!model_inputs.rope_deltas) {
|
|
23943
|
-
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
23944
|
-
model_inputs.input_ids,
|
|
23945
|
-
model_inputs.image_grid_thw,
|
|
23946
|
-
model_inputs.video_grid_thw,
|
|
23947
|
-
model_inputs.attention_mask
|
|
23948
|
-
);
|
|
23949
|
-
}
|
|
23950
|
-
const delta = BigInt(past_length);
|
|
23951
|
-
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
23952
|
-
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
23953
|
-
}
|
|
23954
|
-
}
|
|
23955
|
-
}
|
|
23956
|
-
return model_inputs;
|
|
23957
|
-
}
|
|
23958
|
-
};
|
|
23959
|
-
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
23960
|
-
};
|
|
23961
|
-
|
|
23962
|
-
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
23963
|
-
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
23964
|
-
image_grid_thw_name = "image_grid_thw";
|
|
23965
|
-
};
|
|
23966
|
-
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
23967
|
-
image_grid_thw_name = "image_grid_thw";
|
|
23968
|
-
};
|
|
23969
|
-
|
|
23970
24225
|
// src/models/qwen3/modeling_qwen3.js
|
|
23971
24226
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
23972
24227
|
};
|
|
@@ -24412,6 +24667,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
|
|
|
24412
24667
|
}
|
|
24413
24668
|
};
|
|
24414
24669
|
|
|
24670
|
+
// src/models/solar_open/modeling_solar_open.js
|
|
24671
|
+
var SolarOpenPreTrainedModel = class extends PreTrainedModel {
|
|
24672
|
+
};
|
|
24673
|
+
var SolarOpenModel = class extends SolarOpenPreTrainedModel {
|
|
24674
|
+
};
|
|
24675
|
+
var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
|
|
24676
|
+
};
|
|
24677
|
+
|
|
24415
24678
|
// src/models/speecht5/modeling_speecht5.js
|
|
24416
24679
|
var SpeechT5PreTrainedModel = class extends PreTrainedModel {
|
|
24417
24680
|
};
|
|
@@ -25528,6 +25791,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
|
|
|
25528
25791
|
// src/models/registry.js
|
|
25529
25792
|
var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
|
|
25530
25793
|
["bert", "BertModel"],
|
|
25794
|
+
["eurobert", "EuroBertModel"],
|
|
25531
25795
|
["neobert", "NeoBertModel"],
|
|
25532
25796
|
["modernbert", "ModernBertModel"],
|
|
25533
25797
|
["nomic_bert", "NomicBertModel"],
|
|
@@ -25659,6 +25923,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
25659
25923
|
["gemma3_text", "Gemma3Model"],
|
|
25660
25924
|
["helium", "HeliumModel"],
|
|
25661
25925
|
["glm", "GlmModel"],
|
|
25926
|
+
["glm_moe_dsa", "GlmMoeDsaModel"],
|
|
25662
25927
|
["openelm", "OpenELMModel"],
|
|
25663
25928
|
["qwen2", "Qwen2Model"],
|
|
25664
25929
|
["qwen2_moe", "Qwen2MoeModel"],
|
|
@@ -25670,12 +25935,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
25670
25935
|
["mpt", "MptModel"],
|
|
25671
25936
|
["opt", "OPTModel"],
|
|
25672
25937
|
["mistral", "MistralModel"],
|
|
25938
|
+
["mistral4", "Mistral4Model"],
|
|
25673
25939
|
["ministral", "MinistralModel"],
|
|
25674
25940
|
["ministral3", "Ministral3Model"],
|
|
25675
25941
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
25676
25942
|
["starcoder2", "Starcoder2Model"],
|
|
25943
|
+
["deepseek_v3", "DeepseekV3Model"],
|
|
25677
25944
|
["falcon", "FalconModel"],
|
|
25678
25945
|
["falcon_h1", "FalconH1Model"],
|
|
25946
|
+
["nemotron_h", "NemotronHModel"],
|
|
25947
|
+
["solar_open", "SolarOpenModel"],
|
|
25679
25948
|
["stablelm", "StableLmModel"],
|
|
25680
25949
|
["modernbert-decoder", "ModernBertDecoderModel"],
|
|
25681
25950
|
["hunyuan_v1_dense", "HunYuanDenseV1Model"],
|
|
@@ -25695,6 +25964,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25695
25964
|
]);
|
|
25696
25965
|
var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
25697
25966
|
["bert", "BertForSequenceClassification"],
|
|
25967
|
+
["eurobert", "EuroBertForSequenceClassification"],
|
|
25698
25968
|
["neobert", "NeoBertForSequenceClassification"],
|
|
25699
25969
|
["modernbert", "ModernBertForSequenceClassification"],
|
|
25700
25970
|
["roformer", "RoFormerForSequenceClassification"],
|
|
@@ -25717,6 +25987,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25717
25987
|
]);
|
|
25718
25988
|
var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
25719
25989
|
["bert", "BertForTokenClassification"],
|
|
25990
|
+
["eurobert", "EuroBertForTokenClassification"],
|
|
25720
25991
|
["neobert", "NeoBertForTokenClassification"],
|
|
25721
25992
|
["modernbert", "ModernBertForTokenClassification"],
|
|
25722
25993
|
["roformer", "RoFormerForTokenClassification"],
|
|
@@ -25779,6 +26050,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25779
26050
|
["gemma3", "Gemma3ForCausalLM"],
|
|
25780
26051
|
["helium", "HeliumForCausalLM"],
|
|
25781
26052
|
["glm", "GlmForCausalLM"],
|
|
26053
|
+
["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
|
|
25782
26054
|
["openelm", "OpenELMForCausalLM"],
|
|
25783
26055
|
["qwen2", "Qwen2ForCausalLM"],
|
|
25784
26056
|
["qwen2_moe", "Qwen2MoeForCausalLM"],
|
|
@@ -25798,13 +26070,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25798
26070
|
["opt", "OPTForCausalLM"],
|
|
25799
26071
|
["mbart", "MBartForCausalLM"],
|
|
25800
26072
|
["mistral", "MistralForCausalLM"],
|
|
26073
|
+
["mistral4", "Mistral4ForCausalLM"],
|
|
25801
26074
|
["ministral", "MinistralForCausalLM"],
|
|
25802
26075
|
["ministral3", "Ministral3ForCausalLM"],
|
|
25803
26076
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
25804
26077
|
["starcoder2", "Starcoder2ForCausalLM"],
|
|
26078
|
+
["deepseek_v3", "DeepseekV3ForCausalLM"],
|
|
25805
26079
|
["falcon", "FalconForCausalLM"],
|
|
25806
26080
|
["falcon_h1", "FalconH1ForCausalLM"],
|
|
26081
|
+
["nemotron_h", "NemotronHForCausalLM"],
|
|
25807
26082
|
["trocr", "TrOCRForCausalLM"],
|
|
26083
|
+
["solar_open", "SolarOpenForCausalLM"],
|
|
25808
26084
|
["stablelm", "StableLmForCausalLM"],
|
|
25809
26085
|
["modernbert-decoder", "ModernBertDecoderForCausalLM"],
|
|
25810
26086
|
["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
|
|
@@ -25815,6 +26091,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25815
26091
|
var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
|
|
25816
26092
|
var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
25817
26093
|
["bert", "BertForMaskedLM"],
|
|
26094
|
+
["eurobert", "EuroBertForMaskedLM"],
|
|
25818
26095
|
["neobert", "NeoBertForMaskedLM"],
|
|
25819
26096
|
["modernbert", "ModernBertForMaskedLM"],
|
|
25820
26097
|
["roformer", "RoFormerForMaskedLM"],
|
|
@@ -25873,7 +26150,9 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25873
26150
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
25874
26151
|
["llava_qwen2", "LlavaQwen2ForCausalLM"],
|
|
25875
26152
|
["gemma3n", "Gemma3nForConditionalGeneration"],
|
|
25876
|
-
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
26153
|
+
["mistral3", "Mistral3ForConditionalGeneration"],
|
|
26154
|
+
["lighton_ocr", "LightOnOcrForConditionalGeneration"],
|
|
26155
|
+
["glm_ocr", "GlmOcrForConditionalGeneration"]
|
|
25877
26156
|
]);
|
|
25878
26157
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
25879
26158
|
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
@@ -25978,6 +26257,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25978
26257
|
]);
|
|
25979
26258
|
var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
|
|
25980
26259
|
var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
26260
|
+
["chmv2", "CHMv2ForDepthEstimation"],
|
|
25981
26261
|
["dpt", "DPTForDepthEstimation"],
|
|
25982
26262
|
["depth_anything", "DepthAnythingForDepthEstimation"],
|
|
25983
26263
|
["glpn", "GLPNForDepthEstimation"],
|
|
@@ -28716,6 +28996,9 @@ export {
|
|
|
28716
28996
|
BloomModel,
|
|
28717
28997
|
BloomPreTrainedModel,
|
|
28718
28998
|
BloomTokenizer,
|
|
28999
|
+
CHMv2ForDepthEstimation,
|
|
29000
|
+
CHMv2ImageProcessor,
|
|
29001
|
+
CHMv2PreTrainedModel,
|
|
28719
29002
|
CLIPFeatureExtractor,
|
|
28720
29003
|
CLIPImageProcessor,
|
|
28721
29004
|
CLIPModel,
|
|
@@ -28811,6 +29094,9 @@ export {
|
|
|
28811
29094
|
DebertaV2Tokenizer,
|
|
28812
29095
|
DecisionTransformerModel,
|
|
28813
29096
|
DecisionTransformerPreTrainedModel,
|
|
29097
|
+
DeepseekV3ForCausalLM,
|
|
29098
|
+
DeepseekV3Model,
|
|
29099
|
+
DeepseekV3PreTrainedModel,
|
|
28814
29100
|
DeiTFeatureExtractor,
|
|
28815
29101
|
DeiTForImageClassification,
|
|
28816
29102
|
DeiTImageProcessor,
|
|
@@ -28871,6 +29157,11 @@ export {
|
|
|
28871
29157
|
EsmModel,
|
|
28872
29158
|
EsmPreTrainedModel,
|
|
28873
29159
|
EsmTokenizer,
|
|
29160
|
+
EuroBertForMaskedLM,
|
|
29161
|
+
EuroBertForSequenceClassification,
|
|
29162
|
+
EuroBertForTokenClassification,
|
|
29163
|
+
EuroBertModel,
|
|
29164
|
+
EuroBertPreTrainedModel,
|
|
28874
29165
|
ExaoneForCausalLM,
|
|
28875
29166
|
ExaoneModel,
|
|
28876
29167
|
ExaonePreTrainedModel,
|
|
@@ -28928,8 +29219,14 @@ export {
|
|
|
28928
29219
|
GemmaModel,
|
|
28929
29220
|
GemmaPreTrainedModel,
|
|
28930
29221
|
GemmaTokenizer,
|
|
29222
|
+
Glm46VImageProcessor,
|
|
29223
|
+
Glm46VProcessor,
|
|
28931
29224
|
GlmForCausalLM,
|
|
28932
29225
|
GlmModel,
|
|
29226
|
+
GlmMoeDsaForCausalLM,
|
|
29227
|
+
GlmMoeDsaModel,
|
|
29228
|
+
GlmMoeDsaPreTrainedModel,
|
|
29229
|
+
GlmOcrForConditionalGeneration,
|
|
28933
29230
|
GlmPreTrainedModel,
|
|
28934
29231
|
GptOssForCausalLM,
|
|
28935
29232
|
GptOssModel,
|
|
@@ -28995,6 +29292,7 @@ export {
|
|
|
28995
29292
|
Lfm2VlForConditionalGeneration,
|
|
28996
29293
|
Lfm2VlImageProcessor,
|
|
28997
29294
|
Lfm2VlProcessor,
|
|
29295
|
+
LightOnOcrForConditionalGeneration,
|
|
28998
29296
|
LiteWhisperForConditionalGeneration,
|
|
28999
29297
|
Llama4ForCausalLM,
|
|
29000
29298
|
Llama4PreTrainedModel,
|
|
@@ -29064,6 +29362,9 @@ export {
|
|
|
29064
29362
|
MimiPreTrainedModel,
|
|
29065
29363
|
MinLengthLogitsProcessor,
|
|
29066
29364
|
MinNewTokensLengthLogitsProcessor,
|
|
29365
|
+
Mistral4ForCausalLM,
|
|
29366
|
+
Mistral4Model,
|
|
29367
|
+
Mistral4PreTrainedModel,
|
|
29067
29368
|
MistralForCausalLM,
|
|
29068
29369
|
MistralModel,
|
|
29069
29370
|
MistralPreTrainedModel,
|
|
@@ -29135,6 +29436,9 @@ export {
|
|
|
29135
29436
|
NanoChatForCausalLM,
|
|
29136
29437
|
NanoChatModel,
|
|
29137
29438
|
NanoChatPreTrainedModel,
|
|
29439
|
+
NemotronHForCausalLM,
|
|
29440
|
+
NemotronHModel,
|
|
29441
|
+
NemotronHPreTrainedModel,
|
|
29138
29442
|
NeoBertForMaskedLM,
|
|
29139
29443
|
NeoBertForQuestionAnswering,
|
|
29140
29444
|
NeoBertForSequenceClassification,
|
|
@@ -29324,6 +29628,9 @@ export {
|
|
|
29324
29628
|
SnacFeatureExtractor,
|
|
29325
29629
|
SnacModel,
|
|
29326
29630
|
SnacPreTrainedModel,
|
|
29631
|
+
SolarOpenForCausalLM,
|
|
29632
|
+
SolarOpenModel,
|
|
29633
|
+
SolarOpenPreTrainedModel,
|
|
29327
29634
|
SpeechT5FeatureExtractor,
|
|
29328
29635
|
SpeechT5ForSpeechToText,
|
|
29329
29636
|
SpeechT5ForTextToSpeech,
|