@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +26 -26
- package/dist/transformers.js +1002 -587
- package/dist/transformers.min.js +23 -19
- package/dist/transformers.node.cjs +1030 -585
- package/dist/transformers.node.min.cjs +21 -17
- package/dist/transformers.node.min.mjs +21 -17
- package/dist/transformers.node.mjs +1000 -585
- package/dist/transformers.web.js +887 -472
- package/dist/transformers.web.min.js +21 -17
- package/package.json +3 -3
- package/src/configs.js +28 -22
- package/src/env.js +1 -1
- package/src/image_processors_utils.js +25 -15
- package/src/models/chmv2/image_processing_chmv2.js +3 -0
- package/src/models/chmv2/modeling_chmv2.js +4 -0
- package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
- package/src/models/eurobert/modeling_eurobert.js +41 -0
- package/src/models/gemma3/image_processing_gemma3.js +3 -0
- package/src/models/gemma3/modeling_gemma3.js +4 -1
- package/src/models/gemma3/processing_gemma3.js +45 -0
- package/src/models/glm46v/image_processing_glm46v.js +12 -0
- package/src/models/glm46v/processing_glm46v.js +5 -0
- package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
- package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
- package/src/models/image_processors.js +3 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
- package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
- package/src/models/mistral4/modeling_mistral4.js +5 -0
- package/src/models/modeling_utils.js +48 -25
- package/src/models/models.js +10 -1
- package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
- package/src/models/processors.js +2 -0
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +226 -168
- package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
- package/src/models/registry.js +19 -8
- package/src/models/solar_open/modeling_solar_open.js +5 -0
- package/src/pipelines.js +1 -0
- package/src/utils/hub.js +4 -1
- package/src/utils/model_registry/ModelRegistry.js +36 -0
- package/src/utils/model_registry/get_available_dtypes.js +68 -0
- package/src/utils/model_registry/get_file_metadata.js +1 -0
- package/src/utils/model_registry/get_model_files.js +7 -60
- package/src/utils/model_registry/resolve_model_type.js +66 -0
- package/types/configs.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +3 -2
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
- package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
- package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
- package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
- package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
- package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
- package/types/models/gemma3/image_processing_gemma3.d.ts +4 -0
- package/types/models/gemma3/image_processing_gemma3.d.ts.map +1 -0
- package/types/models/gemma3/modeling_gemma3.d.ts +4 -1
- package/types/models/gemma3/modeling_gemma3.d.ts.map +1 -1
- package/types/models/gemma3/processing_gemma3.d.ts +20 -0
- package/types/models/gemma3/processing_gemma3.d.ts.map +1 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
- package/types/models/glm46v/processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
- package/types/models/image_processors.d.ts +3 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
- package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
- package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
- package/types/models/modeling_utils.d.ts +2 -3
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +10 -1
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
- package/types/models/processors.d.ts +2 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
- package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
- package/types/pipelines.d.ts +1 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/model_registry/ModelRegistry.d.ts +27 -0
- package/types/utils/model_registry/ModelRegistry.d.ts.map +1 -1
- package/types/utils/model_registry/get_available_dtypes.d.ts +26 -0
- package/types/utils/model_registry/get_available_dtypes.d.ts.map +1 -0
- package/types/utils/model_registry/get_model_files.d.ts +25 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/model_registry/resolve_model_type.d.ts +24 -0
- package/types/utils/model_registry/resolve_model_type.d.ts.map +1 -0
- package/types/models/ast/modeling_ast.d.ts.map +0 -1
- /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
package/dist/transformers.web.js
CHANGED
|
@@ -14,7 +14,7 @@ var node_path_default = {};
|
|
|
14
14
|
var node_url_default = {};
|
|
15
15
|
|
|
16
16
|
// src/env.js
|
|
17
|
-
var VERSION = "4.0.0-next.7";
|
|
17
|
+
var VERSION = "4.0.0-next.9";
|
|
18
18
|
var HAS_SELF = typeof self !== "undefined";
|
|
19
19
|
var IS_FS_AVAILABLE = !isEmpty(node_fs_default);
|
|
20
20
|
var IS_PATH_AVAILABLE = !isEmpty(node_path_default);
|
|
@@ -244,7 +244,7 @@ var logger = {
|
|
|
244
244
|
}
|
|
245
245
|
};
|
|
246
246
|
|
|
247
|
-
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.
|
|
247
|
+
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
|
|
248
248
|
var DictionarySplitter = class {
|
|
249
249
|
/**
|
|
250
250
|
* @param dictionary The dictionary of words to use for splitting.
|
|
@@ -1900,10 +1900,10 @@ var BPE = class extends TokenizerModel_default {
|
|
|
1900
1900
|
);
|
|
1901
1901
|
if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
|
|
1902
1902
|
output_tokens.push(...byte_tokens);
|
|
1903
|
-
} else {
|
|
1903
|
+
} else if (this.unk_token != null) {
|
|
1904
1904
|
output_tokens.push(this.unk_token);
|
|
1905
1905
|
}
|
|
1906
|
-
} else {
|
|
1906
|
+
} else if (this.unk_token != null) {
|
|
1907
1907
|
output_tokens.push(this.unk_token);
|
|
1908
1908
|
}
|
|
1909
1909
|
}
|
|
@@ -6509,13 +6509,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
|
|
|
6509
6509
|
wrapped_progress
|
|
6510
6510
|
);
|
|
6511
6511
|
} else if (typeof response !== "string") {
|
|
6512
|
+
const headers = new Headers(response.headers);
|
|
6513
|
+
headers.set("content-length", result.byteLength.toString());
|
|
6512
6514
|
await cache2.put(
|
|
6513
6515
|
cacheKey,
|
|
6514
6516
|
new Response(
|
|
6515
6517
|
/** @type {any} */
|
|
6516
6518
|
result,
|
|
6517
6519
|
{
|
|
6518
|
-
headers
|
|
6520
|
+
headers
|
|
6519
6521
|
}
|
|
6520
6522
|
)
|
|
6521
6523
|
).catch((err) => {
|
|
@@ -11828,7 +11830,9 @@ var processors_exports = {};
|
|
|
11828
11830
|
__export(processors_exports, {
|
|
11829
11831
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
11830
11832
|
Florence2Processor: () => Florence2Processor,
|
|
11833
|
+
Gemma3Processor: () => Gemma3Processor,
|
|
11831
11834
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
11835
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
11832
11836
|
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
11833
11837
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
11834
11838
|
Idefics3Processor: () => Idefics3Processor,
|
|
@@ -14342,26 +14346,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
14342
14346
|
}
|
|
14343
14347
|
return [segmentation, segments];
|
|
14344
14348
|
}
|
|
14345
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
14349
|
+
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
|
|
14346
14350
|
if (height < factor || width < factor) {
|
|
14347
|
-
|
|
14348
|
-
|
|
14351
|
+
const scale = Math.max(factor / height, factor / width);
|
|
14352
|
+
height = Math.round(height * scale);
|
|
14353
|
+
width = Math.round(width * scale);
|
|
14354
|
+
}
|
|
14355
|
+
if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
14349
14356
|
throw new Error(
|
|
14350
14357
|
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
14351
14358
|
);
|
|
14352
14359
|
}
|
|
14353
14360
|
let h_bar = Math.round(height / factor) * factor;
|
|
14354
14361
|
let w_bar = Math.round(width / factor) * factor;
|
|
14355
|
-
if (h_bar * w_bar > max_pixels) {
|
|
14356
|
-
const beta = Math.sqrt(height * width / max_pixels);
|
|
14357
|
-
h_bar = Math.floor(height / beta / factor) * factor;
|
|
14358
|
-
w_bar = Math.floor(width / beta / factor) * factor;
|
|
14359
|
-
} else if (h_bar * w_bar < min_pixels) {
|
|
14360
|
-
const beta = Math.sqrt(min_pixels / (height * width));
|
|
14362
|
+
if (temporal_factor * h_bar * w_bar > max_pixels) {
|
|
14363
|
+
const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
|
|
14364
|
+
h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
|
|
14365
|
+
w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
|
|
14366
|
+
} else if (temporal_factor * h_bar * w_bar < min_pixels) {
|
|
14367
|
+
const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
|
|
14361
14368
|
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
14362
14369
|
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
14363
14370
|
}
|
|
14364
|
-
return [h_bar, w_bar];
|
|
14371
|
+
return [w_bar, h_bar];
|
|
14365
14372
|
}
|
|
14366
14373
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
14367
14374
|
if (label_ids_to_fuse === null) {
|
|
@@ -14440,7 +14447,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
14440
14447
|
this.do_pad = config.do_pad;
|
|
14441
14448
|
this.min_pixels = config.min_pixels;
|
|
14442
14449
|
this.max_pixels = config.max_pixels;
|
|
14443
|
-
if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
14450
|
+
if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
14444
14451
|
this.pad_size = this.size;
|
|
14445
14452
|
}
|
|
14446
14453
|
this.do_flip_channel_order = config.do_flip_channel_order ?? false;
|
|
@@ -14728,10 +14735,8 @@ var ImageProcessor = class extends Callable2 {
|
|
|
14728
14735
|
const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
|
|
14729
14736
|
[pixelData, imgDims] = padded;
|
|
14730
14737
|
} else if (this.size_divisibility) {
|
|
14731
|
-
const
|
|
14732
|
-
|
|
14733
|
-
this.size_divisibility
|
|
14734
|
-
);
|
|
14738
|
+
const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
|
|
14739
|
+
const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
|
|
14735
14740
|
[pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
|
|
14736
14741
|
}
|
|
14737
14742
|
}
|
|
@@ -14808,6 +14813,7 @@ var image_processors_exports = {};
|
|
|
14808
14813
|
__export(image_processors_exports, {
|
|
14809
14814
|
BeitFeatureExtractor: () => BeitFeatureExtractor,
|
|
14810
14815
|
BitImageProcessor: () => BitImageProcessor,
|
|
14816
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
14811
14817
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
14812
14818
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
14813
14819
|
ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
|
|
@@ -14824,6 +14830,8 @@ __export(image_processors_exports, {
|
|
|
14824
14830
|
DonutImageProcessor: () => DonutImageProcessor,
|
|
14825
14831
|
EfficientNetImageProcessor: () => EfficientNetImageProcessor,
|
|
14826
14832
|
GLPNFeatureExtractor: () => GLPNFeatureExtractor,
|
|
14833
|
+
Gemma3ImageProcessor: () => Gemma3ImageProcessor,
|
|
14834
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
14827
14835
|
GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
|
|
14828
14836
|
Idefics3ImageProcessor: () => Idefics3ImageProcessor,
|
|
14829
14837
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
@@ -14884,6 +14892,10 @@ var BitImageProcessor = class extends ImageProcessor {
|
|
|
14884
14892
|
var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
|
|
14885
14893
|
};
|
|
14886
14894
|
|
|
14895
|
+
// src/models/chmv2/image_processing_chmv2.js
|
|
14896
|
+
var CHMv2ImageProcessor = class extends ImageProcessor {
|
|
14897
|
+
};
|
|
14898
|
+
|
|
14887
14899
|
// src/models/clip/image_processing_clip.js
|
|
14888
14900
|
var CLIPImageProcessor = class extends ImageProcessor {
|
|
14889
14901
|
};
|
|
@@ -15003,6 +15015,69 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
|
|
|
15003
15015
|
}
|
|
15004
15016
|
};
|
|
15005
15017
|
|
|
15018
|
+
// src/models/gemma3/image_processing_gemma3.js
|
|
15019
|
+
var Gemma3ImageProcessor = class extends ImageProcessor {
|
|
15020
|
+
};
|
|
15021
|
+
|
|
15022
|
+
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
15023
|
+
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
15024
|
+
constructor(config) {
|
|
15025
|
+
super(config);
|
|
15026
|
+
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
15027
|
+
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
15028
|
+
this.patch_size = config.patch_size;
|
|
15029
|
+
this.merge_size = config.merge_size;
|
|
15030
|
+
}
|
|
15031
|
+
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
15032
|
+
get_resize_output_image_size(image, size) {
|
|
15033
|
+
const factor = this.patch_size * this.merge_size;
|
|
15034
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
15035
|
+
}
|
|
15036
|
+
async _call(images, ...args) {
|
|
15037
|
+
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
15038
|
+
let patches = pixel_values;
|
|
15039
|
+
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
15040
|
+
if (patches.dims[0] === 1) {
|
|
15041
|
+
patches = cat(
|
|
15042
|
+
Array.from({ length: temporal_patch_size }, () => patches),
|
|
15043
|
+
0
|
|
15044
|
+
);
|
|
15045
|
+
}
|
|
15046
|
+
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
15047
|
+
const channel = patches.dims[1];
|
|
15048
|
+
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
15049
|
+
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
15050
|
+
const flatten_patches = patches.view(
|
|
15051
|
+
grid_t,
|
|
15052
|
+
temporal_patch_size,
|
|
15053
|
+
channel,
|
|
15054
|
+
Math.floor(grid_h / merge_size),
|
|
15055
|
+
merge_size,
|
|
15056
|
+
patch_size,
|
|
15057
|
+
Math.floor(grid_w / merge_size),
|
|
15058
|
+
merge_size,
|
|
15059
|
+
patch_size
|
|
15060
|
+
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
15061
|
+
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
15062
|
+
return {
|
|
15063
|
+
pixel_values: flatten_patches,
|
|
15064
|
+
image_grid_thw,
|
|
15065
|
+
original_sizes,
|
|
15066
|
+
reshaped_input_sizes
|
|
15067
|
+
};
|
|
15068
|
+
}
|
|
15069
|
+
};
|
|
15070
|
+
|
|
15071
|
+
// src/models/glm46v/image_processing_glm46v.js
|
|
15072
|
+
var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
|
|
15073
|
+
/** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
|
|
15074
|
+
get_resize_output_image_size(image, size) {
|
|
15075
|
+
const factor = this.patch_size * this.merge_size;
|
|
15076
|
+
const temporal_factor = this.config.temporal_patch_size ?? 2;
|
|
15077
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
|
|
15078
|
+
}
|
|
15079
|
+
};
|
|
15080
|
+
|
|
15006
15081
|
// src/models/glpn/image_processing_glpn.js
|
|
15007
15082
|
var GLPNFeatureExtractor = class extends ImageProcessor {
|
|
15008
15083
|
};
|
|
@@ -15396,7 +15471,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
|
|
|
15396
15471
|
const img = pixel_values.unsqueeze_(0);
|
|
15397
15472
|
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
15398
15473
|
const f2 = total_factor ** 2;
|
|
15399
|
-
const [new_height, new_width] = smart_resize(
|
|
15474
|
+
const [new_width, new_height] = smart_resize(
|
|
15400
15475
|
Math.max(total_factor, height),
|
|
15401
15476
|
Math.max(total_factor, width),
|
|
15402
15477
|
total_factor,
|
|
@@ -15686,55 +15761,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
|
|
|
15686
15761
|
var PvtImageProcessor = class extends ImageProcessor {
|
|
15687
15762
|
};
|
|
15688
15763
|
|
|
15689
|
-
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
15690
|
-
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
15691
|
-
constructor(config) {
|
|
15692
|
-
super(config);
|
|
15693
|
-
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
15694
|
-
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
15695
|
-
this.patch_size = config.patch_size;
|
|
15696
|
-
this.merge_size = config.merge_size;
|
|
15697
|
-
}
|
|
15698
|
-
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
15699
|
-
get_resize_output_image_size(image, size) {
|
|
15700
|
-
const factor = this.patch_size * this.merge_size;
|
|
15701
|
-
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
15702
|
-
}
|
|
15703
|
-
async _call(images, ...args) {
|
|
15704
|
-
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
15705
|
-
let patches = pixel_values;
|
|
15706
|
-
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
15707
|
-
if (patches.dims[0] === 1) {
|
|
15708
|
-
patches = cat(
|
|
15709
|
-
Array.from({ length: temporal_patch_size }, () => patches),
|
|
15710
|
-
0
|
|
15711
|
-
);
|
|
15712
|
-
}
|
|
15713
|
-
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
15714
|
-
const channel = patches.dims[1];
|
|
15715
|
-
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
15716
|
-
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
15717
|
-
const flatten_patches = patches.view(
|
|
15718
|
-
grid_t,
|
|
15719
|
-
temporal_patch_size,
|
|
15720
|
-
channel,
|
|
15721
|
-
Math.floor(grid_h / merge_size),
|
|
15722
|
-
merge_size,
|
|
15723
|
-
patch_size,
|
|
15724
|
-
Math.floor(grid_w / merge_size),
|
|
15725
|
-
merge_size,
|
|
15726
|
-
patch_size
|
|
15727
|
-
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
15728
|
-
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
15729
|
-
return {
|
|
15730
|
-
pixel_values: flatten_patches,
|
|
15731
|
-
image_grid_thw,
|
|
15732
|
-
original_sizes,
|
|
15733
|
-
reshaped_input_sizes
|
|
15734
|
-
};
|
|
15735
|
-
}
|
|
15736
|
-
};
|
|
15737
|
-
|
|
15738
15764
|
// src/models/rt_detr/image_processing_rt_detr.js
|
|
15739
15765
|
var RTDetrImageProcessor = class extends ImageProcessor {
|
|
15740
15766
|
/** @type {typeof post_process_object_detection} */
|
|
@@ -16216,6 +16242,48 @@ var Florence2Processor = class extends Processor {
|
|
|
16216
16242
|
}
|
|
16217
16243
|
};
|
|
16218
16244
|
|
|
16245
|
+
// src/models/gemma3/processing_gemma3.js
|
|
16246
|
+
var Gemma3Processor = class extends Processor {
|
|
16247
|
+
static tokenizer_class = AutoTokenizer;
|
|
16248
|
+
static image_processor_class = AutoImageProcessor;
|
|
16249
|
+
static uses_processor_config = true;
|
|
16250
|
+
static uses_chat_template_file = true;
|
|
16251
|
+
constructor(config, components, chat_template) {
|
|
16252
|
+
super(config, components, chat_template);
|
|
16253
|
+
this.image_seq_length = this.config.image_seq_length;
|
|
16254
|
+
const { boi_token, image_token, eoi_token } = this.tokenizer.config;
|
|
16255
|
+
this.boi_token = boi_token;
|
|
16256
|
+
this.image_token = image_token;
|
|
16257
|
+
this.eoi_token = eoi_token;
|
|
16258
|
+
const image_tokens_expanded = image_token.repeat(this.image_seq_length);
|
|
16259
|
+
this.full_image_sequence = `
|
|
16260
|
+
|
|
16261
|
+
${boi_token}${image_tokens_expanded}${eoi_token}
|
|
16262
|
+
|
|
16263
|
+
`;
|
|
16264
|
+
}
|
|
16265
|
+
/**
|
|
16266
|
+
* @param {string|string[]} text
|
|
16267
|
+
* @param {import('../../utils/image.js').RawImage|import('../../utils/image.js').RawImage[]} [images]
|
|
16268
|
+
* @param {Object} [options]
|
|
16269
|
+
*/
|
|
16270
|
+
async _call(text, images = null, options = {}) {
|
|
16271
|
+
if (typeof text === "string") {
|
|
16272
|
+
text = [text];
|
|
16273
|
+
}
|
|
16274
|
+
let image_inputs;
|
|
16275
|
+
if (images) {
|
|
16276
|
+
image_inputs = await this.image_processor(images, options);
|
|
16277
|
+
text = text.map((prompt) => prompt.replaceAll(this.boi_token, this.full_image_sequence));
|
|
16278
|
+
}
|
|
16279
|
+
const text_inputs = this.tokenizer(text, options);
|
|
16280
|
+
return {
|
|
16281
|
+
...text_inputs,
|
|
16282
|
+
...image_inputs
|
|
16283
|
+
};
|
|
16284
|
+
}
|
|
16285
|
+
};
|
|
16286
|
+
|
|
16219
16287
|
// src/models/gemma3n/processing_gemma3n.js
|
|
16220
16288
|
var Gemma3nProcessor = class extends Processor {
|
|
16221
16289
|
static image_processor_class = AutoImageProcessor;
|
|
@@ -16288,6 +16356,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
16288
16356
|
}
|
|
16289
16357
|
};
|
|
16290
16358
|
|
|
16359
|
+
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
16360
|
+
var Qwen2VLProcessor = class extends Processor {
|
|
16361
|
+
static image_processor_class = AutoImageProcessor;
|
|
16362
|
+
static tokenizer_class = AutoTokenizer;
|
|
16363
|
+
static image_token = "<|image_pad|>";
|
|
16364
|
+
/**
|
|
16365
|
+
*
|
|
16366
|
+
* @param {string|string[]} text
|
|
16367
|
+
* @param {RawImage|RawImage[]} images
|
|
16368
|
+
* @param {...any} args
|
|
16369
|
+
* @returns {Promise<any>}
|
|
16370
|
+
*/
|
|
16371
|
+
async _call(text, images = null, ...args) {
|
|
16372
|
+
if (!Array.isArray(text)) {
|
|
16373
|
+
text = [text];
|
|
16374
|
+
}
|
|
16375
|
+
let image_inputs, image_grid_thw;
|
|
16376
|
+
if (images) {
|
|
16377
|
+
image_inputs = await this.image_processor(images);
|
|
16378
|
+
image_grid_thw = image_inputs.image_grid_thw;
|
|
16379
|
+
}
|
|
16380
|
+
if (image_grid_thw) {
|
|
16381
|
+
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
16382
|
+
let index = 0;
|
|
16383
|
+
const image_token = (
|
|
16384
|
+
/** @type {typeof Qwen2VLProcessor} */
|
|
16385
|
+
this.constructor.image_token
|
|
16386
|
+
);
|
|
16387
|
+
const image_grid_thw_list = image_grid_thw.tolist();
|
|
16388
|
+
text = text.map((t) => {
|
|
16389
|
+
while (t.includes(image_token)) {
|
|
16390
|
+
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
16391
|
+
t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
16392
|
+
}
|
|
16393
|
+
return t.replaceAll("<|placeholder|>", image_token);
|
|
16394
|
+
});
|
|
16395
|
+
}
|
|
16396
|
+
const text_inputs = this.tokenizer(text);
|
|
16397
|
+
return {
|
|
16398
|
+
...text_inputs,
|
|
16399
|
+
...image_inputs
|
|
16400
|
+
};
|
|
16401
|
+
}
|
|
16402
|
+
};
|
|
16403
|
+
|
|
16404
|
+
// src/models/glm46v/processing_glm46v.js
|
|
16405
|
+
var Glm46VProcessor = class extends Qwen2VLProcessor {
|
|
16406
|
+
static image_token = "<|image|>";
|
|
16407
|
+
};
|
|
16408
|
+
|
|
16291
16409
|
// src/models/granite_speech/processing_granite_speech.js
|
|
16292
16410
|
var GraniteSpeechProcessor = class extends Processor {
|
|
16293
16411
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -17018,47 +17136,6 @@ var PyAnnoteProcessor = class extends Processor {
|
|
|
17018
17136
|
}
|
|
17019
17137
|
};
|
|
17020
17138
|
|
|
17021
|
-
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
17022
|
-
var Qwen2VLProcessor = class extends Processor {
|
|
17023
|
-
static image_processor_class = AutoImageProcessor;
|
|
17024
|
-
static tokenizer_class = AutoTokenizer;
|
|
17025
|
-
/**
|
|
17026
|
-
*
|
|
17027
|
-
* @param {string|string[]} text
|
|
17028
|
-
* @param {RawImage|RawImage[]} images
|
|
17029
|
-
* @param {...any} args
|
|
17030
|
-
* @returns {Promise<any>}
|
|
17031
|
-
*/
|
|
17032
|
-
async _call(text, images = null, ...args) {
|
|
17033
|
-
if (!Array.isArray(text)) {
|
|
17034
|
-
text = [text];
|
|
17035
|
-
}
|
|
17036
|
-
let image_inputs, image_grid_thw;
|
|
17037
|
-
if (images) {
|
|
17038
|
-
image_inputs = await this.image_processor(images);
|
|
17039
|
-
image_grid_thw = image_inputs.image_grid_thw;
|
|
17040
|
-
}
|
|
17041
|
-
if (image_grid_thw) {
|
|
17042
|
-
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
17043
|
-
let index = 0;
|
|
17044
|
-
const image_grid_thw_list = image_grid_thw.tolist();
|
|
17045
|
-
text = text.map((t) => {
|
|
17046
|
-
while (t.includes("<|image_pad|>")) {
|
|
17047
|
-
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
17048
|
-
t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
17049
|
-
}
|
|
17050
|
-
return t.replaceAll("<|placeholder|>", "<|image_pad|>");
|
|
17051
|
-
});
|
|
17052
|
-
}
|
|
17053
|
-
const text_inputs = this.tokenizer(text);
|
|
17054
|
-
return {
|
|
17055
|
-
...text_inputs,
|
|
17056
|
-
...image_inputs
|
|
17057
|
-
// TODO: ...videos_inputs,
|
|
17058
|
-
};
|
|
17059
|
-
}
|
|
17060
|
-
};
|
|
17061
|
-
|
|
17062
17139
|
// src/models/qwen2_5_vl/processing_qwen2_5_vl.js
|
|
17063
17140
|
var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
|
|
17064
17141
|
};
|
|
@@ -17402,6 +17479,8 @@ function getNormalizedConfig(config) {
|
|
|
17402
17479
|
case "gemma3n":
|
|
17403
17480
|
case "lfm2_vl":
|
|
17404
17481
|
case "chatterbox":
|
|
17482
|
+
case "lighton_ocr":
|
|
17483
|
+
case "glm_ocr":
|
|
17405
17484
|
case "mistral3":
|
|
17406
17485
|
case "qwen2_5_vl":
|
|
17407
17486
|
case "qwen3_vl":
|
|
@@ -17477,6 +17556,8 @@ function getNormalizedConfig(config) {
|
|
|
17477
17556
|
mapping["dim_kv"] = "head_dim";
|
|
17478
17557
|
break;
|
|
17479
17558
|
case "qwen3":
|
|
17559
|
+
case "solar_open":
|
|
17560
|
+
case "glm_ocr_text":
|
|
17480
17561
|
case "gemma":
|
|
17481
17562
|
case "gemma2":
|
|
17482
17563
|
case "vaultgemma":
|
|
@@ -17487,6 +17568,7 @@ function getNormalizedConfig(config) {
|
|
|
17487
17568
|
case "ernie4_5":
|
|
17488
17569
|
case "hunyuan_v1_dense":
|
|
17489
17570
|
case "falcon_h1":
|
|
17571
|
+
case "nemotron_h":
|
|
17490
17572
|
case "ministral":
|
|
17491
17573
|
case "ministral3":
|
|
17492
17574
|
mapping["num_heads"] = "num_key_value_heads";
|
|
@@ -17521,6 +17603,9 @@ function getNormalizedConfig(config) {
|
|
|
17521
17603
|
mapping["num_attention_heads"] = "num_attention_heads";
|
|
17522
17604
|
break;
|
|
17523
17605
|
case "youtu":
|
|
17606
|
+
case "deepseek_v3":
|
|
17607
|
+
case "glm_moe_dsa":
|
|
17608
|
+
case "mistral4":
|
|
17524
17609
|
mapping["num_heads"] = "num_key_value_heads";
|
|
17525
17610
|
mapping["num_layers"] = "num_hidden_layers";
|
|
17526
17611
|
mapping["dim_kv"] = "qk_head_dim";
|
|
@@ -17609,6 +17694,7 @@ function getCacheShapes(config, options) {
|
|
|
17609
17694
|
if (!(config instanceof PretrainedConfig)) {
|
|
17610
17695
|
config = new PretrainedConfig(config);
|
|
17611
17696
|
}
|
|
17697
|
+
const batch_size = options?.batch_size ?? 1;
|
|
17612
17698
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
17613
17699
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
17614
17700
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -17618,7 +17704,6 @@ function getCacheShapes(config, options) {
|
|
|
17618
17704
|
config
|
|
17619
17705
|
);
|
|
17620
17706
|
const head_dim = hidden_size / num_attention_heads;
|
|
17621
|
-
const batch_size = options?.batch_size ?? 1;
|
|
17622
17707
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
17623
17708
|
if (layer_types[i] === "full_attention") {
|
|
17624
17709
|
for (const kv of ["key", "value"]) {
|
|
@@ -17631,31 +17716,26 @@ function getCacheShapes(config, options) {
|
|
|
17631
17716
|
}
|
|
17632
17717
|
}
|
|
17633
17718
|
return cache_values;
|
|
17634
|
-
} else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
|
|
17719
|
+
} else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
|
|
17635
17720
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
17636
17721
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
17637
|
-
const
|
|
17638
|
-
const {
|
|
17639
|
-
layer_types,
|
|
17640
|
-
num_hidden_layers,
|
|
17641
|
-
num_attention_heads,
|
|
17642
|
-
num_key_value_heads,
|
|
17643
|
-
hidden_size,
|
|
17644
|
-
mamba_d_conv,
|
|
17645
|
-
mamba_n_heads,
|
|
17646
|
-
mamba_d_head,
|
|
17647
|
-
mamba_d_state,
|
|
17648
|
-
mamba_n_groups,
|
|
17649
|
-
mamba_expand,
|
|
17650
|
-
mamba_d_ssm
|
|
17651
|
-
} = (
|
|
17722
|
+
const c = (
|
|
17652
17723
|
/** @type {any} */
|
|
17653
17724
|
config
|
|
17654
17725
|
);
|
|
17655
|
-
const
|
|
17656
|
-
const
|
|
17657
|
-
const
|
|
17658
|
-
|
|
17726
|
+
const layer_types = c.layer_types ?? c.layers_block_type;
|
|
17727
|
+
const num_layers = c.num_hidden_layers ?? layer_types?.length;
|
|
17728
|
+
const num_key_value_heads = c.num_key_value_heads;
|
|
17729
|
+
const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
|
|
17730
|
+
const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
|
|
17731
|
+
const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
|
|
17732
|
+
const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
|
|
17733
|
+
const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
|
|
17734
|
+
const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
|
|
17735
|
+
const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
|
|
17736
|
+
const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
|
|
17737
|
+
const cache_values = {};
|
|
17738
|
+
for (let i = 0; i < num_layers; ++i) {
|
|
17659
17739
|
if (!layer_types || layer_types[i] === "mamba") {
|
|
17660
17740
|
cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
|
|
17661
17741
|
cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
|
|
@@ -17689,7 +17769,6 @@ function getCacheShapes(config, options) {
|
|
|
17689
17769
|
const key_dim = linear_key_head_dim * linear_num_key_heads;
|
|
17690
17770
|
const value_dim = linear_value_head_dim * linear_num_value_heads;
|
|
17691
17771
|
const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
|
|
17692
|
-
const batch_size = options?.batch_size ?? 1;
|
|
17693
17772
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
17694
17773
|
if (layer_types[i] === "full_attention") {
|
|
17695
17774
|
for (const kv of ["key", "value"]) {
|
|
@@ -19385,8 +19464,7 @@ var MODEL_TYPES = {
|
|
|
19385
19464
|
ImageAudioTextToText: 13,
|
|
19386
19465
|
Supertonic: 14,
|
|
19387
19466
|
Chatterbox: 15,
|
|
19388
|
-
|
|
19389
|
-
VoxtralRealtime: 17
|
|
19467
|
+
VoxtralRealtime: 16
|
|
19390
19468
|
};
|
|
19391
19469
|
var MODEL_TYPE_CONFIG = {
|
|
19392
19470
|
[MODEL_TYPES.DecoderOnly]: {
|
|
@@ -19443,12 +19521,12 @@ var MODEL_TYPE_CONFIG = {
|
|
|
19443
19521
|
can_generate: true,
|
|
19444
19522
|
forward: image_text_to_text_forward,
|
|
19445
19523
|
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
19446
|
-
sessions: (config) => {
|
|
19524
|
+
sessions: (config, options, textOnly) => {
|
|
19447
19525
|
const s = {
|
|
19448
19526
|
embed_tokens: "embed_tokens",
|
|
19449
|
-
vision_encoder: "vision_encoder",
|
|
19450
19527
|
decoder_model_merged: "decoder_model_merged"
|
|
19451
19528
|
};
|
|
19529
|
+
if (!textOnly) s["vision_encoder"] = "vision_encoder";
|
|
19452
19530
|
if (config.is_encoder_decoder) s["model"] = "encoder_model";
|
|
19453
19531
|
return s;
|
|
19454
19532
|
},
|
|
@@ -19470,12 +19548,17 @@ var MODEL_TYPE_CONFIG = {
|
|
|
19470
19548
|
[MODEL_TYPES.ImageAudioTextToText]: {
|
|
19471
19549
|
can_generate: true,
|
|
19472
19550
|
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
19473
|
-
sessions: () =>
|
|
19474
|
-
|
|
19475
|
-
|
|
19476
|
-
|
|
19477
|
-
|
|
19478
|
-
|
|
19551
|
+
sessions: (config, options, textOnly) => {
|
|
19552
|
+
const s = {
|
|
19553
|
+
embed_tokens: "embed_tokens",
|
|
19554
|
+
decoder_model_merged: "decoder_model_merged"
|
|
19555
|
+
};
|
|
19556
|
+
if (!textOnly) {
|
|
19557
|
+
s["audio_encoder"] = "audio_encoder";
|
|
19558
|
+
s["vision_encoder"] = "vision_encoder";
|
|
19559
|
+
}
|
|
19560
|
+
return s;
|
|
19561
|
+
},
|
|
19479
19562
|
optional_configs: { generation_config: "generation_config.json" }
|
|
19480
19563
|
},
|
|
19481
19564
|
[MODEL_TYPES.Phi3V]: {
|
|
@@ -19526,14 +19609,6 @@ var MODEL_TYPE_CONFIG = {
|
|
|
19526
19609
|
cache_sessions: { model: true },
|
|
19527
19610
|
optional_configs: { generation_config: "generation_config.json" }
|
|
19528
19611
|
},
|
|
19529
|
-
[MODEL_TYPES.MultimodalLanguageModelOnly]: {
|
|
19530
|
-
can_generate: true,
|
|
19531
|
-
forward: image_text_to_text_forward,
|
|
19532
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
19533
|
-
sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
|
|
19534
|
-
cache_sessions: { decoder_model_merged: true },
|
|
19535
|
-
optional_configs: { generation_config: "generation_config.json" }
|
|
19536
|
-
},
|
|
19537
19612
|
[MODEL_TYPES.VoxtralRealtime]: {
|
|
19538
19613
|
can_generate: true,
|
|
19539
19614
|
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
@@ -19559,6 +19634,19 @@ function getSessionsConfig(modelType, config, options = {}) {
|
|
|
19559
19634
|
optional_configs: typeConfig.optional_configs
|
|
19560
19635
|
};
|
|
19561
19636
|
}
|
|
19637
|
+
/**
 * Pick the type configuration to use for a model class, detecting the
 * "text-only view of a multimodal checkpoint" case.
 *
 * The class's own registered type is used unless the class name ends with
 * "ForCausalLM" while the checkpoint's first declared architecture is a
 * different, registered "...ForConditionalGeneration" class. In that case the
 * checkpoint's native type is used and `textOnly` is set.
 *
 * @param {string|undefined} modelName Registered name of the model class being loaded.
 * @param {any} config Model config; only `config.architectures[0]` is read.
 * @returns {{typeConfig: any, textOnly: boolean, modelType: any}} The resolved entry of
 * MODEL_TYPE_CONFIG (or its `default` entry), the text-only flag and the resolved type.
 */
function resolveTypeConfig(modelName, config) {
  // Shared result builder: falls back to the default entry for unknown types.
  const describe = (modelType, textOnly) => ({
    typeConfig: MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default,
    textOnly,
    modelType,
  });

  const requestedType = MODEL_TYPE_MAPPING.get(modelName);
  const checkpointArch = config?.architectures?.[0];

  // No declared architecture, or it is the very class being loaded.
  if (!checkpointArch || checkpointArch === modelName) {
    return describe(requestedType, false);
  }
  // Only a CausalLM class on top of a ConditionalGeneration checkpoint qualifies.
  if (!modelName?.endsWith("ForCausalLM") || !checkpointArch.endsWith("ForConditionalGeneration")) {
    return describe(requestedType, false);
  }

  const checkpointType = MODEL_TYPE_MAPPING.get(checkpointArch);
  return checkpointType === undefined ? describe(requestedType, false) : describe(checkpointType, true);
}
|
|
19562
19650
|
// Module-level registries. All three start empty here; they are expected to be
// populated elsewhere in the bundle before models are loaded.

// Model class name -> model type (resolveTypeConfig looks names up with `.get(name)`
// and uses the result to index MODEL_TYPE_CONFIG).
var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
// NOTE(review): presumably model class name -> class constructor; confirm at the registration site.
var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
// Model class constructor -> model class name (PreTrainedModel resolves its own
// name with `.get(this.constructor)`).
var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
|
|
@@ -19578,8 +19666,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19578
19666
|
this.sessions = sessions;
|
|
19579
19667
|
this.configs = configs;
|
|
19580
19668
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this.constructor);
|
|
19581
|
-
const
|
|
19582
|
-
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
19669
|
+
const { typeConfig } = resolveTypeConfig(modelName, config);
|
|
19583
19670
|
this.can_generate = typeConfig.can_generate;
|
|
19584
19671
|
this._forward = typeConfig.forward;
|
|
19585
19672
|
this._prepare_inputs_for_generation = typeConfig.prepare_inputs;
|
|
@@ -19642,9 +19729,8 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19642
19729
|
session_options
|
|
19643
19730
|
};
|
|
19644
19731
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
|
|
19645
|
-
const modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
19646
19732
|
config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
|
|
19647
|
-
const typeConfig
|
|
19733
|
+
const { typeConfig, textOnly, modelType } = resolveTypeConfig(modelName, config);
|
|
19648
19734
|
if (modelType === void 0) {
|
|
19649
19735
|
const type = modelName ?? config?.model_type;
|
|
19650
19736
|
if (type !== "custom") {
|
|
@@ -19653,7 +19739,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
19653
19739
|
);
|
|
19654
19740
|
}
|
|
19655
19741
|
}
|
|
19656
|
-
const sessions = typeConfig.sessions(config, options);
|
|
19742
|
+
const sessions = typeConfig.sessions(config, options, textOnly);
|
|
19657
19743
|
const promises = [
|
|
19658
19744
|
constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
|
|
19659
19745
|
];
|
|
@@ -20317,7 +20403,9 @@ async function generic_text_to_text_forward(self2, {
|
|
|
20317
20403
|
"qwen3_5",
|
|
20318
20404
|
"qwen3_5_text",
|
|
20319
20405
|
"qwen3_5_moe",
|
|
20320
|
-
"qwen3_5_moe_text"
|
|
20406
|
+
"qwen3_5_moe_text",
|
|
20407
|
+
"glm_ocr",
|
|
20408
|
+
"glm_ocr_text"
|
|
20321
20409
|
].includes(self2.config.model_type)
|
|
20322
20410
|
) {
|
|
20323
20411
|
const { image_grid_thw, video_grid_thw } = kwargs;
|
|
@@ -20541,6 +20629,8 @@ __export(models_exports, {
|
|
|
20541
20629
|
BloomForCausalLM: () => BloomForCausalLM,
|
|
20542
20630
|
BloomModel: () => BloomModel,
|
|
20543
20631
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
20632
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
20633
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
20544
20634
|
CLIPModel: () => CLIPModel,
|
|
20545
20635
|
CLIPPreTrainedModel: () => CLIPPreTrainedModel,
|
|
20546
20636
|
CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
|
|
@@ -20615,6 +20705,9 @@ __export(models_exports, {
|
|
|
20615
20705
|
DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
|
|
20616
20706
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
20617
20707
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
20708
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
20709
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
20710
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
20618
20711
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
20619
20712
|
DeiTModel: () => DeiTModel,
|
|
20620
20713
|
DeiTPreTrainedModel: () => DeiTPreTrainedModel,
|
|
@@ -20660,6 +20753,11 @@ __export(models_exports, {
|
|
|
20660
20753
|
EsmForTokenClassification: () => EsmForTokenClassification,
|
|
20661
20754
|
EsmModel: () => EsmModel,
|
|
20662
20755
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
20756
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
20757
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
20758
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
20759
|
+
EuroBertModel: () => EuroBertModel,
|
|
20760
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
20663
20761
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
20664
20762
|
ExaoneModel: () => ExaoneModel,
|
|
20665
20763
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -20696,6 +20794,7 @@ __export(models_exports, {
|
|
|
20696
20794
|
Gemma2Model: () => Gemma2Model,
|
|
20697
20795
|
Gemma2PreTrainedModel: () => Gemma2PreTrainedModel,
|
|
20698
20796
|
Gemma3ForCausalLM: () => Gemma3ForCausalLM,
|
|
20797
|
+
Gemma3ForConditionalGeneration: () => Gemma3ForConditionalGeneration,
|
|
20699
20798
|
Gemma3Model: () => Gemma3Model,
|
|
20700
20799
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
20701
20800
|
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
@@ -20706,6 +20805,10 @@ __export(models_exports, {
|
|
|
20706
20805
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
20707
20806
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
20708
20807
|
GlmModel: () => GlmModel,
|
|
20808
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
20809
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
20810
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
20811
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
20709
20812
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
20710
20813
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
20711
20814
|
GptOssModel: () => GptOssModel,
|
|
@@ -20752,6 +20855,7 @@ __export(models_exports, {
|
|
|
20752
20855
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
20753
20856
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
20754
20857
|
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
20858
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
20755
20859
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
20756
20860
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
20757
20861
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -20801,6 +20905,9 @@ __export(models_exports, {
|
|
|
20801
20905
|
MimiEncoderOutput: () => MimiEncoderOutput,
|
|
20802
20906
|
MimiModel: () => MimiModel,
|
|
20803
20907
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
20908
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
20909
|
+
Mistral4Model: () => Mistral4Model,
|
|
20910
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
20804
20911
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
20805
20912
|
MistralModel: () => MistralModel,
|
|
20806
20913
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -20858,6 +20965,9 @@ __export(models_exports, {
|
|
|
20858
20965
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
20859
20966
|
NanoChatModel: () => NanoChatModel,
|
|
20860
20967
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
20968
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
20969
|
+
NemotronHModel: () => NemotronHModel,
|
|
20970
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
20861
20971
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
20862
20972
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
20863
20973
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -20995,6 +21105,9 @@ __export(models_exports, {
|
|
|
20995
21105
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
20996
21106
|
SnacModel: () => SnacModel,
|
|
20997
21107
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
21108
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
21109
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
21110
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
20998
21111
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
20999
21112
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
21000
21113
|
SpeechT5HifiGan: () => SpeechT5HifiGan,
|
|
@@ -21169,7 +21282,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
|
|
|
21169
21282
|
var ArceeForCausalLM = class extends ArceePreTrainedModel {
|
|
21170
21283
|
};
|
|
21171
21284
|
|
|
21172
|
-
// src/models/
|
|
21285
|
+
// src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
|
|
21173
21286
|
var ASTPreTrainedModel = class extends PreTrainedModel {
|
|
21174
21287
|
};
|
|
21175
21288
|
var ASTModel = class extends ASTPreTrainedModel {
|
|
@@ -21504,6 +21617,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
|
|
|
21504
21617
|
var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
|
|
21505
21618
|
};
|
|
21506
21619
|
|
|
21620
|
+
// src/models/chmv2/modeling_chmv2.js
|
|
21621
|
+
/** Base class for the CHMv2 family; adds nothing on top of PreTrainedModel. */
var CHMv2PreTrainedModel = class extends PreTrainedModel {
};
/**
 * CHMv2 class exposed for depth estimation (per its name). It defines no
 * members of its own; all behavior is inherited from CHMv2PreTrainedModel.
 */
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
};
|
|
21625
|
+
|
|
21507
21626
|
// src/models/clap/modeling_clap.js
|
|
21508
21627
|
var ClapPreTrainedModel = class extends PreTrainedModel {
|
|
21509
21628
|
};
|
|
@@ -21842,7 +21961,15 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
|
|
|
21842
21961
|
}
|
|
21843
21962
|
};
|
|
21844
21963
|
|
|
21845
|
-
// src/models/
|
|
21964
|
+
// src/models/deepseek_v3/modeling_deepseek_v3.js
|
|
21965
|
+
/** Base class for the DeepSeek-V3 family; adds nothing on top of PreTrainedModel. */
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
};
/** DeepSeek-V3 base model class; all behavior is inherited. */
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
};
/** DeepSeek-V3 causal-LM class; all behavior is inherited. */
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
};
|
|
21971
|
+
|
|
21972
|
+
// src/models/deberta_v2/modeling_deberta_v2.js
|
|
21846
21973
|
var DebertaV2PreTrainedModel = class extends PreTrainedModel {
|
|
21847
21974
|
};
|
|
21848
21975
|
var DebertaV2Model = class extends DebertaV2PreTrainedModel {
|
|
@@ -22190,6 +22317,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
|
|
|
22190
22317
|
}
|
|
22191
22318
|
};
|
|
22192
22319
|
|
|
22320
|
+
// src/models/eurobert/modeling_eurobert.js
|
|
22321
|
+
/** Base class for the EuroBERT family; adds nothing on top of PreTrainedModel. */
var EuroBertPreTrainedModel = class extends PreTrainedModel {
};
/** EuroBERT base model class; all behavior is inherited. */
var EuroBertModel = class extends EuroBertPreTrainedModel {
};
|
|
22325
|
+
/** EuroBERT model whose call result is wrapped as a masked-language-modeling output. */
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
  /**
   * Run the model and wrap the raw result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<MaskedLMOutput>} The parent class's output wrapped in a MaskedLMOutput.
   */
  async _call(model_inputs) {
    const rawOutputs = await super._call(model_inputs);
    return new MaskedLMOutput(rawOutputs);
  }
};
|
|
22336
|
+
/** EuroBERT model whose call result is wrapped as a sequence-classification output. */
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
  /**
   * Run the model and wrap the raw result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<SequenceClassifierOutput>} The parent class's output wrapped in a SequenceClassifierOutput.
   */
  async _call(model_inputs) {
    const rawOutputs = await super._call(model_inputs);
    return new SequenceClassifierOutput(rawOutputs);
  }
};
|
|
22347
|
+
/** EuroBERT model whose call result is wrapped as a token-classification output. */
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
  /**
   * Run the model and wrap the raw result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<TokenClassifierOutput>} The parent class's output wrapped in a TokenClassifierOutput.
   */
  async _call(model_inputs) {
    const rawOutputs = await super._call(model_inputs);
    return new TokenClassifierOutput(rawOutputs);
  }
};
|
|
22358
|
+
|
|
22193
22359
|
// src/models/exaone/modeling_exaone.js
|
|
22194
22360
|
var ExaonePreTrainedModel = class extends PreTrainedModel {
|
|
22195
22361
|
};
|
|
@@ -22347,12 +22513,35 @@ var Gemma2Model = class extends Gemma2PreTrainedModel {
|
|
|
22347
22513
|
var Gemma2ForCausalLM = class extends Gemma2PreTrainedModel {
|
|
22348
22514
|
};
|
|
22349
22515
|
|
|
22516
|
+
// src/models/llava/modeling_llava.js
|
|
22517
|
+
/** Base class for LLaVA-style models. */
var LlavaPreTrainedModel = class extends PreTrainedModel {
  // Input names declared as forward parameters for this model family
  // (read elsewhere in the library; not used within this class).
  forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
};
|
|
22520
|
+
/** LLaVA-style model that merges image features into the text input. */
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
  /**
   * Flatten the image features to two dimensions (keeping the last dimension)
   * and delegate to the default merge helper.
   *
   * @param {Object} kwargs Merge arguments; must contain `image_features`. Any other
   * entries (including an explicit `image_token_id`) are forwarded unchanged.
   * @returns {any} Whatever default_merge_input_ids_with_image_features returns.
   */
  _merge_input_ids_with_image_features(kwargs) {
    const feature_dim = kwargs.image_features.dims.at(-1);
    const flat_features = kwargs.image_features.view(-1, feature_dim);
    // @ts-ignore
    const config_token_id = this.config.image_token_index ?? this.config.image_token_id;
    // Key order matters: kwargs may override the config-derived token id, while
    // the flattened features always replace kwargs.image_features.
    const merge_args = {
      image_token_id: config_token_id,
      ...kwargs,
      image_features: flat_features
    };
    return default_merge_input_ids_with_image_features(merge_args);
  }
};
|
|
22532
|
+
/** Moondream1 class; reuses LlavaForConditionalGeneration unchanged. */
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
};
/** LlavaQwen2 causal-LM class; reuses LlavaForConditionalGeneration unchanged. */
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
};
|
|
22536
|
+
|
|
22350
22537
|
// src/models/gemma3/modeling_gemma3.js
|
|
22351
22538
|
var Gemma3PreTrainedModel = class extends PreTrainedModel {
|
|
22352
22539
|
};
|
|
22353
22540
|
var Gemma3Model = class extends Gemma3PreTrainedModel {
|
|
22354
22541
|
};
|
|
22355
|
-
var
|
|
22542
|
+
/**
 * Gemma3 multimodal class. Note that it derives from LlavaForConditionalGeneration
 * (inheriting its image-feature merge), not from Gemma3PreTrainedModel.
 */
var Gemma3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
};
/** Gemma3 causal-LM class; inherits everything from Gemma3ForConditionalGeneration. */
var Gemma3ForCausalLM = class extends Gemma3ForConditionalGeneration {
};
|
|
22357
22546
|
|
|
22358
22547
|
// src/models/gemma3n/modeling_gemma3n.js
|
|
@@ -22465,6 +22654,382 @@ var GlmModel = class extends GlmPreTrainedModel {
|
|
|
22465
22654
|
var GlmForCausalLM = class extends GlmPreTrainedModel {
|
|
22466
22655
|
};
|
|
22467
22656
|
|
|
22657
|
+
// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
|
|
22658
|
+
/** Base class for the GlmMoeDsa family; adds nothing on top of PreTrainedModel. */
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
};
/** GlmMoeDsa base model class; all behavior is inherited. */
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
};
/** GlmMoeDsa causal-LM class; all behavior is inherited. */
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
};
|
|
22664
|
+
|
|
22665
|
+
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
22666
|
+
/** Base class for Qwen2-VL style models. */
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
  // Input names declared as forward parameters for this model family
  // (read elsewhere in the library; not used within this class).
  forward_params = [
    // Text inputs
    "input_ids",
    "attention_mask",
    "position_ids",
    "past_key_values",
    // Vision inputs
    "pixel_values",
    "image_grid_thw"
  ];
};
|
|
22678
|
+
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
22679
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class. When one is loaded from a
// ...ForConditionalGeneration checkpoint, resolveTypeConfig() marks it as text-only, so the
// sessions factory loads embed_tokens + decoder_model_merged without the vision_encoder.
|
|
22682
|
+
image_grid_thw_name = "grid_thw";
|
|
22683
|
+
/**
|
|
22684
|
+
* Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
|
|
22685
|
+
* @param {Tensor} input_ids
|
|
22686
|
+
* @param {Tensor} attention_mask
|
|
22687
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
22688
|
+
*/
|
|
22689
|
+
_get_text_only_rope_index(input_ids, attention_mask) {
|
|
22690
|
+
if (attention_mask) {
|
|
22691
|
+
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
22692
|
+
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
22693
|
+
const mrope_position_deltas = Array.from(
|
|
22694
|
+
{ length: dims[0] },
|
|
22695
|
+
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
22696
|
+
);
|
|
22697
|
+
return [
|
|
22698
|
+
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
22699
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
22700
|
+
];
|
|
22701
|
+
} else {
|
|
22702
|
+
const [batch_size, seq_length] = input_ids.dims;
|
|
22703
|
+
const position_ids = BigInt64Array.from(
|
|
22704
|
+
{ length: 3 * batch_size * seq_length },
|
|
22705
|
+
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
22706
|
+
);
|
|
22707
|
+
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
22708
|
+
}
|
|
22709
|
+
}
|
|
22710
|
+
/**
|
|
22711
|
+
* Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
|
|
22712
|
+
* global [all_t, all_h, all_w] order, then write back into the position_ids array
|
|
22713
|
+
* respecting attention mask.
|
|
22714
|
+
* @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
|
|
22715
|
+
* @param {number[]} attn_mask Attention mask for this batch element
|
|
22716
|
+
* @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
|
|
22717
|
+
* @param {number} batch_idx Current batch index
|
|
22718
|
+
* @returns {number[]} Flat reordered positions of length total_len
|
|
22719
|
+
*/
|
|
22720
|
+
_reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
|
|
22721
|
+
const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
22722
|
+
const llm_positions = new Array(total_len);
|
|
22723
|
+
let index = 0;
|
|
22724
|
+
for (let x = 0; x < 3; ++x) {
|
|
22725
|
+
for (const val of llm_pos_ids_list) {
|
|
22726
|
+
const seg_len = val.length / 3;
|
|
22727
|
+
for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
|
|
22728
|
+
llm_positions[index++] = val[z];
|
|
22729
|
+
}
|
|
22730
|
+
}
|
|
22731
|
+
}
|
|
22732
|
+
let count2 = 0;
|
|
22733
|
+
for (let y = 0; y < attn_mask.length; ++y) {
|
|
22734
|
+
if (attn_mask[y] == 1) {
|
|
22735
|
+
for (let x = 0; x < 3; ++x) {
|
|
22736
|
+
position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
|
|
22737
|
+
}
|
|
22738
|
+
++count2;
|
|
22739
|
+
}
|
|
22740
|
+
}
|
|
22741
|
+
return llm_positions;
|
|
22742
|
+
}
|
|
22743
|
+
/**
 * Build per-batch position ID segments for multimodal rope.
 * Override this in subclasses to change how vision/text segments are identified and positioned.
 *
 * For each vision block found in `filtered_ids`, two segments are appended: the
 * text run preceding it (the same 1D positions on all three axes) and the vision
 * grid itself (separate t/h/w positions). Any trailing text becomes a final segment.
 *
 * @param {object} params
 * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
 * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
 * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
 * @param {number} params.spatial_merge_size
 * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
 * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
 */
_get_multimodal_rope_positions({
  filtered_ids,
  image_grid_thw_list,
  video_grid_thw_list,
  spatial_merge_size,
  state
}) {
  const { image_token_id, video_token_id, vision_start_token_id } = this.config;
  const ids = filtered_ids;
  // NOTE(review): token comparisons use loose `==`, presumably because ids may be
  // BigInt while the config ids are numbers — confirm before tightening to `===`.
  const vision_start_indices = ids.reduce((acc, x, idx) => {
    if (x == vision_start_token_id) acc.push(idx);
    return acc;
  }, []);
  // The token right after each vision-start marker tells whether the block is an image or a video.
  const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
  const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
  const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
  const llm_pos_ids_list = [];
  // `st` is the index where the not-yet-processed part of `ids` begins.
  let st = 0;
  let remain_images = image_nums;
  let remain_videos = video_nums;
  for (let j = 0; j < vision_tokens.length; ++j) {
    // Next image/video placeholder strictly after `st`.
    const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id);
    const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id);
    // `ids.length + 1` is a sentinel larger than any valid index ("none left").
    const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
    const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
    let ed;
    let t, h, w;
    // Handle whichever vision block comes first and consume its grid entry.
    if (ed_image < ed_video) {
      [t, h, w] = image_grid_thw_list[state.image_index];
      ++state.image_index;
      --remain_images;
      ed = ed_image;
    } else {
      [t, h, w] = video_grid_thw_list[state.video_index];
      ++state.video_index;
      --remain_videos;
      ed = ed_video;
    }
    // Height and width are reduced by the spatial merge factor; time is kept as is.
    const [llm_grid_t, llm_grid_h, llm_grid_w] = [
      Number(t),
      Math.floor(Number(h) / spatial_merge_size),
      Math.floor(Number(w) / spatial_merge_size)
    ];
    // Text run before this vision block: positions continue after the previous segment's maximum.
    const text_len = ed - st;
    const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
    llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
    // Vision grid: t/h/w indices of each cell, all shifted past the text run.
    const offset = text_len + st_idx;
    const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
    const t_index = Array.from(
      { length: grid_size },
      (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
    );
    const h_index = Array.from(
      { length: grid_size },
      (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
    );
    const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
    llm_pos_ids_list.push([t_index, h_index, w_index].flat());
    // Skip past the vision placeholders that were just covered.
    st = ed + grid_size;
  }
  // Trailing text after the last vision block.
  if (st < ids.length) {
    const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
    const text_len = ids.length - st;
    llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
  }
  return llm_pos_ids_list;
}
|
|
22821
|
+
/**
 * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
 *
 * Explanation:
 * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
 *
 * For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
 * Examples:
 * input_ids: [T T T T T], here T is for text.
 * temporal position_ids: [0, 1, 2, 3, 4]
 * height position_ids: [0, 1, 2, 3, 4]
 * width position_ids: [0, 1, 2, 3, 4]
 *
 * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
 * and 1D rotary position embedding for text part.
 * Examples:
 * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
 * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
 * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
 * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
 * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
 * text temporal position_ids: [3, 4, 5, 6, 7]
 * text height position_ids: [3, 4, 5, 6, 7]
 * text width position_ids: [3, 4, 5, 6, 7]
 * Here we calculate the text start position_ids as the max vision position_ids plus 1.
 *
 * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
 * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
 * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
 * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
 * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
 */
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
  const { vision_config } = this.config;
  // Falls back to a merge size of 2 when the vision config does not specify one.
  const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
  if (image_grid_thw || video_grid_thw) {
    const total_input_ids = input_ids.tolist();
    // Without a mask, every token is treated as attended.
    if (!attention_mask) {
      attention_mask = ones_like(input_ids);
    }
    const attention_mask_list = attention_mask.tolist();
    // Zero-initialised [3][batch][seq] output; slots that are not attended stay 0.
    const position_ids_list = Array.from(
      { length: 3 },
      () => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
    );
    const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
    const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
    // Shared across batch elements so each one continues with the next unused grid entry.
    const state = { image_index: 0, video_index: 0 };
    const mrope_position_deltas = [];
    for (let i = 0; i < total_input_ids.length; ++i) {
      // Keep only the attended tokens of this batch element.
      const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
      const llm_pos_ids_list = this._get_multimodal_rope_positions({
        filtered_ids,
        image_grid_thw_list,
        video_grid_thw_list,
        spatial_merge_size,
        state
      });
      const llm_positions = this._reorder_and_write_positions(
        llm_pos_ids_list,
        attention_mask_list[i],
        position_ids_list,
        i
      );
      // Delta = max position + 1 - full (unfiltered) sequence length.
      mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
    }
    return [
      new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
      new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
    ];
  } else {
    // No vision grids were supplied: use the text-only computation.
    return this._get_text_only_rope_index(input_ids, attention_mask);
  }
}
|
|
22895
|
+
async encode_image({ pixel_values, image_grid_thw }) {
|
|
22896
|
+
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
22897
|
+
pixel_values,
|
|
22898
|
+
[this.image_grid_thw_name]: image_grid_thw
|
|
22899
|
+
})).image_features;
|
|
22900
|
+
return features;
|
|
22901
|
+
}
|
|
22902
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
22903
|
+
return default_merge_input_ids_with_image_features({
|
|
22904
|
+
// @ts-ignore
|
|
22905
|
+
image_token_id: this.config.image_token_id,
|
|
22906
|
+
...kwargs
|
|
22907
|
+
});
|
|
22908
|
+
}
|
|
22909
|
+
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
22910
|
+
if (!model_inputs.attention_mask || model_inputs.position_ids) {
|
|
22911
|
+
return model_inputs;
|
|
22912
|
+
}
|
|
22913
|
+
const session = this.sessions["decoder_model_merged"] ?? this.sessions["model"];
|
|
22914
|
+
if (!session.inputNames.includes("position_ids")) {
|
|
22915
|
+
return model_inputs;
|
|
22916
|
+
}
|
|
22917
|
+
if (!model_inputs.past_key_values) {
|
|
22918
|
+
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
22919
|
+
model_inputs.input_ids,
|
|
22920
|
+
model_inputs.image_grid_thw,
|
|
22921
|
+
model_inputs.video_grid_thw,
|
|
22922
|
+
model_inputs.attention_mask
|
|
22923
|
+
);
|
|
22924
|
+
} else {
|
|
22925
|
+
model_inputs.pixel_values = null;
|
|
22926
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
22927
|
+
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
22928
|
+
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
22929
|
+
model_inputs.input_ids,
|
|
22930
|
+
model_inputs.image_grid_thw,
|
|
22931
|
+
model_inputs.video_grid_thw,
|
|
22932
|
+
model_inputs.attention_mask
|
|
22933
|
+
);
|
|
22934
|
+
model_inputs.rope_deltas = rope_deltas;
|
|
22935
|
+
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
22936
|
+
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
22937
|
+
} else {
|
|
22938
|
+
if (!model_inputs.rope_deltas) {
|
|
22939
|
+
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
22940
|
+
model_inputs.input_ids,
|
|
22941
|
+
model_inputs.image_grid_thw,
|
|
22942
|
+
model_inputs.video_grid_thw,
|
|
22943
|
+
model_inputs.attention_mask
|
|
22944
|
+
);
|
|
22945
|
+
}
|
|
22946
|
+
const delta = BigInt(past_length);
|
|
22947
|
+
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
22948
|
+
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
22949
|
+
}
|
|
22950
|
+
}
|
|
22951
|
+
return model_inputs;
|
|
22952
|
+
}
|
|
22953
|
+
};
|
|
22954
|
+
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
22955
|
+
};
|
|
22956
|
+
|
|
22957
|
+
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
22958
|
+
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
22959
|
+
image_grid_thw_name = "image_grid_thw";
|
|
22960
|
+
};
|
|
22961
|
+
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
22962
|
+
image_grid_thw_name = "image_grid_thw";
|
|
22963
|
+
};
|
|
22964
|
+
|
|
22965
|
+
// src/models/glm_ocr/modeling_glm_ocr.js
|
|
22966
|
+
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
22967
|
+
/**
|
|
22968
|
+
* Compute 3D positional indices for vision tokens.
|
|
22969
|
+
* Temporal is constant, height is repeat-interleaved, width tiles.
|
|
22970
|
+
* @param {number} start_position
|
|
22971
|
+
* @param {number[]} grid_thw [T, H, W]
|
|
22972
|
+
* @param {number} temp_merge_size
|
|
22973
|
+
* @param {number} spatial_merge_size
|
|
22974
|
+
* @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
|
|
22975
|
+
*/
|
|
22976
|
+
get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
|
|
22977
|
+
const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
|
|
22978
|
+
const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
|
|
22979
|
+
const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
|
|
22980
|
+
const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
|
|
22981
|
+
const t_pos = Array.from({ length: seq_len }, () => start_position);
|
|
22982
|
+
const h_pos = Array.from(
|
|
22983
|
+
{ length: seq_len },
|
|
22984
|
+
(_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
|
|
22985
|
+
);
|
|
22986
|
+
const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
|
|
22987
|
+
return [...t_pos, ...h_pos, ...w_pos];
|
|
22988
|
+
}
|
|
22989
|
+
/**
|
|
22990
|
+
* GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
|
|
22991
|
+
* instead of vision_start_token_id scanning used by Qwen2VL.
|
|
22992
|
+
* After a vision segment, position advances by max(h, w) / spatial_merge_size.
|
|
22993
|
+
*/
|
|
22994
|
+
_get_multimodal_rope_positions({
|
|
22995
|
+
filtered_ids,
|
|
22996
|
+
image_grid_thw_list,
|
|
22997
|
+
video_grid_thw_list,
|
|
22998
|
+
spatial_merge_size,
|
|
22999
|
+
state
|
|
23000
|
+
}) {
|
|
23001
|
+
const { image_token_id } = this.config;
|
|
23002
|
+
const groups = [];
|
|
23003
|
+
let group_start = 0;
|
|
23004
|
+
let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
|
|
23005
|
+
for (let j = 1; j <= filtered_ids.length; ++j) {
|
|
23006
|
+
const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
|
|
23007
|
+
if (t !== current_type) {
|
|
23008
|
+
groups.push([current_type, group_start, j]);
|
|
23009
|
+
group_start = j;
|
|
23010
|
+
current_type = t;
|
|
23011
|
+
}
|
|
23012
|
+
}
|
|
23013
|
+
let current_pos = 0;
|
|
23014
|
+
const llm_pos_ids_list = [];
|
|
23015
|
+
for (const [modality_type, start_idx, end_idx] of groups) {
|
|
23016
|
+
if (modality_type === 0) {
|
|
23017
|
+
const text_len = end_idx - start_idx;
|
|
23018
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
|
|
23019
|
+
current_pos += text_len;
|
|
23020
|
+
} else {
|
|
23021
|
+
const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
|
|
23022
|
+
const temp_merge_size = grid_thw[0];
|
|
23023
|
+
llm_pos_ids_list.push(
|
|
23024
|
+
this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
|
|
23025
|
+
);
|
|
23026
|
+
current_pos += Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size;
|
|
23027
|
+
}
|
|
23028
|
+
}
|
|
23029
|
+
return llm_pos_ids_list;
|
|
23030
|
+
}
|
|
23031
|
+
};
|
|
23032
|
+
|
|
22468
23033
|
// src/models/glpn/modeling_glpn.js
|
|
22469
23034
|
var GLPNPreTrainedModel = class extends PreTrainedModel {
|
|
22470
23035
|
};
|
|
@@ -22663,27 +23228,6 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
|
|
|
22663
23228
|
var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
|
|
22664
23229
|
};
|
|
22665
23230
|
|
|
22666
|
-
// src/models/llava/modeling_llava.js
|
|
22667
|
-
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
22668
|
-
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
22669
|
-
};
|
|
22670
|
-
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
22671
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
22672
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
22673
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
22674
|
-
return default_merge_input_ids_with_image_features({
|
|
22675
|
-
// @ts-ignore
|
|
22676
|
-
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
22677
|
-
...kwargs,
|
|
22678
|
-
image_features: reshaped_image_hidden_states
|
|
22679
|
-
});
|
|
22680
|
-
}
|
|
22681
|
-
};
|
|
22682
|
-
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
22683
|
-
};
|
|
22684
|
-
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
22685
|
-
};
|
|
22686
|
-
|
|
22687
23231
|
// src/models/idefics3/modeling_idefics3.js
|
|
22688
23232
|
var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
22689
23233
|
forward_params = [
|
|
@@ -22777,6 +23321,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
|
|
|
22777
23321
|
var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
|
|
22778
23322
|
};
|
|
22779
23323
|
|
|
23324
|
+
// src/models/lighton_ocr/modeling_lighton_ocr.js
|
|
23325
|
+
var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
23326
|
+
};
|
|
23327
|
+
|
|
22780
23328
|
// src/models/lfm2_moe/modeling_lfm2_moe.js
|
|
22781
23329
|
var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
|
|
22782
23330
|
};
|
|
@@ -22973,6 +23521,14 @@ var MistralModel = class extends MistralPreTrainedModel {
|
|
|
22973
23521
|
var MistralForCausalLM = class extends MistralPreTrainedModel {
|
|
22974
23522
|
};
|
|
22975
23523
|
|
|
23524
|
+
// src/models/mistral4/modeling_mistral4.js
|
|
23525
|
+
var Mistral4PreTrainedModel = class extends PreTrainedModel {
|
|
23526
|
+
};
|
|
23527
|
+
var Mistral4Model = class extends Mistral4PreTrainedModel {
|
|
23528
|
+
};
|
|
23529
|
+
var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
|
|
23530
|
+
};
|
|
23531
|
+
|
|
22976
23532
|
// src/models/mobilebert/modeling_mobilebert.js
|
|
22977
23533
|
var MobileBertPreTrainedModel = class extends PreTrainedModel {
|
|
22978
23534
|
};
|
|
@@ -23441,6 +23997,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
|
|
|
23441
23997
|
var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
|
|
23442
23998
|
};
|
|
23443
23999
|
|
|
24000
|
+
// src/models/nemotron_h/modeling_nemotron_h.js
|
|
24001
|
+
var NemotronHPreTrainedModel = class extends PreTrainedModel {
|
|
24002
|
+
};
|
|
24003
|
+
var NemotronHModel = class extends NemotronHPreTrainedModel {
|
|
24004
|
+
};
|
|
24005
|
+
var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
|
|
24006
|
+
};
|
|
24007
|
+
|
|
23444
24008
|
// src/models/neobert/modeling_neobert.js
|
|
23445
24009
|
var NeoBertPreTrainedModel = class extends PreTrainedModel {
|
|
23446
24010
|
};
|
|
@@ -23721,252 +24285,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
|
|
|
23721
24285
|
var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
|
|
23722
24286
|
};
|
|
23723
24287
|
|
|
23724
|
-
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
23725
|
-
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
23726
|
-
forward_params = [
|
|
23727
|
-
// Text inputs
|
|
23728
|
-
"input_ids",
|
|
23729
|
-
"attention_mask",
|
|
23730
|
-
"position_ids",
|
|
23731
|
-
"past_key_values",
|
|
23732
|
-
// Vision inputs
|
|
23733
|
-
"pixel_values",
|
|
23734
|
-
"image_grid_thw"
|
|
23735
|
-
];
|
|
23736
|
-
};
|
|
23737
|
-
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
23738
|
-
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
23739
|
-
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
23740
|
-
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
23741
|
-
image_grid_thw_name = "grid_thw";
|
|
23742
|
-
/**
|
|
23743
|
-
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
23744
|
-
*
|
|
23745
|
-
* Explanation:
|
|
23746
|
-
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
23747
|
-
*
|
|
23748
|
-
* For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
|
|
23749
|
-
* Examples:
|
|
23750
|
-
* input_ids: [T T T T T], here T is for text.
|
|
23751
|
-
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
23752
|
-
* height position_ids: [0, 1, 2, 3, 4]
|
|
23753
|
-
* width position_ids: [0, 1, 2, 3, 4]
|
|
23754
|
-
*
|
|
23755
|
-
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
23756
|
-
* and 1D rotary position embeddin for text part.
|
|
23757
|
-
* Examples:
|
|
23758
|
-
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
23759
|
-
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
23760
|
-
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
23761
|
-
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
23762
|
-
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
23763
|
-
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
23764
|
-
* text height position_ids: [3, 4, 5, 6, 7]
|
|
23765
|
-
* text width position_ids: [3, 4, 5, 6, 7]
|
|
23766
|
-
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
23767
|
-
*
|
|
23768
|
-
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
23769
|
-
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
23770
|
-
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
23771
|
-
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
|
|
23772
|
-
* - 1 for tokens that are **not masked**,
|
|
23773
|
-
* - 0 for tokens that are **masked**.
|
|
23774
|
-
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
|
|
23775
|
-
* - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
|
|
23776
|
-
* - mrope_position_deltas: Tensor of shape `(batch_size)`.
|
|
23777
|
-
*/
|
|
23778
|
-
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
23779
|
-
const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
23780
|
-
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
23781
|
-
const mrope_position_deltas = [];
|
|
23782
|
-
if (image_grid_thw || video_grid_thw) {
|
|
23783
|
-
let total_input_ids = input_ids.tolist();
|
|
23784
|
-
if (!attention_mask) {
|
|
23785
|
-
attention_mask = ones_like(input_ids);
|
|
23786
|
-
}
|
|
23787
|
-
const attention_mask_list = attention_mask.tolist();
|
|
23788
|
-
const position_ids_list = Array.from(
|
|
23789
|
-
{ length: 3 },
|
|
23790
|
-
(_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
|
|
23791
|
-
);
|
|
23792
|
-
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
23793
|
-
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
23794
|
-
let image_index = 0;
|
|
23795
|
-
let video_index = 0;
|
|
23796
|
-
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
23797
|
-
const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
23798
|
-
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
23799
|
-
if (x == vision_start_token_id) acc.push(idx);
|
|
23800
|
-
return acc;
|
|
23801
|
-
}, []);
|
|
23802
|
-
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
23803
|
-
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
23804
|
-
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
23805
|
-
let llm_pos_ids_list = [];
|
|
23806
|
-
let st = 0;
|
|
23807
|
-
let remain_images = image_nums;
|
|
23808
|
-
let remain_videos = video_nums;
|
|
23809
|
-
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
23810
|
-
const next_image_token = ids.findIndex((x, i2) => i2 > st && x == image_token_id);
|
|
23811
|
-
const next_video_token = ids.findIndex((x, i2) => i2 > st && x == video_token_id);
|
|
23812
|
-
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
23813
|
-
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
23814
|
-
let ed;
|
|
23815
|
-
let t, h, w;
|
|
23816
|
-
if (ed_image < ed_video) {
|
|
23817
|
-
[t, h, w] = image_grid_thw_list[image_index];
|
|
23818
|
-
++image_index;
|
|
23819
|
-
--remain_images;
|
|
23820
|
-
ed = ed_image;
|
|
23821
|
-
} else {
|
|
23822
|
-
[t, h, w] = video_grid_thw_list[video_index];
|
|
23823
|
-
++video_index;
|
|
23824
|
-
--remain_videos;
|
|
23825
|
-
ed = ed_video;
|
|
23826
|
-
}
|
|
23827
|
-
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
23828
|
-
Number(t),
|
|
23829
|
-
Math.floor(Number(h) / spatial_merge_size),
|
|
23830
|
-
Math.floor(Number(w) / spatial_merge_size)
|
|
23831
|
-
];
|
|
23832
|
-
const text_len = ed - st;
|
|
23833
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
23834
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
23835
|
-
const offset = text_len + st_idx;
|
|
23836
|
-
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
23837
|
-
const t_index = Array.from(
|
|
23838
|
-
{ length: grid_size },
|
|
23839
|
-
(_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
|
|
23840
|
-
);
|
|
23841
|
-
const h_index = Array.from(
|
|
23842
|
-
{ length: grid_size },
|
|
23843
|
-
(_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
|
|
23844
|
-
);
|
|
23845
|
-
const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
|
|
23846
|
-
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
23847
|
-
st = ed + grid_size;
|
|
23848
|
-
}
|
|
23849
|
-
if (st < ids.length) {
|
|
23850
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
23851
|
-
const text_len = ids.length - st;
|
|
23852
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
23853
|
-
}
|
|
23854
|
-
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
23855
|
-
const llm_positions = new Array(num_items);
|
|
23856
|
-
let index = 0;
|
|
23857
|
-
for (let x = 0; x < 3; ++x) {
|
|
23858
|
-
for (let y = 0; y < llm_pos_ids_list.length; ++y) {
|
|
23859
|
-
const val = llm_pos_ids_list[y];
|
|
23860
|
-
const text_len = val.length / 3;
|
|
23861
|
-
for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
|
|
23862
|
-
llm_positions[index++] = val[z];
|
|
23863
|
-
}
|
|
23864
|
-
}
|
|
23865
|
-
}
|
|
23866
|
-
let count2 = 0;
|
|
23867
|
-
const attn_mask = attention_mask_list[i];
|
|
23868
|
-
for (let y = 0; y < attn_mask.length; ++y) {
|
|
23869
|
-
if (attn_mask[y] == 1) {
|
|
23870
|
-
for (let x = 0; x < 3; ++x) {
|
|
23871
|
-
position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
|
|
23872
|
-
}
|
|
23873
|
-
++count2;
|
|
23874
|
-
}
|
|
23875
|
-
}
|
|
23876
|
-
const max_llm_positions = max(llm_positions)[0];
|
|
23877
|
-
mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
|
|
23878
|
-
}
|
|
23879
|
-
return [
|
|
23880
|
-
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
23881
|
-
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
23882
|
-
];
|
|
23883
|
-
} else {
|
|
23884
|
-
if (attention_mask) {
|
|
23885
|
-
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
23886
|
-
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
23887
|
-
const mrope_position_deltas2 = Array.from(
|
|
23888
|
-
{ length: dims[0] },
|
|
23889
|
-
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
23890
|
-
);
|
|
23891
|
-
return [
|
|
23892
|
-
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
23893
|
-
new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
|
|
23894
|
-
];
|
|
23895
|
-
} else {
|
|
23896
|
-
const [batch_size, seq_length] = input_ids.dims;
|
|
23897
|
-
const position_ids = BigInt64Array.from(
|
|
23898
|
-
{ length: 3 * batch_size * seq_length },
|
|
23899
|
-
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
23900
|
-
);
|
|
23901
|
-
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
23902
|
-
}
|
|
23903
|
-
}
|
|
23904
|
-
}
|
|
23905
|
-
async encode_image({ pixel_values, image_grid_thw }) {
|
|
23906
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
23907
|
-
pixel_values,
|
|
23908
|
-
[this.image_grid_thw_name]: image_grid_thw
|
|
23909
|
-
})).image_features;
|
|
23910
|
-
return features;
|
|
23911
|
-
}
|
|
23912
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
23913
|
-
return default_merge_input_ids_with_image_features({
|
|
23914
|
-
// @ts-ignore
|
|
23915
|
-
image_token_id: this.config.image_token_id,
|
|
23916
|
-
...kwargs
|
|
23917
|
-
});
|
|
23918
|
-
}
|
|
23919
|
-
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
23920
|
-
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
23921
|
-
if (!model_inputs.past_key_values) {
|
|
23922
|
-
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
23923
|
-
model_inputs.input_ids,
|
|
23924
|
-
model_inputs.image_grid_thw,
|
|
23925
|
-
model_inputs.video_grid_thw,
|
|
23926
|
-
model_inputs.attention_mask
|
|
23927
|
-
);
|
|
23928
|
-
} else {
|
|
23929
|
-
model_inputs.pixel_values = null;
|
|
23930
|
-
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
23931
|
-
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
23932
|
-
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
23933
|
-
model_inputs.input_ids,
|
|
23934
|
-
model_inputs.image_grid_thw,
|
|
23935
|
-
model_inputs.video_grid_thw,
|
|
23936
|
-
model_inputs.attention_mask
|
|
23937
|
-
);
|
|
23938
|
-
model_inputs.rope_deltas = rope_deltas;
|
|
23939
|
-
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
23940
|
-
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
23941
|
-
} else {
|
|
23942
|
-
if (!model_inputs.rope_deltas) {
|
|
23943
|
-
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
23944
|
-
model_inputs.input_ids,
|
|
23945
|
-
model_inputs.image_grid_thw,
|
|
23946
|
-
model_inputs.video_grid_thw,
|
|
23947
|
-
model_inputs.attention_mask
|
|
23948
|
-
);
|
|
23949
|
-
}
|
|
23950
|
-
const delta = BigInt(past_length);
|
|
23951
|
-
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
23952
|
-
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
23953
|
-
}
|
|
23954
|
-
}
|
|
23955
|
-
}
|
|
23956
|
-
return model_inputs;
|
|
23957
|
-
}
|
|
23958
|
-
};
|
|
23959
|
-
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
23960
|
-
};
|
|
23961
|
-
|
|
23962
|
-
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
23963
|
-
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
23964
|
-
image_grid_thw_name = "image_grid_thw";
|
|
23965
|
-
};
|
|
23966
|
-
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
23967
|
-
image_grid_thw_name = "image_grid_thw";
|
|
23968
|
-
};
|
|
23969
|
-
|
|
23970
24288
|
// src/models/qwen3/modeling_qwen3.js
|
|
23971
24289
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
23972
24290
|
};
|
|
@@ -24412,6 +24730,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
|
|
|
24412
24730
|
}
|
|
24413
24731
|
};
|
|
24414
24732
|
|
|
24733
|
+
// src/models/solar_open/modeling_solar_open.js
|
|
24734
|
+
var SolarOpenPreTrainedModel = class extends PreTrainedModel {
|
|
24735
|
+
};
|
|
24736
|
+
var SolarOpenModel = class extends SolarOpenPreTrainedModel {
|
|
24737
|
+
};
|
|
24738
|
+
var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
|
|
24739
|
+
};
|
|
24740
|
+
|
|
24415
24741
|
// src/models/speecht5/modeling_speecht5.js
|
|
24416
24742
|
var SpeechT5PreTrainedModel = class extends PreTrainedModel {
|
|
24417
24743
|
};
|
|
@@ -25528,6 +25854,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
|
|
|
25528
25854
|
// src/models/registry.js
|
|
25529
25855
|
var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
|
|
25530
25856
|
["bert", "BertModel"],
|
|
25857
|
+
["eurobert", "EuroBertModel"],
|
|
25531
25858
|
["neobert", "NeoBertModel"],
|
|
25532
25859
|
["modernbert", "ModernBertModel"],
|
|
25533
25860
|
["nomic_bert", "NomicBertModel"],
|
|
@@ -25659,6 +25986,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
25659
25986
|
["gemma3_text", "Gemma3Model"],
|
|
25660
25987
|
["helium", "HeliumModel"],
|
|
25661
25988
|
["glm", "GlmModel"],
|
|
25989
|
+
["glm_moe_dsa", "GlmMoeDsaModel"],
|
|
25662
25990
|
["openelm", "OpenELMModel"],
|
|
25663
25991
|
["qwen2", "Qwen2Model"],
|
|
25664
25992
|
["qwen2_moe", "Qwen2MoeModel"],
|
|
@@ -25670,12 +25998,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
25670
25998
|
["mpt", "MptModel"],
|
|
25671
25999
|
["opt", "OPTModel"],
|
|
25672
26000
|
["mistral", "MistralModel"],
|
|
26001
|
+
["mistral4", "Mistral4Model"],
|
|
25673
26002
|
["ministral", "MinistralModel"],
|
|
25674
26003
|
["ministral3", "Ministral3Model"],
|
|
25675
26004
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
25676
26005
|
["starcoder2", "Starcoder2Model"],
|
|
26006
|
+
["deepseek_v3", "DeepseekV3Model"],
|
|
25677
26007
|
["falcon", "FalconModel"],
|
|
25678
26008
|
["falcon_h1", "FalconH1Model"],
|
|
26009
|
+
["nemotron_h", "NemotronHModel"],
|
|
26010
|
+
["solar_open", "SolarOpenModel"],
|
|
25679
26011
|
["stablelm", "StableLmModel"],
|
|
25680
26012
|
["modernbert-decoder", "ModernBertDecoderModel"],
|
|
25681
26013
|
["hunyuan_v1_dense", "HunYuanDenseV1Model"],
|
|
@@ -25695,6 +26027,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25695
26027
|
]);
|
|
25696
26028
|
var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
25697
26029
|
["bert", "BertForSequenceClassification"],
|
|
26030
|
+
["eurobert", "EuroBertForSequenceClassification"],
|
|
25698
26031
|
["neobert", "NeoBertForSequenceClassification"],
|
|
25699
26032
|
["modernbert", "ModernBertForSequenceClassification"],
|
|
25700
26033
|
["roformer", "RoFormerForSequenceClassification"],
|
|
@@ -25717,6 +26050,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25717
26050
|
]);
|
|
25718
26051
|
var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
25719
26052
|
["bert", "BertForTokenClassification"],
|
|
26053
|
+
["eurobert", "EuroBertForTokenClassification"],
|
|
25720
26054
|
["neobert", "NeoBertForTokenClassification"],
|
|
25721
26055
|
["modernbert", "ModernBertForTokenClassification"],
|
|
25722
26056
|
["roformer", "RoFormerForTokenClassification"],
|
|
@@ -25779,6 +26113,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25779
26113
|
["gemma3", "Gemma3ForCausalLM"],
|
|
25780
26114
|
["helium", "HeliumForCausalLM"],
|
|
25781
26115
|
["glm", "GlmForCausalLM"],
|
|
26116
|
+
["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
|
|
25782
26117
|
["openelm", "OpenELMForCausalLM"],
|
|
25783
26118
|
["qwen2", "Qwen2ForCausalLM"],
|
|
25784
26119
|
["qwen2_moe", "Qwen2MoeForCausalLM"],
|
|
@@ -25790,6 +26125,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25790
26125
|
["qwen3_vl", "Qwen3VLForCausalLM"],
|
|
25791
26126
|
["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
|
|
25792
26127
|
["qwen3_5", "Qwen3_5ForCausalLM"],
|
|
26128
|
+
["qwen3_5_text", "Qwen3_5ForCausalLM"],
|
|
25793
26129
|
["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
|
|
25794
26130
|
["gemma3n", "Gemma3nForCausalLM"],
|
|
25795
26131
|
["phi", "PhiForCausalLM"],
|
|
@@ -25798,13 +26134,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25798
26134
|
["opt", "OPTForCausalLM"],
|
|
25799
26135
|
["mbart", "MBartForCausalLM"],
|
|
25800
26136
|
["mistral", "MistralForCausalLM"],
|
|
26137
|
+
["mistral4", "Mistral4ForCausalLM"],
|
|
25801
26138
|
["ministral", "MinistralForCausalLM"],
|
|
25802
26139
|
["ministral3", "Ministral3ForCausalLM"],
|
|
25803
26140
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
25804
26141
|
["starcoder2", "Starcoder2ForCausalLM"],
|
|
26142
|
+
["deepseek_v3", "DeepseekV3ForCausalLM"],
|
|
25805
26143
|
["falcon", "FalconForCausalLM"],
|
|
25806
26144
|
["falcon_h1", "FalconH1ForCausalLM"],
|
|
26145
|
+
["nemotron_h", "NemotronHForCausalLM"],
|
|
25807
26146
|
["trocr", "TrOCRForCausalLM"],
|
|
26147
|
+
["solar_open", "SolarOpenForCausalLM"],
|
|
25808
26148
|
["stablelm", "StableLmForCausalLM"],
|
|
25809
26149
|
["modernbert-decoder", "ModernBertDecoderForCausalLM"],
|
|
25810
26150
|
["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
|
|
@@ -25815,6 +26155,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25815
26155
|
var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
|
|
25816
26156
|
var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
25817
26157
|
["bert", "BertForMaskedLM"],
|
|
26158
|
+
["eurobert", "EuroBertForMaskedLM"],
|
|
25818
26159
|
["neobert", "NeoBertForMaskedLM"],
|
|
25819
26160
|
["modernbert", "ModernBertForMaskedLM"],
|
|
25820
26161
|
["roformer", "RoFormerForMaskedLM"],
|
|
@@ -25872,8 +26213,11 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25872
26213
|
["smolvlm", "SmolVLMForConditionalGeneration"],
|
|
25873
26214
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
25874
26215
|
["llava_qwen2", "LlavaQwen2ForCausalLM"],
|
|
26216
|
+
["gemma3", "Gemma3ForConditionalGeneration"],
|
|
25875
26217
|
["gemma3n", "Gemma3nForConditionalGeneration"],
|
|
25876
|
-
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
26218
|
+
["mistral3", "Mistral3ForConditionalGeneration"],
|
|
26219
|
+
["lighton_ocr", "LightOnOcrForConditionalGeneration"],
|
|
26220
|
+
["glm_ocr", "GlmOcrForConditionalGeneration"]
|
|
25877
26221
|
]);
|
|
25878
26222
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
25879
26223
|
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
@@ -25978,6 +26322,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
25978
26322
|
]);
|
|
25979
26323
|
// Single-entry lookup table: the key "swin2sr" maps to the class name string
// "Swin2SRForImageSuperResolution".
// NOTE(review): presumably keyed by `config.model_type` like the sibling
// *_MAPPING_NAMES tables — confirm against the code that consumes it.
var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
|
|
25980
26324
|
var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
26325
|
+
["chmv2", "CHMv2ForDepthEstimation"],
|
|
25981
26326
|
["dpt", "DPTForDepthEstimation"],
|
|
25982
26327
|
["depth_anything", "DepthAnythingForDepthEstimation"],
|
|
25983
26328
|
["glpn", "GLPNForDepthEstimation"],
|
|
@@ -26063,13 +26408,6 @@ var CUSTOM_MAPPING = [
|
|
|
26063
26408
|
],
|
|
26064
26409
|
["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
|
|
26065
26410
|
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
|
|
26066
|
-
["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26067
|
-
["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26068
|
-
["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26069
|
-
["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26070
|
-
["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26071
|
-
["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26072
|
-
["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
26073
26411
|
[
|
|
26074
26412
|
"VoxtralRealtimeForConditionalGeneration",
|
|
26075
26413
|
VoxtralRealtimeForConditionalGeneration,
|
|
@@ -27751,6 +28089,41 @@ var TASK_ALIASES = Object.freeze({
|
|
|
27751
28089
|
embeddings: "feature-extraction"
|
|
27752
28090
|
});
|
|
27753
28091
|
|
|
28092
|
+
// src/utils/model_registry/resolve_model_type.js
|
|
28093
|
+
/**
 * Resolves the entry of `MODEL_TYPE_MAPPING` that applies to a model config.
 *
 * Lookup keys are tried in this order, and the first one that yields a defined
 * value wins:
 *   1. each name in `config.architectures`, in order;
 *   2. `config.model_type` itself;
 *   3. for every table in `MODEL_MAPPING_NAMES` that contains
 *      `config.model_type`, the name that table stores for it.
 *
 * When nothing matches, `MODEL_TYPES.EncoderOnly` is returned and, unless
 * disabled, a warning is logged.
 *
 * @param {Object} config Model config; `architectures` and `model_type` are read.
 * @param {Object} [options]
 * @param {boolean} [options.warn=true] Whether to log a warning when falling back.
 * @returns The matching `MODEL_TYPE_MAPPING` value, or `MODEL_TYPES.EncoderOnly`.
 */
function resolve_model_type(config, { warn = true } = {}) {
  /** @type {string[]} */
  const architectures = config.architectures || [];
  const modelTypeName = config.model_type;

  // Lazily produces lookup keys in priority order, so later sources are only
  // consulted when every earlier key has missed.
  function* lookupKeys() {
    yield* architectures;
    if (!modelTypeName) {
      return;
    }
    yield modelTypeName;
    for (const nameTable of Object.values(MODEL_MAPPING_NAMES)) {
      if (nameTable.has(modelTypeName)) {
        yield nameTable.get(modelTypeName);
      }
    }
  }

  for (const key of lookupKeys()) {
    const hit = MODEL_TYPE_MAPPING.get(key);
    if (hit !== undefined) {
      return hit;
    }
  }

  if (warn) {
    const archList = architectures.length > 0 ? architectures.join(", ") : "(none)";
    logger.warn(
      `[resolve_model_type] Architecture(s) not found in MODEL_TYPE_MAPPING: [${archList}] for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`
    );
  }
  return MODEL_TYPES.EncoderOnly;
}
|
|
28126
|
+
|
|
27754
28127
|
// src/utils/model_registry/get_model_files.js
|
|
27755
28128
|
function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
|
|
27756
28129
|
if (config !== null) {
|
|
@@ -27773,43 +28146,7 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
|
|
|
27773
28146
|
const subfolder = "onnx";
|
|
27774
28147
|
const rawDevice = overrideDevice ?? custom_config.device;
|
|
27775
28148
|
let dtype = overrideDtype ?? custom_config.dtype;
|
|
27776
|
-
|
|
27777
|
-
const architectures = (
|
|
27778
|
-
/** @type {string[]} */
|
|
27779
|
-
config.architectures || []
|
|
27780
|
-
);
|
|
27781
|
-
let foundInMapping = false;
|
|
27782
|
-
for (const arch of architectures) {
|
|
27783
|
-
const mappedType = MODEL_TYPE_MAPPING.get(arch);
|
|
27784
|
-
if (mappedType !== void 0) {
|
|
27785
|
-
modelType = mappedType;
|
|
27786
|
-
foundInMapping = true;
|
|
27787
|
-
break;
|
|
27788
|
-
}
|
|
27789
|
-
}
|
|
27790
|
-
if (!foundInMapping && config.model_type) {
|
|
27791
|
-
const mappedType = MODEL_TYPE_MAPPING.get(config.model_type);
|
|
27792
|
-
if (mappedType !== void 0) {
|
|
27793
|
-
modelType = mappedType;
|
|
27794
|
-
foundInMapping = true;
|
|
27795
|
-
}
|
|
27796
|
-
if (!foundInMapping) {
|
|
27797
|
-
for (const mapping of Object.values(MODEL_MAPPING_NAMES)) {
|
|
27798
|
-
if (mapping.has(config.model_type)) {
|
|
27799
|
-
modelType = MODEL_TYPE_MAPPING.get(mapping.get(config.model_type));
|
|
27800
|
-
foundInMapping = true;
|
|
27801
|
-
break;
|
|
27802
|
-
}
|
|
27803
|
-
}
|
|
27804
|
-
}
|
|
27805
|
-
}
|
|
27806
|
-
if (!foundInMapping) {
|
|
27807
|
-
const archList = architectures.length > 0 ? architectures.join(", ") : "(none)";
|
|
27808
|
-
logger.warn(
|
|
27809
|
-
`[get_model_files] Architecture(s) not found in MODEL_TYPE_MAPPING: [${archList}] for model type '${config.model_type}'. Falling back to EncoderOnly (single model.onnx file). If you encounter issues, please report at: ${GITHUB_ISSUE_URL}`
|
|
27810
|
-
);
|
|
27811
|
-
modelType = MODEL_TYPES.EncoderOnly;
|
|
27812
|
-
}
|
|
28149
|
+
const modelType = resolve_model_type(config);
|
|
27813
28150
|
const add_model_file = (fileName, baseName = null) => {
|
|
27814
28151
|
baseName = baseName ?? fileName;
|
|
27815
28152
|
const selectedDevice = selectDevice(rawDevice, fileName);
|
|
@@ -28396,6 +28733,31 @@ async function clear_pipeline_cache(task, modelId, options = {}) {
|
|
|
28396
28733
|
return await clear_files_from_cache(modelId, files, options);
|
|
28397
28734
|
}
|
|
28398
28735
|
|
|
28736
|
+
// src/utils/model_registry/get_available_dtypes.js
|
|
28737
|
+
// Every dtype key that has an entry in DEFAULT_DTYPE_SUFFIX_MAPPING; each one
// is probed individually by get_available_dtypes.
var CONCRETE_DTYPES = Object.keys(DEFAULT_DTYPE_SUFFIX_MAPPING);

/**
 * Reports which dtypes have a complete set of ONNX files for a model.
 *
 * For each dtype in `CONCRETE_DTYPES`, the file `onnx/<baseName><suffix>.onnx`
 * is looked up (via `get_file_metadata`) for every session base name returned
 * by `getSessionsConfig`. A dtype is reported only when every one of those
 * lookups has a truthy `exists` flag.
 *
 * @param {string} modelId The model id.
 * @param {Object} [options]
 * @param {Object} [options.config=null] Pre-loaded config, forwarded to `get_config`.
 * @param {string} [options.model_file_name=null] Forwarded to `getSessionsConfig`.
 * @param {string} [options.revision='main'] Model revision.
 * @param {string} [options.cache_dir=null] Custom cache directory.
 * @param {boolean} [options.local_files_only=false] Forwarded to the config and metadata lookups.
 * @returns {Promise<string[]>} Available dtypes, in `CONCRETE_DTYPES` order.
 */
async function get_available_dtypes(modelId, { config = null, model_file_name = null, revision = "main", cache_dir = null, local_files_only = false } = {}) {
  const resolvedConfig = await get_config(modelId, { config, cache_dir, local_files_only, revision });
  const onnxDir = "onnx";
  const resolvedType = resolve_model_type(resolvedConfig);
  const { sessions } = getSessionsConfig(resolvedType, resolvedConfig, { model_file_name });
  const sessionBaseNames = Object.values(sessions);
  const lookupOptions = { revision, cache_dir, local_files_only };

  // Resolves to the `exists` flag of a single file's metadata.
  const fileExists = async (relativePath) => {
    const { exists } = await get_file_metadata(modelId, relativePath, lookupOptions);
    return exists;
  };

  // Resolves to true when every session file is present for the given dtype.
  const hasAllSessionFiles = async (dtype) => {
    const suffix = DEFAULT_DTYPE_SUFFIX_MAPPING[dtype] ?? "";
    const flags = await Promise.all(
      sessionBaseNames.map((baseName) => fileExists(`${onnxDir}/${baseName}${suffix}.onnx`))
    );
    return flags.every(Boolean);
  };

  // All dtypes are probed concurrently; results line up with CONCRETE_DTYPES by index.
  const availability = await Promise.all(CONCRETE_DTYPES.map((dtype) => hasAllSessionFiles(dtype)));
  return CONCRETE_DTYPES.filter((_, index) => availability[index]);
}
|
|
28760
|
+
|
|
28399
28761
|
// src/utils/model_registry/ModelRegistry.js
|
|
28400
28762
|
var ModelRegistry = class {
|
|
28401
28763
|
/**
|
|
@@ -28482,6 +28844,29 @@ var ModelRegistry = class {
|
|
|
28482
28844
|
static async get_processor_files(modelId) {
|
|
28483
28845
|
return get_processor_files(modelId);
|
|
28484
28846
|
}
|
|
28847
|
+
/**
|
|
28848
|
+
* Detects which quantization levels (dtypes) are available for a model
|
|
28849
|
+
* by checking which ONNX files exist on the hub or locally.
|
|
28850
|
+
*
|
|
28851
|
+
* A dtype is considered available if all required model session files
|
|
28852
|
+
* exist for that dtype.
|
|
28853
|
+
*
|
|
28854
|
+
* @param {string} modelId - The model id (e.g., "onnx-community/all-MiniLM-L6-v2-ONNX")
|
|
28855
|
+
* @param {Object} [options] - Optional parameters
|
|
28856
|
+
* @param {import('../../configs.js').PretrainedConfig} [options.config=null] - Pre-loaded config
|
|
28857
|
+
* @param {string} [options.model_file_name=null] - Override the model file name (excluding .onnx suffix)
|
|
28858
|
+
* @param {string} [options.revision='main'] - Model revision
|
|
28859
|
+
* @param {string} [options.cache_dir=null] - Custom cache directory
|
|
28860
|
+
* @param {boolean} [options.local_files_only=false] - Only check local files
|
|
28861
|
+
* @returns {Promise<string[]>} Array of available dtype strings (e.g., ['fp32', 'fp16', 'q4', 'q8'])
|
|
28862
|
+
*
|
|
28863
|
+
* @example
|
|
28864
|
+
* const dtypes = await ModelRegistry.get_available_dtypes('onnx-community/all-MiniLM-L6-v2-ONNX');
|
|
28865
|
+
* console.log(dtypes); // ['fp32', 'fp16', 'int8', 'uint8', 'q8', 'q4']
|
|
28866
|
+
*/
|
|
28867
|
+
static async get_available_dtypes(modelId, options = {}) {
|
|
28868
|
+
return get_available_dtypes(modelId, options);
|
|
28869
|
+
}
|
|
28485
28870
|
/**
|
|
28486
28871
|
* Quickly checks if a model is fully cached by verifying `config.json` is present,
|
|
28487
28872
|
* then confirming all required files are cached.
|
|
@@ -28716,6 +29101,9 @@ export {
|
|
|
28716
29101
|
BloomModel,
|
|
28717
29102
|
BloomPreTrainedModel,
|
|
28718
29103
|
BloomTokenizer,
|
|
29104
|
+
CHMv2ForDepthEstimation,
|
|
29105
|
+
CHMv2ImageProcessor,
|
|
29106
|
+
CHMv2PreTrainedModel,
|
|
28719
29107
|
CLIPFeatureExtractor,
|
|
28720
29108
|
CLIPImageProcessor,
|
|
28721
29109
|
CLIPModel,
|
|
@@ -28811,6 +29199,9 @@ export {
|
|
|
28811
29199
|
DebertaV2Tokenizer,
|
|
28812
29200
|
DecisionTransformerModel,
|
|
28813
29201
|
DecisionTransformerPreTrainedModel,
|
|
29202
|
+
DeepseekV3ForCausalLM,
|
|
29203
|
+
DeepseekV3Model,
|
|
29204
|
+
DeepseekV3PreTrainedModel,
|
|
28814
29205
|
DeiTFeatureExtractor,
|
|
28815
29206
|
DeiTForImageClassification,
|
|
28816
29207
|
DeiTImageProcessor,
|
|
@@ -28871,6 +29262,11 @@ export {
|
|
|
28871
29262
|
EsmModel,
|
|
28872
29263
|
EsmPreTrainedModel,
|
|
28873
29264
|
EsmTokenizer,
|
|
29265
|
+
EuroBertForMaskedLM,
|
|
29266
|
+
EuroBertForSequenceClassification,
|
|
29267
|
+
EuroBertForTokenClassification,
|
|
29268
|
+
EuroBertModel,
|
|
29269
|
+
EuroBertPreTrainedModel,
|
|
28874
29270
|
ExaoneForCausalLM,
|
|
28875
29271
|
ExaoneModel,
|
|
28876
29272
|
ExaonePreTrainedModel,
|
|
@@ -28917,8 +29313,11 @@ export {
|
|
|
28917
29313
|
Gemma2Model,
|
|
28918
29314
|
Gemma2PreTrainedModel,
|
|
28919
29315
|
Gemma3ForCausalLM,
|
|
29316
|
+
Gemma3ForConditionalGeneration,
|
|
29317
|
+
Gemma3ImageProcessor,
|
|
28920
29318
|
Gemma3Model,
|
|
28921
29319
|
Gemma3PreTrainedModel,
|
|
29320
|
+
Gemma3Processor,
|
|
28922
29321
|
Gemma3nAudioFeatureExtractor,
|
|
28923
29322
|
Gemma3nForCausalLM,
|
|
28924
29323
|
Gemma3nForConditionalGeneration,
|
|
@@ -28928,8 +29327,14 @@ export {
|
|
|
28928
29327
|
GemmaModel,
|
|
28929
29328
|
GemmaPreTrainedModel,
|
|
28930
29329
|
GemmaTokenizer,
|
|
29330
|
+
Glm46VImageProcessor,
|
|
29331
|
+
Glm46VProcessor,
|
|
28931
29332
|
GlmForCausalLM,
|
|
28932
29333
|
GlmModel,
|
|
29334
|
+
GlmMoeDsaForCausalLM,
|
|
29335
|
+
GlmMoeDsaModel,
|
|
29336
|
+
GlmMoeDsaPreTrainedModel,
|
|
29337
|
+
GlmOcrForConditionalGeneration,
|
|
28933
29338
|
GlmPreTrainedModel,
|
|
28934
29339
|
GptOssForCausalLM,
|
|
28935
29340
|
GptOssModel,
|
|
@@ -28995,6 +29400,7 @@ export {
|
|
|
28995
29400
|
Lfm2VlForConditionalGeneration,
|
|
28996
29401
|
Lfm2VlImageProcessor,
|
|
28997
29402
|
Lfm2VlProcessor,
|
|
29403
|
+
LightOnOcrForConditionalGeneration,
|
|
28998
29404
|
LiteWhisperForConditionalGeneration,
|
|
28999
29405
|
Llama4ForCausalLM,
|
|
29000
29406
|
Llama4PreTrainedModel,
|
|
@@ -29064,6 +29470,9 @@ export {
|
|
|
29064
29470
|
MimiPreTrainedModel,
|
|
29065
29471
|
MinLengthLogitsProcessor,
|
|
29066
29472
|
MinNewTokensLengthLogitsProcessor,
|
|
29473
|
+
Mistral4ForCausalLM,
|
|
29474
|
+
Mistral4Model,
|
|
29475
|
+
Mistral4PreTrainedModel,
|
|
29067
29476
|
MistralForCausalLM,
|
|
29068
29477
|
MistralModel,
|
|
29069
29478
|
MistralPreTrainedModel,
|
|
@@ -29135,6 +29544,9 @@ export {
|
|
|
29135
29544
|
NanoChatForCausalLM,
|
|
29136
29545
|
NanoChatModel,
|
|
29137
29546
|
NanoChatPreTrainedModel,
|
|
29547
|
+
NemotronHForCausalLM,
|
|
29548
|
+
NemotronHModel,
|
|
29549
|
+
NemotronHPreTrainedModel,
|
|
29138
29550
|
NeoBertForMaskedLM,
|
|
29139
29551
|
NeoBertForQuestionAnswering,
|
|
29140
29552
|
NeoBertForSequenceClassification,
|
|
@@ -29324,6 +29736,9 @@ export {
|
|
|
29324
29736
|
SnacFeatureExtractor,
|
|
29325
29737
|
SnacModel,
|
|
29326
29738
|
SnacPreTrainedModel,
|
|
29739
|
+
SolarOpenForCausalLM,
|
|
29740
|
+
SolarOpenModel,
|
|
29741
|
+
SolarOpenPreTrainedModel,
|
|
29327
29742
|
SpeechT5FeatureExtractor,
|
|
29328
29743
|
SpeechT5ForSpeechToText,
|
|
29329
29744
|
SpeechT5ForTextToSpeech,
|