@huggingface/transformers 4.0.0-next.7 → 4.0.0-next.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +13 -2
- package/dist/transformers.js +689 -382
- package/dist/transformers.min.js +19 -19
- package/dist/transformers.node.cjs +716 -382
- package/dist/transformers.node.min.cjs +19 -19
- package/dist/transformers.node.min.mjs +19 -19
- package/dist/transformers.node.mjs +689 -382
- package/dist/transformers.web.js +697 -390
- package/dist/transformers.web.min.js +17 -17
- package/package.json +2 -2
- package/src/configs.js +28 -22
- package/src/env.js +1 -1
- package/src/image_processors_utils.js +25 -15
- package/src/models/chmv2/image_processing_chmv2.js +3 -0
- package/src/models/chmv2/modeling_chmv2.js +4 -0
- package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
- package/src/models/eurobert/modeling_eurobert.js +41 -0
- package/src/models/glm46v/image_processing_glm46v.js +12 -0
- package/src/models/glm46v/processing_glm46v.js +5 -0
- package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
- package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
- package/src/models/image_processors.js +2 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +1 -1
- package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
- package/src/models/mistral4/modeling_mistral4.js +5 -0
- package/src/models/modeling_utils.js +2 -0
- package/src/models/models.js +10 -1
- package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
- package/src/models/processors.js +1 -0
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +187 -136
- package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
- package/src/models/registry.js +17 -0
- package/src/models/solar_open/modeling_solar_open.js +5 -0
- package/src/pipelines.js +1 -0
- package/src/utils/hub.js +4 -1
- package/src/utils/model_registry/get_file_metadata.js +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +3 -2
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
- package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
- package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
- package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
- package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
- package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
- package/types/models/glm46v/processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
- package/types/models/image_processors.d.ts +2 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
- package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
- package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +10 -1
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
- package/types/models/processors.d.ts +1 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +41 -6
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
- package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
- package/types/pipelines.d.ts +1 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/models/ast/modeling_ast.d.ts.map +0 -1
- package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
|
@@ -117,6 +117,9 @@ __export(transformers_exports, {
|
|
|
117
117
|
BloomModel: () => BloomModel,
|
|
118
118
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
119
119
|
BloomTokenizer: () => BloomTokenizer,
|
|
120
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
121
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
122
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
120
123
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
121
124
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
122
125
|
CLIPModel: () => CLIPModel,
|
|
@@ -212,6 +215,9 @@ __export(transformers_exports, {
|
|
|
212
215
|
DebertaV2Tokenizer: () => DebertaV2Tokenizer,
|
|
213
216
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
214
217
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
218
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
219
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
220
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
215
221
|
DeiTFeatureExtractor: () => DeiTFeatureExtractor,
|
|
216
222
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
217
223
|
DeiTImageProcessor: () => DeiTImageProcessor,
|
|
@@ -272,6 +278,11 @@ __export(transformers_exports, {
|
|
|
272
278
|
EsmModel: () => EsmModel,
|
|
273
279
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
274
280
|
EsmTokenizer: () => EsmTokenizer,
|
|
281
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
282
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
283
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
284
|
+
EuroBertModel: () => EuroBertModel,
|
|
285
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
275
286
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
276
287
|
ExaoneModel: () => ExaoneModel,
|
|
277
288
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -329,8 +340,14 @@ __export(transformers_exports, {
|
|
|
329
340
|
GemmaModel: () => GemmaModel,
|
|
330
341
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
331
342
|
GemmaTokenizer: () => GemmaTokenizer,
|
|
343
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
344
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
332
345
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
333
346
|
GlmModel: () => GlmModel,
|
|
347
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
348
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
349
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
350
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
334
351
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
335
352
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
336
353
|
GptOssModel: () => GptOssModel,
|
|
@@ -396,6 +413,7 @@ __export(transformers_exports, {
|
|
|
396
413
|
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
397
414
|
Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
|
|
398
415
|
Lfm2VlProcessor: () => Lfm2VlProcessor,
|
|
416
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
399
417
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
400
418
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
401
419
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -465,6 +483,9 @@ __export(transformers_exports, {
|
|
|
465
483
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
466
484
|
MinLengthLogitsProcessor: () => MinLengthLogitsProcessor,
|
|
467
485
|
MinNewTokensLengthLogitsProcessor: () => MinNewTokensLengthLogitsProcessor,
|
|
486
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
487
|
+
Mistral4Model: () => Mistral4Model,
|
|
488
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
468
489
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
469
490
|
MistralModel: () => MistralModel,
|
|
470
491
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -536,6 +557,9 @@ __export(transformers_exports, {
|
|
|
536
557
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
537
558
|
NanoChatModel: () => NanoChatModel,
|
|
538
559
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
560
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
561
|
+
NemotronHModel: () => NemotronHModel,
|
|
562
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
539
563
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
540
564
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
541
565
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -725,6 +749,9 @@ __export(transformers_exports, {
|
|
|
725
749
|
SnacFeatureExtractor: () => SnacFeatureExtractor,
|
|
726
750
|
SnacModel: () => SnacModel,
|
|
727
751
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
752
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
753
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
754
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
728
755
|
SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
|
|
729
756
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
730
757
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
@@ -925,7 +952,7 @@ var import_node_fs = __toESM(require("fs"), 1);
|
|
|
925
952
|
var import_node_path = __toESM(require("path"), 1);
|
|
926
953
|
var import_node_url = __toESM(require("url"), 1);
|
|
927
954
|
var import_meta = {};
|
|
928
|
-
var VERSION = "4.0.0-next.7";
|
|
955
|
+
var VERSION = "4.0.0-next.8";
|
|
929
956
|
var HAS_SELF = typeof self !== "undefined";
|
|
930
957
|
var IS_FS_AVAILABLE = !isEmpty(import_node_fs.default);
|
|
931
958
|
var IS_PATH_AVAILABLE = !isEmpty(import_node_path.default);
|
|
@@ -1155,7 +1182,7 @@ var logger = {
|
|
|
1155
1182
|
}
|
|
1156
1183
|
};
|
|
1157
1184
|
|
|
1158
|
-
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.
|
|
1185
|
+
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
|
|
1159
1186
|
var DictionarySplitter = class {
|
|
1160
1187
|
/**
|
|
1161
1188
|
* @param dictionary The dictionary of words to use for splitting.
|
|
@@ -2811,10 +2838,10 @@ var BPE = class extends TokenizerModel_default {
|
|
|
2811
2838
|
);
|
|
2812
2839
|
if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
|
|
2813
2840
|
output_tokens.push(...byte_tokens);
|
|
2814
|
-
} else {
|
|
2841
|
+
} else if (this.unk_token != null) {
|
|
2815
2842
|
output_tokens.push(this.unk_token);
|
|
2816
2843
|
}
|
|
2817
|
-
} else {
|
|
2844
|
+
} else if (this.unk_token != null) {
|
|
2818
2845
|
output_tokens.push(this.unk_token);
|
|
2819
2846
|
}
|
|
2820
2847
|
}
|
|
@@ -7426,13 +7453,15 @@ async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey,
|
|
|
7426
7453
|
wrapped_progress
|
|
7427
7454
|
);
|
|
7428
7455
|
} else if (typeof response !== "string") {
|
|
7456
|
+
const headers = new Headers(response.headers);
|
|
7457
|
+
headers.set("content-length", result.byteLength.toString());
|
|
7429
7458
|
await cache2.put(
|
|
7430
7459
|
cacheKey,
|
|
7431
7460
|
new Response(
|
|
7432
7461
|
/** @type {any} */
|
|
7433
7462
|
result,
|
|
7434
7463
|
{
|
|
7435
|
-
headers: response.headers
|
|
7464
|
+
headers
|
|
7436
7465
|
}
|
|
7437
7466
|
)
|
|
7438
7467
|
).catch((err) => {
|
|
@@ -16643,6 +16672,7 @@ __export(processors_exports, {
|
|
|
16643
16672
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
16644
16673
|
Florence2Processor: () => Florence2Processor,
|
|
16645
16674
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
16675
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
16646
16676
|
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
16647
16677
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
16648
16678
|
Idefics3Processor: () => Idefics3Processor,
|
|
@@ -19147,26 +19177,29 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
19147
19177
|
}
|
|
19148
19178
|
return [segmentation, segments];
|
|
19149
19179
|
}
|
|
19150
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
19180
|
+
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
|
|
19151
19181
|
if (height < factor || width < factor) {
|
|
19152
|
-
|
|
19153
|
-
|
|
19182
|
+
const scale = Math.max(factor / height, factor / width);
|
|
19183
|
+
height = Math.round(height * scale);
|
|
19184
|
+
width = Math.round(width * scale);
|
|
19185
|
+
}
|
|
19186
|
+
if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
19154
19187
|
throw new Error(
|
|
19155
19188
|
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
19156
19189
|
);
|
|
19157
19190
|
}
|
|
19158
19191
|
let h_bar = Math.round(height / factor) * factor;
|
|
19159
19192
|
let w_bar = Math.round(width / factor) * factor;
|
|
19160
|
-
if (h_bar * w_bar > max_pixels) {
|
|
19161
|
-
const beta = Math.sqrt(height * width / max_pixels);
|
|
19162
|
-
h_bar = Math.floor(height / beta / factor) * factor;
|
|
19163
|
-
w_bar = Math.floor(width / beta / factor) * factor;
|
|
19164
|
-
} else if (h_bar * w_bar < min_pixels) {
|
|
19165
|
-
const beta = Math.sqrt(min_pixels / (height * width));
|
|
19193
|
+
if (temporal_factor * h_bar * w_bar > max_pixels) {
|
|
19194
|
+
const beta = Math.sqrt(temporal_factor * height * width / max_pixels);
|
|
19195
|
+
h_bar = Math.max(factor, Math.floor(height / beta / factor) * factor);
|
|
19196
|
+
w_bar = Math.max(factor, Math.floor(width / beta / factor) * factor);
|
|
19197
|
+
} else if (temporal_factor * h_bar * w_bar < min_pixels) {
|
|
19198
|
+
const beta = Math.sqrt(min_pixels / (temporal_factor * height * width));
|
|
19166
19199
|
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
19167
19200
|
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
19168
19201
|
}
|
|
19169
|
-
return [h_bar, w_bar];
|
|
19202
|
+
return [w_bar, h_bar];
|
|
19170
19203
|
}
|
|
19171
19204
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
19172
19205
|
if (label_ids_to_fuse === null) {
|
|
@@ -19245,7 +19278,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
19245
19278
|
this.do_pad = config.do_pad;
|
|
19246
19279
|
this.min_pixels = config.min_pixels;
|
|
19247
19280
|
this.max_pixels = config.max_pixels;
|
|
19248
|
-
if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
19281
|
+
if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
19249
19282
|
this.pad_size = this.size;
|
|
19250
19283
|
}
|
|
19251
19284
|
this.do_flip_channel_order = config.do_flip_channel_order ?? false;
|
|
@@ -19533,10 +19566,8 @@ var ImageProcessor = class extends Callable2 {
|
|
|
19533
19566
|
const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
|
|
19534
19567
|
[pixelData, imgDims] = padded;
|
|
19535
19568
|
} else if (this.size_divisibility) {
|
|
19536
|
-
const
|
|
19537
|
-
|
|
19538
|
-
this.size_divisibility
|
|
19539
|
-
);
|
|
19569
|
+
const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
|
|
19570
|
+
const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
|
|
19540
19571
|
[pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
|
|
19541
19572
|
}
|
|
19542
19573
|
}
|
|
@@ -19613,6 +19644,7 @@ var image_processors_exports = {};
|
|
|
19613
19644
|
__export(image_processors_exports, {
|
|
19614
19645
|
BeitFeatureExtractor: () => BeitFeatureExtractor,
|
|
19615
19646
|
BitImageProcessor: () => BitImageProcessor,
|
|
19647
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
19616
19648
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
19617
19649
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
19618
19650
|
ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
|
|
@@ -19629,6 +19661,7 @@ __export(image_processors_exports, {
|
|
|
19629
19661
|
DonutImageProcessor: () => DonutImageProcessor,
|
|
19630
19662
|
EfficientNetImageProcessor: () => EfficientNetImageProcessor,
|
|
19631
19663
|
GLPNFeatureExtractor: () => GLPNFeatureExtractor,
|
|
19664
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
19632
19665
|
GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
|
|
19633
19666
|
Idefics3ImageProcessor: () => Idefics3ImageProcessor,
|
|
19634
19667
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
@@ -19689,6 +19722,10 @@ var BitImageProcessor = class extends ImageProcessor {
|
|
|
19689
19722
|
var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
|
|
19690
19723
|
};
|
|
19691
19724
|
|
|
19725
|
+
// src/models/chmv2/image_processing_chmv2.js
|
|
19726
|
+
var CHMv2ImageProcessor = class extends ImageProcessor {
|
|
19727
|
+
};
|
|
19728
|
+
|
|
19692
19729
|
// src/models/clip/image_processing_clip.js
|
|
19693
19730
|
var CLIPImageProcessor = class extends ImageProcessor {
|
|
19694
19731
|
};
|
|
@@ -19808,6 +19845,65 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
|
|
|
19808
19845
|
}
|
|
19809
19846
|
};
|
|
19810
19847
|
|
|
19848
|
+
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
19849
|
+
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
19850
|
+
constructor(config) {
|
|
19851
|
+
super(config);
|
|
19852
|
+
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
19853
|
+
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
19854
|
+
this.patch_size = config.patch_size;
|
|
19855
|
+
this.merge_size = config.merge_size;
|
|
19856
|
+
}
|
|
19857
|
+
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
19858
|
+
get_resize_output_image_size(image, size) {
|
|
19859
|
+
const factor = this.patch_size * this.merge_size;
|
|
19860
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
19861
|
+
}
|
|
19862
|
+
async _call(images, ...args) {
|
|
19863
|
+
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
19864
|
+
let patches = pixel_values;
|
|
19865
|
+
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
19866
|
+
if (patches.dims[0] === 1) {
|
|
19867
|
+
patches = cat(
|
|
19868
|
+
Array.from({ length: temporal_patch_size }, () => patches),
|
|
19869
|
+
0
|
|
19870
|
+
);
|
|
19871
|
+
}
|
|
19872
|
+
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
19873
|
+
const channel = patches.dims[1];
|
|
19874
|
+
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
19875
|
+
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
19876
|
+
const flatten_patches = patches.view(
|
|
19877
|
+
grid_t,
|
|
19878
|
+
temporal_patch_size,
|
|
19879
|
+
channel,
|
|
19880
|
+
Math.floor(grid_h / merge_size),
|
|
19881
|
+
merge_size,
|
|
19882
|
+
patch_size,
|
|
19883
|
+
Math.floor(grid_w / merge_size),
|
|
19884
|
+
merge_size,
|
|
19885
|
+
patch_size
|
|
19886
|
+
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
19887
|
+
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
19888
|
+
return {
|
|
19889
|
+
pixel_values: flatten_patches,
|
|
19890
|
+
image_grid_thw,
|
|
19891
|
+
original_sizes,
|
|
19892
|
+
reshaped_input_sizes
|
|
19893
|
+
};
|
|
19894
|
+
}
|
|
19895
|
+
};
|
|
19896
|
+
|
|
19897
|
+
// src/models/glm46v/image_processing_glm46v.js
|
|
19898
|
+
var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
|
|
19899
|
+
/** @type {Qwen2VLImageProcessor['get_resize_output_image_size']} */
|
|
19900
|
+
get_resize_output_image_size(image, size) {
|
|
19901
|
+
const factor = this.patch_size * this.merge_size;
|
|
19902
|
+
const temporal_factor = this.config.temporal_patch_size ?? 2;
|
|
19903
|
+
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels, temporal_factor);
|
|
19904
|
+
}
|
|
19905
|
+
};
|
|
19906
|
+
|
|
19811
19907
|
// src/models/glpn/image_processing_glpn.js
|
|
19812
19908
|
var GLPNFeatureExtractor = class extends ImageProcessor {
|
|
19813
19909
|
};
|
|
@@ -20201,7 +20297,7 @@ var Lfm2VlImageProcessor = class extends ImageProcessor {
|
|
|
20201
20297
|
const img = pixel_values.unsqueeze_(0);
|
|
20202
20298
|
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
20203
20299
|
const f2 = total_factor ** 2;
|
|
20204
|
-
const [new_height, new_width] = smart_resize(
|
|
20300
|
+
const [new_width, new_height] = smart_resize(
|
|
20205
20301
|
Math.max(total_factor, height),
|
|
20206
20302
|
Math.max(total_factor, width),
|
|
20207
20303
|
total_factor,
|
|
@@ -20491,55 +20587,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
|
|
|
20491
20587
|
var PvtImageProcessor = class extends ImageProcessor {
|
|
20492
20588
|
};
|
|
20493
20589
|
|
|
20494
|
-
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
20495
|
-
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
20496
|
-
constructor(config) {
|
|
20497
|
-
super(config);
|
|
20498
|
-
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
20499
|
-
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
20500
|
-
this.patch_size = config.patch_size;
|
|
20501
|
-
this.merge_size = config.merge_size;
|
|
20502
|
-
}
|
|
20503
|
-
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
20504
|
-
get_resize_output_image_size(image, size) {
|
|
20505
|
-
const factor = this.patch_size * this.merge_size;
|
|
20506
|
-
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
20507
|
-
}
|
|
20508
|
-
async _call(images, ...args) {
|
|
20509
|
-
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
20510
|
-
let patches = pixel_values;
|
|
20511
|
-
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
20512
|
-
if (patches.dims[0] === 1) {
|
|
20513
|
-
patches = cat(
|
|
20514
|
-
Array.from({ length: temporal_patch_size }, () => patches),
|
|
20515
|
-
0
|
|
20516
|
-
);
|
|
20517
|
-
}
|
|
20518
|
-
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
20519
|
-
const channel = patches.dims[1];
|
|
20520
|
-
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
20521
|
-
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
20522
|
-
const flatten_patches = patches.view(
|
|
20523
|
-
grid_t,
|
|
20524
|
-
temporal_patch_size,
|
|
20525
|
-
channel,
|
|
20526
|
-
Math.floor(grid_h / merge_size),
|
|
20527
|
-
merge_size,
|
|
20528
|
-
patch_size,
|
|
20529
|
-
Math.floor(grid_w / merge_size),
|
|
20530
|
-
merge_size,
|
|
20531
|
-
patch_size
|
|
20532
|
-
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
20533
|
-
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
20534
|
-
return {
|
|
20535
|
-
pixel_values: flatten_patches,
|
|
20536
|
-
image_grid_thw,
|
|
20537
|
-
original_sizes,
|
|
20538
|
-
reshaped_input_sizes
|
|
20539
|
-
};
|
|
20540
|
-
}
|
|
20541
|
-
};
|
|
20542
|
-
|
|
20543
20590
|
// src/models/rt_detr/image_processing_rt_detr.js
|
|
20544
20591
|
var RTDetrImageProcessor = class extends ImageProcessor {
|
|
20545
20592
|
/** @type {typeof post_process_object_detection} */
|
|
@@ -21093,6 +21140,56 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
21093
21140
|
}
|
|
21094
21141
|
};
|
|
21095
21142
|
|
|
21143
|
+
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
21144
|
+
var Qwen2VLProcessor = class extends Processor {
|
|
21145
|
+
static image_processor_class = AutoImageProcessor;
|
|
21146
|
+
static tokenizer_class = AutoTokenizer;
|
|
21147
|
+
static image_token = "<|image_pad|>";
|
|
21148
|
+
/**
|
|
21149
|
+
*
|
|
21150
|
+
* @param {string|string[]} text
|
|
21151
|
+
* @param {RawImage|RawImage[]} images
|
|
21152
|
+
* @param {...any} args
|
|
21153
|
+
* @returns {Promise<any>}
|
|
21154
|
+
*/
|
|
21155
|
+
async _call(text, images = null, ...args) {
|
|
21156
|
+
if (!Array.isArray(text)) {
|
|
21157
|
+
text = [text];
|
|
21158
|
+
}
|
|
21159
|
+
let image_inputs, image_grid_thw;
|
|
21160
|
+
if (images) {
|
|
21161
|
+
image_inputs = await this.image_processor(images);
|
|
21162
|
+
image_grid_thw = image_inputs.image_grid_thw;
|
|
21163
|
+
}
|
|
21164
|
+
if (image_grid_thw) {
|
|
21165
|
+
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
21166
|
+
let index = 0;
|
|
21167
|
+
const image_token = (
|
|
21168
|
+
/** @type {typeof Qwen2VLProcessor} */
|
|
21169
|
+
this.constructor.image_token
|
|
21170
|
+
);
|
|
21171
|
+
const image_grid_thw_list = image_grid_thw.tolist();
|
|
21172
|
+
text = text.map((t) => {
|
|
21173
|
+
while (t.includes(image_token)) {
|
|
21174
|
+
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
21175
|
+
t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
21176
|
+
}
|
|
21177
|
+
return t.replaceAll("<|placeholder|>", image_token);
|
|
21178
|
+
});
|
|
21179
|
+
}
|
|
21180
|
+
const text_inputs = this.tokenizer(text);
|
|
21181
|
+
return {
|
|
21182
|
+
...text_inputs,
|
|
21183
|
+
...image_inputs
|
|
21184
|
+
};
|
|
21185
|
+
}
|
|
21186
|
+
};
|
|
21187
|
+
|
|
21188
|
+
// src/models/glm46v/processing_glm46v.js
|
|
21189
|
+
var Glm46VProcessor = class extends Qwen2VLProcessor {
|
|
21190
|
+
static image_token = "<|image|>";
|
|
21191
|
+
};
|
|
21192
|
+
|
|
21096
21193
|
// src/models/granite_speech/processing_granite_speech.js
|
|
21097
21194
|
var GraniteSpeechProcessor = class extends Processor {
|
|
21098
21195
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -21823,47 +21920,6 @@ var PyAnnoteProcessor = class extends Processor {
|
|
|
21823
21920
|
}
|
|
21824
21921
|
};
|
|
21825
21922
|
|
|
21826
|
-
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
21827
|
-
var Qwen2VLProcessor = class extends Processor {
|
|
21828
|
-
static image_processor_class = AutoImageProcessor;
|
|
21829
|
-
static tokenizer_class = AutoTokenizer;
|
|
21830
|
-
/**
|
|
21831
|
-
*
|
|
21832
|
-
* @param {string|string[]} text
|
|
21833
|
-
* @param {RawImage|RawImage[]} images
|
|
21834
|
-
* @param {...any} args
|
|
21835
|
-
* @returns {Promise<any>}
|
|
21836
|
-
*/
|
|
21837
|
-
async _call(text, images = null, ...args) {
|
|
21838
|
-
if (!Array.isArray(text)) {
|
|
21839
|
-
text = [text];
|
|
21840
|
-
}
|
|
21841
|
-
let image_inputs, image_grid_thw;
|
|
21842
|
-
if (images) {
|
|
21843
|
-
image_inputs = await this.image_processor(images);
|
|
21844
|
-
image_grid_thw = image_inputs.image_grid_thw;
|
|
21845
|
-
}
|
|
21846
|
-
if (image_grid_thw) {
|
|
21847
|
-
let merge_length = this.image_processor.config.merge_size ** 2;
|
|
21848
|
-
let index = 0;
|
|
21849
|
-
const image_grid_thw_list = image_grid_thw.tolist();
|
|
21850
|
-
text = text.map((t) => {
|
|
21851
|
-
while (t.includes("<|image_pad|>")) {
|
|
21852
|
-
const prod = Number(image_grid_thw_list[index++].reduce((a, b) => a * b, 1n));
|
|
21853
|
-
t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
|
|
21854
|
-
}
|
|
21855
|
-
return t.replaceAll("<|placeholder|>", "<|image_pad|>");
|
|
21856
|
-
});
|
|
21857
|
-
}
|
|
21858
|
-
const text_inputs = this.tokenizer(text);
|
|
21859
|
-
return {
|
|
21860
|
-
...text_inputs,
|
|
21861
|
-
...image_inputs
|
|
21862
|
-
// TODO: ...videos_inputs,
|
|
21863
|
-
};
|
|
21864
|
-
}
|
|
21865
|
-
};
|
|
21866
|
-
|
|
21867
21923
|
// src/models/qwen2_5_vl/processing_qwen2_5_vl.js
|
|
21868
21924
|
var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {
|
|
21869
21925
|
};
|
|
@@ -22207,6 +22263,8 @@ function getNormalizedConfig(config) {
|
|
|
22207
22263
|
case "gemma3n":
|
|
22208
22264
|
case "lfm2_vl":
|
|
22209
22265
|
case "chatterbox":
|
|
22266
|
+
case "lighton_ocr":
|
|
22267
|
+
case "glm_ocr":
|
|
22210
22268
|
case "mistral3":
|
|
22211
22269
|
case "qwen2_5_vl":
|
|
22212
22270
|
case "qwen3_vl":
|
|
@@ -22282,6 +22340,8 @@ function getNormalizedConfig(config) {
|
|
|
22282
22340
|
mapping["dim_kv"] = "head_dim";
|
|
22283
22341
|
break;
|
|
22284
22342
|
case "qwen3":
|
|
22343
|
+
case "solar_open":
|
|
22344
|
+
case "glm_ocr_text":
|
|
22285
22345
|
case "gemma":
|
|
22286
22346
|
case "gemma2":
|
|
22287
22347
|
case "vaultgemma":
|
|
@@ -22292,6 +22352,7 @@ function getNormalizedConfig(config) {
|
|
|
22292
22352
|
case "ernie4_5":
|
|
22293
22353
|
case "hunyuan_v1_dense":
|
|
22294
22354
|
case "falcon_h1":
|
|
22355
|
+
case "nemotron_h":
|
|
22295
22356
|
case "ministral":
|
|
22296
22357
|
case "ministral3":
|
|
22297
22358
|
mapping["num_heads"] = "num_key_value_heads";
|
|
@@ -22326,6 +22387,9 @@ function getNormalizedConfig(config) {
|
|
|
22326
22387
|
mapping["num_attention_heads"] = "num_attention_heads";
|
|
22327
22388
|
break;
|
|
22328
22389
|
case "youtu":
|
|
22390
|
+
case "deepseek_v3":
|
|
22391
|
+
case "glm_moe_dsa":
|
|
22392
|
+
case "mistral4":
|
|
22329
22393
|
mapping["num_heads"] = "num_key_value_heads";
|
|
22330
22394
|
mapping["num_layers"] = "num_hidden_layers";
|
|
22331
22395
|
mapping["dim_kv"] = "qk_head_dim";
|
|
@@ -22414,6 +22478,7 @@ function getCacheShapes(config, options) {
|
|
|
22414
22478
|
if (!(config instanceof PretrainedConfig)) {
|
|
22415
22479
|
config = new PretrainedConfig(config);
|
|
22416
22480
|
}
|
|
22481
|
+
const batch_size = options?.batch_size ?? 1;
|
|
22417
22482
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
22418
22483
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
22419
22484
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -22423,7 +22488,6 @@ function getCacheShapes(config, options) {
|
|
|
22423
22488
|
config
|
|
22424
22489
|
);
|
|
22425
22490
|
const head_dim = hidden_size / num_attention_heads;
|
|
22426
|
-
const batch_size = options?.batch_size ?? 1;
|
|
22427
22491
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
22428
22492
|
if (layer_types[i] === "full_attention") {
|
|
22429
22493
|
for (const kv of ["key", "value"]) {
|
|
@@ -22436,31 +22500,26 @@ function getCacheShapes(config, options) {
|
|
|
22436
22500
|
}
|
|
22437
22501
|
}
|
|
22438
22502
|
return cache_values;
|
|
22439
|
-
} else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
|
|
22503
|
+
} else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
|
|
22440
22504
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
22441
22505
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
22442
|
-
const
|
|
22443
|
-
const {
|
|
22444
|
-
layer_types,
|
|
22445
|
-
num_hidden_layers,
|
|
22446
|
-
num_attention_heads,
|
|
22447
|
-
num_key_value_heads,
|
|
22448
|
-
hidden_size,
|
|
22449
|
-
mamba_d_conv,
|
|
22450
|
-
mamba_n_heads,
|
|
22451
|
-
mamba_d_head,
|
|
22452
|
-
mamba_d_state,
|
|
22453
|
-
mamba_n_groups,
|
|
22454
|
-
mamba_expand,
|
|
22455
|
-
mamba_d_ssm
|
|
22456
|
-
} = (
|
|
22506
|
+
const c = (
|
|
22457
22507
|
/** @type {any} */
|
|
22458
22508
|
config
|
|
22459
22509
|
);
|
|
22460
|
-
const
|
|
22461
|
-
const
|
|
22462
|
-
const
|
|
22463
|
-
|
|
22510
|
+
const layer_types = c.layer_types ?? c.layers_block_type;
|
|
22511
|
+
const num_layers = c.num_hidden_layers ?? layer_types?.length;
|
|
22512
|
+
const num_key_value_heads = c.num_key_value_heads;
|
|
22513
|
+
const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
|
|
22514
|
+
const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
|
|
22515
|
+
const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
|
|
22516
|
+
const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
|
|
22517
|
+
const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
|
|
22518
|
+
const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
|
|
22519
|
+
const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
|
|
22520
|
+
const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
|
|
22521
|
+
const cache_values = {};
|
|
22522
|
+
for (let i = 0; i < num_layers; ++i) {
|
|
22464
22523
|
if (!layer_types || layer_types[i] === "mamba") {
|
|
22465
22524
|
cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
|
|
22466
22525
|
cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
|
|
@@ -22494,7 +22553,6 @@ function getCacheShapes(config, options) {
|
|
|
22494
22553
|
const key_dim = linear_key_head_dim * linear_num_key_heads;
|
|
22495
22554
|
const value_dim = linear_value_head_dim * linear_num_value_heads;
|
|
22496
22555
|
const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
|
|
22497
|
-
const batch_size = options?.batch_size ?? 1;
|
|
22498
22556
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
22499
22557
|
if (layer_types[i] === "full_attention") {
|
|
22500
22558
|
for (const kv of ["key", "value"]) {
|
|
@@ -25122,7 +25180,9 @@ async function generic_text_to_text_forward(self2, {
|
|
|
25122
25180
|
"qwen3_5",
|
|
25123
25181
|
"qwen3_5_text",
|
|
25124
25182
|
"qwen3_5_moe",
|
|
25125
|
-
"qwen3_5_moe_text"
|
|
25183
|
+
"qwen3_5_moe_text",
|
|
25184
|
+
"glm_ocr",
|
|
25185
|
+
"glm_ocr_text"
|
|
25126
25186
|
].includes(self2.config.model_type)
|
|
25127
25187
|
) {
|
|
25128
25188
|
const { image_grid_thw, video_grid_thw } = kwargs;
|
|
@@ -25346,6 +25406,8 @@ __export(models_exports, {
|
|
|
25346
25406
|
BloomForCausalLM: () => BloomForCausalLM,
|
|
25347
25407
|
BloomModel: () => BloomModel,
|
|
25348
25408
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
25409
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
25410
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
25349
25411
|
CLIPModel: () => CLIPModel,
|
|
25350
25412
|
CLIPPreTrainedModel: () => CLIPPreTrainedModel,
|
|
25351
25413
|
CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
|
|
@@ -25420,6 +25482,9 @@ __export(models_exports, {
|
|
|
25420
25482
|
DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
|
|
25421
25483
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
25422
25484
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
25485
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
25486
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
25487
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
25423
25488
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
25424
25489
|
DeiTModel: () => DeiTModel,
|
|
25425
25490
|
DeiTPreTrainedModel: () => DeiTPreTrainedModel,
|
|
@@ -25465,6 +25530,11 @@ __export(models_exports, {
|
|
|
25465
25530
|
EsmForTokenClassification: () => EsmForTokenClassification,
|
|
25466
25531
|
EsmModel: () => EsmModel,
|
|
25467
25532
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
25533
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
25534
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
25535
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
25536
|
+
EuroBertModel: () => EuroBertModel,
|
|
25537
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
25468
25538
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
25469
25539
|
ExaoneModel: () => ExaoneModel,
|
|
25470
25540
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -25511,6 +25581,10 @@ __export(models_exports, {
|
|
|
25511
25581
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
25512
25582
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
25513
25583
|
GlmModel: () => GlmModel,
|
|
25584
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
25585
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
25586
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
25587
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
25514
25588
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
25515
25589
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
25516
25590
|
GptOssModel: () => GptOssModel,
|
|
@@ -25557,6 +25631,7 @@ __export(models_exports, {
|
|
|
25557
25631
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
25558
25632
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
25559
25633
|
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
25634
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
25560
25635
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
25561
25636
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
25562
25637
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -25606,6 +25681,9 @@ __export(models_exports, {
|
|
|
25606
25681
|
MimiEncoderOutput: () => MimiEncoderOutput,
|
|
25607
25682
|
MimiModel: () => MimiModel,
|
|
25608
25683
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
25684
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
25685
|
+
Mistral4Model: () => Mistral4Model,
|
|
25686
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
25609
25687
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
25610
25688
|
MistralModel: () => MistralModel,
|
|
25611
25689
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -25663,6 +25741,9 @@ __export(models_exports, {
|
|
|
25663
25741
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
25664
25742
|
NanoChatModel: () => NanoChatModel,
|
|
25665
25743
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
25744
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
25745
|
+
NemotronHModel: () => NemotronHModel,
|
|
25746
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
25666
25747
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
25667
25748
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
25668
25749
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -25800,6 +25881,9 @@ __export(models_exports, {
|
|
|
25800
25881
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
25801
25882
|
SnacModel: () => SnacModel,
|
|
25802
25883
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
25884
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
25885
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
25886
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
25803
25887
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
25804
25888
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
25805
25889
|
SpeechT5HifiGan: () => SpeechT5HifiGan,
|
|
@@ -25974,7 +26058,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
|
|
|
25974
26058
|
var ArceeForCausalLM = class extends ArceePreTrainedModel {
|
|
25975
26059
|
};
|
|
25976
26060
|
|
|
25977
|
-
// src/models/
|
|
26061
|
+
// src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
|
|
25978
26062
|
var ASTPreTrainedModel = class extends PreTrainedModel {
|
|
25979
26063
|
};
|
|
25980
26064
|
var ASTModel = class extends ASTPreTrainedModel {
|
|
@@ -26309,6 +26393,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
|
|
|
26309
26393
|
var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
|
|
26310
26394
|
};
|
|
26311
26395
|
|
|
26396
|
+
// src/models/chmv2/modeling_chmv2.js
|
|
26397
|
+
/**
 * Base class for the CHMv2 model classes.
 * Declares no members of its own; everything is inherited from `PreTrainedModel`.
 */
var CHMv2PreTrainedModel = class extends PreTrainedModel {
|
|
26398
|
+
};
|
|
26399
|
+
/**
 * CHMv2 variant exported as `CHMv2ForDepthEstimation`.
 * Empty subclass of `CHMv2PreTrainedModel`: it adds no fields or methods, so all
 * behavior comes from the base classes.
 * NOTE(review): the name suggests a depth-estimation head; the task wiring is not visible here - confirm in the registry.
 */
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
|
|
26400
|
+
};
|
|
26401
|
+
|
|
26312
26402
|
// src/models/clap/modeling_clap.js
|
|
26313
26403
|
var ClapPreTrainedModel = class extends PreTrainedModel {
|
|
26314
26404
|
};
|
|
@@ -26647,6 +26737,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
|
|
|
26647
26737
|
}
|
|
26648
26738
|
};
|
|
26649
26739
|
|
|
26740
|
+
// src/models/deepseek_v3/modeling_deepseek_v3.js
|
|
26741
|
+
/**
 * Base class for the DeepseekV3 model classes.
 * Declares no members of its own; everything is inherited from `PreTrainedModel`.
 */
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
|
|
26742
|
+
};
|
|
26743
|
+
/**
 * Empty subclass of `DeepseekV3PreTrainedModel`; adds no fields or methods.
 */
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
|
|
26744
|
+
};
|
|
26745
|
+
/**
 * Empty subclass of `DeepseekV3PreTrainedModel`; adds no fields or methods.
 * NOTE(review): presumably the causal-LM entry point (by name) - task wiring lives elsewhere; confirm.
 */
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
|
|
26746
|
+
};
|
|
26747
|
+
|
|
26650
26748
|
// src/models/deberta_v2/modeling_deberta_v2.js
|
|
26651
26749
|
var DebertaV2PreTrainedModel = class extends PreTrainedModel {
|
|
26652
26750
|
};
|
|
@@ -26995,6 +27093,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
|
|
|
26995
27093
|
}
|
|
26996
27094
|
};
|
|
26997
27095
|
|
|
27096
|
+
// src/models/eurobert/modeling_eurobert.js
|
|
27097
|
+
/**
 * Base class for the EuroBert model classes.
 * Declares no members of its own; everything is inherited from `PreTrainedModel`.
 */
var EuroBertPreTrainedModel = class extends PreTrainedModel {
|
|
27098
|
+
};
|
|
27099
|
+
/**
 * Empty subclass of `EuroBertPreTrainedModel`; adds no fields or methods,
 * so calls are handled entirely by the inherited implementation.
 */
var EuroBertModel = class extends EuroBertPreTrainedModel {
|
|
27100
|
+
};
|
|
27101
|
+
/**
 * EuroBert class whose call result is wrapped in a `MaskedLMOutput`.
 */
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps its result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<MaskedLMOutput>} The inherited call's result wrapped for masked language modeling.
   */
  async _call(model_inputs) {
    const raw_outputs = await super._call(model_inputs);
    return new MaskedLMOutput(raw_outputs);
  }
};
|
|
27112
|
+
/**
 * EuroBert class whose call result is wrapped in a `SequenceClassifierOutput`.
 */
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps its result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<SequenceClassifierOutput>} The inherited call's result wrapped for sequence classification.
   */
  async _call(model_inputs) {
    const raw_outputs = await super._call(model_inputs);
    return new SequenceClassifierOutput(raw_outputs);
  }
};
|
|
27123
|
+
/**
 * EuroBert class whose call result is wrapped in a `TokenClassifierOutput`.
 */
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
  /**
   * Runs the inherited `_call` and wraps its result.
   *
   * @param {Object} model_inputs The inputs to the model.
   * @returns {Promise<TokenClassifierOutput>} The inherited call's result wrapped for token classification.
   */
  async _call(model_inputs) {
    const raw_outputs = await super._call(model_inputs);
    return new TokenClassifierOutput(raw_outputs);
  }
};
|
|
27134
|
+
|
|
26998
27135
|
// src/models/exaone/modeling_exaone.js
|
|
26999
27136
|
var ExaonePreTrainedModel = class extends PreTrainedModel {
|
|
27000
27137
|
};
|
|
@@ -27270,6 +27407,377 @@ var GlmModel = class extends GlmPreTrainedModel {
|
|
|
27270
27407
|
var GlmForCausalLM = class extends GlmPreTrainedModel {
|
|
27271
27408
|
};
|
|
27272
27409
|
|
|
27410
|
+
// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
|
|
27411
|
+
/**
 * Base class for the GlmMoeDsa model classes.
 * Declares no members of its own; everything is inherited from `PreTrainedModel`.
 */
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
|
|
27412
|
+
};
|
|
27413
|
+
/**
 * Empty subclass of `GlmMoeDsaPreTrainedModel`; adds no fields or methods.
 */
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
|
|
27414
|
+
};
|
|
27415
|
+
/**
 * Empty subclass of `GlmMoeDsaPreTrainedModel`; adds no fields or methods.
 * NOTE(review): presumably the causal-LM entry point (by name) - task wiring lives elsewhere; confirm.
 */
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
|
|
27416
|
+
};
|
|
27417
|
+
|
|
27418
|
+
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
27419
|
+
/**
 * Base class for the Qwen2-VL model classes.
 * Its only own member is `forward_params`, the ordered list of input names.
 */
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
  forward_params = [
    // Text inputs
    ...["input_ids", "attention_mask", "position_ids", "past_key_values"],
    // Vision inputs
    ...["pixel_values", "image_grid_thw"],
  ];
};
|
|
27431
|
+
/**
 * Qwen2-VL model that computes 3D (temporal, height, width) rope position IDs
 * for mixed text/vision sequences and prepares inputs for generation.
 */
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
  // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
  // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
  // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.

  /** Name of the vision-encoder session input that receives `image_grid_thw` (see `encode_image`). */
  image_grid_thw_name = "grid_thw";

  /**
   * Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
   *
   * The returned deltas are `max_position + 1 - sequence_length`, the same
   * convention `get_rope_index` uses for multimodal sequences, so an unpadded
   * text-only row yields a delta of 0.
   *
   * @param {Tensor} input_ids Tensor of shape `(batch_size, sequence_length)`; only `dims` is read.
   * @param {Tensor} attention_mask (Optional) Mask of shape `(batch_size, sequence_length)`.
   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
   */
  _get_text_only_rope_index(input_ids, attention_mask) {
    if (attention_mask) {
      const { data, dims } = cumsum_masked_fill(attention_mask);
      // Repeat the per-token positions once for each of the three rope dimensions.
      const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
      const row_length = BigInt(dims[1]);
      const mrope_position_deltas = Array.from(
        { length: dims[0] },
        // Subtract (not add) the sequence length, matching the multimodal branch.
        (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n - row_length
      );
      return [
        new Tensor2("int64", position_ids, [3, ...dims]),
        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
      ];
    }

    const [batch_size, seq_length] = input_ids.dims;
    // Row-major layout [3, batch, seq]: the position of flat index i is its offset within the row.
    const position_ids = BigInt64Array.from(
      { length: 3 * batch_size * seq_length },
      (_, i) => BigInt(i % seq_length)
    );
    return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
  }

  /**
   * Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
   * global [all_t, all_h, all_w] order, then write back into the position_ids array
   * respecting attention mask.
   * @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
   * @param {number[]} attn_mask Attention mask for this batch element
   * @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
   * @param {number} batch_idx Current batch index
   * @returns {number[]} Flat reordered positions of length total_len
   */
  _reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
    const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
    const llm_positions = new Array(total_len);
    let index = 0;
    for (let x = 0; x < 3; ++x) {
      for (const val of llm_pos_ids_list) {
        const seg_len = val.length / 3;
        for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
          llm_positions[index++] = val[z];
        }
      }
    }

    // Only attended slots receive a position; masked slots keep their existing value.
    let count = 0;
    for (let y = 0; y < attn_mask.length; ++y) {
      // Loose equality on purpose: mask entries may be BigInt when they come from an int64 tensor.
      if (attn_mask[y] == 1) {
        for (let x = 0; x < 3; ++x) {
          position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count];
        }
        ++count;
      }
    }
    return llm_positions;
  }

  /**
   * Build per-batch position ID segments for multimodal rope.
   * Override this in subclasses to change how vision/text segments are identified and positioned.
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - all video grid dimensions
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id, video_token_id, vision_start_token_id } = this.config;
    const ids = filtered_ids;

    // Token comparisons use loose equality on purpose: ids may be BigInt while config ids are numbers.
    const vision_start_indices = ids.reduce((acc, x, idx) => {
      if (x == vision_start_token_id) acc.push(idx);
      return acc;
    }, []);
    // The token right after each vision-start marker tells whether the segment is an image or a video.
    const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
    const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
    const video_nums = vision_tokens.filter((x) => x == video_token_id).length;

    const llm_pos_ids_list = [];
    let st = 0;
    let remain_images = image_nums;
    let remain_videos = video_nums;
    for (let j = 0; j < vision_tokens.length; ++j) {
      const next_image_token = ids.findIndex((x, i) => i > st && x == image_token_id);
      const next_video_token = ids.findIndex((x, i) => i > st && x == video_token_id);
      // `ids.length + 1` acts as a "not present" sentinel that always loses the comparison below.
      const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
      const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;

      let ed;
      let t, h, w;
      if (ed_image < ed_video) {
        [t, h, w] = image_grid_thw_list[state.image_index];
        ++state.image_index;
        --remain_images;
        ed = ed_image;
      } else {
        [t, h, w] = video_grid_thw_list[state.video_index];
        ++state.video_index;
        --remain_videos;
        ed = ed_video;
      }

      const [llm_grid_t, llm_grid_h, llm_grid_w] = [
        Number(t),
        Math.floor(Number(h) / spatial_merge_size),
        Math.floor(Number(w) / spatial_merge_size)
      ];

      // Text run preceding this vision segment: identical 1D positions in all three dims.
      const text_len = ed - st;
      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));

      // Vision segment: per-dimension grid indices, offset past the preceding text.
      const offset = text_len + st_idx;
      const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
      const t_index = Array.from(
        { length: grid_size },
        (_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
      );
      const h_index = Array.from(
        { length: grid_size },
        (_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
      );
      const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
      llm_pos_ids_list.push([t_index, h_index, w_index].flat());

      st = ed + grid_size;
    }

    // Trailing text after the last vision segment.
    if (st < ids.length) {
      const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
      const text_len = ids.length - st;
      llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
    }
    return llm_pos_ids_list;
  }

  /**
   * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
   *
   * Explanation:
   * Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
   *
   * For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
   * Examples:
   * input_ids: [T T T T T], here T is for text.
   * temporal position_ids: [0, 1, 2, 3, 4]
   * height position_ids: [0, 1, 2, 3, 4]
   * width position_ids: [0, 1, 2, 3, 4]
   *
   * For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
   * and 1D rotary position embedding for text part.
   * Examples:
   * Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
   * input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
   * vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
   * vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
   * vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
   * text temporal position_ids: [3, 4, 5, 6, 7]
   * text height position_ids: [3, 4, 5, 6, 7]
   * text width position_ids: [3, 4, 5, 6, 7]
   * Here we calculate the text start position_ids as the max vision position_ids plus 1.
   *
   * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
   * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
   * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
   * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
   */
  get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
    if (!image_grid_thw && !video_grid_thw) {
      // Text-only input needs no vision configuration at all.
      return this._get_text_only_rope_index(input_ids, attention_mask);
    }

    // Read lazily and defensively so a config without `vision_config` falls back to the default of 2.
    const spatial_merge_size = this.config.vision_config?.spatial_merge_size ?? 2;

    const total_input_ids = input_ids.tolist();
    if (!attention_mask) {
      attention_mask = ones_like(input_ids);
    }
    const attention_mask_list = attention_mask.tolist();

    // [3][batch][seq] grid of zeros; masked slots keep 0.
    const position_ids_list = Array.from(
      { length: 3 },
      () => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
    );
    const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
    const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];

    // Image/video counters run across the whole batch because the grid lists are not split per row.
    const state = { image_index: 0, video_index: 0 };
    const mrope_position_deltas = [];
    for (let i = 0; i < total_input_ids.length; ++i) {
      const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
      const llm_pos_ids_list = this._get_multimodal_rope_positions({
        filtered_ids,
        image_grid_thw_list,
        video_grid_thw_list,
        spatial_merge_size,
        state
      });
      const llm_positions = this._reorder_and_write_positions(
        llm_pos_ids_list,
        attention_mask_list[i],
        position_ids_list,
        i
      );
      mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
    }
    return [
      new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
      new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
    ];
  }

  /**
   * Run the `vision_encoder` session and return its `image_features` output.
   * The grid tensor is passed under the input name stored in `image_grid_thw_name`.
   * @param {object} params
   * @param {Tensor} params.pixel_values
   * @param {Tensor} params.image_grid_thw
   * @returns {Promise<Tensor>} The `image_features` output of the session.
   */
  async encode_image({ pixel_values, image_grid_thw }) {
    const features = (await sessionRun(this.sessions["vision_encoder"], {
      pixel_values,
      [this.image_grid_thw_name]: image_grid_thw
    })).image_features;
    return features;
  }

  /**
   * Delegate to `default_merge_input_ids_with_image_features`, supplying this
   * model's `image_token_id` (caller-provided `kwargs` take precedence).
   */
  _merge_input_ids_with_image_features(kwargs) {
    return default_merge_input_ids_with_image_features({
      // @ts-ignore
      image_token_id: this.config.image_token_id,
      ...kwargs
    });
  }

  /**
   * Fill in `position_ids` (and `rope_deltas`) on `model_inputs` when an
   * attention mask is present and no position IDs were supplied.
   * Mutates and returns `model_inputs`.
   * @param {Tensor} input_ids Unused here; kept for interface compatibility.
   * @param {object} model_inputs
   * @param {object} generation_config Unused here; kept for interface compatibility.
   * @returns {object} The same `model_inputs` object.
   */
  prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
    if (!model_inputs.attention_mask || model_inputs.position_ids) {
      return model_inputs;
    }

    if (!model_inputs.past_key_values) {
      // First pass: compute positions for the full prompt.
      [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
        model_inputs.input_ids,
        model_inputs.image_grid_thw,
        model_inputs.video_grid_thw,
        model_inputs.attention_mask
      );
      return model_inputs;
    }

    // A cache exists: pixel values are dropped for this step.
    model_inputs.pixel_values = null;
    const past_length = model_inputs.past_key_values.get_seq_length();
    if (past_length < model_inputs.input_ids.dims[1]) {
      // Part of the prompt is already cached: compute full positions, then keep only the uncached tail.
      const [full_position_ids, rope_deltas] = this.get_rope_index(
        model_inputs.input_ids,
        model_inputs.image_grid_thw,
        model_inputs.video_grid_thw,
        model_inputs.attention_mask
      );
      model_inputs.rope_deltas = rope_deltas;
      model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
      model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
    } else {
      if (!model_inputs.rope_deltas) {
        [, model_inputs.rope_deltas] = this.get_rope_index(
          model_inputs.input_ids,
          model_inputs.image_grid_thw,
          model_inputs.video_grid_thw,
          model_inputs.attention_mask
        );
      }
      // Next position = cached length + per-row rope delta, shared by all three rope dimensions.
      const delta = BigInt(past_length);
      const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
      model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
    }
    return model_inputs;
  }
};
|
|
27702
|
+
/**
 * Empty subclass of `Qwen2VLForConditionalGeneration`; adds no fields or methods,
 * so it inherits the rope-index and input-preparation logic unchanged.
 */
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
27703
|
+
};
|
|
27704
|
+
|
|
27705
|
+
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
27706
|
+
/**
 * Subclass of `Qwen2VLForConditionalGeneration` that only overrides
 * `image_grid_thw_name` (the parent's value is "grid_thw").
 */
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
27707
|
+
// Key under which the parent's `encode_image` passes `image_grid_thw` to the vision-encoder session.
image_grid_thw_name = "image_grid_thw";
|
|
27708
|
+
};
|
|
27709
|
+
/**
 * Subclass of `Qwen2VLForCausalLM` that only overrides `image_grid_thw_name`,
 * using the same value as `Qwen2_5_VLForConditionalGeneration`.
 */
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
27710
|
+
// Same session-input name override as in Qwen2_5_VLForConditionalGeneration.
image_grid_thw_name = "image_grid_thw";
|
|
27711
|
+
};
|
|
27712
|
+
|
|
27713
|
+
// src/models/glm_ocr/modeling_glm_ocr.js
|
|
27714
|
+
/**
 * GLM-OCR model. Overrides how multimodal rope position segments are built:
 * vision segments are found by runs of `image_token_id` and positioned with
 * `get_vision_position_ids`.
 */
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
  /**
   * Compute 3D positional indices for vision tokens.
   * Temporal is constant, height is repeat-interleaved, width tiles.
   * @param {number} start_position
   * @param {number[]} grid_thw [T, H, W]
   * @param {number} temp_merge_size
   * @param {number} spatial_merge_size
   * @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
   */
  get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
    const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
    const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
    const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
    const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;

    const t_pos = Array.from({ length: seq_len }, () => start_position);
    // Each height value is repeated for a full (width * temporal) run before advancing.
    const h_pos = Array.from(
      { length: seq_len },
      (_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
    );
    const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
    return [...t_pos, ...h_pos, ...w_pos];
  }

  /**
   * GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
   * instead of vision_start_token_id scanning used by Qwen2VL.
   * After a vision segment, position advances by floor(max(h, w) / spatial_merge_size),
   * the same integer division `get_vision_position_ids` applies to the grid.
   *
   * @param {object} params
   * @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
   * @param {any[][]} params.image_grid_thw_list - all image grid dimensions
   * @param {any[][]} params.video_grid_thw_list - unused by this override
   * @param {number} params.spatial_merge_size
   * @param {{image_index: number, video_index: number}} params.state - mutable counters; only `image_index` is advanced
   * @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
   * @throws {Error} If there are more image-token runs than entries in `image_grid_thw_list`.
   */
  _get_multimodal_rope_positions({
    filtered_ids,
    image_grid_thw_list,
    video_grid_thw_list,
    spatial_merge_size,
    state
  }) {
    const { image_token_id } = this.config;

    // Split the sequence into maximal runs of equal modality: 1 = image tokens, 0 = everything else.
    // Loose equality on purpose: token ids may be BigInt while the config id is a number.
    const groups = [];
    let group_start = 0;
    let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
    for (let j = 1; j <= filtered_ids.length; ++j) {
      // -1 is an end-of-sequence sentinel that forces the final run to be flushed.
      const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
      if (t !== current_type) {
        groups.push([current_type, group_start, j]);
        group_start = j;
        current_type = t;
      }
    }

    let current_pos = 0;
    const llm_pos_ids_list = [];
    for (const [modality_type, start_idx, end_idx] of groups) {
      if (modality_type === 0) {
        const text_len = end_idx - start_idx;
        llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
        current_pos += text_len;
      } else {
        const raw_grid = image_grid_thw_list[state.image_index];
        if (raw_grid === undefined) {
          throw new Error(
            `Found more image token groups than image grids (${image_grid_thw_list.length} provided).`
          );
        }
        ++state.image_index;
        const grid_thw = raw_grid.map(Number);
        // The temporal merge size equals T, so the merged temporal grid is always 1.
        const temp_merge_size = grid_thw[0];
        llm_pos_ids_list.push(
          this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
        );
        // Integer division keeps later positions whole numbers even when the grid is not
        // an exact multiple of the merge size.
        current_pos += Math.floor(Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size);
      }
    }
    return llm_pos_ids_list;
  }
};
|
|
27780
|
+
|
|
27273
27781
|
// src/models/glpn/modeling_glpn.js
|
|
27274
27782
|
var GLPNPreTrainedModel = class extends PreTrainedModel {
|
|
27275
27783
|
};
|
|
@@ -27582,6 +28090,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
|
|
|
27582
28090
|
var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
|
|
27583
28091
|
};
|
|
27584
28092
|
|
|
28093
|
+
// src/models/lighton_ocr/modeling_lighton_ocr.js
|
|
28094
|
+
var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
28095
|
+
};
|
|
28096
|
+
|
|
27585
28097
|
// src/models/lfm2_moe/modeling_lfm2_moe.js
|
|
27586
28098
|
var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
|
|
27587
28099
|
};
|
|
@@ -27778,6 +28290,14 @@ var MistralModel = class extends MistralPreTrainedModel {
|
|
|
27778
28290
|
var MistralForCausalLM = class extends MistralPreTrainedModel {
|
|
27779
28291
|
};
|
|
27780
28292
|
|
|
28293
|
+
// src/models/mistral4/modeling_mistral4.js
|
|
28294
|
+
var Mistral4PreTrainedModel = class extends PreTrainedModel {
|
|
28295
|
+
};
|
|
28296
|
+
var Mistral4Model = class extends Mistral4PreTrainedModel {
|
|
28297
|
+
};
|
|
28298
|
+
var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
|
|
28299
|
+
};
|
|
28300
|
+
|
|
27781
28301
|
// src/models/mobilebert/modeling_mobilebert.js
|
|
27782
28302
|
var MobileBertPreTrainedModel = class extends PreTrainedModel {
|
|
27783
28303
|
};
|
|
@@ -28246,6 +28766,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
|
|
|
28246
28766
|
var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
|
|
28247
28767
|
};
|
|
28248
28768
|
|
|
28769
|
+
// src/models/nemotron_h/modeling_nemotron_h.js
|
|
28770
|
+
var NemotronHPreTrainedModel = class extends PreTrainedModel {
|
|
28771
|
+
};
|
|
28772
|
+
var NemotronHModel = class extends NemotronHPreTrainedModel {
|
|
28773
|
+
};
|
|
28774
|
+
var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
|
|
28775
|
+
};
|
|
28776
|
+
|
|
28249
28777
|
// src/models/neobert/modeling_neobert.js
|
|
28250
28778
|
var NeoBertPreTrainedModel = class extends PreTrainedModel {
|
|
28251
28779
|
};
|
|
@@ -28526,252 +29054,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
|
|
|
28526
29054
|
var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
|
|
28527
29055
|
};
|
|
28528
29056
|
|
|
28529
|
-
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
28530
|
-
/**
 * Common base for the Qwen2-VL model classes.
 * Declares the input names listed in `forward_params`: four text inputs followed by two vision inputs.
 */
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
  forward_params = [
    // Text inputs
    "input_ids", "attention_mask", "position_ids", "past_key_values",
    // Vision inputs
    "pixel_values", "image_grid_thw"
  ];
};
|
|
28542
|
-
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
  // NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
  // CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
  // embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.

  // Name under which `image_grid_thw` is fed to the vision encoder session (see `encode_image`).
  image_grid_thw_name = "grid_thw";

  /**
   * Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
   *
   * Explanation:
   *     Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
   *
   *     For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs.
   *     Examples:
   *         input_ids: [T T T T T], here T is for text.
   *         temporal position_ids: [0, 1, 2, 3, 4]
   *         height position_ids: [0, 1, 2, 3, 4]
   *         width position_ids: [0, 1, 2, 3, 4]
   *
   *     For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
   *     and 1D rotary position embedding for text part.
   *     Examples:
   *         Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
   *         input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
   *         vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
   *         vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
   *         vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
   *         text temporal position_ids: [3, 4, 5, 6, 7]
   *         text height position_ids: [3, 4, 5, 6, 7]
   *         text width position_ids: [3, 4, 5, 6, 7]
   *         Here we calculate the text start position_ids as the max vision position_ids plus 1.
   *
   * @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
   * @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
   * @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
   * @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
   * - 1 for tokens that are **not masked**,
   * - 0 for tokens that are **masked**.
   * @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
   * - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
   * - mrope_position_deltas: Tensor of shape `(batch_size, 1)`.
   */
  get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
    const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
    const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
    const mrope_position_deltas = [];

    if (image_grid_thw || video_grid_thw) {
      // ---- Multimodal input: text runs get 1D positions, vision runs get 3D (t, h, w) positions ----
      const total_input_ids = input_ids.tolist();
      if (!attention_mask) {
        attention_mask = ones_like(input_ids);
      }
      const attention_mask_list = attention_mask.tolist();

      // [3][batch][seq] nested array, pre-filled with 1 (the value masked-out slots keep).
      const position_ids_list = Array.from(
        { length: 3 },
        (_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
      );
      const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
      const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];

      // Grid cursors run across the whole batch, not per sequence.
      let image_index = 0;
      let video_index = 0;

      for (let i = 0; i < total_input_ids.length; ++i) {
        // Only attended tokens take part in position assignment.
        // `==` is used throughout so BigInt token ids compare equal to Number config ids.
        const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
        const vision_start_indices = ids.reduce((acc, x, idx) => {
          if (x == vision_start_token_id) acc.push(idx);
          return acc;
        }, []);
        // The token right after each vision-start marker tells image vs. video.
        const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
        const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
        const video_nums = vision_tokens.filter((x) => x == video_token_id).length;

        const llm_pos_ids_list = [];
        let st2 = 0;
        let remain_images = image_nums;
        let remain_videos = video_nums;

        for (let j = 0; j < vision_tokens.length; ++j) {
          const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
          const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
          // `ids.length + 1` acts as "not available" so the other modality wins the comparison.
          const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
          const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;

          let ed;
          let t, h, w;
          if (ed_image < ed_video) {
            [t, h, w] = image_grid_thw_list[image_index];
            ++image_index;
            --remain_images;
            ed = ed_image;
          } else {
            [t, h, w] = video_grid_thw_list[video_index];
            ++video_index;
            --remain_videos;
            ed = ed_video;
          }

          const [llm_grid_t, llm_grid_h, llm_grid_w] = [
            Number(t),
            Math.floor(Number(h) / spatial_merge_size),
            Math.floor(Number(w) / spatial_merge_size)
          ];

          // Text run preceding this vision block.
          const text_len = ed - st2;
          const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
          llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));

          // Vision block: three planes (t, h, w) of `grid_size` entries each, flattened back to back.
          const offset = text_len + st_idx;
          const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
          const t_index = Array.from(
            { length: grid_size },
            (_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
          );
          const h_index = Array.from(
            { length: grid_size },
            (_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
          );
          const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
          llm_pos_ids_list.push([t_index, h_index, w_index].flat());

          st2 = ed + grid_size;
        }

        // Trailing text after the last vision block.
        if (st2 < ids.length) {
          const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
          const text_len = ids.length - st2;
          llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
        }

        // Re-pack the per-segment [t | h | w] chunks into one flat [all t | all h | all w] array.
        const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
        const llm_positions = new Array(num_items);
        let index = 0;
        for (let x = 0; x < 3; ++x) {
          for (let y = 0; y < llm_pos_ids_list.length; ++y) {
            const val = llm_pos_ids_list[y];
            const text_len = val.length / 3;
            for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
              llm_positions[index++] = val[z];
            }
          }
        }

        // Scatter the positions back into the attended slots of the padded sequence.
        let count2 = 0;
        const attn_mask = attention_mask_list[i];
        for (let y = 0; y < attn_mask.length; ++y) {
          if (attn_mask[y] == 1) {
            for (let x = 0; x < 3; ++x) {
              position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
            }
            ++count2;
          }
        }

        const max_llm_positions = max(llm_positions)[0];
        mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
      }

      return [
        new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
        new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
      ];
    }

    // ---- Text-only input ----
    if (attention_mask) {
      const { data, dims } = cumsum_masked_fill(attention_mask);
      // The same [batch, seq] positions are replicated for each of the three rope axes.
      const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
      // delta = max_position + 1 - sequence_length, the same formula the multimodal branch uses
      // (previously the sequence length was added instead of subtracted).
      const mrope_position_deltas2 = Array.from(
        { length: dims[0] },
        (_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n - BigInt(dims[1])
      );
      return [
        new Tensor2("int64", position_ids, [3, ...dims]),
        new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
      ];
    }

    const [batch_size, seq_length] = input_ids.dims;
    // Layout is [3, batch, seq], so the position of flat element `i` is simply `i % seq_length`
    // (the previous formula divided by batch_size and was only correct for batch_size === 1).
    const position_ids = BigInt64Array.from(
      { length: 3 * batch_size * seq_length },
      (_, i) => BigInt(i % seq_length)
    );
    return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
  }

  /**
   * Run the `vision_encoder` session and return its `image_features` output.
   * The grid tensor is passed under the input name stored in `this.image_grid_thw_name`.
   *
   * @param {Object} inputs
   * @param {Tensor} inputs.pixel_values
   * @param {Tensor} inputs.image_grid_thw
   * @returns {Promise<Tensor>} The `image_features` output of the session.
   */
  async encode_image({ pixel_values, image_grid_thw }) {
    const features = (await sessionRun(this.sessions["vision_encoder"], {
      pixel_values,
      [this.image_grid_thw_name]: image_grid_thw
    })).image_features;
    return features;
  }

  /**
   * Delegate to `default_merge_input_ids_with_image_features`, supplying the configured
   * `image_token_id` as a default that `kwargs` may override.
   */
  _merge_input_ids_with_image_features(kwargs) {
    return default_merge_input_ids_with_image_features({
      // @ts-ignore
      image_token_id: this.config.image_token_id,
      ...kwargs
    });
  }

  /**
   * Fill in `position_ids` (and `rope_deltas`) on `model_inputs` when an attention mask
   * is present and no position ids were supplied. Mutates and returns `model_inputs`.
   */
  prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
    if (model_inputs.attention_mask && !model_inputs.position_ids) {
      if (!model_inputs.past_key_values) {
        // No cache yet: compute positions for the whole prompt.
        [model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
          model_inputs.input_ids,
          model_inputs.image_grid_thw,
          model_inputs.video_grid_thw,
          model_inputs.attention_mask
        );
      } else {
        // A cache exists, so pixel values are dropped for this step.
        model_inputs.pixel_values = null;
        const past_length = model_inputs.past_key_values.get_seq_length();
        if (past_length < model_inputs.input_ids.dims[1]) {
          // The cache covers only a prefix of the given ids: compute full positions,
          // then keep just the part that is not cached yet.
          const [full_position_ids, rope_deltas] = this.get_rope_index(
            model_inputs.input_ids,
            model_inputs.image_grid_thw,
            model_inputs.video_grid_thw,
            model_inputs.attention_mask
          );
          model_inputs.rope_deltas = rope_deltas;
          model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
          model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
        } else {
          if (!model_inputs.rope_deltas) {
            [, model_inputs.rope_deltas] = this.get_rope_index(
              model_inputs.input_ids,
              model_inputs.image_grid_thw,
              model_inputs.video_grid_thw,
              model_inputs.attention_mask
            );
          }
          // Next position = cached length + per-sequence rope delta, identical on all three axes.
          const delta = BigInt(past_length);
          const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
          model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
        }
      }
    }
    return model_inputs;
  }
};
|
|
28764
|
-
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
28765
|
-
};
|
|
28766
|
-
|
|
28767
|
-
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
28768
|
-
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
28769
|
-
image_grid_thw_name = "image_grid_thw";
|
|
28770
|
-
};
|
|
28771
|
-
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
28772
|
-
image_grid_thw_name = "image_grid_thw";
|
|
28773
|
-
};
|
|
28774
|
-
|
|
28775
29057
|
// src/models/qwen3/modeling_qwen3.js
|
|
28776
29058
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
28777
29059
|
};
|
|
@@ -29217,6 +29499,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
|
|
|
29217
29499
|
}
|
|
29218
29500
|
};
|
|
29219
29501
|
|
|
29502
|
+
// src/models/solar_open/modeling_solar_open.js
|
|
29503
|
+
var SolarOpenPreTrainedModel = class extends PreTrainedModel {
|
|
29504
|
+
};
|
|
29505
|
+
var SolarOpenModel = class extends SolarOpenPreTrainedModel {
|
|
29506
|
+
};
|
|
29507
|
+
var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
|
|
29508
|
+
};
|
|
29509
|
+
|
|
29220
29510
|
// src/models/speecht5/modeling_speecht5.js
|
|
29221
29511
|
var SpeechT5PreTrainedModel = class extends PreTrainedModel {
|
|
29222
29512
|
};
|
|
@@ -30333,6 +30623,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
|
|
|
30333
30623
|
// src/models/registry.js
|
|
30334
30624
|
var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
|
|
30335
30625
|
["bert", "BertModel"],
|
|
30626
|
+
["eurobert", "EuroBertModel"],
|
|
30336
30627
|
["neobert", "NeoBertModel"],
|
|
30337
30628
|
["modernbert", "ModernBertModel"],
|
|
30338
30629
|
["nomic_bert", "NomicBertModel"],
|
|
@@ -30464,6 +30755,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
30464
30755
|
["gemma3_text", "Gemma3Model"],
|
|
30465
30756
|
["helium", "HeliumModel"],
|
|
30466
30757
|
["glm", "GlmModel"],
|
|
30758
|
+
["glm_moe_dsa", "GlmMoeDsaModel"],
|
|
30467
30759
|
["openelm", "OpenELMModel"],
|
|
30468
30760
|
["qwen2", "Qwen2Model"],
|
|
30469
30761
|
["qwen2_moe", "Qwen2MoeModel"],
|
|
@@ -30475,12 +30767,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
30475
30767
|
["mpt", "MptModel"],
|
|
30476
30768
|
["opt", "OPTModel"],
|
|
30477
30769
|
["mistral", "MistralModel"],
|
|
30770
|
+
["mistral4", "Mistral4Model"],
|
|
30478
30771
|
["ministral", "MinistralModel"],
|
|
30479
30772
|
["ministral3", "Ministral3Model"],
|
|
30480
30773
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
30481
30774
|
["starcoder2", "Starcoder2Model"],
|
|
30775
|
+
["deepseek_v3", "DeepseekV3Model"],
|
|
30482
30776
|
["falcon", "FalconModel"],
|
|
30483
30777
|
["falcon_h1", "FalconH1Model"],
|
|
30778
|
+
["nemotron_h", "NemotronHModel"],
|
|
30779
|
+
["solar_open", "SolarOpenModel"],
|
|
30484
30780
|
["stablelm", "StableLmModel"],
|
|
30485
30781
|
["modernbert-decoder", "ModernBertDecoderModel"],
|
|
30486
30782
|
["hunyuan_v1_dense", "HunYuanDenseV1Model"],
|
|
@@ -30500,6 +30796,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30500
30796
|
]);
|
|
30501
30797
|
var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30502
30798
|
["bert", "BertForSequenceClassification"],
|
|
30799
|
+
["eurobert", "EuroBertForSequenceClassification"],
|
|
30503
30800
|
["neobert", "NeoBertForSequenceClassification"],
|
|
30504
30801
|
["modernbert", "ModernBertForSequenceClassification"],
|
|
30505
30802
|
["roformer", "RoFormerForSequenceClassification"],
|
|
@@ -30522,6 +30819,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30522
30819
|
]);
|
|
30523
30820
|
var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30524
30821
|
["bert", "BertForTokenClassification"],
|
|
30822
|
+
["eurobert", "EuroBertForTokenClassification"],
|
|
30525
30823
|
["neobert", "NeoBertForTokenClassification"],
|
|
30526
30824
|
["modernbert", "ModernBertForTokenClassification"],
|
|
30527
30825
|
["roformer", "RoFormerForTokenClassification"],
|
|
@@ -30584,6 +30882,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30584
30882
|
["gemma3", "Gemma3ForCausalLM"],
|
|
30585
30883
|
["helium", "HeliumForCausalLM"],
|
|
30586
30884
|
["glm", "GlmForCausalLM"],
|
|
30885
|
+
["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
|
|
30587
30886
|
["openelm", "OpenELMForCausalLM"],
|
|
30588
30887
|
["qwen2", "Qwen2ForCausalLM"],
|
|
30589
30888
|
["qwen2_moe", "Qwen2MoeForCausalLM"],
|
|
@@ -30603,13 +30902,17 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30603
30902
|
["opt", "OPTForCausalLM"],
|
|
30604
30903
|
["mbart", "MBartForCausalLM"],
|
|
30605
30904
|
["mistral", "MistralForCausalLM"],
|
|
30905
|
+
["mistral4", "Mistral4ForCausalLM"],
|
|
30606
30906
|
["ministral", "MinistralForCausalLM"],
|
|
30607
30907
|
["ministral3", "Ministral3ForCausalLM"],
|
|
30608
30908
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
30609
30909
|
["starcoder2", "Starcoder2ForCausalLM"],
|
|
30910
|
+
["deepseek_v3", "DeepseekV3ForCausalLM"],
|
|
30610
30911
|
["falcon", "FalconForCausalLM"],
|
|
30611
30912
|
["falcon_h1", "FalconH1ForCausalLM"],
|
|
30913
|
+
["nemotron_h", "NemotronHForCausalLM"],
|
|
30612
30914
|
["trocr", "TrOCRForCausalLM"],
|
|
30915
|
+
["solar_open", "SolarOpenForCausalLM"],
|
|
30613
30916
|
["stablelm", "StableLmForCausalLM"],
|
|
30614
30917
|
["modernbert-decoder", "ModernBertDecoderForCausalLM"],
|
|
30615
30918
|
["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
|
|
@@ -30620,6 +30923,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30620
30923
|
var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
|
|
30621
30924
|
var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30622
30925
|
["bert", "BertForMaskedLM"],
|
|
30926
|
+
["eurobert", "EuroBertForMaskedLM"],
|
|
30623
30927
|
["neobert", "NeoBertForMaskedLM"],
|
|
30624
30928
|
["modernbert", "ModernBertForMaskedLM"],
|
|
30625
30929
|
["roformer", "RoFormerForMaskedLM"],
|
|
@@ -30678,7 +30982,9 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30678
30982
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
30679
30983
|
["llava_qwen2", "LlavaQwen2ForCausalLM"],
|
|
30680
30984
|
["gemma3n", "Gemma3nForConditionalGeneration"],
|
|
30681
|
-
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
30985
|
+
["mistral3", "Mistral3ForConditionalGeneration"],
|
|
30986
|
+
["lighton_ocr", "LightOnOcrForConditionalGeneration"],
|
|
30987
|
+
["glm_ocr", "GlmOcrForConditionalGeneration"]
|
|
30682
30988
|
]);
|
|
30683
30989
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30684
30990
|
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
@@ -30783,6 +31089,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
30783
31089
|
]);
|
|
30784
31090
|
var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
|
|
30785
31091
|
var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
31092
|
+
["chmv2", "CHMv2ForDepthEstimation"],
|
|
30786
31093
|
["dpt", "DPTForDepthEstimation"],
|
|
30787
31094
|
["depth_anything", "DepthAnythingForDepthEstimation"],
|
|
30788
31095
|
["glpn", "GLPNForDepthEstimation"],
|
|
@@ -33522,6 +33829,9 @@ var ModelRegistry = class {
|
|
|
33522
33829
|
BloomModel,
|
|
33523
33830
|
BloomPreTrainedModel,
|
|
33524
33831
|
BloomTokenizer,
|
|
33832
|
+
CHMv2ForDepthEstimation,
|
|
33833
|
+
CHMv2ImageProcessor,
|
|
33834
|
+
CHMv2PreTrainedModel,
|
|
33525
33835
|
CLIPFeatureExtractor,
|
|
33526
33836
|
CLIPImageProcessor,
|
|
33527
33837
|
CLIPModel,
|
|
@@ -33617,6 +33927,9 @@ var ModelRegistry = class {
|
|
|
33617
33927
|
DebertaV2Tokenizer,
|
|
33618
33928
|
DecisionTransformerModel,
|
|
33619
33929
|
DecisionTransformerPreTrainedModel,
|
|
33930
|
+
DeepseekV3ForCausalLM,
|
|
33931
|
+
DeepseekV3Model,
|
|
33932
|
+
DeepseekV3PreTrainedModel,
|
|
33620
33933
|
DeiTFeatureExtractor,
|
|
33621
33934
|
DeiTForImageClassification,
|
|
33622
33935
|
DeiTImageProcessor,
|
|
@@ -33677,6 +33990,11 @@ var ModelRegistry = class {
|
|
|
33677
33990
|
EsmModel,
|
|
33678
33991
|
EsmPreTrainedModel,
|
|
33679
33992
|
EsmTokenizer,
|
|
33993
|
+
EuroBertForMaskedLM,
|
|
33994
|
+
EuroBertForSequenceClassification,
|
|
33995
|
+
EuroBertForTokenClassification,
|
|
33996
|
+
EuroBertModel,
|
|
33997
|
+
EuroBertPreTrainedModel,
|
|
33680
33998
|
ExaoneForCausalLM,
|
|
33681
33999
|
ExaoneModel,
|
|
33682
34000
|
ExaonePreTrainedModel,
|
|
@@ -33734,8 +34052,14 @@ var ModelRegistry = class {
|
|
|
33734
34052
|
GemmaModel,
|
|
33735
34053
|
GemmaPreTrainedModel,
|
|
33736
34054
|
GemmaTokenizer,
|
|
34055
|
+
Glm46VImageProcessor,
|
|
34056
|
+
Glm46VProcessor,
|
|
33737
34057
|
GlmForCausalLM,
|
|
33738
34058
|
GlmModel,
|
|
34059
|
+
GlmMoeDsaForCausalLM,
|
|
34060
|
+
GlmMoeDsaModel,
|
|
34061
|
+
GlmMoeDsaPreTrainedModel,
|
|
34062
|
+
GlmOcrForConditionalGeneration,
|
|
33739
34063
|
GlmPreTrainedModel,
|
|
33740
34064
|
GptOssForCausalLM,
|
|
33741
34065
|
GptOssModel,
|
|
@@ -33801,6 +34125,7 @@ var ModelRegistry = class {
|
|
|
33801
34125
|
Lfm2VlForConditionalGeneration,
|
|
33802
34126
|
Lfm2VlImageProcessor,
|
|
33803
34127
|
Lfm2VlProcessor,
|
|
34128
|
+
LightOnOcrForConditionalGeneration,
|
|
33804
34129
|
LiteWhisperForConditionalGeneration,
|
|
33805
34130
|
Llama4ForCausalLM,
|
|
33806
34131
|
Llama4PreTrainedModel,
|
|
@@ -33870,6 +34195,9 @@ var ModelRegistry = class {
|
|
|
33870
34195
|
MimiPreTrainedModel,
|
|
33871
34196
|
MinLengthLogitsProcessor,
|
|
33872
34197
|
MinNewTokensLengthLogitsProcessor,
|
|
34198
|
+
Mistral4ForCausalLM,
|
|
34199
|
+
Mistral4Model,
|
|
34200
|
+
Mistral4PreTrainedModel,
|
|
33873
34201
|
MistralForCausalLM,
|
|
33874
34202
|
MistralModel,
|
|
33875
34203
|
MistralPreTrainedModel,
|
|
@@ -33941,6 +34269,9 @@ var ModelRegistry = class {
|
|
|
33941
34269
|
NanoChatForCausalLM,
|
|
33942
34270
|
NanoChatModel,
|
|
33943
34271
|
NanoChatPreTrainedModel,
|
|
34272
|
+
NemotronHForCausalLM,
|
|
34273
|
+
NemotronHModel,
|
|
34274
|
+
NemotronHPreTrainedModel,
|
|
33944
34275
|
NeoBertForMaskedLM,
|
|
33945
34276
|
NeoBertForQuestionAnswering,
|
|
33946
34277
|
NeoBertForSequenceClassification,
|
|
@@ -34130,6 +34461,9 @@ var ModelRegistry = class {
|
|
|
34130
34461
|
SnacFeatureExtractor,
|
|
34131
34462
|
SnacModel,
|
|
34132
34463
|
SnacPreTrainedModel,
|
|
34464
|
+
SolarOpenForCausalLM,
|
|
34465
|
+
SolarOpenModel,
|
|
34466
|
+
SolarOpenPreTrainedModel,
|
|
34133
34467
|
SpeechT5FeatureExtractor,
|
|
34134
34468
|
SpeechT5ForSpeechToText,
|
|
34135
34469
|
SpeechT5ForTextToSpeech,
|