@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +16 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +2255 -931
- package/dist/transformers.min.js +19 -19
- package/dist/transformers.node.cjs +2300 -934
- package/dist/transformers.node.min.cjs +20 -20
- package/dist/transformers.node.min.mjs +20 -20
- package/dist/transformers.node.mjs +2336 -1012
- package/dist/transformers.web.js +2327 -1003
- package/dist/transformers.web.min.js +17 -17
- package/package.json +4 -4
- package/src/cache_utils.js +62 -0
- package/src/configs.js +45 -24
- package/src/env.js +8 -1
- package/src/image_processors_utils.js +27 -17
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/chmv2/image_processing_chmv2.js +3 -0
- package/src/models/chmv2/modeling_chmv2.js +4 -0
- package/src/models/deepseek_v3/modeling_deepseek_v3.js +5 -0
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/eurobert/modeling_eurobert.js +41 -0
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/glm46v/image_processing_glm46v.js +12 -0
- package/src/models/glm46v/processing_glm46v.js +5 -0
- package/src/models/glm_moe_dsa/modeling_glm_moe_dsa.js +5 -0
- package/src/models/glm_ocr/modeling_glm_ocr.js +78 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +3 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/lighton_ocr/modeling_lighton_ocr.js +3 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/mistral4/modeling_mistral4.js +5 -0
- package/src/models/modeling_utils.js +224 -308
- package/src/models/models.js +14 -1
- package/src/models/nemotron_h/modeling_nemotron_h.js +5 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +4 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +194 -143
- package/src/models/qwen2_vl/processing_qwen2_vl.js +5 -4
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +2 -1
- package/src/models/registry.js +42 -0
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/solar_open/modeling_solar_open.js +5 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/pipelines.js +1 -0
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +5 -0
- package/src/utils/hub.js +4 -1
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/get_file_metadata.js +15 -2
- package/src/utils/model_registry/get_model_files.js +52 -78
- package/src/utils/tensor.js +18 -2
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +8 -0
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +18 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/{ast/modeling_ast.d.ts → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts} +1 -1
- package/types/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.d.ts.map +1 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts +4 -0
- package/types/models/chmv2/image_processing_chmv2.d.ts.map +1 -0
- package/types/models/chmv2/modeling_chmv2.d.ts +6 -0
- package/types/models/chmv2/modeling_chmv2.d.ts.map +1 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts +8 -0
- package/types/models/deepseek_v3/modeling_deepseek_v3.d.ts.map +1 -0
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/eurobert/modeling_eurobert.d.ts +36 -0
- package/types/models/eurobert/modeling_eurobert.d.ts.map +1 -0
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/glm46v/image_processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/image_processing_glm46v.d.ts.map +1 -0
- package/types/models/glm46v/processing_glm46v.d.ts +4 -0
- package/types/models/glm46v/processing_glm46v.d.ts.map +1 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts +8 -0
- package/types/models/glm_moe_dsa/modeling_glm_moe_dsa.d.ts.map +1 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts +26 -0
- package/types/models/glm_ocr/modeling_glm_ocr.d.ts.map +1 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +3 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts +4 -0
- package/types/models/lighton_ocr/modeling_lighton_ocr.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/mistral4/modeling_mistral4.d.ts +8 -0
- package/types/models/mistral4/modeling_mistral4.d.ts.map +1 -0
- package/types/models/modeling_utils.d.ts +44 -35
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +14 -1
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts +8 -0
- package/types/models/nemotron_h/modeling_nemotron_h.d.ts.map +1 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +4 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +43 -6
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts +1 -0
- package/types/models/qwen2_vl/processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +3 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/solar_open/modeling_solar_open.d.ts +8 -0
- package/types/models/solar_open/modeling_solar_open.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/pipelines.d.ts +1 -0
- package/types/pipelines.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/hub.d.ts.map +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/models/ast/modeling_ast.d.ts.map +0 -1
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
- /package/src/models/{ast/modeling_ast.js → audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js} +0 -0
|
@@ -117,6 +117,9 @@ __export(transformers_exports, {
|
|
|
117
117
|
BloomModel: () => BloomModel,
|
|
118
118
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
119
119
|
BloomTokenizer: () => BloomTokenizer,
|
|
120
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
121
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
122
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
120
123
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
121
124
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
122
125
|
CLIPModel: () => CLIPModel,
|
|
@@ -212,6 +215,9 @@ __export(transformers_exports, {
|
|
|
212
215
|
DebertaV2Tokenizer: () => DebertaV2Tokenizer,
|
|
213
216
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
214
217
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
218
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
219
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
220
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
215
221
|
DeiTFeatureExtractor: () => DeiTFeatureExtractor,
|
|
216
222
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
217
223
|
DeiTImageProcessor: () => DeiTImageProcessor,
|
|
@@ -248,6 +254,7 @@ __export(transformers_exports, {
|
|
|
248
254
|
DonutImageProcessor: () => DonutImageProcessor,
|
|
249
255
|
DonutSwinModel: () => DonutSwinModel,
|
|
250
256
|
DonutSwinPreTrainedModel: () => DonutSwinPreTrainedModel,
|
|
257
|
+
DynamicCache: () => DynamicCache,
|
|
251
258
|
EdgeTamModel: () => EdgeTamModel,
|
|
252
259
|
EfficientNetForImageClassification: () => EfficientNetForImageClassification,
|
|
253
260
|
EfficientNetImageProcessor: () => EfficientNetImageProcessor,
|
|
@@ -271,6 +278,11 @@ __export(transformers_exports, {
|
|
|
271
278
|
EsmModel: () => EsmModel,
|
|
272
279
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
273
280
|
EsmTokenizer: () => EsmTokenizer,
|
|
281
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
282
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
283
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
284
|
+
EuroBertModel: () => EuroBertModel,
|
|
285
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
274
286
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
275
287
|
ExaoneModel: () => ExaoneModel,
|
|
276
288
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -320,6 +332,7 @@ __export(transformers_exports, {
|
|
|
320
332
|
Gemma3Model: () => Gemma3Model,
|
|
321
333
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
322
334
|
Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
|
|
335
|
+
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
323
336
|
Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
|
|
324
337
|
Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
|
|
325
338
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
@@ -327,8 +340,14 @@ __export(transformers_exports, {
|
|
|
327
340
|
GemmaModel: () => GemmaModel,
|
|
328
341
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
329
342
|
GemmaTokenizer: () => GemmaTokenizer,
|
|
343
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
344
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
330
345
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
331
346
|
GlmModel: () => GlmModel,
|
|
347
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
348
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
349
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
350
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
332
351
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
333
352
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
334
353
|
GptOssModel: () => GptOssModel,
|
|
@@ -339,6 +358,9 @@ __export(transformers_exports, {
|
|
|
339
358
|
GraniteMoeHybridModel: () => GraniteMoeHybridModel,
|
|
340
359
|
GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
|
|
341
360
|
GranitePreTrainedModel: () => GranitePreTrainedModel,
|
|
361
|
+
GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
|
|
362
|
+
GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
|
|
363
|
+
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
342
364
|
GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
|
|
343
365
|
GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
|
|
344
366
|
GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
|
|
@@ -364,7 +386,6 @@ __export(transformers_exports, {
|
|
|
364
386
|
IJepaPreTrainedModel: () => IJepaPreTrainedModel,
|
|
365
387
|
Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
|
|
366
388
|
Idefics3ImageProcessor: () => Idefics3ImageProcessor,
|
|
367
|
-
Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
|
|
368
389
|
Idefics3Processor: () => Idefics3Processor,
|
|
369
390
|
ImageClassificationPipeline: () => ImageClassificationPipeline,
|
|
370
391
|
ImageFeatureExtractionPipeline: () => ImageFeatureExtractionPipeline,
|
|
@@ -389,6 +410,10 @@ __export(transformers_exports, {
|
|
|
389
410
|
Lfm2MoeModel: () => Lfm2MoeModel,
|
|
390
411
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
391
412
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
413
|
+
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
414
|
+
Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
|
|
415
|
+
Lfm2VlProcessor: () => Lfm2VlProcessor,
|
|
416
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
392
417
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
393
418
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
394
419
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -458,6 +483,9 @@ __export(transformers_exports, {
|
|
|
458
483
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
459
484
|
MinLengthLogitsProcessor: () => MinLengthLogitsProcessor,
|
|
460
485
|
MinNewTokensLengthLogitsProcessor: () => MinNewTokensLengthLogitsProcessor,
|
|
486
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
487
|
+
Mistral4Model: () => Mistral4Model,
|
|
488
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
461
489
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
462
490
|
MistralModel: () => MistralModel,
|
|
463
491
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -529,6 +557,9 @@ __export(transformers_exports, {
|
|
|
529
557
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
530
558
|
NanoChatModel: () => NanoChatModel,
|
|
531
559
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
560
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
561
|
+
NemotronHModel: () => NemotronHModel,
|
|
562
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
532
563
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
533
564
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
534
565
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -572,7 +603,6 @@ __export(transformers_exports, {
|
|
|
572
603
|
Owlv2Model: () => Owlv2Model,
|
|
573
604
|
Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
|
|
574
605
|
PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
|
|
575
|
-
PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
|
|
576
606
|
PaliGemmaProcessor: () => PaliGemmaProcessor,
|
|
577
607
|
ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
|
|
578
608
|
ParakeetForCTC: () => ParakeetForCTC,
|
|
@@ -616,10 +646,12 @@ __export(transformers_exports, {
|
|
|
616
646
|
Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
|
|
617
647
|
Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
|
|
618
648
|
Qwen2Tokenizer: () => Qwen2Tokenizer,
|
|
649
|
+
Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
|
|
619
650
|
Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
|
|
620
651
|
Qwen2VLImageProcessor: () => Qwen2VLImageProcessor,
|
|
621
652
|
Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
|
|
622
653
|
Qwen2VLProcessor: () => Qwen2VLProcessor,
|
|
654
|
+
Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
|
|
623
655
|
Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
|
|
624
656
|
Qwen2_5_VLProcessor: () => Qwen2_5_VLProcessor,
|
|
625
657
|
Qwen3ForCausalLM: () => Qwen3ForCausalLM,
|
|
@@ -631,10 +663,14 @@ __export(transformers_exports, {
|
|
|
631
663
|
Qwen3NextModel: () => Qwen3NextModel,
|
|
632
664
|
Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
|
|
633
665
|
Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
|
|
666
|
+
Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
|
|
634
667
|
Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
|
|
668
|
+
Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
|
|
635
669
|
Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
|
|
636
670
|
Qwen3VLProcessor: () => Qwen3VLProcessor,
|
|
671
|
+
Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
|
|
637
672
|
Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
|
|
673
|
+
Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
|
|
638
674
|
Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
|
|
639
675
|
RFDetrForObjectDetection: () => RFDetrForObjectDetection,
|
|
640
676
|
RFDetrModel: () => RFDetrModel,
|
|
@@ -706,7 +742,6 @@ __export(transformers_exports, {
|
|
|
706
742
|
SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
|
|
707
743
|
SmolLM3Model: () => SmolLM3Model,
|
|
708
744
|
SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
|
|
709
|
-
SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
|
|
710
745
|
SmolVLMImageProcessor: () => Idefics3ImageProcessor,
|
|
711
746
|
SmolVLMProcessor: () => Idefics3Processor,
|
|
712
747
|
SnacDecoderModel: () => SnacDecoderModel,
|
|
@@ -714,6 +749,9 @@ __export(transformers_exports, {
|
|
|
714
749
|
SnacFeatureExtractor: () => SnacFeatureExtractor,
|
|
715
750
|
SnacModel: () => SnacModel,
|
|
716
751
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
752
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
753
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
754
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
717
755
|
SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
|
|
718
756
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
719
757
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
@@ -812,6 +850,10 @@ __export(transformers_exports, {
|
|
|
812
850
|
VitsTokenizer: () => VitsTokenizer,
|
|
813
851
|
VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
|
|
814
852
|
VoxtralProcessor: () => VoxtralProcessor,
|
|
853
|
+
VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
|
|
854
|
+
VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
|
|
855
|
+
VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
|
|
856
|
+
VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
|
|
815
857
|
Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
|
|
816
858
|
Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
|
|
817
859
|
Wav2Vec2BertModel: () => Wav2Vec2BertModel,
|
|
@@ -910,7 +952,7 @@ var import_node_fs = __toESM(require("fs"), 1);
|
|
|
910
952
|
var import_node_path = __toESM(require("path"), 1);
|
|
911
953
|
var import_node_url = __toESM(require("url"), 1);
|
|
912
954
|
var import_meta = {};
|
|
913
|
-
var VERSION = "4.0.0-next.6";
|
|
955
|
+
var VERSION = "4.0.0-next.8";
|
|
914
956
|
var HAS_SELF = typeof self !== "undefined";
|
|
915
957
|
var IS_FS_AVAILABLE = !isEmpty(import_node_fs.default);
|
|
916
958
|
var IS_PATH_AVAILABLE = !isEmpty(import_node_path.default);
|
|
@@ -1038,6 +1080,7 @@ var env = {
|
|
|
1038
1080
|
customCache: null,
|
|
1039
1081
|
useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
|
|
1040
1082
|
cacheKey: "transformers-cache",
|
|
1083
|
+
experimental_useCrossOriginStorage: false,
|
|
1041
1084
|
/////////////////// Custom fetch /////////////////////
|
|
1042
1085
|
fetch: DEFAULT_FETCH
|
|
1043
1086
|
//////////////////////////////////////////////////////
|
|
@@ -1139,7 +1182,7 @@ var logger = {
|
|
|
1139
1182
|
}
|
|
1140
1183
|
};
|
|
1141
1184
|
|
|
1142
|
-
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.
|
|
1185
|
+
// ../../node_modules/.pnpm/@huggingface+tokenizers@0.1.3/node_modules/@huggingface/tokenizers/dist/tokenizers.mjs
|
|
1143
1186
|
var DictionarySplitter = class {
|
|
1144
1187
|
/**
|
|
1145
1188
|
* @param dictionary The dictionary of words to use for splitting.
|
|
@@ -2795,10 +2838,10 @@ var BPE = class extends TokenizerModel_default {
|
|
|
2795
2838
|
);
|
|
2796
2839
|
if (byte_tokens.every((x) => this.tokens_to_ids.has(x))) {
|
|
2797
2840
|
output_tokens.push(...byte_tokens);
|
|
2798
|
-
} else {
|
|
2841
|
+
} else if (this.unk_token != null) {
|
|
2799
2842
|
output_tokens.push(this.unk_token);
|
|
2800
2843
|
}
|
|
2801
|
-
} else {
|
|
2844
|
+
} else if (this.unk_token != null) {
|
|
2802
2845
|
output_tokens.push(this.unk_token);
|
|
2803
2846
|
}
|
|
2804
2847
|
}
|
|
@@ -3588,7 +3631,7 @@ var Tokenizer = class {
|
|
|
3588
3631
|
};
|
|
3589
3632
|
var Tokenizer_default = Tokenizer;
|
|
3590
3633
|
|
|
3591
|
-
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.
|
|
3634
|
+
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.6/node_modules/@huggingface/jinja/dist/index.js
|
|
3592
3635
|
var TOKEN_TYPES = Object.freeze({
|
|
3593
3636
|
Text: "Text",
|
|
3594
3637
|
// The text between Jinja statements or expressions
|
|
@@ -5107,7 +5150,11 @@ var Environment = class {
|
|
|
5107
5150
|
["number", (operand) => operand instanceof IntegerValue || operand instanceof FloatValue],
|
|
5108
5151
|
["integer", (operand) => operand instanceof IntegerValue],
|
|
5109
5152
|
["iterable", (operand) => operand.type === "ArrayValue" || operand.type === "StringValue"],
|
|
5110
|
-
["mapping", (operand) => operand.type === "ObjectValue"],
|
|
5153
|
+
["mapping", (operand) => operand instanceof ObjectValue],
|
|
5154
|
+
[
|
|
5155
|
+
"sequence",
|
|
5156
|
+
(operand) => operand instanceof ArrayValue || operand instanceof ObjectValue || operand instanceof StringValue
|
|
5157
|
+
],
|
|
5111
5158
|
[
|
|
5112
5159
|
"lower",
|
|
5113
5160
|
(operand) => {
|
|
@@ -5380,6 +5427,9 @@ var Interpreter = class {
|
|
|
5380
5427
|
applyFilter(operand, filterNode, environment) {
|
|
5381
5428
|
if (filterNode.type === "Identifier") {
|
|
5382
5429
|
const filter = filterNode;
|
|
5430
|
+
if (filter.value === "safe") {
|
|
5431
|
+
return operand;
|
|
5432
|
+
}
|
|
5383
5433
|
if (filter.value === "tojson") {
|
|
5384
5434
|
return new StringValue(toJSON(operand, {}));
|
|
5385
5435
|
}
|
|
@@ -5469,6 +5519,8 @@ var Interpreter = class {
|
|
|
5469
5519
|
return new IntegerValue(Math.floor(operand.value));
|
|
5470
5520
|
case "float":
|
|
5471
5521
|
return new FloatValue(operand.value);
|
|
5522
|
+
case "string":
|
|
5523
|
+
return new StringValue(operand.toString());
|
|
5472
5524
|
default:
|
|
5473
5525
|
throw new Error(`Unknown NumericValue filter: ${filter.value}`);
|
|
5474
5526
|
}
|
|
@@ -6897,9 +6949,216 @@ function toAbsoluteURL(url2) {
|
|
|
6897
6949
|
return new URL(url2, baseURL).href;
|
|
6898
6950
|
}
|
|
6899
6951
|
|
|
6952
|
+
// src/utils/cache/CrossOriginStorageCache.js
|
|
6953
|
+
var HASH_ALGORITHM = "SHA-256";
|
|
6954
|
+
var HASH_CACHE_NAME = "experimental_transformers-hash-cache";
|
|
6955
|
+
var makeHashDescriptor = (value) => ({ algorithm: HASH_ALGORITHM, value });
|
|
6956
|
+
var CrossOriginStorage = class {
|
|
6957
|
+
/** @type {Promise<Cache> | null} */
|
|
6958
|
+
#hashCache = null;
|
|
6959
|
+
/**
|
|
6960
|
+
* Returns (and lazily opens) the hash cache, reusing the same promise across concurrent callers.
|
|
6961
|
+
* @returns {Promise<Cache>}
|
|
6962
|
+
*/
|
|
6963
|
+
_getHashCache = () => {
|
|
6964
|
+
this.#hashCache ??= caches.open(HASH_CACHE_NAME);
|
|
6965
|
+
return this.#hashCache;
|
|
6966
|
+
};
|
|
6967
|
+
/**
|
|
6968
|
+
* Returns whether the `navigator.crossOriginStorage` API is available in the current environment.
|
|
6969
|
+
* @returns {boolean}
|
|
6970
|
+
*/
|
|
6971
|
+
static isAvailable = () => typeof navigator !== "undefined" && "crossOriginStorage" in navigator;
|
|
6972
|
+
/**
|
|
6973
|
+
* Looks up a cached response for the given URL by resolving its SHA-256 hash and requesting
|
|
6974
|
+
* the corresponding file handle from cross-origin storage.
|
|
6975
|
+
*
|
|
6976
|
+
* Implements `CacheInterface.match`.
|
|
6977
|
+
*
|
|
6978
|
+
* @param {string} request The URL of the resource to look up.
|
|
6979
|
+
* @returns {Promise<Response|undefined>} The cached `Response`, or `undefined` if not found.
|
|
6980
|
+
*/
|
|
6981
|
+
match = async (request) => {
|
|
6982
|
+
const hashValue = await this._getFileHash(request);
|
|
6983
|
+
if (!hashValue) {
|
|
6984
|
+
return void 0;
|
|
6985
|
+
}
|
|
6986
|
+
try {
|
|
6987
|
+
const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashValue)]);
|
|
6988
|
+
const blob = await handle.getFile();
|
|
6989
|
+
return new Response(blob, {
|
|
6990
|
+
headers: {
|
|
6991
|
+
"Content-Length": String(blob.size)
|
|
6992
|
+
}
|
|
6993
|
+
});
|
|
6994
|
+
} catch {
|
|
6995
|
+
return void 0;
|
|
6996
|
+
}
|
|
6997
|
+
};
|
|
6998
|
+
/**
|
|
6999
|
+
* Stores a response in cross-origin storage, keyed by its SHA-256 hash.
|
|
7000
|
+
*
|
|
7001
|
+
* For LFS-backed URLs the hash is resolved cheaply via `_getFileHash` (which checks
|
|
7002
|
+
* `HASH_CACHE_NAME` first, then falls back to fetching the Git LFS pointer file)
|
|
7003
|
+
* without reading the response body a second time.
|
|
7004
|
+
*
|
|
7005
|
+
* For non-LFS resources the hash is unknown upfront. In that case the body is consumed
|
|
7006
|
+
* in the background: the stream is read to compute the content hash, the file is written
|
|
7007
|
+
* into cross-origin storage, and the computed hash is persisted to `HASH_CACHE_NAME`
|
|
7008
|
+
* so that future `match` calls can resolve the file without a network round-trip.
|
|
7009
|
+
*
|
|
7010
|
+
* Implements `CacheInterface.put`.
|
|
7011
|
+
*
|
|
7012
|
+
* @param {string} request The URL of the resource (used as the hash-cache key).
|
|
7013
|
+
* @param {Response} response The response whose body will be written to the cache.
|
|
7014
|
+
* @returns {Promise<void>}
|
|
7015
|
+
*/
|
|
7016
|
+
put = async (request, response) => {
|
|
7017
|
+
const hashValue = await this._getFileHash(request);
|
|
7018
|
+
if (hashValue) {
|
|
7019
|
+
const blob = await response.blob();
|
|
7020
|
+
await this._storeBlobInCOS(blob, hashValue);
|
|
7021
|
+
} else {
|
|
7022
|
+
this._processAndStore(request, response.body);
|
|
7023
|
+
}
|
|
7024
|
+
};
|
|
7025
|
+
/**
 * Persists `blob` in cross-origin storage under an already-known hash.
 *
 * A single file handle is requested (with `create: true`) for the descriptor
 * built from `hashHex`; the blob is then written through a writable obtained
 * from that handle, and the writable is closed.
 *
 * @param {Blob} blob The data to store.
 * @param {string} hashHex Hex-encoded SHA-256 hash of `blob`.
 * @returns {Promise<void>}
 */
_storeBlobInCOS = async (blob, hashHex) => {
  const storage = navigator.crossOriginStorage;
  const [fileHandle] = await storage.requestFileHandles([makeHashDescriptor(hashHex)], { create: true });
  const writer = await fileHandle.createWritable();
  await writer.write(blob);
  await writer.close();
};
|
|
7040
|
+
/**
 * Background task for resources whose hash is not known upfront: reads `stream`
 * into a blob, computes the blob's hash via `_getBlobHash`, writes the blob to
 * cross-origin storage, and then records the hash in the hash cache under
 * `request` so later lookups can find the file by URL.
 *
 * Called fire-and-forget from `put`. Every failure is swallowed on purpose so
 * that a caching problem never surfaces to the caller.
 *
 * @param {string} request The original resource URL.
 * @param {ReadableStream} stream The response body stream to consume.
 * @returns {Promise<void>}
 */
_processAndStore = async (request, stream) => {
  try {
    if (!stream) {
      // A response without a body has nothing to hash or store.
      return;
    }
    // Collect the stream through `Response#blob()` rather than `for await`:
    // async iteration of a ReadableStream is not available in every browser,
    // and a failure here would be silently swallowed below.
    const blob = await new Response(stream).blob();
    const hashHex = await this._getBlobHash(blob);
    await this._storeBlobInCOS(blob, hashHex);
    try {
      const hashCache = await this._getHashCache();
      await hashCache.put(request, new Response(hashHex));
    } catch {
      // Best-effort: the file is already stored; only the URL -> hash mapping is missing.
    }
  } catch {
    // Best-effort: caching failures must never reach the caller of `put`.
  }
};
|
|
7070
|
+
/**
|
|
7071
|
+
* Deletes the cache entry for the given request.
|
|
7072
|
+
*
|
|
7073
|
+
* Removes the hash entry from `HASH_CACHE_NAME`. Note: cross-origin storage itself does not
|
|
7074
|
+
* expose a delete API, so only the local hash mapping is removed. For non-LFS URLs this
|
|
7075
|
+
* permanently prevents `match` from resolving the file. For LFS-backed URLs, `match` will
|
|
7076
|
+
* re-fetch the LFS pointer file on the next call and repopulate the hash cache automatically.
|
|
7077
|
+
*
|
|
7078
|
+
* Implements `CacheInterface.delete`.
|
|
7079
|
+
*
|
|
7080
|
+
* @param {string} request
|
|
7081
|
+
* @returns {Promise<boolean>} Resolves to `true` if the hash entry was deleted, `false` otherwise.
|
|
7082
|
+
*/
|
|
7083
|
+
delete = async (request) => {
|
|
7084
|
+
try {
|
|
7085
|
+
const hashCache = await this._getHashCache();
|
|
7086
|
+
return await hashCache.delete(request);
|
|
7087
|
+
} catch {
|
|
7088
|
+
return false;
|
|
7089
|
+
}
|
|
7090
|
+
};
|
|
7091
|
+
/**
 * Resolves the SHA-256 hash for a given URL.
 *
 * Lookup order:
 *  1. the hash cache (a previously persisted hash for this URL);
 *  2. `_getLfsFileHash(url)`, whose result is then persisted to the hash cache.
 *
 * Returns `null` when neither source yields a hash, or when any step fails.
 *
 * @param {string} url2 The resource URL to resolve a hash for.
 * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
 */
_getFileHash = async (url2) => {
  try {
    const hashCache = await this._getHashCache();
    const cached = await hashCache.match(url2);
    if (cached) {
      // `await` here so that a failed body read is handled by the `catch`
      // below instead of rejecting the promise returned to the caller.
      return await cached.text();
    }
    const hash = await this._getLfsFileHash(url2);
    if (!hash) {
      return null;
    }
    try {
      await hashCache.put(url2, new Response(hash));
    } catch {
      // Persisting is an optimisation only: a failed write must not discard
      // a hash that was resolved successfully.
    }
    return hash;
  } catch {
    return null;
  }
};
|
|
7120
|
+
/**
 * Attempts to read the SHA-256 hash of a resource from its raw Git LFS pointer file.
 *
 * Only URLs containing `/resolve/` are considered; the first `/resolve/` segment
 * is rewritten to `/raw/` and the result is fetched. The hash is taken from a
 * line of the form `oid sha256:<64 lowercase hex characters>`.
 *
 * Returns `null` when the URL has no `/resolve/` segment, the request fails or
 * returns a non-2xx status, or the body contains no valid `oid` line.
 *
 * @see https://huggingface.co/docs/hub/en/storage-backends#xet
 * @param {string} url2 The resolved Hugging Face URL of the resource.
 * @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
 */
_getLfsFileHash = async (url2) => {
  if (!url2.includes("/resolve/")) {
    return null;
  }
  const rawUrl = url2.replace("/resolve/", "/raw/");
  try {
    const response = await fetch(rawUrl);
    if (!response.ok) {
      // Never parse an error page as if it were a pointer file.
      return null;
    }
    const text = await response.text();
    // A SHA-256 digest is exactly 64 hex characters; reject anything else.
    const match = /^oid sha256:([0-9a-f]{64})$/m.exec(text);
    return match ? match[1] : null;
  } catch {
    return null;
  }
};
|
|
7145
|
+
/**
 * Hashes the contents of a `Blob` with `HASH_ALGORITHM` via Web Crypto and
 * returns the digest as lowercase hex (two characters per byte).
 *
 * @param {Blob} blob The blob to hash.
 * @returns {Promise<string>} The lowercase hex-encoded digest.
 */
_getBlobHash = async (blob) => {
  const contents = await blob.arrayBuffer();
  const digestBytes = new Uint8Array(await crypto.subtle.digest(HASH_ALGORITHM, contents));
  let hex = "";
  for (const byte of digestBytes) {
    hex += byte.toString(16).padStart(2, "0");
  }
  return hex;
};
|
|
7157
|
+
};
|
|
7158
|
+
|
|
6900
7159
|
// src/utils/cache.js
|
|
6901
7160
|
async function getCache(file_cache_dir = null) {
|
|
6902
|
-
let
|
|
7161
|
+
let cache2 = null;
|
|
6903
7162
|
if (env.useCustomCache) {
|
|
6904
7163
|
if (!env.customCache) {
|
|
6905
7164
|
throw Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
|
|
@@ -6909,30 +7168,33 @@ async function getCache(file_cache_dir = null) {
|
|
|
6909
7168
|
"`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
|
|
6910
7169
|
);
|
|
6911
7170
|
}
|
|
6912
|
-
|
|
7171
|
+
cache2 = env.customCache;
|
|
6913
7172
|
}
|
|
6914
|
-
if (!
|
|
7173
|
+
if (!cache2 && env.experimental_useCrossOriginStorage && CrossOriginStorage.isAvailable()) {
|
|
7174
|
+
cache2 = new CrossOriginStorage();
|
|
7175
|
+
}
|
|
7176
|
+
if (!cache2 && env.useBrowserCache) {
|
|
6915
7177
|
if (typeof caches === "undefined") {
|
|
6916
7178
|
throw Error("Browser cache is not available in this environment.");
|
|
6917
7179
|
}
|
|
6918
7180
|
try {
|
|
6919
|
-
|
|
7181
|
+
cache2 = await caches.open(env.cacheKey);
|
|
6920
7182
|
} catch (e) {
|
|
6921
7183
|
logger.warn("An error occurred while opening the browser cache:", e);
|
|
6922
7184
|
}
|
|
6923
7185
|
}
|
|
6924
|
-
if (!
|
|
7186
|
+
if (!cache2 && env.useFSCache) {
|
|
6925
7187
|
if (!apis.IS_FS_AVAILABLE) {
|
|
6926
7188
|
throw Error("File System Cache is not available in this environment.");
|
|
6927
7189
|
}
|
|
6928
|
-
|
|
7190
|
+
cache2 = new FileCache(file_cache_dir ?? env.cacheDir);
|
|
6929
7191
|
}
|
|
6930
|
-
return
|
|
7192
|
+
return cache2;
|
|
6931
7193
|
}
|
|
6932
|
-
async function tryCache(
|
|
7194
|
+
async function tryCache(cache2, ...names) {
|
|
6933
7195
|
for (let name of names) {
|
|
6934
7196
|
try {
|
|
6935
|
-
let result = await
|
|
7197
|
+
let result = await cache2.match(name);
|
|
6936
7198
|
if (result) return result;
|
|
6937
7199
|
} catch (e) {
|
|
6938
7200
|
continue;
|
|
@@ -6941,6 +7203,83 @@ async function tryCache(cache, ...names) {
|
|
|
6941
7203
|
return void 0;
|
|
6942
7204
|
}
|
|
6943
7205
|
|
|
7206
|
+
// src/utils/lru_cache.js
|
|
7207
|
+
var LRUCache2 = class {
  /**
   * Maximum number of entries kept before the oldest one is evicted.
   * @type {number}
   */
  #limit;
  /**
   * Backing store. A `Map` iterates in insertion order, so the first key is
   * always the least recently used one.
   * @type {Map<any, any>}
   */
  #entries;
  /**
   * Creates an LRUCache instance.
   * @param {number} capacity The maximum number of items the cache can hold.
   */
  constructor(capacity) {
    this.#limit = capacity;
    this.#entries = new Map();
  }
  /**
   * Looks up `key` and, when present, moves it to the most-recently-used position.
   * @param {any} key The key to retrieve.
   * @returns {any} The stored value, or undefined if the key does not exist.
   */
  get(key) {
    const entries = this.#entries;
    if (entries.has(key)) {
      const value = entries.get(key);
      // Re-insert so the key becomes the newest entry.
      entries.delete(key);
      entries.set(key, value);
      return value;
    }
    return void 0;
  }
  /**
   * Stores `value` under `key` as the most recently used entry.
   * When the cache then holds more than `capacity` entries, the least recently
   * used entry is evicted.
   * @param {any} key The key to add or update.
   * @param {any} value The value to associate with the key.
   */
  put(key, value) {
    const entries = this.#entries;
    // Deleting first (a no-op for new keys) guarantees the key ends up last.
    entries.delete(key);
    entries.set(key, value);
    if (entries.size > this.#limit) {
      const [oldestKey] = entries.keys();
      entries.delete(oldestKey);
    }
  }
  /**
   * Removes the entry for the given key from the cache.
   * @param {any} key The key to delete.
   * @returns {boolean} `true` if the entry existed and was removed, `false` otherwise.
   */
  delete(key) {
    return this.#entries.delete(key);
  }
  /**
   * Removes every entry from the cache.
   */
  clear() {
    this.#entries.clear();
  }
};
|
|
7263
|
+
|
|
7264
|
+
// src/utils/memoize_promise.js
|
|
7265
|
+
var MAX_CACHE_SIZE = 100;
|
|
7266
|
+
var cache = new LRUCache2(MAX_CACHE_SIZE);
|
|
7267
|
+
/**
 * Returns the promise cached under `key`, or creates one with `factory`,
 * caches it, and returns it. Concurrent callers with the same key therefore
 * share a single in-flight promise.
 *
 * A rejected promise is removed from the cache so that a later call can retry.
 *
 * @param {any} key Cache key identifying the operation.
 * @param {() => Promise<any>} factory Called (at most once per cached entry) to start the operation.
 * @returns {Promise<any>} The cached or newly created promise.
 */
function memoizePromise(key, factory) {
  const cached = cache.get(key);
  if (cached !== void 0) {
    return cached;
  }
  const promise = factory().then(
    (value) => value,
    (err) => {
      // Only evict our own entry: if this promise was already evicted and a
      // newer one was stored under the same key, that newer one must survive.
      if (cache.get(key) === promise) {
        cache.delete(key);
      }
      throw err;
    }
  );
  cache.put(key, promise);
  return promise;
}
|
|
7282
|
+
|
|
6944
7283
|
// src/utils/model_registry/get_file_metadata.js
|
|
6945
7284
|
async function fetch_file_head(urlOrPath) {
|
|
6946
7285
|
if (!isValidUrl(urlOrPath, ["http:", "https:"])) {
|
|
@@ -6948,17 +7287,27 @@ async function fetch_file_head(urlOrPath) {
|
|
|
6948
7287
|
}
|
|
6949
7288
|
const headers = getFetchHeaders(urlOrPath);
|
|
6950
7289
|
headers.set("Range", "bytes=0-0");
|
|
6951
|
-
return env.fetch(urlOrPath, { method: "GET", headers });
|
|
7290
|
+
return env.fetch(urlOrPath, { method: "GET", headers, cache: "no-store" });
|
|
7291
|
+
}
|
|
7292
|
+
/**
 * Memoized front-end for `_get_file_metadata`.
 *
 * Calls that agree on repo/path, filename, and the `revision`, `cache_dir` and
 * `local_files_only` options share one cache key and are routed through
 * `memoizePromise`; other option fields do not take part in the key.
 *
 * @param {string} path_or_repo_id Model id or local path.
 * @param {string} filename Name of the file within the model.
 * @param {Object} [options] Options forwarded unchanged to `_get_file_metadata`.
 * @returns {Promise<any>} Whatever `memoizePromise` returns for this key.
 */
function get_file_metadata(path_or_repo_id, filename, options = {}) {
  const { revision, cache_dir, local_files_only } = options ?? {};
  const memoKey = JSON.stringify([path_or_repo_id, filename, revision, cache_dir, local_files_only]);
  const load = () => _get_file_metadata(path_or_repo_id, filename, options);
  return memoizePromise(memoKey, load);
}
|
|
6953
|
-
async function
|
|
6954
|
-
const
|
|
7302
|
+
async function _get_file_metadata(path_or_repo_id, filename, options) {
|
|
7303
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6955
7304
|
const { localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6956
7305
|
path_or_repo_id,
|
|
6957
7306
|
filename,
|
|
6958
7307
|
options,
|
|
6959
|
-
|
|
7308
|
+
cache2
|
|
6960
7309
|
);
|
|
6961
|
-
const cachedResponse = await checkCachedResource(
|
|
7310
|
+
const cachedResponse = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6962
7311
|
if (cachedResponse !== void 0 && typeof cachedResponse !== "string") {
|
|
6963
7312
|
const size = cachedResponse.headers.get("content-length");
|
|
6964
7313
|
const contentType = cachedResponse.headers.get("content-type");
|
|
@@ -7056,7 +7405,7 @@ function getFetchHeaders(urlOrPath) {
|
|
|
7056
7405
|
}
|
|
7057
7406
|
return headers;
|
|
7058
7407
|
}
|
|
7059
|
-
function buildResourcePaths(path_or_repo_id, filename, options = {},
|
|
7408
|
+
function buildResourcePaths(path_or_repo_id, filename, options = {}, cache2 = null) {
|
|
7060
7409
|
const revision = options.revision ?? "main";
|
|
7061
7410
|
const requestURL = pathJoin(path_or_repo_id, filename);
|
|
7062
7411
|
const validModelId = isValidHfModelId(path_or_repo_id);
|
|
@@ -7066,7 +7415,7 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
7066
7415
|
env.remotePathTemplate.replaceAll("{model}", path_or_repo_id).replaceAll("{revision}", encodeURIComponent(revision)),
|
|
7067
7416
|
filename
|
|
7068
7417
|
);
|
|
7069
|
-
const proposedCacheKey =
|
|
7418
|
+
const proposedCacheKey = cache2 instanceof FileCache ? (
|
|
7070
7419
|
// Choose cache key for filesystem cache
|
|
7071
7420
|
// When using the main revision (default), we use the request URL as the cache key.
|
|
7072
7421
|
// If a specific revision is requested, we account for this in the cache key.
|
|
@@ -7080,14 +7429,14 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
7080
7429
|
validModelId
|
|
7081
7430
|
};
|
|
7082
7431
|
}
|
|
7083
|
-
async function checkCachedResource(
|
|
7084
|
-
if (!
|
|
7432
|
+
async function checkCachedResource(cache2, localPath, proposedCacheKey) {
|
|
7433
|
+
if (!cache2) {
|
|
7085
7434
|
return void 0;
|
|
7086
7435
|
}
|
|
7087
|
-
return await tryCache(
|
|
7436
|
+
return await tryCache(cache2, localPath, proposedCacheKey);
|
|
7088
7437
|
}
|
|
7089
|
-
async function storeCachedResource(path_or_repo_id, filename,
|
|
7090
|
-
if (await
|
|
7438
|
+
async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options = {}) {
|
|
7439
|
+
if (await cache2.match(cacheKey) !== void 0) {
|
|
7091
7440
|
return;
|
|
7092
7441
|
}
|
|
7093
7442
|
if (!result) {
|
|
@@ -7097,20 +7446,22 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
7097
7446
|
file: filename,
|
|
7098
7447
|
...data
|
|
7099
7448
|
}) : void 0;
|
|
7100
|
-
await
|
|
7449
|
+
await cache2.put(
|
|
7101
7450
|
cacheKey,
|
|
7102
7451
|
/** @type {Response} */
|
|
7103
7452
|
response,
|
|
7104
7453
|
wrapped_progress
|
|
7105
7454
|
);
|
|
7106
7455
|
} else if (typeof response !== "string") {
|
|
7107
|
-
|
|
7456
|
+
const headers = new Headers(response.headers);
|
|
7457
|
+
headers.set("content-length", result.byteLength.toString());
|
|
7458
|
+
await cache2.put(
|
|
7108
7459
|
cacheKey,
|
|
7109
7460
|
new Response(
|
|
7110
7461
|
/** @type {any} */
|
|
7111
7462
|
result,
|
|
7112
7463
|
{
|
|
7113
|
-
headers
|
|
7464
|
+
headers
|
|
7114
7465
|
}
|
|
7115
7466
|
)
|
|
7116
7467
|
).catch((err) => {
|
|
@@ -7118,17 +7469,17 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
7118
7469
|
});
|
|
7119
7470
|
}
|
|
7120
7471
|
}
|
|
7121
|
-
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false,
|
|
7472
|
+
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache2 = null) {
|
|
7122
7473
|
const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
7123
7474
|
path_or_repo_id,
|
|
7124
7475
|
filename,
|
|
7125
7476
|
options,
|
|
7126
|
-
|
|
7477
|
+
cache2
|
|
7127
7478
|
);
|
|
7128
7479
|
let cacheKey;
|
|
7129
7480
|
let toCacheResponse = false;
|
|
7130
7481
|
let response;
|
|
7131
|
-
response = await checkCachedResource(
|
|
7482
|
+
response = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
7132
7483
|
const cacheHit = response !== void 0;
|
|
7133
7484
|
if (!cacheHit) {
|
|
7134
7485
|
if (env.allowLocalModels) {
|
|
@@ -7169,7 +7520,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
7169
7520
|
}
|
|
7170
7521
|
cacheKey = proposedCacheKey;
|
|
7171
7522
|
}
|
|
7172
|
-
toCacheResponse =
|
|
7523
|
+
toCacheResponse = cache2 && // 1. A caching system is available
|
|
7173
7524
|
typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment)
|
|
7174
7525
|
response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`)
|
|
7175
7526
|
response.status === 200;
|
|
@@ -7231,7 +7582,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
7231
7582
|
// i.e., do not cache FileResponses (prevents duplication)
|
|
7232
7583
|
toCacheResponse && cacheKey && typeof response !== "string"
|
|
7233
7584
|
) {
|
|
7234
|
-
await storeCachedResource(path_or_repo_id, filename,
|
|
7585
|
+
await storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options);
|
|
7235
7586
|
}
|
|
7236
7587
|
dispatchCallback(options.progress_callback, {
|
|
7237
7588
|
status: "done",
|
|
@@ -7247,7 +7598,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
7247
7598
|
if (response instanceof FileResponse) {
|
|
7248
7599
|
return response.filePath;
|
|
7249
7600
|
}
|
|
7250
|
-
const cachedResponse = await
|
|
7601
|
+
const cachedResponse = await cache2?.match(cacheKey);
|
|
7251
7602
|
if (cachedResponse instanceof FileResponse) {
|
|
7252
7603
|
return cachedResponse.filePath;
|
|
7253
7604
|
} else if (cachedResponse instanceof Response) {
|
|
@@ -7274,8 +7625,8 @@ async function getModelFile(path_or_repo_id, filename, fatal = true, options = {
|
|
|
7274
7625
|
name: path_or_repo_id,
|
|
7275
7626
|
file: filename
|
|
7276
7627
|
});
|
|
7277
|
-
const
|
|
7278
|
-
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path,
|
|
7628
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
7629
|
+
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache2);
|
|
7279
7630
|
}
|
|
7280
7631
|
async function getModelText(modelPath, fileName, fatal = true, options = {}) {
|
|
7281
7632
|
const buffer = await getModelFile(modelPath, fileName, fatal, options, false);
|
|
@@ -8068,7 +8419,7 @@ var uint16_to_float32 = /* @__PURE__ */ (function() {
|
|
|
8068
8419
|
// src/backends/onnx.js
|
|
8069
8420
|
var ONNX_NODE = __toESM(require("onnxruntime-node"), 1);
|
|
8070
8421
|
|
|
8071
|
-
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.
|
|
8422
|
+
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260307-d626b568e0/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
|
|
8072
8423
|
var ort_webgpu_bundle_min_exports = {};
|
|
8073
8424
|
__export(ort_webgpu_bundle_min_exports, {
|
|
8074
8425
|
InferenceSession: () => Jf,
|
|
@@ -8837,7 +9188,7 @@ async function ts(a = {}) {
|
|
|
8837
9188
|
throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
|
|
8838
9189
|
}
|
|
8839
9190
|
function Ye() {
|
|
8840
|
-
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii,
|
|
9191
|
+
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
|
|
8841
9192
|
}
|
|
8842
9193
|
async function bt() {
|
|
8843
9194
|
function e(o, u) {
|
|
@@ -10024,7 +10375,7 @@ async function ts(a = {}) {
|
|
|
10024
10375
|
Te(`invalid type for getValue: ${t}`);
|
|
10025
10376
|
}
|
|
10026
10377
|
}, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
|
|
10027
|
-
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = {
|
|
10378
|
+
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 925676: (e, t, n, o, u) => {
|
|
10028
10379
|
if (r === void 0 || !r.Uc) return 1;
|
|
10029
10380
|
if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
|
|
10030
10381
|
if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
|
|
@@ -10044,11 +10395,11 @@ async function ts(a = {}) {
|
|
|
10044
10395
|
} catch {
|
|
10045
10396
|
return 4;
|
|
10046
10397
|
}
|
|
10047
|
-
},
|
|
10398
|
+
}, 926500: (e, t, n) => {
|
|
10048
10399
|
r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
|
|
10049
|
-
},
|
|
10400
|
+
}, 926564: () => r.me(), 926606: (e) => {
|
|
10050
10401
|
r.jd(e);
|
|
10051
|
-
},
|
|
10402
|
+
}, 926643: () => typeof wasmOffsetConverter < "u" };
|
|
10052
10403
|
function af(e, t, n, o) {
|
|
10053
10404
|
var u = P();
|
|
10054
10405
|
try {
|
|
@@ -11964,7 +12315,7 @@ var $s = k(() => {
|
|
|
11964
12315
|
Ve();
|
|
11965
12316
|
Ve();
|
|
11966
12317
|
Ve();
|
|
11967
|
-
var Xa = "1.25.0-dev.
|
|
12318
|
+
var Xa = "1.25.0-dev.20260307-d626b568e0";
|
|
11968
12319
|
var Tl = Zr;
|
|
11969
12320
|
{
|
|
11970
12321
|
let a = ($s(), $t(Gs)).wasmBackend;
|
|
@@ -11975,11 +12326,11 @@ Object.defineProperty(K.versions, "web", { value: Xa, enumerable: true });
|
|
|
11975
12326
|
// src/backends/utils/cacheWasm.js
|
|
11976
12327
|
async function loadAndCacheFile(url2) {
|
|
11977
12328
|
const fileName = url2.split("/").pop();
|
|
11978
|
-
let
|
|
12329
|
+
let cache2;
|
|
11979
12330
|
try {
|
|
11980
|
-
|
|
11981
|
-
if (
|
|
11982
|
-
const result = await
|
|
12331
|
+
cache2 = await getCache();
|
|
12332
|
+
if (cache2) {
|
|
12333
|
+
const result = await cache2.match(url2);
|
|
11983
12334
|
if (result) {
|
|
11984
12335
|
return result;
|
|
11985
12336
|
}
|
|
@@ -11991,9 +12342,9 @@ async function loadAndCacheFile(url2) {
|
|
|
11991
12342
|
if (!response.ok) {
|
|
11992
12343
|
throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
|
|
11993
12344
|
}
|
|
11994
|
-
if (
|
|
12345
|
+
if (cache2) {
|
|
11995
12346
|
try {
|
|
11996
|
-
await
|
|
12347
|
+
await cache2.put(url2, response.clone());
|
|
11997
12348
|
} catch (e) {
|
|
11998
12349
|
logger.warn(`Failed to cache ${fileName}:`, e);
|
|
11999
12350
|
}
|
|
@@ -13845,9 +14196,23 @@ var Tensor2 = class _Tensor {
|
|
|
13845
14196
|
throw Error(`Unsupported norm: ${p}`);
|
|
13846
14197
|
}
|
|
13847
14198
|
const this_data = this.data;
|
|
13848
|
-
const
|
|
14199
|
+
const is_bigint = this_data instanceof BigInt64Array || this_data instanceof BigUint64Array;
|
|
14200
|
+
if (is_bigint && p !== 1) {
|
|
14201
|
+
throw Error(`Expected a floating point tensor as input. Got ${this.type}`);
|
|
14202
|
+
}
|
|
14203
|
+
let fn2, zero;
|
|
14204
|
+
if (is_bigint) {
|
|
14205
|
+
fn2 = (a, b) => a + b;
|
|
14206
|
+
zero = 0n;
|
|
14207
|
+
} else {
|
|
14208
|
+
fn2 = (a, b) => a + b ** p;
|
|
14209
|
+
zero = 0;
|
|
14210
|
+
}
|
|
13849
14211
|
if (dim === null) {
|
|
13850
|
-
|
|
14212
|
+
let val = this_data.reduce(fn2, zero);
|
|
14213
|
+
if (p !== 1) {
|
|
14214
|
+
val = val ** (1 / p);
|
|
14215
|
+
}
|
|
13851
14216
|
return new _Tensor(this.type, [val], []);
|
|
13852
14217
|
}
|
|
13853
14218
|
const [type, result, resultDims] = reduce_helper(fn2, this, dim, keepdim);
|
|
@@ -16307,9 +16672,12 @@ __export(processors_exports, {
|
|
|
16307
16672
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
16308
16673
|
Florence2Processor: () => Florence2Processor,
|
|
16309
16674
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
16675
|
+
Glm46VProcessor: () => Glm46VProcessor,
|
|
16676
|
+
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
16310
16677
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
16311
16678
|
Idefics3Processor: () => Idefics3Processor,
|
|
16312
16679
|
JinaCLIPProcessor: () => JinaCLIPProcessor,
|
|
16680
|
+
Lfm2VlProcessor: () => Lfm2VlProcessor,
|
|
16313
16681
|
LlavaProcessor: () => LlavaProcessor,
|
|
16314
16682
|
MgpstrProcessor: () => MgpstrProcessor,
|
|
16315
16683
|
MoonshineProcessor: () => MoonshineProcessor,
|
|
@@ -16330,6 +16698,7 @@ __export(processors_exports, {
|
|
|
16330
16698
|
UltravoxProcessor: () => UltravoxProcessor,
|
|
16331
16699
|
VLChatProcessor: () => VLChatProcessor,
|
|
16332
16700
|
VoxtralProcessor: () => VoxtralProcessor,
|
|
16701
|
+
VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
|
|
16333
16702
|
Wav2Vec2Processor: () => Wav2Vec2Processor,
|
|
16334
16703
|
Wav2Vec2ProcessorWithLM: () => Wav2Vec2ProcessorWithLM,
|
|
16335
16704
|
WhisperProcessor: () => WhisperProcessor
|
|
@@ -16384,12 +16753,14 @@ __export(feature_extractors_exports, {
|
|
|
16384
16753
|
EncodecFeatureExtractor: () => EncodecFeatureExtractor,
|
|
16385
16754
|
FeatureExtractor: () => FeatureExtractor,
|
|
16386
16755
|
Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
|
|
16756
|
+
GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
|
|
16387
16757
|
MoonshineFeatureExtractor: () => MoonshineFeatureExtractor,
|
|
16388
16758
|
ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
|
|
16389
16759
|
PyAnnoteFeatureExtractor: () => PyAnnoteFeatureExtractor,
|
|
16390
16760
|
SeamlessM4TFeatureExtractor: () => SeamlessM4TFeatureExtractor,
|
|
16391
16761
|
SnacFeatureExtractor: () => SnacFeatureExtractor,
|
|
16392
16762
|
SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
|
|
16763
|
+
VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
|
|
16393
16764
|
Wav2Vec2FeatureExtractor: () => Wav2Vec2FeatureExtractor,
|
|
16394
16765
|
WeSpeakerFeatureExtractor: () => WeSpeakerFeatureExtractor,
|
|
16395
16766
|
WhisperFeatureExtractor: () => WhisperFeatureExtractor
|
|
@@ -16617,6 +16988,7 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
16617
16988
|
mel_filters = null,
|
|
16618
16989
|
mel_floor = 1e-10,
|
|
16619
16990
|
log_mel = null,
|
|
16991
|
+
max_log_mel = null,
|
|
16620
16992
|
reference = 1,
|
|
16621
16993
|
min_value = 1e-10,
|
|
16622
16994
|
db_range = null,
|
|
@@ -16756,6 +17128,17 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
16756
17128
|
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
16757
17129
|
}
|
|
16758
17130
|
break;
|
|
17131
|
+
case "log10_max_norm": {
|
|
17132
|
+
for (let i = 0; i < o; ++i) {
|
|
17133
|
+
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
17134
|
+
}
|
|
17135
|
+
const logMax = max_log_mel ?? max(mel_spec_data)[0];
|
|
17136
|
+
const threshold = logMax - 8;
|
|
17137
|
+
for (let i = 0; i < o; ++i) {
|
|
17138
|
+
mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4) / 4;
|
|
17139
|
+
}
|
|
17140
|
+
break;
|
|
17141
|
+
}
|
|
16759
17142
|
case "dB":
|
|
16760
17143
|
if (power === 1) {
|
|
16761
17144
|
amplitude_to_db(mel_spec_data, reference, min_value, db_range);
|
|
@@ -16766,7 +17149,9 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
16766
17149
|
}
|
|
16767
17150
|
break;
|
|
16768
17151
|
default:
|
|
16769
|
-
throw new Error(
|
|
17152
|
+
throw new Error(
|
|
17153
|
+
`log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`
|
|
17154
|
+
);
|
|
16770
17155
|
}
|
|
16771
17156
|
}
|
|
16772
17157
|
return mel_spec;
|
|
@@ -17271,6 +17656,56 @@ var Gemma3nAudioFeatureExtractor = class extends FeatureExtractor {
|
|
|
17271
17656
|
}
|
|
17272
17657
|
};
|
|
17273
17658
|
|
|
17659
|
+
// src/models/granite_speech/feature_extraction_granite_speech.js
|
|
17660
|
+
var GraniteSpeechFeatureExtractor = class extends FeatureExtractor {
  /**
   * Precomputes the mel filter bank and the analysis window from
   * `config.melspec_kwargs` (`n_fft`, `win_length`, `n_mels`, `sample_rate`).
   * @param {Object} config Feature extractor configuration.
   */
  constructor(config) {
    super(config);
    const { n_fft, win_length, n_mels, sample_rate } = config.melspec_kwargs;
    const numFrequencyBins = Math.floor(1 + n_fft / 2);
    const maxFrequency = sample_rate / 2;
    this.mel_filters = mel_filter_bank(
      numFrequencyBins,
      n_mels,
      0, // min_frequency
      maxFrequency,
      sample_rate,
      null, // norm: none
      "htk" // mel_scale
    );
    // The Hann window has `win_length` samples; place it inside a zero-filled
    // buffer of `n_fft` samples, offset by half the size difference.
    const hann = window_function(win_length, "hann");
    const leftPad = Math.floor((n_fft - win_length) / 2);
    const paddedWindow = new Float64Array(n_fft);
    paddedWindow.set(hann, leftPad);
    this.window = paddedWindow;
  }
  /**
   * Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @returns {Promise<{input_features: Tensor}>}
   */
  async _call(audio) {
    validate_audio_inputs(audio, "GraniteSpeechFeatureExtractor");
    const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;
    const frameCount = 1 + Math.floor((audio.length - 1) / hop_length);
    // Keep an even number of frames so that pairs of frames can be merged below.
    const evenFrameCount = frameCount - (frameCount % 2);
    const spectrogramOptions = {
      power: 2,
      mel_filters: this.mel_filters,
      log_mel: "log10_max_norm",
      transpose: true, // [time, n_mels]
      max_num_frames: evenFrameCount,
      do_pad: false
    };
    const mel = await spectrogram(audio, this.window, n_fft, hop_length, spectrogramOptions);
    // Merge every two consecutive frames into one row, then add a leading dimension.
    const input_features = mel.view(-1, 2 * n_mels).unsqueeze_(0);
    return { input_features };
  }
};
|
|
17708
|
+
|
|
17274
17709
|
// src/models/moonshine/feature_extraction_moonshine.js
|
|
17275
17710
|
var MoonshineFeatureExtractor = class extends FeatureExtractor {
|
|
17276
17711
|
/**
|
|
@@ -17751,6 +18186,71 @@ var WeSpeakerFeatureExtractor = class extends FeatureExtractor {
|
|
|
17751
18186
|
}
|
|
17752
18187
|
};
|
|
17753
18188
|
|
|
18189
|
+
// src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js
|
|
18190
|
+
var VoxtralRealtimeFeatureExtractor = class extends FeatureExtractor {
  /**
   * Ensures `config.mel_filters` exists (building it when the config does not
   * supply one) and creates the Hann analysis window of length `n_fft`.
   * @param {Object} config Feature extractor configuration.
   */
  constructor(config) {
    super(config);
    const cfg = this.config;
    if (cfg.mel_filters === null || cfg.mel_filters === undefined) {
      const num_frequency_bins = Math.floor(1 + cfg.n_fft / 2);
      cfg.mel_filters = mel_filter_bank(
        num_frequency_bins,
        cfg.feature_size, // num_mel_filters
        0, // min_frequency
        8e3, // max_frequency
        cfg.sampling_rate,
        "slaney", // norm
        "slaney" // mel_scale
      );
    }
    this.window = window_function(cfg.n_fft, "hann");
  }
  /**
   * Computes the log-Mel spectrogram of the provided audio waveform.
   * @param {Float32Array|Float64Array} waveform The audio waveform to process.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform for STFT.
   * @returns {Promise<import('../../utils/tensor.js').Tensor>} The log-Mel spectrogram tensor of shape [num_mel_bins, num_frames].
   */
  async _extract_fbank_features(waveform, { center = true } = {}) {
    const { n_fft, hop_length, mel_filters, global_log_mel_max } = this.config;
    // Without centering, a full frame of `n_fft` samples must fit before the first hop.
    const usable_samples = center ? waveform.length : waveform.length - n_fft;
    const max_num_frames = Math.floor(usable_samples / hop_length);
    const options = {
      power: 2,
      mel_filters,
      log_mel: "log10_max_norm",
      max_log_mel: global_log_mel_max,
      center,
      max_num_frames,
      do_pad: false
    };
    const features = await spectrogram(waveform, this.window, n_fft, hop_length, options);
    return features;
  }
  /**
   * Extract mel spectrogram features from audio.
   * @param {Float32Array|Float64Array} audio The audio data.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform.
   * @returns {Promise<{ input_features: import('../../utils/tensor.js').Tensor }>}
   */
  async _call(audio, { center = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeFeatureExtractor");
    const log_mel = await this._extract_fbank_features(audio, { center });
    // Add a leading batch dimension in place.
    const input_features = log_mel.unsqueeze_(0);
    return { input_features };
  }
};
|
|
18253
|
+
|
|
17754
18254
|
// src/models/whisper/feature_extraction_whisper.js
|
|
17755
18255
|
var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
17756
18256
|
constructor(config) {
|
|
@@ -17779,7 +18279,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
17779
18279
|
* @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
|
|
17780
18280
|
*/
|
|
17781
18281
|
async _extract_fbank_features(waveform) {
|
|
17782
|
-
|
|
18282
|
+
return await spectrogram(
|
|
17783
18283
|
waveform,
|
|
17784
18284
|
this.window,
|
|
17785
18285
|
// window
|
|
@@ -17790,7 +18290,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
17790
18290
|
{
|
|
17791
18291
|
power: 2,
|
|
17792
18292
|
mel_filters: this.config.mel_filters,
|
|
17793
|
-
log_mel: "
|
|
18293
|
+
log_mel: "log10_max_norm",
|
|
17794
18294
|
// Custom
|
|
17795
18295
|
max_num_frames: Math.min(
|
|
17796
18296
|
Math.floor(waveform.length / this.config.hop_length),
|
|
@@ -17799,15 +18299,6 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
17799
18299
|
)
|
|
17800
18300
|
}
|
|
17801
18301
|
);
|
|
17802
|
-
const data = features.data;
|
|
17803
|
-
const maxValue = max(
|
|
17804
|
-
/** @type {Float32Array} */
|
|
17805
|
-
data
|
|
17806
|
-
)[0];
|
|
17807
|
-
for (let i = 0; i < data.length; ++i) {
|
|
17808
|
-
data[i] = (Math.max(data[i], maxValue - 8) + 4) / 4;
|
|
17809
|
-
}
|
|
17810
|
-
return features;
|
|
17811
18302
|
}
|
|
17812
18303
|
/**
|
|
17813
18304
|
* Asynchronously extracts features from a given audio using the provided configuration.
|
|
@@ -18686,6 +19177,30 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
18686
19177
|
}
|
|
18687
19178
|
return [segmentation, segments];
|
|
18688
19179
|
}
|
|
19180
|
+
/**
 * Computes a target size whose sides are multiples of `factor` and whose
 * (temporally weighted) pixel count lies within `[min_pixels, max_pixels]`,
 * while staying close to the input aspect ratio.
 *
 * Inputs smaller than `factor` on either side are first scaled up so that
 * both sides reach at least `factor`.
 *
 * @param {number} height Input height in pixels (must be > 0).
 * @param {number} width Input width in pixels (must be > 0).
 * @param {number} [factor=28] Both output sides are multiples of this value.
 * @param {number} [min_pixels=56*56] Lower bound on `temporal_factor * height * width`.
 * @param {number} [max_pixels=14*14*4*1280] Upper bound on `temporal_factor * height * width`.
 * @param {number} [temporal_factor=1] Multiplier applied to the pixel count before comparing against the bounds.
 * @returns {[number, number]} The resized dimensions as `[width, height]`.
 * @throws {Error} If `height` or `width` is not a positive number, or if the aspect ratio exceeds 200.
 */
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280, temporal_factor = 1) {
  // Zero, negative, NaN or missing dimensions would otherwise propagate
  // silently into NaN/garbage output sizes.
  if (!(height > 0 && width > 0)) {
    throw new Error(`height:${height} and width:${width} must both be positive numbers`);
  }
  let h = height;
  let w = width;
  if (h < factor || w < factor) {
    // Scale up so that the smaller side reaches `factor`.
    const scale = Math.max(factor / h, factor / w);
    h = Math.round(h * scale);
    w = Math.round(w * scale);
  }
  const aspect_ratio = Math.max(h, w) / Math.min(h, w);
  if (aspect_ratio > 200) {
    throw new Error(
      `absolute aspect ratio must be smaller than 200, got ${aspect_ratio}`
    );
  }
  let h_bar = Math.round(h / factor) * factor;
  let w_bar = Math.round(w / factor) * factor;
  const pixels = temporal_factor * h_bar * w_bar;
  if (pixels > max_pixels) {
    // Shrink both sides by the same ratio, rounding down, but never below `factor`.
    const beta = Math.sqrt(temporal_factor * h * w / max_pixels);
    h_bar = Math.max(factor, Math.floor(h / beta / factor) * factor);
    w_bar = Math.max(factor, Math.floor(w / beta / factor) * factor);
  } else if (pixels < min_pixels) {
    // Grow both sides by the same ratio, rounding up.
    const beta = Math.sqrt(min_pixels / (temporal_factor * h * w));
    h_bar = Math.ceil(h * beta / factor) * factor;
    w_bar = Math.ceil(w * beta / factor) * factor;
  }
  return [w_bar, h_bar];
}
|
|
18689
19204
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
18690
19205
|
if (label_ids_to_fuse === null) {
|
|
18691
19206
|
logger.warn("`label_ids_to_fuse` unset. No instance will be fused.");
|
|
@@ -18763,7 +19278,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18763
19278
|
this.do_pad = config.do_pad;
|
|
18764
19279
|
this.min_pixels = config.min_pixels;
|
|
18765
19280
|
this.max_pixels = config.max_pixels;
|
|
18766
|
-
if (this.do_pad && !this.pad_size && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
19281
|
+
if (this.do_pad && !this.pad_size && !this.size_divisibility && this.size && this.size.width !== void 0 && this.size.height !== void 0) {
|
|
18767
19282
|
this.pad_size = this.size;
|
|
18768
19283
|
}
|
|
18769
19284
|
this.do_flip_channel_order = config.do_flip_channel_order ?? false;
|
|
@@ -18974,7 +19489,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18974
19489
|
});
|
|
18975
19490
|
}
|
|
18976
19491
|
/**
|
|
18977
|
-
* @typedef {
|
|
19492
|
+
* @typedef {Object} PreprocessedImage
|
|
18978
19493
|
* @property {HeightWidth} original_size The original size of the image.
|
|
18979
19494
|
* @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
|
|
18980
19495
|
* @property {Tensor} pixel_values The pixel values of the preprocessed image.
|
|
@@ -19051,10 +19566,8 @@ var ImageProcessor = class extends Callable2 {
|
|
|
19051
19566
|
const padded = this.pad_image(pixelData, [image.height, image.width, image.channels], this.pad_size);
|
|
19052
19567
|
[pixelData, imgDims] = padded;
|
|
19053
19568
|
} else if (this.size_divisibility) {
|
|
19054
|
-
const
|
|
19055
|
-
|
|
19056
|
-
this.size_divisibility
|
|
19057
|
-
);
|
|
19569
|
+
const paddedWidth = Math.ceil(imgDims[1] / this.size_divisibility) * this.size_divisibility;
|
|
19570
|
+
const paddedHeight = Math.ceil(imgDims[0] / this.size_divisibility) * this.size_divisibility;
|
|
19058
19571
|
[pixelData, imgDims] = this.pad_image(pixelData, imgDims, { width: paddedWidth, height: paddedHeight });
|
|
19059
19572
|
}
|
|
19060
19573
|
}
|
|
@@ -19131,6 +19644,7 @@ var image_processors_exports = {};
|
|
|
19131
19644
|
__export(image_processors_exports, {
|
|
19132
19645
|
BeitFeatureExtractor: () => BeitFeatureExtractor,
|
|
19133
19646
|
BitImageProcessor: () => BitImageProcessor,
|
|
19647
|
+
CHMv2ImageProcessor: () => CHMv2ImageProcessor,
|
|
19134
19648
|
CLIPFeatureExtractor: () => CLIPFeatureExtractor,
|
|
19135
19649
|
CLIPImageProcessor: () => CLIPImageProcessor,
|
|
19136
19650
|
ChineseCLIPFeatureExtractor: () => ChineseCLIPFeatureExtractor,
|
|
@@ -19147,11 +19661,13 @@ __export(image_processors_exports, {
|
|
|
19147
19661
|
DonutImageProcessor: () => DonutImageProcessor,
|
|
19148
19662
|
EfficientNetImageProcessor: () => EfficientNetImageProcessor,
|
|
19149
19663
|
GLPNFeatureExtractor: () => GLPNFeatureExtractor,
|
|
19664
|
+
Glm46VImageProcessor: () => Glm46VImageProcessor,
|
|
19150
19665
|
GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
|
|
19151
19666
|
Idefics3ImageProcessor: () => Idefics3ImageProcessor,
|
|
19152
19667
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
19153
19668
|
ImageProcessor: () => ImageProcessor,
|
|
19154
19669
|
JinaCLIPImageProcessor: () => JinaCLIPImageProcessor,
|
|
19670
|
+
Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
|
|
19155
19671
|
LlavaOnevisionImageProcessor: () => LlavaOnevisionImageProcessor,
|
|
19156
19672
|
Mask2FormerImageProcessor: () => Mask2FormerImageProcessor,
|
|
19157
19673
|
MaskFormerFeatureExtractor: () => MaskFormerFeatureExtractor,
|
|
@@ -19206,6 +19722,10 @@ var BitImageProcessor = class extends ImageProcessor {
|
|
|
19206
19722
|
var ChineseCLIPFeatureExtractor = class extends ImageProcessor {
|
|
19207
19723
|
};
|
|
19208
19724
|
|
|
19725
|
+
// src/models/chmv2/image_processing_chmv2.js
|
|
19726
|
+
/** CHMv2 image processor: adds no members of its own; all behaviour is inherited from ImageProcessor. */
var CHMv2ImageProcessor = class extends ImageProcessor {
|
|
19727
|
+
};
|
|
19728
|
+
|
|
19209
19729
|
// src/models/clip/image_processing_clip.js
|
|
19210
19730
|
var CLIPImageProcessor = class extends ImageProcessor {
|
|
19211
19731
|
};
|
|
@@ -19325,6 +19845,65 @@ var EfficientNetImageProcessor = class extends ImageProcessor {
|
|
|
19325
19845
|
}
|
|
19326
19846
|
};
|
|
19327
19847
|
|
|
19848
|
+
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
19849
|
+
var Qwen2VLImageProcessor = class extends ImageProcessor {
  /**
   * @param {Object} config Image processor configuration. Pixel bounds are read from
   * `min_pixels` / `max_pixels`, falling back to `size.shortest_edge` / `size.longest_edge`.
   */
  constructor(config) {
    super(config);
    const { size, patch_size, merge_size } = config;
    this.min_pixels = config.min_pixels ?? size?.shortest_edge;
    this.max_pixels = config.max_pixels ?? size?.longest_edge;
    this.patch_size = patch_size;
    this.merge_size = merge_size;
  }
  /** @type {ImageProcessor['get_resize_output_image_size']} */
  get_resize_output_image_size(image, size) {
    // Output sides must be multiples of one merged patch group.
    const { patch_size, merge_size, min_pixels, max_pixels } = this;
    return smart_resize(image.height, image.width, patch_size * merge_size, min_pixels, max_pixels);
  }
  /**
   * Runs the base preprocessing, then rearranges the pixel tensor into a flat
   * list of patches plus the `[grid_t, grid_h, grid_w]` grid description.
   * @param {any} images Image(s) forwarded to the base `_call`.
   * @param {...any} args Extra arguments forwarded to the base `_call`.
   * @returns {Promise<{pixel_values: Tensor, image_grid_thw: Tensor, original_sizes: any, reshaped_input_sizes: any}>}
   */
  async _call(images, ...args) {
    const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
    const { temporal_patch_size, merge_size, patch_size } = this.config;
    // A lone frame is repeated `temporal_patch_size` times along the first axis.
    const frames = pixel_values.dims[0] === 1
      ? cat(Array.from({ length: temporal_patch_size }, () => pixel_values), 0)
      : pixel_values;
    const [num_frames, channel, frame_height, frame_width] = frames.dims;
    const grid_t = num_frames / temporal_patch_size;
    const grid_h = Math.floor(frame_height / patch_size);
    const grid_w = Math.floor(frame_width / patch_size);
    const merged_rows = Math.floor(grid_h / merge_size);
    const merged_cols = Math.floor(grid_w / merge_size);
    const blocks = frames.view(
      grid_t,
      temporal_patch_size,
      channel,
      merged_rows,
      merge_size,
      patch_size,
      merged_cols,
      merge_size,
      patch_size
    );
    const patch_count = grid_t * grid_h * grid_w;
    const patch_length = channel * temporal_patch_size * patch_size * patch_size;
    const flatten_patches = blocks.permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(patch_count, patch_length);
    const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
    return {
      pixel_values: flatten_patches,
      image_grid_thw,
      original_sizes,
      reshaped_input_sizes
    };
  }
};
|
|
19896
|
+
|
|
19897
|
+
// src/models/glm46v/image_processing_glm46v.js
|
|
19898
|
+
var Glm46VImageProcessor = class extends Qwen2VLImageProcessor {
  /**
   * Same as the parent, except that the pixel budget is weighted by the
   * temporal patch size (2 when the config does not specify one).
   * @type {Qwen2VLImageProcessor['get_resize_output_image_size']}
   */
  get_resize_output_image_size(image, size) {
    const spatial_factor = this.patch_size * this.merge_size;
    const temporal_patch_size = this.config.temporal_patch_size ?? 2;
    const { height, width } = image;
    return smart_resize(height, width, spatial_factor, this.min_pixels, this.max_pixels, temporal_patch_size);
  }
};
|
|
19906
|
+
|
|
19328
19907
|
// src/models/glpn/image_processing_glpn.js
|
|
19329
19908
|
var GLPNFeatureExtractor = class extends ImageProcessor {
|
|
19330
19909
|
};
|
|
@@ -19555,6 +20134,237 @@ var JinaCLIPImageProcessor = class extends ImageProcessor {
|
|
|
19555
20134
|
}
|
|
19556
20135
|
};
|
|
19557
20136
|
|
|
20137
|
+
// src/models/lfm2_vl/image_processing_lfm2_vl.js
|
|
20138
|
+
/**
 * Rounds `number` to the nearest multiple of `factor`.
 * @param {number} number Value to round.
 * @param {number} factor Step the result must be a multiple of.
 * @returns {number} The nearest multiple of `factor` (per `Math.round`).
 */
function round_by_factor(number, factor) {
  const multiples = Math.round(number / factor);
  return multiples * factor;
}
|
|
20141
|
+
/**
 * Picks, from `target_ratios`, the `[cols, rows]` grid whose `cols / rows`
 * is closest to `aspect_ratio`.
 *
 * On an exact tie with the current best, the later candidate wins only if the
 * image area exceeds half the area that candidate's grid of
 * `image_size`-sided tiles would cover.
 *
 * @param {number} aspect_ratio Width divided by height of the image.
 * @param {number[][]} target_ratios Candidate `[cols, rows]` pairs, in priority order.
 * @param {number} width Image width.
 * @param {number} height Image height.
 * @param {number} image_size Side length of one tile.
 * @returns {number[]} The chosen `[cols, rows]` pair (`[1, 1]` if there are no candidates).
 */
function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
  const image_area = width * height;
  let closest = [1, 1];
  let smallest_gap = Infinity;
  for (const candidate of target_ratios) {
    const gap = Math.abs(aspect_ratio - candidate[0] / candidate[1]);
    if (gap < smallest_gap) {
      smallest_gap = gap;
      closest = candidate;
      continue;
    }
    if (gap !== smallest_gap) {
      continue;
    }
    const half_grid_area = 0.5 * image_size * image_size * candidate[0] * candidate[1];
    if (image_area > half_grid_area) {
      closest = candidate;
    }
  }
  return closest;
}
|
|
20156
|
+
/**
 * Enumerates every distinct `[cols, rows]` grid whose tile count
 * (`cols * rows`) lies in `[min_tiles, max_tiles]`, ordered by tile count.
 * Grids with the same tile count keep the order in which they were first found.
 *
 * @param {number} min_tiles Minimum number of tiles in a grid.
 * @param {number} max_tiles Maximum number of tiles in a grid.
 * @returns {number[][]} The `[cols, rows]` pairs sorted by `cols * rows`.
 */
function get_target_ratios(min_tiles, max_tiles) {
  const visited = /* @__PURE__ */ new Set();
  const grids = [];
  for (let limit = min_tiles; limit <= max_tiles; ++limit) {
    for (let cols = 1; cols <= limit; ++cols) {
      for (let rows = 1; rows <= limit; ++rows) {
        const tile_count = cols * rows;
        if (tile_count < min_tiles || tile_count > max_tiles) {
          continue;
        }
        // Pack the pair into a single integer so it can be used as a Set key.
        const packed = cols << 16 | rows;
        if (visited.has(packed)) {
          continue;
        }
        visited.add(packed);
        grids.push([cols, rows]);
      }
    }
  }
  const by_tile_count = (left, right) => left[0] * left[1] - right[0] * right[1];
  return grids.sort(by_tile_count);
}
|
|
20175
|
+
/**
 * Cuts a `[batch, channels, height, width]` tensor into non-overlapping
 * `patch_size` x `patch_size` patches, visited row by row.
 * Within each patch the values are written pixel by pixel (rows first), with
 * the channels of one pixel stored next to each other.
 *
 * @param {Tensor} images Input tensor with dims `[B, C, H, W]`; its data is read as a Float32Array.
 * @param {number} patch_size Side length of a patch in pixels.
 * @returns {Tensor} A float32 tensor of dims `[B, num_patches, patch_size * patch_size * C]`.
 */
function convert_image_to_patches(images, patch_size) {
  const [batch, channels, height, width] = images.dims;
  const patch_rows = Math.floor(height / patch_size);
  const patch_cols = Math.floor(width / patch_size);
  const patch_dim = patch_size * patch_size * channels;
  const source = (
    /** @type {Float32Array} */
    images.data
  );
  const output = new Float32Array(batch * patch_rows * patch_cols * patch_dim);
  // Number of values in one channel plane of one image.
  const plane = height * width;
  for (let b = 0; b < batch; ++b) {
    const image_start = b * channels * plane;
    const output_start = b * patch_rows * patch_cols * patch_dim;
    for (let row = 0; row < patch_rows; ++row) {
      const top = row * patch_size;
      for (let col = 0; col < patch_cols; ++col) {
        const left = col * patch_size;
        let cursor = output_start + (row * patch_cols + col) * patch_dim;
        for (let dy = 0; dy < patch_size; ++dy) {
          const line_start = (top + dy) * width + left;
          for (let dx = 0; dx < patch_size; ++dx) {
            const pixel = line_start + dx;
            for (let ch = 0; ch < channels; ++ch) {
              output[cursor++] = source[image_start + ch * plane + pixel];
            }
          }
        }
      }
    }
  }
  return new Tensor2("float32", output, [batch, patch_rows * patch_cols, patch_dim]);
}
|
|
20205
|
+
/**
 * Zero-pads a patch tensor along its patch axis up to `target_length` and
 * builds the matching mask (1 for real patches, 0 for padding).
 *
 * The tensor is returned unchanged when it already holds at least
 * `target_length` patches. A padded result always has dims `[1, target_length, dim]`.
 *
 * @param {Tensor} patches Tensor with dims `[_, num_patches, dim]`; its data is read as a Float32Array.
 * @param {number} target_length Number of patches after padding.
 * @returns {{ padded: Tensor, mask: Tensor }} The (possibly padded) patches and an int64 mask of dims `[target_length]`.
 */
function pad_along_first_dim(patches, target_length) {
  const num_patches = patches.dims[1];
  const patch_dim = patches.dims[2];
  const mask_data = new BigInt64Array(target_length);
  mask_data.fill(1n, 0, num_patches);
  const needs_padding = num_patches < target_length;
  let padded = patches;
  if (needs_padding) {
    // The typed array starts zeroed, so only the real patches need copying.
    const buffer = new Float32Array(target_length * patch_dim);
    buffer.set(
      /** @type {Float32Array} */
      patches.data
    );
    padded = new Tensor2("float32", buffer, [1, target_length, patch_dim]);
  }
  const mask = new Tensor2("int64", mask_data, [target_length]);
  return { padded, mask };
}
|
|
20220
|
+
var Lfm2VlImageProcessor = class extends ImageProcessor {
  /**
   * Reads the tiling and token-budget settings from `config`, applying a
   * default for each one that is absent, and derives `max_num_patches`.
   * @param {Object} config Image processor configuration.
   */
  constructor(config) {
    super(config);
    this.downsample_factor = config.downsample_factor ?? 2;
    this.do_image_splitting = config.do_image_splitting ?? true;
    this.min_tiles = config.min_tiles ?? 2;
    this.max_tiles = config.max_tiles ?? 10;
    this.use_thumbnail = config.use_thumbnail ?? true;
    this.min_image_tokens = config.min_image_tokens ?? 64;
    this.max_image_tokens = config.max_image_tokens ?? 256;
    this.encoder_patch_size = config.encoder_patch_size ?? config.patch_size ?? 16;
    this.tile_size = config.tile_size ?? 512;
    this.max_pixels_tolerance = config.max_pixels_tolerance ?? 2;
    this.return_row_col_info = config.return_row_col_info ?? false;
    // Every tile is padded to the larger of the thumbnail patch budget and the
    // patch count of one full tile (the latter only counts when splitting is on).
    const thumbnail_patch_budget = this.max_image_tokens * this.downsample_factor ** 2;
    let patches_per_tile = 0;
    if (this.do_image_splitting) {
      patches_per_tile = (this.tile_size / this.encoder_patch_size) ** 2;
    }
    this.max_num_patches = Math.max(thumbnail_patch_budget, patches_per_tile);
  }
  /**
   * Check if the image is too large to be processed as a single tile.
   * @param {number} height
   * @param {number} width
   * @returns {boolean}
   */
  _is_image_too_large(height, width) {
    const { encoder_patch_size, downsample_factor, max_image_tokens, max_pixels_tolerance } = this;
    const total_factor = encoder_patch_size * downsample_factor;
    const rounded_height = Math.max(encoder_patch_size, round_by_factor(height, total_factor));
    const rounded_width = Math.max(encoder_patch_size, round_by_factor(width, total_factor));
    const pixel_budget = max_image_tokens * total_factor ** 2 * max_pixels_tolerance;
    return rounded_height * rounded_width > pixel_budget;
  }
  /**
   * Get the grid layout for tiling a large image.
   * @param {number} height
   * @param {number} width
   * @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
   */
  _get_grid_layout(height, width) {
    const candidates = get_target_ratios(this.min_tiles, this.max_tiles);
    const aspect_ratio = width / height;
    const [grid_width, grid_height] = find_closest_aspect_ratio(aspect_ratio, candidates, width, height, this.tile_size);
    const target_width = this.tile_size * grid_width;
    const target_height = this.tile_size * grid_height;
    return { grid_width, grid_height, target_width, target_height };
  }
  /**
   * Preprocesses the images (without padding), resizes each one into either a
   * single image or a grid of tiles (optionally followed by a thumbnail), and
   * converts every tile into a padded patch sequence.
   * @param {RawImage|RawImage[]|RawImage[][]} images
   * @param {Object} [options]
   * @param {boolean|null} [options.return_row_col_info=null] Whether to also return `image_rows`, `image_cols`
   * and `image_sizes`; when null, the processor's `return_row_col_info` setting decides.
   */
  // @ts-expect-error
  async _call(images, { return_row_col_info = null } = {}) {
    // Normalise the input to a list of batches, each being a list of images.
    let batched_images = (
      /** @type {RawImage[][]} */
      images
    );
    if (!Array.isArray(images)) {
      batched_images = [[images]];
    } else if (!Array.isArray(images[0])) {
      batched_images = [
        /** @type {RawImage[]} */
        images
      ];
    }
    // Values that do not depend on the individual image.
    const total_factor = this.encoder_patch_size * this.downsample_factor;
    const f2 = total_factor ** 2;
    const do_splitting = this.do_image_splitting && (this.min_tiles !== 1 || this.max_tiles !== 1);
    const all_pixel_values = [];
    const all_pixel_masks = [];
    const all_spatial_shapes = [];
    const all_rows = [];
    const all_cols = [];
    const all_image_sizes = [];
    for (const image_batch of batched_images) {
      const preprocessed = await Promise.all(
        image_batch.map((image) => this.preprocess(image, { do_pad: false }))
      );
      for (const { pixel_values } of preprocessed) {
        const [, height, width] = pixel_values.dims;
        const img = pixel_values.unsqueeze_(0);
        // Size used for the single-image path and for the thumbnail.
        const [new_width, new_height] = smart_resize(
          Math.max(total_factor, height),
          Math.max(total_factor, width),
          total_factor,
          this.min_image_tokens * f2,
          this.max_image_tokens * f2
        ).map((side) => Math.max(total_factor, side));
        const tiles = [];
        let num_rows = 1;
        let num_cols = 1;
        if (this._is_image_too_large(height, width) && do_splitting) {
          const layout = this._get_grid_layout(height, width);
          num_rows = layout.grid_height;
          num_cols = layout.grid_width;
          const resized = await interpolate_4d(img, {
            size: [layout.target_height, layout.target_width]
          });
          for (let r = 0; r < layout.grid_height; ++r) {
            for (let c = 0; c < layout.grid_width; ++c) {
              const top = r * this.tile_size;
              const left = c * this.tile_size;
              tiles.push(
                resized.slice(null, null, [top, top + this.tile_size], [left, left + this.tile_size])
              );
            }
          }
          if (this.use_thumbnail && layout.grid_width * layout.grid_height !== 1) {
            tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
          }
        } else {
          tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
        }
        for (const tile of tiles) {
          const [, , tile_height, tile_width] = tile.dims;
          const patches = convert_image_to_patches(tile, this.encoder_patch_size);
          const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
          all_pixel_values.push(padded);
          all_pixel_masks.push(mask);
          all_spatial_shapes.push([
            Math.floor(tile_height / this.encoder_patch_size),
            Math.floor(tile_width / this.encoder_patch_size)
          ]);
        }
        all_rows.push(num_rows);
        all_cols.push(num_cols);
        all_image_sizes.push([new_height, new_width]);
      }
    }
    const spatial_shape_data = BigInt64Array.from(all_spatial_shapes.flat(), BigInt);
    const result = {
      pixel_values: cat(all_pixel_values, 0),
      pixel_attention_mask: stack(all_pixel_masks, 0),
      spatial_shapes: new Tensor2("int64", spatial_shape_data, [all_spatial_shapes.length, 2])
    };
    if (return_row_col_info ?? this.return_row_col_info) {
      result.image_rows = all_rows;
      result.image_cols = all_cols;
      result.image_sizes = all_image_sizes;
    }
    return result;
  }
};
|
|
20367
|
+
|
|
19558
20368
|
// src/models/llava_onevision/image_processing_llava_onevision.js
|
|
19559
20369
|
var LlavaOnevisionImageProcessor = class extends ImageProcessor {
|
|
19560
20370
|
};
|
|
@@ -19777,76 +20587,6 @@ var PixtralImageProcessor = class extends ImageProcessor {
|
|
|
19777
20587
|
var PvtImageProcessor = class extends ImageProcessor {
|
|
19778
20588
|
};
|
|
19779
20589
|
|
|
19780
|
-
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
19781
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
19782
|
-
if (height < factor || width < factor) {
|
|
19783
|
-
throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
|
|
19784
|
-
} else if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
19785
|
-
throw new Error(
|
|
19786
|
-
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
19787
|
-
);
|
|
19788
|
-
}
|
|
19789
|
-
let h_bar = Math.round(height / factor) * factor;
|
|
19790
|
-
let w_bar = Math.round(width / factor) * factor;
|
|
19791
|
-
if (h_bar * w_bar > max_pixels) {
|
|
19792
|
-
const beta = Math.sqrt(height * width / max_pixels);
|
|
19793
|
-
h_bar = Math.floor(height / beta / factor) * factor;
|
|
19794
|
-
w_bar = Math.floor(width / beta / factor) * factor;
|
|
19795
|
-
} else if (h_bar * w_bar < min_pixels) {
|
|
19796
|
-
const beta = Math.sqrt(min_pixels / (height * width));
|
|
19797
|
-
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
19798
|
-
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
19799
|
-
}
|
|
19800
|
-
return [h_bar, w_bar];
|
|
19801
|
-
}
|
|
19802
|
-
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
19803
|
-
constructor(config) {
|
|
19804
|
-
super(config);
|
|
19805
|
-
this.min_pixels = config.min_pixels ?? config.size?.shortest_edge;
|
|
19806
|
-
this.max_pixels = config.max_pixels ?? config.size?.longest_edge;
|
|
19807
|
-
this.patch_size = config.patch_size;
|
|
19808
|
-
this.merge_size = config.merge_size;
|
|
19809
|
-
}
|
|
19810
|
-
/** @type {ImageProcessor['get_resize_output_image_size']} */
|
|
19811
|
-
get_resize_output_image_size(image, size) {
|
|
19812
|
-
const factor = this.patch_size * this.merge_size;
|
|
19813
|
-
return smart_resize(image.height, image.width, factor, this.min_pixels, this.max_pixels);
|
|
19814
|
-
}
|
|
19815
|
-
async _call(images, ...args) {
|
|
19816
|
-
const { pixel_values, original_sizes, reshaped_input_sizes } = await super._call(images, ...args);
|
|
19817
|
-
let patches = pixel_values;
|
|
19818
|
-
const { temporal_patch_size, merge_size, patch_size } = this.config;
|
|
19819
|
-
if (patches.dims[0] === 1) {
|
|
19820
|
-
patches = cat(
|
|
19821
|
-
Array.from({ length: temporal_patch_size }, () => patches),
|
|
19822
|
-
0
|
|
19823
|
-
);
|
|
19824
|
-
}
|
|
19825
|
-
const grid_t = patches.dims[0] / temporal_patch_size;
|
|
19826
|
-
const channel = patches.dims[1];
|
|
19827
|
-
const grid_h = Math.floor(patches.dims[2] / patch_size);
|
|
19828
|
-
const grid_w = Math.floor(patches.dims[3] / patch_size);
|
|
19829
|
-
const flatten_patches = patches.view(
|
|
19830
|
-
grid_t,
|
|
19831
|
-
temporal_patch_size,
|
|
19832
|
-
channel,
|
|
19833
|
-
Math.floor(grid_h / merge_size),
|
|
19834
|
-
merge_size,
|
|
19835
|
-
patch_size,
|
|
19836
|
-
Math.floor(grid_w / merge_size),
|
|
19837
|
-
merge_size,
|
|
19838
|
-
patch_size
|
|
19839
|
-
).permute(0, 3, 6, 4, 7, 2, 1, 5, 8).view(grid_t * grid_h * grid_w, channel * temporal_patch_size * patch_size * patch_size);
|
|
19840
|
-
const image_grid_thw = new Tensor2("int64", [grid_t, grid_h, grid_w], [1, 3]);
|
|
19841
|
-
return {
|
|
19842
|
-
pixel_values: flatten_patches,
|
|
19843
|
-
image_grid_thw,
|
|
19844
|
-
original_sizes,
|
|
19845
|
-
reshaped_input_sizes
|
|
19846
|
-
};
|
|
19847
|
-
}
|
|
19848
|
-
};
|
|
19849
|
-
|
|
19850
20590
|
// src/models/rt_detr/image_processing_rt_detr.js
|
|
19851
20591
|
var RTDetrImageProcessor = class extends ImageProcessor {
|
|
19852
20592
|
/** @type {typeof post_process_object_detection} */
|
|
@@ -20400,6 +21140,107 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
20400
21140
|
}
|
|
20401
21141
|
};
|
|
20402
21142
|
|
|
21143
|
+
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
21144
|
+
var Qwen2VLProcessor = class extends Processor {
  static image_processor_class = AutoImageProcessor;
  static tokenizer_class = AutoTokenizer;
  // Placeholder that marks where an image goes in the text; subclasses may override it.
  static image_token = "<|image_pad|>";
  /**
   * Processes the images (if any), expands every image token in the text to
   * the number of tokens its image occupies, and tokenizes the result.
   *
   * Each image token is replaced by `floor(t * h * w / merge_size ** 2)`
   * copies of itself, where `[t, h, w]` is the next row of `image_grid_thw`.
   *
   * @param {string|string[]} text
   * @param {RawImage|RawImage[]} images
   * @param {...any} args
   * @returns {Promise<any>} The tokenizer outputs merged with the image processor outputs.
   * @throws {Error} If the text contains more image tokens than there are rows in `image_grid_thw`.
   */
  async _call(text, images = null, ...args) {
    let texts = Array.isArray(text) ? text : [text];
    let image_inputs, image_grid_thw;
    if (images) {
      image_inputs = await this.image_processor(images);
      image_grid_thw = image_inputs.image_grid_thw;
    }
    if (image_grid_thw) {
      const merge_length = this.image_processor.config.merge_size ** 2;
      const image_token = (
        /** @type {typeof Qwen2VLProcessor} */
        this.constructor.image_token
      );
      const image_grid_thw_list = image_grid_thw.tolist();
      let index = 0;
      texts = texts.map((t) => {
        // Expand one image token at a time into a temporary placeholder, so the
        // loop condition only sees tokens that have not been expanded yet.
        while (t.includes(image_token)) {
          const grid = image_grid_thw_list[index++];
          if (grid === undefined) {
            // Previously surfaced as an opaque TypeError on `undefined.reduce`.
            throw new Error(
              `The input text contains more ${image_token} tokens than the ${image_grid_thw_list.length} image grid entries available.`
            );
          }
          const prod = Number(grid.reduce((a, b) => a * b, 1n));
          t = t.replace(image_token, "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
        }
        return t.replaceAll("<|placeholder|>", image_token);
      });
    }
    const text_inputs = this.tokenizer(texts);
    return {
      ...text_inputs,
      ...image_inputs
    };
  }
};
|
|
21187
|
+
|
|
21188
|
+
// src/models/glm46v/processing_glm46v.js
|
|
21189
|
+
/** GLM-4.6V processor: reuses Qwen2VLProcessor unchanged apart from the image placeholder token overridden below. */
var Glm46VProcessor = class extends Qwen2VLProcessor {
|
|
21190
|
+
static image_token = "<|image|>";
|
|
21191
|
+
};
|
|
21192
|
+
|
|
21193
|
+
// src/models/granite_speech/processing_granite_speech.js
|
|
21194
|
+
var GraniteSpeechProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = true;

  /**
   * Work out how many audio tokens a waveform of the given length occupies.
   *
   * The value is derived from the feature extractor config: sample count -> mel frames
   * (one per `hop_length` samples, plus one) -> halved -> grouped into windows of
   * `projector_window_size`, each contributing
   * `floor(projector_window_size / projector_downsample_rate)` tokens.
   *
   * @param {number} audioLength Raw audio sample count.
   * @returns {number} Number of projector output tokens.
   */
  _get_num_audio_features(audioLength) {
    const extractor_config = this.feature_extractor.config;
    const samples_per_frame = extractor_config.melspec_kwargs.hop_length;
    const window_size = extractor_config.projector_window_size;
    const tokens_per_window = Math.floor(window_size / extractor_config.projector_downsample_rate);

    const num_mel_frames = Math.floor(audioLength / samples_per_frame) + 1;
    const num_encoder_frames = Math.floor(num_mel_frames / 2);
    const num_windows = Math.ceil(num_encoder_frames / window_size);
    return num_windows * tokens_per_window;
  }

  /**
   * Tokenize a single prompt and, when audio is supplied, attach its features and mask.
   *
   * With audio, every occurrence of the audio token in `text` is replaced by as many
   * copies as `_get_num_audio_features` reports for the waveform.
   *
   * @param {string} text The text input to process. Arrays are rejected.
   * @param {Float32Array} audio The audio input to process.
   * @param {Record<string, any>} [kwargs] Extra tokenizer options; they override
   * the `add_special_tokens: false` default.
   * @returns {Promise<any>} Tokenizer outputs merged with `input_features` and
   * `input_features_mask` (the latter two only when audio was given).
   */
  async _call(text, audio = null, kwargs = {}) {
    if (Array.isArray(text)) {
      throw new Error("Batched inputs are not supported yet.");
    }

    const audio_inputs = {};
    if (audio) {
      const extracted = await this.feature_extractor(audio);
      audio_inputs.input_features = extracted.input_features;

      // The mask is all ones: one entry per audio token.
      const num_audio_tokens = this._get_num_audio_features(audio.length);
      audio_inputs.input_features_mask = new Tensor2(
        "bool",
        new Uint8Array(num_audio_tokens).fill(1),
        [1, num_audio_tokens]
      );

      const audio_token = this.config.audio_token ?? "<|audio|>";
      if (!text.includes(audio_token)) {
        throw new Error(`The input text does not contain the audio token ${audio_token}.`);
      }
      text = text.replaceAll(audio_token, audio_token.repeat(num_audio_tokens));
    }

    const tokenizer_options = { add_special_tokens: false, ...kwargs };
    return {
      ...this.tokenizer(text, tokenizer_options),
      ...audio_inputs
    };
  }
};
|
|
21243
|
+
|
|
20403
21244
|
// src/models/grounding_dino/processing_grounding_dino.js
|
|
20404
21245
|
function get_phrases_from_posmap(posmaps, input_ids) {
|
|
20405
21246
|
const left_idx = 0;
|
|
@@ -20676,6 +21517,66 @@ var JinaCLIPProcessor = class extends Processor {
|
|
|
20676
21517
|
}
|
|
20677
21518
|
};
|
|
20678
21519
|
|
|
21520
|
+
// src/models/lfm2_vl/processing_lfm2_vl.js
|
|
21521
|
+
var Lfm2VlProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static image_processor_class = AutoImageProcessor;

  /**
   * Run the image processor and, when text is given, expand each image token in the
   * prompt(s) into its full token sequence before tokenizing.
   *
   * The n-th image token across all prompts is matched with the n-th entry of the
   * row/column/size information returned by the image processor. An image split into
   * more than one tile gets a positional marker plus a fixed-size token run per tile,
   * optionally followed by a thumbnail run; otherwise it gets a single token run sized
   * from its height and width.
   *
   * @param {RawImage|RawImage[]} images
   * @param {string|string[]|null} [text]
   * @param {Record<string, any>} [kwargs] Forwarded to both the image processor and the tokenizer.
   * @returns {Promise<any>} Image processor outputs (minus the row/col/size bookkeeping)
   * merged with the tokenizer outputs when text was provided.
   */
  async _call(images, text = null, kwargs = {}) {
    const processed = await this.image_processor(images, {
      ...kwargs,
      return_row_col_info: true
    });
    const { image_rows, image_cols, image_sizes, ...image_inputs } = processed;

    if (!text) {
      return { ...image_inputs };
    }

    const image_token = this.config.image_token ?? "<image>";
    const {
      tile_size = 512,
      downsample_factor = 2,
      encoder_patch_size = 16,
      use_thumbnail = true
    } = (
      /** @type {Record<string, any>} */
      this.image_processor.config
    );

    // Tokens along one side of `pixels`: patchify, then downsample (rounding up).
    const tokens_along = (pixels) => Math.ceil(Math.floor(pixels / encoder_patch_size) / downsample_factor);
    const tokens_per_tile = tokens_along(tile_size) ** 2;

    const image_start = this.config.image_start_token ?? "<|image_start|>";
    const image_end = this.config.image_end_token ?? "<|image_end|>";
    const thumbnail_token = this.config.image_thumbnail ?? "<|img_thumbnail|>";

    // Running index into the per-image arrays; deliberately shared by every prompt.
    let next_image = 0;
    const expand_next_image = () => {
      const idx = next_image++;
      const [height, width] = image_sizes[idx];
      const num_rows = image_rows[idx];
      const num_cols = image_cols[idx];
      const whole_image_tokens = tokens_along(height) * tokens_along(width);

      const pieces = [image_start];
      if (num_rows > 1 || num_cols > 1) {
        const tile_run = image_token.repeat(tokens_per_tile);
        for (let row = 1; row <= num_rows; ++row) {
          for (let col = 1; col <= num_cols; ++col) {
            pieces.push(`<|img_row_${row}_col_${col}|>`, tile_run);
          }
        }
        if (use_thumbnail) {
          pieces.push(thumbnail_token, image_token.repeat(whole_image_tokens));
        }
      } else {
        pieces.push(image_token.repeat(whole_image_tokens));
      }
      pieces.push(image_end);
      return pieces.join("");
    };

    const prompts = Array.isArray(text) ? text : [text];
    const expanded_prompts = [];
    for (const prompt of prompts) {
      const [head, ...rest] = prompt.split(image_token);
      let rebuilt = `${head}`;
      for (const segment of rest) {
        rebuilt += expand_next_image() + segment;
      }
      expanded_prompts.push(rebuilt);
    }

    return {
      ...image_inputs,
      ...this.tokenizer(expanded_prompts, kwargs)
    };
  }
};
|
|
21579
|
+
|
|
20679
21580
|
// src/models/llava/processing_llava.js
|
|
20680
21581
|
var LlavaProcessor = class extends Processor {
|
|
20681
21582
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -21019,47 +21920,6 @@ var PyAnnoteProcessor = class extends Processor {
|
|
|
21019
21920
|
}
|
|
21020
21921
|
};
|
|
21021
21922
|
|
|
21022
|
-
// src/models/qwen2_vl/processing_qwen2_vl.js
|
|
21023
|
-
var Qwen2VLProcessor = class extends Processor {
  static image_processor_class = AutoImageProcessor;
  static tokenizer_class = AutoTokenizer;

  /**
   * Tokenize the prompt(s) after expanding each `<|image_pad|>` token into one token
   * per merged vision patch of the corresponding image.
   *
   * The n-th `<|image_pad|>` across all prompts is paired with the n-th entry of the
   * image processor's `image_grid_thw`, and is repeated
   * `floor(product(grid entry) / merge_size ** 2)` times.
   *
   * @param {string|string[]} text
   * @param {RawImage|RawImage[]} images
   * @param {...any} args Accepted for signature compatibility; not used.
   * @returns {Promise<any>} Tokenizer outputs merged with the image processor outputs.
   * @throws {Error} If the text contains more `<|image_pad|>` tokens than there are image grids.
   */
  async _call(text, images = null, ...args) {
    if (!Array.isArray(text)) {
      text = [text];
    }

    let image_inputs, image_grid_thw;
    if (images) {
      image_inputs = await this.image_processor(images);
      image_grid_thw = image_inputs.image_grid_thw;
    }

    if (image_grid_thw) {
      const merge_length = this.image_processor.config.merge_size ** 2;
      const image_grid_thw_list = image_grid_thw.tolist();

      // Shared across all prompts so grids are consumed in order of appearance.
      let index = 0;
      text = text.map((t) => {
        // Expand into a temporary placeholder first; expanding straight into
        // "<|image_pad|>" would keep `includes` true forever.
        while (t.includes("<|image_pad|>")) {
          const grid = image_grid_thw_list[index++];
          if (grid === undefined) {
            // Previously this surfaced as "Cannot read properties of undefined (reading 'reduce')".
            throw new Error(
              `The input text contains more "<|image_pad|>" tokens than the ${image_grid_thw_list.length} image(s) provided.`
            );
          }
          const prod = Number(grid.reduce((a, b) => a * b, 1n));
          t = t.replace("<|image_pad|>", "<|placeholder|>".repeat(Math.floor(prod / merge_length)));
        }
        return t.replaceAll("<|placeholder|>", "<|image_pad|>");
      });
    }

    const text_inputs = this.tokenizer(text);
    return {
      ...text_inputs,
      ...image_inputs
      // TODO: ...videos_inputs,
    };
  }
};
|
|
21062
|
-
|
|
21063
21923
|
// src/models/qwen2_5_vl/processing_qwen2_5_vl.js
|
|
21064
21924
|
/**
 * Processor subclass that adds nothing of its own: all behavior comes from `Qwen2VLProcessor`.
 */
var Qwen2_5_VLProcessor = class extends Qwen2VLProcessor {};
|
|
@@ -21208,6 +22068,94 @@ var VoxtralProcessor = class extends Processor {
|
|
|
21208
22068
|
}
|
|
21209
22069
|
};
|
|
21210
22070
|
|
|
22071
|
+
// src/models/voxtral_realtime/processing_voxtral_realtime.js
|
|
22072
|
+
// Tokens' worth of silence prepended to the first chunk in streaming mode.
var NUM_LEFT_PAD_TOKENS = 32;
// Delay tokens; counted in the first-chunk size, the initial `input_ids`, and the right padding.
var NUM_DELAY_TOKENS = 6;
// Mel frames that correspond to a single token.
var AUDIO_LENGTH_PER_TOK = 8;
// Extra tokens of right padding applied in non-streaming mode.
var OFFLINE_STREAMING_BUFFER_TOKENS = 10;
// Token id used to fill the initial `input_ids` in streaming mode.
var STREAMING_PAD_TOKEN_ID = 32;

var VoxtralRealtimeProcessor = class extends Processor {
  static tokenizer_class = AutoTokenizer;
  static feature_extractor_class = AutoFeatureExtractor;
  static uses_processor_config = false;

  /** Mel frame count of the first audio chunk: one token plus the delay tokens. */
  get num_mel_frames_first_audio_chunk() {
    const tokens_in_first_chunk = NUM_DELAY_TOKENS + 1;
    return tokens_in_first_chunk * AUDIO_LENGTH_PER_TOK;
  }

  /** Raw sample count of the first audio chunk, derived from `hop_length` and `n_fft`. */
  get num_samples_first_audio_chunk() {
    const extractor_config = this.feature_extractor.config;
    const hops = this.num_mel_frames_first_audio_chunk - 1;
    return hops * extractor_config.hop_length + Math.floor(extractor_config.n_fft / 2);
  }

  /** Raw sample count of every audio chunk after the first. */
  get num_samples_per_audio_chunk() {
    const extractor_config = this.feature_extractor.config;
    return AUDIO_LENGTH_PER_TOK * extractor_config.hop_length + extractor_config.n_fft;
  }

  /** How many tokens of right padding non-streaming mode appends. */
  get num_right_pad_tokens() {
    return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
  }

  /** Mel frames per text token. */
  get audio_length_per_tok() {
    return AUDIO_LENGTH_PER_TOK;
  }

  /** Raw audio samples per token (`AUDIO_LENGTH_PER_TOK` hops). */
  get raw_audio_length_per_tok() {
    return AUDIO_LENGTH_PER_TOK * this.feature_extractor.config.hop_length;
  }

  /**
   * Turn an audio waveform into model inputs for VoxtralRealtime.
   *
   * Three modes are supported:
   *  - Streaming, first chunk: the audio is left-padded with zeros, features are
   *    extracted with `center=true`, and `{ input_ids, ...features }` is returned.
   *    `input_ids` holds `1n` followed by `STREAMING_PAD_TOKEN_ID` repeated
   *    `NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS` times.
   *  - Streaming, later chunk: the chunk is passed through unchanged with
   *    `center=false` and only the extracted features are returned.
   *  - Non-streaming: the audio is right-padded with zeros, then processed with
   *    `center=true`; only the extracted features are returned.
   *
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @param {Object} [options]
   * @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
   * @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
   * @returns {Promise<Object>}
   * @throws {Error} If `is_streaming` is false while `is_first_audio_chunk` is false.
   */
  async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeProcessor");

    if (!is_streaming && !is_first_audio_chunk) {
      throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");
    }

    // Streaming continuation: no padding, no token ids.
    if (!is_first_audio_chunk) {
      return await this.feature_extractor(audio, { center: false });
    }

    // Offline: the audio sits at the start of the buffer, trailing zeros are the padding.
    if (!is_streaming) {
      const num_trailing_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
      const offline_audio = new Float32Array(audio.length + num_trailing_samples);
      offline_audio.set(audio);
      return await this.feature_extractor(offline_audio, { center: true });
    }

    // Streaming start: leading zeros are the padding, the audio follows them.
    const num_leading_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
    const streaming_audio = new Float32Array(num_leading_samples + audio.length);
    streaming_audio.set(audio, num_leading_samples);
    const features = await this.feature_extractor(streaming_audio, { center: true });

    const sequence_length = 1 + NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
    const token_ids = new BigInt64Array(sequence_length);
    token_ids.fill(BigInt(STREAMING_PAD_TOKEN_ID));
    token_ids[0] = 1n; // presumably the BOS token id — TODO confirm
    return {
      input_ids: new Tensor2("int64", token_ids, [1, sequence_length]),
      ...features
    };
  }
};
|
|
22158
|
+
|
|
21211
22159
|
// src/models/wav2vec2/processing_wav2vec2.js
|
|
21212
22160
|
var Wav2Vec2Processor = class extends Processor {
|
|
21213
22161
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -21307,11 +22255,16 @@ function getNormalizedConfig(config) {
|
|
|
21307
22255
|
case "florence2":
|
|
21308
22256
|
case "llava_onevision":
|
|
21309
22257
|
case "idefics3":
|
|
22258
|
+
case "granite_speech":
|
|
21310
22259
|
case "ultravox":
|
|
21311
22260
|
case "voxtral":
|
|
22261
|
+
case "voxtral_realtime":
|
|
21312
22262
|
case "smolvlm":
|
|
21313
22263
|
case "gemma3n":
|
|
22264
|
+
case "lfm2_vl":
|
|
21314
22265
|
case "chatterbox":
|
|
22266
|
+
case "lighton_ocr":
|
|
22267
|
+
case "glm_ocr":
|
|
21315
22268
|
case "mistral3":
|
|
21316
22269
|
case "qwen2_5_vl":
|
|
21317
22270
|
case "qwen3_vl":
|
|
@@ -21365,10 +22318,13 @@ function getNormalizedConfig(config) {
|
|
|
21365
22318
|
case "cohere":
|
|
21366
22319
|
case "cohere2":
|
|
21367
22320
|
case "mistral":
|
|
22321
|
+
case "voxtral_realtime_text":
|
|
22322
|
+
case "voxtral_realtime_encoder":
|
|
21368
22323
|
case "starcoder2":
|
|
21369
22324
|
case "qwen2":
|
|
21370
22325
|
case "qwen2_moe":
|
|
21371
22326
|
case "qwen2_vl":
|
|
22327
|
+
case "qwen2_vl_text":
|
|
21372
22328
|
case "qwen2_5_vl_text":
|
|
21373
22329
|
case "qwen3_moe":
|
|
21374
22330
|
case "qwen3_vl_text":
|
|
@@ -21384,6 +22340,8 @@ function getNormalizedConfig(config) {
|
|
|
21384
22340
|
mapping["dim_kv"] = "head_dim";
|
|
21385
22341
|
break;
|
|
21386
22342
|
case "qwen3":
|
|
22343
|
+
case "solar_open":
|
|
22344
|
+
case "glm_ocr_text":
|
|
21387
22345
|
case "gemma":
|
|
21388
22346
|
case "gemma2":
|
|
21389
22347
|
case "vaultgemma":
|
|
@@ -21394,6 +22352,7 @@ function getNormalizedConfig(config) {
|
|
|
21394
22352
|
case "ernie4_5":
|
|
21395
22353
|
case "hunyuan_v1_dense":
|
|
21396
22354
|
case "falcon_h1":
|
|
22355
|
+
case "nemotron_h":
|
|
21397
22356
|
case "ministral":
|
|
21398
22357
|
case "ministral3":
|
|
21399
22358
|
mapping["num_heads"] = "num_key_value_heads";
|
|
@@ -21428,6 +22387,9 @@ function getNormalizedConfig(config) {
|
|
|
21428
22387
|
mapping["num_attention_heads"] = "num_attention_heads";
|
|
21429
22388
|
break;
|
|
21430
22389
|
case "youtu":
|
|
22390
|
+
case "deepseek_v3":
|
|
22391
|
+
case "glm_moe_dsa":
|
|
22392
|
+
case "mistral4":
|
|
21431
22393
|
mapping["num_heads"] = "num_key_value_heads";
|
|
21432
22394
|
mapping["num_layers"] = "num_hidden_layers";
|
|
21433
22395
|
mapping["dim_kv"] = "qk_head_dim";
|
|
@@ -21513,6 +22475,10 @@ function getNormalizedConfig(config) {
|
|
|
21513
22475
|
return normalized_config;
|
|
21514
22476
|
}
|
|
21515
22477
|
function getCacheShapes(config, options) {
|
|
22478
|
+
if (!(config instanceof PretrainedConfig)) {
|
|
22479
|
+
config = new PretrainedConfig(config);
|
|
22480
|
+
}
|
|
22481
|
+
const batch_size = options?.batch_size ?? 1;
|
|
21516
22482
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
21517
22483
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
21518
22484
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -21522,7 +22488,6 @@ function getCacheShapes(config, options) {
|
|
|
21522
22488
|
config
|
|
21523
22489
|
);
|
|
21524
22490
|
const head_dim = hidden_size / num_attention_heads;
|
|
21525
|
-
const batch_size = options?.batch_size ?? 1;
|
|
21526
22491
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
21527
22492
|
if (layer_types[i] === "full_attention") {
|
|
21528
22493
|
for (const kv of ["key", "value"]) {
|
|
@@ -21535,31 +22500,26 @@ function getCacheShapes(config, options) {
|
|
|
21535
22500
|
}
|
|
21536
22501
|
}
|
|
21537
22502
|
return cache_values;
|
|
21538
|
-
} else if (["granitemoehybrid", "falcon_h1"].includes(config.model_type)) {
|
|
22503
|
+
} else if (["granitemoehybrid", "falcon_h1", "nemotron_h"].includes(config.model_type)) {
|
|
21539
22504
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
21540
22505
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
21541
|
-
const
|
|
21542
|
-
const {
|
|
21543
|
-
layer_types,
|
|
21544
|
-
num_hidden_layers,
|
|
21545
|
-
num_attention_heads,
|
|
21546
|
-
num_key_value_heads,
|
|
21547
|
-
hidden_size,
|
|
21548
|
-
mamba_d_conv,
|
|
21549
|
-
mamba_n_heads,
|
|
21550
|
-
mamba_d_head,
|
|
21551
|
-
mamba_d_state,
|
|
21552
|
-
mamba_n_groups,
|
|
21553
|
-
mamba_expand,
|
|
21554
|
-
mamba_d_ssm
|
|
21555
|
-
} = (
|
|
22506
|
+
const c = (
|
|
21556
22507
|
/** @type {any} */
|
|
21557
22508
|
config
|
|
21558
22509
|
);
|
|
21559
|
-
const
|
|
21560
|
-
const
|
|
21561
|
-
const
|
|
21562
|
-
|
|
22510
|
+
const layer_types = c.layer_types ?? c.layers_block_type;
|
|
22511
|
+
const num_layers = c.num_hidden_layers ?? layer_types?.length;
|
|
22512
|
+
const num_key_value_heads = c.num_key_value_heads;
|
|
22513
|
+
const head_dim = c.head_dim ?? c.hidden_size / c.num_attention_heads;
|
|
22514
|
+
const mamba_n_heads = c.mamba_n_heads ?? c.mamba_num_heads;
|
|
22515
|
+
const mamba_d_head = c.mamba_d_head ?? c.mamba_head_dim;
|
|
22516
|
+
const mamba_d_state = c.mamba_d_state ?? c.ssm_state_size;
|
|
22517
|
+
const mamba_n_groups = c.mamba_n_groups ?? c.n_groups;
|
|
22518
|
+
const mamba_d_conv = c.mamba_d_conv ?? c.conv_kernel;
|
|
22519
|
+
const mamba_d_ssm = c.mamba_d_ssm ?? (c.mamba_expand ? c.mamba_expand * c.hidden_size : mamba_n_heads * mamba_d_head);
|
|
22520
|
+
const conv_d_inner = mamba_d_ssm + 2 * mamba_n_groups * mamba_d_state;
|
|
22521
|
+
const cache_values = {};
|
|
22522
|
+
for (let i = 0; i < num_layers; ++i) {
|
|
21563
22523
|
if (!layer_types || layer_types[i] === "mamba") {
|
|
21564
22524
|
cache_values[`${conv_prefix}_conv.${i}`] = [batch_size, conv_d_inner, mamba_d_conv];
|
|
21565
22525
|
cache_values[`${conv_prefix}_ssm.${i}`] = [batch_size, mamba_n_heads, mamba_d_head, mamba_d_state];
|
|
@@ -21593,7 +22553,6 @@ function getCacheShapes(config, options) {
|
|
|
21593
22553
|
const key_dim = linear_key_head_dim * linear_num_key_heads;
|
|
21594
22554
|
const value_dim = linear_value_head_dim * linear_num_value_heads;
|
|
21595
22555
|
const final_head_dim = head_dim ?? hidden_size / num_attention_heads;
|
|
21596
|
-
const batch_size = options?.batch_size ?? 1;
|
|
21597
22556
|
for (let i = 0; i < layer_types.length; ++i) {
|
|
21598
22557
|
if (layer_types[i] === "full_attention") {
|
|
21599
22558
|
for (const kv of ["key", "value"]) {
|
|
@@ -21619,12 +22578,16 @@ function getCacheShapes(config, options) {
|
|
|
21619
22578
|
}
|
|
21620
22579
|
}
|
|
21621
22580
|
return cache_values;
|
|
21622
|
-
} else if (["qwen3_5", "qwen3_5_moe"].includes(config.model_type)) {
|
|
21623
|
-
|
|
21624
|
-
|
|
21625
|
-
|
|
21626
|
-
|
|
21627
|
-
|
|
22581
|
+
} else if (["lfm2_vl", "qwen3_5", "qwen3_5_moe", "voxtral_realtime"].includes(config.model_type)) {
|
|
22582
|
+
let subConfig;
|
|
22583
|
+
if (config.model_type === "voxtral_realtime" && options?.session_name === "audio_encoder") {
|
|
22584
|
+
subConfig = /** @type {any} */
|
|
22585
|
+
config.audio_config;
|
|
22586
|
+
} else {
|
|
22587
|
+
subConfig = /** @type {any} */
|
|
22588
|
+
config.text_config;
|
|
22589
|
+
}
|
|
22590
|
+
return getCacheShapes(subConfig, options);
|
|
21628
22591
|
}
|
|
21629
22592
|
return getKeyValueShapes(config, options);
|
|
21630
22593
|
}
|
|
@@ -21790,7 +22753,7 @@ async function getModelDataFiles(pretrained_model_name_or_path, fileName, suffix
|
|
|
21790
22753
|
}
|
|
21791
22754
|
|
|
21792
22755
|
// src/models/session.js
|
|
21793
|
-
async function getSession(pretrained_model_name_or_path, fileName, options,
|
|
22756
|
+
async function getSession(pretrained_model_name_or_path, fileName, options, cache_config = false, session_name = void 0) {
|
|
21794
22757
|
let custom_config = options.config?.["transformers.js_config"] ?? {};
|
|
21795
22758
|
const selectedDevice = (
|
|
21796
22759
|
/** @type {import("../utils/devices.js").DeviceType} */
|
|
@@ -21848,9 +22811,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
21848
22811
|
if (externalData.length > 0 && !apis.IS_NODE_ENV) {
|
|
21849
22812
|
session_options.externalData = externalData;
|
|
21850
22813
|
}
|
|
21851
|
-
if (
|
|
22814
|
+
if (cache_config && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
|
|
21852
22815
|
const shapes = getCacheShapes(options.config, {
|
|
21853
|
-
prefix: "present"
|
|
22816
|
+
prefix: "present",
|
|
22817
|
+
session_name
|
|
21854
22818
|
});
|
|
21855
22819
|
if (Object.keys(shapes).length > 0 && !isONNXProxy()) {
|
|
21856
22820
|
const preferredOutputLocation = {};
|
|
@@ -21868,15 +22832,17 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
21868
22832
|
};
|
|
21869
22833
|
return { buffer_or_path, session_options, session_config };
|
|
21870
22834
|
}
|
|
21871
|
-
async function constructSessions(pretrained_model_name_or_path, names, options,
|
|
22835
|
+
async function constructSessions(pretrained_model_name_or_path, names, options, cache_sessions = void 0) {
|
|
21872
22836
|
return Object.fromEntries(
|
|
21873
22837
|
await Promise.all(
|
|
21874
22838
|
Object.keys(names).map(async (name) => {
|
|
22839
|
+
const cache_config = cache_sessions?.[name] ?? false;
|
|
21875
22840
|
const { buffer_or_path, session_options, session_config } = await getSession(
|
|
21876
22841
|
pretrained_model_name_or_path,
|
|
21877
22842
|
names[name],
|
|
21878
22843
|
options,
|
|
21879
|
-
|
|
22844
|
+
cache_config,
|
|
22845
|
+
name
|
|
21880
22846
|
);
|
|
21881
22847
|
const session = await createInferenceSession(buffer_or_path, session_options, session_config);
|
|
21882
22848
|
return [name, session];
|
|
@@ -23176,19 +24142,71 @@ var BeamSearchSampler = class extends LogitsSampler {
|
|
|
23176
24142
|
}
|
|
23177
24143
|
};
|
|
23178
24144
|
|
|
24145
|
+
// src/cache_utils.js
|
|
24146
|
+
var _DynamicCache = class {
  /**
   * Build a cache whose own properties are the given tensors, keyed by name.
   *
   * @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
   * @throws {TypeError} If a key already exists on the instance (e.g. a method name),
   * or if a value is not a Tensor.
   */
  constructor(entries) {
    if (!entries) return;
    for (const name in entries) {
      // Refuse names that would shadow a method or an earlier entry.
      if (name in this) {
        throw new TypeError(`Key "${name}" conflicts with an existing property on DynamicCache`);
      }
      const tensor = entries[name];
      if (tensor instanceof Tensor2) {
        this[name] = tensor;
        continue;
      }
      throw new TypeError(`Expected a Tensor for key "${name}", got ${typeof tensor}`);
    }
  }

  /**
   * Read the past sequence length off the first entry whose name starts with
   * "past_key_values." (its second-to-last dimension).
   *
   * @returns {number} The past sequence length.
   * @throws {Error} If the cache holds no such entry.
   */
  get_seq_length() {
    const cache = (
      /** @type {any} */
      this
    );
    for (const name in cache) {
      if (!name.startsWith("past_key_values.")) continue;
      return cache[name].dims.at(-2);
    }
    throw new Error("Unable to determine sequence length from the cache.");
  }

  /**
   * Dispose every contained tensor whose `location` is "gpu-buffer"; other tensors
   * are left alone.
   *
   * @returns {Promise<void>} Resolves once all of those disposals have completed.
   */
  async dispose() {
    const tensors = (
      /** @type {Tensor[]} */
      Object.values(this)
    );
    const pending = tensors.flatMap((tensor) => tensor.location === "gpu-buffer" ? [tensor.dispose()] : []);
    await Promise.all(pending);
  }
};

// Same class re-exported under its public name; the JSDoc casts only adjust the declared type.
var DynamicCache = (
  /** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */
  /** @type {unknown} */
  _DynamicCache
);
|
|
24204
|
+
|
|
23179
24205
|
// src/models/modeling_utils.js
|
|
23180
24206
|
// Module-level slot for the task mappings; stays `null` until `registerTaskMappings` is called.
var MODEL_MAPPING_NAMES = null;

/**
 * Store the given mappings in `MODEL_MAPPING_NAMES`, replacing any previous value.
 * @param {any} mappings The mappings object to remember (stored by reference).
 */
function registerTaskMappings(mappings) {
  MODEL_MAPPING_NAMES = mappings;
}
|
|
23184
|
-
/**
 * Read the past sequence length from a key/value cache object.
 *
 * Uses the first entry whose name starts with "past_key_values."; when there is no
 * such entry, falls back to the first value in the object. In both cases the result
 * is the tensor's second-to-last dimension.
 *
 * @param {Record<string, any>} past_key_values Name→tensor mapping; each tensor exposes `dims`.
 * @returns {number} The second-to-last dimension of the selected tensor.
 */
function getPastLength(past_key_values) {
  let attention_entry_name;
  for (const name in past_key_values) {
    if (name.startsWith("past_key_values.")) {
      attention_entry_name = name;
      break;
    }
  }

  const tensor = attention_entry_name === undefined
    ? Object.values(past_key_values)[0]
    : past_key_values[attention_entry_name];
  return tensor.dims.at(-2);
}
|
|
23192
24210
|
function toI64Tensor(items) {
|
|
23193
24211
|
if (items instanceof Tensor2) {
|
|
23194
24212
|
return items;
|
|
@@ -23229,71 +24247,181 @@ var MODEL_TYPES = {
|
|
|
23229
24247
|
AutoEncoder: 12,
|
|
23230
24248
|
ImageAudioTextToText: 13,
|
|
23231
24249
|
Supertonic: 14,
|
|
23232
|
-
Chatterbox: 15
|
|
24250
|
+
Chatterbox: 15,
|
|
24251
|
+
MultimodalLanguageModelOnly: 16,
|
|
24252
|
+
VoxtralRealtime: 17
|
|
23233
24253
|
};
|
|
23234
24254
|
var MODEL_TYPE_CONFIG = {
|
|
23235
24255
|
[MODEL_TYPES.DecoderOnly]: {
|
|
23236
24256
|
can_generate: true,
|
|
23237
24257
|
forward: decoder_forward,
|
|
23238
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
24258
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
24259
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" }),
|
|
24260
|
+
cache_sessions: { model: true },
|
|
24261
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23239
24262
|
},
|
|
23240
24263
|
[MODEL_TYPES.DecoderOnlyWithoutHead]: {
|
|
23241
24264
|
can_generate: false,
|
|
23242
24265
|
forward: decoder_forward,
|
|
23243
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
24266
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
24267
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
23244
24268
|
},
|
|
23245
24269
|
[MODEL_TYPES.Seq2Seq]: {
|
|
23246
24270
|
can_generate: true,
|
|
23247
24271
|
forward: seq2seq_forward,
|
|
23248
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
24272
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
24273
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
24274
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24275
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23249
24276
|
},
|
|
23250
24277
|
[MODEL_TYPES.Vision2Seq]: {
|
|
23251
24278
|
can_generate: true,
|
|
23252
24279
|
forward: seq2seq_forward,
|
|
23253
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
24280
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
24281
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
24282
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24283
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23254
24284
|
},
|
|
23255
24285
|
[MODEL_TYPES.Musicgen]: {
|
|
23256
24286
|
can_generate: true,
|
|
23257
|
-
forward: seq2seq_forward
|
|
24287
|
+
forward: seq2seq_forward,
|
|
24288
|
+
sessions: () => ({
|
|
24289
|
+
model: "text_encoder",
|
|
24290
|
+
decoder_model_merged: "decoder_model_merged",
|
|
24291
|
+
encodec_decode: "encodec_decode"
|
|
24292
|
+
}),
|
|
24293
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24294
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23258
24295
|
},
|
|
23259
24296
|
[MODEL_TYPES.EncoderDecoder]: {
|
|
23260
24297
|
can_generate: false,
|
|
23261
|
-
forward: seq2seq_forward
|
|
24298
|
+
forward: seq2seq_forward,
|
|
24299
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
24300
|
+
cache_sessions: { decoder_model_merged: true }
|
|
24301
|
+
},
|
|
24302
|
+
[MODEL_TYPES.MaskGeneration]: {
|
|
24303
|
+
sessions: () => ({ model: "vision_encoder", prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder" })
|
|
23262
24304
|
},
|
|
23263
24305
|
[MODEL_TYPES.ImageTextToText]: {
|
|
23264
24306
|
can_generate: true,
|
|
23265
24307
|
forward: image_text_to_text_forward,
|
|
23266
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24308
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24309
|
+
sessions: (config) => {
|
|
24310
|
+
const s = {
|
|
24311
|
+
embed_tokens: "embed_tokens",
|
|
24312
|
+
vision_encoder: "vision_encoder",
|
|
24313
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24314
|
+
};
|
|
24315
|
+
if (config.is_encoder_decoder) s["model"] = "encoder_model";
|
|
24316
|
+
return s;
|
|
24317
|
+
},
|
|
24318
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24319
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23267
24320
|
},
|
|
23268
24321
|
[MODEL_TYPES.AudioTextToText]: {
|
|
23269
24322
|
can_generate: true,
|
|
23270
24323
|
forward: audio_text_to_text_forward,
|
|
23271
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24324
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24325
|
+
sessions: () => ({
|
|
24326
|
+
embed_tokens: "embed_tokens",
|
|
24327
|
+
audio_encoder: "audio_encoder",
|
|
24328
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24329
|
+
}),
|
|
24330
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24331
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23272
24332
|
},
|
|
23273
|
-
[MODEL_TYPES.
|
|
24333
|
+
[MODEL_TYPES.ImageAudioTextToText]: {
|
|
23274
24334
|
can_generate: true,
|
|
23275
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24335
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24336
|
+
sessions: () => ({
|
|
24337
|
+
embed_tokens: "embed_tokens",
|
|
24338
|
+
audio_encoder: "audio_encoder",
|
|
24339
|
+
vision_encoder: "vision_encoder",
|
|
24340
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24341
|
+
}),
|
|
24342
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23276
24343
|
},
|
|
23277
|
-
[MODEL_TYPES.
|
|
24344
|
+
[MODEL_TYPES.Phi3V]: {
|
|
23278
24345
|
can_generate: true,
|
|
23279
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24346
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24347
|
+
sessions: () => ({
|
|
24348
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
24349
|
+
model: "model",
|
|
24350
|
+
vision_encoder: "vision_encoder"
|
|
24351
|
+
}),
|
|
24352
|
+
cache_sessions: { model: true },
|
|
24353
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23280
24354
|
},
|
|
23281
24355
|
[MODEL_TYPES.MultiModality]: {
|
|
23282
|
-
can_generate: true
|
|
24356
|
+
can_generate: true,
|
|
24357
|
+
sessions: () => ({
|
|
24358
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
24359
|
+
model: "language_model",
|
|
24360
|
+
lm_head: "lm_head",
|
|
24361
|
+
gen_head: "gen_head",
|
|
24362
|
+
gen_img_embeds: "gen_img_embeds",
|
|
24363
|
+
image_decode: "image_decode"
|
|
24364
|
+
}),
|
|
24365
|
+
cache_sessions: { model: true },
|
|
24366
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23283
24367
|
},
|
|
23284
24368
|
[MODEL_TYPES.AutoEncoder]: {
|
|
23285
24369
|
can_generate: false,
|
|
23286
|
-
forward: auto_encoder_forward
|
|
24370
|
+
forward: auto_encoder_forward,
|
|
24371
|
+
sessions: () => ({ encoder_model: "encoder_model", decoder_model: "decoder_model" })
|
|
24372
|
+
},
|
|
24373
|
+
[MODEL_TYPES.Supertonic]: {
|
|
24374
|
+
sessions: () => ({
|
|
24375
|
+
text_encoder: "text_encoder",
|
|
24376
|
+
latent_denoiser: "latent_denoiser",
|
|
24377
|
+
voice_decoder: "voice_decoder"
|
|
24378
|
+
})
|
|
23287
24379
|
},
|
|
23288
24380
|
[MODEL_TYPES.Chatterbox]: {
|
|
23289
24381
|
can_generate: true,
|
|
23290
|
-
forward: encoder_forward
|
|
24382
|
+
forward: encoder_forward,
|
|
24383
|
+
sessions: () => ({
|
|
24384
|
+
embed_tokens: "embed_tokens",
|
|
24385
|
+
speech_encoder: "speech_encoder",
|
|
24386
|
+
model: "language_model",
|
|
24387
|
+
conditional_decoder: "conditional_decoder"
|
|
24388
|
+
}),
|
|
24389
|
+
cache_sessions: { model: true },
|
|
24390
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
24391
|
+
},
|
|
24392
|
+
[MODEL_TYPES.MultimodalLanguageModelOnly]: {
|
|
24393
|
+
can_generate: true,
|
|
24394
|
+
forward: image_text_to_text_forward,
|
|
24395
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24396
|
+
sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
|
|
24397
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24398
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
24399
|
+
},
|
|
24400
|
+
[MODEL_TYPES.VoxtralRealtime]: {
|
|
24401
|
+
can_generate: true,
|
|
24402
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
24403
|
+
sessions: () => ({
|
|
24404
|
+
embed_tokens: "embed_tokens",
|
|
24405
|
+
audio_encoder: "audio_encoder",
|
|
24406
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24407
|
+
}),
|
|
24408
|
+
cache_sessions: { decoder_model_merged: true, audio_encoder: true },
|
|
24409
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23291
24410
|
},
|
|
23292
24411
|
default: {
|
|
23293
24412
|
can_generate: false,
|
|
23294
|
-
forward: encoder_forward
|
|
24413
|
+
forward: encoder_forward,
|
|
24414
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
23295
24415
|
}
|
|
23296
24416
|
};
|
|
24417
|
+
function getSessionsConfig(modelType, config, options = {}) {
|
|
24418
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
24419
|
+
return {
|
|
24420
|
+
sessions: typeConfig.sessions(config, options),
|
|
24421
|
+
cache_sessions: typeConfig.cache_sessions,
|
|
24422
|
+
optional_configs: typeConfig.optional_configs
|
|
24423
|
+
};
|
|
24424
|
+
}
|
|
23297
24425
|
var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
|
|
23298
24426
|
var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
|
|
23299
24427
|
var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
|
|
@@ -23379,245 +24507,23 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23379
24507
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
|
|
23380
24508
|
const modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
23381
24509
|
config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
|
|
23382
|
-
|
|
23383
|
-
if (modelType ===
|
|
23384
|
-
|
|
23385
|
-
|
|
23386
|
-
|
|
23387
|
-
{
|
|
23388
|
-
|
|
23389
|
-
},
|
|
23390
|
-
options,
|
|
23391
|
-
"model"
|
|
23392
|
-
),
|
|
23393
|
-
get_optional_configs(
|
|
23394
|
-
pretrained_model_name_or_path,
|
|
23395
|
-
{
|
|
23396
|
-
generation_config: "generation_config.json"
|
|
23397
|
-
},
|
|
23398
|
-
options
|
|
23399
|
-
)
|
|
23400
|
-
]);
|
|
23401
|
-
} else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
|
|
23402
|
-
info = await Promise.all([
|
|
23403
|
-
constructSessions(
|
|
23404
|
-
pretrained_model_name_or_path,
|
|
23405
|
-
{
|
|
23406
|
-
model: "encoder_model",
|
|
23407
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23408
|
-
},
|
|
23409
|
-
options,
|
|
23410
|
-
"decoder_model_merged"
|
|
23411
|
-
),
|
|
23412
|
-
get_optional_configs(
|
|
23413
|
-
pretrained_model_name_or_path,
|
|
23414
|
-
{
|
|
23415
|
-
generation_config: "generation_config.json"
|
|
23416
|
-
},
|
|
23417
|
-
options
|
|
23418
|
-
)
|
|
23419
|
-
]);
|
|
23420
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
23421
|
-
info = await Promise.all([
|
|
23422
|
-
constructSessions(
|
|
23423
|
-
pretrained_model_name_or_path,
|
|
23424
|
-
{
|
|
23425
|
-
model: "vision_encoder",
|
|
23426
|
-
prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder"
|
|
23427
|
-
},
|
|
23428
|
-
options
|
|
23429
|
-
)
|
|
23430
|
-
]);
|
|
23431
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
23432
|
-
info = await Promise.all([
|
|
23433
|
-
constructSessions(
|
|
23434
|
-
pretrained_model_name_or_path,
|
|
23435
|
-
{
|
|
23436
|
-
model: "encoder_model",
|
|
23437
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23438
|
-
},
|
|
23439
|
-
options,
|
|
23440
|
-
"decoder_model_merged"
|
|
23441
|
-
)
|
|
23442
|
-
]);
|
|
23443
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
23444
|
-
const sessions = {
|
|
23445
|
-
embed_tokens: "embed_tokens",
|
|
23446
|
-
vision_encoder: "vision_encoder",
|
|
23447
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23448
|
-
};
|
|
23449
|
-
if (config.is_encoder_decoder) {
|
|
23450
|
-
sessions["model"] = "encoder_model";
|
|
23451
|
-
}
|
|
23452
|
-
info = await Promise.all([
|
|
23453
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
23454
|
-
get_optional_configs(
|
|
23455
|
-
pretrained_model_name_or_path,
|
|
23456
|
-
{
|
|
23457
|
-
generation_config: "generation_config.json"
|
|
23458
|
-
},
|
|
23459
|
-
options
|
|
23460
|
-
)
|
|
23461
|
-
]);
|
|
23462
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
23463
|
-
const sessions = {
|
|
23464
|
-
embed_tokens: "embed_tokens",
|
|
23465
|
-
audio_encoder: "audio_encoder",
|
|
23466
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23467
|
-
};
|
|
23468
|
-
info = await Promise.all([
|
|
23469
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
23470
|
-
get_optional_configs(
|
|
23471
|
-
pretrained_model_name_or_path,
|
|
23472
|
-
{
|
|
23473
|
-
generation_config: "generation_config.json"
|
|
23474
|
-
},
|
|
23475
|
-
options
|
|
23476
|
-
)
|
|
23477
|
-
]);
|
|
23478
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
23479
|
-
const sessions = {
|
|
23480
|
-
embed_tokens: "embed_tokens",
|
|
23481
|
-
audio_encoder: "audio_encoder",
|
|
23482
|
-
vision_encoder: "vision_encoder",
|
|
23483
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23484
|
-
};
|
|
23485
|
-
info = await Promise.all([
|
|
23486
|
-
constructSessions(pretrained_model_name_or_path, sessions, options),
|
|
23487
|
-
get_optional_configs(
|
|
23488
|
-
pretrained_model_name_or_path,
|
|
23489
|
-
{
|
|
23490
|
-
generation_config: "generation_config.json"
|
|
23491
|
-
},
|
|
23492
|
-
options
|
|
23493
|
-
)
|
|
23494
|
-
]);
|
|
23495
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
23496
|
-
info = await Promise.all([
|
|
23497
|
-
constructSessions(
|
|
23498
|
-
pretrained_model_name_or_path,
|
|
23499
|
-
{
|
|
23500
|
-
model: "text_encoder",
|
|
23501
|
-
decoder_model_merged: "decoder_model_merged",
|
|
23502
|
-
encodec_decode: "encodec_decode"
|
|
23503
|
-
},
|
|
23504
|
-
options,
|
|
23505
|
-
"decoder_model_merged"
|
|
23506
|
-
),
|
|
23507
|
-
get_optional_configs(
|
|
23508
|
-
pretrained_model_name_or_path,
|
|
23509
|
-
{
|
|
23510
|
-
generation_config: "generation_config.json"
|
|
23511
|
-
},
|
|
23512
|
-
options
|
|
23513
|
-
)
|
|
23514
|
-
]);
|
|
23515
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
23516
|
-
info = await Promise.all([
|
|
23517
|
-
constructSessions(
|
|
23518
|
-
pretrained_model_name_or_path,
|
|
23519
|
-
{
|
|
23520
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
23521
|
-
model: "language_model",
|
|
23522
|
-
lm_head: "lm_head",
|
|
23523
|
-
gen_head: "gen_head",
|
|
23524
|
-
gen_img_embeds: "gen_img_embeds",
|
|
23525
|
-
image_decode: "image_decode"
|
|
23526
|
-
},
|
|
23527
|
-
options,
|
|
23528
|
-
"model"
|
|
23529
|
-
),
|
|
23530
|
-
get_optional_configs(
|
|
23531
|
-
pretrained_model_name_or_path,
|
|
23532
|
-
{
|
|
23533
|
-
generation_config: "generation_config.json"
|
|
23534
|
-
},
|
|
23535
|
-
options
|
|
23536
|
-
)
|
|
23537
|
-
]);
|
|
23538
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
23539
|
-
info = await Promise.all([
|
|
23540
|
-
constructSessions(
|
|
23541
|
-
pretrained_model_name_or_path,
|
|
23542
|
-
{
|
|
23543
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
23544
|
-
model: "model",
|
|
23545
|
-
vision_encoder: "vision_encoder"
|
|
23546
|
-
},
|
|
23547
|
-
options,
|
|
23548
|
-
"model"
|
|
23549
|
-
),
|
|
23550
|
-
get_optional_configs(
|
|
23551
|
-
pretrained_model_name_or_path,
|
|
23552
|
-
{
|
|
23553
|
-
generation_config: "generation_config.json"
|
|
23554
|
-
},
|
|
23555
|
-
options
|
|
23556
|
-
)
|
|
23557
|
-
]);
|
|
23558
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
23559
|
-
info = await Promise.all([
|
|
23560
|
-
constructSessions(
|
|
23561
|
-
pretrained_model_name_or_path,
|
|
23562
|
-
{
|
|
23563
|
-
embed_tokens: "embed_tokens",
|
|
23564
|
-
speech_encoder: "speech_encoder",
|
|
23565
|
-
model: "language_model",
|
|
23566
|
-
conditional_decoder: "conditional_decoder"
|
|
23567
|
-
},
|
|
23568
|
-
options,
|
|
23569
|
-
"model"
|
|
23570
|
-
),
|
|
23571
|
-
get_optional_configs(
|
|
23572
|
-
pretrained_model_name_or_path,
|
|
23573
|
-
{
|
|
23574
|
-
generation_config: "generation_config.json"
|
|
23575
|
-
},
|
|
23576
|
-
options
|
|
23577
|
-
)
|
|
23578
|
-
]);
|
|
23579
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
23580
|
-
info = await Promise.all([
|
|
23581
|
-
constructSessions(
|
|
23582
|
-
pretrained_model_name_or_path,
|
|
23583
|
-
{
|
|
23584
|
-
encoder_model: "encoder_model",
|
|
23585
|
-
decoder_model: "decoder_model"
|
|
23586
|
-
},
|
|
23587
|
-
options
|
|
23588
|
-
)
|
|
23589
|
-
]);
|
|
23590
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
23591
|
-
info = await Promise.all([
|
|
23592
|
-
constructSessions(
|
|
23593
|
-
pretrained_model_name_or_path,
|
|
23594
|
-
{
|
|
23595
|
-
text_encoder: "text_encoder",
|
|
23596
|
-
latent_denoiser: "latent_denoiser",
|
|
23597
|
-
voice_decoder: "voice_decoder"
|
|
23598
|
-
},
|
|
23599
|
-
options
|
|
23600
|
-
)
|
|
23601
|
-
]);
|
|
23602
|
-
} else {
|
|
23603
|
-
if (modelType === void 0) {
|
|
23604
|
-
const type = modelName ?? config?.model_type;
|
|
23605
|
-
if (type !== "custom") {
|
|
23606
|
-
logger.warn(
|
|
23607
|
-
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
23608
|
-
);
|
|
23609
|
-
}
|
|
24510
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
24511
|
+
if (modelType === void 0) {
|
|
24512
|
+
const type = modelName ?? config?.model_type;
|
|
24513
|
+
if (type !== "custom") {
|
|
24514
|
+
logger.warn(
|
|
24515
|
+
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
24516
|
+
);
|
|
23610
24517
|
}
|
|
23611
|
-
info = await Promise.all([
|
|
23612
|
-
constructSessions(
|
|
23613
|
-
pretrained_model_name_or_path,
|
|
23614
|
-
{
|
|
23615
|
-
model: options.model_file_name ?? "model"
|
|
23616
|
-
},
|
|
23617
|
-
options
|
|
23618
|
-
)
|
|
23619
|
-
]);
|
|
23620
24518
|
}
|
|
24519
|
+
const sessions = typeConfig.sessions(config, options);
|
|
24520
|
+
const promises = [
|
|
24521
|
+
constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
|
|
24522
|
+
];
|
|
24523
|
+
if (typeConfig.optional_configs) {
|
|
24524
|
+
promises.push(get_optional_configs(pretrained_model_name_or_path, typeConfig.optional_configs, options));
|
|
24525
|
+
}
|
|
24526
|
+
const info = await Promise.all(promises);
|
|
23621
24527
|
return new this(config, ...info);
|
|
23622
24528
|
}
|
|
23623
24529
|
/**
|
|
@@ -23816,7 +24722,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23816
24722
|
* @param {Tensor} [params.inputs=null]
|
|
23817
24723
|
* @param {number} [params.bos_token_id=null]
|
|
23818
24724
|
* @param {Record<string, Tensor|number[]>} [params.model_kwargs]
|
|
23819
|
-
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor
|
|
24725
|
+
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor> & {past_key_values?: DynamicCache}, model_input_name: string}} The model-specific inputs for generation.
|
|
23820
24726
|
*/
|
|
23821
24727
|
_prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) {
|
|
23822
24728
|
const model_inputs = pick(model_kwargs, this.forward_params);
|
|
@@ -24057,11 +24963,12 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
24057
24963
|
}
|
|
24058
24964
|
}
|
|
24059
24965
|
/**
|
|
24060
|
-
* Returns
|
|
24966
|
+
* Returns a DynamicCache containing past key values from the given decoder results object.
|
|
24061
24967
|
*
|
|
24062
24968
|
* @param {Object} decoderResults The decoder results object.
|
|
24063
|
-
* @param {
|
|
24064
|
-
* @
|
|
24969
|
+
* @param {DynamicCache} pastKeyValues The previous past key values.
|
|
24970
|
+
* @param {boolean} [disposeEncoderPKVs=false] Whether to dispose encoder past key values.
|
|
24971
|
+
* @returns {DynamicCache} A new DynamicCache containing the updated past key values.
|
|
24065
24972
|
*/
|
|
24066
24973
|
getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
|
|
24067
24974
|
const pkvs = /* @__PURE__ */ Object.create(null);
|
|
@@ -24082,7 +24989,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
24082
24989
|
}
|
|
24083
24990
|
}
|
|
24084
24991
|
}
|
|
24085
|
-
return pkvs;
|
|
24992
|
+
return new DynamicCache(pkvs);
|
|
24086
24993
|
}
|
|
24087
24994
|
/**
|
|
24088
24995
|
* Returns an object containing attentions from the given model output object.
|
|
@@ -24107,8 +25014,8 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
24107
25014
|
/**
|
|
24108
25015
|
* Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values.
|
|
24109
25016
|
*
|
|
24110
|
-
* @param {
|
|
24111
|
-
* @param {
|
|
25017
|
+
* @param {Record<string, any>} decoderFeeds The decoder feeds object to add past key values to.
|
|
25018
|
+
* @param {DynamicCache|null} pastKeyValues The cache containing past key values.
|
|
24112
25019
|
*/
|
|
24113
25020
|
addPastKeyValues(decoderFeeds, pastKeyValues) {
|
|
24114
25021
|
if (pastKeyValues) {
|
|
@@ -24125,14 +25032,29 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
24125
25032
|
}
|
|
24126
25033
|
}
|
|
24127
25034
|
}
|
|
24128
|
-
|
|
24129
|
-
|
|
25035
|
+
/**
|
|
25036
|
+
* Helper function to select valid inputs and run through the appropriate encoder (vision, text, audio) based on the input type.
|
|
25037
|
+
* @param {string} sessionName
|
|
25038
|
+
* @param {Record<string, Tensor>} inputs
|
|
25039
|
+
* @param {string} outputName
|
|
25040
|
+
* @private
|
|
25041
|
+
*/
|
|
25042
|
+
async _encode_input(sessionName, inputs, outputName) {
|
|
25043
|
+
if (!Object.hasOwn(this.sessions, sessionName)) {
|
|
25044
|
+
throw new Error(`Model does not have a ${sessionName} session.`);
|
|
25045
|
+
}
|
|
25046
|
+
const session = this.sessions[sessionName];
|
|
25047
|
+
const output = await sessionRun(session, pick(inputs, session.inputNames));
|
|
25048
|
+
return output[outputName];
|
|
24130
25049
|
}
|
|
24131
|
-
async
|
|
24132
|
-
return
|
|
25050
|
+
async encode_image(inputs) {
|
|
25051
|
+
return this._encode_input("vision_encoder", inputs, "image_features");
|
|
24133
25052
|
}
|
|
24134
|
-
async
|
|
24135
|
-
return
|
|
25053
|
+
async encode_text(inputs) {
|
|
25054
|
+
return this._encode_input("embed_tokens", inputs, "inputs_embeds");
|
|
25055
|
+
}
|
|
25056
|
+
async encode_audio(inputs) {
|
|
25057
|
+
return this._encode_input("audio_encoder", inputs, "audio_features");
|
|
24136
25058
|
}
|
|
24137
25059
|
};
|
|
24138
25060
|
async function seq2seq_forward(self2, model_inputs) {
|
|
@@ -24187,6 +25109,9 @@ async function decoder_forward(self2, model_inputs, is_encoder_decoder = false)
|
|
|
24187
25109
|
const start_index = ["paligemma", "gemma3_text", "gemma3"].includes(self2.config.model_type) ? 1 : 0;
|
|
24188
25110
|
new_model_inputs.position_ids = create_position_ids(new_model_inputs, past_key_values, start_index);
|
|
24189
25111
|
}
|
|
25112
|
+
if (session.inputNames.includes("num_logits_to_keep") && !new_model_inputs.num_logits_to_keep) {
|
|
25113
|
+
new_model_inputs.num_logits_to_keep = new Tensor2("int64", [0n], []);
|
|
25114
|
+
}
|
|
24190
25115
|
self2.addPastKeyValues(new_model_inputs, past_key_values);
|
|
24191
25116
|
const fixed = pick(new_model_inputs, session.inputNames);
|
|
24192
25117
|
return await sessionRun(session, fixed);
|
|
@@ -24195,7 +25120,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24195
25120
|
// Generic parameters:
|
|
24196
25121
|
encode_function,
|
|
24197
25122
|
merge_function,
|
|
24198
|
-
|
|
25123
|
+
modality_input_names,
|
|
24199
25124
|
modality_output_name,
|
|
24200
25125
|
// Produced by the tokenizer/processor:
|
|
24201
25126
|
input_ids = null,
|
|
@@ -24210,32 +25135,34 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24210
25135
|
// Additional parameters
|
|
24211
25136
|
...kwargs
|
|
24212
25137
|
}) {
|
|
24213
|
-
const modality_values = kwargs[modality_input_name];
|
|
24214
25138
|
if (!inputs_embeds) {
|
|
24215
25139
|
inputs_embeds = await self2.encode_text({ input_ids, ...kwargs });
|
|
24216
|
-
|
|
24217
|
-
|
|
24218
|
-
|
|
24219
|
-
|
|
24220
|
-
|
|
24221
|
-
|
|
24222
|
-
|
|
24223
|
-
|
|
24224
|
-
|
|
24225
|
-
inputs_embeds,
|
|
24226
|
-
|
|
24227
|
-
|
|
24228
|
-
|
|
24229
|
-
|
|
24230
|
-
|
|
24231
|
-
|
|
24232
|
-
|
|
24233
|
-
|
|
24234
|
-
|
|
24235
|
-
|
|
24236
|
-
|
|
24237
|
-
|
|
24238
|
-
|
|
25140
|
+
const modality_values = pick(kwargs, modality_input_names);
|
|
25141
|
+
if (Object.keys(modality_values).length > 0) {
|
|
25142
|
+
if (input_ids.dims[1] !== 1) {
|
|
25143
|
+
const modality_features = await encode_function({
|
|
25144
|
+
// Pass the modality values under its expected key.
|
|
25145
|
+
// The caller knows whether this is audio or image.
|
|
25146
|
+
...modality_values,
|
|
25147
|
+
...kwargs
|
|
25148
|
+
});
|
|
25149
|
+
({ inputs_embeds, attention_mask } = merge_function({
|
|
25150
|
+
[modality_output_name]: modality_features,
|
|
25151
|
+
inputs_embeds,
|
|
25152
|
+
input_ids,
|
|
25153
|
+
attention_mask
|
|
25154
|
+
}));
|
|
25155
|
+
} else if (past_key_values && input_ids.dims[1] === 1) {
|
|
25156
|
+
const target_length = input_ids.dims[1];
|
|
25157
|
+
const past_length = past_key_values.get_seq_length();
|
|
25158
|
+
attention_mask = cat(
|
|
25159
|
+
[
|
|
25160
|
+
ones([input_ids.dims[0], past_length]),
|
|
25161
|
+
attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
|
|
25162
|
+
],
|
|
25163
|
+
1
|
|
25164
|
+
);
|
|
25165
|
+
}
|
|
24239
25166
|
}
|
|
24240
25167
|
}
|
|
24241
25168
|
if (!position_ids) {
|
|
@@ -24243,14 +25170,19 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24243
25170
|
// Handle special case for qwen vl models
|
|
24244
25171
|
[
|
|
24245
25172
|
"qwen2_vl",
|
|
25173
|
+
"qwen2_vl_text",
|
|
24246
25174
|
"qwen2_5_vl",
|
|
24247
25175
|
"qwen2_5_vl_text",
|
|
24248
25176
|
"qwen3_vl",
|
|
24249
25177
|
"qwen3_vl_text",
|
|
25178
|
+
"qwen3_vl_moe",
|
|
25179
|
+
"qwen3_vl_moe_text",
|
|
24250
25180
|
"qwen3_5",
|
|
24251
25181
|
"qwen3_5_text",
|
|
24252
25182
|
"qwen3_5_moe",
|
|
24253
|
-
"qwen3_5_moe_text"
|
|
25183
|
+
"qwen3_5_moe_text",
|
|
25184
|
+
"glm_ocr",
|
|
25185
|
+
"glm_ocr_text"
|
|
24254
25186
|
].includes(self2.config.model_type)
|
|
24255
25187
|
) {
|
|
24256
25188
|
const { image_grid_thw, video_grid_thw } = kwargs;
|
|
@@ -24274,7 +25206,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24274
25206
|
async function audio_text_to_text_forward(self2, params) {
|
|
24275
25207
|
return await generic_text_to_text_forward(self2, {
|
|
24276
25208
|
...params,
|
|
24277
|
-
|
|
25209
|
+
modality_input_names: ["audio_values", "input_features"],
|
|
24278
25210
|
modality_output_name: "audio_features",
|
|
24279
25211
|
encode_function: self2.encode_audio.bind(self2),
|
|
24280
25212
|
merge_function: self2._merge_input_ids_with_audio_features.bind(self2)
|
|
@@ -24283,7 +25215,7 @@ async function audio_text_to_text_forward(self2, params) {
|
|
|
24283
25215
|
async function image_text_to_text_forward(self2, params) {
|
|
24284
25216
|
return await generic_text_to_text_forward(self2, {
|
|
24285
25217
|
...params,
|
|
24286
|
-
|
|
25218
|
+
modality_input_names: ["pixel_values"],
|
|
24287
25219
|
modality_output_name: "image_features",
|
|
24288
25220
|
encode_function: self2.encode_image.bind(self2),
|
|
24289
25221
|
merge_function: self2._merge_input_ids_with_image_features.bind(self2)
|
|
@@ -24319,7 +25251,11 @@ function create_position_ids(model_inputs, past_key_values = null, start_index =
|
|
|
24319
25251
|
return position_ids;
|
|
24320
25252
|
}
|
|
24321
25253
|
function decoder_prepare_inputs_for_generation(self2, input_ids, model_inputs, generation_config) {
|
|
24322
|
-
const past_length = model_inputs.past_key_values ?
|
|
25254
|
+
const past_length = model_inputs.past_key_values ? model_inputs.past_key_values.get_seq_length() : 0;
|
|
25255
|
+
const session = self2.sessions["decoder_model_merged"] ?? self2.sessions["model"];
|
|
25256
|
+
if (session?.inputNames.includes("num_logits_to_keep") && !model_inputs.num_logits_to_keep) {
|
|
25257
|
+
model_inputs.num_logits_to_keep = new Tensor2("int64", [1n], []);
|
|
25258
|
+
}
|
|
24323
25259
|
if (!model_inputs.attention_mask) {
|
|
24324
25260
|
let dims;
|
|
24325
25261
|
for (const key of ["input_ids", "inputs_embeds", "position_ids"]) {
|
|
@@ -24470,6 +25406,8 @@ __export(models_exports, {
|
|
|
24470
25406
|
BloomForCausalLM: () => BloomForCausalLM,
|
|
24471
25407
|
BloomModel: () => BloomModel,
|
|
24472
25408
|
BloomPreTrainedModel: () => BloomPreTrainedModel,
|
|
25409
|
+
CHMv2ForDepthEstimation: () => CHMv2ForDepthEstimation,
|
|
25410
|
+
CHMv2PreTrainedModel: () => CHMv2PreTrainedModel,
|
|
24473
25411
|
CLIPModel: () => CLIPModel,
|
|
24474
25412
|
CLIPPreTrainedModel: () => CLIPPreTrainedModel,
|
|
24475
25413
|
CLIPSegForImageSegmentation: () => CLIPSegForImageSegmentation,
|
|
@@ -24544,6 +25482,9 @@ __export(models_exports, {
|
|
|
24544
25482
|
DebertaV2PreTrainedModel: () => DebertaV2PreTrainedModel,
|
|
24545
25483
|
DecisionTransformerModel: () => DecisionTransformerModel,
|
|
24546
25484
|
DecisionTransformerPreTrainedModel: () => DecisionTransformerPreTrainedModel,
|
|
25485
|
+
DeepseekV3ForCausalLM: () => DeepseekV3ForCausalLM,
|
|
25486
|
+
DeepseekV3Model: () => DeepseekV3Model,
|
|
25487
|
+
DeepseekV3PreTrainedModel: () => DeepseekV3PreTrainedModel,
|
|
24547
25488
|
DeiTForImageClassification: () => DeiTForImageClassification,
|
|
24548
25489
|
DeiTModel: () => DeiTModel,
|
|
24549
25490
|
DeiTPreTrainedModel: () => DeiTPreTrainedModel,
|
|
@@ -24589,6 +25530,11 @@ __export(models_exports, {
|
|
|
24589
25530
|
EsmForTokenClassification: () => EsmForTokenClassification,
|
|
24590
25531
|
EsmModel: () => EsmModel,
|
|
24591
25532
|
EsmPreTrainedModel: () => EsmPreTrainedModel,
|
|
25533
|
+
EuroBertForMaskedLM: () => EuroBertForMaskedLM,
|
|
25534
|
+
EuroBertForSequenceClassification: () => EuroBertForSequenceClassification,
|
|
25535
|
+
EuroBertForTokenClassification: () => EuroBertForTokenClassification,
|
|
25536
|
+
EuroBertModel: () => EuroBertModel,
|
|
25537
|
+
EuroBertPreTrainedModel: () => EuroBertPreTrainedModel,
|
|
24592
25538
|
ExaoneForCausalLM: () => ExaoneForCausalLM,
|
|
24593
25539
|
ExaoneModel: () => ExaoneModel,
|
|
24594
25540
|
ExaonePreTrainedModel: () => ExaonePreTrainedModel,
|
|
@@ -24627,6 +25573,7 @@ __export(models_exports, {
|
|
|
24627
25573
|
Gemma3ForCausalLM: () => Gemma3ForCausalLM,
|
|
24628
25574
|
Gemma3Model: () => Gemma3Model,
|
|
24629
25575
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
25576
|
+
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
24630
25577
|
Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
|
|
24631
25578
|
Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
|
|
24632
25579
|
GemmaForCausalLM: () => GemmaForCausalLM,
|
|
@@ -24634,6 +25581,10 @@ __export(models_exports, {
|
|
|
24634
25581
|
GemmaPreTrainedModel: () => GemmaPreTrainedModel,
|
|
24635
25582
|
GlmForCausalLM: () => GlmForCausalLM,
|
|
24636
25583
|
GlmModel: () => GlmModel,
|
|
25584
|
+
GlmMoeDsaForCausalLM: () => GlmMoeDsaForCausalLM,
|
|
25585
|
+
GlmMoeDsaModel: () => GlmMoeDsaModel,
|
|
25586
|
+
GlmMoeDsaPreTrainedModel: () => GlmMoeDsaPreTrainedModel,
|
|
25587
|
+
GlmOcrForConditionalGeneration: () => GlmOcrForConditionalGeneration,
|
|
24637
25588
|
GlmPreTrainedModel: () => GlmPreTrainedModel,
|
|
24638
25589
|
GptOssForCausalLM: () => GptOssForCausalLM,
|
|
24639
25590
|
GptOssModel: () => GptOssModel,
|
|
@@ -24644,6 +25595,7 @@ __export(models_exports, {
|
|
|
24644
25595
|
GraniteMoeHybridModel: () => GraniteMoeHybridModel,
|
|
24645
25596
|
GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
|
|
24646
25597
|
GranitePreTrainedModel: () => GranitePreTrainedModel,
|
|
25598
|
+
GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
|
|
24647
25599
|
GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
|
|
24648
25600
|
GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
|
|
24649
25601
|
GroupViTModel: () => GroupViTModel,
|
|
@@ -24665,7 +25617,6 @@ __export(models_exports, {
|
|
|
24665
25617
|
IJepaModel: () => IJepaModel,
|
|
24666
25618
|
IJepaPreTrainedModel: () => IJepaPreTrainedModel,
|
|
24667
25619
|
Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
|
|
24668
|
-
Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
|
|
24669
25620
|
JAISLMHeadModel: () => JAISLMHeadModel,
|
|
24670
25621
|
JAISModel: () => JAISModel,
|
|
24671
25622
|
JAISPreTrainedModel: () => JAISPreTrainedModel,
|
|
@@ -24679,6 +25630,8 @@ __export(models_exports, {
|
|
|
24679
25630
|
Lfm2MoeModel: () => Lfm2MoeModel,
|
|
24680
25631
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
24681
25632
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
25633
|
+
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
25634
|
+
LightOnOcrForConditionalGeneration: () => LightOnOcrForConditionalGeneration,
|
|
24682
25635
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
24683
25636
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
24684
25637
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -24728,6 +25681,9 @@ __export(models_exports, {
|
|
|
24728
25681
|
MimiEncoderOutput: () => MimiEncoderOutput,
|
|
24729
25682
|
MimiModel: () => MimiModel,
|
|
24730
25683
|
MimiPreTrainedModel: () => MimiPreTrainedModel,
|
|
25684
|
+
Mistral4ForCausalLM: () => Mistral4ForCausalLM,
|
|
25685
|
+
Mistral4Model: () => Mistral4Model,
|
|
25686
|
+
Mistral4PreTrainedModel: () => Mistral4PreTrainedModel,
|
|
24731
25687
|
MistralForCausalLM: () => MistralForCausalLM,
|
|
24732
25688
|
MistralModel: () => MistralModel,
|
|
24733
25689
|
MistralPreTrainedModel: () => MistralPreTrainedModel,
|
|
@@ -24785,6 +25741,9 @@ __export(models_exports, {
|
|
|
24785
25741
|
NanoChatForCausalLM: () => NanoChatForCausalLM,
|
|
24786
25742
|
NanoChatModel: () => NanoChatModel,
|
|
24787
25743
|
NanoChatPreTrainedModel: () => NanoChatPreTrainedModel,
|
|
25744
|
+
NemotronHForCausalLM: () => NemotronHForCausalLM,
|
|
25745
|
+
NemotronHModel: () => NemotronHModel,
|
|
25746
|
+
NemotronHPreTrainedModel: () => NemotronHPreTrainedModel,
|
|
24788
25747
|
NeoBertForMaskedLM: () => NeoBertForMaskedLM,
|
|
24789
25748
|
NeoBertForQuestionAnswering: () => NeoBertForQuestionAnswering,
|
|
24790
25749
|
NeoBertForSequenceClassification: () => NeoBertForSequenceClassification,
|
|
@@ -24818,7 +25777,6 @@ __export(models_exports, {
|
|
|
24818
25777
|
Owlv2Model: () => Owlv2Model,
|
|
24819
25778
|
Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
|
|
24820
25779
|
PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
|
|
24821
|
-
PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
|
|
24822
25780
|
ParakeetForCTC: () => ParakeetForCTC,
|
|
24823
25781
|
ParakeetPreTrainedModel: () => ParakeetPreTrainedModel,
|
|
24824
25782
|
PatchTSMixerForPrediction: () => PatchTSMixerForPrediction,
|
|
@@ -24848,8 +25806,10 @@ __export(models_exports, {
|
|
|
24848
25806
|
Qwen2MoeModel: () => Qwen2MoeModel,
|
|
24849
25807
|
Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
|
|
24850
25808
|
Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
|
|
25809
|
+
Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
|
|
24851
25810
|
Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
|
|
24852
25811
|
Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
|
|
25812
|
+
Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
|
|
24853
25813
|
Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
|
|
24854
25814
|
Qwen3ForCausalLM: () => Qwen3ForCausalLM,
|
|
24855
25815
|
Qwen3Model: () => Qwen3Model,
|
|
@@ -24860,9 +25820,13 @@ __export(models_exports, {
|
|
|
24860
25820
|
Qwen3NextModel: () => Qwen3NextModel,
|
|
24861
25821
|
Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
|
|
24862
25822
|
Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
|
|
25823
|
+
Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
|
|
24863
25824
|
Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
|
|
25825
|
+
Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
|
|
24864
25826
|
Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
|
|
25827
|
+
Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
|
|
24865
25828
|
Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
|
|
25829
|
+
Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
|
|
24866
25830
|
Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
|
|
24867
25831
|
RFDetrForObjectDetection: () => RFDetrForObjectDetection,
|
|
24868
25832
|
RFDetrModel: () => RFDetrModel,
|
|
@@ -24913,11 +25877,13 @@ __export(models_exports, {
|
|
|
24913
25877
|
SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
|
|
24914
25878
|
SmolLM3Model: () => SmolLM3Model,
|
|
24915
25879
|
SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
|
|
24916
|
-
SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
|
|
24917
25880
|
SnacDecoderModel: () => SnacDecoderModel,
|
|
24918
25881
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
24919
25882
|
SnacModel: () => SnacModel,
|
|
24920
25883
|
SnacPreTrainedModel: () => SnacPreTrainedModel,
|
|
25884
|
+
SolarOpenForCausalLM: () => SolarOpenForCausalLM,
|
|
25885
|
+
SolarOpenModel: () => SolarOpenModel,
|
|
25886
|
+
SolarOpenPreTrainedModel: () => SolarOpenPreTrainedModel,
|
|
24921
25887
|
SpeechT5ForSpeechToText: () => SpeechT5ForSpeechToText,
|
|
24922
25888
|
SpeechT5ForTextToSpeech: () => SpeechT5ForTextToSpeech,
|
|
24923
25889
|
SpeechT5HifiGan: () => SpeechT5HifiGan,
|
|
@@ -24985,6 +25951,8 @@ __export(models_exports, {
|
|
|
24985
25951
|
VitsModelOutput: () => VitsModelOutput,
|
|
24986
25952
|
VitsPreTrainedModel: () => VitsPreTrainedModel,
|
|
24987
25953
|
VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
|
|
25954
|
+
VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
|
|
25955
|
+
VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
|
|
24988
25956
|
Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
|
|
24989
25957
|
Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
|
|
24990
25958
|
Wav2Vec2BertModel: () => Wav2Vec2BertModel,
|
|
@@ -25090,7 +26058,7 @@ var ArceeModel = class extends ArceePreTrainedModel {
|
|
|
25090
26058
|
var ArceeForCausalLM = class extends ArceePreTrainedModel {
|
|
25091
26059
|
};
|
|
25092
26060
|
|
|
25093
|
-
// src/models/
|
|
26061
|
+
// src/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.js
|
|
25094
26062
|
var ASTPreTrainedModel = class extends PreTrainedModel {
|
|
25095
26063
|
};
|
|
25096
26064
|
var ASTModel = class extends ASTPreTrainedModel {
|
|
@@ -25345,7 +26313,7 @@ var ChatterboxModel = class extends ChatterboxPreTrainedModel {
|
|
|
25345
26313
|
if (!past_key_values || target_length !== 1) {
|
|
25346
26314
|
throw new Error("Incorrect state encountered during generation.");
|
|
25347
26315
|
}
|
|
25348
|
-
const past_length =
|
|
26316
|
+
const past_length = past_key_values.get_seq_length();
|
|
25349
26317
|
attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
|
|
25350
26318
|
}
|
|
25351
26319
|
}
|
|
@@ -25425,6 +26393,12 @@ var ChineseCLIPPreTrainedModel = class extends PreTrainedModel {
|
|
|
25425
26393
|
var ChineseCLIPModel = class extends ChineseCLIPPreTrainedModel {
|
|
25426
26394
|
};
|
|
25427
26395
|
|
|
26396
|
+
// src/models/chmv2/modeling_chmv2.js
|
|
26397
|
+
var CHMv2PreTrainedModel = class extends PreTrainedModel {
|
|
26398
|
+
};
|
|
26399
|
+
var CHMv2ForDepthEstimation = class extends CHMv2PreTrainedModel {
|
|
26400
|
+
};
|
|
26401
|
+
|
|
25428
26402
|
// src/models/clap/modeling_clap.js
|
|
25429
26403
|
var ClapPreTrainedModel = class extends PreTrainedModel {
|
|
25430
26404
|
};
|
|
@@ -25763,6 +26737,14 @@ var DebertaForQuestionAnswering = class extends DebertaPreTrainedModel {
|
|
|
25763
26737
|
}
|
|
25764
26738
|
};
|
|
25765
26739
|
|
|
26740
|
+
// src/models/deepseek_v3/modeling_deepseek_v3.js
|
|
26741
|
+
var DeepseekV3PreTrainedModel = class extends PreTrainedModel {
|
|
26742
|
+
};
|
|
26743
|
+
var DeepseekV3Model = class extends DeepseekV3PreTrainedModel {
|
|
26744
|
+
};
|
|
26745
|
+
var DeepseekV3ForCausalLM = class extends DeepseekV3PreTrainedModel {
|
|
26746
|
+
};
|
|
26747
|
+
|
|
25766
26748
|
// src/models/deberta_v2/modeling_deberta_v2.js
|
|
25767
26749
|
var DebertaV2PreTrainedModel = class extends PreTrainedModel {
|
|
25768
26750
|
};
|
|
@@ -26111,6 +27093,45 @@ var EsmForTokenClassification = class extends EsmPreTrainedModel {
|
|
|
26111
27093
|
}
|
|
26112
27094
|
};
|
|
26113
27095
|
|
|
27096
|
+
// src/models/eurobert/modeling_eurobert.js
|
|
27097
|
+
var EuroBertPreTrainedModel = class extends PreTrainedModel {
|
|
27098
|
+
};
|
|
27099
|
+
var EuroBertModel = class extends EuroBertPreTrainedModel {
|
|
27100
|
+
};
|
|
27101
|
+
var EuroBertForMaskedLM = class extends EuroBertPreTrainedModel {
|
|
27102
|
+
/**
|
|
27103
|
+
* Calls the model on new inputs.
|
|
27104
|
+
*
|
|
27105
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
27106
|
+
* @returns {Promise<MaskedLMOutput>} An object containing the model's output logits for masked language modeling.
|
|
27107
|
+
*/
|
|
27108
|
+
async _call(model_inputs) {
|
|
27109
|
+
return new MaskedLMOutput(await super._call(model_inputs));
|
|
27110
|
+
}
|
|
27111
|
+
};
|
|
27112
|
+
var EuroBertForSequenceClassification = class extends EuroBertPreTrainedModel {
|
|
27113
|
+
/**
|
|
27114
|
+
* Calls the model on new inputs.
|
|
27115
|
+
*
|
|
27116
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
27117
|
+
* @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
|
|
27118
|
+
*/
|
|
27119
|
+
async _call(model_inputs) {
|
|
27120
|
+
return new SequenceClassifierOutput(await super._call(model_inputs));
|
|
27121
|
+
}
|
|
27122
|
+
};
|
|
27123
|
+
var EuroBertForTokenClassification = class extends EuroBertPreTrainedModel {
|
|
27124
|
+
/**
|
|
27125
|
+
* Calls the model on new inputs.
|
|
27126
|
+
*
|
|
27127
|
+
* @param {Object} model_inputs The inputs to the model.
|
|
27128
|
+
* @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for token classification.
|
|
27129
|
+
*/
|
|
27130
|
+
async _call(model_inputs) {
|
|
27131
|
+
return new TokenClassifierOutput(await super._call(model_inputs));
|
|
27132
|
+
}
|
|
27133
|
+
};
|
|
27134
|
+
|
|
26114
27135
|
// src/models/exaone/modeling_exaone.js
|
|
26115
27136
|
var ExaonePreTrainedModel = class extends PreTrainedModel {
|
|
26116
27137
|
};
|
|
@@ -26375,6 +27396,8 @@ var Gemma3nForConditionalGeneration = class extends Gemma3nPreTrainedModel {
|
|
|
26375
27396
|
});
|
|
26376
27397
|
}
|
|
26377
27398
|
};
|
|
27399
|
+
var Gemma3nForCausalLM = class extends Gemma3nForConditionalGeneration {
|
|
27400
|
+
};
|
|
26378
27401
|
|
|
26379
27402
|
// src/models/glm/modeling_glm.js
|
|
26380
27403
|
var GlmPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -26384,6 +27407,377 @@ var GlmModel = class extends GlmPreTrainedModel {
|
|
|
26384
27407
|
var GlmForCausalLM = class extends GlmPreTrainedModel {
|
|
26385
27408
|
};
|
|
26386
27409
|
|
|
27410
|
+
// src/models/glm_moe_dsa/modeling_glm_moe_dsa.js
|
|
27411
|
+
var GlmMoeDsaPreTrainedModel = class extends PreTrainedModel {
|
|
27412
|
+
};
|
|
27413
|
+
var GlmMoeDsaModel = class extends GlmMoeDsaPreTrainedModel {
|
|
27414
|
+
};
|
|
27415
|
+
var GlmMoeDsaForCausalLM = class extends GlmMoeDsaPreTrainedModel {
|
|
27416
|
+
};
|
|
27417
|
+
|
|
27418
|
+
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
27419
|
+
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
27420
|
+
forward_params = [
|
|
27421
|
+
// Text inputs
|
|
27422
|
+
"input_ids",
|
|
27423
|
+
"attention_mask",
|
|
27424
|
+
"position_ids",
|
|
27425
|
+
"past_key_values",
|
|
27426
|
+
// Vision inputs
|
|
27427
|
+
"pixel_values",
|
|
27428
|
+
"image_grid_thw"
|
|
27429
|
+
];
|
|
27430
|
+
};
|
|
27431
|
+
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
27432
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
27433
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
27434
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
27435
|
+
image_grid_thw_name = "grid_thw";
|
|
27436
|
+
/**
|
|
27437
|
+
* Compute text-only 3D rope position IDs (all 3 dims get the same 1D positions).
|
|
27438
|
+
* @param {Tensor} input_ids
|
|
27439
|
+
* @param {Tensor} attention_mask
|
|
27440
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
27441
|
+
*/
|
|
27442
|
+
_get_text_only_rope_index(input_ids, attention_mask) {
|
|
27443
|
+
if (attention_mask) {
|
|
27444
|
+
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
27445
|
+
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
27446
|
+
const mrope_position_deltas = Array.from(
|
|
27447
|
+
{ length: dims[0] },
|
|
27448
|
+
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
27449
|
+
);
|
|
27450
|
+
return [
|
|
27451
|
+
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
27452
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
27453
|
+
];
|
|
27454
|
+
} else {
|
|
27455
|
+
const [batch_size, seq_length] = input_ids.dims;
|
|
27456
|
+
const position_ids = BigInt64Array.from(
|
|
27457
|
+
{ length: 3 * batch_size * seq_length },
|
|
27458
|
+
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
27459
|
+
);
|
|
27460
|
+
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
27461
|
+
}
|
|
27462
|
+
}
|
|
27463
|
+
/**
|
|
27464
|
+
* Reorder per-segment position ID lists from [seg1[t,h,w], seg2[t,h,w], ...] into
|
|
27465
|
+
* global [all_t, all_h, all_w] order, then write back into the position_ids array
|
|
27466
|
+
* respecting attention mask.
|
|
27467
|
+
* @param {number[][]} llm_pos_ids_list List of per-segment position arrays, each of length 3*seg_len
|
|
27468
|
+
* @param {number[]} attn_mask Attention mask for this batch element
|
|
27469
|
+
* @param {number[][][]} position_ids_list [3][batch][seq] output array to write into
|
|
27470
|
+
* @param {number} batch_idx Current batch index
|
|
27471
|
+
* @returns {number[]} Flat reordered positions of length total_len
|
|
27472
|
+
*/
|
|
27473
|
+
_reorder_and_write_positions(llm_pos_ids_list, attn_mask, position_ids_list, batch_idx) {
|
|
27474
|
+
const total_len = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
27475
|
+
const llm_positions = new Array(total_len);
|
|
27476
|
+
let index = 0;
|
|
27477
|
+
for (let x = 0; x < 3; ++x) {
|
|
27478
|
+
for (const val of llm_pos_ids_list) {
|
|
27479
|
+
const seg_len = val.length / 3;
|
|
27480
|
+
for (let z = x * seg_len; z < (x + 1) * seg_len; ++z) {
|
|
27481
|
+
llm_positions[index++] = val[z];
|
|
27482
|
+
}
|
|
27483
|
+
}
|
|
27484
|
+
}
|
|
27485
|
+
let count2 = 0;
|
|
27486
|
+
for (let y = 0; y < attn_mask.length; ++y) {
|
|
27487
|
+
if (attn_mask[y] == 1) {
|
|
27488
|
+
for (let x = 0; x < 3; ++x) {
|
|
27489
|
+
position_ids_list[x][batch_idx][y] = llm_positions[x * total_len / 3 + count2];
|
|
27490
|
+
}
|
|
27491
|
+
++count2;
|
|
27492
|
+
}
|
|
27493
|
+
}
|
|
27494
|
+
return llm_positions;
|
|
27495
|
+
}
|
|
27496
|
+
/**
|
|
27497
|
+
* Build per-batch position ID segments for multimodal rope.
|
|
27498
|
+
* Override this in subclasses to change how vision/text segments are identified and positioned.
|
|
27499
|
+
* @param {object} params
|
|
27500
|
+
* @param {any[]} params.filtered_ids - attention-masked token IDs for this batch element
|
|
27501
|
+
* @param {any[][]} params.image_grid_thw_list - all image grid dimensions
|
|
27502
|
+
* @param {any[][]} params.video_grid_thw_list - all video grid dimensions
|
|
27503
|
+
* @param {number} params.spatial_merge_size
|
|
27504
|
+
* @param {{image_index: number, video_index: number}} params.state - mutable counters shared across batches
|
|
27505
|
+
* @returns {number[][]} llm_pos_ids_list - segments of [t..., h..., w...] positions
|
|
27506
|
+
*/
|
|
27507
|
+
_get_multimodal_rope_positions({
|
|
27508
|
+
filtered_ids,
|
|
27509
|
+
image_grid_thw_list,
|
|
27510
|
+
video_grid_thw_list,
|
|
27511
|
+
spatial_merge_size,
|
|
27512
|
+
state
|
|
27513
|
+
}) {
|
|
27514
|
+
const { image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
27515
|
+
const ids = filtered_ids;
|
|
27516
|
+
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
27517
|
+
if (x == vision_start_token_id) acc.push(idx);
|
|
27518
|
+
return acc;
|
|
27519
|
+
}, []);
|
|
27520
|
+
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
27521
|
+
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
27522
|
+
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
27523
|
+
const llm_pos_ids_list = [];
|
|
27524
|
+
let st2 = 0;
|
|
27525
|
+
let remain_images = image_nums;
|
|
27526
|
+
let remain_videos = video_nums;
|
|
27527
|
+
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
27528
|
+
const next_image_token = ids.findIndex((x, i) => i > st2 && x == image_token_id);
|
|
27529
|
+
const next_video_token = ids.findIndex((x, i) => i > st2 && x == video_token_id);
|
|
27530
|
+
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
27531
|
+
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
27532
|
+
let ed;
|
|
27533
|
+
let t, h, w;
|
|
27534
|
+
if (ed_image < ed_video) {
|
|
27535
|
+
[t, h, w] = image_grid_thw_list[state.image_index];
|
|
27536
|
+
++state.image_index;
|
|
27537
|
+
--remain_images;
|
|
27538
|
+
ed = ed_image;
|
|
27539
|
+
} else {
|
|
27540
|
+
[t, h, w] = video_grid_thw_list[state.video_index];
|
|
27541
|
+
++state.video_index;
|
|
27542
|
+
--remain_videos;
|
|
27543
|
+
ed = ed_video;
|
|
27544
|
+
}
|
|
27545
|
+
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
27546
|
+
Number(t),
|
|
27547
|
+
Math.floor(Number(h) / spatial_merge_size),
|
|
27548
|
+
Math.floor(Number(w) / spatial_merge_size)
|
|
27549
|
+
];
|
|
27550
|
+
const text_len = ed - st2;
|
|
27551
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27552
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
27553
|
+
const offset = text_len + st_idx;
|
|
27554
|
+
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
27555
|
+
const t_index = Array.from(
|
|
27556
|
+
{ length: grid_size },
|
|
27557
|
+
(_, i) => offset + Math.floor(i / (llm_grid_h * llm_grid_w))
|
|
27558
|
+
);
|
|
27559
|
+
const h_index = Array.from(
|
|
27560
|
+
{ length: grid_size },
|
|
27561
|
+
(_, i) => offset + Math.floor(i / llm_grid_w) % llm_grid_h
|
|
27562
|
+
);
|
|
27563
|
+
const w_index = Array.from({ length: grid_size }, (_, i) => offset + i % llm_grid_w);
|
|
27564
|
+
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
27565
|
+
st2 = ed + grid_size;
|
|
27566
|
+
}
|
|
27567
|
+
if (st2 < ids.length) {
|
|
27568
|
+
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27569
|
+
const text_len = ids.length - st2;
|
|
27570
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => st_idx + i % text_len));
|
|
27571
|
+
}
|
|
27572
|
+
return llm_pos_ids_list;
|
|
27573
|
+
}
|
|
27574
|
+
/**
|
|
27575
|
+
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
27576
|
+
*
|
|
27577
|
+
* Explanation:
|
|
27578
|
+
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
27579
|
+
*
|
|
27580
|
+
* For pure text embedding sequence, the rotary position embedding is the same as in modern LLMs.
|
|
27581
|
+
* Examples:
|
|
27582
|
+
* input_ids: [T T T T T], here T is for text.
|
|
27583
|
+
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
27584
|
+
* height position_ids: [0, 1, 2, 3, 4]
|
|
27585
|
+
* width position_ids: [0, 1, 2, 3, 4]
|
|
27586
|
+
*
|
|
27587
|
+
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
27588
|
+
* and 1D rotary position embedding for the text part.
|
|
27589
|
+
* Examples:
|
|
27590
|
+
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
27591
|
+
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
27592
|
+
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
27593
|
+
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
27594
|
+
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
27595
|
+
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
27596
|
+
* text height position_ids: [3, 4, 5, 6, 7]
|
|
27597
|
+
* text width position_ids: [3, 4, 5, 6, 7]
|
|
27598
|
+
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
27599
|
+
*
|
|
27600
|
+
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
27601
|
+
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
27602
|
+
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
27603
|
+
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`.
|
|
27604
|
+
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas]
|
|
27605
|
+
*/
|
|
27606
|
+
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
27607
|
+
const { vision_config } = this.config;
|
|
27608
|
+
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
27609
|
+
if (image_grid_thw || video_grid_thw) {
|
|
27610
|
+
const total_input_ids = input_ids.tolist();
|
|
27611
|
+
if (!attention_mask) {
|
|
27612
|
+
attention_mask = ones_like(input_ids);
|
|
27613
|
+
}
|
|
27614
|
+
const attention_mask_list = attention_mask.tolist();
|
|
27615
|
+
const position_ids_list = Array.from(
|
|
27616
|
+
{ length: 3 },
|
|
27617
|
+
() => Array.from({ length: input_ids.dims[0] }, () => Array.from({ length: input_ids.dims[1] }, () => 0))
|
|
27618
|
+
);
|
|
27619
|
+
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
27620
|
+
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
27621
|
+
const state = { image_index: 0, video_index: 0 };
|
|
27622
|
+
const mrope_position_deltas = [];
|
|
27623
|
+
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
27624
|
+
const filtered_ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
27625
|
+
const llm_pos_ids_list = this._get_multimodal_rope_positions({
|
|
27626
|
+
filtered_ids,
|
|
27627
|
+
image_grid_thw_list,
|
|
27628
|
+
video_grid_thw_list,
|
|
27629
|
+
spatial_merge_size,
|
|
27630
|
+
state
|
|
27631
|
+
});
|
|
27632
|
+
const llm_positions = this._reorder_and_write_positions(
|
|
27633
|
+
llm_pos_ids_list,
|
|
27634
|
+
attention_mask_list[i],
|
|
27635
|
+
position_ids_list,
|
|
27636
|
+
i
|
|
27637
|
+
);
|
|
27638
|
+
mrope_position_deltas.push(max(llm_positions)[0] + 1 - total_input_ids[i].length);
|
|
27639
|
+
}
|
|
27640
|
+
return [
|
|
27641
|
+
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
27642
|
+
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
27643
|
+
];
|
|
27644
|
+
} else {
|
|
27645
|
+
return this._get_text_only_rope_index(input_ids, attention_mask);
|
|
27646
|
+
}
|
|
27647
|
+
}
|
|
27648
|
+
async encode_image({ pixel_values, image_grid_thw }) {
|
|
27649
|
+
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
27650
|
+
pixel_values,
|
|
27651
|
+
[this.image_grid_thw_name]: image_grid_thw
|
|
27652
|
+
})).image_features;
|
|
27653
|
+
return features;
|
|
27654
|
+
}
|
|
27655
|
+
_merge_input_ids_with_image_features(kwargs) {
|
|
27656
|
+
return default_merge_input_ids_with_image_features({
|
|
27657
|
+
// @ts-ignore
|
|
27658
|
+
image_token_id: this.config.image_token_id,
|
|
27659
|
+
...kwargs
|
|
27660
|
+
});
|
|
27661
|
+
}
|
|
27662
|
+
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
27663
|
+
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
27664
|
+
if (!model_inputs.past_key_values) {
|
|
27665
|
+
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27666
|
+
model_inputs.input_ids,
|
|
27667
|
+
model_inputs.image_grid_thw,
|
|
27668
|
+
model_inputs.video_grid_thw,
|
|
27669
|
+
model_inputs.attention_mask
|
|
27670
|
+
);
|
|
27671
|
+
} else {
|
|
27672
|
+
model_inputs.pixel_values = null;
|
|
27673
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
27674
|
+
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
27675
|
+
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
27676
|
+
model_inputs.input_ids,
|
|
27677
|
+
model_inputs.image_grid_thw,
|
|
27678
|
+
model_inputs.video_grid_thw,
|
|
27679
|
+
model_inputs.attention_mask
|
|
27680
|
+
);
|
|
27681
|
+
model_inputs.rope_deltas = rope_deltas;
|
|
27682
|
+
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
27683
|
+
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
27684
|
+
} else {
|
|
27685
|
+
if (!model_inputs.rope_deltas) {
|
|
27686
|
+
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27687
|
+
model_inputs.input_ids,
|
|
27688
|
+
model_inputs.image_grid_thw,
|
|
27689
|
+
model_inputs.video_grid_thw,
|
|
27690
|
+
model_inputs.attention_mask
|
|
27691
|
+
);
|
|
27692
|
+
}
|
|
27693
|
+
const delta = BigInt(past_length);
|
|
27694
|
+
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
27695
|
+
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
27696
|
+
}
|
|
27697
|
+
}
|
|
27698
|
+
}
|
|
27699
|
+
return model_inputs;
|
|
27700
|
+
}
|
|
27701
|
+
};
|
|
27702
|
+
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
27703
|
+
};
|
|
27704
|
+
|
|
27705
|
+
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
27706
|
+
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
27707
|
+
image_grid_thw_name = "image_grid_thw";
|
|
27708
|
+
};
|
|
27709
|
+
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
27710
|
+
image_grid_thw_name = "image_grid_thw";
|
|
27711
|
+
};
|
|
27712
|
+
|
|
27713
|
+
// src/models/glm_ocr/modeling_glm_ocr.js
|
|
27714
|
+
var GlmOcrForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
27715
|
+
/**
|
|
27716
|
+
* Compute 3D positional indices for vision tokens.
|
|
27717
|
+
* Temporal is constant, height is repeat-interleaved, width tiles.
|
|
27718
|
+
* @param {number} start_position
|
|
27719
|
+
* @param {number[]} grid_thw [T, H, W]
|
|
27720
|
+
* @param {number} temp_merge_size
|
|
27721
|
+
* @param {number} spatial_merge_size
|
|
27722
|
+
* @returns {number[]} Flat array of length 3 * seq_len: [temporal..., height..., width...]
|
|
27723
|
+
*/
|
|
27724
|
+
get_vision_position_ids(start_position, grid_thw, temp_merge_size, spatial_merge_size) {
|
|
27725
|
+
const llm_grid_t = Math.floor(grid_thw[0] / temp_merge_size);
|
|
27726
|
+
const llm_grid_h = Math.floor(grid_thw[1] / spatial_merge_size);
|
|
27727
|
+
const llm_grid_w = Math.floor(grid_thw[2] / spatial_merge_size);
|
|
27728
|
+
const seq_len = llm_grid_h * llm_grid_w * llm_grid_t;
|
|
27729
|
+
const t_pos = Array.from({ length: seq_len }, () => start_position);
|
|
27730
|
+
const h_pos = Array.from(
|
|
27731
|
+
{ length: seq_len },
|
|
27732
|
+
(_, i) => start_position + Math.floor(i / (llm_grid_w * llm_grid_t))
|
|
27733
|
+
);
|
|
27734
|
+
const w_pos = Array.from({ length: seq_len }, (_, i) => start_position + i % llm_grid_w);
|
|
27735
|
+
return [...t_pos, ...h_pos, ...w_pos];
|
|
27736
|
+
}
|
|
27737
|
+
/**
|
|
27738
|
+
* GlmOcr uses mm_token_type_ids-style grouping (image tokens identified by image_token_id)
|
|
27739
|
+
* instead of vision_start_token_id scanning used by Qwen2VL.
|
|
27740
|
+
* After a vision segment, position advances by max(h, w) / spatial_merge_size.
|
|
27741
|
+
*/
|
|
27742
|
+
_get_multimodal_rope_positions({
|
|
27743
|
+
filtered_ids,
|
|
27744
|
+
image_grid_thw_list,
|
|
27745
|
+
video_grid_thw_list,
|
|
27746
|
+
spatial_merge_size,
|
|
27747
|
+
state
|
|
27748
|
+
}) {
|
|
27749
|
+
const { image_token_id } = this.config;
|
|
27750
|
+
const groups = [];
|
|
27751
|
+
let group_start = 0;
|
|
27752
|
+
let current_type = filtered_ids[0] == image_token_id ? 1 : 0;
|
|
27753
|
+
for (let j = 1; j <= filtered_ids.length; ++j) {
|
|
27754
|
+
const t = j < filtered_ids.length ? filtered_ids[j] == image_token_id ? 1 : 0 : -1;
|
|
27755
|
+
if (t !== current_type) {
|
|
27756
|
+
groups.push([current_type, group_start, j]);
|
|
27757
|
+
group_start = j;
|
|
27758
|
+
current_type = t;
|
|
27759
|
+
}
|
|
27760
|
+
}
|
|
27761
|
+
let current_pos = 0;
|
|
27762
|
+
const llm_pos_ids_list = [];
|
|
27763
|
+
for (const [modality_type, start_idx, end_idx] of groups) {
|
|
27764
|
+
if (modality_type === 0) {
|
|
27765
|
+
const text_len = end_idx - start_idx;
|
|
27766
|
+
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i) => current_pos + i % text_len));
|
|
27767
|
+
current_pos += text_len;
|
|
27768
|
+
} else {
|
|
27769
|
+
const grid_thw = image_grid_thw_list[state.image_index++].map(Number);
|
|
27770
|
+
const temp_merge_size = grid_thw[0];
|
|
27771
|
+
llm_pos_ids_list.push(
|
|
27772
|
+
this.get_vision_position_ids(current_pos, grid_thw, temp_merge_size, spatial_merge_size)
|
|
27773
|
+
);
|
|
27774
|
+
current_pos += Math.max(grid_thw[1], grid_thw[2]) / spatial_merge_size;
|
|
27775
|
+
}
|
|
27776
|
+
}
|
|
27777
|
+
return llm_pos_ids_list;
|
|
27778
|
+
}
|
|
27779
|
+
};
|
|
27780
|
+
|
|
26387
27781
|
// src/models/glpn/modeling_glpn.js
|
|
26388
27782
|
var GLPNPreTrainedModel = class extends PreTrainedModel {
|
|
26389
27783
|
};
|
|
@@ -26456,6 +27850,28 @@ var GraniteMoeHybridModel = class extends GraniteMoeHybridPreTrainedModel {
|
|
|
26456
27850
|
var GraniteMoeHybridForCausalLM = class extends GraniteMoeHybridPreTrainedModel {
|
|
26457
27851
|
};
|
|
26458
27852
|
|
|
27853
|
+
// src/models/ultravox/modeling_ultravox.js
|
|
27854
|
+
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
27855
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
27856
|
+
};
|
|
27857
|
+
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
27858
|
+
_merge_input_ids_with_audio_features(kwargs) {
|
|
27859
|
+
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
27860
|
+
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
27861
|
+
return default_merge_input_ids_with_audio_features({
|
|
27862
|
+
// @ts-ignore
|
|
27863
|
+
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id ?? this.config.audio_token_index,
|
|
27864
|
+
...kwargs,
|
|
27865
|
+
audio_features: reshaped_audio_features
|
|
27866
|
+
});
|
|
27867
|
+
}
|
|
27868
|
+
};
|
|
27869
|
+
|
|
27870
|
+
// src/models/granite_speech/modeling_granite_speech.js
|
|
27871
|
+
var GraniteSpeechForConditionalGeneration = class extends UltravoxModel {
|
|
27872
|
+
forward_params = ["input_ids", "attention_mask", "input_features", "past_key_values"];
|
|
27873
|
+
};
|
|
27874
|
+
|
|
26459
27875
|
// src/models/grounding_dino/modeling_grounding_dino.js
|
|
26460
27876
|
var GroundingDinoPreTrainedModel = class extends PreTrainedModel {
|
|
26461
27877
|
};
|
|
@@ -26560,34 +27976,37 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
|
|
|
26560
27976
|
var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
|
|
26561
27977
|
};
|
|
26562
27978
|
|
|
26563
|
-
// src/models/
|
|
26564
|
-
var
|
|
26565
|
-
forward_params = [
|
|
26566
|
-
"input_ids",
|
|
26567
|
-
"attention_mask",
|
|
26568
|
-
"pixel_values",
|
|
26569
|
-
"pixel_attention_mask",
|
|
26570
|
-
"position_ids",
|
|
26571
|
-
"past_key_values"
|
|
26572
|
-
];
|
|
27979
|
+
// src/models/llava/modeling_llava.js
|
|
27980
|
+
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
27981
|
+
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
26573
27982
|
};
|
|
26574
|
-
var
|
|
26575
|
-
async encode_image({ pixel_values, pixel_attention_mask }) {
|
|
26576
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], { pixel_values, pixel_attention_mask })).image_features;
|
|
26577
|
-
return features;
|
|
26578
|
-
}
|
|
27983
|
+
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
26579
27984
|
_merge_input_ids_with_image_features(kwargs) {
|
|
26580
27985
|
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
26581
27986
|
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
26582
27987
|
return default_merge_input_ids_with_image_features({
|
|
26583
27988
|
// @ts-ignore
|
|
26584
|
-
image_token_id: this.config.image_token_id,
|
|
27989
|
+
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
26585
27990
|
...kwargs,
|
|
26586
27991
|
image_features: reshaped_image_hidden_states
|
|
26587
27992
|
});
|
|
26588
27993
|
}
|
|
26589
27994
|
};
|
|
26590
|
-
var
|
|
27995
|
+
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27996
|
+
};
|
|
27997
|
+
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
27998
|
+
};
|
|
27999
|
+
|
|
28000
|
+
// src/models/idefics3/modeling_idefics3.js
|
|
28001
|
+
var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
28002
|
+
forward_params = [
|
|
28003
|
+
"input_ids",
|
|
28004
|
+
"attention_mask",
|
|
28005
|
+
"pixel_values",
|
|
28006
|
+
"pixel_attention_mask",
|
|
28007
|
+
"position_ids",
|
|
28008
|
+
"past_key_values"
|
|
28009
|
+
];
|
|
26591
28010
|
};
|
|
26592
28011
|
|
|
26593
28012
|
// src/models/ijepa/modeling_ijepa.js
|
|
@@ -26671,6 +28090,10 @@ var Lfm2Model = class extends Lfm2PreTrainedModel {
|
|
|
26671
28090
|
var Lfm2ForCausalLM = class extends Lfm2PreTrainedModel {
|
|
26672
28091
|
};
|
|
26673
28092
|
|
|
28093
|
+
// src/models/lighton_ocr/modeling_lighton_ocr.js
|
|
28094
|
+
var LightOnOcrForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
28095
|
+
};
|
|
28096
|
+
|
|
26674
28097
|
// src/models/lfm2_moe/modeling_lfm2_moe.js
|
|
26675
28098
|
var Lfm2MoePreTrainedModel = class extends PreTrainedModel {
|
|
26676
28099
|
};
|
|
@@ -26679,6 +28102,19 @@ var Lfm2MoeModel = class extends Lfm2MoePreTrainedModel {
|
|
|
26679
28102
|
var Lfm2MoeForCausalLM = class extends Lfm2MoePreTrainedModel {
|
|
26680
28103
|
};
|
|
26681
28104
|
|
|
28105
|
+
// src/models/lfm2_vl/modeling_lfm2_vl.js
|
|
28106
|
+
var Lfm2VlForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
28107
|
+
forward_params = [
|
|
28108
|
+
"input_ids",
|
|
28109
|
+
"attention_mask",
|
|
28110
|
+
"pixel_values",
|
|
28111
|
+
"pixel_attention_mask",
|
|
28112
|
+
"spatial_shapes",
|
|
28113
|
+
"position_ids",
|
|
28114
|
+
"past_key_values"
|
|
28115
|
+
];
|
|
28116
|
+
};
|
|
28117
|
+
|
|
26682
28118
|
// src/models/llama/modeling_llama.js
|
|
26683
28119
|
var LlamaPreTrainedModel = class extends PreTrainedModel {
|
|
26684
28120
|
};
|
|
@@ -26693,27 +28129,6 @@ var Llama4PreTrainedModel = class extends PreTrainedModel {
|
|
|
26693
28129
|
var Llama4ForCausalLM = class extends Llama4PreTrainedModel {
|
|
26694
28130
|
};
|
|
26695
28131
|
|
|
26696
|
-
// src/models/llava/modeling_llava.js
|
|
26697
|
-
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
26698
|
-
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
26699
|
-
};
|
|
26700
|
-
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
26701
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
26702
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
26703
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
26704
|
-
return default_merge_input_ids_with_image_features({
|
|
26705
|
-
// @ts-ignore
|
|
26706
|
-
image_token_id: this.config.image_token_index,
|
|
26707
|
-
...kwargs,
|
|
26708
|
-
image_features: reshaped_image_hidden_states
|
|
26709
|
-
});
|
|
26710
|
-
}
|
|
26711
|
-
};
|
|
26712
|
-
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
26713
|
-
};
|
|
26714
|
-
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
26715
|
-
};
|
|
26716
|
-
|
|
26717
28132
|
// src/models/longt5/modeling_longt5.js
|
|
26718
28133
|
var LongT5PreTrainedModel = class extends PreTrainedModel {
|
|
26719
28134
|
};
|
|
@@ -26875,6 +28290,14 @@ var MistralModel = class extends MistralPreTrainedModel {
|
|
|
26875
28290
|
var MistralForCausalLM = class extends MistralPreTrainedModel {
|
|
26876
28291
|
};
|
|
26877
28292
|
|
|
28293
|
+
// src/models/mistral4/modeling_mistral4.js
|
|
28294
|
+
var Mistral4PreTrainedModel = class extends PreTrainedModel {
|
|
28295
|
+
};
|
|
28296
|
+
var Mistral4Model = class extends Mistral4PreTrainedModel {
|
|
28297
|
+
};
|
|
28298
|
+
var Mistral4ForCausalLM = class extends Mistral4PreTrainedModel {
|
|
28299
|
+
};
|
|
28300
|
+
|
|
26878
28301
|
// src/models/mobilebert/modeling_mobilebert.js
|
|
26879
28302
|
var MobileBertPreTrainedModel = class extends PreTrainedModel {
|
|
26880
28303
|
};
|
|
@@ -27343,6 +28766,14 @@ var NanoChatModel = class extends NanoChatPreTrainedModel {
|
|
|
27343
28766
|
var NanoChatForCausalLM = class extends NanoChatPreTrainedModel {
|
|
27344
28767
|
};
|
|
27345
28768
|
|
|
28769
|
+
// src/models/nemotron_h/modeling_nemotron_h.js
|
|
28770
|
+
var NemotronHPreTrainedModel = class extends PreTrainedModel {
|
|
28771
|
+
};
|
|
28772
|
+
var NemotronHModel = class extends NemotronHPreTrainedModel {
|
|
28773
|
+
};
|
|
28774
|
+
var NemotronHForCausalLM = class extends NemotronHPreTrainedModel {
|
|
28775
|
+
};
|
|
28776
|
+
|
|
27346
28777
|
// src/models/neobert/modeling_neobert.js
|
|
27347
28778
|
var NeoBertPreTrainedModel = class extends PreTrainedModel {
|
|
27348
28779
|
};
|
|
@@ -27464,27 +28895,7 @@ var OwlViTForObjectDetection = class extends OwlViTPreTrainedModel {
|
|
|
27464
28895
|
};
|
|
27465
28896
|
|
|
27466
28897
|
// src/models/paligemma/modeling_paligemma.js
|
|
27467
|
-
var
|
|
27468
|
-
forward_params = [
|
|
27469
|
-
"input_ids",
|
|
27470
|
-
// 'inputs_embeds',
|
|
27471
|
-
"attention_mask",
|
|
27472
|
-
"pixel_values",
|
|
27473
|
-
"position_ids",
|
|
27474
|
-
"past_key_values"
|
|
27475
|
-
];
|
|
27476
|
-
};
|
|
27477
|
-
var PaliGemmaForConditionalGeneration = class extends PaliGemmaPreTrainedModel {
|
|
27478
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
27479
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
27480
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
27481
|
-
return default_merge_input_ids_with_image_features({
|
|
27482
|
-
// @ts-ignore
|
|
27483
|
-
image_token_id: this.config.image_token_index,
|
|
27484
|
-
...kwargs,
|
|
27485
|
-
image_features: reshaped_image_hidden_states
|
|
27486
|
-
});
|
|
27487
|
-
}
|
|
28898
|
+
var PaliGemmaForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27488
28899
|
};
|
|
27489
28900
|
|
|
27490
28901
|
// src/models/parakeet/modeling_parakeet.js
|
|
@@ -27643,244 +29054,6 @@ var Qwen2MoeModel = class extends Qwen2MoePreTrainedModel {
|
|
|
27643
29054
|
var Qwen2MoeForCausalLM = class extends Qwen2MoePreTrainedModel {
|
|
27644
29055
|
};
|
|
27645
29056
|
|
|
27646
|
-
// src/models/qwen2_vl/modeling_qwen2_vl.js
|
|
27647
|
-
var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
27648
|
-
forward_params = [
|
|
27649
|
-
// Text inputs
|
|
27650
|
-
"input_ids",
|
|
27651
|
-
"attention_mask",
|
|
27652
|
-
"position_ids",
|
|
27653
|
-
"past_key_values",
|
|
27654
|
-
// Vision inputs
|
|
27655
|
-
"pixel_values",
|
|
27656
|
-
"image_grid_thw"
|
|
27657
|
-
];
|
|
27658
|
-
};
|
|
27659
|
-
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
27660
|
-
image_grid_thw_name = "grid_thw";
|
|
27661
|
-
/**
|
|
27662
|
-
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
27663
|
-
*
|
|
27664
|
-
* Explanation:
|
|
27665
|
-
* Each embedding sequence contains vision embedding and text embedding or just contains text embedding.
|
|
27666
|
-
*
|
|
27667
|
-
* For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs.
|
|
27668
|
-
* Examples:
|
|
27669
|
-
* input_ids: [T T T T T], here T is for text.
|
|
27670
|
-
* temporal position_ids: [0, 1, 2, 3, 4]
|
|
27671
|
-
* height position_ids: [0, 1, 2, 3, 4]
|
|
27672
|
-
* width position_ids: [0, 1, 2, 3, 4]
|
|
27673
|
-
*
|
|
27674
|
-
* For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part
|
|
27675
|
-
* and 1D rotary position embeddin for text part.
|
|
27676
|
-
* Examples:
|
|
27677
|
-
* Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches.
|
|
27678
|
-
* input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
|
|
27679
|
-
* vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]
|
|
27680
|
-
* vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
|
|
27681
|
-
* vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
|
|
27682
|
-
* text temporal position_ids: [3, 4, 5, 6, 7]
|
|
27683
|
-
* text height position_ids: [3, 4, 5, 6, 7]
|
|
27684
|
-
* text width position_ids: [3, 4, 5, 6, 7]
|
|
27685
|
-
* Here we calculate the text start position_ids as the max vision position_ids plus 1.
|
|
27686
|
-
*
|
|
27687
|
-
* @param {Tensor} input_ids Indices of input sequence tokens in the vocabulary. Tensor of shape `(batch_size, sequence_length)`.
|
|
27688
|
-
* @param {Tensor} image_grid_thw (Optional) The temporal, height and width of feature shape of each image in LLM. Tensor of shape `(num_images, 3)`.
|
|
27689
|
-
* @param {Tensor} video_grid_thw (Optional) The temporal, height and width of feature shape of each video in LLM. Tensor of shape `(num_videos, 3)`.
|
|
27690
|
-
* @param {Tensor} attention_mask (Optional) Mask to avoid performing attention on padding token indices. Tensor of shape `(batch_size, sequence_length)`. Mask values selected in `[0, 1]`:
|
|
27691
|
-
* - 1 for tokens that are **not masked**,
|
|
27692
|
-
* - 0 for tokens that are **masked**.
|
|
27693
|
-
* @returns {[Tensor, Tensor]} [position_ids, mrope_position_deltas] with:
|
|
27694
|
-
* - position_ids: Tensor of shape `(3, batch_size, sequence_length)`.
|
|
27695
|
-
* - mrope_position_deltas: Tensor of shape `(batch_size)`.
|
|
27696
|
-
*/
|
|
27697
|
-
get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask) {
|
|
27698
|
-
const { vision_config, image_token_id, video_token_id, vision_start_token_id } = this.config;
|
|
27699
|
-
const spatial_merge_size = vision_config.spatial_merge_size ?? 2;
|
|
27700
|
-
const mrope_position_deltas = [];
|
|
27701
|
-
if (image_grid_thw || video_grid_thw) {
|
|
27702
|
-
let total_input_ids = input_ids.tolist();
|
|
27703
|
-
if (!attention_mask) {
|
|
27704
|
-
attention_mask = ones_like(input_ids);
|
|
27705
|
-
}
|
|
27706
|
-
const attention_mask_list = attention_mask.tolist();
|
|
27707
|
-
const position_ids_list = Array.from(
|
|
27708
|
-
{ length: 3 },
|
|
27709
|
-
(_) => Array.from({ length: input_ids.dims[0] }, (_2) => Array.from({ length: input_ids.dims[1] }, (_3) => 1))
|
|
27710
|
-
);
|
|
27711
|
-
const image_grid_thw_list = image_grid_thw ? image_grid_thw.tolist() : [];
|
|
27712
|
-
const video_grid_thw_list = video_grid_thw ? video_grid_thw.tolist() : [];
|
|
27713
|
-
let image_index = 0;
|
|
27714
|
-
let video_index = 0;
|
|
27715
|
-
for (let i = 0; i < total_input_ids.length; ++i) {
|
|
27716
|
-
const ids = total_input_ids[i].filter((_, j) => attention_mask_list[i][j] == 1);
|
|
27717
|
-
const vision_start_indices = ids.reduce((acc, x, idx) => {
|
|
27718
|
-
if (x == vision_start_token_id) acc.push(idx);
|
|
27719
|
-
return acc;
|
|
27720
|
-
}, []);
|
|
27721
|
-
const vision_tokens = vision_start_indices.map((x) => ids[x + 1]);
|
|
27722
|
-
const image_nums = vision_tokens.filter((x) => x == image_token_id).length;
|
|
27723
|
-
const video_nums = vision_tokens.filter((x) => x == video_token_id).length;
|
|
27724
|
-
let llm_pos_ids_list = [];
|
|
27725
|
-
let st2 = 0;
|
|
27726
|
-
let remain_images = image_nums;
|
|
27727
|
-
let remain_videos = video_nums;
|
|
27728
|
-
for (let j = 0; j < vision_tokens.length; ++j) {
|
|
27729
|
-
const next_image_token = ids.findIndex((x, i2) => i2 > st2 && x == image_token_id);
|
|
27730
|
-
const next_video_token = ids.findIndex((x, i2) => i2 > st2 && x == video_token_id);
|
|
27731
|
-
const ed_image = remain_images > 0 && next_image_token !== -1 ? next_image_token : ids.length + 1;
|
|
27732
|
-
const ed_video = remain_videos > 0 && next_video_token !== -1 ? next_video_token : ids.length + 1;
|
|
27733
|
-
let ed;
|
|
27734
|
-
let t, h, w;
|
|
27735
|
-
if (ed_image < ed_video) {
|
|
27736
|
-
[t, h, w] = image_grid_thw_list[image_index];
|
|
27737
|
-
++image_index;
|
|
27738
|
-
--remain_images;
|
|
27739
|
-
ed = ed_image;
|
|
27740
|
-
} else {
|
|
27741
|
-
[t, h, w] = video_grid_thw_list[video_index];
|
|
27742
|
-
++video_index;
|
|
27743
|
-
--remain_videos;
|
|
27744
|
-
ed = ed_video;
|
|
27745
|
-
}
|
|
27746
|
-
const [llm_grid_t, llm_grid_h, llm_grid_w] = [
|
|
27747
|
-
Number(t),
|
|
27748
|
-
Math.floor(Number(h) / spatial_merge_size),
|
|
27749
|
-
Math.floor(Number(w) / spatial_merge_size)
|
|
27750
|
-
];
|
|
27751
|
-
const text_len = ed - st2;
|
|
27752
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27753
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
27754
|
-
const offset = text_len + st_idx;
|
|
27755
|
-
const grid_size = llm_grid_t * llm_grid_h * llm_grid_w;
|
|
27756
|
-
const t_index = Array.from(
|
|
27757
|
-
{ length: grid_size },
|
|
27758
|
-
(_, i2) => offset + Math.floor(i2 / (llm_grid_h * llm_grid_w))
|
|
27759
|
-
);
|
|
27760
|
-
const h_index = Array.from(
|
|
27761
|
-
{ length: grid_size },
|
|
27762
|
-
(_, i2) => offset + Math.floor(i2 / llm_grid_w) % llm_grid_h
|
|
27763
|
-
);
|
|
27764
|
-
const w_index = Array.from({ length: grid_size }, (_, i2) => offset + i2 % llm_grid_w);
|
|
27765
|
-
llm_pos_ids_list.push([t_index, h_index, w_index].flat());
|
|
27766
|
-
st2 = ed + grid_size;
|
|
27767
|
-
}
|
|
27768
|
-
if (st2 < ids.length) {
|
|
27769
|
-
const st_idx = llm_pos_ids_list.length > 0 ? max(llm_pos_ids_list.at(-1))[0] + 1 : 0;
|
|
27770
|
-
const text_len = ids.length - st2;
|
|
27771
|
-
llm_pos_ids_list.push(Array.from({ length: 3 * text_len }, (_, i2) => st_idx + i2 % text_len));
|
|
27772
|
-
}
|
|
27773
|
-
const num_items = llm_pos_ids_list.reduce((acc, x) => acc + x.length, 0);
|
|
27774
|
-
const llm_positions = new Array(num_items);
|
|
27775
|
-
let index = 0;
|
|
27776
|
-
for (let x = 0; x < 3; ++x) {
|
|
27777
|
-
for (let y = 0; y < llm_pos_ids_list.length; ++y) {
|
|
27778
|
-
const val = llm_pos_ids_list[y];
|
|
27779
|
-
const text_len = val.length / 3;
|
|
27780
|
-
for (let z = x * text_len; z < (x + 1) * text_len; ++z) {
|
|
27781
|
-
llm_positions[index++] = val[z];
|
|
27782
|
-
}
|
|
27783
|
-
}
|
|
27784
|
-
}
|
|
27785
|
-
let count2 = 0;
|
|
27786
|
-
const attn_mask = attention_mask_list[i];
|
|
27787
|
-
for (let y = 0; y < attn_mask.length; ++y) {
|
|
27788
|
-
if (attn_mask[y] == 1) {
|
|
27789
|
-
for (let x = 0; x < 3; ++x) {
|
|
27790
|
-
position_ids_list[x][i][y] = llm_positions[x * num_items / 3 + count2];
|
|
27791
|
-
}
|
|
27792
|
-
++count2;
|
|
27793
|
-
}
|
|
27794
|
-
}
|
|
27795
|
-
const max_llm_positions = max(llm_positions)[0];
|
|
27796
|
-
mrope_position_deltas.push(max_llm_positions + 1 - total_input_ids[i].length);
|
|
27797
|
-
}
|
|
27798
|
-
return [
|
|
27799
|
-
new Tensor2("int64", position_ids_list.flat(Infinity), [3, input_ids.dims[0], input_ids.dims[1]]),
|
|
27800
|
-
new Tensor2("int64", mrope_position_deltas, [mrope_position_deltas.length, 1])
|
|
27801
|
-
];
|
|
27802
|
-
} else {
|
|
27803
|
-
if (attention_mask) {
|
|
27804
|
-
const { data, dims } = cumsum_masked_fill(attention_mask);
|
|
27805
|
-
const position_ids = BigInt64Array.from({ length: 3 * data.length }, (_, i) => data[i % data.length]);
|
|
27806
|
-
const mrope_position_deltas2 = Array.from(
|
|
27807
|
-
{ length: dims[0] },
|
|
27808
|
-
(_, i) => max(data.subarray(dims[1] * i, dims[1] * (i + 1)))[0] + 1n + BigInt(dims[1])
|
|
27809
|
-
);
|
|
27810
|
-
return [
|
|
27811
|
-
new Tensor2("int64", position_ids, [3, ...dims]),
|
|
27812
|
-
new Tensor2("int64", mrope_position_deltas2, [mrope_position_deltas2.length, 1])
|
|
27813
|
-
];
|
|
27814
|
-
} else {
|
|
27815
|
-
const [batch_size, seq_length] = input_ids.dims;
|
|
27816
|
-
const position_ids = BigInt64Array.from(
|
|
27817
|
-
{ length: 3 * batch_size * seq_length },
|
|
27818
|
-
(_, i) => BigInt(Math.floor(i % seq_length / batch_size))
|
|
27819
|
-
);
|
|
27820
|
-
return [new Tensor2("int64", position_ids, [3, ...input_ids.dims]), zeros([batch_size, 1])];
|
|
27821
|
-
}
|
|
27822
|
-
}
|
|
27823
|
-
}
|
|
27824
|
-
async encode_image({ pixel_values, image_grid_thw }) {
|
|
27825
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], {
|
|
27826
|
-
pixel_values,
|
|
27827
|
-
[this.image_grid_thw_name]: image_grid_thw
|
|
27828
|
-
})).image_features;
|
|
27829
|
-
return features;
|
|
27830
|
-
}
|
|
27831
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
27832
|
-
return default_merge_input_ids_with_image_features({
|
|
27833
|
-
// @ts-ignore
|
|
27834
|
-
image_token_id: this.config.image_token_id,
|
|
27835
|
-
...kwargs
|
|
27836
|
-
});
|
|
27837
|
-
}
|
|
27838
|
-
prepare_inputs_for_generation(input_ids, model_inputs, generation_config) {
|
|
27839
|
-
if (model_inputs.attention_mask && !model_inputs.position_ids) {
|
|
27840
|
-
if (!model_inputs.past_key_values) {
|
|
27841
|
-
[model_inputs.position_ids, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27842
|
-
model_inputs.input_ids,
|
|
27843
|
-
model_inputs.image_grid_thw,
|
|
27844
|
-
model_inputs.video_grid_thw,
|
|
27845
|
-
model_inputs.attention_mask
|
|
27846
|
-
);
|
|
27847
|
-
} else {
|
|
27848
|
-
model_inputs.pixel_values = null;
|
|
27849
|
-
const past_length = getPastLength(model_inputs.past_key_values);
|
|
27850
|
-
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
27851
|
-
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
27852
|
-
model_inputs.input_ids,
|
|
27853
|
-
model_inputs.image_grid_thw,
|
|
27854
|
-
model_inputs.video_grid_thw,
|
|
27855
|
-
model_inputs.attention_mask
|
|
27856
|
-
);
|
|
27857
|
-
model_inputs.rope_deltas = rope_deltas;
|
|
27858
|
-
model_inputs.position_ids = full_position_ids.slice(null, null, [past_length, null]);
|
|
27859
|
-
model_inputs.input_ids = model_inputs.input_ids.slice(null, [past_length, null]);
|
|
27860
|
-
} else {
|
|
27861
|
-
if (!model_inputs.rope_deltas) {
|
|
27862
|
-
[, model_inputs.rope_deltas] = this.get_rope_index(
|
|
27863
|
-
model_inputs.input_ids,
|
|
27864
|
-
model_inputs.image_grid_thw,
|
|
27865
|
-
model_inputs.video_grid_thw,
|
|
27866
|
-
model_inputs.attention_mask
|
|
27867
|
-
);
|
|
27868
|
-
}
|
|
27869
|
-
const delta = BigInt(past_length);
|
|
27870
|
-
const rope_deltas_list = model_inputs.rope_deltas.map((x) => delta + x);
|
|
27871
|
-
model_inputs.position_ids = stack([rope_deltas_list, rope_deltas_list, rope_deltas_list], 0);
|
|
27872
|
-
}
|
|
27873
|
-
}
|
|
27874
|
-
}
|
|
27875
|
-
return model_inputs;
|
|
27876
|
-
}
|
|
27877
|
-
};
|
|
27878
|
-
|
|
27879
|
-
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
27880
|
-
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
27881
|
-
image_grid_thw_name = "image_grid_thw";
|
|
27882
|
-
};
|
|
27883
|
-
|
|
27884
29057
|
// src/models/qwen3/modeling_qwen3.js
|
|
27885
29058
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
27886
29059
|
};
|
|
@@ -27908,18 +29081,26 @@ var Qwen3NextForCausalLM = class extends Qwen3NextPreTrainedModel {
|
|
|
27908
29081
|
// src/models/qwen3_vl/modeling_qwen3_vl.js
|
|
27909
29082
|
var Qwen3VLForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
27910
29083
|
};
|
|
29084
|
+
var Qwen3VLForCausalLM = class extends Qwen2_5_VLForCausalLM {
|
|
29085
|
+
};
|
|
27911
29086
|
|
|
27912
29087
|
// src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js
|
|
27913
29088
|
var Qwen3VLMoeForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
27914
29089
|
};
|
|
29090
|
+
var Qwen3VLMoeForCausalLM = class extends Qwen3VLForCausalLM {
|
|
29091
|
+
};
|
|
27915
29092
|
|
|
27916
29093
|
// src/models/qwen3_5/modeling_qwen3_5.js
|
|
27917
29094
|
var Qwen3_5ForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
27918
29095
|
};
|
|
29096
|
+
var Qwen3_5ForCausalLM = class extends Qwen3_5ForConditionalGeneration {
|
|
29097
|
+
};
|
|
27919
29098
|
|
|
27920
29099
|
// src/models/qwen3_5_moe/modeling_qwen3_5_moe.js
|
|
27921
29100
|
var Qwen3_5MoeForConditionalGeneration = class extends Qwen3_5ForConditionalGeneration {
|
|
27922
29101
|
};
|
|
29102
|
+
var Qwen3_5MoeForCausalLM = class extends Qwen3_5ForCausalLM {
|
|
29103
|
+
};
|
|
27923
29104
|
|
|
27924
29105
|
// src/models/resnet/modeling_resnet.js
|
|
27925
29106
|
var ResNetPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -28318,6 +29499,14 @@ var SnacDecoderModel = class extends SnacPreTrainedModel {
|
|
|
28318
29499
|
}
|
|
28319
29500
|
};
|
|
28320
29501
|
|
|
29502
|
+
// src/models/solar_open/modeling_solar_open.js
|
|
29503
|
+
var SolarOpenPreTrainedModel = class extends PreTrainedModel {
|
|
29504
|
+
};
|
|
29505
|
+
var SolarOpenModel = class extends SolarOpenPreTrainedModel {
|
|
29506
|
+
};
|
|
29507
|
+
var SolarOpenForCausalLM = class extends SolarOpenPreTrainedModel {
|
|
29508
|
+
};
|
|
29509
|
+
|
|
28321
29510
|
// src/models/speecht5/modeling_speecht5.js
|
|
28322
29511
|
var SpeechT5PreTrainedModel = class extends PreTrainedModel {
|
|
28323
29512
|
};
|
|
@@ -28600,25 +29789,6 @@ var TrOCRPreTrainedModel = class extends PreTrainedModel {
|
|
|
28600
29789
|
var TrOCRForCausalLM = class extends TrOCRPreTrainedModel {
|
|
28601
29790
|
};
|
|
28602
29791
|
|
|
28603
|
-
// src/models/ultravox/modeling_ultravox.js
|
|
28604
|
-
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
28605
|
-
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
28606
|
-
};
|
|
28607
|
-
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
28608
|
-
_merge_input_ids_with_audio_features(kwargs) {
|
|
28609
|
-
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
28610
|
-
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
28611
|
-
return default_merge_input_ids_with_audio_features({
|
|
28612
|
-
// @ts-ignore
|
|
28613
|
-
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id,
|
|
28614
|
-
...kwargs,
|
|
28615
|
-
audio_features: reshaped_audio_features
|
|
28616
|
-
});
|
|
28617
|
-
}
|
|
28618
|
-
};
|
|
28619
|
-
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
28620
|
-
};
|
|
28621
|
-
|
|
28622
29792
|
// src/models/unispeech/modeling_unispeech.js
|
|
28623
29793
|
var UniSpeechPreTrainedModel = class extends PreTrainedModel {
|
|
28624
29794
|
};
|
|
@@ -28784,6 +29954,170 @@ var VitsModel = class extends VitsPreTrainedModel {
|
|
|
28784
29954
|
}
|
|
28785
29955
|
};
|
|
28786
29956
|
|
|
29957
|
+
// src/models/voxtral/modeling_voxtral.js
|
|
29958
|
+
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
29959
|
+
};
|
|
29960
|
+
|
|
29961
|
+
// src/models/voxtral_realtime/modeling_voxtral_realtime.js
|
|
29962
|
+
var CONV1_LEFT_PAD = 2;
|
|
29963
|
+
var CONV2_LEFT_PAD = 1;
|
|
29964
|
+
var states = /* @__PURE__ */ new WeakMap();
|
|
29965
|
+
function createEncoderState(model, input_features) {
|
|
29966
|
+
const { text_config, audio_config } = (
|
|
29967
|
+
/** @type {any} */
|
|
29968
|
+
model.config
|
|
29969
|
+
);
|
|
29970
|
+
const encoder_session = model.sessions["audio_encoder"];
|
|
29971
|
+
const { num_mel_bins, hidden_size: enc_hidden_size } = audio_config;
|
|
29972
|
+
const PADDING_CACHE_CHANNELS = num_mel_bins + enc_hidden_size;
|
|
29973
|
+
const enc_kv_cache = new DynamicCache();
|
|
29974
|
+
const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? "float32";
|
|
29975
|
+
const enc_cls = enc_dtype === "float16" ? DataTypeMap.float16 : DataTypeMap.float32;
|
|
29976
|
+
const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
|
|
29977
|
+
for (const name in enc_shapes) {
|
|
29978
|
+
const size = enc_shapes[name].reduce((a, b) => a * b, 1);
|
|
29979
|
+
enc_kv_cache[name] = new Tensor2(enc_dtype, new enc_cls(size), enc_shapes[name]);
|
|
29980
|
+
}
|
|
29981
|
+
const enc_padding_cache = new Tensor2(enc_dtype, new enc_cls(PADDING_CACHE_CHANNELS * CONV1_LEFT_PAD), [
|
|
29982
|
+
1,
|
|
29983
|
+
PADDING_CACHE_CHANNELS,
|
|
29984
|
+
CONV1_LEFT_PAD
|
|
29985
|
+
]);
|
|
29986
|
+
const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
|
|
29987
|
+
if (!chunks_iter) {
|
|
29988
|
+
throw new Error("input_features must be iterable or async iterable");
|
|
29989
|
+
}
|
|
29990
|
+
return {
|
|
29991
|
+
encoder_session,
|
|
29992
|
+
enc_kv_cache,
|
|
29993
|
+
enc_padding_cache,
|
|
29994
|
+
enc_past_seq_len: 0,
|
|
29995
|
+
audio_embed_queue: [],
|
|
29996
|
+
audio_embed_total_tokens: 0,
|
|
29997
|
+
audio_queue_offset: 0,
|
|
29998
|
+
audio_consumed: 0,
|
|
29999
|
+
stream_exhausted: false,
|
|
30000
|
+
chunks_iter,
|
|
30001
|
+
text_hidden_size: text_config.hidden_size
|
|
30002
|
+
};
|
|
30003
|
+
}
|
|
30004
|
+
async function encodeChunk(s, chunk_features) {
|
|
30005
|
+
const audio_seq_len = chunk_features.dims[2];
|
|
30006
|
+
const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1;
|
|
30007
|
+
const position_ids = new Tensor2(
|
|
30008
|
+
"int64",
|
|
30009
|
+
BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
|
|
30010
|
+
[1, conv2_output_len]
|
|
30011
|
+
);
|
|
30012
|
+
const total_seq_len = s.enc_past_seq_len + conv2_output_len;
|
|
30013
|
+
const attention_mask = ones([1, total_seq_len]);
|
|
30014
|
+
const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
|
|
30015
|
+
input_features: chunk_features,
|
|
30016
|
+
attention_mask,
|
|
30017
|
+
position_ids,
|
|
30018
|
+
past_padding_cache: s.enc_padding_cache,
|
|
30019
|
+
...s.enc_kv_cache
|
|
30020
|
+
});
|
|
30021
|
+
if (s.enc_padding_cache.location === "gpu-buffer") {
|
|
30022
|
+
s.enc_padding_cache.dispose();
|
|
30023
|
+
}
|
|
30024
|
+
s.enc_padding_cache = present_padding_cache;
|
|
30025
|
+
for (const name in present_cache) {
|
|
30026
|
+
if (name.startsWith("present.")) {
|
|
30027
|
+
const pastName = name.replace("present", "past_key_values");
|
|
30028
|
+
const prev = s.enc_kv_cache[pastName];
|
|
30029
|
+
if (prev?.location === "gpu-buffer") {
|
|
30030
|
+
prev.dispose();
|
|
30031
|
+
}
|
|
30032
|
+
s.enc_kv_cache[pastName] = present_cache[name];
|
|
30033
|
+
}
|
|
30034
|
+
}
|
|
30035
|
+
s.enc_past_seq_len = total_seq_len;
|
|
30036
|
+
return audio_embeds;
|
|
30037
|
+
}
|
|
30038
|
+
async function fillAudioBuffer(s, needed) {
|
|
30039
|
+
while (s.audio_embed_total_tokens < needed && !s.stream_exhausted) {
|
|
30040
|
+
const result = await s.chunks_iter.next();
|
|
30041
|
+
if (result.done) {
|
|
30042
|
+
s.stream_exhausted = true;
|
|
30043
|
+
break;
|
|
30044
|
+
}
|
|
30045
|
+
const new_embeds = await encodeChunk(s, result.value);
|
|
30046
|
+
s.audio_embed_queue.push({ data: new_embeds.data, tokens: new_embeds.dims[1] });
|
|
30047
|
+
s.audio_embed_total_tokens += new_embeds.dims[1];
|
|
30048
|
+
}
|
|
30049
|
+
}
|
|
30050
|
+
function addAudioEmbeddings(s, inputs_embeds, current_len) {
|
|
30051
|
+
if (s.audio_embed_queue.length === 0) return;
|
|
30052
|
+
const embed_data = inputs_embeds.data;
|
|
30053
|
+
let embed_write_pos = 0;
|
|
30054
|
+
let remaining = current_len;
|
|
30055
|
+
while (remaining > 0 && s.audio_embed_queue.length > 0) {
|
|
30056
|
+
const front = s.audio_embed_queue[0];
|
|
30057
|
+
const available = front.tokens - s.audio_queue_offset;
|
|
30058
|
+
const n = Math.min(remaining, available);
|
|
30059
|
+
const src_offset = s.audio_queue_offset * s.text_hidden_size;
|
|
30060
|
+
for (let i = 0; i < n * s.text_hidden_size; ++i) {
|
|
30061
|
+
embed_data[embed_write_pos * s.text_hidden_size + i] += front.data[src_offset + i];
|
|
30062
|
+
}
|
|
30063
|
+
embed_write_pos += n;
|
|
30064
|
+
remaining -= n;
|
|
30065
|
+
s.audio_queue_offset += n;
|
|
30066
|
+
if (s.audio_queue_offset >= front.tokens) {
|
|
30067
|
+
s.audio_embed_queue.shift();
|
|
30068
|
+
s.audio_queue_offset = 0;
|
|
30069
|
+
}
|
|
30070
|
+
}
|
|
30071
|
+
s.audio_consumed += current_len - remaining;
|
|
30072
|
+
}
|
|
30073
|
+
var AudioExhaustedCriteria = class extends StoppingCriteria {
|
|
30074
|
+
constructor(enc_state) {
|
|
30075
|
+
super();
|
|
30076
|
+
this._s = enc_state;
|
|
30077
|
+
}
|
|
30078
|
+
_call(input_ids) {
|
|
30079
|
+
const done = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
|
|
30080
|
+
return input_ids.map(() => done);
|
|
30081
|
+
}
|
|
30082
|
+
};
|
|
30083
|
+
var VoxtralRealtimePreTrainedModel = class extends PreTrainedModel {
|
|
30084
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "past_key_values"];
|
|
30085
|
+
};
|
|
30086
|
+
var VoxtralRealtimeForConditionalGeneration = class extends VoxtralRealtimePreTrainedModel {
|
|
30087
|
+
async forward({ input_ids, past_key_values, ...kwargs }) {
|
|
30088
|
+
const current_len = input_ids.dims[1];
|
|
30089
|
+
const enc = states.get(this);
|
|
30090
|
+
if (enc) {
|
|
30091
|
+
await fillAudioBuffer(enc, enc.audio_consumed + current_len);
|
|
30092
|
+
}
|
|
30093
|
+
const { inputs_embeds } = await sessionRun(this.sessions["embed_tokens"], { input_ids });
|
|
30094
|
+
if (enc) {
|
|
30095
|
+
addAudioEmbeddings(enc, inputs_embeds, current_len);
|
|
30096
|
+
}
|
|
30097
|
+
const decoder_feeds = { inputs_embeds, ...kwargs };
|
|
30098
|
+
this.addPastKeyValues(decoder_feeds, past_key_values);
|
|
30099
|
+
const session = this.sessions["decoder_model_merged"];
|
|
30100
|
+
const fixed = pick(decoder_feeds, session.inputNames);
|
|
30101
|
+
return await sessionRun(session, fixed);
|
|
30102
|
+
}
|
|
30103
|
+
async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
|
|
30104
|
+
if (!input_features) {
|
|
30105
|
+
throw new Error("input_features (generator/iterable) must be provided");
|
|
30106
|
+
}
|
|
30107
|
+
const enc_state = createEncoderState(this, input_features);
|
|
30108
|
+
states.set(this, enc_state);
|
|
30109
|
+
const stopping_criteria = new StoppingCriteriaList();
|
|
30110
|
+
stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
|
|
30111
|
+
if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);
|
|
30112
|
+
try {
|
|
30113
|
+
return await super.generate({ ...kwargs, stopping_criteria });
|
|
30114
|
+
} finally {
|
|
30115
|
+
enc_state.enc_kv_cache.dispose();
|
|
30116
|
+
states.delete(this);
|
|
30117
|
+
}
|
|
30118
|
+
}
|
|
30119
|
+
};
|
|
30120
|
+
|
|
28787
30121
|
// src/models/wav2vec2_bert/modeling_wav2vec2_bert.js
|
|
28788
30122
|
var Wav2Vec2BertPreTrainedModel = class extends PreTrainedModel {
|
|
28789
30123
|
};
|
|
@@ -29289,6 +30623,7 @@ var YoutuForCausalLM = class extends YoutuPreTrainedModel {
|
|
|
29289
30623
|
// src/models/registry.js
|
|
29290
30624
|
var MODEL_MAPPING_NAMES_ENCODER_ONLY = /* @__PURE__ */ new Map([
|
|
29291
30625
|
["bert", "BertModel"],
|
|
30626
|
+
["eurobert", "EuroBertModel"],
|
|
29292
30627
|
["neobert", "NeoBertModel"],
|
|
29293
30628
|
["modernbert", "ModernBertModel"],
|
|
29294
30629
|
["nomic_bert", "NomicBertModel"],
|
|
@@ -29420,6 +30755,7 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
29420
30755
|
["gemma3_text", "Gemma3Model"],
|
|
29421
30756
|
["helium", "HeliumModel"],
|
|
29422
30757
|
["glm", "GlmModel"],
|
|
30758
|
+
["glm_moe_dsa", "GlmMoeDsaModel"],
|
|
29423
30759
|
["openelm", "OpenELMModel"],
|
|
29424
30760
|
["qwen2", "Qwen2Model"],
|
|
29425
30761
|
["qwen2_moe", "Qwen2MoeModel"],
|
|
@@ -29431,12 +30767,16 @@ var MODEL_MAPPING_NAMES_DECODER_ONLY = /* @__PURE__ */ new Map([
|
|
|
29431
30767
|
["mpt", "MptModel"],
|
|
29432
30768
|
["opt", "OPTModel"],
|
|
29433
30769
|
["mistral", "MistralModel"],
|
|
30770
|
+
["mistral4", "Mistral4Model"],
|
|
29434
30771
|
["ministral", "MinistralModel"],
|
|
29435
30772
|
["ministral3", "Ministral3Model"],
|
|
29436
30773
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
29437
30774
|
["starcoder2", "Starcoder2Model"],
|
|
30775
|
+
["deepseek_v3", "DeepseekV3Model"],
|
|
29438
30776
|
["falcon", "FalconModel"],
|
|
29439
30777
|
["falcon_h1", "FalconH1Model"],
|
|
30778
|
+
["nemotron_h", "NemotronHModel"],
|
|
30779
|
+
["solar_open", "SolarOpenModel"],
|
|
29440
30780
|
["stablelm", "StableLmModel"],
|
|
29441
30781
|
["modernbert-decoder", "ModernBertDecoderModel"],
|
|
29442
30782
|
["hunyuan_v1_dense", "HunYuanDenseV1Model"],
|
|
@@ -29456,6 +30796,7 @@ var MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29456
30796
|
]);
|
|
29457
30797
|
var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29458
30798
|
["bert", "BertForSequenceClassification"],
|
|
30799
|
+
["eurobert", "EuroBertForSequenceClassification"],
|
|
29459
30800
|
["neobert", "NeoBertForSequenceClassification"],
|
|
29460
30801
|
["modernbert", "ModernBertForSequenceClassification"],
|
|
29461
30802
|
["roformer", "RoFormerForSequenceClassification"],
|
|
@@ -29478,6 +30819,7 @@ var MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29478
30819
|
]);
|
|
29479
30820
|
var MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29480
30821
|
["bert", "BertForTokenClassification"],
|
|
30822
|
+
["eurobert", "EuroBertForTokenClassification"],
|
|
29481
30823
|
["neobert", "NeoBertForTokenClassification"],
|
|
29482
30824
|
["modernbert", "ModernBertForTokenClassification"],
|
|
29483
30825
|
["roformer", "RoFormerForTokenClassification"],
|
|
@@ -29537,27 +30879,40 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29537
30879
|
["gemma2", "Gemma2ForCausalLM"],
|
|
29538
30880
|
["vaultgemma", "VaultGemmaForCausalLM"],
|
|
29539
30881
|
["gemma3_text", "Gemma3ForCausalLM"],
|
|
30882
|
+
["gemma3", "Gemma3ForCausalLM"],
|
|
29540
30883
|
["helium", "HeliumForCausalLM"],
|
|
29541
30884
|
["glm", "GlmForCausalLM"],
|
|
30885
|
+
["glm_moe_dsa", "GlmMoeDsaForCausalLM"],
|
|
29542
30886
|
["openelm", "OpenELMForCausalLM"],
|
|
29543
30887
|
["qwen2", "Qwen2ForCausalLM"],
|
|
29544
30888
|
["qwen2_moe", "Qwen2MoeForCausalLM"],
|
|
29545
30889
|
["qwen3", "Qwen3ForCausalLM"],
|
|
29546
30890
|
["qwen3_moe", "Qwen3MoeForCausalLM"],
|
|
29547
30891
|
["qwen3_next", "Qwen3NextForCausalLM"],
|
|
30892
|
+
["qwen2_vl", "Qwen2VLForCausalLM"],
|
|
30893
|
+
["qwen2_5_vl", "Qwen2_5_VLForCausalLM"],
|
|
30894
|
+
["qwen3_vl", "Qwen3VLForCausalLM"],
|
|
30895
|
+
["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
|
|
30896
|
+
["qwen3_5", "Qwen3_5ForCausalLM"],
|
|
30897
|
+
["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
|
|
30898
|
+
["gemma3n", "Gemma3nForCausalLM"],
|
|
29548
30899
|
["phi", "PhiForCausalLM"],
|
|
29549
30900
|
["phi3", "Phi3ForCausalLM"],
|
|
29550
30901
|
["mpt", "MptForCausalLM"],
|
|
29551
30902
|
["opt", "OPTForCausalLM"],
|
|
29552
30903
|
["mbart", "MBartForCausalLM"],
|
|
29553
30904
|
["mistral", "MistralForCausalLM"],
|
|
30905
|
+
["mistral4", "Mistral4ForCausalLM"],
|
|
29554
30906
|
["ministral", "MinistralForCausalLM"],
|
|
29555
30907
|
["ministral3", "Ministral3ForCausalLM"],
|
|
29556
30908
|
["ernie4_5", "Ernie4_5ForCausalLM"],
|
|
29557
30909
|
["starcoder2", "Starcoder2ForCausalLM"],
|
|
30910
|
+
["deepseek_v3", "DeepseekV3ForCausalLM"],
|
|
29558
30911
|
["falcon", "FalconForCausalLM"],
|
|
29559
30912
|
["falcon_h1", "FalconH1ForCausalLM"],
|
|
30913
|
+
["nemotron_h", "NemotronHForCausalLM"],
|
|
29560
30914
|
["trocr", "TrOCRForCausalLM"],
|
|
30915
|
+
["solar_open", "SolarOpenForCausalLM"],
|
|
29561
30916
|
["stablelm", "StableLmForCausalLM"],
|
|
29562
30917
|
["modernbert-decoder", "ModernBertDecoderForCausalLM"],
|
|
29563
30918
|
["hunyuan_v1_dense", "HunYuanDenseV1ForCausalLM"],
|
|
@@ -29568,6 +30923,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29568
30923
|
var MODEL_FOR_MULTIMODALITY_MAPPING_NAMES = /* @__PURE__ */ new Map([["multi_modality", "MultiModalityCausalLM"]]);
|
|
29569
30924
|
var MODEL_FOR_MASKED_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29570
30925
|
["bert", "BertForMaskedLM"],
|
|
30926
|
+
["eurobert", "EuroBertForMaskedLM"],
|
|
29571
30927
|
["neobert", "NeoBertForMaskedLM"],
|
|
29572
30928
|
["modernbert", "ModernBertForMaskedLM"],
|
|
29573
30929
|
["roformer", "RoFormerForMaskedLM"],
|
|
@@ -29620,16 +30976,21 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29620
30976
|
["qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"],
|
|
29621
30977
|
["qwen3_5", "Qwen3_5ForConditionalGeneration"],
|
|
29622
30978
|
["qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"],
|
|
30979
|
+
["lfm2_vl", "Lfm2VlForConditionalGeneration"],
|
|
29623
30980
|
["idefics3", "Idefics3ForConditionalGeneration"],
|
|
29624
30981
|
["smolvlm", "SmolVLMForConditionalGeneration"],
|
|
29625
30982
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
29626
30983
|
["llava_qwen2", "LlavaQwen2ForCausalLM"],
|
|
29627
30984
|
["gemma3n", "Gemma3nForConditionalGeneration"],
|
|
29628
|
-
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
30985
|
+
["mistral3", "Mistral3ForConditionalGeneration"],
|
|
30986
|
+
["lighton_ocr", "LightOnOcrForConditionalGeneration"],
|
|
30987
|
+
["glm_ocr", "GlmOcrForConditionalGeneration"]
|
|
29629
30988
|
]);
|
|
29630
30989
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30990
|
+
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
29631
30991
|
["ultravox", "UltravoxModel"],
|
|
29632
|
-
["voxtral", "VoxtralForConditionalGeneration"]
|
|
30992
|
+
["voxtral", "VoxtralForConditionalGeneration"],
|
|
30993
|
+
["voxtral_realtime", "VoxtralRealtimeForConditionalGeneration"]
|
|
29633
30994
|
]);
|
|
29634
30995
|
var MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29635
30996
|
["vision-encoder-decoder", "VisionEncoderDecoderModel"]
|
|
@@ -29728,6 +31089,7 @@ var MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29728
31089
|
]);
|
|
29729
31090
|
var MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = /* @__PURE__ */ new Map([["swin2sr", "Swin2SRForImageSuperResolution"]]);
|
|
29730
31091
|
var MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
31092
|
+
["chmv2", "CHMv2ForDepthEstimation"],
|
|
29731
31093
|
["dpt", "DPTForDepthEstimation"],
|
|
29732
31094
|
["depth_anything", "DepthAnythingForDepthEstimation"],
|
|
29733
31095
|
["glpn", "GLPNForDepthEstimation"],
|
|
@@ -29812,7 +31174,19 @@ var CUSTOM_MAPPING = [
|
|
|
29812
31174
|
MODEL_TYPES.ImageAudioTextToText
|
|
29813
31175
|
],
|
|
29814
31176
|
["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
|
|
29815
|
-
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox]
|
|
31177
|
+
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
|
|
31178
|
+
["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31179
|
+
["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31180
|
+
["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31181
|
+
["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31182
|
+
["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31183
|
+
["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31184
|
+
["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
31185
|
+
[
|
|
31186
|
+
"VoxtralRealtimeForConditionalGeneration",
|
|
31187
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
31188
|
+
MODEL_TYPES.VoxtralRealtime
|
|
31189
|
+
]
|
|
29816
31190
|
];
|
|
29817
31191
|
for (const [name, model, type] of CUSTOM_MAPPING) {
|
|
29818
31192
|
MODEL_TYPE_MAPPING.set(name, type);
|
|
@@ -31490,8 +32864,18 @@ var TASK_ALIASES = Object.freeze({
|
|
|
31490
32864
|
});
|
|
31491
32865
|
|
|
31492
32866
|
// src/utils/model_registry/get_model_files.js
|
|
32867
|
+
function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
|
|
32868
|
+
if (config !== null) {
|
|
32869
|
+
return AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision });
|
|
32870
|
+
}
|
|
32871
|
+
const key = JSON.stringify([modelId, cache_dir, local_files_only, revision]);
|
|
32872
|
+
return memoizePromise(
|
|
32873
|
+
key,
|
|
32874
|
+
() => AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision })
|
|
32875
|
+
);
|
|
32876
|
+
}
|
|
31493
32877
|
async function get_model_files(modelId, { config = null, dtype: overrideDtype = null, device: overrideDevice = null, model_file_name = null } = {}) {
|
|
31494
|
-
config = await
|
|
32878
|
+
config = await get_config(modelId, { config });
|
|
31495
32879
|
const files = [
|
|
31496
32880
|
// Add config.json (always loaded)
|
|
31497
32881
|
"config.json"
|
|
@@ -31552,74 +32936,14 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
|
|
|
31552
32936
|
files.push(dataFilePath);
|
|
31553
32937
|
}
|
|
31554
32938
|
};
|
|
31555
|
-
const
|
|
31556
|
-
|
|
31557
|
-
add_model_file(
|
|
31558
|
-
|
|
31559
|
-
|
|
31560
|
-
|
|
31561
|
-
|
|
31562
|
-
|
|
31563
|
-
add_model_file("decoder_model_merged");
|
|
31564
|
-
files.push("generation_config.json");
|
|
31565
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
31566
|
-
add_model_file("model", "vision_encoder");
|
|
31567
|
-
add_model_file("prompt_encoder_mask_decoder");
|
|
31568
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
31569
|
-
add_model_file("model", "encoder_model");
|
|
31570
|
-
add_model_file("decoder_model_merged");
|
|
31571
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
31572
|
-
add_model_file("embed_tokens");
|
|
31573
|
-
add_model_file("vision_encoder");
|
|
31574
|
-
add_model_file("decoder_model_merged");
|
|
31575
|
-
if (config.is_encoder_decoder) {
|
|
31576
|
-
add_model_file("model", "encoder_model");
|
|
31577
|
-
}
|
|
31578
|
-
files.push("generation_config.json");
|
|
31579
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
31580
|
-
add_model_file("embed_tokens");
|
|
31581
|
-
add_model_file("audio_encoder");
|
|
31582
|
-
add_model_file("decoder_model_merged");
|
|
31583
|
-
files.push("generation_config.json");
|
|
31584
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
31585
|
-
add_model_file("embed_tokens");
|
|
31586
|
-
add_model_file("audio_encoder");
|
|
31587
|
-
add_model_file("vision_encoder");
|
|
31588
|
-
add_model_file("decoder_model_merged");
|
|
31589
|
-
files.push("generation_config.json");
|
|
31590
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
31591
|
-
add_model_file("model", "text_encoder");
|
|
31592
|
-
add_model_file("decoder_model_merged");
|
|
31593
|
-
add_model_file("encodec_decode");
|
|
31594
|
-
files.push("generation_config.json");
|
|
31595
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
31596
|
-
add_model_file("prepare_inputs_embeds");
|
|
31597
|
-
add_model_file("model", "language_model");
|
|
31598
|
-
add_model_file("lm_head");
|
|
31599
|
-
add_model_file("gen_head");
|
|
31600
|
-
add_model_file("gen_img_embeds");
|
|
31601
|
-
add_model_file("image_decode");
|
|
31602
|
-
files.push("generation_config.json");
|
|
31603
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
31604
|
-
add_model_file("prepare_inputs_embeds");
|
|
31605
|
-
add_model_file("model");
|
|
31606
|
-
add_model_file("vision_encoder");
|
|
31607
|
-
files.push("generation_config.json");
|
|
31608
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
31609
|
-
add_model_file("embed_tokens");
|
|
31610
|
-
add_model_file("speech_encoder");
|
|
31611
|
-
add_model_file("model", "language_model");
|
|
31612
|
-
add_model_file("conditional_decoder");
|
|
31613
|
-
files.push("generation_config.json");
|
|
31614
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
31615
|
-
add_model_file("encoder_model");
|
|
31616
|
-
add_model_file("decoder_model");
|
|
31617
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
31618
|
-
add_model_file("text_encoder");
|
|
31619
|
-
add_model_file("latent_denoiser");
|
|
31620
|
-
add_model_file("voice_decoder");
|
|
31621
|
-
} else {
|
|
31622
|
-
add_model_file("model", singleModelName);
|
|
32939
|
+
const { sessions, optional_configs } = getSessionsConfig(modelType, config, { model_file_name });
|
|
32940
|
+
for (const [sessionKey, baseName] of Object.entries(sessions)) {
|
|
32941
|
+
add_model_file(sessionKey, baseName);
|
|
32942
|
+
}
|
|
32943
|
+
if (optional_configs) {
|
|
32944
|
+
for (const configFile of Object.values(optional_configs)) {
|
|
32945
|
+
files.push(configFile);
|
|
32946
|
+
}
|
|
31623
32947
|
}
|
|
31624
32948
|
return files;
|
|
31625
32949
|
}
|
|
@@ -32070,25 +33394,25 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
|
|
|
32070
33394
|
|
|
32071
33395
|
// src/utils/model_registry/is_cached.js
|
|
32072
33396
|
async function check_files_cache(modelId, files, options = {}) {
|
|
32073
|
-
const
|
|
32074
|
-
if (!
|
|
33397
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
33398
|
+
if (!cache2) {
|
|
32075
33399
|
const fileStatuses2 = files.map((filename) => ({ file: filename, cached: false }));
|
|
32076
33400
|
return { allCached: false, files: fileStatuses2 };
|
|
32077
33401
|
}
|
|
32078
33402
|
const fileStatuses = await Promise.all(
|
|
32079
33403
|
files.map(async (filename) => {
|
|
32080
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
32081
|
-
const cached = await checkCachedResource(
|
|
33404
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
33405
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
32082
33406
|
return { file: filename, cached: !!cached };
|
|
32083
33407
|
})
|
|
32084
33408
|
);
|
|
32085
33409
|
return { allCached: fileStatuses.every((f) => f.cached), files: fileStatuses };
|
|
32086
33410
|
}
|
|
32087
33411
|
async function is_file_cached(modelId, filename, options = {}) {
|
|
32088
|
-
const
|
|
32089
|
-
if (!
|
|
32090
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
32091
|
-
return !!await checkCachedResource(
|
|
33412
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
33413
|
+
if (!cache2) return false;
|
|
33414
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
33415
|
+
return !!await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
32092
33416
|
}
|
|
32093
33417
|
async function is_cached(modelId, options = {}) {
|
|
32094
33418
|
if (!modelId) {
|
|
@@ -32135,26 +33459,26 @@ async function is_pipeline_cached_files(task, modelId, options = {}) {
|
|
|
32135
33459
|
|
|
32136
33460
|
// src/utils/model_registry/clear_cache.js
|
|
32137
33461
|
async function clear_files_from_cache(modelId, files, options = {}) {
|
|
32138
|
-
const
|
|
32139
|
-
if (!
|
|
33462
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
33463
|
+
if (!cache2) {
|
|
32140
33464
|
return {
|
|
32141
33465
|
filesDeleted: 0,
|
|
32142
33466
|
filesCached: 0,
|
|
32143
33467
|
files: files.map((filename) => ({ file: filename, deleted: false, wasCached: false }))
|
|
32144
33468
|
};
|
|
32145
33469
|
}
|
|
32146
|
-
if (!
|
|
33470
|
+
if (!cache2.delete) {
|
|
32147
33471
|
throw new Error("Cache does not support delete operation");
|
|
32148
33472
|
}
|
|
32149
33473
|
const results = await Promise.all(
|
|
32150
33474
|
files.map(async (filename) => {
|
|
32151
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
32152
|
-
const cached = await checkCachedResource(
|
|
33475
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
33476
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
32153
33477
|
const wasCached = !!cached;
|
|
32154
33478
|
let deleted = false;
|
|
32155
33479
|
if (wasCached) {
|
|
32156
|
-
const deletedWithProposed = await
|
|
32157
|
-
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await
|
|
33480
|
+
const deletedWithProposed = await cache2.delete(proposedCacheKey);
|
|
33481
|
+
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache2.delete(localPath) : false;
|
|
32158
33482
|
deleted = deletedWithProposed || deletedWithLocal;
|
|
32159
33483
|
}
|
|
32160
33484
|
return { file: filename, deleted, wasCached };
|
|
@@ -32505,6 +33829,9 @@ var ModelRegistry = class {
|
|
|
32505
33829
|
BloomModel,
|
|
32506
33830
|
BloomPreTrainedModel,
|
|
32507
33831
|
BloomTokenizer,
|
|
33832
|
+
CHMv2ForDepthEstimation,
|
|
33833
|
+
CHMv2ImageProcessor,
|
|
33834
|
+
CHMv2PreTrainedModel,
|
|
32508
33835
|
CLIPFeatureExtractor,
|
|
32509
33836
|
CLIPImageProcessor,
|
|
32510
33837
|
CLIPModel,
|
|
@@ -32600,6 +33927,9 @@ var ModelRegistry = class {
|
|
|
32600
33927
|
DebertaV2Tokenizer,
|
|
32601
33928
|
DecisionTransformerModel,
|
|
32602
33929
|
DecisionTransformerPreTrainedModel,
|
|
33930
|
+
DeepseekV3ForCausalLM,
|
|
33931
|
+
DeepseekV3Model,
|
|
33932
|
+
DeepseekV3PreTrainedModel,
|
|
32603
33933
|
DeiTFeatureExtractor,
|
|
32604
33934
|
DeiTForImageClassification,
|
|
32605
33935
|
DeiTImageProcessor,
|
|
@@ -32636,6 +33966,7 @@ var ModelRegistry = class {
|
|
|
32636
33966
|
DonutImageProcessor,
|
|
32637
33967
|
DonutSwinModel,
|
|
32638
33968
|
DonutSwinPreTrainedModel,
|
|
33969
|
+
DynamicCache,
|
|
32639
33970
|
EdgeTamModel,
|
|
32640
33971
|
EfficientNetForImageClassification,
|
|
32641
33972
|
EfficientNetImageProcessor,
|
|
@@ -32659,6 +33990,11 @@ var ModelRegistry = class {
|
|
|
32659
33990
|
EsmModel,
|
|
32660
33991
|
EsmPreTrainedModel,
|
|
32661
33992
|
EsmTokenizer,
|
|
33993
|
+
EuroBertForMaskedLM,
|
|
33994
|
+
EuroBertForSequenceClassification,
|
|
33995
|
+
EuroBertForTokenClassification,
|
|
33996
|
+
EuroBertModel,
|
|
33997
|
+
EuroBertPreTrainedModel,
|
|
32662
33998
|
ExaoneForCausalLM,
|
|
32663
33999
|
ExaoneModel,
|
|
32664
34000
|
ExaonePreTrainedModel,
|
|
@@ -32708,6 +34044,7 @@ var ModelRegistry = class {
|
|
|
32708
34044
|
Gemma3Model,
|
|
32709
34045
|
Gemma3PreTrainedModel,
|
|
32710
34046
|
Gemma3nAudioFeatureExtractor,
|
|
34047
|
+
Gemma3nForCausalLM,
|
|
32711
34048
|
Gemma3nForConditionalGeneration,
|
|
32712
34049
|
Gemma3nPreTrainedModel,
|
|
32713
34050
|
Gemma3nProcessor,
|
|
@@ -32715,8 +34052,14 @@ var ModelRegistry = class {
|
|
|
32715
34052
|
GemmaModel,
|
|
32716
34053
|
GemmaPreTrainedModel,
|
|
32717
34054
|
GemmaTokenizer,
|
|
34055
|
+
Glm46VImageProcessor,
|
|
34056
|
+
Glm46VProcessor,
|
|
32718
34057
|
GlmForCausalLM,
|
|
32719
34058
|
GlmModel,
|
|
34059
|
+
GlmMoeDsaForCausalLM,
|
|
34060
|
+
GlmMoeDsaModel,
|
|
34061
|
+
GlmMoeDsaPreTrainedModel,
|
|
34062
|
+
GlmOcrForConditionalGeneration,
|
|
32720
34063
|
GlmPreTrainedModel,
|
|
32721
34064
|
GptOssForCausalLM,
|
|
32722
34065
|
GptOssModel,
|
|
@@ -32727,6 +34070,9 @@ var ModelRegistry = class {
|
|
|
32727
34070
|
GraniteMoeHybridModel,
|
|
32728
34071
|
GraniteMoeHybridPreTrainedModel,
|
|
32729
34072
|
GranitePreTrainedModel,
|
|
34073
|
+
GraniteSpeechFeatureExtractor,
|
|
34074
|
+
GraniteSpeechForConditionalGeneration,
|
|
34075
|
+
GraniteSpeechProcessor,
|
|
32730
34076
|
GroundingDinoForObjectDetection,
|
|
32731
34077
|
GroundingDinoImageProcessor,
|
|
32732
34078
|
GroundingDinoPreTrainedModel,
|
|
@@ -32752,7 +34098,6 @@ var ModelRegistry = class {
|
|
|
32752
34098
|
IJepaPreTrainedModel,
|
|
32753
34099
|
Idefics3ForConditionalGeneration,
|
|
32754
34100
|
Idefics3ImageProcessor,
|
|
32755
|
-
Idefics3PreTrainedModel,
|
|
32756
34101
|
Idefics3Processor,
|
|
32757
34102
|
ImageClassificationPipeline,
|
|
32758
34103
|
ImageFeatureExtractionPipeline,
|
|
@@ -32777,6 +34122,10 @@ var ModelRegistry = class {
|
|
|
32777
34122
|
Lfm2MoeModel,
|
|
32778
34123
|
Lfm2MoePreTrainedModel,
|
|
32779
34124
|
Lfm2PreTrainedModel,
|
|
34125
|
+
Lfm2VlForConditionalGeneration,
|
|
34126
|
+
Lfm2VlImageProcessor,
|
|
34127
|
+
Lfm2VlProcessor,
|
|
34128
|
+
LightOnOcrForConditionalGeneration,
|
|
32780
34129
|
LiteWhisperForConditionalGeneration,
|
|
32781
34130
|
Llama4ForCausalLM,
|
|
32782
34131
|
Llama4PreTrainedModel,
|
|
@@ -32846,6 +34195,9 @@ var ModelRegistry = class {
|
|
|
32846
34195
|
MimiPreTrainedModel,
|
|
32847
34196
|
MinLengthLogitsProcessor,
|
|
32848
34197
|
MinNewTokensLengthLogitsProcessor,
|
|
34198
|
+
Mistral4ForCausalLM,
|
|
34199
|
+
Mistral4Model,
|
|
34200
|
+
Mistral4PreTrainedModel,
|
|
32849
34201
|
MistralForCausalLM,
|
|
32850
34202
|
MistralModel,
|
|
32851
34203
|
MistralPreTrainedModel,
|
|
@@ -32917,6 +34269,9 @@ var ModelRegistry = class {
|
|
|
32917
34269
|
NanoChatForCausalLM,
|
|
32918
34270
|
NanoChatModel,
|
|
32919
34271
|
NanoChatPreTrainedModel,
|
|
34272
|
+
NemotronHForCausalLM,
|
|
34273
|
+
NemotronHModel,
|
|
34274
|
+
NemotronHPreTrainedModel,
|
|
32920
34275
|
NeoBertForMaskedLM,
|
|
32921
34276
|
NeoBertForQuestionAnswering,
|
|
32922
34277
|
NeoBertForSequenceClassification,
|
|
@@ -32960,7 +34315,6 @@ var ModelRegistry = class {
|
|
|
32960
34315
|
Owlv2Model,
|
|
32961
34316
|
Owlv2PreTrainedModel,
|
|
32962
34317
|
PaliGemmaForConditionalGeneration,
|
|
32963
|
-
PaliGemmaPreTrainedModel,
|
|
32964
34318
|
PaliGemmaProcessor,
|
|
32965
34319
|
ParakeetFeatureExtractor,
|
|
32966
34320
|
ParakeetForCTC,
|
|
@@ -33004,10 +34358,12 @@ var ModelRegistry = class {
|
|
|
33004
34358
|
Qwen2MoePreTrainedModel,
|
|
33005
34359
|
Qwen2PreTrainedModel,
|
|
33006
34360
|
Qwen2Tokenizer,
|
|
34361
|
+
Qwen2VLForCausalLM,
|
|
33007
34362
|
Qwen2VLForConditionalGeneration,
|
|
33008
34363
|
Qwen2VLImageProcessor,
|
|
33009
34364
|
Qwen2VLPreTrainedModel,
|
|
33010
34365
|
Qwen2VLProcessor,
|
|
34366
|
+
Qwen2_5_VLForCausalLM,
|
|
33011
34367
|
Qwen2_5_VLForConditionalGeneration,
|
|
33012
34368
|
Qwen2_5_VLProcessor,
|
|
33013
34369
|
Qwen3ForCausalLM,
|
|
@@ -33019,10 +34375,14 @@ var ModelRegistry = class {
|
|
|
33019
34375
|
Qwen3NextModel,
|
|
33020
34376
|
Qwen3NextPreTrainedModel,
|
|
33021
34377
|
Qwen3PreTrainedModel,
|
|
34378
|
+
Qwen3VLForCausalLM,
|
|
33022
34379
|
Qwen3VLForConditionalGeneration,
|
|
34380
|
+
Qwen3VLMoeForCausalLM,
|
|
33023
34381
|
Qwen3VLMoeForConditionalGeneration,
|
|
33024
34382
|
Qwen3VLProcessor,
|
|
34383
|
+
Qwen3_5ForCausalLM,
|
|
33025
34384
|
Qwen3_5ForConditionalGeneration,
|
|
34385
|
+
Qwen3_5MoeForCausalLM,
|
|
33026
34386
|
Qwen3_5MoeForConditionalGeneration,
|
|
33027
34387
|
RFDetrForObjectDetection,
|
|
33028
34388
|
RFDetrModel,
|
|
@@ -33094,7 +34454,6 @@ var ModelRegistry = class {
|
|
|
33094
34454
|
SmolLM3ForCausalLM,
|
|
33095
34455
|
SmolLM3Model,
|
|
33096
34456
|
SmolLM3PreTrainedModel,
|
|
33097
|
-
SmolVLMForConditionalGeneration,
|
|
33098
34457
|
SmolVLMImageProcessor,
|
|
33099
34458
|
SmolVLMProcessor,
|
|
33100
34459
|
SnacDecoderModel,
|
|
@@ -33102,6 +34461,9 @@ var ModelRegistry = class {
|
|
|
33102
34461
|
SnacFeatureExtractor,
|
|
33103
34462
|
SnacModel,
|
|
33104
34463
|
SnacPreTrainedModel,
|
|
34464
|
+
SolarOpenForCausalLM,
|
|
34465
|
+
SolarOpenModel,
|
|
34466
|
+
SolarOpenPreTrainedModel,
|
|
33105
34467
|
SpeechT5FeatureExtractor,
|
|
33106
34468
|
SpeechT5ForSpeechToText,
|
|
33107
34469
|
SpeechT5ForTextToSpeech,
|
|
@@ -33200,6 +34562,10 @@ var ModelRegistry = class {
|
|
|
33200
34562
|
VitsTokenizer,
|
|
33201
34563
|
VoxtralForConditionalGeneration,
|
|
33202
34564
|
VoxtralProcessor,
|
|
34565
|
+
VoxtralRealtimeFeatureExtractor,
|
|
34566
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
34567
|
+
VoxtralRealtimePreTrainedModel,
|
|
34568
|
+
VoxtralRealtimeProcessor,
|
|
33203
34569
|
Wav2Vec2BertForCTC,
|
|
33204
34570
|
Wav2Vec2BertForSequenceClassification,
|
|
33205
34571
|
Wav2Vec2BertModel,
|
|
@@ -33295,7 +34661,7 @@ var ModelRegistry = class {
|
|
|
33295
34661
|
|
|
33296
34662
|
onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
|
|
33297
34663
|
(*!
|
|
33298
|
-
* ONNX Runtime Web v1.25.0-dev.
|
|
34664
|
+
* ONNX Runtime Web v1.25.0-dev.20260307-d626b568e0
|
|
33299
34665
|
* Copyright (c) Microsoft Corporation. All rights reserved.
|
|
33300
34666
|
* Licensed under the MIT License.
|
|
33301
34667
|
*)
|