@huggingface/transformers 4.0.0-next.6 → 4.0.0-next.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/ort-wasm-simd-threaded.jsep.mjs +24 -24
- package/dist/transformers.js +1587 -570
- package/dist/transformers.min.js +17 -17
- package/dist/transformers.node.cjs +1605 -573
- package/dist/transformers.node.min.cjs +21 -21
- package/dist/transformers.node.min.mjs +21 -21
- package/dist/transformers.node.mjs +1600 -583
- package/dist/transformers.web.js +1592 -575
- package/dist/transformers.web.min.js +15 -15
- package/package.json +3 -3
- package/src/cache_utils.js +62 -0
- package/src/configs.js +17 -2
- package/src/env.js +8 -1
- package/src/image_processors_utils.js +3 -3
- package/src/models/chatterbox/modeling_chatterbox.js +1 -1
- package/src/models/detr/image_processing_detr.js +1 -1
- package/src/models/feature_extractors.js +2 -0
- package/src/models/gemma3n/modeling_gemma3n.js +2 -0
- package/src/models/granite_speech/feature_extraction_granite_speech.js +58 -0
- package/src/models/granite_speech/modeling_granite_speech.js +5 -0
- package/src/models/granite_speech/processing_granite_speech.js +62 -0
- package/src/models/grounding_dino/image_processing_grounding_dino.js +1 -1
- package/src/models/idefics3/modeling_idefics3.js +5 -32
- package/src/models/image_processors.js +1 -0
- package/src/models/lfm2_vl/image_processing_lfm2_vl.js +305 -0
- package/src/models/lfm2_vl/modeling_lfm2_vl.js +13 -0
- package/src/models/lfm2_vl/processing_lfm2_vl.js +77 -0
- package/src/models/llava/modeling_llava.js +1 -1
- package/src/models/mistral3/modeling_mistral3.js +2 -2
- package/src/models/modeling_utils.js +222 -308
- package/src/models/models.js +4 -0
- package/src/models/paligemma/modeling_paligemma.js +2 -25
- package/src/models/processors.js +3 -0
- package/src/models/qwen2_5_vl/modeling_qwen2_5_vl.js +5 -1
- package/src/models/qwen2_vl/image_processing_qwen2_vl.js +1 -41
- package/src/models/qwen2_vl/modeling_qwen2_vl.js +7 -7
- package/src/models/qwen3_5/modeling_qwen3_5.js +1 -0
- package/src/models/qwen3_5_moe/modeling_qwen3_5_moe.js +2 -1
- package/src/models/qwen3_vl/modeling_qwen3_vl.js +2 -1
- package/src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js +2 -1
- package/src/models/registry.js +25 -0
- package/src/models/sam/image_processing_sam.js +1 -1
- package/src/models/session.js +17 -6
- package/src/models/smolvlm/modeling_smolvlm.js +7 -0
- package/src/models/ultravox/modeling_ultravox.js +1 -3
- package/src/models/voxtral/modeling_voxtral.js +3 -0
- package/src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js +71 -0
- package/src/models/voxtral_realtime/modeling_voxtral_realtime.js +239 -0
- package/src/models/voxtral_realtime/processing_voxtral_realtime.js +113 -0
- package/src/models/whisper/feature_extraction_whisper.js +2 -12
- package/src/transformers.js +2 -0
- package/src/utils/audio.js +18 -2
- package/src/utils/cache/CrossOriginStorageCache.js +251 -0
- package/src/utils/cache/cross-origin-storage.d.ts +38 -0
- package/src/utils/cache.js +5 -0
- package/src/utils/lru_cache.js +67 -0
- package/src/utils/memoize_promise.js +45 -0
- package/src/utils/model_registry/get_file_metadata.js +14 -2
- package/src/utils/model_registry/get_model_files.js +52 -78
- package/src/utils/tensor.js +18 -2
- package/types/cache_utils.d.ts +29 -0
- package/types/cache_utils.d.ts.map +1 -0
- package/types/configs.d.ts.map +1 -1
- package/types/env.d.ts +8 -0
- package/types/env.d.ts.map +1 -1
- package/types/image_processors_utils.d.ts +17 -1
- package/types/image_processors_utils.d.ts.map +1 -1
- package/types/models/detr/image_processing_detr.d.ts +1 -1
- package/types/models/feature_extractors.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts +2 -0
- package/types/models/gemma3n/modeling_gemma3n.d.ts.map +1 -1
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts +16 -0
- package/types/models/granite_speech/feature_extraction_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts +4 -0
- package/types/models/granite_speech/modeling_granite_speech.d.ts.map +1 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts +19 -0
- package/types/models/granite_speech/processing_granite_speech.d.ts.map +1 -0
- package/types/models/grounding_dino/image_processing_grounding_dino.d.ts +1 -1
- package/types/models/idefics3/modeling_idefics3.d.ts +2 -18
- package/types/models/idefics3/modeling_idefics3.d.ts.map +1 -1
- package/types/models/image_processors.d.ts +1 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts +41 -0
- package/types/models/lfm2_vl/image_processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts +4 -0
- package/types/models/lfm2_vl/modeling_lfm2_vl.d.ts.map +1 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts +18 -0
- package/types/models/lfm2_vl/processing_lfm2_vl.d.ts.map +1 -0
- package/types/models/mistral3/modeling_mistral3.d.ts +2 -2
- package/types/models/mistral3/modeling_mistral3.d.ts.map +1 -1
- package/types/models/modeling_utils.d.ts +44 -35
- package/types/models/modeling_utils.d.ts.map +1 -1
- package/types/models/models.d.ts +4 -0
- package/types/models/paligemma/modeling_paligemma.d.ts +2 -8
- package/types/models/paligemma/modeling_paligemma.d.ts.map +1 -1
- package/types/models/processors.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts +3 -0
- package/types/models/qwen2_5_vl/modeling_qwen2_5_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/image_processing_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts +2 -0
- package/types/models/qwen2_vl/modeling_qwen2_vl.d.ts.map +1 -1
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts +2 -0
- package/types/models/qwen3_5/modeling_qwen3_5.d.ts.map +1 -1
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts +3 -0
- package/types/models/qwen3_5_moe/modeling_qwen3_5_moe.d.ts.map +1 -1
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts +3 -0
- package/types/models/qwen3_vl/modeling_qwen3_vl.d.ts.map +1 -1
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts +3 -0
- package/types/models/qwen3_vl_moe/modeling_qwen3_vl_moe.d.ts.map +1 -1
- package/types/models/registry.d.ts.map +1 -1
- package/types/models/sam/image_processing_sam.d.ts +1 -1
- package/types/models/session.d.ts +3 -2
- package/types/models/session.d.ts.map +1 -1
- package/types/models/smolvlm/modeling_smolvlm.d.ts +8 -0
- package/types/models/smolvlm/modeling_smolvlm.d.ts.map +1 -0
- package/types/models/ultravox/modeling_ultravox.d.ts +0 -2
- package/types/models/ultravox/modeling_ultravox.d.ts.map +1 -1
- package/types/models/voxtral/modeling_voxtral.d.ts +4 -0
- package/types/models/voxtral/modeling_voxtral.d.ts.map +1 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts +28 -0
- package/types/models/voxtral_realtime/feature_extraction_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts +17 -0
- package/types/models/voxtral_realtime/modeling_voxtral_realtime.d.ts.map +1 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts +44 -0
- package/types/models/voxtral_realtime/processing_voxtral_realtime.d.ts.map +1 -0
- package/types/models/whisper/feature_extraction_whisper.d.ts.map +1 -1
- package/types/transformers.d.ts +1 -0
- package/types/transformers.d.ts.map +1 -1
- package/types/utils/audio.d.ts +5 -2
- package/types/utils/audio.d.ts.map +1 -1
- package/types/utils/cache/CrossOriginStorageCache.d.ts +120 -0
- package/types/utils/cache/CrossOriginStorageCache.d.ts.map +1 -0
- package/types/utils/cache.d.ts.map +1 -1
- package/types/utils/dtypes.d.ts +1 -1
- package/types/utils/image.d.ts +1 -1
- package/types/utils/lru_cache.d.ts +38 -0
- package/types/utils/lru_cache.d.ts.map +1 -0
- package/types/utils/memoize_promise.d.ts +14 -0
- package/types/utils/memoize_promise.d.ts.map +1 -0
- package/types/utils/model_registry/get_file_metadata.d.ts.map +1 -1
- package/types/utils/model_registry/get_model_files.d.ts +1 -0
- package/types/utils/model_registry/get_model_files.d.ts.map +1 -1
- package/types/utils/tensor.d.ts.map +1 -1
- package/src/utils/data-structures.js +0 -572
- package/types/utils/data-structures.d.ts +0 -294
- package/types/utils/data-structures.d.ts.map +0 -1
|
@@ -248,6 +248,7 @@ __export(transformers_exports, {
|
|
|
248
248
|
DonutImageProcessor: () => DonutImageProcessor,
|
|
249
249
|
DonutSwinModel: () => DonutSwinModel,
|
|
250
250
|
DonutSwinPreTrainedModel: () => DonutSwinPreTrainedModel,
|
|
251
|
+
DynamicCache: () => DynamicCache,
|
|
251
252
|
EdgeTamModel: () => EdgeTamModel,
|
|
252
253
|
EfficientNetForImageClassification: () => EfficientNetForImageClassification,
|
|
253
254
|
EfficientNetImageProcessor: () => EfficientNetImageProcessor,
|
|
@@ -320,6 +321,7 @@ __export(transformers_exports, {
|
|
|
320
321
|
Gemma3Model: () => Gemma3Model,
|
|
321
322
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
322
323
|
Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
|
|
324
|
+
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
323
325
|
Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
|
|
324
326
|
Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
|
|
325
327
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
@@ -339,6 +341,9 @@ __export(transformers_exports, {
|
|
|
339
341
|
GraniteMoeHybridModel: () => GraniteMoeHybridModel,
|
|
340
342
|
GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
|
|
341
343
|
GranitePreTrainedModel: () => GranitePreTrainedModel,
|
|
344
|
+
GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
|
|
345
|
+
GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
|
|
346
|
+
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
342
347
|
GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
|
|
343
348
|
GroundingDinoImageProcessor: () => GroundingDinoImageProcessor,
|
|
344
349
|
GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
|
|
@@ -364,7 +369,6 @@ __export(transformers_exports, {
|
|
|
364
369
|
IJepaPreTrainedModel: () => IJepaPreTrainedModel,
|
|
365
370
|
Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
|
|
366
371
|
Idefics3ImageProcessor: () => Idefics3ImageProcessor,
|
|
367
|
-
Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
|
|
368
372
|
Idefics3Processor: () => Idefics3Processor,
|
|
369
373
|
ImageClassificationPipeline: () => ImageClassificationPipeline,
|
|
370
374
|
ImageFeatureExtractionPipeline: () => ImageFeatureExtractionPipeline,
|
|
@@ -389,6 +393,9 @@ __export(transformers_exports, {
|
|
|
389
393
|
Lfm2MoeModel: () => Lfm2MoeModel,
|
|
390
394
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
391
395
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
396
|
+
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
397
|
+
Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
|
|
398
|
+
Lfm2VlProcessor: () => Lfm2VlProcessor,
|
|
392
399
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
393
400
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
394
401
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -572,7 +579,6 @@ __export(transformers_exports, {
|
|
|
572
579
|
Owlv2Model: () => Owlv2Model,
|
|
573
580
|
Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
|
|
574
581
|
PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
|
|
575
|
-
PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
|
|
576
582
|
PaliGemmaProcessor: () => PaliGemmaProcessor,
|
|
577
583
|
ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
|
|
578
584
|
ParakeetForCTC: () => ParakeetForCTC,
|
|
@@ -616,10 +622,12 @@ __export(transformers_exports, {
|
|
|
616
622
|
Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
|
|
617
623
|
Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
|
|
618
624
|
Qwen2Tokenizer: () => Qwen2Tokenizer,
|
|
625
|
+
Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
|
|
619
626
|
Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
|
|
620
627
|
Qwen2VLImageProcessor: () => Qwen2VLImageProcessor,
|
|
621
628
|
Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
|
|
622
629
|
Qwen2VLProcessor: () => Qwen2VLProcessor,
|
|
630
|
+
Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
|
|
623
631
|
Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
|
|
624
632
|
Qwen2_5_VLProcessor: () => Qwen2_5_VLProcessor,
|
|
625
633
|
Qwen3ForCausalLM: () => Qwen3ForCausalLM,
|
|
@@ -631,10 +639,14 @@ __export(transformers_exports, {
|
|
|
631
639
|
Qwen3NextModel: () => Qwen3NextModel,
|
|
632
640
|
Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
|
|
633
641
|
Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
|
|
642
|
+
Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
|
|
634
643
|
Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
|
|
644
|
+
Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
|
|
635
645
|
Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
|
|
636
646
|
Qwen3VLProcessor: () => Qwen3VLProcessor,
|
|
647
|
+
Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
|
|
637
648
|
Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
|
|
649
|
+
Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
|
|
638
650
|
Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
|
|
639
651
|
RFDetrForObjectDetection: () => RFDetrForObjectDetection,
|
|
640
652
|
RFDetrModel: () => RFDetrModel,
|
|
@@ -706,7 +718,6 @@ __export(transformers_exports, {
|
|
|
706
718
|
SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
|
|
707
719
|
SmolLM3Model: () => SmolLM3Model,
|
|
708
720
|
SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
|
|
709
|
-
SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
|
|
710
721
|
SmolVLMImageProcessor: () => Idefics3ImageProcessor,
|
|
711
722
|
SmolVLMProcessor: () => Idefics3Processor,
|
|
712
723
|
SnacDecoderModel: () => SnacDecoderModel,
|
|
@@ -812,6 +823,10 @@ __export(transformers_exports, {
|
|
|
812
823
|
VitsTokenizer: () => VitsTokenizer,
|
|
813
824
|
VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
|
|
814
825
|
VoxtralProcessor: () => VoxtralProcessor,
|
|
826
|
+
VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
|
|
827
|
+
VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
|
|
828
|
+
VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
|
|
829
|
+
VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
|
|
815
830
|
Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
|
|
816
831
|
Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
|
|
817
832
|
Wav2Vec2BertModel: () => Wav2Vec2BertModel,
|
|
@@ -910,7 +925,7 @@ var import_node_fs = __toESM(require("fs"), 1);
|
|
|
910
925
|
var import_node_path = __toESM(require("path"), 1);
|
|
911
926
|
var import_node_url = __toESM(require("url"), 1);
|
|
912
927
|
var import_meta = {};
|
|
913
|
-
var VERSION = "4.0.0-next.
|
|
928
|
+
var VERSION = "4.0.0-next.7";
|
|
914
929
|
var HAS_SELF = typeof self !== "undefined";
|
|
915
930
|
var IS_FS_AVAILABLE = !isEmpty(import_node_fs.default);
|
|
916
931
|
var IS_PATH_AVAILABLE = !isEmpty(import_node_path.default);
|
|
@@ -1038,6 +1053,7 @@ var env = {
|
|
|
1038
1053
|
customCache: null,
|
|
1039
1054
|
useWasmCache: IS_WEB_CACHE_AVAILABLE || IS_FS_AVAILABLE,
|
|
1040
1055
|
cacheKey: "transformers-cache",
|
|
1056
|
+
experimental_useCrossOriginStorage: false,
|
|
1041
1057
|
/////////////////// Custom fetch /////////////////////
|
|
1042
1058
|
fetch: DEFAULT_FETCH
|
|
1043
1059
|
//////////////////////////////////////////////////////
|
|
@@ -3588,7 +3604,7 @@ var Tokenizer = class {
|
|
|
3588
3604
|
};
|
|
3589
3605
|
var Tokenizer_default = Tokenizer;
|
|
3590
3606
|
|
|
3591
|
-
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.
|
|
3607
|
+
// ../../node_modules/.pnpm/@huggingface+jinja@0.5.6/node_modules/@huggingface/jinja/dist/index.js
|
|
3592
3608
|
var TOKEN_TYPES = Object.freeze({
|
|
3593
3609
|
Text: "Text",
|
|
3594
3610
|
// The text between Jinja statements or expressions
|
|
@@ -5107,7 +5123,11 @@ var Environment = class {
|
|
|
5107
5123
|
["number", (operand) => operand instanceof IntegerValue || operand instanceof FloatValue],
|
|
5108
5124
|
["integer", (operand) => operand instanceof IntegerValue],
|
|
5109
5125
|
["iterable", (operand) => operand.type === "ArrayValue" || operand.type === "StringValue"],
|
|
5110
|
-
["mapping", (operand) => operand
|
|
5126
|
+
["mapping", (operand) => operand instanceof ObjectValue],
|
|
5127
|
+
[
|
|
5128
|
+
"sequence",
|
|
5129
|
+
(operand) => operand instanceof ArrayValue || operand instanceof ObjectValue || operand instanceof StringValue
|
|
5130
|
+
],
|
|
5111
5131
|
[
|
|
5112
5132
|
"lower",
|
|
5113
5133
|
(operand) => {
|
|
@@ -5380,6 +5400,9 @@ var Interpreter = class {
|
|
|
5380
5400
|
applyFilter(operand, filterNode, environment) {
|
|
5381
5401
|
if (filterNode.type === "Identifier") {
|
|
5382
5402
|
const filter = filterNode;
|
|
5403
|
+
if (filter.value === "safe") {
|
|
5404
|
+
return operand;
|
|
5405
|
+
}
|
|
5383
5406
|
if (filter.value === "tojson") {
|
|
5384
5407
|
return new StringValue(toJSON(operand, {}));
|
|
5385
5408
|
}
|
|
@@ -5469,6 +5492,8 @@ var Interpreter = class {
|
|
|
5469
5492
|
return new IntegerValue(Math.floor(operand.value));
|
|
5470
5493
|
case "float":
|
|
5471
5494
|
return new FloatValue(operand.value);
|
|
5495
|
+
case "string":
|
|
5496
|
+
return new StringValue(operand.toString());
|
|
5472
5497
|
default:
|
|
5473
5498
|
throw new Error(`Unknown NumericValue filter: ${filter.value}`);
|
|
5474
5499
|
}
|
|
@@ -6897,9 +6922,216 @@ function toAbsoluteURL(url2) {
|
|
|
6897
6922
|
return new URL(url2, baseURL).href;
|
|
6898
6923
|
}
|
|
6899
6924
|
|
|
6925
|
+
// src/utils/cache/CrossOriginStorageCache.js
|
|
6926
|
+
var HASH_ALGORITHM = "SHA-256";
|
|
6927
|
+
var HASH_CACHE_NAME = "experimental_transformers-hash-cache";
|
|
6928
|
+
var makeHashDescriptor = (value) => ({ algorithm: HASH_ALGORITHM, value });
|
|
6929
|
+
var CrossOriginStorage = class {
|
|
6930
|
+
/** @type {Promise<Cache> | null} */
|
|
6931
|
+
#hashCache = null;
|
|
6932
|
+
/**
|
|
6933
|
+
* Returns (and lazily opens) the hash cache, reusing the same promise across concurrent callers.
|
|
6934
|
+
* @returns {Promise<Cache>}
|
|
6935
|
+
*/
|
|
6936
|
+
_getHashCache = () => {
|
|
6937
|
+
this.#hashCache ??= caches.open(HASH_CACHE_NAME);
|
|
6938
|
+
return this.#hashCache;
|
|
6939
|
+
};
|
|
6940
|
+
/**
|
|
6941
|
+
* Returns whether the `navigator.crossOriginStorage` API is available in the current environment.
|
|
6942
|
+
* @returns {boolean}
|
|
6943
|
+
*/
|
|
6944
|
+
static isAvailable = () => typeof navigator !== "undefined" && "crossOriginStorage" in navigator;
|
|
6945
|
+
/**
|
|
6946
|
+
* Looks up a cached response for the given URL by resolving its SHA-256 hash and requesting
|
|
6947
|
+
* the corresponding file handle from cross-origin storage.
|
|
6948
|
+
*
|
|
6949
|
+
* Implements `CacheInterface.match`.
|
|
6950
|
+
*
|
|
6951
|
+
* @param {string} request The URL of the resource to look up.
|
|
6952
|
+
* @returns {Promise<Response|undefined>} The cached `Response`, or `undefined` if not found.
|
|
6953
|
+
*/
|
|
6954
|
+
match = async (request) => {
|
|
6955
|
+
const hashValue = await this._getFileHash(request);
|
|
6956
|
+
if (!hashValue) {
|
|
6957
|
+
return void 0;
|
|
6958
|
+
}
|
|
6959
|
+
try {
|
|
6960
|
+
const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashValue)]);
|
|
6961
|
+
const blob = await handle.getFile();
|
|
6962
|
+
return new Response(blob, {
|
|
6963
|
+
headers: {
|
|
6964
|
+
"Content-Length": String(blob.size)
|
|
6965
|
+
}
|
|
6966
|
+
});
|
|
6967
|
+
} catch {
|
|
6968
|
+
return void 0;
|
|
6969
|
+
}
|
|
6970
|
+
};
|
|
6971
|
+
/**
|
|
6972
|
+
* Stores a response in cross-origin storage, keyed by its SHA-256 hash.
|
|
6973
|
+
*
|
|
6974
|
+
* For LFS-backed URLs the hash is resolved cheaply via `_getFileHash` (which checks
|
|
6975
|
+
* `HASH_CACHE_NAME` first, then falls back to fetching the Git LFS pointer file)
|
|
6976
|
+
* without reading the response body a second time.
|
|
6977
|
+
*
|
|
6978
|
+
* For non-LFS resources the hash is unknown upfront. In that case the body is consumed
|
|
6979
|
+
* in the background: the stream is read to compute the content hash, the file is written
|
|
6980
|
+
* into cross-origin storage, and the computed hash is persisted to `HASH_CACHE_NAME`
|
|
6981
|
+
* so that future `match` calls can resolve the file without a network round-trip.
|
|
6982
|
+
*
|
|
6983
|
+
* Implements `CacheInterface.put`.
|
|
6984
|
+
*
|
|
6985
|
+
* @param {string} request The URL of the resource (used as the hash-cache key).
|
|
6986
|
+
* @param {Response} response The response whose body will be written to the cache.
|
|
6987
|
+
* @returns {Promise<void>}
|
|
6988
|
+
*/
|
|
6989
|
+
put = async (request, response) => {
|
|
6990
|
+
const hashValue = await this._getFileHash(request);
|
|
6991
|
+
if (hashValue) {
|
|
6992
|
+
const blob = await response.blob();
|
|
6993
|
+
await this._storeBlobInCOS(blob, hashValue);
|
|
6994
|
+
} else {
|
|
6995
|
+
this._processAndStore(request, response.body);
|
|
6996
|
+
}
|
|
6997
|
+
};
|
|
6998
|
+
/**
|
|
6999
|
+
* Writes a blob into cross-origin storage using the given pre-computed hex hash string.
|
|
7000
|
+
*
|
|
7001
|
+
* @param {Blob} blob
|
|
7002
|
+
* @param {string} hashHex Hex-encoded SHA-256 hash of `blob`.
|
|
7003
|
+
* @returns {Promise<void>}
|
|
7004
|
+
*/
|
|
7005
|
+
_storeBlobInCOS = async (blob, hashHex) => {
|
|
7006
|
+
const [handle] = await navigator.crossOriginStorage.requestFileHandles([makeHashDescriptor(hashHex)], {
|
|
7007
|
+
create: true
|
|
7008
|
+
});
|
|
7009
|
+
const writableStream = await handle.createWritable();
|
|
7010
|
+
await writableStream.write(blob);
|
|
7011
|
+
await writableStream.close();
|
|
7012
|
+
};
|
|
7013
|
+
/**
|
|
7014
|
+
* Background task for non-LFS resources: consumes `stream`, computes the SHA-256 hash
|
|
7015
|
+
* of the resulting blob, stores it in cross-origin storage, and persists the computed
|
|
7016
|
+
* hash to `HASH_CACHE_NAME` keyed by `request` so future `match` calls can resolve the
|
|
7017
|
+
* file without a network round-trip.
|
|
7018
|
+
*
|
|
7019
|
+
* Called fire-and-forget from `put` — errors are swallowed so failures never surface to
|
|
7020
|
+
* the caller.
|
|
7021
|
+
*
|
|
7022
|
+
* @param {string} request The original resource URL.
|
|
7023
|
+
* @param {ReadableStream} stream The response body stream to consume.
|
|
7024
|
+
* @returns {Promise<void>}
|
|
7025
|
+
*/
|
|
7026
|
+
_processAndStore = async (request, stream) => {
|
|
7027
|
+
try {
|
|
7028
|
+
const chunks = [];
|
|
7029
|
+
for await (const chunk2 of stream) {
|
|
7030
|
+
chunks.push(chunk2);
|
|
7031
|
+
}
|
|
7032
|
+
const blob = new Blob(chunks);
|
|
7033
|
+
const hashHex = await this._getBlobHash(blob);
|
|
7034
|
+
await this._storeBlobInCOS(blob, hashHex);
|
|
7035
|
+
try {
|
|
7036
|
+
const hashCache = await this._getHashCache();
|
|
7037
|
+
await hashCache.put(request, new Response(hashHex));
|
|
7038
|
+
} catch {
|
|
7039
|
+
}
|
|
7040
|
+
} catch {
|
|
7041
|
+
}
|
|
7042
|
+
};
|
|
7043
|
+
/**
|
|
7044
|
+
* Deletes the cache entry for the given request.
|
|
7045
|
+
*
|
|
7046
|
+
* Removes the hash entry from `HASH_CACHE_NAME`. Note: cross-origin storage itself does not
|
|
7047
|
+
* expose a delete API, so only the local hash mapping is removed. For non-LFS URLs this
|
|
7048
|
+
* permanently prevents `match` from resolving the file. For LFS-backed URLs, `match` will
|
|
7049
|
+
* re-fetch the LFS pointer file on the next call and repopulate the hash cache automatically.
|
|
7050
|
+
*
|
|
7051
|
+
* Implements `CacheInterface.delete`.
|
|
7052
|
+
*
|
|
7053
|
+
* @param {string} request
|
|
7054
|
+
* @returns {Promise<boolean>} Resolves to `true` if the hash entry was deleted, `false` otherwise.
|
|
7055
|
+
*/
|
|
7056
|
+
delete = async (request) => {
|
|
7057
|
+
try {
|
|
7058
|
+
const hashCache = await this._getHashCache();
|
|
7059
|
+
return await hashCache.delete(request);
|
|
7060
|
+
} catch {
|
|
7061
|
+
return false;
|
|
7062
|
+
}
|
|
7063
|
+
};
|
|
7064
|
+
/**
|
|
7065
|
+
* Resolves the SHA-256 hash for a given URL.
|
|
7066
|
+
*
|
|
7067
|
+
* Returns the cached hash immediately if one has been persisted to `HASH_CACHE_NAME`.
|
|
7068
|
+
* Otherwise falls back to `_getLfsFileHash` to retrieve the hash from the Hugging Face
|
|
7069
|
+
* LFS pointer file, persisting the result to `HASH_CACHE_NAME` for future lookups.
|
|
7070
|
+
*
|
|
7071
|
+
* Returns `null` if the hash cannot be determined (e.g. non-LFS URL with no cached entry).
|
|
7072
|
+
*
|
|
7073
|
+
* @param {string} url The resource URL to resolve a hash for.
|
|
7074
|
+
* @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
|
|
7075
|
+
*/
|
|
7076
|
+
_getFileHash = async (url2) => {
|
|
7077
|
+
try {
|
|
7078
|
+
const hashCache = await this._getHashCache();
|
|
7079
|
+
const cached = await hashCache.match(url2);
|
|
7080
|
+
if (cached) {
|
|
7081
|
+
return cached.text();
|
|
7082
|
+
}
|
|
7083
|
+
const hash = await this._getLfsFileHash(url2);
|
|
7084
|
+
if (hash) {
|
|
7085
|
+
await hashCache.put(url2, new Response(hash));
|
|
7086
|
+
return hash;
|
|
7087
|
+
}
|
|
7088
|
+
return null;
|
|
7089
|
+
} catch {
|
|
7090
|
+
return null;
|
|
7091
|
+
}
|
|
7092
|
+
};
|
|
7093
|
+
/**
|
|
7094
|
+
* Attempts to retrieve the SHA-256 hash for a Hugging Face resource URL from its raw
|
|
7095
|
+
* Git LFS pointer file.
|
|
7096
|
+
*
|
|
7097
|
+
* Only applicable to URLs containing `/resolve/` (i.e. Hugging Face resolved file URLs).
|
|
7098
|
+
* The `/resolve/` segment is rewritten to `/raw/` to fetch the LFS pointer directly.
|
|
7099
|
+
* Returns `null` for non-LFS URLs or when the network request fails.
|
|
7100
|
+
*
|
|
7101
|
+
* @see https://huggingface.co/docs/hub/en/storage-backends#xet
|
|
7102
|
+
* @param {string} url The resolved Hugging Face URL of the resource.
|
|
7103
|
+
* @returns {Promise<string|null>} The hex-encoded SHA-256 hash, or `null` if unavailable.
|
|
7104
|
+
*/
|
|
7105
|
+
_getLfsFileHash = async (url2) => {
|
|
7106
|
+
if (!url2.includes("/resolve/")) {
|
|
7107
|
+
return null;
|
|
7108
|
+
}
|
|
7109
|
+
const rawUrl = url2.replace("/resolve/", "/raw/");
|
|
7110
|
+
try {
|
|
7111
|
+
const text = await fetch(rawUrl).then((r) => r.text());
|
|
7112
|
+
const match = text.match(/^oid sha256:([0-9a-f]+)$/m);
|
|
7113
|
+
return match ? match[1] : null;
|
|
7114
|
+
} catch {
|
|
7115
|
+
return null;
|
|
7116
|
+
}
|
|
7117
|
+
};
|
|
7118
|
+
/**
|
|
7119
|
+
* Computes the SHA-256 hash of a `Blob`'s contents.
|
|
7120
|
+
*
|
|
7121
|
+
* @param {Blob} blob The blob to hash.
|
|
7122
|
+
* @returns {Promise<string>} The lowercase hex-encoded SHA-256 hash.
|
|
7123
|
+
*/
|
|
7124
|
+
_getBlobHash = async (blob) => {
|
|
7125
|
+
const arrayBuffer = await blob.arrayBuffer();
|
|
7126
|
+
const hashBuffer = await crypto.subtle.digest(HASH_ALGORITHM, arrayBuffer);
|
|
7127
|
+
const hashArray = Array.from(new Uint8Array(hashBuffer));
|
|
7128
|
+
return hashArray.map((byte) => byte.toString(16).padStart(2, "0")).join("");
|
|
7129
|
+
};
|
|
7130
|
+
};
|
|
7131
|
+
|
|
6900
7132
|
// src/utils/cache.js
|
|
6901
7133
|
async function getCache(file_cache_dir = null) {
|
|
6902
|
-
let
|
|
7134
|
+
let cache2 = null;
|
|
6903
7135
|
if (env.useCustomCache) {
|
|
6904
7136
|
if (!env.customCache) {
|
|
6905
7137
|
throw Error("`env.useCustomCache=true`, but `env.customCache` is not defined.");
|
|
@@ -6909,30 +7141,33 @@ async function getCache(file_cache_dir = null) {
|
|
|
6909
7141
|
"`env.customCache` must be an object which implements the `match` and `put` functions of the Web Cache API. For more information, see https://developer.mozilla.org/en-US/docs/Web/API/Cache"
|
|
6910
7142
|
);
|
|
6911
7143
|
}
|
|
6912
|
-
|
|
7144
|
+
cache2 = env.customCache;
|
|
7145
|
+
}
|
|
7146
|
+
if (!cache2 && env.experimental_useCrossOriginStorage && CrossOriginStorage.isAvailable()) {
|
|
7147
|
+
cache2 = new CrossOriginStorage();
|
|
6913
7148
|
}
|
|
6914
|
-
if (!
|
|
7149
|
+
if (!cache2 && env.useBrowserCache) {
|
|
6915
7150
|
if (typeof caches === "undefined") {
|
|
6916
7151
|
throw Error("Browser cache is not available in this environment.");
|
|
6917
7152
|
}
|
|
6918
7153
|
try {
|
|
6919
|
-
|
|
7154
|
+
cache2 = await caches.open(env.cacheKey);
|
|
6920
7155
|
} catch (e) {
|
|
6921
7156
|
logger.warn("An error occurred while opening the browser cache:", e);
|
|
6922
7157
|
}
|
|
6923
7158
|
}
|
|
6924
|
-
if (!
|
|
7159
|
+
if (!cache2 && env.useFSCache) {
|
|
6925
7160
|
if (!apis.IS_FS_AVAILABLE) {
|
|
6926
7161
|
throw Error("File System Cache is not available in this environment.");
|
|
6927
7162
|
}
|
|
6928
|
-
|
|
7163
|
+
cache2 = new FileCache(file_cache_dir ?? env.cacheDir);
|
|
6929
7164
|
}
|
|
6930
|
-
return
|
|
7165
|
+
return cache2;
|
|
6931
7166
|
}
|
|
6932
|
-
async function tryCache(
|
|
7167
|
+
async function tryCache(cache2, ...names) {
|
|
6933
7168
|
for (let name of names) {
|
|
6934
7169
|
try {
|
|
6935
|
-
let result = await
|
|
7170
|
+
let result = await cache2.match(name);
|
|
6936
7171
|
if (result) return result;
|
|
6937
7172
|
} catch (e) {
|
|
6938
7173
|
continue;
|
|
@@ -6941,6 +7176,83 @@ async function tryCache(cache, ...names) {
|
|
|
6941
7176
|
return void 0;
|
|
6942
7177
|
}
|
|
6943
7178
|
|
|
7179
|
+
// src/utils/lru_cache.js
var LRUCache2 = class {
  /** @type {number} Maximum number of entries retained. */
  #maxEntries;
  /** @type {Map<any, any>} Insertion-ordered store; the first key is always the least recently used. */
  #entries;
  /**
   * Creates an LRUCache instance.
   * @param {number} capacity The maximum number of items the cache can hold.
   */
  constructor(capacity) {
    this.#maxEntries = capacity;
    this.#entries = /* @__PURE__ */ new Map();
  }
  /**
   * Retrieves the value associated with the given key and marks the key as recently used.
   * @param {any} key The key to retrieve.
   * @returns {any} The value associated with the key, or undefined if the key does not exist.
   */
  get(key) {
    if (!this.#entries.has(key)) {
      return void 0;
    }
    // Delete + re-insert moves the key to the most-recently-used (last) position.
    const value = this.#entries.get(key);
    this.#entries.delete(key);
    this.#entries.set(key, value);
    return value;
  }
  /**
   * Inserts or updates the key-value pair in the cache.
   * If the key already exists, it is updated and marked as recently used.
   * If the cache exceeds its capacity, the least recently used item is evicted.
   * @param {any} key The key to add or update.
   * @param {any} value The value to associate with the key.
   */
  put(key, value) {
    // Removing first guarantees an existing key is re-inserted at the MRU position.
    this.#entries.delete(key);
    this.#entries.set(key, value);
    if (this.#entries.size > this.#maxEntries) {
      // Map iterates in insertion order, so the first key is the LRU entry.
      const oldest = this.#entries.keys().next().value;
      this.#entries.delete(oldest);
    }
  }
  /**
   * Removes the entry for the given key from the cache.
   * @param {any} key The key to delete.
   * @returns {boolean} `true` if the entry existed and was removed, `false` otherwise.
   */
  delete(key) {
    return this.#entries.delete(key);
  }
  /**
   * Clears the cache.
   */
  clear() {
    this.#entries.clear();
  }
};
|
|
7236
|
+
|
|
7237
|
+
// src/utils/memoize_promise.js
var MAX_CACHE_SIZE = 100;
var cache = new LRUCache2(MAX_CACHE_SIZE);
/**
 * Memoizes a promise-producing factory by key, deduplicating concurrent calls.
 * Rejected promises are evicted from the cache so a later call can retry.
 * @param {any} key Cache key identifying this computation.
 * @param {() => Promise<any>} factory Invoked only on a cache miss.
 * @returns {Promise<any>} The cached (or freshly created) promise.
 */
function memoizePromise(key, factory) {
  const existing = cache.get(key);
  if (existing !== void 0) {
    return existing;
  }
  const pending = factory().then(
    (value) => value,
    (err) => {
      // Do not cache failures: drop the entry so subsequent calls re-run the factory.
      cache.delete(key);
      return Promise.reject(err);
    }
  );
  cache.put(key, pending);
  return pending;
}
|
|
7255
|
+
|
|
6944
7256
|
// src/utils/model_registry/get_file_metadata.js
|
|
6945
7257
|
async function fetch_file_head(urlOrPath) {
|
|
6946
7258
|
if (!isValidUrl(urlOrPath, ["http:", "https:"])) {
|
|
@@ -6948,17 +7260,27 @@ async function fetch_file_head(urlOrPath) {
|
|
|
6948
7260
|
}
|
|
6949
7261
|
const headers = getFetchHeaders(urlOrPath);
|
|
6950
7262
|
headers.set("Range", "bytes=0-0");
|
|
6951
|
-
return env.fetch(urlOrPath, { method: "GET", headers });
|
|
7263
|
+
return env.fetch(urlOrPath, { method: "GET", headers, cache: "no-store" });
|
|
7264
|
+
}
|
|
7265
|
+
function get_file_metadata(path_or_repo_id, filename, options = {}) {
  // The memoization key must include every option that can change the result.
  const key_parts = [
    path_or_repo_id,
    filename,
    options?.revision,
    options?.cache_dir,
    options?.local_files_only
  ];
  const key = JSON.stringify(key_parts);
  return memoizePromise(key, () => _get_file_metadata(path_or_repo_id, filename, options));
}
|
|
6953
|
-
async function
|
|
6954
|
-
const
|
|
7275
|
+
async function _get_file_metadata(path_or_repo_id, filename, options) {
|
|
7276
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
6955
7277
|
const { localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
6956
7278
|
path_or_repo_id,
|
|
6957
7279
|
filename,
|
|
6958
7280
|
options,
|
|
6959
|
-
|
|
7281
|
+
cache2
|
|
6960
7282
|
);
|
|
6961
|
-
const cachedResponse = await checkCachedResource(
|
|
7283
|
+
const cachedResponse = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
6962
7284
|
if (cachedResponse !== void 0 && typeof cachedResponse !== "string") {
|
|
6963
7285
|
const size = cachedResponse.headers.get("content-length");
|
|
6964
7286
|
const contentType = cachedResponse.headers.get("content-type");
|
|
@@ -7056,7 +7378,7 @@ function getFetchHeaders(urlOrPath) {
|
|
|
7056
7378
|
}
|
|
7057
7379
|
return headers;
|
|
7058
7380
|
}
|
|
7059
|
-
function buildResourcePaths(path_or_repo_id, filename, options = {},
|
|
7381
|
+
function buildResourcePaths(path_or_repo_id, filename, options = {}, cache2 = null) {
|
|
7060
7382
|
const revision = options.revision ?? "main";
|
|
7061
7383
|
const requestURL = pathJoin(path_or_repo_id, filename);
|
|
7062
7384
|
const validModelId = isValidHfModelId(path_or_repo_id);
|
|
@@ -7066,7 +7388,7 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
7066
7388
|
env.remotePathTemplate.replaceAll("{model}", path_or_repo_id).replaceAll("{revision}", encodeURIComponent(revision)),
|
|
7067
7389
|
filename
|
|
7068
7390
|
);
|
|
7069
|
-
const proposedCacheKey =
|
|
7391
|
+
const proposedCacheKey = cache2 instanceof FileCache ? (
|
|
7070
7392
|
// Choose cache key for filesystem cache
|
|
7071
7393
|
// When using the main revision (default), we use the request URL as the cache key.
|
|
7072
7394
|
// If a specific revision is requested, we account for this in the cache key.
|
|
@@ -7080,14 +7402,14 @@ function buildResourcePaths(path_or_repo_id, filename, options = {}, cache = nul
|
|
|
7080
7402
|
validModelId
|
|
7081
7403
|
};
|
|
7082
7404
|
}
|
|
7083
|
-
async function checkCachedResource(
|
|
7084
|
-
if (!
|
|
7405
|
+
async function checkCachedResource(cache2, localPath, proposedCacheKey) {
|
|
7406
|
+
if (!cache2) {
|
|
7085
7407
|
return void 0;
|
|
7086
7408
|
}
|
|
7087
|
-
return await tryCache(
|
|
7409
|
+
return await tryCache(cache2, localPath, proposedCacheKey);
|
|
7088
7410
|
}
|
|
7089
|
-
async function storeCachedResource(path_or_repo_id, filename,
|
|
7090
|
-
if (await
|
|
7411
|
+
async function storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options = {}) {
|
|
7412
|
+
if (await cache2.match(cacheKey) !== void 0) {
|
|
7091
7413
|
return;
|
|
7092
7414
|
}
|
|
7093
7415
|
if (!result) {
|
|
@@ -7097,14 +7419,14 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
7097
7419
|
file: filename,
|
|
7098
7420
|
...data
|
|
7099
7421
|
}) : void 0;
|
|
7100
|
-
await
|
|
7422
|
+
await cache2.put(
|
|
7101
7423
|
cacheKey,
|
|
7102
7424
|
/** @type {Response} */
|
|
7103
7425
|
response,
|
|
7104
7426
|
wrapped_progress
|
|
7105
7427
|
);
|
|
7106
7428
|
} else if (typeof response !== "string") {
|
|
7107
|
-
await
|
|
7429
|
+
await cache2.put(
|
|
7108
7430
|
cacheKey,
|
|
7109
7431
|
new Response(
|
|
7110
7432
|
/** @type {any} */
|
|
@@ -7118,17 +7440,17 @@ async function storeCachedResource(path_or_repo_id, filename, cache, cacheKey, r
|
|
|
7118
7440
|
});
|
|
7119
7441
|
}
|
|
7120
7442
|
}
|
|
7121
|
-
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false,
|
|
7443
|
+
async function loadResourceFile(path_or_repo_id, filename, fatal = true, options = {}, return_path = false, cache2 = null) {
|
|
7122
7444
|
const { requestURL, localPath, remoteURL, proposedCacheKey, validModelId } = buildResourcePaths(
|
|
7123
7445
|
path_or_repo_id,
|
|
7124
7446
|
filename,
|
|
7125
7447
|
options,
|
|
7126
|
-
|
|
7448
|
+
cache2
|
|
7127
7449
|
);
|
|
7128
7450
|
let cacheKey;
|
|
7129
7451
|
let toCacheResponse = false;
|
|
7130
7452
|
let response;
|
|
7131
|
-
response = await checkCachedResource(
|
|
7453
|
+
response = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
7132
7454
|
const cacheHit = response !== void 0;
|
|
7133
7455
|
if (!cacheHit) {
|
|
7134
7456
|
if (env.allowLocalModels) {
|
|
@@ -7169,7 +7491,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
7169
7491
|
}
|
|
7170
7492
|
cacheKey = proposedCacheKey;
|
|
7171
7493
|
}
|
|
7172
|
-
toCacheResponse =
|
|
7494
|
+
toCacheResponse = cache2 && // 1. A caching system is available
|
|
7173
7495
|
typeof Response !== "undefined" && // 2. `Response` is defined (i.e., we are in a browser-like environment)
|
|
7174
7496
|
response instanceof Response && // 3. result is a `Response` object (i.e., not a `FileResponse`)
|
|
7175
7497
|
response.status === 200;
|
|
@@ -7231,7 +7553,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
7231
7553
|
// i.e., do not cache FileResponses (prevents duplication)
|
|
7232
7554
|
toCacheResponse && cacheKey && typeof response !== "string"
|
|
7233
7555
|
) {
|
|
7234
|
-
await storeCachedResource(path_or_repo_id, filename,
|
|
7556
|
+
await storeCachedResource(path_or_repo_id, filename, cache2, cacheKey, response, result, options);
|
|
7235
7557
|
}
|
|
7236
7558
|
dispatchCallback(options.progress_callback, {
|
|
7237
7559
|
status: "done",
|
|
@@ -7247,7 +7569,7 @@ async function loadResourceFile(path_or_repo_id, filename, fatal = true, options
|
|
|
7247
7569
|
if (response instanceof FileResponse) {
|
|
7248
7570
|
return response.filePath;
|
|
7249
7571
|
}
|
|
7250
|
-
const cachedResponse = await
|
|
7572
|
+
const cachedResponse = await cache2?.match(cacheKey);
|
|
7251
7573
|
if (cachedResponse instanceof FileResponse) {
|
|
7252
7574
|
return cachedResponse.filePath;
|
|
7253
7575
|
} else if (cachedResponse instanceof Response) {
|
|
@@ -7274,8 +7596,8 @@ async function getModelFile(path_or_repo_id, filename, fatal = true, options = {
|
|
|
7274
7596
|
name: path_or_repo_id,
|
|
7275
7597
|
file: filename
|
|
7276
7598
|
});
|
|
7277
|
-
const
|
|
7278
|
-
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path,
|
|
7599
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
7600
|
+
return await loadResourceFile(path_or_repo_id, filename, fatal, options, return_path, cache2);
|
|
7279
7601
|
}
|
|
7280
7602
|
async function getModelText(modelPath, fileName, fatal = true, options = {}) {
|
|
7281
7603
|
const buffer = await getModelFile(modelPath, fileName, fatal, options, false);
|
|
@@ -8068,7 +8390,7 @@ var uint16_to_float32 = /* @__PURE__ */ (function() {
|
|
|
8068
8390
|
// src/backends/onnx.js
|
|
8069
8391
|
var ONNX_NODE = __toESM(require("onnxruntime-node"), 1);
|
|
8070
8392
|
|
|
8071
|
-
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.
|
|
8393
|
+
// ../../node_modules/.pnpm/onnxruntime-web@1.25.0-dev.20260307-d626b568e0/node_modules/onnxruntime-web/dist/ort.webgpu.bundle.min.mjs
|
|
8072
8394
|
var ort_webgpu_bundle_min_exports = {};
|
|
8073
8395
|
__export(ort_webgpu_bundle_min_exports, {
|
|
8074
8396
|
InferenceSession: () => Jf,
|
|
@@ -8837,7 +9159,7 @@ async function ts(a = {}) {
|
|
|
8837
9159
|
throw L(e = "Aborted(" + e + ")"), W = true, e = new WebAssembly.RuntimeError(e + ". Build with -sASSERTIONS for more info."), R?.(e), e;
|
|
8838
9160
|
}
|
|
8839
9161
|
function Ye() {
|
|
8840
|
-
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii,
|
|
9162
|
+
return { a: { f: Vs, J: js, k: Hs, p: Ys, l: qs, ta: Js, b: Xs, ca: Qs, Ka: Sn, s: Zs, da: Ln, _a: On, Ga: Bn, Ia: Mn, $a: Cn, Ya: Un, Ra: Dn, Xa: Pn, pa: _n, Ha: Rn, Yb: Nn, Za: kn, Fa: Wn, eb: Ks, Da: ti, Tb: ri, Rb: oi, Ca: si, M: ii, I: ui, Sb: fi, ka: yi, Ub: bi, Ua: wi, Wb: Ti, La: vi, Pb: Ei, la: Si, Ta: Ar, bb: Ai, U: Oi, n: Di, c: Er, sb: Pi, w: _i, L: Ri, z: Ni, j: ki, o: Yn, tb: Wi, G: Fi, T: Gi, h: $i, u: zi, m: Vi, i: ji, Oa: Hi, Pa: Yi, Qa: qi, Ma: Qn, Na: Zn, Qb: Kn, fb: Xi, db: Ki, Y: eu, rb: tu, ma: ru, cb: Qi, gb: nu, ab: ou, Xb: au, N: Ji, hb: su, X: iu, Vb: uu, ob: bu, C: wu, sa: gu, ra: Tu, qb: vu, W: Eu, v: Su, nb: Au, mb: Iu, lb: xu, pb: Lu, kb: Ou, jb: Bu, ib: Mu, Va: ao, Wa: so, Ja: br, ea: io, oa: uo, Sa: fo, na: co, Db: Gf, xa: Df, Eb: Ff, ya: Uf, F: Ef, e: ff, r: sf, x: af, D: gf, Ib: Bf, ba: Lf, B: df, za: Mf, $: Pf, ha: Of, Fb: kf, Gb: Nf, Ba: Sf, Aa: xf, Jb: Af, wa: Wf, aa: Cf, d: cf, A: lf, q: uf, Cb: $f, t: mf, y: Tf, H: pf, E: hf, K: vf, S: _f, ja: wf, _: Rf, Kb: bf, Lb: yf, P: If2, g: Uu, a: Fe, Ob: qe, Hb: Du, ia: Pu, O: _u, qa: Ru, Mb: Nu, Q: ku, zb: Wu, Ab: Fu, ua: Gu, fa: $u, R: zu, Ea: Vu, va: ju, Z: Hu, xb: Yu, Zb: qu, V: Ju, Bb: Xu, ub: Qu, vb: Ku, wb: ef, ga: tf, yb: rf, Nb: nf } };
|
|
8841
9163
|
}
|
|
8842
9164
|
async function bt() {
|
|
8843
9165
|
function e(o, u) {
|
|
@@ -10024,7 +10346,7 @@ async function ts(a = {}) {
|
|
|
10024
10346
|
Te(`invalid type for getValue: ${t}`);
|
|
10025
10347
|
}
|
|
10026
10348
|
}, r.UTF8ToString = ct, r.stringToUTF8 = Pe, r.lengthBytesUTF8 = _e;
|
|
10027
|
-
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = {
|
|
10349
|
+
var lo, po, Dr, Wt, xe, pt, mo, ho, yo, bo, wo, go, To, vo, Eo, So, Ao, Pr, _r, Rr, Nr, Et, kr, Io, Wr, xo, Lo, Oo, Fr, Bo, Mo, Gr, N, St, Co, D, Ft, P, Uo, $r, Do, Po, _o, zr, Ro, No, ko, Wo, Fo, Go, $o, zo, Vo, jo, Ho, Yo, qo, Jo, Xo, Qo, Zo, Ko, ea, ta, ra, na, oa, aa, sa, ia, ua, fa, ca, da, la, pa, ma, ha, ya, ba, wa, ga, Ta, ke, of = [qe, yr, En, Ln, On, Bn, Mn, Cn, Un, Dn, Pn, _n, Rn, Nn, kn, Wn, Qn, Zn, Kn, ao, so, io, uo, fo, co], Vr = { 925676: (e, t, n, o, u) => {
|
|
10028
10350
|
if (r === void 0 || !r.Uc) return 1;
|
|
10029
10351
|
if ((e = ct(Number(e >>> 0))).startsWith("./") && (e = e.substring(2)), !(e = r.Uc.get(e))) return 2;
|
|
10030
10352
|
if (t = Number(t >>> 0), n = Number(n >>> 0), o = Number(o >>> 0), t + n > e.byteLength) return 3;
|
|
@@ -10044,11 +10366,11 @@ async function ts(a = {}) {
|
|
|
10044
10366
|
} catch {
|
|
10045
10367
|
return 4;
|
|
10046
10368
|
}
|
|
10047
|
-
},
|
|
10369
|
+
}, 926500: (e, t, n) => {
|
|
10048
10370
|
r.Sd(e, (p(), J).subarray(t >>> 0, t + n >>> 0));
|
|
10049
|
-
},
|
|
10371
|
+
}, 926564: () => r.me(), 926606: (e) => {
|
|
10050
10372
|
r.jd(e);
|
|
10051
|
-
},
|
|
10373
|
+
}, 926643: () => typeof wasmOffsetConverter < "u" };
|
|
10052
10374
|
function af(e, t, n, o) {
|
|
10053
10375
|
var u = P();
|
|
10054
10376
|
try {
|
|
@@ -11964,7 +12286,7 @@ var $s = k(() => {
|
|
|
11964
12286
|
Ve();
|
|
11965
12287
|
Ve();
|
|
11966
12288
|
Ve();
|
|
11967
|
-
var Xa = "1.25.0-dev.
|
|
12289
|
+
var Xa = "1.25.0-dev.20260307-d626b568e0";
|
|
11968
12290
|
var Tl = Zr;
|
|
11969
12291
|
{
|
|
11970
12292
|
let a = ($s(), $t(Gs)).wasmBackend;
|
|
@@ -11975,11 +12297,11 @@ Object.defineProperty(K.versions, "web", { value: Xa, enumerable: true });
|
|
|
11975
12297
|
// src/backends/utils/cacheWasm.js
|
|
11976
12298
|
async function loadAndCacheFile(url2) {
|
|
11977
12299
|
const fileName = url2.split("/").pop();
|
|
11978
|
-
let
|
|
12300
|
+
let cache2;
|
|
11979
12301
|
try {
|
|
11980
|
-
|
|
11981
|
-
if (
|
|
11982
|
-
const result = await
|
|
12302
|
+
cache2 = await getCache();
|
|
12303
|
+
if (cache2) {
|
|
12304
|
+
const result = await cache2.match(url2);
|
|
11983
12305
|
if (result) {
|
|
11984
12306
|
return result;
|
|
11985
12307
|
}
|
|
@@ -11991,9 +12313,9 @@ async function loadAndCacheFile(url2) {
|
|
|
11991
12313
|
if (!response.ok) {
|
|
11992
12314
|
throw new Error(`Failed to fetch ${fileName}: ${response.status} ${response.statusText}`);
|
|
11993
12315
|
}
|
|
11994
|
-
if (
|
|
12316
|
+
if (cache2) {
|
|
11995
12317
|
try {
|
|
11996
|
-
await
|
|
12318
|
+
await cache2.put(url2, response.clone());
|
|
11997
12319
|
} catch (e) {
|
|
11998
12320
|
logger.warn(`Failed to cache ${fileName}:`, e);
|
|
11999
12321
|
}
|
|
@@ -13845,9 +14167,23 @@ var Tensor2 = class _Tensor {
|
|
|
13845
14167
|
throw Error(`Unsupported norm: ${p}`);
|
|
13846
14168
|
}
|
|
13847
14169
|
const this_data = this.data;
|
|
13848
|
-
const
|
|
14170
|
+
const is_bigint = this_data instanceof BigInt64Array || this_data instanceof BigUint64Array;
|
|
14171
|
+
if (is_bigint && p !== 1) {
|
|
14172
|
+
throw Error(`Expected a floating point tensor as input. Got ${this.type}`);
|
|
14173
|
+
}
|
|
14174
|
+
let fn2, zero;
|
|
14175
|
+
if (is_bigint) {
|
|
14176
|
+
fn2 = (a, b) => a + b;
|
|
14177
|
+
zero = 0n;
|
|
14178
|
+
} else {
|
|
14179
|
+
fn2 = (a, b) => a + b ** p;
|
|
14180
|
+
zero = 0;
|
|
14181
|
+
}
|
|
13849
14182
|
if (dim === null) {
|
|
13850
|
-
|
|
14183
|
+
let val = this_data.reduce(fn2, zero);
|
|
14184
|
+
if (p !== 1) {
|
|
14185
|
+
val = val ** (1 / p);
|
|
14186
|
+
}
|
|
13851
14187
|
return new _Tensor(this.type, [val], []);
|
|
13852
14188
|
}
|
|
13853
14189
|
const [type, result, resultDims] = reduce_helper(fn2, this, dim, keepdim);
|
|
@@ -16307,9 +16643,11 @@ __export(processors_exports, {
|
|
|
16307
16643
|
ChatterboxProcessor: () => ChatterboxProcessor,
|
|
16308
16644
|
Florence2Processor: () => Florence2Processor,
|
|
16309
16645
|
Gemma3nProcessor: () => Gemma3nProcessor,
|
|
16646
|
+
GraniteSpeechProcessor: () => GraniteSpeechProcessor,
|
|
16310
16647
|
GroundingDinoProcessor: () => GroundingDinoProcessor,
|
|
16311
16648
|
Idefics3Processor: () => Idefics3Processor,
|
|
16312
16649
|
JinaCLIPProcessor: () => JinaCLIPProcessor,
|
|
16650
|
+
Lfm2VlProcessor: () => Lfm2VlProcessor,
|
|
16313
16651
|
LlavaProcessor: () => LlavaProcessor,
|
|
16314
16652
|
MgpstrProcessor: () => MgpstrProcessor,
|
|
16315
16653
|
MoonshineProcessor: () => MoonshineProcessor,
|
|
@@ -16330,6 +16668,7 @@ __export(processors_exports, {
|
|
|
16330
16668
|
UltravoxProcessor: () => UltravoxProcessor,
|
|
16331
16669
|
VLChatProcessor: () => VLChatProcessor,
|
|
16332
16670
|
VoxtralProcessor: () => VoxtralProcessor,
|
|
16671
|
+
VoxtralRealtimeProcessor: () => VoxtralRealtimeProcessor,
|
|
16333
16672
|
Wav2Vec2Processor: () => Wav2Vec2Processor,
|
|
16334
16673
|
Wav2Vec2ProcessorWithLM: () => Wav2Vec2ProcessorWithLM,
|
|
16335
16674
|
WhisperProcessor: () => WhisperProcessor
|
|
@@ -16384,12 +16723,14 @@ __export(feature_extractors_exports, {
|
|
|
16384
16723
|
EncodecFeatureExtractor: () => EncodecFeatureExtractor,
|
|
16385
16724
|
FeatureExtractor: () => FeatureExtractor,
|
|
16386
16725
|
Gemma3nAudioFeatureExtractor: () => Gemma3nAudioFeatureExtractor,
|
|
16726
|
+
GraniteSpeechFeatureExtractor: () => GraniteSpeechFeatureExtractor,
|
|
16387
16727
|
MoonshineFeatureExtractor: () => MoonshineFeatureExtractor,
|
|
16388
16728
|
ParakeetFeatureExtractor: () => ParakeetFeatureExtractor,
|
|
16389
16729
|
PyAnnoteFeatureExtractor: () => PyAnnoteFeatureExtractor,
|
|
16390
16730
|
SeamlessM4TFeatureExtractor: () => SeamlessM4TFeatureExtractor,
|
|
16391
16731
|
SnacFeatureExtractor: () => SnacFeatureExtractor,
|
|
16392
16732
|
SpeechT5FeatureExtractor: () => SpeechT5FeatureExtractor,
|
|
16733
|
+
VoxtralRealtimeFeatureExtractor: () => VoxtralRealtimeFeatureExtractor,
|
|
16393
16734
|
Wav2Vec2FeatureExtractor: () => Wav2Vec2FeatureExtractor,
|
|
16394
16735
|
WeSpeakerFeatureExtractor: () => WeSpeakerFeatureExtractor,
|
|
16395
16736
|
WhisperFeatureExtractor: () => WhisperFeatureExtractor
|
|
@@ -16617,6 +16958,7 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
16617
16958
|
mel_filters = null,
|
|
16618
16959
|
mel_floor = 1e-10,
|
|
16619
16960
|
log_mel = null,
|
|
16961
|
+
max_log_mel = null,
|
|
16620
16962
|
reference = 1,
|
|
16621
16963
|
min_value = 1e-10,
|
|
16622
16964
|
db_range = null,
|
|
@@ -16756,6 +17098,17 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
16756
17098
|
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
16757
17099
|
}
|
|
16758
17100
|
break;
|
|
17101
|
+
case "log10_max_norm": {
|
|
17102
|
+
for (let i = 0; i < o; ++i) {
|
|
17103
|
+
mel_spec_data[i] = Math.log10(mel_spec_data[i]);
|
|
17104
|
+
}
|
|
17105
|
+
const logMax = max_log_mel ?? max(mel_spec_data)[0];
|
|
17106
|
+
const threshold = logMax - 8;
|
|
17107
|
+
for (let i = 0; i < o; ++i) {
|
|
17108
|
+
mel_spec_data[i] = (Math.max(mel_spec_data[i], threshold) + 4) / 4;
|
|
17109
|
+
}
|
|
17110
|
+
break;
|
|
17111
|
+
}
|
|
16759
17112
|
case "dB":
|
|
16760
17113
|
if (power === 1) {
|
|
16761
17114
|
amplitude_to_db(mel_spec_data, reference, min_value, db_range);
|
|
@@ -16766,7 +17119,9 @@ async function spectrogram(waveform, window2, frame_length, hop_length, {
|
|
|
16766
17119
|
}
|
|
16767
17120
|
break;
|
|
16768
17121
|
default:
|
|
16769
|
-
throw new Error(
|
|
17122
|
+
throw new Error(
|
|
17123
|
+
`log_mel must be one of null, 'log', 'log10', 'log10_max_norm', or 'dB'. Got '${log_mel}'`
|
|
17124
|
+
);
|
|
16770
17125
|
}
|
|
16771
17126
|
}
|
|
16772
17127
|
return mel_spec;
|
|
@@ -17271,6 +17626,56 @@ var Gemma3nAudioFeatureExtractor = class extends FeatureExtractor {
|
|
|
17271
17626
|
}
|
|
17272
17627
|
};
|
|
17273
17628
|
|
|
17629
|
+
// src/models/granite_speech/feature_extraction_granite_speech.js
|
|
17630
|
+
var GraniteSpeechFeatureExtractor = class extends FeatureExtractor {
  constructor(config) {
    super(config);
    const { n_fft, win_length, n_mels, sample_rate } = config.melspec_kwargs;

    // Torchaudio-compatible mel filterbank: HTK scale, no normalization,
    // spanning 0 Hz up to the Nyquist frequency.
    this.mel_filters = mel_filter_bank(
      Math.floor(1 + n_fft / 2),
      // num_frequency_bins = 257
      n_mels,
      // 80
      0,
      // min_frequency
      sample_rate / 2,
      // max_frequency = 8000
      sample_rate,
      // 16000
      null,
      // norm (torchaudio default: no norm)
      "htk"
      // mel_scale (torchaudio default)
    );

    // Center a Hann window of `win_length` samples inside an `n_fft`-sized
    // zero-padded buffer (the remaining samples stay 0).
    const hann = window_function(win_length, "hann");
    const offset = Math.floor((n_fft - win_length) / 2);
    this.window = new Float64Array(n_fft);
    this.window.set(hann, offset);
  }
  /**
   * Extract mel spectrogram features from audio, matching the Python GraniteSpeechFeatureExtractor.
   * @param {Float32Array|Float64Array} audio The audio waveform.
   * @returns {Promise<{input_features: Tensor}>}
   */
  async _call(audio) {
    validate_audio_inputs(audio, "GraniteSpeechFeatureExtractor");
    const { n_fft, hop_length, n_mels } = this.config.melspec_kwargs;

    // Truncate to an even frame count so pairs of frames can be stacked below.
    const total_frames = 1 + Math.floor((audio.length - 1) / hop_length);
    const even_frames = total_frames - total_frames % 2;

    const mel = await spectrogram(audio, this.window, n_fft, hop_length, {
      power: 2,
      mel_filters: this.mel_filters,
      log_mel: "log10_max_norm",
      transpose: true,
      // [time, n_mels]
      max_num_frames: even_frames,
      do_pad: false
    });

    // Stack every two consecutive frames into one feature vector of size 2 * n_mels,
    // then add a leading batch dimension.
    const input_features = mel.view(-1, 2 * n_mels).unsqueeze_(0);
    return { input_features };
  }
};
|
|
17678
|
+
|
|
17274
17679
|
// src/models/moonshine/feature_extraction_moonshine.js
|
|
17275
17680
|
var MoonshineFeatureExtractor = class extends FeatureExtractor {
|
|
17276
17681
|
/**
|
|
@@ -17751,6 +18156,71 @@ var WeSpeakerFeatureExtractor = class extends FeatureExtractor {
|
|
|
17751
18156
|
}
|
|
17752
18157
|
};
|
|
17753
18158
|
|
|
18159
|
+
// src/models/voxtral_realtime/feature_extraction_voxtral_realtime.js
|
|
18160
|
+
var VoxtralRealtimeFeatureExtractor = class extends FeatureExtractor {
  constructor(config) {
    super(config);
    // Build a Slaney-style mel filterbank once, unless one was supplied in the config.
    this.config.mel_filters ??= mel_filter_bank(
      Math.floor(1 + this.config.n_fft / 2),
      // num_frequency_bins
      this.config.feature_size,
      // num_mel_filters
      0,
      // min_frequency
      8e3,
      // max_frequency
      this.config.sampling_rate,
      // sampling_rate
      "slaney",
      // norm
      "slaney"
      // mel_scale
    );
    this.window = window_function(this.config.n_fft, "hann");
  }
  /**
   * Computes the log-Mel spectrogram of the provided audio waveform.
   * @param {Float32Array|Float64Array} waveform The audio waveform to process.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform for STFT.
   * @returns {Promise<import('../../utils/tensor.js').Tensor>} The log-Mel spectrogram tensor of shape [num_mel_bins, num_frames].
   */
  async _extract_fbank_features(waveform, { center = true } = {}) {
    const { n_fft, hop_length, mel_filters, global_log_mel_max } = this.config;
    // Frame count depends on whether the STFT center-pads the signal.
    const frame_count = center
      ? Math.floor(waveform.length / hop_length)
      : Math.floor((waveform.length - n_fft) / hop_length);
    const options = {
      power: 2,
      mel_filters,
      log_mel: "log10_max_norm",
      max_log_mel: global_log_mel_max,
      center,
      max_num_frames: frame_count,
      do_pad: false
    };
    return await spectrogram(
      waveform,
      this.window,
      n_fft,
      // frame_length
      hop_length,
      options
    );
  }
  /**
   * Extract mel spectrogram features from audio.
   * @param {Float32Array|Float64Array} audio The audio data.
   * @param {Object} [options]
   * @param {boolean} [options.center=true] Whether to center-pad the waveform.
   * @returns {Promise<{ input_features: import('../../utils/tensor.js').Tensor }>}
   */
  async _call(audio, { center = true } = {}) {
    validate_audio_inputs(audio, "VoxtralRealtimeFeatureExtractor");
    const features = await this._extract_fbank_features(audio, { center });
    // Add a leading batch dimension before returning.
    return {
      input_features: features.unsqueeze_(0)
    };
  }
};
|
|
18223
|
+
|
|
17754
18224
|
// src/models/whisper/feature_extraction_whisper.js
|
|
17755
18225
|
var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
17756
18226
|
constructor(config) {
|
|
@@ -17779,7 +18249,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
17779
18249
|
* @returns {Promise<Tensor>} An object containing the log-Mel spectrogram data as a Float32Array and its dimensions as an array of numbers.
|
|
17780
18250
|
*/
|
|
17781
18251
|
async _extract_fbank_features(waveform) {
|
|
17782
|
-
|
|
18252
|
+
return await spectrogram(
|
|
17783
18253
|
waveform,
|
|
17784
18254
|
this.window,
|
|
17785
18255
|
// window
|
|
@@ -17790,7 +18260,7 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
17790
18260
|
{
|
|
17791
18261
|
power: 2,
|
|
17792
18262
|
mel_filters: this.config.mel_filters,
|
|
17793
|
-
log_mel: "
|
|
18263
|
+
log_mel: "log10_max_norm",
|
|
17794
18264
|
// Custom
|
|
17795
18265
|
max_num_frames: Math.min(
|
|
17796
18266
|
Math.floor(waveform.length / this.config.hop_length),
|
|
@@ -17799,15 +18269,6 @@ var WhisperFeatureExtractor = class extends FeatureExtractor {
|
|
|
17799
18269
|
)
|
|
17800
18270
|
}
|
|
17801
18271
|
);
|
|
17802
|
-
const data = features.data;
|
|
17803
|
-
const maxValue = max(
|
|
17804
|
-
/** @type {Float32Array} */
|
|
17805
|
-
data
|
|
17806
|
-
)[0];
|
|
17807
|
-
for (let i = 0; i < data.length; ++i) {
|
|
17808
|
-
data[i] = (Math.max(data[i], maxValue - 8) + 4) / 4;
|
|
17809
|
-
}
|
|
17810
|
-
return features;
|
|
17811
18272
|
}
|
|
17812
18273
|
/**
|
|
17813
18274
|
* Asynchronously extracts features from a given audio using the provided configuration.
|
|
@@ -18686,6 +19147,27 @@ function compute_segments(mask_probs, pred_scores, pred_labels, mask_threshold,
|
|
|
18686
19147
|
}
|
|
18687
19148
|
return [segmentation, segments];
|
|
18688
19149
|
}
|
|
19150
|
+
/**
 * Rescales (height, width) so both sides are multiples of `factor` and the total
 * pixel count lies within [min_pixels, max_pixels], keeping the aspect ratio close
 * to the original.
 * @param {number} height Original image height.
 * @param {number} width Original image width.
 * @param {number} [factor=28] Both output sides are rounded to a multiple of this.
 * @param {number} [min_pixels] Lower bound on the output area.
 * @param {number} [max_pixels] Upper bound on the output area.
 * @returns {[number, number]} The resized [height, width].
 * @throws {Error} If a side is smaller than `factor` or the aspect ratio exceeds 200.
 */
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
  const longer = Math.max(height, width);
  const shorter = Math.min(height, width);
  if (height < factor || width < factor) {
    throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
  }
  if (longer / shorter > 200) {
    throw new Error(
      `absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
    );
  }
  // Start from the nearest multiples of `factor`.
  let h_bar = Math.round(height / factor) * factor;
  let w_bar = Math.round(width / factor) * factor;
  const proposed_area = h_bar * w_bar;
  if (proposed_area > max_pixels) {
    // Too large: scale both sides down by beta, rounding down to stay under the cap.
    const beta = Math.sqrt(height * width / max_pixels);
    h_bar = Math.floor(height / beta / factor) * factor;
    w_bar = Math.floor(width / beta / factor) * factor;
  } else if (proposed_area < min_pixels) {
    // Too small: scale both sides up by beta, rounding up to reach the floor.
    const beta = Math.sqrt(min_pixels / (height * width));
    h_bar = Math.ceil(height * beta / factor) * factor;
    w_bar = Math.ceil(width * beta / factor) * factor;
  }
  return [h_bar, w_bar];
}
|
|
18689
19171
|
function post_process_panoptic_segmentation(outputs, threshold = 0.5, mask_threshold = 0.5, overlap_mask_area_threshold = 0.8, label_ids_to_fuse = null, target_sizes = null) {
|
|
18690
19172
|
if (label_ids_to_fuse === null) {
|
|
18691
19173
|
logger.warn("`label_ids_to_fuse` unset. No instance will be fused.");
|
|
@@ -18974,7 +19456,7 @@ var ImageProcessor = class extends Callable2 {
|
|
|
18974
19456
|
});
|
|
18975
19457
|
}
|
|
18976
19458
|
/**
|
|
18977
|
-
* @typedef {
|
|
19459
|
+
* @typedef {Object} PreprocessedImage
|
|
18978
19460
|
* @property {HeightWidth} original_size The original size of the image.
|
|
18979
19461
|
* @property {HeightWidth} reshaped_input_size The reshaped input size of the image.
|
|
18980
19462
|
* @property {Tensor} pixel_values The pixel values of the preprocessed image.
|
|
@@ -19152,6 +19634,7 @@ __export(image_processors_exports, {
|
|
|
19152
19634
|
ImageFeatureExtractor: () => ImageProcessor,
|
|
19153
19635
|
ImageProcessor: () => ImageProcessor,
|
|
19154
19636
|
JinaCLIPImageProcessor: () => JinaCLIPImageProcessor,
|
|
19637
|
+
Lfm2VlImageProcessor: () => Lfm2VlImageProcessor,
|
|
19155
19638
|
LlavaOnevisionImageProcessor: () => LlavaOnevisionImageProcessor,
|
|
19156
19639
|
Mask2FormerImageProcessor: () => Mask2FormerImageProcessor,
|
|
19157
19640
|
MaskFormerFeatureExtractor: () => MaskFormerFeatureExtractor,
|
|
@@ -19555,6 +20038,237 @@ var JinaCLIPImageProcessor = class extends ImageProcessor {
|
|
|
19555
20038
|
}
|
|
19556
20039
|
};
|
|
19557
20040
|
|
|
20041
|
+
// src/models/lfm2_vl/image_processing_lfm2_vl.js
|
|
20042
|
+
/**
 * Snaps a number to the nearest integer multiple of `factor`.
 * @param {number} number The value to round.
 * @param {number} factor The rounding granularity.
 * @returns {number} The nearest multiple of `factor`.
 */
function round_by_factor(number, factor) {
  const multiples = Math.round(number / factor);
  return multiples * factor;
}
|
|
20045
|
+
/**
 * Picks the target tiling ratio whose aspect ratio is closest to the image's.
 * On an exact tie, a larger tiling is preferred when the image area exceeds
 * half the area that tiling would cover at `image_size` per tile.
 * @param {number} aspect_ratio The image's width/height ratio.
 * @param {Array<[number, number]>} target_ratios Candidate [cols, rows] tilings.
 * @param {number} width Image width in pixels.
 * @param {number} height Image height in pixels.
 * @param {number} image_size Side length of a single tile.
 * @returns {[number, number]} The selected [cols, rows] tiling.
 */
function find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size) {
  const area = width * height;
  let best = [1, 1];
  let smallest_diff = Infinity;
  for (const candidate of target_ratios) {
    const diff = Math.abs(aspect_ratio - candidate[0] / candidate[1]);
    if (diff < smallest_diff) {
      smallest_diff = diff;
      best = candidate;
    } else if (diff === smallest_diff && area > 0.5 * image_size * image_size * candidate[0] * candidate[1]) {
      // Tie-break: prefer the later (typically larger) tiling for big images.
      best = candidate;
    }
  }
  return best;
}
|
|
20060
|
+
function get_target_ratios(min_tiles, max_tiles) {
|
|
20061
|
+
const ratios = [];
|
|
20062
|
+
const seen = /* @__PURE__ */ new Set();
|
|
20063
|
+
for (let n = min_tiles; n <= max_tiles; ++n) {
|
|
20064
|
+
for (let w = 1; w <= n; ++w) {
|
|
20065
|
+
for (let h = 1; h <= n; ++h) {
|
|
20066
|
+
const product2 = w * h;
|
|
20067
|
+
if (product2 >= min_tiles && product2 <= max_tiles) {
|
|
20068
|
+
const key = w << 16 | h;
|
|
20069
|
+
if (!seen.has(key)) {
|
|
20070
|
+
seen.add(key);
|
|
20071
|
+
ratios.push([w, h]);
|
|
20072
|
+
}
|
|
20073
|
+
}
|
|
20074
|
+
}
|
|
20075
|
+
}
|
|
20076
|
+
}
|
|
20077
|
+
return ratios.sort((a, b) => a[0] * a[1] - b[0] * b[1]);
|
|
20078
|
+
}
|
|
20079
|
+
function convert_image_to_patches(images, patch_size) {
|
|
20080
|
+
const [B, C, H, W] = images.dims;
|
|
20081
|
+
const ph = Math.floor(H / patch_size), pw = Math.floor(W / patch_size);
|
|
20082
|
+
const patch_dim = patch_size * patch_size * C;
|
|
20083
|
+
const data = (
|
|
20084
|
+
/** @type {Float32Array} */
|
|
20085
|
+
images.data
|
|
20086
|
+
);
|
|
20087
|
+
const result = new Float32Array(B * ph * pw * patch_dim);
|
|
20088
|
+
const ch_stride = H * W;
|
|
20089
|
+
for (let b = 0; b < B; ++b) {
|
|
20090
|
+
const b_src = b * C * ch_stride;
|
|
20091
|
+
const b_dst = b * ph * pw * patch_dim;
|
|
20092
|
+
for (let py = 0; py < ph; ++py) {
|
|
20093
|
+
for (let px = 0; px < pw; ++px) {
|
|
20094
|
+
let off = b_dst + (py * pw + px) * patch_dim;
|
|
20095
|
+
for (let dy = 0; dy < patch_size; ++dy) {
|
|
20096
|
+
const row = (py * patch_size + dy) * W + px * patch_size;
|
|
20097
|
+
for (let dx = 0; dx < patch_size; ++dx) {
|
|
20098
|
+
const pixel = row + dx;
|
|
20099
|
+
for (let c = 0; c < C; ++c) {
|
|
20100
|
+
result[off++] = data[b_src + c * ch_stride + pixel];
|
|
20101
|
+
}
|
|
20102
|
+
}
|
|
20103
|
+
}
|
|
20104
|
+
}
|
|
20105
|
+
}
|
|
20106
|
+
}
|
|
20107
|
+
return new Tensor2("float32", result, [B, ph * pw, patch_dim]);
|
|
20108
|
+
}
|
|
20109
|
+
function pad_along_first_dim(patches, target_length) {
|
|
20110
|
+
const [, len2, dim] = patches.dims;
|
|
20111
|
+
const mask_data = new BigInt64Array(target_length);
|
|
20112
|
+
mask_data.fill(1n, 0, len2);
|
|
20113
|
+
let padded = patches;
|
|
20114
|
+
if (len2 < target_length) {
|
|
20115
|
+
const padded_data = new Float32Array(target_length * dim);
|
|
20116
|
+
padded_data.set(
|
|
20117
|
+
/** @type {Float32Array} */
|
|
20118
|
+
patches.data
|
|
20119
|
+
);
|
|
20120
|
+
padded = new Tensor2("float32", padded_data, [1, target_length, dim]);
|
|
20121
|
+
}
|
|
20122
|
+
return { padded, mask: new Tensor2("int64", mask_data, [target_length]) };
|
|
20123
|
+
}
|
|
20124
|
+
var Lfm2VlImageProcessor = class extends ImageProcessor {
|
|
20125
|
+
constructor(config) {
|
|
20126
|
+
super(config);
|
|
20127
|
+
this.downsample_factor = config.downsample_factor ?? 2;
|
|
20128
|
+
this.do_image_splitting = config.do_image_splitting ?? true;
|
|
20129
|
+
this.min_tiles = config.min_tiles ?? 2;
|
|
20130
|
+
this.max_tiles = config.max_tiles ?? 10;
|
|
20131
|
+
this.use_thumbnail = config.use_thumbnail ?? true;
|
|
20132
|
+
this.min_image_tokens = config.min_image_tokens ?? 64;
|
|
20133
|
+
this.max_image_tokens = config.max_image_tokens ?? 256;
|
|
20134
|
+
this.encoder_patch_size = config.encoder_patch_size ?? config.patch_size ?? 16;
|
|
20135
|
+
this.tile_size = config.tile_size ?? 512;
|
|
20136
|
+
this.max_pixels_tolerance = config.max_pixels_tolerance ?? 2;
|
|
20137
|
+
this.return_row_col_info = config.return_row_col_info ?? false;
|
|
20138
|
+
const max_thumbnail_patches = this.max_image_tokens * this.downsample_factor ** 2;
|
|
20139
|
+
const tile_size_patches = this.do_image_splitting ? (this.tile_size / this.encoder_patch_size) ** 2 : 0;
|
|
20140
|
+
this.max_num_patches = Math.max(max_thumbnail_patches, tile_size_patches);
|
|
20141
|
+
}
|
|
20142
|
+
/**
|
|
20143
|
+
* Check if the image is too large to be processed as a single tile.
|
|
20144
|
+
* @param {number} height
|
|
20145
|
+
* @param {number} width
|
|
20146
|
+
* @returns {boolean}
|
|
20147
|
+
*/
|
|
20148
|
+
_is_image_too_large(height, width) {
|
|
20149
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
20150
|
+
const h_bar = Math.max(this.encoder_patch_size, round_by_factor(height, total_factor));
|
|
20151
|
+
const w_bar = Math.max(this.encoder_patch_size, round_by_factor(width, total_factor));
|
|
20152
|
+
return h_bar * w_bar > this.max_image_tokens * (this.encoder_patch_size * this.downsample_factor) ** 2 * this.max_pixels_tolerance;
|
|
20153
|
+
}
|
|
20154
|
+
/**
|
|
20155
|
+
* Get the grid layout for tiling a large image.
|
|
20156
|
+
* @param {number} height
|
|
20157
|
+
* @param {number} width
|
|
20158
|
+
* @returns {{ grid_width: number, grid_height: number, target_width: number, target_height: number }}
|
|
20159
|
+
*/
|
|
20160
|
+
_get_grid_layout(height, width) {
|
|
20161
|
+
const target_ratios = get_target_ratios(this.min_tiles, this.max_tiles);
|
|
20162
|
+
const [grid_width, grid_height] = find_closest_aspect_ratio(
|
|
20163
|
+
width / height,
|
|
20164
|
+
target_ratios,
|
|
20165
|
+
width,
|
|
20166
|
+
height,
|
|
20167
|
+
this.tile_size
|
|
20168
|
+
);
|
|
20169
|
+
return {
|
|
20170
|
+
grid_width,
|
|
20171
|
+
grid_height,
|
|
20172
|
+
target_width: this.tile_size * grid_width,
|
|
20173
|
+
target_height: this.tile_size * grid_height
|
|
20174
|
+
};
|
|
20175
|
+
}
|
|
20176
|
+
/** @param {RawImage|RawImage[]|RawImage[][]} images */
|
|
20177
|
+
// @ts-expect-error
|
|
20178
|
+
async _call(images, { return_row_col_info = null } = {}) {
|
|
20179
|
+
let batched_images;
|
|
20180
|
+
if (!Array.isArray(images)) {
|
|
20181
|
+
batched_images = [[images]];
|
|
20182
|
+
} else if (!Array.isArray(images[0])) {
|
|
20183
|
+
batched_images = [
|
|
20184
|
+
/** @type {RawImage[]} */
|
|
20185
|
+
images
|
|
20186
|
+
];
|
|
20187
|
+
} else {
|
|
20188
|
+
batched_images = /** @type {RawImage[][]} */
|
|
20189
|
+
images;
|
|
20190
|
+
}
|
|
20191
|
+
const all_pixel_values = [];
|
|
20192
|
+
const all_pixel_masks = [];
|
|
20193
|
+
const all_spatial_shapes = [];
|
|
20194
|
+
const all_rows = [];
|
|
20195
|
+
const all_cols = [];
|
|
20196
|
+
const all_image_sizes = [];
|
|
20197
|
+
for (const image_batch of batched_images) {
|
|
20198
|
+
const preprocessed = await Promise.all(image_batch.map((x) => this.preprocess(x, { do_pad: false })));
|
|
20199
|
+
for (const { pixel_values } of preprocessed) {
|
|
20200
|
+
const [, height, width] = pixel_values.dims;
|
|
20201
|
+
const img = pixel_values.unsqueeze_(0);
|
|
20202
|
+
const total_factor = this.encoder_patch_size * this.downsample_factor;
|
|
20203
|
+
const f2 = total_factor ** 2;
|
|
20204
|
+
const [new_height, new_width] = smart_resize(
|
|
20205
|
+
Math.max(total_factor, height),
|
|
20206
|
+
Math.max(total_factor, width),
|
|
20207
|
+
total_factor,
|
|
20208
|
+
this.min_image_tokens * f2,
|
|
20209
|
+
this.max_image_tokens * f2
|
|
20210
|
+
).map((x) => Math.max(total_factor, x));
|
|
20211
|
+
let tiles;
|
|
20212
|
+
let num_rows = 1, num_cols = 1;
|
|
20213
|
+
const is_large = this._is_image_too_large(height, width);
|
|
20214
|
+
const do_splitting = this.do_image_splitting && !(this.min_tiles === 1 && this.max_tiles === 1);
|
|
20215
|
+
if (is_large && do_splitting) {
|
|
20216
|
+
const { grid_width, grid_height, target_width, target_height } = this._get_grid_layout(
|
|
20217
|
+
height,
|
|
20218
|
+
width
|
|
20219
|
+
);
|
|
20220
|
+
num_rows = grid_height;
|
|
20221
|
+
num_cols = grid_width;
|
|
20222
|
+
const resized = await interpolate_4d(img, {
|
|
20223
|
+
size: [target_height, target_width]
|
|
20224
|
+
});
|
|
20225
|
+
tiles = [];
|
|
20226
|
+
for (let r = 0; r < grid_height; ++r) {
|
|
20227
|
+
for (let c = 0; c < grid_width; ++c) {
|
|
20228
|
+
const y = r * this.tile_size;
|
|
20229
|
+
const x = c * this.tile_size;
|
|
20230
|
+
tiles.push(resized.slice(null, null, [y, y + this.tile_size], [x, x + this.tile_size]));
|
|
20231
|
+
}
|
|
20232
|
+
}
|
|
20233
|
+
if (this.use_thumbnail && grid_width * grid_height !== 1) {
|
|
20234
|
+
tiles.push(await interpolate_4d(img, { size: [new_height, new_width] }));
|
|
20235
|
+
}
|
|
20236
|
+
} else {
|
|
20237
|
+
tiles = [await interpolate_4d(img, { size: [new_height, new_width] })];
|
|
20238
|
+
}
|
|
20239
|
+
for (const tile of tiles) {
|
|
20240
|
+
const [, , th, tw] = tile.dims;
|
|
20241
|
+
const patches = convert_image_to_patches(tile, this.encoder_patch_size);
|
|
20242
|
+
const { padded, mask } = pad_along_first_dim(patches, this.max_num_patches);
|
|
20243
|
+
all_pixel_values.push(padded);
|
|
20244
|
+
all_pixel_masks.push(mask);
|
|
20245
|
+
all_spatial_shapes.push([
|
|
20246
|
+
Math.floor(th / this.encoder_patch_size),
|
|
20247
|
+
Math.floor(tw / this.encoder_patch_size)
|
|
20248
|
+
]);
|
|
20249
|
+
}
|
|
20250
|
+
all_rows.push(num_rows);
|
|
20251
|
+
all_cols.push(num_cols);
|
|
20252
|
+
all_image_sizes.push([new_height, new_width]);
|
|
20253
|
+
}
|
|
20254
|
+
}
|
|
20255
|
+
const result = {
|
|
20256
|
+
pixel_values: cat(all_pixel_values, 0),
|
|
20257
|
+
pixel_attention_mask: stack(all_pixel_masks, 0),
|
|
20258
|
+
spatial_shapes: new Tensor2("int64", BigInt64Array.from(all_spatial_shapes.flat(), BigInt), [
|
|
20259
|
+
all_spatial_shapes.length,
|
|
20260
|
+
2
|
|
20261
|
+
])
|
|
20262
|
+
};
|
|
20263
|
+
if (return_row_col_info ?? this.return_row_col_info) {
|
|
20264
|
+
result.image_rows = all_rows;
|
|
20265
|
+
result.image_cols = all_cols;
|
|
20266
|
+
result.image_sizes = all_image_sizes;
|
|
20267
|
+
}
|
|
20268
|
+
return result;
|
|
20269
|
+
}
|
|
20270
|
+
};
|
|
20271
|
+
|
|
19558
20272
|
// src/models/llava_onevision/image_processing_llava_onevision.js
|
|
19559
20273
|
var LlavaOnevisionImageProcessor = class extends ImageProcessor {
|
|
19560
20274
|
};
|
|
@@ -19778,27 +20492,6 @@ var PvtImageProcessor = class extends ImageProcessor {
|
|
|
19778
20492
|
};
|
|
19779
20493
|
|
|
19780
20494
|
// src/models/qwen2_vl/image_processing_qwen2_vl.js
|
|
19781
|
-
function smart_resize(height, width, factor = 28, min_pixels = 56 * 56, max_pixels = 14 * 14 * 4 * 1280) {
|
|
19782
|
-
if (height < factor || width < factor) {
|
|
19783
|
-
throw new Error(`height:${height} or width:${width} must be larger than factor:${factor}`);
|
|
19784
|
-
} else if (Math.max(height, width) / Math.min(height, width) > 200) {
|
|
19785
|
-
throw new Error(
|
|
19786
|
-
`absolute aspect ratio must be smaller than 200, got ${Math.max(height, width) / Math.min(height, width)}`
|
|
19787
|
-
);
|
|
19788
|
-
}
|
|
19789
|
-
let h_bar = Math.round(height / factor) * factor;
|
|
19790
|
-
let w_bar = Math.round(width / factor) * factor;
|
|
19791
|
-
if (h_bar * w_bar > max_pixels) {
|
|
19792
|
-
const beta = Math.sqrt(height * width / max_pixels);
|
|
19793
|
-
h_bar = Math.floor(height / beta / factor) * factor;
|
|
19794
|
-
w_bar = Math.floor(width / beta / factor) * factor;
|
|
19795
|
-
} else if (h_bar * w_bar < min_pixels) {
|
|
19796
|
-
const beta = Math.sqrt(min_pixels / (height * width));
|
|
19797
|
-
h_bar = Math.ceil(height * beta / factor) * factor;
|
|
19798
|
-
w_bar = Math.ceil(width * beta / factor) * factor;
|
|
19799
|
-
}
|
|
19800
|
-
return [h_bar, w_bar];
|
|
19801
|
-
}
|
|
19802
20495
|
var Qwen2VLImageProcessor = class extends ImageProcessor {
|
|
19803
20496
|
constructor(config) {
|
|
19804
20497
|
super(config);
|
|
@@ -20400,6 +21093,57 @@ ${boi_token}${image_tokens_expanded}${eoi_token}
|
|
|
20400
21093
|
}
|
|
20401
21094
|
};
|
|
20402
21095
|
|
|
21096
|
+
// src/models/granite_speech/processing_granite_speech.js
|
|
21097
|
+
var GraniteSpeechProcessor = class extends Processor {
|
|
21098
|
+
static tokenizer_class = AutoTokenizer;
|
|
21099
|
+
static feature_extractor_class = AutoFeatureExtractor;
|
|
21100
|
+
static uses_processor_config = true;
|
|
21101
|
+
/**
|
|
21102
|
+
* Compute the number of audio tokens for a given raw audio length.
|
|
21103
|
+
* @param {number} audioLength Raw audio sample count.
|
|
21104
|
+
* @returns {number} Number of projector output tokens.
|
|
21105
|
+
*/
|
|
21106
|
+
_get_num_audio_features(audioLength) {
|
|
21107
|
+
const { hop_length } = this.feature_extractor.config.melspec_kwargs;
|
|
21108
|
+
const { projector_window_size, projector_downsample_rate } = this.feature_extractor.config;
|
|
21109
|
+
const effective_window_size = Math.floor(projector_window_size / projector_downsample_rate);
|
|
21110
|
+
const mel_length = Math.floor(audioLength / hop_length) + 1;
|
|
21111
|
+
const encoder_length = Math.floor(mel_length / 2);
|
|
21112
|
+
const nblocks = Math.ceil(encoder_length / projector_window_size);
|
|
21113
|
+
return nblocks * effective_window_size;
|
|
21114
|
+
}
|
|
21115
|
+
/**
|
|
21116
|
+
* @param {string} text The text input to process.
|
|
21117
|
+
* @param {Float32Array} audio The audio input to process.
|
|
21118
|
+
*/
|
|
21119
|
+
async _call(text, audio = null, kwargs = {}) {
|
|
21120
|
+
if (Array.isArray(text)) {
|
|
21121
|
+
throw new Error("Batched inputs are not supported yet.");
|
|
21122
|
+
}
|
|
21123
|
+
let audio_inputs = {};
|
|
21124
|
+
if (audio) {
|
|
21125
|
+
const { input_features } = await this.feature_extractor(audio);
|
|
21126
|
+
audio_inputs["input_features"] = input_features;
|
|
21127
|
+
const audio_embed_size = this._get_num_audio_features(audio.length);
|
|
21128
|
+
const mask_data = new Uint8Array(audio_embed_size).fill(1);
|
|
21129
|
+
audio_inputs["input_features_mask"] = new Tensor2("bool", mask_data, [1, audio_embed_size]);
|
|
21130
|
+
const audio_token = this.config.audio_token ?? "<|audio|>";
|
|
21131
|
+
if (!text.includes(audio_token)) {
|
|
21132
|
+
throw new Error(`The input text does not contain the audio token ${audio_token}.`);
|
|
21133
|
+
}
|
|
21134
|
+
text = text.replaceAll(audio_token, audio_token.repeat(audio_embed_size));
|
|
21135
|
+
}
|
|
21136
|
+
const text_inputs = this.tokenizer(text, {
|
|
21137
|
+
add_special_tokens: false,
|
|
21138
|
+
...kwargs
|
|
21139
|
+
});
|
|
21140
|
+
return {
|
|
21141
|
+
...text_inputs,
|
|
21142
|
+
...audio_inputs
|
|
21143
|
+
};
|
|
21144
|
+
}
|
|
21145
|
+
};
|
|
21146
|
+
|
|
20403
21147
|
// src/models/grounding_dino/processing_grounding_dino.js
|
|
20404
21148
|
function get_phrases_from_posmap(posmaps, input_ids) {
|
|
20405
21149
|
const left_idx = 0;
|
|
@@ -20676,6 +21420,66 @@ var JinaCLIPProcessor = class extends Processor {
|
|
|
20676
21420
|
}
|
|
20677
21421
|
};
|
|
20678
21422
|
|
|
21423
|
+
// src/models/lfm2_vl/processing_lfm2_vl.js
|
|
21424
|
+
var Lfm2VlProcessor = class extends Processor {
|
|
21425
|
+
static tokenizer_class = AutoTokenizer;
|
|
21426
|
+
static image_processor_class = AutoImageProcessor;
|
|
21427
|
+
/**
|
|
21428
|
+
* @param {RawImage|RawImage[]} images
|
|
21429
|
+
* @param {string|string[]|null} [text]
|
|
21430
|
+
* @param {Record<string, any>} [kwargs]
|
|
21431
|
+
*/
|
|
21432
|
+
async _call(images, text = null, kwargs = {}) {
|
|
21433
|
+
const { image_rows, image_cols, image_sizes, ...image_inputs } = await this.image_processor(images, {
|
|
21434
|
+
...kwargs,
|
|
21435
|
+
return_row_col_info: true
|
|
21436
|
+
});
|
|
21437
|
+
if (text) {
|
|
21438
|
+
const image_token = this.config.image_token ?? "<image>";
|
|
21439
|
+
const {
|
|
21440
|
+
tile_size = 512,
|
|
21441
|
+
downsample_factor = 2,
|
|
21442
|
+
encoder_patch_size = 16,
|
|
21443
|
+
use_thumbnail = true
|
|
21444
|
+
} = (
|
|
21445
|
+
/** @type {Record<string, any>} */
|
|
21446
|
+
this.image_processor.config
|
|
21447
|
+
);
|
|
21448
|
+
const ds2 = (s) => Math.ceil(Math.floor(s / encoder_patch_size) / downsample_factor);
|
|
21449
|
+
const tokens_per_tile = ds2(tile_size) ** 2;
|
|
21450
|
+
const image_start = this.config.image_start_token ?? "<|image_start|>";
|
|
21451
|
+
const image_end = this.config.image_end_token ?? "<|image_end|>";
|
|
21452
|
+
const thumbnail_token = this.config.image_thumbnail ?? "<|img_thumbnail|>";
|
|
21453
|
+
if (!Array.isArray(text)) text = [text];
|
|
21454
|
+
let image_idx = 0;
|
|
21455
|
+
text = text.map((sample) => {
|
|
21456
|
+
const parts = sample.split(image_token);
|
|
21457
|
+
return parts[0] + parts.slice(1).map((part) => {
|
|
21458
|
+
const idx = image_idx++;
|
|
21459
|
+
const [h, w] = image_sizes[idx];
|
|
21460
|
+
const rows = image_rows[idx], cols = image_cols[idx];
|
|
21461
|
+
const tokens_for_image = ds2(h) * ds2(w);
|
|
21462
|
+
let expanded = image_start;
|
|
21463
|
+
if (rows > 1 || cols > 1) {
|
|
21464
|
+
const tile_str = image_token.repeat(tokens_per_tile);
|
|
21465
|
+
for (let r = 0; r < rows; ++r)
|
|
21466
|
+
for (let c = 0; c < cols; ++c)
|
|
21467
|
+
expanded += `<|img_row_${r + 1}_col_${c + 1}|>` + tile_str;
|
|
21468
|
+
if (use_thumbnail) expanded += thumbnail_token + image_token.repeat(tokens_for_image);
|
|
21469
|
+
} else {
|
|
21470
|
+
expanded += image_token.repeat(tokens_for_image);
|
|
21471
|
+
}
|
|
21472
|
+
return expanded + image_end + part;
|
|
21473
|
+
}).join("");
|
|
21474
|
+
});
|
|
21475
|
+
}
|
|
21476
|
+
return {
|
|
21477
|
+
...image_inputs,
|
|
21478
|
+
...text ? this.tokenizer(text, kwargs) : {}
|
|
21479
|
+
};
|
|
21480
|
+
}
|
|
21481
|
+
};
|
|
21482
|
+
|
|
20679
21483
|
// src/models/llava/processing_llava.js
|
|
20680
21484
|
var LlavaProcessor = class extends Processor {
|
|
20681
21485
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -21208,6 +22012,94 @@ var VoxtralProcessor = class extends Processor {
|
|
|
21208
22012
|
}
|
|
21209
22013
|
};
|
|
21210
22014
|
|
|
22015
|
+
// src/models/voxtral_realtime/processing_voxtral_realtime.js
|
|
22016
|
+
var NUM_LEFT_PAD_TOKENS = 32;
|
|
22017
|
+
var NUM_DELAY_TOKENS = 6;
|
|
22018
|
+
var AUDIO_LENGTH_PER_TOK = 8;
|
|
22019
|
+
var OFFLINE_STREAMING_BUFFER_TOKENS = 10;
|
|
22020
|
+
var STREAMING_PAD_TOKEN_ID = 32;
|
|
22021
|
+
var VoxtralRealtimeProcessor = class extends Processor {
|
|
22022
|
+
static tokenizer_class = AutoTokenizer;
|
|
22023
|
+
static feature_extractor_class = AutoFeatureExtractor;
|
|
22024
|
+
static uses_processor_config = false;
|
|
22025
|
+
/** Number of mel frames in the first audio chunk. */
|
|
22026
|
+
get num_mel_frames_first_audio_chunk() {
|
|
22027
|
+
return (NUM_DELAY_TOKENS + 1) * AUDIO_LENGTH_PER_TOK;
|
|
22028
|
+
}
|
|
22029
|
+
/** Number of raw audio samples in the first audio chunk. */
|
|
22030
|
+
get num_samples_first_audio_chunk() {
|
|
22031
|
+
const { hop_length, n_fft } = this.feature_extractor.config;
|
|
22032
|
+
return (this.num_mel_frames_first_audio_chunk - 1) * hop_length + Math.floor(n_fft / 2);
|
|
22033
|
+
}
|
|
22034
|
+
/** Number of raw audio samples per subsequent audio chunk. */
|
|
22035
|
+
get num_samples_per_audio_chunk() {
|
|
22036
|
+
const { hop_length, n_fft } = this.feature_extractor.config;
|
|
22037
|
+
return AUDIO_LENGTH_PER_TOK * hop_length + n_fft;
|
|
22038
|
+
}
|
|
22039
|
+
/** Number of right-pad tokens for non-streaming mode. */
|
|
22040
|
+
get num_right_pad_tokens() {
|
|
22041
|
+
return NUM_DELAY_TOKENS + 1 + OFFLINE_STREAMING_BUFFER_TOKENS;
|
|
22042
|
+
}
|
|
22043
|
+
/** Number of mel frames per text token. */
|
|
22044
|
+
get audio_length_per_tok() {
|
|
22045
|
+
return AUDIO_LENGTH_PER_TOK;
|
|
22046
|
+
}
|
|
22047
|
+
/** Number of raw audio samples per token. */
|
|
22048
|
+
get raw_audio_length_per_tok() {
|
|
22049
|
+
return AUDIO_LENGTH_PER_TOK * this.feature_extractor.config.hop_length;
|
|
22050
|
+
}
|
|
22051
|
+
/**
|
|
22052
|
+
* Process audio input for VoxtralRealtime.
|
|
22053
|
+
*
|
|
22054
|
+
* In streaming mode with `is_first_audio_chunk=true`, the audio is left-padded
|
|
22055
|
+
* with silence and mel features are extracted with `center=true`.
|
|
22056
|
+
* Returns `{ input_ids, input_features }`.
|
|
22057
|
+
*
|
|
22058
|
+
* In streaming mode with `is_first_audio_chunk=false`, the audio chunk is
|
|
22059
|
+
* processed with `center=false` and only `{ input_features }` is returned.
|
|
22060
|
+
*
|
|
22061
|
+
* In non-streaming mode, the audio is right-padded to ensure the model
|
|
22062
|
+
* transcribes the full audio, then processed with `center=true`.
|
|
22063
|
+
* Returns `{ input_features }`.
|
|
22064
|
+
*
|
|
22065
|
+
* @param {Float32Array|Float64Array} audio The audio waveform.
|
|
22066
|
+
* @param {Object} [options]
|
|
22067
|
+
* @param {boolean} [options.is_streaming=false] Whether processing in streaming mode.
|
|
22068
|
+
* @param {boolean} [options.is_first_audio_chunk=true] Whether this is the first audio chunk.
|
|
22069
|
+
* @returns {Promise<Object>}
|
|
22070
|
+
*/
|
|
22071
|
+
async _call(audio, { is_streaming = false, is_first_audio_chunk = true } = {}) {
|
|
22072
|
+
validate_audio_inputs(audio, "VoxtralRealtimeProcessor");
|
|
22073
|
+
if (!is_streaming && !is_first_audio_chunk) {
|
|
22074
|
+
throw new Error("In non-streaming mode (`is_streaming=false`), `is_first_audio_chunk` must be `true`.");
|
|
22075
|
+
}
|
|
22076
|
+
if (is_first_audio_chunk) {
|
|
22077
|
+
if (is_streaming) {
|
|
22078
|
+
const num_left_pad_samples = NUM_LEFT_PAD_TOKENS * this.raw_audio_length_per_tok;
|
|
22079
|
+
const padded_audio = new Float32Array(num_left_pad_samples + audio.length);
|
|
22080
|
+
padded_audio.set(audio, num_left_pad_samples);
|
|
22081
|
+
const audio_encoding = await this.feature_extractor(padded_audio, { center: true });
|
|
22082
|
+
const num_pad_tokens = NUM_LEFT_PAD_TOKENS + NUM_DELAY_TOKENS;
|
|
22083
|
+
const num_input_tokens = 1 + num_pad_tokens;
|
|
22084
|
+
const input_ids_data = new BigInt64Array(num_input_tokens).fill(BigInt(STREAMING_PAD_TOKEN_ID));
|
|
22085
|
+
input_ids_data[0] = 1n;
|
|
22086
|
+
const input_ids = new Tensor2("int64", input_ids_data, [1, num_input_tokens]);
|
|
22087
|
+
return {
|
|
22088
|
+
input_ids,
|
|
22089
|
+
...audio_encoding
|
|
22090
|
+
};
|
|
22091
|
+
} else {
|
|
22092
|
+
const right_pad_samples = this.num_right_pad_tokens * this.raw_audio_length_per_tok;
|
|
22093
|
+
const padded_audio = new Float32Array(audio.length + right_pad_samples);
|
|
22094
|
+
padded_audio.set(audio);
|
|
22095
|
+
return await this.feature_extractor(padded_audio, { center: true });
|
|
22096
|
+
}
|
|
22097
|
+
} else {
|
|
22098
|
+
return await this.feature_extractor(audio, { center: false });
|
|
22099
|
+
}
|
|
22100
|
+
}
|
|
22101
|
+
};
|
|
22102
|
+
|
|
21211
22103
|
// src/models/wav2vec2/processing_wav2vec2.js
|
|
21212
22104
|
var Wav2Vec2Processor = class extends Processor {
|
|
21213
22105
|
static tokenizer_class = AutoTokenizer;
|
|
@@ -21307,10 +22199,13 @@ function getNormalizedConfig(config) {
|
|
|
21307
22199
|
case "florence2":
|
|
21308
22200
|
case "llava_onevision":
|
|
21309
22201
|
case "idefics3":
|
|
22202
|
+
case "granite_speech":
|
|
21310
22203
|
case "ultravox":
|
|
21311
22204
|
case "voxtral":
|
|
22205
|
+
case "voxtral_realtime":
|
|
21312
22206
|
case "smolvlm":
|
|
21313
22207
|
case "gemma3n":
|
|
22208
|
+
case "lfm2_vl":
|
|
21314
22209
|
case "chatterbox":
|
|
21315
22210
|
case "mistral3":
|
|
21316
22211
|
case "qwen2_5_vl":
|
|
@@ -21365,10 +22260,13 @@ function getNormalizedConfig(config) {
|
|
|
21365
22260
|
case "cohere":
|
|
21366
22261
|
case "cohere2":
|
|
21367
22262
|
case "mistral":
|
|
22263
|
+
case "voxtral_realtime_text":
|
|
22264
|
+
case "voxtral_realtime_encoder":
|
|
21368
22265
|
case "starcoder2":
|
|
21369
22266
|
case "qwen2":
|
|
21370
22267
|
case "qwen2_moe":
|
|
21371
22268
|
case "qwen2_vl":
|
|
22269
|
+
case "qwen2_vl_text":
|
|
21372
22270
|
case "qwen2_5_vl_text":
|
|
21373
22271
|
case "qwen3_moe":
|
|
21374
22272
|
case "qwen3_vl_text":
|
|
@@ -21513,6 +22411,9 @@ function getNormalizedConfig(config) {
|
|
|
21513
22411
|
return normalized_config;
|
|
21514
22412
|
}
|
|
21515
22413
|
function getCacheShapes(config, options) {
|
|
22414
|
+
if (!(config instanceof PretrainedConfig)) {
|
|
22415
|
+
config = new PretrainedConfig(config);
|
|
22416
|
+
}
|
|
21516
22417
|
if (["lfm2", "lfm2_moe"].includes(config.model_type)) {
|
|
21517
22418
|
const pkv_prefix = options?.prefix ?? "past_key_values";
|
|
21518
22419
|
const conv_prefix = pkv_prefix === "present" ? "present" : "past";
|
|
@@ -21619,12 +22520,16 @@ function getCacheShapes(config, options) {
|
|
|
21619
22520
|
}
|
|
21620
22521
|
}
|
|
21621
22522
|
return cache_values;
|
|
21622
|
-
} else if (["qwen3_5", "qwen3_5_moe"].includes(config.model_type)) {
|
|
21623
|
-
|
|
21624
|
-
|
|
21625
|
-
|
|
21626
|
-
|
|
21627
|
-
|
|
22523
|
+
} else if (["lfm2_vl", "qwen3_5", "qwen3_5_moe", "voxtral_realtime"].includes(config.model_type)) {
|
|
22524
|
+
let subConfig;
|
|
22525
|
+
if (config.model_type === "voxtral_realtime" && options?.session_name === "audio_encoder") {
|
|
22526
|
+
subConfig = /** @type {any} */
|
|
22527
|
+
config.audio_config;
|
|
22528
|
+
} else {
|
|
22529
|
+
subConfig = /** @type {any} */
|
|
22530
|
+
config.text_config;
|
|
22531
|
+
}
|
|
22532
|
+
return getCacheShapes(subConfig, options);
|
|
21628
22533
|
}
|
|
21629
22534
|
return getKeyValueShapes(config, options);
|
|
21630
22535
|
}
|
|
@@ -21790,7 +22695,7 @@ async function getModelDataFiles(pretrained_model_name_or_path, fileName, suffix
|
|
|
21790
22695
|
}
|
|
21791
22696
|
|
|
21792
22697
|
// src/models/session.js
|
|
21793
|
-
async function getSession(pretrained_model_name_or_path, fileName, options,
|
|
22698
|
+
async function getSession(pretrained_model_name_or_path, fileName, options, cache_config = false, session_name = void 0) {
|
|
21794
22699
|
let custom_config = options.config?.["transformers.js_config"] ?? {};
|
|
21795
22700
|
const selectedDevice = (
|
|
21796
22701
|
/** @type {import("../utils/devices.js").DeviceType} */
|
|
@@ -21848,9 +22753,10 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
21848
22753
|
if (externalData.length > 0 && !apis.IS_NODE_ENV) {
|
|
21849
22754
|
session_options.externalData = externalData;
|
|
21850
22755
|
}
|
|
21851
|
-
if (
|
|
22756
|
+
if (cache_config && selectedDevice === "webgpu" && kv_cache_dtype_config !== false) {
|
|
21852
22757
|
const shapes = getCacheShapes(options.config, {
|
|
21853
|
-
prefix: "present"
|
|
22758
|
+
prefix: "present",
|
|
22759
|
+
session_name
|
|
21854
22760
|
});
|
|
21855
22761
|
if (Object.keys(shapes).length > 0 && !isONNXProxy()) {
|
|
21856
22762
|
const preferredOutputLocation = {};
|
|
@@ -21868,15 +22774,17 @@ async function getSession(pretrained_model_name_or_path, fileName, options, is_d
|
|
|
21868
22774
|
};
|
|
21869
22775
|
return { buffer_or_path, session_options, session_config };
|
|
21870
22776
|
}
|
|
21871
|
-
async function constructSessions(pretrained_model_name_or_path, names, options,
|
|
22777
|
+
async function constructSessions(pretrained_model_name_or_path, names, options, cache_sessions = void 0) {
|
|
21872
22778
|
return Object.fromEntries(
|
|
21873
22779
|
await Promise.all(
|
|
21874
22780
|
Object.keys(names).map(async (name) => {
|
|
22781
|
+
const cache_config = cache_sessions?.[name] ?? false;
|
|
21875
22782
|
const { buffer_or_path, session_options, session_config } = await getSession(
|
|
21876
22783
|
pretrained_model_name_or_path,
|
|
21877
22784
|
names[name],
|
|
21878
22785
|
options,
|
|
21879
|
-
|
|
22786
|
+
cache_config,
|
|
22787
|
+
name
|
|
21880
22788
|
);
|
|
21881
22789
|
const session = await createInferenceSession(buffer_or_path, session_options, session_config);
|
|
21882
22790
|
return [name, session];
|
|
@@ -23176,19 +24084,71 @@ var BeamSearchSampler = class extends LogitsSampler {
|
|
|
23176
24084
|
}
|
|
23177
24085
|
};
|
|
23178
24086
|
|
|
24087
|
+
// src/cache_utils.js
|
|
24088
|
+
var _DynamicCache = class {
|
|
24089
|
+
/**
|
|
24090
|
+
* Create a DynamicCache, optionally pre-populated with entries.
|
|
24091
|
+
* @param {Record<string, Tensor>} [entries] Initial name→Tensor mappings.
|
|
24092
|
+
*/
|
|
24093
|
+
constructor(entries) {
|
|
24094
|
+
if (!entries) return;
|
|
24095
|
+
for (const key in entries) {
|
|
24096
|
+
if (key in this) {
|
|
24097
|
+
throw new TypeError(`Key "${key}" conflicts with an existing property on DynamicCache`);
|
|
24098
|
+
}
|
|
24099
|
+
const value = entries[key];
|
|
24100
|
+
if (!(value instanceof Tensor2)) {
|
|
24101
|
+
throw new TypeError(`Expected a Tensor for key "${key}", got ${typeof value}`);
|
|
24102
|
+
}
|
|
24103
|
+
this[key] = value;
|
|
24104
|
+
}
|
|
24105
|
+
}
|
|
24106
|
+
/**
|
|
24107
|
+
* Get the cached sequence length. This requires at least one attention cache entry to be present.
|
|
24108
|
+
* @returns {number} The past sequence length.
|
|
24109
|
+
*/
|
|
24110
|
+
get_seq_length() {
|
|
24111
|
+
const self2 = (
|
|
24112
|
+
/** @type {any} */
|
|
24113
|
+
this
|
|
24114
|
+
);
|
|
24115
|
+
for (const name in self2) {
|
|
24116
|
+
if (name.startsWith("past_key_values.")) {
|
|
24117
|
+
return self2[name].dims.at(-2);
|
|
24118
|
+
}
|
|
24119
|
+
}
|
|
24120
|
+
throw new Error("Unable to determine sequence length from the cache.");
|
|
24121
|
+
}
|
|
24122
|
+
/**
|
|
24123
|
+
* Dispose all contained tensors whose data resides on the GPU.
|
|
24124
|
+
* Returns a promise that resolves when all disposals are complete.
|
|
24125
|
+
* @returns {Promise<void>} Promise that resolves when all GPU tensors are disposed.
|
|
24126
|
+
*/
|
|
24127
|
+
async dispose() {
|
|
24128
|
+
const promises = [];
|
|
24129
|
+
for (
|
|
24130
|
+
const t of
|
|
24131
|
+
/** @type {Tensor[]} */
|
|
24132
|
+
Object.values(this)
|
|
24133
|
+
) {
|
|
24134
|
+
if (t.location === "gpu-buffer") {
|
|
24135
|
+
promises.push(t.dispose());
|
|
24136
|
+
}
|
|
24137
|
+
}
|
|
24138
|
+
await Promise.all(promises);
|
|
24139
|
+
}
|
|
24140
|
+
};
|
|
24141
|
+
var DynamicCache = (
|
|
24142
|
+
/** @type {new (entries?: Record<string, Tensor>) => DynamicCache} */
|
|
24143
|
+
/** @type {unknown} */
|
|
24144
|
+
_DynamicCache
|
|
24145
|
+
);
|
|
24146
|
+
|
|
23179
24147
|
// src/models/modeling_utils.js
|
|
23180
24148
|
var MODEL_MAPPING_NAMES = null;
|
|
23181
24149
|
function registerTaskMappings(mappings) {
|
|
23182
24150
|
MODEL_MAPPING_NAMES = mappings;
|
|
23183
24151
|
}
|
|
23184
|
-
function getPastLength(past_key_values) {
|
|
23185
|
-
for (const name in past_key_values) {
|
|
23186
|
-
if (name.startsWith("past_key_values.")) {
|
|
23187
|
-
return past_key_values[name].dims.at(-2);
|
|
23188
|
-
}
|
|
23189
|
-
}
|
|
23190
|
-
return Object.values(past_key_values)[0].dims.at(-2);
|
|
23191
|
-
}
|
|
23192
24152
|
function toI64Tensor(items) {
|
|
23193
24153
|
if (items instanceof Tensor2) {
|
|
23194
24154
|
return items;
|
|
@@ -23229,71 +24189,181 @@ var MODEL_TYPES = {
|
|
|
23229
24189
|
AutoEncoder: 12,
|
|
23230
24190
|
ImageAudioTextToText: 13,
|
|
23231
24191
|
Supertonic: 14,
|
|
23232
|
-
Chatterbox: 15
|
|
24192
|
+
Chatterbox: 15,
|
|
24193
|
+
MultimodalLanguageModelOnly: 16,
|
|
24194
|
+
VoxtralRealtime: 17
|
|
23233
24195
|
};
|
|
23234
24196
|
var MODEL_TYPE_CONFIG = {
|
|
23235
24197
|
[MODEL_TYPES.DecoderOnly]: {
|
|
23236
24198
|
can_generate: true,
|
|
23237
24199
|
forward: decoder_forward,
|
|
23238
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
24200
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
24201
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" }),
|
|
24202
|
+
cache_sessions: { model: true },
|
|
24203
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23239
24204
|
},
|
|
23240
24205
|
[MODEL_TYPES.DecoderOnlyWithoutHead]: {
|
|
23241
24206
|
can_generate: false,
|
|
23242
24207
|
forward: decoder_forward,
|
|
23243
|
-
prepare_inputs: decoder_prepare_inputs_for_generation
|
|
24208
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
24209
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
23244
24210
|
},
|
|
23245
24211
|
[MODEL_TYPES.Seq2Seq]: {
|
|
23246
24212
|
can_generate: true,
|
|
23247
24213
|
forward: seq2seq_forward,
|
|
23248
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
24214
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
24215
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
24216
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24217
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23249
24218
|
},
|
|
23250
24219
|
[MODEL_TYPES.Vision2Seq]: {
|
|
23251
24220
|
can_generate: true,
|
|
23252
24221
|
forward: seq2seq_forward,
|
|
23253
|
-
prepare_inputs: encoder_decoder_prepare_inputs_for_generation
|
|
24222
|
+
prepare_inputs: encoder_decoder_prepare_inputs_for_generation,
|
|
24223
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
24224
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24225
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23254
24226
|
},
|
|
23255
24227
|
[MODEL_TYPES.Musicgen]: {
|
|
23256
24228
|
can_generate: true,
|
|
23257
|
-
forward: seq2seq_forward
|
|
24229
|
+
forward: seq2seq_forward,
|
|
24230
|
+
sessions: () => ({
|
|
24231
|
+
model: "text_encoder",
|
|
24232
|
+
decoder_model_merged: "decoder_model_merged",
|
|
24233
|
+
encodec_decode: "encodec_decode"
|
|
24234
|
+
}),
|
|
24235
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24236
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23258
24237
|
},
|
|
23259
24238
|
[MODEL_TYPES.EncoderDecoder]: {
|
|
23260
24239
|
can_generate: false,
|
|
23261
|
-
forward: seq2seq_forward
|
|
24240
|
+
forward: seq2seq_forward,
|
|
24241
|
+
sessions: () => ({ model: "encoder_model", decoder_model_merged: "decoder_model_merged" }),
|
|
24242
|
+
cache_sessions: { decoder_model_merged: true }
|
|
24243
|
+
},
|
|
24244
|
+
[MODEL_TYPES.MaskGeneration]: {
|
|
24245
|
+
sessions: () => ({ model: "vision_encoder", prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder" })
|
|
23262
24246
|
},
|
|
23263
24247
|
[MODEL_TYPES.ImageTextToText]: {
|
|
23264
24248
|
can_generate: true,
|
|
23265
24249
|
forward: image_text_to_text_forward,
|
|
23266
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24250
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24251
|
+
sessions: (config) => {
|
|
24252
|
+
const s = {
|
|
24253
|
+
embed_tokens: "embed_tokens",
|
|
24254
|
+
vision_encoder: "vision_encoder",
|
|
24255
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24256
|
+
};
|
|
24257
|
+
if (config.is_encoder_decoder) s["model"] = "encoder_model";
|
|
24258
|
+
return s;
|
|
24259
|
+
},
|
|
24260
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24261
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23267
24262
|
},
|
|
23268
24263
|
[MODEL_TYPES.AudioTextToText]: {
|
|
23269
24264
|
can_generate: true,
|
|
23270
24265
|
forward: audio_text_to_text_forward,
|
|
23271
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24266
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24267
|
+
sessions: () => ({
|
|
24268
|
+
embed_tokens: "embed_tokens",
|
|
24269
|
+
audio_encoder: "audio_encoder",
|
|
24270
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24271
|
+
}),
|
|
24272
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24273
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23272
24274
|
},
|
|
23273
|
-
[MODEL_TYPES.
|
|
24275
|
+
[MODEL_TYPES.ImageAudioTextToText]: {
|
|
23274
24276
|
can_generate: true,
|
|
23275
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24277
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24278
|
+
sessions: () => ({
|
|
24279
|
+
embed_tokens: "embed_tokens",
|
|
24280
|
+
audio_encoder: "audio_encoder",
|
|
24281
|
+
vision_encoder: "vision_encoder",
|
|
24282
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24283
|
+
}),
|
|
24284
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23276
24285
|
},
|
|
23277
|
-
[MODEL_TYPES.
|
|
24286
|
+
[MODEL_TYPES.Phi3V]: {
|
|
23278
24287
|
can_generate: true,
|
|
23279
|
-
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation
|
|
24288
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24289
|
+
sessions: () => ({
|
|
24290
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
24291
|
+
model: "model",
|
|
24292
|
+
vision_encoder: "vision_encoder"
|
|
24293
|
+
}),
|
|
24294
|
+
cache_sessions: { model: true },
|
|
24295
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23280
24296
|
},
|
|
23281
24297
|
[MODEL_TYPES.MultiModality]: {
|
|
23282
|
-
can_generate: true
|
|
24298
|
+
can_generate: true,
|
|
24299
|
+
sessions: () => ({
|
|
24300
|
+
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
24301
|
+
model: "language_model",
|
|
24302
|
+
lm_head: "lm_head",
|
|
24303
|
+
gen_head: "gen_head",
|
|
24304
|
+
gen_img_embeds: "gen_img_embeds",
|
|
24305
|
+
image_decode: "image_decode"
|
|
24306
|
+
}),
|
|
24307
|
+
cache_sessions: { model: true },
|
|
24308
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23283
24309
|
},
|
|
23284
24310
|
[MODEL_TYPES.AutoEncoder]: {
|
|
23285
24311
|
can_generate: false,
|
|
23286
|
-
forward: auto_encoder_forward
|
|
24312
|
+
forward: auto_encoder_forward,
|
|
24313
|
+
sessions: () => ({ encoder_model: "encoder_model", decoder_model: "decoder_model" })
|
|
24314
|
+
},
|
|
24315
|
+
[MODEL_TYPES.Supertonic]: {
|
|
24316
|
+
sessions: () => ({
|
|
24317
|
+
text_encoder: "text_encoder",
|
|
24318
|
+
latent_denoiser: "latent_denoiser",
|
|
24319
|
+
voice_decoder: "voice_decoder"
|
|
24320
|
+
})
|
|
23287
24321
|
},
|
|
23288
24322
|
[MODEL_TYPES.Chatterbox]: {
|
|
23289
24323
|
can_generate: true,
|
|
23290
|
-
forward: encoder_forward
|
|
24324
|
+
forward: encoder_forward,
|
|
24325
|
+
sessions: () => ({
|
|
24326
|
+
embed_tokens: "embed_tokens",
|
|
24327
|
+
speech_encoder: "speech_encoder",
|
|
24328
|
+
model: "language_model",
|
|
24329
|
+
conditional_decoder: "conditional_decoder"
|
|
24330
|
+
}),
|
|
24331
|
+
cache_sessions: { model: true },
|
|
24332
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
24333
|
+
},
|
|
24334
|
+
[MODEL_TYPES.MultimodalLanguageModelOnly]: {
|
|
24335
|
+
can_generate: true,
|
|
24336
|
+
forward: image_text_to_text_forward,
|
|
24337
|
+
prepare_inputs: multimodal_text_to_text_prepare_inputs_for_generation,
|
|
24338
|
+
sessions: () => ({ embed_tokens: "embed_tokens", decoder_model_merged: "decoder_model_merged" }),
|
|
24339
|
+
cache_sessions: { decoder_model_merged: true },
|
|
24340
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
24341
|
+
},
|
|
24342
|
+
[MODEL_TYPES.VoxtralRealtime]: {
|
|
24343
|
+
can_generate: true,
|
|
24344
|
+
prepare_inputs: decoder_prepare_inputs_for_generation,
|
|
24345
|
+
sessions: () => ({
|
|
24346
|
+
embed_tokens: "embed_tokens",
|
|
24347
|
+
audio_encoder: "audio_encoder",
|
|
24348
|
+
decoder_model_merged: "decoder_model_merged"
|
|
24349
|
+
}),
|
|
24350
|
+
cache_sessions: { decoder_model_merged: true, audio_encoder: true },
|
|
24351
|
+
optional_configs: { generation_config: "generation_config.json" }
|
|
23291
24352
|
},
|
|
23292
24353
|
default: {
|
|
23293
24354
|
can_generate: false,
|
|
23294
|
-
forward: encoder_forward
|
|
24355
|
+
forward: encoder_forward,
|
|
24356
|
+
sessions: (config, options) => ({ model: options.model_file_name ?? "model" })
|
|
23295
24357
|
}
|
|
23296
24358
|
};
|
|
24359
|
+
function getSessionsConfig(modelType, config, options = {}) {
|
|
24360
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
24361
|
+
return {
|
|
24362
|
+
sessions: typeConfig.sessions(config, options),
|
|
24363
|
+
cache_sessions: typeConfig.cache_sessions,
|
|
24364
|
+
optional_configs: typeConfig.optional_configs
|
|
24365
|
+
};
|
|
24366
|
+
}
|
|
23297
24367
|
var MODEL_TYPE_MAPPING = /* @__PURE__ */ new Map();
|
|
23298
24368
|
var MODEL_NAME_TO_CLASS_MAPPING = /* @__PURE__ */ new Map();
|
|
23299
24369
|
var MODEL_CLASS_TO_NAME_MAPPING = /* @__PURE__ */ new Map();
|
|
@@ -23379,245 +24449,23 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23379
24449
|
const modelName = MODEL_CLASS_TO_NAME_MAPPING.get(this);
|
|
23380
24450
|
const modelType = MODEL_TYPE_MAPPING.get(modelName);
|
|
23381
24451
|
config = options.config = await AutoConfig.from_pretrained(pretrained_model_name_or_path, options);
|
|
23382
|
-
|
|
23383
|
-
if (modelType ===
|
|
23384
|
-
|
|
23385
|
-
|
|
23386
|
-
|
|
23387
|
-
{
|
|
23388
|
-
|
|
23389
|
-
},
|
|
23390
|
-
options,
|
|
23391
|
-
"model"
|
|
23392
|
-
),
|
|
23393
|
-
get_optional_configs(
|
|
23394
|
-
pretrained_model_name_or_path,
|
|
23395
|
-
{
|
|
23396
|
-
generation_config: "generation_config.json"
|
|
23397
|
-
},
|
|
23398
|
-
options
|
|
23399
|
-
)
|
|
23400
|
-
]);
|
|
23401
|
-
} else if (modelType === MODEL_TYPES.Seq2Seq || modelType === MODEL_TYPES.Vision2Seq) {
|
|
23402
|
-
info = await Promise.all([
|
|
23403
|
-
constructSessions(
|
|
23404
|
-
pretrained_model_name_or_path,
|
|
23405
|
-
{
|
|
23406
|
-
model: "encoder_model",
|
|
23407
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23408
|
-
},
|
|
23409
|
-
options,
|
|
23410
|
-
"decoder_model_merged"
|
|
23411
|
-
),
|
|
23412
|
-
get_optional_configs(
|
|
23413
|
-
pretrained_model_name_or_path,
|
|
23414
|
-
{
|
|
23415
|
-
generation_config: "generation_config.json"
|
|
23416
|
-
},
|
|
23417
|
-
options
|
|
23418
|
-
)
|
|
23419
|
-
]);
|
|
23420
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
23421
|
-
info = await Promise.all([
|
|
23422
|
-
constructSessions(
|
|
23423
|
-
pretrained_model_name_or_path,
|
|
23424
|
-
{
|
|
23425
|
-
model: "vision_encoder",
|
|
23426
|
-
prompt_encoder_mask_decoder: "prompt_encoder_mask_decoder"
|
|
23427
|
-
},
|
|
23428
|
-
options
|
|
23429
|
-
)
|
|
23430
|
-
]);
|
|
23431
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
23432
|
-
info = await Promise.all([
|
|
23433
|
-
constructSessions(
|
|
23434
|
-
pretrained_model_name_or_path,
|
|
23435
|
-
{
|
|
23436
|
-
model: "encoder_model",
|
|
23437
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23438
|
-
},
|
|
23439
|
-
options,
|
|
23440
|
-
"decoder_model_merged"
|
|
23441
|
-
)
|
|
23442
|
-
]);
|
|
23443
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
23444
|
-
const sessions = {
|
|
23445
|
-
embed_tokens: "embed_tokens",
|
|
23446
|
-
vision_encoder: "vision_encoder",
|
|
23447
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23448
|
-
};
|
|
23449
|
-
if (config.is_encoder_decoder) {
|
|
23450
|
-
sessions["model"] = "encoder_model";
|
|
23451
|
-
}
|
|
23452
|
-
info = await Promise.all([
|
|
23453
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
23454
|
-
get_optional_configs(
|
|
23455
|
-
pretrained_model_name_or_path,
|
|
23456
|
-
{
|
|
23457
|
-
generation_config: "generation_config.json"
|
|
23458
|
-
},
|
|
23459
|
-
options
|
|
23460
|
-
)
|
|
23461
|
-
]);
|
|
23462
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
23463
|
-
const sessions = {
|
|
23464
|
-
embed_tokens: "embed_tokens",
|
|
23465
|
-
audio_encoder: "audio_encoder",
|
|
23466
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23467
|
-
};
|
|
23468
|
-
info = await Promise.all([
|
|
23469
|
-
constructSessions(pretrained_model_name_or_path, sessions, options, "decoder_model_merged"),
|
|
23470
|
-
get_optional_configs(
|
|
23471
|
-
pretrained_model_name_or_path,
|
|
23472
|
-
{
|
|
23473
|
-
generation_config: "generation_config.json"
|
|
23474
|
-
},
|
|
23475
|
-
options
|
|
23476
|
-
)
|
|
23477
|
-
]);
|
|
23478
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
23479
|
-
const sessions = {
|
|
23480
|
-
embed_tokens: "embed_tokens",
|
|
23481
|
-
audio_encoder: "audio_encoder",
|
|
23482
|
-
vision_encoder: "vision_encoder",
|
|
23483
|
-
decoder_model_merged: "decoder_model_merged"
|
|
23484
|
-
};
|
|
23485
|
-
info = await Promise.all([
|
|
23486
|
-
constructSessions(pretrained_model_name_or_path, sessions, options),
|
|
23487
|
-
get_optional_configs(
|
|
23488
|
-
pretrained_model_name_or_path,
|
|
23489
|
-
{
|
|
23490
|
-
generation_config: "generation_config.json"
|
|
23491
|
-
},
|
|
23492
|
-
options
|
|
23493
|
-
)
|
|
23494
|
-
]);
|
|
23495
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
23496
|
-
info = await Promise.all([
|
|
23497
|
-
constructSessions(
|
|
23498
|
-
pretrained_model_name_or_path,
|
|
23499
|
-
{
|
|
23500
|
-
model: "text_encoder",
|
|
23501
|
-
decoder_model_merged: "decoder_model_merged",
|
|
23502
|
-
encodec_decode: "encodec_decode"
|
|
23503
|
-
},
|
|
23504
|
-
options,
|
|
23505
|
-
"decoder_model_merged"
|
|
23506
|
-
),
|
|
23507
|
-
get_optional_configs(
|
|
23508
|
-
pretrained_model_name_or_path,
|
|
23509
|
-
{
|
|
23510
|
-
generation_config: "generation_config.json"
|
|
23511
|
-
},
|
|
23512
|
-
options
|
|
23513
|
-
)
|
|
23514
|
-
]);
|
|
23515
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
23516
|
-
info = await Promise.all([
|
|
23517
|
-
constructSessions(
|
|
23518
|
-
pretrained_model_name_or_path,
|
|
23519
|
-
{
|
|
23520
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
23521
|
-
model: "language_model",
|
|
23522
|
-
lm_head: "lm_head",
|
|
23523
|
-
gen_head: "gen_head",
|
|
23524
|
-
gen_img_embeds: "gen_img_embeds",
|
|
23525
|
-
image_decode: "image_decode"
|
|
23526
|
-
},
|
|
23527
|
-
options,
|
|
23528
|
-
"model"
|
|
23529
|
-
),
|
|
23530
|
-
get_optional_configs(
|
|
23531
|
-
pretrained_model_name_or_path,
|
|
23532
|
-
{
|
|
23533
|
-
generation_config: "generation_config.json"
|
|
23534
|
-
},
|
|
23535
|
-
options
|
|
23536
|
-
)
|
|
23537
|
-
]);
|
|
23538
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
23539
|
-
info = await Promise.all([
|
|
23540
|
-
constructSessions(
|
|
23541
|
-
pretrained_model_name_or_path,
|
|
23542
|
-
{
|
|
23543
|
-
prepare_inputs_embeds: "prepare_inputs_embeds",
|
|
23544
|
-
model: "model",
|
|
23545
|
-
vision_encoder: "vision_encoder"
|
|
23546
|
-
},
|
|
23547
|
-
options,
|
|
23548
|
-
"model"
|
|
23549
|
-
),
|
|
23550
|
-
get_optional_configs(
|
|
23551
|
-
pretrained_model_name_or_path,
|
|
23552
|
-
{
|
|
23553
|
-
generation_config: "generation_config.json"
|
|
23554
|
-
},
|
|
23555
|
-
options
|
|
23556
|
-
)
|
|
23557
|
-
]);
|
|
23558
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
23559
|
-
info = await Promise.all([
|
|
23560
|
-
constructSessions(
|
|
23561
|
-
pretrained_model_name_or_path,
|
|
23562
|
-
{
|
|
23563
|
-
embed_tokens: "embed_tokens",
|
|
23564
|
-
speech_encoder: "speech_encoder",
|
|
23565
|
-
model: "language_model",
|
|
23566
|
-
conditional_decoder: "conditional_decoder"
|
|
23567
|
-
},
|
|
23568
|
-
options,
|
|
23569
|
-
"model"
|
|
23570
|
-
),
|
|
23571
|
-
get_optional_configs(
|
|
23572
|
-
pretrained_model_name_or_path,
|
|
23573
|
-
{
|
|
23574
|
-
generation_config: "generation_config.json"
|
|
23575
|
-
},
|
|
23576
|
-
options
|
|
23577
|
-
)
|
|
23578
|
-
]);
|
|
23579
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
23580
|
-
info = await Promise.all([
|
|
23581
|
-
constructSessions(
|
|
23582
|
-
pretrained_model_name_or_path,
|
|
23583
|
-
{
|
|
23584
|
-
encoder_model: "encoder_model",
|
|
23585
|
-
decoder_model: "decoder_model"
|
|
23586
|
-
},
|
|
23587
|
-
options
|
|
23588
|
-
)
|
|
23589
|
-
]);
|
|
23590
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
23591
|
-
info = await Promise.all([
|
|
23592
|
-
constructSessions(
|
|
23593
|
-
pretrained_model_name_or_path,
|
|
23594
|
-
{
|
|
23595
|
-
text_encoder: "text_encoder",
|
|
23596
|
-
latent_denoiser: "latent_denoiser",
|
|
23597
|
-
voice_decoder: "voice_decoder"
|
|
23598
|
-
},
|
|
23599
|
-
options
|
|
23600
|
-
)
|
|
23601
|
-
]);
|
|
23602
|
-
} else {
|
|
23603
|
-
if (modelType === void 0) {
|
|
23604
|
-
const type = modelName ?? config?.model_type;
|
|
23605
|
-
if (type !== "custom") {
|
|
23606
|
-
logger.warn(
|
|
23607
|
-
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
23608
|
-
);
|
|
23609
|
-
}
|
|
24452
|
+
const typeConfig = MODEL_TYPE_CONFIG[modelType] ?? MODEL_TYPE_CONFIG.default;
|
|
24453
|
+
if (modelType === void 0) {
|
|
24454
|
+
const type = modelName ?? config?.model_type;
|
|
24455
|
+
if (type !== "custom") {
|
|
24456
|
+
logger.warn(
|
|
24457
|
+
`Model type for '${type}' not found, assuming encoder-only architecture. Please report this at ${GITHUB_ISSUE_URL}.`
|
|
24458
|
+
);
|
|
23610
24459
|
}
|
|
23611
|
-
info = await Promise.all([
|
|
23612
|
-
constructSessions(
|
|
23613
|
-
pretrained_model_name_or_path,
|
|
23614
|
-
{
|
|
23615
|
-
model: options.model_file_name ?? "model"
|
|
23616
|
-
},
|
|
23617
|
-
options
|
|
23618
|
-
)
|
|
23619
|
-
]);
|
|
23620
24460
|
}
|
|
24461
|
+
const sessions = typeConfig.sessions(config, options);
|
|
24462
|
+
const promises = [
|
|
24463
|
+
constructSessions(pretrained_model_name_or_path, sessions, options, typeConfig.cache_sessions)
|
|
24464
|
+
];
|
|
24465
|
+
if (typeConfig.optional_configs) {
|
|
24466
|
+
promises.push(get_optional_configs(pretrained_model_name_or_path, typeConfig.optional_configs, options));
|
|
24467
|
+
}
|
|
24468
|
+
const info = await Promise.all(promises);
|
|
23621
24469
|
return new this(config, ...info);
|
|
23622
24470
|
}
|
|
23623
24471
|
/**
|
|
@@ -23816,7 +24664,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
23816
24664
|
* @param {Tensor} [params.inputs=null]
|
|
23817
24665
|
* @param {number} [params.bos_token_id=null]
|
|
23818
24666
|
* @param {Record<string, Tensor|number[]>} [params.model_kwargs]
|
|
23819
|
-
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor
|
|
24667
|
+
* @returns {{inputs_tensor: Tensor, model_inputs: Record<string, Tensor> & {past_key_values?: DynamicCache}, model_input_name: string}} The model-specific inputs for generation.
|
|
23820
24668
|
*/
|
|
23821
24669
|
_prepare_model_inputs({ inputs, bos_token_id, model_kwargs }) {
|
|
23822
24670
|
const model_inputs = pick(model_kwargs, this.forward_params);
|
|
@@ -24057,11 +24905,12 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
24057
24905
|
}
|
|
24058
24906
|
}
|
|
24059
24907
|
/**
|
|
24060
|
-
* Returns
|
|
24908
|
+
* Returns a DynamicCache containing past key values from the given decoder results object.
|
|
24061
24909
|
*
|
|
24062
24910
|
* @param {Object} decoderResults The decoder results object.
|
|
24063
|
-
* @param {
|
|
24064
|
-
* @
|
|
24911
|
+
* @param {DynamicCache} pastKeyValues The previous past key values.
|
|
24912
|
+
* @param {boolean} [disposeEncoderPKVs=false] Whether to dispose encoder past key values.
|
|
24913
|
+
* @returns {DynamicCache} A new DynamicCache containing the updated past key values.
|
|
24065
24914
|
*/
|
|
24066
24915
|
getPastKeyValues(decoderResults, pastKeyValues, disposeEncoderPKVs = false) {
|
|
24067
24916
|
const pkvs = /* @__PURE__ */ Object.create(null);
|
|
@@ -24082,7 +24931,7 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
24082
24931
|
}
|
|
24083
24932
|
}
|
|
24084
24933
|
}
|
|
24085
|
-
return pkvs;
|
|
24934
|
+
return new DynamicCache(pkvs);
|
|
24086
24935
|
}
|
|
24087
24936
|
/**
|
|
24088
24937
|
* Returns an object containing attentions from the given model output object.
|
|
@@ -24107,8 +24956,8 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
24107
24956
|
/**
|
|
24108
24957
|
* Adds past key values to the decoder feeds object. If pastKeyValues is null, creates new tensors for past key values.
|
|
24109
24958
|
*
|
|
24110
|
-
* @param {
|
|
24111
|
-
* @param {
|
|
24959
|
+
* @param {Record<string, any>} decoderFeeds The decoder feeds object to add past key values to.
|
|
24960
|
+
* @param {DynamicCache|null} pastKeyValues The cache containing past key values.
|
|
24112
24961
|
*/
|
|
24113
24962
|
addPastKeyValues(decoderFeeds, pastKeyValues) {
|
|
24114
24963
|
if (pastKeyValues) {
|
|
@@ -24125,14 +24974,29 @@ var PreTrainedModel = class extends Callable2 {
|
|
|
24125
24974
|
}
|
|
24126
24975
|
}
|
|
24127
24976
|
}
|
|
24128
|
-
|
|
24129
|
-
|
|
24977
|
+
/**
|
|
24978
|
+
* Helper function to select valid inputs and run through the appropriate encoder (vision, text, audio) based on the input type.
|
|
24979
|
+
* @param {string} sessionName
|
|
24980
|
+
* @param {Record<string, Tensor>} inputs
|
|
24981
|
+
* @param {string} outputName
|
|
24982
|
+
* @private
|
|
24983
|
+
*/
|
|
24984
|
+
async _encode_input(sessionName, inputs, outputName) {
|
|
24985
|
+
if (!Object.hasOwn(this.sessions, sessionName)) {
|
|
24986
|
+
throw new Error(`Model does not have a ${sessionName} session.`);
|
|
24987
|
+
}
|
|
24988
|
+
const session = this.sessions[sessionName];
|
|
24989
|
+
const output = await sessionRun(session, pick(inputs, session.inputNames));
|
|
24990
|
+
return output[outputName];
|
|
24991
|
+
}
|
|
24992
|
+
async encode_image(inputs) {
|
|
24993
|
+
return this._encode_input("vision_encoder", inputs, "image_features");
|
|
24130
24994
|
}
|
|
24131
|
-
async encode_text(
|
|
24132
|
-
return
|
|
24995
|
+
async encode_text(inputs) {
|
|
24996
|
+
return this._encode_input("embed_tokens", inputs, "inputs_embeds");
|
|
24133
24997
|
}
|
|
24134
|
-
async encode_audio(
|
|
24135
|
-
return
|
|
24998
|
+
async encode_audio(inputs) {
|
|
24999
|
+
return this._encode_input("audio_encoder", inputs, "audio_features");
|
|
24136
25000
|
}
|
|
24137
25001
|
};
|
|
24138
25002
|
async function seq2seq_forward(self2, model_inputs) {
|
|
@@ -24187,6 +25051,9 @@ async function decoder_forward(self2, model_inputs, is_encoder_decoder = false)
|
|
|
24187
25051
|
const start_index = ["paligemma", "gemma3_text", "gemma3"].includes(self2.config.model_type) ? 1 : 0;
|
|
24188
25052
|
new_model_inputs.position_ids = create_position_ids(new_model_inputs, past_key_values, start_index);
|
|
24189
25053
|
}
|
|
25054
|
+
if (session.inputNames.includes("num_logits_to_keep") && !new_model_inputs.num_logits_to_keep) {
|
|
25055
|
+
new_model_inputs.num_logits_to_keep = new Tensor2("int64", [0n], []);
|
|
25056
|
+
}
|
|
24190
25057
|
self2.addPastKeyValues(new_model_inputs, past_key_values);
|
|
24191
25058
|
const fixed = pick(new_model_inputs, session.inputNames);
|
|
24192
25059
|
return await sessionRun(session, fixed);
|
|
@@ -24195,7 +25062,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24195
25062
|
// Generic parameters:
|
|
24196
25063
|
encode_function,
|
|
24197
25064
|
merge_function,
|
|
24198
|
-
|
|
25065
|
+
modality_input_names,
|
|
24199
25066
|
modality_output_name,
|
|
24200
25067
|
// Produced by the tokenizer/processor:
|
|
24201
25068
|
input_ids = null,
|
|
@@ -24210,32 +25077,34 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24210
25077
|
// Additional parameters
|
|
24211
25078
|
...kwargs
|
|
24212
25079
|
}) {
|
|
24213
|
-
const modality_values = kwargs[modality_input_name];
|
|
24214
25080
|
if (!inputs_embeds) {
|
|
24215
25081
|
inputs_embeds = await self2.encode_text({ input_ids, ...kwargs });
|
|
24216
|
-
|
|
24217
|
-
|
|
24218
|
-
|
|
24219
|
-
|
|
24220
|
-
|
|
24221
|
-
|
|
24222
|
-
|
|
24223
|
-
|
|
24224
|
-
|
|
24225
|
-
inputs_embeds,
|
|
24226
|
-
|
|
24227
|
-
|
|
24228
|
-
|
|
24229
|
-
|
|
24230
|
-
|
|
24231
|
-
|
|
24232
|
-
|
|
24233
|
-
|
|
24234
|
-
|
|
24235
|
-
|
|
24236
|
-
|
|
24237
|
-
|
|
24238
|
-
|
|
25082
|
+
const modality_values = pick(kwargs, modality_input_names);
|
|
25083
|
+
if (Object.keys(modality_values).length > 0) {
|
|
25084
|
+
if (input_ids.dims[1] !== 1) {
|
|
25085
|
+
const modality_features = await encode_function({
|
|
25086
|
+
// Pass the modality values under its expected key.
|
|
25087
|
+
// The caller knows whether this is audio or image.
|
|
25088
|
+
...modality_values,
|
|
25089
|
+
...kwargs
|
|
25090
|
+
});
|
|
25091
|
+
({ inputs_embeds, attention_mask } = merge_function({
|
|
25092
|
+
[modality_output_name]: modality_features,
|
|
25093
|
+
inputs_embeds,
|
|
25094
|
+
input_ids,
|
|
25095
|
+
attention_mask
|
|
25096
|
+
}));
|
|
25097
|
+
} else if (past_key_values && input_ids.dims[1] === 1) {
|
|
25098
|
+
const target_length = input_ids.dims[1];
|
|
25099
|
+
const past_length = past_key_values.get_seq_length();
|
|
25100
|
+
attention_mask = cat(
|
|
25101
|
+
[
|
|
25102
|
+
ones([input_ids.dims[0], past_length]),
|
|
25103
|
+
attention_mask.slice(null, [attention_mask.dims[1] - target_length, attention_mask.dims[1]])
|
|
25104
|
+
],
|
|
25105
|
+
1
|
|
25106
|
+
);
|
|
25107
|
+
}
|
|
24239
25108
|
}
|
|
24240
25109
|
}
|
|
24241
25110
|
if (!position_ids) {
|
|
@@ -24243,10 +25112,13 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24243
25112
|
// Handle special case for qwen vl models
|
|
24244
25113
|
[
|
|
24245
25114
|
"qwen2_vl",
|
|
25115
|
+
"qwen2_vl_text",
|
|
24246
25116
|
"qwen2_5_vl",
|
|
24247
25117
|
"qwen2_5_vl_text",
|
|
24248
25118
|
"qwen3_vl",
|
|
24249
25119
|
"qwen3_vl_text",
|
|
25120
|
+
"qwen3_vl_moe",
|
|
25121
|
+
"qwen3_vl_moe_text",
|
|
24250
25122
|
"qwen3_5",
|
|
24251
25123
|
"qwen3_5_text",
|
|
24252
25124
|
"qwen3_5_moe",
|
|
@@ -24274,7 +25146,7 @@ async function generic_text_to_text_forward(self2, {
|
|
|
24274
25146
|
async function audio_text_to_text_forward(self2, params) {
|
|
24275
25147
|
return await generic_text_to_text_forward(self2, {
|
|
24276
25148
|
...params,
|
|
24277
|
-
|
|
25149
|
+
modality_input_names: ["audio_values", "input_features"],
|
|
24278
25150
|
modality_output_name: "audio_features",
|
|
24279
25151
|
encode_function: self2.encode_audio.bind(self2),
|
|
24280
25152
|
merge_function: self2._merge_input_ids_with_audio_features.bind(self2)
|
|
@@ -24283,7 +25155,7 @@ async function audio_text_to_text_forward(self2, params) {
|
|
|
24283
25155
|
async function image_text_to_text_forward(self2, params) {
|
|
24284
25156
|
return await generic_text_to_text_forward(self2, {
|
|
24285
25157
|
...params,
|
|
24286
|
-
|
|
25158
|
+
modality_input_names: ["pixel_values"],
|
|
24287
25159
|
modality_output_name: "image_features",
|
|
24288
25160
|
encode_function: self2.encode_image.bind(self2),
|
|
24289
25161
|
merge_function: self2._merge_input_ids_with_image_features.bind(self2)
|
|
@@ -24319,7 +25191,11 @@ function create_position_ids(model_inputs, past_key_values = null, start_index =
|
|
|
24319
25191
|
return position_ids;
|
|
24320
25192
|
}
|
|
24321
25193
|
function decoder_prepare_inputs_for_generation(self2, input_ids, model_inputs, generation_config) {
|
|
24322
|
-
const past_length = model_inputs.past_key_values ?
|
|
25194
|
+
const past_length = model_inputs.past_key_values ? model_inputs.past_key_values.get_seq_length() : 0;
|
|
25195
|
+
const session = self2.sessions["decoder_model_merged"] ?? self2.sessions["model"];
|
|
25196
|
+
if (session?.inputNames.includes("num_logits_to_keep") && !model_inputs.num_logits_to_keep) {
|
|
25197
|
+
model_inputs.num_logits_to_keep = new Tensor2("int64", [1n], []);
|
|
25198
|
+
}
|
|
24323
25199
|
if (!model_inputs.attention_mask) {
|
|
24324
25200
|
let dims;
|
|
24325
25201
|
for (const key of ["input_ids", "inputs_embeds", "position_ids"]) {
|
|
@@ -24627,6 +25503,7 @@ __export(models_exports, {
|
|
|
24627
25503
|
Gemma3ForCausalLM: () => Gemma3ForCausalLM,
|
|
24628
25504
|
Gemma3Model: () => Gemma3Model,
|
|
24629
25505
|
Gemma3PreTrainedModel: () => Gemma3PreTrainedModel,
|
|
25506
|
+
Gemma3nForCausalLM: () => Gemma3nForCausalLM,
|
|
24630
25507
|
Gemma3nForConditionalGeneration: () => Gemma3nForConditionalGeneration,
|
|
24631
25508
|
Gemma3nPreTrainedModel: () => Gemma3nPreTrainedModel,
|
|
24632
25509
|
GemmaForCausalLM: () => GemmaForCausalLM,
|
|
@@ -24644,6 +25521,7 @@ __export(models_exports, {
|
|
|
24644
25521
|
GraniteMoeHybridModel: () => GraniteMoeHybridModel,
|
|
24645
25522
|
GraniteMoeHybridPreTrainedModel: () => GraniteMoeHybridPreTrainedModel,
|
|
24646
25523
|
GranitePreTrainedModel: () => GranitePreTrainedModel,
|
|
25524
|
+
GraniteSpeechForConditionalGeneration: () => GraniteSpeechForConditionalGeneration,
|
|
24647
25525
|
GroundingDinoForObjectDetection: () => GroundingDinoForObjectDetection,
|
|
24648
25526
|
GroundingDinoPreTrainedModel: () => GroundingDinoPreTrainedModel,
|
|
24649
25527
|
GroupViTModel: () => GroupViTModel,
|
|
@@ -24665,7 +25543,6 @@ __export(models_exports, {
|
|
|
24665
25543
|
IJepaModel: () => IJepaModel,
|
|
24666
25544
|
IJepaPreTrainedModel: () => IJepaPreTrainedModel,
|
|
24667
25545
|
Idefics3ForConditionalGeneration: () => Idefics3ForConditionalGeneration,
|
|
24668
|
-
Idefics3PreTrainedModel: () => Idefics3PreTrainedModel,
|
|
24669
25546
|
JAISLMHeadModel: () => JAISLMHeadModel,
|
|
24670
25547
|
JAISModel: () => JAISModel,
|
|
24671
25548
|
JAISPreTrainedModel: () => JAISPreTrainedModel,
|
|
@@ -24679,6 +25556,7 @@ __export(models_exports, {
|
|
|
24679
25556
|
Lfm2MoeModel: () => Lfm2MoeModel,
|
|
24680
25557
|
Lfm2MoePreTrainedModel: () => Lfm2MoePreTrainedModel,
|
|
24681
25558
|
Lfm2PreTrainedModel: () => Lfm2PreTrainedModel,
|
|
25559
|
+
Lfm2VlForConditionalGeneration: () => Lfm2VlForConditionalGeneration,
|
|
24682
25560
|
LiteWhisperForConditionalGeneration: () => LiteWhisperForConditionalGeneration,
|
|
24683
25561
|
Llama4ForCausalLM: () => Llama4ForCausalLM,
|
|
24684
25562
|
Llama4PreTrainedModel: () => Llama4PreTrainedModel,
|
|
@@ -24818,7 +25696,6 @@ __export(models_exports, {
|
|
|
24818
25696
|
Owlv2Model: () => Owlv2Model,
|
|
24819
25697
|
Owlv2PreTrainedModel: () => Owlv2PreTrainedModel,
|
|
24820
25698
|
PaliGemmaForConditionalGeneration: () => PaliGemmaForConditionalGeneration,
|
|
24821
|
-
PaliGemmaPreTrainedModel: () => PaliGemmaPreTrainedModel,
|
|
24822
25699
|
ParakeetForCTC: () => ParakeetForCTC,
|
|
24823
25700
|
ParakeetPreTrainedModel: () => ParakeetPreTrainedModel,
|
|
24824
25701
|
PatchTSMixerForPrediction: () => PatchTSMixerForPrediction,
|
|
@@ -24848,8 +25725,10 @@ __export(models_exports, {
|
|
|
24848
25725
|
Qwen2MoeModel: () => Qwen2MoeModel,
|
|
24849
25726
|
Qwen2MoePreTrainedModel: () => Qwen2MoePreTrainedModel,
|
|
24850
25727
|
Qwen2PreTrainedModel: () => Qwen2PreTrainedModel,
|
|
25728
|
+
Qwen2VLForCausalLM: () => Qwen2VLForCausalLM,
|
|
24851
25729
|
Qwen2VLForConditionalGeneration: () => Qwen2VLForConditionalGeneration,
|
|
24852
25730
|
Qwen2VLPreTrainedModel: () => Qwen2VLPreTrainedModel,
|
|
25731
|
+
Qwen2_5_VLForCausalLM: () => Qwen2_5_VLForCausalLM,
|
|
24853
25732
|
Qwen2_5_VLForConditionalGeneration: () => Qwen2_5_VLForConditionalGeneration,
|
|
24854
25733
|
Qwen3ForCausalLM: () => Qwen3ForCausalLM,
|
|
24855
25734
|
Qwen3Model: () => Qwen3Model,
|
|
@@ -24860,9 +25739,13 @@ __export(models_exports, {
|
|
|
24860
25739
|
Qwen3NextModel: () => Qwen3NextModel,
|
|
24861
25740
|
Qwen3NextPreTrainedModel: () => Qwen3NextPreTrainedModel,
|
|
24862
25741
|
Qwen3PreTrainedModel: () => Qwen3PreTrainedModel,
|
|
25742
|
+
Qwen3VLForCausalLM: () => Qwen3VLForCausalLM,
|
|
24863
25743
|
Qwen3VLForConditionalGeneration: () => Qwen3VLForConditionalGeneration,
|
|
25744
|
+
Qwen3VLMoeForCausalLM: () => Qwen3VLMoeForCausalLM,
|
|
24864
25745
|
Qwen3VLMoeForConditionalGeneration: () => Qwen3VLMoeForConditionalGeneration,
|
|
25746
|
+
Qwen3_5ForCausalLM: () => Qwen3_5ForCausalLM,
|
|
24865
25747
|
Qwen3_5ForConditionalGeneration: () => Qwen3_5ForConditionalGeneration,
|
|
25748
|
+
Qwen3_5MoeForCausalLM: () => Qwen3_5MoeForCausalLM,
|
|
24866
25749
|
Qwen3_5MoeForConditionalGeneration: () => Qwen3_5MoeForConditionalGeneration,
|
|
24867
25750
|
RFDetrForObjectDetection: () => RFDetrForObjectDetection,
|
|
24868
25751
|
RFDetrModel: () => RFDetrModel,
|
|
@@ -24913,7 +25796,6 @@ __export(models_exports, {
|
|
|
24913
25796
|
SmolLM3ForCausalLM: () => SmolLM3ForCausalLM,
|
|
24914
25797
|
SmolLM3Model: () => SmolLM3Model,
|
|
24915
25798
|
SmolLM3PreTrainedModel: () => SmolLM3PreTrainedModel,
|
|
24916
|
-
SmolVLMForConditionalGeneration: () => SmolVLMForConditionalGeneration,
|
|
24917
25799
|
SnacDecoderModel: () => SnacDecoderModel,
|
|
24918
25800
|
SnacEncoderModel: () => SnacEncoderModel,
|
|
24919
25801
|
SnacModel: () => SnacModel,
|
|
@@ -24985,6 +25867,8 @@ __export(models_exports, {
|
|
|
24985
25867
|
VitsModelOutput: () => VitsModelOutput,
|
|
24986
25868
|
VitsPreTrainedModel: () => VitsPreTrainedModel,
|
|
24987
25869
|
VoxtralForConditionalGeneration: () => VoxtralForConditionalGeneration,
|
|
25870
|
+
VoxtralRealtimeForConditionalGeneration: () => VoxtralRealtimeForConditionalGeneration,
|
|
25871
|
+
VoxtralRealtimePreTrainedModel: () => VoxtralRealtimePreTrainedModel,
|
|
24988
25872
|
Wav2Vec2BertForCTC: () => Wav2Vec2BertForCTC,
|
|
24989
25873
|
Wav2Vec2BertForSequenceClassification: () => Wav2Vec2BertForSequenceClassification,
|
|
24990
25874
|
Wav2Vec2BertModel: () => Wav2Vec2BertModel,
|
|
@@ -25345,7 +26229,7 @@ var ChatterboxModel = class extends ChatterboxPreTrainedModel {
|
|
|
25345
26229
|
if (!past_key_values || target_length !== 1) {
|
|
25346
26230
|
throw new Error("Incorrect state encountered during generation.");
|
|
25347
26231
|
}
|
|
25348
|
-
const past_length =
|
|
26232
|
+
const past_length = past_key_values.get_seq_length();
|
|
25349
26233
|
attention_mask = ones([inputs_embeds.dims[0], past_length + target_length]);
|
|
25350
26234
|
}
|
|
25351
26235
|
}
|
|
@@ -26375,6 +27259,8 @@ var Gemma3nForConditionalGeneration = class extends Gemma3nPreTrainedModel {
|
|
|
26375
27259
|
});
|
|
26376
27260
|
}
|
|
26377
27261
|
};
|
|
27262
|
+
var Gemma3nForCausalLM = class extends Gemma3nForConditionalGeneration {
|
|
27263
|
+
};
|
|
26378
27264
|
|
|
26379
27265
|
// src/models/glm/modeling_glm.js
|
|
26380
27266
|
var GlmPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -26456,6 +27342,28 @@ var GraniteMoeHybridModel = class extends GraniteMoeHybridPreTrainedModel {
|
|
|
26456
27342
|
var GraniteMoeHybridForCausalLM = class extends GraniteMoeHybridPreTrainedModel {
|
|
26457
27343
|
};
|
|
26458
27344
|
|
|
27345
|
+
// src/models/ultravox/modeling_ultravox.js
|
|
27346
|
+
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
27347
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
27348
|
+
};
|
|
27349
|
+
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
27350
|
+
_merge_input_ids_with_audio_features(kwargs) {
|
|
27351
|
+
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
27352
|
+
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
27353
|
+
return default_merge_input_ids_with_audio_features({
|
|
27354
|
+
// @ts-ignore
|
|
27355
|
+
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id ?? this.config.audio_token_index,
|
|
27356
|
+
...kwargs,
|
|
27357
|
+
audio_features: reshaped_audio_features
|
|
27358
|
+
});
|
|
27359
|
+
}
|
|
27360
|
+
};
|
|
27361
|
+
|
|
27362
|
+
// src/models/granite_speech/modeling_granite_speech.js
|
|
27363
|
+
var GraniteSpeechForConditionalGeneration = class extends UltravoxModel {
|
|
27364
|
+
forward_params = ["input_ids", "attention_mask", "input_features", "past_key_values"];
|
|
27365
|
+
};
|
|
27366
|
+
|
|
26459
27367
|
// src/models/grounding_dino/modeling_grounding_dino.js
|
|
26460
27368
|
var GroundingDinoPreTrainedModel = class extends PreTrainedModel {
|
|
26461
27369
|
};
|
|
@@ -26560,34 +27468,37 @@ var HunYuanDenseV1Model = class extends HunYuanDenseV1PreTrainedModel {
|
|
|
26560
27468
|
var HunYuanDenseV1ForCausalLM = class extends HunYuanDenseV1PreTrainedModel {
|
|
26561
27469
|
};
|
|
26562
27470
|
|
|
26563
|
-
// src/models/
|
|
26564
|
-
var
|
|
26565
|
-
forward_params = [
|
|
26566
|
-
"input_ids",
|
|
26567
|
-
"attention_mask",
|
|
26568
|
-
"pixel_values",
|
|
26569
|
-
"pixel_attention_mask",
|
|
26570
|
-
"position_ids",
|
|
26571
|
-
"past_key_values"
|
|
26572
|
-
];
|
|
27471
|
+
// src/models/llava/modeling_llava.js
|
|
27472
|
+
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
27473
|
+
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
26573
27474
|
};
|
|
26574
|
-
var
|
|
26575
|
-
async encode_image({ pixel_values, pixel_attention_mask }) {
|
|
26576
|
-
const features = (await sessionRun(this.sessions["vision_encoder"], { pixel_values, pixel_attention_mask })).image_features;
|
|
26577
|
-
return features;
|
|
26578
|
-
}
|
|
27475
|
+
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
26579
27476
|
_merge_input_ids_with_image_features(kwargs) {
|
|
26580
27477
|
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
26581
27478
|
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
26582
27479
|
return default_merge_input_ids_with_image_features({
|
|
26583
27480
|
// @ts-ignore
|
|
26584
|
-
image_token_id: this.config.image_token_id,
|
|
27481
|
+
image_token_id: this.config.image_token_index ?? this.config.image_token_id,
|
|
26585
27482
|
...kwargs,
|
|
26586
27483
|
image_features: reshaped_image_hidden_states
|
|
26587
27484
|
});
|
|
26588
27485
|
}
|
|
26589
27486
|
};
|
|
26590
|
-
var
|
|
27487
|
+
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27488
|
+
};
|
|
27489
|
+
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
27490
|
+
};
|
|
27491
|
+
|
|
27492
|
+
// src/models/idefics3/modeling_idefics3.js
|
|
27493
|
+
var Idefics3ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27494
|
+
forward_params = [
|
|
27495
|
+
"input_ids",
|
|
27496
|
+
"attention_mask",
|
|
27497
|
+
"pixel_values",
|
|
27498
|
+
"pixel_attention_mask",
|
|
27499
|
+
"position_ids",
|
|
27500
|
+
"past_key_values"
|
|
27501
|
+
];
|
|
26591
27502
|
};
|
|
26592
27503
|
|
|
26593
27504
|
// src/models/ijepa/modeling_ijepa.js
|
|
@@ -26679,6 +27590,19 @@ var Lfm2MoeModel = class extends Lfm2MoePreTrainedModel {
|
|
|
26679
27590
|
var Lfm2MoeForCausalLM = class extends Lfm2MoePreTrainedModel {
|
|
26680
27591
|
};
|
|
26681
27592
|
|
|
27593
|
+
// src/models/lfm2_vl/modeling_lfm2_vl.js
|
|
27594
|
+
var Lfm2VlForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27595
|
+
forward_params = [
|
|
27596
|
+
"input_ids",
|
|
27597
|
+
"attention_mask",
|
|
27598
|
+
"pixel_values",
|
|
27599
|
+
"pixel_attention_mask",
|
|
27600
|
+
"spatial_shapes",
|
|
27601
|
+
"position_ids",
|
|
27602
|
+
"past_key_values"
|
|
27603
|
+
];
|
|
27604
|
+
};
|
|
27605
|
+
|
|
26682
27606
|
// src/models/llama/modeling_llama.js
|
|
26683
27607
|
var LlamaPreTrainedModel = class extends PreTrainedModel {
|
|
26684
27608
|
};
|
|
@@ -26693,27 +27617,6 @@ var Llama4PreTrainedModel = class extends PreTrainedModel {
|
|
|
26693
27617
|
var Llama4ForCausalLM = class extends Llama4PreTrainedModel {
|
|
26694
27618
|
};
|
|
26695
27619
|
|
|
26696
|
-
// src/models/llava/modeling_llava.js
|
|
26697
|
-
var LlavaPreTrainedModel = class extends PreTrainedModel {
|
|
26698
|
-
forward_params = ["input_ids", "attention_mask", "pixel_values", "position_ids", "past_key_values"];
|
|
26699
|
-
};
|
|
26700
|
-
var LlavaForConditionalGeneration = class extends LlavaPreTrainedModel {
|
|
26701
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
26702
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
26703
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
26704
|
-
return default_merge_input_ids_with_image_features({
|
|
26705
|
-
// @ts-ignore
|
|
26706
|
-
image_token_id: this.config.image_token_index,
|
|
26707
|
-
...kwargs,
|
|
26708
|
-
image_features: reshaped_image_hidden_states
|
|
26709
|
-
});
|
|
26710
|
-
}
|
|
26711
|
-
};
|
|
26712
|
-
var Moondream1ForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
26713
|
-
};
|
|
26714
|
-
var LlavaQwen2ForCausalLM = class extends LlavaForConditionalGeneration {
|
|
26715
|
-
};
|
|
26716
|
-
|
|
26717
27620
|
// src/models/longt5/modeling_longt5.js
|
|
26718
27621
|
var LongT5PreTrainedModel = class extends PreTrainedModel {
|
|
26719
27622
|
};
|
|
@@ -27464,27 +28367,7 @@ var OwlViTForObjectDetection = class extends OwlViTPreTrainedModel {
|
|
|
27464
28367
|
};
|
|
27465
28368
|
|
|
27466
28369
|
// src/models/paligemma/modeling_paligemma.js
|
|
27467
|
-
var
|
|
27468
|
-
forward_params = [
|
|
27469
|
-
"input_ids",
|
|
27470
|
-
// 'inputs_embeds',
|
|
27471
|
-
"attention_mask",
|
|
27472
|
-
"pixel_values",
|
|
27473
|
-
"position_ids",
|
|
27474
|
-
"past_key_values"
|
|
27475
|
-
];
|
|
27476
|
-
};
|
|
27477
|
-
var PaliGemmaForConditionalGeneration = class extends PaliGemmaPreTrainedModel {
|
|
27478
|
-
_merge_input_ids_with_image_features(kwargs) {
|
|
27479
|
-
const vision_hidden_size = kwargs.image_features.dims.at(-1);
|
|
27480
|
-
const reshaped_image_hidden_states = kwargs.image_features.view(-1, vision_hidden_size);
|
|
27481
|
-
return default_merge_input_ids_with_image_features({
|
|
27482
|
-
// @ts-ignore
|
|
27483
|
-
image_token_id: this.config.image_token_index,
|
|
27484
|
-
...kwargs,
|
|
27485
|
-
image_features: reshaped_image_hidden_states
|
|
27486
|
-
});
|
|
27487
|
-
}
|
|
28370
|
+
var PaliGemmaForConditionalGeneration = class extends LlavaForConditionalGeneration {
|
|
27488
28371
|
};
|
|
27489
28372
|
|
|
27490
28373
|
// src/models/parakeet/modeling_parakeet.js
|
|
@@ -27657,6 +28540,9 @@ var Qwen2VLPreTrainedModel = class extends PreTrainedModel {
|
|
|
27657
28540
|
];
|
|
27658
28541
|
};
|
|
27659
28542
|
var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
28543
|
+
// NOTE: This is used as the base class for all Qwen VL models and their CausalLM variants.
|
|
28544
|
+
// CausalLM variants (e.g., Qwen2VLForCausalLM) extend this class but load only
|
|
28545
|
+
// embed_tokens + decoder_model_merged (no vision_encoder) via MultimodalLanguageModelOnly type.
|
|
27660
28546
|
image_grid_thw_name = "grid_thw";
|
|
27661
28547
|
/**
|
|
27662
28548
|
* Calculate the 3D rope index based on image and video's temporal, height and width in LLM.
|
|
@@ -27846,7 +28732,7 @@ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
|
27846
28732
|
);
|
|
27847
28733
|
} else {
|
|
27848
28734
|
model_inputs.pixel_values = null;
|
|
27849
|
-
const past_length =
|
|
28735
|
+
const past_length = model_inputs.past_key_values.get_seq_length();
|
|
27850
28736
|
if (past_length < model_inputs.input_ids.dims[1]) {
|
|
27851
28737
|
const [full_position_ids, rope_deltas] = this.get_rope_index(
|
|
27852
28738
|
model_inputs.input_ids,
|
|
@@ -27875,11 +28761,16 @@ var Qwen2VLForConditionalGeneration = class extends Qwen2VLPreTrainedModel {
|
|
|
27875
28761
|
return model_inputs;
|
|
27876
28762
|
}
|
|
27877
28763
|
};
|
|
28764
|
+
var Qwen2VLForCausalLM = class extends Qwen2VLForConditionalGeneration {
|
|
28765
|
+
};
|
|
27878
28766
|
|
|
27879
28767
|
// src/models/qwen2_5_vl/modeling_qwen2_5_vl.js
|
|
27880
28768
|
var Qwen2_5_VLForConditionalGeneration = class extends Qwen2VLForConditionalGeneration {
|
|
27881
28769
|
image_grid_thw_name = "image_grid_thw";
|
|
27882
28770
|
};
|
|
28771
|
+
var Qwen2_5_VLForCausalLM = class extends Qwen2VLForCausalLM {
|
|
28772
|
+
image_grid_thw_name = "image_grid_thw";
|
|
28773
|
+
};
|
|
27883
28774
|
|
|
27884
28775
|
// src/models/qwen3/modeling_qwen3.js
|
|
27885
28776
|
var Qwen3PreTrainedModel = class extends PreTrainedModel {
|
|
@@ -27908,18 +28799,26 @@ var Qwen3NextForCausalLM = class extends Qwen3NextPreTrainedModel {
|
|
|
27908
28799
|
// src/models/qwen3_vl/modeling_qwen3_vl.js
|
|
27909
28800
|
var Qwen3VLForConditionalGeneration = class extends Qwen2_5_VLForConditionalGeneration {
|
|
27910
28801
|
};
|
|
28802
|
+
var Qwen3VLForCausalLM = class extends Qwen2_5_VLForCausalLM {
|
|
28803
|
+
};
|
|
27911
28804
|
|
|
27912
28805
|
// src/models/qwen3_vl_moe/modeling_qwen3_vl_moe.js
|
|
27913
28806
|
var Qwen3VLMoeForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
27914
28807
|
};
|
|
28808
|
+
var Qwen3VLMoeForCausalLM = class extends Qwen3VLForCausalLM {
|
|
28809
|
+
};
|
|
27915
28810
|
|
|
27916
28811
|
// src/models/qwen3_5/modeling_qwen3_5.js
|
|
27917
28812
|
var Qwen3_5ForConditionalGeneration = class extends Qwen3VLForConditionalGeneration {
|
|
27918
28813
|
};
|
|
28814
|
+
var Qwen3_5ForCausalLM = class extends Qwen3_5ForConditionalGeneration {
|
|
28815
|
+
};
|
|
27919
28816
|
|
|
27920
28817
|
// src/models/qwen3_5_moe/modeling_qwen3_5_moe.js
|
|
27921
28818
|
var Qwen3_5MoeForConditionalGeneration = class extends Qwen3_5ForConditionalGeneration {
|
|
27922
28819
|
};
|
|
28820
|
+
var Qwen3_5MoeForCausalLM = class extends Qwen3_5ForCausalLM {
|
|
28821
|
+
};
|
|
27923
28822
|
|
|
27924
28823
|
// src/models/resnet/modeling_resnet.js
|
|
27925
28824
|
var ResNetPreTrainedModel = class extends PreTrainedModel {
|
|
@@ -28600,25 +29499,6 @@ var TrOCRPreTrainedModel = class extends PreTrainedModel {
|
|
|
28600
29499
|
var TrOCRForCausalLM = class extends TrOCRPreTrainedModel {
|
|
28601
29500
|
};
|
|
28602
29501
|
|
|
28603
|
-
// src/models/ultravox/modeling_ultravox.js
|
|
28604
|
-
var UltravoxPreTrainedModel = class extends PreTrainedModel {
|
|
28605
|
-
forward_params = ["input_ids", "attention_mask", "position_ids", "audio_values", "past_key_values"];
|
|
28606
|
-
};
|
|
28607
|
-
var UltravoxModel = class extends UltravoxPreTrainedModel {
|
|
28608
|
-
_merge_input_ids_with_audio_features(kwargs) {
|
|
28609
|
-
const audio_hidden_size = kwargs.audio_features.dims.at(-1);
|
|
28610
|
-
const reshaped_audio_features = kwargs.audio_features.view(-1, audio_hidden_size);
|
|
28611
|
-
return default_merge_input_ids_with_audio_features({
|
|
28612
|
-
// @ts-ignore
|
|
28613
|
-
audio_token_id: this.config.ignore_index ?? this.config.audio_token_id,
|
|
28614
|
-
...kwargs,
|
|
28615
|
-
audio_features: reshaped_audio_features
|
|
28616
|
-
});
|
|
28617
|
-
}
|
|
28618
|
-
};
|
|
28619
|
-
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
28620
|
-
};
|
|
28621
|
-
|
|
28622
29502
|
// src/models/unispeech/modeling_unispeech.js
|
|
28623
29503
|
var UniSpeechPreTrainedModel = class extends PreTrainedModel {
|
|
28624
29504
|
};
|
|
@@ -28784,6 +29664,170 @@ var VitsModel = class extends VitsPreTrainedModel {
|
|
|
28784
29664
|
}
|
|
28785
29665
|
};
|
|
28786
29666
|
|
|
29667
|
+
// src/models/voxtral/modeling_voxtral.js
|
|
29668
|
+
var VoxtralForConditionalGeneration = class extends UltravoxModel {
|
|
29669
|
+
};
|
|
29670
|
+
|
|
29671
|
+
// src/models/voxtral_realtime/modeling_voxtral_realtime.js
|
|
29672
|
+
var CONV1_LEFT_PAD = 2;
|
|
29673
|
+
var CONV2_LEFT_PAD = 1;
|
|
29674
|
+
var states = /* @__PURE__ */ new WeakMap();
|
|
29675
|
+
function createEncoderState(model, input_features) {
|
|
29676
|
+
const { text_config, audio_config } = (
|
|
29677
|
+
/** @type {any} */
|
|
29678
|
+
model.config
|
|
29679
|
+
);
|
|
29680
|
+
const encoder_session = model.sessions["audio_encoder"];
|
|
29681
|
+
const { num_mel_bins, hidden_size: enc_hidden_size } = audio_config;
|
|
29682
|
+
const PADDING_CACHE_CHANNELS = num_mel_bins + enc_hidden_size;
|
|
29683
|
+
const enc_kv_cache = new DynamicCache();
|
|
29684
|
+
const enc_dtype = encoder_session?.config?.kv_cache_dtype ?? "float32";
|
|
29685
|
+
const enc_cls = enc_dtype === "float16" ? DataTypeMap.float16 : DataTypeMap.float32;
|
|
29686
|
+
const enc_shapes = getCacheShapes(audio_config, { batch_size: 1 });
|
|
29687
|
+
for (const name in enc_shapes) {
|
|
29688
|
+
const size = enc_shapes[name].reduce((a, b) => a * b, 1);
|
|
29689
|
+
enc_kv_cache[name] = new Tensor2(enc_dtype, new enc_cls(size), enc_shapes[name]);
|
|
29690
|
+
}
|
|
29691
|
+
const enc_padding_cache = new Tensor2(enc_dtype, new enc_cls(PADDING_CACHE_CHANNELS * CONV1_LEFT_PAD), [
|
|
29692
|
+
1,
|
|
29693
|
+
PADDING_CACHE_CHANNELS,
|
|
29694
|
+
CONV1_LEFT_PAD
|
|
29695
|
+
]);
|
|
29696
|
+
const chunks_iter = input_features[Symbol.asyncIterator]?.() ?? input_features[Symbol.iterator]?.();
|
|
29697
|
+
if (!chunks_iter) {
|
|
29698
|
+
throw new Error("input_features must be iterable or async iterable");
|
|
29699
|
+
}
|
|
29700
|
+
return {
|
|
29701
|
+
encoder_session,
|
|
29702
|
+
enc_kv_cache,
|
|
29703
|
+
enc_padding_cache,
|
|
29704
|
+
enc_past_seq_len: 0,
|
|
29705
|
+
audio_embed_queue: [],
|
|
29706
|
+
audio_embed_total_tokens: 0,
|
|
29707
|
+
audio_queue_offset: 0,
|
|
29708
|
+
audio_consumed: 0,
|
|
29709
|
+
stream_exhausted: false,
|
|
29710
|
+
chunks_iter,
|
|
29711
|
+
text_hidden_size: text_config.hidden_size
|
|
29712
|
+
};
|
|
29713
|
+
}
|
|
29714
|
+
async function encodeChunk(s, chunk_features) {
|
|
29715
|
+
const audio_seq_len = chunk_features.dims[2];
|
|
29716
|
+
const conv2_output_len = Math.floor((CONV2_LEFT_PAD + audio_seq_len - 3) / 2) + 1;
|
|
29717
|
+
const position_ids = new Tensor2(
|
|
29718
|
+
"int64",
|
|
29719
|
+
BigInt64Array.from({ length: conv2_output_len }, (_, i) => BigInt(s.enc_past_seq_len + i)),
|
|
29720
|
+
[1, conv2_output_len]
|
|
29721
|
+
);
|
|
29722
|
+
const total_seq_len = s.enc_past_seq_len + conv2_output_len;
|
|
29723
|
+
const attention_mask = ones([1, total_seq_len]);
|
|
29724
|
+
const { audio_embeds, present_padding_cache, ...present_cache } = await sessionRun(s.encoder_session, {
|
|
29725
|
+
input_features: chunk_features,
|
|
29726
|
+
attention_mask,
|
|
29727
|
+
position_ids,
|
|
29728
|
+
past_padding_cache: s.enc_padding_cache,
|
|
29729
|
+
...s.enc_kv_cache
|
|
29730
|
+
});
|
|
29731
|
+
if (s.enc_padding_cache.location === "gpu-buffer") {
|
|
29732
|
+
s.enc_padding_cache.dispose();
|
|
29733
|
+
}
|
|
29734
|
+
s.enc_padding_cache = present_padding_cache;
|
|
29735
|
+
for (const name in present_cache) {
|
|
29736
|
+
if (name.startsWith("present.")) {
|
|
29737
|
+
const pastName = name.replace("present", "past_key_values");
|
|
29738
|
+
const prev = s.enc_kv_cache[pastName];
|
|
29739
|
+
if (prev?.location === "gpu-buffer") {
|
|
29740
|
+
prev.dispose();
|
|
29741
|
+
}
|
|
29742
|
+
s.enc_kv_cache[pastName] = present_cache[name];
|
|
29743
|
+
}
|
|
29744
|
+
}
|
|
29745
|
+
s.enc_past_seq_len = total_seq_len;
|
|
29746
|
+
return audio_embeds;
|
|
29747
|
+
}
|
|
29748
|
+
async function fillAudioBuffer(s, needed) {
|
|
29749
|
+
while (s.audio_embed_total_tokens < needed && !s.stream_exhausted) {
|
|
29750
|
+
const result = await s.chunks_iter.next();
|
|
29751
|
+
if (result.done) {
|
|
29752
|
+
s.stream_exhausted = true;
|
|
29753
|
+
break;
|
|
29754
|
+
}
|
|
29755
|
+
const new_embeds = await encodeChunk(s, result.value);
|
|
29756
|
+
s.audio_embed_queue.push({ data: new_embeds.data, tokens: new_embeds.dims[1] });
|
|
29757
|
+
s.audio_embed_total_tokens += new_embeds.dims[1];
|
|
29758
|
+
}
|
|
29759
|
+
}
|
|
29760
|
+
function addAudioEmbeddings(s, inputs_embeds, current_len) {
|
|
29761
|
+
if (s.audio_embed_queue.length === 0) return;
|
|
29762
|
+
const embed_data = inputs_embeds.data;
|
|
29763
|
+
let embed_write_pos = 0;
|
|
29764
|
+
let remaining = current_len;
|
|
29765
|
+
while (remaining > 0 && s.audio_embed_queue.length > 0) {
|
|
29766
|
+
const front = s.audio_embed_queue[0];
|
|
29767
|
+
const available = front.tokens - s.audio_queue_offset;
|
|
29768
|
+
const n = Math.min(remaining, available);
|
|
29769
|
+
const src_offset = s.audio_queue_offset * s.text_hidden_size;
|
|
29770
|
+
for (let i = 0; i < n * s.text_hidden_size; ++i) {
|
|
29771
|
+
embed_data[embed_write_pos * s.text_hidden_size + i] += front.data[src_offset + i];
|
|
29772
|
+
}
|
|
29773
|
+
embed_write_pos += n;
|
|
29774
|
+
remaining -= n;
|
|
29775
|
+
s.audio_queue_offset += n;
|
|
29776
|
+
if (s.audio_queue_offset >= front.tokens) {
|
|
29777
|
+
s.audio_embed_queue.shift();
|
|
29778
|
+
s.audio_queue_offset = 0;
|
|
29779
|
+
}
|
|
29780
|
+
}
|
|
29781
|
+
s.audio_consumed += current_len - remaining;
|
|
29782
|
+
}
|
|
29783
|
+
var AudioExhaustedCriteria = class extends StoppingCriteria {
|
|
29784
|
+
constructor(enc_state) {
|
|
29785
|
+
super();
|
|
29786
|
+
this._s = enc_state;
|
|
29787
|
+
}
|
|
29788
|
+
_call(input_ids) {
|
|
29789
|
+
const done = this._s.stream_exhausted && this._s.audio_embed_queue.length === 0;
|
|
29790
|
+
return input_ids.map(() => done);
|
|
29791
|
+
}
|
|
29792
|
+
};
|
|
29793
|
+
var VoxtralRealtimePreTrainedModel = class extends PreTrainedModel {
|
|
29794
|
+
forward_params = ["input_ids", "attention_mask", "position_ids", "past_key_values"];
|
|
29795
|
+
};
|
|
29796
|
+
var VoxtralRealtimeForConditionalGeneration = class extends VoxtralRealtimePreTrainedModel {
|
|
29797
|
+
async forward({ input_ids, past_key_values, ...kwargs }) {
|
|
29798
|
+
const current_len = input_ids.dims[1];
|
|
29799
|
+
const enc = states.get(this);
|
|
29800
|
+
if (enc) {
|
|
29801
|
+
await fillAudioBuffer(enc, enc.audio_consumed + current_len);
|
|
29802
|
+
}
|
|
29803
|
+
const { inputs_embeds } = await sessionRun(this.sessions["embed_tokens"], { input_ids });
|
|
29804
|
+
if (enc) {
|
|
29805
|
+
addAudioEmbeddings(enc, inputs_embeds, current_len);
|
|
29806
|
+
}
|
|
29807
|
+
const decoder_feeds = { inputs_embeds, ...kwargs };
|
|
29808
|
+
this.addPastKeyValues(decoder_feeds, past_key_values);
|
|
29809
|
+
const session = this.sessions["decoder_model_merged"];
|
|
29810
|
+
const fixed = pick(decoder_feeds, session.inputNames);
|
|
29811
|
+
return await sessionRun(session, fixed);
|
|
29812
|
+
}
|
|
29813
|
+
async generate({ input_features, stopping_criteria: userStoppingCriteria, ...kwargs }) {
|
|
29814
|
+
if (!input_features) {
|
|
29815
|
+
throw new Error("input_features (generator/iterable) must be provided");
|
|
29816
|
+
}
|
|
29817
|
+
const enc_state = createEncoderState(this, input_features);
|
|
29818
|
+
states.set(this, enc_state);
|
|
29819
|
+
const stopping_criteria = new StoppingCriteriaList();
|
|
29820
|
+
stopping_criteria.push(new AudioExhaustedCriteria(enc_state));
|
|
29821
|
+
if (userStoppingCriteria) stopping_criteria.extend(userStoppingCriteria);
|
|
29822
|
+
try {
|
|
29823
|
+
return await super.generate({ ...kwargs, stopping_criteria });
|
|
29824
|
+
} finally {
|
|
29825
|
+
enc_state.enc_kv_cache.dispose();
|
|
29826
|
+
states.delete(this);
|
|
29827
|
+
}
|
|
29828
|
+
}
|
|
29829
|
+
};
|
|
29830
|
+
|
|
28787
29831
|
// src/models/wav2vec2_bert/modeling_wav2vec2_bert.js
|
|
28788
29832
|
var Wav2Vec2BertPreTrainedModel = class extends PreTrainedModel {
|
|
28789
29833
|
};
|
|
@@ -29537,6 +30581,7 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29537
30581
|
["gemma2", "Gemma2ForCausalLM"],
|
|
29538
30582
|
["vaultgemma", "VaultGemmaForCausalLM"],
|
|
29539
30583
|
["gemma3_text", "Gemma3ForCausalLM"],
|
|
30584
|
+
["gemma3", "Gemma3ForCausalLM"],
|
|
29540
30585
|
["helium", "HeliumForCausalLM"],
|
|
29541
30586
|
["glm", "GlmForCausalLM"],
|
|
29542
30587
|
["openelm", "OpenELMForCausalLM"],
|
|
@@ -29545,6 +30590,13 @@ var MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29545
30590
|
["qwen3", "Qwen3ForCausalLM"],
|
|
29546
30591
|
["qwen3_moe", "Qwen3MoeForCausalLM"],
|
|
29547
30592
|
["qwen3_next", "Qwen3NextForCausalLM"],
|
|
30593
|
+
["qwen2_vl", "Qwen2VLForCausalLM"],
|
|
30594
|
+
["qwen2_5_vl", "Qwen2_5_VLForCausalLM"],
|
|
30595
|
+
["qwen3_vl", "Qwen3VLForCausalLM"],
|
|
30596
|
+
["qwen3_vl_moe", "Qwen3VLMoeForCausalLM"],
|
|
30597
|
+
["qwen3_5", "Qwen3_5ForCausalLM"],
|
|
30598
|
+
["qwen3_5_moe", "Qwen3_5MoeForCausalLM"],
|
|
30599
|
+
["gemma3n", "Gemma3nForCausalLM"],
|
|
29548
30600
|
["phi", "PhiForCausalLM"],
|
|
29549
30601
|
["phi3", "Phi3ForCausalLM"],
|
|
29550
30602
|
["mpt", "MptForCausalLM"],
|
|
@@ -29620,6 +30672,7 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29620
30672
|
["qwen3_vl_moe", "Qwen3VLMoeForConditionalGeneration"],
|
|
29621
30673
|
["qwen3_5", "Qwen3_5ForConditionalGeneration"],
|
|
29622
30674
|
["qwen3_5_moe", "Qwen3_5MoeForConditionalGeneration"],
|
|
30675
|
+
["lfm2_vl", "Lfm2VlForConditionalGeneration"],
|
|
29623
30676
|
["idefics3", "Idefics3ForConditionalGeneration"],
|
|
29624
30677
|
["smolvlm", "SmolVLMForConditionalGeneration"],
|
|
29625
30678
|
["paligemma", "PaliGemmaForConditionalGeneration"],
|
|
@@ -29628,8 +30681,10 @@ var MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
|
29628
30681
|
["mistral3", "Mistral3ForConditionalGeneration"]
|
|
29629
30682
|
]);
|
|
29630
30683
|
var MODEL_FOR_AUDIO_TEXT_TO_TEXT_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
30684
|
+
["granite_speech", "GraniteSpeechForConditionalGeneration"],
|
|
29631
30685
|
["ultravox", "UltravoxModel"],
|
|
29632
|
-
["voxtral", "VoxtralForConditionalGeneration"]
|
|
30686
|
+
["voxtral", "VoxtralForConditionalGeneration"],
|
|
30687
|
+
["voxtral_realtime", "VoxtralRealtimeForConditionalGeneration"]
|
|
29633
30688
|
]);
|
|
29634
30689
|
var MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = /* @__PURE__ */ new Map([
|
|
29635
30690
|
["vision-encoder-decoder", "VisionEncoderDecoderModel"]
|
|
@@ -29812,7 +30867,19 @@ var CUSTOM_MAPPING = [
|
|
|
29812
30867
|
MODEL_TYPES.ImageAudioTextToText
|
|
29813
30868
|
],
|
|
29814
30869
|
["SupertonicForConditionalGeneration", SupertonicForConditionalGeneration, MODEL_TYPES.Supertonic],
|
|
29815
|
-
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox]
|
|
30870
|
+
["ChatterboxModel", ChatterboxModel, MODEL_TYPES.Chatterbox],
|
|
30871
|
+
["Qwen2VLForCausalLM", Qwen2VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30872
|
+
["Qwen2_5_VLForCausalLM", Qwen2_5_VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30873
|
+
["Qwen3VLForCausalLM", Qwen3VLForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30874
|
+
["Qwen3VLMoeForCausalLM", Qwen3VLMoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30875
|
+
["Qwen3_5ForCausalLM", Qwen3_5ForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30876
|
+
["Qwen3_5MoeForCausalLM", Qwen3_5MoeForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30877
|
+
["Gemma3nForCausalLM", Gemma3nForCausalLM, MODEL_TYPES.MultimodalLanguageModelOnly],
|
|
30878
|
+
[
|
|
30879
|
+
"VoxtralRealtimeForConditionalGeneration",
|
|
30880
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
30881
|
+
MODEL_TYPES.VoxtralRealtime
|
|
30882
|
+
]
|
|
29816
30883
|
];
|
|
29817
30884
|
for (const [name, model, type] of CUSTOM_MAPPING) {
|
|
29818
30885
|
MODEL_TYPE_MAPPING.set(name, type);
|
|
@@ -31490,8 +32557,18 @@ var TASK_ALIASES = Object.freeze({
|
|
|
31490
32557
|
});
|
|
31491
32558
|
|
|
31492
32559
|
// src/utils/model_registry/get_model_files.js
|
|
32560
|
+
function get_config(modelId, { config = null, cache_dir = null, local_files_only = false, revision = "main" } = {}) {
|
|
32561
|
+
if (config !== null) {
|
|
32562
|
+
return AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision });
|
|
32563
|
+
}
|
|
32564
|
+
const key = JSON.stringify([modelId, cache_dir, local_files_only, revision]);
|
|
32565
|
+
return memoizePromise(
|
|
32566
|
+
key,
|
|
32567
|
+
() => AutoConfig.from_pretrained(modelId, { config, cache_dir, local_files_only, revision })
|
|
32568
|
+
);
|
|
32569
|
+
}
|
|
31493
32570
|
async function get_model_files(modelId, { config = null, dtype: overrideDtype = null, device: overrideDevice = null, model_file_name = null } = {}) {
|
|
31494
|
-
config = await
|
|
32571
|
+
config = await get_config(modelId, { config });
|
|
31495
32572
|
const files = [
|
|
31496
32573
|
// Add config.json (always loaded)
|
|
31497
32574
|
"config.json"
|
|
@@ -31552,74 +32629,14 @@ async function get_model_files(modelId, { config = null, dtype: overrideDtype =
|
|
|
31552
32629
|
files.push(dataFilePath);
|
|
31553
32630
|
}
|
|
31554
32631
|
};
|
|
31555
|
-
const
|
|
31556
|
-
|
|
31557
|
-
add_model_file(
|
|
31558
|
-
|
|
31559
|
-
|
|
31560
|
-
|
|
31561
|
-
|
|
31562
|
-
|
|
31563
|
-
add_model_file("decoder_model_merged");
|
|
31564
|
-
files.push("generation_config.json");
|
|
31565
|
-
} else if (modelType === MODEL_TYPES.MaskGeneration) {
|
|
31566
|
-
add_model_file("model", "vision_encoder");
|
|
31567
|
-
add_model_file("prompt_encoder_mask_decoder");
|
|
31568
|
-
} else if (modelType === MODEL_TYPES.EncoderDecoder) {
|
|
31569
|
-
add_model_file("model", "encoder_model");
|
|
31570
|
-
add_model_file("decoder_model_merged");
|
|
31571
|
-
} else if (modelType === MODEL_TYPES.ImageTextToText) {
|
|
31572
|
-
add_model_file("embed_tokens");
|
|
31573
|
-
add_model_file("vision_encoder");
|
|
31574
|
-
add_model_file("decoder_model_merged");
|
|
31575
|
-
if (config.is_encoder_decoder) {
|
|
31576
|
-
add_model_file("model", "encoder_model");
|
|
31577
|
-
}
|
|
31578
|
-
files.push("generation_config.json");
|
|
31579
|
-
} else if (modelType === MODEL_TYPES.AudioTextToText) {
|
|
31580
|
-
add_model_file("embed_tokens");
|
|
31581
|
-
add_model_file("audio_encoder");
|
|
31582
|
-
add_model_file("decoder_model_merged");
|
|
31583
|
-
files.push("generation_config.json");
|
|
31584
|
-
} else if (modelType === MODEL_TYPES.ImageAudioTextToText) {
|
|
31585
|
-
add_model_file("embed_tokens");
|
|
31586
|
-
add_model_file("audio_encoder");
|
|
31587
|
-
add_model_file("vision_encoder");
|
|
31588
|
-
add_model_file("decoder_model_merged");
|
|
31589
|
-
files.push("generation_config.json");
|
|
31590
|
-
} else if (modelType === MODEL_TYPES.Musicgen) {
|
|
31591
|
-
add_model_file("model", "text_encoder");
|
|
31592
|
-
add_model_file("decoder_model_merged");
|
|
31593
|
-
add_model_file("encodec_decode");
|
|
31594
|
-
files.push("generation_config.json");
|
|
31595
|
-
} else if (modelType === MODEL_TYPES.MultiModality) {
|
|
31596
|
-
add_model_file("prepare_inputs_embeds");
|
|
31597
|
-
add_model_file("model", "language_model");
|
|
31598
|
-
add_model_file("lm_head");
|
|
31599
|
-
add_model_file("gen_head");
|
|
31600
|
-
add_model_file("gen_img_embeds");
|
|
31601
|
-
add_model_file("image_decode");
|
|
31602
|
-
files.push("generation_config.json");
|
|
31603
|
-
} else if (modelType === MODEL_TYPES.Phi3V) {
|
|
31604
|
-
add_model_file("prepare_inputs_embeds");
|
|
31605
|
-
add_model_file("model");
|
|
31606
|
-
add_model_file("vision_encoder");
|
|
31607
|
-
files.push("generation_config.json");
|
|
31608
|
-
} else if (modelType === MODEL_TYPES.Chatterbox) {
|
|
31609
|
-
add_model_file("embed_tokens");
|
|
31610
|
-
add_model_file("speech_encoder");
|
|
31611
|
-
add_model_file("model", "language_model");
|
|
31612
|
-
add_model_file("conditional_decoder");
|
|
31613
|
-
files.push("generation_config.json");
|
|
31614
|
-
} else if (modelType === MODEL_TYPES.AutoEncoder) {
|
|
31615
|
-
add_model_file("encoder_model");
|
|
31616
|
-
add_model_file("decoder_model");
|
|
31617
|
-
} else if (modelType === MODEL_TYPES.Supertonic) {
|
|
31618
|
-
add_model_file("text_encoder");
|
|
31619
|
-
add_model_file("latent_denoiser");
|
|
31620
|
-
add_model_file("voice_decoder");
|
|
31621
|
-
} else {
|
|
31622
|
-
add_model_file("model", singleModelName);
|
|
32632
|
+
const { sessions, optional_configs } = getSessionsConfig(modelType, config, { model_file_name });
|
|
32633
|
+
for (const [sessionKey, baseName] of Object.entries(sessions)) {
|
|
32634
|
+
add_model_file(sessionKey, baseName);
|
|
32635
|
+
}
|
|
32636
|
+
if (optional_configs) {
|
|
32637
|
+
for (const configFile of Object.values(optional_configs)) {
|
|
32638
|
+
files.push(configFile);
|
|
32639
|
+
}
|
|
31623
32640
|
}
|
|
31624
32641
|
return files;
|
|
31625
32642
|
}
|
|
@@ -32070,25 +33087,25 @@ async function load_video(src, { num_frames = null, fps = null } = {}) {
|
|
|
32070
33087
|
|
|
32071
33088
|
// src/utils/model_registry/is_cached.js
|
|
32072
33089
|
async function check_files_cache(modelId, files, options = {}) {
|
|
32073
|
-
const
|
|
32074
|
-
if (!
|
|
33090
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
33091
|
+
if (!cache2) {
|
|
32075
33092
|
const fileStatuses2 = files.map((filename) => ({ file: filename, cached: false }));
|
|
32076
33093
|
return { allCached: false, files: fileStatuses2 };
|
|
32077
33094
|
}
|
|
32078
33095
|
const fileStatuses = await Promise.all(
|
|
32079
33096
|
files.map(async (filename) => {
|
|
32080
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
32081
|
-
const cached = await checkCachedResource(
|
|
33097
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
33098
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
32082
33099
|
return { file: filename, cached: !!cached };
|
|
32083
33100
|
})
|
|
32084
33101
|
);
|
|
32085
33102
|
return { allCached: fileStatuses.every((f) => f.cached), files: fileStatuses };
|
|
32086
33103
|
}
|
|
32087
33104
|
async function is_file_cached(modelId, filename, options = {}) {
|
|
32088
|
-
const
|
|
32089
|
-
if (!
|
|
32090
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
32091
|
-
return !!await checkCachedResource(
|
|
33105
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
33106
|
+
if (!cache2) return false;
|
|
33107
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
33108
|
+
return !!await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
32092
33109
|
}
|
|
32093
33110
|
async function is_cached(modelId, options = {}) {
|
|
32094
33111
|
if (!modelId) {
|
|
@@ -32135,26 +33152,26 @@ async function is_pipeline_cached_files(task, modelId, options = {}) {
|
|
|
32135
33152
|
|
|
32136
33153
|
// src/utils/model_registry/clear_cache.js
|
|
32137
33154
|
async function clear_files_from_cache(modelId, files, options = {}) {
|
|
32138
|
-
const
|
|
32139
|
-
if (!
|
|
33155
|
+
const cache2 = await getCache(options?.cache_dir);
|
|
33156
|
+
if (!cache2) {
|
|
32140
33157
|
return {
|
|
32141
33158
|
filesDeleted: 0,
|
|
32142
33159
|
filesCached: 0,
|
|
32143
33160
|
files: files.map((filename) => ({ file: filename, deleted: false, wasCached: false }))
|
|
32144
33161
|
};
|
|
32145
33162
|
}
|
|
32146
|
-
if (!
|
|
33163
|
+
if (!cache2.delete) {
|
|
32147
33164
|
throw new Error("Cache does not support delete operation");
|
|
32148
33165
|
}
|
|
32149
33166
|
const results = await Promise.all(
|
|
32150
33167
|
files.map(async (filename) => {
|
|
32151
|
-
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options,
|
|
32152
|
-
const cached = await checkCachedResource(
|
|
33168
|
+
const { localPath, proposedCacheKey } = buildResourcePaths(modelId, filename, options, cache2);
|
|
33169
|
+
const cached = await checkCachedResource(cache2, localPath, proposedCacheKey);
|
|
32153
33170
|
const wasCached = !!cached;
|
|
32154
33171
|
let deleted = false;
|
|
32155
33172
|
if (wasCached) {
|
|
32156
|
-
const deletedWithProposed = await
|
|
32157
|
-
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await
|
|
33173
|
+
const deletedWithProposed = await cache2.delete(proposedCacheKey);
|
|
33174
|
+
const deletedWithLocal = !deletedWithProposed && proposedCacheKey !== localPath ? await cache2.delete(localPath) : false;
|
|
32158
33175
|
deleted = deletedWithProposed || deletedWithLocal;
|
|
32159
33176
|
}
|
|
32160
33177
|
return { file: filename, deleted, wasCached };
|
|
@@ -32636,6 +33653,7 @@ var ModelRegistry = class {
|
|
|
32636
33653
|
DonutImageProcessor,
|
|
32637
33654
|
DonutSwinModel,
|
|
32638
33655
|
DonutSwinPreTrainedModel,
|
|
33656
|
+
DynamicCache,
|
|
32639
33657
|
EdgeTamModel,
|
|
32640
33658
|
EfficientNetForImageClassification,
|
|
32641
33659
|
EfficientNetImageProcessor,
|
|
@@ -32708,6 +33726,7 @@ var ModelRegistry = class {
|
|
|
32708
33726
|
Gemma3Model,
|
|
32709
33727
|
Gemma3PreTrainedModel,
|
|
32710
33728
|
Gemma3nAudioFeatureExtractor,
|
|
33729
|
+
Gemma3nForCausalLM,
|
|
32711
33730
|
Gemma3nForConditionalGeneration,
|
|
32712
33731
|
Gemma3nPreTrainedModel,
|
|
32713
33732
|
Gemma3nProcessor,
|
|
@@ -32727,6 +33746,9 @@ var ModelRegistry = class {
|
|
|
32727
33746
|
GraniteMoeHybridModel,
|
|
32728
33747
|
GraniteMoeHybridPreTrainedModel,
|
|
32729
33748
|
GranitePreTrainedModel,
|
|
33749
|
+
GraniteSpeechFeatureExtractor,
|
|
33750
|
+
GraniteSpeechForConditionalGeneration,
|
|
33751
|
+
GraniteSpeechProcessor,
|
|
32730
33752
|
GroundingDinoForObjectDetection,
|
|
32731
33753
|
GroundingDinoImageProcessor,
|
|
32732
33754
|
GroundingDinoPreTrainedModel,
|
|
@@ -32752,7 +33774,6 @@ var ModelRegistry = class {
|
|
|
32752
33774
|
IJepaPreTrainedModel,
|
|
32753
33775
|
Idefics3ForConditionalGeneration,
|
|
32754
33776
|
Idefics3ImageProcessor,
|
|
32755
|
-
Idefics3PreTrainedModel,
|
|
32756
33777
|
Idefics3Processor,
|
|
32757
33778
|
ImageClassificationPipeline,
|
|
32758
33779
|
ImageFeatureExtractionPipeline,
|
|
@@ -32777,6 +33798,9 @@ var ModelRegistry = class {
|
|
|
32777
33798
|
Lfm2MoeModel,
|
|
32778
33799
|
Lfm2MoePreTrainedModel,
|
|
32779
33800
|
Lfm2PreTrainedModel,
|
|
33801
|
+
Lfm2VlForConditionalGeneration,
|
|
33802
|
+
Lfm2VlImageProcessor,
|
|
33803
|
+
Lfm2VlProcessor,
|
|
32780
33804
|
LiteWhisperForConditionalGeneration,
|
|
32781
33805
|
Llama4ForCausalLM,
|
|
32782
33806
|
Llama4PreTrainedModel,
|
|
@@ -32960,7 +33984,6 @@ var ModelRegistry = class {
|
|
|
32960
33984
|
Owlv2Model,
|
|
32961
33985
|
Owlv2PreTrainedModel,
|
|
32962
33986
|
PaliGemmaForConditionalGeneration,
|
|
32963
|
-
PaliGemmaPreTrainedModel,
|
|
32964
33987
|
PaliGemmaProcessor,
|
|
32965
33988
|
ParakeetFeatureExtractor,
|
|
32966
33989
|
ParakeetForCTC,
|
|
@@ -33004,10 +34027,12 @@ var ModelRegistry = class {
|
|
|
33004
34027
|
Qwen2MoePreTrainedModel,
|
|
33005
34028
|
Qwen2PreTrainedModel,
|
|
33006
34029
|
Qwen2Tokenizer,
|
|
34030
|
+
Qwen2VLForCausalLM,
|
|
33007
34031
|
Qwen2VLForConditionalGeneration,
|
|
33008
34032
|
Qwen2VLImageProcessor,
|
|
33009
34033
|
Qwen2VLPreTrainedModel,
|
|
33010
34034
|
Qwen2VLProcessor,
|
|
34035
|
+
Qwen2_5_VLForCausalLM,
|
|
33011
34036
|
Qwen2_5_VLForConditionalGeneration,
|
|
33012
34037
|
Qwen2_5_VLProcessor,
|
|
33013
34038
|
Qwen3ForCausalLM,
|
|
@@ -33019,10 +34044,14 @@ var ModelRegistry = class {
|
|
|
33019
34044
|
Qwen3NextModel,
|
|
33020
34045
|
Qwen3NextPreTrainedModel,
|
|
33021
34046
|
Qwen3PreTrainedModel,
|
|
34047
|
+
Qwen3VLForCausalLM,
|
|
33022
34048
|
Qwen3VLForConditionalGeneration,
|
|
34049
|
+
Qwen3VLMoeForCausalLM,
|
|
33023
34050
|
Qwen3VLMoeForConditionalGeneration,
|
|
33024
34051
|
Qwen3VLProcessor,
|
|
34052
|
+
Qwen3_5ForCausalLM,
|
|
33025
34053
|
Qwen3_5ForConditionalGeneration,
|
|
34054
|
+
Qwen3_5MoeForCausalLM,
|
|
33026
34055
|
Qwen3_5MoeForConditionalGeneration,
|
|
33027
34056
|
RFDetrForObjectDetection,
|
|
33028
34057
|
RFDetrModel,
|
|
@@ -33094,7 +34123,6 @@ var ModelRegistry = class {
|
|
|
33094
34123
|
SmolLM3ForCausalLM,
|
|
33095
34124
|
SmolLM3Model,
|
|
33096
34125
|
SmolLM3PreTrainedModel,
|
|
33097
|
-
SmolVLMForConditionalGeneration,
|
|
33098
34126
|
SmolVLMImageProcessor,
|
|
33099
34127
|
SmolVLMProcessor,
|
|
33100
34128
|
SnacDecoderModel,
|
|
@@ -33200,6 +34228,10 @@ var ModelRegistry = class {
|
|
|
33200
34228
|
VitsTokenizer,
|
|
33201
34229
|
VoxtralForConditionalGeneration,
|
|
33202
34230
|
VoxtralProcessor,
|
|
34231
|
+
VoxtralRealtimeFeatureExtractor,
|
|
34232
|
+
VoxtralRealtimeForConditionalGeneration,
|
|
34233
|
+
VoxtralRealtimePreTrainedModel,
|
|
34234
|
+
VoxtralRealtimeProcessor,
|
|
33203
34235
|
Wav2Vec2BertForCTC,
|
|
33204
34236
|
Wav2Vec2BertForSequenceClassification,
|
|
33205
34237
|
Wav2Vec2BertModel,
|
|
@@ -33295,7 +34327,7 @@ var ModelRegistry = class {
|
|
|
33295
34327
|
|
|
33296
34328
|
onnxruntime-web/dist/ort.webgpu.bundle.min.mjs:
|
|
33297
34329
|
(*!
|
|
33298
|
-
* ONNX Runtime Web v1.25.0-dev.
|
|
34330
|
+
* ONNX Runtime Web v1.25.0-dev.20260307-d626b568e0
|
|
33299
34331
|
* Copyright (c) Microsoft Corporation. All rights reserved.
|
|
33300
34332
|
* Licensed under the MIT License.
|
|
33301
34333
|
*)
|